base65536.c

   1 /* base65536 semi-library.
   2 (C) 2016 Tobias Girstmair, http://isticktoit.net/
   3 Released under the GNU GPL v3. See LICENSE for details. */
   4
   5 #include <stdlib.h>
   6 #include <string.h>
   7 #include "base65536.h"
   8
   9
  10 struct block b65k_tree[] = {
  11         {128, 132608},
  12         {64, 29952},
  13         {192, 148992},
  14         {32, 21760},
  15         {96, 38144},
  16         {160, 140800},
  17         {224, 157184},
  18         {16, 17408},
  19         {48, 25856},
  20         {80, 34048},
  21         {112, 73984},
  22         {144, 136704},
  23         {176, 144896},
  24         {208, 153088},
  25         {240, 161280},
  26         {8, 15360},
  27         {24, 19456},
  28         {40, 23808},
  29         {56, 27904},
  30         {72, 32000},
  31         {88, 36096},
  32         {104, 40192},
  33         {120, 92160},
  34         {136, 134656},
  35         {152, 138752},
  36         {168, 142848},
  37         {184, 146944},
  38         {200, 151040},
  39         {216, 155136},
  40         {232, 159232},
  41         {248, 163328},
  42         {4, 14336},
  43         {12, 16384},
  44         {20, 18432},
  45         {28, 20736},
  46         {36, 22784},
  47         {44, 24832},
  48         {52, 26880},
  49         {60, 28928},
  50         {68, 30976},
  51         {76, 33024},
  52         {84, 35072},
  53         {92, 37120},
  54         {100, 39168},
  55         {108, 41728},
  56         {116, 78336},
  57         {124, 131584},
  58         {132, 133632},
  59         {140, 135680},
  60         {148, 137728},
  61         {156, 139776},
  62         {164, 141824},
  63         {172, 143872},
  64         {180, 145920},
  65         {188, 147968},
  66         {196, 150016},
  67         {204, 152064},
  68         {212, 154112},
  69         {220, 156160},
  70         {228, 158208},
  71         {236, 160256},
  72         {244, 162304},
  73         {252, 164352},
  74         {2, 13824},
  75         {6, 14848},
  76         {10, 15872},
  77         {14, 16896},
  78         {18, 17920},
  79         {22, 18944},
  80         {26, 20224},
  81         {30, 21248},
  82         {34, 22272},
  83         {38, 23296},
  84         {42, 24320},
  85         {46, 25344},
  86         {50, 26368},
  87         {54, 27392},
  88         {58, 28416},
  89         {62, 29440},
  90         {66, 30464},
  91         {70, 31488},
  92         {74, 32512},
  93         {78, 33536},
  94         {82, 34560},
  95         {86, 35584},
  96         {90, 36608},
  97         {94, 37632},
  98         {98, 38656},
  99         {102, 39680},
 100         {106, 41216},
 101         {110, 67072},
 102         {114, 77824},
 103         {118, 82944},
 104         {122, 131072},
 105         {126, 132096},
 106         {130, 133120},
 107         {134, 134144},
 108         {138, 135168},
 109         {142, 136192},
 110         {146, 137216},
 111         {150, 138240},
 112         {154, 139264},
 113         {158, 140288},
 114         {162, 141312},
 115         {166, 142336},
 116         {170, 143360},
 117         {174, 144384},
 118         {178, 145408},
 119         {182, 146432},
 120         {186, 147456},
 121         {190, 148480},
 122         {194, 149504},
 123         {198, 150528},
 124         {202, 151552},
 125         {206, 152576},
 126         {210, 153600},
 127         {214, 154624},
 128         {218, 155648},
 129         {222, 156672},
 130         {226, 157696},
 131         {230, 158720},
 132         {234, 159744},
 133         {238, 160768},
 134         {242, 161792},
 135         {246, 162816},
 136         {250, 163840},
 137         {254, 164864},
 138         {0, 13312},
 139         {3, 14080},
 140         {5, 14592},
 141         {7, 15104},
 142         {9, 15616},
 143         {11, 16128},
 144         {13, 16640},
 145         {15, 17152},
 146         {17, 17664},
 147         {19, 18176},
 148         {21, 18688},
 149         {23, 19200},
 150         {25, 19968},
 151         {27, 20480},
 152         {29, 20992},
 153         {31, 21504},
 154         {33, 22016},
 155         {35, 22528},
 156         {37, 23040},
 157         {39, 23552},
 158         {41, 24064},
 159         {43, 24576},
 160         {45, 25088},
 161         {47, 25600},
 162         {49, 26112},
 163         {51, 26624},
 164         {53, 27136},
 165         {55, 27648},
 166         {57, 28160},
 167         {59, 28672},
 168         {61, 29184},
 169         {63, 29696},
 170         {65, 30208},
 171         {67, 30720},
 172         {69, 31232},
 173         {71, 31744},
 174         {73, 32256},
 175         {75, 32768},
 176         {77, 33280},
 177         {79, 33792},
 178         {81, 34304},
 179         {83, 34816},
 180         {85, 35328},
 181         {87, 35840},
 182         {89, 36352},
 183         {91, 36864},
 184         {93, 37376},
 185         {95, 37888},
 186         {97, 38400},
 187         {99, 38912},
 188         {101, 39424},
 189         {103, 39936},
 190         {105, 40448},
 191         {107, 41472},
 192         {109, 42240},
 193         {111, 73728},
 194         {113, 74240},
 195         {115, 78080},
 196         {117, 78592},
 197         {119, 83200},
 198         {121, 92416},
 199         {123, 131328},
 200         {125, 131840},
 201         {127, 132352},
 202         {129, 132864},
 203         {131, 133376},
 204         {133, 133888},
 205         {135, 134400},
 206         {137, 134912},
 207         {139, 135424},
 208         {141, 135936},
 209         {143, 136448},
 210         {145, 136960},
 211         {147, 137472},
 212         {149, 137984},
 213         {151, 138496},
 214         {153, 139008},
 215         {155, 139520},
 216         {157, 140032},
 217         {159, 140544},
 218         {161, 141056},
 219         {163, 141568},
 220         {165, 142080},
 221         {167, 142592},
 222         {169, 143104},
 223         {171, 143616},
 224         {173, 144128},
 225         {175, 144640},
 226         {177, 145152},
 227         {179, 145664},
 228         {181, 146176},
 229         {183, 146688},
 230         {185, 147200},
 231         {187, 147712},
 232         {189, 148224},
 233         {191, 148736},
 234         {193, 149248},
 235         {195, 149760},
 236         {197, 150272},
 237         {199, 150784},
 238         {201, 151296},
 239         {203, 151808},
 240         {205, 152320},
 241         {207, 152832},
 242         {209, 153344},
 243         {211, 153856},
 244         {213, 154368},
 245         {215, 154880},
 246         {217, 155392},
 247         {219, 155904},
 248         {221, 156416},
 249         {223, 156928},
 250         {225, 157440},
 251         {227, 157952},
 252         {229, 158464},
 253         {231, 158976},
 254         {233, 159488},
 255         {235, 160000},
 256         {237, 160512},
 257         {239, 161024},
 258         {241, 161536},
 259         {243, 162048},
 260         {245, 162560},
 261         {247, 163072},
 262         {249, 163584},
 263         {251, 164096},
 264         {253, 164608},
 265         {255, 165120},
 266         {B65K_EOF, 5376},
 267         {1, 13568}
 268 }; /* autogenerated with generate_struct.c */
 269
 270 struct block* get_block_by_index (struct block* tree, int index, int len, int pos) {
 271         if (pos >= len) return NULL;
 272         if (tree[pos].num == index) return &tree[pos];
 273         if (index < tree[pos].num) return get_block_by_index (tree, index, len, 2*pos+1);
 274         if (index > tree[pos].num) return get_block_by_index (tree, index, len, 2*pos+2);
 275
 276         return NULL; //will never be reached
 277 }
 278
 279 struct block* get_block_by_start (struct block* tree, int start, int len, int pos) {
 280         if (pos >= len) return NULL;
 281         if (tree[pos].start == start) return &tree[pos];
 282         if (start < tree[pos].start) return get_block_by_start (tree, start, len, 2*pos+1);
 283         if (start > tree[pos].start) return get_block_by_start (tree, start, len, 2*pos+2);
 284
 285         return NULL; //will never be reached
 286 }
 287
 288 //convenience functions (hardcoded tree-struct):
 289 int tree_find_index (int block) {
 290         struct block* node = get_block_by_start (b65k_tree, block, B65536_TREE_SIZE, 0);
 291         if (node == NULL) return NOT_FOUND;
 292         return node->num;
 293 }
 294
 295 int tree_find_block(int index) {
 296         struct block* node = get_block_by_index (b65k_tree, index, B65536_TREE_SIZE, 0);
 297         if (node == NULL) return NOT_FOUND;
 298         return node->start;
 299 }
 300
 301
 302 /* base65536_encode_char: convert two bytes to base65536 (unicode-codepoint).
 303         in_buf: bytes to encode casted to int. [1] may be EOF.
 304         is_eof: if true, b2 will be B65K_EOF.
 305         returns: number of bytes processed.
 306 */
 307 int base65536_encode_char (const int* in_buf) {
 308         int b1 = in_buf[0];
 309         int b2 = in_buf[1];
 310
 311         return tree_find_block (b2) + b1;
 312 }
 313
 314 /* base65536_decode_char: decode base65536-codepoint to bytes.
 315         out_buf: expects an array of size of two integers to write to.
 316         returns: number of bytes processed, zero on error.
 317 */
 318 int base65536_decode_char (const int in_cp, int* out_buf) {
 319         int b1, b2;
 320
 321         b1 = in_cp & ((1 << 8) -1);
 322         b2 = tree_find_index (in_cp-b1);
 323
 324         out_buf[0] = b1;
 325         out_buf[1] = b2;
 326
 327         if (b2 == NOT_FOUND) return 0;
 328         return (b2 != B65K_EOF) + 1;
 329 }
 330
 331 /* codepoint_to_utf8: utf8-encode given codepoint.
 332         unicode: codepoint to covert.
 333         buf: buffer to write to; expected to be of size 5.
 334         returns: number of bytes written or zero on error.
 335 */
 336 int codepoint_to_utf8 (int unicode, char* buf) {
 337         if (unicode < 0x80) {
 338                 buf[0] = unicode;
 339                 buf[1] = '\0';
 340                 return 1;
 341         } else if (unicode < 0x800) {
 342                 buf[0] = 0xc0 | (unicode>> 6 & 0x1f);
 343                 buf[1] = 0x80 | (unicode     & 0x3f);
 344                 buf[2] = '\0';
 345                 return 2;
 346         } else if (unicode >=0xd800 && unicode < 0x8000) {
 347                 return 0; /*is invalid code block*/
 348         } else if (unicode < 0x10000) {
 349                 buf[0] = 0xe0 | (unicode>>12 & 0x0f);
 350                 buf[1] = 0x80 | (unicode>> 6 & 0x3f);
 351                 buf[2] = 0x80 | (unicode     & 0x3f);
 352                 buf[3] = '\0';
 353                 return 3;
 354         } else {
 355                 buf[0] = 0xf0 | (unicode>>18 & 0x07);
 356                 buf[1] = 0x80 | (unicode>>12 & 0x3f);
 357                 buf[2] = 0x80 | (unicode>> 6 & 0x3f);
 358                 buf[3] = 0x80 | (unicode     & 0x3f);
 359                 buf[4] = '\0';
 360                 return 4;
 361
 362         }
 363         return 0;
 364 }
 365
 366 /* utf8_to_codepoint: decodes a utf-8 character to its unicode-codepoint.
 367         buf: buffer to read from, expected to be of size 5.
 368         returns: unicode codepoint.
 369         WARN: will not detect non-utf8-sequence!
 370 */
 371 int utf8_to_codepoint (char* buf) {
 372         int codepoint = 0; //cummulative
 373         int l = 0;
 374
 375         for (char first = buf[0];;) {
 376                 //this loop counts the number of consecutive binary 1s at the
 377                 //start of the byte to determine, how many follow-up bytes
 378                 //(l-1) will come.
 379                 if (first>>7) l++;
 380                 else break;
 381                 first <<= 1;
 382         }
 383
 384         if (l == 0) { //ascii chars are 01xxxxxx
 385                 codepoint = buf[0];
 386         } else {
 387                 for (int i = l-1; i; i--) {
 388                         //take lower 6bits of every byte and stick them end to end.
 389                         codepoint += (buf[i] & ((1<<6)-1)) << ((l-1)-i)*6;
 390                 }
 391                 //add the bits from the first byte to the beginning, but not
 392                 //the 1s that determine the number of follow-up-bytes
 393                 codepoint += (buf[0] & ((1<<(7-l))-1)) << ((l-1))*6;
 394         }
 395         return codepoint;
 396 }