]>
git.gir.st - base65536.git/blob - base65536.c
1 /* base65536 semi-library.
2 (C) 2016 Tobias Girstmair, http://isticktoit.net/
3 Released under the GNU GPL v3. See LICENSE for details. */
/* Block lookup table. get_block_by_index() and get_block_by_start() treat it
   as a binary tree stored in an array: the search starts at position 0 and
   moves from position p to 2*p+1 (smaller key) or 2*p+2 (larger key), with
   .num or .start as the key.
   NOTE(review): the initializer rows are not visible in this view. */
10 struct block b65k_tree
[] = {
268 }; /* autogenerated with generate_struct.c */
270 struct block
* get_block_by_index (struct block
* tree
, int index
, int len
, int pos
) {
271 if (pos
>= len
) return NULL
;
272 if (tree
[pos
].num
== index
) return &tree
[pos
];
273 if (index
< tree
[pos
].num
) return get_block_by_index (tree
, index
, len
, 2*pos
+1);
274 if (index
> tree
[pos
].num
) return get_block_by_index (tree
, index
, len
, 2*pos
+2);
276 return NULL
; //will never be reached
279 struct block
* get_block_by_start (struct block
* tree
, int start
, int len
, int pos
) {
280 if (pos
>= len
) return NULL
;
281 if (tree
[pos
].start
== start
) return &tree
[pos
];
282 if (start
< tree
[pos
].start
) return get_block_by_start (tree
, start
, len
, 2*pos
+1);
283 if (start
> tree
[pos
].start
) return get_block_by_start (tree
, start
, len
, 2*pos
+2);
285 return NULL
; //will never be reached
288 //convenience functions (hardcoded tree-struct):
/* tree_find_index: look up `block` as a .start value in b65k_tree, searching
   from position 0 over B65536_TREE_SIZE entries.
   returns: NOT_FOUND when no entry has that .start value.
   NOTE(review): the success-path return is not visible in this view;
   presumably it yields the matching entry's .num -- confirm. */
289 int tree_find_index (int block
) {
290 struct block
* node
= get_block_by_start (b65k_tree
, block
, B65536_TREE_SIZE
, 0);
291 if (node
== NULL
) return NOT_FOUND
;
/* tree_find_block: look up `index` as a .num value in b65k_tree, searching
   from position 0 over B65536_TREE_SIZE entries.
   returns: NOT_FOUND when no entry has that .num value.
   NOTE(review): the success-path return is not visible in this view;
   presumably it yields the matching entry's .start -- confirm. */
295 int tree_find_block(int index
) {
296 struct block
* node
= get_block_by_index (b65k_tree
, index
, B65536_TREE_SIZE
, 0);
297 if (node
== NULL
) return NOT_FOUND
;
302 /* base65536_encode_char: convert two bytes to base65536 (unicode-codepoint).
303 in_buf: bytes to encode, cast to int. [1] may be EOF.
304 (there is no is_eof parameter; NOTE(review): presumably b2 becomes B65K_EOF when in_buf[1] is EOF -- confirm.)
305 returns: tree_find_block(b2) + b1, i.e. the encoded codepoint.
307 int base65536_encode_char (const int* in_buf
) {
// NOTE(review): the statements that define b1 and b2 are not visible in this
// view; presumably they come from in_buf[0] and in_buf[1] -- confirm.
// The result is whatever tree_find_block() yields for b2, plus b1.
311 return tree_find_block (b2
) + b1
;
314 /* base65536_decode_char: decode base65536-codepoint to bytes.
315 out_buf: expects an array of two integers to write to.
316 returns: number of bytes processed, zero on error.
318 int base65536_decode_char (const int in_cp
, int* out_buf
) {
// NOTE(review): the declarations of b1/b2 and the writes to out_buf are not
// visible in this view.
// b1 is the low 8 bits of the codepoint.
321 b1
= in_cp
& ((1 << 8) -1);
// b2 is looked up from the codepoint with those low 8 bits removed.
322 b2
= tree_find_index (in_cp
-b1
);
// No matching table entry: report zero bytes.
327 if (b2
== NOT_FOUND
) return 0;
// 2 in the normal case, 1 when b2 is the B65K_EOF marker.
328 return (b2
!= B65K_EOF
) + 1;
331 /* codepoint_to_utf8: utf8-encode given codepoint.
332 unicode: codepoint to convert.
333 buf: buffer to write to; expected to be of size 5.
334 returns: number of bytes written or zero on error.
/* Encodes `unicode` as UTF-8 into `buf` (at least 5 chars: up to four encoded
   bytes plus a terminating NUL).
   returns: the number of encoded bytes (1..4, not counting the NUL), or 0 --
   leaving buf untouched -- when `unicode` is negative, a UTF-16 surrogate
   (U+D800..U+DFFF) or above U+10FFFF. */
int codepoint_to_utf8 (int unicode, char* buf) {
	int len;

	if (unicode < 0) {
		return 0; /* not a codepoint */
	} else if (unicode < 0x80) {
		/* 0xxxxxxx */
		buf[0] = (char) unicode;
		len = 1;
	} else if (unicode < 0x800) {
		/* 110xxxxx 10xxxxxx */
		buf[0] = (char) (0xc0 | (unicode >> 6 & 0x1f));
		buf[1] = (char) (0x80 | (unicode & 0x3f));
		len = 2;
	} else if (unicode >= 0xd800 && unicode < 0xe000) {
		/* Surrogates have no UTF-8 form. The upper bound was 0x8000 before,
		   which made this check impossible to satisfy. */
		return 0;
	} else if (unicode < 0x10000) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		buf[0] = (char) (0xe0 | (unicode >> 12 & 0x0f));
		buf[1] = (char) (0x80 | (unicode >> 6 & 0x3f));
		buf[2] = (char) (0x80 | (unicode & 0x3f));
		len = 3;
	} else if (unicode < 0x110000) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		buf[0] = (char) (0xf0 | (unicode >> 18 & 0x07));
		buf[1] = (char) (0x80 | (unicode >> 12 & 0x3f));
		buf[2] = (char) (0x80 | (unicode >> 6 & 0x3f));
		buf[3] = (char) (0x80 | (unicode & 0x3f));
		len = 4;
	} else {
		return 0; /* beyond the Unicode range */
	}

	buf[len] = '\0';
	return len;
}
366 /* utf8_to_codepoint: decodes a utf-8 character to its unicode-codepoint.
367 buf: buffer to read from, expected to be of size 5.
368 returns: unicode codepoint.
369 WARN: will not detect non-utf8-sequence!
371 int utf8_to_codepoint (char* buf
) {
// NOTE(review): the declaration and computation of `l`, and the return
// statements, are not visible in this view.
372 int codepoint
= 0; //cumulative: the decoded bits are added in piece by piece
375 for (char first
= buf
[0];;) {
376 //this loop counts the number of consecutive binary 1s at the
377 //start of the byte to determine how many follow-up bytes
384 if (l
== 0) { //no leading 1 bits: ASCII bytes are 0xxxxxxx
// i runs from l-1 down to 1; buf[0] is handled separately further down.
387 for (int i
= l
-1; i
; i
--) {
388 //take the lower 6 bits of every byte and stick them end to end.
389 codepoint
+= (buf
[i
] & ((1<<6)-1)) << ((l
-1)-i
)*6;
391 //add the bits from the first byte to the beginning, but not
392 //the 1s that determine the number of follow-up bytes
393 codepoint
+= (buf
[0] & ((1<<(7-l
))-1)) << ((l
-1))*6;