//***************************************************************** // // $file: utf8.c $ // $author: Martin Fouilleul $ // $date: 05/11/2016 $ // $revision: $ // $note: (C) 2016 by Martin Fouilleul - all rights reserved $ // //***************************************************************** #include "utf8.h" //----------------------------------------------------------------- // utf-8 gore //----------------------------------------------------------------- static const u32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; static const char trailingBytesForUTF8[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; #define oc_utf8_is_start_byte(c) (((c)&0xc0) != 0x80) //----------------------------------------------------------------- //NOTE: getting sizes / offsets / indices //----------------------------------------------------------------- u32 oc_utf8_size_from_leading_char(char leadingChar) { return (trailingBytesForUTF8[(unsigned int)(unsigned char)leadingChar] + 1); } u32 oc_utf8_codepoint_size(oc_utf32 codePoint) { if(codePoint < 0x80) { return (1); } if(codePoint < 0x800) { return (2); } if(codePoint < 0x10000) { return (3); } if(codePoint < 0x110000) { return (4); } return (0); } u64 oc_utf8_codepoint_count_for_string(oc_str8 string) { u64 byteOffset = 0; u64 codePointIndex = 0; for(; (byteOffset < string.len) && (string.ptr[byteOffset] != 0); codePointIndex++) { oc_utf8_dec decode = oc_utf8_decode_at(string, byteOffset); byteOffset += decode.size; } return (codePointIndex); } u64 oc_utf8_byte_count_for_codepoints(oc_str32 codePoints) { //NOTE(martin): return the exact number of bytes taken by the encoded // version of codePoints. (ie do not attempt to provision // for a zero terminator). u64 byteCount = 0; for(u64 i = 0; i < codePoints.len; i++) { byteCount += oc_utf8_codepoint_size(codePoints.ptr[i]); } return (byteCount); } u64 oc_utf8_next_offset(oc_str8 string, u64 byteOffset) { u64 res = 0; if(byteOffset >= string.len) { res = string.len; } else { u64 nextOffset = byteOffset + oc_utf8_size_from_leading_char(string.ptr[byteOffset]); res = oc_min(nextOffset, string.len); } return (res); } u64 oc_utf8_prev_offset(oc_str8 string, u64 byteOffset) { u64 res = 0; if(byteOffset > string.len) { res = string.len; } else if(byteOffset) { byteOffset--; while(byteOffset > 0 && !oc_utf8_is_start_byte(string.ptr[byteOffset])) { byteOffset--; } res = byteOffset; } return (res); } //----------------------------------------------------------------- //NOTE: encoding / decoding //----------------------------------------------------------------- oc_utf8_dec oc_utf8_decode_at(oc_str8 string, u64 offset) { //NOTE(martin): get the first codepoint in str, and advance index to the // next oc_utf8 character //TODO(martin): check for utf-16 surrogate pairs oc_utf32 cp = 0; u64 sz = 0; if(offset >= string.len || !string.ptr[offset]) { cp = 0; sz = 1; } else if(!oc_utf8_is_start_byte(string.ptr[offset])) { //NOTE(martin): unexpected continuation or invalid character. cp = 0xfffd; sz = 1; } else { int expectedSize = oc_utf8_size_from_leading_char(string.ptr[offset]); do { /*NOTE(martin): we shift 6 bits and add the next byte at each round. at the end we have our oc_utf8 codepoint, added to the shifted versions of the oc_utf8 leading bits for each encoded byte. These values are precomputed in offsetsFromUTF8. */ unsigned char b = string.ptr[offset]; cp <<= 6; cp += b; offset += 1; sz++; if(b == 0xc0 || b == 0xc1 || b >= 0xc5) { //NOTE(martin): invalid byte encountered break; } } while(offset < string.len && string.ptr[offset] && !oc_utf8_is_start_byte(string.ptr[offset]) && sz < expectedSize); if(sz != expectedSize) { //NOTE(martin): if we encountered an error, we return the replacement codepoint U+FFFD cp = 0xfffd; } else { cp -= offsetsFromUTF8[sz - 1]; //NOTE(martin): check for invalid codepoints if(cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff)) { cp = 0xfffd; } } } oc_utf8_dec res = { .codepoint = cp, .size = sz }; return (res); } oc_utf8_dec oc_utf8_decode(oc_str8 string) { return (oc_utf8_decode_at(string, 0)); } oc_str8 oc_utf8_encode(char* dest, oc_utf32 codePoint) { u64 sz = 0; if(codePoint < 0x80) { dest[0] = (char)codePoint; sz = 1; } else if(codePoint < 0x800) { dest[0] = (codePoint >> 6) | 0xC0; dest[1] = (codePoint & 0x3F) | 0x80; sz = 2; } else if(codePoint < 0x10000) { dest[0] = (codePoint >> 12) | 0xE0; dest[1] = ((codePoint >> 6) & 0x3F) | 0x80; dest[2] = (codePoint & 0x3F) | 0x80; sz = 3; } else if(codePoint < 0x110000) { dest[0] = (codePoint >> 18) | 0xF0; dest[1] = ((codePoint >> 12) & 0x3F) | 0x80; dest[2] = ((codePoint >> 6) & 0x3F) | 0x80; dest[3] = (codePoint & 0x3F) | 0x80; sz = 4; } oc_str8 res = { .len = sz, .ptr = dest }; return (res); } oc_str32 oc_utf8_to_codepoints(u64 maxCount, oc_utf32* backing, oc_str8 string) { u64 codePointIndex = 0; u64 byteOffset = 0; for(; codePointIndex < maxCount && byteOffset < string.len; codePointIndex++) { oc_utf8_dec decode = oc_utf8_decode_at(string, byteOffset); backing[codePointIndex] = decode.codepoint; byteOffset += decode.size; } oc_str32 res = { .len = codePointIndex, .ptr = backing }; return (res); } oc_str8 oc_utf8_from_codepoints(u64 maxBytes, char* backing, oc_str32 codePoints) { u64 byteOffset = 0; for(u64 codePointIndex = 0; (codePointIndex < codePoints.len); codePointIndex++) { oc_utf32 codePoint = codePoints.ptr[codePointIndex]; u32 byteCount = oc_utf8_codepoint_size(codePoint); if(byteOffset + byteCount > maxBytes) { break; } oc_utf8_encode(backing + byteOffset, codePoint); byteOffset += byteCount; } oc_str8 res = { .len = byteOffset, .ptr = backing }; return (res); } oc_str32 oc_utf8_push_to_codepoints(oc_arena* arena, oc_str8 string) { u64 count = oc_utf8_codepoint_count_for_string(string); oc_utf32* backing = oc_arena_push_array(arena, oc_utf32, count); oc_str32 res = oc_utf8_to_codepoints(count, backing, string); return (res); } oc_str8 oc_utf8_push_from_codepoints(oc_arena* arena, oc_str32 codePoints) { u64 count = oc_utf8_byte_count_for_codepoints(codePoints); char* backing = oc_arena_push_array(arena, char, count); oc_str8 res = oc_utf8_from_codepoints(count, backing, codePoints); return (res); } #define OC_UNICODE_RANGE(start, cnt, name) ORCA_API const oc_unicode_range OC_CAT2(OC_UNICODE_, name) = { .firstCodePoint = start, .count = cnt }; OC_UNICODE_RANGES #undef OC_UNICODE_RANGE