orca/src/util/utf8.c

284 lines
8.2 KiB
C

//*****************************************************************
//
// $file: utf8.c $
// $author: Martin Fouilleul $
// $date: 05/11/2016 $
// $revision: $
// $note: (C) 2016 by Martin Fouilleul - all rights reserved $
//
//*****************************************************************
#include "utf8.h"
//-----------------------------------------------------------------
// utf-8 gore
//-----------------------------------------------------------------
//NOTE: offsetsFromUTF8[n - 1] is the value subtracted by oc_utf8_decode_at() from the raw
//      accumulator of an n-byte sequence. The decoder shifts by 6 and adds each whole byte
//      (marker bits included), so the entry is the sum of those shifted marker bits, e.g.
//      for 2 bytes: (0xC0 << 6) + 0x80 = 0x3080,
//      for 3 bytes: (0xE0 << 12) + (0x80 << 6) + 0x80 = 0xE2080.
static const u32 offsetsFromUTF8[6] = {
0x00000000UL, 0x00003080UL, 0x000E2080UL,
0x03C82080UL, 0xFA082080UL, 0x82082080UL
};
//NOTE: indexed by the value of a byte (0..255), gives the number of bytes that follow it
//      when that byte is used as the leading byte of a sequence:
//          0x00..0xBF -> 0    0xC0..0xDF -> 1    0xE0..0xEF -> 2
//          0xF0..0xF7 -> 3    0xF8..0xFB -> 4    0xFC..0xFF -> 5
//      oc_utf8_size_from_leading_char() adds 1 to this value to get the full sequence size.
static const char trailingBytesForUTF8[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
//NOTE: evaluates to non-zero when the top two bits of c are not '10', i.e. when c is
//      not a continuation byte (0x80..0xBF).
#define oc_utf8_is_start_byte(c) (((c)&0xc0) != 0x80)
//-----------------------------------------------------------------
//NOTE: getting sizes / offsets / indices
//-----------------------------------------------------------------
u32 oc_utf8_size_from_leading_char(char leadingChar)
{
return (trailingBytesForUTF8[(unsigned int)(unsigned char)leadingChar] + 1);
}
u32 oc_utf8_codepoint_size(oc_utf32 codePoint)
{
    //NOTE: returns the number of bytes needed to encode codePoint in utf-8,
    //      or 0 if codePoint is above the last unicode codepoint (U+10FFFF).
    u32 size = 0;
    if(codePoint <= 0x7f)
    {
        size = 1;
    }
    else if(codePoint <= 0x7ff)
    {
        size = 2;
    }
    else if(codePoint <= 0xffff)
    {
        size = 3;
    }
    else if(codePoint <= 0x10ffff)
    {
        size = 4;
    }
    return (size);
}
u64 oc_utf8_codepoint_count_for_string(oc_str8 string)
{
    //NOTE: counts the codepoints decoded from string, stopping at the end of
    //      the string or at the first zero byte, whichever comes first.
    u64 count = 0;
    u64 cursor = 0;
    while(cursor < string.len && string.ptr[cursor] != 0)
    {
        //NOTE: each decoded sequence (valid or not) counts as one codepoint
        cursor += oc_utf8_decode_at(string, cursor).size;
        count++;
    }
    return (count);
}
u64 oc_utf8_byte_count_for_codepoints(oc_str32 codePoints)
{
    //NOTE: sums the encoded size of every codepoint. This is the exact byte count
    //      of the utf-8 version of codePoints: no room is provisioned for a
    //      zero terminator.
    u64 total = 0;
    u64 index = 0;
    while(index < codePoints.len)
    {
        total += oc_utf8_codepoint_size(codePoints.ptr[index]);
        index++;
    }
    return (total);
}
u64 oc_utf8_next_offset(oc_str8 string, u64 byteOffset)
{
    //NOTE: returns the offset of the sequence following the one starting at byteOffset.
    //      The result is clamped to string.len.
    if(byteOffset >= string.len)
    {
        //NOTE: already at or past the end
        return (string.len);
    }

    u64 sequenceSize = oc_utf8_size_from_leading_char(string.ptr[byteOffset]);
    u64 advanced = byteOffset + sequenceSize;
    return (oc_min(advanced, string.len));
}
u64 oc_utf8_prev_offset(oc_str8 string, u64 byteOffset)
{
    //NOTE: returns the offset of the closest start byte located strictly before byteOffset
    //      (or 0 if there is none). An offset past the end of the string yields string.len.
    if(byteOffset > string.len)
    {
        return (string.len);
    }
    if(byteOffset == 0)
    {
        return (0);
    }

    //NOTE: step back one byte, then skip continuation bytes until we hit a start byte
    //      or the beginning of the string.
    u64 cursor = byteOffset - 1;
    while(cursor > 0 && !oc_utf8_is_start_byte(string.ptr[cursor]))
    {
        cursor--;
    }
    return (cursor);
}
//-----------------------------------------------------------------
//NOTE: encoding / decoding
//-----------------------------------------------------------------
oc_utf8_dec oc_utf8_decode_at(oc_str8 string, u64 offset)
{
    //NOTE(martin): decode the utf-8 sequence starting at offset in string.
    //
    //              Returns the decoded codepoint and the number of bytes consumed, so that
    //              adding the returned size to offset gives the start of the next sequence.
    //              The returned size is always at least 1.
    //
    //              - if offset is past the end of string, or points to a zero byte, we return
    //                codepoint 0 with size 1.
    //              - if the sequence is malformed (stray continuation byte, byte that can never
    //                appear in utf-8, truncated sequence, overlong encoding, surrogate, or value
    //                above U+10FFFF), we return the replacement codepoint U+FFFD.
    oc_utf32 cp = 0;
    u64 sz = 0;

    if(offset >= string.len || !string.ptr[offset])
    {
        cp = 0;
        sz = 1;
    }
    else if(!oc_utf8_is_start_byte(string.ptr[offset]))
    {
        //NOTE(martin): unexpected continuation or invalid character.
        cp = 0xfffd;
        sz = 1;
    }
    else
    {
        u64 expectedSize = oc_utf8_size_from_leading_char(string.ptr[offset]);

        do
        {
            /*NOTE(martin):
                we shift 6 bits and add the next byte at each round.
                at the end we have our oc_utf8 codepoint, added to the shifted versions
                of the oc_utf8 leading bits for each encoded byte. These values are
                precomputed in offsetsFromUTF8.
            */
            unsigned char b = string.ptr[offset];
            cp <<= 6;
            cp += b;
            offset += 1;
            sz++;

            //NOTE: bytes 0xC0, 0xC1 and 0xF5..0xFF never appear in well-formed utf-8.
            //      All other lead bytes (0xC2..0xF4) are legal and must be accepted.
            if(b == 0xc0 || b == 0xc1 || b >= 0xf5)
            {
                //NOTE(martin): invalid byte encountered
                break;
            }
        } while(offset < string.len
                && string.ptr[offset]
                && !oc_utf8_is_start_byte(string.ptr[offset])
                && sz < expectedSize);

        if(sz != expectedSize)
        {
            //NOTE(martin): if we encountered an error, we return the replacement codepoint U+FFFD
            cp = 0xfffd;
        }
        else
        {
            //NOTE: invalid lead bytes broke out of the loop above, so here sz is in 1..4
            cp -= offsetsFromUTF8[sz - 1];

            //NOTE(martin): check for invalid codepoints: values above the unicode range,
            //              utf-16 surrogates, and overlong encodings (a codepoint encoded with
            //              more bytes than necessary). Overlong 2-byte sequences are already
            //              excluded by the rejection of lead bytes 0xC0 and 0xC1.
            if(cp > 0x10ffff
               || (cp >= 0xd800 && cp <= 0xdfff)
               || (sz == 3 && cp < 0x800)
               || (sz == 4 && cp < 0x10000))
            {
                cp = 0xfffd;
            }
        }
    }
    oc_utf8_dec res = { .codepoint = cp, .size = sz };
    return (res);
}
oc_utf8_dec oc_utf8_decode(oc_str8 string)
{
    //NOTE: decode the sequence located at the very beginning of string
    oc_utf8_dec first = oc_utf8_decode_at(string, 0);
    return (first);
}
oc_str8 oc_utf8_encode(char* dest, oc_utf32 codePoint)
{
    //NOTE: writes the utf-8 encoding of codePoint into dest (1 to 4 bytes, no zero terminator),
    //      and returns a string pointing to dest whose length is the number of bytes written.
    //      If codePoint is above U+10FFFF, nothing is written and the returned length is 0.

    //NOTE: marker bits of the leading byte, indexed by the size of the sequence
    static const unsigned char leadMarkers[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

    u64 size = 0;
    if(codePoint < 0x80)
    {
        size = 1;
    }
    else if(codePoint < 0x800)
    {
        size = 2;
    }
    else if(codePoint < 0x10000)
    {
        size = 3;
    }
    else if(codePoint < 0x110000)
    {
        size = 4;
    }

    if(size)
    {
        //NOTE: fill continuation bytes from last to first, each carrying the 6 lowest
        //      remaining bits. What is left then goes into the leading byte.
        oc_utf32 remaining = codePoint;
        for(u64 i = size - 1; i > 0; i--)
        {
            dest[i] = (char)((remaining & 0x3F) | 0x80);
            remaining >>= 6;
        }
        dest[0] = (char)(remaining | leadMarkers[size]);
    }

    oc_str8 encoded = { .len = size, .ptr = dest };
    return (encoded);
}
oc_str32 oc_utf8_to_codepoints(u64 maxCount, oc_utf32* backing, oc_str8 string)
{
    //NOTE: decodes string into backing, writing at most maxCount codepoints. Decoding stops
    //      when either backing is full or the end of string is reached. Returns a string
    //      pointing to backing, whose length is the number of codepoints written.
    u64 written = 0;
    u64 cursor = 0;
    while(written < maxCount && cursor < string.len)
    {
        oc_utf8_dec decoded = oc_utf8_decode_at(string, cursor);
        backing[written] = decoded.codepoint;
        written++;
        cursor += decoded.size;
    }
    oc_str32 result = { .len = written, .ptr = backing };
    return (result);
}
oc_str8 oc_utf8_from_codepoints(u64 maxBytes, char* backing, oc_str32 codePoints)
{
    //NOTE: encodes codePoints into backing, writing at most maxBytes bytes. Encoding stops at
    //      the first codepoint that would not fit entirely. Returns a string pointing to
    //      backing, whose length is the number of bytes written.
    u64 used = 0;
    u64 index = 0;
    while(index < codePoints.len)
    {
        oc_utf32 current = codePoints.ptr[index];
        u32 encodedSize = oc_utf8_codepoint_size(current);
        if(used + encodedSize > maxBytes)
        {
            //NOTE: not enough room left for this codepoint
            break;
        }
        oc_utf8_encode(backing + used, current);
        used += encodedSize;
        index++;
    }
    oc_str8 result = { .len = used, .ptr = backing };
    return (result);
}
oc_str32 oc_utf8_push_to_codepoints(oc_arena* arena, oc_str8 string)
{
    //NOTE: count the codepoints first, so that we can push an array of the right size
    //      on the arena, then decode string into that array.
    u64 codePointCount = oc_utf8_codepoint_count_for_string(string);
    oc_utf32* buffer = oc_arena_push_array(arena, oc_utf32, codePointCount);
    return (oc_utf8_to_codepoints(codePointCount, buffer, string));
}
oc_str8 oc_utf8_push_from_codepoints(oc_arena* arena, oc_str32 codePoints)
{
    //NOTE: compute the encoded size first, so that we can push a buffer of the right size
    //      on the arena, then encode codePoints into that buffer.
    u64 byteCount = oc_utf8_byte_count_for_codepoints(codePoints);
    char* buffer = oc_arena_push_array(arena, char, byteCount);
    return (oc_utf8_from_codepoints(byteCount, buffer, codePoints));
}
//NOTE: define one oc_unicode_range constant per entry of the OC_UNICODE_RANGES list.
//      Each entry expands to a constant named OC_UNICODE_<name>, holding the first codepoint
//      of the range and its codepoint count. OC_UNICODE_RANGES is not defined in this file
//      (presumably it comes from utf8.h -- to be confirmed).
#define OC_UNICODE_RANGE(start, cnt, name) ORCA_API const oc_unicode_range OC_CAT2(OC_UNICODE_, name) = { .firstCodePoint = start, .count = cnt };
OC_UNICODE_RANGES
#undef OC_UNICODE_RANGE