orca/src/util/utf8.c

//*****************************************************************
//
//	$file: utf8.c $
//	$author: Martin Fouilleul $
//	$date: 05/11/2016 $
//	$revision: $
//	$note: (C) 2016 by Martin Fouilleul - all rights reserved $
//
//*****************************************************************
#include "utf8.h"

//-----------------------------------------------------------------
//	utf-8 gore
//-----------------------------------------------------------------
//NOTE: indexed by (sequence length - 1). The decoder accumulates the raw bytes of a
//      sequence (shift left by 6, add next byte) without masking off the UTF-8 marker
//      bits; subtracting the matching entry removes the contribution of those markers.
//      e.g. for 2 bytes: (0xC0 << 6) + 0x80 = 0x3080. The 5- and 6-byte entries are the
//      same sums truncated to 32 bits.
static const u32 offsetsFromUTF8[6] = {
    0x00000000UL, 0x00003080UL, 0x000E2080UL,
    0x03C82080UL, 0xFA082080UL, 0x82082080UL
};

//NOTE: indexed by the value of a byte; gives the number of bytes that follow it when
//      that byte is used as the first byte of a sequence:
//        0x00..0xBF -> 0      0xC0..0xDF -> 1      0xE0..0xEF -> 2
//        0xF0..0xF7 -> 3      0xF8..0xFB -> 4      0xFC..0xFF -> 5
//      The table does no validation: continuation bytes (0x80..0xBF) map to 0, and
//      bytes that never appear in well-formed UTF-8 still get an entry.
static const char trailingBytesForUTF8[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};

//NOTE: true for any byte whose two top bits are not 10, i.e. any byte that is not a
//      UTF-8 continuation byte (0x80..0xBF). This includes ASCII bytes and the zero byte.
#define oc_utf8_is_start_byte(c) (((c)&0xc0) != 0x80)

//-----------------------------------------------------------------
//NOTE: getting sizes / offsets / indices
//-----------------------------------------------------------------

//NOTE: returns the total length in bytes (lead byte included) of the sequence announced
//      by leadingChar, i.e. its trailingBytesForUTF8 entry plus one. The result is in
//      1..6 and is not a validity check: continuation bytes report a size of 1.
u32 oc_utf8_size_from_leading_char(char leadingChar)
{
    //NOTE: go through unsigned char so that bytes >= 0x80 never yield a negative index
    unsigned char lead = (unsigned char)leadingChar;
    u32 trailingCount = trailingBytesForUTF8[lead];
    return (trailingCount + 1);
}

//NOTE: returns the number of bytes needed to encode codePoint in UTF-8 (1 to 4),
//      or 0 if codePoint is 0x110000 or above and therefore can't be encoded.
u32 oc_utf8_codepoint_size(oc_utf32 codePoint)
{
    //NOTE: stays 0 when no range below matches
    u32 size = 0;

    if(codePoint < 0x80)
    {
        size = 1;
    }
    else if(codePoint < 0x800)
    {
        size = 2;
    }
    else if(codePoint < 0x10000)
    {
        size = 3;
    }
    else if(codePoint < 0x110000)
    {
        size = 4;
    }
    return (size);
}

//NOTE: counts the codepoints decoded from string, stopping at string.len or at the
//      first zero byte, whichever comes first. Each call to oc_utf8_decode_at() counts
//      as one codepoint, including the ones it reports as U+FFFD.
u64 oc_utf8_codepoint_count_for_string(oc_str8 string)
{
    u64 count = 0;
    u64 cursor = 0;

    while(cursor < string.len && string.ptr[cursor] != 0)
    {
        cursor += oc_utf8_decode_at(string, cursor).size;
        count++;
    }
    return (count);
}

//NOTE: returns the exact number of bytes the UTF-8 encoding of codePoints occupies.
//      No room is provisioned for a zero terminator. Codepoints for which
//      oc_utf8_codepoint_size() returns 0 contribute nothing to the total.
u64 oc_utf8_byte_count_for_codepoints(oc_str32 codePoints)
{
    u64 total = 0;
    u64 index = 0;

    while(index < codePoints.len)
    {
        total += oc_utf8_codepoint_size(codePoints.ptr[index]);
        index++;
    }
    return (total);
}

//NOTE: returns the byte offset following the sequence that starts at byteOffset.
//      The step is taken from the lead byte alone (the sequence is not validated),
//      and the result is clamped to string.len. An offset already at or past the end
//      yields string.len.
u64 oc_utf8_next_offset(oc_str8 string, u64 byteOffset)
{
    if(byteOffset >= string.len)
    {
        return (string.len);
    }

    u64 step = oc_utf8_size_from_leading_char(string.ptr[byteOffset]);
    u64 advanced = byteOffset + step;
    return (oc_min(advanced, string.len));
}

//NOTE: returns the offset of the closest non-continuation byte strictly before
//      byteOffset, or 0 if the scan reaches the beginning of the string first.
//      byteOffset == 0 yields 0, and an offset beyond string.len yields string.len.
u64 oc_utf8_prev_offset(oc_str8 string, u64 byteOffset)
{
    if(byteOffset > string.len)
    {
        return (string.len);
    }
    if(byteOffset == 0)
    {
        return (0);
    }

    //NOTE: step back once, then keep going while we are on a continuation byte
    u64 cursor = byteOffset - 1;
    while(cursor > 0)
    {
        if(oc_utf8_is_start_byte(string.ptr[cursor]))
        {
            break;
        }
        cursor--;
    }
    return (cursor);
}

//-----------------------------------------------------------------
//NOTE: encoding / decoding
//-----------------------------------------------------------------

//NOTE: decodes the UTF-8 sequence starting at byte `offset` of `string`, and returns
//      the codepoint together with the number of bytes consumed:
//        - offset at/past the end, or zero byte at offset: codepoint 0, size 1
//        - continuation byte at offset:                    U+FFFD, size 1
//        - lead byte 0xC0, 0xC1 or 0xF5..0xFF:             U+FFFD, size 1
//        - sequence cut short (end of string, zero byte or a new start byte):
//                                                          U+FFFD, size = bytes read so far
//        - complete sequence that decodes to a surrogate, to a value above U+10FFFF,
//          or that is an overlong encoding:                U+FFFD, size = sequence length
//      The returned size is always at least 1, so a caller that advances by it
//      always makes progress.
oc_utf8_dec oc_utf8_decode_at(oc_str8 string, u64 offset)
{
    oc_utf32 cp = 0;
    u64 sz = 0;

    if(offset >= string.len || !string.ptr[offset])
    {
        cp = 0;
        sz = 1;
    }
    else if(!oc_utf8_is_start_byte(string.ptr[offset]))
    {
        //NOTE(martin): unexpected continuation or invalid character.
        cp = 0xfffd;
        sz = 1;
    }
    else
    {
        u64 expectedSize = oc_utf8_size_from_leading_char(string.ptr[offset]);
        do
        {
            /*NOTE(martin):
                we shift 6 bits and add the next byte at each round.
                at the end we have our codepoint, added to the shifted versions
                of the utf8 leading bits for each encoded byte. These values are
                precomputed in offsetsFromUTF8.
            */
            unsigned char b = string.ptr[offset];
            cp <<= 6;
            cp += b;
            offset += 1;
            sz++;

            //NOTE: 0xC0 and 0xC1 could only start overlong 2-byte sequences, and
            //      0xF5..0xFF would start sequences above U+10FFFF or longer than
            //      4 bytes: none of them can appear in well-formed UTF-8. The bound
            //      must be 0xF5 here; using 0xC5 would reject the valid lead bytes
            //      0xC5..0xF4. Continuation bytes are always below 0xC0, so this
            //      only ever triggers on the lead byte.
            if(b == 0xc0 || b == 0xc1 || b >= 0xf5)
            {
                //NOTE(martin): invalid byte encountered
                break;
            }

        } while(offset < string.len
                && string.ptr[offset]
                && !oc_utf8_is_start_byte(string.ptr[offset])
                && sz < expectedSize);

        if(sz != expectedSize)
        {
            //NOTE(martin): if we encountered an error, we return the replacement codepoint U+FFFD
            cp = 0xfffd;
        }
        else
        {
            cp -= offsetsFromUTF8[sz - 1];

            //NOTE(martin): check for invalid codepoints
            if(cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff))
            {
                cp = 0xfffd;
            }
            else if(oc_utf8_codepoint_size(cp) != sz)
            {
                //NOTE: overlong encoding, i.e. the codepoint would fit in a shorter
                //      sequence than the one used (e.g. E0 80 80 for U+0000).
                cp = 0xfffd;
            }
        }
    }
    oc_utf8_dec res = { .codepoint = cp, .size = sz };
    return (res);
}

//NOTE: decodes the first sequence of string. Same as oc_utf8_decode_at() with offset 0.
oc_utf8_dec oc_utf8_decode(oc_str8 string)
{
    oc_utf8_dec first = oc_utf8_decode_at(string, 0);
    return (first);
}

//NOTE: writes the UTF-8 encoding of codePoint at dest, and returns a string pointing at
//      dest whose length is the number of bytes written (1 to 4). No zero terminator is
//      written. For codePoint >= 0x110000 nothing is written and the returned length is 0.
//      dest must have room for the encoded bytes; its capacity is not checked here.
oc_str8 oc_utf8_encode(char* dest, oc_utf32 codePoint)
{
    u64 written = 0;

    if(codePoint < 0x80)
    {
        //NOTE: ASCII, stored as is
        dest[written++] = (char)codePoint;
    }
    else if(codePoint < 0x800)
    {
        //NOTE: 110xxxxx 10xxxxxx
        dest[written++] = 0xC0 | (codePoint >> 6);
        dest[written++] = 0x80 | (codePoint & 0x3F);
    }
    else if(codePoint < 0x10000)
    {
        //NOTE: 1110xxxx 10xxxxxx 10xxxxxx
        dest[written++] = 0xE0 | (codePoint >> 12);
        dest[written++] = 0x80 | ((codePoint >> 6) & 0x3F);
        dest[written++] = 0x80 | (codePoint & 0x3F);
    }
    else if(codePoint < 0x110000)
    {
        //NOTE: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        dest[written++] = 0xF0 | (codePoint >> 18);
        dest[written++] = 0x80 | ((codePoint >> 12) & 0x3F);
        dest[written++] = 0x80 | ((codePoint >> 6) & 0x3F);
        dest[written++] = 0x80 | (codePoint & 0x3F);
    }

    oc_str8 encoded = { .len = written, .ptr = dest };
    return (encoded);
}

//NOTE: decodes string into backing, one codepoint per oc_utf8_decode_at() call, until
//      either maxCount codepoints have been stored or string.len bytes have been consumed.
//      Unlike oc_utf8_codepoint_count_for_string(), the loop doesn't stop on a zero byte.
//      Returns a string over backing whose length is the number of codepoints stored.
oc_str32 oc_utf8_to_codepoints(u64 maxCount, oc_utf32* backing, oc_str8 string)
{
    u64 stored = 0;
    u64 cursor = 0;

    while(stored < maxCount && cursor < string.len)
    {
        oc_utf8_dec step = oc_utf8_decode_at(string, cursor);
        backing[stored] = step.codepoint;
        cursor += step.size;
        stored++;
    }

    oc_str32 decoded = { .len = stored, .ptr = backing };
    return (decoded);
}

//NOTE: encodes codePoints into backing, and stops before the first codepoint whose
//      encoding would go past maxBytes (a codepoint is never written partially).
//      Returns a string over backing whose length is the number of bytes written.
oc_str8 oc_utf8_from_codepoints(u64 maxBytes, char* backing, oc_str32 codePoints)
{
    u64 written = 0;
    u64 index = 0;

    while(index < codePoints.len)
    {
        oc_utf32 current = codePoints.ptr[index];
        u32 needed = oc_utf8_codepoint_size(current);

        if(written + needed > maxBytes)
        {
            //NOTE: not enough room left for this codepoint
            break;
        }
        oc_utf8_encode(backing + written, current);
        written += needed;
        index++;
    }

    oc_str8 encoded = { .len = written, .ptr = backing };
    return (encoded);
}

//NOTE: decodes string into a codepoint array obtained from arena with
//      oc_arena_push_array(). The array is sized with
//      oc_utf8_codepoint_count_for_string(), so decoding stops at the first zero byte
//      of string, if there is one.
oc_str32 oc_utf8_push_to_codepoints(oc_arena* arena, oc_str8 string)
{
    u64 capacity = oc_utf8_codepoint_count_for_string(string);
    oc_utf32* storage = oc_arena_push_array(arena, oc_utf32, capacity);
    return (oc_utf8_to_codepoints(capacity, storage, string));
}

//NOTE: encodes codePoints into a byte buffer obtained from arena with
//      oc_arena_push_array(). The buffer is sized with
//      oc_utf8_byte_count_for_codepoints(), so no room is made for a zero terminator.
oc_str8 oc_utf8_push_from_codepoints(oc_arena* arena, oc_str32 codePoints)
{
    u64 capacity = oc_utf8_byte_count_for_codepoints(codePoints);
    char* storage = oc_arena_push_array(arena, char, capacity);
    return (oc_utf8_from_codepoints(capacity, storage, codePoints));
}

//NOTE: OC_UNICODE_RANGES is not defined in this file (presumably an X-macro list coming
//      from utf8.h -- TODO confirm). With the temporary definition below, each
//      OC_UNICODE_RANGE(start, cnt, name) entry of that list turns into the definition of
//      a const oc_unicode_range with firstCodePoint = start and count = cnt. The variable
//      name is built by OC_CAT2(OC_UNICODE_, name), assumed to paste its two arguments.
#define OC_UNICODE_RANGE(start, cnt, name) ORCA_API const oc_unicode_range OC_CAT2(OC_UNICODE_, name) = { .firstCodePoint = start, .count = cnt };
OC_UNICODE_RANGES
#undef OC_UNICODE_RANGE