diff --git a/inc/unicode.h b/inc/unicode.h
index c7d3318..d8c0ea6 100644
--- a/inc/unicode.h
+++ b/inc/unicode.h
@@ -14,7 +14,7 @@ typedef int32_t uchar_t;
 #define UNI_EIBYTE (-3)
 #define UNI_ETBYTE (-4)
 #define UNI_ESTRLN (-5)
-
+#define UNI_EOLONG (-6) /* presumably "overlong encoding"; unused in this patch - confirm */
 
 enum {
     UCHAR_BAD,
@@ -127,15 +127,17 @@ int uni_is_hsur(char16_t cp);
 int uni_is_lsur(char16_t cp);
 uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
 
-size_t utf16_chlen(char16_t const *str);
-size_t utf8_chlen (char const *str);
+/* The functions below return int rather than size_t so that the negative
+ * UNI_E* codes can be reported through the same value as a unit count. */
+int utf16_chlen(char16_t const *str);
+int utf8_chlen (char const *str);
 
-size_t utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
-size_t utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
-size_t utf16_dec  (char16_t const *restrict str, uchar_t *restrict ch);
-size_t utf8_dec   (char const *restrict str, uchar_t *restrict ch);
+int utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
+int utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
+int utf16_dec  (char16_t const *restrict str, uchar_t *restrict ch);
+int utf8_dec   (char const *restrict str, uchar_t *restrict ch);
 
-size_t utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
-size_t utf8_enc_s (char *str, size_t len, uchar_t ch);
-size_t utf16_enc  (char16_t *str, uchar_t ch);
-size_t utf8_enc   (char *str, uchar_t ch);
+int utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
+int utf8_enc_s (char *str, size_t len, uchar_t ch);
+int utf16_enc  (char16_t *str, uchar_t ch);
+int utf8_enc   (char *str, uchar_t ch);
diff --git a/src/code/unicode.c b/src/code/unicode.c
index c2008df..00d8c11 100644
--- a/src/code/unicode.c
+++ b/src/code/unicode.c
@@ -53,7 +53,7 @@ int utf16_chlen(char16_t const *str) {
     else return 1;
 }
 
-size_t utf8_chlen(char const *str) {
+int utf8_chlen(char const *str) {
     uint8_t byte0 = (uint8_t)*str;
     if(byte0 < 0x80) return 1;
     else if(byte0 < 0xc0) return UNI_EIBYTE;
@@ -63,8 +63,8 @@ size_t utf8_chlen(char const *str) {
     return UNI_EIBYTE;
 }
 
-size_t utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
-    size_t chlen = 0;
+int utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
+    int chlen = 0;
     uchar_t ch;
     if(uni_is_hsur(str[0])) {
         char16_t hsur = str[0];
@@ -87,13 +87,13 @@ size_t utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
     return chlen;
 }
 
-size_t utf16_dec_s(
+int utf16_dec_s(
     char16_t const *restrict str,
     size_t len,
     uchar_t *restrict chp
 ) {
     if(len == 0) return 0;
-    size_t chlen;
+    int chlen;
     uchar_t ch;
     if(uni_is_hsur(str[0])) {
         if(len < 2) return 0;
@@ -118,10 +118,10 @@ size_t utf16_dec_s(
     return chlen;
 }
 
-size_t utf8_dec(char const *restrict str, uchar_t *restrict chp) {
+int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
     uint8_t const *ustr = (uint8_t const *)str;
 
-    size_t chlen;
+    int chlen;
     uchar_t ch;
     if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
     else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
@@ -130,7 +130,7 @@ size_t utf8_dec(char const *restrict str, uchar_t *restrict chp) {
     else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4;
     else ch = 0xfffd, chlen = UNI_EIBYTE;
 
-    if(chlen > 0) for(size_t i = 1; i < chlen; ++i) {
+    if(chlen > 0) for(int i = 1; i < chlen; ++i) {
         uint8_t trail = ustr[i];
         if((trail & 0xc0) != 0x80) {
             chlen = UNI_ETBYTE;
@@ -148,14 +148,14 @@ size_t utf8_dec(char const *restrict str, uchar_t *restrict chp) {
     return chlen;
 }
 
-size_t utf8_dec_s(
+int utf8_dec_s(
     char const *restrict str,
     size_t len,
     uchar_t *restrict chp
 ) {
     if(len == 0) return 0;
     uint8_t const *restrict ustr = (uint8_t const *restrict)str;
-    size_t chlen;
+    int chlen;
     uchar_t ch;
     if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
     else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
@@ -166,7 +166,8 @@ size_t utf8_dec_s(
-    if(chlen > len) {
+    /* chlen is signed now: test the sign before the unsigned comparison */
+    if(chlen > 0 && (size_t)chlen > len) {
         return UNI_ESTRLN;
     }
-    if(chlen > 0) for(size_t i = 1; i < chlen; ++i) {
+    if(chlen > 0) for(int i = 1; i < chlen; ++i) {
         uint8_t trail = ustr[i];
         if((trail & 0xc0) != 0x80) {
             chlen = UNI_ETBYTE;
@@ -184,3 +185,116 @@ size_t utf8_dec_s(
     return chlen;
 }
 
+/*
+ * Encode code point ch as UTF-16 at str. The buffer is not bounds-checked;
+ * one or two char16_t units are stored. Returns the number of units
+ * written (1 or 2), or UNI_EBADCP when is_valid() rejects ch.
+ */
+int utf16_enc(char16_t *str, uchar_t ch) {
+    if(!is_valid(ch)) {
+        return UNI_EBADCP;
+    }
+    if(ch < 0x10000) {
+        str[0] = (char16_t)ch;
+        return 1;
+    }
+    else {
+        ch -= 0x10000;
+        str[0] = (char16_t)(0xD800 + (ch >> 10));
+        str[1] = (char16_t)(0xDC00 + (ch & 0x3ff));
+        return 2;
+    }
+}
+
+/*
+ * Encode code point ch as UTF-8 at str. The buffer is not bounds-checked;
+ * one to four bytes are stored. Returns the number of bytes written
+ * (1..4), or UNI_EBADCP when is_valid() rejects ch.
+ */
+int utf8_enc(char *str, uchar_t ch) {
+    if(!is_valid(ch)) {
+        return UNI_EBADCP;
+    }
+    if(ch < 0x80) {
+        str[0] = (char)ch;
+        return 1;
+    }
+    else if(ch < 0x800) {
+        str[0] = (char)(0xc0 | (ch >> 6));
+        str[1] = (char)(0x80 | (ch & 0x3f));
+        return 2;
+    }
+    else if(ch < 0x10000) {
+        str[0] = (char)(0xe0 | (ch >> 12));
+        str[1] = (char)(0x80 | ((ch >> 6) & 0x3f));
+        str[2] = (char)(0x80 | (ch & 0x3f));
+        return 3;
+    }
+    else {
+        str[0] = (char)(0xf0 | (ch >> 18));
+        str[1] = (char)(0x80 | ((ch >> 12) & 0x3f));
+        str[2] = (char)(0x80 | ((ch >> 6) & 0x3f));
+        str[3] = (char)(0x80 | (ch & 0x3f));
+        return 4;
+    }
+}
+
+/*
+ * Bounds-checked utf16_enc: str holds len units. Returns units written
+ * (1 or 2), 0 when len is 0, UNI_ESTRLN when the encoding does not fit
+ * (nothing is written), or UNI_EBADCP when is_valid() rejects ch.
+ */
+int utf16_enc_s(char16_t *str, size_t len, uchar_t ch) {
+    if(!is_valid(ch)) {
+        return UNI_EBADCP;
+    }
+    if(len == 0) return 0;
+    if(ch < 0x10000) {
+        str[0] = (char16_t)ch;
+        return 1;
+    }
+    else {
+        if(len < 2) return UNI_ESTRLN;
+        ch -= 0x10000;
+        str[0] = (char16_t)(0xD800 + (ch >> 10));
+        str[1] = (char16_t)(0xDC00 + (ch & 0x3ff));
+        return 2;
+    }
+}
+
+/*
+ * Bounds-checked utf8_enc: str holds len bytes. Returns bytes written
+ * (1..4), 0 when len is 0, UNI_ESTRLN when the encoding does not fit
+ * (nothing is written), or UNI_EBADCP when is_valid() rejects ch.
+ */
+int utf8_enc_s(char *str, size_t len, uchar_t ch) {
+    if(!is_valid(ch)) {
+        return UNI_EBADCP;
+    }
+    if(len == 0) return 0;
+    if(ch < 0x80) {
+        str[0] = (char)ch;
+        return 1;
+    }
+    else if(ch < 0x800) {
+        if(len < 2) return UNI_ESTRLN;
+        str[0] = (char)(0xc0 | (ch >> 6));
+        str[1] = (char)(0x80 | (ch & 0x3f));
+        return 2;
+    }
+    else if(ch < 0x10000) {
+        if(len < 3) return UNI_ESTRLN;
+        str[0] = (char)(0xe0 | (ch >> 12));
+        str[1] = (char)(0x80 | ((ch >> 6) & 0x3f));
+        str[2] = (char)(0x80 | (ch & 0x3f));
+        return 3;
+    }
+    else {
+        if(len < 4) return UNI_ESTRLN;
+        str[0] = (char)(0xf0 | (ch >> 18));
+        str[1] = (char)(0x80 | ((ch >> 12) & 0x3f));
+        str[2] = (char)(0x80 | ((ch >> 6) & 0x3f));
+        str[3] = (char)(0x80 | (ch & 0x3f));
+        return 4;
+    }
+}