diff --git a/inc/unicode.h b/inc/unicode.h
index 2a31f5a..c7d3318 100644
--- a/inc/unicode.h
+++ b/inc/unicode.h
@@ -9,6 +9,13 @@ typedef uint_least32_t char32_t;
 
 typedef int32_t uchar_t;
 
+#define UNI_EBADCP (-1) // decoded value is not a valid code point
+#define UNI_EULSUR (-2) // low surrogate with no preceding high surrogate
+#define UNI_EIBYTE (-3) // invalid initial (lead) byte in a UTF-8 sequence
+#define UNI_ETBYTE (-4) // invalid trail (continuation) byte in a UTF-8 sequence
+#define UNI_ESTRLN (-5) // sequence needs more units than the given length
+// Returned converted to size_t: test with == (size_t)UNI_Exxx, never with < 0.
+
 enum {
     UCHAR_BAD,
     UCHAR_Cc,
@@ -112,23 +119,23 @@ struct uchar_props {
 uchar_props *uni_props   (uchar_t cp);
 int          uni_valid   (uchar_t cp);
 int          uni_classify(uchar_t cp);
-uchar_t      uni_tolower(uchar_t cp);
-uchar_t      uni_toupper(uchar_t cp);
-uchar_t      uni_totitle(uchar_t cp);
+uchar_t      uni_tolower (uchar_t cp);
+uchar_t      uni_toupper (uchar_t cp);
+uchar_t      uni_totitle (uchar_t cp);
 
 int     uni_is_hsur(char16_t cp);
 int     uni_is_lsur(char16_t cp);
 uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
 
-int utf16_chlen(char16_t const *str);
-int utf8_chlen (char const *str);
+size_t utf16_chlen(char16_t const *str);
+size_t utf8_chlen (char const *str);
 
-int utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
-int utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
-int utf16_dec  (char16_t const *restrict str, uchar_t *restrict ch);
-int utf8_dec   (char const *restrict str, uchar_t *restrict ch);
+size_t utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
+size_t utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
+size_t utf16_dec  (char16_t const *restrict str, uchar_t *restrict ch);
+size_t utf8_dec   (char const *restrict str, uchar_t *restrict ch);
 
-int utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
-int utf8_enc_s (char *str, size_t len, uchar_t ch);
-int utf16_enc  (char16_t *str, uchar_t ch);
-int utf8_enc   (char *str, uchar_t ch);
+size_t utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
+size_t utf8_enc_s (char *str, size_t len, uchar_t ch);
+size_t utf16_enc  (char16_t *str, uchar_t ch);
+size_t utf8_enc   (char *str, uchar_t ch);
diff --git a/src/code/unicode.c b/src/code/unicode.c
index db2384e..c2008df 100644
--- a/src/code/unicode.c
+++ b/src/code/unicode.c
@@ -49,48 +49,51 @@ uchar_t uni_surtoc(char16_t hsur, char16_t lsur) {
-int utf16_chlen(char16_t const *str) {
+size_t utf16_chlen(char16_t const *str) { // must match the size_t prototype in unicode.h
     char16_t cp = *str;
     if(uni_is_hsur(cp)) return 2;
-    else if(uni_is_lsur(cp)) return 0;
-    else if(uni_valid(cp)) return 1;
-    return 0;
+    else if(uni_is_lsur(cp)) return UNI_EULSUR;
+    else return 1;
 }
 
-int utf8_chlen(char const *str) {
+size_t utf8_chlen(char const *str) {
     uint8_t byte0 = (uint8_t)*str;
     if(byte0 < 0x80) return 1;
-    else if(byte0 < 0xc0) return 0; // error
+    else if(byte0 < 0xc0) return UNI_EIBYTE; // continuation byte cannot start a sequence
     else if(byte0 < 0xe0) return 2;
     else if(byte0 < 0xf0) return 3;
     else if(byte0 < 0xf8) return 4;
-    return 0;
+    return UNI_EIBYTE;
 }
 
-int utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
-    int chlen = 0;
+size_t utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
+    size_t chlen = 1; // one unit for a non-surrogate; the surrogate branches override it
     uchar_t ch;
     if(uni_is_hsur(str[0])) {
         char16_t hsur = str[0];
         char16_t lsur = str[1];
         ch = uni_surtoc(hsur, lsur);
         chlen = 2;
+        if(ch > 0x10ffff) {
+            chlen = UNI_EBADCP;
+            ch = 0xfffd;
+        }
     }
-    else {
+    else if(!uni_is_lsur(str[0])) {
         ch = str[0];
     }
-    if(!uni_valid(ch)) {
-        chlen = 0;
+    else {
+        chlen = UNI_EULSUR;
         ch = 0xfffd;
     }
     if(chp != NULL) *chp = ch;
     return chlen;
 }
 
-int utf16_dec_s(
+size_t utf16_dec_s(
     char16_t const *restrict str,
     size_t len,
     uchar_t *restrict chp
 ) {
     if(len == 0) return 0;
-    int chlen;
+    size_t chlen;
     uchar_t ch;
     if(uni_is_hsur(str[0])) {
         if(len < 2) return 0;
@@ -98,33 +101,39 @@ int utf16_dec_s(
         char16_t lsur = str[1];
         ch = uni_surtoc(hsur, lsur);
         chlen = 2;
+        if(ch > 0x10ffff) {
+            chlen = UNI_EBADCP;
+            ch = 0xfffd;
+        }
     }
-    else {
+    else if(!uni_is_lsur(str[0])) {
         ch = str[0];
         chlen = 1;
     }
-    if(!uni_valid(ch)) {
+    else {
+        chlen = UNI_EULSUR;
         ch = 0xfffd;
-        chlen = 0;
     }
     if(chp != NULL) *chp = ch;
     return chlen;
 }
 
-int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
+size_t utf8_dec(char const *restrict str, uchar_t *restrict chp) {
     uint8_t const *ustr = (uint8_t const *)str;
-    int chlen;
+
+    size_t chlen;
     uchar_t ch;
-    if(ustr[0] < 0x80) chlen = 1, ch = ustr[0];
-    else if(ustr[0] < 0xc0) chlen = 0, ch = 0xfffd;
-    else if(ustr[0] < 0xe0) chlen = 2, ch = ustr[0] & 0x1f;
-    else if(ustr[0] < 0xf0) chlen = 3, ch = ustr[0] & 0x0f;
-    else if(ustr[0] < 0xf8) chlen = 4, ch = ustr[0] & 0x07;
-    else chlen = 0;
-    for(int i = 1; i < chlen; ++i) {
+    if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
+    else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
+    else if(ustr[0] < 0xe0) ch = ustr[0] & 0x1f, chlen = 2;
+    else if(ustr[0] < 0xf0) ch = ustr[0] & 0x0f, chlen = 3;
+    else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4;
+    else ch = 0xfffd, chlen = UNI_EIBYTE;
+
+    if(chlen <= 4) for(size_t i = 1; i < chlen; ++i) { // UNI_E* wraps to a huge size_t, so "> 0" cannot exclude it
         uint8_t trail = ustr[i];
         if((trail & 0xc0) != 0x80) {
-            chlen = 0;
+            chlen = UNI_ETBYTE;
             ch = 0xfffd;
             break;
         }
@@ -132,36 +141,35 @@ int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
         ch |= (trail & 0x3f);
     }
     if(!uni_valid(ch)) {
-        chlen = 0;
+        chlen = UNI_EBADCP;
         ch = 0xfffd;
     }
     if(chp != NULL) *chp = ch;
     return chlen;
 }
 
-int utf8_dec_s(
+size_t utf8_dec_s(
     char const *restrict str,
     size_t len,
     uchar_t *restrict chp
 ) {
     if(len == 0) return 0;
     uint8_t const *restrict ustr = (uint8_t const *restrict)str;
-    int chlen;
+    size_t chlen;
     uchar_t ch;
-    if(ustr[0] < 0x80) chlen = 1, ch = ustr[0];
-    else if(ustr[0] < 0xc0) chlen = 0, ch = 0xfffd;
-    else if(ustr[0] < 0xe0) chlen = 2, ch = ustr[0] & 0x1f;
-    else if(ustr[0] < 0xf0) chlen = 3, ch = ustr[0] & 0x0f;
-    else if(ustr[0] < 0xf8) chlen = 4, ch = ustr[0] & 0x07;
-    else chlen = 0;
-    if(len < chlen) {
-        return 0;
+    if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
+    else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
+    else if(ustr[0] < 0xe0) ch = ustr[0] & 0x1f, chlen = 2;
+    else if(ustr[0] < 0xf0) ch = ustr[0] & 0x0f, chlen = 3;
+    else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4;
+    else ch = 0xfffd, chlen = UNI_EIBYTE;
+    if(chlen <= 4 && chlen > len) { // only a real length can overrun len; keeps UNI_EIBYTE intact
+        return UNI_ESTRLN;
     }
-    else chlen = 0;
-    for(int i = 1; i < chlen; ++i) {
+    if(chlen <= 4) for(size_t i = 1; i < chlen; ++i) { // skip the trail-byte scan when chlen holds an error code
         uint8_t trail = ustr[i];
         if((trail & 0xc0) != 0x80) {
-            chlen = 0;
+            chlen = UNI_ETBYTE;
             ch = 0xfffd;
             break;
         }
@@ -169,7 +177,7 @@ int utf8_dec_s(
         ch |= (trail & 0x3f);
     }
     if(!uni_valid(ch)) {
-        chlen = 0;
+        chlen = UNI_EBADCP;
         ch = 0xfffd;
     }
     if(chp != NULL) *chp = ch;