diff --git a/inc/stdlib.h b/inc/stdlib.h
index 3813e09..929c99e 100644
--- a/inc/stdlib.h
+++ b/inc/stdlib.h
@@ -34,6 +34,8 @@ typedef struct lldiv_t {
 #define EXIT_SUCCESS 0
 
 #define RAND_MAX 65536
+
+#define MB_LEN_MAX 4
 #define MB_CUR_MAX 4
 
 // Microsoft extension, COUNTOF(x) counts array elements
diff --git a/inc/uchar.h b/inc/uchar.h
index 770c2b3..0871202 100644
--- a/inc/uchar.h
+++ b/inc/uchar.h
@@ -9,7 +9,11 @@ typedef uint_least32_t char32_t;
 
 typedef struct mbstate_t mbstate_t;
 struct mbstate_t {
-    char16_t leftover;
+    // Both members share storage; the value 0 means "nothing pending"
+    union {
+        char16_t leftover;
+        char16_t high_surrogate;
+    };
 };
 
 size_t mbrtoc16(
diff --git a/src/code/uchar.c b/src/code/uchar.c
index 2b27c11..e4773a4 100644
--- a/src/code/uchar.c
+++ b/src/code/uchar.c
@@ -2,7 +2,8 @@
 
 #include
 #include
-#include
+#include
+
 
 size_t mbrtoc16(
     char16_t *restrict pc16,
@@ -10,93 +11,96 @@ size_t mbrtoc16(
     size_t n,
     mbstate_t *restrict ps
 ) {
+    // Use the caller's conversion state, or a private one when ps is NULL.
+    // An all-zero mbstate_t is the initial state, so 0 means "nothing pending".
+    static mbstate_t static_mbstate = {0};
+    if(ps == NULL) ps = &static_mbstate;
     if(s == NULL) {
         *ps = (mbstate_t) {0};
         return 0;
     }
-    // First check leftovers, using 0xd800 as marker because it doesn't
-    // encode a valid character.
-    if(ps->leftover != 0xd800) {
+    // Check leftovers, using 0 as the "no leftover" marker: a pending
+    // leftover is the second unit of a surrogate pair, which is never 0.
+    if(ps->leftover == 0) {
+        // Decode the UTF-8 encoded codepoint
+        char32_t code_point;
+        int mblen = utf8_chdec((char8_t *)s, n, &code_point);
+        if(mblen == UNI_ESTRLN) return (size_t)(-2);
+        if(mblen <= 0) goto invalid_seq;
+        // Encode the codepoint into UTF-16 string
+        char16_t str[2];
+        int c16len = utf16_chenc(str, 2, code_point);
+        if(c16len <= 0) goto invalid_seq;
+        // Assign the decoded UTF-16 character, decide leftover
+        if(pc16 != NULL) *pc16 = str[0];
+        ps->leftover = (c16len == 2? str[1] : 0);
+        // The null character is reported as 0 rather than its byte count
+        if(code_point == 0) return 0;
+        return (size_t)mblen;
+    }
+    else {
+        // Otherwise use and reset the leftover
         if(pc16 != NULL) *pc16 = ps->leftover;
-        ps->leftover = 0xd800;
+        ps->leftover = 0;
         return (size_t)(-3);
     }
-    else {
-        uchar_t ch;
-        char16_t str[3];
-        int chlen = utf8_dec(s, &ch);
-        if(chlen <= 0) goto encoding_error;
-        int wrlen = utf16_enc(str, ch);
-        char16_t curc;
-        char16_t next;
-        if(wrlen <= 0) goto encoding_error;
-        else if(wrlen == 2) {
-            curc = str[0];
-            next = 0xd800;
-        }
-        else {
-            curc = str[0];
-            next = str[1];
-        }
-        ps->leftover = next;
-        if(pc16 != NULL) *pc16 = curc;
-        return (size_t)-2;
-    }
-encoding_error:
+invalid_seq:
     errno = EILSEQ;
     return (size_t)(-1);
 }
 
+
+
+// Converts the UTF-16 code unit c16 to a multibyte sequence stored in s.
+// A high surrogate is only recorded in the state (returns 0); the bytes are
+// written once the matching low surrogate arrives. Returns the number of
+// bytes written, or (size_t)(-1) with errno set to EILSEQ on a bad sequence.
 size_t c16rtomb(
     char *restrict s,
     char16_t c16,
     mbstate_t *restrict ps
 ) {
+    // Use the caller's conversion state, or a private one when ps is NULL.
+    // An all-zero mbstate_t is the initial state: no pending high surrogate.
+    static mbstate_t static_mbstate = {0};
+    if(ps == NULL) ps = &static_mbstate;
     if(s == NULL) {
         *ps = (mbstate_t) {0};
-        return 0;
+        // Equivalent to converting a null character, which takes one byte
+        return 1;
     }
-    unsigned cp;
-    // High surrogate (save)
-    if(0xd800 <= c16 && c16 < 0xdc00) {
-        ps->leftover = c16;
-        return 0;
+    char32_t codepoint_to_write;
+    // Check whether a high surrogate was detected in a previous call to the
+    // function. If not, the high_surrogate value is 0
+    if(ps->high_surrogate == 0) {
+        // If c16 is a surrogate record it, or throw an error
+        if(uni_is_hsur(c16)) {
+            ps->high_surrogate = c16;
+            return 0;
+        }
+        else if(uni_is_lsur(c16)) {
+            goto invalid_char;
+        }
+        // We'll just write c16
+        codepoint_to_write = c16;
     }
-    // Low surrogate (parse)
-    else if(0xdc00 <= c16 && c16 < 0xe000) {
-        if(ps->leftover == 0) goto encoding_error;
-        cp = ((ps->leftover & 0x3ff) << 10) | (c16 & 0x3ff);
+    // If high surrogate exists, the next character must be a low surrogate
+    // so we'll write a codepoint made out of high and low surrogates
+    else if(uni_is_lsur(c16)) {
+        codepoint_to_write = uni_surtoc(ps->high_surrogate, c16);
+        // The pair is complete, so nothing is pending any more
+        ps->high_surrogate = 0;
     }
-    // Other char
-    else {
-        cp = c16;
+    else goto invalid_char;
+    // Write the codepoint that we decided to write to multibyte string
+    int written_len = utf8_chenc(s, 4, codepoint_to_write);
+    if(written_len <= 0) {
+        goto invalid_char;
     }
-    size_t nbytes = 4;
-    if(cp < 0x10000) nbytes = 3;
-    if(cp < 0x800) nbytes = 2;
-    if(cp < 0x80) nbytes = 1;
-    switch(nbytes) {
-        case 1: {
-            s[0] = cp;
-        } break;
-        case 2: {
-            s[0] = 0xc0 | (cp >> 6);
-            s[1] = 0x80 | ((cp >> 0) & 0x3f);
-        } break;
-        case 3: {
-            s[0] = 0xe0 | (cp >> 12);
-            s[1] = 0x80 | ((cp >> 6) & 0x3f);
-            s[2] = 0x80 | ((cp >> 0) & 0x3f);
-        } break;
-        case 4: {
-            s[0] = 0xf0 | (cp >> 18);
-            s[1] = 0x80 | ((cp >> 12) & 0x3f);
-            s[2] = 0x80 | ((cp >> 6) & 0x3f);
-            s[3] = 0x80 | ((cp >> 0) & 0x3f);
-        } break;
-    }
-    return nbytes;
-encoding_error:
+    return (size_t)written_len;
+invalid_char:
+    // Drop any pending surrogate so one bad unit cannot wedge the state
+    ps->high_surrogate = 0;
     errno = EILSEQ;
     return (size_t)(-1);
 }
diff --git a/src/code/wctype.c b/src/code/wctype.c
index 7574abd..6589587 100644
--- a/src/code/wctype.c
+++ b/src/code/wctype.c
@@ -2,7 +2,7 @@
 
 #include
 #include
-#include
+#include
 
 int iswctype(wint_t wc, wctype_t desc) {
     return desc(wc);
@@ -113,9 +113,9 @@ int iswxdigit(wint_t wc) {
 }
 
 wint_t towlower(wint_t wc) {
-    return uni_to_lower(wc);
+    return uni_tolower(wc);
 }
 
 wint_t towupper(wint_t wc) {
-    return uni_to_upper(wc);
+    return uni_toupper(wc);
 }