restartable c16 <-> mb conversions

2022-07-16 04:33:55 +11:00 · 2022-07-16 04:33:55 +11:00 · bf428a9924
parent 7a1ffc534b
commit bf428a9924
4 changed files with 66 additions and 70 deletions
--- a/inc/stdlib.h
+++ b/inc/stdlib.h
@@ -34,6 +34,8 @@ typedef struct lldiv_t {
 #define EXIT_SUCCESS 0
 #define RAND_MAX 65536
 #define MB_LEN_MAX 4
 #define MB_CUR_MAX 4
 // Microsoft extension, COUNTOF(x) counts array elements
--- a/inc/uchar.h
+++ b/inc/uchar.h
@@ -9,7 +9,10 @@ typedef uint_least32_t char32_t;
 typedef struct mbstate_t mbstate_t;
 struct mbstate_t {
-    char16_t leftover;
+    union {
        char16_t leftover;
        char16_t high_surrogate;
    }
 };
 size_t mbrtoc16(
--- a/src/code/uchar.c
+++ b/src/code/uchar.c
@@ -2,7 +2,8 @@
 #include <uchar.h>
 #include <errno.h>
-#include <unicode.h>
+#include <unicope.h>
 size_t mbrtoc16(
    char16_t   *restrict pc16,
@@ -10,93 +11,83 @@ size_t mbrtoc16(
    size_t               n,
    mbstate_t  *restrict ps
 ) {
    // Figure out the conversion state
    static mbstate_t static_mbstate = {0};
    if(ps == NULL) ps = &static_mbstate;
    if(s == NULL) {
-        *ps = (mbstate_t) {0};
+        *ps = (mbstate_t) {0xd800};
        return 0;
    }
-    // First check leftovers, using 0xd800 as marker because it doesn't
+    // Check leftovers, using 0xd800 as "no leftover" marker because it
-    // encode a valid character.
+    // doesn't encode a valid character.
-    if(ps->leftover != 0xd800) {
+    if(ps->leftover == 0xd800) {
        // Decode the UTF-8 encoded codepoint
        char32_t code_point;
        int mblen = utf8_chdec((char8_t *)s, n, &code_point);
        if(mblen == UNI_ESTRLN) return (size_t)(-2);
        if(mblen <= 0) goto invalid_seq;
        // Encode the codepoint into UTF-16 string
        char16_t str[2];
        int c16len = utf16_chenc(str, 2, code_point);
        if(c16len <= 0) goto invalid_seq;
        // Assign the decoded UTF-16 character, decide leftover
        if(pc16 != NULL) *pc16 = str[0];
        ps->leftover = (c16len == 2? str[1] : 0xd800);
        return (size_t)mblen;
    }
    else {
        // Otherwise use and reset the leftover
        if(pc16 != NULL) *pc16 = ps->leftover;
        ps->leftover = 0xd800;
        return (size_t)(-3);
    }
-    else {
+invalid_seq:
        uchar_t ch;
        char16_t str[3];
        int chlen = utf8_dec(s, &ch);
        if(chlen <= 0) goto encoding_error;
        int wrlen = utf16_enc(str, ch);
        char16_t curc;
        char16_t next;
        if(wrlen <= 0) goto encoding_error;
        else if(wrlen == 2) {
            curc = str[0];
            next = 0xd800;
        }
        else {
            curc = str[0];
            next = str[1];
        }
        ps->leftover = next;
        if(pc16 != NULL) *pc16 = curc;
        return (size_t)-2;
    }
 encoding_error:
    errno = EILSEQ;
    return (size_t)(-1);
 }
 size_t c16rtomb(
    char *restrict      s,
    char16_t            c16,
    mbstate_t *restrict ps
 ) {
    // Figure out conversion state
    static mbstate_t static_mbstate = {0};
    if(ps == NULL) ps = &static_mbstate;
    if(s == NULL) {
-        *ps = (mbstate_t) {0};
+        *ps = (mbstate_t) {0xd800};
        return 0;
    }
-    unsigned cp;
+    char32_t codepoint_to_write;
-    // High surrogate (save)
+    // Check whether a high surrogate was detected in a previous call to the
-    if(0xd800 <= c16 && c16 < 0xdc00) {
+    // function. If not, the high_surrogate value is 0xd800
-        ps->leftover = c16;
+    if(ps->high_surrogate == 0xd800) {
-        return 0;
+        // If c16 is a surrogate record it, or throw an error
        if(uni_is_hsur(c16)) {
            ps->high_surrogate = c16;
            return 0;
        }
        else if(uni_is_lsur(c16)) {
            goto invalid_char;
        }
        // We'll just write c16
        codepoint_to_write = c16;
    }
-    // Low surrogate (parse)
+    // If high surrogate exists, the next character must be a low surrogate
-    else if(0xdc00 <= c16 && c16 < 0xe000) {
+    // so we'll write a codepoint made out of high and low surrogates
-        if(ps->leftover == 0) goto encoding_error;
+    else if(uni_is_lsur(c16)) {
-        cp = ((ps->leftover & 0x3ff) << 10) | (c16 & 0x3ff);
+        codepoint_to_write = uni_surtoc(ps->high_surrogate, c16);
    }
-    // Other char
+    else goto invalid_char;
-    else {
+    // Write the codepoint that we decided to write to multibyte string
-        cp = c16;
+    int written_len = utf8_chenc(s, 4, codepoint_to_write);
    if(written_len < 0) {
        goto invalid_char;
    }
-    size_t nbytes = 4;
+    return (size_t)written_len;
-    if(cp < 0x10000) nbytes = 3;
+invalid_char:
    if(cp < 0x800)   nbytes = 2;
    if(cp < 0x80)    nbytes = 1;
    switch(nbytes) {
        case 1: {
            s[0] = cp;
        } break;
        case 2: {
            s[0] = 0xc0 | (cp >> 6);
            s[1] = 0x80 | ((cp >> 0)  & 0x3f);
        } break;
        case 3: {
            s[0] = 0xe0 | (cp >> 12);
            s[1] = 0x80 | ((cp >> 6)  & 0x3f);
            s[2] = 0x80 | ((cp >> 0)  & 0x3f);
        } break;
        case 4: {
            s[0] = 0xf0 | (cp >> 18);
            s[1] = 0x80 | ((cp >> 12) & 0x3f);
            s[2] = 0x80 | ((cp >> 6)  & 0x3f);
            s[3] = 0x80 | ((cp >> 0)  & 0x3f);
        } break;
    }
    return nbytes;
 encoding_error:
    errno = EILSEQ;
    return (size_t)(-1);
 }
--- a/src/code/wctype.c
+++ b/src/code/wctype.c
@@ -2,7 +2,7 @@
 #include <wctype.h>
 #include <string.h>
-#include <unicode.h>
+#include <unicope.h>
 int iswctype(wint_t wc, wctype_t desc) {
    return desc(wc);
@@ -113,9 +113,9 @@ int iswxdigit(wint_t wc) {
 }
 wint_t towlower(wint_t wc) {
-    return uni_to_lower(wc);
+    return uni_tolower(wc);
 }
 wint_t towupper(wint_t wc) {
-    return uni_to_upper(wc);
+    return uni_toupper(wc);
 }