restartable c16 <-> mb conversions

2022-07-16 04:33:55 +11:00 · 2022-07-16 04:33:55 +11:00 · bf428a9924
parent 7a1ffc534b
commit bf428a9924
4 changed files with 66 additions and 70 deletions
--- a/inc/stdlib.h
+++ b/inc/stdlib.h
@ -34,6 +34,8 @@ typedef struct lldiv_t {
 #define EXIT_SUCCESS 0

 #define RAND_MAX 65536
+
+#define MB_LEN_MAX 4
 #define MB_CUR_MAX 4

 // Microsoft extension, COUNTOF(x) counts array elements
--- a/inc/uchar.h
+++ b/inc/uchar.h
@ -9,7 +9,10 @@ typedef uint_least32_t char32_t;

 typedef struct mbstate_t mbstate_t;
 struct mbstate_t {
+    union { // both members alias the single pending UTF-16 code unit
        char16_t leftover;
+        char16_t high_surrogate;
+    };
 };

 size_t mbrtoc16(
--- a/src/code/uchar.c
+++ b/src/code/uchar.c
@ -2,7 +2,8 @@
 #include <uchar.h>
 #include <errno.h>

-#include <unicode.h>
+#include <unicope.h>
+

 size_t mbrtoc16(
    char16_t   *restrict pc16,
@ -10,93 +11,83 @@ size_t mbrtoc16(
    size_t               n,
    mbstate_t  *restrict ps
 ) {
+    // Figure out the conversion state
+    static mbstate_t static_mbstate = {0};
+    if(ps == NULL) ps = &static_mbstate;
    if(s == NULL) {
-        *ps = (mbstate_t) {0};
+        *ps = (mbstate_t) {0xd800};
        return 0;
    }
-    // First check leftovers, using 0xd800 as marker because it doesn't
-    // encode a valid character.
-    if(ps->leftover != 0xd800) {
+    // No leftover is pending unless the state holds a low surrogate; this
+    // accepts both the 0xd800 marker and a zero-initialized mbstate_t.
+    if(!uni_is_lsur(ps->leftover)) {
+        // Decode the UTF-8 encoded codepoint
+        char32_t code_point;
+        int mblen = utf8_chdec((char8_t *)s, n, &code_point);
+        if(mblen == UNI_ESTRLN) return (size_t)(-2);
+        if(mblen <= 0) goto invalid_seq;
+        // Encode the codepoint into UTF-16 string
+        char16_t str[2];
+        int c16len = utf16_chenc(str, 2, code_point);
+        if(c16len <= 0) goto invalid_seq;
+        // Store first unit, stash the second; a decoded NUL reports 0
+        if(pc16 != NULL) *pc16 = str[0];
+        ps->leftover = (c16len == 2? str[1] : 0xd800);
+        return code_point == 0? 0 : (size_t)mblen;
+    }
+    else {
+        // Otherwise use and reset the leftover
        if(pc16 != NULL) *pc16 = ps->leftover;
        ps->leftover = 0xd800;
        return (size_t)(-3);
    }
-    else {
-        uchar_t ch;
-        char16_t str[3];
-        int chlen = utf8_dec(s, &ch);
-        if(chlen <= 0) goto encoding_error;
-        int wrlen = utf16_enc(str, ch);
-        char16_t curc;
-        char16_t next;
-        if(wrlen <= 0) goto encoding_error;
-        else if(wrlen == 2) {
-            curc = str[0];
-            next = 0xd800;
-        }
-        else {
-            curc = str[0];
-            next = str[1];
-        }
-        ps->leftover = next;
-        if(pc16 != NULL) *pc16 = curc;
-        return (size_t)-2;
-    }
-encoding_error:
+invalid_seq:
    errno = EILSEQ;
    return (size_t)(-1);
 }

+
+
 size_t c16rtomb(
    char *restrict      s,
    char16_t            c16,
    mbstate_t *restrict ps
 ) {
+    // Figure out conversion state
+    static mbstate_t static_mbstate = {0};
+    if(ps == NULL) ps = &static_mbstate;
    if(s == NULL) {
-        *ps = (mbstate_t) {0};
+        ps->high_surrogate = 0;
        return 0;
    }
-    unsigned cp;
-    // High surrogate (save)
-    if(0xd800 <= c16 && c16 < 0xdc00) {
-        ps->leftover = c16;
+    char32_t codepoint_to_write;
+    // Check whether a high surrogate is pending from a previous call; 0 means
+    // none (0xd800 can't be the marker: it is itself a valid high surrogate)
+    if(ps->high_surrogate == 0) {
+        // If c16 is a surrogate record it, or throw an error
+        if(uni_is_hsur(c16)) {
+            ps->high_surrogate = c16;
            return 0;
        }
-    // Low surrogate (parse)
-    else if(0xdc00 <= c16 && c16 < 0xe000) {
-        if(ps->leftover == 0) goto encoding_error;
-        cp = ((ps->leftover & 0x3ff) << 10) | (c16 & 0x3ff);
+        else if(uni_is_lsur(c16)) {
+            goto invalid_char;
        }
-    // Other char
-    else {
-        cp = c16;
+        // We'll just write c16
+        codepoint_to_write = c16;
    }
-    size_t nbytes = 4;
-    if(cp < 0x10000) nbytes = 3;
-    if(cp < 0x800)   nbytes = 2;
-    if(cp < 0x80)    nbytes = 1;
-    switch(nbytes) {
-        case 1: {
-            s[0] = cp;
-        } break;
-        case 2: {
-            s[0] = 0xc0 | (cp >> 6);
-            s[1] = 0x80 | ((cp >> 0)  & 0x3f);
-        } break;
-        case 3: {
-            s[0] = 0xe0 | (cp >> 12);
-            s[1] = 0x80 | ((cp >> 6)  & 0x3f);
-            s[2] = 0x80 | ((cp >> 0)  & 0x3f);
-        } break;
-        case 4: {
-            s[0] = 0xf0 | (cp >> 18);
-            s[1] = 0x80 | ((cp >> 12) & 0x3f);
-            s[2] = 0x80 | ((cp >> 6)  & 0x3f);
-            s[3] = 0x80 | ((cp >> 0)  & 0x3f);
-        } break;
+    // If high surrogate exists, the next character must be a low surrogate
+    // so we'll write a codepoint made out of high and low surrogates
+    else if(uni_is_lsur(c16)) {
+        codepoint_to_write = uni_surtoc(ps->high_surrogate, c16);
    }
-    return nbytes;
-encoding_error:
+    else goto invalid_char;
+    // The pending surrogate (if any) is consumed now, so clear it before
+    // writing the codepoint to the multibyte string
+    ps->high_surrogate = 0;
+    int written_len = utf8_chenc(s, 4, codepoint_to_write);
+    if(written_len < 0) goto invalid_char;
+    return (size_t)written_len;
+invalid_char:
    errno = EILSEQ;
    return (size_t)(-1);
 }
--- a/src/code/wctype.c
+++ b/src/code/wctype.c
@ -2,7 +2,7 @@
 #include <wctype.h>
 #include <string.h>

-#include <unicode.h>
+#include <unicope.h>

 int iswctype(wint_t wc, wctype_t desc) {
    return desc(wc);
@ -113,9 +113,9 @@ int iswxdigit(wint_t wc) {
 }

 wint_t towlower(wint_t wc) {
-    return uni_to_lower(wc);
+    return uni_tolower(wc);
 }

 wint_t towupper(wint_t wc) {
-    return uni_to_upper(wc);
+    return uni_toupper(wc);
 }