unicode.h: encoding/decoding UTF-8/16

2022-07-02 18:51:30 +11:00 · 2022-07-02 18:51:30 +11:00 · 2bfdcc94cf
parent f1319e6f1d
commit 2bfdcc94cf
4 changed files with 218 additions and 32 deletions
--- a/inc/unicode.h
+++ b/inc/unicode.h
@ -8,7 +8,8 @@ typedef uint_least16_t char16_t;
 typedef uint_least32_t char32_t;
 typedef int32_t uchar_t;
-typedef enum {
+
 enum {
    UCHAR_Invalid,
    UCHAR_Cc,
    UCHAR_Cf,
@ -39,19 +40,57 @@ typedef enum {
    UCHAR_Zl,
    UCHAR_Zp,
    UCHAR_Zs,
-} uchar_class;
+};
 // Record of properties attached to a single character.
 // NOTE(review): the field names suggest these mirror Unicode Character
 // Database properties (bidi, case mapping, decomposition, ...); nothing
 // visible here shows how the record is populated or which value sets the
 // int fields use -- confirm against the table that provides the data.
 typedef struct uchar_props uchar_props;
 struct uchar_props {
    int     bidi_class;
    int     bidi_mirrored;
    int     bidi_paired_bracket;
    int     bidi_paired_bracket_type;
    int     block;
    int     canon_comb_class;
    // Case-mapping targets, stored as code points.
    uchar_t ch_lower;
    uchar_t ch_upper;
    // presumably the number of entries of decomp[] in use -- TODO confirm
    int     ndecomp;
    // Holds at most four code points; as a const member it can only be set
    // by an initializer, so whole-struct assignment is not possible.
    uchar_t const decomp[4];
    uchar_t default_igncp;
    int     deprecated;
    int     east_asian_width;
    // presumably one of the UCHAR_* enum constants -- TODO confirm
    int     gcat;
    int     hangul_syl_type;
    int     join_type;
    int     join_group;
    int     line_brk;
    char    const *name;
    uchar_t nc_cp;
    int     num_val;
    int     ws;
    int     dash;
    int     letter_props;
    int     math_props;
    int     script;
 };
 int uni_classify(uchar_t ch);
 int uni_valid(uchar_t ch);
 uchar_t uni_to_lower(uchar_t u);
 uchar_t uni_to_upper(uchar_t u);
-int utf8_dec   (char const *restrict utf8_str,  uchar_t *restrict ch);
+int     uni_is_hsur(char16_t ch);
-int utf16_dec  (char const *restrict utf16_str, uchar_t *restrict ch);
+int     uni_is_lsur(char16_t ch);
-int utf8_dec_s (char const *restrict utf8_str,  size_t len, uchar_t *restrict ch);
+uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
 int utf16_dec_s(char const *restrict utf16_str, size_t len, uchar_t *restrict ch);
-int utf8_enc   (char *utf8_str,  uchar_t ch);
+int utf16_chlen(char16_t const *str);
-int utf16_enc  (char *utf16_str, uchar_t ch);
+int utf8_chlen (char     const *str);
-int utf8_enc_s (char *utf8_str,  size_t len, uchar_t ch);
+
-int utf16_enc_s(char *utf16_str, size_t len, uchar_t ch);
+int utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
 int utf8_dec_s (char     const *restrict str, size_t len, uchar_t *restrict ch);
 int utf16_dec  (char16_t const *restrict str,             uchar_t *restrict ch);
 int utf8_dec   (char     const *restrict str,             uchar_t *restrict ch);
 int utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
 int utf8_enc_s (char     *str, size_t len, uchar_t ch);
 int utf16_enc  (char16_t *str,             uchar_t ch);
 int utf8_enc   (char     *str,             uchar_t ch);
--- a/src/code/unicode.c
+++ b/src/code/unicode.c
@ -15,3 +15,150 @@ uchar_t uni_to_upper(uchar_t cp) {
    return uni_codepoints[cp].upper;
 }
 // Reports whether `ch` lies in 0..0x10ffff and outside the surrogate range
 // 0xd800..0xdfff. Returns 1 if it does, 0 otherwise.
 int uni_valid(uchar_t ch) {
    if(ch < 0 || ch > 0x10ffff) return 0;
    if(ch >= 0xd800 && ch <= 0xdfff) return 0;
    return 1;
 }
 // Returns 1 when `ch` is a high (leading) surrogate, i.e. in the range
 // 0xd800..0xdbff, and 0 otherwise.
 int uni_is_hsur(char16_t ch) {
    // Values below 0xd800 wrap around to a large unsigned offset and fail
    // the single range test.
    unsigned const offset = (unsigned)ch - 0xd800u;
    return offset < 0x400u;
 }
 // Returns 1 when `ch` is a low (trailing) surrogate, i.e. in the range
 // 0xdc00..0xdfff, and 0 otherwise.
 int uni_is_lsur(char16_t ch) {
    // Values below 0xdc00 wrap around to a large unsigned offset and fail
    // the single range test.
    unsigned const offset = (unsigned)ch - 0xdc00u;
    return offset < 0x400u;
 }
 uchar_t uni_surtoc(char16_t hsur, char16_t lsur) {
    uchar_t u = ((0x3ff & hsur) << 10) | (lsur & 0x3ff);
    return u + 0x10000;
 }
 // Returns the number of UTF-16 code units occupied by the character that
 // starts at `str`, judged from the first unit only: 2 for a high surrogate,
 // 1 for any other valid value, 0 for a low surrogate or an invalid value.
 int utf16_chlen(char16_t const *str) {
    char16_t const unit = str[0];
    if(uni_is_hsur(unit)) return 2;
    if(uni_is_lsur(unit) || !uni_valid(unit)) return 0;
    return 1;
 }
 // Returns the length in bytes that the lead byte at `str` announces for a
 // UTF-8 sequence (1 to 4), or 0 when that byte cannot start a sequence
 // (a continuation byte 0x80..0xbf, or 0xf8 and above). Only the first byte
 // is inspected.
 int utf8_chlen(char const *str) {
    unsigned const lead = (unsigned char)str[0];
    if((lead & 0x80) == 0x00) return 1; // 0xxxxxxx
    if((lead & 0xe0) == 0xc0) return 2; // 110xxxxx
    if((lead & 0xf0) == 0xe0) return 3; // 1110xxxx
    if((lead & 0xf8) == 0xf0) return 4; // 11110xxx
    return 0;                           // error
 }
 // Decodes the character at the start of the UTF-16 string `str`.
 //
 // str: code units to decode. str[1] is read whenever str[0] is a high
 //      surrogate, so the input must extend at least that far (a terminating
 //      zero unit is enough).
 // chp: if not NULL, receives the decoded code point, or 0xfffd on error.
 //
 // Returns the number of code units consumed (1 or 2), or 0 when the input
 // is malformed: a high surrogate not followed by a low surrogate, or a low
 // surrogate on its own.
 int utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
    int chlen;
    uchar_t ch;
    if(uni_is_hsur(str[0])) {
        // A high surrogate only forms a character together with a low one.
        if(uni_is_lsur(str[1])) {
            ch = uni_surtoc(str[0], str[1]);
            chlen = 2;
        }
        else {
            ch = 0xfffd;
            chlen = 0;
        }
    }
    else {
        ch = str[0];
        chlen = 1;
    }
    // Catches a low surrogate appearing without a preceding high surrogate.
    if(!uni_valid(ch)) {
        chlen = 0;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
 }
 // Decodes the character at the start of a UTF-16 buffer of known length.
 //
 // str: code units to decode.
 // len: number of code units available at `str`.
 // chp: if not NULL, receives the decoded code point, or 0xfffd when the
 //      data is malformed. It is left untouched when the buffer is empty or
 //      ends right after a high surrogate.
 //
 // Returns the number of code units consumed (1 or 2), or 0 when nothing
 // could be decoded: empty or truncated input, a high surrogate not followed
 // by a low surrogate, or a low surrogate on its own.
 int utf16_dec_s(
    char16_t const *restrict str,
    size_t len,
    uchar_t *restrict chp
 ) {
    if(len == 0) return 0;
    int chlen;
    uchar_t ch;
    if(uni_is_hsur(str[0])) {
        if(len < 2) return 0;
        // A high surrogate only forms a character together with a low one.
        if(uni_is_lsur(str[1])) {
            ch = uni_surtoc(str[0], str[1]);
            chlen = 2;
        }
        else {
            ch = 0xfffd;
            chlen = 0;
        }
    }
    else {
        ch = str[0];
        chlen = 1;
    }
    // Catches a low surrogate appearing without a preceding high surrogate.
    if(!uni_valid(ch)) {
        ch = 0xfffd;
        chlen = 0;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
 }
 // Decodes the character at the start of the UTF-8 string `str`.
 //
 // str: bytes to decode. Up to three bytes after the lead byte are read, but
 //      reading stops at the first byte that is not a continuation byte, so
 //      a NUL-terminated string is never read past its terminator.
 // chp: if not NULL, receives the decoded code point, or 0xfffd on error.
 //
 // Returns the number of bytes consumed (1 to 4), or 0 when the sequence is
 // malformed: an invalid lead byte, a missing continuation byte, an overlong
 // encoding, or a result rejected by uni_valid().
 int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
    uint8_t const *ustr = (uint8_t const *)str;
    int chlen;
    uchar_t ch;
    // Smallest code point that legitimately needs `chlen` bytes; anything
    // below it is an overlong encoding.
    uchar_t min_ch = 0;
    if(ustr[0] < 0x80)      chlen = 1, ch = ustr[0];
    else if(ustr[0] < 0xc0) chlen = 0, ch = 0xfffd; // stray continuation byte
    else if(ustr[0] < 0xe0) chlen = 2, ch = ustr[0] & 0x1f, min_ch = 0x80;
    else if(ustr[0] < 0xf0) chlen = 3, ch = ustr[0] & 0x0f, min_ch = 0x800;
    else if(ustr[0] < 0xf8) chlen = 4, ch = ustr[0] & 0x07, min_ch = 0x10000;
    else                    chlen = 0, ch = 0xfffd; // not a lead byte
    for(int i = 1; i < chlen; ++i) {
        uint8_t trail = ustr[i];
        if((trail & 0xc0) != 0x80) {
            chlen = 0;
            break;
        }
        ch <<= 6;
        ch |= (trail & 0x3f);
    }
    // chlen == 0 is tested first so a partially accumulated value is never
    // mistaken for a result.
    if(chlen == 0 || ch < min_ch || !uni_valid(ch)) {
        chlen = 0;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
 }
 // Decodes the character at the start of a UTF-8 buffer of known length.
 //
 // str: bytes to decode.
 // len: number of bytes available at `str`.
 // chp: if not NULL, receives the decoded code point, or 0xfffd when the
 //      data is malformed. It is left untouched when the buffer is empty or
 //      shorter than the sequence announced by the lead byte.
 //
 // Returns the number of bytes consumed (1 to 4), or 0 when nothing could be
 // decoded: empty or truncated input, an invalid lead byte, a missing
 // continuation byte, an overlong encoding, or a result rejected by
 // uni_valid().
 int utf8_dec_s(
    char const *restrict str,
    size_t len,
    uchar_t *restrict chp
 ) {
    if(len == 0) return 0;
    uint8_t const *restrict ustr = (uint8_t const *restrict)str;
    int chlen;
    uchar_t ch;
    // Smallest code point that legitimately needs `chlen` bytes; anything
    // below it is an overlong encoding.
    uchar_t min_ch = 0;
    if(ustr[0] < 0x80)      chlen = 1, ch = ustr[0];
    else if(ustr[0] < 0xc0) chlen = 0, ch = 0xfffd; // stray continuation byte
    else if(ustr[0] < 0xe0) chlen = 2, ch = ustr[0] & 0x1f, min_ch = 0x80;
    else if(ustr[0] < 0xf0) chlen = 3, ch = ustr[0] & 0x0f, min_ch = 0x800;
    else if(ustr[0] < 0xf8) chlen = 4, ch = ustr[0] & 0x07, min_ch = 0x10000;
    else                    chlen = 0, ch = 0xfffd; // not a lead byte
    // The announced sequence does not fit in the buffer.
    if(len < (size_t)chlen) {
        return 0;
    }
    for(int i = 1; i < chlen; ++i) {
        uint8_t trail = ustr[i];
        if((trail & 0xc0) != 0x80) {
            chlen = 0;
            break;
        }
        ch <<= 6;
        ch |= (trail & 0x3f);
    }
    // chlen == 0 is tested first so a partially accumulated value is never
    // mistaken for a result.
    if(chlen == 0 || ch < min_ch || !uni_valid(ch)) {
        chlen = 0;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
 }
--- a/src/linux/syscalls.asm
+++ b/src/linux/syscalls.asm
@ -0,0 +1,9 @@
 bits 64
 segment .text
 global _exit
 ; _exit: loads 60 into rax and executes `syscall`.
 ; NOTE(review): on x86-64 Linux, 60 is presumably the `exit` system call
 ; number, with the status expected in rdi as left there by the caller --
 ; confirm against the kernel syscall table and the calling convention.
 _exit:
    mov rax, 60
    syscall
    ret ; reached only if the system call returns
--- a/test/test_uchar.c
+++ b/test/test_uchar.c
@ -1,28 +1,19 @@
-#include <uchar.h>
+#include <unicode.h>
 #include <stdio.h>
 mbstate_t state;
 int main() {
-    char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C"
+    char *mbstr = u8"улыбок тебе дед макар";
    size_t in_sz = sizeof in / sizeof *in;
    char16_t out[in_sz];
    char *p_in = in, *end = in + in_sz;
    char16_t *p_out = out;
    size_t rc;
    while((rc = mbrtoc16(p_out, p_in, end - p_in, &state)))
    {
-        if(rc == (size_t)-1)      // invalid input
+        char *str = mbstr;
-            break;
+        uchar_t ch;
-        else if(rc == (size_t)-2) // truncated input
+        int len;
-            break;
+        while((len = utf8_dec(str, &ch)) > 0 && ch != 0) {
-        else if(rc == (size_t)-3) // UTF-16 high surrogate
+            printf("char: %d\n", ch);
-            p_out += 1;
+            str += len;
-        else {
+        }
-            p_in += rc;
+        if(len <= 0) {
-            p_out += 1;
+            printf("This string is not utf8\n");
-        };
+        }
    }
    size_t out_sz = p_out - out + 1;
 }