Fix using invalid unicode data and add other unicode character data

2022-07-05 15:37:15 +11:00 · 2022-07-05 15:37:15 +11:00 · f52bd57a8f
parent 2bfdcc94cf
commit f52bd57a8f
4 changed files with 658069 additions and 34698 deletions
--- a/inc/unicode.h
+++ b/inc/unicode.h
@ -10,7 +10,7 @@ typedef uint_least32_t char32_t;
 typedef int32_t uchar_t;

 enum {
-    UCHAR_Invalid,
+    UCHAR_BAD,
    UCHAR_Cc,
    UCHAR_Cf,
    UCHAR_Co,
@ -42,44 +42,82 @@ enum {
    UCHAR_Zs,
 };

-typedef struct uchar_props uchar_props;
-struct uchar_props {
-    int     bidi_class;
-    int     bidi_mirrored;
-    int     bidi_paired_bracket;
-    int     bidi_paired_bracket_type;
-    int     block;
-    int     canon_comb_class;
-    uchar_t ch_lower;
-    uchar_t ch_upper;
-    int     ndecomp;
-    uchar_t const decomp[4];
-    uchar_t default_igncp;
-    int     deprecated;
-    int     east_asian_width;
-    int     gcat;
-    int     hangul_syl_type;
-    int     join_type;
-    int     join_group;
-    int     line_brk;
-    char    const *name;
-    uchar_t nc_cp;
-    int     num_val;
-    int     ws;
-    int     dash;
-    int     letter_props;
-    int     math_props;
-    int     script;
+enum { // Bidirectional classes; compile.py stores one of these in uchar_props.cat_bidi as UCHAR_BIDI_<class>
+    UCHAR_BIDI_AL,
+    UCHAR_BIDI_AN,
+    UCHAR_BIDI_B,
+    UCHAR_BIDI_BN,
+    UCHAR_BIDI_CS,
+    UCHAR_BIDI_EN,
+    UCHAR_BIDI_ES,
+    UCHAR_BIDI_ET,
+    UCHAR_BIDI_FSI,
+    UCHAR_BIDI_L,
+    UCHAR_BIDI_LRE,
+    UCHAR_BIDI_LRI,
+    UCHAR_BIDI_LRO,
+    UCHAR_BIDI_NSM,
+    UCHAR_BIDI_ON,
+    UCHAR_BIDI_PDF,
+    UCHAR_BIDI_PDI,
+    UCHAR_BIDI_R,
+    UCHAR_BIDI_RLE,
+    UCHAR_BIDI_RLI,
+    UCHAR_BIDI_RLO,
+    UCHAR_BIDI_S,
+    UCHAR_BIDI_WS,
 };

+enum { // Decomposition kinds for uchar_props.dec_type; compile.py upper-cases the <tag> of the mapping
+    UCHAR_DECOMP_CANON, // also what compile.py writes when the mapping has no <tag> or is empty
+    UCHAR_DECOMP_FONT,
+    UCHAR_DECOMP_NOBREAK,
+    UCHAR_DECOMP_INITIAL,
+    UCHAR_DECOMP_MEDIAL,
+    UCHAR_DECOMP_FINAL,
+    UCHAR_DECOMP_ISOLATED,
+    UCHAR_DECOMP_CIRCLE,
+    UCHAR_DECOMP_SUPER,
+    UCHAR_DECOMP_SUB,
+    UCHAR_DECOMP_VERTICAL,
+    UCHAR_DECOMP_WIDE,
+    UCHAR_DECOMP_NARROW,
+    UCHAR_DECOMP_SMALL,
+    UCHAR_DECOMP_SQUARE,
+    UCHAR_DECOMP_FRACTION,
+    UCHAR_DECOMP_COMPAT,
+};

-int uni_classify(uchar_t ch);
-int uni_valid(uchar_t ch);
-uchar_t uni_to_lower(uchar_t u);
-uchar_t uni_to_upper(uchar_t u);
+typedef struct uchar_props uchar_props;
+struct uchar_props { // One record per code point; the table is generated by src/code/unicode/compile.py
+    uchar_t       code;          // code point value; uni_props() compares it with the lookup index
+    char const   *name;          // name string (column 1 of data.txt)
+    int           cat_gen;       // general category, one of the UCHAR_* constants (UCHAR_BAD, UCHAR_Cc, ...)
+    int           cat_bidi;      // bidirectional class, one of the UCHAR_BIDI_* constants
+    int           comb_class;    // combining class number (column 3 of data.txt)
+    int           dec_type;      // decomposition kind, one of the UCHAR_DECOMP_* constants
+    int           dec_map_n;     // number of entries used in dec_map; 0 when there is no decomposition
+    uchar_t const dec_map[18]; // U+FDFA takes 18, everything else takes up <8
+    int           dec_value;     // column 6 of data.txt; the generator writes -1 when it is empty
+    int           dig_value;     // column 7 of data.txt; -1 when empty
+    double        num_value;     // column 8 of data.txt; -1 when empty
+    int           bidi_mirrored; // 1 when column 9 of data.txt is 'Y', otherwise 0
+    char const   *old_name;      // second name string (column 10 of data.txt)
+    char const   *comment;       // comment string (column 11 of data.txt)
+    uchar_t       lower;         // lowercase mapping; the generator substitutes the code point itself when empty
+    uchar_t       upper;         // uppercase mapping; same fallback
+    uchar_t       title;         // titlecase mapping; same fallback
+};

-int     uni_is_hsur(char16_t ch);
-int     uni_is_lsur(char16_t ch);
+uchar_props *uni_props   (uchar_t cp); // property record for cp, or NULL when cp is invalid or has no table entry
+int          uni_valid   (uchar_t cp); // validity test used by uni_props() before it indexes the table
+int          uni_classify(uchar_t cp); // NOTE(review): src/code/unicode.c in this change defines uni_cat_gen(), not uni_classify() — confirm the name
+uchar_t      uni_tolower(uchar_t cp);  // lowercase mapping read from the table
+uchar_t      uni_toupper(uchar_t cp);  // uppercase mapping read from the table
+uchar_t      uni_totitle(uchar_t cp);  // titlecase mapping read from the table
+
+int     uni_is_hsur(char16_t cp);
+int     uni_is_lsur(char16_t cp);
 uchar_t uni_surtoc (char16_t hsur, char16_t lsur);

 int utf16_chlen(char16_t const *str);
--- a/src/code/unicode.c
+++ b/src/code/unicode.c
@ -3,16 +3,30 @@

 #include "unicode/data.h"

-int uni_classify(uchar_t cp) {
-    return uni_codepoints[cp].cat;
+uchar_props *uni_props(uchar_t cp) { // Look up the property record for cp; NULL when there is none
+    if(!uni_valid(cp))              return NULL; // checked before indexing; NOTE(review): assumes uni_valid() also bounds cp to the table size — confirm
+    if(unicode_data[cp].code != cp) return NULL; // slot's stored code differs from its index: no entry was generated for cp
+    return &unicode_data[cp];
 }

-uchar_t uni_to_lower(uchar_t cp) {
-    return uni_codepoints[cp].lower;
+int uni_cat_gen(uchar_t cp) { // General category (UCHAR_*) of cp, UCHAR_BAD when uni_props() finds no record. NOTE(review): inc/unicode.h declares uni_classify(), not uni_cat_gen() — confirm the intended name
+    uchar_props *props = uni_props(cp);
+    if(props != NULL)
+        return unicode_data[cp].cat_gen; // same element that props points at
+    else
+        return UCHAR_BAD;
 }

-uchar_t uni_to_upper(uchar_t cp) {
-    return uni_codepoints[cp].upper;
+uchar_t uni_tolower(uchar_t cp) { // Lowercase mapping of cp; cp itself is returned when it has no property record
+    return uni_props(cp) != NULL ? unicode_data[cp].lower : cp; // uni_props() vets cp, so the table is never indexed unchecked
+}
+
+uchar_t uni_toupper(uchar_t cp) { // Uppercase mapping of cp; cp itself is returned when it has no property record
+    return uni_props(cp) != NULL ? unicode_data[cp].upper : cp; // uni_props() vets cp, so the table is never indexed unchecked
+}
+
+uchar_t uni_totitle(uchar_t cp) { // Titlecase mapping of cp; cp itself is returned when it has no property record
+    return uni_props(cp) != NULL ? unicode_data[cp].title : cp; // uni_props() vets cp, so the table is never indexed unchecked
 }

 int uni_valid(uchar_t ch) {
--- a/src/code/unicode/compile.py
+++ b/src/code/unicode/compile.py
@ -9,34 +9,90 @@ os.chdir(dname)
 with open('data.h', 'w') as header:
    header.write('\n');
    header.write('#pragma once\n\n');
-    header.write('#include<unicode.h>\n');
+    header.write('#include <unicode.h>\n');
    header.write(
 '''
-struct _uni_elm {
-    uchar_t code;
-    int     cat;
-    uchar_t lower;
-    uchar_t upper;
-} uni_codepoints[] = {
+uchar_props unicode_data[] = {
 ''');

    with open('data.txt') as file:
        for line in file:
            row = line.split(';')
            code       = row[0].strip()
-            cat   = row[2].strip()
+            name       = row[1].strip()   # each field is one ';'-separated column of the record, picked by index
+            cat_gen    = row[2].strip()
+            cat_bidi   = row[4].strip()   # bidi class is column 4 ...
+            comb_class = row[3].strip()   # ... and the combining class is column 3
+            dec_map    = row[5].strip()
+            dec_value  = row[6].strip()
+            dig_value  = row[7].strip()
+            num_value  = row[8].strip()
+            mirrored   = row[9].strip()
+            old_name   = row[10].strip()
+            comment    = row[11].strip()
+            upper      = row[12].strip()
            lower      = row[13].strip()
-            upper = row[14].strip()
+            title      = row[14].strip()
+            # Split the decomposition field into an optional leading <tag> (its type) and the hex code points
+            dec_map_n  = 0
+            dec_type   = 'CANON'  # used when the field is empty or carries no <tag>
+            if dec_map != '':
+                dec_map = dec_map.split(' ')
+                if dec_map[0][0] == '<':
+                    dec_type = dec_map[0][1:-1].upper()  # '<noBreak>' -> 'NOBREAK'
+                    dec_map = dec_map[1:]
+                dec_map_n = len(dec_map)
+            if dec_map_n != 0:
+                dec_map = ', '.join(list(map(lambda x: '0x' + x, dec_map)))
+            else:
+                dec_map = '0'  # keeps the emitted '{%s}' initializer non-empty
+            # Empty case mappings fall back to the code point itself
            if lower == '':
                lower = code
-            if upper == '' or upper == '\n':
+            if upper == '':
                upper = code
-            header.write('    {' + \
-                '0x'     + code  + ', ' + \
-                'UCHAR_' + cat   + ', ' + \
-                '0x'     + lower + ', ' + \
-                '0x'     + upper + '},\n'
-            );
+            if title == '' or title == '\n':
+                title = code
+            header.write(  # one designated initializer per record, indexed by the code point
+            '''
+    [0x%s] = {
+        .code = 0x%s,
+        .name = "%s",
+        .cat_gen = UCHAR_%s,
+        .cat_bidi = UCHAR_BIDI_%s,
+        .comb_class = %s,
+        .dec_type = UCHAR_DECOMP_%s,
+        .dec_map_n = %s,
+        .dec_map = {%s},
+        .dec_value = %s,
+        .dig_value = %s,
+        .num_value = %s,
+        .bidi_mirrored = %s,
+        .old_name = "%s",
+        .comment  = "%s",
+        .lower = 0x%s,
+        .upper = 0x%s,
+        .title = 0x%s,
+    },''' % (
+                code,
+                code,
+                name,
+                cat_gen,
+                cat_bidi,
+                comb_class,
+                dec_type,
+                dec_map_n,
+                dec_map,
+                dec_value if dec_value != '' else '-1',  # -1 marks "no value"
+                dig_value if dig_value != '' else '-1',
+                num_value.replace('/', '.0/') if num_value != '' else '-1',  # '1/2' -> '1.0/2'; a bare 1/2 is integer division (0) in C
+                '1' if mirrored == 'Y' else '0',
+                old_name,
+                comment,
+                lower,
+                upper,
+                title
+            ));

    header.write('};\n\n');
    header.close();
--- a/src/code/unicode/data.h
+++ b/src/code/unicode/data.h