Fix using invalid unicode data and add other unicode character data

This commit is contained in:
bumbread 2022-07-05 15:37:15 +11:00
parent 2bfdcc94cf
commit f52bd57a8f
4 changed files with 658069 additions and 34698 deletions

View File

@ -10,7 +10,7 @@ typedef uint_least32_t char32_t;
typedef int32_t uchar_t; typedef int32_t uchar_t;
enum { enum {
UCHAR_Invalid, UCHAR_BAD,
UCHAR_Cc, UCHAR_Cc,
UCHAR_Cf, UCHAR_Cf,
UCHAR_Co, UCHAR_Co,
@ -42,44 +42,82 @@ enum {
UCHAR_Zs, UCHAR_Zs,
}; };
typedef struct uchar_props uchar_props; enum {
struct uchar_props { UCHAR_BIDI_AL,
int bidi_class; UCHAR_BIDI_AN,
int bidi_mirrored; UCHAR_BIDI_B,
int bidi_paired_bracket; UCHAR_BIDI_BN,
int bidi_paired_bracket_type; UCHAR_BIDI_CS,
int block; UCHAR_BIDI_EN,
int canon_comb_class; UCHAR_BIDI_ES,
uchar_t ch_lower; UCHAR_BIDI_ET,
uchar_t ch_upper; UCHAR_BIDI_FSI,
int ndecomp; UCHAR_BIDI_L,
uchar_t const decomp[4]; UCHAR_BIDI_LRE,
uchar_t default_igncp; UCHAR_BIDI_LRI,
int deprecated; UCHAR_BIDI_LRO,
int east_asian_width; UCHAR_BIDI_NSM,
int gcat; UCHAR_BIDI_ON,
int hangul_syl_type; UCHAR_BIDI_PDF,
int join_type; UCHAR_BIDI_PDI,
int join_group; UCHAR_BIDI_R,
int line_brk; UCHAR_BIDI_RLE,
char const *name; UCHAR_BIDI_RLI,
uchar_t nc_cp; UCHAR_BIDI_RLO,
int num_val; UCHAR_BIDI_S,
int ws; UCHAR_BIDI_WS,
int dash;
int letter_props;
int math_props;
int script;
}; };
// Decomposition-type tags stored in uchar_props.dec_type.
//
// The data generator emits `UCHAR_DECOMP_<TAG>`, where <TAG> is the
// upper-cased "<tag>" prefix of a character's decomposition mapping field,
// and falls back to UCHAR_DECOMP_CANON when the mapping has no tag. The
// enumerator names must therefore stay in sync with the tag spellings that
// appear in the data file.
//
// Values are spelled out explicitly; they are the same values implicit
// numbering would assign (0..16, in declaration order).
enum {
    UCHAR_DECOMP_CANON    = 0,  // no tag (generator default)
    UCHAR_DECOMP_FONT     = 1,  // <font>
    UCHAR_DECOMP_NOBREAK  = 2,  // <noBreak>
    UCHAR_DECOMP_INITIAL  = 3,  // <initial>
    UCHAR_DECOMP_MEDIAL   = 4,  // <medial>
    UCHAR_DECOMP_FINAL    = 5,  // <final>
    UCHAR_DECOMP_ISOLATED = 6,  // <isolated>
    UCHAR_DECOMP_CIRCLE   = 7,  // <circle>
    UCHAR_DECOMP_SUPER    = 8,  // <super>
    UCHAR_DECOMP_SUB      = 9,  // <sub>
    UCHAR_DECOMP_VERTICAL = 10, // <vertical>
    UCHAR_DECOMP_WIDE     = 11, // <wide>
    UCHAR_DECOMP_NARROW   = 12, // <narrow>
    UCHAR_DECOMP_SMALL    = 13, // <small>
    UCHAR_DECOMP_SQUARE   = 14, // <square>
    UCHAR_DECOMP_FRACTION = 15, // <fraction>
    UCHAR_DECOMP_COMPAT   = 16  // <compat>
};
int uni_classify(uchar_t ch); typedef struct uchar_props uchar_props;
int uni_valid(uchar_t ch); struct uchar_props {
uchar_t uni_to_lower(uchar_t u); uchar_t code;
uchar_t uni_to_upper(uchar_t u); char const *name;
int cat_gen;
int cat_bidi;
int comb_class;
int dec_type;
int dec_map_n;
uchar_t const dec_map[18]; // U+FDFA takes 18, everything else takes up <8
int dec_value;
int dig_value;
double num_value;
int bidi_mirrored;
char const *old_name;
char const *comment;
uchar_t lower;
uchar_t upper;
uchar_t title;
};
int uni_is_hsur(char16_t ch); uchar_props *uni_props (uchar_t cp);
int uni_is_lsur(char16_t ch); int uni_valid (uchar_t cp);
int uni_classify(uchar_t cp);
uchar_t uni_tolower(uchar_t cp);
uchar_t uni_toupper(uchar_t cp);
uchar_t uni_totitle(uchar_t cp);
int uni_is_hsur(char16_t cp);
int uni_is_lsur(char16_t cp);
uchar_t uni_surtoc (char16_t hsur, char16_t lsur); uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
int utf16_chlen(char16_t const *str); int utf16_chlen(char16_t const *str);

View File

@ -3,16 +3,30 @@
#include "unicode/data.h" #include "unicode/data.h"
int uni_classify(uchar_t cp) { uchar_props *uni_props(uchar_t cp) {
return uni_codepoints[cp].cat; if(!uni_valid(cp)) return NULL;
if(unicode_data[cp].code != cp) return NULL;
return &unicode_data[cp];
} }
uchar_t uni_to_lower(uchar_t cp) { int uni_cat_gen(uchar_t cp) {
return uni_codepoints[cp].lower; uchar_props *props = uni_props(cp);
if(props != NULL)
return unicode_data[cp].cat_gen;
else
return UCHAR_BAD;
} }
uchar_t uni_to_upper(uchar_t cp) { uchar_t uni_tolower(uchar_t cp) {
return uni_codepoints[cp].upper; return unicode_data[cp].lower;
}
uchar_t uni_toupper(uchar_t cp) {
return unicode_data[cp].upper;
}
uchar_t uni_totitle(uchar_t cp) {
return unicode_data[cp].title;
} }
int uni_valid(uchar_t ch) { int uni_valid(uchar_t ch) {

View File

@ -9,34 +9,90 @@ os.chdir(dname)
with open('data.h', 'w') as header: with open('data.h', 'w') as header:
header.write('\n'); header.write('\n');
header.write('#pragma once\n\n'); header.write('#pragma once\n\n');
header.write('#include<unicode.h>\n'); header.write('#include <unicode.h>\n');
header.write( header.write(
''' '''
struct _uni_elm { uchar_props unicode_data[] = {
uchar_t code;
int cat;
uchar_t lower;
uchar_t upper;
} uni_codepoints[] = {
'''); ''');
with open('data.txt') as file: with open('data.txt') as file:
for line in file: for line in file:
row = line.split(';') row = line.split(';')
code = row[0].strip() code = row[0].strip()
cat = row[2].strip() name = row[1].strip()
cat_gen = row[2].strip()
cat_bidi = row[4].strip()
comb_class = row[3].strip()
dec_map = row[5].strip()
dec_value = row[6].strip()
dig_value = row[7].strip()
num_value = row[8].strip()
mirrored = row[9].strip()
old_name = row[10].strip()
comment = row[11].strip()
upper = row[12].strip()
lower = row[13].strip() lower = row[13].strip()
upper = row[14].strip() title = row[14].strip()
# Process decompositional mapping
dec_map_n = 0
dec_type = 'CANON'
if dec_map != '':
dec_map = dec_map.split(' ')
if dec_map[0][0] == '<':
dec_type = dec_map[0][1:-1].upper()
dec_map = dec_map[1:]
dec_map_n = len(dec_map)
if dec_map_n != 0:
dec_map = ', '.join(list(map(lambda x: '0x' + x, dec_map)))
else:
dec_map = '0'
# Make sure lowercase and uppercase mappings are defined
if lower == '': if lower == '':
lower = code lower = code
if upper == '' or upper == '\n': if upper == '':
upper = code upper = code
header.write(' {' + \ if title == '' or title == '\n':
'0x' + code + ', ' + \ title = code
'UCHAR_' + cat + ', ' + \ header.write(
'0x' + lower + ', ' + \ '''
'0x' + upper + '},\n' [0x%s] = {
); .code = 0x%s,
.name = "%s",
.cat_gen = UCHAR_%s,
.cat_bidi = UCHAR_BIDI_%s,
.comb_class = %s,
.dec_type = UCHAR_DECOMP_%s,
.dec_map_n = %s,
.dec_map = {%s},
.dec_value = %s,
.dig_value = %s,
.num_value = %s,
.bidi_mirrored = %s,
.old_name = "%s",
.comment = "%s",
.lower = 0x%s,
.upper = 0x%s,
.title = 0x%s,
},''' % (
code,
code,
name,
cat_gen,
cat_bidi,
comb_class,
dec_type,
dec_map_n,
dec_map,
dec_value if dec_value != '' else '-1',
dig_value if dig_value != '' else '-1',
num_value if num_value != '' else '-1',
'1' if mirrored == 'Y' else '0',
old_name,
comment,
lower,
upper,
title
));
header.write('};\n\n'); header.write('};\n\n');
header.close(); header.close();

File diff suppressed because it is too large Load Diff