Unicode helpers: add surrogate predicates and UTF-8/UTF-16 decoders.

Review notes on the src/code/unicode.c hunk:
  - utf16_dec() reports a length of 1 for non-surrogate input instead of 0.
  - utf8_dec_s() no longer resets the sequence length after the bounds
    check, which made it return 0 for every input.
  - utf8_dec()/utf8_dec_s() decode lead bytes 0xf8..0xff to U+FFFD instead
    of reading an uninitialized code point.
  - Unpaired surrogates and overlong UTF-8 sequences are rejected.

NOTE(review): indentation and blank context lines were reconstructed from a
whitespace-collapsed copy of this patch, and the header names on the #include
lines of the test/test_uchar.c hunk are missing from that copy. Confirm both
against the repository before applying.

diff --git a/inc/unicode.h b/inc/unicode.h
index e3880d7..c81350c 100644
--- a/inc/unicode.h
+++ b/inc/unicode.h
@@ -8,7 +8,8 @@
 typedef uint_least16_t char16_t;
 typedef uint_least32_t char32_t;
 typedef int32_t uchar_t;
-typedef enum {
+
+enum {
     UCHAR_Invalid,
     UCHAR_Cc,
     UCHAR_Cf,
@@ -39,19 +40,57 @@ typedef enum {
     UCHAR_Zl,
     UCHAR_Zp,
     UCHAR_Zs,
-} uchar_class;
+};
+
+typedef struct uchar_props uchar_props;
+struct uchar_props {
+    int bidi_class;
+    int bidi_mirrored;
+    int bidi_paired_bracket;
+    int bidi_paired_bracket_type;
+    int block;
+    int canon_comb_class;
+    uchar_t ch_lower;
+    uchar_t ch_upper;
+    int ndecomp;
+    uchar_t const decomp[4];
+    uchar_t default_igncp;
+    int deprecated;
+    int east_asian_width;
+    int gcat;
+    int hangul_syl_type;
+    int join_type;
+    int join_group;
+    int line_brk;
+    char const *name;
+    uchar_t nc_cp;
+    int num_val;
+    int ws;
+    int dash;
+    int letter_props;
+    int math_props;
+    int script;
+};
+
 
 int uni_classify(uchar_t ch);
 int uni_valid(uchar_t ch);
 uchar_t uni_to_lower(uchar_t u);
 uchar_t uni_to_upper(uchar_t u);
 
-int utf8_dec (char const *restrict utf8_str, uchar_t *restrict ch);
-int utf16_dec (char const *restrict utf16_str, uchar_t *restrict ch);
-int utf8_dec_s (char const *restrict utf8_str, size_t len, uchar_t *restrict ch);
-int utf16_dec_s(char const *restrict utf16_str, size_t len, uchar_t *restrict ch);
+int uni_is_hsur(char16_t ch);
+int uni_is_lsur(char16_t ch);
+uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
 
-int utf8_enc (char *utf8_str, uchar_t ch);
-int utf16_enc (char *utf16_str, uchar_t ch);
-int utf8_enc_s (char *utf8_str, size_t len, uchar_t ch);
-int utf16_enc_s(char *utf16_str, size_t len, uchar_t ch);
+int utf16_chlen(char16_t const *str);
+int utf8_chlen (char const *str);
+
+int utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
+int utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
+int utf16_dec (char16_t const *restrict str, uchar_t *restrict ch);
+int utf8_dec (char const *restrict str, uchar_t *restrict ch);
+
+int utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
+int utf8_enc_s (char *str, size_t len, uchar_t ch);
+int utf16_enc (char16_t *str, uchar_t ch);
+int utf8_enc (char *str, uchar_t ch);
diff --git a/src/code/unicode.c b/src/code/unicode.c
index 93da509..9585a12 100644
--- a/src/code/unicode.c
+++ b/src/code/unicode.c
@@ -15,3 +15,176 @@ uchar_t uni_to_upper(uchar_t cp) {
     return uni_codepoints[cp].upper;
 }
 
+// Returns nonzero if ch is a Unicode scalar value: in the range 0..0x10ffff
+// and outside the surrogate range 0xd800..0xdfff.
+int uni_valid(uchar_t ch) {
+    return (0x0000 <= ch && ch <= 0xd7ff) || (0xe000 <= ch && ch <= 0x10ffff);
+}
+
+// Returns nonzero if ch is a UTF-16 high (leading) surrogate.
+int uni_is_hsur(char16_t ch) {
+    return 0xd800 <= ch && ch <= 0xdbff;
+}
+
+// Returns nonzero if ch is a UTF-16 low (trailing) surrogate.
+int uni_is_lsur(char16_t ch) {
+    return 0xdc00 <= ch && ch <= 0xdfff;
+}
+
+// Combines a surrogate pair into the code point it encodes. The arguments
+// are not validated; check them with uni_is_hsur/uni_is_lsur first.
+uchar_t uni_surtoc(char16_t hsur, char16_t lsur) {
+    // Widen before shifting so the result cannot overflow a 16-bit int.
+    uchar_t u = ((uchar_t)(hsur & 0x3ff) << 10) | (lsur & 0x3ff);
+    return u + 0x10000;
+}
+
+// Returns the number of 16-bit units (1 or 2) in the UTF-16 character that
+// starts at str, judging by its first unit only, or 0 if that unit is a low
+// surrogate and so cannot start a character.
+int utf16_chlen(char16_t const *str) {
+    char16_t cp = *str;
+    if(uni_is_hsur(cp)) return 2;
+    else if(uni_is_lsur(cp)) return 0;
+    else if(uni_valid(cp)) return 1;
+    return 0;
+}
+
+// Returns the number of bytes (1 to 4) in the UTF-8 sequence that starts at
+// str, judging by its lead byte only, or 0 if that byte cannot start a
+// sequence. Trail bytes are not examined.
+int utf8_chlen(char const *str) {
+    uint8_t byte0 = (uint8_t)*str;
+    if(byte0 < 0x80) return 1;
+    else if(byte0 < 0xc0) return 0; // trail byte, cannot start a sequence
+    else if(byte0 < 0xe0) return 2;
+    else if(byte0 < 0xf0) return 3;
+    else if(byte0 < 0xf8) return 4;
+    return 0;
+}
+
+// Decodes one code point from the start of the UTF-16 string str.
+// Returns the number of 16-bit units consumed (1 or 2) and stores the code
+// point in *chp when chp is not NULL. On an unpaired surrogate it returns 0
+// and stores U+FFFD instead. The string needs a terminator, because the
+// unit after a high surrogate is always read.
+int utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
+    int chlen = 1;
+    uchar_t ch = str[0];
+    // Only a high surrogate followed by a low one forms a pair; a lone
+    // surrogate is left in ch so that uni_valid() rejects it below.
+    if(uni_is_hsur(str[0]) && uni_is_lsur(str[1])) {
+        ch = uni_surtoc(str[0], str[1]);
+        chlen = 2;
+    }
+    if(!uni_valid(ch)) {
+        chlen = 0;
+        ch = 0xfffd;
+    }
+    if(chp != NULL) *chp = ch;
+    return chlen;
+}
+
+// Same as utf16_dec(), but reads at most len 16-bit units from str.
+// Returns 0 without touching *chp when len is 0 or when the buffer ends
+// right after a high surrogate.
+int utf16_dec_s(
+    char16_t const *restrict str,
+    size_t len,
+    uchar_t *restrict chp
+) {
+    if(len == 0) return 0;
+    int chlen = 1;
+    uchar_t ch = str[0];
+    if(uni_is_hsur(str[0])) {
+        if(len < 2) return 0;
+        // A lone high surrogate stays in ch and fails uni_valid() below.
+        if(uni_is_lsur(str[1])) {
+            ch = uni_surtoc(str[0], str[1]);
+            chlen = 2;
+        }
+    }
+    if(!uni_valid(ch)) {
+        ch = 0xfffd;
+        chlen = 0;
+    }
+    if(chp != NULL) *chp = ch;
+    return chlen;
+}
+
+// Smallest code point that needs an n-byte UTF-8 sequence, indexed by n.
+// A decoded value below this limit came from an overlong encoding.
+static uchar_t const utf8_min_cp[5] = {0, 0, 0x80, 0x800, 0x10000};
+
+// Decodes one code point from the start of the UTF-8 string str.
+// Returns the number of bytes consumed (1 to 4) and stores the code point
+// in *chp when chp is not NULL. On malformed input (bad lead or trail byte,
+// overlong encoding, surrogate, value above 0x10ffff) it returns 0 and
+// stores U+FFFD instead. The string needs a NUL terminator: a NUL is not a
+// trail byte, so the scan never runs past it.
+int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
+    uint8_t const *ustr = (uint8_t const *)str;
+    int chlen;
+    uchar_t ch;
+    if(ustr[0] < 0x80) chlen = 1, ch = ustr[0];
+    else if(ustr[0] < 0xc0) chlen = 0, ch = 0xfffd;
+    else if(ustr[0] < 0xe0) chlen = 2, ch = ustr[0] & 0x1f;
+    else if(ustr[0] < 0xf0) chlen = 3, ch = ustr[0] & 0x0f;
+    else if(ustr[0] < 0xf8) chlen = 4, ch = ustr[0] & 0x07;
+    else chlen = 0, ch = 0xfffd;
+    for(int i = 1; i < chlen; ++i) {
+        uint8_t trail = ustr[i];
+        if((trail & 0xc0) != 0x80) {
+            chlen = 0;
+            ch = 0xfffd;
+            break;
+        }
+        ch <<= 6;
+        ch |= (trail & 0x3f);
+    }
+    if(ch < utf8_min_cp[chlen] || !uni_valid(ch)) {
+        chlen = 0;
+        ch = 0xfffd;
+    }
+    if(chp != NULL) *chp = ch;
+    return chlen;
+}
+
+// Same as utf8_dec(), but reads at most len bytes from str.
+// Returns 0 without touching *chp when len is 0 or when the sequence
+// announced by the lead byte does not fit in len bytes.
+int utf8_dec_s(
+    char const *restrict str,
+    size_t len,
+    uchar_t *restrict chp
+) {
+    if(len == 0) return 0;
+    uint8_t const *ustr = (uint8_t const *)str;
+    int chlen;
+    uchar_t ch;
+    if(ustr[0] < 0x80) chlen = 1, ch = ustr[0];
+    else if(ustr[0] < 0xc0) chlen = 0, ch = 0xfffd;
+    else if(ustr[0] < 0xe0) chlen = 2, ch = ustr[0] & 0x1f;
+    else if(ustr[0] < 0xf0) chlen = 3, ch = ustr[0] & 0x0f;
+    else if(ustr[0] < 0xf8) chlen = 4, ch = ustr[0] & 0x07;
+    else chlen = 0, ch = 0xfffd;
+    // Truncated sequence: do not read past the end of the buffer.
+    if(len < (size_t)chlen) return 0;
+    for(int i = 1; i < chlen; ++i) {
+        uint8_t trail = ustr[i];
+        if((trail & 0xc0) != 0x80) {
+            chlen = 0;
+            ch = 0xfffd;
+            break;
+        }
+        ch <<= 6;
+        ch |= (trail & 0x3f);
+    }
+    if(ch < utf8_min_cp[chlen] || !uni_valid(ch)) {
+        chlen = 0;
+        ch = 0xfffd;
+    }
+    if(chp != NULL) *chp = ch;
+    return chlen;
+}
+
diff --git a/src/linux/syscalls.asm b/src/linux/syscalls.asm
new file mode 100644
index 0000000..21fa358
--- /dev/null
+++ b/src/linux/syscalls.asm
@@ -0,0 +1,9 @@
+
+bits 64
+segment .text
+
+global _exit
+_exit:
+    mov rax, 60
+    syscall
+    ret
diff --git a/test/test_uchar.c b/test/test_uchar.c
index 9490d3d..1e187dc 100644
--- a/test/test_uchar.c
+++ b/test/test_uchar.c
@@ -1,28 +1,19 @@
-#include
+#include
+#include
 
-mbstate_t state;
 int main()
 {
-    char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C"
-    size_t in_sz = sizeof in / sizeof *in;
-
-    char16_t out[in_sz];
-    char *p_in = in, *end = in + in_sz;
-    char16_t *p_out = out;
-    size_t rc;
-    while((rc = mbrtoc16(p_out, p_in, end - p_in, &state)))
+    char *mbstr = u8"улыбок тебе дед макар";
     {
-        if(rc == (size_t)-1) // invalid input
-            break;
-        else if(rc == (size_t)-2) // truncated input
-            break;
-        else if(rc == (size_t)-3) // UTF-16 high surrogate
-            p_out += 1;
-        else {
-            p_in += rc;
-            p_out += 1;
-        };
+        char *str = mbstr;
+        uchar_t ch;
+        int len;
+        while((len = utf8_dec(str, &ch)) > 0 && ch != 0) {
+            printf("char: %d\n", ch);
+            str += len;
+        }
+        if(len <= 0) {
+            printf("This string is not utf8\n");
+        }
     }
-
-    size_t out_sz = p_out - out + 1;
 }