unicode.h: encoding/decoding UTF-8/16

This commit is contained in:
bumbread 2022-07-02 18:51:30 +11:00
parent f1319e6f1d
commit 2bfdcc94cf
4 changed files with 218 additions and 32 deletions

View File

@ -8,7 +8,8 @@ typedef uint_least16_t char16_t;
typedef uint_least32_t char32_t; typedef uint_least32_t char32_t;
typedef int32_t uchar_t; typedef int32_t uchar_t;
typedef enum {
enum {
UCHAR_Invalid, UCHAR_Invalid,
UCHAR_Cc, UCHAR_Cc,
UCHAR_Cf, UCHAR_Cf,
@ -39,19 +40,57 @@ typedef enum {
UCHAR_Zl, UCHAR_Zl,
UCHAR_Zp, UCHAR_Zp,
UCHAR_Zs, UCHAR_Zs,
} uchar_class; };
// Property record describing a single code point.
// NOTE(review): the field names suggest these mirror Unicode Character
// Database properties (bidi, case mapping, decomposition, line breaking,
// script, ...) -- confirm against whatever generates the table.
typedef struct uchar_props {
    // Bidirectional-text related fields (by name).
    int bidi_class;
    int bidi_mirrored;
    int bidi_paired_bracket;
    int bidi_paired_bracket_type;

    int block;
    int canon_comb_class;

    // Case counterparts of this code point (by name).
    uchar_t ch_lower;
    uchar_t ch_upper;

    // Decomposition: at most 4 code points are stored; ndecomp is presumably
    // the number of entries of decomp[] in use -- TODO confirm.
    // The array is const, so a uchar_props object cannot be assigned to after
    // initialisation.
    int ndecomp;
    uchar_t const decomp[4];

    uchar_t default_igncp;
    int deprecated;
    int east_asian_width;
    int gcat;
    int hangul_syl_type;
    int join_type;
    int join_group;
    int line_brk;
    char const *name;
    uchar_t nc_cp;
    int num_val;
    int ws;
    int dash;
    int letter_props;
    int math_props;
    int script;
} uchar_props;
int uni_classify(uchar_t ch); int uni_classify(uchar_t ch);
int uni_valid(uchar_t ch); int uni_valid(uchar_t ch);
uchar_t uni_to_lower(uchar_t u); uchar_t uni_to_lower(uchar_t u);
uchar_t uni_to_upper(uchar_t u); uchar_t uni_to_upper(uchar_t u);
int utf8_dec (char const *restrict utf8_str, uchar_t *restrict ch); int uni_is_hsur(char16_t ch);
int utf16_dec (char const *restrict utf16_str, uchar_t *restrict ch); int uni_is_lsur(char16_t ch);
int utf8_dec_s (char const *restrict utf8_str, size_t len, uchar_t *restrict ch); uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
int utf16_dec_s(char const *restrict utf16_str, size_t len, uchar_t *restrict ch);
int utf8_enc (char *utf8_str, uchar_t ch); int utf16_chlen(char16_t const *str);
int utf16_enc (char *utf16_str, uchar_t ch); int utf8_chlen (char const *str);
int utf8_enc_s (char *utf8_str, size_t len, uchar_t ch);
int utf16_enc_s(char *utf16_str, size_t len, uchar_t ch); int utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
int utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
int utf16_dec (char16_t const *restrict str, uchar_t *restrict ch);
int utf8_dec (char const *restrict str, uchar_t *restrict ch);
int utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
int utf8_enc_s (char *str, size_t len, uchar_t ch);
int utf16_enc (char16_t *str, uchar_t ch);
int utf8_enc (char *str, uchar_t ch);

View File

@ -15,3 +15,150 @@ uchar_t uni_to_upper(uchar_t cp) {
return uni_codepoints[cp].upper; return uni_codepoints[cp].upper;
} }
// Returns 1 when ch lies in 0..0x10ffff and outside the surrogate range
// 0xd800..0xdfff, otherwise 0.
int uni_valid(uchar_t ch) {
    if(ch < 0x0000 || ch > 0x10ffff) {
        return 0;
    }
    // Inside the overall range: only the surrogate block is excluded.
    return ch < 0xd800 || ch > 0xdfff;
}
// Returns 1 when ch is a high (leading) surrogate, i.e. in 0xd800..0xdbff,
// otherwise 0.
int uni_is_hsur(char16_t ch) {
    if(ch < 0xd800) {
        return 0;
    }
    return ch < 0xdc00;
}
// Returns 1 when ch is a low (trailing) surrogate, i.e. in 0xdc00..0xdfff,
// otherwise 0.
int uni_is_lsur(char16_t ch) {
    if(ch > 0xdfff) {
        return 0;
    }
    return ch >= 0xdc00;
}
uchar_t uni_surtoc(char16_t hsur, char16_t lsur) {
uchar_t u = ((0x3ff & hsur) << 10) | (lsur & 0x3ff);
return u + 0x10000;
}
// Reports how many UTF-16 code units the character starting at str occupies,
// judging by its first unit only: 2 for a high surrogate, 1 for any other
// valid unit, 0 for a low surrogate or a unit uni_valid() rejects.
int utf16_chlen(char16_t const *str) {
    char16_t const unit = str[0];
    if(uni_is_hsur(unit)) {
        return 2;
    }
    if(uni_is_lsur(unit)) {
        // A trailing surrogate cannot start a character.
        return 0;
    }
    return uni_valid(unit) ? 1 : 0;
}
// Reports the length in bytes of the UTF-8 sequence announced by the first
// byte of str (1..4), or 0 when that byte cannot start a sequence
// (continuation bytes 0x80..0xbf and bytes 0xf8..0xff). Only the first byte
// is inspected.
int utf8_chlen(char const *str) {
    // Indexed by the top five bits of the lead byte.
    static const int len_by_prefix[32] = {
        1, 1, 1, 1, 1, 1, 1, 1,   // 0x00..0x3f
        1, 1, 1, 1, 1, 1, 1, 1,   // 0x40..0x7f
        0, 0, 0, 0, 0, 0, 0, 0,   // 0x80..0xbf: continuation bytes
        2, 2, 2, 2,               // 0xc0..0xdf
        3, 3,                     // 0xe0..0xef
        4,                        // 0xf0..0xf7
        0                         // 0xf8..0xff
    };
    uint8_t const lead = (uint8_t)str[0];
    return len_by_prefix[lead >> 3];
}
// Decodes one code point from a zero-terminated UTF-16 string.
//
// str - first code unit to decode; the string must be readable up to and
//       including its terminating zero unit.
// chp - when not NULL, receives the decoded code point, or U+FFFD when the
//       input is malformed.
//
// Returns the number of code units consumed (1 or 2), or 0 when str does not
// start with a well-formed sequence: a lone low surrogate, or a high
// surrogate that is not followed by a low surrogate. The terminating zero
// unit decodes as code point 0 with length 1.
int utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
    int chlen;
    uchar_t ch;
    if(uni_is_hsur(str[0])) {
        char16_t hsur = str[0];
        // If the string ends right after the high surrogate this reads the
        // terminator, which fails the range check below.
        char16_t lsur = str[1];
        if(0xdc00 <= lsur && lsur <= 0xdfff) {
            ch = uni_surtoc(hsur, lsur);
            chlen = 2;
        }
        else {
            // Unpaired high surrogate.
            ch = 0xfffd;
            chlen = 0;
        }
    }
    else {
        ch = str[0];
        chlen = 1;
    }
    // Rejects a lone low surrogate in the single-unit path.
    if(!uni_valid(ch)) {
        chlen = 0;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
// Decodes one code point from a length-bounded UTF-16 buffer.
//
// str - first code unit to decode.
// len - number of code units readable at str.
// chp - when not NULL, receives the decoded code point, or U+FFFD when the
//       input is malformed.
//
// Returns the number of code units consumed (1 or 2), or 0 on failure.
// When len is 0, or when a high surrogate is the last available unit, 0 is
// returned and *chp is left untouched. When the input is malformed (lone low
// surrogate, or high surrogate not followed by a low surrogate) 0 is returned
// and *chp is set to U+FFFD.
int utf16_dec_s(
    char16_t const *restrict str,
    size_t len,
    uchar_t *restrict chp
) {
    if(len == 0) return 0;
    int chlen;
    uchar_t ch;
    if(uni_is_hsur(str[0])) {
        if(len < 2) return 0;
        char16_t hsur = str[0];
        char16_t lsur = str[1];
        if(0xdc00 <= lsur && lsur <= 0xdfff) {
            ch = uni_surtoc(hsur, lsur);
            chlen = 2;
        }
        else {
            // Unpaired high surrogate: do not fabricate a code point from it.
            ch = 0xfffd;
            chlen = 0;
        }
    }
    else {
        ch = str[0];
        chlen = 1;
    }
    // Rejects a lone low surrogate in the single-unit path.
    if(!uni_valid(ch)) {
        ch = 0xfffd;
        chlen = 0;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
// Decodes one code point from a zero-terminated UTF-8 string.
//
// str - first byte to decode; the string must be readable up to and
//       including its terminating zero byte.
// chp - when not NULL, receives the decoded code point, or U+FFFD when the
//       input is malformed.
//
// Returns the number of bytes consumed (1..4), or 0 when str does not start
// with a well-formed sequence: a continuation byte or a byte >= 0xf8 in lead
// position, a missing or invalid continuation byte, an overlong encoding, or
// a result that uni_valid() rejects (surrogates, values above 0x10ffff).
// The terminating zero byte decodes as code point 0 with length 1.
int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
    uint8_t const *ustr = (uint8_t const *)str;
    int chlen;
    uchar_t ch;
    // Smallest code point that genuinely needs chlen bytes; anything below it
    // is an overlong encoding.
    uchar_t min_ch = 0;
    if(ustr[0] < 0x80) {
        chlen = 1;
        ch = ustr[0];
    }
    else if(ustr[0] < 0xc0) {
        // Continuation byte in lead position.
        chlen = 0;
        ch = 0xfffd;
    }
    else if(ustr[0] < 0xe0) {
        chlen = 2;
        ch = ustr[0] & 0x1f;
        min_ch = 0x80;
    }
    else if(ustr[0] < 0xf0) {
        chlen = 3;
        ch = ustr[0] & 0x0f;
        min_ch = 0x800;
    }
    else if(ustr[0] < 0xf8) {
        chlen = 4;
        ch = ustr[0] & 0x07;
        min_ch = 0x10000;
    }
    else {
        // 0xf8..0xff never start a sequence; ch must still be defined before
        // it is examined below.
        chlen = 0;
        ch = 0xfffd;
    }
    for(int i = 1; i < chlen; ++i) {
        uint8_t trail = ustr[i];
        // A zero terminator fails this test too, so the loop never reads past
        // the end of the string.
        if((trail & 0xc0) != 0x80) {
            chlen = 0;
            ch = 0xfffd;
            break;
        }
        ch <<= 6;
        ch |= (trail & 0x3f);
    }
    if(ch < min_ch || !uni_valid(ch)) {
        chlen = 0;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
// Decodes one code point from a length-bounded UTF-8 buffer.
//
// str - first byte to decode.
// len - number of bytes readable at str.
// chp - when not NULL, receives the decoded code point, or U+FFFD when the
//       input is malformed.
//
// Returns the number of bytes consumed (1..4), or 0 on failure.
// When len is 0, or when the lead byte announces more bytes than len
// provides, 0 is returned and *chp is left untouched. When the input is
// malformed (continuation byte or byte >= 0xf8 in lead position, invalid
// continuation byte, overlong encoding, or a result that uni_valid() rejects)
// 0 is returned and *chp is set to U+FFFD.
int utf8_dec_s(
    char const *restrict str,
    size_t len,
    uchar_t *restrict chp
) {
    if(len == 0) return 0;
    uint8_t const *ustr = (uint8_t const *)str;
    int chlen;
    uchar_t ch;
    // Smallest code point that genuinely needs chlen bytes; anything below it
    // is an overlong encoding.
    uchar_t min_ch = 0;
    if(ustr[0] < 0x80) {
        chlen = 1;
        ch = ustr[0];
    }
    else if(ustr[0] < 0xc0) {
        // Continuation byte in lead position.
        chlen = 0;
        ch = 0xfffd;
    }
    else if(ustr[0] < 0xe0) {
        chlen = 2;
        ch = ustr[0] & 0x1f;
        min_ch = 0x80;
    }
    else if(ustr[0] < 0xf0) {
        chlen = 3;
        ch = ustr[0] & 0x0f;
        min_ch = 0x800;
    }
    else if(ustr[0] < 0xf8) {
        chlen = 4;
        ch = ustr[0] & 0x07;
        min_ch = 0x10000;
    }
    else {
        // 0xf8..0xff never start a sequence; ch must still be defined before
        // it is examined below.
        chlen = 0;
        ch = 0xfffd;
    }
    // Truncated sequence: the buffer ends before the announced length.
    // chlen is deliberately kept on the success path so the trail bytes are
    // decoded and the real length is returned.
    if(len < (size_t)chlen) {
        return 0;
    }
    for(int i = 1; i < chlen; ++i) {
        uint8_t trail = ustr[i];
        if((trail & 0xc0) != 0x80) {
            chlen = 0;
            ch = 0xfffd;
            break;
        }
        ch <<= 6;
        ch |= (trail & 0x3f);
    }
    if(ch < min_ch || !uni_valid(ch)) {
        chlen = 0;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}

9
src/linux/syscalls.asm Normal file
View File

@ -0,0 +1,9 @@
; Assemble as 64-bit code, placed in the .text section.
bits 64
segment .text
; Export the _exit symbol so other object files can link against it.
global _exit
; _exit: issues system call number 60, which is exit on x86-64 Linux.
; No argument register is set up here, so the kernel receives whatever the
; caller left in them -- presumably the exit status in rdi per the System V
; calling convention; confirm against the C-side prototype.
_exit:
mov rax, 60
syscall
; Only reached if the system call returns.
ret

View File

@ -1,28 +1,19 @@
#include <uchar.h> #include <unicode.h>
#include <stdio.h>
mbstate_t state;
int main() { int main() {
char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C" char *mbstr = u8"улыбок тебе дед макар";
size_t in_sz = sizeof in / sizeof *in;
char16_t out[in_sz];
char *p_in = in, *end = in + in_sz;
char16_t *p_out = out;
size_t rc;
while((rc = mbrtoc16(p_out, p_in, end - p_in, &state)))
{ {
if(rc == (size_t)-1) // invalid input char *str = mbstr;
break; uchar_t ch;
else if(rc == (size_t)-2) // truncated input int len;
break; while((len = utf8_dec(str, &ch)) > 0 && ch != 0) {
else if(rc == (size_t)-3) // UTF-16 high surrogate printf("char: %d\n", ch);
p_out += 1; str += len;
else { }
p_in += rc; if(len <= 0) {
p_out += 1; printf("This string is not utf8\n");
}; }
} }
size_t out_sz = p_out - out + 1;
} }