unicode enc/dec functions

This commit is contained in:
bumbread 2022-07-06 10:59:57 +11:00
parent 5ab103a4ef
commit 097d7cf300
2 changed files with 117 additions and 24 deletions

View File

@ -14,7 +14,7 @@ typedef int32_t uchar_t;
#define UNI_EIBYTE (-3) #define UNI_EIBYTE (-3)
#define UNI_ETBYTE (-4) #define UNI_ETBYTE (-4)
#define UNI_ESTRLN (-5) #define UNI_ESTRLN (-5)
#define UNI_EOLONG (-6)
enum { enum {
UCHAR_BAD, UCHAR_BAD,
@ -127,15 +127,15 @@ int uni_is_hsur(char16_t cp);
int uni_is_lsur(char16_t cp); int uni_is_lsur(char16_t cp);
uchar_t uni_surtoc (char16_t hsur, char16_t lsur); uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
size_t utf16_chlen(char16_t const *str); int utf16_chlen(char16_t const *str);
size_t utf8_chlen (char const *str); int utf8_chlen (char const *str);
size_t utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch); int utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
size_t utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch); int utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
size_t utf16_dec (char16_t const *restrict str, uchar_t *restrict ch); int utf16_dec (char16_t const *restrict str, uchar_t *restrict ch);
size_t utf8_dec (char const *restrict str, uchar_t *restrict ch); int utf8_dec (char const *restrict str, uchar_t *restrict ch);
size_t utf16_enc_s(char16_t *str, size_t len, uchar_t ch); int utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
size_t utf8_enc_s (char *str, size_t len, uchar_t ch); int utf8_enc_s (char *str, size_t len, uchar_t ch);
size_t utf16_enc (char16_t *str, uchar_t ch); int utf16_enc (char16_t *str, uchar_t ch);
size_t utf8_enc (char *str, uchar_t ch); int utf8_enc (char *str, uchar_t ch);

View File

@ -53,7 +53,7 @@ int utf16_chlen(char16_t const *str) {
else return 1; else return 1;
} }
size_t utf8_chlen(char const *str) { int utf8_chlen(char const *str) {
uint8_t byte0 = (uint8_t)*str; uint8_t byte0 = (uint8_t)*str;
if(byte0 < 0x80) return 1; if(byte0 < 0x80) return 1;
else if(byte0 < 0xc0) return UNI_EIBYTE; else if(byte0 < 0xc0) return UNI_EIBYTE;
@ -63,8 +63,8 @@ size_t utf8_chlen(char const *str) {
return UNI_EIBYTE; return UNI_EIBYTE;
} }
size_t utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) { int utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
size_t chlen = 0; int chlen = 0;
uchar_t ch; uchar_t ch;
if(uni_is_hsur(str[0])) { if(uni_is_hsur(str[0])) {
char16_t hsur = str[0]; char16_t hsur = str[0];
@ -87,13 +87,13 @@ size_t utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
return chlen; return chlen;
} }
size_t utf16_dec_s( int utf16_dec_s(
char16_t const *restrict str, char16_t const *restrict str,
size_t len, int len,
uchar_t *restrict chp uchar_t *restrict chp
) { ) {
if(len == 0) return 0; if(len == 0) return 0;
size_t chlen; int chlen;
uchar_t ch; uchar_t ch;
if(uni_is_hsur(str[0])) { if(uni_is_hsur(str[0])) {
if(len < 2) return 0; if(len < 2) return 0;
@ -118,10 +118,10 @@ size_t utf16_dec_s(
return chlen; return chlen;
} }
size_t utf8_dec(char const *restrict str, uchar_t *restrict chp) { int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
uint8_t const *ustr = (uint8_t const *)str; uint8_t const *ustr = (uint8_t const *)str;
size_t chlen; int chlen;
uchar_t ch; uchar_t ch;
if(ustr[0] < 0x80) ch = ustr[0], chlen = 1; if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE; else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
@ -130,7 +130,7 @@ size_t utf8_dec(char const *restrict str, uchar_t *restrict chp) {
else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4; else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4;
else ch = 0xfffd, chlen = UNI_EIBYTE; else ch = 0xfffd, chlen = UNI_EIBYTE;
if(chlen > 0) for(size_t i = 1; i < chlen; ++i) { if(chlen > 0) for(int i = 1; i < chlen; ++i) {
uint8_t trail = ustr[i]; uint8_t trail = ustr[i];
if((trail & 0xc0) != 0x80) { if((trail & 0xc0) != 0x80) {
chlen = UNI_ETBYTE; chlen = UNI_ETBYTE;
@ -148,14 +148,14 @@ size_t utf8_dec(char const *restrict str, uchar_t *restrict chp) {
return chlen; return chlen;
} }
size_t utf8_dec_s( int utf8_dec_s(
char const *restrict str, char const *restrict str,
size_t len, int len,
uchar_t *restrict chp uchar_t *restrict chp
) { ) {
if(len == 0) return 0; if(len == 0) return 0;
uint8_t const *restrict ustr = (uint8_t const *restrict)str; uint8_t const *restrict ustr = (uint8_t const *restrict)str;
size_t chlen; int chlen;
uchar_t ch; uchar_t ch;
if(ustr[0] < 0x80) ch = ustr[0], chlen = 1; if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE; else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
@ -166,7 +166,7 @@ size_t utf8_dec_s(
if(chlen > len) { if(chlen > len) {
return UNI_ESTRLN; return UNI_ESTRLN;
} }
if(chlen > 0) for(size_t i = 1; i < chlen; ++i) { if(chlen > 0) for(int i = 1; i < chlen; ++i) {
uint8_t trail = ustr[i]; uint8_t trail = ustr[i];
if((trail & 0xc0) != 0x80) { if((trail & 0xc0) != 0x80) {
chlen = UNI_ETBYTE; chlen = UNI_ETBYTE;
@ -184,3 +184,96 @@ size_t utf8_dec_s(
return chlen; return chlen;
} }
/*
 * Encode one code point as UTF-16 into `str`, without bounds checking.
 *
 * str: destination buffer; up to two code units are written, so the
 *      caller must provide room for two.
 * cp:  code point to encode.
 *
 * Returns the number of char16_t units written (1 or 2), or UNI_EBADCP
 * when is_valid() rejects `cp` (nothing is written in that case).
 */
int utf16_enc(char16_t *str, uchar_t cp) {
    if(!is_valid(cp)) {
        return UNI_EBADCP;
    }
    // Values below 0x10000 are stored directly in a single code unit.
    if(cp < 0x10000) {
        str[0] = (char16_t)cp;
        return 1;
    }
    // Everything else is split into a surrogate pair: subtract the
    // 0x10000 bias, then the upper 10 bits go into the high surrogate
    // and the lower 10 bits into the low surrogate.
    cp -= 0x10000;
    char16_t const hsur = (char16_t)(0xD800 + (cp >> 10));
    char16_t const lsur = (char16_t)(0xDC00 + (cp & 0x3ff));
    str[0] = hsur;
    str[1] = lsur;
    return 2;
}
/*
 * Encode one code point as UTF-8 into `str`, without bounds checking.
 *
 * str: destination buffer; up to four bytes are written, so the caller
 *      must provide room for four.
 * ch:  code point to encode.
 *
 * Returns the number of bytes written (1 to 4), or UNI_EBADCP when
 * is_valid() rejects `ch` (nothing is written in that case).
 *
 * Bit layout per sequence length (x = payload bits):
 *   1 byte : 0xxxxxxx
 *   2 bytes: 110xxxxx 10xxxxxx
 *   3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
 *   4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
int utf8_enc(char *str, uchar_t ch) {
    if(!is_valid(ch)) {
        return UNI_EBADCP;
    }
    if(ch < 0x80) {
        str[0] = (char)ch;
        return 1;
    }
    else if(ch < 0x800) {
        str[0] = (char)(0xc0 | (ch >> 6));
        str[1] = (char)(0x80 | (ch & 0x3f));
        return 2;
    }
    else if(ch < 0x10000) {
        // 16 payload bits: 4 in the lead byte, then 6 + 6.
        str[0] = (char)(0xe0 | (ch >> 12));
        str[1] = (char)(0x80 | ((ch >> 6) & 0x3f));
        str[2] = (char)(0x80 | (ch & 0x3f));
        return 3;
    }
    else {
        // 21 payload bits: 3 in the lead byte, then 6 + 6 + 6.
        // NOTE(review): assumes is_valid() has already rejected values
        // above 0x10FFFF, so (ch >> 18) fits in 3 bits -- confirm.
        str[0] = (char)(0xf0 | (ch >> 18));
        str[1] = (char)(0x80 | ((ch >> 12) & 0x3f));
        str[2] = (char)(0x80 | ((ch >> 6) & 0x3f));
        str[3] = (char)(0x80 | (ch & 0x3f));
        return 4;
    }
}
/*
 * Encode one code point as UTF-16 into a buffer of known capacity.
 *
 * str: destination buffer.
 * len: capacity of `str` in char16_t units.
 * ch:  code point to encode.
 *
 * Returns:
 *   UNI_EBADCP  when is_valid() rejects `ch` (checked before `len`);
 *   0           when `len` is 0 (nothing written);
 *   UNI_ESTRLN  when a surrogate pair is needed but `len` < 2;
 *   1 or 2      the number of code units written otherwise.
 */
int utf16_enc_s(char16_t *str, size_t len, uchar_t ch) {
    if(!is_valid(ch)) {
        return UNI_EBADCP;
    }
    if(len == 0) return 0;
    if(ch < 0x10000) {
        str[0] = (char16_t)ch;
        return 1;
    }
    else {
        if(len < 2) return UNI_ESTRLN;
        // Remove the 0x10000 bias; the remaining 20 bits are split
        // 10/10 between the high and low surrogate.
        ch -= 0x10000;
        str[0] = (char16_t)(0xD800 + (ch >> 10));
        str[1] = (char16_t)(0xDC00 + (ch & 0x3ff));
        return 2;
    }
}
/*
 * Encode one code point as UTF-8 into a buffer of known capacity.
 *
 * str: destination buffer.
 * len: capacity of `str` in bytes.
 * ch:  code point to encode.
 *
 * Returns:
 *   UNI_EBADCP  when is_valid() rejects `ch` (checked before `len`);
 *   0           when `len` is 0 (nothing written);
 *   UNI_ESTRLN  when the encoded sequence would not fit in `len` bytes
 *               (nothing written);
 *   1 to 4      the number of bytes written otherwise.
 *
 * Bit layout per sequence length (x = payload bits):
 *   1 byte : 0xxxxxxx
 *   2 bytes: 110xxxxx 10xxxxxx
 *   3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
 *   4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 */
int utf8_enc_s(char *str, size_t len, uchar_t ch) {
    if(!is_valid(ch)) {
        return UNI_EBADCP;
    }
    if(len == 0) return 0;
    if(ch < 0x80) {
        str[0] = (char)ch;
        return 1;
    }
    else if(ch < 0x800) {
        if(len < 2) return UNI_ESTRLN;
        str[0] = (char)(0xc0 | (ch >> 6));
        str[1] = (char)(0x80 | (ch & 0x3f));
        return 2;
    }
    else if(ch < 0x10000) {
        if(len < 3) return UNI_ESTRLN;
        // 16 payload bits: 4 in the lead byte, then 6 + 6.
        str[0] = (char)(0xe0 | (ch >> 12));
        str[1] = (char)(0x80 | ((ch >> 6) & 0x3f));
        str[2] = (char)(0x80 | (ch & 0x3f));
        return 3;
    }
    else {
        if(len < 4) return UNI_ESTRLN;
        // 21 payload bits: 3 in the lead byte, then 6 + 6 + 6.
        // NOTE(review): assumes is_valid() has already rejected values
        // above 0x10FFFF, so (ch >> 18) fits in 3 bits -- confirm.
        str[0] = (char)(0xf0 | (ch >> 18));
        str[1] = (char)(0x80 | ((ch >> 12) & 0x3f));
        str[2] = (char)(0x80 | ((ch >> 6) & 0x3f));
        str[3] = (char)(0x80 | (ch & 0x3f));
        return 4;
    }
}