mirror of https://github.com/flysand7/ciabatta.git
unicode error codes
This commit is contained in:
parent
f52bd57a8f
commit
9b13717b0d
|
@ -9,6 +9,13 @@ typedef uint_least32_t char32_t;
|
|||
|
||||
typedef int32_t uchar_t;
|
||||
|
||||
#define UNI_EBADCP (-1)
|
||||
#define UNI_EULSUR (-2)
|
||||
#define UNI_EIBYTE (-3)
|
||||
#define UNI_ETBYTE (-4)
|
||||
#define UNI_ESTRLN (-5)
|
||||
|
||||
|
||||
enum {
|
||||
UCHAR_BAD,
|
||||
UCHAR_Cc,
|
||||
|
@ -112,23 +119,23 @@ struct uchar_props {
|
|||
uchar_props *uni_props (uchar_t cp);
|
||||
int uni_valid (uchar_t cp);
|
||||
int uni_classify(uchar_t cp);
|
||||
uchar_t uni_tolower(uchar_t cp);
|
||||
uchar_t uni_toupper(uchar_t cp);
|
||||
uchar_t uni_totitle(uchar_t cp);
|
||||
uchar_t uni_tolower (uchar_t cp);
|
||||
uchar_t uni_toupper (uchar_t cp);
|
||||
uchar_t uni_totitle (uchar_t cp);
|
||||
|
||||
int uni_is_hsur(char16_t cp);
|
||||
int uni_is_lsur(char16_t cp);
|
||||
uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
|
||||
|
||||
int utf16_chlen(char16_t const *str);
|
||||
int utf8_chlen (char const *str);
|
||||
size_t utf16_chlen(char16_t const *str);
|
||||
size_t utf8_chlen (char const *str);
|
||||
|
||||
int utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
|
||||
int utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
|
||||
int utf16_dec (char16_t const *restrict str, uchar_t *restrict ch);
|
||||
int utf8_dec (char const *restrict str, uchar_t *restrict ch);
|
||||
size_t utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
|
||||
size_t utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
|
||||
size_t utf16_dec (char16_t const *restrict str, uchar_t *restrict ch);
|
||||
size_t utf8_dec (char const *restrict str, uchar_t *restrict ch);
|
||||
|
||||
int utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
|
||||
int utf8_enc_s (char *str, size_t len, uchar_t ch);
|
||||
int utf16_enc (char16_t *str, uchar_t ch);
|
||||
int utf8_enc (char *str, uchar_t ch);
|
||||
size_t utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
|
||||
size_t utf8_enc_s (char *str, size_t len, uchar_t ch);
|
||||
size_t utf16_enc (char16_t *str, uchar_t ch);
|
||||
size_t utf8_enc (char *str, uchar_t ch);
|
||||
|
|
|
@ -49,48 +49,51 @@ uchar_t uni_surtoc(char16_t hsur, char16_t lsur) {
|
|||
int utf16_chlen(char16_t const *str) {
|
||||
char16_t cp = *str;
|
||||
if(uni_is_hsur(cp)) return 2;
|
||||
else if(uni_is_lsur(cp)) return 0;
|
||||
else if(uni_valid(cp)) return 1;
|
||||
return 0;
|
||||
else if(uni_is_lsur(cp)) return UNI_EULSUR;
|
||||
else return 1;
|
||||
}
|
||||
|
||||
int utf8_chlen(char const *str) {
|
||||
size_t utf8_chlen(char const *str) {
|
||||
uint8_t byte0 = (uint8_t)*str;
|
||||
if(byte0 < 0x80) return 1;
|
||||
else if(byte0 < 0xc0) return 0; // error
|
||||
else if(byte0 < 0xc0) return UNI_EIBYTE;
|
||||
else if(byte0 < 0xe0) return 2;
|
||||
else if(byte0 < 0xf0) return 3;
|
||||
else if(byte0 < 0xf8) return 4;
|
||||
return 0;
|
||||
return UNI_EIBYTE;
|
||||
}
|
||||
|
||||
int utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
|
||||
int chlen = 0;
|
||||
size_t utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
|
||||
size_t chlen = 0;
|
||||
uchar_t ch;
|
||||
if(uni_is_hsur(str[0])) {
|
||||
char16_t hsur = str[0];
|
||||
char16_t lsur = str[1];
|
||||
ch = uni_surtoc(hsur, lsur);
|
||||
chlen = 2;
|
||||
if(ch > 0x10ffff) {
|
||||
chlen = UNI_EBADCP;
|
||||
ch = 0xfffd;
|
||||
}
|
||||
}
|
||||
else {
|
||||
else if(!uni_is_lsur(str[0])) {
|
||||
ch = str[0];
|
||||
}
|
||||
if(!uni_valid(ch)) {
|
||||
chlen = 0;
|
||||
else {
|
||||
chlen = UNI_EULSUR;
|
||||
ch = 0xfffd;
|
||||
}
|
||||
if(chp != NULL) *chp = ch;
|
||||
return chlen;
|
||||
}
|
||||
|
||||
int utf16_dec_s(
|
||||
size_t utf16_dec_s(
|
||||
char16_t const *restrict str,
|
||||
size_t len,
|
||||
uchar_t *restrict chp
|
||||
) {
|
||||
if(len == 0) return 0;
|
||||
int chlen;
|
||||
size_t chlen;
|
||||
uchar_t ch;
|
||||
if(uni_is_hsur(str[0])) {
|
||||
if(len < 2) return 0;
|
||||
|
@ -98,33 +101,39 @@ int utf16_dec_s(
|
|||
char16_t lsur = str[1];
|
||||
ch = uni_surtoc(hsur, lsur);
|
||||
chlen = 2;
|
||||
if(ch > 0x10ffff) {
|
||||
chlen = UNI_EBADCP;
|
||||
ch = 0xfffd;
|
||||
}
|
||||
}
|
||||
else {
|
||||
else if(!uni_is_lsur(str[0])) {
|
||||
ch = str[0];
|
||||
chlen = 1;
|
||||
}
|
||||
if(!uni_valid(ch)) {
|
||||
else {
|
||||
chlen = UNI_EULSUR;
|
||||
ch = 0xfffd;
|
||||
chlen = 0;
|
||||
}
|
||||
if(chp != NULL) *chp = ch;
|
||||
return chlen;
|
||||
}
|
||||
|
||||
int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
|
||||
size_t utf8_dec(char const *restrict str, uchar_t *restrict chp) {
|
||||
uint8_t const *ustr = (uint8_t const *)str;
|
||||
int chlen;
|
||||
|
||||
size_t chlen;
|
||||
uchar_t ch;
|
||||
if(ustr[0] < 0x80) chlen = 1, ch = ustr[0];
|
||||
else if(ustr[0] < 0xc0) chlen = 0, ch = 0xfffd;
|
||||
else if(ustr[0] < 0xe0) chlen = 2, ch = ustr[0] & 0x1f;
|
||||
else if(ustr[0] < 0xf0) chlen = 3, ch = ustr[0] & 0x0f;
|
||||
else if(ustr[0] < 0xf8) chlen = 4, ch = ustr[0] & 0x07;
|
||||
else chlen = 0;
|
||||
for(int i = 1; i < chlen; ++i) {
|
||||
if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
|
||||
else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
|
||||
else if(ustr[0] < 0xe0) ch = ustr[0] & 0x1f, chlen = 2;
|
||||
else if(ustr[0] < 0xf0) ch = ustr[0] & 0x0f, chlen = 3;
|
||||
else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4;
|
||||
else ch = 0xfffd, chlen = UNI_EIBYTE;
|
||||
|
||||
if(chlen > 0) for(size_t i = 1; i < chlen; ++i) {
|
||||
uint8_t trail = ustr[i];
|
||||
if((trail & 0xc0) != 0x80) {
|
||||
chlen = 0;
|
||||
chlen = UNI_ETBYTE;
|
||||
ch = 0xfffd;
|
||||
break;
|
||||
}
|
||||
|
@ -132,36 +141,35 @@ int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
|
|||
ch |= (trail & 0x3f);
|
||||
}
|
||||
if(!uni_valid(ch)) {
|
||||
chlen = 0;
|
||||
chlen = UNI_EBADCP;
|
||||
ch = 0xfffd;
|
||||
}
|
||||
if(chp != NULL) *chp = ch;
|
||||
return chlen;
|
||||
}
|
||||
|
||||
int utf8_dec_s(
|
||||
size_t utf8_dec_s(
|
||||
char const *restrict str,
|
||||
size_t len,
|
||||
uchar_t *restrict chp
|
||||
) {
|
||||
if(len == 0) return 0;
|
||||
uint8_t const *restrict ustr = (uint8_t const *restrict)str;
|
||||
int chlen;
|
||||
size_t chlen;
|
||||
uchar_t ch;
|
||||
if(ustr[0] < 0x80) chlen = 1, ch = ustr[0];
|
||||
else if(ustr[0] < 0xc0) chlen = 0, ch = 0xfffd;
|
||||
else if(ustr[0] < 0xe0) chlen = 2, ch = ustr[0] & 0x1f;
|
||||
else if(ustr[0] < 0xf0) chlen = 3, ch = ustr[0] & 0x0f;
|
||||
else if(ustr[0] < 0xf8) chlen = 4, ch = ustr[0] & 0x07;
|
||||
else chlen = 0;
|
||||
if(len < chlen) {
|
||||
return 0;
|
||||
if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
|
||||
else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
|
||||
else if(ustr[0] < 0xe0) ch = ustr[0] & 0x1f, chlen = 2;
|
||||
else if(ustr[0] < 0xf0) ch = ustr[0] & 0x0f, chlen = 3;
|
||||
else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4;
|
||||
else ch = 0xfffd, chlen = UNI_EIBYTE;
|
||||
if(chlen > len) {
|
||||
return UNI_ESTRLN;
|
||||
}
|
||||
else chlen = 0;
|
||||
for(int i = 1; i < chlen; ++i) {
|
||||
if(chlen > 0) for(size_t i = 1; i < chlen; ++i) {
|
||||
uint8_t trail = ustr[i];
|
||||
if((trail & 0xc0) != 0x80) {
|
||||
chlen = 0;
|
||||
chlen = UNI_ETBYTE;
|
||||
ch = 0xfffd;
|
||||
break;
|
||||
}
|
||||
|
@ -169,7 +177,7 @@ int utf8_dec_s(
|
|||
ch |= (trail & 0x3f);
|
||||
}
|
||||
if(!uni_valid(ch)) {
|
||||
chlen = 0;
|
||||
chlen = UNI_EBADCP;
|
||||
ch = 0xfffd;
|
||||
}
|
||||
if(chp != NULL) *chp = ch;
|
||||
|
|
Loading…
Reference in New Issue