mirror of https://github.com/flysand7/ciabatta.git
restartable c16 <-> mb conversions
This commit is contained in:
parent
7a1ffc534b
commit
bf428a9924
|
@ -34,6 +34,8 @@ typedef struct lldiv_t {
|
|||
#define EXIT_SUCCESS 0
|
||||
|
||||
#define RAND_MAX 65536
|
||||
|
||||
#define MB_LEN_MAX 4
|
||||
#define MB_CUR_MAX 4
|
||||
|
||||
// Microsoft extension, COUNTOF(x) counts array elements
|
||||
|
|
|
@ -9,7 +9,10 @@ typedef uint_least32_t char32_t;
|
|||
|
||||
typedef struct mbstate_t mbstate_t;
|
||||
struct mbstate_t {
|
||||
union {
|
||||
char16_t leftover;
|
||||
char16_t high_surrogate;
|
||||
}
|
||||
};
|
||||
|
||||
size_t mbrtoc16(
|
||||
|
|
119
src/code/uchar.c
119
src/code/uchar.c
|
@ -2,7 +2,8 @@
|
|||
#include <uchar.h>
|
||||
#include <errno.h>
|
||||
|
||||
#include <unicode.h>
|
||||
#include <unicope.h>
|
||||
|
||||
|
||||
size_t mbrtoc16(
|
||||
char16_t *restrict pc16,
|
||||
|
@ -10,93 +11,83 @@ size_t mbrtoc16(
|
|||
size_t n,
|
||||
mbstate_t *restrict ps
|
||||
) {
|
||||
// Figure out the conversion state
|
||||
static mbstate_t static_mbstate = {0};
|
||||
if(ps == NULL) ps = &static_mbstate;
|
||||
if(s == NULL) {
|
||||
*ps = (mbstate_t) {0};
|
||||
*ps = (mbstate_t) {0xd800};
|
||||
return 0;
|
||||
}
|
||||
// First check leftovers, using 0xd800 as marker because it doesn't
|
||||
// encode a valid character.
|
||||
if(ps->leftover != 0xd800) {
|
||||
// Check leftovers, using 0xd800 as "no leftover" marker because it
|
||||
// doesn't encode a valid character.
|
||||
if(ps->leftover == 0xd800) {
|
||||
// Decode the UTF-8 encoded codepoint
|
||||
char32_t code_point;
|
||||
int mblen = utf8_chdec((char8_t *)s, n, &code_point);
|
||||
if(mblen == UNI_ESTRLN) return (size_t)(-2);
|
||||
if(mblen <= 0) goto invalid_seq;
|
||||
// Encode the codepoint into UTF-16 string
|
||||
char16_t str[2];
|
||||
int c16len = utf16_chenc(str, 2, code_point);
|
||||
if(c16len <= 0) goto invalid_seq;
|
||||
// Assign the decoded UTF-16 character, decide leftover
|
||||
if(pc16 != NULL) *pc16 = str[0];
|
||||
ps->leftover = (c16len == 2? str[1] : 0xd800);
|
||||
return (size_t)mblen;
|
||||
}
|
||||
else {
|
||||
// Otherwise use and reset the leftover
|
||||
if(pc16 != NULL) *pc16 = ps->leftover;
|
||||
ps->leftover = 0xd800;
|
||||
return (size_t)(-3);
|
||||
}
|
||||
else {
|
||||
uchar_t ch;
|
||||
char16_t str[3];
|
||||
int chlen = utf8_dec(s, &ch);
|
||||
if(chlen <= 0) goto encoding_error;
|
||||
int wrlen = utf16_enc(str, ch);
|
||||
char16_t curc;
|
||||
char16_t next;
|
||||
if(wrlen <= 0) goto encoding_error;
|
||||
else if(wrlen == 2) {
|
||||
curc = str[0];
|
||||
next = 0xd800;
|
||||
}
|
||||
else {
|
||||
curc = str[0];
|
||||
next = str[1];
|
||||
}
|
||||
ps->leftover = next;
|
||||
if(pc16 != NULL) *pc16 = curc;
|
||||
return (size_t)-2;
|
||||
}
|
||||
encoding_error:
|
||||
invalid_seq:
|
||||
errno = EILSEQ;
|
||||
return (size_t)(-1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
size_t c16rtomb(
|
||||
char *restrict s,
|
||||
char16_t c16,
|
||||
mbstate_t *restrict ps
|
||||
) {
|
||||
// Figure out conversion state
|
||||
static mbstate_t static_mbstate = {0};
|
||||
if(ps == NULL) ps = &static_mbstate;
|
||||
if(s == NULL) {
|
||||
*ps = (mbstate_t) {0};
|
||||
*ps = (mbstate_t) {0xd800};
|
||||
return 0;
|
||||
}
|
||||
unsigned cp;
|
||||
// High surrogate (save)
|
||||
if(0xd800 <= c16 && c16 < 0xdc00) {
|
||||
ps->leftover = c16;
|
||||
char32_t codepoint_to_write;
|
||||
// Check whether a high surrogate was detected in a previous call to the
|
||||
// function. If not, the high_surrogate value is 0xd800
|
||||
if(ps->high_surrogate == 0xd800) {
|
||||
// If c16 is a surrogate record it, or throw an error
|
||||
if(uni_is_hsur(c16)) {
|
||||
ps->high_surrogate = c16;
|
||||
return 0;
|
||||
}
|
||||
// Low surrogate (parse)
|
||||
else if(0xdc00 <= c16 && c16 < 0xe000) {
|
||||
if(ps->leftover == 0) goto encoding_error;
|
||||
cp = ((ps->leftover & 0x3ff) << 10) | (c16 & 0x3ff);
|
||||
else if(uni_is_lsur(c16)) {
|
||||
goto invalid_char;
|
||||
}
|
||||
// Other char
|
||||
else {
|
||||
cp = c16;
|
||||
// We'll just write c16
|
||||
codepoint_to_write = c16;
|
||||
}
|
||||
size_t nbytes = 4;
|
||||
if(cp < 0x10000) nbytes = 3;
|
||||
if(cp < 0x800) nbytes = 2;
|
||||
if(cp < 0x80) nbytes = 1;
|
||||
switch(nbytes) {
|
||||
case 1: {
|
||||
s[0] = cp;
|
||||
} break;
|
||||
case 2: {
|
||||
s[0] = 0xc0 | (cp >> 6);
|
||||
s[1] = 0x80 | ((cp >> 0) & 0x3f);
|
||||
} break;
|
||||
case 3: {
|
||||
s[0] = 0xe0 | (cp >> 12);
|
||||
s[1] = 0x80 | ((cp >> 6) & 0x3f);
|
||||
s[2] = 0x80 | ((cp >> 0) & 0x3f);
|
||||
} break;
|
||||
case 4: {
|
||||
s[0] = 0xf0 | (cp >> 18);
|
||||
s[1] = 0x80 | ((cp >> 12) & 0x3f);
|
||||
s[2] = 0x80 | ((cp >> 6) & 0x3f);
|
||||
s[3] = 0x80 | ((cp >> 0) & 0x3f);
|
||||
} break;
|
||||
// If high surrogate exists, the next character must be a low surrogate
|
||||
// so we'll write a codepoint made out of high and low surrogates
|
||||
else if(uni_is_lsur(c16)) {
|
||||
codepoint_to_write = uni_surtoc(ps->high_surrogate, c16);
|
||||
}
|
||||
return nbytes;
|
||||
encoding_error:
|
||||
else goto invalid_char;
|
||||
// Write the codepoint that we decided to write to multibyte string
|
||||
int written_len = utf8_chenc(s, 4, codepoint_to_write);
|
||||
if(written_len < 0) {
|
||||
goto invalid_char;
|
||||
}
|
||||
return (size_t)written_len;
|
||||
invalid_char:
|
||||
errno = EILSEQ;
|
||||
return (size_t)(-1);
|
||||
}
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
#include <wctype.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <unicode.h>
|
||||
#include <unicope.h>
|
||||
|
||||
int iswctype(wint_t wc, wctype_t desc) {
|
||||
return desc(wc);
|
||||
|
@ -113,9 +113,9 @@ int iswxdigit(wint_t wc) {
|
|||
}
|
||||
|
||||
wint_t towlower(wint_t wc) {
|
||||
return uni_to_lower(wc);
|
||||
return uni_tolower(wc);
|
||||
}
|
||||
|
||||
wint_t towupper(wint_t wc) {
|
||||
return uni_to_upper(wc);
|
||||
return uni_toupper(wc);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue