From 3f7c3de28871a1eb133e5025448fedd8745be333 Mon Sep 17 00:00:00 2001 From: bumbread Date: Mon, 18 Jul 2022 02:09:26 +1100 Subject: [PATCH] mb <-> c32 functions --- src/code/uchar.c | 80 +++++++++--------------------------------------- 1 file changed, 14 insertions(+), 66 deletions(-) diff --git a/src/code/uchar.c b/src/code/uchar.c index 3a01b97..30aa02b 100644 --- a/src/code/uchar.c +++ b/src/code/uchar.c @@ -100,46 +100,18 @@ size_t mbrtoc32( mbstate_t *restrict ps ) { if(s == NULL) { - *ps = (mbstate_t) {0}; return 0; } - size_t nbytes; - - // Decode the first byte of UTF-8 sequence - unsigned byte0 = *s; - if (0x00 <= byte0 && byte0 < 0x80) nbytes = 1; - else if(0xc0 <= byte0 && byte0 < 0xe0) nbytes = 2; - else if(0xe0 <= byte0 && byte0 < 0xf0) nbytes = 3; - else if(0xf0 <= byte0 && byte0 < 0xf8) nbytes = 4; - else goto encoding_error; - unsigned nbytesreq = nbytes; - if(n < nbytesreq) { - return (size_t)(-2); + char32_t code_point; + int mblen = utf8_chdec((char8_t *)s, n, &code_point); + if(mblen == UNI_ESTRLN) return (size_t)(-2); + if(mblen <= 0) { + errno = EILSEQ; + return (size_t)(-1); } - char32_t cp = byte0; - switch(nbytesreq) { - case 2: cp &= 0x1f; break; - case 3: cp &= 0x0f; break; - case 4: cp &= 0x07; break; - } - while(--nbytesreq) - cp |= (cp << 6) | ((*++s) & 0x3f); - if(0xdc00 <= cp && cp <= 0xe000) - goto encoding_error; - // Overloing seqs - if(cp < 0x80 && nbytes > 1) goto encoding_error; - if(cp < 0x800 && nbytes > 2) goto encoding_error; - if(cp < 0x10000 && nbytes > 3) goto encoding_error; - if(cp > 0x10ffff) goto encoding_error; - - if(pc32 != NULL) *pc32 = cp; - if(cp == 0) - return 0; - else - return nbytes; -encoding_error: - errno = EILSEQ; - return (size_t)(-1); + *pc32 = code_point; + if(code_point == 0) return 0; + return (size_t)mblen; } size_t c32rtomb( @@ -151,34 +123,10 @@ size_t c32rtomb( *ps = (mbstate_t) {0}; return 0; } - unsigned cp = c32; - if(cp >= 0x10ffff) goto encoding_error; - size_t nbytes = 4; - if(cp < 0x10000) nbytes = 3; - if(cp < 0x800) nbytes = 2; - if(cp < 0x80) nbytes = 1; - switch(nbytes) { - case 1: { - s[0] = cp; - } break; - case 2: { - s[0] = 0xc0 | (cp >> 6); - s[1] = 0x80 | ((cp >> 0) & 0x3f); - } break; - case 3: { - s[0] = 0xe0 | (cp >> 12); - s[1] = 0x80 | ((cp >> 6) & 0x3f); - s[2] = 0x80 | ((cp >> 0) & 0x3f); - } break; - case 4: { - s[0] = 0xf0 | (cp >> 18); - s[1] = 0x80 | ((cp >> 12) & 0x3f); - s[2] = 0x80 | ((cp >> 6) & 0x3f); - s[3] = 0x80 | ((cp >> 0) & 0x3f); - } break; + int mblen = utf8_enc(s, 4, c32); + if(mblen <= 0) { + errno = EILSEQ; + return (size_t)(-1); } - return nbytes; -encoding_error: - errno = EILSEQ; - return (size_t)(-1); + return (size_t)mblen; }