From 337e6734a7542689c842b14d24d8ec7cc234b800 Mon Sep 17 00:00:00 2001
From: bumbread
Date: Wed, 22 Jun 2022 23:36:26 +1100
Subject: [PATCH] Stack checking et uchar utf8->utf16

---
 bake.cmd           |  10 +++--
 inc/stdlib.h       |   2 +-
 inc/uchar.h        |  38 +++++++++++++----
 inc/wchar.h        |   5 ++-
 src/code/uchar.c   | 104 ++++++++++++++++++++++++++++++++++++++++++++++
 src/win/chkstk.asm |  28 +++++++++++++
 test/test_uchar.c  |  28 +++++++++++++
 todo               |   1 +
 8 files changed, 204 insertions(+), 12 deletions(-)
 create mode 100644 src/code/uchar.c
 create mode 100644 src/win/chkstk.asm
 create mode 100644 test/test_uchar.c

diff --git a/bake.cmd b/bake.cmd
index 4c93e61..47dbac4 100644
--- a/bake.cmd
+++ b/bake.cmd
@@ -20,8 +20,8 @@ if "%1"=="test" (
     goto :skip_crt_compilation
 )
 
-if not exist src\code\unicode\unicode_data.h (
-    py src\code\unicode\unicode_compile.py
+if not exist src\code\unicode\data.h (
+    py src\code\unicode\compile.py
 )
 
 if exist bin rd/s/q bin
@@ -33,6 +33,10 @@ for /R src\%PLATFORM% %%F in (*.c) do (
     echo %%F
     clang -Isrc/win -c -o bin\%PLATFORM%\%%~nF.obj %%F %CIABATTA_OPTIONS%
 )
+for /R src\%PLATFORM% %%F in (*.asm) do (
+    echo %%F
+    nasm %%F -f win64 -o bin\%PLATFORM%\%%~nF.obj
+)
 for /R src\code %%F in (*.c) do (
     echo %%F
     clang -c -o bin\%%~nF.obj %%F %CIABATTA_OPTIONS%
@@ -44,5 +48,5 @@ llvm-ar rc ciabatta.lib bin\*.obj bin\%PLATFORM%\*.obj
 
 if "%TEST%"=="" set TEST=assert
 echo Compiling test_%TEST%.c
-clang -fno-builtin test\test_%TEST%.c ciabatta.lib -std=c11 -lDbghelp -lkernel32 -luser32 -lshell32 -nostdlib %CIABATTA_OPTIONS%
+clang test\test_%TEST%.c ciabatta.lib -std=c11 -lDbghelp -lkernel32 -luser32 -lshell32 -nostdlib %CIABATTA_OPTIONS%
 ::cl test\test_math.c /Iinc -D_CRT_SECURE_NO_WARNINGS /Z7 /link ciabatta.lib kernel32.lib user32.lib shell32.lib -nostdlib -nodefaultlibs
diff --git a/inc/stdlib.h b/inc/stdlib.h
index 0e587a9..d5b1fe1 100644
--- a/inc/stdlib.h
+++ b/inc/stdlib.h
@@ -34,7 +34,7 @@ typedef struct lldiv_t {
 // #define EXIT_SUCCESS 0
 
 #define RAND_MAX 65536
-// #define MB_CUR_MAX 5
+#define MB_CUR_MAX 4
 
 // Microsoft extension, COUNTOF(x) counts array elements
 #ifndef COUNTOF
diff --git a/inc/uchar.h b/inc/uchar.h
index 7e232b6..2813313 100644
--- a/inc/uchar.h
+++ b/inc/uchar.h
@@ -1,15 +1,39 @@
 
 #pragma once
 
+#include <stddef.h> // NOTE(review): header name inferred from usage, confirm
+#include <stdint.h> // NOTE(review): header name inferred from usage, confirm
+
 typedef struct mbstate_t mbstate_t;
-typedef uint16_t char16_t;
-typedef uint32_t char32_t;
+typedef uint_least16_t char16_t;
+typedef uint_least32_t char32_t;
 
 struct mbstate_t {
-    char filler[4];
+    char16_t leftover;
 };
 
-size_t mbrtoc16(char16_t * restrict pc16, const char * restrict s, size_t n, mbstate_t * restrict ps);
-size_t c16rtomb(char * restrict s, char16_t c16, mbstate_t * restrict ps);
-size_t mbrtoc32(char32_t * restrict pc32, const char * restrict s, size_t n, mbstate_t * restrict ps);
-size_t c32rtomb(char * restrict s, char32_t c32, mbstate_t * restrict ps);
+size_t mbrtoc16(
+    char16_t *restrict pc16,
+    char const *restrict s,
+    size_t n,
+    mbstate_t *restrict ps
+);
+
+size_t c16rtomb(
+    char *restrict s,
+    char16_t c16,
+    mbstate_t *restrict ps
+);
+
+size_t mbrtoc32(
+    char32_t *restrict pc32,
+    char const *restrict s,
+    size_t n,
+    mbstate_t *restrict ps
+);
+
+size_t c32rtomb(
+    char *restrict s,
+    char32_t c32,
+    mbstate_t *restrict ps
+);
diff --git a/inc/wchar.h b/inc/wchar.h
index 40f91e0..8b7c58d 100644
--- a/inc/wchar.h
+++ b/inc/wchar.h
@@ -1,11 +1,14 @@
 
 #pragma once
 
+#include <uchar.h> // NOTE(review): header name inferred from the use of char16_t, confirm
+
 typedef struct mbstate_t mbstate_t;
 typedef wchar_t wint_t;
 
 struct mbstate_t {
-    char filler[4];
+    char16_t next;
+    char bytes[4];
 };
 
 #define WCHAR_MIN 0x0000
diff --git a/src/code/uchar.c b/src/code/uchar.c
new file mode 100644
index 0000000..7bfc528
--- /dev/null
+++ b/src/code/uchar.c
@@ -0,0 +1,104 @@
+
+#include <errno.h> // NOTE(review): header name inferred from usage, confirm
+#include <uchar.h> // NOTE(review): header name inferred from usage, confirm
+
+// Conversion state used when the caller passes ps == NULL
+static mbstate_t mbrtoc16_internal_state;
+
+// Converts one UTF-8 encoded character from s into UTF-16.
+//
+//   pc16  receives the UTF-16 code unit produced by this call (may be NULL)
+//   s     UTF-8 input; NULL behaves like mbrtoc16(NULL, "", 1, ps)
+//   n     number of bytes available at s
+//   ps    conversion state; NULL selects a state private to this function
+//
+// Returns
+//   0           the decoded character is the null character
+//   1..4        bytes consumed from s; *pc16 is a BMP code unit or the
+//               high surrogate of a pair
+//   (size_t)-3  *pc16 is the low surrogate kept from the previous call,
+//               no bytes were consumed
+//   (size_t)-2  the n bytes are a truncated sequence; nothing is buffered,
+//               the caller has to pass the whole sequence again
+//   (size_t)-1  invalid sequence, errno is set to EILSEQ
+size_t mbrtoc16(
+    char16_t *restrict pc16,
+    char const *restrict s,
+    size_t n,
+    mbstate_t *restrict ps
+) {
+    if(ps == NULL) {
+        ps = &mbrtoc16_internal_state;
+    }
+    if(s == NULL) {
+        pc16 = NULL;
+        s = "";
+        n = 1;
+    }
+    // A low surrogate is still pending from the previous call
+    if(ps->leftover != 0) {
+        if(pc16 != NULL) *pc16 = ps->leftover;
+        ps->leftover = 0;
+        return (size_t)(-3);
+    }
+    if(n == 0) {
+        return (size_t)(-2);
+    }
+    // char may be signed, so look at the input as unsigned bytes
+    unsigned char const *bytes = (unsigned char const *)s;
+    unsigned byte0 = bytes[0];
+    size_t nbytes;
+    char32_t cp;
+    // The first byte gives the sequence length and the top payload bits
+    if     (byte0 < 0x80)                  { nbytes = 1; cp = byte0;        }
+    else if(0xc0 <= byte0 && byte0 < 0xe0) { nbytes = 2; cp = byte0 & 0x1f; }
+    else if(0xe0 <= byte0 && byte0 < 0xf0) { nbytes = 3; cp = byte0 & 0x0f; }
+    else if(0xf0 <= byte0 && byte0 < 0xf8) { nbytes = 4; cp = byte0 & 0x07; }
+    else goto encoding_error;
+    if(n < nbytes) {
+        return (size_t)(-2);
+    }
+    for(size_t i = 1; i != nbytes; ++i) {
+        // Continuation bytes have the form 10xxxxxx
+        if((bytes[i] & 0xc0) != 0x80) goto encoding_error;
+        cp = (cp << 6) | (bytes[i] & 0x3f);
+    }
+    // Overlong sequences
+    if(cp < 0x80    && nbytes > 1) goto encoding_error;
+    if(cp < 0x800   && nbytes > 2) goto encoding_error;
+    if(cp < 0x10000 && nbytes > 3) goto encoding_error;
+    // Surrogates and values above U+10FFFF are not valid in UTF-8
+    if(0xd800 <= cp && cp <= 0xdfff) goto encoding_error;
+    if(cp > 0x10ffff) goto encoding_error;
+    if(cp < 0x10000) {
+        if(pc16 != NULL) *pc16 = (char16_t)cp;
+        return cp == 0 ? 0 : nbytes;
+    }
+    // Outside the BMP: high surrogate now, low surrogate on the next call
+    cp -= 0x10000;
+    if(pc16 != NULL) *pc16 = (char16_t)(0xd800 | (cp >> 10));
+    ps->leftover = (char16_t)(0xdc00 | (cp & 0x3ff));
+    return nbytes;
+encoding_error:
+    errno = EILSEQ;
+    return (size_t)(-1);
+}
+
+size_t c16rtomb(
+    char *restrict s,
+    char16_t c16,
+    mbstate_t *restrict ps
+);
+
+size_t mbrtoc32(
+    char32_t *restrict pc32,
+    char const *restrict s,
+    size_t n,
+    mbstate_t *restrict ps
+);
+
+size_t c32rtomb(
+    char *restrict s,
+    char32_t c32,
+    mbstate_t *restrict ps
+);
diff --git a/src/win/chkstk.asm b/src/win/chkstk.asm
new file mode 100644
index 0000000..5ab5b5b
--- /dev/null
+++ b/src/win/chkstk.asm
@@ -0,0 +1,28 @@
+
+bits 64
+
+segment .text
+
+global __chkstk
+__chkstk:
+    sub rsp, 0x10
+    mov [rsp], r10
+    mov [rsp+0x8], r11
+    xor r11, r11
+    lea r10, [rsp+0x18]
+    sub r10, rax
+    cmovb r10, r11
+    mov r11, gs:[0x10]
+    cmp r10, r11
+    jnb .end
+    and r10w, 0xf000
+.loop:
+    lea r11, [r11-0x1000]
+    mov byte [r11], 0x0
+    cmp r10, r11
+    jnz .loop
+.end:
+    mov r10, [rsp]
+    mov r11, [rsp+0x8]
+    add rsp, 0x10
+    ret
diff --git a/test/test_uchar.c b/test/test_uchar.c
new file mode 100644
index 0000000..9490d3d
--- /dev/null
+++ b/test/test_uchar.c
@@ -0,0 +1,28 @@
+
+#include <uchar.h> // NOTE(review): header name inferred from usage, confirm
+
+mbstate_t state;
+int main() {
+    char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C"
+    size_t in_sz = sizeof in / sizeof *in;
+
+    char16_t out[in_sz];
+    char *p_in = in, *end = in + in_sz;
+    char16_t *p_out = out;
+    size_t rc;
+    while((rc = mbrtoc16(p_out, p_in, end - p_in, &state)))
+    {
+        if(rc == (size_t)-1) // invalid input
+            break;
+        else if(rc == (size_t)-2) // truncated input
+            break;
+        else if(rc == (size_t)-3) // UTF-16 low surrogate from the previous call
+            p_out += 1;
+        else {
+            p_in += rc;
+            p_out += 1;
+        };
+    }
+
+    size_t out_sz = p_out - out + 1;
+}
diff --git a/todo b/todo
index bb2e79a..1cc0a56 100644
--- a/todo
+++ b/todo
@@ -36,6 +36,7 @@ stdio.h:
 stdlib.h:
     qsort
     Better PRNG
+    MB_CUR_MAX should be locale-dependent
     Probably other stuff
 
 threads.h: