Stack checking and uchar UTF-8 -> UTF-16 conversion

This commit is contained in:
bumbread 2022-06-22 23:36:26 +11:00
parent 7165ac1c41
commit 337e6734a7
8 changed files with 189 additions and 12 deletions

View File

@ -20,8 +20,8 @@ if "%1"=="test" (
goto :skip_crt_compilation
)
if not exist src\code\unicode\unicode_data.h (
py src\code\unicode\unicode_compile.py
if not exist src\code\unicode\data.h (
py src\code\unicode\compile.py
)
if exist bin rd/s/q bin
@ -33,6 +33,10 @@ for /R src\%PLATFORM% %%F in (*.c) do (
echo %%F
clang -Isrc/win -c -o bin\%PLATFORM%\%%~nF.obj %%F %CIABATTA_OPTIONS%
)
:: Assemble every .asm file found recursively under src\%PLATFORM% with NASM,
:: emitting a win64 object named after the source file into bin\%PLATFORM%.
for /R src\%PLATFORM% %%F in (*.asm) do (
echo %%F
nasm %%F -f win64 -o bin\%PLATFORM%\%%~nF.obj
)
for /R src\code %%F in (*.c) do (
echo %%F
clang -c -o bin\%%~nF.obj %%F %CIABATTA_OPTIONS%
@ -44,5 +48,5 @@ llvm-ar rc ciabatta.lib bin\*.obj bin\%PLATFORM%\*.obj
if "%TEST%"=="" set TEST=assert
echo Compiling test_%TEST%.c
clang -fno-builtin test\test_%TEST%.c ciabatta.lib -std=c11 -lDbghelp -lkernel32 -luser32 -lshell32 -nostdlib %CIABATTA_OPTIONS%
clang test\test_%TEST%.c ciabatta.lib -std=c11 -lDbghelp -lkernel32 -luser32 -lshell32 -nostdlib %CIABATTA_OPTIONS%
::cl test\test_math.c /Iinc -D_CRT_SECURE_NO_WARNINGS /Z7 /link ciabatta.lib kernel32.lib user32.lib shell32.lib -nostdlib -nodefaultlibs

View File

@ -34,7 +34,7 @@ typedef struct lldiv_t {
// #define EXIT_SUCCESS 0
#define RAND_MAX 65536
// #define MB_CUR_MAX 5
#define MB_CUR_MAX 4
// Microsoft extension, COUNTOF(x) counts array elements
#ifndef COUNTOF

View File

@ -1,15 +1,39 @@
#pragma once
#include <stddef.h>
#include <stdint.h>
typedef struct mbstate_t mbstate_t;
typedef uint16_t char16_t;
typedef uint32_t char32_t;
typedef uint_least16_t char16_t;
typedef uint_least32_t char32_t;
struct mbstate_t {
char filler[4];
char16_t leftover;
};
size_t mbrtoc16(char16_t * restrict pc16, const char * restrict s, size_t n, mbstate_t * restrict ps);
size_t c16rtomb(char * restrict s, char16_t c16, mbstate_t * restrict ps);
size_t mbrtoc32(char32_t * restrict pc32, const char * restrict s, size_t n, mbstate_t * restrict ps);
size_t c32rtomb(char * restrict s, char32_t c32, mbstate_t * restrict ps);
// Converts the next UTF-8 character in s (looking at no more than n bytes) to
// UTF-16 and stores one code unit through pc16 when pc16 is non-null.
// Returns 0 for the null character, otherwise the number of bytes consumed;
// (size_t)-3 when it hands out the code unit kept in ps by the previous call;
// (size_t)-2 when n is too small for the sequence; (size_t)-1 with errno set
// to EILSEQ on malformed input.
size_t mbrtoc16(
char16_t *restrict pc16,
char const *restrict s,
size_t n,
mbstate_t *restrict ps
);
// char16_t -> multibyte counterpart of mbrtoc16.
// NOTE(review): only the declaration is visible in this change; the contract
// is presumably the C11 <uchar.h> one -- confirm once it is implemented.
size_t c16rtomb(
char *restrict s,
char16_t c16,
mbstate_t *restrict ps
);
// Multibyte -> char32_t conversion.
// NOTE(review): only the declaration is visible in this change; the contract
// is presumably the C11 <uchar.h> one -- confirm once it is implemented.
size_t mbrtoc32(
char32_t *restrict pc32,
char const *restrict s,
size_t n,
mbstate_t *restrict ps
);
// char32_t -> multibyte conversion.
// NOTE(review): only the declaration is visible in this change; the contract
// is presumably the C11 <uchar.h> one -- confirm once it is implemented.
size_t c32rtomb(
char *restrict s,
char32_t c32,
mbstate_t *restrict ps
);

View File

@ -1,11 +1,14 @@
#pragma once
#include <stdint.h>
typedef struct mbstate_t mbstate_t;
typedef wchar_t wint_t;
struct mbstate_t {
char filler[4];
char16_t next;
char bytes[4];
};
#define WCHAR_MIN 0x0000

89
src/code/uchar.c Normal file
View File

@ -0,0 +1,89 @@
#include <uchar.h>
#include <errno.h>
// Converts the next UTF-8 encoded character of s into UTF-16.
//
// pc16  where to store the produced UTF-16 code unit; may be NULL to discard.
// s     UTF-8 input; NULL behaves like the call mbrtoc16(NULL, "", 1, ps).
// n     maximum number of bytes of s that may be examined.
// ps    conversion state; NULL selects a private state owned by this function.
//
// Returns:
//   0              the null character was converted (and stored);
//   1..4           number of bytes consumed for the stored code unit; for a
//                  character outside the BMP this is the high surrogate and
//                  the low surrogate is kept in ps for the next call;
//   (size_t)(-3)   the low surrogate kept in ps was stored, no input consumed;
//   (size_t)(-2)   the first n bytes are a valid but incomplete sequence;
//   (size_t)(-1)   the input is not valid UTF-8, errno is set to EILSEQ.
//
// Limitation: mbstate_t only remembers a pending surrogate, not partially read
// input, so after (size_t)(-2) the caller has to present the sequence again
// from its first byte.
size_t mbrtoc16(
    char16_t *restrict pc16,
    char const *restrict s,
    size_t n,
    mbstate_t *restrict ps
) {
    // Smallest code point that legitimately needs a sequence of the given
    // length; anything below it is an overlong encoding.
    static char32_t const min_cp[5] = {0, 0, 0x80, 0x800, 0x10000};
    // State used when the caller passes ps == NULL.
    static mbstate_t internal_state;

    if(ps == NULL) {
        ps = &internal_state;
    }
    if(s == NULL) {
        pc16 = NULL;
        s = "";
        n = 1;
    }
    // A pending low surrogate is always delivered before reading more input.
    if(ps->leftover != 0) {
        if(pc16 != NULL) *pc16 = ps->leftover;
        ps->leftover = 0;
        return (size_t)(-3);
    }
    // Nothing to look at: the (empty) input is incomplete, not invalid.
    if(n == 0) {
        return (size_t)(-2);
    }

    // Read bytes as unsigned: plain char may be signed, which would turn
    // every byte >= 0x80 into a huge value.
    unsigned char const *bytes = (unsigned char const *)s;
    unsigned lead = bytes[0];
    size_t nbytes;
    char32_t cp;
    if(lead < 0x80) {
        nbytes = 1;
        cp = lead;
    }
    else if(lead < 0xc0) {
        // 10xxxxxx can only continue a sequence, never start one.
        goto encoding_error;
    }
    else if(lead < 0xe0) {
        nbytes = 2;
        cp = lead & 0x1f;
    }
    else if(lead < 0xf0) {
        nbytes = 3;
        cp = lead & 0x0f;
    }
    else if(lead < 0xf8) {
        nbytes = 4;
        cp = lead & 0x07;
    }
    else {
        goto encoding_error;
    }

    // Validate and accumulate the continuation bytes that are available.
    size_t avail = n < nbytes ? n : nbytes;
    for(size_t i = 1; i < avail; ++i) {
        unsigned cont = bytes[i];
        if((cont & 0xc0) != 0x80) goto encoding_error;
        cp = (cp << 6) | (cont & 0x3f);
    }
    if(n < nbytes) {
        return (size_t)(-2);
    }

    // Overlong sequences, UTF-16 surrogates and values past U+10FFFF are not
    // valid UTF-8.
    if(cp < min_cp[nbytes]) goto encoding_error;
    if(0xd800 <= cp && cp <= 0xdfff) goto encoding_error;
    if(cp > 0x10ffff) goto encoding_error;

    // Encode as UTF-16: one unit for the BMP, otherwise a surrogate pair with
    // the top 10 bits in the high surrogate and the low 10 bits in the low one.
    char16_t first;
    char16_t second = 0;
    if(cp < 0x10000) {
        first = (char16_t)cp;
    }
    else {
        cp -= 0x10000;
        first = (char16_t)(0xd800 | (cp >> 10));
        second = (char16_t)(0xdc00 | (cp & 0x3ff));
    }
    if(pc16 != NULL) *pc16 = first;
    ps->leftover = second;
    return first == 0 ? 0 : nbytes;

encoding_error:
    errno = EILSEQ;
    return (size_t)(-1);
}
// The three declarations below repeat the prototypes for the remaining
// <uchar.h> conversions; no definitions for them appear in this file yet.

// char16_t -> multibyte conversion (declaration only).
size_t c16rtomb(
char *restrict s,
char16_t c16,
mbstate_t *restrict ps
);
// multibyte -> char32_t conversion (declaration only).
size_t mbrtoc32(
char32_t *restrict pc32,
char const *restrict s,
size_t n,
mbstate_t *restrict ps
);
// char32_t -> multibyte conversion (declaration only).
size_t c32rtomb(
char *restrict s,
char32_t c32,
mbstate_t *restrict ps
);

28
src/win/chkstk.asm Normal file
View File

@ -0,0 +1,28 @@
bits 64
segment .text
global __chkstk
; __chkstk: touches the stack one 0x1000-byte step at a time, from the address
; loaded from gs:[0x10] down to the (rounded-down) address "caller's stack
; pointer minus rax". r10 and r11 are saved and restored; rax is only read.
; NOTE(review): rax is presumably the number of bytes the caller is about to
; allocate (the usual __chkstk convention) -- confirm against the compiler.
__chkstk:
; Spill r10/r11 so they can serve as scratch registers.
sub rsp, 0x10
mov [rsp], r10
mov [rsp+0x8], r11
xor r11, r11
; rsp+0x18 = 0x10 bytes of spill area + 8 bytes of return address, i.e. the
; stack pointer value the caller had before the call instruction.
lea r10, [rsp+0x18]
sub r10, rax
; If the subtraction borrowed (wrapped below zero), clamp the target to 0.
cmovb r10, r11
; NOTE(review): gs:[0x10] is presumably the thread's current stack limit in
; the TEB on x64 Windows -- confirm. NASM documents segment overrides inside
; the brackets ([gs:0x10]); confirm this form assembles as intended.
mov r11, gs:[0x10]
; Target is not below the loaded limit: nothing to touch.
cmp r10, r11
jnb .end
; Clear the low 12 bits: round the target down to a 0x1000 boundary.
and r10w, 0xf000
.loop:
; Step down by 0x1000 and write one byte there. The loop exits only on exact
; equality, so it relies on r11 being 0x1000-aligned.
lea r11, [r11-0x1000]
mov byte [r11], 0x0
cmp r10, r11
jnz .loop
.end:
; Restore the scratch registers and release the spill area.
mov r10, [rsp]
mov r11, [rsp+0x8]
add rsp, 0x10
ret

28
test/test_uchar.c Normal file
View File

@ -0,0 +1,28 @@
#include <uchar.h>
// Conversion state threaded through the mbrtoc16 calls below. Static storage
// zero-initializes it, which is the initial conversion state.
mbstate_t state;

// Converts a short UTF-8 string covering 1-, 2-, 3- and 4-byte sequences to
// UTF-16 with mbrtoc16 and checks the result.
// Returns 0 when the output matches the expected code units, 1 otherwise.
int main(void) {
    char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C"
    // Expected UTF-16 code units, terminating null included. U+1F34C lies
    // outside the BMP and therefore becomes the surrogate pair D83C DF4C.
    static char16_t const expected[] = {
        0x007a, 0x00df, 0x6c34, 0xd83c, 0xdf4c, 0x0000
    };
    size_t const in_sz = sizeof in / sizeof *in;
    // A UTF-8 sequence never yields more UTF-16 units than it has bytes, so
    // one output slot per input byte is always enough.
    char16_t out[sizeof in / sizeof *in];
    char *p_in = in, *end = in + in_sz;
    char16_t *p_out = out;
    size_t rc;
    while((rc = mbrtoc16(p_out, p_in, end - p_in, &state)) != 0) {
        if(rc == (size_t)-1) // invalid input
            return 1;
        if(rc == (size_t)-2) // truncated input
            return 1;
        // (size_t)-3 delivers the stored second unit of a surrogate pair and
        // consumes no input; any other value is a byte count.
        if(rc != (size_t)-3)
            p_in += rc;
        p_out += 1;
    }
    // The terminating null was stored by the call that returned 0.
    size_t out_sz = p_out - out + 1;
    if(out_sz != sizeof expected / sizeof *expected)
        return 1;
    for(size_t i = 0; i < out_sz; ++i) {
        if(out[i] != expected[i])
            return 1;
    }
    return 0;
}

1
todo
View File

@ -36,6 +36,7 @@ stdio.h:
stdlib.h:
qsort
Better PRNG
MB_CUR_MAX should be locale-dependent
Probably other stuff
threads.h: