mirror of https://github.com/flysand7/ciabatta.git
Stack checking and uchar UTF-8 -> UTF-16 conversion
This commit is contained in:
parent
7165ac1c41
commit
337e6734a7
10
bake.cmd
10
bake.cmd
|
@ -20,8 +20,8 @@ if "%1"=="test" (
|
|||
goto :skip_crt_compilation
|
||||
)
|
||||
|
||||
if not exist src\code\unicode\unicode_data.h (
|
||||
py src\code\unicode\unicode_compile.py
|
||||
if not exist src\code\unicode\data.h (
|
||||
py src\code\unicode\compile.py
|
||||
)
|
||||
|
||||
if exist bin rd/s/q bin
|
||||
|
@ -33,6 +33,10 @@ for /R src\%PLATFORM% %%F in (*.c) do (
|
|||
echo %%F
|
||||
clang -Isrc/win -c -o bin\%PLATFORM%\%%~nF.obj %%F %CIABATTA_OPTIONS%
|
||||
)
|
||||
for /R src\%PLATFORM% %%F in (*.asm) do (
|
||||
echo %%F
|
||||
nasm %%F -f win64 -o bin\%PLATFORM%\%%~nF.obj
|
||||
)
|
||||
for /R src\code %%F in (*.c) do (
|
||||
echo %%F
|
||||
clang -c -o bin\%%~nF.obj %%F %CIABATTA_OPTIONS%
|
||||
|
@ -44,5 +48,5 @@ llvm-ar rc ciabatta.lib bin\*.obj bin\%PLATFORM%\*.obj
|
|||
if "%TEST%"=="" set TEST=assert
|
||||
|
||||
echo Compiling test_%TEST%.c
|
||||
clang -fno-builtin test\test_%TEST%.c ciabatta.lib -std=c11 -lDbghelp -lkernel32 -luser32 -lshell32 -nostdlib %CIABATTA_OPTIONS%
|
||||
clang test\test_%TEST%.c ciabatta.lib -std=c11 -lDbghelp -lkernel32 -luser32 -lshell32 -nostdlib %CIABATTA_OPTIONS%
|
||||
::cl test\test_math.c /Iinc -D_CRT_SECURE_NO_WARNINGS /Z7 /link ciabatta.lib kernel32.lib user32.lib shell32.lib -nostdlib -nodefaultlibs
|
||||
|
|
|
@ -34,7 +34,7 @@ typedef struct lldiv_t {
|
|||
// #define EXIT_SUCCESS 0
|
||||
|
||||
#define RAND_MAX 65536
|
||||
// #define MB_CUR_MAX 5
|
||||
#define MB_CUR_MAX 4
|
||||
|
||||
// Microsoft extension, COUNTOF(x) counts array elements
|
||||
#ifndef COUNTOF
|
||||
|
|
38
inc/uchar.h
38
inc/uchar.h
|
@ -1,15 +1,39 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct mbstate_t mbstate_t;
|
||||
typedef uint16_t char16_t;
|
||||
typedef uint32_t char32_t;
|
||||
typedef uint_least16_t char16_t;
|
||||
typedef uint_least32_t char32_t;
|
||||
|
||||
struct mbstate_t {
|
||||
char filler[4];
|
||||
char16_t leftover;
|
||||
};
|
||||
|
||||
size_t mbrtoc16(char16_t * restrict pc16, const char * restrict s, size_t n, mbstate_t * restrict ps);
|
||||
size_t c16rtomb(char * restrict s, char16_t c16, mbstate_t * restrict ps);
|
||||
size_t mbrtoc32(char32_t * restrict pc32, const char * restrict s, size_t n, mbstate_t * restrict ps);
|
||||
size_t c32rtomb(char * restrict s, char32_t c32, mbstate_t * restrict ps);
|
||||
size_t mbrtoc16(
|
||||
char16_t *restrict pc16,
|
||||
char const *restrict s,
|
||||
size_t n,
|
||||
mbstate_t *restrict ps
|
||||
);
|
||||
|
||||
size_t c16rtomb(
|
||||
char *restrict s,
|
||||
char16_t c16,
|
||||
mbstate_t *restrict ps
|
||||
);
|
||||
|
||||
size_t mbrtoc32(
|
||||
char32_t *restrict pc32,
|
||||
char const *restrict s,
|
||||
size_t n,
|
||||
mbstate_t *restrict ps
|
||||
);
|
||||
|
||||
size_t c32rtomb(
|
||||
char *restrict s,
|
||||
char32_t c32,
|
||||
mbstate_t *restrict ps
|
||||
);
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct mbstate_t mbstate_t;
|
||||
typedef wchar_t wint_t;
|
||||
|
||||
struct mbstate_t {
|
||||
char filler[4];
|
||||
char16_t next;
|
||||
char bytes[4];
|
||||
};
|
||||
|
||||
#define WCHAR_MIN 0x0000
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
|
||||
#include <uchar.h>
|
||||
#include <errno.h>
|
||||
|
||||
size_t mbrtoc16(
|
||||
char16_t *restrict pc16,
|
||||
char const *restrict s,
|
||||
size_t n,
|
||||
mbstate_t *restrict ps
|
||||
) {
|
||||
if(s == NULL) {
|
||||
return 0;
|
||||
}
|
||||
size_t nbytes;
|
||||
char16_t parsed_char;
|
||||
char16_t next_char;
|
||||
// First check leftovers
|
||||
if(ps->leftover == 0) {
|
||||
// Decode the first byte of UTF-8 sequence
|
||||
unsigned byte0 = *s;
|
||||
if (0x00 <= byte0 && byte0 < 0x80) nbytes = 1;
|
||||
else if(0xc0 <= byte0 && byte0 < 0xe0) nbytes = 2;
|
||||
else if(0xe0 <= byte0 && byte0 < 0xf0) nbytes = 3;
|
||||
else if(0xf0 <= byte0 && byte0 < 0xf8) nbytes = 4;
|
||||
else goto encoding_error;
|
||||
unsigned nbytesreq = nbytes;
|
||||
if(n < nbytesreq) {
|
||||
return (size_t)(-2);
|
||||
}
|
||||
char32_t cp = byte0;
|
||||
switch(nbytesreq) {
|
||||
case 2: cp &= 0x1f; break;
|
||||
case 3: cp &= 0x0f; break;
|
||||
case 4: cp &= 0x07; break;
|
||||
}
|
||||
while(--nbytesreq)
|
||||
cp |= (cp << 6) | ((*++s) & 0x3f);
|
||||
if(0xdc00 <= cp && cp <= 0xe000)
|
||||
goto encoding_error;
|
||||
// Overloing seqs
|
||||
if(cp < 0x80 && nbytes > 1) goto encoding_error;
|
||||
if(cp < 0x800 && nbytes > 2) goto encoding_error;
|
||||
if(cp < 0x10000 && nbytes > 3) goto encoding_error;
|
||||
if(cp > 0x10ffff) goto encoding_error;
|
||||
// Now convert this char shit to UTF-16
|
||||
if(cp < 0x10000) {
|
||||
parsed_char = cp;
|
||||
next_char = 0; // no next
|
||||
}
|
||||
else {
|
||||
cp -= 0x10000;
|
||||
parsed_char = 0xd800 | (cp & 0x3ff);
|
||||
next_char = 0xdc00 | (cp >> 10);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if(pc16 != NULL) *pc16 = ps->leftover;
|
||||
ps->leftover = 0;
|
||||
return (size_t)(-3);
|
||||
}
|
||||
if(pc16 != NULL) *pc16 = parsed_char;
|
||||
ps->leftover = next_char;
|
||||
if(parsed_char == 0)
|
||||
return 0;
|
||||
else
|
||||
return nbytes;
|
||||
encoding_error:
|
||||
errno = EILSEQ;
|
||||
return (size_t)(-1);
|
||||
}
|
||||
|
||||
// Declarations of the remaining <uchar.h> conversion routines.
// No definitions for these appear in this part of the file.

// UTF-16 code unit -> multibyte sequence written to `s`.
size_t c16rtomb(char *restrict s, char16_t c16, mbstate_t *restrict ps);

// Multibyte sequence of at most `n` bytes at `s` -> UTF-32 code unit in `*pc32`.
size_t mbrtoc32(char32_t *restrict pc32, const char *restrict s, size_t n, mbstate_t *restrict ps);

// UTF-32 code unit -> multibyte sequence written to `s`.
size_t c32rtomb(char *restrict s, char32_t c32, mbstate_t *restrict ps);
|
|
@ -0,0 +1,28 @@
|
|||
|
||||
bits 64
|
||||
|
||||
segment .text
|
||||
|
||||
global __chkstk
|
||||
; __chkstk - stack probe helper.
;
; Expects rax to hold the number of bytes the caller is about to allocate on
; the stack (Windows x64 __chkstk convention - confirm against the compiler in
; use). Touches one byte in every 4 KiB page between the current stack limit
; and the new stack top so that each page is accessed in order before the
; caller moves rsp. All registers are preserved; rsp is left unchanged on
; return.
__chkstk:
    sub rsp, 0x10               ; scratch space for the two saved registers
    mov [rsp], r10              ; save r10
    mov [rsp+0x8], r11          ; save r11
    xor r11, r11                ; r11 = 0, used as the clamp value below
    lea r10, [rsp+0x18]         ; caller's rsp before the call (0x10 scratch + 8 return address)
    sub r10, rax                ; r10 = prospective new stack top
    cmovb r10, r11              ; the subtraction borrowed: clamp to 0
    mov r11, [gs:0x10]          ; current stack limit (NT_TIB.StackLimit on Win64);
                                ; NASM needs the segment override inside the brackets
    cmp r10, r11
    jnb .end                    ; new top is not below the limit: nothing to probe
    and r10w, 0xf000            ; round the target down to a 4 KiB page boundary
.loop:
    lea r11, [r11-0x1000]       ; step down one page
    mov byte [r11], 0x0         ; touch the page
    cmp r10, r11
    jnz .loop                   ; repeat until the target page has been touched
.end:
    mov r10, [rsp]              ; restore r10
    mov r11, [rsp+0x8]          ; restore r11
    add rsp, 0x10
    ret
|
|
@ -0,0 +1,28 @@
|
|||
|
||||
#include <uchar.h>
|
||||
|
||||
mbstate_t state;
|
||||
int main() {
|
||||
char in[] = u8"zß水🍌"; // or "z\u00df\u6c34\U0001F34C"
|
||||
size_t in_sz = sizeof in / sizeof *in;
|
||||
|
||||
char16_t out[in_sz];
|
||||
char *p_in = in, *end = in + in_sz;
|
||||
char16_t *p_out = out;
|
||||
size_t rc;
|
||||
while((rc = mbrtoc16(p_out, p_in, end - p_in, &state)))
|
||||
{
|
||||
if(rc == (size_t)-1) // invalid input
|
||||
break;
|
||||
else if(rc == (size_t)-2) // truncated input
|
||||
break;
|
||||
else if(rc == (size_t)-3) // UTF-16 high surrogate
|
||||
p_out += 1;
|
||||
else {
|
||||
p_in += rc;
|
||||
p_out += 1;
|
||||
};
|
||||
}
|
||||
|
||||
size_t out_sz = p_out - out + 1;
|
||||
}
|
Loading…
Reference in New Issue