Use unicope as submodule like a true chad

This commit is contained in:
bumbread 2022-07-06 13:31:15 +11:00
parent 097d7cf300
commit 704141d550
13 changed files with 44 additions and 693122 deletions

3
.gitmodules vendored Normal file
View File

@ -0,0 +1,3 @@
[submodule "unicope"]
path = unicope
url = https://github.com/bumbread/unicope.git

View File

@ -2,7 +2,7 @@
CC=clang CC=clang
SRC_DIR := src SRC_DIR := src
OBJ_DIR := bin OBJ_DIR := bin
IFLAGS := -Iinc -Isrc/win IFLAGS := -Iinc -Isrc/win -Iunicope/inc
# Detect target operating system # Detect target operating system
ifeq ($(OS),Windows_NT) ifeq ($(OS),Windows_NT)
@ -47,9 +47,13 @@ $(OBJ_DIR)/%.obj: $(SRC_DIR)/%.c
@-mkdir $(MKDIR_P_FLAG) "$(dir $@)" 2> $(NUL_FILE) @-mkdir $(MKDIR_P_FLAG) "$(dir $@)" 2> $(NUL_FILE)
$(CC) $(CFLAGS) -c -o $@ $< $(CC) $(CFLAGS) -c -o $@ $<
ciabatta.lib: $(OBJ_FILES) ciabatta.lib: $(OBJ_FILES) unicope/unicope.lib
llvm-ar rc $@ $^ llvm-ar rc $@ $^
# Compile the vendored unicope source. Because of -c the output is a single
# object file despite the .lib name; it is a prerequisite of ciabatta.lib and
# is archived into it through $^.
# The source is listed as a prerequisite so the object is rebuilt when the
# submodule is updated, and $(CC) is used so the compiler choice stays in one
# place.
unicope/unicope.lib: unicope/src/unicode.c
	$(CC) -I unicope/inc -c $< -o $@
test: ciabatta.lib test: ciabatta.lib
clang -g test/test_$(test).c ciabatta.lib -std=c11 $(LIBS) -nostdlib -Iinc clang -g test/test_$(test).c ciabatta.lib -std=c11 $(LIBS) -nostdlib -Iinc

View File

@ -13,17 +13,13 @@ if "%~1" neq "_start_" (
) )
shift /1 shift /1
set CIABATTA_OPTIONS=-Iinc -Wall -g -gcodeview -nodefaultlibs -D_CRT_SECURE_NO_WARNINGS -mfma set CIABATTA_OPTIONS=-Iinc -I unicope\inc -Wall -g -gcodeview -nodefaultlibs -D_CRT_SECURE_NO_WARNINGS -mfma
set PLATFORM=win set PLATFORM=win
if "%1"=="test" ( if "%1"=="test" (
goto :skip_crt_compilation goto :skip_crt_compilation
) )
if not exist src\code\unicode\data.h (
py src\code\unicode\compile.py
)
if exist bin rd/s/q bin if exist bin rd/s/q bin
mkdir bin mkdir bin
mkdir bin\%PLATFORM% mkdir bin\%PLATFORM%

View File

@ -1,8 +1,8 @@
for /R src\%PLATFORM% %%F in (*.c) do ( for /R src\%PLATFORM% %%F in (*.c) do (
echo %%F echo %%F
start /B clang -Isrc/win -c -o bin\%PLATFORM%\%%~nF.obj %%F %CIABATTA_OPTIONS% start /B clang -I unicope\inc -Isrc/win -c -o bin\%PLATFORM%\%%~nF.obj %%F %CIABATTA_OPTIONS%
) )
for /R src\code %%F in (*.c) do ( for /R src\code %%F in (*.c) do (
echo %%F echo %%F
start /B clang -c -o bin\%%~nF.obj %%F %CIABATTA_OPTIONS% start /B clang -I unicope\inc -c -o bin\%%~nF.obj %%F %CIABATTA_OPTIONS%
) )

View File

@ -1,18 +1,16 @@
@echo off @echo off
setlocal enabledelayedexpansion setlocal enabledelayedexpansion
if not exist src\code\unicode\data.h (
py src\code\unicode\compile.py
)
set PLATFORM=win set PLATFORM=win
set CIABATTA_OPTIONS=--crt none -I %% -I inc set CIABATTA_OPTIONS=--crt none -I %% -I inc -I unicope/inc
del ciabatta.lib del ciabatta.lib
del unicope\unicope.lib
cuik unicope\src\unicode.c -I unicope\inc -c -o unicope\unicope.lib
cuik %CIABATTA_OPTIONS% src\code\*.c src\%PLATFORM%\*.c -c -o ciabatta.obj cuik %CIABATTA_OPTIONS% src\code\*.c src\%PLATFORM%\*.c -c -o ciabatta.obj
lib /out:ciabatta.lib ciabatta.obj lib /out:ciabatta.lib ciabatta.obj
if "%TEST%"=="" set TEST=assert if "%TEST%"=="" set TEST=assert
cuik test\test_%TEST%.c --lib ciabatta.lib,kernel32.lib,user32.lib,shell32.lib %CIABATTA_OPTIONS% cuik test\test_%TEST%.c --lib ciabatta.lib,unicope/unicope.lib,kernel32.lib,user32.lib,shell32.lib %CIABATTA_OPTIONS%
del ciabatta.obj del ciabatta.obj

View File

@ -1,141 +0,0 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
/* UTF-16 code unit. */
typedef uint_least16_t char16_t;
/* UTF-32 code unit. */
typedef uint_least32_t char32_t;
/* A Unicode code point; signed 32-bit integer. */
typedef int32_t uchar_t;
/*
 * Negative status codes returned (as int) by the utf8_* / utf16_* functions
 * in place of a unit count.
 */
#define UNI_EBADCP (-1) /* code point rejected by uni_valid() */
#define UNI_EULSUR (-2) /* low surrogate found where a character should start */
#define UNI_EIBYTE (-3) /* byte cannot start a UTF-8 sequence */
#define UNI_ETBYTE (-4) /* UTF-8 continuation byte is not of the form 10xxxxxx */
#define UNI_ESTRLN (-5) /* buffer length too short for the sequence */
#define UNI_EOLONG (-6) /* NOTE(review): presumably "overlong encoding" - confirm */
/*
 * Values stored in uchar_props.cat_gen.
 * Apart from UCHAR_BAD, each enumerator is UCHAR_ followed by the two-letter
 * category abbreviation taken from the third field of data.txt by the
 * generator script. UCHAR_BAD is returned by uni_cat_gen() when a code point
 * has no property record.
 */
enum {
UCHAR_BAD,
UCHAR_Cc,
UCHAR_Cf,
UCHAR_Co,
UCHAR_Cs,
UCHAR_Ll,
UCHAR_Lm,
UCHAR_Lo,
UCHAR_Lt,
UCHAR_Lu,
UCHAR_Mc,
UCHAR_Me,
UCHAR_Mn,
UCHAR_Nd,
UCHAR_Nl,
UCHAR_No,
UCHAR_Pc,
UCHAR_Pd,
UCHAR_Pe,
UCHAR_Pf,
UCHAR_Pi,
UCHAR_Po,
UCHAR_Ps,
UCHAR_Sc,
UCHAR_Sk,
UCHAR_Sm,
UCHAR_So,
UCHAR_Zl,
UCHAR_Zp,
UCHAR_Zs,
};
/*
 * Values stored in uchar_props.cat_bidi.
 * Each enumerator is UCHAR_BIDI_ followed by the bidirectional class
 * abbreviation taken from the fifth field of data.txt by the generator
 * script.
 */
enum {
UCHAR_BIDI_AL,
UCHAR_BIDI_AN,
UCHAR_BIDI_B,
UCHAR_BIDI_BN,
UCHAR_BIDI_CS,
UCHAR_BIDI_EN,
UCHAR_BIDI_ES,
UCHAR_BIDI_ET,
UCHAR_BIDI_FSI,
UCHAR_BIDI_L,
UCHAR_BIDI_LRE,
UCHAR_BIDI_LRI,
UCHAR_BIDI_LRO,
UCHAR_BIDI_NSM,
UCHAR_BIDI_ON,
UCHAR_BIDI_PDF,
UCHAR_BIDI_PDI,
UCHAR_BIDI_R,
UCHAR_BIDI_RLE,
UCHAR_BIDI_RLI,
UCHAR_BIDI_RLO,
UCHAR_BIDI_S,
UCHAR_BIDI_WS,
};
/*
 * Values stored in uchar_props.dec_type.
 * The generator script upper-cases the <tag> that prefixes a decomposition
 * mapping in data.txt and appends it to UCHAR_DECOMP_; mappings without a
 * tag (and characters without a mapping) get UCHAR_DECOMP_CANON.
 */
enum {
UCHAR_DECOMP_CANON,
UCHAR_DECOMP_FONT,
UCHAR_DECOMP_NOBREAK,
UCHAR_DECOMP_INITIAL,
UCHAR_DECOMP_MEDIAL,
UCHAR_DECOMP_FINAL,
UCHAR_DECOMP_ISOLATED,
UCHAR_DECOMP_CIRCLE,
UCHAR_DECOMP_SUPER,
UCHAR_DECOMP_SUB,
UCHAR_DECOMP_VERTICAL,
UCHAR_DECOMP_WIDE,
UCHAR_DECOMP_NARROW,
UCHAR_DECOMP_SMALL,
UCHAR_DECOMP_SQUARE,
UCHAR_DECOMP_FRACTION,
UCHAR_DECOMP_COMPAT,
};
/*
 * Property record for one code point. One record per line of data.txt is
 * emitted into the generated unicode_data[] table, indexed by code point.
 */
typedef struct uchar_props uchar_props;
struct uchar_props {
uchar_t code; // the code point this record describes
char const *name; // character name (second field of data.txt)
int cat_gen; // general category, one of the UCHAR_* values
int cat_bidi; // bidirectional class, one of the UCHAR_BIDI_* values
int comb_class; // combining class (fourth field of data.txt)
int dec_type; // decomposition type, one of the UCHAR_DECOMP_* values
int dec_map_n; // number of code points used in dec_map
uchar_t const dec_map[18]; // U+FDFA takes 18, everything else takes up <8
int dec_value; // decimal value, -1 when data.txt leaves the field empty
int dig_value; // digit value, -1 when data.txt leaves the field empty
double num_value; // numeric value, -1 when data.txt leaves the field empty
int bidi_mirrored; // 1 when the mirrored field is 'Y', otherwise 0
char const *old_name; // eleventh field of data.txt
char const *comment; // twelfth field of data.txt
uchar_t lower; // lowercase mapping; the code point itself when none is listed
uchar_t upper; // uppercase mapping; the code point itself when none is listed
uchar_t title; // titlecase mapping; the code point itself when none is listed
};
/* Code point queries. uni_props returns NULL when there is no record. */
uchar_props *uni_props (uchar_t cp);
/* Non-zero when cp is in 0..0xd7ff or 0xe000..0x10ffff. */
int uni_valid (uchar_t cp);
/* NOTE(review): presumably returns a UCHAR_* general category - confirm against the implementation. */
int uni_classify(uchar_t cp);
/* Case mappings taken from the property table. */
uchar_t uni_tolower (uchar_t cp);
uchar_t uni_toupper (uchar_t cp);
uchar_t uni_totitle (uchar_t cp);
/* Surrogate helpers: high is 0xd800..0xdbff, low is 0xdc00..0xdfff. */
int uni_is_hsur(char16_t cp);
int uni_is_lsur(char16_t cp);
/* Combine a high/low surrogate pair into one code point. */
uchar_t uni_surtoc (char16_t hsur, char16_t lsur);
/* Length in code units of the character starting at str, or a UNI_E* code. */
int utf16_chlen(char16_t const *str);
int utf8_chlen (char const *str);
/*
 * Decoders: store the decoded code point through ch (when ch is not NULL)
 * and return the number of code units consumed, or a negative UNI_E* code.
 * The _s variants additionally take the number of units available in str.
 */
int utf16_dec_s(char16_t const *restrict str, size_t len, uchar_t *restrict ch);
int utf8_dec_s (char const *restrict str, size_t len, uchar_t *restrict ch);
int utf16_dec (char16_t const *restrict str, uchar_t *restrict ch);
int utf8_dec (char const *restrict str, uchar_t *restrict ch);
/*
 * Encoders: write ch to str and return the number of code units written, or
 * a negative UNI_E* code. The _s variants additionally take the capacity of
 * str in code units.
 */
int utf16_enc_s(char16_t *str, size_t len, uchar_t ch);
int utf8_enc_s (char *str, size_t len, uchar_t ch);
int utf16_enc (char16_t *str, uchar_t ch);
int utf8_enc (char *str, uchar_t ch);

View File

@ -2,6 +2,8 @@
#include <uchar.h> #include <uchar.h>
#include <errno.h> #include <errno.h>
#include <unicode.h>
size_t mbrtoc16( size_t mbrtoc16(
char16_t *restrict pc16, char16_t *restrict pc16,
char const *restrict s, char const *restrict s,
@ -12,59 +14,34 @@ size_t mbrtoc16(
*ps = (mbstate_t) {0}; *ps = (mbstate_t) {0};
return 0; return 0;
} }
size_t nbytes; // First check leftovers, using 0xd800 as marker because it doesn't
char16_t parsed_char; // encode a valid character.
char16_t next_char; if(ps->leftover != 0xd800) {
// First check leftovers
if(ps->leftover == 0) {
// Decode the first byte of UTF-8 sequence
unsigned byte0 = *s;
if (0x00 <= byte0 && byte0 < 0x80) nbytes = 1;
else if(0xc0 <= byte0 && byte0 < 0xe0) nbytes = 2;
else if(0xe0 <= byte0 && byte0 < 0xf0) nbytes = 3;
else if(0xf0 <= byte0 && byte0 < 0xf8) nbytes = 4;
else goto encoding_error;
unsigned nbytesreq = nbytes;
if(n < nbytesreq) {
return (size_t)(-2);
}
char32_t cp = byte0;
switch(nbytesreq) {
case 2: cp &= 0x1f; break;
case 3: cp &= 0x0f; break;
case 4: cp &= 0x07; break;
}
while(--nbytesreq)
cp |= (cp << 6) | ((*++s) & 0x3f);
if(0xdc00 <= cp && cp <= 0xe000)
goto encoding_error;
// Overloing seqs
if(cp < 0x80 && nbytes > 1) goto encoding_error;
if(cp < 0x800 && nbytes > 2) goto encoding_error;
if(cp < 0x10000 && nbytes > 3) goto encoding_error;
if(cp > 0x10ffff) goto encoding_error;
// Now convert this char shit to UTF-16
if(cp < 0x10000) {
parsed_char = cp;
next_char = 0; // no next
}
else {
cp -= 0x10000;
parsed_char = 0xd800 | (cp >> 10);
next_char = 0xdc00 | (cp & 0x3ff);
}
}
else {
if(pc16 != NULL) *pc16 = ps->leftover; if(pc16 != NULL) *pc16 = ps->leftover;
ps->leftover = 0; ps->leftover = 0xd800;
return (size_t)(-3); return (size_t)(-3);
} }
if(pc16 != NULL) *pc16 = parsed_char; else {
ps->leftover = next_char; uchar_t ch;
if(parsed_char == 0) char16_t str[3];
return 0; int chlen = utf8_dec(s, &ch);
else if(chlen <= 0) goto encoding_error;
return nbytes; int wrlen = utf16_enc(str, ch);
char16_t curc;
char16_t next;
if(wrlen <= 0) goto encoding_error;
else if(wrlen == 2) {
curc = str[0];
next = 0xd800;
}
else {
curc = str[0];
next = str[1];
}
ps->leftover = next;
if(pc16 != NULL) *pc16 = curc;
return (size_t)-2;
}
encoding_error: encoding_error:
errno = EILSEQ; errno = EILSEQ;
return (size_t)(-1); return (size_t)(-1);

View File

@ -1,279 +0,0 @@
#include <unicode.h>
#include "unicode/data.h"
/*
 * Look up the property record of a code point.
 *
 * Returns a pointer into unicode_data[], or NULL when cp is not a valid code
 * point, lies past the end of the table, or has no record in the table
 * (unlisted slots are zero-filled, so their .code does not match cp).
 */
uchar_props *uni_props(uchar_t cp) {
    /* The table is declared without an explicit size, so it is only as long
     * as its highest designated index; that need not reach 0x10ffff. */
    size_t const table_len = sizeof unicode_data / sizeof unicode_data[0];
    if(!uni_valid(cp)) return NULL;
    if((size_t)cp >= table_len) return NULL;
    if(unicode_data[cp].code != cp) return NULL;
    return &unicode_data[cp];
}
/*
 * General category of a code point.
 *
 * Returns the cat_gen field of the code point's property record, or
 * UCHAR_BAD when uni_props() finds no record.
 */
int uni_cat_gen(uchar_t cp) {
    uchar_props *props = uni_props(cp);
    if(props == NULL) return UCHAR_BAD;
    return props->cat_gen;
}

/*
 * unicode.h declares this lookup under the name uni_classify; forward it to
 * uni_cat_gen so callers of the declared name link.
 */
int uni_classify(uchar_t cp) {
    return uni_cat_gen(cp);
}
/*
 * Lowercase mapping of a code point.
 *
 * Returns cp unchanged when it has no property record (invalid, out of the
 * table, or unassigned) instead of indexing the table blindly.
 */
uchar_t uni_tolower(uchar_t cp) {
    uchar_props *props = uni_props(cp);
    if(props == NULL) return cp;
    return props->lower;
}
/*
 * Uppercase mapping of a code point.
 *
 * Returns cp unchanged when it has no property record (invalid, out of the
 * table, or unassigned) instead of indexing the table blindly.
 */
uchar_t uni_toupper(uchar_t cp) {
    uchar_props *props = uni_props(cp);
    if(props == NULL) return cp;
    return props->upper;
}
/*
 * Titlecase mapping of a code point.
 *
 * Returns cp unchanged when it has no property record (invalid, out of the
 * table, or unassigned) instead of indexing the table blindly.
 */
uchar_t uni_totitle(uchar_t cp) {
    uchar_props *props = uni_props(cp);
    if(props == NULL) return cp;
    return props->title;
}
/*
 * Non-zero when ch lies in 0..0xd7ff or 0xe000..0x10ffff, i.e. it is within
 * the code space and is not a surrogate.
 */
int uni_valid(uchar_t ch) {
    if(ch < 0x0000) return 0;
    if(ch <= 0xd7ff) return 1;
    if(ch < 0xe000) return 0;
    return ch <= 0x10ffff;
}
/* Non-zero when ch is a high (leading) surrogate, 0xd800..0xdbff. */
int uni_is_hsur(char16_t ch) {
    if(ch < 0xd800) return 0;
    return ch <= 0xdbff;
}
/* Non-zero when ch is a low (trailing) surrogate, 0xdc00..0xdfff. */
int uni_is_lsur(char16_t ch) {
    if(ch < 0xdc00) return 0;
    return ch <= 0xdfff;
}
/*
 * Combine a surrogate pair into a code point: the low ten bits of hsur
 * become bits 10..19, the low ten bits of lsur become bits 0..9, and
 * 0x10000 is added to the result. The arguments are not checked.
 */
uchar_t uni_surtoc(char16_t hsur, char16_t lsur) {
    uchar_t upper_bits = (hsur & 0x3ff) << 10;
    uchar_t lower_bits = lsur & 0x3ff;
    return (upper_bits | lower_bits) + 0x10000;
}
/*
 * Length, in UTF-16 code units, of the character starting at str.
 * Returns 2 for a high surrogate, 1 for a non-surrogate unit, and
 * UNI_EULSUR when the first unit is a low surrogate.
 */
int utf16_chlen(char16_t const *str) {
    char16_t unit = str[0];
    if(uni_is_lsur(unit)) return UNI_EULSUR;
    return uni_is_hsur(unit) ? 2 : 1;
}
/*
 * Length, in bytes, of the UTF-8 sequence announced by the first byte of
 * str. Returns 1..4, or UNI_EIBYTE when the byte is a continuation byte
 * (0x80..0xbf) or is 0xf8 or above.
 */
int utf8_chlen(char const *str) {
    uint8_t lead = (uint8_t)str[0];
    if(lead >= 0xf8) return UNI_EIBYTE;
    if(lead >= 0xf0) return 4;
    if(lead >= 0xe0) return 3;
    if(lead >= 0xc0) return 2;
    if(lead >= 0x80) return UNI_EIBYTE;
    return 1;
}
/*
 * Decode one character from a UTF-16 string.
 *
 * str  points at the first code unit; when that unit is a high surrogate,
 *      str[1] is read as well.
 * chp  receives the decoded code point, or U+FFFD on error; may be NULL.
 *
 * Returns the number of code units consumed (1 or 2), UNI_EULSUR when the
 * string starts with a low surrogate, or UNI_EBADCP when a high surrogate is
 * not followed by a low surrogate.
 */
int utf16_dec(char16_t const *restrict str, uchar_t *restrict chp) {
    int chlen;
    uchar_t ch;
    if(uni_is_hsur(str[0])) {
        char16_t hsur = str[0];
        char16_t lsur = str[1];
        if(uni_is_lsur(lsur)) {
            ch = uni_surtoc(hsur, lsur);
            chlen = 2;
        }
        else {
            /* Without this check an arbitrary second unit would be folded
             * into a bogus supplementary code point. */
            chlen = UNI_EBADCP;
            ch = 0xfffd;
        }
    }
    else if(!uni_is_lsur(str[0])) {
        ch = str[0];
        chlen = 1;
    }
    else {
        chlen = UNI_EULSUR;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
/*
 * Decode one character from a UTF-16 buffer of known length.
 *
 * str  points at the first code unit.
 * len  number of code units available in str (size_t, matching the
 *      declaration in unicode.h).
 * chp  receives the decoded code point, or U+FFFD on a decoding error; may
 *      be NULL. It is left untouched when 0 or UNI_ESTRLN is returned.
 *
 * Returns the number of code units consumed (1 or 2), 0 when len is 0,
 * UNI_ESTRLN when a high surrogate is the last available unit (the same
 * code the other *_s functions use for a short buffer), UNI_EULSUR when the
 * buffer starts with a low surrogate, or UNI_EBADCP when a high surrogate is
 * not followed by a low surrogate.
 */
int utf16_dec_s(
    char16_t const *restrict str,
    size_t len,
    uchar_t *restrict chp
) {
    if(len == 0) return 0;
    int chlen;
    uchar_t ch;
    if(uni_is_hsur(str[0])) {
        if(len < 2) return UNI_ESTRLN;
        char16_t hsur = str[0];
        char16_t lsur = str[1];
        if(uni_is_lsur(lsur)) {
            ch = uni_surtoc(hsur, lsur);
            chlen = 2;
        }
        else {
            chlen = UNI_EBADCP;
            ch = 0xfffd;
        }
    }
    else if(!uni_is_lsur(str[0])) {
        ch = str[0];
        chlen = 1;
    }
    else {
        chlen = UNI_EULSUR;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
/*
 * Decode one character from a UTF-8 string.
 *
 * str  points at the lead byte; as many continuation bytes as the lead byte
 *      announces are read after it, stopping at the first malformed one.
 * chp  receives the decoded code point, or U+FFFD on error; may be NULL.
 *
 * Returns the number of bytes consumed (1..4), or
 *   UNI_EIBYTE  the first byte cannot start a sequence,
 *   UNI_ETBYTE  a continuation byte is not of the form 10xxxxxx,
 *   UNI_EOLONG  the sequence is longer than the code point requires,
 *   UNI_EBADCP  the decoded value is rejected by uni_valid().
 */
int utf8_dec(char const *restrict str, uchar_t *restrict chp) {
    /* Smallest code point that legitimately needs a sequence of N bytes. */
    static uchar_t const min_for_len[5] = {0, 0, 0x80, 0x800, 0x10000};
    uint8_t const *ustr = (uint8_t const *)str;
    int chlen;
    uchar_t ch;
    if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
    else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
    else if(ustr[0] < 0xe0) ch = ustr[0] & 0x1f, chlen = 2;
    else if(ustr[0] < 0xf0) ch = ustr[0] & 0x0f, chlen = 3;
    else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4;
    else ch = 0xfffd, chlen = UNI_EIBYTE;
    if(chlen > 0) for(int i = 1; i < chlen; ++i) {
        uint8_t trail = ustr[i];
        if((trail & 0xc0) != 0x80) {
            chlen = UNI_ETBYTE;
            ch = 0xfffd;
            break;
        }
        ch <<= 6;
        ch |= (trail & 0x3f);
    }
    /* Reject overlong forms such as C0 80, which would otherwise decode. */
    if(chlen > 1 && ch < min_for_len[chlen]) {
        chlen = UNI_EOLONG;
        ch = 0xfffd;
    }
    if(!uni_valid(ch)) {
        chlen = UNI_EBADCP;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
/*
 * Decode one character from a UTF-8 buffer of known length.
 *
 * str  points at the lead byte.
 * len  number of bytes available in str (size_t, matching the declaration
 *      in unicode.h).
 * chp  receives the decoded code point, or U+FFFD on a decoding error; may
 *      be NULL. It is left untouched when 0 or UNI_ESTRLN is returned.
 *
 * Returns the number of bytes consumed (1..4), 0 when len is 0, or
 *   UNI_ESTRLN  the sequence announced by the lead byte exceeds len,
 *   UNI_EIBYTE  the first byte cannot start a sequence,
 *   UNI_ETBYTE  a continuation byte is not of the form 10xxxxxx,
 *   UNI_EOLONG  the sequence is longer than the code point requires,
 *   UNI_EBADCP  the decoded value is rejected by uni_valid().
 */
int utf8_dec_s(
    char const *restrict str,
    size_t len,
    uchar_t *restrict chp
) {
    /* Smallest code point that legitimately needs a sequence of N bytes. */
    static uchar_t const min_for_len[5] = {0, 0, 0x80, 0x800, 0x10000};
    if(len == 0) return 0;
    uint8_t const *restrict ustr = (uint8_t const *restrict)str;
    int chlen;
    uchar_t ch;
    if(ustr[0] < 0x80) ch = ustr[0], chlen = 1;
    else if(ustr[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
    else if(ustr[0] < 0xe0) ch = ustr[0] & 0x1f, chlen = 2;
    else if(ustr[0] < 0xf0) ch = ustr[0] & 0x0f, chlen = 3;
    else if(ustr[0] < 0xf8) ch = ustr[0] & 0x07, chlen = 4;
    else ch = 0xfffd, chlen = UNI_EIBYTE;
    /* Only compare positive lengths: a negative error code converted to
     * size_t would compare greater than any len. */
    if(chlen > 0 && (size_t)chlen > len) {
        return UNI_ESTRLN;
    }
    if(chlen > 0) for(int i = 1; i < chlen; ++i) {
        uint8_t trail = ustr[i];
        if((trail & 0xc0) != 0x80) {
            chlen = UNI_ETBYTE;
            ch = 0xfffd;
            break;
        }
        ch <<= 6;
        ch |= (trail & 0x3f);
    }
    /* Reject overlong forms such as C0 80, which would otherwise decode. */
    if(chlen > 1 && ch < min_for_len[chlen]) {
        chlen = UNI_EOLONG;
        ch = 0xfffd;
    }
    if(!uni_valid(ch)) {
        chlen = UNI_EBADCP;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
/*
 * Encode a code point as UTF-16.
 *
 * str  output buffer; must have room for two code units.
 * cp   code point to encode.
 *
 * Returns the number of code units written (1 or 2), or UNI_EBADCP when cp
 * is rejected by uni_valid(); nothing is written in that case.
 */
int utf16_enc(char16_t *str, uchar_t cp) {
    if(!uni_valid(cp)) {
        return UNI_EBADCP;
    }
    if(cp < 0x10000) {
        str[0] = (char16_t)cp;
        return 1;
    }
    /* Supplementary plane: split the 20-bit offset into a surrogate pair. */
    cp -= 0x10000;
    str[0] = (char16_t)(0xD800 + (cp >> 10));
    str[1] = (char16_t)(0xDC00 + (cp & 0x3ff));
    return 2;
}
/*
 * Encode a code point as UTF-8.
 *
 * str  output buffer; must have room for four bytes.
 * ch   code point to encode.
 *
 * Returns the number of bytes written (1..4), or UNI_EBADCP when ch is
 * rejected by uni_valid(); nothing is written in that case.
 */
int utf8_enc(char *str, uchar_t ch) {
    if(!uni_valid(ch)) {
        return UNI_EBADCP;
    }
    if(ch < 0x80) {
        str[0] = (char)ch;
        return 1;
    }
    if(ch < 0x800) {
        /* 110xxxxx 10xxxxxx */
        str[0] = (char)(0xc0 | (ch >> 6));
        str[1] = (char)(0x80 | (ch & 0x3f));
        return 2;
    }
    if(ch < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        str[0] = (char)(0xe0 | (ch >> 12));
        str[1] = (char)(0x80 | ((ch >> 6) & 0x3f));
        str[2] = (char)(0x80 | (ch & 0x3f));
        return 3;
    }
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    str[0] = (char)(0xf0 | (ch >> 18));
    str[1] = (char)(0x80 | ((ch >> 12) & 0x3f));
    str[2] = (char)(0x80 | ((ch >> 6) & 0x3f));
    str[3] = (char)(0x80 | (ch & 0x3f));
    return 4;
}
/*
 * Encode a code point as UTF-16 into a buffer of known capacity.
 *
 * str  output buffer.
 * len  capacity of str in code units.
 * ch   code point to encode.
 *
 * Returns the number of code units written (1 or 2), UNI_EBADCP when ch is
 * rejected by uni_valid(), 0 when len is 0, or UNI_ESTRLN when a surrogate
 * pair is needed but len is less than 2. Nothing is written unless a
 * positive count is returned.
 */
int utf16_enc_s(char16_t *str, size_t len, uchar_t ch) {
    if(!uni_valid(ch)) {
        return UNI_EBADCP;
    }
    if(len == 0) return 0;
    if(ch < 0x10000) {
        str[0] = (char16_t)ch;
        return 1;
    }
    if(len < 2) return UNI_ESTRLN;
    /* Supplementary plane: split the 20-bit offset into a surrogate pair. */
    ch -= 0x10000;
    str[0] = (char16_t)(0xD800 + (ch >> 10));
    str[1] = (char16_t)(0xDC00 + (ch & 0x3ff));
    return 2;
}
/*
 * Encode a code point as UTF-8 into a buffer of known capacity.
 *
 * str  output buffer.
 * len  capacity of str in bytes.
 * ch   code point to encode.
 *
 * Returns the number of bytes written (1..4), UNI_EBADCP when ch is
 * rejected by uni_valid(), 0 when len is 0, or UNI_ESTRLN when the sequence
 * does not fit in len bytes. Nothing is written unless a positive count is
 * returned.
 */
int utf8_enc_s(char *str, size_t len, uchar_t ch) {
    if(!uni_valid(ch)) {
        return UNI_EBADCP;
    }
    if(len == 0) return 0;
    if(ch < 0x80) {
        str[0] = (char)ch;
        return 1;
    }
    if(ch < 0x800) {
        if(len < 2) return UNI_ESTRLN;
        /* 110xxxxx 10xxxxxx */
        str[0] = (char)(0xc0 | (ch >> 6));
        str[1] = (char)(0x80 | (ch & 0x3f));
        return 2;
    }
    if(ch < 0x10000) {
        if(len < 3) return UNI_ESTRLN;
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        str[0] = (char)(0xe0 | (ch >> 12));
        str[1] = (char)(0x80 | ((ch >> 6) & 0x3f));
        str[2] = (char)(0x80 | (ch & 0x3f));
        return 3;
    }
    if(len < 4) return UNI_ESTRLN;
    /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
    str[0] = (char)(0xf0 | (ch >> 18));
    str[1] = (char)(0x80 | ((ch >> 12) & 0x3f));
    str[2] = (char)(0x80 | ((ch >> 6) & 0x3f));
    str[3] = (char)(0x80 | (ch & 0x3f));
    return 4;
}

View File

@ -1,98 +0,0 @@
# Generates data.h (the unicode_data[] table) from data.txt, a copy of
# UnicodeData.txt. Both files live next to this script.
import os
import sys

# Work relative to the script's own directory so it can be launched from
# anywhere.
abspath = os.path.abspath(sys.argv[0])
dname = os.path.dirname(abspath)
os.chdir(dname)

with open('data.h', 'w') as header:
    header.write('\n')
    header.write('#pragma once\n\n')
    header.write('#include <unicode.h>\n')
    header.write(
'''
uchar_props unicode_data[] = {
''')
    with open('data.txt') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip() == '':
                continue
            row = line.split(';')
            code = row[0].strip()
            name = row[1].strip()
            cat_gen = row[2].strip()
            comb_class = row[3].strip()
            cat_bidi = row[4].strip()
            dec_map = row[5].strip()
            dec_value = row[6].strip()
            dig_value = row[7].strip()
            num_value = row[8].strip()
            mirrored = row[9].strip()
            old_name = row[10].strip()
            comment = row[11].strip()
            upper = row[12].strip()
            lower = row[13].strip()
            title = row[14].strip()
            # Process decompositional mapping: an optional <tag> followed by
            # a space-separated list of hexadecimal code points.
            dec_map_n = 0
            dec_type = 'CANON'
            if dec_map != '':
                dec_map = dec_map.split(' ')
                if dec_map[0][0] == '<':
                    dec_type = dec_map[0][1:-1].upper()
                    dec_map = dec_map[1:]
                dec_map_n = len(dec_map)
            if dec_map_n != 0:
                dec_map = ', '.join('0x' + cp for cp in dec_map)
            else:
                dec_map = '0'
            # Missing numeric fields are encoded as -1.
            if dec_value == '':
                dec_value = '-1'
            if dig_value == '':
                dig_value = '-1'
            if num_value == '':
                num_value = '-1'
            elif '/' in num_value:
                # Fractions such as 1/4 would be evaluated with C integer
                # division (giving 0) if emitted verbatim; force the
                # numerator to be a double.
                numerator, denominator = num_value.split('/')
                num_value = '%s.0/%s' % (numerator, denominator)
            # Make sure lowercase, uppercase and titlecase mappings are
            # defined: a character without a mapping maps to itself.
            if lower == '':
                lower = code
            if upper == '':
                upper = code
            if title == '':
                title = code
            header.write(
'''
[0x%s] = {
.code = 0x%s,
.name = "%s",
.cat_gen = UCHAR_%s,
.cat_bidi = UCHAR_BIDI_%s,
.comb_class = %s,
.dec_type = UCHAR_DECOMP_%s,
.dec_map_n = %s,
.dec_map = {%s},
.dec_value = %s,
.dig_value = %s,
.num_value = %s,
.bidi_mirrored = %s,
.old_name = "%s",
.comment = "%s",
.lower = 0x%s,
.upper = 0x%s,
.title = 0x%s,
},''' % (
                code,
                code,
                name,
                cat_gen,
                cat_bidi,
                comb_class,
                dec_type,
                dec_map_n,
                dec_map,
                dec_value,
                dig_value,
                num_value,
                '1' if mirrored == 'Y' else '0',
                old_name,
                comment,
                lower,
                upper,
                title
            ))
    header.write('};\n\n')

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,11 +0,0 @@
The unicode-based functions work based on official unicode data. You can find
the file with Unicode data at:
https://unicode.org/Public/UNIDATA/UnicodeData.txt
This file is placed into this directory under the name data.txt. To update to a
newer version of the Unicode standard, replace data.txt with the new file and
run compile.py with the Python interpreter. It will generate a new data.h header file.
DO NOT MODIFY data.h DIRECTLY BRUH.

1
unicope Submodule

@ -0,0 +1 @@
Subproject commit 5402d9b6987795856dd870c17075839453238503