Factor out some unicode stuff also it compiles on linux

2022-06-28 22:49:30 +11:00 · 2022-06-28 22:49:30 +11:00 · 328e9f6c35
parent 0d58124c26
commit 328e9f6c35
9 changed files with 34865 additions and 34744 deletions
--- a/55
+++ b/55
@ -1,20 +1,53 @@

-GNUFLAGS=-Werror -Wall -Iinc -Isrc/win
-CLFLAGS=/I:inc /I:src/win /link /incremental:no /subsystem:windows /nodefaultlib kernel32.lib 
-
 CC=clang
-CFLAGS=$(GNUFLAGS)
-LDFLAGS=/nologo /nodefaultlib /entry:mainCRTStartup
-
 SRC_DIR := src
 OBJ_DIR := bin
-SRC_FILES := $(wildcard $(SRC_DIR)/code/*.c) $(wildcard $(SRC_DIR)/win/*.c)
-OBJ_FILES := $(patsubst $(SRC_DIR)/%.c,$(OBJ_DIR)/%.obj,$(SRC_FILES))
+IFLAGS := -Iinc -Isrc/win

-ciabatta.lib: $(OBJ_FILES)
-	lib $(LDFLAGS) /out:$@ $^
+# Detect target operating system
+ifeq ($(OS),Windows_NT)
+    PLATFORM := win
+else
+    PLATFORM := $(shell sh -c 'uname 2>/dev/null || echo Unknown')
+    PLATFORM := $(shell sh -c 'echo $(PLATFORM) | tr A-Z a-z')
+endif
+# PLATFORM was lower-cased above, so the fallback value reads "unknown" here
+ifeq ($(PLATFORM),unknown)
+    $(error Unknown platform)
+endif
+
+# If we're compiling under windows we'll link to these libraries
+ifeq ($(PLATFORM),win)
+	LIBS := -lDbghelp -lkernel32 -luser32 -lshell32 
+endif
+
+# Compiler flags; only clang is supported, anything else aborts the build
+ifeq ($(CC), clang)
+    CFLAGS=$(GNUFLAGS) -Werror -Wall -msse2 $(IFLAGS)
+else
+    # Shell commands (echo/exit) are not valid outside a recipe; let make abort
+    $(error Unsupported CC '$(CC)': only clang is supported)
+endif
+
+# Figure out what we want to compile at the end
+SRC_FILES := $(wildcard $(SRC_DIR)/code/*.c) $(wildcard $(SRC_DIR)/$(PLATFORM)/*.c)
+OBJ_FILES := $(patsubst $(SRC_DIR)/%.c,$(OBJ_DIR)/%.obj,$(SRC_FILES))

 $(OBJ_DIR)/%.obj: $(SRC_DIR)/%.c
 	$(CC) $(CFLAGS) -c -o $@ $<

-.PHONY: ciabatta.lib
+ciabatta.lib: $(OBJ_FILES)
+	llvm-ar rc $@ $^
+
+test: ciabatta.lib
+	clang test/test_$(test).c ciabatta.lib -std=c11 $(LIBS) -nostdlib -Iinc
+
+clean:
+	rd/s/q bin || true
+	rm -Rf bin || true
+	mkdir bin
+	mkdir bin/code
+	mkdir bin/win
+	mkdir bin/linux
+
+.PHONY: ciabatta.lib test
--- a/inc/uchar.h
+++ b/inc/uchar.h
@ -4,10 +4,10 @@
 #include <stddef.h>
 #include <stdint.h>

-typedef struct mbstate_t mbstate_t;
 typedef uint_least16_t char16_t;
 typedef uint_least32_t char32_t;

+typedef struct mbstate_t mbstate_t;
 struct mbstate_t {
    char16_t leftover;
 };
--- a/inc/unicode.h
+++ b/inc/unicode.h
@ -0,0 +1,57 @@
+
+#pragma once
+
+#include <stdint.h>
+#include <stddef.h>
+
+typedef uint_least16_t char16_t;
+typedef uint_least32_t char32_t;
+
+typedef int32_t uchar_t;
+// Category of a code point. The enumerator suffixes follow the two-letter
+// Unicode General_Category abbreviations; UCHAR_Invalid comes first and so
+// has the value 0.
+typedef enum {
+    UCHAR_Invalid, // not a general category; presumably "no valid code point"
+    UCHAR_Cc, // Other: control
+    UCHAR_Cf, // Other: format
+    UCHAR_Co, // Other: private use
+    UCHAR_Cs, // Other: surrogate
+    UCHAR_Ll, // Letter: lowercase
+    UCHAR_Lm, // Letter: modifier
+    UCHAR_Lo, // Letter: other
+    UCHAR_Lt, // Letter: titlecase
+    UCHAR_Lu, // Letter: uppercase
+    UCHAR_Mc, // Mark: spacing
+    UCHAR_Me, // Mark: enclosing
+    UCHAR_Mn, // Mark: nonspacing
+    UCHAR_Nd, // Number: decimal digit
+    UCHAR_Nl, // Number: letter
+    UCHAR_No, // Number: other
+    UCHAR_Pc, // Punctuation: connector
+    UCHAR_Pd, // Punctuation: dash
+    UCHAR_Pe, // Punctuation: close
+    UCHAR_Pf, // Punctuation: final quote
+    UCHAR_Pi, // Punctuation: initial quote
+    UCHAR_Po, // Punctuation: other
+    UCHAR_Ps, // Punctuation: open
+    UCHAR_Sc, // Symbol: currency
+    UCHAR_Sk, // Symbol: modifier
+    UCHAR_Sm, // Symbol: math
+    UCHAR_So, // Symbol: other
+    UCHAR_Zl, // Separator: line
+    UCHAR_Zp, // Separator: paragraph
+    UCHAR_Zs, // Separator: space
+} uchar_class;
+
+// Code point queries. uni_classify, uni_to_lower and uni_to_upper are
+// defined in src/code/unicode.c as lookups in the generated code point table.
+// NOTE(review): uni_classify returns int; presumably one of the uchar_class
+// values -- confirm. No definition of uni_valid is visible next to the other
+// three -- confirm where it is implemented.
+int uni_classify(uchar_t ch);
+int uni_valid(uchar_t ch);
+uchar_t uni_to_lower(uchar_t u);
+uchar_t uni_to_upper(uchar_t u);
+
+// Decoders. Presumably each reads one code point from the string and stores
+// it in *ch; the _s variants additionally take the available length.
+// NOTE(review): the meaning of the int return value is not defined in this
+// header -- confirm against the implementation. The UTF-16 variants take
+// char pointers rather than char16_t pointers.
+int utf8_dec   (char const *restrict utf8_str,  uchar_t *restrict ch);
+int utf16_dec  (char const *restrict utf16_str, uchar_t *restrict ch);
+int utf8_dec_s (char const *restrict utf8_str,  size_t len, uchar_t *restrict ch);
+int utf16_dec_s(char const *restrict utf16_str, size_t len, uchar_t *restrict ch);
+
+// Encoders. Presumably each writes the encoding of ch to the string; the _s
+// variants additionally take the available length.
+// NOTE(review): return value meaning is not defined in this header -- confirm.
+int utf8_enc   (char *utf8_str,  uchar_t ch);
+int utf16_enc  (char *utf16_str, uchar_t ch);
+int utf8_enc_s (char *utf8_str,  size_t len, uchar_t ch);
+int utf16_enc_s(char *utf16_str, size_t len, uchar_t ch);
--- a/src/code/fenv.c
+++ b/src/code/fenv.c
@ -1,6 +1,11 @@

 #include <fenv.h>
-#include <intrin.h>
+
+#if defined(_WIN32)
+    #include <intrin.h>
+#else
+    #include <x86intrin.h>
+#endif

 #define fe_masks(excepts) (((fexcept_t)(excepts)) << 7)
 #define fe_flags(excepts) ((fexcept_t)(excepts))
--- a/src/code/unicode.c
+++ b/src/code/unicode.c
@ -0,0 +1,17 @@
+
+#include <unicode.h>
+
+#include "unicode/data.h"
+
+// Returns the category stored for cp in the generated table from
+// unicode/data.h. Values that cannot be used as a table index (negative or
+// past the last entry) yield UCHAR_Invalid instead of an out-of-bounds read.
+// NOTE(review): this assumes the table position equals the code point --
+// confirm that the generated table has no gaps.
+int uni_classify(uchar_t cp) {
+    size_t const count = sizeof uni_codepoints / sizeof uni_codepoints[0];
+    if (cp < 0 || (size_t)cp >= count) {
+        return UCHAR_Invalid;
+    }
+    return uni_codepoints[cp].cat;
+}
+
+// Returns the lower-case mapping stored for cp in the generated table.
+// Values that cannot be used as a table index (negative or past the last
+// entry) are returned unchanged instead of causing an out-of-bounds read.
+uchar_t uni_to_lower(uchar_t cp) {
+    size_t const count = sizeof uni_codepoints / sizeof uni_codepoints[0];
+    if (cp < 0 || (size_t)cp >= count) {
+        return cp;
+    }
+    return uni_codepoints[cp].lower;
+}
+
+// Returns the upper-case mapping stored for cp in the generated table.
+// Values that cannot be used as a table index (negative or past the last
+// entry) are returned unchanged instead of causing an out-of-bounds read.
+uchar_t uni_to_upper(uchar_t cp) {
+    size_t const count = sizeof uni_codepoints / sizeof uni_codepoints[0];
+    if (cp < 0 || (size_t)cp >= count) {
+        return cp;
+    }
+    return uni_codepoints[cp].upper;
+}
+
--- a/src/code/unicode/compile.py
+++ b/src/code/unicode/compile.py
@ -9,42 +9,14 @@ os.chdir(dname)
 with open('data.h', 'w') as header:
    header.write('\n');
    header.write('#pragma once\n\n');
-    header.write('#define Cc  0\n');
-    header.write('#define Cf  1\n');
-    header.write('#define Co  2\n');
-    header.write('#define Cs  3\n');
-    header.write('#define Ll  4\n');
-    header.write('#define Lm  5\n');
-    header.write('#define Lo  6\n');
-    header.write('#define Lt  7\n');
-    header.write('#define Lu  8\n');
-    header.write('#define Mc  9\n');
-    header.write('#define Me 10\n');
-    header.write('#define Mn 11\n');
-    header.write('#define Nd 12\n');
-    header.write('#define Nl 13\n');
-    header.write('#define No 14\n');
-    header.write('#define Pc 15\n');
-    header.write('#define Pd 16\n');
-    header.write('#define Pe 17\n');
-    header.write('#define Pf 18\n');
-    header.write('#define Pi 19\n');
-    header.write('#define Po 20\n');
-    header.write('#define Ps 21\n');
-    header.write('#define Sc 22\n');
-    header.write('#define Sk 23\n');
-    header.write('#define Sm 24\n');
-    header.write('#define So 25\n');
-    header.write('#define Zl 26\n');
-    header.write('#define Zp 27\n');
-    header.write('#define Zs 28\n');
+    header.write('#include<unicode.h>\n');
    header.write(
 '''
 struct _uni_elm {
-    wint_t code;
-    wint_t cat;
-    wint_t lower;
-    wint_t upper;
+    uchar_t code;
+    int     cat;
+    uchar_t lower;
+    uchar_t upper;
 } uni_codepoints[] = {
 ''');

@ -60,10 +32,11 @@ struct _uni_elm {
            if upper == '' or upper == '\n':
                upper = code
            header.write('    {' + \
-                '0x' + code  + ', ' + \
-                cat          + ', ' + \
-                '0x' + lower + ', ' + \
-                '0x' + upper + '},\n');
+                '0x'     + code  + ', ' + \
+                'UCHAR_' + cat   + ', ' + \
+                '0x'     + lower + ', ' + \
+                '0x'     + upper + '},\n'
+            );

    header.write('};\n\n');
    header.close();
--- a/src/code/unicode/data.h
+++ b/src/code/unicode/data.h
--- a/src/code/wctype.c
+++ b/src/code/wctype.c
@ -2,11 +2,7 @@
 #include <wctype.h>
 #include <string.h>

-#include "unicode/data.h"
-
-static inline int char_cat(wint_t wc) {
-    return uni_codepoints[wc].cat;
-}
+#include <unicode.h>

 int iswctype(wint_t wc, wctype_t desc) {
    return desc(wc);
@ -51,7 +47,7 @@ int iswblank(wint_t wc) {
 }

 int iswcntrl(wint_t wc) {
-    return char_cat(wc) == Cc;
+    return uni_classify(wc) == UCHAR_Cc;
 }

 int iswdigit(wint_t wc) {
@ -63,33 +59,33 @@ int iswgraph(wint_t wc) {
 }

 int iswlower(wint_t wc) {
-    return char_cat(wc) == Ll;
+    return uni_classify(wc) == UCHAR_Ll;
 }

 int iswprint(wint_t wc) {
-    switch(char_cat(wc)) {
-        case Cc:
-        case Cf:
-        case Co:
-        case Cs:
+    switch(uni_classify(wc)) {
+        case UCHAR_Cc:
+        case UCHAR_Cf:
+        case UCHAR_Co:
+        case UCHAR_Cs:
            return 0;
    }
    return 1;
 }

 int iswpunct(wint_t wc) {
-    switch(char_cat(wc)) {
-        case Pc:
-        case Pd:
-        case Pe:
-        case Pf:
-        case Pi:
-        case Po:
-        case Ps:
-        case Sk:
-        case Sc:
-        case Sm:
-        case So:
+    switch(uni_classify(wc)) {
+        case UCHAR_Pc:
+        case UCHAR_Pd:
+        case UCHAR_Pe:
+        case UCHAR_Pf:
+        case UCHAR_Pi:
+        case UCHAR_Po:
+        case UCHAR_Ps:
+        case UCHAR_Sk:
+        case UCHAR_Sc:
+        case UCHAR_Sm:
+        case UCHAR_So:
            return 1;
    }
    return 0;
@ -109,7 +105,7 @@ int iswspace(wint_t wc) {
 }

 int iswupper(wint_t wc) {
-    return char_cat(wc) == Lu;
+    return uni_classify(wc) == UCHAR_Lu;
 }

 int iswxdigit(wint_t wc) {
@ -117,9 +113,9 @@ int iswxdigit(wint_t wc) {
 }

 wint_t towlower(wint_t wc) {
-    return uni_codepoints[wc].lower;
+    return uni_to_lower(wc);
 }

 wint_t towupper(wint_t wc) {
-    return uni_codepoints[wc].upper;
+    return uni_to_upper(wc);
 }
--- a/src/linux/linux_environment.c
+++ b/src/linux/linux_environment.c
@ -0,0 +1,68 @@
+
+#include <unistd.h>
+#include <stddef.h>
+
+#include <stdlib.h>
+#include <signal.h>
+#include <locale.h>
+
+// Exit routines
+#define ATEXIT_FUNC_COUNT  64
+#define ATQEXIT_FUNC_COUNT 64
+static void (*atexit_funcs [ATEXIT_FUNC_COUNT])(void);
+static void (*atqexit_funcs[ATQEXIT_FUNC_COUNT])(void);
+static int atexit_func_count;
+static int atqexit_func_count;
+
+extern int main(int argc, char** argv);
+
+// Process entry point: sets up the runtime defaults, calls main and ends the
+// process with main's return value.
+// NOTE(review): argc/argv are hard-coded to an empty argument list; the real
+// command line is not forwarded to main here.
+void _start() {
+
+    // Before any explicit srand call, rand must behave as if seeded with 1.
+    srand(1);
+    setlocale(LC_ALL, "C");
+
+    int argc = 0;
+    char *argv[1] = {NULL};
+    int code = main(argc, argv);
+    // Returning from main is equivalent to calling exit(), so the atexit
+    // handlers must run; _exit() would skip them.
+    exit(code);
+}
+
+// Calls the handlers registered through at_quick_exit, most recently
+// registered first, and then ends the process via _exit with status.
+_Noreturn void quick_exit(int status) {
+    // The counter is decremented before each call, so it already names the
+    // slot of the handler about to run.
+    int pending = atqexit_func_count--;
+    while (pending != 0) {
+        atqexit_funcs[atqexit_func_count]();
+        pending = atqexit_func_count--;
+    }
+    _exit(status);
+}
+
+// Calls the handlers registered through atexit, most recently registered
+// first, and then ends the process via _exit with status.
+_Noreturn void exit(int status) {
+    while(atexit_func_count--) {
+        // Index with the atexit counter; atqexit_func_count tracks the
+        // separate at_quick_exit table.
+        atexit_funcs[atexit_func_count]();
+    }
+    // _close_io();
+    _exit(status);
+}
+
+// Ends the process with status by calling _exit directly; this function does
+// not invoke any of the atexit or at_quick_exit handlers.
+_Noreturn void _Exit(int status) {
+    _exit(status);
+}
+
+// Ends the process via _exit with the fixed status -69.
+// No SIGABRT is raised here: the raise call below is commented out.
+_Noreturn void abort(void) {
+    // raise(SIGABRT);
+    _exit(-69);
+}
+
+// Registers func to be called by exit(). Returns zero when the handler was
+// stored and a nonzero value when the table (ATEXIT_FUNC_COUNT entries) is
+// already full, matching the return convention of the C standard.
+int atexit(void (*func)(void)) {
+    if (atexit_func_count >= ATEXIT_FUNC_COUNT) {
+        return 1;
+    }
+    atexit_funcs[atexit_func_count++] = func;
+    return 0;
+}
+
+// Registers func to be called by quick_exit(). Returns zero when the handler
+// was stored and a nonzero value when the table (ATQEXIT_FUNC_COUNT entries)
+// is already full, matching the return convention of the C standard.
+int at_quick_exit(void (*func)(void)) {
+    if(atqexit_func_count >= ATQEXIT_FUNC_COUNT) {
+        return 1;
+    }
+    atqexit_funcs[atqexit_func_count++] = func;
+    return 0;
+}