Unicode shit (wctype.h)

2022-06-16 18:49:46 +11:00 · 2022-06-16 18:49:46 +11:00 · 0ef1894917
parent 293b812a03
commit 0ef1894917
9 changed files with 69534 additions and 3 deletions
--- a/bake.cmd
+++ b/bake.cmd
@ -51,5 +51,5 @@ del build\*.obj

 :skip_crt_compilation
 echo Compiling test..
-clang -fno-builtin test\test_printf.c ciabatta.lib -std=c11 -lkernel32 -luser32 -lshell32 -nostdlib %CIABATTA_OPTIONS%
+clang -fno-builtin test\test_wctype.c ciabatta.lib -std=c11 -lkernel32 -luser32 -lshell32 -nostdlib %CIABATTA_OPTIONS%
 ::cl test\test_math.c /Iinc -D_CRT_SECURE_NO_WARNINGS /Z7 /link ciabatta.lib kernel32.lib user32.lib shell32.lib -nostdlib -nodefaultlibs
--- a/code/unicode/readme
+++ b/code/unicode/readme
@ -0,0 +1,11 @@
+
+The Unicode-aware functions are driven by the official Unicode character data.
+You can find the file with that data at:
+
+    https://unicode.org/Public/UNIDATA/UnicodeData.txt
+
+This file is placed into this directory under the name unicode_data.txt. To
+update to a newer Unicode standard, put the new file under that name, then run
+unicode_compile.py with the Python interpreter. It will generate a new
+unicode.h header file.
+
+DO NOT MODIFY unicode.h DIRECTLY BRUH.
--- a/code/unicode/unicode.h
+++ b/code/unicode/unicode.h
--- a/code/unicode/unicode_compile.py
+++ b/code/unicode/unicode_compile.py
@ -0,0 +1,62 @@
+
+import os
+import sys
+
+# Work from the directory that contains this script so that unicode_data.txt
+# is found and unicode.h is written next to it, whatever the caller's working
+# directory is.
+abspath = os.path.abspath(sys.argv[0])
+dname = os.path.dirname(abspath)
+os.chdir(dname)
+
+# General-category names; the position in this list is the numeric id the
+# generated header assigns to the category.
+CATEGORIES = [
+    'Cc', 'Cf', 'Co', 'Cs',
+    'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
+    'Mc', 'Me', 'Mn',
+    'Nd', 'Nl', 'No',
+    'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
+    'Sc', 'Sk', 'Sm', 'So',
+    'Zl', 'Zp', 'Zs',
+]
+
+# Number of ';'-separated fields in a complete unicode_data.txt record.
+FIELD_COUNT = 15
+
+with open('unicode.h', 'w') as header:
+    header.write('\n')
+    header.write('#pragma once\n\n')
+    for index, name in enumerate(CATEGORIES):
+        header.write('#define %s %2d\n' % (name, index))
+    header.write('\n')
+    header.write('#define UNI_TAB \\\n')
+
+    with open('unicode_data.txt') as file:
+        for line in file:
+            row = line.split(';')
+            # Skip blank or truncated lines instead of failing on an index.
+            if len(row) < FIELD_COUNT:
+                continue
+            code = row[0].strip()
+            cat = row[2].strip()
+            # Field 12 is the simple uppercase mapping and field 13 the simple
+            # lowercase mapping; field 14 is the titlecase mapping and must
+            # not be used for the upper-case column.
+            upper = row[12].strip()
+            lower = row[13].strip()
+            # A character without a mapping maps to itself.
+            if lower == '':
+                lower = code
+            if upper == '':
+                upper = code
+            # One X(code, category, lower, upper) entry per record, each ending
+            # in a line continuation so they all belong to UNI_TAB.
+            header.write('    X(0x%s, %s, 0x%s, 0x%s)\\\n'
+                         % (code, cat, lower, upper))
+
+    # Blank line that terminates the UNI_TAB macro definition.
+    header.write('\n')
--- a/code/unicode/unicode_data.txt
+++ b/code/unicode/unicode_data.txt
--- a/code/unicode/wctype.c
+++ b/code/unicode/wctype.c
@ -0,0 +1,127 @@
+
+#include <wctype.h>
+#include <string.h>
+
+#include "unicode.h"
+
+// Looks wc up in UNI_TAB and returns the category id stored in its entry
+// (one of the Cc..Zs constants from unicode.h). Returns -1 when the table
+// has no entry for wc.
+#define X(code, cat, l, u) case code: category = cat; break;
+static inline int char_cat(wint_t wc) {
+    int category = -1;
+    switch(wc) {
+        UNI_TAB
+        default:
+            break;
+    }
+    return category;
+}
+#undef X
+
+// Applies the classification function identified by desc to wc and returns
+// its result. desc is a handle obtained from wctype(); because wctype()
+// returns NULL for an unknown property name, a NULL desc is treated as
+// "no class matches" and yields 0 instead of calling through a null pointer.
+int iswctype(wint_t wc, wctype_t desc) {
+    if(desc == NULL) {
+        return 0;
+    }
+    return desc(wc);
+}
+
+// Maps a character-class name ("alnum", "alpha", ..., "xdigit") to the
+// matching isw* function. Returns NULL when the name is not recognised.
+wctype_t wctype(const char *property) {
+    static const struct {
+        const char *name;
+        wctype_t    test;
+    } classes[] = {
+        { "alnum",  iswalnum  },
+        { "alpha",  iswalpha  },
+        { "blank",  iswblank  },
+        { "cntrl",  iswcntrl  },
+        { "digit",  iswdigit  },
+        { "graph",  iswgraph  },
+        { "lower",  iswlower  },
+        { "print",  iswprint  },
+        { "punct",  iswpunct  },
+        { "space",  iswspace  },
+        { "upper",  iswupper  },
+        { "xdigit", iswxdigit },
+    };
+    for(size_t i = 0; i < sizeof classes / sizeof classes[0]; ++i) {
+        if(strcmp(property, classes[i].name) == 0) {
+            return classes[i].test;
+        }
+    }
+    return NULL;
+}
+
+// Applies the mapping function identified by desc to wc and returns the
+// mapped character. desc is a handle obtained from wctrans(); because
+// wctrans() returns NULL for an unknown property name, a NULL desc leaves wc
+// unchanged instead of calling through a null pointer.
+wint_t towctrans(wint_t wc, wctrans_t desc) {
+    if(desc == NULL) {
+        return wc;
+    }
+    return desc(wc);
+}
+
+// Maps a transformation name to its mapping function: "tolower" gives
+// towlower and "toupper" gives towupper. Any other name gives NULL.
+wctrans_t wctrans(const char *property) {
+    wctrans_t mapping = NULL;
+    if(strcmp(property, "tolower") == 0) {
+        mapping = towlower;
+    }
+    else if(strcmp(property, "toupper") == 0) {
+        mapping = towupper;
+    }
+    return mapping;
+}
+
+// Returns 1 when wc satisfies iswalpha() or iswdigit(), otherwise 0.
+int iswalnum(wint_t wc) {
+    if(iswalpha(wc)) {
+        return 1;
+    }
+    return iswdigit(wc) != 0;
+}
+
+// Returns 1 when wc is a letter, otherwise 0.
+// Every Unicode letter category counts: besides upper-case (Lu) and
+// lower-case (Ll) letters this includes title-case (Lt), modifier (Lm) and
+// other (Lo) letters, which covers scripts that have no case distinction.
+int iswalpha(wint_t wc) {
+    switch(char_cat(wc)) {
+        case Ll:
+        case Lm:
+        case Lo:
+        case Lt:
+        case Lu:
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+// Returns 1 when wc is a space or a horizontal tab, otherwise 0.
+int iswblank(wint_t wc) {
+    switch(wc) {
+        case ' ':
+        case '\t':
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+// Returns 1 when the category of wc is Cc (control), otherwise 0.
+int iswcntrl(wint_t wc) {
+    int category = char_cat(wc);
+    return category == Cc;
+}
+
+// Returns 1 when wc is one of the characters '0'..'9', otherwise 0.
+int iswdigit(wint_t wc) {
+    if(wc < '0' || wc > '9') {
+        return 0;
+    }
+    return 1;
+}
+
+// Returns 1 when wc satisfies iswprint() and does not satisfy iswspace(),
+// otherwise 0.
+int iswgraph(wint_t wc) {
+    if(!iswprint(wc)) {
+        return 0;
+    }
+    return !iswspace(wc);
+}
+
+// Returns 1 when the category of wc is Ll (lower-case letter), otherwise 0.
+int iswlower(wint_t wc) {
+    int category = char_cat(wc);
+    return category == Ll;
+}
+
+// Returns 0 when the category of wc is one of Cc, Cf, Co or Cs, and 1 for
+// everything else. "Everything else" includes characters for which
+// char_cat() finds no table entry and returns -1.
+int iswprint(wint_t wc) {
+    int category = char_cat(wc);
+    int is_other = category == Cc
+                || category == Cf
+                || category == Co
+                || category == Cs;
+    return !is_other;
+}
+
+// Returns 1 when wc is a punctuation or symbol character, otherwise 0.
+// Besides the punctuation categories (Pc, Pd, Pe, Pf, Pi, Po, Ps) the symbol
+// categories (Sc, Sk, Sm, So) are accepted as well: ASCII characters such as
+// '+', '$', '<', '=', '>', '^', '`', '|' and '~' are classified as symbols
+// by Unicode, yet they are punctuation as far as the C character classes
+// are concerned.
+int iswpunct(wint_t wc) {
+    switch(char_cat(wc)) {
+        case Pc:
+        case Pd:
+        case Pe:
+        case Pf:
+        case Pi:
+        case Po:
+        case Ps:
+        case Sc:
+        case Sk:
+        case Sm:
+        case So:
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+// Returns 1 when wc is a white-space character, otherwise 0.
+// The standard white-space controls (tab, newline, vertical tab, form feed,
+// carriage return) have category Cc, so they are matched explicitly. The
+// remaining white space is recognised by category: space separators (Zs),
+// line separators (Zl) and paragraph separators (Zp).
+int iswspace(wint_t wc) {
+    switch(wc) {
+        case '\t':
+        case '\n':
+        case '\v':
+        case '\f':
+        case '\r':
+            return 1;
+        default:
+            break;
+    }
+    switch(char_cat(wc)) {
+        case Zs:
+        case Zl:
+        case Zp:
+            return 1;
+        default:
+            return 0;
+    }
+}
+
+// Returns 1 when the category of wc is Lu (upper-case letter), otherwise 0.
+int iswupper(wint_t wc) {
+    int category = char_cat(wc);
+    return category == Lu;
+}
+
+// Returns 1 when wc is a hexadecimal digit: a character accepted by
+// iswdigit(), or a letter in 'a'..'f' or 'A'..'F'. Returns 0 otherwise.
+int iswxdigit(wint_t wc) {
+    if(iswdigit(wc)) {
+        return 1;
+    }
+    if(wc >= 'a' && wc <= 'f') {
+        return 1;
+    }
+    return wc >= 'A' && wc <= 'F';
+}
+
+// Returns the `l` field of the UNI_TAB entry for wc, or wc itself when the
+// table has no entry for it.
+wint_t towlower(wint_t wc) {
+    wint_t mapped = wc;
+    #define X(code, cat, l, u) case code: mapped = l; break;
+    switch(wc) {
+        UNI_TAB
+        default:
+            break;
+    }
+    #undef X
+    return mapped;
+}
+
+// Returns the `u` field of the UNI_TAB entry for wc, or wc itself when the
+// table has no entry for it.
+wint_t towupper(wint_t wc) {
+    wint_t mapped = wc;
+    #define X(code, cat, l, u) case code: mapped = u; break;
+    switch(wc) {
+        UNI_TAB
+        default:
+            break;
+    }
+    #undef X
+    return mapped;
+}
--- a/inc/wctype.h
+++ b/inc/wctype.h
@ -2,8 +2,8 @@
 #pragma once

 typedef int wint_t;
-wctrans_t;
-wctype_t;
+// Handle returned by wctrans(): a pointer to a wide-character mapping
+// function. The return type is wint_t so that it matches the mapping
+// functions the handle points to.
+typedef wint_t (*wctrans_t)(wint_t wc);
+// Handle returned by wctype(): a pointer to a wide-character classification
+// function.
+typedef int (*wctype_t)(wint_t wc);

 #ifndef WEOF
 	#define WEOF 0
--- a/test/test_wctype.c
+++ b/test/test_wctype.c
@ -0,0 +1,10 @@
+
+#include <wctype.h>
+#include <stdio.h>
+
+// Smoke test for iswalpha(): the Cyrillic letter 'я' must be classified as a
+// letter. Returns 0 on success and 1 on failure, so a failed check is
+// visible both in the output and in the exit status.
+int main(void) {
+    if(!iswalpha(L'я')) {
+        printf("Symbol 'я' was not classified as a letter\n");
+        return 1;
+    }
+    printf("Symbol 'я' is indeed a letter\n");
+    return 0;
+}
--- a/unicode.h
+++ b/unicode.h
@ -0,0 +1,34 @@
+
+#pragma once
+
+#define Cc  0
+#define Cf  1
+#define Co  2
+#define Cs  3
+#define Ll  4
+#define Lm  5
+#define Lo  6
+#define Lt  7
+#define Lu  8
+#define Mc  9
+#define Me 10
+#define Mn 11
+#define Nd 12
+#define Nl 13
+#define No 14
+#define Pc 15
+#define Pd 16
+#define Pe 17
+#define Pf 18
+#define Pi 19
+#define Po 20
+#define Ps 21
+#define Sc 22
+#define Sk 23
+#define Sm 24
+#define So 25
+#define Zl 26
+#define Zp 27
+#define Zs 28
+
+#define UNI_TAB \