mirror of https://github.com/flysand7/ciabatta.git
Remove submodule dependencies
This commit is contained in:
parent
ba774149f7
commit
c93371977a
|
@ -1 +1 @@
|
|||
clang test\test_%test%.c src\libwinsane\libwinsane.obj -Iinc -g -lciabatta.lib
|
||||
clang test\test_%test%.c utf8.obj -Iinc -g -lciabatta.lib
|
11
bake.cmd
11
bake.cmd
|
@ -1,4 +1,13 @@
|
|||
|
||||
:: Compile UTF-8 resource into .obj file
|
||||
:: this .obj file has to be linked to the executable using it, NOT archived
|
||||
:: together with ciabatta.lib.
|
||||
windres -o utf8.obj utf8.rc
|
||||
ld -relocatable -o libwinsane.obj utf8.obj
|
||||
|
||||
:: Compile chkstk
|
||||
nasm src\_win\chkstk.asm -ochkstk.o -fwin64
|
||||
|
||||
:: Compile the rest of the party
|
||||
clang -Wall src\ciabatta.c -o ciabatta.obj -c -DCIABATTA_WIN -I inc -I src\_win -nodefaultlibs -g -mfma
|
||||
lib /nologo /out:ciabatta.lib chkstk.o ciabatta.obj src\fdec64\fdec64.lib src\unicope\unicope.lib
|
||||
lib /nologo /out:ciabatta.lib chkstk.o ciabatta.obj
|
17
readme
17
readme
|
@ -32,12 +32,17 @@ PLATFORM SUPPORT
|
|||
- x86-64
|
||||
|
||||
USAGE
|
||||
NOTE: libwinsane.obj can be obtained by running:
|
||||
bake -DNO_CRT
|
||||
In the libwinsane directory.
|
||||
Add the following flags to your compilation command:
|
||||
-I <path/to/ciabatta/inc> libwinsane.obj -nostdlib -mfma
|
||||
Don't forget to link to the following libraries:
|
||||
Note that the library can only be used with clang
|
||||
Once MSVC compiler finally decides to support C11 atomic types I'll consider
|
||||
supporting MSVC, until then clang is your only option
|
||||
1. Run bake.cmd
|
||||
2. Make sure you've got the following in some folder:
|
||||
- The inc folder
|
||||
- The ciabatta.lib archive file
|
||||
- The utf8.obj object file
|
||||
3. Add the following flags to your compilation command:
|
||||
-I <path/to/ciabatta/inc> utf8.obj -nostdlib -mfma
|
||||
4. Don't forget to link to the following libraries:
|
||||
-lciabatta.lib
|
||||
|
||||
CONTRIBUTING
|
||||
|
|
|
@ -79,6 +79,7 @@ void thrd_yield(void) {
|
|||
_Noreturn void thrd_exit(int res) {
|
||||
// TODO(NeGate): setup TSS dtors here
|
||||
ExitThread((DWORD)res);
|
||||
__builtin_unreachable();
|
||||
}
|
||||
|
||||
void mtx_destroy(mtx_t *mtx) {
|
||||
|
|
|
@ -38,8 +38,8 @@
|
|||
#include "intrin.h"
|
||||
|
||||
// Dependencies
|
||||
#include "fdec64/fdec64.h"
|
||||
#include "unicope/inc/unicope.h"
|
||||
#include "decfloat/decfloat.c"
|
||||
#include "decfloat/decfloat.h"
|
||||
|
||||
// Platform-independent stuff
|
||||
#include "fmt/gen_fmt.c"
|
||||
|
|
|
@ -1,10 +1,6 @@
|
|||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <fdec64.h>
|
||||
#include <fdec64_table.h>
|
||||
#include <intrin.h>
|
||||
#include "decfloat.h"
|
||||
#include "decfloat_table.h"
|
||||
|
||||
#define DOUBLE_MANTISSA_BITS 52
|
||||
#define DOUBLE_EXPONENT_BITS 11
|
||||
|
@ -139,7 +135,7 @@ static inline uint64_t div1e8(const uint64_t x) {
|
|||
|
||||
|
||||
|
||||
fdec64 dtofdec64(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
|
||||
decfloat_t dtodecfloat(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
|
||||
int32_t e2;
|
||||
uint64_t m2;
|
||||
if (ieeeExponent == 0) {
|
||||
|
@ -304,7 +300,7 @@ fdec64 dtofdec64(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
|
|||
}
|
||||
const int32_t exp = e10 + removed;
|
||||
|
||||
fdec64 fd;
|
||||
decfloat_t fd;
|
||||
fd.exponent = exp;
|
||||
fd.mantissa = output;
|
||||
return fd;
|
|
@ -0,0 +1,9 @@
|
|||
|
||||
#pragma once

// The fixed-width integer types used below come from <stdint.h>; include it
// here so this header can be included on its own.
#include <stdint.h>

// Decimal floating-point value: a 64-bit mantissa paired with a 32-bit
// exponent.
// NOTE(review): presumably the value represented is mantissa * 10^exponent
// (the name suggests a decimal float) -- confirm against decfloat.c.
typedef struct {
    uint64_t mantissa;
    int32_t exponent;
} decfloat_t;

// Converts the raw mantissa and exponent fields of an IEEE double into a
// decfloat_t.
// NOTE(review): the definition visible in decfloat.c is spelled
// `dtodecfloat`, while this declaration and the printf callers use
// `todecfloat` -- confirm the two names are reconciled somewhere.
decfloat_t todecfloat(const uint64_t ieeeMant, const uint32_t ieeeExp);
|
|
@ -1,9 +0,0 @@
|
|||
ciabatta
|
||||
*.lib
|
||||
test.c
|
||||
*.exe
|
||||
*.ilk
|
||||
*.obj
|
||||
*.pdb
|
||||
*.obj
|
||||
bin
|
|
@ -1,3 +0,0 @@
|
|||
Ulf Adams <ulfjack@google.com>
|
||||
Stephan T. Lavavej <stl@microsoft.com>
|
||||
Alexander Bolz <alexbolz@web.de>
|
|
@ -1,68 +0,0 @@
|
|||
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
# General compile options
|
||||
|
||||
platform = 'win'
|
||||
|
||||
definitions = []
|
||||
inc_folders = ['.']
|
||||
|
||||
|
||||
# Compiler-specific options
|
||||
|
||||
clang_dbg_flags = ['-g', '-gcodeview']
|
||||
clang_common_flags = ['-c', '-nodefaultlibs', '-mfma']
|
||||
|
||||
#----------------------------------------------------------------------------#
|
||||
# Map lists to lists of options
|
||||
inc_flags = []
|
||||
def_flags = []
|
||||
def compile(root, cmap):
    """Walk *root* and hand every file with a known extension to its compiler.

    *cmap* maps a file extension (including the leading dot, e.g. ``'.c'``)
    to a callable that receives the path of the file to compile.

    As a side effect the module-level ``inc_flags`` and ``def_flags`` lists
    are rebuilt from ``inc_folders`` and ``definitions`` before any file is
    processed, so the per-file compile functions can read them.
    """
    global inc_flags
    global def_flags
    inc_flags = ['-I ' + folder for folder in inc_folders]
    def_flags = ['-D' + macro for macro in definitions]
    for dir_path, _subdirs, file_names in os.walk(root):
        for file_name in file_names:
            source_path = os.path.join(dir_path, file_name)
            extension = os.path.splitext(source_path)[1]
            if extension in cmap:
                cmap[extension](source_path)
|
||||
|
||||
def get_bin_path(file_path):
    """Return the ``.obj`` path under ``bin`` for the source at *file_path*.

    The first component of the normalised path is dropped, the rest is
    re-rooted under ``bin`` and the extension is replaced with ``.obj``.
    The directory that will hold the object file is created if missing.
    """
    components = os.path.normpath(file_path).split(os.path.sep)
    # Everything after the leading path component is mirrored under bin/.
    relative = os.path.sep.join(components[1:])
    stem = os.path.splitext(relative)[0]
    obj_path = os.path.join('bin', stem + '.obj')
    os.makedirs(os.path.dirname(obj_path), exist_ok=True)
    return obj_path
|
||||
|
||||
def clang_compile(file_name):
    """Compile the C source *file_name* with clang into its ``bin`` object.

    The output path comes from ``get_bin_path``; the flags are the
    module-level debug, common, include and definition flag lists.

    The argument vector is built directly as a list. The previous version
    joined everything into one string and split it on spaces again, which
    broke any source or output path that contained a space.
    """
    bin_path = get_bin_path(file_name)
    flags = clang_dbg_flags + clang_common_flags + inc_flags + def_flags
    argv = ['clang', file_name, '-o', bin_path]
    for flag in flags:
        if flag.startswith('-I '):
            # Include flags are stored as a single '-I <dir>' string; clang
            # must receive the switch and the directory as two arguments.
            argv.extend(flag.split(' ', 1))
        else:
            argv.append(flag)
    # The exit status is not checked, so the summary line below is printed
    # even if clang reports an error (same as before).
    subprocess.run(argv)
    print(file_name, '=>', bin_path)
|
||||
|
||||
def nasm_compile(file_name):
    """Assemble *file_name* with nasm as a win64 object under ``bin``.

    The output path comes from ``get_bin_path``. The exit status of nasm is
    not inspected.
    """
    obj_path = get_bin_path(file_name)
    argv = ['nasm', file_name, '-f', 'win64', '-o', obj_path]
    subprocess.run(argv)
    print(f'{file_name} => {obj_path}')
|
||||
|
||||
#-----------------------------------------------------------------------------#
|
||||
|
||||
# Compile every C source found under the current directory.
compile_map = {'.c': clang_compile}
compile(os.path.normpath('.'), compile_map)

# Archive the objects: one '*.obj' pattern per non-empty directory under
# bin/. The patterns are handed to llvm-ar as-is (no shell is involved).
obj_paths = [
    os.path.join(dir_path, '*.obj')
    for dir_path, _, file_names in os.walk('bin')
    if file_names
]
subprocess.run(['llvm-ar', 'rc', 'fdec64.lib'] + obj_paths)
|
|
@ -1,9 +0,0 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
typedef struct fdec64 {
|
||||
uint64_t mantissa;
|
||||
int32_t exponent;
|
||||
} fdec64;
|
||||
|
||||
fdec64 dtofdec64(const uint64_t ieeeMant, const uint32_t ieeeExp);
|
|
@ -1,23 +0,0 @@
|
|||
Boost Software License - Version 1.0 - August 17th, 2003
|
||||
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
|
@ -1,4 +0,0 @@
|
|||
|
||||
I stole this code from ryu. Original:
|
||||
|
||||
https://github.com/ulfjack/ryu
|
|
@ -270,7 +270,7 @@ static int pfx(vprintfcb)(
|
|||
E = 0;
|
||||
}
|
||||
else {
|
||||
fdec64 f = dtofdec64(m2, e2);
|
||||
decfloat_t f = todecfloat(m2, e2);
|
||||
E = f.exponent;
|
||||
}
|
||||
}
|
||||
|
@ -697,7 +697,7 @@ static inline int pfx(_dtoa)(
|
|||
exp = 0;
|
||||
}
|
||||
else {
|
||||
fdec64 f = dtofdec64(m2, e2);
|
||||
decfloat_t f = todecfloat(m2, e2);
|
||||
mant = f.mantissa;
|
||||
exp = f.exponent;
|
||||
}
|
||||
|
@ -822,7 +822,7 @@ static inline int pfx(_etoa)(
|
|||
exp = 0;
|
||||
}
|
||||
else {
|
||||
fdec64 f = dtofdec64(m2, e2);
|
||||
decfloat_t f = todecfloat(m2, e2);
|
||||
mant = f.mantissa;
|
||||
exp = f.exponent;
|
||||
}
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
Subproject commit c22973570de5d75eff0f03a823536e781ebdac4c
|
240
src/uchar.c
240
src/uchar.c
|
@ -1,126 +1,126 @@
|
|||
|
||||
size_t mbrtoc16(
|
||||
char16_t *restrict pc16,
|
||||
char const *restrict s,
|
||||
size_t n,
|
||||
mbstate_t *restrict ps
|
||||
) {
|
||||
// Figure out the conversion state
|
||||
static mbstate_t static_mbstate = {0};
|
||||
if(ps == NULL) ps = &static_mbstate;
|
||||
if(s == NULL) {
|
||||
*ps = (mbstate_t) {0xd800};
|
||||
return 0;
|
||||
}
|
||||
// Check leftovers, using 0xd800 as "no leftover" marker because it
|
||||
// doesn't encode a valid character.
|
||||
if(ps->leftover == 0xd800) {
|
||||
// Decode the UTF-8 encoded codepoint
|
||||
char32_t code_point;
|
||||
int mblen = utf8_chdec((char8_t *)s, n, &code_point);
|
||||
if(mblen == UNI_ESTRLN) return (size_t)(-2);
|
||||
if(mblen <= 0) goto invalid_seq;
|
||||
// Encode the codepoint into UTF-16 string
|
||||
char16_t str[2];
|
||||
int c16len = utf16_chenc(str, 2, code_point);
|
||||
if(c16len <= 0) goto invalid_seq;
|
||||
// Assign the decoded UTF-16 character, decide leftover
|
||||
if(pc16 != NULL) *pc16 = str[0];
|
||||
ps->leftover = (c16len == 2? str[1] : 0xd800);
|
||||
return (size_t)mblen;
|
||||
}
|
||||
else {
|
||||
// Otherwise use and reset the leftover
|
||||
if(pc16 != NULL) *pc16 = ps->leftover;
|
||||
ps->leftover = 0xd800;
|
||||
return (size_t)(-3);
|
||||
}
|
||||
invalid_seq:
|
||||
errno = EILSEQ;
|
||||
return (size_t)(-1);
|
||||
}
|
||||
// size_t mbrtoc16(
|
||||
// char16_t *restrict pc16,
|
||||
// char const *restrict s,
|
||||
// size_t n,
|
||||
// mbstate_t *restrict ps
|
||||
// ) {
|
||||
// // Figure out the conversion state
|
||||
// static mbstate_t static_mbstate = {0};
|
||||
// if(ps == NULL) ps = &static_mbstate;
|
||||
// if(s == NULL) {
|
||||
// *ps = (mbstate_t) {0xd800};
|
||||
// return 0;
|
||||
// }
|
||||
// // Check leftovers, using 0xd800 as "no leftover" marker because it
|
||||
// // doesn't encode a valid character.
|
||||
// if(ps->leftover == 0xd800) {
|
||||
// // Decode the UTF-8 encoded codepoint
|
||||
// char32_t code_point;
|
||||
// int mblen = utf8_chdec((char8_t *)s, n, &code_point);
|
||||
// if(mblen == UNI_ESTRLN) return (size_t)(-2);
|
||||
// if(mblen <= 0) goto invalid_seq;
|
||||
// // Encode the codepoint into UTF-16 string
|
||||
// char16_t str[2];
|
||||
// int c16len = utf16_chenc(str, 2, code_point);
|
||||
// if(c16len <= 0) goto invalid_seq;
|
||||
// // Assign the decoded UTF-16 character, decide leftover
|
||||
// if(pc16 != NULL) *pc16 = str[0];
|
||||
// ps->leftover = (c16len == 2? str[1] : 0xd800);
|
||||
// return (size_t)mblen;
|
||||
// }
|
||||
// else {
|
||||
// // Otherwise use and reset the leftover
|
||||
// if(pc16 != NULL) *pc16 = ps->leftover;
|
||||
// ps->leftover = 0xd800;
|
||||
// return (size_t)(-3);
|
||||
// }
|
||||
// invalid_seq:
|
||||
// errno = EILSEQ;
|
||||
// return (size_t)(-1);
|
||||
// }
|
||||
|
||||
|
||||
|
||||
size_t c16rtomb(
|
||||
char *restrict s,
|
||||
char16_t c16,
|
||||
mbstate_t *restrict ps
|
||||
) {
|
||||
// Figure out conversion state
|
||||
static mbstate_t static_mbstate = {0};
|
||||
if(ps == NULL) ps = &static_mbstate;
|
||||
if(s == NULL) {
|
||||
*ps = (mbstate_t) {0xd800};
|
||||
return 0;
|
||||
}
|
||||
char32_t codepoint_to_write;
|
||||
// Check whether a high surrogate was detected in a previous call to the
|
||||
// function. If not, the high_surrogate value is 0xd800
|
||||
if(ps->high_surrogate == 0xd800) {
|
||||
// If c16 is a surrogate record it, or throw an error
|
||||
if(uni_is_hsur(c16)) {
|
||||
ps->high_surrogate = c16;
|
||||
return 0;
|
||||
}
|
||||
else if(uni_is_lsur(c16)) {
|
||||
goto invalid_char;
|
||||
}
|
||||
// We'll just write c16
|
||||
codepoint_to_write = c16;
|
||||
}
|
||||
// If high surrogate exists, the next character must be a low surrogate
|
||||
// so we'll write a codepoint made out of high and low surrogates
|
||||
else if(uni_is_lsur(c16)) {
|
||||
codepoint_to_write = uni_surtoc(ps->high_surrogate, c16);
|
||||
}
|
||||
else goto invalid_char;
|
||||
// Write the codepoint that we decided to write to multibyte string
|
||||
int written_len = utf8_chenc((char8_t *)s, 4, codepoint_to_write);
|
||||
if(written_len < 0) {
|
||||
goto invalid_char;
|
||||
}
|
||||
s[written_len] = 0;
|
||||
return (size_t)written_len;
|
||||
invalid_char:
|
||||
errno = EILSEQ;
|
||||
return (size_t)(-1);
|
||||
}
|
||||
// size_t c16rtomb(
|
||||
// char *restrict s,
|
||||
// char16_t c16,
|
||||
// mbstate_t *restrict ps
|
||||
// ) {
|
||||
// // Figure out conversion state
|
||||
// static mbstate_t static_mbstate = {0};
|
||||
// if(ps == NULL) ps = &static_mbstate;
|
||||
// if(s == NULL) {
|
||||
// *ps = (mbstate_t) {0xd800};
|
||||
// return 0;
|
||||
// }
|
||||
// char32_t codepoint_to_write;
|
||||
// // Check whether a high surrogate was detected in a previous call to the
|
||||
// // function. If not, the high_surrogate value is 0xd800
|
||||
// if(ps->high_surrogate == 0xd800) {
|
||||
// // If c16 is a surrogate record it, or throw an error
|
||||
// if(uni_is_hsur(c16)) {
|
||||
// ps->high_surrogate = c16;
|
||||
// return 0;
|
||||
// }
|
||||
// else if(uni_is_lsur(c16)) {
|
||||
// goto invalid_char;
|
||||
// }
|
||||
// // We'll just write c16
|
||||
// codepoint_to_write = c16;
|
||||
// }
|
||||
// // If high surrogate exists, the next character must be a low surrogate
|
||||
// // so we'll write a codepoint made out of high and low surrogates
|
||||
// else if(uni_is_lsur(c16)) {
|
||||
// codepoint_to_write = uni_surtoc(ps->high_surrogate, c16);
|
||||
// }
|
||||
// else goto invalid_char;
|
||||
// // Write the codepoint that we decided to write to multibyte string
|
||||
// int written_len = utf8_chenc((char8_t *)s, 4, codepoint_to_write);
|
||||
// if(written_len < 0) {
|
||||
// goto invalid_char;
|
||||
// }
|
||||
// s[written_len] = 0;
|
||||
// return (size_t)written_len;
|
||||
// invalid_char:
|
||||
// errno = EILSEQ;
|
||||
// return (size_t)(-1);
|
||||
// }
|
||||
|
||||
size_t mbrtoc32(
|
||||
char32_t *restrict pc32,
|
||||
char const *restrict s,
|
||||
size_t n,
|
||||
mbstate_t *restrict ps
|
||||
) {
|
||||
if(s == NULL) {
|
||||
return 0;
|
||||
}
|
||||
char32_t code_point;
|
||||
int mblen = utf8_chdec((char8_t *)s, n, &code_point);
|
||||
if(mblen == UNI_ESTRLN) return (size_t)(-2);
|
||||
if(mblen <= 0) {
|
||||
errno = EILSEQ;
|
||||
return (size_t)(-1);
|
||||
}
|
||||
*pc32 = code_point;
|
||||
if(code_point == 0) return 0;
|
||||
return (size_t)mblen;
|
||||
}
|
||||
// size_t mbrtoc32(
|
||||
// char32_t *restrict pc32,
|
||||
// char const *restrict s,
|
||||
// size_t n,
|
||||
// mbstate_t *restrict ps
|
||||
// ) {
|
||||
// if(s == NULL) {
|
||||
// return 0;
|
||||
// }
|
||||
// char32_t code_point;
|
||||
// int mblen = utf8_chdec((char8_t *)s, n, &code_point);
|
||||
// if(mblen == UNI_ESTRLN) return (size_t)(-2);
|
||||
// if(mblen <= 0) {
|
||||
// errno = EILSEQ;
|
||||
// return (size_t)(-1);
|
||||
// }
|
||||
// *pc32 = code_point;
|
||||
// if(code_point == 0) return 0;
|
||||
// return (size_t)mblen;
|
||||
// }
|
||||
|
||||
size_t c32rtomb(
|
||||
char *restrict s,
|
||||
char32_t c32,
|
||||
mbstate_t *restrict ps
|
||||
) {
|
||||
if(s == NULL) {
|
||||
*ps = (mbstate_t) {0};
|
||||
return 0;
|
||||
}
|
||||
int mblen = utf8_chenc((char8_t *)s, 4, c32);
|
||||
if(mblen <= 0) {
|
||||
errno = EILSEQ;
|
||||
return (size_t)(-1);
|
||||
}
|
||||
return (size_t)mblen;
|
||||
}
|
||||
// size_t c32rtomb(
|
||||
// char *restrict s,
|
||||
// char32_t c32,
|
||||
// mbstate_t *restrict ps
|
||||
// ) {
|
||||
// if(s == NULL) {
|
||||
// *ps = (mbstate_t) {0};
|
||||
// return 0;
|
||||
// }
|
||||
// int mblen = utf8_chenc((char8_t *)s, 4, c32);
|
||||
// if(mblen <= 0) {
|
||||
// errno = EILSEQ;
|
||||
// return (size_t)(-1);
|
||||
// }
|
||||
// return (size_t)mblen;
|
||||
// }
|
||||
|
|
|
@ -1,3 +0,0 @@
|
|||
*.lib
|
||||
*.obj
|
||||
*.o
|
|
@ -1,9 +0,0 @@
|
|||
@echo off
|
||||
pushd %~pd0
|
||||
if not exist bin mkdir bin
|
||||
pushd bin
|
||||
rem cl ..\src\*.c -c -I ..\inc /EHa-
|
||||
clang ..\src\*.c -c -I ..\inc
|
||||
lib /out:..\unicope.lib *.o
|
||||
popd
|
||||
popd
|
|
@ -1,4 +0,0 @@
|
|||
Copyright © 2000 Sunagatov Denis yyakut.ac@gmail.com
|
||||
This work is free. You can redistribute it and/or modify it under the
|
||||
terms of the Do What The Fuck You Want To Public License, Version 2,
|
||||
as published by Sam Hocevar. See the COPYING file for more details.
|
|
@ -1,128 +0,0 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
typedef unsigned char char8_t;
|
||||
typedef uint_least16_t char16_t;
|
||||
typedef uint_least32_t char32_t;
|
||||
|
||||
#define UNI_EBADCP (-1)
|
||||
#define UNI_EULSUR (-2)
|
||||
#define UNI_EIBYTE (-3)
|
||||
#define UNI_ETBYTE (-4)
|
||||
#define UNI_ESTRLN (-5)
|
||||
#define UNI_EOLONG (-6)
|
||||
|
||||
#define UCHAR_MAJOR_CAT_MASK 0x38
|
||||
#define UCHAR_MINOR_CAT_MASK 0x07
|
||||
#define UCHAR_CAT_GEN_MASK 0x3f
|
||||
#define UCHAR_MIRR_BIT 0x40
|
||||
|
||||
#define UCHAR_C 0x08
|
||||
#define UCHAR_L 0x10
|
||||
#define UCHAR_M 0x18
|
||||
#define UCHAR_N 0x20
|
||||
#define UCHAR_P 0x28
|
||||
#define UCHAR_S 0x30
|
||||
#define UCHAR_Z 0x38
|
||||
|
||||
enum Unicode_General_Category {
|
||||
UCHAR_Cc = UCHAR_C | 0x00,
|
||||
UCHAR_Cf = UCHAR_C | 0x01,
|
||||
UCHAR_Co = UCHAR_C | 0x02,
|
||||
UCHAR_Cs = UCHAR_C | 0x03,
|
||||
UCHAR_Ll = UCHAR_L | 0x00,
|
||||
UCHAR_Lm = UCHAR_L | 0x01,
|
||||
UCHAR_Lo = UCHAR_L | 0x02,
|
||||
UCHAR_Lt = UCHAR_L | 0x03,
|
||||
UCHAR_Lu = UCHAR_L | 0x04,
|
||||
UCHAR_Mc = UCHAR_M | 0x00,
|
||||
UCHAR_Me = UCHAR_M | 0x01,
|
||||
UCHAR_Mn = UCHAR_M | 0x02,
|
||||
UCHAR_Nd = UCHAR_N | 0x00,
|
||||
UCHAR_Nl = UCHAR_N | 0x01,
|
||||
UCHAR_No = UCHAR_N | 0x02,
|
||||
UCHAR_Pc = UCHAR_P | 0x00,
|
||||
UCHAR_Pd = UCHAR_P | 0x01,
|
||||
UCHAR_Pe = UCHAR_P | 0x02,
|
||||
UCHAR_Pf = UCHAR_P | 0x03,
|
||||
UCHAR_Pi = UCHAR_P | 0x04,
|
||||
UCHAR_Po = UCHAR_P | 0x05,
|
||||
UCHAR_Ps = UCHAR_P | 0x06,
|
||||
UCHAR_Sc = UCHAR_S | 0x00,
|
||||
UCHAR_Sk = UCHAR_S | 0x01,
|
||||
UCHAR_Sm = UCHAR_S | 0x02,
|
||||
UCHAR_So = UCHAR_S | 0x03,
|
||||
UCHAR_Zl = UCHAR_Z | 0x00,
|
||||
UCHAR_Zp = UCHAR_Z | 0x01,
|
||||
UCHAR_Zs = UCHAR_Z | 0x02,
|
||||
};
|
||||
|
||||
enum Unicode_Bidi_Class {
|
||||
UCHAR_BIDI_AL,
|
||||
UCHAR_BIDI_AN,
|
||||
UCHAR_BIDI_B,
|
||||
UCHAR_BIDI_BN,
|
||||
UCHAR_BIDI_CS,
|
||||
UCHAR_BIDI_EN,
|
||||
UCHAR_BIDI_ES,
|
||||
UCHAR_BIDI_ET,
|
||||
UCHAR_BIDI_FSI,
|
||||
UCHAR_BIDI_L,
|
||||
UCHAR_BIDI_LRE,
|
||||
UCHAR_BIDI_LRI,
|
||||
UCHAR_BIDI_LRO,
|
||||
UCHAR_BIDI_NSM,
|
||||
UCHAR_BIDI_ON,
|
||||
UCHAR_BIDI_PDF,
|
||||
UCHAR_BIDI_PDI,
|
||||
UCHAR_BIDI_R,
|
||||
UCHAR_BIDI_RLE,
|
||||
UCHAR_BIDI_RLI,
|
||||
UCHAR_BIDI_RLO,
|
||||
UCHAR_BIDI_S,
|
||||
UCHAR_BIDI_WS,
|
||||
};
|
||||
|
||||
enum Unicode_Decomposition {
|
||||
UCHAR_DECOMP_CANON,
|
||||
UCHAR_DECOMP_FONT,
|
||||
UCHAR_DECOMP_NOBREAK,
|
||||
UCHAR_DECOMP_INITIAL,
|
||||
UCHAR_DECOMP_MEDIAL,
|
||||
UCHAR_DECOMP_FINAL,
|
||||
UCHAR_DECOMP_ISOLATED,
|
||||
UCHAR_DECOMP_CIRCLE,
|
||||
UCHAR_DECOMP_SUPER,
|
||||
UCHAR_DECOMP_SUB,
|
||||
UCHAR_DECOMP_VERTICAL,
|
||||
UCHAR_DECOMP_WIDE,
|
||||
UCHAR_DECOMP_NARROW,
|
||||
UCHAR_DECOMP_SMALL,
|
||||
UCHAR_DECOMP_SQUARE,
|
||||
UCHAR_DECOMP_FRACTION,
|
||||
UCHAR_DECOMP_COMPAT,
|
||||
};
|
||||
|
||||
// Character functions
|
||||
char32_t const *uni_dec_map (char32_t cp, int *num);
|
||||
int uni_valid (char32_t cp);
|
||||
int uni_classify(char32_t cp);
|
||||
char32_t uni_tolower (char32_t cp);
|
||||
char32_t uni_toupper (char32_t cp);
|
||||
char32_t uni_totitle (char32_t cp);
|
||||
int uni_is_hsur (char16_t cp);
|
||||
int uni_is_lsur (char16_t cp);
|
||||
char32_t uni_surtoc (char16_t hsur, char16_t lsur);
|
||||
|
||||
// UTF-16 encoding
|
||||
int utf16_chlen(char16_t const *s);
|
||||
int utf16_chdec(char16_t const *restrict s, size_t len, char32_t *restrict c);
|
||||
int utf16_chenc(char16_t *s, size_t len, char32_t c);
|
||||
|
||||
// UTF-8 encoding
|
||||
int utf8_chlen(char8_t const *s);
|
||||
int utf8_chdec(char8_t const *restrict s, size_t len, char32_t *restrict c);
|
||||
int utf8_chenc(char8_t *s, size_t len, char32_t c);
|
|
@ -1,13 +0,0 @@
|
|||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||
Version 2, December 2004
|
||||
|
||||
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
|
||||
|
||||
Everyone is permitted to copy and distribute verbatim or modified
|
||||
copies of this license document, and changing it is allowed as long
|
||||
as the name is changed.
|
||||
|
||||
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. You just DO WHAT THE FUCK YOU WANT TO.
|
|
@ -1,166 +0,0 @@
|
|||
|
||||
Unicope - a C11 library for unicode processing. This library provides the user
|
||||
with functions related to unicode processing as well as with the unicode data,
|
||||
like character category, their name, numeric value, et cetera.
|
||||
|
||||
To use the library simply link your code with unicope.lib and add unicope.h to
|
||||
your include paths.
|
||||
|
||||
===============================================================================
|
||||
1. TYPES
|
||||
===============================================================================
|
||||
|
||||
char8_t - type representing UTF-8 code unit.
|
||||
char16_t - type representing UTF-16 code unit.
|
||||
char32_t - type representing UTF-32 code unit.
|
||||
uchar_t - signed integer type capable of holding any Unicode codepoint.
|
||||
|
||||
The above types are compatible with corresponding types that are defined in
|
||||
uchar.h. All of the types are unsigned.
|
||||
|
||||
enum Unicode_General_Category
|
||||
This type holds enumeration constants for unicode characters' general
|
||||
categories.
|
||||
|
||||
enum Unicode_Bidi_Class
|
||||
This type holds enumeration constants for the bi-directional class of
|
||||
unicode characters.
|
||||
|
||||
enum Unicode_Decomposition
|
||||
This type holds enumeration constants for the character's decomposition
|
||||
type.
|
||||
|
||||
struct uchar_props,
|
||||
uchar_props
|
||||
These types hold the data associated with each unicode character.
|
||||
|
||||
===============================================================================
|
||||
2. CHARACTER API
|
||||
===============================================================================
|
||||
|
||||
|
||||
int uni_valid(uchar_t cp);
|
||||
PARAMETERS:
|
||||
cp - any integer that might represent a codepoint
|
||||
RETURN VALUE:
|
||||
Returns non-zero value if cp is a valid codepoint. Returns zero otherwise.
|
||||
A codepoint is considered valid if it doesn't lie in the range u+d800 to
|
||||
u+dfff (the surrogate range), is positive and it's less than u+110000.
|
||||
|
||||
|
||||
int uni_classify(uchar_t cp);
|
||||
DESCRIPTION:
|
||||
Returns the classification of a unicode codepoint.
|
||||
RETURN VALUE:
|
||||
Returns a value of type `enum Unicode_General_Category`, corresponding to
|
||||
the general character category.
|
||||
|
||||
|
||||
uchar_t uni_tolower(uchar_t cp);
|
||||
RETURN VALUE:
|
||||
Returns the lowercase form of cp, if such is defined. Otherwise returns cp
|
||||
unchanged.
|
||||
|
||||
|
||||
uchar_t uni_toupper(uchar_t cp);
|
||||
RETURN VALUE:
|
||||
Returns the uppercase form of cp, if such is defined. Otherwise returns cp
|
||||
unchanged.
|
||||
|
||||
|
||||
uchar_t uni_totitle(uchar_t cp);
|
||||
RETURN VALUE:
|
||||
Returns the titlecase form of cp, if such is defined. Otherwise returns cp
|
||||
unchanged. Note, titlecase is different from uppercase. For example U+01F1
|
||||
LATIN CAPITAL LETTER DZ will be converted to U+01F2 LATIN CAPITAL LETTER
|
||||
D WITH SMALL LETTER Z
|
||||
|
||||
|
||||
int uni_is_hsur(char16_t cp);
|
||||
RETURN VALUE:
|
||||
Returns non-zero value iff the value is a high surrogate.
|
||||
|
||||
|
||||
int uni_is_lsur(char16_t cp);
|
||||
RETURN VALUE:
|
||||
Returns non-zero value iff the value is a low surrogate.
|
||||
|
||||
|
||||
uchar_t uni_surtoc(char16_t hsur, char16_t lsur);
|
||||
PARAMETERS:
|
||||
hsur - a correct high surrogate codepoint
|
||||
lsur - a correct low surrogate codepoint
|
||||
RETURN VALUE:
|
||||
A unicode character that is encoded by the given surrogate pair
|
||||
|
||||
===============================================================================
|
||||
3. UTF16 ENCODING/DECODING
|
||||
===============================================================================
|
||||
|
||||
int utf16_chlen(char16_t const *s);
|
||||
DESCRIPTION:
|
||||
Returns the length of the first unicode character in the UTF-16 string s.
|
||||
RETURN VALUE:
|
||||
UNI_EULSUR if s points to a low surrogate code unit
|
||||
otherwise returns the length of the UTF-16 character pointed to by s
|
||||
|
||||
|
||||
int utf16_chdec(char16_t const *restrict s, size_t len, uchar_t *restrict c);
|
||||
DESCRIPTION:
|
||||
Decode the first character in the UTF-16 string s.
|
||||
PARAMETERS:
|
||||
s - A (possibly-invalid) UTF-16 string.
|
||||
len - the number of code units in the string
|
||||
c - pointer to uchar_t that receives the decoded character. can be NULL
|
||||
RETURN VALUE:
|
||||
Returns the number of code units the character occupies, or:
|
||||
UNI_EULSUR - the string s points to a low surrogate code unit
|
||||
UNI_EBADCP - the character decodes to a value larger than u+10ffff
|
||||
UNI_ESTRLN - if a character wasn't fully encoded in a string
|
||||
0 - if the len is zero
|
||||
NOTES:
|
||||
In case of character encoding error (UNI_EULSUR or UNI_EBADCP) the
|
||||
character returned is 0xfffd (substitution character). In case of other
|
||||
abnormal states (UNI_ESTRLN or length is zero) the character is not
|
||||
modified.
|
||||
EXAMPLE:
|
||||
-------------------------------------------------------------------------------
|
||||
// This example shows char-by-char processing of a unicode string
|
||||
char16_t string[] = u"Улыбок тебе дед макар";
|
||||
char16_t *str = string;
|
||||
size_t str_len = sizeof(string)/2-1;
|
||||
|
||||
// Process a length-bounded string
|
||||
int ch_len = 0;
|
||||
uchar_t ch;
|
||||
while((ch_len = utf16_chdec(str, str_len, &ch)) > 0) {
|
||||
printf("\t%u\n", ch);
|
||||
str += ch_len;
|
||||
str_len -= ch_len;
|
||||
}
|
||||
if(ch_len < 0) ;// error_handle
|
||||
|
||||
// Process a nul-terminated string
|
||||
int ch_len = 0;
|
||||
uchar_t ch;
|
||||
while((ch_len = utf16_chdec(str, 2, &ch)) > 0 && ch != 0) {
|
||||
printf("\t%u\n", ch);
|
||||
str += ch_len;
|
||||
}
|
||||
if(ch_len < 0) ;// error_handle
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
int utf16_chenc(char16_t *s, size_t len, uchar_t c);
|
||||
DESCRIPTION:
|
||||
Encode a unicode character into UTF-16 string
|
||||
PARAMETERS:
|
||||
s - a pointer to the place where the character should be written to
|
||||
len - the maximum size of the string
|
||||
c - a unicode character to encode.
|
||||
RETURN VALUE:
|
||||
UNI_EBADCP - the provided codepoint is invalid
|
||||
UNI_ESTRLN - not enough space in a string to encode a character
|
||||
otherwise returns the number of code units written into the string
|
||||
NOTES:
|
||||
In case of error the contents of the string s are not modified
|
||||
|
|
@ -1,115 +0,0 @@
|
|||
|
||||
#include <unicope.h>
|
||||
|
||||
#include "data/unicode.h"
|
||||
#define countof(tab) (sizeof(tab)/sizeof(tab)[0])
|
||||
|
||||
// Returns 1 if ch lies in 0x0000..0xd7ff or 0xe000..0x10ffff, i.e. it is
// neither in the surrogate gap nor above 0x10ffff; returns 0 otherwise.
int uni_valid(char32_t ch) {
    if(ch <= 0xd7ff) {
        return 1;
    }
    return ch >= 0xe000 && ch <= 0x10ffff;
}
|
||||
|
||||
// Returns 1 if ch is a high surrogate (0xd800..0xdbff), 0 otherwise.
int uni_is_hsur(char16_t ch) {
    return ch >= 0xd800 && ch <= 0xdbff;
}
|
||||
|
||||
// Returns 1 if ch is a low surrogate (0xdc00..0xdfff), 0 otherwise.
int uni_is_lsur(char16_t ch) {
    return ch >= 0xdc00 && ch <= 0xdfff;
}
|
||||
|
||||
// Combines a surrogate pair into the codepoint it encodes: the low 10 bits
// of hsur become the upper half, the low 10 bits of lsur the lower half,
// and 0x10000 is added to the result. The arguments are not validated.
char32_t uni_surtoc(char16_t hsur, char16_t lsur) {
    char32_t high_bits = hsur & 0x3ff;
    char32_t low_bits = lsur & 0x3ff;
    return 0x10000 + ((high_bits << 10) | low_bits);
}
|
||||
|
||||
|
||||
int uni_classify(char32_t cp) {
|
||||
if(!uni_valid(cp)) {
|
||||
return -1;
|
||||
}
|
||||
int range = cp >> 7;
|
||||
if(range < countof(cat_block_indices)) {
|
||||
return -1;
|
||||
}
|
||||
int offset = cp & 0x7f;
|
||||
int block = cat_block_indices[range];
|
||||
int cat = char_cat_props[block][offset].cat_gen & UCHAR_CAT_GEN_MASK;
|
||||
return cat;
|
||||
}
|
||||
|
||||
static const char32_t fdfa_map[] = {
|
||||
0x0635, 0x0644, 0x0649, 0x0020, 0x0627, 0x0644,
|
||||
0x0644, 0x0647, 0x0020, 0x0639, 0x0644, 0x064a,
|
||||
0x0647, 0x0020, 0x0648, 0x0633, 0x0644, 0x0645,
|
||||
};
|
||||
|
||||
char32_t const *uni_dec_map(char32_t cp, int *num) {
|
||||
if(cp == 0xfdfa) {
|
||||
*num = 18;
|
||||
return fdfa_map;
|
||||
}
|
||||
if(cp < 0x10000) {
|
||||
if(unicode_data16[cp].code == cp) {
|
||||
*num = unicode_data16[cp].dec_map_n;
|
||||
return unicode_data16[cp].dec_map;
|
||||
}
|
||||
}
|
||||
else if((cp - 0x10000) < countof(unicode_data32)) {
|
||||
if(unicode_data32[cp-0x10000].code == cp) {
|
||||
*num = unicode_data32[cp-0x10000].dec_map_n;
|
||||
return unicode_data32[cp-0x10000].dec_map;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
char32_t uni_tolower(char32_t cp) {
|
||||
if(!uni_valid(cp)) {
|
||||
return cp;
|
||||
}
|
||||
if(cp < 0x10000) {
|
||||
if(unicode_data16[cp].code == cp) {
|
||||
return unicode_data16[cp].lower;
|
||||
}
|
||||
}
|
||||
else if((cp - 0x10000) < countof(unicode_data32)) {
|
||||
if(unicode_data32[cp-0x10000].code == cp) {
|
||||
return unicode_data32[cp-0x10000].lower;
|
||||
}
|
||||
}
|
||||
return cp;
|
||||
}
|
||||
|
||||
char32_t uni_toupper(char32_t cp) {
|
||||
if(!uni_valid(cp)) {
|
||||
return cp;
|
||||
}
|
||||
if(cp < 0x10000) {
|
||||
if(unicode_data16[cp].code == cp) {
|
||||
return unicode_data16[cp].upper;
|
||||
}
|
||||
}
|
||||
else if((cp - 0x10000) < countof(unicode_data32)) {
|
||||
if(unicode_data32[cp-0x10000].code == cp) {
|
||||
return unicode_data32[cp-0x10000].upper;
|
||||
}
|
||||
}
|
||||
return cp;
|
||||
}
|
||||
|
||||
char32_t uni_totitle(char32_t cp) {
|
||||
if(uni_valid(cp)) {
|
||||
return cp;
|
||||
}
|
||||
if(cp < 0x10000) {
|
||||
if(unicode_data16[cp].code == cp) {
|
||||
return unicode_data16[cp].title;
|
||||
}
|
||||
}
|
||||
else if((cp - 0x10000) < countof(unicode_data32)) {
|
||||
if(unicode_data32[cp-0x10000].code == cp) {
|
||||
return unicode_data32[cp-0x10000].title;
|
||||
}
|
||||
}
|
||||
return cp;
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -1,207 +0,0 @@
|
|||
|
||||
import os;
|
||||
import sys;
|
||||
|
||||
# Generate unicode.h (name buffer, category blocks and the per-character
# property tables) from UnicodeData.txt.

# Work relative to the directory holding this script, so that the data file
# is found and the header is written there whatever the caller's cwd is.
abspath = os.path.abspath(sys.argv[0])
dname = os.path.dirname(abspath)
os.chdir(dname)

# Parse the data file once, before the header is opened for writing: every
# table below walks the same rows, and a missing data file then fails before
# an existing unicode.h is truncated.
with open('UnicodeData.txt', encoding='utf-8') as file:
    rows = [line.split(';') for line in file]

with open('unicode.h', 'w', encoding='utf-8') as header:
    header.write('\n')
    header.write('#pragma once\n\n')
    header.write('#include <unicope.h>\n')
    header.write(
'''

#pragma pack(push, 1)

typedef struct char_cat_props_t char_cat_props_t;
struct char_cat_props_t {
uint16_t comb_class;
uint8_t cat_gen;
uint8_t cat_bidi;
};

typedef struct char32_props char32_props;
struct char32_props {
double num_value;
char32_t const dec_map[8];
uint8_t dec_type;
char32_t code;
char32_t lower;
char32_t upper;
char32_t title;
uint8_t dec_map_n;
};

typedef struct char16_props char16_props;
struct char16_props {
double num_value;
char32_t const dec_map[8];
uint8_t dec_type;
char16_t code;
char16_t lower;
char16_t upper;
char16_t title;
uint8_t dec_map_n;
};\n
''')

    # --- Character names: one NUL-separated buffer plus a pointer table ---
    header.write('static char const name_buff[] = {\n')
    header.write(' "\\0"\n')
    # NOTE(review): `offset` tracks where each name starts in name_buff but
    # is never stored; only the zero entries for gaps between code points
    # are appended to `offsets`, so char_names does not point at the names.
    # The emitted output is kept unchanged here -- confirm the intent before
    # relying on char_names.
    offsets = [0]
    offset = 1
    prev_idx = -1
    for row in rows:
        code = row[0].strip()
        name = row[1].strip()
        header.write(' "%s\\0"\n' % (name))
        offset += len(name) + 1
        idx = int(code, 16)
        while idx - prev_idx != 1:
            offsets.append(0)
            prev_idx += 1
        prev_idx = idx
        # Stop once the first row of plane 15 has been written.
        if code == 'F0000':
            break
    header.write('};\n\n')

    header.write('static char const *char_names[] = {\n')
    for o in offsets:
        header.write(' name_buff+%s,\n' % (o))
    header.write('};\n\n')

    # --- Category properties, deduplicated in fixed-size blocks ---
    header.write('static uint8_t cat_block_indices[] = {\n')

    block_size = 2**7
    block_id = 0
    cur_block = []
    blocks = []

    def blkcmp(b1, b2):
        """Return True when the first block_size entries of b1 and b2 match."""
        return all(b1[b] == b2[b] for b in range(block_size))

    # NOTE(review): blocks are formed from consecutive rows of the data
    # file, not from code point values, and a trailing partial block is
    # dropped -- confirm that the consumer of cat_block_indices expects this.
    for row in rows:
        cat_gen = row[2].strip()
        cat_bidi = row[4].strip()
        comb_class = row[3].strip()
        mirrored = row[9].strip()
        cur_block.append((comb_class, cat_gen, cat_bidi, mirrored))
        if len(cur_block) == block_size:
            # Reuse the index of an identical block when one was already
            # stored; stored blocks are unique, so the first match is the
            # only match.
            cur_block_id = block_id
            for i, b in enumerate(blocks):
                if blkcmp(b, cur_block):
                    cur_block_id = i
                    break
            else:
                blocks.append(cur_block)
                block_id += 1
            header.write(' %s,\n' % (cur_block_id))
            cur_block = []
    header.write('};\n\n')

    header.write('static char_cat_props_t char_cat_props[][%s] = {\n' % (block_size))
    for b in blocks:
        header.write(' {\n')
        for d in b:
            cat = 'UCHAR_' + d[1]
            # Mirrored characters carry an extra flag bit in the category.
            if d[3] == 'Y':
                cat += '|0x40'
            header.write(' {%s,%s,UCHAR_BIDI_%s},\n' % (d[0], cat, d[2]))
        header.write(' },\n')
    header.write('};\n\n')

    # --- Per-character data: one table for the BMP, one for the rest ---
    tab32started = False
    prev_idx = -1
    header.write('static char16_props unicode_data16[] = {\n')
    for row in rows:
        code = row[0].strip()
        if int(code, 16) > 0xffff and not tab32started:
            # First row past the BMP: close the 16-bit table, open the
            # 32-bit one and restart the index bookkeeping.
            header.write('\n};\n\nstatic char32_props unicode_data32[] = {\n')
            tab32started = True
            prev_idx = -1
        cat_gen = row[2].strip()
        dec_map = row[5].strip()
        dec_value = row[6].strip()
        dig_value = row[7].strip()
        num_value = row[8].strip()
        upper = row[12].strip()
        lower = row[13].strip()
        title = row[14].strip()

        # Private-use rows are not emitted. Skip them before prev_idx is
        # touched: a skipped row must not count as the predecessor of the
        # next emitted row, or that row (U+F900 after U+F8FF) is written
        # without a designator and lands at the wrong array index.
        if cat_gen == 'Co':
            continue

        # Process decompositional mapping
        dec_type = 'CANON'
        dec_parts = dec_map.split(' ') if dec_map else []
        if dec_parts and dec_parts[0][0] == '<':
            # A leading <tag> names the decomposition type.
            dec_type = dec_parts[0][1:-1].upper()
            dec_parts = dec_parts[1:]
        dec_map_n = len(dec_parts)
        if dec_parts:
            dec_init = ', '.join('0x' + part for part in dec_parts)
        else:
            dec_init = '0'

        # Make sure lowercase, uppercase and titlecase mappings are defined
        lower = lower or code
        upper = upper or code
        title = title or code

        # Special cases
        # U+FDFA's 18-element mapping does not fit dec_map[8]; only its
        # length is recorded here.
        if code == 'FDFA':
            dec_init = '0'

        # Numeric value: prefer the numeric field, then decimal, then digit.
        number = num_value or dec_value or dig_value or '0'
        if '/' in number:
            # Emit fractions as floating-point division; a bare `1/2` is
            # integer division in C and would initialize the double to 0.
            numerator, denominator = number.split('/')
            number = '%s.0/%s' % (numerator, denominator)

        idx = int(code, 16)
        if tab32started:
            idx -= 0x10000
        # An explicit [index] designator is needed only after a gap.
        contiguous = idx - prev_idx == 1
        prev_idx = idx
        header.write(
'''
%s{
.code = 0x%s,
.dec_map_n = %s,
.dec_map = {%s},
.dec_type = UCHAR_DECOMP_%s,
.num_value = %s,
.lower = 0x%s,
.upper = 0x%s,
.title = 0x%s,
},''' % (
            '[' + hex(idx) + '] =' if not contiguous else '',
            code,
            dec_map_n,
            dec_init,
            dec_type,
            number,
            lower,
            upper,
            title
        ))

    header.write('\n};\n#pragma pack(pop)\n')
|
|
@ -1,11 +0,0 @@
|
|||
|
||||
The unicode-based functions work based on official unicode data. You can find
|
||||
the file with Unicode data at:
|
||||
|
||||
https://unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
|
||||
This file is placed into this directory under the name UnicodeData.txt. To update the
|
||||
Unicode standard, a new file is put under that name, then unicode_compile.py is
|
||||
run with the Python interpreter. It will generate a new unicode.h header file.
|
||||
|
||||
DO NOT MODIFY unicode.h DIRECTLY BRUH.
|
1353752
src/unicope/src/data/unicode.h
1353752
src/unicope/src/data/unicode.h
File diff suppressed because it is too large
Load Diff
|
@ -1,58 +0,0 @@
|
|||
|
||||
#include <unicope.h>
|
||||
|
||||
int utf16_chlen(char16_t const *str) {
|
||||
char16_t cp = *str;
|
||||
if(uni_is_hsur(cp)) return 2;
|
||||
else if(uni_is_lsur(cp)) return UNI_EULSUR;
|
||||
else return 1;
|
||||
}
|
||||
|
||||
int utf16_chdec(
|
||||
char16_t const *restrict str,
|
||||
size_t len,
|
||||
char32_t *restrict chp
|
||||
) {
|
||||
if(len == 0) return 0;
|
||||
int chlen;
|
||||
char32_t ch;
|
||||
if(uni_is_hsur(str[0])) {
|
||||
if(len < 2) return UNI_ESTRLN;
|
||||
char16_t hsur = str[0];
|
||||
char16_t lsur = str[1];
|
||||
ch = uni_surtoc(hsur, lsur);
|
||||
chlen = 2;
|
||||
if(ch > 0x10ffff) {
|
||||
chlen = UNI_EBADCP;
|
||||
ch = 0xfffd;
|
||||
}
|
||||
}
|
||||
else if(!uni_is_lsur(str[0])) {
|
||||
ch = str[0];
|
||||
chlen = 1;
|
||||
}
|
||||
else {
|
||||
chlen = UNI_EULSUR;
|
||||
ch = 0xfffd;
|
||||
}
|
||||
if(chp != NULL) *chp = ch;
|
||||
return chlen;
|
||||
}
|
||||
|
||||
int utf16_chenc(char16_t *str, size_t len, char32_t cp) {
|
||||
if(!uni_valid(cp)) {
|
||||
return UNI_EBADCP;
|
||||
}
|
||||
if(cp < 0x10000) {
|
||||
if(len < 1) return UNI_ESTRLN;
|
||||
str[0] = cp;
|
||||
return 1;
|
||||
}
|
||||
else {
|
||||
if(len < 2) return UNI_ESTRLN;
|
||||
cp -= 0x10000;
|
||||
str[0] = 0xD800 + (cp >> 10);
|
||||
str[1] = 0xDC00 + (cp & 0x3ff);
|
||||
return 2;
|
||||
}
|
||||
}
|
|
@ -1,79 +0,0 @@
|
|||
|
||||
#include <unicope.h>
|
||||
|
||||
/*
 * Report how many bytes the UTF-8 character starting at str takes,
 * judging by its lead byte only.
 *
 * Returns 1 to 4, or UNI_EIBYTE when the first byte is a continuation
 * byte (0x80..0xbf) or is 0xf8 and above.
 */
int utf8_chlen(char8_t const *str) {
    char8_t const lead = str[0];
    if(lead < 0x80) {
        return 1;
    }
    if(lead < 0xc0) {
        return UNI_EIBYTE;
    }
    if(lead < 0xe0) {
        return 2;
    }
    if(lead < 0xf0) {
        return 3;
    }
    return (lead < 0xf8) ? 4 : UNI_EIBYTE;
}
|
||||
|
||||
int utf8_chdec(
|
||||
char8_t const *restrict str,
|
||||
size_t len,
|
||||
char32_t *restrict chp
|
||||
) {
|
||||
if(len == 0) return 0;
|
||||
int chlen;
|
||||
char32_t ch;
|
||||
if(str[0] < 0x80) ch = str[0], chlen = 1;
|
||||
else if(str[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
|
||||
else if(str[0] < 0xe0) ch = str[0] & 0x1f, chlen = 2;
|
||||
else if(str[0] < 0xf0) ch = str[0] & 0x0f, chlen = 3;
|
||||
else if(str[0] < 0xf8) ch = str[0] & 0x07, chlen = 4;
|
||||
else ch = 0xfffd, chlen = UNI_EIBYTE;
|
||||
if(chlen > len) {
|
||||
return UNI_ESTRLN;
|
||||
}
|
||||
if(chlen > 0) for(int i = 1; i < chlen; ++i) {
|
||||
uint8_t trail = str[i];
|
||||
if((trail & 0xc0) != 0x80) {
|
||||
chlen = UNI_ETBYTE;
|
||||
ch = 0xfffd;
|
||||
break;
|
||||
}
|
||||
ch <<= 6;
|
||||
ch |= (trail & 0x3f);
|
||||
}
|
||||
if(!uni_valid(ch)) {
|
||||
chlen = UNI_EBADCP;
|
||||
ch = 0xfffd;
|
||||
}
|
||||
if(chp != NULL) *chp = ch;
|
||||
return chlen;
|
||||
}
|
||||
|
||||
/*
 * Encode one code point as UTF-8.
 *
 *   str  destination buffer
 *   len  number of bytes available in str
 *   cp   code point to encode
 *
 * Returns the number of bytes written (1 to 4), UNI_EBADCP when cp fails
 * uni_valid(), 0 when len is 0, or UNI_ESTRLN when the buffer is too
 * short for the sequence.
 */
int utf8_chenc(char8_t *str, size_t len, char32_t cp) {
    if(!uni_valid(cp)) {
        return UNI_EBADCP;
    }
    if(len == 0) return 0;
    if(cp < 0x80) {
        str[0] = cp;
        return 1;
    }
    else if(cp < 0x800) {
        // 110xxxxx 10xxxxxx
        if(len < 2) return UNI_ESTRLN;
        str[0] = 0xc0 | (cp >> 6);
        str[1] = 0x80 | ((cp >> 0) & 0x3f);
        return 2;
    }
    else if(cp < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx: the lead byte carries bits 12..15.
        // The shift used to be 18, which dropped those bits.
        if(len < 3) return UNI_ESTRLN;
        str[0] = 0xe0 | (cp >> 12);
        str[1] = 0x80 | ((cp >> 6) & 0x3f);
        str[2] = 0x80 | ((cp >> 0) & 0x3f);
        return 3;
    }
    else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: the lead byte is 0xf0 plus
        // bits 18..20 and the second byte carries bits 12..17. The lead
        // used to be 0xe0 | (cp >> 24) and bits 12..17 were never written.
        if(len < 4) return UNI_ESTRLN;
        str[0] = 0xf0 | (cp >> 18);
        str[1] = 0x80 | ((cp >> 12) & 0x3f);
        str[2] = 0x80 | ((cp >> 6) & 0x3f);
        str[3] = 0x80 | ((cp >> 0) & 0x3f);
        return 4;
    }
}
|
57
src/wctype.c
57
src/wctype.c
|
@ -42,7 +42,8 @@ int iswblank(wint_t wc) {
|
|||
}
|
||||
|
||||
int iswcntrl(wint_t wc) {
|
||||
return uni_classify(wc) == UCHAR_Cc;
|
||||
return 0;
|
||||
//return uni_classify(wc) == UCHAR_Cc;
|
||||
}
|
||||
|
||||
int iswdigit(wint_t wc) {
|
||||
|
@ -54,35 +55,36 @@ int iswgraph(wint_t wc) {
|
|||
}
|
||||
|
||||
int iswlower(wint_t wc) {
|
||||
return uni_classify(wc) == UCHAR_Ll;
|
||||
return 0;
|
||||
// return uni_classify(wc) == UCHAR_Ll;
|
||||
}
|
||||
|
||||
int iswprint(wint_t wc) {
|
||||
switch(uni_classify(wc)) {
|
||||
case UCHAR_Cc:
|
||||
case UCHAR_Cf:
|
||||
case UCHAR_Co:
|
||||
case UCHAR_Cs:
|
||||
return 0;
|
||||
}
|
||||
// switch(uni_classify(wc)) {
|
||||
// case UCHAR_Cc:
|
||||
// case UCHAR_Cf:
|
||||
// case UCHAR_Co:
|
||||
// case UCHAR_Cs:
|
||||
// return 0;
|
||||
// }
|
||||
return 1;
|
||||
}
|
||||
|
||||
int iswpunct(wint_t wc) {
|
||||
switch(uni_classify(wc)) {
|
||||
case UCHAR_Pc:
|
||||
case UCHAR_Pd:
|
||||
case UCHAR_Pe:
|
||||
case UCHAR_Pf:
|
||||
case UCHAR_Pi:
|
||||
case UCHAR_Po:
|
||||
case UCHAR_Ps:
|
||||
case UCHAR_Sk:
|
||||
case UCHAR_Sc:
|
||||
case UCHAR_Sm:
|
||||
case UCHAR_So:
|
||||
return 1;
|
||||
}
|
||||
// switch(uni_classify(wc)) {
|
||||
// case UCHAR_Pc:
|
||||
// case UCHAR_Pd:
|
||||
// case UCHAR_Pe:
|
||||
// case UCHAR_Pf:
|
||||
// case UCHAR_Pi:
|
||||
// case UCHAR_Po:
|
||||
// case UCHAR_Ps:
|
||||
// case UCHAR_Sk:
|
||||
// case UCHAR_Sc:
|
||||
// case UCHAR_Sm:
|
||||
// case UCHAR_So:
|
||||
// return 1;
|
||||
// }
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -100,7 +102,8 @@ int iswspace(wint_t wc) {
|
|||
}
|
||||
|
||||
int iswupper(wint_t wc) {
|
||||
return uni_classify(wc) == UCHAR_Lu;
|
||||
// return uni_classify(wc) == UCHAR_Lu;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int iswxdigit(wint_t wc) {
|
||||
|
@ -108,9 +111,11 @@ int iswxdigit(wint_t wc) {
|
|||
}
|
||||
|
||||
wint_t towlower(wint_t wc) {
|
||||
return uni_tolower(wc);
|
||||
// return uni_tolower(wc);
|
||||
return wc;
|
||||
}
|
||||
|
||||
wint_t towupper(wint_t wc) {
|
||||
return uni_toupper(wc);
|
||||
return wc;
|
||||
// return uni_toupper(wc);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
|
||||
#include <winuser.h>
|
||||
CREATEPROCESS_MANIFEST_RESOURCE_ID RT_MANIFEST "utf8.xml"
|
|
@ -0,0 +1 @@
|
|||
<assembly manifestVersion="1.0" xmlns="urn:schemas-microsoft-com:asm.v1"><assemblyIdentity name="." version="6.0.0.0"/><application><windowsSettings><activeCodePage xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage></windowsSettings></application></assembly>
|
Loading…
Reference in New Issue