Remove submodule dependencies

This commit is contained in:
bumbread 2022-08-05 15:21:07 +11:00
parent ba774149f7
commit c93371977a
33 changed files with 197 additions and 1389456 deletions

View File

@ -1 +1 @@
clang test\test_%test%.c src\libwinsane\libwinsane.obj -Iinc -g -lciabatta.lib
clang test\test_%test%.c utf8.obj -Iinc -g -lciabatta.lib

View File

@ -1,4 +1,13 @@
:: Compile UTF-8 resource into .obj file
:: this .obj file has to be linked to the executable using it, NOT archived
:: together with ciabatta.lib.
windres -o utf8.obj utf8.rc
ld -relocatable -o libwinsane.obj utf8.obj
:: Compile chkstk
nasm src\_win\chkstk.asm -ochkstk.o -fwin64
:: Compile the rest of the party
clang -Wall src\ciabatta.c -o ciabatta.obj -c -DCIABATTA_WIN -I inc -I src\_win -nodefaultlibs -g -mfma
lib /nologo /out:ciabatta.lib chkstk.o ciabatta.obj src\fdec64\fdec64.lib src\unicope\unicope.lib
lib /nologo /out:ciabatta.lib chkstk.o ciabatta.obj

17
readme
View File

@ -32,12 +32,17 @@ PLATFORM SUPPORT
- x86-64
USAGE
NOTE: libwinsane.obj can be obtained by running:
bake -DNO_CRT
In the libwinsane directory.
Add the following flags to your compilation command:
-I <path/to/ciabatta/inc> libwinsane.obj -nostdlib -mfma
Don't forget to link to the following libraries:
Note that the library can only be used with clang.
Once the MSVC compiler finally decides to support C11 atomic types I'll consider
supporting MSVC; until then clang is your only option.
1. Run bake.cmd
2. Make sure you've got the following in some folder:
- The inc folder
- The ciabatta.lib archive file
- The utf8.obj object file
3. Add the following flags to your compilation command:
-I <path/to/ciabatta/inc> utf8.obj -nostdlib -mfma
4. Don't forget to link to the following libraries:
-lciabatta.lib
CONTRIBUTING

View File

@ -79,6 +79,7 @@ void thrd_yield(void) {
_Noreturn void thrd_exit(int res) {
// TODO(NeGate): setup TSS dtors here
ExitThread((DWORD)res);
__builtin_unreachable();
}
void mtx_destroy(mtx_t *mtx) {

View File

@ -38,8 +38,8 @@
#include "intrin.h"
// Dependencies
#include "fdec64/fdec64.h"
#include "unicope/inc/unicope.h"
#include "decfloat/decfloat.c"
#include "decfloat/decfloat.h"
// Platform-independent stuff
#include "fmt/gen_fmt.c"

View File

@ -1,10 +1,6 @@
#include <stdbool.h>
#include <stdint.h>
#include <fdec64.h>
#include <fdec64_table.h>
#include <intrin.h>
#include "decfloat.h"
#include "decfloat_table.h"
#define DOUBLE_MANTISSA_BITS 52
#define DOUBLE_EXPONENT_BITS 11
@ -139,7 +135,7 @@ static inline uint64_t div1e8(const uint64_t x) {
fdec64 dtofdec64(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
decfloat_t dtodecfloat(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
int32_t e2;
uint64_t m2;
if (ieeeExponent == 0) {
@ -304,7 +300,7 @@ fdec64 dtofdec64(const uint64_t ieeeMantissa, const uint32_t ieeeExponent) {
}
const int32_t exp = e10 + removed;
fdec64 fd;
decfloat_t fd;
fd.exponent = exp;
fd.mantissa = output;
return fd;

9
src/decfloat/decfloat.h Normal file
View File

@ -0,0 +1,9 @@
#pragma once

// The declarations below use uint64_t, int32_t and uint32_t, so include their
// definitions here instead of depending on the includer's include order.
#include <stdint.h>

// A mantissa/exponent pair produced by the double-to-decimal conversion.
// (presumably value = mantissa * 10^exponent, as in ryu -- TODO confirm)
typedef struct {
    uint64_t mantissa;
    int32_t exponent;
} decfloat_t;

// Converts the raw mantissa and exponent bit fields of an IEEE double into a
// decfloat_t.
// NOTE(review): the definition visible in decfloat.c is spelled
// `dtodecfloat`, while this prototype and the printf callers use
// `todecfloat` -- confirm that a `todecfloat` definition exists, otherwise
// this will fail at link time.
decfloat_t todecfloat(const uint64_t ieeeMant, const uint32_t ieeeExp);

View File

@ -1,9 +0,0 @@
ciabatta
*.lib
test.c
*.exe
*.ilk
*.obj
*.pdb
*.obj
bin

View File

@ -1,3 +0,0 @@
Ulf Adams <ulfjack@google.com>
Stephan T. Lavavej <stl@microsoft.com>
Alexander Bolz <alexbolz@web.de>

View File

@ -1,68 +0,0 @@
import os
import subprocess
# General compile options
platform = 'win'
definitions = []
inc_folders = ['.']
# Compiler-specific options
clang_dbg_flags = ['-g', '-gcodeview']
clang_common_flags = ['-c', '-nodefaultlibs', '-mfma']
#----------------------------------------------------------------------------#
# Map lists to lists of options
inc_flags = []
def_flags = []
def compile(root, cmap):
    """Walk *root* and pass every file with a registered extension to its compiler.

    *cmap* maps a file extension (including the leading dot) to a callable
    that takes the file path. The module-level ``inc_flags`` and
    ``def_flags`` lists are rebuilt from ``inc_folders`` and ``definitions``
    before any file is visited, because the compile callbacks read them.
    """
    global inc_flags, def_flags
    inc_flags = ['-I ' + folder for folder in inc_folders]
    def_flags = ['-D' + macro for macro in definitions]
    for dir_path, _subdirs, file_names in os.walk(root):
        for file_name in file_names:
            source_path = os.path.join(dir_path, file_name)
            extension = os.path.splitext(source_path)[1]
            if extension in cmap:
                cmap[extension](source_path)
def get_bin_path(file_path):
    """Return the ``bin/...obj`` output path for *file_path*, creating its folder.

    The leading path component (the source root) is dropped so that
    ``src/x/y.c`` maps to ``bin/x/y.obj``. A path with a single component,
    such as ``a.c``, keeps that component: dropping it would leave an empty
    name and send every root-level source to the same ``bin/.obj`` file.
    """
    parts = os.path.normpath(file_path).split(os.path.sep)
    if len(parts) > 1:
        parts = parts[1:]
    name, _ext = os.path.splitext(os.path.sep.join(parts))
    bin_path = os.path.join('bin', name + '.obj')
    # The object file's directory must exist before the compiler writes to it.
    os.makedirs(os.path.dirname(bin_path), exist_ok=True)
    return bin_path
def clang_compile(file_name):
    """Compile *file_name* with clang into its object file under ``bin``.

    The argument vector is built as a list. The source and output paths are
    passed through untouched, so paths containing spaces stay intact instead
    of being split into several arguments.
    """
    bin_path = get_bin_path(file_name)
    flags = clang_dbg_flags + clang_common_flags + inc_flags + def_flags
    # Entries such as '-I .' carry a switch and its value in one string;
    # split each flag on spaces so clang receives them as separate arguments.
    flag_args = [arg for flag in flags for arg in flag.split(' ')]
    subprocess.run(['clang', file_name, '-o', bin_path] + flag_args)
    print(file_name, '=>', bin_path)
def nasm_compile(file_name):
    """Assemble *file_name* with nasm (win64 format) into its object file under ``bin``."""
    obj_path = get_bin_path(file_name)
    nasm_args = ['nasm', file_name, '-f', 'win64', '-o', obj_path]
    subprocess.run(nasm_args)
    print(file_name, '=>', obj_path)
#-----------------------------------------------------------------------------#
# Compile every .c file found under the current directory
compile_map = {'.c': clang_compile}
compile(os.path.normpath('.'), compile_map)

# Archive the objects: one '*.obj' pattern per non-empty directory under bin
obj_paths = []
for dir, _, f in os.walk('bin'):
    if f:
        obj_paths.append(os.path.join(dir, '*.obj'))
subprocess.run(['llvm-ar', 'rc', 'fdec64.lib'] + obj_paths)

View File

@ -1,9 +0,0 @@
#pragma once
typedef struct fdec64 {
uint64_t mantissa;
int32_t exponent;
} fdec64;
fdec64 dtofdec64(const uint64_t ieeeMant, const uint32_t ieeeExp);

View File

@ -1,23 +0,0 @@
Boost Software License - Version 1.0 - August 17th, 2003
Permission is hereby granted, free of charge, to any person or organization
obtaining a copy of the software and accompanying documentation covered by
this license (the "Software") to use, reproduce, display, distribute,
execute, and transmit the Software, and to prepare derivative works of the
Software, and to permit third-parties to whom the Software is furnished to
do so, all subject to the following:
The copyright notices in the Software and this entire statement, including
the above license grant, this restriction and the following disclaimer,
must be included in all copies of the Software, in whole or in part, and
all derivative works of the Software, unless such copies or derivative
works are solely in the form of machine-executable object code generated by
a source language processor.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

View File

@ -1,4 +0,0 @@
I stole this code from ryu. Original:
https://github.com/ulfjack/ryu

View File

@ -270,7 +270,7 @@ static int pfx(vprintfcb)(
E = 0;
}
else {
fdec64 f = dtofdec64(m2, e2);
decfloat_t f = todecfloat(m2, e2);
E = f.exponent;
}
}
@ -697,7 +697,7 @@ static inline int pfx(_dtoa)(
exp = 0;
}
else {
fdec64 f = dtofdec64(m2, e2);
decfloat_t f = todecfloat(m2, e2);
mant = f.mantissa;
exp = f.exponent;
}
@ -822,7 +822,7 @@ static inline int pfx(_etoa)(
exp = 0;
}
else {
fdec64 f = dtofdec64(m2, e2);
decfloat_t f = todecfloat(m2, e2);
mant = f.mantissa;
exp = f.exponent;
}

@ -1 +0,0 @@
Subproject commit c22973570de5d75eff0f03a823536e781ebdac4c

View File

@ -1,126 +1,126 @@
// Converts the next UTF-8 encoded character of s (at most n bytes) into
// UTF-16 code units, delivering one code unit per call.
//
// Returns:
//   0           - s is NULL (state reset), or the decoded character is NUL
//   (size_t)-1  - invalid sequence, errno is set to EILSEQ
//   (size_t)-2  - the n bytes do not hold a complete character
//   (size_t)-3  - the low surrogate left over from the previous call was
//                 delivered; no input was consumed
//   otherwise   - the number of bytes consumed from s
size_t mbrtoc16(
    char16_t *restrict pc16,
    char const *restrict s,
    size_t n,
    mbstate_t *restrict ps
) {
    // Figure out the conversion state
    static mbstate_t static_mbstate = {0};
    if(ps == NULL) ps = &static_mbstate;
    if(s == NULL) {
        *ps = (mbstate_t) {0xd800};
        return 0;
    }
    // A leftover is only ever a low surrogate (the second unit of a pair).
    // Testing for that, rather than comparing against the 0xd800 marker,
    // makes a zero-initialised state count as "no leftover" as well.
    if(uni_is_lsur(ps->leftover)) {
        if(pc16 != NULL) *pc16 = ps->leftover;
        ps->leftover = 0xd800;
        return (size_t)(-3);
    }
    // With no bytes to look at the character is incomplete, not invalid
    if(n == 0) return (size_t)(-2);
    // Decode the UTF-8 encoded codepoint
    char32_t code_point;
    int mblen = utf8_chdec((char8_t *)s, n, &code_point);
    if(mblen == UNI_ESTRLN) return (size_t)(-2);
    if(mblen <= 0) goto invalid_seq;
    // Encode the codepoint into UTF-16 string
    char16_t str[2];
    int c16len = utf16_chenc(str, 2, code_point);
    if(c16len <= 0) goto invalid_seq;
    // Deliver the first code unit, keep the second (if any) for the next call
    if(pc16 != NULL) *pc16 = str[0];
    ps->leftover = (c16len == 2? str[1] : 0xd800);
    // Same convention as mbrtoc32: a decoded NUL character reports 0
    if(code_point == 0) return 0;
    return (size_t)mblen;
invalid_seq:
    errno = EILSEQ;
    return (size_t)(-1);
}
// size_t mbrtoc16(
// char16_t *restrict pc16,
// char const *restrict s,
// size_t n,
// mbstate_t *restrict ps
// ) {
// // Figure out the conversion state
// static mbstate_t static_mbstate = {0};
// if(ps == NULL) ps = &static_mbstate;
// if(s == NULL) {
// *ps = (mbstate_t) {0xd800};
// return 0;
// }
// // Check leftovers, using 0xd800 as "no leftover" marker because it
// // doesn't encode a valid character.
// if(ps->leftover == 0xd800) {
// // Decode the UTF-8 encoded codepoint
// char32_t code_point;
// int mblen = utf8_chdec((char8_t *)s, n, &code_point);
// if(mblen == UNI_ESTRLN) return (size_t)(-2);
// if(mblen <= 0) goto invalid_seq;
// // Encode the codepoint into UTF-16 string
// char16_t str[2];
// int c16len = utf16_chenc(str, 2, code_point);
// if(c16len <= 0) goto invalid_seq;
// // Assign the decoded UTF-16 character, decide leftover
// if(pc16 != NULL) *pc16 = str[0];
// ps->leftover = (c16len == 2? str[1] : 0xd800);
// return (size_t)mblen;
// }
// else {
// // Otherwise use and reset the leftover
// if(pc16 != NULL) *pc16 = ps->leftover;
// ps->leftover = 0xd800;
// return (size_t)(-3);
// }
// invalid_seq:
// errno = EILSEQ;
// return (size_t)(-1);
// }
// Converts one UTF-16 code unit into UTF-8 bytes written to s. A high
// surrogate is remembered in *ps and produces no output; the following low
// surrogate completes the pair and writes the combined codepoint.
//
// Returns the number of bytes written (0 when only a high surrogate was
// recorded or when s is NULL), or (size_t)-1 with errno = EILSEQ on an
// invalid code unit sequence.
size_t c16rtomb(
    char *restrict s,
    char16_t c16,
    mbstate_t *restrict ps
) {
    // Figure out conversion state
    static mbstate_t static_mbstate = {0};
    if(ps == NULL) ps = &static_mbstate;
    if(s == NULL) {
        *ps = (mbstate_t) {0xd800};
        return 0;
    }
    char32_t codepoint_to_write;
    // 0xd800 is the explicit "no pending surrogate" marker; 0 is accepted
    // too so that a zero-initialised state starts out with nothing pending.
    // NOTE(review): 0xd800 is itself a high surrogate, so a pending 0xd800
    // cannot be told apart from "nothing pending" -- confirm whether the
    // marker should be changed.
    int have_high = ps->high_surrogate != 0 && ps->high_surrogate != 0xd800;
    if(!have_high) {
        // If c16 is a surrogate record it, or throw an error
        if(uni_is_hsur(c16)) {
            ps->high_surrogate = c16;
            return 0;
        }
        else if(uni_is_lsur(c16)) {
            goto invalid_char;
        }
        // We'll just write c16
        codepoint_to_write = c16;
    }
    // If high surrogate exists, the next character must be a low surrogate
    // so we'll write a codepoint made out of high and low surrogates
    else if(uni_is_lsur(c16)) {
        codepoint_to_write = uni_surtoc(ps->high_surrogate, c16);
        // The pair is consumed; without this reset the next call would still
        // see a pending high surrogate.
        ps->high_surrogate = 0xd800;
    }
    else {
        // Drop the unpaired high surrogate so the error is not sticky
        ps->high_surrogate = 0xd800;
        goto invalid_char;
    }
    // Write the codepoint that we decided to write to multibyte string
    int written_len = utf8_chenc((char8_t *)s, 4, codepoint_to_write);
    if(written_len < 0) {
        goto invalid_char;
    }
    // NOTE(review): this stores a terminator after the encoded bytes, so up
    // to 5 bytes of s are written -- confirm callers size their buffers for it.
    s[written_len] = 0;
    return (size_t)written_len;
invalid_char:
    errno = EILSEQ;
    return (size_t)(-1);
}
// size_t c16rtomb(
// char *restrict s,
// char16_t c16,
// mbstate_t *restrict ps
// ) {
// // Figure out conversion state
// static mbstate_t static_mbstate = {0};
// if(ps == NULL) ps = &static_mbstate;
// if(s == NULL) {
// *ps = (mbstate_t) {0xd800};
// return 0;
// }
// char32_t codepoint_to_write;
// // Check whether a high surrogate was detected in a previous call to the
// // function. If not, the high_surrogate value is 0xd800
// if(ps->high_surrogate == 0xd800) {
// // If c16 is a surrogate record it, or throw an error
// if(uni_is_hsur(c16)) {
// ps->high_surrogate = c16;
// return 0;
// }
// else if(uni_is_lsur(c16)) {
// goto invalid_char;
// }
// // We'll just write c16
// codepoint_to_write = c16;
// }
// // If high surrogate exists, the next character must be a low surrogate
// // so we'll write a codepoint made out of high and low surrogates
// else if(uni_is_lsur(c16)) {
// codepoint_to_write = uni_surtoc(ps->high_surrogate, c16);
// }
// else goto invalid_char;
// // Write the codepoint that we decided to write to multibyte string
// int written_len = utf8_chenc((char8_t *)s, 4, codepoint_to_write);
// if(written_len < 0) {
// goto invalid_char;
// }
// s[written_len] = 0;
// return (size_t)written_len;
// invalid_char:
// errno = EILSEQ;
// return (size_t)(-1);
// }
// Converts the next UTF-8 encoded character of s (at most n bytes) into a
// UTF-32 code unit stored through pc32 (when pc32 is not NULL).
//
// Returns:
//   0           - s is NULL, or the decoded character is NUL
//   (size_t)-1  - invalid sequence, errno is set to EILSEQ
//   (size_t)-2  - the n bytes do not hold a complete character
//   otherwise   - the number of bytes consumed from s
// ps is not used by this implementation.
size_t mbrtoc32(
    char32_t *restrict pc32,
    char const *restrict s,
    size_t n,
    mbstate_t *restrict ps
) {
    if(s == NULL) {
        return 0;
    }
    // utf8_chdec reports 0 for an empty buffer; that means "incomplete",
    // not "invalid", so handle it before the error check below.
    if(n == 0) return (size_t)(-2);
    char32_t code_point;
    int mblen = utf8_chdec((char8_t *)s, n, &code_point);
    if(mblen == UNI_ESTRLN) return (size_t)(-2);
    if(mblen <= 0) {
        errno = EILSEQ;
        return (size_t)(-1);
    }
    // The caller may pass NULL when only the length is of interest
    if(pc32 != NULL) *pc32 = code_point;
    if(code_point == 0) return 0;
    return (size_t)mblen;
}
// size_t mbrtoc32(
// char32_t *restrict pc32,
// char const *restrict s,
// size_t n,
// mbstate_t *restrict ps
// ) {
// if(s == NULL) {
// return 0;
// }
// char32_t code_point;
// int mblen = utf8_chdec((char8_t *)s, n, &code_point);
// if(mblen == UNI_ESTRLN) return (size_t)(-2);
// if(mblen <= 0) {
// errno = EILSEQ;
// return (size_t)(-1);
// }
// *pc32 = code_point;
// if(code_point == 0) return 0;
// return (size_t)mblen;
// }
// Encodes the UTF-32 code unit c32 as UTF-8 into s.
//
// Returns the number of bytes written, 0 when s is NULL (the state, if one
// was supplied, is reset), or (size_t)-1 with errno = EILSEQ when c32 cannot
// be encoded.
// NOTE(review): ISO C specifies the s == NULL call as equivalent to encoding
// a NUL character into an internal buffer (which returns 1); the 0 returned
// here is kept as-is -- confirm which callers rely on.
size_t c32rtomb(
    char *restrict s,
    char32_t c32,
    mbstate_t *restrict ps
) {
    if(s == NULL) {
        // ps is allowed to be NULL, so only reset a state that exists
        if(ps != NULL) *ps = (mbstate_t) {0};
        return 0;
    }
    int mblen = utf8_chenc((char8_t *)s, 4, c32);
    if(mblen <= 0) {
        errno = EILSEQ;
        return (size_t)(-1);
    }
    return (size_t)mblen;
}
// size_t c32rtomb(
// char *restrict s,
// char32_t c32,
// mbstate_t *restrict ps
// ) {
// if(s == NULL) {
// *ps = (mbstate_t) {0};
// return 0;
// }
// int mblen = utf8_chenc((char8_t *)s, 4, c32);
// if(mblen <= 0) {
// errno = EILSEQ;
// return (size_t)(-1);
// }
// return (size_t)mblen;
// }

View File

@ -1,3 +0,0 @@
*.lib
*.obj
*.o

View File

@ -1,9 +0,0 @@
@echo off
rem Builds unicope.lib from the C sources in src\, using bin\ as scratch space.
rem Switch to the directory of this script so the relative paths below work
rem regardless of where it was started from.
rem NOTE(review): %~pd0 is presumably equivalent to the usual %~dp0 - confirm.
pushd %~pd0
if not exist bin mkdir bin
pushd bin
rem Disabled MSVC variant of the compile step:
rem cl ..\src\*.c -c -I ..\inc /EHa-
rem Compile every source file without linking; objects land in bin\
clang ..\src\*.c -c -I ..\inc
rem Archive the object files into unicope.lib one level above bin\
lib /out:..\unicope.lib *.o
popd
popd

View File

@ -1,4 +0,0 @@
Copyright © 2000 Sunagatov Denis yyakut.ac@gmail.com
This work is free. You can redistribute it and/or modify it under the
terms of the Do What The Fuck You Want To Public License, Version 2,
as published by Sam Hocevar. See the COPYING file for more details.

View File

@ -1,128 +0,0 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
typedef unsigned char char8_t;
typedef uint_least16_t char16_t;
typedef uint_least32_t char32_t;
#define UNI_EBADCP (-1)
#define UNI_EULSUR (-2)
#define UNI_EIBYTE (-3)
#define UNI_ETBYTE (-4)
#define UNI_ESTRLN (-5)
#define UNI_EOLONG (-6)
#define UCHAR_MAJOR_CAT_MASK 0x38
#define UCHAR_MINOR_CAT_MASK 0x07
#define UCHAR_CAT_GEN_MASK 0x3f
#define UCHAR_MIRR_BIT 0x40
#define UCHAR_C 0x08
#define UCHAR_L 0x10
#define UCHAR_M 0x18
#define UCHAR_N 0x20
#define UCHAR_P 0x28
#define UCHAR_S 0x30
#define UCHAR_Z 0x38
enum Unicode_General_Category {
UCHAR_Cc = UCHAR_C | 0x00,
UCHAR_Cf = UCHAR_C | 0x01,
UCHAR_Co = UCHAR_C | 0x02,
UCHAR_Cs = UCHAR_C | 0x03,
UCHAR_Ll = UCHAR_L | 0x00,
UCHAR_Lm = UCHAR_L | 0x01,
UCHAR_Lo = UCHAR_L | 0x02,
UCHAR_Lt = UCHAR_L | 0x03,
UCHAR_Lu = UCHAR_L | 0x04,
UCHAR_Mc = UCHAR_M | 0x00,
UCHAR_Me = UCHAR_M | 0x01,
UCHAR_Mn = UCHAR_M | 0x02,
UCHAR_Nd = UCHAR_N | 0x00,
UCHAR_Nl = UCHAR_N | 0x01,
UCHAR_No = UCHAR_N | 0x02,
UCHAR_Pc = UCHAR_P | 0x00,
UCHAR_Pd = UCHAR_P | 0x01,
UCHAR_Pe = UCHAR_P | 0x02,
UCHAR_Pf = UCHAR_P | 0x03,
UCHAR_Pi = UCHAR_P | 0x04,
UCHAR_Po = UCHAR_P | 0x05,
UCHAR_Ps = UCHAR_P | 0x06,
UCHAR_Sc = UCHAR_S | 0x00,
UCHAR_Sk = UCHAR_S | 0x01,
UCHAR_Sm = UCHAR_S | 0x02,
UCHAR_So = UCHAR_S | 0x03,
UCHAR_Zl = UCHAR_Z | 0x00,
UCHAR_Zp = UCHAR_Z | 0x01,
UCHAR_Zs = UCHAR_Z | 0x02,
};
enum Unicode_Bidi_Class {
UCHAR_BIDI_AL,
UCHAR_BIDI_AN,
UCHAR_BIDI_B,
UCHAR_BIDI_BN,
UCHAR_BIDI_CS,
UCHAR_BIDI_EN,
UCHAR_BIDI_ES,
UCHAR_BIDI_ET,
UCHAR_BIDI_FSI,
UCHAR_BIDI_L,
UCHAR_BIDI_LRE,
UCHAR_BIDI_LRI,
UCHAR_BIDI_LRO,
UCHAR_BIDI_NSM,
UCHAR_BIDI_ON,
UCHAR_BIDI_PDF,
UCHAR_BIDI_PDI,
UCHAR_BIDI_R,
UCHAR_BIDI_RLE,
UCHAR_BIDI_RLI,
UCHAR_BIDI_RLO,
UCHAR_BIDI_S,
UCHAR_BIDI_WS,
};
enum Unicode_Decomposition {
UCHAR_DECOMP_CANON,
UCHAR_DECOMP_FONT,
UCHAR_DECOMP_NOBREAK,
UCHAR_DECOMP_INITIAL,
UCHAR_DECOMP_MEDIAL,
UCHAR_DECOMP_FINAL,
UCHAR_DECOMP_ISOLATED,
UCHAR_DECOMP_CIRCLE,
UCHAR_DECOMP_SUPER,
UCHAR_DECOMP_SUB,
UCHAR_DECOMP_VERTICAL,
UCHAR_DECOMP_WIDE,
UCHAR_DECOMP_NARROW,
UCHAR_DECOMP_SMALL,
UCHAR_DECOMP_SQUARE,
UCHAR_DECOMP_FRACTION,
UCHAR_DECOMP_COMPAT,
};
// Character functions
char32_t const *uni_dec_map (char32_t cp, int *num);
int uni_valid (char32_t cp);
int uni_classify(char32_t cp);
char32_t uni_tolower (char32_t cp);
char32_t uni_toupper (char32_t cp);
char32_t uni_totitle (char32_t cp);
int uni_is_hsur (char16_t cp);
int uni_is_lsur (char16_t cp);
char32_t uni_surtoc (char16_t hsur, char16_t lsur);
// UTF-16 encoding
int utf16_chlen(char16_t const *s);
int utf16_chdec(char16_t const *restrict s, size_t len, char32_t *restrict c);
int utf16_chenc(char16_t *s, size_t len, char32_t c);
// UTF-8 encoding
int utf8_chlen(char8_t const *s);
int utf8_chdec(char8_t const *restrict s, size_t len, char32_t *restrict c);
int utf8_chenc(char8_t *s, size_t len, char32_t c);

View File

@ -1,13 +0,0 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004
Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>
Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. You just DO WHAT THE FUCK YOU WANT TO.

View File

@ -1,166 +0,0 @@
Unicope - a C11 library for unicode processing. This library provides the user
with functions related to unicode processing as well as with the unicode data,
like character category, their name, numeric value, et cetera.
To use the library simply link your code with unicope.lib and add unicope.h to
your include paths.
===============================================================================
1. TYPES
===============================================================================
char8_t - type representing UTF-8 code unit.
char16_t - type representing UTF-16 code unit.
char32_t - type representing UTF-32 code unit.
uchar_t - signed integer type capable of holding any Unicode codepoint.
char16_t and char32_t are compatible with the corresponding types defined in
uchar.h. char8_t, char16_t and char32_t are unsigned; uchar_t is signed.
enum Unicode_General_Category
This type holds enumeration constants for unicode characters' general
categories.
enum Unicode_Bidi_Class
This type holds enumeration constants for the bi-directional class of
unicode characters.
enum Unicode_Decomposition
This type holds enumeration constants for the character's decomposition
type.
struct uchar_props,
uchar_props
These types hold the data associated with each unicode character.
===============================================================================
2. CHARACTER API
===============================================================================
int uni_valid(uchar_t cp);
PARAMETERS:
cp - any integer that might represent a codepoint
RETURN VALUE:
Returns non-zero value if cp is a valid codepoint. Returns zero otherwise.
A codepoint is considered valid if it doesn't lie in the surrogate range
U+D800 to U+DFFF, is non-negative and is less than U+110000.
int uni_classify(uchar_t cp);
DESCRIPTION:
Returns the classification of a Unicode codepoint.
RETURN VALUE:
Returns a value of type `enum Unicode_General_Category`, corresponding to
the general character category.
uchar_t uni_tolower(uchar_t cp);
RETURN VALUE:
Returns the lowercase form of cp, if such is defined. Otherwise returns cp
unchanged.
uchar_t uni_toupper(uchar_t cp);
RETURN VALUE:
Returns the uppercase form of cp, if such is defined. Otherwise returns cp
unchanged.
uchar_t uni_totitle(uchar_t cp);
RETURN VALUE:
Returns the titlecase form of cp, if such is defined. Otherwise returns cp
unchanged. Note that titlecase is different from both uppercase and
lowercase. For example, U+01F1 LATIN CAPITAL LETTER DZ will be converted to
U+01F2 LATIN CAPITAL LETTER D WITH SMALL LETTER Z.
int uni_is_hsur(char16_t cp);
RETURN VALUE:
Returns non-zero value iff the value is a high surrogate.
int uni_is_lsur(char16_t cp);
RETURN VALUE:
Returns non-zero value iff the value is a low surrogate.
uchar_t uni_surtoc(char16_t hsur, char16_t lsur);
PARAMETERS:
hsur - a correct high surrogate codepoint
lsur - a correct low surrogate codepoint
RETURN VALUE:
A unicode character that is encoded by the given surrogate pair
===============================================================================
3. UTF16 ENCODING/DECODING
===============================================================================
int utf16_chlen(char16_t const *s);
DESCRIPTION:
Returns the length of the first unicode character in the UTF-16 string s.
RETURN VALUE:
UNI_EULSUR if s points to a low surrogate code unit
otherwise returns the length of the UTF-16 character pointed to by s
int utf16_chdec(char16_t const *restrict s, size_t len, uchar_t *restrict c);
DESCRIPTION:
Decode the first character in the UTF-16 string s.
PARAMETERS:
s - A (possibly-invalid) UTF-16 string.
len - the number of code units in the string
c - pointer to uchar_t that receives the decoded character. can be NULL
RETURN VALUE:
Returns the number of code units the character occupies, or:
UNI_EULSUR - the string s points to a low surrogate code unit
UNI_EBADCP - the decoded character has a value larger than u+10ffff
UNI_ESTRLN - if a character wasn't fully encoded in a string
0 - if the len is zero
NOTES:
In case of character encoding error (UNI_EULSUR or UNI_EBADCP) the
character returned is 0xfffd (substitution character). In case of other
abnormal states (UNI_ESTRLN or length is zero) the character is not
modified.
EXAMPLE:
-------------------------------------------------------------------------------
// This example shows char-by-char processing of a unicode string
char16_t string[] = u"Улыбок тебе дед макар";
char16_t *str = string;
size_t str_len = sizeof(string)/sizeof(string[0])-1;
// Process a length-bounded string
int ch_len = 0;
uchar_t ch;
while((ch_len = utf16_chdec(str, str_len, &ch)) > 0) {
printf("\t%u\n", ch);
str += ch_len;
str_len -= ch_len;
}
if(ch_len < 0) ;// error_handle
// Process a nul-terminated string
int ch_len = 0;
uchar_t ch;
while((ch_len = utf16_chdec(str, 2, &ch)) > 0 && ch != 0) {
printf("\t%u\n", ch);
str += ch_len;
}
if(ch_len < 0) ;// error_handle
-------------------------------------------------------------------------------
int utf16_chenc(char16_t *s, size_t len, uchar_t c);
DESCRIPTION:
Encode a unicode character into UTF-16 string
PARAMETERS:
s - a pointer to the place where the character should be written to
len - the maximum size of the string
c - a unicode character to encode.
RETURN VALUE:
UNI_EBADCP - the provided codepoint is invalid
UNI_ESTRLN - not enough space in a string to encode a character
otherwise returns the number of code units written into the string
NOTES:
In case of error the contents of the string s are not modified

View File

@ -1,115 +0,0 @@
#include <unicope.h>
#include "data/unicode.h"
#define countof(tab) (sizeof(tab)/sizeof(tab)[0])
// Returns 1 when ch lies in 0..0xd7ff or 0xe000..0x10ffff (i.e. it is not a
// surrogate and does not exceed 0x10ffff), 0 otherwise.
int uni_valid(char32_t ch) {
    if(ch <= 0xd7ff) {
        return 1;
    }
    return 0xe000 <= ch && ch <= 0x10ffff;
}
// Returns 1 when ch is in the high-surrogate range 0xd800..0xdbff, else 0.
int uni_is_hsur(char16_t ch) {
    return !(ch < 0xd800 || ch > 0xdbff);
}
// Returns 1 when ch is in the low-surrogate range 0xdc00..0xdfff, else 0.
int uni_is_lsur(char16_t ch) {
    return !(ch < 0xdc00 || ch > 0xdfff);
}
// Combines a surrogate pair into one codepoint: the low 10 bits of hsur
// become the upper half, the low 10 bits of lsur the lower half, and the
// result is offset by 0x10000. The arguments are not validated.
char32_t uni_surtoc(char16_t hsur, char16_t lsur) {
    char32_t high_bits = (0x3ff & hsur) << 10;
    char32_t low_bits = lsur & 0x3ff;
    return 0x10000 + (high_bits | low_bits);
}
// Returns the general-category value of cp (the cat_gen field masked with
// UCHAR_CAT_GEN_MASK), or -1 when cp is not a valid codepoint or lies
// beyond the range covered by the category tables.
int uni_classify(char32_t cp) {
    if(!uni_valid(cp)) {
        return -1;
    }
    // The tables are organised in blocks of 128 codepoints
    size_t range = cp >> 7;
    // Reject blocks the index table does not cover. The previous check was
    // inverted: it rejected every covered block and indexed past the end of
    // the table for the uncovered ones.
    if(range >= countof(cat_block_indices)) {
        return -1;
    }
    int offset = cp & 0x7f;
    int block = cat_block_indices[range];
    int cat = char_cat_props[block][offset].cat_gen & UCHAR_CAT_GEN_MASK;
    return cat;
}
static const char32_t fdfa_map[] = {
0x0635, 0x0644, 0x0649, 0x0020, 0x0627, 0x0644,
0x0644, 0x0647, 0x0020, 0x0639, 0x0644, 0x064a,
0x0647, 0x0020, 0x0648, 0x0633, 0x0644, 0x0645,
};
// Looks up the decomposition mapping of cp.
//
// Returns a pointer to the mapping and stores its length in *num, or
// returns NULL (with *num set to 0) when the tables hold no entry for cp.
// U+FDFA is served from a dedicated 18-element table instead of the
// per-character dec_map arrays.
char32_t const *uni_dec_map(char32_t cp, int *num) {
    if(cp == 0xfdfa) {
        *num = 18;
        return fdfa_map;
    }
    if(cp < 0x10000) {
        // The table's length comes from the generated data and need not
        // span the whole 16-bit range, so bound the index before reading.
        if(cp < countof(unicode_data16) && unicode_data16[cp].code == cp) {
            *num = unicode_data16[cp].dec_map_n;
            return unicode_data16[cp].dec_map;
        }
    }
    else if((cp - 0x10000) < countof(unicode_data32)) {
        if(unicode_data32[cp-0x10000].code == cp) {
            *num = unicode_data32[cp-0x10000].dec_map_n;
            return unicode_data32[cp-0x10000].dec_map;
        }
    }
    // No entry: leave a defined length behind for the caller
    *num = 0;
    return NULL;
}
// Returns the `lower` field recorded for cp in the Unicode data tables, or
// cp unchanged when cp is invalid or has no table entry.
char32_t uni_tolower(char32_t cp) {
    if(!uni_valid(cp)) {
        return cp;
    }
    if(cp < 0x10000) {
        // The table's length comes from the generated data and need not
        // span the whole 16-bit range, so bound the index before reading.
        if(cp < countof(unicode_data16) && unicode_data16[cp].code == cp) {
            return unicode_data16[cp].lower;
        }
    }
    else if((cp - 0x10000) < countof(unicode_data32)) {
        if(unicode_data32[cp-0x10000].code == cp) {
            return unicode_data32[cp-0x10000].lower;
        }
    }
    return cp;
}
// Returns the `upper` field recorded for cp in the Unicode data tables, or
// cp unchanged when cp is invalid or has no table entry.
char32_t uni_toupper(char32_t cp) {
    if(!uni_valid(cp)) {
        return cp;
    }
    if(cp < 0x10000) {
        // The table's length comes from the generated data and need not
        // span the whole 16-bit range, so bound the index before reading.
        if(cp < countof(unicode_data16) && unicode_data16[cp].code == cp) {
            return unicode_data16[cp].upper;
        }
    }
    else if((cp - 0x10000) < countof(unicode_data32)) {
        if(unicode_data32[cp-0x10000].code == cp) {
            return unicode_data32[cp-0x10000].upper;
        }
    }
    return cp;
}
// Returns the `title` field recorded for cp in the Unicode data tables, or
// cp unchanged when cp is invalid or has no table entry.
char32_t uni_totitle(char32_t cp) {
    // Only invalid codepoints are passed through untouched. The previous
    // test lacked the negation, so every valid codepoint was returned
    // unmapped and only invalid ones reached the table lookup.
    if(!uni_valid(cp)) {
        return cp;
    }
    if(cp < 0x10000) {
        // The table's length comes from the generated data and need not
        // span the whole 16-bit range, so bound the index before reading.
        if(cp < countof(unicode_data16) && unicode_data16[cp].code == cp) {
            return unicode_data16[cp].title;
        }
    }
    else if((cp - 0x10000) < countof(unicode_data32)) {
        if(unicode_data32[cp-0x10000].code == cp) {
            return unicode_data32[cp-0x10000].title;
        }
    }
    return cp;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,207 +0,0 @@
import os;
import sys;
abspath = os.path.abspath(sys.argv[0])
dname = os.path.dirname(abspath)
os.chdir(dname)
with open('unicode.h', 'w') as header:
header.write('\n');
header.write('#pragma once\n\n');
header.write('#include <unicope.h>\n');
header.write(
'''
#pragma pack(push, 1)
typedef struct char_cat_props_t char_cat_props_t;
struct char_cat_props_t {
uint16_t comb_class;
uint8_t cat_gen;
uint8_t cat_bidi;
};
typedef struct char32_props char32_props;
struct char32_props {
double num_value;
char32_t const dec_map[8];
uint8_t dec_type;
char32_t code;
char32_t lower;
char32_t upper;
char32_t title;
uint8_t dec_map_n;
};
typedef struct char16_props char16_props;
struct char16_props {
double num_value;
char32_t const dec_map[8];
uint8_t dec_type;
char16_t code;
char16_t lower;
char16_t upper;
char16_t title;
uint8_t dec_map_n;
};\n
''');
header.write('static char const name_buff[] = {\n');
header.write(' "\\0"\n');
offsets = [0]
offset = 1
prev_idx = -1
with open('UnicodeData.txt') as file:
for line in file:
row = line.split(';')
code = row[0].strip()
name = row[1].strip()
header.write(' "%s\\0"\n' % (name))
offset += len(name) + 1
idx = int(code, 16)
while idx - prev_idx != 1:
offsets.append(0)
prev_idx += 1
prev_idx = idx
if code == 'F0000':
break
header.write('};\n\n');
header.write('static char const *char_names[] = {\n');
for o in offsets:
header.write(' name_buff+%s,\n' % (o))
header.write('};\n\n');
header.write('static uint8_t cat_block_indices[] = {\n');
block_size = 2**7
block_id = 0
block_ent_id = 0
cur_block = []
blocks = []
def blkcmp(b1, b2):
    """Return True when the first ``block_size`` entries of *b1* and *b2* are equal."""
    return all(b1[i] == b2[i] for i in range(block_size))
with open('UnicodeData.txt') as file:
for line in file:
row = line.split(';')
code = int(row[0].strip(), 16)
cat_gen = row[2].strip()
cat_bidi = row[4].strip()
comb_class = row[3].strip()
mirrored = row[9].strip()
cur_block.append((comb_class, cat_gen, cat_bidi, mirrored))
block_ent_id += 1
if block_ent_id == block_size:
existing_found = False
cur_block_id = block_id
for i,b in enumerate(blocks):
if blkcmp(b, cur_block):
existing_found = True
cur_block_id = i
if not existing_found:
blocks.append(cur_block)
block_id += 1
header.write(' %s,\n' % (cur_block_id))
block_ent_id = 0
cur_block = []
header.write('};\n\n')
header.write('static char_cat_props_t char_cat_props[][%s] = {\n' % (block_size));
for b in blocks:
header.write(' {\n')
for d in b:
cat = 'UCHAR_'+d[1];
if d[3] == 'Y':
cat += '|0x40'
header.write(' {%s,%s,UCHAR_BIDI_%s},\n' % (d[0], cat, d[2]))
header.write(' },\n')
header.write('};\n\n');
tab32started = False
prev_idx = -1
header.write('static char16_props unicode_data16[] = {\n')
with open('UnicodeData.txt') as file:
for line in file:
row = line.split(';')
code = row[0].strip()
if int(code, 16) > 0xffff and not tab32started:
header.write('\n};\n\nstatic char32_props unicode_data32[] = {\n')
tab32started=True
prev_idx = -1
name = row[1].strip()
cat_gen = row[2].strip()
cat_bidi = row[4].strip()
comb_class = row[3].strip()
dec_map = row[5].strip()
dec_value = row[6].strip()
dig_value = row[7].strip()
num_value = row[8].strip()
mirrored = row[9].strip()
old_name = row[10].strip()
comment = row[11].strip()
upper = row[12].strip()
lower = row[13].strip()
title = row[14].strip()
# Process decompositional mapping
dec_map_n = 0
dec_type = 'CANON'
if dec_map != '':
dec_map = dec_map.split(' ')
if dec_map[0][0] == '<':
dec_type = dec_map[0][1:-1].upper()
dec_map = dec_map[1:]
dec_map_n = len(dec_map)
if dec_map_n != 0:
dec_map = ', '.join(list(map(lambda x: '0x' + x, dec_map)))
else:
dec_map = '0'
# Make sure lowercase and uppercase mappings are defined
if lower == '':
lower = code
if upper == '':
upper = code
if title == '' or title == '\n':
title = code
# Special cases
if code == 'FDFA':
dec_map = '0'
idx = int(code,16)
if tab32started:
idx -= 0x10000
contiguous = idx - prev_idx == 1
prev_idx = idx
if cat_gen == 'Co':
continue;
header.write(
'''
%s{
.code = 0x%s,
.dec_map_n = %s,
.dec_map = {%s},
.dec_type = UCHAR_DECOMP_%s,
.num_value = %s,
.lower = 0x%s,
.upper = 0x%s,
.title = 0x%s,
},''' % (
'[' + hex(idx) + '] =' if not contiguous else '',
code,
dec_map_n,
dec_map,
dec_type,
num_value if num_value != '' else dec_value if dec_value != '' else dig_value if dig_value != '' else '0',
lower,
upper,
title
));
header.write('\n};\n#pragma pack(pop)\n');
header.close();

View File

@ -1,11 +0,0 @@
The unicode-based functions work based on official unicode data. You can find
the file with Unicode data at:
https://unicode.org/Public/UNIDATA/UnicodeData.txt
This file is placed into this directory under the name UnicodeData.txt. To
update the unicode standard a new file is put under that name, then
unicode_compile.py is run with the Python interpreter. It will generate a new
unicode.h header file.
DO NOT MODIFY unicode.h DIRECTLY BRUH.

File diff suppressed because it is too large Load Diff

View File

@ -1,58 +0,0 @@
#include <unicope.h>
// Returns the number of code units of the character starting at str: 2 for
// a high surrogate, 1 for a non-surrogate unit, or UNI_EULSUR when str
// points at a low surrogate.
int utf16_chlen(char16_t const *str) {
    char16_t unit = str[0];
    if(uni_is_lsur(unit)) {
        return UNI_EULSUR;
    }
    return uni_is_hsur(unit) ? 2 : 1;
}
// Decodes the first character of the UTF-16 string str (len code units).
//
// Returns the number of code units the character occupies, or:
//   0          - len is zero (*chp untouched)
//   UNI_ESTRLN - a high surrogate is the last available unit (*chp untouched)
//   UNI_EULSUR - str starts with a low surrogate
//   UNI_EBADCP - a high surrogate is not followed by a low surrogate
// On UNI_EULSUR and UNI_EBADCP the decoded character is 0xfffd. chp may be
// NULL when the character itself is not needed.
int utf16_chdec(
    char16_t const *restrict str,
    size_t len,
    char32_t *restrict chp
) {
    if(len == 0) return 0;
    int chlen;
    char32_t ch;
    if(uni_is_hsur(str[0])) {
        if(len < 2) return UNI_ESTRLN;
        if(uni_is_lsur(str[1])) {
            // A well-formed pair always yields 0x10000..0x10ffff, so no
            // further range check is needed here.
            ch = uni_surtoc(str[0], str[1]);
            chlen = 2;
        }
        else {
            // An unpaired high surrogate used to be combined with whatever
            // unit followed it, producing a bogus codepoint.
            chlen = UNI_EBADCP;
            ch = 0xfffd;
        }
    }
    else if(!uni_is_lsur(str[0])) {
        ch = str[0];
        chlen = 1;
    }
    else {
        chlen = UNI_EULSUR;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
// Encodes cp as UTF-16 into str, which has room for len code units.
// Returns the number of code units written (1 or 2), UNI_EBADCP when cp is
// not a valid codepoint, or UNI_ESTRLN when str is too short. Nothing is
// written on error.
int utf16_chenc(char16_t *str, size_t len, char32_t cp) {
    if(!uni_valid(cp)) {
        return UNI_EBADCP;
    }
    int units = (cp < 0x10000) ? 1 : 2;
    if(len < (size_t)units) {
        return UNI_ESTRLN;
    }
    if(units == 1) {
        str[0] = cp;
        return 1;
    }
    // Split the 20-bit offset above 0x10000 across a surrogate pair
    char32_t offset = cp - 0x10000;
    str[0] = 0xD800 + (offset >> 10);
    str[1] = 0xDC00 + (offset & 0x3ff);
    return 2;
}

View File

@ -1,79 +0,0 @@
#include <unicope.h>
// Returns the length in bytes (1..4) announced by the lead byte at str, or
// UNI_EIBYTE when that byte cannot start a character (0x80..0xbf or
// 0xf8 and above).
int utf8_chlen(char8_t const *str) {
    char8_t lead = str[0];
    if(lead < 0x80) return 1;
    if(lead >= 0xc0 && lead < 0xe0) return 2;
    if(lead >= 0xe0 && lead < 0xf0) return 3;
    if(lead >= 0xf0 && lead < 0xf8) return 4;
    return UNI_EIBYTE;
}
// Decodes the first character of the UTF-8 string str (len bytes).
//
// Returns the number of bytes the character occupies, or:
//   0          - len is zero (*chp untouched)
//   UNI_ESTRLN - the character needs more bytes than len (*chp untouched)
//   UNI_EIBYTE - the first byte cannot start a character
//   UNI_ETBYTE - a continuation byte is malformed
//   UNI_EOLONG - the character is encoded with more bytes than necessary
//   UNI_EBADCP - the decoded value is not a valid codepoint
// On the last four errors the decoded character is 0xfffd. chp may be NULL
// when the character itself is not needed.
int utf8_chdec(
    char8_t const *restrict str,
    size_t len,
    char32_t *restrict chp
) {
    if(len == 0) return 0;
    int chlen;
    char32_t ch;
    // Smallest codepoint that legitimately needs chlen bytes
    char32_t min_cp = 0;
    if(str[0] < 0x80) ch = str[0], chlen = 1;
    else if(str[0] < 0xc0) ch = 0xfffd, chlen = UNI_EIBYTE;
    else if(str[0] < 0xe0) ch = str[0] & 0x1f, chlen = 2, min_cp = 0x80;
    else if(str[0] < 0xf0) ch = str[0] & 0x0f, chlen = 3, min_cp = 0x800;
    else if(str[0] < 0xf8) ch = str[0] & 0x07, chlen = 4, min_cp = 0x10000;
    else ch = 0xfffd, chlen = UNI_EIBYTE;
    // Compare only positive lengths: a negative error code converted to
    // size_t is huge and used to be misreported as UNI_ESTRLN.
    if(chlen > 0 && (size_t)chlen > len) {
        return UNI_ESTRLN;
    }
    for(int i = 1; i < chlen; ++i) {
        uint8_t trail = str[i];
        if((trail & 0xc0) != 0x80) {
            chlen = UNI_ETBYTE;
            ch = 0xfffd;
            break;
        }
        ch <<= 6;
        ch |= (trail & 0x3f);
    }
    if(chlen > 0 && ch < min_cp) {
        // Overlong form: the value would have fit into a shorter encoding
        chlen = UNI_EOLONG;
        ch = 0xfffd;
    }
    else if(!uni_valid(ch)) {
        chlen = UNI_EBADCP;
        ch = 0xfffd;
    }
    if(chp != NULL) *chp = ch;
    return chlen;
}
// Encodes cp as UTF-8 into str, which has room for len bytes.
// Returns the number of bytes written (1..4), 0 when len is zero,
// UNI_EBADCP when cp is not a valid codepoint, or UNI_ESTRLN when str is
// too short. Nothing is written on error.
int utf8_chenc(char8_t *str, size_t len, char32_t cp) {
    if(!uni_valid(cp)) {
        return UNI_EBADCP;
    }
    if(len == 0) return 0;
    if(cp < 0x80) {
        str[0] = (char8_t)cp;
        return 1;
    }
    if(cp < 0x800) {
        if(len < 2) return UNI_ESTRLN;
        str[0] = 0xc0 | (cp >> 6);
        str[1] = 0x80 | (cp & 0x3f);
        return 2;
    }
    if(cp < 0x10000) {
        if(len < 3) return UNI_ESTRLN;
        // 1110xxxx 10xxxxxx 10xxxxxx: the lead byte carries bits 12..15
        // (it was shifted by 18, which always produced 0xe0).
        str[0] = 0xe0 | (cp >> 12);
        str[1] = 0x80 | ((cp >> 6) & 0x3f);
        str[2] = 0x80 | (cp & 0x3f);
        return 3;
    }
    if(len < 4) return UNI_ESTRLN;
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx: the lead byte is 0xf0 plus
    // bits 18..20, followed by bits 12..17, 6..11 and 0..5.
    str[0] = 0xf0 | (cp >> 18);
    str[1] = 0x80 | ((cp >> 12) & 0x3f);
    str[2] = 0x80 | ((cp >> 6) & 0x3f);
    str[3] = 0x80 | (cp & 0x3f);
    return 4;
}

View File

@ -42,7 +42,8 @@ int iswblank(wint_t wc) {
}
int iswcntrl(wint_t wc) {
return uni_classify(wc) == UCHAR_Cc;
return 0;
//return uni_classify(wc) == UCHAR_Cc;
}
int iswdigit(wint_t wc) {
@ -54,35 +55,36 @@ int iswgraph(wint_t wc) {
}
int iswlower(wint_t wc) {
return uni_classify(wc) == UCHAR_Ll;
return 0;
// return uni_classify(wc) == UCHAR_Ll;
}
int iswprint(wint_t wc) {
switch(uni_classify(wc)) {
case UCHAR_Cc:
case UCHAR_Cf:
case UCHAR_Co:
case UCHAR_Cs:
return 0;
}
// switch(uni_classify(wc)) {
// case UCHAR_Cc:
// case UCHAR_Cf:
// case UCHAR_Co:
// case UCHAR_Cs:
// return 0;
// }
return 1;
}
int iswpunct(wint_t wc) {
switch(uni_classify(wc)) {
case UCHAR_Pc:
case UCHAR_Pd:
case UCHAR_Pe:
case UCHAR_Pf:
case UCHAR_Pi:
case UCHAR_Po:
case UCHAR_Ps:
case UCHAR_Sk:
case UCHAR_Sc:
case UCHAR_Sm:
case UCHAR_So:
return 1;
}
// switch(uni_classify(wc)) {
// case UCHAR_Pc:
// case UCHAR_Pd:
// case UCHAR_Pe:
// case UCHAR_Pf:
// case UCHAR_Pi:
// case UCHAR_Po:
// case UCHAR_Ps:
// case UCHAR_Sk:
// case UCHAR_Sc:
// case UCHAR_Sm:
// case UCHAR_So:
// return 1;
// }
return 0;
}
@ -100,7 +102,8 @@ int iswspace(wint_t wc) {
}
int iswupper(wint_t wc) {
return uni_classify(wc) == UCHAR_Lu;
// return uni_classify(wc) == UCHAR_Lu;
return 0;
}
int iswxdigit(wint_t wc) {
@ -108,9 +111,11 @@ int iswxdigit(wint_t wc) {
}
wint_t towlower(wint_t wc) {
return uni_tolower(wc);
// return uni_tolower(wc);
return wc;
}
wint_t towupper(wint_t wc) {
return uni_toupper(wc);
return wc;
// return uni_toupper(wc);
}

3
utf8.rc Normal file
View File

@ -0,0 +1,3 @@
#include <winuser.h>
CREATEPROCESS_MANIFEST_RESOURCE_ID RT_MANIFEST "utf8.xml"

1
utf8.xml Normal file
View File

@ -0,0 +1 @@
<assembly manifestVersion="1.0" xmlns="urn:schemas-microsoft-com:asm.v1"><assemblyIdentity name="." version="6.0.0.0"/><application><windowsSettings><activeCodePage xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage></windowsSettings></application></assembly>