201 lines
8.3 KiB
C
201 lines
8.3 KiB
C
//*****************************************************************
//
//	$file: utf8.h $
//	$author: Martin Fouilleul $
//	$date: 05/11/2016 $
//	$revision: $
//	$note: (C) 2016 by Martin Fouilleul - all rights reserved $
//
//*****************************************************************
|
|
#ifndef __UTF8_H_
|
|
#define __UTF8_H_
|
|
|
|
#include"typedefs.h"
|
|
#include"strings.h"
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
//NOTE: type used throughout this header to hold a single codepoint
//NOTE(review): u32 is presumably the 32-bit unsigned integer from typedefs.h - confirm
typedef u32 utf32;
|
|
|
|
//-----------------------------------------------------------------
|
|
//NOTE: getting sizes / offsets / indices
|
|
//-----------------------------------------------------------------
|
|
MP_API u32 utf8_size_from_leading_char(char leadingChar);
|
|
MP_API u32 utf8_codepoint_size(utf32 codePoint);
|
|
|
|
MP_API u64 utf8_codepoint_count_for_string(str8 string);
|
|
MP_API u64 utf8_byte_count_for_codepoints(str32 codePoints);
|
|
|
|
MP_API u64 utf8_next_offset(str8 string, u64 byteOffset);
|
|
MP_API u64 utf8_prev_offset(str8 string, u64 byteOffset);
|
|
|
|
//-----------------------------------------------------------------
|
|
//NOTE: encoding / decoding
|
|
//-----------------------------------------------------------------
|
|
typedef struct utf8_dec
|
|
{
|
|
utf32 codepoint; //NOTE: decoded codepoint
|
|
u32 size; //NOTE: size of corresponding utf8 sequence
|
|
} utf8_dec;
|
|
|
|
MP_API utf8_dec utf8_decode(str8 string); //NOTE: decode a single utf8 sequence at start of string
|
|
MP_API utf8_dec utf8_decode_at(str8 string, u64 offset); //NOTE: decode a single utf8 sequence starting at byte offset
|
|
MP_API str8 utf8_encode(char* dst, utf32 codePoint); //NOTE: encode codepoint into backing buffer dst
|
|
|
|
MP_API str32 utf8_to_codepoints(u64 maxCount, utf32* backing, str8 string);
|
|
MP_API str8 utf8_from_codepoints(u64 maxBytes, char* backing, str32 codePoints);
|
|
|
|
MP_API str32 utf8_push_to_codepoints(mem_arena* arena, str8 string);
|
|
MP_API str8 utf8_push_from_codepoints(mem_arena* arena, str32 codePoints);
|
|
|
|
//-----------------------------------------------------------------
|
|
// utf8 range struct and X-macros for defining utf8 ranges
|
|
//-----------------------------------------------------------------
|
|
|
|
typedef struct unicode_range
|
|
{
|
|
utf32 firstCodePoint;
|
|
u32 count;
|
|
} unicode_range;
|
|
|
|
//NOTE(martin): ranges declared here are defined in utf8.cpp
//              they can be used by prefixing them with UNICODE_RANGE_, as in 'UNICODE_RANGE_BASIC_LATIN'
|
|
//NOTE: X-macro table of unicode blocks. Each entry has the form
//          UNICODE_RANGE(first codepoint, span, NAME)
//      where span is the offset from the first to the last codepoint of the block
//      (eg BASIC_LATIN covers 0x0000..0x007f and is listed with a span of 127).
//      To use the table, define UNICODE_RANGE(start, count, name), expand UNICODE_RANGES,
//      then #undef UNICODE_RANGE.
//NOTE: MALAYALAM covers 0x0d00..0x0d7f, hence a span of 127; a larger span would overlap
//      SINHALA, which starts at 0x0d80.
#define UNICODE_RANGES \
	UNICODE_RANGE(0x0000, 127, BASIC_LATIN) \
	UNICODE_RANGE(0x0080, 127, C1_CONTROLS_AND_LATIN_1_SUPPLEMENT) \
	UNICODE_RANGE(0x0100, 127, LATIN_EXTENDED_A) \
	UNICODE_RANGE(0x0180, 207, LATIN_EXTENDED_B) \
	UNICODE_RANGE(0x0250, 95, IPA_EXTENSIONS) \
	UNICODE_RANGE(0x02b0, 79, SPACING_MODIFIER_LETTERS) \
	UNICODE_RANGE(0x0300, 111, COMBINING_DIACRITICAL_MARKS) \
	UNICODE_RANGE(0x0370, 143, GREEK_COPTIC) \
	UNICODE_RANGE(0x0400, 255, CYRILLIC) \
	UNICODE_RANGE(0x0500, 47, CYRILLIC_SUPPLEMENT) \
	UNICODE_RANGE(0x0530, 95, ARMENIAN) \
	UNICODE_RANGE(0x0590, 111, HEBREW) \
	UNICODE_RANGE(0x0600, 255, ARABIC) \
	UNICODE_RANGE(0x0700, 79, SYRIAC) \
	UNICODE_RANGE(0x0780, 63, THAANA) \
	UNICODE_RANGE(0x0900, 127, DEVANAGARI) \
	UNICODE_RANGE(0x0980, 127, BENGALI_ASSAMESE) \
	UNICODE_RANGE(0x0a00, 127, GURMUKHI) \
	UNICODE_RANGE(0x0a80, 127, GUJARATI) \
	UNICODE_RANGE(0x0b00, 127, ORIYA) \
	UNICODE_RANGE(0x0b80, 127, TAMIL) \
	UNICODE_RANGE(0x0c00, 127, TELUGU) \
	UNICODE_RANGE(0x0c80, 127, KANNADA) \
	UNICODE_RANGE(0x0d00, 127, MALAYALAM) \
	UNICODE_RANGE(0x0d80, 127, SINHALA) \
	UNICODE_RANGE(0x0e00, 127, THAI) \
	UNICODE_RANGE(0x0e80, 127, LAO) \
	UNICODE_RANGE(0x0f00, 255, TIBETAN) \
	UNICODE_RANGE(0x1000, 159, MYANMAR) \
	UNICODE_RANGE(0x10a0, 95, GEORGIAN) \
	UNICODE_RANGE(0x1100, 255, HANGUL_JAMO) \
	UNICODE_RANGE(0x1200, 383, ETHIOPIC) \
	UNICODE_RANGE(0x13a0, 95, CHEROKEE) \
	UNICODE_RANGE(0x1400, 639, UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS) \
	UNICODE_RANGE(0x1680, 31, OGHAM) \
	UNICODE_RANGE(0x16a0, 95, RUNIC) \
	UNICODE_RANGE(0x1700, 31, TAGALOG) \
	UNICODE_RANGE(0x1720, 31, HANUNOO) \
	UNICODE_RANGE(0x1740, 31, BUHID) \
	UNICODE_RANGE(0x1760, 31, TAGBANWA) \
	UNICODE_RANGE(0x1780, 127, KHMER) \
	UNICODE_RANGE(0x1800, 175, MONGOLIAN) \
	UNICODE_RANGE(0x1900, 79, LIMBU) \
	UNICODE_RANGE(0x1950, 47, TAI_LE) \
	UNICODE_RANGE(0x19e0, 31, KHMER_SYMBOLS) \
	UNICODE_RANGE(0x1d00, 127, PHONETIC_EXTENSIONS) \
	UNICODE_RANGE(0x1e00, 255, LATIN_EXTENDED_ADDITIONAL) \
	UNICODE_RANGE(0x1f00, 255, GREEK_EXTENDED) \
	UNICODE_RANGE(0x2000, 111, GENERAL_PUNCTUATION) \
	UNICODE_RANGE(0x2070, 47, SUPERSCRIPTS_AND_SUBSCRIPTS) \
	UNICODE_RANGE(0x20a0, 47, CURRENCY_SYMBOLS) \
	UNICODE_RANGE(0x20d0, 47, COMBINING_DIACRITICAL_MARKS_FOR_SYMBOLS) \
	UNICODE_RANGE(0x2100, 79, LETTERLIKE_SYMBOLS) \
	UNICODE_RANGE(0x2150, 63, NUMBER_FORMS) \
	UNICODE_RANGE(0x2190, 111, ARROWS) \
	UNICODE_RANGE(0x2200, 255, MATHEMATICAL_OPERATORS) \
	UNICODE_RANGE(0x2300, 255, MISCELLANEOUS_TECHNICAL) \
	UNICODE_RANGE(0x2400, 63, CONTROL_PICTURES) \
	UNICODE_RANGE(0x2440, 31, OPTICAL_CHARACTER_RECOGNITION) \
	UNICODE_RANGE(0x2460, 159, ENCLOSED_ALPHANUMERICS) \
	UNICODE_RANGE(0x2500, 127, BOX_DRAWING) \
	UNICODE_RANGE(0x2580, 31, BLOCK_ELEMENTS) \
	UNICODE_RANGE(0x25a0, 95, GEOMETRIC_SHAPES) \
	UNICODE_RANGE(0x2600, 255, MISCELLANEOUS_SYMBOLS) \
	UNICODE_RANGE(0x2700, 191, DINGBATS) \
	UNICODE_RANGE(0x27c0, 47, MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A) \
	UNICODE_RANGE(0x27f0, 15, SUPPLEMENTAL_ARROWS_A) \
	UNICODE_RANGE(0x2800, 255, BRAILLE_PATTERNS) \
	UNICODE_RANGE(0x2900, 127, SUPPLEMENTAL_ARROWS_B) \
	UNICODE_RANGE(0x2980, 127, MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B) \
	UNICODE_RANGE(0x2a00, 255, SUPPLEMENTAL_MATHEMATICAL_OPERATORS) \
	UNICODE_RANGE(0x2b00, 255, MISCELLANEOUS_SYMBOLS_AND_ARROWS) \
	UNICODE_RANGE(0x2e80, 127, CJK_RADICALS_SUPPLEMENT) \
	UNICODE_RANGE(0x2f00, 223, KANGXI_RADICALS) \
	UNICODE_RANGE(0x2ff0, 15, IDEOGRAPHIC_DESCRIPTION_CHARACTERS) \
	UNICODE_RANGE(0x3000, 63, CJK_SYMBOLS_AND_PUNCTUATION) \
	UNICODE_RANGE(0x3040, 95, HIRAGANA) \
	UNICODE_RANGE(0x30a0, 95, KATAKANA) \
	UNICODE_RANGE(0x3100, 47, BOPOMOFO) \
	UNICODE_RANGE(0x3130, 95, HANGUL_COMPATIBILITY_JAMO) \
	UNICODE_RANGE(0x3190, 15, KANBUN_KUNTEN) \
	UNICODE_RANGE(0x31a0, 31, BOPOMOFO_EXTENDED) \
	UNICODE_RANGE(0x31f0, 15, KATAKANA_PHONETIC_EXTENSIONS) \
	UNICODE_RANGE(0x3200, 255, ENCLOSED_CJK_LETTERS_AND_MONTHS) \
	UNICODE_RANGE(0x3300, 255, CJK_COMPATIBILITY) \
	UNICODE_RANGE(0x3400, 6591, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) \
	UNICODE_RANGE(0x4dc0, 63, YIJING_HEXAGRAM_SYMBOLS) \
	UNICODE_RANGE(0x4e00, 20911, CJK_UNIFIED_IDEOGRAPHS) \
	UNICODE_RANGE(0xa000, 1167, YI_SYLLABLES) \
	UNICODE_RANGE(0xa490, 63, YI_RADICALS) \
	UNICODE_RANGE(0xac00, 11183, HANGUL_SYLLABLES) \
	UNICODE_RANGE(0xd800, 1023, HIGH_SURROGATE_AREA) \
	UNICODE_RANGE(0xdc00, 1023, LOW_SURROGATE_AREA) \
	UNICODE_RANGE(0xe000, 6399, PRIVATE_USE_AREA) \
	UNICODE_RANGE(0xf900, 511, CJK_COMPATIBILITY_IDEOGRAPHS) \
	UNICODE_RANGE(0xfb00, 79, ALPHABETIC_PRESENTATION_FORMS) \
	UNICODE_RANGE(0xfb50, 687, ARABIC_PRESENTATION_FORMS_A) \
	UNICODE_RANGE(0xfe00, 15, VARIATION_SELECTORS) \
	UNICODE_RANGE(0xfe20, 15, COMBINING_HALF_MARKS) \
	UNICODE_RANGE(0xfe30, 31, CJK_COMPATIBILITY_FORMS) \
	UNICODE_RANGE(0xfe50, 31, SMALL_FORM_VARIANTS) \
	UNICODE_RANGE(0xfe70, 143, ARABIC_PRESENTATION_FORMS_B) \
	UNICODE_RANGE(0xff00, 239, HALFWIDTH_AND_FULLWIDTH_FORMS) \
	UNICODE_RANGE(0xfff0, 15, SPECIALS) \
	UNICODE_RANGE(0x10000, 127, LINEAR_B_SYLLABARY) \
	UNICODE_RANGE(0x10080, 127, LINEAR_B_IDEOGRAMS) \
	UNICODE_RANGE(0x10100, 63, AEGEAN_NUMBERS) \
	UNICODE_RANGE(0x10300, 47, OLD_ITALIC) \
	UNICODE_RANGE(0x10330, 31, GOTHIC) \
	UNICODE_RANGE(0x10380, 31, UGARITIC) \
	UNICODE_RANGE(0x10400, 79, DESERET) \
	UNICODE_RANGE(0x10450, 47, SHAVIAN) \
	UNICODE_RANGE(0x10480, 47, OSMANYA) \
	UNICODE_RANGE(0x10800, 63, CYPRIOT_SYLLABARY) \
	UNICODE_RANGE(0x1d000, 255, BYZANTINE_MUSICAL_SYMBOLS) \
	UNICODE_RANGE(0x1d100, 255, MUSICAL_SYMBOLS) \
	UNICODE_RANGE(0x1d300, 95, TAI_XUAN_JING_SYMBOLS) \
	UNICODE_RANGE(0x1d400, 1023, MATHEMATICAL_ALPHANUMERIC_SYMBOLS) \
	UNICODE_RANGE(0x20000, 42719, CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B) \
	UNICODE_RANGE(0x2f800, 543, CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT) \
	UNICODE_RANGE(0xe0000, 127, TAGS) \
	UNICODE_RANGE(0xe0100, 239, VARIATION_SELECTORS_SUPPLEMENT) \
	UNICODE_RANGE(0xf0000, 65533, SUPPLEMENTARY_PRIVATE_USE_AREA_A) \
	UNICODE_RANGE(0x100000, 65533, SUPPLEMENTARY_PRIVATE_USE_AREA_B)
|
|
|
|
#define UNICODE_RANGE(start, count, name) \
|
|
MP_API extern const unicode_range _cat2_(UNICODE_RANGE_, name);
|
|
UNICODE_RANGES
|
|
#undef UNICODE_RANGE
|
|
|
|
#ifdef __cplusplus
|
|
} // extern "C"
|
|
#endif
|
|
|
|
#endif //__UTF8_H_
|