Annotation-System/hmmlib2/hmmlib.h

860 lines
21 KiB
C

#ifndef HMMLIB_H_
#define HMMLIB_H_
#include <stddef.h>
#include <stdio.h>
// Data structures
// One `credit=` / `uncredit=` attribute of the [video] tag, split at its first ':'.
typedef struct {
    char *name; // text before the first ':' (the whole value when there is no ':')
    char *role; // text after the first ':'; NULL when the value has no ':'
} HMML_Credit;
// A [video] attribute whose key is not one of the built-in ones, kept verbatim.
typedef struct {
    char *key;   // attribute name as written in the source
    char *value; // attribute value (quotes removed, escapes resolved)
} HMML_VideoCustomMetaData;
// Everything read from the attributes of the opening [video ...] tag.
// Each string field stays NULL when its attribute is absent.
typedef struct {
    // built-in string attributes; each field is filled from the attribute of the same name
    char *stream_platform;
    char *project;
    char *title;
    char *vod_platform;
    char *id;
    char *output;
    char *template;
    char *medium;
    char *number;
    char *cc_lang;

    // one entry per `credit=` attribute
    HMML_Credit *credits;
    size_t       credit_count;

    // one entry per `uncredit=` attribute
    HMML_Credit *uncredits;
    size_t       uncredit_count;

    // every attribute with any other key
    HMML_VideoCustomMetaData *custom;
    size_t                    custom_count;
} HMML_VideoMetaData;
// A [ref ...] node found inside a timestamp's text.
// Each string field is filled from the attribute of the same name and is NULL when absent.
typedef struct {
    char *site;
    char *page;
    char *url;
    char *title;
    char *article;
    char *author;
    char *editor;
    char *publisher;
    char *isbn;
    int   offset; // byte offset into the timestamp text where the reference appeared
} HMML_Reference;
// Kind of a marker, selected by the character that introduces it.
typedef enum {
    HMML_CATEGORY = 0, // introduced by ':'
    HMML_MEMBER   = 1, // introduced by '@'
    HMML_PROJECT  = 2, // introduced by '~'
    HMML_MARKER_COUNT  // number of marker kinds above
} HMML_MarkerType;
// A `:category`, `@member` or `~project` marker, either bare or in the bracketed
// extended form that can also carry an episode and a parameter.
typedef struct {
    HMML_MarkerType type;
    char *marker;    // name after the type character; lower-cased for entries of a timestamp's marker array
    char *parameter; // extended form only: remaining text before ']', NULL otherwise
    char *episode;   // extended form only: text following '#', NULL otherwise
    int   offset;    // byte offset into the timestamp text, or -1 for markers parsed outside of it
} HMML_Marker;
// Optional `[quote <id>]` / `[quote <author> <id>]` node attached to a timestamp.
typedef struct {
    _Bool present; // 1 when the timestamp has a quote node
    int   id;      // numeric quote id
    char *author;  // set only by the two-argument form, NULL otherwise
} HMML_Quote;
// One timestamped annotation: timecode, text and everything attached to the text.
typedef struct {
    int line; // parser line counter at the start of this timestamp

    // timecode components
    int h;
    int m;
    int s;
    int ms;

    char *text;   // annotation text with markers/references flattened, trailing whitespace removed
    char *author; // name from a leading [@author] node, NULL if there was none

    HMML_Reference *references;
    size_t          reference_count;

    HMML_Marker *markers;
    size_t       marker_count;

    HMML_Quote quote;
} HMML_Timestamp;
// Location and description of the problem that stopped parsing.
typedef struct {
    int   line;    // 1-based line of the parser cursor when the error was raised
    int   col;     // 1-based column on that line
    char *message; // formatted error text
} HMML_Error;
// Result of hmml_parse(); release it with hmml_free().
typedef struct {
    _Bool well_formed; // 1 when the whole document parsed, 0 when `error` is filled in

    HMML_VideoMetaData metadata;

    HMML_Timestamp *timestamps;
    size_t          timestamp_count;

    HMML_Error error;

    void *free_list; // implementation detail: bookkeeping consumed by hmml_free()
} HMML_Output;
// Functions
// Parses a NUL-terminated HMML document, which must start with "[video".
// On success the returned output has well_formed == 1. On failure well_formed is 0
// and `error` holds the line, column and message of the problem that stopped parsing.
// In both cases the output carries allocations that must be released with hmml_free().
HMML_Output hmml_parse (const char* string);
// Frees every allocation recorded in output->free_list, and the list itself.
// The HMML_Output object itself is not freed; its pointers must not be used afterwards.
void hmml_free (HMML_Output* output);
// Version
// Library version number; the object is defined in the HMMLIB_IMPLEMENTATION section.
extern const struct HMML_Version {
int Major, Minor, Patch;
} hmml_version;
#endif
#ifdef HMMLIB_IMPLEMENTATION
#include <setjmp.h>
#include <string.h>
#include <stdarg.h>
#include <stdlib.h>
#include <stdint.h>
// HSTX("lit") expands to the pair `"lit", <length without the NUL>`; only valid for
// string literals / char arrays, because the length comes from sizeof.
#define HSTX(x) x, sizeof(x)-1
// HSTR("lit") builds a struct _hmml_str compound literal viewing a string literal.
#define HSTR(x) (const struct _hmml_str){ HSTX(x) }
// The allocator can be replaced by defining MALLOC / REALLOC before this section.
#ifndef MALLOC
#define MALLOC malloc
#endif
#ifndef REALLOC
#define REALLOC realloc
#endif
// Element count of a true array (not a pointer).
#ifndef countof
#define countof(x) (sizeof(x)/sizeof(*x))
#endif
// Debug tracing is compiled out; swap in the definition below to print via printf.
#define _hmml_debug(...)
//#define _hmml_debug printf
// Mutable state threaded through every parsing function.
struct _hmml_parser {
    HMML_Output out;        // result under construction
    const char *mem;        // start of the input (used to compute error line/column)
    const char *cursor;     // current read position inside the input
    jmp_buf     err_buf;    // jump target used by _hmml_err_fn to abort parsing
    uintptr_t  *free_list;  // tracked allocations: [0] capacity, [1] next free slot, [2..] pointers
    int         line;       // line counter, advanced by _hmml_skip_ws on each '\n'
};
// Non-owning view of `len` bytes starting at `ptr`.
struct _hmml_str {
    const char *ptr;
    size_t      len;
};
// memory management boilerplate stuff
// Records `input` in the parser's allocation list so that hmml_free() can release it,
// and returns `input` unchanged.
// List layout: slot 0 = capacity in slots, slot 1 = index of the next free slot,
// slots 2 and up = the tracked pointers.
// NOTE(review): the MALLOC/REALLOC results are used without a NULL check.
static void* _hmml_store_ptr(struct _hmml_parser* p, void* input)
{
    uintptr_t* list = p->free_list;

    if(!list) {
        // first tracked pointer: create the list with room for 8 slots
        list = MALLOC(8 * sizeof(uintptr_t));
        list[0] = 8;
        list[1] = 3;
        list[2] = (uintptr_t)input;
        p->free_list = list;
        return input;
    }

    const uintptr_t capacity = list[0];
    if(list[1] + 1 == capacity) {
        // nearly full: double the capacity before appending
        const size_t doubled = capacity << 1;
        list = REALLOC(list, doubled * sizeof(uintptr_t));
        list[0] = doubled;
    }

    list[list[1]] = (uintptr_t)input;
    list[1] += 1;

    p->free_list = list;
    return input;
}
// Copies the `str.len` bytes of `str` into a fresh NUL-terminated heap string,
// registers it in the parser's allocation list and returns it.
static char* _hmml_persist_str(struct _hmml_parser* p, const struct _hmml_str str)
{
    const size_t len = str.len;
    char* copy = MALLOC(len + 1);

    memcpy(copy, str.ptr, len);
    copy[len] = '\0';

    _hmml_store_ptr(p, copy);
    return copy;
}
// Appends one element of `in_size` bytes (copied from `in`) to the growable array at `*out`
// and increments `*out_count`.
// Each array lives in a block that starts with a size_t header holding the array's slot index
// in the parser's allocation list; `*out` points just past that header. When REALLOC returns
// a different address, the slot is rewritten so hmml_free() still frees the right pointer.
static void _hmml_persist_array_fn(struct _hmml_parser* p, void** out, size_t* out_count, void* in, size_t in_size)
{
    const size_t header = sizeof(size_t);
    char* block;

    if(*out) {
        // grow the existing block by one element
        char* old_block = (char*)(*out) - header;
        block = REALLOC(old_block, (*out_count + 1) * in_size + header);
        const size_t slot = *(size_t*)block;
        p->free_list[slot] = (uintptr_t)block;
    } else {
        // first element: allocate the block and remember which slot tracks it
        block = MALLOC(in_size + header);
        _hmml_store_ptr(p, block);
        *(size_t*)block = p->free_list[1] - 1;
    }

    char* elements = block + header;
    *out = elements;
    memcpy(elements + (*out_count) * in_size, in, in_size);
    *out_count += 1;
}
// Appends a copy of `in` to the array at `*out` and bumps `*out_count`.
// `in` must be an lvalue: its address is taken and sizeof(in) is used as the element size.
#define _hmml_persist_array(p, out, out_count, in) \
_hmml_persist_array_fn((p), (void**)(out), (out_count), &(in), sizeof(in))
// error handling
#define _hmml_err(p, fmt, ...) \
_hmml_err_fn((p), fmt "\n", ##__VA_ARGS__)
__attribute__((noreturn))
static void _hmml_err_fn(struct _hmml_parser* p, const char* fmt, ...)
{
static char error_buf[4096];
va_list va;
va_start(va, fmt);
int n = vsnprintf(error_buf, sizeof(error_buf), fmt, va);
va_end(va);
int line = 1, col = 1;
for(const char* ptr = p->mem; ptr != p->cursor; ++ptr) {
if(*ptr == '\n') {
++line;
col = 1;
} else {
++col;
}
}
p->out.error.message = _hmml_persist_str(p, (struct _hmml_str){ error_buf, n });
p->out.error.line = line;
p->out.error.col = col;
longjmp(p->err_buf, 1);
}
// actual parsing stuff
// Advances the cursor over whitespace, counting newlines into p->line.
// Every non-NUL byte with a value up to and including ' ' is treated as whitespace.
static void _hmml_skip_ws(struct _hmml_parser* p)
{
    uint8_t ch;
    while((ch = (uint8_t)*p->cursor) != 0 && ch <= ' ') {
        p->line += (ch == '\n');
        ++p->cursor;
    }
}
// Returns 1 when both views have the same length and the same bytes, 0 otherwise.
static _Bool _hmml_str_eq(struct _hmml_str a, struct _hmml_str b)
{
    if(a.len != b.len) {
        return 0;
    }
    return memcmp(a.ptr, b.ptr, a.len) == 0;
}
// Resolves the character following a backslash.
// Returns 1 and stores `in` in `*out` when `in` is one of the escapable characters
// [ ] : @ ~ \ " ; returns 0 and leaves `*out` untouched otherwise.
static _Bool _hmml_unesc(char in, char* out)
{
    // strchr() also finds the terminating NUL of the set, so '\0' has to be rejected
    // explicitly; otherwise a backslash at the very end of the input would count as an
    // escape and callers would step over the terminator.
    if(in == '\0' || !strchr("[]:@~\\\"", in)) {
        return 0;
    }
    *out = in;
    return 1;
}
// Reads one attribute value at the cursor into `mem` (NUL-terminated) and advances the cursor.
// A value starting with '"' runs to the closing quote, with backslash escapes resolved;
// otherwise it runs to the first delimiter (whitespace or ']', plus a set of punctuation
// characters when `break_on_punct` is set).
// Returns a pointer to the terminating NUL written into `mem`.
static char* _hmml_read_attr(struct _hmml_parser* p, char* mem, size_t mem_size, _Bool break_on_punct)
{
    const char* const start = p->cursor;
    char* tail = mem;

    if(*start != '"') {
        // bare value
        const char* stops = " ]\r\n\t";
        if(break_on_punct) {
            stops = " ]\r\n\t:,'-.#=[\\?!…()\"%";
        }

        const size_t len = strcspn(start, stops);
        if(len >= mem_size) {
            _hmml_err(p, "Attribute [%.10s...] too long", p->cursor);
        }

        memcpy(tail, start, len);
        tail[len] = '\0';
        p->cursor = start + len;
        return tail + len;
    }

    // quoted value: copy until the closing quote, end of input, or the size limit
    const char* in = start + 1;
    for(;;) {
        const _Bool at_stop = *in == '\0' || *in == '"';
        if(at_stop || (size_t)(in - start) >= mem_size) {
            break;
        }

        char converted;
        if(*in == '\\' && _hmml_unesc(in[1], &converted)) {
            *tail++ = converted;
            in += 2;
        } else {
            *tail++ = *in++;
        }
    }

    if(*in != '"') {
        _hmml_err(p, "Partially quoted attribute");
    }

    *tail = '\0';
    p->cursor = in + 1; // step over the closing quote
    return tail;
}
// Reads one `key=value` pair at the cursor.
// Whitespace is allowed between the key and '='; the value starts right after '='.
// `*key` and `*val` are set to views into function-static buffers, so they are only
// valid until the next call.
static void _hmml_read_kv(struct _hmml_parser* p, struct _hmml_str* key, struct _hmml_str* val)
{
    static char key_memory[64];
    static char val_memory[1024];

    const char* key_start = p->cursor;
    const size_t key_len = strcspn(key_start, " \r\n\t=");
    if(key_len >= sizeof(key_memory)) {
        _hmml_err(p, "Attribute key [%.10s...] too long", p->cursor);
    }
    memcpy(key_memory, key_start, key_len);
    key_memory[key_len] = '\0';
    p->cursor = key_start + key_len;

    _hmml_skip_ws(p);
    if(*p->cursor != '=') {
        _hmml_err(p, "Expected '=', got [%.3s]", p->cursor);
    }
    p->cursor++;

    const char* val_end = _hmml_read_attr(p, val_memory, sizeof(val_memory), 0);
    _hmml_debug("read kv [%s] = [%s]\n", key_memory, val_memory);

    *key = (struct _hmml_str){ key_memory, key_len };
    *val = (struct _hmml_str){ val_memory, (size_t)(val_end - val_memory) };
}
static HMML_Marker _hmml_parse_marker(struct _hmml_parser* p)
{
static char marker_mem[4096];
// the extended markers are inside [ ] and can contain parameters
_Bool extended = *p->cursor == '[';
if(extended) {
++p->cursor;
}
HMML_Marker marker = {
.offset = -1,
};
char c = *p->cursor;
if(c == '~') {
marker.type = HMML_PROJECT;
} else if(c == '@') {
marker.type = HMML_MEMBER;
} else if(c == ':') {
marker.type = HMML_CATEGORY;
} else {
_hmml_err(p, "Unknown marker type");
}
++p->cursor;
char* end = _hmml_read_attr(p, marker_mem, sizeof(marker_mem), !extended);
marker.marker = _hmml_persist_str(p, (struct _hmml_str){ marker_mem, end - marker_mem });
if(extended) {
_hmml_skip_ws(p);
if(*p->cursor == '#') {
++p->cursor;
size_t n = strcspn(p->cursor, " ");
marker.episode = _hmml_persist_str(p, (struct _hmml_str){ p->cursor, n });
p->cursor += n + 1;
}
if(*p->cursor != ']') {
const char* end = p->cursor;
for(;;) {
if(!*end) {
break;
}
char converted;
if(*end == '\\' && _hmml_unesc(end[1], &converted)) {
end += 2;
} else if(*end == ']'){
break;
} else {
++end;
}
}
marker.parameter = _hmml_persist_str(p, (struct _hmml_str){ p->cursor, end - p->cursor });
p->cursor = end;
}
if(*p->cursor != ']') {
_hmml_err(p, "Expected ']'");
}
++p->cursor;
}
return marker;
}
// Parses the attribute list of a [ref ...] node up to and including the closing ']'.
// The cursor must be just past "[ref". Each known key fills the field of the same name;
// an unknown key is an error. The returned reference has offset == -1.
static HMML_Reference _hmml_parse_ref(struct _hmml_parser* p)
{
    HMML_Reference ref = {
        .offset = -1,
    };

    struct ref_field {
        struct _hmml_str str;
        char** dest;
    } fields[] = {
        { HSTR("site")     , &ref.site      },
        { HSTR("page")     , &ref.page      },
        { HSTR("url")      , &ref.url       },
        { HSTR("title")    , &ref.title     },
        { HSTR("article")  , &ref.article   },
        { HSTR("author")   , &ref.author    },
        { HSTR("editor")   , &ref.editor    },
        { HSTR("publisher"), &ref.publisher },
        { HSTR("isbn")     , &ref.isbn      },
    };

    for(;;) {
        _hmml_skip_ws(p);
        if(*p->cursor == ']') {
            ++p->cursor;
            return ref;
        }

        struct _hmml_str key, value;
        _hmml_read_kv(p, &key, &value);

        // look the key up in the table of known attributes
        char** dest = NULL;
        for(size_t i = 0; i < countof(fields) && !dest; ++i) {
            if(_hmml_str_eq(key, fields[i].str)) {
                dest = fields[i].dest;
            }
        }

        if(!dest) {
            _hmml_err(p, "Unknown reference attribute");
        }
        *dest = _hmml_persist_str(p, value);
    }
}
// Parses a timecode node at the cursor and stores it in ts->h/m/s/ms.
// Accepted shapes: [m:s], [h:m:s], each optionally followed by .fraction before the ']'.
// The fraction may have up to three digits and is scaled to milliseconds (".5" -> 500).
// The cursor ends just past the closing ']'.
static void _hmml_parse_timecode(struct _hmml_parser* p, HMML_Timestamp* ts)
{
    unsigned int hours = 0, minutes = 0, seconds = 0, millis = 0;
    int consumed = 0;

    if(sscanf(p->cursor, "[%u:%u%n", &minutes, &seconds, &consumed) < 2) {
        _hmml_err(p, "Unable to parse timecode");
    }
    p->cursor += consumed;

    if(*p->cursor == ':') {
        // a third number means the first two were hours and minutes
        unsigned int third;
        consumed = 0;
        if(sscanf(p->cursor, ":%u%n", &third, &consumed) != 1 || consumed == 0) {
            _hmml_err(p, "Unable to parse 3-part timecode");
        }
        hours = minutes;
        minutes = seconds;
        seconds = third;
        p->cursor += consumed;
    }

    switch(*p->cursor) {
        case '.': {
            // `consumed` covers '.', the digits and ']'
            enum { NON_NUMBER_CHARS = 2, DIGITS_IN_100 = 3 };
            unsigned int fraction;
            consumed = 0;
            if(sscanf(p->cursor, ".%u]%n", &fraction, &consumed) != 1
                    || consumed == 0
                    || consumed > NON_NUMBER_CHARS + DIGITS_IN_100) {
                _hmml_err(p, "Unable to parse %u.5-part timecode", hours ? 3 : 2);
            }
            // pad the fraction on the right to three digits
            for(int digits = consumed - NON_NUMBER_CHARS; digits < DIGITS_IN_100; ++digits) {
                fraction *= 10;
            }
            millis = fraction;
            p->cursor += consumed;
        } break;

        case ']': {
            ++p->cursor;
        } break;

        default: {
            _hmml_err(p, "Unable to parse timecode");
        }
    }

    if(millis >= 1000) {
        _hmml_err(p, "Milliseconds cannot exceed 999");
    }
    if(seconds >= 60) {
        _hmml_err(p, "Seconds cannot exceed 59");
    }
    if(minutes >= 60) {
        _hmml_err(p, "Minutes cannot exceed 59");
    }

    ts->h  = hours;
    ts->m  = minutes;
    ts->s  = seconds;
    ts->ms = millis;
}
// Parses the marker at the cursor, appends it to ts->markers with its offset set to the
// current write position in the text buffer, then writes the marker's display text
// (its parameter if it has one, otherwise its name) at `*out` and advances `*out`.
static void _hmml_store_marker(struct _hmml_parser* p, HMML_Timestamp* ts, char** out, char* text_mem, size_t text_mem_size)
{
    HMML_Marker marker = _hmml_parse_marker(p);
    marker.offset = (*out) - text_mem;
    _hmml_persist_array(p, &ts->markers, &ts->marker_count, marker);

    const char* shown = marker.marker;
    if(marker.parameter) {
        shown = marker.parameter;
    }

    const size_t shown_len = strlen(shown);
    char* const limit = text_mem + text_mem_size;
    if(*out + shown_len > limit) {
        _hmml_err(p, "Not enough text memory");
    }

    memcpy(*out, shown, shown_len);
    *out += shown_len;
}
// Parses the text of a timestamp up to and including its closing ']' (the cursor must be
// just past the opening '['). Markers and [ref ...] nodes found on the way are appended to
// `ts`; markers are replaced in the text by their display text, references leave no text.
// Line breaks are dropped and trailing whitespace is trimmed.
// Stores the result in ts->text and returns its length in bytes.
static size_t _hmml_parse_text(struct _hmml_parser* p, HMML_Timestamp* ts)
{
    static char text_mem[4096];
    char* const text_end = text_mem + sizeof(text_mem);
    char* out = text_mem;

    memset(text_mem, 0, sizeof(text_mem));

    for(;;) {
        // copy the run of ordinary characters in one go
        size_t n = strcspn(p->cursor, "\\\n\r[]:@~");
        char c = p->cursor[n];

        if(n > (size_t)(text_end - out)) {
            _hmml_err(p, "Not enough text memory");
        }
        memcpy(out, p->cursor, n);
        p->cursor += n;
        out += n;

        if(c == '\0') {
            _hmml_err(p, "Unexpected EOF");
        }
        else if(c == ']') {
            ++p->cursor;
            break;
        }
        else if(c == '\\') {
            // a backslash right before the end of input is kept literally, so the cursor
            // never steps over the terminator
            char converted = '\0';
            _Bool escaped = p->cursor[1] != '\0' && _hmml_unesc(p->cursor[1], &converted);

            // the bulk copy above may have filled the buffer exactly
            if(out >= text_end) {
                _hmml_err(p, "Not enough text memory");
            }
            if(escaped) {
                *out++ = converted;
                p->cursor += 2;
            } else {
                *out++ = '\\';
                p->cursor++;
            }
        }
        else if(c == '\n' || c == '\r') {
            ++p->cursor;
        }
        else if(c == '[') {
            if(strncmp(p->cursor + 1, "ref", 3) == 0) {
                p->cursor += 4;
                HMML_Reference ref = _hmml_parse_ref(p);
                ref.offset = out - text_mem;
                _hmml_persist_array(p, &ts->references, &ts->reference_count, ref);
            } else {
                _hmml_store_marker(p, ts, &out, text_mem, sizeof(text_mem));
            }
        }
        // it is a @ ~ or : marker without parameters
        else {
            // if next char is a space, or prev char is not a space*, then it can't be a marker
            // * unless it's the first char
            if(strchr(" \t\r\n", p->cursor[1]) || !(out == text_mem || strchr(" \t\r\n", p->cursor[-1]))) {
                if(out >= text_end) {
                    _hmml_err(p, "Not enough text memory");
                }
                *out++ = c;
                ++p->cursor;
            } else {
                _hmml_store_marker(p, ts, &out, text_mem, sizeof(text_mem));
            }
        }

        if((size_t)(out - text_mem) >= sizeof(text_mem)) {
            _hmml_err(p, "Not enough text memory");
        }
    }

    // trim trailing whitespace
    while(out > text_mem && (uint8_t)(out[-1]) <= ' ') {
        out[-1] = '\0';
        --out;
    }

    size_t text_size = out - text_mem;
    ts->text = _hmml_persist_str(p, (struct _hmml_str){ text_mem, text_size });

    return text_size;
}
// Parses a quote node at the cursor: `[quote <author> <id>]` or `[quote <id>]`.
// Fills ts->quote and moves the cursor just past the closing ']'.
static void _hmml_parse_quote(struct _hmml_parser* p, HMML_Timestamp* ts)
{
    char author[256];
    int quote_id;
    int consumed = 0;

    // try the two-argument form first; `consumed` stays 0 unless the ']' was matched too
    const _Bool with_author =
        sscanf(p->cursor, "[quote %255s %d]%n", author, &quote_id, &consumed) == 2 && consumed;

    if(with_author) {
        ts->quote.author = _hmml_persist_str(p, (struct _hmml_str){ author, strlen(author) });
    } else {
        const _Bool id_only = sscanf(p->cursor, "[quote %d]%n", &quote_id, &consumed) == 1 && consumed;
        if(!id_only) {
            _hmml_err(p, "Unable to parse quote");
        }
    }

    ts->quote.present = 1;
    ts->quote.id = quote_id;
    p->cursor += consumed;
}
static void _hmml_parse_timestamps(struct _hmml_parser* p)
{
for(;;) {
_hmml_skip_ws(p);
if(*p->cursor == '\0') {
_hmml_err(p, "Unexpected EOF");
}
if(strncmp(p->cursor, "[/video]", 8) == 0) {
break;
}
HMML_Timestamp ts = {
.line = p->line
};
_hmml_parse_timecode(p, &ts);
if(*p->cursor != '[') {
_hmml_err(p, "Expected '['");
}
if(p->cursor[1] == '@') {
HMML_Marker m = _hmml_parse_marker(p);
ts.author = m.marker;
}
++p->cursor;
int text_len = _hmml_parse_text(p, &ts);
if(p->cursor[0] == '[' && p->cursor[1] == ':') {
++p->cursor;
do {
HMML_Marker m = _hmml_parse_marker(p);
_hmml_persist_array(p, &ts.markers, &ts.marker_count, m);
_hmml_skip_ws(p);
if(*p->cursor != ':' && *p->cursor != ']') {
_hmml_err(p, "Unterminated post-text category node");
}
} while(*p->cursor == ':');
++p->cursor;
}
if(p->cursor[0] == '[' && p->cursor[1] == 'q') {
_hmml_parse_quote(p, &ts);
}
// convert all markers to lowercase, fix any out of range offsets
for(size_t i = 0; i < ts.marker_count; ++i) {
HMML_Marker* m = ts.markers + i;
for(char* c = m->marker; *c; ++c) {
if(*c >= 'A' && *c <= 'Z') {
*c = (*c - ('A' - 'a'));
}
}
if(m->offset > text_len) {
m->offset = text_len;
}
}
for(size_t i = 0; i < ts.reference_count; ++i) {
HMML_Reference* ref = ts.references + i;
if(ref->offset > text_len) {
ref->offset = text_len;
}
}
_hmml_persist_array(p, &p->out.timestamps, &p->out.timestamp_count, ts);
}
}
// Splits a credit value of the form "name:role" at its first ':'.
// Without a ':' the whole value becomes the name and the role stays NULL.
// Both strings are copied; `value` itself is only read, and only within value.len bytes.
static HMML_Credit _hmml_parse_credit(struct _hmml_parser* p, struct _hmml_str value)
{
    HMML_Credit credit = {0};

    const char* colon = memchr(value.ptr, ':', value.len);
    if(colon) {
        const size_t name_len = (size_t)(colon - value.ptr);
        credit.name = _hmml_persist_str(p, (struct _hmml_str){ value.ptr, name_len });
        credit.role = _hmml_persist_str(p, (struct _hmml_str){ colon + 1, value.len - name_len - 1 });
    } else {
        credit.name = _hmml_persist_str(p, value);
    }

    return credit;
}
static void _hmml_parse_video(struct _hmml_parser* p)
{
struct str_attr {
struct _hmml_str str;
char** dest;
} str_attrs[] = {
{ HSTR("stream_platform"), &p->out.metadata.stream_platform },
{ HSTR("project") , &p->out.metadata.project },
{ HSTR("title") , &p->out.metadata.title },
{ HSTR("vod_platform") , &p->out.metadata.vod_platform },
{ HSTR("id") , &p->out.metadata.id },
{ HSTR("template") , &p->out.metadata.template },
{ HSTR("medium") , &p->out.metadata.medium },
{ HSTR("number") , &p->out.metadata.number },
{ HSTR("output") , &p->out.metadata.output },
{ HSTR("cc_lang") , &p->out.metadata.cc_lang },
};
for(;;) {
next_attr:
_hmml_skip_ws(p);
if(*p->cursor == ']') {
++p->cursor;
_hmml_parse_timestamps(p);
return;
}
struct _hmml_str key, value;
_hmml_read_kv(p, &key, &value);
for(size_t i = 0; i < countof(str_attrs); ++i) {
struct str_attr* s = str_attrs + i;
if(_hmml_str_eq(key, s->str)) {
*s->dest = _hmml_persist_str(p, value);
goto next_attr;
}
}
if(_hmml_str_eq(key, HSTR("credit"))) {
HMML_Credit credit = _hmml_parse_credit(p, value);
_hmml_persist_array(p, &p->out.metadata.credits, &p->out.metadata.credit_count, credit);
goto next_attr;
}
if(_hmml_str_eq(key, HSTR("uncredit"))) {
HMML_Credit uncredit = _hmml_parse_credit(p, value);
_hmml_persist_array(p, &p->out.metadata.uncredits, &p->out.metadata.uncredit_count, uncredit);
goto next_attr;
}
HMML_VideoCustomMetaData custom = {
.key = _hmml_persist_str(p, key),
.value = _hmml_persist_str(p, value),
};
_hmml_persist_array(p, &p->out.metadata.custom, &p->out.metadata.custom_count, custom);
}
}
// Parses a NUL-terminated HMML document, which must start with "[video".
// Returns the output by value: well_formed is 1 on success; on failure it is 0 and
// `error` describes the problem. A NULL `string` is reported as an error.
// The result must be released with hmml_free() in both cases.
// The parsing helpers keep scratch data in function-static buffers, so concurrent calls
// share that state.
HMML_Output hmml_parse(const char* string)
{
    struct _hmml_parser p = {
        .mem = string,
        .cursor = string,
        .line = 1,
    };

    // _hmml_err_fn() longjmp()s back here with value 1 after filling p.out.error
    // NOTE(review): `p` is modified between setjmp and longjmp without being volatile;
    // this relies on the compiler keeping it in memory because its address escapes.
    if(setjmp(p.err_buf) == 1) {
        p.out.free_list = p.free_list;
        return p.out;
    }

    if(!string) {
        _hmml_err(&p, "Input string is NULL.");
    }

    const struct _hmml_str prefix = HSTR("[video");
    if(strncmp(p.cursor, prefix.ptr, prefix.len)) {
        _hmml_err(&p, "Missing initial video tag.");
    } else {
        p.cursor += prefix.len;
        _hmml_parse_video(&p);
    }

    p.out.free_list = p.free_list;
    p.out.well_formed = 1;

    return p.out;
}
// Releases every allocation made while producing `out`, plus the tracking list itself.
// Safe to call with NULL, with an output that has no allocations, and more than once on
// the same output (free_list is cleared after the first call).
// The HMML_Output object itself is not freed; its other pointers dangle afterwards.
// NOTE(review): this calls free() directly although allocation goes through the
// MALLOC/REALLOC macros; a custom allocator would need a matching release here.
void hmml_free(HMML_Output* out)
{
    if(!out || !out->free_list) {
        return;
    }

    // list layout: [0] capacity, [1] one past the last used slot, [2..] tracked pointers
    uintptr_t* list = out->free_list;
    const uintptr_t used = list[1];

    for(uintptr_t i = 2; i < used; ++i) {
        free((void*)list[i]);
    }
    free(list);

    out->free_list = NULL;
}
// Definition of the library version declared in the public part of this header.
const struct HMML_Version hmml_version = {
    2,  // Major
    0,  // Minor
    15, // Patch
};
#undef HSTX
#undef HSTR
#endif