hmml parser C library

This commit is contained in:
Alex Baines 2017-02-27 21:42:14 +00:00
parent 36a6654b82
commit 06854a7b66
6 changed files with 653 additions and 0 deletions

28
hmmlib/example.c Normal file
View File

@ -0,0 +1,28 @@
#if 0
cc $0 -o ${0/.c/} hmml.a
exit
#endif
#include "hmmlib.h"
#include <stdio.h>
/*
 * Example driver: parses the HMML file named by argv[1] and prints a summary
 * of it with hmml_dump() (which also reports parse errors).
 *
 * Returns 0 once the file has been opened and processed, 1 when no path was
 * given or the file could not be opened.
 */
int main(int argc, char** argv){
    if(argc < 2){
        fputs("gimme a file.\n", stderr);
        return 1;
    }

    FILE* f = fopen(argv[1], "r");
    if(!f){
        perror("Error reading file");
        // Signal the failure through the exit status instead of reporting success.
        return 1;
    }

    printf("Reading file: %s\n", argv[1]);
    HMML_Output hmml = hmml_parse_file(f);
    hmml_dump(&hmml);
    hmml_free(&hmml);
    fclose(f);

    return 0;
}

86
hmmlib/hmmlib.h Normal file
View File

@ -0,0 +1,86 @@
#ifndef HMML_H_
#define HMML_H_
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stdio.h>
// Data structures
// Attributes of the opening [video ...] tag. The lexer stores a heap copy
// (strndup) of each attribute value it reads; attributes that never appear
// stay NULL.
// NOTE: hmml_free() walks this struct as a flat array of char*, so every
// member must remain a char*.
typedef struct {
char* member;
char* twitch; // filled from the "twitch_username" attribute
char* project;
char* title;
char* platform;
char* id;
char* annotator;
} HMML_VideoMetaData;
// Attributes of one [ref ...] tag found inside annotation text. Each present
// attribute is a heap copy (strndup); absent ones are NULL.
// NOTE: hmml_dump() and hmml_free() index this struct as an array of char*,
// and hmml_dump() labels entry k with r_tags[k], so the members must all be
// char* and stay in the same order as that table.
typedef struct {
char* site;
char* page;
char* url;
char* title;
char* article;
char* author;
char* editor;
char* publisher;
char* isbn;
} HMML_Reference;
// Kind of marker attached to an annotation. In the lexer the kinds are
// introduced by ':' (category), '@' (member) and '~' (project).
typedef enum {
HMML_CATEGORY,
HMML_MEMBER,
HMML_PROJECT,
HMML_MARKER_COUNT, // number of kinds, not a kind; sizes the per-type tables in hmml_dump()
} HMML_MarkerType;
// One marker occurrence within an annotation.
typedef struct {
HMML_MarkerType type;
char* text; // heap copy (strndup) of the marker name; hmml_dump() lowercases it in place
} HMML_Marker;
// Quote attached to an annotation; meaningful only when the annotation's
// is_quote flag is set.
typedef struct {
int id; // quote number, converted from the tag text with atoi
char* author; // stretchy buffer (release with sb_free); NULL if the tag named no author
} HMML_Quote;
// One timestamped annotation. Ownership is mixed: time, author and the marker
// and reference strings are malloc'd copies, while text, references, markers
// and quote.author are stretchy buffers (stb_sb.h). hmml_free() releases all
// of them.
typedef struct {
int line; // input line the lexer was on when the annotation's timecode was read
char* time; // timecode text without the surrounding brackets
char* text; // stretchy buffer; NUL-terminated by hmml_parse_file()
char* author; // NULL when the timecode is not followed by an [@author] tag
HMML_Reference* references; // stretchy buffer
size_t reference_count; // filled in by hmml_parse_file() from the buffer's length
HMML_Marker* markers; // stretchy buffer
size_t marker_count; // filled in by hmml_parse_file() from the buffer's length
HMML_Quote quote; // valid only when is_quote is true
bool is_quote;
} HMML_Annotation;
// Description of the parse failure, filled in by the lexer's HMML_ERR macro.
typedef struct {
int line; // input line the lexer was on when it gave up
char* message; // allocated with asprintf; released by hmml_free()
} HMML_Error;
// Result of hmml_parse_file(). Release with hmml_free().
typedef struct {
bool well_formed; // true when the whole document parsed; check before using the fields below
HMML_VideoMetaData metadata; // populated only when well_formed is true
HMML_Annotation* annotations; // stretchy buffer; populated only when well_formed is true
size_t annotation_count;
HMML_Error error; // set when well_formed is false
} HMML_Output;
// Functions
// Parses the HMML document read from `file`. Check `well_formed` on the
// result: when it is false, `error` holds the line and message of the failure.
// The result should be passed to hmml_free() in either case. `file` is not
// closed by this call.
HMML_Output hmml_parse_file (FILE* file);
// Prints a human-readable summary of `output` (or its parse error) to stdout.
// Note that it lowercases the annotations' marker text in place.
void hmml_dump (HMML_Output* output);
// Frees everything owned by `output` and zeroes the struct. NULL is ignored.
void hmml_free (HMML_Output* output);
#endif

390
hmmlib/hmmlib.l Normal file
View File

@ -0,0 +1,390 @@
%{
#include <stdio.h>
#include <stdbool.h>
#include <ctype.h>
#include "stb_sb.h"
#include "hmmlib.h"
// Parse state shared by all rule actions; reachable in them as yyextra.
typedef struct {
int line; // current input line: starts at 1, incremented by the newline rule
HMML_Annotation* annos; // stretchy buffer of completed annotations
HMML_VideoMetaData meta; // attributes collected from the [video] tag
HMML_Annotation an; // annotation currently being assembled
HMML_Reference ref; // [ref] tag currently being assembled
HMML_Error* error; // where HMML_ERR stores its message and line
char** attr; // destination of the next attribute value (a member of meta or ref)
int mnext; // start condition to enter once the current marker's text has been read
bool first; // true until the first NEWANNO(); keeps the empty initial `an` from being pushed
} HMML_ParseState;
// Records a formatted error message plus the current line in the output's
// HMML_Error, then RETURNS 1 from yylex(). Only usable inside rule actions.
#define HMML_ERR(fmt, ...) \
do { \
asprintf(&yyextra->error->message, fmt, ##__VA_ARGS__);\
yyextra->error->line = yyextra->line;\
return 1;\
} while(0)
// Address of a [video] attribute slot / a [ref] attribute slot, for yyextra->attr.
#define V_(x) &yyextra->meta.x
#define R_(x) &yyextra->ref.x
// Starts a marker of kind HMML_<x> on the current annotation (text still NULL)
// and remembers in mnext which start condition follows the marker's text.
#define M_(x, state) do { HMML_Marker m = { HMML_ ## x }; sb_push(yyextra->an.markers, m); yyextra->mnext = state; } while(0)
// Gives the most recently started marker its text (a heap copy of the n bytes
// at t) and appends the same bytes to the annotation text.
#define M_ADD(t, n) do { char* c = strndup(t, n); sb_last(yyextra->an.markers).text = c; memcpy(sb_add(yyextra->an.text, n), c, n); } while(0)
// Finishes the annotation in progress (pushing it onto annos unless this is
// the very first call) and resets `an` to a blank one stamped with the
// current line.
#define NEWANNO() \
do { \
if(!yyextra->first) sb_push(yyextra->annos, yyextra->an); \
memset(&yyextra->an, 0, sizeof(yyextra->an));\
yyextra->an.line = yyextra->line;\
yyextra->first = false;\
} while(0)
// Fails the parse (via HMML_ERR) unless x is one of the characters that may
// follow a backslash: [ ] : @ ~ or a backslash.
#define CHECKESCAPE(x) do { if(!strchr("[]:@~\\", x)) HMML_ERR("Unknown backslash escape code '%c'", x); } while(0)
%}
/* Reentrant scanner: its state lives in a yyscan_t, and yyextra carries the
   HMML_ParseState handed to yylex_init_extra(). */
%option reentrant
%option extra-type="HMML_ParseState*"
/* No yywrap() is supplied; scanning stops at the end of the one input. */
%option noyywrap
/* Optional run of tabs/spaces (may match nothing). */
S [\t ]*
/* Unquoted attribute value: must not start with a double quote and runs up
   to the next space, ']', tab, CR or newline. */
ATTR_SIMPLE [^\" \]\t\r\n][^ \]\t\r\n]*
/* Identifier-like value: a letter or digit, then letters, digits or '_'. */
ATTR_ALNUM [0-9a-zA-Z][0-9a-zA-Z_]*
/* Double-quoted value on one line; a backslash escapes the next character. */
ATTR_QUOTED \"([^\n\"\\]|\\.)*\"
TAG_VIDEO_OPEN \[video
/* Bracketed timecode: one or two digits followed by one or two ":NN" groups
   with NN in 00-59. */
TIMECODE \[[0-9]{1,2}(:[0-5][0-9]){1,2}\]
/* Same shape, but with every ":NN" group in 60-99; used only to give a more
   specific error message. */
BAD_TIMECODE \[[0-9]{1,2}(:[6-9][0-9]){1,2}\]
LB \[
RB \]
/* Start conditions. They are declared with %s (inclusive), so the rules that
   carry no <...> prefix (EOF and newline) stay active in all of them. */
%s VIDEO
%s V_ATTR
%s ANNOTATION
%s TEXT_START
%s TEXT
%s MARKER
%s MARKER_XTRA
%s REF
%s R_ATTR
%s AFTERTEXT
%s AUTHOR
%s CATEGORIES
%s QUOTES
%%
    /* Rules without a start condition apply everywhere. Reaching EOF is always
       an error, because a successful parse returns from the [/video] rule. */
<<EOF>> { HMML_ERR("Unexpected EOF, video close tag not found."); }
\n { yyextra->line++; }
    /* INITIAL: the document must begin with the [video tag. */
<INITIAL>{TAG_VIDEO_OPEN} { BEGIN(VIDEO); }
<INITIAL>. { HMML_ERR("Missing video tag."); }
    /* VIDEO: inside the [video ...] tag. Each "name =" selects the metadata
       slot that the following V_ATTR value is stored in. */
<VIDEO>{S}
<VIDEO>member{S}= { yyextra->attr = V_(member); BEGIN(V_ATTR); }
<VIDEO>twitch_username{S}= { yyextra->attr = V_(twitch); BEGIN(V_ATTR); }
<VIDEO>project{S}= { yyextra->attr = V_(project); BEGIN(V_ATTR); }
<VIDEO>title{S}= { yyextra->attr = V_(title); BEGIN(V_ATTR); }
<VIDEO>platform{S}= { yyextra->attr = V_(platform); BEGIN(V_ATTR); }
<VIDEO>id{S}= { yyextra->attr = V_(id); BEGIN(V_ATTR); }
<VIDEO>annotator{S}= { yyextra->attr = V_(annotator); BEGIN(V_ATTR); }
<VIDEO>\] { BEGIN(ANNOTATION); };
<VIDEO>. { HMML_ERR("Invalid char '%c' in video tag.", *yytext); }
    /* V_ATTR: the value after "name =". Whitespace or ']' here leaves the
       slot untouched and returns to VIDEO; quoted values lose their quotes. */
<V_ATTR>{S} { BEGIN(VIDEO); }
<V_ATTR>{ATTR_SIMPLE} { *yyextra->attr = strndup(yytext , yyleng ); BEGIN(VIDEO); }
<V_ATTR>{ATTR_QUOTED} { *yyextra->attr = strndup(yytext+1, yyleng-2); BEGIN(VIDEO); }
<V_ATTR>\] { yyless(0); BEGIN(VIDEO); }
    /* ANNOTATION: expects a timecode, which starts a new annotation. The time
       is stored without its brackets; the first rule also consumes the "[@"
       that opens an author tag, hence yyleng-4 rather than yyleng-2. */
<ANNOTATION>{TIMECODE}{LB}@ { NEWANNO(); yyextra->an.time = strndup(yytext+1, yyleng-4); BEGIN(AUTHOR); }
<ANNOTATION>{TIMECODE} { NEWANNO(); yyextra->an.time = strndup(yytext+1, yyleng-2); BEGIN(TEXT_START); }
<ANNOTATION>{BAD_TIMECODE} { HMML_ERR("Timecode %s out of range.", yytext); }
<ANNOTATION>{S}
<ANNOTATION>. { HMML_ERR("Cannot parse annotation. Expected timecode."); }
    /* TEXT_START: the '[' that opens the annotation text. A marker sigil may
       follow it directly; a bare '[' is pushed back and re-read in TEXT. */
<TEXT_START>{LB}: { M_(CATEGORY, TEXT); BEGIN(MARKER); }
<TEXT_START>{LB}@ { M_(MEMBER , TEXT); BEGIN(MARKER); }
<TEXT_START>{LB}~ { M_(PROJECT , TEXT); BEGIN(MARKER); }
<TEXT_START>{LB} { yyless(0); BEGIN(TEXT); }
<TEXT_START>. { HMML_ERR("Unknown character '%c' after timecode.", *yytext); }
    /* TEXT: the annotation body, accumulated into an.text. A sigil preceded by
       a space starts an inline marker that returns to TEXT; "[" plus a sigil
       starts a bracketed marker whose trailing text is skipped in MARKER_XTRA. */
<TEXT>[^\\:@~\]\n\[ ]+ { memcpy(sb_add(yyextra->an.text, yyleng), yytext, yyleng); }
<TEXT>\\. { CHECKESCAPE(yytext[1]); memcpy(sb_add(yyextra->an.text, yyleng-1), yytext+1, yyleng-1); }
<TEXT>[ ]: { sb_push(yyextra->an.text, ' '); M_(CATEGORY, TEXT); BEGIN(MARKER); }
<TEXT>[ ]@ { sb_push(yyextra->an.text, ' '); M_(MEMBER , TEXT); BEGIN(MARKER); }
<TEXT>[ ]~ { sb_push(yyextra->an.text, ' '); M_(PROJECT , TEXT); BEGIN(MARKER); }
<TEXT>{LB}: { M_(CATEGORY, MARKER_XTRA); BEGIN(MARKER); }
<TEXT>{LB}@ { M_(MEMBER , MARKER_XTRA); BEGIN(MARKER); }
<TEXT>{LB}~ { M_(PROJECT , MARKER_XTRA); BEGIN(MARKER); }
<TEXT>\] { BEGIN(AFTERTEXT); }
<TEXT>{LB}ref { memset(&yyextra->ref, 0, sizeof(yyextra->ref)); BEGIN(REF); }
<TEXT>{LB}
<TEXT>{S}{S} { sb_push(yyextra->an.text, ' '); }
<TEXT>. { sb_push(yyextra->an.text, *yytext); }
    /* MARKER: the marker's name. M_ADD stores it on the marker and also
       appends it to the annotation text, then mnext picks the next state. */
<MARKER>{ATTR_ALNUM} { M_ADD(yytext , yyleng ); BEGIN(yyextra->mnext); };
<MARKER>{ATTR_QUOTED} { M_ADD(yytext+1, yyleng-2); BEGIN(yyextra->mnext); };
<MARKER>. { HMML_ERR("Cannot parse Marker. Expected quoted or alphanumeric attribute."); }
    /* TODO: store the extra text somewhere */
<MARKER_XTRA>\] { BEGIN(TEXT); }
<MARKER_XTRA>.
    /* REF: inside a [ref ...] tag; same name/value scheme as the video tag,
       with values read in R_ATTR. The closing ']' appends the reference. */
<REF>[\t ]
<REF>site{S}= { yyextra->attr = R_(site); BEGIN(R_ATTR); }
<REF>page{S}= { yyextra->attr = R_(page); BEGIN(R_ATTR); }
<REF>url{S}= { yyextra->attr = R_(url); BEGIN(R_ATTR); }
<REF>title{S}= { yyextra->attr = R_(title); BEGIN(R_ATTR); }
<REF>article{S}= { yyextra->attr = R_(article); BEGIN(R_ATTR); }
<REF>author{S}= { yyextra->attr = R_(author); BEGIN(R_ATTR); }
<REF>editor{S}= { yyextra->attr = R_(editor); BEGIN(R_ATTR); }
<REF>publisher{S}= { yyextra->attr = R_(publisher); BEGIN(R_ATTR); }
<REF>isbn{S}= { yyextra->attr = R_(isbn); BEGIN(R_ATTR); }
<REF>\] { sb_push(yyextra->an.references, yyextra->ref); BEGIN(TEXT); }
<REF>. { HMML_ERR("Unexpected item in ref: %s", yytext); }
<R_ATTR>{S}
<R_ATTR>{ATTR_SIMPLE} { *yyextra->attr = strndup(yytext , yyleng ); BEGIN(REF); }
<R_ATTR>{ATTR_QUOTED} { *yyextra->attr = strndup(yytext+1, yyleng-2); BEGIN(REF); }
    /* The closing tag pushes the last annotation and ends the parse with
       success (yylex returns 0). */
<AFTERTEXT,ANNOTATION>\[\/video\] { NEWANNO(); return 0; }
    /* AFTERTEXT: after the text's closing ']'. Optional category and quote
       tags may follow; "[" plus a digit is pushed back to be read as the next
       annotation's timecode. */
<AFTERTEXT>{S}
<AFTERTEXT>{LB}quote { BEGIN(QUOTES); }
<AFTERTEXT>{LB}: { BEGIN(CATEGORIES); yyless(1); }
<AFTERTEXT>{LB}[0-9] { BEGIN(ANNOTATION); yyless(0); }
<AFTERTEXT>.. { HMML_ERR("Unexpected thing after text node: %s", yytext); }
<AFTERTEXT>. { HMML_ERR("Unexpected thing after text node: %s", yytext); }
    /* AUTHOR: everything up to the closing ']' is the annotation's author. */
<AUTHOR>[^\]\n]+\] { yyextra->an.author = strndup(yytext, yyleng-1); BEGIN(TEXT_START); }
<AUTHOR>{S}
    /* CATEGORIES: a list of ":name" entries, each stored as a category marker
       (the ':' and any quotes are dropped). "][" continues into QUOTES. */
<CATEGORIES>{S}
<CATEGORIES>:{ATTR_SIMPLE} { HMML_Marker m = { HMML_CATEGORY, strndup(yytext+1, yyleng-1) }; sb_push(yyextra->an.markers, m); }
<CATEGORIES>:{ATTR_QUOTED} { HMML_Marker m = { HMML_CATEGORY, strndup(yytext+2, yyleng-3) }; sb_push(yyextra->an.markers, m); }
<CATEGORIES>\]{LB} { BEGIN(QUOTES); }
<CATEGORIES>\] { BEGIN(ANNOTATION); }
<CATEGORIES>. { HMML_ERR("Unexpected character in category tag: '%c'\n", *yytext); }
    /* QUOTES: alphanumeric words are appended to the quote author; a number
       followed by ']' is the quote id and closes the tag. */
<QUOTES>{S}
<QUOTES>[0-9]+{S}\] { yyextra->an.is_quote = true; yyextra->an.quote.id = atoi(yytext); BEGIN(ANNOTATION); }
<QUOTES>{ATTR_ALNUM} { memcpy(sb_add(yyextra->an.quote.author, yyleng), yytext, yyleng); }
<QUOTES>. { HMML_ERR("Unexpected character in quotes tag: %s", yytext); }
%%
HMML_Output hmml_parse_file(FILE* f){
HMML_Output output = {};
HMML_ParseState state = {};
state.error = &output.error;
state.first = true;
state.line = 1;
yyscan_t scan;
yylex_init_extra(&state, &scan);
yyset_in(f, scan);
output.well_formed = yylex(scan) == 0;
if(output.well_formed){
memcpy(&output.metadata, &state.meta, sizeof(HMML_VideoMetaData));
output.annotations = state.annos;
output.annotation_count = sb_count(state.annos);
for(size_t i = 0; i < output.annotation_count; ++i){
HMML_Annotation* a = output.annotations + i;
sb_push(a->text, 0);
a->marker_count = sb_count(a->markers);
a->reference_count = sb_count(a->references);
if(a->is_quote && a->quote.author){
sb_push(a->quote.author, 0);
}
}
}
yylex_destroy(scan);
return output;
}
// Frees every allocation owned by `hmml` (metadata strings, annotations with
// their text, markers, references and quote author, and the error message)
// and then zeroes the struct, so calling it again is harmless. A NULL `hmml`
// is ignored.
void hmml_free(HMML_Output* hmml){
    if(!hmml) return;

    // HMML_VideoMetaData is made up solely of char* members, so it can be
    // walked as an array of strings.
    char** meta = (char**)&hmml->metadata;
    for(size_t i = 0; i < sizeof(HMML_VideoMetaData)/sizeof(char*); ++i){
        free(meta[i]);
    }

    for(size_t i = 0; i < hmml->annotation_count; ++i){
        HMML_Annotation* a = hmml->annotations + i;

        free(a->time);
        free(a->author);
        sb_free(a->text);

        for(size_t j = 0; j < a->reference_count; ++j){
            // Step to reference j FIRST and only then view it as an array of
            // strings. Casting before adding j advances by a single pointer
            // instead of a whole HMML_Reference, which frees some strings
            // twice and never reaches others.
            char** fields = (char**)(a->references + j);
            for(size_t k = 0; k < sizeof(HMML_Reference)/sizeof(char*); ++k){
                free(fields[k]);
            }
        }
        sb_free(a->references);

        for(size_t j = 0; j < a->marker_count; ++j){
            free(a->markers[j].text);
        }
        sb_free(a->markers);

        sb_free(a->quote.author);
    }
    sb_free(hmml->annotations);

    free(hmml->error.message);

    memset(hmml, 0, sizeof(*hmml));
}
// Entry of the lookup tables hmml_dump() builds: a piece of text and the
// annotation lines it occurs on.
typedef struct {
char* text; // borrowed from the annotation data; not freed with the index
int* lines; // stretchy buffer of annotation line numbers
} Index;
// Linear search of the stretchy buffer `base` for the entry whose text equals
// `text` (compared with strcmp). Returns a pointer to the first match, or NULL
// if there is none; a NULL `base` is treated as empty.
Index* index_find(Index* base, const char* text){
    Index* hit = NULL;
    size_t pos = 0;

    while(!hit && pos < sb_count(base)){
        if(!strcmp(base[pos].text, text)){
            hit = &base[pos];
        }
        ++pos;
    }

    return hit;
}
void hmml_dump(HMML_Output* hmml){
if(!hmml){
puts("(null");
return;
}
if(!hmml->well_formed){
printf("Parse error on line %d: %s\n", hmml->error.line, hmml->error.message);
return;
}
puts("Annotations:");
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
printf("\t%3d [%7s] [%s]\n", a->line, a->time, a->text);
}
Index* authors = NULL;
Index* markers[HMML_MARKER_COUNT] = {};
int max_text_len = 0;
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
if(a->author){
size_t len = strlen(a->author);
if(len > max_text_len){
max_text_len = len;
}
Index* idx;
if(!(idx = index_find(authors, a->author))){
Index x = { a->author };
sb_push(authors, x);
idx = &sb_last(authors);
}
sb_push(idx->lines, a->line);
}
}
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
for(size_t j = 0; j < a->marker_count; ++j){
int type = a->markers[j].type;
char* text = a->markers[j].text;
size_t len = strlen(text);
if(len > max_text_len){
max_text_len = len;
}
for(char* c = text; *c; ++c) *c = tolower(*c);
Index* idx;
if(!(idx = index_find(markers[type], text))){
Index x = { text };
sb_push(markers[type], x);
idx = &sb_last(markers[type]);
}
sb_push(idx->lines, a->line);
}
}
puts("Authors:");
for(size_t i = 0; i < sb_count(authors); ++i){
printf("\t %*s: ", max_text_len, authors[i].text);
for(size_t j = 0; j < sb_count(authors[i].lines); ++j){
printf("%3d ", authors[i].lines[j]);
}
puts("");
}
static const char* m_tags[HMML_MARKER_COUNT] = { "Categories", "Members", "Projects" };
for(size_t i = 0; i < HMML_MARKER_COUNT; ++i){
printf("%s:\n", m_tags[i]);
for(size_t j = 0; j < sb_count(markers[i]); ++j){
printf("\t %*s: ", max_text_len, markers[i][j].text);
for(size_t k = 0; k < sb_count(markers[i][j].lines); ++k){
printf("%3d ", markers[i][j].lines[k]);
}
puts("");
}
}
static const char* r_tags[] = { "Site", "Page", "URL", "Title", "Article", "Author", "Editor", "Publisher", "ISBN" };
puts("References:");
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
for(size_t j = 0; j < a->reference_count; ++j){
printf("\t%3d ", a->line);
HMML_Reference* r = a->references + j;
for(size_t k = 0; k < 9; ++k){
char* item = ((char**)r)[k];
if(item){
printf("[%s = %s] ", r_tags[k], item);
}
}
puts("");
}
}
puts("Quotes:");
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
if(a->is_quote){
if(a->quote.author){
printf("\t%3d [Quote #%d, by %s]", a->line, a->quote.id, a->quote.author);
} else {
printf("\t%3d [Quote #%d]", a->line, a->quote.id);
}
puts("");
}
}
for(size_t i = 0; i < sb_count(authors); ++i){
sb_free(authors[i].lines);
}
sb_free(authors);
for(size_t i = 0; i < HMML_MARKER_COUNT; ++i){
for(size_t j = 0; j < sb_count(markers[i]); ++j){
sb_free(markers[i][j].lines);
}
sb_free(markers[i]);
}
}

10
hmmlib/makefile Normal file
View File

@ -0,0 +1,10 @@
# Default target: build the static library and the example program.
all: hmml.a example
# Archive the parser object into the static library.
hmml.a: hmmlib.o
ar rcs $@ $<
# The parser source is compiled with _GNU_SOURCE defined.
# NOTE(review): no rule here produces hmmlib.c; presumably make's built-in
# lex rule generates it from hmmlib.l -- confirm.
hmmlib.o: hmmlib.c
gcc -D_GNU_SOURCE -g -c $< -o $@
# Example program, linked against the library.
example: example.c hmml.a
gcc -g $^ -o $@

11
hmmlib/readme.txt Normal file
View File

@ -0,0 +1,11 @@
This is a C hmml parser library thing made with flex.
It has just 3 functions; look at hmmlib.h for the API.
To build it, run make, and it'll spit out hmml.a that you just need to include
on the command line when you build stuff. e.g.:
clang myfile.c hmml.a -o mything
(your .c file also needs to #include hmmlib.h)
-- insofaras

128
hmmlib/stb_sb.h Normal file
View File

@ -0,0 +1,128 @@
// stb stretchy_buffer.h v1.02 nothings.org/stb
// with custom additions sb_end, sb_pop, sb_erase
#ifndef STB_STRETCHY_BUFFER_H_INCLUDED
#define STB_STRETCHY_BUFFER_H_INCLUDED
// Short aliases for the stb_sb_* macros. Define NO_STRETCHY_BUFFER_SHORT_NAMES
// before including this header to leave them out.
#ifndef NO_STRETCHY_BUFFER_SHORT_NAMES
#define sb_free stb_sb_free
#define sb_push stb_sb_push
#define sb_count stb_sb_count
#define sb_add stb_sb_add
#define sb_last stb_sb_last
#define sb_end stb_sb_end
#define sb_pop stb_sb_pop
#define sb_erase stb_sb_erase
#endif

// Public operations. `a` is a typed pointer (T*) that starts out NULL and is
// an lvalue the macros may reassign. Arguments can be evaluated more than
// once, so they must not have side effects.
//   free   release the buffer and reset `a` to NULL
//   push   append one value, growing if needed
//   count  number of items (0 for a NULL buffer)
//   add    append n uninitialised items; yields a pointer to the first of them
//   last   lvalue of the final item (buffer must be non-empty)
//   end    pointer one past the final item (NULL for a NULL buffer)
//   pop    drop the final item (buffer must be non-empty)
//   erase  remove item i, shifting the following items down by one
#define stb_sb_free(a) ((a) ? free(stb__sbraw(a)),(a)=0,0 : 0)
#define stb_sb_push(a,v) (stb__sbmaybegrow(a,1), (a)[stb__sbn(a)++] = (v))
#define stb_sb_count(a) ((a) ? stb__sbn(a) : 0)
#define stb_sb_add(a,n) (stb__sbmaybegrow(a,n), stb__sbn(a)+=(n), &(a)[stb__sbn(a)-(n)])
#define stb_sb_last(a) ((a)[stb__sbn(a)-1])
#define stb_sb_end(a) ((a) ? (a) + stb__sbn(a) : 0)
#define stb_sb_pop(a) (--stb__sbn(a))
// No trailing semicolon: like the other operations this is an expression, so
// it can be used in an if/else or as part of a larger expression.
#define stb_sb_erase(a,i) ((a) ? memmove((a)+(i), (a)+(i)+1, sizeof(*(a))*((--stb__sbn(a))-(i))),0 : 0)

// Internals. Two size_t values sit directly in front of the items:
// [0] is the capacity in items, [1] is the number of items in use.
#define stb__sbraw(a) ((size_t *) (a) - 2)
#define stb__sbm(a) stb__sbraw(a)[0]
#define stb__sbn(a) stb__sbraw(a)[1]
#define stb__sbneedgrow(a,n) ((a)==0 || stb__sbn(a)+(n) >= stb__sbm(a))
#define stb__sbmaybegrow(a,n) (stb__sbneedgrow(a,(n)) ? stb__sbgrow(a,n) : 0)
#define stb__sbgrow(a,n) ((a) = stb__sbgrowf((a), (n), sizeof(*(a))))
#include <stdlib.h>
#include <string.h> // memmove, used by stb_sb_erase
// Grows (or first allocates) the storage behind a stretchy buffer so that it
// can hold `increment` more items of `itemsize` bytes each. The new capacity
// is the larger of 1.5x the old capacity and the item count actually needed.
// Returns the new item pointer (just past the two-word header). If realloc
// fails, the optional STRETCHY_BUFFER_OUT_OF_MEMORY hook runs and a pointer
// whose header address is NULL is returned, so that a later access faults.
static inline void * stb__sbgrowf(void *arr, int increment, int itemsize)
{
    size_t grown = 0;
    if (arr)
        grown = stb__sbm(arr) + (stb__sbm(arr) >> 1);

    size_t required = stb_sb_count(arr) + increment;
    size_t capacity = required < grown ? grown : required;

    size_t *raw = (size_t *) realloc(arr ? stb__sbraw(arr) : 0, itemsize * capacity + sizeof(size_t)*2);
    if (!raw) {
#ifdef STRETCHY_BUFFER_OUT_OF_MEMORY
        STRETCHY_BUFFER_OUT_OF_MEMORY ;
#endif
        return (void *) (2*sizeof(size_t)); // try to force a NULL pointer exception later
    }

    // A brand-new buffer starts out empty; an existing one keeps its count.
    if (!arr)
        raw[1] = 0;
    raw[0] = capacity;
    return raw+2;
}
#ifdef STB_SB_MMAP
#include <sys/mman.h>
#include <stdio.h>
// mmap-backed variants of the stretchy-buffer operations. Operations that
// never allocate are shared with the malloc-backed versions.
//
// sbmm_free must hand munmap the size of the mapping in BYTES. The header
// stores the capacity in items, so the byte size is recomputed here the same
// way stb__sbgrowf_mm sizes the mapping: items plus the two-word header,
// rounded up to a whole number of pages.
#define stb__sbmm_bytes(a) ((stb__sbm(a) * sizeof(*(a)) + sizeof(size_t)*2 + (SB_PAGE_SIZE-1)) & ~(size_t)(SB_PAGE_SIZE-1))
#define sbmm_free(a) ((a) ? munmap(stb__sbraw(a), stb__sbmm_bytes(a)),(a)=0,0 : 0)
#define sbmm_push(a,v) (stb__sbmaybegrow_mm(a,1), (a)[stb__sbn(a)++] = (v))
#define sbmm_add(a,n) (stb__sbmaybegrow_mm(a,n), stb__sbn(a)+=(n), &(a)[stb__sbn(a)-(n)])
#define sbmm_count stb_sb_count
#define sbmm_last stb_sb_last
#define sbmm_end stb_sb_end
#define sbmm_pop stb_sb_pop
#define sbmm_erase stb_sb_erase
#define stb__sbmaybegrow_mm(a,n) (stb__sbneedgrow(a,(n)) ? stb__sbgrow_mm(a,n) : 0)
#define stb__sbgrow_mm(a,n) ((a) = stb__sbgrowf_mm((a), (n), sizeof(*(a))))
// Mappings are sized in multiples of this many bytes.
#define SB_PAGE_SIZE 4096
// mmap/mremap counterpart of stb__sbgrowf: grows (or first maps) the storage
// behind a stretchy buffer so that it can hold `increment` more items of
// `itemsize` bytes each. The capacity grows by SB_PAGE_SIZE items at a time
// (or to the count actually needed, if larger) and mappings are sized in whole
// pages. On failure the error is reported with perror, the optional
// STRETCHY_BUFFER_OUT_OF_MEMORY hook runs, and a pointer whose header address
// is NULL is returned, so that a later access faults.
static inline void * stb__sbgrowf_mm(void *arr, int increment, int itemsize)
{
    const size_t header_bytes = sizeof(size_t) * 2;
    const size_t page_mask = SB_PAGE_SIZE - 1;

    size_t grown = 0;
    if (arr)
        grown = stb__sbm(arr) + SB_PAGE_SIZE;

    size_t required = stb_sb_count(arr) + increment;
    size_t capacity = required < grown ? grown : required;

    size_t new_bytes = (capacity * itemsize + header_bytes + page_mask) & ~page_mask;

    size_t *raw;
    if (arr) {
        // Size of the existing mapping, recomputed from the stored capacity.
        size_t old_bytes = (stb__sbm(arr) * itemsize + header_bytes + page_mask) & ~page_mask;
        raw = mremap(stb__sbraw(arr), old_bytes, new_bytes, MREMAP_MAYMOVE);
    } else {
        raw = mmap(0, new_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    }

    if (raw == MAP_FAILED) {
        perror(arr ? "mremap" : "mmap");
#ifdef STRETCHY_BUFFER_OUT_OF_MEMORY
        STRETCHY_BUFFER_OUT_OF_MEMORY ;
#endif
        return (void *) (2*sizeof(size_t)); // try to force a NULL pointer exception later
    }

    // A brand-new buffer starts out empty; an existing one keeps its count.
    if (!arr)
        raw[1] = 0;
    raw[0] = capacity;
    return raw+2;
}
#endif // STB_SB_MMAP
#endif // STB_STRETCHY_BUFFER_H_INCLUDED