Annotation-System/hmmlib/hmmlib.l

515 lines
17 KiB
Plaintext

%{
#include <stdio.h>
#include <stdbool.h>
#include <ctype.h>
#include "stb_sb.h"
#include "hmmlib.h"
/* Library version exported to callers. The three initialisers are positional, so their
   meaning follows the member order of struct HMML_Version declared in hmmlib.h. */
const struct HMML_Version hmml_version = { 0, 4, 0 };
/* Per-scan state, reached from the rules through yyextra (see %option extra-type). */
typedef struct {
/* Current input line; incremented by the newline rule, copied into annotations and errors. */
int line;
/* Stretchy buffer of completed annotations (filled by NEWANNO). */
HMML_Annotation* annos;
/* Video metadata filled in by the <VIDEO> attribute rules. */
HMML_VideoMetaData meta;
/* Annotation currently under construction. */
HMML_Annotation an;
/* Reference currently under construction; pushed onto an.references when its tag closes. */
HMML_Reference ref;
/* Error message and line recorded by HMML_ERR. */
HMML_Error error;
/* Address of the field that the next attribute value will be stored into. */
void* attr;
/* Start condition to enter once the name of the current marker has been read. */
int mnext;
/* True until the first NEWANNO; stops an empty annotation from being pushed. */
bool first;
} HMML_ParseState;
/*
 * HMML_ERR(fmt, ...): record a printf-style error message together with the
 * current line in yyextra->error, then abort the scan by returning 1 from the
 * enclosing function (the generated yylex when used inside a rule action).
 *
 * asprintf() leaves its output pointer undefined when it fails, so the message
 * is reset to NULL in that case; that keeps the later free() of the message
 * well defined.
 */
#define HMML_ERR(fmt, ...) \
do { \
    if(asprintf(&yyextra->error.message, fmt, ##__VA_ARGS__) < 0){\
        yyextra->error.message = NULL;\
    }\
    yyextra->error.line = yyextra->line;\
    return 1;\
} while(0)
/*
 * CHECKESCAPE(x): raise a parse error unless x is one of the characters that
 * may follow a backslash: [ ] : @ ~ \ "
 */
#define CHECKESCAPE(x) do { if(!strchr("[]:@~\\\"", x)) HMML_ERR("Unknown backslash escape code '%c'", x); } while(0)
/* V_(x): address of field x of the video metadata being parsed. */
#define V_(x) &yyextra->meta.x
/* R_(x): address of field x of the reference being parsed. */
#define R_(x) &yyextra->ref.x
/* M_(x, state): push a new marker of type HMML_<x> (remaining members zero-initialised) onto the
   current annotation and remember in mnext which start condition follows the marker name. */
#define M_(x, state) do { HMML_Marker m = { HMML_ ## x }; sb_push(yyextra->an.markers, m); yyextra->mnext = state; } while(0)
/*
 * M_ADD(t, n): give the most recently pushed marker its name.
 *
 * The n bytes at t are duplicated and backslash escapes are removed (UNQUOTE).
 * The marker's offset is the current length of the annotation text. When the
 * marker is an inline one (mnext == TEXT) the name is also appended to the text.
 *
 * Unescaping can shorten the string, so the number of bytes appended to the
 * text is the length of the unescaped name rather than n; copying n bytes
 * would append the terminating NUL and the stale bytes behind it.
 */
#define M_ADD(t, n) \
do { \
    char* c = UNQUOTE(strndup(t, n));\
    int c_len = (int)strlen(c);\
    sb_last(yyextra->an.markers).marker = c;\
    sb_last(yyextra->an.markers).offset = sb_count(yyextra->an.text);\
    if(yyextra->mnext == TEXT){\
        memcpy(sb_add(yyextra->an.text, c_len), c, c_len);\
    }\
} while(0)
/* MX_ADD(c): append character c to the parameter of the most recent marker and, unless c is
   NUL, to the annotation text as well. Passing '\0' therefore terminates the parameter string
   without touching the text. The argument is expanded more than once. */
#define MX_ADD(c) \
do { \
sb_push(sb_last(yyextra->an.markers).parameter, c);\
if(c) sb_push(yyextra->an.text, c);\
} while(0)
/* NEWANNO(): finish the annotation under construction and start a fresh one. Except on the very
   first call, the current annotation is pushed onto yyextra->annos; it is then zeroed and stamped
   with the current line. The [/video] rule also calls this to flush the last annotation. */
#define NEWANNO() \
do { \
if(!yyextra->first) sb_push(yyextra->annos, yyextra->an); \
memset(&yyextra->an, 0, sizeof(yyextra->an));\
yyextra->an.line = yyextra->line;\
yyextra->first = false;\
} while(0)
/* UNQUOTE_LEN(_attr, _len): statement expression (GNU extension) that removes backslash escapes
   in place from the _len bytes starting at _attr and evaluates to _attr.
   - Each escaped character is validated with CHECKESCAPE, which returns 1 from the enclosing
     function on an unknown escape.
   - The memmove shifts everything after the backslash, including the one byte just past the
     logical end, so for a NUL-terminated buffer the terminator moves along with the text.
   - After a backslash is removed the loop increment steps over the character that was escaped,
     so an escaped backslash is not interpreted a second time. */
#define UNQUOTE_LEN(_attr, _len) ({ \
typeof(_attr) attr = (_attr); \
typeof(_len) len = (_len); \
for(char* c = attr; c < attr+len; ++c){\
if(*c == '\\'){ \
CHECKESCAPE(c[1]); \
memmove(c, c+1, len-(c-attr)); \
--len; \
} \
} \
attr; \
})
/* UNQUOTE(_attr): UNQUOTE_LEN over a NUL-terminated string. Note that strlen(attr) names the
   local `attr` declared inside UNQUOTE_LEN, not the macro parameter; the argument expression
   (typically a strndup call) is thereby evaluated only once.
   NOTE(review): this looks intentional but depends on UNQUOTE_LEN keeping that local name. */
#define UNQUOTE(_attr) UNQUOTE_LEN(_attr, strlen(attr))
%}
/* Scanner options: reentrant scanner, HMML_ParseState* carried as yyextra, no yywrap. */
%option reentrant
%option extra-type="HMML_ParseState*"
%option noyywrap
/* Optional run of tabs, spaces and carriage returns (may be empty). */
S [\t \r]*
/* Unquoted attribute value: must not start with a quote, and runs until space, tab, CR, LF or ']'. */
ATTR_SIMPLE [^\" \]\t\r\n][^ \]\t\r\n]*
/* Identifier-like value: starts with a letter or digit, then letters, digits or '_'. */
ATTR_ALNUM [0-9a-zA-Z][0-9a-zA-Z_]*
/* Double-quoted value in which a backslash escapes the following character. */
ATTR_QUOTED \"([^\"\\]|\\.)*\"
TAG_VIDEO_OPEN \[video
/* [m:ss] or [h:mm:ss] with each colon-separated field in 00-59. */
TIMECODE \[[0-9]{1,2}(:[0-5][0-9]){1,2}\]
/* Same shape, but every colon-separated field starts with 6-9; used only to report a range error. */
BAD_TIMECODE \[[0-9]{1,2}(:[6-9][0-9]){1,2}\]
/* Any character that is NOT one of: backslash : @ ~ [ ] CR LF TAB SPACE. */
TEXT_BREAK [^\\:@~\[\]\r\n\t ]
LB \[
RB \]
/* Start conditions. All are inclusive (%s), so rules written without a start condition stay active in each of them. */
%s VIDEO
%s V_ATTR
%s V2_ATTR
%s V3_ATTR
%s ANNOTATION
%s TEXT_START
%s TEXT
%s MARKER
%s MARKER_XTRA
%s REF
%s R_ATTR
%s AFTERTEXT
%s AUTHOR
%s CATEGORIES
%s QUOTES
%%
 /* Rules written without a start condition are active in every state. */
 /* Running out of input is always an error: a successful parse returns from the [/video] rule. */
<<EOF>> { HMML_ERR("Unexpected EOF, video close tag not found."); }
 /* Count lines so that annotations and errors can carry a line number. */
\r\n|\n { yyextra->line++; }
 /* The first thing scanned must be the opening of the [video ...] tag. */
<INITIAL>{TAG_VIDEO_OPEN} { BEGIN(VIDEO); }
<INITIAL>. { HMML_ERR("Missing video tag."); }
 /* Inside [video ...]: each recognised "name =" points yyextra->attr at the matching metadata */
 /* field and selects how the value is stored: V_ATTR (single string), V2_ATTR (appended to a */
 /* list of strings) or V3_ATTR (parsed as a long). Whitespace between attributes is skipped. */
<VIDEO>{S}
<VIDEO>member{S}= { yyextra->attr = V_(member); BEGIN(V_ATTR); }
<VIDEO>stream_platform{S}= { yyextra->attr = V_(stream_platform); BEGIN(V_ATTR); }
<VIDEO>stream_username{S}= { yyextra->attr = V_(stream_username); BEGIN(V_ATTR); }
<VIDEO>project{S}= { yyextra->attr = V_(project); BEGIN(V_ATTR); }
<VIDEO>title{S}= { yyextra->attr = V_(title); BEGIN(V_ATTR); }
<VIDEO>vod_platform{S}= { yyextra->attr = V_(vod_platform); BEGIN(V_ATTR); }
<VIDEO>id{S}= { yyextra->attr = V_(id); BEGIN(V_ATTR); }
<VIDEO>template{S}= { yyextra->attr = V_(template); BEGIN(V_ATTR); }
<VIDEO>medium{S}= { yyextra->attr = V_(medium); BEGIN(V_ATTR); }
<VIDEO>custom0{S}= { yyextra->attr = V_(custom[0]); BEGIN(V_ATTR); } // probably smarter way to do this ¯\_(ツ)_/¯
<VIDEO>custom1{S}= { yyextra->attr = V_(custom[1]); BEGIN(V_ATTR); }
<VIDEO>custom2{S}= { yyextra->attr = V_(custom[2]); BEGIN(V_ATTR); }
<VIDEO>custom3{S}= { yyextra->attr = V_(custom[3]); BEGIN(V_ATTR); }
<VIDEO>custom4{S}= { yyextra->attr = V_(custom[4]); BEGIN(V_ATTR); }
<VIDEO>custom5{S}= { yyextra->attr = V_(custom[5]); BEGIN(V_ATTR); }
<VIDEO>custom6{S}= { yyextra->attr = V_(custom[6]); BEGIN(V_ATTR); }
<VIDEO>custom7{S}= { yyextra->attr = V_(custom[7]); BEGIN(V_ATTR); }
<VIDEO>custom8{S}= { yyextra->attr = V_(custom[8]); BEGIN(V_ATTR); }
<VIDEO>custom9{S}= { yyextra->attr = V_(custom[9]); BEGIN(V_ATTR); }
<VIDEO>custom10{S}= { yyextra->attr = V_(custom[10]); BEGIN(V_ATTR); }
<VIDEO>custom11{S}= { yyextra->attr = V_(custom[11]); BEGIN(V_ATTR); }
<VIDEO>custom12{S}= { yyextra->attr = V_(custom[12]); BEGIN(V_ATTR); }
<VIDEO>custom13{S}= { yyextra->attr = V_(custom[13]); BEGIN(V_ATTR); }
<VIDEO>custom14{S}= { yyextra->attr = V_(custom[14]); BEGIN(V_ATTR); }
<VIDEO>custom15{S}= { yyextra->attr = V_(custom[15]); BEGIN(V_ATTR); }
 /* Attributes that may be repeated; each value is appended to a list. */
<VIDEO>co\-host{S}= { yyextra->attr = V_(co_hosts); BEGIN(V2_ATTR); }
<VIDEO>guest{S}= { yyextra->attr = V_(guests); BEGIN(V2_ATTR); }
<VIDEO>annotator{S}= { yyextra->attr = V_(annotators); BEGIN(V2_ATTR); }
 /* Numeric attribute. */
<VIDEO>publication_date{S}= { yyextra->attr = V_(publication_date); BEGIN(V3_ATTR); }
<VIDEO>output{S}= { yyextra->attr = V_(output); BEGIN(V_ATTR); }
 /* ']' closes the video tag; annotations follow. */
<VIDEO>\] { BEGIN(ANNOTATION); };
<VIDEO>. { HMML_ERR("Unknown attribute in video tag, beginning with '%s'.", yytext); }
 /* Value of a [video] attribute. In all three variants whitespace right after '=' returns to */
 /* VIDEO without storing a value, and a ']' is pushed back with yyless(0) so that VIDEO closes */
 /* the tag. Quoted values lose their surrounding quotes and have backslash escapes removed. */
 /* NOTE(review): a '"' that does not begin a complete quoted string matches none of these rules */
 /* and so reaches flex's default rule, which echoes it; confirm whether that should be an error. */
 /* V_ATTR: store a freshly allocated copy of the value in the char* field yyextra->attr points at. */
<V_ATTR>{S} { BEGIN(VIDEO); }
<V_ATTR>{ATTR_SIMPLE} { *(char**)yyextra->attr = strndup(yytext , yyleng ); BEGIN(VIDEO); }
<V_ATTR>{ATTR_QUOTED} { *(char**)yyextra->attr = UNQUOTE(strndup(yytext+1, yyleng-2)); BEGIN(VIDEO); }
<V_ATTR>\] { yyless(0); BEGIN(VIDEO); }
 /* V2_ATTR: append a freshly allocated copy of the value to the list yyextra->attr points at. */
<V2_ATTR>{S} { BEGIN(VIDEO); }
<V2_ATTR>{ATTR_SIMPLE} { sb_push(*(char***)yyextra->attr, strndup(yytext , yyleng )); BEGIN(VIDEO); }
<V2_ATTR>{ATTR_QUOTED} { sb_push(*(char***)yyextra->attr, UNQUOTE(strndup(yytext+1, yyleng-2))); BEGIN(VIDEO); }
<V2_ATTR>\] { yyless(0); BEGIN(VIDEO); }
 /* V3_ATTR: convert the value with atol and store it in the long yyextra->attr points at; */
 /* the quoted form is unescaped in place inside yytext before conversion. */
<V3_ATTR>{S} { BEGIN(VIDEO); }
<V3_ATTR>{ATTR_SIMPLE} { *(long*)yyextra->attr = atol(yytext); BEGIN(VIDEO); }
<V3_ATTR>{ATTR_QUOTED} { *(long*)yyextra->attr = atol(UNQUOTE_LEN(yytext+1, yyleng-2)); BEGIN(VIDEO); }
<V3_ATTR>\] { yyless(0); BEGIN(VIDEO); }
 /* Between annotations: every annotation starts with a timecode. The time is stored without its */
 /* brackets; in the first form the trailing "[@" is dropped as well and an author name follows. */
<ANNOTATION>{TIMECODE}{LB}@ { NEWANNO(); yyextra->an.time = strndup(yytext+1, yyleng-4); BEGIN(AUTHOR); }
<ANNOTATION>{TIMECODE} { NEWANNO(); yyextra->an.time = strndup(yytext+1, yyleng-2); BEGIN(TEXT_START); }
<ANNOTATION>{BAD_TIMECODE} { HMML_ERR("Timecode %s out of range.", yytext); }
<ANNOTATION>{S}
<ANNOTATION>. { HMML_ERR("Cannot parse annotation. Expected timecode."); }
 /* Directly after the timecode (or author): the text node must open with '['. When ':', '@' or */
 /* '~' follows immediately, the text begins with an inline marker whose name is also copied */
 /* into the text (mnext = TEXT). */
<TEXT_START>{LB}: { M_(CATEGORY, TEXT); BEGIN(MARKER); }
<TEXT_START>{LB}@ { M_(MEMBER , TEXT); BEGIN(MARKER); }
<TEXT_START>{LB}~ { M_(PROJECT , TEXT); BEGIN(MARKER); }
<TEXT_START>{LB} { BEGIN(TEXT); }
<TEXT_START>. { HMML_ERR("Unknown character '%s' after timecode.", yytext); }
 /* Body of the annotation text. */
 /* A run of ordinary characters is appended verbatim. */
<TEXT>{TEXT_BREAK}+ { memcpy(sb_add(yyextra->an.text, yyleng), yytext, yyleng); }
 /* Backslash escape: validate it, then append only the escaped character. */
<TEXT>\\. { CHECKESCAPE(yytext[1]); memcpy(sb_add(yyextra->an.text, yyleng-1), yytext+1, yyleng-1); }
 /* Whitespace followed by ':', '@' or '~' starts an inline marker: one space is appended and */
 /* the marker name will be copied into the text as well (mnext = TEXT). */
<TEXT>[ \r\t]+: { sb_push(yyextra->an.text, ' '); M_(CATEGORY, TEXT); BEGIN(MARKER); }
<TEXT>[ \r\t]+@ { sb_push(yyextra->an.text, ' '); M_(MEMBER , TEXT); BEGIN(MARKER); }
<TEXT>[ \r\t]+~ { sb_push(yyextra->an.text, ' '); M_(PROJECT , TEXT); BEGIN(MARKER); }
 /* Bracketed marker: the name is not copied into the text; the characters up to ']' are */
 /* collected as the marker's parameter in MARKER_XTRA. */
<TEXT>{LB}: { M_(CATEGORY, MARKER_XTRA); BEGIN(MARKER); }
<TEXT>{LB}@ { M_(MEMBER , MARKER_XTRA); BEGIN(MARKER); }
<TEXT>{LB}~ { M_(PROJECT , MARKER_XTRA); BEGIN(MARKER); }
 /* ']' ends the text node. */
<TEXT>\] { BEGIN(AFTERTEXT); }
 /* "[ref" opens a reference; remember where in the text it occurred. */
<TEXT>{LB}ref { yyextra->ref.offset = sb_count(yyextra->an.text); BEGIN(REF); }
 /* Any other '[' is dropped. */
<TEXT>{LB}
 /* Remaining whitespace becomes a single space; any other character is appended as is. */
<TEXT>{S} { sb_push(yyextra->an.text, ' '); }
<TEXT>. { sb_push(yyextra->an.text, *yytext); }
 /* Name of the marker pushed by M_: alphanumeric, or quoted (quotes stripped). Afterwards the */
 /* scanner enters the start condition that M_ saved in mnext. */
<MARKER>{ATTR_ALNUM} { M_ADD(yytext , yyleng ); BEGIN(yyextra->mnext); };
<MARKER>{ATTR_QUOTED} { M_ADD(yytext+1, yyleng-2); BEGIN(yyextra->mnext); };
<MARKER>. { HMML_ERR("Cannot parse Marker. Expected quoted or alphanumeric attribute."); }
 /* Parameter text of a bracketed marker, collected up to the closing ']'. */
 /* Escaped ']' and '#' are taken literally. */
<MARKER_XTRA>\\] { MX_ADD(']'); }
<MARKER_XTRA>\\# { MX_ADD('#'); }
 /* Closing bracket: NUL-terminate the parameter (nothing is added to the text) and resume TEXT. */
<MARKER_XTRA>\] { MX_ADD('\0'); BEGIN(TEXT); }
 /* " #<digits>": for a project marker the digits are stored as the episode; for other markers */
 /* "#<digits>" (without the leading space) is appended to both the parameter and the text. */
<MARKER_XTRA>[ ]\#[0-9]+ {
HMML_Marker* m = &sb_last(yyextra->an.markers);
if(m->type == HMML_PROJECT){
m->episode = strdup(yytext+2);
} else {
memcpy(sb_add(m->parameter, yyleng-1), yytext+1, yyleng-1);
memcpy(sb_add(yyextra->an.text, yyleng-1), yytext+1, yyleng-1);
}
}
 /* A space is kept only once the parameter has content, so a leading space is skipped. */
<MARKER_XTRA>[ ] { if(sb_last(yyextra->an.markers).parameter){ MX_ADD(' '); } }
<MARKER_XTRA>. { MX_ADD(*yytext); }
 /* Inside [ref ...]: each recognised "name =" points yyextra->attr at the matching field of the */
 /* reference under construction and hands over to R_ATTR for the value. */
<REF>{S}
<REF>site{S}= { yyextra->attr = R_(site); BEGIN(R_ATTR); }
<REF>page{S}= { yyextra->attr = R_(page); BEGIN(R_ATTR); }
<REF>url{S}= { yyextra->attr = R_(url); BEGIN(R_ATTR); }
<REF>title{S}= { yyextra->attr = R_(title); BEGIN(R_ATTR); }
<REF>article{S}= { yyextra->attr = R_(article); BEGIN(R_ATTR); }
<REF>author{S}= { yyextra->attr = R_(author); BEGIN(R_ATTR); }
<REF>editor{S}= { yyextra->attr = R_(editor); BEGIN(R_ATTR); }
<REF>publisher{S}= { yyextra->attr = R_(publisher); BEGIN(R_ATTR); }
<REF>isbn{S}= { yyextra->attr = R_(isbn); BEGIN(R_ATTR); }
 /* ']' completes the reference: push it onto the annotation, clear the scratch copy, resume TEXT. */
<REF>\] { sb_push(yyextra->an.references, yyextra->ref); memset(&yyextra->ref, 0, sizeof(yyextra->ref)); BEGIN(TEXT); }
<REF>. { HMML_ERR("Unexpected item in ref: %s", yytext); }
<R_ATTR>{S}
<R_ATTR>{ATTR_SIMPLE} { *(char**)yyextra->attr = strndup(yytext, yyleng); BEGIN(REF); }
<R_ATTR>{ATTR_QUOTED} { *(char**)yyextra->attr = UNQUOTE(strndup(yytext+1, yyleng-2)); BEGIN(REF); }
 /* [/video] is the only successful exit: flush the annotation in progress and return 0. */
<AFTERTEXT,ANNOTATION>\[\/video\] { NEWANNO(); return 0; }
 /* After the text node an annotation may carry a quote tag or a category tag, or the next */
 /* annotation may begin. */
<AFTERTEXT>{S}
<AFTERTEXT>{LB}quote { BEGIN(QUOTES); }
 /* Keep only the '[' consumed so that CATEGORIES sees the ':'. */
<AFTERTEXT>{LB}: { BEGIN(CATEGORIES); yyless(1); }
 /* Looks like a timecode: push everything back and let ANNOTATION scan it. */
<AFTERTEXT>{LB}[0-9] { BEGIN(ANNOTATION); yyless(0); }
 /* Two error rules: the two-character form is used when two characters are available, which */
 /* puts more of the offending input into the message. */
<AFTERTEXT>.. { HMML_ERR("Unexpected thing after text node: %s", yytext); }
<AFTERTEXT>. { HMML_ERR("Unexpected thing after text node: %s", yytext); }
<AUTHOR>[^\]\n]+\] { yyextra->an.author = strndup(yytext, yyleng-1); BEGIN(TEXT_START); }
<AUTHOR>{S}
 /* Category tag after the text: a list of ":name" items, each pushed as a HMML_CATEGORY marker. */
 /* The marker is built positionally; the initialisers after the name are NULL, NULL and -1. */
<CATEGORIES>{S}
<CATEGORIES>:{ATTR_SIMPLE} { HMML_Marker m = { HMML_CATEGORY, strndup(yytext+1, yyleng-1), NULL, NULL, -1 }; sb_push(yyextra->an.markers, m); }
<CATEGORIES>:{ATTR_QUOTED} { HMML_Marker m = { HMML_CATEGORY, UNQUOTE(strndup(yytext+2, yyleng-3)), NULL, NULL, -1 }; sb_push(yyextra->an.markers, m); }
 /* The tag may be followed directly by a quote tag; otherwise the next annotation is expected. */
<CATEGORIES>\]{LB}quote { BEGIN(QUOTES); }
<CATEGORIES>\] { BEGIN(ANNOTATION); }
<CATEGORIES>. { HMML_ERR("Unexpected character in category tag: '%s'\n", yytext); }
 /* Quote tag: an optional alphanumeric author followed by a numeric id and the closing ']'. */
<QUOTES>{S}
 /* The id ends the tag: mark the annotation as a quote and go back to expecting annotations. */
<QUOTES>[0-9]+{S}\] { yyextra->an.is_quote = true; yyextra->an.quote.id = atoi(yytext); BEGIN(ANNOTATION); }
 /* Author text is appended to the quote.author buffer without a terminator at this point. */
<QUOTES>{ATTR_ALNUM} { memcpy(sb_add(yyextra->an.quote.author, yyleng), yytext, yyleng); }
<QUOTES>. { HMML_ERR("Unexpected character in quotes tag: %s", yytext); }
%%
/* Number of string fields in an HMML_Reference. The code below walks a reference as an array of
   this many char* values.
   NOTE(review): this assumes the nine string members come first in HMML_Reference and are all
   char* - confirm against hmmlib.h whenever a field is added. */
#define HMML_REF_ITEMS 9
/* Helpers that release the memory owned by a single reference / annotation. */
static void _hmml_free_ref(HMML_Reference*);
static void _hmml_free_anno(HMML_Annotation*);
HMML_Output hmml_parse_file(FILE* f){
HMML_Output output = {};
HMML_ParseState state = {};
state.first = true;
state.line = 1;
yyscan_t scan;
yylex_init_extra(&state, &scan);
yyset_in(f, scan);
output.well_formed = yylex(scan) == 0;
memcpy(&output.metadata, &state.meta, sizeof(HMML_VideoMetaData));
output.metadata.co_host_count = sb_count(output.metadata.co_hosts);
output.metadata.guest_count = sb_count(output.metadata.guests);
output.metadata.annotator_count = sb_count(output.metadata.annotators);
output.annotations = state.annos;
output.annotation_count = sb_count(state.annos);
for(size_t i = 0; i < output.annotation_count; ++i){
HMML_Annotation* a = output.annotations + i;
sb_push(a->text, 0);
a->marker_count = sb_count(a->markers);
a->reference_count = sb_count(a->references);
for(size_t j = 0; j < sb_count(a->markers); ++j){
for(char* c = a->markers[j].marker; *c; ++c) *c = tolower(*c);
}
if(a->is_quote && a->quote.author){
sb_push(a->quote.author, 0);
}
}
if(!output.well_formed){
hmml_free(&output);
memcpy(&output.error, &state.error, sizeof(HMML_Error));
}
_hmml_free_anno(&state.an);
_hmml_free_ref(&state.ref);
yylex_destroy(scan);
return output;
}
/* Release the HMML_REF_ITEMS strings of a reference, which is walked as an array of char*. */
static void _hmml_free_ref(HMML_Reference* r){
    char** field = (char**)r;
    size_t remaining = HMML_REF_ITEMS;
    while(remaining-- > 0){
        free(*field++);
    }
}
/* Release everything an annotation owns: its references, its markers (name, episode and
   parameter), the quote author, the text, the author and the time. The struct itself is not
   freed. */
static void _hmml_free_anno(HMML_Annotation* a){
    for(size_t i = 0; i < sb_count(a->references); ++i){
        _hmml_free_ref(a->references + i);
    }
    sb_free(a->references);

    for(size_t i = 0; i < sb_count(a->markers); ++i){
        HMML_Marker* mk = a->markers + i;
        free(mk->marker);
        free(mk->episode);
        sb_free(mk->parameter);
    }
    sb_free(a->markers);

    sb_free(a->quote.author);
    sb_free(a->text);
    free(a->author);
    free(a->time);
}
/*
 * Release everything owned by a parse result and reset it to all zeroes.
 * A NULL argument is ignored. The HMML_Output object itself is not freed.
 */
void hmml_free(HMML_Output* hmml){
    if(hmml == NULL){
        return;
    }

    /* Annotations. */
    HMML_Annotation* anno = hmml->annotations;
    for(size_t left = hmml->annotation_count; left > 0; --left, ++anno){
        _hmml_free_anno(anno);
    }
    sb_free(hmml->annotations);

    /* Single-valued metadata strings. */
    void* const strings[] = {
        hmml->metadata.member,
        hmml->metadata.stream_platform,
        hmml->metadata.stream_username,
        hmml->metadata.project,
        hmml->metadata.title,
        hmml->metadata.vod_platform,
        hmml->metadata.id,
        hmml->metadata.template,
        hmml->metadata.medium,
        hmml->metadata.output,
    };
    for(size_t s = 0; s < sizeof(strings) / sizeof(strings[0]); ++s){
        free(strings[s]);
    }
    for(size_t c = 0; c < HMML_CUSTOM_ATTR_COUNT; ++c){
        free(hmml->metadata.custom[c]);
    }

    /* Multi-valued metadata: free each entry, then the list that held it. */
    for(size_t n = 0; n < sb_count(hmml->metadata.co_hosts); ++n){
        free(hmml->metadata.co_hosts[n]);
    }
    for(size_t n = 0; n < sb_count(hmml->metadata.guests); ++n){
        free(hmml->metadata.guests[n]);
    }
    for(size_t n = 0; n < sb_count(hmml->metadata.annotators); ++n){
        free(hmml->metadata.annotators[n]);
    }
    sb_free(hmml->metadata.co_hosts);
    sb_free(hmml->metadata.guests);
    sb_free(hmml->metadata.annotators);

    free(hmml->error.message);

    memset(hmml, 0, sizeof(*hmml));
}
/* One entry of the indexes printed by hmml_dump(). */
typedef struct {
/* Key of the entry; borrowed from the annotation data, not owned by the index. */
char* text;
/* Stretchy buffer of the annotation line numbers on which the key occurs. */
int* lines;
} Index;
/* Linear search of the stretchy buffer `base` for the entry whose key equals `text`.
   Returns a pointer into `base`, or NULL when there is no such entry. */
Index* index_find(Index* base, const char* text){
    size_t remaining = sb_count(base);
    for(Index* entry = base; remaining > 0; --remaining, ++entry){
        if(!strcmp(entry->text, text)){
            return entry;
        }
    }
    return NULL;
}
void hmml_dump(HMML_Output* hmml){
if(!hmml){
puts("(null)");
return;
}
if(!hmml->well_formed){
printf("Parse error on line %d: %s\n", hmml->error.line, hmml->error.message);
return;
}
puts("Annotations:");
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
printf("\t%3d [%7s] [%s]\n", a->line, a->time, a->text);
}
Index* authors = NULL;
Index* markers[HMML_MARKER_COUNT] = {};
int max_text_len = 0;
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
if(a->author){
size_t len = strlen(a->author);
if(len > max_text_len){
max_text_len = len;
}
Index* idx;
if(!(idx = index_find(authors, a->author))){
Index x = { a->author };
sb_push(authors, x);
idx = &sb_last(authors);
}
sb_push(idx->lines, a->line);
}
}
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
for(size_t j = 0; j < a->marker_count; ++j){
int type = a->markers[j].type;
char* text = a->markers[j].marker;
size_t len = strlen(text);
if(len > max_text_len){
max_text_len = len;
}
Index* idx;
if(!(idx = index_find(markers[type], text))){
Index x = { text };
sb_push(markers[type], x);
idx = &sb_last(markers[type]);
}
sb_push(idx->lines, a->line);
}
}
puts("Authors:");
for(size_t i = 0; i < sb_count(authors); ++i){
printf("\t %*s: ", max_text_len, authors[i].text);
for(size_t j = 0; j < sb_count(authors[i].lines); ++j){
printf("%3d ", authors[i].lines[j]);
}
puts("");
}
static const char* m_tags[HMML_MARKER_COUNT] = { "Categories", "Members", "Projects" };
for(size_t i = 0; i < HMML_MARKER_COUNT; ++i){
printf("%s:\n", m_tags[i]);
for(size_t j = 0; j < sb_count(markers[i]); ++j){
printf("\t %*s: ", max_text_len, markers[i][j].text);
for(size_t k = 0; k < sb_count(markers[i][j].lines); ++k){
printf("%3d ", markers[i][j].lines[k]);
}
puts("");
}
}
static const char* r_tags[] = { "Site", "Page", "URL", "Title", "Article", "Author", "Editor", "Publisher", "ISBN" };
puts("References:");
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
for(size_t j = 0; j < a->reference_count; ++j){
printf("\t%3d ", a->line);
HMML_Reference* r = a->references + j;
for(size_t k = 0; k < HMML_REF_ITEMS; ++k){
char* item = ((char**)r)[k];
if(item){
printf("[%s = %s] ", r_tags[k], item);
}
}
puts("");
}
}
puts("Quotes:");
for(size_t i = 0; i < hmml->annotation_count; ++i){
HMML_Annotation* a = hmml->annotations + i;
if(a->is_quote){
if(a->quote.author){
printf("\t%3d [Quote #%d, by %s]", a->line, a->quote.id, a->quote.author);
} else {
printf("\t%3d [Quote #%d]", a->line, a->quote.id);
}
puts("");
}
}
for(size_t i = 0; i < sb_count(authors); ++i){
sb_free(authors[i].lines);
}
sb_free(authors);
for(size_t i = 0; i < HMML_MARKER_COUNT; ++i){
for(size_t j = 0; j < sb_count(markers[i]); ++j){
sb_free(markers[i][j].lines);
}
sb_free(markers[i]);
}
}