Annotation-System/hmmlib/hmmlib.l

416 lines
12 KiB
Plaintext

%{
#include <stdio.h>
#include <stdbool.h>
#include <ctype.h>
#include "stb_sb.h"
#include "hmmlib.h"
/*
 * Scanner state for one parse. A pointer to it is installed as the flex
 * "extra" data, so every rule action reaches it through yyextra.
 */
typedef struct {
/* Current input line; incremented by the newline rule and copied into errors and annotations. */
int line;
/* Stretchy buffer of finished annotations (filled by NEWANNO). */
HMML_Annotation* annos;
/* Attributes collected from the opening video tag. */
HMML_VideoMetaData meta;
/* Annotation currently being assembled. */
HMML_Annotation an;
/* Reference currently being assembled inside a ref tag. */
HMML_Reference ref;
/* Filled in by HMML_ERR when the parse is abandoned. */
HMML_Error error;
/* Where the next attribute value is to be stored (set by the key rules). */
char** attr;
/* Start condition to enter once the text of the pending marker has been read. */
int mnext;
/* True until the first NEWANNO; stops an empty annotation from being pushed. */
bool first;
} HMML_ParseState;
/*
 * Abort the scan: format a message into error.message with asprintf, record
 * the current line and return 1 from the enclosing function (yylex).
 */
#define HMML_ERR(fmt, ...) \
do { \
asprintf(&yyextra->error.message, fmt, ##__VA_ARGS__);\
yyextra->error.line = yyextra->line;\
return 1;\
} while(0)
/* Address of a field of the video metadata / of the reference being built. */
#define V_(x) &yyextra->meta.x
#define R_(x) &yyextra->ref.x
/*
 * Start a marker of type HMML_<x>: push it onto the current annotation with
 * its remaining fields zeroed, and remember which start condition follows it.
 */
#define M_(x, state) do { HMML_Marker m = { HMML_ ## x }; sb_push(yyextra->an.markers, m); yyextra->mnext = state; } while(0)
/*
 * Complete the most recently pushed marker: its text becomes a copy of the
 * first n bytes of t, its offset is the current length of the annotation
 * text, and the same n bytes are appended to the annotation text.
 */
#define M_ADD(t, n) \
do { \
char* c = strndup(t, n);\
sb_last(yyextra->an.markers).text = c;\
sb_last(yyextra->an.markers).offset = sb_count(yyextra->an.text);\
memcpy(sb_add(yyextra->an.text, n), c, n);\
} while(0)
/*
 * Begin a new annotation: push the one in progress (unless this is the very
 * first call), clear the working annotation and stamp it with the current line.
 */
#define NEWANNO() \
do { \
if(!yyextra->first) sb_push(yyextra->annos, yyextra->an); \
memset(&yyextra->an, 0, sizeof(yyextra->an));\
yyextra->an.line = yyextra->line;\
yyextra->first = false;\
} while(0)
/*
 * Fail the parse unless x is one of the escapable characters: the two square
 * brackets, colon, at-sign, tilde or backslash.
 */
#define CHECKESCAPE(x) do { if(!strchr("[]:@~\\", x)) HMML_ERR("Unknown backslash escape code '%c'", x); } while(0)
%}
/* Reentrant scanner; yyextra carries an HMML_ParseState pointer; no yywrap needed. */
%option reentrant
%option extra-type="HMML_ParseState*"
%option noyywrap
/* S: a possibly empty run of tab, space or carriage return. */
S [\t \r]*
/* ATTR_SIMPLE: unquoted value; cannot start with a double quote and stops at a blank, a closing bracket or a line end. */
ATTR_SIMPLE [^\" \]\t\r\n][^ \]\t\r\n]*
/* ATTR_ALNUM: a letter or digit followed by letters, digits or underscores. */
ATTR_ALNUM [0-9a-zA-Z][0-9a-zA-Z_]*
/* ATTR_QUOTED: double-quoted value on a single line; a backslash escapes the following character. */
ATTR_QUOTED \"([^\n\"\\]|\\.)*\"
TAG_VIDEO_OPEN \[video
/* TIMECODE: bracketed one or two digits, then one or two colon groups in the range 00-59. */
TIMECODE \[[0-9]{1,2}(:[0-5][0-9]){1,2}\]
/* BAD_TIMECODE: same shape, but every colon group starts with a digit 6-9; used only to report a range error. */
BAD_TIMECODE \[[0-9]{1,2}(:[6-9][0-9]){1,2}\]
/* TEXT_BREAK: one character with no special meaning in annotation text (not a backslash, colon, at-sign, tilde, bracket or whitespace). */
TEXT_BREAK [^\\:@~\[\]\r\n\t ]
LB \[
RB \]
/* Start conditions. All are declared with %s (inclusive), so rules without a start condition stay active in each of them. */
%s VIDEO
%s V_ATTR
%s ANNOTATION
%s TEXT_START
%s TEXT
%s MARKER
%s MARKER_XTRA
%s REF
%s R_ATTR
%s AFTERTEXT
%s AUTHOR
%s CATEGORIES
%s QUOTES
%%
    /* Rules with no start condition are active in every state. */
    /* A successful parse returns from the closing video tag rule, so reaching end of input is always an error. */
<<EOF>> { HMML_ERR("Unexpected EOF, video close tag not found."); }
    /* Line endings only advance the line counter used for error and annotation line numbers. */
\r\n|\n { yyextra->line++; }
    /* INITIAL: the document must begin with the opening video tag. */
<INITIAL>{TAG_VIDEO_OPEN} { BEGIN(VIDEO); }
<INITIAL>. { HMML_ERR("Missing video tag."); }
    /* VIDEO: inside the opening tag. Each recognised key points yyextra->attr at the matching metadata field and hands over to V_ATTR for the value. */
<VIDEO>{S}
<VIDEO>member{S}= { yyextra->attr = V_(member); BEGIN(V_ATTR); }
<VIDEO>twitch_username{S}= { yyextra->attr = V_(twitch); BEGIN(V_ATTR); }
<VIDEO>project{S}= { yyextra->attr = V_(project); BEGIN(V_ATTR); }
<VIDEO>title{S}= { yyextra->attr = V_(title); BEGIN(V_ATTR); }
<VIDEO>platform{S}= { yyextra->attr = V_(platform); BEGIN(V_ATTR); }
<VIDEO>id{S}= { yyextra->attr = V_(id); BEGIN(V_ATTR); }
<VIDEO>annotator{S}= { yyextra->attr = V_(annotator); BEGIN(V_ATTR); }
    /* The closing bracket of the video tag starts the annotation list. */
<VIDEO>\] { BEGIN(ANNOTATION); };
<VIDEO>. { HMML_ERR("Invalid char '%c' in video tag.", *yytext); }
<V_ATTR>{S} { /* blanks right after the equals sign: give up on a value and return to the tag */ BEGIN(VIDEO); }
    /* Store a copy of the value in the field selected by the key rule. Any value stored earlier for the same key is released first, so a repeated attribute does not leak. */
<V_ATTR>{ATTR_SIMPLE} { free(*yyextra->attr); *yyextra->attr = strndup(yytext , yyleng ); BEGIN(VIDEO); }
    /* Quoted form: the surrounding double quotes are dropped, escapes inside are kept as written. */
<V_ATTR>{ATTR_QUOTED} { free(*yyextra->attr); *yyextra->attr = strndup(yytext+1, yyleng-2); BEGIN(VIDEO); }
    /* A closing bracket instead of a value: push it back so the VIDEO rule can end the tag. */
<V_ATTR>\] { yyless(0); BEGIN(VIDEO); }
<ANNOTATION>{TIMECODE}{LB}@ { /* timecode directly followed by an author tag; the -4 drops both timecode brackets plus the bracket and at-sign that open the author */ NEWANNO(); yyextra->an.time = strndup(yytext+1, yyleng-4); BEGIN(AUTHOR); }
    /* Plain timecode: keep the text between the brackets and expect the annotation text next. */
<ANNOTATION>{TIMECODE} { NEWANNO(); yyextra->an.time = strndup(yytext+1, yyleng-2); BEGIN(TEXT_START); }
<ANNOTATION>{BAD_TIMECODE} { HMML_ERR("Timecode %s out of range.", yytext); }
<ANNOTATION>{S}
<ANNOTATION>. { HMML_ERR("Cannot parse annotation. Expected timecode."); }
    /* TEXT_START: the text block must open with a bracket. If a marker sigil follows immediately, the marker is started here and TEXT resumes after it. */
<TEXT_START>{LB}: { M_(CATEGORY, TEXT); BEGIN(MARKER); }
<TEXT_START>{LB}@ { M_(MEMBER , TEXT); BEGIN(MARKER); }
<TEXT_START>{LB}~ { M_(PROJECT , TEXT); BEGIN(MARKER); }
    /* Otherwise the bracket is pushed back; the bare-bracket rule in TEXT then discards it. */
<TEXT_START>{LB} { yyless(0); BEGIN(TEXT); }
<TEXT_START>. { HMML_ERR("Unknown character '%c' after timecode.", *yytext); }
    /* TEXT: ordinary characters are appended to the annotation text unchanged. */
<TEXT>{TEXT_BREAK}+ { memcpy(sb_add(yyextra->an.text, yyleng), yytext, yyleng); }
    /* Backslash escape: validate the escaped character, then append it without the backslash. */
<TEXT>\\. { CHECKESCAPE(yytext[1]); memcpy(sb_add(yyextra->an.text, yyleng-1), yytext+1, yyleng-1); }
    /* Blanks followed by a sigil: emit one space, start an inline marker and come back to TEXT after its text. */
<TEXT>[ \r\t]+: { sb_push(yyextra->an.text, ' '); M_(CATEGORY, TEXT); BEGIN(MARKER); }
<TEXT>[ \r\t]+@ { sb_push(yyextra->an.text, ' '); M_(MEMBER , TEXT); BEGIN(MARKER); }
<TEXT>[ \r\t]+~ { sb_push(yyextra->an.text, ' '); M_(PROJECT , TEXT); BEGIN(MARKER); }
    /* Bracketed marker: after the marker text, MARKER_XTRA skips whatever remains up to the closing bracket. */
<TEXT>{LB}: { M_(CATEGORY, MARKER_XTRA); BEGIN(MARKER); }
<TEXT>{LB}@ { M_(MEMBER , MARKER_XTRA); BEGIN(MARKER); }
<TEXT>{LB}~ { M_(PROJECT , MARKER_XTRA); BEGIN(MARKER); }
    /* A closing bracket ends the annotation text. */
<TEXT>\] { BEGIN(AFTERTEXT); }
    /* A ref tag: remember where in the text it occurred, then read its attributes. */
<TEXT>{LB}ref { yyextra->ref.offset = sb_count(yyextra->an.text); BEGIN(REF); }
    /* Any other opening bracket is dropped from the text. */
<TEXT>{LB}
    /* One or two blanks become a single space. */
<TEXT>[ \r\t]{1,2} { sb_push(yyextra->an.text, ' '); }
<TEXT>. { sb_push(yyextra->an.text, *yytext); }
    /* MARKER: the marker text itself, bare or quoted (quotes stripped); then continue in the state chosen by M_. */
<MARKER>{ATTR_ALNUM} { M_ADD(yytext , yyleng ); BEGIN(yyextra->mnext); };
<MARKER>{ATTR_QUOTED} { M_ADD(yytext+1, yyleng-2); BEGIN(yyextra->mnext); };
<MARKER>. { HMML_ERR("Cannot parse Marker. Expected quoted or alphanumeric attribute."); }
    /* TODO: store the extra text somewhere */
<MARKER_XTRA>\] { BEGIN(TEXT); }
<MARKER_XTRA>.
    /* REF: inside a ref tag. Each key points yyextra->attr at the matching field of the working reference. */
<REF>{S}
<REF>site{S}= { yyextra->attr = R_(site); BEGIN(R_ATTR); }
<REF>page{S}= { yyextra->attr = R_(page); BEGIN(R_ATTR); }
<REF>url{S}= { yyextra->attr = R_(url); BEGIN(R_ATTR); }
<REF>title{S}= { yyextra->attr = R_(title); BEGIN(R_ATTR); }
<REF>article{S}= { yyextra->attr = R_(article); BEGIN(R_ATTR); }
<REF>author{S}= { yyextra->attr = R_(author); BEGIN(R_ATTR); }
<REF>editor{S}= { yyextra->attr = R_(editor); BEGIN(R_ATTR); }
<REF>publisher{S}= { yyextra->attr = R_(publisher); BEGIN(R_ATTR); }
<REF>isbn{S}= { yyextra->attr = R_(isbn); BEGIN(R_ATTR); }
    /* End of the ref tag: hand the reference (and its strings) to the annotation and clear the working copy. */
<REF>\] { sb_push(yyextra->an.references, yyextra->ref); memset(&yyextra->ref, 0, sizeof(yyextra->ref)); BEGIN(TEXT); }
<REF>. { HMML_ERR("Unexpected item in ref: %s", yytext); }
<R_ATTR>{S} /* skip blanks between the equals sign and the value */
    /* Store a copy of the value in the reference field selected by the key rule. Any value stored earlier for the same key is released first, so a repeated key does not leak. */
<R_ATTR>{ATTR_SIMPLE} { free(*yyextra->attr); *yyextra->attr = strndup(yytext , yyleng ); BEGIN(REF); }
    /* Quoted form: the surrounding double quotes are dropped, escapes inside are kept as written. */
<R_ATTR>{ATTR_QUOTED} { free(*yyextra->attr); *yyextra->attr = strndup(yytext+1, yyleng-2); BEGIN(REF); }
    /* A closing bracket instead of a value: push it back so the REF rule can finish the reference, as V_ATTR does for the video tag. Without this rule the bracket fell through to the default echo rule. */
<R_ATTR>\] { yyless(0); BEGIN(REF); }
<AFTERTEXT,ANNOTATION>\[\/video\] { /* closing video tag: push the annotation in progress, if any, and report success */ NEWANNO(); return 0; }
    /* AFTERTEXT: what may follow the text block of an annotation. */
<AFTERTEXT>{S}
<AFTERTEXT>{LB}quote { BEGIN(QUOTES); }
    /* Category list: keep the bracket consumed but push the colon back for the CATEGORIES rules. */
<AFTERTEXT>{LB}: { BEGIN(CATEGORIES); yyless(1); }
    /* A bracket followed by a digit is the next timecode: push it all back for ANNOTATION. */
<AFTERTEXT>{LB}[0-9] { BEGIN(ANNOTATION); yyless(0); }
<AFTERTEXT>.. { HMML_ERR("Unexpected thing after text node: %s", yytext); }
<AFTERTEXT>. { HMML_ERR("Unexpected thing after text node: %s", yytext); }
    /* AUTHOR: everything up to the closing bracket (which is not stored) is the author name. */
<AUTHOR>[^\]\n]+\] { yyextra->an.author = strndup(yytext, yyleng-1); BEGIN(TEXT_START); }
<AUTHOR>{S}
    /* CATEGORIES: each colon-prefixed item becomes a category marker whose third field is -1 (presumably the offset, marking it as not tied to a text position - confirm against hmmlib.h). */
<CATEGORIES>{S}
<CATEGORIES>:{ATTR_SIMPLE} { HMML_Marker m = { HMML_CATEGORY, strndup(yytext+1, yyleng-1), -1 }; sb_push(yyextra->an.markers, m); }
<CATEGORIES>:{ATTR_QUOTED} { HMML_Marker m = { HMML_CATEGORY, strndup(yytext+2, yyleng-3), -1 }; sb_push(yyextra->an.markers, m); }
    /* NOTE(review): this enters QUOTES without consuming the quote keyword, unlike the AFTERTEXT rule; the keyword would then match the ATTR_ALNUM rule in QUOTES and be appended to quote.author. Confirm this is intended. */
<CATEGORIES>\]{LB} { BEGIN(QUOTES); }
<CATEGORIES>\] { BEGIN(ANNOTATION); }
<CATEGORIES>. { HMML_ERR("Unexpected character in category tag: '%c'\n", *yytext); }
    /* QUOTES: digits followed by the closing bracket give the quote id and end the tag; any alphanumeric word before that is appended to quote.author with no separator. */
<QUOTES>{S}
<QUOTES>[0-9]+{S}\] { yyextra->an.is_quote = true; yyextra->an.quote.id = atoi(yytext); BEGIN(ANNOTATION); }
<QUOTES>{ATTR_ALNUM} { memcpy(sb_add(yyextra->an.quote.author, yyleng), yytext, yyleng); }
<QUOTES>. { HMML_ERR("Unexpected character in quotes tag: %s", yytext); }
%%
/*
 * Number of char* slots at the start of HMML_Reference that this file walks
 * by index (freeing in _hmml_free_ref, printing in hmml_dump). It must match
 * the number of string fields in the struct and the r_tags table.
 */
#define HMML_REF_ITEMS 9
/* Helpers defined further down; declared here because hmml_parse_file uses them first. */
static void _hmml_free_ref(HMML_Reference*);
static void _hmml_free_anno(HMML_Annotation*);
HMML_Output hmml_parse_file(FILE* f){
HMML_Output output = {};
HMML_ParseState state = {};
state.first = true;
state.line = 1;
yyscan_t scan;
yylex_init_extra(&state, &scan);
yyset_in(f, scan);
output.well_formed = yylex(scan) == 0;
memcpy(&output.metadata, &state.meta, sizeof(HMML_VideoMetaData));
output.annotations = state.annos;
output.annotation_count = sb_count(state.annos);
for(size_t i = 0; i < output.annotation_count; ++i){
HMML_Annotation* a = output.annotations + i;
sb_push(a->text, 0);
a->marker_count = sb_count(a->markers);
a->reference_count = sb_count(a->references);
if(a->is_quote && a->quote.author){
sb_push(a->quote.author, 0);
}
}
if(!output.well_formed){
hmml_free(&output);
memcpy(&output.error, &state.error, sizeof(HMML_Error));
}
_hmml_free_anno(&state.an);
_hmml_free_ref(&state.ref);
yylex_destroy(scan);
return output;
}
/*
 * Free the string fields of a reference. The struct is viewed as an array of
 * char* and its first HMML_REF_ITEMS slots are released; the struct itself
 * is left alone and its pointers are not cleared.
 */
static void _hmml_free_ref(HMML_Reference* r){
    char** slots = (char**)r;
    size_t idx = 0;

    while(idx < HMML_REF_ITEMS){
        free(slots[idx]);
        idx++;
    }
}
/*
 * Release everything an annotation owns: its time and author strings, the
 * text buffer, each reference and the reference buffer, each marker's text
 * and the marker buffer, and the quote author buffer. The annotation struct
 * itself is not freed.
 */
static void _hmml_free_anno(HMML_Annotation* a){
    free(a->time);
    free(a->author);
    sb_free(a->text);

    size_t ref_total = sb_count(a->references);
    for(size_t r = 0; r < ref_total; r++){
        _hmml_free_ref(&a->references[r]);
    }
    sb_free(a->references);

    size_t marker_total = sb_count(a->markers);
    for(size_t m = 0; m < marker_total; m++){
        free(a->markers[m].text);
    }
    sb_free(a->markers);

    sb_free(a->quote.author);
}
/*
 * Release everything held by a parse result and reset it to all zeroes.
 * The metadata struct is treated as a plain array of char* and every slot
 * is freed. The HMML_Output object itself is not freed. A NULL argument is
 * ignored.
 */
void hmml_free(HMML_Output* hmml){
    if(hmml == NULL){
        return;
    }

    char** meta_slots = (char**)&hmml->metadata;
    const size_t meta_total = sizeof(HMML_VideoMetaData) / sizeof(char*);
    for(size_t m = 0; m < meta_total; m++){
        free(meta_slots[m]);
    }

    for(size_t n = 0; n < hmml->annotation_count; n++){
        _hmml_free_anno(&hmml->annotations[n]);
    }
    sb_free(hmml->annotations);

    free(hmml->error.message);

    memset(hmml, 0, sizeof(*hmml));
}
/* One entry of the cross-reference tables printed by hmml_dump. */
typedef struct {
/* Key being indexed; borrowed from the annotation data, never freed through this struct. */
char* text;
/* Stretchy buffer of annotation line numbers where the key occurs. */
int* lines;
} Index;
/*
 * Linear search of the stretchy buffer `base` for an entry whose text equals
 * `text` (compared with strcmp). Returns a pointer to the first match, or
 * NULL when there is none.
 */
Index* index_find(Index* base, const char* text){
    size_t total = sb_count(base);
    size_t pos = 0;

    while(pos < total){
        Index* entry = &base[pos];
        if(!strcmp(entry->text, text)){
            return entry;
        }
        pos++;
    }

    return NULL;
}
/*
 * Print a human-readable summary of a parse result to stdout.
 *
 * For a NULL argument "(null)" is printed; for a result that is not well
 * formed only the parse error is printed. Otherwise the output lists every
 * annotation, then an index of authors and of each marker type by line
 * number, then all references and all quotes.
 *
 * Side effect: marker text is lower-cased in place while the index is built,
 * so the annotations in `hmml` are modified by this call.
 */
void hmml_dump(HMML_Output* hmml){
    if(!hmml){
        puts("(null)");
        return;
    }

    if(!hmml->well_formed){
        /* Passing NULL to %s is undefined, so substitute a placeholder. */
        const char* msg = hmml->error.message ? hmml->error.message : "(no error message)";
        printf("Parse error on line %d: %s\n", hmml->error.line, msg);
        return;
    }

    puts("Annotations:");
    for(size_t i = 0; i < hmml->annotation_count; ++i){
        HMML_Annotation* a = hmml->annotations + i;
        printf("\t%3d [%7s] [%s]\n", a->line, a->time, a->text);
    }

    Index* authors = NULL;
    Index* markers[HMML_MARKER_COUNT] = {0};
    /* Widest key seen; kept as int because it is passed to printf as a '*' field width. */
    int max_text_len = 0;

    /* Index annotation lines by author. */
    for(size_t i = 0; i < hmml->annotation_count; ++i){
        HMML_Annotation* a = hmml->annotations + i;
        if(a->author){
            size_t len = strlen(a->author);
            if(len > (size_t)max_text_len){
                max_text_len = (int)len;
            }

            Index* idx;
            if(!(idx = index_find(authors, a->author))){
                Index x = { a->author };
                sb_push(authors, x);
                idx = &sb_last(authors);
            }
            sb_push(idx->lines, a->line);
        }
    }

    /* Index annotation lines by marker text, one table per marker type. */
    for(size_t i = 0; i < hmml->annotation_count; ++i){
        HMML_Annotation* a = hmml->annotations + i;
        for(size_t j = 0; j < a->marker_count; ++j){
            int type = a->markers[j].type;
            char* text = a->markers[j].text;

            size_t len = strlen(text);
            if(len > (size_t)max_text_len){
                max_text_len = (int)len;
            }

            /* Fold case so differently-cased spellings share one entry.
               tolower needs a value representable as unsigned char. */
            for(char* c = text; *c; ++c){
                *c = (char)tolower((unsigned char)*c);
            }

            Index* idx;
            if(!(idx = index_find(markers[type], text))){
                Index x = { text };
                sb_push(markers[type], x);
                idx = &sb_last(markers[type]);
            }
            sb_push(idx->lines, a->line);
        }
    }

    puts("Authors:");
    for(size_t i = 0; i < sb_count(authors); ++i){
        printf("\t %*s: ", max_text_len, authors[i].text);
        for(size_t j = 0; j < sb_count(authors[i].lines); ++j){
            printf("%3d ", authors[i].lines[j]);
        }
        puts("");
    }

    static const char* m_tags[HMML_MARKER_COUNT] = { "Categories", "Members", "Projects" };
    for(size_t i = 0; i < HMML_MARKER_COUNT; ++i){
        printf("%s:\n", m_tags[i]);
        for(size_t j = 0; j < sb_count(markers[i]); ++j){
            printf("\t %*s: ", max_text_len, markers[i][j].text);
            for(size_t k = 0; k < sb_count(markers[i][j].lines); ++k){
                printf("%3d ", markers[i][j].lines[k]);
            }
            puts("");
        }
    }

    /* Labels for the reference string slots, in the order they are walked below. */
    static const char* r_tags[] = { "Site", "Page", "URL", "Title", "Article", "Author", "Editor", "Publisher", "ISBN" };
    puts("References:");
    for(size_t i = 0; i < hmml->annotation_count; ++i){
        HMML_Annotation* a = hmml->annotations + i;
        for(size_t j = 0; j < a->reference_count; ++j){
            printf("\t%3d ", a->line);
            HMML_Reference* r = a->references + j;
            for(size_t k = 0; k < HMML_REF_ITEMS; ++k){
                char* item = ((char**)r)[k];
                if(item){
                    printf("[%s = %s] ", r_tags[k], item);
                }
            }
            puts("");
        }
    }

    puts("Quotes:");
    for(size_t i = 0; i < hmml->annotation_count; ++i){
        HMML_Annotation* a = hmml->annotations + i;
        if(a->is_quote){
            if(a->quote.author){
                printf("\t%3d [Quote #%d, by %s]", a->line, a->quote.id, a->quote.author);
            } else {
                printf("\t%3d [Quote #%d]", a->line, a->quote.id);
            }
            puts("");
        }
    }

    /* The index tables only borrow their text pointers; free just the buffers. */
    for(size_t i = 0; i < sb_count(authors); ++i){
        sb_free(authors[i].lines);
    }
    sb_free(authors);

    for(size_t i = 0; i < HMML_MARKER_COUNT; ++i){
        for(size_t j = 0; j < sb_count(markers[i]); ++j){
            sb_free(markers[i][j].lines);
        }
        sb_free(markers[i]);
    }
}