/** @license 2022 Neil Edelman, distributed under the terms of the
 [MIT License](https://opensource.org/licenses/MIT).

 Lexer for journal entries.

 @std C89/90 */

#include "../src/lex.h"
#include <assert.h> /* assert */
#include <limits.h> /* INT_MAX */
#include <stddef.h> /* size_t */

/*!re2c
re2c:yyfill:enable = 0;
re2c:define:YYCTYPE = char;
*/

/** Tests whether the entire string `a` is ("-"? [1-9][0-9]* | "0") with a
 magnitude within the range of `INT_MAX`.
 @return 1 and stores the signed value in `year` on success; 0 on a mismatch
 or when the magnitude would overflow, in which case `year` is not written. */
int lex_looks_like_year(const char *const a, int *const year) {
	const char *YYCURSOR = a, *YYMARKER = a, *s0;
	/*!stags:re2c format = 'const char *@@;\n'; */
	(void)yyt2;
	assert(a && year);
	/*!re2c
	@s0 ("-"? [1-9][0-9]* | "0") "\x00" {
		int sign = 1, mag;
		if(*s0 == '-') { sign = -1; s0++; }
		for(mag = 0; *s0 != '\0'; s0++) {
			int d = *s0 - '0';
			/* Refuse before `mag * 10 + d` could exceed `INT_MAX`. */
			if((INT_MAX - d) / 10 < mag) return 0;
			mag = mag * 10 + d;
		}
		*year = sign * mag;
		return 1;
	}
	* { return 0; }
	*/
}

/** Tests whether the entire string `a` is [0-1][0-9] with a value in [1, 12].
 @return The month number, or 0 if `a` does not look like a month. */
int lex_looks_like_month(const char *const a) {
	const char *YYCURSOR = a, *YYMARKER = a, *s0;
	/*!stags:re2c format = 'const char *@@;\n'; */
	(void)yyt1, (void)yyt2;
	assert(a);
	/*!re2c
	@s0 [0-1][0-9] "\x00" {
		int val = 10 * (s0[0] - '0') + (s0[1] - '0');
		return val < 1 || val > 12 ? 0 : val;
	}
	* { return 0; }
	*/
}

/** Tests whether the entire string `a` is [0-3][0-9] followed by ".txt", with
 a value in [1, 31].
 @return The day number, or 0 if `a` does not look like a day file name. */
int lex_looks_like_day(const char *const a) {
	const char *YYCURSOR = a, *YYMARKER = a, *s0;
	/*!stags:re2c format = 'const char *@@;\n'; */
	(void)yyt1, (void)yyt2;
	assert(a);
	/*!re2c
	@s0 [0-3][0-9] ".txt\x00" {
		int val = 10 * (s0[0] - '0') + (s0[1] - '0');
		return val < 1 || val > 31 ? 0 : val;
	}
	* { return 0; }
	*/
}

/* This defines `enum condition`. */
/*!types:re2c*/

/* "[edict: expect; there; to; be; args]", in this case, expect would be a
 stack of `size = 5` `EXPECT_KEYWORD`. This mirrors arguments in `LEX_SYMBOL`.
 Every entry `n` must also have an `edict_n` condition in the scanner, because
 `expect_pop` switches to `yycedict_n`. `X` entries are followed by a comma in
 the enumeration below; the single `Y` entry is the last one. */
#define EXPECT_HEAD X(keyword, KEYWORD) X(date, DATE) X(natural, NATURAL)
#define EXPECT_CONS Y(freeform, FREEFORM)
#define EXPECT EXPECT_HEAD EXPECT_CONS

/** Scan reads one file at a time and extracts semantic information. Valid to
 access only while underlying pointers do not change. This is a singleton, not
 concurrent: convenient and bad. */
static struct scan {
	/* `re2c` variables; these point directly into `buffer`. */
	const char *marker, *ctx_marker, *from, *cursor;
	/* Weird `re2c` stuff: these fields have to come after when >5? */
	const char *label, *buffer;
	enum condition condition;
	size_t line;
	/* Only meaningful while scanning inside an edict. */
	int is_ws_expected, is_source;
#define X(n, N) EXPECT_ ## N,
#define Y(n, N) EXPECT_ ## N
	/* Stack of argument kinds still owed by the current edict; the next one
	 to be read is at `expect[size - 1]`. */
	struct {
		unsigned size;
		enum { EXPECT } expect[16];
	} edict;
#undef X
#undef Y
} scan;

/** Resets the buffer to some `buffer` and returns the scanner to its initial
 state: the `line` condition, line 1, and no edict in progress. Clearing the
 edict state matters because `scan` is reused; a previous scan that stopped
 inside an edict would otherwise leak its flags and pending arguments. */
void lex_reset(const char *const buffer) {
	scan.marker = scan.ctx_marker = scan.from = scan.cursor
		= scan.label = scan.buffer = buffer;
	scan.condition = yycline;
	scan.line = 1;
	scan.is_ws_expected = 0;
	scan.is_source = 0;
	scan.edict.size = 0;
}

/** Pops the next expected argument kind off the edict stack and switches the
 scanner to the matching `edict_*` condition; with an empty stack, switches to
 `edict_end`. I don't think `re2c` supports branching on variable conditions.
 It does now? */
static void expect_pop(void) {
	if(!scan.edict.size) { scan.condition = yycedict_end; return; }
	switch(scan.edict.expect[--scan.edict.size]) {
#define X(n, N) case EXPECT_ ## N : scan.condition = yycedict_ ## n; break;
#define Y(n, N) case EXPECT_ ## N : scan.condition = yycedict_ ## n; break;
	EXPECT
#undef X
#undef Y
	}
}

/** Scans the next token of the buffer set by `lex_reset` into `x`. `x->line`
 is set to the current line and `x->s0`, `x->s1` are cleared; rules that
 capture text then point them at the start and just past the end of the
 capture, inside the buffer.
 @return 1 if a token was produced, with `x->symbol` telling which. 0 if no
 buffer is set (`x` is then untouched), at the sentinel (`x->symbol` is `END`
 in the `line` condition and `ILLEGAL` in any other), or on bad input
 (`x->symbol` is `ILLEGAL` or `SYNTAX`). */
int lex_next(struct lex *const x) {
	/*!re2c /**/
	re2c:flags:tags = 1;
	re2c:define:YYCURSOR = scan.cursor;
	re2c:define:YYMARKER = scan.marker;
	re2c:define:YYCTXMARKER = scan.ctx_marker;
	re2c:define:YYCONDTYPE = 'condition';
	re2c:define:YYGETCONDITION = 'scan.condition';
	re2c:define:YYGETCONDITION:naked = 1;
	re2c:define:YYSETCONDITION = 'scan.condition = @@;';
	re2c:define:YYSETCONDITION:naked = 1;

	sentinel = "\x00";
	illegal = [\x01-\x08\x0a-\x1f\x7f]; /* unix-style control characters */
	newline = "\n";
	ws = [ \t];
	glyph = [^] \ (sentinel | illegal | newline | ws);
	keyword = ([a-zA-Z] | [0-9][0-9_\-]*[a-zA-Z]) [a-zA-Z0-9_\-]*;
	decimal = "-"? ([1-9][0-9]* | [0])? "." [0-9]+ | [1-9][0-9]* | [0];
	natural = [1-9][0-9]*;
	/* The day field allows a leading 0-3, the same as `lex_looks_like_day`;
	 a leading 0-1 would reject every date past the 19th. */
	date = "-"? natural "-" [0-1][0-9] "-" [0-3][0-9];
	*/
	const char *s0, *s1;
	/*!stags:re2c format = 'const char *@@;\n'; */
	assert(x);
	if(!scan.buffer) return 0;
	x->line = scan.line;
	x->s0 = x->s1 = 0;
scan:
	/*!re2c /**/
	/* NOTE(review): apart from the three wildcard-condition rules directly
	 below, no rule in this block carries a condition prefix in the text under
	 review, although the rules switch conditions with `=>` and `:=>`. Confirm
	 the prefixes against the canonical source before regenerating. */
	<*> illegal { return x->symbol = ILLEGAL, 0; }
	<*> * { return x->symbol = SYNTAX, 0; }
	<*> sentinel
		{ return x->symbol = scan.condition == yycline ? END : ILLEGAL, 0; }

	newline => line { x->line = ++scan.line; goto scan; }

	ws* @s0 glyph (glyph | ws)* @s1 ws* / newline => expect_line
		{ x->s0 = s0, x->s1 = s1; return x->symbol = CAPTION, 1; }

	newline { x->line = ++scan.line; return x->symbol = PARAGRAPH, 1; }
	"--" :=> source
	"->" :=> location
	"[" :=> edict
	/* Just plain text. */
	ws* / glyph :=> text /* match-empty-string: text takes care of it. */

	newline => line { x->line = ++scan.line; goto scan; }
	ws+ { goto scan; }
	@s0 glyph+ @s1
		{ x->s0 = s0, x->s1 = s1; return x->symbol = TEXT, 1; }

	bible_ref = natural ":" natural [ab]? ("-" (natural ":")? natural [ab]?)?;
	glyph_minus = glyph \ ['];
	@s0 ("Genesis" | "Exodus" | "Leviticus" | "Numbers" | "Deuteronomy"
		| "Joshua" | "Judges" | "Ruth" | "I"{1,2} " Samuel" | "I"{1,2} " Kings"
		| "I"{1,2} " Chronicles" | "Ezra" | "Nehemiah" | "Esther" | "Job"
		| "Psalms" | "Proverbs" | "Ecclesiastes" | "Song of Solomon" | "Isaiah"
		| "Jeremiah" | "Lamentations" | "Ezekiel" | "Daniel" | "Hosea" | "Joel"
		| "Amos" | "Obadiah" | "Jonah" | "Micah" | "Nahum" | "Habakkuk"
		| "Zephaniah" | "Haggai" | "Zechariah" | "Malachi" | "Matthew" | "Mark"
		| "Luke" | "John" | "Acts" | "Romans" | "I"{1,2} " Corinthians"
		| "Galatians" | "Ephesians" | "Philippians" | "Colossians"
		| "I"{1,2} " Thessalonians" | "I"{1,2} " Timothy" | "Titus"
		| "Philemon" | "Hebrews" | "James" | "I"{1,2} " Peter"
		| "I"{1,3} " John" | "Jude" | "Revelation")
		@s1 ws* / bible_ref ws+ "--" ws+ "``" => bible
		{ x->s0 = s0, x->s1 = s1; return x->symbol = BIBLE_BOOK, 1; }
	@s0 bible_ref @s1 ws+ "--" ws+ "``"
		{ x->s0 = s0, x->s1 = s1; return x->symbol = BIBLE_CHAPTER_VERSE, 1; }
	"``" { return x->symbol = BIBLE_NEXT, 1; }
	"''" :=> text
	/* fixme: This is a hack that doesn't allow apostrophes at the end of a
	 word, (not sure there are any in the bible.) Is ' terminated by '';
	 otherwise same as glyph+ above. The outer parentheses are required:
	 alternation binds looser than concatenation, so without them `@s0` would
	 belong to the first alternative only and `@s1` to the second only. */
	@s0 ((glyph_minus+ ("'" glyph_minus+)*) | (("'" glyph_minus+)+)) @s1
		{ x->s0 = s0, x->s1 = s1; return x->symbol = BIBLE_TEXT, 1; }
	/* Multiple verses can be present, but they end in ''. Not strictly
	 enforced. */
	newline / (newline | "``") { x->line = ++scan.line; goto scan; }
	newline { return x->symbol = SYNTAX, 0; }

	@s0 keyword @s1 => expect_line
		{ x->s0 = s0, x->s1 = s1; return x->symbol = SOURCE_RECALL, 1; }

	"" / "(" :=> map
	"[" ws* @s0 keyword @s1 ws* "]"
		{ x->s0 = s0, x->s1 = s1; return x->symbol = LOCATION_SAVE, 1; }
	@s0 keyword @s1 => expect_line
		{ x->s0 = s0, x->s1 = s1; return x->symbol = LOCATION_RECALL, 1; }
	"(" @s0 decimal "," @s1 decimal ")" => expect_caption
		{ x->s0 = s0, x->s1 = s1; return x->symbol = LOCATION, 1; }

	/* Edict heads. Arguments are pushed in reverse because `expect_pop`
	 takes them from the top of the stack. */
	"source" {
		if(scan.is_ws_expected || scan.edict.size)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1, scan.is_source = 1;
		scan.edict.size = 2;
		scan.edict.expect[1] = EXPECT_KEYWORD;
		scan.edict.expect[0] = EXPECT_FREEFORM;
		return x->symbol = SOURCE, 1;
	}
	"default" {
		if(scan.is_ws_expected || !scan.is_source)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1, scan.is_source = 0;
		return x->symbol = DEFAULT, 1;
	}
	"ed" {
		if(scan.is_ws_expected || scan.edict.size)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1; /* no idea, just copy; probably should do sth */
		scan.edict.size = 1;
		scan.edict.expect[0] = EXPECT_FREEFORM;
		return x->symbol = EDITORIALIZING, 1;
	}
	"significant" {
		if(scan.is_ws_expected || scan.edict.size)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1;
		scan.edict.size = 3;
		scan.edict.expect[2] = EXPECT_NATURAL;
		scan.edict.expect[1] = EXPECT_FREEFORM;
		scan.edict.expect[0] = EXPECT_DATE;
		return x->symbol = SIGNIFICANT, 1;
	}
	@s0 natural @s1 {
		if(scan.is_ws_expected || scan.edict.size)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1;
		x->s0 = s0, x->s1 = s1;
		return x->symbol = SIGNIFICANT_RECALL, 1;
	}
	ws+ { scan.is_ws_expected = 0; goto scan; }
	":" {
		if(!scan.edict.size) return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 0, scan.is_source = 0;
		expect_pop();
		goto scan;
	}

	/* Edict arguments; each one selects the condition for the next. */
	ws* @s0 keyword @s1 ws* ";"? {
		x->s0 = s0, x->s1 = s1;
		expect_pop();
		return x->symbol = ARG_KEYWORD, 1;
	}
	ws* @s0 date @s1 ws* ";"? {
		x->s0 = s0, x->s1 = s1;
		expect_pop();
		return x->symbol = ARG_DATE, 1;
	}
	ws* @s0 natural @s1 ws* ";"? {
		x->s0 = s0, x->s1 = s1;
		expect_pop();
		return x->symbol = ARG_NATURAL, 1;
	}
	ws* @s0 (glyph \ [;[\]]) ((glyph \ [;[\]]) | ws)* @s1 ws* ";"? {
		x->s0 = s0, x->s1 = s1;
		expect_pop();
		return x->symbol = ARG_FREEFORM, 1;
	}
	"]" => expect_line {
		/* Closing with arguments still owed is an error; report it like
		 every other failure instead of leaving `x->symbol` unset. */
		if(scan.edict.size) return x->symbol = SYNTAX, 0;
		/* Leave no edict flags behind: `[<natural>]` sets
		 `is_ws_expected`, which would reject the next edict head. */
		scan.is_ws_expected = 0, scan.is_source = 0;
		goto scan;
	}
	*/
}