269 lines
9.0 KiB
C
269 lines
9.0 KiB
C
/** @license 2022 Neil Edelman, distributed under the terms of the
 [MIT License](https://opensource.org/licenses/MIT).

 Lexer for journal entries.

 @std C89/90 */
|
|
|
|
#include "../src/lex.h"
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <assert.h>
|
|
#include <limits.h>
|
|
#include <errno.h>
|
|
|
|
/*!re2c
|
|
re2c:yyfill:enable = 0;
|
|
re2c:define:YYCTYPE = char;
|
|
*/
|
|
|
|
/** "-"? [1-9][0-9]*$, within the range of `INT_MAX`. */
|
|
int lex_looks_like_year(const char *const a, int *const year) {
|
|
const char *YYCURSOR = a, *YYMARKER = a, *s0;
|
|
/*!stags:re2c format = 'const char *@@;\n'; */
|
|
(void)yyt2;
|
|
assert(a && year);
|
|
/*!re2c
|
|
@s0 ("-"? [1-9][0-9]* | "0") "\x00" {
|
|
int sign = 1, mag;
|
|
if(*s0 == '-') { sign = -1; s0++; }
|
|
for(mag = 0; *s0 != '\0'; s0++) {
|
|
int d = *s0 - '0';
|
|
if((INT_MAX - d) / 10 < mag) return 0;
|
|
mag = mag * 10 + d;
|
|
}
|
|
*year = sign * mag;
|
|
return 1;
|
|
}
|
|
* { return 0; }
|
|
*/
|
|
}
|
|
|
|
/** 1 <= [0-1][0-9]$ <= 12 */
|
|
int lex_looks_like_month(const char *const a) {
|
|
const char *YYCURSOR = a, *YYMARKER = a, *s0;
|
|
/*!stags:re2c format = 'const char *@@;\n'; */
|
|
(void)yyt1, (void)yyt2;
|
|
assert(a);
|
|
/*!re2c
|
|
@s0 [0-1][0-9] "\x00" {
|
|
int val = 10 * (s0[0] - '0') + (s0[1] - '0');
|
|
return val < 1 || val > 12 ? 0 : val;
|
|
}
|
|
* { return 0; }
|
|
*/
|
|
}
|
|
|
|
/** 1 <= [0-3][0-9].txt$ <= 31 */
|
|
int lex_looks_like_day(const char *const a) {
|
|
const char *YYCURSOR = a, *YYMARKER = a, *s0;
|
|
/*!stags:re2c format = 'const char *@@;\n'; */
|
|
(void)yyt1, (void)yyt2;
|
|
assert(a);
|
|
/*!re2c
|
|
@s0 [0-3][0-9] ".txt\x00" {
|
|
int val = 10 * (s0[0] - '0') + (s0[1] - '0');
|
|
return val < 1 || val > 31 ? 0 : val;
|
|
}
|
|
* { return 0; }
|
|
*/
|
|
}
|
|
|
|
/* This defines `enum condition`. */
|
|
/*!types:re2c*/
|
|
|
|
/* Kinds of argument an edict can expect, as an X-macro list. For
 "[edict: expect; there; to; be; args]", the expect stack would have
 `size = 5` entries of `EXPECT_KEYWORD`. This mirrors the arguments in
 `LEX_SYMBOL`; every entry should also have an `edict_*` condition in
 <fn:lex_next> and <fn:expect_pop>. The last entry goes through `Y` instead of
 `X` so that a user can leave off the trailing separator. */
#define EXPECT_HEAD \
	X(keyword, KEYWORD) \
	X(date, DATE) \
	X(natural, NATURAL)
#define EXPECT_CONS \
	Y(freeform, FREEFORM)
#define EXPECT \
	EXPECT_HEAD \
	EXPECT_CONS
|
|
|
|
/** Scan reads one file as a time and extracts semantic information. Valid to
|
|
access only while underlying pointers do not change. This is a singleton, not
|
|
concurrent: convenient and bad. */
|
|
static struct scan {
|
|
/* `re2c` variables; these point directly into `buffer`. */
|
|
const char *marker, *ctx_marker, *from, *cursor;
|
|
/* Weird `c2re` stuff: these fields have to come after when >5? */
|
|
const char *label, *buffer;
|
|
enum condition condition;
|
|
size_t line;
|
|
int is_ws_expected, is_source;
|
|
#define X(n, N) EXPECT_ ## N,
|
|
#define Y(n, N) EXPECT_ ## N
|
|
struct { unsigned size; enum { EXPECT } expect[16]; } edict;
|
|
#undef X
|
|
#undef Y
|
|
} scan;
|
|
|
|
/** Resets the buffer to some `buffer`. */
|
|
void lex_reset(const char *const buffer) {
|
|
scan.marker = scan.ctx_marker = scan.from = scan.cursor = scan.label
|
|
= scan.buffer = buffer;
|
|
scan.condition = yycline;
|
|
scan.line = 1;
|
|
}
|
|
|
|
/** I don't think `re2c` supports branching on variable conditions.
|
|
It does now? */
|
|
static void expect_pop(void) {
|
|
if(!scan.edict.size) { scan.condition = yycedict_end; return; }
|
|
switch(scan.edict.expect[--scan.edict.size]) {
|
|
#define X(n, N) case EXPECT_ ## N : scan.condition = yycedict_ ## n; break;
|
|
#define Y(n, N) case EXPECT_ ## N : scan.condition = yycedict_ ## n; break;
|
|
EXPECT
|
|
#undef X
|
|
#undef Y
|
|
}
|
|
}
|
|
|
|
int lex_next(struct lex *const x) {
|
|
/*!re2c /**/
|
|
re2c:flags:tags = 1;
|
|
re2c:define:YYCURSOR = scan.cursor;
|
|
re2c:define:YYMARKER = scan.marker;
|
|
re2c:define:YYCTXMARKER = scan.ctx_marker;
|
|
re2c:define:YYCONDTYPE = 'condition';
|
|
re2c:define:YYGETCONDITION = 'scan.condition';
|
|
re2c:define:YYGETCONDITION:naked = 1;
|
|
re2c:define:YYSETCONDITION = 'scan.condition = @@;';
|
|
re2c:define:YYSETCONDITION:naked = 1;
|
|
sentinel = "\x00";
|
|
illegal = [\x01-\x08\x0a-\x1f\x7f]; /* unix-style control characters */
|
|
newline = "\n";
|
|
ws = [ \t];
|
|
glyph = [^] \ (sentinel | illegal | newline | ws);
|
|
keyword = ([a-zA-Z] | [0-9][0-9_\-]*[a-zA-Z]) [a-zA-Z0-9_\-]*;
|
|
decimal = "-"? ([1-9][0-9]* | [0])? "." [0-9]+ | [1-9][0-9]* | [0];
|
|
natural = [1-9][0-9]*;
|
|
date = "-"? natural "-" [0-1][0-9] "-" [0-1][0-9];
|
|
*/
|
|
const char *s0, *s1;
|
|
/*!stags:re2c format = 'const char *@@;\n'; */
|
|
assert(x);
|
|
if(!scan.buffer) return 0;
|
|
x->line = scan.line;
|
|
x->s0 = x->s1 = 0;
|
|
scan:
|
|
/*!re2c /**/
|
|
<*> illegal { return x->symbol = ILLEGAL, 0; }
|
|
<*> * { return x->symbol = SYNTAX, 0; }
|
|
<*> sentinel
|
|
{ return x->symbol = scan.condition == yycline ? END : ILLEGAL, 0; }
|
|
<expect_line> newline => line { x->line = ++scan.line; goto scan; }
|
|
<expect_caption> ws* @s0 glyph (glyph | ws)* @s1 ws* / newline
|
|
=> expect_line
|
|
{ x->s0 = s0, x->s1 = s1; return x->symbol = CAPTION, 1; }
|
|
<line> newline { x->line = ++scan.line; return x->symbol = PARAGRAPH, 1; }
|
|
<line> "--" :=> source
|
|
<line> "->" :=> location
|
|
<line> "[" :=> edict
|
|
/* Just plain text. */
|
|
<line> ws* / glyph :=> text /* match-empty-string: text takes care of it. */
|
|
|
|
<text> newline => line { x->line = ++scan.line; goto scan; }
|
|
<text, bible> ws+ { goto scan; }
|
|
<text> @s0 glyph+ @s1
|
|
{ x->s0 = s0, x->s1 = s1; return x->symbol = TEXT, 1; }
|
|
|
|
roman1 = "1"|"I";
|
|
roman2 = roman1|"2"|"II";
|
|
roman3 = roman2|"3"|"III";
|
|
bible_ref = natural ":" natural [ab]? ("-" (natural ":")? natural [ab]?)?;
|
|
|
|
<text> @s0 ("Genesis" | "Exodus" | "Leviticus" | "Numbers" | "Deuteronomy"
|
|
| "Joshua" | "Judges" | "Ruth" | roman2 " Samuel" | roman2 " Kings"
|
|
| roman2 " Chronicles" | "Ezra" | "Nehemiah" | "Esther" | "Job"
|
|
| "Psalms" | "Proverbs" | "Ecclesiastes" | "Song of Solomon" | "Isaiah"
|
|
| "Jeremiah" | "Lamentations" | "Ezekiel" | "Daniel" | "Hosea" | "Joel"
|
|
| "Amos" | "Obadiah" | "Jonah" | "Micah" | "Nahum" | "Habakkuk"
|
|
| "Zephaniah" | "Haggai" | "Zechariah" | "Malachi" | "Matthew" | "Mark"
|
|
| "Luke" | "John" | "Acts" | "Romans" | roman2 " Corinthians"
|
|
| "Galatians" | "Ephesians" | "Philippians" | "Colossians"
|
|
| roman2 " Thessalonians" | roman2 " Timothy" | "Titus" | "Philemon"
|
|
| "Hebrews" | "James" | roman2 " Peter" | roman3 " John" | "Jude"
|
|
| "Revelation") @s1 ws* / bible_ref ws+ "--" ws+ "``"
|
|
=> bible { x->s0 = s0, x->s1 = s1; return x->symbol = BIBLE_BOOK, 1; }
|
|
<bible> @s0 bible_ref @s1 ws+ "--" ws+ "``"
|
|
{ x->s0 = s0, x->s1 = s1; return x->symbol = BIBLE_CHAPTER_VERSE, 1; }
|
|
<bible> "``" { return x->symbol = BIBLE_NEXT, 1; }
|
|
<bible> "''" => expect_line { printf("**reset;\n"); }
|
|
/* HACK! Bible verses generally don't contain apostrophes. */
|
|
<bible> @s0 (glyph \ ['])+ @s1
|
|
{ x->s0 = s0, x->s1 = s1; return x->symbol = BIBLE_TEXT, 1; }
|
|
<bible> newline { x->line = ++scan.line; goto scan; }
|
|
|
|
<source> @s0 keyword @s1 => expect_line
|
|
{ x->s0 = s0, x->s1 = s1; return x->symbol = SOURCE_RECALL, 1; }
|
|
|
|
<location> "" / "(" :=> map
|
|
<location> "[" ws* @s0 keyword @s1 ws* "]"
|
|
{ x->s0 = s0, x->s1 = s1; return x->symbol = LOCATION_SAVE, 1; }
|
|
<location> @s0 keyword @s1 => expect_line
|
|
{ x->s0 = s0, x->s1 = s1; return x->symbol = LOCATION_RECALL, 1; }
|
|
|
|
<map> "(" @s0 decimal "," @s1 decimal ")" => expect_caption
|
|
{ x->s0 = s0, x->s1 = s1; return x->symbol = LOCATION, 1; }
|
|
|
|
<edict> "source"
|
|
{ if(scan.is_ws_expected || scan.edict.size)
|
|
return x->symbol = SYNTAX, 0;
|
|
scan.is_ws_expected = 1, scan.is_source = 1;
|
|
scan.edict.size = 2;
|
|
scan.edict.expect[1] = EXPECT_KEYWORD;
|
|
scan.edict.expect[0] = EXPECT_FREEFORM;
|
|
return x->symbol = SOURCE, 1; }
|
|
<edict> "default"
|
|
{ if(scan.is_ws_expected || !scan.is_source)
|
|
return x->symbol = SYNTAX, 0;
|
|
scan.is_ws_expected = 1, scan.is_source = 0;
|
|
return x->symbol = DEFAULT, 1; }
|
|
|
|
<edict> "ed"
|
|
{ if(scan.is_ws_expected || scan.edict.size)
|
|
return x->symbol = SYNTAX, 0;
|
|
scan.is_ws_expected = 1; /* no idea, just copy; probably should do sth */
|
|
scan.edict.size = 1;
|
|
scan.edict.expect[0] = EXPECT_FREEFORM;
|
|
return x->symbol = EDITORIALIZING, 1; }
|
|
|
|
<edict> "significant"
|
|
{ if(scan.is_ws_expected || scan.edict.size)
|
|
return x->symbol = SYNTAX, 0;
|
|
scan.is_ws_expected = 1;
|
|
scan.edict.size = 3;
|
|
scan.edict.expect[2] = EXPECT_NATURAL;
|
|
scan.edict.expect[1] = EXPECT_FREEFORM;
|
|
scan.edict.expect[0] = EXPECT_DATE;
|
|
return x->symbol = SIGNIFICANT, 1; }
|
|
<edict> @s0 natural @s1
|
|
{ if(scan.is_ws_expected || scan.edict.size)
|
|
return x->symbol = SYNTAX, 0;
|
|
scan.is_ws_expected = 1;
|
|
x->s0 = s0, x->s1 = s1;
|
|
return x->symbol = SIGNIFICANT_RECALL, 1; }
|
|
|
|
<edict> ws+ { scan.is_ws_expected = 0; goto scan; }
|
|
<edict> ":"
|
|
{ if(!scan.edict.size) return x->symbol = SYNTAX, 0;
|
|
scan.is_ws_expected = 0, scan.is_source = 0;
|
|
expect_pop(); goto scan; }
|
|
<edict_keyword> ws* @s0 keyword @s1 ws* ";"?
|
|
{ x->s0 = s0, x->s1 = s1; expect_pop();
|
|
return x->symbol = ARG_KEYWORD, 1; }
|
|
<edict_date> ws* @s0 date @s1 ws* ";"?
|
|
{ x->s0 = s0, x->s1 = s1; expect_pop();
|
|
return x->symbol = ARG_DATE, 1; }
|
|
<edict_natural> ws* @s0 natural @s1 ws* ";"?
|
|
{ x->s0 = s0, x->s1 = s1; expect_pop();
|
|
return x->symbol = ARG_NATURAL, 1; }
|
|
<edict_freeform>
|
|
ws* @s0 (glyph \ [;[\]]) ((glyph \ [;[\]]) | ws)* @s1 ws* ";"?
|
|
{ x->s0 = s0, x->s1 = s1; expect_pop();
|
|
return x->symbol = ARG_FREEFORM, 1; }
|
|
<edict, edict_end> "]" => expect_line
|
|
{ if(scan.edict.size) return 0; goto scan; }
|
|
*/
|
|
}
|