interpret/src/lex.re_c.c

224 lines
6.5 KiB
C

/** @license 2022 Neil Edelman, distributed under the terms of the
[MIT License](https://opensource.org/licenses/MIT).
Lexer for journal entries.
@std C89/90 */
#include "../src/lex.h"
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <limits.h>
#include <errno.h>
/*!re2c
re2c:yyfill:enable = 0;
re2c:define:YYCTYPE = char;
*/
/** Tests whether the entire null-terminated string `a` is a decimal year:
 either a lone `0`, or an optional `-` followed by digits with no leading zero.
 @param[a] String to test; must not be null.
 @param[year] Receives the signed value on success; must not be null.
 @return 1 if `a` matched and its magnitude fits in an `int`, otherwise 0,
 in which case `year` is not written. */
int lex_looks_like_year(const char *const a, int *const year) {
	const char *YYCURSOR = a, *YYMARKER = a, *s0;
	/*!stags:re2c format = 'const char *@@;\n'; */
	(void)yyt2, (void)yyt3;
	assert(a && year);
	/*!re2c
	@s0 ("-"? [1-9][0-9]* | "0") "\x00" {
		const int negative = *s0 == '-';
		const char *digit = s0 + negative;
		int magnitude = 0;
		while(*digit != '\0') {
			const int d = *digit++ - '0';
			if(magnitude > (INT_MAX - d) / 10) return 0;
			magnitude = magnitude * 10 + d;
		}
		*year = negative ? -magnitude : magnitude;
		return 1;
	}
	* { return 0; }
	*/
}
/** Tests whether the entire null-terminated string `a` is a two-digit month.
 @param[a] String to test; must not be null.
 @return The month in [1, 12] if `a` is exactly two digits in that range,
 otherwise 0. */
int lex_looks_like_month(const char *const a) {
	const char *YYCURSOR = a, *YYMARKER = a, *s0;
	/*!stags:re2c format = 'const char *@@;\n'; */
	(void)yyt1, (void)yyt2, (void)yyt3;
	assert(a);
	/*!re2c
	@s0 [0-1][0-9] "\x00" {
		const int tens = s0[0] - '0', ones = s0[1] - '0';
		const int month = tens * 10 + ones;
		if(month >= 1 && month <= 12) return month;
		return 0;
	}
	* { return 0; }
	*/
}
/** Tests whether the entire null-terminated string `a` is a two-digit day
 followed by the extension `.txt`.
 @param[a] String to test; must not be null.
 @return The day in [1, 31] if `a` is exactly two digits in that range and
 then `.txt`, otherwise 0. */
int lex_looks_like_day(const char *const a) {
	const char *YYCURSOR = a, *YYMARKER = a, *s0;
	/*!stags:re2c format = 'const char *@@;\n'; */
	(void)yyt1, (void)yyt2, (void)yyt3;
	assert(a);
	/*!re2c
	@s0 [0-3][0-9] ".txt\x00" {
		const int tens = s0[0] - '0', ones = s0[1] - '0';
		const int day = tens * 10 + ones;
		if(day >= 1 && day <= 31) return day;
		return 0;
	}
	* { return 0; }
	*/
}
/* This defines `enum condition`. */
/*!types:re2c*/
/* X-macro list of the kinds of argument an edict can expect, given as
(lower-case, UPPER-CASE) name pairs. For "[edict: expect; there; to; be; args]",
the expect stack would hold `size = 5` entries of `EXPECT_KEYWORD`. This
mirrors arguments in `LEX_SYMBOL`, and each entry should also have an `edict_*`
condition in <fn:lex_next> and <fn:expect_pop>.
The final entry is listed separately through `Y` so that an expansion can
treat it differently from the `X` entries, for instance leaving off the
trailing comma after the last enumerator. */
#define EXPECT_HEAD X(keyword, KEYWORD) X(date, DATE)
#define EXPECT_CONS Y(freeform, FREEFORM)
#define EXPECT EXPECT_HEAD EXPECT_CONS
/** The scanner reads a buffer and extracts semantic information. Valid to
access only while the underlying pointers do not change. There is a single,
file-scope instance shared by <fn:lex_reset>, <fn:expect_pop>, and
<fn:lex_next>. */
static struct scan {
/* `re2c` variables; these point directly into `buffer`. In <fn:lex_next>,
`marker`, `ctx_marker`, and `cursor` are bound to `YYMARKER`, `YYCTXMARKER`,
and `YYCURSOR` respectively. */
const char *marker, *ctx_marker, *from, *cursor;
/* Weird `re2c` stuff: these fields have to come after when >5? */
const char *label, *buffer;
/* Current `re2c` start condition, read and written by the generated lexer. */
enum condition condition;
/* Line counter; starts at 1 on reset and is incremented for each newline
consumed. */
size_t line;
/* State flags used by the `edict` rules of <fn:lex_next>. */
int is_ws_expected, is_source;
/* `X` entries end in a comma and the final `Y` entry does not, so the
anonymous enum below has no trailing comma. */
#define X(n, N) EXPECT_ ## N,
#define Y(n, N) EXPECT_ ## N
/* Stack of argument kinds the current edict still expects; `size` is the
number of entries in use and <fn:expect_pop> takes them off the top. */
struct { unsigned size; enum { EXPECT } expect[16]; } edict;
#undef X
#undef Y
} scan; /* Not suited for concurrency. Simple. */
/** Resets the scanner to the start of `buffer` and puts it in its initial
 state: `line` condition, line 1, no pending edict.
 @param[buffer] Null-terminated text to scan; the scanner keeps pointers into
 it, so it must stay valid and unchanged while scanning. May be null, in which
 case <fn:lex_next> returns 0. */
void lex_reset(const char *const buffer) {
	scan.marker = scan.ctx_marker = scan.from = scan.cursor = scan.label
		= scan.buffer = buffer;
	scan.condition = yycline;
	scan.line = 1;
	/* Clear the edict state as well: if the previous buffer was abandoned
	 part-way through an edict (for example after a syntax error), leftover
	 flags or expected arguments would make the first edict of the new buffer
	 fail. */
	scan.is_ws_expected = scan.is_source = 0;
	scan.edict.size = 0;
}
/** Takes the next expected argument kind off the edict stack and switches the
 scanner into the matching `edict_*` condition; when nothing is left to expect,
 switches to `edict_end` instead. This is dispatched by hand because it was
 not clear that `re2c` supports branching on a variable condition (it may
 now). */
static void expect_pop(void) {
	if(scan.edict.size) {
		const unsigned top = --scan.edict.size;
		switch(scan.edict.expect[top]) {
#define X(n, N) case EXPECT_ ## N : scan.condition = yycedict_ ## n; break;
#define Y(n, N) X(n, N)
		EXPECT
#undef Y
#undef X
		}
	} else {
		scan.condition = yycedict_end;
	}
}
/** Scans the next token from the buffer given to <fn:lex_reset> into `x`.
 @param[x] Receives the result; must not be null. `symbol` is the token kind;
 `s0` and `s1` delimit the matched text for tokens that carry text and are
 null otherwise; `line` is updated whenever a newline is consumed.
 @return 1 if a token was produced and scanning may continue. 0 if no buffer
 is set, or scanning stops with `symbol` set to `END`, `ILLEGAL`, or `SYNTAX`
 (a malformed image or edict). Unrecognised input at the start of a line is
 reported as `SYNTAX` but returns 1. */
int lex_next(struct lex *const x) {
	/*!re2c
	re2c:flags:tags = 1;
	re2c:define:YYCURSOR = scan.cursor;
	re2c:define:YYMARKER = scan.marker;
	re2c:define:YYCTXMARKER = scan.ctx_marker;
	re2c:define:YYCONDTYPE = 'condition';
	re2c:define:YYGETCONDITION = 'scan.condition';
	re2c:define:YYGETCONDITION:naked = 1;
	re2c:define:YYSETCONDITION = 'scan.condition = @@;';
	re2c:define:YYSETCONDITION:naked = 1;
	*/
	const char *s0, *s1;
	/*!stags:re2c format = 'const char *@@;\n'; */
	assert(x);
	if(!scan.buffer) return 0;
	x->s0 = x->s1 = 0;
scan:
	/*!re2c
	sentinel = "\x00";
	illegal = [\x01-\x08\x0a-\x1f\x7f]; // unix-style control characters
	newline = "\n";
	ws = [ \t];
	glyph = [^\x00-\x1f \x7f];
	<*> illegal { return x->symbol = ILLEGAL, 0; }
	<line> sentinel { return x->symbol = END, 0; }
	// the end of input is only legal at the start of a line
	<text, image, edict, edict_keyword, edict_date, edict_freeform, edict_end>
		sentinel { return x->symbol = ILLEGAL, 0; }
	<line> newline { x->line = ++scan.line; return x->symbol = PARAGRAPH, 1; }
	<line> "![" :=> image
	<line> "[" :=> edict
	<line> "" / glyph :=> text
	<line> * { return x->symbol = SYNTAX, 1; }
	<text> newline => line { x->line = ++scan.line; goto scan; }
	<text> ws+ { goto scan; }
	<text> @s0 glyph+ @s1 { x->s0 = s0, x->s1 = s1;
		return x->symbol = TEXT, 1; }
	// the optional sign applies to every form, not just the fractional one
	decimal = "-"? (([1-9][0-9]* | [0])? "." [0-9]+ | [1-9][0-9]* | [0]);
	<image> ws* "osm" ws* "](geo:" @s0 decimal "," @s1 decimal ")" => text {
		x->symbol = MAP, x->s0 = s0, x->s1 = s1;
		printf("Got a map.\n");
		return 1;
	}
	<image> * { printf("image(broken)\n");return x->symbol = SYNTAX, 0; }
	//
	natural = [1-9][0-9]*;
	id = [a-zA-Z_][a-zA-Z_\-0-9]{0,63};
	// year-month-day; the day's tens digit goes up to 3
	date = "-"? natural "-" [0-1][0-9] "-" [0-3][0-9];
	<edict> "source"
		{ if(scan.is_ws_expected || scan.edict.size)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1, scan.is_source = 1;
		scan.edict.size = 2;
		scan.edict.expect[1] = EXPECT_KEYWORD;
		scan.edict.expect[0] = EXPECT_FREEFORM;
		return x->symbol = SOURCE, 1; }
	<edict> "default"
		{ if(scan.is_ws_expected || !scan.is_source)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1, scan.is_source = 0;
		return x->symbol = DEFAULT, 1; }
	// score
	<edict> "significant"
		{ if(scan.is_ws_expected || scan.edict.size)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1;
		scan.edict.size = 2;
		scan.edict.expect[1] = EXPECT_FREEFORM;
		scan.edict.expect[0] = EXPECT_DATE;
		return x->symbol = SIGNIFICANT, 1; }
	<edict> @s0 natural @s1
		{ if(scan.is_ws_expected || scan.edict.size)
			return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 1;
		x->s0 = s0, x->s1 = s1;
		return x->symbol = SCORE, 1; }
	<edict> ws+ { scan.is_ws_expected = 0; goto scan; }
	<edict> ":"
		{ if(!scan.edict.size) return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 0, scan.is_source = 0;
		expect_pop(); goto scan; }
	<! edict_keyword, edict_date, edict_freeform> { expect_pop(); }
	<edict_keyword> ws* @s0 id @s1 ws* ";"? / "]"?
		{ x->s0 = s0, x->s1 = s1;
		return x->symbol = ARG_KEYWORD, 1; }
	<edict_date> ws* @s0 date @s1 ws* ";"? / "]"?
		{ x->s0 = s0, x->s1 = s1;
		return x->symbol = ARG_DATE, 1; }
	<edict_freeform> ws* @s0
		[^ \t\n\r\v\f;[\]\x00][^\t\n\r\v\f;[\]\x00]*[^ \t\n\r\v\f;[\]\x00]*
		@s1 ws* ";"? / "]"?
		{ x->s0 = s0, x->s1 = s1;
		return x->symbol = ARG_FREEFORM, 1; }
	// closing an edict: arguments still owed is an error; otherwise clear the
	// per-edict flags so that they cannot leak into the next edict
	<edict, edict_end> "]" => text
		{ if(scan.edict.size) return x->symbol = SYNTAX, 0;
		scan.is_ws_expected = 0, scan.is_source = 0;
		goto scan; }
	<edict, edict_keyword, edict_date, edict_freeform, edict_end> *
		{ return x->symbol = SYNTAX, 0; }
	*/
}