Merge pull request #30 from McDutchie/interval-expr

backport ERE interval/repetition expressions from Apple awk-24
This commit is contained in:
Arnold Robbins 2019-03-05 21:28:45 +02:00 committed by GitHub
commit 4bc685f701
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 250 additions and 5 deletions

5
FIXES
View File

@ -25,6 +25,11 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the AWK book This file lists all bug fixes, changes, etc., made since the AWK book
was sent to the printers in August, 1987. was sent to the printers in August, 1987.
Mar 5, 2019:
Added support for POSIX-standard interval expressions (a.k.a.
bounds, a.k.a. repetition expressions) in regular expressions,
backported (via NetBSD) from Apple awk-24 (20070501).
Jan 25, 2019: Jan 25, 2019:
Make getline handle numeric strings properly in all cases. Make getline handle numeric strings properly in all cases.
(Thanks, Arnold.) (Thanks, Arnold.)

2
awk.1
View File

@ -558,6 +558,4 @@ to force it to be treated as a string concatenate
The scope rules for variables in functions are a botch; The scope rules for variables in functions are a botch;
the syntax is worse. the syntax is worse.
.br .br
POSIX-standard interval expressions in regular expressions are not supported.
.br
Only eight-bit character sets are handled correctly. Only eight-bit character sets are handled correctly.

246
b.c
View File

@ -66,6 +66,11 @@ int rlxval;
static uschar *rlxstr; static uschar *rlxstr;
static uschar *prestr; /* current position in current re */ static uschar *prestr; /* current position in current re */
static uschar *lastre; /* origin of last re */ static uschar *lastre; /* origin of last re */
static uschar *lastatom; /* origin of last Atom */
static uschar *starttok;
static uschar *basestr; /* starts with original, replaced during
repetition processing */
static uschar *firstbasestr;
static int setcnt; static int setcnt;
static int poscnt; static int poscnt;
@ -125,6 +130,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
Node *p, *p1; Node *p, *p1;
fa *f; fa *f;
firstbasestr = (uschar *) s;
basestr = firstbasestr;
p = reparse(s); p = reparse(s);
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
/* put ALL STAR in front of reg. exp. */ /* put ALL STAR in front of reg. exp. */
@ -146,6 +153,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
f->initstat = makeinit(f, anchor); f->initstat = makeinit(f, anchor);
f->anchor = anchor; f->anchor = anchor;
f->restr = (uschar *) tostring(s); f->restr = (uschar *) tostring(s);
if (firstbasestr != basestr) {
if (basestr)
xfree(basestr);
}
return f; return f;
} }
@ -629,9 +640,11 @@ Node *regexp(void) /* top-level parse of reg expr */
Node *primary(void) Node *primary(void)
{ {
Node *np; Node *np;
int savelastatom;
switch (rtok) { switch (rtok) {
case CHAR: case CHAR:
lastatom = starttok;
np = op2(CHAR, NIL, itonp(rlxval)); np = op2(CHAR, NIL, itonp(rlxval));
rtok = relex(); rtok = relex();
return (unary(np)); return (unary(np));
@ -640,16 +653,19 @@ Node *primary(void)
return (unary(op2(ALL, NIL, NIL))); return (unary(op2(ALL, NIL, NIL)));
case EMPTYRE: case EMPTYRE:
rtok = relex(); rtok = relex();
return (unary(op2(ALL, NIL, NIL))); return (unary(op2(EMPTYRE, NIL, NIL)));
case DOT: case DOT:
lastatom = starttok;
rtok = relex(); rtok = relex();
return (unary(op2(DOT, NIL, NIL))); return (unary(op2(DOT, NIL, NIL)));
case CCL: case CCL:
np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
lastatom = starttok;
rtok = relex(); rtok = relex();
return (unary(np)); return (unary(np));
case NCCL: case NCCL:
np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
lastatom = starttok;
rtok = relex(); rtok = relex();
return (unary(np)); return (unary(np));
case '^': case '^':
@ -659,6 +675,8 @@ Node *primary(void)
rtok = relex(); rtok = relex();
return (unary(op2(CHAR, NIL, NIL))); return (unary(op2(CHAR, NIL, NIL)));
case '(': case '(':
lastatom = starttok;
savelastatom = starttok - basestr; /* Retain over recursion */
rtok = relex(); rtok = relex();
if (rtok == ')') { /* special pleading for () */ if (rtok == ')') { /* special pleading for () */
rtok = relex(); rtok = relex();
@ -666,6 +684,7 @@ Node *primary(void)
} }
np = regexp(); np = regexp();
if (rtok == ')') { if (rtok == ')') {
lastatom = basestr + savelastatom; /* Restore */
rtok = relex(); rtok = relex();
return (unary(np)); return (unary(np));
} }
@ -680,8 +699,12 @@ Node *primary(void)
Node *concat(Node *np) Node *concat(Node *np)
{ {
switch (rtok) { switch (rtok) {
case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
return (concat(op2(CAT, np, primary()))); return (concat(op2(CAT, np, primary())));
case EMPTYRE:
rtok = relex();
return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
primary())));
} }
return (np); return (np);
} }
@ -766,6 +789,117 @@ struct charclass {
{ NULL, 0, NULL }, { NULL, 0, NULL },
}; };
/* Expansion modes understood by replace_repeat(). */
#define REPEAT_SIMPLE		0	/* {n}, n >= 1: append n-1 more copies of the atom */
#define REPEAT_PLUS_APPENDED	1	/* {n,}: append n-1 more copies, then '+' */
#define REPEAT_WITH_Q		2	/* {n,m}, n < m: append copies, optional ones followed by '?' */
#define REPEAT_ZERO		3	/* {0}: replace the atom itself by the null ERE "()" */

/*
 * replace_repeat - rewrite the regular expression held in basestr so that
 * the repetition token at reptok is replaced by an equivalent ERE without
 * interval expressions.
 *
 *	reptok		start of the "{...}" token inside basestr
 *	reptoklen	length of that token in bytes
 *	atom		start of the atom the token applies to
 *	atomlen		length of that atom in bytes
 *	firstnum	lower bound n
 *	secondnum	upper bound m (only used for REPEAT_WITH_Q)
 *	special_case	one of the REPEAT_* modes above
 *
 * The text before reptok (which already contains one copy of the atom) is
 * kept, the expansion is appended, and the text after the token follows.
 * For REPEAT_ZERO the in-place copy of the atom is overwritten by "()".
 *
 * Side effects: a new buffer is malloc'ed and becomes basestr; the previous
 * basestr is freed unless it is firstbasestr (the caller-owned original);
 * prestr is set to the point in the new buffer where scanning must resume.
 *
 * Returns 1, or 2 for REPEAT_ZERO.
 */
static int
replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
	       int atomlen, int firstnum, int secondnum, int special_case)
{
	int i, j;
	uschar *buf = 0;
	int ret = 1;
	int init_q = (firstnum == 0);		/* first added char will be ? */
	int n_q_reps = secondnum - firstnum;	/* m>n, so reduce until {1,m-n} left */
	int prefix_length = reptok - basestr;	/* prefix includes first rep */
	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* string after rep specifier */
	int size = prefix_length + suffix_length;

	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
		size += atomlen * (firstnum - 1);
	}

	/* Adjust size of buffer for special cases */
	if (special_case == REPEAT_PLUS_APPENDED) {
		size++;		/* for the final + */
	} else if (special_case == REPEAT_WITH_Q) {
		/*
		 * When n == 0 the copy of the atom already in the prefix is
		 * itself made optional by a bare '?', so it counts as one of
		 * the m-n optional repetitions and one fewer "atom?" is added.
		 */
		size += init_q + (atomlen + 1) * (n_q_reps - init_q);
	} else if (special_case == REPEAT_ZERO) {
		size += 2;	/* just a null ERE: () */
	}

	if ((buf = (uschar *) malloc(size + 1)) == NULL)
		FATAL("out of space in reg expr %.10s..", lastre);

	memcpy(buf, basestr, prefix_length);	/* copy prefix */
	j = prefix_length;
	if (special_case == REPEAT_ZERO) {
		j -= atomlen;			/* drop the in-place atom */
		buf[j++] = '(';
		buf[j++] = ')';
	}
	for (i = 1; i < firstnum; i++) {	/* copy x reps */
		memcpy(&buf[j], atom, atomlen);
		j += atomlen;
	}
	if (special_case == REPEAT_PLUS_APPENDED) {
		buf[j++] = '+';
	} else if (special_case == REPEAT_WITH_Q) {
		if (init_q)
			buf[j++] = '?';
		/*
		 * Start at init_q, not 0: x{0,2} must become x?x? (at most
		 * two x's), not x?x?x?.
		 */
		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
			memcpy(&buf[j], atom, atomlen);
			j += atomlen;
			buf[j++] = '?';
		}
	}
	memcpy(&buf[j], reptok + reptoklen, suffix_length);
	j += suffix_length;
	buf[j] = '\0';	/* terminate right after what was actually written */

	/* free old basestr, but never the caller's original string */
	if (firstbasestr != basestr) {
		if (basestr)
			xfree(basestr);
	}
	basestr = buf;
	prestr = buf + prefix_length;
	if (special_case == REPEAT_ZERO) {
		prestr -= atomlen;	/* resume at the "()" that replaced the atom */
		ret++;
	}
	return ret;
}
/*
 * repeat - choose how a repetition specifier ("bound") is to be expanded
 * and hand the rewrite over to replace_repeat().
 *
 * The bound is replaced by an equivalent ERE string: the immediately
 * preceding atom is repeated and ? or + appended as needed.  The first
 * copy of the atom stays where it is, except for a zero-repeat ({0}).
 *
 * secondnum < 0 stands for an open upper bound ({n,}).
 *
 * Returns whatever replace_repeat() returns; 0 is returned only after an
 * "internal error" has been reported through FATAL.
 */
static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
		  int atomlen, int firstnum, int secondnum)
{
	int mode;

	if (secondnum < 0) {
		/* {n,} -> repeat n-1 times followed by PLUS */
		if (firstnum < 2) {
			/* 0 or 1: should be handled before you get here */
			FATAL("internal error");
			return 0;
		}
		mode = REPEAT_PLUS_APPENDED;
	} else if (firstnum > secondnum) {
		/* Error - shouldn't be here (n>m) */
		FATAL("internal error");
		return 0;
	} else if (firstnum < secondnum) {
		/* {n,m} -> repeat n-1 times then alternate:
		   x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
		mode = REPEAT_WITH_Q;
	} else if (firstnum == 0) {
		/* {0} or {0,0}: unusual, because the replacement string
		   might actually be SMALLER than the original ERE */
		mode = REPEAT_ZERO;
	} else {
		/* {n} or {n,n} with n >= 1 -> simply repeat n-1 times */
		mode = REPEAT_SIMPLE;
	}

	return replace_repeat(reptok, reptoklen, atom, atomlen,
			      firstnum, secondnum, mode);
}
int relex(void) /* lexical analyzer for reparse */ int relex(void) /* lexical analyzer for reparse */
{ {
@ -776,6 +910,11 @@ int relex(void) /* lexical analyzer for reparse */
uschar *bp; uschar *bp;
struct charclass *cc; struct charclass *cc;
int i; int i;
int num, m, commafound, digitfound;
const uschar *startreptok;
rescan:
starttok = prestr;
switch (c = *prestr++) { switch (c = *prestr++) {
case '|': return OR; case '|': return OR;
@ -842,6 +981,40 @@ int relex(void) /* lexical analyzer for reparse */
} }
} else } else
*bp++ = c; *bp++ = c;
} else if (c == '[' && *prestr == '.') {
char collate_char;
prestr++;
collate_char = *prestr++;
if (*prestr == '.' && prestr[1] == ']') {
prestr += 2;
/* Found it: map via locale TBD: for
now, simply return this char. This
is sufficient to pass conformance
test awk.ex 156
*/
if (*prestr == ']') {
prestr++;
rlxval = collate_char;
return CHAR;
}
}
} else if (c == '[' && *prestr == '=') {
char equiv_char;
prestr++;
equiv_char = *prestr++;
if (*prestr == '=' && prestr[1] == ']') {
prestr += 2;
/* Found it: map via locale TBD: for now
simply return this char. This is
sufficient to pass conformance test
awk.ex 156
*/
if (*prestr == ']') {
prestr++;
rlxval = equiv_char;
return CHAR;
}
}
} else if (c == '\0') { } else if (c == '\0') {
FATAL("nonterminated character class %.20s", lastre); FATAL("nonterminated character class %.20s", lastre);
} else if (bp == buf) { /* 1st char is special */ } else if (bp == buf) { /* 1st char is special */
@ -856,6 +1029,75 @@ int relex(void) /* lexical analyzer for reparse */
} else } else
*bp++ = c; *bp++ = c;
} }
break;
case '{':
if (isdigit(*(prestr))) {
num = 0; /* Process as a repetition */
n = -1; m = -1;
commafound = 0;
digitfound = 0;
startreptok = prestr-1;
/* Remember start of previous atom here ? */
} else { /* just a { char, not a repetition */
rlxval = c;
return CHAR;
}
for (; ; ) {
if ((c = *prestr++) == '}') {
if (commafound) {
if (digitfound) { /* {n,m} */
m = num;
if (m<n)
FATAL("illegal repetition expression: class %.20s",
lastre);
if ((n==0) && (m==1)) {
return QUEST;
}
} else { /* {n,} */
if (n==0) return STAR;
if (n==1) return PLUS;
}
} else {
if (digitfound) { /* {n} same as {n,n} */
n = num;
m = num;
} else { /* {} */
FATAL("illegal repetition expression: class %.20s",
lastre);
}
}
if (repeat(starttok, prestr-starttok, lastatom,
startreptok - lastatom, n, m) > 0) {
if ((n==0) && (m==0)) {
return EMPTYRE;
}
/* must rescan input for next token */
goto rescan;
}
/* Failed to replace: eat up {...} characters
and treat like just PLUS */
return PLUS;
} else if (c == '\0') {
FATAL("nonterminated character class %.20s",
lastre);
} else if (isdigit(c)) {
num = 10 * num + c - '0';
digitfound = 1;
} else if (c == ',') {
if (commafound)
FATAL("illegal repetition expression: class %.20s",
lastre);
/* looking for {n,} or {n,m} */
commafound = 1;
n = num;
digitfound = 0; /* reset */
num = 0;
} else {
FATAL("illegal repetition expression: class %.20s",
lastre);
}
}
break;
} }
} }

2
main.c
View File

@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE. THIS SOFTWARE.
****************************************************************/ ****************************************************************/
const char *version = "version 20190125"; const char *version = "version 20190305";
#define DEBUG #define DEBUG
#include <stdio.h> #include <stdio.h>