backport ERE interval/repetition expressions from Apple awk-24

The lack of POSIX interval expressions[*] (a.k.a. bounds, a.k.a.
repetition expressions) in regular expressions is listed under BUGS
in 'awk.1'. Apple's version of onetrueawk has supported these since
at least 2009, judging by the date stamp on their src/b.c in:
https://opensource.apple.com/tarballs/awk/awk-24.tar.gz

A bug report prompted NetBSD to swiftly integrate this code into
their awk. This commit is based on that NetBSD diff.
http://gnats.netbsd.org/53885
f3e4c4ca1d

b.c:
- Backport POSIX-standard interval expression support in regular
  expressions, via NetBSD, from Apple awk-24 (20070501).

main.c:
- Bump version ID.

FIXES:
- Add note and credit for this feature.

awk.1: section BUGS:
- Remove line saying interval expressions are not supported.

_________
[*] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_06
This commit is contained in:
Martijn Dekker 2019-01-23 09:12:27 +00:00
parent c3c7c1370e
commit 8a2222286c
4 changed files with 248 additions and 5 deletions

5
FIXES
View File

@ -25,6 +25,11 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the AWK book This file lists all bug fixes, changes, etc., made since the AWK book
was sent to the printers in August, 1987. was sent to the printers in August, 1987.
Jan 23, 2019:
Added support for POSIX-standard interval expressions (a.k.a.
bounds, a.k.a. repetition expressions) in regular expressions,
backported (via NetBSD) from Apple awk-24 (20070501).
Oct 25, 2018: Oct 25, 2018:
Added test in maketab.c to prevent generating a proctab entry Added test in maketab.c to prevent generating a proctab entry
for YYSTYPE_IS_DEFINED. It was harmless but some gcc settings for YYSTYPE_IS_DEFINED. It was harmless but some gcc settings

2
awk.1
View File

@ -558,6 +558,4 @@ to force it to be treated as a string concatenate
The scope rules for variables in functions are a botch; The scope rules for variables in functions are a botch;
the syntax is worse. the syntax is worse.
.br .br
POSIX-standard interval expressions in regular expressions are not supported.
.br
Only eight-bit characters sets are handled correctly. Only eight-bit characters sets are handled correctly.

244
b.c
View File

@ -65,6 +65,11 @@ int rlxval;
static uschar *rlxstr; static uschar *rlxstr;
static uschar *prestr; /* current position in current re */ static uschar *prestr; /* current position in current re */
static uschar *lastre; /* origin of last re */ static uschar *lastre; /* origin of last re */
static uschar *lastatom; /* origin of last Atom */
static uschar *starttok;
static uschar *basestr; /* starts with original, replaced during
repetition processing */
static uschar *firstbasestr;
static int setcnt; static int setcnt;
static int poscnt; static int poscnt;
@ -124,6 +129,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
Node *p, *p1; Node *p, *p1;
fa *f; fa *f;
firstbasestr = (uschar *) s;
basestr = firstbasestr;
p = reparse(s); p = reparse(s);
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
/* put ALL STAR in front of reg. exp. */ /* put ALL STAR in front of reg. exp. */
@ -145,6 +152,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
f->initstat = makeinit(f, anchor); f->initstat = makeinit(f, anchor);
f->anchor = anchor; f->anchor = anchor;
f->restr = (uschar *) tostring(s); f->restr = (uschar *) tostring(s);
if (firstbasestr != basestr) {
if (basestr)
xfree(basestr);
}
return f; return f;
} }
@ -628,9 +639,11 @@ Node *regexp(void) /* top-level parse of reg expr */
Node *primary(void) Node *primary(void)
{ {
Node *np; Node *np;
int savelastatom;
switch (rtok) { switch (rtok) {
case CHAR: case CHAR:
lastatom = starttok;
np = op2(CHAR, NIL, itonp(rlxval)); np = op2(CHAR, NIL, itonp(rlxval));
rtok = relex(); rtok = relex();
return (unary(np)); return (unary(np));
@ -639,16 +652,19 @@ Node *primary(void)
return (unary(op2(ALL, NIL, NIL))); return (unary(op2(ALL, NIL, NIL)));
case EMPTYRE: case EMPTYRE:
rtok = relex(); rtok = relex();
return (unary(op2(ALL, NIL, NIL))); return (unary(op2(EMPTYRE, NIL, NIL)));
case DOT: case DOT:
lastatom = starttok;
rtok = relex(); rtok = relex();
return (unary(op2(DOT, NIL, NIL))); return (unary(op2(DOT, NIL, NIL)));
case CCL: case CCL:
np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
lastatom = starttok;
rtok = relex(); rtok = relex();
return (unary(np)); return (unary(np));
case NCCL: case NCCL:
np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
lastatom = starttok;
rtok = relex(); rtok = relex();
return (unary(np)); return (unary(np));
case '^': case '^':
@ -658,6 +674,8 @@ Node *primary(void)
rtok = relex(); rtok = relex();
return (unary(op2(CHAR, NIL, NIL))); return (unary(op2(CHAR, NIL, NIL)));
case '(': case '(':
lastatom = starttok;
savelastatom = starttok - basestr; /* Retain over recursion */
rtok = relex(); rtok = relex();
if (rtok == ')') { /* special pleading for () */ if (rtok == ')') { /* special pleading for () */
rtok = relex(); rtok = relex();
@ -665,6 +683,7 @@ Node *primary(void)
} }
np = regexp(); np = regexp();
if (rtok == ')') { if (rtok == ')') {
lastatom = basestr + savelastatom; /* Restore */
rtok = relex(); rtok = relex();
return (unary(np)); return (unary(np));
} }
@ -679,8 +698,12 @@ Node *primary(void)
Node *concat(Node *np) Node *concat(Node *np)
{ {
switch (rtok) { switch (rtok) {
case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
return (concat(op2(CAT, np, primary()))); return (concat(op2(CAT, np, primary())));
case EMPTYRE:
rtok = relex();
return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
primary())));
} }
return (np); return (np);
} }
@ -765,6 +788,115 @@ struct charclass {
{ NULL, 0, NULL }, { NULL, 0, NULL },
}; };
#define REPEAT_SIMPLE 0
#define REPEAT_PLUS_APPENDED 1
#define REPEAT_WITH_Q 2
#define REPEAT_ZERO 3
/*
 * replace_repeat: expand one interval expression ("bound") into plain ERE
 * syntax by building a new copy of the regular expression text.
 *
 *   reptok       start of the "{...}" token inside basestr
 *   reptoklen    length of the "{...}" token, braces included
 *   atom         start of the atom the bound applies to; it must end
 *                exactly where reptok begins
 *   atomlen      length of that atom in bytes
 *   firstnum     lower bound n
 *   secondnum    upper bound m (negative for the open-ended form {n,})
 *   special_case one of the REPEAT_* expansion modes
 *
 * The first copy of the atom is left in place (it is part of the prefix),
 * except for REPEAT_ZERO where it is replaced by "()".  On return basestr
 * points at the newly allocated text, the previous basestr is released
 * unless it is the caller's original string (firstbasestr), and prestr is
 * positioned so that scanning resumes right after the retained atom (or at
 * the inserted "()" for REPEAT_ZERO).
 *
 * Returns 1 after a normal expansion and 2 after a REPEAT_ZERO expansion.
 */
static int
replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
	       int atomlen, int firstnum, int secondnum, int special_case)
{
	int i, j;
	uschar *buf = 0;
	int ret = 1;
	int init_q = (firstnum == 0);		/* atom in place becomes optional */
	int n_q_reps = secondnum - firstnum;	/* number of optional repetitions */
	int prefix_length = reptok - basestr;	/* prefix includes first rep */
	int suffix_length = strlen((const char *) reptok) - reptoklen;	/* text after the bound */
	int size = prefix_length + suffix_length;

	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
		size += atomlen * (firstnum - 1);
	}

	/* Adjust size of buffer for special cases */
	if (special_case == REPEAT_PLUS_APPENDED) {
		size++;		/* for the final + */
	} else if (special_case == REPEAT_WITH_Q) {
		/*
		 * With a zero lower bound the atom already in place takes the
		 * first '?', so it accounts for one of the optional
		 * repetitions and only n_q_reps-init_q further copies are
		 * appended.
		 */
		size += init_q + (atomlen + 1) * (n_q_reps - init_q);
	} else if (special_case == REPEAT_ZERO) {
		size += 2;	/* just a null ERE: () */
	}

	if ((buf = (uschar *) malloc(size + 1)) == NULL)
		FATAL("out of space in reg expr %.10s..", lastre);

	memcpy(buf, basestr, prefix_length);	/* copy prefix */
	j = prefix_length;
	if (special_case == REPEAT_ZERO) {
		j -= atomlen;			/* overwrite the atom itself */
		buf[j++] = '(';
		buf[j++] = ')';
	}
	for (i = 1; i < firstnum; i++) {	/* copy mandatory reps */
		memcpy(&buf[j], atom, atomlen);
		j += atomlen;
	}
	if (special_case == REPEAT_PLUS_APPENDED) {
		buf[j++] = '+';
	} else if (special_case == REPEAT_WITH_Q) {
		if (init_q)
			buf[j++] = '?';
		/*
		 * Start at init_q: for x{0,m} the leading "x?" is already one
		 * optional repetition, so appending m more would accept m+1.
		 */
		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
			memcpy(&buf[j], atom, atomlen);
			j += atomlen;
			buf[j++] = '?';
		}
	}
	memcpy(&buf[j], reptok + reptoklen, suffix_length);
	j += suffix_length;
	buf[j] = '\0';	/* j may be < size for REPEAT_ZERO (result can shrink) */

	/* free old basestr, but never the caller's original string */
	if (firstbasestr != basestr) {
		if (basestr)
			xfree(basestr);
	}
	basestr = buf;
	prestr = buf + prefix_length;
	if (special_case == REPEAT_ZERO) {
		prestr -= atomlen;	/* rescan from the inserted "()" */
		ret++;
	}
	return ret;
}
/*
 * repeat: choose how a repetition specifier ("bound") is to be expanded
 * and hand the rewrite over to replace_repeat().
 *
 * The bound is replaced by an equivalent ERE string that repeats the
 * immediately preceding atom and appends '?' or '+' as required.  The
 * first copy of the atom stays where it is, except for a zero-repeat
 * ({0} or {0,0}).
 *
 * Returns whatever replace_repeat() returns, or 0 when no rewrite is
 * performed: {0,} and {1,} are expected to be handled by the caller,
 * and a lower bound above the upper bound is an error.
 */
static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
		  int atomlen, int firstnum, int secondnum)
{
	int mode;

	if (secondnum < 0) {
		/* {n,}: n-1 extra copies followed by PLUS */
		if (firstnum < 2)
			return 0;	/* 0 or 1: dealt with before getting here */
		mode = REPEAT_PLUS_APPENDED;
	} else if (firstnum > secondnum) {
		return 0;		/* n > m: should not happen */
	} else if (firstnum == secondnum) {
		/*
		 * {n} or {n,n}: plain repetition.  The zero case is special
		 * because the replacement may be SHORTER than the original.
		 */
		mode = (firstnum == 0) ? REPEAT_ZERO : REPEAT_SIMPLE;
	} else {
		/* {n,m}: x{n,m} => xx...x followed by x?x?...x? */
		mode = REPEAT_WITH_Q;
	}

	return replace_repeat(reptok, reptoklen, atom, atomlen,
			      firstnum, secondnum, mode);
}
int relex(void) /* lexical analyzer for reparse */ int relex(void) /* lexical analyzer for reparse */
{ {
@ -775,6 +907,11 @@ int relex(void) /* lexical analyzer for reparse */
uschar *bp; uschar *bp;
struct charclass *cc; struct charclass *cc;
int i; int i;
int num, m, commafound, digitfound;
const uschar *startreptok;
rescan:
starttok = prestr;
switch (c = *prestr++) { switch (c = *prestr++) {
case '|': return OR; case '|': return OR;
@ -841,6 +978,40 @@ int relex(void) /* lexical analyzer for reparse */
} }
} else } else
*bp++ = c; *bp++ = c;
} else if (c == '[' && *prestr == '.') {
char collate_char;
prestr++;
collate_char = *prestr++;
if (*prestr == '.' && prestr[1] == ']') {
prestr += 2;
/* Found it: map via locale TBD: for
now, simply return this char. This
is sufficient to pass conformance
test awk.ex 156
*/
if (*prestr == ']') {
prestr++;
rlxval = collate_char;
return CHAR;
}
}
} else if (c == '[' && *prestr == '=') {
char equiv_char;
prestr++;
equiv_char = *prestr++;
if (*prestr == '=' && prestr[1] == ']') {
prestr += 2;
/* Found it: map via locale TBD: for now
simply return this char. This is
sufficient to pass conformance test
awk.ex 156
*/
if (*prestr == ']') {
prestr++;
rlxval = equiv_char;
return CHAR;
}
}
} else if (c == '\0') { } else if (c == '\0') {
FATAL("nonterminated character class %.20s", lastre); FATAL("nonterminated character class %.20s", lastre);
} else if (bp == buf) { /* 1st char is special */ } else if (bp == buf) { /* 1st char is special */
@ -855,6 +1026,75 @@ int relex(void) /* lexical analyzer for reparse */
} else } else
*bp++ = c; *bp++ = c;
} }
break;
case '{':
if (isdigit(*(prestr))) {
num = 0; /* Process as a repetition */
n = -1; m = -1;
commafound = 0;
digitfound = 0;
startreptok = prestr-1;
/* Remember start of previous atom here ? */
} else { /* just a { char, not a repetition */
rlxval = c;
return CHAR;
}
for (; ; ) {
if ((c = *prestr++) == '}') {
if (commafound) {
if (digitfound) { /* {n,m} */
m = num;
if (m<n)
FATAL("illegal repetition expression: class %.20s",
lastre);
if ((n==0) && (m==1)) {
return QUEST;
}
} else { /* {n,} */
if (n==0) return STAR;
if (n==1) return PLUS;
}
} else {
if (digitfound) { /* {n} same as {n,n} */
n = num;
m = num;
} else { /* {} */
FATAL("illegal repetition expression: class %.20s",
lastre);
}
}
if (repeat(starttok, prestr-starttok, lastatom,
startreptok - lastatom, n, m) > 0) {
if ((n==0) && (m==0)) {
return EMPTYRE;
}
/* must rescan input for next token */
goto rescan;
}
/* Failed to replace: eat up {...} characters
and treat like just PLUS */
return PLUS;
} else if (c == '\0') {
FATAL("nonterminated character class %.20s",
lastre);
} else if (isdigit(c)) {
num = 10 * num + c - '0';
digitfound = 1;
} else if (c == ',') {
if (commafound)
FATAL("illegal repetition expression: class %.20s",
lastre);
/* looking for {n,} or {n,m} */
commafound = 1;
n = num;
digitfound = 0; /* reset */
num = 0;
} else {
FATAL("illegal repetition expression: class %.20s",
lastre);
}
}
break;
} }
} }

2
main.c
View File

@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE. THIS SOFTWARE.
****************************************************************/ ****************************************************************/
const char *version = "version 20180827"; const char *version = "version 20190123";
#define DEBUG #define DEBUG
#include <stdio.h> #include <stdio.h>