backport ERE interval/repetition expressions from Apple awk-24

The lack of POSIX interval expressions[*] (a.k.a. bounds, a.k.a.
repetition expressions) in regular expressions is listed under BUGS
in 'awk.1'. Apple's version of onetrueawk has supported these since
at least 2009, judging by the date stamp on their src/b.c in:
https://opensource.apple.com/tarballs/awk/awk-24.tar.gz

A bug report prompted NetBSD to swiftly integrate this code into
their awk. This commit is based on that NetBSD diff.
http://gnats.netbsd.org/53885
f3e4c4ca1d

b.c:
- Backport POSIX-standard interval expressions support in regular
  expressions via NetBSD from Apple awk-24 (20070501).

main.c:
- Bump version ID.

FIXES:
- Add note and credit for this feature.

awk.1: section BUGS:
- Remove line saying interval expressions are not supported.

_________
[*] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_06
This commit is contained in:
Martijn Dekker 2019-01-23 09:12:27 +00:00
parent c3c7c1370e
commit 8a2222286c
4 changed files with 248 additions and 5 deletions

5
FIXES
View File

@ -25,6 +25,11 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the AWK book
was sent to the printers in August, 1987.
Jan 23, 2019:
Added support for POSIX-standard interval expressions (a.k.a.
bounds, a.k.a. repetition expressions) in regular expressions,
backported (via NetBSD) from Apple awk-24 (20070501).
Oct 25, 2018:
Added test in maketab.c to prevent generating a proctab entry
for YYSTYPE_IS_DEFINED. It was harmless but some gcc settings

2
awk.1
View File

@ -558,6 +558,4 @@ to force it to be treated as a string concatenate
The scope rules for variables in functions are a botch;
the syntax is worse.
.br
POSIX-standard interval expressions in regular expressions are not supported.
.br
Only eight-bit characters sets are handled correctly.

244
b.c
View File

@ -65,6 +65,11 @@ int rlxval;
static uschar *rlxstr;
static uschar *prestr; /* current position in current re */
static uschar *lastre; /* origin of last re */
static uschar *lastatom; /* origin of last Atom */
static uschar *starttok;
static uschar *basestr; /* starts with original, replaced during
repetition processing */
static uschar *firstbasestr;
static int setcnt;
static int poscnt;
@ -124,6 +129,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
Node *p, *p1;
fa *f;
firstbasestr = (uschar *) s;
basestr = firstbasestr;
p = reparse(s);
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
/* put ALL STAR in front of reg. exp. */
@ -145,6 +152,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
f->initstat = makeinit(f, anchor);
f->anchor = anchor;
f->restr = (uschar *) tostring(s);
if (firstbasestr != basestr) {
if (basestr)
xfree(basestr);
}
return f;
}
@ -628,9 +639,11 @@ Node *regexp(void) /* top-level parse of reg expr */
Node *primary(void)
{
Node *np;
int savelastatom;
switch (rtok) {
case CHAR:
lastatom = starttok;
np = op2(CHAR, NIL, itonp(rlxval));
rtok = relex();
return (unary(np));
@ -639,16 +652,19 @@ Node *primary(void)
return (unary(op2(ALL, NIL, NIL)));
case EMPTYRE:
rtok = relex();
return (unary(op2(ALL, NIL, NIL)));
return (unary(op2(EMPTYRE, NIL, NIL)));
case DOT:
lastatom = starttok;
rtok = relex();
return (unary(op2(DOT, NIL, NIL)));
case CCL:
np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
lastatom = starttok;
rtok = relex();
return (unary(np));
case NCCL:
np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
lastatom = starttok;
rtok = relex();
return (unary(np));
case '^':
@ -658,6 +674,8 @@ Node *primary(void)
rtok = relex();
return (unary(op2(CHAR, NIL, NIL)));
case '(':
lastatom = starttok;
savelastatom = starttok - basestr; /* Retain over recursion */
rtok = relex();
if (rtok == ')') { /* special pleading for () */
rtok = relex();
@ -665,6 +683,7 @@ Node *primary(void)
}
np = regexp();
if (rtok == ')') {
lastatom = basestr + savelastatom; /* Restore */
rtok = relex();
return (unary(np));
}
@ -679,8 +698,12 @@ Node *primary(void)
Node *concat(Node *np)
{
switch (rtok) {
case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
return (concat(op2(CAT, np, primary())));
case EMPTYRE:
rtok = relex();
return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
primary())));
}
return (np);
}
@ -765,6 +788,115 @@ struct charclass {
{ NULL, 0, NULL },
};
#define REPEAT_SIMPLE 0
#define REPEAT_PLUS_APPENDED 1
#define REPEAT_WITH_Q 2
#define REPEAT_ZERO 3
static int
replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
int atomlen, int firstnum, int secondnum, int special_case)
{
int i, j;
uschar *buf = 0;
int ret = 1;
int init_q = (firstnum==0); /* first added char will be ? */
int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
int prefix_length = reptok - basestr; /* prefix includes first rep */
int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */
int size = prefix_length + suffix_length;
if (firstnum > 1) { /* add room for reps 2 through firstnum */
size += atomlen*(firstnum-1);
}
/* Adjust size of buffer for special cases */
if (special_case == REPEAT_PLUS_APPENDED) {
size++; /* for the final + */
} else if (special_case == REPEAT_WITH_Q) {
size += init_q + (atomlen+1)* n_q_reps;
} else if (special_case == REPEAT_ZERO) {
size += 2; /* just a null ERE: () */
}
if ((buf = (uschar *) malloc(size+1)) == NULL)
FATAL("out of space in reg expr %.10s..", lastre);
memcpy(buf, basestr, prefix_length); /* copy prefix */
j = prefix_length;
if (special_case == REPEAT_ZERO) {
j -= atomlen;
buf[j++] = '(';
buf[j++] = ')';
}
for (i=1; i < firstnum; i++) { /* copy x reps */
memcpy(&buf[j], atom, atomlen);
j += atomlen;
}
if (special_case == REPEAT_PLUS_APPENDED) {
buf[j++] = '+';
} else if (special_case == REPEAT_WITH_Q) {
if (init_q) buf[j++] = '?';
for (i=0; i < n_q_reps; i++) { /* copy x? reps */
memcpy(&buf[j], atom, atomlen);
j += atomlen;
buf[j++] = '?';
}
}
memcpy(&buf[j], reptok+reptoklen, suffix_length);
if (special_case == REPEAT_ZERO) {
buf[j+suffix_length] = '\0';
} else {
buf[size] = '\0';
}
/* free old basestr */
if (firstbasestr != basestr) {
if (basestr)
xfree(basestr);
}
basestr = buf;
prestr = buf + prefix_length;
if (special_case == REPEAT_ZERO) {
prestr -= atomlen;
ret++;
}
return ret;
}
static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
int atomlen, int firstnum, int secondnum)
{
/*
In general, the repetition specifier or "bound" is replaced here
by an equivalent ERE string, repeating the immediately previous atom
and appending ? and + as needed. Note that the first copy of the
atom is left in place, except in the special_case of a zero-repeat
(i.e., {0}).
*/
if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
if (firstnum < 2) {
/* 0 or 1: should be handled before you get here */
} else {
return replace_repeat(reptok, reptoklen, atom, atomlen,
firstnum, secondnum, REPEAT_PLUS_APPENDED);
}
} else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
if (firstnum == 0) { /* {0} or {0,0} */
/* This case is unusual because the resulting
replacement string might actually be SMALLER than
the original ERE */
return replace_repeat(reptok, reptoklen, atom, atomlen,
firstnum, secondnum, REPEAT_ZERO);
} else { /* (firstnum >= 1) */
return replace_repeat(reptok, reptoklen, atom, atomlen,
firstnum, secondnum, REPEAT_SIMPLE);
}
} else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
/* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
return replace_repeat(reptok, reptoklen, atom, atomlen,
firstnum, secondnum, REPEAT_WITH_Q);
} else { /* Error - shouldn't be here (n>m) */
}
return 0;
}
int relex(void) /* lexical analyzer for reparse */
{
@ -775,6 +907,11 @@ int relex(void) /* lexical analyzer for reparse */
uschar *bp;
struct charclass *cc;
int i;
int num, m, commafound, digitfound;
const uschar *startreptok;
rescan:
starttok = prestr;
switch (c = *prestr++) {
case '|': return OR;
@ -841,6 +978,40 @@ int relex(void) /* lexical analyzer for reparse */
}
} else
*bp++ = c;
} else if (c == '[' && *prestr == '.') {
char collate_char;
prestr++;
collate_char = *prestr++;
if (*prestr == '.' && prestr[1] == ']') {
prestr += 2;
/* Found it: map via locale TBD: for
now, simply return this char. This
is sufficient to pass conformance
test awk.ex 156
*/
if (*prestr == ']') {
prestr++;
rlxval = collate_char;
return CHAR;
}
}
} else if (c == '[' && *prestr == '=') {
char equiv_char;
prestr++;
equiv_char = *prestr++;
if (*prestr == '=' && prestr[1] == ']') {
prestr += 2;
/* Found it: map via locale TBD: for now
simply return this char. This is
sufficient to pass conformance test
awk.ex 156
*/
if (*prestr == ']') {
prestr++;
rlxval = equiv_char;
return CHAR;
}
}
} else if (c == '\0') {
FATAL("nonterminated character class %.20s", lastre);
} else if (bp == buf) { /* 1st char is special */
@ -855,6 +1026,75 @@ int relex(void) /* lexical analyzer for reparse */
} else
*bp++ = c;
}
break;
case '{':
if (isdigit(*(prestr))) {
num = 0; /* Process as a repetition */
n = -1; m = -1;
commafound = 0;
digitfound = 0;
startreptok = prestr-1;
/* Remember start of previous atom here ? */
} else { /* just a { char, not a repetition */
rlxval = c;
return CHAR;
}
for (; ; ) {
if ((c = *prestr++) == '}') {
if (commafound) {
if (digitfound) { /* {n,m} */
m = num;
if (m<n)
FATAL("illegal repetition expression: class %.20s",
lastre);
if ((n==0) && (m==1)) {
return QUEST;
}
} else { /* {n,} */
if (n==0) return STAR;
if (n==1) return PLUS;
}
} else {
if (digitfound) { /* {n} same as {n,n} */
n = num;
m = num;
} else { /* {} */
FATAL("illegal repetition expression: class %.20s",
lastre);
}
}
if (repeat(starttok, prestr-starttok, lastatom,
startreptok - lastatom, n, m) > 0) {
if ((n==0) && (m==0)) {
return EMPTYRE;
}
/* must rescan input for next token */
goto rescan;
}
/* Failed to replace: eat up {...} characters
and treat like just PLUS */
return PLUS;
} else if (c == '\0') {
FATAL("nonterminated character class %.20s",
lastre);
} else if (isdigit(c)) {
num = 10 * num + c - '0';
digitfound = 1;
} else if (c == ',') {
if (commafound)
FATAL("illegal repetition expression: class %.20s",
lastre);
/* looking for {n,} or {n,m} */
commafound = 1;
n = num;
digitfound = 0; /* reset */
num = 0;
} else {
FATAL("illegal repetition expression: class %.20s",
lastre);
}
}
break;
}
}

2
main.c
View File

@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
const char *version = "version 20180827";
const char *version = "version 20190123";
#define DEBUG
#include <stdio.h>