backport ERE interval/repetition expressions from Apple awk-24
The lack of POSIX interval expressions[*] (a.k.a. bounds, a.k.a.
repetition expressions) in regular expressions is listed under BUGS
in 'awk.1'. Apple's version of onetrueawk has supported these since
at least 2009, judging by the date stamp on their src/b.c in:
https://opensource.apple.com/tarballs/awk/awk-24.tar.gz
A bug report prompted NetBSD to swiftly integrate this code into
their awk. This commit is based on that NetBSD diff.
http://gnats.netbsd.org/53885
f3e4c4ca1d
b.c:
- Backport POSIX-standard interval expressions support in regular
expressions via NetBSD from Apple awk-24 (20070501).
main.c:
- Bump version ID.
FIXES:
- Add note and credit for this feature.
awk.1: section BUGS:
- Remove line saying interval expressions are not supported.
_________
[*] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_06
This commit is contained in:
parent
c3c7c1370e
commit
8a2222286c
5
FIXES
5
FIXES
@ -25,6 +25,11 @@ THIS SOFTWARE.
|
||||
This file lists all bug fixes, changes, etc., made since the AWK book
|
||||
was sent to the printers in August, 1987.
|
||||
|
||||
Jan 23, 2019:
|
||||
Added support for POSIX-standard interval expressions (a.k.a.
|
||||
bounds, a.k.a. repetition expressions) in regular expressions,
|
||||
backported (via NetBSD) from Apple awk-24 (20070501).
|
||||
|
||||
Oct 25, 2018:
|
||||
Added test in maketab.c to prevent generating a proctab entry
|
||||
for YYSTYPE_IS_DEFINED. It was harmless but some gcc settings
|
||||
|
2
awk.1
2
awk.1
@ -558,6 +558,4 @@ to force it to be treated as a string concatenate
|
||||
The scope rules for variables in functions are a botch;
|
||||
the syntax is worse.
|
||||
.br
|
||||
POSIX-standard interval expressions in regular expressions are not supported.
|
||||
.br
|
||||
Only eight-bit character sets are handled correctly.
|
||||
|
244
b.c
244
b.c
@ -65,6 +65,11 @@ int rlxval;
|
||||
static uschar *rlxstr;
|
||||
static uschar *prestr; /* current position in current re */
|
||||
static uschar *lastre; /* origin of last re */
|
||||
static uschar *lastatom; /* origin of last Atom */
|
||||
static uschar *starttok;
|
||||
static uschar *basestr; /* starts with original, replaced during
|
||||
repetition processing */
|
||||
static uschar *firstbasestr;
|
||||
|
||||
static int setcnt;
|
||||
static int poscnt;
|
||||
@ -124,6 +129,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
|
||||
Node *p, *p1;
|
||||
fa *f;
|
||||
|
||||
firstbasestr = (uschar *) s;
|
||||
basestr = firstbasestr;
|
||||
p = reparse(s);
|
||||
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
|
||||
/* put ALL STAR in front of reg. exp. */
|
||||
@ -145,6 +152,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
|
||||
f->initstat = makeinit(f, anchor);
|
||||
f->anchor = anchor;
|
||||
f->restr = (uschar *) tostring(s);
|
||||
if (firstbasestr != basestr) {
|
||||
if (basestr)
|
||||
xfree(basestr);
|
||||
}
|
||||
return f;
|
||||
}
|
||||
|
||||
@ -628,9 +639,11 @@ Node *regexp(void) /* top-level parse of reg expr */
|
||||
Node *primary(void)
|
||||
{
|
||||
Node *np;
|
||||
int savelastatom;
|
||||
|
||||
switch (rtok) {
|
||||
case CHAR:
|
||||
lastatom = starttok;
|
||||
np = op2(CHAR, NIL, itonp(rlxval));
|
||||
rtok = relex();
|
||||
return (unary(np));
|
||||
@ -639,16 +652,19 @@ Node *primary(void)
|
||||
return (unary(op2(ALL, NIL, NIL)));
|
||||
case EMPTYRE:
|
||||
rtok = relex();
|
||||
return (unary(op2(ALL, NIL, NIL)));
|
||||
return (unary(op2(EMPTYRE, NIL, NIL)));
|
||||
case DOT:
|
||||
lastatom = starttok;
|
||||
rtok = relex();
|
||||
return (unary(op2(DOT, NIL, NIL)));
|
||||
case CCL:
|
||||
np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
|
||||
lastatom = starttok;
|
||||
rtok = relex();
|
||||
return (unary(np));
|
||||
case NCCL:
|
||||
np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
|
||||
lastatom = starttok;
|
||||
rtok = relex();
|
||||
return (unary(np));
|
||||
case '^':
|
||||
@ -658,6 +674,8 @@ Node *primary(void)
|
||||
rtok = relex();
|
||||
return (unary(op2(CHAR, NIL, NIL)));
|
||||
case '(':
|
||||
lastatom = starttok;
|
||||
savelastatom = starttok - basestr; /* Retain over recursion */
|
||||
rtok = relex();
|
||||
if (rtok == ')') { /* special pleading for () */
|
||||
rtok = relex();
|
||||
@ -665,6 +683,7 @@ Node *primary(void)
|
||||
}
|
||||
np = regexp();
|
||||
if (rtok == ')') {
|
||||
lastatom = basestr + savelastatom; /* Restore */
|
||||
rtok = relex();
|
||||
return (unary(np));
|
||||
}
|
||||
@ -679,8 +698,12 @@ Node *primary(void)
|
||||
Node *concat(Node *np)
|
||||
{
|
||||
switch (rtok) {
|
||||
case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
|
||||
case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
|
||||
return (concat(op2(CAT, np, primary())));
|
||||
case EMPTYRE:
|
||||
rtok = relex();
|
||||
return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
|
||||
primary())));
|
||||
}
|
||||
return (np);
|
||||
}
|
||||
@ -765,6 +788,115 @@ struct charclass {
|
||||
{ NULL, 0, NULL },
|
||||
};
|
||||
|
||||
#define REPEAT_SIMPLE 0
|
||||
#define REPEAT_PLUS_APPENDED 1
|
||||
#define REPEAT_WITH_Q 2
|
||||
#define REPEAT_ZERO 3
|
||||
|
||||
static int
|
||||
replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
|
||||
int atomlen, int firstnum, int secondnum, int special_case)
|
||||
{
|
||||
int i, j;
|
||||
uschar *buf = 0;
|
||||
int ret = 1;
|
||||
int init_q = (firstnum==0); /* first added char will be ? */
|
||||
int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
|
||||
int prefix_length = reptok - basestr; /* prefix includes first rep */
|
||||
int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */
|
||||
int size = prefix_length + suffix_length;
|
||||
|
||||
if (firstnum > 1) { /* add room for reps 2 through firstnum */
|
||||
size += atomlen*(firstnum-1);
|
||||
}
|
||||
|
||||
/* Adjust size of buffer for special cases */
|
||||
if (special_case == REPEAT_PLUS_APPENDED) {
|
||||
size++; /* for the final + */
|
||||
} else if (special_case == REPEAT_WITH_Q) {
|
||||
size += init_q + (atomlen+1)* n_q_reps;
|
||||
} else if (special_case == REPEAT_ZERO) {
|
||||
size += 2; /* just a null ERE: () */
|
||||
}
|
||||
if ((buf = (uschar *) malloc(size+1)) == NULL)
|
||||
FATAL("out of space in reg expr %.10s..", lastre);
|
||||
memcpy(buf, basestr, prefix_length); /* copy prefix */
|
||||
j = prefix_length;
|
||||
if (special_case == REPEAT_ZERO) {
|
||||
j -= atomlen;
|
||||
buf[j++] = '(';
|
||||
buf[j++] = ')';
|
||||
}
|
||||
for (i=1; i < firstnum; i++) { /* copy x reps */
|
||||
memcpy(&buf[j], atom, atomlen);
|
||||
j += atomlen;
|
||||
}
|
||||
if (special_case == REPEAT_PLUS_APPENDED) {
|
||||
buf[j++] = '+';
|
||||
} else if (special_case == REPEAT_WITH_Q) {
|
||||
if (init_q) buf[j++] = '?';
|
||||
for (i=0; i < n_q_reps; i++) { /* copy x? reps */
|
||||
memcpy(&buf[j], atom, atomlen);
|
||||
j += atomlen;
|
||||
buf[j++] = '?';
|
||||
}
|
||||
}
|
||||
memcpy(&buf[j], reptok+reptoklen, suffix_length);
|
||||
if (special_case == REPEAT_ZERO) {
|
||||
buf[j+suffix_length] = '\0';
|
||||
} else {
|
||||
buf[size] = '\0';
|
||||
}
|
||||
/* free old basestr */
|
||||
if (firstbasestr != basestr) {
|
||||
if (basestr)
|
||||
xfree(basestr);
|
||||
}
|
||||
basestr = buf;
|
||||
prestr = buf + prefix_length;
|
||||
if (special_case == REPEAT_ZERO) {
|
||||
prestr -= atomlen;
|
||||
ret++;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
|
||||
int atomlen, int firstnum, int secondnum)
|
||||
{
|
||||
/*
|
||||
In general, the repetition specifier or "bound" is replaced here
|
||||
by an equivalent ERE string, repeating the immediately previous atom
|
||||
and appending ? and + as needed. Note that the first copy of the
|
||||
atom is left in place, except in the special_case of a zero-repeat
|
||||
(i.e., {0}).
|
||||
*/
|
||||
if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
|
||||
if (firstnum < 2) {
|
||||
/* 0 or 1: should be handled before you get here */
|
||||
} else {
|
||||
return replace_repeat(reptok, reptoklen, atom, atomlen,
|
||||
firstnum, secondnum, REPEAT_PLUS_APPENDED);
|
||||
}
|
||||
} else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
|
||||
if (firstnum == 0) { /* {0} or {0,0} */
|
||||
/* This case is unusual because the resulting
|
||||
replacement string might actually be SMALLER than
|
||||
the original ERE */
|
||||
return replace_repeat(reptok, reptoklen, atom, atomlen,
|
||||
firstnum, secondnum, REPEAT_ZERO);
|
||||
} else { /* (firstnum >= 1) */
|
||||
return replace_repeat(reptok, reptoklen, atom, atomlen,
|
||||
firstnum, secondnum, REPEAT_SIMPLE);
|
||||
}
|
||||
} else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
|
||||
/* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
|
||||
return replace_repeat(reptok, reptoklen, atom, atomlen,
|
||||
firstnum, secondnum, REPEAT_WITH_Q);
|
||||
} else { /* Error - shouldn't be here (n>m) */
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int relex(void) /* lexical analyzer for reparse */
|
||||
{
|
||||
@ -775,6 +907,11 @@ int relex(void) /* lexical analyzer for reparse */
|
||||
uschar *bp;
|
||||
struct charclass *cc;
|
||||
int i;
|
||||
int num, m, commafound, digitfound;
|
||||
const uschar *startreptok;
|
||||
|
||||
rescan:
|
||||
starttok = prestr;
|
||||
|
||||
switch (c = *prestr++) {
|
||||
case '|': return OR;
|
||||
@ -841,6 +978,40 @@ int relex(void) /* lexical analyzer for reparse */
|
||||
}
|
||||
} else
|
||||
*bp++ = c;
|
||||
} else if (c == '[' && *prestr == '.') {
|
||||
char collate_char;
|
||||
prestr++;
|
||||
collate_char = *prestr++;
|
||||
if (*prestr == '.' && prestr[1] == ']') {
|
||||
prestr += 2;
|
||||
/* Found it: map via locale TBD: for
|
||||
now, simply return this char. This
|
||||
is sufficient to pass conformance
|
||||
test awk.ex 156
|
||||
*/
|
||||
if (*prestr == ']') {
|
||||
prestr++;
|
||||
rlxval = collate_char;
|
||||
return CHAR;
|
||||
}
|
||||
}
|
||||
} else if (c == '[' && *prestr == '=') {
|
||||
char equiv_char;
|
||||
prestr++;
|
||||
equiv_char = *prestr++;
|
||||
if (*prestr == '=' && prestr[1] == ']') {
|
||||
prestr += 2;
|
||||
/* Found it: map via locale TBD: for now
|
||||
simply return this char. This is
|
||||
sufficient to pass conformance test
|
||||
awk.ex 156
|
||||
*/
|
||||
if (*prestr == ']') {
|
||||
prestr++;
|
||||
rlxval = equiv_char;
|
||||
return CHAR;
|
||||
}
|
||||
}
|
||||
} else if (c == '\0') {
|
||||
FATAL("nonterminated character class %.20s", lastre);
|
||||
} else if (bp == buf) { /* 1st char is special */
|
||||
@ -855,6 +1026,75 @@ int relex(void) /* lexical analyzer for reparse */
|
||||
} else
|
||||
*bp++ = c;
|
||||
}
|
||||
break;
|
||||
case '{':
|
||||
if (isdigit(*(prestr))) {
|
||||
num = 0; /* Process as a repetition */
|
||||
n = -1; m = -1;
|
||||
commafound = 0;
|
||||
digitfound = 0;
|
||||
startreptok = prestr-1;
|
||||
/* Remember start of previous atom here ? */
|
||||
} else { /* just a { char, not a repetition */
|
||||
rlxval = c;
|
||||
return CHAR;
|
||||
}
|
||||
for (; ; ) {
|
||||
if ((c = *prestr++) == '}') {
|
||||
if (commafound) {
|
||||
if (digitfound) { /* {n,m} */
|
||||
m = num;
|
||||
if (m<n)
|
||||
FATAL("illegal repetition expression: class %.20s",
|
||||
lastre);
|
||||
if ((n==0) && (m==1)) {
|
||||
return QUEST;
|
||||
}
|
||||
} else { /* {n,} */
|
||||
if (n==0) return STAR;
|
||||
if (n==1) return PLUS;
|
||||
}
|
||||
} else {
|
||||
if (digitfound) { /* {n} same as {n,n} */
|
||||
n = num;
|
||||
m = num;
|
||||
} else { /* {} */
|
||||
FATAL("illegal repetition expression: class %.20s",
|
||||
lastre);
|
||||
}
|
||||
}
|
||||
if (repeat(starttok, prestr-starttok, lastatom,
|
||||
startreptok - lastatom, n, m) > 0) {
|
||||
if ((n==0) && (m==0)) {
|
||||
return EMPTYRE;
|
||||
}
|
||||
/* must rescan input for next token */
|
||||
goto rescan;
|
||||
}
|
||||
/* Failed to replace: eat up {...} characters
|
||||
and treat like just PLUS */
|
||||
return PLUS;
|
||||
} else if (c == '\0') {
|
||||
FATAL("nonterminated character class %.20s",
|
||||
lastre);
|
||||
} else if (isdigit(c)) {
|
||||
num = 10 * num + c - '0';
|
||||
digitfound = 1;
|
||||
} else if (c == ',') {
|
||||
if (commafound)
|
||||
FATAL("illegal repetition expression: class %.20s",
|
||||
lastre);
|
||||
/* looking for {n,} or {n,m} */
|
||||
commafound = 1;
|
||||
n = num;
|
||||
digitfound = 0; /* reset */
|
||||
num = 0;
|
||||
} else {
|
||||
FATAL("illegal repetition expression: class %.20s",
|
||||
lastre);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user