backport ERE interval/repetition expressions from Apple awk-24
The lack of POSIX interval expressions[*] (a.k.a. bounds, a.k.a.
repetition expressions) in regular expressions is listed under BUGS
in 'awk.1'. Apple's version of onetrueawk has supported these since
at least 2009, judging by the date stamp on their src/b.c in:
https://opensource.apple.com/tarballs/awk/awk-24.tar.gz
A bug report prompted NetBSD to swiftly integrate this code into
their awk. This commit is based on that NetBSD diff.
http://gnats.netbsd.org/53885
f3e4c4ca1d
b.c:
- Backport support for POSIX-standard interval expressions in regular
expressions, via NetBSD, from Apple awk-24 (20070501).
main.c:
- Bump version ID.
FIXES:
- Add note and credit for this feature.
awk.1: section BUGS:
- Remove line saying interval expressions are not supported.
_________
[*] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_06
This commit is contained in:
parent
c3c7c1370e
commit
8a2222286c
5
FIXES
5
FIXES
@ -25,6 +25,11 @@ THIS SOFTWARE.
|
|||||||
This file lists all bug fixes, changes, etc., made since the AWK book
|
This file lists all bug fixes, changes, etc., made since the AWK book
|
||||||
was sent to the printers in August, 1987.
|
was sent to the printers in August, 1987.
|
||||||
|
|
||||||
|
Jan 23, 2019:
|
||||||
|
Added support for POSIX-standard interval expressions (a.k.a.
|
||||||
|
bounds, a.k.a. repetition expressions) in regular expressions,
|
||||||
|
backported (via NetBSD) from Apple awk-24 (20070501).
|
||||||
|
|
||||||
Oct 25, 2018:
|
Oct 25, 2018:
|
||||||
Added test in maketab.c to prevent generating a proctab entry
|
Added test in maketab.c to prevent generating a proctab entry
|
||||||
for YYSTYPE_IS_DEFINED. It was harmless but some gcc settings
|
for YYSTYPE_IS_DEFINED. It was harmless but some gcc settings
|
||||||
|
2
awk.1
2
awk.1
@ -558,6 +558,4 @@ to force it to be treated as a string concatenate
|
|||||||
The scope rules for variables in functions are a botch;
|
The scope rules for variables in functions are a botch;
|
||||||
the syntax is worse.
|
the syntax is worse.
|
||||||
.br
|
.br
|
||||||
POSIX-standard interval expressions in regular expressions are not supported.
|
|
||||||
.br
|
|
||||||
Only eight-bit characters sets are handled correctly.
|
Only eight-bit characters sets are handled correctly.
|
||||||
|
244
b.c
244
b.c
@ -65,6 +65,11 @@ int rlxval;
|
|||||||
static uschar *rlxstr;
|
static uschar *rlxstr;
|
||||||
static uschar *prestr; /* current position in current re */
|
static uschar *prestr; /* current position in current re */
|
||||||
static uschar *lastre; /* origin of last re */
|
static uschar *lastre; /* origin of last re */
|
||||||
|
static uschar *lastatom; /* origin of last Atom */
|
||||||
|
static uschar *starttok;
|
||||||
|
static uschar *basestr; /* starts with original, replaced during
|
||||||
|
repetition processing */
|
||||||
|
static uschar *firstbasestr;
|
||||||
|
|
||||||
static int setcnt;
|
static int setcnt;
|
||||||
static int poscnt;
|
static int poscnt;
|
||||||
@ -124,6 +129,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
|
|||||||
Node *p, *p1;
|
Node *p, *p1;
|
||||||
fa *f;
|
fa *f;
|
||||||
|
|
||||||
|
firstbasestr = (uschar *) s;
|
||||||
|
basestr = firstbasestr;
|
||||||
p = reparse(s);
|
p = reparse(s);
|
||||||
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
|
p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
|
||||||
/* put ALL STAR in front of reg. exp. */
|
/* put ALL STAR in front of reg. exp. */
|
||||||
@ -145,6 +152,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
|
|||||||
f->initstat = makeinit(f, anchor);
|
f->initstat = makeinit(f, anchor);
|
||||||
f->anchor = anchor;
|
f->anchor = anchor;
|
||||||
f->restr = (uschar *) tostring(s);
|
f->restr = (uschar *) tostring(s);
|
||||||
|
if (firstbasestr != basestr) {
|
||||||
|
if (basestr)
|
||||||
|
xfree(basestr);
|
||||||
|
}
|
||||||
return f;
|
return f;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -628,9 +639,11 @@ Node *regexp(void) /* top-level parse of reg expr */
|
|||||||
Node *primary(void)
|
Node *primary(void)
|
||||||
{
|
{
|
||||||
Node *np;
|
Node *np;
|
||||||
|
int savelastatom;
|
||||||
|
|
||||||
switch (rtok) {
|
switch (rtok) {
|
||||||
case CHAR:
|
case CHAR:
|
||||||
|
lastatom = starttok;
|
||||||
np = op2(CHAR, NIL, itonp(rlxval));
|
np = op2(CHAR, NIL, itonp(rlxval));
|
||||||
rtok = relex();
|
rtok = relex();
|
||||||
return (unary(np));
|
return (unary(np));
|
||||||
@ -639,16 +652,19 @@ Node *primary(void)
|
|||||||
return (unary(op2(ALL, NIL, NIL)));
|
return (unary(op2(ALL, NIL, NIL)));
|
||||||
case EMPTYRE:
|
case EMPTYRE:
|
||||||
rtok = relex();
|
rtok = relex();
|
||||||
return (unary(op2(ALL, NIL, NIL)));
|
return (unary(op2(EMPTYRE, NIL, NIL)));
|
||||||
case DOT:
|
case DOT:
|
||||||
|
lastatom = starttok;
|
||||||
rtok = relex();
|
rtok = relex();
|
||||||
return (unary(op2(DOT, NIL, NIL)));
|
return (unary(op2(DOT, NIL, NIL)));
|
||||||
case CCL:
|
case CCL:
|
||||||
np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
|
np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
|
||||||
|
lastatom = starttok;
|
||||||
rtok = relex();
|
rtok = relex();
|
||||||
return (unary(np));
|
return (unary(np));
|
||||||
case NCCL:
|
case NCCL:
|
||||||
np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
|
np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
|
||||||
|
lastatom = starttok;
|
||||||
rtok = relex();
|
rtok = relex();
|
||||||
return (unary(np));
|
return (unary(np));
|
||||||
case '^':
|
case '^':
|
||||||
@ -658,6 +674,8 @@ Node *primary(void)
|
|||||||
rtok = relex();
|
rtok = relex();
|
||||||
return (unary(op2(CHAR, NIL, NIL)));
|
return (unary(op2(CHAR, NIL, NIL)));
|
||||||
case '(':
|
case '(':
|
||||||
|
lastatom = starttok;
|
||||||
|
savelastatom = starttok - basestr; /* Retain over recursion */
|
||||||
rtok = relex();
|
rtok = relex();
|
||||||
if (rtok == ')') { /* special pleading for () */
|
if (rtok == ')') { /* special pleading for () */
|
||||||
rtok = relex();
|
rtok = relex();
|
||||||
@ -665,6 +683,7 @@ Node *primary(void)
|
|||||||
}
|
}
|
||||||
np = regexp();
|
np = regexp();
|
||||||
if (rtok == ')') {
|
if (rtok == ')') {
|
||||||
|
lastatom = basestr + savelastatom; /* Restore */
|
||||||
rtok = relex();
|
rtok = relex();
|
||||||
return (unary(np));
|
return (unary(np));
|
||||||
}
|
}
|
||||||
@ -679,8 +698,12 @@ Node *primary(void)
|
|||||||
Node *concat(Node *np)
|
Node *concat(Node *np)
|
||||||
{
|
{
|
||||||
switch (rtok) {
|
switch (rtok) {
|
||||||
case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
|
case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
|
||||||
return (concat(op2(CAT, np, primary())));
|
return (concat(op2(CAT, np, primary())));
|
||||||
|
case EMPTYRE:
|
||||||
|
rtok = relex();
|
||||||
|
return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
|
||||||
|
primary())));
|
||||||
}
|
}
|
||||||
return (np);
|
return (np);
|
||||||
}
|
}
|
||||||
@ -765,6 +788,115 @@ struct charclass {
|
|||||||
{ NULL, 0, NULL },
|
{ NULL, 0, NULL },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#define REPEAT_SIMPLE 0
|
||||||
|
#define REPEAT_PLUS_APPENDED 1
|
||||||
|
#define REPEAT_WITH_Q 2
|
||||||
|
#define REPEAT_ZERO 3
|
||||||
|
|
||||||
|
static int
|
||||||
|
replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
|
||||||
|
int atomlen, int firstnum, int secondnum, int special_case)
|
||||||
|
{
|
||||||
|
int i, j;
|
||||||
|
uschar *buf = 0;
|
||||||
|
int ret = 1;
|
||||||
|
int init_q = (firstnum==0); /* first added char will be ? */
|
||||||
|
int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
|
||||||
|
int prefix_length = reptok - basestr; /* prefix includes first rep */
|
||||||
|
int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */
|
||||||
|
int size = prefix_length + suffix_length;
|
||||||
|
|
||||||
|
if (firstnum > 1) { /* add room for reps 2 through firstnum */
|
||||||
|
size += atomlen*(firstnum-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Adjust size of buffer for special cases */
|
||||||
|
if (special_case == REPEAT_PLUS_APPENDED) {
|
||||||
|
size++; /* for the final + */
|
||||||
|
} else if (special_case == REPEAT_WITH_Q) {
|
||||||
|
size += init_q + (atomlen+1)* n_q_reps;
|
||||||
|
} else if (special_case == REPEAT_ZERO) {
|
||||||
|
size += 2; /* just a null ERE: () */
|
||||||
|
}
|
||||||
|
if ((buf = (uschar *) malloc(size+1)) == NULL)
|
||||||
|
FATAL("out of space in reg expr %.10s..", lastre);
|
||||||
|
memcpy(buf, basestr, prefix_length); /* copy prefix */
|
||||||
|
j = prefix_length;
|
||||||
|
if (special_case == REPEAT_ZERO) {
|
||||||
|
j -= atomlen;
|
||||||
|
buf[j++] = '(';
|
||||||
|
buf[j++] = ')';
|
||||||
|
}
|
||||||
|
for (i=1; i < firstnum; i++) { /* copy x reps */
|
||||||
|
memcpy(&buf[j], atom, atomlen);
|
||||||
|
j += atomlen;
|
||||||
|
}
|
||||||
|
if (special_case == REPEAT_PLUS_APPENDED) {
|
||||||
|
buf[j++] = '+';
|
||||||
|
} else if (special_case == REPEAT_WITH_Q) {
|
||||||
|
if (init_q) buf[j++] = '?';
|
||||||
|
for (i=0; i < n_q_reps; i++) { /* copy x? reps */
|
||||||
|
memcpy(&buf[j], atom, atomlen);
|
||||||
|
j += atomlen;
|
||||||
|
buf[j++] = '?';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
memcpy(&buf[j], reptok+reptoklen, suffix_length);
|
||||||
|
if (special_case == REPEAT_ZERO) {
|
||||||
|
buf[j+suffix_length] = '\0';
|
||||||
|
} else {
|
||||||
|
buf[size] = '\0';
|
||||||
|
}
|
||||||
|
/* free old basestr */
|
||||||
|
if (firstbasestr != basestr) {
|
||||||
|
if (basestr)
|
||||||
|
xfree(basestr);
|
||||||
|
}
|
||||||
|
basestr = buf;
|
||||||
|
prestr = buf + prefix_length;
|
||||||
|
if (special_case == REPEAT_ZERO) {
|
||||||
|
prestr -= atomlen;
|
||||||
|
ret++;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
|
||||||
|
int atomlen, int firstnum, int secondnum)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
In general, the repetition specifier or "bound" is replaced here
|
||||||
|
by an equivalent ERE string, repeating the immediately previous atom
|
||||||
|
and appending ? and + as needed. Note that the first copy of the
|
||||||
|
atom is left in place, except in the special_case of a zero-repeat
|
||||||
|
(i.e., {0}).
|
||||||
|
*/
|
||||||
|
if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
|
||||||
|
if (firstnum < 2) {
|
||||||
|
/* 0 or 1: should be handled before you get here */
|
||||||
|
} else {
|
||||||
|
return replace_repeat(reptok, reptoklen, atom, atomlen,
|
||||||
|
firstnum, secondnum, REPEAT_PLUS_APPENDED);
|
||||||
|
}
|
||||||
|
} else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
|
||||||
|
if (firstnum == 0) { /* {0} or {0,0} */
|
||||||
|
/* This case is unusual because the resulting
|
||||||
|
replacement string might actually be SMALLER than
|
||||||
|
the original ERE */
|
||||||
|
return replace_repeat(reptok, reptoklen, atom, atomlen,
|
||||||
|
firstnum, secondnum, REPEAT_ZERO);
|
||||||
|
} else { /* (firstnum >= 1) */
|
||||||
|
return replace_repeat(reptok, reptoklen, atom, atomlen,
|
||||||
|
firstnum, secondnum, REPEAT_SIMPLE);
|
||||||
|
}
|
||||||
|
} else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
|
||||||
|
/* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
|
||||||
|
return replace_repeat(reptok, reptoklen, atom, atomlen,
|
||||||
|
firstnum, secondnum, REPEAT_WITH_Q);
|
||||||
|
} else { /* Error - shouldn't be here (n>m) */
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
int relex(void) /* lexical analyzer for reparse */
|
int relex(void) /* lexical analyzer for reparse */
|
||||||
{
|
{
|
||||||
@ -775,6 +907,11 @@ int relex(void) /* lexical analyzer for reparse */
|
|||||||
uschar *bp;
|
uschar *bp;
|
||||||
struct charclass *cc;
|
struct charclass *cc;
|
||||||
int i;
|
int i;
|
||||||
|
int num, m, commafound, digitfound;
|
||||||
|
const uschar *startreptok;
|
||||||
|
|
||||||
|
rescan:
|
||||||
|
starttok = prestr;
|
||||||
|
|
||||||
switch (c = *prestr++) {
|
switch (c = *prestr++) {
|
||||||
case '|': return OR;
|
case '|': return OR;
|
||||||
@ -841,6 +978,40 @@ int relex(void) /* lexical analyzer for reparse */
|
|||||||
}
|
}
|
||||||
} else
|
} else
|
||||||
*bp++ = c;
|
*bp++ = c;
|
||||||
|
} else if (c == '[' && *prestr == '.') {
|
||||||
|
char collate_char;
|
||||||
|
prestr++;
|
||||||
|
collate_char = *prestr++;
|
||||||
|
if (*prestr == '.' && prestr[1] == ']') {
|
||||||
|
prestr += 2;
|
||||||
|
/* Found it: map via locale TBD: for
|
||||||
|
now, simply return this char. This
|
||||||
|
is sufficient to pass conformance
|
||||||
|
test awk.ex 156
|
||||||
|
*/
|
||||||
|
if (*prestr == ']') {
|
||||||
|
prestr++;
|
||||||
|
rlxval = collate_char;
|
||||||
|
return CHAR;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else if (c == '[' && *prestr == '=') {
|
||||||
|
char equiv_char;
|
||||||
|
prestr++;
|
||||||
|
equiv_char = *prestr++;
|
||||||
|
if (*prestr == '=' && prestr[1] == ']') {
|
||||||
|
prestr += 2;
|
||||||
|
/* Found it: map via locale TBD: for now
|
||||||
|
simply return this char. This is
|
||||||
|
sufficient to pass conformance test
|
||||||
|
awk.ex 156
|
||||||
|
*/
|
||||||
|
if (*prestr == ']') {
|
||||||
|
prestr++;
|
||||||
|
rlxval = equiv_char;
|
||||||
|
return CHAR;
|
||||||
|
}
|
||||||
|
}
|
||||||
} else if (c == '\0') {
|
} else if (c == '\0') {
|
||||||
FATAL("nonterminated character class %.20s", lastre);
|
FATAL("nonterminated character class %.20s", lastre);
|
||||||
} else if (bp == buf) { /* 1st char is special */
|
} else if (bp == buf) { /* 1st char is special */
|
||||||
@ -855,6 +1026,75 @@ int relex(void) /* lexical analyzer for reparse */
|
|||||||
} else
|
} else
|
||||||
*bp++ = c;
|
*bp++ = c;
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
case '{':
|
||||||
|
if (isdigit(*(prestr))) {
|
||||||
|
num = 0; /* Process as a repetition */
|
||||||
|
n = -1; m = -1;
|
||||||
|
commafound = 0;
|
||||||
|
digitfound = 0;
|
||||||
|
startreptok = prestr-1;
|
||||||
|
/* Remember start of previous atom here ? */
|
||||||
|
} else { /* just a { char, not a repetition */
|
||||||
|
rlxval = c;
|
||||||
|
return CHAR;
|
||||||
|
}
|
||||||
|
for (; ; ) {
|
||||||
|
if ((c = *prestr++) == '}') {
|
||||||
|
if (commafound) {
|
||||||
|
if (digitfound) { /* {n,m} */
|
||||||
|
m = num;
|
||||||
|
if (m<n)
|
||||||
|
FATAL("illegal repetition expression: class %.20s",
|
||||||
|
lastre);
|
||||||
|
if ((n==0) && (m==1)) {
|
||||||
|
return QUEST;
|
||||||
|
}
|
||||||
|
} else { /* {n,} */
|
||||||
|
if (n==0) return STAR;
|
||||||
|
if (n==1) return PLUS;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (digitfound) { /* {n} same as {n,n} */
|
||||||
|
n = num;
|
||||||
|
m = num;
|
||||||
|
} else { /* {} */
|
||||||
|
FATAL("illegal repetition expression: class %.20s",
|
||||||
|
lastre);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (repeat(starttok, prestr-starttok, lastatom,
|
||||||
|
startreptok - lastatom, n, m) > 0) {
|
||||||
|
if ((n==0) && (m==0)) {
|
||||||
|
return EMPTYRE;
|
||||||
|
}
|
||||||
|
/* must rescan input for next token */
|
||||||
|
goto rescan;
|
||||||
|
}
|
||||||
|
/* Failed to replace: eat up {...} characters
|
||||||
|
and treat like just PLUS */
|
||||||
|
return PLUS;
|
||||||
|
} else if (c == '\0') {
|
||||||
|
FATAL("nonterminated character class %.20s",
|
||||||
|
lastre);
|
||||||
|
} else if (isdigit(c)) {
|
||||||
|
num = 10 * num + c - '0';
|
||||||
|
digitfound = 1;
|
||||||
|
} else if (c == ',') {
|
||||||
|
if (commafound)
|
||||||
|
FATAL("illegal repetition expression: class %.20s",
|
||||||
|
lastre);
|
||||||
|
/* looking for {n,} or {n,m} */
|
||||||
|
commafound = 1;
|
||||||
|
n = num;
|
||||||
|
digitfound = 0; /* reset */
|
||||||
|
num = 0;
|
||||||
|
} else {
|
||||||
|
FATAL("illegal repetition expression: class %.20s",
|
||||||
|
lastre);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
2
main.c
2
main.c
@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
|
|||||||
THIS SOFTWARE.
|
THIS SOFTWARE.
|
||||||
****************************************************************/
|
****************************************************************/
|
||||||
|
|
||||||
const char *version = "version 20180827";
|
const char *version = "version 20190123";
|
||||||
|
|
||||||
#define DEBUG
|
#define DEBUG
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
Loading…
Reference in New Issue
Block a user