From 8a2222286cb82eca05a7b9f7b1fa60b69c2c8a9f Mon Sep 17 00:00:00 2001 From: Martijn Dekker Date: Wed, 23 Jan 2019 09:12:27 +0000 Subject: [PATCH 1/2] backport ERE interval/repetition expressions from Apple awk-24 The lack of POSIX interval expressions[*] (a.k.a. bounds, a.k.a. repetition expressions) in regular expressions is listed under BUGS in 'awk.1'. Apple's version of onetrueawk has supported these since at least 2009, judging by the date stamp on their src/b.c in: https://opensource.apple.com/tarballs/awk/awk-24.tar.gz A bug report prompted NetBSD to swiftly integrate this code into their awk. This commit is based on that NetBSD diff. http://gnats.netbsd.org/53885 https://github.com/NetBSD/src/commit/f3e4c4ca1dfcdd939a2e33ebfe708f01e25b3bae b.c: - Backport POSIX-standard interval expressions support in regular expressions via NetBSD from Apple awk-24 (20070501). main.c: - Bump version ID. FIXES: - Add note and credit for this feature. awk.1: section BUGS: - Remove line saying interval expressions are not supported. _________ [*] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_06 --- FIXES | 5 ++ awk.1 | 2 - b.c | 244 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- main.c | 2 +- 4 files changed, 248 insertions(+), 5 deletions(-) diff --git a/FIXES b/FIXES index 909afb7..eb3b84f 100644 --- a/FIXES +++ b/FIXES @@ -25,6 +25,11 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the AWK book was sent to the printers in August, 1987. +Jan 23, 2019: + Added support for POSIX-standard interval expressions (a.k.a. + bounds, a.k.a. repetition expressions) in regular expressions, + backported (via NetBSD) from Apple awk-24 (20070501). + Oct 25, 2018: Added test in maketab.c to prevent generating a proctab entry for YYSTYPE_IS_DEFINED. 
It was harmless but some gcc settings diff --git a/awk.1 b/awk.1 index 5830143..18e99ad 100644 --- a/awk.1 +++ b/awk.1 @@ -558,6 +558,4 @@ to force it to be treated as a string concatenate The scope rules for variables in functions are a botch; the syntax is worse. .br -POSIX-standard interval expressions in regular expressions are not supported. -.br Only eight-bit characters sets are handled correctly. diff --git a/b.c b/b.c index a54a234..13fddd8 100644 --- a/b.c +++ b/b.c @@ -65,6 +65,11 @@ int rlxval; static uschar *rlxstr; static uschar *prestr; /* current position in current re */ static uschar *lastre; /* origin of last re */ +static uschar *lastatom; /* origin of last Atom */ +static uschar *starttok; +static uschar *basestr; /* starts with original, replaced during + repetition processing */ +static uschar *firstbasestr; static int setcnt; static int poscnt; @@ -124,6 +129,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ Node *p, *p1; fa *f; + firstbasestr = (uschar *) s; + basestr = firstbasestr; p = reparse(s); p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); /* put ALL STAR in front of reg. exp. 
*/ @@ -145,6 +152,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ f->initstat = makeinit(f, anchor); f->anchor = anchor; f->restr = (uschar *) tostring(s); + if (firstbasestr != basestr) { + if (basestr) + xfree(basestr); + } return f; } @@ -628,9 +639,11 @@ Node *regexp(void) /* top-level parse of reg expr */ Node *primary(void) { Node *np; + int savelastatom; switch (rtok) { case CHAR: + lastatom = starttok; np = op2(CHAR, NIL, itonp(rlxval)); rtok = relex(); return (unary(np)); @@ -639,16 +652,19 @@ Node *primary(void) return (unary(op2(ALL, NIL, NIL))); case EMPTYRE: rtok = relex(); - return (unary(op2(ALL, NIL, NIL))); + return (unary(op2(EMPTYRE, NIL, NIL))); case DOT: + lastatom = starttok; rtok = relex(); return (unary(op2(DOT, NIL, NIL))); case CCL: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); + lastatom = starttok; rtok = relex(); return (unary(np)); case NCCL: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); + lastatom = starttok; rtok = relex(); return (unary(np)); case '^': @@ -658,6 +674,8 @@ Node *primary(void) rtok = relex(); return (unary(op2(CHAR, NIL, NIL))); case '(': + lastatom = starttok; + savelastatom = starttok - basestr; /* Retain over recursion */ rtok = relex(); if (rtok == ')') { /* special pleading for () */ rtok = relex(); @@ -665,6 +683,7 @@ Node *primary(void) } np = regexp(); if (rtok == ')') { + lastatom = basestr + savelastatom; /* Restore */ rtok = relex(); return (unary(np)); } @@ -679,8 +698,12 @@ Node *primary(void) Node *concat(Node *np) { switch (rtok) { - case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': + case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': return (concat(op2(CAT, np, primary()))); + case EMPTYRE: + rtok = relex(); + return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), + primary()))); } return (np); } @@ -765,6 +788,115 @@ struct charclass { { NULL, 0, NULL }, }; +#define REPEAT_SIMPLE 0 
+#define REPEAT_PLUS_APPENDED 1 +#define REPEAT_WITH_Q 2 +#define REPEAT_ZERO 3 + +static int +replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, + int atomlen, int firstnum, int secondnum, int special_case) +{ + int i, j; + uschar *buf = 0; + int ret = 1; + int init_q = (firstnum==0); /* first added char will be ? */ + int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ + int prefix_length = reptok - basestr; /* prefix includes first rep */ + int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */ + int size = prefix_length + suffix_length; + + if (firstnum > 1) { /* add room for reps 2 through firstnum */ + size += atomlen*(firstnum-1); + } + + /* Adjust size of buffer for special cases */ + if (special_case == REPEAT_PLUS_APPENDED) { + size++; /* for the final + */ + } else if (special_case == REPEAT_WITH_Q) { + size += init_q + (atomlen+1)* n_q_reps; + } else if (special_case == REPEAT_ZERO) { + size += 2; /* just a null ERE: () */ + } + if ((buf = (uschar *) malloc(size+1)) == NULL) + FATAL("out of space in reg expr %.10s..", lastre); + memcpy(buf, basestr, prefix_length); /* copy prefix */ + j = prefix_length; + if (special_case == REPEAT_ZERO) { + j -= atomlen; + buf[j++] = '('; + buf[j++] = ')'; + } + for (i=1; i < firstnum; i++) { /* copy x reps */ + memcpy(&buf[j], atom, atomlen); + j += atomlen; + } + if (special_case == REPEAT_PLUS_APPENDED) { + buf[j++] = '+'; + } else if (special_case == REPEAT_WITH_Q) { + if (init_q) buf[j++] = '?'; + for (i=0; i < n_q_reps; i++) { /* copy x? 
reps */ + memcpy(&buf[j], atom, atomlen); + j += atomlen; + buf[j++] = '?'; + } + } + memcpy(&buf[j], reptok+reptoklen, suffix_length); + if (special_case == REPEAT_ZERO) { + buf[j+suffix_length] = '\0'; + } else { + buf[size] = '\0'; + } + /* free old basestr */ + if (firstbasestr != basestr) { + if (basestr) + xfree(basestr); + } + basestr = buf; + prestr = buf + prefix_length; + if (special_case == REPEAT_ZERO) { + prestr -= atomlen; + ret++; + } + return ret; +} + +static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, + int atomlen, int firstnum, int secondnum) +{ + /* + In general, the repetition specifier or "bound" is replaced here + by an equivalent ERE string, repeating the immediately previous atom + and appending ? and + as needed. Note that the first copy of the + atom is left in place, except in the special_case of a zero-repeat + (i.e., {0}). + */ + if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ + if (firstnum < 2) { + /* 0 or 1: should be handled before you get here */ + } else { + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_PLUS_APPENDED); + } + } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ + if (firstnum == 0) { /* {0} or {0,0} */ + /* This case is unusual because the resulting + replacement string might actually be SMALLER than + the original ERE */ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_ZERO); + } else { /* (firstnum >= 1) */ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_SIMPLE); + } + } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ + /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? 
*/ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_WITH_Q); + } else { /* Error - shouldn't be here (n>m) */ + } + return 0; +} int relex(void) /* lexical analyzer for reparse */ { @@ -775,6 +907,11 @@ int relex(void) /* lexical analyzer for reparse */ uschar *bp; struct charclass *cc; int i; + int num, m, commafound, digitfound; + const uschar *startreptok; + +rescan: + starttok = prestr; switch (c = *prestr++) { case '|': return OR; @@ -841,6 +978,40 @@ int relex(void) /* lexical analyzer for reparse */ } } else *bp++ = c; + } else if (c == '[' && *prestr == '.') { + char collate_char; + prestr++; + collate_char = *prestr++; + if (*prestr == '.' && prestr[1] == ']') { + prestr += 2; + /* Found it: map via locale TBD: for + now, simply return this char. This + is sufficient to pass conformance + test awk.ex 156 + */ + if (*prestr == ']') { + prestr++; + rlxval = collate_char; + return CHAR; + } + } + } else if (c == '[' && *prestr == '=') { + char equiv_char; + prestr++; + equiv_char = *prestr++; + if (*prestr == '=' && prestr[1] == ']') { + prestr += 2; + /* Found it: map via locale TBD: for now + simply return this char. This is + sufficient to pass conformance test + awk.ex 156 + */ + if (*prestr == ']') { + prestr++; + rlxval = equiv_char; + return CHAR; + } + } } else if (c == '\0') { FATAL("nonterminated character class %.20s", lastre); } else if (bp == buf) { /* 1st char is special */ @@ -855,6 +1026,75 @@ int relex(void) /* lexical analyzer for reparse */ } else *bp++ = c; } + break; + case '{': + if (isdigit(*(prestr))) { + num = 0; /* Process as a repetition */ + n = -1; m = -1; + commafound = 0; + digitfound = 0; + startreptok = prestr-1; + /* Remember start of previous atom here ? 
*/ + } else { /* just a { char, not a repetition */ + rlxval = c; + return CHAR; + } + for (; ; ) { + if ((c = *prestr++) == '}') { + if (commafound) { + if (digitfound) { /* {n,m} */ + m = num; + if (m<n) + FATAL("illegal repetition expression: class %.20s", + lastre); + if ((n==0) && (m==1)) { + return QUEST; + } + } else { /* {n,} */ + if (n==0) return STAR; + if (n==1) return PLUS; + } + } else { + if (digitfound) { /* {n} same as {n,n} */ + n = num; + m = num; + } else { /* {} */ + FATAL("illegal repetition expression: class %.20s", + lastre); + } + } + if (repeat(starttok, prestr-starttok, lastatom, + startreptok - lastatom, n, m) > 0) { + if ((n==0) && (m==0)) { + return EMPTYRE; + } + /* must rescan input for next token */ + goto rescan; + } + /* Failed to replace: eat up {...} characters + and treat like just PLUS */ + return PLUS; + } else if (c == '\0') { + FATAL("nonterminated character class %.20s", + lastre); + } else if (isdigit(c)) { + num = 10 * num + c - '0'; + digitfound = 1; + } else if (c == ',') { + if (commafound) + FATAL("illegal repetition expression: class %.20s", + lastre); + /* looking for {n,} or {n,m} */ + commafound = 1; + n = num; + digitfound = 0; /* reset */ + num = 0; + } else { + FATAL("illegal repetition expression: class %.20s", + lastre); + } + } + break; } } diff --git a/main.c b/main.c index 1c38a1e..ef5c311 100644 --- a/main.c +++ b/main.c @@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
****************************************************************/ -const char *version = "version 20180827"; +const char *version = "version 20190123"; #define DEBUG #include <stdio.h> From 0619d5d5377ea2485b858e48da74780b75568500 Mon Sep 17 00:00:00 2001 From: Martijn Dekker Date: Thu, 21 Feb 2019 22:38:16 +0100 Subject: [PATCH 2/2] repeat(): add FATAL calls for errors that should be impossible --- b.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/b.c b/b.c index 13fddd8..94de52e 100644 --- a/b.c +++ b/b.c @@ -874,6 +874,7 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ if (firstnum < 2) { /* 0 or 1: should be handled before you get here */ + FATAL("internal error"); } else { return replace_repeat(reptok, reptoklen, atom, atomlen, firstnum, secondnum, REPEAT_PLUS_APPENDED); @@ -894,6 +895,7 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, return replace_repeat(reptok, reptoklen, atom, atomlen, firstnum, secondnum, REPEAT_WITH_Q); } else { /* Error - shouldn't be here (n>m) */ + FATAL("internal error"); } return 0; }