backport ERE interval/repetition expressions from Apple awk-24

The lack of POSIX interval expressions[*] (a.k.a. bounds, a.k.a. repetition expressions) in regular expressions is listed under BUGS in 'awk.1'. Apple's version of onetrueawk has supported these since at least 2009, judging by the date stamp on their src/b.c in: https://opensource.apple.com/tarballs/awk/awk-24.tar.gz . A bug report prompted NetBSD to swiftly integrate this code into their awk. This commit is based on that NetBSD diff: http://gnats.netbsd.org/53885 f3e4c4ca1d . Changes by file -- b.c: Backport POSIX-standard interval expressions support in regular expressions via NetBSD from Apple awk-24 (20070501). main.c: Bump version ID. FIXES: Add note and credit for this feature. awk.1, section BUGS: Remove line saying interval expressions are not supported. _________ [*] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_06
2019-01-23 09:12:27 +00:00 · 2019-01-23 09:12:27 +00:00 · 8a2222286c
commit 8a2222286c
parent c3c7c1370e
4 changed files with 248 additions and 5 deletions
--- a/5
+++ b/5
@ -25,6 +25,11 @@ THIS SOFTWARE.
 This file lists all bug fixes, changes, etc., made since the AWK book
 was sent to the printers in August, 1987.
 Jan 23, 2019:
 	Added support for POSIX-standard interval expressions (a.k.a.
 	bounds, a.k.a. repetition expressions) in regular expressions,
 	backported (via NetBSD) from Apple awk-24 (20070501).
 Oct 25, 2018:
 	Added test in maketab.c to prevent generating a proctab entry
 	for YYSTYPE_IS_DEFINED.  It was harmless but some gcc settings
--- a/awk.1
+++ b/awk.1
@ -558,6 +558,4 @@ to force it to be treated as a string concatenate
 The scope rules for variables in functions are a botch;
 the syntax is worse.
 .br
 POSIX-standard interval expressions in regular expressions are not supported.
 .br
 Only eight-bit character sets are handled correctly.
--- a/b.c
+++ b/b.c
@ -65,6 +65,11 @@ int	rlxval;
 static uschar	*rlxstr;
 static uschar	*prestr;	/* current position in current re */
 static uschar	*lastre;	/* origin of last re */
 static uschar	*lastatom;	/* origin of last Atom */
 static uschar	*starttok;
 static uschar 	*basestr;	/* starts with original, replaced during
 				   repetition processing */
 static uschar 	*firstbasestr;
 static	int setcnt;
 static	int poscnt;
@ -124,6 +129,8 @@ fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
 	Node *p, *p1;
 	fa *f;
 	firstbasestr = (uschar *) s;
 	basestr = firstbasestr;
 	p = reparse(s);
 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 		/* put ALL STAR in front of reg.  exp. */
@ -145,6 +152,10 @@ fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
 	f->initstat = makeinit(f, anchor);
 	f->anchor = anchor;
 	f->restr = (uschar *) tostring(s);
 	if (firstbasestr != basestr) {
 		if (basestr)
 			xfree(basestr);
 	}
 	return f;
 }
@ -628,9 +639,11 @@ Node *regexp(void)	/* top-level parse of reg expr */
 Node *primary(void)
 {
 	Node *np;
 	int savelastatom;
 	switch (rtok) {
 	case CHAR:
 		lastatom = starttok;
 		np = op2(CHAR, NIL, itonp(rlxval));
 		rtok = relex();
 		return (unary(np));
@ -639,16 +652,19 @@ Node *primary(void)
 		return (unary(op2(ALL, NIL, NIL)));
 	case EMPTYRE:
 		rtok = relex();
-		return (unary(op2(ALL, NIL, NIL)));
+		return (unary(op2(EMPTYRE, NIL, NIL)));
 	case DOT:
 		lastatom = starttok;
 		rtok = relex();
 		return (unary(op2(DOT, NIL, NIL)));
 	case CCL:
 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
 		lastatom = starttok;
 		rtok = relex();
 		return (unary(np));
 	case NCCL:
 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
 		lastatom = starttok;
 		rtok = relex();
 		return (unary(np));
 	case '^':
@ -658,6 +674,8 @@ Node *primary(void)
 		rtok = relex();
 		return (unary(op2(CHAR, NIL, NIL)));
 	case '(':
 		lastatom = starttok;
 		savelastatom = starttok - basestr; /* Retain over recursion */
 		rtok = relex();
 		if (rtok == ')') {	/* special pleading for () */
 			rtok = relex();
@ -665,6 +683,7 @@ Node *primary(void)
 		}
 		np = regexp();
 		if (rtok == ')') {
 			lastatom = basestr + savelastatom; /* Restore */
 			rtok = relex();
 			return (unary(np));
 		}
@ -679,8 +698,12 @@ Node *primary(void)
 Node *concat(Node *np)
 {
 	switch (rtok) {
-	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
+	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
 		return (concat(op2(CAT, np, primary())));
 	case EMPTYRE:
 		rtok = relex();
 		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
 				primary())));
 	}
 	return (np);
 }
@ -765,6 +788,115 @@ struct charclass {
 	{ NULL,		0,	NULL },
 };
 /* Expansion strategies understood by replace_repeat() */
 #define REPEAT_SIMPLE		0	/* {n}: n-1 extra copies of the atom */
 #define REPEAT_PLUS_APPENDED	1	/* {n,}: n-1 extra copies, then '+' */
 #define REPEAT_WITH_Q		2	/* {n,m}: extra copies, then optional "atom?" copies */
 #define REPEAT_ZERO		3	/* {0}: atom is replaced by the null ERE "()" */
 /*
  * replace_repeat: build a new regular expression string in which the
  * interval expression starting at reptok is replaced by an equivalent ERE
  * made of literal copies of the preceding atom plus '?' and '+'.
  *
  *	reptok, reptoklen	start and length of the "{...}" specifier,
  *				which must lie inside basestr
  *	atom, atomlen		start and length of the atom being repeated
  *	firstnum		lower bound n
  *	secondnum		upper bound m (only used for REPEAT_WITH_Q)
  *	special_case		one of the REPEAT_* codes above
  *
  * The copy of the atom that already precedes the specifier is kept as the
  * first repetition, except for REPEAT_ZERO where it is overwritten by "()".
  * On return basestr points at the newly allocated string (the previous one
  * is freed unless it is firstbasestr) and prestr points just past the copied
  * prefix, or at the "()" for REPEAT_ZERO, so scanning resumes in the new text.
  *
  * Returns 1, or 2 for REPEAT_ZERO.  Calls FATAL if malloc fails.
  */
 static int
 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
 	       int atomlen, int firstnum, int secondnum, int special_case)
 {
 	int i, j;
 	uschar *buf = 0;
 	int ret = 1;
 	int init_q = (firstnum == 0);		/* first added char will be ? */
 	int n_q_reps = secondnum - firstnum;	/* m>n, so reduce until {1,m-n} left  */
 	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
 	int suffix_length = strlen((char *) reptok) - reptoklen;	/* string after rep specifier	*/
 	int size = prefix_length + suffix_length;
 	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
 		size += atomlen * (firstnum - 1);
 	}
 	/* Adjust size of buffer for special cases */
 	if (special_case == REPEAT_PLUS_APPENDED) {
 		size++;		/* for the final + */
 	} else if (special_case == REPEAT_WITH_Q) {
 		/*
 		 * When n == 0 the atom already in the prefix becomes the
 		 * first optional copy (it only needs a '?'), so one fewer
 		 * "atom?" group is appended.
 		 */
 		size += init_q + (atomlen + 1) * (n_q_reps - init_q);
 	} else if (special_case == REPEAT_ZERO) {
 		size += 2;	/* just a null ERE: () */
 	}
 	if ((buf = (uschar *) malloc(size + 1)) == NULL)
 		FATAL("out of space in reg expr %.10s..", lastre);
 	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
 	j = prefix_length;
 	if (special_case == REPEAT_ZERO) {
 		j -= atomlen;	/* overwrite the atom itself */
 		buf[j++] = '(';
 		buf[j++] = ')';
 	}
 	for (i = 1; i < firstnum; i++) {	/* copy x reps 	*/
 		memcpy(&buf[j], atom, atomlen);
 		j += atomlen;
 	}
 	if (special_case == REPEAT_PLUS_APPENDED) {
 		buf[j++] = '+';
 	} else if (special_case == REPEAT_WITH_Q) {
 		if (init_q)
 			buf[j++] = '?';
 		/*
 		 * Start at init_q: for {0,m} the '?' just written already
 		 * accounts for one of the m optional repetitions.  Starting
 		 * at 0 would turn x{0,2} into x?x?x? and accept three x's.
 		 */
 		for (i = init_q; i < n_q_reps; i++) {	/* copy x? reps */
 			memcpy(&buf[j], atom, atomlen);
 			j += atomlen;
 			buf[j++] = '?';
 		}
 	}
 	memcpy(&buf[j], reptok + reptoklen, suffix_length);
 	/* terminate at the real end; for REPEAT_ZERO this is shorter than size */
 	j += suffix_length;
 	buf[j] = '\0';
 	/* free old basestr */
 	if (firstbasestr != basestr) {
 		if (basestr)
 			xfree(basestr);
 	}
 	basestr = buf;
 	prestr  = buf + prefix_length;
 	if (special_case == REPEAT_ZERO) {
 		prestr  -= atomlen;
 		ret++;
 	}
 	return ret;
 }
 /*
  * repeat: expand an interval expression ("bound") into plain ERE text.
  *
  * The bound {firstnum,secondnum} found at reptok (reptoklen chars long) is
  * classified and handed to replace_repeat(), which rewrites the expression
  * by repeating the immediately preceding atom (atomlen chars at atom) and
  * adding '?' or '+' as needed.  A negative secondnum means "no upper bound".
  *
  * Returns whatever replace_repeat() returns, or 0 when nothing was replaced:
  * that happens for {0,} and {1,}, which the caller is expected to have
  * handled already, and for a bound whose lower limit exceeds its upper one.
  */
 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
 		  int atomlen, int firstnum, int secondnum)
 {
 	int how;	/* REPEAT_* expansion to request */
 	if (secondnum < 0) {
 		/* {n,}: only n >= 2 needs rewriting here */
 		if (firstnum < 2)
 			return 0;
 		how = REPEAT_PLUS_APPENDED;
 	} else if (firstnum > secondnum) {
 		/* n > m is an error; should not get here */
 		return 0;
 	} else if (firstnum < secondnum) {
 		/* {n,m}: mandatory copies followed by optional ones */
 		how = REPEAT_WITH_Q;
 	} else if (firstnum == 0) {
 		/* {0} or {0,0}: unusual because the replacement string
 		   may be shorter than the original ERE */
 		how = REPEAT_ZERO;
 	} else {
 		/* {n} or {n,n} with n >= 1: simply repeat */
 		how = REPEAT_SIMPLE;
 	}
 	return replace_repeat(reptok, reptoklen, atom, atomlen,
 			      firstnum, secondnum, how);
 }
 int relex(void)		/* lexical analyzer for reparse */
 {
@ -775,6 +907,11 @@ int relex(void)		/* lexical analyzer for reparse */
 	uschar *bp;
 	struct charclass *cc;
 	int i;
 	int num, m, commafound, digitfound;
 	const uschar *startreptok;
 rescan:
 	starttok = prestr;
 	switch (c = *prestr++) {
 	case '|': return OR;
@ -841,6 +978,40 @@ int relex(void)		/* lexical analyzer for reparse */
 					}
 				} else
 					*bp++ = c;
 			} else if (c == '[' && *prestr == '.') {
 				char collate_char;
 				prestr++;
 				collate_char = *prestr++;
 				if (*prestr == '.' && prestr[1] == ']') {
 					prestr += 2;
 					/* Found it: map via locale TBD: for
 					   now, simply return this char.  This
 					   is sufficient to pass conformance
 					   test awk.ex 156
 					 */
 					if (*prestr == ']') {
 						prestr++;
 						rlxval = collate_char;
 						return CHAR;
 					}
 				}
 			} else if (c == '[' && *prestr == '=') {
 				char equiv_char;
 				prestr++;
 				equiv_char = *prestr++;
 				if (*prestr == '=' && prestr[1] == ']') {
 					prestr += 2;
 					/* Found it: map via locale TBD: for now
 					   simply return this char. This is
 					   sufficient to pass conformance test
 					   awk.ex 156
 					 */
 					if (*prestr == ']') {
 						prestr++;
 						rlxval = equiv_char;
 						return CHAR;
 					}
 				}
 			} else if (c == '\0') {
 				FATAL("nonterminated character class %.20s", lastre);
 			} else if (bp == buf) {	/* 1st char is special */
@ -855,6 +1026,75 @@ int relex(void)		/* lexical analyzer for reparse */
 			} else
 				*bp++ = c;
 		}
 		break;
 	case '{':
 		if (isdigit(*(prestr))) {
 			num = 0;	/* Process as a repetition */
 			n = -1; m = -1;
 			commafound = 0;
 			digitfound = 0;
 			startreptok = prestr-1;
 			/* Remember start of previous atom here ? */
 		} else {        	/* just a { char, not a repetition */
 			rlxval = c;
 			return CHAR;
                }
 		for (; ; ) {
 			if ((c = *prestr++) == '}') {
 				if (commafound) {
 					if (digitfound) { /* {n,m} */
 						m = num;
 						if (m<n)
 							FATAL("illegal repetition expression: class %.20s",
 								lastre);
 						if ((n==0) && (m==1)) {
 							return QUEST;
 						}
 					} else {	/* {n,} */
 						if (n==0) return STAR;
 						if (n==1) return PLUS;
 					}
 				} else {
 					if (digitfound) { /* {n} same as {n,n} */
 						n = num;
 						m = num;
 					} else {	/* {} */
 						FATAL("illegal repetition expression: class %.20s",
 							lastre);
 					}
 				}
 				if (repeat(starttok, prestr-starttok, lastatom,
 					   startreptok - lastatom, n, m) > 0) {
 					if ((n==0) && (m==0)) {
 						return EMPTYRE;
 					}
 					/* must rescan input for next token */
 					goto rescan;
 				}
 				/* Failed to replace: eat up {...} characters
 				   and treat like just PLUS */
 				return PLUS;
 			} else if (c == '\0') {
 				FATAL("nonterminated character class %.20s",
 					lastre);
 			} else if (isdigit(c)) {
 				num = 10 * num + c - '0';
 				digitfound = 1;
 			} else if (c == ',') {
 				if (commafound)
 					FATAL("illegal repetition expression: class %.20s",
 						lastre);
 				/* looking for {n,} or {n,m} */
 				commafound = 1;
 				n = num;
 				digitfound = 0; /* reset */
 				num = 0;
 			} else {
 				FATAL("illegal repetition expression: class %.20s",
 					lastre);
 			}
 		}
 		break;
 	}
 }
--- a/main.c
+++ b/main.c
@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 THIS SOFTWARE.
 ****************************************************************/
-const char	*version = "version 20180827";
+const char	*version = "version 20190123";
 #define DEBUG
 #include <stdio.h>