backport ERE interval/repetition expressions from Apple awk-24

The lack of POSIX interval expressions[*] (a.k.a. bounds, a.k.a. repetition expressions) in regular expressions is listed under BUGS in 'awk.1'. Apple's version of onetrueawk has supported these since at least 2009, judging by the date stamp on their src/b.c in: https://opensource.apple.com/tarballs/awk/awk-24.tar.gz A bug report prompted NetBSD to swiftly integrate this code into their awk. This commit is based on that NetBSD diff. http://gnats.netbsd.org/53885 f3e4c4ca1d b.c: - Backport POSIX-standard interval expressions support in regular expressions via NetBSD from Apple awk-24 (20070501). main.c: - Bump version ID. FIXES: - Add note and credit for this feature. awk.1: section BUGS: - Remove line saying interval expressions are not supported. _________ [*] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_06
2019-01-23 09:12:27 +00:00 · 2019-01-23 09:12:27 +00:00 · 8a2222286c
commit 8a2222286c
parent c3c7c1370e
4 changed files with 248 additions and 5 deletions
--- a/5
+++ b/5
@ -25,6 +25,11 @@ THIS SOFTWARE.
 This file lists all bug fixes, changes, etc., made since the AWK book
 was sent to the printers in August, 1987.

+Jan 23, 2019:
+	Added support for POSIX-standard interval expressions (a.k.a.
+	bounds, a.k.a. repetition expressions) in regular expressions,
+	backported (via NetBSD) from Apple awk-24 (20070501).
+
 Oct 25, 2018:
 	Added test in maketab.c to prevent generating a proctab entry
 	for YYSTYPE_IS_DEFINED.  It was harmless but some gcc settings
--- a/awk.1
+++ b/awk.1
@ -558,6 +558,4 @@ to force it to be treated as a string concatenate
 The scope rules for variables in functions are a botch;
 the syntax is worse.
 .br
-POSIX-standard interval expressions in regular expressions are not supported.
-.br
 Only eight-bit characters sets are handled correctly.
--- a/b.c
+++ b/b.c
@ -65,6 +65,11 @@ int	rlxval;
 static uschar	*rlxstr;
 static uschar	*prestr;	/* current position in current re */
 static uschar	*lastre;	/* origin of last re */
+static uschar	*lastatom;	/* origin of last Atom */
+static uschar	*starttok;
+static uschar 	*basestr;	/* starts with original, replaced during
+				   repetition processing */
+static uschar 	*firstbasestr;

 static	int setcnt;
 static	int poscnt;
@ -124,6 +129,8 @@ fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
 	Node *p, *p1;
 	fa *f;

+	firstbasestr = (uschar *) s;
+	basestr = firstbasestr;
 	p = reparse(s);
 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 		/* put ALL STAR in front of reg.  exp. */
@ -145,6 +152,10 @@ fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
 	f->initstat = makeinit(f, anchor);
 	f->anchor = anchor;
 	f->restr = (uschar *) tostring(s);
+	if (firstbasestr != basestr) {
+		if (basestr)
+			xfree(basestr);
+	}
 	return f;
 }

@ -628,9 +639,11 @@ Node *regexp(void)	/* top-level parse of reg expr */
 Node *primary(void)
 {
 	Node *np;
+	int savelastatom;

 	switch (rtok) {
 	case CHAR:
+		lastatom = starttok;
 		np = op2(CHAR, NIL, itonp(rlxval));
 		rtok = relex();
 		return (unary(np));
@ -639,16 +652,19 @@ Node *primary(void)
 		return (unary(op2(ALL, NIL, NIL)));
 	case EMPTYRE:
 		rtok = relex();
-		return (unary(op2(ALL, NIL, NIL)));
+		return (unary(op2(EMPTYRE, NIL, NIL)));
 	case DOT:
+		lastatom = starttok;
 		rtok = relex();
 		return (unary(op2(DOT, NIL, NIL)));
 	case CCL:
 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
+		lastatom = starttok;
 		rtok = relex();
 		return (unary(np));
 	case NCCL:
 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
+		lastatom = starttok;
 		rtok = relex();
 		return (unary(np));
 	case '^':
@ -658,6 +674,8 @@ Node *primary(void)
 		rtok = relex();
 		return (unary(op2(CHAR, NIL, NIL)));
 	case '(':
+		lastatom = starttok;
+		savelastatom = starttok - basestr; /* Retain over recursion */
 		rtok = relex();
 		if (rtok == ')') {	/* special pleading for () */
 			rtok = relex();
@ -665,6 +683,7 @@ Node *primary(void)
 		}
 		np = regexp();
 		if (rtok == ')') {
+			lastatom = basestr + savelastatom; /* Restore */
 			rtok = relex();
 			return (unary(np));
 		}
@ -679,8 +698,12 @@ Node *primary(void)
 Node *concat(Node *np)
 {
 	switch (rtok) {
-	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
+	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
 		return (concat(op2(CAT, np, primary())));
+	case EMPTYRE:
+		rtok = relex();
+		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
+				primary())));
 	}
 	return (np);
 }
@ -765,6 +788,115 @@ struct charclass {
 	{ NULL,		0,	NULL },
 };

+#define REPEAT_SIMPLE		0
+#define REPEAT_PLUS_APPENDED	1
+#define REPEAT_WITH_Q		2
+#define REPEAT_ZERO		3
+
+static int
+replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+	       int atomlen, int firstnum, int secondnum, int special_case)
+{
+	int i, j;
+	uschar *buf = 0;
+	int ret = 1;
+	int init_q = (firstnum==0);		/* first added char will be ? */
+	int n_q_reps = secondnum-firstnum;	/* m>n, so reduce until {1,m-n} left  */
+	int prefix_length = reptok - basestr;	/* prefix includes first rep	*/
+	int suffix_length = strlen((char *) reptok) - reptoklen;	/* string after rep specifier	*/
+	int size = prefix_length +  suffix_length;
+
+	if (firstnum > 1) {	/* add room for reps 2 through firstnum */
+		size += atomlen*(firstnum-1);
+	}
+
+	/* Adjust size of buffer for special cases */
+	if (special_case == REPEAT_PLUS_APPENDED) {
+		size++;		/* for the final + */
+	} else if (special_case == REPEAT_WITH_Q) {
+		size += init_q + (atomlen+1)* n_q_reps;
+	} else if (special_case == REPEAT_ZERO) {
+		size += 2;	/* just a null ERE: () */
+	}
+	if ((buf = (uschar *) malloc(size+1)) == NULL)
+		FATAL("out of space in reg expr %.10s..", lastre);
+	memcpy(buf, basestr, prefix_length);	/* copy prefix	*/
+	j = prefix_length;
+	if (special_case == REPEAT_ZERO) {
+		j -= atomlen;
+		buf[j++] = '(';
+		buf[j++] = ')';
+	}
+	for (i=1; i < firstnum; i++) {		/* copy x reps 	*/
+		memcpy(&buf[j], atom, atomlen);
+		j += atomlen;
+	}
+	if (special_case == REPEAT_PLUS_APPENDED) {
+		buf[j++] = '+';
+	} else if (special_case == REPEAT_WITH_Q) {
+		if (init_q) buf[j++] = '?';
+		for (i=0; i < n_q_reps; i++) {	/* copy x? reps */
+			memcpy(&buf[j], atom, atomlen);
+			j += atomlen;
+			buf[j++] = '?';
+		}
+	}
+	memcpy(&buf[j], reptok+reptoklen, suffix_length);
+	if (special_case == REPEAT_ZERO) {
+		buf[j+suffix_length] = '\0';
+	} else {
+		buf[size] = '\0';
+	}
+	/* free old basestr */
+	if (firstbasestr != basestr) {
+		if (basestr)
+			xfree(basestr);
+	}
+	basestr = buf;
+	prestr  = buf + prefix_length;
+	if (special_case == REPEAT_ZERO) {
+		prestr  -= atomlen;
+		ret++;
+	}
+	return ret;
+}
+
+static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+		  int atomlen, int firstnum, int secondnum)
+{
+	/*
+	   In general, the repetition specifier or "bound" is replaced here
+	   by an equivalent ERE string, repeating the immediately previous atom
+	   and appending ? and + as needed. Note that the first copy of the
+	   atom is left in place, except in the special_case of a zero-repeat
+	   (i.e., {0}).
+	 */
+	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
+		if (firstnum < 2) {
+			/* 0 or 1: should be handled before you get here */
+		} else {
+			return replace_repeat(reptok, reptoklen, atom, atomlen,
+				firstnum, secondnum, REPEAT_PLUS_APPENDED);
+		}
+	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
+		if (firstnum == 0) {	/* {0} or {0,0} */
+			/* This case is unusual because the resulting
+			   replacement string might actually be SMALLER than
+			   the original ERE */
+			return replace_repeat(reptok, reptoklen, atom, atomlen,
+					firstnum, secondnum, REPEAT_ZERO);
+		} else {		/* (firstnum >= 1) */
+			return replace_repeat(reptok, reptoklen, atom, atomlen,
+					firstnum, secondnum, REPEAT_SIMPLE);
+		}
+	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times then alternate  */
+		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
+		return replace_repeat(reptok, reptoklen, atom, atomlen,
+					firstnum, secondnum, REPEAT_WITH_Q);
+	} else {	/* Error - shouldn't be here (n>m) */
+	}
+	return 0;
+}

 int relex(void)		/* lexical analyzer for reparse */
 {
@ -775,6 +907,11 @@ int relex(void)		/* lexical analyzer for reparse */
 	uschar *bp;
 	struct charclass *cc;
 	int i;
+	int num, m, commafound, digitfound;
+	const uschar *startreptok;
+
+rescan:
+	starttok = prestr;

 	switch (c = *prestr++) {
 	case '|': return OR;
@ -841,6 +978,40 @@ int relex(void)		/* lexical analyzer for reparse */
 					}
 				} else
 					*bp++ = c;
+			} else if (c == '[' && *prestr == '.') {
+				char collate_char;
+				prestr++;
+				collate_char = *prestr++;
+				if (*prestr == '.' && prestr[1] == ']') {
+					prestr += 2;
+					/* Found it: map via locale TBD: for
+					   now, simply return this char.  This
+					   is sufficient to pass conformance
+					   test awk.ex 156
+					 */
+					if (*prestr == ']') {
+						prestr++;
+						rlxval = collate_char;
+						return CHAR;
+					}
+				}
+			} else if (c == '[' && *prestr == '=') {
+				char equiv_char;
+				prestr++;
+				equiv_char = *prestr++;
+				if (*prestr == '=' && prestr[1] == ']') {
+					prestr += 2;
+					/* Found it: map via locale TBD: for now
+					   simply return this char. This is
+					   sufficient to pass conformance test
+					   awk.ex 156
+					 */
+					if (*prestr == ']') {
+						prestr++;
+						rlxval = equiv_char;
+						return CHAR;
+					}
+				}
 			} else if (c == '\0') {
 				FATAL("nonterminated character class %.20s", lastre);
 			} else if (bp == buf) {	/* 1st char is special */
@ -855,6 +1026,75 @@ int relex(void)		/* lexical analyzer for reparse */
 			} else
 				*bp++ = c;
 		}
+		break;
+	case '{':
+		if (isdigit(*(prestr))) {
+			num = 0;	/* Process as a repetition */
+			n = -1; m = -1;
+			commafound = 0;
+			digitfound = 0;
+			startreptok = prestr-1;
+			/* Remember start of previous atom here ? */
+		} else {        	/* just a { char, not a repetition */
+			rlxval = c;
+			return CHAR;
+                }
+		for (; ; ) {
+			if ((c = *prestr++) == '}') {
+				if (commafound) {
+					if (digitfound) { /* {n,m} */
+						m = num;
+						if (m<n)
+							FATAL("illegal repetition expression: class %.20s",
+								lastre);
+						if ((n==0) && (m==1)) {
+							return QUEST;
+						}
+					} else {	/* {n,} */
+						if (n==0) return STAR;
+						if (n==1) return PLUS;
+					}
+				} else {
+					if (digitfound) { /* {n} same as {n,n} */
+						n = num;
+						m = num;
+					} else {	/* {} */
+						FATAL("illegal repetition expression: class %.20s",
+							lastre);
+					}
+				}
+				if (repeat(starttok, prestr-starttok, lastatom,
+					   startreptok - lastatom, n, m) > 0) {
+					if ((n==0) && (m==0)) {
+						return EMPTYRE;
+					}
+					/* must rescan input for next token */
+					goto rescan;
+				}
+				/* Failed to replace: eat up {...} characters
+				   and treat like just PLUS */
+				return PLUS;
+			} else if (c == '\0') {
+				FATAL("nonterminated character class %.20s",
+					lastre);
+			} else if (isdigit(c)) {
+				num = 10 * num + c - '0';
+				digitfound = 1;
+			} else if (c == ',') {
+				if (commafound)
+					FATAL("illegal repetition expression: class %.20s",
+						lastre);
+				/* looking for {n,} or {n,m} */
+				commafound = 1;
+				n = num;
+				digitfound = 0; /* reset */
+				num = 0;
+			} else {
+				FATAL("illegal repetition expression: class %.20s",
+					lastre);
+			}
+		}
+		break;
 	}
 }

--- a/main.c
+++ b/main.c
@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 THIS SOFTWARE.
 ****************************************************************/

-const char	*version = "version 20180827";
+const char	*version = "version 20190123";

 #define DEBUG
 #include <stdio.h>