From 8a2222286cb82eca05a7b9f7b1fa60b69c2c8a9f Mon Sep 17 00:00:00 2001
From: Martijn Dekker <martijn@inlv.org>
Date: Wed, 23 Jan 2019 09:12:27 +0000
Subject: [PATCH 1/2] backport ERE interval/repetition expressions from Apple
 awk-24

The lack of POSIX interval expressions[*] (a.k.a. bounds, a.k.a.
repetition expressions) in regular expressions is listed under BUGS
in 'awk.1'. Apple's version of onetrueawk has supported these since
at least 2009, judging by the date stamp on their src/b.c in:
https://opensource.apple.com/tarballs/awk/awk-24.tar.gz

A bug report prompted NetBSD to swiftly integrate this code into
their awk. This commit is based on that NetBSD diff.
http://gnats.netbsd.org/53885
https://github.com/NetBSD/src/commit/f3e4c4ca1dfcdd939a2e33ebfe708f01e25b3bae

b.c:
- Backport POSIX-standard interval expressions support in regular
  expressions via NetBSD from Apple awk-24 (20070501).

main.c:
- Bump version ID.

FIXES:
- Add note and credit for this feature.

awk.1: section BUGS:
- Remove line saying interval expressions are not supported.

_________
[*] http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_06
---
 FIXES  |   5 ++
 awk.1  |   2 -
 b.c    | 244 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 main.c |   2 +-
 4 files changed, 248 insertions(+), 5 deletions(-)

diff --git a/FIXES b/FIXES
index 909afb7..eb3b84f 100644
--- a/FIXES
+++ b/FIXES
@@ -25,6 +25,11 @@ THIS SOFTWARE.
 This file lists all bug fixes, changes, etc., made since the AWK book
 was sent to the printers in August, 1987.
 
+Jan 23, 2019:
+	Added support for POSIX-standard interval expressions (a.k.a.
+	bounds, a.k.a. repetition expressions) in regular expressions,
+	backported (via NetBSD) from Apple awk-24 (20070501).
+
 Oct 25, 2018:
 	Added test in maketab.c to prevent generating a proctab entry
 	for YYSTYPE_IS_DEFINED.  It was harmless but some gcc settings
diff --git a/awk.1 b/awk.1
index 5830143..18e99ad 100644
--- a/awk.1
+++ b/awk.1
@@ -558,6 +558,4 @@ to force it to be treated as a string concatenate
 The scope rules for variables in functions are a botch;
 the syntax is worse.
 .br
-POSIX-standard interval expressions in regular expressions are not supported.
-.br
 Only eight-bit characters sets are handled correctly.
diff --git a/b.c b/b.c
index a54a234..13fddd8 100644
--- a/b.c
+++ b/b.c
@@ -65,6 +65,11 @@ int	rlxval;
 static uschar	*rlxstr;
 static uschar	*prestr;	/* current position in current re */
 static uschar	*lastre;	/* origin of last re */
+static uschar	*lastatom;	/* origin of last Atom */
+static uschar	*starttok;
+static uschar 	*basestr;	/* starts with original, replaced during
+				   repetition processing */
+static uschar 	*firstbasestr;
 
 static	int setcnt;
 static	int poscnt;
@@ -124,6 +129,8 @@ fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
 	Node *p, *p1;
 	fa *f;
 
+	firstbasestr = (uschar *) s;
+	basestr = firstbasestr;
 	p = reparse(s);
 	p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
 		/* put ALL STAR in front of reg.  exp. */
@@ -145,6 +152,10 @@ fa *mkdfa(const char *s, int anchor)	/* does the real work of making a dfa */
 	f->initstat = makeinit(f, anchor);
 	f->anchor = anchor;
 	f->restr = (uschar *) tostring(s);
+	if (firstbasestr != basestr) {
+		if (basestr)
+			xfree(basestr);
+	}
 	return f;
 }
 
@@ -628,9 +639,11 @@ Node *regexp(void)	/* top-level parse of reg expr */
 Node *primary(void)
 {
 	Node *np;
+	int savelastatom;
 
 	switch (rtok) {
 	case CHAR:
+		lastatom = starttok;
 		np = op2(CHAR, NIL, itonp(rlxval));
 		rtok = relex();
 		return (unary(np));
@@ -639,16 +652,19 @@ Node *primary(void)
 		return (unary(op2(ALL, NIL, NIL)));
 	case EMPTYRE:
 		rtok = relex();
-		return (unary(op2(ALL, NIL, NIL)));
+		return (unary(op2(EMPTYRE, NIL, NIL)));
 	case DOT:
+		lastatom = starttok;
 		rtok = relex();
 		return (unary(op2(DOT, NIL, NIL)));
 	case CCL:
 		np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
+		lastatom = starttok;
 		rtok = relex();
 		return (unary(np));
 	case NCCL:
 		np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
+		lastatom = starttok;
 		rtok = relex();
 		return (unary(np));
 	case '^':
@@ -658,6 +674,8 @@ Node *primary(void)
 		rtok = relex();
 		return (unary(op2(CHAR, NIL, NIL)));
 	case '(':
+		lastatom = starttok;
+		savelastatom = starttok - basestr; /* Retain over recursion */
 		rtok = relex();
 		if (rtok == ')') {	/* special pleading for () */
 			rtok = relex();
@@ -665,6 +683,7 @@ Node *primary(void)
 		}
 		np = regexp();
 		if (rtok == ')') {
+			lastatom = basestr + savelastatom; /* Restore */
 			rtok = relex();
 			return (unary(np));
 		}
@@ -679,8 +698,12 @@ Node *primary(void)
 Node *concat(Node *np)
 {
 	switch (rtok) {
-	case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
+	case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
 		return (concat(op2(CAT, np, primary())));
+	case EMPTYRE:
+		rtok = relex();
+		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
+				primary())));
 	}
 	return (np);
 }
@@ -765,6 +788,115 @@ struct charclass {
 	{ NULL,		0,	NULL },
 };
 
+#define REPEAT_SIMPLE		0
+#define REPEAT_PLUS_APPENDED	1
+#define REPEAT_WITH_Q		2
+#define REPEAT_ZERO		3
+
+static int	/* returns 1 after a rewrite, or 2 when the rewrite was the {0} case */
+replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+	       int atomlen, int firstnum, int secondnum, int special_case)
+{	/* Build a new basestr in which the bound at reptok is expanded per special_case, and reposition prestr for rescanning. */
+	int i, j;
+	uschar *buf = 0;
+	int ret = 1;
+	int init_q = (firstnum==0);		/* {0,m}: the in-place atom gets a ? appended */
+	int n_q_reps = secondnum-firstnum;	/* m-n optional reps needed; only used for REPEAT_WITH_Q */
+	int prefix_length = reptok - basestr;	/* bytes before the bound; includes the atom's first copy */
+	int suffix_length = strlen((char *) reptok) - reptoklen;	/* bytes after the bound's closing brace */
+	int size = prefix_length +  suffix_length;
+
+	if (firstnum > 1) {	/* room for reps 2 through firstnum; NOTE(review): int arithmetic, no overflow check */
+		size += atomlen*(firstnum-1);
+	}
+
+	/* Adjust size of buffer for special cases */
+	if (special_case == REPEAT_PLUS_APPENDED) {
+		size++;		/* for the final + */
+	} else if (special_case == REPEAT_WITH_Q) {
+		size += init_q + (atomlen+1)* (n_q_reps-init_q);
+	} else if (special_case == REPEAT_ZERO) {
+		size += 2;	/* just a null ERE: () */
+	}
+	if ((buf = (uschar *) malloc(size+1)) == NULL)
+		FATAL("out of space in reg expr %.10s..", lastre);
+	memcpy(buf, basestr, prefix_length);	/* copy prefix, atom included */
+	j = prefix_length;
+	if (special_case == REPEAT_ZERO) {
+		j -= atomlen;	/* back up so that () overwrites the atom */
+		buf[j++] = '(';
+		buf[j++] = ')';
+	}
+	for (i=1; i < firstnum; i++) {		/* append reps 2 through firstnum of the atom */
+		memcpy(&buf[j], atom, atomlen);
+		j += atomlen;
+	}
+	if (special_case == REPEAT_PLUS_APPENDED) {
+		buf[j++] = '+';
+	} else if (special_case == REPEAT_WITH_Q) {
+		if (init_q) buf[j++] = '?';	/* makes the in-place atom the first optional rep */
+		for (i=init_q; i < n_q_reps; i++) {	/* copy x? reps; starting at init_q keeps x{0,m} at m reps max */
+			memcpy(&buf[j], atom, atomlen);
+			j += atomlen;
+			buf[j++] = '?';
+		}
+	}
+	memcpy(&buf[j], reptok+reptoklen, suffix_length);
+	if (special_case == REPEAT_ZERO) {	/* result may be shorter than size, so terminate at its real end */
+		buf[j+suffix_length] = '\0';
+	} else {
+		buf[size] = '\0';
+	}
+	/* free old basestr unless it is still the string first stored in firstbasestr */
+	if (firstbasestr != basestr) {
+		if (basestr)
+			xfree(basestr);
+	}
+	basestr = buf;
+	prestr  = buf + prefix_length;	/* scanning resumes just after the atom's first copy */
+	if (special_case == REPEAT_ZERO) {
+		prestr  -= atomlen;	/* the atom is gone: resume at the inserted () */
+		ret++;
+	}
+	return ret;
+}
+
+static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
+		  int atomlen, int firstnum, int secondnum)
+{
+	/*
+	   Expand the bound {firstnum,secondnum} that follows an atom into an
+	   equivalent ERE by delegating to replace_repeat(): extra copies of
+	   the atom are appended, plus ? or + as needed. The atom's first copy
+	   stays in place except for a zero-repeat ({0}). secondnum < 0 means
+	   no upper limit. Returns replace_repeat()'s result, or 0 if no rewrite.
+	 */
+	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
+		if (firstnum < 2) {
+			/* 0 or 1: should be handled before you get here */
+		} else {
+			return replace_repeat(reptok, reptoklen, atom, atomlen,
+				firstnum, secondnum, REPEAT_PLUS_APPENDED);
+		}
+	} else if (firstnum == secondnum) {	/* {n} or {n,n} -> simply repeat n-1 times */
+		if (firstnum == 0) {	/* {0} or {0,0} */
+			/* Unlike the other cases, the atom itself is removed
+			   and replaced by (), so the rewritten string may be
+			   SHORTER than the original ERE */
+			return replace_repeat(reptok, reptoklen, atom, atomlen,
+					firstnum, secondnum, REPEAT_ZERO);
+		} else {		/* (firstnum >= 1) */
+			return replace_repeat(reptok, reptoklen, atom, atomlen,
+					firstnum, secondnum, REPEAT_SIMPLE);
+		}
+	} else if (firstnum < secondnum) {	/* {n,m} -> repeat n-1 times, then add optional (x?) copies */
+		/*  x{n,m}  =>  xx...x{1, m-n+1}  =>  xx...x?x?x?..x?	*/
+		return replace_repeat(reptok, reptoklen, atom, atomlen,
+					firstnum, secondnum, REPEAT_WITH_Q);
+	} else {	/* Error - shouldn't be here (n>m) */
+	}
+	return 0;
+}
 
 int relex(void)		/* lexical analyzer for reparse */
 {
@@ -775,6 +907,11 @@ int relex(void)		/* lexical analyzer for reparse */
 	uschar *bp;
 	struct charclass *cc;
 	int i;
+	int num, m, commafound, digitfound;
+	const uschar *startreptok;
+
+rescan:
+	starttok = prestr;
 
 	switch (c = *prestr++) {
 	case '|': return OR;
@@ -841,6 +978,40 @@ int relex(void)		/* lexical analyzer for reparse */
 					}
 				} else
 					*bp++ = c;
+			} else if (c == '[' && *prestr == '.') {
+				char collate_char;
+				prestr++;
+				collate_char = *prestr++;
+				if (*prestr == '.' && prestr[1] == ']') {
+					prestr += 2;
+					/* Found it: map via locale TBD: for
+					   now, simply return this char.  This
+					   is sufficient to pass conformance
+					   test awk.ex 156
+					 */
+					if (*prestr == ']') {
+						prestr++;
+						rlxval = collate_char;
+						return CHAR;
+					}
+				}
+			} else if (c == '[' && *prestr == '=') {
+				char equiv_char;
+				prestr++;
+				equiv_char = *prestr++;
+				if (*prestr == '=' && prestr[1] == ']') {
+					prestr += 2;
+					/* Found it: map via locale TBD: for now
+					   simply return this char. This is
+					   sufficient to pass conformance test
+					   awk.ex 156
+					 */
+					if (*prestr == ']') {
+						prestr++;
+						rlxval = equiv_char;
+						return CHAR;
+					}
+				}
 			} else if (c == '\0') {
 				FATAL("nonterminated character class %.20s", lastre);
 			} else if (bp == buf) {	/* 1st char is special */
@@ -855,6 +1026,75 @@ int relex(void)		/* lexical analyzer for reparse */
 			} else
 				*bp++ = c;
 		}
+		break;
+	case '{':
+		if (isdigit(*(prestr))) {
+			num = 0;	/* Process as a repetition */
+			n = -1; m = -1;
+			commafound = 0;
+			digitfound = 0;
+			startreptok = prestr-1;
+			/* Remember start of previous atom here ? */
+		} else {        	/* just a { char, not a repetition */
+			rlxval = c;
+			return CHAR;
+                }
+		for (; ; ) {
+			if ((c = *prestr++) == '}') {
+				if (commafound) {
+					if (digitfound) { /* {n,m} */
+						m = num;
+						if (m<n)
+							FATAL("illegal repetition expression: class %.20s",
+								lastre);
+						if ((n==0) && (m==1)) {
+							return QUEST;
+						}
+					} else {	/* {n,} */
+						if (n==0) return STAR;
+						if (n==1) return PLUS;
+					}
+				} else {
+					if (digitfound) { /* {n} same as {n,n} */
+						n = num;
+						m = num;
+					} else {	/* {} */
+						FATAL("illegal repetition expression: class %.20s",
+							lastre);
+					}
+				}
+				if (repeat(starttok, prestr-starttok, lastatom,
+					   startreptok - lastatom, n, m) > 0) {
+					if ((n==0) && (m==0)) {
+						return EMPTYRE;
+					}
+					/* must rescan input for next token */
+					goto rescan;
+				}
+				/* Failed to replace: eat up {...} characters
+				   and treat like just PLUS */
+				return PLUS;
+			} else if (c == '\0') {
+				FATAL("nonterminated character class %.20s",
+					lastre);
+			} else if (isdigit(c)) {
+				num = 10 * num + c - '0';
+				digitfound = 1;
+			} else if (c == ',') {
+				if (commafound)
+					FATAL("illegal repetition expression: class %.20s",
+						lastre);
+				/* looking for {n,} or {n,m} */
+				commafound = 1;
+				n = num;
+				digitfound = 0; /* reset */
+				num = 0;
+			} else {
+				FATAL("illegal repetition expression: class %.20s",
+					lastre);
+			}
+		}
+		break;
 	}
 }
 
diff --git a/main.c b/main.c
index 1c38a1e..ef5c311 100644
--- a/main.c
+++ b/main.c
@@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 THIS SOFTWARE.
 ****************************************************************/
 
-const char	*version = "version 20180827";
+const char	*version = "version 20190123";
 
 #define DEBUG
 #include <stdio.h>

From 0619d5d5377ea2485b858e48da74780b75568500 Mon Sep 17 00:00:00 2001
From: Martijn Dekker <martijn@inlv.org>
Date: Thu, 21 Feb 2019 22:38:16 +0100
Subject: [PATCH 2/2] repeat(): add FATAL calls for errors that should be
 impossible

---
 b.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/b.c b/b.c
index 13fddd8..94de52e 100644
--- a/b.c
+++ b/b.c
@@ -874,6 +874,7 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
 	if (secondnum < 0) {	/* means {n,} -> repeat n-1 times followed by PLUS */
 		if (firstnum < 2) {
 			/* 0 or 1: should be handled before you get here */
+			FATAL("internal error");
 		} else {
 			return replace_repeat(reptok, reptoklen, atom, atomlen,
 				firstnum, secondnum, REPEAT_PLUS_APPENDED);
@@ -894,6 +895,7 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
 		return replace_repeat(reptok, reptoklen, atom, atomlen,
 					firstnum, secondnum, REPEAT_WITH_Q);
 	} else {	/* Error - shouldn't be here (n>m) */
+		FATAL("internal error");
 	}
 	return 0;
 }