Fix readrec's definition of a record
I botched readrec's definition of a record when I implemented RS regular expression support. This is the relevant hunk from the old diff:

```
-	return c == EOF && rr == buf ? 0 : 1;
+	isrec = *buf || !feof(inf);
+	dprintf( ("readrec saw <%s>, returns %d\n", buf, isrec) );
+	return isrec;
```

Problem #1: Unlike testing with EOF, `*buf || !feof(inf)` is blind to stdio errors. This can cause an infinite loop in which each iteration fabricates an empty record. The following demonstration uses standard terminal access control policy to produce a persistent error condition. Note that the "i/o error" message does not come from readrec(). It's produced much later by closeall() at shutdown.

```
$ trap '' SIGTTIN && awk 'END {print NR}' &
[1] 33517
$ # After fg, type ^D
$ fg
trap '' SIGTTIN && awk 'END {print NR}'
13847376
awk: i/o error occurred on /dev/stdin
 input record number 13847376, file source line number 1
```

Each time awk tries to read the terminal from the background, while ignoring SIGTTIN, the read fails with EIO, getc returns EOF, the stream's end-of-file indicator remains clear, and `!feof` erroneously promotes the empty buffer to an empty record. So long as the error persists, the stream's position does not advance and end-of-file is never set.

Problem #2: When RS is a regex, `*buf || !feof(inf)` can't see an empty record's terminator at the end of a stream.

```
$ echo a | awk 1 RS='a\n'
$
```

That pipeline should have found one empty record and printed a blank line, but `*buf || !feof(inf)` considers reaching the end of the stream the conclusion of a fruitless search. That's only correct when the terminator is a single character, because a regex RS search can set the end-of-file marker even when it succeeds.

The Fix

`isrec` must be 0 **iff** no record is found. The correct definition of "no record" is a failure to find a record terminator and a failure to find any data (possibly from a final, unterminated record).
Conceptually, for any RS:

```
isrec = (noTERM && noDATA) ? 0 : 1
```

noDATA is an expression that's true if `buf` is empty, false otherwise.

When RS is null or a single character, noTERM is an expression that is true when the sought-after character is not found, false otherwise. Since the search for a single character can only end with that character or EOF, noTERM is `c == EOF`.

```
isrec = (c == EOF && rr == buf) ? 0 : 1
```

When RS is a regular expression, noTERM is an expression that is true if a match for RS is not found, false otherwise. This is simply the inverse of the result of the function that conducts the search, `!found`.

```
isrec = (found == 0 && *buf == '\0') ? 0 : 1
```
This commit is contained in:
parent
c0f4e97e45
commit
92f9e8a9be
3
lib.c
3
lib.c
@ -241,6 +241,7 @@ int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one rec
|
|||||||
}
|
}
|
||||||
if (found)
|
if (found)
|
||||||
setptr(patbeg, '\0');
|
setptr(patbeg, '\0');
|
||||||
|
isrec = (found == 0 && *buf == '\0') ? 0 : 1;
|
||||||
} else {
|
} else {
|
||||||
if ((sep = *rs) == 0) {
|
if ((sep = *rs) == 0) {
|
||||||
sep = '\n';
|
sep = '\n';
|
||||||
@ -270,10 +271,10 @@ int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one rec
|
|||||||
if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readrec 3"))
|
if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readrec 3"))
|
||||||
FATAL("input record `%.30s...' too long", buf);
|
FATAL("input record `%.30s...' too long", buf);
|
||||||
*rr = 0;
|
*rr = 0;
|
||||||
|
isrec = (c == EOF && rr == buf) ? 0 : 1;
|
||||||
}
|
}
|
||||||
*pbuf = buf;
|
*pbuf = buf;
|
||||||
*pbufsize = bufsize;
|
*pbufsize = bufsize;
|
||||||
isrec = *buf || !feof(inf);
|
|
||||||
DPRINTF("readrec saw <%s>, returns %d\n", buf, isrec);
|
DPRINTF("readrec saw <%s>, returns %d\n", buf, isrec);
|
||||||
return isrec;
|
return isrec;
|
||||||
}
|
}
|
||||||
|
@ -186,6 +186,13 @@ BEGIN { RS = ""
|
|||||||
}' >foo1
|
}' >foo1
|
||||||
$awk 'END {print NR}' foo1 | grep 4 >/dev/null || echo 'BAD: T.misc abcdef fails'
|
$awk 'END {print NR}' foo1 | grep 4 >/dev/null || echo 'BAD: T.misc abcdef fails'
|
||||||
|
|
||||||
|
# Test for RS regex matching an empty record at EOF
|
||||||
|
echo a | $awk 1 RS='a\n' > foo1
|
||||||
|
cat << \EOF > foo2
|
||||||
|
|
||||||
|
EOF
|
||||||
|
diff foo1 foo2 || echo 'BAD: T.misc RS regex matching an empty record at EOF fails'
|
||||||
|
|
||||||
# Test for RS regex being reapplied
|
# Test for RS regex being reapplied
|
||||||
echo aaa1a2a | $awk 1 RS='^a' >foo1
|
echo aaa1a2a | $awk 1 RS='^a' >foo1
|
||||||
cat << \EOF > foo2
|
cat << \EOF > foo2
|
||||||
|
Loading…
Reference in New Issue
Block a user