7d3e9c6e88
This is one aspect which I think has blown up the complexity of many tr-implementations around today. Instead of complicating the set-theory-based parser itself (he should still be relying on one rune per char, not multirunes), I added a preprocessor, which basically scans the code for upcoming '\'s, reads what he finds, substitutes the real character onto '\'s index and shifts the entire following array so there are no "holes". What is left to reflect on is what to do with octal sequences. I have a local implementation here, which works fine, but imho, given tr is already so focused on UTF-8, we might as well ignore POSIX at this point and rather implement the unicode UTF-8 code points, which are way more contemporary and future-proof. Reading in \uC3A4 as a an array of 0xC3 and 0xA4 is not the issue, but I'm still struggling to find a way to turn it into a well-formed byte sequence. Hit me with a mail if you have a simple solution for that.
289 lines
5.7 KiB
C
289 lines
5.7 KiB
C
#include <wctype.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "utf.h"
|
|
#include "util.h"
|
|
|
|
static int cflag = 0;
|
|
static int dflag = 0;
|
|
static int sflag = 0;
|
|
|
|
struct range {
|
|
Rune start;
|
|
Rune end;
|
|
size_t quant;
|
|
};
|
|
|
|
static struct {
|
|
char *name;
|
|
int (*check)(wint_t);
|
|
} classes[] = {
|
|
{ "alnum", iswalnum },
|
|
{ "alpha", iswalpha },
|
|
{ "blank", iswblank },
|
|
{ "cntrl", iswcntrl },
|
|
{ "digit", iswdigit },
|
|
{ "graph", iswgraph },
|
|
{ "lower", iswlower },
|
|
{ "print", iswprint },
|
|
{ "punct", iswpunct },
|
|
{ "space", iswspace },
|
|
{ "upper", iswupper },
|
|
{ "xdigit", iswxdigit },
|
|
};
|
|
|
|
static struct range *set1 = NULL;
|
|
static size_t set1ranges = 0;
|
|
static int (*set1check)(wint_t) = NULL;
|
|
static struct range *set2 = NULL;
|
|
static size_t set2ranges = 0;
|
|
static int (*set2check)(wint_t) = NULL;
|
|
|
|
|
|
static size_t
|
|
rangelen(struct range r)
|
|
{
|
|
return (r.end - r.start + 1) * r.quant;
|
|
}
|
|
|
|
static size_t
|
|
setlen(struct range *set, size_t setranges)
|
|
{
|
|
size_t len = 0, i;
|
|
|
|
for (i = 0; i < setranges; i++)
|
|
len += rangelen(set[i]);
|
|
|
|
return len;
|
|
}
|
|
|
|
static int
|
|
rstrmatch(Rune *r, char *s, size_t n)
|
|
{
|
|
size_t i;
|
|
|
|
for (i = 0; i < n; i++)
|
|
if (r[i] != s[i])
|
|
return 0;
|
|
return 1;
|
|
}
|
|
|
|
static size_t
|
|
resolveescapes(Rune *r, size_t len)
|
|
{
|
|
size_t i, off, m;
|
|
|
|
for (i = 0; i < len - 1; i++) {
|
|
if (r[i] != '\\')
|
|
continue;
|
|
off = 0;
|
|
|
|
switch (r[i + 1]) {
|
|
case '\\': r[i] = '\\'; off++; break;
|
|
case 'a': r[i] = '\a'; off++; break;
|
|
case 'b': r[i] = '\b'; off++; break;
|
|
case 'f': r[i] = '\f'; off++; break;
|
|
case 'n': r[i] = '\n'; off++; break;
|
|
case 'r': r[i] = '\r'; off++; break;
|
|
case 't': r[i] = '\t'; off++; break;
|
|
case 'v': r[i] = '\v'; off++; break;
|
|
default: continue;
|
|
}
|
|
|
|
for (m = i + 1; m <= len - off; m++)
|
|
r[m] = r[m + off];
|
|
len -= off;
|
|
}
|
|
|
|
return len;
|
|
}
|
|
|
|
static size_t
|
|
makeset(char *str, struct range **set, int (**check)(wint_t))
|
|
{
|
|
Rune *rstr;
|
|
size_t len, i, j, m, n;
|
|
size_t q, setranges = 0;
|
|
int factor, base;
|
|
|
|
/* rstr defines at most len ranges */
|
|
len = chartorunearr(str, &rstr);
|
|
len = resolveescapes(rstr, len);
|
|
*set = emalloc(len * sizeof(**set));
|
|
|
|
for (i = 0; i < len; i++) {
|
|
if (rstr[i] == '[') {
|
|
j = i;
|
|
nextbrack:
|
|
if (j == len)
|
|
goto literal;
|
|
for (m = j; m < len; m++)
|
|
if (rstr[m] == ']') {
|
|
j = m;
|
|
break;
|
|
}
|
|
if (j == i)
|
|
goto literal;
|
|
|
|
/* CLASSES [=EQUIV=] (skip) */
|
|
if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') {
|
|
i = j;
|
|
continue;
|
|
}
|
|
|
|
/* CLASSES [:CLASS:] */
|
|
if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') {
|
|
for (n = 0; n < LEN(classes); n++) {
|
|
if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) {
|
|
*check = classes[n].check;
|
|
return 0;
|
|
}
|
|
}
|
|
eprintf("Invalid character class.\n");
|
|
}
|
|
|
|
/* REPEAT [_*n] (only allowed in set2) */
|
|
if (j - i > 2 && rstr[i + 2] == '*' && set1ranges > 0) {
|
|
/* check if right side of '*' is a number */
|
|
q = 0;
|
|
factor = 1;
|
|
base = (rstr[i + 3] == '0') ? 8 : 10;
|
|
for (n = j - 1; n > i + 2; n--) {
|
|
if (rstr[n] < '0' && rstr[n] > '9') {
|
|
n = 0;
|
|
break;
|
|
}
|
|
q += (rstr[n] - '0') * factor;
|
|
factor *= base;
|
|
}
|
|
|
|
if (n == 0) {
|
|
j = m + 1;
|
|
goto nextbrack;
|
|
}
|
|
(*set)[setranges].start = rstr[i + 1];
|
|
(*set)[setranges].end = rstr[i + 1];
|
|
(*set)[setranges].quant = q ? q : setlen(set1, set1ranges);
|
|
setranges++;
|
|
i = j;
|
|
continue;
|
|
}
|
|
|
|
j = m + 1;
|
|
goto nextbrack;
|
|
}
|
|
literal:
|
|
/* RANGES [_-__-_], _-__-_ */
|
|
/* LITERALS _______ */
|
|
(*set)[setranges].start = rstr[i];
|
|
|
|
if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i])
|
|
i += 2;
|
|
(*set)[setranges].end = rstr[i];
|
|
(*set)[setranges].quant = 1;
|
|
setranges++;
|
|
}
|
|
|
|
free(rstr);
|
|
return setranges;
|
|
}
|
|
|
|
static void
|
|
usage(void)
|
|
{
|
|
eprintf("usage: %s [-cCds] set1 [set2]\n", argv0);
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
Rune r = 0, lastrune = 0;
|
|
size_t off1, off2, i, m;
|
|
|
|
ARGBEGIN {
|
|
case 'c':
|
|
case 'C':
|
|
cflag = 1;
|
|
break;
|
|
case 'd':
|
|
dflag = 1;
|
|
break;
|
|
case 's':
|
|
sflag = 1;
|
|
break;
|
|
default:
|
|
usage();
|
|
} ARGEND;
|
|
|
|
if (argc < 1 || argc > 2 || (argc == 1 && dflag == sflag))
|
|
usage();
|
|
set1ranges = makeset(argv[0], &set1, &set1check);
|
|
if (argc == 2)
|
|
set2ranges = makeset(argv[1], &set2, &set2check);
|
|
if (dflag == sflag && !set2ranges && !set2check)
|
|
eprintf("set2 must be non-empty.\n");
|
|
if (set2check && set2check != iswlower && set2check != iswupper)
|
|
eprintf("set2 can only be the 'lower' or 'upper' class.\n");
|
|
read:
|
|
if (!readrune("<stdin>", stdin, &r))
|
|
return 0;
|
|
off1 = off2 = 0;
|
|
for (i = 0; i < set1ranges; i++) {
|
|
if (set1[i].start <= r && r <= set1[i].end) {
|
|
if (dflag && !cflag)
|
|
goto read;
|
|
if (sflag) {
|
|
if (r == lastrune)
|
|
goto read;
|
|
else
|
|
goto write;
|
|
}
|
|
for (m = 0; m < i; m++)
|
|
off1 += rangelen(set1[m]);
|
|
off1 += r - set1[m].start;
|
|
if (off1 > setlen(set2, set2ranges) - 1) {
|
|
r = set2[set2ranges - 1].end;
|
|
goto write;
|
|
}
|
|
for (m = 0; m < set2ranges; m++) {
|
|
if (off2 + rangelen(set2[m]) > off1) {
|
|
m++;
|
|
break;
|
|
}
|
|
off2 += rangelen(set2[m]);
|
|
}
|
|
m--;
|
|
r = set2[m].start + (off1 - off2) / set2[m].quant;
|
|
|
|
goto write;
|
|
}
|
|
}
|
|
if (set1check && set1check((wint_t)r)) {
|
|
if (dflag && !cflag)
|
|
goto read;
|
|
if (sflag) {
|
|
if (r == lastrune)
|
|
goto read;
|
|
else
|
|
goto write;
|
|
}
|
|
if (set1check == iswupper && set2check == iswlower)
|
|
r = towlower((wint_t)r);
|
|
else if (set1check == iswlower && set2check == iswupper)
|
|
r = towupper((wint_t)r);
|
|
else if (set2ranges > 0)
|
|
r = set2[set2ranges - 1].end;
|
|
else
|
|
eprintf("Misaligned character classes.\n");
|
|
}
|
|
if (dflag && cflag)
|
|
goto read;
|
|
if (dflag && sflag && r == lastrune)
|
|
goto read;
|
|
write:
|
|
lastrune = r;
|
|
writerune("<stdout>", stdout, &r);
|
|
goto read;
|
|
}
|