sbase/tr.c

304 lines
6.2 KiB
C

/* See LICENSE file for copyright and license details. */
#include <wctype.h>
#include <stdio.h>
#include <stdlib.h>
#include "utf.h"
#include "util.h"
/*
 * Command-line switches, set to 1 by the option parser in main():
 *   cflag: -c or -C was given
 *   dflag: -d was given
 *   sflag: -s was given
 */
static int cflag = 0;
static int dflag = 0;
static int sflag = 0;
/*
 * One element of a parsed set: the inclusive rune interval [start, end].
 * A single literal is stored with start == end.
 */
struct range {
/* first rune of the interval */
Rune start;
/* last rune of the interval (inclusive) */
Rune end;
/* multiplier applied to the interval width by rangelen(); 1 for literals
 * and ranges, the repeat count for a "[c*n]" element */
size_t quant;
};
/*
 * Lookup table from the name written inside "[:name:]" to the <wctype.h>
 * predicate that tests membership. makeset() scans it in order.
 */
static struct {
/* class name as it appears between the colons */
char *name;
/* predicate returning non-zero for runes belonging to the class */
int (*check)(wint_t);
} classes[] = {
{ "alnum", iswalnum },
{ "alpha", iswalpha },
{ "blank", iswblank },
{ "cntrl", iswcntrl },
{ "digit", iswdigit },
{ "graph", iswgraph },
{ "lower", iswlower },
{ "print", iswprint },
{ "punct", iswpunct },
{ "space", iswspace },
{ "upper", iswupper },
{ "xdigit", iswxdigit },
};
/*
 * Parsed form of the two set operands, filled in by makeset() from main().
 * For each operand either the range array (with its element count) or the
 * class predicate is populated: makeset() returns 0 ranges when it stores a
 * class predicate.
 */
static struct range *set1 = NULL;
static size_t set1ranges = 0;
static int (*set1check)(wint_t) = NULL;
static struct range *set2 = NULL;
static size_t set2ranges = 0;
static int (*set2check)(wint_t) = NULL;
/*
 * Number of positions a single range occupies inside its set: the width of
 * the inclusive interval [start, end] multiplied by the range's quantifier.
 */
static size_t
rangelen(struct range r)
{
	size_t width = r.end - r.start + 1;

	return width * r.quant;
}
/*
 * Total number of positions in a set, i.e. the sum of rangelen() over its
 * setranges elements. An empty set (setranges == 0) has length 0 and the
 * array pointer is not touched in that case.
 */
static size_t
setlen(struct range *set, size_t setranges)
{
	size_t total = 0;

	/* summation order is irrelevant, so walk the array from the back */
	while (setranges-- > 0)
		total += rangelen(set[setranges]);

	return total;
}
/*
 * Compare the n runes starting at r with the NUL-terminated byte string s.
 *
 * Returns 1 only when s is exactly n characters long and every rune equals
 * the corresponding character of s; returns 0 otherwise.
 *
 * Requiring the full length keeps a shortened name such as "a" from being
 * accepted as a match for "alnum", and stopping at the terminator of s
 * guarantees s is never read past its end, even if r contains a 0 rune.
 */
static int
rstrmatch(Rune *r, char *s, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++) {
		/* s is shorter than n, or the characters differ */
		if (s[i] == '\0' || r[i] != (Rune)(unsigned char)s[i])
			return 0;
	}

	/* s must end here too, otherwise r was only a prefix of s */
	return s[n] == '\0';
}
/*
 * Replace backslash escape sequences in the rune array r (len runes) in
 * place and return the new, possibly shorter, length.
 *
 * Recognised forms are \\ \a \b \f \n \r \t \v and an octal escape of one
 * to three digits. A backslash followed by a 0 rune, or by anything else
 * that is not an octal digit, is reported through eprintf().
 *
 * NOTE(review): the code reads r[pos + 1] and r[len], so it assumes the
 * array carries a terminating 0 rune at index len -- confirm against
 * chartorunearr(). It also appears to rely on eprintf() not returning.
 */
static size_t
resolveescapes(Rune *r, size_t len)
{
	size_t pos, skip, k, weight, val;

	for (pos = 0; pos < len; pos++) {
		if (r[pos] != '\\')
			continue;

		/* skip counts the runes consumed after the backslash */
		skip = 0;
		switch (r[pos + 1]) {
		case '\\': r[pos] = '\\'; skip++; break;
		case 'a':  r[pos] = '\a'; skip++; break;
		case 'b':  r[pos] = '\b'; skip++; break;
		case 'f':  r[pos] = '\f'; skip++; break;
		case 'n':  r[pos] = '\n'; skip++; break;
		case 'r':  r[pos] = '\r'; skip++; break;
		case 't':  r[pos] = '\t'; skip++; break;
		case 'v':  r[pos] = '\v'; skip++; break;
		case '\0':
			eprintf("tr: null escape sequence\n");
			/* fallthrough */
		default:
			/* "\O[OO]" octal escape: find the end of the digit run */
			k = pos + 1;
			while (k < pos + 1 + 3 && k < len && r[k] >= '0' && r[k] <= '7')
				k++;
			if (k == pos + 1)
				eprintf("tr: invalid escape sequence '\\%c'\n", r[pos + 1]);
			skip += k - pos - 1;

			/* accumulate the value from the least significant digit */
			val = 0;
			weight = 1;
			while (--k > pos) {
				val += (r[k] - '0') * weight;
				weight *= 8;
			}
			r[pos] = val;
		}

		/* close the gap left by the consumed runes, including r[len] */
		for (k = pos + 1; k <= len - skip; k++)
			r[k] = r[k + skip];
		len -= skip;
	}

	return len;
}
/*
 * Parse the set operand str.
 *
 * The operand is converted to runes, escape sequences are resolved, and the
 * result is scanned for:
 *   [=EQUIV=]  skipped entirely
 *   [:CLASS:]  the matching predicate from classes[] is stored in *check
 *   [c*n]      c repeated n times (n with a leading 0 uses base 8; a
 *              missing or zero n means "as long as set1"); only recognised
 *              once set1 already holds ranges, i.e. while parsing set2
 *   a-z        inclusive range
 *   anything else as a literal rune
 *
 * *set receives a freshly allocated array of ranges.
 *
 * Returns the number of ranges stored in *set. When a character class is
 * found the function returns 0 immediately after setting *check; ranges
 * collected before that point are not reported.
 */
static size_t
makeset(char *str, struct range **set, int (**check)(wint_t))
{
	Rune *rstr;
	size_t len, i, j, m, n;
	size_t q, setranges = 0;
	int factor, base;

	/* rstr defines at most len ranges */
	len = chartorunearr(str, &rstr);
	len = resolveescapes(rstr, len);
	*set = emalloc(len * sizeof(**set));

	for (i = 0; i < len; i++) {
		if (rstr[i] == '[') {
			j = i;
nextbrack:
			/*
			 * Look for the next ']' at or after j. If there is none
			 * the '[' cannot open a bracket expression and is taken
			 * literally. Testing m against len (rather than j against
			 * len) also covers j having stepped past the end.
			 */
			for (m = j; m < len && rstr[m] != ']'; m++)
				;
			if (m == len)
				goto literal;
			j = m;

			/* CLASSES [=EQUIV=] (skip) */
			if (j - i > 3 && rstr[i + 1] == '=' && rstr[j - 1] == '=') {
				i = j;
				continue;
			}

			/* CLASSES [:CLASS:] */
			if (j - i > 3 && rstr[i + 1] == ':' && rstr[j - 1] == ':') {
				for (n = 0; n < LEN(classes); n++) {
					if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) {
						*check = classes[n].check;
						free(rstr);
						return 0;
					}
				}
				eprintf("Invalid character class.\n");
			}

			/* REPEAT [_*n] (only allowed in set2) */
			if (j - i > 2 && rstr[i + 2] == '*' && set1ranges > 0) {
				/* check if right side of '*' is a number */
				q = 0;
				factor = 1;
				base = (rstr[i + 3] == '0') ? 8 : 10;
				for (n = j - 1; n > i + 2; n--) {
					if (rstr[n] < '0' || rstr[n] > '9') {
						/* n is >= 2 otherwise, so 0 marks "not a number" */
						n = 0;
						break;
					}
					q += (rstr[n] - '0') * factor;
					factor *= base;
				}
				if (n == 0) {
					/* not a repeat count; try the next ']' */
					j = m + 1;
					goto nextbrack;
				}
				(*set)[setranges].start = rstr[i + 1];
				(*set)[setranges].end = rstr[i + 1];
				(*set)[setranges].quant = q ? q : setlen(set1, set1ranges);
				setranges++;
				i = j;
				continue;
			}

			/* nothing recognised up to this ']'; try the next one */
			j = m + 1;
			goto nextbrack;
		}
literal:
		/* RANGES [_-__-_], _-__-_ */
		/* LITERALS _______ */
		(*set)[setranges].start = rstr[i];
		/* "i + 2 < len" avoids the unsigned wrap of "len - 2" */
		if (i + 2 < len && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i])
			i += 2;
		(*set)[setranges].end = rstr[i];
		(*set)[setranges].quant = 1;
		setranges++;
	}

	free(rstr);
	return setranges;
}
/* Report the command-line synopsis through eprintf(), using argv0 as the program name. */
static void
usage(void)
{
eprintf("usage: %s [-cCds] set1 [set2]\n", argv0);
}
/*
 * tr: copy runes from stdin to stdout, translating, deleting or squeezing
 * those selected by set1.
 *
 * Options: -c/-C set cflag, -d sets dflag, -s sets sflag. One or two set
 * operands are accepted; with a single operand exactly one of -d and -s
 * must be given.
 *
 * Per input rune:
 *   - member of set1 (range or class):
 *       -d without -c : dropped
 *       -s            : dropped if equal to the previously written rune,
 *                       otherwise written unchanged
 *       -d with -c    : written unchanged (only non-members are deleted)
 *       otherwise     : mapped to the rune at the same offset in set2; an
 *                       offset past the end of set2 maps to set2's last
 *                       rune; upper/lower classes map via towupper/towlower
 *   - not a member:
 *       -d with -c    : dropped
 *       -d with -s    : dropped if equal to the previously written rune
 *       otherwise     : written unchanged
 *
 * cflag is only consulted together with dflag.
 *
 * Returns 0 once readrune() reports no further input.
 */
int
main(int argc, char *argv[])
{
	Rune r = 0, lastrune = 0;
	size_t off1, off2, i, m;

	ARGBEGIN {
	case 'c':
	case 'C':
		cflag = 1;
		break;
	case 'd':
		dflag = 1;
		break;
	case 's':
		sflag = 1;
		break;
	default:
		usage();
	} ARGEND;

	if (argc < 1 || argc > 2 || (argc == 1 && dflag == sflag))
		usage();

	set1ranges = makeset(argv[0], &set1, &set1check);
	if (argc == 2)
		set2ranges = makeset(argv[1], &set2, &set2check);

	if (dflag == sflag && !set2ranges && !set2check)
		eprintf("set2 must be non-empty.\n");
	if (set2check && set2check != iswlower && set2check != iswupper)
		eprintf("set2 can only be the 'lower' or 'upper' class.\n");

read:
	if (!readrune("<stdin>", stdin, &r))
		return 0;

	off1 = off2 = 0;
	for (i = 0; i < set1ranges; i++) {
		if (set1[i].start <= r && r <= set1[i].end) {
			if (dflag && !cflag)
				goto read;
			if (sflag) {
				if (r == lastrune)
					goto read;
				else
					goto write;
			}
			/*
			 * Only -d together with -c reaches this point with dflag
			 * set: members of set1 are the runes to keep. There may be
			 * no set2 at all here, so it must not be indexed.
			 */
			if (dflag)
				goto write;
			/*
			 * set2 was given as a character class and therefore has
			 * no ranges to index; convert case instead.
			 */
			if (set2ranges == 0) {
				if (set2check == iswupper)
					r = towupper((wint_t)r);
				else if (set2check == iswlower)
					r = towlower((wint_t)r);
				goto write;
			}

			/* off1: offset of r from the beginning of set1 */
			for (m = 0; m < i; m++)
				off1 += rangelen(set1[m]);
			off1 += r - set1[m].start;

			/* set2 is shorter than set1: pad with its last rune */
			if (off1 > setlen(set2, set2ranges) - 1) {
				r = set2[set2ranges - 1].end;
				goto write;
			}

			/* find the set2 range covering offset off1 */
			for (m = 0; m < set2ranges; m++) {
				if (off2 + rangelen(set2[m]) > off1) {
					m++;
					break;
				}
				off2 += rangelen(set2[m]);
			}
			m--;
			r = set2[m].start + (off1 - off2) / set2[m].quant;
			goto write;
		}
	}

	if (set1check && set1check((wint_t)r)) {
		if (dflag && !cflag)
			goto read;
		if (sflag) {
			if (r == lastrune)
				goto read;
			else
				goto write;
		}
		/* -d with -c: members of the class are kept unchanged */
		if (dflag)
			goto write;
		if (set1check == iswupper && set2check == iswlower)
			r = towlower((wint_t)r);
		else if (set1check == iswlower && set2check == iswupper)
			r = towupper((wint_t)r);
		else if (set2ranges > 0)
			r = set2[set2ranges - 1].end;
		else
			eprintf("Misaligned character classes.\n");
	}

	if (dflag && cflag)
		goto read;
	if (dflag && sflag && r == lastrune)
		goto read;

write:
	lastrune = r;
	writerune("<stdout>", stdout, &r);
	goto read;
}