bc4c293fe5
If you look at GNU coreutils, they do not support the mappings $ echo "1234abc" | tr "[:alnum:]" "[:upper:]" $ echo "ABCabc" | tr -c "[:upper:]" "[l*]" to give only a few examples. This commit broadens the scope of tr(1) as far as humanly possible to map between classes and non-classes, making tr a usable tool that actually fulfills user expectations. POSIX really is of no help here, as it still more or less assumes the fixed ASCII table instead of complex Unicode code points or even grapheme clusters.
288 lines
5.8 KiB
C
288 lines
5.8 KiB
C
/* See LICENSE file for copyright and license details. */
|
|
#include <stdlib.h>
|
|
|
|
#include "utf.h"
|
|
#include "util.h"
|
|
|
|
/*
 * Option switches. They start out as 0 (static storage) and main() sets
 * each one to 1 when the corresponding option letter is seen.
 */
static int cflag; /* -c or -C was given */
static int dflag; /* -d was given */
static int sflag; /* -s was given */
|
|
|
|
struct range {
|
|
Rune start;
|
|
Rune end;
|
|
size_t quant;
|
|
};
|
|
|
|
static struct {
|
|
char *name;
|
|
int (*check)(Rune);
|
|
} classes[] = {
|
|
{ "alnum", isalnumrune },
|
|
{ "alpha", isalpharune },
|
|
{ "blank", isblankrune },
|
|
{ "cntrl", iscntrlrune },
|
|
{ "digit", isdigitrune },
|
|
{ "graph", isgraphrune },
|
|
{ "lower", islowerrune },
|
|
{ "print", isprintrune },
|
|
{ "punct", ispunctrune },
|
|
{ "space", isspacerune },
|
|
{ "upper", isupperrune },
|
|
{ "xdigit", isxdigitrune },
|
|
};
|
|
|
|
/*
 * Parsed operands. For each set, makeset() either fills the range array
 * (and main() stores the returned count in the matching *ranges variable)
 * or stores a class predicate in the matching *check variable.
 * Static storage leaves all of them NULL / 0 until then.
 */
static struct range *set1;
static size_t set1ranges;
static int (*set1check)(Rune);

static struct range *set2;
static size_t set2ranges;
static int (*set2check)(Rune);
|
|
|
|
static size_t
|
|
rangelen(struct range r)
|
|
{
|
|
return (r.end - r.start + 1) * r.quant;
|
|
}
|
|
|
|
static size_t
|
|
setlen(struct range *set, size_t setranges)
|
|
{
|
|
size_t len = 0, i;
|
|
|
|
for (i = 0; i < setranges; i++)
|
|
len += rangelen(set[i]);
|
|
|
|
return len;
|
|
}
|
|
|
|
static int
|
|
rstrmatch(Rune *r, char *s, size_t n)
|
|
{
|
|
size_t i;
|
|
|
|
for (i = 0; i < n; i++)
|
|
if (r[i] != s[i])
|
|
return 0;
|
|
return 1;
|
|
}
|
|
|
|
static size_t
|
|
makeset(char *str, struct range **set, int (**check)(Rune))
|
|
{
|
|
Rune *rstr;
|
|
size_t len, i, j, m, n;
|
|
size_t q, setranges = 0;
|
|
int factor, base;
|
|
|
|
/* rstr defines at most len ranges */
|
|
unescape(str);
|
|
rstr = ereallocarray(NULL, utflen(str) + 1, sizeof(*rstr));
|
|
len = utftorunestr(str, rstr);
|
|
*set = ereallocarray(NULL, len, sizeof(**set));
|
|
|
|
for (i = 0; i < len; i++) {
|
|
if (rstr[i] == '[') {
|
|
j = i;
|
|
nextbrack:
|
|
if (j == len)
|
|
goto literal;
|
|
for (m = j; m < len; m++)
|
|
if (rstr[m] == ']') {
|
|
j = m;
|
|
break;
|
|
}
|
|
if (j == i)
|
|
goto literal;
|
|
|
|
/* CLASSES [=EQUIV=] (skip) */
|
|
if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') {
|
|
if (j - i != 4)
|
|
goto literal;
|
|
(*set)[setranges].start = rstr[i + 2];
|
|
(*set)[setranges].end = rstr[i + 2];
|
|
(*set)[setranges].quant = 1;
|
|
setranges++;
|
|
i = j;
|
|
continue;
|
|
}
|
|
|
|
/* CLASSES [:CLASS:] */
|
|
if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') {
|
|
for (n = 0; n < LEN(classes); n++) {
|
|
if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) {
|
|
*check = classes[n].check;
|
|
return 0;
|
|
}
|
|
}
|
|
eprintf("Invalid character class.\n");
|
|
}
|
|
|
|
/* REPEAT [_*n] (only allowed in set2) */
|
|
if (j - i > 2 && rstr[i + 2] == '*') {
|
|
/* check if right side of '*' is a number */
|
|
q = 0;
|
|
factor = 1;
|
|
base = (rstr[i + 3] == '0') ? 8 : 10;
|
|
for (n = j - 1; n > i + 2; n--) {
|
|
if (rstr[n] < '0' || rstr[n] > '9') {
|
|
n = 0;
|
|
break;
|
|
}
|
|
q += (rstr[n] - '0') * factor;
|
|
factor *= base;
|
|
}
|
|
if (n == 0) {
|
|
j = m + 1;
|
|
goto nextbrack;
|
|
}
|
|
(*set)[setranges].start = rstr[i + 1];
|
|
(*set)[setranges].end = rstr[i + 1];
|
|
(*set)[setranges].quant = q ? q : setlen(set1, MAX(set1ranges, 1));
|
|
setranges++;
|
|
i = j;
|
|
continue;
|
|
}
|
|
|
|
j = m + 1;
|
|
goto nextbrack;
|
|
}
|
|
literal:
|
|
/* RANGES [_-__-_], _-__-_ */
|
|
/* LITERALS _______ */
|
|
(*set)[setranges].start = rstr[i];
|
|
|
|
if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i])
|
|
i += 2;
|
|
(*set)[setranges].end = rstr[i];
|
|
(*set)[setranges].quant = 1;
|
|
setranges++;
|
|
}
|
|
|
|
free(rstr);
|
|
return setranges;
|
|
}
|
|
|
|
static void
|
|
usage(void)
|
|
{
|
|
eprintf("usage: %s [-cCds] set1 [set2]\n", argv0);
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
Rune r = 0, lastrune = 0;
|
|
size_t off1, off2, i, m;
|
|
int ret = 0;
|
|
|
|
ARGBEGIN {
|
|
case 'c':
|
|
case 'C':
|
|
cflag = 1;
|
|
break;
|
|
case 'd':
|
|
dflag = 1;
|
|
break;
|
|
case 's':
|
|
sflag = 1;
|
|
break;
|
|
default:
|
|
usage();
|
|
} ARGEND
|
|
|
|
if (!argc || argc > 2 || (argc == 1 && dflag == sflag))
|
|
usage();
|
|
set1ranges = makeset(argv[0], &set1, &set1check);
|
|
if (argc == 2)
|
|
set2ranges = makeset(argv[1], &set2, &set2check);
|
|
|
|
if (!dflag) {
|
|
/* sanity checks as we are translating */
|
|
if (!set2ranges && !set2check)
|
|
eprintf("cannot map to an empty set.\n");
|
|
if (set2check && set2check != islowerrune &&
|
|
set2check != isupperrune) {
|
|
eprintf("can only map to 'lower' and 'upper' class.\n");
|
|
}
|
|
}
|
|
read:
|
|
if (!efgetrune(&r, stdin, "<stdin>")) {
|
|
ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");
|
|
return ret;
|
|
}
|
|
for (i = 0, off1 = 0; i < set1ranges; i++, off1 += rangelen(set1[i])) {
|
|
if (set1[i].start <= r && r <= set1[i].end) {
|
|
if (dflag) {
|
|
if (cflag)
|
|
continue;
|
|
else
|
|
goto read;
|
|
}
|
|
if (cflag)
|
|
goto write;
|
|
|
|
/* map r to set2 */
|
|
if (set2check) {
|
|
if (set2check == islowerrune)
|
|
r = tolowerrune(r);
|
|
else
|
|
r = toupperrune(r);
|
|
} else {
|
|
off1 += r - set1[i].start;
|
|
if (off1 > setlen(set2, set2ranges) - 1) {
|
|
r = set2[set2ranges - 1].end;
|
|
goto write;
|
|
}
|
|
for (m = 0, off2 = 0; m < set2ranges; m++) {
|
|
if (off2 + rangelen(set2[m]) > off1) {
|
|
m++;
|
|
break;
|
|
}
|
|
off2 += rangelen(set2[m]);
|
|
}
|
|
m--;
|
|
r = set2[m].start + (off1 - off2) / set2[m].quant;
|
|
}
|
|
goto write;
|
|
}
|
|
}
|
|
if (set1check && set1check(r)) {
|
|
if (dflag && !cflag)
|
|
goto read;
|
|
if (set2check) {
|
|
if (set2check == islowerrune)
|
|
r = tolowerrune(r);
|
|
else
|
|
r = toupperrune(r);
|
|
} else {
|
|
r = set2[set2ranges - 1].end;
|
|
}
|
|
}
|
|
if (!dflag && cflag) {
|
|
if (set2check) {
|
|
if (set2check == islowerrune)
|
|
r = tolowerrune(r);
|
|
else
|
|
r = toupperrune(r);
|
|
} else {
|
|
r = set2[set2ranges - 1].end;
|
|
}
|
|
goto write;
|
|
}
|
|
if (dflag && cflag)
|
|
goto read;
|
|
write:
|
|
if (sflag && r == lastrune) {
|
|
if (set2check && set2check(r))
|
|
goto read;
|
|
for (i = 0; i < set2ranges; i++) {
|
|
if (set2[i].start <= r && r <= set2[i].end)
|
|
goto read;
|
|
}
|
|
}
|
|
lastrune = r;
|
|
efputrune(&r, stdout, "<stdout>");
|
|
goto read;
|
|
}
|