fb11173926
See his description below. Thanks Michael!
---
A bug was introduced in commit bc4c293fe5, causing the range length for the
next set to be used instead of the first one. This causes issues when choosing
the replacement rune when the ranges are of different lengths.
Current behavior:
$ echo 1234 | tr 'a-f1-4' '1-6a-d'
56ab
Correct behavior:
$ echo 1234 | tr 'a-f1-4' '1-6a-d'
abcd
This also fixes range expressions in the form [a-z], which get encoded as three
ranges '[', 'a'..'z', ']', causing all a-z characters to get mapped to ']'. This
form is occasionally used in shell scripts, including the syscalltbl.sh script
used to build linux.
---
303 lines
6.1 KiB
C
303 lines
6.1 KiB
C
/* See LICENSE file for copyright and license details. */
|
|
#include <stdlib.h>
|
|
|
|
#include "utf.h"
|
|
#include "util.h"
|
|
|
|
/*
 * Command-line switches; main() sets them to 1 for -c/-C, -d and -s
 * respectively. Static storage starts them out at 0.
 */
static int cflag, dflag, sflag;
|
|
|
|
struct range {
|
|
Rune start;
|
|
Rune end;
|
|
size_t quant;
|
|
};
|
|
|
|
static struct {
|
|
char *name;
|
|
int (*check)(Rune);
|
|
} classes[] = {
|
|
{ "alnum", isalnumrune },
|
|
{ "alpha", isalpharune },
|
|
{ "blank", isblankrune },
|
|
{ "cntrl", iscntrlrune },
|
|
{ "digit", isdigitrune },
|
|
{ "graph", isgraphrune },
|
|
{ "lower", islowerrune },
|
|
{ "print", isprintrune },
|
|
{ "punct", ispunctrune },
|
|
{ "space", isspacerune },
|
|
{ "upper", isupperrune },
|
|
{ "xdigit", isxdigitrune },
|
|
};
|
|
|
|
/*
 * Parsed operands. main() fills the range array and range count of each
 * set from makeset(); makeset() stores a predicate in the check pointer
 * instead when the operand is a [:class:] expression. Static storage
 * leaves all of them NULL / 0 until then.
 */
static struct range *set1;
static size_t        set1ranges;
static int         (*set1check)(Rune);

static struct range *set2;
static size_t        set2ranges;
static int         (*set2check)(Rune);
|
|
|
|
static size_t
|
|
rangelen(struct range r)
|
|
{
|
|
return (r.end - r.start + 1) * r.quant;
|
|
}
|
|
|
|
static size_t
|
|
setlen(struct range *set, size_t setranges)
|
|
{
|
|
size_t len = 0, i;
|
|
|
|
for (i = 0; i < setranges; i++)
|
|
len += rangelen(set[i]);
|
|
|
|
return len;
|
|
}
|
|
|
|
static int
|
|
rstrmatch(Rune *r, char *s, size_t n)
|
|
{
|
|
size_t i;
|
|
|
|
for (i = 0; i < n; i++)
|
|
if (r[i] != s[i])
|
|
return 0;
|
|
return 1;
|
|
}
|
|
|
|
static size_t
|
|
makeset(char *str, struct range **set, int (**check)(Rune))
|
|
{
|
|
Rune *rstr;
|
|
size_t len, i, j, m, n;
|
|
size_t q, setranges = 0;
|
|
int factor, base;
|
|
|
|
/* rstr defines at most len ranges */
|
|
unescape(str);
|
|
rstr = ereallocarray(NULL, utflen(str) + 1, sizeof(*rstr));
|
|
len = utftorunestr(str, rstr);
|
|
*set = ereallocarray(NULL, len, sizeof(**set));
|
|
|
|
for (i = 0; i < len; i++) {
|
|
if (rstr[i] == '[') {
|
|
j = i;
|
|
nextbrack:
|
|
if (j == len)
|
|
goto literal;
|
|
for (m = j; m < len; m++)
|
|
if (rstr[m] == ']') {
|
|
j = m;
|
|
break;
|
|
}
|
|
if (j == i)
|
|
goto literal;
|
|
|
|
/* CLASSES [=EQUIV=] (skip) */
|
|
if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') {
|
|
if (j - i != 4)
|
|
goto literal;
|
|
(*set)[setranges].start = rstr[i + 2];
|
|
(*set)[setranges].end = rstr[i + 2];
|
|
(*set)[setranges].quant = 1;
|
|
setranges++;
|
|
i = j;
|
|
continue;
|
|
}
|
|
|
|
/* CLASSES [:CLASS:] */
|
|
if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') {
|
|
for (n = 0; n < LEN(classes); n++) {
|
|
if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) {
|
|
*check = classes[n].check;
|
|
return 0;
|
|
}
|
|
}
|
|
eprintf("Invalid character class.\n");
|
|
}
|
|
|
|
/* REPEAT [_*n] (only allowed in set2) */
|
|
if (j - i > 2 && rstr[i + 2] == '*') {
|
|
/* check if right side of '*' is a number */
|
|
q = 0;
|
|
factor = 1;
|
|
base = (rstr[i + 3] == '0') ? 8 : 10;
|
|
for (n = j - 1; n > i + 2; n--) {
|
|
if (rstr[n] < '0' || rstr[n] > '9') {
|
|
n = 0;
|
|
break;
|
|
}
|
|
q += (rstr[n] - '0') * factor;
|
|
factor *= base;
|
|
}
|
|
if (n == 0) {
|
|
j = m + 1;
|
|
goto nextbrack;
|
|
}
|
|
(*set)[setranges].start = rstr[i + 1];
|
|
(*set)[setranges].end = rstr[i + 1];
|
|
(*set)[setranges].quant = q ? q : setlen(set1, MAX(set1ranges, 1));
|
|
setranges++;
|
|
i = j;
|
|
continue;
|
|
}
|
|
|
|
j = m + 1;
|
|
goto nextbrack;
|
|
}
|
|
literal:
|
|
/* RANGES [_-__-_], _-__-_ */
|
|
/* LITERALS _______ */
|
|
(*set)[setranges].start = rstr[i];
|
|
|
|
if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i])
|
|
i += 2;
|
|
(*set)[setranges].end = rstr[i];
|
|
(*set)[setranges].quant = 1;
|
|
setranges++;
|
|
}
|
|
|
|
free(rstr);
|
|
return setranges;
|
|
}
|
|
|
|
static void
|
|
usage(void)
|
|
{
|
|
eprintf("usage: %s [-cCds] set1 [set2]\n", argv0);
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
Rune r, lastrune = 0;
|
|
size_t off1, off2, i, m;
|
|
int ret = 0;
|
|
|
|
ARGBEGIN {
|
|
case 'c':
|
|
case 'C':
|
|
cflag = 1;
|
|
break;
|
|
case 'd':
|
|
dflag = 1;
|
|
break;
|
|
case 's':
|
|
sflag = 1;
|
|
break;
|
|
default:
|
|
usage();
|
|
} ARGEND
|
|
|
|
if (!argc || argc > 2 || (argc == 1 && dflag == sflag))
|
|
usage();
|
|
set1ranges = makeset(argv[0], &set1, &set1check);
|
|
if (argc == 2)
|
|
set2ranges = makeset(argv[1], &set2, &set2check);
|
|
|
|
if (!dflag || (argc == 2 && sflag)) {
|
|
/* sanity checks as we are translating */
|
|
if (!sflag && !set2ranges && !set2check)
|
|
eprintf("cannot map to an empty set.\n");
|
|
if (set2check && set2check != islowerrune &&
|
|
set2check != isupperrune) {
|
|
eprintf("can only map to 'lower' and 'upper' class.\n");
|
|
}
|
|
}
|
|
read:
|
|
if (!efgetrune(&r, stdin, "<stdin>")) {
|
|
ret |= fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>");
|
|
return ret;
|
|
}
|
|
if (argc == 1 && sflag)
|
|
goto write;
|
|
for (i = 0, off1 = 0; i < set1ranges; off1 += rangelen(set1[i]), i++) {
|
|
if (set1[i].start <= r && r <= set1[i].end) {
|
|
if (dflag) {
|
|
if (cflag)
|
|
goto write;
|
|
else
|
|
goto read;
|
|
}
|
|
if (cflag)
|
|
goto write;
|
|
|
|
/* map r to set2 */
|
|
if (set2check) {
|
|
if (set2check == islowerrune)
|
|
r = tolowerrune(r);
|
|
else
|
|
r = toupperrune(r);
|
|
} else {
|
|
off1 += r - set1[i].start;
|
|
if (off1 > setlen(set2, set2ranges) - 1) {
|
|
r = set2[set2ranges - 1].end;
|
|
goto write;
|
|
}
|
|
for (m = 0, off2 = 0; m < set2ranges; m++) {
|
|
if (off2 + rangelen(set2[m]) > off1) {
|
|
m++;
|
|
break;
|
|
}
|
|
off2 += rangelen(set2[m]);
|
|
}
|
|
m--;
|
|
r = set2[m].start + (off1 - off2) / set2[m].quant;
|
|
}
|
|
goto write;
|
|
}
|
|
}
|
|
if (set1check && set1check(r)) {
|
|
if (dflag) {
|
|
if (cflag)
|
|
goto write;
|
|
else
|
|
goto read;
|
|
}
|
|
if (set2check) {
|
|
if (set2check == islowerrune)
|
|
r = tolowerrune(r);
|
|
else
|
|
r = toupperrune(r);
|
|
} else {
|
|
r = set2[set2ranges - 1].end;
|
|
}
|
|
goto write;
|
|
}
|
|
if (!dflag && cflag) {
|
|
if (set2check) {
|
|
if (set2check == islowerrune)
|
|
r = tolowerrune(r);
|
|
else
|
|
r = toupperrune(r);
|
|
} else {
|
|
r = set2[set2ranges - 1].end;
|
|
}
|
|
goto write;
|
|
}
|
|
if (dflag && cflag)
|
|
goto read;
|
|
write:
|
|
if (argc == 1 && sflag && r == lastrune) {
|
|
if (set1check && set1check(r))
|
|
goto read;
|
|
for (i = 0; i < set1ranges; i++) {
|
|
if (set1[i].start <= r && r <= set1[i].end)
|
|
goto read;
|
|
}
|
|
}
|
|
if (argc == 2 && sflag && r == lastrune) {
|
|
if (set2check && set2check(r))
|
|
goto read;
|
|
for (i = 0; i < set2ranges; i++) {
|
|
if (set2[i].start <= r && r <= set2[i].end)
|
|
goto read;
|
|
}
|
|
}
|
|
efputrune(&r, stdout, "<stdout>");
|
|
lastrune = r;
|
|
goto read;
|
|
}
|