244 lines
4.6 KiB
C
244 lines
4.6 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
|
|
#include "utf.h"
|
|
#include "util.h"
|
|
|
|
/*
 * Option flags, raised in main() by -c/-C, -d and -s respectively.
 * Objects with static storage duration start out as zero.
 */
static int cflag, dflag, sflag;
|
|
|
|
struct range {
|
|
Rune start;
|
|
Rune end;
|
|
size_t quant;
|
|
};
|
|
|
|
/*
 * Set-notation fragments used to spell out the character classes.
 * Adjacent string literals are concatenated by the compiler, so
 * ALNUM expands to the single string "0-9A-Za-z".
 */
#define DIGIT "0-9"
#define UPPER "A-Z"
#define LOWER "a-z"
#define PUNCT "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
#define ALNUM DIGIT UPPER LOWER
|
|
|
|
static struct {
|
|
char *name;
|
|
char *str;
|
|
} classes[] = {
|
|
{ "alnum", ALNUM },
|
|
{ "alpha", UPPER LOWER },
|
|
{ "blank", " \t" },
|
|
{ "cntrl", "\000-\037\177" },
|
|
{ "digit", DIGIT },
|
|
{ "graph", ALNUM PUNCT },
|
|
{ "lower", LOWER },
|
|
{ "print", ALNUM PUNCT " " },
|
|
{ "punct", PUNCT },
|
|
{ "space", "\t\n\v\f\r" },
|
|
{ "upper", UPPER },
|
|
{ "xdigit", DIGIT "A-Fa-f" },
|
|
};
|
|
|
|
/*
 * The two operand sets as arrays of ranges, plus the number of ranges
 * in each. Both are filled in by makeset() from main(); until then the
 * pointers are null and the counts zero (static zero-initialisation).
 */
static struct range *set1, *set2;
static size_t set1ranges, set2ranges;
|
|
|
|
static size_t
|
|
rangelen(struct range r)
|
|
{
|
|
return (r.end - r.start + 1) * r.quant;
|
|
}
|
|
|
|
static size_t
|
|
setlen(struct range *set, size_t setranges)
|
|
{
|
|
size_t len = 0, i;
|
|
|
|
for (i = 0; i < setranges; i++)
|
|
len += rangelen(set[i]);
|
|
|
|
return len;
|
|
}
|
|
|
|
static int
|
|
rstrmatch(Rune *r, char *s, size_t n)
|
|
{
|
|
size_t i;
|
|
|
|
for (i = 0; i < n; i++)
|
|
if (r[i] != s[i])
|
|
return 0;
|
|
return 1;
|
|
}
|
|
|
|
static size_t
|
|
makeset(char *str, struct range **set)
|
|
{
|
|
Rune *rstr;
|
|
size_t len, i, j, m, n;
|
|
size_t q, setranges;
|
|
int factor, base;
|
|
|
|
reset:
|
|
setranges = 0;
|
|
|
|
/* rstr defines at most len ranges */
|
|
len = chartorunearr(str, &rstr);
|
|
*set = emalloc(len * sizeof(**set));
|
|
|
|
/* todo: allow expressions */
|
|
for (i = 0; i < len; i++) {
|
|
if (rstr[i] == '[') {
|
|
j = i;
|
|
nextbrack:
|
|
if (j == len)
|
|
goto literal;
|
|
for (m = j; m < len; m++)
|
|
if (rstr[m] == ']') {
|
|
j = m;
|
|
break;
|
|
}
|
|
if (j == i)
|
|
goto literal;
|
|
|
|
/* CLASSES [=EQUIV=] (skip) */
|
|
if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') {
|
|
i = j;
|
|
continue;
|
|
}
|
|
|
|
/* CLASSES [:CLASS:] */
|
|
if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') {
|
|
for (n = 0; n < LEN(classes); n++) {
|
|
if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) {
|
|
str = classes[n].str;
|
|
goto reset;
|
|
}
|
|
}
|
|
eprintf("Invalid character class\n");
|
|
}
|
|
|
|
/* REPEAT [_*n] (only allowed in set2) */
|
|
if (j - i > 2 && rstr[i + 2] == '*' && set1ranges > 0) {
|
|
/* check if right side of '*' is a number */
|
|
q = 0;
|
|
factor = 1;
|
|
base = (rstr[i + 3] == '0') ? 8 : 10;
|
|
for (n = j - 1; n > i + 2; n--) {
|
|
if (rstr[n] < '0' && rstr[n] > '9') {
|
|
n = 0;
|
|
break;
|
|
}
|
|
q += (rstr[n] - '0') * factor;
|
|
factor *= base;
|
|
}
|
|
|
|
if (n == 0) {
|
|
j = m + 1;
|
|
goto nextbrack;
|
|
}
|
|
(*set)[setranges].start = rstr[i + 1];
|
|
(*set)[setranges].end = rstr[i + 1];
|
|
(*set)[setranges].quant = q ? q : setlen(set1, set1ranges);
|
|
setranges++;
|
|
i = j;
|
|
continue;
|
|
}
|
|
|
|
j = m + 1;
|
|
goto nextbrack;
|
|
}
|
|
literal:
|
|
/* RANGES [_-__-_], _-__-_ */
|
|
/* LITERALS _______ */
|
|
(*set)[setranges].start = rstr[i];
|
|
|
|
if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i])
|
|
i += 2;
|
|
(*set)[setranges].end = rstr[i];
|
|
(*set)[setranges].quant = 1;
|
|
setranges++;
|
|
}
|
|
|
|
free(rstr);
|
|
return setranges;
|
|
}
|
|
|
|
static void
|
|
usage(void)
|
|
{
|
|
eprintf("usage: %s [-cCds] set1 [set2]\n", argv0);
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
Rune r = 0, lastrune = 0;
|
|
size_t off1, off2, i, m;
|
|
|
|
ARGBEGIN {
|
|
case 'c':
|
|
case 'C':
|
|
cflag = 1;
|
|
break;
|
|
case 'd':
|
|
dflag = 1;
|
|
break;
|
|
case 's':
|
|
sflag = 1;
|
|
break;
|
|
default:
|
|
usage();
|
|
} ARGEND;
|
|
|
|
if (argc < 1 || argc > 2 || (argc == 1 && dflag == sflag))
|
|
usage();
|
|
set1ranges = makeset(argv[0], &set1);
|
|
if (argc == 2)
|
|
set2ranges = makeset(argv[1], &set2);
|
|
if (!dflag && !set2ranges)
|
|
eprintf("set2 must be non-empty\n");
|
|
read:
|
|
if (!readrune("<stdin>", stdin, &r))
|
|
return 0;
|
|
off1 = off2 = 0;
|
|
for (i = 0; i < set1ranges; i++) {
|
|
if (set1[i].start <= r && r <= set1[i].end) {
|
|
if (dflag && !cflag)
|
|
goto read;
|
|
if (sflag) {
|
|
if (r == lastrune)
|
|
goto read;
|
|
else
|
|
goto write;
|
|
}
|
|
for (m = 0; m < i; m++)
|
|
off1 += rangelen(set1[m]);
|
|
off1 += r - set1[m].start;
|
|
if (off1 > setlen(set2, set2ranges) - 1) {
|
|
r = set2[set2ranges - 1].end;
|
|
goto write;
|
|
}
|
|
for (m = 0; m < set2ranges; m++) {
|
|
if (off2 + rangelen(set2[m]) > off1) {
|
|
m++;
|
|
break;
|
|
}
|
|
off2 += rangelen(set2[m]);
|
|
}
|
|
m--;
|
|
r = set2[m].start + (off1 - off2) / set2[m].quant;
|
|
|
|
goto write;
|
|
}
|
|
}
|
|
if (dflag && cflag)
|
|
goto read;
|
|
if (dflag && sflag && r == lastrune)
|
|
goto read;
|
|
write:
|
|
lastrune = r;
|
|
writerune("<stdout>", stdout, &r);
|
|
goto read;
|
|
}
|