#include #include #include "utf.h" #include "util.h" static int cflag = 0; static int dflag = 0; static int sflag = 0; struct range { Rune start; Rune end; size_t quant; }; #define DIGIT "0-9" #define UPPER "A-Z" #define LOWER "a-z" #define PUNCT "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~" #define ALNUM DIGIT UPPER LOWER static struct { char *name; char *str; } classes[] = { { "alnum", ALNUM }, { "alpha", UPPER LOWER }, { "blank", " \t" }, { "cntrl", "\000-\037\177" }, { "digit", DIGIT }, { "graph", ALNUM PUNCT }, { "lower", LOWER }, { "print", ALNUM PUNCT " " }, { "punct", PUNCT }, { "space", "\t\n\v\f\r" }, { "upper", UPPER }, { "xdigit", DIGIT "A-Fa-f" }, }; static struct range *set1 = NULL; static size_t set1ranges = 0; static struct range *set2 = NULL; static size_t set2ranges = 0; static size_t rangelen(struct range r) { return (r.end - r.start + 1) * r.quant; } static size_t setlen(struct range *set, size_t setranges) { int i; size_t len = 0; for (i = 0; i < setranges; i++) len += rangelen(set[i]); return len; } static int rstrmatch(Rune *r, char *s, size_t n) { size_t i; for (i = 0; i < n; i++) if (r[i] != s[i]) return 0; return 1; } static size_t makeset(char *str, struct range **set) { Rune *rstr; size_t len, i, j, m, n; size_t q, setranges; int factor, base; reset: setranges = 0; /* rstr defines at most len ranges */ len = chartorunearr(str, &rstr); *set = emalloc(len * sizeof(**set)); /* todo: allow expressions */ for (i = 0; i < len; i++) { if (rstr[i] == '[') { j = i; nextbrack: if (j == len) goto literal; for (m = j; m < len; m++) if (rstr[m] == ']') { j = m; break; } if (j == i) goto literal; /* CLASSES [=EQUIV=] (skip) */ if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') { i = j; continue; } /* CLASSES [:CLASS:] */ if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') { for (n = 0; n < LEN(classes); n++) { if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) { str = classes[n].str; goto reset; } } eprintf("Invalid character class\n"); } /* REPEAT [_*n] (only allowed in set2) */ if (j - i > 2 && rstr[i + 2] == '*' && set1ranges > 0) { /* check if right side of '*' is a number */ q = 0; factor = 1; base = (rstr[i + 3] == '0') ? 8 : 10; for (n = j - 1; n > i + 2; n--) { if (rstr[n] < '0' && rstr[n] > '9') { n = 0; break; } q += (rstr[n] - '0') * factor; factor *= base; } if (n == 0) { j = m + 1; goto nextbrack; } (*set)[setranges].start = rstr[i + 1]; (*set)[setranges].end = rstr[i + 1]; (*set)[setranges].quant = q ? q : setlen(set1, set1ranges); setranges++; i = j; continue; } j = m + 1; goto nextbrack; } literal: /* RANGES [_-__-_], _-__-_ */ /* LITERALS _______ */ (*set)[setranges].start = rstr[i]; if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i]) i += 2; (*set)[setranges].end = rstr[i]; (*set)[setranges].quant = 1; setranges++; } free(rstr); return setranges; } static void usage(void) { eprintf("usage: %s [-cCds] set1 [set2]\n", argv0); } int main(int argc, char *argv[]) { Rune r = 0, lastrune = 0; int i, m; size_t off1, off2; ARGBEGIN { case 'c': case 'C': cflag = 1; break; case 'd': dflag = 1; break; case 's': sflag = 1; break; default: usage(); } ARGEND; if (argc < 1 || argc > 2 || (argc == 1 && dflag == sflag)) usage(); set1ranges = makeset(argv[0], &set1); if (argc == 2) set2ranges = makeset(argv[1], &set2); read: if (!readrune("", stdin, &r)) return 0; off1 = off2 = 0; for (i = 0; i < set1ranges; i++) { if (set1[i].start <= r && r <= set1[i].end) { if (dflag && !cflag) goto read; if (sflag) { if (r == lastrune) goto read; else goto write; } for (m = 0; m < i; m++) off1 += rangelen(set1[m]); off1 += r - set1[m].start; if (off1 > setlen(set2, set2ranges) - 1) { r = set2[set2ranges - 1].end; goto write; } for (m = 0; m < set2ranges; m++) { if (off2 + rangelen(set2[m]) > off1) { m++; break; } off2 += rangelen(set2[m]); } m--; r = set2[m].start + (off1 - off2) / set2[m].quant; goto write; } } if (dflag && cflag) goto read; if (dflag && sflag && r == lastrune) goto read; write: lastrune = r; writerune("", stdout, &r); goto read; }