2015-01-20 10:26:08 -05:00
|
|
|
/* See LICENSE file for copyright and license details. */
|
2013-11-15 11:25:10 -05:00
|
|
|
#include <stdlib.h>
|
2014-11-13 12:29:30 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
#include "utf.h"
|
2013-11-15 11:25:10 -05:00
|
|
|
#include "util.h"
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/*
 * Option flags, all initially off.
 * NOTE(review): presumably toggled by the -c, -d and -s command-line
 * switches; the code that sets them is outside this part of the file.
 */
static int cflag = 0, dflag = 0, sflag = 0;
|
2014-07-14 18:49:42 -04:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
struct range {
|
|
|
|
Rune start;
|
|
|
|
Rune end;
|
|
|
|
size_t quant;
|
2014-07-14 18:49:42 -04:00
|
|
|
};
|
|
|
|
|
2015-01-10 09:21:09 -05:00
|
|
|
static struct {
|
2015-01-11 14:26:20 -05:00
|
|
|
char *name;
|
2015-02-11 07:03:32 -05:00
|
|
|
int (*check)(Rune);
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
} classes[] = {
|
2015-02-11 07:03:32 -05:00
|
|
|
{ "alnum", isalnumrune },
|
|
|
|
{ "alpha", isalpharune },
|
|
|
|
{ "blank", isblankrune },
|
|
|
|
{ "cntrl", iscntrlrune },
|
|
|
|
{ "digit", isdigitrune },
|
|
|
|
{ "graph", isgraphrune },
|
|
|
|
{ "lower", islowerrune },
|
|
|
|
{ "print", isprintrune },
|
|
|
|
{ "punct", ispunctrune },
|
|
|
|
{ "space", isspacerune },
|
|
|
|
{ "upper", isupperrune },
|
|
|
|
{ "xdigit", isxdigitrune },
|
2014-07-14 18:49:42 -04:00
|
|
|
};
|
|
|
|
|
2015-02-11 07:03:32 -05:00
|
|
|
/*
 * Per-operand parse state for the two sets, all initially empty:
 *   setN        array of ranges
 *   setNranges  number of entries in that array
 *   setNcheck   rune predicate
 * NOTE(review): presumably filled in from makeset()'s outputs by code
 * outside this part of the file; makeset() itself reads set1/set1ranges.
 */
static struct range *set1 = NULL, *set2 = NULL;
static size_t set1ranges = 0, set2ranges = 0;
static int (*set1check)(Rune) = NULL, (*set2check)(Rune) = NULL;
|
2015-01-11 14:26:20 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
static size_t
|
|
|
|
rangelen(struct range r)
|
2013-11-15 11:25:10 -05:00
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
return (r.end - r.start + 1) * r.quant;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
static size_t
|
|
|
|
setlen(struct range *set, size_t setranges)
|
2014-07-14 18:49:42 -04:00
|
|
|
{
|
2015-01-10 11:38:28 -05:00
|
|
|
size_t len = 0, i;
|
2014-07-14 18:49:42 -04:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
for (i = 0; i < setranges; i++)
|
|
|
|
len += rangelen(set[i]);
|
2014-04-12 14:50:51 -04:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
return len;
|
2014-04-12 14:50:51 -04:00
|
|
|
}
|
|
|
|
|
2014-07-14 18:49:42 -04:00
|
|
|
static int
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
rstrmatch(Rune *r, char *s, size_t n)
|
2014-07-14 18:49:42 -04:00
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
size_t i;
|
2014-07-14 18:49:42 -04:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
for (i = 0; i < n; i++)
|
|
|
|
if (r[i] != s[i])
|
|
|
|
return 0;
|
|
|
|
return 1;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
static size_t
|
2015-02-11 07:03:32 -05:00
|
|
|
makeset(char *str, struct range **set, int (**check)(Rune))
|
2014-07-14 18:49:42 -04:00
|
|
|
{
|
2015-03-10 17:19:19 -04:00
|
|
|
Rune *rstr;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
size_t len, i, j, m, n;
|
2015-01-11 14:26:20 -05:00
|
|
|
size_t q, setranges = 0;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
int factor, base;
|
|
|
|
|
|
|
|
/* rstr defines at most len ranges */
|
2015-01-29 15:52:44 -05:00
|
|
|
unescape(str);
|
2015-03-11 05:50:18 -04:00
|
|
|
rstr = ereallocarray(NULL, utflen(str) + 1, sizeof(*rstr));
|
2015-02-11 15:32:09 -05:00
|
|
|
len = utftorunestr(str, rstr);
|
2015-03-11 05:50:18 -04:00
|
|
|
*set = ereallocarray(NULL, len, sizeof(**set));
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
|
if (rstr[i] == '[') {
|
|
|
|
j = i;
|
|
|
|
nextbrack:
|
|
|
|
if (j == len)
|
|
|
|
goto literal;
|
|
|
|
for (m = j; m < len; m++)
|
|
|
|
if (rstr[m] == ']') {
|
|
|
|
j = m;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (j == i)
|
|
|
|
goto literal;
|
|
|
|
|
|
|
|
/* CLASSES [=EQUIV=] (skip) */
|
|
|
|
if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') {
|
2015-01-28 13:44:05 -05:00
|
|
|
if (j - i != 4)
|
|
|
|
goto literal;
|
|
|
|
(*set)[setranges].start = rstr[i + 2];
|
|
|
|
(*set)[setranges].end = rstr[i + 2];
|
|
|
|
(*set)[setranges].quant = 1;
|
|
|
|
setranges++;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
i = j;
|
|
|
|
continue;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
2013-11-15 11:25:10 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* CLASSES [:CLASS:] */
|
|
|
|
if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') {
|
|
|
|
for (n = 0; n < LEN(classes); n++) {
|
|
|
|
if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) {
|
2015-01-11 14:26:20 -05:00
|
|
|
*check = classes[n].check;
|
|
|
|
return 0;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
}
|
|
|
|
}
|
2015-01-11 18:03:48 -05:00
|
|
|
eprintf("Invalid character class.\n");
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
}
|
2013-11-15 11:25:10 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* REPEAT [_*n] (only allowed in set2) */
|
|
|
|
if (j - i > 2 && rstr[i + 2] == '*' && set1ranges > 0) {
|
|
|
|
/* check if right side of '*' is a number */
|
|
|
|
q = 0;
|
|
|
|
factor = 1;
|
|
|
|
base = (rstr[i + 3] == '0') ? 8 : 10;
|
|
|
|
for (n = j - 1; n > i + 2; n--) {
|
2015-04-20 15:17:53 -04:00
|
|
|
if (rstr[n] < '0' || rstr[n] > '9') {
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
n = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
q += (rstr[n] - '0') * factor;
|
|
|
|
factor *= base;
|
|
|
|
}
|
|
|
|
if (n == 0) {
|
|
|
|
j = m + 1;
|
|
|
|
goto nextbrack;
|
|
|
|
}
|
|
|
|
(*set)[setranges].start = rstr[i + 1];
|
|
|
|
(*set)[setranges].end = rstr[i + 1];
|
|
|
|
(*set)[setranges].quant = q ? q : setlen(set1, set1ranges);
|
|
|
|
setranges++;
|
|
|
|
i = j;
|
|
|
|
continue;
|
|
|
|
}
|
2013-11-15 11:25:10 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
j = m + 1;
|
|
|
|
goto nextbrack;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
literal:
|
|
|
|
/* RANGES [_-__-_], _-__-_ */
|
|
|
|
/* LITERALS _______ */
|
|
|
|
(*set)[setranges].start = rstr[i];
|
|
|
|
|
|
|
|
if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i])
|
|
|
|
i += 2;
|
|
|
|
(*set)[setranges].end = rstr[i];
|
|
|
|
(*set)[setranges].quant = 1;
|
|
|
|
setranges++;
|
2013-11-15 11:25:10 -05:00
|
|
|
}
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
free(rstr);
|
|
|
|
return setranges;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Report the command-line synopsis: passes the usage format string and
 * argv0 to eprintf().
 * NOTE(review): eprintf() is declared outside this view; presumably it
 * prints the message and terminates the program -- confirm in util.h.
 */
static void
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
usage(void)
|
2014-07-14 18:49:42 -04:00
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
eprintf("usage: %s [-cCds] set1 [set2]\n", argv0);
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
|
|
|
|
2013-11-15 11:25:10 -05:00
|
|
|
/*
 * Program entry point.
 *
 * Parses the -c/-C, -d and -s options, builds set1 (and set2 when a second
 * operand is given) with makeset(), validates the combination, and then
 * processes stdin one rune at a time: each rune is either dropped
 * (goto read) or emitted, possibly replaced (goto write).
 *
 * The value returned at end of input is 0 when both fshut() calls return 0
 * and 1 when their sum is non-zero.
 */
int
|
|
|
|
main(int argc, char *argv[])
|
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* r: rune being processed; lastrune: most recent rune passed to the write step */
Rune r = 0, lastrune = 0;
|
2015-01-10 14:55:37 -05:00
|
|
|
/* off1/off2: running offsets into set1/set2; i, m: range indices */
size_t off1, off2, i, m;
|
2013-11-15 11:25:10 -05:00
|
|
|
|
|
|
|
/* Option parsing: -c and -C both set cflag; anything unknown calls usage() */
ARGBEGIN {
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
case 'c':
|
|
|
|
case 'C':
|
|
|
|
cflag = 1;
|
|
|
|
break;
|
2014-04-09 08:12:34 -04:00
|
|
|
case 'd':
|
|
|
|
dflag = 1;
|
|
|
|
break;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
case 's':
|
|
|
|
sflag = 1;
|
2014-07-14 18:49:42 -04:00
|
|
|
break;
|
2013-11-15 11:25:10 -05:00
|
|
|
default:
|
|
|
|
usage();
|
|
|
|
} ARGEND;
|
|
|
|
|
2015-03-17 18:41:22 -04:00
|
|
|
/*
 * One or two operands are required; with a single operand the usage
 * message is shown unless dflag and sflag differ (exactly one is set).
 */
if (!argc || argc > 2 || (argc == 1 && dflag == sflag))
|
2013-11-15 11:25:10 -05:00
|
|
|
usage();
|
2015-01-11 14:26:20 -05:00
|
|
|
/* makeset() fills the range array and class-check pointer, returning the range count */
set1ranges = makeset(argv[0], &set1, &set1check);
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
if (argc == 2)
|
2015-01-11 14:26:20 -05:00
|
|
|
set2ranges = makeset(argv[1], &set2, &set2check);
|
|
|
|
/* With -d and -s both set or both clear, set2 must supply ranges or a class */
if (dflag == sflag && !set2ranges && !set2check)
|
2015-01-11 18:03:48 -05:00
|
|
|
eprintf("set2 must be non-empty.\n");
|
2015-04-20 15:38:46 -04:00
|
|
|
/* With two operands, either both sets are classes or neither is */
if (argc == 2 && !set2check != !set1check)
|
|
|
|
eprintf("can't mix classes with non-classes.\n");
|
2015-02-11 07:03:32 -05:00
|
|
|
/* The only class predicates accepted for set2 are islowerrune and isupperrune */
if (set2check && set2check != islowerrune && set2check != isupperrune)
|
2015-01-11 18:03:48 -05:00
|
|
|
eprintf("set2 can only be the 'lower' or 'upper' class.\n");
|
2015-02-02 13:59:41 -05:00
|
|
|
/* A class as set2 is rejected when -c is given without -d */
if (set2check && cflag && !dflag)
|
2015-02-07 12:09:04 -05:00
|
|
|
eprintf("set2 can't be imaged to from a complement.\n");
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* Top of the per-rune loop; every path below jumps back here */
read:
|
2015-02-11 14:13:43 -05:00
|
|
|
/*
 * A zero result from efgetrune() ends the loop (presumably end of
 * input -- confirm in the utf/util helpers); both streams are then
 * passed to fshut() and the summed result is normalized to 0 or 1.
 */
if (!efgetrune(&r, stdin, "<stdin>"))
|
Add *fshut() functions to properly flush file streams
This has been a known issue for a long time. Example:
printf "word" > /dev/full
wouldn't report there's not enough space on the device.
This is due to the fact that every libc has internal buffers
for stdout which store fragments of written data until they reach
a certain size or on some callback to flush them all at once to the
kernel.
You can force the libc to flush them with fflush(). In case flushing
fails, you can check the return value of fflush() and report an error.
However, previously, sbase didn't have such checks and without fflush(),
the libc silently flushes the buffers on exit without checking the errors.
No offense, but there's no way for the libc to report errors in the exit-
condition.
GNU coreutils solve this by having onexit-callbacks to handle the flushing
and report issues, but they have obvious deficiencies.
After long discussions on IRC, we came to the conclusion that checking the
return value of every io-function would be a bit too much, and having a
general-purpose fclose-wrapper would be the best way to go.
It turned out that fclose() alone is not enough to detect errors. The right
way to do it is to fflush() + check ferror on the fp and then to a fclose().
This is what fshut does and that's how it's done before each return.
The return value is obviously affected, reporting an error in case a flush
or close failed, but also when reading failed for some reason, the error-
state is caught.
the !!( ... + ...) construction is used to call all functions inside the
brackets and not "terminating" on the first.
We want errors to be reported, but there's no reason to stop flushing buffers
when one other file buffer has issues.
Obviously, functionales come before the flush and ret-logic comes after to
prevent early exits as well without reporting warnings if there are any.
One more advantage of fshut() is that it is even able to report errors
on obscure NFS-setups which the other coreutils are unable to detect,
because they only check the return-value of fflush() and fclose(),
not ferror() as well.
2015-04-04 15:25:17 -04:00
|
|
|
return !!(fshut(stdin, "<stdin>") + fshut(stdout, "<stdout>"));
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
off1 = off2 = 0;
|
|
|
|
/* Search set1 for a range [start, end] that contains r */
for (i = 0; i < set1ranges; i++) {
|
|
|
|
if (set1[i].start <= r && r <= set1[i].end) {
|
2015-02-02 11:57:46 -05:00
|
|
|
/*
 * -d with r in set1: dropped unless -c is set; with -c it is
 * written, except that with -s a repeat of lastrune is dropped.
 */
if (dflag) {
|
2015-02-02 13:59:41 -05:00
|
|
|
if (!cflag || (sflag && r == lastrune))
|
2015-02-02 11:57:46 -05:00
|
|
|
goto read;
|
2015-02-02 13:59:41 -05:00
|
|
|
else
|
|
|
|
goto write;
|
2015-02-02 11:57:46 -05:00
|
|
|
}
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* -s without -d: drop r if it equals lastrune, otherwise write it unchanged */
if (sflag) {
|
|
|
|
if (r == lastrune)
|
|
|
|
goto read;
|
|
|
|
else
|
|
|
|
goto write;
|
|
|
|
}
|
2015-04-26 20:49:02 -04:00
|
|
|
/* -c: a rune found in set1 is written unchanged */
if (cflag)
|
|
|
|
goto write;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* off1 = summed rangelen() of the set1 ranges before range i ... */
for (m = 0; m < i; m++)
|
|
|
|
off1 += rangelen(set1[m]);
|
|
|
|
/* ... plus r's distance from the start of range i (m == i after the loop) */
off1 += r - set1[m].start;
|
|
|
|
/* Offset past the last position of set2: use the end of set2's last range */
if (off1 > setlen(set2, set2ranges) - 1) {
|
|
|
|
r = set2[set2ranges - 1].end;
|
|
|
|
goto write;
|
|
|
|
}
|
|
|
|
/*
 * Find the set2 range covering offset off1; off2 accumulates the
 * lengths of the ranges skipped before it.
 */
for (m = 0; m < set2ranges; m++) {
|
|
|
|
if (off2 + rangelen(set2[m]) > off1) {
|
|
|
|
/* pre-increment so the m-- after the loop lands on this range */
m++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
off2 += rangelen(set2[m]);
|
|
|
|
}
|
|
|
|
/* undo the m++ above, or step back to the last range if the loop ran out */
m--;
|
|
|
|
/* remaining offset within range m, divided by that range's quant field */
r = set2[m].start + (off1 - off2) / set2[m].quant;
|
2013-11-15 11:25:10 -05:00
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
goto write;
|
|
|
|
}
|
2013-11-15 11:25:10 -05:00
|
|
|
}
|
2015-02-11 07:03:32 -05:00
|
|
|
/* No range of set1 matched: try set1's class predicate, if there is one */
if (set1check && set1check(r)) {
|
2015-02-02 13:59:41 -05:00
|
|
|
/* Same -d handling as for a range match above */
if (dflag) {
|
|
|
|
if (!cflag || (sflag && r == lastrune))
|
2015-02-02 11:57:46 -05:00
|
|
|
goto read;
|
2015-02-02 13:59:41 -05:00
|
|
|
else
|
|
|
|
goto write;
|
|
|
|
}
|
2015-01-11 14:26:20 -05:00
|
|
|
/* Same -s handling as for a range match above */
if (sflag) {
|
|
|
|
if (r == lastrune)
|
|
|
|
goto read;
|
|
|
|
else
|
|
|
|
goto write;
|
|
|
|
}
|
2015-02-11 07:03:32 -05:00
|
|
|
/*
 * Class translation: upper->lower and lower->upper convert the case
 * of r; otherwise, if set2 has ranges, r becomes the end of set2's
 * last range (or stays unchanged under -c); with no ranges in set2
 * the combination is reported as an error.
 */
if (set1check == isupperrune && set2check == islowerrune)
|
|
|
|
r = tolowerrune(r);
|
|
|
|
else if (set1check == islowerrune && set2check == isupperrune)
|
|
|
|
r = toupperrune(r);
|
2015-01-11 18:03:48 -05:00
|
|
|
else if (set2ranges > 0)
|
2015-02-02 13:59:41 -05:00
|
|
|
r = cflag ? r : set2[set2ranges - 1].end;
|
2015-01-11 18:03:48 -05:00
|
|
|
else
|
|
|
|
eprintf("Misaligned character classes.\n");
|
2015-02-02 13:59:41 -05:00
|
|
|
/* r matched neither a range nor the class of set1: under -c it maps to the end of set2's last range */
} else if (cflag && set2ranges > 0) {
|
|
|
|
r = set2[set2ranges - 1].end;
|
2015-01-11 14:26:20 -05:00
|
|
|
}
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* -d with -c: a rune that reaches this point is dropped */
if (dflag && cflag)
|
|
|
|
goto read;
|
|
|
|
/* -d with -s: a rune equal to lastrune is dropped */
if (dflag && sflag && r == lastrune)
|
|
|
|
goto read;
|
|
|
|
/* Emit r and remember it for the r == lastrune comparisons above */
write:
|
|
|
|
lastrune = r;
|
2015-02-11 14:58:00 -05:00
|
|
|
efputrune(&r, stdout, "<stdout>");
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
goto read;
|
2013-11-15 11:25:10 -05:00
|
|
|
|
|
|
}
|