2015-01-20 10:26:08 -05:00
|
|
|
/* See LICENSE file for copyright and license details. */
|
2015-01-11 14:26:20 -05:00
|
|
|
#include <wctype.h>
|
2013-11-15 11:25:10 -05:00
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
2014-11-13 12:29:30 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out; even GNU coreutils ignores them,
and they depend on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
#include "utf.h"
|
2013-11-15 11:25:10 -05:00
|
|
|
#include "util.h"
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/*
 * Mode flags, all off at start-up (objects with static storage duration
 * are zero-initialised).
 * NOTE(review): presumably toggled by the -c, -d and -s command-line
 * options; the option parsing is not visible here, so confirm there.
 */
static int cflag;
static int dflag;
static int sflag;
|
2014-07-14 18:49:42 -04:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
struct range {
|
|
|
|
Rune start;
|
|
|
|
Rune end;
|
|
|
|
size_t quant;
|
2014-07-14 18:49:42 -04:00
|
|
|
};
|
|
|
|
|
2015-01-10 09:21:09 -05:00
|
|
|
/*
 * Lookup table pairing every supported character-class name (the CLASS in
 * a "[:CLASS:]" expression) with the <wctype.h> predicate that decides
 * membership.
 *
 * The table is only ever read, so it is declared const; the name member
 * keeps its plain char * type so existing callers that pass it on to
 * functions taking char * are unaffected.
 */
static const struct {
	char *name;           /* class name as written between "[:" and ":]" */
	int (*check)(wint_t); /* membership predicate for that class */
} classes[] = {
	{ "alnum",  iswalnum  },
	{ "alpha",  iswalpha  },
	{ "blank",  iswblank  },
	{ "cntrl",  iswcntrl  },
	{ "digit",  iswdigit  },
	{ "graph",  iswgraph  },
	{ "lower",  iswlower  },
	{ "print",  iswprint  },
	{ "punct",  iswpunct  },
	{ "space",  iswspace  },
	{ "upper",  iswupper  },
	{ "xdigit", iswxdigit },
};
|
|
|
|
|
2015-01-11 14:26:20 -05:00
|
|
|
/*
 * State for the two operand sets: the array of ranges, the number of
 * ranges in it, and a character-class predicate.
 * Everything starts out empty (NULL / 0) through static zero-initialisation.
 * NOTE(review): makeset() reads set1/set1ranges when it expands a
 * "[_*n]" repeat; where these variables are assigned is not visible here.
 */
static struct range *set1, *set2;
static size_t set1ranges, set2ranges;
static int (*set1check)(wint_t);
static int (*set2check)(wint_t);
|
|
|
|
|
2013-11-15 11:25:10 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
static size_t
|
|
|
|
rangelen(struct range r)
|
2013-11-15 11:25:10 -05:00
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
return (r.end - r.start + 1) * r.quant;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
static size_t
|
|
|
|
setlen(struct range *set, size_t setranges)
|
2014-07-14 18:49:42 -04:00
|
|
|
{
|
2015-01-10 11:38:28 -05:00
|
|
|
size_t len = 0, i;
|
2014-07-14 18:49:42 -04:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
for (i = 0; i < setranges; i++)
|
|
|
|
len += rangelen(set[i]);
|
2014-04-12 14:50:51 -04:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
return len;
|
2014-04-12 14:50:51 -04:00
|
|
|
}
|
|
|
|
|
2014-07-14 18:49:42 -04:00
|
|
|
static int
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
rstrmatch(Rune *r, char *s, size_t n)
|
2014-07-14 18:49:42 -04:00
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
size_t i;
|
2014-07-14 18:49:42 -04:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
for (i = 0; i < n; i++)
|
|
|
|
if (r[i] != s[i])
|
|
|
|
return 0;
|
|
|
|
return 1;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
|
|
|
|
Resolve escape characters in tr(1)
This is one aspect which I think has blown up the complexity of many
tr-implementations around today.
Instead of complicating the set-theory-based parser itself (he should
still be relying on one rune per char, not multirunes), I added a
preprocessor, which basically scans the code for upcoming '\'s, reads
what he finds, substitutes the real character onto '\'s index and shifts
the entire following array so there are no "holes".
What is left to reflect on is what to do with octal sequences.
I have a local implementation here, which works fine, but imho,
given tr is already so focused on UTF-8, we might as well ignore
POSIX at this point and rather implement the unicode UTF-8 code points,
which are way more contemporary and future-proof.
Reading in \uC3A4 as an array of 0xC3 and 0xA4 is not the issue,
but I'm still struggling to find a way to turn it into a well-formed
byte sequence. Hit me with a mail if you have a simple solution for
that.
2015-01-15 05:51:58 -05:00
|
|
|
static size_t
|
|
|
|
resolveescapes(Rune *r, size_t len)
|
|
|
|
{
|
2015-01-24 16:43:46 -05:00
|
|
|
size_t i, off, m, factor, q;
|
Resolve escape characters in tr(1)
This is one aspect which I think has blown up the complexity of many
tr-implementations around today.
Instead of complicating the set-theory-based parser itself (he should
still be relying on one rune per char, not multirunes), I added a
preprocessor, which basically scans the code for upcoming '\'s, reads
what he finds, substitutes the real character onto '\'s index and shifts
the entire following array so there are no "holes".
What is left to reflect on is what to do with octal sequences.
I have a local implementation here, which works fine, but imho,
given tr is already so focused on UTF-8, we might as well ignore
POSIX at this point and rather implement the unicode UTF-8 code points,
which are way more contemporary and future-proof.
Reading in \uC3A4 as a an array of 0xC3 and 0xA4 is not the issue,
but I'm still struggling to find a way to turn it into a well-formed
byte sequence. Hit me with a mail if you have a simple solution for
that.
2015-01-15 05:51:58 -05:00
|
|
|
|
2015-01-24 17:00:34 -05:00
|
|
|
for (i = 0; i < len; i++) {
|
Resolve escape characters in tr(1)
This is one aspect which I think has blown up the complexity of many
tr-implementations around today.
Instead of complicating the set-theory-based parser itself (he should
still be relying on one rune per char, not multirunes), I added a
preprocessor, which basically scans the code for upcoming '\'s, reads
what he finds, substitutes the real character onto '\'s index and shifts
the entire following array so there are no "holes".
What is left to reflect on is what to do with octal sequences.
I have a local implementation here, which works fine, but imho,
given tr is already so focused on UTF-8, we might as well ignore
POSIX at this point and rather implement the unicode UTF-8 code points,
which are way more contemporary and future-proof.
Reading in \uC3A4 as a an array of 0xC3 and 0xA4 is not the issue,
but I'm still struggling to find a way to turn it into a well-formed
byte sequence. Hit me with a mail if you have a simple solution for
that.
2015-01-15 05:51:58 -05:00
|
|
|
if (r[i] != '\\')
|
|
|
|
continue;
|
|
|
|
off = 0;
|
|
|
|
|
|
|
|
switch (r[i + 1]) {
|
|
|
|
case '\\': r[i] = '\\'; off++; break;
|
|
|
|
case 'a': r[i] = '\a'; off++; break;
|
|
|
|
case 'b': r[i] = '\b'; off++; break;
|
|
|
|
case 'f': r[i] = '\f'; off++; break;
|
|
|
|
case 'n': r[i] = '\n'; off++; break;
|
|
|
|
case 'r': r[i] = '\r'; off++; break;
|
|
|
|
case 't': r[i] = '\t'; off++; break;
|
|
|
|
case 'v': r[i] = '\v'; off++; break;
|
2015-01-24 16:43:46 -05:00
|
|
|
case '\0':
|
|
|
|
eprintf("tr: null escape sequence\n");
|
|
|
|
default:
|
|
|
|
/* "\O[OO]" octal escape */
|
|
|
|
for (m = i + 1; m < i + 1 + 3 && m < len; m++)
|
|
|
|
if (r[m] < '0' || r[m] > '7')
|
|
|
|
break;
|
|
|
|
if (m == i + 1)
|
2015-01-24 17:00:34 -05:00
|
|
|
eprintf("tr: invalid escape sequence '\\%c'\n", r[i + 1]);
|
2015-01-24 16:43:46 -05:00
|
|
|
off += m - i - 1;
|
|
|
|
for (--m, q = 0, factor = 1; m > i; m--) {
|
|
|
|
q += (r[m] - '0') * factor;
|
|
|
|
factor *= 8;
|
|
|
|
}
|
|
|
|
r[i] = q;
|
Resolve escape characters in tr(1)
This is one aspect which I think has blown up the complexity of many
tr-implementations around today.
Instead of complicating the set-theory-based parser itself (he should
still be relying on one rune per char, not multirunes), I added a
preprocessor, which basically scans the code for upcoming '\'s, reads
what he finds, substitutes the real character onto '\'s index and shifts
the entire following array so there are no "holes".
What is left to reflect on is what to do with octal sequences.
I have a local implementation here, which works fine, but imho,
given tr is already so focused on UTF-8, we might as well ignore
POSIX at this point and rather implement the unicode UTF-8 code points,
which are way more contemporary and future-proof.
Reading in \uC3A4 as a an array of 0xC3 and 0xA4 is not the issue,
but I'm still struggling to find a way to turn it into a well-formed
byte sequence. Hit me with a mail if you have a simple solution for
that.
2015-01-15 05:51:58 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
for (m = i + 1; m <= len - off; m++)
|
|
|
|
r[m] = r[m + off];
|
|
|
|
len -= off;
|
|
|
|
}
|
|
|
|
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
static size_t
|
2015-01-11 14:26:20 -05:00
|
|
|
makeset(char *str, struct range **set, int (**check)(wint_t))
|
2014-07-14 18:49:42 -04:00
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
Rune *rstr;
|
|
|
|
size_t len, i, j, m, n;
|
2015-01-11 14:26:20 -05:00
|
|
|
size_t q, setranges = 0;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
int factor, base;
|
|
|
|
|
|
|
|
/* rstr defines at most len ranges */
|
|
|
|
len = chartorunearr(str, &rstr);
|
Resolve escape characters in tr(1)
This is one aspect which I think has blown up the complexity of many
tr-implementations around today.
Instead of complicating the set-theory-based parser itself (he should
still be relying on one rune per char, not multirunes), I added a
preprocessor, which basically scans the code for upcoming '\'s, reads
what he finds, substitutes the real character onto '\'s index and shifts
the entire following array so there are no "holes".
What is left to reflect on is what to do with octal sequences.
I have a local implementation here, which works fine, but imho,
given tr is already so focused on UTF-8, we might as well ignore
POSIX at this point and rather implement the unicode UTF-8 code points,
which are way more contemporary and future-proof.
Reading in \uC3A4 as an array of 0xC3 and 0xA4 is not the issue,
but I'm still struggling to find a way to turn it into a well-formed
byte sequence. Hit me with a mail if you have a simple solution for
that.
2015-01-15 05:51:58 -05:00
|
|
|
len = resolveescapes(rstr, len);
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
*set = emalloc(len * sizeof(**set));
|
|
|
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
|
if (rstr[i] == '[') {
|
|
|
|
j = i;
|
|
|
|
nextbrack:
|
|
|
|
if (j == len)
|
|
|
|
goto literal;
|
|
|
|
for (m = j; m < len; m++)
|
|
|
|
if (rstr[m] == ']') {
|
|
|
|
j = m;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (j == i)
|
|
|
|
goto literal;
|
|
|
|
|
|
|
|
/* CLASSES [=EQUIV=] (skip) */
|
|
|
|
if (j - i > 3 && rstr[i + 1] == '=' && rstr[m - 1] == '=') {
|
|
|
|
i = j;
|
|
|
|
continue;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
2013-11-15 11:25:10 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* CLASSES [:CLASS:] */
|
|
|
|
if (j - i > 3 && rstr[i + 1] == ':' && rstr[m - 1] == ':') {
|
|
|
|
for (n = 0; n < LEN(classes); n++) {
|
|
|
|
if (rstrmatch(rstr + i + 2, classes[n].name, j - i - 3)) {
|
2015-01-11 14:26:20 -05:00
|
|
|
*check = classes[n].check;
|
|
|
|
return 0;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
}
|
|
|
|
}
|
2015-01-11 18:03:48 -05:00
|
|
|
eprintf("Invalid character class.\n");
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
}
|
2013-11-15 11:25:10 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
/* REPEAT [_*n] (only allowed in set2) */
|
|
|
|
if (j - i > 2 && rstr[i + 2] == '*' && set1ranges > 0) {
|
|
|
|
/* check if right side of '*' is a number */
|
|
|
|
q = 0;
|
|
|
|
factor = 1;
|
|
|
|
base = (rstr[i + 3] == '0') ? 8 : 10;
|
|
|
|
for (n = j - 1; n > i + 2; n--) {
|
|
|
|
if (rstr[n] < '0' && rstr[n] > '9') {
|
|
|
|
n = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
q += (rstr[n] - '0') * factor;
|
|
|
|
factor *= base;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (n == 0) {
|
|
|
|
j = m + 1;
|
|
|
|
goto nextbrack;
|
|
|
|
}
|
|
|
|
(*set)[setranges].start = rstr[i + 1];
|
|
|
|
(*set)[setranges].end = rstr[i + 1];
|
|
|
|
(*set)[setranges].quant = q ? q : setlen(set1, set1ranges);
|
|
|
|
setranges++;
|
|
|
|
i = j;
|
|
|
|
continue;
|
|
|
|
}
|
2013-11-15 11:25:10 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
j = m + 1;
|
|
|
|
goto nextbrack;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
literal:
|
|
|
|
/* RANGES [_-__-_], _-__-_ */
|
|
|
|
/* LITERALS _______ */
|
|
|
|
(*set)[setranges].start = rstr[i];
|
|
|
|
|
|
|
|
if (i < len - 2 && rstr[i + 1] == '-' && rstr[i + 2] >= rstr[i])
|
|
|
|
i += 2;
|
|
|
|
(*set)[setranges].end = rstr[i];
|
|
|
|
(*set)[setranges].quant = 1;
|
|
|
|
setranges++;
|
2013-11-15 11:25:10 -05:00
|
|
|
}
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
free(rstr);
|
|
|
|
return setranges;
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
usage(void)
|
2014-07-14 18:49:42 -04:00
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
eprintf("usage: %s [-cCds] set1 [set2]\n", argv0);
|
2014-07-14 18:49:42 -04:00
|
|
|
}
|
|
|
|
|
2013-11-15 11:25:10 -05:00
|
|
|
int
|
|
|
|
main(int argc, char *argv[])
|
|
|
|
{
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
Rune r = 0, lastrune = 0;
|
2015-01-10 14:55:37 -05:00
|
|
|
size_t off1, off2, i, m;
|
2013-11-15 11:25:10 -05:00
|
|
|
|
|
|
|
ARGBEGIN {
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
case 'c':
|
|
|
|
case 'C':
|
|
|
|
cflag = 1;
|
|
|
|
break;
|
2014-04-09 08:12:34 -04:00
|
|
|
case 'd':
|
|
|
|
dflag = 1;
|
|
|
|
break;
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
case 's':
|
|
|
|
sflag = 1;
|
2014-07-14 18:49:42 -04:00
|
|
|
break;
|
2013-11-15 11:25:10 -05:00
|
|
|
default:
|
|
|
|
usage();
|
|
|
|
} ARGEND;
|
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
if (argc < 1 || argc > 2 || (argc == 1 && dflag == sflag))
|
2013-11-15 11:25:10 -05:00
|
|
|
usage();
|
2015-01-11 14:26:20 -05:00
|
|
|
set1ranges = makeset(argv[0], &set1, &set1check);
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
if (argc == 2)
|
2015-01-11 14:26:20 -05:00
|
|
|
set2ranges = makeset(argv[1], &set2, &set2check);
|
|
|
|
if (dflag == sflag && !set2ranges && !set2check)
|
2015-01-11 18:03:48 -05:00
|
|
|
eprintf("set2 must be non-empty.\n");
|
|
|
|
if (set2check && set2check != iswlower && set2check != iswupper)
|
|
|
|
eprintf("set2 can only be the 'lower' or 'upper' class.\n");
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
read:
|
|
|
|
if (!readrune("<stdin>", stdin, &r))
|
|
|
|
return 0;
|
|
|
|
off1 = off2 = 0;
|
|
|
|
for (i = 0; i < set1ranges; i++) {
|
|
|
|
if (set1[i].start <= r && r <= set1[i].end) {
|
|
|
|
if (dflag && !cflag)
|
|
|
|
goto read;
|
|
|
|
if (sflag) {
|
|
|
|
if (r == lastrune)
|
|
|
|
goto read;
|
|
|
|
else
|
|
|
|
goto write;
|
|
|
|
}
|
|
|
|
for (m = 0; m < i; m++)
|
|
|
|
off1 += rangelen(set1[m]);
|
|
|
|
off1 += r - set1[m].start;
|
|
|
|
if (off1 > setlen(set2, set2ranges) - 1) {
|
|
|
|
r = set2[set2ranges - 1].end;
|
|
|
|
goto write;
|
|
|
|
}
|
|
|
|
for (m = 0; m < set2ranges; m++) {
|
|
|
|
if (off2 + rangelen(set2[m]) > off1) {
|
|
|
|
m++;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
off2 += rangelen(set2[m]);
|
|
|
|
}
|
|
|
|
m--;
|
|
|
|
r = set2[m].start + (off1 - off2) / set2[m].quant;
|
2013-11-15 11:25:10 -05:00
|
|
|
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
goto write;
|
|
|
|
}
|
2013-11-15 11:25:10 -05:00
|
|
|
}
|
2015-01-11 14:29:27 -05:00
|
|
|
if (set1check && set1check((wint_t)r)) {
|
2015-01-11 14:26:20 -05:00
|
|
|
if (dflag && !cflag)
|
|
|
|
goto read;
|
|
|
|
if (sflag) {
|
|
|
|
if (r == lastrune)
|
|
|
|
goto read;
|
|
|
|
else
|
|
|
|
goto write;
|
|
|
|
}
|
|
|
|
if (set1check == iswupper && set2check == iswlower)
|
2015-01-11 14:29:27 -05:00
|
|
|
r = towlower((wint_t)r);
|
2015-01-11 18:03:48 -05:00
|
|
|
else if (set1check == iswlower && set2check == iswupper)
|
2015-01-11 14:29:27 -05:00
|
|
|
r = towupper((wint_t)r);
|
2015-01-11 18:03:48 -05:00
|
|
|
else if (set2ranges > 0)
|
|
|
|
r = set2[set2ranges - 1].end;
|
|
|
|
else
|
|
|
|
eprintf("Misaligned character classes.\n");
|
2015-01-11 14:26:20 -05:00
|
|
|
}
|
Rewrite tr(1) in a sane way
tr(1) always used to be a saddening part of sbase, which was
inherently broken and crufted.
But to be fair, the POSIX-standard doesn't make it very simple.
Given the current version was unfixable and broken by design, I
sat down and rewrote tr(1) very close to the concept of set theory
and the POSIX-standard with a few exceptions:
- UTF-8: not allowed in POSIX, but in my opinion a must. This
finally allows you to work with UTF-8 streams without
problems or unexpected behaviour.
- Equivalence classes: Left out, even GNU coreutils ignore them
and depending on LC_COLLATE, which sucks.
- Character classes: No experiments or environment-variable-trickery.
Just plain definitions derived from the POSIX-
standard, working as expected.
I tested this thoroughly, but expect problems to show up in some
way given the wide range of input this program has to handle.
The only thing left on the TODO is to add support for literal
expressions ('\n', '\t', '\001', ...) and probably rethinking
the way [_*n] is unnecessarily restricted to string2.
2015-01-09 14:36:27 -05:00
|
|
|
if (dflag && cflag)
|
|
|
|
goto read;
|
|
|
|
if (dflag && sflag && r == lastrune)
|
|
|
|
goto read;
|
|
|
|
write:
|
|
|
|
lastrune = r;
|
|
|
|
writerune("<stdout>", stdout, &r);
|
|
|
|
goto read;
|
2013-11-15 11:25:10 -05:00
|
|
|
}
|