Improved tr
- Added support for character ranges ( a-z ) - Added support for complementary charset ( -c ), only in delete mode - Added support for octal escape sequences - Unicode now only works when there are no octal escape sequences; otherwise the behavior is not predictable at first sight. - tr now supports null characters in the input - Does not yet have support for character classes ( [:upper:] )
This commit is contained in:
parent
8b3a9c1971
commit
b3a63a60e4
13
tr.1
13
tr.1
@ -3,7 +3,7 @@
|
||||
tr \- translate characters
|
||||
.SH SYNOPSIS
|
||||
.B tr
|
||||
.RB [ \-d ]
|
||||
.RB [ \-d ] [ \-c ]
|
||||
.RB set1
|
||||
.P
|
||||
.B tr
|
||||
@ -13,6 +13,9 @@ tr \- translate characters
|
||||
.TP
|
||||
.B \-d
|
||||
For compatibility. If given, characters in set1 will be deleted from the input and specifying set2 will result in an error.
|
||||
.B \-c
|
||||
Complement. Causes the specified character set to be inverted, that is, the set then consists of all the characters that were not specified.
|
||||
It only works in conjunction with \-d, because the ordering of a complemented set doesn't make much sense for translation.
|
||||
.SH DESCRIPTION
|
||||
.B tr
|
||||
reads input from stdin replacing every character in
|
||||
@ -50,9 +53,15 @@ If set1 is longer than set2
|
||||
.B tr
|
||||
will map all the remaining characters to the last one in set2. In case set2 is longer than set1, the remaining characters from set2 will be ignored.
|
||||
.B
|
||||
Escape sequences, whether for characters or octal numbers, are written by preceding the token with a "\\". An octal escape may have three digits or fewer;
|
||||
digits stop being read at the first non-octal character or once three digits have been read.
|
||||
.B
|
||||
Use "A-B" for ordered sets from A to B.
|
||||
.B
|
||||
.SH NOTES
|
||||
.B tr
|
||||
is Unicode-aware but does not yet handle character classes (e.g. [:alnum:] or [:digit:]).
|
||||
is Unicode-aware, but only if you don't specify characters in octal (for example \\012), because otherwise the behavior is not predictable. It does not support character
|
||||
classes.
|
||||
.SH SEE ALSO
|
||||
.IR sed(1)
|
||||
.IR awk(1)
|
||||
|
340
tr.c
340
tr.c
@ -3,7 +3,6 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/mman.h>
|
||||
#include <locale.h>
|
||||
#include <wchar.h>
|
||||
#include "text.h"
|
||||
@ -12,135 +11,316 @@
|
||||
static void
|
||||
usage(void)
|
||||
{
|
||||
eprintf("usage: %s [-d] set1 [set2]\n", argv0);
|
||||
eprintf("usage: %s [-d] [-c] set1 [set2]\n", argv0);
|
||||
}
|
||||
|
||||
static int dflag, cflag;
|
||||
static wchar_t mappings[0x110000];
|
||||
|
||||
struct wset_state {
|
||||
char *s; /* current character */
|
||||
wchar_t rfirst, rlast; /* first and last in range */
|
||||
wchar_t prev; /* previous returned character */
|
||||
int prev_was_range; /* was the previous character part of a c-c range? */
|
||||
};
|
||||
|
||||
struct set_state {
|
||||
char *s, rfirst, rlast, prev;
|
||||
int prev_was_octal; /* was the previous returned character written in octal? */
|
||||
};
|
||||
|
||||
static void
|
||||
set_state_defaults(struct set_state *s)
|
||||
{
|
||||
s->rfirst = 1;
|
||||
s->rlast = 0;
|
||||
s->prev_was_octal = 1;
|
||||
}
|
||||
|
||||
static void
|
||||
handleescapes(char *s)
|
||||
wset_state_defaults(struct wset_state *s)
|
||||
{
|
||||
s->rfirst = 1;
|
||||
s->rlast = 0;
|
||||
s->prev_was_range = 1;
|
||||
}
|
||||
|
||||
/* sets *s to the char that was intended to be written.
|
||||
* returns how many bytes the s pointer has to advance to skip the
|
||||
* escape sequence if it was an octal, always zero otherwise. */
|
||||
static int
|
||||
resolve_escape(char *s)
|
||||
{
|
||||
int i;
|
||||
unsigned char c;
|
||||
|
||||
switch(*s) {
|
||||
case 'n':
|
||||
*s = '\n';
|
||||
break;
|
||||
return 0;
|
||||
case 't':
|
||||
*s = '\t';
|
||||
break;
|
||||
case '\\':
|
||||
*s = '\\';
|
||||
break;
|
||||
return 0;
|
||||
case 'r':
|
||||
*s = '\r';
|
||||
break;
|
||||
return 0;
|
||||
case 'f':
|
||||
*s = '\f';
|
||||
break;
|
||||
return 0;
|
||||
case 'a':
|
||||
*s = '\a';
|
||||
break;
|
||||
return 0;
|
||||
case 'b':
|
||||
*s = '\b';
|
||||
break;
|
||||
return 0;
|
||||
case 'v':
|
||||
*s = '\v';
|
||||
break;
|
||||
return 0;
|
||||
case '\\':
|
||||
*s = '\\';
|
||||
return 0;
|
||||
case '\0':
|
||||
eprintf("stray '\\' at end of input:");
|
||||
default: ;
|
||||
}
|
||||
|
||||
if(*s<'0' || *s>'7')
|
||||
eprintf("invalid character after '\\':");
|
||||
for(i=0, c=0; s[i]>='0' && s[i]<='7' && i<3; i++) {
|
||||
c <<= 3;
|
||||
c += s[i]-'0';
|
||||
}
|
||||
if(*s>'3' && i==3)
|
||||
eprintf("octal byte cannot be bigger than 377:");
|
||||
*s = c;
|
||||
return i;
|
||||
}
|
||||
|
||||
#define embtowc(a, b) mbtowc(a, b, 4)
|
||||
|
||||
static int
|
||||
xmbtowc(wchar_t *unicodep, const char *s)
|
||||
{
|
||||
int rv;
|
||||
|
||||
rv = mbtowc(unicodep, s, 4);
|
||||
rv = embtowc(unicodep, s);
|
||||
if (rv < 0)
|
||||
eprintf("mbtowc:");
|
||||
eprintf("mbtowc: invalid input sequence:");
|
||||
return rv;
|
||||
}
|
||||
|
||||
static void
|
||||
parsemapping(const char *set1, const char *set2, wchar_t *mappings)
|
||||
static int
|
||||
has_octal_escapes(const char *s)
|
||||
{
|
||||
char *s1, *s2;
|
||||
wchar_t runeleft;
|
||||
wchar_t runeright;
|
||||
int leftbytes;
|
||||
int rightbytes;
|
||||
while(*s)
|
||||
if(*s++ == '\\' && *s >= '0' && *s <= '7')
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
s1 = (char *)set1;
|
||||
if(set2)
|
||||
s2 = (char *)set2;
|
||||
else
|
||||
s2 = (char *)set1;
|
||||
static char
|
||||
get_next_char(struct set_state *s)
|
||||
{
|
||||
char c;
|
||||
int nchars;
|
||||
|
||||
while(*s1) {
|
||||
if(*s1 == '\\')
|
||||
handleescapes(++s1);
|
||||
leftbytes = xmbtowc(&runeleft, s1);
|
||||
s1 += leftbytes;
|
||||
if(*s2 == '\\')
|
||||
handleescapes(++s2);
|
||||
if(*s2 != '\0') {
|
||||
rightbytes = xmbtowc(&runeright, s2);
|
||||
s2 += rightbytes;
|
||||
start:
|
||||
if(s->rfirst <= s->rlast) {
|
||||
c = s->rfirst;
|
||||
s->rfirst++;
|
||||
return c;
|
||||
}
|
||||
mappings[runeleft] = runeright;
|
||||
|
||||
if(*s->s == '-' && !s->prev_was_octal) {
|
||||
s->s++;
|
||||
if(!*s->s)
|
||||
return '-';
|
||||
if(*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
|
||||
goto char_is_octal;
|
||||
s->rlast = *(s->s)++;
|
||||
if(!s->rlast)
|
||||
return '\0';
|
||||
s->prev_was_octal = 1;
|
||||
s->rfirst = ++(s->prev);
|
||||
goto start;
|
||||
}
|
||||
if(*s->s == '\\' && (nchars = resolve_escape(++(s->s))))
|
||||
goto char_is_octal;
|
||||
|
||||
s->prev_was_octal = 0;
|
||||
c = *(s->s)++;
|
||||
s->prev = c;
|
||||
return c;
|
||||
|
||||
char_is_octal:
|
||||
s->prev_was_octal = 1;
|
||||
c = *s->s;
|
||||
s->s += nchars;
|
||||
return c;
|
||||
}
|
||||
|
||||
static wchar_t
|
||||
get_next_wchar(struct wset_state *s)
|
||||
{
|
||||
start:
|
||||
if(s->rfirst <= s->rlast) {
|
||||
s->prev = s->rfirst;
|
||||
s->rfirst++;
|
||||
return s->prev;
|
||||
}
|
||||
|
||||
if(*s->s == '-' && !s->prev_was_range) {
|
||||
s->s++;
|
||||
if(!*s->s)
|
||||
return '-';
|
||||
if(*s->s == '\\')
|
||||
resolve_escape(++(s->s));
|
||||
s->s += xmbtowc(&s->rlast, s->s);
|
||||
if(!s->rlast)
|
||||
return '\0';
|
||||
s->rfirst = ++(s->prev);
|
||||
s->prev_was_range = 1;
|
||||
goto start;
|
||||
}
|
||||
|
||||
if(*s->s == '\\')
|
||||
resolve_escape(++(s->s));
|
||||
s->s += xmbtowc(&s->prev, s->s);
|
||||
s->prev_was_range = 0;
|
||||
return s->prev;
|
||||
}
|
||||
|
||||
static int
|
||||
is_mapping_wide(const char *set1, const char *set2)
|
||||
{
|
||||
struct set_state ss1, ss2;
|
||||
struct wset_state wss1, wss2;
|
||||
wchar_t wc1, wc2, last_wc2;
|
||||
|
||||
if(has_octal_escapes(set1)) {
|
||||
set_state_defaults(&ss1);
|
||||
ss1.s = (char *) set1;
|
||||
if(set2) {
|
||||
set_state_defaults(&ss2);
|
||||
ss2.s = (char *) set2;
|
||||
/* if the character returned is from an octal triplet, it might be null
|
||||
and still need to continue */
|
||||
while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal ) {
|
||||
if(!(wc2 = (unsigned char) get_next_char(&ss2)))
|
||||
wc2 = last_wc2;
|
||||
mappings[wc1] = wc2;
|
||||
last_wc2 = wc2;
|
||||
}
|
||||
} else {
|
||||
while((wc1 = (unsigned char) get_next_char(&ss1)) || ss1.prev_was_octal)
|
||||
mappings[wc1] = 1;
|
||||
}
|
||||
return 0;
|
||||
} else {
|
||||
wset_state_defaults(&wss1);
|
||||
wss1.s = (char *) set1;
|
||||
if(set2) {
|
||||
wset_state_defaults(&wss2);
|
||||
wss2.s = (char *) set2;
|
||||
while((wc1 = get_next_wchar(&wss1))) {
|
||||
if(!(wc2 = get_next_wchar(&wss2)))
|
||||
wc2 = last_wc2;
|
||||
mappings[wc1] = wc2;
|
||||
last_wc2 = wc2;
|
||||
}
|
||||
} else {
|
||||
while((wc1 = get_next_wchar(&wss1)))
|
||||
mappings[wc1] = 1;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
return 0; /* unreachable */
|
||||
}
|
||||
|
||||
static void
|
||||
wmap_null(char *in, ssize_t nbytes)
|
||||
{
|
||||
char *s;
|
||||
wchar_t rune;
|
||||
int parsed_bytes = 0;
|
||||
|
||||
s = in;
|
||||
while(nbytes) {
|
||||
parsed_bytes = embtowc(&rune, s);
|
||||
if(parsed_bytes < 0) {
|
||||
rune = *s;
|
||||
parsed_bytes = 1;
|
||||
}
|
||||
if(((!mappings[rune])&1) ^ cflag)
|
||||
putwchar(rune);
|
||||
s += parsed_bytes;
|
||||
nbytes -= parsed_bytes;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
maptonull(const wchar_t *mappings, char *in)
|
||||
wmap_set(char *in, ssize_t nbytes)
|
||||
{
|
||||
const char *s;
|
||||
wchar_t runeleft;
|
||||
int leftbytes = 0;
|
||||
char *s;
|
||||
wchar_t rune;
|
||||
int parsed_bytes = 0;
|
||||
|
||||
s = in;
|
||||
while(*s) {
|
||||
leftbytes = xmbtowc(&runeleft, s);
|
||||
if(!mappings[runeleft])
|
||||
putwchar(runeleft);
|
||||
s += leftbytes;
|
||||
while(nbytes) {
|
||||
parsed_bytes = embtowc(&rune, s);
|
||||
if(parsed_bytes < 0) {
|
||||
rune = *s;
|
||||
parsed_bytes = 1;
|
||||
}
|
||||
if(!mappings[rune])
|
||||
putwchar(rune);
|
||||
else
|
||||
putwchar(mappings[rune]);
|
||||
nbytes -= parsed_bytes;
|
||||
s += parsed_bytes;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
maptoset(const wchar_t *mappings, char *in)
|
||||
map_null(char *in, ssize_t nbytes)
|
||||
{
|
||||
const char *s;
|
||||
wchar_t runeleft;
|
||||
int leftbytes = 0;
|
||||
char *s;
|
||||
|
||||
s = in;
|
||||
while(*s) {
|
||||
leftbytes = xmbtowc(&runeleft, s);
|
||||
if(!mappings[runeleft])
|
||||
putwchar(runeleft);
|
||||
for(s=in; nbytes; s++, nbytes--)
|
||||
if(((!mappings[(unsigned char)*s])&1) ^ cflag)
|
||||
putchar(*s);
|
||||
}
|
||||
|
||||
static void
|
||||
map_set(char *in, ssize_t nbytes)
|
||||
{
|
||||
char *s;
|
||||
|
||||
for(s=in; nbytes; s++, nbytes--)
|
||||
if(!mappings[(unsigned char)*s])
|
||||
putchar(*s);
|
||||
else
|
||||
putwchar(mappings[runeleft]);
|
||||
s += leftbytes;
|
||||
}
|
||||
putchar(mappings[(unsigned char)*s]);
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char *argv[])
|
||||
{
|
||||
wchar_t *mappings;
|
||||
char *buf = NULL;
|
||||
size_t size = 0;
|
||||
void (*mapfunc)(const wchar_t*, char*);
|
||||
int dflag = 0;
|
||||
ssize_t nbytes;
|
||||
void (*mapfunc)(char*, ssize_t);
|
||||
|
||||
setlocale(LC_ALL, "");
|
||||
|
||||
mappings = mmap(NULL, 0x110000 * sizeof(wchar_t),
|
||||
PROT_READ|PROT_WRITE, MAP_ANON|MAP_PRIVATE, -1, 0);
|
||||
if (mappings == MAP_FAILED)
|
||||
eprintf("mmap:");
|
||||
dflag = cflag = 0;
|
||||
|
||||
ARGBEGIN {
|
||||
case 'd':
|
||||
dflag = 1;
|
||||
break;
|
||||
case 'c':
|
||||
cflag = 1;
|
||||
break;
|
||||
default:
|
||||
usage();
|
||||
} ARGEND;
|
||||
@ -148,25 +328,29 @@ main(int argc, char *argv[])
|
||||
if(argc == 0)
|
||||
usage();
|
||||
|
||||
if(dflag || argc == 1) {
|
||||
if(dflag) {
|
||||
if(argc != 1)
|
||||
usage();
|
||||
parsemapping(argv[0], NULL, mappings);
|
||||
mapfunc = maptonull;
|
||||
} else {
|
||||
if(argc != 2)
|
||||
if(is_mapping_wide(argv[0], NULL))
|
||||
mapfunc = wmap_null;
|
||||
else
|
||||
mapfunc = map_null;
|
||||
} else if(cflag) {
|
||||
usage();
|
||||
} else if(argc == 2) {
|
||||
if(is_mapping_wide(argv[0], argv[1]))
|
||||
mapfunc = wmap_set;
|
||||
else
|
||||
mapfunc = map_set;
|
||||
} else {
|
||||
usage();
|
||||
parsemapping(argv[0], argv[1], mappings);
|
||||
mapfunc = maptoset;
|
||||
}
|
||||
|
||||
while(agetline(&buf, &size, stdin) != -1)
|
||||
mapfunc(mappings, buf);
|
||||
while((nbytes = agetline(&buf, &size, stdin)) != -1)
|
||||
mapfunc(buf, nbytes);
|
||||
free(buf);
|
||||
if(ferror(stdin))
|
||||
eprintf("<stdin>: read error:");
|
||||
|
||||
munmap(mappings, 0x110000 * sizeof(wchar_t));
|
||||
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user