Support NUL-containing lines in sort(1)
sort(1) needs memmem(), which I imported from OpenBSD. Inside sort(1), the changes consist of working with the explicit line lengths that getlines() already provides and rewriting some of the functions accordingly. Now we can handle NUL characters in the input just fine.
This commit is contained in:
parent
e4810f1cdb
commit
eb9bda8787
1
Makefile
1
Makefile
@ -58,6 +58,7 @@ LIBUTILSRC =\
|
|||||||
libutil/getlines.c\
|
libutil/getlines.c\
|
||||||
libutil/human.c\
|
libutil/human.c\
|
||||||
libutil/md5.c\
|
libutil/md5.c\
|
||||||
|
libutil/memmem.c\
|
||||||
libutil/mkdirp.c\
|
libutil/mkdirp.c\
|
||||||
libutil/mode.c\
|
libutil/mode.c\
|
||||||
libutil/parseoffset.c\
|
libutil/parseoffset.c\
|
||||||
|
2
README
2
README
@ -79,7 +79,7 @@ The following tools are implemented:
|
|||||||
0=* x sha512-224sum .
|
0=* x sha512-224sum .
|
||||||
0=* x sha512-256sum .
|
0=* x sha512-256sum .
|
||||||
0=*|o sleep .
|
0=*|o sleep .
|
||||||
#*|o sort .
|
0#*|o sort .
|
||||||
0=*|o split .
|
0=*|o split .
|
||||||
0=*|x sponge .
|
0=*|x sponge .
|
||||||
0#*|o strings .
|
0#*|o strings .
|
||||||
|
66
libutil/memmem.c
Normal file
66
libutil/memmem.c
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
/* $OpenBSD: memmem.c,v 1.4 2015/08/31 02:53:57 guenther Exp $ */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2005 Pascal Gloor <pascal.gloor@spale.com>
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* 3. The name of the author may not be used to endorse or promote
|
||||||
|
* products derived from this software without specific prior written
|
||||||
|
* permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||||
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#include "../util.h"
|
||||||
|
|
||||||
|
/*
 * Find the first occurrence of the byte string s in byte string l.
 *
 * Both arguments are treated as raw memory of the given lengths, so
 * embedded NUL bytes are matched like any other byte.
 *
 * l, l_len: haystack and its length in bytes
 * s, s_len: needle and its length in bytes
 *
 * Returns a pointer to the start of the match inside l, l itself when
 * s_len is 0, or NULL when s does not occur in l.
 */
void *
memmem(const void *l, size_t l_len, const void *s, size_t s_len)
{
	const char *cur, *last;
	const char *cl = l;
	const char *cs = s;

	/* a zero length needle should just return the haystack */
	if (s_len == 0)
		return (void *)cl;

	/* "s" must be smaller or equal to "l" */
	if (l_len < s_len)
		return NULL;

	/* special case where s_len == 1 */
	if (s_len == 1)
		return memchr(l, *cs, l_len);

	/* the last position where it's possible to find "s" in "l" */
	last = cl + l_len - s_len;

	/* cheap first-byte test before paying for the full memcmp() */
	for (cur = cl; cur <= last; cur++)
		if (cur[0] == cs[0] && memcmp(cur, cs, s_len) == 0)
			return (void *)cur;

	return NULL;
}
|
191
sort.c
191
sort.c
@ -33,119 +33,133 @@ static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead);
|
|||||||
static int Cflag = 0, cflag = 0, uflag = 0;
|
static int Cflag = 0, cflag = 0, uflag = 0;
|
||||||
static char *fieldsep = NULL;
|
static char *fieldsep = NULL;
|
||||||
static size_t fieldseplen = 0;
|
static size_t fieldseplen = 0;
|
||||||
static char *col1, *col2;
|
static struct linebufline col1, col2;
|
||||||
static size_t col1siz, col2siz;
|
|
||||||
|
|
||||||
static char *
|
static void
|
||||||
skipblank(char *s)
|
skipblank(struct linebufline *a)
|
||||||
{
|
{
|
||||||
while (*s == ' ' || *s == '\t')
|
while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) {
|
||||||
s++;
|
a->data++;
|
||||||
|
a->len--;
|
||||||
return s;
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static char *
|
static void
|
||||||
skipnonblank(char *s)
|
skipnonblank(struct linebufline *a)
|
||||||
{
|
{
|
||||||
while (*s && *s != '\n' && *s != ' ' && *s != '\t')
|
while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' &&
|
||||||
s++;
|
*(a->data) != '\t')) {
|
||||||
|
a->data++;
|
||||||
return s;
|
a->len--;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static char *
|
static void
|
||||||
skipcolumn(char *s, char *eol, int skip_to_next_col)
|
skipcolumn(struct linebufline *a, int skip_to_next_col)
|
||||||
{
|
{
|
||||||
|
char *s;
|
||||||
|
|
||||||
if (fieldsep) {
|
if (fieldsep) {
|
||||||
if ((s = strstr(s, fieldsep))) {
|
if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) {
|
||||||
if (skip_to_next_col)
|
if (skip_to_next_col) {
|
||||||
s += fieldseplen;
|
a->len = a->len - (s - a->data);
|
||||||
} else {
|
a->data = s;
|
||||||
s = eol;
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
s = skipblank(s);
|
a->data += a->len - 1;
|
||||||
s = skipnonblank(s);
|
a->len = 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
skipblank(a);
|
||||||
|
skipnonblank(a);
|
||||||
}
|
}
|
||||||
|
|
||||||
return s;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t
|
static size_t
|
||||||
columns(char *line, const struct keydef *kd, char **col, size_t *colsiz)
|
columns(struct linebufline *line, const struct keydef *kd, struct linebufline *col)
|
||||||
{
|
{
|
||||||
Rune r;
|
Rune r;
|
||||||
char *start, *end, *eol = strchr(line, '\n');
|
struct linebufline start, end;
|
||||||
size_t len, utflen, rlen;
|
size_t len, utflen, rlen;
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
for (i = 1, start = line; i < kd->start_column; i++)
|
start.data = line->data;
|
||||||
start = skipcolumn(start, eol, 1);
|
start.len = line->len;
|
||||||
|
for (i = 1; i < kd->start_column; i++)
|
||||||
|
skipcolumn(&start, 1);
|
||||||
if (kd->flags & MOD_STARTB)
|
if (kd->flags & MOD_STARTB)
|
||||||
start = skipblank(start);
|
skipblank(&start);
|
||||||
for (utflen = 0; start < eol && utflen < kd->start_char - 1;) {
|
for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) {
|
||||||
rlen = chartorune(&r, start);
|
rlen = chartorune(&r, start.data);
|
||||||
start += rlen;
|
start.data += rlen;
|
||||||
|
start.len -= rlen;
|
||||||
utflen++;
|
utflen++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
end.data = line->data;
|
||||||
|
end.len = line->len;
|
||||||
if (kd->end_column) {
|
if (kd->end_column) {
|
||||||
for (i = 1, end = line; i < kd->end_column; i++)
|
for (i = 1; i < kd->end_column; i++)
|
||||||
end = skipcolumn(end, eol, 1);
|
skipcolumn(&end, 1);
|
||||||
if (kd->flags & MOD_ENDB)
|
if (kd->flags & MOD_ENDB)
|
||||||
end = skipblank(end);
|
skipblank(&end);
|
||||||
if (kd->end_char) {
|
if (kd->end_char) {
|
||||||
for (utflen = 0; end < eol && utflen < kd->end_char;) {
|
for (utflen = 0; end.len > 1 && utflen < kd->end_char;) {
|
||||||
rlen = chartorune(&r, end);
|
rlen = chartorune(&r, end.data);
|
||||||
end += rlen;
|
end.data += rlen;
|
||||||
|
end.len -= rlen;
|
||||||
utflen++;
|
utflen++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
end = skipcolumn(end, eol, 0);
|
skipcolumn(&end, 0);
|
||||||
}
|
}
|
||||||
|
printf("end.data = '%s'\n", end.data);
|
||||||
} else {
|
} else {
|
||||||
end = eol;
|
end.data += end.len - 1;
|
||||||
|
end.len = 1;
|
||||||
}
|
}
|
||||||
len = (start > end) ? 0 : (end - start);
|
len = MAX(0, end.data - start.data);
|
||||||
if (!*col || *colsiz < len)
|
if (!(col->data) || col->len < len)
|
||||||
*col = erealloc(*col, len + 1);
|
col->data = erealloc(col->data, len + 1);
|
||||||
memcpy(*col, start, len);
|
memcpy(col->data, start.data, len);
|
||||||
(*col)[len] = '\0';
|
col->data[len] = '\0';
|
||||||
if (*colsiz < len)
|
if (col->len < len)
|
||||||
*colsiz = len;
|
col->len = len;
|
||||||
|
|
||||||
return len;
|
return len;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
skipmodcmp(const char *s1, const char *s2, int flags)
|
skipmodcmp(struct linebufline *a, struct linebufline *b, int flags)
|
||||||
{
|
{
|
||||||
Rune r1, r2;
|
Rune r1, r2;
|
||||||
|
size_t offa = 0, offb = 0;
|
||||||
|
|
||||||
do {
|
do {
|
||||||
s1 += chartorune(&r1, s1);
|
offa += chartorune(&r1, a->data + offa);
|
||||||
s2 += chartorune(&r2, s2);
|
offb += chartorune(&r2, b->data + offb);
|
||||||
|
|
||||||
if (flags & MOD_D && flags & MOD_I) {
|
if (flags & MOD_D && flags & MOD_I) {
|
||||||
while (*s1 && ((!isblankrune(r1) && !isalnumrune(r1)) ||
|
while (offa < a->len && ((!isblankrune(r1) &&
|
||||||
(!isprintrune(r1))))
|
!isalnumrune(r1)) || (!isprintrune(r1))))
|
||||||
s1 += chartorune(&r1, s1);
|
offa += chartorune(&r1, a->data + offa);
|
||||||
while (*s2 && ((!isblankrune(r2) && !isalnumrune(r2)) ||
|
while (offb < b->len && ((!isblankrune(r2) &&
|
||||||
(!isprintrune(r2))))
|
!isalnumrune(r2)) || (!isprintrune(r2))))
|
||||||
s2 += chartorune(&r2, s2);
|
offb += chartorune(&r2, b->data + offb);
|
||||||
}
|
}
|
||||||
else if (flags & MOD_D) {
|
else if (flags & MOD_D) {
|
||||||
while (*s1 && !isblankrune(r1) && !isalnumrune(r1))
|
while (offa < a->len && !isblankrune(r1) &&
|
||||||
s1 += chartorune(&r1, s1);
|
!isalnumrune(r1))
|
||||||
while (*s2 && !isblankrune(r2) && !isalnumrune(r2))
|
offa += chartorune(&r1, a->data + offa);
|
||||||
s2 += chartorune(&r2, s2);
|
while (offb < b->len && !isblankrune(r2) &&
|
||||||
|
!isalnumrune(r2))
|
||||||
|
offb += chartorune(&r2, b->data + offb);
|
||||||
}
|
}
|
||||||
else if (flags & MOD_I) {
|
else if (flags & MOD_I) {
|
||||||
while (*s1 && !isprintrune(r1))
|
while (offa < a->len && !isprintrune(r1))
|
||||||
s1 += chartorune(&r1, s1);
|
offa += chartorune(&r1, a->data + offa);
|
||||||
while (*s2 && !isprintrune(r2))
|
while (offb < b->len && !isprintrune(r2))
|
||||||
s2 += chartorune(&r2, s2);
|
offb += chartorune(&r2, b->data + offb);
|
||||||
}
|
}
|
||||||
if (flags & MOD_F) {
|
if (flags & MOD_F) {
|
||||||
r1 = toupperrune(r1);
|
r1 = toupperrune(r1);
|
||||||
@ -157,15 +171,15 @@ skipmodcmp(const char *s1, const char *s2, int flags)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
linecmp(const char **a, const char **b)
|
linecmp(struct linebufline *a, struct linebufline *b)
|
||||||
{
|
{
|
||||||
int res = 0;
|
int res = 0;
|
||||||
long double x, y;
|
long double x, y;
|
||||||
struct keydef *kd;
|
struct keydef *kd;
|
||||||
|
|
||||||
TAILQ_FOREACH(kd, &kdhead, entry) {
|
TAILQ_FOREACH(kd, &kdhead, entry) {
|
||||||
columns((char *)*a, kd, &col1, &col1siz);
|
columns(a, kd, &col1);
|
||||||
columns((char *)*b, kd, &col2, &col2siz);
|
columns(b, kd, &col2);
|
||||||
|
|
||||||
/* if -u is given, don't use default key definition
|
/* if -u is given, don't use default key definition
|
||||||
* unless it is the only one */
|
* unless it is the only one */
|
||||||
@ -173,13 +187,17 @@ linecmp(const char **a, const char **b)
|
|||||||
TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) {
|
TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) {
|
||||||
res = 0;
|
res = 0;
|
||||||
} else if (kd->flags & MOD_N) {
|
} else if (kd->flags & MOD_N) {
|
||||||
x = strtold(col1, NULL);
|
x = strtold(col1.data, NULL);
|
||||||
y = strtold(col2, NULL);
|
y = strtold(col2.data, NULL);
|
||||||
res = (x < y) ? -1 : (x > y);
|
res = (x < y) ? -1 : (x > y);
|
||||||
} else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
|
} else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
|
||||||
res = skipmodcmp(col1, col2, kd->flags);
|
res = skipmodcmp(&col1, &col2, kd->flags);
|
||||||
} else {
|
} else {
|
||||||
res = strcmp(col1, col2);
|
if (!(res = memcmp(col1.data, col2.data,
|
||||||
|
MIN(col1.len, col2.len)))) {
|
||||||
|
res += col1.data[MIN(col1.len, col2.len)] -
|
||||||
|
col2.data[MIN(col1.len, col2.len)];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (kd->flags & MOD_R)
|
if (kd->flags & MOD_R)
|
||||||
@ -194,20 +212,25 @@ linecmp(const char **a, const char **b)
|
|||||||
static int
|
static int
|
||||||
check(FILE *fp, const char *fname)
|
check(FILE *fp, const char *fname)
|
||||||
{
|
{
|
||||||
static struct { char *buf; size_t size; } prev, cur, tmp;
|
static struct linebufline prev, cur, tmp;
|
||||||
|
static size_t prevsize, cursize, tmpsize;
|
||||||
|
|
||||||
if (!prev.buf && getline(&prev.buf, &prev.size, fp) < 0)
|
if (!prev.data && (prev.len = getline(&prev.data, &prevsize, fp)) < 0)
|
||||||
eprintf("getline:");
|
eprintf("getline:");
|
||||||
while (getline(&cur.buf, &cur.size, fp) > 0) {
|
while ((cur.len = getline(&cur.data, &cursize, fp)) > 0) {
|
||||||
if (uflag > linecmp((const char **)&cur.buf,
|
if (uflag > linecmp(&cur, &prev)) {
|
||||||
(const char **)&prev.buf)) {
|
if (!Cflag) {
|
||||||
if (!Cflag)
|
weprintf("disorder %s: ", fname);
|
||||||
weprintf("disorder %s: %s", fname, cur.buf);
|
fwrite(cur.data, 1, cur.len, stderr);
|
||||||
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
tmp = cur;
|
tmp = cur;
|
||||||
|
tmpsize = cursize;
|
||||||
cur = prev;
|
cur = prev;
|
||||||
|
cursize = prevsize;
|
||||||
prev = tmp;
|
prev = tmp;
|
||||||
|
prevsize = tmpsize;
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
@ -345,7 +368,7 @@ main(int argc, char *argv[])
|
|||||||
break;
|
break;
|
||||||
case 't':
|
case 't':
|
||||||
fieldsep = EARGF(usage());
|
fieldsep = EARGF(usage());
|
||||||
fieldseplen = strlen(fieldsep);
|
fieldseplen = unescape(fieldsep);
|
||||||
break;
|
break;
|
||||||
case 'u':
|
case 'u':
|
||||||
uflag = 1;
|
uflag = 1;
|
||||||
@ -388,14 +411,14 @@ main(int argc, char *argv[])
|
|||||||
if (outfile && !(ofp = fopen(outfile, "w")))
|
if (outfile && !(ofp = fopen(outfile, "w")))
|
||||||
eprintf("fopen %s:", outfile);
|
eprintf("fopen %s:", outfile);
|
||||||
|
|
||||||
qsort(linebuf.lines, linebuf.nlines, sizeof *linebuf.lines,
|
qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines),
|
||||||
(int (*)(const void *, const void *))linecmp);
|
(int (*)(const void *, const void *))linecmp);
|
||||||
|
|
||||||
for (i = 0; i < linebuf.nlines; i++) {
|
for (i = 0; i < linebuf.nlines; i++) {
|
||||||
if (!uflag || i == 0 ||
|
if (!uflag || i == 0 ||
|
||||||
linecmp((const char **)&linebuf.lines[i],
|
linecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) {
|
||||||
(const char **)&linebuf.lines[i - 1])) {
|
fwrite(linebuf.lines[i].data, 1,
|
||||||
fputs(linebuf.lines[i], ofp);
|
linebuf.lines[i].len, ofp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
2
util.h
2
util.h
@ -76,3 +76,5 @@ long long enstrtonum(int, const char *, long long, long long);
|
|||||||
long long estrtonum(const char *, long long, long long);
|
long long estrtonum(const char *, long long, long long);
|
||||||
size_t unescape(char *);
|
size_t unescape(char *);
|
||||||
int mkdirp(const char *);
|
int mkdirp(const char *);
|
||||||
|
#undef memmem
|
||||||
|
void *memmem(const void *, size_t, const void *, size_t);
|
||||||
|
Loading…
Reference in New Issue
Block a user