sbase/sort.c
FRIGN eb9bda8787 Support NUL-containing lines in sort(1)
For sort(1) we need memmem(), which I imported from OpenBSD.
Inside sort(1), the changes involved working with the explicit lengths
given by getlines() earlier and rewriting some of the functions.

Now we can handle NUL-characters in the input just fine.
2016-03-10 08:48:09 +00:00

432 lines
9.2 KiB
C

/* See LICENSE file for copyright and license details. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "queue.h"
#include "text.h"
#include "utf.h"
#include "util.h"
/*
 * One sort key, as given by -k or synthesized in main().
 * Key definitions are chained in the kdhead list.
 */
struct keydef {
/* field in which the key starts; addkeydef() rejects values below 1 */
int start_column;
/* field in which the key ends; 0 means end of line */
int end_column;
/* character within the start field at which the key starts; at least 1 */
int start_char;
/* character within the end field at which the key ends; 0 means end of column */
int end_char;
/* bitwise OR of MOD_* modifiers applying to this key */
int flags;
/* list linkage */
TAILQ_ENTRY(keydef) entry;
};
/* key modifier bits stored in keydef.flags */
enum {
/* n: linecmp() compares the keys as numbers parsed with strtold() */
MOD_N = 1 << 0,
/* b on the start position: columns() skips blanks before the key start */
MOD_STARTB = 1 << 1,
/* b on the end position: columns() skips blanks before locating the key end */
MOD_ENDB = 1 << 2,
/* r: linecmp() negates the comparison result */
MOD_R = 1 << 3,
/* d: skipmodcmp() skips runes that are neither blank nor alphanumeric */
MOD_D = 1 << 4,
/* f: skipmodcmp() maps runes through toupperrune() before comparing */
MOD_F = 1 << 5,
/* i: skipmodcmp() skips non-printable runes */
MOD_I = 1 << 6,
};
/* list of key definitions; linecmp() walks it in order */
static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead);
/* set to 1 in main() by the -C, -c and -u options respectively */
static int Cflag = 0, cflag = 0, uflag = 0;
/* argument of -t, or NULL when no -t was given */
static char *fieldsep = NULL;
/* value returned by unescape(fieldsep); used as the needle length for memmem() */
static size_t fieldseplen = 0;
/* scratch buffers that columns() fills with the keys of the two lines being compared */
static struct linebufline col1, col2;
/*
 * Advance the cursor `a` over leading spaces and tabs, shrinking its
 * remaining length by the number of bytes skipped.
 */
static void
skipblank(struct linebufline *a)
{
	for (; a->len; a->data++, a->len--) {
		if (*a->data != ' ' && *a->data != '\t')
			break;
	}
}
/*
 * Advance the cursor `a` until it points at a newline, space or tab,
 * or until no bytes remain.
 */
static void
skipnonblank(struct linebufline *a)
{
	while (a->len) {
		if (*a->data == '\n' || *a->data == ' ' || *a->data == '\t')
			return;
		a->data++;
		a->len--;
	}
}
/*
 * Move the cursor `a` across one field.
 *
 * With a field separator (-t):
 *   - if a separator is found in the remaining bytes, the cursor is
 *     moved onto it; when skip_to_next_col is non-zero the separator
 *     itself is skipped too, so the cursor ends up at the first byte
 *     of the following field;
 *   - if no separator is left, the cursor is moved to the last
 *     remaining byte and the length is set to 1.
 * Without a separator, leading blanks and then the following run of
 * non-blank bytes are skipped; skip_to_next_col is not consulted.
 */
static void
skipcolumn(struct linebufline *a, int skip_to_next_col)
{
	char *s;

	if (!fieldsep) {
		skipblank(a);
		skipnonblank(a);
		return;
	}

	if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) {
		/* stepping over the separator is what lets repeated calls
		 * make progress from field to field */
		if (skip_to_next_col)
			s += fieldseplen;
		a->len -= s - a->data;
		a->data = s;
	} else if (a->len) {
		/* a->len - 1 would wrap around for an empty remainder */
		a->data += a->len - 1;
		a->len = 1;
	}
}
/*
 * Extract the key described by `kd` from `line` into `col`.
 *
 * The key start is found by skipping kd->start_column - 1 fields,
 * optionally blanks (MOD_STARTB), and kd->start_char - 1 characters.
 * The key end is found the same way from kd->end_column/kd->end_char;
 * an end_column of 0 puts the end at the last byte of the line, an
 * end_char of 0 puts it at the end of the end field.
 *
 * col->data is (re)allocated as needed, receives a copy of the key
 * followed by a terminating NUL, and col->len is set to the key length.
 *
 * Returns the key length in bytes (0 if the end lies before the start).
 */
static size_t
columns(struct linebufline *line, const struct keydef *kd, struct linebufline *col)
{
	Rune r;
	struct linebufline start, end;
	size_t len, utflen, rlen;
	int i;

	/* locate the first byte of the key */
	start.data = line->data;
	start.len = line->len;
	for (i = 1; i < kd->start_column; i++)
		skipcolumn(&start, 1);
	if (kd->flags & MOD_STARTB)
		skipblank(&start);
	for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1; utflen++) {
		rlen = chartorune(&r, start.data);
		start.data += rlen;
		start.len -= rlen;
	}

	/* locate the byte just past the key */
	end.data = line->data;
	end.len = line->len;
	if (kd->end_column) {
		for (i = 1; i < kd->end_column; i++)
			skipcolumn(&end, 1);
		if (kd->flags & MOD_ENDB)
			skipblank(&end);
		if (kd->end_char) {
			for (utflen = 0; end.len > 1 && utflen < kd->end_char; utflen++) {
				rlen = chartorune(&r, end.data);
				end.data += rlen;
				end.len -= rlen;
			}
		} else {
			skipcolumn(&end, 0);
		}
	} else {
		end.data += end.len - 1;
		end.len = 1;
	}

	len = (end.data > start.data) ? (size_t)(end.data - start.data) : 0;

	/*
	 * col->len is the length of the key currently stored, so the
	 * buffer always holds at least col->len + 1 bytes and only needs
	 * to grow when the new key is longer than the stored one.
	 */
	if (!col->data || col->len < len)
		col->data = erealloc(col->data, len + 1);
	memcpy(col->data, start.data, len);
	col->data[len] = '\0';
	/* record the real key length; callers compare col->len bytes */
	col->len = len;

	return len;
}
/*
 * Tell whether rune `r` is skipped under the given key modifiers:
 * with MOD_D runes that are neither blank nor alphanumeric are skipped,
 * with MOD_I non-printable runes are skipped.
 */
static int
ignoredrune(Rune r, int flags)
{
	if ((flags & MOD_D) && !isblankrune(r) && !isalnumrune(r))
		return 1;
	if ((flags & MOD_I) && !isprintrune(r))
		return 1;
	return 0;
}

/*
 * Compare keys `a` and `b` rune by rune, honouring the MOD_D, MOD_I
 * and MOD_F modifiers in `flags`.
 *
 * Each round decodes one rune from each key, skips over ignored runes
 * while bytes remain, optionally maps both runes through toupperrune(),
 * and continues while the runes are equal and non-zero.
 *
 * Returns the difference of the first pair of runes that differ, or of
 * the final pair when a zero rune ends the loop.
 */
static int
skipmodcmp(struct linebufline *a, struct linebufline *b, int flags)
{
	Rune ra, rb;
	size_t posa = 0, posb = 0;

	for (;;) {
		posa += chartorune(&ra, a->data + posa);
		posb += chartorune(&rb, b->data + posb);

		while (posa < a->len && ignoredrune(ra, flags))
			posa += chartorune(&ra, a->data + posa);
		while (posb < b->len && ignoredrune(rb, flags))
			posb += chartorune(&rb, b->data + posb);

		if (flags & MOD_F) {
			ra = toupperrune(ra);
			rb = toupperrune(rb);
		}

		if (!ra || ra != rb)
			break;
	}

	return ra - rb;
}
static int
linecmp(struct linebufline *a, struct linebufline *b)
{
int res = 0;
long double x, y;
struct keydef *kd;
TAILQ_FOREACH(kd, &kdhead, entry) {
columns(a, kd, &col1);
columns(b, kd, &col2);
/* if -u is given, don't use default key definition
* unless it is the only one */
if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) &&
TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) {
res = 0;
} else if (kd->flags & MOD_N) {
x = strtold(col1.data, NULL);
y = strtold(col2.data, NULL);
res = (x < y) ? -1 : (x > y);
} else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
res = skipmodcmp(&col1, &col2, kd->flags);
} else {
if (!(res = memcmp(col1.data, col2.data,
MIN(col1.len, col2.len)))) {
res += col1.data[MIN(col1.len, col2.len)] -
col2.data[MIN(col1.len, col2.len)];
}
}
if (kd->flags & MOD_R)
res = -res;
if (res)
break;
}
return res;
}
/*
 * Verify that the lines read from `fp` are in sorted order (-c / -C).
 *
 * The previous line is kept in static storage across calls, so the
 * first line of a file is also compared against the last line of the
 * file checked before it.
 *
 * With -u (uflag) two lines comparing equal count as disorder as well.
 * Unless -C was given, the offending line is reported on stderr.
 *
 * Returns 1 on the first disorder found, 0 otherwise (including for
 * empty input).
 */
static int
check(FILE *fp, const char *fname)
{
	static struct linebufline prev, cur;
	static size_t prevsize, cursize;
	struct linebufline tmp;
	size_t tmpsize;
	ssize_t len;

	/*
	 * getline() reports EOF/error as -1; keep the result in a signed
	 * variable, since storing it in the unsigned len member first
	 * would turn -1 into a huge positive length.
	 */
	if (!prev.data) {
		if ((len = getline(&prev.data, &prevsize, fp)) < 0) {
			/* errors exit with status 2; 1 means "disorder" */
			if (ferror(fp))
				enprintf(2, "getline:");
			/* empty input is in order; drop the buffer so that
			 * the next call primes prev again */
			free(prev.data);
			prev.data = NULL;
			prevsize = 0;
			return 0;
		}
		prev.len = len;
	}

	while ((len = getline(&cur.data, &cursize, fp)) > 0) {
		cur.len = len;
		if (uflag > linecmp(&cur, &prev)) {
			if (!Cflag) {
				weprintf("disorder %s: ", fname);
				fwrite(cur.data, 1, cur.len, stderr);
			}
			return 1;
		}
		/* swap buffers: cur becomes prev, the old prev is reused */
		tmp = cur;
		tmpsize = cursize;
		cur = prev;
		cursize = prevsize;
		prev = tmp;
		prevsize = tmpsize;
	}

	return 0;
}
/*
 * Parse a run of key modifier letters at *s and OR the matching MOD_*
 * bits into *flags.
 *
 * s:     cursor into the key definition; advanced past every letter
 *        consumed (including an invalid one)
 * flags: modifier set to update
 * bflag: bit to set for the letter 'b' (MOD_STARTB or MOD_ENDB,
 *        depending on which half of the key definition is parsed)
 *
 * Returns 0 on success, -1 if an unknown letter is encountered.
 */
static int
parse_flags(char **s, int *flags, int bflag)
{
	/* <ctype.h> functions require a value representable as unsigned
	 * char; a plain char may be negative */
	while (isalpha((unsigned char)**s)) {
		switch (*((*s)++)) {
		case 'b':
			*flags |= bflag;
			break;
		case 'd':
			*flags |= MOD_D;
			break;
		case 'f':
			*flags |= MOD_F;
			break;
		case 'i':
			*flags |= MOD_I;
			break;
		case 'n':
			*flags |= MOD_N;
			break;
		case 'r':
			*flags |= MOD_R;
			break;
		default:
			return -1;
		}
	}

	return 0;
}
/*
 * Parse the key definition `kdstr` and append it to the kdhead list.
 *
 * Accepted format:
 *   start_column[.start_char][flags][,end_column[.end_char][flags]]
 *
 * `flags` is the initial modifier set; modifier letters found in the
 * definition are added to it. Any parse error is reported through
 * enprintf() with status 2.
 */
static void
addkeydef(char *kdstr, int flags)
{
	struct keydef *kd = enmalloc(2, sizeof(*kd));
	char *p = kdstr;

	/* defaults for everything the definition may leave out */
	kd->start_char = 1;
	kd->end_column = 0; /* 0 means end of line */
	kd->end_char = 0; /* 0 means end of column */
	kd->flags = flags;

	/* start position */
	kd->start_column = strtol(p, &p, 10);
	if (kd->start_column < 1)
		enprintf(2, "invalid start column in key definition\n");
	if (*p == '.') {
		kd->start_char = strtol(p + 1, &p, 10);
		if (kd->start_char < 1)
			enprintf(2, "invalid start character in key definition\n");
	}
	if (parse_flags(&p, &kd->flags, MOD_STARTB) < 0)
		enprintf(2, "invalid start flags in key definition\n");

	/* optional end position */
	if (*p == ',') {
		kd->end_column = strtol(p + 1, &p, 10);
		if (kd->end_column < 0)
			enprintf(2, "invalid end column in key definition\n");
		if (*p == '.') {
			kd->end_char = strtol(p + 1, &p, 10);
			if (kd->end_char < 0)
				enprintf(2, "invalid end character in key definition\n");
		}
		if (parse_flags(&p, &kd->flags, MOD_ENDB) < 0)
			enprintf(2, "invalid end flags in key definition\n");
	}

	/* nothing may follow a complete definition */
	if (*p != '\0')
		enprintf(2, "invalid key definition\n");

	TAILQ_INSERT_TAIL(&kdhead, kd, entry);
}
/* Report the command synopsis through enprintf() with status 2. */
static void
usage(void)
{
	enprintf(2,
	         "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] [-k def]... [file ...]\n",
	         argv0);
}
/*
 * sort [-Cbcdfimnru] [-o outfile] [-t delim] [-k def]... [file ...]
 *
 * Parses the options, builds the list of key definitions, then either
 * checks the input for order (-c / -C) or reads all lines, sorts them
 * with qsort() and writes them to stdout or to the -o file.
 *
 * Returns 0 on success, 1 if -c / -C found disorder, 2 if closing a
 * stream failed.
 */
int
main(int argc, char *argv[])
{
	FILE *fp, *ofp = stdout;
	struct linebuf linebuf = EMPTY_LINEBUF;
	size_t i;
	int global_flags = 0, ret = 0;
	char *outfile = NULL;

	ARGBEGIN {
	case 'C':
		Cflag = 1;
		break;
	case 'b':
		global_flags |= MOD_STARTB | MOD_ENDB;
		break;
	case 'c':
		cflag = 1;
		break;
	case 'd':
		global_flags |= MOD_D;
		break;
	case 'f':
		global_flags |= MOD_F;
		break;
	case 'i':
		global_flags |= MOD_I;
		break;
	case 'k':
		/* a key inherits the global modifiers seen so far */
		addkeydef(EARGF(usage()), global_flags);
		break;
	case 'm':
		/* more or less for free, but for performance-reasons,
		 * we should keep this flag in mind and maybe some later
		 * day implement it properly so we don't run out of memory
		 * while merging large sorted files.
		 */
		break;
	case 'n':
		global_flags |= MOD_N;
		break;
	case 'o':
		outfile = EARGF(usage());
		break;
	case 'r':
		global_flags |= MOD_R;
		break;
	case 't':
		fieldsep = EARGF(usage());
		fieldseplen = unescape(fieldsep);
		break;
	case 'u':
		uflag = 1;
		break;
	default:
		usage();
	} ARGEND

	/* -b shall only apply to custom key definitions */
	if (TAILQ_EMPTY(&kdhead) && global_flags)
		addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB));
	/* default key: the whole line, honouring only -r */
	addkeydef("1", global_flags & MOD_R);

	if (!argc) {
		if (Cflag || cflag) {
			if (check(stdin, "<stdin>") && !ret)
				ret = 1;
		} else {
			getlines(stdin, &linebuf);
		}
	} else for (; *argv; argc--, argv++) {
		if (!strcmp(*argv, "-")) {
			*argv = "<stdin>";
			fp = stdin;
		} else if (!(fp = fopen(*argv, "r"))) {
			enprintf(2, "fopen %s:", *argv);
			continue;
		}
		if (Cflag || cflag) {
			if (check(fp, *argv) && !ret)
				ret = 1;
		} else {
			getlines(fp, &linebuf);
		}
		if (fp != stdin && fshut(fp, *argv))
			ret = 2;
	}

	if (!Cflag && !cflag) {
		/* the output file is opened only after all input was read;
		 * failing to open it is an error, hence status 2 and not 1,
		 * which is reserved for "disorder" */
		if (outfile && !(ofp = fopen(outfile, "w")))
			enprintf(2, "fopen %s:", outfile);
		qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines),
		      (int (*)(const void *, const void *))linecmp);
		for (i = 0; i < linebuf.nlines; i++) {
			/* with -u, print a line only if it differs from
			 * its predecessor */
			if (!uflag || i == 0 ||
			    linecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) {
				fwrite(linebuf.lines[i].data, 1,
				       linebuf.lines[i].len, ofp);
			}
		}
		/* the -o stream is not stdout, so it needs its own
		 * fshut() for write errors to be noticed */
		if (ofp != stdout && fshut(ofp, outfile))
			ret = 2;
	}

	if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") |
	    fshut(stderr, "<stderr>"))
		ret = 2;

	return ret;
}