sbase/sort.c
Michael Forney a944b682a6 sort: Fix line comparison when col buffer contains data from longer line
I'm not sure if there are other implications of this or not, but
the issue is that columns() uses len to store the allocated buffer
size, but linecmp() compares up to len bytes. If those trailing
bytes do not match, the line is considered not matching, even though
the relevant parts of the buffer do match.

To resolve this, also keep track of column capacity. Additionally,
since there is no reason to keep the existing data when resizing,
just use free and emalloc rather than erealloc.

The simplest case I could reduce it to is this:

if [ "$(printf '%s\n' a a xxb xxc | ./sort -u)" = "$(printf '%s\n' a xxb xxc)" ] ; then
	echo pass
else
	echo fail
fi
2016-07-09 10:09:50 +01:00

438 lines
9.2 KiB
C

/* See LICENSE file for copyright and license details. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "queue.h"
#include "text.h"
#include "utf.h"
#include "util.h"
/*
 * One sort key definition (from -k, or a default added by main()).
 * Describes which part of a line is compared and with which modifiers.
 */
struct keydef {
/* field in which the key starts; addkeydef() rejects values < 1 */
int start_column;
/* field in which the key ends; 0 means end of line */
int end_column;
/* character offset of the key start inside start_column; counted from 1 */
int start_char;
/* character offset of the key end inside end_column; 0 means end of column */
int end_char;
/* bitwise OR of the MOD_* modifiers that apply to this key */
int flags;
/* linkage for the kdhead tail queue */
TAILQ_ENTRY(keydef) entry;
};
/*
 * Scratch buffer that receives the key extracted from a line by columns().
 */
struct column {
/* extracted key: line.data is a NUL-terminated copy, line.len its length */
struct line line;
/* capacity columns() consults before deciding to reallocate line.data */
size_t cap;
};
/*
 * Per-key modifier bits, stored in keydef.flags.
 * The letters are the ones accepted by parse_flags().
 */
enum {
/* 'n': compare keys as numbers parsed with strtold() */
MOD_N = 1 << 0,
/* 'b' on the start position: skip blanks before the key start */
MOD_STARTB = 1 << 1,
/* 'b' on the end position: skip blanks before locating the key end */
MOD_ENDB = 1 << 2,
/* 'r': negate the comparison result */
MOD_R = 1 << 3,
/* 'd': skip runes that are neither blank nor alphanumeric */
MOD_D = 1 << 4,
/* 'f': fold runes to upper case before comparing */
MOD_F = 1 << 5,
/* 'i': skip non-printable runes */
MOD_I = 1 << 6,
};
/* key definitions, kept in the order addkeydef() appended them */
static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead);
/* set by the -C, -c and -u command-line options respectively */
static int Cflag = 0, cflag = 0, uflag = 0;
/* field separator given with -t; NULL means fields are blank-delimited */
static char *fieldsep = NULL;
/* length of fieldsep as returned by unescape() */
static size_t fieldseplen = 0;
/* key buffers for the two lines being compared in slinecmp() */
static struct column col1, col2;
/*
 * Advance the cursor `a` past any leading run of spaces and tabs,
 * shrinking its remaining length accordingly.
 */
static void
skipblank(struct line *a)
{
	for (; a->len; a->data++, a->len--) {
		if (*a->data != ' ' && *a->data != '\t')
			break;
	}
}
/*
 * Advance the cursor `a` up to (but not past) the next space, tab or
 * newline, or until no bytes remain.
 */
static void
skipnonblank(struct line *a)
{
	for (; a->len; a->data++, a->len--) {
		switch (*a->data) {
		case '\n':
		case ' ':
		case '\t':
			return;
		default:
			break;
		}
	}
}
/*
 * Advance the cursor `a` over one field.
 *
 * With a field separator (-t): move to the next occurrence of the
 * separator and, if skip_to_next_col is non-zero, past it so the cursor
 * sits on the first byte of the following field. If no separator is
 * left, the cursor is parked on the last byte of the line.
 *
 * Without a separator: skip leading blanks, then the non-blank run.
 */
static void
skipcolumn(struct line *a, int skip_to_next_col)
{
	char *s;

	if (fieldsep) {
		if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) {
			if (skip_to_next_col)
				s += fieldseplen;
			/*
			 * Shrink len before moving data: the distance is
			 * measured from the old position.
			 */
			a->len -= s - a->data;
			a->data = s;
		} else {
			a->data += a->len - 1;
			a->len = 1;
		}
	} else {
		skipblank(a);
		skipnonblank(a);
	}
}
/*
 * Extract the key described by `kd` from `line` into `col`.
 *
 * The key start is found by skipping kd->start_column - 1 fields,
 * optionally skipping blanks (MOD_STARTB), and then kd->start_char - 1
 * characters. The key end is found the same way from kd->end_column and
 * kd->end_char; an end_column of 0 selects the last byte of the line and
 * an end_char of 0 selects the end of the field.
 *
 * col->line receives a NUL-terminated copy of the key; col->line.len is
 * its length without the terminator. The buffer is reused between calls
 * and only reallocated when it is too small, with col->cap tracking the
 * allocated size.
 */
static void
columns(struct line *line, const struct keydef *kd, struct column *col)
{
	Rune r;
	struct line start, end;
	size_t utflen, rlen;
	int i;

	/* locate the first byte of the key */
	start.data = line->data;
	start.len = line->len;
	for (i = 1; i < kd->start_column; i++)
		skipcolumn(&start, 1);
	if (kd->flags & MOD_STARTB)
		skipblank(&start);
	/* "len > 1" leaves the final byte of the line unconsumed */
	for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) {
		rlen = chartorune(&r, start.data);
		start.data += rlen;
		start.len -= rlen;
		utflen++;
	}

	/* locate the byte just past the key */
	end.data = line->data;
	end.len = line->len;
	if (kd->end_column) {
		for (i = 1; i < kd->end_column; i++)
			skipcolumn(&end, 1);
		if (kd->flags & MOD_ENDB)
			skipblank(&end);
		if (kd->end_char) {
			for (utflen = 0; end.len > 1 && utflen < kd->end_char;) {
				rlen = chartorune(&r, end.data);
				end.data += rlen;
				end.len -= rlen;
				utflen++;
			}
		} else {
			skipcolumn(&end, 0);
		}
	} else {
		end.data += end.len - 1;
		end.len = 1;
	}

	/* an end before the start yields an empty key */
	col->line.len = MAX(0, end.data - start.data);
	if (!(col->line.data) || col->cap < col->line.len + 1) {
		/* old contents are not needed, so free + malloc beats realloc */
		free(col->line.data);
		col->cap = col->line.len + 1;
		col->line.data = emalloc(col->cap);
	}
	memcpy(col->line.data, start.data, col->line.len);
	col->line.data[col->line.len] = '\0';
}
/*
 * Tell whether rune `r` is ignored under the given key modifiers:
 * MOD_D ignores runes that are neither blank nor alphanumeric,
 * MOD_I ignores runes that are not printable.
 */
static int
ignorerune(Rune r, int flags)
{
	if ((flags & MOD_D) && !isblankrune(r) && !isalnumrune(r))
		return 1;
	if ((flags & MOD_I) && !isprintrune(r))
		return 1;
	return 0;
}

/*
 * Compare two NUL-terminated keys rune by rune, honouring the MOD_D,
 * MOD_I and MOD_F modifiers in `flags`.
 *
 * Returns the difference of the first pair of runes that differ, or 0
 * when both keys end together.
 *
 * NOTE: ignorable runes are only skipped while more data follows, so
 * the last rune of a key is always compared as-is.
 */
static int
skipmodcmp(struct line *a, struct line *b, int flags)
{
	Rune ra, rb;
	size_t posa = 0, posb = 0;

	for (;;) {
		posa += chartorune(&ra, a->data + posa);
		posb += chartorune(&rb, b->data + posb);

		while (posa < a->len && ignorerune(ra, flags))
			posa += chartorune(&ra, a->data + posa);
		while (posb < b->len && ignorerune(rb, flags))
			posb += chartorune(&rb, b->data + posb);

		if (flags & MOD_F) {
			ra = toupperrune(ra);
			rb = toupperrune(rb);
		}

		if (!ra || ra != rb)
			break;
	}

	return ra - rb;
}
/*
 * Compare two lines according to the key definitions in kdhead, in
 * order. The first key that yields a non-zero result decides; MOD_R
 * negates that key's result. Returns <0, 0 or >0.
 */
static int
slinecmp(struct line *a, struct line *b)
{
	struct keydef *kd, *last;
	long double na, nb;
	int res;

	TAILQ_FOREACH(kd, &kdhead, entry) {
		columns(a, kd, &col1);
		columns(b, kd, &col2);

		last = TAILQ_LAST(&kdhead, kdhead);
		if (uflag && kd == last && last != TAILQ_FIRST(&kdhead)) {
			/* with -u the trailing default key is only used
			 * when it is the sole key definition */
			res = 0;
		} else if (kd->flags & MOD_N) {
			na = strtold(col1.line.data, NULL);
			nb = strtold(col2.line.data, NULL);
			res = (na > nb) - (na < nb);
		} else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
			res = skipmodcmp(&col1.line, &col2.line, kd->flags);
		} else {
			res = linecmp(&col1.line, &col2.line);
		}

		if (kd->flags & MOD_R)
			res = -res;
		if (res)
			return res;
	}

	return 0;
}
/*
 * Verify that the lines read from `fp` are in sorted order (-c / -C).
 *
 * The previously read line is kept in static storage, so successive
 * calls check several files as if they were one concatenated stream.
 * With -u, two adjacent lines that compare equal also count as disorder.
 *
 * Unless -C is given, the offending line is reported on stderr together
 * with `fname`.
 *
 * Returns 1 on the first out-of-order line, 0 otherwise. An empty input
 * is in order by definition.
 */
static int
check(FILE *fp, const char *fname)
{
	static struct line prev, cur;
	static size_t prevsize, cursize;
	struct line tmp;
	size_t tmpsize;
	ssize_t len;

	if (!prev.data) {
		if ((len = getline(&prev.data, &prevsize, fp)) < 0) {
			if (ferror(fp))
				eprintf("getline:");
			/*
			 * Plain EOF: nothing to check. getline() may have
			 * allocated a buffer anyway; drop it so that the
			 * next file starts by reading its own first line.
			 */
			free(prev.data);
			prev.data = NULL;
			prevsize = 0;
			return 0;
		}
		prev.len = len;
	}
	while ((len = getline(&cur.data, &cursize, fp)) > 0) {
		cur.len = len;
		/* uflag is 0 or 1: demands cmp >= 0, or cmp > 0 with -u */
		if (uflag > slinecmp(&cur, &prev)) {
			if (!Cflag) {
				weprintf("disorder %s: ", fname);
				fwrite(cur.data, 1, cur.len, stderr);
			}
			return 1;
		}
		/* swap buffers so getline() reuses the old allocation */
		tmp = cur;
		tmpsize = cursize;
		cur = prev;
		cursize = prevsize;
		prev = tmp;
		prevsize = tmpsize;
	}

	return 0;
}
/*
 * Consume a run of key modifier letters at *s and OR the matching
 * MOD_* bits into *flags.
 *
 * s:     in/out cursor into the key definition; left on the first
 *        non-letter on success, or just past the unknown letter on
 *        failure.
 * flags: modifier bitmask to update.
 * bflag: bit to set for 'b' (MOD_STARTB or MOD_ENDB, depending on which
 *        half of the key definition is being parsed).
 *
 * Returns 0 on success, -1 if an unknown modifier letter is found.
 */
static int
parse_flags(char **s, int *flags, int bflag)
{
	/* <ctype.h> functions require a value representable as unsigned char */
	while (isalpha((unsigned char)**s)) {
		switch (*((*s)++)) {
		case 'b':
			*flags |= bflag;
			break;
		case 'd':
			*flags |= MOD_D;
			break;
		case 'f':
			*flags |= MOD_F;
			break;
		case 'i':
			*flags |= MOD_I;
			break;
		case 'n':
			*flags |= MOD_N;
			break;
		case 'r':
			*flags |= MOD_R;
			break;
		default:
			return -1;
		}
	}

	return 0;
}
/*
 * Parse the key definition string `kdstr` and append the resulting key
 * to kdhead. `flags` are the modifiers the key starts out with; any
 * modifier letters in the definition are added on top.
 *
 * Accepted format:
 *   start_column[.start_char][flags][,end_column[.end_char][flags]]
 *
 * Any syntax error is reported through enprintf() with status 2.
 */
static void
addkeydef(char *kdstr, int flags)
{
	struct keydef *kd = enmalloc(2, sizeof(*kd));
	char *p = kdstr;

	/* defaults for the optional parts */
	kd->start_char = 1;
	kd->end_column = 0; /* 0 means end of line */
	kd->end_char = 0; /* 0 means end of column */
	kd->flags = flags;

	/* start position */
	kd->start_column = strtol(p, &p, 10);
	if (kd->start_column < 1)
		enprintf(2, "invalid start column in key definition\n");
	if (*p == '.') {
		kd->start_char = strtol(p + 1, &p, 10);
		if (kd->start_char < 1)
			enprintf(2, "invalid start character in key "
			         "definition\n");
	}
	if (parse_flags(&p, &kd->flags, MOD_STARTB) < 0)
		enprintf(2, "invalid start flags in key definition\n");

	/* optional end position */
	if (*p == ',') {
		kd->end_column = strtol(p + 1, &p, 10);
		if (kd->end_column < 0)
			enprintf(2, "invalid end column in key definition\n");
		if (*p == '.') {
			kd->end_char = strtol(p + 1, &p, 10);
			if (kd->end_char < 0)
				enprintf(2, "invalid end character in key "
				         "definition\n");
		}
		if (parse_flags(&p, &kd->flags, MOD_ENDB) < 0)
			enprintf(2, "invalid end flags in key definition\n");
	}

	/* nothing may follow a complete definition */
	if (*p != '\0')
		enprintf(2, "invalid key definition\n");

	TAILQ_INSERT_TAIL(&kdhead, kd, entry);
}
/*
 * Report the command-line synopsis, naming the program via argv0.
 * The message is passed to enprintf() with status 2, which presumably
 * terminates the program.
 */
static void
usage(void)
{
enprintf(2, "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] "
"[-k def]... [file ...]\n", argv0);
}
/*
 * sort: sort, or with -c / -C check the order of, the lines of the
 * given files (standard input when none are given or for "-").
 *
 * Exit status: 0 on success, 1 if -c / -C found disorder, 2 if closing
 * a stream (including the -o output file) reported an error.
 */
int
main(int argc, char *argv[])
{
	FILE *fp, *ofp = stdout;
	struct linebuf linebuf = EMPTY_LINEBUF;
	size_t i;
	int global_flags = 0, ret = 0;
	char *outfile = NULL;

	ARGBEGIN {
	case 'C':
		Cflag = 1;
		break;
	case 'b':
		global_flags |= MOD_STARTB | MOD_ENDB;
		break;
	case 'c':
		cflag = 1;
		break;
	case 'd':
		global_flags |= MOD_D;
		break;
	case 'f':
		global_flags |= MOD_F;
		break;
	case 'i':
		global_flags |= MOD_I;
		break;
	case 'k':
		/* a key inherits the global modifiers seen so far */
		addkeydef(EARGF(usage()), global_flags);
		break;
	case 'm':
		/* more or less for free, but for performance-reasons,
		 * we should keep this flag in mind and maybe some later
		 * day implement it properly so we don't run out of memory
		 * while merging large sorted files.
		 */
		break;
	case 'n':
		global_flags |= MOD_N;
		break;
	case 'o':
		outfile = EARGF(usage());
		break;
	case 'r':
		global_flags |= MOD_R;
		break;
	case 't':
		fieldsep = EARGF(usage());
		if (!*fieldsep)
			eprintf("empty delimiter\n");
		fieldseplen = unescape(fieldsep);
		break;
	case 'u':
		uflag = 1;
		break;
	default:
		usage();
	} ARGEND

	/* -b shall only apply to custom key definitions */
	if (TAILQ_EMPTY(&kdhead) && global_flags)
		addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB));
	/* whole-line key, appended last */
	addkeydef("1", global_flags & MOD_R);

	if (!argc) {
		if (Cflag || cflag) {
			if (check(stdin, "<stdin>") && !ret)
				ret = 1;
		} else {
			getlines(stdin, &linebuf);
		}
	} else for (; *argv; argc--, argv++) {
		if (!strcmp(*argv, "-")) {
			*argv = "<stdin>";
			fp = stdin;
		} else if (!(fp = fopen(*argv, "r"))) {
			enprintf(2, "fopen %s:", *argv);
			continue;
		}
		if (Cflag || cflag) {
			if (check(fp, *argv) && !ret)
				ret = 1;
		} else {
			getlines(fp, &linebuf);
		}
		if (fp != stdin && fshut(fp, *argv))
			ret = 2;
	}

	if (!Cflag && !cflag) {
		/* opened only after all input is read, so -o may name an input */
		if (outfile && !(ofp = fopen(outfile, "w")))
			eprintf("fopen %s:", outfile);
		qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines),
		      (int (*)(const void *, const void *))slinecmp);
		for (i = 0; i < linebuf.nlines; i++) {
			/* with -u, print a line only if it differs from
			 * its predecessor */
			if (!uflag || i == 0 ||
			    slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) {
				fwrite(linebuf.lines[i].data, 1,
				       linebuf.lines[i].len, ofp);
			}
		}
		/* close the -o file here so write errors affect the status */
		if (ofp != stdout && fshut(ofp, outfile))
			ret = 2;
	}

	/* non-short-circuit '|' so that every stream gets closed */
	if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") |
	    fshut(stderr, "<stderr>"))
		ret = 2;

	return ret;
}