a944b682a6
I'm not sure whether this has other implications, but the issue is that columns() uses len to store the allocated buffer size, while linecmp() compares up to len bytes. If those trailing bytes do not match, the lines are considered different even though the relevant parts of the buffers do match. To resolve this, also keep track of the column capacity. Additionally, since there is no reason to keep the existing data when resizing, just use free and emalloc rather than erealloc. The simplest case I could reduce it to is this: if [ "$(printf '%s\n' a a xxb xxc | ./sort -u)" = "$(printf '%s\n' a xxb xxc)" ]; then echo pass; else echo fail; fi
438 lines
9.2 KiB
C
438 lines
9.2 KiB
C
/* See LICENSE file for copyright and license details. */
|
|
#include <ctype.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "queue.h"
|
|
#include "text.h"
|
|
#include "utf.h"
|
|
#include "util.h"
|
|
|
|
struct keydef {
|
|
int start_column;
|
|
int end_column;
|
|
int start_char;
|
|
int end_char;
|
|
int flags;
|
|
TAILQ_ENTRY(keydef) entry;
|
|
};
|
|
|
|
struct column {
|
|
struct line line;
|
|
size_t cap;
|
|
};
|
|
|
|
/* Modifier bits stored in keydef.flags. */
enum {
	MOD_N      = 0x01, /* 'n': compare keys as numbers (strtold) */
	MOD_STARTB = 0x02, /* 'b' on the start position: skip leading blanks */
	MOD_ENDB   = 0x04, /* 'b' on the end position: skip leading blanks */
	MOD_R      = 0x08, /* 'r': invert the comparison result */
	MOD_D      = 0x10, /* 'd': only blanks and alphanumerics are compared */
	MOD_F      = 0x20, /* 'f': compare after converting to upper case */
	MOD_I      = 0x40, /* 'i': non-printable characters are skipped */
};
|
|
|
|
static TAILQ_HEAD(kdhead, keydef) kdhead = TAILQ_HEAD_INITIALIZER(kdhead);
|
|
|
|
static int Cflag = 0, cflag = 0, uflag = 0;
|
|
static char *fieldsep = NULL;
|
|
static size_t fieldseplen = 0;
|
|
static struct column col1, col2;
|
|
|
|
static void
|
|
skipblank(struct line *a)
|
|
{
|
|
while (a->len && (*(a->data) == ' ' || *(a->data) == '\t')) {
|
|
a->data++;
|
|
a->len--;
|
|
}
|
|
}
|
|
|
|
static void
|
|
skipnonblank(struct line *a)
|
|
{
|
|
while (a->len && (*(a->data) != '\n' && *(a->data) != ' ' &&
|
|
*(a->data) != '\t')) {
|
|
a->data++;
|
|
a->len--;
|
|
}
|
|
}
|
|
|
|
static void
|
|
skipcolumn(struct line *a, int skip_to_next_col)
|
|
{
|
|
char *s;
|
|
|
|
if (fieldsep) {
|
|
if ((s = memmem(a->data, a->len, fieldsep, fieldseplen))) {
|
|
if (skip_to_next_col) {
|
|
s += fieldseplen;
|
|
a->data = s;
|
|
a->len = a->len - (s - a->data);
|
|
}
|
|
} else {
|
|
a->data += a->len - 1;
|
|
a->len = 1;
|
|
}
|
|
} else {
|
|
skipblank(a);
|
|
skipnonblank(a);
|
|
}
|
|
}
|
|
|
|
static void
|
|
columns(struct line *line, const struct keydef *kd, struct column *col)
|
|
{
|
|
Rune r;
|
|
struct line start, end;
|
|
size_t utflen, rlen;
|
|
int i;
|
|
|
|
start.data = line->data;
|
|
start.len = line->len;
|
|
for (i = 1; i < kd->start_column; i++)
|
|
skipcolumn(&start, 1);
|
|
if (kd->flags & MOD_STARTB)
|
|
skipblank(&start);
|
|
for (utflen = 0; start.len > 1 && utflen < kd->start_char - 1;) {
|
|
rlen = chartorune(&r, start.data);
|
|
start.data += rlen;
|
|
start.len -= rlen;
|
|
utflen++;
|
|
}
|
|
|
|
end.data = line->data;
|
|
end.len = line->len;
|
|
if (kd->end_column) {
|
|
for (i = 1; i < kd->end_column; i++)
|
|
skipcolumn(&end, 1);
|
|
if (kd->flags & MOD_ENDB)
|
|
skipblank(&end);
|
|
if (kd->end_char) {
|
|
for (utflen = 0; end.len > 1 && utflen < kd->end_char;) {
|
|
rlen = chartorune(&r, end.data);
|
|
end.data += rlen;
|
|
end.len -= rlen;
|
|
utflen++;
|
|
}
|
|
} else {
|
|
skipcolumn(&end, 0);
|
|
}
|
|
} else {
|
|
end.data += end.len - 1;
|
|
end.len = 1;
|
|
}
|
|
col->line.len = MAX(0, end.data - start.data);
|
|
if (!(col->line.data) || col->cap < col->line.len + 1) {
|
|
free(col->line.data);
|
|
col->line.data = emalloc(col->line.len + 1);
|
|
}
|
|
memcpy(col->line.data, start.data, col->line.len);
|
|
col->line.data[col->line.len] = '\0';
|
|
}
|
|
|
|
static int
|
|
skipmodcmp(struct line *a, struct line *b, int flags)
|
|
{
|
|
Rune r1, r2;
|
|
size_t offa = 0, offb = 0;
|
|
|
|
do {
|
|
offa += chartorune(&r1, a->data + offa);
|
|
offb += chartorune(&r2, b->data + offb);
|
|
|
|
if (flags & MOD_D && flags & MOD_I) {
|
|
while (offa < a->len && ((!isblankrune(r1) &&
|
|
!isalnumrune(r1)) || (!isprintrune(r1))))
|
|
offa += chartorune(&r1, a->data + offa);
|
|
while (offb < b->len && ((!isblankrune(r2) &&
|
|
!isalnumrune(r2)) || (!isprintrune(r2))))
|
|
offb += chartorune(&r2, b->data + offb);
|
|
}
|
|
else if (flags & MOD_D) {
|
|
while (offa < a->len && !isblankrune(r1) &&
|
|
!isalnumrune(r1))
|
|
offa += chartorune(&r1, a->data + offa);
|
|
while (offb < b->len && !isblankrune(r2) &&
|
|
!isalnumrune(r2))
|
|
offb += chartorune(&r2, b->data + offb);
|
|
}
|
|
else if (flags & MOD_I) {
|
|
while (offa < a->len && !isprintrune(r1))
|
|
offa += chartorune(&r1, a->data + offa);
|
|
while (offb < b->len && !isprintrune(r2))
|
|
offb += chartorune(&r2, b->data + offb);
|
|
}
|
|
if (flags & MOD_F) {
|
|
r1 = toupperrune(r1);
|
|
r2 = toupperrune(r2);
|
|
}
|
|
} while (r1 && r1 == r2);
|
|
|
|
return r1 - r2;
|
|
}
|
|
|
|
static int
|
|
slinecmp(struct line *a, struct line *b)
|
|
{
|
|
int res = 0;
|
|
long double x, y;
|
|
struct keydef *kd;
|
|
|
|
TAILQ_FOREACH(kd, &kdhead, entry) {
|
|
columns(a, kd, &col1);
|
|
columns(b, kd, &col2);
|
|
|
|
/* if -u is given, don't use default key definition
|
|
* unless it is the only one */
|
|
if (uflag && kd == TAILQ_LAST(&kdhead, kdhead) &&
|
|
TAILQ_LAST(&kdhead, kdhead) != TAILQ_FIRST(&kdhead)) {
|
|
res = 0;
|
|
} else if (kd->flags & MOD_N) {
|
|
x = strtold(col1.line.data, NULL);
|
|
y = strtold(col2.line.data, NULL);
|
|
res = (x < y) ? -1 : (x > y);
|
|
} else if (kd->flags & (MOD_D | MOD_F | MOD_I)) {
|
|
res = skipmodcmp(&col1.line, &col2.line, kd->flags);
|
|
} else {
|
|
res = linecmp(&col1.line, &col2.line);
|
|
}
|
|
|
|
if (kd->flags & MOD_R)
|
|
res = -res;
|
|
if (res)
|
|
break;
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
static int
|
|
check(FILE *fp, const char *fname)
|
|
{
|
|
static struct line prev, cur, tmp;
|
|
static size_t prevsize, cursize, tmpsize;
|
|
ssize_t len;
|
|
|
|
if (!prev.data) {
|
|
if ((len = getline(&prev.data, &prevsize, fp)) < 0)
|
|
eprintf("getline:");
|
|
prev.len = len;
|
|
}
|
|
while ((len = getline(&cur.data, &cursize, fp)) > 0) {
|
|
cur.len = len;
|
|
if (uflag > slinecmp(&cur, &prev)) {
|
|
if (!Cflag) {
|
|
weprintf("disorder %s: ", fname);
|
|
fwrite(cur.data, 1, cur.len, stderr);
|
|
}
|
|
return 1;
|
|
}
|
|
tmp = cur;
|
|
tmpsize = cursize;
|
|
cur = prev;
|
|
cursize = prevsize;
|
|
prev = tmp;
|
|
prevsize = tmpsize;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
parse_flags(char **s, int *flags, int bflag)
|
|
{
|
|
while (isalpha((int)**s)) {
|
|
switch (*((*s)++)) {
|
|
case 'b':
|
|
*flags |= bflag;
|
|
break;
|
|
case 'd':
|
|
*flags |= MOD_D;
|
|
break;
|
|
case 'f':
|
|
*flags |= MOD_F;
|
|
break;
|
|
case 'i':
|
|
*flags |= MOD_I;
|
|
break;
|
|
case 'n':
|
|
*flags |= MOD_N;
|
|
break;
|
|
case 'r':
|
|
*flags |= MOD_R;
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
addkeydef(char *kdstr, int flags)
|
|
{
|
|
struct keydef *kd;
|
|
|
|
kd = enmalloc(2, sizeof(*kd));
|
|
|
|
/* parse key definition kdstr with format
|
|
* start_column[.start_char][flags][,end_column[.end_char][flags]]
|
|
*/
|
|
kd->start_column = 1;
|
|
kd->start_char = 1;
|
|
kd->end_column = 0; /* 0 means end of line */
|
|
kd->end_char = 0; /* 0 means end of column */
|
|
kd->flags = flags;
|
|
|
|
if ((kd->start_column = strtol(kdstr, &kdstr, 10)) < 1)
|
|
enprintf(2, "invalid start column in key definition\n");
|
|
|
|
if (*kdstr == '.') {
|
|
if ((kd->start_char = strtol(kdstr + 1, &kdstr, 10)) < 1)
|
|
enprintf(2, "invalid start character in key "
|
|
"definition\n");
|
|
}
|
|
if (parse_flags(&kdstr, &kd->flags, MOD_STARTB) < 0)
|
|
enprintf(2, "invalid start flags in key definition\n");
|
|
|
|
if (*kdstr == ',') {
|
|
if ((kd->end_column = strtol(kdstr + 1, &kdstr, 10)) < 0)
|
|
enprintf(2, "invalid end column in key definition\n");
|
|
if (*kdstr == '.') {
|
|
if ((kd->end_char = strtol(kdstr + 1, &kdstr, 10)) < 0)
|
|
enprintf(2, "invalid end character in key "
|
|
"definition\n");
|
|
}
|
|
if (parse_flags(&kdstr, &kd->flags, MOD_ENDB) < 0)
|
|
enprintf(2, "invalid end flags in key definition\n");
|
|
}
|
|
|
|
if (*kdstr != '\0')
|
|
enprintf(2, "invalid key definition\n");
|
|
|
|
TAILQ_INSERT_TAIL(&kdhead, kd, entry);
|
|
}
|
|
|
|
static void
|
|
usage(void)
|
|
{
|
|
enprintf(2, "usage: %s [-Cbcdfimnru] [-o outfile] [-t delim] "
|
|
"[-k def]... [file ...]\n", argv0);
|
|
}
|
|
|
|
int
|
|
main(int argc, char *argv[])
|
|
{
|
|
FILE *fp, *ofp = stdout;
|
|
struct linebuf linebuf = EMPTY_LINEBUF;
|
|
size_t i;
|
|
int global_flags = 0, ret = 0;
|
|
char *outfile = NULL;
|
|
|
|
ARGBEGIN {
|
|
case 'C':
|
|
Cflag = 1;
|
|
break;
|
|
case 'b':
|
|
global_flags |= MOD_STARTB | MOD_ENDB;
|
|
break;
|
|
case 'c':
|
|
cflag = 1;
|
|
break;
|
|
case 'd':
|
|
global_flags |= MOD_D;
|
|
break;
|
|
case 'f':
|
|
global_flags |= MOD_F;
|
|
break;
|
|
case 'i':
|
|
global_flags |= MOD_I;
|
|
break;
|
|
case 'k':
|
|
addkeydef(EARGF(usage()), global_flags);
|
|
break;
|
|
case 'm':
|
|
/* more or less for free, but for performance-reasons,
|
|
* we should keep this flag in mind and maybe some later
|
|
* day implement it properly so we don't run out of memory
|
|
* while merging large sorted files.
|
|
*/
|
|
break;
|
|
case 'n':
|
|
global_flags |= MOD_N;
|
|
break;
|
|
case 'o':
|
|
outfile = EARGF(usage());
|
|
break;
|
|
case 'r':
|
|
global_flags |= MOD_R;
|
|
break;
|
|
case 't':
|
|
fieldsep = EARGF(usage());
|
|
if (!*fieldsep)
|
|
eprintf("empty delimiter\n");
|
|
fieldseplen = unescape(fieldsep);
|
|
break;
|
|
case 'u':
|
|
uflag = 1;
|
|
break;
|
|
default:
|
|
usage();
|
|
} ARGEND
|
|
|
|
/* -b shall only apply to custom key definitions */
|
|
if (TAILQ_EMPTY(&kdhead) && global_flags)
|
|
addkeydef("1", global_flags & ~(MOD_STARTB | MOD_ENDB));
|
|
addkeydef("1", global_flags & MOD_R);
|
|
|
|
if (!argc) {
|
|
if (Cflag || cflag) {
|
|
if (check(stdin, "<stdin>") && !ret)
|
|
ret = 1;
|
|
} else {
|
|
getlines(stdin, &linebuf);
|
|
}
|
|
} else for (; *argv; argc--, argv++) {
|
|
if (!strcmp(*argv, "-")) {
|
|
*argv = "<stdin>";
|
|
fp = stdin;
|
|
} else if (!(fp = fopen(*argv, "r"))) {
|
|
enprintf(2, "fopen %s:", *argv);
|
|
continue;
|
|
}
|
|
if (Cflag || cflag) {
|
|
if (check(fp, *argv) && !ret)
|
|
ret = 1;
|
|
} else {
|
|
getlines(fp, &linebuf);
|
|
}
|
|
if (fp != stdin && fshut(fp, *argv))
|
|
ret = 2;
|
|
}
|
|
|
|
if (!Cflag && !cflag) {
|
|
if (outfile && !(ofp = fopen(outfile, "w")))
|
|
eprintf("fopen %s:", outfile);
|
|
|
|
qsort(linebuf.lines, linebuf.nlines, sizeof(*linebuf.lines),
|
|
(int (*)(const void *, const void *))slinecmp);
|
|
|
|
for (i = 0; i < linebuf.nlines; i++) {
|
|
if (!uflag || i == 0 ||
|
|
slinecmp(&linebuf.lines[i], &linebuf.lines[i - 1])) {
|
|
fwrite(linebuf.lines[i].data, 1,
|
|
linebuf.lines[i].len, ofp);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (fshut(stdin, "<stdin>") | fshut(stdout, "<stdout>") |
|
|
fshut(stderr, "<stderr>"))
|
|
ret = 2;
|
|
|
|
return ret;
|
|
}
|