Add join(1)

This commit is contained in:
Wolfgang Corcoran-Mathe 2015-04-20 11:23:20 +01:00 committed by sin
parent f83d7bc647
commit cd0b771cbb
6 changed files with 662 additions and 1 deletions

View File

@ -58,3 +58,4 @@ Authors/contributors include:
© 2015 Tai Chi Minh Ralph Eastwood <tcmreastwood@gmail.com>
© 2015 Quentin Rameau <quinq@quinq.eu.org>
© 2015 Dionysis Grigoropoulos <info@erethon.com>
© 2015 Wolfgang Corcoran-Mathe <first.lord.of.teal@gmail.com>

View File

@ -99,6 +99,7 @@ BIN =\
fold\
grep\
head\
join\
hostname\
kill\
link\

1
README
View File

@ -40,6 +40,7 @@ The following tools are implemented:
=* o grep .
=*|o head .
=*|x hostname .
=* o join .
=*|o kill .
=*|o link .
=*|o ln .

1
TODO
View File

@ -10,7 +10,6 @@ diff
ed
getconf
install
join
od
patch
pathchk

105
join.1 Normal file
View File

@ -0,0 +1,105 @@
.Dd April 18, 2015
.Dt JOIN 1
.Os sbase
.Sh NAME
.Nm join
.Nd relational database operator
.Sh SYNOPSIS
.Nm
.Op Fl 1 Ar field
.Op Fl 2 Ar field
.Op Fl o Ar list
.Op Fl e Ar string
.Op Fl a Ar fileno | Fl v Ar fileno
.Op Fl t Ar delim
.Ar file1 file2
.Sh DESCRIPTION
.Nm
lines from
.Ar file1
and
.Ar file2
on a matching field. If one of the input files is '-', standard input
is read for that file.
.Pp
Files are read sequentially and are assumed to be sorted on the join
field.
.Nm
does not check the order of input, and joining two unsorted files will
produce unexpected output.
.Pp
By default, input lines are matched on the first blank-separated
field; output lines are space-separated and consist of the join field
followed by the remaining fields from
.Ar file1 Ns ,
then the remaining fields from
.Ar file2 Ns .
.Sh OPTIONS
.Bl -tag -width Ds
.It Fl 1 Ar field
Join on the
.Ar field Ns eth
field of file 1.
.It Fl 2 Ar field
Join on the
.Ar field Ns eth
field of file 2.
.It Fl a Ar fileno
Print unpairable lines from file
.Ar fileno
in addition to normal output.
.It Fl e Ar string
When used with
.Fl o Ns ,
replace empty fields in the output list with
.Ar string Ns .
.It Fl o Ar list
Format output according to the string
.Ar list Ns .
Each element of
.Ar list
may be either
.Ar fileno.field
or 0 (representing the join field).
Elements in
.Ar list
may be separated by blanks or commas. For example,
.Bd -literal -offset indent
join -o "0 2.1 1.3"
.Ed
.Pp
would print the join field, the first field of
.Ar file2 Ns ,
then the third field of
.Ar file1 Ns .
.Pp
Only paired lines are formatted with the
.Fl o
option. Unpairable lines (selected with
.Fl a
or
.Fl v Ns )
are printed raw.
.It Fl t Ar delim
Use the arbitrary string
.Ar delim
as field delimiter for both input and output.
.It Fl v Ar fileno
Print unpairable lines from file
.Ar fileno
instead of normal output.
.El
.Sh STANDARDS
The
.Nm
utility is compliant with the
.St -p1003.1-2013
specification with the following exception:
.Bl -bullet -offset indent
.It
Unpairable lines ignore formatting specified with
.Fl o Ns .
.El
.Pp
The possibility of specifying multibyte delimiters of arbitrary
length is an extension to the specification.

554
join.c Normal file
View File

@ -0,0 +1,554 @@
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "arg.h"
#include "text.h"
#include "utf.h"
#include "util.h"
/* Capacity policy shared by the growable arrays (fields, lines, specs). */
enum {
	INIT = 1, /* number of elements allocated when an array is created */
	GROW = 2, /* factor the capacity is multiplied by when an array is full */
};
/* Values for the `reset` argument of addtospan(). */
enum {
	EXPAND = 0, /* store the new line after the lines already in the span */
	RESET = 1, /* set the span's line count to 0 before storing the new line */
};
/* Returned by linecmp() when neither line has its join field. */
enum { FIELD_ERROR = -2, };
/*
 * One field of an input line.  The field is a slice of the owning line's
 * text and is delimited by `len`, not by a NUL terminator.
 */
struct field {
	char *s; /* first byte of the field inside the line text */
	size_t len; /* length of the field in bytes */
};
/* One input line together with the fields it was split into. */
struct line {
	char *text; /* line buffer handed to makeline(); fields point into it */
	size_t nf; /* number of fields currently stored in `fields` */
	size_t maxf; /* number of elements allocated for `fields` */
	struct field *fields; /* growable array of fields, see addfield() */
};
/* One element of the -o output list. */
struct spec {
	size_t fileno; /* 0 for the join field, otherwise input file 1 or 2 */
	size_t fldno; /* zero-based field index (makespec() decrements the user's number) */
};
/* Growable list of output specs built from the -o argument. */
struct outlist {
	size_t ns; /* number of specs stored */
	size_t maxs; /* number of elements allocated for `specs` */
	struct spec **specs; /* array of pointers to individually allocated specs */
};
/* Growable run of consecutive lines read from one input file. */
struct span {
	size_t nl; /* number of lines currently counted in the span */
	size_t maxl; /* number of elements allocated for `lines` */
	struct line **lines; /* array of pointers to lines created by makeline() */
};
static char *sep = NULL; /* -t delimiter; NULL selects blank-separated input */
static char *replace = NULL; /* -e string printed for missing fields in -o output */
static const char defaultofs = ' '; /* output separator used when sep is NULL */
static const int jfield = 1; /* POSIX default join field */
static int unpairsa = 0, unpairsb = 0; /* print unpairable lines of file 1 / file 2 (-a, -v) */
static int oflag = 0; /* set when -o supplied an output list */
static int pairs = 1; /* print joined lines; cleared by -v */
static size_t seplen; /* length of sep as returned by unescape() */
static struct outlist output; /* output list filled in by makeolist() */
char *argv0; /* program name printed by usage(); presumably set by ARGBEGIN -- confirm in arg.h */
/*
 * Report the command synopsis through eprintf(), substituting argv0 for the
 * program name.  NOTE(review): callers rely on this not returning (e.g. it
 * is used inside EARGF()); confirm that eprintf() terminates the process.
 */
static void
usage(void)
{
	eprintf("usage: %s [-1 field] [-2 field] [-o list] [-e string] "
	        "[-a | -v fileno] [-t delim] file1 file2\n", argv0);
}
/* Write the bytes of one field to stdout; report a short write via eprintf(). */
static void
prfield(struct field *fp)
{
	size_t written;

	written = fwrite(fp->s, 1, fp->len, stdout);
	if (written != fp->len)
		eprintf("fwrite:");
}
/*
 * Emit one output field separator: the default blank when no -t delimiter
 * was given, otherwise the seplen bytes of the user's delimiter.
 */
static void
prsep(void)
{
	if (!sep) {
		putchar(defaultofs);
		return;
	}
	fwrite(sep, 1, seplen, stdout);
}
static void
swaplines(struct line *la, struct line *lb)
{
struct line tmp;
tmp = *la;
*la = *lb;
*lb = tmp;
}
/*
 * Print one joined output line for the pair (la, lb).
 *
 * jfa and jfb are the zero-based join field indices for la and lb.  Nothing
 * is printed when either line lacks its join field.
 *
 * With -o, each spec of the output list is printed in order; a spec naming
 * a field the line does not have is replaced by the -e string, if any.
 * Without -o, the join field is printed first, followed by the remaining
 * fields of la and then the remaining fields of lb.
 *
 * A separator is written *before* every field except the first, so the line
 * never ends in a trailing separator regardless of which field is the join
 * field or how many fields each line has.
 */
static void
prjoin(struct line *la, struct line *lb, size_t jfa, size_t jfb)
{
	struct spec *sp;
	struct field *joinfield;
	size_t i;

	if (jfa >= la->nf || jfb >= lb->nf)
		return;

	joinfield = &la->fields[jfa];

	if (oflag) {
		for (i = 0; i < output.ns; i++) {
			if (i > 0)
				prsep();

			sp = output.specs[i];

			if (sp->fileno == 1) {
				if (sp->fldno < la->nf)
					prfield(&la->fields[sp->fldno]);
				else if (replace)
					fputs(replace, stdout);
			} else if (sp->fileno == 2) {
				if (sp->fldno < lb->nf)
					prfield(&lb->fields[sp->fldno]);
				else if (replace)
					fputs(replace, stdout);
			} else if (sp->fileno == 0) {
				prfield(joinfield);
			}
		}
	} else {
		prfield(joinfield);

		for (i = 0; i < la->nf; i++) {
			if (i != jfa) {
				prsep();
				prfield(&la->fields[i]);
			}
		}
		for (i = 0; i < lb->nf; i++) {
			if (i != jfb) {
				prsep();
				prfield(&lb->fields[i]);
			}
		}
	}
	putchar('\n');
}
/* Print a line's raw text followed by a newline; report a short write via eprintf(). */
static void
prline(struct line *lp)
{
	const char *text = lp->text;
	size_t want, got;

	want = strlen(text);
	got = fwrite(text, 1, want, stdout);
	if (got != want)
		eprintf("fwrite:");
	putchar('\n');
}
static int
linecmp(struct line *la, struct line *lb, size_t jfa, size_t jfb)
{
int status;
/* return FIELD_ERROR if both lines are short */
if (jfa >= la->nf) {
status = jfb >= lb->nf ? FIELD_ERROR : -1;
} else if (jfb >= lb->nf) {
status = 1;
} else {
status = memcmp(la->fields[jfa].s, lb->fields[jfb].s,
MAX (la->fields[jfa].len, lb->fields[jfb].len));
if (status > 0)
status = 1;
else if (status < 0)
status = -1;
}
return status;
}
static void
addfield(struct line *lp, char *sp, size_t len)
{
if (lp->nf >= lp->maxf) {
lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf),
sizeof(struct field));
lp->maxf *= GROW;
}
lp->fields[lp->nf].s = sp;
lp->fields[lp->nf].len = len;
lp->nf++;
}
/*
 * Print the join of every line of spa with every line of spb.  The last
 * counted entry of each span is skipped: both loops stop at nl - 1.
 */
static void
prspanjoin(struct span *spa, struct span *spb, size_t jfa, size_t jfb)
{
	size_t a, b;
	size_t na = spa->nl - 1;
	size_t nb = spb->nl - 1;

	for (a = 0; a < na; a++) {
		struct line *left = spa->lines[a];

		for (b = 0; b < nb; b++)
			prjoin(left, spb->lines[b], jfa, jfb);
	}
}
/*
 * Build a line structure from the buffer s of length len.
 *
 * A trailing newline is replaced by a NUL.  The returned line keeps s as its
 * text, and every field points into that buffer.
 *
 * Leading blanks are skipped.  With a -t delimiter the text is split at each
 * occurrence of sep; otherwise it is split at runs of blanks.  The text left
 * after the last delimiter (possibly empty) always becomes the final field.
 *
 * Returns a newly allocated line.
 */
static struct line *
makeline(char *s, size_t len)
{
	struct line *lp;
	char *sp, *beg, *end;
	int eol = 0;

	if (len > 0 && s[len-1] == '\n')
		s[len-1] = '\0';

	lp = ereallocarray(NULL, INIT, sizeof(struct line));
	lp->text = s;
	lp->fields = ereallocarray(NULL, INIT, sizeof(struct field));
	lp->nf = 0;
	lp->maxf = INIT;

	/* <ctype.h> functions need an unsigned char value */
	for (sp = lp->text; isblank((unsigned char)*sp); sp++)
		;

	while (!eol) {
		beg = sp;

		if (sep) {
			end = utfutf(sp, sep);
			if (end) {
				addfield(lp, beg, end - beg);
				end += seplen;
			} else {
				eol = 1;
			}
		} else {
			for (end = sp; *end != '\0' && !isblank((unsigned char)*end); end++)
				;
			if (*end == '\0') {
				/* do not step past the terminator looking for blanks */
				eol = 1;
			} else {
				addfield(lp, beg, end - beg);
				while (isblank((unsigned char)*++end))
					;
			}
		}

		if (eol)
			addfield(lp, beg, strlen(sp));
		sp = end;
	}

	return lp;
}
static int
addtospan(struct span *sp, FILE *fp, int reset)
{
char *newl = NULL;
size_t len, size = 0;
if ((len = getline(&newl, &size, fp)) == -1) {
if (ferror(fp))
eprintf("getline:");
else
return 0;
}
if (reset)
sp->nl = 0;
if (sp->nl >= sp->maxl) {
sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl),
sizeof(struct line *));
sp->maxl *= GROW;
}
sp->lines[sp->nl] = makeline(newl, len);
sp->nl++;
return 1;
}
/* Set up an empty span with room for INIT line pointers. */
static void
initspan(struct span *sp)
{
	sp->lines = ereallocarray(NULL, INIT, sizeof(struct line *));
	sp->maxl = INIT;
	sp->nl = 0;
}
/*
 * Release every line counted in the span -- its field array, its text
 * buffer and the line structure itself -- and then the line pointer array.
 */
static void
freespan(struct span *sp)
{
	size_t i;

	for (i = 0; i < sp->nl; i++) {
		free(sp->lines[i]->fields);
		free(sp->lines[i]->text);
		/* the line structure is allocated separately by makeline() */
		free(sp->lines[i]);
	}
	free(sp->lines);
}
/* Set up an empty output list with room for INIT spec pointers. */
static void
initolist(struct outlist *olp)
{
	olp->specs = ereallocarray(NULL, INIT, sizeof(struct spec *));
	olp->maxs = INIT;
	olp->ns = 0;
}
/*
 * Append the spec sp to the output list, multiplying the capacity of the
 * pointer array by GROW first when it is full.
 */
static void
addspec(struct outlist *olp, struct spec *sp)
{
	size_t idx = olp->ns;

	if (idx >= olp->maxs) {
		olp->specs = ereallocarray(olp->specs, (GROW * olp->maxs),
		                           sizeof(struct spec *));
		olp->maxs *= GROW;
	}

	olp->specs[idx] = sp;
	olp->ns = idx + 1;
}
/*
 * Parse one element of the -o list into a newly allocated spec.
 *
 * s is either "0" (the join field) or "fileno.field", where fileno is the
 * single digit 1 or 2 and field is a decimal number of at least 1.  The
 * field number is stored zero-based.
 *
 * Malformed elements are reported through eprintf().
 * NOTE(review): this relies on eprintf() not returning -- confirm in util.h.
 */
static struct spec *
makespec(char *s)
{
	struct spec *sp;
	size_t fileno = 0;
	size_t fldno = 0;

	switch (s[0]) {
	case '0': /* join field */
		break;
	case '1': case '2':
		/*
		 * Require exactly "<digit>.<digits>": checking s[1] rejects
		 * multi-digit file numbers such as "12.3", and checking s[2]
		 * keeps signs and blanks away from the %zu conversion.
		 */
		if (s[1] != '.' || !isdigit((unsigned char)s[2]) ||
		    sscanf(s + 2, "%zu", &fldno) != 1)
			eprintf("\"%s\": invalid format\n", s);
		/* field numbers are 1-based; 0 would wrap around below */
		if (fldno == 0)
			eprintf("\"%s\": invalid field number (must be at least 1)\n", s);
		fileno = s[0] - '0';
		fldno--; /* stored zero-based */
		break;
	default:
		eprintf("%c: invalid file number (must be 0, 1 or 2)\n", s[0]);
		break;
	}

	sp = ereallocarray(NULL, INIT, sizeof(struct spec));
	sp->fileno = fileno;
	sp->fldno = fldno;

	return sp;
}
/*
 * Split s in place at every comma, space or tab and append one spec per
 * piece to the output list.  Each separator character ends a piece, so two
 * adjacent separators hand an empty piece to makespec().
 */
static void
makeolist(struct outlist *olp, char *s)
{
	char *item, *next;

	for (item = s; item; item = next) {
		next = strpbrk(item, ", \t");
		if (next)
			*next++ = '\0';
		addspec(olp, makespec(item));
	}
}
/*
 * Release every spec stored in the output list and the pointer array that
 * holds them.  The list is left empty, so a second call is harmless.
 */
static void
freespecs(struct outlist *olp)
{
	size_t i;

	for (i = 0; i < olp->ns; i++)
		free(olp->specs[i]);

	/* the pointer array is a separate allocation */
	free(olp->specs);
	olp->specs = NULL;
	olp->ns = 0;
}
/*
 * Join the lines of fa and fb on the zero-based join fields jfa and jfb.
 *
 * Both inputs are read sequentially, one span of lines at a time.  Slot 0 of
 * each span holds the current line of that file.  Depending on how the two
 * current lines compare, an unpairable line is printed (when requested with
 * unpairsa / unpairsb) and that file is advanced, or all consecutive
 * matching lines of both files are collected and their joins are printed
 * (when `pairs` is set).  The loop ends as soon as either span is empty.
 */
static void
join(FILE *fa, FILE *fb, size_t jfa, size_t jfb)
{
	struct span spa, spb;
	int cmp, eofa, eofb;
	initspan(&spa);
	initspan(&spb);
	cmp = eofa = eofb = 0;
	/* prime both spans with the first line of each file */
	addtospan(&spa, fa, RESET);
	addtospan(&spb, fb, RESET);
	while (spa.nl && spb.nl) {
		/*
		 * NOTE(review): FIELD_ERROR is -2, so this `< 0` test also
		 * catches it; the final else branch below looks unreachable.
		 */
		if ((cmp = linecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) {
			/* a's line has no partner in b: optionally print it, then advance a */
			if (unpairsa)
				prline(spa.lines[0]);
			if (!addtospan(&spa, fa, RESET)) {
				if (unpairsb) { /* a is EOF'd; print the rest of b */
					do
						prline(spb.lines[0]);
					while (addtospan(&spb, fb, RESET));
				}
				/* both flags are set so both spans are emptied below, ending the loop */
				eofa = eofb = 1;
			} else {
				continue;
			}
		} else if (cmp > 0) {
			/* b's line has no partner in a: optionally print it, then advance b */
			if (unpairsb)
				prline(spb.lines[0]);
			if (!addtospan(&spb, fb, RESET)) {
				if (unpairsa) { /* b is EOF'd; print the rest of a */
					do
						prline(spa.lines[0]);
					while (addtospan(&spa, fa, RESET));
				}
				/* both flags are set so both spans are emptied below, ending the loop */
				eofa = eofb = 1;
			} else {
				continue;
			}
		} else if (cmp == 0) {
			/* read all consecutive matching lines from a */
			do {
				if (!addtospan(&spa, fa, EXPAND)) {
					eofa = 1;
					/*
					 * prspanjoin() stops at nl - 1; no extra
					 * non-matching line was stored at EOF, so
					 * bump the count to keep the last match.
					 */
					spa.nl++;
					break;
				}
			} while (linecmp(spa.lines[spa.nl-1], spb.lines[0], jfa, jfb) == 0);
			/* read all consecutive matching lines from b */
			do {
				if (!addtospan(&spb, fb, EXPAND)) {
					eofb = 1;
					/* same adjustment as for a above */
					spb.nl++;
					break;
				}
			} while (linecmp(spa.lines[0], spb.lines[spb.nl-1], jfa, jfb) == 0);
			if (pairs)
				prspanjoin(&spa, &spb, jfa, jfb);
		} else { /* FIELD_ERROR: both lines lacked join fields */
			if (unpairsa)
				prline(spa.lines[0]);
			if (unpairsb)
				prline(spb.lines[0]);
			eofa = addtospan(&spa, fa, RESET) ? 0 : 1;
			eofb = addtospan(&spb, fb, RESET) ? 0 : 1;
			if (!eofa && !eofb)
				continue;
		}
		/*
		 * Start the next round: an exhausted file gets an empty span;
		 * otherwise the last line read (the first one that did not
		 * match) is moved to slot 0 and becomes the current line.
		 */
		if (eofa) {
			spa.nl = 0;
		} else {
			swaplines(spa.lines[0], spa.lines[spa.nl - 1]); /* ugly */
			spa.nl = 1;
		}
		if (eofb) {
			spb.nl = 0;
		} else {
			swaplines(spb.lines[0], spb.lines[spb.nl - 1]); /* ugly */
			spb.nl = 1;
		}
	}
	freespan(&spa);
	freespan(&spb);
}
/*
 * Entry point: parse the options, open the two input files ("-" selects
 * standard input), run the join and close all streams.
 *
 * Options:
 *   -1 field, -2 field  join field (1-based) for file 1 / file 2
 *   -a fileno           also print unpairable lines of file fileno
 *   -v fileno           print only unpairable lines of file fileno
 *   -e string           replacement for missing fields in -o output
 *   -o list             output list, see makeolist()
 *   -t delim            field delimiter for input and output
 */
int
main(int argc, char *argv[])
{
	size_t jf[2] = { jfield, jfield, };
	FILE *fp[2];
	int n;
	char *fno;

	ARGBEGIN {
	case '1':
		jf[0] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
		break;
	case '2':
		jf[1] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX));
		break;
	case 'a':
		fno = EARGF(usage());
		if (strcmp(fno, "1") == 0)
			unpairsa = 1;
		else if (strcmp(fno, "2") == 0)
			unpairsb = 1;
		else
			usage();
		break;
	case 'e':
		replace = EARGF(usage());
		break;
	case 'o':
		oflag = 1;
		initolist(&output);
		makeolist(&output, EARGF(usage()));
		break;
	case 't':
		sep = EARGF(usage());
		break;
	case 'v':
		pairs = 0;
		fno = EARGF(usage());
		if (strcmp(fno, "1") == 0)
			unpairsa = 1;
		else if (strcmp(fno, "2") == 0)
			unpairsb = 1;
		else
			usage();
		break;
	default:
		usage();
	} ARGEND;

	if (sep) {
		seplen = unescape(sep);
		/*
		 * makeline() steps over each delimiter by seplen bytes, so a
		 * zero-length delimiter could never advance through a line.
		 */
		if (seplen == 0)
			eprintf("-t: delimiter must not be empty\n");
	}

	if (argc != 2)
		usage();

	for (n = 0; n < 2; n++) {
		if (argv[n][0] == '-' && !argv[n][1]) {
			argv[n] = "<stdin>";
			fp[n] = stdin;
		} else if (!(fp[n] = fopen(argv[n], "r"))) {
			eprintf("fopen %s:", argv[n]);
		}
	}

	/* the join fields are 1-based on the command line, 0-based internally */
	jf[0]--;
	jf[1]--;

	join(fp[0], fp[1], jf[0], jf[1]);

	if (oflag)
		freespecs(&output);

	enfshut(2, fp[0], argv[0]);
	/* both operands may be "-"; do not close stdin twice */
	if (fp[0] != fp[1])
		enfshut(2, fp[1], argv[1]);
	enfshut(2, stdout, "<stdout>");

	exit(0);
}