From cd0b771cbbb56096335c1f2d8a0c953d46b0d430 Mon Sep 17 00:00:00 2001 From: Wolfgang Corcoran-Mathe Date: Mon, 20 Apr 2015 11:23:20 +0100 Subject: [PATCH] Add join(1) --- LICENSE | 1 + Makefile | 1 + README | 1 + TODO | 1 - join.1 | 105 +++++++++++ join.c | 554 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 662 insertions(+), 1 deletion(-) create mode 100644 join.1 create mode 100644 join.c diff --git a/LICENSE b/LICENSE index b1baebc..5ebeb0c 100644 --- a/LICENSE +++ b/LICENSE @@ -58,3 +58,4 @@ Authors/contributors include: © 2015 Tai Chi Minh Ralph Eastwood © 2015 Quentin Rameau © 2015 Dionysis Grigoropoulos +© 2015 Wolfgang Corcoran-Mathe diff --git a/Makefile b/Makefile index d782193..ffa8678 100644 --- a/Makefile +++ b/Makefile @@ -99,6 +99,7 @@ BIN =\ fold\ grep\ head\ + join\ hostname\ kill\ link\ diff --git a/README b/README index cdc2aaa..2009e49 100644 --- a/README +++ b/README @@ -40,6 +40,7 @@ The following tools are implemented: =* o grep . =*|o head . =*|x hostname . +=* o join . =*|o kill . =*|o link . =*|o ln . diff --git a/TODO b/TODO index e0055c7..2267def 100644 --- a/TODO +++ b/TODO @@ -10,7 +10,6 @@ diff ed getconf install -join od patch pathchk diff --git a/join.1 b/join.1 new file mode 100644 index 0000000..66d782f --- /dev/null +++ b/join.1 @@ -0,0 +1,105 @@ +.Dd April 18, 2015 +.Dt JOIN 1 +.Os sbase +.Sh NAME +.Nm join +.Nd relational database operator +.Sh SYNOPSIS +.Nm +.Op Fl 1 Ar field +.Op Fl 2 Ar field +.Op Fl o Ar list +.Op Fl e Ar string +.Op Fl a Ar fileno | Fl v Ar fileno +.Op Fl t Ar delim +.Ar file1 file2 +.Sh DESCRIPTION +.Nm +lines from +.Ar file1 +and +.Ar file2 +on a matching field. If one of the input files is '-', standard input +is read for that file. +.Pp +Files are read sequentially and are assumed to be sorted on the join +field. +.Nm +does not check the order of input, and joining two unsorted files will +produce unexpected output. 
+.Pp +By default, input lines are matched on the first blank-separated +field; output lines are space-separated and consist of the join field +followed by the remaining fields from +.Ar file1 Ns , +then the remaining fields from +.Ar file2 Ns . +.Sh OPTIONS +.Bl -tag -width Ds +.It Fl 1 Ar field +Join on the +.Ar field Ns eth +field of file 1. +.It Fl 2 Ar field +Join on the +.Ar field Ns eth +field of file 2. +.It Fl a Ar fileno +Print unpairable lines from file +.Ar fileno +in addition to normal output. +.It Fl e Ar string +When used with +.Fl o Ns , +replace empty fields in the output list with +.Ar string Ns . +.It Fl o Ar list +Format output according to the string +.Ar list Ns . +Each element of +.Ar list +may be either +.Ar fileno.field +or 0 (representing the join field). +Elements in +.Ar list +may be separated by blanks or commas. For example, +.Bd -literal -offset indent +join -o "0 2.1 1.3" +.Ed +.Pp +would print the join field, the first field of +.Ar file2 Ns , +then the third field of +.Ar file1 Ns . +.Pp +Only paired lines are formatted with the +.Fl o +option. Unpairable lines (selected with +.Fl a +or +.Fl v Ns ) +are printed raw. +.It Fl t Ar delim +Use the arbitrary string +.Ar delim +as field delimiter for both input and output. +.It Fl v Ar fileno +Print unpairable lines from file +.Ar fileno +instead of normal output. +.El +.Sh STANDARDS +The +.Nm +utility is compliant with the +.St -p1003.1-2013 +specification with the following exception: +.Bl -bullet -offset indent +.It +Unpairable lines ignore formatting specified with +.Fl o Ns . +.El +.Pp +The possibility of specifying multibyte delimiters of arbitrary +length is an extension to the specification. 
diff --git a/join.c b/join.c new file mode 100644 index 0000000..ab6fff3 --- /dev/null +++ b/join.c @@ -0,0 +1,554 @@ +#include +#include +#include +#include +#include + +#include "arg.h" +#include "text.h" +#include "utf.h" +#include "util.h" + +enum { + INIT = 1, + GROW = 2, +}; + +enum { + EXPAND = 0, + RESET = 1, +}; + +enum { FIELD_ERROR = -2, }; + +struct field { + char *s; + size_t len; +}; + +struct line { + char *text; + size_t nf; + size_t maxf; + struct field *fields; +}; + +struct spec { + size_t fileno; + size_t fldno; +}; + +struct outlist { + size_t ns; + size_t maxs; + struct spec **specs; +}; + +struct span { + size_t nl; + size_t maxl; + struct line **lines; +}; + +static char *sep = NULL; +static char *replace = NULL; +static const char defaultofs = ' '; +static const int jfield = 1; /* POSIX default join field */ +static int unpairsa = 0, unpairsb = 0; +static int oflag = 0; +static int pairs = 1; +static size_t seplen; +static struct outlist output; + +char *argv0; + + +static void +usage(void) +{ + eprintf("usage: %s [-1 field] [-2 field] [-o list] [-e string] " + "[-a | -v fileno] [-t delim] file1 file2\n", argv0); +} + +static void +prfield(struct field *fp) +{ + if (fwrite(fp->s, 1, fp->len, stdout) != fp->len) + eprintf("fwrite:"); +} + +static void +prsep(void) +{ + if (sep) + fwrite(sep, 1, seplen, stdout); + else + putchar(defaultofs); +} + +static void +swaplines(struct line *la, struct line *lb) +{ + struct line tmp; + + tmp = *la; + *la = *lb; + *lb = tmp; +} + +static void +prjoin(struct line *la, struct line *lb, size_t jfa, size_t jfb) +{ + struct spec *sp; + struct field *joinfield; + size_t i; + + if (jfa >= la->nf || jfb >= lb->nf) + return; + + joinfield = &la->fields[jfa]; + + if (oflag) { + for (i = 0; i < output.ns; i++) { + sp = output.specs[i]; + + if (sp->fileno == 1) { + if (sp->fldno < la->nf) + prfield(&la->fields[sp->fldno]); + else if (replace) + fputs(replace, stdout); + } else if (sp->fileno == 2) { + if 
(sp->fldno < lb->nf) + prfield(&lb->fields[sp->fldno]); + else if (replace) + fputs(replace, stdout); + } else if (sp->fileno == 0) { + prfield(joinfield); + } + + if (i < output.ns - 1) + prsep(); + } + } else { + prfield(joinfield); + prsep(); + + for (i = 0; i < la->nf; i++) { + if (i != jfa) { + prfield(&la->fields[i]); + prsep(); + } + } + for (i = 0; i < lb->nf; i++) { + if (i != jfb) { + prfield(&lb->fields[i]); + if (i < la->nf - 1) + prsep(); + } + } + } + + putchar('\n'); +} + +static void +prline(struct line *lp) +{ + size_t len = strlen(lp->text); + + if (fwrite(lp->text, 1, len, stdout) != len) + eprintf("fwrite:"); + + putchar('\n'); +} + +static int +linecmp(struct line *la, struct line *lb, size_t jfa, size_t jfb) +{ + int status; + + /* return FIELD_ERROR if both lines are short */ + if (jfa >= la->nf) { + status = jfb >= lb->nf ? FIELD_ERROR : -1; + } else if (jfb >= lb->nf) { + status = 1; + } else { + status = memcmp(la->fields[jfa].s, lb->fields[jfb].s, + MAX (la->fields[jfa].len, lb->fields[jfb].len)); + if (status > 0) + status = 1; + else if (status < 0) + status = -1; + } + + return status; +} + +static void +addfield(struct line *lp, char *sp, size_t len) +{ + if (lp->nf >= lp->maxf) { + lp->fields = ereallocarray(lp->fields, (GROW * lp->maxf), + sizeof(struct field)); + lp->maxf *= GROW; + } + lp->fields[lp->nf].s = sp; + lp->fields[lp->nf].len = len; + lp->nf++; +} + +static void +prspanjoin(struct span *spa, struct span *spb, size_t jfa, size_t jfb) +{ + size_t i, j; + + for (i = 0; i < (spa->nl - 1); i++) + for (j = 0; j < (spb->nl - 1); j++) + prjoin(spa->lines[i], spb->lines[j], jfa, jfb); +} + +static struct line * +makeline(char *s, size_t len) +{ + struct line *lp; + char *sp, *beg, *end; + size_t i; + int eol = 0; + + if (s[len-1] == '\n') + s[len-1] = '\0'; + + lp = ereallocarray(NULL, INIT, sizeof(struct line)); + lp->text = s; + lp->fields = ereallocarray(NULL, INIT, sizeof(struct field)); + lp->nf = 0; + lp->maxf = INIT; + + 
for (sp = lp->text; isblank(*sp); sp++) + ; + + while (!eol) { + beg = sp; + + if (sep) { + if (!(end = utfutf(sp, sep))) + eol = 1; + + if (!eol) { + addfield(lp, beg, end - beg); + for (i = 0; i < seplen; i++) + end++; + } + } else { + for (end = sp; !(isblank(*end)); end++) { + if (*end == '\0') { + eol = 1; + break; + } + } + + if (!eol) + addfield(lp, beg, end - beg); + while (isblank(*++end)) + ; + } + + if (eol) + addfield(lp, beg, strlen(sp)); + + sp = end; + } + + return lp; +} + +static int +addtospan(struct span *sp, FILE *fp, int reset) +{ + char *newl = NULL; + size_t len, size = 0; + + if ((len = getline(&newl, &size, fp)) == -1) { + if (ferror(fp)) + eprintf("getline:"); + else + return 0; + } + + if (reset) + sp->nl = 0; + + if (sp->nl >= sp->maxl) { + sp->lines = ereallocarray(sp->lines, (GROW * sp->maxl), + sizeof(struct line *)); + sp->maxl *= GROW; + } + + sp->lines[sp->nl] = makeline(newl, len); + sp->nl++; + return 1; +} + +static void +initspan(struct span *sp) +{ + sp->nl = 0; + sp->maxl = INIT; + sp->lines = ereallocarray(NULL, INIT, sizeof(struct line *));; +} + +static void +freespan(struct span *sp) +{ + size_t i; + + for (i = 0; i < sp->nl; i++) { + free(sp->lines[i]->fields); + free(sp->lines[i]->text); + } + + free(sp->lines); +} + +static void +initolist(struct outlist *olp) +{ + olp->ns = 0; + olp->maxs = 1; + olp->specs = ereallocarray(NULL, INIT, sizeof(struct spec *)); +} + +static void +addspec(struct outlist *olp, struct spec *sp) +{ + if (olp->ns >= olp->maxs) { + olp->specs = ereallocarray(olp->specs, (GROW * olp->maxs), + sizeof(struct spec *)); + olp->maxs *= GROW; + } + olp->specs[olp->ns] = sp; + olp->ns++; +} + +static struct spec * +makespec(char *s) +{ + struct spec *sp; + int fileno; + size_t fldno; + + switch (s[0]) { + case '0': /* join field */ + fileno = 0; + fldno = 0; + break; + case '1': case '2': + if (sscanf(s, "%d.%zu", &fileno, &fldno) != 2) + eprintf("\"%s\": invalid format\n", s); + fldno--; /* ugly */ + 
break; + default: + eprintf("%c: invalid file number (must be 0, 1 or 2)\n", s[0]); + break; + } + + sp = ereallocarray(NULL, INIT, sizeof(struct spec)); + sp->fileno = fileno; + sp->fldno = fldno; + return sp; +} + +static void +makeolist(struct outlist *olp, char *s) +{ + char *item, *sp; + sp = s; + + while (sp) { + item = sp; + sp = strpbrk(sp, ", \t"); + if (sp) + *sp++ = '\0'; + addspec(olp, makespec(item)); + } +} + +static void +freespecs(struct outlist *olp) +{ + size_t i; + + for (i = 0; i < olp->ns; i++) + free(olp->specs[i]); +} + +static void +join(FILE *fa, FILE *fb, size_t jfa, size_t jfb) +{ + struct span spa, spb; + int cmp, eofa, eofb; + + initspan(&spa); + initspan(&spb); + cmp = eofa = eofb = 0; + + addtospan(&spa, fa, RESET); + addtospan(&spb, fb, RESET); + + while (spa.nl && spb.nl) { + if ((cmp = linecmp(spa.lines[0], spb.lines[0], jfa, jfb)) < 0) { + if (unpairsa) + prline(spa.lines[0]); + if (!addtospan(&spa, fa, RESET)) { + if (unpairsb) { /* a is EOF'd; print the rest of b */ + do + prline(spb.lines[0]); + while (addtospan(&spb, fb, RESET)); + } + eofa = eofb = 1; + } else { + continue; + } + } else if (cmp > 0) { + if (unpairsb) + prline(spb.lines[0]); + if (!addtospan(&spb, fb, RESET)) { + if (unpairsa) { /* b is EOF'd; print the rest of a */ + do + prline(spa.lines[0]); + while (addtospan(&spa, fa, RESET)); + } + eofa = eofb = 1; + } else { + continue; + } + } else if (cmp == 0) { + /* read all consecutive matching lines from a */ + do { + if (!addtospan(&spa, fa, EXPAND)) { + eofa = 1; + spa.nl++; + break; + } + } while (linecmp(spa.lines[spa.nl-1], spb.lines[0], jfa, jfb) == 0); + + /* read all consecutive matching lines from b */ + do { + if (!addtospan(&spb, fb, EXPAND)) { + eofb = 1; + spb.nl++; + break; + } + } while (linecmp(spa.lines[0], spb.lines[spb.nl-1], jfa, jfb) == 0); + + if (pairs) + prspanjoin(&spa, &spb, jfa, jfb); + + } else { /* FIELD_ERROR: both lines lacked join fields */ + if (unpairsa) + prline(spa.lines[0]); + 
if (unpairsb) + prline(spb.lines[0]); + eofa = addtospan(&spa, fa, RESET) ? 0 : 1; + eofb = addtospan(&spb, fb, RESET) ? 0 : 1; + if (!eofa && !eofb) + continue; + } + + if (eofa) { + spa.nl = 0; + } else { + swaplines(spa.lines[0], spa.lines[spa.nl - 1]); /* ugly */ + spa.nl = 1; + } + + if (eofb) { + spb.nl = 0; + } else { + swaplines(spb.lines[0], spb.lines[spb.nl - 1]); /* ugly */ + spb.nl = 1; + } + } + freespan(&spa); + freespan(&spb); +} + + +int +main(int argc, char *argv[]) +{ + size_t jf[2] = { jfield, jfield, }; + FILE *fp[2]; + int n; + char *fno; + + ARGBEGIN { + case '1': + jf[0] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX)); + break; + case '2': + jf[1] = estrtonum(EARGF(usage()), 1, MIN(LLONG_MAX, SIZE_MAX)); + break; + case 'a': + fno = EARGF(usage()); + if (strcmp(fno, "1") == 0) + unpairsa = 1; + else if (strcmp(fno, "2") == 0) + unpairsb = 1; + else + usage(); + break; + case 'e': + replace = EARGF(usage()); + break; + case 'o': + oflag = 1; + initolist(&output); + makeolist(&output, EARGF(usage())); + break; + case 't': + sep = EARGF(usage()); + break; + case 'v': + pairs = 0; + fno = EARGF(usage()); + if (strcmp(fno, "1") == 0) + unpairsa = 1; + else if (strcmp(fno, "2") == 0) + unpairsb = 1; + else + usage(); + break; + default: + usage(); + } ARGEND; + + if (sep) + seplen = unescape(sep); + + if (argc != 2) + usage(); + + for (n = 0; n < 2; n++) { + if (argv[n][0] == '-' && !argv[n][1]) { + argv[n] = ""; + fp[n] = stdin; + } else if (!(fp[n] = fopen(argv[n], "r"))) { + eprintf("fopen %s:", argv[n]); + } + } + + jf[0]--; + jf[1]--; + + join(fp[0], fp[1], jf[0], jf[1]); + + if (oflag) + freespecs(&output); + + enfshut(2, fp[0], argv[0]); + if (fp[0] != fp[1]) + enfshut(2, fp[1], argv[1]); + enfshut(2, stdout, ""); + exit(0); +}