/* FIXME: summary * decide whether we enforce valid UTF-8, right now it's enforced in certain * parts of the script, but not the input... * nul bytes cause explosions due to use of libc string functions. thoughts? * lack of newline at end of file, currently we add one. what should we do? * allow "\\t" for "\t" etc. in regex? in replacement text? * POSIX says don't flush on N when out of input, but GNU and busybox do. */ #include #include #include #include #include #include "utf.h" #include "util.h" /* Types */ /* used as queue for writes and stack for {,:,b,t */ typedef struct { void **data; size_t size; size_t cap; } Vec; /* used for arbitrary growth, str is a C string * FIXME: does it make sense to keep track of length? or just rely on libc * string functions? If we want to support nul bytes everything changes */ typedef struct { char *str; size_t cap; } String; typedef struct Cmd Cmd; typedef struct { void (*fn)(Cmd *); char *(*getarg)(Cmd *, char *); void (*freearg)(Cmd *); unsigned char naddr; } Fninfo; typedef struct { union { size_t lineno; regex_t *re; } u; enum { IGNORE, /* empty address, ignore */ EVERY , /* every line */ LINE , /* ilne number */ LAST , /* last line ($) */ REGEX , /* use included regex */ LASTRE, /* use most recently used regex */ } type; } Addr; /* DISCUSS: naddr is not strictly necessary, but very helpful * naddr == 0 iff beg.type == EVERY && end.type == IGNORE * naddr == 1 iff beg.type != IGNORE && end.type == IGNORE * naddr == 2 iff beg.type != IGNORE && end.type != IGNORE */ typedef struct { Addr beg; Addr end; unsigned char naddr; } Range; typedef struct { regex_t *re; /* if NULL use last regex */ String repl; FILE *file; size_t occurrence; /* 0 for all (g flag) */ Rune delim; unsigned int p:1; } Sarg; typedef struct { Rune *set1; Rune *set2; } Yarg; typedef struct { String str; /* a,c,i text. 
r file path */ void (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */ } ACIRarg; struct Cmd { Range range; Fninfo *fninfo; union { Cmd *jump; /* used for b,t when running */ char *label; /* used for :,b,t when building */ ptrdiff_t offset; /* used for { (pointers break during realloc) */ FILE *file; /* used for w */ /* FIXME: Should the following be in the union? or pointers and malloc? */ Sarg s; Yarg y; ACIRarg acir; } u; /* I find your lack of anonymous unions disturbing */ unsigned int in_match:1; unsigned int negate :1; }; /* Files for w command (and s' w flag) */ typedef struct { char *path; FILE *file; } Wfile; /* * Function Declarations */ /* Dynamically allocated arrays and strings */ static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next); static void *pop(Vec *v); static void push(Vec *v, void *p); static void stracat(String *dst, char *src); static void strnacat(String *dst, char *src, size_t n); static void stracpy(String *dst, char *src); /* Cleanup and errors */ static void usage(void); /* Parsing functions and related utilities */ static void compile(char *s, int isfile); static int read_line(FILE *f, String *s); static char *make_range(Range *range, char *s); static char *make_addr(Addr *addr, char *s); static char *find_delim(char *s, Rune delim, int do_brackets); static char *chompr(char *s, Rune rune); static char *chomp(char *s); static Rune *strtorunes(char *s, size_t nrunes); static long stol(char *s, char **endp); static size_t escapes(char *beg, char *end, Rune delim, int n_newline); static size_t echarntorune(Rune *r, char *s, size_t n); static void insert_labels(void); /* Get and Free arg and related utilities */ static char *get_aci_arg(Cmd *c, char *s); static void aci_append(Cmd *c, char *s); static void free_acir_arg(Cmd *c); static char *get_bt_arg(Cmd *c, char *s); static char *get_r_arg(Cmd *c, char *s); static char *get_s_arg(Cmd *c, char *s); static void 
free_s_arg(Cmd *c); static char *get_w_arg(Cmd *c, char *s); static char *get_y_arg(Cmd *c, char *s); static void free_y_arg(Cmd *c); static char *get_colon_arg(Cmd *c, char *s); static char *get_lbrace_arg(Cmd *c, char *s); static char *get_rbrace_arg(Cmd *c, char *s); static char *semicolon_arg(char *s); /* Running */ static void run(void); static int in_range(Cmd *c); static int match_addr(Addr *a); static int next_file(void); static int is_eof(FILE *f); static void do_writes(void); static void write_file(char *path, FILE *out); static void check_puts(char *s, FILE *f); static void update_ranges(Cmd *beg, Cmd *end); /* Sed functions */ static void cmd_y(Cmd *c); static void cmd_x(Cmd *c); static void cmd_w(Cmd *c); static void cmd_t(Cmd *c); static void cmd_s(Cmd *c); static void cmd_r(Cmd *c); static void cmd_q(Cmd *c); static void cmd_P(Cmd *c); static void cmd_p(Cmd *c); static void cmd_N(Cmd *c); static void cmd_n(Cmd *c); static void cmd_l(Cmd *c); static void cmd_i(Cmd *c); static void cmd_H(Cmd *c); static void cmd_h(Cmd *c); static void cmd_G(Cmd *c); static void cmd_g(Cmd *c); static void cmd_D(Cmd *c); static void cmd_d(Cmd *c); static void cmd_c(Cmd *c); static void cmd_b(Cmd *c); static void cmd_a(Cmd *c); static void cmd_colon(Cmd *c); static void cmd_equal(Cmd *c); static void cmd_lbrace(Cmd *c); static void cmd_rbrace(Cmd *c); static void cmd_last(Cmd *c); /* Actions */ static void new_line(void); static void app_line(void); static void new_next(void); static void old_next(void); /* * Globals */ static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */ static Vec writes; /* holds cmd*. writes scheduled by a and r commands */ static Vec wfiles; /* holds Wfile*. 
files for w and s///w commands */ static Cmd *prog, *pc; /* Program, program counter */ static size_t pcap; static size_t lineno; static regex_t *lastre; /* last used regex for empty regex search */ static char **files; /* list of file names from argv */ static FILE *file; /* current file we are reading */ static String patt, hold, genbuf; static struct { unsigned int n :1; /* -n (no print) */ unsigned int E :1; /* -E (extended re) */ unsigned int s :1; /* s/// replacement happened */ unsigned int aci_cont:1; /* a,c,i text continuation */ unsigned int s_cont :1; /* s/// replacement text continuation */ unsigned int halt :1; /* halt execution */ } gflags; /* FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes */ static Fninfo fns[] = { ['a'] = { cmd_a , get_aci_arg , free_acir_arg , 1 }, /* schedule write of text for later */ ['b'] = { cmd_b , get_bt_arg , NULL , 2 }, /* branch to label char *label when building, Cmd *jump when running */ ['c'] = { cmd_c , get_aci_arg , free_acir_arg , 2 }, /* delete pattern space. 
at 0 or 1 addr or end of 2 addr, write text */ ['d'] = { cmd_d , NULL , NULL , 2 }, /* delete pattern space */ ['D'] = { cmd_D , NULL , NULL , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d) */ ['g'] = { cmd_g , NULL , NULL , 2 }, /* replace pattern space with hold space */ ['G'] = { cmd_G , NULL , NULL , 2 }, /* append newline and hold space to pattern space */ ['h'] = { cmd_h , NULL , NULL , 2 }, /* replace hold space with pattern space */ ['H'] = { cmd_H , NULL , NULL , 2 }, /* append newline and pattern space to hold space */ ['i'] = { cmd_i , get_aci_arg , free_acir_arg , 1 }, /* write text */ ['l'] = { cmd_l , NULL , NULL , 2 }, /* write pattern space in 'visually unambiguous form' */ ['n'] = { cmd_n , NULL , NULL , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit) */ ['N'] = { cmd_N , NULL , NULL , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */ ['p'] = { cmd_p , NULL , NULL , 2 }, /* write pattern space */ ['P'] = { cmd_P , NULL , NULL , 2 }, /* write pattern space up to first newline */ ['q'] = { cmd_q , NULL , NULL , 1 }, /* quit */ ['r'] = { cmd_r , get_r_arg , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file) */ ['s'] = { cmd_s , get_s_arg , free_s_arg , 2 }, /* find/replace/all that crazy s stuff */ ['t'] = { cmd_t , get_bt_arg , NULL , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */ ['w'] = { cmd_w , get_w_arg , NULL , 2 }, /* append pattern space to file */ ['x'] = { cmd_x , NULL , NULL , 2 }, /* exchange pattern and hold spaces */ ['y'] = { cmd_y , get_y_arg , free_y_arg , 2 }, /* replace runes in set1 with runes in set2 */ [':'] = { cmd_colon , get_colon_arg , NULL , 0 }, /* defines label for later b and t commands */ ['='] = { cmd_equal , NULL , NULL , 1 }, /* printf("%d\n", line_number); */ ['{'] = { cmd_lbrace, get_lbrace_arg, NULL 
, 2 }, /* if we match, run commands, otherwise jump to close */ ['}'] = { cmd_rbrace, get_rbrace_arg, NULL , 0 }, /* noop, hold onto open for ease of building scripts */ [0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */ }; /* * Function Definitions */ /* given memory pointed to by *ptr that currently holds *nmemb members of size * size, realloc to hold new_nmemb members, return new_nmemb in *memb and one * past old end in *next. if realloc fails...explode */ static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next) { void *n, *tmp; if (new_nmemb) { tmp = ereallocarray(*ptr, new_nmemb, size); } else { /* turns out realloc(*ptr, 0) != free(*ptr) */ free(*ptr); tmp = NULL; } n = (char *)tmp + *nmemb * size; *nmemb = new_nmemb; *ptr = tmp; if (next) *next = n; } static void * pop(Vec *v) { if (!v->size) return NULL; return v->data[--v->size]; } static void push(Vec *v, void *p) { if (v->size == v->cap) resize((void **)&v->data, &v->cap, sizeof(*v->data), v->cap * 2 + 1, NULL); v->data[v->size++] = p; } static void stracat(String *dst, char *src) { int new = !dst->cap; size_t len; len = (new ? 0 : strlen(dst->str)) + strlen(src) + 1; if (dst->cap < len) resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); if (new) *dst->str = '\0'; strcat(dst->str, src); } static void strnacat(String *dst, char *src, size_t n) { int new = !dst->cap; size_t len; len = strlen(src); len = (new ? 
0 : strlen(dst->str)) + MIN(n, len) + 1; if (dst->cap < len) resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); if (new) *dst->str = '\0'; strlcat(dst->str, src, len); } static void stracpy(String *dst, char *src) { size_t len; len = strlen(src) + 1; if (dst->cap < len) resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL); strcpy(dst->str, src); } static void leprintf(char *s) { if (errno) eprintf("%zu: %s: %s\n", lineno, s, strerror(errno)); else eprintf("%zu: %s\n", lineno, s); } /* FIXME: write usage message */ static void usage(void) { eprintf("usage: sed [-nrE] script [file ...]\n" " sed [-nrE] -e script [-e script] ... [-f scriptfile] ... [file ...]\n" " sed [-nrE] [-e script] ... -f scriptfile [-f scriptfile] ... [file ...]\n"); } /* Differences from POSIX * we allows semicolons and trailing blanks inside {} * we allow spaces after ! (and in between !s) * we allow extended regular expressions (-E) */ static void compile(char *s, int isfile) { FILE *f; if (!isfile && !*s) /* empty string script */ return; f = isfile ? 
fopen(s, "r") : fmemopen(s, strlen(s), "r"); if (!f) eprintf("fopen/fmemopen:"); /* NOTE: get arg functions can't use genbuf */ while (read_line(f, &genbuf) != EOF) { s = genbuf.str; /* if the first two characters of the script are "#n" default output shall be suppressed */ if (++lineno == 1 && *s == '#' && s[1] == 'n') { gflags.n = 1; continue; } if (gflags.aci_cont) { aci_append(pc - 1, s); continue; } if (gflags.s_cont) s = (pc - 1)->fninfo->getarg(pc - 1, s); while (*s) { s = chompr(s, ';'); if (!*s || *s == '#') break; if ((size_t)(pc - prog) == pcap) resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc); pc->range.beg.type = pc->range.end.type = IGNORE; pc->fninfo = NULL; pc->in_match = 0; s = make_range(&pc->range, s); s = chomp(s); pc->negate = *s == '!'; s = chompr(s, '!'); if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn) leprintf("bad sed function"); if (pc->range.naddr > pc->fninfo->naddr) leprintf("wrong number of addresses"); s++; if (pc->fninfo->getarg) s = pc->fninfo->getarg(pc, s); pc++; } } fshut(f, s); } /* FIXME: if we decide to honor lack of trailing newline, set/clear a global * flag when reading a line */ static int read_line(FILE *f, String *s) { ssize_t len; if (!f) return EOF; if ((len = getline(&s->str, &s->cap, f)) < 0) { if (ferror(f)) eprintf("getline:"); return EOF; } if (s->str[--len] == '\n') s->str[len] = '\0'; return 0; } /* read first range from s, return pointer to one past end of range */ static char * make_range(Range *range, char *s) { s = make_addr(&range->beg, s); if (*s == ',') s = make_addr(&range->end, s + 1); else range->end.type = IGNORE; if (range->beg.type == EVERY && range->end.type == IGNORE) range->naddr = 0; else if (range->beg.type != IGNORE && range->end.type == IGNORE) range->naddr = 1; else if (range->beg.type != IGNORE && range->end.type != IGNORE) range->naddr = 2; else leprintf("this is impossible..."); return s; } /* read first addr from s, return pointer to one past end of 
addr */
/* sets addr->type, and addr->u for LINE and REGEX addresses
 * '$' is LAST, a leading digit is LINE, '/' or '\' followed by a delimiter
 * starts a regex (an empty one is LASTRE), anything else is EVERY and
 * consumes nothing
 * for REGEX the delimiter in s is overwritten with a nul byte and the regex
 * is compiled into freshly allocated memory */
static char *
make_addr(Addr *addr, char *s)
{
	Rune r;
	char *p = s + strlen(s);
	size_t rlen = echarntorune(&r, s, p - s);

	if (r == '$') {
		addr->type = LAST;
		s += rlen;
	} else if (isdigitrune(r)) {
		addr->type = LINE;
		addr->u.lineno = stol(s, &s);
	} else if (r == '/' || r == '\\') {
		Rune delim;

		/* \cREGEXc form, the rune after the backslash is the delimiter */
		if (r == '\\') {
			s += rlen;
			rlen = echarntorune(&r, s, p - s);
		}
		if (r == '\\')
			leprintf("bad delimiter '\\'");
		delim = r;
		s += rlen;
		rlen = echarntorune(&r, s, p - s);
		if (r == delim) { /* empty regex, use the last one */
			addr->type = LASTRE;
			s += rlen;
		} else {
			addr->type = REGEX;
			p = find_delim(s, delim, 1);
			if (!*p)
				leprintf("unclosed regex");
			/* escapes() shortens the text in place, so the delimiter moved left */
			p -= escapes(s, p, delim, 0);
			*p++ = '\0';

			addr->u.re = emalloc(sizeof(*addr->u.re));
			eregcomp(addr->u.re, s, gflags.E ? REG_EXTENDED : 0);
			s = p;
		}
	} else {
		addr->type = EVERY;
	}

	return s;
}

/* return pointer to first delim in s that is not escaped
 * and if do_brackets is set, not in [] (note possible [::], [..], [==], inside [])
 * return pointer to trailing nul byte if no delim found
 *
 * any escaped character that is not special is just itself (POSIX undefined)
 * FIXME: pull out into some util thing, will be useful for ed as well
 */
static char *
find_delim(char *s, Rune delim, int do_brackets)
{
	enum {
		OUTSIDE         , /* not in brackets */
		BRACKETS_OPENING, /* last char was first [ or last two were first [^ */
		BRACKETS_INSIDE , /* inside [] */
		INSIDE_OPENING  , /* inside [] and last char was [ */
		CLASS_INSIDE    , /* inside class [::], or collating element [..] or [==], inside [] */
		CLASS_CLOSING   , /* inside class [::], or collating element [..] or [==], and last character was the respective : . or = */
	} state = OUTSIDE;

	Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */
	size_t rlen;
	int escape = 0;
	char *end = s + strlen(s);

	for (; *s; s += rlen) {
		rlen = echarntorune(&r, s, end - s);

		/* first chain: a ^ or ] directly after the opening [ is not special */
		if      (state == BRACKETS_OPENING       &&  r == '^'  ) {                            continue; }
		else if (state == BRACKETS_OPENING       &&  r == ']'  ) { state  = BRACKETS_INSIDE ; continue; }
		else if (state == BRACKETS_OPENING                     ) { state  = BRACKETS_INSIDE ;           }

		/* second chain: the order of these tests matters, the first match wins */
		if      (state == CLASS_CLOSING          &&  r == ']'  ) { state  = BRACKETS_INSIDE ;           }
		else if (state == CLASS_CLOSING                        ) { state  = CLASS_INSIDE    ;           }
		else if (state == CLASS_INSIDE           &&  r ==  c   ) { state  = CLASS_CLOSING   ;           }
		else if (state == INSIDE_OPENING         && (r == ':'  ||
		                                             r == '.'  ||
		                                             r == '=') ) { state  = CLASS_INSIDE    ; c = r;    }
		else if (state == INSIDE_OPENING         &&  r == ']'  ) { state  = OUTSIDE         ;           }
		else if (state == INSIDE_OPENING                       ) { state  = BRACKETS_INSIDE ;           }
		else if (state == BRACKETS_INSIDE        &&  r == '['  ) { state  = INSIDE_OPENING  ;           }
		else if (state == BRACKETS_INSIDE        &&  r == ']'  ) { state  = OUTSIDE         ;           }
		else if (state == OUTSIDE                &&  escape    ) { escape = 0               ;           }
		else if (state == OUTSIDE                &&  r == '\\' ) { escape = 1               ;           }
		else if (state == OUTSIDE                &&  r == delim) return s;
		else if (state == OUTSIDE && do_brackets &&  r == '['  ) { state  = BRACKETS_OPENING;           }
	}
	return s;
}

/* eat all leading whitespace */
static char *
chomp(char *s)
{
	return chompr(s, 0);
}

/* eat all leading whitespace and occurrences of rune */
static char *
chompr(char *s, Rune rune)
{
	Rune   r;
	size_t rlen;
	char  *end = s + strlen(s);

	while (*s && (rlen = echarntorune(&r, s, end - s)) && (isspacerune(r) || r == rune))
		s += rlen;
	return s;
}

/* convert first nrunes Runes from UTF-8 string s in allocated Rune*
 * the result is terminated with a 0 Rune
 * NOTE: sequence must be valid UTF-8, check first */
static Rune *
strtorunes(char *s, size_t nrunes)
{
	Rune *rs, *rp;

	rp = rs = ereallocarray(NULL, nrunes + 1, sizeof(*rs));

	while (nrunes--)
		s += chartorune(rp++, s);

	*rp = '\0';
	return rs;
}

/* strtol(3) in base 10 that reports through leprintf when errno gets set or
 * when no digits were consumed, *endp is set as strtol sets it */
static long
stol(char *s, char **endp)
{
	long n;
	errno = 0;
	n = strtol(s, endp, 10);

	if (errno)
		leprintf("strtol:");
	if (*endp == s)
		leprintf("strtol: invalid number");

	return n;
}

/* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim)
 * if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal)
 * if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command)
 * if delim is 0 all escaped characters represent themselves (aci text)
 * memmove rest of string (beyond end) into place
 * return the number of converted escapes (backslashes removed)
 * FIXME: this has had too many corner cases slapped on and is ugly. rewrite better
 */
static size_t
escapes(char *beg, char *end, Rune delim, int n_newline)
{
	size_t num = 0;
	char *src = beg, *dst = beg; /* read and write cursors, dst never passes src */

	while (src < end) {
		/* handle escaped backslash specially so we don't think the second
		 * backslash is escaping something */
		if (*src == '\\' && src[1] == '\\') {
			*dst++ = *src++;
			if (delim)
				*dst++ = *src++;
			else
				src++;
		} else if (*src == '\\' && !delim) {
			/* aci text: drop the backslash, the next byte is copied by a later pass */
			src++;
		} else if (*src == '\\' && src[1]) {
			Rune r;
			size_t rlen;

			/* assume the backslash is removed, undone below if it is kept */
			num++;
			src++;
			rlen = echarntorune(&r, src, end - src);

			if (r == 'n' && delim == 'n') {
				*src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */
			} else if (r == 'n') {
				*src = '\n';
			} else if (r != delim) {
				/* not an escape we convert, keep the backslash */
				*dst++ = '\\';
				num--;
			}

			memmove(dst, src, rlen);
			dst += rlen;
			src += rlen;
		} else {
			*dst++ = *src++;
		}
	}
	memmove(dst, src, strlen(src) + 1);
	return num;
}

/* charntorune() that reports "invalid UTF-8" through leprintf when no rune
 * could be decoded from the first n bytes of s or the rune is Runeerror */
static size_t
echarntorune(Rune *r, char *s, size_t n)
{
	size_t rlen = charntorune(r, s, n);
	if (!rlen || *r == Runeerror)
		leprintf("invalid UTF-8");
	return rlen;
}

/* resolve every recorded branch: replace the label stored in the command's
 * union with a pointer to the command to jump to
 * a branch without a label gets pc - 1, a label not found in labels is an error */
static void
insert_labels(void)
{
	size_t i;
	Cmd *from, *to;

	while (branches.size) {
		/* branches and labels hold offsets into prog, not pointers */
		from = prog + (ptrdiff_t)pop(&branches);

		if (!from->u.label) {/* no label branch to end of script */
			from->u.jump = pc - 1;
		} else {
			for (i = 0; i < labels.size; i++) {
				to = prog + (ptrdiff_t)labels.data[i];
				if (!strcmp(from->u.label, to->u.label)) {
					from->u.jump = to;
					break;
				}
			}
			if (i == labels.size)
				leprintf("bad label");
		}
	}
}

/*
 * Getargs / Freeargs
 * Read argument from s, return pointer to one past last character of argument
 */

/* POSIX compliant
 * i\
 * foobar
 *
 * also allow the following non POSIX compliant
 * i        # empty line
 * ifoobar
 * ifoobar\
 * baz
 *
 * FIXME: GNU and busybox discard leading spaces
 * i  foobar
 * i foobar
 * ifoobar
 * are equivalent in GNU and busybox. We don't. Should we?
 */
static char *
get_aci_arg(Cmd *c, char *s)
{
	c->u.acir.print = check_puts;
	c->u.acir.str = (String){ NULL, 0 };

	gflags.aci_cont = !!*s; /* no continue flag if empty string */

	/* neither empty string nor POSIX compliant */
	if (*s && !(*s == '\\' && !s[1]))
		aci_append(c, s);

	/* a,c,i always consume the rest of the line */
	return s + strlen(s);
}

/* append the line s to the text of c
 * an odd number of trailing backslashes sets gflags.aci_cont and turns the
 * last backslash into a newline, remaining escapes are resolved by escapes() */
static void
aci_append(Cmd *c, char *s)
{
	char *end = s + strlen(s), *p = end;

	/* toggle once per trailing backslash, an even count is escaped backslashes */
	gflags.aci_cont = 0;
	while (--p >= s && *p == '\\')
		gflags.aci_cont = !gflags.aci_cont;

	if (gflags.aci_cont)
		*--end = '\n';

	escapes(s, end, 0, 0);
	stracat(&c->u.acir.str, s);
}

/* free the text (a,c,i) or file path (r) held by c */
static void
free_acir_arg(Cmd *c)
{
	free(c->u.acir.str.str);
}

/* POSIX dictates that label is rest of line, including semicolons, trailing
 * whitespace, closing braces, etc.
and can be limited to 8 bytes * * I allow a semicolon or closing brace to terminate a label name, it's not * POSIX compliant, but it's useful and every sed version I've tried to date * does the same. * * FIXME: POSIX dictates that leading whitespace is ignored but trailing * whitespace is not. This is annoying and we should probably get rid of it. */ static char * get_bt_arg(Cmd *c, char *s) { char *p = semicolon_arg(s = chomp(s)); if (p != s) { c->u.label = estrndup(s, p - s); } else { c->u.label = NULL; } push(&branches, (void *)(c - prog)); return p; } /* POSIX dictates file name is rest of line including semicolons, trailing * whitespace, closing braces, etc. and file name must be preceded by a space * * I allow a semicolon or closing brace to terminate a file name and don't * enforce leading space. * * FIXME: decide whether trailing whitespace should be included and fix * accordingly */ static char * get_r_arg(Cmd *c, char *s) { char *p = semicolon_arg(s = chomp(s)); if (p == s) leprintf("no file name"); c->u.acir.str.str = estrndup(s, p - s); c->u.acir.print = write_file; return p; } /* we allow "\\n" in replacement text to mean "\n" (undefined in POSIX) * * FIXME: allow other escapes in regex and replacement? 
if so change escapes() */ static char * get_s_arg(Cmd *c, char *s) { Rune delim, r; Cmd buf; char *p; int esc, lastre; /* s/Find/Replace/Flags */ /* Find */ if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */ lastre = 0; c->u.s.repl = (String){ NULL, 0 }; c->u.s.occurrence = 1; c->u.s.file = NULL; c->u.s.p = 0; if (!*s || *s == '\\') leprintf("bad delimiter"); p = s + strlen(s); s += echarntorune(&delim, s, p - s); c->u.s.delim = delim; echarntorune(&r, s, p - s); if (r == delim) /* empty regex */ lastre = 1; p = find_delim(s, delim, 1); if (!*p) leprintf("missing second delimiter"); p -= escapes(s, p, delim, 0); *p = '\0'; if (lastre) { c->u.s.re = NULL; } else { c->u.s.re = emalloc(sizeof(*c->u.s.re)); /* FIXME: different eregcomp that calls fatal */ eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0); } s = p + runelen(delim); } /* Replace */ delim = c->u.s.delim; p = find_delim(s, delim, 0); p -= escapes(s, p, delim, 0); if (!*p) { /* no third delimiter */ /* FIXME: same backslash counting as aci_append() */ if (p[-1] != '\\') leprintf("missing third delimiter or "); p[-1] = '\n'; gflags.s_cont = 1; } else { gflags.s_cont = 0; } /* check for bad references in replacement text */ *p = '\0'; for (esc = 0, p = s; *p; p++) { if (esc) { esc = 0; if (isdigit(*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub) leprintf("back reference number greater than number of groups"); } else if (*p == '\\') { esc = 1; } } stracat(&c->u.s.repl, s); if (gflags.s_cont) return p; s = p + runelen(delim); /* Flags */ p = semicolon_arg(s = chomp(s)); /* FIXME: currently for simplicity take last of g or occurrence flags and * ignore multiple p flags. 
need to fix that */ for (; s < p; s++) { if (isdigit(*s)) { c->u.s.occurrence = stol(s, &s); s--; /* for loop will advance pointer */ } else { switch (*s) { case 'g': c->u.s.occurrence = 0; break; case 'p': c->u.s.p = 1; break; case 'w': /* must be last flag, take everything up to newline/semicolon * s == p after this */ s = get_w_arg(&buf, chomp(s+1)); c->u.s.file = buf.u.file; break; } } } return p; } static void free_s_arg(Cmd *c) { if (c->u.s.re) regfree(c->u.s.re); free(c->u.s.re); free(c->u.s.repl.str); } /* see get_r_arg notes */ static char * get_w_arg(Cmd *c, char *s) { char *p = semicolon_arg(s = chomp(s)); Wfile *w, **wp; if (p == s) leprintf("no file name"); for (wp = (Wfile **)wfiles.data; (size_t)(wp - (Wfile **)wfiles.data) < wfiles.size; wp++) { if (strlen((*wp)->path) == (size_t)(p - s) && !strncmp(s, (*wp)->path, p - s)) { c->u.file = (*wp)->file; return p; } } w = emalloc(sizeof(*w)); w->path = estrndup(s, p - s); if (!(w->file = fopen(w->path, "w"))) leprintf("fopen failed"); c->u.file = w->file; push(&wfiles, w); return p; } static char * get_y_arg(Cmd *c, char *s) { Rune delim; char *p = s + strlen(s); size_t rlen = echarntorune(&delim, s, p - s); size_t nrunes1, nrunes2; c->u.y.set1 = c->u.y.set2 = NULL; s += rlen; p = find_delim(s, delim, 0); p -= escapes(s, p, delim, 1); nrunes1 = utfnlen(s, p - s); c->u.y.set1 = strtorunes(s, nrunes1); s = p + rlen; p = find_delim(s, delim, 0); p -= escapes(s, p, delim, 1); nrunes2 = utfnlen(s, p - s); if (nrunes1 != nrunes2) leprintf("different set lengths"); c->u.y.set2 = strtorunes(s, utfnlen(s, p - s)); return p + rlen; } static void free_y_arg(Cmd *c) { free(c->u.y.set1); free(c->u.y.set2); } /* see get_bt_arg notes */ static char * get_colon_arg(Cmd *c, char *s) { char *p = semicolon_arg(s = chomp(s)); if (p == s) leprintf("no label name"); c->u.label = estrndup(s, p - s); push(&labels, (void *)(c - prog)); return p; } static char * get_lbrace_arg(Cmd *c, char *s) { push(&braces, (void *)(c - prog)); 
return s; } static char * get_rbrace_arg(Cmd *c, char *s) { Cmd *lbrace; if (!braces.size) leprintf("extra }"); lbrace = prog + (ptrdiff_t)pop(&braces); lbrace->u.offset = c - prog; return s; } /* s points to beginning of an argument that may be semicolon terminated * return pointer to semicolon or nul byte after string * or closing brace as to not force ; before } * FIXME: decide whether or not to eat trailing whitespace for arguments that * we allow semicolon/brace termination that POSIX doesn't * b, r, t, w, : * POSIX says trailing whitespace is part of label name, file name, etc. * we should probably eat it */ static char * semicolon_arg(char *s) { char *p = strpbrk(s, ";}"); if (!p) p = s + strlen(s); return p; } static void run(void) { lineno = 0; if (braces.size) leprintf("extra {"); /* genbuf has already been initialized, patt will be in new_line * (or we'll halt) */ stracpy(&hold, ""); insert_labels(); next_file(); new_line(); for (pc = prog; !gflags.halt; pc++) pc->fninfo->fn(pc); } /* return true if we are in range for c, set c->in_match appropriately */ static int in_range(Cmd *c) { if (match_addr(&c->range.beg)) { if (c->range.naddr == 2) { if (c->range.end.type == LINE && c->range.end.u.lineno <= lineno) c->in_match = 0; else c->in_match = 1; } return !c->negate; } if (c->in_match && match_addr(&c->range.end)) { c->in_match = 0; return !c->negate; } return c->in_match ^ c->negate; } /* return true if addr matches current line */ static int match_addr(Addr *a) { switch (a->type) { default: case IGNORE: return 0; case EVERY: return 1; case LINE: return lineno == a->u.lineno; case LAST: while (is_eof(file) && !next_file()) ; return !file; case REGEX: lastre = a->u.re; return !regexec(a->u.re, patt.str, 0, NULL, 0); case LASTRE: if (!lastre) leprintf("no previous regex"); return !regexec(lastre, patt.str, 0, NULL, 0); } } /* move to next input file * stdin if first call and no files * return 0 for success and 1 for no more files */ static int 
next_file(void) { static unsigned char first = 1; if (file == stdin) clearerr(file); else if (file) fshut(file, ""); file = NULL; do { if (!*files) { if (first) /* given no files, default to stdin */ file = stdin; /* else we've used all our files, leave file = NULL */ } else if (!strcmp(*files, "-")) { file = stdin; files++; } else if (!(file = fopen(*files++, "r"))) { /* warn this file didn't open, but move on to next */ weprintf("fopen:"); } } while (!file && *files); first = 0; return !file; } /* test if stream is at EOF */ static int is_eof(FILE *f) { int c; if (!f || feof(f)) return 1; c = fgetc(f); if (c == EOF && ferror(f)) eprintf("fgetc:"); if (c != EOF && ungetc(c, f) == EOF) eprintf("ungetc EOF\n"); return c == EOF; } /* perform writes that were scheduled * for aci this is check_puts(string, stdout) * for r this is write_file(path, stdout) */ static void do_writes(void) { Cmd *c; size_t i; for (i = 0; i < writes.size; i++) { c = writes.data[i]; c->u.acir.print(c->u.acir.str.str, stdout); } writes.size = 0; } /* used for r's u.acir.print() * FIXME: something like util's concat() would be better */ static void write_file(char *path, FILE *out) { FILE *in = fopen(path, "r"); if (!in) /* no file is treated as empty file */ return; while (read_line(in, &genbuf) != EOF) check_puts(genbuf.str, out); fshut(in, path); } static void check_puts(char *s, FILE *f) { if (s && fputs(s, f) == EOF) eprintf("fputs:"); if (fputs("\n", f) == EOF) eprintf("fputs:"); } /* iterate from beg to end updating ranges so we don't miss any commands * e.g. sed -n '1d;1,3p' should still print lines 2 and 3 */ static void update_ranges(Cmd *beg, Cmd *end) { while (beg < end) in_range(beg++); } /* * Sed functions */ static void cmd_a(Cmd *c) { if (in_range(c)) push(&writes, c); } static void cmd_b(Cmd *c) { if (!in_range(c)) return; /* if we jump backwards update to end, otherwise update to destination */ update_ranges(c + 1, c->u.jump > c ? 
c->u.jump : prog + pcap); pc = c->u.jump; } static void cmd_c(Cmd *c) { if (!in_range(c)) return; /* write the text on the last line of the match */ if (!c->in_match) check_puts(c->u.acir.str.str, stdout); /* otherwise start the next cycle without printing pattern space * effectively deleting the text */ new_next(); } static void cmd_d(Cmd *c) { if (!in_range(c)) return; new_next(); } static void cmd_D(Cmd *c) { char *p; if (!in_range(c)) return; if ((p = strchr(patt.str, '\n'))) { p++; memmove(patt.str, p, strlen(p) + 1); old_next(); } else { new_next(); } } static void cmd_g(Cmd *c) { if (in_range(c)) stracpy(&patt, hold.str); } static void cmd_G(Cmd *c) { if (!in_range(c)) return; stracat(&patt, "\n"); stracat(&patt, hold.str); } static void cmd_h(Cmd *c) { if (in_range(c)) stracpy(&hold, patt.str); } static void cmd_H(Cmd *c) { if (!in_range(c)) return; stracat(&hold, "\n"); stracat(&hold, patt.str); } static void cmd_i(Cmd *c) { if (in_range(c)) check_puts(c->u.acir.str.str, stdout); } /* I think it makes sense to print invalid UTF-8 sequences in octal to satisfy * the "visually unambiguous form" sed(1p) */ static void cmd_l(Cmd *c) { Rune r; char *p, *end; size_t rlen; char *escapes[] = { /* FIXME: 7 entries and search instead of 127 */ ['\\'] = "\\\\", ['\a'] = "\\a", ['\b'] = "\\b", ['\f'] = "\\f" , ['\r'] = "\\r", ['\t'] = "\\t", ['\v'] = "\\v" , [0x7f] = NULL, /* fill out the table */ }; if (!in_range(c)) return; /* FIXME: line wrapping. sed(1p) says "length at which folding occurs is * unspecified, but should be appropraite for the output device" * just wrap at 80 Runes? 
*/ for (p = patt.str, end = p + strlen(p); p < end; p += rlen) { if (isascii(*p) && escapes[(unsigned int)*p]) { fputs(escapes[(unsigned int)*p], stdout); rlen = 1; } else if (!(rlen = charntorune(&r, p, end - p))) { /* ran out of chars, print the bytes of the short sequence */ for (; p < end; p++) printf("\\%03hho", (unsigned char)*p); break; } else if (r == Runeerror) { for (; rlen; rlen--, p++) printf("\\%03hho", (unsigned char)*p); } else { while (fwrite(p, rlen, 1, stdout) < 1 && errno == EINTR) ; if (ferror(stdout)) eprintf("fwrite:"); } } check_puts("$", stdout); } static void cmd_n(Cmd *c) { if (!in_range(c)) return; if (!gflags.n) check_puts(patt.str, stdout); do_writes(); new_line(); } static void cmd_N(Cmd *c) { if (!in_range(c)) return; do_writes(); app_line(); } static void cmd_p(Cmd *c) { if (in_range(c)) check_puts(patt.str, stdout); } static void cmd_P(Cmd *c) { char *p; if (!in_range(c)) return; if ((p = strchr(patt.str, '\n'))) *p = '\0'; check_puts(patt.str, stdout); if (p) *p = '\n'; } static void cmd_q(Cmd *c) { if (!in_range(c)) return; if (!gflags.n) check_puts(patt.str, stdout); do_writes(); gflags.halt = 1; } static void cmd_r(Cmd *c) { if (in_range(c)) push(&writes, c); } static void cmd_s(Cmd *c) { String tmp; Rune r; size_t plen, rlen, len; char *p, *s, *end; unsigned int matches = 0, last_empty = 1, qflag = 0, cflags = 0; regex_t *re; regmatch_t *rm, *pmatch = NULL; if (!in_range(c)) return; if (!c->u.s.re && !lastre) leprintf("no previous regex"); re = c->u.s.re ? c->u.s.re : lastre; lastre = re; plen = re->re_nsub + 1; pmatch = ereallocarray(NULL, plen, sizeof(regmatch_t)); *genbuf.str = '\0'; s = patt.str; while (!qflag && !regexec(re, s, plen, pmatch, cflags)) { cflags = REG_NOTBOL; /* match against beginning of line first time, but not again */ if (!*s) /* match against empty string first time, but not again */ qflag = 1; /* don't substitute if last match was not empty but this one is. * s_a*_._g * foobar -> .f.o.o.b.r. 
*/ if ((last_empty || pmatch[0].rm_eo) && (++matches == c->u.s.occurrence || !c->u.s.occurrence)) { /* copy over everything before the match */ strnacat(&genbuf, s, pmatch[0].rm_so); /* copy over replacement text, taking into account &, backreferences, and \ escapes */ for (p = c->u.s.repl.str, len = strcspn(p, "\\&"); *p; len = strcspn(++p, "\\&")) { strnacat(&genbuf, p, len); p += len; switch (*p) { default: leprintf("this shouldn't be possible"); case '\0': /* we're at the end, back up one so the ++p will put us on * the null byte to break out of the loop */ --p; break; case '&': strnacat(&genbuf, s + pmatch[0].rm_so, pmatch[0].rm_eo - pmatch[0].rm_so); break; case '\\': if (isdigit(*++p)) { /* backreference */ /* only need to check here if using lastre, otherwise we checked when building */ if (!c->u.s.re && (size_t)(*p - '0') > re->re_nsub) leprintf("back reference number greater than number of groups"); rm = &pmatch[*p - '0']; strnacat(&genbuf, s + rm->rm_so, rm->rm_eo - rm->rm_so); } else { /* character after backslash taken literally (well one byte, but it works) */ strnacat(&genbuf, p, 1); } break; } } } else { /* not replacing, copy over everything up to and including the match */ strnacat(&genbuf, s, pmatch[0].rm_eo); } if (!pmatch[0].rm_eo) { /* empty match, advance one rune and add it to output */ end = s + strlen(s); rlen = charntorune(&r, s, end - s); if (!rlen) { /* ran out of bytes, copy short sequence */ stracat(&genbuf, s); s = end; } else { /* copy whether or not it's a good rune */ strnacat(&genbuf, s, rlen); s += rlen; } } last_empty = !pmatch[0].rm_eo; s += pmatch[0].rm_eo; } free(pmatch); if (!(matches && matches >= c->u.s.occurrence)) /* no replacement */ return; gflags.s = 1; stracat(&genbuf, s); tmp = patt; patt = genbuf; genbuf = tmp; if (c->u.s.p) check_puts(patt.str, stdout); if (c->u.s.file) check_puts(patt.str, c->u.s.file); } static void cmd_t(Cmd *c) { if (!in_range(c) || !gflags.s) return; /* if we jump backwards update to end, 
otherwise update to destination */ update_ranges(c + 1, c->u.jump > c ? c->u.jump : prog + pcap); pc = c->u.jump; gflags.s = 0; } static void cmd_w(Cmd *c) { if (in_range(c)) check_puts(patt.str, c->u.file); } static void cmd_x(Cmd *c) { String tmp; if (!in_range(c)) return; tmp = patt; patt = hold; hold = tmp; } static void cmd_y(Cmd *c) { String tmp; Rune r, *rp; size_t n, rlen; char *s, *end, buf[UTFmax]; if (!in_range(c)) return; *genbuf.str = '\0'; for (s = patt.str, end = s + strlen(s); *s; s += rlen) { if (!(rlen = charntorune(&r, s, end - s))) { /* ran out of chars, copy rest */ stracat(&genbuf, s); break; } else if (r == Runeerror) { /* bad UTF-8 sequence, copy bytes */ strnacat(&genbuf, s, rlen); } else { for (rp = c->u.y.set1; *rp; rp++) if (*rp == r) break; if (*rp) { /* found r in set1, replace with Rune from set2 */ n = runetochar(buf, c->u.y.set2 + (rp - c->u.y.set1)); strnacat(&genbuf, buf, n); } else { strnacat(&genbuf, s, rlen); } } } tmp = patt; patt = genbuf; genbuf = tmp; } static void cmd_colon(Cmd *c) { } static void cmd_equal(Cmd *c) { if (in_range(c)) printf("%zu\n", lineno); } static void cmd_lbrace(Cmd *c) { Cmd *jump; if (in_range(c)) return; /* update ranges on all commands we skip */ jump = prog + c->u.offset; update_ranges(c + 1, jump); pc = jump; } static void cmd_rbrace(Cmd *c) { } /* not actually a sed function, but acts like one, put in last spot of script */ static void cmd_last(Cmd *c) { if (!gflags.n) check_puts(patt.str, stdout); do_writes(); new_next(); } /* * Actions */ /* read new line, continue current cycle */ static void new_line(void) { while (read_line(file, &patt) == EOF) { if (next_file()) { gflags.halt = 1; return; } } gflags.s = 0; lineno++; } /* append new line, continue current cycle * FIXME: used for N, POSIX specifies do not print pattern space when out of * input, but GNU does so busybox does as well. Currently we don't. * Should we? 
*/ static void app_line(void) { while (read_line(file, &genbuf) == EOF) { if (next_file()) { gflags.halt = 1; return; } } stracat(&patt, "\n"); stracat(&patt, genbuf.str); gflags.s = 0; lineno++; } /* read new line, start new cycle */ static void new_next(void) { *patt.str = '\0'; update_ranges(pc + 1, prog + pcap); new_line(); pc = prog - 1; } /* keep old pattern space, start new cycle */ static void old_next(void) { update_ranges(pc + 1, prog + pcap); pc = prog - 1; } int main(int argc, char *argv[]) { char *arg; int ret = 0, script = 0; ARGBEGIN { case 'n': gflags.n = 1; break; case 'r': case 'E': gflags.E = 1; break; case 'e': arg = EARGF(usage()); compile(arg, 0); script = 1; break; case 'f': arg = EARGF(usage()); compile(arg, 1); script = 1; break; default : usage(); } ARGEND /* no script to run */ if (!script && !argc) usage(); /* no script yet, next argument is script */ if (!script) compile(*argv++, 0); /* shrink/grow memory to fit and add our last instruction */ resize((void **)&prog, &pcap, sizeof(*prog), pc - prog + 1, NULL); pc = prog + pcap - 1; pc->fninfo = &(Fninfo){ cmd_last, NULL, NULL, 0 }; files = argv; run(); ret |= fshut(stdin, "") | fshut(stdout, ""); return ret; }