sbase/sed.c

1731 lines
40 KiB
C
Raw Normal View History

/* FIXME: summary
* decide whether we enforce valid UTF-8, right now it's enforced in certain
* parts of the script, but not the input...
* nul bytes cause explosions due to use of libc string functions. thoughts?
* lack of newline at end of file, currently we add one. what should we do?
* allow "\\t" for "\t" etc. in regex? in replacement text?
* POSIX says don't flush on N when out of input, but GNU and busybox do.
*/
#include <ctype.h>
#include <errno.h>
#include <regex.h>
#include <stdlib.h>
#include <string.h>
#include "utf.h"
#include "util.h"
/* Types */
/* used as queue for writes and stack for {,:,b,t */
typedef struct {
void **data;
size_t size;
size_t cap;
} Vec;
/* used for arbitrary growth, str is a C string
* FIXME: does it make sense to keep track of length? or just rely on libc
* string functions? If we want to support nul bytes everything changes
*/
typedef struct {
char *str;
size_t cap;
} String;
typedef struct Cmd Cmd;
typedef struct {
void (*fn)(Cmd *);
char *(*getarg)(Cmd *, char *);
void (*freearg)(Cmd *);
unsigned char naddr;
} Fninfo;
typedef struct {
union {
size_t lineno;
regex_t *re;
} u;
enum {
IGNORE, /* empty address, ignore */
EVERY , /* every line */
LINE , /* ilne number */
LAST , /* last line ($) */
REGEX , /* use included regex */
LASTRE, /* use most recently used regex */
} type;
} Addr;
/* DISCUSS: naddr is not strictly necessary, but very helpful
* naddr == 0 iff beg.type == EVERY && end.type == IGNORE
* naddr == 1 iff beg.type != IGNORE && end.type == IGNORE
* naddr == 2 iff beg.type != IGNORE && end.type != IGNORE
*/
typedef struct {
Addr beg;
Addr end;
unsigned char naddr;
} Range;
typedef struct {
regex_t *re; /* if NULL use last regex */
String repl;
FILE *file;
size_t occurrence; /* 0 for all (g flag) */
Rune delim;
unsigned int p:1;
} Sarg;
typedef struct {
Rune *set1;
Rune *set2;
} Yarg;
typedef struct {
String str; /* a,c,i text. r file path */
void (*print)(char *, FILE *); /* check_puts for a, write_file for r, unused for c,i */
} ACIRarg;
struct Cmd {
Range range;
Fninfo *fninfo;
union {
Cmd *jump; /* used for b,t when running */
char *label; /* used for :,b,t when building */
ptrdiff_t offset; /* used for { (pointers break during realloc) */
FILE *file; /* used for w */
/* FIXME: Should the following be in the union? or pointers and malloc? */
Sarg s;
Yarg y;
ACIRarg acir;
} u; /* I find your lack of anonymous unions disturbing */
unsigned int in_match:1;
unsigned int negate :1;
};
/* Files for w command (and s' w flag) */
typedef struct {
char *path;
FILE *file;
} Wfile;
/*
* Function Declarations
*/
/* Dynamically allocated arrays and strings */
static void resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next);
static void *pop(Vec *v);
static void push(Vec *v, void *p);
static void stracat(String *dst, char *src);
static void strnacat(String *dst, char *src, size_t n);
static void stracpy(String *dst, char *src);
/* Cleanup and errors */
static void usage(void);
/* Parsing functions and related utilities */
static void compile(char *s, int isfile);
static int read_line(FILE *f, String *s);
static char *make_range(Range *range, char *s);
static char *make_addr(Addr *addr, char *s);
static char *find_delim(char *s, Rune delim, int do_brackets);
static char *chompr(char *s, Rune rune);
static char *chomp(char *s);
static Rune *strtorunes(char *s, size_t nrunes);
static long stol(char *s, char **endp);
static size_t escapes(char *beg, char *end, Rune delim, int n_newline);
static size_t echarntorune(Rune *r, char *s, size_t n);
static void insert_labels(void);
/* Get and Free arg and related utilities */
static char *get_aci_arg(Cmd *c, char *s);
static void aci_append(Cmd *c, char *s);
static void free_acir_arg(Cmd *c);
static char *get_bt_arg(Cmd *c, char *s);
static char *get_r_arg(Cmd *c, char *s);
static char *get_s_arg(Cmd *c, char *s);
static void free_s_arg(Cmd *c);
static char *get_w_arg(Cmd *c, char *s);
static char *get_y_arg(Cmd *c, char *s);
static void free_y_arg(Cmd *c);
static char *get_colon_arg(Cmd *c, char *s);
static char *get_lbrace_arg(Cmd *c, char *s);
static char *get_rbrace_arg(Cmd *c, char *s);
static char *semicolon_arg(char *s);
/* Running */
static void run(void);
static int in_range(Cmd *c);
static int match_addr(Addr *a);
static int next_file(void);
static int is_eof(FILE *f);
static void do_writes(void);
static void write_file(char *path, FILE *out);
static void check_puts(char *s, FILE *f);
static void update_ranges(Cmd *beg, Cmd *end);
/* Sed functions */
static void cmd_y(Cmd *c);
static void cmd_x(Cmd *c);
static void cmd_w(Cmd *c);
static void cmd_t(Cmd *c);
static void cmd_s(Cmd *c);
static void cmd_r(Cmd *c);
static void cmd_q(Cmd *c);
static void cmd_P(Cmd *c);
static void cmd_p(Cmd *c);
static void cmd_N(Cmd *c);
static void cmd_n(Cmd *c);
static void cmd_l(Cmd *c);
static void cmd_i(Cmd *c);
static void cmd_H(Cmd *c);
static void cmd_h(Cmd *c);
static void cmd_G(Cmd *c);
static void cmd_g(Cmd *c);
static void cmd_D(Cmd *c);
static void cmd_d(Cmd *c);
static void cmd_c(Cmd *c);
static void cmd_b(Cmd *c);
static void cmd_a(Cmd *c);
static void cmd_colon(Cmd *c);
static void cmd_equal(Cmd *c);
static void cmd_lbrace(Cmd *c);
static void cmd_rbrace(Cmd *c);
static void cmd_last(Cmd *c);
/* Actions */
static void new_line(void);
static void app_line(void);
static void new_next(void);
static void old_next(void);
/*
* Globals
*/
static Vec braces, labels, branches; /* holds ptrdiff_t. addrs of {, :, bt */
static Vec writes; /* holds cmd*. writes scheduled by a and r commands */
static Vec wfiles; /* holds Wfile*. files for w and s///w commands */
static Cmd *prog, *pc; /* Program, program counter */
static size_t pcap;
static size_t lineno;
static regex_t *lastre; /* last used regex for empty regex search */
static char **files; /* list of file names from argv */
static FILE *file; /* current file we are reading */
static String patt, hold, genbuf;
static struct {
unsigned int n :1; /* -n (no print) */
unsigned int E :1; /* -E (extended re) */
unsigned int s :1; /* s/// replacement happened */
unsigned int aci_cont:1; /* a,c,i text continuation */
unsigned int s_cont :1; /* s/// replacement text continuation */
unsigned int halt :1; /* halt execution */
} gflags;
/* FIXME: move character inside Fninfo and only use 26*sizeof(Fninfo) instead of 127*sizeof(Fninfo) bytes */
static Fninfo fns[] = {
['a'] = { cmd_a , get_aci_arg , free_acir_arg , 1 }, /* schedule write of text for later */
['b'] = { cmd_b , get_bt_arg , NULL , 2 }, /* branch to label char *label when building, Cmd *jump when running */
['c'] = { cmd_c , get_aci_arg , free_acir_arg , 2 }, /* delete pattern space. at 0 or 1 addr or end of 2 addr, write text */
['d'] = { cmd_d , NULL , NULL , 2 }, /* delete pattern space */
['D'] = { cmd_D , NULL , NULL , 2 }, /* delete to first newline and start new cycle without reading (if no newline, d) */
['g'] = { cmd_g , NULL , NULL , 2 }, /* replace pattern space with hold space */
['G'] = { cmd_G , NULL , NULL , 2 }, /* append newline and hold space to pattern space */
['h'] = { cmd_h , NULL , NULL , 2 }, /* replace hold space with pattern space */
['H'] = { cmd_H , NULL , NULL , 2 }, /* append newline and pattern space to hold space */
['i'] = { cmd_i , get_aci_arg , free_acir_arg , 1 }, /* write text */
['l'] = { cmd_l , NULL , NULL , 2 }, /* write pattern space in 'visually unambiguous form' */
['n'] = { cmd_n , NULL , NULL , 2 }, /* write pattern space (unless -n) read to replace pattern space (if no input, quit) */
['N'] = { cmd_N , NULL , NULL , 2 }, /* append to pattern space separated by newline, line number changes (if no input, quit) */
['p'] = { cmd_p , NULL , NULL , 2 }, /* write pattern space */
['P'] = { cmd_P , NULL , NULL , 2 }, /* write pattern space up to first newline */
['q'] = { cmd_q , NULL , NULL , 1 }, /* quit */
['r'] = { cmd_r , get_r_arg , free_acir_arg , 1 }, /* write contents of file (unable to open/read treated as empty file) */
['s'] = { cmd_s , get_s_arg , free_s_arg , 2 }, /* find/replace/all that crazy s stuff */
['t'] = { cmd_t , get_bt_arg , NULL , 2 }, /* if s/// succeeded (since input or last t) branch to label (branch to end if no label) */
['w'] = { cmd_w , get_w_arg , NULL , 2 }, /* append pattern space to file */
['x'] = { cmd_x , NULL , NULL , 2 }, /* exchange pattern and hold spaces */
['y'] = { cmd_y , get_y_arg , free_y_arg , 2 }, /* replace runes in set1 with runes in set2 */
[':'] = { cmd_colon , get_colon_arg , NULL , 0 }, /* defines label for later b and t commands */
['='] = { cmd_equal , NULL , NULL , 1 }, /* printf("%d\n", line_number); */
['{'] = { cmd_lbrace, get_lbrace_arg, NULL , 2 }, /* if we match, run commands, otherwise jump to close */
['}'] = { cmd_rbrace, get_rbrace_arg, NULL , 0 }, /* noop, hold onto open for ease of building scripts */
[0x7f] = { NULL, NULL, NULL, 0 }, /* index is checked with isascii(3p). fill out rest of array */
};
/*
* Function Definitions
*/
/* given memory pointed to by *ptr that currently holds *nmemb members of size
* size, realloc to hold new_nmemb members, return new_nmemb in *memb and one
* past old end in *next. if realloc fails...explode
*/
static void
resize(void **ptr, size_t *nmemb, size_t size, size_t new_nmemb, void **next)
{
void *n, *tmp;
if (new_nmemb) {
tmp = ereallocarray(*ptr, new_nmemb, size);
} else { /* turns out realloc(*ptr, 0) != free(*ptr) */
free(*ptr);
tmp = NULL;
}
n = (char *)tmp + *nmemb * size;
*nmemb = new_nmemb;
*ptr = tmp;
if (next)
*next = n;
}
static void *
pop(Vec *v)
{
if (!v->size)
return NULL;
return v->data[--v->size];
}
static void
push(Vec *v, void *p)
{
if (v->size == v->cap)
resize((void **)&v->data, &v->cap, sizeof(*v->data), v->cap * 2 + 1, NULL);
v->data[v->size++] = p;
}
static void
stracat(String *dst, char *src)
{
int new = !dst->cap;
size_t len;
len = (new ? 0 : strlen(dst->str)) + strlen(src) + 1;
if (dst->cap < len)
resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
if (new)
*dst->str = '\0';
strcat(dst->str, src);
}
static void
strnacat(String *dst, char *src, size_t n)
{
int new = !dst->cap;
size_t len;
len = strlen(src);
len = (new ? 0 : strlen(dst->str)) + MIN(n, len) + 1;
if (dst->cap < len)
resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
if (new)
*dst->str = '\0';
strlcat(dst->str, src, len);
}
static void
stracpy(String *dst, char *src)
{
size_t len;
len = strlen(src) + 1;
if (dst->cap < len)
resize((void **)&dst->str, &dst->cap, 1, len * 2, NULL);
strcpy(dst->str, src);
}
static void
leprintf(char *s)
{
if (errno)
eprintf("%zu: %s: %s\n", lineno, s, strerror(errno));
else
eprintf("%zu: %s\n", lineno, s);
}
/* FIXME: write usage message */
static void
usage(void)
{
eprintf("USAGE\n");
}
/* Differences from POSIX
* we allows semicolons and trailing blanks inside {}
* we allow spaces after ! (and in between !s)
* we allow extended regular expressions (-E)
*/
static void
compile(char *s, int isfile)
{
FILE *f;
if (!isfile && !*s) /* empty string script */
return;
f = isfile ? fopen(s, "r") : fmemopen(s, strlen(s), "r");
if (!f)
eprintf("fopen/fmemopen:");
/* NOTE: get arg functions can't use genbuf */
while (read_line(f, &genbuf) != EOF) {
s = genbuf.str;
/* if the first two characters of the script are "#n" default output shall be suppressed */
if (++lineno == 1 && *s == '#' && s[1] == 'n') {
gflags.n = 1;
continue;
}
if (gflags.aci_cont) {
aci_append(pc - 1, s);
continue;
}
if (gflags.s_cont)
s = (pc - 1)->fninfo->getarg(pc - 1, s);
while (*s) {
s = chompr(s, ';');
if (!*s || *s == '#')
break;
if ((size_t)(pc - prog) == pcap)
resize((void **)&prog, &pcap, sizeof(*prog), pcap * 2 + 1, (void **)&pc);
pc->range.beg.type = pc->range.end.type = IGNORE;
pc->fninfo = NULL;
pc->in_match = 0;
s = make_range(&pc->range, s);
s = chomp(s);
pc->negate = *s == '!';
s = chompr(s, '!');
if (!isascii(*s) || !(pc->fninfo = &fns[(unsigned)*s])->fn)
leprintf("bad sed function");
if (pc->range.naddr > pc->fninfo->naddr)
leprintf("wrong number of addresses");
s++;
if (pc->fninfo->getarg)
s = pc->fninfo->getarg(pc, s);
pc++;
}
}
Add *fshut() functions to properly flush file streams This has been a known issue for a long time. Example: printf "word" > /dev/full wouldn't report there's not enough space on the device. This is due to the fact that every libc has internal buffers for stdout which store fragments of written data until they reach a certain size or on some callback to flush them all at once to the kernel. You can force the libc to flush them with fflush(). In case flushing fails, you can check the return value of fflush() and report an error. However, previously, sbase didn't have such checks and without fflush(), the libc silently flushes the buffers on exit without checking the errors. No offense, but there's no way for the libc to report errors in the exit- condition. GNU coreutils solve this by having onexit-callbacks to handle the flushing and report issues, but they have obvious deficiencies. After long discussions on IRC, we came to the conclusion that checking the return value of every io-function would be a bit too much, and having a general-purpose fclose-wrapper would be the best way to go. It turned out that fclose() alone is not enough to detect errors. The right way to do it is to fflush() + check ferror on the fp and then to a fclose(). This is what fshut does and that's how it's done before each return. The return value is obviously affected, reporting an error in case a flush or close failed, but also when reading failed for some reason, the error- state is caught. the !!( ... + ...) construction is used to call all functions inside the brackets and not "terminating" on the first. We want errors to be reported, but there's no reason to stop flushing buffers when one other file buffer has issues. Obviously, functionales come before the flush and ret-logic comes after to prevent early exits as well without reporting warnings if there are any. One more advantage of fshut() is that it is even able to report errors on obscure NFS-setups which the other coreutils are unable to detect, because they only check the return-value of fflush() and fclose(), not ferror() as well.
2015-04-04 15:25:17 -04:00
fshut(f, s);
}
/* FIXME: if we decide to honor lack of trailing newline, set/clear a global
* flag when reading a line
*/
static int
read_line(FILE *f, String *s)
{
ssize_t len;
if (!f)
return EOF;
if ((len = getline(&s->str, &s->cap, f)) < 0) {
if (ferror(f))
eprintf("getline:");
return EOF;
}
if (s->str[--len] == '\n')
s->str[len] = '\0';
return 0;
}
/* read first range from s, return pointer to one past end of range */
static char *
make_range(Range *range, char *s)
{
s = make_addr(&range->beg, s);
if (*s == ',')
s = make_addr(&range->end, s + 1);
else
range->end.type = IGNORE;
if (range->beg.type == EVERY && range->end.type == IGNORE) range->naddr = 0;
else if (range->beg.type != IGNORE && range->end.type == IGNORE) range->naddr = 1;
else if (range->beg.type != IGNORE && range->end.type != IGNORE) range->naddr = 2;
else leprintf("this is impossible...");
return s;
}
/* read first addr from s, return pointer to one past end of addr */
static char *
make_addr(Addr *addr, char *s)
{
Rune r;
char *p = s + strlen(s);
size_t rlen = echarntorune(&r, s, p - s);
if (r == '$') {
addr->type = LAST;
s += rlen;
} else if (isdigitrune(r)) {
addr->type = LINE;
addr->u.lineno = stol(s, &s);
} else if (r == '/' || r == '\\') {
Rune delim;
if (r == '\\') {
s += rlen;
rlen = echarntorune(&r, s, p - s);
}
if (r == '\\')
leprintf("bad delimiter '\\'");
delim = r;
s += rlen;
rlen = echarntorune(&r, s, p - s);
if (r == delim) {
addr->type = LASTRE;
s += rlen;
} else {
addr->type = REGEX;
p = find_delim(s, delim, 1);
if (!*p)
leprintf("unclosed regex");
p -= escapes(s, p, delim, 0);
*p++ = '\0';
addr->u.re = emalloc(sizeof(*addr->u.re));
eregcomp(addr->u.re, s, gflags.E ? REG_EXTENDED : 0);
s = p;
}
} else {
addr->type = EVERY;
}
return s;
}
/* return pointer to first delim in s that is not escaped
* and if do_brackets is set, not in [] (note possible [::], [..], [==], inside [])
* return pointer to trailing nul byte if no delim found
*
* any escaped character that is not special is just itself (POSIX undefined)
* FIXME: pull out into some util thing, will be useful for ed as well
*/
static char *
find_delim(char *s, Rune delim, int do_brackets)
{
enum {
OUTSIDE , /* not in brackets */
BRACKETS_OPENING, /* last char was first [ or last two were first [^ */
BRACKETS_INSIDE , /* inside [] */
INSIDE_OPENING , /* inside [] and last char was [ */
CLASS_INSIDE , /* inside class [::], or colating element [..] or [==], inside [] */
CLASS_CLOSING , /* inside class [::], or colating element [..] or [==], and last character was the respective : . or = */
} state = OUTSIDE;
Rune r, c = 0; /* no c won't be used uninitialized, shutup -Wall */
size_t rlen;
int escape = 0;
char *end = s + strlen(s);
for (; *s; s += rlen) {
rlen = echarntorune(&r, s, end - s);
if (state == BRACKETS_OPENING && r == '^' ) { continue; }
else if (state == BRACKETS_OPENING && r == ']' ) { state = BRACKETS_INSIDE ; continue; }
else if (state == BRACKETS_OPENING ) { state = BRACKETS_INSIDE ; }
if (state == CLASS_CLOSING && r == ']' ) { state = BRACKETS_INSIDE ; }
else if (state == CLASS_CLOSING ) { state = CLASS_INSIDE ; }
else if (state == CLASS_INSIDE && r == c ) { state = CLASS_CLOSING ; }
else if (state == INSIDE_OPENING && (r == ':' ||
r == '.' ||
r == '=') ) { state = CLASS_INSIDE ; c = r; }
else if (state == INSIDE_OPENING && r == ']' ) { state = OUTSIDE ; }
else if (state == BRACKETS_INSIDE && r == '[' ) { state = INSIDE_OPENING ; }
else if (state == BRACKETS_INSIDE && r == ']' ) { state = OUTSIDE ; }
else if (state == OUTSIDE && escape ) { escape = 0 ; }
else if (state == OUTSIDE && r == '\\' ) { escape = 1 ; }
else if (state == OUTSIDE && do_brackets && r == '[' ) { state = BRACKETS_OPENING; }
else if (state == OUTSIDE && r == delim) return s;
}
return s;
}
static char *
chomp(char *s)
{
return chompr(s, 0);
}
/* eat all leading whitespace and occurrences of rune */
static char *
chompr(char *s, Rune rune)
{
Rune r;
size_t rlen;
char *end = s + strlen(s);
while (*s && (rlen = echarntorune(&r, s, end - s)) && (isspacerune(r) || r == rune))
s += rlen;
return s;
}
/* convert first nrunes Runes from UTF-8 string s in allocated Rune*
* NOTE: sequence must be valid UTF-8, check first */
static Rune *
strtorunes(char *s, size_t nrunes)
{
Rune *rs, *rp;
rp = rs = ereallocarray(NULL, nrunes + 1, sizeof(*rs));
while (nrunes--)
s += chartorune(rp++, s);
*rp = '\0';
return rs;
}
static long
stol(char *s, char **endp)
{
long n;
errno = 0;
n = strtol(s, endp, 10);
if (errno)
leprintf("strtol:");
if (*endp == s)
leprintf("strtol: invalid number");
return n;
}
/* from beg to end replace "\\d" with "d" and "\\n" with "\n" (where d is delim)
* if delim is 'n' and n_newline is 0 then "\\n" is replaced with "n" (normal)
* if delim is 'n' and n_newline is 1 then "\\n" is replaced with "\n" (y command)
* if delim is 0 all escaped characters represent themselves (aci text)
* memmove rest of string (beyond end) into place
* return the number of converted escapes (backslashes removed)
* FIXME: this has had too many corner cases slapped on and is ugly. rewrite better
*/
static size_t
escapes(char *beg, char *end, Rune delim, int n_newline)
{
size_t num = 0;
char *src = beg, *dst = beg;
while (src < end) {
/* handle escaped backslash specially so we don't think the second
* backslash is escaping something */
if (*src == '\\' && src[1] == '\\') {
*dst++ = *src++;
if (delim)
*dst++ = *src++;
else
src++;
} else if (*src == '\\' && !delim) {
src++;
} else if (*src == '\\' && src[1]) {
Rune r;
size_t rlen;
num++;
src++;
rlen = echarntorune(&r, src, end - src);
if (r == 'n' && delim == 'n') {
*src = n_newline ? '\n' : 'n'; /* src so we can still memmove() */
} else if (r == 'n') {
*src = '\n';
} else if (r != delim) {
*dst++ = '\\';
num--;
}
memmove(dst, src, rlen);
dst += rlen;
src += rlen;
} else {
*dst++ = *src++;
}
}
memmove(dst, src, strlen(src) + 1);
return num;
}
static size_t
echarntorune(Rune *r, char *s, size_t n)
{
size_t rlen = charntorune(r, s, n);
if (!rlen || *r == Runeerror)
leprintf("invalid UTF-8");
return rlen;
}
static void
insert_labels(void)
{
size_t i;
Cmd *from, *to;
while (branches.size) {
from = prog + (ptrdiff_t)pop(&branches);
if (!from->u.label) {/* no label branch to end of script */
from->u.jump = pc - 1;
} else {
for (i = 0; i < labels.size; i++) {
to = prog + (ptrdiff_t)labels.data[i];
if (!strcmp(from->u.label, to->u.label)) {
from->u.jump = to;
break;
}
}
if (i == labels.size)
leprintf("bad label");
}
}
}
/*
* Getargs / Freeargs
* Read argument from s, return pointer to one past last character of argument
*/
/* POSIX compliant
* i\
* foobar
*
* also allow the following non POSIX compliant
* i # empty line
* ifoobar
* ifoobar\
* baz
*
* FIXME: GNU and busybox discard leading spaces
* i foobar
* i foobar
* ifoobar
* are equivalent in GNU and busybox. We don't. Should we?
*/
static char *
get_aci_arg(Cmd *c, char *s)
{
c->u.acir.print = check_puts;
c->u.acir.str = (String){ NULL, 0 };
gflags.aci_cont = !!*s; /* no continue flag if empty string */
/* neither empty string nor POSIX compliant */
if (*s && !(*s == '\\' && !s[1]))
aci_append(c, s);
return s + strlen(s);
}
static void
aci_append(Cmd *c, char *s)
{
char *end = s + strlen(s), *p = end;
gflags.aci_cont = 0;
while (--p >= s && *p == '\\')
gflags.aci_cont = !gflags.aci_cont;
if (gflags.aci_cont)
*--end = '\n';
escapes(s, end, 0, 0);
stracat(&c->u.acir.str, s);
}
static void
free_acir_arg(Cmd *c)
{
free(c->u.acir.str.str);
}
/* POSIX dictates that label is rest of line, including semicolons, trailing
* whitespace, closing braces, etc. and can be limited to 8 bytes
*
* I allow a semicolon or closing brace to terminate a label name, it's not
* POSIX compliant, but it's useful and every sed version I've tried to date
* does the same.
*
* FIXME: POSIX dictates that leading whitespace is ignored but trailing
* whitespace is not. This is annoying and we should probably get rid of it.
*/
static char *
get_bt_arg(Cmd *c, char *s)
{
char *p = semicolon_arg(s = chomp(s));
if (p != s) {
c->u.label = estrndup(s, p - s);
} else {
c->u.label = NULL;
}
push(&branches, (void *)(c - prog));
return p;
}
/* POSIX dictates file name is rest of line including semicolons, trailing
* whitespace, closing braces, etc. and file name must be preceded by a space
*
* I allow a semicolon or closing brace to terminate a file name and don't
* enforce leading space.
*
* FIXME: decide whether trailing whitespace should be included and fix
* accordingly
*/
static char *
get_r_arg(Cmd *c, char *s)
{
char *p = semicolon_arg(s = chomp(s));
if (p == s)
leprintf("no file name");
c->u.acir.str.str = estrndup(s, p - s);
c->u.acir.print = write_file;
return p;
}
/* we allow "\\n" in replacement text to mean "\n" (undefined in POSIX)
*
* FIXME: allow other escapes in regex and replacement? if so change escapes()
*/
static char *
get_s_arg(Cmd *c, char *s)
{
Rune delim, r;
Cmd buf;
char *p;
int esc, lastre;
/* s/Find/Replace/Flags */
/* Find */
if (!gflags.s_cont) { /* NOT continuing from literal newline in replacement text */
lastre = 0;
c->u.s.repl = (String){ NULL, 0 };
c->u.s.occurrence = 1;
c->u.s.file = NULL;
c->u.s.p = 0;
if (!*s || *s == '\\')
leprintf("bad delimiter");
p = s + strlen(s);
s += echarntorune(&delim, s, p - s);
c->u.s.delim = delim;
echarntorune(&r, s, p - s);
if (r == delim) /* empty regex */
lastre = 1;
p = find_delim(s, delim, 1);
if (!*p)
leprintf("missing second delimiter");
p -= escapes(s, p, delim, 0);
*p = '\0';
if (lastre) {
c->u.s.re = NULL;
} else {
c->u.s.re = emalloc(sizeof(*c->u.s.re));
/* FIXME: different eregcomp that calls fatal */
eregcomp(c->u.s.re, s, gflags.E ? REG_EXTENDED : 0);
}
s = p + runelen(delim);
}
/* Replace */
delim = c->u.s.delim;
p = find_delim(s, delim, 0);
p -= escapes(s, p, delim, 0);
if (!*p) { /* no third delimiter */
/* FIXME: same backslash counting as aci_append() */
if (p[-1] != '\\')
leprintf("missing third delimiter or <backslash><newline>");
p[-1] = '\n';
gflags.s_cont = 1;
} else {
gflags.s_cont = 0;
}
/* check for bad references in replacement text */
*p = '\0';
for (esc = 0, p = s; *p; p++) {
if (esc) {
esc = 0;
if (isdigit(*p) && c->u.s.re && (size_t)(*p - '0') > c->u.s.re->re_nsub)
leprintf("back reference number greater than number of groups");
} else if (*p == '\\') {
esc = 1;
}
}
stracat(&c->u.s.repl, s);
if (gflags.s_cont)
return p;
s = p + runelen(delim);
/* Flags */
p = semicolon_arg(s = chomp(s));
/* FIXME: currently for simplicity take last of g or occurrence flags and
* ignore multiple p flags. need to fix that */
for (; s < p; s++) {
if (isdigit(*s)) {
c->u.s.occurrence = stol(s, &s);
} else {
switch (*s) {
case 'g': c->u.s.occurrence = 0; break;
case 'p': c->u.s.p = 1; break;
case 'w':
/* must be last flag, take everything up to newline/semicolon
* s == p after this */
s = get_w_arg(&buf, s);
c->u.s.file = buf.u.file;
break;
}
}
}
return p;
}