0
0
mirror of https://github.com/netwide-assembler/nasm.git synced 2025-09-22 10:43:39 -04:00

qstring: first cut at full quoted string support in the preprocessor

First attempt at properly handling quoted strings in the preprocessor.
This also adds range support in %substr.

No support in the assembler yet.
This commit is contained in:
H. Peter Anvin
2008-06-01 17:23:51 -07:00
parent 7f2f8b35e6
commit 8cad14bbcf
6 changed files with 545 additions and 267 deletions

View File

@@ -65,7 +65,7 @@ NASM = nasm.$(O) nasmlib.$(O) float.$(O) insnsa.$(O) insnsb.$(O) \
output/outelf32.$(O) output/outelf64.$(O) \
output/outobj.$(O) output/outas86.$(O) output/outrdf2.$(O) \
output/outdbg.$(O) output/outieee.$(O) output/outmacho.$(O) \
preproc.$(O) pptok.$(O) macros.$(O) \
preproc.$(O) quote.$(O) pptok.$(O) macros.$(O) \
listing.$(O) eval.$(O) exprlib.$(O) stdscan.$(O) tokhash.$(O) \
regvals.$(O) regflags.$(O)

225
nasmlib.c
View File

@@ -930,231 +930,6 @@ int src_get(int32_t *xline, char **xname)
return 0;
}
/*
 * Wrap the heap string *str in quote characters, replacing *str with a
 * newly allocated copy.  A string that already starts and ends with the
 * same ' or " character is left alone.  Double quotes are used unless
 * the text itself contains a double quote, in which case single quotes
 * are used instead.
 *
 * XXX: This is broken for strings which contain multiple quotes...
 * NASM doesn't have a sane syntax for dealing with those currently.
 */
void nasm_quote(char **str)
{
    char *src = *str;
    int len = strlen(src);
    char first = src[0];
    char delim;
    char *quoted;

    /* Already enclosed in a matching pair of quotes: nothing to do */
    if (len > 1 && (first == '"' || first == '\'') && src[len - 1] == first)
        return;

    delim = strchr(src, '"') ? '\'' : '"';

    /* Room for the text, two delimiters and the terminating null */
    quoted = nasm_malloc(len + 3);
    quoted[0] = delim;
    memcpy(quoted + 1, src, len);
    quoted[len + 1] = delim;
    quoted[len + 2] = 0;

    nasm_free(src);
    *str = quoted;
}
/*
 * Store the code point v at q using the original (up to six byte) UTF-8
 * encoding and return the pointer just past the last byte written.
 * Negative values produce no output.
 */
static char *emit_utf8(char *q, int32_t v)
{
    int ntrail;                 /* Number of continuation bytes */
    int lead;                   /* Marker bits of the leading byte */

    if (v < 0)
        return q;               /* Impossible - do nothing */

    if (v <= 0x7f) {
        *q++ = v;
        return q;
    }

    if (v <= 0x000007ff) {
        ntrail = 1;
        lead = 0xc0;
    } else if (v <= 0x0000ffff) {
        ntrail = 2;
        lead = 0xe0;
    } else if (v <= 0x001fffff) {
        ntrail = 3;
        lead = 0xf0;
    } else if (v <= 0x03ffffff) {
        ntrail = 4;
        lead = 0xf8;
    } else {
        ntrail = 5;
        lead = 0xfc;
    }

    /* Leading byte carries the top bits, then six bits per trailer */
    *q++ = lead | (v >> (6 * ntrail));
    while (ntrail-- > 0)
        *q++ = 0x80 | ((v >> (6 * ntrail)) & 63);

    return q;
}
/*
 * Do an *in-place* dequoting of the specified string, returning the
 * resulting length (the result may contain embedded nulls.)
 *
 * In-place replacement is possible since the unquoted length is always
 * shorter than or equal to the quoted length.
 *
 * '...' and "..." strings are taken literally: only the surrounding
 * quotes are removed.  `...` strings additionally have their backslash
 * escapes decoded.  Anything else, including a lone quote character,
 * is left untouched and its length is returned.
 */
size_t nasm_unquote(char *str)
{
    size_t ln;
    char bq, eq;
    char *p, *q, *ep;
    char *escp = NULL;
    char c;
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs
    } state;
    int ndig = 0;
    uint32_t nval = 0;          /* Unsigned: \U may shift into bit 31 */

    bq = str[0];
    if (!bq)
        return 0;
    ln = strlen(str);
    eq = str[ln-1];

    /* ln >= 2: a lone quote is not a quoted string (and ln-2 would wrap) */
    if (ln >= 2 && (bq == '\'' || bq == '\"') && bq == eq) {
        /* '...' or "..." string */
        memmove(str, str+1, ln-2);
        str[ln-2] = '\0';
        return ln-2;
    }

    /*
     * NOTE(review): a string that merely *ends* in a backquote is also
     * processed here; this is kept as-is for compatibility.
     */
    if (bq == '`' || eq == '`') {
        /* `...` string */
        q = str;                /* Write pointer, always behind p */
        p = str+1;              /* Read pointer */
        ep = str+ln-1;          /* The closing quote */
        state = st_start;

        while (p < ep) {
            c = *p++;
            switch (state) {
            case st_start:
                if (c == '\\')
                    state = st_backslash;
                else
                    *q++ = c;
                break;

            case st_backslash:
                state = st_start;
                escp = p-1;     /* The character following the backslash */
                switch (c) {
                case 'a':
                    *q++ = 7;
                    break;
                case 'b':
                    *q++ = 8;
                    break;
                case 'e':
                    *q++ = 27;
                    break;
                case 'f':
                    *q++ = 12;
                    break;
                case 'n':
                    *q++ = 10;
                    break;
                case 'r':
                    *q++ = 13;
                    break;
                case 't':
                    *q++ = 9;
                    break;
                case 'u':
                    state = st_ucs;
                    ndig = 4;
                    nval = 0;
                    break;
                case 'U':
                    state = st_ucs;
                    ndig = 8;
                    nval = 0;
                    break;
                case 'v':
                    *q++ = 11;
                    break;      /* Must not fall into the \x handling */
                case 'x':
                case 'X':
                    state = st_hex;
                    ndig = 0;
                    nval = 0;
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    state = st_oct;
                    ndig = 1;
                    nval = c - '0';
                    break;
                default:
                    *q++ = c;
                    break;
                }
                break;

            case st_oct:
                if (c >= '0' && c <= '7') {
                    nval = (nval << 3) + (c - '0');
                    if (++ndig >= 3) {
                        *q++ = nval;
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    *q++ = nval;
                    state = st_start;
                }
                break;

            case st_hex:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    nval = (nval << 4) + numvalue(c);
                    if (++ndig >= 2) {
                        *q++ = nval;
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    /* "\x" without digits stands for the letter itself */
                    *q++ = ndig ? nval : *escp;
                    state = st_start;
                }
                break;

            case st_ucs:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    nval = (nval << 4) + numvalue(c);
                    if (!--ndig) {
                        q = emit_utf8(q, (int32_t)nval);
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    if (p > escp+1)
                        q = emit_utf8(q, (int32_t)nval);
                    else
                        *q++ = *escp;   /* No digits: keep the letter */
                    state = st_start;
                }
                break;
            }
        }

        /*
         * The closing quote may cut a numeric escape short (e.g. `\0`);
         * flush whatever was collected instead of dropping it.
         */
        switch (state) {
        case st_start:
        case st_backslash:
            break;
        case st_oct:
            *q++ = nval;
            break;
        case st_hex:
            *q++ = ndig ? nval : *escp;
            break;
        case st_ucs:
            if (p > escp+1)
                q = emit_utf8(q, (int32_t)nval);
            else
                *q++ = *escp;
            break;
        }

        *q = '\0';
        return q-str;
    }

    /* Otherwise, just return the input... */
    return ln;
}
char *nasm_strcat(char *one, char *two)
{
char *rslt;

View File

@@ -395,7 +395,6 @@ int32_t src_get_linnum(void);
*/
int src_get(int32_t *xline, char **xname);
void nasm_quote(char **str);
char *nasm_strcat(char *one, char *two);
void null_debug_routine(const char *directive, const char *params);

100
preproc.c
View File

@@ -48,6 +48,7 @@
#include "nasmlib.h"
#include "preproc.h"
#include "hashtbl.h"
#include "quote.h"
#include "stdscan.h"
#include "tokens.h"
#include "tables.h"
@@ -795,15 +796,12 @@ static Token *tokenize(char *line)
p++;
while (*p && isidchar(*p))
p++;
} else if (*p == '\'' || *p == '"') {
} else if (*p == '\'' || *p == '"' || *p == '`') {
/*
* A string token.
*/
char c = *p;
p++;
type = TOK_STRING;
while (*p && *p != c)
p++;
p = nasm_skip_string(p);
if (*p) {
p++;
@@ -1514,6 +1512,7 @@ static bool if_condition(Token * tline, enum preproc_token ct)
break;
}
/* Unify surrounding quotes for strings */
/* XXX: this doesn't work anymore */
if (t->type == TOK_STRING) {
tt->text[0] = t->text[0];
tt->text[strlen(tt->text) - 1] = t->text[0];
@@ -2079,11 +2078,9 @@ static int do_directive(Token * tline)
if (tline->next)
error(ERR_WARNING,
"trailing garbage after `%%depend' ignored");
if (tline->type != TOK_INTERNAL_STRING) {
p = tline->text + 1; /* point past the quote to the name */
p[strlen(p) - 1] = '\0'; /* remove the trailing quote */
} else
p = tline->text; /* internal_string is easier */
p = tline->text;
if (tline->type != TOK_INTERNAL_STRING)
nasm_unquote(p);
if (dephead && !in_list(*dephead, p)) {
StrList *sl = nasm_malloc(strlen(p)+1+sizeof sl->next);
sl->next = NULL;
@@ -2107,11 +2104,9 @@ static int do_directive(Token * tline)
if (tline->next)
error(ERR_WARNING,
"trailing garbage after `%%include' ignored");
if (tline->type != TOK_INTERNAL_STRING) {
p = tline->text + 1; /* point past the quote to the name */
p[strlen(p) - 1] = '\0'; /* remove the trailing quote */
} else
p = tline->text; /* internal_string is easier */
p = tline->text;
if (tline->type != TOK_INTERNAL_STRING)
nasm_unquote(p);
inc = nasm_malloc(sizeof(Include));
inc->next = istk;
inc->conds = NULL;
@@ -2186,14 +2181,14 @@ static int do_directive(Token * tline)
tline = tline->next;
skip_white_(tline);
if (tok_type_(tline, TOK_STRING)) {
p = tline->text + 1; /* point past the quote to the name */
p[strlen(p) - 1] = '\0'; /* remove the trailing quote */
expand_macros_in_string(&p);
p = tline->text;
nasm_unquote(p);
expand_macros_in_string(&p); /* WHY? */
error(ERR_NONFATAL, "%s", p);
nasm_free(p);
} else {
p = detoken(tline, false);
error(ERR_WARNING, "%s", p);
error(ERR_WARNING, "%s", p); /* WARNING!??!! */
nasm_free(p);
}
free_tlist(origline);
@@ -2670,11 +2665,9 @@ static int do_directive(Token * tline)
if (t->next)
error(ERR_WARNING,
"trailing garbage after `%%pathsearch' ignored");
if (t->type != TOK_INTERNAL_STRING) {
p = t->text + 1; /* point past the quote to the name */
p[strlen(p) - 1] = '\0'; /* remove the trailing quote */
} else
p = t->text; /* internal_string is easier */
p = tline->text;
if (tline->type != TOK_INTERNAL_STRING)
nasm_unquote(p);
fp = inc_fopen(p, &xsl, &xsl, true);
if (fp) {
@@ -2683,8 +2676,7 @@ static int do_directive(Token * tline)
}
macro_start = nasm_malloc(sizeof(*macro_start));
macro_start->next = NULL;
macro_start->text = nasm_strdup(p);
nasm_quote(&macro_start->text);
macro_start->text = nasm_quote(p, strlen(p));
macro_start->type = TOK_STRING;
macro_start->mac = NULL;
if (xsl)
@@ -2736,7 +2728,7 @@ static int do_directive(Token * tline)
macro_start = nasm_malloc(sizeof(*macro_start));
macro_start->next = NULL;
make_tok_num(macro_start, strlen(t->text) - 2);
make_tok_num(macro_start, nasm_unquote(t->text));
macro_start->mac = NULL;
/*
@@ -2750,6 +2742,10 @@ static int do_directive(Token * tline)
return DIRECTIVE_FOUND;
case PP_SUBSTR:
{
int64_t a1, a2;
size_t len;
casesense = true;
tline = tline->next;
@@ -2786,29 +2782,50 @@ static int do_directive(Token * tline)
tt = t->next;
tptr = &tt;
tokval.t_type = TOKEN_INVALID;
evalresult =
evaluate(ppscan, tptr, &tokval, NULL, pass, error, NULL);
evalresult = evaluate(ppscan, tptr, &tokval, NULL,
pass, error, NULL);
if (!evalresult) {
free_tlist(tline);
free_tlist(origline);
return DIRECTIVE_FOUND;
}
if (!is_simple(evalresult)) {
} else if (!is_simple(evalresult)) {
error(ERR_NONFATAL, "non-constant value given to `%%substr`");
free_tlist(tline);
free_tlist(origline);
return DIRECTIVE_FOUND;
}
a1 = evalresult->value-1;
while (tok_type_(tt, TOK_WHITESPACE))
tt = tt->next;
if (!tt) {
a2 = 1; /* Backwards compatibility: one character */
} else {
tokval.t_type = TOKEN_INVALID;
evalresult = evaluate(ppscan, tptr, &tokval, NULL,
pass, error, NULL);
if (!evalresult) {
free_tlist(tline);
free_tlist(origline);
return DIRECTIVE_FOUND;
} else if (!is_simple(evalresult)) {
error(ERR_NONFATAL, "non-constant value given to `%%substr`");
free_tlist(tline);
free_tlist(origline);
return DIRECTIVE_FOUND;
}
a2 = evalresult->value;
}
len = nasm_unquote(t->text);
if (a2 < 0)
a2 = a2+1+len-a1;
if (a1+a2 > (int64_t)len)
a2 = len-a1;
macro_start = nasm_malloc(sizeof(*macro_start));
macro_start->next = NULL;
macro_start->text = nasm_strdup("'''");
if (evalresult->value > 0
&& evalresult->value < (int) strlen(t->text) - 1) {
macro_start->text[1] = t->text[evalresult->value];
} else {
macro_start->text[2] = '\0';
}
macro_start->text = nasm_quote((a1 < 0) ? "" : t->text+a1, a2);
macro_start->type = TOK_STRING;
macro_start->mac = NULL;
@@ -2821,6 +2838,7 @@ static int do_directive(Token * tline)
free_tlist(tline);
free_tlist(origline);
return DIRECTIVE_FOUND;
}
case PP_ASSIGN:
case PP_IASSIGN:
@@ -3209,9 +3227,11 @@ again:
if (!m->expansion) {
if (!strcmp("__FILE__", m->name)) {
int32_t num = 0;
src_get(&num, &(tline->text));
nasm_quote(&(tline->text));
char *file;
src_get(&num, &file);
tline->text = nasm_quote(file, strlen(file));
tline->type = TOK_STRING;
nasm_free(file);
continue;
}
if (!strcmp("__LINE__", m->name)) {

473
quote.c Normal file
View File

@@ -0,0 +1,473 @@
/* quote.c library routines for the Netwide Assembler
*
* The Netwide Assembler is copyright (C) 1996 Simon Tatham and
* Julian Hall. All rights reserved. The software is
* redistributable under the license given in the file "LICENSE"
* distributed in the NASM archive.
*/
#include "compiler.h"
#include <assert.h>
#include <stdlib.h>
#include "nasmlib.h"
#include "quote.h"
#define numvalue(c) ((c)>='a' ? (c)-'a'+10 : (c)>='A' ? (c)-'A'+10 : (c)-'0')
/*
 * Produce a quoted representation of the len bytes starting at str
 * (which may contain embedded nulls.)
 *
 * If the text contains only printable ASCII and at most one of the two
 * quote characters, it is copied verbatim between '...' or "..." quotes.
 * Otherwise a `...` string with backslash escapes is generated.
 *
 * Returns a null-terminated buffer obtained from nasm_malloc(); the
 * caller owns it.
 *
 * Bytes are classified as unsigned char throughout: with a signed plain
 * char, bytes >= 0x80 would compare as negative and be sized as short
 * escapes while sprintf() emits the full four-character form.
 */
char *nasm_quote(char *str, size_t len)
{
    unsigned char c, c1;
    char *p, *q, *nstr, *ep;
    bool sq_ok, dq_ok;
    size_t qlen;

    sq_ok = dq_ok = true;
    ep = str+len;
    qlen = 0;                   /* Length if we need `...` quotes */

    /* Pass 1: decide which quote style works and size the `...` form */
    for (p = str; p < ep; p++) {
        c = (unsigned char)*p;
        switch (c) {
        case '\'':
            sq_ok = false;
            qlen++;
            break;
        case '\"':
            dq_ok = false;
            qlen++;
            break;
        case '`':
        case '\\':
            qlen += 2;
            break;
        default:
            if (c < ' ' || c > '~') {
                /* Non-printable: only the `...` form can express it */
                sq_ok = dq_ok = false;
                switch (c) {
                case '\a':
                case '\b':
                case '\t':
                case '\n':
                case '\v':
                case '\f':
                case '\r':
                case 27:
                    qlen += 2;
                    break;
                default:
                    c1 = (p+1 < ep) ? (unsigned char)p[1] : 0;
                    if (c > 077 || (c1 >= '0' && c1 <= '7'))
                        qlen += 4;      /* Must use the full form */
                    else if (c > 07)
                        qlen += 3;
                    else
                        qlen += 2;
                    break;
                }
            } else {
                qlen++;
            }
            break;
        }
    }

    if (sq_ok || dq_ok) {
        /* Use '...' or "..." */
        nstr = nasm_malloc(len+3);
        nstr[0] = nstr[len+1] = sq_ok ? '\'' : '\"';
        nstr[len+2] = '\0';
        memcpy(nstr+1, str, len);
    } else {
        /* Need to use `...` quoted syntax */
        nstr = nasm_malloc(qlen+3);
        q = nstr;
        *q++ = '`';

        /* Pass 2: emit; must agree byte for byte with the sizing above */
        for (p = str; p < ep; p++) {
            c = (unsigned char)*p;
            switch (c) {
            case '`':
            case '\\':
                *q++ = '\\';
                *q++ = c;
                break;
            case '\a':
                *q++ = '\\';
                *q++ = 'a';
                break;
            case '\b':
                *q++ = '\\';
                *q++ = 'b';
                break;
            case '\t':
                *q++ = '\\';
                *q++ = 't';
                break;
            case '\n':
                *q++ = '\\';
                *q++ = 'n';
                break;
            case '\v':
                *q++ = '\\';
                *q++ = 'v';
                break;
            case '\f':
                *q++ = '\\';
                *q++ = 'f';
                break;
            case '\r':
                *q++ = '\\';
                *q++ = 'r';
                break;
            case 27:
                *q++ = '\\';
                *q++ = 'e';
                break;
            default:
                if (c < ' ' || c > '~') {
                    c1 = (p+1 < ep) ? (unsigned char)p[1] : 0;
                    /*
                     * A following octal digit would be absorbed into
                     * a short escape, so pad to three digits then.
                     */
                    if (c1 >= '0' && c1 <= '7')
                        q += sprintf(q, "\\%03o", (unsigned int)c);
                    else
                        q += sprintf(q, "\\%o", (unsigned int)c);
                } else {
                    *q++ = c;
                }
                break;
            }
        }
        *q++ = '`';
        *q++ = '\0';
        /* Opening quote + qlen + closing quote + null */
        assert((size_t)(q-nstr) == qlen+3);
    }
    return nstr;
}
/*
 * Store the code point v at q using the original (up to six byte) UTF-8
 * encoding and return the pointer just past the last byte written.
 * Negative values produce no output.
 */
static char *emit_utf8(char *q, int32_t v)
{
    int ntrail;                 /* Number of continuation bytes */
    int lead;                   /* Marker bits of the leading byte */

    if (v < 0)
        return q;               /* Impossible - do nothing */

    if (v <= 0x7f) {
        *q++ = v;
        return q;
    }

    if (v <= 0x000007ff) {
        ntrail = 1;
        lead = 0xc0;
    } else if (v <= 0x0000ffff) {
        ntrail = 2;
        lead = 0xe0;
    } else if (v <= 0x001fffff) {
        ntrail = 3;
        lead = 0xf0;
    } else if (v <= 0x03ffffff) {
        ntrail = 4;
        lead = 0xf8;
    } else {
        ntrail = 5;
        lead = 0xfc;
    }

    /* Leading byte carries the top bits, then six bits per trailer */
    *q++ = lead | (v >> (6 * ntrail));
    while (ntrail-- > 0)
        *q++ = 0x80 | ((v >> (6 * ntrail)) & 63);

    return q;
}
/*
 * Do an *in-place* dequoting of the specified string, returning the
 * resulting length (the result may contain embedded nulls.)
 *
 * In-place replacement is possible since the unquoted length is always
 * shorter than or equal to the quoted length.
 *
 * '...' and "..." strings are taken literally: only the surrounding
 * quotes are removed.  `...` strings additionally have their backslash
 * escapes decoded.  Anything else, including a lone quote character,
 * is left untouched and its length is returned.
 */
size_t nasm_unquote(char *str)
{
    size_t ln;
    char bq, eq;
    char *p, *q, *ep;
    char *escp = NULL;
    char c;
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs
    } state;
    int ndig = 0;
    uint32_t nval = 0;          /* Unsigned: \U may shift into bit 31 */

    bq = str[0];
    if (!bq)
        return 0;
    ln = strlen(str);
    eq = str[ln-1];

    /* ln >= 2: a lone quote is not a quoted string (and ln-2 would wrap) */
    if (ln >= 2 && (bq == '\'' || bq == '\"') && bq == eq) {
        /* '...' or "..." string */
        memmove(str, str+1, ln-2);
        str[ln-2] = '\0';
        return ln-2;
    }

    /*
     * NOTE(review): a string that merely *ends* in a backquote is also
     * processed here; this is kept as-is for compatibility.
     */
    if (bq == '`' || eq == '`') {
        /* `...` string */
        q = str;                /* Write pointer, always behind p */
        p = str+1;              /* Read pointer */
        ep = str+ln-1;          /* The closing quote */
        state = st_start;

        while (p < ep) {
            c = *p++;
            switch (state) {
            case st_start:
                if (c == '\\')
                    state = st_backslash;
                else
                    *q++ = c;
                break;

            case st_backslash:
                state = st_start;
                escp = p-1;     /* The character following the backslash */
                switch (c) {
                case 'a':
                    *q++ = 7;
                    break;
                case 'b':
                    *q++ = 8;
                    break;
                case 'e':
                    *q++ = 27;
                    break;
                case 'f':
                    *q++ = 12;
                    break;
                case 'n':
                    *q++ = 10;
                    break;
                case 'r':
                    *q++ = 13;
                    break;
                case 't':
                    *q++ = 9;
                    break;
                case 'u':
                    state = st_ucs;
                    ndig = 4;
                    nval = 0;
                    break;
                case 'U':
                    state = st_ucs;
                    ndig = 8;
                    nval = 0;
                    break;
                case 'v':
                    *q++ = 11;
                    break;      /* Must not fall into the \x handling */
                case 'x':
                case 'X':
                    state = st_hex;
                    ndig = 0;
                    nval = 0;
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    state = st_oct;
                    ndig = 1;
                    nval = c - '0';
                    break;
                default:
                    *q++ = c;
                    break;
                }
                break;

            case st_oct:
                if (c >= '0' && c <= '7') {
                    nval = (nval << 3) + (c - '0');
                    if (++ndig >= 3) {
                        *q++ = nval;
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    *q++ = nval;
                    state = st_start;
                }
                break;

            case st_hex:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    nval = (nval << 4) + numvalue(c);
                    if (++ndig >= 2) {
                        *q++ = nval;
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    /* "\x" without digits stands for the letter itself */
                    *q++ = ndig ? nval : *escp;
                    state = st_start;
                }
                break;

            case st_ucs:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    nval = (nval << 4) + numvalue(c);
                    if (!--ndig) {
                        q = emit_utf8(q, (int32_t)nval);
                        state = st_start;
                    }
                } else {
                    p--;        /* Process this character again */
                    if (p > escp+1)
                        q = emit_utf8(q, (int32_t)nval);
                    else
                        *q++ = *escp;   /* No digits: keep the letter */
                    state = st_start;
                }
                break;
            }
        }

        /*
         * The closing quote may cut a numeric escape short (e.g. `\0`);
         * flush whatever was collected instead of dropping it.
         */
        switch (state) {
        case st_start:
        case st_backslash:
            break;
        case st_oct:
            *q++ = nval;
            break;
        case st_hex:
            *q++ = ndig ? nval : *escp;
            break;
        case st_ucs:
            if (p > escp+1)
                q = emit_utf8(q, (int32_t)nval);
            else
                *q++ = *escp;
            break;
        }

        *q = '\0';
        return q-str;
    }

    /* Otherwise, just return the input... */
    return ln;
}
/*
 * Find the end of a quoted string; returns the pointer to the terminating
 * character (either the ending quote or the null character, if unterminated.)
 *
 * str must point at the opening quote.  '...' and "..." strings end at
 * the next occurrence of the same quote character.  In `...` strings a
 * backslash escape is skipped as a unit, so an escaped backquote does not
 * end the string.  If str does not start with a quote character, str
 * itself is returned.
 */
char *nasm_skip_string(char *str)
{
    char bq;
    char *p;
    char c;
    enum unq_state {
        st_start,
        st_backslash,
        st_hex,
        st_oct,
        st_ucs
    } state;
    int ndig = 0;

    bq = str[0];
    if (bq == '\'' || bq == '\"') {
        /* '...' or "..." string */
        for (p = str+1; *p && *p != bq; p++)
            ;
        return p;
    } else if (bq == '`') {
        /* `...` string */
        p = str+1;
        state = st_start;

        while ((c = *p++)) {
            switch (state) {
            case st_start:
                switch (c) {
                case '\\':
                    state = st_backslash;
                    break;
                case '`':
                    return p-1; /* Found the end */
                default:
                    break;
                }
                break;

            case st_backslash:
                switch (c) {
                case 'u':
                    state = st_ucs;
                    ndig = 4;
                    break;
                case 'U':
                    state = st_ucs;
                    ndig = 8;
                    break;
                case 'x':
                case 'X':
                    state = st_hex;
                    ndig = 0;
                    break;
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                    state = st_oct;
                    ndig = 1;
                    break;
                default:
                    /* Single-character escape, including \` and \\ */
                    state = st_start;
                    break;
                }
                break;

            case st_oct:
                if (c >= '0' && c <= '7') {
                    if (++ndig >= 3)
                        state = st_start;
                } else {
                    p--;        /* Process this character again */
                    state = st_start;
                }
                break;

            case st_hex:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    if (++ndig >= 2)
                        state = st_start;
                } else {
                    p--;        /* Process this character again */
                    state = st_start;
                }
                break;

            case st_ucs:
                if ((c >= '0' && c <= '9') ||
                    (c >= 'A' && c <= 'F') ||
                    (c >= 'a' && c <= 'f')) {
                    if (!--ndig)
                        state = st_start;
                } else {
                    p--;        /* Process this character again */
                    state = st_start;
                }
                break;
            }
        }

        /*
         * Unterminated string.  The loop's post-increment has already
         * stepped past the null, so back up to point at the null itself.
         */
        return p-1;
    } else {
        return str;             /* Not a string... */
    }
}

11
quote.h Normal file
View File

@@ -0,0 +1,11 @@
/* Declarations for the quoted-string helpers implemented in quote.c */
#ifndef NASM_QUOTE_H
#define NASM_QUOTE_H
#include "compiler.h"
/* Return a newly allocated, quoted representation of the len bytes at str */
char *nasm_quote(char *str, size_t len);
/* Dequote str in place and return the length of the result */
size_t nasm_unquote(char *str);
/* Find the end of the quoted string that starts at str */
char *nasm_skip_string(char *str);
#endif /* NASM_QUOTE_H */