diff --git a/awk.1 b/awk.1 index 050d253..d27dbff 100644 --- a/awk.1 +++ b/awk.1 @@ -577,3 +577,56 @@ The scope rules for variables in functions are a botch; the syntax is worse. .PP Only eight-bit characters sets are handled correctly. +.SH UNUSUAL FLOATING-POINT VALUES +.I Awk +was designed before IEEE 754 arithmetic defined Not-A-Number (NaN) +and Infinity values, which are supported by all modern floating-point +hardware. +.PP +Because +.I awk +uses +.IR strtod (3) +and +.IR atof (3) +to convert string values to double-precision floating-point values, +modern C libraries also convert strings starting with +.B inf +and +.B nan +into infinity and NaN values respectively. This led to strange results, +with something like this: +.PP +.EX +.nf +echo nancy | awk '{ print $1 + 0 }' +.fi +.EE +.PP +printing +.B nan +instead of zero. +.PP +.I Awk +now follows GNU AWK, and prefilters string values before attempting +to convert them to numbers, as follows: +.TP +.I "Hexadecimal values" +Hexadecimal values (allowed since C99) convert to zero, as they did +prior to C99. +.TP +.I "NaN values" +The two strings +.B +nan +and +.B \-nan +(case independent) convert to NaN. No others do. +(NaNs can have signs.) +.TP +.I "Infinity values" +The two strings +.B +inf +and +.B \-inf +(case independent) convert to positive and negative infinity, respectively. +No others do. diff --git a/lex.c b/lex.c index df39c8e..9d1ae06 100644 --- a/lex.c +++ b/lex.c @@ -191,7 +191,12 @@ int yylex(void) return word(buf); if (isdigit(c)) { char *cp = tostring(buf); - yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab); + double result; + + if (is_number(cp, & result)) + yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab); + else + yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab); free(cp); /* should this also have STR set? */ RET(NUMBER); diff --git a/lib.c b/lib.c index db065db..5404818 100644 --- a/lib.c +++ b/lib.c @@ -30,6 +30,7 @@ THIS SOFTWARE. #include #include #include +#include #include "awk.h" char EMPTY[] = { '\0' }; @@ -181,12 +182,14 @@ int getrec(char **pbuf, int *pbufsize, bool isrecord) /* get next input record * innew = false; if (c != 0 || buf[0] != '\0') { /* normal record */ if (isrecord) { + double result; + if (freeable(fldtab[0])) xfree(fldtab[0]->sval); fldtab[0]->sval = buf; /* buf == record */ fldtab[0]->tval = REC | STR | DONTFREE; - if (is_number(fldtab[0]->sval)) { - fldtab[0]->fval = atof(fldtab[0]->sval); + if (is_number(fldtab[0]->sval, & result)) { + fldtab[0]->fval = result; fldtab[0]->tval |= NUM; } } @@ -293,6 +296,7 @@ void setclvar(char *s) /* set var=value from s */ { char *p; Cell *q; + double result; for (p=s; *p != '='; p++) ; @@ -300,8 +304,8 @@ void setclvar(char *s) /* set var=value from s */ p = qstring(p, '\0'); q = setsymtab(s, p, 0.0, STR, symtab); setsval(q, p); - if (is_number(q->sval)) { - q->fval = atof(q->sval); + if (is_number(q->sval, & result)) { + q->fval = result; q->tval |= NUM; } DPRINTF("command line set %s to |%s|\n", s, p); @@ -402,9 +406,11 @@ void fldbld(void) /* create fields from current record */ lastfld = i; donefld = true; for (j = 1; j <= lastfld; j++) { + double result; + p = fldtab[j]; - if(is_number(p->sval)) { - p->fval = atof(p->sval); + if(is_number(p->sval, & result)) { + p->fval = result; p->tval |= NUM; } } @@ -756,24 +762,67 @@ int isclvar(const char *s) /* is s of form var=something ? */ /* strtod is supposed to be a proper test of what's a valid number */ /* appears to be broken in gcc on linux: thinks 0x123 is a valid FP number */ /* wrong: violates 4.10.1.4 of ansi C standard */ + /* well, not quite. As of C99, hex floating point is allowed. so this is - * a bit of a mess. + * a bit of a mess. We work around the mess by checking for a hexadecimal + * value and disallowing it. Similarly, we now follow gawk and allow only + * +nan, -nan, +inf, and -inf for NaN and infinity values. */ -#include -int is_number(const char *s) +/* + * This routine now has a more complicated interface, the main point + * being to avoid the double conversion of a string to double, and + * also to convey out, if requested, the information that the numeric + * value was a leading string or is all of the string. The latter bit + * is used in getfval(). + */ + +bool is_valid_number(const char *s, bool trailing_stuff_ok, + bool *no_trailing, double *result) { double r; char *ep; + bool retval = false; + + if (no_trailing) + *no_trailing = false; + + while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r') + s++; + + if (s[0] == '0' && tolower(s[1]) == 'x') // no hex floating point, sorry + return false; + + // allow +nan, -nan, +inf, -inf, any other letter, no + if (s[0] == '+' || s[0] == '-') { + if (strcasecmp(s+1, "nan") == 0 || strcasecmp(s+1, "inf") == 0) + return true; + else if (! isdigit(s[1]) && s[1] != '.') + return false; + } + else if (! isdigit(s[0]) && s[0] != '.') + return false; + errno = 0; r = strtod(s, &ep); if (ep == s || r == HUGE_VAL || errno == ERANGE) - return 0; - /* allow \r as well. windows files aren't going to go away. */ + return false; + + if (result != NULL) + *result = r; + + /* + * check for trailing stuff + * allow \r as well. windows files aren't going to go away. + */ while (*ep == ' ' || *ep == '\t' || *ep == '\n' || *ep == '\r') ep++; - if (*ep == '\0') - return 1; - else - return 0; + + if (no_trailing) + *no_trailing = (*ep == '\0'); + + // return true if found the end, or trailing stuff is allowed + retval = (*ep == '\0') || trailing_stuff_ok; + + return retval; } diff --git a/proto.h b/proto.h index 71e3cd9..a64991b 100644 --- a/proto.h +++ b/proto.h @@ -146,7 +146,9 @@ extern void eprint(void); extern void bclass(int); extern double errcheck(double, const char *); extern int isclvar(const char *); -extern int is_number(const char *); +extern bool is_valid_number(const char *s, bool trailing_stuff_ok, + bool *no_trailing, double *result); +#define is_number(s, val) is_valid_number(s, false, NULL, val) extern int adjbuf(char **pb, int *sz, int min, int q, char **pbp, const char *what); extern void run(Node *); diff --git a/run.c b/run.c index 854463f..da4f555 100644 --- a/run.c +++ b/run.c @@ -407,6 +407,7 @@ Cell *awkgetline(Node **a, int n) /* get next line from specific input */ int bufsize = recsize; int mode; bool newflag; + double result; if ((buf = (char *) malloc(bufsize)) == NULL) FATAL("out of memory in getline"); @@ -429,15 +430,15 @@ Cell *awkgetline(Node **a, int n) /* get next line from specific input */ } else if (a[0] != NULL) { /* getline var sval)) { - x->fval = atof(x->sval); + if (is_number(x->sval, & result)) { + x->fval = result; x->tval |= NUM; } tempfree(x); } else { /* getline sval)) { - fldtab[0]->fval = atof(fldtab[0]->sval); + if (is_number(fldtab[0]->sval, & result)) { + fldtab[0]->fval = result; fldtab[0]->tval |= NUM; } } @@ -448,8 +449,8 @@ Cell *awkgetline(Node **a, int n) /* get next line from specific input */ n = getrec(&buf, &bufsize, false); x = execute(a[0]); setsval(x, buf); - if (is_number(x->sval)) { - x->fval = atof(x->sval); + if (is_number(x->sval, & result)) { + x->fval = result; x->tval |= NUM; } tempfree(x); @@ -726,7 +727,7 @@ Cell *indirect(Node **a, int n) /* $( a[0] ) */ if ((Awkfloat)INT_MAX < val) FATAL("trying to access out of range field %s", x->nval); m = (int) val; - if (m == 0 && !is_number(s = getsval(x))) /* suspicion! */ + if (m == 0 && !is_number(s = getsval(x), NULL)) /* suspicion! */ FATAL("illegal field $(%s), name \"%s\"", s, x->nval); /* BUG: can x->nval ever be null??? */ tempfree(x); @@ -1259,6 +1260,7 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */ int sep; char temp, num[50]; int n, tempstat, arg3type; + double result; y = execute(a[0]); /* source string */ origs = s = strdup(getsval(y)); @@ -1303,8 +1305,8 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */ snprintf(num, sizeof(num), "%d", n); temp = *patbeg; setptr(patbeg, '\0'); - if (is_number(s)) - setsymtab(num, s, atof(s), STR|NUM, (Array *) ap->sval); + if (is_number(s, & result)) + setsymtab(num, s, result, STR|NUM, (Array *) ap->sval); else setsymtab(num, s, 0.0, STR, (Array *) ap->sval); setptr(patbeg, temp); @@ -1322,8 +1324,8 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */ } n++; snprintf(num, sizeof(num), "%d", n); - if (is_number(s)) - setsymtab(num, s, atof(s), STR|NUM, (Array *) ap->sval); + if (is_number(s, & result)) + setsymtab(num, s, result, STR|NUM, (Array *) ap->sval); else setsymtab(num, s, 0.0, STR, (Array *) ap->sval); spdone: @@ -1343,8 +1345,8 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */ temp = *s; setptr(s, '\0'); snprintf(num, sizeof(num), "%d", n); - if (is_number(t)) - setsymtab(num, t, atof(t), STR|NUM, (Array *) ap->sval); + if (is_number(t, & result)) + setsymtab(num, t, result, STR|NUM, (Array *) ap->sval); else setsymtab(num, t, 0.0, STR, (Array *) ap->sval); setptr(s, temp); @@ -1372,8 +1374,8 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */ temp = *s; setptr(s, '\0'); snprintf(num, sizeof(num), "%d", n); - if (is_number(t)) - setsymtab(num, t, atof(t), STR|NUM, (Array *) ap->sval); + if (is_number(t, & result)) + setsymtab(num, t, result, STR|NUM, (Array *) ap->sval); else setsymtab(num, t, 0.0, STR, (Array *) ap->sval); setptr(s, temp); diff --git a/tran.c b/tran.c index 79964cd..0ce45db 100644 --- a/tran.c +++ b/tran.c @@ -129,9 +129,11 @@ void arginit(int ac, char **av) /* set up ARGV and ARGC */ free(cp->sval); cp->sval = (char *) ARGVtab; for (i = 0; i < ac; i++) { + double result; + sprintf(temp, "%d", i); - if (is_number(*av)) - setsymtab(temp, *av, atof(*av), STR|NUM, ARGVtab); + if (is_number(*av, & result)) + setsymtab(temp, *av, result, STR|NUM, ARGVtab); else setsymtab(temp, *av, 0.0, STR, ARGVtab); av++; @@ -148,13 +150,15 @@ void envinit(char **envp) /* set up ENVIRON variable */ free(cp->sval); cp->sval = (char *) ENVtab; for ( ; *envp; envp++) { + double result; + if ((p = strchr(*envp, '=')) == NULL) continue; if( p == *envp ) /* no left hand side name in env string */ continue; *p++ = 0; /* split into two strings at = */ - if (is_number(p)) - setsymtab(*envp, p, atof(p), STR|NUM, ENVtab); + if (is_number(p, & result)) + setsymtab(*envp, p, result, STR|NUM, ENVtab); else setsymtab(*envp, p, 0.0, STR, ENVtab); p[-1] = '='; /* restore in case env is passed down to a shell */ @@ -399,9 +403,15 @@ Awkfloat getfval(Cell *vp) /* get float val of a Cell */ else if (isrec(vp) && !donerec) recbld(); if (!isnum(vp)) { /* not a number */ - vp->fval = atof(vp->sval); /* best guess */ - if (is_number(vp->sval) && !(vp->tval&CON)) - vp->tval |= NUM; /* make NUM only sparingly */ + double fval; + bool no_trailing; + + if (is_valid_number(vp->sval, true, & no_trailing, & fval)) { + vp->fval = fval; + if (no_trailing && !(vp->tval&CON)) + vp->tval |= NUM; /* make NUM only sparingly */ + } else + vp->fval = 0.0; } DPRINTF("getfval %p: %s = %g, t=%o\n", (void*)vp, NN(vp->nval), vp->fval, vp->tval);