/* Parser of HTTP headers */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "elinks.h" #include "protocol/header.h" #include "util/conv.h" #include "util/error.h" #include "util/memory.h" #include "util/string.h" /* * RFC 2616 HTTP/1.1 June 1999 * * * OCTET = * CHAR = * UPALPHA = * LOALPHA = * ALPHA = UPALPHA | LOALPHA * DIGIT = * CTL = * CR = * LF = * SP = * HT = * <"> = * * HTTP/1.1 defines the sequence CR LF as the end-of-line marker for all * protocol elements except the entity-body (see appendix 19.3 for * tolerant applications). The end-of-line marker within an entity-body * is defined by its associated media type, as described in section 3.7. * * CRLF = CR LF * * HTTP/1.1 header field values can be folded onto multiple lines if the * continuation line begins with a space or horizontal tab. All linear * white space, including folding, has the same semantics as SP. A * recipient MAY replace any linear white space with a single SP before * interpreting the field value or forwarding the message downstream. * * LWS = [CRLF] 1*( SP | HT ) * * The TEXT rule is only used for descriptive field contents and values * that are not intended to be interpreted by the message parser. Words * of *TEXT MAY contain characters from character sets other than ISO- * 8859-1 [22] only when encoded according to the rules of RFC 2047 * [14]. * * TEXT = * * A CRLF is allowed in the definition of TEXT only as part of a header * field continuation. It is expected that the folding LWS will be * replaced with a single SP before interpretation of the TEXT value. * * Hexadecimal numeric characters are used in several protocol elements. * * HEX = "A" | "B" | "C" | "D" | "E" | "F" * | "a" | "b" | "c" | "d" | "e" | "f" | DIGIT * * Many HTTP/1.1 header field values consist of words separated by LWS * or special characters. These special characters MUST be in a quoted * string to be used within a parameter value (as defined in section * 3.6). * * token = 1* * separators = "(" | ")" | "<" | ">" | "@" * | "," | ";" | ":" | "\" | <"> * | "/" | "[" | "]" | "?" | "=" * | "{" | "}" | SP | HT * * Comments can be included in some HTTP header fields by surrounding * the comment text with parentheses. Comments are only allowed in * fields containing "comment" as part of their field value definition. * In all other fields, parentheses are considered part of the field * value. * * comment = "(" *( ctext | quoted-pair | comment ) ")" * ctext = * * A string of text is parsed as a single word if it is quoted using * double-quote marks. * * quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) * qdtext = > * * The backslash character ("\") MAY be used as a single-character * quoting mechanism only within quoted-string and comment constructs. * * quoted-pair = "\" CHAR */ /* FIXME: bug 549 * * HTTP/1.1 header continuation lines are not honoured. * DEL char is accepted in TEXT part. * HT char is not accepted in TEXT part. * LF alone do not mark end of line, CRLF is the correct termination. * CR or LF are invalid in header line. * * Mozilla, IE, NS tolerate header value separator different from ':' * Examples: * name: value * name value * name :value * name=value */ #define LWS(c) ((c) == ' ' || (c) == ASCII_TAB) /** Searches for a message-header with the specified field-name. * * @param[in] head * Where to start searching in the message received from the server. * This function actually ignores the line to which @a head points, * and starts searching from the next line. Therefore, when parsing * an HTTP message, @a head should initially point to the start-line, * e.g. "HTTP/1.1 200 OK". Alternatively, if the caller has already * found a message-header and wants to know if there are any more * message-headers with the same field-name, then @a head can be the * pointer that a previous call stored in *@a ptr. * @param[in] item * The field-name for which this function searches. * @param[out] ptr * If @a ptr is not NULL, and this function finds a message-header, * then this function stores in *@a ptr the address at which the * field-content begins; the caller may pass that as @a head in a * later call. Otherwise, this function does not modify *@a ptr. * @returns * NULL if not found or out of memory. Otherwise, a copy of the * field-content of the message-header; the caller must eventually * mem_free() it. * * The terms message-header, field-name, start-line, and field-content * are defined in RFC 2616 sections 4.1 and 4.2. */ char * parse_header(char *head, const char *item, char **ptr) { char *pos = head; if (!pos) return NULL; while (*pos) { char *end, *value; const char *itempos; int len; /* Go for a newline. */ while (*pos && *pos != ASCII_LF) pos++; if (!*pos) break; pos++; /* Start of line now. */ /* Does item match header line ? */ for (itempos = item; *itempos && *pos; itempos++, pos++) if (c_toupper(*itempos) != c_toupper(*pos)) break; if (!*pos) break; /* Nothing left to parse. */ if (*itempos) continue; /* Do not match. */ /* Be tolerant: we accept headers with * weird syntax, since most browsers does it * anyway, ie: * name value * name :value * name = value * name[TAB]:[TAB]value */ end = pos; /* Skip leading whitespaces if any. */ while (LWS(*pos)) pos++; if (!*pos) break; /* Nothing left to parse. */ /* Eat ':' or '=' if any. */ if (*pos == ':' || *pos == '=') pos++; if (!*pos) break; /* Nothing left to parse. */ /* Skip whitespaces after separator if any. */ while (LWS(*pos)) pos++; if (!*pos) break; /* Nothing left to parse. */ if (pos == end) continue; /* Not an exact match (substring). */ /* Find the end of line/string. * We fail on control chars and DEL char. */ end = pos; while (*end != ASCII_DEL && (*end > ' ' || LWS(*end))) end++; if (!*end) break; /* No end of line, nothing left to parse. */ /* Ignore line if we encountered an unexpected char. */ if (*end != ASCII_CR && *end != ASCII_LF) continue; /* Strip trailing whitespaces. */ while (end > pos && LWS(end[-1])) end--; len = end - pos; assert(len >= 0); if_assert_failed break; if (!len) continue; /* Empty value. */ value = memacpy(pos, len); if (!value) break; /* Allocation failure, stop here. */ if (ptr) *ptr = pos; return value; } return NULL; } /* Extract the value of name part of the value of attribute content. * Ie. @name = "charset" and @str = "text/html; charset=iso-8859-1" * will store in *@ret an allocated string containing "iso-8859-1". * It supposes that separator is ';' and ignore first element in the * list. (ie. '1' is ignored in "1; URL=xxx") * The return value is one of: * * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret. * - HEADER_PARAM_NOT_FOUND: the parameter is not there. *@ret is now NULL. * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL. * * If @ret is NULL, then this function doesn't actually access *@ret, * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY. Some callers may * rely on this. */ enum parse_header_param parse_header_param(char *str, const char *name, char **ret, int content_disposition) { char *p = str; int namelen, plen = 0; if (ret) *ret = NULL; /* default in case of early return */ assert(str && name && *name); if_assert_failed return HEADER_PARAM_NOT_FOUND; /* Returns now if string @str is empty. */ if (!*p) return HEADER_PARAM_NOT_FOUND; namelen = strlen(name); if (!content_disposition) { a: p = strchr(p, ';'); if (!p) return HEADER_PARAM_NOT_FOUND; } while (*p && (*p == ';' || *p <= ' ')) p++; if (strlen(p) < namelen) return HEADER_PARAM_NOT_FOUND; if (c_strncasecmp(p, name, namelen)) goto a; p += namelen; while (*p && (*p <= ' ' || *p == '=')) p++; if (!*p) { if (ret) { *ret = stracpy(""); if (!*ret) return HEADER_PARAM_OUT_OF_MEMORY; } return HEADER_PARAM_FOUND; } while ((p[plen] > ' ' || LWS(p[plen])) && p[plen] != ';') plen++; /* Trim ending spaces */ while (plen > 0 && LWS(p[plen - 1])) plen--; /* XXX: Drop enclosing single quotes if there's some. * * Some websites like newsnow.co.uk are using single quotes around url * in URL field in meta tag content attribute like this: * * * This is an attempt to handle that, but it may break something else. * We drop all pair of enclosing quotes found (eg. '''url''' => url). * Please report any issue related to this. --Zas */ while (plen > 1 && *p == '\'' && p[plen - 1] == '\'') { p++; plen -= 2; } if (ret) { *ret = memacpy(p, plen); if (!*ret) return HEADER_PARAM_OUT_OF_MEMORY; } return HEADER_PARAM_FOUND; } /* Parse string param="value", return value as new string or NULL if any * error. */ char * get_header_param(char *e, const char *name) { char *n, *start; again: while (*e && c_toupper(*e++) != c_toupper(*name)); if (!*e) return NULL; n = (char *)(name + 1); while (*n && c_toupper(*e) == c_toupper(*n)) e++, n++; if (*n) goto again; skip_space(e); if (*e++ != '=') return NULL; skip_space(e); start = e; if (!isquote(*e)) { skip_nonspace(e); } else { unsigned char uu = *e++; start++; while (*e != uu) { if (!*e) return NULL; e++; } } while (start < e && *start == ' ') start++; while (start < e && *(e - 1) == ' ') e--; if (start == e) return NULL; n = (char *)mem_alloc(e - start + 1); if (n) { int i = 0; while (start < e) { n[i++] = ((unsigned char)*start < ' ') ? '.' : *start; start++; } n[i] = '\0'; } return n; }