elinks/src/protocol/header.c

/* Parser of HTTP headers */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <string.h>

#include "elinks.h"

#include "protocol/header.h"
#include "util/conv.h"
#include "util/error.h"
#include "util/memory.h"
#include "util/string.h"

/*
* RFC 2616                        HTTP/1.1                       June 1999
*
*
*        OCTET          = <any 8-bit sequence of data>
*        CHAR           = <any US-ASCII character (octets 0 - 127)>
*        UPALPHA        = <any US-ASCII uppercase letter "A".."Z">
*        LOALPHA        = <any US-ASCII lowercase letter "a".."z">
*        ALPHA          = UPALPHA | LOALPHA
*        DIGIT          = <any US-ASCII digit "0".."9">
*        CTL            = <any US-ASCII control character
*                         (octets 0 - 31) and DEL (127)>
*        CR             = <US-ASCII CR, carriage return (13)>
*        LF             = <US-ASCII LF, linefeed (10)>
*        SP             = <US-ASCII SP, space (32)>
*        HT             = <US-ASCII HT, horizontal-tab (9)>
*        <">            = <US-ASCII double-quote mark (34)>
*
*    HTTP/1.1 defines the sequence CR LF as the end-of-line marker for all
*    protocol elements except the entity-body (see appendix 19.3 for
*    tolerant applications). The end-of-line marker within an entity-body
*    is defined by its associated media type, as described in section 3.7.
*
*        CRLF           = CR LF
*
*    HTTP/1.1 header field values can be folded onto multiple lines if the
*    continuation line begins with a space or horizontal tab. All linear
*    white space, including folding, has the same semantics as SP. A
*    recipient MAY replace any linear white space with a single SP before
*    interpreting the field value or forwarding the message downstream.
*
*        LWS            = [CRLF] 1*( SP | HT )
*
*    The TEXT rule is only used for descriptive field contents and values
*    that are not intended to be interpreted by the message parser. Words
*    of *TEXT MAY contain characters from character sets other than ISO-
*    8859-1 [22] only when encoded according to the rules of RFC 2047
*    [14].
*
*        TEXT           = <any OCTET except CTLs,
*                         but including LWS>
*
*    A CRLF is allowed in the definition of TEXT only as part of a header
*    field continuation. It is expected that the folding LWS will be
*    replaced with a single SP before interpretation of the TEXT value.
*
*    Hexadecimal numeric characters are used in several protocol elements.
*
*        HEX            = "A" | "B" | "C" | "D" | "E" | "F"
*                       | "a" | "b" | "c" | "d" | "e" | "f" | DIGIT
*
*    Many HTTP/1.1 header field values consist of words separated by LWS
*    or special characters. These special characters MUST be in a quoted
*    string to be used within a parameter value (as defined in section
*    3.6).
*
*        token          = 1*<any CHAR except CTLs or separators>
*        separators     = "(" | ")" | "<" | ">" | "@"
*                       | "," | ";" | ":" | "\" | <">
*                       | "/" | "[" | "]" | "?" | "="
*                       | "{" | "}" | SP | HT
*
*    Comments can be included in some HTTP header fields by surrounding
*    the comment text with parentheses. Comments are only allowed in
*    fields containing "comment" as part of their field value definition.
*    In all other fields, parentheses are considered part of the field
*    value.
*
*        comment        = "(" *( ctext | quoted-pair | comment ) ")"
*        ctext          = <any TEXT excluding "(" and ")">
*
*    A string of text is parsed as a single word if it is quoted using
*    double-quote marks.
*
*        quoted-string  = ( <"> *(qdtext | quoted-pair ) <"> )
*        qdtext         = <any TEXT except <">>
*
*    The backslash character ("\") MAY be used as a single-character
*    quoting mechanism only within quoted-string and comment constructs.
*
*        quoted-pair    = "\" CHAR
*/

/* FIXME: bug 549
 *
 * HTTP/1.1 header continuation lines are not honoured.
 * DEL char is accepted in TEXT part.
 * HT char is not accepted in TEXT part.
 * LF alone does not mark the end of a line; CRLF is the correct termination.
 * CR or LF are invalid in header line.
 *
 * Mozilla, IE, NS tolerate header value separator different from ':'
 * Examples:
 * name: value
 * name value
 * name :value
 * name=value
 */

/* True if @c is a space or a horizontal tab, i.e. one of the characters
 * that RFC 2616 allows in linear white space (the optional CRLF folding
 * prefix is not handled here).
 * Note: the argument is evaluated twice, so do not pass an expression
 * with side effects. */
#define LWS(c) ((c) == ' ' || (c) == ASCII_TAB)

/** Searches for a message-header with the specified field-name.
 *
 * @param[in] head
 *   Where to start searching in the message received from the server.
 *   This function actually ignores the line to which @a head points,
 *   and starts searching from the next line.  Therefore, when parsing
 *   an HTTP message, @a head should initially point to the start-line,
 *   e.g. "HTTP/1.1 200 OK".  Alternatively, if the caller has already
 *   found a message-header and wants to know if there are any more
 *   message-headers with the same field-name, then @a head can be the
 *   pointer that a previous call stored in *@a ptr.
 *   A NULL @a head yields NULL.
 * @param[in] item
 *   The field-name for which this function searches.  The comparison
 *   goes through c_toupper() on both sides.
 * @param[out] ptr
 *   If @a ptr is not NULL, and this function finds a message-header,
 *   then this function stores in *@a ptr the address at which the
 *   field-content begins; the caller may pass that as @a head in a
 *   later call.  Otherwise, this function does not modify *@a ptr.
 * @returns
 *   NULL if not found or out of memory.  Otherwise, a copy of the
 *   field-content of the message-header; the caller must eventually
 *   mem_free() it.
 *
 * Matching lines whose value is empty, or whose value contains a
 * control character (other than HT) or DEL before the end of line, are
 * skipped and the search goes on with the following lines.  Octets with
 * the high bit set are accepted in the value regardless of whether plain
 * char is signed on the target platform.
 *
 * The terms message-header, field-name, start-line, and field-content
 * are defined in RFC 2616 sections 4.1 and 4.2.  */
char *
parse_header(char *head, const char *item, char **ptr)
{
	char *pos = head;

	if (!pos) return NULL;

	while (*pos) {
		char *end, *value;
		const char *itempos;
		int len;

		/* Go for a newline. */
		while (*pos && *pos != ASCII_LF) pos++;
		if (!*pos) break;
		pos++; /* Start of line now. */

		/* Does item match header line ? */
		for (itempos = item; *itempos && *pos; itempos++, pos++)
			if (c_toupper(*itempos) != c_toupper(*pos))
				break;

		if (!*pos) break; /* Nothing left to parse. */
		if (*itempos) continue; /* Do not match. */

		/* Be tolerant: we accept headers with
		 * weird syntax, since most browsers do it
		 * anyway, ie:
		 * name value
		 * name :value
		 * name = value
		 * name[TAB]:[TAB]value */

		/* Remember where the field-name ended, so that we can tell
		 * below whether any separator at all followed it. */
		end = pos;

		/* Skip leading whitespaces if any. */
		while (LWS(*pos)) pos++;
		if (!*pos) break; /* Nothing left to parse. */

		/* Eat ':' or '=' if any. */
		if (*pos == ':' || *pos == '=') pos++;
		if (!*pos) break; /* Nothing left to parse. */

		/* Skip whitespaces after separator if any. */
		while (LWS(*pos)) pos++;
		if (!*pos) break; /* Nothing left to parse. */

		if (pos == end) continue; /* Not an exact match (substring). */

		/* Find the end of line/string.
		 * We fail on control chars and DEL char.
		 * The octet is compared as unsigned char: with a signed
		 * plain char, bytes >= 0x80 would be negative, would be
		 * taken for control chars and the line would be dropped. */
		end = pos;
		while (*end != ASCII_DEL
		       && ((unsigned char) *end > ' ' || LWS(*end)))
			end++;
		if (!*end) break; /* No end of line, nothing left to parse. */

		/* Ignore line if we encountered an unexpected char. */
		if (*end != ASCII_CR && *end != ASCII_LF) continue;

		/* Strip trailing whitespaces. */
		while (end > pos && LWS(end[-1])) end--;

		len = end - pos;
		assert(len >= 0);
		if_assert_failed break;

		if (!len) continue;	/* Empty value. */

		value = memacpy(pos, len);
		if (!value) break; /* Allocation failure, stop here. */

		if (ptr) *ptr = pos;
		return value;
	}

	return NULL;
}

/* Extract the value of name part of the value of attribute content.
 * Ie. @name = "charset" and @str = "text/html; charset=iso-8859-1"
 * will store in *@ret an allocated string containing "iso-8859-1".
 * It supposes that separator is ';' and ignore first element in the
 * list. (ie. '1' is ignored in "1; URL=xxx")
 * If @content_disposition is non-zero, the first element is not ignored:
 * it is checked against @name like every following element.
 * The return value is one of:
 *
 * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret.
 * - HEADER_PARAM_NOT_FOUND: the parameter is not there.  *@ret is now NULL.
 * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL.
 *
 * If @ret is NULL, then this function doesn't actually access *@ret,
 * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY.  Some callers may
 * rely on this.
 *
 * The name is compared with c_strncasecmp() over strlen(@name) characters
 * only, so an element that merely starts with @name is accepted as well.
 * A parameter with nothing after its name (or after '=') up to the end of
 * the string yields an empty string.
 *
 * All "is this a control char or a space" tests are done on the octet
 * converted to unsigned char, so that bytes >= 0x80 (e.g. UTF-8 in a
 * filename) are treated as ordinary value characters whether or not plain
 * char is signed on the target platform. */
enum parse_header_param
parse_header_param(char *str, const char *name, char **ret, int content_disposition)
{
	char *p = str;
	int namelen, plen = 0;
	/* Whether to jump to the next ';' before looking at an element.
	 * False only for the very first element of a Content-Disposition
	 * like value; true for every later round. */
	int skip_to_separator = !content_disposition;

	if (ret) *ret = NULL;	/* default in case of early return */

	assert(str && name && *name);
	if_assert_failed return HEADER_PARAM_NOT_FOUND;

	/* Returns now if string @str is empty. */
	if (!*p) return HEADER_PARAM_NOT_FOUND;

	namelen = strlen(name);

	for (;;) {
		if (skip_to_separator) {
			p = strchr(p, ';');
			if (!p) return HEADER_PARAM_NOT_FOUND;
		}
		skip_to_separator = 1;

		/* Skip separators, spaces and control chars before the
		 * parameter name. */
		while (*p && (*p == ';' || (unsigned char) *p <= ' ')) p++;

		if (strlen(p) < (size_t) namelen) return HEADER_PARAM_NOT_FOUND;
		if (!c_strncasecmp(p, name, namelen)) break;
		/* Not this one: @p still points before the next ';' (if any),
		 * so the strchr() above moves on to the following element. */
	}

	p += namelen;

	/* Skip spaces, control chars and '=' between name and value. */
	while (*p && ((unsigned char) *p <= ' ' || *p == '=')) p++;
	if (!*p) {
		if (ret) {
			*ret = stracpy("");
			if (!*ret)
				return HEADER_PARAM_OUT_OF_MEMORY;
		}
		return HEADER_PARAM_FOUND;
	}

	/* The value runs up to the next ';', control char or end of string;
	 * spaces and tabs are allowed inside it. */
	while (((unsigned char) p[plen] > ' ' || LWS(p[plen])) && p[plen] != ';')
		plen++;

	/* Trim ending spaces */
	while (plen > 0 && LWS(p[plen - 1])) plen--;

	/* XXX: Drop enclosing single quotes if there's some.
	 *
	 * Some websites like newsnow.co.uk are using single quotes around url
	 * in URL field in meta tag content attribute like this:
	 * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'">
	 *
	 * This is an attempt to handle that, but it may break something else.
	 * We drop all pair of enclosing quotes found (eg. '''url''' => url).
	 * Please report any issue related to this. --Zas */
	while (plen > 1 && *p == '\'' && p[plen - 1] == '\'') {
		p++;
		plen -= 2;
	}

	if (ret) {
		*ret = memacpy(p, plen);
		if (!*ret)
			return HEADER_PARAM_OUT_OF_MEMORY;
	}
	return HEADER_PARAM_FOUND;
}

/* Parse string param="value", return value as new string or NULL if any
 * error.
 *
 * @e is searched for the first occurrence of @name (characters are
 * compared through c_toupper()).  The occurrence must be followed,
 * after optional skip_space() skipping, by '='; otherwise NULL is returned
 * without looking for a later occurrence.
 *
 * The value is either delimited by a pair of identical quote characters
 * (as recognized by isquote()), or, when unquoted, extends as far as
 * skip_nonspace() advances.  Leading and trailing ' ' characters are
 * trimmed, and every octet below ' ' is replaced with '.' in the copy.
 *
 * Returns NULL when @name is not found, '=' is missing, a quoted value is
 * not terminated, the trimmed value is empty, or allocation fails.
 * Otherwise the result comes from mem_alloc() and belongs to the caller. */
char *
get_header_param(char *e, const char *name)
{
	char *n, *start;

	for (;;) {
		const char *rest;
		char *resume;

		/* Look for the first character of @name. */
		while (*e && c_toupper(*e) != c_toupper(*name)) e++;
		if (!*e) return NULL;

		/* If the remainder of @name does not match here, the search
		 * must restart right after this first character, not where
		 * the comparison failed; otherwise an occurrence overlapping
		 * the partial match is missed ("aab" in "aaab=1"). */
		resume = ++e;

		for (rest = name + 1;
		     *rest && c_toupper(*e) == c_toupper(*rest);
		     rest++)
			e++;
		if (!*rest) break;	/* Whole name matched. */

		e = resume;
	}

	skip_space(e);
	if (*e++ != '=') return NULL;

	skip_space(e);
	start = e;

	if (!isquote(*e)) {
		skip_nonspace(e);
	} else {
		/* Remember which quote opened the value; it must be closed
		 * by the same character. */
		unsigned char uu = *e++;

		start++;
		while (*e != uu) {
			if (!*e) return NULL;	/* Unterminated quote. */
			e++;
		}
	}

	/* Trim spaces on both sides of [start, e). */
	while (start < e && *start == ' ') start++;
	while (start < e && *(e - 1) == ' ') e--;
	if (start == e) return NULL;

	n = (char *)mem_alloc(e - start + 1);
	if (n) {
		int i = 0;

		/* Copy the value, masking control characters. */
		while (start < e) {
			n[i++] = ((unsigned char)*start < ' ') ? '.' : *start;
			start++;
		}
		n[i] = '\0';
	}

	return n;
}