1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-06-20 00:15:31 +00:00

HTML: Rewrite parsing of meta refresh

The URL in <meta http-equiv="Refresh" content="42; URL=target.html">
can now freely contain spaces and semicolons.  There cannot be other
parameters between the delay and the URL.  If the URL is not quoted,
then it spans to the end of the attribute, except not to trailing
spaces.  If the URL is quoted, then it ends at the first closing
quotation mark.  All this is consistent with Debian Iceweasel 3.5.16.
This commit is contained in:
Kalle Olavi Niemitalo 2011-05-01 22:10:46 +03:00 committed by Kalle Olavi Niemitalo
parent 715571a5d6
commit e22eae2e93
8 changed files with 397 additions and 153 deletions

2
NEWS
View File

@ -83,6 +83,8 @@ Miscellaneous:
<video> and <audio>.
* enhancement: Add move-half-page-up and move-half-page-down actions.
* enhancement: Add option to change overlap for vertical scrolling.
* enhancement: HTML meta refresh allows semicolons in URLs, and the
syntax is more like in Firefox.
* link against lua51 not lua50
* SpiderMonkey must be 1.8.5 or later. Find it with pkg-config.
* using iconv for some multibyte charsets. It works if the terminal codepage

View File

@ -1,7 +1,7 @@
# NOTE(review): this span is a rendered diff; it interleaves the old lines
# (SUBDIRS = parser / OBJS without parse-meta-refresh.o) with the new ones.
# The post-change Makefile keeps only the latter pair — verify against the
# actual repository file.
top_builddir=../../..
include $(top_builddir)/Makefile.config
SUBDIRS = parser
OBJS = frames.o parser.o renderer.o tables.o
SUBDIRS = parser test
OBJS = frames.o parse-meta-refresh.o parser.o renderer.o tables.o
include $(top_srcdir)/Makefile.lib

View File

@ -0,0 +1,118 @@
/* Parse <meta http-equiv="refresh" content="..."> */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <errno.h>
#include <stdlib.h>
#include "elinks.h"
#include "document/html/parse-meta-refresh.h"
#include "osdep/ascii.h"
#include "util/string.h"
#define LWS(c) ((c) == ' ' || (c) == ASCII_TAB)
/* Parse the content attribute of <meta http-equiv="refresh" content="...">.
 *
 * On success, returns 0; *delay_out holds the delay in seconds and
 * *url_out holds a freshly allocated copy of the target URL, or NULL
 * if no URL was given (the caller then refreshes the same document).
 * Returns -1 on error, in which case *url_out is NULL.
 *
 * The URL may freely contain spaces and semicolons.  An unquoted URL
 * extends to the end of the attribute (minus trailing whitespace); a
 * quoted URL ends at the first matching closing quote. */
int
html_parse_meta_refresh(const unsigned char *content,
unsigned long *delay_out,
unsigned char **url_out)
{
const unsigned char *scan = content;
char *delay_end;
int saw_delay = 0;
const unsigned char *url_begin;
const unsigned char *url_end;
/* Initialize the outputs up front so every early return leaves
 * them in a defined state (error contract: *url_out == NULL). */
*url_out = NULL;
*delay_out = 0;
/* Skip whitespace before the delay. */
while (LWS(*scan))
++scan;
/* TODO: Do we need to switch to the "C" locale and back? */
/* On overflow strtoul saturates at ULONG_MAX and the result is
 * kept as-is; signs ("+4", "-1") are accepted by strtoul too. */
*delay_out = strtoul(scan, &delay_end, 10);
saw_delay = (scan != (const unsigned char *) delay_end);
scan = (const unsigned char *) delay_end;
if (saw_delay) {
/* Omit any fractional part. */
if (*scan == '.') {
++scan;
while (!(*scan == '\0' || LWS(*scan)
|| *scan == ';' || *scan == ','))
++scan;
}
if (!(*scan == '\0' || LWS(*scan)
|| *scan == ';' || *scan == ',')) {
/* The delay is followed by garbage. Give up. */
return -1;
}
/* Between the delay and the URL, there must be at
 * least one LWS, semicolon, or comma; optionally with
 * more LWS around it. */
while (LWS(*scan))
++scan;
if (*scan == ';' || *scan == ',')
++scan;
} else {
/* The delay was not specified. The delimiter must be
 * a semicolon or a comma, optionally with LWS. LWS
 * alone does not suffice. */
while (*scan != '\0' && *scan != ';' && *scan != ',')
++scan;
if (*scan == ';' || *scan == ',')
++scan;
}
/* Skip whitespace between the delimiter and the URL. */
while (LWS(*scan))
++scan;
/* Presume the URL begins here... */
url_begin = scan;
/* ..unless there is "URL=" with at least one equals sign,
 * and optional spaces. */
if ((scan[0] == 'U' || scan[0] == 'u')
&& (scan[1] == 'R' || scan[1] == 'r')
&& (scan[2] == 'L' || scan[2] == 'l')) {
scan += 3;
while (LWS(*scan))
++scan;
if (*scan == '=') {
/* If there is no '=', url_begin still points at
 * the "URL..." text, which is then taken as the
 * URL itself. */
++scan;
while (LWS(*scan))
++scan;
url_begin = scan;
}
}
if (*url_begin == '"' || *url_begin == '\'') {
/* A quoted URL ends at the first matching quote; an
 * unterminated quote makes the URL run to the end of
 * the string. */
unsigned char quote = *url_begin++;
url_end = strchr(url_begin, quote);
if (url_end == NULL)
url_end = strchr(url_begin, '\0');
} else {
url_end = strchr(url_begin, '\0');
}
/* In any case, trim all spaces from the end of the URL. */
while (url_begin < url_end && LWS(url_end[-1]))
--url_end;
if (url_begin != url_end) {
/* memacpy allocates the copy; the caller must mem_free()
 * it.  Allocation failure is reported as a parse error. */
*url_out = memacpy(url_begin, url_end - url_begin);
if (!*url_out)
return -1;
} else if (!saw_delay) {
/* There is no delay and no URL. */
return -1;
}
return 0;
}

View File

@ -0,0 +1,21 @@
#ifndef EL__DOCUMENT_HTML_PARSE_META_REFRESH_H
#define EL__DOCUMENT_HTML_PARSE_META_REFRESH_H
/** Parses a \<meta http-equiv="refresh" content="..."> element.
 *
 * @param[in] content
 * The value of the content attribute, with entities already expanded.
 * @param[out] delay
 * How many seconds to wait before refreshing.  Overlarge values
 * saturate rather than overflow.
 * @param[out] url
 * The URI to load when refreshing, or NULL to reload the same document.
 * The caller must free the string with mem_free() unless it's NULL.
 * The string is copied from @a content, so the caller may free that.
 *
 * @return
 * 0 if successful, or negative on error (bad syntax, or out of
 * memory while copying the URL).
 * On error, *@a url is NULL. */
int html_parse_meta_refresh(const unsigned char *content,
unsigned long *delay,
unsigned char **url);
#endif

View File

@ -22,6 +22,7 @@
#include "document/css/css.h"
#include "document/css/stylesheet.h"
#include "document/html/frames.h"
#include "document/html/parse-meta-refresh.h"
#include "document/html/parser/link.h"
#include "document/html/parser/stack.h"
#include "document/html/parser/parse.h"
@ -277,175 +278,42 @@ html_skip(struct html_context *html_context, unsigned char *a)
html_top->type = ELEMENT_DONT_KILL;
}
#define LWS(c) ((c) == ' ' || (c) == ASCII_TAB)
/* Parse a meta refresh whose content attribute lacks the "URL=" marker:
 * <meta http-equiv="refresh" content="3,http://elinks.or.cz/">
 * <meta http-equiv="refresh" content="3; http://elinks.or.cz/">
 * <meta http-equiv="refresh" content=" 3 ; http://elinks.or.cz/ ">
 *
 * On success *ret gets a freshly allocated copy of the URL, which the
 * caller must free; otherwise *ret is left NULL. */
static void
parse_old_meta_refresh(unsigned char *str, unsigned char **ret)
{
	unsigned char *p = str;
	int len;

	assert(str && ret);
	if_assert_failed return;

	*ret = NULL;

	/* Skip whitespace before the delay. */
	while (*p && LWS(*p)) p++;
	if (!*p) return;
	/* Skip the decimal delay value itself. */
	while (*p && *p >= '0' && *p <= '9') p++;
	if (!*p) return;
	/* The delay and the URL must be separated by ';' or ',',
	 * optionally surrounded by whitespace. */
	while (*p && LWS(*p)) p++;
	if (!*p) return;
	if (*p == ';' || *p == ',') p++; else return;
	while (*p && LWS(*p)) p++;
	if (!*p) return;

	len = strlen(p);
	/* Trim trailing whitespace.  Look at p[len - 1]: p[len] is the
	 * NUL terminator, so testing it (as the previous version did)
	 * never stripped anything. */
	while (len && LWS(p[len - 1])) len--;
	if (len) *ret = memacpy(p, len);
}
/* Search for the URL part in the content attribute and return
 * it if found.
 * It searches for the first occurrence of the "url" marker,
 * ignoring anything before it.
 * It should cope with most situations, including:
 * content="0; URL='http://www.site.com/path/xxx.htm'"
 * content="0 url=http://www.site.com/path/xxx.htm"
 * content="anything ; some url === ''''http://www.site.com/path/xxx.htm''''
 *
 * The return value is one of:
 *
 * - HEADER_PARAM_FOUND: the parameter was found, copied, and stored in *@ret.
 * - HEADER_PARAM_NOT_FOUND: the parameter is not there. *@ret is now NULL.
 * - HEADER_PARAM_OUT_OF_MEMORY: error. *@ret is now NULL.
 *
 * If @ret is NULL, then this function doesn't actually access *@ret,
 * and cannot fail with HEADER_PARAM_OUT_OF_MEMORY. Some callers may
 * rely on this. */
static enum parse_header_param
search_for_url_param(unsigned char *str, unsigned char **ret)
{
unsigned char *p;
int plen = 0;
if (ret) *ret = NULL; /* default in case of early return */
assert(str);
if_assert_failed return HEADER_PARAM_NOT_FOUND;
/* Returns now if string @str is empty. */
if (!*str) return HEADER_PARAM_NOT_FOUND;
/* Case-insensitive search for the "url" marker anywhere in @str. */
p = c_strcasestr(str, "url");
if (!p) return HEADER_PARAM_NOT_FOUND;
p += 3;
/* Skip any mix of control chars, spaces, and '=' signs after "url". */
while (*p && (*p <= ' ' || *p == '=')) p++;
if (!*p) {
/* "url" with nothing after it: report an empty URL string. */
if (ret) {
*ret = stracpy("");
if (!*ret)
return HEADER_PARAM_OUT_OF_MEMORY;
}
return HEADER_PARAM_FOUND;
}
/* Measure the URL: printable characters and LWS, up to ';' or
 * a control character. */
while ((p[plen] > ' ' || LWS(p[plen])) && p[plen] != ';') plen++;
/* Trim ending spaces */
while (plen > 0 && LWS(p[plen - 1])) plen--;
/* XXX: Drop enclosing single quotes if there's some.
 *
 * Some websites like newsnow.co.uk are using single quotes around url
 * in URL field in meta tag content attribute like this:
 * <meta http-equiv="Refresh" content="0; URL='http://www.site.com/path/xxx.htm'">
 *
 * This is an attempt to handle that, but it may break something else.
 * We drop all pair of enclosing quotes found (eg. '''url''' => url).
 * Please report any issue related to this. --Zas */
while (plen > 1 && *p == '\'' && p[plen - 1] == '\'') {
p++;
plen -= 2;
}
if (ret) {
/* memacpy allocates; the caller frees with mem_free(). */
*ret = memacpy(p, plen);
if (!*ret)
return HEADER_PARAM_OUT_OF_MEMORY;
}
return HEADER_PARAM_FOUND;
}
#undef LWS
/* NOTE(review): this span is a rendered diff that interleaves the
 * pre-change and post-change versions of check_head_for_refresh()
 * (e.g. both declarations of `refresh`, two put_link_line() calls,
 * and two join_urls() calls appear below).  It is not compilable as
 * shown; consult the repository for either clean version.  Comments
 * below are hedged accordingly. */
static void
check_head_for_refresh(struct html_context *html_context, unsigned char *head)
{
unsigned char *refresh, *url;
unsigned char *refresh;
unsigned char *url = NULL;
unsigned char *joined_url = NULL;
unsigned long seconds;
/* Extract the value of a "Refresh" header-like field from @head. */
refresh = parse_header(head, "Refresh", NULL);
if (!refresh) return;
/* Old version: ad-hoc URL extraction with a tolerant fallback. */
search_for_url_param(refresh, &url);
if (!url) {
/* Let's try a more tolerant parsing. */
parse_old_meta_refresh(refresh, &url);
/* New version: single strict parser for delay and URL. */
if (html_parse_meta_refresh(refresh, &seconds, &url) == 0) {
if (!url) {
/* If the URL parameter is missing assume that the
 * document being processed should be refreshed. */
url = get_uri_string(html_context->base_href, URI_ORIGINAL);
url = get_uri_string(html_context->base_href,
URI_ORIGINAL);
}
}
if (url) {
/* Extraction of refresh time. */
unsigned long seconds = 0;
int valid = 1;
if (url)
joined_url = join_urls(html_context->base_href, url);
/* We try to extract the refresh time, and to handle weird things
 * in an elegant way. Among things we can have negative values,
 * too big ones, just ';' (we assume 0 seconds in that case) and
 * more. */
if (*refresh != ';') {
if (isdigit(*refresh)) {
unsigned long max_seconds = HTTP_REFRESH_MAX_DELAY;
if (joined_url) {
/* Clamp overlong delays to the configured maximum. */
if (seconds > HTTP_REFRESH_MAX_DELAY)
seconds = HTTP_REFRESH_MAX_DELAY;
errno = 0;
seconds = strtoul(refresh, NULL, 10);
if (errno == ERANGE || seconds > max_seconds) {
/* Too big refresh value, limit it. */
seconds = max_seconds;
} else if (errno) {
/* Bad syntax */
valid = 0;
}
} else {
/* May be a negative number, or some bad syntax. */
valid = 0;
}
}
html_focusable(html_context, NULL);
if (valid) {
unsigned char *joined_url = join_urls(html_context->base_href, url);
html_focusable(html_context, NULL);
put_link_line("Refresh: ", url, joined_url,
html_context->options->framename, html_context);
html_context->special_f(html_context, SP_REFRESH, seconds, joined_url);
mem_free(joined_url);
}
mem_free(url);
put_link_line("Refresh: ", url, joined_url,
html_context->options->framename, html_context);
html_context->special_f(html_context, SP_REFRESH, seconds, joined_url);
}
mem_free_if(joined_url);
mem_free_if(url);
mem_free(refresh);
}

View File

@ -0,0 +1,9 @@
# Build the unit tests for src/document/html.
# TEST_PROGS lists the test executables; TESTDEPS pulls in the object
# under test so the test program links against the real parser.
top_builddir=../../../..
include $(top_builddir)/Makefile.config
SUBDIRS =
TEST_PROGS = parse-meta-refresh-test
TESTDEPS += \
$(top_builddir)/src/document/html/parse-meta-refresh.o
include $(top_srcdir)/Makefile.lib

View File

@ -0,0 +1,223 @@
/* Test parsing of <meta http-equiv="refresh" content="..."> */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "elinks.h"
#include "document/html/parse-meta-refresh.h"
#include "util/memory.h"
/* One test case for html_parse_meta_refresh(). */
struct meta_refresh_test_case
{
/* The content attribute value fed to the parser. */
const unsigned char *content;
/* Expected return value: 0 for success, negative for error. */
int error;
/* Expected delay in seconds (checked only when error >= 0). */
unsigned long delay;
/* Expected URL, or NULL if no URL should be reported. */
const unsigned char *url;
};
/* Table of inputs and expected outputs; terminated by a NULL content.
 * Most expectations were verified against Iceweasel/3.5.16, as noted
 * per group below. */
static const struct meta_refresh_test_case meta_refresh_test_cases[] = {
/* delay only */
{ "42",
0, 42, NULL },
{ "0",
0, 0, NULL },
{ " 5 ",
0, 5, NULL },
{ "9999999999999999999999999",
0, ULONG_MAX, NULL },
{ "69 ; ",
0, 69, NULL },
{ "105;",
0, 105, NULL },
{ "",
-1, 0, NULL },
/* blank URL; these match Iceweasel/3.5.16 */
{ "5; URL=''",
0, 5, NULL },
{ "; URL=''",
-1, 0, NULL },
/* simple; these match Iceweasel/3.5.16 */
{ "42; URL=file:///dir/file.html",
0, 42, "file:///dir/file.html" },
{ "42; URL='file:///dir/file.html'",
0, 42, "file:///dir/file.html" },
{ "42; URL=\"file:///dir/file.html\"",
0, 42, "file:///dir/file.html" },
/* without URL=; these match Iceweasel/3.5.16 */
{ "9; file:///dir/file.html",
0, 9, "file:///dir/file.html" },
{ "9; 'file:///dir/file.html'",
0, 9, "file:///dir/file.html" },
{ "9; \"file:///dir/file.html\"",
0, 9, "file:///dir/file.html" },
/* lower case; these match Iceweasel/3.5.16 */
{ "3; Url=\"file:///dir/file.html\"",
0, 3, "file:///dir/file.html" },
{ "3; url=\"file:///dir/file.html\"",
0, 3, "file:///dir/file.html" },
/* unusual delimiters; these match Iceweasel/3.5.16 */
{ "0 URL=\"file:///dir/file.html\"",
0, 0, "file:///dir/file.html" },
{ "0 ; URL = \"file:///dir/file.html\"",
0, 0, "file:///dir/file.html" },
{ "1, URL=\"file:///dir/file.html\"",
0, 1, "file:///dir/file.html" },
{ "+0 URL='file:///dir/file.html'",
0, 0, "file:///dir/file.html" },
{ "+0 URL=foo; URL='file:///dir/file.html'; URL='bar'",
0, 0, "foo; URL='file:///dir/file.html'; URL='bar'" },
{ "+ URL=foo; URL='file:///dir/file.html'; URL='bar'",
0, 0, "file:///dir/file.html" },
{ ". URL=foo; URL='file:///dir/file.html'; URL='bar'",
0, 0, "file:///dir/file.html" },
{ ".0 URL=foo; URL='file:///dir/file.html'; URL='bar'",
0, 0, "file:///dir/file.html" },
{ "0. URL=foo; URL='file:///dir/file.html'; URL='bar'",
0, 0, "foo; URL='file:///dir/file.html'; URL='bar'" },
{ "4URL=foo; URL='file:///dir/file.html'; URL='bar'",
-1, 0, NULL },
{ "garbage URL='file:///dir/file.html'",
-1, 0, NULL },
/* semicolons in the URL; these match Iceweasel/3.5.16 */
{ "3; URL=file:///dir/file.cgi?a=1;b=2;c=3",
0, 3, "file:///dir/file.cgi?a=1;b=2;c=3" },
{ "3; URL=\"file:///dir/file.cgi?a=1;b=2;c=3\"",
0, 3, "file:///dir/file.cgi?a=1;b=2;c=3" },
/* spaces in the URL; these match Iceweasel/3.5.16 */
{ "3; URL=\"file:///dir/file.cgi?phrase=Hello, world!\"",
0, 3, "file:///dir/file.cgi?phrase=Hello, world!" },
{ "3; URL=\"file:///dir/file.cgi?phrase=Hello, world! \"",
0, 3, "file:///dir/file.cgi?phrase=Hello, world!" },
{ "3; URL=\"file:///dir/file.cgi?phrase=Hello, world! %20 \"",
0, 3, "file:///dir/file.cgi?phrase=Hello, world! %20" },
{ "3; URL=file:///dir/file.cgi?phrase=Hello, world!",
0, 3, "file:///dir/file.cgi?phrase=Hello, world!" },
{ "3; URL=file:///dir/file.cgi?phrase=Hello, world! ",
0, 3, "file:///dir/file.cgi?phrase=Hello, world!" },
/* "URL" in the URL; these match Iceweasel/3.5.16 */
{ "0; URL=file:///dir/xlat.cgi?url=http://example.org/&lang=cu",
0, 0, "file:///dir/xlat.cgi?url=http://example.org/&lang=cu" },
{ "0; file:///dir/xlat.cgi?url=http://example.org/&lang=cu",
0, 0, "file:///dir/xlat.cgi?url=http://example.org/&lang=cu" },
/* unusual delays; these sort-of match Iceweasel/3.5.16,
 * except it was not tested whether Iceweasel truncates the
 * delay to an integer, and it was not tested how long the
 * delays get with negative numbers. */
{ "; URL=\"file:///dir/file.html\"",
0, 0, "file:///dir/file.html" },
{ "2.99999; file:///dir/file.html",
0, 2, "file:///dir/file.html" },
{ "2.99999; 'file:///dir/file.html'",
0, 2, "file:///dir/file.html" },
{ "040; URL='file:///dir/file.html'",
0, 40, "file:///dir/file.html" },
{ "+4; URL='file:///dir/file.html'",
0, 4, "file:///dir/file.html" },
{ " 2; URL='file:///dir/file.html'",
0, 2, "file:///dir/file.html" },
{ "+0; URL='file:///dir/file.html'",
0, 0, "file:///dir/file.html" },
{ "-0; URL='file:///dir/file.html'",
0, 0, "file:///dir/file.html" },
{ "-0.1; URL='file:///dir/file.html'",
0, 0, "file:///dir/file.html" },
{ "-1; URL='file:///dir/file.html'",
0, -1UL, "file:///dir/file.html" },
{ "-2; URL='file:///dir/file.html'",
0, -2UL, "file:///dir/file.html" },
{ "garbage; URL='file:///dir/file.html'",
0, 0, "file:///dir/file.html" },
{ "'5;1'; URL='file:///dir/file.html'",
0, 0, "1'; URL='file:///dir/file.html'" },
{ "2,6; URL='file:///dir/file.html'",
0, 2, "6; URL='file:///dir/file.html'" },
{ "2 3; URL='file:///dir/file.html'",
0, 2, "3; URL='file:///dir/file.html'" },
/* unusual delay; not verified against Iceweasel */
{ "9999999999999999999999999; URL='file:///dir/file.html'",
0, ULONG_MAX, "file:///dir/file.html" },
/* other stuff after the URL; these match Iceweasel/3.5.16 */
{ "5; URL=file:///dir/file.html ",
0, 5, "file:///dir/file.html" },
{ "5; URL=file:///dir/file.html\t",
0, 5, "file:///dir/file.html" },
{ "5; URL=\"file:///dir/file.html\" ",
0, 5, "file:///dir/file.html" },
{ "5; URL=\"file:///dir/file.html\"\t\t",
0, 5, "file:///dir/file.html" },
{ "5; URL=\"file:///dir/file.html\" ; ",
0, 5, "file:///dir/file.html" },
{ "5; URL=\"file:///dir/file.html\"; transition=\"sweep\"",
0, 5, "file:///dir/file.html" },
/* sentinel */
{ NULL, 0, 0, NULL }
};
int
main(void)
{
const struct meta_refresh_test_case *test;
int count_ok = 0;
int count_fail = 0;
for (test = meta_refresh_test_cases; test->content; test++) {
static unsigned char dummy[] = "dummy";
unsigned long delay = 21;
unsigned char *url = dummy;
int error = html_parse_meta_refresh(test->content,
&delay, &url);
if (error < 0 && test->error < 0 && url == NULL) {
/* Test OK */
count_ok++;
} else if (error >= 0 && test->error >= 0
&& ((!url && !test->url)
|| (url && test->url && !strcmp(url, test->url)))
&& delay == test->delay) {
/* Test OK */
count_ok++;
} else {
fprintf(stderr, "Test failed at input: %s\n"
"\tParsed error: %d\n"
"\tCorrect error: %d\n"
"\tParsed delay: %lu\n"
"\tCorrect delay: %lu\n"
"\tParsed URL: %s\n"
"\tCorrect URL: %s\n",
test->content,
error,
test->error,
delay,
test->delay,
url ? (char *) url : "(null)",
test->url ? (char *) test->url : "(null)");
count_fail++;
}
if (url != dummy && url != NULL)
mem_free(url);
}
printf("Summary of meta refresh tests: %d OK, %d failed.\n",
count_ok, count_fail);
return count_fail ? EXIT_FAILURE : EXIT_SUCCESS;
}

View File

@ -0,0 +1,3 @@
#! /bin/sh -e
# Run the meta-refresh parser unit test; -e propagates a failing
# exit status from the test program to the caller (e.g. make).
./parse-meta-refresh-test