From 715571a5d6c629089b827c46a42c693dab2d45a9 Mon Sep 17 00:00:00 2001 From: Kalle Olavi Niemitalo Date: Sun, 17 Apr 2011 23:23:00 +0300 Subject: [PATCH] HTML bug 1114: Don't doubly decode entities in attributes The HTML parser decoded SGML entity references and numeric character references in the following attributes, and then the renderer did the same again: link/@title link/@hreflang link/@type link/@media img/@alt area/@alt input[@type="image"]/@alt input[@type="image"]/@name input[@type="button"]/@value The result was that e.g. title="&#65;" displayed as "A" even though it was supposed to display as "A". Fix by making the HTML parser tell the renderer that the entities have already been decoded. --- NEWS | 2 ++ src/document/document.h | 3 +++ src/document/format.h | 5 +++++ src/document/html/parser.c | 10 ++++++++-- src/document/html/parser.h | 3 +++ src/document/html/parser/forms.c | 7 ++++++- src/document/html/parser/general.c | 3 ++- src/document/html/parser/link.c | 23 ++++++++++++----------- src/document/html/renderer.c | 5 ++++- src/viewer/text/link.c | 4 +++- 10 files changed, 48 insertions(+), 17 deletions(-) diff --git a/NEWS b/NEWS index be3b7db3b..e0f898689 100644 --- a/NEWS +++ b/NEWS @@ -47,6 +47,8 @@ Miscellaneous: supported UTF-8 as the dump charset.) * Really retry forever when connection.retries = 0. * minor bug 1113: Fix a small memory leak if a mailcap file is malformed. +* minor bug 1114: Decode SGML entities and NCRs only once in link/@title + and other attributes. * enhancement: Session-specific options. Any options changed with toggle-* actions no longer affect other tabs or other terminals. * Do not crash when document.browse.minimum_refresh_time = 0 and diff --git a/src/document/document.h b/src/document/document.h index f6e2461b8..0a24280af 100644 --- a/src/document/document.h +++ b/src/document/document.h @@ -91,6 +91,9 @@ struct link { unsigned char *where; unsigned char *target; unsigned char *where_img; + + /** The title of the link. This is in the document charset, + * and entities have already been decoded. */ unsigned char *title; /** The set of characters belonging to this link (their coordinates diff --git a/src/document/format.h b/src/document/format.h index fee79dcf0..defd2ada7 100644 --- a/src/document/format.h +++ b/src/document/format.h @@ -14,6 +14,11 @@ enum text_style_format { AT_FIXED = 8, AT_GRAPHICS = 16, AT_PREFORMATTED = 32, + + /* AT_NO_ENTITIES means the parser has already expanded + * entities and numeric character references, so the put_chars + * function of the renderer must not do that again. */ + AT_NO_ENTITIES = 64, }; struct text_style_color { diff --git a/src/document/html/parser.c b/src/document/html/parser.c index 7ebc582f0..4b89b8423 100644 --- a/src/document/html/parser.c +++ b/src/document/html/parser.c @@ -662,8 +662,10 @@ look_for_link(unsigned char **pos, unsigned char *eof, struct menu_item **menu, unsigned char *alt = get_attr_val(attr, "alt", options->cp); if (alt) { + /* CSM_NONE because get_attr_val() already + * decoded entities. */ label = convert_string(ct, alt, strlen(alt), - options->cp, CSM_DEFAULT, + options->cp, CSM_NONE, NULL, NULL, NULL); mem_free(alt); } else { @@ -864,7 +866,11 @@ done_html_parser_state(struct html_context *html_context, /* This function does not set html_context.doc_cp = document.cp, * because it does not know the document, and because the codepage has - * not even been decided when it is called. */ + * not even been decided when it is called. + * + * @param[out] title + * The title of the document. This is in the document charset, + * and entities have not been decoded. */ struct html_context * init_html_parser(struct uri *uri, struct document_options *options, unsigned char *start, unsigned char *end, diff --git a/src/document/html/parser.h b/src/document/html/parser.h index 8f99befc2..f989f7a0f 100644 --- a/src/document/html/parser.h +++ b/src/document/html/parser.h @@ -39,7 +39,10 @@ struct text_attrib { unsigned char *link; unsigned char *target; unsigned char *image; + + /* Any entities in the title have already been decoded. */ unsigned char *title; + struct form_control *form; struct text_attrib_color color; diff --git a/src/document/html/parser/forms.c b/src/document/html/parser/forms.c index 5f9dc4757..d40a569a1 100644 --- a/src/document/html/parser/forms.c +++ b/src/document/html/parser/forms.c @@ -232,12 +232,14 @@ html_input_format(struct html_context *html_context, unsigned char *a, } format.style.attr |= AT_BOLD; put_chrs(html_context, "[ ", 7); + format.style.attr |= AT_NO_ENTITIES; if (fc->alt) put_chrs(html_context, fc->alt, strlen(fc->alt)); else if (fc->name) put_chrs(html_context, fc->name, strlen(fc->name)); else put_chrs(html_context, "Submit", 6); + format.style.attr &= ~AT_NO_ENTITIES; put_chrs(html_context, " ]", 7); break; @@ -247,8 +249,11 @@ html_input_format(struct html_context *html_context, unsigned char *a, case FC_BUTTON: format.style.attr |= AT_BOLD; put_chrs(html_context, "[ ", 7); - if (fc->default_value) + if (fc->default_value) { + format.style.attr |= AT_NO_ENTITIES; put_chrs(html_context, fc->default_value, strlen(fc->default_value)); + format.style.attr &= ~AT_NO_ENTITIES; + } put_chrs(html_context, " ]", 7); break; case FC_TEXTAREA: diff --git a/src/document/html/parser/general.c b/src/document/html/parser/general.c index 5f811832c..7e8de820e 100644 --- a/src/document/html/parser/general.c +++ b/src/document/html/parser/general.c @@ -97,7 +97,8 @@ html_superscript(struct html_context *html_context, unsigned char *a, put_chrs(html_context, "^", 1); } -/* TODO: Add more languages. */ +/* TODO: Add more languages. + * Entities can be used in these strings. */ static unsigned char *quote_char[2] = { "\"", "'" }; void diff --git a/src/document/html/parser/link.c b/src/document/html/parser/link.c index 6a6435ef7..02e68cc34 100644 --- a/src/document/html/parser/link.c +++ b/src/document/html/parser/link.c @@ -193,17 +193,21 @@ static void put_image_label(unsigned char *a, unsigned char *label, struct html_context *html_context) { - color_T fg; + color_T saved_foreground; + enum text_style_format saved_attr; /* This is not 100% appropriate for , but well, accepting * accesskey and tabindex near is just our little * extension to the standard. After all, it makes sense. */ html_focusable(html_context, a); - fg = format.style.color.foreground; + saved_foreground = format.style.color.foreground; + saved_attr = format.style.attr; format.style.color.foreground = format.color.image_link; + format.style.attr |= AT_NO_ENTITIES; put_chrs(html_context, label, strlen(label)); - format.style.color.foreground = fg; + format.style.color.foreground = saved_foreground; + format.style.attr = saved_attr; } static void @@ -354,6 +358,7 @@ html_img(struct html_context *html_context, unsigned char *a, html_img_do(a, NULL, html_context); } +/* prefix can have entities in it, but linkname cannot. */ void put_link_line(unsigned char *prefix, unsigned char *linkname, unsigned char *link, unsigned char *target, @@ -370,14 +375,10 @@ put_link_line(unsigned char *prefix, unsigned char *linkname, format.link = join_urls(html_context->base_href, link); format.target = stracpy(target); format.style.color.foreground = format.color.clink; - /* FIXME: linkname typically comes from get_attr_val, which - * has already converted it from the document charset to the - * terminal charset and expanded character entity references. - * The following put_chrs call again converts the characters - * and expands entity references. So if we have - * - * then ELinks will display "foo?<" rather than "foo?<". - * This was mentioned in bug 213. */ + /* linkname typically comes from get_attr_val, which + * has already expanded character entity references. + * Tell put_chrs not to expand them again. */ + format.style.attr |= AT_NO_ENTITIES; put_chrs(html_context, linkname, strlen(linkname)); ln_break(html_context, 1); pop_html_element(html_context); diff --git a/src/document/html/renderer.c b/src/document/html/renderer.c index ec6c31254..303769199 100644 --- a/src/document/html/renderer.c +++ b/src/document/html/renderer.c @@ -1497,7 +1497,8 @@ put_chars_conv(struct html_context *html_context, convert_string(renderer_context.convert_table, chars, charslen, html_context->options->cp, - CSM_DEFAULT, NULL, (void (*)(void *, unsigned char *, int)) put_chars, html_context); + (format.style.attr & AT_NO_ENTITIES) ? CSM_NONE : CSM_DEFAULT, + NULL, (void (*)(void *, unsigned char *, int)) put_chars, html_context); } static inline void @@ -2442,6 +2443,8 @@ render_html_document(struct cache_entry *cached, struct document *document, html_context->doc_cp = document->cp; if (title.length) { + /* CSM_DEFAULT because init_html_parser() did not + * decode entities in the title. */ document->title = convert_string(renderer_context.convert_table, title.source, title.length, document->options.cp, diff --git a/src/viewer/text/link.c b/src/viewer/text/link.c index a73643bd9..86535a558 100644 --- a/src/viewer/text/link.c +++ b/src/viewer/text/link.c @@ -1424,10 +1424,12 @@ get_current_link_title(struct document_view *doc_view) convert_table = get_translation_table(doc_view->document->cp, doc_view->document->options.cp); + /* CSM_NONE because any entities in the title have + * already been decoded. */ link_title = convert_string(convert_table, link->title, strlen(link->title), doc_view->document->options.cp, - CSM_DEFAULT, NULL, NULL, NULL); + CSM_NONE, NULL, NULL, NULL); /* Remove illicit chars. */ #ifdef CONFIG_UTF8 if (link_title && !doc_view->document->options.utf8)