1
0
mirror of https://github.com/rkd77/elinks.git synced 2025-01-03 14:57:44 -05:00

HTML bug 1114: Don't doubly decode entities in attributes

The HTML parser decoded SGML entity references and numeric character
references in the following attributes, and then the renderer did the
same again:

link/@title
link/@hreflang
link/@type
link/@media
img/@alt
area/@alt
input[@type="image"]/@alt
input[@type="image"]/@name
input[@type="button"]/@value

The result was that e.g. title="A" displayed as "A"
even though it was supposed to display as "A".

Fix by making the HTML parser tell the renderer that the entities have
already been decoded.
This commit is contained in:
Kalle Olavi Niemitalo 2011-04-17 23:23:00 +03:00 committed by Kalle Olavi Niemitalo
parent 0f49fe1c38
commit 715571a5d6
10 changed files with 48 additions and 17 deletions

2
NEWS
View File

@ -47,6 +47,8 @@ Miscellaneous:
supported UTF-8 as the dump charset.) supported UTF-8 as the dump charset.)
* Really retry forever when connection.retries = 0. * Really retry forever when connection.retries = 0.
* minor bug 1113: Fix a small memory leak if a mailcap file is malformed. * minor bug 1113: Fix a small memory leak if a mailcap file is malformed.
* minor bug 1114: Decode SGML entities and NCRs only once in link/@title
and other attributes.
* enhancement: Session-specific options. Any options changed with * enhancement: Session-specific options. Any options changed with
toggle-* actions no longer affect other tabs or other terminals. toggle-* actions no longer affect other tabs or other terminals.
* Do not crash when document.browse.minimum_refresh_time = 0 and * Do not crash when document.browse.minimum_refresh_time = 0 and

View File

@ -91,6 +91,9 @@ struct link {
unsigned char *where; unsigned char *where;
unsigned char *target; unsigned char *target;
unsigned char *where_img; unsigned char *where_img;
/** The title of the link. This is in the document charset,
* and entities have already been decoded. */
unsigned char *title; unsigned char *title;
/** The set of characters belonging to this link (their coordinates /** The set of characters belonging to this link (their coordinates

View File

@ -14,6 +14,11 @@ enum text_style_format {
AT_FIXED = 8, AT_FIXED = 8,
AT_GRAPHICS = 16, AT_GRAPHICS = 16,
AT_PREFORMATTED = 32, AT_PREFORMATTED = 32,
/* AT_NO_ENTITIES means the parser has already expanded
* entities and numeric character references, so the put_chars
* function of the renderer must not do that again. */
AT_NO_ENTITIES = 64,
}; };
struct text_style_color { struct text_style_color {

View File

@ -662,8 +662,10 @@ look_for_link(unsigned char **pos, unsigned char *eof, struct menu_item **menu,
unsigned char *alt = get_attr_val(attr, "alt", options->cp); unsigned char *alt = get_attr_val(attr, "alt", options->cp);
if (alt) { if (alt) {
/* CSM_NONE because get_attr_val() already
* decoded entities. */
label = convert_string(ct, alt, strlen(alt), label = convert_string(ct, alt, strlen(alt),
options->cp, CSM_DEFAULT, options->cp, CSM_NONE,
NULL, NULL, NULL); NULL, NULL, NULL);
mem_free(alt); mem_free(alt);
} else { } else {
@ -864,7 +866,11 @@ done_html_parser_state(struct html_context *html_context,
/* This function does not set html_context.doc_cp = document.cp, /* This function does not set html_context.doc_cp = document.cp,
* because it does not know the document, and because the codepage has * because it does not know the document, and because the codepage has
* not even been decided when it is called. */ * not even been decided when it is called.
*
* @param[out] title
* The title of the document. This is in the document charset,
* and entities have not been decoded. */
struct html_context * struct html_context *
init_html_parser(struct uri *uri, struct document_options *options, init_html_parser(struct uri *uri, struct document_options *options,
unsigned char *start, unsigned char *end, unsigned char *start, unsigned char *end,

View File

@ -39,7 +39,10 @@ struct text_attrib {
unsigned char *link; unsigned char *link;
unsigned char *target; unsigned char *target;
unsigned char *image; unsigned char *image;
/* Any entities in the title have already been decoded. */
unsigned char *title; unsigned char *title;
struct form_control *form; struct form_control *form;
struct text_attrib_color color; struct text_attrib_color color;

View File

@ -232,12 +232,14 @@ html_input_format(struct html_context *html_context, unsigned char *a,
} }
format.style.attr |= AT_BOLD; format.style.attr |= AT_BOLD;
put_chrs(html_context, "[ ", 7); put_chrs(html_context, "[ ", 7);
format.style.attr |= AT_NO_ENTITIES;
if (fc->alt) if (fc->alt)
put_chrs(html_context, fc->alt, strlen(fc->alt)); put_chrs(html_context, fc->alt, strlen(fc->alt));
else if (fc->name) else if (fc->name)
put_chrs(html_context, fc->name, strlen(fc->name)); put_chrs(html_context, fc->name, strlen(fc->name));
else else
put_chrs(html_context, "Submit", 6); put_chrs(html_context, "Submit", 6);
format.style.attr &= ~AT_NO_ENTITIES;
put_chrs(html_context, " ]", 7); put_chrs(html_context, " ]", 7);
break; break;
@ -247,8 +249,11 @@ html_input_format(struct html_context *html_context, unsigned char *a,
case FC_BUTTON: case FC_BUTTON:
format.style.attr |= AT_BOLD; format.style.attr |= AT_BOLD;
put_chrs(html_context, "[ ", 7); put_chrs(html_context, "[ ", 7);
if (fc->default_value) if (fc->default_value) {
format.style.attr |= AT_NO_ENTITIES;
put_chrs(html_context, fc->default_value, strlen(fc->default_value)); put_chrs(html_context, fc->default_value, strlen(fc->default_value));
format.style.attr &= ~AT_NO_ENTITIES;
}
put_chrs(html_context, " ]", 7); put_chrs(html_context, " ]", 7);
break; break;
case FC_TEXTAREA: case FC_TEXTAREA:

View File

@ -97,7 +97,8 @@ html_superscript(struct html_context *html_context, unsigned char *a,
put_chrs(html_context, "^", 1); put_chrs(html_context, "^", 1);
} }
/* TODO: Add more languages. */ /* TODO: Add more languages.
* Entities can be used in these strings. */
static unsigned char *quote_char[2] = { "\"", "'" }; static unsigned char *quote_char[2] = { "\"", "'" };
void void

View File

@ -193,17 +193,21 @@ static void
put_image_label(unsigned char *a, unsigned char *label, put_image_label(unsigned char *a, unsigned char *label,
struct html_context *html_context) struct html_context *html_context)
{ {
color_T fg; color_T saved_foreground;
enum text_style_format saved_attr;
/* This is not 100% appropriate for <img>, but well, accepting /* This is not 100% appropriate for <img>, but well, accepting
* accesskey and tabindex near <img> is just our little * accesskey and tabindex near <img> is just our little
* extension to the standard. After all, it makes sense. */ * extension to the standard. After all, it makes sense. */
html_focusable(html_context, a); html_focusable(html_context, a);
fg = format.style.color.foreground; saved_foreground = format.style.color.foreground;
saved_attr = format.style.attr;
format.style.color.foreground = format.color.image_link; format.style.color.foreground = format.color.image_link;
format.style.attr |= AT_NO_ENTITIES;
put_chrs(html_context, label, strlen(label)); put_chrs(html_context, label, strlen(label));
format.style.color.foreground = fg; format.style.color.foreground = saved_foreground;
format.style.attr = saved_attr;
} }
static void static void
@ -354,6 +358,7 @@ html_img(struct html_context *html_context, unsigned char *a,
html_img_do(a, NULL, html_context); html_img_do(a, NULL, html_context);
} }
/* prefix can have entities in it, but linkname cannot. */
void void
put_link_line(unsigned char *prefix, unsigned char *linkname, put_link_line(unsigned char *prefix, unsigned char *linkname,
unsigned char *link, unsigned char *target, unsigned char *link, unsigned char *target,
@ -370,14 +375,10 @@ put_link_line(unsigned char *prefix, unsigned char *linkname,
format.link = join_urls(html_context->base_href, link); format.link = join_urls(html_context->base_href, link);
format.target = stracpy(target); format.target = stracpy(target);
format.style.color.foreground = format.color.clink; format.style.color.foreground = format.color.clink;
/* FIXME: linkname typically comes from get_attr_val, which /* linkname typically comes from get_attr_val, which
* has already converted it from the document charset to the * has already expanded character entity references.
* terminal charset and expanded character entity references. * Tell put_chrs not to expand them again. */
* The following put_chrs call again converts the characters format.style.attr |= AT_NO_ENTITIES;
* and expands entity references. So if we have
* <meta http-equiv="refresh" content="3; url=foo?&amp;lt" />
* then ELinks will display "foo?<" rather than "foo?&lt".
* This was mentioned in bug 213. */
put_chrs(html_context, linkname, strlen(linkname)); put_chrs(html_context, linkname, strlen(linkname));
ln_break(html_context, 1); ln_break(html_context, 1);
pop_html_element(html_context); pop_html_element(html_context);

View File

@ -1497,7 +1497,8 @@ put_chars_conv(struct html_context *html_context,
convert_string(renderer_context.convert_table, chars, charslen, convert_string(renderer_context.convert_table, chars, charslen,
html_context->options->cp, html_context->options->cp,
CSM_DEFAULT, NULL, (void (*)(void *, unsigned char *, int)) put_chars, html_context); (format.style.attr & AT_NO_ENTITIES) ? CSM_NONE : CSM_DEFAULT,
NULL, (void (*)(void *, unsigned char *, int)) put_chars, html_context);
} }
static inline void static inline void
@ -2442,6 +2443,8 @@ render_html_document(struct cache_entry *cached, struct document *document,
html_context->doc_cp = document->cp; html_context->doc_cp = document->cp;
if (title.length) { if (title.length) {
/* CSM_DEFAULT because init_html_parser() did not
* decode entities in the title. */
document->title = convert_string(renderer_context.convert_table, document->title = convert_string(renderer_context.convert_table,
title.source, title.length, title.source, title.length,
document->options.cp, document->options.cp,

View File

@ -1424,10 +1424,12 @@ get_current_link_title(struct document_view *doc_view)
convert_table = get_translation_table(doc_view->document->cp, convert_table = get_translation_table(doc_view->document->cp,
doc_view->document->options.cp); doc_view->document->options.cp);
/* CSM_NONE because any entities in the title have
* already been decoded. */
link_title = convert_string(convert_table, link->title, link_title = convert_string(convert_table, link->title,
strlen(link->title), strlen(link->title),
doc_view->document->options.cp, doc_view->document->options.cp,
CSM_DEFAULT, NULL, NULL, NULL); CSM_NONE, NULL, NULL, NULL);
/* Remove illicit chars. */ /* Remove illicit chars. */
#ifdef CONFIG_UTF8 #ifdef CONFIG_UTF8
if (link_title && !doc_view->document->options.utf8) if (link_title && !doc_view->document->options.utf8)