/* HTML core parser routines */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <ctype.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "elinks.h"
#include "document/css/apply.h"
#include "document/css/parser.h"
#include "document/html/parser/forms.h"
#include "document/html/parser/general.h"
#include "document/html/parser/link.h"
#include "document/html/parser/parse.h"
#include "document/html/parser/stack.h"
#include "document/html/parser.h"
#include "document/options.h"
#include "intl/charsets.h"
#include "util/conv.h"
#include "util/error.h"
#include "util/fastfind.h"
#include "util/memdebug.h"
#include "util/memory.h"
#include "util/string.h"
/* Unsafe macros */
#include "document/html/internal.h"
/* True when @c terminates a tag, or starts the next one. */
#define end_of_tag(c) ((c) == '>' || (c) == '<')

/* Tell whether @c may be part of an attribute name or unquoted token: any
 * ASCII character above space and below DEL (127), except '=', '<' and '>'.
 * Returns 1 if so, 0 otherwise. */
static inline int
atchr(register unsigned char c)
{
	/* DEL and everything non-ASCII never qualifies. */
	if (c >= 127) return 0;

	/* Everything above '>' is fine without looking any closer. */
	if (c > '>') return 1;

	return c > ' ' && c != '=' && !end_of_tag(c);
}
/* Scan over one HTML element (tag) and report where its parts are. Nothing
 * is copied or modified; all results are pointers into the scanned buffer. */
/* - e is a pointer to the beginning of the element (*e must be '<')
 * - eof is a pointer to the end of the scanned area
 * - name, when non-NULL, receives the start of the element name; for a
 *   closing tag it points at the leading '/'
 * - namelen, when both name and namelen are non-NULL, receives the length of
 *   the name (counting that leading '/')
 * - attr, when non-NULL, receives the position where the first attribute
 *   would start
 * - end, when non-NULL, receives the first character behind the element:
 *   just past the closing '>', or the '<' itself when the tag was cut short
 *   by the start of another tag */
/* It returns -1 when it failed (values stored through the pointers are then
 * invalid) and 0 for success. It fails when there is no '<' at e, when no
 * valid element name follows, or when eof is reached before the element
 * ends. */
int
parse_element(register unsigned char *e, unsigned char *eof,
unsigned char **name, int *namelen,
unsigned char **attr, unsigned char **end)
{
/* Step to the next character; reaching eof aborts the whole parse with -1.
 * Beware that this expands to a bare `if' statement with its own semicolon. */
#define next_char() if (++e == eof) return -1;
assert(e && eof);
if (e >= eof || *e != '<') return -1;
next_char();
/* The name is recorded before the optional '/' of a closing tag is
 * skipped, so callers see the slash as the first character of the name. */
if (name) *name = e;
if (*e == '/') next_char();
if (!isident(*e)) return -1;
while (isident(*e)) next_char();
/* The name must be followed by something that can plausibly continue or
 * close a tag. */
if (!isspace(*e) && !end_of_tag(*e) && *e != '/' && *e != ':' && *e != '=')
return -1;
if (name && namelen) *namelen = e - *name;
while (isspace(*e) || *e == '/' || *e == ':') next_char();
/* Skip bad attribute */
while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();
if (attr) *attr = e;
/* One iteration per attribute: name, optional '=', optional value. */
next_attr:
while (isspace(*e)) next_char();
/* Skip bad attribute */
while (!atchr(*e) && !end_of_tag(*e) && !isspace(*e)) next_char();
if (end_of_tag(*e)) goto end;
while (atchr(*e)) next_char();
while (isspace(*e)) next_char();
if (*e != '=') {
/* Attribute without a value. */
if (end_of_tag(*e)) goto end;
goto next_attr;
}
next_char();
while (isspace(*e)) next_char();
if (isquote(*e)) {
unsigned char quote = *e;
/* quoted_value: */
next_char();
/* Inside quotes anything goes, including '<' and '>'. */
while (*e != quote) next_char();
next_char();
/* The following apparently handles the case of a value written as two
 * adjacent quoted strings (e.g. id="a""b"), however that is very rare and
 * probably not conforming. More frequent (and mishandling it more fatal)
 * is probably the typo of a doubled closing quote (e.g. id="a"") - we can
 * handle it as long as this is commented out. --pasky */
/* if (*e == quote) goto quoted_value; */
} else {
/* Unquoted value: runs up to whitespace or the end of the tag. */
while (!isspace(*e) && !end_of_tag(*e)) next_char();
}
while (isspace(*e)) next_char();
if (!end_of_tag(*e)) goto next_attr;
end:
/* Step over a closing '>', but leave a '<' for the next element. */
if (end) *end = e + (*e == '>');
return 0;
}
#define realloc_chrs(x, l) mem_align_alloc(x, l, (l) + 1, unsigned char, 0xFF)

/* Append character @c to buffer @s holding @l characters, growing the buffer
 * when needed. On allocation failure this jumps to the parse_error label of
 * the enclosing function so that what was collected so far gets released
 * instead of being leaked by a bare return.
 * NOTE(review): this assumes mem_align_alloc() leaves the old buffer intact
 * when it fails, the way realloc() does - confirm. */
#define add_chr(s, l, c)						\
	do {								\
		if (!realloc_chrs(&(s), l)) goto parse_error;		\
		(s)[(l)++] = (c);					\
	} while (0)

/* Look up attribute @name in the attribute list starting at @e and return
 * its value.
 * - e points into a tag, at the position where attributes start
 * - name is the attribute name; it is matched case-insensitively
 * - options supplies the codepage (options->cp) used when the value contains
 *   '&' and therefore has to go through convert_string()
 * - flags: with HTML_ATTR_TEST only the presence is tested and a pointer to
 *   the attribute name inside @e is returned (not an allocated string); with
 *   HTML_ATTR_EAT_NL tabs and line feeds inside a quoted value are dropped
 *   instead of being turned into a space. Carriage returns inside a quoted
 *   value are always dropped.
 * Returns a newly allocated, NUL-terminated value (an empty string for an
 * attribute without a value), or NULL when the attribute is not present, the
 * tag is malformed, or memory ran out. Callers in this file release the
 * result with mem_free(). */
unsigned char *
get_attr_value(register unsigned char *e, unsigned char *name,
	       struct document_options *options, enum html_attr_flags flags)
{
	unsigned char *n;
	unsigned char *name_start;
	unsigned char *attr = NULL;
	int attrlen = 0;
	int found;

next_attr:
	skip_space(e);
	/* Reaching the end of the tag means the attribute is not there. */
	if (end_of_tag(*e) || !atchr(*e)) goto parse_error;
	n = name;
	name_start = e;

	while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++;
	/* Both names have to end at the same place for a match. */
	found = !*n && !atchr(*e);

	if (found && (flags & HTML_ATTR_TEST)) return name_start;

	while (atchr(*e)) e++;
	skip_space(e);
	if (*e != '=') {
		/* Attribute without a value. */
		if (found) goto found_endattr;
		goto next_attr;
	}
	e++;
	skip_space(e);

	if (found) {
		if (!isquote(*e)) {
			while (!isspace(*e) && !end_of_tag(*e)) {
				if (!*e) goto parse_error;
				add_chr(attr, attrlen, *e);
				e++;
			}
		} else {
			unsigned char quote = *e;

			/* parse_quoted_value: */
			while (*(++e) != quote) {
				if (*e == ASCII_CR) continue;
				if (!*e) goto parse_error;
				if (*e != ASCII_TAB && *e != ASCII_LF)
					add_chr(attr, attrlen, *e);
				else if (!(flags & HTML_ATTR_EAT_NL))
					add_chr(attr, attrlen, ' ');
			}
			e++;
			/* The following apparently handles the case of a
			 * value written as two adjacent quoted strings (e.g.
			 * id="a""b"), however that is very rare and probably
			 * not conforming. More frequent (and mishandling it
			 * more fatal) is probably the typo of a doubled
			 * closing quote (e.g. id="a"") - we can handle it as
			 * long as this is commented out. --pasky */
#if 0
			if (*e == quote) {
				add_chr(attr, attrlen, *e);
				goto parse_quoted_value;
			}
#endif
		}

found_endattr:
		add_chr(attr, attrlen, '\0');
		/* The terminator is not part of the value length. */
		attrlen--;

		if (/* Unused: !(flags & HTML_ATTR_NO_CONV) && */
		    memchr(attr, '&', attrlen)) {
			unsigned char *saved_attr = attr;

			attr = convert_string(NULL, saved_attr, attrlen,
					      options->cp, CSM_QUERY,
					      NULL, NULL, NULL);
			mem_free(saved_attr);
		}

		set_mem_comment(attr, name, strlen(name));
		return attr;

	} else {
		/* Not the attribute we want; just step over its value. */
		if (!isquote(*e)) {
			while (!isspace(*e) && !end_of_tag(*e)) {
				if (!*e) goto parse_error;
				e++;
			}
		} else {
			unsigned char quote = *e;

			do {
				while (*(++e) != quote)
					if (!*e) goto parse_error;
				e++;
			} while (/* See above. *e == quote */ 0);
		}
	}

	goto next_attr;

parse_error:
	mem_free_if(attr);
	return NULL;
}
#undef add_chr
/* Read attribute @name of the element whose attributes start at @a as a
 * decimal number.
 * Returns the value when it is a well-formed, non-negative integer that fits
 * in an int; returns -1 when the attribute is missing, empty, has trailing
 * garbage, is negative or is out of range. */
int
get_num(unsigned char *a, unsigned char *name, struct document_options *options)
{
	unsigned char *value = get_attr_val(a, name, options);
	char *tail;
	long parsed;
	int valid;

	if (!value) return -1;

	errno = 0;
	parsed = strtol((char *) value, &tail, 10);

	/* @tail points into @value, so judge the result before freeing it. */
	valid = !errno && *value && !*tail && parsed >= 0 && parsed <= INT_MAX;
	mem_free(value);

	return valid ? (int) parsed : -1;
}
/* Parse 'width[%],....'-like attribute @name of element @a. Only the part
 * before the first ',' is looked at; surrounding whitespace and whitespace
 * between the number and the '%' sign are ignored.
 * If @limited is set, the result is capped at the current usable width
 * (get_html_max_width()). Note that @limited must be set to be able to parse
 * percentage widths, since a percentage is taken relative to that width.
 * Plain numbers are pixel counts and get converted to characters by rounding
 * to the nearest multiple of HTML_CHAR_WIDTH. */
/* The function returns width in characters (never negative) or -1 in case
 * of error: missing or empty attribute, not a number, value not below
 * INT_MAX, or a percentage without @limited. */
int
get_width(unsigned char *a, unsigned char *name, int limited,
	  struct html_context *html_context)
{
	unsigned char *value = get_attr_val(a, name, html_context->options);
	unsigned char *str = value;
	char *tail;
	int percentage = 0;
	int len;
	long width;

	if (!value) return -1;

	/* Skip spaces at start of string if any. */
	skip_space(str);

	/* Search for end of string or ',' character (ie. in "100,200") */
	for (len = 0; str[len] && str[len] != ','; len++);

	/* Go back, and skip spaces after width if any. */
	while (len && isspace(str[len - 1])) len--;
	if (!len) { mem_free(value); return -1; } /* Nothing to parse. */

	/* Is this a percentage ? */
	if (str[len - 1] == '%') len--, percentage = 1;

	/* Skip spaces between width number and percentage if any. */
	while (len && isspace(str[len - 1])) len--;
	if (!len) { mem_free(value); return -1; } /* Nothing to parse. */

	/* Shorten the string a bit, so strtoul() will work on useful
	 * part of it. */
	str[len] = '\0';

	/* Convert to number if possible. */
	errno = 0;
	width = strtoul((char *) str, &tail, 10);

	/* @tail points into the @value string so check its position
	 * before freeing @value. */
	if (errno || *tail || width >= INT_MAX) {
		/* Not a valid number. */
		mem_free(value);
		return -1;
	}
	mem_free(value);

	/* strtoul() accepts a leading '-', which shows up here as a negative
	 * value. Those always ended up as a zero width; settle that right
	 * away so that the arithmetic below only ever sees sane operands. */
	if (width < 0)
		width = 0;

#define WIDTH_PIXELS2CHARS(width) \
	(((width) + (HTML_CHAR_WIDTH - 1) / 2) / HTML_CHAR_WIDTH)

	if (limited) {
		int maxwidth = get_html_max_width();

		if (percentage) {
			/* Value is a percentage. Anything above 100% gets
			 * capped at @maxwidth below anyway, so cap it before
			 * multiplying; an absurdly large percentage could
			 * otherwise overflow where long is only 32 bits. */
			if (width > 100)
				width = 100;
			width = width * maxwidth / 100;
		} else {
			/* Value is a number of pixels, makes an approximation. */
			width = WIDTH_PIXELS2CHARS(width);
		}

		if (width > maxwidth)
			width = maxwidth;

	} else {
		if (percentage) {
			/* No sense, we need @limited and @maxwidth for percentage. */
			return -1;
		}

		/* Value is a number of pixels, makes an approximation,
		 * no limit here */
		width = WIDTH_PIXELS2CHARS(width);
	}

#undef WIDTH_PIXELS2CHARS

	if (width < 0)
		width = 0;

	return (int) width;
}
/* Skip a comment or another <!...> / <?...> construct.
 * - html points at its opening '<' (the caller has already checked that the
 *   next character is '!' or '?')
 * - eof points to the end of the scanned area
 * Something starting with "<!--" ends at the first "--" that is followed by
 * optional further dashes, optional whitespace and a '>'. Anything else ends
 * at the first '>'.
 * Returns a pointer just behind the closing '>', or eof when the construct
 * is not terminated within the scanned area. */
unsigned char *
skip_comment(unsigned char *html, unsigned char *eof)
{
	unsigned char *pos;

	if (html + 4 > eof || html[2] != '-' || html[3] != '-') {
		/* Not a real comment; the first '>' ends it. */
		for (pos = html + 2; pos < eof; pos++)
			if (*pos == '>')
				return pos + 1;

		return eof;
	}

	pos = html + 4;
	while (pos < eof) {
		if (pos + 2 > eof || pos[0] != '-' || pos[1] != '-') {
			pos++;
			continue;
		}

		/* Found "--"; see whether it really closes the comment. */
		pos += 2;
		while (pos < eof && *pos == '-') pos++;
		while (pos < eof && isspace(*pos)) pos++;
		if (pos >= eof) break;
		if (*pos == '>') return pos + 1;
		/* It did not; keep looking from the current position. */
	}

	return eof;
}
/* Classification of the entries in the elements[] table.
 * NOTE(review): the code acting on these values is outside this excerpt;
 * the remarks below only reflect how the table uses them - confirm the
 * exact semantics against process_element(). */
enum element_type {
	/* Most container elements (B, DIV, TABLE, UL, ...). */
	ELEMENT_TYPE_NESTABLE = 0,
	/* Used for A, H1-H6, P and TT. */
	ELEMENT_TYPE_NON_NESTABLE = 1,
	/* Used for BR, HR, IMG, INPUT, META and the like. */
	ELEMENT_TYPE_NON_PAIRABLE = 2,
	/* Used for LI only. */
	ELEMENT_TYPE_LI = 3,
};
/* Describes one known HTML element; see the elements[] table. The field
 * order matters, since that table initializes the fields positionally. */
struct element_info {
/* Element name, uppercase. */
unsigned char *name;
/* Element handler. This does the relevant arguments processing and
 * formatting (by calling renderer hooks). Note that in a few cases,
 * this is just a placeholder and the element is given special care
 * in start_element() (which is also where we call these handlers). */
element_handler_T *func;
/* How many line-breaks to ensure we have before and after an element.
 * Value of 1 means the element will be on a line on its own, value
 * of 2 means that it will also have empty lines before and after.
 * Note that this does not add up - it just ensures that there is
 * at least so many linebreaks, but does not add more if that is the
 * case. Therefore, something like e.g. </pre></p> (two elements that
 * each ask for two linebreaks) will add only two
 * linebreaks, not four. */
/* In some stack killing logic, we use some weird heuristic based on
 * whether an element is block or inline. That is determined from
 * whether this attribute is zero or non-zero. */
int linebreak;
/* One of the element_type classes. */
enum element_type type;
};
/* Table of all elements the parser knows about. Each entry gives the
 * uppercase name, the handler, the linebreak count and the element type, in
 * that order. The table ends with a NULL-named sentinel entry, which
 * tags_list_next() uses to detect the end of the list. */
/* The entries are listed in alphabetical order.
 * NOTE(review): compar() suggests that the lookup used without fastfind is
 * a sorted search relying on this order - confirm before adding entries
 * anywhere but in their alphabetical place. */
static struct element_info elements[] = {
{"A", html_a, 0, ELEMENT_TYPE_NON_NESTABLE},
{"ABBR", html_italic, 0, ELEMENT_TYPE_NESTABLE },
{"ADDRESS", html_address, 2, ELEMENT_TYPE_NESTABLE },
{"APPLET", html_applet, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"B", html_bold, 0, ELEMENT_TYPE_NESTABLE },
{"BASE", html_base, 0, ELEMENT_TYPE_NON_PAIRABLE},
{"BASEFONT", html_font, 0, ELEMENT_TYPE_NON_PAIRABLE},
{"BLOCKQUOTE", html_blockquote, 2, ELEMENT_TYPE_NESTABLE },
{"BODY", html_body, 0, ELEMENT_TYPE_NESTABLE },
{"BR", html_br, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"BUTTON", html_button, 0, ELEMENT_TYPE_NESTABLE },
{"CAPTION", html_center, 1, ELEMENT_TYPE_NESTABLE },
{"CENTER", html_center, 1, ELEMENT_TYPE_NESTABLE },
{"CODE", html_fixed, 0, ELEMENT_TYPE_NESTABLE },
{"DD", html_dd, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"DFN", html_bold, 0, ELEMENT_TYPE_NESTABLE },
{"DIR", html_ul, 2, ELEMENT_TYPE_NESTABLE },
{"DIV", html_linebrk, 1, ELEMENT_TYPE_NESTABLE },
{"DL", html_dl, 2, ELEMENT_TYPE_NESTABLE },
{"DT", html_dt, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"EM", html_italic, 0, ELEMENT_TYPE_NESTABLE },
{"EMBED", html_embed, 0, ELEMENT_TYPE_NON_PAIRABLE},
{"FIXED", html_fixed, 0, ELEMENT_TYPE_NESTABLE },
{"FONT", html_font, 0, ELEMENT_TYPE_NESTABLE },
{"FORM", html_form, 1, ELEMENT_TYPE_NESTABLE },
{"FRAME", html_frame, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"FRAMESET", html_frameset, 1, ELEMENT_TYPE_NESTABLE },
{"H1", html_h1, 2, ELEMENT_TYPE_NON_NESTABLE},
{"H2", html_h2, 2, ELEMENT_TYPE_NON_NESTABLE},
{"H3", html_h3, 2, ELEMENT_TYPE_NON_NESTABLE},
{"H4", html_h4, 2, ELEMENT_TYPE_NON_NESTABLE},
{"H5", html_h5, 2, ELEMENT_TYPE_NON_NESTABLE},
{"H6", html_h6, 2, ELEMENT_TYPE_NON_NESTABLE},
{"HEAD", html_head, 0, ELEMENT_TYPE_NESTABLE },
{"HR", html_hr, 2, ELEMENT_TYPE_NON_PAIRABLE},
{"HTML", html_html, 0, ELEMENT_TYPE_NESTABLE },
{"I", html_italic, 0, ELEMENT_TYPE_NESTABLE },
{"IFRAME", html_iframe, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"IMG", html_img, 0, ELEMENT_TYPE_NON_PAIRABLE},
{"INPUT", html_input, 0, ELEMENT_TYPE_NON_PAIRABLE},
{"LI", html_li, 1, ELEMENT_TYPE_LI },
{"LINK", html_link, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"LISTING", html_pre, 2, ELEMENT_TYPE_NESTABLE },
{"MENU", html_ul, 2, ELEMENT_TYPE_NESTABLE },
{"META", html_meta, 0, ELEMENT_TYPE_NON_PAIRABLE},
{"NOFRAMES", html_noframes, 0, ELEMENT_TYPE_NESTABLE },
{"NOSCRIPT", html_noscript, 0, ELEMENT_TYPE_NESTABLE },
{"OBJECT", html_object, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"OL", html_ol, 2, ELEMENT_TYPE_NESTABLE },
{"OPTION", html_option, 1, ELEMENT_TYPE_NON_PAIRABLE},
{"P", html_p, 2, ELEMENT_TYPE_NON_NESTABLE},
{"PRE", html_pre, 2, ELEMENT_TYPE_NESTABLE },
{"Q", html_italic, 0, ELEMENT_TYPE_NESTABLE },
{"S", html_underline, 0, ELEMENT_TYPE_NESTABLE },
{"SCRIPT", html_script, 0, ELEMENT_TYPE_NESTABLE },
{"SELECT", html_select, 0, ELEMENT_TYPE_NESTABLE },
{"SPAN", html_span, 0, ELEMENT_TYPE_NESTABLE },
{"STRIKE", html_underline, 0, ELEMENT_TYPE_NESTABLE },
{"STRONG", html_bold, 0, ELEMENT_TYPE_NESTABLE },
{"STYLE", html_style, 0, ELEMENT_TYPE_NESTABLE },
{"SUB", html_subscript, 0, ELEMENT_TYPE_NESTABLE },
{"SUP", html_superscript, 0, ELEMENT_TYPE_NESTABLE },
{"TABLE", html_table, 2, ELEMENT_TYPE_NESTABLE },
{"TD", html_td, 0, ELEMENT_TYPE_NESTABLE },
{"TEXTAREA", html_textarea, 0, ELEMENT_TYPE_NON_PAIRABLE},
{"TH", html_th, 0, ELEMENT_TYPE_NESTABLE },
{"TITLE", html_title, 0, ELEMENT_TYPE_NESTABLE },
{"TR", html_tr, 1, ELEMENT_TYPE_NESTABLE },
{"TT", html_tt, 0, ELEMENT_TYPE_NON_NESTABLE},
{"U", html_underline, 0, ELEMENT_TYPE_NESTABLE },
{"UL", html_ul, 2, ELEMENT_TYPE_NESTABLE },
{"XMP", html_xmp, 2, ELEMENT_TYPE_NESTABLE },
{NULL, NULL, 0, ELEMENT_TYPE_NESTABLE },
};
/* Number of real entries in elements[], not counting the NULL sentinel. */
#define NUMBER_OF_TAGS (sizeof_array(elements) - 1)
#ifndef USE_FASTFIND
static int
compar(const void *a, const void *b)
{
return strcasecmp(((struct element_info *) a)->name,
((struct element_info *) b)->name);
}
#else
/* Cursor into elements[] shared by tags_list_reset() and tags_list_next(). */
static struct element_info *internal_pointer;

/* Rewind the cursor to the first entry of elements[]. */
static void
tags_list_reset(void)
{
	internal_pointer = &elements[0];
}
/* Returns a pointer to a struct that contains
* current key and data pointers and increment
* internal pointer.
* It returns NULL when key is NULL. */
static struct fastfind_key_value *
tags_list_next(void)
{
static struct fastfind_key_value kv;
if (!internal_pointer->name) return NULL;
kv.key = internal_pointer->name;
kv.data = internal_pointer;
internal_pointer++;
return &kv;
}
/* Fastfind index over the element names. Its keys come from the
 * tags_list_reset() / tags_list_next() callbacks; it is built in
 * init_tags_lookup() and released in free_tags_lookup(). */
static struct fastfind_index ff_tags_index
= INIT_FASTFIND_INDEX("tags_lookup", tags_list_reset, tags_list_next);
#endif /* USE_FASTFIND */
/* Prepare the element name lookup. With USE_FASTFIND this builds the
 * compressed fastfind index over elements[]; otherwise there is nothing to
 * set up and the function does nothing. */
void
init_tags_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_index(&ff_tags_index, FF_COMPRESS);
#endif
}
/* Counterpart of init_tags_lookup(): with USE_FASTFIND this releases the
 * fastfind index; otherwise the function does nothing. */
void
free_tags_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_tags_index);
#endif
}
/* Forward declaration. parse_html() below hands every successfully parsed
 * tag to this function and resumes parsing at the pointer it returns. */
static unsigned char *process_element(unsigned char *name, int namelen, int endingtag,
unsigned char *html, unsigned char *prev_html,
unsigned char *eof, unsigned char *attr,
struct html_context *html_context);
/* Parse the HTML source between @html and @eof and feed it to the renderer.
 * - html points to the start of the source to parse
 * - eof points just behind its last character; nothing at or behind @eof is
 *   ever looked at
 * - part is stored in @html_context as the part being rendered
 * - head, when non-NULL, is passed to process_head() before parsing starts
 * - html_context is the parser state; its spacing, position and was_*
 *   fields are reset on entry and partly again on exit
 *
 * Text is passed on through put_chrs() in runs that are as long as possible;
 * @base_pos marks the start of the run collected so far and @noupdate tells
 * the next iteration to keep extending that run rather than to start a new
 * one. Outside preformatted text runs of whitespace collapse to one space.
 * In preformatted text tabs are expanded to the next multiple of eight
 * columns and line ends become explicit line breaks. Control characters are
 * shown as dots. Comments and other <!...> / <?...> constructs are skipped
 * and tags are handed to process_element(). */
void
parse_html(unsigned char *html, unsigned char *eof,
	   struct part *part, unsigned char *head,
	   struct html_context *html_context)
{
	unsigned char *base_pos = html;
	int noupdate = 0;

	html_context->putsp = HTML_SPACE_SUPPRESS;
	html_context->line_breax = html_context->table_level ? 2 : 1;
	html_context->position = 0;
	html_context->was_br = 0;
	html_context->was_li = 0;
	html_context->was_body = 0;
/*	html_context->was_body_background = 0; */
	html_context->part = part;
	html_context->eoff = eof;
	if (head) process_head(html_context, head);

main_loop:
	while (html < eof) {
		unsigned char *name, *attr, *end;
		int namelen, endingtag;
		int dotcounter = 0;

		if (!noupdate) {
			html_context->part = part;
			html_context->eoff = eof;
			base_pos = html;
		} else {
			noupdate = 0;
		}

		if (isspace(*html) && !html_is_preformatted()) {
			unsigned char *h = html;

			while (h < eof && isspace(*h))
				h++;
			/* Whitespace right in front of a closing tag is not
			 * output here; it is only remembered in putsp. */
			if (h + 1 < eof && h[0] == '<' && h[1] == '/') {
				if (!parse_element(h, eof, &name, &namelen, &attr, &end)) {
					put_chrs(html_context, base_pos, html - base_pos);
					base_pos = html = h;
					html_context->putsp = HTML_SPACE_ADD;
					goto element;
				}
			}
			html++;
			if (!(html_context->position + (html - base_pos - 1)))
				goto skip_w; /* ??? */
			if (*(html - 1) == ' ') { /* Do not replace with isspace() ! --Zas */
				/* BIG performance win; not sure if it doesn't cause any bug */
				if (html < eof && !isspace(*html)) {
					noupdate = 1;
					continue;
				}
				put_chrs(html_context, base_pos, html - base_pos);
			} else {
				/* Some other whitespace character; output a
				 * plain space in its place. */
				put_chrs(html_context, base_pos, html - base_pos - 1);
				put_chrs(html_context, " ", 1);
			}

skip_w:
			while (html < eof && isspace(*html))
				html++;
			continue;
		}

		if (html_is_preformatted()) {
			html_context->putsp = HTML_SPACE_NORMAL;
			if (*html == ASCII_TAB) {
				put_chrs(html_context, base_pos, html - base_pos);
				/* Pad to the next multiple of eight columns.
				 * Up to eight characters are taken from the
				 * literal, so it has to hold eight spaces. */
				put_chrs(html_context, "        ",
					 8 - (html_context->position % 8));
				html++;
				continue;

			} else if (*html == ASCII_CR || *html == ASCII_LF) {
				put_chrs(html_context, base_pos, html - base_pos);
				if (html - base_pos == 0 && html_context->line_breax > 0)
					html_context->line_breax--;
next_break:
				/* CR LF counts as a single line end. */
				if (*html == ASCII_CR && html < eof - 1
				    && html[1] == ASCII_LF)
					html++;
				ln_break(html_context, 1);
				html++;
				/* Do not peek at the character behind the
				 * end of the source. */
				if (html < eof
				    && (*html == ASCII_CR || *html == ASCII_LF)) {
					html_context->line_breax = 0;
					goto next_break;
				}
				continue;

			} else if (html + 5 < eof && *html == '&') {
				/* Really nasty hack to make &#13; handling in
				 * <pre>-tags lynx-compatible. It works around
				 * the entity handling done in the renderer,
				 * since checking #13 value there would require
				 * something along the lines of NBSP_CHAR or
				 * checking for '\n's in AT_PREFORMATTED text. */
				/* See bug 52 and 387 for more info. */
				int length = html - base_pos;
				int newlines = 0;

				/* Both "&#13;" and "&#x0a;" are counted. */
				while ((html + 5 < eof && html[0] == '&' && html[1] == '#')
				       && (!memcmp(html + 2, "13;", 3)
					   || (html + 6 < eof && !strncasecmp(html + 2, "x0a;", 4)))) {
					newlines++;
					html += 5 + (html[4] != ';');
				}

				if (newlines) {
					put_chrs(html_context, base_pos, length);
					ln_break(html_context, newlines);
					continue;
				}
			}
		}

		/* Show each control character as a dot. */
		while (*html < ' ') {
			if (html - base_pos)
				put_chrs(html_context, base_pos, html - base_pos);

			dotcounter++;
			base_pos = ++html;
			/* The end of the source is tested first, so that
			 * *html is only read while it is still inside. */
			if (html >= eof || *html >= ' ' || isspace(*html)) {
				unsigned char *dots = fmem_alloc(dotcounter);

				if (dots) {
					memset(dots, '.', dotcounter);
					put_chrs(html_context, dots, dotcounter);
					fmem_free(dots);
				}
				goto main_loop;
			}
		}

		if (html + 2 <= eof && html[0] == '<' && (html[1] == '!' || html[1] == '?')
		    && !(html_context->was_xmp || html_context->was_style)) {
			put_chrs(html_context, base_pos, html - base_pos);
			html = skip_comment(html, eof);
			continue;
		}

		if (*html != '<' || parse_element(html, eof, &name, &namelen, &attr, &end)) {
			/* Ordinary text, or a '<' that does not start a
			 * well-formed tag; just extend the current run. */
			html++;
			noupdate = 1;
			continue;
		}

element:
		/* Split the leading '/' of a closing tag off the name. */
		endingtag = *name == '/'; name += endingtag; namelen -= endingtag;
		if (!endingtag && html_context->putsp == HTML_SPACE_ADD && !html_top.invisible)
			put_chrs(html_context, " ", 1);
		put_chrs(html_context, base_pos, html - base_pos);
		if (!html_is_preformatted() && !endingtag && html_context->putsp == HTML_SPACE_NORMAL) {
			unsigned char *ee = end;
			unsigned char *nm;

			/* Look behind the opening tags that follow directly.
			 * Unless a closing tag comes first, whitespace found
			 * there is output as a space right away. */
			while (!parse_element(ee, eof, &nm, NULL, NULL, &ee))
				if (*nm == '/')
					goto ng;
			if (ee < eof && isspace(*ee)) {
				put_chrs(html_context, " ", 1);
			}
ng:;
		}

		html = process_element(name, namelen, endingtag, end, html, eof, attr, html_context);
	}

	/* Flush the run that was still being collected. */
	if (noupdate) put_chrs(html_context, base_pos, html - base_pos);
	ln_break(html_context, 1);
	/* Restore the part in case the html_context was trashed in the last
	 * iteration so that when destroying the stack in the caller we still
	 * get the right part pointer. */
	html_context->part = part;
	html_context->putsp = HTML_SPACE_SUPPRESS;
	html_context->position = 0;
	html_context->was_br = 0;
}
static unsigned char *
start_element(struct element_info *ei,
unsigned char *name, int namelen,
unsigned char *html,
unsigned char *eof, unsigned char *attr,
struct html_context *html_context)
{
#define ELEMENT_RENDER_PROLOGUE \
ln_break(html_context, ei->linebreak); \
a = get_attr_val(attr, "id", html_context->options); \
if (a) { \
html_context->special_f(html_context, SP_TAG, a); \
mem_free(a); \
}
unsigned char *a;
struct par_attrib old_format;
int restore_format;
#ifdef CONFIG_CSS
struct css_selector *selector = NULL;
#endif
if (html_top.type == ELEMENT_WEAK) {
kill_html_stack_item(html_context, &html_top);
}
/* We try to process nested