diff --git a/src/Makefile b/src/Makefile index 729dbd866..d5d672c54 100644 --- a/src/Makefile +++ b/src/Makefile @@ -7,6 +7,7 @@ SUBDIRS-$(CONFIG_FORMHIST) += formhist SUBDIRS-$(CONFIG_GLOBHIST) += globhist SUBDIRS-$(CONFIG_ECMASCRIPT) += ecmascript SUBDIRS-$(CONFIG_SCRIPTING) += scripting +SUBDIRS-$(CONFIG_DOM) += dom SUBDIRS = \ bfu \ diff --git a/src/document/Makefile b/src/document/Makefile index 5a3166312..8014c2519 100644 --- a/src/document/Makefile +++ b/src/document/Makefile @@ -2,7 +2,7 @@ top_builddir=../.. include $(top_builddir)/Makefile.config SUBDIRS-$(CONFIG_CSS) += css -SUBDIRS-$(CONFIG_DOM) += dom sgml +SUBDIRS-$(CONFIG_DOM) += dom SUBDIRS = html plain diff --git a/src/document/dom/Makefile b/src/document/dom/Makefile index db8415726..21f56d72c 100644 --- a/src/document/dom/Makefile +++ b/src/document/dom/Makefile @@ -1,6 +1,6 @@ top_builddir=../../.. include $(top_builddir)/Makefile.config -OBJS = node.o renderer.o select.o stack.o +OBJS = renderer.o include $(top_srcdir)/Makefile.lib diff --git a/src/document/dom/renderer.c b/src/document/dom/renderer.c index 015effa5b..93934917d 100644 --- a/src/document/dom/renderer.c +++ b/src/document/dom/renderer.c @@ -20,11 +20,12 @@ #include "document/css/stylesheet.h" #include "document/docdata.h" #include "document/document.h" -#include "document/dom/node.h" #include "document/dom/renderer.h" -#include "document/dom/stack.h" #include "document/renderer.h" -#include "document/sgml/parser.h" +#include "dom/scanner.h" +#include "dom/sgml/parser.h" +#include "dom/node.h" +#include "dom/stack.h" #include "intl/charsets.h" #include "globhist/globhist.h" /* get_global_history_item() */ #include "protocol/uri.h" @@ -32,7 +33,6 @@ #include "util/box.h" #include "util/error.h" #include "util/memory.h" -#include "util/scanner.h" #include "util/snprintf.h" #include "util/string.h" @@ -506,9 +506,9 @@ render_dom_element_end_source(struct dom_stack *stack, struct dom_node *node, vo struct dom_renderer *renderer = 
stack->current->data; struct dom_stack_state *state = get_dom_stack_top(stack); struct sgml_parser_state *pstate = get_dom_stack_state_data(stack->contexts[0], state); - struct scanner_token *token = &pstate->end_token; - unsigned char *string = token->string; - int length = token->length; + struct dom_scanner_token *token = &pstate->end_token; + unsigned char *string = token->string.string; + int length = token->string.length; assert(node && renderer && renderer->document); @@ -670,6 +670,10 @@ render_dom_document(struct cache_entry *cached, struct document *document, struct conv_table *convert_table; struct sgml_parser *parser; enum sgml_document_type doctype; + unsigned char *string = struri(cached->uri); + size_t length = strlen(string); + struct dom_string uri = INIT_DOM_STRING(string, length); + struct dom_string source = INIT_DOM_STRING(buffer->source, buffer->length); assert(document->options.plain); @@ -689,14 +693,14 @@ render_dom_document(struct cache_entry *cached, struct document *document, else doctype = SGML_DOCTYPE_HTML; - parser = init_sgml_parser(SGML_PARSER_STREAM, doctype, cached->uri); + parser = init_sgml_parser(SGML_PARSER_STREAM, doctype, &uri); if (!parser) return; add_dom_stack_context(&parser->stack, &renderer, &dom_source_renderer_context_info); add_dom_stack_tracer(&parser->stack); - root = parse_sgml(parser, buffer); + root = parse_sgml(parser, &source); if (root) { assert(parser->stack.depth == 1); diff --git a/src/document/sgml/.vimrc b/src/document/sgml/.vimrc deleted file mode 100644 index 18f65e474..000000000 --- a/src/document/sgml/.vimrc +++ /dev/null @@ -1,2 +0,0 @@ -:set runtimepath+=. -:runtime ../../../.vimrc diff --git a/src/document/sgml/html/.vimrc b/src/document/sgml/html/.vimrc deleted file mode 100644 index 18f65e474..000000000 --- a/src/document/sgml/html/.vimrc +++ /dev/null @@ -1,2 +0,0 @@ -:set runtimepath+=. 
-:runtime ../../../.vimrc diff --git a/src/dom/Makefile b/src/dom/Makefile new file mode 100644 index 000000000..35a6b3929 --- /dev/null +++ b/src/dom/Makefile @@ -0,0 +1,9 @@ +top_builddir=../.. +include $(top_builddir)/Makefile.config + +SUBDIRS = css sgml +OBJS = node.o select.o stack.o scanner.o + +SUBDIRS-$(CONFIG_DEBUG) += test + +include $(top_srcdir)/Makefile.lib diff --git a/src/dom/css/Makefile b/src/dom/css/Makefile new file mode 100644 index 000000000..b45c048e2 --- /dev/null +++ b/src/dom/css/Makefile @@ -0,0 +1,6 @@ +top_builddir=../../.. +include $(top_builddir)/Makefile.config + +OBJS = scanner.o + +include $(top_srcdir)/Makefile.lib diff --git a/src/dom/css/scanner.c b/src/dom/css/scanner.c new file mode 100644 index 000000000..d745b5535 --- /dev/null +++ b/src/dom/css/scanner.c @@ -0,0 +1,385 @@ +/* CSS token scanner utilities */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "elinks.h" + +#include "dom/css/scanner.h" +#include "dom/scanner.h" +#include "util/error.h" +#include "util/string.h" + + +/* Bitmap entries for the CSS character groups used in the scanner table */ + +enum css_char_group { + CSS_CHAR_ALPHA = (1 << 0), + CSS_CHAR_DIGIT = (1 << 1), + CSS_CHAR_HEX_DIGIT = (1 << 2), + CSS_CHAR_IDENT = (1 << 3), + CSS_CHAR_IDENT_START = (1 << 4), + CSS_CHAR_NEWLINE = (1 << 5), + CSS_CHAR_NON_ASCII = (1 << 6), + CSS_CHAR_SGML_MARKUP = (1 << 7), + CSS_CHAR_TOKEN = (1 << 8), + CSS_CHAR_TOKEN_START = (1 << 9), + CSS_CHAR_WHITESPACE = (1 << 10), +}; + +static const struct dom_scan_table_info css_scan_table_info[] = { + DOM_SCAN_TABLE_RANGE("0", '9', CSS_CHAR_DIGIT | CSS_CHAR_HEX_DIGIT | CSS_CHAR_IDENT), + DOM_SCAN_TABLE_RANGE("A", 'F', CSS_CHAR_HEX_DIGIT), + DOM_SCAN_TABLE_RANGE("A", 'Z', CSS_CHAR_ALPHA | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START), + DOM_SCAN_TABLE_RANGE("a", 'f', CSS_CHAR_HEX_DIGIT), + DOM_SCAN_TABLE_RANGE("a", 'z', CSS_CHAR_ALPHA | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START), + /* For the octal 
number impaired (me including) \241 is 161 --jonas */ + DOM_SCAN_TABLE_RANGE("\241", 255, CSS_CHAR_NON_ASCII | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START), + + DOM_SCAN_TABLE_STRING(" \f\n\r\t\v\000", CSS_CHAR_WHITESPACE), + DOM_SCAN_TABLE_STRING("\f\n\r", CSS_CHAR_NEWLINE), + DOM_SCAN_TABLE_STRING("-", CSS_CHAR_IDENT), + DOM_SCAN_TABLE_STRING(".#@!\"'<-/|^$*", CSS_CHAR_TOKEN_START), + /* Unicode escape (that we do not handle yet) + other special chars */ + DOM_SCAN_TABLE_STRING("\\_", CSS_CHAR_IDENT | CSS_CHAR_IDENT_START), + /* This should contain mostly used char tokens like ':' and maybe a few + * garbage chars that people might put in their CSS code */ + DOM_SCAN_TABLE_STRING("[({})];:,.>+~", CSS_CHAR_TOKEN), + DOM_SCAN_TABLE_STRING("<!->", CSS_CHAR_SGML_MARKUP), /* Must not be empty: init_dom_scanner_info() asserts a non-empty string, and the SGML skipping in scan_css_token() needs '!', '-' and '>' in this group. NOTE(review): exact set reconstructed from those checks -- confirm. */ + + DOM_SCAN_TABLE_END, +}; + +static const struct dom_scanner_string_mapping css_string_mappings[] = { + { "Hz", CSS_TOKEN_FREQUENCY, CSS_TOKEN_DIMENSION }, + { "cm", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, + { "deg", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION }, + { "em", CSS_TOKEN_EM, CSS_TOKEN_DIMENSION }, + { "ex", CSS_TOKEN_EX, CSS_TOKEN_DIMENSION }, + { "grad", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION }, + { "in", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, + { "kHz", CSS_TOKEN_FREQUENCY, CSS_TOKEN_DIMENSION }, + { "mm", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, + { "ms", CSS_TOKEN_TIME, CSS_TOKEN_DIMENSION }, + { "pc", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, + { "pt", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, + { "px", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, + { "rad", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION }, + { "s", CSS_TOKEN_TIME, CSS_TOKEN_DIMENSION }, + + { "rgb", CSS_TOKEN_RGB, CSS_TOKEN_FUNCTION }, + { "url", CSS_TOKEN_URL, CSS_TOKEN_FUNCTION }, + + { "charset", CSS_TOKEN_AT_CHARSET, CSS_TOKEN_AT_KEYWORD }, + { "font-face", CSS_TOKEN_AT_FONT_FACE, CSS_TOKEN_AT_KEYWORD }, + { "import", CSS_TOKEN_AT_IMPORT, CSS_TOKEN_AT_KEYWORD }, + { "media", CSS_TOKEN_AT_MEDIA, CSS_TOKEN_AT_KEYWORD }, + { "page",
CSS_TOKEN_AT_PAGE, CSS_TOKEN_AT_KEYWORD }, + + { NULL, CSS_TOKEN_NONE, CSS_TOKEN_NONE }, +}; + +static struct dom_scanner_token *scan_css_tokens(struct dom_scanner *scanner); + +struct dom_scanner_info dom_css_scanner_info = { + css_string_mappings, + css_scan_table_info, + scan_css_tokens, +}; + +#define check_css_table(c, bit) (dom_css_scanner_info.scan_table[(c)] & (bit)) + +#define scan_css(scanner, s, bit) \ + while ((s) < (scanner)->end && check_css_table(*(s), bit)) (s)++; + +#define scan_back_css(scanner, s, bit) \ + while ((s) >= (scanner)->string && check_css_table(*(s), bit)) (s)--; + +#define is_css_ident_start(c) check_css_table(c, CSS_CHAR_IDENT_START) +#define is_css_ident(c) check_css_table(c, CSS_CHAR_IDENT) +#define is_css_digit(c) check_css_table(c, CSS_CHAR_DIGIT) +#define is_css_hexdigit(c) check_css_table(c, CSS_CHAR_HEX_DIGIT) +#define is_css_char_token(c) check_css_table(c, CSS_CHAR_TOKEN) +#define is_css_token_start(c) check_css_table(c, CSS_CHAR_TOKEN_START) + + +#define skip_css(scanner, s, skipto) /* Advance s until skipto, a higher-precedence char or the scanner end is met; quoted strings are jumped over as a unit. */ \ + while ((s) < (scanner)->end \ + && *(s) != (skipto) \ + && check_css_precedence(*(s), skipto)) { \ + if (isquote(*(s))) { \ + int size = (scanner)->end - (s) - 1; /* bytes after the opening quote; searching end - s bytes would read one past the end */ \ + unsigned char *end = memchr((s) + 1, *(s), size); \ + \ + if (end) (s) = end; \ + } \ + (s)++; \ + } + + +static inline void +scan_css_token(struct dom_scanner *scanner, struct dom_scanner_token *token) +{ + unsigned char *string = scanner->position; + unsigned char first_char = *string; + enum css_token_type type = CSS_TOKEN_GARBAGE; + int real_length = -1; + + assert(first_char); + token->string.string = string++; + + if (is_css_char_token(first_char)) { + type = first_char; + + } else if (is_css_digit(first_char) || first_char == '.') { + scan_css(scanner, string, CSS_CHAR_DIGIT); + + /* First scan the full number token */ + if (*string == '.') { + string++; + + if (is_css_digit(*string)) { + type = CSS_TOKEN_NUMBER; + scan_css(scanner, string, CSS_CHAR_DIGIT); + } + } + + /* Check
what kind of number we have */ + if (*string == '%') { + if (first_char != '.') + type = CSS_TOKEN_PERCENTAGE; + string++; + + } else if (!is_css_ident_start(*string)) { + type = CSS_TOKEN_NUMBER; + + } else { + unsigned char *ident = string; + + scan_css(scanner, string, CSS_CHAR_IDENT); + type = map_dom_scanner_string(scanner, ident, string, + CSS_TOKEN_DIMENSION); + } + + } else if (is_css_ident_start(first_char)) { + scan_css(scanner, string, CSS_CHAR_IDENT); + + if (*string == '(') { + unsigned char *function_end = string + 1; + + /* Make sure that we have an ending ')' */ + skip_css(scanner, function_end, ')'); + if (*function_end == ')') { + type = map_dom_scanner_string(scanner, token->string.string, + string, CSS_TOKEN_FUNCTION); + + /* If it is not a known function just skip the + * how arg stuff so we don't end up generating + * a lot of useless tokens. */ + if (type == CSS_TOKEN_FUNCTION) { + string = function_end; + + } else if (type == CSS_TOKEN_URL) { + /* Extracting the URL first removes any + * leading or ending whitespace and + * then see if the url is given in a + * string. If that is the case the + * string delimiters are also trimmed. + * This is not totally correct because + * we should of course handle escape + * sequences .. but that will have to + * be fixed later. */ + unsigned char *from = string + 1; + unsigned char *to = function_end - 1; + + scan_css(scanner, from, CSS_CHAR_WHITESPACE); + scan_back_css(scanner, to, CSS_CHAR_WHITESPACE); + + if (isquote(*from)) from++; + if (isquote(*to)) to--; + + token->string.string = from; + real_length = to - from + 1; + assert(real_length >= 0); + string = function_end; + } + + assert(type != CSS_TOKEN_RGB || *string == '('); + assert(type != CSS_TOKEN_URL || *string == ')'); + assert(type != CSS_TOKEN_FUNCTION || *string == ')'); + } + + string++; + + } else { + type = CSS_TOKEN_IDENT; + } + + } else if (!is_css_token_start(first_char)) { + /* TODO: Better composing of error tokens. 
For now we just + * split them down into char tokens */ + + } else if (first_char == '#') { + /* Check whether it is hexcolor or hash token */ + if (is_css_hexdigit(*string)) { + int hexdigits; + + scan_css(scanner, string, CSS_CHAR_HEX_DIGIT); + + /* Check that the hexdigit sequence is either 3 or 6 + * chars and it isn't just start of some non-hex ident + * string. */ + hexdigits = string - token->string.string - 1; + if ((hexdigits == 3 || hexdigits == 6) + && !is_css_ident(*string)) { + type = CSS_TOKEN_HEX_COLOR; + } else { + scan_css(scanner, string, CSS_CHAR_IDENT); + type = CSS_TOKEN_HASH; + } + + } else if (is_css_ident(*string)) { + /* Not *_ident_start() because hashes are #. */ + scan_css(scanner, string, CSS_CHAR_IDENT); + type = CSS_TOKEN_HASH; + } + + } else if (first_char == '@') { + /* Compose token containing @ */ + if (is_css_ident_start(*string)) { + unsigned char *ident = string; + + /* Scan both ident start and ident */ + scan_css(scanner, string, CSS_CHAR_IDENT); + type = map_dom_scanner_string(scanner, ident, string, + CSS_TOKEN_AT_KEYWORD); + } + + } else if (first_char == '*') { + if (*string == '=') { + type = CSS_TOKEN_SELECT_CONTAINS; + string++; + } else { + type = CSS_TOKEN_IDENT; + } + + } else if (first_char == '^') { + if (*string == '=') { + type = CSS_TOKEN_SELECT_BEGIN; + string++; + } + + } else if (first_char == '$') { + if (*string == '=') { + type = CSS_TOKEN_SELECT_END; + string++; + } + + } else if (first_char == '|') { + if (*string == '=') { + type = CSS_TOKEN_SELECT_HYPHEN_LIST; + string++; + } + + } else if (first_char == '!') { + scan_css(scanner, string, CSS_CHAR_WHITESPACE); + if (!strncasecmp(string, "important", 9)) { + type = CSS_TOKEN_IMPORTANT; + string += 9; + } + + } else if (isquote(first_char)) { + /* TODO: Escaped delimiters --jonas */ + int size = scanner->end - string; + unsigned char *string_end = memchr(string, first_char, size); + + if (string_end) { + /* We don't want the delimiters in the token */ + 
token->string.string++; + real_length = string_end - token->string.string; + string = string_end + 1; + type = CSS_TOKEN_STRING; + } + + } else if (first_char == '<' || first_char == '-') { + /* Try to navigate SGML tagsoup */ + + if (*string == '/') { + /* Some kind of SGML tag end ... better bail out screaming */ + type = CSS_TOKEN_NONE; + + } else { + unsigned char *sgml = string; + + /* Skip anything looking like SGML "<!--" and "-->" + * comments + notations. */ + scan_css(scanner, sgml, CSS_CHAR_SGML_MARKUP); + + if (sgml - string >= 2 + && ((first_char == '<' && *string == '!') + || (first_char == '-' && sgml[-1] == '>'))) { + type = CSS_TOKEN_SKIP; + string = sgml; + } + } + + } else if (first_char == '/') { + /* Comments */ + if (*string == '*') { + type = CSS_TOKEN_SKIP; + + for (string++; string < scanner->end; string++) + if (*string == '*' && string + 1 < scanner->end && string[1] == '/') { /* never look past the end of the scanned source */ + string += 2; + break; + } + } + + } else { + INTERNAL("Someone forgot to put code for recognizing tokens " + "which start with '%c'.", first_char); + } + + token->type = type; + token->string.length = real_length >= 0 ? real_length : string - token->string.string; /* real_length is -1 when unset; 0 is a valid length for an empty string or url() */ + token->precedence = get_css_precedence(type); + scanner->position = string; +} + +static struct dom_scanner_token * +scan_css_tokens(struct dom_scanner *scanner) +{ + struct dom_scanner_token *table_end = scanner->table + DOM_SCANNER_TOKENS; + struct dom_scanner_token *current; + + if (!begin_dom_token_scanning(scanner)) + return get_dom_scanner_token(scanner); + + /* Scan tokens until we fill the table */ + for (current = scanner->table + scanner->tokens; + current < table_end && scanner->position < scanner->end; + current++) { + scan_css(scanner, scanner->position, CSS_CHAR_WHITESPACE); + if (scanner->position >= scanner->end) break; + + scan_css_token(scanner, current); + + /* Did some one scream for us to end the madness?
*/ + if (current->type == CSS_TOKEN_NONE) { + scanner->position = NULL; + current--; + break; + } + + /* Shall we scratch this token? */ + if (current->type == CSS_TOKEN_SKIP) { + current--; + } + } + + return end_dom_token_scanning(scanner, current); +} diff --git a/src/dom/css/scanner.h b/src/dom/css/scanner.h new file mode 100644 index 000000000..0ac45393a --- /dev/null +++ b/src/dom/css/scanner.h @@ -0,0 +1,112 @@ + +#ifndef EL__DOM_CSS_SCANNER_H +#define EL__DOM_CSS_SCANNER_H + +#include "dom/scanner.h" + +/* The various token types and what they contain. Patterns taken from + * the flex scanner declarations in the CSS 2 Specification. */ +enum css_token_type { + /* Char tokens: */ + + /* Char tokens range from 1 to 255 and have their char value as type */ + /* meaning non char tokens have values from 256 and up. */ + + /* Low level string tokens: */ + + /* {...} means char group, <...> means token */ + /* {identstart} [a-z_]|{nonascii} */ + /* {ident} [a-z0-9_-]|{nonascii} */ + /* {identstart}{ident}* */ + /* {ident}+ */ + /* [0-9]+|[0-9]*"."[0-9]+ */ + + /* Percentage is put because although it looks like being composed of + * and '%' floating point numbers are really not allowed but + * strtol() will round it down for us ;) */ + CSS_TOKEN_IDENT = 256, /* */ + CSS_TOKEN_NUMBER, /* */ + CSS_TOKEN_PERCENTAGE, /* % */ + CSS_TOKEN_STRING, /* Char sequence delimted by matching ' or " */ + + /* High level string tokens: */ + + /* The various number values; dimension being the most generic */ + CSS_TOKEN_ANGLE, /* rad, grad or deg */ + CSS_TOKEN_DIMENSION, /* */ + CSS_TOKEN_EM, /* em */ + CSS_TOKEN_EX, /* ex */ + CSS_TOKEN_FREQUENCY, /* Hz or kHz */ + CSS_TOKEN_LENGTH, /* {px,cm,mm,in,pt,pc} */ + CSS_TOKEN_TIME, /* ms or s */ + + /* XXX: CSS_TOKEN_HASH conflicts with CSS_TOKEN_HEX_COLOR. Generating + * hex color tokens has precedence and the hash token user have to + * treat CSS_TOKEN_HASH and CSS_TOKEN_HEX_COLOR alike. 
*/ + CSS_TOKEN_HASH, /* # */ + CSS_TOKEN_HEX_COLOR, /* #[0-9a-f]\{3,6} */ + + /* For all unknown functions we generate on token contain both function name + * and args so scanning/parsing is easier. Besides we already check for + * ending ')'. */ + /* For known functions where we need several args [like rgb()] we want + * to generate tokens for every arg and arg delimiter ( ',' or ')' ). + * Because url() is a bit triggy: it can contain both and some + * chars that would other wise make the scanner probably choke we also + * include the arg in that token. Besides it will make things like + * 'background' property parsing easier. */ + CSS_TOKEN_FUNCTION, /* () */ + CSS_TOKEN_RGB, /* rgb( */ + CSS_TOKEN_URL, /* url() */ + + /* @-rule symbols */ + CSS_TOKEN_AT_KEYWORD, /* @ */ + CSS_TOKEN_AT_CHARSET, /* @charset */ + CSS_TOKEN_AT_FONT_FACE, /* @font-face */ + CSS_TOKEN_AT_IMPORT, /* @import */ + CSS_TOKEN_AT_MEDIA, /* @media */ + CSS_TOKEN_AT_PAGE, /* @page */ + + CSS_TOKEN_IMPORTANT, /* !important */ + + /* TODO: Selector stuff: */ + CSS_TOKEN_SELECT_SPACE_LIST, /* ~= */ + CSS_TOKEN_SELECT_HYPHEN_LIST, /* |= */ + CSS_TOKEN_SELECT_BEGIN, /* ^= */ + CSS_TOKEN_SELECT_END, /* $= */ + CSS_TOKEN_SELECT_CONTAINS, /* *= */ + + /* Special tokens: */ + + /* A special token for unrecognized strings */ + CSS_TOKEN_GARBAGE, + + /* Token type used internally when scanning to signal that the token + * should not be recorded in the scanners token table. */ + CSS_TOKEN_SKIP, + + /* Another internal token type used both to mark unused tokens in the + * scanner table as invalid or when scanning to signal that the + * scanning should end. */ + CSS_TOKEN_NONE = 0, +}; + +extern struct dom_scanner_info dom_css_scanner_info; + +#define skip_css_tokens(scanner, type) \ + skip_dom_scanner_tokens(scanner, type, get_css_precedence(type)) + +#define get_css_precedence(token_type) \ + ((token_type) == '}' ? (1 << 10) : \ + (token_type) == '{' ? (1 << 9) : \ + (token_type) == ';' ? 
(1 << 8) : \ + (token_type) == ')' ? (1 << 7) : 0) + +/* Check whether it is safe to skip the @token when looking for @skipto. */ +static inline int +check_css_precedence(int type, int skipto) +{ + return get_css_precedence(type) < get_css_precedence(skipto); +} + +#endif diff --git a/src/document/dom/dom.h b/src/dom/dom.h similarity index 90% rename from src/document/dom/dom.h rename to src/dom/dom.h index b4367baff..5eadc5a8e 100644 --- a/src/document/dom/dom.h +++ b/src/dom/dom.h @@ -1,5 +1,5 @@ -#ifndef EL__DOCUMENT_DOM_DOM_H -#define EL__DOCUMENT_DOM_DOM_H +#ifndef EL_DOM_DOM_H +#define EL_DOM_DOM_H enum dom_exception_code { DOM_ERR_NONE = 0, diff --git a/src/document/dom/node.c b/src/dom/node.c similarity index 98% rename from src/document/dom/node.c rename to src/dom/node.c index f1da57b4f..7c19f90a7 100644 --- a/src/document/dom/node.c +++ b/src/dom/node.c @@ -9,10 +9,8 @@ #include "elinks.h" -#include "document/dom/node.h" -#include "intl/charsets.h" +#include "dom/node.h" #include "util/hash.h" -#include "util/lists.h" #include "util/memory.h" #include "util/string.h" @@ -249,7 +247,7 @@ get_dom_node_list_index(struct dom_node *parent, struct dom_node *node) struct dom_node * init_dom_node_(unsigned char *file, int line, struct dom_node *parent, enum dom_node_type type, - unsigned char *string, size_t length) + struct dom_string *string) { #ifdef DEBUG_MEMLEAK struct dom_node *node = debug_mem_calloc(file, line, 1, sizeof(*node)); @@ -261,7 +259,7 @@ init_dom_node_(unsigned char *file, int line, node->type = type; node->parent = parent; - set_dom_string(&node->string, string, length); + copy_dom_string(&node->string, string); if (parent) { struct dom_node_list **list = get_dom_node_list(parent, node); diff --git a/src/document/dom/node.h b/src/dom/node.h similarity index 91% rename from src/document/dom/node.h rename to src/dom/node.h index 550df138e..542b89301 100644 --- a/src/document/dom/node.h +++ b/src/dom/node.h @@ -1,8 +1,8 @@ -#ifndef 
EL__DOCUMENT_DOM_NODE_H -#define EL__DOCUMENT_DOM_NODE_H +#ifndef EL_DOM_NODE_H +#define EL_DOM_NODE_H -#include "document/dom/string.h" +#include "dom/string.h" #include "util/hash.h" struct dom_node_list; @@ -255,34 +255,34 @@ get_dom_node_map_entry(struct dom_node_list *node_map, struct dom_node * init_dom_node_(unsigned char *file, int line, struct dom_node *parent, enum dom_node_type type, - unsigned char *string, size_t length); -#define init_dom_node(type, string, length) init_dom_node_(__FILE__, __LINE__, NULL, type, string, length) -#define add_dom_node(parent, type, string, length) init_dom_node_(__FILE__, __LINE__, parent, type, string, length) + struct dom_string *string); +#define init_dom_node(type, string) init_dom_node_(__FILE__, __LINE__, NULL, type, string) +#define add_dom_node(parent, type, string) init_dom_node_(__FILE__, __LINE__, parent, type, string) -#define add_dom_element(parent, string, length) \ - add_dom_node(parent, DOM_NODE_ELEMENT, string, length) +#define add_dom_element(parent, string) \ + add_dom_node(parent, DOM_NODE_ELEMENT, string) static inline struct dom_node * -add_dom_attribute(struct dom_node *parent, unsigned char *string, int length, - unsigned char *value, size_t valuelen) +add_dom_attribute(struct dom_node *parent, struct dom_string *name, + struct dom_string *value) { - struct dom_node *node = add_dom_node(parent, DOM_NODE_ATTRIBUTE, string, length); + struct dom_node *node = add_dom_node(parent, DOM_NODE_ATTRIBUTE, name); if (node && value) { - set_dom_string(&node->data.attribute.value, value, valuelen); + copy_dom_string(&node->data.attribute.value, value); } return node; } static inline struct dom_node * -add_dom_proc_instruction(struct dom_node *parent, unsigned char *string, int length, - unsigned char *instruction, size_t instructionlen) +add_dom_proc_instruction(struct dom_node *parent, struct dom_string *string, + struct dom_string *instruction) { - struct dom_node *node = add_dom_node(parent, 
DOM_NODE_PROCESSING_INSTRUCTION, string, length); + struct dom_node *node = add_dom_node(parent, DOM_NODE_PROCESSING_INSTRUCTION, string); if (node && instruction) { - set_dom_string(&node->data.proc_instruction.instruction, instruction, instructionlen); + copy_dom_string(&node->data.proc_instruction.instruction, instruction); } return node; diff --git a/src/dom/scanner.c b/src/dom/scanner.c new file mode 100644 index 000000000..4ba3aad1e --- /dev/null +++ b/src/dom/scanner.c @@ -0,0 +1,172 @@ +/* A pretty generic scanner */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include + +#include "elinks.h" + +#include "dom/scanner.h" +#include "util/error.h" +#include "util/string.h" + + +int +map_dom_scanner_string(struct dom_scanner *scanner, + unsigned char *ident, unsigned char *end, int base_type) +{ + const struct dom_scanner_string_mapping *mappings = scanner->info->mappings; + int length = end - ident; + + for (; mappings->name; mappings++) { + if (mappings->base_type == base_type + && !strlcasecmp(mappings->name, -1, ident, length)) + return mappings->type; + } + + return base_type; +} + + +struct dom_scanner_token * +skip_dom_scanner_tokens(struct dom_scanner *scanner, int skipto, int precedence) +{ + struct dom_scanner_token *token = get_dom_scanner_token(scanner); + + /* Skip tokens while handling some basic precedens of special chars + * so we don't skip to long. */ + while (token) { + if (token->type == skipto + || token->precedence > precedence) + break; + token = get_next_dom_scanner_token(scanner); + } + + return (token && token->type == skipto) + ? 
get_next_dom_scanner_token(scanner) : NULL; +} + +#ifdef DEBUG_SCANNER /* NOTE(review): dom/scanner.h guards the dump_dom_scanner() prototype with DEBUG_DOM_SCANNER -- confirm which macro is intended */ +void +dump_dom_scanner(struct dom_scanner *scanner) +{ + unsigned char buffer[MAX_STR_LEN]; + struct dom_scanner_token *token = scanner->current; + struct dom_scanner_token *table_end = scanner->table + scanner->tokens; + unsigned char *srcpos = token->string.string, *bufpos = buffer; /* the token text now lives in the embedded struct dom_string */ + int src_lookahead = 50; + int token_lookahead = 4; + int srclen; + + if (!dom_scanner_has_tokens(scanner)) return; + + memset(buffer, 0, MAX_STR_LEN); + for (; token_lookahead > 0 && token < table_end; token++, token_lookahead--) { + int buflen = MAX_STR_LEN - (bufpos - buffer); + int added = snprintf(bufpos, buflen, "[%.*s] ", (int) token->string.length, token->string.string); /* "%.*s" takes an int precision */ + + bufpos += added; + } + + if (scanner->tokens > token_lookahead) { + memcpy(bufpos, "... ", 4); + bufpos += 4; + } + + srclen = strlen(srcpos); + int_upper_bound(&src_lookahead, srclen); + *bufpos++ = '['; + + /* Compress the lookahead string */ + for (; src_lookahead > 0; src_lookahead--, srcpos++, bufpos++) { + if (*srcpos == '\n' || *srcpos == '\r' || *srcpos == '\t') { + *bufpos++ = '\\'; + *bufpos = *srcpos == '\n' ? 'n' + : (*srcpos == '\r' ?
'r' : 't'); + } else { + *bufpos = *srcpos; + } + } + + if (srclen > src_lookahead) + memcpy(bufpos, "...]", 4); + else + memcpy(bufpos, "]", 2); + + errfile = scanner->file, errline = scanner->line; + elinks_wdebug("%s", buffer); +} + +struct dom_scanner_token * +get_dom_scanner_token_debug(struct dom_scanner *scanner) +{ + if (!dom_scanner_has_tokens(scanner)) return NULL; + + dump_dom_scanner(scanner); + + /* Make sure we do not return invalid tokens */ + assert(!dom_scanner_has_tokens(scanner) + || scanner->current->type != 0); + + return get_dom_scanner_token(scanner); +} +#endif + + +/* Initializers */ + +static inline void +init_dom_scanner_info(struct dom_scanner_info *scanner_info) +{ + const struct dom_scan_table_info *info = scanner_info->scan_table_info; + int *scan_table = scanner_info->scan_table; + int i; + + if (!info) return; + + for (i = 0; info[i].type != DOM_SCAN_END; i++) { + const struct dom_string *data = &info[i].data; + + if (info[i].type == DOM_SCAN_RANGE) { + int index = *data->string; + + assert(index > 0); + assert(data->length < DOM_SCAN_TABLE_SIZE); + assert(index <= data->length); + + for (; index <= data->length; index++) + scan_table[index] |= info[i].bits; + + } else { + unsigned char *string = info[i].data.string; + int pos = info[i].data.length - 1; + + assert(info[i].type == DOM_SCAN_STRING && pos >= 0); + + for (; pos >= 0; pos--) + scan_table[string[pos]] |= info[i].bits; + } + } +} + +void +init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, + struct dom_string *string) +{ + if (!scanner_info->initialized) { + init_dom_scanner_info(scanner_info); + scanner_info->initialized = 1; + } + + memset(scanner, 0, sizeof(*scanner)); + + scanner->string = string->string; + scanner->position = string->string; + scanner->end = string->string + string->length; + scanner->current = scanner->table; + scanner->info = scanner_info; + scanner->info->scan(scanner); +} diff --git a/src/dom/scanner.h 
b/src/dom/scanner.h new file mode 100644 index 000000000..fe010afc7 --- /dev/null +++ b/src/dom/scanner.h @@ -0,0 +1,249 @@ +#ifndef EL_DOM_SCANNER_H +#define EL_DOM_SCANNER_H + +#include "dom/string.h" +#include "util/error.h" + +/* Define if you want a talking scanner */ +/* #define DEBUG_DOM_SCANNER */ + +/* The {struct dom_scanner_token} describes one scanner state. There are two + * kinds of tokens: char and non-char tokens. Char tokens contains only one + * char and simply have their char value as type. They are tokens having + * special control meaning in the code, like ':', ';', '{', '}' and '*'. Non + * char tokens has one or more chars and contain stuff like number or + * indentifier strings. */ +struct dom_scanner_token { + /* The type the token */ + int type; + + /* Some precedence value */ + int precedence; + + /* The start of the token string and the token length */ + struct dom_string string; +}; + +#define skip_dom_scanner_token_char(token) \ + do { (token)->string.string++; (token)->string.length--; } while (0) + +/* The naming of these two macros is a bit odd .. we compare often with + * "static" strings (I don't have a better word) so the macro name should + * be short. 
--jonas */ + +/* Compare the string of @token with @string */ +#define dom_scanner_token_strlcasecmp(token, str, len) \ + ((token) && !strlcasecmp((token)->string.string, (token)->string.length, str, len)) + +/* Also compares the token string but using a "static" string */ +#define dom_scanner_token_contains(token, str) \ + dom_scanner_token_strlcasecmp(token, str, sizeof(str) - 1) + + +struct dom_scan_table_info { + enum { DOM_SCAN_RANGE, DOM_SCAN_STRING, DOM_SCAN_END } type; + struct dom_string data; + int bits; +}; + +#define DOM_SCAN_TABLE_SIZE 256 + +#define DOM_SCAN_TABLE_INFO(type, data1, data2, bits) \ + { (type), INIT_DOM_STRING((data1), (data2)), (bits) } + +#define DOM_SCAN_TABLE_RANGE(from, to, bits) \ + DOM_SCAN_TABLE_INFO(DOM_SCAN_RANGE, from, to, bits) + +#define DOM_SCAN_TABLE_STRING(str, bits) \ + DOM_SCAN_TABLE_INFO(DOM_SCAN_STRING, str, sizeof(str) - 1, bits) + +#define DOM_SCAN_TABLE_END \ + DOM_SCAN_TABLE_INFO(DOM_SCAN_END, NULL, 0, 0) + +struct dom_scanner_string_mapping { + unsigned char *name; + int type; + int base_type; +}; + +struct dom_scanner; + +struct dom_scanner_info { + /* Table containing how to map strings to token types */ + const struct dom_scanner_string_mapping *mappings; + + /* Information for how to initialize the scanner table */ + const struct dom_scan_table_info *scan_table_info; + + /* Fills the scanner with tokens. Already scanned tokens which have not + * been requested remain and are moved to the start of the scanners + * token table. */ + /* Returns the current token or NULL if there are none. */ + struct dom_scanner_token *(*scan)(struct dom_scanner *scanner); + + /* The scanner table */ + /* Contains bitmaps for the various characters groups. + * Idea sync'ed from mozilla browser. */ + int scan_table[DOM_SCAN_TABLE_SIZE]; + + /* Has the scanner info been initialized? */ + unsigned int initialized:1; +}; + + +/* Initializes the scanner. 
*/ +void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, + struct dom_string *string); + +/* The number of tokens in the scanners token table: + * At best it should be big enough to contain properties with space separated + * values and function calls with up to 3 variables like rgb(). At worst it + * should be no less than 2 in order to be able to peek at the next token in + * the scanner. */ +#define DOM_SCANNER_TOKENS 10 + +/* The {struct dom_scanner} describes the current state of the scanner. */ +struct dom_scanner { + /* The very start of the scanned string, the position in the string + * where to scan next and the end of the string. If position is NULL it + * means that no more tokens can be retrieved from the string. */ + unsigned char *string, *position, *end; + + /* The current token and number of scanned tokens in the table. + * If the number of scanned tokens is less than DOM_SCANNER_TOKENS it + * is because there are no more tokens in the string. */ + struct dom_scanner_token *current; + int tokens; + + /* The 'meta' scanner information */ + struct dom_scanner_info *info; + +#ifdef DEBUG_SCANNER + /* Debug info about the caller. */ + unsigned char *file; + int line; +#endif + + /* Some state indicator only meaningful to the scanner internals */ + int state; + + /* The table contain already scanned tokens. It is maintained in + * order to optimize the scanning a bit and make it possible to look + * ahead at the next token. You should always use the accessors + * (defined below) for getting tokens from the scanner. */ + struct dom_scanner_token table[DOM_SCANNER_TOKENS]; +}; + +#define dom_scanner_has_tokens(scanner) \ + ((scanner)->tokens > 0 && (scanner)->current < (scanner)->table + (scanner)->tokens) + +/* This macro checks if the current scanner state is valid. Meaning if the + * scanners table is full the last token skipping or get_next_scanner_token() + * call made it possible to get the type of the next token. 
*/
+#define check_dom_scanner(scanner) \
+	((scanner)->tokens < DOM_SCANNER_TOKENS \
+	 || (scanner)->current + 1 < (scanner)->table + (scanner)->tokens)
+
+
+/* Scanner table accessors and mutators */
+
+/* Checks the type of the next token. Evaluates to false when the scanner has
+ * no tokens or when there is no token after the current one in the table. */
+#define check_next_dom_scanner_token(scanner, token_type) \
+	(dom_scanner_has_tokens(scanner) \
+	 && ((scanner)->current + 1 < (scanner)->table + (scanner)->tokens) \
+	 && (scanner)->current[1].type == (token_type))
+
+/* Access current and next token. Getting the next token might cause
+ * a rescan so any token pointers that have been stored in a local variable
+ * might not be valid after the call.
+ * get_dom_scanner_token() returns the current token, or NULL when the scanner
+ * has no tokens. */
+static inline struct dom_scanner_token *
+get_dom_scanner_token(struct dom_scanner *scanner)
+{
+	return dom_scanner_has_tokens(scanner) ? scanner->current : NULL;
+}
+
+/* Advances to the next token. If no token follows the new current one in the
+ * table, the scanner's scan() callback is called to rescan and its result is
+ * returned; otherwise the new current token is returned. Returns NULL without
+ * advancing when the scanner has no tokens to begin with. */
+static inline struct dom_scanner_token *
+get_next_dom_scanner_token(struct dom_scanner *scanner)
+{
+	return (dom_scanner_has_tokens(scanner)
+		&& (++scanner->current + 1 >= scanner->table + scanner->tokens)
+		? scanner->info->scan(scanner) : get_dom_scanner_token(scanner));
+}
+
+/* This should just make the code more understandable .. hopefully */
+#define skip_dom_scanner_token(scanner) get_next_dom_scanner_token(scanner)
+
+/* Removes tokens from the scanner until it meets a token of the given type.
+ * This token will then also be skipped.
*/
+struct dom_scanner_token *
+skip_dom_scanner_tokens(struct dom_scanner *scanner, int skipto, int precedence);
+
+/* Looks up the string from @ident to @end in the scanner's string mapping
+ * table */
+int
+map_dom_scanner_string(struct dom_scanner *scanner,
+		       unsigned char *ident, unsigned char *end, int base_type);
+
+#ifdef DEBUG_DOM_SCANNER
+void dump_dom_scanner(struct dom_scanner *scanner);
+#endif
+
+/* The begin_dom_token_scanning() and end_dom_token_scanning() functions
+ * provide the basic setup and teardown for the rescan function made public
+ * via the scan member of the scanner info (scanner->info->scan). */
+
+/* Returns NULL if it is not necessary to try to scan for more tokens.
+ * Any tokens from scanner->current up to the end of the filled part of the
+ * table are moved to the front and the rest of the table is zeroed.
+ * If scanner->position is NULL, scanner->tokens is set to the number of
+ * moved tokens (or -1 if none were moved), scanner->current is reset to the
+ * start of the table and NULL is returned. Otherwise scanner->tokens is set
+ * to the number of moved tokens and the start of the table is returned. */
+static inline struct dom_scanner_token *
+begin_dom_token_scanning(struct dom_scanner *scanner)
+{
+	struct dom_scanner_token *table = scanner->table;
+	struct dom_scanner_token *table_end = table + scanner->tokens;
+	int move_to_front = int_max(table_end - scanner->current, 0);
+	struct dom_scanner_token *current = move_to_front ? scanner->current : table;
+	size_t moved_size = 0;
+
+	assert(scanner->current); /* NOTE(review): checked only after the initializers above already used it */
+
+	/* Move any untouched tokens (the ones from scanner->current onwards)
+	 * to the front of the table */
+	if (move_to_front) {
+		moved_size = move_to_front * sizeof(*table);
+		memmove(table, current, moved_size);
+		current = &table[move_to_front];
+	}
+
+	/* Clear all unused tokens, that is everything after the moved ones
+	 * (the whole table if nothing was moved) */
+	memset(current, 0, sizeof(*table) * DOM_SCANNER_TOKENS - moved_size);
+
+	if (!scanner->position) {
+		scanner->tokens = move_to_front ? move_to_front : -1;
+		scanner->current = table;
+		assert(check_dom_scanner(scanner));
+		return NULL;
+	}
+
+	scanner->tokens = move_to_front;
+
+	return table;
+}
+
+/* Updates the @scanner struct after scanning has been done. The position
+ * _after_ the last valid token is taken as the @end argument. */
+/* It is ok for @end to be < scanner->table since scanner->tokens will become
+ * <= 0 anyway.
*/ +static inline struct dom_scanner_token * +end_dom_token_scanning(struct dom_scanner *scanner, struct dom_scanner_token *end) +{ + assert(end <= scanner->table + DOM_SCANNER_TOKENS); + + scanner->tokens = (end - scanner->table); + scanner->current = scanner->table; + if (scanner->position >= scanner->end) + scanner->position = NULL; + + assert(check_dom_scanner(scanner)); + + return get_dom_scanner_token(scanner); +} + +#endif diff --git a/src/document/dom/select.c b/src/dom/select.c similarity index 91% rename from src/document/dom/select.c rename to src/dom/select.c index 231e9f92d..c4d7cf0c8 100644 --- a/src/document/dom/select.c +++ b/src/dom/select.c @@ -6,13 +6,13 @@ #include "elinks.h" -#include "document/css/scanner.h" -#include "document/dom/dom.h" -#include "document/dom/node.h" -#include "document/dom/select.h" -#include "document/dom/stack.h" +#include "dom/css/scanner.h" +#include "dom/dom.h" +#include "dom/node.h" +#include "dom/scanner.h" +#include "dom/select.h" +#include "dom/stack.h" #include "util/memory.h" -#include "util/scanner.h" #include "util/string.h" @@ -20,7 +20,7 @@ /* Maps the content of a scanner token to a pseudo-class or -element ID. */ static enum dom_select_pseudo -get_dom_select_pseudo(struct scanner_token *token) +get_dom_select_pseudo(struct dom_scanner_token *token) { static struct { struct dom_string string; @@ -70,13 +70,10 @@ get_dom_select_pseudo(struct scanner_token *token) #undef INIT_DOM_SELECT_PSEUDO_STRING }; - struct dom_string string; int i; - set_dom_string(&string, token->string, token->length); - for (i = 0; i < sizeof_array(pseudo_info); i++) - if (!dom_string_casecmp(&pseudo_info[i].string, &string)) + if (!dom_string_casecmp(&pseudo_info[i].string, &token->string)) return pseudo_info[i].pseudo; return DOM_SELECT_PSEUDO_UNKNOWN; @@ -84,9 +81,9 @@ get_dom_select_pseudo(struct scanner_token *token) /* Parses attribute selector. For example '[foo="bar"]' or '[foo|="boo"]'. 
*/ static enum dom_exception_code -parse_dom_select_attribute(struct dom_select_node *sel, struct scanner *scanner) +parse_dom_select_attribute(struct dom_select_node *sel, struct dom_scanner *scanner) { - struct scanner_token *token = get_scanner_token(scanner); + struct dom_scanner_token *token = get_dom_scanner_token(scanner); /* Get '['. */ @@ -95,15 +92,15 @@ parse_dom_select_attribute(struct dom_select_node *sel, struct scanner *scanner) /* Get the attribute name. */ - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (!token || token->type != CSS_TOKEN_IDENT) return DOM_ERR_SYNTAX; - set_dom_string(&sel->node.string, token->string, token->length); + copy_dom_string(&sel->node.string, &token->string); /* Get the optional '=' combo or ending ']'. */ - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (!token) return DOM_ERR_SYNTAX; switch (token->type) { @@ -137,13 +134,13 @@ parse_dom_select_attribute(struct dom_select_node *sel, struct scanner *scanner) /* Get the required value. */ - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (!token) return DOM_ERR_SYNTAX; switch (token->type) { case CSS_TOKEN_IDENT: case CSS_TOKEN_STRING: - set_dom_string(&sel->node.data.attribute.value, token->string, token->length); + copy_dom_string(&sel->node.data.attribute.value, &token->string); break; default: @@ -152,7 +149,7 @@ parse_dom_select_attribute(struct dom_select_node *sel, struct scanner *scanner) /* Get the ending ']'. */ - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (token && token->type == ']') return DOM_ERR_NONE; @@ -170,13 +167,13 @@ parse_dom_select_attribute(struct dom_select_node *sel, struct scanner *scanner) * 0n+0 */ -/* FIXME: Move somewhere else? util/scanner.h? */ +/* FIXME: Move somewhere else? dom/scanner.h? 
*/ static size_t -get_scanner_token_number(struct scanner_token *token) +get_scanner_token_number(struct dom_scanner_token *token) { size_t number = 0; - while (token->length > 0 && isdigit(token->string[0])) { + while (token->string.length > 0 && isdigit(token->string.string[0])) { size_t old_number = number; number *= 10; @@ -185,8 +182,8 @@ get_scanner_token_number(struct scanner_token *token) if (old_number > number) return -1; - number += token->string[0] - '0'; - token->string++, token->length--; + number += token->string.string[0] - '0'; + skip_dom_scanner_token_char(token); } return number; @@ -194,26 +191,26 @@ get_scanner_token_number(struct scanner_token *token) /* Parses the '(...)' part of ':nth-of-type(...)' and ':nth-child(...)'. */ static enum dom_exception_code -parse_dom_select_nth_arg(struct dom_select_nth_match *nth, struct scanner *scanner) +parse_dom_select_nth_arg(struct dom_select_nth_match *nth, struct dom_scanner *scanner) { - struct scanner_token *token = get_next_scanner_token(scanner); + struct dom_scanner_token *token = get_next_dom_scanner_token(scanner); int sign = 1; int number = -1; if (!token || token->type != '(') return DOM_ERR_SYNTAX; - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (!token) return DOM_ERR_SYNTAX; switch (token->type) { case CSS_TOKEN_IDENT: - if (scanner_token_contains(token, "even")) { + if (dom_scanner_token_contains(token, "even")) { nth->step = 2; nth->index = 0; - } else if (scanner_token_contains(token, "odd")) { + } else if (dom_scanner_token_contains(token, "odd")) { nth->step = 2; nth->index = 1; @@ -230,7 +227,7 @@ parse_dom_select_nth_arg(struct dom_select_nth_match *nth, struct scanner *scann case '-': sign = -1; - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (!token) return DOM_ERR_SYNTAX; if (token->type != CSS_TOKEN_IDENT) @@ -245,7 +242,7 @@ parse_dom_select_nth_arg(struct dom_select_nth_match *nth, struct 
scanner *scann if (number < 0) return DOM_ERR_INVALID_STATE; - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (!token) return DOM_ERR_SYNTAX; break; @@ -256,18 +253,18 @@ parse_dom_select_nth_arg(struct dom_select_nth_match *nth, struct scanner *scann /* The rest can contain n+ part */ switch (token->type) { case CSS_TOKEN_IDENT: - if (!scanner_token_contains(token, "n")) + if (!dom_scanner_token_contains(token, "n")) return DOM_ERR_SYNTAX; nth->step = sign * number; - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (!token) return DOM_ERR_SYNTAX; if (token->type != '+') break; - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (!token) return DOM_ERR_SYNTAX; if (token->type != CSS_TOKEN_NUMBER) @@ -294,15 +291,15 @@ parse_dom_select_nth_arg(struct dom_select_nth_match *nth, struct scanner *scann /* Parse a pseudo-class or -element with the syntax: ':'. */ static enum dom_exception_code parse_dom_select_pseudo(struct dom_select *select, struct dom_select_node *sel, - struct scanner *scanner) + struct dom_scanner *scanner) { - struct scanner_token *token = get_scanner_token(scanner); + struct dom_scanner_token *token = get_dom_scanner_token(scanner); enum dom_select_pseudo pseudo; enum dom_exception_code code; /* Skip double :'s in front of some pseudo's (::first-line, etc.) */ do { - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); } while (token && token->type == ':'); if (!token || token->type != CSS_TOKEN_IDENT) @@ -389,17 +386,17 @@ parse_dom_select_pseudo(struct dom_select *select, struct dom_select_node *sel, /* Parse a CSS3 selector and add selector nodes to the @select struct. 
*/ static enum dom_exception_code parse_dom_select(struct dom_select *select, struct dom_stack *stack, - unsigned char *string, int length) + struct dom_string *string) { - struct scanner scanner; + struct dom_scanner scanner; struct dom_select_node sel; - init_scanner(&scanner, &css_scanner_info, string, string + length); + init_dom_scanner(&scanner, &dom_css_scanner_info, string); memset(&sel, 0, sizeof(sel)); - while (scanner_has_tokens(&scanner)) { - struct scanner_token *token = get_scanner_token(&scanner); + while (dom_scanner_has_tokens(&scanner)) { + struct dom_scanner_token *token = get_dom_scanner_token(&scanner); enum dom_exception_code code; struct dom_select_node *select_node; @@ -416,8 +413,8 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack, switch (token->type) { case CSS_TOKEN_IDENT: sel.node.type = DOM_NODE_ELEMENT; - set_dom_string(&sel.node.string, token->string, token->length); - if (token->length == 1 && token->string[0] == '*') + copy_dom_string(&sel.node.string, &token->string); + if (dom_scanner_token_contains(token, "*")) sel.match.element |= DOM_SELECT_ELEMENT_UNIVERSAL; break; @@ -427,7 +424,7 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack, sel.node.type = DOM_NODE_ATTRIBUTE; sel.match.attribute |= DOM_SELECT_ATTRIBUTE_ID; /* Skip the leading '#'. 
*/ - token->string++, token->length--; + skip_dom_scanner_token_char(token); break; case '[': @@ -438,14 +435,14 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack, break; case '.': - token = get_next_scanner_token(&scanner); + token = get_next_dom_scanner_token(&scanner); if (!token || token->type != CSS_TOKEN_IDENT) return DOM_ERR_SYNTAX; sel.node.type = DOM_NODE_ATTRIBUTE; sel.match.attribute |= DOM_SELECT_ATTRIBUTE_SPACE_LIST; set_dom_string(&sel.node.string, "class", -1); - set_dom_string(&sel.node.data.attribute.value, token->string, token->length); + copy_dom_string(&sel.node.data.attribute.value, &token->string); break; case ':': @@ -476,7 +473,7 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack, return DOM_ERR_SYNTAX; } - skip_scanner_token(&scanner); + skip_dom_scanner_token(&scanner); if (sel.node.type == DOM_NODE_UNKNOWN) continue; @@ -523,8 +520,7 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack, /* Basically this is just a wrapper for parse_dom_select() to ease error * handling. 
*/ struct dom_select * -init_dom_select(enum dom_select_syntax syntax, - unsigned char *string, int length) +init_dom_select(enum dom_select_syntax syntax, struct dom_string *string) { struct dom_select *select = mem_calloc(1, sizeof(select)); struct dom_stack stack; @@ -532,7 +528,7 @@ init_dom_select(enum dom_select_syntax syntax, init_dom_stack(&stack, DOM_STACK_KEEP_NODES); - code = parse_dom_select(select, &stack, string, length); + code = parse_dom_select(select, &stack, string); done_dom_stack(&stack); if (code == DOM_ERR_NONE) diff --git a/src/document/dom/select.h b/src/dom/select.h similarity index 99% rename from src/document/dom/select.h rename to src/dom/select.h index 687734cd6..e794e1f5d 100644 --- a/src/document/dom/select.h +++ b/src/dom/select.h @@ -1,7 +1,7 @@ -#ifndef EL__DOCUMENT_DOM_SELECT_H -#define EL__DOCUMENT_DOM_SELECT_H +#ifndef EL_DOM_SELECT_H +#define EL_DOM_SELECT_H -#include "document/dom/node.h" +#include "dom/node.h" /* FIXME: Namespaces; *|E */ @@ -193,7 +193,7 @@ enum dom_select_syntax { }; struct dom_select *init_dom_select(enum dom_select_syntax syntax, - unsigned char *string, int length); + struct dom_string *string); void done_dom_select(struct dom_select *select); diff --git a/src/document/sgml/Makefile b/src/dom/sgml/Makefile similarity index 100% rename from src/document/sgml/Makefile rename to src/dom/sgml/Makefile diff --git a/src/document/sgml/README b/src/dom/sgml/README similarity index 100% rename from src/document/sgml/README rename to src/dom/sgml/README diff --git a/src/document/sgml/html/Makefile b/src/dom/sgml/html/Makefile similarity index 100% rename from src/document/sgml/html/Makefile rename to src/dom/sgml/html/Makefile diff --git a/src/document/sgml/html/attribute.inc b/src/dom/sgml/html/attribute.inc similarity index 100% rename from src/document/sgml/html/attribute.inc rename to src/dom/sgml/html/attribute.inc diff --git a/src/document/sgml/html/element.inc b/src/dom/sgml/html/element.inc similarity 
index 100% rename from src/document/sgml/html/element.inc rename to src/dom/sgml/html/element.inc diff --git a/src/document/sgml/html/html.c b/src/dom/sgml/html/html.c similarity index 79% rename from src/document/sgml/html/html.c rename to src/dom/sgml/html/html.c index 1f1f017e8..763bf9f1c 100644 --- a/src/document/sgml/html/html.c +++ b/src/dom/sgml/html/html.c @@ -9,8 +9,8 @@ #include "elinks.h" -#include "document/sgml/html/html.h" -#include "document/sgml/sgml.h" +#include "dom/sgml/html/html.h" +#include "dom/sgml/sgml.h" #define HTML_(node, name, id) SGML_NODE_INFO(HTML, node, name, id) @@ -20,13 +20,13 @@ static struct sgml_node_info html_attributes[HTML_ATTRIBUTES] = { SGML_NODE_HEAD(HTML, ATTRIBUTE), -#include "document/sgml/html/attribute.inc" +#include "dom/sgml/html/attribute.inc" }; static struct sgml_node_info html_elements[HTML_ELEMENTS] = { SGML_NODE_HEAD(HTML, ELEMENT), -#include "document/sgml/html/element.inc" +#include "dom/sgml/html/element.inc" }; diff --git a/src/document/sgml/html/html.h b/src/dom/sgml/html/html.h similarity index 66% rename from src/document/sgml/html/html.h rename to src/dom/sgml/html/html.h index 13234d621..78cc25dc2 100644 --- a/src/document/sgml/html/html.h +++ b/src/dom/sgml/html/html.h @@ -1,8 +1,8 @@ -#ifndef EL__DOCUMENT_SGML_HTML_HTML_H -#define EL__DOCUMENT_SGML_HTML_HTML_H +#ifndef EL_DOM_SGML_HTML_HTML_H +#define EL_DOM_SGML_HTML_HTML_H -#include "document/sgml/sgml.h" +#include "dom/sgml/sgml.h" extern struct sgml_info sgml_html_info; @@ -13,7 +13,7 @@ extern struct sgml_info sgml_html_info; enum html_element_type { HTML_ELEMENT_UNKNOWN, -#include "document/sgml/html/element.inc" +#include "dom/sgml/html/element.inc" HTML_ELEMENTS, }; @@ -21,7 +21,7 @@ enum html_element_type { enum html_attribute_type { HTML_ATTRIBUTE_UNKNOWN, -#include "document/sgml/html/attribute.inc" +#include "dom/sgml/html/attribute.inc" HTML_ATTRIBUTES, }; diff --git a/src/document/sgml/parser.c b/src/dom/sgml/parser.c similarity index 
78% rename from src/document/sgml/parser.c rename to src/dom/sgml/parser.c index 4cc8433de..46d9a1245 100644 --- a/src/document/sgml/parser.c +++ b/src/dom/sgml/parser.c @@ -9,20 +9,18 @@ #include "elinks.h" -#include "document/dom/node.h" -#include "document/dom/stack.h" -#include "document/sgml/parser.h" -#include "document/sgml/scanner.h" -#include "document/sgml/sgml.h" -#include "protocol/uri.h" +#include "dom/node.h" +#include "dom/stack.h" +#include "dom/sgml/parser.h" +#include "dom/sgml/scanner.h" +#include "dom/sgml/sgml.h" #include "util/error.h" -#include "util/lists.h" #include "util/memory.h" #include "util/string.h" static struct sgml_parsing_state * -init_sgml_parsing_state(struct sgml_parser *parser, struct string *buffer); +init_sgml_parsing_state(struct sgml_parser *parser, struct dom_string *buffer); /* When getting the sgml_parser struct it is _always_ assumed that the parser @@ -41,17 +39,15 @@ init_sgml_parsing_state(struct sgml_parser *parser, struct string *buffer); * information like node subtypes and SGML parser state information. */ static inline struct dom_node * -add_sgml_document(struct dom_stack *stack, struct uri *uri) +add_sgml_document(struct dom_stack *stack, struct dom_string *string) { - unsigned char *string = struri(uri); - size_t length = strlen(string); - struct dom_node *node = init_dom_node(DOM_NODE_DOCUMENT, string, length); + struct dom_node *node = init_dom_node(DOM_NODE_DOCUMENT, string); return node ? 
push_dom_node(stack, node) : NULL; } static inline struct dom_node * -add_sgml_element(struct dom_stack *stack, struct scanner_token *token) +add_sgml_element(struct dom_stack *stack, struct dom_scanner_token *token) { struct sgml_parser *parser = get_sgml_parser(stack); struct dom_node *parent = get_dom_stack_top(stack)->node; @@ -60,7 +56,7 @@ add_sgml_element(struct dom_stack *stack, struct scanner_token *token) struct dom_node *node; struct sgml_node_info *node_info; - node = add_dom_element(parent, token->string, token->length); + node = add_dom_element(parent, &token->string); if (!node) return NULL; node_info = get_sgml_node_info(parser->info->elements, node); @@ -81,17 +77,15 @@ add_sgml_element(struct dom_stack *stack, struct scanner_token *token) static inline void add_sgml_attribute(struct dom_stack *stack, - struct scanner_token *token, struct scanner_token *valtoken) + struct dom_scanner_token *token, struct dom_scanner_token *valtoken) { struct sgml_parser *parser = get_sgml_parser(stack); struct dom_node *parent = get_dom_stack_top(stack)->node; - unsigned char *value = valtoken ? valtoken->string : NULL; - size_t valuelen = valtoken ? valtoken->length : 0; + struct dom_string *value = valtoken ? &valtoken->string : NULL; struct sgml_node_info *info; struct dom_node *node; - node = add_dom_attribute(parent, token->string, token->length, - value, valuelen); + node = add_dom_attribute(parent, &token->string, value); info = get_sgml_node_info(parser->info->attributes, node); @@ -109,22 +103,23 @@ add_sgml_attribute(struct dom_stack *stack, } static inline struct dom_node * -add_sgml_proc_instruction(struct dom_stack *stack, struct scanner_token *token) +add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *token) { struct dom_node *parent = get_dom_stack_top(stack)->node; struct dom_node *node; /* Split the token in two if we can find a first space separator. 
*/ - unsigned char *separator = memchr(token->string, ' ', token->length); + unsigned char *separator = memchr(token->string.string, ' ', token->string.length); /* Anything before the separator becomes the target name ... */ - unsigned char *name = token->string; - size_t namelen = separator ? separator - token->string : token->length; + size_t namelen = separator ? separator - token->string.string : token->string.length; + struct dom_string name = INIT_DOM_STRING(token->string.string, namelen); /* ... and everything after the instruction value. */ - unsigned char *value = separator ? separator + 1 : NULL; - size_t valuelen = value ? token->length - namelen - 1 : 0; + unsigned char *valuestr = separator ? separator + 1 : NULL; + size_t valuelen = valuestr ? token->string.length - namelen - 1 : 0; + struct dom_string value = INIT_DOM_STRING(valuestr, valuelen); - node = add_dom_proc_instruction(parent, name, namelen, value, valuelen); + node = add_dom_proc_instruction(parent, &name, &value); if (!node) return NULL; switch (token->type) { @@ -147,10 +142,10 @@ add_sgml_proc_instruction(struct dom_stack *stack, struct scanner_token *token) } static inline void -add_sgml_node(struct dom_stack *stack, enum dom_node_type type, struct scanner_token *token) +add_sgml_node(struct dom_stack *stack, enum dom_node_type type, struct dom_scanner_token *token) { struct dom_node *parent = get_dom_stack_top(stack)->node; - struct dom_node *node = add_dom_node(parent, type, token->string, token->length); + struct dom_node *node = add_dom_node(parent, type, &token->string); if (!node) return; @@ -165,24 +160,24 @@ add_sgml_node(struct dom_stack *stack, enum dom_node_type type, struct scanner_t /* SGML parser main handling: */ static inline void -parse_sgml_attributes(struct dom_stack *stack, struct scanner *scanner) +parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner) { - struct scanner_token name; + struct dom_scanner_token name; - 
assert(scanner_has_tokens(scanner) - && (get_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN - || get_scanner_token(scanner)->type == SGML_TOKEN_PROCESS_XML)); + assert(dom_scanner_has_tokens(scanner) + && (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN + || get_dom_scanner_token(scanner)->type == SGML_TOKEN_PROCESS_XML)); - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); - while (scanner_has_tokens(scanner)) { - struct scanner_token *token = get_scanner_token(scanner); + while (dom_scanner_has_tokens(scanner)) { + struct dom_scanner_token *token = get_dom_scanner_token(scanner); assert(token); switch (token->type) { case SGML_TOKEN_TAG_END: - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); /* and return */ case SGML_TOKEN_ELEMENT: case SGML_TOKEN_ELEMENT_BEGIN: @@ -194,11 +189,11 @@ parse_sgml_attributes(struct dom_stack *stack, struct scanner *scanner) copy_struct(&name, token); /* Skip the attribute name token */ - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (token && token->type == '=') { /* If the token is not a valid value token * ignore it. 
*/ - token = get_next_scanner_token(scanner); + token = get_next_dom_scanner_token(scanner); if (token && token->type != SGML_TOKEN_IDENT && token->type != SGML_TOKEN_ATTRIBUTE @@ -212,28 +207,28 @@ parse_sgml_attributes(struct dom_stack *stack, struct scanner *scanner) /* Skip the value token */ if (token) - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); break; default: - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); } } } static void -parse_sgml_plain(struct dom_stack *stack, struct scanner *scanner) +parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) { - while (scanner_has_tokens(scanner)) { - struct scanner_token *token = get_scanner_token(scanner); + while (dom_scanner_has_tokens(scanner)) { + struct dom_scanner_token *token = get_dom_scanner_token(scanner); switch (token->type) { case SGML_TOKEN_ELEMENT: case SGML_TOKEN_ELEMENT_BEGIN: if (!add_sgml_element(stack, token)) { if (token->type == SGML_TOKEN_ELEMENT) { - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); break; } @@ -244,24 +239,24 @@ parse_sgml_plain(struct dom_stack *stack, struct scanner *scanner) if (token->type == SGML_TOKEN_ELEMENT_BEGIN) { parse_sgml_attributes(stack, scanner); } else { - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); } break; case SGML_TOKEN_ELEMENT_EMPTY_END: pop_dom_node(stack); - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); break; case SGML_TOKEN_ELEMENT_END: - if (!token->length) { + if (!token->string.length) { pop_dom_node(stack); } else { struct dom_string string; struct dom_stack_state *state; - set_dom_string(&string, token->string, token->length); + set_dom_string(&string, token->string.string, token->string.length); state = search_dom_stack(stack, DOM_NODE_ELEMENT, &string); if (state) { @@ -273,12 +268,12 @@ parse_sgml_plain(struct dom_stack *stack, struct scanner *scanner) pop_dom_state(stack, state); } } - skip_scanner_token(scanner); + 
skip_dom_scanner_token(scanner); break; case SGML_TOKEN_NOTATION_COMMENT: add_sgml_node(stack, DOM_NODE_COMMENT, token); - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); break; case SGML_TOKEN_NOTATION_ATTLIST: @@ -286,12 +281,12 @@ parse_sgml_plain(struct dom_stack *stack, struct scanner *scanner) case SGML_TOKEN_NOTATION_ELEMENT: case SGML_TOKEN_NOTATION_ENTITY: case SGML_TOKEN_NOTATION: - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); break; case SGML_TOKEN_CDATA_SECTION: add_sgml_node(stack, DOM_NODE_CDATA_SECTION, token); - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); break; case SGML_TOKEN_PROCESS_XML: @@ -306,30 +301,30 @@ parse_sgml_plain(struct dom_stack *stack, struct scanner *scanner) case SGML_TOKEN_PROCESS: add_sgml_proc_instruction(stack, token); - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); break; case SGML_TOKEN_ENTITY: add_sgml_node(stack, DOM_NODE_ENTITY_REFERENCE, token); - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); break; case SGML_TOKEN_SPACE: case SGML_TOKEN_TEXT: default: add_sgml_node(stack, DOM_NODE_TEXT, token); - skip_scanner_token(scanner); + skip_dom_scanner_token(scanner); } } } struct dom_node * -parse_sgml(struct sgml_parser *parser, struct string *buffer) +parse_sgml(struct sgml_parser *parser, struct dom_string *buffer) { struct sgml_parsing_state *parsing; if (!parser->root) { - parser->root = add_sgml_document(&parser->stack, parser->uri); + parser->root = add_sgml_document(&parser->stack, &parser->uri); if (!parser->root) return NULL; get_dom_stack_top(&parser->stack)->immutable = 1; @@ -360,12 +355,10 @@ sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data) { struct sgml_parser *parser = get_sgml_parser(stack); struct sgml_parsing_state *parsing = data; - unsigned char *source = node->string.string; - unsigned char *end = source + node->string.length; parsing->depth = parser->stack.depth; 
get_dom_stack_top(&parser->stack)->immutable = 1; - init_scanner(&parsing->scanner, &sgml_scanner_info, source, end); + init_dom_scanner(&parsing->scanner, &sgml_scanner_info, &node->string); } static void @@ -422,12 +415,12 @@ static struct dom_stack_context_info sgml_parsing_context_info = { /* Create a new parsing state by pushing a new text node containing the*/ static struct sgml_parsing_state * -init_sgml_parsing_state(struct sgml_parser *parser, struct string *buffer) +init_sgml_parsing_state(struct sgml_parser *parser, struct dom_string *buffer) { struct dom_stack_state *state; struct dom_node *node; - node = init_dom_node(DOM_NODE_TEXT, buffer->source, buffer->length); + node = init_dom_node(DOM_NODE_TEXT, buffer); if (!node || !push_dom_node(&parser->parsing, node)) return NULL; @@ -479,7 +472,7 @@ static struct dom_stack_context_info sgml_parser_context_info = { struct sgml_parser * init_sgml_parser(enum sgml_parser_type type, enum sgml_document_type doctype, - struct uri *uri) + struct dom_string *uri) { struct sgml_parser *parser; enum dom_stack_flag flags = 0; @@ -487,8 +480,12 @@ init_sgml_parser(enum sgml_parser_type type, enum sgml_document_type doctype, parser = mem_calloc(1, sizeof(*parser)); if (!parser) return NULL; + if (!init_dom_string(&parser->uri, uri->string, uri->length)) { + mem_free(parser); + return NULL; + } + parser->type = type; - parser->uri = get_uri_reference(uri); parser->info = get_sgml_info(doctype); if (type == SGML_PARSER_TREE) @@ -511,6 +508,6 @@ done_sgml_parser(struct sgml_parser *parser) { done_dom_stack(&parser->stack); done_dom_stack(&parser->parsing); - done_uri(parser->uri); + done_dom_string(&parser->uri); mem_free(parser); } diff --git a/src/document/sgml/parser.h b/src/dom/sgml/parser.h similarity index 78% rename from src/document/sgml/parser.h rename to src/dom/sgml/parser.h index 7d5ded74e..144df935a 100644 --- a/src/document/sgml/parser.h +++ b/src/dom/sgml/parser.h @@ -1,11 +1,11 @@ -#ifndef 
EL__DOCUMENT_SGML_PARSER_H -#define EL__DOCUMENT_SGML_PARSER_H +#ifndef EL_DOM_SGML_PARSER_H +#define EL_DOM_SGML_PARSER_H -#include "document/dom/node.h" -#include "document/dom/stack.h" -#include "document/sgml/sgml.h" -#include "util/scanner.h" +#include "dom/node.h" +#include "dom/stack.h" +#include "dom/sgml/sgml.h" +#include "dom/scanner.h" struct string; struct uri; @@ -27,7 +27,7 @@ enum sgml_parser_type { * used to feed output of stuff like ECMAScripts document.write() from *