diff --git a/src/dom/Makefile b/src/dom/Makefile index 35a6b3929..31e30e138 100644 --- a/src/dom/Makefile +++ b/src/dom/Makefile @@ -6,4 +6,7 @@ OBJS = node.o select.o stack.o scanner.o SUBDIRS-$(CONFIG_DEBUG) += test +test: all + make test -C test + include $(top_srcdir)/Makefile.lib diff --git a/src/dom/scanner.c b/src/dom/scanner.c index 51c3f28e6..10f5cc28a 100644 --- a/src/dom/scanner.c +++ b/src/dom/scanner.c @@ -153,8 +153,8 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info) } void -init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, - struct dom_string *string) +init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, + struct dom_string *string, int state) { if (!scanner_info->initialized) { init_dom_scanner_info(scanner_info); @@ -168,5 +168,6 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i scanner->end = string->string + string->length; scanner->current = scanner->table; scanner->info = scanner_info; + scanner->state = state; scanner->info->scan(scanner); } diff --git a/src/dom/scanner.h b/src/dom/scanner.h index e22bf28e5..2dc9722cd 100644 --- a/src/dom/scanner.h +++ b/src/dom/scanner.h @@ -91,8 +91,11 @@ struct dom_scanner_info { /* Initializes the scanner. */ -void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, - struct dom_string *string); +void init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, + struct dom_string *string, int state); + +#define init_dom_scanner(scanner, info, string) \ + init_dom_scanner_state(scanner, info, string, 0) /* The number of tokens in the scanners token table: * At best it should be big enough to contain properties with space separated diff --git a/src/dom/sgml/parser.c b/src/dom/sgml/parser.c index bb9b154bb..bbc0aa1dc 100644 --- a/src/dom/sgml/parser.c +++ b/src/dom/sgml/parser.c @@ -103,26 +103,17 @@ add_sgml_attribute(struct dom_stack *stack, } static inline struct dom_node * -add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *token) +add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target, + struct dom_scanner_token *data) { struct dom_node *parent = get_dom_stack_top(stack)->node; + struct dom_string *data_str = data ? &data->string : NULL; struct dom_node *node; - /* Split the token in two if we can find a first space separator. */ - unsigned char *separator = memchr(token->string.string, ' ', token->string.length); - /* Anything before the separator becomes the target name ... */ - size_t namelen = separator ? separator - token->string.string : token->string.length; - struct dom_string name = INIT_DOM_STRING(token->string.string, namelen); - - /* ... and everything after the instruction value. */ - unsigned char *valuestr = separator ? separator + 1 : NULL; - size_t valuelen = valuestr ? token->string.length - namelen - 1 : 0; - struct dom_string value = INIT_DOM_STRING(valuestr, valuelen); - - node = add_dom_proc_instruction(parent, &name, &value); + node = add_dom_proc_instruction(parent, &target->string, data_str); if (!node) return NULL; - switch (token->type) { + switch (target->type) { case SGML_TOKEN_PROCESS_XML: node->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML; break; @@ -132,13 +123,7 @@ add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *tok node->data.proc_instruction.type = DOM_PROC_INSTRUCTION; } - if (!push_dom_node(stack, node)) - return NULL; - - if (token->type != SGML_TOKEN_PROCESS_XML) - pop_dom_node(stack); - - return node; + return push_dom_node(stack, node); } static inline void @@ -166,9 +151,12 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner) assert(dom_scanner_has_tokens(scanner) && (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN - || get_dom_scanner_token(scanner)->type == SGML_TOKEN_PROCESS_XML)); + || (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION + && get_dom_stack_top(stack)->node->data.proc_instruction.type + == DOM_PROC_INSTRUCTION_XML))); - skip_dom_scanner_token(scanner); + if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN) + skip_dom_scanner_token(scanner); while (dom_scanner_has_tokens(scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); @@ -220,6 +208,8 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner) static void parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) { + struct dom_scanner_token target; + while (dom_scanner_has_tokens(scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); @@ -290,17 +280,31 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) break; case SGML_TOKEN_PROCESS_XML: - if (!add_sgml_proc_instruction(stack, token)) { - skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END); - break; + case SGML_TOKEN_PROCESS: + copy_struct(&target, token); + + /* Skip the target token */ + token = get_next_dom_scanner_token(scanner); + if (!token) break; + + assert(token->type == SGML_TOKEN_PROCESS_DATA); + + if (add_sgml_proc_instruction(stack, &target, token) + && target.type == SGML_TOKEN_PROCESS_XML + && token->string.length > 0) { + /* Parse the . */ + struct dom_scanner attr_scanner; + + init_dom_scanner_state(&attr_scanner, + &sgml_scanner_info, + &token->string, + SGML_STATE_ELEMENT); + + if (dom_scanner_has_tokens(&attr_scanner)) + parse_sgml_attributes(stack, &attr_scanner); } - parse_sgml_attributes(stack, scanner); pop_dom_node(stack); - break; - - case SGML_TOKEN_PROCESS: - add_sgml_proc_instruction(stack, token); skip_dom_scanner_token(scanner); break; diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c index caaf3655f..88234be9d 100644 --- a/src/dom/sgml/scanner.c +++ b/src/dom/sgml/scanner.c @@ -17,14 +17,6 @@ /* Bitmap entries for the SGML character groups used in the scanner table */ -/* The SGML tokenizer maintains a state that can be either text or element - * state. The state has only meaning while doing the actual scanning and is not - * accessible at the parsing time. */ -enum sgml_scanner_state { - SGML_STATE_TEXT, - SGML_STATE_ELEMENT, -}; - enum sgml_char_group { SGML_CHAR_ENTITY = (1 << 1), SGML_CHAR_IDENT = (1 << 2), @@ -186,7 +178,7 @@ skip_comment(struct dom_scanner *scanner, unsigned char **string) unsigned char *pos = *string; int length = 0; - for (; pos < scanner->end - 3; pos++) + for (; pos < scanner->end - 2; pos++) if (pos[0] == '-' && pos[1] == '-' && pos[2] == '>') { length = pos - *string; pos += 3; @@ -296,27 +288,7 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t type = map_dom_scanner_string(scanner, pos, string, base); - /* Figure out where the processing instruction ends */ - for (pos = string; skip_sgml(scanner, &pos, '>', 0); ) { - if (pos[-2] != '?') continue; - - /* Set length until '?' char and move position - * beyond '>'. */ - real_length = pos - token->string.string - 2; - break; - } - - switch (type) { - case SGML_TOKEN_PROCESS_XML: - /* We want to parse the attributes */ - assert(scanner->state != SGML_STATE_ELEMENT); - scanner->state = SGML_STATE_ELEMENT; - break; - - default: - /* Just skip the whole thing */ - string = pos; - } + scanner->state = SGML_STATE_PROC_INST; } else if (*string == '/') { string++; @@ -366,6 +338,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t } else if (is_sgml_attribute(*string)) { scan_sgml_attribute(scanner, string); type = SGML_TOKEN_ATTRIBUTE; + if (string[-1] == '/' && string[0] == '>') + string--; } } else if (isquote(first_char)) { @@ -393,6 +367,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t if (is_sgml_attribute(*string)) { scan_sgml_attribute(scanner, string); type = SGML_TOKEN_ATTRIBUTE; + if (string[-1] == '/' && string[0] == '>') + string--; } } @@ -403,6 +379,38 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t } +/* Processing instruction data scanning */ + +static inline void +scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token) +{ + unsigned char *string = scanner->position; + size_t size; + + token->string.string = string; + + /* Figure out where the processing instruction ends. This doesn't use + * skip_sgml() since we MUST ignore precedence here to allow '<' inside + * the data part to be skipped correctly. */ + for (size = scanner->end - string; + size > 0 && (string = memchr(string, '>', size)); + string++) { + if (string[-1] == '?') { + string++; + break; + } + } + + if (!string) string = scanner->end; + + token->type = SGML_TOKEN_PROCESS_DATA; + token->string.length = string - token->string.string - 2; + token->precedence = get_sgml_precedence(token->type); + scanner->position = string; + scanner->state = SGML_STATE_TEXT; +} + + /* Scanner multiplexor */ static struct dom_scanner_token * @@ -419,7 +427,8 @@ scan_sgml_tokens(struct dom_scanner *scanner) current < table_end && scanner->position < scanner->end; current++) { if (scanner->state == SGML_STATE_ELEMENT - || *scanner->position == '<') { + || (*scanner->position == '<' + && scanner->state != SGML_STATE_PROC_INST)) { scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE); if (scanner->position >= scanner->end) break; @@ -429,8 +438,13 @@ scan_sgml_tokens(struct dom_scanner *scanner) if (current->type == SGML_TOKEN_SKIP) { current--; } - } else { + + } else if (scanner->state == SGML_STATE_TEXT) { scan_sgml_text_token(scanner, current); + + } else { + scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE); + scan_sgml_proc_inst_token(scanner, current); } } diff --git a/src/dom/sgml/scanner.h b/src/dom/sgml/scanner.h index 4032d718e..9abe59e32 100644 --- a/src/dom/sgml/scanner.h +++ b/src/dom/sgml/scanner.h @@ -27,8 +27,9 @@ enum sgml_token_type { SGML_TOKEN_CDATA_SECTION, /* */ - SGML_TOKEN_PROCESS, /* */ - SGML_TOKEN_PROCESS_XML, /* */ SGML_TOKEN_ELEMENT, /* <{ident}> */ SGML_TOKEN_ELEMENT_BEGIN, /* <{ident} */ @@ -56,6 +57,17 @@ enum sgml_token_type { SGML_TOKEN_NONE = 0, }; +/* The SGML tokenizer maintains a state (in the scanner->state member) that can + * be either text, element, or processing instruction state. The state has only + * meaning while doing the actual scanning and should not be used at the + * parsing time. It can however be used to initialize the scanner to a specific + * state. */ +enum sgml_scanner_state { + SGML_STATE_TEXT, + SGML_STATE_ELEMENT, + SGML_STATE_PROC_INST, +}; + extern struct dom_scanner_info sgml_scanner_info; /* Treat '<' as more valuable then '>' so that scanning of '' using diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic index 9c26420be..c9ba46775 100755 --- a/src/dom/test/test-sgml-parser-basic +++ b/src/dom/test/test-sgml-parser-basic @@ -44,6 +44,17 @@ element: html element: p #text: Hello World!' +test_output_equals \ +'Parse elements.' \ +'a' \ +' +element: root + element: child + attribute: attr -> value + element: child2 + element: child3 + #text: a' + test_output_equals \ 'Parse an enclosed comment.' \ '' \ @@ -51,6 +62,20 @@ test_output_equals \ element: root #comment: Hello World! ' +test_output_equals \ +'Parse comment combinations.' \ +'' \ +' +element: root + #comment: ' \ +' +#comment: foo' + test_output_equals \ 'Parse an enclosed CDATA section.' \ '...]]>' \ @@ -68,32 +93,111 @@ element: root attribute: name -> value with &foo; &...*...&...copy;...&;...&#;' \ +'Parse attributes with garbage.' \ +"" \ ' element: root - entity-reference: amp - #text: ... - entity-reference: #42 - #text: ... - entity-reference: ...copy - #text: ... - #text: &; - #text: ... - entity-reference: #' + attribute: a -> b + attribute: c -> d + attribute: g -> h + attribute: i -> j + attribute: k -> ' + +test_output_equals \ +'Parse attribute with non-quoted values.' \ +'...' \ +' +element: root + attribute: color -> #abc + attribute: path -> /to/%61-&\one";files +#text: ...' + +test_output_equals \ +'Parse entity references.' \ +'&-*' \ +' +entity-reference: amp +#text: - +entity-reference: #42' + +# Just how these should be gracefully handled is not clear to me. +test_output_equals \ +'Parse badly formatted entity references.' \ +'& m33p;-&.:-copy;-&;-&#;-&#xx;' \ +' +#text: & m33p; +#text: - +entity-reference: .:-copy +#text: - +#text: &; +#text: - +entity-reference: # +#text: - +entity-reference: #xx' # Test test_output_equals \ 'Parse processing instructions.' \ ' ... -' \ ' proc-instruction: xml -> encoding="UTF8" attribute: encoding -> UTF8 #text: \n...\n -proc-instruction: ecmascript -> \nvar -> val=2;\n' +proc-instruction: ecmascript -> var val=2;\n' + +test_output_equals \ +'Parse XML processing instructions.' \ +'?>-' \ +' +proc-instruction: xml -> version="1.0" /> + attribute: version -> 1.0 +proc-instruction: xml -> /' + +test_output_equals \ +'Parse exotic processing instructions.' \ +'+?>-?>---' \ +' +proc-instruction: xml -> ?+>+ +#text: -?>- +proc-instruction: js -> +#text: - +proc-instruction: -> +#text: -' + +test_output_equals \ +'Parse incorrect processing instructions.' \ +'--- < +#text: - +proc-instruction: -> <=";& +#text: -' + +test_output_equals \ +'Parse incorrect processing instructions (II).' \ +' >< / root >' \ +' +element: root + attribute: ns:attr -> value + proc-instruction: target -> data' test_done