From 889a0f16f8416a2a89865fb7a3941473ff659358 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Thu, 29 Dec 2005 18:00:26 +0100 Subject: [PATCH 1/8] Fix the expected output of processing instruction parsing Spaces after the target should be skipped. --- src/dom/test/test-sgml-parser-basic | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic index 9c26420be..f1fbb653d 100755 --- a/src/dom/test/test-sgml-parser-basic +++ b/src/dom/test/test-sgml-parser-basic @@ -94,6 +94,6 @@ var val=2; proc-instruction: xml -> encoding="UTF8" attribute: encoding -> UTF8 #text: \n...\n -proc-instruction: ecmascript -> \nvar -> val=2;\n' +proc-instruction: ecmascript -> var val=2;\n' test_done From c24c67ce59a2fb4a1cb85f7675ef0806c18d42c7 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Thu, 29 Dec 2005 18:20:03 +0100 Subject: [PATCH 2/8] Make it possible to initialise a scanner in a specific state --- src/dom/scanner.c | 5 +++-- src/dom/scanner.h | 7 +++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/dom/scanner.c b/src/dom/scanner.c index 51c3f28e6..10f5cc28a 100644 --- a/src/dom/scanner.c +++ b/src/dom/scanner.c @@ -153,8 +153,8 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info) } void -init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, - struct dom_string *string) +init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, + struct dom_string *string, int state) { if (!scanner_info->initialized) { init_dom_scanner_info(scanner_info); @@ -168,5 +168,6 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i scanner->end = string->string + string->length; scanner->current = scanner->table; scanner->info = scanner_info; + scanner->state = state; scanner->info->scan(scanner); } diff --git a/src/dom/scanner.h b/src/dom/scanner.h index e22bf28e5..2dc9722cd 100644 --- 
a/src/dom/scanner.h +++ b/src/dom/scanner.h @@ -91,8 +91,11 @@ struct dom_scanner_info { /* Initializes the scanner. */ -void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, - struct dom_string *string); +void init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, + struct dom_string *string, int state); + +#define init_dom_scanner(scanner, info, string) \ + init_dom_scanner_state(scanner, info, string, 0) /* The number of tokens in the scanners token table: * At best it should be big enough to contain properties with space separated From 1a177491a0249bf915a77692b60e8292888c5c5c Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Thu, 29 Dec 2005 18:31:49 +0100 Subject: [PATCH 3/8] Fix SGML parsing of processing instructions (<?target data?>) It involves adding a new scanner state which is used only to generate a new processing instruction (PI) data token. This removes some scanner specific code from the parser and makes handling of PIs more generic. The data of XML PIs are still parsed as attributes and added to the PI node. The 6th test now succeeds. Hurrah! --- src/dom/sgml/parser.c | 66 ++++++++++++++++++++++-------------------- src/dom/sgml/scanner.c | 59 +++++++++++++++++++------------------ src/dom/sgml/scanner.h | 16 ++++++++-- 3 files changed, 78 insertions(+), 63 deletions(-) diff --git a/src/dom/sgml/parser.c b/src/dom/sgml/parser.c index bb9b154bb..bbc0aa1dc 100644 --- a/src/dom/sgml/parser.c +++ b/src/dom/sgml/parser.c @@ -103,26 +103,17 @@ add_sgml_attribute(struct dom_stack *stack, } static inline struct dom_node * -add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *token) +add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target, + struct dom_scanner_token *data) { struct dom_node *parent = get_dom_stack_top(stack)->node; + struct dom_string *data_str = data ? 
&data->string : NULL; struct dom_node *node; - /* Split the token in two if we can find a first space separator. */ - unsigned char *separator = memchr(token->string.string, ' ', token->string.length); - /* Anything before the separator becomes the target name ... */ - size_t namelen = separator ? separator - token->string.string : token->string.length; - struct dom_string name = INIT_DOM_STRING(token->string.string, namelen); - - /* ... and everything after the instruction value. */ - unsigned char *valuestr = separator ? separator + 1 : NULL; - size_t valuelen = valuestr ? token->string.length - namelen - 1 : 0; - struct dom_string value = INIT_DOM_STRING(valuestr, valuelen); - - node = add_dom_proc_instruction(parent, &name, &value); + node = add_dom_proc_instruction(parent, &target->string, data_str); if (!node) return NULL; - switch (token->type) { + switch (target->type) { case SGML_TOKEN_PROCESS_XML: node->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML; break; @@ -132,13 +123,7 @@ add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *tok node->data.proc_instruction.type = DOM_PROC_INSTRUCTION; } - if (!push_dom_node(stack, node)) - return NULL; - - if (token->type != SGML_TOKEN_PROCESS_XML) - pop_dom_node(stack); - - return node; + return push_dom_node(stack, node); } static inline void @@ -166,9 +151,12 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner) assert(dom_scanner_has_tokens(scanner) && (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN - || get_dom_scanner_token(scanner)->type == SGML_TOKEN_PROCESS_XML)); + || (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION + && get_dom_stack_top(stack)->node->data.proc_instruction.type + == DOM_PROC_INSTRUCTION_XML))); - skip_dom_scanner_token(scanner); + if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN) + skip_dom_scanner_token(scanner); while (dom_scanner_has_tokens(scanner)) { struct 
dom_scanner_token *token = get_dom_scanner_token(scanner); @@ -220,6 +208,8 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner) static void parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) { + struct dom_scanner_token target; + while (dom_scanner_has_tokens(scanner)) { struct dom_scanner_token *token = get_dom_scanner_token(scanner); @@ -290,17 +280,31 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) break; case SGML_TOKEN_PROCESS_XML: - if (!add_sgml_proc_instruction(stack, token)) { - skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END); - break; + case SGML_TOKEN_PROCESS: + copy_struct(&target, token); + + /* Skip the target token */ + token = get_next_dom_scanner_token(scanner); + if (!token) break; + + assert(token->type == SGML_TOKEN_PROCESS_DATA); + + if (add_sgml_proc_instruction(stack, &target, token) + && target.type == SGML_TOKEN_PROCESS_XML + && token->string.length > 0) { + /* Parse the XML processing instruction data as attributes. */ + struct dom_scanner attr_scanner; + + init_dom_scanner_state(&attr_scanner, + &sgml_scanner_info, + &token->string, + SGML_STATE_ELEMENT); + + if (dom_scanner_has_tokens(&attr_scanner)) + parse_sgml_attributes(stack, &attr_scanner); } - parse_sgml_attributes(stack, scanner); pop_dom_node(stack); - break; - - case SGML_TOKEN_PROCESS: - add_sgml_proc_instruction(stack, token); skip_dom_scanner_token(scanner); break; diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c index caaf3655f..b51f79bc0 100644 --- a/src/dom/sgml/scanner.c +++ b/src/dom/sgml/scanner.c @@ -17,14 +17,6 @@ /* Bitmap entries for the SGML character groups used in the scanner table */ -/* The SGML tokenizer maintains a state that can be either text or element - * state. The state has only meaning while doing the actual scanning and is not - * accessible at the parsing time. 
*/ -enum sgml_scanner_state { - SGML_STATE_TEXT, - SGML_STATE_ELEMENT, -}; - enum sgml_char_group { SGML_CHAR_ENTITY = (1 << 1), SGML_CHAR_IDENT = (1 << 2), @@ -296,27 +288,7 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t type = map_dom_scanner_string(scanner, pos, string, base); - /* Figure out where the processing instruction ends */ - for (pos = string; skip_sgml(scanner, &pos, '>', 0); ) { - if (pos[-2] != '?') continue; - - /* Set length until '?' char and move position - * beyond '>'. */ - real_length = pos - token->string.string - 2; - break; - } - - switch (type) { - case SGML_TOKEN_PROCESS_XML: - /* We want to parse the attributes */ - assert(scanner->state != SGML_STATE_ELEMENT); - scanner->state = SGML_STATE_ELEMENT; - break; - - default: - /* Just skip the whole thing */ - string = pos; - } + scanner->state = SGML_STATE_PROC_INST; } else if (*string == '/') { string++; @@ -403,6 +375,28 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t } +/* Processing instruction data scanning */ + +static inline void +scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token) +{ + unsigned char *string = scanner->position; + + token->string.string = string++; + + /* Figure out where the processing instruction ends */ + while (skip_sgml(scanner, &string, '>', 0)) + if (string[-2] == '?') + break; + + token->type = SGML_TOKEN_PROCESS_DATA; + token->string.length = string - token->string.string - 2; + token->precedence = get_sgml_precedence(token->type); + scanner->position = string; + scanner->state = SGML_STATE_TEXT; +} + + /* Scanner multiplexor */ static struct dom_scanner_token * @@ -429,8 +423,13 @@ scan_sgml_tokens(struct dom_scanner *scanner) if (current->type == SGML_TOKEN_SKIP) { current--; } - } else { + + } else if (scanner->state == SGML_STATE_TEXT) { scan_sgml_text_token(scanner, current); + + } else { + scan_sgml(scanner, scanner->position, 
SGML_CHAR_WHITESPACE); + scan_sgml_proc_inst_token(scanner, current); } } diff --git a/src/dom/sgml/scanner.h b/src/dom/sgml/scanner.h index 4032d718e..9abe59e32 100644 --- a/src/dom/sgml/scanner.h +++ b/src/dom/sgml/scanner.h @@ -27,8 +27,9 @@ enum sgml_token_type { SGML_TOKEN_CDATA_SECTION, /* */ - SGML_TOKEN_PROCESS, /* */ - SGML_TOKEN_PROCESS_XML, /* */ SGML_TOKEN_ELEMENT, /* <{ident}> */ SGML_TOKEN_ELEMENT_BEGIN, /* <{ident} */ @@ -56,6 +57,17 @@ enum sgml_token_type { SGML_TOKEN_NONE = 0, }; +/* The SGML tokenizer maintains a state (in the scanner->state member) that can + * be either text, element, or processing instruction state. The state has only + * meaning while doing the actual scanning and should not be used at the + * parsing time. It can however be used to initialize the scanner to a specific + * state. */ +enum sgml_scanner_state { + SGML_STATE_TEXT, + SGML_STATE_ELEMENT, + SGML_STATE_PROC_INST, +}; + extern struct dom_scanner_info sgml_scanner_info; /* Treat '<' as more valuable then '>' so that scanning of '' using From beb8337fc581267ba47c55954c4403a4b0e01e37 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Thu, 29 Dec 2005 18:33:59 +0100 Subject: [PATCH 4/8] Add rule to make test run from src/dom dir --- src/dom/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/dom/Makefile b/src/dom/Makefile index 35a6b3929..31e30e138 100644 --- a/src/dom/Makefile +++ b/src/dom/Makefile @@ -6,4 +6,7 @@ OBJS = node.o select.o stack.o scanner.o SUBDIRS-$(CONFIG_DEBUG) += test +test: all + make test -C test + include $(top_srcdir)/Makefile.lib From 958a4a1b51b7d3a9c13590d5b86ccca92fee2f61 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Thu, 29 Dec 2005 19:13:48 +0100 Subject: [PATCH 5/8] Add tests for more things like space handling and obscure formatting --- src/dom/test/test-sgml-parser-basic | 69 ++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/src/dom/test/test-sgml-parser-basic 
b/src/dom/test/test-sgml-parser-basic index f1fbb653d..358871c6e 100755 --- a/src/dom/test/test-sgml-parser-basic +++ b/src/dom/test/test-sgml-parser-basic @@ -44,6 +44,17 @@ element: html element: p #text: Hello World!' +test_output_equals \ +'Parse elements.' \ +'a' \ +' +element: root + element: child + attribute: attr -> value + element: child2 + element: child3 + #text: a' + test_output_equals \ 'Parse an enclosed comment.' \ '' \ @@ -68,26 +79,45 @@ element: root attribute: name -> value with &foo; &...*...&...copy;...&;...&#;' \ +'Parse attributes with garbage.' \ +"" \ ' element: root - entity-reference: amp - #text: ... - entity-reference: #42 - #text: ... - entity-reference: ...copy - #text: ... - #text: &; - #text: ... - entity-reference: #' + attribute: a -> b + attribute: c -> d + attribute: g -> h + attribute: i -> j + attribute: k -> ' + +test_output_equals \ +'Parse entity references.' \ +'&-*' \ +' +entity-reference: amp +#text: - +entity-reference: #42' + +# Just how these should be gracefully handled is not clear to me. +test_output_equals \ +'Parse badly formatted entity references.' \ +'& m33p;-&.:-copy;-&;-&#;-&#xx;' \ +' +#text: & m33p; +#text: - +entity-reference: .:-copy +#text: - +#text: &; +#text: - +entity-reference: # +#text: - +entity-reference: #xx' # Test test_output_equals \ 'Parse processing instructions.' \ ' ... -' \ ' @@ -96,4 +126,19 @@ proc-instruction: xml -> encoding="UTF8" #text: \n...\n proc-instruction: ecmascript -> var val=2;\n' +test_output_equals \ +'Skip spaces not inside text.' \ +'< +root +ns:attr += +"value" +>< / root >' \ +' +element: root + attribute: ns:attr -> value + proc-instruction: target -> data' + test_done From 57168e1fbcebed4cf7fe559d8a9c10bbf74ef432 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Thu, 29 Dec 2005 20:38:43 +0100 Subject: [PATCH 6/8] Handle <tag attr=value/> as a self-closing tag Previously, the '/' before '>' would be interpreted as part of the attribute value. 
Hope this is sensible slurping of the markup soup. --- src/dom/sgml/scanner.c | 4 ++++ src/dom/test/test-sgml-parser-basic | 9 +++++++++ 2 files changed, 13 insertions(+) diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c index b51f79bc0..946c0bb6d 100644 --- a/src/dom/sgml/scanner.c +++ b/src/dom/sgml/scanner.c @@ -338,6 +338,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t } else if (is_sgml_attribute(*string)) { scan_sgml_attribute(scanner, string); type = SGML_TOKEN_ATTRIBUTE; + if (string[-1] == '/' && string[0] == '>') + string--; } } else if (isquote(first_char)) { @@ -365,6 +367,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t if (is_sgml_attribute(*string)) { scan_sgml_attribute(scanner, string); type = SGML_TOKEN_ATTRIBUTE; + if (string[-1] == '/' && string[0] == '>') + string--; } } diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic index 358871c6e..d91709f5a 100755 --- a/src/dom/test/test-sgml-parser-basic +++ b/src/dom/test/test-sgml-parser-basic @@ -89,6 +89,15 @@ element: root attribute: i -> j attribute: k -> ' +test_output_equals \ +'Parse attribute with non-quoted values.' \ +'...' \ +' +element: root + attribute: color -> #abc + attribute: path -> /to/%61-&\one";files +#text: ...' + test_output_equals \ 'Parse entity references.' \ '&-*' \ From bd877570d27668d63b7d2e715015a469510624b7 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Thu, 29 Dec 2005 21:52:27 +0100 Subject: [PATCH 7/8] Test some more obscure proc. 
instructions and fix some assertion failures --- src/dom/sgml/scanner.c | 21 ++++++++++++++++----- src/dom/test/test-sgml-parser-basic | 28 ++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c index 946c0bb6d..31f77f404 100644 --- a/src/dom/sgml/scanner.c +++ b/src/dom/sgml/scanner.c @@ -385,13 +385,23 @@ static inline void scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token) { unsigned char *string = scanner->position; + size_t size; - token->string.string = string++; + token->string.string = string; - /* Figure out where the processing instruction ends */ - while (skip_sgml(scanner, &string, '>', 0)) - if (string[-2] == '?') + /* Figure out where the processing instruction ends. This doesn't use + * skip_sgml() since we MUST ignore precedence here to allow '<' inside + * the data part to be skipped correctly. */ + for (size = scanner->end - string; + size > 0 && (string = memchr(string, '>', size)); + string++) { + if (string[-1] == '?') { + string++; break; + } + } + + if (!string) string = scanner->end; token->type = SGML_TOKEN_PROCESS_DATA; token->string.length = string - token->string.string - 2; @@ -417,7 +427,8 @@ scan_sgml_tokens(struct dom_scanner *scanner) current < table_end && scanner->position < scanner->end; current++) { if (scanner->state == SGML_STATE_ELEMENT - || *scanner->position == '<') { + || (*scanner->position == '<' + && scanner->state != SGML_STATE_PROC_INST)) { scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE); if (scanner->position >= scanner->end) break; diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic index d91709f5a..ae0739c7d 100755 --- a/src/dom/test/test-sgml-parser-basic +++ b/src/dom/test/test-sgml-parser-basic @@ -135,6 +135,34 @@ proc-instruction: xml -> encoding="UTF8" #text: \n...\n proc-instruction: ecmascript -> var val=2;\n' +test_output_equals \ +'Parse 
exotic processing instructions.' \ +'+?>-?>---' \ +' +proc-instruction: xml -> ?+>+ +#text: -?>- +proc-instruction: js -> +#text: - +proc-instruction: -> +#text: -' + +test_output_equals \ +'Parse incorrect processing instructions.' \ +'--- < +#text: - +proc-instruction: -> <=";& +#text: -' + +test_output_equals \ +'Parse incorrect processing instructions (II).' \ +' > Date: Thu, 29 Dec 2005 22:26:39 +0100 Subject: [PATCH 8/8] More end - 3; pos++) + for (; pos < scanner->end - 2; pos++) if (pos[0] == '-' && pos[1] == '-' && pos[2] == '>') { length = pos - *string; pos += 3; diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic index ae0739c7d..c9ba46775 100755 --- a/src/dom/test/test-sgml-parser-basic +++ b/src/dom/test/test-sgml-parser-basic @@ -62,6 +62,20 @@ test_output_equals \ element: root #comment: Hello World! ' +test_output_equals \ +'Parse comment combinations.' \ +'' \ +' +element: root + #comment: ' \ +' +#comment: foo' + test_output_equals \ 'Parse an enclosed CDATA section.' \ '...]]>' \ @@ -135,6 +149,14 @@ proc-instruction: xml -> encoding="UTF8" #text: \n...\n proc-instruction: ecmascript -> var val=2;\n' +test_output_equals \ +'Parse XML processing instructions.' \ +'?>-' \ +' +proc-instruction: xml -> version="1.0" /> + attribute: version -> 1.0 +proc-instruction: xml -> /' + test_output_equals \ 'Parse exotic processing instructions.' \ '+?>-?>---' \