From 22e647813e6b3cd2e770317f0710a86795051d28 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Fri, 20 Jan 2006 02:06:41 +0100 Subject: [PATCH 1/3] Fix DOM_CONFIG_NORMALIZE_WHITESPACE comment --- src/dom/configuration.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/dom/configuration.h b/src/dom/configuration.h index 0436ba5ae..f323c386f 100644 --- a/src/dom/configuration.h +++ b/src/dom/configuration.h @@ -63,8 +63,9 @@ enum dom_config_flag { /** "normalize-whitespace" * - * If false (default) nothing is done, else all nodes are discarded - * once they have been traversed. */ + * If false (default) nothing is done, else all text nodes are + * normalized so that sequences of whitespace characters are + * collapsed into a single space. */ DOM_CONFIG_NORMALIZE_WHITESPACE = 64, }; From cc61578fcb1f6351db760be425ab42d80aa34ea7 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Fri, 20 Jan 2006 02:07:24 +0100 Subject: [PATCH 2/3] Fix node pushing in walk_dom_nodes() --- src/dom/stack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/dom/stack.c b/src/dom/stack.c index 61bd75a68..328a7ef4c 100644 --- a/src/dom/stack.c +++ b/src/dom/stack.c @@ -420,7 +420,7 @@ walk_dom_nodes(struct dom_stack *stack, struct dom_node *root) if (is_dom_node_list_member(list, wstate->index)) { struct dom_node *child = list->entries[wstate->index++]; - if (push_dom_node(stack, child)) + if (push_dom_node(stack, child) == DOM_STACK_CODE_OK) continue; } From 2eba71d95bb24708338ad3991b3027a126896f00 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Fri, 20 Jan 2006 02:08:46 +0100 Subject: [PATCH 3/3] Add support for testing normalization using the DOM configuration module --- src/dom/test/sgml-parser.c | 46 ++++- src/dom/test/test-dom-configuration-basic | 236 ++++++++++++++++++++++ 2 files changed, 278 insertions(+), 4 deletions(-) create mode 100755 src/dom/test/test-dom-configuration-basic diff --git a/src/dom/test/sgml-parser.c 
b/src/dom/test/sgml-parser.c index 28c5bd6c8..0032db857 100644 --- a/src/dom/test/sgml-parser.c +++ b/src/dom/test/sgml-parser.c @@ -11,6 +11,7 @@ #include "elinks.h" +#include "dom/configuration.h" #include "dom/node.h" #include "dom/sgml/parser.h" #include "dom/stack.h" @@ -260,7 +261,10 @@ main(int argc, char *argv[]) struct sgml_parser *parser; enum sgml_document_type doctype = SGML_DOCTYPE_HTML; enum sgml_parser_flag flags = 0; + enum sgml_parser_type type = SGML_PARSER_STREAM; enum sgml_parser_code code = 0; + enum dom_config_flag normalize_flags = 0; + int normalize = 0; int complete = 1; struct dom_string uri = INIT_DOM_STRING("dom://test", -1); struct dom_string source = INIT_DOM_STRING("(no source)", -1); @@ -298,6 +302,20 @@ main(int argc, char *argv[]) set_dom_string(&source, argv[i], strlen(argv[i])); } + } else if (!strncmp(arg, "normalize", 9)) { + arg += 9; + if (*arg == '=') { + arg++; + } else { + i++; + if (i >= argc) + die("--normalize expects a string"); + arg = argv[i]; + } + normalize = 1; + normalize_flags = parse_dom_config(arg, ','); + type = SGML_PARSER_TREE; + } else if (!strcmp(arg, "print-lines")) { flags |= SGML_PARSER_COUNT_LINES; @@ -316,11 +334,14 @@ main(int argc, char *argv[]) } } - parser = init_sgml_parser(SGML_PARSER_STREAM, doctype, &uri, flags); + parser = init_sgml_parser(type, doctype, &uri, flags); if (!parser) return 1; parser->error_func = sgml_error_function; - add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info); + if (normalize) + add_dom_config_normalizer(&parser->stack, normalize_flags); + else + add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info); code = parse_sgml(parser, source.string, source.length, complete); if (parser->root) { @@ -330,13 +351,30 @@ main(int argc, char *argv[]) get_dom_stack_state(&parser->stack, root_offset)->immutable = 0; - /* For SGML_PARSER_STREAM this will free the DOM - * root node. 
*/ while (!dom_stack_is_empty(&parser->stack)) pop_dom_node(&parser->stack); + + if (normalize) { + struct dom_stack stack; + + /* Note that we cannot free nodes while walking the DOM + * tree, since walk_dom_nodes() uses an index to traverse + * the tree. */ + init_dom_stack(&stack, DOM_STACK_FLAG_NONE); + /* XXX: This context needs to be added first because it + * assumes the parser can be accessed via + * stack->contexts[0].data. */ + add_dom_stack_context(&stack, parser, &sgml_parser_test_context_info); + walk_dom_nodes(&stack, parser->root); + done_dom_stack(&stack); + done_dom_node(parser->root); + } } done_sgml_parser(parser); +#ifdef DEBUG_MEMLEAK + check_memory_leaks(); +#endif return code; } diff --git a/src/dom/test/test-dom-configuration-basic b/src/dom/test/test-dom-configuration-basic new file mode 100755 index 000000000..cd217b1ef --- /dev/null +++ b/src/dom/test/test-dom-configuration-basic @@ -0,0 +1,236 @@ +#!/bin/sh +# +# Copyright (c) 2005 Jonas Fonseca +# + +test_description='Test the DOM configuration module + +This test checks that the normalization performed by the DOM configuration +is done correctly. +' + +. 
"$TEST_LIB" + +test_normalize_output_equals () { + desc="$1"; shift + config="$1"; shift + src="$1"; shift + out="$1"; shift + + URI="test:$(echo "$desc" | sed ' + s/^[ \t]*\[[^]]*\][ \t]*//; + s/[:., \t][:., \t]*/-/g; + s/_/-/g; + # *cough* + y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/; + s/[^a-zA-Z0-9-]//g;')" + + sgml-parser --src "$src" --normalize "$config" --uri "$URI" | sed 's/^ //' > output + echo "#document: $URI" > expected + echo "$out" | sed -n '2,$p' >> expected + + test_expect_success "$desc" 'cmp output expected' +} + + +## Config strings ########################################################### + +NOOP='cdata-sections,comments,element-content-whitespace,entities' +NOCOMMENTS='cdata-sections,element-content-whitespace,entities' +CDATA2TEXT='comments,element-content-whitespace,entities' +ENTITIES='cdata-sections,comments,element-content-whitespace' +NOWSTEXT='cdata-sections,comments,entities' +NORM1='' + + +## No-ops ################################################################### + +test_normalize_output_equals \ +'Normalization no-op.' \ +"$NOOP" \ +' well' \ +' +element: roswell + #text: + #cdata-section: | that | + #comment: ends + #text: well' + +test_normalize_output_equals \ +'Keep comments.' \ +"$NOOP" \ +'andor' \ +' +#text: and +#comment: comment:1 +#text: or +#comment: comment:2 ' + +test_normalize_output_equals \ +'Keep CDATA sections ' \ +"$NOOP" \ +'or' \ +' +#cdata-section: and +#text: or +#cdata-section: maybe' + + +## Comments ################################################################# + +test_normalize_output_equals \ +'Remove comments. (I)' \ +"$NOCOMMENTS" \ +"?" \ +' +element: no + #text: ?' + +test_normalize_output_equals \ +'Remove comments. (II)' \ +"$NOCOMMENTS" \ +'and' \ +' +#text: and' + +test_normalize_output_equals \ +'Remove comments. 
(III)' \ +"$NOCOMMENTS" \ +'nothing to see here' \ +' +#text: nothing to see here' + + +## Entities ################################################################# + +# Entities should be shown 'verbatim' here after expansion. + +test_normalize_output_equals \ +'Expand entities. (I)' \ +"$ENTITIES" \ +'a<b>c' \ +' +#text: a<b>c' + +test_normalize_output_equals \ +'Expand entities. (II)' \ +"$ENTITIES" \ +'&bad-entity&good-entity;' \ +' +#text: &bad-entity;&good-entity;' + +test_normalize_output_equals \ +'Expand entities. (III)' \ +"$ENTITIES" \ +'&b;' \ +' +element: a + #text: &b;' + + +## CDATA Sections ########################################################### + +test_normalize_output_equals \ +'Replace CDATA section with text. (I)' \ +"$CDATA2TEXT" \ +'' \ +' +#text: a small text snippet' + +test_normalize_output_equals \ +'Replace CDATA section with text. (II)' \ +"$CDATA2TEXT" \ +' ' \ +' +#text: a small text snippet' + +test_normalize_output_equals \ +'Replace CDATA section with text. (III)' \ +"$CDATA2TEXT" \ +'before after' \ +' +#text: before and after' + + +## Element Content Whitespace ############################################### + +test_normalize_output_equals \ +'Remove element content whitespace. (I)' \ +"$NOWSTEXT" \ +' + some text +' \ +' +element: a + element: b + #text: some text' \ + +# I haven't read the specs about this thing, for now it just blasts all +# space-only text nodes. Probably not the wanted behaviour all of the time. +# --jonas +test_normalize_output_equals \ +'Remove element content whitespace. (II)' \ +"$NOWSTEXT" \ +'space between &this; &that; gets removed' \ +' +element: e + #text: space between + entity-reference: this + entity-reference: that + #text: gets removed' + + +## Mixes #################################################################### + +test_normalize_output_equals \ +'Normalization mix #1. 
(I)' \ +"$NORM1" \ +'before after &some; remain' \ +' +#text: before and after &some; remain' + +test_normalize_output_equals \ +'Normalization mix #1. (II)' \ +"$NORM1" \ +'b&c; ' \ +' +#text: b&c;d' + + +## Special ELinks Extensions ################################################ + +test_normalize_output_equals \ +'Remove unknown (HTML) elements and attributes. (I)' \ +"$NOOP,unknown" \ +'where?<doit>here!</doit>' \ +' +element: html + element: title + #text: where?' \ + +test_normalize_output_equals \ +'Remove unknown (HTML) elements and attributes. (II)' \ +"$NOOP,unknown" \ +' aint no HTML' \ +' +#text: aint no HTML' \ + +test_normalize_output_equals \ +'Normalize whitespace. (I)' \ +"$NOOP,normalize-whitespace" \ +'Here is a + + + +lot of useless space.' \ +' +#text: Here is a lot of useless space.' \ + +test_normalize_output_equals \ +'Normalize whitespace. (II)' \ +"$CDATA2TEXT,normalize-whitespace" \ +'Could we read that again?' \ +' +#text: Could we please read that again?' \ + +test_done