Merge with git+ssh://pasky.or.cz/srv/git/elinks.git

2025-06-30 22:19:29 -04:00 · 2006-01-20 15:27:23 +01:00 · 2006-01-20 15:27:23 +01:00 · dcc07a7f68
commit dcc07a7f68
parent 943cbd1bc6 2eba71d95b
4 changed files with 282 additions and 7 deletions
--- a/src/dom/configuration.h
+++ b/src/dom/configuration.h
@ -63,8 +63,9 @@ enum dom_config_flag {
 	/** "normalize-whitespace"
 	 *
-	 * If false (default) nothing is done, else all nodes are discarded
+	 * If false (default) nothing is done, else all text nodes are
-	 * once they have been traversed. */
+	 * normalized so that sequences of space characters are changed to
 	 * being only a single space. */
 	DOM_CONFIG_NORMALIZE_WHITESPACE = 64,
 };
--- a/src/dom/stack.c
+++ b/src/dom/stack.c
@ -420,7 +420,7 @@ walk_dom_nodes(struct dom_stack *stack, struct dom_node *root)
 		if (is_dom_node_list_member(list, wstate->index)) {
 			struct dom_node *child = list->entries[wstate->index++];
-			if (push_dom_node(stack, child))
+			if (push_dom_node(stack, child) == DOM_STACK_CODE_OK)
 				continue;
 		}
--- a/src/dom/test/sgml-parser.c
+++ b/src/dom/test/sgml-parser.c
@ -11,6 +11,7 @@
 #include "elinks.h"
 #include "dom/configuration.h"
 #include "dom/node.h"
 #include "dom/sgml/parser.h"
 #include "dom/stack.h"
@ -260,7 +261,10 @@ main(int argc, char *argv[])
 	struct sgml_parser *parser;
 	enum sgml_document_type doctype = SGML_DOCTYPE_HTML;
 	enum sgml_parser_flag flags = 0;
 	enum sgml_parser_type type = SGML_PARSER_STREAM;
 	enum sgml_parser_code code = 0;
 	enum dom_config_flag normalize_flags = 0;
 	int normalize = 0;
 	int complete = 1;
 	struct dom_string uri = INIT_DOM_STRING("dom://test", -1);
 	struct dom_string source = INIT_DOM_STRING("(no source)", -1);
@ -298,6 +302,20 @@ main(int argc, char *argv[])
 				set_dom_string(&source, argv[i], strlen(argv[i]));
 			}
 		} else if (!strncmp(arg, "normalize", 9)) {
 			arg += 9;
 			if (*arg == '=') {
 				arg++;
 			} else {
 				i++;
 				if (i >= argc)
 					die("--normalize expects a string");
 				arg = argv[i];
 			}
 			normalize = 1;
 			normalize_flags = parse_dom_config(arg, ',');
 			type = SGML_PARSER_TREE;
 		} else if (!strcmp(arg, "print-lines")) {
 			flags |= SGML_PARSER_COUNT_LINES;
@ -316,10 +334,13 @@ main(int argc, char *argv[])
 		}
 	}
-	parser = init_sgml_parser(SGML_PARSER_STREAM, doctype, &uri, flags);
+	parser = init_sgml_parser(type, doctype, &uri, flags);
 	if (!parser) return 1;
 	parser->error_func = sgml_error_function;
 	if (normalize)
 		add_dom_config_normalizer(&parser->stack, normalize_flags);
 	else
 		add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info);
 	code = parse_sgml(parser, source.string, source.length, complete);
@ -330,13 +351,30 @@ main(int argc, char *argv[])
 		get_dom_stack_state(&parser->stack, root_offset)->immutable = 0;
 		/* For SGML_PARSER_STREAM this will free the DOM
 		 * root node. */
 		while (!dom_stack_is_empty(&parser->stack))
 			pop_dom_node(&parser->stack);
 		if (normalize) {
 			struct dom_stack stack;
 			/* Note that we cannot free nodes while walking the DOM
 			 * tree, since walk_dom_nodes() uses an index to traverse
 			 * the tree. */
 			init_dom_stack(&stack, DOM_STACK_FLAG_NONE);
 			/* XXX: This context needs to be added first because it
 			 * assumes the parser can be accessed via
 			 * stack->contexts[0].data. */
 			add_dom_stack_context(&stack, parser, &sgml_parser_test_context_info);
 			walk_dom_nodes(&stack, parser->root);
 			done_dom_stack(&stack);
 			done_dom_node(parser->root);
 		}
 	}
 	done_sgml_parser(parser);
 #ifdef DEBUG_MEMLEAK
 	check_memory_leaks();
 #endif
 	return code;
 }
--- a/src/dom/test/test-dom-configuration-basic
+++ b/src/dom/test/test-dom-configuration-basic
@ -0,0 +1,236 @@
 #!/bin/sh
 #
 # Copyright (c) 2005 Jonas Fonseca
 #
 # Human-readable summary of this test script.
 # NOTE(review): presumably read by the test library sourced below -- confirm.
 test_description='Test the DOM configuration module
 This test checks that the normalization performed by the DOM configuration
 is done correctly.
 '
 # Pull in the shared test harness; its path is supplied via $TEST_LIB.
 # NOTE(review): test_expect_success and test_done are used below but not
 # defined in this file, so they presumably come from here -- confirm.
 . "$TEST_LIB"
 # test_normalize_output_equals DESC CONFIG SRC OUT
 #
 # Run sgml-parser on SRC with "--normalize CONFIG" and check that the dump
 # it prints matches OUT.
 #
 #   DESC    test description; also used to derive the document URI
 #   CONFIG  comma separated list of DOM configuration flags
 #   SRC     SGML source handed to the parser via --src
 #   OUT     expected dump; its first line is dropped so that callers can
 #           open the quoted string with a newline
 #
 # Writes the files "output" and "expected" in the current directory.
 test_normalize_output_equals () {
 	desc="$1"; shift
 	config="$1"; shift
 	src="$1"; shift
 	out="$1"; shift
 	# Turn the description into a URI slug: strip a leading "[...]" tag,
 	# collapse runs of ':', '.', ',' and blanks into one dash, map
 	# underscores to dashes, lowercase, then drop anything that is not
 	# alphanumeric or a dash.
 	URI="test:$(echo "$desc" | sed '
 		s/^[ \t]*\[[^]]*\][ \t]*//;
 		s/[:., \t][:., \t]*/-/g;
 		s/_/-/g;
 		# *cough*
 		y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/;
 		s/[^a-zA-Z0-9-]//g;')"
 	# Strip the two leading spaces from every line of the parser's dump
 	# before comparing it with the expected text.
 	sgml-parser --normalize "$config" --uri "$URI" --src "$src" | sed 's/^  //' > output
 	echo "#document: $URI" > expected
 	# printf rather than echo: the expected text must reach sed verbatim,
 	# and some echo implementations interpret backslash sequences.
 	printf '%s\n' "$out" | sed -n '2,$p' >> expected
 	test_expect_success "$desc" 'cmp output expected'
 }
 ## Config strings ###########################################################
 # Comma separated DOM configuration flag lists passed to --normalize.
 # NOOP lists all four flags; NOCOMMENTS, CDATA2TEXT, ENTITIES and NOWSTEXT
 # each leave out exactly one flag (comments, cdata-sections, entities and
 # element-content-whitespace respectively); NORM1 lists none.
 NOOP='cdata-sections,comments,element-content-whitespace,entities'
 NOCOMMENTS='cdata-sections,element-content-whitespace,entities'
 CDATA2TEXT='comments,element-content-whitespace,entities'
 ENTITIES='cdata-sections,comments,element-content-whitespace'
 NOWSTEXT='cdata-sections,comments,entities'
 NORM1=''
 ## No-ops ###################################################################
 # With every flag listed ($NOOP) the expected dumps keep text, CDATA section
 # and comment nodes as separate nodes with their whitespace untouched.
 test_normalize_output_equals \
 'Normalization no-op.' \
 "$NOOP" \
 '<roswell>   <![CDATA[|  that  |]]><!-- ends -->  well</roswell>' \
 '
 element: roswell
  #text:    
  #cdata-section: |  that  |
  #comment:  ends 
  #text:   well'
 test_normalize_output_equals \
 'Keep comments.' \
 "$NOOP" \
 'and<!-- comment:1 -->or<!-- comment:2 -->' \
 '
 #text: and
 #comment:  comment:1 
 #text: or
 #comment:  comment:2 '
 test_normalize_output_equals \
 'Keep CDATA sections ' \
 "$NOOP" \
 '<![CDATA[and]]>or<![CDATA[maybe]]>' \
 '
 #cdata-section: and
 #text: or
 #cdata-section: maybe'
 ## Comments #################################################################
 # $NOCOMMENTS leaves out the 'comments' flag: no #comment node is expected
 # in the dumps, and in (III) the text on both sides of the removed comment
 # is expected as a single #text node.
 test_normalize_output_equals \
 'Remove comments. (I)' \
 "$NOCOMMENTS" \
 "<no><!-- comment -->?</no>" \
 '
 element: no
  #text: ?'
 test_normalize_output_equals \
 'Remove comments. (II)' \
 "$NOCOMMENTS" \
 '<!-- comment:1 -->and<!-- comment:2 -->' \
 '
 #text: and'
 test_normalize_output_equals \
 'Remove comments. (III)' \
 "$NOCOMMENTS" \
 'nothing to see <!-- comment -->here' \
 '
 #text: nothing to see here'
 ## Entities #################################################################
 # Entities should be shown 'verbatim' here after expansion.
 # $ENTITIES leaves out the 'entities' flag: the expected dumps show entity
 # references inline in the surrounding #text node rather than as separate
 # entity-reference nodes. Note that (II) expects the unterminated
 # '&bad-entity' to be printed with a closing ';'.
 test_normalize_output_equals \
 'Expand entities. (I)' \
 "$ENTITIES" \
 'a&lt;b&gt;c' \
 '
 #text: a&lt;b&gt;c'
 test_normalize_output_equals \
 'Expand entities. (II)' \
 "$ENTITIES" \
 '&bad-entity&good-entity;' \
 '
 #text: &bad-entity;&good-entity;'
 test_normalize_output_equals \
 'Expand entities. (III)' \
 "$ENTITIES" \
 '<a>&b;</a>' \
 '
 element: a
  #text: &b;'
 ## CDATA Sections ###########################################################
 # $CDATA2TEXT leaves out the 'cdata-sections' flag: the expected dumps show
 # CDATA content as #text, joined with any neighbouring text into one node.
 test_normalize_output_equals \
 'Replace CDATA section with text. (I)' \
 "$CDATA2TEXT" \
 '<![CDATA[a small text snippet]]>' \
 '
 #text: a small text snippet'
 test_normalize_output_equals \
 'Replace CDATA section with text. (II)' \
 "$CDATA2TEXT" \
 '<![CDATA[a small]]> <![CDATA[text snippet]]>' \
 '
 #text: a small text snippet'
 test_normalize_output_equals \
 'Replace CDATA section with text. (III)' \
 "$CDATA2TEXT" \
 'before <![CDATA[and]]> after' \
 '
 #text: before and after'
 ## Element Content Whitespace ###############################################
 # $NOWSTEXT leaves out the 'element-content-whitespace' flag: the expected
 # dumps contain no whitespace-only #text nodes.
 test_normalize_output_equals \
 'Remove element content whitespace. (I)' \
 "$NOWSTEXT" \
 '<a>
 	<b>some text</b>
 </a>' \
 '
 element: a
  element: b
    #text: some text'
 # I haven't read the specs about this thing, for now it just blasts all
 # space-only text nodes. Probably not the wanted behaviour all of the time.
 # --jonas
 test_normalize_output_equals \
 'Remove element content whitespace. (II)' \
 "$NOWSTEXT" \
 '<e>space between &this; &that; gets removed</e>' \
 '
 element: e
  #text: space between 
  entity-reference: this
  entity-reference: that
  #text:  gets removed'
 ## Mixes ####################################################################
 # $NORM1 is the empty flag list: the expected dumps are a single #text node
 # with comments gone, CDATA content merged in, entity references kept
 # inline and (in II) the whitespace-only text dropped.
 test_normalize_output_equals \
 'Normalization mix #1. (I)' \
 "$NORM1" \
 'before <![CDATA[and]]> after &some;<!--comments--> remain' \
 '
 #text: before and after &some; remain'
 test_normalize_output_equals \
 'Normalization mix #1. (II)' \
 "$NORM1" \
 '<!--a-->b&c;  <![CDATA[d]]>' \
 '
 #text: b&c;d'
 ## Special ELinks Extensions ################################################
 # These tests append an extra flag ('unknown' or 'normalize-whitespace') to
 # one of the base flag lists.
 #
 # Each invocation must end on its closing quote with no trailing backslash:
 # a continuation there would glue the next command onto this one as extra
 # arguments, so the following tests and test_done would never run.
 test_normalize_output_equals \
 'Remove unknown (HTML) elements and attributes. (I)' \
 "$NOOP,unknown" \
 '<html wack="..."><title w00t="...">where?<doit>here!</doit></title></html>' \
 '
 element: html
  element: title
    #text: where?'
 test_normalize_output_equals \
 'Remove unknown (HTML) elements and attributes. (II)' \
 "$NOOP,unknown" \
 '<x y=""><z></z></x> aint no HTML' \
 '
 #text:  aint no HTML'
 test_normalize_output_equals \
 'Normalize whitespace. (I)' \
 "$NOOP,normalize-whitespace" \
 'Here    is      a
 lot of     useless	space.' \
 '
 #text: Here is a lot of useless space.'
 test_normalize_output_equals \
 'Normalize whitespace. (II)' \
 "$CDATA2TEXT,normalize-whitespace" \
 'Could we <![CDATA[    please     ]]> read that again?' \
 '
 #text: Could we please read that again?'
 test_done