Merge with git+ssh://pasky.or.cz/srv/git/elinks.git

2025-05-18 00:48:57 -04:00 · 2006-01-20 15:27:23 +01:00 · 2006-01-20 15:27:23 +01:00 · dcc07a7f68
commit dcc07a7f68
parent 943cbd1bc6 2eba71d95b
4 changed files with 282 additions and 7 deletions
--- a/src/dom/configuration.h
+++ b/src/dom/configuration.h
@ -63,8 +63,9 @@ enum dom_config_flag {

 	/** "normalize-whitespace"
 	 *
-	 * If false (default) nothing is done, else all nodes are discarded
-	 * once they have been traversed. */
+	 * If false (default) nothing is done, else all text nodes are
+	 * normalized so that each sequence of whitespace characters is
+	 * collapsed into a single space. */
 	DOM_CONFIG_NORMALIZE_WHITESPACE = 64,
 };

--- a/src/dom/stack.c
+++ b/src/dom/stack.c
@ -420,7 +420,7 @@ walk_dom_nodes(struct dom_stack *stack, struct dom_node *root)
 		if (is_dom_node_list_member(list, wstate->index)) {
 			struct dom_node *child = list->entries[wstate->index++];

-			if (push_dom_node(stack, child))
+			if (push_dom_node(stack, child) == DOM_STACK_CODE_OK)
 				continue;
 		}

--- a/src/dom/test/sgml-parser.c
+++ b/src/dom/test/sgml-parser.c
@ -11,6 +11,7 @@

 #include "elinks.h"

+#include "dom/configuration.h"
 #include "dom/node.h"
 #include "dom/sgml/parser.h"
 #include "dom/stack.h"
@ -260,7 +261,10 @@ main(int argc, char *argv[])
 	struct sgml_parser *parser;
 	enum sgml_document_type doctype = SGML_DOCTYPE_HTML;
 	enum sgml_parser_flag flags = 0;
+	enum sgml_parser_type type = SGML_PARSER_STREAM;
 	enum sgml_parser_code code = 0;
+	enum dom_config_flag normalize_flags = 0;
+	int normalize = 0;
 	int complete = 1;
 	struct dom_string uri = INIT_DOM_STRING("dom://test", -1);
 	struct dom_string source = INIT_DOM_STRING("(no source)", -1);
@ -298,6 +302,20 @@ main(int argc, char *argv[])
 				set_dom_string(&source, argv[i], strlen(argv[i]));
 			}

+		} else if (!strncmp(arg, "normalize", 9)) {
+			arg += 9;
+			if (*arg == '=') {
+				arg++;
+			} else {
+				i++;
+				if (i >= argc)
+					die("--normalize expects a string");
+				arg = argv[i];
+			}
+			normalize = 1;
+			normalize_flags = parse_dom_config(arg, ',');
+			type = SGML_PARSER_TREE;
+
 		} else if (!strcmp(arg, "print-lines")) {
 			flags |= SGML_PARSER_COUNT_LINES;

@ -316,11 +334,14 @@ main(int argc, char *argv[])
 		}
 	}

-	parser = init_sgml_parser(SGML_PARSER_STREAM, doctype, &uri, flags);
+	parser = init_sgml_parser(type, doctype, &uri, flags);
 	if (!parser) return 1;

 	parser->error_func = sgml_error_function;
-	add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info);
+	if (normalize)
+		add_dom_config_normalizer(&parser->stack, normalize_flags);
+	else
+		add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info);

 	code = parse_sgml(parser, source.string, source.length, complete);
 	if (parser->root) {
@ -330,13 +351,30 @@ main(int argc, char *argv[])

 		get_dom_stack_state(&parser->stack, root_offset)->immutable = 0;

-		/* For SGML_PARSER_STREAM this will free the DOM
-		 * root node. */
 		while (!dom_stack_is_empty(&parser->stack))
 			pop_dom_node(&parser->stack);
+
+		if (normalize) {
+			struct dom_stack stack;
+
+			/* Note that we cannot free nodes while walking the DOM
+			 * tree, since walk_dom_nodes() uses an index to traverse
+			 * the tree. */
+			init_dom_stack(&stack, DOM_STACK_FLAG_NONE);
+			/* XXX: This context needs to be added first because it
+			 * assumes the parser can be accessed via
+			 * stack->contexts[0].data. */
+			add_dom_stack_context(&stack, parser, &sgml_parser_test_context_info);
+			walk_dom_nodes(&stack, parser->root);
+			done_dom_stack(&stack);
+			done_dom_node(parser->root);
+		}
 	}

 	done_sgml_parser(parser);
+#ifdef DEBUG_MEMLEAK
+	check_memory_leaks();
+#endif

 	return code;
 }
--- a/src/dom/test/test-dom-configuration-basic
+++ b/src/dom/test/test-dom-configuration-basic
@ -0,0 +1,236 @@
+#!/bin/sh
+#
+# Copyright (c) 2005 Jonas Fonseca
+#
+
+test_description='Test the DOM configuration module
+
+This test checks that the normalization performed by the DOM configuration
+is done correctly.
+'
+
+. "$TEST_LIB"
+
+test_normalize_output_equals () {
+	# Args: description, config string, SGML source, expected node dump.
+	desc="$1"; shift
+	config="$1"; shift
+	src="$1"; shift
+	out="$1"; shift
+	URI="test:$(echo "$desc" | sed '
+		s/^[ \t]*\[[^]]*\][ \t]*//;
+		s/[:., \t][:., \t]*/-/g;
+		s/_/-/g;
+		# Fold uppercase letters to lowercase.
+		y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/;
+		s/[^a-zA-Z0-9-]//g;')"
+	# Strip two leading spaces from every line the parser prints.
+	sgml-parser --src "$src" --normalize "$config" --uri "$URI" | sed 's/^  //' > output
+	echo "#document: $URI" > expected
+	# Callers start $out with a newline, so skip its empty first line.
+	echo "$out" | sed -n '2,$p' >> expected
+	test_expect_success "$desc" 'cmp output expected'
+}
+
+
+## Config strings ###########################################################
+
+NOOP='cdata-sections,comments,element-content-whitespace,entities'	# all four listed
+NOCOMMENTS='cdata-sections,element-content-whitespace,entities'	# omits comments
+CDATA2TEXT='comments,element-content-whitespace,entities'	# omits cdata-sections
+ENTITIES='cdata-sections,comments,element-content-whitespace'	# omits entities
+NOWSTEXT='cdata-sections,comments,entities'	# omits element-content-whitespace
+NORM1=''	# nothing listed
+
+
+## No-ops ###################################################################
+
+test_normalize_output_equals \
+'Normalization no-op.' \
+"$NOOP" \
+'<roswell>   <![CDATA[|  that  |]]><!-- ends -->  well</roswell>' \
+'
+element: roswell
+  #text:    
+  #cdata-section: |  that  |
+  #comment:  ends 
+  #text:   well'
+
+test_normalize_output_equals \
+'Keep comments.' \
+"$NOOP" \
+'and<!-- comment:1 -->or<!-- comment:2 -->' \
+'
+#text: and
+#comment:  comment:1 
+#text: or
+#comment:  comment:2 '
+
+test_normalize_output_equals \
+'Keep CDATA sections ' \
+"$NOOP" \
+'<![CDATA[and]]>or<![CDATA[maybe]]>' \
+'
+#cdata-section: and
+#text: or
+#cdata-section: maybe'
+
+
+## Comments #################################################################
+
+test_normalize_output_equals \
+'Remove comments. (I)' \
+"$NOCOMMENTS" \
+"<no><!-- comment -->?</no>" \
+'
+element: no
+  #text: ?'
+
+test_normalize_output_equals \
+'Remove comments. (II)' \
+"$NOCOMMENTS" \
+'<!-- comment:1 -->and<!-- comment:2 -->' \
+'
+#text: and'
+
+test_normalize_output_equals \
+'Remove comments. (III)' \
+"$NOCOMMENTS" \
+'nothing to see <!-- comment -->here' \
+'
+#text: nothing to see here'
+
+
+## Entities #################################################################
+
+# Entities should be shown 'verbatim' here after expansion.
+
+test_normalize_output_equals \
+'Expand entities. (I)' \
+"$ENTITIES" \
+'a&lt;b&gt;c' \
+'
+#text: a&lt;b&gt;c'
+
+test_normalize_output_equals \
+'Expand entities. (II)' \
+"$ENTITIES" \
+'&bad-entity&good-entity;' \
+'
+#text: &bad-entity;&good-entity;'
+
+test_normalize_output_equals \
+'Expand entities. (III)' \
+"$ENTITIES" \
+'<a>&b;</a>' \
+'
+element: a
+  #text: &b;'
+
+
+## CDATA Sections ###########################################################
+
+test_normalize_output_equals \
+'Replace CDATA section with text. (I)' \
+"$CDATA2TEXT" \
+'<![CDATA[a small text snippet]]>' \
+'
+#text: a small text snippet'
+
+test_normalize_output_equals \
+'Replace CDATA section with text. (II)' \
+"$CDATA2TEXT" \
+'<![CDATA[a small]]> <![CDATA[text snippet]]>' \
+'
+#text: a small text snippet'
+
+test_normalize_output_equals \
+'Replace CDATA section with text. (III)' \
+"$CDATA2TEXT" \
+'before <![CDATA[and]]> after' \
+'
+#text: before and after'
+
+
+## Element Content Whitespace ###############################################
+
+test_normalize_output_equals \
+'Remove element content whitespace. (I)' \
+"$NOWSTEXT" \
+'<a>
+	<b>some text</b>
+</a>' \
+'
+element: a
+  element: b
+    #text: some text' \
+
+# I haven't read the specs about this thing, for now it just blasts all
+# space-only text nodes. Probably not the wanted behaviour all of the time.
+# --jonas
+test_normalize_output_equals \
+'Remove element content whitespace. (II)' \
+"$NOWSTEXT" \
+'<e>space between &this; &that; gets removed</e>' \
+'
+element: e
+  #text: space between 
+  entity-reference: this
+  entity-reference: that
+  #text:  gets removed'
+
+
+## Mixes ####################################################################
+
+test_normalize_output_equals \
+'Normalization mix #1. (I)' \
+"$NORM1" \
+'before <![CDATA[and]]> after &some;<!--comments--> remain' \
+'
+#text: before and after &some; remain'
+
+test_normalize_output_equals \
+'Normalization mix #1. (II)' \
+"$NORM1" \
+'<!--a-->b&c;  <![CDATA[d]]>' \
+'
+#text: b&c;d'
+
+
+## Special ELinks Extensions ################################################
+
+test_normalize_output_equals \
+'Remove unknown (HTML) elements and attributes. (I)' \
+"$NOOP,unknown" \
+'<html wack="..."><title w00t="...">where?<doit>here!</doit></title></html>' \
+'
+element: html
+  element: title
+    #text: where?' \
+
+test_normalize_output_equals \
+'Remove unknown (HTML) elements and attributes. (II)' \
+"$NOOP,unknown" \
+'<x y=""><z></z></x> aint no HTML' \
+'
+#text:  aint no HTML' \
+
+test_normalize_output_equals \
+'Normalize whitespace. (I)' \
+"$NOOP,normalize-whitespace" \
+'Here    is      a
+
+
+
+lot of     useless	space.' \
+'
+#text: Here is a lot of useless space.' \
+
+test_normalize_output_equals \
+'Normalize whitespace. (II)' \
+"$CDATA2TEXT,normalize-whitespace" \
+'Could we <![CDATA[    please     ]]> read that again?' \
+'
+#text: Could we please read that again?' \
+
+test_done