1
0
mirror of https://github.com/rkd77/elinks.git synced 2025-01-03 14:57:44 -05:00

Add support for testing normalization using the DOM configuration module

This commit is contained in:
Jonas Fonseca 2006-01-20 02:08:46 +01:00 committed by Jonas Fonseca
parent cc61578fcb
commit 2eba71d95b
2 changed files with 278 additions and 4 deletions

View File

@ -11,6 +11,7 @@
#include "elinks.h" #include "elinks.h"
#include "dom/configuration.h"
#include "dom/node.h" #include "dom/node.h"
#include "dom/sgml/parser.h" #include "dom/sgml/parser.h"
#include "dom/stack.h" #include "dom/stack.h"
@ -260,7 +261,10 @@ main(int argc, char *argv[])
struct sgml_parser *parser; struct sgml_parser *parser;
enum sgml_document_type doctype = SGML_DOCTYPE_HTML; enum sgml_document_type doctype = SGML_DOCTYPE_HTML;
enum sgml_parser_flag flags = 0; enum sgml_parser_flag flags = 0;
enum sgml_parser_type type = SGML_PARSER_STREAM;
enum sgml_parser_code code = 0; enum sgml_parser_code code = 0;
enum dom_config_flag normalize_flags = 0;
int normalize = 0;
int complete = 1; int complete = 1;
struct dom_string uri = INIT_DOM_STRING("dom://test", -1); struct dom_string uri = INIT_DOM_STRING("dom://test", -1);
struct dom_string source = INIT_DOM_STRING("(no source)", -1); struct dom_string source = INIT_DOM_STRING("(no source)", -1);
@ -298,6 +302,20 @@ main(int argc, char *argv[])
set_dom_string(&source, argv[i], strlen(argv[i])); set_dom_string(&source, argv[i], strlen(argv[i]));
} }
} else if (!strncmp(arg, "normalize", 9)) {
arg += 9;
if (*arg == '=') {
arg++;
} else {
i++;
if (i >= argc)
die("--normalize expects a string");
arg = argv[i];
}
normalize = 1;
normalize_flags = parse_dom_config(arg, ',');
type = SGML_PARSER_TREE;
} else if (!strcmp(arg, "print-lines")) { } else if (!strcmp(arg, "print-lines")) {
flags |= SGML_PARSER_COUNT_LINES; flags |= SGML_PARSER_COUNT_LINES;
@ -316,10 +334,13 @@ main(int argc, char *argv[])
} }
} }
parser = init_sgml_parser(SGML_PARSER_STREAM, doctype, &uri, flags); parser = init_sgml_parser(type, doctype, &uri, flags);
if (!parser) return 1; if (!parser) return 1;
parser->error_func = sgml_error_function; parser->error_func = sgml_error_function;
if (normalize)
add_dom_config_normalizer(&parser->stack, normalize_flags);
else
add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info); add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info);
code = parse_sgml(parser, source.string, source.length, complete); code = parse_sgml(parser, source.string, source.length, complete);
@ -330,13 +351,30 @@ main(int argc, char *argv[])
get_dom_stack_state(&parser->stack, root_offset)->immutable = 0; get_dom_stack_state(&parser->stack, root_offset)->immutable = 0;
/* For SGML_PARSER_STREAM this will free the DOM
* root node. */
while (!dom_stack_is_empty(&parser->stack)) while (!dom_stack_is_empty(&parser->stack))
pop_dom_node(&parser->stack); pop_dom_node(&parser->stack);
if (normalize) {
struct dom_stack stack;
/* Note that we cannot free nodes while walking the DOM
* tree, since walk_dom_nodes() uses an index to traverse
* the tree. */
init_dom_stack(&stack, DOM_STACK_FLAG_NONE);
/* XXX: This context needs to be added first because it
* assumes the parser can be accessed via
* stack->contexts[0].data. */
add_dom_stack_context(&stack, parser, &sgml_parser_test_context_info);
walk_dom_nodes(&stack, parser->root);
done_dom_stack(&stack);
done_dom_node(parser->root);
}
} }
done_sgml_parser(parser); done_sgml_parser(parser);
#ifdef DEBUG_MEMLEAK
check_memory_leaks();
#endif
return code; return code;
} }

View File

@ -0,0 +1,236 @@
#!/bin/sh
#
# Copyright (c) 2005 Jonas Fonseca
#
# Human-readable description of this test script. It is not used anywhere
# else in this file; presumably the library sourced below reads it -- TODO
# confirm.
test_description='Test the DOM configuration module
This test checks that the normalization performed by the DOM configuration
is done correctly.
'
# Source the file named by $TEST_LIB. test_expect_success and test_done are
# called below but not defined in this script, so they presumably come from
# that library.
. "$TEST_LIB"
# test_normalize_output_equals <desc> <config> <src> <out>
#
# Run sgml-parser on <src> with --normalize <config> and register a test,
# named <desc>, that succeeds when the filtered parser output is identical
# (per cmp) to the expected output.
#
#   desc    Test description; also reduced to a lower-case, dash-separated
#           slug that is used as the document URI ("test:<slug>").
#   config  Comma-separated string passed to --normalize.
#   src     Source text passed to --src.
#   out     Expected output. Its first line is discarded (callers start the
#           literal with a newline) and a "#document: <URI>" line is written
#           in its place.
#
# Side effects: sets the shell variables desc, config, src, out and URI, and
# overwrites the files "output" and "expected" in the current directory.
test_normalize_output_equals () {
	desc="$1"; shift
	config="$1"; shift
	src="$1"; shift
	out="$1"; shift

	# printf is used instead of echo so that backslashes or a leading dash
	# in the description can never be interpreted by the shell's echo.
	URI="test:$(printf '%s\n' "$desc" | sed '
		s/^[ \t]*\[[^]]*\][ \t]*//;
		s/[:., \t][:., \t]*/-/g;
		s/_/-/g;
		# *cough*
		y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/;
		s/[^a-zA-Z0-9-]//g;')"

	# Each option is passed exactly once; --src used to be given twice with
	# the same value.
	sgml-parser --src "$src" --normalize "$config" --uri "$URI" | sed 's/^ //' > output

	echo "#document: $URI" > expected
	printf '%s\n' "$out" | sed -n '2,$p' >> expected

	test_expect_success "$desc" 'cmp output expected'
}
## Config strings ###########################################################
# Comma-separated parameter lists handed to sgml-parser via --normalize.
# NOOP lists all four parameters used here; each of the next four strings is
# NOOP with exactly one parameter left out, and NORM1 leaves out all of them.
# Judging by the expected outputs below, omitting a parameter enables the
# corresponding normalization -- confirm against the DOM configuration module.
#
# The parameter is spelled "cdata-sections" (plural) in every string; the
# singular form previously used in ENTITIES and NOWSTEXT did not match the
# spelling used by NOOP and NOCOMMENTS.
NOOP='cdata-sections,comments,element-content-whitespace,entities'
NOCOMMENTS='cdata-sections,element-content-whitespace,entities'
CDATA2TEXT='comments,element-content-whitespace,entities'
ENTITIES='cdata-sections,comments,element-content-whitespace'
NOWSTEXT='cdata-sections,comments,entities'
NORM1=''
## No-ops ###################################################################
# All three cases pass $NOOP and expect every node from the source (text,
# CDATA section and comment) to still be present in the output.
# The literals are compared byte-for-byte, so whitespace inside them matters.

# Text, CDATA section and comment nodes nested inside an element.
test_normalize_output_equals \
'Normalization no-op.' \
"$NOOP" \
'<roswell> <![CDATA[| that |]]><!-- ends --> well</roswell>' \
'
element: roswell
#text:
#cdata-section: | that |
#comment: ends
#text: well'
# Comments interleaved with text at the top level.
test_normalize_output_equals \
'Keep comments.' \
"$NOOP" \
'and<!-- comment:1 -->or<!-- comment:2 -->' \
'
#text: and
#comment: comment:1
#text: or
#comment: comment:2 '
# CDATA sections interleaved with text at the top level.
test_normalize_output_equals \
'Keep CDATA sections ' \
"$NOOP" \
'<![CDATA[and]]>or<![CDATA[maybe]]>' \
'
#cdata-section: and
#text: or
#cdata-section: maybe'
## Comments #################################################################
# These cases pass $NOCOMMENTS (no "comments" parameter) and expect that no
# #comment node appears in the output.

# Comment inside an element, followed by text.
test_normalize_output_equals \
'Remove comments. (I)' \
"$NOCOMMENTS" \
"<no><!-- comment -->?</no>" \
'
element: no
#text: ?'
# Comments on both sides of a top-level text node.
test_normalize_output_equals \
'Remove comments. (II)' \
"$NOCOMMENTS" \
'<!-- comment:1 -->and<!-- comment:2 -->' \
'
#text: and'
# Comment between two pieces of text; a single #text node is expected.
test_normalize_output_equals \
'Remove comments. (III)' \
"$NOCOMMENTS" \
'nothing to see <!-- comment -->here' \
'
#text: nothing to see here'
## Entities #################################################################
# Entities should be shown 'verbatim' here after expansion.
# These cases pass $ENTITIES (no "entities" parameter) and expect the entity
# references to appear inside #text nodes rather than as separate nodes.

# Two references embedded in text.
test_normalize_output_equals \
'Expand entities. (I)' \
"$ENTITIES" \
'a&lt;b&gt;c' \
'
#text: a&lt;b&gt;c'
# The first reference lacks its terminating ";" in the source; the expected
# output shows it with one.
test_normalize_output_equals \
'Expand entities. (II)' \
"$ENTITIES" \
'&bad-entity&good-entity;' \
'
#text: &bad-entity;&good-entity;'
# A reference as the only content of an element.
test_normalize_output_equals \
'Expand entities. (III)' \
"$ENTITIES" \
'<a>&b;</a>' \
'
element: a
#text: &b;'
## CDATA Sections ###########################################################
# These cases pass $CDATA2TEXT (no "cdata-sections" parameter) and expect the
# CDATA content to show up as #text, merged with any adjacent text.

# A lone CDATA section.
test_normalize_output_equals \
'Replace CDATA section with text. (I)' \
"$CDATA2TEXT" \
'<![CDATA[a small text snippet]]>' \
'
#text: a small text snippet'
# Two CDATA sections separated by a space; one #text node is expected.
test_normalize_output_equals \
'Replace CDATA section with text. (II)' \
"$CDATA2TEXT" \
'<![CDATA[a small]]> <![CDATA[text snippet]]>' \
'
#text: a small text snippet'
# A CDATA section surrounded by text; one #text node is expected.
test_normalize_output_equals \
'Replace CDATA section with text. (III)' \
"$CDATA2TEXT" \
'before <![CDATA[and]]> after' \
'
#text: before and after'
## Element Content Whitespace ###############################################
# These cases pass $NOWSTEXT (no "element-content-whitespace" parameter) and
# expect whitespace-only text nodes to be absent from the output.

# Whitespace-only text around a nested element. The command ends at the
# closing quote of the expected output: no trailing backslash, so the comment
# that follows can never be spliced onto the command line.
test_normalize_output_equals \
'Remove element content whitespace. (I)' \
"$NOWSTEXT" \
'<a>
<b>some text</b>
</a>' \
'
element: a
element: b
#text: some text'

# I haven't read the specs about this thing, for now it just blasts all
# space-only text nodes. Probably not the wanted behaviour all of the time.
# --jonas
test_normalize_output_equals \
'Remove element content whitespace. (II)' \
"$NOWSTEXT" \
'<e>space between &this; &that; gets removed</e>' \
'
element: e
#text: space between
entity-reference: this
entity-reference: that
#text: gets removed'
## Mixes ####################################################################
# These cases pass $NORM1, the empty config string, and expect a single #text
# node: no comment or CDATA section nodes, and entity references kept inline.

# Text, CDATA, an entity reference and a comment in one top-level run.
test_normalize_output_equals \
'Normalization mix #1. (I)' \
"$NORM1" \
'before <![CDATA[and]]> after &some;<!--comments--> remain' \
'
#text: before and after &some; remain'
# Same mix, starting with a comment and ending with a CDATA section.
test_normalize_output_equals \
'Normalization mix #1. (II)' \
"$NORM1" \
'<!--a-->b&c; <![CDATA[d]]>' \
'
#text: b&c;d'
## Special ELinks Extensions ################################################
# These cases append an extra parameter ("unknown" or "normalize-whitespace")
# to one of the config strings above.
#
# Every invocation ends at the closing quote of its expected output. A
# trailing backslash there would join the next line onto the command, turning
# the following invocation (or test_done) into extra arguments instead of a
# command of its own.

# Attributes and the element not known to HTML are expected to be dropped
# together with the unknown element's content.
test_normalize_output_equals \
'Remove unknown (HTML) elements and attributes. (I)' \
"$NOOP,unknown" \
'<html wack="..."><title w00t="...">where?<doit>here!</doit></title></html>' \
'
element: html
element: title
#text: where?'

# Only the trailing text is expected to remain.
test_normalize_output_equals \
'Remove unknown (HTML) elements and attributes. (II)' \
"$NOOP,unknown" \
'<x y=""><z></z></x> aint no HTML' \
'
#text: aint no HTML'

# A line break inside text is expected to come out as a single space.
test_normalize_output_equals \
'Normalize whitespace. (I)' \
"$NOOP,normalize-whitespace" \
'Here is a
lot of useless space.' \
'
#text: Here is a lot of useless space.'

# Whitespace normalization combined with CDATA-to-text conversion.
test_normalize_output_equals \
'Normalize whitespace. (II)' \
"$CDATA2TEXT,normalize-whitespace" \
'Could we <![CDATA[ please ]]> read that again?' \
'
#text: Could we please read that again?'

test_done