mirror of
https://github.com/rkd77/elinks.git
synced 2025-01-03 14:57:44 -05:00
Add support for testing normalization using the DOM configuration module
This commit is contained in:
parent
cc61578fcb
commit
2eba71d95b
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
#include "elinks.h"
|
#include "elinks.h"
|
||||||
|
|
||||||
|
#include "dom/configuration.h"
|
||||||
#include "dom/node.h"
|
#include "dom/node.h"
|
||||||
#include "dom/sgml/parser.h"
|
#include "dom/sgml/parser.h"
|
||||||
#include "dom/stack.h"
|
#include "dom/stack.h"
|
||||||
@ -260,7 +261,10 @@ main(int argc, char *argv[])
|
|||||||
struct sgml_parser *parser;
|
struct sgml_parser *parser;
|
||||||
enum sgml_document_type doctype = SGML_DOCTYPE_HTML;
|
enum sgml_document_type doctype = SGML_DOCTYPE_HTML;
|
||||||
enum sgml_parser_flag flags = 0;
|
enum sgml_parser_flag flags = 0;
|
||||||
|
enum sgml_parser_type type = SGML_PARSER_STREAM;
|
||||||
enum sgml_parser_code code = 0;
|
enum sgml_parser_code code = 0;
|
||||||
|
enum dom_config_flag normalize_flags = 0;
|
||||||
|
int normalize = 0;
|
||||||
int complete = 1;
|
int complete = 1;
|
||||||
struct dom_string uri = INIT_DOM_STRING("dom://test", -1);
|
struct dom_string uri = INIT_DOM_STRING("dom://test", -1);
|
||||||
struct dom_string source = INIT_DOM_STRING("(no source)", -1);
|
struct dom_string source = INIT_DOM_STRING("(no source)", -1);
|
||||||
@ -298,6 +302,20 @@ main(int argc, char *argv[])
|
|||||||
set_dom_string(&source, argv[i], strlen(argv[i]));
|
set_dom_string(&source, argv[i], strlen(argv[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else if (!strncmp(arg, "normalize", 9)) {
|
||||||
|
arg += 9;
|
||||||
|
if (*arg == '=') {
|
||||||
|
arg++;
|
||||||
|
} else {
|
||||||
|
i++;
|
||||||
|
if (i >= argc)
|
||||||
|
die("--normalize expects a string");
|
||||||
|
arg = argv[i];
|
||||||
|
}
|
||||||
|
normalize = 1;
|
||||||
|
normalize_flags = parse_dom_config(arg, ',');
|
||||||
|
type = SGML_PARSER_TREE;
|
||||||
|
|
||||||
} else if (!strcmp(arg, "print-lines")) {
|
} else if (!strcmp(arg, "print-lines")) {
|
||||||
flags |= SGML_PARSER_COUNT_LINES;
|
flags |= SGML_PARSER_COUNT_LINES;
|
||||||
|
|
||||||
@ -316,10 +334,13 @@ main(int argc, char *argv[])
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
parser = init_sgml_parser(SGML_PARSER_STREAM, doctype, &uri, flags);
|
parser = init_sgml_parser(type, doctype, &uri, flags);
|
||||||
if (!parser) return 1;
|
if (!parser) return 1;
|
||||||
|
|
||||||
parser->error_func = sgml_error_function;
|
parser->error_func = sgml_error_function;
|
||||||
|
if (normalize)
|
||||||
|
add_dom_config_normalizer(&parser->stack, normalize_flags);
|
||||||
|
else
|
||||||
add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info);
|
add_dom_stack_context(&parser->stack, NULL, &sgml_parser_test_context_info);
|
||||||
|
|
||||||
code = parse_sgml(parser, source.string, source.length, complete);
|
code = parse_sgml(parser, source.string, source.length, complete);
|
||||||
@ -330,13 +351,30 @@ main(int argc, char *argv[])
|
|||||||
|
|
||||||
get_dom_stack_state(&parser->stack, root_offset)->immutable = 0;
|
get_dom_stack_state(&parser->stack, root_offset)->immutable = 0;
|
||||||
|
|
||||||
/* For SGML_PARSER_STREAM this will free the DOM
|
|
||||||
* root node. */
|
|
||||||
while (!dom_stack_is_empty(&parser->stack))
|
while (!dom_stack_is_empty(&parser->stack))
|
||||||
pop_dom_node(&parser->stack);
|
pop_dom_node(&parser->stack);
|
||||||
|
|
||||||
|
if (normalize) {
|
||||||
|
struct dom_stack stack;
|
||||||
|
|
||||||
|
/* Note, that we cannot free nodes when walking the DOM
|
||||||
|
* tree since walk_dom_node() uses an index to traverse
|
||||||
|
* the tree. */
|
||||||
|
init_dom_stack(&stack, DOM_STACK_FLAG_NONE);
|
||||||
|
/* XXX: This context needs to be added first because it
|
||||||
|
* assumes the parser can be accessed via
|
||||||
|
* stack->contexts[0].data. */
|
||||||
|
add_dom_stack_context(&stack, parser, &sgml_parser_test_context_info);
|
||||||
|
walk_dom_nodes(&stack, parser->root);
|
||||||
|
done_dom_stack(&stack);
|
||||||
|
done_dom_node(parser->root);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
done_sgml_parser(parser);
|
done_sgml_parser(parser);
|
||||||
|
#ifdef DEBUG_MEMLEAK
|
||||||
|
check_memory_leaks();
|
||||||
|
#endif
|
||||||
|
|
||||||
return code;
|
return code;
|
||||||
}
|
}
|
||||||
|
236
src/dom/test/test-dom-configuration-basic
Executable file
236
src/dom/test/test-dom-configuration-basic
Executable file
@ -0,0 +1,236 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
#
|
||||||
|
# Copyright (c) 2005 Jonas Fonseca
|
||||||
|
#
|
||||||
|
|
||||||
|
test_description='Test the DOM configuration module
|
||||||
|
|
||||||
|
This test checks that the normalization performed by the DOM configuration
|
||||||
|
is done correctly.
|
||||||
|
'
|
||||||
|
|
||||||
|
. "$TEST_LIB"
|
||||||
|
|
||||||
|
# Run sgml-parser with a normalization config and compare what it prints
# against the expected node listing.
#
# Arguments:
#   $1  test description; also used to derive the document URI
#   $2  comma-separated DOM configuration string handed to --normalize
#   $3  SGML source to parse
#   $4  expected output; its first line is discarded, so callers in this
#       file start the string with a newline
#
# Writes the files `output` and `expected` in the current directory and
# reports the comparison through test_expect_success.
test_normalize_output_equals () {
	desc="$1"; shift
	config="$1"; shift
	src="$1"; shift
	out="$1"; shift

	# Derive a URI slug from the description: drop a leading "[...]" tag,
	# collapse runs of punctuation/whitespace into '-', lowercase, and strip
	# anything that is not alphanumeric or '-'.
	URI="test:$(echo "$desc" | sed '
		s/^[ \t]*\[[^]]*\][ \t]*//;
		s/[:., \t][:., \t]*/-/g;
		s/_/-/g;
		# *cough*
		y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/;
		s/[^a-zA-Z0-9-]//g;')"

	# --src is passed exactly once; repeating it only re-set the same source.
	sgml-parser --src "$src" --normalize "$config" --uri "$URI" | sed 's/^ //' > output
	echo "#document: $URI" > expected
	# Skip the first (empty) line of the expected text.
	echo "$out" | sed -n '2,$p' >> expected

	test_expect_success "$desc" 'cmp output expected'
}
|
||||||
|
|
||||||
|
|
||||||
|
## Config strings ###########################################################
|
||||||
|
|
||||||
|
# Each string lists the DOM configuration flags passed to --normalize.
# The tests below use a string that omits one flag to exercise the
# corresponding normalization (e.g. NOCOMMENTS omits "comments" and is used
# by the "Remove comments" tests). NORM1 passes no flags at all.
# The flag name is "cdata-sections" (plural) in every string.
NOOP='cdata-sections,comments,element-content-whitespace,entities'
NOCOMMENTS='cdata-sections,element-content-whitespace,entities'
CDATA2TEXT='comments,element-content-whitespace,entities'
ENTITIES='cdata-sections,comments,element-content-whitespace'
NOWSTEXT='cdata-sections,comments,entities'
NORM1=''
|
||||||
|
|
||||||
|
|
||||||
|
## No-ops ###################################################################
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Normalization no-op.' \
|
||||||
|
"$NOOP" \
|
||||||
|
'<roswell> <![CDATA[| that |]]><!-- ends --> well</roswell>' \
|
||||||
|
'
|
||||||
|
element: roswell
|
||||||
|
#text:
|
||||||
|
#cdata-section: | that |
|
||||||
|
#comment: ends
|
||||||
|
#text: well'
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Keep comments.' \
|
||||||
|
"$NOOP" \
|
||||||
|
'and<!-- comment:1 -->or<!-- comment:2 -->' \
|
||||||
|
'
|
||||||
|
#text: and
|
||||||
|
#comment: comment:1
|
||||||
|
#text: or
|
||||||
|
#comment: comment:2 '
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Keep CDATA sections ' \
|
||||||
|
"$NOOP" \
|
||||||
|
'<![CDATA[and]]>or<![CDATA[maybe]]>' \
|
||||||
|
'
|
||||||
|
#cdata-section: and
|
||||||
|
#text: or
|
||||||
|
#cdata-section: maybe'
|
||||||
|
|
||||||
|
|
||||||
|
## Comments #################################################################
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Remove comments. (I)' \
|
||||||
|
"$NOCOMMENTS" \
|
||||||
|
"<no><!-- comment -->?</no>" \
|
||||||
|
'
|
||||||
|
element: no
|
||||||
|
#text: ?'
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Remove comments. (II)' \
|
||||||
|
"$NOCOMMENTS" \
|
||||||
|
'<!-- comment:1 -->and<!-- comment:2 -->' \
|
||||||
|
'
|
||||||
|
#text: and'
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Remove comments. (III)' \
|
||||||
|
"$NOCOMMENTS" \
|
||||||
|
'nothing to see <!-- comment -->here' \
|
||||||
|
'
|
||||||
|
#text: nothing to see here'
|
||||||
|
|
||||||
|
|
||||||
|
## Entities #################################################################
|
||||||
|
|
||||||
|
# Entities should be shown 'verbatim' here after expansion.
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Expand entities. (I)' \
|
||||||
|
"$ENTITIES" \
|
||||||
|
'a<b>c' \
|
||||||
|
'
|
||||||
|
#text: a<b>c'
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Expand entities. (II)' \
|
||||||
|
"$ENTITIES" \
|
||||||
|
'&bad-entity&good-entity;' \
|
||||||
|
'
|
||||||
|
#text: &bad-entity;&good-entity;'
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Expand entities. (III)' \
|
||||||
|
"$ENTITIES" \
|
||||||
|
'<a>&b;</a>' \
|
||||||
|
'
|
||||||
|
element: a
|
||||||
|
#text: &b;'
|
||||||
|
|
||||||
|
|
||||||
|
## CDATA Sections ###########################################################
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Replace CDATA section with text. (I)' \
|
||||||
|
"$CDATA2TEXT" \
|
||||||
|
'<![CDATA[a small text snippet]]>' \
|
||||||
|
'
|
||||||
|
#text: a small text snippet'
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Replace CDATA section with text. (II)' \
|
||||||
|
"$CDATA2TEXT" \
|
||||||
|
'<![CDATA[a small]]> <![CDATA[text snippet]]>' \
|
||||||
|
'
|
||||||
|
#text: a small text snippet'
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Replace CDATA section with text. (III)' \
|
||||||
|
"$CDATA2TEXT" \
|
||||||
|
'before <![CDATA[and]]> after' \
|
||||||
|
'
|
||||||
|
#text: before and after'
|
||||||
|
|
||||||
|
|
||||||
|
## Element Content Whitespace ###############################################
|
||||||
|
|
||||||
|
# The source has only line breaks between the tags; the expected listing
# contains no text nodes for them.
# No line continuation after the last argument, so the command cannot absorb
# whatever line follows it.
test_normalize_output_equals \
	'Remove element content whitespace. (I)' \
	"$NOWSTEXT" \
	'<a>
<b>some text</b>
</a>' \
	'
element: a
element: b
#text: some text'
|
||||||
|
|
||||||
|
# I haven't read the specs about this option; for now it just blasts all
|
||||||
|
# space-only text nodes. Probably not the wanted behaviour all of the time.
|
||||||
|
# --jonas
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Remove element content whitespace. (II)' \
|
||||||
|
"$NOWSTEXT" \
|
||||||
|
'<e>space between &this; &that; gets removed</e>' \
|
||||||
|
'
|
||||||
|
element: e
|
||||||
|
#text: space between
|
||||||
|
entity-reference: this
|
||||||
|
entity-reference: that
|
||||||
|
#text: gets removed'
|
||||||
|
|
||||||
|
|
||||||
|
## Mixes ####################################################################
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Normalization mix #1. (I)' \
|
||||||
|
"$NORM1" \
|
||||||
|
'before <![CDATA[and]]> after &some;<!--comments--> remain' \
|
||||||
|
'
|
||||||
|
#text: before and after &some; remain'
|
||||||
|
|
||||||
|
test_normalize_output_equals \
|
||||||
|
'Normalization mix #1. (II)' \
|
||||||
|
"$NORM1" \
|
||||||
|
'<!--a-->b&c; <![CDATA[d]]>' \
|
||||||
|
'
|
||||||
|
#text: b&c;d'
|
||||||
|
|
||||||
|
|
||||||
|
## Special ELinks Extensions ################################################
|
||||||
|
|
||||||
|
# The attributes wack/w00t and the <doit> element (with its text) are absent
# from the expected listing.
# No line continuation after the last argument, so the command cannot absorb
# whatever line follows it.
test_normalize_output_equals \
	'Remove unknown (HTML) elements and attributes. (I)' \
	"$NOOP,unknown" \
	'<html wack="..."><title w00t="...">where?<doit>here!</doit></title></html>' \
	'
element: html
element: title
#text: where?'
|
||||||
|
|
||||||
|
# Only the trailing text is expected to survive; <x>, its attribute and <z>
# are absent from the expected listing.
# No line continuation after the last argument, so the command cannot absorb
# whatever line follows it.
test_normalize_output_equals \
	'Remove unknown (HTML) elements and attributes. (II)' \
	"$NOOP,unknown" \
	'<x y=""><z></z></x> aint no HTML' \
	'
#text: aint no HTML'
|
||||||
|
|
||||||
|
# The source spans several blank lines; the expected text node is a single
# line.
# No line continuation after the last argument, so the command cannot absorb
# whatever line follows it.
test_normalize_output_equals \
	'Normalize whitespace. (I)' \
	"$NOOP,normalize-whitespace" \
	'Here is a



lot of useless space.' \
	'
#text: Here is a lot of useless space.'
|
||||||
|
|
||||||
|
# CDATA content is merged into the surrounding text and the result is
# expected as one text node.
# No line continuation after the last argument: a dangling backslash here
# would turn the following test_done into an extra argument if the blank
# line between them were ever removed.
test_normalize_output_equals \
	'Normalize whitespace. (II)' \
	"$CDATA2TEXT,normalize-whitespace" \
	'Could we <![CDATA[ please ]]> read that again?' \
	'
#text: Could we please read that again?'
|
||||||
|
|
||||||
|
test_done
|
Loading…
Reference in New Issue
Block a user