1
0
mirror of https://github.com/rkd77/elinks.git synced 2025-01-03 14:57:44 -05:00

Merge with git+ssh://pasky.or.cz/srv/git/elinks.git

This commit is contained in:
Miciah Dashiel Butler Masters 2005-12-29 22:29:39 +00:00 committed by Miciah Dashiel Butler Masters
commit 65372061a0
7 changed files with 223 additions and 82 deletions

View File

@ -6,4 +6,7 @@ OBJS = node.o select.o stack.o scanner.o
SUBDIRS-$(CONFIG_DEBUG) += test
test: all
make test -C test
include $(top_srcdir)/Makefile.lib

View File

@ -153,8 +153,8 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info)
}
void
init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
struct dom_string *string)
init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
struct dom_string *string, int state)
{
if (!scanner_info->initialized) {
init_dom_scanner_info(scanner_info);
@ -168,5 +168,6 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i
scanner->end = string->string + string->length;
scanner->current = scanner->table;
scanner->info = scanner_info;
scanner->state = state;
scanner->info->scan(scanner);
}

View File

@ -91,8 +91,11 @@ struct dom_scanner_info {
/* Initializes the scanner. */
void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
struct dom_string *string);
void init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
struct dom_string *string, int state);
#define init_dom_scanner(scanner, info, string) \
init_dom_scanner_state(scanner, info, string, 0)
/* The number of tokens in the scanner's token table:
* At best it should be big enough to contain properties with space separated

View File

@ -103,26 +103,17 @@ add_sgml_attribute(struct dom_stack *stack,
}
static inline struct dom_node *
add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *token)
add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target,
struct dom_scanner_token *data)
{
struct dom_node *parent = get_dom_stack_top(stack)->node;
struct dom_string *data_str = data ? &data->string : NULL;
struct dom_node *node;
/* Split the token in two if we can find a first space separator. */
unsigned char *separator = memchr(token->string.string, ' ', token->string.length);
/* Anything before the separator becomes the target name ... */
size_t namelen = separator ? separator - token->string.string : token->string.length;
struct dom_string name = INIT_DOM_STRING(token->string.string, namelen);
/* ... and everything after the instruction value. */
unsigned char *valuestr = separator ? separator + 1 : NULL;
size_t valuelen = valuestr ? token->string.length - namelen - 1 : 0;
struct dom_string value = INIT_DOM_STRING(valuestr, valuelen);
node = add_dom_proc_instruction(parent, &name, &value);
node = add_dom_proc_instruction(parent, &target->string, data_str);
if (!node) return NULL;
switch (token->type) {
switch (target->type) {
case SGML_TOKEN_PROCESS_XML:
node->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML;
break;
@ -132,13 +123,7 @@ add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *tok
node->data.proc_instruction.type = DOM_PROC_INSTRUCTION;
}
if (!push_dom_node(stack, node))
return NULL;
if (token->type != SGML_TOKEN_PROCESS_XML)
pop_dom_node(stack);
return node;
return push_dom_node(stack, node);
}
static inline void
@ -166,9 +151,12 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
assert(dom_scanner_has_tokens(scanner)
&& (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN
|| get_dom_scanner_token(scanner)->type == SGML_TOKEN_PROCESS_XML));
|| (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION
&& get_dom_stack_top(stack)->node->data.proc_instruction.type
== DOM_PROC_INSTRUCTION_XML)));
skip_dom_scanner_token(scanner);
if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN)
skip_dom_scanner_token(scanner);
while (dom_scanner_has_tokens(scanner)) {
struct dom_scanner_token *token = get_dom_scanner_token(scanner);
@ -220,6 +208,8 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
static void
parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
{
struct dom_scanner_token target;
while (dom_scanner_has_tokens(scanner)) {
struct dom_scanner_token *token = get_dom_scanner_token(scanner);
@ -290,17 +280,31 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
break;
case SGML_TOKEN_PROCESS_XML:
if (!add_sgml_proc_instruction(stack, token)) {
skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END);
break;
case SGML_TOKEN_PROCESS:
copy_struct(&target, token);
/* Skip the target token */
token = get_next_dom_scanner_token(scanner);
if (!token) break;
assert(token->type == SGML_TOKEN_PROCESS_DATA);
if (add_sgml_proc_instruction(stack, &target, token)
&& target.type == SGML_TOKEN_PROCESS_XML
&& token->string.length > 0) {
/* Parse the <?xml data="attributes"?>. */
struct dom_scanner attr_scanner;
init_dom_scanner_state(&attr_scanner,
&sgml_scanner_info,
&token->string,
SGML_STATE_ELEMENT);
if (dom_scanner_has_tokens(&attr_scanner))
parse_sgml_attributes(stack, &attr_scanner);
}
parse_sgml_attributes(stack, scanner);
pop_dom_node(stack);
break;
case SGML_TOKEN_PROCESS:
add_sgml_proc_instruction(stack, token);
skip_dom_scanner_token(scanner);
break;

View File

@ -17,14 +17,6 @@
/* Bitmap entries for the SGML character groups used in the scanner table */
/* The SGML tokenizer maintains a state that can be either text or element
* state. The state has only meaning while doing the actual scanning and is not
* accessible at the parsing time. */
enum sgml_scanner_state {
SGML_STATE_TEXT,
SGML_STATE_ELEMENT,
};
enum sgml_char_group {
SGML_CHAR_ENTITY = (1 << 1),
SGML_CHAR_IDENT = (1 << 2),
@ -186,7 +178,7 @@ skip_comment(struct dom_scanner *scanner, unsigned char **string)
unsigned char *pos = *string;
int length = 0;
for (; pos < scanner->end - 3; pos++)
for (; pos < scanner->end - 2; pos++)
if (pos[0] == '-' && pos[1] == '-' && pos[2] == '>') {
length = pos - *string;
pos += 3;
@ -296,27 +288,7 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
type = map_dom_scanner_string(scanner, pos, string, base);
/* Figure out where the processing instruction ends */
for (pos = string; skip_sgml(scanner, &pos, '>', 0); ) {
if (pos[-2] != '?') continue;
/* Set length until '?' char and move position
* beyond '>'. */
real_length = pos - token->string.string - 2;
break;
}
switch (type) {
case SGML_TOKEN_PROCESS_XML:
/* We want to parse the attributes */
assert(scanner->state != SGML_STATE_ELEMENT);
scanner->state = SGML_STATE_ELEMENT;
break;
default:
/* Just skip the whole thing */
string = pos;
}
scanner->state = SGML_STATE_PROC_INST;
} else if (*string == '/') {
string++;
@ -366,6 +338,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
} else if (is_sgml_attribute(*string)) {
scan_sgml_attribute(scanner, string);
type = SGML_TOKEN_ATTRIBUTE;
if (string[-1] == '/' && string[0] == '>')
string--;
}
} else if (isquote(first_char)) {
@ -393,6 +367,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
if (is_sgml_attribute(*string)) {
scan_sgml_attribute(scanner, string);
type = SGML_TOKEN_ATTRIBUTE;
if (string[-1] == '/' && string[0] == '>')
string--;
}
}
@ -403,6 +379,38 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
}
/* Processing instruction data scanning.
 *
 * Scans the data part of a processing instruction, i.e. everything from the
 * current scanner position up to (but not including) the closing "?>", and
 * stores it in @token as a SGML_TOKEN_PROCESS_DATA token. The scanner position
 * is moved past the "?>" and the scanner is put back in text state.
 *
 * If no "?>" is found the instruction is unterminated: the data then covers
 * the rest of the input and the scanner position is moved to the end.
 *
 * Assumes scanner->position <= scanner->end. */
static inline void
scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
{
	unsigned char *start = scanner->position;
	unsigned char *end = scanner->end;
	/* Defaults describe unterminated data running to the end of input. */
	unsigned char *data_end = end;
	unsigned char *next = end;
	unsigned char *pos;

	/* Figure out where the processing instruction ends. This doesn't use
	 * skip_sgml() since we MUST ignore precedence here to allow '<' inside
	 * the data part to be skipped correctly.
	 *
	 * The search starts one byte into the data so that pos[-1] never looks
	 * at memory before the data; the '?' of the opening "<?" must not be
	 * mistaken for the terminating one. The remaining size is recomputed
	 * for every memchr() call so the search never runs past the end. */
	for (pos = start < end ? start + 1 : end; pos < end; pos++) {
		pos = memchr(pos, '>', end - pos);
		if (!pos) break;

		if (pos[-1] == '?') {
			/* Data stops before the '?'; scanning resumes after
			 * the '>'. */
			data_end = pos - 1;
			next = pos + 1;
			break;
		}
	}

	token->type = SGML_TOKEN_PROCESS_DATA;
	token->string.string = start;
	token->string.length = data_end - start;
	token->precedence = get_sgml_precedence(token->type);
	scanner->position = next;
	scanner->state = SGML_STATE_TEXT;
}
/* Scanner multiplexor */
static struct dom_scanner_token *
@ -419,7 +427,8 @@ scan_sgml_tokens(struct dom_scanner *scanner)
current < table_end && scanner->position < scanner->end;
current++) {
if (scanner->state == SGML_STATE_ELEMENT
|| *scanner->position == '<') {
|| (*scanner->position == '<'
&& scanner->state != SGML_STATE_PROC_INST)) {
scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE);
if (scanner->position >= scanner->end) break;
@ -429,8 +438,13 @@ scan_sgml_tokens(struct dom_scanner *scanner)
if (current->type == SGML_TOKEN_SKIP) {
current--;
}
} else {
} else if (scanner->state == SGML_STATE_TEXT) {
scan_sgml_text_token(scanner, current);
} else {
scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE);
scan_sgml_proc_inst_token(scanner, current);
}
}

View File

@ -27,8 +27,9 @@ enum sgml_token_type {
SGML_TOKEN_CDATA_SECTION, /* <![CDATA[ until ]]> */
SGML_TOKEN_PROCESS, /* <?{ident} until ?> */
SGML_TOKEN_PROCESS_XML, /* <?xml until */
SGML_TOKEN_PROCESS, /* <?{ident} */
SGML_TOKEN_PROCESS_XML, /* <?xml */
SGML_TOKEN_PROCESS_DATA, /* data after <?{ident} until ?> */
SGML_TOKEN_ELEMENT, /* <{ident}> */
SGML_TOKEN_ELEMENT_BEGIN, /* <{ident} */
@ -56,6 +57,17 @@ enum sgml_token_type {
SGML_TOKEN_NONE = 0,
};
/* State of the SGML tokenizer, kept in the scanner->state member. It selects
 * which kind of token is scanned next. The state is only meaningful during the
 * actual scanning and should not be relied upon at parsing time. It can,
 * however, be passed to init_dom_scanner_state() to start scanning in a
 * specific state. */
enum sgml_scanner_state {
	/* Scanning text; this is the state init_dom_scanner() starts in. */
	SGML_STATE_TEXT = 0,
	/* Scanning element tokens. */
	SGML_STATE_ELEMENT = 1,
	/* Scanning the data part of a processing instruction. */
	SGML_STATE_PROC_INST = 2,
};
extern struct dom_scanner_info sgml_scanner_info;
/* Treat '<' as more valuable than '>' so that scanning of '<a<b>' using

View File

@ -44,6 +44,17 @@ element: html
element: p
#text: Hello World!'
test_output_equals \
'Parse elements.' \
'<root><child attr="value" /><child2></><child3 >a</></root>' \
'
element: root
element: child
attribute: attr -> value
element: child2
element: child3
#text: a'
test_output_equals \
'Parse an enclosed comment.' \
'<root><!-- Hello World! --></root>' \
@ -51,6 +62,20 @@ test_output_equals \
element: root
#comment: Hello World! '
test_output_equals \
'Parse comment combinations.' \
'<root><!-- <!-- -- > --><!----></root>' \
'
element: root
#comment: <!-- -- >
#comment: '
test_output_equals \
'Parse comment combinations.' \
'<!--foo-->' \
'
#comment: foo'
test_output_equals \
'Parse an enclosed CDATA section.' \
'<root><![CDATA[...] ]>...]]></root>' \
@ -68,19 +93,47 @@ element: root
attribute: name -> value with &foo; <stuff'
test_output_equals \
'Parse entity references.' \
'<root>&amp;...&#42;...&...copy;...&;...&#;' \
'Parse attributes with garbage.' \
"<root a=b c='d' e'f' g= h i = j k =></root>" \
'
element: root
entity-reference: amp
#text: ...
entity-reference: #42
#text: ...
entity-reference: ...copy
#text: ...
#text: &;
#text: ...
entity-reference: #'
attribute: a -> b
attribute: c -> d
attribute: g -> h
attribute: i -> j
attribute: k -> '
test_output_equals \
'Parse attribute with non-quoted values.' \
'<root color=#abc path=/to/%61-&\one";files/>...' \
'
element: root
attribute: color -> #abc
attribute: path -> /to/%61-&\one";files
#text: ...'
test_output_equals \
'Parse entity references.' \
'&amp;-&#42;' \
'
entity-reference: amp
#text: -
entity-reference: #42'
# Just how these should be gracefully handled is not clear to me.
test_output_equals \
'Parse badly formatted entity references.' \
'& m33p;-&.:-copy;-&;-&#;-&#xx;' \
'
#text: & m33p;
#text: -
entity-reference: .:-copy
#text: -
#text: &;
#text: -
entity-reference: #
#text: -
entity-reference: #xx'
# Test <?>
test_output_equals \
@ -94,6 +147,57 @@ var val=2;
proc-instruction: xml -> encoding="UTF8"
attribute: encoding -> UTF8
#text: \n...\n
proc-instruction: ecmascript -> \nvar -> val=2;\n'
proc-instruction: ecmascript -> var val=2;\n'
test_output_equals \
'Parse XML processing instructions.' \
'<?xml version="1.0" />?><?xml />-' \
'
proc-instruction: xml -> version="1.0" />
attribute: version -> 1.0
proc-instruction: xml -> /'
test_output_equals \
'Parse exotic processing instructions.' \
'<?xml ?+>+?>-?>-<?js?>-<??>-' \
'
proc-instruction: xml -> ?+>+
#text: -?>-
proc-instruction: js ->
#text: -
proc-instruction: ->
#text: -'
test_output_equals \
'Parse incorrect processing instructions.' \
'<?js<?>-<?<??>-<?xml <=";&?>-<?' \
'
proc-instruction: js -> <
#text: -
proc-instruction: -> <?
#text: -
proc-instruction: xml -> <=";&
#text: -'
test_output_equals \
'Parse incorrect processing instructions (II).' \
'<?><?' \
'
proc-instruction: -> ><?'
test_output_equals \
'Skip spaces not inside text.' \
'<
root
ns:attr
=
"value"
><?
target
data?>< / root >' \
'
element: root
attribute: ns:attr -> value
proc-instruction: target -> data'
test_done