mirror of
https://github.com/rkd77/elinks.git
synced 2024-12-04 14:46:47 -05:00
Merge with git+ssh://pasky.or.cz/srv/git/elinks.git
This commit is contained in:
commit
65372061a0
@ -6,4 +6,7 @@ OBJS = node.o select.o stack.o scanner.o
|
||||
|
||||
SUBDIRS-$(CONFIG_DEBUG) += test
|
||||
|
||||
test: all
|
||||
make test -C test
|
||||
|
||||
include $(top_srcdir)/Makefile.lib
|
||||
|
@ -153,8 +153,8 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info)
|
||||
}
|
||||
|
||||
void
|
||||
init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
|
||||
struct dom_string *string)
|
||||
init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
|
||||
struct dom_string *string, int state)
|
||||
{
|
||||
if (!scanner_info->initialized) {
|
||||
init_dom_scanner_info(scanner_info);
|
||||
@ -168,5 +168,6 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i
|
||||
scanner->end = string->string + string->length;
|
||||
scanner->current = scanner->table;
|
||||
scanner->info = scanner_info;
|
||||
scanner->state = state;
|
||||
scanner->info->scan(scanner);
|
||||
}
|
||||
|
@ -91,8 +91,11 @@ struct dom_scanner_info {
|
||||
|
||||
|
||||
/* Initializes the scanner. */
|
||||
void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
|
||||
struct dom_string *string);
|
||||
void init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
|
||||
struct dom_string *string, int state);
|
||||
|
||||
#define init_dom_scanner(scanner, info, string) \
|
||||
init_dom_scanner_state(scanner, info, string, 0)
|
||||
|
||||
/* The number of tokens in the scanner's token table:
|
||||
* At best it should be big enough to contain properties with space separated
|
||||
|
@ -103,26 +103,17 @@ add_sgml_attribute(struct dom_stack *stack,
|
||||
}
|
||||
|
||||
static inline struct dom_node *
|
||||
add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *token)
|
||||
add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target,
|
||||
struct dom_scanner_token *data)
|
||||
{
|
||||
struct dom_node *parent = get_dom_stack_top(stack)->node;
|
||||
struct dom_string *data_str = data ? &data->string : NULL;
|
||||
struct dom_node *node;
|
||||
/* Split the token in two if we can find a first space separator. */
|
||||
unsigned char *separator = memchr(token->string.string, ' ', token->string.length);
|
||||
|
||||
/* Anything before the separator becomes the target name ... */
|
||||
size_t namelen = separator ? separator - token->string.string : token->string.length;
|
||||
struct dom_string name = INIT_DOM_STRING(token->string.string, namelen);
|
||||
|
||||
/* ... and everything after the instruction value. */
|
||||
unsigned char *valuestr = separator ? separator + 1 : NULL;
|
||||
size_t valuelen = valuestr ? token->string.length - namelen - 1 : 0;
|
||||
struct dom_string value = INIT_DOM_STRING(valuestr, valuelen);
|
||||
|
||||
node = add_dom_proc_instruction(parent, &name, &value);
|
||||
node = add_dom_proc_instruction(parent, &target->string, data_str);
|
||||
if (!node) return NULL;
|
||||
|
||||
switch (token->type) {
|
||||
switch (target->type) {
|
||||
case SGML_TOKEN_PROCESS_XML:
|
||||
node->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML;
|
||||
break;
|
||||
@ -132,13 +123,7 @@ add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *tok
|
||||
node->data.proc_instruction.type = DOM_PROC_INSTRUCTION;
|
||||
}
|
||||
|
||||
if (!push_dom_node(stack, node))
|
||||
return NULL;
|
||||
|
||||
if (token->type != SGML_TOKEN_PROCESS_XML)
|
||||
pop_dom_node(stack);
|
||||
|
||||
return node;
|
||||
return push_dom_node(stack, node);
|
||||
}
|
||||
|
||||
static inline void
|
||||
@ -166,8 +151,11 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
|
||||
|
||||
assert(dom_scanner_has_tokens(scanner)
|
||||
&& (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN
|
||||
|| get_dom_scanner_token(scanner)->type == SGML_TOKEN_PROCESS_XML));
|
||||
|| (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION
|
||||
&& get_dom_stack_top(stack)->node->data.proc_instruction.type
|
||||
== DOM_PROC_INSTRUCTION_XML)));
|
||||
|
||||
if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN)
|
||||
skip_dom_scanner_token(scanner);
|
||||
|
||||
while (dom_scanner_has_tokens(scanner)) {
|
||||
@ -220,6 +208,8 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
|
||||
static void
|
||||
parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
|
||||
{
|
||||
struct dom_scanner_token target;
|
||||
|
||||
while (dom_scanner_has_tokens(scanner)) {
|
||||
struct dom_scanner_token *token = get_dom_scanner_token(scanner);
|
||||
|
||||
@ -290,17 +280,31 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
|
||||
break;
|
||||
|
||||
case SGML_TOKEN_PROCESS_XML:
|
||||
if (!add_sgml_proc_instruction(stack, token)) {
|
||||
skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END);
|
||||
break;
|
||||
case SGML_TOKEN_PROCESS:
|
||||
copy_struct(&target, token);
|
||||
|
||||
/* Skip the target token */
|
||||
token = get_next_dom_scanner_token(scanner);
|
||||
if (!token) break;
|
||||
|
||||
assert(token->type == SGML_TOKEN_PROCESS_DATA);
|
||||
|
||||
if (add_sgml_proc_instruction(stack, &target, token)
|
||||
&& target.type == SGML_TOKEN_PROCESS_XML
|
||||
&& token->string.length > 0) {
|
||||
/* Parse the <?xml data="attributes"?>. */
|
||||
struct dom_scanner attr_scanner;
|
||||
|
||||
init_dom_scanner_state(&attr_scanner,
|
||||
&sgml_scanner_info,
|
||||
&token->string,
|
||||
SGML_STATE_ELEMENT);
|
||||
|
||||
if (dom_scanner_has_tokens(&attr_scanner))
|
||||
parse_sgml_attributes(stack, &attr_scanner);
|
||||
}
|
||||
|
||||
parse_sgml_attributes(stack, scanner);
|
||||
pop_dom_node(stack);
|
||||
break;
|
||||
|
||||
case SGML_TOKEN_PROCESS:
|
||||
add_sgml_proc_instruction(stack, token);
|
||||
skip_dom_scanner_token(scanner);
|
||||
break;
|
||||
|
||||
|
@ -17,14 +17,6 @@
|
||||
|
||||
/* Bitmap entries for the SGML character groups used in the scanner table */
|
||||
|
||||
/* The SGML tokenizer maintains a state that can be either text or element
|
||||
* state. The state has only meaning while doing the actual scanning and is not
|
||||
* accessible at the parsing time. */
|
||||
enum sgml_scanner_state {
|
||||
SGML_STATE_TEXT,
|
||||
SGML_STATE_ELEMENT,
|
||||
};
|
||||
|
||||
enum sgml_char_group {
|
||||
SGML_CHAR_ENTITY = (1 << 1),
|
||||
SGML_CHAR_IDENT = (1 << 2),
|
||||
@ -186,7 +178,7 @@ skip_comment(struct dom_scanner *scanner, unsigned char **string)
|
||||
unsigned char *pos = *string;
|
||||
int length = 0;
|
||||
|
||||
for (; pos < scanner->end - 3; pos++)
|
||||
for (; pos < scanner->end - 2; pos++)
|
||||
if (pos[0] == '-' && pos[1] == '-' && pos[2] == '>') {
|
||||
length = pos - *string;
|
||||
pos += 3;
|
||||
@ -296,27 +288,7 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
|
||||
type = map_dom_scanner_string(scanner, pos, string, base);
|
||||
|
||||
/* Figure out where the processing instruction ends */
|
||||
for (pos = string; skip_sgml(scanner, &pos, '>', 0); ) {
|
||||
if (pos[-2] != '?') continue;
|
||||
|
||||
/* Set length until '?' char and move position
|
||||
* beyond '>'. */
|
||||
real_length = pos - token->string.string - 2;
|
||||
break;
|
||||
}
|
||||
|
||||
switch (type) {
|
||||
case SGML_TOKEN_PROCESS_XML:
|
||||
/* We want to parse the attributes */
|
||||
assert(scanner->state != SGML_STATE_ELEMENT);
|
||||
scanner->state = SGML_STATE_ELEMENT;
|
||||
break;
|
||||
|
||||
default:
|
||||
/* Just skip the whole thing */
|
||||
string = pos;
|
||||
}
|
||||
scanner->state = SGML_STATE_PROC_INST;
|
||||
|
||||
} else if (*string == '/') {
|
||||
string++;
|
||||
@ -366,6 +338,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
} else if (is_sgml_attribute(*string)) {
|
||||
scan_sgml_attribute(scanner, string);
|
||||
type = SGML_TOKEN_ATTRIBUTE;
|
||||
if (string[-1] == '/' && string[0] == '>')
|
||||
string--;
|
||||
}
|
||||
|
||||
} else if (isquote(first_char)) {
|
||||
@ -393,6 +367,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
if (is_sgml_attribute(*string)) {
|
||||
scan_sgml_attribute(scanner, string);
|
||||
type = SGML_TOKEN_ATTRIBUTE;
|
||||
if (string[-1] == '/' && string[0] == '>')
|
||||
string--;
|
||||
}
|
||||
}
|
||||
|
||||
@ -403,6 +379,38 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
}
|
||||
|
||||
|
||||
/* Processing instruction data scanning */
|
||||
|
||||
static inline void
|
||||
scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
|
||||
{
|
||||
unsigned char *string = scanner->position;
|
||||
size_t size;
|
||||
|
||||
token->string.string = string;
|
||||
|
||||
/* Figure out where the processing instruction ends. This doesn't use
|
||||
* skip_sgml() since we MUST ignore precedence here to allow '<' inside
|
||||
* the data part to be skipped correctly. */
|
||||
for (size = scanner->end - string;
|
||||
size > 0 && (string = memchr(string, '>', size));
|
||||
string++) {
|
||||
if (string[-1] == '?') {
|
||||
string++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!string) string = scanner->end;
|
||||
|
||||
token->type = SGML_TOKEN_PROCESS_DATA;
|
||||
token->string.length = string - token->string.string - 2;
|
||||
token->precedence = get_sgml_precedence(token->type);
|
||||
scanner->position = string;
|
||||
scanner->state = SGML_STATE_TEXT;
|
||||
}
|
||||
|
||||
|
||||
/* Scanner multiplexor */
|
||||
|
||||
static struct dom_scanner_token *
|
||||
@ -419,7 +427,8 @@ scan_sgml_tokens(struct dom_scanner *scanner)
|
||||
current < table_end && scanner->position < scanner->end;
|
||||
current++) {
|
||||
if (scanner->state == SGML_STATE_ELEMENT
|
||||
|| *scanner->position == '<') {
|
||||
|| (*scanner->position == '<'
|
||||
&& scanner->state != SGML_STATE_PROC_INST)) {
|
||||
scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE);
|
||||
if (scanner->position >= scanner->end) break;
|
||||
|
||||
@ -429,8 +438,13 @@ scan_sgml_tokens(struct dom_scanner *scanner)
|
||||
if (current->type == SGML_TOKEN_SKIP) {
|
||||
current--;
|
||||
}
|
||||
} else {
|
||||
|
||||
} else if (scanner->state == SGML_STATE_TEXT) {
|
||||
scan_sgml_text_token(scanner, current);
|
||||
|
||||
} else {
|
||||
scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE);
|
||||
scan_sgml_proc_inst_token(scanner, current);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -27,8 +27,9 @@ enum sgml_token_type {
|
||||
|
||||
SGML_TOKEN_CDATA_SECTION, /* <![CDATA[ until ]]> */
|
||||
|
||||
SGML_TOKEN_PROCESS, /* <?{ident} until ?> */
|
||||
SGML_TOKEN_PROCESS_XML, /* <?xml until */
|
||||
SGML_TOKEN_PROCESS, /* <?{ident} */
|
||||
SGML_TOKEN_PROCESS_XML, /* <?xml */
|
||||
SGML_TOKEN_PROCESS_DATA, /* data after <?{ident} until ?> */
|
||||
|
||||
SGML_TOKEN_ELEMENT, /* <{ident}> */
|
||||
SGML_TOKEN_ELEMENT_BEGIN, /* <{ident} */
|
||||
@ -56,6 +57,17 @@ enum sgml_token_type {
|
||||
SGML_TOKEN_NONE = 0,
|
||||
};
|
||||
|
||||
/* The SGML tokenizer maintains a state (in the scanner->state member) that can
|
||||
* be either text, element, or processing instruction state. The state has only
|
||||
* meaning while doing the actual scanning and should not be used at the
|
||||
* parsing time. It can however be used to initialize the scanner to a specific
|
||||
* state. */
|
||||
enum sgml_scanner_state {
|
||||
SGML_STATE_TEXT,
|
||||
SGML_STATE_ELEMENT,
|
||||
SGML_STATE_PROC_INST,
|
||||
};
|
||||
|
||||
extern struct dom_scanner_info sgml_scanner_info;
|
||||
|
||||
/* Treat '<' as more valuable than '>' so that scanning of '<a<b>' using
|
||||
|
@ -44,6 +44,17 @@ element: html
|
||||
element: p
|
||||
#text: Hello World!'
|
||||
|
||||
test_output_equals \
|
||||
'Parse elements.' \
|
||||
'<root><child attr="value" /><child2></><child3 >a</></root>' \
|
||||
'
|
||||
element: root
|
||||
element: child
|
||||
attribute: attr -> value
|
||||
element: child2
|
||||
element: child3
|
||||
#text: a'
|
||||
|
||||
test_output_equals \
|
||||
'Parse an enclosed comment.' \
|
||||
'<root><!-- Hello World! --></root>' \
|
||||
@ -51,6 +62,20 @@ test_output_equals \
|
||||
element: root
|
||||
#comment: Hello World! '
|
||||
|
||||
test_output_equals \
|
||||
'Parse comment combinations.' \
|
||||
'<root><!-- <!-- -- > --><!----></root>' \
|
||||
'
|
||||
element: root
|
||||
#comment: <!-- -- >
|
||||
#comment: '
|
||||
|
||||
test_output_equals \
|
||||
'Parse comment combinations.' \
|
||||
'<!--foo-->' \
|
||||
'
|
||||
#comment: foo'
|
||||
|
||||
test_output_equals \
|
||||
'Parse an enclosed CDATA section.' \
|
||||
'<root><![CDATA[...] ]>...]]></root>' \
|
||||
@ -68,19 +93,47 @@ element: root
|
||||
attribute: name -> value with &foo; <stuff'
|
||||
|
||||
test_output_equals \
|
||||
'Parse entity references.' \
|
||||
'<root>&...*...&...copy;...&;...&#;' \
|
||||
'Parse attributes with garbage.' \
|
||||
"<root a=b c='d' e'f' g= h i = j k =></root>" \
|
||||
'
|
||||
element: root
|
||||
entity-reference: amp
|
||||
#text: ...
|
||||
entity-reference: #42
|
||||
#text: ...
|
||||
entity-reference: ...copy
|
||||
#text: ...
|
||||
#text: &;
|
||||
#text: ...
|
||||
entity-reference: #'
|
||||
attribute: a -> b
|
||||
attribute: c -> d
|
||||
attribute: g -> h
|
||||
attribute: i -> j
|
||||
attribute: k -> '
|
||||
|
||||
test_output_equals \
|
||||
'Parse attribute with non-quoted values.' \
|
||||
'<root color=#abc path=/to/%61-&\one";files/>...' \
|
||||
'
|
||||
element: root
|
||||
attribute: color -> #abc
|
||||
attribute: path -> /to/%61-&\one";files
|
||||
#text: ...'
|
||||
|
||||
test_output_equals \
|
||||
'Parse entity references.' \
|
||||
'&-*' \
|
||||
'
|
||||
entity-reference: amp
|
||||
#text: -
|
||||
entity-reference: #42'
|
||||
|
||||
# Just how these should be gracefully handled is not clear to me.
|
||||
test_output_equals \
|
||||
'Parse badly formatted entity references.' \
|
||||
'& m33p;-&.:-copy;-&;-&#;-&#xx;' \
|
||||
'
|
||||
#text: & m33p;
|
||||
#text: -
|
||||
entity-reference: .:-copy
|
||||
#text: -
|
||||
#text: &;
|
||||
#text: -
|
||||
entity-reference: #
|
||||
#text: -
|
||||
entity-reference: #xx'
|
||||
|
||||
# Test <?>
|
||||
test_output_equals \
|
||||
@ -94,6 +147,57 @@ var val=2;
|
||||
proc-instruction: xml -> encoding="UTF8"
|
||||
attribute: encoding -> UTF8
|
||||
#text: \n...\n
|
||||
proc-instruction: ecmascript -> \nvar -> val=2;\n'
|
||||
proc-instruction: ecmascript -> var val=2;\n'
|
||||
|
||||
test_output_equals \
|
||||
'Parse XML processing instructions.' \
|
||||
'<?xml version="1.0" />?><?xml />-' \
|
||||
'
|
||||
proc-instruction: xml -> version="1.0" />
|
||||
attribute: version -> 1.0
|
||||
proc-instruction: xml -> /'
|
||||
|
||||
test_output_equals \
|
||||
'Parse exotic processing instructions.' \
|
||||
'<?xml ?+>+?>-?>-<?js?>-<??>-' \
|
||||
'
|
||||
proc-instruction: xml -> ?+>+
|
||||
#text: -?>-
|
||||
proc-instruction: js ->
|
||||
#text: -
|
||||
proc-instruction: ->
|
||||
#text: -'
|
||||
|
||||
test_output_equals \
|
||||
'Parse incorrect processing instructions.' \
|
||||
'<?js<?>-<?<??>-<?xml <=";&?>-<?' \
|
||||
'
|
||||
proc-instruction: js -> <
|
||||
#text: -
|
||||
proc-instruction: -> <?
|
||||
#text: -
|
||||
proc-instruction: xml -> <=";&
|
||||
#text: -'
|
||||
|
||||
test_output_equals \
|
||||
'Parse incorrect processing instructions (II).' \
|
||||
'<?><?' \
|
||||
'
|
||||
proc-instruction: -> ><?'
|
||||
|
||||
test_output_equals \
|
||||
'Skip spaces not inside text.' \
|
||||
'<
|
||||
root
|
||||
ns:attr
|
||||
=
|
||||
"value"
|
||||
><?
|
||||
target
|
||||
data?>< / root >' \
|
||||
'
|
||||
element: root
|
||||
attribute: ns:attr -> value
|
||||
proc-instruction: target -> data'
|
||||
|
||||
test_done
|
||||
|
Loading…
Reference in New Issue
Block a user