mirror of
https://github.com/rkd77/elinks.git
synced 2025-02-02 15:09:23 -05:00
Add mode where the SGML scanner checks for completeness
This commit is contained in:
parent
af72dd8435
commit
e78d43f1ac
@ -154,7 +154,8 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info)
|
||||
|
||||
void
|
||||
init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
|
||||
struct dom_string *string, int state, int count_lines, int complete)
|
||||
struct dom_string *string, int state, int count_lines, int complete,
|
||||
int check_complete)
|
||||
{
|
||||
if (!scanner_info->initialized) {
|
||||
init_dom_scanner_info(scanner_info);
|
||||
@ -170,6 +171,8 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i
|
||||
scanner->info = scanner_info;
|
||||
scanner->state = state;
|
||||
scanner->count_lines = !!count_lines;
|
||||
scanner->incomplete = !complete;
|
||||
scanner->check_complete = !!check_complete;
|
||||
scanner->lineno = scanner->count_lines;
|
||||
scanner->info->scan(scanner);
|
||||
}
|
||||
|
@ -92,7 +92,8 @@ struct dom_scanner_info {
|
||||
|
||||
/* Initializes the scanner. */
|
||||
void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
|
||||
struct dom_string *string, int state, int count_lines, int complete);
|
||||
struct dom_string *string, int state, int count_lines, int complete,
|
||||
int check_complete);
|
||||
|
||||
/* The number of tokens in the scanners token table:
|
||||
* At best it should be big enough to contain properties with space separated
|
||||
@ -123,7 +124,12 @@ struct dom_scanner {
|
||||
int line;
|
||||
#endif
|
||||
|
||||
/* The following two flags are used when parsing is incremental and
|
||||
* the scanner must ensure that only tokens that are complete are
|
||||
* generated. */
|
||||
unsigned int check_complete:1; /* Only generate complete tokens */
|
||||
unsigned int incomplete:1; /* The scanned string is incomplete */
|
||||
|
||||
unsigned int count_lines:1; /* Is line counting enabled? */
|
||||
unsigned int lineno; /* Line # of the last scanned token */
|
||||
|
||||
|
@ -391,7 +391,7 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack,
|
||||
struct dom_scanner scanner;
|
||||
struct dom_select_node sel;
|
||||
|
||||
init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1);
|
||||
init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1, 0);
|
||||
|
||||
memset(&sel, 0, sizeof(sel));
|
||||
|
||||
|
@ -317,7 +317,7 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
|
||||
/* The attribute source is complete. */
|
||||
init_dom_scanner(&attr_scanner, &sgml_scanner_info,
|
||||
&token->string, SGML_STATE_ELEMENT,
|
||||
scanner->count_lines, 1);
|
||||
scanner->count_lines, 1, 0);
|
||||
|
||||
if (dom_scanner_has_tokens(&attr_scanner)) {
|
||||
/* Ignore parser codes from this
|
||||
@ -393,11 +393,12 @@ sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data)
|
||||
struct sgml_parsing_state *parsing = data;
|
||||
int count_lines = !!(parser->flags & SGML_PARSER_COUNT_LINES);
|
||||
int complete = !!(parser->flags & SGML_PARSER_COMPLETE);
|
||||
int incremental = !!(parser->flags & SGML_PARSER_INCREMENTAL);
|
||||
|
||||
parsing->depth = parser->stack.depth;
|
||||
get_dom_stack_top(&parser->stack)->immutable = 1;
|
||||
init_dom_scanner(&parsing->scanner, &sgml_scanner_info, &node->string,
|
||||
SGML_STATE_TEXT, count_lines, complete);
|
||||
SGML_STATE_TEXT, count_lines, complete, incremental);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -98,6 +98,24 @@ skip_sgml_space(struct dom_scanner *scanner, unsigned char **string)
|
||||
*string = pos;
|
||||
}
|
||||
|
||||
/* Evaluates to non-zero only when all three conditions hold: the scanner
 * was asked to generate complete tokens only (check_complete), the scanned
 * string is flagged as incomplete, and @string is equal to the scanner's
 * end pointer. */
#define check_sgml_incomplete(scanner, string) \
	((scanner)->check_complete && (scanner)->incomplete \
	 && (string) == (scanner)->end)
|
||||
|
||||
static void
|
||||
set_sgml_incomplete(struct dom_scanner *scanner, struct dom_scanner_token *token)
|
||||
{
|
||||
size_t left = scanner->end - scanner->position;
|
||||
|
||||
assert(left > 0);
|
||||
|
||||
token->type = SGML_TOKEN_INCOMPLETE;
|
||||
set_dom_string(&token->string, scanner->position, left);
|
||||
|
||||
/* Stop the scanning. */
|
||||
scanner->position = scanner->end;
|
||||
}
|
||||
|
||||
/* Text token scanning */
|
||||
|
||||
@ -119,6 +137,8 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke
|
||||
token->string.string = string++;
|
||||
|
||||
if (first_char == '&') {
|
||||
int complete = 0;
|
||||
|
||||
if (is_sgml_entity(*string)) {
|
||||
scan_sgml(scanner, string, SGML_CHAR_ENTITY);
|
||||
type = SGML_TOKEN_ENTITY;
|
||||
@ -128,11 +148,18 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke
|
||||
|
||||
foreach_sgml_cdata (scanner, string) {
|
||||
if (*string == ';') {
|
||||
complete = 1;
|
||||
string++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* We want the biggest possible text token. */
|
||||
if (check_sgml_incomplete(scanner, string) && !complete) {
|
||||
set_sgml_incomplete(scanner, token);
|
||||
return;
|
||||
}
|
||||
|
||||
} else {
|
||||
if (is_sgml_space(first_char)) {
|
||||
if (scanner->count_lines
|
||||
@ -156,6 +183,12 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke
|
||||
/* m33p */;
|
||||
}
|
||||
}
|
||||
|
||||
/* We want the biggest possible text token. */
|
||||
if (check_sgml_incomplete(scanner, string)) {
|
||||
set_sgml_incomplete(scanner, token);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
token->type = type;
|
||||
@ -237,7 +270,8 @@ skip_sgml(struct dom_scanner *scanner, unsigned char **string, unsigned char ski
|
||||
}
|
||||
|
||||
static inline int
|
||||
skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
|
||||
skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string,
|
||||
int *possibly_incomplete)
|
||||
{
|
||||
unsigned char *pos = *string;
|
||||
int length = 0;
|
||||
@ -249,6 +283,7 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
|
||||
* preceding '-'. */
|
||||
if (pos[-2] == '-' && pos[-1] == '-' && &pos[-2] >= *string) {
|
||||
length = pos - *string - 2;
|
||||
*possibly_incomplete = 0;
|
||||
pos++;
|
||||
break;
|
||||
}
|
||||
@ -256,6 +291,9 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
|
||||
|
||||
if (!pos) {
|
||||
pos = scanner->end;
|
||||
/* The token is incomplete but set the length to handle tag
|
||||
* soup graciously. */
|
||||
*possibly_incomplete = 1;
|
||||
length = pos - *string;
|
||||
}
|
||||
|
||||
@ -264,7 +302,8 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
|
||||
}
|
||||
|
||||
static inline int
|
||||
skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
|
||||
skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string,
|
||||
int *possibly_incomplete)
|
||||
{
|
||||
unsigned char *pos = *string;
|
||||
int length = 0;
|
||||
@ -274,6 +313,7 @@ skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
|
||||
* are supposed to have '<![CDATA[' before this is called. */
|
||||
if (pos[-2] == ']' && pos[-1] == ']') {
|
||||
length = pos - *string - 2;
|
||||
*possibly_incomplete = 0;
|
||||
pos++;
|
||||
break;
|
||||
}
|
||||
@ -281,6 +321,9 @@ skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
|
||||
|
||||
if (!pos) {
|
||||
pos = scanner->end;
|
||||
/* The token is incomplete but set the length to handle tag
|
||||
* soup graciously. */
|
||||
*possibly_incomplete = 1;
|
||||
length = pos - *string;
|
||||
}
|
||||
|
||||
@ -299,6 +342,7 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
unsigned char first_char = *string;
|
||||
enum sgml_token_type type = SGML_TOKEN_GARBAGE;
|
||||
int real_length = -1;
|
||||
int possibly_incomplete = 1;
|
||||
|
||||
token->string.string = string++;
|
||||
|
||||
@ -313,6 +357,9 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
type = SGML_TOKEN_TAG_END;
|
||||
scanner->state = SGML_STATE_TEXT;
|
||||
|
||||
/* We are creating a 'virtual' token that has no source. */
|
||||
possibly_incomplete = 0;
|
||||
|
||||
} else if (is_sgml_ident(*string)) {
|
||||
token->string.string = string;
|
||||
scan_sgml(scanner, string, SGML_CHAR_IDENT);
|
||||
@ -323,7 +370,16 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
if (*string == '>') {
|
||||
type = SGML_TOKEN_ELEMENT;
|
||||
string++;
|
||||
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
|
||||
} else {
|
||||
/* Was any space skipped? */
|
||||
if (is_sgml_space(string[-1])) {
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
scanner->state = SGML_STATE_ELEMENT;
|
||||
type = SGML_TOKEN_ELEMENT_BEGIN;
|
||||
}
|
||||
@ -341,7 +397,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
string += 2;
|
||||
type = SGML_TOKEN_NOTATION_COMMENT;
|
||||
token->string.string = string;
|
||||
real_length = skip_sgml_comment(scanner, &string);
|
||||
real_length = skip_sgml_comment(scanner, &string,
|
||||
&possibly_incomplete);
|
||||
assert(real_length >= 0);
|
||||
|
||||
} else if (string + 6 < scanner->end
|
||||
@ -350,13 +407,17 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
string += 7;
|
||||
type = SGML_TOKEN_CDATA_SECTION;
|
||||
token->string.string = string;
|
||||
real_length = skip_sgml_cdata_section(scanner, &string);
|
||||
real_length = skip_sgml_cdata_section(scanner, &string,
|
||||
&possibly_incomplete);
|
||||
assert(real_length >= 0);
|
||||
|
||||
} else {
|
||||
skip_sgml_space(scanner, &string);
|
||||
type = map_dom_scanner_string(scanner, ident, string, base);
|
||||
skip_sgml(scanner, &string, '>', 0);
|
||||
if (skip_sgml(scanner, &string, '>', 0)) {
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (*string == '?') {
|
||||
@ -375,6 +436,11 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
real_length = string - token->string.string;
|
||||
skip_sgml_space(scanner, &string);
|
||||
|
||||
if (is_sgml_space(string[-1])) {
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
|
||||
} else if (*string == '/') {
|
||||
string++;
|
||||
skip_sgml_space(scanner, &string);
|
||||
@ -385,12 +451,18 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
real_length = string - token->string.string;
|
||||
|
||||
type = SGML_TOKEN_ELEMENT_END;
|
||||
skip_sgml(scanner, &string, '>', 1);
|
||||
if (skip_sgml(scanner, &string, '>', 1)) {
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
|
||||
} else if (*string == '>') {
|
||||
string++;
|
||||
real_length = 0;
|
||||
type = SGML_TOKEN_ELEMENT_END;
|
||||
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
|
||||
if (type != SGML_TOKEN_GARBAGE)
|
||||
@ -398,15 +470,28 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
|
||||
} else {
|
||||
/* Alien < > stuff so ignore it */
|
||||
skip_sgml(scanner, &string, '>', 0);
|
||||
if (skip_sgml(scanner, &string, '>', 0)) {
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (first_char == '=') {
|
||||
type = '=';
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
|
||||
} else if (first_char == '?' || first_char == '>') {
|
||||
if (first_char == '?') {
|
||||
skip_sgml(scanner, &string, '>', 0);
|
||||
if (skip_sgml(scanner, &string, '>', 0)) {
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
} else {
|
||||
assert(first_char == '>');
|
||||
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
|
||||
type = SGML_TOKEN_TAG_END;
|
||||
@ -414,17 +499,33 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
scanner->state = SGML_STATE_TEXT;
|
||||
|
||||
} else if (first_char == '/') {
|
||||
/* We allow '/' inside elements and only consider it as an end
|
||||
* tag if it immediately precedes the '>' char. This is to allow
|
||||
*
|
||||
* '<form action=/ >' where '/' is part of a path and
|
||||
* '<form action=a />' where '/>' is truly a tag end
|
||||
*
|
||||
* For stricter parsing we should always require attribute
|
||||
* values to be quoted.
|
||||
*/
|
||||
if (*string == '>') {
|
||||
string++;
|
||||
real_length = 0;
|
||||
type = SGML_TOKEN_ELEMENT_EMPTY_END;
|
||||
assert(scanner->state == SGML_STATE_ELEMENT);
|
||||
scanner->state = SGML_STATE_TEXT;
|
||||
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
|
||||
} else if (is_sgml_attribute(*string)) {
|
||||
scan_sgml_attribute(scanner, string);
|
||||
type = SGML_TOKEN_ATTRIBUTE;
|
||||
if (string[-1] == '/' && string[0] == '>')
|
||||
if (string[-1] == '/' && string[0] == '>') {
|
||||
string--;
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
}
|
||||
}
|
||||
|
||||
} else if (isquote(first_char)) {
|
||||
@ -436,6 +537,10 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
real_length = string_end - token->string.string;
|
||||
string = string_end + 1;
|
||||
type = SGML_TOKEN_STRING;
|
||||
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
|
||||
} else if (is_sgml_attribute(*string)) {
|
||||
token->string.string++;
|
||||
scan_sgml_attribute(scanner, string);
|
||||
@ -451,11 +556,19 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
|
||||
if (is_sgml_attribute(*string)) {
|
||||
scan_sgml_attribute(scanner, string);
|
||||
type = SGML_TOKEN_ATTRIBUTE;
|
||||
if (string[-1] == '/' && string[0] == '>')
|
||||
if (string[-1] == '/' && string[0] == '>') {
|
||||
/* We found the end. */
|
||||
possibly_incomplete = 0;
|
||||
string--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (possibly_incomplete && check_sgml_incomplete(scanner, string)) {
|
||||
set_sgml_incomplete(scanner, token);
|
||||
return;
|
||||
}
|
||||
|
||||
token->type = type;
|
||||
token->string.length = real_length >= 0 ? real_length : string - token->string.string;
|
||||
token->precedence = get_sgml_precedence(type);
|
||||
@ -482,7 +595,14 @@ scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token
|
||||
}
|
||||
}
|
||||
|
||||
if (!string) string = scanner->end;
|
||||
if (!string) {
|
||||
if (check_sgml_incomplete(scanner, string)) {
|
||||
set_sgml_incomplete(scanner, token);
|
||||
return;
|
||||
}
|
||||
|
||||
string = scanner->end;
|
||||
}
|
||||
|
||||
token->type = SGML_TOKEN_PROCESS_DATA;
|
||||
token->string.length = string - token->string.string - 2;
|
||||
|
Loading…
x
Reference in New Issue
Block a user