1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-09-28 03:06:20 -04:00

Add mode where the SGML scanner checks for completeness

This commit is contained in:
Jonas Fonseca 2006-01-02 17:46:09 +01:00 committed by Jonas Fonseca
parent af72dd8435
commit e78d43f1ac
5 changed files with 146 additions and 16 deletions

View File

@ -154,7 +154,8 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info)
void
init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
struct dom_string *string, int state, int count_lines, int complete)
struct dom_string *string, int state, int count_lines, int complete,
int check_complete)
{
if (!scanner_info->initialized) {
init_dom_scanner_info(scanner_info);
@ -170,6 +171,8 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i
scanner->info = scanner_info;
scanner->state = state;
scanner->count_lines = !!count_lines;
scanner->incomplete = !complete;
scanner->check_complete = !!check_complete;
scanner->lineno = scanner->count_lines;
scanner->info->scan(scanner);
}

View File

@ -92,7 +92,8 @@ struct dom_scanner_info {
/* Initializes the scanner. */
void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
struct dom_string *string, int state, int count_lines, int complete);
struct dom_string *string, int state, int count_lines, int complete,
int check_complete);
/* The number of tokens in the scanners token table:
* At best it should be big enough to contain properties with space separated
@ -123,7 +124,12 @@ struct dom_scanner {
int line;
#endif
/* The following two flags are used when parsing is incremental and
* the scanner must ensure that only tokens that are complete are
* generated. */
unsigned int check_complete:1; /* Only generate complete tokens */
unsigned int incomplete:1; /* The scanned string is incomplete */
unsigned int count_lines:1; /* Is line counting enabled? */
unsigned int lineno; /* Line # of the last scanned token */

View File

@ -391,7 +391,7 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack,
struct dom_scanner scanner;
struct dom_select_node sel;
init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1);
init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1, 0);
memset(&sel, 0, sizeof(sel));

View File

@ -317,7 +317,7 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
/* The attribute source is complete. */
init_dom_scanner(&attr_scanner, &sgml_scanner_info,
&token->string, SGML_STATE_ELEMENT,
scanner->count_lines, 1);
scanner->count_lines, 1, 0);
if (dom_scanner_has_tokens(&attr_scanner)) {
/* Ignore parser codes from this
@ -393,11 +393,12 @@ sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data)
struct sgml_parsing_state *parsing = data;
int count_lines = !!(parser->flags & SGML_PARSER_COUNT_LINES);
int complete = !!(parser->flags & SGML_PARSER_COMPLETE);
int incremental = !!(parser->flags & SGML_PARSER_INCREMENTAL);
parsing->depth = parser->stack.depth;
get_dom_stack_top(&parser->stack)->immutable = 1;
init_dom_scanner(&parsing->scanner, &sgml_scanner_info, &node->string,
SGML_STATE_TEXT, count_lines, complete);
SGML_STATE_TEXT, count_lines, complete, incremental);
}
static void

View File

@ -98,6 +98,24 @@ skip_sgml_space(struct dom_scanner *scanner, unsigned char **string)
*string = pos;
}
/* Evaluates to non-zero only when the scanner was asked to generate complete
 * tokens only (check_complete), the scanned string is flagged as incomplete,
 * and @string has reached the end of the scanned string. @string is only
 * evaluated when both flags are set. */
#define check_sgml_incomplete(scanner, string) \
	(((scanner)->check_complete && (scanner)->incomplete) \
	 ? ((string) == (scanner)->end) \
	 : 0)
/* Turn @token into an SGML_TOKEN_INCOMPLETE token whose string covers
 * everything from the scanner's current position up to the end of the scanned
 * string, and stop the scanning by moving the position to the end. */
static void
set_sgml_incomplete(struct dom_scanner *scanner, struct dom_scanner_token *token)
{
	size_t left;

	/* Compare the pointers before subtracting: the difference ends up in
	 * an unsigned size_t, so a position past the end would wrap around to
	 * a huge value and slip through a `left > 0' check. */
	assert(scanner->position < scanner->end);
	left = scanner->end - scanner->position;

	token->type = SGML_TOKEN_INCOMPLETE;
	set_dom_string(&token->string, scanner->position, left);

	/* Stop the scanning. */
	scanner->position = scanner->end;
}
/* Text token scanning */
@ -119,6 +137,8 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke
token->string.string = string++;
if (first_char == '&') {
int complete = 0;
if (is_sgml_entity(*string)) {
scan_sgml(scanner, string, SGML_CHAR_ENTITY);
type = SGML_TOKEN_ENTITY;
@ -128,11 +148,18 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke
foreach_sgml_cdata (scanner, string) {
if (*string == ';') {
complete = 1;
string++;
break;
}
}
/* We want the biggest possible text token. */
if (check_sgml_incomplete(scanner, string) && !complete) {
set_sgml_incomplete(scanner, token);
return;
}
} else {
if (is_sgml_space(first_char)) {
if (scanner->count_lines
@ -156,6 +183,12 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke
/* m33p */;
}
}
/* We want the biggest possible text token. */
if (check_sgml_incomplete(scanner, string)) {
set_sgml_incomplete(scanner, token);
return;
}
}
token->type = type;
@ -237,7 +270,8 @@ skip_sgml(struct dom_scanner *scanner, unsigned char **string, unsigned char ski
}
static inline int
skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string,
int *possibly_incomplete)
{
unsigned char *pos = *string;
int length = 0;
@ -249,6 +283,7 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
* preceding '-'. */
if (pos[-2] == '-' && pos[-1] == '-' && &pos[-2] >= *string) {
length = pos - *string - 2;
*possibly_incomplete = 0;
pos++;
break;
}
@ -256,6 +291,9 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
if (!pos) {
pos = scanner->end;
/* The token is incomplete but set the length to handle tag
* soup gracefully. */
*possibly_incomplete = 1;
length = pos - *string;
}
@ -264,7 +302,8 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
}
static inline int
skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string,
int *possibly_incomplete)
{
unsigned char *pos = *string;
int length = 0;
@ -274,6 +313,7 @@ skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
* are supposed to have '<![CDATA[' before this is called. */
if (pos[-2] == ']' && pos[-1] == ']') {
length = pos - *string - 2;
*possibly_incomplete = 0;
pos++;
break;
}
@ -281,6 +321,9 @@ skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
if (!pos) {
pos = scanner->end;
/* The token is incomplete but set the length to handle tag
* soup gracefully. */
*possibly_incomplete = 1;
length = pos - *string;
}
@ -299,6 +342,7 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
unsigned char first_char = *string;
enum sgml_token_type type = SGML_TOKEN_GARBAGE;
int real_length = -1;
int possibly_incomplete = 1;
token->string.string = string++;
@ -313,6 +357,9 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
type = SGML_TOKEN_TAG_END;
scanner->state = SGML_STATE_TEXT;
/* We are creating a 'virtual' tag end token that has no source. */
possibly_incomplete = 0;
} else if (is_sgml_ident(*string)) {
token->string.string = string;
scan_sgml(scanner, string, SGML_CHAR_IDENT);
@ -323,7 +370,16 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
if (*string == '>') {
type = SGML_TOKEN_ELEMENT;
string++;
/* We found the end. */
possibly_incomplete = 0;
} else {
/* Was any space skipped? */
if (is_sgml_space(string[-1])) {
/* We found the end. */
possibly_incomplete = 0;
}
scanner->state = SGML_STATE_ELEMENT;
type = SGML_TOKEN_ELEMENT_BEGIN;
}
@ -341,7 +397,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
string += 2;
type = SGML_TOKEN_NOTATION_COMMENT;
token->string.string = string;
real_length = skip_sgml_comment(scanner, &string);
real_length = skip_sgml_comment(scanner, &string,
&possibly_incomplete);
assert(real_length >= 0);
} else if (string + 6 < scanner->end
@ -350,13 +407,17 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
string += 7;
type = SGML_TOKEN_CDATA_SECTION;
token->string.string = string;
real_length = skip_sgml_cdata_section(scanner, &string);
real_length = skip_sgml_cdata_section(scanner, &string,
&possibly_incomplete);
assert(real_length >= 0);
} else {
skip_sgml_space(scanner, &string);
type = map_dom_scanner_string(scanner, ident, string, base);
skip_sgml(scanner, &string, '>', 0);
if (skip_sgml(scanner, &string, '>', 0)) {
/* We found the end. */
possibly_incomplete = 0;
}
}
} else if (*string == '?') {
@ -375,6 +436,11 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
real_length = string - token->string.string;
skip_sgml_space(scanner, &string);
if (is_sgml_space(string[-1])) {
/* We found the end. */
possibly_incomplete = 0;
}
} else if (*string == '/') {
string++;
skip_sgml_space(scanner, &string);
@ -385,12 +451,18 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
real_length = string - token->string.string;
type = SGML_TOKEN_ELEMENT_END;
skip_sgml(scanner, &string, '>', 1);
if (skip_sgml(scanner, &string, '>', 1)) {
/* We found the end. */
possibly_incomplete = 0;
}
} else if (*string == '>') {
string++;
real_length = 0;
type = SGML_TOKEN_ELEMENT_END;
/* We found the end. */
possibly_incomplete = 0;
}
if (type != SGML_TOKEN_GARBAGE)
@ -398,15 +470,28 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
} else {
/* Alien < > stuff so ignore it */
skip_sgml(scanner, &string, '>', 0);
if (skip_sgml(scanner, &string, '>', 0)) {
/* We found the end. */
possibly_incomplete = 0;
}
}
} else if (first_char == '=') {
type = '=';
/* We found the end. */
possibly_incomplete = 0;
} else if (first_char == '?' || first_char == '>') {
if (first_char == '?') {
skip_sgml(scanner, &string, '>', 0);
if (skip_sgml(scanner, &string, '>', 0)) {
/* We found the end. */
possibly_incomplete = 0;
}
} else {
assert(first_char == '>');
/* We found the end. */
possibly_incomplete = 0;
}
type = SGML_TOKEN_TAG_END;
@ -414,17 +499,33 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
scanner->state = SGML_STATE_TEXT;
} else if (first_char == '/') {
/* We allow '/' inside elements and only consider it as an end
* tag if it immediately precedes the '>' char. This is to allow
*
* '<form action=/ >' where '/' is part of a path and
* '<form action=a />' where '/>' is truly a tag end
*
* For stricter parsing we should always require attribute
* values to be quoted.
*/
if (*string == '>') {
string++;
real_length = 0;
type = SGML_TOKEN_ELEMENT_EMPTY_END;
assert(scanner->state == SGML_STATE_ELEMENT);
scanner->state = SGML_STATE_TEXT;
/* We found the end. */
possibly_incomplete = 0;
} else if (is_sgml_attribute(*string)) {
scan_sgml_attribute(scanner, string);
type = SGML_TOKEN_ATTRIBUTE;
if (string[-1] == '/' && string[0] == '>')
if (string[-1] == '/' && string[0] == '>') {
string--;
/* We found the end. */
possibly_incomplete = 0;
}
}
} else if (isquote(first_char)) {
@ -436,6 +537,10 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
real_length = string_end - token->string.string;
string = string_end + 1;
type = SGML_TOKEN_STRING;
/* We found the end. */
possibly_incomplete = 0;
} else if (is_sgml_attribute(*string)) {
token->string.string++;
scan_sgml_attribute(scanner, string);
@ -451,11 +556,19 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
if (is_sgml_attribute(*string)) {
scan_sgml_attribute(scanner, string);
type = SGML_TOKEN_ATTRIBUTE;
if (string[-1] == '/' && string[0] == '>')
if (string[-1] == '/' && string[0] == '>') {
/* We found the end. */
possibly_incomplete = 0;
string--;
}
}
}
if (possibly_incomplete && check_sgml_incomplete(scanner, string)) {
set_sgml_incomplete(scanner, token);
return;
}
token->type = type;
token->string.length = real_length >= 0 ? real_length : string - token->string.string;
token->precedence = get_sgml_precedence(type);
@ -482,7 +595,14 @@ scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token
}
}
if (!string) string = scanner->end;
if (!string) {
if (check_sgml_incomplete(scanner, string)) {
set_sgml_incomplete(scanner, token);
return;
}
string = scanner->end;
}
token->type = SGML_TOKEN_PROCESS_DATA;
token->string.length = string - token->string.string - 2;