Add mode where the SGML scanner checks for completeness

2025-02-02 15:09:23 -05:00 · 2006-01-02 17:46:09 +01:00 · 2006-01-02 17:46:09 +01:00 · e78d43f1ac
commit e78d43f1ac
parent af72dd8435
5 changed files with 146 additions and 16 deletions
--- a/src/dom/scanner.c
+++ b/src/dom/scanner.c
@ -154,7 +154,8 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info)

 void
 init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
-		 struct dom_string *string, int state, int count_lines, int complete)
+		 struct dom_string *string, int state, int count_lines, int complete,
+		 int check_complete)
 {
 	if (!scanner_info->initialized) {
 		init_dom_scanner_info(scanner_info);
@ -170,6 +171,8 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i
 	scanner->info = scanner_info;
 	scanner->state = state;
 	scanner->count_lines = !!count_lines;
+	scanner->incomplete = !complete;
+	scanner->check_complete = !!check_complete;
 	scanner->lineno = scanner->count_lines;
 	scanner->info->scan(scanner);
 }
--- a/src/dom/scanner.h
+++ b/src/dom/scanner.h
@ -92,7 +92,8 @@ struct dom_scanner_info {

 /* Initializes the scanner. */
 void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
-		      struct dom_string *string, int state, int count_lines, int complete);
+		      struct dom_string *string, int state, int count_lines, int complete,
+		      int check_complete);

 /* The number of tokens in the scanners token table:
 * At best it should be big enough to contain properties with space separated
@ -123,7 +124,12 @@ struct dom_scanner {
 	int line;
 #endif

+	/* The following two flags are used when parsing is incremental and
+	 * the scanner must ensure that only tokens that are complete are
+	 * generated. */
+	unsigned int check_complete:1;	/* Only generate complete tokens */
 	unsigned int incomplete:1;	/* The scanned string is incomplete */
+
 	unsigned int count_lines:1;	/* Is line counting enbaled? */
 	unsigned int lineno;		/* Line # of the last scanned token */

--- a/src/dom/select.c
+++ b/src/dom/select.c
@ -391,7 +391,7 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack,
 	struct dom_scanner scanner;
 	struct dom_select_node sel;

-	init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1);
+	init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0, 1, 0);

 	memset(&sel, 0, sizeof(sel));

--- a/src/dom/sgml/parser.c
+++ b/src/dom/sgml/parser.c
@ -317,7 +317,7 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
 				/* The attribute souce is complete. */
 				init_dom_scanner(&attr_scanner, &sgml_scanner_info,
 						 &token->string, SGML_STATE_ELEMENT,
-						 scanner->count_lines, 1);
+						 scanner->count_lines, 1, 0);

 				if (dom_scanner_has_tokens(&attr_scanner)) {
 					/* Ignore parser codes from this
@ -393,11 +393,12 @@ sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data)
 	struct sgml_parsing_state *parsing = data;
 	int count_lines = !!(parser->flags & SGML_PARSER_COUNT_LINES);
 	int complete = !!(parser->flags & SGML_PARSER_COMPLETE);
+	int incremental = !!(parser->flags & SGML_PARSER_INCREMENTAL);

 	parsing->depth = parser->stack.depth;
 	get_dom_stack_top(&parser->stack)->immutable = 1;
 	init_dom_scanner(&parsing->scanner, &sgml_scanner_info, &node->string,
-			 SGML_STATE_TEXT, count_lines, complete);
+			 SGML_STATE_TEXT, count_lines, complete, incremental);
 }

 static void
--- a/src/dom/sgml/scanner.c
+++ b/src/dom/sgml/scanner.c
@ -98,6 +98,24 @@ skip_sgml_space(struct dom_scanner *scanner, unsigned char **string)
 	*string = pos;
 }

+#define check_sgml_incomplete(scanner, string) \
+	((scanner)->check_complete \
+	 && (scanner)->incomplete \
+	 && (string) == (scanner)->end)
+
+static void
+set_sgml_incomplete(struct dom_scanner *scanner, struct dom_scanner_token *token)
+{
+	size_t left = scanner->end - scanner->position;
+
+	assert(left > 0);
+
+	token->type = SGML_TOKEN_INCOMPLETE;
+	set_dom_string(&token->string, scanner->position, left);
+
+	/* Stop the scanning. */
+	scanner->position = scanner->end;
+}

 /* Text token scanning */

@ -119,6 +137,8 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke
 	token->string.string = string++;

 	if (first_char == '&') {
+		int complete = 0;
+
 		if (is_sgml_entity(*string)) {
 			scan_sgml(scanner, string, SGML_CHAR_ENTITY);
 			type = SGML_TOKEN_ENTITY;
@ -128,11 +148,18 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke

 		foreach_sgml_cdata (scanner, string) {
 			if (*string == ';') {
+				complete = 1;
 				string++;
 				break;
 			}
 		}

+		/* We want the biggest possible text token. */
+		if (check_sgml_incomplete(scanner, string) && !complete) {
+			set_sgml_incomplete(scanner, token);
+			return;
+		}
+
 	} else {
 		if (is_sgml_space(first_char)) {
 			if (scanner->count_lines
@ -156,6 +183,12 @@ scan_sgml_text_token(struct dom_scanner *scanner, struct dom_scanner_token *toke
 				/* m33p */;
 			}
 		}
+
+		/* We want the biggest possible text token. */
+		if (check_sgml_incomplete(scanner, string)) {
+			set_sgml_incomplete(scanner, token);
+			return;
+		}
 	}

 	token->type = type;
@ -237,7 +270,8 @@ skip_sgml(struct dom_scanner *scanner, unsigned char **string, unsigned char ski
 }

 static inline int
-skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
+skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string,
+		  int *possibly_incomplete)
 {
 	unsigned char *pos = *string;
 	int length = 0;
@ -249,6 +283,7 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
 		 * preceeding '-'. */
 		if (pos[-2] == '-' && pos[-1] == '-' && &pos[-2] >= *string) {
 			length = pos - *string - 2;
+			*possibly_incomplete = 0;
 			pos++;
 			break;
 		}
@ -256,6 +291,9 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)

 	if (!pos) {
 		pos = scanner->end;
+		/* The token is incomplete but set the length to handle
+		 * tag soup gracefully. */
+		*possibly_incomplete = 1;
 		length = pos - *string;
 	}

@ -264,7 +302,8 @@ skip_sgml_comment(struct dom_scanner *scanner, unsigned char **string)
 }

 static inline int
-skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
+skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string,
+		  	int *possibly_incomplete)
 {
 	unsigned char *pos = *string;
 	int length = 0;
@ -274,6 +313,7 @@ skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)
 		 * are supposed to have '<![CDATA[' before this is called. */
 		if (pos[-2] == ']' && pos[-1] == ']') {
 			length = pos - *string - 2;
+			*possibly_incomplete = 0;
 			pos++;
 			break;
 		}
@ -281,6 +321,9 @@ skip_sgml_cdata_section(struct dom_scanner *scanner, unsigned char **string)

 	if (!pos) {
 		pos = scanner->end;
+		/* The token is incomplete but set the length to handle tag
+		 * soup gracefully. */
+		*possibly_incomplete = 1;
 		length = pos - *string;
 	}

@ -299,6 +342,7 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 	unsigned char first_char = *string;
 	enum sgml_token_type type = SGML_TOKEN_GARBAGE;
 	int real_length = -1;
+	int possibly_incomplete = 1;

 	token->string.string = string++;

@ -313,6 +357,9 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 			type = SGML_TOKEN_TAG_END;
 			scanner->state = SGML_STATE_TEXT;

+			/* We are creating a 'virtual' token that has no source. */
+			possibly_incomplete = 0;
+
 		} else if (is_sgml_ident(*string)) {
 			token->string.string = string;
 			scan_sgml(scanner, string, SGML_CHAR_IDENT);
@ -323,7 +370,16 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 			if (*string == '>') {
 				type = SGML_TOKEN_ELEMENT;
 				string++;
+
+				/* We found the end. */
+				possibly_incomplete = 0;
+
 			} else {
+				/* Was any space skipped? */
+				if (is_sgml_space(string[-1])) {
+					/* We found the end. */
+					possibly_incomplete = 0;
+				}
 				scanner->state = SGML_STATE_ELEMENT;
 				type = SGML_TOKEN_ELEMENT_BEGIN;
 			}
@ -341,7 +397,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 				string += 2;
 				type = SGML_TOKEN_NOTATION_COMMENT;
 				token->string.string = string;
-				real_length = skip_sgml_comment(scanner, &string);
+				real_length = skip_sgml_comment(scanner, &string,
+								&possibly_incomplete);
 				assert(real_length >= 0);

 			} else if (string + 6 < scanner->end
@ -350,13 +407,17 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 				string += 7;
 				type = SGML_TOKEN_CDATA_SECTION;
 				token->string.string = string;
-				real_length = skip_sgml_cdata_section(scanner, &string);
+				real_length = skip_sgml_cdata_section(scanner, &string,
+								      &possibly_incomplete);
 				assert(real_length >= 0);

 			} else {
 				skip_sgml_space(scanner, &string);
 				type = map_dom_scanner_string(scanner, ident, string, base);
-				skip_sgml(scanner, &string, '>', 0);
+				if (skip_sgml(scanner, &string, '>', 0)) {
+					/* We found the end. */
+					possibly_incomplete = 0;
+				}
 			}

 		} else if (*string == '?') {
@ -375,6 +436,11 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 			real_length = string - token->string.string;
 			skip_sgml_space(scanner, &string);

+			if (is_sgml_space(string[-1])) {
+				/* We found the end. */
+				possibly_incomplete = 0;
+			}
+
 		} else if (*string == '/') {
 			string++;
 			skip_sgml_space(scanner, &string);
@ -385,12 +451,18 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 				real_length = string - token->string.string;

 				type = SGML_TOKEN_ELEMENT_END;
-				skip_sgml(scanner, &string, '>', 1);
+				if (skip_sgml(scanner, &string, '>', 1)) {
+					/* We found the end. */
+					possibly_incomplete = 0;
+				}

 			} else if (*string == '>') {
 				string++;
 				real_length = 0;
 				type = SGML_TOKEN_ELEMENT_END;
+
+				/* We found the end. */
+				possibly_incomplete = 0;
 			}

 			if (type != SGML_TOKEN_GARBAGE)
@ -398,15 +470,28 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t

 		} else {
 			/* Alien < > stuff so ignore it */
-			skip_sgml(scanner, &string, '>', 0);
+			if (skip_sgml(scanner, &string, '>', 0)) {
+				/* We found the end. */
+				possibly_incomplete = 0;
+			}
 		}

 	} else if (first_char == '=') {
 		type = '=';
+		/* We found the end. */
+		possibly_incomplete = 0;

 	} else if (first_char == '?' || first_char == '>') {
 		if (first_char == '?') {
-			skip_sgml(scanner, &string, '>', 0);
+			if (skip_sgml(scanner, &string, '>', 0)) {
+				/* We found the end. */
+				possibly_incomplete = 0;
+			}
+		} else {
+			assert(first_char == '>');
+
+			/* We found the end. */
+			possibly_incomplete = 0;
 		}

 		type = SGML_TOKEN_TAG_END;
@ -414,17 +499,33 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 		scanner->state = SGML_STATE_TEXT;

 	} else if (first_char == '/') {
+		/* We allow '/' inside elements and only consider it as an end
+		 * tag if it immediately precedes the '>' char. This is to allow
+		 *
+		 *	'<form action=/ >'	where '/' is part of a path and
+		 *	'<form action=a />'	where '/>' is truly a tag end
+		 *
+		 * For stricter parsing we should always require attribute
+		 * values to be quoted.
+		 */
 		if (*string == '>') {
 			string++;
 			real_length = 0;
 			type = SGML_TOKEN_ELEMENT_EMPTY_END;
 			assert(scanner->state == SGML_STATE_ELEMENT);
 			scanner->state = SGML_STATE_TEXT;
+
+			/* We found the end. */
+			possibly_incomplete = 0;
+
 		} else if (is_sgml_attribute(*string)) {
 			scan_sgml_attribute(scanner, string);
 			type = SGML_TOKEN_ATTRIBUTE;
-			if (string[-1] == '/' && string[0] == '>')
+			if (string[-1] == '/' && string[0] == '>') {
 				string--;
+				/* We found the end. */
+				possibly_incomplete = 0;
+			}
 		}

 	} else if (isquote(first_char)) {
@ -436,6 +537,10 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 			real_length = string_end - token->string.string;
 			string = string_end + 1;
 			type = SGML_TOKEN_STRING;
+
+			/* We found the end. */
+			possibly_incomplete = 0;
+
 		} else if (is_sgml_attribute(*string)) {
 			token->string.string++;
 			scan_sgml_attribute(scanner, string);
@ -451,11 +556,19 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 		if (is_sgml_attribute(*string)) {
 			scan_sgml_attribute(scanner, string);
 			type = SGML_TOKEN_ATTRIBUTE;
-			if (string[-1] == '/' && string[0] == '>')
+			if (string[-1] == '/' && string[0] == '>') {
+				/* We found the end. */
+				possibly_incomplete = 0;
 				string--;
+			}
 		}
 	}

+	if (possibly_incomplete && check_sgml_incomplete(scanner, string)) {
+		set_sgml_incomplete(scanner, token);
+		return;
+	}
+
 	token->type = type;
 	token->string.length = real_length >= 0 ? real_length : string - token->string.string;
 	token->precedence = get_sgml_precedence(type);
@ -482,7 +595,14 @@ scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token
 		}
 	}

-	if (!string) string = scanner->end;
+	if (!string) {
+		if (check_sgml_incomplete(scanner, string)) {
+			set_sgml_incomplete(scanner, token);
+			return;
+		}
+
+		string = scanner->end;
+	}

 	token->type = SGML_TOKEN_PROCESS_DATA;
 	token->string.length = string - token->string.string - 2;