/* SGML token scanner utilities */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* NOTE(review): the header names of the next two #include directives are
 * missing in this copy of the file (presumably two system headers written in
 * angle brackets were stripped). The code below uses memcmp(), assert() and
 * ssize_t, so the required declarations must come from somewhere; restore the
 * names from a pristine copy. */
#include
#include

#include "elinks.h"

#include "dom/scanner.h"
#include "dom/sgml/scanner.h"
#include "dom/string.h"
#include "util/error.h"


/* Bitmap entries for the SGML character groups used in the scanner table.
 * Each value is a distinct bit so that several groups can be OR-ed together
 * for one character and tested with a single '&' (see check_sgml_table()). */
enum sgml_char_group {
	SGML_CHAR_ENTITY = (1 << 1),
	SGML_CHAR_IDENT = (1 << 2),
	SGML_CHAR_NEWLINE = (1 << 3),
	SGML_CHAR_WHITESPACE = (1 << 4),
	SGML_CHAR_NOT_TEXT = (1 << 5),
	SGML_CHAR_NOT_ATTRIBUTE = (1 << 6),
};

/* Declarative description of which characters belong to which group(s).
 * The DOM_SCAN_TABLE_* macros are defined elsewhere; presumably RANGE covers
 * every character from the first argument's character up to the second
 * argument and STRING covers each character of the string -- confirm against
 * dom/scanner.h. */
static struct dom_scan_table_info sgml_scan_table_info[] = {
	DOM_SCAN_TABLE_RANGE("0", '9', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
	DOM_SCAN_TABLE_RANGE("A", 'Z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
	DOM_SCAN_TABLE_RANGE("a", 'z', SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
	/* For the octal number impaired (me including) \241 is 161 --jonas */
	DOM_SCAN_TABLE_RANGE("\241", 255, SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
	DOM_SCAN_TABLE_STRING("-_:.", SGML_CHAR_IDENT | SGML_CHAR_ENTITY),
	/* '#' may appear in an entity reference but is not an identifier char. */
	DOM_SCAN_TABLE_STRING("#", SGML_CHAR_ENTITY),
	DOM_SCAN_TABLE_STRING(" \f\n\r\t\v", SGML_CHAR_WHITESPACE),
	/* Only form feed and line feed bump the line counter. */
	DOM_SCAN_TABLE_STRING("\f\n", SGML_CHAR_NEWLINE),
	DOM_SCAN_TABLE_STRING("<&", SGML_CHAR_NOT_TEXT),
	DOM_SCAN_TABLE_STRING("<=>", SGML_CHAR_NOT_ATTRIBUTE),
	DOM_SCAN_TABLE_END,
};

/* Build one string mapping initializer: the literal @str, followed by the
 * token type and the token family, both given without their SGML_TOKEN_
 * prefix (it is pasted on here). */
#define SGML_STRING_MAP(str, type, family) \
	{ STATIC_DOM_STRING(str), SGML_TOKEN_##type, SGML_TOKEN_##family }

/* Strings that select a specific notation or processing instruction token
 * type; looked up through map_dom_scanner_string() in
 * scan_sgml_element_token(). */
static struct dom_scanner_string_mapping sgml_string_mappings[] = {
	SGML_STRING_MAP("--", NOTATION_COMMENT, NOTATION),
	SGML_STRING_MAP("ATTLIST", NOTATION_ATTLIST, NOTATION),
	SGML_STRING_MAP("DOCTYPE", NOTATION_DOCTYPE, NOTATION),
	SGML_STRING_MAP("ELEMENT", NOTATION_ELEMENT, NOTATION),
	SGML_STRING_MAP("ENTITY", NOTATION_ENTITY, NOTATION),

	SGML_STRING_MAP("xml", PROCESS_XML, PROCESS),
	SGML_STRING_MAP("xml-stylesheet", PROCESS_XML_STYLESHEET, PROCESS),

	DOM_STRING_MAP_END,
};

static struct dom_scanner_token *scan_sgml_tokens(struct dom_scanner *scanner);

/* The scanner description exported to the generic DOM scanner. The
 * initializer is positional, so its order must match the member order of
 * struct dom_scanner_info (declared elsewhere). */
struct dom_scanner_info sgml_scanner_info = {
	sgml_string_mappings,
	sgml_scan_table_info,
	scan_sgml_tokens,
};

/* Non-zero if the scan table entry for character @c has any of the bits in
 * @bit set. The cast to unsigned char keeps the index non-negative when plain
 * char is signed. */
#define check_sgml_table(c, bit) (sgml_scanner_info.scan_table[(unsigned char)(c)] & (bit))

/* Advance @s for as long as it is before the scanner's end and points at a
 * character in group @bit.
 * NOTE(review): this expands to a bare `while` statement that already ends in
 * ';' and evaluates @s repeatedly, so it is only safe as a full statement
 * inside braces with a side-effect free lvalue for @s. */
#define scan_sgml(scanner, s, bit) \
	while ((s) < (scanner)->end && check_sgml_table(*(s), bit)) (s)++;

#define is_sgml_ident(c) check_sgml_table(c, SGML_CHAR_IDENT)
#define is_sgml_entity(c) check_sgml_table(c, SGML_CHAR_ENTITY)
#define is_sgml_space(c) check_sgml_table(c, SGML_CHAR_WHITESPACE)
#define is_sgml_newline(c) check_sgml_table(c, SGML_CHAR_NEWLINE)
/* NOTE(review): the two negated predicates below are not wrapped in outer
 * parentheses; take care when combining them with other operators. */
#define is_sgml_text(c) !check_sgml_table(c, SGML_CHAR_NOT_TEXT)
/* NOTE(review): SGML_CHAR_TOKEN_START is not one of the enum sgml_char_group
 * constants defined above, so this macro would not compile if it were used
 * (unless the name is provided by a header). */
#define is_sgml_token_start(c) check_sgml_table(c, SGML_CHAR_TOKEN_START)
/* An attribute character is anything that is neither one of "<=>" nor
 * whitespace. */
#define is_sgml_attribute(c) !check_sgml_table(c, SGML_CHAR_NOT_ATTRIBUTE | SGML_CHAR_WHITESPACE)

/* Move *@string past any whitespace, never going beyond scanner->end.
 * When the scanner counts lines, scanner->lineno is incremented once for each
 * skipped character of the newline group. */
static inline void
skip_sgml_space(struct dom_scanner *scanner, char **string)
{
	char *pos = *string;

	if (!scanner->count_lines) {
		/* Fast path: no bookkeeping needed. */
		scan_sgml(scanner, pos, SGML_CHAR_WHITESPACE);
	} else {
		while (pos < scanner->end && is_sgml_space(*pos)) {
			if (is_sgml_newline(*pos))
				scanner->lineno++;
			pos++;
		}
	}

	*string = pos;
}

/* True when completeness checking is enabled, the scanner is flagged as
 * incomplete and scanning of the current token ran into the end of the
 * scanned text. */
#define check_sgml_incomplete(scanner, string) \
	((scanner)->check_complete \
	 && (scanner)->incomplete \
	 && (string) == (scanner)->end)

/* Turn @token into an SGML_TOKEN_INCOMPLETE token that covers everything from
 * the current scanner position to the end of the text, and stop scanning by
 * moving the position to the end. */
static void
set_sgml_incomplete(struct dom_scanner *scanner, struct dom_scanner_token *token)
{
	size_t left = scanner->end - scanner->position;

	/* There must be something left to put in the token. (left is
	 * unsigned, so this only rules out zero.) */
	assert(left > 0);

	token->type = SGML_TOKEN_INCOMPLETE;
	set_dom_string(&token->string, scanner->position, left);

	/* Stop the scanning. */
	scanner->position = scanner->end;
}

/* Tell whether an error should be reported now: returns non-zero only if
 * error detection is enabled and no error had been flagged on entry.
 * As a side effect the scanner's found_error flag is always cleared. */
static inline int
check_sgml_error(struct dom_scanner *scanner)
{
	unsigned int found_error = scanner->found_error;

	/* Toggle if we found an error previously. */
	scanner->found_error = 0;

	return scanner->detect_errors && !found_error;
}

/* Choose where the text reported for an error token of the given @type should
 * end; @end is the default. For CDATA sections and the ATTLIST, DOCTYPE and
 * ELEMENT notations the end is clamped to at most 9 characters past the
 * current scanner position.
 *
 * NOTE(review): only the first part of this function survives in this copy;
 * see the note inside the comment case below. */
static char *
get_sgml_error_end(struct dom_scanner *scanner, /*enum sgml_token_type*/ unsigned int type, char *end)
{
	switch (type) {
	case SGML_TOKEN_CDATA_SECTION:
	case SGML_TOKEN_NOTATION_ATTLIST:
	case SGML_TOKEN_NOTATION_DOCTYPE:
	case SGML_TOKEN_NOTATION_ELEMENT:
		if (scanner->position + 9 < end)
			end = scanner->position + 9;
		break;

	case SGML_TOKEN_NOTATION_COMMENT:
		/* Just include the '
		 *
		 * NOTE(review): this copy of the file is damaged here. The
		 * sentence above breaks off and the text resumes below in what
		 * looks like the tail of a different function: none of `pos`,
		 * `length` or `possibly_incomplete` used after this comment is
		 * declared in get_sgml_error_end(), and `string` is not a
		 * parameter of it. Presumably the rest of this function, the
		 * beginning of skip_sgml_comment() and the definitions of
		 * other helpers that are called but not defined in the
		 * visible text (set_sgml_error, get_sgml_precedence,
		 * skip_sgml_chars, skip_sgml) were lost. Restore them from a
		 * pristine copy; do not trust the code that follows as
		 * written.
		 *
		 * ...'' are not overlapping any preceding '-'. Additionally
		 * also handle the quirky '--!>' end sometimes found. */
		if (pos[-2] == '-') {
			if (pos[-1] == '-' && &pos[-2] >= *string) {
				/* Ends in two dashes: drop both from the length. */
				length = pos - *string - 2;
				*possibly_incomplete = 0;
				pos++;
				break;
			} else if (pos[-1] == '!' && pos[-3] == '-' && &pos[-3] >= *string) {
				/* Ends in '--!': drop all three from the length. */
				length = pos - *string - 3;
				*possibly_incomplete = 0;
				pos++;
				break;
			}
		}
	}

	if (!pos) {
		pos = scanner->end;
		/* The token is incomplete but set the length to handle tag
		 * soup graciously. */
		*possibly_incomplete = 1;
		length = pos - *string;
	}

	*string = pos;
	return length;
}

/* Skip a CDATA section starting at *@string. On return *@string has been
 * updated, *@possibly_incomplete tells whether the terminator may be missing
 * and the return value is the length of the section's content.
 *
 * NOTE(review): the loop body and the code after the loop were lost in this
 * copy (see the comment inside the loop), so the statements below do not
 * implement what this description says. */
static inline int
skip_sgml_cdata_section(struct dom_scanner *scanner, char **string,
			int *possibly_incomplete)
{
	char *pos = *string;
	int length = 0;

	for ( ; (pos = skip_sgml_chars(scanner, pos, '>')); pos++) {
		/* It is always safe to access index -2 and -1 here since we
		 * are supposed to have '
		 *
		 * NOTE(review): source text is missing at this point. The
		 * check for the section terminator, the end of the loop and
		 * the `if (!pos)` guard that presumably preceded the two
		 * statements below were swallowed. As written the statements
		 * run on every loop iteration and *string is assigned the
		 * NULL that ends the loop. Restore from a pristine copy.
		 *
		 * ...'end; The token is incomplete but set the length to
		 * handle tag soup graciously. */
		*possibly_incomplete = 1;
		length = pos - *string;
	}

	*string = pos;
	return length;
}

/* Advance @str over attribute characters, stopping at scanner->end.
 * NOTE(review): like scan_sgml() this is a bare `while` statement with its
 * own trailing ';' and it evaluates @str repeatedly. */
#define scan_sgml_attribute(scanner, str) \
	while ((str) < (scanner)->end && is_sgml_attribute(*(str))) \
		(str)++;

/* Scan one token that starts a tag or lives inside a tag, beginning at
 * scanner->position, and store it in @token.
 *
 * The first character selects the kind of token: '<' opens an element,
 * notation ('!'), processing instruction ('?') or end tag ('/'); '=', '?',
 * '>' and '/' are handled as punctuation inside an element; a quote starts a
 * string; any other attribute character starts an identifier or attribute
 * value. Unless a branch proves the token is terminated, it is treated as
 * possibly incomplete and may be replaced by an incomplete or an error token
 * at the end.
 *
 * On a normal return the token's type, string, length and precedence are
 * set, and scanner->position and scanner->state are updated.
 *
 * NOTE(review): part of the '?' branch is missing in this copy, which leaves
 * the braces of this function unbalanced; see the note in that branch. */
static inline void
scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
{
	char *string = scanner->position;
	unsigned char first_char = *string;
	/*enum sgml_token_type*/ unsigned int type = SGML_TOKEN_GARBAGE;
	/* A negative length means "derive it from the final scan position". */
	int real_length = -1;
	int possibly_incomplete = 1;
	/* The new state is only committed once the token is accepted. */
	/*enum sgml_scanner_state*/ unsigned int scanner_state = scanner->state;

	token->string.string = string++;

	if (first_char == '<') {
		skip_sgml_space(scanner, &string);

		if (scanner->state == SGML_STATE_ELEMENT) {
			/* Already inside an element so insert a tag end token
			 * and continue scanning in next iteration. */
			type = SGML_TOKEN_TAG_END;
			scanner_state = SGML_STATE_TEXT;

			/* We are creating a 'virtual' tag end token that has
			 * no source text: zero length, and the position is
			 * rewound so the '<' is scanned again. */
			possibly_incomplete = 0;
			string = token->string.string;
			real_length = 0;

		} else if (string == scanner->end) {
			/* It is incomplete so prevent out of bound access to
			 * the scanned string. */

		} else if (is_sgml_ident(*string)) {
			/* Element name: the token covers just the name. */
			token->string.string = string;
			scan_sgml(scanner, string, SGML_CHAR_IDENT);

			real_length = string - token->string.string;

			skip_sgml_space(scanner, &string);
			if (string < scanner->end && *string == '>') {
				type = SGML_TOKEN_ELEMENT;
				string++;

				/* We found the end. */
				possibly_incomplete = 0;

			} else {
				/* Was any space skipped? */
				if (is_sgml_space(string[-1])) {
					/* We found the end. */
					possibly_incomplete = 0;
				}

				/* Attributes (or garbage) follow the name. */
				type = SGML_TOKEN_ELEMENT_BEGIN;
				scanner_state = SGML_STATE_ELEMENT;
			}

		} else if (*string == '!') {
			char *ident;
			/*enum sgml_token_type*/ unsigned int base = SGML_TOKEN_NOTATION;

			string++;
			skip_sgml_space(scanner, &string);
			token->string.string = ident = string;

			if (string + 1 < scanner->end
			    && string[0] == '-' && string[1] == '-') {
				/* Comment: the token holds the text after the
				 * two leading dashes. */
				string += 2;
				type = SGML_TOKEN_NOTATION_COMMENT;
				token->string.string = string;

				real_length = skip_sgml_comment(scanner, &string,
								&possibly_incomplete);
				assert(real_length >= 0);

			} else if (string + 6 < scanner->end
				   && !memcmp(string, "[CDATA[", 7)) {
				/* The bounds check guarantees the 7 bytes
				 * compared are inside the scanned text. */
				string += 7;
				type = SGML_TOKEN_CDATA_SECTION;
				token->string.string = string;

				real_length = skip_sgml_cdata_section(scanner, &string,
								      &possibly_incomplete);
				assert(real_length >= 0);

			} else {
				/* Other notation: classify by its keyword. */
				scan_sgml(scanner, string, SGML_CHAR_IDENT);
				type = map_dom_scanner_string(scanner, ident, string, base);
				if (skip_sgml(scanner, &string, '>', 0)) {
					/* We found the end. */
					possibly_incomplete = 0;
				}
			}

		} else if (*string == '?') {
			char *pos;
			/*enum sgml_token_type*/ unsigned int base = SGML_TOKEN_PROCESS;

			string++;
			skip_sgml_space(scanner, &string);
			token->string.string = pos = string;
			scan_sgml(scanner, string, SGML_CHAR_IDENT);

			/* Classify the processing instruction by its target. */
			type = map_dom_scanner_string(scanner, pos, string, base);

			/* The data part is scanned as a separate token. */
			scanner_state = SGML_STATE_PROC_INST;

			real_length = string - token->string.string;
			skip_sgml_space(scanner, &string);

			/* Make '
			 *
			 * NOTE(review): source text is missing at this point.
			 * The rest of this comment and the condition of the
			 * `if` whose body follows were swallowed, so the
			 * closing brace after the assignment below has no
			 * visible opening brace and the function's braces do
			 * not balance. Restore from a pristine copy.
			 *
			 * ...'end) { We found the end. */
				possibly_incomplete = 0;
			}

			if (scanner->check_complete && scanner->incomplete) {
				/* We need to fit both the process target token
				 * and the process data token into the scanner
				 * table. */
				if (token + 1 >= scanner->table + DOM_SCANNER_TOKENS) {
					possibly_incomplete = 1;

				} else if (!possibly_incomplete) {
					/* FIXME: We do this twice. */
					/* Look ahead for a '>' directly
					 * preceded by '?'. */
					for (pos = string + 1;
					     (pos = skip_sgml_chars(scanner, pos, '>'));
					     pos++) {
						if (pos[-1] == '?')
							break;
					}
					if (!pos)
						possibly_incomplete = 1;
				}

				/* Make check_sgml_incomplete() succeed below. */
				if (possibly_incomplete)
					string = scanner->end;
			}

		} else if (*string == '/') {
			string++;
			skip_sgml_space(scanner, &string);

			if (string == scanner->end) {
				/* Prevent out of bound access. */

			} else if (is_sgml_ident(*string)) {
				/* End tag with a name. */
				token->string.string = string;
				scan_sgml(scanner, string, SGML_CHAR_IDENT);
				real_length = string - token->string.string;

				type = SGML_TOKEN_ELEMENT_END;
				if (skip_sgml(scanner, &string, '>', 1)) {
					/* We found the end. */
					possibly_incomplete = 0;
				}

			} else if (*string == '>') {
				/* End tag without a name. */
				string++;
				real_length = 0;
				type = SGML_TOKEN_ELEMENT_END;

				/* We found the end. */
				possibly_incomplete = 0;
			}

			if (type != SGML_TOKEN_GARBAGE) {
				scanner_state = SGML_STATE_TEXT;
			}

		} else {
			/* Alien < > stuff so ignore it */
			if (skip_sgml(scanner, &string, '>', 0)) {
				/* We found the end. */
				possibly_incomplete = 0;
			}
		}

	} else if (first_char == '=') {
		/* The token type is the character code itself. */
		type = '=';
		/* We found the end. */
		possibly_incomplete = 0;

	} else if (first_char == '?' || first_char == '>') {
		if (first_char == '?') {
			if (skip_sgml(scanner, &string, '>', 0)) {
				/* We found the end. */
				possibly_incomplete = 0;
			}
		} else {
			assert(first_char == '>');

			/* We found the end. */
			possibly_incomplete = 0;
		}

		type = SGML_TOKEN_TAG_END;
		assert(scanner->state == SGML_STATE_ELEMENT);
		scanner_state = SGML_STATE_TEXT;

	} else if (first_char == '/') {
		/* We allow '/' inside elements and only consider it as an end
		 * tag if it immediately precedes the '>' char. This is to
		 * allow both an unquoted attribute value in which '/' is part
		 * of a path and a tag in which '/>' is truly a tag end. (The
		 * two inline examples that illustrated this are missing from
		 * this copy of the file.)
		 *
		 * For stricter parsing we should always require attribute
		 * values to be quoted. */
		if (string == scanner->end) {
			/* Prevent out of bound access. */

		} else if (*string == '>') {
			string++;
			real_length = 0;
			type = SGML_TOKEN_ELEMENT_EMPTY_END;
			assert(scanner->state == SGML_STATE_ELEMENT);
			scanner_state = SGML_STATE_TEXT;

			/* We found the end. */
			possibly_incomplete = 0;

		} else if (is_sgml_attribute(*string)) {
			scan_sgml_attribute(scanner, string);
			type = SGML_TOKEN_ATTRIBUTE;
			/* A trailing '/' right before '>' is left for the
			 * next token.
			 * NOTE(review): string may equal scanner->end here,
			 * in which case string[0] reads the byte at the end
			 * pointer -- confirm the buffer is terminated. */
			if (string[-1] == '/' && string[0] == '>') {
				string--;
				/* We found the end. */
				possibly_incomplete = 0;
			}
		}

	} else if (isquote(first_char)) {
		/* Look for the matching closing quote. */
		char *string_end = skip_sgml_chars(scanner, string, first_char);

		if (string_end) {
			/* We don't want the delimiters in the token */
			token->string.string++;
			real_length = string_end - token->string.string;
			string = string_end + 1;
			type = SGML_TOKEN_STRING;

			/* We found the end. */
			possibly_incomplete = 0;

		} else if (scanner->check_complete && scanner->incomplete) {
			/* Force an incomplete token. */
			string = scanner->end;

		} else if (string < scanner->end
			   && is_sgml_attribute(*string)) {
			/* Unterminated quote: fall back to treating what
			 * follows the quote as an unquoted attribute value. */
			token->string.string++;
			scan_sgml_attribute(scanner, string);
			type = SGML_TOKEN_ATTRIBUTE;
		}

	} else if (is_sgml_attribute(first_char)) {
		if (is_sgml_ident(first_char)) {
			scan_sgml(scanner, string, SGML_CHAR_IDENT);
			type = SGML_TOKEN_IDENT;
		}

		/* Non-identifier attribute characters follow, so the whole
		 * run becomes an attribute value instead. */
		if (string < scanner->end
		    && is_sgml_attribute(*string)) {
			scan_sgml_attribute(scanner, string);
			type = SGML_TOKEN_ATTRIBUTE;
			/* NOTE(review): same potential read at scanner->end
			 * as in the '/' branch above. */
			if (string[-1] == '/' && string[0] == '>') {
				/* We found the end. */
				possibly_incomplete = 0;
				string--;
			}
		}
	}

	if (possibly_incomplete) {
		if (check_sgml_incomplete(scanner, string)) {
			set_sgml_incomplete(scanner, token);
			return;
		}

		if (check_sgml_error(scanner) && string == scanner->end) {
			char *end;

			end = get_sgml_error_end(scanner, type, string);
			/* set_sgml_error() is defined elsewhere; a NULL
			 * result means there is no token to fill in. */
			token = set_sgml_error(scanner, end);
			if (!token)
				return;
		}
	}

	/* Only apply the state change if the token was not abandoned because
	 * it was incomplete. */
	scanner->state = scanner_state;

	token->type = type;
	token->string.length = real_length >= 0 ? real_length : string - token->string.string;
	token->precedence = get_sgml_precedence(type);
	scanner->position = string;
}


/* Processing instruction data scanning */

/* Scan the data part of a processing instruction, starting at
 * scanner->position, into @token as an SGML_TOKEN_PROCESS_DATA token and
 * switch the scanner back to the text state. If no terminator is found the
 * token may instead become an incomplete or an error token. */
static inline void
scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
{
	char *string = scanner->position;
	/* The data can be empty when the terminator follows directly (the
	 * inline example is missing from this copy of the file). A negative
	 * value means no terminator was found. */
	ssize_t length = -1;

	token->string.string = string++;

	/* Figure out where the processing instruction ends. This doesn't use
	 * skip_sgml() since we MUST ignore precedence here to allow '<' inside
	 * the data part to be skipped correctly. */
	for ( ; (string = skip_sgml_chars(scanner, string, '>')); string++) {
		if (string[-1] == '?') {
			/* Step past the '>' and leave the two terminator
			 * characters out of the length. */
			string++;
			length = string - token->string.string - 2;
			break;
		}
	}

	if (!string) {
		/* Makes the next succeed when checking for incompletion, and
		 * puts the rest of the text within the token. */
		string = scanner->end;

		if (check_sgml_incomplete(scanner, string)) {
			set_sgml_incomplete(scanner, token);
			return;
		}

		if (check_sgml_error(scanner)) {
			token = set_sgml_error(scanner, string);
			if (!token)
				return;
		}
	}

	token->type = SGML_TOKEN_PROCESS_DATA;
	token->string.length = length >= 0 ? length : string - token->string.string;
	token->precedence = get_sgml_precedence(token->type);
	scanner->position = string;
	scanner->state = SGML_STATE_TEXT;
}


/* Scanner multiplexor */

/* Fill the scanner's token table, dispatching to the element, text or
 * processing instruction scanner according to the scanner state and the next
 * character. Returns whatever the generic get_dom_scanner_token() or
 * end_dom_token_scanning() helper returns. */
static struct dom_scanner_token *
scan_sgml_tokens(struct dom_scanner *scanner)
{
	struct dom_scanner_token *table_end = scanner->table + DOM_SCANNER_TOKENS;

	if (!begin_dom_token_scanning(scanner))
		return get_dom_scanner_token(scanner);

	/* Scan tokens until we fill the table */
	for (scanner->current = scanner->table + scanner->tokens;
	     scanner->current < table_end && scanner->position < scanner->end;
	     scanner->current++) {
		if (scanner->state == SGML_STATE_ELEMENT
		    || (*scanner->position == '<'
			&& scanner->state != SGML_STATE_PROC_INST)) {
			skip_sgml_space(scanner, &scanner->position);
			/* Nothing but whitespace was left. */
			if (scanner->position >= scanner->end)
				break;

			scan_sgml_element_token(scanner, scanner->current);

			/* Shall we scratch this token? */
			if (scanner->current->type == SGML_TOKEN_SKIP) {
				/* Undo the loop increment so the slot is
				 * reused. */
				scanner->current--;
			}

		} else if (scanner->state == SGML_STATE_TEXT) {
			scan_sgml_text_token(scanner, scanner->current);

		} else {
			scan_sgml_proc_inst_token(scanner, scanner->current);
		}
	}

	return end_dom_token_scanning(scanner, scanner->current);
}