/* CSS token scanner utilities */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include "elinks.h" #include "document/css/scanner.h" #include "util/error.h" #include "util/scanner.h" #include "util/string.h" /* Bitmap entries for the CSS character groups used in the scanner table */ enum css_char_group { CSS_CHAR_ALPHA = (1 << 0), CSS_CHAR_DIGIT = (1 << 1), CSS_CHAR_HEX_DIGIT = (1 << 2), CSS_CHAR_IDENT = (1 << 3), CSS_CHAR_IDENT_START = (1 << 4), CSS_CHAR_NEWLINE = (1 << 5), CSS_CHAR_NON_ASCII = (1 << 6), CSS_CHAR_SGML_MARKUP = (1 << 7), CSS_CHAR_TOKEN = (1 << 8), CSS_CHAR_TOKEN_START = (1 << 9), CSS_CHAR_WHITESPACE = (1 << 10), }; static const struct scan_table_info css_scan_table_info[] = { SCAN_TABLE_RANGE("0", '9', CSS_CHAR_DIGIT | CSS_CHAR_HEX_DIGIT | CSS_CHAR_IDENT), SCAN_TABLE_RANGE("A", 'F', CSS_CHAR_HEX_DIGIT), SCAN_TABLE_RANGE("A", 'Z', CSS_CHAR_ALPHA | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START), SCAN_TABLE_RANGE("a", 'f', CSS_CHAR_HEX_DIGIT), SCAN_TABLE_RANGE("a", 'z', CSS_CHAR_ALPHA | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START), /* For the octal number impared (me including) \241 is 161 --jonas */ SCAN_TABLE_RANGE("\241", 255, CSS_CHAR_NON_ASCII | CSS_CHAR_IDENT | CSS_CHAR_IDENT_START), SCAN_TABLE_STRING(" \f\n\r\t\v\000", CSS_CHAR_WHITESPACE), SCAN_TABLE_STRING("\f\n\r", CSS_CHAR_NEWLINE), SCAN_TABLE_STRING("-", CSS_CHAR_IDENT), SCAN_TABLE_STRING(".#@!\"'<-/", CSS_CHAR_TOKEN_START), /* Unicode escape (that we do not handle yet) + other special chars */ SCAN_TABLE_STRING("\\_*", CSS_CHAR_IDENT | CSS_CHAR_IDENT_START), /* This should contain mostly used char tokens like ':' and maybe a few * garbage chars that people might put in their CSS code */ SCAN_TABLE_STRING("({});:,.>", CSS_CHAR_TOKEN), SCAN_TABLE_STRING("", CSS_CHAR_SGML_MARKUP), SCAN_TABLE_END, }; static const struct scanner_string_mapping css_string_mappings[] = { { "Hz", CSS_TOKEN_FREQUENCY, CSS_TOKEN_DIMENSION }, { "cm", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, { "deg", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION }, { "em", CSS_TOKEN_EM, CSS_TOKEN_DIMENSION }, { "ex", CSS_TOKEN_EX, CSS_TOKEN_DIMENSION }, { "grad", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION }, { "in", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, { "kHz", CSS_TOKEN_FREQUENCY, CSS_TOKEN_DIMENSION }, { "mm", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, { "ms", CSS_TOKEN_TIME, CSS_TOKEN_DIMENSION }, { "pc", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, { "pt", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, { "px", CSS_TOKEN_LENGTH, CSS_TOKEN_DIMENSION }, { "rad", CSS_TOKEN_ANGLE, CSS_TOKEN_DIMENSION }, { "s", CSS_TOKEN_TIME, CSS_TOKEN_DIMENSION }, { "rgb", CSS_TOKEN_RGB, CSS_TOKEN_FUNCTION }, { "url", CSS_TOKEN_URL, CSS_TOKEN_FUNCTION }, { "charset", CSS_TOKEN_AT_CHARSET, CSS_TOKEN_AT_KEYWORD }, { "font-face", CSS_TOKEN_AT_FONT_FACE, CSS_TOKEN_AT_KEYWORD }, { "import", CSS_TOKEN_AT_IMPORT, CSS_TOKEN_AT_KEYWORD }, { "media", CSS_TOKEN_AT_MEDIA, CSS_TOKEN_AT_KEYWORD }, { "page", CSS_TOKEN_AT_PAGE, CSS_TOKEN_AT_KEYWORD }, { NULL, CSS_TOKEN_NONE, CSS_TOKEN_NONE }, }; static struct scanner_token *scan_css_tokens(struct scanner *scanner); struct scanner_info css_scanner_info = { css_string_mappings, css_scan_table_info, scan_css_tokens, }; #define check_css_table(c, bit) (css_scanner_info.scan_table[(c)] & (bit)) #define scan_css(scanner, s, bit) \ while ((s) < (scanner)->end && check_css_table(*(s), bit)) (s)++; #define scan_back_css(scanner, s, bit) \ while ((s) >= (scanner)->string && check_css_table(*(s), bit)) (s)--; #define is_css_ident_start(c) check_css_table(c, CSS_CHAR_IDENT_START) #define is_css_ident(c) check_css_table(c, CSS_CHAR_IDENT) #define is_css_digit(c) check_css_table(c, CSS_CHAR_DIGIT) #define is_css_hexdigit(c) check_css_table(c, CSS_CHAR_HEX_DIGIT) #define is_css_char_token(c) check_css_table(c, CSS_CHAR_TOKEN) #define is_css_token_start(c) check_css_table(c, CSS_CHAR_TOKEN_START) #define skip_css(scanner, s, skipto) \ while (s < (scanner)->end \ && *(s) != (skipto) \ && check_css_precedence(*(s), skipto)) { \ if (isquote(*(s))) { \ int size = (scanner)->end - (s); \ unsigned char *end = memchr(s + 1, *(s), size); \ \ if (end) (s) = end; \ } \ (s)++; \ } static inline void scan_css_token(struct scanner *scanner, struct scanner_token *token) { unsigned char *string = scanner->position; unsigned char first_char = *string; enum css_token_type type = CSS_TOKEN_GARBAGE; int real_length = -1; assert(first_char); token->string = string++; if (is_css_char_token(first_char)) { type = first_char; } else if (is_css_digit(first_char) || first_char == '.') { scan_css(scanner, string, CSS_CHAR_DIGIT); /* First scan the full number token */ if (*string == '.') { string++; if (is_css_digit(*string)) { type = CSS_TOKEN_NUMBER; scan_css(scanner, string, CSS_CHAR_DIGIT); } } /* Check what kind of number we have */ if (*string == '%') { if (first_char != '.') type = CSS_TOKEN_PERCENTAGE; string++; } else if (!is_css_ident_start(*string)) { type = CSS_TOKEN_NUMBER; } else { unsigned char *ident = string; scan_css(scanner, string, CSS_CHAR_IDENT); type = map_scanner_string(scanner, ident, string, CSS_TOKEN_DIMENSION); } } else if (is_css_ident_start(first_char)) { scan_css(scanner, string, CSS_CHAR_IDENT); if (*string == '(') { unsigned char *function_end = string + 1; /* Make sure that we have an ending ')' */ skip_css(scanner, function_end, ')'); if (*function_end == ')') { type = map_scanner_string(scanner, token->string, string, CSS_TOKEN_FUNCTION); /* If it is not a known function just skip the * how arg stuff so we don't end up generating * a lot of useless tokens. */ if (type == CSS_TOKEN_FUNCTION) { string = function_end; } else if (type == CSS_TOKEN_URL) { /* Extracting the URL first removes any * leading or ending whitespace and * then see if the url is given in a * string. If that is the case the * string delimiters are also trimmed. * This is not totally correct because * we should of course handle escape * sequences .. but that will have to * be fixed later. */ unsigned char *from = string + 1; unsigned char *to = function_end - 1; scan_css(scanner, from, CSS_CHAR_WHITESPACE); scan_back_css(scanner, to, CSS_CHAR_WHITESPACE); if (isquote(*from)) from++; if (isquote(*to)) to--; token->string = from; real_length = to - from + 1; assert(real_length >= 0); string = function_end; } assert(type != CSS_TOKEN_RGB || *string == '('); assert(type != CSS_TOKEN_URL || *string == ')'); assert(type != CSS_TOKEN_FUNCTION || *string == ')'); } string++; } else { type = CSS_TOKEN_IDENT; } } else if (!is_css_token_start(first_char)) { /* TODO: Better composing of error tokens. For now we just * split them down into char tokens */ } else if (first_char == '#') { /* Check whether it is hexcolor or hash token */ if (is_css_hexdigit(*string)) { int hexdigits; scan_css(scanner, string, CSS_CHAR_HEX_DIGIT); /* Check that the hexdigit sequence is either 3 or 6 * chars and it isn't just start of some non-hex ident * string. */ hexdigits = string - token->string - 1; if ((hexdigits == 3 || hexdigits == 6) && !is_css_ident(*string)) { type = CSS_TOKEN_HEX_COLOR; } else { scan_css(scanner, string, CSS_CHAR_IDENT); type = CSS_TOKEN_HASH; } } else if (is_css_ident(*string)) { /* Not *_ident_start() because hashes are #. */ scan_css(scanner, string, CSS_CHAR_IDENT); type = CSS_TOKEN_HASH; } } else if (first_char == '@') { /* Compose token containing @ */ if (is_css_ident_start(*string)) { unsigned char *ident = string; /* Scan both ident start and ident */ scan_css(scanner, string, CSS_CHAR_IDENT); type = map_scanner_string(scanner, ident, string, CSS_TOKEN_AT_KEYWORD); } } else if (first_char == '!') { scan_css(scanner, string, CSS_CHAR_WHITESPACE); if (!strncasecmp(string, "important", 9)) { type = CSS_TOKEN_IMPORTANT; string += 9; } } else if (isquote(first_char)) { /* TODO: Escaped delimiters --jonas */ int size = scanner->end - string; unsigned char *string_end = memchr(string, first_char, size); if (string_end) { /* We don't want the delimiters in the token */ token->string++; real_length = string_end - token->string; string = string_end + 1; type = CSS_TOKEN_STRING; } } else if (first_char == '<' || first_char == '-') { /* Try to navigate SGML tagsoup */ if (*string == '/') { /* Some kind of SGML tag end ... better bail out screaming */ type = CSS_TOKEN_NONE; } else { unsigned char *sgml = string; /* Skip anything looking like SGML "" * comments + notations. */ scan_css(scanner, sgml, CSS_CHAR_SGML_MARKUP); if (sgml - string >= 2 && ((first_char == '<' && *string == '!') || (first_char == '-' && sgml[-1] == '>'))) { type = CSS_TOKEN_SKIP; string = sgml; } } } else if (first_char == '/') { /* Comments */ if (*string == '*') { type = CSS_TOKEN_SKIP; for (string++; string < scanner->end; string++) if (*string == '*' && string[1] == '/') { string += 2; break; } } } else { INTERNAL("Someone forgot to put code for recognizing tokens " "which start with '%c'.", first_char); } token->type = type; token->length = real_length > 0 ? real_length : string - token->string; token->precedence = get_css_precedence(type); scanner->position = string; } static struct scanner_token * scan_css_tokens(struct scanner *scanner) { struct scanner_token *table_end = scanner->table + SCANNER_TOKENS; struct scanner_token *current; if (!begin_token_scanning(scanner)) return get_scanner_token(scanner); /* Scan tokens until we fill the table */ for (current = scanner->table + scanner->tokens; current < table_end && scanner->position < scanner->end; current++) { scan_css(scanner, scanner->position, CSS_CHAR_WHITESPACE); if (scanner->position >= scanner->end) break; scan_css_token(scanner, current); /* Did some one scream for us to end the madness? */ if (current->type == CSS_TOKEN_NONE) { scanner->position = NULL; current--; break; } /* Shall we scratch this token? */ if (current->type == CSS_TOKEN_SKIP) { current--; } } return end_token_scanning(scanner, current); }