From a578ed4667f297c73309f2e404ea41cdeeee6386 Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Sat, 31 Dec 2005 02:46:56 +0100 Subject: [PATCH] Make the SGML scanner (optionally) keep track of line numbers A new line is either \n or \f. The main logic for counting lines is in skip_sgml{,_chars,_space}. For the general case where line numbers are not wanted the code tries to avoid the extra checks for newlines. This will be useful for reporting errors when loading the XBEL file. --- src/dom/scanner.c | 4 +++- src/dom/scanner.h | 5 ++++- src/dom/select.c | 2 +- src/dom/sgml/parser.c | 5 +++-- src/dom/sgml/scanner.c | 36 ++++++++++++++++++++++++++++++++---- 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/dom/scanner.c b/src/dom/scanner.c index b9e9d9798..e682a7d15 100644 --- a/src/dom/scanner.c +++ b/src/dom/scanner.c @@ -154,7 +154,7 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info) void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, - struct dom_string *string, int state) + struct dom_string *string, int state, int count_lines) { if (!scanner_info->initialized) { init_dom_scanner_info(scanner_info); @@ -169,5 +169,7 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i scanner->current = scanner->table; scanner->info = scanner_info; scanner->state = state; + scanner->count_lines = !!count_lines; + scanner->lineno = scanner->count_lines; scanner->info->scan(scanner); } diff --git a/src/dom/scanner.h b/src/dom/scanner.h index a0c58ee00..71978f351 100644 --- a/src/dom/scanner.h +++ b/src/dom/scanner.h @@ -92,7 +92,7 @@ struct dom_scanner_info { /* Initializes the scanner. */ void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info, - struct dom_string *string, int state); + struct dom_string *string, int state, int count_lines); /* The number of tokens in the scanners token table: * At best it should be big enough to contain properties with space separated @@ -123,6 +123,9 @@ struct dom_scanner { int line; #endif + unsigned int count_lines:1; + unsigned int lineno; + /* Some state indicator only meaningful to the scanner internals */ int state; diff --git a/src/dom/select.c b/src/dom/select.c index b3fe4cb20..33f9e4c91 100644 --- a/src/dom/select.c +++ b/src/dom/select.c @@ -391,7 +391,7 @@ parse_dom_select(struct dom_select *select, struct dom_stack *stack, struct dom_scanner scanner; struct dom_select_node sel; - init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0); + init_dom_scanner(&scanner, &dom_css_scanner_info, string, 0, 0); memset(&sel, 0, sizeof(sel)); diff --git a/src/dom/sgml/parser.c b/src/dom/sgml/parser.c index 72ffdbe08..2602028e2 100644 --- a/src/dom/sgml/parser.c +++ b/src/dom/sgml/parser.c @@ -296,7 +296,8 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner) struct dom_scanner attr_scanner; init_dom_scanner(&attr_scanner, &sgml_scanner_info, - &token->string, SGML_STATE_ELEMENT); + &token->string, SGML_STATE_ELEMENT, + scanner->count_lines); if (dom_scanner_has_tokens(&attr_scanner)) parse_sgml_attributes(stack, &attr_scanner); @@ -361,7 +362,7 @@ sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data) parsing->depth = parser->stack.depth; get_dom_stack_top(&parser->stack)->immutable = 1; init_dom_scanner(&parsing->scanner, &sgml_scanner_info, &node->string, - SGML_STATE_TEXT); + SGML_STATE_TEXT, 0); } static void diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c index 306d6739e..8a51aab86 100644 --- a/src/dom/sgml/scanner.c +++ b/src/dom/sgml/scanner.c @@ -36,7 +36,7 @@ static struct dom_scan_table_info sgml_scan_table_info[] = { DOM_SCAN_TABLE_STRING("-_:.", SGML_CHAR_IDENT | SGML_CHAR_ENTITY), DOM_SCAN_TABLE_STRING("#", SGML_CHAR_ENTITY), DOM_SCAN_TABLE_STRING(" \f\n\r\t\v", SGML_CHAR_WHITESPACE), - DOM_SCAN_TABLE_STRING("\f\n\r", SGML_CHAR_NEWLINE), + DOM_SCAN_TABLE_STRING("\f\n", SGML_CHAR_NEWLINE), DOM_SCAN_TABLE_STRING("<&", SGML_CHAR_NOT_TEXT), DOM_SCAN_TABLE_STRING("<=>", SGML_CHAR_NOT_ATTRIBUTE), @@ -74,6 +74,7 @@ struct dom_scanner_info sgml_scanner_info = { #define is_sgml_ident(c) check_sgml_table(c, SGML_CHAR_IDENT) #define is_sgml_entity(c) check_sgml_table(c, SGML_CHAR_ENTITY) #define is_sgml_space(c) check_sgml_table(c, SGML_CHAR_WHITESPACE) +#define is_sgml_newline(c) check_sgml_table(c, SGML_CHAR_NEWLINE) #define is_sgml_text(c) !check_sgml_table(c, SGML_CHAR_NOT_TEXT) #define is_sgml_token_start(c) check_sgml_table(c, SGML_CHAR_TOKEN_START) #define is_sgml_attribute(c) !check_sgml_table(c, SGML_CHAR_NOT_ATTRIBUTE | SGML_CHAR_WHITESPACE) @@ -83,7 +84,16 @@ skip_sgml_space(struct dom_scanner *scanner, unsigned char **string) { unsigned char *pos = *string; - scan_sgml(scanner, pos, SGML_CHAR_WHITESPACE); + if (!scanner->count_lines) { + scan_sgml(scanner, pos, SGML_CHAR_WHITESPACE); + } else { + while (pos < scanner->end && is_sgml_space(*pos)) { + if (is_sgml_newline(*pos)) + scanner->lineno++; + pos++; + } + } + *string = pos; } @@ -157,11 +167,26 @@ static inline unsigned char * skip_sgml_chars(struct dom_scanner *scanner, unsigned char *string, unsigned char skipto) { + int newlines; + assert(string >= scanner->position && string <= scanner->end); - for (; string < scanner->end; string++) { - if (*string == skipto) + if (!scanner->count_lines) { + size_t length = scanner->end - string; + + return memchr(string, skipto, length); + } + + for (newlines = 0; string < scanner->end; string++) { + if (is_sgml_newline(*string)) + newlines++; + if (*string == skipto) { + /* Only count newlines if we actually find the + * requested char. Else callers are assumed to discard + * the scanning. */ + scanner->lineno += newlines; return string; + } } return NULL; @@ -189,6 +214,9 @@ skip_sgml(struct dom_scanner *scanner, unsigned char **string, unsigned char ski end = skip_sgml_chars(scanner, pos + 1, *pos); if (end) pos = end; + + } else if (scanner->count_lines && is_sgml_newline(*pos)) { + scanner->lineno++; } }