1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-12-04 14:46:47 -05:00
elinks/src/dom/sgml/parser.c
2022-01-26 19:06:38 +01:00

729 lines
20 KiB
C

/* SGML node handling */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdlib.h>
#include <string.h>
#include "elinks.h"
#include "dom/node.h"
#include "dom/sgml/parser.h"
#include "dom/sgml/scanner.h"
#include "dom/sgml/sgml.h"
#include "dom/stack.h"
#include "dom/string.h"
#include "util/error.h"
#include "util/memory.h"
/* When getting the sgml_parser struct it is _always_ assumed that the parser
* is the first to add its context, which it is since it initializes the
* stack. */
/* Fetches the data pointer of the first context registered on @stack. */
#define get_sgml_parser(stack) ((stack)->contexts[0]->data)
/* Fetches the state data that the first context of @stack keeps for @state.
 * The @stack argument is parenthesized so that non-trivial expressions such
 * as &parser->stack expand with the intended precedence. */
#define get_sgml_parser_state(stack, state) \
	get_dom_stack_state_data((stack)->contexts[0], state)
/* Functions for adding new nodes to the DOM tree: */
/* They wrap init_dom_node() and add_dom_*() and set up of additional
* information like node subtypes and SGML parser state information. */
/* Creates the document node for @parser, named after the parser's URI, and
 * pushes it onto the parser's node stack.
 *
 * Returns the new node, or NULL if the node could not be created or the push
 * did not report DOM_CODE_OK. */
static inline struct dom_node *
add_sgml_document(struct sgml_parser *parser)
{
	struct dom_node *document;
	int allocated = parser->flags & SGML_PARSER_INCREMENTAL;

	document = init_dom_node(DOM_NODE_DOCUMENT, &parser->uri, allocated);
	if (!document)
		return NULL;

	if (push_dom_node(&parser->stack, document) != DOM_CODE_OK)
		return NULL;

	return document;
}
/* Adds an element node named by @token below the node currently on top of
 * @stack and pushes the new element onto the stack.
 *
 * The element's type is taken from the parser's element info table, and the
 * same info entry is recorded in the parser state of the new stack entry.
 *
 * Returns the new element, or NULL if it could not be added or pushed. */
static inline struct dom_node *
add_sgml_element(struct dom_stack *stack, struct dom_scanner_token *token)
{
	struct sgml_parser *parser = (struct sgml_parser *)get_sgml_parser(stack);
	struct dom_node *parent = get_dom_stack_top(stack)->node;
	struct sgml_parser_state *parser_state;
	struct sgml_node_info *info;
	struct dom_stack_state *state;
	struct dom_node *node;

	node = add_dom_element(parent, &token->string);
	if (!node)
		return NULL;

	info = get_sgml_node_info(parser->info->elements, node);
	node->data.element.type = info->type;

	if (push_dom_node(stack, node) != DOM_CODE_OK)
		return NULL;

	/* The element just pushed must now be the top of the stack. */
	state = get_dom_stack_top(stack);
	assert(node == state->node);

	parser_state = (struct sgml_parser_state *)get_sgml_parser_state(stack, state);
	parser_state->info = info;

	return node;
}
/* Adds an attribute node named by @token to the node on top of @stack.
 *
 * @valtoken is the optional value token; when it is NULL the attribute is
 * added without a value. For SGML_TOKEN_STRING values the character just
 * before the value text is recorded as the quote character.
 *
 * The attribute is pushed onto the stack and immediately popped again, so it
 * never stays on the stack.
 *
 * Returns the new attribute node, or NULL if it could not be added or
 * pushed. */
static inline struct dom_node *
add_sgml_attribute(struct dom_stack *stack,
		   struct dom_scanner_token *token, struct dom_scanner_token *valtoken)
{
	struct sgml_parser *parser = (struct sgml_parser *)get_sgml_parser(stack);
	struct dom_node *parent = get_dom_stack_top(stack)->node;
	struct dom_string *value = valtoken ? &valtoken->string : NULL;
	struct sgml_node_info *info;
	struct dom_node *node;

	node = add_dom_attribute(parent, &token->string, value);
	/* Bail out before touching the node: the fields below used to be
	 * written before the NULL check, dereferencing a NULL node whenever
	 * adding the attribute failed. */
	if (!node)
		return NULL;

	info = get_sgml_node_info(parser->info->attributes, node);

	node->data.attribute.type = info->type;
	node->data.attribute.id = !!(info->flags & SGML_ATTRIBUTE_IDENTIFIER);
	node->data.attribute.reference = !!(info->flags & SGML_ATTRIBUTE_REFERENCE);

	/* The string token text starts right after the opening quote, so the
	 * preceding character is the quote itself. */
	if (valtoken && valtoken->type == SGML_TOKEN_STRING)
		node->data.attribute.quoted = valtoken->string.string[-1];

	if (push_dom_node(stack, node) != DOM_CODE_OK)
		return NULL;

	pop_dom_node(stack);

	return node;
}
/* Adds a processing instruction node below the node on top of @stack and
 * pushes it onto the stack.
 *
 * @target names the instruction and selects its subtype (xml, xml-stylesheet
 * or generic); @data is the optional instruction data token.
 *
 * Returns the new node, which is left on the stack, or NULL if it could not
 * be added or pushed. */
static inline struct dom_node *
add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target,
			  struct dom_scanner_token *data)
{
	struct dom_node *parent = get_dom_stack_top(stack)->node;
	struct dom_string *payload = NULL;
	struct dom_node *instruction;

	if (data)
		payload = &data->string;

	instruction = add_dom_proc_instruction(parent, &target->string, payload);
	if (!instruction)
		return NULL;

	/* Anything that is not one of the two XML targets is a plain
	 * processing instruction. */
	if (target->type == SGML_TOKEN_PROCESS_XML)
		instruction->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML;
	else if (target->type == SGML_TOKEN_PROCESS_XML_STYLESHEET)
		instruction->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML_STYLESHEET;
	else
		instruction->data.proc_instruction.type = DOM_PROC_INSTRUCTION;

	if (push_dom_node(stack, instruction) != DOM_CODE_OK)
		return NULL;

	return instruction;
}
/* Adds a node of the given @type, named by @token, below the node on top of
 * @stack. The node is pushed and, if the push succeeded, popped right away.
 *
 * Returns the new node (even if the push failed), or NULL if the node could
 * not be added. */
static inline struct dom_node *
add_sgml_node(struct dom_stack *stack, enum dom_node_type type, struct dom_scanner_token *token)
{
	struct dom_node *parent = get_dom_stack_top(stack)->node;
	struct dom_node *child = add_dom_node(parent, type, &token->string);

	if (child) {
		/* Remember when the token consisted of whitespace only. */
		if (token->type == SGML_TOKEN_SPACE)
			child->data.text.only_space = 1;

		if (push_dom_node(stack, child) == DOM_CODE_OK)
			pop_dom_node(stack);
	}

	return child;
}
/* SGML parser main handling: */
static enum dom_code
call_sgml_error_function(struct dom_stack *stack, struct dom_scanner_token *token)
{
struct sgml_parser *parser = (struct sgml_parser *)get_sgml_parser(stack);
unsigned int line = get_sgml_parser_line_number(parser);
assert(parser->error_func);
return parser->error_func(parser, &token->string, line);
}
/* Extends or 'creates' an incomplete token so that it starts at @start. This
 * is used to force tokens back into the 'stream' when they need later tokens
 * to be available before they can be handled.
 *
 * - If @token is an incomplete token, it is stretched backwards so that it
 *   begins where @start begins.
 * - If there is no @token and the scanner checks for completeness and has
 *   flagged the source as incomplete, the scanner's token table is reset to a
 *   single incomplete token covering everything from @start to the end.
 *
 * Returns DOM_CODE_INDEX_SIZE_ERR when an incomplete token was set up and
 * DOM_CODE_OK otherwise.
 *
 * NOTE: This only works for tokens that are not stripped of markup, such as
 * identifiers. */
static enum dom_code
check_sgml_incomplete(struct dom_scanner *scanner,
		      struct dom_scanner_token *start,
		      struct dom_scanner_token *token)
{
	if (token) {
		if (token->type != SGML_TOKEN_INCOMPLETE)
			return DOM_CODE_OK;

		/* Grow the token by the distance back to the start token. */
		token->string.length += token->string.string - start->string.string;
		token->string.string = start->string.string;

		return DOM_CODE_INDEX_SIZE_ERR;
	}

	if (scanner->check_complete && scanner->incomplete) {
		size_t left = scanner->end - start->string.string;

		assert(left > 0);

		/* Restart the token table with one incomplete token. */
		scanner->current = scanner->table;
		scanner->tokens = 1;

		token = scanner->current;
		token->type = SGML_TOKEN_INCOMPLETE;
		set_dom_string(&token->string, start->string.string, left);

		return DOM_CODE_INDEX_SIZE_ERR;
	}

	return DOM_CODE_OK;
}
/* Parses the attributes of the element (or processing instruction) on top of
 * @stack, consuming tokens from @scanner until the tag ends or a new element
 * token shows up.
 *
 * Returns DOM_CODE_OK when the attribute list ended or the scanner ran out of
 * tokens, DOM_CODE_INCOMPLETE when more source is needed, DOM_CODE_ALLOC_ERR
 * when an attribute could not be added, or the non-OK code returned by the
 * error callback. */
static inline enum dom_code
parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
{
/* Copy of the attribute name token, kept while looking ahead for a value. */
struct dom_scanner_token name;
while (dom_scanner_has_tokens(scanner)) {
struct dom_scanner_token *token = get_dom_scanner_token(scanner);
assert(token);
switch (token->type) {
case SGML_TOKEN_TAG_END:
/* The tag end is consumed here; the element tokens below are
 * left in the scanner for the caller. */
skip_dom_scanner_token(scanner);
/* and return */
case SGML_TOKEN_ELEMENT:
case SGML_TOKEN_ELEMENT_BEGIN:
case SGML_TOKEN_ELEMENT_END:
case SGML_TOKEN_ELEMENT_EMPTY_END:
return DOM_CODE_OK;
case SGML_TOKEN_IDENT:
copy_struct(&name, token);
/* Skip the attribute name token */
token = get_next_dom_scanner_token(scanner);
if (token && token->type == '=') {
/* If the token is not a valid value token
* ignore it. */
token = get_next_dom_scanner_token(scanner);
/* The value may not have arrived yet; push the whole
 * attribute back as an incomplete token. */
if (check_sgml_incomplete(scanner, &name, token))
return DOM_CODE_INCOMPLETE;
if (token
&& token->type != SGML_TOKEN_IDENT
&& token->type != SGML_TOKEN_ATTRIBUTE
&& token->type != SGML_TOKEN_STRING)
token = NULL;
} else if (check_sgml_incomplete(scanner, &name, token)) {
return DOM_CODE_INCOMPLETE;
} else {
/* No '=' follows the name: the attribute has no value. */
token = NULL;
}
if (!add_sgml_attribute(stack, &name, token))
return DOM_CODE_ALLOC_ERR;
/* Skip the value token */
if (token)
skip_dom_scanner_token(scanner);
break;
case SGML_TOKEN_INCOMPLETE:
return DOM_CODE_INCOMPLETE;
case SGML_TOKEN_ERROR:
{
enum dom_code code;
/* Let the error callback decide whether parsing continues. */
code = call_sgml_error_function(stack, token);
if (code != DOM_CODE_OK)
return code;
skip_dom_scanner_token(scanner);
break;
}
default:
/* Any other token inside a tag is skipped. */
skip_dom_scanner_token(scanner);
}
}
return DOM_CODE_OK;
}
/* Main token loop: consumes tokens from @scanner and builds the matching DOM
 * nodes on @stack (elements, comments, CDATA sections, processing
 * instructions, entity references and text).
 *
 * Returns DOM_CODE_OK when the scanner ran out of tokens,
 * DOM_CODE_INCOMPLETE when more source is needed, DOM_CODE_ALLOC_ERR when an
 * element, comment, CDATA section or processing instruction node could not be
 * added, or the non-OK code returned by attribute parsing or the error
 * callback. */
static enum dom_code
parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
{
/* Copy of a processing instruction target token, kept while its data token
 * is fetched. */
struct dom_scanner_token target;
while (dom_scanner_has_tokens(scanner)) {
struct dom_scanner_token *token = get_dom_scanner_token(scanner);
switch (token->type) {
case SGML_TOKEN_ELEMENT:
case SGML_TOKEN_ELEMENT_BEGIN:
if (!add_sgml_element(stack, token))
return DOM_CODE_ALLOC_ERR;
/* Only ELEMENT_BEGIN is followed by attribute parsing. */
if (token->type == SGML_TOKEN_ELEMENT_BEGIN) {
enum dom_code code;
skip_dom_scanner_token(scanner);
code = parse_sgml_attributes(stack, scanner);
if (code != DOM_CODE_OK)
return code;
} else {
skip_dom_scanner_token(scanner);
}
break;
case SGML_TOKEN_ELEMENT_EMPTY_END:
/* The element on top of the stack is closed. */
pop_dom_node(stack);
skip_dom_scanner_token(scanner);
break;
case SGML_TOKEN_ELEMENT_END:
if (!token->string.length) {
/* An end tag without a name closes the top node. */
pop_dom_node(stack);
} else {
struct dom_string string;
struct dom_stack_state *state;
set_dom_string(&string, token->string.string, token->string.length);
/* Look for an element on the stack with the end tag's name;
 * an end tag without a match is ignored. */
state = search_dom_stack(stack, DOM_NODE_ELEMENT,
&string);
if (state) {
struct sgml_parser_state *pstate;
pstate = (struct sgml_parser_state *)get_sgml_parser_state(stack, state);
/* Record the end token in the element's parser state
 * before popping down to it. */
copy_struct(&pstate->end_token, token);
pop_dom_state(stack, state);
}
}
skip_dom_scanner_token(scanner);
break;
case SGML_TOKEN_NOTATION_COMMENT:
if (!add_sgml_node(stack, DOM_NODE_COMMENT, token))
return DOM_CODE_ALLOC_ERR;
skip_dom_scanner_token(scanner);
break;
/* The remaining notation tokens are skipped without creating nodes. */
case SGML_TOKEN_NOTATION_ATTLIST:
case SGML_TOKEN_NOTATION_DOCTYPE:
case SGML_TOKEN_NOTATION_ELEMENT:
case SGML_TOKEN_NOTATION_ENTITY:
case SGML_TOKEN_NOTATION:
skip_dom_scanner_token(scanner);
break;
case SGML_TOKEN_CDATA_SECTION:
if (!add_sgml_node(stack, DOM_NODE_CDATA_SECTION, token))
return DOM_CODE_ALLOC_ERR;
skip_dom_scanner_token(scanner);
break;
case SGML_TOKEN_PROCESS_XML_STYLESHEET:
case SGML_TOKEN_PROCESS_XML:
case SGML_TOKEN_PROCESS:
copy_struct(&target, token);
/* Skip the target token */
token = get_next_dom_scanner_token(scanner);
if (!token || token->type == SGML_TOKEN_INCOMPLETE)
return DOM_CODE_INCOMPLETE;
/* Leave the error token in the scanner; the next loop iteration
 * reports it through the SGML_TOKEN_ERROR case. */
if (token->type == SGML_TOKEN_ERROR)
break;
assert(token->type == SGML_TOKEN_PROCESS_DATA);
/* The token now holds the instruction data. */
if (!add_sgml_proc_instruction(stack, &target, token))
return DOM_CODE_ALLOC_ERR;
if ((target.type == SGML_TOKEN_PROCESS_XML
|| target.type == SGML_TOKEN_PROCESS_XML_STYLESHEET)
&& token->string.length > 0) {
/* Parse the <?xml data="attributes"?>. */
struct dom_scanner attr_scanner;
/* The attribute source is complete. */
init_dom_scanner(&attr_scanner, &sgml_scanner_info,
&token->string, SGML_STATE_ELEMENT,
scanner->count_lines, 1, 0, 0);
if (dom_scanner_has_tokens(&attr_scanner)) {
/* Ignore parser codes from this
* enhanced parsing of attributes. It
* is really just a simple way to try
* and support xml and xml-stylesheet
* instructions. */
parse_sgml_attributes(stack, &attr_scanner);
}
}
/* Pop the instruction node that add_sgml_proc_instruction() left on
 * the stack. */
pop_dom_node(stack);
skip_dom_scanner_token(scanner);
break;
case SGML_TOKEN_ENTITY:
/* NOTE(review): unlike comments and CDATA sections, a failure to add
 * the node is not reported here. */
add_sgml_node(stack, DOM_NODE_ENTITY_REFERENCE, token);
skip_dom_scanner_token(scanner);
break;
case SGML_TOKEN_INCOMPLETE:
return DOM_CODE_INCOMPLETE;
case SGML_TOKEN_ERROR:
{
enum dom_code code;
/* Let the error callback decide whether parsing continues. */
code = call_sgml_error_function(stack, token);
if (code != DOM_CODE_OK)
return code;
skip_dom_scanner_token(scanner);
break;
}
case SGML_TOKEN_SPACE:
case SGML_TOKEN_TEXT:
default:
/* Everything else becomes a text node; the result of adding it is
 * not checked. */
add_sgml_node(stack, DOM_NODE_TEXT, token);
skip_dom_scanner_token(scanner);
}
}
return DOM_CODE_OK;
}
/* Feeds a chunk of source to @parser.
 *
 * @buf and @bufsize describe the chunk; a non-zero @complete marks the parser
 * as having received the complete source. On the first call the document
 * root node is created and its stack entry is marked immutable.
 *
 * The chunk is wrapped in a text node that is pushed onto the parser's
 * parsing stack; the push callback registered for text nodes on that stack
 * (sgml_parsing_push()) stores the outcome in parser->code.
 *
 * Returns DOM_CODE_ALLOC_ERR if the root or the text node could not be set
 * up, and parser->code otherwise. */
enum dom_code
parse_sgml(struct sgml_parser *parser, char *buf, size_t bufsize,
	   int complete)
{
	struct dom_string source = INIT_DOM_STRING(buf, (unsigned int)bufsize);
	struct dom_node *chunk;

	if (complete)
		parser->flags |= SGML_PARSER_COMPLETE;

	if (!parser->root) {
		struct dom_node *root = add_sgml_document(parser);

		parser->root = root;
		if (!root)
			return DOM_CODE_ALLOC_ERR;

		get_dom_stack_top(&parser->stack)->immutable = 1;
	}

	chunk = init_dom_node(DOM_NODE_TEXT, &source, 0);
	if (!chunk)
		return DOM_CODE_ALLOC_ERR;

	if (push_dom_node(&parser->parsing, chunk) != DOM_CODE_OK)
		return DOM_CODE_ALLOC_ERR;

	return parser->code;
}
/* Parsing state management: */
/* The SGML parser can handle nested calls to parse_sgml(). This can be used to
* handle output of external processing of data in the document tree. For
* example, this allows the output of document.write() from the DOM scripting
* interface to be parsed. */
/* This holds info about a chunk of text being parsed. */
struct sgml_parsing_state {
/* Scanner over the chunk; (re)initialized by sgml_parsing_push(). */
struct dom_scanner scanner;
/* NOTE(review): not referenced in the visible part of this file. */
struct dom_node *node;
/* Copy of the text of a trailing incomplete token, kept so that the next
 * chunk can be appended to it. Released with done_dom_string(). */
struct dom_string incomplete;
/* Depth of the parser's node stack when the chunk was pushed;
 * sgml_parsing_pop() unwinds the node stack back to it. */
size_t depth;
/* Set when parsing of the chunk stopped part-way, so that the next pushed
 * chunk continues in this state instead of starting a new one. */
unsigned int resume:1;
};
/* Push callback of the parsing stack for text nodes: scans and parses the
 * source chunk held in @node's string.
 *
 * @stack is the parsing stack, @node the text node wrapping the chunk and
 * @data the struct sgml_parsing_state belonging to the new stack entry.
 *
 * The outcome of the parsing is stored in parser->code; the callback itself
 * always returns DOM_CODE_OK. */
enum dom_code
sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data)
{
struct sgml_parser *parser = (struct sgml_parser *)get_sgml_parser(stack);
struct sgml_parsing_state *parsing = (struct sgml_parsing_state *)data;
int count_lines = !!(parser->flags & SGML_PARSER_COUNT_LINES);
int complete = !!(parser->flags & SGML_PARSER_COMPLETE);
int incremental = !!(parser->flags & SGML_PARSER_INCREMENTAL);
int detect_errors = !!(parser->flags & SGML_PARSER_DETECT_ERRORS);
struct dom_string *string = &node->string;
struct dom_scanner_token *token;
struct dom_string incomplete;
/*enum sgml_scanner_state*/ unsigned int scanner_state = SGML_STATE_TEXT;
/* Remember the node stack depth so sgml_parsing_pop() can unwind to it. */
parsing->depth = parser->stack.depth;
if (stack->depth > 1) {
/* NOTE(review): assumes the state objects of consecutive stack entries
 * are adjacent in memory, so [-1] is the previous entry - confirm
 * against the stack implementation. */
struct sgml_parsing_state *parent = &parsing[-1];
if (parent->resume) {
if (is_dom_string_set(&parent->incomplete)) {
/* Continue the leftover text of the previous chunk with the
 * new chunk and scan the combined string. */
if (!add_to_dom_string(&parent->incomplete,
string->string,
string->length)) {
parser->code = DOM_CODE_ALLOC_ERR;
return DOM_CODE_OK;
}
string = &parent->incomplete;
}
/* Resume in the scanner state the previous chunk ended in. */
scanner_state = parent->scanner.state;
/* Pop down to the parent. */
parsing = parent;
parsing->resume = 0;
pop_dom_node(stack);
}
}
init_dom_scanner(&parsing->scanner, &sgml_scanner_info, string,
scanner_state, count_lines, complete, incremental,
detect_errors);
/* When resuming inside a tag, finish its attributes before going back to
 * the main token loop. */
if (scanner_state == SGML_STATE_ELEMENT) {
parser->code = parse_sgml_attributes(&parser->stack, &parsing->scanner);
if (parser->code == DOM_CODE_OK)
parser->code = parse_sgml_plain(&parser->stack, &parsing->scanner);
} else {
parser->code = parse_sgml_plain(&parser->stack, &parsing->scanner);
}
/* With the complete flag set nothing is kept for resuming. */
if (complete) {
pop_dom_node(&parser->parsing);
return DOM_CODE_OK;
}
if (parser->code != DOM_CODE_INCOMPLETE) {
/* No need to preserve the default scanner state. */
if (parsing->scanner.state == SGML_STATE_TEXT) {
pop_dom_node(&parser->parsing);
return DOM_CODE_OK;
}
/* Keep the entry so the next chunk resumes in the current scanner
 * state; there is no leftover text to carry over. */
done_dom_string(&parsing->incomplete);
parsing->resume = 1;
return DOM_CODE_OK;
}
/* The source ended inside a token: save a copy of the incomplete token
 * text so the next chunk can be appended to it. */
token = get_dom_scanner_token(&parsing->scanner);
assert(token && token->type == SGML_TOKEN_INCOMPLETE);
string = &token->string;
set_dom_string(&incomplete, NULL, 0);
if (!init_dom_string(&incomplete, string->string, string->length)) {
parser->code = DOM_CODE_ALLOC_ERR;
return DOM_CODE_OK;
}
/* The copy is made before the old string is released, since the token may
 * point into the old incomplete string. */
done_dom_string(&parsing->incomplete);
set_dom_string(&parsing->incomplete, incomplete.string, incomplete.length);
parsing->resume = 1;
return DOM_CODE_OK;
}
/* Pop callback of the parsing stack for text nodes: cleans up after a chunk.
 *
 * @stack is the parsing stack and @data the struct sgml_parsing_state of the
 * entry being popped; @node is not used.
 *
 * When the parser has the complete flag set, the parser's node stack is
 * unwound to the depth recorded when the chunk was pushed. The saved
 * incomplete text is released in all cases. Always returns DOM_CODE_OK. */
enum dom_code
sgml_parsing_pop(struct dom_stack *stack, struct dom_node *node, void *data)
{
	struct sgml_parsing_state *parsing = (struct sgml_parsing_state *)data;
	struct sgml_parser *parser = (struct sgml_parser *)get_sgml_parser(stack);

	/* Only clean up the node stack if complete so that we get proper
	 * nesting. */
	if (!(parser->flags & SGML_PARSER_COMPLETE)) {
		done_dom_string(&parsing->incomplete);
		return DOM_CODE_OK;
	}

	/* Restore the node stack to the state it was in when the chunk was
	 * pushed. Immutable entries are unlocked first so that they get
	 * popped as well. */
	while (parser->stack.depth > parsing->depth) {
		get_dom_stack_top(&parser->stack)->immutable = 0;
		pop_dom_node(&parser->stack);
	}

	/* The recorded depth can exceed the current one when
	 * done_sgml_parser() is called in the middle of an incomplete
	 * parsing. */
	assert(parsing->depth >= parser->stack.depth);

	done_dom_string(&parsing->incomplete);

	return DOM_CODE_OK;
}
/* Context info for the parsing stack (parser->parsing). Each stack entry gets
 * a struct sgml_parsing_state, and only text nodes have callbacks:
 * sgml_parsing_push() parses the pushed chunk and sgml_parsing_pop() cleans
 * up after it. */
static struct dom_stack_context_info sgml_parsing_context_info = {
/* Object size: */ sizeof(struct sgml_parsing_state),
/* Push: */
{
/* */ NULL,
/* DOM_NODE_ELEMENT */ NULL,
/* DOM_NODE_ATTRIBUTE */ NULL,
/* DOM_NODE_TEXT */ sgml_parsing_push,
/* DOM_NODE_CDATA_SECTION */ NULL,
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
/* DOM_NODE_ENTITY */ NULL,
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
/* DOM_NODE_COMMENT */ NULL,
/* DOM_NODE_DOCUMENT */ NULL,
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
/* DOM_NODE_NOTATION */ NULL,
},
/* Pop: */
{
/* */ NULL,
/* DOM_NODE_ELEMENT */ NULL,
/* DOM_NODE_ATTRIBUTE */ NULL,
/* DOM_NODE_TEXT */ sgml_parsing_pop,
/* DOM_NODE_CDATA_SECTION */ NULL,
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
/* DOM_NODE_ENTITY */ NULL,
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
/* DOM_NODE_COMMENT */ NULL,
/* DOM_NODE_DOCUMENT */ NULL,
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
/* DOM_NODE_NOTATION */ NULL,
}
};
unsigned int
get_sgml_parser_line_number(struct sgml_parser *parser)
{
struct dom_stack_state *state;
struct sgml_parsing_state *pstate;
assert(parser->flags & SGML_PARSER_COUNT_LINES);
if (dom_stack_is_empty(&parser->parsing))
return 0;
state = get_dom_stack_top(&parser->parsing);
pstate = (struct sgml_parsing_state *)get_dom_stack_state_data(parser->parsing.contexts[0], state);
assert(pstate->scanner.count_lines && pstate->scanner.lineno);
if (pstate->scanner.current
&& pstate->scanner.current < pstate->scanner.table + DOM_SCANNER_TOKENS
&& pstate->scanner.current->type == SGML_TOKEN_ERROR)
return pstate->scanner.current->lineno;
return pstate->scanner.lineno;
}
/* Parser creation and destruction: */
/* FIXME: For now the main SGML parser context doesn't do much other than
* declaring the sgml_parser_state object. */
/* Context info for the parser's node stack (parser->stack): each stack entry
 * gets a struct sgml_parser_state and no push or pop callbacks are
 * installed. */
static struct dom_stack_context_info sgml_parser_context_info = {
/* Object size: */ sizeof(struct sgml_parser_state),
/* Push: */
{
/* */ NULL,
/* DOM_NODE_ELEMENT */ NULL,
/* DOM_NODE_ATTRIBUTE */ NULL,
/* DOM_NODE_TEXT */ NULL,
/* DOM_NODE_CDATA_SECTION */ NULL,
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
/* DOM_NODE_ENTITY */ NULL,
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
/* DOM_NODE_COMMENT */ NULL,
/* DOM_NODE_DOCUMENT */ NULL,
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
/* DOM_NODE_NOTATION */ NULL,
},
/* Pop: */
{
/* */ NULL,
/* DOM_NODE_ELEMENT */ NULL,
/* DOM_NODE_ATTRIBUTE */ NULL,
/* DOM_NODE_TEXT */ NULL,
/* DOM_NODE_CDATA_SECTION */ NULL,
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
/* DOM_NODE_ENTITY */ NULL,
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
/* DOM_NODE_COMMENT */ NULL,
/* DOM_NODE_DOCUMENT */ NULL,
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
/* DOM_NODE_NOTATION */ NULL,
}
};
/* Allocates and sets up a new SGML parser.
 *
 * @type is the parser type, @doctype selects the SGML info used for looking
 * up elements and attributes, @uri is copied into the parser and @flags are
 * the parser flags. Requesting error detection also turns on line counting.
 *
 * Two stacks are set up: the node stack, which frees its nodes only for
 * stream parsers, and the parsing stack, which always frees its nodes.
 *
 * Returns the new parser, or NULL if the parser or its URI copy could not be
 * allocated. Free it with done_sgml_parser().
 *
 * NOTE(review): the results of add_dom_stack_context() are not checked. */
struct sgml_parser *
init_sgml_parser(enum sgml_parser_type type, enum sgml_document_type doctype,
		 struct dom_string *uri, /*enum sgml_parser_flag*/ unsigned int flags)
{
	/*enum dom_stack_flag*/ unsigned int stack_flags = 0;
	struct sgml_parser *parser;

	parser = (struct sgml_parser *)mem_calloc(1, sizeof(*parser));
	if (!parser)
		return NULL;

	if (!init_dom_string(&parser->uri, uri->string, uri->length)) {
		mem_free(parser);
		return NULL;
	}

	/* Errors are reported with a line number, so detecting them requires
	 * counting lines. */
	if (flags & SGML_PARSER_DETECT_ERRORS)
		flags |= SGML_PARSER_COUNT_LINES;

	parser->info = get_sgml_info(doctype);
	parser->flags = flags;
	parser->type = type;

	if (type == SGML_PARSER_STREAM)
		stack_flags |= DOM_STACK_FLAG_FREE_NODES;

	/* FIXME: Some sgml backend specific callbacks? Handle HTML script tags,
	 * and feed document.write() data back to the parser. */
	init_dom_stack(&parser->stack, stack_flags);
	add_dom_stack_context(&parser->stack, parser, &sgml_parser_context_info);

	/* Don't keep the 'fake' text nodes that hold the parsing data. */
	init_dom_stack(&parser->parsing, DOM_STACK_FLAG_FREE_NODES);
	add_dom_stack_context(&parser->parsing, parser, &sgml_parsing_context_info);

	return parser;
}
/* Releases @parser and everything it owns: any chunks still on the parsing
 * stack are popped first, then both stacks and the URI copy are released and
 * finally the parser itself is freed. */
void
done_sgml_parser(struct sgml_parser *parser)
{
	struct dom_stack *parsing = &parser->parsing;

	/* Unwind parsing that is still in progress so that the pop callbacks
	 * get to clean up. */
	while (!dom_stack_is_empty(parsing))
		pop_dom_node(parsing);

	done_dom_stack(parsing);
	done_dom_stack(&parser->stack);

	done_dom_string(&parser->uri);
	mem_free(parser);
}