mirror of
https://github.com/rkd77/elinks.git
synced 2025-01-03 14:57:44 -05:00
530 lines
14 KiB
C
530 lines
14 KiB
C
/* SGML node handling */
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config.h"
|
|
#endif
|
|
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
|
|
#include "elinks.h"
|
|
|
|
#include "dom/node.h"
|
|
#include "dom/sgml/parser.h"
|
|
#include "dom/sgml/scanner.h"
|
|
#include "dom/sgml/sgml.h"
|
|
#include "dom/stack.h"
|
|
#include "dom/string.h"
|
|
#include "util/error.h"
|
|
#include "util/memory.h"
|
|
|
|
|
|
/* This holds info about a chunk of text being parsed. The SGML parser uses
|
|
* these to keep track of possible nested calls to parse_sgml(). This can be
|
|
* used to feed output of stuff like ECMAScripts document.write() from
|
|
* <script>-elements back to the SGML parser. */
|
|
struct sgml_parsing_state {
|
|
struct dom_scanner scanner;
|
|
struct dom_node *node;
|
|
size_t depth;
|
|
};
|
|
|
|
/* Forward declaration; parse_sgml() calls this before its definition. */
static struct sgml_parsing_state *init_sgml_parsing_state(struct sgml_parser *parser,
							  struct dom_string *buffer);
|
|
|
|
|
|
/* When getting the sgml_parser struct it is _always_ assumed that the parser
 * is the first to add its context, which it is since it initializes the
 * stack. */

/* Yields the data pointer stored in the first context of @stack. */
#define get_sgml_parser(stack) ((stack)->contexts[0]->data)

/* Yields the per-state data that the first context of @stack keeps for
 * @state. The stack argument is parenthesized so that any pointer expression
 * (e.g. `&parser->stack`) expands correctly. */
#define get_sgml_parser_state(stack, state) \
	get_dom_stack_state_data((stack)->contexts[0], state)
|
|
|
|
|
|
/* Functions for adding new nodes to the DOM tree: */
|
|
|
|
/* They wrap init_dom_node() and add_dom_*() and set up of additional
|
|
* information like node subtypes and SGML parser state information. */
|
|
|
|
static inline struct dom_node *
|
|
add_sgml_document(struct dom_stack *stack, struct dom_string *string)
|
|
{
|
|
struct dom_node *node = init_dom_node(DOM_NODE_DOCUMENT, string);
|
|
|
|
return node ? push_dom_node(stack, node) : NULL;
|
|
}
|
|
|
|
static inline struct dom_node *
|
|
add_sgml_element(struct dom_stack *stack, struct dom_scanner_token *token)
|
|
{
|
|
struct sgml_parser *parser = get_sgml_parser(stack);
|
|
struct dom_node *parent = get_dom_stack_top(stack)->node;
|
|
struct dom_stack_state *state;
|
|
struct sgml_parser_state *pstate;
|
|
struct dom_node *node;
|
|
struct sgml_node_info *node_info;
|
|
|
|
node = add_dom_element(parent, &token->string);
|
|
if (!node) return NULL;
|
|
|
|
node_info = get_sgml_node_info(parser->info->elements, node);
|
|
node->data.element.type = node_info->type;
|
|
|
|
if (!push_dom_node(stack, node))
|
|
return NULL;
|
|
|
|
state = get_dom_stack_top(stack);
|
|
assert(node == state->node);
|
|
|
|
pstate = get_sgml_parser_state(stack, state);
|
|
pstate->info = node_info;
|
|
|
|
return node;
|
|
}
|
|
|
|
|
|
static inline void
|
|
add_sgml_attribute(struct dom_stack *stack,
|
|
struct dom_scanner_token *token, struct dom_scanner_token *valtoken)
|
|
{
|
|
struct sgml_parser *parser = get_sgml_parser(stack);
|
|
struct dom_node *parent = get_dom_stack_top(stack)->node;
|
|
struct dom_string *value = valtoken ? &valtoken->string : NULL;
|
|
struct sgml_node_info *info;
|
|
struct dom_node *node;
|
|
|
|
node = add_dom_attribute(parent, &token->string, value);
|
|
|
|
info = get_sgml_node_info(parser->info->attributes, node);
|
|
|
|
node->data.attribute.type = info->type;
|
|
node->data.attribute.id = !!(info->flags & SGML_ATTRIBUTE_IDENTIFIER);
|
|
node->data.attribute.reference = !!(info->flags & SGML_ATTRIBUTE_REFERENCE);
|
|
|
|
if (valtoken && valtoken->type == SGML_TOKEN_STRING)
|
|
node->data.attribute.quoted = 1;
|
|
|
|
if (!node || !push_dom_node(stack, node))
|
|
return;
|
|
|
|
pop_dom_node(stack);
|
|
}
|
|
|
|
static inline struct dom_node *
|
|
add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target,
|
|
struct dom_scanner_token *data)
|
|
{
|
|
struct dom_node *parent = get_dom_stack_top(stack)->node;
|
|
struct dom_string *data_str = data ? &data->string : NULL;
|
|
struct dom_node *node;
|
|
|
|
node = add_dom_proc_instruction(parent, &target->string, data_str);
|
|
if (!node) return NULL;
|
|
|
|
switch (target->type) {
|
|
case SGML_TOKEN_PROCESS_XML:
|
|
node->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML;
|
|
break;
|
|
|
|
case SGML_TOKEN_PROCESS:
|
|
default:
|
|
node->data.proc_instruction.type = DOM_PROC_INSTRUCTION;
|
|
}
|
|
|
|
return push_dom_node(stack, node);
|
|
}
|
|
|
|
static inline void
|
|
add_sgml_node(struct dom_stack *stack, enum dom_node_type type, struct dom_scanner_token *token)
|
|
{
|
|
struct dom_node *parent = get_dom_stack_top(stack)->node;
|
|
struct dom_node *node = add_dom_node(parent, type, &token->string);
|
|
|
|
if (!node) return;
|
|
|
|
if (token->type == SGML_TOKEN_SPACE)
|
|
node->data.text.only_space = 1;
|
|
|
|
if (push_dom_node(stack, node))
|
|
pop_dom_node(stack);
|
|
}
|
|
|
|
|
|
/* SGML parser main handling: */
|
|
|
|
static inline void
|
|
parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
|
|
{
|
|
struct dom_scanner_token name;
|
|
|
|
assert(dom_scanner_has_tokens(scanner)
|
|
&& (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN
|
|
|| (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION)));
|
|
|
|
if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN)
|
|
skip_dom_scanner_token(scanner);
|
|
|
|
while (dom_scanner_has_tokens(scanner)) {
|
|
struct dom_scanner_token *token = get_dom_scanner_token(scanner);
|
|
|
|
assert(token);
|
|
|
|
switch (token->type) {
|
|
case SGML_TOKEN_TAG_END:
|
|
skip_dom_scanner_token(scanner);
|
|
/* and return */
|
|
case SGML_TOKEN_ELEMENT:
|
|
case SGML_TOKEN_ELEMENT_BEGIN:
|
|
case SGML_TOKEN_ELEMENT_END:
|
|
case SGML_TOKEN_ELEMENT_EMPTY_END:
|
|
return;
|
|
|
|
case SGML_TOKEN_IDENT:
|
|
copy_struct(&name, token);
|
|
|
|
/* Skip the attribute name token */
|
|
token = get_next_dom_scanner_token(scanner);
|
|
if (token && token->type == '=') {
|
|
/* If the token is not a valid value token
|
|
* ignore it. */
|
|
token = get_next_dom_scanner_token(scanner);
|
|
if (token
|
|
&& token->type != SGML_TOKEN_IDENT
|
|
&& token->type != SGML_TOKEN_ATTRIBUTE
|
|
&& token->type != SGML_TOKEN_STRING)
|
|
token = NULL;
|
|
} else {
|
|
token = NULL;
|
|
}
|
|
|
|
add_sgml_attribute(stack, &name, token);
|
|
|
|
/* Skip the value token */
|
|
if (token)
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
|
|
default:
|
|
skip_dom_scanner_token(scanner);
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
|
|
{
|
|
struct dom_scanner_token target;
|
|
|
|
while (dom_scanner_has_tokens(scanner)) {
|
|
struct dom_scanner_token *token = get_dom_scanner_token(scanner);
|
|
|
|
switch (token->type) {
|
|
case SGML_TOKEN_ELEMENT:
|
|
case SGML_TOKEN_ELEMENT_BEGIN:
|
|
if (!add_sgml_element(stack, token)) {
|
|
if (token->type == SGML_TOKEN_ELEMENT) {
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
}
|
|
|
|
skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END);
|
|
break;
|
|
}
|
|
|
|
if (token->type == SGML_TOKEN_ELEMENT_BEGIN) {
|
|
parse_sgml_attributes(stack, scanner);
|
|
} else {
|
|
skip_dom_scanner_token(scanner);
|
|
}
|
|
|
|
break;
|
|
|
|
case SGML_TOKEN_ELEMENT_EMPTY_END:
|
|
pop_dom_node(stack);
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
|
|
case SGML_TOKEN_ELEMENT_END:
|
|
if (!token->string.length) {
|
|
pop_dom_node(stack);
|
|
} else {
|
|
struct dom_string string;
|
|
struct dom_stack_state *state;
|
|
|
|
set_dom_string(&string, token->string.string, token->string.length);
|
|
state = search_dom_stack(stack, DOM_NODE_ELEMENT,
|
|
&string);
|
|
if (state) {
|
|
struct sgml_parser_state *pstate;
|
|
|
|
pstate = get_sgml_parser_state(stack, state);
|
|
copy_struct(&pstate->end_token, token);
|
|
|
|
pop_dom_state(stack, state);
|
|
}
|
|
}
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
|
|
case SGML_TOKEN_NOTATION_COMMENT:
|
|
add_sgml_node(stack, DOM_NODE_COMMENT, token);
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
|
|
case SGML_TOKEN_NOTATION_ATTLIST:
|
|
case SGML_TOKEN_NOTATION_DOCTYPE:
|
|
case SGML_TOKEN_NOTATION_ELEMENT:
|
|
case SGML_TOKEN_NOTATION_ENTITY:
|
|
case SGML_TOKEN_NOTATION:
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
|
|
case SGML_TOKEN_CDATA_SECTION:
|
|
add_sgml_node(stack, DOM_NODE_CDATA_SECTION, token);
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
|
|
case SGML_TOKEN_PROCESS_XML_STYLESHEET:
|
|
case SGML_TOKEN_PROCESS_XML:
|
|
case SGML_TOKEN_PROCESS:
|
|
copy_struct(&target, token);
|
|
|
|
/* Skip the target token */
|
|
token = get_next_dom_scanner_token(scanner);
|
|
if (!token) break;
|
|
|
|
assert(token->type == SGML_TOKEN_PROCESS_DATA);
|
|
|
|
if (add_sgml_proc_instruction(stack, &target, token)
|
|
&& (target.type == SGML_TOKEN_PROCESS_XML
|
|
|| target.type == SGML_TOKEN_PROCESS_XML_STYLESHEET)
|
|
&& token->string.length > 0) {
|
|
/* Parse the <?xml data="attributes"?>. */
|
|
struct dom_scanner attr_scanner;
|
|
|
|
init_dom_scanner(&attr_scanner, &sgml_scanner_info,
|
|
&token->string, SGML_STATE_ELEMENT,
|
|
scanner->count_lines);
|
|
|
|
if (dom_scanner_has_tokens(&attr_scanner))
|
|
parse_sgml_attributes(stack, &attr_scanner);
|
|
}
|
|
|
|
pop_dom_node(stack);
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
|
|
case SGML_TOKEN_ENTITY:
|
|
add_sgml_node(stack, DOM_NODE_ENTITY_REFERENCE, token);
|
|
skip_dom_scanner_token(scanner);
|
|
break;
|
|
|
|
case SGML_TOKEN_SPACE:
|
|
case SGML_TOKEN_TEXT:
|
|
default:
|
|
add_sgml_node(stack, DOM_NODE_TEXT, token);
|
|
skip_dom_scanner_token(scanner);
|
|
}
|
|
}
|
|
}
|
|
|
|
struct dom_node *
|
|
parse_sgml(struct sgml_parser *parser, struct dom_string *buffer)
|
|
{
|
|
struct sgml_parsing_state *parsing;
|
|
|
|
if (!parser->root) {
|
|
parser->root = add_sgml_document(&parser->stack, &parser->uri);
|
|
if (!parser->root)
|
|
return NULL;
|
|
get_dom_stack_top(&parser->stack)->immutable = 1;
|
|
}
|
|
|
|
parsing = init_sgml_parsing_state(parser, buffer);
|
|
if (!parsing) return NULL;
|
|
|
|
/* FIXME: Make parse_sgml_plain() return something (error code or if
|
|
* can be guarenteed a root node). */
|
|
parse_sgml_plain(&parser->stack, &parsing->scanner);
|
|
|
|
pop_dom_node(&parser->parsing);
|
|
|
|
return parser->root;
|
|
}
|
|
|
|
|
|
/* Parsing state management: */
|
|
|
|
/* The SGML parser can handle nested calls to parse_sgml(). This can be used to
|
|
* handle output of external processing of data in the document tree. For
|
|
* example this allows output of the document.write() from DOM scripting
|
|
* interface to be parsed. */
|
|
|
|
static void
|
|
sgml_parsing_push(struct dom_stack *stack, struct dom_node *node, void *data)
|
|
{
|
|
struct sgml_parser *parser = get_sgml_parser(stack);
|
|
struct sgml_parsing_state *parsing = data;
|
|
int count_lines = !!(parser->flags & SGML_PARSER_COUNT_LINES);
|
|
|
|
parsing->depth = parser->stack.depth;
|
|
get_dom_stack_top(&parser->stack)->immutable = 1;
|
|
init_dom_scanner(&parsing->scanner, &sgml_scanner_info, &node->string,
|
|
SGML_STATE_TEXT, count_lines);
|
|
}
|
|
|
|
static void
|
|
sgml_parsing_pop(struct dom_stack *stack, struct dom_node *node, void *data)
|
|
{
|
|
struct sgml_parser *parser = get_sgml_parser(stack);
|
|
struct sgml_parsing_state *parsing = data;
|
|
|
|
/* Pop the stack back to the state it was in. This includes cleaning
|
|
* away even immutable states left on the stack. */
|
|
while (parsing->depth < parser->stack.depth) {
|
|
get_dom_stack_top(&parser->stack)->immutable = 0;
|
|
pop_dom_node(&parser->stack);
|
|
}
|
|
|
|
assert(parsing->depth == parser->stack.depth);
|
|
}
|
|
|
|
static struct dom_stack_context_info sgml_parsing_context_info = {
|
|
/* Object size: */ sizeof(struct sgml_parsing_state),
|
|
/* Push: */
|
|
{
|
|
/* */ NULL,
|
|
/* DOM_NODE_ELEMENT */ NULL,
|
|
/* DOM_NODE_ATTRIBUTE */ NULL,
|
|
/* DOM_NODE_TEXT */ sgml_parsing_push,
|
|
/* DOM_NODE_CDATA_SECTION */ NULL,
|
|
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
|
|
/* DOM_NODE_ENTITY */ NULL,
|
|
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
|
|
/* DOM_NODE_COMMENT */ NULL,
|
|
/* DOM_NODE_DOCUMENT */ NULL,
|
|
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
|
|
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
|
|
/* DOM_NODE_NOTATION */ NULL,
|
|
},
|
|
/* Pop: */
|
|
{
|
|
/* */ NULL,
|
|
/* DOM_NODE_ELEMENT */ NULL,
|
|
/* DOM_NODE_ATTRIBUTE */ NULL,
|
|
/* DOM_NODE_TEXT */ sgml_parsing_pop,
|
|
/* DOM_NODE_CDATA_SECTION */ NULL,
|
|
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
|
|
/* DOM_NODE_ENTITY */ NULL,
|
|
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
|
|
/* DOM_NODE_COMMENT */ NULL,
|
|
/* DOM_NODE_DOCUMENT */ NULL,
|
|
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
|
|
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
|
|
/* DOM_NODE_NOTATION */ NULL,
|
|
}
|
|
};
|
|
|
|
/* Create a new parsing state by pushing a new text node containing the*/
|
|
static struct sgml_parsing_state *
|
|
init_sgml_parsing_state(struct sgml_parser *parser, struct dom_string *buffer)
|
|
{
|
|
struct dom_stack_state *state;
|
|
struct dom_node *node;
|
|
|
|
node = init_dom_node(DOM_NODE_TEXT, buffer);
|
|
if (!node || !push_dom_node(&parser->parsing, node))
|
|
return NULL;
|
|
|
|
state = get_dom_stack_top(&parser->parsing);
|
|
|
|
return get_dom_stack_state_data(parser->parsing.contexts[0], state);
|
|
}
|
|
|
|
|
|
/* Parser creation and destruction: */
|
|
|
|
/* FIXME: For now the main SGML parser context doesn't do much other than
|
|
* declaring the sgml_parser_state object. */
|
|
static struct dom_stack_context_info sgml_parser_context_info = {
|
|
/* Object size: */ sizeof(struct sgml_parser_state),
|
|
/* Push: */
|
|
{
|
|
/* */ NULL,
|
|
/* DOM_NODE_ELEMENT */ NULL,
|
|
/* DOM_NODE_ATTRIBUTE */ NULL,
|
|
/* DOM_NODE_TEXT */ NULL,
|
|
/* DOM_NODE_CDATA_SECTION */ NULL,
|
|
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
|
|
/* DOM_NODE_ENTITY */ NULL,
|
|
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
|
|
/* DOM_NODE_COMMENT */ NULL,
|
|
/* DOM_NODE_DOCUMENT */ NULL,
|
|
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
|
|
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
|
|
/* DOM_NODE_NOTATION */ NULL,
|
|
},
|
|
/* Pop: */
|
|
{
|
|
/* */ NULL,
|
|
/* DOM_NODE_ELEMENT */ NULL,
|
|
/* DOM_NODE_ATTRIBUTE */ NULL,
|
|
/* DOM_NODE_TEXT */ NULL,
|
|
/* DOM_NODE_CDATA_SECTION */ NULL,
|
|
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
|
|
/* DOM_NODE_ENTITY */ NULL,
|
|
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
|
|
/* DOM_NODE_COMMENT */ NULL,
|
|
/* DOM_NODE_DOCUMENT */ NULL,
|
|
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
|
|
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
|
|
/* DOM_NODE_NOTATION */ NULL,
|
|
}
|
|
};
|
|
|
|
struct sgml_parser *
|
|
init_sgml_parser(enum sgml_parser_type type, enum sgml_document_type doctype,
|
|
struct dom_string *uri, enum sgml_parser_flag flags)
|
|
{
|
|
struct sgml_parser *parser;
|
|
enum dom_stack_flag stack_flags = 0;
|
|
|
|
parser = mem_calloc(1, sizeof(*parser));
|
|
if (!parser) return NULL;
|
|
|
|
if (!init_dom_string(&parser->uri, uri->string, uri->length)) {
|
|
mem_free(parser);
|
|
return NULL;
|
|
}
|
|
|
|
parser->type = type;
|
|
parser->flags = type;
|
|
parser->info = get_sgml_info(doctype);
|
|
|
|
if (type == SGML_PARSER_TREE)
|
|
stack_flags |= DOM_STACK_KEEP_NODES;
|
|
|
|
init_dom_stack(&parser->stack, stack_flags);
|
|
/* FIXME: Some sgml backend specific callbacks? Handle HTML script tags,
|
|
* and feed document.write() data back to the parser. */
|
|
add_dom_stack_context(&parser->stack, parser, &sgml_parser_context_info);
|
|
|
|
/* Don't keep the 'fake' text nodes that holds the parsing data. */
|
|
init_dom_stack(&parser->parsing, 0);
|
|
add_dom_stack_context(&parser->parsing, parser, &sgml_parsing_context_info);
|
|
|
|
return parser;
|
|
}
|
|
|
|
void
|
|
done_sgml_parser(struct sgml_parser *parser)
|
|
{
|
|
done_dom_stack(&parser->stack);
|
|
done_dom_stack(&parser->parsing);
|
|
done_dom_string(&parser->uri);
|
|
mem_free(parser);
|
|
}
|