From 6c85c0f00959424919e6cad10fa3ba6627fed2dd Mon Sep 17 00:00:00 2001 From: Jonas Fonseca Date: Mon, 16 Jan 2006 05:12:34 +0100 Subject: [PATCH] Add DOM configuration inspired module It add support for normalizing a DOM document in various ways, such as removing comments, converting CDATA section nodes to text nodes, cleanup whitespace, etc. Use it in the RSS renderer to sanitize the text to be rendered. --- src/document/dom/renderer.c | 37 +---- src/dom/Makefile | 2 +- src/dom/configuration.c | 300 ++++++++++++++++++++++++++++++++++++ src/dom/configuration.h | 90 +++++++++++ 4 files changed, 399 insertions(+), 30 deletions(-) create mode 100644 src/dom/configuration.c create mode 100644 src/dom/configuration.h diff --git a/src/document/dom/renderer.c b/src/document/dom/renderer.c index 18f44641..10e034e9 100644 --- a/src/document/dom/renderer.c +++ b/src/document/dom/renderer.c @@ -22,6 +22,7 @@ #include "document/document.h" #include "document/dom/renderer.h" #include "document/renderer.h" +#include "dom/configuration.h" #include "dom/scanner.h" #include "dom/sgml/parser.h" #include "dom/sgml/rss/rss.h" @@ -703,6 +704,9 @@ static struct dom_stack_context_info dom_source_renderer_context_info = { /* DOM RSS Renderer */ +#define RSS_CONFIG_FLAGS \ + (DOM_CONFIG_NORMALIZE_WHITESPACE | DOM_CONFIG_NORMALIZE_CHARACTERS) + enum dom_stack_code dom_rss_push_element(struct dom_stack *stack, struct dom_node *node, void *data) { @@ -792,32 +796,6 @@ dom_rss_pop_element(struct dom_stack *stack, struct dom_node *node, void *data) return DOM_STACK_CODE_OK; } -enum dom_stack_code -dom_rss_push_content(struct dom_stack *stack, struct dom_node *node, void *data) -{ - struct dom_renderer *renderer = stack->current->data; - unsigned char *string = node->string.string; - int length = node->string.length; - - assert(node && renderer && renderer->document); - - if (!renderer->node) - return DOM_STACK_CODE_OK; - - if (node->type == DOM_NODE_ENTITY_REFERENCE) { - string -= 1; - length 
+= 2; - } - - if (!is_dom_string_set(&renderer->text)) { - init_dom_string(&renderer->text, string, length); - } else { - add_to_dom_string(&renderer->text, string, length); - } - - return DOM_STACK_CODE_OK; -} - static struct dom_string * get_rss_node_text(struct dom_node *node) { @@ -955,9 +933,9 @@ static struct dom_stack_context_info dom_rss_renderer_context_info = { /* */ NULL, /* DOM_NODE_ELEMENT */ dom_rss_push_element, /* DOM_NODE_ATTRIBUTE */ NULL, - /* DOM_NODE_TEXT */ dom_rss_push_content, - /* DOM_NODE_CDATA_SECTION */ dom_rss_push_content, - /* DOM_NODE_ENTITY_REFERENCE */ dom_rss_push_content, + /* DOM_NODE_TEXT */ NULL, + /* DOM_NODE_CDATA_SECTION */ NULL, + /* DOM_NODE_ENTITY_REFERENCE */ NULL, /* DOM_NODE_ENTITY */ NULL, /* DOM_NODE_PROC_INSTRUCTION */ NULL, /* DOM_NODE_COMMENT */ NULL, @@ -1046,6 +1024,7 @@ render_dom_document(struct cache_entry *cached, struct document *document, } else if (doctype == SGML_DOCTYPE_RSS) { add_dom_stack_context(&parser->stack, &renderer, &dom_rss_renderer_context_info); + add_dom_config_normalizer(&parser->stack, RSS_CONFIG_FLAGS); } /* FIXME: When rendering this way we don't really care about the code. diff --git a/src/dom/Makefile b/src/dom/Makefile index 35a6b392..cdff0bab 100644 --- a/src/dom/Makefile +++ b/src/dom/Makefile @@ -2,7 +2,7 @@ top_builddir=../.. 
include $(top_builddir)/Makefile.config SUBDIRS = css sgml -OBJS = node.o select.o stack.o scanner.o +OBJS = configuration.o node.o select.o stack.o scanner.o SUBDIRS-$(CONFIG_DEBUG) += test diff --git a/src/dom/configuration.c b/src/dom/configuration.c new file mode 100644 index 00000000..f43690b8 --- /dev/null +++ b/src/dom/configuration.c @@ -0,0 +1,300 @@ +/* DOM Configuration */ + +#include "elinks.h" + +#include "dom/configuration.h" +#include "dom/node.h" +#include "dom/stack.h" +#include "dom/string.h" + + +static enum dom_stack_code +normalize_text_node_whitespace(struct dom_node *node) +{ + unsigned char buf[256]; + struct dom_string string = INIT_DOM_STRING(NULL, 0); + int count = 0, i = 0; + unsigned char *text = node->string.string; + + assert(node->type == DOM_NODE_TEXT); + + while (i < node->string.length) { + int j; + + for (j = 0; j < sizeof(buf) && i < node->string.length; i++) { + unsigned char data = text[i]; + + if (isspace(data)) { + if (count == 1) + continue; + + data = ' '; + count = 1; + + } else { + count = 0; + } + + buf[j++] = data; + } + + if (!add_to_dom_string(&string, buf, j)) { + done_dom_string(&string); + return DOM_STACK_CODE_ERROR_MEM_ALLOC; + } + } + + if (node->data.text.allocated) + done_dom_string(&node->string); + + set_dom_string(&node->string, string.string, string.length); + node->data.text.allocated = 1; + + return DOM_STACK_CODE_OK; + +} + +static enum dom_stack_code +append_node_text(struct dom_config *config, struct dom_node *node) +{ + struct dom_node *prev = get_dom_node_prev(node); + size_t length; + struct dom_string dest; + struct dom_string src; + int error = 0; + + copy_struct(&src, &node->string); + + if (!prev || prev->type != DOM_NODE_TEXT) { + /* Preserve text nodes with no one to append to. 
*/ + if (node->type == DOM_NODE_TEXT) + return DOM_STACK_CODE_OK; + + prev = NULL; + set_dom_string(&dest, NULL, 0); + + } else { + if (prev->data.text.allocated) { + copy_struct(&dest, &prev->string); + } else { + set_dom_string(&dest, NULL, 0); + if (!add_to_dom_string(&dest, prev->string.string, prev->string.length)) + return DOM_STACK_CODE_ERROR_MEM_ALLOC; + set_dom_string(&prev->string, dest.string, dest.length); + prev->data.text.allocated = 1; + } + } + + length = dest.length; + + switch (node->type) { + case DOM_NODE_CDATA_SECTION: + case DOM_NODE_TEXT: + if (!add_to_dom_string(&dest, src.string, src.length)) + error = 1; + break; + + case DOM_NODE_ENTITY_REFERENCE: + /* FIXME: Until we will have uniform encoding at this point + * (UTF-8) we just add the entity reference unexpanded assuming + * that convert_string() will eventually do the work of + * expanding it. */ + if (!add_to_dom_string(&dest, "&", 1) + || !add_to_dom_string(&dest, src.string, src.length) + || !add_to_dom_string(&dest, ";", 1)) { + error = 1; + } + break; + + default: + INTERNAL("Cannot append from node %d", node->type); + } + + if (error) { + if (prev) + prev->string.length = length; + else + done_dom_string(&dest); + return DOM_STACK_CODE_ERROR_MEM_ALLOC; + } + + if (prev) { + copy_struct(&prev->string, &dest); + + if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) + && node->type != DOM_NODE_ENTITY_REFERENCE) { + /* XXX: Ignore errors since we want to always + * free the appended node at this point. */ + normalize_text_node_whitespace(prev); + } + + return DOM_STACK_CODE_FREE_NODE; + + } else { + int was_cdata_section = node->type == DOM_NODE_CDATA_SECTION; + + node->type = DOM_NODE_TEXT; + memset(&node->data, 0, sizeof(node->data)); + node->data.text.allocated = 1; + copy_struct(&node->string, &dest); + + if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) + && was_cdata_section) { + /* XXX: Ignore errors since we want to always ok the + * append. 
*/ + normalize_text_node_whitespace(node); + } + + return DOM_STACK_CODE_OK; + } +} + +static enum dom_stack_code +dom_normalize_node_end(struct dom_stack *stack, struct dom_node *node, void *data) +{ + struct dom_config *config = stack->current->data; + enum dom_stack_code code = DOM_STACK_CODE_OK; + + switch (node->type) { + case DOM_NODE_ELEMENT: + if ((config->flags & DOM_CONFIG_UNKNOWN) + && !node->data.element.type) { + /* Drop elements that are not known from the built-in + * node info. */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_ATTRIBUTE: + if ((config->flags & DOM_CONFIG_UNKNOWN) + && !node->data.attribute.type) { + /* Drop elements that are not known from the built-in + * node info. */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_PROCESSING_INSTRUCTION: + if ((config->flags & DOM_CONFIG_UNKNOWN) + && !node->data.proc_instruction.type) { + /* Drop elements that are not known from the built-in + * node info. */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_TEXT: + if (config->flags & DOM_CONFIG_NORMALIZE_CHARACTERS) { + code = append_node_text(config, node); + + } else if (!(config->flags & DOM_CONFIG_ELEMENT_CONTENT_WHITESPACE) + && node->data.text.only_space) { + /* Discard all Text nodes that contain + * whitespaces in element content]. */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_COMMENT: + if (!(config->flags & DOM_CONFIG_COMMENTS)) { + /* Discard all comments. */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_CDATA_SECTION: + if (!(config->flags & DOM_CONFIG_CDATA_SECTIONS)) { + /* Transform CDATASection nodes into Text nodes. The new Text + * node is then combined with any adjacent Text node. 
*/ + code = append_node_text(config, node); + } + break; + + case DOM_NODE_ENTITY_REFERENCE: + if (!(config->flags & DOM_CONFIG_ENTITIES)) { + /* Remove all EntityReference nodes from the document, + * putting the entity expansions directly in their place. Text + * nodes are normalized. Only unexpanded entity references are + * kept in the document. */ + code = append_node_text(config, node); + } + break; + + case DOM_NODE_DOCUMENT: + mem_free(config); + break; + + default: + break; + } + + return code; +} + +enum dom_stack_code +dom_normalize_text(struct dom_stack *stack, struct dom_node *node, void *data) +{ + struct dom_config *config = stack->current->data; + + if (config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) { + /* Normalize whitespace in the text. */ + return normalize_text_node_whitespace(node); + } + + return DOM_STACK_CODE_OK; +} + + +static struct dom_stack_context_info dom_config_normalizer_context = { + /* Object size: */ 0, + /* Push: */ + { + /* */ NULL, + /* DOM_NODE_ELEMENT */ NULL, + /* DOM_NODE_ATTRIBUTE */ NULL, + /* DOM_NODE_TEXT */ dom_normalize_text, + /* DOM_NODE_CDATA_SECTION */ NULL, + /* DOM_NODE_ENTITY_REFERENCE */ NULL, + /* DOM_NODE_ENTITY */ NULL, + /* DOM_NODE_PROC_INSTRUCTION */ NULL, + /* DOM_NODE_COMMENT */ NULL, + /* DOM_NODE_DOCUMENT */ NULL, + /* DOM_NODE_DOCUMENT_TYPE */ NULL, + /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL, + /* DOM_NODE_NOTATION */ NULL, + }, + /* Pop: */ + { + /* */ NULL, + /* DOM_NODE_ELEMENT */ dom_normalize_node_end, + /* DOM_NODE_ATTRIBUTE */ dom_normalize_node_end, + /* DOM_NODE_TEXT */ dom_normalize_node_end, + /* DOM_NODE_CDATA_SECTION */ dom_normalize_node_end, + /* DOM_NODE_ENTITY_REFERENCE */ dom_normalize_node_end, + /* DOM_NODE_ENTITY */ dom_normalize_node_end, + /* DOM_NODE_PROC_INSTRUCTION */ dom_normalize_node_end, + /* DOM_NODE_COMMENT */ dom_normalize_node_end, + /* DOM_NODE_DOCUMENT */ dom_normalize_node_end, + /* DOM_NODE_DOCUMENT_TYPE */ dom_normalize_node_end, + /* DOM_NODE_DOCUMENT_FRAGMENT 
#ifndef EL__DOM_CONFIGURATION_H
#define EL__DOM_CONFIGURATION_H

struct dom_node;
struct dom_stack;

/* API Doc :: dom-config */

/** DOM Configuration
 *
 * The DOMConfiguration interface represents the configuration of a document.
 * Using the configuration, it is possible to change the behaviour of how
 * document normalization is done, such as replacing the CDATASection nodes
 * with Text nodes.
 *
 * Note: Parameters are similar to features and properties used in SAX2 [SAX].
 *
 * Each parameter is a separate bit, so several can be OR-ed into one mask.
 * add_dom_config_normalizer() stores the mask it is given unchanged; the
 * "default" wording below describes the DOM parameter and is not a value
 * filled in by the normalizer.
 *
 * The following parameters are defined: */

enum dom_config_flag {
	/** "cdata-sections"
	 *
	 * When set, CDATASection nodes are kept in the document. When unset,
	 * a CDATASection node is transformed into a Text node; if it is
	 * directly preceded by a Text node its text is appended to that node
	 * instead. */
	DOM_CONFIG_CDATA_SECTIONS = 1,

	/** "comments"
	 *
	 * When set, Comment nodes are kept in the document; when unset they
	 * are discarded. */
	DOM_CONFIG_COMMENTS = 2,

	/** "element-content-whitespace"
	 *
	 * When set, all whitespace is kept in the document. When unset, Text
	 * nodes flagged as containing only whitespace are discarded. The
	 * normalizer only checks this when DOM_CONFIG_NORMALIZE_CHARACTERS is
	 * not set. */
	DOM_CONFIG_ELEMENT_CONTENT_WHITESPACE = 4,

	/** "entities"
	 *
	 * When set, EntityReference nodes are kept in the document. When
	 * unset, an EntityReference node is replaced by text, merged into a
	 * directly preceding Text node when there is one. The normalizer
	 * currently inserts the reference unexpanded, as "&name;", rather
	 * than its expansion. Note: This parameter does not affect Entity
	 * nodes. */
	DOM_CONFIG_ENTITIES = 8,

	/** "normalize-characters"
	 *
	 * In DOM terms: when set, fully normalize the characters in the
	 * document as defined in appendix B of [XML 1.1].
	 *
	 * In the normalizer this flag instead makes each Text node get
	 * appended to a directly preceding Text node when it is popped; no
	 * character normalization is performed there. */
	DOM_CONFIG_NORMALIZE_CHARACTERS = 16,

	/** "unknown"
	 *
	 * When unset nothing is done. When set, elements, attributes and
	 * processing instructions that are not known according to the
	 * built-in node info (their type is zero) are discarded. */
	DOM_CONFIG_UNKNOWN = 32,

	/** "normalize-whitespace"
	 *
	 * When unset nothing is done. When set, every run of whitespace in a
	 * Text node is collapsed into a single space. This is applied when a
	 * Text node is pushed and again after Text or CDATASection content
	 * has been merged into a Text node. */
	DOM_CONFIG_NORMALIZE_WHITESPACE = 64,
};

struct dom_error;

/* Configuration attached to a normalizer stack context. */
struct dom_config {
	enum dom_config_flag flags; /*: Mask of DOM configuration flags. */

	/** FIXME: "error-handler"
	 *
	 * Contains an error handler. If an error is encountered in the
	 * document, this handler is called. When called, DOMError.relatedData
	 * will contain the closest node to where the error occurred. If the
	 * implementation is unable to determine the node where the error
	 * occurs, DOMError.relatedData will contain the Document node.
	 *
	 * NOTE(review): nothing in configuration.c appears to set or call
	 * this yet - confirm before relying on it.
	 */
	void (*error_handler)(struct dom_config *, struct dom_error *);
};

/* Allocate a configuration holding @flags and add a normalizer context using
 * it to @stack.
 *
 * Returns the new configuration, or NULL when allocation or adding the stack
 * context fails (nothing is leaked in that case). The normalizer frees the
 * configuration itself when the Document node is popped, so the caller must
 * not free the returned pointer after that. */
struct dom_config *
add_dom_config_normalizer(struct dom_stack *stack, enum dom_config_flag flags);

#endif