diff --git a/src/document/dom/renderer.c b/src/document/dom/renderer.c index 18f44641..10e034e9 100644 --- a/src/document/dom/renderer.c +++ b/src/document/dom/renderer.c @@ -22,6 +22,7 @@ #include "document/document.h" #include "document/dom/renderer.h" #include "document/renderer.h" +#include "dom/configuration.h" #include "dom/scanner.h" #include "dom/sgml/parser.h" #include "dom/sgml/rss/rss.h" @@ -703,6 +704,9 @@ static struct dom_stack_context_info dom_source_renderer_context_info = { /* DOM RSS Renderer */ +#define RSS_CONFIG_FLAGS \ + (DOM_CONFIG_NORMALIZE_WHITESPACE | DOM_CONFIG_NORMALIZE_CHARACTERS) + enum dom_stack_code dom_rss_push_element(struct dom_stack *stack, struct dom_node *node, void *data) { @@ -792,32 +796,6 @@ dom_rss_pop_element(struct dom_stack *stack, struct dom_node *node, void *data) return DOM_STACK_CODE_OK; } -enum dom_stack_code -dom_rss_push_content(struct dom_stack *stack, struct dom_node *node, void *data) -{ - struct dom_renderer *renderer = stack->current->data; - unsigned char *string = node->string.string; - int length = node->string.length; - - assert(node && renderer && renderer->document); - - if (!renderer->node) - return DOM_STACK_CODE_OK; - - if (node->type == DOM_NODE_ENTITY_REFERENCE) { - string -= 1; - length += 2; - } - - if (!is_dom_string_set(&renderer->text)) { - init_dom_string(&renderer->text, string, length); - } else { - add_to_dom_string(&renderer->text, string, length); - } - - return DOM_STACK_CODE_OK; -} - static struct dom_string * get_rss_node_text(struct dom_node *node) { @@ -955,9 +933,9 @@ static struct dom_stack_context_info dom_rss_renderer_context_info = { /* */ NULL, /* DOM_NODE_ELEMENT */ dom_rss_push_element, /* DOM_NODE_ATTRIBUTE */ NULL, - /* DOM_NODE_TEXT */ dom_rss_push_content, - /* DOM_NODE_CDATA_SECTION */ dom_rss_push_content, - /* DOM_NODE_ENTITY_REFERENCE */ dom_rss_push_content, + /* DOM_NODE_TEXT */ NULL, + /* DOM_NODE_CDATA_SECTION */ NULL, + /* DOM_NODE_ENTITY_REFERENCE */ NULL, /*
DOM_NODE_ENTITY */ NULL, /* DOM_NODE_PROC_INSTRUCTION */ NULL, /* DOM_NODE_COMMENT */ NULL, @@ -1046,6 +1024,7 @@ render_dom_document(struct cache_entry *cached, struct document *document, } else if (doctype == SGML_DOCTYPE_RSS) { add_dom_stack_context(&parser->stack, &renderer, &dom_rss_renderer_context_info); + add_dom_config_normalizer(&parser->stack, RSS_CONFIG_FLAGS); } /* FIXME: When rendering this way we don't really care about the code. diff --git a/src/dom/Makefile b/src/dom/Makefile index 35a6b392..cdff0bab 100644 --- a/src/dom/Makefile +++ b/src/dom/Makefile @@ -2,7 +2,7 @@ top_builddir=../.. include $(top_builddir)/Makefile.config SUBDIRS = css sgml -OBJS = node.o select.o stack.o scanner.o +OBJS = configuration.o node.o select.o stack.o scanner.o SUBDIRS-$(CONFIG_DEBUG) += test diff --git a/src/dom/configuration.c b/src/dom/configuration.c new file mode 100644 index 00000000..f43690b8 --- /dev/null +++ b/src/dom/configuration.c @@ -0,0 +1,300 @@ +/* DOM Configuration: a DOM stack context that normalizes nodes according to dom_config flags */ + +#include "elinks.h" + +#include "dom/configuration.h" +#include "dom/node.h" +#include "dom/stack.h" +#include "dom/string.h" + + +static enum dom_stack_code +normalize_text_node_whitespace(struct dom_node *node) +{ + unsigned char buf[256]; /* chunk buffer; the new text is built up one chunk at a time */ + struct dom_string string = INIT_DOM_STRING(NULL, 0); + int count = 0, i = 0; + unsigned char *text = node->string.string; + + assert(node->type == DOM_NODE_TEXT); + + while (i < node->string.length) { + int j; + + for (j = 0; j < sizeof(buf) && i < node->string.length; i++) { + unsigned char data = text[i]; + + if (isspace(data)) { + if (count == 1) /* a space was already emitted for this whitespace run */ + continue; + + data = ' '; /* any whitespace character becomes a plain space */ + count = 1; + + } else { + count = 0; + } + + buf[j++] = data; + } + + if (!add_to_dom_string(&string, buf, j)) { + done_dom_string(&string); + return DOM_STACK_CODE_ERROR_MEM_ALLOC; + } + } + + if (node->data.text.allocated) /* dispose of the previous string when it is marked as allocated */ + done_dom_string(&node->string); + + set_dom_string(&node->string, string.string, string.length); + node->data.text.allocated = 1; + + return
DOM_STACK_CODE_OK; + +} + +static enum dom_stack_code +append_node_text(struct dom_config *config, struct dom_node *node) +{ + struct dom_node *prev = get_dom_node_prev(node); + size_t length; + struct dom_string dest; + struct dom_string src; + int error = 0; + + copy_struct(&src, &node->string); + + if (!prev || prev->type != DOM_NODE_TEXT) { + /* A Text node with no preceding Text node to append to is kept as is. */ + if (node->type == DOM_NODE_TEXT) + return DOM_STACK_CODE_OK; + + prev = NULL; + set_dom_string(&dest, NULL, 0); + + } else { + if (prev->data.text.allocated) { + copy_struct(&dest, &prev->string); + } else { + set_dom_string(&dest, NULL, 0); + if (!add_to_dom_string(&dest, prev->string.string, prev->string.length)) + return DOM_STACK_CODE_ERROR_MEM_ALLOC; + set_dom_string(&prev->string, dest.string, dest.length); + prev->data.text.allocated = 1; + } + } + + length = dest.length; + + switch (node->type) { + case DOM_NODE_CDATA_SECTION: + case DOM_NODE_TEXT: + if (!add_to_dom_string(&dest, src.string, src.length)) + error = 1; + break; + + case DOM_NODE_ENTITY_REFERENCE: + /* FIXME: Until we have a uniform encoding at this point + * (UTF-8) we just add the entity reference unexpanded assuming + * that convert_string() will eventually do the work of + * expanding it. */ + if (!add_to_dom_string(&dest, "&", 1) + || !add_to_dom_string(&dest, src.string, src.length) + || !add_to_dom_string(&dest, ";", 1)) { + error = 1; + } + break; + + default: + INTERNAL("Cannot append from node %d", node->type); + } + + if (error) { + if (prev) + prev->string.length = length; /* NOTE(review): only the length is restored; if an earlier add_to_dom_string() call moved dest.string, prev->string.string is not updated here -- confirm against add_to_dom_string() */ + else + done_dom_string(&dest); + return DOM_STACK_CODE_ERROR_MEM_ALLOC; + } + + if (prev) { + copy_struct(&prev->string, &dest); + + if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) + && node->type != DOM_NODE_ENTITY_REFERENCE) { + /* XXX: Ignore errors since we want to always + * free the appended node at this point.
*/ + normalize_text_node_whitespace(prev); + } + + return DOM_STACK_CODE_FREE_NODE; + + } else { + int was_cdata_section = node->type == DOM_NODE_CDATA_SECTION; + + node->type = DOM_NODE_TEXT; + memset(&node->data, 0, sizeof(node->data)); + node->data.text.allocated = 1; + copy_struct(&node->string, &dest); + + if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) + && was_cdata_section) { + /* XXX: Ignore errors since we always want to report the + * append as OK. */ + normalize_text_node_whitespace(node); + } + + return DOM_STACK_CODE_OK; + } +} + +static enum dom_stack_code +dom_normalize_node_end(struct dom_stack *stack, struct dom_node *node, void *data) +{ + struct dom_config *config = stack->current->data; + enum dom_stack_code code = DOM_STACK_CODE_OK; + + switch (node->type) { + case DOM_NODE_ELEMENT: + if ((config->flags & DOM_CONFIG_UNKNOWN) + && !node->data.element.type) { + /* Drop elements that are not known from the built-in + * node info. */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_ATTRIBUTE: + if ((config->flags & DOM_CONFIG_UNKNOWN) + && !node->data.attribute.type) { + /* Drop attributes that are not known from the built-in + * node info. */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_PROCESSING_INSTRUCTION: + if ((config->flags & DOM_CONFIG_UNKNOWN) + && !node->data.proc_instruction.type) { + /* Drop processing instructions that are not known from the built-in + * node info. */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_TEXT: + if (config->flags & DOM_CONFIG_NORMALIZE_CHARACTERS) { + code = append_node_text(config, node); + + } else if (!(config->flags & DOM_CONFIG_ELEMENT_CONTENT_WHITESPACE) + && node->data.text.only_space) { + /* Discard all Text nodes that contain + * only whitespace (only_space is set). */ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_COMMENT: + if (!(config->flags & DOM_CONFIG_COMMENTS)) { + /* Discard all comments.
*/ + code = DOM_STACK_CODE_FREE_NODE; + } + break; + + case DOM_NODE_CDATA_SECTION: + if (!(config->flags & DOM_CONFIG_CDATA_SECTIONS)) { + /* Transform CDATASection nodes into Text nodes. The text is + * appended to a directly preceding Text node, if there is one. */ + code = append_node_text(config, node); + } + break; + + case DOM_NODE_ENTITY_REFERENCE: + if (!(config->flags & DOM_CONFIG_ENTITIES)) { + /* Remove all EntityReference nodes from the document, + * putting text in their place. Note that append_node_text() + * currently adds the reference unexpanded (&name;) instead of + * the entity expansion. */ + code = append_node_text(config, node); + } + break; + + case DOM_NODE_DOCUMENT: + mem_free(config); /* the config is released when the document node is popped */ + break; + + default: + break; + } + + return code; +} + +enum dom_stack_code +dom_normalize_text(struct dom_stack *stack, struct dom_node *node, void *data) +{ + struct dom_config *config = stack->current->data; + + if (config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) { + /* Collapse each run of whitespace in the text into a single space.
*/ + return normalize_text_node_whitespace(node); + } + + return DOM_STACK_CODE_OK; +} + + +static struct dom_stack_context_info dom_config_normalizer_context = { + /* Object size: */ 0, + /* Push: */ + { + /* */ NULL, + /* DOM_NODE_ELEMENT */ NULL, + /* DOM_NODE_ATTRIBUTE */ NULL, + /* DOM_NODE_TEXT */ dom_normalize_text, + /* DOM_NODE_CDATA_SECTION */ NULL, + /* DOM_NODE_ENTITY_REFERENCE */ NULL, + /* DOM_NODE_ENTITY */ NULL, + /* DOM_NODE_PROC_INSTRUCTION */ NULL, + /* DOM_NODE_COMMENT */ NULL, + /* DOM_NODE_DOCUMENT */ NULL, + /* DOM_NODE_DOCUMENT_TYPE */ NULL, + /* DOM_NODE_DOCUMENT_FRAGMENT */ NULL, + /* DOM_NODE_NOTATION */ NULL, + }, + /* Pop: */ + { + /* */ NULL, + /* DOM_NODE_ELEMENT */ dom_normalize_node_end, + /* DOM_NODE_ATTRIBUTE */ dom_normalize_node_end, + /* DOM_NODE_TEXT */ dom_normalize_node_end, + /* DOM_NODE_CDATA_SECTION */ dom_normalize_node_end, + /* DOM_NODE_ENTITY_REFERENCE */ dom_normalize_node_end, + /* DOM_NODE_ENTITY */ dom_normalize_node_end, + /* DOM_NODE_PROC_INSTRUCTION */ dom_normalize_node_end, + /* DOM_NODE_COMMENT */ dom_normalize_node_end, + /* DOM_NODE_DOCUMENT */ dom_normalize_node_end, + /* DOM_NODE_DOCUMENT_TYPE */ dom_normalize_node_end, + /* DOM_NODE_DOCUMENT_FRAGMENT */ dom_normalize_node_end, + /* DOM_NODE_NOTATION */ dom_normalize_node_end, + } +}; + +struct dom_config * +add_dom_config_normalizer(struct dom_stack *stack, enum dom_config_flag flags) +{ + struct dom_config *config; + + config = mem_calloc(1, sizeof(*config)); + if (!config) return NULL; + + config->flags = flags; + + if (add_dom_stack_context(stack, config, &dom_config_normalizer_context)) + return config; + + mem_free(config); /* adding the context failed, so the config is released here and NULL is returned */ + + return NULL; +} diff --git a/src/dom/configuration.h b/src/dom/configuration.h new file mode 100644 index 00000000..267f4cc6 --- /dev/null +++ b/src/dom/configuration.h @@ -0,0 +1,90 @@ +#ifndef EL__DOM_CONFIGURATION_H +#define EL__DOM_CONFIGURATION_H + +struct dom_node; +struct dom_stack; + +/* API Doc :: dom-config */ +
+/** DOM Configuration + * + * The DOMConfiguration interface represents the configuration of a document. + * Using the configuration, it is possible to change the behaviour of how + * document normalization is done, such as replacing the CDATASection nodes + * with Text nodes. + * + * Note: Parameters are similar to features and properties used in SAX2 [SAX]. + * + * The following parameters are defined in the DOM. The defaults mentioned below are those of the DOM parameters; add_dom_config_normalizer() stores the flags exactly as passed. */ + +enum dom_config_flag { + /** "cdata-sections" + * + * The default is true and will keep CDATASection nodes in the + * document. When false, CDATASection nodes in the document are + * transformed into Text nodes. The new Text node is then combined with + * any adjacent Text node. */ + DOM_CONFIG_CDATA_SECTIONS = 1, + + /** "comments" + * + * If true (the default) keep Comment nodes in the document, else + * discard them. */ + DOM_CONFIG_COMMENTS = 2, + + /** "element-content-whitespace" + * + * The default is true and will keep all whitespace in the document. + * When false, discard all Text nodes that contain only whitespace. */ + DOM_CONFIG_ELEMENT_CONTENT_WHITESPACE = 4, + + /** "entities" + * + * When true (the default) keep EntityReference nodes in the document. + * When false, remove all EntityReference nodes from the document, + * putting the entity expansions directly in their place. Text nodes + * are normalized. Only unexpanded entity references are kept in the + * document. Note: This parameter does not affect Entity nodes. */ + DOM_CONFIG_ENTITIES = 8, + + /** "normalize-characters" + * + * The default is false, not to perform character normalization, else + * fully normalize the characters in the document as defined in + * appendix B of [XML 1.1]. (The normalizer in configuration.c uses this flag to append a Text node to a directly preceding Text node.) */ + DOM_CONFIG_NORMALIZE_CHARACTERS = 16, + + /** "unknown" + * + * If false (default) nothing is done, else elements, attributes and + * processing instructions that are not known according to the built-in + * node info are discarded.
*/ + DOM_CONFIG_UNKNOWN = 32, + + /** "normalize-whitespace" + * + * If false (default) nothing is done, else each run of whitespace in + * Text nodes is collapsed into a single space. */ + DOM_CONFIG_NORMALIZE_WHITESPACE = 64, +}; + +struct dom_error; + +struct dom_config { + enum dom_config_flag flags; /*: DOM configuration flags, a bitwise OR of enum dom_config_flag values. */ + + /** FIXME: "error-handler" + * + * Contains an error handler. If an error is encountered in the + * document, this handler is called. When called, DOMError.relatedData + * will contain the closest node to where the error occurred. If the + * implementation is unable to determine the node where the error + * occurs, DOMError.relatedData will contain the Document node. + */ + void (*error_handler)(struct dom_config *, struct dom_error *); +}; + +struct dom_config * +add_dom_config_normalizer(struct dom_stack *stack, enum dom_config_flag flags); /* Allocates a dom_config holding flags and adds the normalizer context to the stack; returns the config, or NULL on failure. */ + +#endif