1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-07-26 16:45:12 -04:00

Add DOM configuration inspired module

It adds support for normalizing a DOM document in various ways, such as
removing comments, converting CDATA section nodes to text nodes, cleaning up
whitespace, etc.

Use it in the RSS renderer to sanitize the text to be rendered.
This commit is contained in:
Jonas Fonseca 2006-01-16 05:12:34 +01:00 committed by Jonas Fonseca
parent 768f97c38e
commit 6c85c0f009
4 changed files with 399 additions and 30 deletions

View File

@ -22,6 +22,7 @@
#include "document/document.h"
#include "document/dom/renderer.h"
#include "document/renderer.h"
#include "dom/configuration.h"
#include "dom/scanner.h"
#include "dom/sgml/parser.h"
#include "dom/sgml/rss/rss.h"
@ -703,6 +704,9 @@ static struct dom_stack_context_info dom_source_renderer_context_info = {
/* DOM RSS Renderer */
/* Normalizer flags handed to add_dom_config_normalizer() when an RSS document
 * is rendered. Only the two "normalize" flags are set; every other
 * DOM_CONFIG_* flag is left clear. */
#define RSS_CONFIG_FLAGS \
(DOM_CONFIG_NORMALIZE_WHITESPACE | DOM_CONFIG_NORMALIZE_CHARACTERS)
enum dom_stack_code
dom_rss_push_element(struct dom_stack *stack, struct dom_node *node, void *data)
{
@ -792,32 +796,6 @@ dom_rss_pop_element(struct dom_stack *stack, struct dom_node *node, void *data)
return DOM_STACK_CODE_OK;
}
/* Push callback that accumulates the string of a content node into
 * renderer->text.
 *
 * The renderer is taken from stack->current->data. Nothing is collected while
 * renderer->node is unset. Always returns DOM_STACK_CODE_OK. */
enum dom_stack_code
dom_rss_push_content(struct dom_stack *stack, struct dom_node *node, void *data)
{
struct dom_renderer *renderer = stack->current->data;
unsigned char *string = node->string.string;
int length = node->string.length;
/* NOTE(review): node has already been dereferenced above, so the `node`
 * part of this assertion cannot catch a NULL node. */
assert(node && renderer && renderer->document);
if (!renderer->node)
return DOM_STACK_CODE_OK;
if (node->type == DOM_NODE_ENTITY_REFERENCE) {
/* Widen the span by one byte in front and one behind; presumably this
 * picks up the '&' and ';' surrounding the entity name in the source
 * buffer -- TODO confirm. */
string -= 1;
length += 2;
}
/* The first chunk initializes the text; later chunks are appended. */
if (!is_dom_string_set(&renderer->text)) {
init_dom_string(&renderer->text, string, length);
} else {
add_to_dom_string(&renderer->text, string, length);
}
return DOM_STACK_CODE_OK;
}
static struct dom_string *
get_rss_node_text(struct dom_node *node)
{
@ -955,9 +933,9 @@ static struct dom_stack_context_info dom_rss_renderer_context_info = {
/* */ NULL,
/* DOM_NODE_ELEMENT */ dom_rss_push_element,
/* DOM_NODE_ATTRIBUTE */ NULL,
/* DOM_NODE_TEXT */ dom_rss_push_content,
/* DOM_NODE_CDATA_SECTION */ dom_rss_push_content,
/* DOM_NODE_ENTITY_REFERENCE */ dom_rss_push_content,
/* DOM_NODE_TEXT */ NULL,
/* DOM_NODE_CDATA_SECTION */ NULL,
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
/* DOM_NODE_ENTITY */ NULL,
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
/* DOM_NODE_COMMENT */ NULL,
@ -1046,6 +1024,7 @@ render_dom_document(struct cache_entry *cached, struct document *document,
} else if (doctype == SGML_DOCTYPE_RSS) {
add_dom_stack_context(&parser->stack, &renderer,
&dom_rss_renderer_context_info);
add_dom_config_normalizer(&parser->stack, RSS_CONFIG_FLAGS);
}
/* FIXME: When rendering this way we don't really care about the code.

View File

@ -2,7 +2,7 @@ top_builddir=../..
include $(top_builddir)/Makefile.config
SUBDIRS = css sgml
OBJS = node.o select.o stack.o scanner.o
OBJS = configuration.o node.o select.o stack.o scanner.o
SUBDIRS-$(CONFIG_DEBUG) += test

300
src/dom/configuration.c Normal file
View File

@ -0,0 +1,300 @@
/* DOM Configuration */
#include "elinks.h"
#include "dom/configuration.h"
#include "dom/node.h"
#include "dom/stack.h"
#include "dom/string.h"
static enum dom_stack_code
normalize_text_node_whitespace(struct dom_node *node)
{
unsigned char buf[256];
struct dom_string string = INIT_DOM_STRING(NULL, 0);
int count = 0, i = 0;
unsigned char *text = node->string.string;
assert(node->type == DOM_NODE_TEXT);
while (i < node->string.length) {
int j;
for (j = 0; j < sizeof(buf) && i < node->string.length; i++) {
unsigned char data = text[i];
if (isspace(data)) {
if (count == 1)
continue;
data = ' ';
count = 1;
} else {
count = 0;
}
buf[j++] = data;
}
if (!add_to_dom_string(&string, buf, j)) {
done_dom_string(&string);
return DOM_STACK_CODE_ERROR_MEM_ALLOC;
}
}
if (node->data.text.allocated)
done_dom_string(&node->string);
set_dom_string(&node->string, string.string, string.length);
node->data.text.allocated = 1;
return DOM_STACK_CODE_OK;
}
/* Merge the text of @node into the preceding text node, or, when there is no
 * preceding text node, turn @node itself into an allocated text node.
 *
 * @node may be a text, CDATA section or entity reference node. Entity
 * references are appended unexpanded as "&name;".
 *
 * Returns:
 *  - DOM_STACK_CODE_FREE_NODE when the text was appended to the previous text
 *    node, so @node itself is no longer needed;
 *  - DOM_STACK_CODE_OK when @node is a text node with nothing to append to, or
 *    when @node was converted in place to a text node;
 *  - DOM_STACK_CODE_ERROR_MEM_ALLOC when a string allocation failed. */
static enum dom_stack_code
append_node_text(struct dom_config *config, struct dom_node *node)
{
	struct dom_node *prev = get_dom_node_prev(node);
	size_t length;
	struct dom_string dest;
	struct dom_string src;
	int error = 0;

	copy_struct(&src, &node->string);

	if (!prev || prev->type != DOM_NODE_TEXT) {
		/* Preserve text nodes with no one to append to. */
		if (node->type == DOM_NODE_TEXT)
			return DOM_STACK_CODE_OK;

		prev = NULL;
		set_dom_string(&dest, NULL, 0);

	} else {
		if (prev->data.text.allocated) {
			copy_struct(&dest, &prev->string);
		} else {
			/* The previous node does not own its string, so make
			 * a private copy that can be grown. */
			set_dom_string(&dest, NULL, 0);
			if (!add_to_dom_string(&dest, prev->string.string, prev->string.length))
				return DOM_STACK_CODE_ERROR_MEM_ALLOC;
			set_dom_string(&prev->string, dest.string, dest.length);
			prev->data.text.allocated = 1;
		}
	}

	/* Remember where the existing text ends so it can be restored if
	 * appending fails half way. */
	length = dest.length;

	switch (node->type) {
	case DOM_NODE_CDATA_SECTION:
	case DOM_NODE_TEXT:
		if (!add_to_dom_string(&dest, src.string, src.length))
			error = 1;
		break;

	case DOM_NODE_ENTITY_REFERENCE:
		/* FIXME: Until we will have uniform encoding at this point
		 * (UTF-8) we just add the entity reference unexpanded assuming
		 * that convert_string() will eventually do the work of
		 * expanding it. */
		if (!add_to_dom_string(&dest, "&", 1)
		    || !add_to_dom_string(&dest, src.string, src.length)
		    || !add_to_dom_string(&dest, ";", 1)) {
			error = 1;
		}
		break;

	default:
		INTERNAL("Cannot append from node %d", node->type);
	}

	if (error) {
		if (prev) {
			/* An earlier, successful append may already have moved
			 * the buffer, so take the pointer from dest as well as
			 * restoring the length; otherwise prev could be left
			 * pointing at freed memory. */
			prev->string.string = dest.string;
			prev->string.length = length;
		} else {
			done_dom_string(&dest);
		}
		return DOM_STACK_CODE_ERROR_MEM_ALLOC;
	}

	if (prev) {
		copy_struct(&prev->string, &dest);

		if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE)
		    && node->type != DOM_NODE_ENTITY_REFERENCE) {
			/* XXX: Ignore errors since we want to always
			 * free the appended node at this point. */
			normalize_text_node_whitespace(prev);
		}

		return DOM_STACK_CODE_FREE_NODE;

	} else {
		int was_cdata_section = node->type == DOM_NODE_CDATA_SECTION;

		/* Convert the node in place into a text node that owns the
		 * newly built string. */
		node->type = DOM_NODE_TEXT;
		memset(&node->data, 0, sizeof(node->data));
		node->data.text.allocated = 1;
		copy_struct(&node->string, &dest);

		if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE)
		    && was_cdata_section) {
			/* XXX: Ignore errors since we want to always ok the
			 * append. */
			normalize_text_node_whitespace(node);
		}

		return DOM_STACK_CODE_OK;
	}
}
/* Pop callback of the normalizer: decide, from the configuration flags, what
 * happens to a node once it has been traversed.
 *
 * The configuration is read from stack->current->data. Returns
 * DOM_STACK_CODE_FREE_NODE for nodes that should be dropped, the result of
 * append_node_text() for nodes that are merged into text, and
 * DOM_STACK_CODE_OK otherwise. */
static enum dom_stack_code
dom_normalize_node_end(struct dom_stack *stack, struct dom_node *node, void *data)
{
	struct dom_config *config = stack->current->data;

	switch (node->type) {
	case DOM_NODE_ELEMENT:
		/* Drop elements that are not known from the built-in
		 * node info. */
		if ((config->flags & DOM_CONFIG_UNKNOWN)
		    && !node->data.element.type)
			return DOM_STACK_CODE_FREE_NODE;
		break;

	case DOM_NODE_ATTRIBUTE:
		/* Drop attributes that are not known from the built-in
		 * node info. */
		if ((config->flags & DOM_CONFIG_UNKNOWN)
		    && !node->data.attribute.type)
			return DOM_STACK_CODE_FREE_NODE;
		break;

	case DOM_NODE_PROCESSING_INSTRUCTION:
		/* Drop processing instructions that are not known from the
		 * built-in node info. */
		if ((config->flags & DOM_CONFIG_UNKNOWN)
		    && !node->data.proc_instruction.type)
			return DOM_STACK_CODE_FREE_NODE;
		break;

	case DOM_NODE_TEXT:
		/* Merging into the previous text node takes precedence over
		 * the whitespace-only check. */
		if (config->flags & DOM_CONFIG_NORMALIZE_CHARACTERS)
			return append_node_text(config, node);

		/* Discard all text nodes that contain only whitespace in
		 * element content. */
		if (!(config->flags & DOM_CONFIG_ELEMENT_CONTENT_WHITESPACE)
		    && node->data.text.only_space)
			return DOM_STACK_CODE_FREE_NODE;
		break;

	case DOM_NODE_COMMENT:
		/* Discard all comments unless they are asked for. */
		if (!(config->flags & DOM_CONFIG_COMMENTS))
			return DOM_STACK_CODE_FREE_NODE;
		break;

	case DOM_NODE_CDATA_SECTION:
		/* Transform CDATASection nodes into Text nodes. The new Text
		 * node is then combined with any adjacent Text node. */
		if (!(config->flags & DOM_CONFIG_CDATA_SECTIONS))
			return append_node_text(config, node);
		break;

	case DOM_NODE_ENTITY_REFERENCE:
		/* Replace EntityReference nodes by text, combined with any
		 * preceding Text node. */
		if (!(config->flags & DOM_CONFIG_ENTITIES))
			return append_node_text(config, node);
		break;

	case DOM_NODE_DOCUMENT:
		/* The document node is the point where the configuration
		 * allocated by add_dom_config_normalizer() is released. */
		mem_free(config);
		break;

	default:
		break;
	}

	return DOM_STACK_CODE_OK;
}
/* Push callback for text nodes: collapse whitespace in the node's text when
 * DOM_CONFIG_NORMALIZE_WHITESPACE is set in the configuration found in
 * stack->current->data. Otherwise the node is left untouched and
 * DOM_STACK_CODE_OK is returned.
 *
 * NOTE(review): this function has external linkage although configuration.h
 * does not declare it -- confirm whether it should be static. */
enum dom_stack_code
dom_normalize_text(struct dom_stack *stack, struct dom_node *node, void *data)
{
	struct dom_config *config = stack->current->data;

	if (!(config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE))
		return DOM_STACK_CODE_OK;

	/* Normalize whitespace in the text. */
	return normalize_text_node_whitespace(node);
}
/* Stack context used by add_dom_config_normalizer().
 *
 * Only text nodes have a push callback (dom_normalize_text); every node type
 * has dom_normalize_node_end as its pop callback. The object size is zero. */
static struct dom_stack_context_info dom_config_normalizer_context = {
/* Object size: */ 0,
/* Push: */
{
/* */ NULL,
/* DOM_NODE_ELEMENT */ NULL,
/* DOM_NODE_ATTRIBUTE */ NULL,
/* DOM_NODE_TEXT */ dom_normalize_text,
/* DOM_NODE_CDATA_SECTION */ NULL,
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
/* DOM_NODE_ENTITY */ NULL,
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
/* DOM_NODE_COMMENT */ NULL,
/* DOM_NODE_DOCUMENT */ NULL,
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
/* DOM_NODE_NOTATION */ NULL,
},
/* Pop: */
{
/* */ NULL,
/* DOM_NODE_ELEMENT */ dom_normalize_node_end,
/* DOM_NODE_ATTRIBUTE */ dom_normalize_node_end,
/* DOM_NODE_TEXT */ dom_normalize_node_end,
/* DOM_NODE_CDATA_SECTION */ dom_normalize_node_end,
/* DOM_NODE_ENTITY_REFERENCE */ dom_normalize_node_end,
/* DOM_NODE_ENTITY */ dom_normalize_node_end,
/* DOM_NODE_PROC_INSTRUCTION */ dom_normalize_node_end,
/* DOM_NODE_COMMENT */ dom_normalize_node_end,
/* DOM_NODE_DOCUMENT */ dom_normalize_node_end,
/* DOM_NODE_DOCUMENT_TYPE */ dom_normalize_node_end,
/* DOM_NODE_DOCUMENT_FRAGMENT */ dom_normalize_node_end,
/* DOM_NODE_NOTATION */ dom_normalize_node_end,
}
};
/* Attach a normalizer with the given configuration flags to a DOM stack.
 *
 * A zeroed configuration is allocated, given @flags and registered on @stack
 * as the data of the normalizer context.
 *
 * Returns the new configuration, or NULL if the allocation or the context
 * registration failed. The configuration is freed by the normalizer itself
 * when the document node is popped, so callers must not free it. */
struct dom_config *
add_dom_config_normalizer(struct dom_stack *stack, enum dom_config_flag flags)
{
	struct dom_config *config = mem_calloc(1, sizeof(*config));

	if (config == NULL)
		return NULL;

	config->flags = flags;

	if (!add_dom_stack_context(stack, config, &dom_config_normalizer_context)) {
		/* Registration failed, so nobody else owns the config. */
		mem_free(config);
		return NULL;
	}

	return config;
}

90
src/dom/configuration.h Normal file
View File

@ -0,0 +1,90 @@
#ifndef EL__DOM_CONFIGURATION_H
#define EL__DOM_CONFIGURATION_H
struct dom_node;
struct dom_stack;
/* API Doc :: dom-config */
/** DOM Configuration
*
* The DOMConfiguration interface represents the configuration of a document.
* Using the configuration, it is possible to change the behaviour of how
* document normalization is done, such as replacing the CDATASection nodes
* with Text nodes.
*
* Note: Parameters are similar to features and properties used in SAX2 [SAX].
*
* The following list of parameters defined in the DOM: */
enum dom_config_flag {
	/** "cdata-sections"
	 *
	 * When set, CDATASection nodes are kept in the document. When clear,
	 * they are transformed into Text nodes, and the new Text node is
	 * combined with a preceding Text node if there is one. */
	DOM_CONFIG_CDATA_SECTIONS = 1 << 0,

	/** "comments"
	 *
	 * When set, Comment nodes are kept in the document. When clear, they
	 * are discarded. */
	DOM_CONFIG_COMMENTS = 1 << 1,

	/** "element-content-whitespace"
	 *
	 * When set, whitespace-only Text nodes are kept. When clear, Text
	 * nodes that contain only whitespace are discarded. This check is
	 * only made when "normalize-characters" is not set. */
	DOM_CONFIG_ELEMENT_CONTENT_WHITESPACE = 1 << 2,

	/** "entities"
	 *
	 * When set, EntityReference nodes are kept in the document. When
	 * clear, they are replaced by text, which is combined with a
	 * preceding Text node if there is one. The reference is currently
	 * added unexpanded, in its "&name;" form. */
	DOM_CONFIG_ENTITIES = 1 << 3,

	/** "normalize-characters"
	 *
	 * In the DOM this parameter asks for full character normalization as
	 * defined in appendix B of [XML 1.1]. Here, when set, each Text node
	 * is appended to a directly preceding Text node once it has been
	 * traversed. */
	DOM_CONFIG_NORMALIZE_CHARACTERS = 1 << 4,

	/** "unknown"
	 *
	 * When clear nothing is done. When set, elements, attributes and
	 * processing instructions that are not known according to the
	 * built-in node info are discarded. */
	DOM_CONFIG_UNKNOWN = 1 << 5,

	/** "normalize-whitespace"
	 *
	 * When clear nothing is done. When set, every run of whitespace
	 * characters in a Text node is replaced by a single space. */
	DOM_CONFIG_NORMALIZE_WHITESPACE = 1 << 6,
};
struct dom_error;
/* Configuration of a DOM normalizer. An instance is allocated and attached to
 * a DOM stack by add_dom_config_normalizer(). */
struct dom_config {
enum dom_config_flag flags; /*: DOM configuration flags (bitwise OR of DOM_CONFIG_* values). */
/** FIXME: "error-handler"
*
* Contains an error handler. If an error is encountered in the
* document, this handler is called. When called, DOMError.relatedData
* will contain the closest node to where the error occurred. If the
* implementation is unable to determine the node where the error
* occurs, DOMError.relatedData will contain the Document node.
*
* NOTE(review): add_dom_config_normalizer() leaves this NULL (the struct
* is zero-allocated and only flags is assigned) -- confirm whether
* anything calls it yet.
*/
void (*error_handler)(struct dom_config *, struct dom_error *);
};
/** Add a normalizer to a DOM stack
 *
 * Registers a stack context that normalizes nodes according to the given
 * configuration flags as they are traversed.
 *
 * @stack	The DOM stack to attach the normalizer to.
 * @flags	Bitwise OR of #dom_config_flag values.
 *
 * Returns the allocated configuration, or NULL on failure. */
struct dom_config *
add_dom_config_normalizer(struct dom_stack *stack, enum dom_config_flag flags);
#endif