2006-01-15 23:12:34 -05:00
|
|
|
/* DOM Configuration */
|
|
|
|
|
|
|
|
#include "elinks.h"
|
|
|
|
|
|
|
|
#include "dom/configuration.h"
|
|
|
|
#include "dom/node.h"
|
|
|
|
#include "dom/stack.h"
|
|
|
|
#include "dom/string.h"
|
|
|
|
|
|
|
|
|
|
|
|
static enum dom_stack_code
|
|
|
|
normalize_text_node_whitespace(struct dom_node *node)
|
|
|
|
{
|
|
|
|
unsigned char buf[256];
|
|
|
|
struct dom_string string = INIT_DOM_STRING(NULL, 0);
|
|
|
|
int count = 0, i = 0;
|
|
|
|
unsigned char *text = node->string.string;
|
|
|
|
|
|
|
|
assert(node->type == DOM_NODE_TEXT);
|
|
|
|
|
|
|
|
while (i < node->string.length) {
|
|
|
|
int j;
|
|
|
|
|
|
|
|
for (j = 0; j < sizeof(buf) && i < node->string.length; i++) {
|
|
|
|
unsigned char data = text[i];
|
|
|
|
|
|
|
|
if (isspace(data)) {
|
|
|
|
if (count == 1)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
data = ' ';
|
|
|
|
count = 1;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
count = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
buf[j++] = data;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!add_to_dom_string(&string, buf, j)) {
|
|
|
|
done_dom_string(&string);
|
|
|
|
return DOM_STACK_CODE_ERROR_MEM_ALLOC;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (node->data.text.allocated)
|
|
|
|
done_dom_string(&node->string);
|
|
|
|
|
|
|
|
set_dom_string(&node->string, string.string, string.length);
|
|
|
|
node->data.text.allocated = 1;
|
|
|
|
|
|
|
|
return DOM_STACK_CODE_OK;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
static enum dom_stack_code
|
|
|
|
append_node_text(struct dom_config *config, struct dom_node *node)
|
|
|
|
{
|
|
|
|
struct dom_node *prev = get_dom_node_prev(node);
|
|
|
|
size_t length;
|
|
|
|
struct dom_string dest;
|
|
|
|
struct dom_string src;
|
|
|
|
int error = 0;
|
|
|
|
|
|
|
|
copy_struct(&src, &node->string);
|
|
|
|
|
|
|
|
if (!prev || prev->type != DOM_NODE_TEXT) {
|
|
|
|
/* Preserve text nodes with no one to append to. */
|
|
|
|
if (node->type == DOM_NODE_TEXT)
|
|
|
|
return DOM_STACK_CODE_OK;
|
|
|
|
|
|
|
|
prev = NULL;
|
|
|
|
set_dom_string(&dest, NULL, 0);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
if (prev->data.text.allocated) {
|
|
|
|
copy_struct(&dest, &prev->string);
|
|
|
|
} else {
|
|
|
|
set_dom_string(&dest, NULL, 0);
|
|
|
|
if (!add_to_dom_string(&dest, prev->string.string, prev->string.length))
|
|
|
|
return DOM_STACK_CODE_ERROR_MEM_ALLOC;
|
|
|
|
set_dom_string(&prev->string, dest.string, dest.length);
|
|
|
|
prev->data.text.allocated = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
length = dest.length;
|
|
|
|
|
|
|
|
switch (node->type) {
|
|
|
|
case DOM_NODE_CDATA_SECTION:
|
|
|
|
case DOM_NODE_TEXT:
|
|
|
|
if (!add_to_dom_string(&dest, src.string, src.length))
|
|
|
|
error = 1;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DOM_NODE_ENTITY_REFERENCE:
|
|
|
|
/* FIXME: Until we will have uniform encoding at this point
|
|
|
|
* (UTF-8) we just add the entity reference unexpanded assuming
|
|
|
|
* that convert_string() will eventually do the work of
|
|
|
|
* expanding it. */
|
|
|
|
if (!add_to_dom_string(&dest, "&", 1)
|
|
|
|
|| !add_to_dom_string(&dest, src.string, src.length)
|
|
|
|
|| !add_to_dom_string(&dest, ";", 1)) {
|
|
|
|
error = 1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
INTERNAL("Cannot append from node %d", node->type);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (error) {
|
|
|
|
if (prev)
|
|
|
|
prev->string.length = length;
|
|
|
|
else
|
|
|
|
done_dom_string(&dest);
|
|
|
|
return DOM_STACK_CODE_ERROR_MEM_ALLOC;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (prev) {
|
|
|
|
copy_struct(&prev->string, &dest);
|
|
|
|
|
|
|
|
if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE)
|
|
|
|
&& node->type != DOM_NODE_ENTITY_REFERENCE) {
|
|
|
|
/* XXX: Ignore errors since we want to always
|
|
|
|
* free the appended node at this point. */
|
|
|
|
normalize_text_node_whitespace(prev);
|
|
|
|
}
|
|
|
|
|
|
|
|
return DOM_STACK_CODE_FREE_NODE;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
int was_cdata_section = node->type == DOM_NODE_CDATA_SECTION;
|
|
|
|
|
|
|
|
node->type = DOM_NODE_TEXT;
|
|
|
|
memset(&node->data, 0, sizeof(node->data));
|
|
|
|
node->data.text.allocated = 1;
|
|
|
|
copy_struct(&node->string, &dest);
|
|
|
|
|
|
|
|
if ((config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE)
|
|
|
|
&& was_cdata_section) {
|
|
|
|
/* XXX: Ignore errors since we want to always ok the
|
|
|
|
* append. */
|
|
|
|
normalize_text_node_whitespace(node);
|
|
|
|
}
|
|
|
|
|
|
|
|
return DOM_STACK_CODE_OK;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static enum dom_stack_code
|
|
|
|
dom_normalize_node_end(struct dom_stack *stack, struct dom_node *node, void *data)
|
|
|
|
{
|
|
|
|
struct dom_config *config = stack->current->data;
|
|
|
|
enum dom_stack_code code = DOM_STACK_CODE_OK;
|
|
|
|
|
|
|
|
switch (node->type) {
|
|
|
|
case DOM_NODE_ELEMENT:
|
|
|
|
if ((config->flags & DOM_CONFIG_UNKNOWN)
|
|
|
|
&& !node->data.element.type) {
|
|
|
|
/* Drop elements that are not known from the built-in
|
|
|
|
* node info. */
|
|
|
|
code = DOM_STACK_CODE_FREE_NODE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DOM_NODE_ATTRIBUTE:
|
|
|
|
if ((config->flags & DOM_CONFIG_UNKNOWN)
|
|
|
|
&& !node->data.attribute.type) {
|
|
|
|
/* Drop elements that are not known from the built-in
|
|
|
|
* node info. */
|
|
|
|
code = DOM_STACK_CODE_FREE_NODE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DOM_NODE_PROCESSING_INSTRUCTION:
|
|
|
|
if ((config->flags & DOM_CONFIG_UNKNOWN)
|
|
|
|
&& !node->data.proc_instruction.type) {
|
|
|
|
/* Drop elements that are not known from the built-in
|
|
|
|
* node info. */
|
|
|
|
code = DOM_STACK_CODE_FREE_NODE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DOM_NODE_TEXT:
|
2006-01-18 22:54:30 -05:00
|
|
|
if (!(config->flags & DOM_CONFIG_ELEMENT_CONTENT_WHITESPACE)
|
|
|
|
&& node->data.text.only_space) {
|
2006-01-15 23:12:34 -05:00
|
|
|
/* Discard all Text nodes that contain
|
|
|
|
* whitespaces in element content]. */
|
|
|
|
code = DOM_STACK_CODE_FREE_NODE;
|
2006-01-18 22:54:30 -05:00
|
|
|
} else {
|
|
|
|
code = append_node_text(config, node);
|
2006-01-15 23:12:34 -05:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DOM_NODE_COMMENT:
|
|
|
|
if (!(config->flags & DOM_CONFIG_COMMENTS)) {
|
|
|
|
/* Discard all comments. */
|
|
|
|
code = DOM_STACK_CODE_FREE_NODE;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DOM_NODE_CDATA_SECTION:
|
|
|
|
if (!(config->flags & DOM_CONFIG_CDATA_SECTIONS)) {
|
|
|
|
/* Transform CDATASection nodes into Text nodes. The new Text
|
|
|
|
* node is then combined with any adjacent Text node. */
|
|
|
|
code = append_node_text(config, node);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DOM_NODE_ENTITY_REFERENCE:
|
|
|
|
if (!(config->flags & DOM_CONFIG_ENTITIES)) {
|
|
|
|
/* Remove all EntityReference nodes from the document,
|
|
|
|
* putting the entity expansions directly in their place. Text
|
|
|
|
* nodes are normalized. Only unexpanded entity references are
|
|
|
|
* kept in the document. */
|
|
|
|
code = append_node_text(config, node);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case DOM_NODE_DOCUMENT:
|
|
|
|
mem_free(config);
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return code;
|
|
|
|
}
|
|
|
|
|
|
|
|
enum dom_stack_code
|
|
|
|
dom_normalize_text(struct dom_stack *stack, struct dom_node *node, void *data)
|
|
|
|
{
|
|
|
|
struct dom_config *config = stack->current->data;
|
|
|
|
|
|
|
|
if (config->flags & DOM_CONFIG_NORMALIZE_WHITESPACE) {
|
|
|
|
/* Normalize whitespace in the text. */
|
|
|
|
return normalize_text_node_whitespace(node);
|
|
|
|
}
|
|
|
|
|
|
|
|
return DOM_STACK_CODE_OK;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static struct dom_stack_context_info dom_config_normalizer_context = {
|
|
|
|
/* Object size: */ 0,
|
|
|
|
/* Push: */
|
|
|
|
{
|
|
|
|
/* */ NULL,
|
|
|
|
/* DOM_NODE_ELEMENT */ NULL,
|
|
|
|
/* DOM_NODE_ATTRIBUTE */ NULL,
|
|
|
|
/* DOM_NODE_TEXT */ dom_normalize_text,
|
|
|
|
/* DOM_NODE_CDATA_SECTION */ NULL,
|
|
|
|
/* DOM_NODE_ENTITY_REFERENCE */ NULL,
|
|
|
|
/* DOM_NODE_ENTITY */ NULL,
|
|
|
|
/* DOM_NODE_PROC_INSTRUCTION */ NULL,
|
|
|
|
/* DOM_NODE_COMMENT */ NULL,
|
|
|
|
/* DOM_NODE_DOCUMENT */ NULL,
|
|
|
|
/* DOM_NODE_DOCUMENT_TYPE */ NULL,
|
|
|
|
/* DOM_NODE_DOCUMENT_FRAGMENT */ NULL,
|
|
|
|
/* DOM_NODE_NOTATION */ NULL,
|
|
|
|
},
|
|
|
|
/* Pop: */
|
|
|
|
{
|
|
|
|
/* */ NULL,
|
|
|
|
/* DOM_NODE_ELEMENT */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_ATTRIBUTE */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_TEXT */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_CDATA_SECTION */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_ENTITY_REFERENCE */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_ENTITY */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_PROC_INSTRUCTION */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_COMMENT */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_DOCUMENT */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_DOCUMENT_TYPE */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_DOCUMENT_FRAGMENT */ dom_normalize_node_end,
|
|
|
|
/* DOM_NODE_NOTATION */ dom_normalize_node_end,
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
struct dom_config *
|
|
|
|
add_dom_config_normalizer(struct dom_stack *stack, enum dom_config_flag flags)
|
|
|
|
{
|
|
|
|
struct dom_config *config;
|
|
|
|
|
|
|
|
config = mem_calloc(1, sizeof(*config));
|
|
|
|
if (!config) return NULL;
|
|
|
|
|
|
|
|
config->flags = flags;
|
|
|
|
|
|
|
|
if (add_dom_stack_context(stack, config, &dom_config_normalizer_context))
|
|
|
|
return config;
|
|
|
|
|
|
|
|
mem_free(config);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
2006-01-17 10:55:10 -05:00
|
|
|
|
|
|
|
struct dom_config_info {
|
|
|
|
struct dom_string name;
|
|
|
|
enum dom_config_flag flag;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define DOM_CONFIG(name, flag) \
|
|
|
|
{ INIT_DOM_STRING(name, -1), (flag) }
|
|
|
|
|
|
|
|
static struct dom_config_info dom_config_info[] = {
|
|
|
|
DOM_CONFIG("cdata-sections", DOM_CONFIG_CDATA_SECTIONS),
|
|
|
|
DOM_CONFIG("comments", DOM_CONFIG_COMMENTS),
|
|
|
|
DOM_CONFIG("element-content-whitespace",DOM_CONFIG_ELEMENT_CONTENT_WHITESPACE),
|
|
|
|
DOM_CONFIG("entities", DOM_CONFIG_ENTITIES),
|
|
|
|
DOM_CONFIG("normalize-characters", DOM_CONFIG_NORMALIZE_CHARACTERS),
|
|
|
|
DOM_CONFIG("unknown", DOM_CONFIG_UNKNOWN),
|
|
|
|
DOM_CONFIG("normalize-whitespace", DOM_CONFIG_NORMALIZE_WHITESPACE),
|
|
|
|
};
|
|
|
|
|
|
|
|
static enum dom_config_flag
|
|
|
|
get_dom_config_flag(struct dom_string *name)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < sizeof_array(dom_config_info); i++)
|
|
|
|
if (!dom_string_casecmp(&dom_config_info[i].name, name))
|
|
|
|
return dom_config_info[i].flag;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
enum dom_config_flag
|
|
|
|
parse_dom_config(unsigned char *flaglist, unsigned char separator)
|
|
|
|
{
|
|
|
|
enum dom_config_flag flags = 0;
|
|
|
|
|
|
|
|
while (flaglist) {
|
|
|
|
unsigned char *end = separator ? strchr(flaglist, separator) : NULL;
|
2006-01-18 22:54:30 -05:00
|
|
|
int length = end ? end - flaglist : strlen(flaglist);
|
2006-01-17 10:55:10 -05:00
|
|
|
struct dom_string name = INIT_DOM_STRING(flaglist, length);
|
|
|
|
|
|
|
|
flags |= get_dom_config_flag(&name);
|
|
|
|
if (end) end++;
|
|
|
|
flaglist = end;
|
|
|
|
}
|
|
|
|
|
|
|
|
return flags;
|
|
|
|
}
|