From 889a0f16f8416a2a89865fb7a3941473ff659358 Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Thu, 29 Dec 2005 18:00:26 +0100
Subject: [PATCH 1/8] Fix the expected output of processing instruction parsing

Spaces after the target should be skipped.
---
 src/dom/test/test-sgml-parser-basic | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic
index 9c26420be..f1fbb653d 100755
--- a/src/dom/test/test-sgml-parser-basic
+++ b/src/dom/test/test-sgml-parser-basic
@@ -94,6 +94,6 @@ var val=2;
 proc-instruction: xml -> encoding="UTF8"
   attribute: encoding -> UTF8
 #text: \n...\n
-proc-instruction: ecmascript -> \nvar -> val=2;\n'
+proc-instruction: ecmascript -> var val=2;\n'
 
 test_done

From c24c67ce59a2fb4a1cb85f7675ef0806c18d42c7 Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Thu, 29 Dec 2005 18:20:03 +0100
Subject: [PATCH 2/8] Make it possible to initialise a scanner in a specific
 state

---
 src/dom/scanner.c | 5 +++--
 src/dom/scanner.h | 7 +++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/dom/scanner.c b/src/dom/scanner.c
index 51c3f28e6..10f5cc28a 100644
--- a/src/dom/scanner.c
+++ b/src/dom/scanner.c
@@ -153,8 +153,8 @@ init_dom_scanner_info(struct dom_scanner_info *scanner_info)
 }
 
 void
-init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
-		 struct dom_string *string)
+init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
+		       struct dom_string *string, int state)
 {
 	if (!scanner_info->initialized) {
 		init_dom_scanner_info(scanner_info);
@@ -168,5 +168,6 @@ init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_i
 	scanner->end = string->string + string->length;
 	scanner->current = scanner->table;
 	scanner->info = scanner_info;
+	scanner->state = state;
 	scanner->info->scan(scanner);
 }
diff --git a/src/dom/scanner.h b/src/dom/scanner.h
index e22bf28e5..2dc9722cd 100644
--- a/src/dom/scanner.h
+++ b/src/dom/scanner.h
@@ -91,8 +91,11 @@ struct dom_scanner_info {
 
 
 /* Initializes the scanner. */
-void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
-		      struct dom_string *string);
+void init_dom_scanner_state(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
+			    struct dom_string *string, int state);
+
+#define init_dom_scanner(scanner, info, string) \
+	init_dom_scanner_state(scanner, info, string, 0)
 
 /* The number of tokens in the scanners token table:
  * At best it should be big enough to contain properties with space separated

From 1a177491a0249bf915a77692b60e8292888c5c5c Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Thu, 29 Dec 2005 18:31:49 +0100
Subject: [PATCH 3/8] Fix SGML parsing of processing instructions (<?xml ...?>)

It involves adding a new scanner state which is used only to generate a new
processing instruction (PI) data token. This removes some scanner-specific
code from the parser and makes handling of PIs more generic. The data of
XML PIs is still parsed as attributes and added to the PI node.

The 6th test now succeeds. Hurrah!
---
 src/dom/sgml/parser.c  | 66 ++++++++++++++++++++++--------------------
 src/dom/sgml/scanner.c | 59 +++++++++++++++++++------------------
 src/dom/sgml/scanner.h | 16 ++++++++--
 3 files changed, 78 insertions(+), 63 deletions(-)

diff --git a/src/dom/sgml/parser.c b/src/dom/sgml/parser.c
index bb9b154bb..bbc0aa1dc 100644
--- a/src/dom/sgml/parser.c
+++ b/src/dom/sgml/parser.c
@@ -103,26 +103,17 @@ add_sgml_attribute(struct dom_stack *stack,
 }
 
 static inline struct dom_node *
-add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *token)
+add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *target,
+			  struct dom_scanner_token *data)
 {
 	struct dom_node *parent = get_dom_stack_top(stack)->node;
+	struct dom_string *data_str = data ? &data->string : NULL;
 	struct dom_node *node;
-	/* Split the token in two if we can find a first space separator. */
-	unsigned char *separator = memchr(token->string.string, ' ', token->string.length);
 
-	/* Anything before the separator becomes the target name ... */
-	size_t namelen = separator ? separator - token->string.string : token->string.length;
-	struct dom_string name = INIT_DOM_STRING(token->string.string, namelen);
-
-	/* ... and everything after the instruction value. */
-	unsigned char *valuestr = separator ? separator + 1 : NULL;
-	size_t valuelen = valuestr ? token->string.length - namelen - 1 : 0;
-	struct dom_string value = INIT_DOM_STRING(valuestr, valuelen);
-
-	node = add_dom_proc_instruction(parent, &name, &value);
+	node = add_dom_proc_instruction(parent, &target->string, data_str);
 	if (!node) return NULL;
 
-	switch (token->type) {
+	switch (target->type) {
 	case SGML_TOKEN_PROCESS_XML:
 		node->data.proc_instruction.type = DOM_PROC_INSTRUCTION_XML;
 		break;
@@ -132,13 +123,7 @@ add_sgml_proc_instruction(struct dom_stack *stack, struct dom_scanner_token *tok
 		node->data.proc_instruction.type = DOM_PROC_INSTRUCTION;
 	}
 
-	if (!push_dom_node(stack, node))
-		return NULL;
-
-	if (token->type != SGML_TOKEN_PROCESS_XML)
-		pop_dom_node(stack);
-
-	return node;
+	return push_dom_node(stack, node);
 }
 
 static inline void
@@ -166,9 +151,12 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
 
 	assert(dom_scanner_has_tokens(scanner)
 	       && (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN
-	       	   || get_dom_scanner_token(scanner)->type == SGML_TOKEN_PROCESS_XML));
+	           || (get_dom_stack_top(stack)->node->type == DOM_NODE_PROCESSING_INSTRUCTION
+	       	       && get_dom_stack_top(stack)->node->data.proc_instruction.type
+		          == DOM_PROC_INSTRUCTION_XML)));
 
-	skip_dom_scanner_token(scanner);
+	if (get_dom_scanner_token(scanner)->type == SGML_TOKEN_ELEMENT_BEGIN)
+		skip_dom_scanner_token(scanner);
 
 	while (dom_scanner_has_tokens(scanner)) {
 		struct dom_scanner_token *token = get_dom_scanner_token(scanner);
@@ -220,6 +208,8 @@ parse_sgml_attributes(struct dom_stack *stack, struct dom_scanner *scanner)
 static void
 parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
 {
+	struct dom_scanner_token target;
+
 	while (dom_scanner_has_tokens(scanner)) {
 		struct dom_scanner_token *token = get_dom_scanner_token(scanner);
 
@@ -290,17 +280,31 @@ parse_sgml_plain(struct dom_stack *stack, struct dom_scanner *scanner)
 			break;
 
 		case SGML_TOKEN_PROCESS_XML:
-			if (!add_sgml_proc_instruction(stack, token)) {
-				skip_sgml_tokens(scanner, SGML_TOKEN_TAG_END);
-				break;
+		case SGML_TOKEN_PROCESS:
+			copy_struct(&target, token);
+
+			/* Skip the target token */
+			token = get_next_dom_scanner_token(scanner);
+			if (!token) break;
+
+			assert(token->type == SGML_TOKEN_PROCESS_DATA);
+
+			if (add_sgml_proc_instruction(stack, &target, token)
+			    && target.type == SGML_TOKEN_PROCESS_XML
+			    && token->string.length > 0) {
+				/* Parse the <?xml data="attributes"?>. */
+				struct dom_scanner attr_scanner;
+
+				init_dom_scanner_state(&attr_scanner,
+						       &sgml_scanner_info,
+						       &token->string,
+						       SGML_STATE_ELEMENT);
+
+				if (dom_scanner_has_tokens(&attr_scanner))
+					parse_sgml_attributes(stack, &attr_scanner);
 			}
 
-			parse_sgml_attributes(stack, scanner);
 			pop_dom_node(stack);
-			break;
-
-		case SGML_TOKEN_PROCESS:
-			add_sgml_proc_instruction(stack, token);
 			skip_dom_scanner_token(scanner);
 			break;
 
diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c
index caaf3655f..b51f79bc0 100644
--- a/src/dom/sgml/scanner.c
+++ b/src/dom/sgml/scanner.c
@@ -17,14 +17,6 @@
 
 /* Bitmap entries for the SGML character groups used in the scanner table */
 
-/* The SGML tokenizer maintains a state that can be either text or element
- * state. The state has only meaning while doing the actual scanning and is not
- * accessible at the parsing time. */
-enum sgml_scanner_state {
-	SGML_STATE_TEXT,
-	SGML_STATE_ELEMENT,
-};
-
 enum sgml_char_group {
 	SGML_CHAR_ENTITY	= (1 << 1),
 	SGML_CHAR_IDENT		= (1 << 2),
@@ -296,27 +288,7 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 
 			type = map_dom_scanner_string(scanner, pos, string, base);
 
-			/* Figure out where the processing instruction ends */
-			for (pos = string; skip_sgml(scanner, &pos, '>', 0); ) {
-				if (pos[-2] != '?') continue;
-
-				/* Set length until '?' char and move position
-				 * beyond '>'. */
-				real_length = pos - token->string.string - 2;
-				break;
-			}
-
-			switch (type) {
-			case SGML_TOKEN_PROCESS_XML:
-				/* We want to parse the attributes */
-				assert(scanner->state != SGML_STATE_ELEMENT);
-				scanner->state = SGML_STATE_ELEMENT;
-				break;
-
-			default:
-				/* Just skip the whole thing */
-				string = pos;
-			}
+			scanner->state = SGML_STATE_PROC_INST;
 
 		} else if (*string == '/') {
 			string++;
@@ -403,6 +375,28 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 }
 
 
+/* Processing instruction data scanning */
+
+static inline void
+scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
+{
+	unsigned char *string = scanner->position;
+
+	token->string.string = string++;
+
+	/* Figure out where the processing instruction ends */
+	while (skip_sgml(scanner, &string, '>', 0))
+		if (string[-2] == '?')
+			break;
+
+	token->type = SGML_TOKEN_PROCESS_DATA;
+	token->string.length = string - token->string.string - 2;
+	token->precedence = get_sgml_precedence(token->type);
+	scanner->position = string;
+	scanner->state = SGML_STATE_TEXT;
+}
+
+
 /* Scanner multiplexor */
 
 static struct dom_scanner_token *
@@ -429,8 +423,13 @@ scan_sgml_tokens(struct dom_scanner *scanner)
 			if (current->type == SGML_TOKEN_SKIP) {
 				current--;
 			}
-		} else {
+
+		} else if (scanner->state == SGML_STATE_TEXT) {
 			scan_sgml_text_token(scanner, current);
+
+		} else {
+			scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE);
+			scan_sgml_proc_inst_token(scanner, current);
 		}
 	}
 
diff --git a/src/dom/sgml/scanner.h b/src/dom/sgml/scanner.h
index 4032d718e..9abe59e32 100644
--- a/src/dom/sgml/scanner.h
+++ b/src/dom/sgml/scanner.h
@@ -27,8 +27,9 @@ enum sgml_token_type {
 
 	SGML_TOKEN_CDATA_SECTION,	/* <![CDATA[ until ]]> */
 
-	SGML_TOKEN_PROCESS,		/* <?{ident} until ?> */
-	SGML_TOKEN_PROCESS_XML,		/* <?xml until */
+	SGML_TOKEN_PROCESS,		/* <?{ident} */
+	SGML_TOKEN_PROCESS_XML,		/* <?xml */
+	SGML_TOKEN_PROCESS_DATA,	/* data after <?{ident} until ?> */
 
 	SGML_TOKEN_ELEMENT,		/* <{ident}> */
 	SGML_TOKEN_ELEMENT_BEGIN,	/* <{ident} */
@@ -56,6 +57,17 @@ enum sgml_token_type {
 	SGML_TOKEN_NONE = 0,
 };
 
+/* The SGML tokenizer maintains a state (in the scanner->state member) that can
+ * be either text, element, or processing instruction state. The state only has
+ * meaning while doing the actual scanning and should not be used at parsing
+ * time. It can, however, be used to initialize the scanner to a specific
+ * state. */
+enum sgml_scanner_state {
+	SGML_STATE_TEXT,
+	SGML_STATE_ELEMENT,
+	SGML_STATE_PROC_INST,
+};
+
 extern struct dom_scanner_info sgml_scanner_info;
 
 /* Treat '<' as more valuable then '>' so that scanning of '<a<b>' using

From beb8337fc581267ba47c55954c4403a4b0e01e37 Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Thu, 29 Dec 2005 18:33:59 +0100
Subject: [PATCH 4/8] Add rule to make test run from src/dom dir

---
 src/dom/Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/dom/Makefile b/src/dom/Makefile
index 35a6b3929..31e30e138 100644
--- a/src/dom/Makefile
+++ b/src/dom/Makefile
@@ -6,4 +6,7 @@ OBJS = node.o select.o stack.o scanner.o
 
 SUBDIRS-$(CONFIG_DEBUG) += test
 
+test: all # Build everything first, then run the `test` target in the test/ subdir.
+	$(MAKE) test -C test
+
 include $(top_srcdir)/Makefile.lib

From 958a4a1b51b7d3a9c13590d5b86ccca92fee2f61 Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Thu, 29 Dec 2005 19:13:48 +0100
Subject: [PATCH 5/8] Add tests for more things like space handling and obscure
 formatting

---
 src/dom/test/test-sgml-parser-basic | 69 ++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic
index f1fbb653d..358871c6e 100755
--- a/src/dom/test/test-sgml-parser-basic
+++ b/src/dom/test/test-sgml-parser-basic
@@ -44,6 +44,17 @@ element: html
     element: p
       #text: Hello World!'
 
+test_output_equals \
+'Parse elements.' \
+'<root><child attr="value" /><child2></><child3 >a</></root>' \
+'
+element: root
+  element: child
+    attribute: attr -> value
+  element: child2
+  element: child3
+    #text: a'
+
 test_output_equals \
 'Parse an enclosed comment.' \
 '<root><!-- Hello World! --></root>' \
@@ -68,26 +79,45 @@ element: root
   attribute: name -> value with &foo; <stuff'
 
 test_output_equals \
-'Parse entity references.' \
-'<root>&amp;...&#42;...&...copy;...&;...&#;' \
+'Parse attributes with garbage.' \
+"<root a=b c='d' e'f' g= h i = j k =></root>" \
 '
 element: root
-  entity-reference: amp
-  #text: ...
-  entity-reference: #42
-  #text: ...
-  entity-reference: ...copy
-  #text: ...
-  #text: &;
-  #text: ...
-  entity-reference: #'
+  attribute: a -> b
+  attribute: c -> d
+  attribute: g -> h
+  attribute: i -> j
+  attribute: k -> ' 
+
+test_output_equals \
+'Parse entity references.' \
+'&amp;-&#42;' \
+'
+entity-reference: amp
+#text: -
+entity-reference: #42'
+
+# Just how these should be gracefully handled is not clear to me.
+test_output_equals \
+'Parse badly formatted entity references.' \
+'& m33p;-&.:-copy;-&;-&#;-&#xx;' \
+'
+#text: & m33p;
+#text: -
+entity-reference: .:-copy
+#text: -
+#text: &;
+#text: -
+entity-reference: #
+#text: -
+entity-reference: #xx'
 
 # Test <?>
 test_output_equals \
 'Parse processing instructions.' \
 '<?xml encoding="UTF8"?>
 ...
-<?ecmascript 
+<?ecmascript
 var val=2;
 ?>' \
 '
@@ -96,4 +126,19 @@ proc-instruction: xml -> encoding="UTF8"
 #text: \n...\n
 proc-instruction: ecmascript -> var val=2;\n'
 
+test_output_equals \
+'Skip spaces not inside text.' \
+'<
+root
+ns:attr                      
+=
+"value"
+><?	
+	target	
+ data?><	/	root	>' \
+'
+element: root
+  attribute: ns:attr -> value
+  proc-instruction: target -> data'
+
 test_done

From 57168e1fbcebed4cf7fe559d8a9c10bbf74ef432 Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Thu, 29 Dec 2005 20:38:43 +0100
Subject: [PATCH 6/8] Handle <element path=/to/%61-&\one";files/> as a
 self-closing tag

Previously, the '/' immediately before '>' would be interpreted as part of
the attribute value.  Hopefully this is a sensible way to slurp the markup soup.
---
 src/dom/sgml/scanner.c              | 4 ++++
 src/dom/test/test-sgml-parser-basic | 9 +++++++++
 2 files changed, 13 insertions(+)

diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c
index b51f79bc0..946c0bb6d 100644
--- a/src/dom/sgml/scanner.c
+++ b/src/dom/sgml/scanner.c
@@ -338,6 +338,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 		} else if (is_sgml_attribute(*string)) {
 			scan_sgml_attribute(scanner, string);
 			type = SGML_TOKEN_ATTRIBUTE;
+			if (string[-1] == '/' && string[0] == '>')
+				string--;
 		}
 
 	} else if (isquote(first_char)) {
@@ -365,6 +367,8 @@ scan_sgml_element_token(struct dom_scanner *scanner, struct dom_scanner_token *t
 		if (is_sgml_attribute(*string)) {
 			scan_sgml_attribute(scanner, string);
 			type = SGML_TOKEN_ATTRIBUTE;
+			if (string[-1] == '/' && string[0] == '>')
+				string--;
 		}
 	}
 
diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic
index 358871c6e..d91709f5a 100755
--- a/src/dom/test/test-sgml-parser-basic
+++ b/src/dom/test/test-sgml-parser-basic
@@ -89,6 +89,15 @@ element: root
   attribute: i -> j
   attribute: k -> ' 
 
+test_output_equals \
+'Parse attribute with non-quoted values.' \
+'<root color=#abc path=/to/%61-&\one";files/>...' \
+'
+element: root
+  attribute: color -> #abc
+  attribute: path -> /to/%61-&\one";files
+#text: ...'
+
 test_output_equals \
 'Parse entity references.' \
 '&amp;-&#42;' \

From bd877570d27668d63b7d2e715015a469510624b7 Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Thu, 29 Dec 2005 21:52:27 +0100
Subject: [PATCH 7/8] Test some more obscure proc. instructions and fix some
 assertion failures

---
 src/dom/sgml/scanner.c              | 21 ++++++++++++++++-----
 src/dom/test/test-sgml-parser-basic | 28 ++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 5 deletions(-)

diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c
index 946c0bb6d..31f77f404 100644
--- a/src/dom/sgml/scanner.c
+++ b/src/dom/sgml/scanner.c
@@ -385,13 +385,23 @@ static inline void
 scan_sgml_proc_inst_token(struct dom_scanner *scanner, struct dom_scanner_token *token)
 {
 	unsigned char *string = scanner->position;
+	size_t size;
 
-	token->string.string = string++;
+	token->string.string = string;
 
-	/* Figure out where the processing instruction ends */
-	while (skip_sgml(scanner, &string, '>', 0))
-		if (string[-2] == '?')
+	/* Figure out where the processing instruction ends. This doesn't use
+	 * skip_sgml() since we MUST ignore precedence here to allow '<' inside
+	 * the data part to be skipped correctly. */
+	for (size = scanner->end - string;
+	     size > 0 && (string = memchr(string, '>', size));
+	     string++) {
+		if (string[-1] == '?') {
+			string++;
 			break;
+		}
+	}
+
+	if (!string) string = scanner->end;
 
 	token->type = SGML_TOKEN_PROCESS_DATA;
 	token->string.length = string - token->string.string - 2;
@@ -417,7 +427,8 @@ scan_sgml_tokens(struct dom_scanner *scanner)
 	     current < table_end && scanner->position < scanner->end;
 	     current++) {
 		if (scanner->state == SGML_STATE_ELEMENT
-		    || *scanner->position == '<') {
+		    || (*scanner->position == '<'
+			&& scanner->state != SGML_STATE_PROC_INST)) {
 			scan_sgml(scanner, scanner->position, SGML_CHAR_WHITESPACE);
 			if (scanner->position >= scanner->end) break;
 
diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic
index d91709f5a..ae0739c7d 100755
--- a/src/dom/test/test-sgml-parser-basic
+++ b/src/dom/test/test-sgml-parser-basic
@@ -135,6 +135,34 @@ proc-instruction: xml -> encoding="UTF8"
 #text: \n...\n
 proc-instruction: ecmascript -> var val=2;\n'
 
+test_output_equals \
+'Parse exotic processing instructions.' \
+'<?xml ?+>+?>-?>-<?js?>-<??>-' \
+'
+proc-instruction: xml -> ?+>+
+#text: -?>-
+proc-instruction: js -> 
+#text: -
+proc-instruction:  -> 
+#text: -'
+
+test_output_equals \
+'Parse incorrect processing instructions.' \
+'<?js<?>-<?<??>-<?xml <=";&?>-<?' \
+'
+proc-instruction: js -> <
+#text: -
+proc-instruction:  -> <?
+#text: -
+proc-instruction: xml -> <=";&
+#text: -'
+
+test_output_equals \
+'Parse incorrect processing instructions (II).' \
+'<?><?' \
+'
+proc-instruction:  -> ><?'
+
 test_output_equals \
 'Skip spaces not inside text.' \
 '<

From 76a524ddf63f55b1ef2b137906464ce78e2ed072 Mon Sep 17 00:00:00 2001
From: Jonas Fonseca <fonseca@diku.dk>
Date: Thu, 29 Dec 2005 22:26:39 +0100
Subject: [PATCH 8/8] More <?xml and comment tests, fix an off-by-one error for
 comments skipping

---
 src/dom/sgml/scanner.c              |  2 +-
 src/dom/test/test-sgml-parser-basic | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/dom/sgml/scanner.c b/src/dom/sgml/scanner.c
index 31f77f404..88234be9d 100644
--- a/src/dom/sgml/scanner.c
+++ b/src/dom/sgml/scanner.c
@@ -178,7 +178,7 @@ skip_comment(struct dom_scanner *scanner, unsigned char **string)
 	unsigned char *pos = *string;
 	int length = 0;
 
-	for (; pos < scanner->end - 3; pos++)
+	for (; pos < scanner->end - 2; pos++)
 		if (pos[0] == '-' && pos[1] == '-' && pos[2] == '>') {
 			length = pos - *string;
 			pos += 3;
diff --git a/src/dom/test/test-sgml-parser-basic b/src/dom/test/test-sgml-parser-basic
index ae0739c7d..c9ba46775 100755
--- a/src/dom/test/test-sgml-parser-basic
+++ b/src/dom/test/test-sgml-parser-basic
@@ -62,6 +62,20 @@ test_output_equals \
 element: root
   #comment:  Hello World! '
 
+test_output_equals \
+'Parse comment combinations.' \
+'<root><!-- <!-- -- > --><!----></root>' \
+'
+element: root
+  #comment:  <!-- -- > 
+  #comment: '
+
+test_output_equals \
+'Parse comment combinations.' \
+'<!--foo-->' \
+'
+#comment: foo'
+
 test_output_equals \
 'Parse an enclosed CDATA section.' \
 '<root><![CDATA[...] ]>...]]></root>' \
@@ -135,6 +149,14 @@ proc-instruction: xml -> encoding="UTF8"
 #text: \n...\n
 proc-instruction: ecmascript -> var val=2;\n'
 
+test_output_equals \
+'Parse XML processing instructions.' \
+'<?xml version="1.0" />?><?xml />-' \
+'
+proc-instruction: xml -> version="1.0" />
+  attribute: version -> 1.0
+proc-instruction: xml -> /'
+
 test_output_equals \
 'Parse exotic processing instructions.' \
 '<?xml ?+>+?>-?>-<?js?>-<??>-' \