elinks/src/dom/sgml/scanner.h


#ifndef EL_DOM_SGML_SCANNER_H
#define EL_DOM_SGML_SCANNER_H

#include "dom/scanner.h"

/* Token types emitted by the SGML scanner.
 *
 * Single character tokens use their own char value (1 to 255) as type, so
 * they have no enumerators here and every named token type other than
 * SGML_TOKEN_NONE is numbered from 256 and up. */
enum sgml_token_type {
	/* Internal token type used both to mark unused tokens in the scanner
	 * table as invalid and, when scanning, to signal that the scanning
	 * should end. It is the only named type with a value below 256. */
	SGML_TOKEN_NONE = 0,

	/* Low level string tokens: */

	SGML_TOKEN_IDENT = 256,		/* [0-9a-zA-Z_-:.]+ */
	SGML_TOKEN_TAG_END,		/* > or ?> */
	SGML_TOKEN_STRING,		/* Char sequence delimited by matching ' or " */

	/* High level string tokens: */

	/* Notations: */
	SGML_TOKEN_NOTATION,		/* <!{ident} until > */
	SGML_TOKEN_NOTATION_COMMENT,	/* <!-- until --> */
	SGML_TOKEN_NOTATION_DOCTYPE,	/* <!DOCTYPE until > */
	SGML_TOKEN_NOTATION_ELEMENT,	/* <!ELEMENT until > */
	SGML_TOKEN_NOTATION_ENTITY,	/* <!ENTITY  until > */
	SGML_TOKEN_NOTATION_ATTLIST,	/* <!ATTLIST until > */

	/* CDATA sections: */
	SGML_TOKEN_CDATA_SECTION,	/* <![CDATA[ until ]]> */

	/* Processing instructions: */
	SGML_TOKEN_PROCESS,		/* <?{ident} */
	SGML_TOKEN_PROCESS_XML,		/* <?xml */
	SGML_TOKEN_PROCESS_XML_STYLESHEET,/* <?xml-stylesheet */
	SGML_TOKEN_PROCESS_DATA,	/* data after <?{ident} until ?> */

	/* Elements and attributes: */
	SGML_TOKEN_ELEMENT,		/* <{ident}> */
	SGML_TOKEN_ELEMENT_BEGIN,	/* <{ident} */
	SGML_TOKEN_ELEMENT_END,		/* </{ident}> or </> */
	SGML_TOKEN_ELEMENT_EMPTY_END,	/* /> */
	SGML_TOKEN_ATTRIBUTE,		/* [^>\t\r\n\f\v ]+ */

	/* Entity references: */
	SGML_TOKEN_ENTITY,		/* &ident; */

	/* Text and whitespace: */
	SGML_TOKEN_TEXT,		/* [^<&]+ */
	SGML_TOKEN_SPACE,		/* [\t\r\n\f\v ]+ */

	/* Special tokens: */

	/* Catch-all type for strings that were not recognized. */
	SGML_TOKEN_GARBAGE,

	/* Internal type used while scanning to signal that the token should
	 * not be recorded in the scanner's token table. */
	SGML_TOKEN_SKIP,
};

/* State of the SGML tokenizer, kept in the scanner->state member.
 *
 * The state is only meaningful while the actual scanning is going on and
 * should not be used at parsing time. It can, however, be used to initialize
 * the scanner to a specific state. */
enum sgml_scanner_state {
	SGML_STATE_TEXT = 0,		/* Text state */
	SGML_STATE_ELEMENT = 1,		/* Element state */
	SGML_STATE_PROC_INST = 2,	/* Processing instruction state */
};

/* Scanner info object for the SGML scanner. It is only declared here; the
 * definition lives in another translation unit. The struct type is
 * presumably provided by "dom/scanner.h" (TODO confirm). */
extern struct dom_scanner_info sgml_scanner_info;

/* Precedence of a token type for use when skipping tokens.
 *
 * '<' is treated as more valuable than '>', so that scanning of '<a<b>' by
 * skipping to the next '>' will stop at the second '<'. All other token
 * types get precedence 0.
 *
 * This is a macro and token_type may be evaluated more than once, so do not
 * pass an expression with side effects. */
#define get_sgml_precedence(token_type)				\
	(((token_type) == '<')					\
	 ? (1 << 11)						\
	 : (((token_type) == '>') ? (1 << 10) : 0))

/* SGML wrapper around skip_dom_scanner_tokens(): passes the scanner and the
 * token type through unchanged and supplies get_sgml_precedence() of that
 * type as the third argument.
 *
 * The token type appears more than once in the expansion, so do not pass an
 * expression with side effects. */
#define skip_sgml_tokens(scanner, token_type)			\
	skip_dom_scanner_tokens(scanner, token_type,		\
				get_sgml_precedence(token_type))

#endif
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00
Elute all DOM-related code and put it in src/dom 2005-12-28 08:05:14 -05:00			`#ifndef EL_DOM_SGML_SCANNER_H`
			`#define EL_DOM_SGML_SCANNER_H`
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00
Elute all DOM-related code and put it in src/dom 2005-12-28 08:05:14 -05:00			`#include "dom/scanner.h"`
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00
			`enum sgml_token_type {`
			`/* Char tokens: */`

			`/* Char tokens range from 1 to 255 and have their char value as type */`
			`/* meaning non char tokens have values from 256 and up. */`

			`/* Low level string tokens: */`

			`SGML_TOKEN_IDENT = 256, /* [0-9a-zA-Z_-:.]+ */`
			`SGML_TOKEN_TAG_END, /* > or ?> */`
			`SGML_TOKEN_STRING, /* Char sequence delimted by matching ' or " */`

			`/* High level string tokens: */`

			`SGML_TOKEN_NOTATION, /* <!{ident} until > */`
			`SGML_TOKEN_NOTATION_COMMENT, /* <!-- until --> */`
			`SGML_TOKEN_NOTATION_DOCTYPE, /* <!DOCTYPE until > */`
			`SGML_TOKEN_NOTATION_ELEMENT, /* <!ELEMENT until > */`
			`SGML_TOKEN_NOTATION_ENTITY, /* <!ENTITY until > */`
			`SGML_TOKEN_NOTATION_ATTLIST, /* <!ATTLIST until > */`

Parse <[CDATA[ sections ]]> 2005-12-26 13:43:32 -05:00			`SGML_TOKEN_CDATA_SECTION, /* <![CDATA[ until ]]> */`

Fix SGML parsing of processing instructions (<?xml ...?>) It involves adding a new scanner state which is used only to generate a new processing instruction (PI) data token. This removes some scanner specific code from the parser and makes handling of PIs more generic. The data of XML PIs are still parsed as attributes and added to the PI node. The 6th test now succeeds. Hurrah! 2005-12-29 12:31:49 -05:00			`SGML_TOKEN_PROCESS, /* <?{ident} */`
			`SGML_TOKEN_PROCESS_XML, /* <?xml */`
Just for fun also parse <?xml-stylesheet attributes 2005-12-30 21:13:39 -05:00			`SGML_TOKEN_PROCESS_XML_STYLESHEET,/* <?xml-stylesheet */`
Fix SGML parsing of processing instructions (<?xml ...?>) It involves adding a new scanner state which is used only to generate a new processing instruction (PI) data token. This removes some scanner specific code from the parser and makes handling of PIs more generic. The data of XML PIs are still parsed as attributes and added to the PI node. The 6th test now succeeds. Hurrah! 2005-12-29 12:31:49 -05:00			`SGML_TOKEN_PROCESS_DATA, /* data after <?{ident} until ?> */`
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00
			`SGML_TOKEN_ELEMENT, /* <{ident}> */`
			`SGML_TOKEN_ELEMENT_BEGIN, /* <{ident} */`
			`SGML_TOKEN_ELEMENT_END, /* </{ident}> or </> */`
			`SGML_TOKEN_ELEMENT_EMPTY_END, /* /> */`
			`SGML_TOKEN_ATTRIBUTE, /* [^>\t\r\n\f\v ]+ */`

			`SGML_TOKEN_ENTITY, /* &ident; */`

			`SGML_TOKEN_TEXT, /* [^<&]+ */`
			`SGML_TOKEN_SPACE, /* [\t\r\n\f\v ]+ */`

			`/* Special tokens: */`

			`/* A special token for unrecognized strings */`
			`SGML_TOKEN_GARBAGE,`

			`/* Token type used internally when scanning to signal that the token`
			`* should not be recorded in the scanners token table. */`
			`SGML_TOKEN_SKIP,`

			`/* Another internal token type used both to mark unused tokens in the`
			`* scanner table as invalid or when scanning to signal that the`
			`* scanning should end. */`
			`SGML_TOKEN_NONE = 0,`
			`};`

Fix SGML parsing of processing instructions (<?xml ...?>) It involves adding a new scanner state which is used only to generate a new processing instruction (PI) data token. This removes some scanner specific code from the parser and makes handling of PIs more generic. The data of XML PIs are still parsed as attributes and added to the PI node. The 6th test now succeeds. Hurrah! 2005-12-29 12:31:49 -05:00			`/* The SGML tokenizer maintains a state (in the scanner->state member) that can`
			`* be either text, element, or processing instruction state. The state has only`
			`* meaning while doing the actual scanning and should not be used at the`
			`* parsing time. It can however be used to initialize the scanner to a specific`
			`* state. */`
			`enum sgml_scanner_state {`
			`SGML_STATE_TEXT,`
			`SGML_STATE_ELEMENT,`
			`SGML_STATE_PROC_INST,`
			`};`

Elute all DOM-related code and put it in src/dom 2005-12-28 08:05:14 -05:00			`extern struct dom_scanner_info sgml_scanner_info;`
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00
			`/* Treat '<' as more valuable then '>' so that scanning of '<a<b>' using`
			`* skipping to next '>' will stop at the second '<'. */`
			`#define get_sgml_precedence(token_type) \`
			`((token_type) == '<' ? (1 << 11) : \`
			`(token_type) == '>' ? (1 << 10) : 0)`

			`#define skip_sgml_tokens(scanner, type) \`
Elute all DOM-related code and put it in src/dom 2005-12-28 08:05:14 -05:00			`skip_dom_scanner_tokens(scanner, type, get_sgml_precedence(type))`
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00
			`#endif`