2005-09-15 09:58:31 -04:00
|
|
|
|
2005-12-28 08:05:14 -05:00
|
|
|
#ifndef EL_DOM_SGML_SCANNER_H
|
|
|
|
#define EL_DOM_SGML_SCANNER_H
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2005-12-28 08:05:14 -05:00
|
|
|
#include "dom/scanner.h"
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Token types produced by the SGML scanner.
 *
 * Single-character tokens are not listed here: they use their own char value
 * (1 to 255) as the token type, which is why the named tokens start at 256. */
enum sgml_token_type {
	/* Char tokens: */

	/* Char tokens range from 1 to 255 and have their char value as type */
	/* meaning non char tokens have values from 256 and up. */

	/* Low level string tokens: */

	SGML_TOKEN_IDENT = 256,		/* [0-9a-zA-Z_-:.]+ */
	SGML_TOKEN_TAG_END,		/* > or ?> */
	SGML_TOKEN_STRING,		/* Char sequence delimited by matching ' or " */

	/* High level string tokens: */

	SGML_TOKEN_NOTATION,		/* <!{ident} until > */
	SGML_TOKEN_NOTATION_COMMENT,	/* <!-- until --> */
	SGML_TOKEN_NOTATION_DOCTYPE,	/* <!DOCTYPE until > */
	SGML_TOKEN_NOTATION_ELEMENT,	/* <!ELEMENT until > */
	SGML_TOKEN_NOTATION_ENTITY,	/* <!ENTITY until > */
	SGML_TOKEN_NOTATION_ATTLIST,	/* <!ATTLIST until > */

	SGML_TOKEN_CDATA_SECTION,	/* <![CDATA[ until ]]> */

	SGML_TOKEN_PROCESS,		/* <?{ident} */
	SGML_TOKEN_PROCESS_XML,		/* <?xml */
	SGML_TOKEN_PROCESS_XML_STYLESHEET,/* <?xml-stylesheet */
	SGML_TOKEN_PROCESS_DATA,	/* data after <?{ident} until ?> */

	SGML_TOKEN_ELEMENT,		/* <{ident}> */
	SGML_TOKEN_ELEMENT_BEGIN,	/* <{ident} */
	SGML_TOKEN_ELEMENT_END,		/* </{ident}> or </> */
	SGML_TOKEN_ELEMENT_EMPTY_END,	/* /> */
	SGML_TOKEN_ATTRIBUTE,		/* [^>\t\r\n\f\v ]+ */

	SGML_TOKEN_ENTITY,		/* &ident; */

	SGML_TOKEN_TEXT,		/* [^<&]+ */
	SGML_TOKEN_SPACE,		/* [\t\r\n\f\v ]+ */

	/* Special tokens: */

	/* A special token for unrecognized strings */
	SGML_TOKEN_GARBAGE,

	/* Token type used internally when scanning to signal that the token
	 * should not be recorded in the scanner's token table. */
	SGML_TOKEN_SKIP,

	/* Another internal token type used both to mark unused tokens in the
	 * scanner table as invalid or when scanning to signal that the
	 * scanning should end. It is explicitly zero, so it does not continue
	 * the numbering of the tokens above. */
	SGML_TOKEN_NONE = 0,
};
|
|
|
|
|
2005-12-29 12:31:49 -05:00
|
|
|
/* The SGML tokenizer maintains a state (in the scanner->state member) that can
 * be either text, element, or processing instruction state. The state only
 * has meaning while the actual scanning is being done and should not be used
 * at parsing time. It can however be used to initialize the scanner to a
 * specific state. */
enum sgml_scanner_state {
	SGML_STATE_TEXT,	/* Text state */
	SGML_STATE_ELEMENT,	/* Element state */
	SGML_STATE_PROC_INST,	/* Processing instruction state */
};
|
|
|
|
|
2005-12-28 08:05:14 -05:00
|
|
|
/* Declaration only; the object is not defined in this header.
 * NOTE(review): presumably the scanner description used to set up a DOM
 * scanner for SGML input -- confirm against dom/scanner.h. */
extern struct dom_scanner_info sgml_scanner_info;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Treat '<' as more valuable than '>' so that scanning of '<a<b>' using
 * skipping to next '>' will stop at the second '<'.
 *
 * Yields (1 << 11) for '<', (1 << 10) for '>' and 0 for every other token
 * type. The argument may be evaluated twice, so it must not have side
 * effects. */
#define get_sgml_precedence(token_type) \
	((token_type) == '<' ? (1 << 11) : \
	 (token_type) == '>' ? (1 << 10) : 0)
|
|
|
|
|
|
|
|
/* Calls skip_dom_scanner_tokens() for the given token type, passing the SGML
 * precedence of that type as the third argument. The type argument is
 * expanded more than once, so it must not have side effects. */
#define skip_sgml_tokens(scanner, type) \
	skip_dom_scanner_tokens(scanner, type, get_sgml_precedence(type))
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
#endif
|