elinks/src/dom/scanner.h

#ifndef EL_DOM_SCANNER_H
#define EL_DOM_SCANNER_H

#include "dom/string.h"
#include "util/error.h"

/* API Doc :: dom-scanner */

/* Define if you want a talking scanner */
/* #define DEBUG_DOM_SCANNER */

/** DOM scanner token
 *
 * This struct describes one scanned token. There are two kinds of tokens: char
 * and non-char tokens. Char tokens contain only one char and simply have
 * their char value as type. They are tokens having special control meaning in
 * the code, like ':', ';', '{', '}' and '*'. Non-char tokens have one or more
 * chars and contain stuff like number or identifier strings.  */
struct dom_scanner_token {
	/** The type of the token. */
	int type;

	/** Some precedence value. */
	int precedence;

	/** The line number; used for error tokens. */
	unsigned int lineno;

	/** The start of the token string and the token length. */
	struct dom_string string;
};

/** Drop the leading character of a token
 *
 * Advances the start of the token string by one and shrinks the token length
 * by one. No check is made that the token is non-empty. @token is evaluated
 * twice, so it must not have side effects. */
#define skip_dom_scanner_token_char(token)	\
	do {					\
		(token)->string.string += 1;	\
		(token)->string.length -= 1;	\
	} while (0)

/** Case-insensitively compare the token string to a "static" string
 *
 * Evaluates to non-zero when the token has exactly the length of @str and
 * its characters match @str ignoring case. @str must be a string literal (or
 * char array) because its length is computed with sizeof. */
#define dom_scanner_token_contains(token, str)				\
	((sizeof(str) - 1) == (token)->string.length			\
	 && strncasecmp((token)->string.string, str, sizeof(str) - 1) == 0)


/** Scan table initializer entry
 *
 * One entry of the list referenced by the scan_table_info member of struct
 * dom_scanner_info. Entries can be written with the DOM_SCAN_TABLE_RANGE(),
 * DOM_SCAN_TABLE_STRING() and DOM_SCAN_TABLE_END initializer macros, which
 * rely on the member order used here. */
struct dom_scan_table_info {
	/** The kind of entry; decides how @data is to be interpreted. */
	enum { DOM_SCAN_RANGE, DOM_SCAN_STRING, DOM_SCAN_END } type;
	/** The entry payload. */
	struct dom_string data;
	/** NOTE(review): presumably the bits set in the scan table for the
	 * characters this entry covers -- confirm against the table
	 * initialization code. */
	int bits;
};

/** Number of entries in the scan_table member of struct dom_scanner_info. */
#define	DOM_SCAN_TABLE_SIZE	256

/** Brace initializer for one struct dom_scan_table_info entry
 *
 * @data1 and @data2 are handed to INIT_DOM_STRING() to initialize the data
 * member of the entry. */
#define DOM_SCAN_TABLE_INFO(type, data1, data2, bits) \
	{ (type), INIT_DOM_STRING((data1), (data2)), (bits) }

/** Initializer for a DOM_SCAN_RANGE entry described by @from and @to. */
#define DOM_SCAN_TABLE_RANGE(from, to, bits)	\
	DOM_SCAN_TABLE_INFO(DOM_SCAN_RANGE, from, to, bits)

/** Initializer for a DOM_SCAN_STRING entry
 *
 * @str must be a string literal (or char array) since its length is taken
 * with sizeof. */
#define DOM_SCAN_TABLE_STRING(str, bits)	\
	DOM_SCAN_TABLE_INFO(DOM_SCAN_STRING, str, sizeof(str) - 1, bits)

/** Initializer for a DOM_SCAN_END entry; it carries no data and no bits. */
#define DOM_SCAN_TABLE_END			\
	DOM_SCAN_TABLE_INFO(DOM_SCAN_END, NULL, 0, 0)

/** String to token type mapping
 *
 * One entry of the table referenced by the mappings member of struct
 * dom_scanner_info. */
struct dom_scanner_string_mapping {
	/** The string being mapped. */
	struct dom_string name;
	/** The type the string maps to. */
	int type;
	/** NOTE(review): presumably matched against the base_type argument of
	 * map_dom_scanner_string() -- confirm against its definition. */
	int base_type;
};

/** Initializer for one struct dom_scanner_string_mapping entry
 *
 * @str initializes the name via STATIC_DOM_STRING(), @type the type member
 * and @family the base_type member. */
#define DOM_STRING_MAP(str, type, family) \
	{ STATIC_DOM_STRING(str), (type), (family) }

/** Initializer for an entry with a NULL name and zeroed types.
 * NOTE(review): presumably terminates a mapping table -- confirm against the
 * lookup code. */
#define DOM_STRING_MAP_END \
	{ INIT_DOM_STRING(NULL, 0), 0, 0 }

/* Forward declaration; the scan callback of struct dom_scanner_info takes a
 * scanner, which is only defined further down. */
struct dom_scanner;

/** DOM scanner info
 *
 * Backend-specific information used during the actual scanning and
 * by the front end to fill the token table on-demand, etc.
 */
struct dom_scanner_info {
	/** Table containing how to map strings to token types */
	const struct dom_scanner_string_mapping *mappings;

	/** Information for how to initialize the scanner table */
	const struct dom_scan_table_info *scan_table_info;

	/**
	 * Fills the scanner with tokens. Already scanned tokens which have not
	 * been requested remain and are moved to the start of the scanner's
	 * token table. Returns the current token or NULL if there are none. */
	struct dom_scanner_token *(*scan)(struct dom_scanner *scanner);

	/**
	 * The scanner table. Contains bitmaps for the various character
	 * groups. Idea sync'ed from mozilla browser. */
	int scan_table[DOM_SCAN_TABLE_SIZE];

	/** Has the scanner info been initialized? */
	unsigned int initialized:1;
};


/** Initializes a DOM scanner
 *
 * See struct ref:[dom_scanner] for a description of the `int` flags.
 *
 * @scanner is the scanner to set up, @scanner_info the backend information
 * it should use and @string the text to be scanned.
 *
 * NOTE(review): the remaining parameters presumably seed the same-named
 * members of struct dom_scanner (with @complete being the inverse of the
 * `incomplete` member and @detect_error corresponding to `detect_errors`) --
 * confirm against the definition. */
void init_dom_scanner(struct dom_scanner *scanner, struct dom_scanner_info *scanner_info,
		      struct dom_string *string, int state, int count_lines, int complete,
		      int check_complete, int detect_error);

/** The number of tokens in the scanner's token table
 *
 * At best it should be big enough to contain properties with space separated
 * values and function calls with up to 3 variables like rgb(). At worst it
 * should be no less than 2 in order to be able to peek at the next token in
 * the scanner. */
#define DOM_SCANNER_TOKENS 10

/** DOM scanner
 *
 * Holds the current state of the scanner. */
struct dom_scanner {
	/** The start of the scanned string. */
	unsigned char *string;
	/** The end of the scanned string. */
	unsigned char *end;
	/**
	 * The position in the string where to scan next. If position is NULL
	 * it means that no more tokens can be retrieved from the string. */
	unsigned char *position;

	/**
	 * The current token. If the number of scanned tokens is less than
	 * ref:[DOM_SCANNER_TOKENS] it is because there are no more tokens in
	 * the string. */
	struct dom_scanner_token *current;
	/** The number of scanned tokens left in the table. May be <= 0, which
	 * dom_scanner_has_tokens() treats as "no tokens". */
	int tokens;

	/** The 'meta' scanner information */
	struct dom_scanner_info *info;

	/* NOTE(review): this tests DEBUG_SCANNER while the debug switch
	 * documented at the top of this header is DEBUG_DOM_SCANNER; confirm
	 * which one is intended before relying on these members. */
#ifdef DEBUG_SCANNER
	/* Debug info about the caller. */
	unsigned char *file;
	int line;
#endif

	/* The following two flags are used when parsing is incremental and
	 * the scanner must ensure that only tokens that are complete are
	 * generated. */
	unsigned int check_complete:1;	/*: Only generate complete tokens */
	unsigned int incomplete:1;	/*: The scanned string is incomplete */

	unsigned int detect_errors:1;	/*: Check for markup errors */
	/* NOTE(review): unlike the neighbouring flags this is a full unsigned
	 * int, not a 1-bit field; confirm whether that is intentional. */
	unsigned int found_error;	/*: Did we already report this error? */

	unsigned int count_lines:1;	/*: Is line counting enabled? */
	unsigned int lineno;		/*: Line # of the last scanned token */

	/** Some state indicator only meaningful to the scanner internals */
	int state;

	/**
	 * The table contains already scanned tokens. It is maintained in
	 * order to optimize the scanning a bit and make it possible to look
	 * ahead at the next token. You should always use the accessors
	 * (defined below) for getting tokens from the scanner. */
	struct dom_scanner_token table[DOM_SCANNER_TOKENS];
};

/** Check if there are more tokens
 *
 * Evaluates to non-zero when the token count is positive and the current
 * token lies before the end of the valid part of the token table. @scanner
 * is evaluated several times, so it must not have side effects. */
#define dom_scanner_has_tokens(scanner)					\
	((scanner)->tokens > 0						\
	 && (scanner)->current < &(scanner)->table[(scanner)->tokens])

/* This macro checks if the current scanner state is valid. Meaning if the
 * scanner's table is full the last token skipping or
 * get_next_dom_scanner_token() call made it possible to get the type of the
 * next token.
 *
 * Evaluates to non-zero when the table holds fewer than DOM_SCANNER_TOKENS
 * tokens, or when at least one token follows the current one in the valid
 * part of the table. @scanner is evaluated several times; it is fully
 * parenthesized so any pointer expression (e.g. `&scanner` or `base + i`)
 * can be passed. */
#define check_dom_scanner(scanner) \
	((scanner)->tokens < DOM_SCANNER_TOKENS \
	 || (scanner)->current + 1 < (scanner)->table + (scanner)->tokens)


/* Scanner table accessors and mutators */

/** Check the type of the next token
 *
 * Evaluates to non-zero only if the scanner has tokens, a token follows the
 * current one within the valid part of the token table, and the type of that
 * following token equals @token_type. The scanner is not modified and no
 * rescan is triggered. @scanner is evaluated several times, so it must not
 * have side effects. */
#define check_next_dom_scanner_token(scanner, token_type)			\
	(dom_scanner_has_tokens(scanner)					\
	 && ((scanner)->current + 1 < (scanner)->table + (scanner)->tokens)	\
	 && (scanner)->current[1].type == (token_type))

/** Get the current token
 *
 * Returns the scanner's current token, or NULL when
 * dom_scanner_has_tokens() reports that none are available. */
static inline struct dom_scanner_token *
get_dom_scanner_token(struct dom_scanner *scanner)
{
	if (!dom_scanner_has_tokens(scanner))
		return NULL;

	return scanner->current;
}

/** Get the next token
 *
 * Advances to the following token and returns it, or NULL if there is none.
 *
 * Getting the next token might cause a rescan, so any token pointer that has
 * been stored in a local variable might not be valid after the call. */
static inline struct dom_scanner_token *
get_next_dom_scanner_token(struct dom_scanner *scanner)
{
	if (dom_scanner_has_tokens(scanner)) {
		scanner->current++;

		/* Rescan when no token follows the new current one in the
		 * table; the result of the scan callback is returned as is. */
		if (scanner->current + 1 >= scanner->table + scanner->tokens)
			return scanner->info->scan(scanner);
	}

	return get_dom_scanner_token(scanner);
}

/** Skip the current token
 *
 * Plain alias for get_next_dom_scanner_token(): it advances the scanner, may
 * trigger a rescan, and evaluates to the new current token or NULL. */
#define skip_dom_scanner_token(scanner) get_next_dom_scanner_token(scanner)

/** Conditionally skip tokens
 *
 * Removes tokens from the scanner until it meets a token of the given type
 * (@skipto). This token will then also be skipped.
 *
 * NOTE(review): the role of @precedence and the exact return value are not
 * visible from this header -- presumably skipping also stops at tokens whose
 * precedence exceeds @precedence and the new current token (or NULL) is
 * returned; confirm against the definition. */
struct dom_scanner_token *
skip_dom_scanner_tokens(struct dom_scanner *scanner, int skipto, int precedence);

/** Map a string to internal ID
 *
 * Looks up the string from @ident to @end in the scanner's string mapping
 * table.
 *
 * NOTE(review): presumably only mappings matching @base_type are considered
 * and the mapped type is returned; the value returned when nothing matches
 * is not visible from this header -- confirm against the definition. */
int
map_dom_scanner_string(struct dom_scanner *scanner,
		       unsigned char *ident, unsigned char *end, int base_type);

#ifdef DEBUG_DOM_SCANNER
/* Debugging aid; only declared when DEBUG_DOM_SCANNER is defined.
 * NOTE(review): presumably prints the state of @scanner -- confirm against
 * the definition. */
void dump_dom_scanner(struct dom_scanner *scanner);
#endif

/* The begin_dom_token_scanning() and end_dom_token_scanning() functions
 * provide the basic setup and teardown for the rescan function made public
 * via the dom_scanner_info->scan member. */

/* Prepares the token table for a rescan: tokens that have not been consumed
 * yet (from scanner->current up to the end of the valid part of the table)
 * are moved to the front and every remaining slot is zeroed.
 *
 * Returns the start of the token table when scanner->position is set;
 * scanner->tokens then holds the number of retained tokens and
 * scanner->current is left untouched.
 *
 * Returns NULL if it is not necessary to try to scan for more tokens, i.e.
 * when scanner->position is NULL. In that case the scanner is rewound to the
 * start of the table and scanner->tokens is the number of retained tokens,
 * or -1 when none were retained. */
static inline struct dom_scanner_token *
begin_dom_token_scanning(struct dom_scanner *scanner)
{
	struct dom_scanner_token *first = scanner->table;
	struct dom_scanner_token *unused = first;
	/* Number of scanned but not yet consumed tokens; never negative. */
	int pending = int_max((first + scanner->tokens) - scanner->current, 0);
	size_t kept_bytes = 0;

	assert(scanner->current);

	/* Move any untouched tokens */
	if (pending) {
		kept_bytes = pending * sizeof(*first);
		memmove(first, scanner->current, kept_bytes);
		unused = first + pending;
	}

	/* Clear all unused tokens */
	memset(unused, 0, DOM_SCANNER_TOKENS * sizeof(*first) - kept_bytes);

	if (scanner->position) {
		scanner->tokens = pending;
		return first;
	}

	/* Nothing more can be scanned; expose only what was retained. */
	scanner->tokens = pending ? pending : -1;
	scanner->current = first;
	assert(check_dom_scanner(scanner));

	return NULL;
}

/* Updates the @scanner struct after scanning has been done. The position
 * _after_ the last valid token is taken as the @end argument. */
/* It is ok for @end to be < scanner->table since scanner->tokens will become
 * <= 0 anyway. */
static inline struct dom_scanner_token *
end_dom_token_scanning(struct dom_scanner *scanner, struct dom_scanner_token *end)
{
	assert(end <= scanner->table + DOM_SCANNER_TOKENS);

	scanner->tokens = (end - scanner->table);
	scanner->current = scanner->table;
	if (scanner->position >= scanner->end)
		scanner->position = NULL;

	assert(check_dom_scanner(scanner));

	return get_dom_scanner_token(scanner);
}

#endif