elinks/src/intl/charsets.h

#ifndef EL__INTL_CHARSETS_H
#define EL__INTL_CHARSETS_H

/* Integer type holding one Unicode character value.  Besides real
 * characters it also has room for out-of-range marker values such as
 * UCS_NO_CHAR.  */
typedef uint32_t unicode_val_T;

/* U+FFFD REPLACEMENT CHARACTER.  Used when no Unicode mapping is
 * known for a byte in a codepage, or when invalid UTF-8 is received
 * from a terminal.  After generating the character, ELinks then
 * treats it like any other Unicode character.  The user can also type
 * this character directly, and it can occur in documents.
 *
 * Because it is a genuine Unicode character, it cannot serve as an
 * out-of-band "no character" marker; UCS_NO_CHAR exists for that.  */
#define UCS_REPLACEMENT_CHARACTER ((unicode_val_T) 0xFFFD)

/* A special value that fits in unicode_val_T but is outside the range
 * of Unicode characters.  utf8_to_unicode and cp_to_unicode return
 * this if the input is too short.  This is also used as a placeholder
 * for the second cell of a double-cell character.
 *
 * The value must be one that utf8_to_unicode can never produce from
 * real input; otherwise a caller could not tell a decoded character
 * from this marker.  Beware of storing it in a signed type: as a
 * 32-bit signed integer it typically reads back as -3.  */
#define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD)

/* Byte used as the replacement for a no-break space (&nbsp;).
 * See u2cp().  u2cp_no_nbsp() calls u2cp_() with no_nbsp_hack set,
 * which presumably turns this substitution off -- confirm in
 * charsets.c.  NBSP_CHAR_STRING is the same byte as a string
 * literal.  */
#define NBSP_CHAR ((unsigned char) 1)
#define NBSP_CHAR_STRING "\001"

/* Translation table node.  get_translation_table() returns a pointer
 * to this type and convert_string() takes one as @convert_table.
 * Each node carries either a string or a pointer to a further
 * struct conv_table.  */
struct conv_table {
	/* NOTE(review): presumably tells which member of @u is in
	 * use -- confirm against charsets.c.  */
	int t;
	union {
		/* NOTE(review): presumably the converted bytes. */
		unsigned char *str;
		/* NOTE(review): presumably a nested table used for
		 * multi-byte input -- confirm.  */
		struct conv_table *tbl;
	} u;
};

/* The @mode argument of convert_string().  It selects which
 * characters get special treatment during conversion, in particular
 * when entities are decoded.  */
enum convert_string_mode {
	CSM_DEFAULT, /* Convert any char. */
	CSM_QUERY, /* Special handling of '&' and '=' chars. */
	CSM_FORM, /* Special handling of '&' and '=' chars in forms. */
	CSM_NONE, /* Convert nothing. */
};

/* Returns a translation table suitable for passing to
 * convert_string().
 * NOTE(review): the two ints are presumably the source and target
 * codepage indexes (cf. get_cp_index()) -- confirm their order.  */
struct conv_table *get_translation_table(int, int);
/* NOTE(review): presumably looks up the SGML/HTML entity named by the
 * @strlen bytes at @str and returns its replacement for codepage
 * @encoding; what is returned for an unknown entity is not visible
 * here.  The parameter called `strlen' is only a prototype-scope
 * name and does not affect the standard strlen() function.  */
unsigned char *get_entity_string(const unsigned char *str, const int strlen, int encoding);

/* The convert_string() name is also used by Samba (version 3.0.3), which
 * provides libnss_wins.so.2, which is called somewhere inside
 * _nss_wins_gethostbyname_r(). This name clash causes the elinks hostname
 * lookup thread to crash, so the symbol is renamed: in every file that
 * includes this header, convert_string compiles to convert_string_elinks. */
/* Reported by Derek Poon and filed as bug 453 */
#undef convert_string
#define convert_string convert_string_elinks

/* Converts a string from one charset to another according to the
 * passed @convert_table, optionally also decoding SGML (HTML) entities
 * along the way (according to @mode).
 *
 * If @callback is NULL, the function returns a dynamically allocated
 * converted string and reports its length through @length.  It is OK
 * not to care about the length and pass NULL as @length.
 *
 * If @callback is non-NULL, the function instead calls it every few
 * bytes with a chunk of output (@callback_data is presumably handed
 * through as the callback's @data argument) and always returns NULL;
 * the value reported through @length is then undefined. */
unsigned char *convert_string(struct conv_table *convert_table,
			      unsigned char *chars, int charslen, int cp,
			      enum convert_string_mode mode, int *length,
			      void (*callback)(void *data, unsigned char *buf, int buflen),
			      void *callback_data);

/* Codepage lookup helpers.
 * NOTE(review): the descriptions below are inferred from names and
 * signatures only -- confirm against charsets.c.  */
/* Presumably maps a charset name to its codepage index.  */
int get_cp_index(unsigned char *);
/* Presumably returns the name of the codepage with the given index.  */
unsigned char *get_cp_name(int);
/* Presumably returns the MIME name of the codepage with the given index.  */
unsigned char *get_cp_mime_name(int);
/* Presumably nonzero if the given codepage is UTF-8.  */
int is_cp_utf8(int);
/* Presumably releases the translation tables.  */
void free_conv_table(void);
#ifdef CONFIG_UTF8
/* UTF-8 helpers; declared only when ELinks is built with CONFIG_UTF8.
 *
 * None of these prototypes is marked `inline'.  ISO C99 (6.7.4)
 * requires that a function with external linkage which is declared
 * inline be defined in the same translation unit, and files that
 * merely include this header have no definition to offer; compilers
 * warn "inline function declared but never defined".  Worse, under
 * C99 inline semantics a file in which every declaration of a function
 * says `inline' provides only an inline definition and no external
 * one, so callers in other files can fail to link.  A plain prototype
 * here avoids both problems.
 *
 * NOTE(review): the one-line descriptions below are inferred from the
 * names, signatures and change history -- confirm against
 * charsets.c.  */

/* Presumably returns the UTF-8 encoding of the given character.  */
unsigned char *encode_utf8(unicode_val_T);
/* For moving left (towards the start) through a UTF-8 string.  */
unsigned char *utf8_prevchar(unsigned char *, int, unsigned char *);
/* Number of bytes in a UTF-8 character, judged from its first byte.  */
int utf8charlen(const unsigned char *);
/* Presumably measure UTF-8 text in terminal cells or characters, and
 * convert a cell count back to a byte count.  */
int utf8_char2cells(unsigned char *, unsigned char *);
int utf8_ptr2cells(unsigned char *, unsigned char *);
int utf8_ptr2chars(unsigned char *, unsigned char *);
int utf8_cells2bytes(unsigned char *, int, unsigned char *);
/* How utf8_step_forward and utf8_step_backward count steps.  */
enum utf8_step {
	/* Each step is one character, even if it is a combining or
	 * double-width character.  */
	utf8_step_characters,

	/* Each step is one cell.  If the specified number of steps
	 * would end in the middle of a double-width character, do not
	 * include the character.  */
	utf8_step_cells_fewer,

	/* Each step is one cell.  If the specified number of steps
	 * would end in the middle of a double-width character,
	 * include the whole character.  */
	utf8_step_cells_more
};
/* Step forward or backward in a UTF-8 string, counting steps as
 * selected by the enum utf8_step argument.  */
unsigned char *utf8_step_forward(unsigned char *, unsigned char *,
				 int, enum utf8_step, int *);
unsigned char *utf8_step_backward(unsigned char *, unsigned char *,
				  int, enum utf8_step, int *);
/* Used to detect double-width glyphs; presumably returns the number
 * of terminal cells the character occupies.  */
int unicode_to_cell(unicode_val_T);
unicode_val_T unicode_fold_label_case(unicode_val_T);
int strlen_utf8(unsigned char **);
/* Both of these return UCS_NO_CHAR if the input is too short.  */
unicode_val_T utf8_to_unicode(unsigned char **, unsigned char *);
unicode_val_T cp_to_unicode(int, unsigned char **, unsigned char *);
#endif /* CONFIG_UTF8 */

/* Declared outside the CONFIG_UTF8 section, so available in every
 * build.
 * NOTE(review): presumably cp2u() maps one byte of the given codepage
 * to its Unicode value, and cp2utf8() returns the UTF-8 encoding of a
 * codepage byte -- confirm the argument order in charsets.c.  */
unicode_val_T cp2u(int, unsigned char);
unsigned char *cp2utf8(int, int);

/* NOTE(review): presumably returns the representation of a Unicode
 * value in the codepage given by the second argument -- confirm in
 * charsets.c.  The third argument, no_nbsp_hack, is fixed by the two
 * wrapper macros: u2cp() passes 0 and u2cp_no_nbsp() passes 1.  See
 * also NBSP_CHAR.  */
unsigned char *u2cp_(unicode_val_T, int, int no_nbsp_hack);
#define u2cp(u, to) u2cp_(u, to, 0)
#define u2cp_no_nbsp(u, to) u2cp_(u, to, 1)

/* NOTE(review): presumably set up and tear down a lookup structure
 * used for finding charsets -- confirm in charsets.c.  Neither takes
 * arguments nor returns a status.  */
void init_charsets_lookup(void);
void free_charsets_lookup(void);

/* UTF-16 encodes each Unicode character U+0000...U+FFFF as a single
 * 16-bit code unit, and each character U+10000...U+10FFFF as a pair
 * of two code units: a high surrogate followed by a low surrogate.
 * The range U+D800...U+DFFF is reserved for these surrogates.
 *
 * is_utf16_surrogate(u)       - true for 0xD800...0xDFFF.
 * is_utf16_high_surrogate(u)  - true for 0xD800...0xDBFF.
 * is_utf16_low_surrogate(u)   - true for 0xDC00...0xDFFF.
 * join_utf16_surrogates(h,l)  - the character encoded by the pair;
 *                               the arguments are not range-checked.
 * needs_utf16_surrogates(u)   - true for 0x10000...0x10FFFF.  The
 *                               subtraction wraps around for smaller
 *                               values once cast to uint32_t, so the
 *                               single comparison checks both bounds.
 * get_utf16_high_surrogate(u),
 * get_utf16_low_surrogate(u)  - the two code units for a character
 *                               in 0x10000...0x10FFFF; the argument
 *                               is not range-checked.
 *
 * Every macro evaluates each of its arguments exactly once.  */
#define is_utf16_surrogate(u)           (((u) & 0xFFFFF800) == 0xD800)
#define is_utf16_high_surrogate(u)      (((u) & 0xFFFFFC00) == 0xD800)
#define is_utf16_low_surrogate(u)       (((u) & 0xFFFFFC00) == 0xDC00)
#define join_utf16_surrogates(high,low) (0x10000 + (((high) - 0xD800L) << 10) + ((low) - 0xDC00))
#define needs_utf16_surrogates(u)       ((uint32_t) ((u) - 0x10000) < 0x100000)
#define get_utf16_high_surrogate(u)     (0xD800 + (((u) - 0x10000) >> 10))
#define get_utf16_low_surrogate(u)      (0xDC00 + ((u) & 0x3FF))

#endif
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00			`#ifndef EL__INTL_CHARSETS_H`
			`#define EL__INTL_CHARSETS_H`

			`typedef uint32_t unicode_val_T;`

UTF-8: Split UCS_REPLACEMENT_CHARACTER off UCS_NO_CHAR. In the previous version, invalid UTF-8 from a terminal caused UCS_NO_CHAR (0xFFFFFFFD) to be stored in a term_event_key_T, resulting in -3 which was then incidentally treated as an unassigned special key. Now, invalid UTF-8 is instead mapped to UCS_REPLACEMENT_CHARACTER and treated as a character. The fact that handle_interlink_event calls term_send_ucs when it receives invalid UTF-8 makes it pretty clear that this is how it was intended. src/viewer/text/link.c (not changed in this commit) already referred to UCS_REPLACEMENT_CHARACTER in a comment even though it was not previously defined. 2006-08-19 06:29:37 -04:00			`/* U+FFFD REPLACEMENT CHARACTER. Used when no Unicode mapping is`
			`* known for a byte in a codepage, or when invalid UTF-8 is received`
			`* from a terminal. After generating the character, ELinks then`
			`* treats it like any other Unicode character. The user can also type`
			`* this character directly, and it can occur in documents. */`
			`#define UCS_REPLACEMENT_CHARACTER ((unicode_val_T) 0xFFFD)`

			`/* A special value that fits in unicode_val_T but is outside the range`
Change "utf_8" to "utf8" in most identifiers. Suggested by Miciah on #elinks. What was renamed: add_utf_8 => add_utf8 cp2utf_8 => cp2utf8 encode_utf_8 => encode_utf8 get_translation_table_to_utf_8 => get_translation_table_to_utf8 goto invalid_utf_8_start_byte => goto invalid_utf8_start_byte goto utf_8 => goto utf8 goto utf_8_select => goto utf8_select terminal_interlink.utf_8 => terminal_interlink.utf8 utf_8_to_unicode => utf8_to_unicode What was not renamed: terminal._template_.utf_8_io option, TERM_OPT_UTF_8_IO Compatibility with existing elinks.conf files would require an alias. --enable-utf-8 Because the name of the charset is UTF-8, --enable-utf-8 looks better than --enable-utf8. CONFIG_UTF_8 Will be renamed in a later commit. Unicode/utf_8.cp, table_utf_8, aliases_utf_8 Will be renamed in a later commit. 2006-09-17 09:06:22 -04:00			`* of Unicode characters. utf8_to_unicode and cp_to_unicode return`
UTF-8: Split UCS_REPLACEMENT_CHARACTER off UCS_NO_CHAR. In the previous version, invalid UTF-8 from a terminal caused UCS_NO_CHAR (0xFFFFFFFD) to be stored in a term_event_key_T, resulting in -3 which was then incidentally treated as an unassigned special key. Now, invalid UTF-8 is instead mapped to UCS_REPLACEMENT_CHARACTER and treated as a character. The fact that handle_interlink_event calls term_send_ucs when it receives invalid UTF-8 makes it pretty clear that this is how it was intended. src/viewer/text/link.c (not changed in this commit) already referred to UCS_REPLACEMENT_CHARACTER in a comment even though it was not previously defined. 2006-08-19 06:29:37 -04:00			`* this if the input is too short. This is also used as a placeholder`
			`* for the second cell of a double-cell character. */`
The value of UCS_NO_CHAR was bad. There must not be a possibility to encode it using utf_8_to_unicode. If every unicode_val_T value could be a result of that function then one must add out param to the utf_8_to_unicode signaling 'true' UCS_NO_CHAR. 2006-07-31 15:23:47 -04:00			`#define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD)`
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00
			`/* &nbsp; replacement character. See u2cp(). */`
			`#define NBSP_CHAR ((unsigned char) 1)`
			`#define NBSP_CHAR_STRING "\001"`

			`struct conv_table {`
			`int t;`
			`union {`
			`unsigned char *str;`
			`struct conv_table *tbl;`
			`} u;`
			`};`

			`enum convert_string_mode {`
			`CSM_DEFAULT, /* Convert any char. */`
			`CSM_QUERY, /* Special handling of '&' and '=' chars. */`
			`CSM_FORM, /* Special handling of '&' and '=' chars in forms. */`
			`CSM_NONE, /* Convert nothing. */`
			`};`

			`struct conv_table *get_translation_table(int, int);`
			`unsigned char *get_entity_string(const unsigned char *str, const int strlen, int encoding);`

			`/* The convert_string() name is also used by Samba (version 3.0.3), which`
			`* provides libnss_wins.so.2, which is called somewhere inside`
			`* _nss_wins_gethostbyname_r(). This name clash causes the elinks hostname`
			`* lookup thread to crash so we need to rename the symbol. */`
			`/* Reported by Derek Poon and filed as bug 453 */`
			`#undef convert_string`
			`#define convert_string convert_string_elinks`

			`/* This routine converts a string from one charset to another according to the`
			`* passed @convert_table, potentially also decoding SGML (HTML) entities along`
			`* the way (according to @mode). It either returns dynamically allocated`
			`* converted string of length @length, or if the @callback is non-NULL it calls`
			`* it each few bytes instead and always returns NULL (@length is undefined).`
			`* Note that it's ok not to care and pass NULL as @length. */`
			`unsigned char *convert_string(struct conv_table *convert_table,`
			`unsigned char *chars, int charslen, int cp,`
			`enum convert_string_mode mode, int *length,`
			`void (*callback)(void *data, unsigned char *buf, int buflen),`
			`void *callback_data);`

			`int get_cp_index(unsigned char *);`
			`unsigned char *get_cp_name(int);`
			`unsigned char *get_cp_mime_name(int);`
Refactor is_cp_special to is_cp_utf8 2006-07-18 11:51:03 -04:00			`int is_cp_utf8(int);`
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00			`void free_conv_table(void);`
Rename CONFIG_UTF_8 to CONFIG_UTF8. The configure script no longer recognizes "CONFIG_UTF_8=yes" lines in custom features.conf files. They will have to be changed to "CONFIG_UTF8=yes". This incompatibility was deemed acceptable because no released version of ELinks supports CONFIG_UTF_8. The --enable-utf-8 option was not renamed. 2006-09-17 09:12:47 -04:00			`#ifdef CONFIG_UTF8`
Change "utf_8" to "utf8" in most identifiers. Suggested by Miciah on #elinks. What was renamed: add_utf_8 => add_utf8 cp2utf_8 => cp2utf8 encode_utf_8 => encode_utf8 get_translation_table_to_utf_8 => get_translation_table_to_utf8 goto invalid_utf_8_start_byte => goto invalid_utf8_start_byte goto utf_8 => goto utf8 goto utf_8_select => goto utf8_select terminal_interlink.utf_8 => terminal_interlink.utf8 utf_8_to_unicode => utf8_to_unicode What was not renamed: terminal._template_.utf_8_io option, TERM_OPT_UTF_8_IO Compatibility with existing elinks.conf files would require an alias. --enable-utf-8 Because the name of the charset is UTF-8, --enable-utf-8 looks better than --enable-utf8. CONFIG_UTF_8 Will be renamed in a later commit. Unicode/utf_8.cp, table_utf_8, aliases_utf_8 Will be renamed in a later commit. 2006-09-17 09:06:22 -04:00			`inline unsigned char *encode_utf8(unicode_val_T);`
Added utf8_prevchar for moving throught UTF-8 string to left. 2006-05-01 16:58:51 -04:00			`inline unsigned char *utf8_prevchar(unsigned char *, int, unsigned char *);`
Added UTF-8 char length lookup table Added lookup table to quick get number of bytes of UTF-8 character from first byte. 2006-01-30 19:09:49 -05:00			`inline int utf8charlen(const unsigned char *);`
Added functions for manipulating with UTF-8 strings. 2006-03-04 18:10:33 -05:00			`int utf8_char2cells(unsigned char *, unsigned char *);`
			`int utf8_ptr2cells(unsigned char *, unsigned char *);`
Added function utf8_ptr2chars for counting number of characters in string. 2006-04-07 16:06:17 -04:00			`int utf8_ptr2chars(unsigned char *, unsigned char *);`
Added functions for manipulating with UTF-8 strings. 2006-03-04 18:10:33 -05:00			`int utf8_cells2bytes(unsigned char *, int, unsigned char *);`
UTF-8: New functions for stepping forward and backward in a string. 2006-09-02 11:28:31 -04:00			`/* How utf8_step_forward and utf8_step_backward count steps. */`
			`enum utf8_step {`
			`/* Each step is one character, even if it is a combining or`
			`* double-width character. */`
			`utf8_step_characters,`

			`/* Each step is one cell. If the specified number of steps`
			`* would end in the middle of a double-width character, do not`
			`* include the character. */`
			`utf8_step_cells_fewer,`

			`/* Each step is one cell. If the specified number of steps`
			`* would end in the middle of a double-width character,`
			`* include the whole character. */`
			`utf8_step_cells_more`
			`};`
			`unsigned char *utf8_step_forward(unsigned char *, unsigned char *,`
			`int, enum utf8_step, int *);`
			`unsigned char *utf8_step_backward(unsigned char *, unsigned char *,`
			`int, enum utf8_step, int *);`
Double-width glyph support in terminal draw Added unicode_to_cell detect double-width glyphs. Modified terminal draw to correctly accept double-width glyphs. 2006-02-07 19:42:39 -05:00			`inline int unicode_to_cell(unicode_val_T);`
UTF-8: New function unicode_fold_label_case and a related script. 2006-08-05 12:45:53 -04:00			`unicode_val_T unicode_fold_label_case(unicode_val_T);`
Witekfl's UTF-8 patch v5. 2006-01-14 16:44:00 -05:00			`inline int strlen_utf8(unsigned char **);`
Change "utf_8" to "utf8" in most identifiers. Suggested by Miciah on #elinks. What was renamed: add_utf_8 => add_utf8 cp2utf_8 => cp2utf8 encode_utf_8 => encode_utf8 get_translation_table_to_utf_8 => get_translation_table_to_utf8 goto invalid_utf_8_start_byte => goto invalid_utf8_start_byte goto utf_8 => goto utf8 goto utf_8_select => goto utf8_select terminal_interlink.utf_8 => terminal_interlink.utf8 utf_8_to_unicode => utf8_to_unicode What was not renamed: terminal._template_.utf_8_io option, TERM_OPT_UTF_8_IO Compatibility with existing elinks.conf files would require an alias. --enable-utf-8 Because the name of the charset is UTF-8, --enable-utf-8 looks better than --enable-utf8. CONFIG_UTF_8 Will be renamed in a later commit. Unicode/utf_8.cp, table_utf_8, aliases_utf_8 Will be renamed in a later commit. 2006-09-17 09:06:22 -04:00			`inline unicode_val_T utf8_to_unicode(unsigned char **, unsigned char *);`
UTF-8: New function cp_to_unicode(). 2006-08-13 16:35:50 -04:00			`unicode_val_T cp_to_unicode(int, unsigned char **, unsigned char *);`
Rename CONFIG_UTF_8 to CONFIG_UTF8. The configure script no longer recognizes "CONFIG_UTF_8=yes" lines in custom features.conf files. They will have to be changed to "CONFIG_UTF8=yes". This incompatibility was deemed acceptable because no released version of ELinks supports CONFIG_UTF_8. The --enable-utf-8 option was not renamed. 2006-09-17 09:12:47 -04:00			`#endif /* CONFIG_UTF8 */`
Witekfl's UTF-8 patch v5. 2006-01-14 16:44:00 -05:00
try_document_key: Convert the key to UCS-4, resolving the FIXME. This requires compiling cp2u() in even without CONFIG_UTF_8. I also added an is_kbd_character macro to make try_document_key more resilient to changes in the definition of term_event_key_T. 2006-08-12 09:04:21 -04:00			`unicode_val_T cp2u(int, unsigned char);`
Change "utf_8" to "utf8" in most identifiers. Suggested by Miciah on #elinks. What was renamed: add_utf_8 => add_utf8 cp2utf_8 => cp2utf8 encode_utf_8 => encode_utf8 get_translation_table_to_utf_8 => get_translation_table_to_utf8 goto invalid_utf_8_start_byte => goto invalid_utf8_start_byte goto utf_8 => goto utf8 goto utf_8_select => goto utf8_select terminal_interlink.utf_8 => terminal_interlink.utf8 utf_8_to_unicode => utf8_to_unicode What was not renamed: terminal._template_.utf_8_io option, TERM_OPT_UTF_8_IO Compatibility with existing elinks.conf files would require an alias. --enable-utf-8 Because the name of the charset is UTF-8, --enable-utf-8 looks better than --enable-utf8. CONFIG_UTF_8 Will be renamed in a later commit. Unicode/utf_8.cp, table_utf_8, aliases_utf_8 Will be renamed in a later commit. 2006-09-17 09:06:22 -04:00			`unsigned char *cp2utf8(int, int);`
Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00
			`unsigned char *u2cp_(unicode_val_T, int, int no_nbsp_hack);`
			`#define u2cp(u, to) u2cp_(u, to, 0)`
			`#define u2cp_no_nbsp(u, to) u2cp_(u, to, 1)`

			`void init_charsets_lookup(void);`
			`void free_charsets_lookup(void);`

Define and use macros for handling UTF-16 surrogates. 2006-08-24 16:30:41 -04:00			`/* UTF-16 encodes each Unicode character U+0000...U+FFFF as a single`
			`* 16-bit code unit, and each character U+10000...U+10FFFF as a pair`
			`* of two code units: a high surrogate followed by a low surrogate.`
			`* The range U+D800...U+DFFF is reserved for these surrogates. */`
			`#define is_utf16_surrogate(u) (((u) & 0xFFFFF800) == 0xD800)`
			`#define is_utf16_high_surrogate(u) (((u) & 0xFFFFFC00) == 0xD800)`
			`#define is_utf16_low_surrogate(u) (((u) & 0xFFFFFC00) == 0xDC00)`
			`#define join_utf16_surrogates(high,low) (0x10000 + (((high) - 0xD800L) << 10) + ((low) - 0xDC00))`
			`#define needs_utf16_surrogates(u) ((uint32_t) ((u) - 0x10000) < 0x100000)`
			`#define get_utf16_high_surrogate(u) (0xD800 + (((u) - 0x10000) >> 10))`
			`#define get_utf16_low_surrogate(u) (0xDC00 + ((u) & 0x3FF))`

Initial commit of the HEAD branch of the ELinks CVS repository, as of Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this by grafting. 2005-09-15 09:58:31 -04:00			`#endif`