2005-09-15 09:58:31 -04:00
|
|
|
#ifndef EL__INTL_CHARSETS_H
|
|
|
|
#define EL__INTL_CHARSETS_H
|
|
|
|
|
|
|
|
/* Integer type used for a single Unicode value.  The UCS_* constants
 * in this header are all cast to this type; UCS_NO_CHAR relies on it
 * being wide enough to hold a value outside the Unicode range. */
typedef uint32_t unicode_val_T;
|
|
|
|
|
2007-01-27 04:12:22 -05:00
|
|
|
/* U+0020 SPACE.  Normally the same as ' ' or L' ', but perhaps ELinks
 * shouldn't rely on that. */
#define UCS_SPACE ((unicode_val_T) 0x0020)
|
|
|
|
|
2006-08-19 06:29:37 -04:00
|
|
|
/* U+FFFD REPLACEMENT CHARACTER.  Used when no Unicode mapping is
 * known for a byte in a codepage, or when invalid UTF-8 is received
 * from a terminal.  After generating the character, ELinks then
 * treats it like any other Unicode character.  The user can also type
 * this character directly, and it can occur in documents. */
#define UCS_REPLACEMENT_CHARACTER ((unicode_val_T) 0xFFFD)
|
|
|
|
|
|
|
|
/* A special value that fits in unicode_val_T but is outside the range
 * of Unicode characters.  utf8_to_unicode and cp_to_unicode return
 * this if the input is too short.  This is also used as a placeholder
 * for the second cell of a double-cell character. */
#define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD)
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2006-11-12 12:53:31 -05:00
|
|
|
/* If ELinks should display a double-cell character but there is only
 * one cell available, it displays this character instead.  This must
 * be a single-cell character but need not be unique.  Possible values
 * might be U+0020 SPACE or U+303F IDEOGRAPHIC HALF FILL SPACE.
 *
 * Some BFU widgets (at least input fields and list boxes) currently
 * ignore this setting and use U+0020 instead.  (They first draw spaces
 * and then overwrite with text; look for utf8_cells2bytes calls.)
 * We should fix that if we ever change the value. */
#define UCS_ORPHAN_CELL ((unicode_val_T) 0x20)
|
|
|
|
|
2007-01-29 13:57:37 -05:00
|
|
|
/* &nbsp; replacement character: the byte that stands in for U+00A0
 * NO-BREAK SPACE in strings that are not UTF-8.  See u2cp() and
 * NBSP_MODE_HACK.  UTF-8 strings should use the encoding of U+00A0
 * instead.
 *
 * NBSP_CHAR_STRING is the same byte as a null-terminated string
 * literal; keep the two in sync. */
#define NBSP_CHAR ((unsigned char) 1)
#define NBSP_CHAR_STRING "\001"
|
|
|
|
|
2007-01-01 14:11:46 -05:00
|
|
|
/* How to convert a byte from a source charset.  This is used in an
 * array (struct conv_table[256]) indexed by the byte value. */
struct conv_table {
	/* 0 if this is the final byte of a character, or 1 if more
	 * bytes are needed.  Selects which member of @u is valid. */
	int t;
	union {
		/* If @t==0: a null-terminated string that is the
		 * corresponding character in the target charset.
		 * Normally, the string is statically allocated.
		 * However, if the conversion table is to UTF-8, then
		 * the strings in elements 0x80 to 0xFF are allocated
		 * with @mem_alloc and owned by the table. */
		const unsigned char *str;
		/* If @t==1: a pointer to a nested conversion table
		 * (with 256 elements) that describes how to convert
		 * each possible subsequent byte.  The conversion
		 * table owns the nested conversion table. */
		struct conv_table *tbl;
	} u;
};
|
|
|
|
|
|
|
|
/* Selects how convert_string() treats the characters it converts;
 * passed as its @mode argument. */
enum convert_string_mode {
	CSM_DEFAULT, /* Convert any char. */
	CSM_QUERY, /* Special handling of '&' and '=' chars. */
	CSM_FORM, /* Special handling of '&' and '=' chars in forms. */
	CSM_NONE, /* Convert nothing. */
};
|
|
|
|
|
2007-01-29 13:57:37 -05:00
|
|
|
/* How to translate U+00A0 NO-BREAK SPACE.  If u2cp_ is converting to
 * UTF-8, it ignores this choice and just encodes the U+00A0. */
enum nbsp_mode {
	/* Convert to NBSP_CHAR.  This lets the HTML renderer
	 * recognize nbsp even if the codepage doesn't support
	 * nbsp.  (VISCII doesn't.) */
	NBSP_MODE_HACK = 0,

	/* Convert to normal ASCII space. */
	NBSP_MODE_ASCII = 1
};
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
/* Returns a conversion table (see struct conv_table).
 * NOTE(review): the two ints are presumably the source and target
 * codepage indexes, in that order -- confirm against the definition. */
struct conv_table *get_translation_table(int, int);
|
2007-01-02 01:29:08 -05:00
|
|
|
/* Looks up an entity by name and returns its replacement string.
 * @str is the entity text and @strlen its length in bytes, so @str
 * need not be null-terminated (presumably -- confirm against the
 * definition).  NOTE(review): @encoding looks like a codepage index,
 * and the failure return value is not visible here; check before
 * relying on either. */
const unsigned char *get_entity_string(const unsigned char *str,
				       const int strlen, int encoding);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* The convert_string() name is also used by Samba (version 3.0.3), which
 * provides libnss_wins.so.2, which is called somewhere inside
 * _nss_wins_gethostbyname_r().  This name clash causes the elinks hostname
 * lookup thread to crash, so we need to rename the symbol.
 * Reported by Derek Poon and filed as bug 453. */
#undef convert_string
#define convert_string convert_string_elinks
|
|
|
|
|
|
|
|
/* This routine converts a string from one charset to another according to
 * the passed @convert_table, potentially also decoding SGML (HTML) entities
 * along the way (according to @mode).
 *
 * If @callback is NULL, it returns a dynamically allocated converted string
 * and stores the string's length in @length.  It is OK not to care and pass
 * NULL as @length.
 *
 * If @callback is non-NULL, the routine instead calls it every few bytes
 * with @callback_data, a buffer and the buffer's length, and always returns
 * NULL; @length is undefined in that case. */
unsigned char *convert_string(struct conv_table *convert_table,
			      unsigned char *chars, int charslen, int cp,
			      enum convert_string_mode mode, int *length,
			      void (*callback)(void *data, unsigned char *buf, int buflen),
			      void *callback_data);
|
|
|
|
|
|
|
|
/* Codepage lookups.  NOTE(review): the argument and return meanings
 * below are inferred from the names only -- confirm against the
 * definitions.  get_cp_index presumably maps a charset name to a
 * codepage index; get_cp_name and get_cp_mime_name presumably map an
 * index back to a name; is_cp_utf8 presumably tests whether an index
 * denotes UTF-8. */
int get_cp_index(unsigned char *);
unsigned char *get_cp_name(int);
unsigned char *get_cp_mime_name(int);
int is_cp_utf8(int);

/* NOTE(review): presumably releases the cached conversion tables
 * (struct conv_table owns some of its strings and nested tables) --
 * confirm against the definition. */
void free_conv_table(void);
|
2006-09-17 09:12:47 -04:00
|
|
|
#ifdef CONFIG_UTF8
|
2006-09-17 09:06:22 -04:00
|
|
|
/* Declared without `inline': a function with external linkage that is
 * declared inline must be defined in the same translation unit, and no
 * definition is visible in this header.
 * NOTE(review): presumably returns the UTF-8 encoding of the given
 * Unicode value; the ownership of the returned buffer is not visible
 * here -- confirm against the definition. */
unsigned char *encode_utf8(unicode_val_T);
|
2006-05-01 16:58:51 -04:00
|
|
|
/* Both are declared without `inline': a function with external linkage
 * that is declared inline must be defined in the same translation unit,
 * and no definition is visible in this header.
 * NOTE(review): argument meanings are not visible here; utf8charlen
 * presumably returns the byte length of the UTF-8 character at the
 * pointer -- confirm against the definitions. */
unsigned char *utf8_prevchar(unsigned char *, int, unsigned char *);
int utf8charlen(const unsigned char *);
|
2006-03-04 18:10:33 -05:00
|
|
|
/* Measuring UTF-8 text in screen cells, characters and bytes.
 * NOTE(review): the pointer pairs are presumably (start of text, end
 * of text) and the int of utf8_cells2bytes presumably a cell count --
 * confirm against the definitions. */
int utf8_char2cells(unsigned char *, unsigned char *);
int utf8_ptr2cells(unsigned char *, unsigned char *);
int utf8_ptr2chars(unsigned char *, unsigned char *);
int utf8_cells2bytes(unsigned char *, int, unsigned char *);
|
2006-09-02 11:28:31 -04:00
|
|
|
/* How utf8_step_forward and utf8_step_backward count steps. */
enum utf8_step {
	/* Each step is one character, even if it is a combining or
	 * double-width character. */
	UTF8_STEP_CHARACTERS,

	/* Each step is one cell.  If the specified number of steps
	 * would end in the middle of a double-width character, do not
	 * include the character. */
	UTF8_STEP_CELLS_FEWER,

	/* Each step is one cell.  If the specified number of steps
	 * would end in the middle of a double-width character,
	 * include the whole character. */
	UTF8_STEP_CELLS_MORE
};
|
|
|
|
/* Step through UTF-8 text, counting steps as selected by the
 * enum utf8_step argument.
 * NOTE(review): the remaining arguments are presumably (current
 * position, limit of the text, maximum number of steps, ..., out
 * parameter receiving the number of steps taken) -- confirm against
 * the definitions. */
unsigned char *utf8_step_forward(unsigned char *, unsigned char *,
				 int, enum utf8_step, int *);
unsigned char *utf8_step_backward(unsigned char *, unsigned char *,
				  int, enum utf8_step, int *);
|
2006-02-07 19:42:39 -05:00
|
|
|
inline int unicode_to_cell(unicode_val_T);
|
2006-08-05 12:45:53 -04:00
|
|
|
unicode_val_T unicode_fold_label_case(unicode_val_T);
|
2006-01-14 16:44:00 -05:00
|
|
|
inline int strlen_utf8(unsigned char **);
|
2006-09-17 09:06:22 -04:00
|
|
|
inline unicode_val_T utf8_to_unicode(unsigned char **, unsigned char *);
|
2006-08-13 16:35:50 -04:00
|
|
|
unicode_val_T cp_to_unicode(int, unsigned char **, unsigned char *);
|
2006-09-17 09:12:47 -04:00
|
|
|
#endif /* CONFIG_UTF8 */
|
2006-01-14 16:44:00 -05:00
|
|
|
|
2006-08-12 09:04:21 -04:00
|
|
|
/* NOTE(review): presumably maps one byte of the given codepage to its
 * Unicode value -- confirm against the definition. */
unicode_val_T cp2u(int, unsigned char);
|
2007-01-02 14:39:34 -05:00
|
|
|
/* NOTE(review): presumably returns the UTF-8 encoding of a byte of the
 * given codepage; the lifetime of the returned string is not visible
 * here -- confirm against the definition. */
const unsigned char *cp2utf8(int, int);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2007-01-02 01:45:02 -05:00
|
|
|
/* Converts a Unicode value for a codepage; the enum nbsp_mode argument
 * chooses how U+00A0 NO-BREAK SPACE is translated (ignored when
 * converting to UTF-8, see enum nbsp_mode).  The wrapper macros name
 * the first two arguments @u and @to.
 *
 * u2cp() translates U+00A0 to NBSP_CHAR (NBSP_MODE_HACK);
 * u2cp_no_nbsp() translates it to an ASCII space (NBSP_MODE_ASCII). */
const unsigned char *u2cp_(unicode_val_T, int, enum nbsp_mode);
#define u2cp(u, to) u2cp_(u, to, NBSP_MODE_HACK)
#define u2cp_no_nbsp(u, to) u2cp_(u, to, NBSP_MODE_ASCII)
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Set up and tear down the charset lookup state.
 * NOTE(review): what exactly is initialized and freed is not visible
 * in this header -- see the definitions. */
void init_charsets_lookup(void);
void free_charsets_lookup(void);
|
|
|
|
|
2006-08-24 16:30:41 -04:00
|
|
|
/* UTF-16 encodes each Unicode character U+0000...U+FFFF as a single
 * 16-bit code unit, and each character U+10000...U+10FFFF as a pair
 * of two code units: a high surrogate followed by a low surrogate.
 * The range U+D800...U+DFFF is reserved for these surrogates.
 *
 * The is_* macros test a code unit; join_utf16_surrogates combines a
 * valid high/low pair into the character it encodes;
 * needs_utf16_surrogates tells whether a character is in
 * U+10000...U+10FFFF; the get_* macros split such a character into
 * its two code units.  Every macro evaluates each argument once. */
#define is_utf16_surrogate(u)		(((u) & 0xFFFFF800) == 0xD800)
#define is_utf16_high_surrogate(u)	(((u) & 0xFFFFFC00) == 0xD800)
#define is_utf16_low_surrogate(u)	(((u) & 0xFFFFFC00) == 0xDC00)
/* Computed in uint32_t so that the result type does not depend on the
 * width of long, and so that a malformed pair wraps around instead of
 * left-shifting a negative value. */
#define join_utf16_surrogates(high,low) \
	((uint32_t) 0x10000 \
	 + (((uint32_t) (high) - 0xD800) << 10) \
	 + ((uint32_t) (low) - 0xDC00))
/* The unsigned subtraction wraps for u < 0x10000, so one comparison
 * checks both ends of the range. */
#define needs_utf16_surrogates(u)	((uint32_t) ((u) - 0x10000) < 0x100000)
#define get_utf16_high_surrogate(u)	(0xD800 + (((u) - 0x10000) >> 10))
#define get_utf16_low_surrogate(u)	(0xDC00 + ((u) & 0x3FF))
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
#endif
|