2005-09-15 09:58:31 -04:00
|
|
|
/* Charsets convertor */
|
|
|
|
|
2006-10-12 17:43:49 -04:00
|
|
|
#ifndef _GNU_SOURCE
|
|
|
|
#define _GNU_SOURCE /* strcasecmp() */
|
|
|
|
#endif
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
#ifdef HAVE_CONFIG_H
|
|
|
|
#include "config.h"
|
|
|
|
#endif
|
|
|
|
|
2022-10-03 11:55:20 -04:00
|
|
|
#include <stdio.h>
|
|
|
|
|
2022-08-17 14:48:58 -04:00
|
|
|
#ifdef HAVE_STDINT_H
|
|
|
|
#include <stdint.h>
|
|
|
|
#endif
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
#if HAVE_LANGINFO_CODESET
|
|
|
|
#include <langinfo.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <stdlib.h>
|
2006-08-05 12:45:53 -04:00
|
|
|
#if HAVE_WCTYPE_H
|
|
|
|
#include <wctype.h>
|
|
|
|
#endif
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2010-07-23 13:59:59 -04:00
|
|
|
#ifdef HAVE_ICONV
|
|
|
|
#include <errno.h>
|
|
|
|
#include <iconv.h>
|
|
|
|
#endif
|
|
|
|
|
2022-10-03 11:55:20 -04:00
|
|
|
#include <sys/ioctl.h>
|
|
|
|
#include <sys/kd.h>
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
#include "elinks.h"
|
|
|
|
|
|
|
|
#include "document/options.h"
|
|
|
|
#include "intl/charsets.h"
|
2022-06-05 13:12:25 -04:00
|
|
|
#ifdef CONFIG_OS_DOS
|
|
|
|
#include "osdep/dos/dos.h"
|
|
|
|
#endif
|
2005-09-15 09:58:31 -04:00
|
|
|
#include "util/conv.h"
|
|
|
|
#include "util/error.h"
|
|
|
|
#include "util/fastfind.h"
|
2008-01-03 06:57:24 -05:00
|
|
|
#include "util/hash.h"
|
2005-09-15 09:58:31 -04:00
|
|
|
#include "util/memory.h"
|
|
|
|
#include "util/string.h"
|
2022-10-03 11:55:20 -04:00
|
|
|
#include "osdep/osdep.h"
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
|
|
|
|
/* Fix namespace clash on MacOS. */
|
|
|
|
#define table table_elinks
|
|
|
|
|
|
|
|
/* One additional (codepage byte, Unicode value) pair; arrays of these are
 * referenced from struct codepage_desc. */
struct table_entry {
	/* The byte in the codepage. */
	unsigned char c;

	/* The Unicode value for that byte.  In principle this should be
	 * unicode_val_T, but every value currently in codepage.inc fits in
	 * 16 bits, so uint16_t is used to shrink sizeof(struct table_entry)
	 * from 8 bytes to 4.  If wider characters are ever needed,
	 * "unicode_val_T u : 24" might work, although bitfields are in
	 * principle restricted to int (which may be only 16 bits wide),
	 * so that would be somewhat unportable. */
	uint16_t u;
};
|
|
|
|
|
|
|
|
/* Description of one codepage.  The member order matters: instances are
 * presumably initialized positionally by the included codepage.inc. */
struct codepage_desc {
	/* Name of the codepage. */
	const char *name;

	/* Alias names of the codepage. */
	const char **aliases;

	/* Unicode mappings of codepage bytes 0x80...0xFF.  (Bytes
	 * 0x00...0x7F are assumed to be ASCII in every codepage.)
	 * All current values fit in 16 bits, so they are stored as
	 * uint16_t instead of unicode_val_T.  A byte that the codepage
	 * does not use is mapped to 0xFFFF here, and the C code turns
	 * that into UCS_REPLACEMENT_CHARACTER where appropriate.
	 * (U+FFFF is reserved and will never be assigned as a
	 * character.) */
	const uint16_t *highhalf;

	/* When one codepage byte corresponds to several Unicode
	 * characters, the preferred one is stored in @highhalf and the
	 * remaining ones are listed in this table.  It is not consulted
	 * when translating from the codepage to Unicode. */
	const struct table_entry *table;

	/* Whether iconv is used for translation. */
	unsigned int iconv:1;
};
|
|
|
|
|
|
|
|
#include "intl/codepage.inc"
|
|
|
|
#include "intl/uni_7b.inc"
|
|
|
|
#include "intl/entity.inc"
|
|
|
|
|
2009-03-28 14:15:08 -04:00
|
|
|
/* Declare the external-linkage inline functions defined in this file.
|
|
|
|
* Avoid the GCC 4.3.1 warning: `foo' declared inline after being
|
|
|
|
* called. The functions are not declared inline in charsets.h
|
|
|
|
* because C99 6.7.4p6 says that every external-linkage function
|
|
|
|
* declared inline shall be defined in the same translation unit.
|
|
|
|
* The non-inline declarations in charsets.h also make sure that the
|
|
|
|
* compiler emits global definitions for the symbols so that the
|
|
|
|
* functions can be called from other translation units. */
|
2021-01-02 10:20:27 -05:00
|
|
|
NONSTATIC_INLINE char *encode_utf8(unicode_val_T u);
|
|
|
|
NONSTATIC_INLINE int utf8charlen(const char *p);
|
|
|
|
NONSTATIC_INLINE unicode_val_T utf8_to_unicode(char **string,
|
|
|
|
const char *end);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2007-01-02 14:40:14 -05:00
|
|
|
/* Table of one-character, NUL-terminated strings indexed by byte value.
 * It lets conversion code return a "const char *" for a single byte
 * without allocating.
 *
 * NOTE(review): the entries at indices 027 and 037 hold "\033" rather
 * than "\027" and "\037".  They are reproduced exactly as found; confirm
 * whether that is intentional before "fixing" them. */
static const char strings[256][2] = {
	/* 0x00 - 0x1F */
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	/* 0x20 - 0x3F */
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	/* 0x40 - 0x5F */
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	/* 0x60 - 0x7F */
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	/* 0x80 - 0x9F */
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	/* 0xA0 - 0xBF */
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	/* 0xC0 - 0xDF */
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	/* 0xE0 - 0xFF */
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
|
|
|
|
|
2010-07-23 13:59:59 -04:00
|
|
|
#ifdef HAVE_ICONV
|
|
|
|
static iconv_t iconv_cd = (iconv_t)-1;
|
|
|
|
#endif
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
static void
|
|
|
|
free_translation_table(struct conv_table *p)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < 256; i++)
|
|
|
|
if (p[i].t)
|
|
|
|
free_translation_table(p[i].u.tbl);
|
|
|
|
|
|
|
|
mem_free(p);
|
|
|
|
}
|
|
|
|
|
2007-01-01 17:54:14 -05:00
|
|
|
/* A string used in conversion tables when there is no correct
|
2007-01-01 18:07:57 -05:00
|
|
|
* conversion. This is compared by address and therefore should be a
|
|
|
|
* named array rather than a pointer so that it won't share storage
|
|
|
|
* with any other string literal that happens to have the same
|
|
|
|
* characters. */
|
2021-01-02 10:20:27 -05:00
|
|
|
static const char no_str[] = "*";
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
static void
|
|
|
|
new_translation_table(struct conv_table *p)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < 256; i++)
|
|
|
|
if (p[i].t)
|
|
|
|
free_translation_table(p[i].u.tbl);
|
|
|
|
for (i = 0; i < 128; i++) {
|
|
|
|
p[i].t = 0;
|
|
|
|
p[i].u.str = strings[i];
|
|
|
|
}
|
|
|
|
for (; i < 256; i++) {
|
|
|
|
p[i].t = 0;
|
|
|
|
p[i].u.str = no_str;
|
|
|
|
}
|
2010-07-23 13:59:59 -04:00
|
|
|
p->iconv_cp = -1;
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Binary search in @table, an array of @entries structures sorted in
 * ascending order of their member @entry.  Stores the index of the
 * element whose @entry equals @key into @result, or -1 when there is no
 * such element.  @table and @key are evaluated several times, so they
 * must not have side effects. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _lo = 0, _hi = (entries) - 1;				\
									\
	for (;;) {							\
		long _mid;						\
									\
		if (_lo > _hi) {					\
			(result) = -1;					\
			break;						\
		}							\
									\
		_mid = (_lo + _hi) / 2;					\
									\
		if ((table)[_mid].entry == (key)) {			\
			(result) = _mid;				\
			break;						\
		}							\
		if ((table)[_mid].entry > (key))			\
			_hi = _mid - 1;					\
		else if ((table)[_mid].entry < (key))			\
			_lo = _mid + 1;					\
	}								\
}
|
|
|
|
|
2022-10-03 11:55:20 -04:00
|
|
|
/* list of unicode codepoints supported by the current terminal, if this
|
|
|
|
* information is available, otherwise size = -1 */
|
|
|
|
|
|
|
|
struct {
|
|
|
|
int size;
|
|
|
|
unicode_val_T *list;
|
|
|
|
} codepoints;
|
|
|
|
|
|
|
|
/* Tell whether the code point @u is in the list of code points supported
 * by the current terminal.
 *
 * The list (codepoints.list, codepoints.size entries) is binary-searched,
 * so it must be sorted in ascending order.  When codepoints.size is -1
 * the information is unavailable and every code point is reported as
 * supported.
 *
 * @return 1 if supported (or unknown), 0 otherwise.  The result is a
 * plain boolean: returning the code point itself, as this function used
 * to do, reported U+0000 as unsupported even when it was listed. */
int is_codepoint_supported(unicode_val_T u) {
	int first, last;

	if (codepoints.size == -1)
		return 1;

	first = 0;
	last = codepoints.size - 1;

	while (first <= last) {
		int middle = first + (last - first) / 2;

		if (codepoints.list[middle] == u)
			return 1;
		else if (codepoints.list[middle] > u)
			last = middle - 1;
		else
			first = middle + 1;
	}

	return 0;
}
|
|
|
|
|
|
|
|
/* Look up a replacement for the code point @u.
 *
 * @return -1 when @u is supported by the terminal (no replacement is
 * needed) or when unicode_7b has no entry for it; otherwise the index of
 * the matching entry in unicode_7b. */
int codepoint_replacement(unicode_val_T u) {
	int index = -1;

	if (!is_codepoint_supported(u)) {
		BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, index);
	}

	return index;
}
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
/* Substitutes for the code points 0x80...0x9F, indexed by (u - 0x80);
 * see u2cp_().  A zero entry means there is no substitute.
 * NOTE(review): the non-zero values look like the Windows-1252
 * assignments for these bytes, partly approximated by ASCII -- confirm. */
static const unicode_val_T strange_chars[32] = {
	/* 0x80 - 0x87 */
	0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
	/* 0x88 - 0x8f */
	0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
	/* 0x90 - 0x97 */
	0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
	/* 0x98 - 0x9f */
	0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
|
|
|
|
|
|
|
|
#define SYSTEM_CHARSET_FLAG 128
|
2006-09-24 09:55:29 -04:00
|
|
|
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
const char *
|
2006-10-01 18:33:41 -04:00
|
|
|
u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
|
|
|
int j;
|
|
|
|
int s;
|
|
|
|
|
|
|
|
if (u < 128) return strings[u];
|
2006-01-14 16:44:00 -05:00
|
|
|
|
2011-04-17 11:09:29 -04:00
|
|
|
if (u < 0xa0) {
|
|
|
|
u = strange_chars[u - 0x80];
|
|
|
|
if (!u) return NULL;
|
|
|
|
}
|
|
|
|
|
2006-01-14 16:44:00 -05:00
|
|
|
to &= ~SYSTEM_CHARSET_FLAG;
|
|
|
|
|
2006-09-24 06:33:58 -04:00
|
|
|
if (is_cp_ptr_utf8(&codepages[to]))
|
2006-09-17 09:06:22 -04:00
|
|
|
return encode_utf8(u);
|
2006-01-14 16:44:00 -05:00
|
|
|
|
2007-01-29 13:57:37 -05:00
|
|
|
/* To mark non breaking spaces in non-UTF-8 strings, we use a
|
|
|
|
* special char NBSP_CHAR. */
|
2007-04-22 15:37:12 -04:00
|
|
|
if (u == UCS_NO_BREAK_SPACE) {
|
2006-10-01 18:33:41 -04:00
|
|
|
if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
|
|
|
|
else /* NBSP_MODE_ASCII */ return " ";
|
|
|
|
}
|
2007-04-22 15:38:40 -04:00
|
|
|
if (u == UCS_SOFT_HYPHEN) return "";
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2006-09-24 09:55:29 -04:00
|
|
|
if (u < 0xFFFF)
|
|
|
|
for (j = 0; j < 0x80; j++)
|
|
|
|
if (codepages[to].highhalf[j] == u)
|
|
|
|
return strings[0x80 + j];
|
2005-09-15 09:58:31 -04:00
|
|
|
for (j = 0; codepages[to].table[j].c; j++)
|
|
|
|
if (codepages[to].table[j].u == u)
|
|
|
|
return strings[codepages[to].table[j].c];
|
|
|
|
|
|
|
|
BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
|
|
|
|
if (s != -1) return unicode_7b[s].s;
|
|
|
|
|
|
|
|
return no_str;
|
|
|
|
}
|
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
static char utf_buffer[7];
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
NONSTATIC_INLINE char *
|
2006-09-17 09:06:22 -04:00
|
|
|
encode_utf8(unicode_val_T u)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
2022-10-03 11:55:20 -04:00
|
|
|
int s;
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
memset(utf_buffer, 0, 7);
|
|
|
|
|
2022-10-03 11:55:20 -04:00
|
|
|
if (!is_codepoint_supported(u)) {
|
|
|
|
BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
|
|
|
|
if (s != -1) return unicode_7b[s].s;
|
|
|
|
}
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
if (u < 0x80)
|
|
|
|
utf_buffer[0] = u;
|
|
|
|
else if (u < 0x800)
|
|
|
|
utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
|
|
|
|
utf_buffer[1] = 0x80 | (u & 0x3f);
|
|
|
|
else if (u < 0x10000)
|
|
|
|
utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
|
|
|
|
utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
|
|
|
|
utf_buffer[2] = 0x80 | (u & 0x3f);
|
|
|
|
else if (u < 0x200000)
|
|
|
|
utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
|
|
|
|
utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
|
|
|
|
utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
|
|
|
|
utf_buffer[3] = 0x80 | (u & 0x3f);
|
|
|
|
else if (u < 0x4000000)
|
|
|
|
utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
|
|
|
|
utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
|
|
|
|
utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
|
|
|
|
utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
|
|
|
|
utf_buffer[4] = 0x80 | (u & 0x3f);
|
|
|
|
else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
|
|
|
|
utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
|
|
|
|
utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
|
|
|
|
utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
|
|
|
|
utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
|
|
|
|
utf_buffer[5] = 0x80 | (u & 0x3f);
|
|
|
|
|
|
|
|
return utf_buffer;
|
|
|
|
}
|
|
|
|
|
2006-02-02 18:27:01 -05:00
|
|
|
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.
 * Bytes that cannot start a sequence (0x80...0xBF, 0xFE and 0xFF) are
 * given the length 1, like ASCII; the decoder tells them apart and
 * handles them separately. */
static const char utf8char_len_tab[256] = {
	/* 0x00 - 0x7F: ASCII */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0x80 - 0xBF: continuation bytes */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	/* 0xC0 - 0xDF */
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	/* 0xE0 - 0xFF */
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
|
|
|
|
|
2008-10-18 06:51:04 -04:00
|
|
|
#ifdef CONFIG_UTF8
|
2009-03-28 14:15:08 -04:00
|
|
|
NONSTATIC_INLINE int
|
2021-01-02 10:20:27 -05:00
|
|
|
utf8charlen(const char *p)
|
2006-01-30 19:09:49 -05:00
|
|
|
{
|
2021-01-02 10:20:27 -05:00
|
|
|
return p ? utf8char_len_tab[(unsigned char)*p] : 0;
|
2006-01-30 19:09:49 -05:00
|
|
|
}
|
|
|
|
|
2009-03-28 14:15:08 -04:00
|
|
|
/* Count the UTF-8 characters in the NUL-terminated string *@str.
 * Counting stops before the first sequence that would run past the
 * terminating NUL; *@str is advanced to the position where it stopped.
 *
 * @return the number of characters counted. */
int
strlen_utf8(char **str)
{
	char *pos = *str;
	char *end = strchr(pos, '\0');
	int count = 0;

	while (1) {
		int len = utf8charlen(pos);

		if (pos + len > end)
			break;

		pos += len;
		count++;
	}

	*str = pos;
	return count;
}
|
|
|
|
|
2006-05-01 16:58:51 -04:00
|
|
|
#define utf8_issingle(p) (((p) & 0x80) == 0)
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)

/* Move back @pos characters from @current, never going further left
 * than @start, and return the resulting pointer.
 *
 * @return NULL if @current or @start is NULL or if @pos is negative. */
char *
utf8_prevchar(char *current, int pos, char *start)
{
	char *p = current;
	int remaining = pos;

	if (!current || !start || pos < 0)
		return NULL;

	for (; remaining > 0 && p != start; ) {
		p--;
		/* Only the first byte of a character counts as a step. */
		if (utf8_islead(*p))
			remaining--;
	}

	return p;
}
|
|
|
|
|
2006-03-04 18:10:33 -05:00
|
|
|
/* Count number of standard terminal cells needed for displaying UTF-8
|
|
|
|
* character. */
|
|
|
|
int
|
2022-01-31 10:28:56 -05:00
|
|
|
utf8_char2cells(const char *utf8_char_const, char *end)
|
2006-03-04 18:10:33 -05:00
|
|
|
{
|
|
|
|
unicode_val_T u;
|
2022-01-31 10:28:56 -05:00
|
|
|
char *utf8_char = (char *)utf8_char_const;
|
2006-03-04 18:10:33 -05:00
|
|
|
|
|
|
|
if (end == NULL)
|
2022-01-18 14:30:48 -05:00
|
|
|
end = strchr(utf8_char, '\0');
|
2006-03-04 18:10:33 -05:00
|
|
|
|
|
|
|
if(!utf8_char || !end)
|
|
|
|
return -1;
|
|
|
|
|
2006-09-17 09:06:22 -04:00
|
|
|
u = utf8_to_unicode(&utf8_char, end);
|
2006-03-04 18:10:33 -05:00
|
|
|
|
|
|
|
return unicode_to_cell(u);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Count the number of standard terminal cells needed for displaying
 * the UTF-8 string from @string up to @end.
 *
 * If @end is NULL, the end of the NUL-terminated string is used.  A
 * trailing sequence that would run past @end is not counted.
 *
 * @return the total cell count, or -1 if @string is NULL or
 * utf8_char2cells() fails for some character. */
int
utf8_ptr2cells(const char *string, char *end)
{
	int cells = 0;

	/* Reject NULL before the string is handed to strchr(); the check
	 * used to come too late to prevent the NULL dereference. */
	if (!string)
		return -1;

	if (end == NULL)
		end = (char *)strchr(string, '\0');

	for (;;) {
		int charlen = utf8charlen(string);
		int cell;

		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
|
|
|
|
|
2006-04-07 16:06:17 -04:00
|
|
|
/* Count the number of UTF-8 characters from @string up to @end.
 *
 * If @end is NULL, the end of the NUL-terminated string is used.  A
 * trailing sequence that would run past @end is not counted.
 *
 * @return the number of characters, or -1 if @string is NULL. */
int
utf8_ptr2chars(char *string, char *end)
{
	int chars = 0;

	/* Reject NULL before the string is handed to strchr(); the check
	 * used to come too late to prevent the NULL dereference. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int charlen = utf8charlen(string);

		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
|
|
|
|
|
2006-03-04 18:10:33 -05:00
|
|
|
/*
 * Count the number of bytes from the beginning of @string that are
 * needed for displaying at most @max_cells terminal cells.
 *
 * If @end is NULL, the end of the NUL-terminated string is used.
 *
 * @return the byte count, or -1 if @string is NULL or
 * utf8_char2cells() fails for some character.
 */
int
utf8_cells2bytes(const char *string, int max_cells, char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* Reject NULL before the string is handed to strchr(); the check
	 * used to come too late to prevent the NULL dereference. */
	if (!string)
		return -1;

	if (end == NULL)
		end = (char *)strchr(string, '\0');

	do {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		/* The comparison is unsigned, exactly as the implicit
		 * conversion made it before. */
		if (cells > (unsigned int)max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* Never report more bytes than the string has. */
		if (string + bytes > end) {
			bytes = end - string;
			break;
		}
	} while (1);

	return bytes;
}
|
2006-02-07 19:42:39 -05:00
|
|
|
|
2006-09-02 11:28:31 -04:00
|
|
|
/* Take @max steps forward from @string in the specified @way, but
|
|
|
|
* not going past @end. Return the resulting address. Store the
|
|
|
|
* number of steps taken to *@count, unless @count is NULL.
|
|
|
|
*
|
|
|
|
* This assumes the text is valid UTF-8, and @string and @end point to
|
|
|
|
* character boundaries. If not, it doesn't crash but the results may
|
|
|
|
* be inconsistent.
|
|
|
|
*
|
|
|
|
* This function can do some of the same jobs as utf8charlen(),
|
|
|
|
* utf8_cells2bytes(), and strlen_utf8(). */
|
2021-01-02 10:20:27 -05:00
|
|
|
char *
|
|
|
|
utf8_step_forward(char *string, char *end,
|
2006-09-02 11:28:31 -04:00
|
|
|
int max, enum utf8_step way, int *count)
|
|
|
|
{
|
|
|
|
int steps = 0;
|
2021-01-02 10:20:27 -05:00
|
|
|
char *current = string;
|
2006-09-02 11:28:31 -04:00
|
|
|
|
|
|
|
assert(string);
|
|
|
|
assert(max >= 0);
|
2006-09-02 20:08:56 -04:00
|
|
|
if_assert_failed goto invalid_arg;
|
2006-09-02 11:28:31 -04:00
|
|
|
if (end == NULL)
|
2022-01-18 14:30:48 -05:00
|
|
|
end = strchr(string, '\0');
|
2006-09-02 11:28:31 -04:00
|
|
|
|
|
|
|
switch (way) {
|
2006-11-12 07:51:18 -05:00
|
|
|
case UTF8_STEP_CHARACTERS:
|
2006-09-02 11:28:31 -04:00
|
|
|
while (steps < max && current < end) {
|
|
|
|
++current;
|
|
|
|
if (utf8_islead(*current))
|
|
|
|
++steps;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2006-11-12 07:51:18 -05:00
|
|
|
case UTF8_STEP_CELLS_FEWER:
|
|
|
|
case UTF8_STEP_CELLS_MORE:
|
2009-05-26 17:50:57 -04:00
|
|
|
while (steps < max && current < end) {
|
2006-09-02 11:28:31 -04:00
|
|
|
unicode_val_T u;
|
2021-01-02 10:20:27 -05:00
|
|
|
char *prev = current;
|
2006-09-02 11:28:31 -04:00
|
|
|
int width;
|
|
|
|
|
2006-09-17 09:06:22 -04:00
|
|
|
u = utf8_to_unicode(¤t, end);
|
2006-09-02 11:28:31 -04:00
|
|
|
if (u == UCS_NO_CHAR) {
|
|
|
|
/* Assume the incomplete sequence
|
|
|
|
* costs one cell. */
|
|
|
|
current = end;
|
|
|
|
++steps;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
width = unicode_to_cell(u);
|
2006-11-12 07:51:18 -05:00
|
|
|
if (way == UTF8_STEP_CELLS_FEWER
|
2006-09-02 11:28:31 -04:00
|
|
|
&& steps + width > max) {
|
|
|
|
/* Back off. */
|
|
|
|
current = prev;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
steps += width;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
INTERNAL("impossible enum utf8_step");
|
|
|
|
}
|
|
|
|
|
2006-09-02 20:08:56 -04:00
|
|
|
invalid_arg:
|
2006-09-02 11:28:31 -04:00
|
|
|
if (count)
|
|
|
|
*count = steps;
|
|
|
|
return current;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Take @max steps backward from @string in the specified @way, but
|
|
|
|
* not going past @start. Return the resulting address. Store the
|
|
|
|
* number of steps taken to *@count, unless @count is NULL.
|
|
|
|
*
|
|
|
|
* This assumes the text is valid UTF-8, and @string and @start point
|
|
|
|
* to character boundaries. If not, it doesn't crash but the results
|
|
|
|
* may be inconsistent.
|
|
|
|
*
|
|
|
|
* This function can do some of the same jobs as utf8_prevchar(). */
|
2021-01-02 10:20:27 -05:00
|
|
|
char *
|
|
|
|
utf8_step_backward(char *string, char *start,
|
2006-09-02 11:28:31 -04:00
|
|
|
int max, enum utf8_step way, int *count)
|
|
|
|
{
|
|
|
|
int steps = 0;
|
2021-01-02 10:20:27 -05:00
|
|
|
char *current = string;
|
2006-09-02 11:28:31 -04:00
|
|
|
|
|
|
|
assert(string);
|
|
|
|
assert(start);
|
|
|
|
assert(max >= 0);
|
2006-09-02 20:08:56 -04:00
|
|
|
if_assert_failed goto invalid_arg;
|
2006-09-02 11:28:31 -04:00
|
|
|
|
|
|
|
switch (way) {
|
2006-11-12 07:51:18 -05:00
|
|
|
case UTF8_STEP_CHARACTERS:
|
2006-09-02 11:28:31 -04:00
|
|
|
while (steps < max && current > start) {
|
|
|
|
--current;
|
|
|
|
if (utf8_islead(*current))
|
|
|
|
++steps;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
2006-11-12 07:51:18 -05:00
|
|
|
case UTF8_STEP_CELLS_FEWER:
|
|
|
|
case UTF8_STEP_CELLS_MORE:
|
2006-09-02 11:28:31 -04:00
|
|
|
while (steps < max) {
|
2021-01-02 10:20:27 -05:00
|
|
|
char *prev = current;
|
|
|
|
char *look;
|
2006-09-02 11:28:31 -04:00
|
|
|
unicode_val_T u;
|
|
|
|
int width;
|
|
|
|
|
|
|
|
if (current <= start)
|
|
|
|
break;
|
|
|
|
do {
|
|
|
|
--current;
|
|
|
|
} while (current > start && !utf8_islead(*current));
|
|
|
|
|
|
|
|
look = current;
|
2006-09-17 09:06:22 -04:00
|
|
|
u = utf8_to_unicode(&look, prev);
|
2006-09-02 11:28:31 -04:00
|
|
|
if (u == UCS_NO_CHAR) {
|
|
|
|
/* Assume the incomplete sequence
|
|
|
|
* costs one cell. */
|
|
|
|
width = 1;
|
|
|
|
} else
|
|
|
|
width = unicode_to_cell(u);
|
|
|
|
|
2006-11-12 07:51:18 -05:00
|
|
|
if (way == UTF8_STEP_CELLS_FEWER
|
2006-09-02 11:28:31 -04:00
|
|
|
&& steps + width > max) {
|
|
|
|
/* Back off. */
|
|
|
|
current = prev;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
steps += width;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
default:
|
|
|
|
INTERNAL("impossible enum utf8_step");
|
|
|
|
}
|
|
|
|
|
2006-09-02 20:08:56 -04:00
|
|
|
invalid_arg:
|
2006-09-02 11:28:31 -04:00
|
|
|
if (count)
|
|
|
|
*count = steps;
|
|
|
|
return current;
|
|
|
|
}
|
|
|
|
|
2006-07-27 03:51:10 -04:00
|
|
|
/*
|
2006-02-07 19:42:39 -05:00
|
|
|
* Find out number of standard terminal columns needed for displaying symbol
|
|
|
|
* (glyph) which represents Unicode character c.
|
|
|
|
*
|
2006-10-21 17:05:37 -04:00
|
|
|
* TODO: Use wcwidth when it is available. This seems to require:
|
|
|
|
* - Make the configure script check whether <wchar.h> and wcwidth exist.
|
|
|
|
* - Define _XOPEN_SOURCE and include <wchar.h>.
|
|
|
|
* - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
|
|
|
|
* matches ISO 10646 in all locales.)
|
|
|
|
* However, these do not suffice, because wcwidth depends on LC_CTYPE
|
|
|
|
* in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
|
|
|
|
* is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
|
|
|
|
* <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
|
|
|
|
* U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
|
|
|
|
* character is apparently not supported in all locales. Why is that?
|
|
|
|
* - Perhaps there is standardese that requires supported characters
|
|
|
|
* to be convertible to multibyte form. Then ELinks could just pick
|
|
|
|
* some UTF-8 locale for its wcwidth purposes.
|
|
|
|
* - Perhaps wcwidth can even return different nonnegative values for
|
|
|
|
* the same ISO 10646 character in different locales. Then ELinks
|
|
|
|
* would have to set LC_CTYPE to match at least the terminal's
|
|
|
|
* charset (which may differ from the LC_CTYPE environment variable,
|
|
|
|
* especially when the master process is serving a slave terminal).
|
|
|
|
* But there is no guarantee that the libc supports all the same
|
|
|
|
* charsets as ELinks does.
|
|
|
|
* For now, it seems safest to avoid the potentially locale-dependent
|
|
|
|
* libc version of wcwidth, and instead use a hardcoded mapping.
|
|
|
|
*
|
2006-02-07 19:42:39 -05:00
|
|
|
* @return 2 for double-width glyph, 1 for others.
|
2017-05-16 12:37:29 -04:00
|
|
|
* 0 for unprintable glyphs (like 0x200e: "LEFT-TO-RIGHT MARK")
|
2006-02-07 19:42:39 -05:00
|
|
|
*/
|
2020-08-03 17:16:43 -04:00
|
|
|
|
|
|
|
#if 0
|
2009-03-28 14:15:08 -04:00
|
|
|
NONSTATIC_INLINE int
|
2006-02-07 19:42:39 -05:00
|
|
|
unicode_to_cell(unicode_val_T c)
|
|
|
|
{
|
2022-10-03 11:55:20 -04:00
|
|
|
int s;
|
|
|
|
|
|
|
|
if (!is_codepoint_supported(c)) {
|
|
|
|
BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, c, s);
|
|
|
|
if (s != -1) return strlen(unicode_7b[s].s);
|
|
|
|
}
|
|
|
|
|
2017-05-16 12:37:29 -04:00
|
|
|
if (c == 0x200e || c == 0x200f)
|
|
|
|
return 0;
|
2006-02-07 19:42:39 -05:00
|
|
|
if (c >= 0x1100
|
|
|
|
&& (c <= 0x115f /* Hangul Jamo */
|
|
|
|
|| c == 0x2329
|
|
|
|
|| c == 0x232a
|
|
|
|
|| (c >= 0x2e80 && c <= 0xa4cf
|
|
|
|
&& c != 0x303f) /* CJK ... Yi */
|
|
|
|
|| (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
|
|
|
|
|| (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
|
|
|
|
Ideographs */
|
|
|
|
|| (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
|
|
|
|
|| (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
|
|
|
|
|| (c >= 0xffe0 && c <= 0xffe6)
|
|
|
|
|| (c >= 0x20000 && c <= 0x2fffd)
|
|
|
|
|| (c >= 0x30000 && c <= 0x3fffd)))
|
|
|
|
return 2;
|
|
|
|
|
2006-07-27 03:51:10 -04:00
|
|
|
return 1;
|
2006-02-07 19:42:39 -05:00
|
|
|
}
|
2020-08-03 17:16:43 -04:00
|
|
|
#endif
|
2006-02-07 19:42:39 -05:00
|
|
|
|
2006-08-05 12:45:53 -04:00
|
|
|
/* Fold the case of a Unicode character, so that hotkeys in labels can
|
2006-08-13 16:41:48 -04:00
|
|
|
* be compared case-insensitively. It is unspecified whether the
|
2006-08-05 12:45:53 -04:00
|
|
|
* result will be in upper or lower case. */
|
|
|
|
unicode_val_T
|
|
|
|
unicode_fold_label_case(unicode_val_T c)
|
|
|
|
{
|
|
|
|
#if __STDC_ISO_10646__ && HAVE_WCTYPE_H
|
|
|
|
return towlower(c);
|
|
|
|
#else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
|
|
|
|
/* For now, this supports only ASCII. It would be possible to
|
|
|
|
* use code generated from CaseFolding.txt of Unicode if the
|
|
|
|
* acknowledgements required by http://www.unicode.org/copyright.html
|
|
|
|
* were added to associated documentation of ELinks. */
|
|
|
|
if (c >= 0x41 && c <= 0x5A)
|
|
|
|
return c + 0x20;
|
|
|
|
else
|
|
|
|
return c;
|
|
|
|
#endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
|
|
|
|
}
|
2008-10-18 06:51:04 -04:00
|
|
|
#endif /* CONFIG_UTF8 */
|
2006-08-05 12:45:53 -04:00
|
|
|
|
2009-03-28 14:15:08 -04:00
|
|
|
NONSTATIC_INLINE unicode_val_T
|
2021-01-02 10:20:27 -05:00
|
|
|
utf8_to_unicode(char **string, const char *end)
|
2006-01-14 16:44:00 -05:00
|
|
|
{
|
2021-01-17 15:56:40 -05:00
|
|
|
unsigned char *str = (unsigned char *)*string;
|
2006-01-14 16:44:00 -05:00
|
|
|
unicode_val_T u;
|
|
|
|
int length;
|
|
|
|
|
2021-01-17 15:56:40 -05:00
|
|
|
length = utf8char_len_tab[str[0]];
|
2006-01-14 16:44:00 -05:00
|
|
|
|
2021-01-17 15:56:40 -05:00
|
|
|
if (str + length > (const unsigned char *)end) {
|
2006-01-14 16:44:00 -05:00
|
|
|
return UCS_NO_CHAR;
|
2006-07-27 03:51:10 -04:00
|
|
|
}
|
2006-01-14 16:44:00 -05:00
|
|
|
|
|
|
|
switch (length) {
|
2006-12-22 18:48:07 -05:00
|
|
|
case 1: /* U+0000 to U+007F */
|
2006-12-19 02:31:55 -05:00
|
|
|
if (str[0] >= 0x80) {
|
|
|
|
invalid_utf8:
|
|
|
|
++*string;
|
|
|
|
return UCS_REPLACEMENT_CHARACTER;
|
|
|
|
}
|
2006-01-14 16:44:00 -05:00
|
|
|
u = str[0];
|
|
|
|
break;
|
2006-12-22 18:48:07 -05:00
|
|
|
case 2: /* U+0080 to U+07FF */
|
2006-12-19 02:31:55 -05:00
|
|
|
if ((str[1] & 0xc0) != 0x80)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
u = (str[0] & 0x1f) << 6;
|
|
|
|
u += (str[1] & 0x3f);
|
2006-12-19 02:31:55 -05:00
|
|
|
if (u < 0x80)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
break;
|
2006-12-22 18:48:07 -05:00
|
|
|
case 3: /* U+0800 to U+FFFF, except surrogates */
|
2006-12-19 02:31:55 -05:00
|
|
|
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
u = (str[0] & 0x0f) << 12;
|
|
|
|
u += ((str[1] & 0x3f) << 6);
|
|
|
|
u += (str[2] & 0x3f);
|
2006-12-22 18:48:07 -05:00
|
|
|
if (u < 0x800 || is_utf16_surrogate(u))
|
2006-12-19 02:31:55 -05:00
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
break;
|
2006-12-22 18:48:07 -05:00
|
|
|
case 4: /* U+10000 to U+1FFFFF */
|
2006-12-19 02:31:55 -05:00
|
|
|
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|
|
|
|
|| (str[3] & 0xc0) != 0x80)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
u = (str[0] & 0x0f) << 18;
|
|
|
|
u += ((str[1] & 0x3f) << 12);
|
|
|
|
u += ((str[2] & 0x3f) << 6);
|
|
|
|
u += (str[3] & 0x3f);
|
2006-12-19 02:31:55 -05:00
|
|
|
if (u < 0x10000)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
break;
|
2006-12-22 18:48:07 -05:00
|
|
|
case 5: /* U+200000 to U+3FFFFFF */
|
2006-12-19 02:31:55 -05:00
|
|
|
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|
|
|
|
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
u = (str[0] & 0x0f) << 24;
|
|
|
|
u += ((str[1] & 0x3f) << 18);
|
|
|
|
u += ((str[2] & 0x3f) << 12);
|
|
|
|
u += ((str[3] & 0x3f) << 6);
|
|
|
|
u += (str[4] & 0x3f);
|
2006-12-19 02:31:55 -05:00
|
|
|
if (u < 0x200000)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
break;
|
2006-12-22 18:48:07 -05:00
|
|
|
case 6: /* U+4000000 to U+7FFFFFFF */
|
2006-12-19 02:31:55 -05:00
|
|
|
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|
|
|
|
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
|
|
|
|
|| (str[5] & 0xc0) != 0x80)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
u = (str[0] & 0x01) << 30;
|
|
|
|
u += ((str[1] & 0x3f) << 24);
|
|
|
|
u += ((str[2] & 0x3f) << 18);
|
|
|
|
u += ((str[3] & 0x3f) << 12);
|
|
|
|
u += ((str[4] & 0x3f) << 6);
|
|
|
|
u += (str[5] & 0x3f);
|
2006-12-19 02:31:55 -05:00
|
|
|
if (u < 0x4000000)
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
break;
|
2006-12-19 02:31:55 -05:00
|
|
|
default:
|
|
|
|
INTERNAL("utf8char_len_tab out of range");
|
|
|
|
goto invalid_utf8;
|
2006-01-14 16:44:00 -05:00
|
|
|
}
|
2021-01-17 15:56:40 -05:00
|
|
|
*string = (char *)(str + length);
|
2006-07-18 14:33:34 -04:00
|
|
|
return u;
|
2006-01-14 16:44:00 -05:00
|
|
|
}
|
|
|
|
|
2006-09-24 09:55:29 -04:00
|
|
|
/* The common part of cp2u and cp2utf_8. */
|
2006-08-05 07:01:49 -04:00
|
|
|
static unicode_val_T
|
|
|
|
cp2u_shared(const struct codepage_desc *from, unsigned char c)
|
|
|
|
{
|
2006-09-24 09:55:29 -04:00
|
|
|
unicode_val_T u = from->highhalf[c - 0x80];
|
2006-09-28 18:07:54 -04:00
|
|
|
|
2006-09-24 09:55:29 -04:00
|
|
|
if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
|
|
|
|
return u;
|
2006-08-05 07:01:49 -04:00
|
|
|
}
|
|
|
|
|
2006-09-24 09:55:29 -04:00
|
|
|
/* Used for converting input from the terminal. */
|
2006-08-05 07:01:49 -04:00
|
|
|
unicode_val_T
|
|
|
|
cp2u(int from, unsigned char c)
|
|
|
|
{
|
|
|
|
from &= ~SYSTEM_CHARSET_FLAG;
|
|
|
|
|
|
|
|
/* UTF-8 is a multibyte codepage and cannot be handled with
|
|
|
|
* this function. */
|
2006-09-24 06:33:58 -04:00
|
|
|
assert(!is_cp_ptr_utf8(&codepages[from]));
|
2006-08-19 06:29:37 -04:00
|
|
|
if_assert_failed return UCS_REPLACEMENT_CHARACTER;
|
2006-08-05 07:01:49 -04:00
|
|
|
|
|
|
|
if (c < 0x80) return c;
|
|
|
|
else return cp2u_shared(&codepages[from], c);
|
|
|
|
}
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
/* This slow and ugly code is used by the terminal utf_8_io */
|
2021-01-02 10:20:27 -05:00
|
|
|
const char *
|
2006-09-17 09:06:22 -04:00
|
|
|
cp2utf8(int from, int c)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
|
|
|
from &= ~SYSTEM_CHARSET_FLAG;
|
|
|
|
|
2006-09-24 06:33:58 -04:00
|
|
|
if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
|
2005-09-15 09:58:31 -04:00
|
|
|
return strings[c];
|
|
|
|
|
2006-09-17 09:06:22 -04:00
|
|
|
return encode_utf8(cp2u_shared(&codepages[from], c));
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
|
2006-08-13 16:35:50 -04:00
|
|
|
unicode_val_T
|
2021-01-02 10:20:27 -05:00
|
|
|
cp_to_unicode(int codepage, char **string, const char *end)
|
2006-08-13 16:35:50 -04:00
|
|
|
{
|
2006-08-13 20:19:10 -04:00
|
|
|
unicode_val_T ret;
|
|
|
|
|
2006-08-13 16:35:50 -04:00
|
|
|
if (is_cp_utf8(codepage))
|
2006-09-17 09:06:22 -04:00
|
|
|
return utf8_to_unicode(string, end);
|
2006-08-13 20:19:10 -04:00
|
|
|
|
|
|
|
if (*string >= end)
|
|
|
|
return UCS_NO_CHAR;
|
|
|
|
|
|
|
|
ret = cp2u(codepage, **string);
|
|
|
|
++*string;
|
|
|
|
return ret;
|
2006-08-13 16:35:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2008-01-19 12:56:50 -05:00
|
|
|
#ifdef CONFIG_COMBINE
|
2008-01-03 06:57:24 -05:00
|
|
|
/* Most recently assigned combined code point.  It starts one below
 * UCS_BEGIN_COMBINED, i.e. nothing has been assigned yet. */
unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
/* combined[cp - UCS_BEGIN_COMBINED] is the sequence that code point cp
 * stands for, terminated by UCS_END_COMBINED. */
unicode_val_T **combined;
/* Maps a sequence (as raw bytes) to the code point assigned to it. */
struct hash *combined_hash;
|
|
|
|
|
|
|
|
unicode_val_T
|
|
|
|
get_combined(unicode_val_T *data, int length)
|
|
|
|
{
|
|
|
|
struct hash_item *item;
|
|
|
|
unicode_val_T *key;
|
|
|
|
int i, indeks;
|
|
|
|
|
|
|
|
assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
|
|
|
|
if_assert_failed return UCS_NO_CHAR;
|
|
|
|
|
|
|
|
if (!combined_hash) combined_hash = init_hash8();
|
|
|
|
if (!combined_hash) return UCS_NO_CHAR;
|
2021-01-02 10:20:27 -05:00
|
|
|
item = get_hash_item(combined_hash, (char *)data, length * sizeof(*data));
|
2008-01-03 06:57:24 -05:00
|
|
|
|
2022-08-17 14:48:58 -04:00
|
|
|
if (item) return (unicode_val_T)(intptr_t)item->value;
|
2008-01-03 06:57:24 -05:00
|
|
|
if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
|
|
|
|
|
2022-01-16 13:09:27 -05:00
|
|
|
key = (unicode_val_T *)mem_alloc((length + 1) * sizeof(*key));
|
2008-01-03 06:57:24 -05:00
|
|
|
if (!key) return UCS_NO_CHAR;
|
|
|
|
for (i = 0; i < length; i++)
|
|
|
|
key[i] = data[i];
|
|
|
|
key[i] = UCS_END_COMBINED;
|
|
|
|
|
|
|
|
last_combined++;
|
|
|
|
indeks = last_combined - UCS_BEGIN_COMBINED;
|
|
|
|
|
2022-01-16 13:38:30 -05:00
|
|
|
combined = (unicode_val_T **)mem_realloc(combined, sizeof(*combined) * (indeks + 1));
|
2008-01-03 06:57:24 -05:00
|
|
|
if (!combined) {
|
|
|
|
mem_free(key);
|
|
|
|
last_combined--;
|
|
|
|
return UCS_NO_CHAR;
|
|
|
|
}
|
|
|
|
combined[indeks] = key;
|
2021-01-02 10:20:27 -05:00
|
|
|
item = add_hash_item(combined_hash, (char *)key,
|
2022-08-17 14:48:58 -04:00
|
|
|
length * sizeof(*data), (void *)(intptr_t)(last_combined));
|
2008-01-03 06:57:24 -05:00
|
|
|
if (!item) {
|
|
|
|
last_combined--;
|
|
|
|
mem_free(key);
|
|
|
|
return UCS_NO_CHAR;
|
|
|
|
}
|
|
|
|
return last_combined;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
free_combined()
|
|
|
|
{
|
|
|
|
int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
|
|
|
|
|
|
|
|
if (combined_hash)
|
|
|
|
free_hash(&combined_hash);
|
|
|
|
for (i = 0; i < end; i++)
|
|
|
|
mem_free(combined[i]);
|
|
|
|
mem_free_if(combined);
|
|
|
|
}
|
2008-01-19 12:56:50 -05:00
|
|
|
#endif /* CONFIG_COMBINE */
|
|
|
|
|
2008-01-03 06:57:24 -05:00
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
static void
|
2021-01-02 10:20:27 -05:00
|
|
|
add_utf8(struct conv_table *ct, unicode_val_T u, const char *str)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
2021-01-02 10:20:27 -05:00
|
|
|
unsigned char *p = (unsigned char *)encode_utf8(u);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
while (p[1]) {
|
|
|
|
if (ct[*p].t) ct = ct[*p].u.tbl;
|
|
|
|
else {
|
|
|
|
struct conv_table *nct;
|
|
|
|
|
2022-01-16 15:08:50 -05:00
|
|
|
nct = (struct conv_table *)mem_calloc(256, sizeof(*nct));
|
2005-09-15 09:58:31 -04:00
|
|
|
if (!nct) return;
|
|
|
|
new_translation_table(nct);
|
|
|
|
ct[*p].t = 1;
|
|
|
|
ct[*p].u.tbl = nct;
|
|
|
|
ct = nct;
|
|
|
|
}
|
|
|
|
p++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ct[*p].u.str == no_str)
|
|
|
|
ct[*p].u.str = str;
|
|
|
|
}
|
|
|
|
|
2007-01-01 17:54:14 -05:00
|
|
|
/* A conversion table from some charset to UTF-8.
|
|
|
|
* If it is from UTF-8 to UTF-8, it converts each byte separately.
|
|
|
|
* Unlike in other translation tables, the strings in elements 0x80 to
|
|
|
|
* 0xFF are allocated dynamically. */
|
2005-09-15 09:58:31 -04:00
|
|
|
struct conv_table utf_table[256];
|
|
|
|
int utf_table_init = 1;
|
|
|
|
|
|
|
|
static void
|
|
|
|
free_utf_table(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2007-01-01 18:31:22 -05:00
|
|
|
/* Cast away const. */
|
2005-09-15 09:58:31 -04:00
|
|
|
for (i = 128; i < 256; i++)
|
2021-01-02 10:20:27 -05:00
|
|
|
mem_free((char *) utf_table[i].u.str);
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static struct conv_table *
|
2006-09-17 09:06:22 -04:00
|
|
|
get_translation_table_to_utf8(int from)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
static int lfr = -1;
|
|
|
|
|
|
|
|
if (from == -1) return NULL;
|
|
|
|
from &= ~SYSTEM_CHARSET_FLAG;
|
|
|
|
if (from == lfr) return utf_table;
|
2006-07-31 13:21:10 -04:00
|
|
|
lfr = from;
|
2007-03-05 15:10:02 -05:00
|
|
|
if (utf_table_init) {
|
|
|
|
memset(utf_table, 0, sizeof(utf_table));
|
2005-09-15 09:58:31 -04:00
|
|
|
utf_table_init = 0;
|
2007-03-05 15:10:02 -05:00
|
|
|
} else
|
2005-09-15 09:58:31 -04:00
|
|
|
free_utf_table();
|
|
|
|
|
|
|
|
for (i = 0; i < 128; i++)
|
|
|
|
utf_table[i].u.str = strings[i];
|
|
|
|
|
2006-09-24 06:33:58 -04:00
|
|
|
if (is_cp_ptr_utf8(&codepages[from])) {
|
2005-09-15 09:58:31 -04:00
|
|
|
for (i = 128; i < 256; i++)
|
|
|
|
utf_table[i].u.str = stracpy(strings[i]);
|
|
|
|
return utf_table;
|
|
|
|
}
|
|
|
|
|
2006-09-24 09:55:29 -04:00
|
|
|
for (i = 128; i < 256; i++) {
|
|
|
|
unicode_val_T u = codepages[from].highhalf[i - 0x80];
|
|
|
|
|
|
|
|
if (u == 0xFFFF)
|
|
|
|
utf_table[i].u.str = NULL;
|
|
|
|
else
|
|
|
|
utf_table[i].u.str = stracpy(encode_utf8(u));
|
|
|
|
}
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
for (i = 0; codepages[from].table[i].c; i++) {
|
|
|
|
unicode_val_T u = codepages[from].table[i].u;
|
|
|
|
|
|
|
|
if (!utf_table[codepages[from].table[i].c].u.str)
|
|
|
|
utf_table[codepages[from].table[i].c].u.str =
|
2006-09-17 09:06:22 -04:00
|
|
|
stracpy(encode_utf8(u));
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 128; i < 256; i++)
|
|
|
|
if (!utf_table[i].u.str)
|
|
|
|
utf_table[i].u.str = stracpy(no_str);
|
|
|
|
|
|
|
|
return utf_table;
|
|
|
|
}
|
|
|
|
|
2007-01-01 17:54:14 -05:00
|
|
|
/* A conversion table between two charsets, where the target is not UTF-8. */
|
2007-01-01 17:58:38 -05:00
|
|
|
static struct conv_table table[256];
|
2005-09-15 09:58:31 -04:00
|
|
|
static int first = 1;
|
|
|
|
|
|
|
|
void
|
|
|
|
free_conv_table(void)
|
|
|
|
{
|
|
|
|
if (!utf_table_init) free_utf_table();
|
|
|
|
if (first) {
|
|
|
|
memset(table, 0, sizeof(table));
|
|
|
|
first = 0;
|
|
|
|
}
|
|
|
|
new_translation_table(table);
|
2010-07-23 13:59:59 -04:00
|
|
|
#ifdef HAVE_ICONV
|
|
|
|
if (iconv_cd != (iconv_t)-1) {
|
|
|
|
iconv_close(iconv_cd);
|
|
|
|
iconv_cd = (iconv_t)-1;
|
|
|
|
}
|
|
|
|
#endif
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
struct conv_table *
|
|
|
|
get_translation_table(int from, int to)
|
|
|
|
{
|
|
|
|
static int lfr = -1;
|
|
|
|
static int lto = -1;
|
|
|
|
|
|
|
|
from &= ~SYSTEM_CHARSET_FLAG;
|
|
|
|
to &= ~SYSTEM_CHARSET_FLAG;
|
|
|
|
if (first) {
|
|
|
|
memset(table, 0, sizeof(table));
|
|
|
|
first = 0;
|
|
|
|
}
|
2010-07-23 13:59:59 -04:00
|
|
|
|
|
|
|
if (codepages[from].iconv) {
|
2010-07-24 03:57:59 -04:00
|
|
|
struct conv_table *table2 = get_translation_table_to_utf8(34);
|
2010-07-23 13:59:59 -04:00
|
|
|
|
2010-07-24 03:57:59 -04:00
|
|
|
if (table2) table2->iconv_cp = from;
|
|
|
|
return table2;
|
2010-07-23 13:59:59 -04:00
|
|
|
}
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
if (/*from == to ||*/ from == -1 || to == -1)
|
|
|
|
return NULL;
|
2010-07-24 03:57:59 -04:00
|
|
|
if (is_cp_ptr_utf8(&codepages[to])) {
|
|
|
|
struct conv_table *table2 = get_translation_table_to_utf8(from);
|
|
|
|
|
|
|
|
if (table2) table2->iconv_cp = -1;
|
|
|
|
return table2;
|
|
|
|
}
|
2005-09-15 09:58:31 -04:00
|
|
|
if (from == lfr && to == lto)
|
|
|
|
return table;
|
|
|
|
lfr = from;
|
|
|
|
lto = to;
|
|
|
|
new_translation_table(table);
|
|
|
|
|
2006-09-24 06:33:58 -04:00
|
|
|
if (is_cp_ptr_utf8(&codepages[from])) {
|
2005-09-15 09:58:31 -04:00
|
|
|
int i;
|
|
|
|
|
2007-04-26 14:39:46 -04:00
|
|
|
/* Map U+00A0 and U+00AD the same way as u2cp() would. */
|
|
|
|
add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
|
|
|
|
add_utf8(table, UCS_SOFT_HYPHEN, "");
|
|
|
|
|
2006-09-24 09:55:29 -04:00
|
|
|
for (i = 0x80; i <= 0xFF; i++)
|
|
|
|
if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
|
|
|
|
add_utf8(table,
|
|
|
|
codepages[to].highhalf[i - 0x80],
|
|
|
|
strings[i]);
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
for (i = 0; codepages[to].table[i].c; i++)
|
2006-09-17 09:06:22 -04:00
|
|
|
add_utf8(table, codepages[to].table[i].u,
|
|
|
|
strings[codepages[to].table[i].c]);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
for (i = 0; unicode_7b[i].x != -1; i++)
|
|
|
|
if (unicode_7b[i].x >= 0x80)
|
2006-09-17 09:06:22 -04:00
|
|
|
add_utf8(table, unicode_7b[i].x,
|
|
|
|
unicode_7b[i].s);
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
} else {
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 128; i < 256; i++) {
|
2006-09-24 09:55:29 -04:00
|
|
|
if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
|
2021-01-02 10:20:27 -05:00
|
|
|
const char *u;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2006-09-24 09:55:29 -04:00
|
|
|
u = u2cp(codepages[from].highhalf[i - 0x80], to);
|
|
|
|
if (u) table[i].u.str = u;
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return table;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Compare the first @l2 characters of @s1 with the NUL-terminated @s2.
 * Returns 1 or -1 at the first differing character.  If all @l2 characters
 * match, returns 0 when @s2 ends right there and -1 when @s2 is longer. */
static inline int
xxstrcmp(const char *s1, const char *s2, int l2)
{
	for (; l2; l2--, s1++, s2++) {
		if (*s1 != *s2)
			return (*s1 > *s2) ? 1 : -1;
	}

	return *s2 ? -1 : 0;
}
|
|
|
|
|
|
|
|
/* Entity cache debugging switch: turn the "#if 0" below into "#if 1" to
 * define DEBUG_ENTITY_CACHE, which makes get_entity_string() report its
 * cache activity on stderr. */
#if 0
#define DEBUG_ENTITY_CACHE
#else
#undef DEBUG_ENTITY_CACHE
#endif
|
|
|
|
|
|
|
|
/* One remembered entity lookup of get_entity_string(). */
struct entity_cache {
	unsigned int hits;	/* how often this entry has been used */
	int strlen;		/* length of the entity name in str[] */
	int encoding;		/* codepage the result was produced for */
	const char *result;	/* the remembered lookup result */
	char str[20];		/* entity name; suffices in any case */
};

/* Comparison function for qsort(): orders entries by descending hit
 * count, so the most frequently used entries come first. */
static int
hits_cmp(const void *v1, const void *v2)
{
	const struct entity_cache *lhs = (const struct entity_cache *)v1;
	const struct entity_cache *rhs = (const struct entity_cache *)v2;

	if (lhs->hits > rhs->hits) return -1;
	if (lhs->hits < rhs->hits) return 1;
	return 0;
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
compare_entities(const void *key_, const void *element_)
|
|
|
|
{
|
2019-04-21 06:27:40 -04:00
|
|
|
struct string *key = (struct string *) key_;
|
2005-09-15 09:58:31 -04:00
|
|
|
struct entity *element = (struct entity *) element_;
|
|
|
|
int length = key->length;
|
2022-01-31 10:47:50 -05:00
|
|
|
const char *first = key->source;
|
|
|
|
const char *second = element->s;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
return xxstrcmp(first, second, length);
|
|
|
|
}
|
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
const char *
|
|
|
|
get_entity_string(const char *str, const int strlen, int encoding)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
|
|
|
#define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
|
|
|
|
#define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
|
|
|
|
will go in [0] table */
|
|
|
|
static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
|
|
|
|
static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
|
2006-01-14 16:44:00 -05:00
|
|
|
unsigned int slen = 0;
|
2021-01-02 10:20:27 -05:00
|
|
|
const char *result = NULL;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2008-12-27 00:32:36 -05:00
|
|
|
/* Note that an object of static storage duration is automatically
|
|
|
|
* initialised to zero in C. */
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
if (strlen <= 0) return NULL;
|
|
|
|
|
2006-09-17 09:12:47 -04:00
|
|
|
#ifdef CONFIG_UTF8
|
2006-01-14 16:44:00 -05:00
|
|
|
/* TODO: caching UTF-8 */
|
|
|
|
encoding &= ~SYSTEM_CHARSET_FLAG;
|
2006-09-24 06:33:58 -04:00
|
|
|
if (is_cp_ptr_utf8(&codepages[encoding]))
|
2006-01-14 16:44:00 -05:00
|
|
|
goto skip;
|
2006-09-17 09:12:47 -04:00
|
|
|
#endif /* CONFIG_UTF8 */
|
2006-01-14 16:44:00 -05:00
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
/* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
|
|
|
|
* + google + slashdot + websites that result from a search for test on google,
|
2008-01-26 23:09:18 -05:00
|
|
|
* + various ones) show quite impressive improvment:
|
2005-09-15 09:58:31 -04:00
|
|
|
* Top ten is:
|
|
|
|
* 0: hits=2459 l=4 st='nbsp'
|
|
|
|
* 1: hits=2152 l=6 st='eacute'
|
|
|
|
* 2: hits=235 l=6 st='egrave'
|
|
|
|
* 3: hits=136 l=6 st='agrave'
|
|
|
|
* 4: hits=100 l=3 st='amp'
|
|
|
|
* 5: hits=40 l=5 st='laquo'
|
|
|
|
* 6: hits=8 l=4 st='copy'
|
|
|
|
* 7: hits=5 l=2 st='gt'
|
|
|
|
* 8: hits=2 l=2 st='lt'
|
|
|
|
* 9: hits=1 l=6 st='middot'
|
|
|
|
*
|
|
|
|
* Most of the time cache hit ratio is near 95%.
|
|
|
|
*
|
|
|
|
* A long test shows: 15186 hits vs. 24 misses and mean iteration
|
|
|
|
* count is kept < 2 (worst case 1.58). Not so bad ;)
|
|
|
|
*
|
|
|
|
* --Zas */
|
|
|
|
|
|
|
|
/* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
|
|
|
|
slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
|
|
|
|
|
|
|
|
if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < nb_entity_cache[slen]; i++) {
|
|
|
|
if (entity_cache[slen][i].encoding == encoding
|
|
|
|
&& !memcmp(str, entity_cache[slen][i].str, strlen)) {
|
|
|
|
#ifdef DEBUG_ENTITY_CACHE
|
|
|
|
static double total_iter = 0;
|
|
|
|
static unsigned long hit_count = 0;
|
|
|
|
|
|
|
|
total_iter += i + 1;
|
|
|
|
hit_count++;
|
|
|
|
fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
|
|
|
|
#endif
|
|
|
|
if (entity_cache[slen][i].hits < (unsigned int) ~0)
|
|
|
|
entity_cache[slen][i].hits++;
|
|
|
|
return entity_cache[slen][i].result;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#ifdef DEBUG_ENTITY_CACHE
|
|
|
|
fprintf(stderr, "miss\n");
|
|
|
|
#endif
|
|
|
|
}
|
2006-09-17 09:12:47 -04:00
|
|
|
#ifdef CONFIG_UTF8
|
2006-01-14 16:44:00 -05:00
|
|
|
skip:
|
2006-09-17 09:12:47 -04:00
|
|
|
#endif /* CONFIG_UTF8 */
|
2005-09-15 09:58:31 -04:00
|
|
|
if (*str == '#') { /* Numeric entity. */
|
|
|
|
int l = (int) strlen;
|
2021-01-02 10:20:27 -05:00
|
|
|
char *st = (char *) str;
|
2005-09-15 09:58:31 -04:00
|
|
|
unicode_val_T n = 0;
|
|
|
|
|
|
|
|
if (l == 1) goto end; /* &#; ? */
|
|
|
|
st++, l--;
|
|
|
|
if ((*st | 32) == 'x') { /* Hexadecimal */
|
|
|
|
|
|
|
|
if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
|
|
|
|
st++, l--;
|
|
|
|
do {
|
|
|
|
unsigned char c = (*(st++) | 32);
|
|
|
|
|
|
|
|
if (isdigit(c))
|
|
|
|
n = (n << 4) | (c - '0');
|
|
|
|
else if (isxdigit(c))
|
|
|
|
n = (n << 4) | (c - 'a' + 10);
|
|
|
|
else
|
|
|
|
goto end; /* Bad char. */
|
|
|
|
} while (--l);
|
|
|
|
} else { /* Decimal */
|
|
|
|
if (l > 10) goto end; /* 4294967295 max. */
|
|
|
|
do {
|
|
|
|
unsigned char c = *(st++);
|
|
|
|
|
|
|
|
if (isdigit(c))
|
|
|
|
n = n * 10 + c - '0';
|
|
|
|
else
|
|
|
|
goto end; /* Bad char. */
|
|
|
|
/* Limit to 0xFFFFFFFF. */
|
|
|
|
if (n >= (unicode_val_T) 0xFFFFFFFFu)
|
|
|
|
goto end;
|
|
|
|
} while (--l);
|
|
|
|
}
|
|
|
|
|
|
|
|
result = u2cp(n, encoding);
|
|
|
|
|
|
|
|
#ifdef DEBUG_ENTITY_CACHE
|
|
|
|
fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
|
|
|
|
#endif
|
|
|
|
} else { /* Text entity. */
|
2021-01-02 10:20:27 -05:00
|
|
|
struct string key = INIT_STRING((char *) str, strlen);
|
2022-01-25 12:16:12 -05:00
|
|
|
struct entity *element = (struct entity *)bsearch((void *) &key, entities,
|
2005-09-15 09:58:31 -04:00
|
|
|
N_ENTITIES,
|
|
|
|
sizeof(*element),
|
|
|
|
compare_entities);
|
|
|
|
|
|
|
|
if (element) result = u2cp(element->c, encoding);
|
|
|
|
}
|
|
|
|
|
2006-09-17 09:12:47 -04:00
|
|
|
#ifdef CONFIG_UTF8
|
2006-09-24 06:33:58 -04:00
|
|
|
if (is_cp_ptr_utf8(&codepages[encoding])) {
|
2006-01-14 16:44:00 -05:00
|
|
|
return result;
|
|
|
|
}
|
2006-09-17 09:12:47 -04:00
|
|
|
#endif /* CONFIG_UTF8 */
|
2005-09-15 09:58:31 -04:00
|
|
|
end:
|
|
|
|
/* Take care of potential buffer overflow. */
|
|
|
|
if (strlen < sizeof(entity_cache[slen][0].str)) {
|
2007-05-01 04:23:25 -04:00
|
|
|
struct entity_cache *ece;
|
|
|
|
|
|
|
|
/* Sort entries by hit order. */
|
|
|
|
if (nb_entity_cache[slen] > 1)
|
|
|
|
qsort(&entity_cache[slen][0], nb_entity_cache[slen],
|
Don't cast qsort comparison function pointers.
Instead, convert the element pointers inside the comparison functions.
The last argument of qsort() is supposed to be of type
int (*)(const void *, const void *). Previously, comp_links() was
defined to take struct link * instead of const void *, and the type
mismatch was silenced by casting the function pointer to void *.
This was in principle not portable because:
(1) The different pointer types may have different representations.
In a word-oriented machine, the const void * might include a byte
selector while the struct link * might not.
(2) Casting a function pointer to a data pointer can lose bits in some
memory models. Apparently this does not occur in POSIX-conforming
systems though, as dlsym() would fail if it did.
This commit also fixes hits_cmp() and compare_dir_entries(), which
had similar problems. However, I'm leaving alias_compare() in
src/intl/gettext/localealias.c unchanged for now, so as not to diverge
from the GNU version.
I also checked the bsearch() calls but they were all okay, apart from
one that used the alias_compare() mentioned above.
2007-10-05 22:59:20 -04:00
|
|
|
sizeof(entity_cache[slen][0]), hits_cmp);
|
2007-05-01 04:23:25 -04:00
|
|
|
|
|
|
|
/* Increment number of cache entries if possible.
|
|
|
|
* Else, just replace the least used entry. */
|
|
|
|
if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
|
|
|
|
ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
/* Copy new entry to cache. */
|
|
|
|
ece->hits = 1;
|
|
|
|
ece->strlen = strlen;
|
|
|
|
ece->encoding = encoding;
|
|
|
|
ece->result = result;
|
|
|
|
memcpy(ece->str, str, strlen);
|
|
|
|
ece->str[strlen] = '\0';
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef DEBUG_ENTITY_CACHE
|
|
|
|
fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
|
|
|
|
entity_cache[slen][0].strlen, entity_cache[slen][0].str);
|
|
|
|
|
|
|
|
{
|
|
|
|
unsigned int i;
|
|
|
|
|
|
|
|
fprintf(stderr, "- Cache entries [%u] -\n", slen);
|
|
|
|
for (i = 0; i < nb_entity_cache[slen] ; i++)
|
|
|
|
fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
|
|
|
|
entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
|
|
|
|
entity_cache[slen][i].str);
|
|
|
|
fprintf(stderr, "-----------------\n");
|
|
|
|
}
|
2007-05-01 04:23:25 -04:00
|
|
|
#endif /* DEBUG_ENTITY_CACHE */
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
char *
|
2005-09-15 09:58:31 -04:00
|
|
|
convert_string(struct conv_table *convert_table,
|
2022-01-31 09:46:05 -05:00
|
|
|
const char *chars2, int charslen2, int cp,
|
2005-09-15 09:58:31 -04:00
|
|
|
enum convert_string_mode mode, int *length,
|
2021-01-02 10:20:27 -05:00
|
|
|
void (*callback)(void *data, char *buf, int buflen),
|
2005-09-15 09:58:31 -04:00
|
|
|
void *callback_data)
|
|
|
|
{
|
2021-01-02 10:20:27 -05:00
|
|
|
char *buffer;
|
2005-09-15 09:58:31 -04:00
|
|
|
int bufferpos = 0;
|
|
|
|
int charspos = 0;
|
2021-01-02 10:20:27 -05:00
|
|
|
unsigned char *chars = (unsigned char *)chars2;
|
2010-07-23 13:59:59 -04:00
|
|
|
int charslen = charslen2;
|
|
|
|
|
|
|
|
#ifdef HAVE_ICONV
|
|
|
|
static char iconv_input[256];
|
|
|
|
static char iconv_output[256 * 8];
|
|
|
|
static size_t iconv_offset;
|
|
|
|
static int iconv_cp;
|
|
|
|
static size_t iconv_inleft;
|
|
|
|
size_t iconv_outleft = 256 * 8;
|
|
|
|
int loop = 0;
|
|
|
|
int is_iconv = 0;
|
|
|
|
int chars_offset = 0;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
if (!convert_table && !memchr((char *)chars, '&', charslen)) {
|
2005-09-15 09:58:31 -04:00
|
|
|
if (callback) {
|
2021-01-02 10:20:27 -05:00
|
|
|
if (charslen) callback(callback_data, (char *)chars, charslen);
|
2005-09-15 09:58:31 -04:00
|
|
|
return NULL;
|
|
|
|
} else {
|
2021-01-02 10:20:27 -05:00
|
|
|
return memacpy((char *)chars, charslen);
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-07-23 13:59:59 -04:00
|
|
|
if (cp >= 0) {
|
|
|
|
if (convert_table && convert_table->iconv_cp > 0) {
|
|
|
|
is_iconv = 1;
|
|
|
|
cp = convert_table->iconv_cp;
|
|
|
|
} else {
|
|
|
|
is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
/* Buffer allocation */
|
|
|
|
|
2022-01-16 13:09:27 -05:00
|
|
|
buffer = (char *)mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
|
2005-09-15 09:58:31 -04:00
|
|
|
if (!buffer) return NULL;
|
|
|
|
|
2010-07-23 13:59:59 -04:00
|
|
|
#ifdef HAVE_ICONV
|
|
|
|
if (is_iconv) {
|
|
|
|
int v;
|
|
|
|
size_t before, to_copy;
|
|
|
|
char *outp, *inp;
|
|
|
|
|
2021-05-26 02:27:17 -04:00
|
|
|
if (iconv_cd >= (iconv_t)0) {
|
2010-07-23 13:59:59 -04:00
|
|
|
if (cp != iconv_cp) {
|
|
|
|
iconv_close(iconv_cd);
|
|
|
|
iconv_cd = (iconv_t)-1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (iconv_cd == (iconv_t)-1) {
|
|
|
|
iconv_offset = 0;
|
|
|
|
iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp));
|
|
|
|
if (iconv_cd == (iconv_t)-1) {
|
|
|
|
mem_free(buffer);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
iconv_cp = cp;
|
|
|
|
}
|
|
|
|
repeat:
|
|
|
|
to_copy = charslen2 - chars_offset;
|
|
|
|
if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset;
|
2014-02-13 22:27:44 -05:00
|
|
|
memcpy(iconv_input + iconv_offset, chars2 + chars_offset, to_copy);
|
2010-07-23 13:59:59 -04:00
|
|
|
iconv_outleft = 256 * 8;
|
|
|
|
iconv_inleft = iconv_offset + to_copy;
|
|
|
|
inp = iconv_input;
|
|
|
|
outp = iconv_output;
|
|
|
|
before = iconv_inleft;
|
2013-02-27 03:33:55 -05:00
|
|
|
|
2010-07-23 13:59:59 -04:00
|
|
|
v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft);
|
|
|
|
chars_offset += before - iconv_inleft;
|
|
|
|
charslen = 256 * 8 - iconv_outleft;
|
|
|
|
|
|
|
|
chars = (unsigned char *)iconv_output;
|
|
|
|
charspos = 0;
|
|
|
|
|
|
|
|
if (v == -1) {
|
|
|
|
switch (errno) {
|
|
|
|
case EINVAL:
|
|
|
|
memcpy(iconv_input, inp, iconv_inleft);
|
|
|
|
iconv_offset = iconv_inleft;
|
|
|
|
break;
|
|
|
|
case EILSEQ:
|
2013-02-27 03:33:55 -05:00
|
|
|
loop = 0;
|
|
|
|
goto out;
|
2010-07-23 13:59:59 -04:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
iconv_offset = 0;
|
2013-02-27 03:33:55 -05:00
|
|
|
}
|
2010-07-23 13:59:59 -04:00
|
|
|
} else {
|
|
|
|
iconv_offset = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
loop = chars_offset < charslen2;
|
|
|
|
}
|
|
|
|
#endif
|
2005-09-15 09:58:31 -04:00
|
|
|
/* Iterate ;-) */
|
|
|
|
|
2013-02-27 03:33:55 -05:00
|
|
|
out:
|
2005-09-15 09:58:31 -04:00
|
|
|
while (charspos < charslen) {
|
2021-01-02 10:20:27 -05:00
|
|
|
const char *translit;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
#define PUTC do { \
|
|
|
|
buffer[bufferpos++] = chars[charspos++]; \
|
|
|
|
translit = ""; \
|
|
|
|
goto flush; \
|
|
|
|
} while (0)
|
|
|
|
|
|
|
|
if (chars[charspos] != '&') {
|
|
|
|
struct conv_table *t;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (chars[charspos] < 128 || !convert_table) PUTC;
|
|
|
|
|
|
|
|
t = convert_table;
|
|
|
|
i = charspos;
|
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
while (t[(unsigned char)chars[i]].t) {
|
|
|
|
t = t[(unsigned char)chars[i++]].u.tbl;
|
2005-09-15 09:58:31 -04:00
|
|
|
if (i >= charslen) PUTC;
|
|
|
|
}
|
|
|
|
|
2021-01-02 10:20:27 -05:00
|
|
|
translit = t[(unsigned char)chars[i]].u.str;
|
2005-09-15 09:58:31 -04:00
|
|
|
charspos = i + 1;
|
|
|
|
|
|
|
|
} else if (mode == CSM_FORM || mode == CSM_NONE) {
|
|
|
|
PUTC;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
int start = charspos + 1;
|
|
|
|
int i = start;
|
|
|
|
|
|
|
|
while (i < charslen
|
|
|
|
&& (isasciialpha(chars[i])
|
|
|
|
|| isdigit(chars[i])
|
|
|
|
|| (chars[i] == '#')))
|
|
|
|
i++;
|
|
|
|
|
|
|
|
/* This prevents bug 213: we were expanding "entities"
|
|
|
|
* in URL query strings. */
|
|
|
|
/* XXX: But this disables    usage, which
|
|
|
|
* appears to be relatively common! --pasky */
|
|
|
|
if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
|
|
|
|
&& i > start
|
|
|
|
&& !isasciialpha(chars[i]) && !isdigit(chars[i])) {
|
2021-01-02 10:20:27 -05:00
|
|
|
translit = get_entity_string((const char *)&chars[start], i - start,
|
2005-09-15 09:58:31 -04:00
|
|
|
cp);
|
|
|
|
if (chars[i] != ';') {
|
|
|
|
/* Eat    <foo> happily, but
|
|
|
|
* pull back from the character after
|
|
|
|
* entity string if it is not the valid
|
|
|
|
* terminator. */
|
|
|
|
i--;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!translit) PUTC;
|
|
|
|
charspos = i + (i < charslen);
|
|
|
|
} else PUTC;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!translit[0]) continue;
|
|
|
|
|
|
|
|
if (!translit[1]) {
|
|
|
|
buffer[bufferpos++] = translit[0];
|
|
|
|
translit = "";
|
|
|
|
goto flush;
|
|
|
|
}
|
|
|
|
|
|
|
|
while (*translit) {
|
2021-01-02 10:20:27 -05:00
|
|
|
char *new_;
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
buffer[bufferpos++] = *(translit++);
|
|
|
|
flush:
|
|
|
|
if (bufferpos & (ALLOC_GR - 1)) continue;
|
|
|
|
|
|
|
|
if (callback) {
|
|
|
|
buffer[bufferpos] = 0;
|
|
|
|
callback(callback_data, buffer, bufferpos);
|
|
|
|
bufferpos = 0;
|
|
|
|
} else {
|
2022-01-16 13:38:30 -05:00
|
|
|
new_ = (char *)mem_realloc(buffer, bufferpos + ALLOC_GR);
|
2016-04-20 12:42:22 -04:00
|
|
|
if (!new_) {
|
2005-09-15 09:58:31 -04:00
|
|
|
mem_free(buffer);
|
|
|
|
return NULL;
|
|
|
|
}
|
2016-04-20 12:42:22 -04:00
|
|
|
buffer = new_;
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
#undef PUTC
|
|
|
|
}
|
|
|
|
|
2010-07-23 13:59:59 -04:00
|
|
|
#ifdef HAVE_ICONV
|
|
|
|
if (loop) goto repeat;
|
|
|
|
#endif
|
2005-09-15 09:58:31 -04:00
|
|
|
/* Say bye */
|
|
|
|
|
|
|
|
buffer[bufferpos] = 0;
|
|
|
|
if (length) *length = bufferpos;
|
|
|
|
|
|
|
|
if (callback) {
|
|
|
|
if (bufferpos) callback(callback_data, buffer, bufferpos);
|
|
|
|
mem_free(buffer);
|
|
|
|
return NULL;
|
|
|
|
} else {
|
|
|
|
return buffer;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef USE_FASTFIND
|
|
|
|
int
|
2021-01-02 10:20:27 -05:00
|
|
|
get_cp_index(const char *name)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
|
|
|
int i, a;
|
|
|
|
int syscp = 0;
|
|
|
|
|
2008-10-18 21:25:00 -04:00
|
|
|
if (!c_strcasecmp(name, "System")) {
|
2005-09-15 09:58:31 -04:00
|
|
|
#if HAVE_LANGINFO_CODESET
|
|
|
|
name = nl_langinfo(CODESET);
|
|
|
|
syscp = SYSTEM_CHARSET_FLAG;
|
2022-06-05 13:12:25 -04:00
|
|
|
#else
|
|
|
|
#ifdef CONFIG_OS_DOS
|
|
|
|
int cp = os_default_charset();
|
|
|
|
if (cp != -1) {
|
|
|
|
return cp | SYSTEM_CHARSET_FLAG;
|
|
|
|
} else {
|
|
|
|
name = "us-ascii";
|
|
|
|
}
|
2005-09-15 09:58:31 -04:00
|
|
|
#else
|
|
|
|
name = "us-ascii";
|
2022-06-05 13:12:25 -04:00
|
|
|
#endif
|
2005-09-15 09:58:31 -04:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
for (i = 0; codepages[i].name; i++) {
|
|
|
|
for (a = 0; codepages[i].aliases[a]; a++) {
|
|
|
|
/* In the past, we looked for the longest substring
|
|
|
|
* in all the names; it is way too expensive, though:
|
|
|
|
*
|
|
|
|
* % cumulative self self total
|
|
|
|
* time seconds seconds calls us/call us/call name
|
|
|
|
* 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
|
|
|
|
*
|
|
|
|
* Anything called from redraw_screen() is in fact
|
|
|
|
* relatively expensive, even if it's called just
|
|
|
|
* once. So we will do a simple strcasecmp() here.
|
|
|
|
*/
|
|
|
|
|
2008-10-18 21:25:00 -04:00
|
|
|
if (!c_strcasecmp(name, codepages[i].aliases[a]))
|
2005-09-15 09:58:31 -04:00
|
|
|
return i | syscp;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (syscp) {
|
|
|
|
return get_cp_index("us-ascii") | syscp;
|
|
|
|
} else {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
/* Cursor used by the charset enumeration: the codepages[] entry being
 * visited and the alias being visited inside that entry. */
static unsigned int i_name = 0;
static unsigned int i_alias = 0;

/* Rewind the enumeration cursor to the first alias of the first entry. */
void
charsets_list_reset(void)
{
	i_alias = 0;
	i_name = 0;
}
|
|
|
|
|
|
|
|
/* Returns a pointer to a struct that contains current key and data pointers
|
|
|
|
* and increment internal pointer. It returns NULL when key is NULL. */
|
|
|
|
struct fastfind_key_value *
|
|
|
|
charsets_list_next(void)
|
|
|
|
{
|
|
|
|
static struct fastfind_key_value kv;
|
|
|
|
|
|
|
|
if (!codepages[i_name].name) return NULL;
|
|
|
|
|
|
|
|
kv.key = codepages[i_name].aliases[i_alias];
|
2006-09-24 04:59:23 -04:00
|
|
|
kv.data = (void *) &codepages[i_name]; /* cast away const */
|
2005-09-15 09:58:31 -04:00
|
|
|
|
|
|
|
if (codepages[i_name].aliases[i_alias + 1])
|
|
|
|
i_alias++;
|
|
|
|
else {
|
|
|
|
i_name++;
|
|
|
|
i_alias = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return &kv;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct fastfind_index ff_charsets_index
|
|
|
|
= INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
|
|
|
|
|
|
|
|
/* It searchs for a charset named @name or one of its aliases and
|
|
|
|
* returns index for it or -1 if not found. */
|
|
|
|
int
|
2021-01-02 10:20:27 -05:00
|
|
|
get_cp_index(const char *name)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
2006-09-24 04:59:23 -04:00
|
|
|
const struct codepage_desc *codepage;
|
2005-09-15 09:58:31 -04:00
|
|
|
int syscp = 0;
|
|
|
|
|
2008-10-18 21:25:00 -04:00
|
|
|
if (!c_strcasecmp(name, "System")) {
|
2005-09-15 09:58:31 -04:00
|
|
|
#if HAVE_LANGINFO_CODESET
|
|
|
|
name = nl_langinfo(CODESET);
|
|
|
|
syscp = SYSTEM_CHARSET_FLAG;
|
2022-06-05 13:12:25 -04:00
|
|
|
#else
|
|
|
|
#ifdef CONFIG_OS_DOS
|
|
|
|
int cp = os_default_charset();
|
|
|
|
if (cp != -1) {
|
|
|
|
return cp | SYSTEM_CHARSET_FLAG;
|
|
|
|
} else {
|
|
|
|
name = "us-ascii";
|
|
|
|
}
|
2005-09-15 09:58:31 -04:00
|
|
|
#else
|
|
|
|
name = "us-ascii";
|
2022-06-05 13:12:25 -04:00
|
|
|
#endif
|
2005-09-15 09:58:31 -04:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2022-01-25 12:16:12 -05:00
|
|
|
codepage = (const struct codepage_desc *)fastfind_search(&ff_charsets_index, name, strlen(name));
|
2005-09-15 09:58:31 -04:00
|
|
|
if (codepage) {
|
|
|
|
assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
|
|
|
|
return (codepage - codepages) | syscp;
|
|
|
|
|
|
|
|
} else if (syscp) {
|
|
|
|
return get_cp_index("us-ascii") | syscp;
|
|
|
|
|
|
|
|
} else {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* USE_FASTFIND */
|
|
|
|
|
2022-10-03 11:55:20 -04:00
|
|
|
/* create the list of codepoints supported by the terminal */
|
|
|
|
|
|
|
|
#ifdef GIO_UNIMAP
|
|
|
|
/* qsort()-style comparator: reads both arguments as int and returns
 * -1, 0 or 1 when the first is smaller than, equal to or greater than
 * the second. */
int
cmpint(const void *a, const void *b)
{
	const int lhs = *(const int *) a;
	const int rhs = *(const int *) b;

	return (lhs > rhs) - (lhs < rhs);
}
|
|
|
|
|
|
|
|
void make_codepoints() {
|
|
|
|
int tty;
|
|
|
|
struct unimapdesc table;
|
|
|
|
int res;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
tty = get_ctl_handle();
|
|
|
|
if (tty == -1) {
|
|
|
|
codepoints.size = -1;
|
|
|
|
return ;
|
|
|
|
}
|
|
|
|
|
|
|
|
table.entry_ct = 0;
|
|
|
|
table.entries = NULL;
|
|
|
|
res = ioctl(tty, GIO_UNIMAP, &table);
|
|
|
|
if (res && errno != ENOMEM) {
|
|
|
|
#ifdef CONFIG_DEBUG
|
|
|
|
perror("GIO_UNIMAP");
|
|
|
|
#endif
|
|
|
|
codepoints.size = -1;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
table.entries = malloc(table.entry_ct * sizeof(struct unipair));
|
|
|
|
res = ioctl(tty, GIO_UNIMAP, &table);
|
|
|
|
if (res) {
|
|
|
|
#ifdef CONFIG_DEBUG
|
|
|
|
perror("GIO_UNIMAP");
|
|
|
|
#endif
|
|
|
|
close(tty);
|
|
|
|
codepoints.size = -1;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
codepoints.size = table.entry_ct;
|
|
|
|
codepoints.list = malloc(table.entry_ct * sizeof(unicode_val_T));
|
|
|
|
for (i = 0; i < table.entry_ct; i++)
|
|
|
|
codepoints.list[i] = table.entries[i].unicode;
|
|
|
|
|
|
|
|
qsort(codepoints.list, codepoints.size, sizeof(unicode_val_T), cmpint);
|
|
|
|
|
|
|
|
// for (i = 0; i < codepoints.size; i++)
|
|
|
|
// fprintf(stderr, "U+%04X\n", codepoints.list[i]);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
void make_codepoints() {
|
|
|
|
codepoints.size = -1;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
/* Prepare charset lookups: build the terminal codepoint list and, when
 * USE_FASTFIND is defined, build the compressed fastfind index of charset
 * names. */
void
init_charsets_lookup(void)
{
	make_codepoints();

#if defined(USE_FASTFIND)
	fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
|
|
|
|
|
|
|
|
/* Counterpart of init_charsets_lookup(): when USE_FASTFIND is defined,
 * release the fastfind index of charset names.  Does nothing otherwise. */
void
free_charsets_lookup(void)
{
#if defined(USE_FASTFIND)
	fastfind_done(&ff_charsets_index);
#endif
}
|
|
|
|
|
2007-03-20 14:41:05 -04:00
|
|
|
/* Get the codepage's name for displaying to the user, or NULL if
|
|
|
|
* @cp_index is one past the end. In the future, we might want to
|
|
|
|
* localize these with gettext. So it may be best not to use this
|
|
|
|
* function if the name will have to be converted back to an
|
|
|
|
* index. */
|
2022-02-15 11:53:24 -05:00
|
|
|
const char *
|
2005-09-15 09:58:31 -04:00
|
|
|
get_cp_name(int cp_index)
|
|
|
|
{
|
|
|
|
if (cp_index < 0) return "none";
|
|
|
|
if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
|
|
|
|
|
|
|
|
return codepages[cp_index].name;
|
|
|
|
}
|
|
|
|
|
2007-03-20 14:41:05 -04:00
|
|
|
/* Get the codepage's name for saving to a configuration file. These
|
|
|
|
* names can be converted back to indexes, even in future versions of
|
|
|
|
* ELinks. */
|
2022-02-18 09:39:59 -05:00
|
|
|
const char *
|
2007-03-20 14:41:05 -04:00
|
|
|
get_cp_config_name(int cp_index)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
|
|
|
if (cp_index < 0) return "none";
|
|
|
|
if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
|
|
|
|
if (!codepages[cp_index].aliases) return NULL;
|
|
|
|
|
2022-02-15 13:02:30 -05:00
|
|
|
return (char *)codepages[cp_index].aliases[0];
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
|
|
|
|
2007-03-20 14:41:05 -04:00
|
|
|
/* Get the codepage's name for sending to a library or server that
|
|
|
|
* understands MIME charset names. This function irreversibly maps
|
|
|
|
* the "System" codepage to the underlying charset. */
|
2022-02-18 09:45:47 -05:00
|
|
|
const char *
|
2007-03-20 14:41:05 -04:00
|
|
|
get_cp_mime_name(int cp_index)
|
|
|
|
{
|
|
|
|
if (cp_index < 0) return "none";
|
|
|
|
cp_index &= ~SYSTEM_CHARSET_FLAG;
|
|
|
|
if (!codepages[cp_index].aliases) return NULL;
|
|
|
|
|
2022-02-18 09:45:47 -05:00
|
|
|
return codepages[cp_index].aliases[0];
|
2007-03-20 14:41:05 -04:00
|
|
|
}
|
|
|
|
|
2005-09-15 09:58:31 -04:00
|
|
|
int
|
2006-07-18 11:51:03 -04:00
|
|
|
is_cp_utf8(int cp_index)
|
2005-09-15 09:58:31 -04:00
|
|
|
{
|
|
|
|
cp_index &= ~SYSTEM_CHARSET_FLAG;
|
2006-09-24 06:33:58 -04:00
|
|
|
return is_cp_ptr_utf8(&codepages[cp_index]);
|
2005-09-15 09:58:31 -04:00
|
|
|
}
|
2009-01-26 15:11:14 -05:00
|
|
|
|
|
|
|
/* This function will be used by the xhtml parser. */
|
|
|
|
const uint16_t *
|
2021-01-02 10:20:27 -05:00
|
|
|
get_cp_highhalf(const char *name)
|
2009-01-26 15:11:14 -05:00
|
|
|
{
|
|
|
|
int cp = get_cp_index(name);
|
|
|
|
|
|
|
|
if (cp < 0) return NULL;
|
|
|
|
cp &= ~SYSTEM_CHARSET_FLAG;
|
|
|
|
return codepages[cp].highhalf;
|
|
|
|
}
|