/* elinks/src/intl/charsets.c
 * (copy taken from the mirror of https://github.com/rkd77/elinks.git) */
/* Charsets convertor */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* strcasecmp() */
#endif
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if HAVE_LANGINFO_CODESET
#include <langinfo.h>
#endif
#include <ctype.h>
#include <stdlib.h>
#if HAVE_WCTYPE_H
#include <wctype.h>
#endif
#ifdef HAVE_ICONV
#include <errno.h>
#include <iconv.h>
#endif
#include "elinks.h"
#include "document/options.h"
#include "intl/charsets.h"
#include "util/conv.h"
#include "util/error.h"
#include "util/fastfind.h"
#include "util/hash.h"
#include "util/memory.h"
#include "util/string.h"
/* Fix namespace clash on MacOS. */
#define table table_elinks
/* One extra codepage-byte <-> Unicode pair, used for the mappings that
 * do not fit in the dense codepage_desc.highhalf array. */
struct table_entry {
	/* The codepage byte.  u2cp_() stops scanning a table at the
	 * first entry whose @c is zero. */
	unsigned char c;
	/* This should in principle be unicode_val_T, but because all
	 * the values currently in codepage.inc fit in 16 bits, we can
	 * as well use uint16_t and halve sizeof(struct table_entry)
	 * from 8 bytes to 4. Should other characters ever be needed,
	 * unicode_val_T u : 24 might be a possibility, although it
	 * seems a little unportable as bitfields are in principle
	 * restricted to int, which may be 16-bit. */
	uint16_t u;
};
/* Description of one codepage: its names and how its bytes map to
 * Unicode. */
struct codepage_desc {
	/* Name of the codepage. */
	const char *name;

	/* Alias names of the codepage.  The address of this array is
	 * also what is_cp_ptr_utf8() compares to recognize UTF-8.
	 * NOTE(review): termination convention of the array is defined
	 * in codepage.inc -- confirm there. */
	const char **aliases;

	/* The Unicode mappings of codepage bytes 0x80...0xFF.
	 * (0x00...0x7F are assumed to be ASCII in all codepages.)
	 * Because all current values fit in 16 bits, we store them as
	 * uint16_t rather than unicode_val_T.  If the codepage does
	 * not use some byte, then @highhalf maps that byte to 0xFFFF,
	 * which C code converts to UCS_REPLACEMENT_CHARACTER where
	 * appropriate.  (U+FFFF is reserved and will never be
	 * assigned as a character.) */
	const uint16_t *highhalf;

	/* If some byte in the codepage corresponds to multiple Unicode
	 * characters, then the preferred character is in @highhalf
	 * above, and the rest are listed here in @table.  This table
	 * is not used for translating from the codepage to Unicode. */
	const struct table_entry *table;

	/* Whether to use iconv for translation. */
	unsigned int iconv:1;
};
#include "intl/codepage.inc"
#include "intl/uni_7b.inc"
#include "intl/entity.inc"
/* Declare the external-linkage inline functions defined in this file.
* Avoid the GCC 4.3.1 warning: `foo' declared inline after being
* called. The functions are not declared inline in charsets.h
* because C99 6.7.4p6 says that every external-linkage function
* declared inline shall be defined in the same translation unit.
* The non-inline declarations in charsets.h also make sure that the
* compiler emits global definitions for the symbols so that the
* functions can be called from other translation units. */
NONSTATIC_INLINE char *encode_utf8(unicode_val_T u);
NONSTATIC_INLINE int utf8charlen(const char *p);
NONSTATIC_INLINE unicode_val_T utf8_to_unicode(char **string,
const char *end);
/* One-character, NUL-terminated strings indexed by byte value, so that
 * conversion functions can return a `const char *` for a single byte
 * without allocating.
 *
 * Entry N holds byte N, except entries 027 and 037, which hold "\033".
 * NOTE(review): those two exceptions are kept exactly as found; whether
 * they are intentional could not be determined from this file. */
static const char strings[256][2] = {
	"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
	"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
	"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
	"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
	"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
	"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
	"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
	"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
	"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
	"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
	"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
	"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
	"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
	"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
	"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
	"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
	"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
	"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
	"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
	"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
	"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
	"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
	"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
	"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
	"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
	"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
	"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
	"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
	"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
	"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
	"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
	"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
#ifdef HAVE_ICONV
static iconv_t iconv_cd = (iconv_t)-1;
#endif
static void
free_translation_table(struct conv_table *p)
{
int i;
for (i = 0; i < 256; i++)
if (p[i].t)
free_translation_table(p[i].u.tbl);
mem_free(p);
}
/* A string used in conversion tables when there is no correct
 * conversion.  This is compared by address and therefore should be a
 * named array rather than a pointer so that it won't share storage
 * with any other string literal that happens to have the same
 * characters. */
static const char no_str[] = "*";
/* Reset the 256-entry conversion table @p to its initial contents:
 * bytes 0..127 convert to themselves and bytes 128..255 convert to
 * no_str.  Nested tables referenced by the previous contents are
 * freed first, so @p must already hold valid entries on entry. */
static void
new_translation_table(struct conv_table *p)
{
	int byte;

	/* Drop the nested tables owned by the old contents. */
	for (byte = 0; byte < 256; byte++) {
		if (p[byte].t)
			free_translation_table(p[byte].u.tbl);
	}

	for (byte = 0; byte < 256; byte++) {
		p[byte].t = 0;
		if (byte < 128)
			p[byte].u.str = strings[byte];
		else
			p[byte].u.str = no_str;
	}

	p->iconv_cp = -1;
}
/* Binary search in the sorted array @table of @entries elements,
 * comparing the member @entry of each element with @key.
 * Stores the index of a matching element in @result, or -1 if there
 * is none.  Expands to a brace-enclosed block.
 *
 * The last line of the definition must not end in a backslash:
 * otherwise whatever follows the macro in the file gets spliced into
 * its body. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _s = 0, _e = (entries) - 1;				\
	long _found = -1;						\
									\
	while (_s <= _e) {						\
		long _m = _s + (_e - _s) / 2;				\
									\
		if ((table)[_m].entry == (key)) {			\
			_found = _m;					\
			break;						\
		}							\
		if ((table)[_m].entry > (key)) _e = _m - 1;		\
		else if ((table)[_m].entry < (key)) _s = _m + 1;	\
	}								\
	(result) = _found;						\
}
/* Substitutes for the code points 0x80...0x9F, indexed by (u - 0x80).
 * u2cp_() replaces such a code point with the entry found here before
 * converting it; a zero entry means there is no substitute and makes
 * u2cp_() return NULL.
 * NOTE(review): the non-zero values look like Windows-1252 assignments
 * mixed with ASCII approximations -- confirm the intended source. */
static const unicode_val_T strange_chars[32] = {
	0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
	0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
	0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
	0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
/* Bit that may be set in a codepage index; u2cp_() clears it before
 * using the index to look up the codepage. */
#define SYSTEM_CHARSET_FLAG 128

/* True if @cp_ptr (a struct codepage_desc pointer) describes UTF-8,
 * recognized by its alias list being aliases_utf8. */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
/* Convert the Unicode character @u to a string in the codepage with
 * index @to (SYSTEM_CHARSET_FLAG in @to is ignored).
 *
 * @nbsp_mode selects how U+00A0 is rendered in non-UTF-8 codepages:
 * NBSP_MODE_HACK gives NBSP_CHAR_STRING, anything else a plain space.
 *
 * Returns a pointer to storage the caller must not modify or free:
 * an entry of strings[], a string literal, the buffer of
 * encode_utf8() (overwritten by the next call to it), an entry of
 * unicode_7b, or no_str when no conversion was found.  Returns NULL
 * when @u is in 0x80...0x9F and strange_chars has no substitute. */
const char *
u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
{
	const struct codepage_desc *cp;
	int j;
	int s;

	/* ASCII is the same in every codepage. */
	if (u < 128) return strings[u];

	if (u < 0xa0) {
		u = strange_chars[u - 0x80];
		if (!u) return NULL;
	}

	to &= ~SYSTEM_CHARSET_FLAG;
	cp = &codepages[to];

	if (is_cp_ptr_utf8(cp))
		return encode_utf8(u);

	/* To mark non breaking spaces in non-UTF-8 strings, we use a
	 * special char NBSP_CHAR. */
	if (u == UCS_NO_BREAK_SPACE) {
		if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
		else /* NBSP_MODE_ASCII */ return " ";
	}
	if (u == UCS_SOFT_HYPHEN) return "";

	/* 0xFFFF in @highhalf marks an unused byte, so never match it. */
	if (u < 0xFFFF) {
		for (j = 0; j < 0x80; j++) {
			if (cp->highhalf[j] == u)
				return strings[0x80 + j];
		}
	}

	/* Secondary mappings; the list ends at the entry with c == 0. */
	for (j = 0; cp->table[j].c; j++) {
		if (cp->table[j].u == u)
			return strings[cp->table[j].c];
	}

	/* Last resort: the 7-bit approximation table. */
	BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
	if (s != -1) return unicode_7b[s].s;

	return no_str;
}
/* Result buffer of encode_utf8(): up to six bytes plus the NUL. */
static char utf_buffer[7];

/* Encode @u as a NUL-terminated UTF-8 sequence of one to six bytes.
 * The result lives in the file-static utf_buffer, so it is overwritten
 * by the next call. */
NONSTATIC_INLINE char *
encode_utf8(unicode_val_T u)
{
	char *out = utf_buffer;

	/* Clearing everything also supplies the terminating NUL. */
	memset(utf_buffer, 0, 7);

	if (u < 0x80) {
		out[0] = u;
	} else if (u < 0x800) {
		out[0] = 0xc0 | ((u >> 6) & 0x1f);
		out[1] = 0x80 | (u & 0x3f);
	} else if (u < 0x10000) {
		out[0] = 0xe0 | ((u >> 12) & 0x0f);
		out[1] = 0x80 | ((u >> 6) & 0x3f);
		out[2] = 0x80 | (u & 0x3f);
	} else if (u < 0x200000) {
		/* u >> 18 is at most 7 here. */
		out[0] = 0xf0 | ((u >> 18) & 0x07);
		out[1] = 0x80 | ((u >> 12) & 0x3f);
		out[2] = 0x80 | ((u >> 6) & 0x3f);
		out[3] = 0x80 | (u & 0x3f);
	} else if (u < 0x4000000) {
		/* u >> 24 is at most 3 here. */
		out[0] = 0xf8 | ((u >> 24) & 0x03);
		out[1] = 0x80 | ((u >> 18) & 0x3f);
		out[2] = 0x80 | ((u >> 12) & 0x3f);
		out[3] = 0x80 | ((u >> 6) & 0x3f);
		out[4] = 0x80 | (u & 0x3f);
	} else {
		out[0] = 0xfc | ((u >> 30) & 0x01);
		out[1] = 0x80 | ((u >> 24) & 0x3f);
		out[2] = 0x80 | ((u >> 18) & 0x3f);
		out[3] = 0x80 | ((u >> 12) & 0x3f);
		out[4] = 0x80 | ((u >> 6) & 0x3f);
		out[5] = 0x80 | (u & 0x3f);
	}

	return utf_buffer;
}
/* Length in bytes of a UTF-8 sequence, indexed by its first byte.
 * Bytes that cannot start a sequence (0x80...0xBF, 0xFE and 0xFF) are
 * listed with length 1, like ASCII; utf8_to_unicode() tells the two
 * apart by checking whether a length-1 byte is >= 0x80. */
static const char utf8char_len_tab[256] = {
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
	3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
#ifdef CONFIG_UTF8
/* Return the length in bytes of the UTF-8 sequence that starts with
 * the byte at @p, as listed in utf8char_len_tab, or 0 if @p is NULL. */
NONSTATIC_INLINE int
utf8charlen(const char *p)
{
	unsigned char first;

	if (!p)
		return 0;

	first = (unsigned char)*p;
	return utf8char_len_tab[first];
}
/* Count the UTF-8 characters in the NUL-terminated string *@str.
 * A trailing sequence that is cut short by the NUL is not counted.
 * On return, *@str points just past the last counted character.
 * @str and *@str must not be NULL. */
int
strlen_utf8(char **str)
{
	char *s = *str;
	char *end = strchr(s, '\0');
	int x = 0;

	for (;;) {
		int len = utf8charlen(s);

		/* Compare lengths rather than forming s + len, which
		 * could point more than one past the terminator. */
		if (len > end - s) break;

		s += len;
		x++;
	}

	*str = s;
	return x;
}
/* True if byte @p is a complete one-byte (ASCII) character. */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* True if byte @p can begin a character, i.e. it is anything but a
 * continuation byte (10xxxxxx).  Written so that @p is evaluated only
 * once. */
#define utf8_islead(p) (((p) & 0xc0) != 0x80)
/* Walk backwards from @current over @pos characters and return the
 * resulting pointer.  The walk never goes further left than @start.
 * Returns NULL if @current or @start is NULL or @pos is negative. */
char *
utf8_prevchar(char *current, int pos, char *start)
{
	if (!current || !start || pos < 0)
		return NULL;

	for (;;) {
		if (pos <= 0 || current == start)
			break;

		--current;
		/* Only a lead byte completes one character. */
		if (utf8_islead(*current))
			--pos;
	}

	return current;
}
/* Count number of standard terminal cells needed for displaying UTF-8
* character. */
int
2022-01-31 10:28:56 -05:00
utf8_char2cells(const char *utf8_char_const, char *end)
{
unicode_val_T u;
2022-01-31 10:28:56 -05:00
char *utf8_char = (char *)utf8_char_const;
if (end == NULL)
end = strchr(utf8_char, '\0');
if(!utf8_char || !end)
return -1;
u = utf8_to_unicode(&utf8_char, end);
return unicode_to_cell(u);
}
/* Count the number of standard terminal cells needed for displaying
 * the UTF-8 text from @string up to @end.  If @end is NULL, the end of
 * the NUL-terminated string is used.  A trailing sequence that does not
 * fit before @end is ignored.
 * Returns -1 if @string is NULL or a character width is negative. */
int
utf8_ptr2cells(const char *string, char *end)
{
	int cells = 0;

	/* Check before strchr() touches the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int charlen = utf8charlen(string);
		int cell;

		/* Length comparison avoids forming a pointer beyond
		 * one past @end. */
		if (charlen > end - string)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count the number of UTF-8 characters from @string up to @end.  If
 * @end is NULL, the end of the NUL-terminated string is used.  A
 * trailing sequence that does not fit before @end is not counted.
 * Returns -1 if @string is NULL. */
int
utf8_ptr2chars(char *string, char *end)
{
	int chars = 0;

	/* Check before strchr() touches the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int charlen = utf8charlen(string);

		/* Length comparison avoids forming a pointer beyond
		 * one past @end. */
		if (charlen > end - string)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/*
 * Count the number of bytes from the beginning of @string that can be
 * displayed in at most @max_cells terminal cells.  If @end is NULL,
 * the end of the NUL-terminated string is used; the result never
 * exceeds @end - @string.
 * Returns -1 if @string is NULL or a character width is negative.
 */
int
utf8_cells2bytes(const char *string, int max_cells, char *end)
{
	/* Signed, so that the comparison with @max_cells below is not
	 * silently done in unsigned arithmetic. */
	int bytes = 0;
	int cells = 0;

	assert(max_cells>=0);

	/* Check before strchr() touches the pointer. */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr(string, '\0');

	for (;;) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		if (cells > max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		/* Clamp when the last sequence runs past @end. */
		if (bytes > end - string) {
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
/* Take @max steps forward from @string in the specified @way, but
 * not going past @end.  Return the resulting address.  Store the
 * number of steps taken to *@count, unless @count is NULL.
 * If @end is NULL, the end of the NUL-terminated string is used.
 *
 * This assumes the text is valid UTF-8, and @string and @end point to
 * character boundaries.  If not, it doesn't crash but the results may
 * be inconsistent.
 *
 * This function can do some of the same jobs as utf8charlen(),
 * utf8_cells2bytes(), and strlen_utf8(). */
char *
utf8_step_forward(char *string, char *end,
		  int max, enum utf8_step way, int *count)
{
	int steps = 0;
	char *current = string;

	assert(string);
	assert(max >= 0);
	if_assert_failed goto invalid_arg;

	if (end == NULL)
		end = strchr(string, '\0');

	switch (way) {
	case UTF8_STEP_CHARACTERS:
		while (steps < max && current < end) {
			++current;
			/* Reaching @end completes the character; do
			 * not read the byte at @end, which may lie
			 * outside the caller's buffer. */
			if (current == end || utf8_islead(*current))
				++steps;
		}
		break;

	case UTF8_STEP_CELLS_FEWER:
	case UTF8_STEP_CELLS_MORE:
		while (steps < max && current < end) {
			unicode_val_T u;
			char *prev = current;
			int width;

			u = utf8_to_unicode(&current, end);
			if (u == UCS_NO_CHAR) {
				/* Assume the incomplete sequence
				 * costs one cell. */
				current = end;
				++steps;
				break;
			}

			width = unicode_to_cell(u);
			if (way == UTF8_STEP_CELLS_FEWER
			    && steps + width > max) {
				/* Back off. */
				current = prev;
				break;
			}

			steps += width;
		}
		break;

	default:
		INTERNAL("impossible enum utf8_step");
	}

invalid_arg:
	if (count)
		*count = steps;
	return current;
}
/* Move backward from @string by up to @max steps of kind @way,
 * stopping at @start at the latest.  The address reached is returned,
 * and the number of steps actually taken is written to *@count when
 * @count is not NULL.
 *
 * The text is expected to be valid UTF-8 with @string and @start on
 * character boundaries; otherwise nothing crashes, but the results
 * may be inconsistent.
 *
 * This overlaps in purpose with utf8_prevchar(). */
char *
utf8_step_backward(char *string, char *start,
		   int max, enum utf8_step way, int *count)
{
	char *pos = string;
	int taken = 0;

	assert(string);
	assert(start);
	assert(max >= 0);
	if_assert_failed goto invalid_arg;

	switch (way) {
	case UTF8_STEP_CHARACTERS:
		for (; taken < max && pos > start; ) {
			--pos;
			if (utf8_islead(*pos))
				taken++;
		}
		break;

	case UTF8_STEP_CELLS_FEWER:
	case UTF8_STEP_CELLS_MORE:
		while (taken < max && pos > start) {
			char *char_end = pos;
			char *decode_from;
			unicode_val_T u;
			int width;

			/* Back up to the lead byte of the previous
			 * character (or to @start). */
			do {
				--pos;
			} while (pos > start && !utf8_islead(*pos));

			decode_from = pos;
			u = utf8_to_unicode(&decode_from, char_end);

			/* An incomplete sequence is assumed to cost
			 * one cell. */
			if (u == UCS_NO_CHAR)
				width = 1;
			else
				width = unicode_to_cell(u);

			if (way == UTF8_STEP_CELLS_FEWER
			    && taken + width > max) {
				/* Too wide: undo this character. */
				pos = char_end;
				break;
			}

			taken += width;
		}
		break;

	default:
		INTERNAL("impossible enum utf8_step");
	}

invalid_arg:
	if (count)
		*count = taken;
	return pos;
}
/*
 * Find out the number of standard terminal columns needed for displaying
 * the symbol (glyph) which represents Unicode character c.
*
* TODO: Use wcwidth when it is available. This seems to require:
* - Make the configure script check whether <wchar.h> and wcwidth exist.
* - Define _XOPEN_SOURCE and include <wchar.h>.
* - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
* matches ISO 10646 in all locales.)
* However, these do not suffice, because wcwidth depends on LC_CTYPE
* in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
* is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
* <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
* U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
* character is apparently not supported in all locales. Why is that?
* - Perhaps there is standardese that requires supported characters
* to be convertable to multibyte form. Then ELinks could just pick
* some UTF-8 locale for its wcwidth purposes.
* - Perhaps wcwidth can even return different nonnegative values for
* the same ISO 10646 character in different locales. Then ELinks
* would have to set LC_CTYPE to match at least the terminal's
* charset (which may differ from the LC_CTYPE environment variable,
* especially when the master process is serving a slave terminal).
* But there is no guarantee that the libc supports all the same
* charsets as ELinks does.
* For now, it seems safest to avoid the potentially locale-dependent
* libc version of wcwidth, and instead use a hardcoded mapping.
*
 * @return 2 for double-width glyph, 1 for others,
 * 0 for unprintable glyphs (like 0x200e: "LEFT-TO-RIGHT MARK")
*/
#if 0
/* Disabled hardcoded width mapping, kept for reference.  Returns 0 for
 * U+200E and U+200F, 2 for the double-width ranges listed below, and 1
 * for everything else. */
NONSTATIC_INLINE int
unicode_to_cell(unicode_val_T c)
{
	if (c == 0x200e || c == 0x200f)
		return 0;

	if (c >= 0x1100
	    && (c <= 0x115f			/* Hangul Jamo */
		|| c == 0x2329
		|| c == 0x232a
		|| (c >= 0x2e80 && c <= 0xa4cf
		    && c != 0x303f)		/* CJK ... Yi */
		|| (c >= 0xac00 && c <= 0xd7a3)	/* Hangul Syllables */
		|| (c >= 0xf900 && c <= 0xfaff)	/* CJK Compatibility
						   Ideographs */
		|| (c >= 0xfe30 && c <= 0xfe6f)	/* CJK Compatibility Forms */
		|| (c >= 0xff00 && c <= 0xff60)	/* Fullwidth Forms */
		|| (c >= 0xffe0 && c <= 0xffe6)
		|| (c >= 0x20000 && c <= 0x2fffd)
		|| (c >= 0x30000 && c <= 0x3fffd)))
		return 2;

	return 1;
}
#endif
/* Map a Unicode character to a single case so that hotkeys in labels
 * compare equal regardless of case.  Callers must not rely on whether
 * the folded form is upper or lower case. */
unicode_val_T
unicode_fold_label_case(unicode_val_T c)
{
#if __STDC_ISO_10646__ && HAVE_WCTYPE_H
	return towlower(c);
#else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
	/* For now, this supports only ASCII.  It would be possible to
	 * use code generated from CaseFolding.txt of Unicode if the
	 * acknowledgements required by http://www.unicode.org/copyright.html
	 * were added to associated documentation of ELinks. */
	return (c >= 0x41 && c <= 0x5A) ? c + 0x20 : c;
#endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
}
#endif /* CONFIG_UTF8 */
/* Decode the UTF-8 sequence at *@string, reading no further than
 * @end, and return the character.
 *
 * - If the sequence announced by the first byte does not fit before
 *   @end, return UCS_NO_CHAR and leave *@string unchanged.
 * - If the sequence is invalid (stray continuation byte, bad
 *   continuation, overlong form, UTF-16 surrogate), advance *@string
 *   by one byte and return UCS_REPLACEMENT_CHARACTER.
 * - Otherwise advance *@string past the sequence.
 *
 * The byte at *@string itself is always read, even when *@string is
 * equal to @end. */
NONSTATIC_INLINE unicode_val_T
utf8_to_unicode(char **string, const char *end)
{
	unsigned char *str = (unsigned char *)*string;
	unicode_val_T u;
	int length;

	length = utf8char_len_tab[str[0]];

	/* Compare lengths rather than forming str + length, which
	 * could point more than one past @end. */
	if (length > (const unsigned char *)end - str) {
		return UCS_NO_CHAR;
	}

	switch (length) {
	case 1:			/* U+0000 to U+007F */
		if (str[0] >= 0x80) {
invalid_utf8:
			++*string;
			return UCS_REPLACEMENT_CHARACTER;
		}
		u = str[0];
		break;

	case 2:			/* U+0080 to U+07FF */
		if ((str[1] & 0xc0) != 0x80)
			goto invalid_utf8;
		u = (str[0] & 0x1f) << 6;
		u += (str[1] & 0x3f);
		if (u < 0x80)
			goto invalid_utf8;
		break;

	case 3:			/* U+0800 to U+FFFF, except surrogates */
		if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
			goto invalid_utf8;
		u = (str[0] & 0x0f) << 12;
		u += ((str[1] & 0x3f) << 6);
		u += (str[2] & 0x3f);
		if (u < 0x800 || is_utf16_surrogate(u))
			goto invalid_utf8;
		break;

	case 4:			/* U+10000 to U+1FFFFF */
		if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
		    || (str[3] & 0xc0) != 0x80)
			goto invalid_utf8;
		/* Lead byte 11110xxx carries three payload bits. */
		u = (str[0] & 0x07) << 18;
		u += ((str[1] & 0x3f) << 12);
		u += ((str[2] & 0x3f) << 6);
		u += (str[3] & 0x3f);
		if (u < 0x10000)
			goto invalid_utf8;
		break;

	case 5:			/* U+200000 to U+3FFFFFF */
		if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
		    || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
			goto invalid_utf8;
		/* Lead byte 111110xx carries only two payload bits.
		 * Masking with 0x0f would keep a marker bit of
		 * 0xF8...0xFB, adding 0x8000000 to every value and
		 * defeating the overlong check below. */
		u = (str[0] & 0x03) << 24;
		u += ((str[1] & 0x3f) << 18);
		u += ((str[2] & 0x3f) << 12);
		u += ((str[3] & 0x3f) << 6);
		u += (str[4] & 0x3f);
		if (u < 0x200000)
			goto invalid_utf8;
		break;

	case 6:			/* U+4000000 to U+7FFFFFFF */
		if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
		    || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
		    || (str[5] & 0xc0) != 0x80)
			goto invalid_utf8;
		u = (str[0] & 0x01) << 30;
		u += ((str[1] & 0x3f) << 24);
		u += ((str[2] & 0x3f) << 18);
		u += ((str[3] & 0x3f) << 12);
		u += ((str[4] & 0x3f) << 6);
		u += (str[5] & 0x3f);
		if (u < 0x4000000)
			goto invalid_utf8;
		break;

	default:
		INTERNAL("utf8char_len_tab out of range");
		goto invalid_utf8;
	}

	*string = (char *)(str + length);
	return u;
}
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The common part of cp2u and cp2utf_8. */
static unicode_val_T
cp2u_shared(const struct codepage_desc *from, unsigned char c)
{
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
unicode_val_T u = from->highhalf[c - 0x80];
2006-09-28 18:07:54 -04:00
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
return u;
}
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Used for converting input from the terminal. */
unicode_val_T
cp2u(int from, unsigned char c)
{
from &= ~SYSTEM_CHARSET_FLAG;
/* UTF-8 is a multibyte codepage and cannot be handled with
* this function. */
assert(!is_cp_ptr_utf8(&codepages[from]));
if_assert_failed return UCS_REPLACEMENT_CHARACTER;
if (c < 0x80) return c;
else return cp2u_shared(&codepages[from], c);
}
/* This slow and ugly code is used by the terminal utf_8_io */
const char *
cp2utf8(int from, int c)
{
from &= ~SYSTEM_CHARSET_FLAG;
if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
return strings[c];
return encode_utf8(cp2u_shared(&codepages[from], c));
}
2006-08-13 16:35:50 -04:00
unicode_val_T
cp_to_unicode(int codepage, char **string, const char *end)
2006-08-13 16:35:50 -04:00
{
2006-08-13 20:19:10 -04:00
unicode_val_T ret;
2006-08-13 16:35:50 -04:00
if (is_cp_utf8(codepage))
return utf8_to_unicode(string, end);
2006-08-13 20:19:10 -04:00
if (*string >= end)
return UCS_NO_CHAR;
ret = cp2u(codepage, **string);
++*string;
return ret;
2006-08-13 16:35:50 -04:00
}
#ifdef CONFIG_COMBINE
/* The most recently assigned code for a combined character sequence.
 * get_combined() increments it before handing a code out, so the
 * initial value (one below UCS_BEGIN_COMBINED) means that no code has
 * been assigned yet. */
unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
/* Array of the stored sequences, indexed by (code - UCS_BEGIN_COMBINED).
 * Each element is allocated by get_combined() and ends with
 * UCS_END_COMBINED. */
unicode_val_T **combined;
/* Hash from the raw bytes of a sequence to the code assigned to it;
 * created lazily by get_combined(). */
struct hash *combined_hash;
/* Return the code that represents the sequence of @length characters in
 * @data, assigning a fresh code if the sequence has not been seen yet.
 *
 * @data    the characters of the sequence
 * @length  number of characters; must be in 1..UCS_MAX_LENGTH_COMBINED
 *
 * Returns the code, or UCS_NO_CHAR if @length is out of range, the
 * range of codes is exhausted, or memory could not be allocated. On
 * failure none of the global state (combined, last_combined) is
 * changed, apart from the hash possibly having been created. */
unicode_val_T
get_combined(unicode_val_T *data, int length)
{
	struct hash_item *item;
	unicode_val_T *key;
	unicode_val_T **grown;
	unicode_val_T code;
	int i, indeks;

	assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
	if_assert_failed return UCS_NO_CHAR;

	if (!combined_hash) combined_hash = init_hash8();
	if (!combined_hash) return UCS_NO_CHAR;

	item = get_hash_item(combined_hash, (char *)data, length * sizeof(*data));
	if (item) return (unicode_val_T)(long)item->value;
	if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;

	/* Private copy of the sequence, terminated by UCS_END_COMBINED. */
	key = (unicode_val_T *)mem_alloc((length + 1) * sizeof(*key));
	if (!key) return UCS_NO_CHAR;
	for (i = 0; i < length; i++)
		key[i] = data[i];
	key[i] = UCS_END_COMBINED;

	code = last_combined + 1;
	indeks = code - UCS_BEGIN_COMBINED;

	/* Grow through a temporary: if the reallocation fails, the old
	 * array must stay reachable so that free_combined() can still
	 * release the entries it holds. */
	grown = (unicode_val_T **)mem_realloc(combined, sizeof(*combined) * (indeks + 1));
	if (!grown) {
		mem_free(key);
		return UCS_NO_CHAR;
	}
	combined = grown;

	item = add_hash_item(combined_hash, (char *)key,
			     length * sizeof(*data), (void *)(long)(code));
	if (!item) {
		mem_free(key);
		return UCS_NO_CHAR;
	}

	/* Publish the new entry only once everything has succeeded, so
	 * that the array never holds a pointer to freed memory. */
	combined[indeks] = key;
	last_combined = code;

	return last_combined;
}
/* Release everything get_combined() has allocated: the hash, every
 * stored sequence and the array that indexes them. The globals are
 * reset afterwards, so the function may be called again and
 * get_combined() starts from a clean state if it is used later. */
void
free_combined(void)
{
	int i, end = last_combined - UCS_BEGIN_COMBINED + 1;

	if (combined_hash)
		free_hash(&combined_hash);

	/* The array does not exist until the first sequence has been
	 * stored; do not index it in that case. */
	if (combined) {
		for (i = 0; i < end; i++)
			mem_free(combined[i]);
		mem_free(combined);
		combined = NULL;
	}

	last_combined = UCS_BEGIN_COMBINED - 1;
}
#endif /* CONFIG_COMBINE */
/* Register in the table tree rooted at @ct that the UTF-8 encoding of
 * @u translates to @str.
 *
 * Every byte of the encoding except the last one selects (and, when
 * missing, creates) a 256-entry subtable; the last byte selects the
 * slot that receives @str. A slot that already holds something other
 * than no_str is left untouched. If a subtable cannot be allocated the
 * mapping is silently not added. */
static void
add_utf8(struct conv_table *ct, unicode_val_T u, const char *str)
{
	unsigned char *p;

	for (p = (unsigned char *)encode_utf8(u); p[1]; p++) {
		struct conv_table *nct;

		if (ct[*p].t) {
			/* Subtable already exists: descend into it. */
			ct = ct[*p].u.tbl;
			continue;
		}

		assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
		if_assert_failed return;

		nct = (struct conv_table *)mem_calloc(256, sizeof(*nct));
		if (!nct) return;

		new_translation_table(nct);
		ct[*p].t = 1;
		ct[*p].u.tbl = nct;
		ct = nct;
	}

	assertm(!ct[*p].t, "bad utf encoding #2");
	if_assert_failed return;

	if (ct[*p].u.str == no_str)
		ct[*p].u.str = str;
}
2007-01-01 17:54:14 -05:00
/* A conversion table from some charset to UTF-8.
 * If it is from UTF-8 to UTF-8, it converts each byte separately.
 * Unlike in other translation tables, the strings in elements 0x80 to
 * 0xFF are allocated dynamically. */
struct conv_table utf_table[256];
/* Nonzero until utf_table has been filled for the first time.
 * get_translation_table_to_utf8() zeroes the table and clears this flag
 * on its first rebuild; once the flag is clear, elements 0x80 to 0xFF
 * are released with free_utf_table() before they are refilled. */
int utf_table_init = 1;
static void
free_utf_table(void)
{
int i;
2007-01-01 18:31:22 -05:00
/* Cast away const. */
for (i = 128; i < 256; i++)
mem_free((char *) utf_table[i].u.str);
}
static struct conv_table *
get_translation_table_to_utf8(int from)
{
int i;
static int lfr = -1;
if (from == -1) return NULL;
from &= ~SYSTEM_CHARSET_FLAG;
if (from == lfr) return utf_table;
lfr = from;
if (utf_table_init) {
memset(utf_table, 0, sizeof(utf_table));
utf_table_init = 0;
} else
free_utf_table();
for (i = 0; i < 128; i++)
utf_table[i].u.str = strings[i];
if (is_cp_ptr_utf8(&codepages[from])) {
for (i = 128; i < 256; i++)
utf_table[i].u.str = stracpy(strings[i]);
return utf_table;
}
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
for (i = 128; i < 256; i++) {
unicode_val_T u = codepages[from].highhalf[i - 0x80];
if (u == 0xFFFF)
utf_table[i].u.str = NULL;
else
utf_table[i].u.str = stracpy(encode_utf8(u));
}
for (i = 0; codepages[from].table[i].c; i++) {
unicode_val_T u = codepages[from].table[i].u;
if (!utf_table[codepages[from].table[i].c].u.str)
utf_table[codepages[from].table[i].c].u.str =
stracpy(encode_utf8(u));
}
for (i = 128; i < 256; i++)
if (!utf_table[i].u.str)
utf_table[i].u.str = stracpy(no_str);
return utf_table;
}
2007-01-01 17:54:14 -05:00
/* A conversion table between two charsets, where the target is not UTF-8. */
static struct conv_table table[256];
/* Nonzero until `table` has been zeroed for the first time; both
 * free_conv_table() and get_translation_table() perform that one-time
 * memset and then clear the flag. */
static int first = 1;
/* Release what the conversion tables hold: the dynamically allocated
 * strings of utf_table (if it has ever been filled), the contents of
 * `table` (via new_translation_table()), and, in iconv builds, the
 * open conversion descriptor. */
void
free_conv_table(void)
{
	if (!utf_table_init) {
		free_utf_table();
	}

	/* `table` must have been zeroed once before
	 * new_translation_table() is applied to it. */
	if (first) {
		first = 0;
		memset(table, 0, sizeof(table));
	}
	new_translation_table(table);

#ifdef HAVE_ICONV
	if (iconv_cd != (iconv_t)-1) {
		iconv_close(iconv_cd);
		iconv_cd = (iconv_t)-1;
	}
#endif
}
struct conv_table *
get_translation_table(int from, int to)
{
static int lfr = -1;
static int lto = -1;
from &= ~SYSTEM_CHARSET_FLAG;
to &= ~SYSTEM_CHARSET_FLAG;
if (first) {
memset(table, 0, sizeof(table));
first = 0;
}
if (codepages[from].iconv) {
struct conv_table *table2 = get_translation_table_to_utf8(34);
if (table2) table2->iconv_cp = from;
return table2;
}
if (/*from == to ||*/ from == -1 || to == -1)
return NULL;
if (is_cp_ptr_utf8(&codepages[to])) {
struct conv_table *table2 = get_translation_table_to_utf8(from);
if (table2) table2->iconv_cp = -1;
return table2;
}
if (from == lfr && to == lto)
return table;
lfr = from;
lto = to;
new_translation_table(table);
if (is_cp_ptr_utf8(&codepages[from])) {
int i;
/* Map U+00A0 and U+00AD the same way as u2cp() would. */
add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
add_utf8(table, UCS_SOFT_HYPHEN, "");
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
for (i = 0x80; i <= 0xFF; i++)
if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
add_utf8(table,
codepages[to].highhalf[i - 0x80],
strings[i]);
for (i = 0; codepages[to].table[i].c; i++)
add_utf8(table, codepages[to].table[i].u,
strings[codepages[to].table[i].c]);
for (i = 0; unicode_7b[i].x != -1; i++)
if (unicode_7b[i].x >= 0x80)
add_utf8(table, unicode_7b[i].x,
unicode_7b[i].s);
} else {
int i;
for (i = 128; i < 256; i++) {
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
const char *u;
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
u = u2cp(codepages[from].highhalf[i - 0x80], to);
if (u) table[i].u.str = u;
}
}
}
return table;
}
/* Compare the first @l2 characters of @s1 with the NUL-terminated
 * string @s2.
 *
 * Returns 1 or -1 at the first position where the characters differ
 * (1 when the character of @s1 is the greater one), -1 when all @l2
 * characters match but @s2 continues, and 0 when @s2 ends exactly
 * there. */
static inline int
xxstrcmp(const char *s1, const char *s2, int l2)
{
	for (; l2; s1++, s2++, l2--) {
		if (*s1 != *s2)
			return (*s1 > *s2) ? 1 : -1;
	}

	return *s2 ? -1 : 0;
}
/* Entity cache debugging purpose. */
#if 0
#define DEBUG_ENTITY_CACHE
#else
#undef DEBUG_ENTITY_CACHE
#endif
/* One slot of the small per-length cache that get_entity_string() keeps
 * for the entities it has already looked up. */
struct entity_cache {
/* Usage counter: set to 1 when the slot is filled and incremented on
 * every cache hit; slots are sorted by it (see hits_cmp()). */
unsigned int hits;
/* Length of the entity text stored in str. */
int strlen;
/* Codepage index the cached result was computed for. */
int encoding;
/* Cached outcome of the lookup; may be NULL. */
const char *result;
/* NUL-terminated copy of the entity text. */
char str[20]; /* Suffice in any case. */
};
/* comparison function for qsort() */
static int
hits_cmp(const void *v1, const void *v2)
{
2022-01-25 12:16:12 -05:00
const struct entity_cache *a = (const struct entity_cache *)v1, *b = (const struct entity_cache *)v2;
if (a->hits == b->hits) return 0;
if (a->hits > b->hits) return -1;
else return 1;
}
/* Comparison function for bsearch() over the entities table.
 *
 * @key_      a struct string holding the (not NUL-terminated) name
 *            being looked up
 * @element_  the struct entity it is compared with
 *
 * The result is that of xxstrcmp() on the key text, the entity name
 * and the key length. */
static int
compare_entities(const void *key_, const void *element_)
{
	struct string *wanted = (struct string *) key_;
	struct entity *candidate = (struct entity *) element_;

	return xxstrcmp(wanted->source, candidate->s, wanted->length);
}
const char *
get_entity_string(const char *str, const int strlen, int encoding)
{
#define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
#define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
will go in [0] table */
static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
2006-01-14 16:44:00 -05:00
unsigned int slen = 0;
const char *result = NULL;
/* Note that an object of static storage duration is automatically
* initialised to zero in C. */
if (strlen <= 0) return NULL;
#ifdef CONFIG_UTF8
2006-01-14 16:44:00 -05:00
/* TODO: caching UTF-8 */
encoding &= ~SYSTEM_CHARSET_FLAG;
if (is_cp_ptr_utf8(&codepages[encoding]))
2006-01-14 16:44:00 -05:00
goto skip;
#endif /* CONFIG_UTF8 */
2006-01-14 16:44:00 -05:00
/* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
* + google + slashdot + websites that result from a search for test on google,
* + various ones) show quite impressive improvment:
* Top ten is:
* 0: hits=2459 l=4 st='nbsp'
* 1: hits=2152 l=6 st='eacute'
* 2: hits=235 l=6 st='egrave'
* 3: hits=136 l=6 st='agrave'
* 4: hits=100 l=3 st='amp'
* 5: hits=40 l=5 st='laquo'
* 6: hits=8 l=4 st='copy'
* 7: hits=5 l=2 st='gt'
* 8: hits=2 l=2 st='lt'
* 9: hits=1 l=6 st='middot'
*
* Most of the time cache hit ratio is near 95%.
*
* A long test shows: 15186 hits vs. 24 misses and mean iteration
* count is kept < 2 (worst case 1.58). Not so bad ;)
*
* --Zas */
/* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
int i;
for (i = 0; i < nb_entity_cache[slen]; i++) {
if (entity_cache[slen][i].encoding == encoding
&& !memcmp(str, entity_cache[slen][i].str, strlen)) {
#ifdef DEBUG_ENTITY_CACHE
static double total_iter = 0;
static unsigned long hit_count = 0;
total_iter += i + 1;
hit_count++;
fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
#endif
if (entity_cache[slen][i].hits < (unsigned int) ~0)
entity_cache[slen][i].hits++;
return entity_cache[slen][i].result;
}
}
#ifdef DEBUG_ENTITY_CACHE
fprintf(stderr, "miss\n");
#endif
}
#ifdef CONFIG_UTF8
2006-01-14 16:44:00 -05:00
skip:
#endif /* CONFIG_UTF8 */
if (*str == '#') { /* Numeric entity. */
int l = (int) strlen;
char *st = (char *) str;
unicode_val_T n = 0;
if (l == 1) goto end; /* &#; ? */
st++, l--;
if ((*st | 32) == 'x') { /* Hexadecimal */
if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
st++, l--;
do {
unsigned char c = (*(st++) | 32);
if (isdigit(c))
n = (n << 4) | (c - '0');
else if (isxdigit(c))
n = (n << 4) | (c - 'a' + 10);
else
goto end; /* Bad char. */
} while (--l);
} else { /* Decimal */
if (l > 10) goto end; /* 4294967295 max. */
do {
unsigned char c = *(st++);
if (isdigit(c))
n = n * 10 + c - '0';
else
goto end; /* Bad char. */
/* Limit to 0xFFFFFFFF. */
if (n >= (unicode_val_T) 0xFFFFFFFFu)
goto end;
} while (--l);
}
result = u2cp(n, encoding);
#ifdef DEBUG_ENTITY_CACHE
fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
#endif
} else { /* Text entity. */
struct string key = INIT_STRING((char *) str, strlen);
2022-01-25 12:16:12 -05:00
struct entity *element = (struct entity *)bsearch((void *) &key, entities,
N_ENTITIES,
sizeof(*element),
compare_entities);
if (element) result = u2cp(element->c, encoding);
}
#ifdef CONFIG_UTF8
if (is_cp_ptr_utf8(&codepages[encoding])) {
2006-01-14 16:44:00 -05:00
return result;
}
#endif /* CONFIG_UTF8 */
end:
/* Take care of potential buffer overflow. */
if (strlen < sizeof(entity_cache[slen][0].str)) {
struct entity_cache *ece;
/* Sort entries by hit order. */
if (nb_entity_cache[slen] > 1)
qsort(&entity_cache[slen][0], nb_entity_cache[slen],
sizeof(entity_cache[slen][0]), hits_cmp);
/* Increment number of cache entries if possible.
* Else, just replace the least used entry. */
if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
/* Copy new entry to cache. */
ece->hits = 1;
ece->strlen = strlen;
ece->encoding = encoding;
ece->result = result;
memcpy(ece->str, str, strlen);
ece->str[strlen] = '\0';
#ifdef DEBUG_ENTITY_CACHE
fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
entity_cache[slen][0].strlen, entity_cache[slen][0].str);
{
unsigned int i;
fprintf(stderr, "- Cache entries [%u] -\n", slen);
for (i = 0; i < nb_entity_cache[slen] ; i++)
fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
entity_cache[slen][i].str);
fprintf(stderr, "-----------------\n");
}
#endif /* DEBUG_ENTITY_CACHE */
}
return result;
}
/* Convert a piece of text through a translation table, optionally
 * expanding SGML entities on the way.
 *
 * @convert_table  table obtained from get_translation_table(), or NULL
 *                 to leave bytes >= 128 untouched
 * @chars2         input text; need not be NUL-terminated
 * @charslen2      number of bytes in @chars2
 * @cp             codepage index handed to get_entity_string(); in
 *                 iconv builds it also selects the iconv conversion
 * @mode           CSM_FORM and CSM_NONE copy '&' literally;
 *                 CSM_DEFAULT expands every entity; any other mode
 *                 does not expand a candidate that is directly
 *                 followed by '&' or '='
 * @length         if not NULL, receives the number of bytes left in
 *                 the final buffer
 * @callback       if not NULL, receives the output in chunks instead
 *                 of it being returned
 * @callback_data  opaque pointer passed through to @callback
 *
 * Returns a newly allocated NUL-terminated string, or NULL when a
 * callback is used or an allocation (or iconv_open()) fails. */
char *
convert_string(struct conv_table *convert_table,
	       const char *chars2, int charslen2, int cp,
	       enum convert_string_mode mode, int *length,
	       void (*callback)(void *data, char *buf, int buflen),
	       void *callback_data)
{
	char *buffer;
	int bufferpos = 0;
	int charspos = 0;
	unsigned char *chars = (unsigned char *)chars2;
	int charslen = charslen2;

#ifdef HAVE_ICONV
	static char iconv_input[256];
	static char iconv_output[256 * 8];
	static size_t iconv_offset;
	static int iconv_cp;
	static size_t iconv_inleft;
	size_t iconv_outleft = 256 * 8;
	int loop = 0;
	int is_iconv = 0;
	int chars_offset = 0;

	/* Nothing to translate and no entity to expand: plain copy. */
	if (!convert_table && !memchr((char *)chars, '&', charslen)) {
		if (callback) {
			if (charslen) callback(callback_data, (char *)chars, charslen);
			return NULL;
		} else {
			/* Report the length on this shortcut too, like
			 * the regular exit does. */
			if (length) *length = charslen;
			return memacpy((char *)chars, charslen);
		}
	}

	if (cp >= 0) {
		if (convert_table && convert_table->iconv_cp > 0) {
			is_iconv = 1;
			cp = convert_table->iconv_cp;
		} else {
			is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv;
		}
	}
#endif

	/* Buffer allocation */

	buffer = (char *)mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
	if (!buffer) return NULL;

#ifdef HAVE_ICONV
	if (is_iconv) {
		int v;
		size_t before, to_copy;
		char *outp, *inp;

		/* (iconv_t)-1 is the "no descriptor" marker used in this
		 * file; only a real descriptor may be closed. */
		if (iconv_cd != (iconv_t)-1) {
			if (cp != iconv_cp) {
				iconv_close(iconv_cd);
				iconv_cd = (iconv_t)-1;
			}
		}
		if (iconv_cd == (iconv_t)-1) {
			iconv_offset = 0;
			iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp));
			if (iconv_cd == (iconv_t)-1) {
				mem_free(buffer);
				return NULL;
			}
			iconv_cp = cp;
		}
repeat:
		/* Feed iconv at most 256 bytes at a time, after whatever
		 * incomplete sequence the previous round left over. */
		to_copy = charslen2 - chars_offset;
		if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset;
		memcpy(iconv_input + iconv_offset, chars2 + chars_offset, to_copy);
		iconv_outleft = 256 * 8;
		iconv_inleft = iconv_offset + to_copy;
		inp = iconv_input;
		outp = iconv_output;
		before = iconv_inleft;

		v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft);
		chars_offset += before - iconv_inleft;
		charslen = 256 * 8 - iconv_outleft;

		chars = (unsigned char *)iconv_output;
		charspos = 0;

		if (v == -1) {
			switch (errno) {
			case EINVAL:
				/* Incomplete sequence at the end of the
				 * input: keep it for the next round. */
				memcpy(iconv_input, inp, iconv_inleft);
				iconv_offset = iconv_inleft;
				break;
			case EILSEQ:
				loop = 0;
				goto out;
				break;
			default:
				iconv_offset = 0;
			}
		} else {
			iconv_offset = 0;
		}

		loop = chars_offset < charslen2;
	}
#endif

	/* Iterate ;-) */
out:
	while (charspos < charslen) {
		const char *translit;

#define PUTC do { \
		buffer[bufferpos++] = chars[charspos++]; \
		translit = ""; \
		goto flush; \
	} while (0)

		if (chars[charspos] != '&') {
			struct conv_table *t;
			int i;

			if (chars[charspos] < 128 || !convert_table) PUTC;

			/* Walk the table tree along the input bytes. */
			t = convert_table;
			i = charspos;

			while (t[(unsigned char)chars[i]].t) {
				t = t[(unsigned char)chars[i++]].u.tbl;
				if (i >= charslen) PUTC;
			}

			translit = t[(unsigned char)chars[i]].u.str;
			charspos = i + 1;

		} else if (mode == CSM_FORM || mode == CSM_NONE) {
			PUTC;

		} else {
			int start = charspos + 1;
			int i = start;
			unsigned char term;

			while (i < charslen
			       && (isasciialpha(chars[i])
				   || isdigit(chars[i])
				   || (chars[i] == '#')))
				i++;

			/* The character that ends the candidate. When the
			 * candidate runs up to the end of the input there
			 * is none; do not read past the buffer, and treat
			 * the end like a NUL terminator. */
			term = (i < charslen) ? chars[i] : 0;

			/* This prevents bug 213: we were expanding "entities"
			 * in URL query strings. */
			/* XXX: But this disables &nbsp&nbsp usage, which
			 * appears to be relatively common! --pasky */
			if ((mode == CSM_DEFAULT || (term != '&' && term != '='))
			    && i > start
			    && !isasciialpha(term) && !isdigit(term)) {
				translit = get_entity_string((const char *)&chars[start], i - start,
						      cp);
				if (term != ';') {
					/* Eat &nbsp &nbsp<foo> happily, but
					 * pull back from the character after
					 * entity string if it is not the valid
					 * terminator. */
					i--;
				}

				if (!translit) PUTC;
				charspos = i + (i < charslen);
			} else PUTC;
		}

		if (!translit[0]) continue;

		if (!translit[1]) {
			buffer[bufferpos++] = translit[0];
			translit = "";
			goto flush;
		}

		while (*translit) {
			char *new_;

			buffer[bufferpos++] = *(translit++);
flush:
			/* Act only when the buffer has just filled up to
			 * a multiple of ALLOC_GR. */
			if (bufferpos & (ALLOC_GR - 1)) continue;

			if (callback) {
				buffer[bufferpos] = 0;
				callback(callback_data, buffer, bufferpos);
				bufferpos = 0;
			} else {
				new_ = (char *)mem_realloc(buffer, bufferpos + ALLOC_GR);
				if (!new_) {
					mem_free(buffer);
					return NULL;
				}
				buffer = new_;
			}
		}
#undef PUTC
	}

#ifdef HAVE_ICONV
	if (loop) goto repeat;
#endif
	/* Say bye */
	buffer[bufferpos] = 0;
	if (length) *length = bufferpos;

	if (callback) {
		if (bufferpos) callback(callback_data, buffer, bufferpos);
		mem_free(buffer);
		return NULL;
	} else {
		return buffer;
	}
}
#ifndef USE_FASTFIND
/* Find the codepage called @name (any of its aliases, compared without
 * regard to case) and return its index, or -1 if there is none.
 *
 * The name "System" stands for the charset reported by
 * nl_langinfo(CODESET) when that is available (the result then has
 * SYSTEM_CHARSET_FLAG set, and an unknown system charset falls back to
 * us-ascii), and for us-ascii otherwise. */
int
get_cp_index(const char *name)
{
	int syscp = 0;
	int i;

	if (!c_strcasecmp(name, "System")) {
#if HAVE_LANGINFO_CODESET
		name = nl_langinfo(CODESET);
		syscp = SYSTEM_CHARSET_FLAG;
#else
		name = "us-ascii";
#endif
	}

	for (i = 0; codepages[i].name; i++) {
		const char **alias;

		/* In the past, we looked for the longest substring
		 * in all the names; it is way too expensive, though:
		 *
		 *   %   cumulative   self              self     total
		 *  time   seconds   seconds    calls  us/call  us/call  name
		 *  3.00      0.66     0.03     1325    22.64    22.64  get_cp_index
		 *
		 * Anything called from redraw_screen() is in fact
		 * relatively expensive, even if it's called just
		 * once. So we will do a simple strcasecmp() here.
		 */
		for (alias = codepages[i].aliases; *alias; alias++) {
			if (!c_strcasecmp(name, *alias))
				return i | syscp;
		}
	}

	return syscp ? (get_cp_index("us-ascii") | syscp) : -1;
}
#else
/* Cursor of the charsets_list_reset() / charsets_list_next() iteration:
 * index of the current entry in codepages[] ... */
static unsigned int i_name = 0;
/* ... and index of the current alias within that entry. */
static unsigned int i_alias = 0;
/* Rewind the internal list cursor to the first alias of the first
 * codepage. */
void
charsets_list_reset(void)
{
	i_name = i_alias = 0;
}
/* Returns a pointer to a struct that contains current key and data pointers
* and increment internal pointer. It returns NULL when key is NULL. */
struct fastfind_key_value *
charsets_list_next(void)
{
static struct fastfind_key_value kv;
if (!codepages[i_name].name) return NULL;
kv.key = codepages[i_name].aliases[i_alias];
kv.data = (void *) &codepages[i_name]; /* cast away const */
if (codepages[i_name].aliases[i_alias + 1])
i_alias++;
else {
i_name++;
i_alias = 0;
}
return &kv;
}
/* Fastfind index over all codepage aliases. It is fed through
 * charsets_list_reset() and charsets_list_next(), built in
 * init_charsets_lookup() and searched by get_cp_index(). */
static struct fastfind_index ff_charsets_index
= INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
/* It searchs for a charset named @name or one of its aliases and
* returns index for it or -1 if not found. */
int
get_cp_index(const char *name)
{
const struct codepage_desc *codepage;
int syscp = 0;
if (!c_strcasecmp(name, "System")) {
#if HAVE_LANGINFO_CODESET
name = nl_langinfo(CODESET);
syscp = SYSTEM_CHARSET_FLAG;
#else
name = "us-ascii";
#endif
}
2022-01-25 12:16:12 -05:00
codepage = (const struct codepage_desc *)fastfind_search(&ff_charsets_index, name, strlen(name));
if (codepage) {
assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
return (codepage - codepages) | syscp;
} else if (syscp) {
return get_cp_index("us-ascii") | syscp;
} else {
return -1;
}
}
#endif /* USE_FASTFIND */
/* Prepare the charset name lookup. With USE_FASTFIND this builds the
 * fastfind index (requesting FF_COMPRESS); without it there is nothing
 * to do. */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
/* Counterpart of init_charsets_lookup(): with USE_FASTFIND this hands
 * the index to fastfind_done(); without it there is nothing to do. */
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
fastfind_done(&ff_charsets_index);
#endif
}
/* Get the codepage's name for displaying to the user, or NULL if
* @cp_index is one past the end. In the future, we might want to
* localize these with gettext. So it may be best not to use this
* function if the name will have to be converted back to an
* index. */
2022-02-15 11:53:24 -05:00
const char *
get_cp_name(int cp_index)
{
if (cp_index < 0) return "none";
if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
return codepages[cp_index].name;
}
/* Get the codepage's name for saving to a configuration file: "none"
 * for a negative index, "System" when SYSTEM_CHARSET_FLAG is set,
 * otherwise the first alias of the codepage (NULL if it has no alias
 * list). These names can be converted back to indexes, even in future
 * versions of ELinks. */
char *
get_cp_config_name(int cp_index)
{
	const char **aliases;

	if (cp_index < 0) return "none";
	if (cp_index & SYSTEM_CHARSET_FLAG) return "System";

	aliases = codepages[cp_index].aliases;

	return aliases ? (char *)aliases[0] : NULL;
}
/* Get the codepage's name for sending to a library or server that
 * understands MIME charset names: "none" for a negative index,
 * otherwise the first alias of the codepage (NULL if it has no alias
 * list). This function irreversibly maps the "System" codepage to the
 * underlying charset. */
char *
get_cp_mime_name(int cp_index)
{
	const char **aliases;

	if (cp_index < 0) return "none";

	aliases = codepages[cp_index & ~SYSTEM_CHARSET_FLAG].aliases;

	return aliases ? (char *)aliases[0] : NULL;
}
int
2006-07-18 11:51:03 -04:00
is_cp_utf8(int cp_index)
{
cp_index &= ~SYSTEM_CHARSET_FLAG;
return is_cp_ptr_utf8(&codepages[cp_index]);
}
/* This function will be used by the xhtml parser.
 *
 * Returns the high-half table (the Unicode values of bytes 0x80 to
 * 0xFF) of the codepage called @name, or NULL if get_cp_index() does
 * not know that name. */
const uint16_t *
get_cp_highhalf(const char *name)
{
	const int cp = get_cp_index(name);

	return (cp < 0) ? NULL
			: codepages[cp & ~SYSTEM_CHARSET_FLAG].highhalf;
}