1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-12-04 14:46:47 -05:00
elinks/src/intl/charsets.c

1546 lines
38 KiB
C
Raw Normal View History

/* Charsets convertor */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* strcasecmp() */
#endif
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if HAVE_LANGINFO_CODESET
#include <langinfo.h>
#endif
#include <ctype.h>
#include <stdlib.h>
#if HAVE_WCTYPE_H
#include <wctype.h>
#endif
#include "elinks.h"
#include "document/options.h"
#include "intl/charsets.h"
#include "util/conv.h"
#include "util/error.h"
#include "util/fastfind.h"
#include "util/hash.h"
#include "util/memory.h"
#include "util/string.h"
/* Fix namespace clash on MacOS. */
#define table table_elinks
struct table_entry {
unsigned char c;
/* This should in principle be unicode_val_T, but because all
* the values currently in codepage.inc fit in 16 bits, we can
* as well use uint16_t and halve sizeof(struct table_entry)
* from 8 bytes to 4. Should other characters ever be needed,
* unicode_val_T u : 24 might be a possibility, although it
* seems a little unportable as bitfields are in principle
* restricted to int, which may be 16-bit. */
uint16_t u;
};
struct codepage_desc {
unsigned char *name;
unsigned char *const *aliases;
2006-09-28 18:07:54 -04:00
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The Unicode mappings of codepage bytes 0x80...0xFF.
* (0x00...0x7F are assumed to be ASCII in all codepages.)
* Because all current values fit in 16 bits, we store them as
* uint16_t rather than unicode_val_T. If the codepage does
* not use some byte, then @highhalf maps that byte to 0xFFFF,
* which C code converts to UCS_REPLACEMENT_CHARACTER where
* appropriate. (U+FFFF is reserved and will never be
* assigned as a character.) */
const uint16_t *highhalf;
2006-09-28 18:07:54 -04:00
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* If some byte in the codepage corresponds to multiple Unicode
* characters, then the preferred character is in @highhalf
* above, and the rest are listed here in @table. This table
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
* is not used for translating from the codepage to Unicode. */
const struct table_entry *table;
};
#include "intl/codepage.inc"
#include "intl/uni_7b.inc"
#include "intl/entity.inc"
2007-01-02 14:40:14 -05:00
static const char strings[256][2] = {
"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
static void
free_translation_table(struct conv_table *p)
{
int i;
for (i = 0; i < 256; i++)
if (p[i].t)
free_translation_table(p[i].u.tbl);
mem_free(p);
}
2007-01-01 17:54:14 -05:00
/* A string used in conversion tables when there is no correct
* conversion. This is compared by address and therefore should be a
* named array rather than a pointer so that it won't share storage
* with any other string literal that happens to have the same
* characters. */
2007-01-02 14:40:14 -05:00
static const unsigned char no_str[] = "*";
static void
new_translation_table(struct conv_table *p)
{
int i;
for (i = 0; i < 256; i++)
if (p[i].t)
free_translation_table(p[i].u.tbl);
for (i = 0; i < 128; i++) {
p[i].t = 0;
p[i].u.str = strings[i];
}
for (; i < 256; i++) {
p[i].t = 0;
p[i].u.str = no_str;
}
}
#define BIN_SEARCH(table, entry, entries, key, result) \
{ \
long _s = 0, _e = (entries) - 1; \
\
while (_s <= _e || !((result) = -1)) { \
long _m = (_s + _e) / 2; \
\
if ((table)[_m].entry == (key)) { \
(result) = _m; \
break; \
} \
if ((table)[_m].entry > (key)) _e = _m - 1; \
if ((table)[_m].entry < (key)) _s = _m + 1; \
} \
} \
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
#define SYSTEM_CHARSET_FLAG 128
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
const unsigned char *
u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
{
int j;
int s;
if (u < 128) return strings[u];
2006-01-14 16:44:00 -05:00
to &= ~SYSTEM_CHARSET_FLAG;
#ifdef CONFIG_UTF8
if (is_cp_ptr_utf8(&codepages[to]))
return encode_utf8(u);
#endif /* CONFIG_UTF8 */
2006-01-14 16:44:00 -05:00
/* To mark non breaking spaces in non-UTF-8 strings, we use a
* special char NBSP_CHAR. */
if (u == UCS_NO_BREAK_SPACE) {
if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
else /* NBSP_MODE_ASCII */ return " ";
}
if (u == UCS_SOFT_HYPHEN) return "";
if (u < 0xa0) {
unicode_val_T strange = strange_chars[u - 0x80];
if (!strange) return NULL;
return u2cp_(strange, to, nbsp_mode);
}
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
if (u < 0xFFFF)
for (j = 0; j < 0x80; j++)
if (codepages[to].highhalf[j] == u)
return strings[0x80 + j];
for (j = 0; codepages[to].table[j].c; j++)
if (codepages[to].table[j].u == u)
return strings[codepages[to].table[j].c];
BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
if (s != -1) return unicode_7b[s].s;
return no_str;
}
static unsigned char utf_buffer[7];
#ifdef CONFIG_UTF8
2006-01-14 16:44:00 -05:00
inline unsigned char *
encode_utf8(unicode_val_T u)
#else
static unsigned char *
encode_utf8(unicode_val_T u)
#endif /* CONFIG_UTF8 */
{
memset(utf_buffer, 0, 7);
if (u < 0x80)
utf_buffer[0] = u;
else if (u < 0x800)
utf_buffer[0] = 0xc0 | ((u >> 6) & 0x1f),
utf_buffer[1] = 0x80 | (u & 0x3f);
else if (u < 0x10000)
utf_buffer[0] = 0xe0 | ((u >> 12) & 0x0f),
utf_buffer[1] = 0x80 | ((u >> 6) & 0x3f),
utf_buffer[2] = 0x80 | (u & 0x3f);
else if (u < 0x200000)
utf_buffer[0] = 0xf0 | ((u >> 18) & 0x0f),
utf_buffer[1] = 0x80 | ((u >> 12) & 0x3f),
utf_buffer[2] = 0x80 | ((u >> 6) & 0x3f),
utf_buffer[3] = 0x80 | (u & 0x3f);
else if (u < 0x4000000)
utf_buffer[0] = 0xf8 | ((u >> 24) & 0x0f),
utf_buffer[1] = 0x80 | ((u >> 18) & 0x3f),
utf_buffer[2] = 0x80 | ((u >> 12) & 0x3f),
utf_buffer[3] = 0x80 | ((u >> 6) & 0x3f),
utf_buffer[4] = 0x80 | (u & 0x3f);
else utf_buffer[0] = 0xfc | ((u >> 30) & 0x01),
utf_buffer[1] = 0x80 | ((u >> 24) & 0x3f),
utf_buffer[2] = 0x80 | ((u >> 18) & 0x3f),
utf_buffer[3] = 0x80 | ((u >> 12) & 0x3f),
utf_buffer[4] = 0x80 | ((u >> 6) & 0x3f),
utf_buffer[5] = 0x80 | (u & 0x3f);
return utf_buffer;
}
#ifdef CONFIG_UTF8
/* Number of bytes utf8 character indexed by first byte. Illegal bytes are
* equal ones and handled different. */
static const char utf8char_len_tab[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
inline int utf8charlen(const unsigned char *p)
{
return p ? utf8char_len_tab[*p] : 0;
}
2006-01-14 16:44:00 -05:00
inline int
strlen_utf8(unsigned char **str)
{
unsigned char *s = *str;
unsigned char *end = strchr(s, '\0');
int x;
int len;
for (x = 0;; x++, s += len) {
len = utf8charlen(s);
2006-01-14 16:44:00 -05:00
if (s + len > end) break;
}
*str = s;
return x;
}
#define utf8_issingle(p) (((p) & 0x80) == 0)
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
/* Start from @current and move back to @pos char. This pointer return. The
* most left pointer is @start. */
inline unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
if (current == NULL || start == NULL || pos < 0)
return NULL;
while (pos > 0 && current != start) {
current--;
if (utf8_islead(*current))
pos--;
}
return current;
}
/* Count number of standard terminal cells needed for displaying UTF-8
* character. */
int
utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
{
unicode_val_T u;
if (end == NULL)
end = strchr(utf8_char, '\0');
if(!utf8_char || !end)
return -1;
u = utf8_to_unicode(&utf8_char, end);
return unicode_to_cell(u);
}
/* Count number of standard terminal cells needed for displaying string
* with UTF-8 characters. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
int charlen, cell, cells = 0;
if (end == NULL)
end = strchr(string, '\0');
if(!string || !end)
return -1;
do {
charlen = utf8charlen(string);
2006-07-27 03:51:10 -04:00
if (string + charlen > end)
break;
cell = utf8_char2cells(string, end);
if (cell < 0)
return -1;
cells += cell;
string += charlen;
} while (1);
return cells;
}
/* Count number of characters in string. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
int charlen, chars = 0;
if (end == NULL)
end = strchr(string, '\0');
if(!string || !end)
return -1;
do {
charlen = utf8charlen(string);
2006-07-27 03:51:10 -04:00
if (string + charlen > end)
break;
chars++;
string += charlen;
} while (1);
return chars;
}
/*
* Count number of bytes from begining of the string needed for displaying
* specified number of cells.
*/
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
unsigned int bytes = 0, cells = 0;
assert(max_cells>=0);
if (end == NULL)
end = strchr(string, '\0');
if(!string || !end)
return -1;
do {
int cell = utf8_char2cells(&string[bytes], end);
if (cell < 0)
return -1;
cells += cell;
if (cells > max_cells)
break;
bytes += utf8charlen(&string[bytes]);
if (string + bytes > end) {
bytes = end - string;
break;
}
} while(1);
return bytes;
}
/* Take @max steps forward from @string in the specified @way, but
* not going past @end. Return the resulting address. Store the
* number of steps taken to *@count, unless @count is NULL.
*
* This assumes the text is valid UTF-8, and @string and @end point to
* character boundaries. If not, it doesn't crash but the results may
* be inconsistent.
*
* This function can do some of the same jobs as utf8charlen(),
* utf8_cells2bytes(), and strlen_utf8(). */
unsigned char *
utf8_step_forward(unsigned char *string, unsigned char *end,
int max, enum utf8_step way, int *count)
{
int steps = 0;
unsigned char *current = string;
assert(string);
assert(max >= 0);
if_assert_failed goto invalid_arg;
if (end == NULL)
end = strchr(string, '\0');
switch (way) {
case UTF8_STEP_CHARACTERS:
while (steps < max && current < end) {
++current;
if (utf8_islead(*current))
++steps;
}
break;
case UTF8_STEP_CELLS_FEWER:
case UTF8_STEP_CELLS_MORE:
while (steps < max) {
unicode_val_T u;
unsigned char *prev = current;
int width;
u = utf8_to_unicode(&current, end);
if (u == UCS_NO_CHAR) {
/* Assume the incomplete sequence
* costs one cell. */
current = end;
++steps;
break;
}
width = unicode_to_cell(u);
if (way == UTF8_STEP_CELLS_FEWER
&& steps + width > max) {
/* Back off. */
current = prev;
break;
}
steps += width;
}
break;
default:
INTERNAL("impossible enum utf8_step");
}
invalid_arg:
if (count)
*count = steps;
return current;
}
/* Take @max steps backward from @string in the specified @way, but
* not going past @start. Return the resulting address. Store the
* number of steps taken to *@count, unless @count is NULL.
*
* This assumes the text is valid UTF-8, and @string and @start point
* to character boundaries. If not, it doesn't crash but the results
* may be inconsistent.
*
* This function can do some of the same jobs as utf8_prevchar(). */
unsigned char *
utf8_step_backward(unsigned char *string, unsigned char *start,
int max, enum utf8_step way, int *count)
{
int steps = 0;
unsigned char *current = string;
assert(string);
assert(start);
assert(max >= 0);
if_assert_failed goto invalid_arg;
switch (way) {
case UTF8_STEP_CHARACTERS:
while (steps < max && current > start) {
--current;
if (utf8_islead(*current))
++steps;
}
break;
case UTF8_STEP_CELLS_FEWER:
case UTF8_STEP_CELLS_MORE:
while (steps < max) {
unsigned char *prev = current;
unsigned char *look;
unicode_val_T u;
int width;
if (current <= start)
break;
do {
--current;
} while (current > start && !utf8_islead(*current));
look = current;
u = utf8_to_unicode(&look, prev);
if (u == UCS_NO_CHAR) {
/* Assume the incomplete sequence
* costs one cell. */
width = 1;
} else
width = unicode_to_cell(u);
if (way == UTF8_STEP_CELLS_FEWER
&& steps + width > max) {
/* Back off. */
current = prev;
break;
}
steps += width;
}
break;
default:
INTERNAL("impossible enum utf8_step");
}
invalid_arg:
if (count)
*count = steps;
return current;
}
2006-07-27 03:51:10 -04:00
/*
* Find out number of standard terminal collumns needed for displaying symbol
* (glyph) which represents Unicode character c.
*
* TODO: Use wcwidth when it is available. This seems to require:
* - Make the configure script check whether <wchar.h> and wcwidth exist.
* - Define _XOPEN_SOURCE and include <wchar.h>.
* - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
* matches ISO 10646 in all locales.)
* However, these do not suffice, because wcwidth depends on LC_CTYPE
* in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
* is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
* <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
* U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
* character is apparently not supported in all locales. Why is that?
* - Perhaps there is standardese that requires supported characters
* to be convertable to multibyte form. Then ELinks could just pick
* some UTF-8 locale for its wcwidth purposes.
* - Perhaps wcwidth can even return different nonnegative values for
* the same ISO 10646 character in different locales. Then ELinks
* would have to set LC_CTYPE to match at least the terminal's
* charset (which may differ from the LC_CTYPE environment variable,
* especially when the master process is serving a slave terminal).
* But there is no guarantee that the libc supports all the same
* charsets as ELinks does.
* For now, it seems safest to avoid the potentially locale-dependent
* libc version of wcwidth, and instead use a hardcoded mapping.
*
* @return 2 for double-width glyph, 1 for others.
* TODO: May be extended to return 0 for zero-width glyphs
* (like composing, maybe unprintable too).
*/
inline int
unicode_to_cell(unicode_val_T c)
{
if (c >= 0x1100
&& (c <= 0x115f /* Hangul Jamo */
|| c == 0x2329
|| c == 0x232a
|| (c >= 0x2e80 && c <= 0xa4cf
&& c != 0x303f) /* CJK ... Yi */
|| (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
|| (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
Ideographs */
|| (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
|| (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
|| (c >= 0xffe0 && c <= 0xffe6)
|| (c >= 0x20000 && c <= 0x2fffd)
|| (c >= 0x30000 && c <= 0x3fffd)))
return 2;
2006-07-27 03:51:10 -04:00
return 1;
}
/* Fold the case of a Unicode character, so that hotkeys in labels can
* be compared case-insensitively. It is unspecified whether the
* result will be in upper or lower case. */
unicode_val_T
unicode_fold_label_case(unicode_val_T c)
{
#if __STDC_ISO_10646__ && HAVE_WCTYPE_H
return towlower(c);
#else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
/* For now, this supports only ASCII. It would be possible to
* use code generated from CaseFolding.txt of Unicode if the
* acknowledgements required by http://www.unicode.org/copyright.html
* were added to associated documentation of ELinks. */
if (c >= 0x41 && c <= 0x5A)
return c + 0x20;
else
return c;
#endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
}
2006-01-14 16:44:00 -05:00
inline unicode_val_T
utf8_to_unicode(unsigned char **string, const unsigned char *end)
2006-01-14 16:44:00 -05:00
{
unsigned char *str = *string;
unicode_val_T u;
int length;
length = utf8char_len_tab[str[0]];
2006-01-14 16:44:00 -05:00
if (str + length > end) {
return UCS_NO_CHAR;
2006-07-27 03:51:10 -04:00
}
2006-01-14 16:44:00 -05:00
switch (length) {
case 1: /* U+0000 to U+007F */
if (str[0] >= 0x80) {
invalid_utf8:
++*string;
return UCS_REPLACEMENT_CHARACTER;
}
2006-01-14 16:44:00 -05:00
u = str[0];
break;
case 2: /* U+0080 to U+07FF */
if ((str[1] & 0xc0) != 0x80)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
u = (str[0] & 0x1f) << 6;
u += (str[1] & 0x3f);
if (u < 0x80)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
break;
case 3: /* U+0800 to U+FFFF, except surrogates */
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
u = (str[0] & 0x0f) << 12;
u += ((str[1] & 0x3f) << 6);
u += (str[2] & 0x3f);
if (u < 0x800 || is_utf16_surrogate(u))
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
break;
case 4: /* U+10000 to U+1FFFFF */
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
u = (str[0] & 0x0f) << 18;
u += ((str[1] & 0x3f) << 12);
u += ((str[2] & 0x3f) << 6);
u += (str[3] & 0x3f);
if (u < 0x10000)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
break;
case 5: /* U+200000 to U+3FFFFFF */
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
u = (str[0] & 0x0f) << 24;
u += ((str[1] & 0x3f) << 18);
u += ((str[2] & 0x3f) << 12);
u += ((str[3] & 0x3f) << 6);
u += (str[4] & 0x3f);
if (u < 0x200000)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
break;
case 6: /* U+4000000 to U+7FFFFFFF */
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
|| (str[5] & 0xc0) != 0x80)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
u = (str[0] & 0x01) << 30;
u += ((str[1] & 0x3f) << 24);
u += ((str[2] & 0x3f) << 18);
u += ((str[3] & 0x3f) << 12);
u += ((str[4] & 0x3f) << 6);
u += (str[5] & 0x3f);
if (u < 0x4000000)
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
break;
default:
INTERNAL("utf8char_len_tab out of range");
goto invalid_utf8;
2006-01-14 16:44:00 -05:00
}
*string = str + length;
2006-07-18 14:33:34 -04:00
return u;
2006-01-14 16:44:00 -05:00
}
#endif /* CONFIG_UTF8 */
2006-01-14 16:44:00 -05:00
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The common part of cp2u and cp2utf_8. */
static unicode_val_T
cp2u_shared(const struct codepage_desc *from, unsigned char c)
{
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
unicode_val_T u = from->highhalf[c - 0x80];
2006-09-28 18:07:54 -04:00
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
return u;
}
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Used for converting input from the terminal. */
unicode_val_T
cp2u(int from, unsigned char c)
{
from &= ~SYSTEM_CHARSET_FLAG;
/* UTF-8 is a multibyte codepage and cannot be handled with
* this function. */
assert(!is_cp_ptr_utf8(&codepages[from]));
if_assert_failed return UCS_REPLACEMENT_CHARACTER;
if (c < 0x80) return c;
else return cp2u_shared(&codepages[from], c);
}
/* This slow and ugly code is used by the terminal utf_8_io */
const unsigned char *
cp2utf8(int from, int c)
{
from &= ~SYSTEM_CHARSET_FLAG;
if (is_cp_ptr_utf8(&codepages[from]) || c < 128)
return strings[c];
return encode_utf8(cp2u_shared(&codepages[from], c));
}
#ifdef CONFIG_UTF8
2006-08-13 16:35:50 -04:00
unicode_val_T
cp_to_unicode(int codepage, unsigned char **string, unsigned char *end)
{
2006-08-13 20:19:10 -04:00
unicode_val_T ret;
2006-08-13 16:35:50 -04:00
if (is_cp_utf8(codepage))
return utf8_to_unicode(string, end);
2006-08-13 20:19:10 -04:00
if (*string >= end)
return UCS_NO_CHAR;
ret = cp2u(codepage, **string);
++*string;
return ret;
2006-08-13 16:35:50 -04:00
}
#endif /* CONFIG_UTF8 */
2006-08-13 16:35:50 -04:00
#ifdef CONFIG_COMBINE
unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1;
unicode_val_T **combined;
struct hash *combined_hash;
unicode_val_T
get_combined(unicode_val_T *data, int length)
{
struct hash_item *item;
unicode_val_T *key;
int i, indeks;
assert(length >= 1 && length <= UCS_MAX_LENGTH_COMBINED);
if_assert_failed return UCS_NO_CHAR;
if (!combined_hash) combined_hash = init_hash8();
if (!combined_hash) return UCS_NO_CHAR;
item = get_hash_item(combined_hash, (unsigned char *)data, length * sizeof(*data));
if (item) return (unicode_val_T)(long)item->value;
if (last_combined >= UCS_END_COMBINED) return UCS_NO_CHAR;
key = mem_alloc((length + 1) * sizeof(*key));
if (!key) return UCS_NO_CHAR;
for (i = 0; i < length; i++)
key[i] = data[i];
key[i] = UCS_END_COMBINED;
last_combined++;
indeks = last_combined - UCS_BEGIN_COMBINED;
combined = mem_realloc(combined, sizeof(*combined) * (indeks + 1));
if (!combined) {
mem_free(key);
last_combined--;
return UCS_NO_CHAR;
}
combined[indeks] = key;
item = add_hash_item(combined_hash, (unsigned char *)key,
length * sizeof(*data), (void *)(long)(last_combined));
if (!item) {
last_combined--;
mem_free(key);
return UCS_NO_CHAR;
}
return last_combined;
}
void
free_combined()
{
int i, end = last_combined - UCS_BEGIN_COMBINED + 1;
if (combined_hash)
free_hash(&combined_hash);
for (i = 0; i < end; i++)
mem_free(combined[i]);
mem_free_if(combined);
}
#endif /* CONFIG_COMBINE */
static void
add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str)
{
unsigned char *p = encode_utf8(u);
while (p[1]) {
if (ct[*p].t) ct = ct[*p].u.tbl;
else {
struct conv_table *nct;
assertm(ct[*p].u.str == no_str, "bad utf encoding #1");
if_assert_failed return;
nct = mem_calloc(256, sizeof(*nct));
if (!nct) return;
new_translation_table(nct);
ct[*p].t = 1;
ct[*p].u.tbl = nct;
ct = nct;
}
p++;
}
assertm(!ct[*p].t, "bad utf encoding #2");
if_assert_failed return;
if (ct[*p].u.str == no_str)
ct[*p].u.str = str;
}
2007-01-01 17:54:14 -05:00
/* A conversion table from some charset to UTF-8.
* If it is from UTF-8 to UTF-8, it converts each byte separately.
* Unlike in other translation tables, the strings in elements 0x80 to
* 0xFF are allocated dynamically. */
struct conv_table utf_table[256];
int utf_table_init = 1;
static void
free_utf_table(void)
{
int i;
2007-01-01 18:31:22 -05:00
/* Cast away const. */
for (i = 128; i < 256; i++)
2007-01-01 18:31:22 -05:00
mem_free((unsigned char *) utf_table[i].u.str);
}
static struct conv_table *
get_translation_table_to_utf8(int from)
{
int i;
static int lfr = -1;
if (from == -1) return NULL;
from &= ~SYSTEM_CHARSET_FLAG;
if (from == lfr) return utf_table;
lfr = from;
if (utf_table_init) {
memset(utf_table, 0, sizeof(utf_table));
utf_table_init = 0;
} else
free_utf_table();
for (i = 0; i < 128; i++)
utf_table[i].u.str = strings[i];
if (is_cp_ptr_utf8(&codepages[from])) {
for (i = 128; i < 256; i++)
utf_table[i].u.str = stracpy(strings[i]);
return utf_table;
}
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
for (i = 128; i < 256; i++) {
unicode_val_T u = codepages[from].highhalf[i - 0x80];
if (u == 0xFFFF)
utf_table[i].u.str = NULL;
else
utf_table[i].u.str = stracpy(encode_utf8(u));
}
for (i = 0; codepages[from].table[i].c; i++) {
unicode_val_T u = codepages[from].table[i].u;
if (!utf_table[codepages[from].table[i].c].u.str)
utf_table[codepages[from].table[i].c].u.str =
stracpy(encode_utf8(u));
}
for (i = 128; i < 256; i++)
if (!utf_table[i].u.str)
utf_table[i].u.str = stracpy(no_str);
return utf_table;
}
2007-01-01 17:54:14 -05:00
/* A conversion table between two charsets, where the target is not UTF-8. */
static struct conv_table table[256];
static int first = 1;
void
free_conv_table(void)
{
if (!utf_table_init) free_utf_table();
if (first) {
memset(table, 0, sizeof(table));
first = 0;
}
new_translation_table(table);
}
struct conv_table *
get_translation_table(int from, int to)
{
static int lfr = -1;
static int lto = -1;
from &= ~SYSTEM_CHARSET_FLAG;
to &= ~SYSTEM_CHARSET_FLAG;
if (first) {
memset(table, 0, sizeof(table));
first = 0;
}
if (/*from == to ||*/ from == -1 || to == -1)
return NULL;
if (is_cp_ptr_utf8(&codepages[to]))
return get_translation_table_to_utf8(from);
if (from == lfr && to == lto)
return table;
lfr = from;
lto = to;
new_translation_table(table);
if (is_cp_ptr_utf8(&codepages[from])) {
int i;
/* Map U+00A0 and U+00AD the same way as u2cp() would. */
add_utf8(table, UCS_NO_BREAK_SPACE, strings[NBSP_CHAR]);
add_utf8(table, UCS_SOFT_HYPHEN, "");
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
for (i = 0x80; i <= 0xFF; i++)
if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
add_utf8(table,
codepages[to].highhalf[i - 0x80],
strings[i]);
for (i = 0; codepages[to].table[i].c; i++)
add_utf8(table, codepages[to].table[i].u,
strings[codepages[to].table[i].c]);
for (i = 0; unicode_7b[i].x != -1; i++)
if (unicode_7b[i].x >= 0x80)
add_utf8(table, unicode_7b[i].x,
unicode_7b[i].s);
} else {
int i;
for (i = 128; i < 256; i++) {
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
const unsigned char *u;
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
u = u2cp(codepages[from].highhalf[i - 0x80], to);
if (u) table[i].u.str = u;
}
}
}
return table;
}
static inline int
xxstrcmp(unsigned char *s1, unsigned char *s2, int l2)
{
while (l2) {
if (*s1 > *s2) return 1;
if (*s1 < *s2) return -1;
s1++;
s2++;
l2--;
}
return *s2 ? -1 : 0;
}
/* Entity cache debugging purpose. */
#if 0
#define DEBUG_ENTITY_CACHE
#else
#undef DEBUG_ENTITY_CACHE
#endif
struct entity_cache {
unsigned int hits;
int strlen;
int encoding;
const unsigned char *result;
unsigned char str[20]; /* Suffice in any case. */
};
/* comparison function for qsort() */
static int
hits_cmp(const void *v1, const void *v2)
{
const struct entity_cache *a = v1, *b = v2;
if (a->hits == b->hits) return 0;
if (a->hits > b->hits) return -1;
else return 1;
}
static int
compare_entities(const void *key_, const void *element_)
{
struct string *key = (struct string *) key_;
struct entity *element = (struct entity *) element_;
int length = key->length;
unsigned char *first = key->source;
unsigned char *second = element->s;
return xxstrcmp(first, second, length);
}
const unsigned char *
get_entity_string(const unsigned char *str, const int strlen, int encoding)
{
#define ENTITY_CACHE_SIZE 10 /* 10 seems a good value. */
#define ENTITY_CACHE_MAXLEN 9 /* entities with length >= ENTITY_CACHE_MAXLEN or == 1
will go in [0] table */
static struct entity_cache entity_cache[ENTITY_CACHE_MAXLEN][ENTITY_CACHE_SIZE];
static unsigned int nb_entity_cache[ENTITY_CACHE_MAXLEN];
static int first_time = 1;
2006-01-14 16:44:00 -05:00
unsigned int slen = 0;
const unsigned char *result = NULL;
if (strlen <= 0) return NULL;
#ifdef CONFIG_UTF8
2006-01-14 16:44:00 -05:00
/* TODO: caching UTF-8 */
encoding &= ~SYSTEM_CHARSET_FLAG;
if (is_cp_ptr_utf8(&codepages[encoding]))
2006-01-14 16:44:00 -05:00
goto skip;
#endif /* CONFIG_UTF8 */
2006-01-14 16:44:00 -05:00
if (first_time) {
memset(&nb_entity_cache, 0, ENTITY_CACHE_MAXLEN * sizeof(unsigned int));
first_time = 0;
}
/* Check if cached. A test on many websites (freshmeat.net + whole ELinks website
* + google + slashdot + websites that result from a search for test on google,
* + various ones) show a quite impressive improvment:
* Top ten is:
* 0: hits=2459 l=4 st='nbsp'
* 1: hits=2152 l=6 st='eacute'
* 2: hits=235 l=6 st='egrave'
* 3: hits=136 l=6 st='agrave'
* 4: hits=100 l=3 st='amp'
* 5: hits=40 l=5 st='laquo'
* 6: hits=8 l=4 st='copy'
* 7: hits=5 l=2 st='gt'
* 8: hits=2 l=2 st='lt'
* 9: hits=1 l=6 st='middot'
*
* Most of the time cache hit ratio is near 95%.
*
* A long test shows: 15186 hits vs. 24 misses and mean iteration
* count is kept < 2 (worst case 1.58). Not so bad ;)
*
* --Zas */
/* entities with length >= ENTITY_CACHE_MAXLEN or == 1 will go in [0] table */
slen = (strlen > 1 && strlen < ENTITY_CACHE_MAXLEN) ? strlen : 0;
if (strlen < ENTITY_CACHE_MAXLEN && nb_entity_cache[slen] > 0) {
int i;
for (i = 0; i < nb_entity_cache[slen]; i++) {
if (entity_cache[slen][i].encoding == encoding
&& !memcmp(str, entity_cache[slen][i].str, strlen)) {
#ifdef DEBUG_ENTITY_CACHE
static double total_iter = 0;
static unsigned long hit_count = 0;
total_iter += i + 1;
hit_count++;
fprintf(stderr, "hit after %d iter. (mean = %0.2f)\n", i + 1, total_iter / (double) hit_count);
#endif
if (entity_cache[slen][i].hits < (unsigned int) ~0)
entity_cache[slen][i].hits++;
return entity_cache[slen][i].result;
}
}
#ifdef DEBUG_ENTITY_CACHE
fprintf(stderr, "miss\n");
#endif
}
#ifdef CONFIG_UTF8
2006-01-14 16:44:00 -05:00
skip:
#endif /* CONFIG_UTF8 */
if (*str == '#') { /* Numeric entity. */
int l = (int) strlen;
unsigned char *st = (unsigned char *) str;
unicode_val_T n = 0;
if (l == 1) goto end; /* &#; ? */
st++, l--;
if ((*st | 32) == 'x') { /* Hexadecimal */
if (l == 1 || l > 9) goto end; /* xFFFFFFFF max. */
st++, l--;
do {
unsigned char c = (*(st++) | 32);
if (isdigit(c))
n = (n << 4) | (c - '0');
else if (isxdigit(c))
n = (n << 4) | (c - 'a' + 10);
else
goto end; /* Bad char. */
} while (--l);
} else { /* Decimal */
if (l > 10) goto end; /* 4294967295 max. */
do {
unsigned char c = *(st++);
if (isdigit(c))
n = n * 10 + c - '0';
else
goto end; /* Bad char. */
/* Limit to 0xFFFFFFFF. */
if (n >= (unicode_val_T) 0xFFFFFFFFu)
goto end;
} while (--l);
}
result = u2cp(n, encoding);
#ifdef DEBUG_ENTITY_CACHE
fprintf(stderr, "%lu %016x %s\n", (unsigned long) n , n, result);
#endif
} else { /* Text entity. */
struct string key = INIT_STRING((unsigned char *) str, strlen);
struct entity *element = bsearch((void *) &key, entities,
N_ENTITIES,
sizeof(*element),
compare_entities);
if (element) result = u2cp(element->c, encoding);
}
#ifdef CONFIG_UTF8
if (is_cp_ptr_utf8(&codepages[encoding])) {
2006-01-14 16:44:00 -05:00
return result;
}
#endif /* CONFIG_UTF8 */
end:
/* Take care of potential buffer overflow. */
if (strlen < sizeof(entity_cache[slen][0].str)) {
struct entity_cache *ece;
/* Sort entries by hit order. */
if (nb_entity_cache[slen] > 1)
qsort(&entity_cache[slen][0], nb_entity_cache[slen],
sizeof(entity_cache[slen][0]), hits_cmp);
/* Increment number of cache entries if possible.
* Else, just replace the least used entry. */
if (nb_entity_cache[slen] < ENTITY_CACHE_SIZE) nb_entity_cache[slen]++;
ece = &entity_cache[slen][nb_entity_cache[slen] - 1];
/* Copy new entry to cache. */
ece->hits = 1;
ece->strlen = strlen;
ece->encoding = encoding;
ece->result = result;
memcpy(ece->str, str, strlen);
ece->str[strlen] = '\0';
#ifdef DEBUG_ENTITY_CACHE
fprintf(stderr, "Added in [%u]: l=%d st='%s'\n", slen,
entity_cache[slen][0].strlen, entity_cache[slen][0].str);
{
unsigned int i;
fprintf(stderr, "- Cache entries [%u] -\n", slen);
for (i = 0; i < nb_entity_cache[slen] ; i++)
fprintf(stderr, "%d: hits=%u l=%d st='%s'\n", i,
entity_cache[slen][i].hits, entity_cache[slen][i].strlen,
entity_cache[slen][i].str);
fprintf(stderr, "-----------------\n");
}
#endif /* DEBUG_ENTITY_CACHE */
}
return result;
}
unsigned char *
convert_string(struct conv_table *convert_table,
unsigned char *chars, int charslen, int cp,
enum convert_string_mode mode, int *length,
void (*callback)(void *data, unsigned char *buf, int buflen),
void *callback_data)
{
unsigned char *buffer;
int bufferpos = 0;
int charspos = 0;
if (!convert_table && !memchr(chars, '&', charslen)) {
if (callback) {
if (charslen) callback(callback_data, chars, charslen);
return NULL;
} else {
return memacpy(chars, charslen);
}
}
/* Buffer allocation */
buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */);
if (!buffer) return NULL;
/* Iterate ;-) */
while (charspos < charslen) {
2007-01-01 18:31:22 -05:00
const unsigned char *translit;
#define PUTC do { \
buffer[bufferpos++] = chars[charspos++]; \
translit = ""; \
goto flush; \
} while (0)
if (chars[charspos] != '&') {
struct conv_table *t;
int i;
if (chars[charspos] < 128 || !convert_table) PUTC;
t = convert_table;
i = charspos;
while (t[chars[i]].t) {
t = t[chars[i++]].u.tbl;
if (i >= charslen) PUTC;
}
translit = t[chars[i]].u.str;
charspos = i + 1;
} else if (mode == CSM_FORM || mode == CSM_NONE) {
PUTC;
} else {
int start = charspos + 1;
int i = start;
while (i < charslen
&& (isasciialpha(chars[i])
|| isdigit(chars[i])
|| (chars[i] == '#')))
i++;
/* This prevents bug 213: we were expanding "entities"
* in URL query strings. */
/* XXX: But this disables &nbsp&nbsp usage, which
* appears to be relatively common! --pasky */
if ((mode == CSM_DEFAULT || (chars[i] != '&' && chars[i] != '='))
&& i > start
&& !isasciialpha(chars[i]) && !isdigit(chars[i])) {
translit = get_entity_string(&chars[start], i - start,
cp);
if (chars[i] != ';') {
/* Eat &nbsp &nbsp<foo> happily, but
* pull back from the character after
* entity string if it is not the valid
* terminator. */
i--;
}
if (!translit) PUTC;
charspos = i + (i < charslen);
} else PUTC;
}
if (!translit[0]) continue;
if (!translit[1]) {
buffer[bufferpos++] = translit[0];
translit = "";
goto flush;
}
while (*translit) {
unsigned char *new;
buffer[bufferpos++] = *(translit++);
flush:
if (bufferpos & (ALLOC_GR - 1)) continue;
if (callback) {
buffer[bufferpos] = 0;
callback(callback_data, buffer, bufferpos);
bufferpos = 0;
} else {
new = mem_realloc(buffer, bufferpos + ALLOC_GR);
if (!new) {
mem_free(buffer);
return NULL;
}
buffer = new;
}
}
#undef PUTC
}
/* Say bye */
buffer[bufferpos] = 0;
if (length) *length = bufferpos;
if (callback) {
if (bufferpos) callback(callback_data, buffer, bufferpos);
mem_free(buffer);
return NULL;
} else {
return buffer;
}
}
#ifndef USE_FASTFIND
int
get_cp_index(unsigned char *name)
{
int i, a;
int syscp = 0;
if (!strcasecmp(name, "System")) {
#if HAVE_LANGINFO_CODESET
name = nl_langinfo(CODESET);
syscp = SYSTEM_CHARSET_FLAG;
#else
name = "us-ascii";
#endif
}
for (i = 0; codepages[i].name; i++) {
for (a = 0; codepages[i].aliases[a]; a++) {
/* In the past, we looked for the longest substring
* in all the names; it is way too expensive, though:
*
* % cumulative self self total
* time seconds seconds calls us/call us/call name
* 3.00 0.66 0.03 1325 22.64 22.64 get_cp_index
*
* Anything called from redraw_screen() is in fact
* relatively expensive, even if it's called just
* once. So we will do a simple strcasecmp() here.
*/
if (!strcasecmp(name, codepages[i].aliases[a]))
return i | syscp;
}
}
if (syscp) {
return get_cp_index("us-ascii") | syscp;
} else {
return -1;
}
}
#else
static unsigned int i_name = 0;
static unsigned int i_alias = 0;
/* Reset internal list pointer */
void
charsets_list_reset(void)
{
i_name = 0;
i_alias = 0;
}
/* Returns a pointer to a struct that contains current key and data pointers
* and increment internal pointer. It returns NULL when key is NULL. */
struct fastfind_key_value *
charsets_list_next(void)
{
static struct fastfind_key_value kv;
if (!codepages[i_name].name) return NULL;
kv.key = codepages[i_name].aliases[i_alias];
kv.data = (void *) &codepages[i_name]; /* cast away const */
if (codepages[i_name].aliases[i_alias + 1])
i_alias++;
else {
i_name++;
i_alias = 0;
}
return &kv;
}
static struct fastfind_index ff_charsets_index
= INIT_FASTFIND_INDEX("charsets_lookup", charsets_list_reset, charsets_list_next);
/* It searchs for a charset named @name or one of its aliases and
* returns index for it or -1 if not found. */
int
get_cp_index(unsigned char *name)
{
const struct codepage_desc *codepage;
int syscp = 0;
if (!strcasecmp(name, "System")) {
#if HAVE_LANGINFO_CODESET
name = nl_langinfo(CODESET);
syscp = SYSTEM_CHARSET_FLAG;
#else
name = "us-ascii";
#endif
}
codepage = fastfind_search(&ff_charsets_index, name, strlen(name));
if (codepage) {
assert(codepages <= codepage && codepage < codepages + N_CODEPAGES);
return (codepage - codepages) | syscp;
} else if (syscp) {
return get_cp_index("us-ascii") | syscp;
} else {
return -1;
}
}
#endif /* USE_FASTFIND */
void
init_charsets_lookup(void)
{
#ifdef USE_FASTFIND
fastfind_index(&ff_charsets_index, FF_COMPRESS);
#endif
}
void
free_charsets_lookup(void)
{
#ifdef USE_FASTFIND
fastfind_done(&ff_charsets_index);
#endif
}
/* Get the codepage's name for displaying to the user, or NULL if
* @cp_index is one past the end. In the future, we might want to
* localize these with gettext. So it may be best not to use this
* function if the name will have to be converted back to an
* index. */
unsigned char *
get_cp_name(int cp_index)
{
if (cp_index < 0) return "none";
if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
return codepages[cp_index].name;
}
/* Get the codepage's name for saving to a configuration file. These
* names can be converted back to indexes, even in future versions of
* ELinks. */
unsigned char *
get_cp_config_name(int cp_index)
{
if (cp_index < 0) return "none";
if (cp_index & SYSTEM_CHARSET_FLAG) return "System";
if (!codepages[cp_index].aliases) return NULL;
return codepages[cp_index].aliases[0];
}
/* Get the codepage's name for sending to a library or server that
* understands MIME charset names. This function irreversibly maps
* the "System" codepage to the underlying charset. */
unsigned char *
get_cp_mime_name(int cp_index)
{
if (cp_index < 0) return "none";
cp_index &= ~SYSTEM_CHARSET_FLAG;
if (!codepages[cp_index].aliases) return NULL;
return codepages[cp_index].aliases[0];
}
int
2006-07-18 11:51:03 -04:00
is_cp_utf8(int cp_index)
{
cp_index &= ~SYSTEM_CHARSET_FLAG;
return is_cp_ptr_utf8(&codepages[cp_index]);
}