/* Source: elinks/src/intl/charsets.c
 * (mirror of https://github.com/rkd77/elinks.git,
 * synced 2024-07-17 15:14:35 -04:00; 1674 lines, 42 KiB, C) */

/* Charsets convertor */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* strcasecmp() */
#endif
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#if HAVE_LANGINFO_CODESET
#include <langinfo.h>
#endif
#include <ctype.h>
#include <stdlib.h>
#if HAVE_WCTYPE_H
#include <wctype.h>
#endif
#ifdef HAVE_ICONV
#include <errno.h>
#include <iconv.h>
#endif
#include "elinks.h"
#include "document/options.h"
#include "intl/charsets.h"
#include "util/conv.h"
#include "util/error.h"
#include "util/fastfind.h"
#include "util/hash.h"
#include "util/memory.h"
#include "util/string.h"
/* Fix namespace clash on MacOS. */
#define table table_elinks
/* One explicit pairing of a codepage byte with a Unicode character.
 * Arrays of these are scanned by u2cp_() until an entry whose @c is 0
 * is reached, so a zero @c acts as the terminator. */
struct table_entry {
/* The byte in the codepage. */
unsigned char c;
/* This should in principle be unicode_val_T, but because all
 * the values currently in codepage.inc fit in 16 bits, we can
 * as well use uint16_t and halve sizeof(struct table_entry)
 * from 8 bytes to 4. Should other characters ever be needed,
 * unicode_val_T u : 24 might be a possibility, although it
 * seems a little unportable as bitfields are in principle
 * restricted to int, which may be 16-bit. */
uint16_t u;
};
/* Description of one codepage: its names and the data needed to map
 * between its bytes and Unicode. */
struct codepage_desc {
	/* Name of the codepage.
	 * NOTE(review): presumably the canonical/display name; the
	 * users of this member are not visible here. */
	unsigned char *name;

	/* Alias list of the codepage.  is_cp_ptr_utf8() identifies the
	 * UTF-8 codepage by comparing this pointer with aliases_utf8. */
	unsigned char *const *aliases;

	/* The Unicode mappings of codepage bytes 0x80...0xFF.
	 * (0x00...0x7F are assumed to be ASCII in all codepages.)
	 * Because all current values fit in 16 bits, we store them as
	 * uint16_t rather than unicode_val_T.  If the codepage does
	 * not use some byte, then @highhalf maps that byte to 0xFFFF,
	 * which C code converts to UCS_REPLACEMENT_CHARACTER where
	 * appropriate.  (U+FFFF is reserved and will never be
	 * assigned as a character.) */
	const uint16_t *highhalf;

	/* If some byte in the codepage corresponds to multiple Unicode
	 * characters, then the preferred character is in @highhalf
	 * above, and the rest are listed here in @table.  This table
	 * is not used for translating from the codepage to Unicode. */
	const struct table_entry *table;

	/* Whether to use iconv for translation. */
	unsigned int iconv:1;
};
#include "intl/codepage.inc"
#include "intl/uni_7b.inc"
#include "intl/entity.inc"
/* Declare the external-linkage inline functions defined in this file.
 * Avoid the GCC 4.3.1 warning: `foo' declared inline after being
 * called.  The functions are not declared inline in charsets.h
 * because C99 6.7.4p6 says that every external-linkage function
 * declared inline shall be defined in the same translation unit.
 * The non-inline declarations in charsets.h also make sure that the
 * compiler emits global definitions for the symbols so that the
 * functions can be called from other translation units. */
NONSTATIC_INLINE unsigned char *encode_utf8(unicode_val_T u);
NONSTATIC_INLINE int utf8charlen(const unsigned char *p);
NONSTATIC_INLINE int unicode_to_cell(unicode_val_T c);
NONSTATIC_INLINE unicode_val_T utf8_to_unicode(unsigned char **string,
					       const unsigned char *end);
/* Table of one-character, NUL-terminated strings indexed by byte value.
 * u2cp_() returns strings[u] for ASCII code points and
 * strings[0x80 + j] for bytes found in a codepage's high half, and
 * new_translation_table() seeds the lower 128 conversion slots from it.
 *
 * NOTE(review): entries 23 (\027) and 31 (\037) hold "\033" rather than
 * their own byte value; every other entry i holds byte i.  It is not
 * evident from this file whether that is deliberate -- confirm before
 * "fixing" it. */
static const char strings[256][2] = {
"\000", "\001", "\002", "\003", "\004", "\005", "\006", "\007",
"\010", "\011", "\012", "\013", "\014", "\015", "\016", "\017",
"\020", "\021", "\022", "\023", "\024", "\025", "\026", "\033",
"\030", "\031", "\032", "\033", "\034", "\035", "\036", "\033",
"\040", "\041", "\042", "\043", "\044", "\045", "\046", "\047",
"\050", "\051", "\052", "\053", "\054", "\055", "\056", "\057",
"\060", "\061", "\062", "\063", "\064", "\065", "\066", "\067",
"\070", "\071", "\072", "\073", "\074", "\075", "\076", "\077",
"\100", "\101", "\102", "\103", "\104", "\105", "\106", "\107",
"\110", "\111", "\112", "\113", "\114", "\115", "\116", "\117",
"\120", "\121", "\122", "\123", "\124", "\125", "\126", "\127",
"\130", "\131", "\132", "\133", "\134", "\135", "\136", "\137",
"\140", "\141", "\142", "\143", "\144", "\145", "\146", "\147",
"\150", "\151", "\152", "\153", "\154", "\155", "\156", "\157",
"\160", "\161", "\162", "\163", "\164", "\165", "\166", "\167",
"\170", "\171", "\172", "\173", "\174", "\175", "\176", "\177",
"\200", "\201", "\202", "\203", "\204", "\205", "\206", "\207",
"\210", "\211", "\212", "\213", "\214", "\215", "\216", "\217",
"\220", "\221", "\222", "\223", "\224", "\225", "\226", "\227",
"\230", "\231", "\232", "\233", "\234", "\235", "\236", "\237",
"\240", "\241", "\242", "\243", "\244", "\245", "\246", "\247",
"\250", "\251", "\252", "\253", "\254", "\255", "\256", "\257",
"\260", "\261", "\262", "\263", "\264", "\265", "\266", "\267",
"\270", "\271", "\272", "\273", "\274", "\275", "\276", "\277",
"\300", "\301", "\302", "\303", "\304", "\305", "\306", "\307",
"\310", "\311", "\312", "\313", "\314", "\315", "\316", "\317",
"\320", "\321", "\322", "\323", "\324", "\325", "\326", "\327",
"\330", "\331", "\332", "\333", "\334", "\335", "\336", "\337",
"\340", "\341", "\342", "\343", "\344", "\345", "\346", "\347",
"\350", "\351", "\352", "\353", "\354", "\355", "\356", "\357",
"\360", "\361", "\362", "\363", "\364", "\365", "\366", "\367",
"\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377",
};
#ifdef HAVE_ICONV
/* File-wide iconv conversion descriptor, initialised to (iconv_t)-1,
 * the value iconv_open() returns on failure.
 * NOTE(review): the code that opens and uses it is outside this part
 * of the file. */
static iconv_t iconv_cd = (iconv_t)-1;
#endif
static void
free_translation_table(struct conv_table *p)
{
int i;
for (i = 0; i < 256; i++)
if (p[i].t)
free_translation_table(p[i].u.tbl);
mem_free(p);
}
/* A string used in conversion tables when there is no correct
 * conversion.  This is compared by address and therefore should be a
 * named array rather than a pointer so that it won't share storage
 * with any other string literal that happens to have the same
 * characters. */
static const unsigned char no_str[] = "*";
/* Reset the 256-slot conversion table @p to its initial state.
 * Nested tables left over from earlier use are released first; then
 * slots 0..127 map to the matching entry of strings[] and slots
 * 128..255 map to no_str.  The table's iconv_cp is set to -1. */
static void
new_translation_table(struct conv_table *p)
{
	int slot;

	/* Pass 1: drop every nested table still attached. */
	for (slot = 0; slot < 256; slot++) {
		if (!p[slot].t)
			continue;
		free_translation_table(p[slot].u.tbl);
	}

	/* Pass 2: install the default string for each byte. */
	for (slot = 0; slot < 256; slot++) {
		p[slot].t = 0;
		if (slot < 128)
			p[slot].u.str = strings[slot];
		else
			p[slot].u.str = no_str;
	}

	p->iconv_cp = -1;
}
/* Binary search in an array of structures sorted in ascending order of
 * one member.
 *
 * @table   the array
 * @entry   name of the member holding the sort key
 * @entries number of elements in @table
 * @key     value to look for
 * @result  lvalue that receives the index of the matching element, or
 *          -1 if there is none
 *
 * The loop condition doubles as the "not found" exit: once the range
 * is empty, `(result) = -1` is evaluated, and because its value is
 * nonzero the negation ends the loop.
 *
 * The final line of the macro must not end in a backslash, or the
 * declaration following the macro would be spliced into its body. */
#define BIN_SEARCH(table, entry, entries, key, result)			\
{									\
	long _s = 0, _e = (entries) - 1;				\
									\
	while (_s <= _e || !((result) = -1)) {				\
		long _m = (_s + _e) / 2;				\
									\
		if ((table)[_m].entry == (key)) {			\
			(result) = _m;					\
			break;						\
		}							\
		if ((table)[_m].entry > (key)) _e = _m - 1;		\
		if ((table)[_m].entry < (key)) _s = _m + 1;		\
	}								\
}
/* Substitutes for the code points 0x80...0x9F, indexed by (u - 0x80).
 * u2cp_() replaces such a code point with the value found here; an
 * entry of 0 means there is no substitute and u2cp_() returns NULL.
 * NOTE(review): the values look like the Windows-1252 reading of these
 * bytes, with ASCII approximations for some of them -- confirm. */
static const unicode_val_T strange_chars[32] = {
0x20ac, 0x0000, 0x002a, 0x0000, 0x201e, 0x2026, 0x2020, 0x2021,
0x005e, 0x2030, 0x0160, 0x003c, 0x0152, 0x0000, 0x0000, 0x0000,
0x0000, 0x0060, 0x0027, 0x0022, 0x0022, 0x002a, 0x2013, 0x2014,
0x007e, 0x2122, 0x0161, 0x003e, 0x0153, 0x0000, 0x0000, 0x0000,
};
/* Flag bit that may be set in a codepage index.  u2cp_() clears it
 * before using the index to subscript codepages[]. */
#define SYSTEM_CHARSET_FLAG 128

/* Nonzero if @cp_ptr (a pointer to struct codepage_desc) describes the
 * UTF-8 codepage, which is recognised by its alias list. */
#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)
/* Convert the Unicode character @u to a string in the codepage with
 * index @to (SYSTEM_CHARSET_FLAG in @to is ignored).
 *
 * @nbsp_mode selects how U+00A0 is rendered in non-UTF-8 codepages:
 * NBSP_MODE_HACK gives NBSP_CHAR_STRING, anything else a plain space.
 *
 * Returns a pointer to a NUL-terminated string that the caller must
 * not modify or free.  For a UTF-8 target this is the static buffer of
 * encode_utf8(), which the next call overwrites.  Returns NULL for a
 * code point in 0x80...0x9F that has no substitute in strange_chars[],
 * and no_str (compare by address) when no conversion is known. */
const unsigned char *
u2cp_(unicode_val_T u, int to, enum nbsp_mode nbsp_mode)
{
	int j;
	int s;

	if (u < 128) return strings[u];

	if (u < 0xa0) {
		u = strange_chars[u - 0x80];
		if (!u) return NULL;
		/* Several substitutes are plain ASCII ('*', '<', '~', ...).
		 * They must take the same fast path as any other ASCII
		 * character; the codepage tables searched below only
		 * describe bytes 0x80...0xFF and would not find them. */
		if (u < 128) return strings[u];
	}

	to &= ~SYSTEM_CHARSET_FLAG;

	if (is_cp_ptr_utf8(&codepages[to]))
		return encode_utf8(u);

	/* To mark non breaking spaces in non-UTF-8 strings, we use a
	 * special char NBSP_CHAR. */
	if (u == UCS_NO_BREAK_SPACE) {
		if (nbsp_mode == NBSP_MODE_HACK) return NBSP_CHAR_STRING;
		else /* NBSP_MODE_ASCII */ return " ";
	}
	if (u == UCS_SOFT_HYPHEN) return "";

	/* 0xFFFF marks unused bytes in @highhalf, so it must never be
	 * matched; larger values cannot occur in a uint16_t array. */
	if (u < 0xFFFF)
		for (j = 0; j < 0x80; j++)
			if (codepages[to].highhalf[j] == u)
				return strings[0x80 + j];

	/* Additional (non-preferred) mappings; the list ends at c == 0. */
	for (j = 0; codepages[to].table[j].c; j++)
		if (codepages[to].table[j].u == u)
			return strings[codepages[to].table[j].c];

	/* Last resort: the 7-bit approximation table. */
	BIN_SEARCH(unicode_7b, x, N_UNICODE_7B, u, s);
	if (s != -1) return unicode_7b[s].s;

	return no_str;
}
/* Output buffer of encode_utf8(): up to six bytes plus the NUL. */
static unsigned char utf_buffer[7];

/* Encode @u as a NUL-terminated UTF-8 sequence of one to six bytes
 * (the 31-bit form of UTF-8).  The result lives in a static buffer
 * that every call overwrites. */
NONSTATIC_INLINE unsigned char *
encode_utf8(unicode_val_T u)
{
	unsigned char *out = utf_buffer;

	memset(utf_buffer, 0, sizeof(utf_buffer));

	if (u < 0x80) {
		out[0] = u;
	} else if (u < 0x800) {
		out[0] = 0xc0 | ((u >> 6) & 0x1f);
		out[1] = 0x80 | (u & 0x3f);
	} else if (u < 0x10000) {
		out[0] = 0xe0 | ((u >> 12) & 0x0f);
		out[1] = 0x80 | ((u >> 6) & 0x3f);
		out[2] = 0x80 | (u & 0x3f);
	} else if (u < 0x200000) {
		out[0] = 0xf0 | ((u >> 18) & 0x0f);
		out[1] = 0x80 | ((u >> 12) & 0x3f);
		out[2] = 0x80 | ((u >> 6) & 0x3f);
		out[3] = 0x80 | (u & 0x3f);
	} else if (u < 0x4000000) {
		out[0] = 0xf8 | ((u >> 24) & 0x0f);
		out[1] = 0x80 | ((u >> 18) & 0x3f);
		out[2] = 0x80 | ((u >> 12) & 0x3f);
		out[3] = 0x80 | ((u >> 6) & 0x3f);
		out[4] = 0x80 | (u & 0x3f);
	} else {
		out[0] = 0xfc | ((u >> 30) & 0x01);
		out[1] = 0x80 | ((u >> 24) & 0x3f);
		out[2] = 0x80 | ((u >> 18) & 0x3f);
		out[3] = 0x80 | ((u >> 12) & 0x3f);
		out[4] = 0x80 | ((u >> 6) & 0x3f);
		out[5] = 0x80 | (u & 0x3f);
	}

	return utf_buffer;
}
/* Length in bytes of a UTF-8 character, indexed by its first byte.
 * Bytes that cannot start a character (continuation bytes 0x80...0xBF
 * and 0xFE, 0xFF) are given length 1; utf8_to_unicode() detects and
 * rejects them separately.
 * Layout: 0x00...0xBF -> 1, 0xC0...0xDF -> 2, 0xE0...0xEF -> 3,
 * 0xF0...0xF7 -> 4, 0xF8...0xFB -> 5, 0xFC...0xFD -> 6, 0xFE...0xFF -> 1. */
static const char utf8char_len_tab[256] = {
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 5,5,5,5,6,6,1,1,
};
#ifdef CONFIG_UTF8
/* Return the length in bytes of the UTF-8 character whose first byte
 * is *@p, as given by utf8char_len_tab[], or 0 if @p is NULL. */
NONSTATIC_INLINE int
utf8charlen(const unsigned char *p)
{
	if (!p)
		return 0;

	return utf8char_len_tab[*p];
}
/* Count the UTF-8 characters in the NUL-terminated string *@str.
 *
 * Character lengths are taken from the lead bytes alone.  Counting
 * stops at the terminating NUL, or earlier if the last character
 * claims more bytes than remain before the NUL.
 *
 * On return *@str points to where counting stopped.  Returns the
 * number of complete characters counted. */
int
strlen_utf8(unsigned char **str)
{
	unsigned char *s = *str;
	unsigned char *end = strchr((const char *)s, '\0');
	int x;
	int len;

	for (x = 0;; x++, s += len) {
		len = utf8charlen(s);
		/* At the NUL itself len is 1, so this also ends the
		 * loop normally. */
		if (s + len > end) break;
	}
	*str = s;
	return x;
}
/* Nonzero if byte @p has its top bit clear, i.e. is an ASCII byte that
 * forms a character on its own. */
#define utf8_issingle(p) (((p) & 0x80) == 0)
/* Nonzero if byte @p is not a continuation byte (10xxxxxx): either an
 * ASCII byte or a byte whose two top bits are both set. */
#define utf8_islead(p) (utf8_issingle(p) || ((p) & 0xc0) == 0xc0)
/* Move back @pos characters from @current, without going further left
 * than @start, and return the resulting pointer.  Returns NULL if
 * @current or @start is NULL or @pos is negative. */
unsigned char *
utf8_prevchar(unsigned char *current, int pos, unsigned char *start)
{
	unsigned char *cursor = current;
	int remaining = pos;

	if (!current || !start || pos < 0)
		return NULL;

	for (; remaining > 0 && cursor != start; ) {
		--cursor;
		/* Only a lead byte completes one step backwards. */
		if (utf8_islead(*cursor))
			--remaining;
	}

	return cursor;
}
/* Count the number of standard terminal cells needed for displaying
 * the UTF-8 character at @utf8_char.
 *
 * @end marks the end of the text; if it is NULL, the terminating NUL
 * of @utf8_char is used instead.
 *
 * Returns the width reported by unicode_to_cell() for the decoded
 * character, or -1 if @utf8_char is NULL. */
int
utf8_char2cells(unsigned char *utf8_char, unsigned char *end)
{
	unicode_val_T u;

	/* Check the pointer before handing it to strchr(). */
	if (!utf8_char)
		return -1;

	if (end == NULL)
		end = strchr((const char *)utf8_char, '\0');

	if (!end)
		return -1;

	u = utf8_to_unicode(&utf8_char, end);

	return unicode_to_cell(u);
}
/* Count the number of standard terminal cells needed for displaying
 * the UTF-8 text from @string up to @end.
 *
 * If @end is NULL, the terminating NUL of @string is used.  A trailing
 * character that would extend past @end is not counted.
 *
 * Returns the total width, or -1 if @string is NULL or the width of
 * some character cannot be determined. */
int
utf8_ptr2cells(unsigned char *string, unsigned char *end)
{
	int charlen, cell, cells = 0;

	/* Check the pointer before handing it to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr((const char *)string, '\0');

	if (!end)
		return -1;

	for (;;) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		cell = utf8_char2cells(string, end);
		if (cell < 0)
			return -1;

		cells += cell;
		string += charlen;
	}

	return cells;
}
/* Count the number of UTF-8 characters from @string up to @end.
 *
 * If @end is NULL, the terminating NUL of @string is used.  Character
 * lengths are taken from lead bytes only; a trailing character that
 * would extend past @end is not counted.
 *
 * Returns the count, or -1 if @string is NULL. */
int
utf8_ptr2chars(unsigned char *string, unsigned char *end)
{
	int charlen, chars = 0;

	/* Check the pointer before handing it to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr((const char *)string, '\0');

	if (!end)
		return -1;

	for (;;) {
		charlen = utf8charlen(string);
		if (string + charlen > end)
			break;

		chars++;
		string += charlen;
	}

	return chars;
}
/*
 * Count the number of bytes from the beginning of @string that are
 * needed for displaying at most @max_cells terminal cells.
 *
 * If @end is NULL, the terminating NUL of @string is used.  The result
 * never exceeds the distance from @string to @end.
 *
 * Returns the byte count, or -1 if @string is NULL or the width of
 * some character cannot be determined.
 */
int
utf8_cells2bytes(unsigned char *string, int max_cells, unsigned char *end)
{
	unsigned int bytes = 0, cells = 0;

	assert(max_cells>=0);

	/* Check the pointer before handing it to strchr(). */
	if (!string)
		return -1;

	if (end == NULL)
		end = strchr((const char *)string, '\0');

	if (!end)
		return -1;

	for (;;) {
		int cell = utf8_char2cells(&string[bytes], end);

		if (cell < 0)
			return -1;

		cells += cell;
		/* @max_cells is non-negative (asserted above); the cast
		 * only makes the unsigned comparison explicit. */
		if (cells > (unsigned int) max_cells)
			break;

		bytes += utf8charlen(&string[bytes]);

		if (string + bytes > end) {
			/* The last character was cut short by @end. */
			bytes = end - string;
			break;
		}
	}

	return bytes;
}
/* Take @max steps forward from @string in the specified @way, but
 * not going past @end.  Return the resulting address.  Store the
 * number of steps taken to *@count, unless @count is NULL.
 *
 * If @end is NULL, the terminating NUL of @string is used.
 *
 * This assumes the text is valid UTF-8, and @string and @end point to
 * character boundaries.  If not, it doesn't crash but the results may
 * be inconsistent.
 *
 * This function can do some of the same jobs as utf8charlen(),
 * utf8_cells2bytes(), and strlen_utf8(). */
unsigned char *
utf8_step_forward(unsigned char *string, unsigned char *end,
		  int max, enum utf8_step way, int *count)
{
	int steps = 0;
	unsigned char *current = string;

	assert(string);
	assert(max >= 0);
	if_assert_failed goto invalid_arg;

	if (end == NULL)
		end = strchr((const char *)string, '\0');

	switch (way) {
	case UTF8_STEP_CHARACTERS:
		while (steps < max && current < end) {
			++current;
			/* A character is complete when the next lead
			 * byte or @end is reached.  @end itself is not
			 * dereferenced: it may be one past the last
			 * readable byte. */
			if (current >= end || utf8_islead(*current))
				++steps;
		}
		break;

	case UTF8_STEP_CELLS_FEWER:
	case UTF8_STEP_CELLS_MORE:
		while (steps < max && current < end) {
			unicode_val_T u;
			unsigned char *prev = current;
			int width;

			u = utf8_to_unicode(&current, end);
			if (u == UCS_NO_CHAR) {
				/* Assume the incomplete sequence
				 * costs one cell. */
				current = end;
				++steps;
				break;
			}

			width = unicode_to_cell(u);
			if (way == UTF8_STEP_CELLS_FEWER
			    && steps + width > max) {
				/* Back off. */
				current = prev;
				break;
			}

			steps += width;
		}
		break;

	default:
		INTERNAL("impossible enum utf8_step");
	}

invalid_arg:
	if (count)
		*count = steps;
	return current;
}
/* Walk backward from @string by at most @max steps of kind @way,
 * never moving to the left of @start.  The address reached is
 * returned, and the number of steps actually taken is written to
 * *@count when @count is not NULL.
 *
 * The text is expected to be valid UTF-8 with @string and @start on
 * character boundaries.  Otherwise the function still doesn't crash
 * but the results may be inconsistent.
 *
 * This overlaps with what utf8_prevchar() can do. */
unsigned char *
utf8_step_backward(unsigned char *string, unsigned char *start,
		   int max, enum utf8_step way, int *count)
{
	unsigned char *current = string;
	int steps = 0;

	assert(string);
	assert(start);
	assert(max >= 0);
	if_assert_failed goto invalid_arg;

	switch (way) {
	case UTF8_STEP_CHARACTERS:
		for (; steps < max && current > start; ) {
			--current;
			if (utf8_islead(*current))
				++steps;
		}
		break;

	case UTF8_STEP_CELLS_FEWER:
	case UTF8_STEP_CELLS_MORE:
		while (steps < max && current > start) {
			unsigned char *const char_end = current;
			unsigned char *decode_from;
			unicode_val_T u;
			int width;

			/* Find the lead byte of the preceding character. */
			for (--current;
			     current > start && !utf8_islead(*current);
			     --current)
				;

			decode_from = current;
			u = utf8_to_unicode(&decode_from, char_end);

			/* An incomplete sequence is assumed to cost
			 * one cell. */
			width = (u == UCS_NO_CHAR) ? 1 : unicode_to_cell(u);

			if (way == UTF8_STEP_CELLS_FEWER
			    && steps + width > max) {
				/* Taking this character would overshoot;
				 * undo the move. */
				current = char_end;
				break;
			}

			steps += width;
		}
		break;

	default:
		INTERNAL("impossible enum utf8_step");
	}

invalid_arg:
	if (count)
		*count = steps;
	return current;
}
2006-07-27 03:51:10 -04:00
/*
* Find out number of standard terminal collumns needed for displaying symbol
* (glyph) which represents Unicode character c.
*
* TODO: Use wcwidth when it is available. This seems to require:
* - Make the configure script check whether <wchar.h> and wcwidth exist.
* - Define _XOPEN_SOURCE and include <wchar.h>.
* - Test that __STDC_ISO_10646__ is defined. (This macro means wchar_t
* matches ISO 10646 in all locales.)
* However, these do not suffice, because wcwidth depends on LC_CTYPE
* in glibc-2.3.6. For instance, wcwidth(0xff20) is -1 when LC_CTYPE
* is "fi_FI.ISO-8859-1" or "C", but 2 when LC_CTYPE is "fi_FI.UTF-8".
* <features.h> defines __STDC_ISO_10646__ as 200009L, so 0xff20 means
* U+FF20 FULLWIDTH COMMERCIAL AT regardless of LC_CTYPE; but this
* character is apparently not supported in all locales. Why is that?
* - Perhaps there is standardese that requires supported characters
* to be convertable to multibyte form. Then ELinks could just pick
* some UTF-8 locale for its wcwidth purposes.
* - Perhaps wcwidth can even return different nonnegative values for
* the same ISO 10646 character in different locales. Then ELinks
* would have to set LC_CTYPE to match at least the terminal's
* charset (which may differ from the LC_CTYPE environment variable,
* especially when the master process is serving a slave terminal).
* But there is no guarantee that the libc supports all the same
* charsets as ELinks does.
* For now, it seems safest to avoid the potentially locale-dependent
* libc version of wcwidth, and instead use a hardcoded mapping.
*
* @return 2 for double-width glyph, 1 for others.
2017-05-16 12:37:29 -04:00
* 0 for unprintable glyphs (like 0x200e: "LEFT-TO-RIGHT MARK")
*/
NONSTATIC_INLINE int
unicode_to_cell(unicode_val_T c)
{
2017-05-16 12:37:29 -04:00
if (c == 0x200e || c == 0x200f)
return 0;
if (c >= 0x1100
&& (c <= 0x115f /* Hangul Jamo */
|| c == 0x2329
|| c == 0x232a
|| (c >= 0x2e80 && c <= 0xa4cf
&& c != 0x303f) /* CJK ... Yi */
|| (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */
|| (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility
Ideographs */
|| (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */
|| (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */
|| (c >= 0xffe0 && c <= 0xffe6)
|| (c >= 0x20000 && c <= 0x2fffd)
|| (c >= 0x30000 && c <= 0x3fffd)))
return 2;
2006-07-27 03:51:10 -04:00
return 1;
}
/* Case-fold a Unicode character so that hotkeys in labels can be
 * matched without regard to case.  Callers must not rely on whether
 * the folded form is upper or lower case. */
unicode_val_T
unicode_fold_label_case(unicode_val_T c)
{
#if __STDC_ISO_10646__ && HAVE_WCTYPE_H
	return towlower(c);
#else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
	/* For now, this supports only ASCII.  It would be possible to
	 * use code generated from CaseFolding.txt of Unicode if the
	 * acknowledgements required by http://www.unicode.org/copyright.html
	 * were added to associated documentation of ELinks. */
	const int ascii_upper = (c >= 0x41 && c <= 0x5A);

	return ascii_upper ? c + 0x20 : c;
#endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
}
#endif /* CONFIG_UTF8 */
/* Decode the UTF-8 character that starts at *@string.
 *
 * @end points just past the last byte that may be read.
 *
 * - If the length implied by the lead byte reaches past @end, return
 *   UCS_NO_CHAR and leave *@string unchanged.
 * - If the sequence is invalid (stray continuation byte, bad
 *   continuation byte, overlong form, UTF-16 surrogate), advance
 *   *@string by one byte and return UCS_REPLACEMENT_CHARACTER.
 * - Otherwise advance *@string past the character and return its
 *   value.  The 5- and 6-byte forms of the original 31-bit UTF-8 are
 *   accepted. */
NONSTATIC_INLINE unicode_val_T
utf8_to_unicode(unsigned char **string, const unsigned char *end)
{
	unsigned char *str = *string;
	unicode_val_T u;
	int length;

	length = utf8char_len_tab[str[0]];

	if (str + length > end) {
		return UCS_NO_CHAR;
	}

	switch (length) {
	case 1:		/* U+0000 to U+007F */
		/* Continuation bytes and 0xFE/0xFF also have length 1
		 * in the table but cannot stand alone. */
		if (str[0] >= 0x80)
			goto invalid_utf8;
		u = str[0];
		break;

	case 2:		/* U+0080 to U+07FF */
		if ((str[1] & 0xc0) != 0x80)
			goto invalid_utf8;
		u = (str[0] & 0x1f) << 6;
		u += (str[1] & 0x3f);
		if (u < 0x80)
			goto invalid_utf8;
		break;

	case 3:		/* U+0800 to U+FFFF, except surrogates */
		if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
			goto invalid_utf8;
		u = (str[0] & 0x0f) << 12;
		u += ((str[1] & 0x3f) << 6);
		u += (str[2] & 0x3f);
		if (u < 0x800 || is_utf16_surrogate(u))
			goto invalid_utf8;
		break;

	case 4:		/* U+10000 to U+1FFFFF */
		if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
		    || (str[3] & 0xc0) != 0x80)
			goto invalid_utf8;
		/* Lead byte 11110xxx carries three payload bits. */
		u = (str[0] & 0x07) << 18;
		u += ((str[1] & 0x3f) << 12);
		u += ((str[2] & 0x3f) << 6);
		u += (str[3] & 0x3f);
		if (u < 0x10000)
			goto invalid_utf8;
		break;

	case 5:		/* U+200000 to U+3FFFFFF */
		if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
		    || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
			goto invalid_utf8;
		/* Lead byte 111110xx carries only two payload bits.
		 * Masking with 0x0f would let the marker bit 0x08 leak
		 * into the value (adding 0x8000000) and defeat the
		 * overlong check below. */
		u = (str[0] & 0x03) << 24;
		u += ((str[1] & 0x3f) << 18);
		u += ((str[2] & 0x3f) << 12);
		u += ((str[3] & 0x3f) << 6);
		u += (str[4] & 0x3f);
		if (u < 0x200000)
			goto invalid_utf8;
		break;

	case 6:		/* U+4000000 to U+7FFFFFFF */
		if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
		    || (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
		    || (str[5] & 0xc0) != 0x80)
			goto invalid_utf8;
		u = (str[0] & 0x01) << 30;
		u += ((str[1] & 0x3f) << 24);
		u += ((str[2] & 0x3f) << 18);
		u += ((str[3] & 0x3f) << 12);
		u += ((str[4] & 0x3f) << 6);
		u += (str[5] & 0x3f);
		if (u < 0x4000000)
			goto invalid_utf8;
		break;

	default:
		INTERNAL("utf8char_len_tab out of range");
		goto invalid_utf8;
	}

	*string = str + length;
	return u;

invalid_utf8:
	/* Skip a single byte so that decoding can resynchronise. */
	++*string;
	return UCS_REPLACEMENT_CHARACTER;
}
/* The common part of cp2u and cp2utf_8. */