1
0
mirror of https://github.com/rkd77/elinks.git synced 2025-01-03 14:57:44 -05:00
elinks/src/intl/codepage.inc

5055 lines
227 KiB
PHP
Raw Normal View History

/* Automatically generated by gen-cp */
/* DO NOT EDIT THIS FILE! EDIT Unicode/<whatever> INSTEAD! */
/* See the input files for copyrights and licences. */
/*** 7bit ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The 7bit codepage defines no tables of its own; both names are simply
 * redirected to the shared *_NULL objects (declared outside this file). */
#define highhalf_7bit highhalf_NULL
#define table_7bit table_NULL

/* Charset names accepted for the 7bit codepage.
 * The list is terminated by a NULL pointer; the order of the entries is
 * kept exactly as emitted by the generator. */
char *const aliases_7bit [] = {
	"us-ascii", "ascii",
	"7bit", "7-bit",
	"iso-ir-6",
	"ANSI_X3.4-1968", "ANSI_X3.4-1986",
	"646", "cp646",
	"ISO_646.irv:1991", "ISO646-US",
	"us",
	"IBM367", "cp367",
	"csASCII",
	"ISO646.1991-IRV",
	NULL
};
/*** 8859_1 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* ISO-8859-1: Unicode code point for each byte 0x80..0xFF.
 * The byte itself is not stored; entry [i] belongs to byte (0x80 + i),
 * so the array has exactly 128 elements.
 * NOTE(review): 0xFFFF is used for 0x80..0x9F, which carry no character
 * name here - presumably the "no mapping" marker; confirm against the
 * code that consumes these tables. */
const uint16_t highhalf_8859_1 [] = {
/* 0x80 */ 0xFFFF,
/* 0x81 */ 0xFFFF,
/* 0x82 */ 0xFFFF,
/* 0x83 */ 0xFFFF,
/* 0x84 */ 0xFFFF,
/* 0x85 */ 0xFFFF,
/* 0x86 */ 0xFFFF,
/* 0x87 */ 0xFFFF,
/* 0x88 */ 0xFFFF,
/* 0x89 */ 0xFFFF,
/* 0x8A */ 0xFFFF,
/* 0x8B */ 0xFFFF,
/* 0x8C */ 0xFFFF,
/* 0x8D */ 0xFFFF,
/* 0x8E */ 0xFFFF,
/* 0x8F */ 0xFFFF,
/* 0x90 */ 0xFFFF,
/* 0x91 */ 0xFFFF,
/* 0x92 */ 0xFFFF,
/* 0x93 */ 0xFFFF,
/* 0x94 */ 0xFFFF,
/* 0x95 */ 0xFFFF,
/* 0x96 */ 0xFFFF,
/* 0x97 */ 0xFFFF,
/* 0x98 */ 0xFFFF,
/* 0x99 */ 0xFFFF,
/* 0x9A */ 0xFFFF,
/* 0x9B */ 0xFFFF,
/* 0x9C */ 0xFFFF,
/* 0x9D */ 0xFFFF,
/* 0x9E */ 0xFFFF,
/* 0x9F */ 0xFFFF,
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x00A1, /* INVERTED EXCLAMATION MARK */
/* 0xA2 */ 0x00A2, /* CENT SIGN */
/* 0xA3 */ 0x00A3, /* POUND SIGN */
/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
/* 0xA5 */ 0x00A5, /* YEN SIGN */
/* 0xA6 */ 0x00A6, /* BROKEN BAR */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x00A8, /* DIAERESIS */
/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xAA */ 0x00AA, /* FEMININE ORDINAL INDICATOR */
/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xAC */ 0x00AC, /* NOT SIGN */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
/* 0xAF */ 0x00AF, /* MACRON */
/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
/* 0xB5 */ 0x00B5, /* MICRO SIGN */
/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
/* 0xB8 */ 0x00B8, /* CEDILLA */
/* 0xB9 */ 0x00B9, /* SUPERSCRIPT ONE */
/* 0xBA */ 0x00BA, /* MASCULINE ORDINAL INDICATOR */
/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xBC */ 0x00BC, /* VULGAR FRACTION ONE QUARTER */
/* 0xBD */ 0x00BD, /* VULGAR FRACTION ONE HALF */
/* 0xBE */ 0x00BE, /* VULGAR FRACTION THREE QUARTERS */
/* 0xBF */ 0x00BF, /* INVERTED QUESTION MARK */
/* 0xC0 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
/* 0xC3 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
/* 0xC6 */ 0x00C6, /* LATIN CAPITAL LETTER AE */
/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
/* 0xC8 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0xCA */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
/* 0xCC */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
/* 0xCF */ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
/* 0xD0 */ 0x00D0, /* LATIN CAPITAL LETTER ETH (Icelandic) */
/* 0xD1 */ 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
/* 0xD2 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
/* 0xD8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
/* 0xD9 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
/* 0xDE */ 0x00DE, /* LATIN CAPITAL LETTER THORN (Icelandic) */
/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S (German) */
/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0xE3 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
/* 0xE6 */ 0x00E6, /* LATIN SMALL LETTER AE */
/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
/* 0xEC */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
/* 0xEF */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
/* 0xF0 */ 0x00F0, /* LATIN SMALL LETTER ETH (Icelandic) */
/* 0xF1 */ 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
/* 0xF2 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
/* 0xF8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
/* 0xFE */ 0x00FE, /* LATIN SMALL LETTER THORN (Icelandic) */
/* 0xFF */ 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* 8859_1 has no table of its own: the name is redirected to the shared
 * table_NULL object (declared outside this file). */
#define table_8859_1 table_NULL

/* Charset names accepted for the 8859_1 codepage.
 * NULL-terminated; entry order is kept exactly as generated. */
char *const aliases_8859_1 [] = {
	"ISO-8859-1", "iso8859-1", "8859-1",
	"iso-ir-100",
	"latin1", "l1", "il1",
	"819", "cp819",
	"ISO_8859-1",
	"IBM819",
	"csISOLatin1",
	"ISO_8859-1:1987",
	"ISO8859_1",
	NULL
};
/*** 8859_2 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* ISO-8859-2: Unicode code point for each byte 0x80..0xFF.
 * The byte itself is not stored; entry [i] belongs to byte (0x80 + i),
 * so the array has exactly 128 elements.
 * NOTE(review): 0xFFFF is used for 0x80..0x9F, which carry no character
 * name here - presumably the "no mapping" marker; confirm against the
 * code that consumes these tables. */
const uint16_t highhalf_8859_2 [] = {
/* 0x80 */ 0xFFFF,
/* 0x81 */ 0xFFFF,
/* 0x82 */ 0xFFFF,
/* 0x83 */ 0xFFFF,
/* 0x84 */ 0xFFFF,
/* 0x85 */ 0xFFFF,
/* 0x86 */ 0xFFFF,
/* 0x87 */ 0xFFFF,
/* 0x88 */ 0xFFFF,
/* 0x89 */ 0xFFFF,
/* 0x8A */ 0xFFFF,
/* 0x8B */ 0xFFFF,
/* 0x8C */ 0xFFFF,
/* 0x8D */ 0xFFFF,
/* 0x8E */ 0xFFFF,
/* 0x8F */ 0xFFFF,
/* 0x90 */ 0xFFFF,
/* 0x91 */ 0xFFFF,
/* 0x92 */ 0xFFFF,
/* 0x93 */ 0xFFFF,
/* 0x94 */ 0xFFFF,
/* 0x95 */ 0xFFFF,
/* 0x96 */ 0xFFFF,
/* 0x97 */ 0xFFFF,
/* 0x98 */ 0xFFFF,
/* 0x99 */ 0xFFFF,
/* 0x9A */ 0xFFFF,
/* 0x9B */ 0xFFFF,
/* 0x9C */ 0xFFFF,
/* 0x9D */ 0xFFFF,
/* 0x9E */ 0xFFFF,
/* 0x9F */ 0xFFFF,
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
/* 0xA2 */ 0x02D8, /* BREVE */
/* 0xA3 */ 0x0141, /* LATIN CAPITAL LETTER L WITH STROKE */
/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
/* 0xA5 */ 0x013D, /* LATIN CAPITAL LETTER L WITH CARON */
/* 0xA6 */ 0x015A, /* LATIN CAPITAL LETTER S WITH ACUTE */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x00A8, /* DIAERESIS */
/* 0xA9 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
/* 0xAA */ 0x015E, /* LATIN CAPITAL LETTER S WITH CEDILLA */
/* 0xAB */ 0x0164, /* LATIN CAPITAL LETTER T WITH CARON */
/* 0xAC */ 0x0179, /* LATIN CAPITAL LETTER Z WITH ACUTE */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
/* 0xAF */ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */
/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
/* 0xB1 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
/* 0xB2 */ 0x02DB, /* OGONEK */
/* 0xB3 */ 0x0142, /* LATIN SMALL LETTER L WITH STROKE */
/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
/* 0xB5 */ 0x013E, /* LATIN SMALL LETTER L WITH CARON */
/* 0xB6 */ 0x015B, /* LATIN SMALL LETTER S WITH ACUTE */
/* 0xB7 */ 0x02C7, /* CARON */
/* 0xB8 */ 0x00B8, /* CEDILLA */
/* 0xB9 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
/* 0xBA */ 0x015F, /* LATIN SMALL LETTER S WITH CEDILLA */
/* 0xBB */ 0x0165, /* LATIN SMALL LETTER T WITH CARON */
/* 0xBC */ 0x017A, /* LATIN SMALL LETTER Z WITH ACUTE */
/* 0xBD */ 0x02DD, /* DOUBLE ACUTE ACCENT */
/* 0xBE */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
/* 0xBF */ 0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */
/* 0xC0 */ 0x0154, /* LATIN CAPITAL LETTER R WITH ACUTE */
/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
/* 0xC3 */ 0x0102, /* LATIN CAPITAL LETTER A WITH BREVE */
/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
/* 0xC5 */ 0x0139, /* LATIN CAPITAL LETTER L WITH ACUTE */
/* 0xC6 */ 0x0106, /* LATIN CAPITAL LETTER C WITH ACUTE */
/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
/* 0xC8 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0xCA */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
/* 0xCC */ 0x011A, /* LATIN CAPITAL LETTER E WITH CARON */
/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
/* 0xCF */ 0x010E, /* LATIN CAPITAL LETTER D WITH CARON */
/* 0xD0 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
/* 0xD1 */ 0x0143, /* LATIN CAPITAL LETTER N WITH ACUTE */
/* 0xD2 */ 0x0147, /* LATIN CAPITAL LETTER N WITH CARON */
/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
/* 0xD5 */ 0x0150, /* LATIN CAPITAL LETTER O WITH DOUBLE ACUTE */
/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
/* 0xD8 */ 0x0158, /* LATIN CAPITAL LETTER R WITH CARON */
/* 0xD9 */ 0x016E, /* LATIN CAPITAL LETTER U WITH RING ABOVE */
/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
/* 0xDB */ 0x0170, /* LATIN CAPITAL LETTER U WITH DOUBLE ACUTE */
/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
/* 0xDE */ 0x0162, /* LATIN CAPITAL LETTER T WITH CEDILLA */
/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
/* 0xE0 */ 0x0155, /* LATIN SMALL LETTER R WITH ACUTE */
/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0xE3 */ 0x0103, /* LATIN SMALL LETTER A WITH BREVE */
/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
/* 0xE5 */ 0x013A, /* LATIN SMALL LETTER L WITH ACUTE */
/* 0xE6 */ 0x0107, /* LATIN SMALL LETTER C WITH ACUTE */
/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
/* 0xE8 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xEA */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
/* 0xEC */ 0x011B, /* LATIN SMALL LETTER E WITH CARON */
/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
/* 0xEF */ 0x010F, /* LATIN SMALL LETTER D WITH CARON */
/* 0xF0 */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
/* 0xF1 */ 0x0144, /* LATIN SMALL LETTER N WITH ACUTE */
/* 0xF2 */ 0x0148, /* LATIN SMALL LETTER N WITH CARON */
/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0xF5 */ 0x0151, /* LATIN SMALL LETTER O WITH DOUBLE ACUTE */
/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
/* 0xF8 */ 0x0159, /* LATIN SMALL LETTER R WITH CARON */
/* 0xF9 */ 0x016F, /* LATIN SMALL LETTER U WITH RING ABOVE */
/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
/* 0xFB */ 0x0171, /* LATIN SMALL LETTER U WITH DOUBLE ACUTE */
/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
/* 0xFE */ 0x0163, /* LATIN SMALL LETTER T WITH CEDILLA */
/* 0xFF */ 0x02D9, /* DOT ABOVE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/*
 * The 8859_2 table name is an alias for table_NULL.
 * NOTE(review): presumably this means the codepage needs no extra
 * struct table_entry mappings -- confirm where table_NULL is defined.
 */
#define table_8859_2 table_NULL

/* Alias strings for 8859_2; the list is terminated by a NULL entry. */
char *const aliases_8859_2 [] = {
	"ISO-8859-2",
	"iso8859-2",
	"8859-2",
	"iso-ir-101",
	"latin2",
	"l2",
	"il2",
	"ISO_8859-2:1987",
	"ISO_8859-2",
	"csISOLatin2",
	"ISO8859_2",

	NULL
};
/*** 8859_3 ***/
/*
 * Bug 381: Store codepage-to-Unicode mappings as dense arrays.
 *
 * Previously, each mapping between a codepage byte and a Unicode character
 * was stored as a struct table_entry, which listed both the byte and the
 * character. This representation may be optimal for sparse mappings, but
 * codepages map almost every possible byte to a character, so it is more
 * efficient to just have an array that lists the Unicode character
 * corresponding to each byte from 0x80 to 0xFF. The bytes are not stored
 * but rather implied by the array index. The tcvn5712 and viscii codepages
 * have a total of four mappings that do not fit in the arrays, so we still
 * use struct table_entry for those.
 *
 * This change also makes cp2u() operate in O(1) time and may speed up other
 * functions as well.
 *
 * The "sed | while read" concoction in Unicode/gen-cp looks rather
 * unhealthy. It would probably be faster and more readable if rewritten in
 * Perl, but IMO that goes for the previous version as well, so I suppose
 * whoever wrote it had a reason not to use Perl here.
 *
 * Before:
 *    text    data     bss     dec     hex filename
 *   38948   28528    3311   70787   11483 src/intl/charsets.o
 *  500096   85568   82112  667776   a3080 src/elinks
 *
 * After:
 *    text    data     bss     dec     hex filename
 *   31558   28528    3311   63397    f7a5 src/intl/charsets.o
 *  492878   85568   82112  660558   a144e src/elinks
 *
 * So the text section shrank by 7390 bytes.
 *
 * Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
 * --disable-cookies --disable-formhist --disable-globhist --disable-mailcap
 * --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse
 * --disable-leds --disable-marks --disable-css --enable-small
 * --enable-utf-8 --without-gpm --without-bzlib --without-idn
 * --without-spidermonkey --without-lua --without-gnutls --without-openssl
 * CFLAGS="-Os -ggdb -Wall"
 *
 * 2006-09-24 09:55:29 -04:00
 */
/*
 * High-half table for 8859_3: element i is the 16-bit value listed for
 * byte 0x80 + i, so the byte itself is implied by the array position
 * (128 elements, covering 0x80 through 0xFF).
 *
 * Bytes 0x80-0x9F hold their own value (annotated <control> upstream) and
 * are written eight to a row below.
 *
 * NOTE(review): 0xFFFF is stored for the bytes that carry no character
 * name (0xA5, 0xAE, 0xBE, 0xC3, 0xD0, 0xE3, 0xF0) -- presumably a
 * "no mapping" marker; confirm against the code that reads this table.
 */
const uint16_t highhalf_8859_3 [] = {
	/* 0x80-0x87 <control> */
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	/* 0x88-0x8F <control> */
	0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
	/* 0x90-0x97 <control> */
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	/* 0x98-0x9F <control> */
	0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,

	/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xA1 */ 0x0126, /* LATIN CAPITAL LETTER H WITH STROKE */
	/* 0xA2 */ 0x02D8, /* BREVE */
	/* 0xA3 */ 0x00A3, /* POUND SIGN */
	/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
	/* 0xA5 */ 0xFFFF,
	/* 0xA6 */ 0x0124, /* LATIN CAPITAL LETTER H WITH CIRCUMFLEX */
	/* 0xA7 */ 0x00A7, /* SECTION SIGN */
	/* 0xA8 */ 0x00A8, /* DIAERESIS */
	/* 0xA9 */ 0x0130, /* LATIN CAPITAL LETTER I WITH DOT ABOVE */
	/* 0xAA */ 0x015E, /* LATIN CAPITAL LETTER S WITH CEDILLA */
	/* 0xAB */ 0x011E, /* LATIN CAPITAL LETTER G WITH BREVE */
	/* 0xAC */ 0x0134, /* LATIN CAPITAL LETTER J WITH CIRCUMFLEX */
	/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
	/* 0xAE */ 0xFFFF,
	/* 0xAF */ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */

	/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
	/* 0xB1 */ 0x0127, /* LATIN SMALL LETTER H WITH STROKE */
	/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
	/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
	/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
	/* 0xB5 */ 0x00B5, /* MICRO SIGN */
	/* 0xB6 */ 0x0125, /* LATIN SMALL LETTER H WITH CIRCUMFLEX */
	/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
	/* 0xB8 */ 0x00B8, /* CEDILLA */
	/* 0xB9 */ 0x0131, /* LATIN SMALL LETTER DOTLESS I */
	/* 0xBA */ 0x015F, /* LATIN SMALL LETTER S WITH CEDILLA */
	/* 0xBB */ 0x011F, /* LATIN SMALL LETTER G WITH BREVE */
	/* 0xBC */ 0x0135, /* LATIN SMALL LETTER J WITH CIRCUMFLEX */
	/* 0xBD */ 0x00BD, /* VULGAR FRACTION ONE HALF */
	/* 0xBE */ 0xFFFF,
	/* 0xBF */ 0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */

	/* 0xC0 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
	/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	/* 0xC3 */ 0xFFFF,
	/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0xC5 */ 0x010A, /* LATIN CAPITAL LETTER C WITH DOT ABOVE */
	/* 0xC6 */ 0x0108, /* LATIN CAPITAL LETTER C WITH CIRCUMFLEX */
	/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
	/* 0xC8 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
	/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0xCA */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
	/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
	/* 0xCC */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
	/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	/* 0xCF */ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */

	/* 0xD0 */ 0xFFFF,
	/* 0xD1 */ 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
	/* 0xD2 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
	/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
	/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xD5 */ 0x0120, /* LATIN CAPITAL LETTER G WITH DOT ABOVE */
	/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
	/* 0xD8 */ 0x011C, /* LATIN CAPITAL LETTER G WITH CIRCUMFLEX */
	/* 0xD9 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
	/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0xDD */ 0x016C, /* LATIN CAPITAL LETTER U WITH BREVE */
	/* 0xDE */ 0x015C, /* LATIN CAPITAL LETTER S WITH CIRCUMFLEX */
	/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */

	/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
	/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
	/* 0xE3 */ 0xFFFF,
	/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0xE5 */ 0x010B, /* LATIN SMALL LETTER C WITH DOT ABOVE */
	/* 0xE6 */ 0x0109, /* LATIN SMALL LETTER C WITH CIRCUMFLEX */
	/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
	/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
	/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
	/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
	/* 0xEC */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
	/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
	/* 0xEF */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */

	/* 0xF0 */ 0xFFFF,
	/* 0xF1 */ 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
	/* 0xF2 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
	/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
	/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0xF5 */ 0x0121, /* LATIN SMALL LETTER G WITH DOT ABOVE */
	/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
	/* 0xF8 */ 0x011D, /* LATIN SMALL LETTER G WITH CIRCUMFLEX */
	/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
	/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
	/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xFD */ 0x016D, /* LATIN SMALL LETTER U WITH BREVE */
	/* 0xFE */ 0x015D, /* LATIN SMALL LETTER S WITH CIRCUMFLEX */
	/* 0xFF */ 0x02D9, /* DOT ABOVE */
};
/*
 * Bug 381: Store codepage-to-Unicode mappings as dense arrays.
 *
 * Previously, each mapping between a codepage byte and a Unicode character
 * was stored as a struct table_entry, which listed both the byte and the
 * character. This representation may be optimal for sparse mappings, but
 * codepages map almost every possible byte to a character, so it is more
 * efficient to just have an array that lists the Unicode character
 * corresponding to each byte from 0x80 to 0xFF. The bytes are not stored
 * but rather implied by the array index. The tcvn5712 and viscii codepages
 * have a total of four mappings that do not fit in the arrays, so we still
 * use struct table_entry for those.
 *
 * This change also makes cp2u() operate in O(1) time and may speed up other
 * functions as well.
 *
 * The "sed | while read" concoction in Unicode/gen-cp looks rather
 * unhealthy. It would probably be faster and more readable if rewritten in
 * Perl, but IMO that goes for the previous version as well, so I suppose
 * whoever wrote it had a reason not to use Perl here.
 *
 * Before:
 *    text    data     bss     dec     hex filename
 *   38948   28528    3311   70787   11483 src/intl/charsets.o
 *  500096   85568   82112  667776   a3080 src/elinks
 *
 * After:
 *    text    data     bss     dec     hex filename
 *   31558   28528    3311   63397    f7a5 src/intl/charsets.o
 *  492878   85568   82112  660558   a144e src/elinks
 *
 * So the text section shrank by 7390 bytes.
 *
 * Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
 * --disable-cookies --disable-formhist --disable-globhist --disable-mailcap
 * --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse
 * --disable-leds --disable-marks --disable-css --enable-small
 * --enable-utf-8 --without-gpm --without-bzlib --without-idn
 * --without-spidermonkey --without-lua --without-gnutls --without-openssl
 * CFLAGS="-Os -ggdb -Wall"
 *
 * 2006-09-24 09:55:29 -04:00
 */
/*
 * The 8859_3 table name is an alias for table_NULL.
 * NOTE(review): presumably this means the codepage needs no extra
 * struct table_entry mappings -- confirm where table_NULL is defined.
 */
#define table_8859_3 table_NULL

/* Alias strings for 8859_3; the list is terminated by a NULL entry. */
char *const aliases_8859_3 [] = {
	"ISO-8859-3",
	"8859-3",
	"ISO_8859-3:1988",
	"iso-ir-109",
	"ISO_8859-3",
	"latin3",
	"l3",
	"csISOLatin3",
	"ISO8859-3",
	"ISO8859_3",

	NULL
};
/*** 8859_4 ***/
/*
 * Bug 381: Store codepage-to-Unicode mappings as dense arrays.
 *
 * Previously, each mapping between a codepage byte and a Unicode character
 * was stored as a struct table_entry, which listed both the byte and the
 * character. This representation may be optimal for sparse mappings, but
 * codepages map almost every possible byte to a character, so it is more
 * efficient to just have an array that lists the Unicode character
 * corresponding to each byte from 0x80 to 0xFF. The bytes are not stored
 * but rather implied by the array index. The tcvn5712 and viscii codepages
 * have a total of four mappings that do not fit in the arrays, so we still
 * use struct table_entry for those.
 *
 * This change also makes cp2u() operate in O(1) time and may speed up other
 * functions as well.
 *
 * The "sed | while read" concoction in Unicode/gen-cp looks rather
 * unhealthy. It would probably be faster and more readable if rewritten in
 * Perl, but IMO that goes for the previous version as well, so I suppose
 * whoever wrote it had a reason not to use Perl here.
 *
 * Before:
 *    text    data     bss     dec     hex filename
 *   38948   28528    3311   70787   11483 src/intl/charsets.o
 *  500096   85568   82112  667776   a3080 src/elinks
 *
 * After:
 *    text    data     bss     dec     hex filename
 *   31558   28528    3311   63397    f7a5 src/intl/charsets.o
 *  492878   85568   82112  660558   a144e src/elinks
 *
 * So the text section shrank by 7390 bytes.
 *
 * Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
 * --disable-cookies --disable-formhist --disable-globhist --disable-mailcap
 * --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse
 * --disable-leds --disable-marks --disable-css --enable-small
 * --enable-utf-8 --without-gpm --without-bzlib --without-idn
 * --without-spidermonkey --without-lua --without-gnutls --without-openssl
 * CFLAGS="-Os -ggdb -Wall"
 *
 * 2006-09-24 09:55:29 -04:00
 */
/*
 * High-half table for 8859_4: element i is the 16-bit value listed for
 * byte 0x80 + i, so the byte itself is implied by the array position
 * (128 elements, covering 0x80 through 0xFF).
 *
 * Bytes 0x80-0x9F hold their own value (annotated <control> upstream) and
 * are written eight to a row below. Every byte from 0xA0 to 0xFF has a
 * named character in this table.
 */
const uint16_t highhalf_8859_4 [] = {
	/* 0x80-0x87 <control> */
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	/* 0x88-0x8F <control> */
	0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
	/* 0x90-0x97 <control> */
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	/* 0x98-0x9F <control> */
	0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,

	/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xA1 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
	/* 0xA2 */ 0x0138, /* LATIN SMALL LETTER KRA */
	/* 0xA3 */ 0x0156, /* LATIN CAPITAL LETTER R WITH CEDILLA */
	/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
	/* 0xA5 */ 0x0128, /* LATIN CAPITAL LETTER I WITH TILDE */
	/* 0xA6 */ 0x013B, /* LATIN CAPITAL LETTER L WITH CEDILLA */
	/* 0xA7 */ 0x00A7, /* SECTION SIGN */
	/* 0xA8 */ 0x00A8, /* DIAERESIS */
	/* 0xA9 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
	/* 0xAA */ 0x0112, /* LATIN CAPITAL LETTER E WITH MACRON */
	/* 0xAB */ 0x0122, /* LATIN CAPITAL LETTER G WITH CEDILLA */
	/* 0xAC */ 0x0166, /* LATIN CAPITAL LETTER T WITH STROKE */
	/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
	/* 0xAE */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
	/* 0xAF */ 0x00AF, /* MACRON */

	/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
	/* 0xB1 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
	/* 0xB2 */ 0x02DB, /* OGONEK */
	/* 0xB3 */ 0x0157, /* LATIN SMALL LETTER R WITH CEDILLA */
	/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
	/* 0xB5 */ 0x0129, /* LATIN SMALL LETTER I WITH TILDE */
	/* 0xB6 */ 0x013C, /* LATIN SMALL LETTER L WITH CEDILLA */
	/* 0xB7 */ 0x02C7, /* CARON */
	/* 0xB8 */ 0x00B8, /* CEDILLA */
	/* 0xB9 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
	/* 0xBA */ 0x0113, /* LATIN SMALL LETTER E WITH MACRON */
	/* 0xBB */ 0x0123, /* LATIN SMALL LETTER G WITH CEDILLA */
	/* 0xBC */ 0x0167, /* LATIN SMALL LETTER T WITH STROKE */
	/* 0xBD */ 0x014A, /* LATIN CAPITAL LETTER ENG */
	/* 0xBE */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
	/* 0xBF */ 0x014B, /* LATIN SMALL LETTER ENG */

	/* 0xC0 */ 0x0100, /* LATIN CAPITAL LETTER A WITH MACRON */
	/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	/* 0xC3 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
	/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
	/* 0xC6 */ 0x00C6, /* LATIN CAPITAL LETTER AE */
	/* 0xC7 */ 0x012E, /* LATIN CAPITAL LETTER I WITH OGONEK */
	/* 0xC8 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
	/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0xCA */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
	/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
	/* 0xCC */ 0x0116, /* LATIN CAPITAL LETTER E WITH DOT ABOVE */
	/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	/* 0xCF */ 0x012A, /* LATIN CAPITAL LETTER I WITH MACRON */

	/* 0xD0 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
	/* 0xD1 */ 0x0145, /* LATIN CAPITAL LETTER N WITH CEDILLA */
	/* 0xD2 */ 0x014C, /* LATIN CAPITAL LETTER O WITH MACRON */
	/* 0xD3 */ 0x0136, /* LATIN CAPITAL LETTER K WITH CEDILLA */
	/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
	/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
	/* 0xD8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
	/* 0xD9 */ 0x0172, /* LATIN CAPITAL LETTER U WITH OGONEK */
	/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0xDD */ 0x0168, /* LATIN CAPITAL LETTER U WITH TILDE */
	/* 0xDE */ 0x016A, /* LATIN CAPITAL LETTER U WITH MACRON */
	/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */

	/* 0xE0 */ 0x0101, /* LATIN SMALL LETTER A WITH MACRON */
	/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
	/* 0xE3 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
	/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
	/* 0xE6 */ 0x00E6, /* LATIN SMALL LETTER AE */
	/* 0xE7 */ 0x012F, /* LATIN SMALL LETTER I WITH OGONEK */
	/* 0xE8 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
	/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0xEA */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
	/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
	/* 0xEC */ 0x0117, /* LATIN SMALL LETTER E WITH DOT ABOVE */
	/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
	/* 0xEF */ 0x012B, /* LATIN SMALL LETTER I WITH MACRON */

	/* 0xF0 */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
	/* 0xF1 */ 0x0146, /* LATIN SMALL LETTER N WITH CEDILLA */
	/* 0xF2 */ 0x014D, /* LATIN SMALL LETTER O WITH MACRON */
	/* 0xF3 */ 0x0137, /* LATIN SMALL LETTER K WITH CEDILLA */
	/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
	/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
	/* 0xF8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
	/* 0xF9 */ 0x0173, /* LATIN SMALL LETTER U WITH OGONEK */
	/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
	/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xFD */ 0x0169, /* LATIN SMALL LETTER U WITH TILDE */
	/* 0xFE */ 0x016B, /* LATIN SMALL LETTER U WITH MACRON */
	/* 0xFF */ 0x02D9, /* DOT ABOVE */
};
/*
 * Bug 381: Store codepage-to-Unicode mappings as dense arrays.
 *
 * Previously, each mapping between a codepage byte and a Unicode character
 * was stored as a struct table_entry, which listed both the byte and the
 * character. This representation may be optimal for sparse mappings, but
 * codepages map almost every possible byte to a character, so it is more
 * efficient to just have an array that lists the Unicode character
 * corresponding to each byte from 0x80 to 0xFF. The bytes are not stored
 * but rather implied by the array index. The tcvn5712 and viscii codepages
 * have a total of four mappings that do not fit in the arrays, so we still
 * use struct table_entry for those.
 *
 * This change also makes cp2u() operate in O(1) time and may speed up other
 * functions as well.
 *
 * The "sed | while read" concoction in Unicode/gen-cp looks rather
 * unhealthy. It would probably be faster and more readable if rewritten in
 * Perl, but IMO that goes for the previous version as well, so I suppose
 * whoever wrote it had a reason not to use Perl here.
 *
 * Before:
 *    text    data     bss     dec     hex filename
 *   38948   28528    3311   70787   11483 src/intl/charsets.o
 *  500096   85568   82112  667776   a3080 src/elinks
 *
 * After:
 *    text    data     bss     dec     hex filename
 *   31558   28528    3311   63397    f7a5 src/intl/charsets.o
 *  492878   85568   82112  660558   a144e src/elinks
 *
 * So the text section shrank by 7390 bytes.
 *
 * Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
 * --disable-cookies --disable-formhist --disable-globhist --disable-mailcap
 * --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse
 * --disable-leds --disable-marks --disable-css --enable-small
 * --enable-utf-8 --without-gpm --without-bzlib --without-idn
 * --without-spidermonkey --without-lua --without-gnutls --without-openssl
 * CFLAGS="-Os -ggdb -Wall"
 *
 * 2006-09-24 09:55:29 -04:00
 */
/*
 * The 8859_4 table name is an alias for table_NULL.
 * NOTE(review): presumably this means the codepage needs no extra
 * struct table_entry mappings -- confirm where table_NULL is defined.
 */
#define table_8859_4 table_NULL

/* Alias strings for 8859_4; the list is terminated by a NULL entry. */
char *const aliases_8859_4 [] = {
	"ISO-8859-4",
	"iso8859-4",
	"8859-4",
	"iso-ir-110",
	"latin4",
	"l4",
	"il4",
	"ISO_8859-4:1988",
	"ISO_8859-4",
	"csISOLatin4",
	"ISO8859_4",

	NULL
};
/*** 8859_5 ***/
/*
 * Bug 381: Store codepage-to-Unicode mappings as dense arrays.
 *
 * Previously, each mapping between a codepage byte and a Unicode character
 * was stored as a struct table_entry, which listed both the byte and the
 * character. This representation may be optimal for sparse mappings, but
 * codepages map almost every possible byte to a character, so it is more
 * efficient to just have an array that lists the Unicode character
 * corresponding to each byte from 0x80 to 0xFF. The bytes are not stored
 * but rather implied by the array index. The tcvn5712 and viscii codepages
 * have a total of four mappings that do not fit in the arrays, so we still
 * use struct table_entry for those.
 *
 * This change also makes cp2u() operate in O(1) time and may speed up other
 * functions as well.
 *
 * The "sed | while read" concoction in Unicode/gen-cp looks rather
 * unhealthy. It would probably be faster and more readable if rewritten in
 * Perl, but IMO that goes for the previous version as well, so I suppose
 * whoever wrote it had a reason not to use Perl here.
 *
 * Before:
 *    text    data     bss     dec     hex filename
 *   38948   28528    3311   70787   11483 src/intl/charsets.o
 *  500096   85568   82112  667776   a3080 src/elinks
 *
 * After:
 *    text    data     bss     dec     hex filename
 *   31558   28528    3311   63397    f7a5 src/intl/charsets.o
 *  492878   85568   82112  660558   a144e src/elinks
 *
 * So the text section shrank by 7390 bytes.
 *
 * Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
 * --disable-cookies --disable-formhist --disable-globhist --disable-mailcap
 * --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse
 * --disable-leds --disable-marks --disable-css --enable-small
 * --enable-utf-8 --without-gpm --without-bzlib --without-idn
 * --without-spidermonkey --without-lua --without-gnutls --without-openssl
 * CFLAGS="-Os -ggdb -Wall"
 *
 * 2006-09-24 09:55:29 -04:00
 */
/* High half of the ISO 8859-5 codepage: entry i holds the Unicode code point
 * for byte 0x80 + i, giving one entry per byte in 0x80..0xFF.  The byte value
 * is not stored; it is implied by the position and repeated in the leading
 * comment of each line.
 * 0x80..0x9F map to the same-numbered control code points.  In 0xA1..0xFF the
 * value is byte + 0x0360, except for 0xAD (SOFT HYPHEN), 0xF0 (NUMERO SIGN)
 * and 0xFD (SECTION SIGN).
 * The initializer is positional: inserting, dropping or reordering a line
 * shifts every later mapping. */
const uint16_t highhalf_8859_5 [] = {
/* 0x80 */ 0x0080, /* <control> */
/* 0x81 */ 0x0081, /* <control> */
/* 0x82 */ 0x0082, /* <control> */
/* 0x83 */ 0x0083, /* <control> */
/* 0x84 */ 0x0084, /* <control> */
/* 0x85 */ 0x0085, /* <control> */
/* 0x86 */ 0x0086, /* <control> */
/* 0x87 */ 0x0087, /* <control> */
/* 0x88 */ 0x0088, /* <control> */
/* 0x89 */ 0x0089, /* <control> */
/* 0x8A */ 0x008A, /* <control> */
/* 0x8B */ 0x008B, /* <control> */
/* 0x8C */ 0x008C, /* <control> */
/* 0x8D */ 0x008D, /* <control> */
/* 0x8E */ 0x008E, /* <control> */
/* 0x8F */ 0x008F, /* <control> */
/* 0x90 */ 0x0090, /* <control> */
/* 0x91 */ 0x0091, /* <control> */
/* 0x92 */ 0x0092, /* <control> */
/* 0x93 */ 0x0093, /* <control> */
/* 0x94 */ 0x0094, /* <control> */
/* 0x95 */ 0x0095, /* <control> */
/* 0x96 */ 0x0096, /* <control> */
/* 0x97 */ 0x0097, /* <control> */
/* 0x98 */ 0x0098, /* <control> */
/* 0x99 */ 0x0099, /* <control> */
/* 0x9A */ 0x009A, /* <control> */
/* 0x9B */ 0x009B, /* <control> */
/* 0x9C */ 0x009C, /* <control> */
/* 0x9D */ 0x009D, /* <control> */
/* 0x9E */ 0x009E, /* <control> */
/* 0x9F */ 0x009F, /* <control> */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x0401, /* CYRILLIC CAPITAL LETTER IO */
/* 0xA2 */ 0x0402, /* CYRILLIC CAPITAL LETTER DJE */
/* 0xA3 */ 0x0403, /* CYRILLIC CAPITAL LETTER GJE */
/* 0xA4 */ 0x0404, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */
/* 0xA5 */ 0x0405, /* CYRILLIC CAPITAL LETTER DZE */
/* 0xA6 */ 0x0406, /* CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xA7 */ 0x0407, /* CYRILLIC CAPITAL LETTER YI */
/* 0xA8 */ 0x0408, /* CYRILLIC CAPITAL LETTER JE */
/* 0xA9 */ 0x0409, /* CYRILLIC CAPITAL LETTER LJE */
/* 0xAA */ 0x040A, /* CYRILLIC CAPITAL LETTER NJE */
/* 0xAB */ 0x040B, /* CYRILLIC CAPITAL LETTER TSHE */
/* 0xAC */ 0x040C, /* CYRILLIC CAPITAL LETTER KJE */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x040E, /* CYRILLIC CAPITAL LETTER SHORT U */
/* 0xAF */ 0x040F, /* CYRILLIC CAPITAL LETTER DZHE */
/* 0xB0 */ 0x0410, /* CYRILLIC CAPITAL LETTER A */
/* 0xB1 */ 0x0411, /* CYRILLIC CAPITAL LETTER BE */
/* 0xB2 */ 0x0412, /* CYRILLIC CAPITAL LETTER VE */
/* 0xB3 */ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */
/* 0xB4 */ 0x0414, /* CYRILLIC CAPITAL LETTER DE */
/* 0xB5 */ 0x0415, /* CYRILLIC CAPITAL LETTER IE */
/* 0xB6 */ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */
/* 0xB7 */ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */
/* 0xB8 */ 0x0418, /* CYRILLIC CAPITAL LETTER I */
/* 0xB9 */ 0x0419, /* CYRILLIC CAPITAL LETTER SHORT I */
/* 0xBA */ 0x041A, /* CYRILLIC CAPITAL LETTER KA */
/* 0xBB */ 0x041B, /* CYRILLIC CAPITAL LETTER EL */
/* 0xBC */ 0x041C, /* CYRILLIC CAPITAL LETTER EM */
/* 0xBD */ 0x041D, /* CYRILLIC CAPITAL LETTER EN */
/* 0xBE */ 0x041E, /* CYRILLIC CAPITAL LETTER O */
/* 0xBF */ 0x041F, /* CYRILLIC CAPITAL LETTER PE */
/* 0xC0 */ 0x0420, /* CYRILLIC CAPITAL LETTER ER */
/* 0xC1 */ 0x0421, /* CYRILLIC CAPITAL LETTER ES */
/* 0xC2 */ 0x0422, /* CYRILLIC CAPITAL LETTER TE */
/* 0xC3 */ 0x0423, /* CYRILLIC CAPITAL LETTER U */
/* 0xC4 */ 0x0424, /* CYRILLIC CAPITAL LETTER EF */
/* 0xC5 */ 0x0425, /* CYRILLIC CAPITAL LETTER HA */
/* 0xC6 */ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */
/* 0xC7 */ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */
/* 0xC8 */ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */
/* 0xC9 */ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */
/* 0xCA */ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */
/* 0xCB */ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */
/* 0xCC */ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */
/* 0xCD */ 0x042D, /* CYRILLIC CAPITAL LETTER E */
/* 0xCE */ 0x042E, /* CYRILLIC CAPITAL LETTER YU */
/* 0xCF */ 0x042F, /* CYRILLIC CAPITAL LETTER YA */
/* 0xD0 */ 0x0430, /* CYRILLIC SMALL LETTER A */
/* 0xD1 */ 0x0431, /* CYRILLIC SMALL LETTER BE */
/* 0xD2 */ 0x0432, /* CYRILLIC SMALL LETTER VE */
/* 0xD3 */ 0x0433, /* CYRILLIC SMALL LETTER GHE */
/* 0xD4 */ 0x0434, /* CYRILLIC SMALL LETTER DE */
/* 0xD5 */ 0x0435, /* CYRILLIC SMALL LETTER IE */
/* 0xD6 */ 0x0436, /* CYRILLIC SMALL LETTER ZHE */
/* 0xD7 */ 0x0437, /* CYRILLIC SMALL LETTER ZE */
/* 0xD8 */ 0x0438, /* CYRILLIC SMALL LETTER I */
/* 0xD9 */ 0x0439, /* CYRILLIC SMALL LETTER SHORT I */
/* 0xDA */ 0x043A, /* CYRILLIC SMALL LETTER KA */
/* 0xDB */ 0x043B, /* CYRILLIC SMALL LETTER EL */
/* 0xDC */ 0x043C, /* CYRILLIC SMALL LETTER EM */
/* 0xDD */ 0x043D, /* CYRILLIC SMALL LETTER EN */
/* 0xDE */ 0x043E, /* CYRILLIC SMALL LETTER O */
/* 0xDF */ 0x043F, /* CYRILLIC SMALL LETTER PE */
/* 0xE0 */ 0x0440, /* CYRILLIC SMALL LETTER ER */
/* 0xE1 */ 0x0441, /* CYRILLIC SMALL LETTER ES */
/* 0xE2 */ 0x0442, /* CYRILLIC SMALL LETTER TE */
/* 0xE3 */ 0x0443, /* CYRILLIC SMALL LETTER U */
/* 0xE4 */ 0x0444, /* CYRILLIC SMALL LETTER EF */
/* 0xE5 */ 0x0445, /* CYRILLIC SMALL LETTER HA */
/* 0xE6 */ 0x0446, /* CYRILLIC SMALL LETTER TSE */
/* 0xE7 */ 0x0447, /* CYRILLIC SMALL LETTER CHE */
/* 0xE8 */ 0x0448, /* CYRILLIC SMALL LETTER SHA */
/* 0xE9 */ 0x0449, /* CYRILLIC SMALL LETTER SHCHA */
/* 0xEA */ 0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */
/* 0xEB */ 0x044B, /* CYRILLIC SMALL LETTER YERU */
/* 0xEC */ 0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */
/* 0xED */ 0x044D, /* CYRILLIC SMALL LETTER E */
/* 0xEE */ 0x044E, /* CYRILLIC SMALL LETTER YU */
/* 0xEF */ 0x044F, /* CYRILLIC SMALL LETTER YA */
/* 0xF0 */ 0x2116, /* NUMERO SIGN */
/* 0xF1 */ 0x0451, /* CYRILLIC SMALL LETTER IO */
/* 0xF2 */ 0x0452, /* CYRILLIC SMALL LETTER DJE */
/* 0xF3 */ 0x0453, /* CYRILLIC SMALL LETTER GJE */
/* 0xF4 */ 0x0454, /* CYRILLIC SMALL LETTER UKRAINIAN IE */
/* 0xF5 */ 0x0455, /* CYRILLIC SMALL LETTER DZE */
/* 0xF6 */ 0x0456, /* CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xF7 */ 0x0457, /* CYRILLIC SMALL LETTER YI */
/* 0xF8 */ 0x0458, /* CYRILLIC SMALL LETTER JE */
/* 0xF9 */ 0x0459, /* CYRILLIC SMALL LETTER LJE */
/* 0xFA */ 0x045A, /* CYRILLIC SMALL LETTER NJE */
/* 0xFB */ 0x045B, /* CYRILLIC SMALL LETTER TSHE */
/* 0xFC */ 0x045C, /* CYRILLIC SMALL LETTER KJE */
/* 0xFD */ 0x00A7, /* SECTION SIGN */
/* 0xFE */ 0x045E, /* CYRILLIC SMALL LETTER SHORT U */
/* 0xFF */ 0x045F, /* CYRILLIC SMALL LETTER DZHE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* ISO 8859-5 uses table_NULL as its table_entry list (table_NULL is defined
 * elsewhere; presumably it means "no extra mappings" -- confirm). */
#define table_8859_5 table_NULL

/* Alias names listed for the ISO 8859-5 codepage.  The list is closed by a
 * NULL sentinel rather than carrying an explicit length. */
char *const aliases_8859_5[] = {
	"ISO-8859-5", "iso8859-5", "8859-5",
	"ISO_8859-5:1988", "iso-ir-144", "ISO_8859-5",
	"cyrillic", "csISOLatinCyrillic", "ISO8859_5",
	NULL
};
/*** 8859_6 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* High half of the ISO 8859-6 codepage: entry i holds the Unicode code point
 * for byte 0x80 + i, giving one entry per byte in 0x80..0xFF.  The byte value
 * is not stored; it is implied by the position and repeated in the leading
 * comment of each line.
 * 0x80..0x9F map to the same-numbered control code points, and 0xA0, 0xA4 and
 * 0xAD keep their byte value.  Every Arabic entry (0xAC, 0xBB, 0xBF,
 * 0xC1..0xDA, 0xE0..0xF2) is byte + 0x0560.
 * All remaining entries hold 0xFFFF and carry no character name; presumably
 * 0xFFFF marks a byte with no Unicode mapping -- confirm against the code
 * that reads this table.
 * The initializer is positional: inserting, dropping or reordering a line
 * shifts every later mapping. */
const uint16_t highhalf_8859_6 [] = {
/* 0x80 */ 0x0080, /* <control> */
/* 0x81 */ 0x0081, /* <control> */
/* 0x82 */ 0x0082, /* <control> */
/* 0x83 */ 0x0083, /* <control> */
/* 0x84 */ 0x0084, /* <control> */
/* 0x85 */ 0x0085, /* <control> */
/* 0x86 */ 0x0086, /* <control> */
/* 0x87 */ 0x0087, /* <control> */
/* 0x88 */ 0x0088, /* <control> */
/* 0x89 */ 0x0089, /* <control> */
/* 0x8A */ 0x008A, /* <control> */
/* 0x8B */ 0x008B, /* <control> */
/* 0x8C */ 0x008C, /* <control> */
/* 0x8D */ 0x008D, /* <control> */
/* 0x8E */ 0x008E, /* <control> */
/* 0x8F */ 0x008F, /* <control> */
/* 0x90 */ 0x0090, /* <control> */
/* 0x91 */ 0x0091, /* <control> */
/* 0x92 */ 0x0092, /* <control> */
/* 0x93 */ 0x0093, /* <control> */
/* 0x94 */ 0x0094, /* <control> */
/* 0x95 */ 0x0095, /* <control> */
/* 0x96 */ 0x0096, /* <control> */
/* 0x97 */ 0x0097, /* <control> */
/* 0x98 */ 0x0098, /* <control> */
/* 0x99 */ 0x0099, /* <control> */
/* 0x9A */ 0x009A, /* <control> */
/* 0x9B */ 0x009B, /* <control> */
/* 0x9C */ 0x009C, /* <control> */
/* 0x9D */ 0x009D, /* <control> */
/* 0x9E */ 0x009E, /* <control> */
/* 0x9F */ 0x009F, /* <control> */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0xFFFF,
/* 0xA2 */ 0xFFFF,
/* 0xA3 */ 0xFFFF,
/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
/* 0xA5 */ 0xFFFF,
/* 0xA6 */ 0xFFFF,
/* 0xA7 */ 0xFFFF,
/* 0xA8 */ 0xFFFF,
/* 0xA9 */ 0xFFFF,
/* 0xAA */ 0xFFFF,
/* 0xAB */ 0xFFFF,
/* 0xAC */ 0x060C, /* ARABIC COMMA */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0xFFFF,
/* 0xAF */ 0xFFFF,
/* 0xB0 */ 0xFFFF,
/* 0xB1 */ 0xFFFF,
/* 0xB2 */ 0xFFFF,
/* 0xB3 */ 0xFFFF,
/* 0xB4 */ 0xFFFF,
/* 0xB5 */ 0xFFFF,
/* 0xB6 */ 0xFFFF,
/* 0xB7 */ 0xFFFF,
/* 0xB8 */ 0xFFFF,
/* 0xB9 */ 0xFFFF,
/* 0xBA */ 0xFFFF,
/* 0xBB */ 0x061B, /* ARABIC SEMICOLON */
/* 0xBC */ 0xFFFF,
/* 0xBD */ 0xFFFF,
/* 0xBE */ 0xFFFF,
/* 0xBF */ 0x061F, /* ARABIC QUESTION MARK */
/* 0xC0 */ 0xFFFF,
/* 0xC1 */ 0x0621, /* ARABIC LETTER HAMZA */
/* 0xC2 */ 0x0622, /* ARABIC LETTER ALEF WITH MADDA ABOVE */
/* 0xC3 */ 0x0623, /* ARABIC LETTER ALEF WITH HAMZA ABOVE */
/* 0xC4 */ 0x0624, /* ARABIC LETTER WAW WITH HAMZA ABOVE */
/* 0xC5 */ 0x0625, /* ARABIC LETTER ALEF WITH HAMZA BELOW */
/* 0xC6 */ 0x0626, /* ARABIC LETTER YEH WITH HAMZA ABOVE */
/* 0xC7 */ 0x0627, /* ARABIC LETTER ALEF */
/* 0xC8 */ 0x0628, /* ARABIC LETTER BEH */
/* 0xC9 */ 0x0629, /* ARABIC LETTER TEH MARBUTA */
/* 0xCA */ 0x062A, /* ARABIC LETTER TEH */
/* 0xCB */ 0x062B, /* ARABIC LETTER THEH */
/* 0xCC */ 0x062C, /* ARABIC LETTER JEEM */
/* 0xCD */ 0x062D, /* ARABIC LETTER HAH */
/* 0xCE */ 0x062E, /* ARABIC LETTER KHAH */
/* 0xCF */ 0x062F, /* ARABIC LETTER DAL */
/* 0xD0 */ 0x0630, /* ARABIC LETTER THAL */
/* 0xD1 */ 0x0631, /* ARABIC LETTER REH */
/* 0xD2 */ 0x0632, /* ARABIC LETTER ZAIN */
/* 0xD3 */ 0x0633, /* ARABIC LETTER SEEN */
/* 0xD4 */ 0x0634, /* ARABIC LETTER SHEEN */
/* 0xD5 */ 0x0635, /* ARABIC LETTER SAD */
/* 0xD6 */ 0x0636, /* ARABIC LETTER DAD */
/* 0xD7 */ 0x0637, /* ARABIC LETTER TAH */
/* 0xD8 */ 0x0638, /* ARABIC LETTER ZAH */
/* 0xD9 */ 0x0639, /* ARABIC LETTER AIN */
/* 0xDA */ 0x063A, /* ARABIC LETTER GHAIN */
/* 0xDB */ 0xFFFF,
/* 0xDC */ 0xFFFF,
/* 0xDD */ 0xFFFF,
/* 0xDE */ 0xFFFF,
/* 0xDF */ 0xFFFF,
/* 0xE0 */ 0x0640, /* ARABIC TATWEEL */
/* 0xE1 */ 0x0641, /* ARABIC LETTER FEH */
/* 0xE2 */ 0x0642, /* ARABIC LETTER QAF */
/* 0xE3 */ 0x0643, /* ARABIC LETTER KAF */
/* 0xE4 */ 0x0644, /* ARABIC LETTER LAM */
/* 0xE5 */ 0x0645, /* ARABIC LETTER MEEM */
/* 0xE6 */ 0x0646, /* ARABIC LETTER NOON */
/* 0xE7 */ 0x0647, /* ARABIC LETTER HEH */
/* 0xE8 */ 0x0648, /* ARABIC LETTER WAW */
/* 0xE9 */ 0x0649, /* ARABIC LETTER ALEF MAKSURA */
/* 0xEA */ 0x064A, /* ARABIC LETTER YEH */
/* 0xEB */ 0x064B, /* ARABIC FATHATAN */
/* 0xEC */ 0x064C, /* ARABIC DAMMATAN */
/* 0xED */ 0x064D, /* ARABIC KASRATAN */
/* 0xEE */ 0x064E, /* ARABIC FATHA */
/* 0xEF */ 0x064F, /* ARABIC DAMMA */
/* 0xF0 */ 0x0650, /* ARABIC KASRA */
/* 0xF1 */ 0x0651, /* ARABIC SHADDA */
/* 0xF2 */ 0x0652, /* ARABIC SUKUN */
/* 0xF3 */ 0xFFFF,
/* 0xF4 */ 0xFFFF,
/* 0xF5 */ 0xFFFF,
/* 0xF6 */ 0xFFFF,
/* 0xF7 */ 0xFFFF,
/* 0xF8 */ 0xFFFF,
/* 0xF9 */ 0xFFFF,
/* 0xFA */ 0xFFFF,
/* 0xFB */ 0xFFFF,
/* 0xFC */ 0xFFFF,
/* 0xFD */ 0xFFFF,
/* 0xFE */ 0xFFFF,
/* 0xFF */ 0xFFFF,
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* ISO 8859-6 uses table_NULL as its table_entry list (table_NULL is defined
 * elsewhere; presumably it means "no extra mappings" -- confirm). */
#define table_8859_6 table_NULL

/* Alias names listed for the ISO 8859-6 codepage.  The list is closed by a
 * NULL sentinel rather than carrying an explicit length. */
char *const aliases_8859_6[] = {
	"ISO-8859-6", "8859-6", "ISO_8859-6", "ISO_8859-6:1987",
	"ISO-IR-127", "ECMA-114", "ASMO-708", "ARABIC",
	"csISOLatinArabic", "ISO8859-6", "ISO8859_6",
	NULL
};
/*** 8859_7 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* High half of the ISO 8859-7 codepage: entry i holds the Unicode code point
 * for byte 0x80 + i, giving one entry per byte in 0x80..0xFF.  The byte value
 * is not stored; it is implied by the position and repeated in the leading
 * comment of each line.
 * 0x80..0x9F map to the same-numbered control code points.  Entries in the
 * Greek block (U+037A, U+0384..U+03CE) are byte + 0x02D0; the rest are either
 * the unchanged byte value or punctuation/currency signs from U+20xx.
 * 0xAE, 0xD2 and 0xFF hold 0xFFFF and carry no character name; presumably
 * 0xFFFF marks a byte with no Unicode mapping -- confirm against the code
 * that reads this table.
 * The initializer is positional: inserting, dropping or reordering a line
 * shifts every later mapping. */
const uint16_t highhalf_8859_7 [] = {
/* 0x80 */ 0x0080, /* <control> */
/* 0x81 */ 0x0081, /* <control> */
/* 0x82 */ 0x0082, /* <control> */
/* 0x83 */ 0x0083, /* <control> */
/* 0x84 */ 0x0084, /* <control> */
/* 0x85 */ 0x0085, /* <control> */
/* 0x86 */ 0x0086, /* <control> */
/* 0x87 */ 0x0087, /* <control> */
/* 0x88 */ 0x0088, /* <control> */
/* 0x89 */ 0x0089, /* <control> */
/* 0x8A */ 0x008A, /* <control> */
/* 0x8B */ 0x008B, /* <control> */
/* 0x8C */ 0x008C, /* <control> */
/* 0x8D */ 0x008D, /* <control> */
/* 0x8E */ 0x008E, /* <control> */
/* 0x8F */ 0x008F, /* <control> */
/* 0x90 */ 0x0090, /* <control> */
/* 0x91 */ 0x0091, /* <control> */
/* 0x92 */ 0x0092, /* <control> */
/* 0x93 */ 0x0093, /* <control> */
/* 0x94 */ 0x0094, /* <control> */
/* 0x95 */ 0x0095, /* <control> */
/* 0x96 */ 0x0096, /* <control> */
/* 0x97 */ 0x0097, /* <control> */
/* 0x98 */ 0x0098, /* <control> */
/* 0x99 */ 0x0099, /* <control> */
/* 0x9A */ 0x009A, /* <control> */
/* 0x9B */ 0x009B, /* <control> */
/* 0x9C */ 0x009C, /* <control> */
/* 0x9D */ 0x009D, /* <control> */
/* 0x9E */ 0x009E, /* <control> */
/* 0x9F */ 0x009F, /* <control> */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x2018, /* LEFT SINGLE QUOTATION MARK */
/* 0xA2 */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
/* 0xA3 */ 0x00A3, /* POUND SIGN */
/* 0xA4 */ 0x20AC, /* EURO SIGN */
/* 0xA5 */ 0x20AF, /* DRACHMA SIGN */
/* 0xA6 */ 0x00A6, /* BROKEN BAR */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x00A8, /* DIAERESIS */
/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xAA */ 0x037A, /* GREEK YPOGEGRAMMENI */
/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xAC */ 0x00AC, /* NOT SIGN */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0xFFFF,
/* 0xAF */ 0x2015, /* HORIZONTAL BAR */
/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
/* 0xB4 */ 0x0384, /* GREEK TONOS */
/* 0xB5 */ 0x0385, /* GREEK DIALYTIKA TONOS */
/* 0xB6 */ 0x0386, /* GREEK CAPITAL LETTER ALPHA WITH TONOS */
/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
/* 0xB8 */ 0x0388, /* GREEK CAPITAL LETTER EPSILON WITH TONOS */
/* 0xB9 */ 0x0389, /* GREEK CAPITAL LETTER ETA WITH TONOS */
/* 0xBA */ 0x038A, /* GREEK CAPITAL LETTER IOTA WITH TONOS */
/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xBC */ 0x038C, /* GREEK CAPITAL LETTER OMICRON WITH TONOS */
/* 0xBD */ 0x00BD, /* VULGAR FRACTION ONE HALF */
/* 0xBE */ 0x038E, /* GREEK CAPITAL LETTER UPSILON WITH TONOS */
/* 0xBF */ 0x038F, /* GREEK CAPITAL LETTER OMEGA WITH TONOS */
/* 0xC0 */ 0x0390, /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS */
/* 0xC1 */ 0x0391, /* GREEK CAPITAL LETTER ALPHA */
/* 0xC2 */ 0x0392, /* GREEK CAPITAL LETTER BETA */
/* 0xC3 */ 0x0393, /* GREEK CAPITAL LETTER GAMMA */
/* 0xC4 */ 0x0394, /* GREEK CAPITAL LETTER DELTA */
/* 0xC5 */ 0x0395, /* GREEK CAPITAL LETTER EPSILON */
/* 0xC6 */ 0x0396, /* GREEK CAPITAL LETTER ZETA */
/* 0xC7 */ 0x0397, /* GREEK CAPITAL LETTER ETA */
/* 0xC8 */ 0x0398, /* GREEK CAPITAL LETTER THETA */
/* 0xC9 */ 0x0399, /* GREEK CAPITAL LETTER IOTA */
/* 0xCA */ 0x039A, /* GREEK CAPITAL LETTER KAPPA */
/* 0xCB */ 0x039B, /* GREEK CAPITAL LETTER LAMDA */
/* 0xCC */ 0x039C, /* GREEK CAPITAL LETTER MU */
/* 0xCD */ 0x039D, /* GREEK CAPITAL LETTER NU */
/* 0xCE */ 0x039E, /* GREEK CAPITAL LETTER XI */
/* 0xCF */ 0x039F, /* GREEK CAPITAL LETTER OMICRON */
/* 0xD0 */ 0x03A0, /* GREEK CAPITAL LETTER PI */
/* 0xD1 */ 0x03A1, /* GREEK CAPITAL LETTER RHO */
/* 0xD2 */ 0xFFFF,
/* 0xD3 */ 0x03A3, /* GREEK CAPITAL LETTER SIGMA */
/* 0xD4 */ 0x03A4, /* GREEK CAPITAL LETTER TAU */
/* 0xD5 */ 0x03A5, /* GREEK CAPITAL LETTER UPSILON */
/* 0xD6 */ 0x03A6, /* GREEK CAPITAL LETTER PHI */
/* 0xD7 */ 0x03A7, /* GREEK CAPITAL LETTER CHI */
/* 0xD8 */ 0x03A8, /* GREEK CAPITAL LETTER PSI */
/* 0xD9 */ 0x03A9, /* GREEK CAPITAL LETTER OMEGA */
/* 0xDA */ 0x03AA, /* GREEK CAPITAL LETTER IOTA WITH DIALYTIKA */
/* 0xDB */ 0x03AB, /* GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA */
/* 0xDC */ 0x03AC, /* GREEK SMALL LETTER ALPHA WITH TONOS */
/* 0xDD */ 0x03AD, /* GREEK SMALL LETTER EPSILON WITH TONOS */
/* 0xDE */ 0x03AE, /* GREEK SMALL LETTER ETA WITH TONOS */
/* 0xDF */ 0x03AF, /* GREEK SMALL LETTER IOTA WITH TONOS */
/* 0xE0 */ 0x03B0, /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS */
/* 0xE1 */ 0x03B1, /* GREEK SMALL LETTER ALPHA */
/* 0xE2 */ 0x03B2, /* GREEK SMALL LETTER BETA */
/* 0xE3 */ 0x03B3, /* GREEK SMALL LETTER GAMMA */
/* 0xE4 */ 0x03B4, /* GREEK SMALL LETTER DELTA */
/* 0xE5 */ 0x03B5, /* GREEK SMALL LETTER EPSILON */
/* 0xE6 */ 0x03B6, /* GREEK SMALL LETTER ZETA */
/* 0xE7 */ 0x03B7, /* GREEK SMALL LETTER ETA */
/* 0xE8 */ 0x03B8, /* GREEK SMALL LETTER THETA */
/* 0xE9 */ 0x03B9, /* GREEK SMALL LETTER IOTA */
/* 0xEA */ 0x03BA, /* GREEK SMALL LETTER KAPPA */
/* 0xEB */ 0x03BB, /* GREEK SMALL LETTER LAMDA */
/* 0xEC */ 0x03BC, /* GREEK SMALL LETTER MU */
/* 0xED */ 0x03BD, /* GREEK SMALL LETTER NU */
/* 0xEE */ 0x03BE, /* GREEK SMALL LETTER XI */
/* 0xEF */ 0x03BF, /* GREEK SMALL LETTER OMICRON */
/* 0xF0 */ 0x03C0, /* GREEK SMALL LETTER PI */
/* 0xF1 */ 0x03C1, /* GREEK SMALL LETTER RHO */
/* 0xF2 */ 0x03C2, /* GREEK SMALL LETTER FINAL SIGMA */
/* 0xF3 */ 0x03C3, /* GREEK SMALL LETTER SIGMA */
/* 0xF4 */ 0x03C4, /* GREEK SMALL LETTER TAU */
/* 0xF5 */ 0x03C5, /* GREEK SMALL LETTER UPSILON */
/* 0xF6 */ 0x03C6, /* GREEK SMALL LETTER PHI */
/* 0xF7 */ 0x03C7, /* GREEK SMALL LETTER CHI */
/* 0xF8 */ 0x03C8, /* GREEK SMALL LETTER PSI */
/* 0xF9 */ 0x03C9, /* GREEK SMALL LETTER OMEGA */
/* 0xFA */ 0x03CA, /* GREEK SMALL LETTER IOTA WITH DIALYTIKA */
/* 0xFB */ 0x03CB, /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA */
/* 0xFC */ 0x03CC, /* GREEK SMALL LETTER OMICRON WITH TONOS */
/* 0xFD */ 0x03CD, /* GREEK SMALL LETTER UPSILON WITH TONOS */
/* 0xFE */ 0x03CE, /* GREEK SMALL LETTER OMEGA WITH TONOS */
/* 0xFF */ 0xFFFF,
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* ISO 8859-7 uses table_NULL as its table_entry list (table_NULL is defined
 * elsewhere; presumably it means "no extra mappings" -- confirm). */
#define table_8859_7 table_NULL

/* Alias names listed for the ISO 8859-7 codepage.  The list is closed by a
 * NULL sentinel rather than carrying an explicit length. */
char *const aliases_8859_7[] = {
	"ISO-8859-7", "iso8859-7", "8859-7",
	"iso-ir-126", "elot-928", "ISO_8859-7:1987",
	"ECMA-118", "ELOT_928", "GREEK8", "GREEK",
	"csISOLatinGreek", "ISO8859_7",
	NULL
};
/*** 8859_8 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* High half of the 8859_8 codepage: element i holds the Unicode code point
 * listed for byte 0x80 + i, so the byte is implied by the position and is
 * not stored.  Entries equal to 0xFFFF have no character listed.
 * NOTE(review): 0xFFFF presumably means "byte has no mapping" -- confirm
 * against the code that reads this table. */
const uint16_t highhalf_8859_8[] = {
	/* 0x80..0x9F: <control>; every byte maps to the code point of equal value */
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,

	/* 0xA0..0xBE */
	0x00A0, /* 0xA0: NO-BREAK SPACE */
	0xFFFF, /* 0xA1: no character listed */
	0x00A2, /* 0xA2: CENT SIGN */
	0x00A3, /* 0xA3: POUND SIGN */
	0x00A4, /* 0xA4: CURRENCY SIGN */
	0x00A5, /* 0xA5: YEN SIGN */
	0x00A6, /* 0xA6: BROKEN BAR */
	0x00A7, /* 0xA7: SECTION SIGN */
	0x00A8, /* 0xA8: DIAERESIS */
	0x00A9, /* 0xA9: COPYRIGHT SIGN */
	0x00D7, /* 0xAA: MULTIPLICATION SIGN */
	0x00AB, /* 0xAB: LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x00AC, /* 0xAC: NOT SIGN */
	0x00AD, /* 0xAD: SOFT HYPHEN */
	0x00AE, /* 0xAE: REGISTERED SIGN */
	0x00AF, /* 0xAF: MACRON */
	0x00B0, /* 0xB0: DEGREE SIGN */
	0x00B1, /* 0xB1: PLUS-MINUS SIGN */
	0x00B2, /* 0xB2: SUPERSCRIPT TWO */
	0x00B3, /* 0xB3: SUPERSCRIPT THREE */
	0x00B4, /* 0xB4: ACUTE ACCENT */
	0x00B5, /* 0xB5: MICRO SIGN */
	0x00B6, /* 0xB6: PILCROW SIGN */
	0x00B7, /* 0xB7: MIDDLE DOT */
	0x00B8, /* 0xB8: CEDILLA */
	0x00B9, /* 0xB9: SUPERSCRIPT ONE */
	0x00F7, /* 0xBA: DIVISION SIGN */
	0x00BB, /* 0xBB: RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x00BC, /* 0xBC: VULGAR FRACTION ONE QUARTER */
	0x00BD, /* 0xBD: VULGAR FRACTION ONE HALF */
	0x00BE, /* 0xBE: VULGAR FRACTION THREE QUARTERS */

	/* 0xBF..0xDE: 32 consecutive bytes with no character listed */
	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,

	0x2017, /* 0xDF: DOUBLE LOW LINE */

	/* 0xE0..0xFA: Hebrew letters */
	0x05D0, /* 0xE0: HEBREW LETTER ALEF */
	0x05D1, /* 0xE1: HEBREW LETTER BET */
	0x05D2, /* 0xE2: HEBREW LETTER GIMEL */
	0x05D3, /* 0xE3: HEBREW LETTER DALET */
	0x05D4, /* 0xE4: HEBREW LETTER HE */
	0x05D5, /* 0xE5: HEBREW LETTER VAV */
	0x05D6, /* 0xE6: HEBREW LETTER ZAYIN */
	0x05D7, /* 0xE7: HEBREW LETTER HET */
	0x05D8, /* 0xE8: HEBREW LETTER TET */
	0x05D9, /* 0xE9: HEBREW LETTER YOD */
	0x05DA, /* 0xEA: HEBREW LETTER FINAL KAF */
	0x05DB, /* 0xEB: HEBREW LETTER KAF */
	0x05DC, /* 0xEC: HEBREW LETTER LAMED */
	0x05DD, /* 0xED: HEBREW LETTER FINAL MEM */
	0x05DE, /* 0xEE: HEBREW LETTER MEM */
	0x05DF, /* 0xEF: HEBREW LETTER FINAL NUN */
	0x05E0, /* 0xF0: HEBREW LETTER NUN */
	0x05E1, /* 0xF1: HEBREW LETTER SAMEKH */
	0x05E2, /* 0xF2: HEBREW LETTER AYIN */
	0x05E3, /* 0xF3: HEBREW LETTER FINAL PE */
	0x05E4, /* 0xF4: HEBREW LETTER PE */
	0x05E5, /* 0xF5: HEBREW LETTER FINAL TSADI */
	0x05E6, /* 0xF6: HEBREW LETTER TSADI */
	0x05E7, /* 0xF7: HEBREW LETTER QOF */
	0x05E8, /* 0xF8: HEBREW LETTER RESH */
	0x05E9, /* 0xF9: HEBREW LETTER SHIN */
	0x05EA, /* 0xFA: HEBREW LETTER TAV */

	/* 0xFB..0xFF */
	0xFFFF, /* 0xFB: no character listed */
	0xFFFF, /* 0xFC: no character listed */
	0x200E, /* 0xFD: LEFT-TO-RIGHT MARK */
	0x200F, /* 0xFE: RIGHT-TO-LEFT MARK */
	0xFFFF, /* 0xFF: no character listed */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The 8859_8 table name expands to table_NULL, which is defined elsewhere.
 * NOTE(review): presumably the shared empty table used by codepages that
 * need no extra struct table_entry mappings -- confirm at the definition. */
#define table_8859_8 table_NULL

/* Names accepted for the 8859_8 codepage, in their original order.
 * The final NULL terminates the list. */
char *const aliases_8859_8[] = {
	"ISO-8859-8",
	"8859-8",
	"ISO_8859-8:1988",
	"ISO-IR-138",
	"HEBREW",
	"csISOLatinHebrew",
	"ISO8859-8",
	"ISO8859_8",
	NULL /* terminator */
};
/*** 8859_9 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* High half of the 8859_9 codepage: element i holds the Unicode code point
 * listed for byte 0x80 + i, so the byte is implied by the position and is
 * not stored.  Every entry equals its own byte value except the six at
 * 0xD0, 0xDD, 0xDE, 0xF0, 0xFD and 0xFE. */
const uint16_t highhalf_8859_9[] = {
	/* 0x80..0x9F: <control>; every byte maps to the code point of equal value */
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,

	/* 0xA0..0xBF */
	0x00A0, /* 0xA0: NO-BREAK SPACE */
	0x00A1, /* 0xA1: INVERTED EXCLAMATION MARK */
	0x00A2, /* 0xA2: CENT SIGN */
	0x00A3, /* 0xA3: POUND SIGN */
	0x00A4, /* 0xA4: CURRENCY SIGN */
	0x00A5, /* 0xA5: YEN SIGN */
	0x00A6, /* 0xA6: BROKEN BAR */
	0x00A7, /* 0xA7: SECTION SIGN */
	0x00A8, /* 0xA8: DIAERESIS */
	0x00A9, /* 0xA9: COPYRIGHT SIGN */
	0x00AA, /* 0xAA: FEMININE ORDINAL INDICATOR */
	0x00AB, /* 0xAB: LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x00AC, /* 0xAC: NOT SIGN */
	0x00AD, /* 0xAD: SOFT HYPHEN */
	0x00AE, /* 0xAE: REGISTERED SIGN */
	0x00AF, /* 0xAF: MACRON */
	0x00B0, /* 0xB0: DEGREE SIGN */
	0x00B1, /* 0xB1: PLUS-MINUS SIGN */
	0x00B2, /* 0xB2: SUPERSCRIPT TWO */
	0x00B3, /* 0xB3: SUPERSCRIPT THREE */
	0x00B4, /* 0xB4: ACUTE ACCENT */
	0x00B5, /* 0xB5: MICRO SIGN */
	0x00B6, /* 0xB6: PILCROW SIGN */
	0x00B7, /* 0xB7: MIDDLE DOT */
	0x00B8, /* 0xB8: CEDILLA */
	0x00B9, /* 0xB9: SUPERSCRIPT ONE */
	0x00BA, /* 0xBA: MASCULINE ORDINAL INDICATOR */
	0x00BB, /* 0xBB: RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x00BC, /* 0xBC: VULGAR FRACTION ONE QUARTER */
	0x00BD, /* 0xBD: VULGAR FRACTION ONE HALF */
	0x00BE, /* 0xBE: VULGAR FRACTION THREE QUARTERS */
	0x00BF, /* 0xBF: INVERTED QUESTION MARK */

	/* 0xC0..0xDF */
	0x00C0, /* 0xC0: LATIN CAPITAL LETTER A WITH GRAVE */
	0x00C1, /* 0xC1: LATIN CAPITAL LETTER A WITH ACUTE */
	0x00C2, /* 0xC2: LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	0x00C3, /* 0xC3: LATIN CAPITAL LETTER A WITH TILDE */
	0x00C4, /* 0xC4: LATIN CAPITAL LETTER A WITH DIAERESIS */
	0x00C5, /* 0xC5: LATIN CAPITAL LETTER A WITH RING ABOVE */
	0x00C6, /* 0xC6: LATIN CAPITAL LETTER AE */
	0x00C7, /* 0xC7: LATIN CAPITAL LETTER C WITH CEDILLA */
	0x00C8, /* 0xC8: LATIN CAPITAL LETTER E WITH GRAVE */
	0x00C9, /* 0xC9: LATIN CAPITAL LETTER E WITH ACUTE */
	0x00CA, /* 0xCA: LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
	0x00CB, /* 0xCB: LATIN CAPITAL LETTER E WITH DIAERESIS */
	0x00CC, /* 0xCC: LATIN CAPITAL LETTER I WITH GRAVE */
	0x00CD, /* 0xCD: LATIN CAPITAL LETTER I WITH ACUTE */
	0x00CE, /* 0xCE: LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	0x00CF, /* 0xCF: LATIN CAPITAL LETTER I WITH DIAERESIS */
	0x011E, /* 0xD0: LATIN CAPITAL LETTER G WITH BREVE */
	0x00D1, /* 0xD1: LATIN CAPITAL LETTER N WITH TILDE */
	0x00D2, /* 0xD2: LATIN CAPITAL LETTER O WITH GRAVE */
	0x00D3, /* 0xD3: LATIN CAPITAL LETTER O WITH ACUTE */
	0x00D4, /* 0xD4: LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	0x00D5, /* 0xD5: LATIN CAPITAL LETTER O WITH TILDE */
	0x00D6, /* 0xD6: LATIN CAPITAL LETTER O WITH DIAERESIS */
	0x00D7, /* 0xD7: MULTIPLICATION SIGN */
	0x00D8, /* 0xD8: LATIN CAPITAL LETTER O WITH STROKE */
	0x00D9, /* 0xD9: LATIN CAPITAL LETTER U WITH GRAVE */
	0x00DA, /* 0xDA: LATIN CAPITAL LETTER U WITH ACUTE */
	0x00DB, /* 0xDB: LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	0x00DC, /* 0xDC: LATIN CAPITAL LETTER U WITH DIAERESIS */
	0x0130, /* 0xDD: LATIN CAPITAL LETTER I WITH DOT ABOVE */
	0x015E, /* 0xDE: LATIN CAPITAL LETTER S WITH CEDILLA */
	0x00DF, /* 0xDF: LATIN SMALL LETTER SHARP S */

	/* 0xE0..0xFF */
	0x00E0, /* 0xE0: LATIN SMALL LETTER A WITH GRAVE */
	0x00E1, /* 0xE1: LATIN SMALL LETTER A WITH ACUTE */
	0x00E2, /* 0xE2: LATIN SMALL LETTER A WITH CIRCUMFLEX */
	0x00E3, /* 0xE3: LATIN SMALL LETTER A WITH TILDE */
	0x00E4, /* 0xE4: LATIN SMALL LETTER A WITH DIAERESIS */
	0x00E5, /* 0xE5: LATIN SMALL LETTER A WITH RING ABOVE */
	0x00E6, /* 0xE6: LATIN SMALL LETTER AE */
	0x00E7, /* 0xE7: LATIN SMALL LETTER C WITH CEDILLA */
	0x00E8, /* 0xE8: LATIN SMALL LETTER E WITH GRAVE */
	0x00E9, /* 0xE9: LATIN SMALL LETTER E WITH ACUTE */
	0x00EA, /* 0xEA: LATIN SMALL LETTER E WITH CIRCUMFLEX */
	0x00EB, /* 0xEB: LATIN SMALL LETTER E WITH DIAERESIS */
	0x00EC, /* 0xEC: LATIN SMALL LETTER I WITH GRAVE */
	0x00ED, /* 0xED: LATIN SMALL LETTER I WITH ACUTE */
	0x00EE, /* 0xEE: LATIN SMALL LETTER I WITH CIRCUMFLEX */
	0x00EF, /* 0xEF: LATIN SMALL LETTER I WITH DIAERESIS */
	0x011F, /* 0xF0: LATIN SMALL LETTER G WITH BREVE */
	0x00F1, /* 0xF1: LATIN SMALL LETTER N WITH TILDE */
	0x00F2, /* 0xF2: LATIN SMALL LETTER O WITH GRAVE */
	0x00F3, /* 0xF3: LATIN SMALL LETTER O WITH ACUTE */
	0x00F4, /* 0xF4: LATIN SMALL LETTER O WITH CIRCUMFLEX */
	0x00F5, /* 0xF5: LATIN SMALL LETTER O WITH TILDE */
	0x00F6, /* 0xF6: LATIN SMALL LETTER O WITH DIAERESIS */
	0x00F7, /* 0xF7: DIVISION SIGN */
	0x00F8, /* 0xF8: LATIN SMALL LETTER O WITH STROKE */
	0x00F9, /* 0xF9: LATIN SMALL LETTER U WITH GRAVE */
	0x00FA, /* 0xFA: LATIN SMALL LETTER U WITH ACUTE */
	0x00FB, /* 0xFB: LATIN SMALL LETTER U WITH CIRCUMFLEX */
	0x00FC, /* 0xFC: LATIN SMALL LETTER U WITH DIAERESIS */
	0x0131, /* 0xFD: LATIN SMALL LETTER DOTLESS I */
	0x015F, /* 0xFE: LATIN SMALL LETTER S WITH CEDILLA */
	0x00FF, /* 0xFF: LATIN SMALL LETTER Y WITH DIAERESIS */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The 8859_9 table name expands to table_NULL, which is defined elsewhere.
 * NOTE(review): presumably the shared empty table used by codepages that
 * need no extra struct table_entry mappings -- confirm at the definition. */
#define table_8859_9 table_NULL

/* Names accepted for the 8859_9 codepage, in their original order.
 * The final NULL terminates the list. */
char *const aliases_8859_9[] = {
	"ISO-8859-9",
	"iso8859-9",
	"8859-9",
	"ISO_8859-9:1989",
	"ISO-IR-148",
	"LATIN5",
	"L5",
	"csISOLatin5",
	"ISO8859_9",
	NULL /* terminator */
};
/*** 8859_10 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* High half of the 8859_10 codepage: element i holds the Unicode code point
 * listed for byte 0x80 + i, so the byte is implied by the position and is
 * not stored. */
const uint16_t highhalf_8859_10[] = {
	/* 0x80..0x9F: <control>; every byte maps to the code point of equal value */
	0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
	0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
	0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
	0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,

	/* 0xA0..0xBF */
	0x00A0, /* 0xA0: NO-BREAK SPACE */
	0x0104, /* 0xA1: LATIN CAPITAL LETTER A WITH OGONEK */
	0x0112, /* 0xA2: LATIN CAPITAL LETTER E WITH MACRON */
	0x0122, /* 0xA3: LATIN CAPITAL LETTER G WITH CEDILLA */
	0x012A, /* 0xA4: LATIN CAPITAL LETTER I WITH MACRON */
	0x0128, /* 0xA5: LATIN CAPITAL LETTER I WITH TILDE */
	0x0136, /* 0xA6: LATIN CAPITAL LETTER K WITH CEDILLA */
	0x00A7, /* 0xA7: SECTION SIGN */
	0x013B, /* 0xA8: LATIN CAPITAL LETTER L WITH CEDILLA */
	0x0110, /* 0xA9: LATIN CAPITAL LETTER D WITH STROKE */
	0x0160, /* 0xAA: LATIN CAPITAL LETTER S WITH CARON */
	0x0166, /* 0xAB: LATIN CAPITAL LETTER T WITH STROKE */
	0x017D, /* 0xAC: LATIN CAPITAL LETTER Z WITH CARON */
	0x00AD, /* 0xAD: SOFT HYPHEN */
	0x016A, /* 0xAE: LATIN CAPITAL LETTER U WITH MACRON */
	0x014A, /* 0xAF: LATIN CAPITAL LETTER ENG */
	0x00B0, /* 0xB0: DEGREE SIGN */
	0x0105, /* 0xB1: LATIN SMALL LETTER A WITH OGONEK */
	0x0113, /* 0xB2: LATIN SMALL LETTER E WITH MACRON */
	0x0123, /* 0xB3: LATIN SMALL LETTER G WITH CEDILLA */
	0x012B, /* 0xB4: LATIN SMALL LETTER I WITH MACRON */
	0x0129, /* 0xB5: LATIN SMALL LETTER I WITH TILDE */
	0x0137, /* 0xB6: LATIN SMALL LETTER K WITH CEDILLA */
	0x00B7, /* 0xB7: MIDDLE DOT */
	0x013C, /* 0xB8: LATIN SMALL LETTER L WITH CEDILLA */
	0x0111, /* 0xB9: LATIN SMALL LETTER D WITH STROKE */
	0x0161, /* 0xBA: LATIN SMALL LETTER S WITH CARON */
	0x0167, /* 0xBB: LATIN SMALL LETTER T WITH STROKE */
	0x017E, /* 0xBC: LATIN SMALL LETTER Z WITH CARON */
	0x2015, /* 0xBD: HORIZONTAL BAR */
	0x016B, /* 0xBE: LATIN SMALL LETTER U WITH MACRON */
	0x014B, /* 0xBF: LATIN SMALL LETTER ENG */

	/* 0xC0..0xDF */
	0x0100, /* 0xC0: LATIN CAPITAL LETTER A WITH MACRON */
	0x00C1, /* 0xC1: LATIN CAPITAL LETTER A WITH ACUTE */
	0x00C2, /* 0xC2: LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	0x00C3, /* 0xC3: LATIN CAPITAL LETTER A WITH TILDE */
	0x00C4, /* 0xC4: LATIN CAPITAL LETTER A WITH DIAERESIS */
	0x00C5, /* 0xC5: LATIN CAPITAL LETTER A WITH RING ABOVE */
	0x00C6, /* 0xC6: LATIN CAPITAL LETTER AE */
	0x012E, /* 0xC7: LATIN CAPITAL LETTER I WITH OGONEK */
	0x010C, /* 0xC8: LATIN CAPITAL LETTER C WITH CARON */
	0x00C9, /* 0xC9: LATIN CAPITAL LETTER E WITH ACUTE */
	0x0118, /* 0xCA: LATIN CAPITAL LETTER E WITH OGONEK */
	0x00CB, /* 0xCB: LATIN CAPITAL LETTER E WITH DIAERESIS */
	0x0116, /* 0xCC: LATIN CAPITAL LETTER E WITH DOT ABOVE */
	0x00CD, /* 0xCD: LATIN CAPITAL LETTER I WITH ACUTE */
	0x00CE, /* 0xCE: LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	0x00CF, /* 0xCF: LATIN CAPITAL LETTER I WITH DIAERESIS */
	0x00D0, /* 0xD0: LATIN CAPITAL LETTER ETH (Icelandic) */
	0x0145, /* 0xD1: LATIN CAPITAL LETTER N WITH CEDILLA */
	0x014C, /* 0xD2: LATIN CAPITAL LETTER O WITH MACRON */
	0x00D3, /* 0xD3: LATIN CAPITAL LETTER O WITH ACUTE */
	0x00D4, /* 0xD4: LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	0x00D5, /* 0xD5: LATIN CAPITAL LETTER O WITH TILDE */
	0x00D6, /* 0xD6: LATIN CAPITAL LETTER O WITH DIAERESIS */
	0x0168, /* 0xD7: LATIN CAPITAL LETTER U WITH TILDE */
	0x00D8, /* 0xD8: LATIN CAPITAL LETTER O WITH STROKE */
	0x0172, /* 0xD9: LATIN CAPITAL LETTER U WITH OGONEK */
	0x00DA, /* 0xDA: LATIN CAPITAL LETTER U WITH ACUTE */
	0x00DB, /* 0xDB: LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	0x00DC, /* 0xDC: LATIN CAPITAL LETTER U WITH DIAERESIS */
	0x00DD, /* 0xDD: LATIN CAPITAL LETTER Y WITH ACUTE */
	0x00DE, /* 0xDE: LATIN CAPITAL LETTER THORN (Icelandic) */
	0x00DF, /* 0xDF: LATIN SMALL LETTER SHARP S (German) */

	/* 0xE0..0xFF */
	0x0101, /* 0xE0: LATIN SMALL LETTER A WITH MACRON */
	0x00E1, /* 0xE1: LATIN SMALL LETTER A WITH ACUTE */
	0x00E2, /* 0xE2: LATIN SMALL LETTER A WITH CIRCUMFLEX */
	0x00E3, /* 0xE3: LATIN SMALL LETTER A WITH TILDE */
	0x00E4, /* 0xE4: LATIN SMALL LETTER A WITH DIAERESIS */
	0x00E5, /* 0xE5: LATIN SMALL LETTER A WITH RING ABOVE */
	0x00E6, /* 0xE6: LATIN SMALL LETTER AE */
	0x012F, /* 0xE7: LATIN SMALL LETTER I WITH OGONEK */
	0x010D, /* 0xE8: LATIN SMALL LETTER C WITH CARON */
	0x00E9, /* 0xE9: LATIN SMALL LETTER E WITH ACUTE */
	0x0119, /* 0xEA: LATIN SMALL LETTER E WITH OGONEK */
	0x00EB, /* 0xEB: LATIN SMALL LETTER E WITH DIAERESIS */
	0x0117, /* 0xEC: LATIN SMALL LETTER E WITH DOT ABOVE */
	0x00ED, /* 0xED: LATIN SMALL LETTER I WITH ACUTE */
	0x00EE, /* 0xEE: LATIN SMALL LETTER I WITH CIRCUMFLEX */
	0x00EF, /* 0xEF: LATIN SMALL LETTER I WITH DIAERESIS */
	0x00F0, /* 0xF0: LATIN SMALL LETTER ETH (Icelandic) */
	0x0146, /* 0xF1: LATIN SMALL LETTER N WITH CEDILLA */
	0x014D, /* 0xF2: LATIN SMALL LETTER O WITH MACRON */
	0x00F3, /* 0xF3: LATIN SMALL LETTER O WITH ACUTE */
	0x00F4, /* 0xF4: LATIN SMALL LETTER O WITH CIRCUMFLEX */
	0x00F5, /* 0xF5: LATIN SMALL LETTER O WITH TILDE */
	0x00F6, /* 0xF6: LATIN SMALL LETTER O WITH DIAERESIS */
	0x0169, /* 0xF7: LATIN SMALL LETTER U WITH TILDE */
	0x00F8, /* 0xF8: LATIN SMALL LETTER O WITH STROKE */
	0x0173, /* 0xF9: LATIN SMALL LETTER U WITH OGONEK */
	0x00FA, /* 0xFA: LATIN SMALL LETTER U WITH ACUTE */
	0x00FB, /* 0xFB: LATIN SMALL LETTER U WITH CIRCUMFLEX */
	0x00FC, /* 0xFC: LATIN SMALL LETTER U WITH DIAERESIS */
	0x00FD, /* 0xFD: LATIN SMALL LETTER Y WITH ACUTE */
	0x00FE, /* 0xFE: LATIN SMALL LETTER THORN (Icelandic) */
	0x0138, /* 0xFF: LATIN SMALL LETTER KRA */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The 8859_10 table name expands to table_NULL, which is defined elsewhere.
 * NOTE(review): presumably the shared empty table used by codepages that
 * need no extra struct table_entry mappings -- confirm at the definition. */
#define table_8859_10 table_NULL

/* Names accepted for the 8859_10 codepage, in their original order.
 * The final NULL terminates the list. */
char *const aliases_8859_10[] = {
	"ISO-8859-10",
	"8859-10",
	"ISO_8859-10:1992",
	"ISO-IR-157",
	"LATIN6",
	"L6",
	"csISOLatin6",
	"ISO8859-10",
	NULL /* terminator */
};
/*** 8859_13 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* ISO-8859-13 (Latin-7): Unicode code points for the bytes 0x80..0xFF.
 * The byte itself is not stored; element i describes byte 0x80 + i, so the
 * array holds exactly 128 entries. Each entry is annotated with the byte it
 * belongs to and the Unicode character name. */
const uint16_t highhalf_8859_13 [] = {
/* 0x80 */ 0x0080, /* <control> */
/* 0x81 */ 0x0081, /* <control> */
/* 0x82 */ 0x0082, /* <control> */
/* 0x83 */ 0x0083, /* <control> */
/* 0x84 */ 0x0084, /* <control> */
/* 0x85 */ 0x0085, /* <control> */
/* 0x86 */ 0x0086, /* <control> */
/* 0x87 */ 0x0087, /* <control> */
/* 0x88 */ 0x0088, /* <control> */
/* 0x89 */ 0x0089, /* <control> */
/* 0x8A */ 0x008A, /* <control> */
/* 0x8B */ 0x008B, /* <control> */
/* 0x8C */ 0x008C, /* <control> */
/* 0x8D */ 0x008D, /* <control> */
/* 0x8E */ 0x008E, /* <control> */
/* 0x8F */ 0x008F, /* <control> */
/* 0x90 */ 0x0090, /* <control> */
/* 0x91 */ 0x0091, /* <control> */
/* 0x92 */ 0x0092, /* <control> */
/* 0x93 */ 0x0093, /* <control> */
/* 0x94 */ 0x0094, /* <control> */
/* 0x95 */ 0x0095, /* <control> */
/* 0x96 */ 0x0096, /* <control> */
/* 0x97 */ 0x0097, /* <control> */
/* 0x98 */ 0x0098, /* <control> */
/* 0x99 */ 0x0099, /* <control> */
/* 0x9A */ 0x009A, /* <control> */
/* 0x9B */ 0x009B, /* <control> */
/* 0x9C */ 0x009C, /* <control> */
/* 0x9D */ 0x009D, /* <control> */
/* 0x9E */ 0x009E, /* <control> */
/* 0x9F */ 0x009F, /* <control> */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
/* 0xA2 */ 0x00A2, /* CENT SIGN */
/* 0xA3 */ 0x00A3, /* POUND SIGN */
/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
/* 0xA5 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
/* 0xA6 */ 0x00A6, /* BROKEN BAR */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xAA */ 0x0156, /* LATIN CAPITAL LETTER R WITH CEDILLA */
/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xAC */ 0x00AC, /* NOT SIGN */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
/* 0xAF */ 0x00C6, /* LATIN CAPITAL LETTER AE */
/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
/* 0xB4 */ 0x201C, /* LEFT DOUBLE QUOTATION MARK */
/* 0xB5 */ 0x00B5, /* MICRO SIGN */
/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
/* 0xB8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
/* 0xB9 */ 0x00B9, /* SUPERSCRIPT ONE */
/* 0xBA */ 0x0157, /* LATIN SMALL LETTER R WITH CEDILLA */
/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xBC */ 0x00BC, /* VULGAR FRACTION ONE QUARTER */
/* 0xBD */ 0x00BD, /* VULGAR FRACTION ONE HALF */
/* 0xBE */ 0x00BE, /* VULGAR FRACTION THREE QUARTERS */
/* 0xBF */ 0x00E6, /* LATIN SMALL LETTER AE */
/* 0xC0 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
/* 0xC1 */ 0x012E, /* LATIN CAPITAL LETTER I WITH OGONEK */
/* 0xC2 */ 0x0100, /* LATIN CAPITAL LETTER A WITH MACRON */
/* 0xC3 */ 0x0106, /* LATIN CAPITAL LETTER C WITH ACUTE */
/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
/* 0xC6 */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
/* 0xC7 */ 0x0112, /* LATIN CAPITAL LETTER E WITH MACRON */
/* 0xC8 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0xCA */ 0x0179, /* LATIN CAPITAL LETTER Z WITH ACUTE */
/* 0xCB */ 0x0116, /* LATIN CAPITAL LETTER E WITH DOT ABOVE */
/* 0xCC */ 0x0122, /* LATIN CAPITAL LETTER G WITH CEDILLA */
/* 0xCD */ 0x0136, /* LATIN CAPITAL LETTER K WITH CEDILLA */
/* 0xCE */ 0x012A, /* LATIN CAPITAL LETTER I WITH MACRON */
/* 0xCF */ 0x013B, /* LATIN CAPITAL LETTER L WITH CEDILLA */
/* 0xD0 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
/* 0xD1 */ 0x0143, /* LATIN CAPITAL LETTER N WITH ACUTE */
/* 0xD2 */ 0x0145, /* LATIN CAPITAL LETTER N WITH CEDILLA */
/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0xD4 */ 0x014C, /* LATIN CAPITAL LETTER O WITH MACRON */
/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
/* 0xD8 */ 0x0172, /* LATIN CAPITAL LETTER U WITH OGONEK */
/* 0xD9 */ 0x0141, /* LATIN CAPITAL LETTER L WITH STROKE */
/* 0xDA */ 0x015A, /* LATIN CAPITAL LETTER S WITH ACUTE */
/* 0xDB */ 0x016A, /* LATIN CAPITAL LETTER U WITH MACRON */
/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
/* 0xDD */ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */
/* 0xDE */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S (German) */
/* 0xE0 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
/* 0xE1 */ 0x012F, /* LATIN SMALL LETTER I WITH OGONEK */
/* 0xE2 */ 0x0101, /* LATIN SMALL LETTER A WITH MACRON */
/* 0xE3 */ 0x0107, /* LATIN SMALL LETTER C WITH ACUTE */
/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
/* 0xE6 */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
/* 0xE7 */ 0x0113, /* LATIN SMALL LETTER E WITH MACRON */
/* 0xE8 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xEA */ 0x017A, /* LATIN SMALL LETTER Z WITH ACUTE */
/* 0xEB */ 0x0117, /* LATIN SMALL LETTER E WITH DOT ABOVE */
/* 0xEC */ 0x0123, /* LATIN SMALL LETTER G WITH CEDILLA */
/* 0xED */ 0x0137, /* LATIN SMALL LETTER K WITH CEDILLA */
/* 0xEE */ 0x012B, /* LATIN SMALL LETTER I WITH MACRON */
/* 0xEF */ 0x013C, /* LATIN SMALL LETTER L WITH CEDILLA */
/* 0xF0 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
/* 0xF1 */ 0x0144, /* LATIN SMALL LETTER N WITH ACUTE */
/* 0xF2 */ 0x0146, /* LATIN SMALL LETTER N WITH CEDILLA */
/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xF4 */ 0x014D, /* LATIN SMALL LETTER O WITH MACRON */
/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
/* 0xF8 */ 0x0173, /* LATIN SMALL LETTER U WITH OGONEK */
/* 0xF9 */ 0x0142, /* LATIN SMALL LETTER L WITH STROKE */
/* 0xFA */ 0x015B, /* LATIN SMALL LETTER S WITH ACUTE */
/* 0xFB */ 0x016B, /* LATIN SMALL LETTER U WITH MACRON */
/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
/* 0xFD */ 0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */
/* 0xFE */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
/* 0xFF */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Make table_8859_13 another name for table_NULL.
 * NOTE(review): table_NULL is defined outside this chunk; presumably it is
 * the shared empty list of extra struct table_entry mappings, used because
 * every mapping of this codepage fits in highhalf_8859_13 -- confirm. */
#define table_8859_13 table_NULL
/* NULL-terminated list of charset names that refer to ISO-8859-13.
 * NOTE(review): how the lookup code compares these strings (e.g. case
 * sensitivity, significance of the first entry) is not visible here. */
char *const aliases_8859_13 [] = {
"ISO-8859-13",
"iso8859-13",
"8859-13",
"iso-ir-179",
"latin7",
"l7",
"il7",
"ISO_8859-13",
NULL
};
/*** 8859_14 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* ISO-8859-14 (Latin-8): Unicode code points for the bytes 0x80..0xFF.
 * The byte itself is not stored; element i describes byte 0x80 + i, so the
 * array holds exactly 128 entries. Each entry is annotated with the byte it
 * belongs to and the Unicode character name. */
const uint16_t highhalf_8859_14 [] = {
/* 0x80 */ 0x0080, /* <control> */
/* 0x81 */ 0x0081, /* <control> */
/* 0x82 */ 0x0082, /* <control> */
/* 0x83 */ 0x0083, /* <control> */
/* 0x84 */ 0x0084, /* <control> */
/* 0x85 */ 0x0085, /* <control> */
/* 0x86 */ 0x0086, /* <control> */
/* 0x87 */ 0x0087, /* <control> */
/* 0x88 */ 0x0088, /* <control> */
/* 0x89 */ 0x0089, /* <control> */
/* 0x8A */ 0x008A, /* <control> */
/* 0x8B */ 0x008B, /* <control> */
/* 0x8C */ 0x008C, /* <control> */
/* 0x8D */ 0x008D, /* <control> */
/* 0x8E */ 0x008E, /* <control> */
/* 0x8F */ 0x008F, /* <control> */
/* 0x90 */ 0x0090, /* <control> */
/* 0x91 */ 0x0091, /* <control> */
/* 0x92 */ 0x0092, /* <control> */
/* 0x93 */ 0x0093, /* <control> */
/* 0x94 */ 0x0094, /* <control> */
/* 0x95 */ 0x0095, /* <control> */
/* 0x96 */ 0x0096, /* <control> */
/* 0x97 */ 0x0097, /* <control> */
/* 0x98 */ 0x0098, /* <control> */
/* 0x99 */ 0x0099, /* <control> */
/* 0x9A */ 0x009A, /* <control> */
/* 0x9B */ 0x009B, /* <control> */
/* 0x9C */ 0x009C, /* <control> */
/* 0x9D */ 0x009D, /* <control> */
/* 0x9E */ 0x009E, /* <control> */
/* 0x9F */ 0x009F, /* <control> */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x1E02, /* LATIN CAPITAL LETTER B WITH DOT ABOVE */
/* 0xA2 */ 0x1E03, /* LATIN SMALL LETTER B WITH DOT ABOVE */
/* 0xA3 */ 0x00A3, /* POUND SIGN */
/* 0xA4 */ 0x010A, /* LATIN CAPITAL LETTER C WITH DOT ABOVE */
/* 0xA5 */ 0x010B, /* LATIN SMALL LETTER C WITH DOT ABOVE */
/* 0xA6 */ 0x1E0A, /* LATIN CAPITAL LETTER D WITH DOT ABOVE */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x1E80, /* LATIN CAPITAL LETTER W WITH GRAVE */
/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xAA */ 0x1E82, /* LATIN CAPITAL LETTER W WITH ACUTE */
/* 0xAB */ 0x1E0B, /* LATIN SMALL LETTER D WITH DOT ABOVE */
/* 0xAC */ 0x1EF2, /* LATIN CAPITAL LETTER Y WITH GRAVE */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
/* 0xAF */ 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
/* 0xB0 */ 0x1E1E, /* LATIN CAPITAL LETTER F WITH DOT ABOVE */
/* 0xB1 */ 0x1E1F, /* LATIN SMALL LETTER F WITH DOT ABOVE */
/* 0xB2 */ 0x0120, /* LATIN CAPITAL LETTER G WITH DOT ABOVE */
/* 0xB3 */ 0x0121, /* LATIN SMALL LETTER G WITH DOT ABOVE */
/* 0xB4 */ 0x1E40, /* LATIN CAPITAL LETTER M WITH DOT ABOVE */
/* 0xB5 */ 0x1E41, /* LATIN SMALL LETTER M WITH DOT ABOVE */
/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
/* 0xB7 */ 0x1E56, /* LATIN CAPITAL LETTER P WITH DOT ABOVE */
/* 0xB8 */ 0x1E81, /* LATIN SMALL LETTER W WITH GRAVE */
/* 0xB9 */ 0x1E57, /* LATIN SMALL LETTER P WITH DOT ABOVE */
/* 0xBA */ 0x1E83, /* LATIN SMALL LETTER W WITH ACUTE */
/* 0xBB */ 0x1E60, /* LATIN CAPITAL LETTER S WITH DOT ABOVE */
/* 0xBC */ 0x1EF3, /* LATIN SMALL LETTER Y WITH GRAVE */
/* 0xBD */ 0x1E84, /* LATIN CAPITAL LETTER W WITH DIAERESIS */
/* 0xBE */ 0x1E85, /* LATIN SMALL LETTER W WITH DIAERESIS */
/* 0xBF */ 0x1E61, /* LATIN SMALL LETTER S WITH DOT ABOVE */
/* 0xC0 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
/* 0xC3 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
/* 0xC6 */ 0x00C6, /* LATIN CAPITAL LETTER AE */
/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
/* 0xC8 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0xCA */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
/* 0xCC */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
/* 0xCF */ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
/* 0xD0 */ 0x0174, /* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */
/* 0xD1 */ 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
/* 0xD2 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
/* 0xD7 */ 0x1E6A, /* LATIN CAPITAL LETTER T WITH DOT ABOVE */
/* 0xD8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
/* 0xD9 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
/* 0xDE */ 0x0176, /* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */
/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0xE3 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
/* 0xE6 */ 0x00E6, /* LATIN SMALL LETTER AE */
/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
/* 0xEC */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
/* 0xEF */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
/* 0xF0 */ 0x0175, /* LATIN SMALL LETTER W WITH CIRCUMFLEX */
/* 0xF1 */ 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
/* 0xF2 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
/* 0xF7 */ 0x1E6B, /* LATIN SMALL LETTER T WITH DOT ABOVE */
/* 0xF8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
/* 0xFE */ 0x0177, /* LATIN SMALL LETTER Y WITH CIRCUMFLEX */
/* 0xFF */ 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Make table_8859_14 another name for table_NULL.
 * NOTE(review): table_NULL is defined outside this chunk; presumably it is
 * the shared empty list of extra struct table_entry mappings, used because
 * every mapping of this codepage fits in highhalf_8859_14 -- confirm. */
#define table_8859_14 table_NULL
/* NULL-terminated list of charset names that refer to ISO-8859-14.
 * NOTE(review): how the lookup code compares these strings (e.g. case
 * sensitivity, significance of the first entry) is not visible here. */
char *const aliases_8859_14 [] = {
"ISO-8859-14",
"8859-14",
"ISO_8859-14",
"ISO_8859-14:1998",
"ISO-IR-199",
"LATIN8",
"L8",
NULL
};
/*** 8859_15 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* ISO-8859-15 (Latin-9): Unicode code points for the bytes 0x80..0xFF.
 * The byte itself is not stored; element i describes byte 0x80 + i, so the
 * array holds exactly 128 entries.
 * Bytes 0x80..0x9F all hold 0xFFFF and carry no character name.
 * NOTE(review): 0xFFFF is presumably the "no mapping" marker understood by
 * the code that reads this table -- confirm against the consumer. */
const uint16_t highhalf_8859_15 [] = {
/* 0x80 */ 0xFFFF,
/* 0x81 */ 0xFFFF,
/* 0x82 */ 0xFFFF,
/* 0x83 */ 0xFFFF,
/* 0x84 */ 0xFFFF,
/* 0x85 */ 0xFFFF,
/* 0x86 */ 0xFFFF,
/* 0x87 */ 0xFFFF,
/* 0x88 */ 0xFFFF,
/* 0x89 */ 0xFFFF,
/* 0x8A */ 0xFFFF,
/* 0x8B */ 0xFFFF,
/* 0x8C */ 0xFFFF,
/* 0x8D */ 0xFFFF,
/* 0x8E */ 0xFFFF,
/* 0x8F */ 0xFFFF,
/* 0x90 */ 0xFFFF,
/* 0x91 */ 0xFFFF,
/* 0x92 */ 0xFFFF,
/* 0x93 */ 0xFFFF,
/* 0x94 */ 0xFFFF,
/* 0x95 */ 0xFFFF,
/* 0x96 */ 0xFFFF,
/* 0x97 */ 0xFFFF,
/* 0x98 */ 0xFFFF,
/* 0x99 */ 0xFFFF,
/* 0x9A */ 0xFFFF,
/* 0x9B */ 0xFFFF,
/* 0x9C */ 0xFFFF,
/* 0x9D */ 0xFFFF,
/* 0x9E */ 0xFFFF,
/* 0x9F */ 0xFFFF,
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x00A1, /* INVERTED EXCLAMATION MARK */
/* 0xA2 */ 0x00A2, /* CENT SIGN */
/* 0xA3 */ 0x00A3, /* POUND SIGN */
/* 0xA4 */ 0x20AC, /* EURO SIGN */
/* 0xA5 */ 0x00A5, /* YEN SIGN */
/* 0xA6 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xAA */ 0x00AA, /* FEMININE ORDINAL INDICATOR */
/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xAC */ 0x00AC, /* NOT SIGN */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
/* 0xAF */ 0x00AF, /* MACRON */
/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
/* 0xB4 */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
/* 0xB5 */ 0x00B5, /* MICRO SIGN */
/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
/* 0xB8 */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
/* 0xB9 */ 0x00B9, /* SUPERSCRIPT ONE */
/* 0xBA */ 0x00BA, /* MASCULINE ORDINAL INDICATOR */
/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xBC */ 0x0152, /* LATIN CAPITAL LIGATURE OE */
/* 0xBD */ 0x0153, /* LATIN SMALL LIGATURE OE */
/* 0xBE */ 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
/* 0xBF */ 0x00BF, /* INVERTED QUESTION MARK */
/* 0xC0 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
/* 0xC3 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
/* 0xC6 */ 0x00C6, /* LATIN CAPITAL LETTER AE */
/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
/* 0xC8 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0xCA */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
/* 0xCC */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
/* 0xCF */ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
/* 0xD0 */ 0x00D0, /* LATIN CAPITAL LETTER ETH */
/* 0xD1 */ 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
/* 0xD2 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
/* 0xD8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
/* 0xD9 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
/* 0xDE */ 0x00DE, /* LATIN CAPITAL LETTER THORN */
/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0xE3 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
/* 0xE6 */ 0x00E6, /* LATIN SMALL LETTER AE */
/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
/* 0xEC */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
/* 0xEF */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
/* 0xF0 */ 0x00F0, /* LATIN SMALL LETTER ETH */
/* 0xF1 */ 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
/* 0xF2 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
/* 0xF8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
/* 0xFE */ 0x00FE, /* LATIN SMALL LETTER THORN */
/* 0xFF */ 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
};
/* table_8859_15 is only another name for table_NULL, which is defined outside
 * this section.  Presumably this means the codepage needs no struct
 * table_entry mappings beyond its highhalf array -- TODO confirm. */
#define table_8859_15 table_NULL

/* NULL-terminated list of the names under which ISO-8859-15 is known. */
char *const aliases_8859_15 [] = {
	"ISO-8859-15",
	"iso8859-15",
	"8859-15",
	"latin9",
	"l9",
	"il9",
	"latin0",
	"l0",
	"il0",
	"ISO_8859-15",
	"ISO_8859-15:1998",
	"ISO-IR-203",
	NULL
};
/*** 8859_16 ***/

/*
 * Unicode code points for the ISO-8859-16 bytes 0x80..0xFF.
 * The byte itself is not stored; it is implied by the position, so entry i
 * belongs to byte 0x80 + i (see the byte comment on each line).
 * 0xFFFF is used only on bytes that carry no character name here; presumably
 * it marks "no mapping" -- confirm against the code that reads this table.
 */
const uint16_t highhalf_8859_16 [] = {
	/* 0x80 */ 0xFFFF,
	/* 0x81 */ 0xFFFF,
	/* 0x82 */ 0xFFFF,
	/* 0x83 */ 0xFFFF,
	/* 0x84 */ 0xFFFF,
	/* 0x85 */ 0xFFFF,
	/* 0x86 */ 0xFFFF,
	/* 0x87 */ 0xFFFF,
	/* 0x88 */ 0xFFFF,
	/* 0x89 */ 0xFFFF,
	/* 0x8A */ 0xFFFF,
	/* 0x8B */ 0xFFFF,
	/* 0x8C */ 0xFFFF,
	/* 0x8D */ 0xFFFF,
	/* 0x8E */ 0xFFFF,
	/* 0x8F */ 0xFFFF,
	/* 0x90 */ 0xFFFF,
	/* 0x91 */ 0xFFFF,
	/* 0x92 */ 0xFFFF,
	/* 0x93 */ 0xFFFF,
	/* 0x94 */ 0xFFFF,
	/* 0x95 */ 0xFFFF,
	/* 0x96 */ 0xFFFF,
	/* 0x97 */ 0xFFFF,
	/* 0x98 */ 0xFFFF,
	/* 0x99 */ 0xFFFF,
	/* 0x9A */ 0xFFFF,
	/* 0x9B */ 0xFFFF,
	/* 0x9C */ 0xFFFF,
	/* 0x9D */ 0xFFFF,
	/* 0x9E */ 0xFFFF,
	/* 0x9F */ 0xFFFF,
	/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xA1 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
	/* 0xA2 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
	/* 0xA3 */ 0x0141, /* LATIN CAPITAL LETTER L WITH STROKE */
	/* 0xA4 */ 0x20AC, /* EURO SIGN */
	/* 0xA5 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
	/* 0xA6 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
	/* 0xA7 */ 0x00A7, /* SECTION SIGN */
	/* 0xA8 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
	/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
	/* 0xAA */ 0x0218, /* LATIN CAPITAL LETTER S WITH COMMA BELOW */
	/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xAC */ 0x0179, /* LATIN CAPITAL LETTER Z WITH ACUTE */
	/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
	/* 0xAE */ 0x017A, /* LATIN SMALL LETTER Z WITH ACUTE */
	/* 0xAF */ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */
	/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
	/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
	/* 0xB2 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
	/* 0xB3 */ 0x0142, /* LATIN SMALL LETTER L WITH STROKE */
	/* 0xB4 */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
	/* 0xB5 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
	/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
	/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
	/* 0xB8 */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
	/* 0xB9 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
	/* 0xBA */ 0x0219, /* LATIN SMALL LETTER S WITH COMMA BELOW */
	/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xBC */ 0x0152, /* LATIN CAPITAL LIGATURE OE */
	/* 0xBD */ 0x0153, /* LATIN SMALL LIGATURE OE */
	/* 0xBE */ 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
	/* 0xBF */ 0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */
	/* 0xC0 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
	/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	/* 0xC3 */ 0x0102, /* LATIN CAPITAL LETTER A WITH BREVE */
	/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0xC5 */ 0x0106, /* LATIN CAPITAL LETTER C WITH ACUTE */
	/* 0xC6 */ 0x00C6, /* LATIN CAPITAL LETTER AE */
	/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
	/* 0xC8 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
	/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0xCA */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
	/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
	/* 0xCC */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
	/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	/* 0xCF */ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
	/* 0xD0 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
	/* 0xD1 */ 0x0143, /* LATIN CAPITAL LETTER N WITH ACUTE */
	/* 0xD2 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
	/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
	/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xD5 */ 0x0150, /* LATIN CAPITAL LETTER O WITH DOUBLE ACUTE */
	/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0xD7 */ 0x015A, /* LATIN CAPITAL LETTER S WITH ACUTE */
	/* 0xD8 */ 0x0170, /* LATIN CAPITAL LETTER U WITH DOUBLE ACUTE */
	/* 0xD9 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
	/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0xDD */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
	/* 0xDE */ 0x021A, /* LATIN CAPITAL LETTER T WITH COMMA BELOW */
	/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
	/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
	/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
	/* 0xE3 */ 0x0103, /* LATIN SMALL LETTER A WITH BREVE */
	/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0xE5 */ 0x0107, /* LATIN SMALL LETTER C WITH ACUTE */
	/* 0xE6 */ 0x00E6, /* LATIN SMALL LETTER AE */
	/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
	/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
	/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
	/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
	/* 0xEC */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
	/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
	/* 0xEF */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
	/* 0xF0 */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
	/* 0xF1 */ 0x0144, /* LATIN SMALL LETTER N WITH ACUTE */
	/* 0xF2 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
	/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
	/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0xF5 */ 0x0151, /* LATIN SMALL LETTER O WITH DOUBLE ACUTE */
	/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0xF7 */ 0x015B, /* LATIN SMALL LETTER S WITH ACUTE */
	/* 0xF8 */ 0x0171, /* LATIN SMALL LETTER U WITH DOUBLE ACUTE */
	/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
	/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
	/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xFD */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
	/* 0xFE */ 0x021B, /* LATIN SMALL LETTER T WITH COMMA BELOW */
	/* 0xFF */ 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
};
/* table_8859_16 is only another name for table_NULL, which is defined outside
 * this section.  Presumably this means the codepage needs no struct
 * table_entry mappings beyond its highhalf array -- TODO confirm. */
#define table_8859_16 table_NULL

/* NULL-terminated list of the names under which ISO-8859-16 is known. */
char *const aliases_8859_16 [] = {
	"ISO-8859-16",
	"iso8859-16",
	"8859-16",
	"latin10",
	"l10",
	"il10",
	"ISO_8859-16",
	"ISO_8859-16:2000",
	"ISO-IR-226",
	"ISO_8859-16:2001",
	NULL
};
/*** cp1250 ***/

/*
 * Unicode code points for the windows-1250 (cp1250) bytes 0x80..0xFF.
 * The byte itself is not stored; it is implied by the position, so entry i
 * belongs to byte 0x80 + i (see the byte comment on each line).
 * 0xFFFF is used only on bytes that carry no character name here (0x81, 0x83,
 * 0x88, 0x90, 0x98); presumably it marks "no mapping" -- confirm against the
 * code that reads this table.
 */
const uint16_t highhalf_cp1250 [] = {
	/* 0x80 */ 0x20AC, /* EURO SIGN */
	/* 0x81 */ 0xFFFF,
	/* 0x82 */ 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
	/* 0x83 */ 0xFFFF,
	/* 0x84 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
	/* 0x85 */ 0x2026, /* HORIZONTAL ELLIPSIS */
	/* 0x86 */ 0x2020, /* DAGGER */
	/* 0x87 */ 0x2021, /* DOUBLE DAGGER */
	/* 0x88 */ 0xFFFF,
	/* 0x89 */ 0x2030, /* PER MILLE SIGN */
	/* 0x8A */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
	/* 0x8B */ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
	/* 0x8C */ 0x015A, /* LATIN CAPITAL LETTER S WITH ACUTE */
	/* 0x8D */ 0x0164, /* LATIN CAPITAL LETTER T WITH CARON */
	/* 0x8E */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
	/* 0x8F */ 0x0179, /* LATIN CAPITAL LETTER Z WITH ACUTE */
	/* 0x90 */ 0xFFFF,
	/* 0x91 */ 0x2018, /* LEFT SINGLE QUOTATION MARK */
	/* 0x92 */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
	/* 0x93 */ 0x201C, /* LEFT DOUBLE QUOTATION MARK */
	/* 0x94 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
	/* 0x95 */ 0x2022, /* BULLET */
	/* 0x96 */ 0x2013, /* EN DASH */
	/* 0x97 */ 0x2014, /* EM DASH */
	/* 0x98 */ 0xFFFF,
	/* 0x99 */ 0x2122, /* TRADE MARK SIGN */
	/* 0x9A */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
	/* 0x9B */ 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
	/* 0x9C */ 0x015B, /* LATIN SMALL LETTER S WITH ACUTE */
	/* 0x9D */ 0x0165, /* LATIN SMALL LETTER T WITH CARON */
	/* 0x9E */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
	/* 0x9F */ 0x017A, /* LATIN SMALL LETTER Z WITH ACUTE */
	/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xA1 */ 0x02C7, /* CARON */
	/* 0xA2 */ 0x02D8, /* BREVE */
	/* 0xA3 */ 0x0141, /* LATIN CAPITAL LETTER L WITH STROKE */
	/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
	/* 0xA5 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
	/* 0xA6 */ 0x00A6, /* BROKEN BAR */
	/* 0xA7 */ 0x00A7, /* SECTION SIGN */
	/* 0xA8 */ 0x00A8, /* DIAERESIS */
	/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
	/* 0xAA */ 0x015E, /* LATIN CAPITAL LETTER S WITH CEDILLA */
	/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xAC */ 0x00AC, /* NOT SIGN */
	/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
	/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
	/* 0xAF */ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */
	/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
	/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
	/* 0xB2 */ 0x02DB, /* OGONEK */
	/* 0xB3 */ 0x0142, /* LATIN SMALL LETTER L WITH STROKE */
	/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
	/* 0xB5 */ 0x00B5, /* MICRO SIGN */
	/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
	/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
	/* 0xB8 */ 0x00B8, /* CEDILLA */
	/* 0xB9 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
	/* 0xBA */ 0x015F, /* LATIN SMALL LETTER S WITH CEDILLA */
	/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xBC */ 0x013D, /* LATIN CAPITAL LETTER L WITH CARON */
	/* 0xBD */ 0x02DD, /* DOUBLE ACUTE ACCENT */
	/* 0xBE */ 0x013E, /* LATIN SMALL LETTER L WITH CARON */
	/* 0xBF */ 0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */
	/* 0xC0 */ 0x0154, /* LATIN CAPITAL LETTER R WITH ACUTE */
	/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	/* 0xC3 */ 0x0102, /* LATIN CAPITAL LETTER A WITH BREVE */
	/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0xC5 */ 0x0139, /* LATIN CAPITAL LETTER L WITH ACUTE */
	/* 0xC6 */ 0x0106, /* LATIN CAPITAL LETTER C WITH ACUTE */
	/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
	/* 0xC8 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
	/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0xCA */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
	/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
	/* 0xCC */ 0x011A, /* LATIN CAPITAL LETTER E WITH CARON */
	/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	/* 0xCF */ 0x010E, /* LATIN CAPITAL LETTER D WITH CARON */
	/* 0xD0 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
	/* 0xD1 */ 0x0143, /* LATIN CAPITAL LETTER N WITH ACUTE */
	/* 0xD2 */ 0x0147, /* LATIN CAPITAL LETTER N WITH CARON */
	/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
	/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xD5 */ 0x0150, /* LATIN CAPITAL LETTER O WITH DOUBLE ACUTE */
	/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
	/* 0xD8 */ 0x0158, /* LATIN CAPITAL LETTER R WITH CARON */
	/* 0xD9 */ 0x016E, /* LATIN CAPITAL LETTER U WITH RING ABOVE */
	/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0xDB */ 0x0170, /* LATIN CAPITAL LETTER U WITH DOUBLE ACUTE */
	/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
	/* 0xDE */ 0x0162, /* LATIN CAPITAL LETTER T WITH CEDILLA */
	/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
	/* 0xE0 */ 0x0155, /* LATIN SMALL LETTER R WITH ACUTE */
	/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
	/* 0xE3 */ 0x0103, /* LATIN SMALL LETTER A WITH BREVE */
	/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0xE5 */ 0x013A, /* LATIN SMALL LETTER L WITH ACUTE */
	/* 0xE6 */ 0x0107, /* LATIN SMALL LETTER C WITH ACUTE */
	/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
	/* 0xE8 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
	/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0xEA */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
	/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
	/* 0xEC */ 0x011B, /* LATIN SMALL LETTER E WITH CARON */
	/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
	/* 0xEF */ 0x010F, /* LATIN SMALL LETTER D WITH CARON */
	/* 0xF0 */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
	/* 0xF1 */ 0x0144, /* LATIN SMALL LETTER N WITH ACUTE */
	/* 0xF2 */ 0x0148, /* LATIN SMALL LETTER N WITH CARON */
	/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
	/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0xF5 */ 0x0151, /* LATIN SMALL LETTER O WITH DOUBLE ACUTE */
	/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
	/* 0xF8 */ 0x0159, /* LATIN SMALL LETTER R WITH CARON */
	/* 0xF9 */ 0x016F, /* LATIN SMALL LETTER U WITH RING ABOVE */
	/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0xFB */ 0x0171, /* LATIN SMALL LETTER U WITH DOUBLE ACUTE */
	/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
	/* 0xFE */ 0x0163, /* LATIN SMALL LETTER T WITH CEDILLA */
	/* 0xFF */ 0x02D9, /* DOT ABOVE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* cp1250 reuses table_NULL as its table.
 * NOTE(review): table_NULL is defined outside this section; presumably it is
 * the empty list of mappings that do not fit in the highhalf array -- confirm. */
#define table_cp1250 table_NULL

/* Alias names for the cp1250 codepage; the list ends with a NULL sentinel. */
char *const aliases_cp1250[] = {
	"windows-1250", "windows1250", "1250", "cp1250", "MS-EE",
	NULL,
};
/*** cp1251 ***/
/* [Appears to be a VCS blame annotation captured with this listing; not part of the generated C.] Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
 * 2006-09-24 09:55:29 -04:00 */
/*
 * High half of the cp1251 codepage: one Unicode code point for every byte
 * from 0x80 to 0xFF, listed in byte order.  The byte itself is not stored;
 * it is implied by the position in the array (entry i describes byte
 * 0x80 + i), which is what the leading comment on each row spells out.
 * The trailing comment on each row is the Unicode character name.
 *
 * NOTE(review): the row for byte 0x98 holds 0xFFFF and has no character
 * name; presumably 0xFFFF marks a byte with no Unicode mapping -- confirm
 * against the code that reads this table.
 */
const uint16_t highhalf_cp1251 [] = {
/* 0x80 */ 0x0402, /* CYRILLIC CAPITAL LETTER DJE */
/* 0x81 */ 0x0403, /* CYRILLIC CAPITAL LETTER GJE */
/* 0x82 */ 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
/* 0x83 */ 0x0453, /* CYRILLIC SMALL LETTER GJE */
/* 0x84 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
/* 0x85 */ 0x2026, /* HORIZONTAL ELLIPSIS */
/* 0x86 */ 0x2020, /* DAGGER */
/* 0x87 */ 0x2021, /* DOUBLE DAGGER */
/* 0x88 */ 0x20AC, /* EURO SIGN */
/* 0x89 */ 0x2030, /* PER MILLE SIGN */
/* 0x8A */ 0x0409, /* CYRILLIC CAPITAL LETTER LJE */
/* 0x8B */ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
/* 0x8C */ 0x040A, /* CYRILLIC CAPITAL LETTER NJE */
/* 0x8D */ 0x040C, /* CYRILLIC CAPITAL LETTER KJE */
/* 0x8E */ 0x040B, /* CYRILLIC CAPITAL LETTER TSHE */
/* 0x8F */ 0x040F, /* CYRILLIC CAPITAL LETTER DZHE */
/* 0x90 */ 0x0452, /* CYRILLIC SMALL LETTER DJE */
/* 0x91 */ 0x2018, /* LEFT SINGLE QUOTATION MARK */
/* 0x92 */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
/* 0x93 */ 0x201C, /* LEFT DOUBLE QUOTATION MARK */
/* 0x94 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
/* 0x95 */ 0x2022, /* BULLET */
/* 0x96 */ 0x2013, /* EN DASH */
/* 0x97 */ 0x2014, /* EM DASH */
/* 0x98 */ 0xFFFF, /* no character name on this row; presumably unmapped -- confirm */
/* 0x99 */ 0x2122, /* TRADE MARK SIGN */
/* 0x9A */ 0x0459, /* CYRILLIC SMALL LETTER LJE */
/* 0x9B */ 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
/* 0x9C */ 0x045A, /* CYRILLIC SMALL LETTER NJE */
/* 0x9D */ 0x045C, /* CYRILLIC SMALL LETTER KJE */
/* 0x9E */ 0x045B, /* CYRILLIC SMALL LETTER TSHE */
/* 0x9F */ 0x045F, /* CYRILLIC SMALL LETTER DZHE */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x040E, /* CYRILLIC CAPITAL LETTER SHORT U */
/* 0xA2 */ 0x045E, /* CYRILLIC SMALL LETTER SHORT U */
/* 0xA3 */ 0x0408, /* CYRILLIC CAPITAL LETTER JE */
/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
/* 0xA5 */ 0x0490, /* CYRILLIC CAPITAL LETTER GHE WITH UPTURN */
/* 0xA6 */ 0x00A6, /* BROKEN BAR */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x0401, /* CYRILLIC CAPITAL LETTER IO */
/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xAA */ 0x0404, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */
/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xAC */ 0x00AC, /* NOT SIGN */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
/* 0xAF */ 0x0407, /* CYRILLIC CAPITAL LETTER YI */
/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
/* 0xB2 */ 0x0406, /* CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xB3 */ 0x0456, /* CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xB4 */ 0x0491, /* CYRILLIC SMALL LETTER GHE WITH UPTURN */
/* 0xB5 */ 0x00B5, /* MICRO SIGN */
/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
/* 0xB8 */ 0x0451, /* CYRILLIC SMALL LETTER IO */
/* 0xB9 */ 0x2116, /* NUMERO SIGN */
/* 0xBA */ 0x0454, /* CYRILLIC SMALL LETTER UKRAINIAN IE */
/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xBC */ 0x0458, /* CYRILLIC SMALL LETTER JE */
/* 0xBD */ 0x0405, /* CYRILLIC CAPITAL LETTER DZE */
/* 0xBE */ 0x0455, /* CYRILLIC SMALL LETTER DZE */
/* 0xBF */ 0x0457, /* CYRILLIC SMALL LETTER YI */
/* 0xC0 */ 0x0410, /* CYRILLIC CAPITAL LETTER A */
/* 0xC1 */ 0x0411, /* CYRILLIC CAPITAL LETTER BE */
/* 0xC2 */ 0x0412, /* CYRILLIC CAPITAL LETTER VE */
/* 0xC3 */ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */
/* 0xC4 */ 0x0414, /* CYRILLIC CAPITAL LETTER DE */
/* 0xC5 */ 0x0415, /* CYRILLIC CAPITAL LETTER IE */
/* 0xC6 */ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */
/* 0xC7 */ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */
/* 0xC8 */ 0x0418, /* CYRILLIC CAPITAL LETTER I */
/* 0xC9 */ 0x0419, /* CYRILLIC CAPITAL LETTER SHORT I */
/* 0xCA */ 0x041A, /* CYRILLIC CAPITAL LETTER KA */
/* 0xCB */ 0x041B, /* CYRILLIC CAPITAL LETTER EL */
/* 0xCC */ 0x041C, /* CYRILLIC CAPITAL LETTER EM */
/* 0xCD */ 0x041D, /* CYRILLIC CAPITAL LETTER EN */
/* 0xCE */ 0x041E, /* CYRILLIC CAPITAL LETTER O */
/* 0xCF */ 0x041F, /* CYRILLIC CAPITAL LETTER PE */
/* 0xD0 */ 0x0420, /* CYRILLIC CAPITAL LETTER ER */
/* 0xD1 */ 0x0421, /* CYRILLIC CAPITAL LETTER ES */
/* 0xD2 */ 0x0422, /* CYRILLIC CAPITAL LETTER TE */
/* 0xD3 */ 0x0423, /* CYRILLIC CAPITAL LETTER U */
/* 0xD4 */ 0x0424, /* CYRILLIC CAPITAL LETTER EF */
/* 0xD5 */ 0x0425, /* CYRILLIC CAPITAL LETTER HA */
/* 0xD6 */ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */
/* 0xD7 */ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */
/* 0xD8 */ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */
/* 0xD9 */ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */
/* 0xDA */ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */
/* 0xDB */ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */
/* 0xDC */ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */
/* 0xDD */ 0x042D, /* CYRILLIC CAPITAL LETTER E */
/* 0xDE */ 0x042E, /* CYRILLIC CAPITAL LETTER YU */
/* 0xDF */ 0x042F, /* CYRILLIC CAPITAL LETTER YA */
/* 0xE0 */ 0x0430, /* CYRILLIC SMALL LETTER A */
/* 0xE1 */ 0x0431, /* CYRILLIC SMALL LETTER BE */
/* 0xE2 */ 0x0432, /* CYRILLIC SMALL LETTER VE */
/* 0xE3 */ 0x0433, /* CYRILLIC SMALL LETTER GHE */
/* 0xE4 */ 0x0434, /* CYRILLIC SMALL LETTER DE */
/* 0xE5 */ 0x0435, /* CYRILLIC SMALL LETTER IE */
/* 0xE6 */ 0x0436, /* CYRILLIC SMALL LETTER ZHE */
/* 0xE7 */ 0x0437, /* CYRILLIC SMALL LETTER ZE */
/* 0xE8 */ 0x0438, /* CYRILLIC SMALL LETTER I */
/* 0xE9 */ 0x0439, /* CYRILLIC SMALL LETTER SHORT I */
/* 0xEA */ 0x043A, /* CYRILLIC SMALL LETTER KA */
/* 0xEB */ 0x043B, /* CYRILLIC SMALL LETTER EL */
/* 0xEC */ 0x043C, /* CYRILLIC SMALL LETTER EM */
/* 0xED */ 0x043D, /* CYRILLIC SMALL LETTER EN */
/* 0xEE */ 0x043E, /* CYRILLIC SMALL LETTER O */
/* 0xEF */ 0x043F, /* CYRILLIC SMALL LETTER PE */
/* 0xF0 */ 0x0440, /* CYRILLIC SMALL LETTER ER */
/* 0xF1 */ 0x0441, /* CYRILLIC SMALL LETTER ES */
/* 0xF2 */ 0x0442, /* CYRILLIC SMALL LETTER TE */
/* 0xF3 */ 0x0443, /* CYRILLIC SMALL LETTER U */
/* 0xF4 */ 0x0444, /* CYRILLIC SMALL LETTER EF */
/* 0xF5 */ 0x0445, /* CYRILLIC SMALL LETTER HA */
/* 0xF6 */ 0x0446, /* CYRILLIC SMALL LETTER TSE */
/* 0xF7 */ 0x0447, /* CYRILLIC SMALL LETTER CHE */
/* 0xF8 */ 0x0448, /* CYRILLIC SMALL LETTER SHA */
/* 0xF9 */ 0x0449, /* CYRILLIC SMALL LETTER SHCHA */
/* 0xFA */ 0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */
/* 0xFB */ 0x044B, /* CYRILLIC SMALL LETTER YERU */
/* 0xFC */ 0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */
/* 0xFD */ 0x044D, /* CYRILLIC SMALL LETTER E */
/* 0xFE */ 0x044E, /* CYRILLIC SMALL LETTER YU */
/* 0xFF */ 0x044F, /* CYRILLIC SMALL LETTER YA */
};
/* [Appears to be a VCS blame annotation captured with this listing; not part of the generated C.] Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
 * 2006-09-24 09:55:29 -04:00 */
/* cp1251 reuses table_NULL as its table.
 * NOTE(review): table_NULL is defined outside this section; presumably it is
 * the empty list of mappings that do not fit in the highhalf array -- confirm. */
#define table_cp1251 table_NULL

/* Alias names for the cp1251 codepage; the list ends with a NULL sentinel. */
char *const aliases_cp1251[] = {
	"windows-1251", "windows1251", "1251", "cp1251", "MS-CYRL",
	NULL,
};
/*** cp1252 ***/
/* [Appears to be a VCS blame annotation captured with this listing; not part of the generated C.] Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
 * 2006-09-24 09:55:29 -04:00 */
/*
 * High half of the cp1252 codepage: one Unicode code point for every byte
 * from 0x80 to 0xFF, listed in byte order.  The byte itself is not stored;
 * it is implied by the position in the array (entry i describes byte
 * 0x80 + i), which is what the leading comment on each row spells out.
 * The trailing comment on each row is the Unicode character name.
 *
 * Bytes 0xA0..0xFF map to the code point with the same numeric value; only
 * the 0x80..0x9F rows differ from that.
 *
 * NOTE(review): the rows for bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D hold
 * 0xFFFF and have no character name; presumably 0xFFFF marks a byte with no
 * Unicode mapping -- confirm against the code that reads this table.
 *
 * Two runs of non-C annotation text that had ended up inside the
 * initializer (after the 0xC6 and 0xE6 rows) were removed; the 128 values
 * themselves are unchanged.
 */
const uint16_t highhalf_cp1252 [] = {
/* 0x80 */ 0x20AC, /* EURO SIGN */
/* 0x81 */ 0xFFFF,
/* 0x82 */ 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
/* 0x83 */ 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
/* 0x84 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
/* 0x85 */ 0x2026, /* HORIZONTAL ELLIPSIS */
/* 0x86 */ 0x2020, /* DAGGER */
/* 0x87 */ 0x2021, /* DOUBLE DAGGER */
/* 0x88 */ 0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
/* 0x89 */ 0x2030, /* PER MILLE SIGN */
/* 0x8A */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
/* 0x8B */ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
/* 0x8C */ 0x0152, /* LATIN CAPITAL LIGATURE OE */
/* 0x8D */ 0xFFFF,
/* 0x8E */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
/* 0x8F */ 0xFFFF,
/* 0x90 */ 0xFFFF,
/* 0x91 */ 0x2018, /* LEFT SINGLE QUOTATION MARK */
/* 0x92 */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
/* 0x93 */ 0x201C, /* LEFT DOUBLE QUOTATION MARK */
/* 0x94 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
/* 0x95 */ 0x2022, /* BULLET */
/* 0x96 */ 0x2013, /* EN DASH */
/* 0x97 */ 0x2014, /* EM DASH */
/* 0x98 */ 0x02DC, /* SMALL TILDE */
/* 0x99 */ 0x2122, /* TRADE MARK SIGN */
/* 0x9A */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
/* 0x9B */ 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
/* 0x9C */ 0x0153, /* LATIN SMALL LIGATURE OE */
/* 0x9D */ 0xFFFF,
/* 0x9E */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
/* 0x9F */ 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x00A1, /* INVERTED EXCLAMATION MARK */
/* 0xA2 */ 0x00A2, /* CENT SIGN */
/* 0xA3 */ 0x00A3, /* POUND SIGN */
/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
/* 0xA5 */ 0x00A5, /* YEN SIGN */
/* 0xA6 */ 0x00A6, /* BROKEN BAR */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x00A8, /* DIAERESIS */
/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xAA */ 0x00AA, /* FEMININE ORDINAL INDICATOR */
/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xAC */ 0x00AC, /* NOT SIGN */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
/* 0xAF */ 0x00AF, /* MACRON */
/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
/* 0xB5 */ 0x00B5, /* MICRO SIGN */
/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
/* 0xB8 */ 0x00B8, /* CEDILLA */
/* 0xB9 */ 0x00B9, /* SUPERSCRIPT ONE */
/* 0xBA */ 0x00BA, /* MASCULINE ORDINAL INDICATOR */
/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xBC */ 0x00BC, /* VULGAR FRACTION ONE QUARTER */
/* 0xBD */ 0x00BD, /* VULGAR FRACTION ONE HALF */
/* 0xBE */ 0x00BE, /* VULGAR FRACTION THREE QUARTERS */
/* 0xBF */ 0x00BF, /* INVERTED QUESTION MARK */
/* 0xC0 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
/* 0xC3 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
/* 0xC6 */ 0x00C6, /* LATIN CAPITAL LETTER AE */
/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
/* 0xC8 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0xCA */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
/* 0xCC */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
/* 0xCF */ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
/* 0xD0 */ 0x00D0, /* LATIN CAPITAL LETTER ETH */
/* 0xD1 */ 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
/* 0xD2 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
/* 0xD8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
/* 0xD9 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
/* 0xDE */ 0x00DE, /* LATIN CAPITAL LETTER THORN */
/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0xE3 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
/* 0xE6 */ 0x00E6, /* LATIN SMALL LETTER AE */
/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
/* 0xEC */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
/* 0xEF */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
/* 0xF0 */ 0x00F0, /* LATIN SMALL LETTER ETH */
/* 0xF1 */ 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
/* 0xF2 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
/* 0xF8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
/* 0xFE */ 0x00FE, /* LATIN SMALL LETTER THORN */
/* 0xFF */ 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
};
/* [Appears to be a VCS blame annotation captured with this listing; not part of the generated C.] Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
 * 2006-09-24 09:55:29 -04:00 */
/* cp1252 reuses table_NULL as its table.
 * NOTE(review): table_NULL is defined outside this section; presumably it is
 * the empty list of mappings that do not fit in the highhalf array -- confirm. */
#define table_cp1252 table_NULL

/*
 * Alias names for the cp1252 codepage; the list ends with a NULL sentinel.
 *
 * The first two entries are the original ones and keep their positions.
 * "windows1252", "cp1252" and "MS-ANSI" are appended so this list offers the
 * same kinds of spelling as the neighbouring cp1250/cp1251 lists
 * (windowsNNNN, cpNNNN and the MS-* name).
 *
 * NOTE(review): this file is generated ("EDIT Unicode/<whatever> INSTEAD"),
 * so the same aliases need to be added to the generator input or they will
 * be lost on regeneration.
 */
char *const aliases_cp1252 [] = {
	"windows-1252",
	"1252",
	"windows1252",
	"cp1252",
	"MS-ANSI",
	NULL
};
/*** cp1256 ***/
/* [Appears to be a VCS blame annotation captured with this listing; not part of the generated C.] Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
 * 2006-09-24 09:55:29 -04:00 */
/*
 * High half of the cp1256 codepage: one Unicode code point for every byte
 * from 0x80 to 0xFF, listed in byte order.  The byte itself is not stored;
 * it is implied by the position in the array (entry i describes byte
 * 0x80 + i), which is what the leading comment on each row spells out.
 * The trailing comment on each row is the Unicode character name.
 *
 * Every byte in this table has a named character.  Arabic letters and Latin
 * letters are interleaved in the 0xE0..0xFF range, so the rows there do not
 * form one contiguous run of code points.
 */
const uint16_t highhalf_cp1256 [] = {
/* 0x80 */ 0x20AC, /* EURO SIGN */
/* 0x81 */ 0x067E, /* ARABIC LETTER PEH */
/* 0x82 */ 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
/* 0x83 */ 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
/* 0x84 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
/* 0x85 */ 0x2026, /* HORIZONTAL ELLIPSIS */
/* 0x86 */ 0x2020, /* DAGGER */
/* 0x87 */ 0x2021, /* DOUBLE DAGGER */
/* 0x88 */ 0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
/* 0x89 */ 0x2030, /* PER MILLE SIGN */
/* 0x8A */ 0x0679, /* ARABIC LETTER TTEH */
/* 0x8B */ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
/* 0x8C */ 0x0152, /* LATIN CAPITAL LIGATURE OE */
/* 0x8D */ 0x0686, /* ARABIC LETTER TCHEH */
/* 0x8E */ 0x0698, /* ARABIC LETTER JEH */
/* 0x8F */ 0x0688, /* ARABIC LETTER DDAL */
/* 0x90 */ 0x06AF, /* ARABIC LETTER GAF */
/* 0x91 */ 0x2018, /* LEFT SINGLE QUOTATION MARK */
/* 0x92 */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
/* 0x93 */ 0x201C, /* LEFT DOUBLE QUOTATION MARK */
/* 0x94 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
/* 0x95 */ 0x2022, /* BULLET */
/* 0x96 */ 0x2013, /* EN DASH */
/* 0x97 */ 0x2014, /* EM DASH */
/* 0x98 */ 0x06A9, /* ARABIC LETTER KEHEH */
/* 0x99 */ 0x2122, /* TRADE MARK SIGN */
/* 0x9A */ 0x0691, /* ARABIC LETTER RREH */
/* 0x9B */ 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
/* 0x9C */ 0x0153, /* LATIN SMALL LIGATURE OE */
/* 0x9D */ 0x200C, /* ZERO WIDTH NON-JOINER */
/* 0x9E */ 0x200D, /* ZERO WIDTH JOINER */
/* 0x9F */ 0x06BA, /* ARABIC LETTER NOON GHUNNA */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x060C, /* ARABIC COMMA */
/* 0xA2 */ 0x00A2, /* CENT SIGN */
/* 0xA3 */ 0x00A3, /* POUND SIGN */
/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
/* 0xA5 */ 0x00A5, /* YEN SIGN */
/* 0xA6 */ 0x00A6, /* BROKEN BAR */
/* 0xA7 */ 0x00A7, /* SECTION SIGN */
/* 0xA8 */ 0x00A8, /* DIAERESIS */
/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xAA */ 0x06BE, /* ARABIC LETTER HEH DOACHASHMEE */
/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xAC */ 0x00AC, /* NOT SIGN */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
/* 0xAF */ 0x00AF, /* MACRON */
/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
/* 0xB5 */ 0x00B5, /* MICRO SIGN */
/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
/* 0xB8 */ 0x00B8, /* CEDILLA */
/* 0xB9 */ 0x00B9, /* SUPERSCRIPT ONE */
/* 0xBA */ 0x061B, /* ARABIC SEMICOLON */
/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xBC */ 0x00BC, /* VULGAR FRACTION ONE QUARTER */
/* 0xBD */ 0x00BD, /* VULGAR FRACTION ONE HALF */
/* 0xBE */ 0x00BE, /* VULGAR FRACTION THREE QUARTERS */
/* 0xBF */ 0x061F, /* ARABIC QUESTION MARK */
/* 0xC0 */ 0x06C1, /* ARABIC LETTER HEH GOAL */
/* 0xC1 */ 0x0621, /* ARABIC LETTER HAMZA */
/* 0xC2 */ 0x0622, /* ARABIC LETTER ALEF WITH MADDA ABOVE */
/* 0xC3 */ 0x0623, /* ARABIC LETTER ALEF WITH HAMZA ABOVE */
/* 0xC4 */ 0x0624, /* ARABIC LETTER WAW WITH HAMZA ABOVE */
/* 0xC5 */ 0x0625, /* ARABIC LETTER ALEF WITH HAMZA BELOW */
/* 0xC6 */ 0x0626, /* ARABIC LETTER YEH WITH HAMZA ABOVE */
/* 0xC7 */ 0x0627, /* ARABIC LETTER ALEF */
/* 0xC8 */ 0x0628, /* ARABIC LETTER BEH */
/* 0xC9 */ 0x0629, /* ARABIC LETTER TEH MARBUTA */
/* 0xCA */ 0x062A, /* ARABIC LETTER TEH */
/* 0xCB */ 0x062B, /* ARABIC LETTER THEH */
/* 0xCC */ 0x062C, /* ARABIC LETTER JEEM */
/* 0xCD */ 0x062D, /* ARABIC LETTER HAH */
/* 0xCE */ 0x062E, /* ARABIC LETTER KHAH */
/* 0xCF */ 0x062F, /* ARABIC LETTER DAL */
/* 0xD0 */ 0x0630, /* ARABIC LETTER THAL */
/* 0xD1 */ 0x0631, /* ARABIC LETTER REH */
/* 0xD2 */ 0x0632, /* ARABIC LETTER ZAIN */
/* 0xD3 */ 0x0633, /* ARABIC LETTER SEEN */
/* 0xD4 */ 0x0634, /* ARABIC LETTER SHEEN */
/* 0xD5 */ 0x0635, /* ARABIC LETTER SAD */
/* 0xD6 */ 0x0636, /* ARABIC LETTER DAD */
/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
/* 0xD8 */ 0x0637, /* ARABIC LETTER TAH */
/* 0xD9 */ 0x0638, /* ARABIC LETTER ZAH */
/* 0xDA */ 0x0639, /* ARABIC LETTER AIN */
/* 0xDB */ 0x063A, /* ARABIC LETTER GHAIN */
/* 0xDC */ 0x0640, /* ARABIC TATWEEL */
/* 0xDD */ 0x0641, /* ARABIC LETTER FEH */
/* 0xDE */ 0x0642, /* ARABIC LETTER QAF */
/* 0xDF */ 0x0643, /* ARABIC LETTER KAF */
/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
/* 0xE1 */ 0x0644, /* ARABIC LETTER LAM */
/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0xE3 */ 0x0645, /* ARABIC LETTER MEEM */
/* 0xE4 */ 0x0646, /* ARABIC LETTER NOON */
/* 0xE5 */ 0x0647, /* ARABIC LETTER HEH */
/* 0xE6 */ 0x0648, /* ARABIC LETTER WAW */
/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
/* 0xEC */ 0x0649, /* ARABIC LETTER ALEF MAKSURA */
/* 0xED */ 0x064A, /* ARABIC LETTER YEH */
/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
/* 0xEF */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
/* 0xF0 */ 0x064B, /* ARABIC FATHATAN */
/* 0xF1 */ 0x064C, /* ARABIC DAMMATAN */
/* 0xF2 */ 0x064D, /* ARABIC KASRATAN */
/* 0xF3 */ 0x064E, /* ARABIC FATHA */
/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0xF5 */ 0x064F, /* ARABIC DAMMA */
/* 0xF6 */ 0x0650, /* ARABIC KASRA */
/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
/* 0xF8 */ 0x0651, /* ARABIC SHADDA */
/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
/* 0xFA */ 0x0652, /* ARABIC SUKUN */
/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
/* 0xFD */ 0x200E, /* LEFT-TO-RIGHT MARK */
/* 0xFE */ 0x200F, /* RIGHT-TO-LEFT MARK */
/* 0xFF */ 0x06D2, /* ARABIC LETTER YEH BARREE */
};
/* [Appears to be a VCS blame annotation captured with this listing; not part of the generated C.] Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
 * 2006-09-24 09:55:29 -04:00 */
#define table_cp1256 table_NULL
char *const aliases_cp1256 [] = {
"windows-1256",
"1256",
"cp1256",
"MS-ARAB",
NULL
};
/*** cp1257 ***/

/* Unicode code points for the cp1257 bytes 0x80..0xFF.  The byte itself is
 * not stored: element i describes byte 0x80 + i, as noted in the comment
 * leading each entry, giving 128 elements in total.
 * NOTE(review): entries holding 0xFFFF carry no character name; presumably
 * they mark bytes that have no Unicode mapping -- confirm against the
 * lookup code in charsets.c. */
const uint16_t highhalf_cp1257 [] = {
	/* 0x80 */ 0x20AC, /* EURO SIGN */
	/* 0x81 */ 0xFFFF,
	/* 0x82 */ 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
	/* 0x83 */ 0xFFFF,
	/* 0x84 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
	/* 0x85 */ 0x2026, /* HORIZONTAL ELLIPSIS */
	/* 0x86 */ 0x2020, /* DAGGER */
	/* 0x87 */ 0x2021, /* DOUBLE DAGGER */
	/* 0x88 */ 0xFFFF,
	/* 0x89 */ 0x2030, /* PER MILLE SIGN */
	/* 0x8A */ 0xFFFF,
	/* 0x8B */ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
	/* 0x8C */ 0xFFFF,
	/* 0x8D */ 0x00A8, /* DIAERESIS */
	/* 0x8E */ 0x02C7, /* CARON */
	/* 0x8F */ 0x00B8, /* CEDILLA */
	/* 0x90 */ 0xFFFF,
	/* 0x91 */ 0x2018, /* LEFT SINGLE QUOTATION MARK */
	/* 0x92 */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
	/* 0x93 */ 0x201C, /* LEFT DOUBLE QUOTATION MARK */
	/* 0x94 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
	/* 0x95 */ 0x2022, /* BULLET */
	/* 0x96 */ 0x2013, /* EN DASH */
	/* 0x97 */ 0x2014, /* EM DASH */
	/* 0x98 */ 0xFFFF,
	/* 0x99 */ 0x2122, /* TRADE MARK SIGN */
	/* 0x9A */ 0xFFFF,
	/* 0x9B */ 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
	/* 0x9C */ 0xFFFF,
	/* 0x9D */ 0x00AF, /* MACRON */
	/* 0x9E */ 0x02DB, /* OGONEK */
	/* 0x9F */ 0xFFFF,
	/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xA1 */ 0xFFFF,
	/* 0xA2 */ 0x00A2, /* CENT SIGN */
	/* 0xA3 */ 0x00A3, /* POUND SIGN */
	/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
	/* 0xA5 */ 0xFFFF,
	/* 0xA6 */ 0x00A6, /* BROKEN BAR */
	/* 0xA7 */ 0x00A7, /* SECTION SIGN */
	/* 0xA8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
	/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
	/* 0xAA */ 0x0156, /* LATIN CAPITAL LETTER R WITH CEDILLA */
	/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xAC */ 0x00AC, /* NOT SIGN */
	/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
	/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
	/* 0xAF */ 0x00C6, /* LATIN CAPITAL LETTER AE */
	/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
	/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
	/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
	/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
	/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
	/* 0xB5 */ 0x00B5, /* MICRO SIGN */
	/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
	/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
	/* 0xB8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
	/* 0xB9 */ 0x00B9, /* SUPERSCRIPT ONE */
	/* 0xBA */ 0x0157, /* LATIN SMALL LETTER R WITH CEDILLA */
	/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xBC */ 0x00BC, /* VULGAR FRACTION ONE QUARTER */
	/* 0xBD */ 0x00BD, /* VULGAR FRACTION ONE HALF */
	/* 0xBE */ 0x00BE, /* VULGAR FRACTION THREE QUARTERS */
	/* 0xBF */ 0x00E6, /* LATIN SMALL LETTER AE */
	/* 0xC0 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
	/* 0xC1 */ 0x012E, /* LATIN CAPITAL LETTER I WITH OGONEK */
	/* 0xC2 */ 0x0100, /* LATIN CAPITAL LETTER A WITH MACRON */
	/* 0xC3 */ 0x0106, /* LATIN CAPITAL LETTER C WITH ACUTE */
	/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
	/* 0xC6 */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
	/* 0xC7 */ 0x0112, /* LATIN CAPITAL LETTER E WITH MACRON */
	/* 0xC8 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
	/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0xCA */ 0x0179, /* LATIN CAPITAL LETTER Z WITH ACUTE */
	/* 0xCB */ 0x0116, /* LATIN CAPITAL LETTER E WITH DOT ABOVE */
	/* 0xCC */ 0x0122, /* LATIN CAPITAL LETTER G WITH CEDILLA */
	/* 0xCD */ 0x0136, /* LATIN CAPITAL LETTER K WITH CEDILLA */
	/* 0xCE */ 0x012A, /* LATIN CAPITAL LETTER I WITH MACRON */
	/* 0xCF */ 0x013B, /* LATIN CAPITAL LETTER L WITH CEDILLA */
	/* 0xD0 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
	/* 0xD1 */ 0x0143, /* LATIN CAPITAL LETTER N WITH ACUTE */
	/* 0xD2 */ 0x0145, /* LATIN CAPITAL LETTER N WITH CEDILLA */
	/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
	/* 0xD4 */ 0x014C, /* LATIN CAPITAL LETTER O WITH MACRON */
	/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
	/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
	/* 0xD8 */ 0x0172, /* LATIN CAPITAL LETTER U WITH OGONEK */
	/* 0xD9 */ 0x0141, /* LATIN CAPITAL LETTER L WITH STROKE */
	/* 0xDA */ 0x015A, /* LATIN CAPITAL LETTER S WITH ACUTE */
	/* 0xDB */ 0x016A, /* LATIN CAPITAL LETTER U WITH MACRON */
	/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0xDD */ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */
	/* 0xDE */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
	/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
	/* 0xE0 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
	/* 0xE1 */ 0x012F, /* LATIN SMALL LETTER I WITH OGONEK */
	/* 0xE2 */ 0x0101, /* LATIN SMALL LETTER A WITH MACRON */
	/* 0xE3 */ 0x0107, /* LATIN SMALL LETTER C WITH ACUTE */
	/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
	/* 0xE6 */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
	/* 0xE7 */ 0x0113, /* LATIN SMALL LETTER E WITH MACRON */
	/* 0xE8 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
	/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0xEA */ 0x017A, /* LATIN SMALL LETTER Z WITH ACUTE */
	/* 0xEB */ 0x0117, /* LATIN SMALL LETTER E WITH DOT ABOVE */
	/* 0xEC */ 0x0123, /* LATIN SMALL LETTER G WITH CEDILLA */
	/* 0xED */ 0x0137, /* LATIN SMALL LETTER K WITH CEDILLA */
	/* 0xEE */ 0x012B, /* LATIN SMALL LETTER I WITH MACRON */
	/* 0xEF */ 0x013C, /* LATIN SMALL LETTER L WITH CEDILLA */
	/* 0xF0 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
	/* 0xF1 */ 0x0144, /* LATIN SMALL LETTER N WITH ACUTE */
	/* 0xF2 */ 0x0146, /* LATIN SMALL LETTER N WITH CEDILLA */
	/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
	/* 0xF4 */ 0x014D, /* LATIN SMALL LETTER O WITH MACRON */
	/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
	/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
	/* 0xF8 */ 0x0173, /* LATIN SMALL LETTER U WITH OGONEK */
	/* 0xF9 */ 0x0142, /* LATIN SMALL LETTER L WITH STROKE */
	/* 0xFA */ 0x015B, /* LATIN SMALL LETTER S WITH ACUTE */
	/* 0xFB */ 0x016B, /* LATIN SMALL LETTER U WITH MACRON */
	/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xFD */ 0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */
	/* 0xFE */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
	/* 0xFF */ 0x02D9, /* DOT ABOVE */
};
/* cp1257 defines no table of its own: table_cp1257 is an alias for
 * table_NULL (declared elsewhere).  NOTE(review): presumably this means the
 * codepage needs no struct table_entry mappings beyond its highhalf array --
 * confirm against charsets.c. */
#define table_cp1257 table_NULL

/* NULL-terminated list of name strings for the cp1257 codepage. */
char *const aliases_cp1257 [] = {
	"windows-1257",
	"windows1257",
	"1257",
	"cp1257",
	"WINBALTRIM",
	NULL
};
/*** cp437 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* cp437 high half: element [b - 0x80] holds the Unicode code point for
 * codepage byte b, for b in 0x80..0xFF.  The byte value is not stored; it is
 * implied by the array index and repeated in each trailing comment. */
const uint16_t highhalf_cp437 [] = {
	0x00C7, /* 0x80: LATIN CAPITAL LETTER C WITH CEDILLA */
	0x00FC, /* 0x81: LATIN SMALL LETTER U WITH DIAERESIS */
	0x00E9, /* 0x82: LATIN SMALL LETTER E WITH ACUTE */
	0x00E2, /* 0x83: LATIN SMALL LETTER A WITH CIRCUMFLEX */
	0x00E4, /* 0x84: LATIN SMALL LETTER A WITH DIAERESIS */
	0x00E0, /* 0x85: LATIN SMALL LETTER A WITH GRAVE */
	0x00E5, /* 0x86: LATIN SMALL LETTER A WITH RING ABOVE */
	0x00E7, /* 0x87: LATIN SMALL LETTER C WITH CEDILLA */
	0x00EA, /* 0x88: LATIN SMALL LETTER E WITH CIRCUMFLEX */
	0x00EB, /* 0x89: LATIN SMALL LETTER E WITH DIAERESIS */
	0x00E8, /* 0x8A: LATIN SMALL LETTER E WITH GRAVE */
	0x00EF, /* 0x8B: LATIN SMALL LETTER I WITH DIAERESIS */
	0x00EE, /* 0x8C: LATIN SMALL LETTER I WITH CIRCUMFLEX */
	0x00EC, /* 0x8D: LATIN SMALL LETTER I WITH GRAVE */
	0x00C4, /* 0x8E: LATIN CAPITAL LETTER A WITH DIAERESIS */
	0x00C5, /* 0x8F: LATIN CAPITAL LETTER A WITH RING ABOVE */
	0x00C9, /* 0x90: LATIN CAPITAL LETTER E WITH ACUTE */
	0x00E6, /* 0x91: LATIN SMALL LIGATURE AE */
	0x00C6, /* 0x92: LATIN CAPITAL LIGATURE AE */
	0x00F4, /* 0x93: LATIN SMALL LETTER O WITH CIRCUMFLEX */
	0x00F6, /* 0x94: LATIN SMALL LETTER O WITH DIAERESIS */
	0x00F2, /* 0x95: LATIN SMALL LETTER O WITH GRAVE */
	0x00FB, /* 0x96: LATIN SMALL LETTER U WITH CIRCUMFLEX */
	0x00F9, /* 0x97: LATIN SMALL LETTER U WITH GRAVE */
	0x00FF, /* 0x98: LATIN SMALL LETTER Y WITH DIAERESIS */
	0x00D6, /* 0x99: LATIN CAPITAL LETTER O WITH DIAERESIS */
	0x00DC, /* 0x9A: LATIN CAPITAL LETTER U WITH DIAERESIS */
	0x00A2, /* 0x9B: CENT SIGN */
	0x00A3, /* 0x9C: POUND SIGN */
	0x00A5, /* 0x9D: YEN SIGN */
	0x20A7, /* 0x9E: PESETA SIGN */
	0x0192, /* 0x9F: LATIN SMALL LETTER F WITH HOOK */
	0x00E1, /* 0xA0: LATIN SMALL LETTER A WITH ACUTE */
	0x00ED, /* 0xA1: LATIN SMALL LETTER I WITH ACUTE */
	0x00F3, /* 0xA2: LATIN SMALL LETTER O WITH ACUTE */
	0x00FA, /* 0xA3: LATIN SMALL LETTER U WITH ACUTE */
	0x00F1, /* 0xA4: LATIN SMALL LETTER N WITH TILDE */
	0x00D1, /* 0xA5: LATIN CAPITAL LETTER N WITH TILDE */
	0x00AA, /* 0xA6: FEMININE ORDINAL INDICATOR */
	0x00BA, /* 0xA7: MASCULINE ORDINAL INDICATOR */
	0x00BF, /* 0xA8: INVERTED QUESTION MARK */
	0x2310, /* 0xA9: REVERSED NOT SIGN */
	0x00AC, /* 0xAA: NOT SIGN */
	0x00BD, /* 0xAB: VULGAR FRACTION ONE HALF */
	0x00BC, /* 0xAC: VULGAR FRACTION ONE QUARTER */
	0x00A1, /* 0xAD: INVERTED EXCLAMATION MARK */
	0x00AB, /* 0xAE: LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x00BB, /* 0xAF: RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x2591, /* 0xB0: LIGHT SHADE */
	0x2592, /* 0xB1: MEDIUM SHADE */
	0x2593, /* 0xB2: DARK SHADE */
	0x2502, /* 0xB3: BOX DRAWINGS LIGHT VERTICAL */
	0x2524, /* 0xB4: BOX DRAWINGS LIGHT VERTICAL AND LEFT */
	0x2561, /* 0xB5: BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE */
	0x2562, /* 0xB6: BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE */
	0x2556, /* 0xB7: BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE */
	0x2555, /* 0xB8: BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE */
	0x2563, /* 0xB9: BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
	0x2551, /* 0xBA: BOX DRAWINGS DOUBLE VERTICAL */
	0x2557, /* 0xBB: BOX DRAWINGS DOUBLE DOWN AND LEFT */
	0x255D, /* 0xBC: BOX DRAWINGS DOUBLE UP AND LEFT */
	0x255C, /* 0xBD: BOX DRAWINGS UP DOUBLE AND LEFT SINGLE */
	0x255B, /* 0xBE: BOX DRAWINGS UP SINGLE AND LEFT DOUBLE */
	0x2510, /* 0xBF: BOX DRAWINGS LIGHT DOWN AND LEFT */
	0x2514, /* 0xC0: BOX DRAWINGS LIGHT UP AND RIGHT */
	0x2534, /* 0xC1: BOX DRAWINGS LIGHT UP AND HORIZONTAL */
	0x252C, /* 0xC2: BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
	0x251C, /* 0xC3: BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
	0x2500, /* 0xC4: BOX DRAWINGS LIGHT HORIZONTAL */
	0x253C, /* 0xC5: BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
	0x255E, /* 0xC6: BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE */
	0x255F, /* 0xC7: BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE */
	0x255A, /* 0xC8: BOX DRAWINGS DOUBLE UP AND RIGHT */
	0x2554, /* 0xC9: BOX DRAWINGS DOUBLE DOWN AND RIGHT */
	0x2569, /* 0xCA: BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
	0x2566, /* 0xCB: BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
	0x2560, /* 0xCC: BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
	0x2550, /* 0xCD: BOX DRAWINGS DOUBLE HORIZONTAL */
	0x256C, /* 0xCE: BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
	0x2567, /* 0xCF: BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE */
	0x2568, /* 0xD0: BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE */
	0x2564, /* 0xD1: BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE */
	0x2565, /* 0xD2: BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE */
	0x2559, /* 0xD3: BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE */
	0x2558, /* 0xD4: BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE */
	0x2552, /* 0xD5: BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE */
	0x2553, /* 0xD6: BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE */
	0x256B, /* 0xD7: BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE */
	0x256A, /* 0xD8: BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE */
	0x2518, /* 0xD9: BOX DRAWINGS LIGHT UP AND LEFT */
	0x250C, /* 0xDA: BOX DRAWINGS LIGHT DOWN AND RIGHT */
	0x2588, /* 0xDB: FULL BLOCK */
	0x2584, /* 0xDC: LOWER HALF BLOCK */
	0x258C, /* 0xDD: LEFT HALF BLOCK */
	0x2590, /* 0xDE: RIGHT HALF BLOCK */
	0x2580, /* 0xDF: UPPER HALF BLOCK */
	0x03B1, /* 0xE0: GREEK SMALL LETTER ALPHA */
	0x00DF, /* 0xE1: LATIN SMALL LETTER SHARP S */
	0x0393, /* 0xE2: GREEK CAPITAL LETTER GAMMA */
	0x03C0, /* 0xE3: GREEK SMALL LETTER PI */
	0x03A3, /* 0xE4: GREEK CAPITAL LETTER SIGMA */
	0x03C3, /* 0xE5: GREEK SMALL LETTER SIGMA */
	0x00B5, /* 0xE6: MICRO SIGN */
	0x03C4, /* 0xE7: GREEK SMALL LETTER TAU */
	0x03A6, /* 0xE8: GREEK CAPITAL LETTER PHI */
	0x0398, /* 0xE9: GREEK CAPITAL LETTER THETA */
	0x03A9, /* 0xEA: GREEK CAPITAL LETTER OMEGA */
	0x03B4, /* 0xEB: GREEK SMALL LETTER DELTA */
	0x221E, /* 0xEC: INFINITY */
	0x03C6, /* 0xED: GREEK SMALL LETTER PHI */
	0x03B5, /* 0xEE: GREEK SMALL LETTER EPSILON */
	0x2229, /* 0xEF: INTERSECTION */
	0x2261, /* 0xF0: IDENTICAL TO */
	0x00B1, /* 0xF1: PLUS-MINUS SIGN */
	0x2265, /* 0xF2: GREATER-THAN OR EQUAL TO */
	0x2264, /* 0xF3: LESS-THAN OR EQUAL TO */
	0x2320, /* 0xF4: TOP HALF INTEGRAL */
	0x2321, /* 0xF5: BOTTOM HALF INTEGRAL */
	0x00F7, /* 0xF6: DIVISION SIGN */
	0x2248, /* 0xF7: ALMOST EQUAL TO */
	0x00B0, /* 0xF8: DEGREE SIGN */
	0x2219, /* 0xF9: BULLET OPERATOR */
	0x00B7, /* 0xFA: MIDDLE DOT */
	0x221A, /* 0xFB: SQUARE ROOT */
	0x207F, /* 0xFC: SUPERSCRIPT LATIN SMALL LETTER N */
	0x00B2, /* 0xFD: SUPERSCRIPT TWO */
	0x25A0, /* 0xFE: BLACK SQUARE */
	0x00A0, /* 0xFF: NO-BREAK SPACE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* table_cp437 simply expands to the shared table_NULL.
 * NOTE(review): presumably table_NULL means "no extra struct table_entry
 * mappings for this codepage" -- confirm against its definition. */
#define table_cp437 table_NULL

/* Names under which the cp437 codepage is listed; the list ends with NULL. */
char *const aliases_cp437 [] = {
	"cp437", "437", "IBM437", "csPC8CodePage437",
	NULL
};
/*** cp737 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* cp737 high half: element [b - 0x80] holds the Unicode code point for
 * codepage byte b, for b in 0x80..0xFF.  The byte value is not stored; it is
 * implied by the array index and repeated in each trailing comment. */
const uint16_t highhalf_cp737 [] = {
	0x0391, /* 0x80: GREEK CAPITAL LETTER ALPHA */
	0x0392, /* 0x81: GREEK CAPITAL LETTER BETA */
	0x0393, /* 0x82: GREEK CAPITAL LETTER GAMMA */
	0x0394, /* 0x83: GREEK CAPITAL LETTER DELTA */
	0x0395, /* 0x84: GREEK CAPITAL LETTER EPSILON */
	0x0396, /* 0x85: GREEK CAPITAL LETTER ZETA */
	0x0397, /* 0x86: GREEK CAPITAL LETTER ETA */
	0x0398, /* 0x87: GREEK CAPITAL LETTER THETA */
	0x0399, /* 0x88: GREEK CAPITAL LETTER IOTA */
	0x039A, /* 0x89: GREEK CAPITAL LETTER KAPPA */
	0x039B, /* 0x8A: GREEK CAPITAL LETTER LAMDA */
	0x039C, /* 0x8B: GREEK CAPITAL LETTER MU */
	0x039D, /* 0x8C: GREEK CAPITAL LETTER NU */
	0x039E, /* 0x8D: GREEK CAPITAL LETTER XI */
	0x039F, /* 0x8E: GREEK CAPITAL LETTER OMICRON */
	0x03A0, /* 0x8F: GREEK CAPITAL LETTER PI */
	0x03A1, /* 0x90: GREEK CAPITAL LETTER RHO */
	0x03A3, /* 0x91: GREEK CAPITAL LETTER SIGMA */
	0x03A4, /* 0x92: GREEK CAPITAL LETTER TAU */
	0x03A5, /* 0x93: GREEK CAPITAL LETTER UPSILON */
	0x03A6, /* 0x94: GREEK CAPITAL LETTER PHI */
	0x03A7, /* 0x95: GREEK CAPITAL LETTER CHI */
	0x03A8, /* 0x96: GREEK CAPITAL LETTER PSI */
	0x03A9, /* 0x97: GREEK CAPITAL LETTER OMEGA */
	0x03B1, /* 0x98: GREEK SMALL LETTER ALPHA */
	0x03B2, /* 0x99: GREEK SMALL LETTER BETA */
	0x03B3, /* 0x9A: GREEK SMALL LETTER GAMMA */
	0x03B4, /* 0x9B: GREEK SMALL LETTER DELTA */
	0x03B5, /* 0x9C: GREEK SMALL LETTER EPSILON */
	0x03B6, /* 0x9D: GREEK SMALL LETTER ZETA */
	0x03B7, /* 0x9E: GREEK SMALL LETTER ETA */
	0x03B8, /* 0x9F: GREEK SMALL LETTER THETA */
	0x03B9, /* 0xA0: GREEK SMALL LETTER IOTA */
	0x03BA, /* 0xA1: GREEK SMALL LETTER KAPPA */
	0x03BB, /* 0xA2: GREEK SMALL LETTER LAMDA */
	0x03BC, /* 0xA3: GREEK SMALL LETTER MU */
	0x03BD, /* 0xA4: GREEK SMALL LETTER NU */
	0x03BE, /* 0xA5: GREEK SMALL LETTER XI */
	0x03BF, /* 0xA6: GREEK SMALL LETTER OMICRON */
	0x03C0, /* 0xA7: GREEK SMALL LETTER PI */
	0x03C1, /* 0xA8: GREEK SMALL LETTER RHO */
	0x03C3, /* 0xA9: GREEK SMALL LETTER SIGMA */
	0x03C2, /* 0xAA: GREEK SMALL LETTER FINAL SIGMA */
	0x03C4, /* 0xAB: GREEK SMALL LETTER TAU */
	0x03C5, /* 0xAC: GREEK SMALL LETTER UPSILON */
	0x03C6, /* 0xAD: GREEK SMALL LETTER PHI */
	0x03C7, /* 0xAE: GREEK SMALL LETTER CHI */
	0x03C8, /* 0xAF: GREEK SMALL LETTER PSI */
	0x2591, /* 0xB0: LIGHT SHADE */
	0x2592, /* 0xB1: MEDIUM SHADE */
	0x2593, /* 0xB2: DARK SHADE */
	0x2502, /* 0xB3: BOX DRAWINGS LIGHT VERTICAL */
	0x2524, /* 0xB4: BOX DRAWINGS LIGHT VERTICAL AND LEFT */
	0x2561, /* 0xB5: BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE */
	0x2562, /* 0xB6: BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE */
	0x2556, /* 0xB7: BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE */
	0x2555, /* 0xB8: BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE */
	0x2563, /* 0xB9: BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
	0x2551, /* 0xBA: BOX DRAWINGS DOUBLE VERTICAL */
	0x2557, /* 0xBB: BOX DRAWINGS DOUBLE DOWN AND LEFT */
	0x255D, /* 0xBC: BOX DRAWINGS DOUBLE UP AND LEFT */
	0x255C, /* 0xBD: BOX DRAWINGS UP DOUBLE AND LEFT SINGLE */
	0x255B, /* 0xBE: BOX DRAWINGS UP SINGLE AND LEFT DOUBLE */
	0x2510, /* 0xBF: BOX DRAWINGS LIGHT DOWN AND LEFT */
	0x2514, /* 0xC0: BOX DRAWINGS LIGHT UP AND RIGHT */
	0x2534, /* 0xC1: BOX DRAWINGS LIGHT UP AND HORIZONTAL */
	0x252C, /* 0xC2: BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
	0x251C, /* 0xC3: BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
	0x2500, /* 0xC4: BOX DRAWINGS LIGHT HORIZONTAL */
	0x253C, /* 0xC5: BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
	0x255E, /* 0xC6: BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE */
	0x255F, /* 0xC7: BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE */
	0x255A, /* 0xC8: BOX DRAWINGS DOUBLE UP AND RIGHT */
	0x2554, /* 0xC9: BOX DRAWINGS DOUBLE DOWN AND RIGHT */
	0x2569, /* 0xCA: BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
	0x2566, /* 0xCB: BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
	0x2560, /* 0xCC: BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
	0x2550, /* 0xCD: BOX DRAWINGS DOUBLE HORIZONTAL */
	0x256C, /* 0xCE: BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
	0x2567, /* 0xCF: BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE */
	0x2568, /* 0xD0: BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE */
	0x2564, /* 0xD1: BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE */
	0x2565, /* 0xD2: BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE */
	0x2559, /* 0xD3: BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE */
	0x2558, /* 0xD4: BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE */
	0x2552, /* 0xD5: BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE */
	0x2553, /* 0xD6: BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE */
	0x256B, /* 0xD7: BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE */
	0x256A, /* 0xD8: BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE */
	0x2518, /* 0xD9: BOX DRAWINGS LIGHT UP AND LEFT */
	0x250C, /* 0xDA: BOX DRAWINGS LIGHT DOWN AND RIGHT */
	0x2588, /* 0xDB: FULL BLOCK */
	0x2584, /* 0xDC: LOWER HALF BLOCK */
	0x258C, /* 0xDD: LEFT HALF BLOCK */
	0x2590, /* 0xDE: RIGHT HALF BLOCK */
	0x2580, /* 0xDF: UPPER HALF BLOCK */
	0x03C9, /* 0xE0: GREEK SMALL LETTER OMEGA */
	0x03AC, /* 0xE1: GREEK SMALL LETTER ALPHA WITH TONOS */
	0x03AD, /* 0xE2: GREEK SMALL LETTER EPSILON WITH TONOS */
	0x03AE, /* 0xE3: GREEK SMALL LETTER ETA WITH TONOS */
	0x03CA, /* 0xE4: GREEK SMALL LETTER IOTA WITH DIALYTIKA */
	0x03AF, /* 0xE5: GREEK SMALL LETTER IOTA WITH TONOS */
	0x03CC, /* 0xE6: GREEK SMALL LETTER OMICRON WITH TONOS */
	0x03CD, /* 0xE7: GREEK SMALL LETTER UPSILON WITH TONOS */
	0x03CB, /* 0xE8: GREEK SMALL LETTER UPSILON WITH DIALYTIKA */
	0x03CE, /* 0xE9: GREEK SMALL LETTER OMEGA WITH TONOS */
	0x0386, /* 0xEA: GREEK CAPITAL LETTER ALPHA WITH TONOS */
	0x0388, /* 0xEB: GREEK CAPITAL LETTER EPSILON WITH TONOS */
	0x0389, /* 0xEC: GREEK CAPITAL LETTER ETA WITH TONOS */
	0x038A, /* 0xED: GREEK CAPITAL LETTER IOTA WITH TONOS */
	0x038C, /* 0xEE: GREEK CAPITAL LETTER OMICRON WITH TONOS */
	0x038E, /* 0xEF: GREEK CAPITAL LETTER UPSILON WITH TONOS */
	0x038F, /* 0xF0: GREEK CAPITAL LETTER OMEGA WITH TONOS */
	0x00B1, /* 0xF1: PLUS-MINUS SIGN */
	0x2265, /* 0xF2: GREATER-THAN OR EQUAL TO */
	0x2264, /* 0xF3: LESS-THAN OR EQUAL TO */
	0x03AA, /* 0xF4: GREEK CAPITAL LETTER IOTA WITH DIALYTIKA */
	0x03AB, /* 0xF5: GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA */
	0x00F7, /* 0xF6: DIVISION SIGN */
	0x2248, /* 0xF7: ALMOST EQUAL TO */
	0x00B0, /* 0xF8: DEGREE SIGN */
	0x2219, /* 0xF9: BULLET OPERATOR */
	0x00B7, /* 0xFA: MIDDLE DOT */
	0x221A, /* 0xFB: SQUARE ROOT */
	0x207F, /* 0xFC: SUPERSCRIPT LATIN SMALL LETTER N */
	0x00B2, /* 0xFD: SUPERSCRIPT TWO */
	0x25A0, /* 0xFE: BLACK SQUARE */
	0x00A0, /* 0xFF: NO-BREAK SPACE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* table_cp737 simply expands to the shared table_NULL.
 * NOTE(review): presumably table_NULL means "no extra struct table_entry
 * mappings for this codepage" -- confirm against its definition. */
#define table_cp737 table_NULL

/* Names under which the cp737 codepage is listed; the list ends with NULL. */
char *const aliases_cp737 [] = {
	"cp737", "737",
	NULL
};
/*** cp850 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* cp850 high half: element [b - 0x80] holds the Unicode code point for
 * codepage byte b, for b in 0x80..0xFF.  The byte value is not stored; it is
 * implied by the array index and repeated in each trailing comment. */
const uint16_t highhalf_cp850 [] = {
	0x00C7, /* 0x80: LATIN CAPITAL LETTER C WITH CEDILLA */
	0x00FC, /* 0x81: LATIN SMALL LETTER U WITH DIAERESIS */
	0x00E9, /* 0x82: LATIN SMALL LETTER E WITH ACUTE */
	0x00E2, /* 0x83: LATIN SMALL LETTER A WITH CIRCUMFLEX */
	0x00E4, /* 0x84: LATIN SMALL LETTER A WITH DIAERESIS */
	0x00E0, /* 0x85: LATIN SMALL LETTER A WITH GRAVE */
	0x00E5, /* 0x86: LATIN SMALL LETTER A WITH RING ABOVE */
	0x00E7, /* 0x87: LATIN SMALL LETTER C WITH CEDILLA */
	0x00EA, /* 0x88: LATIN SMALL LETTER E WITH CIRCUMFLEX */
	0x00EB, /* 0x89: LATIN SMALL LETTER E WITH DIAERESIS */
	0x00E8, /* 0x8A: LATIN SMALL LETTER E WITH GRAVE */
	0x00EF, /* 0x8B: LATIN SMALL LETTER I WITH DIAERESIS */
	0x00EE, /* 0x8C: LATIN SMALL LETTER I WITH CIRCUMFLEX */
	0x00EC, /* 0x8D: LATIN SMALL LETTER I WITH GRAVE */
	0x00C4, /* 0x8E: LATIN CAPITAL LETTER A WITH DIAERESIS */
	0x00C5, /* 0x8F: LATIN CAPITAL LETTER A WITH RING ABOVE */
	0x00C9, /* 0x90: LATIN CAPITAL LETTER E WITH ACUTE */
	0x00E6, /* 0x91: LATIN SMALL LIGATURE AE */
	0x00C6, /* 0x92: LATIN CAPITAL LIGATURE AE */
	0x00F4, /* 0x93: LATIN SMALL LETTER O WITH CIRCUMFLEX */
	0x00F6, /* 0x94: LATIN SMALL LETTER O WITH DIAERESIS */
	0x00F2, /* 0x95: LATIN SMALL LETTER O WITH GRAVE */
	0x00FB, /* 0x96: LATIN SMALL LETTER U WITH CIRCUMFLEX */
	0x00F9, /* 0x97: LATIN SMALL LETTER U WITH GRAVE */
	0x00FF, /* 0x98: LATIN SMALL LETTER Y WITH DIAERESIS */
	0x00D6, /* 0x99: LATIN CAPITAL LETTER O WITH DIAERESIS */
	0x00DC, /* 0x9A: LATIN CAPITAL LETTER U WITH DIAERESIS */
	0x00F8, /* 0x9B: LATIN SMALL LETTER O WITH STROKE */
	0x00A3, /* 0x9C: POUND SIGN */
	0x00D8, /* 0x9D: LATIN CAPITAL LETTER O WITH STROKE */
	0x00D7, /* 0x9E: MULTIPLICATION SIGN */
	0x0192, /* 0x9F: LATIN SMALL LETTER F WITH HOOK */
	0x00E1, /* 0xA0: LATIN SMALL LETTER A WITH ACUTE */
	0x00ED, /* 0xA1: LATIN SMALL LETTER I WITH ACUTE */
	0x00F3, /* 0xA2: LATIN SMALL LETTER O WITH ACUTE */
	0x00FA, /* 0xA3: LATIN SMALL LETTER U WITH ACUTE */
	0x00F1, /* 0xA4: LATIN SMALL LETTER N WITH TILDE */
	0x00D1, /* 0xA5: LATIN CAPITAL LETTER N WITH TILDE */
	0x00AA, /* 0xA6: FEMININE ORDINAL INDICATOR */
	0x00BA, /* 0xA7: MASCULINE ORDINAL INDICATOR */
	0x00BF, /* 0xA8: INVERTED QUESTION MARK */
	0x00AE, /* 0xA9: REGISTERED SIGN */
	0x00AC, /* 0xAA: NOT SIGN */
	0x00BD, /* 0xAB: VULGAR FRACTION ONE HALF */
	0x00BC, /* 0xAC: VULGAR FRACTION ONE QUARTER */
	0x00A1, /* 0xAD: INVERTED EXCLAMATION MARK */
	0x00AB, /* 0xAE: LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x00BB, /* 0xAF: RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x2591, /* 0xB0: LIGHT SHADE */
	0x2592, /* 0xB1: MEDIUM SHADE */
	0x2593, /* 0xB2: DARK SHADE */
	0x2502, /* 0xB3: BOX DRAWINGS LIGHT VERTICAL */
	0x2524, /* 0xB4: BOX DRAWINGS LIGHT VERTICAL AND LEFT */
	0x00C1, /* 0xB5: LATIN CAPITAL LETTER A WITH ACUTE */
	0x00C2, /* 0xB6: LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	0x00C0, /* 0xB7: LATIN CAPITAL LETTER A WITH GRAVE */
	0x00A9, /* 0xB8: COPYRIGHT SIGN */
	0x2563, /* 0xB9: BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
	0x2551, /* 0xBA: BOX DRAWINGS DOUBLE VERTICAL */
	0x2557, /* 0xBB: BOX DRAWINGS DOUBLE DOWN AND LEFT */
	0x255D, /* 0xBC: BOX DRAWINGS DOUBLE UP AND LEFT */
	0x00A2, /* 0xBD: CENT SIGN */
	0x00A5, /* 0xBE: YEN SIGN */
	0x2510, /* 0xBF: BOX DRAWINGS LIGHT DOWN AND LEFT */
	0x2514, /* 0xC0: BOX DRAWINGS LIGHT UP AND RIGHT */
	0x2534, /* 0xC1: BOX DRAWINGS LIGHT UP AND HORIZONTAL */
	0x252C, /* 0xC2: BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
	0x251C, /* 0xC3: BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
	0x2500, /* 0xC4: BOX DRAWINGS LIGHT HORIZONTAL */
	0x253C, /* 0xC5: BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
	0x00E3, /* 0xC6: LATIN SMALL LETTER A WITH TILDE */
	0x00C3, /* 0xC7: LATIN CAPITAL LETTER A WITH TILDE */
	0x255A, /* 0xC8: BOX DRAWINGS DOUBLE UP AND RIGHT */
	0x2554, /* 0xC9: BOX DRAWINGS DOUBLE DOWN AND RIGHT */
	0x2569, /* 0xCA: BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
	0x2566, /* 0xCB: BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
	0x2560, /* 0xCC: BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
	0x2550, /* 0xCD: BOX DRAWINGS DOUBLE HORIZONTAL */
	0x256C, /* 0xCE: BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
	0x00A4, /* 0xCF: CURRENCY SIGN */
	0x00F0, /* 0xD0: LATIN SMALL LETTER ETH */
	0x00D0, /* 0xD1: LATIN CAPITAL LETTER ETH */
	0x00CA, /* 0xD2: LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
	0x00CB, /* 0xD3: LATIN CAPITAL LETTER E WITH DIAERESIS */
	0x00C8, /* 0xD4: LATIN CAPITAL LETTER E WITH GRAVE */
	0x0131, /* 0xD5: LATIN SMALL LETTER DOTLESS I */
	0x00CD, /* 0xD6: LATIN CAPITAL LETTER I WITH ACUTE */
	0x00CE, /* 0xD7: LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	0x00CF, /* 0xD8: LATIN CAPITAL LETTER I WITH DIAERESIS */
	0x2518, /* 0xD9: BOX DRAWINGS LIGHT UP AND LEFT */
	0x250C, /* 0xDA: BOX DRAWINGS LIGHT DOWN AND RIGHT */
	0x2588, /* 0xDB: FULL BLOCK */
	0x2584, /* 0xDC: LOWER HALF BLOCK */
	0x00A6, /* 0xDD: BROKEN BAR */
	0x00CC, /* 0xDE: LATIN CAPITAL LETTER I WITH GRAVE */
	0x2580, /* 0xDF: UPPER HALF BLOCK */
	0x00D3, /* 0xE0: LATIN CAPITAL LETTER O WITH ACUTE */
	0x00DF, /* 0xE1: LATIN SMALL LETTER SHARP S */
	0x00D4, /* 0xE2: LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	0x00D2, /* 0xE3: LATIN CAPITAL LETTER O WITH GRAVE */
	0x00F5, /* 0xE4: LATIN SMALL LETTER O WITH TILDE */
	0x00D5, /* 0xE5: LATIN CAPITAL LETTER O WITH TILDE */
	0x00B5, /* 0xE6: MICRO SIGN */
	0x00FE, /* 0xE7: LATIN SMALL LETTER THORN */
	0x00DE, /* 0xE8: LATIN CAPITAL LETTER THORN */
	0x00DA, /* 0xE9: LATIN CAPITAL LETTER U WITH ACUTE */
	0x00DB, /* 0xEA: LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	0x00D9, /* 0xEB: LATIN CAPITAL LETTER U WITH GRAVE */
	0x00FD, /* 0xEC: LATIN SMALL LETTER Y WITH ACUTE */
	0x00DD, /* 0xED: LATIN CAPITAL LETTER Y WITH ACUTE */
	0x00AF, /* 0xEE: MACRON */
	0x00B4, /* 0xEF: ACUTE ACCENT */
	0x00AD, /* 0xF0: SOFT HYPHEN */
	0x00B1, /* 0xF1: PLUS-MINUS SIGN */
	0x2017, /* 0xF2: DOUBLE LOW LINE */
	0x00BE, /* 0xF3: VULGAR FRACTION THREE QUARTERS */
	0x00B6, /* 0xF4: PILCROW SIGN */
	0x00A7, /* 0xF5: SECTION SIGN */
	0x00F7, /* 0xF6: DIVISION SIGN */
	0x00B8, /* 0xF7: CEDILLA */
	0x00B0, /* 0xF8: DEGREE SIGN */
	0x00A8, /* 0xF9: DIAERESIS */
	0x00B7, /* 0xFA: MIDDLE DOT */
	0x00B9, /* 0xFB: SUPERSCRIPT ONE */
	0x00B3, /* 0xFC: SUPERSCRIPT THREE */
	0x00B2, /* 0xFD: SUPERSCRIPT TWO */
	0x25A0, /* 0xFE: BLACK SQUARE */
	0x00A0, /* 0xFF: NO-BREAK SPACE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* table_cp850 simply expands to the shared table_NULL.
 * NOTE(review): presumably table_NULL means "no extra struct table_entry
 * mappings for this codepage" -- confirm against its definition. */
#define table_cp850 table_NULL

/* Names under which the cp850 codepage is listed; the list ends with NULL. */
char *const aliases_cp850 [] = {
	"cp850", "850", "IBM850", "csPC850Multilingual",
	NULL
};
/*** cp852 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the upper half of cp852.  The byte value itself is
 * not stored: entry i holds the character for byte 0x80 + i, as the index
 * comment on each line shows, so the 128 entries cover 0x80 through 0xFF.
 * This file is generated by gen-cp; change the generator input rather than
 * editing the values here. */
const uint16_t highhalf_cp852 [] = {
/* 0x80 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
/* 0x81 */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
/* 0x82 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0x83 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0x84 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
/* 0x85 */ 0x016F, /* LATIN SMALL LETTER U WITH RING ABOVE */
/* 0x86 */ 0x0107, /* LATIN SMALL LETTER C WITH ACUTE */
/* 0x87 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
/* 0x88 */ 0x0142, /* LATIN SMALL LETTER L WITH STROKE */
/* 0x89 */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
/* 0x8A */ 0x0150, /* LATIN CAPITAL LETTER O WITH DOUBLE ACUTE */
/* 0x8B */ 0x0151, /* LATIN SMALL LETTER O WITH DOUBLE ACUTE */
/* 0x8C */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
/* 0x8D */ 0x0179, /* LATIN CAPITAL LETTER Z WITH ACUTE */
/* 0x8E */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
/* 0x8F */ 0x0106, /* LATIN CAPITAL LETTER C WITH ACUTE */
/* 0x90 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0x91 */ 0x0139, /* LATIN CAPITAL LETTER L WITH ACUTE */
/* 0x92 */ 0x013A, /* LATIN SMALL LETTER L WITH ACUTE */
/* 0x93 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0x94 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
/* 0x95 */ 0x013D, /* LATIN CAPITAL LETTER L WITH CARON */
/* 0x96 */ 0x013E, /* LATIN SMALL LETTER L WITH CARON */
/* 0x97 */ 0x015A, /* LATIN CAPITAL LETTER S WITH ACUTE */
/* 0x98 */ 0x015B, /* LATIN SMALL LETTER S WITH ACUTE */
/* 0x99 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
/* 0x9A */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
/* 0x9B */ 0x0164, /* LATIN CAPITAL LETTER T WITH CARON */
/* 0x9C */ 0x0165, /* LATIN SMALL LETTER T WITH CARON */
/* 0x9D */ 0x0141, /* LATIN CAPITAL LETTER L WITH STROKE */
/* 0x9E */ 0x00D7, /* MULTIPLICATION SIGN */
/* 0x9F */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
/* 0xA0 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
/* 0xA1 */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
/* 0xA2 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xA3 */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
/* 0xA4 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
/* 0xA5 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
/* 0xA6 */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
/* 0xA7 */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
/* 0xA8 */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
/* 0xA9 */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
/* 0xAA */ 0x00AC, /* NOT SIGN */
/* 0xAB */ 0x017A, /* LATIN SMALL LETTER Z WITH ACUTE */
/* 0xAC */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
/* 0xAD */ 0x015F, /* LATIN SMALL LETTER S WITH CEDILLA */
/* 0xAE */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xAF */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
/* 0xB0 */ 0x2591, /* LIGHT SHADE */
/* 0xB1 */ 0x2592, /* MEDIUM SHADE */
/* 0xB2 */ 0x2593, /* DARK SHADE */
/* 0xB3 */ 0x2502, /* BOX DRAWINGS LIGHT VERTICAL */
/* 0xB4 */ 0x2524, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
/* 0xB5 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
/* 0xB6 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
/* 0xB7 */ 0x011A, /* LATIN CAPITAL LETTER E WITH CARON */
/* 0xB8 */ 0x015E, /* LATIN CAPITAL LETTER S WITH CEDILLA */
/* 0xB9 */ 0x2563, /* BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
/* 0xBA */ 0x2551, /* BOX DRAWINGS DOUBLE VERTICAL */
/* 0xBB */ 0x2557, /* BOX DRAWINGS DOUBLE DOWN AND LEFT */
/* 0xBC */ 0x255D, /* BOX DRAWINGS DOUBLE UP AND LEFT */
/* 0xBD */ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */
/* 0xBE */ 0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */
/* 0xBF */ 0x2510, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
/* 0xC0 */ 0x2514, /* BOX DRAWINGS LIGHT UP AND RIGHT */
/* 0xC1 */ 0x2534, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
/* 0xC2 */ 0x252C, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
/* 0xC3 */ 0x251C, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
/* 0xC4 */ 0x2500, /* BOX DRAWINGS LIGHT HORIZONTAL */
/* 0xC5 */ 0x253C, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
/* 0xC6 */ 0x0102, /* LATIN CAPITAL LETTER A WITH BREVE */
/* 0xC7 */ 0x0103, /* LATIN SMALL LETTER A WITH BREVE */
/* 0xC8 */ 0x255A, /* BOX DRAWINGS DOUBLE UP AND RIGHT */
/* 0xC9 */ 0x2554, /* BOX DRAWINGS DOUBLE DOWN AND RIGHT */
/* 0xCA */ 0x2569, /* BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
/* 0xCB */ 0x2566, /* BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
/* 0xCC */ 0x2560, /* BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
/* 0xCD */ 0x2550, /* BOX DRAWINGS DOUBLE HORIZONTAL */
/* 0xCE */ 0x256C, /* BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
/* 0xCF */ 0x00A4, /* CURRENCY SIGN */
/* 0xD0 */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
/* 0xD1 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
/* 0xD2 */ 0x010E, /* LATIN CAPITAL LETTER D WITH CARON */
/* 0xD3 */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
/* 0xD4 */ 0x010F, /* LATIN SMALL LETTER D WITH CARON */
/* 0xD5 */ 0x0147, /* LATIN CAPITAL LETTER N WITH CARON */
/* 0xD6 */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
/* 0xD7 */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
/* 0xD8 */ 0x011B, /* LATIN SMALL LETTER E WITH CARON */
/* 0xD9 */ 0x2518, /* BOX DRAWINGS LIGHT UP AND LEFT */
/* 0xDA */ 0x250C, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
/* 0xDB */ 0x2588, /* FULL BLOCK */
/* 0xDC */ 0x2584, /* LOWER HALF BLOCK */
/* 0xDD */ 0x0162, /* LATIN CAPITAL LETTER T WITH CEDILLA */
/* 0xDE */ 0x016E, /* LATIN CAPITAL LETTER U WITH RING ABOVE */
/* 0xDF */ 0x2580, /* UPPER HALF BLOCK */
/* 0xE0 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0xE1 */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
/* 0xE2 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
/* 0xE3 */ 0x0143, /* LATIN CAPITAL LETTER N WITH ACUTE */
/* 0xE4 */ 0x0144, /* LATIN SMALL LETTER N WITH ACUTE */
/* 0xE5 */ 0x0148, /* LATIN SMALL LETTER N WITH CARON */
/* 0xE6 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
/* 0xE7 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
/* 0xE8 */ 0x0154, /* LATIN CAPITAL LETTER R WITH ACUTE */
/* 0xE9 */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
/* 0xEA */ 0x0155, /* LATIN SMALL LETTER R WITH ACUTE */
/* 0xEB */ 0x0170, /* LATIN CAPITAL LETTER U WITH DOUBLE ACUTE */
/* 0xEC */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
/* 0xED */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
/* 0xEE */ 0x0163, /* LATIN SMALL LETTER T WITH CEDILLA */
/* 0xEF */ 0x00B4, /* ACUTE ACCENT */
/* 0xF0 */ 0x00AD, /* SOFT HYPHEN */
/* 0xF1 */ 0x02DD, /* DOUBLE ACUTE ACCENT */
/* 0xF2 */ 0x02DB, /* OGONEK */
/* 0xF3 */ 0x02C7, /* CARON */
/* 0xF4 */ 0x02D8, /* BREVE */
/* 0xF5 */ 0x00A7, /* SECTION SIGN */
/* 0xF6 */ 0x00F7, /* DIVISION SIGN */
/* 0xF7 */ 0x00B8, /* CEDILLA */
/* 0xF8 */ 0x00B0, /* DEGREE SIGN */
/* 0xF9 */ 0x00A8, /* DIAERESIS */
/* 0xFA */ 0x02D9, /* DOT ABOVE */
/* 0xFB */ 0x0171, /* LATIN SMALL LETTER U WITH DOUBLE ACUTE */
/* 0xFC */ 0x0158, /* LATIN CAPITAL LETTER R WITH CARON */
/* 0xFD */ 0x0159, /* LATIN SMALL LETTER R WITH CARON */
/* 0xFE */ 0x25A0, /* BLACK SQUARE */
/* 0xFF */ 0x00A0, /* NO-BREAK SPACE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* cp852 uses table_NULL as its table_entry list.
 * NOTE(review): presumably this means the codepage has no mappings outside
 * its high-half array -- confirm against the definition of table_NULL. */
#define table_cp852 table_NULL

/* Alias names for cp852, terminated by a NULL pointer.
 *
 * "IBM852" is the primary name of this charset in the IANA registry and was
 * missing here, although the sibling lists carry "IBM850" and "IBM866".  It
 * is inserted after the numeric alias, mirroring those lists, so the first
 * entry is unchanged.
 * NOTE(review): this file is generated; the same name should also be added
 * to the generator input under Unicode/ so it survives regeneration. */
char *const aliases_cp852 [] = {
	"cp852",
	"852",
	"IBM852",
	"csPCp852",
	NULL
};
/*** cp866 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the upper half of cp866.  The byte value itself is
 * not stored: entry i holds the character for byte 0x80 + i, as the index
 * comment on each line shows, so the 128 entries cover 0x80 through 0xFF.
 * This file is generated by gen-cp; change the generator input rather than
 * editing the values here. */
const uint16_t highhalf_cp866 [] = {
/* 0x80 */ 0x0410, /* CYRILLIC CAPITAL LETTER A */
/* 0x81 */ 0x0411, /* CYRILLIC CAPITAL LETTER BE */
/* 0x82 */ 0x0412, /* CYRILLIC CAPITAL LETTER VE */
/* 0x83 */ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */
/* 0x84 */ 0x0414, /* CYRILLIC CAPITAL LETTER DE */
/* 0x85 */ 0x0415, /* CYRILLIC CAPITAL LETTER IE */
/* 0x86 */ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */
/* 0x87 */ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */
/* 0x88 */ 0x0418, /* CYRILLIC CAPITAL LETTER I */
/* 0x89 */ 0x0419, /* CYRILLIC CAPITAL LETTER SHORT I */
/* 0x8A */ 0x041A, /* CYRILLIC CAPITAL LETTER KA */
/* 0x8B */ 0x041B, /* CYRILLIC CAPITAL LETTER EL */
/* 0x8C */ 0x041C, /* CYRILLIC CAPITAL LETTER EM */
/* 0x8D */ 0x041D, /* CYRILLIC CAPITAL LETTER EN */
/* 0x8E */ 0x041E, /* CYRILLIC CAPITAL LETTER O */
/* 0x8F */ 0x041F, /* CYRILLIC CAPITAL LETTER PE */
/* 0x90 */ 0x0420, /* CYRILLIC CAPITAL LETTER ER */
/* 0x91 */ 0x0421, /* CYRILLIC CAPITAL LETTER ES */
/* 0x92 */ 0x0422, /* CYRILLIC CAPITAL LETTER TE */
/* 0x93 */ 0x0423, /* CYRILLIC CAPITAL LETTER U */
/* 0x94 */ 0x0424, /* CYRILLIC CAPITAL LETTER EF */
/* 0x95 */ 0x0425, /* CYRILLIC CAPITAL LETTER HA */
/* 0x96 */ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */
/* 0x97 */ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */
/* 0x98 */ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */
/* 0x99 */ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */
/* 0x9A */ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */
/* 0x9B */ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */
/* 0x9C */ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */
/* 0x9D */ 0x042D, /* CYRILLIC CAPITAL LETTER E */
/* 0x9E */ 0x042E, /* CYRILLIC CAPITAL LETTER YU */
/* 0x9F */ 0x042F, /* CYRILLIC CAPITAL LETTER YA */
/* 0xA0 */ 0x0430, /* CYRILLIC SMALL LETTER A */
/* 0xA1 */ 0x0431, /* CYRILLIC SMALL LETTER BE */
/* 0xA2 */ 0x0432, /* CYRILLIC SMALL LETTER VE */
/* 0xA3 */ 0x0433, /* CYRILLIC SMALL LETTER GHE */
/* 0xA4 */ 0x0434, /* CYRILLIC SMALL LETTER DE */
/* 0xA5 */ 0x0435, /* CYRILLIC SMALL LETTER IE */
/* 0xA6 */ 0x0436, /* CYRILLIC SMALL LETTER ZHE */
/* 0xA7 */ 0x0437, /* CYRILLIC SMALL LETTER ZE */
/* 0xA8 */ 0x0438, /* CYRILLIC SMALL LETTER I */
/* 0xA9 */ 0x0439, /* CYRILLIC SMALL LETTER SHORT I */
/* 0xAA */ 0x043A, /* CYRILLIC SMALL LETTER KA */
/* 0xAB */ 0x043B, /* CYRILLIC SMALL LETTER EL */
/* 0xAC */ 0x043C, /* CYRILLIC SMALL LETTER EM */
/* 0xAD */ 0x043D, /* CYRILLIC SMALL LETTER EN */
/* 0xAE */ 0x043E, /* CYRILLIC SMALL LETTER O */
/* 0xAF */ 0x043F, /* CYRILLIC SMALL LETTER PE */
/* 0xB0 */ 0x2591, /* LIGHT SHADE */
/* 0xB1 */ 0x2592, /* MEDIUM SHADE */
/* 0xB2 */ 0x2593, /* DARK SHADE */
/* 0xB3 */ 0x2502, /* BOX DRAWINGS LIGHT VERTICAL */
/* 0xB4 */ 0x2524, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
/* 0xB5 */ 0x2561, /* BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE */
/* 0xB6 */ 0x2562, /* BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE */
/* 0xB7 */ 0x2556, /* BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE */
/* 0xB8 */ 0x2555, /* BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE */
/* 0xB9 */ 0x2563, /* BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
/* 0xBA */ 0x2551, /* BOX DRAWINGS DOUBLE VERTICAL */
/* 0xBB */ 0x2557, /* BOX DRAWINGS DOUBLE DOWN AND LEFT */
/* 0xBC */ 0x255D, /* BOX DRAWINGS DOUBLE UP AND LEFT */
/* 0xBD */ 0x255C, /* BOX DRAWINGS UP DOUBLE AND LEFT SINGLE */
/* 0xBE */ 0x255B, /* BOX DRAWINGS UP SINGLE AND LEFT DOUBLE */
/* 0xBF */ 0x2510, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
/* 0xC0 */ 0x2514, /* BOX DRAWINGS LIGHT UP AND RIGHT */
/* 0xC1 */ 0x2534, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
/* 0xC2 */ 0x252C, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
/* 0xC3 */ 0x251C, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
/* 0xC4 */ 0x2500, /* BOX DRAWINGS LIGHT HORIZONTAL */
/* 0xC5 */ 0x253C, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
/* 0xC6 */ 0x255E, /* BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE */
/* 0xC7 */ 0x255F, /* BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE */
/* 0xC8 */ 0x255A, /* BOX DRAWINGS DOUBLE UP AND RIGHT */
/* 0xC9 */ 0x2554, /* BOX DRAWINGS DOUBLE DOWN AND RIGHT */
/* 0xCA */ 0x2569, /* BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
/* 0xCB */ 0x2566, /* BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
/* 0xCC */ 0x2560, /* BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
/* 0xCD */ 0x2550, /* BOX DRAWINGS DOUBLE HORIZONTAL */
/* 0xCE */ 0x256C, /* BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
/* 0xCF */ 0x2567, /* BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE */
/* 0xD0 */ 0x2568, /* BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE */
/* 0xD1 */ 0x2564, /* BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE */
/* 0xD2 */ 0x2565, /* BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE */
/* 0xD3 */ 0x2559, /* BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE */
/* 0xD4 */ 0x2558, /* BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE */
/* 0xD5 */ 0x2552, /* BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE */
/* 0xD6 */ 0x2553, /* BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE */
/* 0xD7 */ 0x256B, /* BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE */
/* 0xD8 */ 0x256A, /* BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE */
/* 0xD9 */ 0x2518, /* BOX DRAWINGS LIGHT UP AND LEFT */
/* 0xDA */ 0x250C, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
/* 0xDB */ 0x2588, /* FULL BLOCK */
/* 0xDC */ 0x2584, /* LOWER HALF BLOCK */
/* 0xDD */ 0x258C, /* LEFT HALF BLOCK */
/* 0xDE */ 0x2590, /* RIGHT HALF BLOCK */
/* 0xDF */ 0x2580, /* UPPER HALF BLOCK */
/* 0xE0 */ 0x0440, /* CYRILLIC SMALL LETTER ER */
/* 0xE1 */ 0x0441, /* CYRILLIC SMALL LETTER ES */
/* 0xE2 */ 0x0442, /* CYRILLIC SMALL LETTER TE */
/* 0xE3 */ 0x0443, /* CYRILLIC SMALL LETTER U */
/* 0xE4 */ 0x0444, /* CYRILLIC SMALL LETTER EF */
/* 0xE5 */ 0x0445, /* CYRILLIC SMALL LETTER HA */
/* 0xE6 */ 0x0446, /* CYRILLIC SMALL LETTER TSE */
/* 0xE7 */ 0x0447, /* CYRILLIC SMALL LETTER CHE */
/* 0xE8 */ 0x0448, /* CYRILLIC SMALL LETTER SHA */
/* 0xE9 */ 0x0449, /* CYRILLIC SMALL LETTER SHCHA */
/* 0xEA */ 0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */
/* 0xEB */ 0x044B, /* CYRILLIC SMALL LETTER YERU */
/* 0xEC */ 0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */
/* 0xED */ 0x044D, /* CYRILLIC SMALL LETTER E */
/* 0xEE */ 0x044E, /* CYRILLIC SMALL LETTER YU */
/* 0xEF */ 0x044F, /* CYRILLIC SMALL LETTER YA */
/* 0xF0 */ 0x0401, /* CYRILLIC CAPITAL LETTER IO */
/* 0xF1 */ 0x0451, /* CYRILLIC SMALL LETTER IO */
/* 0xF2 */ 0x0404, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */
/* 0xF3 */ 0x0454, /* CYRILLIC SMALL LETTER UKRAINIAN IE */
/* 0xF4 */ 0x0407, /* CYRILLIC CAPITAL LETTER YI */
/* 0xF5 */ 0x0457, /* CYRILLIC SMALL LETTER YI */
/* 0xF6 */ 0x040E, /* CYRILLIC CAPITAL LETTER SHORT U */
/* 0xF7 */ 0x045E, /* CYRILLIC SMALL LETTER SHORT U */
/* 0xF8 */ 0x00B0, /* DEGREE SIGN */
/* 0xF9 */ 0x2219, /* BULLET OPERATOR */
/* 0xFA */ 0x00B7, /* MIDDLE DOT */
/* 0xFB */ 0x221A, /* SQUARE ROOT */
/* 0xFC */ 0x2116, /* NUMERO SIGN */
/* 0xFD */ 0x00A4, /* CURRENCY SIGN */
/* 0xFE */ 0x25A0, /* BLACK SQUARE */
/* 0xFF */ 0x00A0, /* NO-BREAK SPACE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* cp866 uses table_NULL as its table_entry list.
 * NOTE(review): presumably this means the codepage has no mappings outside
 * its high-half array -- confirm against the definition of table_NULL. */
#define table_cp866 table_NULL

/* Alias names for cp866.  The list is terminated by a NULL pointer and the
 * order of the names is kept as generated. */
char *const aliases_cp866[] = {
	"cp866", "866", "IBM866", "csIBM866",
	NULL
};
/*** cp1125 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the upper half of cp1125.  The byte value itself
 * is not stored: entry i holds the character for byte 0x80 + i, as the index
 * comment on each line shows, so the 128 entries cover 0x80 through 0xFF.
 * This file is generated by gen-cp; change the generator input rather than
 * editing the values here. */
const uint16_t highhalf_cp1125 [] = {
/* 0x80 */ 0x0410, /* CYRILLIC CAPITAL LETTER A */
/* 0x81 */ 0x0411, /* CYRILLIC CAPITAL LETTER BE */
/* 0x82 */ 0x0412, /* CYRILLIC CAPITAL LETTER VE */
/* 0x83 */ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */
/* 0x84 */ 0x0414, /* CYRILLIC CAPITAL LETTER DE */
/* 0x85 */ 0x0415, /* CYRILLIC CAPITAL LETTER IE */
/* 0x86 */ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */
/* 0x87 */ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */
/* 0x88 */ 0x0418, /* CYRILLIC CAPITAL LETTER I */
/* 0x89 */ 0x0419, /* CYRILLIC CAPITAL LETTER SHORT I */
/* 0x8A */ 0x041A, /* CYRILLIC CAPITAL LETTER KA */
/* 0x8B */ 0x041B, /* CYRILLIC CAPITAL LETTER EL */
/* 0x8C */ 0x041C, /* CYRILLIC CAPITAL LETTER EM */
/* 0x8D */ 0x041D, /* CYRILLIC CAPITAL LETTER EN */
/* 0x8E */ 0x041E, /* CYRILLIC CAPITAL LETTER O */
/* 0x8F */ 0x041F, /* CYRILLIC CAPITAL LETTER PE */
/* 0x90 */ 0x0420, /* CYRILLIC CAPITAL LETTER ER */
/* 0x91 */ 0x0421, /* CYRILLIC CAPITAL LETTER ES */
/* 0x92 */ 0x0422, /* CYRILLIC CAPITAL LETTER TE */
/* 0x93 */ 0x0423, /* CYRILLIC CAPITAL LETTER U */
/* 0x94 */ 0x0424, /* CYRILLIC CAPITAL LETTER EF */
/* 0x95 */ 0x0425, /* CYRILLIC CAPITAL LETTER HA */
/* 0x96 */ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */
/* 0x97 */ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */
/* 0x98 */ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */
/* 0x99 */ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */
/* 0x9A */ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */
/* 0x9B */ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */
/* 0x9C */ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */
/* 0x9D */ 0x042D, /* CYRILLIC CAPITAL LETTER E */
/* 0x9E */ 0x042E, /* CYRILLIC CAPITAL LETTER YU */
/* 0x9F */ 0x042F, /* CYRILLIC CAPITAL LETTER YA */
/* 0xA0 */ 0x0430, /* CYRILLIC SMALL LETTER A */
/* 0xA1 */ 0x0431, /* CYRILLIC SMALL LETTER BE */
/* 0xA2 */ 0x0432, /* CYRILLIC SMALL LETTER VE */
/* 0xA3 */ 0x0433, /* CYRILLIC SMALL LETTER GHE */
/* 0xA4 */ 0x0434, /* CYRILLIC SMALL LETTER DE */
/* 0xA5 */ 0x0435, /* CYRILLIC SMALL LETTER IE */
/* 0xA6 */ 0x0436, /* CYRILLIC SMALL LETTER ZHE */
/* 0xA7 */ 0x0437, /* CYRILLIC SMALL LETTER ZE */
/* 0xA8 */ 0x0438, /* CYRILLIC SMALL LETTER I */
/* 0xA9 */ 0x0439, /* CYRILLIC SMALL LETTER SHORT I */
/* 0xAA */ 0x043A, /* CYRILLIC SMALL LETTER KA */
/* 0xAB */ 0x043B, /* CYRILLIC SMALL LETTER EL */
/* 0xAC */ 0x043C, /* CYRILLIC SMALL LETTER EM */
/* 0xAD */ 0x043D, /* CYRILLIC SMALL LETTER EN */
/* 0xAE */ 0x043E, /* CYRILLIC SMALL LETTER O */
/* 0xAF */ 0x043F, /* CYRILLIC SMALL LETTER PE */
/* 0xB0 */ 0x2591, /* LIGHT SHADE */
/* 0xB1 */ 0x2592, /* MEDIUM SHADE */
/* 0xB2 */ 0x2593, /* DARK SHADE */
/* 0xB3 */ 0x2502, /* BOX DRAWINGS LIGHT VERTICAL */
/* 0xB4 */ 0x2524, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
/* 0xB5 */ 0x2561, /* BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE */
/* 0xB6 */ 0x2562, /* BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE */
/* 0xB7 */ 0x2556, /* BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE */
/* 0xB8 */ 0x2555, /* BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE */
/* 0xB9 */ 0x2563, /* BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
/* 0xBA */ 0x2551, /* BOX DRAWINGS DOUBLE VERTICAL */
/* 0xBB */ 0x2557, /* BOX DRAWINGS DOUBLE DOWN AND LEFT */
/* 0xBC */ 0x255D, /* BOX DRAWINGS DOUBLE UP AND LEFT */
/* 0xBD */ 0x255C, /* BOX DRAWINGS UP DOUBLE AND LEFT SINGLE */
/* 0xBE */ 0x255B, /* BOX DRAWINGS UP SINGLE AND LEFT DOUBLE */
/* 0xBF */ 0x2510, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
/* 0xC0 */ 0x2514, /* BOX DRAWINGS LIGHT UP AND RIGHT */
/* 0xC1 */ 0x2534, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
/* 0xC2 */ 0x252C, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
/* 0xC3 */ 0x251C, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
/* 0xC4 */ 0x2500, /* BOX DRAWINGS LIGHT HORIZONTAL */
/* 0xC5 */ 0x253C, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
/* 0xC6 */ 0x255E, /* BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE */
/* 0xC7 */ 0x255F, /* BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE */
/* 0xC8 */ 0x255A, /* BOX DRAWINGS DOUBLE UP AND RIGHT */
/* 0xC9 */ 0x2554, /* BOX DRAWINGS DOUBLE DOWN AND RIGHT */
/* 0xCA */ 0x2569, /* BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
/* 0xCB */ 0x2566, /* BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
/* 0xCC */ 0x2560, /* BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
/* 0xCD */ 0x2550, /* BOX DRAWINGS DOUBLE HORIZONTAL */
/* 0xCE */ 0x256C, /* BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
/* 0xCF */ 0x2567, /* BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE */
/* 0xD0 */ 0x2568, /* BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE */
/* 0xD1 */ 0x2564, /* BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE */
/* 0xD2 */ 0x2565, /* BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE */
/* 0xD3 */ 0x2559, /* BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE */
/* 0xD4 */ 0x2558, /* BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE */
/* 0xD5 */ 0x2552, /* BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE */
/* 0xD6 */ 0x2553, /* BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE */
/* 0xD7 */ 0x256B, /* BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE */
/* 0xD8 */ 0x256A, /* BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE */
/* 0xD9 */ 0x2518, /* BOX DRAWINGS LIGHT UP AND LEFT */
/* 0xDA */ 0x250C, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
/* 0xDB */ 0x2588, /* FULL BLOCK */
/* 0xDC */ 0x2584, /* LOWER HALF BLOCK */
/* 0xDD */ 0x258C, /* LEFT HALF BLOCK */
/* 0xDE */ 0x2590, /* RIGHT HALF BLOCK */
/* 0xDF */ 0x2580, /* UPPER HALF BLOCK */
/* 0xE0 */ 0x0440, /* CYRILLIC SMALL LETTER ER */
/* 0xE1 */ 0x0441, /* CYRILLIC SMALL LETTER ES */
/* 0xE2 */ 0x0442, /* CYRILLIC SMALL LETTER TE */
/* 0xE3 */ 0x0443, /* CYRILLIC SMALL LETTER U */
/* 0xE4 */ 0x0444, /* CYRILLIC SMALL LETTER EF */
/* 0xE5 */ 0x0445, /* CYRILLIC SMALL LETTER HA */
/* 0xE6 */ 0x0446, /* CYRILLIC SMALL LETTER TSE */
/* 0xE7 */ 0x0447, /* CYRILLIC SMALL LETTER CHE */
/* 0xE8 */ 0x0448, /* CYRILLIC SMALL LETTER SHA */
/* 0xE9 */ 0x0449, /* CYRILLIC SMALL LETTER SHCHA */
/* 0xEA */ 0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */
/* 0xEB */ 0x044B, /* CYRILLIC SMALL LETTER YERU */
/* 0xEC */ 0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */
/* 0xED */ 0x044D, /* CYRILLIC SMALL LETTER E */
/* 0xEE */ 0x044E, /* CYRILLIC SMALL LETTER YU */
/* 0xEF */ 0x044F, /* CYRILLIC SMALL LETTER YA */
/* 0xF0 */ 0x0401, /* CYRILLIC CAPITAL LETTER IO */
/* 0xF1 */ 0x0451, /* CYRILLIC SMALL LETTER IO */
/* 0xF2 */ 0x0490, /* CYRILLIC CAPITAL LETTER GHE WITH UPTURN */
/* 0xF3 */ 0x0491, /* CYRILLIC SMALL LETTER GHE WITH UPTURN */
/* 0xF4 */ 0x0404, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */
/* 0xF5 */ 0x0454, /* CYRILLIC SMALL LETTER UKRAINIAN IE */
/* 0xF6 */ 0x0406, /* CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xF7 */ 0x0456, /* CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xF8 */ 0x0407, /* CYRILLIC CAPITAL LETTER YI */
/* 0xF9 */ 0x0457, /* CYRILLIC SMALL LETTER YI */
/* 0xFA */ 0x00B7, /* MIDDLE DOT */
/* 0xFB */ 0x221A, /* SQUARE ROOT */
/* 0xFC */ 0x2116, /* NUMERO SIGN */
/* 0xFD */ 0x00A4, /* CURRENCY SIGN */
/* 0xFE */ 0x25A0, /* BLACK SQUARE */
/* 0xFF */ 0x00A0, /* NO-BREAK SPACE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* cp1125 uses table_NULL as its table_entry list.
 * NOTE(review): presumably this means the codepage has no mappings outside
 * its high-half array -- confirm against the definition of table_NULL. */
#define table_cp1125 table_NULL

/* Alias names for cp1125.  The list is terminated by a NULL pointer and the
 * order of the names is kept as generated ("x-cp866-u" stays first). */
char *const aliases_cp1125[] = {
	"x-cp866-u", "ruscii", "1125", "cp1125",
	NULL
};
/*** macroman ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the macroman bytes 0x80..0xFF.
 *
 * The byte value is not stored: entry i describes byte 0x80 + i, so the
 * initializer must stay contiguous, in byte order, and exactly 128
 * entries long.  Nothing but entries and their comments may appear
 * between the braces. */
const uint16_t highhalf_macroman [] = {
	/* 0x80 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0x81 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
	/* 0x82 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
	/* 0x83 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0x84 */ 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
	/* 0x85 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0x86 */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0x87 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0x88 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
	/* 0x89 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
	/* 0x8A */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0x8B */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
	/* 0x8C */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
	/* 0x8D */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
	/* 0x8E */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0x8F */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
	/* 0x90 */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
	/* 0x91 */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
	/* 0x92 */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0x93 */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
	/* 0x94 */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
	/* 0x95 */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
	/* 0x96 */ 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
	/* 0x97 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
	/* 0x98 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
	/* 0x99 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0x9A */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0x9B */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
	/* 0x9C */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0x9D */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
	/* 0x9E */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
	/* 0x9F */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xA0 */ 0x2020, /* DAGGER */
	/* 0xA1 */ 0x00B0, /* DEGREE SIGN */
	/* 0xA2 */ 0x00A2, /* CENT SIGN */
	/* 0xA3 */ 0x00A3, /* POUND SIGN */
	/* 0xA4 */ 0x00A7, /* SECTION SIGN */
	/* 0xA5 */ 0x2022, /* BULLET */
	/* 0xA6 */ 0x00B6, /* PILCROW SIGN */
	/* 0xA7 */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
	/* 0xA8 */ 0x00AE, /* REGISTERED SIGN */
	/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
	/* 0xAA */ 0x2122, /* TRADE MARK SIGN */
	/* 0xAB */ 0x00B4, /* ACUTE ACCENT */
	/* 0xAC */ 0x00A8, /* DIAERESIS */
	/* 0xAD */ 0x2260, /* NOT EQUAL TO */
	/* 0xAE */ 0x00C6, /* LATIN CAPITAL LETTER AE */
	/* 0xAF */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
	/* 0xB0 */ 0x221E, /* INFINITY */
	/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
	/* 0xB2 */ 0x2264, /* LESS-THAN OR EQUAL TO */
	/* 0xB3 */ 0x2265, /* GREATER-THAN OR EQUAL TO */
	/* 0xB4 */ 0x00A5, /* YEN SIGN */
	/* 0xB5 */ 0x00B5, /* MICRO SIGN */
	/* 0xB6 */ 0x2202, /* PARTIAL DIFFERENTIAL */
	/* 0xB7 */ 0x2211, /* N-ARY SUMMATION */
	/* 0xB8 */ 0x220F, /* N-ARY PRODUCT */
	/* 0xB9 */ 0x03C0, /* GREEK SMALL LETTER PI */
	/* 0xBA */ 0x222B, /* INTEGRAL */
	/* 0xBB */ 0x00AA, /* FEMININE ORDINAL INDICATOR */
	/* 0xBC */ 0x00BA, /* MASCULINE ORDINAL INDICATOR */
	/* 0xBD */ 0x03A9, /* GREEK CAPITAL LETTER OMEGA */
	/* 0xBE */ 0x00E6, /* LATIN SMALL LETTER AE */
	/* 0xBF */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
	/* 0xC0 */ 0x00BF, /* INVERTED QUESTION MARK */
	/* 0xC1 */ 0x00A1, /* INVERTED EXCLAMATION MARK */
	/* 0xC2 */ 0x00AC, /* NOT SIGN */
	/* 0xC3 */ 0x221A, /* SQUARE ROOT */
	/* 0xC4 */ 0x0192, /* LATIN SMALL LETTER F WITH HOOK */
	/* 0xC5 */ 0x2248, /* ALMOST EQUAL TO */
	/* 0xC6 */ 0x2206, /* INCREMENT */
	/* 0xC7 */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xC8 */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xC9 */ 0x2026, /* HORIZONTAL ELLIPSIS */
	/* 0xCA */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xCB */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
	/* 0xCC */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
	/* 0xCD */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
	/* 0xCE */ 0x0152, /* LATIN CAPITAL LIGATURE OE */
	/* 0xCF */ 0x0153, /* LATIN SMALL LIGATURE OE */
	/* 0xD0 */ 0x2013, /* EN DASH */
	/* 0xD1 */ 0x2014, /* EM DASH */
	/* 0xD2 */ 0x201C, /* LEFT DOUBLE QUOTATION MARK */
	/* 0xD3 */ 0x201D, /* RIGHT DOUBLE QUOTATION MARK */
	/* 0xD4 */ 0x2018, /* LEFT SINGLE QUOTATION MARK */
	/* 0xD5 */ 0x2019, /* RIGHT SINGLE QUOTATION MARK */
	/* 0xD6 */ 0x00F7, /* DIVISION SIGN */
	/* 0xD7 */ 0x25CA, /* LOZENGE */
	/* 0xD8 */ 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
	/* 0xD9 */ 0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
	/* 0xDA */ 0x2044, /* FRACTION SLASH */
	/* 0xDB */ 0x20AC, /* EURO SIGN */
	/* 0xDC */ 0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
	/* 0xDD */ 0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
	/* 0xDE */ 0xFB01, /* LATIN SMALL LIGATURE FI */
	/* 0xDF */ 0xFB02, /* LATIN SMALL LIGATURE FL */
	/* 0xE0 */ 0x2021, /* DOUBLE DAGGER */
	/* 0xE1 */ 0x00B7, /* MIDDLE DOT */
	/* 0xE2 */ 0x201A, /* SINGLE LOW-9 QUOTATION MARK */
	/* 0xE3 */ 0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
	/* 0xE4 */ 0x2030, /* PER MILLE SIGN */
	/* 0xE5 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	/* 0xE6 */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
	/* 0xE7 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0xE8 */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
	/* 0xE9 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
	/* 0xEA */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0xEB */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	/* 0xEC */ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
	/* 0xED */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
	/* 0xEE */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
	/* 0xEF */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xF0 */ 0xF8FF, /* Apple logo */
	/* 0xF1 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
	/* 0xF2 */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0xF3 */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	/* 0xF4 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
	/* 0xF5 */ 0x0131, /* LATIN SMALL LETTER DOTLESS I */
	/* 0xF6 */ 0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
	/* 0xF7 */ 0x02DC, /* SMALL TILDE */
	/* 0xF8 */ 0x00AF, /* MACRON */
	/* 0xF9 */ 0x02D8, /* BREVE */
	/* 0xFA */ 0x02D9, /* DOT ABOVE */
	/* 0xFB */ 0x02DA, /* RING ABOVE */
	/* 0xFC */ 0x00B8, /* CEDILLA */
	/* 0xFD */ 0x02DD, /* DOUBLE ACUTE ACCENT */
	/* 0xFE */ 0x02DB, /* OGONEK */
	/* 0xFF */ 0x02C7, /* CARON */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* macroman has no mappings beyond its high-half array, so its table name
 * is aliased to table_NULL (defined outside this section; presumably the
 * shared empty table -- confirm against charsets.c). */
#define table_macroman table_NULL

/* Charset names under which macroman can be selected; NULL ends the list. */
char *const aliases_macroman[] = {
	"x-mac", "mac", "macroman",
	"Macintosh", "csMacintosh",
	NULL
};
/*** mac_lat2 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the mac_lat2 bytes 0x80..0xFF.
 *
 * Entry i describes byte 0x80 + i; the byte is noted in each trailing
 * comment together with the Unicode character name.  The table holds
 * exactly 128 entries in byte order. */
const uint16_t highhalf_mac_lat2 [] = {
	/* ---- 0x80..0x8F ---- */
	0x00C4, /* 0x80: LATIN CAPITAL LETTER A WITH DIAERESIS */
	0x0100, /* 0x81: LATIN CAPITAL LETTER A WITH MACRON */
	0x0101, /* 0x82: LATIN SMALL LETTER A WITH MACRON */
	0x00C9, /* 0x83: LATIN CAPITAL LETTER E WITH ACUTE */
	0x0104, /* 0x84: LATIN CAPITAL LETTER A WITH OGONEK */
	0x00D6, /* 0x85: LATIN CAPITAL LETTER O WITH DIAERESIS */
	0x00DC, /* 0x86: LATIN CAPITAL LETTER U WITH DIAERESIS */
	0x00E1, /* 0x87: LATIN SMALL LETTER A WITH ACUTE */
	0x0105, /* 0x88: LATIN SMALL LETTER A WITH OGONEK */
	0x010C, /* 0x89: LATIN CAPITAL LETTER C WITH CARON */
	0x00E4, /* 0x8A: LATIN SMALL LETTER A WITH DIAERESIS */
	0x010D, /* 0x8B: LATIN SMALL LETTER C WITH CARON */
	0x0106, /* 0x8C: LATIN CAPITAL LETTER C WITH ACUTE */
	0x0107, /* 0x8D: LATIN SMALL LETTER C WITH ACUTE */
	0x00E9, /* 0x8E: LATIN SMALL LETTER E WITH ACUTE */
	0x0179, /* 0x8F: LATIN CAPITAL LETTER Z WITH ACUTE */

	/* ---- 0x90..0x9F ---- */
	0x017A, /* 0x90: LATIN SMALL LETTER Z WITH ACUTE */
	0x010E, /* 0x91: LATIN CAPITAL LETTER D WITH CARON */
	0x00ED, /* 0x92: LATIN SMALL LETTER I WITH ACUTE */
	0x010F, /* 0x93: LATIN SMALL LETTER D WITH CARON */
	0x0112, /* 0x94: LATIN CAPITAL LETTER E WITH MACRON */
	0x0113, /* 0x95: LATIN SMALL LETTER E WITH MACRON */
	0x0116, /* 0x96: LATIN CAPITAL LETTER E WITH DOT ABOVE */
	0x00F3, /* 0x97: LATIN SMALL LETTER O WITH ACUTE */
	0x0117, /* 0x98: LATIN SMALL LETTER E WITH DOT ABOVE */
	0x00F4, /* 0x99: LATIN SMALL LETTER O WITH CIRCUMFLEX */
	0x00F6, /* 0x9A: LATIN SMALL LETTER O WITH DIAERESIS */
	0x00F5, /* 0x9B: LATIN SMALL LETTER O WITH TILDE */
	0x00FA, /* 0x9C: LATIN SMALL LETTER U WITH ACUTE */
	0x011A, /* 0x9D: LATIN CAPITAL LETTER E WITH CARON */
	0x011B, /* 0x9E: LATIN SMALL LETTER E WITH CARON */
	0x00FC, /* 0x9F: LATIN SMALL LETTER U WITH DIAERESIS */

	/* ---- 0xA0..0xAF ---- */
	0x2020, /* 0xA0: DAGGER */
	0x00B0, /* 0xA1: DEGREE SIGN */
	0x0118, /* 0xA2: LATIN CAPITAL LETTER E WITH OGONEK */
	0x00A3, /* 0xA3: POUND SIGN */
	0x00A7, /* 0xA4: SECTION SIGN */
	0x2022, /* 0xA5: BULLET */
	0x00B6, /* 0xA6: PILCROW SIGN */
	0x00DF, /* 0xA7: LATIN SMALL LETTER SHARP S */
	0x00AE, /* 0xA8: REGISTERED SIGN */
	0x00A9, /* 0xA9: COPYRIGHT SIGN */
	0x2122, /* 0xAA: TRADE MARK SIGN */
	0x0119, /* 0xAB: LATIN SMALL LETTER E WITH OGONEK */
	0x00A8, /* 0xAC: DIAERESIS */
	0x2260, /* 0xAD: NOT EQUAL TO */
	0x0123, /* 0xAE: LATIN SMALL LETTER G WITH CEDILLA */
	0x012E, /* 0xAF: LATIN CAPITAL LETTER I WITH OGONEK */

	/* ---- 0xB0..0xBF ---- */
	0x012F, /* 0xB0: LATIN SMALL LETTER I WITH OGONEK */
	0x012A, /* 0xB1: LATIN CAPITAL LETTER I WITH MACRON */
	0x2264, /* 0xB2: LESS-THAN OR EQUAL TO */
	0x2265, /* 0xB3: GREATER-THAN OR EQUAL TO */
	0x012B, /* 0xB4: LATIN SMALL LETTER I WITH MACRON */
	0x0136, /* 0xB5: LATIN CAPITAL LETTER K WITH CEDILLA */
	0x2202, /* 0xB6: PARTIAL DIFFERENTIAL */
	0x2211, /* 0xB7: N-ARY SUMMATION */
	0x0142, /* 0xB8: LATIN SMALL LETTER L WITH STROKE */
	0x013B, /* 0xB9: LATIN CAPITAL LETTER L WITH CEDILLA */
	0x013C, /* 0xBA: LATIN SMALL LETTER L WITH CEDILLA */
	0x013D, /* 0xBB: LATIN CAPITAL LETTER L WITH CARON */
	0x013E, /* 0xBC: LATIN SMALL LETTER L WITH CARON */
	0x0139, /* 0xBD: LATIN CAPITAL LETTER L WITH ACUTE */
	0x013A, /* 0xBE: LATIN SMALL LETTER L WITH ACUTE */
	0x0145, /* 0xBF: LATIN CAPITAL LETTER N WITH CEDILLA */

	/* ---- 0xC0..0xCF ---- */
	0x0146, /* 0xC0: LATIN SMALL LETTER N WITH CEDILLA */
	0x0143, /* 0xC1: LATIN CAPITAL LETTER N WITH ACUTE */
	0x00AC, /* 0xC2: NOT SIGN */
	0x221A, /* 0xC3: SQUARE ROOT */
	0x0144, /* 0xC4: LATIN SMALL LETTER N WITH ACUTE */
	0x0147, /* 0xC5: LATIN CAPITAL LETTER N WITH CARON */
	0x2206, /* 0xC6: INCREMENT */
	0x00AB, /* 0xC7: LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x00BB, /* 0xC8: RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	0x2026, /* 0xC9: HORIZONTAL ELLIPSIS */
	0x00A0, /* 0xCA: NO-BREAK SPACE */
	0x0148, /* 0xCB: LATIN SMALL LETTER N WITH CARON */
	0x0150, /* 0xCC: LATIN CAPITAL LETTER O WITH DOUBLE ACUTE */
	0x00D5, /* 0xCD: LATIN CAPITAL LETTER O WITH TILDE */
	0x0151, /* 0xCE: LATIN SMALL LETTER O WITH DOUBLE ACUTE */
	0x014C, /* 0xCF: LATIN CAPITAL LETTER O WITH MACRON */

	/* ---- 0xD0..0xDF ---- */
	0x2013, /* 0xD0: EN DASH */
	0x2014, /* 0xD1: EM DASH */
	0x201C, /* 0xD2: LEFT DOUBLE QUOTATION MARK */
	0x201D, /* 0xD3: RIGHT DOUBLE QUOTATION MARK */
	0x2018, /* 0xD4: LEFT SINGLE QUOTATION MARK */
	0x2019, /* 0xD5: RIGHT SINGLE QUOTATION MARK */
	0x00F7, /* 0xD6: DIVISION SIGN */
	0x25CA, /* 0xD7: LOZENGE */
	0x014D, /* 0xD8: LATIN SMALL LETTER O WITH MACRON */
	0x0154, /* 0xD9: LATIN CAPITAL LETTER R WITH ACUTE */
	0x0155, /* 0xDA: LATIN SMALL LETTER R WITH ACUTE */
	0x0158, /* 0xDB: LATIN CAPITAL LETTER R WITH CARON */
	0x2039, /* 0xDC: SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
	0x203A, /* 0xDD: SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
	0x0159, /* 0xDE: LATIN SMALL LETTER R WITH CARON */
	0x0156, /* 0xDF: LATIN CAPITAL LETTER R WITH CEDILLA */

	/* ---- 0xE0..0xEF ---- */
	0x0157, /* 0xE0: LATIN SMALL LETTER R WITH CEDILLA */
	0x0160, /* 0xE1: LATIN CAPITAL LETTER S WITH CARON */
	0x201A, /* 0xE2: SINGLE LOW-9 QUOTATION MARK */
	0x201E, /* 0xE3: DOUBLE LOW-9 QUOTATION MARK */
	0x0161, /* 0xE4: LATIN SMALL LETTER S WITH CARON */
	0x015A, /* 0xE5: LATIN CAPITAL LETTER S WITH ACUTE */
	0x015B, /* 0xE6: LATIN SMALL LETTER S WITH ACUTE */
	0x00C1, /* 0xE7: LATIN CAPITAL LETTER A WITH ACUTE */
	0x0164, /* 0xE8: LATIN CAPITAL LETTER T WITH CARON */
	0x0165, /* 0xE9: LATIN SMALL LETTER T WITH CARON */
	0x00CD, /* 0xEA: LATIN CAPITAL LETTER I WITH ACUTE */
	0x017D, /* 0xEB: LATIN CAPITAL LETTER Z WITH CARON */
	0x017E, /* 0xEC: LATIN SMALL LETTER Z WITH CARON */
	0x016A, /* 0xED: LATIN CAPITAL LETTER U WITH MACRON */
	0x00D3, /* 0xEE: LATIN CAPITAL LETTER O WITH ACUTE */
	0x00D4, /* 0xEF: LATIN CAPITAL LETTER O WITH CIRCUMFLEX */

	/* ---- 0xF0..0xFF ---- */
	0x016B, /* 0xF0: LATIN SMALL LETTER U WITH MACRON */
	0x016E, /* 0xF1: LATIN CAPITAL LETTER U WITH RING ABOVE */
	0x00DA, /* 0xF2: LATIN CAPITAL LETTER U WITH ACUTE */
	0x016F, /* 0xF3: LATIN SMALL LETTER U WITH RING ABOVE */
	0x0170, /* 0xF4: LATIN CAPITAL LETTER U WITH DOUBLE ACUTE */
	0x0171, /* 0xF5: LATIN SMALL LETTER U WITH DOUBLE ACUTE */
	0x0172, /* 0xF6: LATIN CAPITAL LETTER U WITH OGONEK */
	0x0173, /* 0xF7: LATIN SMALL LETTER U WITH OGONEK */
	0x00DD, /* 0xF8: LATIN CAPITAL LETTER Y WITH ACUTE */
	0x00FD, /* 0xF9: LATIN SMALL LETTER Y WITH ACUTE */
	0x0137, /* 0xFA: LATIN SMALL LETTER K WITH CEDILLA */
	0x017B, /* 0xFB: LATIN CAPITAL LETTER Z WITH DOT ABOVE */
	0x0141, /* 0xFC: LATIN CAPITAL LETTER L WITH STROKE */
	0x017C, /* 0xFD: LATIN SMALL LETTER Z WITH DOT ABOVE */
	0x0122, /* 0xFE: LATIN CAPITAL LETTER G WITH CEDILLA */
	0x02C7, /* 0xFF: CARON */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* mac_lat2 has no extra table_entry mappings; its table name expands to table_NULL. */
#define table_mac_lat2 table_NULL

/* Names under which the mac_lat2 codepage is listed; the list ends with NULL. */
char *const aliases_mac_lat2[] = {
	"x-mac-ce", "mac-ce", "maclatin2",	/* symbolic names */
	"10029", "cp10029",			/* numeric names */
	NULL					/* end-of-list sentinel */
};
/*** kamen ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the high half (bytes 0x80..0xFF) of the kamen
 * codepage.  The byte itself is not stored: entry [i] is the mapping for
 * byte 0x80 + i, as the per-line byte comments show.
 *
 * Fix: byte 0xE2 used to map to U+0194 LATIN CAPITAL LETTER GAMMA even
 * though it sits in a run of Greek letters; it now maps to U+0393 GREEK
 * CAPITAL LETTER GAMMA.  This file is marked as generated by gen-cp, so the
 * same correction should be mirrored in the corresponding Unicode/ input. */
const uint16_t highhalf_kamen [] = {
	/* 0x80 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
	/* 0x81 */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0x82 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0x83 */ 0x010F, /* LATIN SMALL LETTER D WITH CARON */
	/* 0x84 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0x85 */ 0x010E, /* LATIN CAPITAL LETTER D WITH CARON */
	/* 0x86 */ 0x0164, /* LATIN CAPITAL LETTER T WITH CARON */
	/* 0x87 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
	/* 0x88 */ 0x011B, /* LATIN SMALL LETTER E WITH CARON */
	/* 0x89 */ 0x011A, /* LATIN CAPITAL LETTER E WITH CARON */
	/* 0x8A */ 0x0139, /* LATIN CAPITAL LETTER L WITH ACUTE */
	/* 0x8B */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0x8C */ 0x013E, /* LATIN SMALL LETTER L WITH CARON */
	/* 0x8D */ 0x013A, /* LATIN SMALL LETTER L WITH ACUTE */
	/* 0x8E */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0x8F */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0x90 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0x91 */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
	/* 0x92 */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
	/* 0x93 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0x94 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0x95 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
	/* 0x96 */ 0x016F, /* LATIN SMALL LETTER U WITH RING ABOVE */
	/* 0x97 */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0x98 */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
	/* 0x99 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0x9A */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0x9B */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
	/* 0x9C */ 0x013D, /* LATIN CAPITAL LETTER L WITH CARON */
	/* 0x9D */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
	/* 0x9E */ 0x0158, /* LATIN CAPITAL LETTER R WITH CARON */
	/* 0x9F */ 0x0165, /* LATIN SMALL LETTER T WITH CARON */
	/* 0xA0 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0xA1 */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0xA2 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
	/* 0xA3 */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0xA4 */ 0x0148, /* LATIN SMALL LETTER N WITH CARON */
	/* 0xA5 */ 0x0147, /* LATIN CAPITAL LETTER N WITH CARON */
	/* 0xA6 */ 0x016E, /* LATIN CAPITAL LETTER U WITH RING ABOVE */
	/* 0xA7 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xA8 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
	/* 0xA9 */ 0x0159, /* LATIN SMALL LETTER R WITH CARON */
	/* 0xAA */ 0x0155, /* LATIN SMALL LETTER R WITH ACUTE */
	/* 0xAB */ 0x0154, /* LATIN CAPITAL LETTER R WITH ACUTE */
	/* 0xAC */ 0x00BC, /* VULGAR FRACTION ONE QUARTER */
	/* 0xAD */ 0x00A7, /* SECTION SIGN */
	/* 0xAE */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xAF */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xB0 */ 0x2591, /* LIGHT SHADE */
	/* 0xB1 */ 0x2592, /* MEDIUM SHADE */
	/* 0xB2 */ 0x2593, /* DARK SHADE */
	/* 0xB3 */ 0x2502, /* BOX DRAWINGS LIGHT VERTICAL */
	/* 0xB4 */ 0x2524, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
	/* 0xB5 */ 0x2561, /* BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE */
	/* 0xB6 */ 0x2562, /* BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE */
	/* 0xB7 */ 0x2556, /* BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE */
	/* 0xB8 */ 0x2555, /* BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE */
	/* 0xB9 */ 0x2563, /* BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
	/* 0xBA */ 0x2551, /* BOX DRAWINGS DOUBLE VERTICAL */
	/* 0xBB */ 0x2557, /* BOX DRAWINGS DOUBLE DOWN AND LEFT */
	/* 0xBC */ 0x255D, /* BOX DRAWINGS DOUBLE UP AND LEFT */
	/* 0xBD */ 0x255C, /* BOX DRAWINGS UP DOUBLE AND LEFT SINGLE */
	/* 0xBE */ 0x255B, /* BOX DRAWINGS UP SINGLE AND LEFT DOUBLE */
	/* 0xBF */ 0x2510, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
	/* 0xC0 */ 0x2514, /* BOX DRAWINGS LIGHT UP AND RIGHT */
	/* 0xC1 */ 0x2534, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
	/* 0xC2 */ 0x252C, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
	/* 0xC3 */ 0x251C, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
	/* 0xC4 */ 0x2500, /* BOX DRAWINGS LIGHT HORIZONTAL */
	/* 0xC5 */ 0x253C, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
	/* 0xC6 */ 0x255E, /* BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE */
	/* 0xC7 */ 0x255F, /* BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE */
	/* 0xC8 */ 0x255A, /* BOX DRAWINGS DOUBLE UP AND RIGHT */
	/* 0xC9 */ 0x2554, /* BOX DRAWINGS DOUBLE DOWN AND RIGHT */
	/* 0xCA */ 0x2569, /* BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
	/* 0xCB */ 0x2566, /* BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
	/* 0xCC */ 0x2560, /* BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
	/* 0xCD */ 0x2550, /* BOX DRAWINGS DOUBLE HORIZONTAL */
	/* 0xCE */ 0x256C, /* BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
	/* 0xCF */ 0x2567, /* BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE */
	/* 0xD0 */ 0x2568, /* BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE */
	/* 0xD1 */ 0x2564, /* BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE */
	/* 0xD2 */ 0x2565, /* BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE */
	/* 0xD3 */ 0x2559, /* BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE */
	/* 0xD4 */ 0x2558, /* BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE */
	/* 0xD5 */ 0x2552, /* BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE */
	/* 0xD6 */ 0x2553, /* BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE */
	/* 0xD7 */ 0x256B, /* BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE */
	/* 0xD8 */ 0x256A, /* BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE */
	/* 0xD9 */ 0x2518, /* BOX DRAWINGS LIGHT UP AND LEFT */
	/* 0xDA */ 0x250C, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
	/* 0xDB */ 0x2588, /* FULL BLOCK */
	/* 0xDC */ 0x2584, /* LOWER HALF BLOCK */
	/* 0xDD */ 0x258C, /* LEFT HALF BLOCK */
	/* 0xDE */ 0x2590, /* RIGHT HALF BLOCK */
	/* 0xDF */ 0x2580, /* UPPER HALF BLOCK */
	/* 0xE0 */ 0x03B1, /* GREEK SMALL LETTER ALPHA */
	/* 0xE1 */ 0x03B2, /* GREEK SMALL LETTER BETA */
	/* 0xE2 */ 0x0393, /* GREEK CAPITAL LETTER GAMMA */
	/* 0xE3 */ 0x03C0, /* GREEK SMALL LETTER PI */
	/* 0xE4 */ 0x03A3, /* GREEK CAPITAL LETTER SIGMA */
	/* 0xE5 */ 0x03C3, /* GREEK SMALL LETTER SIGMA */
	/* 0xE6 */ 0x03BC, /* GREEK SMALL LETTER MU */
	/* 0xE7 */ 0x03C4, /* GREEK SMALL LETTER TAU */
	/* 0xE8 */ 0x03A6, /* GREEK CAPITAL LETTER PHI */
	/* 0xE9 */ 0x0398, /* GREEK CAPITAL LETTER THETA */
	/* 0xEA */ 0x03A9, /* GREEK CAPITAL LETTER OMEGA */
	/* 0xEB */ 0x03B4, /* GREEK SMALL LETTER DELTA */
	/* 0xEC */ 0x221E, /* INFINITY */
	/* 0xED */ 0x03C6, /* GREEK SMALL LETTER PHI */
	/* 0xEE */ 0x2208, /* ELEMENT OF */
	/* 0xEF */ 0x2229, /* INTERSECTION */
	/* NOTE(review): U+2261 IDENTICAL TO may have been intended for 0xF0;
	 * left unchanged here -- confirm against the codepage reference. */
	/* 0xF0 */ 0x224D, /* EQUIVALENT TO */
	/* 0xF1 */ 0x00B1, /* PLUS-MINUS SIGN */
	/* 0xF2 */ 0x2265, /* GREATER-THAN OR EQUAL TO */
	/* 0xF3 */ 0x2264, /* LESS-THAN OR EQUAL TO */
	/* 0xF4 */ 0x2320, /* TOP HALF INTEGRAL */
	/* 0xF5 */ 0x2321, /* BOTTOM HALF INTEGRAL */
	/* 0xF6 */ 0x00F7, /* DIVISION SIGN */
	/* 0xF7 */ 0x2248, /* ALMOST EQUAL TO */
	/* 0xF8 */ 0x00B0, /* DEGREE SIGN */
	/* 0xF9 */ 0x2219, /* BULLET OPERATOR */
	/* 0xFA */ 0x00B7, /* MIDDLE DOT */
	/* 0xFB */ 0x221A, /* SQUARE ROOT */
	/* 0xFC */ 0x207F, /* SUPERSCRIPT LATIN SMALL LETTER N */
	/* 0xFD */ 0x00B2, /* SUPERSCRIPT TWO */
	/* 0xFE */ 0x25A0, /* BLACK SQUARE */
	/* 0xFF */ 0x00A0, /* NO-BREAK SPACE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* kamen has no extra table_entry mappings; its table name expands to table_NULL. */
#define table_kamen table_NULL

/* Names under which the kamen codepage is listed; the list ends with NULL.
 * NOTE(review): "869" is commonly the number of a Greek codepage, while this
 * encoding is usually numbered 867 or 895 -- confirm whether "869" is intended. */
char *const aliases_kamen[] = {
	"x-kam-cs", "kam",	/* symbolic names */
	"867", "869",		/* numeric names */
	NULL			/* end-of-list sentinel */
};
/*** koi8_r ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the high half (bytes 0x80..0xFF) of the koi8_r
 * codepage.  The byte itself is not stored: entry [i] is the mapping for
 * byte 0x80 + i, as the per-line byte comments show.
 *
 * Fix: stray non-C text (a commit message and timestamp) that had been
 * pasted between the 0x95 and 0x96 entries was removed from the
 * initializer; all 128 values are unchanged. */
const uint16_t highhalf_koi8_r [] = {
	/* 0x80 */ 0x2500, /* BOX DRAWINGS LIGHT HORIZONTAL */
	/* 0x81 */ 0x2502, /* BOX DRAWINGS LIGHT VERTICAL */
	/* 0x82 */ 0x250C, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
	/* 0x83 */ 0x2510, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
	/* 0x84 */ 0x2514, /* BOX DRAWINGS LIGHT UP AND RIGHT */
	/* 0x85 */ 0x2518, /* BOX DRAWINGS LIGHT UP AND LEFT */
	/* 0x86 */ 0x251C, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
	/* 0x87 */ 0x2524, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
	/* 0x88 */ 0x252C, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
	/* 0x89 */ 0x2534, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
	/* 0x8A */ 0x253C, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
	/* 0x8B */ 0x2580, /* UPPER HALF BLOCK */
	/* 0x8C */ 0x2584, /* LOWER HALF BLOCK */
	/* 0x8D */ 0x2588, /* FULL BLOCK */
	/* 0x8E */ 0x258C, /* LEFT HALF BLOCK */
	/* 0x8F */ 0x2590, /* RIGHT HALF BLOCK */
	/* 0x90 */ 0x2591, /* LIGHT SHADE */
	/* 0x91 */ 0x2592, /* MEDIUM SHADE */
	/* 0x92 */ 0x2593, /* DARK SHADE */
	/* 0x93 */ 0x2320, /* TOP HALF INTEGRAL */
	/* 0x94 */ 0x25A0, /* BLACK SQUARE */
	/* 0x95 */ 0x2219, /* BULLET OPERATOR */
	/* 0x96 */ 0x221A, /* SQUARE ROOT */
	/* 0x97 */ 0x2248, /* ALMOST EQUAL TO */
	/* 0x98 */ 0x2264, /* LESS-THAN OR EQUAL TO */
	/* 0x99 */ 0x2265, /* GREATER-THAN OR EQUAL TO */
	/* 0x9A */ 0x00A0, /* NO-BREAK SPACE */
	/* 0x9B */ 0x2321, /* BOTTOM HALF INTEGRAL */
	/* 0x9C */ 0x00B0, /* DEGREE SIGN */
	/* 0x9D */ 0x00B2, /* SUPERSCRIPT TWO */
	/* 0x9E */ 0x00B7, /* MIDDLE DOT */
	/* 0x9F */ 0x00F7, /* DIVISION SIGN */
	/* 0xA0 */ 0x2550, /* BOX DRAWINGS DOUBLE HORIZONTAL */
	/* 0xA1 */ 0x2551, /* BOX DRAWINGS DOUBLE VERTICAL */
	/* 0xA2 */ 0x2552, /* BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE */
	/* 0xA3 */ 0x0451, /* CYRILLIC SMALL LETTER IO */
	/* 0xA4 */ 0x2553, /* BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE */
	/* 0xA5 */ 0x2554, /* BOX DRAWINGS DOUBLE DOWN AND RIGHT */
	/* 0xA6 */ 0x2555, /* BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE */
	/* 0xA7 */ 0x2556, /* BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE */
	/* 0xA8 */ 0x2557, /* BOX DRAWINGS DOUBLE DOWN AND LEFT */
	/* 0xA9 */ 0x2558, /* BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE */
	/* 0xAA */ 0x2559, /* BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE */
	/* 0xAB */ 0x255A, /* BOX DRAWINGS DOUBLE UP AND RIGHT */
	/* 0xAC */ 0x255B, /* BOX DRAWINGS UP SINGLE AND LEFT DOUBLE */
	/* 0xAD */ 0x255C, /* BOX DRAWINGS UP DOUBLE AND LEFT SINGLE */
	/* 0xAE */ 0x255D, /* BOX DRAWINGS DOUBLE UP AND LEFT */
	/* 0xAF */ 0x255E, /* BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE */
	/* 0xB0 */ 0x255F, /* BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE */
	/* 0xB1 */ 0x2560, /* BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
	/* 0xB2 */ 0x2561, /* BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE */
	/* 0xB3 */ 0x0401, /* CYRILLIC CAPITAL LETTER IO */
	/* 0xB4 */ 0x2562, /* BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE */
	/* 0xB5 */ 0x2563, /* BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
	/* 0xB6 */ 0x2564, /* BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE */
	/* 0xB7 */ 0x2565, /* BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE */
	/* 0xB8 */ 0x2566, /* BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
	/* 0xB9 */ 0x2567, /* BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE */
	/* 0xBA */ 0x2568, /* BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE */
	/* 0xBB */ 0x2569, /* BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
	/* 0xBC */ 0x256A, /* BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE */
	/* 0xBD */ 0x256B, /* BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE */
	/* 0xBE */ 0x256C, /* BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
	/* 0xBF */ 0x00A9, /* COPYRIGHT SIGN */
	/* 0xC0 */ 0x044E, /* CYRILLIC SMALL LETTER YU */
	/* 0xC1 */ 0x0430, /* CYRILLIC SMALL LETTER A */
	/* 0xC2 */ 0x0431, /* CYRILLIC SMALL LETTER BE */
	/* 0xC3 */ 0x0446, /* CYRILLIC SMALL LETTER TSE */
	/* 0xC4 */ 0x0434, /* CYRILLIC SMALL LETTER DE */
	/* 0xC5 */ 0x0435, /* CYRILLIC SMALL LETTER IE */
	/* 0xC6 */ 0x0444, /* CYRILLIC SMALL LETTER EF */
	/* 0xC7 */ 0x0433, /* CYRILLIC SMALL LETTER GHE */
	/* 0xC8 */ 0x0445, /* CYRILLIC SMALL LETTER HA */
	/* 0xC9 */ 0x0438, /* CYRILLIC SMALL LETTER I */
	/* 0xCA */ 0x0439, /* CYRILLIC SMALL LETTER SHORT I */
	/* 0xCB */ 0x043A, /* CYRILLIC SMALL LETTER KA */
	/* 0xCC */ 0x043B, /* CYRILLIC SMALL LETTER EL */
	/* 0xCD */ 0x043C, /* CYRILLIC SMALL LETTER EM */
	/* 0xCE */ 0x043D, /* CYRILLIC SMALL LETTER EN */
	/* 0xCF */ 0x043E, /* CYRILLIC SMALL LETTER O */
	/* 0xD0 */ 0x043F, /* CYRILLIC SMALL LETTER PE */
	/* 0xD1 */ 0x044F, /* CYRILLIC SMALL LETTER YA */
	/* 0xD2 */ 0x0440, /* CYRILLIC SMALL LETTER ER */
	/* 0xD3 */ 0x0441, /* CYRILLIC SMALL LETTER ES */
	/* 0xD4 */ 0x0442, /* CYRILLIC SMALL LETTER TE */
	/* 0xD5 */ 0x0443, /* CYRILLIC SMALL LETTER U */
	/* 0xD6 */ 0x0436, /* CYRILLIC SMALL LETTER ZHE */
	/* 0xD7 */ 0x0432, /* CYRILLIC SMALL LETTER VE */
	/* 0xD8 */ 0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */
	/* 0xD9 */ 0x044B, /* CYRILLIC SMALL LETTER YERU */
	/* 0xDA */ 0x0437, /* CYRILLIC SMALL LETTER ZE */
	/* 0xDB */ 0x0448, /* CYRILLIC SMALL LETTER SHA */
	/* 0xDC */ 0x044D, /* CYRILLIC SMALL LETTER E */
	/* 0xDD */ 0x0449, /* CYRILLIC SMALL LETTER SHCHA */
	/* 0xDE */ 0x0447, /* CYRILLIC SMALL LETTER CHE */
	/* 0xDF */ 0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */
	/* 0xE0 */ 0x042E, /* CYRILLIC CAPITAL LETTER YU */
	/* 0xE1 */ 0x0410, /* CYRILLIC CAPITAL LETTER A */
	/* 0xE2 */ 0x0411, /* CYRILLIC CAPITAL LETTER BE */
	/* 0xE3 */ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */
	/* 0xE4 */ 0x0414, /* CYRILLIC CAPITAL LETTER DE */
	/* 0xE5 */ 0x0415, /* CYRILLIC CAPITAL LETTER IE */
	/* 0xE6 */ 0x0424, /* CYRILLIC CAPITAL LETTER EF */
	/* 0xE7 */ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */
	/* 0xE8 */ 0x0425, /* CYRILLIC CAPITAL LETTER HA */
	/* 0xE9 */ 0x0418, /* CYRILLIC CAPITAL LETTER I */
	/* 0xEA */ 0x0419, /* CYRILLIC CAPITAL LETTER SHORT I */
	/* 0xEB */ 0x041A, /* CYRILLIC CAPITAL LETTER KA */
	/* 0xEC */ 0x041B, /* CYRILLIC CAPITAL LETTER EL */
	/* 0xED */ 0x041C, /* CYRILLIC CAPITAL LETTER EM */
	/* 0xEE */ 0x041D, /* CYRILLIC CAPITAL LETTER EN */
	/* 0xEF */ 0x041E, /* CYRILLIC CAPITAL LETTER O */
	/* 0xF0 */ 0x041F, /* CYRILLIC CAPITAL LETTER PE */
	/* 0xF1 */ 0x042F, /* CYRILLIC CAPITAL LETTER YA */
	/* 0xF2 */ 0x0420, /* CYRILLIC CAPITAL LETTER ER */
	/* 0xF3 */ 0x0421, /* CYRILLIC CAPITAL LETTER ES */
	/* 0xF4 */ 0x0422, /* CYRILLIC CAPITAL LETTER TE */
	/* 0xF5 */ 0x0423, /* CYRILLIC CAPITAL LETTER U */
	/* 0xF6 */ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */
	/* 0xF7 */ 0x0412, /* CYRILLIC CAPITAL LETTER VE */
	/* 0xF8 */ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */
	/* 0xF9 */ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */
	/* 0xFA */ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */
	/* 0xFB */ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */
	/* 0xFC */ 0x042D, /* CYRILLIC CAPITAL LETTER E */
	/* 0xFD */ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */
	/* 0xFE */ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */
	/* 0xFF */ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* koi8_r has no extra table_entry mappings; its table name expands to table_NULL. */
#define table_koi8_r table_NULL

/* Names under which the koi8_r codepage is listed; the list ends with NULL. */
char *const aliases_koi8_r[] = {
	"koi8-r", "csKOI8R",
	NULL	/* end-of-list sentinel */
};
/*** koi8_u ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the high half (bytes 0x80..0xFF) of the koi8_u
 * codepage.  The byte itself is not stored: entry [i] is the mapping for
 * byte 0x80 + i, as the per-line byte comments show.  Each trailing comment
 * gives the name of the mapped character. */
const uint16_t highhalf_koi8_u [] = {
/* 0x80 */ 0x2500, /* BOX DRAWINGS LIGHT HORIZONTAL */
/* 0x81 */ 0x2502, /* BOX DRAWINGS LIGHT VERTICAL */
/* 0x82 */ 0x250C, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
/* 0x83 */ 0x2510, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
/* 0x84 */ 0x2514, /* BOX DRAWINGS LIGHT UP AND RIGHT */
/* 0x85 */ 0x2518, /* BOX DRAWINGS LIGHT UP AND LEFT */
/* 0x86 */ 0x251C, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
/* 0x87 */ 0x2524, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
/* 0x88 */ 0x252C, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
/* 0x89 */ 0x2534, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
/* 0x8A */ 0x253C, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
/* 0x8B */ 0x2580, /* UPPER HALF BLOCK */
/* 0x8C */ 0x2584, /* LOWER HALF BLOCK */
/* 0x8D */ 0x2588, /* FULL BLOCK */
/* 0x8E */ 0x258C, /* LEFT HALF BLOCK */
/* 0x8F */ 0x2590, /* RIGHT HALF BLOCK */
/* 0x90 */ 0x2591, /* LIGHT SHADE */
/* 0x91 */ 0x2592, /* MEDIUM SHADE */
/* 0x92 */ 0x2593, /* DARK SHADE */
/* 0x93 */ 0x2320, /* TOP HALF INTEGRAL */
/* 0x94 */ 0x25A0, /* BLACK SQUARE */
/* 0x95 */ 0x2219, /* BULLET OPERATOR */
/* 0x96 */ 0x221A, /* SQUARE ROOT */
/* 0x97 */ 0x2248, /* ALMOST EQUAL TO */
/* 0x98 */ 0x2264, /* LESS-THAN OR EQUAL TO */
/* 0x99 */ 0x2265, /* GREATER-THAN OR EQUAL TO */
/* 0x9A */ 0x00A0, /* NO-BREAK SPACE */
/* 0x9B */ 0x2321, /* BOTTOM HALF INTEGRAL */
/* 0x9C */ 0x00B0, /* DEGREE SIGN */
/* 0x9D */ 0x00B2, /* SUPERSCRIPT TWO */
/* 0x9E */ 0x00B7, /* MIDDLE DOT */
/* 0x9F */ 0x00F7, /* DIVISION SIGN */
/* 0xA0 */ 0x2550, /* BOX DRAWINGS DOUBLE HORIZONTAL */
/* 0xA1 */ 0x2551, /* BOX DRAWINGS DOUBLE VERTICAL */
/* 0xA2 */ 0x2552, /* BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE */
/* 0xA3 */ 0x0451, /* CYRILLIC SMALL LETTER IO */
/* 0xA4 */ 0x0454, /* CYRILLIC SMALL LETTER UKRAINIAN IE */
/* 0xA5 */ 0x2554, /* BOX DRAWINGS DOUBLE DOWN AND RIGHT */
/* 0xA6 */ 0x0456, /* CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xA7 */ 0x0457, /* CYRILLIC SMALL LETTER YI (UKRAINIAN) */
/* 0xA8 */ 0x2557, /* BOX DRAWINGS DOUBLE DOWN AND LEFT */
/* 0xA9 */ 0x2558, /* BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE */
/* 0xAA */ 0x2559, /* BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE */
/* 0xAB */ 0x255A, /* BOX DRAWINGS DOUBLE UP AND RIGHT */
/* 0xAC */ 0x255B, /* BOX DRAWINGS UP SINGLE AND LEFT DOUBLE */
/* 0xAD */ 0x0491, /* CYRILLIC SMALL LETTER GHE WITH UPTURN */
/* 0xAE */ 0x255D, /* BOX DRAWINGS DOUBLE UP AND LEFT */
/* 0xAF */ 0x255E, /* BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE */
/* 0xB0 */ 0x255F, /* BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE */
/* 0xB1 */ 0x2560, /* BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
/* 0xB2 */ 0x2561, /* BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE */
/* 0xB3 */ 0x0401, /* CYRILLIC CAPITAL LETTER IO */
/* 0xB4 */ 0x0404, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */
/* 0xB5 */ 0x2563, /* BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
/* 0xB6 */ 0x0406, /* CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xB7 */ 0x0407, /* CYRILLIC CAPITAL LETTER YI (UKRAINIAN) */
/* 0xB8 */ 0x2566, /* BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
/* 0xB9 */ 0x2567, /* BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE */
/* 0xBA */ 0x2568, /* BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE */
/* 0xBB */ 0x2569, /* BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
/* 0xBC */ 0x256A, /* BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE */
/* 0xBD */ 0x0490, /* CYRILLIC CAPITAL LETTER GHE WITH UPTURN */
/* 0xBE */ 0x256C, /* BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL */
/* 0xBF */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xC0 */ 0x044E, /* CYRILLIC SMALL LETTER YU */
/* 0xC1 */ 0x0430, /* CYRILLIC SMALL LETTER A */
/* 0xC2 */ 0x0431, /* CYRILLIC SMALL LETTER BE */
/* 0xC3 */ 0x0446, /* CYRILLIC SMALL LETTER TSE */
/* 0xC4 */ 0x0434, /* CYRILLIC SMALL LETTER DE */
/* 0xC5 */ 0x0435, /* CYRILLIC SMALL LETTER IE */
/* 0xC6 */ 0x0444, /* CYRILLIC SMALL LETTER EF */
/* 0xC7 */ 0x0433, /* CYRILLIC SMALL LETTER GHE */
/* 0xC8 */ 0x0445, /* CYRILLIC SMALL LETTER HA */
/* 0xC9 */ 0x0438, /* CYRILLIC SMALL LETTER I */
/* 0xCA */ 0x0439, /* CYRILLIC SMALL LETTER SHORT I */
/* 0xCB */ 0x043A, /* CYRILLIC SMALL LETTER KA */
/* 0xCC */ 0x043B, /* CYRILLIC SMALL LETTER EL */
/* 0xCD */ 0x043C, /* CYRILLIC SMALL LETTER EM */
/* 0xCE */ 0x043D, /* CYRILLIC SMALL LETTER EN */
/* 0xCF */ 0x043E, /* CYRILLIC SMALL LETTER O */
/* 0xD0 */ 0x043F, /* CYRILLIC SMALL LETTER PE */
/* 0xD1 */ 0x044F, /* CYRILLIC SMALL LETTER YA */
/* 0xD2 */ 0x0440, /* CYRILLIC SMALL LETTER ER */
/* 0xD3 */ 0x0441, /* CYRILLIC SMALL LETTER ES */
/* 0xD4 */ 0x0442, /* CYRILLIC SMALL LETTER TE */
/* 0xD5 */ 0x0443, /* CYRILLIC SMALL LETTER U */
/* 0xD6 */ 0x0436, /* CYRILLIC SMALL LETTER ZHE */
/* 0xD7 */ 0x0432, /* CYRILLIC SMALL LETTER VE */
/* 0xD8 */ 0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */
/* 0xD9 */ 0x044B, /* CYRILLIC SMALL LETTER YERU */
/* 0xDA */ 0x0437, /* CYRILLIC SMALL LETTER ZE */
/* 0xDB */ 0x0448, /* CYRILLIC SMALL LETTER SHA */
/* 0xDC */ 0x044D, /* CYRILLIC SMALL LETTER E */
/* 0xDD */ 0x0449, /* CYRILLIC SMALL LETTER SHCHA */
/* 0xDE */ 0x0447, /* CYRILLIC SMALL LETTER CHE */
/* 0xDF */ 0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */
/* 0xE0 */ 0x042E, /* CYRILLIC CAPITAL LETTER YU */
/* 0xE1 */ 0x0410, /* CYRILLIC CAPITAL LETTER A */
/* 0xE2 */ 0x0411, /* CYRILLIC CAPITAL LETTER BE */
/* 0xE3 */ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */
/* 0xE4 */ 0x0414, /* CYRILLIC CAPITAL LETTER DE */
/* 0xE5 */ 0x0415, /* CYRILLIC CAPITAL LETTER IE */
/* 0xE6 */ 0x0424, /* CYRILLIC CAPITAL LETTER EF */
/* 0xE7 */ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */
/* 0xE8 */ 0x0425, /* CYRILLIC CAPITAL LETTER HA */
/* 0xE9 */ 0x0418, /* CYRILLIC CAPITAL LETTER I */
/* 0xEA */ 0x0419, /* CYRILLIC CAPITAL LETTER SHORT I */
/* 0xEB */ 0x041A, /* CYRILLIC CAPITAL LETTER KA */
/* 0xEC */ 0x041B, /* CYRILLIC CAPITAL LETTER EL */
/* 0xED */ 0x041C, /* CYRILLIC CAPITAL LETTER EM */
/* 0xEE */ 0x041D, /* CYRILLIC CAPITAL LETTER EN */
/* 0xEF */ 0x041E, /* CYRILLIC CAPITAL LETTER O */
/* 0xF0 */ 0x041F, /* CYRILLIC CAPITAL LETTER PE */
/* 0xF1 */ 0x042F, /* CYRILLIC CAPITAL LETTER YA */
/* 0xF2 */ 0x0420, /* CYRILLIC CAPITAL LETTER ER */
/* 0xF3 */ 0x0421, /* CYRILLIC CAPITAL LETTER ES */
/* 0xF4 */ 0x0422, /* CYRILLIC CAPITAL LETTER TE */
/* 0xF5 */ 0x0423, /* CYRILLIC CAPITAL LETTER U */
/* 0xF6 */ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */
/* 0xF7 */ 0x0412, /* CYRILLIC CAPITAL LETTER VE */
/* 0xF8 */ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */
/* 0xF9 */ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */
/* 0xFA */ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */
/* 0xFB */ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */
/* 0xFC */ 0x042D, /* CYRILLIC CAPITAL LETTER E */
/* 0xFD */ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */
/* 0xFE */ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */
/* 0xFF */ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unlike tcvn5712 and viscii, koi8_u gets no table_entry array of its own
 * here; its table name is simply redirected to table_NULL. */
#define table_koi8_u table_NULL

/* Name list for the koi8_u codepage; the final NULL marks the end. */
char *const aliases_koi8_u [] = { "koi8-u", NULL };
/*** koi8_ru ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode characters for the koi8_ru bytes 0x80..0xFF, one entry per byte.
 * The byte itself is not stored: it is implied by the position in the
 * array (entry 0 is byte 0x80), as the leading comment on each line shows. */
const uint16_t highhalf_koi8_ru [] = {
/* 0x80 */ 0x2500, /* BOX DRAWINGS LIGHT HORIZONTAL */
/* 0x81 */ 0x2502, /* BOX DRAWINGS LIGHT VERTICAL */
/* 0x82 */ 0x250C, /* BOX DRAWINGS LIGHT DOWN AND RIGHT */
/* 0x83 */ 0x2510, /* BOX DRAWINGS LIGHT DOWN AND LEFT */
/* 0x84 */ 0x2514, /* BOX DRAWINGS LIGHT UP AND RIGHT */
/* 0x85 */ 0x2518, /* BOX DRAWINGS LIGHT UP AND LEFT */
/* 0x86 */ 0x251C, /* BOX DRAWINGS LIGHT VERTICAL AND RIGHT */
/* 0x87 */ 0x2524, /* BOX DRAWINGS LIGHT VERTICAL AND LEFT */
/* 0x88 */ 0x252C, /* BOX DRAWINGS LIGHT DOWN AND HORIZONTAL */
/* 0x89 */ 0x2534, /* BOX DRAWINGS LIGHT UP AND HORIZONTAL */
/* 0x8A */ 0x253C, /* BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL */
/* 0x8B */ 0x2580, /* UPPER HALF BLOCK */
/* 0x8C */ 0x2584, /* LOWER HALF BLOCK */
/* 0x8D */ 0x2588, /* FULL BLOCK */
/* 0x8E */ 0x258C, /* LEFT HALF BLOCK */
/* 0x8F */ 0x2590, /* RIGHT HALF BLOCK */
/* 0x90 */ 0x2591, /* LIGHT SHADE */
/* 0x91 */ 0x2592, /* MEDIUM SHADE */
/* 0x92 */ 0x2593, /* DARK SHADE */
/* 0x93 */ 0x2320, /* TOP HALF INTEGRAL */
/* 0x94 */ 0x25A0, /* BLACK SQUARE */
/* 0x95 */ 0x2219, /* BULLET OPERATOR */
/* 0x96 */ 0x221A, /* SQUARE ROOT */
/* 0x97 */ 0x2248, /* ALMOST EQUAL TO */
/* 0x98 */ 0x2264, /* LESS-THAN OR EQUAL TO */
/* 0x99 */ 0x2265, /* GREATER-THAN OR EQUAL TO */
/* 0x9A */ 0x00A0, /* NO-BREAK SPACE */
/* 0x9B */ 0x2321, /* BOTTOM HALF INTEGRAL */
/* 0x9C */ 0x00B0, /* DEGREE SIGN */
/* 0x9D */ 0x00B2, /* SUPERSCRIPT TWO */
/* 0x9E */ 0x00B7, /* MIDDLE DOT */
/* 0x9F */ 0x00F7, /* DIVISION SIGN */
/* 0xA0 */ 0x2550, /* BOX DRAWINGS DOUBLE HORIZONTAL */
/* 0xA1 */ 0x2551, /* BOX DRAWINGS DOUBLE VERTICAL */
/* 0xA2 */ 0x2552, /* BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE */
/* 0xA3 */ 0x0451, /* CYRILLIC SMALL LETTER IO */
/* 0xA4 */ 0x0454, /* CYRILLIC SMALL LETTER UKRAINIAN IE */
/* 0xA5 */ 0x2554, /* BOX DRAWINGS DOUBLE DOWN AND RIGHT */
/* 0xA6 */ 0x0456, /* CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xA7 */ 0x0457, /* CYRILLIC SMALL LETTER YI (UKRAINIAN) */
/* 0xA8 */ 0x2557, /* BOX DRAWINGS DOUBLE DOWN AND LEFT */
/* 0xA9 */ 0x2558, /* BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE */
/* 0xAA */ 0x2559, /* BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE */
/* 0xAB */ 0x255A, /* BOX DRAWINGS DOUBLE UP AND RIGHT */
/* 0xAC */ 0x255B, /* BOX DRAWINGS UP SINGLE AND LEFT DOUBLE */
/* 0xAD */ 0x0491, /* CYRILLIC SMALL LETTER GHE WITH UPTURN */
/* 0xAE */ 0x045E, /* CYRILLIC SMALL LETTER SHORT U */
/* 0xAF */ 0x255E, /* BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE */
/* 0xB0 */ 0x255F, /* BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE */
/* 0xB1 */ 0x2560, /* BOX DRAWINGS DOUBLE VERTICAL AND RIGHT */
/* 0xB2 */ 0x2561, /* BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE */
/* 0xB3 */ 0x0401, /* CYRILLIC CAPITAL LETTER IO */
/* 0xB4 */ 0x0404, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */
/* 0xB5 */ 0x2563, /* BOX DRAWINGS DOUBLE VERTICAL AND LEFT */
/* 0xB6 */ 0x0406, /* CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xB7 */ 0x0407, /* CYRILLIC CAPITAL LETTER YI (UKRAINIAN) */
/* 0xB8 */ 0x2566, /* BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL */
/* 0xB9 */ 0x2567, /* BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE */
/* 0xBA */ 0x2568, /* BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE */
/* 0xBB */ 0x2569, /* BOX DRAWINGS DOUBLE UP AND HORIZONTAL */
/* 0xBC */ 0x256A, /* BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE */
/* 0xBD */ 0x0490, /* CYRILLIC CAPITAL LETTER GHE WITH UPTURN */
/* 0xBE */ 0x040E, /* CYRILLIC CAPITAL LETTER SHORT U */
/* 0xBF */ 0x00A9, /* COPYRIGHT SIGN */
/* 0xC0 */ 0x044E, /* CYRILLIC SMALL LETTER YU */
/* 0xC1 */ 0x0430, /* CYRILLIC SMALL LETTER A */
/* 0xC2 */ 0x0431, /* CYRILLIC SMALL LETTER BE */
/* 0xC3 */ 0x0446, /* CYRILLIC SMALL LETTER TSE */
/* 0xC4 */ 0x0434, /* CYRILLIC SMALL LETTER DE */
/* 0xC5 */ 0x0435, /* CYRILLIC SMALL LETTER IE */
/* 0xC6 */ 0x0444, /* CYRILLIC SMALL LETTER EF */
/* 0xC7 */ 0x0433, /* CYRILLIC SMALL LETTER GHE */
/* 0xC8 */ 0x0445, /* CYRILLIC SMALL LETTER KHA */
/* 0xC9 */ 0x0438, /* CYRILLIC SMALL LETTER I */
/* 0xCA */ 0x0439, /* CYRILLIC SMALL LETTER SHORT I */
/* 0xCB */ 0x043A, /* CYRILLIC SMALL LETTER KA */
/* 0xCC */ 0x043B, /* CYRILLIC SMALL LETTER EL */
/* 0xCD */ 0x043C, /* CYRILLIC SMALL LETTER EM */
/* 0xCE */ 0x043D, /* CYRILLIC SMALL LETTER EN */
/* 0xCF */ 0x043E, /* CYRILLIC SMALL LETTER O */
/* 0xD0 */ 0x043F, /* CYRILLIC SMALL LETTER PE */
/* 0xD1 */ 0x044F, /* CYRILLIC SMALL LETTER YA */
/* 0xD2 */ 0x0440, /* CYRILLIC SMALL LETTER ER */
/* 0xD3 */ 0x0441, /* CYRILLIC SMALL LETTER ES */
/* 0xD4 */ 0x0442, /* CYRILLIC SMALL LETTER TE */
/* 0xD5 */ 0x0443, /* CYRILLIC SMALL LETTER U */
/* 0xD6 */ 0x0436, /* CYRILLIC SMALL LETTER ZHE */
/* 0xD7 */ 0x0432, /* CYRILLIC SMALL LETTER VE */
/* 0xD8 */ 0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */
/* 0xD9 */ 0x044B, /* CYRILLIC SMALL LETTER YERU */
/* 0xDA */ 0x0437, /* CYRILLIC SMALL LETTER ZE */
/* 0xDB */ 0x0448, /* CYRILLIC SMALL LETTER SHA */
/* 0xDC */ 0x044D, /* CYRILLIC SMALL LETTER E */
/* 0xDD */ 0x0449, /* CYRILLIC SMALL LETTER SHCHA */
/* 0xDE */ 0x0447, /* CYRILLIC SMALL LETTER CHE */
/* 0xDF */ 0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */
/* 0xE0 */ 0x042E, /* CYRILLIC CAPITAL LETTER YU */
/* 0xE1 */ 0x0410, /* CYRILLIC CAPITAL LETTER A */
/* 0xE2 */ 0x0411, /* CYRILLIC CAPITAL LETTER BE */
/* 0xE3 */ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */
/* 0xE4 */ 0x0414, /* CYRILLIC CAPITAL LETTER DE */
/* 0xE5 */ 0x0415, /* CYRILLIC CAPITAL LETTER IE */
/* 0xE6 */ 0x0424, /* CYRILLIC CAPITAL LETTER EF */
/* 0xE7 */ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */
/* 0xE8 */ 0x0425, /* CYRILLIC CAPITAL LETTER KHA */
/* 0xE9 */ 0x0418, /* CYRILLIC CAPITAL LETTER I */
/* 0xEA */ 0x0419, /* CYRILLIC CAPITAL LETTER SHORT I */
/* 0xEB */ 0x041A, /* CYRILLIC CAPITAL LETTER KA */
/* 0xEC */ 0x041B, /* CYRILLIC CAPITAL LETTER EL */
/* 0xED */ 0x041C, /* CYRILLIC CAPITAL LETTER EM */
/* 0xEE */ 0x041D, /* CYRILLIC CAPITAL LETTER EN */
/* 0xEF */ 0x041E, /* CYRILLIC CAPITAL LETTER O */
/* 0xF0 */ 0x041F, /* CYRILLIC CAPITAL LETTER PE */
/* 0xF1 */ 0x042F, /* CYRILLIC CAPITAL LETTER YA */
/* 0xF2 */ 0x0420, /* CYRILLIC CAPITAL LETTER ER */
/* 0xF3 */ 0x0421, /* CYRILLIC CAPITAL LETTER ES */
/* 0xF4 */ 0x0422, /* CYRILLIC CAPITAL LETTER TE */
/* 0xF5 */ 0x0423, /* CYRILLIC CAPITAL LETTER U */
/* 0xF6 */ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */
/* 0xF7 */ 0x0412, /* CYRILLIC CAPITAL LETTER VE */
/* 0xF8 */ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */
/* 0xF9 */ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */
/* 0xFA */ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */
/* 0xFB */ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */
/* 0xFC */ 0x042D, /* CYRILLIC CAPITAL LETTER E */
/* 0xFD */ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */
/* 0xFE */ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */
/* 0xFF */ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unlike tcvn5712 and viscii, koi8_ru gets no table_entry array of its own
 * here; its table name is simply redirected to table_NULL. */
#define table_koi8_ru table_NULL

/* Name list for the koi8_ru codepage; the final NULL marks the end. */
char *const aliases_koi8_ru [] = { "koi8-ru", NULL };
/*** tcvn5712 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode characters for the tcvn5712 bytes 0x80..0xFF, one entry per byte.
 * The byte itself is not stored: it is implied by the position in the
 * array (entry 0 is byte 0x80), as the leading comment on each line shows.
 * Bytes 0xB0..0xB4 map to combining marks rather than to letters. */
const uint16_t highhalf_tcvn5712 [] = {
/* 0x80 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
/* 0x81 */ 0x1EA2, /* LATIN CAPITAL LETTER A WITH HOOK ABOVE */
/* 0x82 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
/* 0x83 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
/* 0x84 */ 0x1EA0, /* LATIN CAPITAL LETTER A WITH DOT BELOW */
/* 0x85 */ 0x1EB6, /* LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW */
/* 0x86 */ 0x1EAC, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW */
/* 0x87 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
/* 0x88 */ 0x1EBA, /* LATIN CAPITAL LETTER E WITH HOOK ABOVE */
/* 0x89 */ 0x1EBC, /* LATIN CAPITAL LETTER E WITH TILDE */
/* 0x8A */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0x8B */ 0x1EB8, /* LATIN CAPITAL LETTER E WITH DOT BELOW */
/* 0x8C */ 0x1EC6, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW */
/* 0x8D */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
/* 0x8E */ 0x1EC8, /* LATIN CAPITAL LETTER I WITH HOOK ABOVE */
/* 0x8F */ 0x0128, /* LATIN CAPITAL LETTER I WITH TILDE */
/* 0x90 */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
/* 0x91 */ 0x1ECA, /* LATIN CAPITAL LETTER I WITH DOT BELOW */
/* 0x92 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
/* 0x93 */ 0x1ECE, /* LATIN CAPITAL LETTER O WITH HOOK ABOVE */
/* 0x94 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
/* 0x95 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0x96 */ 0x1ECC, /* LATIN CAPITAL LETTER O WITH DOT BELOW */
/* 0x97 */ 0x1ED8, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW */
/* 0x98 */ 0x1EDC, /* LATIN CAPITAL LETTER O WITH HORN AND GRAVE */
/* 0x99 */ 0x1EDE, /* LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE */
/* 0x9A */ 0x1EE0, /* LATIN CAPITAL LETTER O WITH HORN AND TILDE */
/* 0x9B */ 0x1EDA, /* LATIN CAPITAL LETTER O WITH HORN AND ACUTE */
/* 0x9C */ 0x1EE2, /* LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW */
/* 0x9D */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
/* 0x9E */ 0x1EE6, /* LATIN CAPITAL LETTER U WITH HOOK ABOVE */
/* 0x9F */ 0x0168, /* LATIN CAPITAL LETTER U WITH TILDE */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x0102, /* LATIN CAPITAL LETTER A WITH BREVE */
/* 0xA2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
/* 0xA3 */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
/* 0xA4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
/* 0xA5 */ 0x01A0, /* LATIN CAPITAL LETTER O WITH HORN */
/* 0xA6 */ 0x01AF, /* LATIN CAPITAL LETTER U WITH HORN */
/* 0xA7 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
/* 0xA8 */ 0x0103, /* LATIN SMALL LETTER A WITH BREVE */
/* 0xA9 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0xAA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
/* 0xAB */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0xAC */ 0x01A1, /* LATIN SMALL LETTER O WITH HORN */
/* 0xAD */ 0x01B0, /* LATIN SMALL LETTER U WITH HORN */
/* 0xAE */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
/* 0xAF */ 0x1EB0, /* LATIN CAPITAL LETTER A WITH BREVE AND GRAVE */
/* 0xB0 */ 0x0300, /* COMBINING GRAVE ACCENT */
/* 0xB1 */ 0x0309, /* COMBINING HOOK ABOVE */
/* 0xB2 */ 0x0303, /* COMBINING TILDE */
/* 0xB3 */ 0x0301, /* COMBINING ACUTE ACCENT */
/* 0xB4 */ 0x0323, /* COMBINING DOT BELOW */
/* 0xB5 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
/* 0xB6 */ 0x1EA3, /* LATIN SMALL LETTER A WITH HOOK ABOVE */
/* 0xB7 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
/* 0xB8 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
/* 0xB9 */ 0x1EA1, /* LATIN SMALL LETTER A WITH DOT BELOW */
/* 0xBA */ 0x1EB2, /* LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE */
/* 0xBB */ 0x1EB1, /* LATIN SMALL LETTER A WITH BREVE AND GRAVE */
/* 0xBC */ 0x1EB3, /* LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE */
/* 0xBD */ 0x1EB5, /* LATIN SMALL LETTER A WITH BREVE AND TILDE */
/* 0xBE */ 0x1EAF, /* LATIN SMALL LETTER A WITH BREVE AND ACUTE */
/* 0xBF */ 0x1EB4, /* LATIN CAPITAL LETTER A WITH BREVE AND TILDE */
/* 0xC0 */ 0x1EAE, /* LATIN CAPITAL LETTER A WITH BREVE AND ACUTE */
/* 0xC1 */ 0x1EA6, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE */
/* 0xC2 */ 0x1EA8, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xC3 */ 0x1EAA, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE */
/* 0xC4 */ 0x1EA4, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE */
/* 0xC5 */ 0x1EC0, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE */
/* 0xC6 */ 0x1EB7, /* LATIN SMALL LETTER A WITH BREVE AND DOT BELOW */
/* 0xC7 */ 0x1EA7, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE */
/* 0xC8 */ 0x1EA9, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xC9 */ 0x1EAB, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE */
/* 0xCA */ 0x1EA5, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE */
/* 0xCB */ 0x1EAD, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW */
/* 0xCC */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
/* 0xCD */ 0x1EC2, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xCE */ 0x1EBB, /* LATIN SMALL LETTER E WITH HOOK ABOVE */
/* 0xCF */ 0x1EBD, /* LATIN SMALL LETTER E WITH TILDE */
/* 0xD0 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xD1 */ 0x1EB9, /* LATIN SMALL LETTER E WITH DOT BELOW */
/* 0xD2 */ 0x1EC1, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE */
/* 0xD3 */ 0x1EC3, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xD4 */ 0x1EC5, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE */
/* 0xD5 */ 0x1EBF, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE */
/* 0xD6 */ 0x1EC7, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW */
/* 0xD7 */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
/* 0xD8 */ 0x1EC9, /* LATIN SMALL LETTER I WITH HOOK ABOVE */
/* 0xD9 */ 0x1EC4, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE */
/* 0xDA */ 0x1EBE, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE */
/* 0xDB */ 0x1ED2, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE */
/* 0xDC */ 0x0129, /* LATIN SMALL LETTER I WITH TILDE */
/* 0xDD */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
/* 0xDE */ 0x1ECB, /* LATIN SMALL LETTER I WITH DOT BELOW */
/* 0xDF */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
/* 0xE0 */ 0x1ED4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xE1 */ 0x1ECF, /* LATIN SMALL LETTER O WITH HOOK ABOVE */
/* 0xE2 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
/* 0xE3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xE4 */ 0x1ECD, /* LATIN SMALL LETTER O WITH DOT BELOW */
/* 0xE5 */ 0x1ED3, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE */
/* 0xE6 */ 0x1ED5, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xE7 */ 0x1ED7, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE */
/* 0xE8 */ 0x1ED1, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE */
/* 0xE9 */ 0x1ED9, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW */
/* 0xEA */ 0x1EDD, /* LATIN SMALL LETTER O WITH HORN AND GRAVE */
/* 0xEB */ 0x1EDF, /* LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE */
/* 0xEC */ 0x1EE1, /* LATIN SMALL LETTER O WITH HORN AND TILDE */
/* 0xED */ 0x1EDB, /* LATIN SMALL LETTER O WITH HORN AND ACUTE */
/* 0xEE */ 0x1EE3, /* LATIN SMALL LETTER O WITH HORN AND DOT BELOW */
/* 0xEF */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
/* 0xF0 */ 0x1ED6, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE */
/* 0xF1 */ 0x1EE7, /* LATIN SMALL LETTER U WITH HOOK ABOVE */
/* 0xF2 */ 0x0169, /* LATIN SMALL LETTER U WITH TILDE */
/* 0xF3 */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
/* 0xF4 */ 0x1EE5, /* LATIN SMALL LETTER U WITH DOT BELOW */
/* 0xF5 */ 0x1EEB, /* LATIN SMALL LETTER U WITH HORN AND GRAVE */
/* 0xF6 */ 0x1EED, /* LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE */
/* 0xF7 */ 0x1EEF, /* LATIN SMALL LETTER U WITH HORN AND TILDE */
/* 0xF8 */ 0x1EE9, /* LATIN SMALL LETTER U WITH HORN AND ACUTE */
/* 0xF9 */ 0x1EF1, /* LATIN SMALL LETTER U WITH HORN AND DOT BELOW */
/* 0xFA */ 0x1EF3, /* LATIN SMALL LETTER Y WITH GRAVE */
/* 0xFB */ 0x1EF7, /* LATIN SMALL LETTER Y WITH HOOK ABOVE */
/* 0xFC */ 0x1EF9, /* LATIN SMALL LETTER Y WITH TILDE */
/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
/* 0xFE */ 0x1EF5, /* LATIN SMALL LETTER Y WITH DOT BELOW */
/* 0xFF */ 0x1ED0, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE */
};
/* Additional byte/Unicode pairs for tcvn5712 that do not fit in the dense
 * highhalf_tcvn5712 array, which can hold only one character per byte.
 * Both bytes listed here already carry a D WITH STROKE entry in that array
 * (0xA7 -> U+0110, 0xAE -> U+0111); the ETH characters are the second
 * mapping for the same bytes.  The list is terminated by a {0, 0} entry.
 * NOTE(review): presumably consulted when converting Unicode back to the
 * codepage -- confirm against the users of struct table_entry. */
const struct table_entry table_tcvn5712 [] = {
	{0xA7, 0x00D0}, /* LATIN CAPITAL LETTER ETH */
	{0xAE, 0x00F0}, /* LATIN SMALL LETTER ETH */
	{0, 0}
};
/* Name list for the tcvn5712 codepage; the final NULL marks the end. */
char *const aliases_tcvn5712 [] = {
	"TCVN-5712", "TCVN", "TCVN-0", "TCVN-1",
	"TCVN5712", "TCVN5712-1", "TCVN5712-1:1993",
	NULL
};
/*** viscii ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode characters for the viscii bytes 0x80..0xFF, one entry per byte.
 * The byte itself is not stored: it is implied by the position in the
 * array (entry 0 is byte 0x80), as the leading comment on each line shows. */
const uint16_t highhalf_viscii [] = {
/* 0x80 */ 0x1EA0, /* LATIN CAPITAL LETTER A WITH DOT BELOW */
/* 0x81 */ 0x1EAE, /* LATIN CAPITAL LETTER A WITH BREVE AND ACUTE */
/* 0x82 */ 0x1EB0, /* LATIN CAPITAL LETTER A WITH BREVE AND GRAVE */
/* 0x83 */ 0x1EB6, /* LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW */
/* 0x84 */ 0x1EA4, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE */
/* 0x85 */ 0x1EA6, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE */
/* 0x86 */ 0x1EA8, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0x87 */ 0x1EAC, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW */
/* 0x88 */ 0x1EBC, /* LATIN CAPITAL LETTER E WITH TILDE */
/* 0x89 */ 0x1EB8, /* LATIN CAPITAL LETTER E WITH DOT BELOW */
/* 0x8A */ 0x1EBE, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE */
/* 0x8B */ 0x1EC0, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE */
/* 0x8C */ 0x1EC2, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0x8D */ 0x1EC4, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE */
/* 0x8E */ 0x1EC6, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW */
/* 0x8F */ 0x1ED0, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE */
/* 0x90 */ 0x1ED2, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE */
/* 0x91 */ 0x1ED4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0x92 */ 0x1ED6, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE */
/* 0x93 */ 0x1ED8, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW */
/* 0x94 */ 0x1EE2, /* LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW */
/* 0x95 */ 0x1EDA, /* LATIN CAPITAL LETTER O WITH HORN AND ACUTE */
/* 0x96 */ 0x1EDC, /* LATIN CAPITAL LETTER O WITH HORN AND GRAVE */
/* 0x97 */ 0x1EDE, /* LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE */
/* 0x98 */ 0x1ECA, /* LATIN CAPITAL LETTER I WITH DOT BELOW */
/* 0x99 */ 0x1ECE, /* LATIN CAPITAL LETTER O WITH HOOK ABOVE */
/* 0x9A */ 0x1ECC, /* LATIN CAPITAL LETTER O WITH DOT BELOW */
/* 0x9B */ 0x1EC8, /* LATIN CAPITAL LETTER I WITH HOOK ABOVE */
/* 0x9C */ 0x1EE6, /* LATIN CAPITAL LETTER U WITH HOOK ABOVE */
/* 0x9D */ 0x0168, /* LATIN CAPITAL LETTER U WITH TILDE */
/* 0x9E */ 0x1EE4, /* LATIN CAPITAL LETTER U WITH DOT BELOW */
/* 0x9F */ 0x1EF2, /* LATIN CAPITAL LETTER Y WITH GRAVE */
/* 0xA0 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
/* 0xA1 */ 0x1EAF, /* LATIN SMALL LETTER A WITH BREVE AND ACUTE */
/* 0xA2 */ 0x1EB1, /* LATIN SMALL LETTER A WITH BREVE AND GRAVE */
/* 0xA3 */ 0x1EB7, /* LATIN SMALL LETTER A WITH BREVE AND DOT BELOW */
/* 0xA4 */ 0x1EA5, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE */
/* 0xA5 */ 0x1EA7, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE */
/* 0xA6 */ 0x1EA9, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xA7 */ 0x1EAD, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW */
/* 0xA8 */ 0x1EBD, /* LATIN SMALL LETTER E WITH TILDE */
/* 0xA9 */ 0x1EB9, /* LATIN SMALL LETTER E WITH DOT BELOW */
/* 0xAA */ 0x1EBF, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE */
/* 0xAB */ 0x1EC1, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE */
/* 0xAC */ 0x1EC3, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xAD */ 0x1EC5, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE */
/* 0xAE */ 0x1EC7, /* LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW */
/* 0xAF */ 0x1ED1, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE */
/* 0xB0 */ 0x1ED3, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE */
/* 0xB1 */ 0x1ED5, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE */
/* 0xB2 */ 0x1ED7, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE */
/* 0xB3 */ 0x1EE0, /* LATIN CAPITAL LETTER O WITH HORN AND TILDE */
/* 0xB4 */ 0x01A0, /* LATIN CAPITAL LETTER O WITH HORN */
/* 0xB5 */ 0x1ED9, /* LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW */
/* 0xB6 */ 0x1EDD, /* LATIN SMALL LETTER O WITH HORN AND GRAVE */
/* 0xB7 */ 0x1EDF, /* LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE */
/* 0xB8 */ 0x1ECB, /* LATIN SMALL LETTER I WITH DOT BELOW */
/* 0xB9 */ 0x1EF0, /* LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW */
/* 0xBA */ 0x1EE8, /* LATIN CAPITAL LETTER U WITH HORN AND ACUTE */
/* 0xBB */ 0x1EEA, /* LATIN CAPITAL LETTER U WITH HORN AND GRAVE */
/* 0xBC */ 0x1EEC, /* LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE */
/* 0xBD */ 0x01A1, /* LATIN SMALL LETTER O WITH HORN */
/* 0xBE */ 0x1EDB, /* LATIN SMALL LETTER O WITH HORN AND ACUTE */
/* 0xBF */ 0x01AF, /* LATIN CAPITAL LETTER U WITH HORN */
/* 0xC0 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
/* 0xC3 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
/* 0xC4 */ 0x1EA2, /* LATIN CAPITAL LETTER A WITH HOOK ABOVE */
/* 0xC5 */ 0x0102, /* LATIN CAPITAL LETTER A WITH BREVE */
/* 0xC6 */ 0x1EB3, /* LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE */
/* 0xC7 */ 0x1EB5, /* LATIN SMALL LETTER A WITH BREVE AND TILDE */
/* 0xC8 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
/* 0xCA */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
/* 0xCB */ 0x1EBA, /* LATIN CAPITAL LETTER E WITH HOOK ABOVE */
/* 0xCC */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
/* 0xCE */ 0x0128, /* LATIN CAPITAL LETTER I WITH TILDE */
/* 0xCF */ 0x1EF3, /* LATIN SMALL LETTER Y WITH GRAVE */
/* 0xD0 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
/* 0xD1 */ 0x1EE9, /* LATIN SMALL LETTER U WITH HORN AND ACUTE */
/* 0xD2 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
/* 0xD5 */ 0x1EA1, /* LATIN SMALL LETTER A WITH DOT BELOW */
/* 0xD6 */ 0x1EF7, /* LATIN SMALL LETTER Y WITH HOOK ABOVE */
/* 0xD7 */ 0x1EEB, /* LATIN SMALL LETTER U WITH HORN AND GRAVE */
/* 0xD8 */ 0x1EED, /* LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE */
/* 0xD9 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
/* 0xDB */ 0x1EF9, /* LATIN SMALL LETTER Y WITH TILDE */
/* 0xDC */ 0x1EF5, /* LATIN SMALL LETTER Y WITH DOT BELOW */
/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
/* 0xDE */ 0x1EE1, /* LATIN SMALL LETTER O WITH HORN AND TILDE */
/* 0xDF */ 0x01B0, /* LATIN SMALL LETTER U WITH HORN */
/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
/* 0xE3 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
/* 0xE4 */ 0x1EA3, /* LATIN SMALL LETTER A WITH HOOK ABOVE */
/* 0xE5 */ 0x0103, /* LATIN SMALL LETTER A WITH BREVE */
/* 0xE6 */ 0x1EEF, /* LATIN SMALL LETTER U WITH HORN AND TILDE */
/* 0xE7 */ 0x1EAB, /* LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE */
/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
/* 0xEB */ 0x1EBB, /* LATIN SMALL LETTER E WITH HOOK ABOVE */
/* 0xEC */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
/* 0xEE */ 0x0129, /* LATIN SMALL LETTER I WITH TILDE */
/* 0xEF */ 0x1EC9, /* LATIN SMALL LETTER I WITH HOOK ABOVE */
/* 0xF0 */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
/* 0xF1 */ 0x1EF1, /* LATIN SMALL LETTER U WITH HORN AND DOT BELOW */
/* 0xF2 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
/* 0xF6 */ 0x1ECF, /* LATIN SMALL LETTER O WITH HOOK ABOVE */
/* 0xF7 */ 0x1ECD, /* LATIN SMALL LETTER O WITH DOT BELOW */
/* 0xF8 */ 0x1EE5, /* LATIN SMALL LETTER U WITH DOT BELOW */
/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
/* 0xFB */ 0x0169, /* LATIN SMALL LETTER U WITH TILDE */
/* 0xFC */ 0x1EE7, /* LATIN SMALL LETTER U WITH HOOK ABOVE */
/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
/* 0xFE */ 0x1EE3, /* LATIN SMALL LETTER O WITH HORN AND DOT BELOW */
/* 0xFF */ 0x1EEE, /* LATIN CAPITAL LETTER U WITH HORN AND TILDE */
};
/* Extra VISCII mappings that are kept as explicit table_entry pairs
 * instead of living in the dense highhalf_viscii array.  Each row looks
 * like (codepage byte, Unicode code point) -- field order presumed from
 * the values; confirm against struct table_entry.  The list is closed by
 * the {0, 0} row. */
const struct table_entry table_viscii [] = {
	{0xD0, 0x00D0}, /* LATIN CAPITAL LETTER ETH */
	{0xF0, 0x00F0}, /* LATIN SMALL LETTER ETH */
	{0, 0}
};
/* Alias names attached to the "VISCII" row of codepages[].
 * The list ends with a NULL sentinel. */
char *const aliases_viscii [] = {
	"VISCII", "VISCII-1",
	"VISCII 1.1", "VISCII-1.1", "VISCII1.1-1",
	"csVISCII",
	NULL
};
/*** utf8 ***/
/* UTF-8 defines no tables of its own: both table names are macros that
 * resolve to the shared NULL placeholders. */
#define highhalf_utf8 highhalf_NULL
#define table_utf8 table_NULL
/* Alias names for the "Unicode UTF-8" row of codepages[]; NULL-terminated. */
char *const aliases_utf8 [] = {
	"utf-8",
	"utf8",
	NULL
};
/*** Big5, Big5-HKSCS ***/
/* Many sites set charset="Big5" although they really use "Big5-hkscs".
 * Big5-hkscs is a superset of Big5, so both names share this one list. */
char *const aliases_big5 [] = {
	"Big5-HKSCS",
	"Big5",
	NULL	/* end of list */
};
/*** Shift JIS ***/
/* Alias names for the "Shift-JIS" row of codepages[]; NULL-terminated. */
char *const aliases_shift_jis [] = {
	"shift_jis",
	"shift-jis",
	"sjis",
	NULL
};
/*** EUC_CN ***/
/* Alias names for the "EUC-CN" row of codepages[]; NULL-terminated. */
char *const aliases_euc_cn [] = {
	"euc-cn",
	"euc_cn",
	NULL
};
/*** EUC_JP ***/
/* Alias names for the "EUC-JP" row of codepages[]; NULL-terminated. */
char *const aliases_euc_jp [] = {
	"euc-jp",
	"euc_jp",
	NULL
};
/*** EUC_KR ***/
/* Alias names for the "EUC-KR" row of codepages[]; NULL-terminated. */
char *const aliases_euc_kr [] = {
	"euc-kr",
	"euc_kr",
	NULL
};
/*** EUC_TW ***/
/* Alias names for the "EUC-TW" row of codepages[]; NULL-terminated. */
char *const aliases_euc_tw [] = {
	"euc-tw",
	"euc_tw",
	NULL
};
/*** GBK ***/
/* Alias names for the "GBK" row of codepages[]: the plain name first,
 * then the code-page-936 spellings.  A NULL sentinel ends the list. */
char *const aliases_gbk [] = {
	"gbk",
	"936", "cp936",
	"windows936", "windows-936",
	NULL
};
/*** GB2312 ***/
/* Alias names for the "GB2312" row of codepages[]; NULL-terminated. */
char *const aliases_gb2312 [] = {
	"gb2312",
	"gb-2312",
	"gb_2312",
	"gb 2312",
	NULL
};
/*** GB18030 ***/
/* Alias names for the "GB18030" row of codepages[]; NULL-terminated. */
char *const aliases_gb18030 [] = {
	"gb18030",
	"gb-18030",
	"gb_18030",
	"gb 18030",
	NULL
};
/*** ISO2022JP ***/
/* Alias names for the "ISO-2022-JP" row of codepages[]; NULL-terminated. */
char *const aliases_iso2022jp [] = {
	"iso2022jp",
	"iso-2022-jp",
	NULL
};
/*** NULL ***/
/* Placeholder high-half table shared by codepages that have no
 * single-byte mapping of their own.  Like the other highhalf_* arrays it
 * has 128 slots, slot i standing for byte 0x80 + i.  Every slot holds
 * 0xFFFF -- presumably the "no character" marker; confirm against the
 * lookup code. */
const uint16_t highhalf_NULL [] = {
	/* 0x80 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0x88 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0x90 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0x98 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xA0 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xA8 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xB0 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xB8 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xC0 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xC8 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xD0 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xD8 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xE0 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xE8 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xF0 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
	/* 0xF8 */ 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
};
/* Placeholder table_entry list with no mappings: its only row is the
 * same {0, 0} entry that closes table_viscii. */
const struct table_entry table_NULL [] = {
	{ 0, 0 },
};
/* Master list of codepages.  Each row gives, in order: the display name,
 * the NULL-terminated alias list, the high-half table, the table_entry
 * list, and a trailing flag.  The flag is 1 only for the CJK rows
 * (Big5 ... ISO-2022-JP), which also use the NULL placeholder tables; its
 * exact meaning is defined by struct codepage_desc, which is not visible
 * here.  The list ends with an all-NULL/zero row. */
const struct codepage_desc codepages [] = {
	{"7-bit ASCII", aliases_7bit, highhalf_7bit, table_7bit, 0},
	{"ISO 8859-1", aliases_8859_1, highhalf_8859_1, table_8859_1, 0},
	{"ISO 8859-2", aliases_8859_2, highhalf_8859_2, table_8859_2, 0},
	{"ISO 8859-3", aliases_8859_3, highhalf_8859_3, table_8859_3, 0},
	{"ISO 8859-4", aliases_8859_4, highhalf_8859_4, table_8859_4, 0},
	{"ISO 8859-5", aliases_8859_5, highhalf_8859_5, table_8859_5, 0},
	{"ISO 8859-6", aliases_8859_6, highhalf_8859_6, table_8859_6, 0},
	{"ISO 8859-7", aliases_8859_7, highhalf_8859_7, table_8859_7, 0},
	{"ISO 8859-8", aliases_8859_8, highhalf_8859_8, table_8859_8, 0},
	{"ISO 8859-9", aliases_8859_9, highhalf_8859_9, table_8859_9, 0},
	{"ISO 8859-10", aliases_8859_10, highhalf_8859_10, table_8859_10, 0},
	{"ISO 8859-13", aliases_8859_13, highhalf_8859_13, table_8859_13, 0},
	{"ISO 8859-14", aliases_8859_14, highhalf_8859_14, table_8859_14, 0},
	{"ISO 8859-15", aliases_8859_15, highhalf_8859_15, table_8859_15, 0},
	{"ISO 8859-16", aliases_8859_16, highhalf_8859_16, table_8859_16, 0},
	{"Window$ 1250", aliases_cp1250, highhalf_cp1250, table_cp1250, 0},
	{"Window$ 1251", aliases_cp1251, highhalf_cp1251, table_cp1251, 0},
	{"Window$ 1252", aliases_cp1252, highhalf_cp1252, table_cp1252, 0},
	{"Window$ 1256", aliases_cp1256, highhalf_cp1256, table_cp1256, 0},
	{"Window$ 1257", aliases_cp1257, highhalf_cp1257, table_cp1257, 0},
	{"CP 437", aliases_cp437, highhalf_cp437, table_cp437, 0},
	{"CP 737", aliases_cp737, highhalf_cp737, table_cp737, 0},
	{"CP 850", aliases_cp850, highhalf_cp850, table_cp850, 0},
	{"CP 852", aliases_cp852, highhalf_cp852, table_cp852, 0},
	{"CP 866", aliases_cp866, highhalf_cp866, table_cp866, 0},
	{"CP 1125 (Ukrainian)", aliases_cp1125, highhalf_cp1125, table_cp1125, 0},
	{"MacRoman 2000", aliases_macroman, highhalf_macroman, table_macroman, 0},
	{"Mac latin 2", aliases_mac_lat2, highhalf_mac_lat2, table_mac_lat2, 0},
	{"Kamenicky Brothers", aliases_kamen, highhalf_kamen, table_kamen, 0},
	{"KOI8-R", aliases_koi8_r, highhalf_koi8_r, table_koi8_r, 0},
	{"KOI8-U", aliases_koi8_u, highhalf_koi8_u, table_koi8_u, 0},
	{"KOI8-RU", aliases_koi8_ru, highhalf_koi8_ru, table_koi8_ru, 0},
	{"TCVN-5712", aliases_tcvn5712, highhalf_tcvn5712, table_tcvn5712, 0},
	{"VISCII", aliases_viscii, highhalf_viscii, table_viscii, 0},
	{"Unicode UTF-8", aliases_utf8, highhalf_utf8, table_utf8, 0},
	{"Big5", aliases_big5, highhalf_NULL, table_NULL, 1},
	{"Shift-JIS", aliases_shift_jis, highhalf_NULL, table_NULL, 1},
	{"EUC-CN", aliases_euc_cn, highhalf_NULL, table_NULL, 1},
	{"EUC-JP", aliases_euc_jp, highhalf_NULL, table_NULL, 1},
	{"EUC-KR", aliases_euc_kr, highhalf_NULL, table_NULL, 1},
	{"EUC-TW", aliases_euc_tw, highhalf_NULL, table_NULL, 1},
	{"GBK", aliases_gbk, highhalf_NULL, table_NULL, 1},
	{"GB2312", aliases_gb2312, highhalf_NULL, table_NULL, 1},
	{"GB18030", aliases_gb18030, highhalf_NULL, table_NULL, 1},
	{"ISO-2022-JP", aliases_iso2022jp, highhalf_NULL, table_NULL, 1},
	/* Terminator: all five members spelled out, matching the rows above. */
	{NULL, NULL, NULL, NULL, 0}
};
/* Number of real rows in codepages[], not counting the terminator. */
#define N_CODEPAGES 45