1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-07-21 15:54:31 -04:00
elinks/src/intl/codepage.inc

5055 lines
227 KiB
PHP
Raw Normal View History

/* Automatically generated by gen-cp */
/* DO NOT EDIT THIS FILE! EDIT Unicode/<whatever> INSTEAD! */
/* See the input files for copyrights and licences. */
/*** 7bit ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The 7bit section defines no arrays of its own for these two names;
 * they are redirected to highhalf_NULL and table_NULL, which are
 * defined outside this excerpt. */
#define highhalf_7bit	highhalf_NULL
#define table_7bit	table_NULL

/* Alias names listed for the 7bit codepage.  The order of the entries
 * is kept exactly as generated; a NULL pointer terminates the list. */
unsigned char *const aliases_7bit[] = {
	"us-ascii", "ascii", "7bit", "7-bit",
	"iso-ir-6", "ANSI_X3.4-1968", "ANSI_X3.4-1986",
	"646", "cp646", "ISO_646.irv:1991", "ISO646-US",
	"us", "IBM367", "cp367", "csASCII", "ISO646.1991-IRV",
	NULL	/* end-of-list sentinel */
};
/*** 8859_1 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode values for the upper half of the 8859_1 codepage.
 *
 * The array is dense: it has one entry per byte from 0x80 to 0xFF
 * (128 entries), and the byte itself is not stored -- entry [i]
 * belongs to byte 0x80 + i, as the leading comment on each row shows.
 *
 * NOTE(review): the 0xFFFF value in rows 0x80-0x9F looks like a
 * "no mapping" placeholder -- confirm against the code that reads
 * this table. */
const uint16_t highhalf_8859_1 [] = {
	/* 0x80 */ 0xFFFF,
	/* 0x81 */ 0xFFFF,
	/* 0x82 */ 0xFFFF,
	/* 0x83 */ 0xFFFF,
	/* 0x84 */ 0xFFFF,
	/* 0x85 */ 0xFFFF,
	/* 0x86 */ 0xFFFF,
	/* 0x87 */ 0xFFFF,
	/* 0x88 */ 0xFFFF,
	/* 0x89 */ 0xFFFF,
	/* 0x8A */ 0xFFFF,
	/* 0x8B */ 0xFFFF,
	/* 0x8C */ 0xFFFF,
	/* 0x8D */ 0xFFFF,
	/* 0x8E */ 0xFFFF,
	/* 0x8F */ 0xFFFF,
	/* 0x90 */ 0xFFFF,
	/* 0x91 */ 0xFFFF,
	/* 0x92 */ 0xFFFF,
	/* 0x93 */ 0xFFFF,
	/* 0x94 */ 0xFFFF,
	/* 0x95 */ 0xFFFF,
	/* 0x96 */ 0xFFFF,
	/* 0x97 */ 0xFFFF,
	/* 0x98 */ 0xFFFF,
	/* 0x99 */ 0xFFFF,
	/* 0x9A */ 0xFFFF,
	/* 0x9B */ 0xFFFF,
	/* 0x9C */ 0xFFFF,
	/* 0x9D */ 0xFFFF,
	/* 0x9E */ 0xFFFF,
	/* 0x9F */ 0xFFFF,
	/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xA1 */ 0x00A1, /* INVERTED EXCLAMATION MARK */
	/* 0xA2 */ 0x00A2, /* CENT SIGN */
	/* 0xA3 */ 0x00A3, /* POUND SIGN */
	/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
	/* 0xA5 */ 0x00A5, /* YEN SIGN */
	/* 0xA6 */ 0x00A6, /* BROKEN BAR */
	/* 0xA7 */ 0x00A7, /* SECTION SIGN */
	/* 0xA8 */ 0x00A8, /* DIAERESIS */
	/* 0xA9 */ 0x00A9, /* COPYRIGHT SIGN */
	/* 0xAA */ 0x00AA, /* FEMININE ORDINAL INDICATOR */
	/* 0xAB */ 0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xAC */ 0x00AC, /* NOT SIGN */
	/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
	/* 0xAE */ 0x00AE, /* REGISTERED SIGN */
	/* 0xAF */ 0x00AF, /* MACRON */
	/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
	/* 0xB1 */ 0x00B1, /* PLUS-MINUS SIGN */
	/* 0xB2 */ 0x00B2, /* SUPERSCRIPT TWO */
	/* 0xB3 */ 0x00B3, /* SUPERSCRIPT THREE */
	/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
	/* 0xB5 */ 0x00B5, /* MICRO SIGN */
	/* 0xB6 */ 0x00B6, /* PILCROW SIGN */
	/* 0xB7 */ 0x00B7, /* MIDDLE DOT */
	/* 0xB8 */ 0x00B8, /* CEDILLA */
	/* 0xB9 */ 0x00B9, /* SUPERSCRIPT ONE */
	/* 0xBA */ 0x00BA, /* MASCULINE ORDINAL INDICATOR */
	/* 0xBB */ 0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
	/* 0xBC */ 0x00BC, /* VULGAR FRACTION ONE QUARTER */
	/* 0xBD */ 0x00BD, /* VULGAR FRACTION ONE HALF */
	/* 0xBE */ 0x00BE, /* VULGAR FRACTION THREE QUARTERS */
	/* 0xBF */ 0x00BF, /* INVERTED QUESTION MARK */
	/* 0xC0 */ 0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
	/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	/* 0xC3 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
	/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
	/* 0xC6 */ 0x00C6, /* LATIN CAPITAL LETTER AE */
	/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
	/* 0xC8 */ 0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
	/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0xCA */ 0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
	/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
	/* 0xCC */ 0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
	/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	/* 0xCF */ 0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
	/* 0xD0 */ 0x00D0, /* LATIN CAPITAL LETTER ETH (Icelandic) */
	/* 0xD1 */ 0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
	/* 0xD2 */ 0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
	/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
	/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
	/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
	/* 0xD8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
	/* 0xD9 */ 0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
	/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
	/* 0xDE */ 0x00DE, /* LATIN CAPITAL LETTER THORN (Icelandic) */
	/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S (German) */
	/* 0xE0 */ 0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
	/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
	/* 0xE3 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
	/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
	/* 0xE6 */ 0x00E6, /* LATIN SMALL LETTER AE */
	/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
	/* 0xE8 */ 0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
	/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0xEA */ 0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
	/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
	/* 0xEC */ 0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
	/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
	/* 0xEF */ 0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
	/* 0xF0 */ 0x00F0, /* LATIN SMALL LETTER ETH (Icelandic) */
	/* 0xF1 */ 0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
	/* 0xF2 */ 0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
	/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
	/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
	/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
	/* 0xF8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
	/* 0xF9 */ 0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
	/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
	/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
	/* 0xFE */ 0x00FE, /* LATIN SMALL LETTER THORN (Icelandic) */
	/* 0xFF */ 0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The 8859_1 section defines no table of its own under this name; it
 * is redirected to table_NULL, which is defined outside this excerpt. */
#define table_8859_1	table_NULL

/* Alias names listed for the 8859_1 codepage.  The order of the
 * entries is kept exactly as generated; a NULL pointer terminates the
 * list. */
unsigned char *const aliases_8859_1[] = {
	"ISO-8859-1", "iso8859-1", "8859-1",
	"iso-ir-100", "latin1", "l1", "il1",
	"819", "cp819", "ISO_8859-1", "IBM819",
	"csISOLatin1", "ISO_8859-1:1987", "ISO8859_1",
	NULL	/* end-of-list sentinel */
};
/*** 8859_2 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode values for the upper half of the 8859_2 codepage.
 *
 * The array is dense: it has one entry per byte from 0x80 to 0xFF
 * (128 entries), and the byte itself is not stored -- entry [i]
 * belongs to byte 0x80 + i, as the leading comment on each row shows.
 *
 * NOTE(review): the 0xFFFF value in rows 0x80-0x9F looks like a
 * "no mapping" placeholder -- confirm against the code that reads
 * this table. */
const uint16_t highhalf_8859_2 [] = {
	/* 0x80 */ 0xFFFF,
	/* 0x81 */ 0xFFFF,
	/* 0x82 */ 0xFFFF,
	/* 0x83 */ 0xFFFF,
	/* 0x84 */ 0xFFFF,
	/* 0x85 */ 0xFFFF,
	/* 0x86 */ 0xFFFF,
	/* 0x87 */ 0xFFFF,
	/* 0x88 */ 0xFFFF,
	/* 0x89 */ 0xFFFF,
	/* 0x8A */ 0xFFFF,
	/* 0x8B */ 0xFFFF,
	/* 0x8C */ 0xFFFF,
	/* 0x8D */ 0xFFFF,
	/* 0x8E */ 0xFFFF,
	/* 0x8F */ 0xFFFF,
	/* 0x90 */ 0xFFFF,
	/* 0x91 */ 0xFFFF,
	/* 0x92 */ 0xFFFF,
	/* 0x93 */ 0xFFFF,
	/* 0x94 */ 0xFFFF,
	/* 0x95 */ 0xFFFF,
	/* 0x96 */ 0xFFFF,
	/* 0x97 */ 0xFFFF,
	/* 0x98 */ 0xFFFF,
	/* 0x99 */ 0xFFFF,
	/* 0x9A */ 0xFFFF,
	/* 0x9B */ 0xFFFF,
	/* 0x9C */ 0xFFFF,
	/* 0x9D */ 0xFFFF,
	/* 0x9E */ 0xFFFF,
	/* 0x9F */ 0xFFFF,
	/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xA1 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
	/* 0xA2 */ 0x02D8, /* BREVE */
	/* 0xA3 */ 0x0141, /* LATIN CAPITAL LETTER L WITH STROKE */
	/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
	/* 0xA5 */ 0x013D, /* LATIN CAPITAL LETTER L WITH CARON */
	/* 0xA6 */ 0x015A, /* LATIN CAPITAL LETTER S WITH ACUTE */
	/* 0xA7 */ 0x00A7, /* SECTION SIGN */
	/* 0xA8 */ 0x00A8, /* DIAERESIS */
	/* 0xA9 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
	/* 0xAA */ 0x015E, /* LATIN CAPITAL LETTER S WITH CEDILLA */
	/* 0xAB */ 0x0164, /* LATIN CAPITAL LETTER T WITH CARON */
	/* 0xAC */ 0x0179, /* LATIN CAPITAL LETTER Z WITH ACUTE */
	/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
	/* 0xAE */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
	/* 0xAF */ 0x017B, /* LATIN CAPITAL LETTER Z WITH DOT ABOVE */
	/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
	/* 0xB1 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
	/* 0xB2 */ 0x02DB, /* OGONEK */
	/* 0xB3 */ 0x0142, /* LATIN SMALL LETTER L WITH STROKE */
	/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
	/* 0xB5 */ 0x013E, /* LATIN SMALL LETTER L WITH CARON */
	/* 0xB6 */ 0x015B, /* LATIN SMALL LETTER S WITH ACUTE */
	/* 0xB7 */ 0x02C7, /* CARON */
	/* 0xB8 */ 0x00B8, /* CEDILLA */
	/* 0xB9 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
	/* 0xBA */ 0x015F, /* LATIN SMALL LETTER S WITH CEDILLA */
	/* 0xBB */ 0x0165, /* LATIN SMALL LETTER T WITH CARON */
	/* 0xBC */ 0x017A, /* LATIN SMALL LETTER Z WITH ACUTE */
	/* 0xBD */ 0x02DD, /* DOUBLE ACUTE ACCENT */
	/* 0xBE */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
	/* 0xBF */ 0x017C, /* LATIN SMALL LETTER Z WITH DOT ABOVE */
	/* 0xC0 */ 0x0154, /* LATIN CAPITAL LETTER R WITH ACUTE */
	/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	/* 0xC3 */ 0x0102, /* LATIN CAPITAL LETTER A WITH BREVE */
	/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0xC5 */ 0x0139, /* LATIN CAPITAL LETTER L WITH ACUTE */
	/* 0xC6 */ 0x0106, /* LATIN CAPITAL LETTER C WITH ACUTE */
	/* 0xC7 */ 0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
	/* 0xC8 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
	/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0xCA */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
	/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
	/* 0xCC */ 0x011A, /* LATIN CAPITAL LETTER E WITH CARON */
	/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	/* 0xCF */ 0x010E, /* LATIN CAPITAL LETTER D WITH CARON */
	/* 0xD0 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
	/* 0xD1 */ 0x0143, /* LATIN CAPITAL LETTER N WITH ACUTE */
	/* 0xD2 */ 0x0147, /* LATIN CAPITAL LETTER N WITH CARON */
	/* 0xD3 */ 0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
	/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xD5 */ 0x0150, /* LATIN CAPITAL LETTER O WITH DOUBLE ACUTE */
	/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
	/* 0xD8 */ 0x0158, /* LATIN CAPITAL LETTER R WITH CARON */
	/* 0xD9 */ 0x016E, /* LATIN CAPITAL LETTER U WITH RING ABOVE */
	/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0xDB */ 0x0170, /* LATIN CAPITAL LETTER U WITH DOUBLE ACUTE */
	/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0xDD */ 0x00DD, /* LATIN CAPITAL LETTER Y WITH ACUTE */
	/* 0xDE */ 0x0162, /* LATIN CAPITAL LETTER T WITH CEDILLA */
	/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
	/* 0xE0 */ 0x0155, /* LATIN SMALL LETTER R WITH ACUTE */
	/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
	/* 0xE3 */ 0x0103, /* LATIN SMALL LETTER A WITH BREVE */
	/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0xE5 */ 0x013A, /* LATIN SMALL LETTER L WITH ACUTE */
	/* 0xE6 */ 0x0107, /* LATIN SMALL LETTER C WITH ACUTE */
	/* 0xE7 */ 0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
	/* 0xE8 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
	/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0xEA */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
	/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
	/* 0xEC */ 0x011B, /* LATIN SMALL LETTER E WITH CARON */
	/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
	/* 0xEF */ 0x010F, /* LATIN SMALL LETTER D WITH CARON */
	/* 0xF0 */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
	/* 0xF1 */ 0x0144, /* LATIN SMALL LETTER N WITH ACUTE */
	/* 0xF2 */ 0x0148, /* LATIN SMALL LETTER N WITH CARON */
	/* 0xF3 */ 0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
	/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0xF5 */ 0x0151, /* LATIN SMALL LETTER O WITH DOUBLE ACUTE */
	/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
	/* 0xF8 */ 0x0159, /* LATIN SMALL LETTER R WITH CARON */
	/* 0xF9 */ 0x016F, /* LATIN SMALL LETTER U WITH RING ABOVE */
	/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0xFB */ 0x0171, /* LATIN SMALL LETTER U WITH DOUBLE ACUTE */
	/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xFD */ 0x00FD, /* LATIN SMALL LETTER Y WITH ACUTE */
	/* 0xFE */ 0x0163, /* LATIN SMALL LETTER T WITH CEDILLA */
	/* 0xFF */ 0x02D9, /* DOT ABOVE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* No dedicated table_8859_2 is defined here; the name simply expands to
 * table_NULL, which is declared outside this part of the file. */
#define table_8859_2 table_NULL

/* Alias names listed for the 8859_2 codepage. The list is positional (order
 * is kept exactly as generated) and ends with a NULL entry. */
unsigned char *const aliases_8859_2 [] = {
	"ISO-8859-2", "iso8859-2", "8859-2",
	"iso-ir-101",
	"latin2", "l2", "il2",
	"ISO_8859-2:1987", "ISO_8859-2",
	"csISOLatin2",
	"ISO8859_2",
	NULL	/* end-of-list marker */
};
/*** 8859_3 ***/
/* Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
   2006-09-24 09:55:29 -04:00 */
/* Unicode code points for the upper half (bytes 0x80..0xFF) of the 8859_3
 * codepage. The byte itself is not stored: entry [i] belongs to byte
 * 0x80 + i, and the trailing comment on each line names that byte and the
 * character. Entries holding 0xFFFF have no character name in the
 * generated data; presumably they mark bytes without a Unicode mapping --
 * confirm against the code that consumes this table. */
const uint16_t highhalf_8859_3 [] = {
	0x0080, /* 0x80: <control> */
	0x0081, /* 0x81: <control> */
	0x0082, /* 0x82: <control> */
	0x0083, /* 0x83: <control> */
	0x0084, /* 0x84: <control> */
	0x0085, /* 0x85: <control> */
	0x0086, /* 0x86: <control> */
	0x0087, /* 0x87: <control> */
	0x0088, /* 0x88: <control> */
	0x0089, /* 0x89: <control> */
	0x008A, /* 0x8A: <control> */
	0x008B, /* 0x8B: <control> */
	0x008C, /* 0x8C: <control> */
	0x008D, /* 0x8D: <control> */
	0x008E, /* 0x8E: <control> */
	0x008F, /* 0x8F: <control> */
	0x0090, /* 0x90: <control> */
	0x0091, /* 0x91: <control> */
	0x0092, /* 0x92: <control> */
	0x0093, /* 0x93: <control> */
	0x0094, /* 0x94: <control> */
	0x0095, /* 0x95: <control> */
	0x0096, /* 0x96: <control> */
	0x0097, /* 0x97: <control> */
	0x0098, /* 0x98: <control> */
	0x0099, /* 0x99: <control> */
	0x009A, /* 0x9A: <control> */
	0x009B, /* 0x9B: <control> */
	0x009C, /* 0x9C: <control> */
	0x009D, /* 0x9D: <control> */
	0x009E, /* 0x9E: <control> */
	0x009F, /* 0x9F: <control> */
	0x00A0, /* 0xA0: NO-BREAK SPACE */
	0x0126, /* 0xA1: LATIN CAPITAL LETTER H WITH STROKE */
	0x02D8, /* 0xA2: BREVE */
	0x00A3, /* 0xA3: POUND SIGN */
	0x00A4, /* 0xA4: CURRENCY SIGN */
	0xFFFF, /* 0xA5: no character listed */
	0x0124, /* 0xA6: LATIN CAPITAL LETTER H WITH CIRCUMFLEX */
	0x00A7, /* 0xA7: SECTION SIGN */
	0x00A8, /* 0xA8: DIAERESIS */
	0x0130, /* 0xA9: LATIN CAPITAL LETTER I WITH DOT ABOVE */
	0x015E, /* 0xAA: LATIN CAPITAL LETTER S WITH CEDILLA */
	0x011E, /* 0xAB: LATIN CAPITAL LETTER G WITH BREVE */
	0x0134, /* 0xAC: LATIN CAPITAL LETTER J WITH CIRCUMFLEX */
	0x00AD, /* 0xAD: SOFT HYPHEN */
	0xFFFF, /* 0xAE: no character listed */
	0x017B, /* 0xAF: LATIN CAPITAL LETTER Z WITH DOT ABOVE */
	0x00B0, /* 0xB0: DEGREE SIGN */
	0x0127, /* 0xB1: LATIN SMALL LETTER H WITH STROKE */
	0x00B2, /* 0xB2: SUPERSCRIPT TWO */
	0x00B3, /* 0xB3: SUPERSCRIPT THREE */
	0x00B4, /* 0xB4: ACUTE ACCENT */
	0x00B5, /* 0xB5: MICRO SIGN */
	0x0125, /* 0xB6: LATIN SMALL LETTER H WITH CIRCUMFLEX */
	0x00B7, /* 0xB7: MIDDLE DOT */
	0x00B8, /* 0xB8: CEDILLA */
	0x0131, /* 0xB9: LATIN SMALL LETTER DOTLESS I */
	0x015F, /* 0xBA: LATIN SMALL LETTER S WITH CEDILLA */
	0x011F, /* 0xBB: LATIN SMALL LETTER G WITH BREVE */
	0x0135, /* 0xBC: LATIN SMALL LETTER J WITH CIRCUMFLEX */
	0x00BD, /* 0xBD: VULGAR FRACTION ONE HALF */
	0xFFFF, /* 0xBE: no character listed */
	0x017C, /* 0xBF: LATIN SMALL LETTER Z WITH DOT ABOVE */
	0x00C0, /* 0xC0: LATIN CAPITAL LETTER A WITH GRAVE */
	0x00C1, /* 0xC1: LATIN CAPITAL LETTER A WITH ACUTE */
	0x00C2, /* 0xC2: LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	0xFFFF, /* 0xC3: no character listed */
	0x00C4, /* 0xC4: LATIN CAPITAL LETTER A WITH DIAERESIS */
	0x010A, /* 0xC5: LATIN CAPITAL LETTER C WITH DOT ABOVE */
	0x0108, /* 0xC6: LATIN CAPITAL LETTER C WITH CIRCUMFLEX */
	0x00C7, /* 0xC7: LATIN CAPITAL LETTER C WITH CEDILLA */
	0x00C8, /* 0xC8: LATIN CAPITAL LETTER E WITH GRAVE */
	0x00C9, /* 0xC9: LATIN CAPITAL LETTER E WITH ACUTE */
	0x00CA, /* 0xCA: LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
	0x00CB, /* 0xCB: LATIN CAPITAL LETTER E WITH DIAERESIS */
	0x00CC, /* 0xCC: LATIN CAPITAL LETTER I WITH GRAVE */
	0x00CD, /* 0xCD: LATIN CAPITAL LETTER I WITH ACUTE */
	0x00CE, /* 0xCE: LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	0x00CF, /* 0xCF: LATIN CAPITAL LETTER I WITH DIAERESIS */
	0xFFFF, /* 0xD0: no character listed */
	0x00D1, /* 0xD1: LATIN CAPITAL LETTER N WITH TILDE */
	0x00D2, /* 0xD2: LATIN CAPITAL LETTER O WITH GRAVE */
	0x00D3, /* 0xD3: LATIN CAPITAL LETTER O WITH ACUTE */
	0x00D4, /* 0xD4: LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	0x0120, /* 0xD5: LATIN CAPITAL LETTER G WITH DOT ABOVE */
	0x00D6, /* 0xD6: LATIN CAPITAL LETTER O WITH DIAERESIS */
	0x00D7, /* 0xD7: MULTIPLICATION SIGN */
	0x011C, /* 0xD8: LATIN CAPITAL LETTER G WITH CIRCUMFLEX */
	0x00D9, /* 0xD9: LATIN CAPITAL LETTER U WITH GRAVE */
	0x00DA, /* 0xDA: LATIN CAPITAL LETTER U WITH ACUTE */
	0x00DB, /* 0xDB: LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	0x00DC, /* 0xDC: LATIN CAPITAL LETTER U WITH DIAERESIS */
	0x016C, /* 0xDD: LATIN CAPITAL LETTER U WITH BREVE */
	0x015C, /* 0xDE: LATIN CAPITAL LETTER S WITH CIRCUMFLEX */
	0x00DF, /* 0xDF: LATIN SMALL LETTER SHARP S */
	0x00E0, /* 0xE0: LATIN SMALL LETTER A WITH GRAVE */
	0x00E1, /* 0xE1: LATIN SMALL LETTER A WITH ACUTE */
	0x00E2, /* 0xE2: LATIN SMALL LETTER A WITH CIRCUMFLEX */
	0xFFFF, /* 0xE3: no character listed */
	0x00E4, /* 0xE4: LATIN SMALL LETTER A WITH DIAERESIS */
	0x010B, /* 0xE5: LATIN SMALL LETTER C WITH DOT ABOVE */
	0x0109, /* 0xE6: LATIN SMALL LETTER C WITH CIRCUMFLEX */
	0x00E7, /* 0xE7: LATIN SMALL LETTER C WITH CEDILLA */
	0x00E8, /* 0xE8: LATIN SMALL LETTER E WITH GRAVE */
	0x00E9, /* 0xE9: LATIN SMALL LETTER E WITH ACUTE */
	0x00EA, /* 0xEA: LATIN SMALL LETTER E WITH CIRCUMFLEX */
	0x00EB, /* 0xEB: LATIN SMALL LETTER E WITH DIAERESIS */
	0x00EC, /* 0xEC: LATIN SMALL LETTER I WITH GRAVE */
	0x00ED, /* 0xED: LATIN SMALL LETTER I WITH ACUTE */
	0x00EE, /* 0xEE: LATIN SMALL LETTER I WITH CIRCUMFLEX */
	0x00EF, /* 0xEF: LATIN SMALL LETTER I WITH DIAERESIS */
	0xFFFF, /* 0xF0: no character listed */
	0x00F1, /* 0xF1: LATIN SMALL LETTER N WITH TILDE */
	0x00F2, /* 0xF2: LATIN SMALL LETTER O WITH GRAVE */
	0x00F3, /* 0xF3: LATIN SMALL LETTER O WITH ACUTE */
	0x00F4, /* 0xF4: LATIN SMALL LETTER O WITH CIRCUMFLEX */
	0x0121, /* 0xF5: LATIN SMALL LETTER G WITH DOT ABOVE */
	0x00F6, /* 0xF6: LATIN SMALL LETTER O WITH DIAERESIS */
	0x00F7, /* 0xF7: DIVISION SIGN */
	0x011D, /* 0xF8: LATIN SMALL LETTER G WITH CIRCUMFLEX */
	0x00F9, /* 0xF9: LATIN SMALL LETTER U WITH GRAVE */
	0x00FA, /* 0xFA: LATIN SMALL LETTER U WITH ACUTE */
	0x00FB, /* 0xFB: LATIN SMALL LETTER U WITH CIRCUMFLEX */
	0x00FC, /* 0xFC: LATIN SMALL LETTER U WITH DIAERESIS */
	0x016D, /* 0xFD: LATIN SMALL LETTER U WITH BREVE */
	0x015D, /* 0xFE: LATIN SMALL LETTER S WITH CIRCUMFLEX */
	0x02D9, /* 0xFF: DOT ABOVE */
};
/* Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
   2006-09-24 09:55:29 -04:00 */
/* No dedicated table_8859_3 is defined here; the name simply expands to
 * table_NULL, which is declared outside this part of the file. */
#define table_8859_3 table_NULL

/* Alias names listed for the 8859_3 codepage. The list is positional (order
 * is kept exactly as generated) and ends with a NULL entry. */
unsigned char *const aliases_8859_3 [] = {
	"ISO-8859-3", "8859-3",
	"ISO_8859-3:1988",
	"iso-ir-109",
	"ISO_8859-3",
	"latin3", "l3",
	"csISOLatin3",
	"ISO8859-3", "ISO8859_3",
	NULL	/* end-of-list marker */
};
/*** 8859_4 ***/
/* Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
   2006-09-24 09:55:29 -04:00 */
/* Unicode code points for the upper half (bytes 0x80..0xFF) of the 8859_4
 * codepage. The byte itself is not stored: entry [i] belongs to byte
 * 0x80 + i, as the leading comment on each line spells out. The table must
 * contain exactly 128 entries and nothing but initializers and comments;
 * stray revision-history text that had ended up between the entries (and
 * made the initializer invalid C) has been removed. */
const uint16_t highhalf_8859_4 [] = {
	/* 0x80 */ 0x0080, /* <control> */
	/* 0x81 */ 0x0081, /* <control> */
	/* 0x82 */ 0x0082, /* <control> */
	/* 0x83 */ 0x0083, /* <control> */
	/* 0x84 */ 0x0084, /* <control> */
	/* 0x85 */ 0x0085, /* <control> */
	/* 0x86 */ 0x0086, /* <control> */
	/* 0x87 */ 0x0087, /* <control> */
	/* 0x88 */ 0x0088, /* <control> */
	/* 0x89 */ 0x0089, /* <control> */
	/* 0x8A */ 0x008A, /* <control> */
	/* 0x8B */ 0x008B, /* <control> */
	/* 0x8C */ 0x008C, /* <control> */
	/* 0x8D */ 0x008D, /* <control> */
	/* 0x8E */ 0x008E, /* <control> */
	/* 0x8F */ 0x008F, /* <control> */
	/* 0x90 */ 0x0090, /* <control> */
	/* 0x91 */ 0x0091, /* <control> */
	/* 0x92 */ 0x0092, /* <control> */
	/* 0x93 */ 0x0093, /* <control> */
	/* 0x94 */ 0x0094, /* <control> */
	/* 0x95 */ 0x0095, /* <control> */
	/* 0x96 */ 0x0096, /* <control> */
	/* 0x97 */ 0x0097, /* <control> */
	/* 0x98 */ 0x0098, /* <control> */
	/* 0x99 */ 0x0099, /* <control> */
	/* 0x9A */ 0x009A, /* <control> */
	/* 0x9B */ 0x009B, /* <control> */
	/* 0x9C */ 0x009C, /* <control> */
	/* 0x9D */ 0x009D, /* <control> */
	/* 0x9E */ 0x009E, /* <control> */
	/* 0x9F */ 0x009F, /* <control> */
	/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
	/* 0xA1 */ 0x0104, /* LATIN CAPITAL LETTER A WITH OGONEK */
	/* 0xA2 */ 0x0138, /* LATIN SMALL LETTER KRA */
	/* 0xA3 */ 0x0156, /* LATIN CAPITAL LETTER R WITH CEDILLA */
	/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
	/* 0xA5 */ 0x0128, /* LATIN CAPITAL LETTER I WITH TILDE */
	/* 0xA6 */ 0x013B, /* LATIN CAPITAL LETTER L WITH CEDILLA */
	/* 0xA7 */ 0x00A7, /* SECTION SIGN */
	/* 0xA8 */ 0x00A8, /* DIAERESIS */
	/* 0xA9 */ 0x0160, /* LATIN CAPITAL LETTER S WITH CARON */
	/* 0xAA */ 0x0112, /* LATIN CAPITAL LETTER E WITH MACRON */
	/* 0xAB */ 0x0122, /* LATIN CAPITAL LETTER G WITH CEDILLA */
	/* 0xAC */ 0x0166, /* LATIN CAPITAL LETTER T WITH STROKE */
	/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
	/* 0xAE */ 0x017D, /* LATIN CAPITAL LETTER Z WITH CARON */
	/* 0xAF */ 0x00AF, /* MACRON */
	/* 0xB0 */ 0x00B0, /* DEGREE SIGN */
	/* 0xB1 */ 0x0105, /* LATIN SMALL LETTER A WITH OGONEK */
	/* 0xB2 */ 0x02DB, /* OGONEK */
	/* 0xB3 */ 0x0157, /* LATIN SMALL LETTER R WITH CEDILLA */
	/* 0xB4 */ 0x00B4, /* ACUTE ACCENT */
	/* 0xB5 */ 0x0129, /* LATIN SMALL LETTER I WITH TILDE */
	/* 0xB6 */ 0x013C, /* LATIN SMALL LETTER L WITH CEDILLA */
	/* 0xB7 */ 0x02C7, /* CARON */
	/* 0xB8 */ 0x00B8, /* CEDILLA */
	/* 0xB9 */ 0x0161, /* LATIN SMALL LETTER S WITH CARON */
	/* 0xBA */ 0x0113, /* LATIN SMALL LETTER E WITH MACRON */
	/* 0xBB */ 0x0123, /* LATIN SMALL LETTER G WITH CEDILLA */
	/* 0xBC */ 0x0167, /* LATIN SMALL LETTER T WITH STROKE */
	/* 0xBD */ 0x014A, /* LATIN CAPITAL LETTER ENG */
	/* 0xBE */ 0x017E, /* LATIN SMALL LETTER Z WITH CARON */
	/* 0xBF */ 0x014B, /* LATIN SMALL LETTER ENG */
	/* 0xC0 */ 0x0100, /* LATIN CAPITAL LETTER A WITH MACRON */
	/* 0xC1 */ 0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
	/* 0xC2 */ 0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
	/* 0xC3 */ 0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
	/* 0xC4 */ 0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
	/* 0xC5 */ 0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
	/* 0xC6 */ 0x00C6, /* LATIN CAPITAL LETTER AE */
	/* 0xC7 */ 0x012E, /* LATIN CAPITAL LETTER I WITH OGONEK */
	/* 0xC8 */ 0x010C, /* LATIN CAPITAL LETTER C WITH CARON */
	/* 0xC9 */ 0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
	/* 0xCA */ 0x0118, /* LATIN CAPITAL LETTER E WITH OGONEK */
	/* 0xCB */ 0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
	/* 0xCC */ 0x0116, /* LATIN CAPITAL LETTER E WITH DOT ABOVE */
	/* 0xCD */ 0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
	/* 0xCE */ 0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
	/* 0xCF */ 0x012A, /* LATIN CAPITAL LETTER I WITH MACRON */
	/* 0xD0 */ 0x0110, /* LATIN CAPITAL LETTER D WITH STROKE */
	/* 0xD1 */ 0x0145, /* LATIN CAPITAL LETTER N WITH CEDILLA */
	/* 0xD2 */ 0x014C, /* LATIN CAPITAL LETTER O WITH MACRON */
	/* 0xD3 */ 0x0136, /* LATIN CAPITAL LETTER K WITH CEDILLA */
	/* 0xD4 */ 0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
	/* 0xD5 */ 0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
	/* 0xD6 */ 0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
	/* 0xD7 */ 0x00D7, /* MULTIPLICATION SIGN */
	/* 0xD8 */ 0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
	/* 0xD9 */ 0x0172, /* LATIN CAPITAL LETTER U WITH OGONEK */
	/* 0xDA */ 0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
	/* 0xDB */ 0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
	/* 0xDC */ 0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
	/* 0xDD */ 0x0168, /* LATIN CAPITAL LETTER U WITH TILDE */
	/* 0xDE */ 0x016A, /* LATIN CAPITAL LETTER U WITH MACRON */
	/* 0xDF */ 0x00DF, /* LATIN SMALL LETTER SHARP S */
	/* 0xE0 */ 0x0101, /* LATIN SMALL LETTER A WITH MACRON */
	/* 0xE1 */ 0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
	/* 0xE2 */ 0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
	/* 0xE3 */ 0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
	/* 0xE4 */ 0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
	/* 0xE5 */ 0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
	/* 0xE6 */ 0x00E6, /* LATIN SMALL LETTER AE */
	/* 0xE7 */ 0x012F, /* LATIN SMALL LETTER I WITH OGONEK */
	/* 0xE8 */ 0x010D, /* LATIN SMALL LETTER C WITH CARON */
	/* 0xE9 */ 0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
	/* 0xEA */ 0x0119, /* LATIN SMALL LETTER E WITH OGONEK */
	/* 0xEB */ 0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
	/* 0xEC */ 0x0117, /* LATIN SMALL LETTER E WITH DOT ABOVE */
	/* 0xED */ 0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
	/* 0xEE */ 0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
	/* 0xEF */ 0x012B, /* LATIN SMALL LETTER I WITH MACRON */
	/* 0xF0 */ 0x0111, /* LATIN SMALL LETTER D WITH STROKE */
	/* 0xF1 */ 0x0146, /* LATIN SMALL LETTER N WITH CEDILLA */
	/* 0xF2 */ 0x014D, /* LATIN SMALL LETTER O WITH MACRON */
	/* 0xF3 */ 0x0137, /* LATIN SMALL LETTER K WITH CEDILLA */
	/* 0xF4 */ 0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
	/* 0xF5 */ 0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
	/* 0xF6 */ 0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
	/* 0xF7 */ 0x00F7, /* DIVISION SIGN */
	/* 0xF8 */ 0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
	/* 0xF9 */ 0x0173, /* LATIN SMALL LETTER U WITH OGONEK */
	/* 0xFA */ 0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
	/* 0xFB */ 0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
	/* 0xFC */ 0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
	/* 0xFD */ 0x0169, /* LATIN SMALL LETTER U WITH TILDE */
	/* 0xFE */ 0x016B, /* LATIN SMALL LETTER U WITH MACRON */
	/* 0xFF */ 0x02D9, /* DOT ABOVE */
};
/* Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
   2006-09-24 09:55:29 -04:00 */
/* No dedicated table_8859_4 is defined here; the name simply expands to
 * table_NULL, which is declared outside this part of the file. */
#define table_8859_4 table_NULL

/* Alias names listed for the 8859_4 codepage. The list is positional (order
 * is kept exactly as generated) and ends with a NULL entry. */
unsigned char *const aliases_8859_4 [] = {
	"ISO-8859-4", "iso8859-4", "8859-4",
	"iso-ir-110",
	"latin4", "l4", "il4",
	"ISO_8859-4:1988", "ISO_8859-4",
	"csISOLatin4",
	"ISO8859_4",
	NULL	/* end-of-list marker */
};
/*** 8859_5 ***/
/* Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
   2006-09-24 09:55:29 -04:00 */
/* Unicode code points for the upper half of the 8859_5 codepage.  There is
 * one entry per byte 0x80..0xFF, in ascending order, so the entry for byte B
 * sits at index B - 0x80 (128 entries in total).  On each line the leading
 * comment is the codepage byte and the trailing comment is the name of the
 * Unicode character it maps to. */
const uint16_t highhalf_8859_5 [] = {
/* 0x80 */ 0x0080, /* <control> */
/* 0x81 */ 0x0081, /* <control> */
/* 0x82 */ 0x0082, /* <control> */
/* 0x83 */ 0x0083, /* <control> */
/* 0x84 */ 0x0084, /* <control> */
/* 0x85 */ 0x0085, /* <control> */
/* 0x86 */ 0x0086, /* <control> */
/* 0x87 */ 0x0087, /* <control> */
/* 0x88 */ 0x0088, /* <control> */
/* 0x89 */ 0x0089, /* <control> */
/* 0x8A */ 0x008A, /* <control> */
/* 0x8B */ 0x008B, /* <control> */
/* 0x8C */ 0x008C, /* <control> */
/* 0x8D */ 0x008D, /* <control> */
/* 0x8E */ 0x008E, /* <control> */
/* 0x8F */ 0x008F, /* <control> */
/* 0x90 */ 0x0090, /* <control> */
/* 0x91 */ 0x0091, /* <control> */
/* 0x92 */ 0x0092, /* <control> */
/* 0x93 */ 0x0093, /* <control> */
/* 0x94 */ 0x0094, /* <control> */
/* 0x95 */ 0x0095, /* <control> */
/* 0x96 */ 0x0096, /* <control> */
/* 0x97 */ 0x0097, /* <control> */
/* 0x98 */ 0x0098, /* <control> */
/* 0x99 */ 0x0099, /* <control> */
/* 0x9A */ 0x009A, /* <control> */
/* 0x9B */ 0x009B, /* <control> */
/* 0x9C */ 0x009C, /* <control> */
/* 0x9D */ 0x009D, /* <control> */
/* 0x9E */ 0x009E, /* <control> */
/* 0x9F */ 0x009F, /* <control> */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0x0401, /* CYRILLIC CAPITAL LETTER IO */
/* 0xA2 */ 0x0402, /* CYRILLIC CAPITAL LETTER DJE */
/* 0xA3 */ 0x0403, /* CYRILLIC CAPITAL LETTER GJE */
/* 0xA4 */ 0x0404, /* CYRILLIC CAPITAL LETTER UKRAINIAN IE */
/* 0xA5 */ 0x0405, /* CYRILLIC CAPITAL LETTER DZE */
/* 0xA6 */ 0x0406, /* CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xA7 */ 0x0407, /* CYRILLIC CAPITAL LETTER YI */
/* 0xA8 */ 0x0408, /* CYRILLIC CAPITAL LETTER JE */
/* 0xA9 */ 0x0409, /* CYRILLIC CAPITAL LETTER LJE */
/* 0xAA */ 0x040A, /* CYRILLIC CAPITAL LETTER NJE */
/* 0xAB */ 0x040B, /* CYRILLIC CAPITAL LETTER TSHE */
/* 0xAC */ 0x040C, /* CYRILLIC CAPITAL LETTER KJE */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0x040E, /* CYRILLIC CAPITAL LETTER SHORT U */
/* 0xAF */ 0x040F, /* CYRILLIC CAPITAL LETTER DZHE */
/* 0xB0 */ 0x0410, /* CYRILLIC CAPITAL LETTER A */
/* 0xB1 */ 0x0411, /* CYRILLIC CAPITAL LETTER BE */
/* 0xB2 */ 0x0412, /* CYRILLIC CAPITAL LETTER VE */
/* 0xB3 */ 0x0413, /* CYRILLIC CAPITAL LETTER GHE */
/* 0xB4 */ 0x0414, /* CYRILLIC CAPITAL LETTER DE */
/* 0xB5 */ 0x0415, /* CYRILLIC CAPITAL LETTER IE */
/* 0xB6 */ 0x0416, /* CYRILLIC CAPITAL LETTER ZHE */
/* 0xB7 */ 0x0417, /* CYRILLIC CAPITAL LETTER ZE */
/* 0xB8 */ 0x0418, /* CYRILLIC CAPITAL LETTER I */
/* 0xB9 */ 0x0419, /* CYRILLIC CAPITAL LETTER SHORT I */
/* 0xBA */ 0x041A, /* CYRILLIC CAPITAL LETTER KA */
/* 0xBB */ 0x041B, /* CYRILLIC CAPITAL LETTER EL */
/* 0xBC */ 0x041C, /* CYRILLIC CAPITAL LETTER EM */
/* 0xBD */ 0x041D, /* CYRILLIC CAPITAL LETTER EN */
/* 0xBE */ 0x041E, /* CYRILLIC CAPITAL LETTER O */
/* 0xBF */ 0x041F, /* CYRILLIC CAPITAL LETTER PE */
/* 0xC0 */ 0x0420, /* CYRILLIC CAPITAL LETTER ER */
/* 0xC1 */ 0x0421, /* CYRILLIC CAPITAL LETTER ES */
/* 0xC2 */ 0x0422, /* CYRILLIC CAPITAL LETTER TE */
/* 0xC3 */ 0x0423, /* CYRILLIC CAPITAL LETTER U */
/* 0xC4 */ 0x0424, /* CYRILLIC CAPITAL LETTER EF */
/* 0xC5 */ 0x0425, /* CYRILLIC CAPITAL LETTER HA */
/* 0xC6 */ 0x0426, /* CYRILLIC CAPITAL LETTER TSE */
/* 0xC7 */ 0x0427, /* CYRILLIC CAPITAL LETTER CHE */
/* 0xC8 */ 0x0428, /* CYRILLIC CAPITAL LETTER SHA */
/* 0xC9 */ 0x0429, /* CYRILLIC CAPITAL LETTER SHCHA */
/* 0xCA */ 0x042A, /* CYRILLIC CAPITAL LETTER HARD SIGN */
/* 0xCB */ 0x042B, /* CYRILLIC CAPITAL LETTER YERU */
/* 0xCC */ 0x042C, /* CYRILLIC CAPITAL LETTER SOFT SIGN */
/* 0xCD */ 0x042D, /* CYRILLIC CAPITAL LETTER E */
/* 0xCE */ 0x042E, /* CYRILLIC CAPITAL LETTER YU */
/* 0xCF */ 0x042F, /* CYRILLIC CAPITAL LETTER YA */
/* 0xD0 */ 0x0430, /* CYRILLIC SMALL LETTER A */
/* 0xD1 */ 0x0431, /* CYRILLIC SMALL LETTER BE */
/* 0xD2 */ 0x0432, /* CYRILLIC SMALL LETTER VE */
/* 0xD3 */ 0x0433, /* CYRILLIC SMALL LETTER GHE */
/* 0xD4 */ 0x0434, /* CYRILLIC SMALL LETTER DE */
/* 0xD5 */ 0x0435, /* CYRILLIC SMALL LETTER IE */
/* 0xD6 */ 0x0436, /* CYRILLIC SMALL LETTER ZHE */
/* 0xD7 */ 0x0437, /* CYRILLIC SMALL LETTER ZE */
/* 0xD8 */ 0x0438, /* CYRILLIC SMALL LETTER I */
/* 0xD9 */ 0x0439, /* CYRILLIC SMALL LETTER SHORT I */
/* 0xDA */ 0x043A, /* CYRILLIC SMALL LETTER KA */
/* 0xDB */ 0x043B, /* CYRILLIC SMALL LETTER EL */
/* 0xDC */ 0x043C, /* CYRILLIC SMALL LETTER EM */
/* 0xDD */ 0x043D, /* CYRILLIC SMALL LETTER EN */
/* 0xDE */ 0x043E, /* CYRILLIC SMALL LETTER O */
/* 0xDF */ 0x043F, /* CYRILLIC SMALL LETTER PE */
/* 0xE0 */ 0x0440, /* CYRILLIC SMALL LETTER ER */
/* 0xE1 */ 0x0441, /* CYRILLIC SMALL LETTER ES */
/* 0xE2 */ 0x0442, /* CYRILLIC SMALL LETTER TE */
/* 0xE3 */ 0x0443, /* CYRILLIC SMALL LETTER U */
/* 0xE4 */ 0x0444, /* CYRILLIC SMALL LETTER EF */
/* 0xE5 */ 0x0445, /* CYRILLIC SMALL LETTER HA */
/* 0xE6 */ 0x0446, /* CYRILLIC SMALL LETTER TSE */
/* 0xE7 */ 0x0447, /* CYRILLIC SMALL LETTER CHE */
/* 0xE8 */ 0x0448, /* CYRILLIC SMALL LETTER SHA */
/* 0xE9 */ 0x0449, /* CYRILLIC SMALL LETTER SHCHA */
/* 0xEA */ 0x044A, /* CYRILLIC SMALL LETTER HARD SIGN */
/* 0xEB */ 0x044B, /* CYRILLIC SMALL LETTER YERU */
/* 0xEC */ 0x044C, /* CYRILLIC SMALL LETTER SOFT SIGN */
/* 0xED */ 0x044D, /* CYRILLIC SMALL LETTER E */
/* 0xEE */ 0x044E, /* CYRILLIC SMALL LETTER YU */
/* 0xEF */ 0x044F, /* CYRILLIC SMALL LETTER YA */
/* 0xF0 */ 0x2116, /* NUMERO SIGN */
/* 0xF1 */ 0x0451, /* CYRILLIC SMALL LETTER IO */
/* 0xF2 */ 0x0452, /* CYRILLIC SMALL LETTER DJE */
/* 0xF3 */ 0x0453, /* CYRILLIC SMALL LETTER GJE */
/* 0xF4 */ 0x0454, /* CYRILLIC SMALL LETTER UKRAINIAN IE */
/* 0xF5 */ 0x0455, /* CYRILLIC SMALL LETTER DZE */
/* 0xF6 */ 0x0456, /* CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I */
/* 0xF7 */ 0x0457, /* CYRILLIC SMALL LETTER YI */
/* 0xF8 */ 0x0458, /* CYRILLIC SMALL LETTER JE */
/* 0xF9 */ 0x0459, /* CYRILLIC SMALL LETTER LJE */
/* 0xFA */ 0x045A, /* CYRILLIC SMALL LETTER NJE */
/* 0xFB */ 0x045B, /* CYRILLIC SMALL LETTER TSHE */
/* 0xFC */ 0x045C, /* CYRILLIC SMALL LETTER KJE */
/* 0xFD */ 0x00A7, /* SECTION SIGN */
/* 0xFE */ 0x045E, /* CYRILLIC SMALL LETTER SHORT U */
/* 0xFF */ 0x045F, /* CYRILLIC SMALL LETTER DZHE */
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The 8859_5 codepage has no table of its own: table_8859_5 simply expands
 * to table_NULL, which is defined elsewhere (presumably the shared "no extra
 * mappings" table -- confirm against its definition). */
#define table_8859_5 table_NULL
/* NULL-terminated list of the names under which the 8859_5 codepage is
 * known.  The trailing NULL is the only end marker; no length is stored. */
unsigned char *const aliases_8859_5 [] = {
"ISO-8859-5",
"iso8859-5",
"8859-5",
"ISO_8859-5:1988",
"iso-ir-144",
"ISO_8859-5",
"cyrillic",
"csISOLatinCyrillic",
"ISO8859_5",
NULL
};
/*** 8859_6 ***/
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* Unicode code points for the upper half of the 8859_6 codepage.  There is
 * one entry per byte 0x80..0xFF, in ascending order, so the entry for byte B
 * sits at index B - 0x80 (128 entries in total).  On each line the leading
 * comment is the codepage byte and the trailing comment is the name of the
 * Unicode character it maps to.
 *
 * Entries holding 0xFFFF carry no character name.  NOTE(review): 0xFFFF
 * presumably marks a byte that this codepage leaves unassigned -- confirm
 * against the code that reads this table. */
const uint16_t highhalf_8859_6 [] = {
/* 0x80 */ 0x0080, /* <control> */
/* 0x81 */ 0x0081, /* <control> */
/* 0x82 */ 0x0082, /* <control> */
/* 0x83 */ 0x0083, /* <control> */
/* 0x84 */ 0x0084, /* <control> */
/* 0x85 */ 0x0085, /* <control> */
/* 0x86 */ 0x0086, /* <control> */
/* 0x87 */ 0x0087, /* <control> */
/* 0x88 */ 0x0088, /* <control> */
/* 0x89 */ 0x0089, /* <control> */
/* 0x8A */ 0x008A, /* <control> */
/* 0x8B */ 0x008B, /* <control> */
/* 0x8C */ 0x008C, /* <control> */
/* 0x8D */ 0x008D, /* <control> */
/* 0x8E */ 0x008E, /* <control> */
/* 0x8F */ 0x008F, /* <control> */
/* 0x90 */ 0x0090, /* <control> */
/* 0x91 */ 0x0091, /* <control> */
/* 0x92 */ 0x0092, /* <control> */
/* 0x93 */ 0x0093, /* <control> */
/* 0x94 */ 0x0094, /* <control> */
/* 0x95 */ 0x0095, /* <control> */
/* 0x96 */ 0x0096, /* <control> */
/* 0x97 */ 0x0097, /* <control> */
/* 0x98 */ 0x0098, /* <control> */
/* 0x99 */ 0x0099, /* <control> */
/* 0x9A */ 0x009A, /* <control> */
/* 0x9B */ 0x009B, /* <control> */
/* 0x9C */ 0x009C, /* <control> */
/* 0x9D */ 0x009D, /* <control> */
/* 0x9E */ 0x009E, /* <control> */
/* 0x9F */ 0x009F, /* <control> */
/* 0xA0 */ 0x00A0, /* NO-BREAK SPACE */
/* 0xA1 */ 0xFFFF,
/* 0xA2 */ 0xFFFF,
/* 0xA3 */ 0xFFFF,
/* 0xA4 */ 0x00A4, /* CURRENCY SIGN */
/* 0xA5 */ 0xFFFF,
/* 0xA6 */ 0xFFFF,
/* 0xA7 */ 0xFFFF,
/* 0xA8 */ 0xFFFF,
/* 0xA9 */ 0xFFFF,
/* 0xAA */ 0xFFFF,
/* 0xAB */ 0xFFFF,
/* 0xAC */ 0x060C, /* ARABIC COMMA */
/* 0xAD */ 0x00AD, /* SOFT HYPHEN */
/* 0xAE */ 0xFFFF,
/* 0xAF */ 0xFFFF,
/* 0xB0 */ 0xFFFF,
/* 0xB1 */ 0xFFFF,
/* 0xB2 */ 0xFFFF,
/* 0xB3 */ 0xFFFF,
/* 0xB4 */ 0xFFFF,
/* 0xB5 */ 0xFFFF,
/* 0xB6 */ 0xFFFF,
/* 0xB7 */ 0xFFFF,
/* 0xB8 */ 0xFFFF,
/* 0xB9 */ 0xFFFF,
/* 0xBA */ 0xFFFF,
/* 0xBB */ 0x061B, /* ARABIC SEMICOLON */
/* 0xBC */ 0xFFFF,
/* 0xBD */ 0xFFFF,
/* 0xBE */ 0xFFFF,
/* 0xBF */ 0x061F, /* ARABIC QUESTION MARK */
/* 0xC0 */ 0xFFFF,
/* 0xC1 */ 0x0621, /* ARABIC LETTER HAMZA */
/* 0xC2 */ 0x0622, /* ARABIC LETTER ALEF WITH MADDA ABOVE */
/* 0xC3 */ 0x0623, /* ARABIC LETTER ALEF WITH HAMZA ABOVE */
/* 0xC4 */ 0x0624, /* ARABIC LETTER WAW WITH HAMZA ABOVE */
/* 0xC5 */ 0x0625, /* ARABIC LETTER ALEF WITH HAMZA BELOW */
/* 0xC6 */ 0x0626, /* ARABIC LETTER YEH WITH HAMZA ABOVE */
/* 0xC7 */ 0x0627, /* ARABIC LETTER ALEF */
/* 0xC8 */ 0x0628, /* ARABIC LETTER BEH */
/* 0xC9 */ 0x0629, /* ARABIC LETTER TEH MARBUTA */
/* 0xCA */ 0x062A, /* ARABIC LETTER TEH */
/* 0xCB */ 0x062B, /* ARABIC LETTER THEH */
/* 0xCC */ 0x062C, /* ARABIC LETTER JEEM */
/* 0xCD */ 0x062D, /* ARABIC LETTER HAH */
/* 0xCE */ 0x062E, /* ARABIC LETTER KHAH */
/* 0xCF */ 0x062F, /* ARABIC LETTER DAL */
/* 0xD0 */ 0x0630, /* ARABIC LETTER THAL */
/* 0xD1 */ 0x0631, /* ARABIC LETTER REH */
/* 0xD2 */ 0x0632, /* ARABIC LETTER ZAIN */
/* 0xD3 */ 0x0633, /* ARABIC LETTER SEEN */
/* 0xD4 */ 0x0634, /* ARABIC LETTER SHEEN */
/* 0xD5 */ 0x0635, /* ARABIC LETTER SAD */
/* 0xD6 */ 0x0636, /* ARABIC LETTER DAD */
/* 0xD7 */ 0x0637, /* ARABIC LETTER TAH */
/* 0xD8 */ 0x0638, /* ARABIC LETTER ZAH */
/* 0xD9 */ 0x0639, /* ARABIC LETTER AIN */
/* 0xDA */ 0x063A, /* ARABIC LETTER GHAIN */
/* 0xDB */ 0xFFFF,
/* 0xDC */ 0xFFFF,
/* 0xDD */ 0xFFFF,
/* 0xDE */ 0xFFFF,
/* 0xDF */ 0xFFFF,
/* 0xE0 */ 0x0640, /* ARABIC TATWEEL */
/* 0xE1 */ 0x0641, /* ARABIC LETTER FEH */
/* 0xE2 */ 0x0642, /* ARABIC LETTER QAF */
/* 0xE3 */ 0x0643, /* ARABIC LETTER KAF */
/* 0xE4 */ 0x0644, /* ARABIC LETTER LAM */
/* 0xE5 */ 0x0645, /* ARABIC LETTER MEEM */
/* 0xE6 */ 0x0646, /* ARABIC LETTER NOON */
/* 0xE7 */ 0x0647, /* ARABIC LETTER HEH */
/* 0xE8 */ 0x0648, /* ARABIC LETTER WAW */
/* 0xE9 */ 0x0649, /* ARABIC LETTER ALEF MAKSURA */
/* 0xEA */ 0x064A, /* ARABIC LETTER YEH */
/* 0xEB */ 0x064B, /* ARABIC FATHATAN */
/* 0xEC */ 0x064C, /* ARABIC DAMMATAN */
/* 0xED */ 0x064D, /* ARABIC KASRATAN */
/* 0xEE */ 0x064E, /* ARABIC FATHA */
/* 0xEF */ 0x064F, /* ARABIC DAMMA */
/* 0xF0 */ 0x0650, /* ARABIC KASRA */
/* 0xF1 */ 0x0651, /* ARABIC SHADDA */
/* 0xF2 */ 0x0652, /* ARABIC SUKUN */
/* 0xF3 */ 0xFFFF,
/* 0xF4 */ 0xFFFF,
/* 0xF5 */ 0xFFFF,
/* 0xF6 */ 0xFFFF,
/* 0xF7 */ 0xFFFF,
/* 0xF8 */ 0xFFFF,
/* 0xF9 */ 0xFFFF,
/* 0xFA */ 0xFFFF,
/* 0xFB */ 0xFFFF,
/* 0xFC */ 0xFFFF,
/* 0xFD */ 0xFFFF,
/* 0xFE */ 0xFFFF,
/* 0xFF */ 0xFFFF,
};
Bug 381: Store codepage-to-Unicode mappings as dense arrays. Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those. This change also makes cp2u() operate in O(1) time and may speed up other functions as well. The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here. Before: text data bss dec hex filename 38948 28528 3311 70787 11483 src/intl/charsets.o 500096 85568 82112 667776 a3080 src/elinks After: text data bss dec hex filename 31558 28528 3311 63397 f7a5 src/intl/charsets.o 492878 85568 82112 660558 a144e src/elinks So the text section shrank by 7390 bytes. Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2006-09-24 09:55:29 -04:00
/* The 8859_6 codepage has no table of its own: table_8859_6 simply expands
 * to table_NULL, which is defined elsewhere (presumably the shared "no extra
 * mappings" table -- confirm against its definition). */
#define table_8859_6 table_NULL
/* NULL-terminated list of the names under which the 8859_6 codepage is
 * known.  The trailing NULL is the only end marker; no length is stored. */
unsigned char *const aliases_8859_6 [] = {
"ISO-8859-6",
"8859-6",
"ISO_8859-6",
"ISO_8859-6:1987",
"ISO-IR-127",
"ECMA-114",
"ASMO-708",
"ARABIC",
"csISOLatinArabic",
"ISO8859-6",
"ISO8859_6",
NULL
};
/*** 8859_7 ***/