Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode character was stored as a struct table_entry, which listed both the byte and the character. This representation may be optimal for sparse mappings, but codepages map almost every possible byte to a character, so it is more efficient to just have an array that lists the Unicode character corresponding to each byte from 0x80 to 0xFF. The bytes are not stored but rather implied by the array index. The tcvn5712 and viscii codepages have a total of four mappings that do not fit in the arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather unhealthy. It would probably be faster and more readable if rewritten in Perl, but IMO that goes for the previous version as well, so I suppose whoever wrote it had a reason not to use Perl here.

Before:

   text    data     bss     dec     hex filename
  38948   28528    3311   70787   11483 src/intl/charsets.o
 500096   85568   82112  667776   a3080 src/elinks

After:

   text    data     bss     dec     hex filename
  31558   28528    3311   63397    f7a5 src/intl/charsets.o
 492878   85568   82112  660558   a144e src/elinks

So the text section of src/intl/charsets.o shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls --disable-cookies --disable-formhist --disable-globhist --disable-mailcap --disable-mimetypes --disable-smb --disable-mouse --disable-sysmouse --disable-leds --disable-marks --disable-css --enable-small --enable-utf-8 --without-gpm --without-bzlib --without-idn --without-spidermonkey --without-lua --without-gnutls --without-openssl CFLAGS="-Os -ggdb -Wall"
2025-06-30 22:19:29 -04:00 · 2006-09-24 16:55:29 +03:00 · 2006-09-24 16:55:29 +03:00 · 4a5af7fd26
commit 4a5af7fd26
parent 0e88f8ba28
3 changed files with 4494 additions and 4139 deletions
--- a/Unicode/gen-cp
+++ b/Unicode/gen-cp
@ -23,19 +23,52 @@ for i in $codepages; do
 	echo "/*** $i ***/"
 	echo

-	echo 'const struct table_entry table_'$i' [] = {'
-
-	# TODO: Comments inside of the structure are ugliness in a pure clean
-	# form, and my aesthetical feeling shivers upon glancing at it. However
-	# we should handle commentless records. A loop with read inside would
-	# be ideal, I suppose. --pasky
-	tail -n +3 $i.cp | sed 's/# *\(.*\) *$/\/* \1 *\/ /' | grep '^0x[89a-zA-Z]' \
-	| sed 's/[ 	][ 	]*/ /g' | sed 's/[ 	]*$/ },/' | sed 's/ /, /' \
-	| sed 's/^[ 	]*/	{/' | grep '.*,.*,'
-
-	echo '	{0, 0}'
-	echo '};'
-	echo
+	sed '1,2d
+		/^[	 ]*\(#.*\)\{,1\}$/d
+		h
+		s/^[^#]*//
+		s!#[	 ]*\(.*\)!/* \1 */!
+		x
+		s/#.*//
+		y/Xabcdef/xABCDEF/
+		/^0x[01234567]/d
+		/[^0x0123456789ABCDEF	 ]/d
+		G
+		s/\n//' "$i.cp" | {
+		for left in 8 9 A B C D E F; do
+			for right in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
+				eval "high0x$left$right="
+			done
+		done
+		table=
+		highuse=
+	    	while read byte unicode comment; do
+			if eval "[ \"\$high$byte\" ]"; then
+				table="$table	{$byte, $unicode},${comment+ }$comment
+"
+			else
+				eval "high$byte=\"\$unicode,\${comment+ }\$comment\""
+				highuse=1
+			fi
+		done
+		if [ "$highuse" ]; then
+			printf "const uint16_t highhalf_%s [] = {\n" "$i"
+			for left in 8 9 A B C D E F; do
+				for right in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
+					eval "printf \"\\t/* %s */ %s\\n\" \"0x$left$right\" \"\${high0x$left$right:-0xFFFF,}\""
+				done
+			done
+			printf "};\n\n"
+		else
+			printf "#define highhalf_%s highhalf_NULL\n\n" "$i"
+		fi
+		if [ "$table" ]; then
+			printf "const struct table_entry table_%s [] = {\n%s\t{0, 0}\n};\n" "$i" "$table"
+		else
+			printf "#define table_%s table_NULL\n" "$i"
+		fi
+		printf "\n"
+	}

 	echo 'unsigned char *const aliases_'$i' [] = {'
 	head -n 2 $i.cp | tail -n +2 | sed 's/ \+/ /g; s/ $//; s/\", /\",£/g; s/$/,/' | tr "£" "\n" \
@ -45,11 +78,21 @@ for i in $codepages; do
 	n=`expr $n + 1`
 done

+printf "\n/*** NULL ***/\n\n"
+printf "const uint16_t highhalf_NULL [] = {\n"
+for r in `seq 16`; do
+	printf "\t0xFFFF,0xFFFF,0xFFFF,0xFFFF, 0xFFFF,0xFFFF,0xFFFF,0xFFFF,\n"
+done
+printf "};\n\n"
+printf "const struct table_entry table_NULL [] = {\n"
+printf "\t{0, 0}\n"
+printf "};\n"
+
 echo
 echo 'const struct codepage_desc codepages [] = {'

 for i in $codepages; do
-	echo '	{"'`head -n 1 $i.cp`'", aliases_'$i', table_'$i'},'
+	echo '	{"'`head -n 1 $i.cp`'", aliases_'$i', highhalf_'$i', table_'$i'},'
 done

 echo '	{NULL, NULL, NULL}'
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@ -43,6 +43,21 @@ struct table_entry {
 struct codepage_desc {
 	unsigned char *name;
 	unsigned char *const *aliases;
+ 
+ 	/* The Unicode mappings of codepage bytes 0x80...0xFF.
+ 	 * (0x00...0x7F are assumed to be ASCII in all codepages.)
+ 	 * Because all current values fit in 16 bits, we store them as
+ 	 * uint16_t rather than unicode_val_T.  If the codepage does
+ 	 * not use some byte, then @highhalf maps that byte to 0xFFFF,
+ 	 * which C code converts to UCS_REPLACEMENT_CHARACTER where
+ 	 * appropriate.  (U+FFFF is reserved and will never be
+ 	 * assigned as a character.)  */
+	const uint16_t *highhalf;
+ 
+ 	/* If some byte in the codepage corresponds to multiple Unicode
+ 	 * characters, then the preferred character is in @highhalf
+ 	 * above, and the rest are listed here in @table.  This table
+ 	 * is not used for translating from the codepage to Unicode.  */
 	const struct table_entry *table;
 };

@ -142,7 +157,7 @@ static const unicode_val_T strange_chars[32] = {
 };

 #define SYSTEM_CHARSET_FLAG 128
-#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->table == table_utf8)
+#define is_cp_ptr_utf8(cp_ptr) ((cp_ptr)->aliases == aliases_utf8)

 unsigned char *
 u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
@ -170,7 +185,10 @@ u2cp_(unicode_val_T u, int to, int no_nbsp_hack)
 		return u2cp_(strange, to, no_nbsp_hack);
 	}

-
+	if (u < 0xFFFF)
+		for (j = 0; j < 0x80; j++)
+			if (codepages[to].highhalf[j] == u)
+				return strings[0x80 + j];
 	for (j = 0; codepages[to].table[j].c; j++)
 		if (codepages[to].table[j].u == u)
 			return strings[codepages[to].table[j].c];
@ -631,20 +649,17 @@ utf8_to_unicode(unsigned char **string, unsigned char *end)
 }
 #endif /* CONFIG_UTF8 */

-/* Slow algorithm, the common part of cp2u and cp2utf8.  */
+/* The common part of cp2u and cp2utf_8.  */
 static unicode_val_T
 cp2u_shared(const struct codepage_desc *from, unsigned char c)
 {
-	int j;
-
-	for (j = 0; from->table[j].c; j++)
-		if (from->table[j].c == c)
-			return from->table[j].u;
-
-	return UCS_REPLACEMENT_CHARACTER;
+	unicode_val_T u = from->highhalf[c - 0x80];
+	
+	if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
+	return u;
 }

-/* Slow algorithm, used for converting input from the terminal.  */
+/* Used for converting input from the terminal.  */
 unicode_val_T
 cp2u(int from, unsigned char c)
 {
@ -757,8 +772,14 @@ get_translation_table_to_utf8(int from)
 		return utf_table;
 	}

-	for (i = 128; i < 256; i++)
-		utf_table[i].u.str = NULL;
+	for (i = 128; i < 256; i++) {
+		unicode_val_T u = codepages[from].highhalf[i - 0x80];
+
+		if (u == 0xFFFF)
+			utf_table[i].u.str = NULL;
+		else
+			utf_table[i].u.str = stracpy(encode_utf8(u));
+	}

 	for (i = 0; codepages[from].table[i].c; i++) {
 		unicode_val_T u = codepages[from].table[i].u;
@ -815,6 +836,12 @@ get_translation_table(int from, int to)
 	if (is_cp_ptr_utf8(&codepages[from])) {
 		int i;

+		for (i = 0x80; i <= 0xFF; i++)
+			if (codepages[to].highhalf[i - 0x80] != 0xFFFF)
+				add_utf8(table,
+					 codepages[to].highhalf[i - 0x80],
+					 strings[i]);
+
 		for (i = 0; codepages[to].table[i].c; i++)
 			add_utf8(table, codepages[to].table[i].u,
 				 strings[codepages[to].table[i].c]);
@ -828,16 +855,11 @@ get_translation_table(int from, int to)
 		int i;

 		for (i = 128; i < 256; i++) {
-			int j;
+			if (codepages[from].highhalf[i - 0x80] != 0xFFFF) {
+				unsigned char *u;

-			for (j = 0; codepages[from].table[j].c; j++) {
-				if (codepages[from].table[j].c == i) {
-					unsigned char *u;
-
-					u = u2cp(codepages[from].table[j].u, to);
-					if (u) table[i].u.str = u;
-					break;
-				}
+				u = u2cp(codepages[from].highhalf[i - 0x80], to);
+				if (u) table[i].u.str = u;
 			}
 		}
 	}
--- a/src/intl/codepage.inc
+++ b/src/intl/codepage.inc