elinks/Unicode/gen-cp

#!/bin/sh
#
# gen-cp -- generate ../src/intl/codepage.inc from the code page
# description files in the current directory.
#
# Input:
#   index.txt  whitespace-separated list of code page identifiers.
#   <id>.cp    one file per identifier.  Line 1 is the code page name,
#              line 2 is the list of aliases (quoted strings separated
#              by ", "), and the remaining lines map one byte to one
#              Unicode character: "0xXX 0xYYYY # optional comment".
#
# Output:
#   ../src/intl/codepage.inc (C source).  Progress goes to stdout, the
#   name of each processed code page to stderr.
#
# Only POSIX sh, sed and tr features are used.

# Remove every carriage return from stdin, so that input files with
# CRLF line endings do not leak CRs into the generated C file.
strip_cr() {
	tr -d '\r'
}

# extract_mappings FILE
# Print the usable byte mappings of FILE, one per line, in the form
#   0xXX<blanks>0xYYYY<blanks>[/* comment */]
# with hexadecimal digits in upper case and the "0x" prefix in lower case.
extract_mappings() {
	sed '	# Delete the name and aliases lines.
		1,2d
		# Delete comment-only and blank lines.
		/^[	 ]*\(#.*\)\{0,1\}$/d
		# Copy to the hold space.
		h
		# Delete everything except the comment.
		s/^[^#]*//
		# If there is a comment, change it to use /* */ delimiters.
		s!#[	 ]*\(.*\)!/* \1 */!
		# Exchange spaces; now hold space = comment and pattern space = all.
		x
		# Delete the comment.
		s/#.*//
		# Canonicalize case so the strings can be used as lookup keys.
		y/Xabcdef/xABCDEF/
		# Delete mappings of bytes 0x00...0x7F.  ELinks assumes those match ASCII.
		/^[ 	]*0x[01234567]/d
		# Delete lines that do not map the byte to exactly one character.
		/^[ 	]*0x[0123456789ABCDEF]\{2\}[ 	]\{1,\}0x[0123456789ABCDEF]\{1,\}[ 	]*$/!d
		# Append a newline and the comment from the hold space.
		G
		# Delete the newline added by the previous command.
		s/\n//' "$1"
}

# emit_tables NAME
# Read mappings (as printed by extract_mappings) from stdin and print
# the C definitions highhalf_NAME and table_NAME.  The first mapping
# seen for a byte goes into the dense highhalf array; any further
# mapping of the same byte goes into the table_entry list.  When there
# is nothing to store, the name is #defined to the shared NULL table.
#
# The body is a subshell so that the 128 helper variables
# high0x80 ... high0xFF never leak into the caller.
emit_tables() (
	for left in 8 9 A B C D E F; do
		for right in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
			eval "high0x$left$right="
		done
	done
	table=
	highuse=
	while read -r byte unicode comment; do
		# $byte becomes part of a variable name passed to eval below,
		# so accept nothing but a canonical high-half byte.
		case $byte in
			0x[89ABCDEF][0123456789ABCDEF]) ;;
			*) continue ;;
		esac
		if eval "[ \"\$high$byte\" ]"; then
			table="$table	{$byte, $unicode},${comment:+ }$comment
"
		else
			eval "high$byte=\"\$unicode,\${comment:+ }\$comment\""
			highuse=1
		fi
	done
	if [ "$highuse" ]; then
		printf "const uint16_t highhalf_%s [] = {\n" "$1"
		for left in 8 9 A B C D E F; do
			for right in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
				# Bytes without a mapping are emitted as 0xFFFF.
				eval "printf \"\\t/* %s */ %s\\n\" \"0x$left$right\" \"\${high0x$left$right:-0xFFFF,}\""
			done
		done
		printf "};\n\n"
	else
		printf "#define highhalf_%s highhalf_NULL\n\n" "$1"
	fi
	if [ "$table" ]; then
		printf "const struct table_entry table_%s [] = {\n%s\t{0, 0}\n};\n" "$1" "$table"
	else
		printf "#define table_%s table_NULL\n" "$1"
	fi
	printf "\n"
)

# emit_aliases NAME FILE
# Print the C array aliases_NAME: every alias from line 2 of FILE on
# its own tab-indented line followed by a comma, then a NULL sentinel.
emit_aliases() {
	printf 'unsigned char *const aliases_%s [] = {\n' "$1"
	# Squeeze runs of spaces, drop a trailing space, skip an empty
	# alias line, then break the list after each '",' separator.
	sed -n '2{
		s/  */ /g
		s/ $//
		/^$/d
		s/", /",\
	/g
		s/^/	/
		s/$/,/
		p
		q
	}' "$2"
	printf '\tNULL\n};\n'
}

# codepage_name FILE
# Print line 1 of FILE with blank runs collapsed to one space and
# leading/trailing blanks removed.
codepage_name() {
	sed -n '1{
		s/[ 	][ 	]*/ /g
		s/^ //
		s/ $//
		p
		q
	}' "$1"
}

# Print the shared fallback tables: highhalf_NULL holds 16 rows of
# eight 0xFFFF entries, table_NULL holds only the {0, 0} terminator.
emit_null_tables() {
	printf "\n/*** NULL ***/\n\n"
	printf "const uint16_t highhalf_NULL [] = {\n"
	for row in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
		printf "\t0xFFFF,0xFFFF,0xFFFF,0xFFFF, 0xFFFF,0xFFFF,0xFFFF,0xFFFF,\n"
	done
	printf "};\n\n"
	printf "const struct table_entry table_NULL [] = {\n"
	printf "\t{0, 0}\n"
	printf "};\n"
}

# Print the complete contents of codepage.inc for the identifiers in
# $codepages.  $codepages is expanded unquoted on purpose: index.txt
# is a whitespace-separated list.
generate_all() {
	n=0

	echo '/* Automatically generated by gen-cp */'
	echo '/* DO NOT EDIT THIS FILE! EDIT Unicode/<whatever> INSTEAD! */'
	echo '/* See the input files for copyrights and licences. */'
	echo

	for i in $codepages; do
		# Progress indicator; stdout is the generated file.
		printf '%s ' "$i" >&2
		echo

		echo
		echo "/*** $i ***/"
		echo

		extract_mappings "$i.cp" | emit_tables "$i"

		emit_aliases "$i" "$i.cp"
		n=$((n + 1))
	done

	emit_null_tables

	echo
	echo 'const struct codepage_desc codepages [] = {'

	for i in $codepages; do
		printf '\t{"%s", aliases_%s, highhalf_%s, table_%s},\n' \
			"$(codepage_name "$i.cp")" "$i" "$i" "$i"
	done

	echo '	{NULL, NULL, NULL}'
	echo '};'

	echo
	printf '#define N_CODEPAGES %s\n' "$n"
}

echo
echo Generating code page translation tables.

codepages=$(cat index.txt)

if [ -n "$codepages" ]; then

	generate_all | strip_cr > ../src/intl/codepage.inc

	echo
	echo Done.

fi

echo
-												Initial commit of the HEAD branch of the ELinks CVS repository, as of
Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this
by grafting.

											
										
										
											2005-09-15 09:58:31 -04:00
+								#!/bin/sh
 								echo
 								echo Generating code page translation tables.
 								codepages=`cat index.txt`
 								if [ -n "$codepages" ]; then
 								(
 								n=0
 								echo '/* Automatically generated by gen-cp */'
 								echo '/* DO NOT EDIT THIS FILE! EDIT Unicode/<whatever> INSTEAD! */'
-												Refresh charsets from www.unicode.org.

Add copyright and licence notices, and a NEWS entry.

The data in the new versions is not entirely the same as what ELinks
used to have:

- Unicode/8859_1.cp: Adds control characters.
- Unicode/8859_2.cp: Adds control characters.
- Unicode/8859_4.cp: Adds some control characters that ELinks assumed
  there already.
- Unicode/8859_7.cp: Adds three characters.
- Unicode/8859_15.cp: Adds control characters.
- Unicode/8859_16.cp: Adds control characters and swaps 0xA5 with 0xAB.
- Unicode/koi8_r.cp: Changes 0x95 and adds some control characters
  that ELinks assumed there already.
- Unicode/macroman.cp: Changes 0xC6 and removes some control characters
  that ELinks assumes there anyway.

											
										
										
											2008-10-11 08:04:23 -04:00
+								echo '/* See the input files for copyrights and licences. */'
-												Initial commit of the HEAD branch of the ELinks CVS repository, as of
Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this
by grafting.

											
										
										
											2005-09-15 09:58:31 -04:00
+								echo
 								for i in $codepages; do
 									echo -n $i' ' 1>&2
 									echo
 									echo
 									echo "/*** $i ***/"
 									echo
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+									sed '	# Delete the name and aliases lines.
 										1,2d
 										# Delete comment-only and blank lines.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										/^[	 ]*\(#.*\)\{,1\}$/d
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# Copy to the hold space.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										h
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# Delete everything except the comment.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										s/^[^#]*//
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# If there is a comment, change it to use /* */ delimiters.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										s!#[	 ]*\(.*\)!/* \1 */!
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# Exchange spaces; now hold space = comment and pattern space = all.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										x
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# Delete the comment.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										s/#.*//
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# Canonicalize case so the strings can be used as lookup keys.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										y/Xabcdef/xABCDEF/
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# Delete mappings of bytes 0x00...0x7F.  ELinks assumes those match ASCII.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										/^0x[01234567]/d
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# Delete lines that do not map the byte to exactly one character.
 										/^[ 	]*0x[0123456789ABCDEF]\{2\}[ 	]\{1,\}0x[0123456789ABCDEF]\{1,\}[ 	]*$/!d
 										# Append a newline and the comment from the hold space.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										G
-												cp1250, cp1257: Don't map undefined bytes to U+0000.

											
										
										
											2007-01-27 02:58:18 -05:00
+										# Delete the newline added by the previous command.
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+										s/\n//' "$i.cp" | {
 										for left in 8 9 A B C D E F; do
 											for right in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
 												eval "high0x$left$right="
 											done
 										done
 										table=
 										highuse=
 									    	while read byte unicode comment; do
 											if eval "[ \"\$high$byte\" ]"; then
 												table="$table	{$byte, $unicode},${comment+ }$comment
 								"
 											else
 												eval "high$byte=\"\$unicode,\${comment+ }\$comment\""
 												highuse=1
 											fi
 										done
 										if [ "$highuse" ]; then
 											printf "const uint16_t highhalf_%s [] = {\n" "$i"
 											for left in 8 9 A B C D E F; do
 												for right in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
 													eval "printf \"\\t/* %s */ %s\\n\" \"0x$left$right\" \"\${high0x$left$right:-0xFFFF,}\""
 												done
 											done
 											printf "};\n\n"
 										else
 											printf "#define highhalf_%s highhalf_NULL\n\n" "$i"
 										fi
 										if [ "$table" ]; then
 											printf "const struct table_entry table_%s [] = {\n%s\t{0, 0}\n};\n" "$i" "$table"
 										else
 											printf "#define table_%s table_NULL\n" "$i"
 										fi
 										printf "\n"
 									}
-												Initial commit of the HEAD branch of the ELinks CVS repository, as of
Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this
by grafting.

											
										
										
											2005-09-15 09:58:31 -04:00
-												Bug 381: Make codepage data const.

Before:

   text	   data	    bss	    dec	    hex	filename
  25726	  62992	   3343	  92061	  1679d	src/intl/charsets.o
 653856	 120020	  82144	 856020	  d0fd4	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  60190	  28528	   3311	  92029	  1677d	src/intl/charsets.o
 688320	  85556	  82112	 855988	  d0fb4	src/elinks

So 34464 bytes were moved from the data section to the text section
and should be more likely to get shared between ELinks processes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-O2 -ggdb -Wall"

											
										
										
											2006-09-24 04:59:23 -04:00
+									echo 'unsigned char *const aliases_'$i' [] = {'
-												Initial commit of the HEAD branch of the ELinks CVS repository, as of
Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this
by grafting.

											
										
										
											2005-09-15 09:58:31 -04:00
+									head -n 2 $i.cp | tail -n +2 | sed 's/ \+/ /g; s/ $//; s/\", /\",<2C>/g; s/$/,/' | tr "<22>" "\n" \
 									| sed 's/^/<2F>/g' | tr "<22>" "\t"
 									echo '	NULL
 								};'
 									n=`expr $n + 1`
 								done
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+								printf "\n/*** NULL ***/\n\n"
 								printf "const uint16_t highhalf_NULL [] = {\n"
 								for r in `seq 16`; do
 									printf "\t0xFFFF,0xFFFF,0xFFFF,0xFFFF, 0xFFFF,0xFFFF,0xFFFF,0xFFFF,\n"
 								done
 								printf "};\n\n"
 								printf "const struct table_entry table_NULL [] = {\n"
 								printf "\t{0, 0}\n"
 								printf "};\n"
-												Initial commit of the HEAD branch of the ELinks CVS repository, as of
Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this
by grafting.

											
										
										
											2005-09-15 09:58:31 -04:00
+								echo
-												Bug 381: Make codepage data const.

Before:

   text	   data	    bss	    dec	    hex	filename
  25726	  62992	   3343	  92061	  1679d	src/intl/charsets.o
 653856	 120020	  82144	 856020	  d0fd4	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  60190	  28528	   3311	  92029	  1677d	src/intl/charsets.o
 688320	  85556	  82112	 855988	  d0fb4	src/elinks

So 34464 bytes were moved from the data section to the text section
and should be more likely to get shared between ELinks processes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-O2 -ggdb -Wall"

											
										
										
											2006-09-24 04:59:23 -04:00
+								echo 'const struct codepage_desc codepages [] = {'
-												Initial commit of the HEAD branch of the ELinks CVS repository, as of
Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this
by grafting.

											
										
										
											2005-09-15 09:58:31 -04:00
 								for i in $codepages; do
-												Bug 381: Store codepage-to-Unicode mappings as dense arrays.

Previously, each mapping between a codepage byte and a Unicode
character was stored as a struct table_entry, which listed both the
byte and the character.  This representation may be optimal for sparse
mappings, but codepages map almost every possible byte to a character,
so it is more efficient to just have an array that lists the Unicode
character corresponding to each byte from 0x80 to 0xFF.  The bytes are
not stored but rather implied by the array index.  The tcvn5712 and
viscii codepages have a total of four mappings that do not fit in the
arrays, so we still use struct table_entry for those.

This change also makes cp2u() operate in O(1) time and may speed up
other functions as well.

The "sed | while read" concoction in Unicode/gen-cp looks rather
unhealthy.  It would probably be faster and more readable if rewritten
in Perl, but IMO that goes for the previous version as well, so I
suppose whoever wrote it had a reason not to use Perl here.

Before:

   text	   data	    bss	    dec	    hex	filename
  38948	  28528	   3311	  70787	  11483	src/intl/charsets.o
 500096	  85568	  82112	 667776	  a3080	src/elinks

After:

   text	   data	    bss	    dec	    hex	filename
  31558	  28528	   3311	  63397	   f7a5	src/intl/charsets.o
 492878	  85568	  82112	 660558	  a144e	src/elinks

So the text section shrank by 7390 bytes.

Measured on i686-pc-linux-gnu with: --disable-xbel --disable-nls
--disable-cookies --disable-formhist --disable-globhist
--disable-mailcap --disable-mimetypes --disable-smb --disable-mouse
--disable-sysmouse --disable-leds --disable-marks --disable-css
--enable-small --enable-utf-8 --without-gpm --without-bzlib
--without-idn --without-spidermonkey --without-lua --without-gnutls
--without-openssl CFLAGS="-Os -ggdb -Wall"

											
										
										
											2006-09-24 09:55:29 -04:00
+									echo '	{"'`head -n 1 $i.cp`'", aliases_'$i', highhalf_'$i', table_'$i'},'
-												Initial commit of the HEAD branch of the ELinks CVS repository, as of
Thu Sep 15 15:57:07 CEST 2005. The previous history can be added to this
by grafting.

											
										
										
											2005-09-15 09:58:31 -04:00
+								done
 								echo '	{NULL, NULL, NULL}'
 								echo '};'
 								echo
 								echo '#define N_CODEPAGES '$n | sed 's/
//g'
 								) | sed 's/
//g' > ../src/intl/codepage.inc
 								echo
 								echo Done.
 								fi
 								echo