1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-12-04 14:46:47 -05:00
elinks/Unicode/gen-cp
Kalle Olavi Niemitalo c9ca6fd448 Refresh charsets from www.unicode.org.
Add copyright and licence notices, and a NEWS entry.

The data in the new versions is not entirely the same as what ELinks
used to have:

- Unicode/8859_1.cp: Adds control characters.
- Unicode/8859_2.cp: Adds control characters.
- Unicode/8859_4.cp: Adds some control characters that ELinks assumed
  there already.
- Unicode/8859_7.cp: Adds three characters.
- Unicode/8859_15.cp: Adds control characters.
- Unicode/8859_16.cp: Adds control characters and swaps 0xA5 with 0xAB.
- Unicode/koi8_r.cp: Changes 0x95 and adds some control characters
  that ELinks assumed there already.
- Unicode/macroman.cp: Changes 0xC6 and removes some control characters
  that ELinks assumes there anyway.
2008-10-11 15:35:09 +03:00

125 lines
3.0 KiB
Bash
Executable File
Raw Permalink Blame History

#!/bin/sh
#
# gen-cp: generate ../src/intl/codepage.inc from the code page files
# named in index.txt.
#
# Must be run from the directory that holds index.txt and the *.cp
# files, because every path below is relative.  For each NAME listed in
# index.txt the file NAME.cp is read as follows:
#   line 1  the code page name (becomes a C string literal)
#   line 2  the alias list, written as quoted strings separated by ", "
#   rest    mapping lines "0xBB 0xUUUU # comment"; blank and
#           comment-only lines are ignored
#
# If index.txt is missing or empty nothing is generated and the script
# still exits successfully.

# A literal tab and newline, held in variables so the generated
# whitespace never depends on invisible characters in this script.
tab=$(printf '\t')
nl='
'

# Print the mapping lines of code page file "$1" in canonical form:
# upper-case hex digits with a lower-case "0x" prefix, followed by the
# original comment rewritten as a C comment.  Only lines that map one
# byte in 0x80...0xFF to exactly one character are kept.
filter_mappings() {
  sed '
    # Delete the name and aliases lines.
    1,2d
    # Delete comment-only and blank lines.
    /^[[:blank:]]*\(#.*\)\{0,1\}$/d
    # Copy to the hold space.
    h
    # Delete everything except the comment.
    s/^[^#]*//
    # If there is a comment, change it to use /* */ delimiters.
    s!#[[:blank:]]*\(.*\)!/* \1 */!
    # Exchange spaces; now hold space = comment and pattern space = all.
    x
    # Delete the comment and any indentation.
    s/#.*//
    s/^[[:blank:]]*//
    # Canonicalize case so the strings can be used as lookup keys.
    y/Xabcdef/xABCDEF/
    # Delete mappings of bytes 0x00...0x7F.  ELinks assumes those match ASCII.
    /^0x[01234567]/d
    # Delete lines that do not map the byte to exactly one character.
    /^0x[0123456789ABCDEF]\{2\}[[:blank:]]\{1,\}0x[0123456789ABCDEF]\{1,\}[[:blank:]]*$/!d
    # Append a newline and the comment from the hold space.
    G
    # Delete the newline added by the previous command.
    s/\n//
  ' "$1"
}

# Read canonical mapping lines (see filter_mappings) on stdin and print
# the C definitions highhalf_$1 and table_$1.
#
# The first mapping seen for a byte fills its highhalf slot; any further
# mapping for the same byte is appended to the table.  When either
# structure would be empty, a #define to the shared *_NULL object is
# printed instead.  The body is a subshell so the 128 high0x.. slot
# variables never leak into the caller.
emit_tables() (
  cp_id=$1

  for left in 8 9 A B C D E F; do
    for right in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
      eval "high0x$left$right="
    done
  done

  table=
  highuse=
  while read -r byte unicode comment; do
    # $byte becomes part of a variable name inside eval, so accept
    # nothing but the exact keys initialised above.
    case $byte in
      0x[89ABCDEF][0123456789ABCDEF]) ;;
      *) continue ;;
    esac
    eval "seen=\$high$byte"
    if [ -n "$seen" ]; then
      table="$table${tab}{$byte, $unicode},${comment:+ }$comment$nl"
    else
      eval "high$byte=\"\$unicode,\${comment:+ }\$comment\""
      highuse=1
    fi
  done

  if [ -n "$highuse" ]; then
    printf 'const uint16_t highhalf_%s [] = {\n' "$cp_id"
    for left in 8 9 A B C D E F; do
      for right in 0 1 2 3 4 5 6 7 8 9 A B C D E F; do
        # Unmapped bytes are emitted as 0xFFFF.
        eval "entry=\${high0x$left$right:-0xFFFF,}"
        printf '\t/* %s */ %s\n' "0x$left$right" "$entry"
      done
    done
    printf '};\n\n'
  else
    printf '#define highhalf_%s highhalf_NULL\n\n' "$cp_id"
  fi

  if [ -n "$table" ]; then
    printf 'const struct table_entry table_%s [] = {\n%s\t{0, 0}\n};\n' \
      "$cp_id" "$table"
  else
    printf '#define table_%s table_NULL\n' "$cp_id"
  fi
  printf '\n'
)

# Print the aliases on line 2 of code page file "$1", one per line,
# each indented by a tab and followed by a comma.  Runs of spaces are
# collapsed and a trailing space is dropped before splitting at '", '.
emit_aliases() {
  awk '
    NR == 2 {
      gsub(/ +/, " ")
      sub(/ $/, "")
      gsub(/", /, "\",\n\t")
      printf "\t%s,\n", $0
      exit
    }
  ' "$1"
}

# Print the code page name: line 1 of "$1" with runs of blanks
# collapsed to a single space and leading/trailing blanks removed.
codepage_name() {
  sed -n '1{
    s/[[:blank:]]\{1,\}/ /g
    s/^ //
    s/ $//
    p
    q
  }' "$1"
}

# Print the complete contents of codepage.inc on stdout for the code
# pages listed in $codepages.  Each code page name is also echoed to
# stderr as a progress indicator.
generate() {
  n=0

  printf '%s\n' \
    '/* Automatically generated by gen-cp */' \
    '/* DO NOT EDIT THIS FILE! EDIT Unicode/<whatever> INSTEAD! */' \
    '/* See the input files for copyrights and licences. */' \
    ''

  # $codepages is unquoted on purpose: word splitting yields one code
  # page name per iteration.
  for cp in $codepages; do
    printf '%s ' "$cp" >&2

    printf '\n\n/*** %s ***/\n\n' "$cp"
    filter_mappings "$cp.cp" | emit_tables "$cp"

    printf 'unsigned char *const aliases_%s [] = {\n' "$cp"
    emit_aliases "$cp.cp"
    printf '\tNULL\n};\n'

    n=$((n + 1))
  done

  # Shared fallbacks referenced by the #define lines of emit_tables:
  # 16 rows of 8 entries cover all 128 high bytes.
  printf '\n/*** NULL ***/\n\n'
  printf 'const uint16_t highhalf_NULL [] = {\n'
  row=0
  while [ "$row" -lt 16 ]; do
    printf '\t0xFFFF,0xFFFF,0xFFFF,0xFFFF, 0xFFFF,0xFFFF,0xFFFF,0xFFFF,\n'
    row=$((row + 1))
  done
  printf '};\n\n'
  printf 'const struct table_entry table_NULL [] = {\n'
  printf '\t{0, 0}\n'
  printf '};\n'

  printf '\nconst struct codepage_desc codepages [] = {\n'
  for cp in $codepages; do
    printf '\t{"%s", aliases_%s, highhalf_%s, table_%s},\n' \
      "$(codepage_name "$cp.cp")" "$cp" "$cp" "$cp"
  done
  printf '\t{NULL, NULL, NULL}\n'
  printf '};\n\n'
  printf '#define N_CODEPAGES %s\n' "$n"
}

echo
echo Generating code page translation tables.

codepages=$(cat index.txt)

if [ -n "$codepages" ]; then
  # Strip carriage returns so input files with CRLF line endings do not
  # leak CRs into the generated C source.
  generate | tr -d '\r' > ../src/intl/codepage.inc
  echo
  echo Done.
fi
echo