From f7fd49cf28ce8757e5f3db8db1089f06ce86594a Mon Sep 17 00:00:00 2001
From: Kalle Olavi Niemitalo
Date: Sat, 5 Aug 2006 19:45:53 +0300
Subject: [PATCH] UTF-8: New function unicode_fold_label_case and a related script.

---
Notes on this copy (text between "---" and the first "diff --git" line is
commentary and is not part of the change):

 * The line structure was rebuilt from a whitespace-collapsed copy.  The
   rebuilt hunks contain exactly the number of lines announced by their
   "@@" headers and by the diffstat (137 + 24 + 1 = 162 insertions).
 * Indentation inside the lines is reconstructed with tabs; the exact
   original whitespace (including the position of blank context lines)
   could not be recovered and is inferred from the hunk line counts.
 * Bracketed text that had been stripped is restored where the context
   determines it: POD B<gen-case>, F<CaseFolding.txt>, and
   #include <wctype.h> (guarded by HAVE_WCTYPE_H, needed for towlower).
   I<simple case folding> is inferred from the unicode_simple_case_fold
   example and from the C and S statuses that the script keeps.
 * NOT restored, because this patch does not contain the information:
   the author's e-mail address (From: header and POD AUTHOR section) and
   the header names on the two context "#include" lines of the first
   src/intl/charsets.c hunk.  Fill them in from the target tree before
   applying with git-am or git-apply.

 Unicode/gen-case    | 137 ++++++++++++++++++++++++++++++++++++++++++++
 src/intl/charsets.c |  24 ++++++++
 src/intl/charsets.h |   1 +
 3 files changed, 162 insertions(+)
 create mode 100755 Unicode/gen-case

diff --git a/Unicode/gen-case b/Unicode/gen-case
new file mode 100755
index 00000000..e67037b6
--- /dev/null
+++ b/Unicode/gen-case
@@ -0,0 +1,137 @@
+#! /usr/bin/perl
+use strict;
+use warnings;
+
+my @trans;
+
+print "\t/* -*- c -*- source code generated by ", join(" ", $0, @ARGV), " */\n";
+while (<>) {
+	s/#.*$//;
+	next if /^\s*$/;
+	my($code, $status, $mapping) = /^([[:xdigit:]]+);\s*([CFST]);\s*([[:xdigit:]]+(?:\s+[[:xdigit:]]+)*);\s*$/
+		or warn("$ARGV:$.: weird line\n"), next;
+	next unless $status eq "C" or $status eq "S";
+	warn("$ARGV:$.: multi-char simple mapping\n"), next
+		if $mapping =~ /\s/;
+	$code = hex($code);
+	$mapping = hex($mapping);
+	$trans[$code] = $mapping;
+} continue {
+	close ARGV if eof;
+}
+
+sub gobble {
+	my($begin, $step) = @_;
+	my $diff = $trans[$begin] - $begin;
+	my @codes;
+	my @holes;
+	my $probe = $begin;
+	my $hole;
+	while (1) {
+		my @beyond;
+		while (defined($trans[$probe]) && $trans[$probe] == $probe + $diff) {
+			push @beyond, $probe;
+			$probe += $step;
+		}
+		last unless @beyond >= 2;
+		push @holes, $hole if defined $hole;
+		push @codes, @beyond;
+		$hole = $probe;
+		$probe += $step;
+	}
+	return 0 unless @codes;
+
+	# The following formula was tuned for i486-linux-gnu-gcc-4.0 -O1.
+	if (@codes <= 2 + @holes) {
+		print "if (", join(" || ", map { sprintf("c == 0x%X", $_) } @codes), ")\n";
+	} else {
+		printf "if (c >= 0x%X && c <= 0x%X", $codes[0], $codes[-1];
+		printf " && c != 0x%X", $_ foreach @holes;
+		if ($step == 2) { printf " && (c & 1) == %d", $begin & 1 }
+		elsif ($step != 1) { printf " && c %% %d == %d", $step, $begin % $step }
+		print ")\n";
+	}
+	if ($diff != 0) {
+		if ($diff < 0) { printf "\t\tc -= "; $diff = -$diff }
+		else { printf "\t\tc += " }
+		if ($diff < 10) { printf "%d", $diff }
+		else { printf "0x%X", $diff }
+	}
+	print ";\n";
+
+	undef $trans[$_] foreach @codes;
+	return 1;
+}
+
+my $first = 1;
+for (my $code = 0; $code <= $#trans; ++$code) {
+	next unless defined $trans[$code];
+
+	print $first ? "\t" : "\telse ";
+	gobble($code, 1) or gobble($code, 2) or gobble($code, 3) or gobble($code, 4)
+		or printf "if (c == 0x%X)\n\t\tc = 0x%X;\n", $code, $trans[$code];
+	$first = 0;
+}
+close STDOUT or die "$0: -: $!\n";
+
+__END__
+
+=head1 NAME
+
+gen-case - Generate C source code for folding the case of a Unicode character.
+
+=head1 SYNOPSIS
+
+B<gen-case> CaseFolding.txt > ../src/intl/casefold.inc
+
+=head1 DESCRIPTION
+
+B<gen-case> reads F<CaseFolding.txt> of the Unicode Character Database
+and generates C source code that implements the I<simple case folding>
+as defined in that file.
+
+The generated source code can then be used like this:
+
+	unicode_val_T
+	unicode_simple_case_fold(unicode_val_T c)
+	{
+	#include "casefold.inc"
+		return c;
+	}
+
+=head1 BUGS
+
+Does not support B<--help> nor B<--version>.
+
+=head1 AUTHOR
+
+Kalle Olavi Niemitalo
+
+=head1 COPYRIGHT AND LICENSE
+
+Copyright (c) 2006 Kalle Olavi Niemitalo.
+
+This program is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself. In addition:
+
+=over
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
+
+=back
diff --git a/src/intl/charsets.c b/src/intl/charsets.c
index 31905ba2..dc8d25e0 100644
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@@ -10,6 +10,9 @@
 
 #include
 #include
+#if HAVE_WCTYPE_H
+#include <wctype.h>
+#endif
 
 #include "elinks.h"
 
@@ -404,6 +407,27 @@ unicode_to_cell(unicode_val_T c)
 	return 1;
 }
 
+/* Fold the case of a Unicode character, so that hotkeys in labels can
+ * be compared case-insensitively. This should be called only if
+ * check_kbd_label_key(c) is true. It is unspecified whether the
+ * result will be in upper or lower case. */
+unicode_val_T
+unicode_fold_label_case(unicode_val_T c)
+{
+#if __STDC_ISO_10646__ && HAVE_WCTYPE_H
+	return towlower(c);
+#else /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
+	/* For now, this supports only ASCII. It would be possible to
+	 * use code generated from CaseFolding.txt of Unicode if the
+	 * acknowledgements required by http://www.unicode.org/copyright.html
+	 * were added to associated documentation of ELinks. */
+	if (c >= 0x41 && c <= 0x5A)
+		return c + 0x20;
+	else
+		return c;
+#endif /* !(__STDC_ISO_10646__ && HAVE_WCTYPE_H) */
+}
+
 inline unicode_val_T
 utf_8_to_unicode(unsigned char **string, unsigned char *end)
 {
diff --git a/src/intl/charsets.h b/src/intl/charsets.h
index 8d11707d..ae8fe97f 100644
--- a/src/intl/charsets.h
+++ b/src/intl/charsets.h
@@ -62,6 +62,7 @@ int utf8_ptr2cells(unsigned char *, unsigned char *);
 int utf8_ptr2chars(unsigned char *, unsigned char *);
 int utf8_cells2bytes(unsigned char *, int, unsigned char *);
 inline int unicode_to_cell(unicode_val_T);
+unicode_val_T unicode_fold_label_case(unicode_val_T);
 inline int strlen_utf8(unsigned char **);
 inline unicode_val_T utf_8_to_unicode(unsigned char **, unsigned char *);
 unicode_val_T cp2u(int, unsigned char);