diff --git a/src/document/html/parser/parse.c b/src/document/html/parser/parse.c index fcf343213..6cf2f8e32 100644 --- a/src/document/html/parser/parse.c +++ b/src/document/html/parser/parse.c @@ -151,7 +151,7 @@ next_attr: n = name; name_start = e; - while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++; + while (atchr(*n) && atchr(*e) && c_toupper(*e) == c_toupper(*n)) e++, n++; found = !*n && !atchr(*e); if (found && (flags & HTML_ATTR_TEST)) return name_start; @@ -504,8 +504,8 @@ static struct element_info elements[] = { static int compar(const void *a, const void *b) { - return strcasecmp(((struct element_info *) a)->name, - ((struct element_info *) b)->name); + return c_strcasecmp(((struct element_info *) a)->name, + ((struct element_info *) b)->name); } #else @@ -548,7 +548,7 @@ void init_tags_lookup(void) { #ifdef USE_FASTFIND - fastfind_index(&ff_tags_index, FF_COMPRESS); + fastfind_index(&ff_tags_index, FF_COMPRESS | FF_LOCALE_INDEP); #endif } diff --git a/src/util/conv.c b/src/util/conv.c index e35ecdce4..13e9f87b7 100644 --- a/src/util/conv.c +++ b/src/util/conv.c @@ -534,3 +534,99 @@ sanitize_url(unsigned char *url) return 1; } + +int c_tolower(int c) { + switch (c) + { + case 'A': return 'a'; + case 'B': return 'b'; + case 'C': return 'c'; + case 'D': return 'd'; + case 'E': return 'e'; + case 'F': return 'f'; + case 'G': return 'g'; + case 'H': return 'h'; + case 'I': return 'i'; + case 'J': return 'j'; + case 'K': return 'k'; + case 'L': return 'l'; + case 'M': return 'm'; + case 'N': return 'n'; + case 'O': return 'o'; + case 'P': return 'p'; + case 'Q': return 'q'; + case 'R': return 'r'; + case 'S': return 's'; + case 'T': return 't'; + case 'U': return 'u'; + case 'V': return 'v'; + case 'W': return 'w'; + case 'X': return 'x'; + case 'Y': return 'y'; + case 'Z': return 'z'; + default: return c; + } +} + +int c_toupper(int c) { + switch (c) { + case 'a': return 'A'; + case 'b': return 'B'; + case 'c': return 'C'; + case 'd': 
return 'D'; + case 'e': return 'E'; + case 'f': return 'F'; + case 'g': return 'G'; + case 'h': return 'H'; + case 'i': return 'I'; + case 'j': return 'J'; + case 'k': return 'K'; + case 'l': return 'L'; + case 'm': return 'M'; + case 'n': return 'N'; + case 'o': return 'O'; + case 'p': return 'P'; + case 'q': return 'Q'; + case 'r': return 'R'; + case 's': return 'S'; + case 't': return 'T'; + case 'u': return 'U'; + case 'v': return 'V'; + case 'w': return 'W'; + case 'x': return 'X'; + case 'y': return 'Y'; + case 'z': return 'Z'; + default: return c; + } +} + +int c_isupper (int c) +{ + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': + case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': + case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': + case 'Y': case 'Z': + return 1; + default: + return 0; + } +} + +int c_islower (int c) +{ + switch (c) + { + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': + case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': + case 's': case 't': case 'u': case 'v': case 'w': case 'x': + case 'y': case 'z': + return 1; + default: + return 0; + } +} + diff --git a/src/util/conv.h b/src/util/conv.h index 605388bb4..b364f1987 100644 --- a/src/util/conv.h +++ b/src/util/conv.h @@ -182,6 +182,16 @@ trim_chars(unsigned char *s, unsigned char c, int *len) return s; } +/* Convert a character to {lower|upper}case using the + * ASCII character set (as if in the C locale); other values are returned unchanged */ +int c_tolower(int c); +int c_toupper(int c); + +/* Check whether a character is a {lower|upper}case letter of the + * ASCII character set (as if in the C locale); returns 1 if so, else 0 */ +int c_islower(int c); +int c_isupper(int c); + /** Convert uppercase letters in @a string with the given @a length to * lowercase. 
*/ static inline void @@ -192,6 +202,16 @@ convert_to_lowercase(unsigned char *string, int length) string[length] = tolower(string[length]); } +/** Convert uppercase letters in @a string with the given @a length to lowercase, + * in place, using the ASCII character set (as if in the C locale) */ +static inline void +convert_to_lowercase_locale_indep(unsigned char *string, int length) +{ + for (length--; length >= 0; length--) + if (c_isupper(string[length])) + string[length] = c_tolower(string[length]); +} + /** This function drops control chars, nbsp char and limit the number * of consecutive space chars to one. It modifies its argument. */ void clr_spaces(unsigned char *str); diff --git a/src/util/fastfind.c b/src/util/fastfind.c index 17ac97ed8..a55b6d912 100644 --- a/src/util/fastfind.c +++ b/src/util/fastfind.c @@ -169,6 +169,7 @@ struct fastfind_info { int leafsets_count; unsigned int case_aware:1; + unsigned int locale_indep:1; unsigned int compress:1; int idxtab[FF_MAX_CHARS]; @@ -233,6 +234,7 @@ FF_DBG_dump_stats(struct fastfind_info *info) fprintf(stderr, "------ FastFind Statistics ------\n"); fprintf(stderr, "Comment : %s\n", info->debug.comment); fprintf(stderr, "Case-aware : %s\n", info->case_aware ? "yes" : "no"); + fprintf(stderr, "Locale-indep: %s\n", info->locale_indep ? "yes" : "no"); fprintf(stderr, "Compress : %s\n", info->compress ? "yes" : "no"); fprintf(stderr, "Uniq_chars : %s\n", info->uniq_chars); fprintf(stderr, "Uniq_chars #: %d/%d max.\n", info->uniq_chars_count, FF_MAX_CHARS); @@ -292,6 +294,7 @@ init_fastfind(struct fastfind_index *index, enum fastfind_flags flags) info->min_key_len = FF_MAX_KEYLEN; info->case_aware = !!(flags & FF_CASE_AWARE); + info->locale_indep = !!(flags & FF_LOCALE_INDEP); info->compress = !!(flags & FF_COMPRESS); FF_DBG_mem(info, sizeof(*info) - sizeof(info->debug)); @@ -434,7 +437,7 @@ compress_tree(struct ff_node *leafset, struct fastfind_info *info) } } -#define ifcase(c) (info->case_aware ? 
(c) : toupper(c)) +#define ifcase(c) ( info->case_aware ? (c) : (info->locale_indep ? c_toupper(c) : toupper(c)) ) struct fastfind_index * fastfind_index(struct fastfind_index *index, enum fastfind_flags flags) @@ -622,7 +625,10 @@ fastfind_search(struct fastfind_index *index, if (info->case_aware) FF_SEARCH(key[i]); else - FF_SEARCH(toupper(key[i])); + if (info->locale_indep) + FF_SEARCH(c_toupper(key[i])); + else + FF_SEARCH(toupper(key[i])); return NULL; } diff --git a/src/util/fastfind.h b/src/util/fastfind.h index a6b855854..b01427828 100644 --- a/src/util/fastfind.h +++ b/src/util/fastfind.h @@ -19,6 +19,8 @@ enum fastfind_flags { FF_NONE = 0, FF_CASE_AWARE = 1, /**< honour case when comparing */ FF_COMPRESS = 2, /**< compress nodes if possible */ + FF_LOCALE_INDEP = 4 /**< when not case-aware, fold case with + * c_toupper() (ASCII only) instead of toupper() */ }; struct fastfind_index { diff --git a/src/util/string.c b/src/util/string.c index 076c491b8..d4ef1c63f 100644 --- a/src/util/string.c +++ b/src/util/string.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "elinks.h" @@ -233,11 +234,91 @@ elinks_strlcmp(const unsigned char *s1, size_t n1, int elinks_strlcasecmp(const unsigned char *s1, size_t n1, - const unsigned char *s2, size_t n2) + const unsigned char *s2, size_t n2, + const int locale_indep) { - strlcmp_device("strlcasecmp", s1, n1, s2, n2, toupper(s1[p]), toupper(s2[p])); + if (locale_indep) { + strlcmp_device("strlcasecmp", s1, n1, s2, n2, c_toupper(s1[p]), c_toupper(s2[p])); + } + else { + strlcmp_device("strlcasecmp", s1, n1, s2, n2, toupper(s1[p]), toupper(s2[p])); + } } +/* c_strcasecmp - compare two strings ignoring ASCII letter case + * Taken from GNU coreutils (version 6.9) + * File name: lib/c-strcasecmp.c + * Copyright (C) 1998-1999, 2005-2006 Free Software Foundation, Inc. + * Licensed under the GPL version 2 or any later version. 
+ */ +int c_strcasecmp (const char *s1, const char *s2) +{ + register const unsigned char *p1 = (const unsigned char *) s1; + register const unsigned char *p2 = (const unsigned char *) s2; + unsigned char c1, c2; + + if (p1 == p2) + return 0; + + do + { + c1 = c_tolower (*p1); + c2 = c_tolower (*p2); + + if (c1 == '\0') + break; + + ++p1; + ++p2; + } + while (c1 == c2); + + if (UCHAR_MAX <= INT_MAX) + return c1 - c2; + else + /* On machines where 'char' and 'int' are types of the same size, the + difference of two 'unsigned char' values - including the sign bit - + doesn't fit in an 'int'. */ + return (c1 > c2 ? 1 : c1 < c2 ? -1 : 0); +} + +/* c_strncasecmp - like c_strcasecmp, but compares at most n characters + * Taken from GNU coreutils (version 6.9) + * File name: lib/c-strncasecmp.c + * ^ (note the "n") + * Copyright (C) 1998-1999, 2005-2006 Free Software Foundation, Inc. + * Licensed under the GPL version 2 or any later version. + */ +int c_strncasecmp (const char *s1, const char *s2, size_t n) +{ + register const unsigned char *p1 = (const unsigned char *) s1; + register const unsigned char *p2 = (const unsigned char *) s2; + unsigned char c1, c2; + + if (p1 == p2 || n == 0) + return 0; + + do + { + c1 = c_tolower (*p1); + c2 = c_tolower (*p2); + + if (--n == 0 || c1 == '\0') + break; + + ++p1; + ++p2; + } + while (c1 == c2); + + if (UCHAR_MAX <= INT_MAX) + return c1 - c2; + else + /* On machines where 'char' and 'int' are types of the same size, the + difference of two 'unsigned char' values - including the sign bit - + doesn't fit in an 'int'. */ + return (c1 > c2 ? 1 : c1 < c2 ? -1 : 0); +} /* The new string utilities: */ diff --git a/src/util/string.h b/src/util/string.h index 0008863d5..243a348c2 100644 --- a/src/util/string.h +++ b/src/util/string.h @@ -100,9 +100,16 @@ int elinks_strlcmp(const unsigned char *s1, size_t n1, const unsigned char *s2, size_t n2); /** Acts identically to strlcmp(), except for being case insensitive. 
*/ -#define strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d)) +#define strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d,0)) +#define c_strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d,1)) int elinks_strlcasecmp(const unsigned char *s1, size_t n1, - const unsigned char *s2, size_t n2); + const unsigned char *s2, size_t n2, + const int locale_indep); + +/* strcasecmp and strncasecmp replacements which always behave as if they are + * in the C locale (ASCII-only case folding) - both taken from GNU coreutils */ +int c_strcasecmp(const char *s1, const char *s2); +int c_strncasecmp(const char *s1, const char *s2, size_t n); /** @} */