1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-12-04 14:46:47 -05:00

Patch 0: Partial modification of the HTML parser and modification of the FastFind subsystem

[Forward ported to 0.12 from bug 1004 attachment 500.  --KON]
This commit is contained in:
M. Vefa Bicakci 2008-10-19 03:36:00 +02:00 committed by Kalle Olavi Niemitalo
parent 12d66ff043
commit 85c26ddc45
7 changed files with 222 additions and 10 deletions

View File

@ -151,7 +151,7 @@ next_attr:
n = name;
name_start = e;
while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++;
while (atchr(*n) && atchr(*e) && c_toupper(*e) == c_toupper(*n)) e++, n++;
found = !*n && !atchr(*e);
if (found && (flags & HTML_ATTR_TEST)) return name_start;
@ -504,7 +504,7 @@ static struct element_info elements[] = {
static int
compar(const void *a, const void *b)
{
return strcasecmp(((struct element_info *) a)->name,
return c_strcasecmp(((struct element_info *) a)->name,
((struct element_info *) b)->name);
}
@ -548,7 +548,7 @@ void
init_tags_lookup(void)
{
#ifdef USE_FASTFIND
fastfind_index(&ff_tags_index, FF_COMPRESS);
fastfind_index(&ff_tags_index, FF_COMPRESS | FF_LOCALE_INDEP);
#endif
}

View File

@ -534,3 +534,99 @@ sanitize_url(unsigned char *url)
return 1;
}
/** Convert @a c to lowercase as if the C locale were in effect.
 * Only the 26 ASCII uppercase letters are mapped to their lowercase
 * counterparts; any other value (negative values and bytes above 127
 * included) is returned unchanged, regardless of the current locale.
 *
 * @param c  the character code to convert
 * @return the lowercase equivalent of @a c, or @a c itself */
int
c_tolower(int c)
{
	/* ASCII keeps 'A'..'Z' contiguous, so a range test followed by a
	 * constant offset covers every letter. */
	if (c >= 'A' && c <= 'Z')
		return c + ('a' - 'A');

	return c;
}
/** Convert @a c to uppercase as if the C locale were in effect.
 * Only the 26 ASCII lowercase letters are mapped to their uppercase
 * counterparts; any other value (negative values and bytes above 127
 * included) is returned unchanged, regardless of the current locale.
 *
 * @param c  the character code to convert
 * @return the uppercase equivalent of @a c, or @a c itself */
int
c_toupper(int c)
{
	/* ASCII keeps 'a'..'z' contiguous, so a range test followed by a
	 * constant offset covers every letter. */
	if (c >= 'a' && c <= 'z')
		return c - ('a' - 'A');

	return c;
}
/** Check whether @a c is an uppercase letter of the ASCII character set
 * (as if in the C locale).  The current locale is never consulted.
 *
 * @param c  the character code to test
 * @return 1 if @a c is one of 'A'..'Z', 0 otherwise */
int
c_isupper(int c)
{
	/* A relational expression yields exactly 1 or 0 in C. */
	return (c >= 'A' && c <= 'Z');
}
/** Check whether @a c is a lowercase letter of the ASCII character set
 * (as if in the C locale).  The current locale is never consulted.
 *
 * @param c  the character code to test
 * @return 1 if @a c is one of 'a'..'z', 0 otherwise */
int
c_islower(int c)
{
	/* A relational expression yields exactly 1 or 0 in C. */
	return (c >= 'a' && c <= 'z');
}

View File

@ -182,6 +182,16 @@ trim_chars(unsigned char *s, unsigned char c, int *len)
return s;
}
/* Convert a character to {lower|upper}case using the
* ASCII character set (as if in the C locale) */
int c_tolower(int c);
int c_toupper(int c);
/* Check whether a character is {lower|upper}case using
 * the ASCII character set (as if in the C locale) */
int c_islower(int c);
int c_isupper(int c);
/** Convert uppercase letters in @a string with the given @a length to
* lowercase. */
static inline void
@ -192,6 +202,16 @@ convert_to_lowercase(unsigned char *string, int length)
string[length] = tolower(string[length]);
}
/** Convert uppercase letters in @a string with the given @a length to
 * lowercase using the ASCII character set (as if in the C locale).
 * The buffer is modified in place; bytes for which c_isupper() reports
 * false are not written to.  Nothing happens when @a length is zero
 * or negative. */
static inline void
convert_to_lowercase_locale_indep(unsigned char *string, int length)
{
	int pos;

	for (pos = 0; pos < length; pos++) {
		const unsigned char ch = string[pos];

		if (!c_isupper(ch))
			continue;

		string[pos] = c_tolower(ch);
	}
}
/** This function drops control chars, nbsp char and limit the number
* of consecutive space chars to one. It modifies its argument. */
void clr_spaces(unsigned char *str);

View File

@ -169,6 +169,7 @@ struct fastfind_info {
int leafsets_count;
unsigned int case_aware:1;
unsigned int locale_indep:1;
unsigned int compress:1;
int idxtab[FF_MAX_CHARS];
@ -233,6 +234,7 @@ FF_DBG_dump_stats(struct fastfind_info *info)
fprintf(stderr, "------ FastFind Statistics ------\n");
fprintf(stderr, "Comment : %s\n", info->debug.comment);
fprintf(stderr, "Case-aware : %s\n", info->case_aware ? "yes" : "no");
fprintf(stderr, "Locale-indep: %s\n", info->locale_indep ? "yes" : "no");
fprintf(stderr, "Compress : %s\n", info->compress ? "yes" : "no");
fprintf(stderr, "Uniq_chars : %s\n", info->uniq_chars);
fprintf(stderr, "Uniq_chars #: %d/%d max.\n", info->uniq_chars_count, FF_MAX_CHARS);
@ -292,6 +294,7 @@ init_fastfind(struct fastfind_index *index, enum fastfind_flags flags)
info->min_key_len = FF_MAX_KEYLEN;
info->case_aware = !!(flags & FF_CASE_AWARE);
info->locale_indep = !!(flags & FF_LOCALE_INDEP);
info->compress = !!(flags & FF_COMPRESS);
FF_DBG_mem(info, sizeof(*info) - sizeof(info->debug));
@ -434,7 +437,7 @@ compress_tree(struct ff_node *leafset, struct fastfind_info *info)
}
}
#define ifcase(c) (info->case_aware ? (c) : toupper(c))
#define ifcase(c) ( info->case_aware ? (c) : (info->locale_indep ? c_toupper(c) : toupper(c)) )
struct fastfind_index *
fastfind_index(struct fastfind_index *index, enum fastfind_flags flags)
@ -621,6 +624,9 @@ fastfind_search(struct fastfind_index *index,
FF_DBG_test(info);
if (info->case_aware)
FF_SEARCH(key[i]);
else
if (info->locale_indep)
FF_SEARCH(c_toupper(key[i]));
else
FF_SEARCH(toupper(key[i]));

View File

@ -19,6 +19,8 @@ enum fastfind_flags {
FF_NONE = 0,
FF_CASE_AWARE = 1, /**< honour case when comparing */
FF_COMPRESS = 2, /**< compress nodes if possible */
FF_LOCALE_INDEP = 4 /**< whether the case conversion is
* locale independent or not */
};
struct fastfind_index {

View File

@ -14,6 +14,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include "elinks.h"
@ -233,11 +234,91 @@ elinks_strlcmp(const unsigned char *s1, size_t n1,
int
elinks_strlcasecmp(const unsigned char *s1, size_t n1,
const unsigned char *s2, size_t n2)
const unsigned char *s2, size_t n2,
const int locale_indep)
{
if (locale_indep) {
strlcmp_device("strlcasecmp", s1, n1, s2, n2, c_toupper(s1[p]), c_toupper(s2[p]));
}
else {
strlcmp_device("strlcasecmp", s1, n1, s2, n2, toupper(s1[p]), toupper(s2[p]));
}
}
/* c_strcasecmp
 * Taken from GNU coreutils (version 6.9)
 * File name: lib/c-strcasecmp.c
 * Copyright (C) 1998-1999, 2005-2006 Free Software Foundation, Inc.
 * Licensed under the GPL version 2 or any later version.
 *
 * Compare the NUL-terminated strings s1 and s2, folding case with
 * c_tolower() so that the result does not depend on the current locale.
 * Returns zero when the strings compare equal, otherwise the difference
 * between the first pair of folded bytes that differ (negative if s1
 * sorts first, positive if s2 does).
 */
int c_strcasecmp (const char *s1, const char *s2)
{
	const unsigned char *left = (const unsigned char *) s1;
	const unsigned char *right = (const unsigned char *) s2;
	unsigned char lc, rc;

	/* The very same pointer trivially compares equal. */
	if (left == right)
		return 0;

	for (;; left++, right++) {
		lc = c_tolower (*left);
		rc = c_tolower (*right);

		/* Stop at the end of s1 or at the first mismatch.  If only
		 * s2 has ended, rc is NUL and the mismatch test catches it. */
		if (lc == '\0' || lc != rc)
			break;
	}

	if (UCHAR_MAX <= INT_MAX)
		return lc - rc;

	/* On machines where 'char' and 'int' are types of the same size, the
	 * difference of two 'unsigned char' values - including the sign bit -
	 * doesn't fit in an 'int', so only report the sign. */
	return (lc > rc) - (lc < rc);
}
/* c_strncasecmp
 * Taken from GNU coreutils (version 6.9)
 * File name: lib/c-strncasecmp.c
 *                      ^ (note the "n")
 * Copyright (C) 1998-1999, 2005-2006 Free Software Foundation, Inc.
 * Licensed under the GPL version 2 or any later version.
 *
 * Compare at most n bytes of the strings s1 and s2, folding case with
 * c_tolower() so that the result does not depend on the current locale.
 * Returns zero when the compared prefixes are equal (or n is zero),
 * otherwise the difference between the first pair of folded bytes that
 * differ.
 */
int c_strncasecmp (const char *s1, const char *s2, size_t n)
{
	const unsigned char *left = (const unsigned char *) s1;
	const unsigned char *right = (const unsigned char *) s2;
	unsigned char lc, rc;

	/* Nothing to compare, or the very same pointer. */
	if (n == 0 || left == right)
		return 0;

	for (;; left++, right++) {
		lc = c_tolower (*left);
		rc = c_tolower (*right);

		/* Stop once the byte budget is spent, at the end of s1, or at
		 * the first mismatch.  The budget is decremented on every
		 * iteration, before the other tests. */
		if (--n == 0 || lc == '\0' || lc != rc)
			break;
	}

	if (UCHAR_MAX <= INT_MAX)
		return lc - rc;

	/* On machines where 'char' and 'int' are types of the same size, the
	 * difference of two 'unsigned char' values - including the sign bit -
	 * doesn't fit in an 'int', so only report the sign. */
	return (lc > rc) - (lc < rc);
}
/* The new string utilities: */

View File

@ -100,9 +100,16 @@ int elinks_strlcmp(const unsigned char *s1, size_t n1,
const unsigned char *s2, size_t n2);
/** Acts identically to strlcmp(), except for being case insensitive. */
#define strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d))
#define strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d,0))
#define c_strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d,1))
int elinks_strlcasecmp(const unsigned char *s1, size_t n1,
const unsigned char *s2, size_t n2);
const unsigned char *s2, size_t n2,
const int locale_indep);
/* strcasecmp and strncasecmp which work as if they are
* in the C locale - both taken from GNU coreutils */
int c_strcasecmp(const char *s1, const char *s2);
int c_strncasecmp(const char *s1, const char *s2, size_t n);
/** @} */