mirror of
https://github.com/rkd77/elinks.git
synced 2024-12-04 14:46:47 -05:00
Patch 0: Partial modification of the HTML parser and modification of the FastFind subsystem
[Forward ported to 0.12 from bug 1004 attachment 500. --KON]
This commit is contained in:
parent
12d66ff043
commit
85c26ddc45
@ -151,7 +151,7 @@ next_attr:
|
||||
n = name;
|
||||
name_start = e;
|
||||
|
||||
while (atchr(*n) && atchr(*e) && toupper(*e) == toupper(*n)) e++, n++;
|
||||
while (atchr(*n) && atchr(*e) && c_toupper(*e) == c_toupper(*n)) e++, n++;
|
||||
found = !*n && !atchr(*e);
|
||||
|
||||
if (found && (flags & HTML_ATTR_TEST)) return name_start;
|
||||
@ -504,7 +504,7 @@ static struct element_info elements[] = {
|
||||
static int
|
||||
compar(const void *a, const void *b)
|
||||
{
|
||||
return strcasecmp(((struct element_info *) a)->name,
|
||||
return c_strcasecmp(((struct element_info *) a)->name,
|
||||
((struct element_info *) b)->name);
|
||||
}
|
||||
|
||||
@ -548,7 +548,7 @@ void
|
||||
init_tags_lookup(void)
|
||||
{
|
||||
#ifdef USE_FASTFIND
|
||||
fastfind_index(&ff_tags_index, FF_COMPRESS);
|
||||
fastfind_index(&ff_tags_index, FF_COMPRESS | FF_LOCALE_INDEP);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -534,3 +534,99 @@ sanitize_url(unsigned char *url)
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
||||
/** Convert an uppercase ASCII letter to lowercase, as if in the C locale.
 *
 * The mapping is spelled out with character constants only, so the
 * result does not depend on the locale currently in effect.
 *
 * @param c  the character code to convert
 * @return the lowercase letter when @a c is one of 'A'..'Z';
 *         any other value (including negative ones) is returned as is */
int
c_tolower(int c)
{
	switch (c) {
	case 'A': return 'a';
	case 'B': return 'b';
	case 'C': return 'c';
	case 'D': return 'd';
	case 'E': return 'e';
	case 'F': return 'f';
	case 'G': return 'g';
	case 'H': return 'h';
	case 'I': return 'i';
	case 'J': return 'j';
	case 'K': return 'k';
	case 'L': return 'l';
	case 'M': return 'm';
	case 'N': return 'n';
	case 'O': return 'o';
	case 'P': return 'p';
	case 'Q': return 'q';
	case 'R': return 'r';
	case 'S': return 's';
	case 'T': return 't';
	case 'U': return 'u';
	case 'V': return 'v';
	case 'W': return 'w';
	case 'X': return 'x';
	case 'Y': return 'y';
	case 'Z': return 'z';
	default: return c;
	}
}
|
||||
|
||||
/** Convert a lowercase ASCII letter to uppercase, as if in the C locale.
 *
 * The mapping is spelled out with character constants only, so the
 * result does not depend on the locale currently in effect.
 *
 * @param c  the character code to convert
 * @return the uppercase letter when @a c is one of 'a'..'z';
 *         any other value (including negative ones) is returned as is */
int
c_toupper(int c)
{
	switch (c) {
	case 'a': return 'A';
	case 'b': return 'B';
	case 'c': return 'C';
	case 'd': return 'D';
	case 'e': return 'E';
	case 'f': return 'F';
	case 'g': return 'G';
	case 'h': return 'H';
	case 'i': return 'I';
	case 'j': return 'J';
	case 'k': return 'K';
	case 'l': return 'L';
	case 'm': return 'M';
	case 'n': return 'N';
	case 'o': return 'O';
	case 'p': return 'P';
	case 'q': return 'Q';
	case 'r': return 'R';
	case 's': return 'S';
	case 't': return 'T';
	case 'u': return 'U';
	case 'v': return 'V';
	case 'w': return 'W';
	case 'x': return 'X';
	case 'y': return 'Y';
	case 'z': return 'Z';
	default: return c;
	}
}
|
||||
|
||||
/** Check whether a character is an uppercase ASCII letter, as if in
 * the C locale.
 *
 * Only character constants are tested, so the answer does not depend
 * on the locale currently in effect.
 *
 * @param c  the character code to test
 * @return 1 when @a c is one of 'A'..'Z', 0 for every other value */
int
c_isupper(int c)
{
	switch (c) {
	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
	case 'Y': case 'Z':
		return 1;
	default:
		return 0;
	}
}
|
||||
|
||||
/** Check whether a character is a lowercase ASCII letter, as if in
 * the C locale.
 *
 * Only character constants are tested, so the answer does not depend
 * on the locale currently in effect.
 *
 * @param c  the character code to test
 * @return 1 when @a c is one of 'a'..'z', 0 for every other value */
int
c_islower(int c)
{
	switch (c) {
	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
	case 'y': case 'z':
		return 1;
	default:
		return 0;
	}
}
|
||||
|
||||
|
@ -182,6 +182,16 @@ trim_chars(unsigned char *s, unsigned char c, int *len)
|
||||
return s;
|
||||
}
|
||||
|
||||
/* Convert a character to {lower|upper}case using the
|
||||
* ASCII character set (as if in the C locale) */
|
||||
int c_tolower(int c);
|
||||
int c_toupper(int c);
|
||||
|
||||
/* Check whether a character is {lower|upper}case using the
|
||||
* ASCII character set (as if in the C locale) */
|
||||
int c_islower(int c);
|
||||
int c_isupper(int c);
|
||||
|
||||
/** Convert uppercase letters in @a string with the given @a length to
|
||||
* lowercase. */
|
||||
static inline void
|
||||
@ -192,6 +202,16 @@ convert_to_lowercase(unsigned char *string, int length)
|
||||
string[length] = tolower(string[length]);
|
||||
}
|
||||
|
||||
/** Convert uppercase letters in @a string with the given @a length to
 * lowercase using the ASCII character set (as if in the C locale).
 *
 * The buffer is modified in place, walking from the last byte to the
 * first. A byte is only written when c_isupper() reports it as an
 * uppercase letter; every other byte is left untouched.
 *
 * @param string  the buffer to convert; it does not need to be
 *                NUL-terminated, since only @a length bytes are visited
 * @param length  number of bytes to process; nothing happens when it
 *                is zero or negative */
static inline void
convert_to_lowercase_locale_indep(unsigned char *string, int length)
{
	for (length--; length >= 0; length--)
		if (c_isupper(string[length]))
			string[length] = c_tolower(string[length]);
}
|
||||
|
||||
/** This function drops control chars, nbsp char and limit the number
|
||||
* of consecutive space chars to one. It modifies its argument. */
|
||||
void clr_spaces(unsigned char *str);
|
||||
|
@ -169,6 +169,7 @@ struct fastfind_info {
|
||||
int leafsets_count;
|
||||
|
||||
unsigned int case_aware:1;
|
||||
unsigned int locale_indep:1;
|
||||
unsigned int compress:1;
|
||||
|
||||
int idxtab[FF_MAX_CHARS];
|
||||
@ -233,6 +234,7 @@ FF_DBG_dump_stats(struct fastfind_info *info)
|
||||
fprintf(stderr, "------ FastFind Statistics ------\n");
|
||||
fprintf(stderr, "Comment : %s\n", info->debug.comment);
|
||||
fprintf(stderr, "Case-aware : %s\n", info->case_aware ? "yes" : "no");
|
||||
fprintf(stderr, "Locale-indep: %s\n", info->locale_indep ? "yes" : "no");
|
||||
fprintf(stderr, "Compress : %s\n", info->compress ? "yes" : "no");
|
||||
fprintf(stderr, "Uniq_chars : %s\n", info->uniq_chars);
|
||||
fprintf(stderr, "Uniq_chars #: %d/%d max.\n", info->uniq_chars_count, FF_MAX_CHARS);
|
||||
@ -292,6 +294,7 @@ init_fastfind(struct fastfind_index *index, enum fastfind_flags flags)
|
||||
|
||||
info->min_key_len = FF_MAX_KEYLEN;
|
||||
info->case_aware = !!(flags & FF_CASE_AWARE);
|
||||
info->locale_indep = !!(flags & FF_LOCALE_INDEP);
|
||||
info->compress = !!(flags & FF_COMPRESS);
|
||||
|
||||
FF_DBG_mem(info, sizeof(*info) - sizeof(info->debug));
|
||||
@ -434,7 +437,7 @@ compress_tree(struct ff_node *leafset, struct fastfind_info *info)
|
||||
}
|
||||
}
|
||||
|
||||
#define ifcase(c) (info->case_aware ? (c) : toupper(c))
|
||||
#define ifcase(c) ( info->case_aware ? (c) : (info->locale_indep ? c_toupper(c) : toupper(c)) )
|
||||
|
||||
struct fastfind_index *
|
||||
fastfind_index(struct fastfind_index *index, enum fastfind_flags flags)
|
||||
@ -621,6 +624,9 @@ fastfind_search(struct fastfind_index *index,
|
||||
FF_DBG_test(info);
|
||||
if (info->case_aware)
|
||||
FF_SEARCH(key[i]);
|
||||
else
|
||||
if (info->locale_indep)
|
||||
FF_SEARCH(c_toupper(key[i]));
|
||||
else
|
||||
FF_SEARCH(toupper(key[i]));
|
||||
|
||||
|
@ -19,6 +19,8 @@ enum fastfind_flags {
|
||||
FF_NONE = 0,
|
||||
FF_CASE_AWARE = 1, /**< honour case when comparing */
|
||||
FF_COMPRESS = 2, /**< compress nodes if possible */
|
||||
FF_LOCALE_INDEP = 4 /**< whether the case conversion is
|
||||
* locale independent or not */
|
||||
};
|
||||
|
||||
struct fastfind_index {
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include "elinks.h"
|
||||
|
||||
@ -233,11 +234,91 @@ elinks_strlcmp(const unsigned char *s1, size_t n1,
|
||||
|
||||
int
|
||||
elinks_strlcasecmp(const unsigned char *s1, size_t n1,
|
||||
const unsigned char *s2, size_t n2)
|
||||
const unsigned char *s2, size_t n2,
|
||||
const int locale_indep)
|
||||
{
|
||||
if (locale_indep) {
|
||||
strlcmp_device("strlcasecmp", s1, n1, s2, n2, c_toupper(s1[p]), c_toupper(s2[p]));
|
||||
}
|
||||
else {
|
||||
strlcmp_device("strlcasecmp", s1, n1, s2, n2, toupper(s1[p]), toupper(s2[p]));
|
||||
}
|
||||
}
|
||||
|
||||
/* c_strcasecmp
 * Taken from GNU coreutils (version 6.9)
 * File name: lib/c-strcasecmp.c
 * Copyright (C) 1998-1999, 2005-2006 Free Software Foundation, Inc.
 * Licensed under the GPL version 2 or any later version.
 */
/** Compare two NUL-terminated strings, ignoring the case of ASCII
 * letters (as if in the C locale).
 *
 * Both strings are read as unsigned bytes and folded with c_tolower()
 * before being compared, so the result does not depend on the locale
 * currently in effect.
 *
 * @param s1  the first string
 * @param s2  the second string
 * @return 0 when the strings are equal after folding (or when both
 *         pointers are the same); otherwise a negative or positive
 *         value according to whether the first differing folded byte
 *         of @a s1 is smaller or greater than that of @a s2 */
int
c_strcasecmp(const char *s1, const char *s2)
{
	const unsigned char *p1 = (const unsigned char *) s1;
	const unsigned char *p2 = (const unsigned char *) s2;
	unsigned char c1, c2;

	/* The same pointer trivially compares equal. */
	if (p1 == p2)
		return 0;

	do {
		c1 = c_tolower(*p1);
		c2 = c_tolower(*p2);

		/* End of @s1: c2 decides the result below. */
		if (c1 == '\0')
			break;

		++p1;
		++p2;
	} while (c1 == c2);

	if (UCHAR_MAX <= INT_MAX)
		return c1 - c2;
	else
		/* On machines where 'char' and 'int' are types of the same
		 * size, the difference of two 'unsigned char' values -
		 * including the sign bit - doesn't fit in an 'int'. */
		return (c1 > c2 ? 1 : c1 < c2 ? -1 : 0);
}
|
||||
|
||||
/* c_strncasecmp
 * Taken from GNU coreutils (version 6.9)
 * File name: lib/c-strncasecmp.c
 *            (note the "n" in the file name)
 * Copyright (C) 1998-1999, 2005-2006 Free Software Foundation, Inc.
 * Licensed under the GPL version 2 or any later version.
 */
/** Compare at most @a n bytes of two strings, ignoring the case of
 * ASCII letters (as if in the C locale).
 *
 * Both strings are read as unsigned bytes and folded with c_tolower()
 * before being compared, so the result does not depend on the locale
 * currently in effect. Comparison stops at the first differing byte,
 * at the end of @a s1, or after @a n bytes, whichever comes first.
 *
 * @param s1  the first string
 * @param s2  the second string
 * @param n   maximum number of bytes to compare
 * @return 0 when @a n is zero, when both pointers are the same, or
 *         when the compared bytes are equal after folding; otherwise
 *         a negative or positive value according to whether the first
 *         differing folded byte of @a s1 is smaller or greater than
 *         that of @a s2 */
int
c_strncasecmp(const char *s1, const char *s2, size_t n)
{
	const unsigned char *p1 = (const unsigned char *) s1;
	const unsigned char *p2 = (const unsigned char *) s2;
	unsigned char c1, c2;

	/* Nothing to compare, or the same pointer: equal. */
	if (p1 == p2 || n == 0)
		return 0;

	do {
		c1 = c_tolower(*p1);
		c2 = c_tolower(*p2);

		/* Stop once the byte budget is used up or @s1 ends;
		 * c1 and c2 hold the last pair that was examined. */
		if (--n == 0 || c1 == '\0')
			break;

		++p1;
		++p2;
	} while (c1 == c2);

	if (UCHAR_MAX <= INT_MAX)
		return c1 - c2;
	else
		/* On machines where 'char' and 'int' are types of the same
		 * size, the difference of two 'unsigned char' values -
		 * including the sign bit - doesn't fit in an 'int'. */
		return (c1 > c2 ? 1 : c1 < c2 ? -1 : 0);
}
|
||||
|
||||
/* The new string utilities: */
|
||||
|
||||
|
@ -100,9 +100,16 @@ int elinks_strlcmp(const unsigned char *s1, size_t n1,
|
||||
const unsigned char *s2, size_t n2);
|
||||
|
||||
/** Acts identically to strlcmp(), except for being case insensitive. */
|
||||
#define strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d))
|
||||
#define strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d,0))
|
||||
#define c_strlcasecmp(a,b,c,d) (errfile = __FILE__, errline = __LINE__, elinks_strlcasecmp(a,b,c,d,1))
|
||||
int elinks_strlcasecmp(const unsigned char *s1, size_t n1,
|
||||
const unsigned char *s2, size_t n2);
|
||||
const unsigned char *s2, size_t n2,
|
||||
const int locale_indep);
|
||||
|
||||
/* strcasecmp and strncasecmp which work as if they are
|
||||
* in the C locale - both taken from GNU coreutils */
|
||||
int c_strcasecmp(const char *s1, const char *s2);
|
||||
int c_strncasecmp(const char *s1, const char *s2, size_t n);
|
||||
|
||||
/** @} */
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user