/* Very fast search_keyword_in_list. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include "elinks.h" #include "util/conv.h" #include "util/error.h" #include "util/fastfind.h" #include "util/memdebug.h" #include "util/memory.h" #ifdef USE_FASTFIND /* It replaces bsearch() + strcasecmp() + callback + ... * * Following conditions should be met: * * - list keys are C strings. * - keys should not be greater than 255 characters, and optimally < 20 * characters. It can work with greater keys but then memory usage will * grow a lot. * - each key must be unique and non empty. * - list do not have to be ordered. * - total number of unique characters used in all keys should be <= 128 * - idealy total number of keys should be <= 512 (but see below) * * (c) 2003 Laurent MONIN (aka Zas) * Feel free to do whatever you want with that code. */ /* These routines use a tree search. First, a big tree is composed from the * keys on input. Then, when searching we just go through the tree. If we will * end up on an 'ending' node, we've got it. * * Hm, okay. For keys { 'head', 'h1', 'body', 'bodyrock', 'bodyground' }, it * would look like: * * [root] * b h * o e 1 * d a * Y D * g r * r o * o c * u K * D * * (the ending nodes are upcased just for this drawing, not in real) * * To optimize this for speed, leafs of nodes are organized in per-node arrays * (so-called 'leafsets'), indexed by symbol value of the key's next character. * But to optimize that for memory, we first compose own alphabet consisting * only from the chars we ever use in the key strings. @uniq_chars holds that * alphabet and @idxtab is used to translate between it and ASCII. * * Tree building: O((L+M)*N) * (L: mean key length, M: alphabet size, * N: number of items). * String lookup: O(N) (N: string length). */ /* Define it to generate performance and memory usage statistics to stderr. */ #if 0 #define DEBUG_FASTFIND #endif /* Define whether to use 32 or 64 bits per compressed element. 
*/
#if 1
#define USE_32_BITS
#endif

/* Widths of the one-bit flags stored in every node. */
#define END_LEAF_BITS 1
#define COMPRESSED_BITS 1

#ifdef USE_32_BITS

/* Use only 32 bits per element, but has very low limits. */
/* Adequate for ELinks tags search. */

#define POINTER_INDEX_BITS 10 /* 1024 */
#define LEAFSET_INDEX_BITS 13 /* 8192 */
#define COMP_CHAR_INDEX_BITS 7 /* 128 */

/* In this configuration plain and compressed nodes share one layout. */
#define ff_node ff_node_c /* Both are 32 bits long. */

/* Compile-time guard: all five bit-fields must fit in 32 bits. */
#if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
     COMP_CHAR_INDEX_BITS + END_LEAF_BITS + \
     COMPRESSED_BITS) > 32
#error Over 32 bits in struct ff_node !!
#endif

#else /* !USE_32_BITS */

/* Keep this one if there are more than 512 keywords in a list;
 * it eats a bit more memory.
 * ELinks may need this one if fastfind is used in other
 * things than tags searching. */
/* This will make struct ff_node_c use 64 bits. */

#define POINTER_INDEX_BITS 12
#define LEAFSET_INDEX_BITS 18
#define COMP_CHAR_INDEX_BITS 8

/* Compile-time guard for the uncompressed node only: the char index is
 * deliberately left out of the sum since struct ff_node has no such field. */
#if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
     + END_LEAF_BITS + COMPRESSED_BITS) > 32
#error Over 32 bits in struct ff_node !!
#endif

/* Uncompressed tree node. The first four fields are declared in the same
 * order and with the same widths as in struct ff_node_c; the search code
 * reads them through either type. */
struct ff_node {
	/* End leaf -> p is significant */
	unsigned int e:END_LEAF_BITS;

	/* Compressed */
	unsigned int c:COMPRESSED_BITS;

	/* Index in pointers */
	unsigned int p:POINTER_INDEX_BITS;

	/* Index in leafsets */
	unsigned int l:LEAFSET_INDEX_BITS;
};

#endif /* USE_32_BITS */

/* Limits derived from the bit widths above. Leafset index 0 means
 * "no leafset", hence the - 1 in FF_MAX_LEAFSETS. */
#define FF_MAX_KEYS (1 << POINTER_INDEX_BITS)
#define FF_MAX_LEAFSETS ((1 << LEAFSET_INDEX_BITS) - 1)
#define FF_MAX_CHARS (1 << COMP_CHAR_INDEX_BITS)

/* Longest key accepted by fastfind_index(). */
#define FF_MAX_KEYLEN 255

/* Compressed node: stands in for a whole leafset that has exactly one used
 * slot. @ch remembers which slot that was. */
struct ff_node_c {
	unsigned int e:END_LEAF_BITS;
	unsigned int c:COMPRESSED_BITS;
	unsigned int p:POINTER_INDEX_BITS;
	unsigned int l:LEAFSET_INDEX_BITS;

	/* Index of char when compressed. */
	unsigned int ch:COMP_CHAR_INDEX_BITS;
};

/* One indexed entry: the caller's data pointer and the length of its key. */
struct ff_data {
	void *pointer;
	int keylen;
};

/* Private state of one index; stored in fastfind_index.handle. */
struct fastfind_info {
	/* Entries, indexed by the @p field of an end node. */
	struct ff_data *data;

	/* Leafset table (slot 0 is unused) and the tree entry point. */
	struct ff_node **leafsets;
	struct ff_node *root_leafset;

	/* Shortest and longest key seen; used to reject lookups early. */
	int min_key_len;
	int max_key_len;

	int uniq_chars_count;	/* Used part of @uniq_chars. */
	int count;		/* Keys counted by the first indexing pass. */
	int pointers_count;	/* Entries stored in @data so far. */
	int leafsets_count;	/* Highest used slot of @leafsets. */

	unsigned int case_aware:1;
	unsigned int compress:1;

	/* @idxtab maps a character code to its position in @uniq_chars,
	 * or to -1 when the character occurs in no key. */
	int idxtab[FF_MAX_CHARS];
	unsigned char uniq_chars[FF_MAX_CHARS];

#ifdef DEBUG_FASTFIND
	/* Counters maintained through the FF_DBG_*() macros. */
	struct {
		unsigned long searches;
		unsigned long found;
		unsigned long itertmp;
		unsigned long iterdelta;
		unsigned long itermax;
		unsigned long iterations;
		unsigned long tests;
		unsigned long teststmp;
		unsigned long testsdelta;
		unsigned long testsmax;
		unsigned long memory_usage;
		unsigned long total_key_len;
		unsigned int compressed_nodes;
		unsigned char *comment;
	} debug;
#endif
};

#ifdef DEBUG_FASTFIND
/* These are for performance testing. */
#define FF_DBG_mem(x, size) (x)->debug.memory_usage += (size)
#define FF_DBG_test(x) (x)->debug.tests++
#define FF_DBG_iter(x) (x)->debug.iterations++
#define FF_DBG_cnode(x) (x)->debug.compressed_nodes++
/* Account for a successful search: add the iterations/tests spent since
 * FF_DBG_search_stats() took its snapshot and track the per-search maxima. */
#define FF_DBG_found(x) \
	do { \
		unsigned long iter = (x)->debug.iterations - (x)->debug.itertmp; \
		unsigned long tests = (x)->debug.tests - (x)->debug.teststmp; \
		\
		(x)->debug.iterdelta += iter; \
		(x)->debug.testsdelta += tests; \
		if (iter > (x)->debug.itermax) \
			(x)->debug.itermax = iter; \
		if (tests > (x)->debug.testsmax) \
			(x)->debug.testsmax = tests; \
		(x)->debug.found++; \
	} while (0)
#define FF_DBG_comment(x, str) do { (x)->debug.comment = empty_string_or_(str); } while (0)

/* Update search stats. */
/* Called at the start of each search: counts it, accumulates the key length
 * and snapshots the running test/iteration counters for FF_DBG_found(). */
static void
FF_DBG_search_stats(struct fastfind_info *info, int key_len)
{
	info->debug.searches++;
	info->debug.total_key_len += key_len;
	info->debug.teststmp = info->debug.tests;
	info->debug.itertmp = info->debug.iterations;
}

/* Dump all stats.
*/ static void FF_DBG_dump_stats(struct fastfind_info *info) { fprintf(stderr, "------ FastFind Statistics ------\n"); fprintf(stderr, "Comment : %s\n", info->debug.comment); fprintf(stderr, "Case-aware : %s\n", info->case_aware ? "yes" : "no"); fprintf(stderr, "Compress : %s\n", info->compress ? "yes" : "no"); fprintf(stderr, "Uniq_chars : %s\n", info->uniq_chars); fprintf(stderr, "Uniq_chars #: %d/%d max.\n", info->uniq_chars_count, FF_MAX_CHARS); fprintf(stderr, "Min_key_len : %d\n", info->min_key_len); fprintf(stderr, "Max_key_len : %d\n", info->max_key_len); fprintf(stderr, "Entries : %d/%d max.\n", info->pointers_count, FF_MAX_KEYS); fprintf(stderr, "Leafsets : %d/%d max.\n", info->leafsets_count, FF_MAX_LEAFSETS); if (info->compress) fprintf(stderr, "C. leafsets : %u/%d (%0.2f%%)\n", info->debug.compressed_nodes, info->leafsets_count, 100 * (double) info->debug.compressed_nodes / info->leafsets_count); fprintf(stderr, "Memory usage: %lu bytes (cost per entry = %0.2f bytes)\n", info->debug.memory_usage, (double) info->debug.memory_usage / info->pointers_count); fprintf(stderr, "Struct info : %zu bytes\n", sizeof(*info) - sizeof(info->debug)); fprintf(stderr, "Struct node : %zu bytes\n", sizeof(struct ff_node)); fprintf(stderr, "Struct cnode: %zu bytes\n", sizeof(struct ff_node_c)); fprintf(stderr, "Searches : %lu\n", info->debug.searches); fprintf(stderr, "Found : %lu (%0.2f%%)\n", info->debug.found, 100 * (double) info->debug.found / info->debug.searches); fprintf(stderr, "Iterations : %lu (%0.2f per search, %0.2f before found, %lu max)\n", info->debug.iterations, (double) info->debug.iterations / info->debug.searches, (double) info->debug.iterdelta / info->debug.found, info->debug.itermax); fprintf(stderr, "Tests : %lu (%0.2f per search, %0.2f per iter., %0.2f before found, %lu max)\n", info->debug.tests, (double) info->debug.tests / info->debug.searches, (double) info->debug.tests / info->debug.iterations, (double) info->debug.testsdelta / 
info->debug.found, info->debug.testsmax); fprintf(stderr, "Total keylen: %lu bytes (%0.2f per search, %0.2f per iter.)\n", info->debug.total_key_len, (double) info->debug.total_key_len / info->debug.searches, (double) info->debug.total_key_len / info->debug.iterations); fprintf(stderr, "\n"); } #else /* !DEBUG_FASTFIND */ #define FF_DBG_mem(x, size) #define FF_DBG_test(x) #define FF_DBG_iter(x) #define FF_DBG_cnode(x) #define FF_DBG_found(x) #define FF_DBG_comment(x, comment) #define FF_DBG_search_stats(info, key_len) #define FF_DBG_dump_stats(info) #endif /* DEBUG_FASTFIND */ static struct fastfind_info * init_fastfind(struct fastfind_index *index, enum fastfind_flags flags) { struct fastfind_info *info = mem_calloc(1, sizeof(*info)); index->handle = info; if (!info) return NULL; info->min_key_len = FF_MAX_KEYLEN; info->case_aware = !!(flags & FF_CASE_AWARE); info->compress = !!(flags & FF_COMPRESS); FF_DBG_mem(info, sizeof(*info) - sizeof(info->debug)); FF_DBG_comment(info, index->comment); return info; } /* Return 1 on success, 0 on allocation failure */ static int alloc_ff_data(struct fastfind_info *info) { struct ff_data *data; assert(info->count < FF_MAX_KEYS); if_assert_failed return 0; /* On error, cleanup is done by fastfind_done(). */ data = mem_calloc(info->count, sizeof(*data)); if (!data) return 0; info->data = data; FF_DBG_mem(info, info->count * sizeof(*data)); return 1; } /* Add pointer and its key length to correspondant arrays, incrementing * internal counter. 
*/ static void add_to_ff_data(void *p, int key_len, struct fastfind_info *info) { struct ff_data *data = &info->data[info->pointers_count++]; /* Record new pointer and key len, used in search */ data->pointer = p; data->keylen = key_len; } /* Return 1 on success, 0 on allocation failure */ static int alloc_leafset(struct fastfind_info *info) { struct ff_node **leafsets; struct ff_node *leafset; assert(info->leafsets_count < FF_MAX_LEAFSETS); if_assert_failed return 0; /* info->leafsets[0] is never used since l=0 marks no leaf * in struct ff_node. That's the reason of that + 2. */ leafsets = mem_realloc(info->leafsets, sizeof(*leafsets) * (info->leafsets_count + 2)); if (!leafsets) return 0; info->leafsets = leafsets; leafset = mem_calloc(info->uniq_chars_count, sizeof(*leafset)); if (!leafset) return 0; FF_DBG_mem(info, sizeof(*leafsets)); FF_DBG_mem(info, sizeof(*leafset) * info->uniq_chars_count); info->leafsets_count++; info->leafsets[info->leafsets_count] = leafset; return 1; } static inline int char2idx(unsigned char c, struct fastfind_info *info) { unsigned char *idx = memchr(info->uniq_chars, c, info->uniq_chars_count); if (idx) return (idx - info->uniq_chars); return -1; } static inline void init_idxtab(struct fastfind_info *info) { int i; for (i = 0; i < FF_MAX_CHARS; i++) info->idxtab[i] = char2idx((unsigned char) i, info); } static inline void compress_node(struct ff_node *leafset, struct fastfind_info *info, int i, int pos) { struct ff_node_c *new = mem_alloc(sizeof(*new)); if (!new) return; new->c = 1; new->e = leafset[pos].e; new->p = leafset[pos].p; new->l = leafset[pos].l; new->ch = pos; mem_free_set(&info->leafsets[i], (struct ff_node *) new); FF_DBG_cnode(info); FF_DBG_mem(info, sizeof(*new)); FF_DBG_mem(info, sizeof(*leafset) * -info->uniq_chars_count); } static void compress_tree(struct ff_node *leafset, struct fastfind_info *info) { int cnt = 0; int pos = 0; int i; assert(info); if_assert_failed return; for (i = 0; i < info->uniq_chars_count; 
i++) { if (leafset[i].c) continue; if (leafset[i].l) { /* There's a leaf leafset, descend to it and recurse */ compress_tree(info->leafsets[leafset[i].l], info); } if (leafset[i].l || leafset[i].e) { cnt++; pos = i; } } if (cnt != 1 || leafset[pos].c) return; /* Compress if possible ;) */ for (i = 1; i < info->leafsets_count; i++) { if (info->leafsets[i] == leafset) { compress_node(leafset, info, i, pos); return; } } } #define ifcase(c) (info->case_aware ? (c) : toupper(c)) struct fastfind_index * fastfind_index(struct fastfind_index *index, enum fastfind_flags flags) { struct fastfind_key_value *p; struct fastfind_info *info; assert(index && index->reset && index->next); if_assert_failed goto return_error; info = init_fastfind(index, flags); if (!info) goto return_error; /* First search min, max, count and uniq_chars. */ index->reset(); while ((p = index->next())) { int key_len = strlen(p->key); int i; assert(key_len > 0 && key_len <= FF_MAX_KEYLEN); if_assert_failed goto return_error; if (key_len < info->min_key_len) info->min_key_len = key_len; if (key_len > info->max_key_len) info->max_key_len = key_len; for (i = 0; i < key_len; i++) { /* ifcase() test should be moved outside loops but * remember we call this routine only once per list. * So I go for code readability vs performance here. 
* --Zas */ int k = ifcase(p->key[i]); assert(k < FF_MAX_CHARS); if_assert_failed goto return_error; if (char2idx(k, info) == -1) { assert(info->uniq_chars_count < FF_MAX_CHARS); if_assert_failed goto return_error; info->uniq_chars[info->uniq_chars_count++] = k; } } info->count++; } if (!info->count) return NULL; init_idxtab(info); /* Root leafset allocation */ if (!alloc_leafset(info)) goto return_error; info->root_leafset = info->leafsets[info->leafsets_count]; if (!alloc_ff_data(info)) goto return_error; /* Build the tree */ index->reset(); while ((p = index->next())) { int key_len = strlen(p->key); struct ff_node *leafset = info->root_leafset; int i; #if 0 fprintf(stderr, "K: %s\n", p->key); #endif for (i = 0; i < key_len - 1; i++) { /* Convert char to its index value */ int idx = info->idxtab[ifcase(p->key[i])]; /* leafset[idx] is the desired leaf node's bucket. */ if (leafset[idx].l == 0) { /* There's no leaf yet */ if (!alloc_leafset(info)) goto return_error; leafset[idx].l = info->leafsets_count; } /* Descend to leaf */ leafset = info->leafsets[leafset[idx].l]; } /* Index final leaf */ i = info->idxtab[ifcase(p->key[i])]; leafset[i].e = 1; /* Memorize pointer to data */ leafset[i].p = info->pointers_count; add_to_ff_data(p->data, key_len, info); } if (info->compress) compress_tree(info->root_leafset, info); return index; return_error: fastfind_done(index); return NULL; } #undef ifcase /* This macro searchs for the key in indexed list */ #define FF_SEARCH(what) do { \ int i; \ \ for (i = 0; i < key_len; i++) { \ int lidx, k = what; \ \ FF_DBG_iter(info); \ \ FF_DBG_test(info); \ if (k >= FF_MAX_CHARS) return NULL; \ lidx = info->idxtab[k]; \ \ FF_DBG_test(info); \ if (lidx < 0) return NULL; \ \ FF_DBG_test(info); \ if (current->c) { \ /* It is a compressed leaf. 
*/ \ FF_DBG_test(info); \ if (((struct ff_node_c *) current)->ch != lidx) \ return NULL; \ } else { \ current = ¤t[lidx]; \ } \ \ FF_DBG_test(info); \ if (current->e) { \ struct ff_data *data = &info->data[current->p]; \ \ FF_DBG_test(info); \ if (key_len == data->keylen) { \ FF_DBG_found(info); \ return data->pointer; \ } \ } \ \ FF_DBG_test(info); \ if (!current->l) return NULL; \ current = (struct ff_node *) info->leafsets[current->l]; \ } \ } while (0) void * fastfind_search(struct fastfind_index *index, unsigned char *key, int key_len) { struct ff_node *current; struct fastfind_info *info; assert(index); if_assert_failed return NULL; info = index->handle; assertm(info, "FastFind index %s not initialized", index->comment); if_assert_failed return NULL; FF_DBG_search_stats(info, key_len); FF_DBG_test(info); if (!key) return NULL; FF_DBG_test(info); if (key_len > info->max_key_len) return NULL; FF_DBG_test(info); if (key_len < info->min_key_len) return NULL; current = info->root_leafset; /* Macro and code redundancy are there to obtain maximum * performance. Do not move it to an inlined function. * Do not even think about it. * If you find a better way (same or better performance) then * propose it and be prepared to defend it. 
--Zas */ FF_DBG_test(info); if (info->case_aware) FF_SEARCH(key[i]); else FF_SEARCH(toupper(key[i])); return NULL; } #undef FF_SEARCH void fastfind_done(struct fastfind_index *index) { struct fastfind_info *info; assert(index); if_assert_failed return; info = index->handle; if (!info) return; FF_DBG_dump_stats(info); mem_free_if(info->data); while (info->leafsets_count) { mem_free_if(info->leafsets[info->leafsets_count]); info->leafsets_count--; } mem_free_if(info->leafsets); mem_free(info); index->handle = NULL; } /* EXAMPLE */ #if 0 struct list { unsigned char *tag; int val; }; struct list list[] = { {"A", 1}, {"ABBR", 2}, {"ADDRESS", 3}, {"B", 4}, {"BASE", 5}, {"BASEFONT", 6}, {"BLOCKQUOTE", 7}, {"BODY", 8}, {"BR", 9}, {"BUTTON", 10}, {"CAPTION", 11}, {"CENTER", 12}, {"CODE", 13}, {"DD", 14}, {"DFN", 15}, {"DIR", 16}, {"DIV", 17}, {"DL", 18}, {"DT", 19}, {"EM", 20}, {"FIXED", 21}, {"FONT", 22}, {"FORM", 23}, {"FRAME", 24}, {"FRAMESET", 25}, {"H1", 26}, {"H2", 27}, {"H3", 28}, {"H4", 29}, {"H5", 30}, {"H6", 31}, /* {"HEAD", html_skip, 0, 0}, */ {"HR", 32}, {"I", 33}, {"IFRAME", 34}, {"IMG", 35}, {"INPUT", 36}, {"LI", 37}, {"LINK", 38}, {"LISTING", 39}, {"MENU", 40}, {"NOFRAMES", 41}, {"OL", 42}, {"OPTION", 43}, {"P", 44}, {"PRE", 45}, {"Q", 46}, {"S", 47}, {"SCRIPT", 48}, {"SELECT", 49}, {"SPAN", 50}, {"STRIKE", 51}, {"STRONG", 52}, {"STYLE", 53}, {"SUB", 54}, {"SUP", 55}, {"TABLE", 56}, {"TD", 57}, {"TEXTAREA", 58}, {"TH", 59}, {"TITLE", 60}, {"TR", 61}, {"U", 62}, {"UL", 63}, {"XMP", 64}, {NULL, 0}, /* List terminaison is key = NULL */ }; struct list *internal_pointer; /* Reset internal list pointer */ void reset_list(void) { internal_pointer = list; } /* Returns a pointer to a struct that contains * current key and data pointers and increment * internal pointer. * It returns NULL when key is NULL. 
*/ struct fastfind_key_value * next_in_list(void) { static struct fastfind_key_value kv; if (!internal_pointer->tag) return NULL; kv.key = internal_pointer->tag; kv.data = internal_pointer; internal_pointer++; return &kv; } static struct fastfind_index ff_index = INIT_FASTFIND_INDEX("example", reset_list, next_in_list); int main(int argc, char **argv) { unsigned char *key = argv[1]; struct list *result; if (!key || !*key) { fprintf(stderr, "Usage: fastfind keyword\n"); exit(-1); } fprintf(stderr, "---------- INDEX PHASE -----------\n"); /* Mandatory */ fastfind_index(&ff_index, FF_COMPRESS); fprintf(stderr, "---------- SEARCH PHASE ----------\n"); /* Without this one ... */ result = (struct list *) fastfind_search(&ff_index, key, strlen(key)); if (result) fprintf(stderr, " Found: '%s' -> %d\n", result->tag, result->val); else fprintf(stderr, " Not found: '%s'\n", key); fprintf(stderr, "---------- CLEANUP PHASE ----------\n"); fastfind_done(&ff_index); exit(0); } #endif #endif /* USE_FASTFIND */