1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-09-27 02:56:18 -04:00
elinks/src/viewer/text/search.c

2029 lines
47 KiB
C
Raw Normal View History

2007-07-27 07:13:27 -04:00
/** Searching in the HTML document
* @file */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* XXX: we _WANT_ strcasestr() ! */
#endif
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <ctype.h> /* tolower(), isprint() */
#if defined(CONFIG_UTF8) && defined(HAVE_WCTYPE_H)
#include <wctype.h>
#endif
#include <sys/types.h> /* FreeBSD needs this before regex.h */
#include <stdlib.h>
#include <string.h>
#ifdef CONFIG_TRE
#include <tre/tre.h>
/* Bug 1060: regexp matching uses the TRE library (tre_regwcomp /
 * tre_regwexec) so both the pattern and the document can be recoded
 * losslessly to Unicode regardless of the locale charset. */
#endif
#include "elinks.h"
#include "bfu/dialog.h"
#include "config/kbdbind.h"
#include "document/document.h"
#include "document/view.h"
#include "intl/charsets.h"
#include "intl/gettext/libintl.h"
#include "main/event.h"
#include "main/module.h"
#include "session/session.h"
#include "terminal/screen.h"
#include "terminal/terminal.h"
#include "util/color.h"
#include "util/error.h"
#include "util/memory.h"
#include "util/string.h"
#include "viewer/action.h"
#include "viewer/text/draw.h"
#include "viewer/text/link.h"
#include "viewer/text/search.h"
#include "viewer/text/view.h"
#include "viewer/text/vs.h"
#define SEARCH_HISTORY_FILENAME "searchhist"
static INIT_INPUT_HISTORY(search_history);
#undef UCHAR
#ifdef CONFIG_UTF8
#define UCHAR unicode_val_T
/* CONFIG_UTF8: search nodes hold UTF-32 values; TRE's wide-character
 * entry points are used so regexps match independent of locale. */
#define PATTERN const wchar_t
#define Regcomp tre_regwcomp
#define Regexec tre_regwexec
#else
#define UCHAR unsigned char
/* Unibyte build: search nodes hold bytes in the terminal charset and
 * the narrow TRE entry points are used. */
#define PATTERN const char
#define Regcomp tre_regcomp
#define Regexec tre_regexec
#endif
static UCHAR *memacpy_u(char *text, int textlen, int utf8);
static enum frame_event_status move_search_do(struct session *ses, struct document_view *doc_view, int direction);
/** Record one searchable character @a c at position (@a x, @a y)
 * spanning @a nn cells.  On the counting pass document->search is
 * NULL and only document->nsearch is advanced; on the filling pass
 * the node is stored as well.  Spaces arriving before any real
 * content (nsearch == 0) are discarded. */
static inline void
add_srch_chr(struct document *document, UCHAR c, int x, int y, int nn)
{
	assert(document);
	if_assert_failed return;

	/* Drop leading whitespace before the first real character. */
	if (c == ' ' && !document->nsearch)
		return;

	if (document->search) {
		struct search *slot = &document->search[document->nsearch];

		slot->c = c;
		slot->x = x;
		slot->y = y;
		slot->n = nn;
	}
	document->nsearch++;
}
static void
sort_srch(struct document *document)
{
int i;
int *min, *max;
assert(document);
if_assert_failed return;
document->slines1 = mem_calloc(document->height, sizeof(*document->slines1));
if (!document->slines1) return;
document->slines2 = mem_calloc(document->height, sizeof(*document->slines2));
if (!document->slines2) {
mem_free(document->slines1);
return;
}
min = mem_calloc(document->height, sizeof(*min));
if (!min) {
mem_free(document->slines1);
mem_free(document->slines2);
return;
}
max = mem_calloc(document->height, sizeof(*max));
if (!max) {
mem_free(document->slines1);
mem_free(document->slines2);
mem_free(min);
return;
}
for (i = 0; i < document->height; i++) {
min[i] = INT_MAX;
max[i] = 0;
}
for (i = 0; i < document->nsearch; i++) {
struct search *s = &document->search[i];
int sxn = s->x + s->n;
if (s->x < min[s->y]) {
min[s->y] = s->x;
document->slines1[s->y] = s;
}
if (sxn > max[s->y]) {
max[s->y] = sxn;
document->slines2[s->y] = s;
}
}
mem_free(min);
mem_free(max);
}
/** Walk the formatted document and feed every searchable character to
 * add_srch_chr().  Runs of control/blank characters collapse into a
 * single space node whose n spans the run; each line is terminated by
 * a zero-width space node.
 * @returns the resulting document->nsearch. */
static int
get_srch(struct document *document)
{
	struct node *node;

	assert(document && document->nsearch == 0);
	if_assert_failed return 0;

	foreachback (node, document->nodes) {
		int row;
		int bottom = int_min(node->box.y + node->box.height,
		                     document->height);

		for (row = node->box.y; row < bottom; row++) {
			int col;
			int width = int_min(node->box.x + node->box.width,
			                    document->data[row].length);

			/* Skip leading blanks/control chars on this line. */
			for (col = node->box.x;
			     col < width
			     && document->data[row].chars[col].data <= ' ';
			     col++);

			for (; col < width; col++) {
				UCHAR c = document->data[row].chars[col].data;
				int span = 0;
				int next;

				if (document->data[row].chars[col].attr
				    & SCREEN_ATTR_UNSEARCHABLE)
					continue;
#ifdef CONFIG_UTF8
				/* skip double-width char placeholders */
				if (c == UCS_NO_CHAR)
					continue;
				/* no-break space is searched as a space */
				if (c == 0xA0) {
					add_srch_chr(document, ' ', col, row, 1);
					continue;
				}
#endif
				if (c > ' ') {
					add_srch_chr(document, c, col, row, 1);
					continue;
				}

				/* Collapse this blank run into one node whose
				 * span reaches the next printable char. */
				for (next = col + 1; next < width; next++) {
					if ((unsigned char) document->data[row].chars[next].data < ' ')
						continue;
					span = next - col;
					break;
				}
				add_srch_chr(document, ' ', col, row, span);
				col = next - 1;
			}

			/* Terminate the line with a zero-width space node. */
			add_srch_chr(document, ' ', col, row, 0);
		}
	}

	return document->nsearch;
}
/** Lazily build document->search and the per-line index tables.
 * Does nothing when the data already exists.  Works in two passes of
 * get_srch(): the first merely counts nodes, the second fills the
 * freshly allocated array. */
static void
get_search_data(struct document *document)
{
	int count;

	assert(document);
	if_assert_failed return;

	if (document->search) return;	/* already built */

	/* Counting pass (document->search is still NULL). */
	count = get_srch(document);
	if (!count) return;

	document->nsearch = 0;
	document->search = mem_alloc(count * sizeof(*document->search));
	if (!document->search) return;

	/* Filling pass. */
	get_srch(document);

	/* Trim trailing space nodes. */
	while (document->nsearch
	       && document->search[document->nsearch - 1].c == ' ')
		document->nsearch--;

	sort_srch(document);
}
/** Assign @a s1 and @a s2 the first search node and the last search
 * node needed to form the region starting at line @a y and ending at
 * the greater of @a y + @a height and the end of the document, with
 * allowance at the start to allow for multi-line matches that would
 * otherwise be partially outside of the region.
 *
 * @returns -1 on assertion failure, 1 if @a s1 and @a s2 are not
 * found, and 0 if they are found. */
static int
get_range(struct document *document, int y, int height, int l,
	  struct search **s1, struct search **s2)
{
	int line;

	assert(document && s1 && s2);
	if_assert_failed return -1;

	*s1 = *s2 = NULL;
	int_lower_bound(&y, 0);

	/* Within [y, y + height) pick the search node referencing the
	 * earliest document position (*s1) and the one referencing the
	 * latest (*s2). */
	for (line = y; line < y + height && line < document->height; line++) {
		struct search *first = document->slines1[line];
		struct search *last = document->slines2[line];

		if (first && (!*s1 || first < *s1))
			*s1 = first;
		if (last && (!*s2 || last > *s2))
			*s2 = last;
	}
	if (!*s1 || !*s2)
		return 1;

	/* Back up by l nodes so a multi-line match beginning before the
	 * region but reaching into it can still be found. */
	*s1 -= l;
	if (*s1 < document->search)
		*s1 = document->search;

	/* Cap *s2 so a match of length l stays inside the array. */
	if (*s2 > document->search + document->nsearch - l + 1)
		*s2 = document->search + document->nsearch - l + 1;

	if (*s1 > *s2)
		*s1 = *s2 = NULL;

	if (!*s1 || !*s2)
		return 1;

	return 0;
}
#ifdef CONFIG_TRE
/** Returns a string @c doc that is a copy of the text in the search
 * nodes from @a s1 to (@a s1 + @a doclen - 1) with the space at the
 * end of each line converted to a new-line character (LF). */
static UCHAR *
get_search_region_from_search_nodes(struct search *s1, struct search *s2,
int pattern_len, int *doclen)
{
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
UCHAR *doc;
int i;
/* We must include @a pattern_len in this expression because get_range
* caps the end of the search region, @a s2, to the length of the
* document minus the length of the search pattern. */
*doclen = s2 - s1 + pattern_len;
if (!*doclen) return NULL;
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
doc = mem_alloc((*doclen + 1) * sizeof(UCHAR));
if (!doc) {
*doclen = -1;
return NULL;
}
for (i = 0; i < *doclen; i++) {
if (s1[i].n == 0)
doc[i] = '\n';
else
doc[i] = s1[i].c;
}
doc[*doclen] = 0;
return doc;
}
/** State shared between search_for_pattern() and its @c match
 * callbacks while walking regexp matches in a document region. */
struct regex_match_context {
	struct search *s1;	/**< start of the region; advanced to the
				 * current match while searching */
	struct search *s2;	/**< end of the region */
	int textlen;		/**< on entry the length of the search text;
				 * updated to the length of each match */
	int y1;			/**< first document line to search */
	int y2;			/**< last document line to search */
	int found;		/**< result: 1 if a match was found, 0 if not,
				 * negative on error (-2: pattern did not
				 * compile) */
	UCHAR *pattern;		/**< the regexp source to compile and match */
};
static int
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
init_regex(regex_t *regex, UCHAR *pattern)
{
int regex_flags = REG_NEWLINE;
int reg_err;
if (get_opt_int("document.browse.search.regex", NULL) == 2)
regex_flags |= REG_EXTENDED;
if (!get_opt_bool("document.browse.search.case", NULL))
regex_flags |= REG_ICASE;
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
reg_err = Regcomp(regex, (PATTERN *)pattern, regex_flags);
if (reg_err) {
tre_regfree(regex);
return 0;
}
return 1;
}
static void
search_for_pattern(struct regex_match_context *common_ctx, void *data,
void (*match)(struct regex_match_context *, void *))
{
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
UCHAR *doc;
UCHAR *doctmp;
int doclen;
int regexec_flags = 0;
regex_t regex;
regmatch_t regmatch;
int pos = 0;
struct search *search_start = common_ctx->s1;
unsigned char save_c;
/* TODO: show error message */
/* XXX: This will probably require that reg_err be passed thru
* common_ctx to the caller. */
if (!init_regex(&regex, common_ctx->pattern)) {
#if 0
/* Where and how should we display the error dialog ? */
unsigned char regerror_string[MAX_STR_LEN];
tre_regerror(reg_err, &regex, regerror_string, sizeof(regerror_string));
#endif
common_ctx->found = -2;
return;
}
doc = get_search_region_from_search_nodes(common_ctx->s1, common_ctx->s2, common_ctx->textlen, &doclen);
if (!doc) {
tre_regfree(&regex);
common_ctx->found = doclen;
return;
}
doctmp = doc;
find_next:
while (pos < doclen) {
int y = search_start[pos].y;
if (y >= common_ctx->y1 && y <= common_ctx->y2) break;
pos++;
}
doctmp = &doc[pos];
common_ctx->s1 = &search_start[pos];
while (pos < doclen) {
int y = search_start[pos].y;
if (y < common_ctx->y1 || y > common_ctx->y2) break;
pos++;
}
save_c = doc[pos];
doc[pos] = 0;
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
while (*doctmp && !Regexec(&regex, (PATTERN *)doctmp, 1, &regmatch, regexec_flags)) {
regexec_flags = REG_NOTBOL;
common_ctx->textlen = regmatch.rm_eo - regmatch.rm_so;
if (!common_ctx->textlen) { doc[pos] = save_c; common_ctx->found = 1; goto free_stuff; }
common_ctx->s1 += regmatch.rm_so;
doctmp += regmatch.rm_so;
match(common_ctx, data);
doctmp += int_max(common_ctx->textlen, 1);
common_ctx->s1 += int_max(common_ctx->textlen, 1);
}
doc[pos] = save_c;
if (pos < doclen)
goto find_next;
free_stuff:
tre_regfree(&regex);
mem_free(doc);
}
/** Per-call data handed from is_in_range_regex() to
 * is_in_range_regex_match() through the @c data pointer. */
struct is_in_range_regex_context {
	int y;		/**< top document line of the viewed range */
	int *min;	/**< in/out: tightened with int_upper_bound() toward
			 * the leftmost matched column */
	int *max;	/**< in/out: widened with int_lower_bound() past the
			 * rightmost matched cell */
};
/** search_for_pattern() callback used by is_in_range_regex().
 * Flags that a match ends inside the viewed line range and updates the
 * horizontal bounds in the ::is_in_range_regex_context with the cells
 * the match covers. */
static void
is_in_range_regex_match(struct regex_match_context *common_ctx, void *data)
{
	struct is_in_range_regex_context *ctx = data;
	int end_y = common_ctx->s1[common_ctx->textlen].y;
	int i;

	/* Matches that end above the view or at/after y2 do not count. */
	if (end_y < ctx->y || end_y >= common_ctx->y2)
		return;

	common_ctx->found = 1;

	for (i = 0; i < common_ctx->textlen; i++) {
		int cells = common_ctx->s1[i].n;

		if (!cells) continue;
		int_upper_bound(ctx->min, common_ctx->s1[i].x);
		int_lower_bound(ctx->max, common_ctx->s1[i].x + cells);
	}
}
static int
is_in_range_regex(struct document *document, int y, int height,
char *text, int textlen,
int *min, int *max,
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
struct search *s1, struct search *s2, int utf8)
{
struct regex_match_context common_ctx;
struct is_in_range_regex_context ctx;
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
UCHAR *txt = memacpy_u(text, textlen, utf8);
if (!txt) return -1;
ctx.y = y;
ctx.min = min;
ctx.max = max;
common_ctx.found = 0;
common_ctx.textlen = textlen;
common_ctx.y1 = y - 1;
common_ctx.y2 = y + height;
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
common_ctx.pattern = txt;
common_ctx.s1 = s1;
common_ctx.s2 = s2;
search_for_pattern(&common_ctx, &ctx, is_in_range_regex_match);
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
mem_free(txt);
return common_ctx.found;
}
#endif /* CONFIG_TRE */
/** Copy @a textlen characters of @a text into a freshly allocated,
 * zero-terminated UCHAR buffer.
 *
 * With CONFIG_UTF8 and @a utf8 set, each element is one decoded
 * UTF-32 code point; otherwise bytes are copied one-to-one.
 * Returns NULL on allocation failure; the caller owns the result.
 *
 * NOTE(review): the UTF-8 branch passes text + 7 as the decode limit,
 * i.e. it assumes up to 7 bytes past the cursor are readable — TODO
 * confirm callers guarantee a terminated buffer. */
static UCHAR *
memacpy_u(char *text, int textlen, int utf8)
{
#ifdef CONFIG_UTF8
	int pos;
	UCHAR *copy = mem_alloc((textlen + 1) * sizeof(UCHAR));

	if (!copy) return NULL;

	for (pos = 0; pos < textlen; pos++) {
		if (utf8)
			copy[pos] = utf8_to_unicode(&text, text + 7);
		else
			copy[pos] = text[pos];
	}
	copy[textlen] = 0;
	return copy;
#else
	return memacpy(text, textlen);
#endif
}
/** Length of @a text: in characters (via strlen_utf8()) when
 * CONFIG_UTF8 and @a utf8 are set, in bytes otherwise. */
static int
strlen_u(char *text, int utf8)
{
#ifdef CONFIG_UTF8
	if (utf8) {
		/* strlen_utf8() advances its argument, so hand it a
		 * private cursor. */
		char *cursor = text;

		return strlen_utf8(&cursor);
	}
#endif
	return strlen(text);
}
2007-07-27 07:13:27 -04:00
/** Return an allocated lowercase copy of @a text.
 * A negative @a textlen means "measure it with strlen_u()".
 * Uses towlower() for UTF-8 text when wctype support is compiled in,
 * tolower() otherwise. Returns NULL on allocation failure. */
static UCHAR *
lowered_string(char *text, int textlen, int utf8)
{
	UCHAR *copy;
	int pos;

	if (textlen < 0) textlen = strlen_u(text, utf8);

	copy = memacpy_u(text, textlen, utf8);
	if (copy && textlen) {
		/* Fold positions 0..textlen inclusive; folding the
		 * terminator at copy[textlen] is a harmless no-op. */
		for (pos = 0; pos <= textlen; pos++) {
#if defined(CONFIG_UTF8) && defined(HAVE_WCTYPE_H)
			copy[pos] = utf8 ? towlower(copy[pos]) : tolower(copy[pos]);
#else
			copy[pos] = tolower(copy[pos]);
#endif
		}
	}
	return copy;
}
/** Plain-text scan of the search nodes [s1, s2] for @a text.
 * On a hit whose end falls in rows [y, y + height), widens the
 * [*min, *max) column window over the matched cells.
 * Returns 1 if anything matched in range, 0 if not, -1 on
 * allocation failure. */
static int
is_in_range_plain(struct document *document, int y, int height,
		  char *text, int textlen,
		  int *min, int *max,
		  struct search *s1, struct search *s2, int utf8)
{
	int bottom = y + height;
	int case_sensitive = get_opt_bool("document.browse.search.case", NULL);
	UCHAR *needle;
	int found = 0;

	/* Case-insensitive search compares against a lowercased needle. */
	needle = case_sensitive ? memacpy_u(text, textlen, utf8)
				: lowered_string(text, textlen, utf8);
	if (!needle) return -1;

	/* TODO: This is a great candidate for nice optimizations. Fresh CS
	 * graduates can use their knowledge of ie. KMP (should be quite
	 * trivial, probably a starter; very fast as well) or Turbo-BM (or
	 * maybe some other Boyer-Moore variant, I don't feel that strong in
	 * this area), hmm? >:) --pasky */
#if defined(CONFIG_UTF8) && defined(HAVE_WCTYPE_H)
#define maybe_tolower(c) (case_sensitive ? (c) : utf8 ? towlower(c) : tolower(c))
#else
#define maybe_tolower(c) (case_sensitive ? (c) : tolower(c))
#endif

	for (; s1 <= s2; s1++) {
		int pos = 1;

		if (maybe_tolower(s1->c) != needle[0])
			continue;

		while (pos < textlen && maybe_tolower(s1[pos].c) == needle[pos])
			pos++;
		if (pos < textlen)
			continue;	/* mismatch before the end */

		/* pos == textlen: node one past the match decides whether
		 * the hit counts as in-range. NOTE(review): presumably
		 * get_range() guarantees s1[textlen] is readable — confirm. */
		if (s1[pos].y < y || s1[pos].y >= bottom)
			continue;

		found = 1;

		for (pos = 0; pos < textlen; pos++) {
			if (!s1[pos].n) continue;
			int_upper_bound(min, s1[pos].x);
			int_lower_bound(max, s1[pos].x + s1[pos].n);
		}
	}
#undef maybe_tolower

	mem_free(needle);
	return found;
}
/** Check whether @a text occurs in @a document within rows
 * [y, y + height), dispatching to the regex matcher when the
 * "document.browse.search.regex" option is set (CONFIG_TRE only),
 * and to the plain matcher otherwise.
 *
 * On success *min/*max bound the matched columns. Returns the
 * matcher's result (1 found, 0 not found, -1 on error), 0 when no
 * search nodes cover the region, or -1 on assertion failure.
 *
 * The original text of this function was corrupted by interleaved
 * commit-message noise; the regex call is restored here. */
static int
is_in_range(struct document *document, int y, int height,
	    char *text, int *min, int *max)
{
	struct search *s1, *s2;
	int textlen;
	int utf8 = 0;

	assert(document && text && min && max);
	if_assert_failed return -1;

#ifdef CONFIG_UTF8
	utf8 = document->options.utf8;
#endif
	/* Start with an empty column window; matchers widen it. */
	*min = INT_MAX;
	*max = 0;
	textlen = strlen_u(text, utf8);

	if (get_range(document, y, height, textlen, &s1, &s2))
		return 0;

#ifdef CONFIG_TRE
	if (get_opt_int("document.browse.search.regex", NULL))
		return is_in_range_regex(document, y, height, text, textlen,
					 min, max, s1, s2, utf8);
#endif
	return is_in_range_plain(document, y, height, text, textlen,
				 min, max, s1, s2, utf8);
}
/** Ensure the @a pts array can hold one more struct point at index
 * @a size; wraps mem_align_alloc() (alignment mask 0xFF). Callers
 * treat a zero result as allocation failure and drop the point. */
#define realloc_points(pts, size) \
	mem_align_alloc(pts, size, (size) + 1, 0xFF)
/** Collect screen-visible cells of plain-text matches of the current
 * search word over the nodes [s1, s2].
 * Allocates *pt (document coordinates, owned by the caller) and sets
 * *pl to the number of points; leaves them untouched on allocation
 * failure of the needle. */
static void
get_searched_plain(struct document_view *doc_view, struct point **pt, int *pl,
		   int l, struct search *s1, struct search *s2, int utf8)
{
	int case_sensitive = get_opt_bool("document.browse.search.case", NULL);
	struct point *hits = NULL;
	int count = 0;
	struct el_box *box;
	int dx, dy;
	UCHAR *needle;

	needle = case_sensitive ? memacpy_u(*doc_view->search_word, l, utf8)
				: lowered_string(*doc_view->search_word, l, utf8);
	if (!needle) return;

	/* Document-to-screen translation for visibility tests. */
	box = &doc_view->box;
	dx = box->x - doc_view->vs->x;
	dy = box->y - doc_view->vs->y;

#if defined(CONFIG_UTF8) && defined(HAVE_WCTYPE_H)
#define maybe_tolower(c) (case_sensitive ? (c) : utf8 ? towlower(c) : tolower(c))
#else
#define maybe_tolower(c) (case_sensitive ? (c) : tolower(c))
#endif

	for (; s1 <= s2; s1++) {
		int pos = 1;

		if (maybe_tolower(s1[0].c) != needle[0])
			continue;
		while (pos < l && maybe_tolower(s1[pos].c) == needle[pos])
			pos++;
		if (pos < l)
			continue;	/* not a full match */

		/* Record every on-screen cell of the match; a failed
		 * realloc just drops that cell. */
		for (pos = 0; pos < l; pos++) {
			int cell;
			int screen_y = s1[pos].y + dy;

			if (!row_is_in_box(box, screen_y))
				continue;

			for (cell = 0; cell < s1[pos].n; cell++) {
				int doc_x = s1[pos].x + cell;

				if (!col_is_in_box(box, doc_x + dx))
					continue;
				if (!realloc_points(&hits, count))
					continue;
				hits[count].x = doc_x;
				hits[count].y = s1[pos].y;
				count++;
			}
		}
	}
#undef maybe_tolower
	mem_free(needle);
	*pt = hits;
	*pl = count;
}
/** Collect one point per plain-text match of the current search word
 * over the nodes [s1, s2], anchored at the match's first character —
 * no viewport clipping (used for whole-document results).
 * Allocates *pt and sets *pl; untouched if the needle allocation fails. */
static void
get_searched_plain_all(struct document_view *doc_view, struct point **pt, int *pl,
		       int l, struct search *s1, struct search *s2, int utf8)
{
	int case_sensitive = get_opt_bool("document.browse.search.case", NULL);
	struct point *hits = NULL;
	int count = 0;
	UCHAR *needle;

	needle = case_sensitive ? memacpy_u(*doc_view->search_word, l, utf8)
				: lowered_string(*doc_view->search_word, l, utf8);
	if (!needle) return;

#if defined(CONFIG_UTF8) && defined(HAVE_WCTYPE_H)
#define maybe_tolower(c) (case_sensitive ? (c) : utf8 ? towlower(c) : tolower(c))
#else
#define maybe_tolower(c) (case_sensitive ? (c) : tolower(c))
#endif

	for (; s1 <= s2; s1++) {
		int pos = 1;

		if (maybe_tolower(s1[0].c) != needle[0])
			continue;
		while (pos < l && maybe_tolower(s1[pos].c) == needle[pos])
			pos++;
		if (pos < l)
			continue;	/* not a full match */

		/* A failed realloc silently drops this match. */
		if (!realloc_points(&hits, count))
			continue;
		hits[count].x = s1[0].x;
		hits[count].y = s1[0].y;
		count++;
	}
#undef maybe_tolower
	mem_free(needle);
	*pt = hits;
	*pl = count;
}
#ifdef CONFIG_TRE
/** Accumulator shared between the regex drivers and their per-match
 * callbacks via search_for_pattern()'s data pointer. */
struct get_searched_regex_context {
	int xoffset;		/**< x translation added before visibility tests
				 * (box->x - vs->x; 0 in the whole-document case) */
	int yoffset;		/**< y translation, analogous to ::xoffset */
	struct el_box *box;	/**< view box used by row/col_is_in_box() */
	struct point *points;	/**< collected match points, grown via realloc_points() */
	int len;		/**< number of valid entries in ::points */
};
static void
get_searched_regex_match(struct regex_match_context *common_ctx, void *data)
{
struct get_searched_regex_context *ctx = data;
int i;
for (i = 0; i < common_ctx->textlen; i++) {
int j;
int y = common_ctx->s1[i].y + ctx->yoffset;
if (!row_is_in_box(ctx->box, y))
continue;
for (j = 0; j < common_ctx->s1[i].n; j++) {
int sx = common_ctx->s1[i].x + j;
int x = sx + ctx->xoffset;
if (!col_is_in_box(ctx->box, x))
continue;
if (!realloc_points(&ctx->points, ctx->len))
continue;
ctx->points[ctx->len].x = sx;
ctx->points[ctx->len++].y = common_ctx->s1[i].y;
}
}
}
static void
get_searched_regex_match_all(struct regex_match_context *common_ctx, void *data)
{
struct get_searched_regex_context *ctx = data;
if (!realloc_points(&ctx->points, ctx->len))
return;
ctx->points[ctx->len].x = common_ctx->s1[0].x;
ctx->points[ctx->len++].y = common_ctx->s1[0].y;
}
static void
get_searched_regex(struct document_view *doc_view, struct point **pt, int *pl,
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
int textlen, struct search *s1, struct search *s2, int utf8)
{
struct regex_match_context common_ctx;
struct get_searched_regex_context ctx;
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
UCHAR *txt = memacpy_u(*doc_view->search_word, textlen, utf8);
if (!txt) return;
ctx.points = NULL;
ctx.len = 0;
ctx.box = &doc_view->box;
ctx.xoffset = ctx.box->x - doc_view->vs->x;
ctx.yoffset = ctx.box->y - doc_view->vs->y;
common_ctx.found = 0;
common_ctx.textlen = textlen;
common_ctx.y1 = doc_view->vs->y - 1;
common_ctx.y2 = doc_view->vs->y + ctx.box->height;
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
common_ctx.pattern = txt;
common_ctx.s1 = s1;
common_ctx.s2 = s2;
search_for_pattern(&common_ctx, &ctx, get_searched_regex_match);
Bug 1060: Use libtre for regexp searches. When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp and regwexec from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest on using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2008-12-24 08:48:00 -05:00
mem_free(txt);
*pt = ctx.points;
*pl = ctx.len;
}
/** Collect one point per regex match of the current search word over
 * the nodes [s1, s2], scanning the whole document (no viewport
 * translation; y range covers every line). Allocates *pt and sets
 * *pl; untouched if the pattern copy cannot be allocated. */
static void
get_searched_regex_all(struct document_view *doc_view, struct point **pt, int *pl,
		       int textlen, struct search *s1, struct search *s2, int utf8)
{
	struct regex_match_context common_ctx;
	struct get_searched_regex_context ctx;
	UCHAR *pattern = memacpy_u(*doc_view->search_word, textlen, utf8);

	if (!pattern) return;

	/* Whole-document collection: no document-to-screen offsets. */
	ctx.xoffset = 0;
	ctx.yoffset = 0;
	ctx.box = &doc_view->box;
	ctx.points = NULL;
	ctx.len = 0;

	common_ctx.pattern = pattern;
	common_ctx.s1 = s1;
	common_ctx.s2 = s2;
	common_ctx.found = 0;
	common_ctx.textlen = textlen;
	common_ctx.y1 = -1;
	common_ctx.y2 = doc_view->document->height;

	search_for_pattern(&common_ctx, &ctx, get_searched_regex_match_all);

	mem_free(pattern);
	*pt = ctx.points;
	*pl = ctx.len;
}
#endif /* CONFIG_TRE */
/** Collect the match positions of the current search word that fall
 * inside the visible part of @a doc_view.  *@a pt / *@a pl receive the
 * point array and its length (NULL/0 when nothing is in range).
 *
 * Fix: non-code text (scraped commit-message lines) had been interleaved
 * between the regex-option test and the get_searched_regex() call,
 * breaking compilation; the corrupted lines are removed. */
static void
get_searched(struct document_view *doc_view, struct point **pt, int *pl, int utf8)
{
	struct search *s1, *s2;
	int l;

	assert(doc_view && doc_view->vs && pt && pl);
	if_assert_failed return;

	if (!has_search_word(doc_view))
		return;

	get_search_data(doc_view->document);
	l = strlen_u(*doc_view->search_word, utf8);
	/* Restrict the search node range to the rows currently on screen. */
	if (get_range(doc_view->document, doc_view->vs->y,
		      doc_view->box.height, l, &s1, &s2)) {
		*pt = NULL;
		*pl = 0;
		return;
	}
#ifdef CONFIG_TRE
	if (get_opt_int("document.browse.search.regex", NULL))
		get_searched_regex(doc_view, pt, pl, l, s1, s2, utf8);
	else
#endif
		get_searched_plain(doc_view, pt, pl, l, s1, s2, utf8);
}
2007-07-27 07:13:27 -04:00
/** Highlighting of searched strings: repaint every matched cell that is
 * visible in @a doc_view with the "searched" BFU color. */
void
draw_searched(struct terminal *term, struct document_view *doc_view)
{
	struct point *points = NULL;
	int npoints = 0;
	int utf8 = 0;

	assert(term && doc_view);
	if_assert_failed return;

	if (!has_search_word(doc_view))
		return;

#ifdef CONFIG_UTF8
	utf8 = doc_view->document->options.utf8;
#endif
	get_searched(doc_view, &points, &npoints, utf8);

	if (npoints) {
		struct color_pair *color = get_bfu_color(term, "searched");
		/* Document coordinates -> terminal coordinates. */
		int dx = doc_view->box.x - doc_view->vs->x;
		int dy = doc_view->box.y - doc_view->vs->y;
		int i;

		for (i = 0; i < npoints; i++) {
			/* TODO: We should take in account original colors and
			 * combine them with defined color. */
			draw_char_color(term, points[i].x + dx,
					points[i].y + dy, color);
		}
	}

	mem_free_if(points);
}
/** Outcome codes for the document search functions. */
enum find_error {
	FIND_ERROR_NONE,		/**< Search succeeded. */
	FIND_ERROR_NO_PREVIOUS_SEARCH,	/**< No search word and no history to reuse. */
	FIND_ERROR_HIT_TOP,		/**< Search wrapped past the top of the document. */
	FIND_ERROR_HIT_BOTTOM,		/**< Search wrapped past the bottom of the document. */
	FIND_ERROR_NOT_FOUND,		/**< The search string does not occur. */
	FIND_ERROR_MEMORY,		/**< Out of memory. */
	FIND_ERROR_REGEX,		/**< The regular expression could not be compiled. */
};
static enum find_error find_next_do(struct session *ses,
struct document_view *doc_view,
int direction);
static void print_find_error(struct session *ses, enum find_error find_error);
/** Collect every match of the session's search word in the whole
 * document into *@a pt / *@a pl and scroll to the first one via
 * move_search_do().  Falls back to ses->last_search_word when no
 * current search word is set.
 *
 * Fix: a scraped date line had been interleaved after the
 * get_searched_plain_all() call, breaking compilation; it is removed. */
static enum find_error
get_searched_all(struct session *ses, struct document_view *doc_view, struct point **pt, int *pl, int utf8)
{
	struct search *s1, *s2;
	int l;

	assert(ses && doc_view && doc_view->vs && pt && pl);
	if_assert_failed return FIND_ERROR_MEMORY;

	if (!ses->search_word) {
		if (!ses->last_search_word) {
			return FIND_ERROR_NO_PREVIOUS_SEARCH;
		}
		ses->search_word = stracpy(ses->last_search_word);
		if (!ses->search_word) return FIND_ERROR_MEMORY;
	}

	get_search_data(doc_view->document);
	l = strlen_u(*doc_view->search_word, utf8);
	/* Range over the entire document height, not just the view. */
	if (get_range(doc_view->document, 0,
		      doc_view->document->height, l, &s1, &s2)) {
		*pt = NULL;
		*pl = 0;
		return FIND_ERROR_NOT_FOUND;
	}
#ifdef CONFIG_TRE
	if (get_opt_int("document.browse.search.regex", NULL))
		get_searched_regex_all(doc_view, pt, pl, l, s1, s2, utf8);
	else
#endif
		get_searched_plain_all(doc_view, pt, pl, l, s1, s2, utf8);

	if (*pt == NULL)
		return FIND_ERROR_NOT_FOUND;

	/* Jump to the first match (direction 0 keeps the current number). */
	move_search_do(ses, doc_view, 0);
	return FIND_ERROR_NONE;
}
/** Start a fresh search for @a str in the current frame's document.
 * @param direction	1 for a forward ('/') search, -1 for backward ('?').
 * @param report_errors	non-zero to pop up a "not found" report.
 * @return a value from enum find_error describing the outcome.
 *
 * Fix: a scraped date line embedded among the local declarations broke
 * compilation; it is removed (comment typo "don.t" also corrected). */
static enum find_error
search_for_do(struct session *ses, char *str, int direction,
	      int report_errors)
{
	struct document_view *doc_view;
	int utf8 = 0;
	enum find_error error;

	assert(ses && str);
	if_assert_failed return FIND_ERROR_NOT_FOUND;

	doc_view = current_frame(ses);

	assert(doc_view);
	if_assert_failed return FIND_ERROR_NOT_FOUND;

#ifdef CONFIG_UTF8
	utf8 = doc_view->document->options.utf8;
#endif
	/* Drop all state left over from any previous search. */
	mem_free_set(&ses->search_word, NULL);
	mem_free_set(&ses->last_search_word, NULL);
	mem_free_set(&doc_view->document->search_points, NULL);
	doc_view->document->number_of_search_points = 0;
	doc_view->vs->current_search_number = -1;

	if (!*str) return FIND_ERROR_NOT_FOUND;

	/* We only set the last search word because we don't want find_next()
	 * to try to find next link in search before the search data has been
	 * initialized. find_next() will set ses->search_word for us. */
	ses->last_search_word = stracpy(str);
	if (!ses->last_search_word) return FIND_ERROR_NOT_FOUND;

	ses->search_direction = direction;

	error = get_searched_all(ses, doc_view, &doc_view->document->search_points,
				 &doc_view->document->number_of_search_points, utf8);

	if (report_errors && error == FIND_ERROR_NOT_FOUND)
		print_find_error(ses, error);

	return error;
}
/** Start a new backward search for @a str, reporting errors to the user.
 * Thin wrapper around search_for_do() with direction -1. */
static void
search_for_back(struct session *ses, char *str)
{
	assert(ses && str);
	if_assert_failed return;
	search_for_do(ses, str, -1, 1);
}
2020-04-25 07:59:40 -04:00
/** Start a new forward search for @a str, reporting errors to the user.
 * Thin wrapper around search_for_do() with direction 1. */
void
search_for(struct session *ses, char *str)
{
	assert(ses && str);
	if_assert_failed return;
	search_for_do(ses, str, 1, 1);
}
/** Do the point sets @a p1 (length @a l1) and @a p2 (length @a l2)
 * share at least one (x, y) position?  Uses a static hash table as a
 * fast pre-filter; the table is always cleared again before return. */
static inline int
point_intersect(struct point *p1, int l1, struct point *p2, int l2)
{
#define HASH_SIZE 4096
#define HASH(p) ((((p).y << 6) + (p).x) & (HASH_SIZE - 1))
	/* Static storage duration means @hash starts out all-zero. */
	static char hash[HASH_SIZE];
	int found = 0;
	int a, b;

	assert(p2);
	if_assert_failed return 0;

	/* Mark the buckets occupied by the first set. */
	for (a = 0; a < l1; a++) hash[HASH(p1[a])] = 1;

	for (b = 0; !found && b < l2; b++) {
		int a2;

		if (!hash[HASH(p2[b])]) continue;

		/* Bucket hit: confirm with an exact comparison. */
		for (a2 = 0; a2 < l1; a2++) {
			if (p1[a2].x == p2[b].x && p1[a2].y == p2[b].y) {
				found = 1;
				break;
			}
		}
	}

	/* Leave the shared table clean for the next caller. */
	for (a = 0; a < l1; a++) hash[HASH(p1[a])] = 0;

	return found;
#undef HASH
#undef HASH_SIZE
}
/** Advance the current link in @a direction until one intersecting the
 * current search matches is found.
 * @param direction	-1/1 step link by link; -2/2 first move a whole
 *			page, then scan from the new current link.
 * @return 0 if a matching link was selected, 1 otherwise (the view is
 * then moved a page in the search direction). */
static int
find_next_link_in_search(struct document_view *doc_view, int direction)
{
	int utf8 = 0;
	struct point *pt = NULL;
	struct link *link;
	int len;
#ifdef CONFIG_UTF8
	utf8 = doc_view->document->options.utf8;
#endif
	assert(doc_view && doc_view->vs);
	if_assert_failed return 0;
	if (direction == -2 || direction == 2) {
		/* Page-step variant: move a page, then test the link the
		 * page movement selected (note the jump into the loop). */
		direction /= 2;
		if (direction < 0)
			find_link_page_up(doc_view);
		else
			find_link_page_down(doc_view);
		if (doc_view->vs->current_link == -1) return 1;
		goto nt;
	}
	while (doc_view->vs->current_link != -1
	       && next_link_in_view(doc_view, doc_view->vs->current_link + direction,
	                            direction)) {
nt:
		link = &doc_view->document->links[doc_view->vs->current_link];
		/* Points of the on-screen search matches. */
		get_searched(doc_view, &pt, &len, utf8);
		if (point_intersect(pt, len, link->points, link->npoints)) {
			mem_free(pt);
			return 0;
		}
		mem_free_if(pt);
	}
	/* No matching link in view: scroll one page onward. */
	if (direction < 0)
		find_link_page_up(doc_view);
	else
		find_link_page_down(doc_view);
	return 1;
}
/** Move to the next (or previous) screenful containing the search word
 * and scroll the view there, wrapping around the document edges.
 * @param direction	-1 or 1; combined with ses->search_direction.
 * @return a value from enum find_error describing the outcome. */
static enum find_error
find_next_do(struct session *ses, struct document_view *doc_view, int direction)
{
	int p, min, max, c = 0;
	int step, hit_bottom = 0, hit_top = 0;
	int height;
	assert(ses && ses->tab && ses->tab->term && doc_view && doc_view->vs
	       && direction);
	if_assert_failed return FIND_ERROR_NONE;
	/* '/' vs. '?' searches invert the meaning of @direction. */
	direction *= ses->search_direction;
	p = doc_view->vs->y;
	height = doc_view->box.height;
	step = direction * height;
	if (ses->search_word) {
		/* First try to move among matching links on this screen. */
		if (!find_next_link_in_search(doc_view, direction))
			return FIND_ERROR_NONE;
		p += step;
	}
	if (!ses->search_word) {
		if (!ses->last_search_word) {
			return FIND_ERROR_NO_PREVIOUS_SEARCH;
		}
		ses->search_word = stracpy(ses->last_search_word);
		if (!ses->search_word) return FIND_ERROR_NONE;
	}
	get_search_data(doc_view->document);
	/* Scan the document one screenful at a time. */
	do {
		int in_range = is_in_range(doc_view->document, p, height,
		                           ses->search_word, &min, &max);
		if (in_range == -1) return FIND_ERROR_MEMORY;
		if (in_range == -2) return FIND_ERROR_REGEX;
		if (in_range) {
			doc_view->vs->y = p;
			/* Scroll horizontally so the match column is shown. */
			if (max >= min)
				doc_view->vs->x = int_min(int_max(doc_view->vs->x,
				                                  max - doc_view->box.width),
				                          min);
			set_link(doc_view);
			find_next_link_in_search(doc_view, direction * 2);
			if (hit_top)
				return FIND_ERROR_HIT_TOP;
			if (hit_bottom)
				return FIND_ERROR_HIT_BOTTOM;
			return FIND_ERROR_NONE;
		}
		p += step;
		/* Wrap past the bottom: continue from the top. */
		if (p > doc_view->document->height) {
			hit_bottom = 1;
			p = 0;
		}
		/* Wrap past the top: continue from the last whole screen. */
		if (p < 0) {
			hit_top = 1;
			p = 0;
			while (p < doc_view->document->height) p += height;
			p -= height;
		}
		c += height;
	} while (c < doc_view->document->height + height);
	return FIND_ERROR_NOT_FOUND;
}
/** Report a failed search according to the
 * document.browse.search.show_not_found option:
 * 2 pops up a message box, 1 just beeps, anything else stays silent. */
static void
print_find_error_not_found(struct session *ses, char *title,
			   char *message, char *search_string)
{
	int show = get_opt_int("document.browse.search.show_not_found", NULL);

	if (show == 2) {
		info_box(ses->tab->term, MSGBOX_FREE_TEXT,
			 title, ALIGN_CENTER,
			 msg_text(ses->tab->term, message,
				  search_string));
	} else if (show == 1) {
		beep_terminal(ses->tab->term);
	}
}
/** Report the outcome of a search operation to the user, honouring the
 * relevant document.browse.search.* options. */
static void
print_find_error(struct session *ses, enum find_error find_error)
{
	int hit_top = 0;
	char *message = NULL;
	switch (find_error) {
	case FIND_ERROR_HIT_TOP:
		hit_top = 1;
		/* Fall through: top and bottom share the wrap message. */
	case FIND_ERROR_HIT_BOTTOM:
		if (!get_opt_bool("document.browse.search"
		                  ".show_hit_top_bottom", NULL))
			break;
		message = hit_top
			   ? N_("Search hit top, continuing at bottom.")
			   : N_("Search hit bottom, continuing at top.");
		break;
	case FIND_ERROR_NO_PREVIOUS_SEARCH:
		message = N_("No previous search");
		break;
	case FIND_ERROR_NOT_FOUND:
		print_find_error_not_found(ses, N_("Search"),
		                           N_("Search string"
		                              " '%s' not found"),
		                           ses->search_word);
		break;
	case FIND_ERROR_REGEX:
		print_find_error_not_found(ses, N_("Search"),
		                           N_("Could not compile"
		                              " regular expression"
		                              " '%s'"),
		                           ses->search_word);
		break;
	case FIND_ERROR_MEMORY:
		/* Why bother trying to create a msg_box?
		 * We probably don't have the memory... */
		/* Fall through. */
	case FIND_ERROR_NONE:
		break;
	}
	if (!message) return;
	info_box(ses->tab->term, 0, N_("Search"), ALIGN_CENTER, message);
}
static enum find_error move_search_number(struct session *ses, struct document_view *doc_view, int number);
/** Is document row @a y currently visible in @a doc_view? */
static int
is_y_on_screen(struct document_view *doc_view, int y)
{
	int top = doc_view->vs->y;

	return y >= top && y - top < doc_view->box.height;
}
static void
find_first_search_in_view(struct session *ses, struct document_view *doc_view)
{
int i;
int current_search_number = doc_view->vs->current_search_number;
if (current_search_number >= 0 && current_search_number < doc_view->document->number_of_search_points) {
struct point *point = doc_view->document->search_points + current_search_number;
if (is_y_on_screen(doc_view, point[0].y))
return;
}
for (i = 0; i < doc_view->document->number_of_search_points; ++i) {
int y = doc_view->document->search_points[i].y;
if (y >= doc_view->vs->y)
break;
}
doc_view->vs->current_search_number = i;
}
static enum frame_event_status
move_search_do(struct session *ses, struct document_view *doc_view, int direction)
{
if (!doc_view->document->number_of_search_points) {
#ifdef CONFIG_UTF8
int utf8 = doc_view->document->options.utf8;
#else
int utf8 = 0;
#endif
doc_view->vs->current_search_number = -1;
enum find_error error = get_searched_all(ses, doc_view, &doc_view->document->search_points,
&doc_view->document->number_of_search_points, utf8);
if (error == FIND_ERROR_NOT_FOUND) {
return FRAME_EVENT_OK;
}
}
int number;
find_first_search_in_view(ses, doc_view);
number = doc_view->vs->current_search_number + direction;
print_find_error(ses, move_search_number(ses, doc_view, number));
return FRAME_EVENT_REFRESH;
}
/** Jump to the next recorded search match. */
enum frame_event_status
move_search_next(struct session *ses, struct document_view *doc_view)
{
	return move_search_do(ses, doc_view, 1);
}
/** Jump to the previous recorded search match. */
enum frame_event_status
move_search_prev(struct session *ses, struct document_view *doc_view)
{
	return move_search_do(ses, doc_view, -1);
}
static enum find_error
move_search_number(struct session *ses, struct document_view *doc_view, int number)
{
struct point *pt;
int x, y, step;
2018-04-15 10:45:23 -04:00
enum find_error ret = FIND_ERROR_NONE;
if (doc_view->document->number_of_search_points == 0) {
return FIND_ERROR_NO_PREVIOUS_SEARCH;
}
else if (number < 0) {
2018-04-15 10:45:23 -04:00
ret = FIND_ERROR_HIT_TOP;
if (!get_opt_bool("document.browse.search.wraparound", NULL)) return ret;
number = doc_view->document->number_of_search_points - 1;
}
else if (number >= doc_view->document->number_of_search_points) {
ret = FIND_ERROR_HIT_BOTTOM;
if (!get_opt_bool("document.browse.search.wraparound", NULL)) return ret;
number = 0;
}
doc_view->vs->current_search_number = number;
pt = doc_view->document->search_points;
2021-02-28 10:44:17 -05:00
if (!pt) {
return ret;
}
x = pt[number].x;
y = pt[number].y;
if (!col_is_in_box(&doc_view->box, x)) {
horizontal_scroll_extended(ses, doc_view, x - doc_view->vs->x, 0);
}
step = y - doc_view->vs->y - get_opt_int("document.browse.scrolling.vertical_overlap", ses);
vertical_scroll(ses, doc_view, step);
2018-04-15 10:45:23 -04:00
return ret;
}
/** Find the next match in @a direction and report any error to the user. */
enum frame_event_status
find_next(struct session *ses, struct document_view *doc_view, int direction)
{
	print_find_error(ses, find_next_do(ses, doc_view, direction));
	/* FIXME: Make this more fine-grained */
	return FRAME_EVENT_REFRESH;
}
2007-07-27 07:13:27 -04:00
/** @name Link typeahead
 * @{ */
/** Result codes of do_typeahead(). */
enum typeahead_code {
	TYPEAHEAD_MATCHED,		/**< A matching link was selected. */
	TYPEAHEAD_ERROR,		/**< No link matches the typed text. */
	TYPEAHEAD_ERROR_NO_FURTHER,	/**< A match exists, but no further one in this direction. */
	TYPEAHEAD_CANCEL,		/**< Typeahead mode should be left. */
};
/** Tell the user a typeahead search failed; @a no_further selects the
 * "no further matches" wording over the "could not find" one. */
static void
typeahead_error(struct session *ses, char *typeahead, int no_further)
{
	char *message = no_further
		? N_("No further matches for '%s'.")
		: N_("Could not find a link with the text '%s'.");

	print_find_error_not_found(ses, N_("Typeahead"), message, typeahead);
}
/** Text to match typeahead input against for @a link: the link name if
 * any, else its target URI, else its image URI, else "". */
static inline char *
get_link_typeahead_text(struct link *link)
{
	char *text = get_link_name(link);

	if (!text) text = link->where;
	if (!text) text = link->where_img;
	if (!text) text = "";

	return text;
}
/** Does @a text (of length @a textlen) occur in @a link's typeahead
 * text?  Form links never match.
 * @return the offset of the first occurrence, or -1 on no match.
 *
 * Fixes: a scraped date line embedded before the strstr() call broke
 * compilation (removed), and the int/size_t comparison with strlen()
 * is now done with an explicit cast (@a textlen is a string length,
 * hence non-negative in all callers). */
static int
match_link_text(struct link *link, char *text, int textlen,
		int case_sensitive)
{
	char *match = get_link_typeahead_text(link);
	char *matchpos;

	if (link_is_form(link) || (size_t) textlen > strlen(match))
		return -1;

	matchpos = case_sensitive ? strstr((const char *)match, (const char *)text)
				  : strcasestr((const char *)match, (const char *)text);

	if (matchpos) {
		return matchpos - match;
	}

	return -1;
}
/* Searches the @document for a link with the given @text. Takes the
 * current_link in the view, the link to start searching from @i and the
 * direction to search (1 is forward, -1 is back).  On success returns
 * the index of the matching link and stores the in-text match position
 * in *@offset; returns -1 when nothing matches. */
static inline int
search_link_text(struct document *document, int current_link, int i,
		 char *text, int direction, int *offset)
{
	int upper_link, lower_link;
	int case_sensitive = get_opt_bool("document.browse.search.case", NULL);
	int wraparound = get_opt_bool("document.browse.search.wraparound",
	                              NULL);
	int textlen = strlen(text);
	assert(textlen && direction && offset);
	/* The link interval in which we are currently searching */
	/* Set up the range of links that should be searched in the first
	 * attempt; (lower_link, upper_link) is an open interval. */
	if (direction > 0) {
		upper_link = document->nlinks;
		lower_link = i - 1;
	} else {
		upper_link = i + 1;
		lower_link = -1;
	}
	for (; i > lower_link && i < upper_link; i += direction) {
		struct link *link = &document->links[i];
		int match_offset = match_link_text(link, text, textlen,
		                                   case_sensitive);
		if (match_offset >= 0) {
			*offset = match_offset;
			return i;
		}
		if (!wraparound) continue;
		/* Check if we are at the end of the first range.
		 * Only wrap around one time. Initialize @i with
		 * {+= direction} in mind. */
		if (direction > 0) {
			if (i == upper_link - 1) {
				upper_link = current_link + 1;
				lower_link = -1;
				i = lower_link;
				wraparound = 0;
			}
		} else {
			if (i == lower_link + 1) {
				upper_link = document->nlinks;
				lower_link = current_link - 1;
				i = upper_link;
				wraparound = 0;
			}
		}
	}
	return -1;
}
/** Scroll so the matched link stays visible above the typeahead
 * input line. */
static inline void
fixup_typeahead_match(struct session *ses, struct document_view *doc_view)
{
	/* We adjust the box_size to account for the typeahead input line
	 * (we don't want the input line to cover the current link). */
	doc_view->box.height -= 1;
	check_vs(doc_view);
	doc_view->box.height += 1;
}
/** The character stored at document position (@a x, @a y), or 0 when
 * the position lies outside the formatted data. */
static inline UCHAR
get_document_char(struct document *document, int x, int y)
{
	if (y >= document->height) return 0;
	if (x >= document->data[y].length) return 0;

	return document->data[y].chars[x].data;
}
/** Highlight the part of the current link's on-screen text that matched
 * the typeahead input.
 * @param chars		number of matched characters to highlight.
 * @param offset	index into the link text where the match begins. */
static void
draw_typeahead_match(struct terminal *term, struct document_view *doc_view,
		     int chars, int offset)
{
	struct color_pair *color = get_bfu_color(term, "searched");
	int xoffset = doc_view->box.x - doc_view->vs->x;
	int yoffset = doc_view->box.y - doc_view->vs->y;
	struct link *link = get_current_link(doc_view);
	char *text = get_link_typeahead_text(link);
	int end = offset + chars;
	int i, j;
	/* @i walks link->points, @j walks the link text. */
	for (i = 0, j = 0; text[j] && i < end; i++, j++) {
		int x = link->points[i].x;
		int y = link->points[i].y;
		UCHAR data = get_document_char(doc_view->document, x, y);
		/* Text wrapping might remove space chars from the link
		 * position array so try to align the matched typeahead text
		 * with what is actually on the screen by shifting the link
		 * position variables if the canvas data do not match. */
		if (data != text[j]) {
			i--;
			end--;
			offset--;
		} else if (i >= offset) {
			/* TODO: We should take in account original colors and
			 * combine them with defined color. */
			draw_char_color(term, xoffset + x, yoffset + y, color);
		}
	}
}
/** Advance the typeahead link search.
 * @a action_id selects the movement (previous/up, next/down, enter to
 * follow the current link, anything else searches forward from the
 * current link).  On a match vs->current_link is updated and *@a offset
 * receives where in the link text the match begins. */
static enum typeahead_code
do_typeahead(struct session *ses, struct document_view *doc_view,
	     char *text, int action_id, int *offset)
{
	int current = int_max(doc_view->vs->current_link, 0);
	int direction, match, i = current;
	struct document *document = doc_view->document;
	switch (action_id) {
	case ACT_EDIT_PREVIOUS_ITEM:
	case ACT_EDIT_UP:
		direction = -1;
		i--;
		break;
	case ACT_EDIT_NEXT_ITEM:
	case ACT_EDIT_DOWN:
		direction = 1;
		i++;
		break;
	case ACT_EDIT_ENTER:
		goto_current_link(ses, doc_view, 0);
		return TYPEAHEAD_CANCEL;
	default:
		direction = 1;
	}
	if (i < 0 || i >= doc_view->document->nlinks) {
		/* Stepped outside the link array.  Without wraparound we
		 * can only report whether the current link still matches. */
		if (!get_opt_bool("document.browse.search.wraparound", NULL)) {
			if (match_link_text(&document->links[current],
			                    text, strlen(text),
			                    get_opt_bool("document.browse"
			                                 ".search.case", NULL))
			    >= 0) {
				return TYPEAHEAD_ERROR_NO_FURTHER;
			}
			return TYPEAHEAD_ERROR;
		}
		i = direction > 0 ? 0 : doc_view->document->nlinks - 1;
	}
	match = search_link_text(document, current, i, text, direction, offset);
	if (match == current && i != current)
		return TYPEAHEAD_ERROR_NO_FURTHER;
	if (match < 0) {
		if (i != current)
			return TYPEAHEAD_ERROR_NO_FURTHER;
		return TYPEAHEAD_ERROR;
	}
	assert(match >= 0 && match < doc_view->document->nlinks);
	doc_view->vs->current_link = match;
	return TYPEAHEAD_MATCHED;
}
2007-07-27 07:13:27 -04:00
/** @} */
2007-07-27 07:13:27 -04:00
/** @name Typeahead
* @{ */
/** @a action_id can be a value from enum edit_action, in which case the
 * appropriate action is performed; -1, which indicates to search and report any
 * errors; or -2, which indicates to search without reporting any errors. */
static enum input_line_code
text_typeahead_handler(struct input_line *line, int action_id)
{
	struct session *ses = line->ses;
	char *buffer = line->buffer;
	struct document_view *doc_view = current_frame(ses);
	/* line->data holds the prompt: '/' is forward, '?' backward. */
	int direction = ((char *) line->data)[0] == '/' ? 1 : -1;
	int report_errors = action_id == -1;
	enum find_error error;
	assertm(doc_view != NULL, "document not formatted");
	if_assert_failed return INPUT_LINE_CANCEL;
	switch (action_id) {
	case ACT_EDIT_REDRAW:
		return INPUT_LINE_PROCEED;
	case ACT_EDIT_ENTER:
		if (!*buffer) {
			/* This ensures that search-typeahead-text
			 * followed immediately with enter
			 * clears the last search. */
			search_for_do(ses, buffer, direction, 0);
		}
		return INPUT_LINE_CANCEL;
	case ACT_EDIT_PREVIOUS_ITEM:
		find_next(ses, doc_view, -1);
		break;
	case ACT_EDIT_NEXT_ITEM:
		find_next(ses, doc_view, 1);
		break;
	case ACT_EDIT_SEARCH_TOGGLE_REGEX: {
		/* Cycle the regex option through its allowed values. */
		struct option *opt =
			get_opt_rec(config_options,
			            "document.browse.search.regex");
		if (opt) {
			opt->value.number = (opt->value.number + 1)
			                    % (opt->max + 1);
			option_changed(ses, opt);
		}
	}
	/* Fall thru */
	default:
		error = search_for_do(ses, buffer, direction, 0);
		if (error == FIND_ERROR_REGEX)
			break;
		if (report_errors)
			print_find_error(ses, error);
		/* We need to check |*buffer| here because
		 * the input-line code will call this handler
		 * even after it handles a back-space press. */
		if (error != FIND_ERROR_HIT_TOP
		    && error != FIND_ERROR_HIT_BOTTOM
		    && error != FIND_ERROR_NONE && *buffer)
			return INPUT_LINE_REWIND;
	}
	draw_formatted(ses, 0);
	return INPUT_LINE_PROCEED;
}
static enum input_line_code
link_typeahead_handler(struct input_line *line, int action_id)
{
struct session *ses = line->ses;
char *buffer = line->buffer;
struct document_view *doc_view = current_frame(ses);
int offset = 0;
assertm(doc_view != NULL, "document not formatted");
if_assert_failed return INPUT_LINE_CANCEL;
/* If there is nothing to match with don't start searching */
if (!*buffer) {
/* If something already were typed we need to redraw
* in order to remove the coloring of the link text. */
if (line->data) draw_formatted(ses, 0);
return INPUT_LINE_PROCEED;
}
if (action_id == ACT_EDIT_REDRAW) {
int current = doc_view->vs->current_link;
int offset, bufferlen;
if (current < 0) return INPUT_LINE_PROCEED;
bufferlen = strlen(buffer);
offset = match_link_text(&doc_view->document->links[current],
buffer, bufferlen,
get_opt_bool("document.browse"
".search.case", NULL));
if (offset >= 0) {
draw_typeahead_match(ses->tab->term, doc_view,
bufferlen, offset);
}
return INPUT_LINE_PROCEED;
}
/* Hack time .. should we change mode? */
if (!line->data) {
enum main_action action_id = ACT_MAIN_NONE;
switch (*buffer) {
case '#':
action_id = ACT_MAIN_SEARCH_TYPEAHEAD_LINK;
break;
case '?':
action_id = ACT_MAIN_SEARCH_TYPEAHEAD_TEXT_BACK;
break;
case '/':
action_id = ACT_MAIN_SEARCH_TYPEAHEAD_TEXT;
break;
default:
break;
}
/* Should we reboot the input line .. (inefficient but easy) */
if (action_id != ACT_MAIN_NONE) {
search_typeahead(ses, doc_view, action_id);
return INPUT_LINE_CANCEL;
}
line->data = "#";
}
switch (do_typeahead(ses, doc_view, buffer, action_id, &offset)) {
case TYPEAHEAD_MATCHED:
fixup_typeahead_match(ses, doc_view);
draw_formatted(ses, 0);
draw_typeahead_match(ses->tab->term, doc_view, strlen(buffer), offset);
return INPUT_LINE_PROCEED;
case TYPEAHEAD_ERROR_NO_FURTHER:
typeahead_error(ses, buffer, 1);
draw_typeahead_match(ses->tab->term, doc_view, strlen(buffer), offset);
return INPUT_LINE_PROCEED;
case TYPEAHEAD_ERROR:
typeahead_error(ses, buffer, 0);
return INPUT_LINE_REWIND;
case TYPEAHEAD_CANCEL:
default:
return INPUT_LINE_CANCEL;
}
}
/** Enter typeahead mode: choose prompt, initial mode data and input
 * line handler based on @a action_id, then open the input line. */
enum frame_event_status
search_typeahead(struct session *ses, struct document_view *doc_view,
		 action_id_T action_id)
{
	char *prompt = "#";
	char *data = NULL;
	input_line_handler_T handler = text_typeahead_handler;
	struct input_history *history = &search_history;
	switch (action_id) {
	case ACT_MAIN_SEARCH_TYPEAHEAD_TEXT:
		prompt = data = "/";
		break;
	case ACT_MAIN_SEARCH_TYPEAHEAD_TEXT_BACK:
		prompt = data = "?";
		break;
	case ACT_MAIN_SEARCH_TYPEAHEAD_LINK:
		data = "#";
		/* Falling forward .. good punk rock */
		/* Fall through. */
	case ACT_MAIN_SEARCH_TYPEAHEAD:
	default:
		/* Link typeahead needs links to match against. */
		if (doc_view->document->nlinks) {
			handler = link_typeahead_handler;
			break;
		}
		info_box(ses->tab->term, MSGBOX_FREE_TEXT,
		         N_("Typeahead"), ALIGN_CENTER,
		         msg_text(ses->tab->term,
		                  N_("No links in current document")));
		return FRAME_EVENT_OK;
	}
	input_field_line(ses, prompt, data, history, handler);
	return FRAME_EVENT_OK;
}
2007-07-27 07:13:27 -04:00
/** @} */
/* The dialog functions are clones of input_field() ones. Gross code
* duplication. */
/* TODO: This is just hacked input_field(), containing a lot of generic crap
* etc. The useless cruft should be blasted out. And it's quite ugly anyway,
* a nice cleanup target ;-). --pasky */
enum search_option {
#ifdef CONFIG_TRE
SEARCH_OPT_REGEX,
#endif
SEARCH_OPT_CASE,
SEARCH_OPTIONS,
};
/** Maps enum search_option members to option names under
 * "document.browse.search". */
static struct option_resolver resolvers[] = {
#ifdef CONFIG_TRE
	{ SEARCH_OPT_REGEX, "regex" },
#endif
	{ SEARCH_OPT_CASE, "case" },
};
/** State passed to the search dialog handlers via dialog.udata2. */
struct search_dlg_hop {
	void *data;	/* Opaque argument forwarded to the ok/cancel callbacks. */
	union option_value values[SEARCH_OPTIONS];	/* Dialog-local copies of the search options. */
};
/** Cancel button handler for the search dialog: invokes the cancel
 * callback stored in the widget data, then closes the dialog. */
static widget_handler_status_T
search_dlg_cancel(struct dialog_data *dlg_data, struct widget_data *widget_data)
{
	struct search_dlg_hop *hop = dlg_data->dlg->udata2;
	void (*callback)(void *) = widget_data->widget->data;

	if (callback) callback(hop->data);

	return cancel_dialog(dlg_data, widget_data);
}
/** OK button handler for the search dialog: commits the chosen option
 * values, records the search string in the input history and invokes
 * the search callback stored in the widget data. */
static widget_handler_status_T
search_dlg_ok(struct dialog_data *dlg_data, struct widget_data *widget_data)
{
	void (*fn)(void *, char *) = widget_data->widget->data;
	struct search_dlg_hop *hop = dlg_data->dlg->udata2;
	void *data = hop->data;
	char *text = dlg_data->widgets_data->cdata;
	update_dialog_data(dlg_data);
	/* Write the radio-button choices back to the option tree. */
	commit_option_values(resolvers, get_opt_rec(config_options,
	                                            "document.browse.search"),
	                     hop->values, SEARCH_OPTIONS);
	if (check_dialog(dlg_data)) return EVENT_NOT_PROCESSED;
	add_to_input_history(dlg_data->dlg->widgets->info.field.history, text, 1);
	if (fn) fn(data, text);
	return cancel_dialog(dlg_data, widget_data);
}
/* XXX: @data is ignored. */
static void
search_dlg_do(struct terminal *term, struct memory_list *ml,
char *title, void *data,
struct input_history *history,
void (*fn)(void *, char *))
{
Here is a framework that detects cases where a PO file assigns the same accelerator key to multiple buttons in a dialog box or to multiple items in a menu. ELinks already has some support for this but it requires the translator to run ELinks and manually scan through all menus and dialogs. The attached changes make it possible to quickly detect and list any conflicts, including ones that can only occur on operating systems or configurations that the translator is not currently using. The changes have no immediate effect on the elinks executable or the MO files. PO files become larger, however. The scheme works like this: - Like before, accelerator keys in translatable strings are tagged with the tilde (~) character. - Whenever a C source file defines an accelerator key, it must assign one or more named "contexts" to it. The translations in the PO files inherit these contexts. If multiple strings use the same accelerator (case insensitive) in the same context, that's a conflict and can be detected automatically. - The contexts are defined with "gettext_accelerator_context" comments in source files. These comments delimit regions where all translatable strings containing tildes are given the same contexts. There must be one special comment at the top of the region; it lists the contexts assigned to that region. The region automatically ends at the end of the function (found with regexp /^\}/), but it can also be closed explicitly with another special comment. The comments are formatted like this: /* [gettext_accelerator_context(foo, bar, baz)] begins a region that uses the contexts "foo", "bar", and "baz". The comma is the delimiter; whitespace is optional. [gettext_accelerator_context()] ends the region. */ The scripts don't currently check whether this syntax occurs inside or outside comments. - The names of contexts consist of C identifiers delimited with periods. 
I typically used the name of a function that sets up a dialog, or the name of an array where the items of a menu are listed. There is a special feature for static functions: if the name begins with a period, then the period will be replaced with the name of the source file and a colon. - If a menu is programmatically generated from multiple parts, of which some are never used together, so that it is safe to use the same accelerators in them, then it is necessary to define multiple contexts for the same menu. link_menu() in src/viewer/text/link.c is the most complex example of this. - During make update-po: - A Perl script (po/gather-accelerator-contexts.pl) reads po/elinks.pot, scans the source files listed in it for "gettext_accelerator_context" comments, and rewrites po/elinks.pot with "accelerator_context" comments that indicate the contexts of each msgid: the union of all contexts of all of its uses in the source files. It also removes any "gettext_accelerator_context" comments that xgettext --add-comments has copied to elinks.pot. - If po/gather-accelerator-contexts.pl does not find any contexts for some use of an msgid that seems to contain an accelerator (because it contains a tilde), it warns. If the tilde refers to e.g. "~/.elinks" and does not actually mark an accelerator, the warning can be silenced by specifying the special context "IGNORE", which the script otherwise ignores. - msgmerge copies the "accelerator_context" comments from po/elinks.pot to po/*.po. Translators do not edit those comments. - During make check-po: - Another Perl script (po/check-accelerator-contexts.pl) reads po/*.po and keeps track of which accelerators have been bound in each context. It warns about any conflicts it finds. This script does not access the C source files; thus it does not matter if the line numbers in "#:" lines are out of date. This implementation is not perfect and I am not proposing to add it to the main source tree at this time. 
Specifically: - It introduces compile-time dependencies on Perl and Locale::PO. There should be a configure-time or compile-time check so that the new features are skipped if the prerequisites are missing. - When the scripts include msgstr strings in warnings, they should transcode them from the charset of the PO file to the one specified by the user's locale. - It is not adequately documented (well, except perhaps here). - po/check-accelerator-contexts.pl reports the same conflict multiple times if it occurs in multiple contexts. - The warning messages should include line numbers, so that users of Emacs could conveniently edit the conflicting part of the PO file. This is not feasible with the current version of Locale::PO. - Locale::PO does not understand #~ lines and spews warnings about them. There is an ugly hack to hide these warnings. - Jonas Fonseca suggested the script could propose accelerators that are still available. This has not been implemented. There are three files attached: - po/gather-accelerator-contexts.pl: Augments elinks.pot with context information. - po/check-accelerator-contexts.pl: Checks conflicts. - accelerator-contexts.diff: Makes po/Makefile run the scripts, and adds special comments to source files.
2005-12-04 18:38:29 -05:00
/* [gettext_accelerator_context(.search_dlg_do)] */
/* Body of search_dlg_do(): builds and pops up the "Search for text"
 * dialog.  @title is translated here, @fn is the callback invoked on OK
 * (search_for or search_for_back), @data is passed through to it. */
struct dialog *dlg;
char *field;
struct search_dlg_hop *hop;
char *text = _("Search for text", term);
struct option *search_options;
/* The hop holds the dialog's private copies of the search options plus
 * the opaque callback @data; it is released through the memory list
 * registered below. */
hop = mem_calloc(1, sizeof(*hop));
if (!hop) return;
/* Snapshot the current "document.browse.search" option values so the
 * dialog edits copies rather than the live configuration. */
search_options = get_opt_rec(config_options, "document.browse.search");
checkout_option_values(resolvers, search_options,
hop->values, SEARCH_OPTIONS);
hop->data = data;
/* Three extra radio buttons (normal/regexp/extended regexp) are only
 * present when regular-expression support is compiled in. */
#ifdef CONFIG_TRE
#define SEARCH_WIDGETS_COUNT 8
#else
#define SEARCH_WIDGETS_COUNT 5
#endif
dlg = calloc_dialog(SEARCH_WIDGETS_COUNT, MAX_STR_LEN);
if (!dlg) {
mem_free(hop);
return;
}
dlg->title = _(title, term);
dlg->layouter = generic_dialog_layouter;
dlg->layout.fit_datalen = 1;
dlg->layout.float_groups = 1;
dlg->udata = text;
dlg->udata2 = hop;
/* Register @hop on the memory list so it is freed with the dialog. */
add_to_ml(&ml, (void *) hop, (void *) NULL);
/* @field is automatically cleared by calloc() */
field = get_dialog_offset(dlg, SEARCH_WIDGETS_COUNT);
add_dlg_field(dlg, text, 0, 0, NULL, MAX_STR_LEN, field, history);
#ifdef CONFIG_TRE
/* Radio group 1: search mode (0 = plain, 1 = basic regexp,
 * 2 = extended regexp), stored in SEARCH_OPT_REGEX. */
add_dlg_radio(dlg, _("Normal search", term), 1, 0, &hop->values[SEARCH_OPT_REGEX].number);
add_dlg_radio(dlg, _("Regexp search", term), 1, 1, &hop->values[SEARCH_OPT_REGEX].number);
add_dlg_radio(dlg, _("Extended regexp search", term), 1, 2, &hop->values[SEARCH_OPT_REGEX].number);
#endif
/* Radio group 2: case sensitivity, stored in SEARCH_OPT_CASE. */
add_dlg_radio(dlg, _("Case sensitive", term), 2, 1, &hop->values[SEARCH_OPT_CASE].number);
add_dlg_radio(dlg, _("Case insensitive", term), 2, 0, &hop->values[SEARCH_OPT_CASE].number);
add_dlg_button(dlg, _("~OK", term), B_ENTER, search_dlg_ok, fn);
add_dlg_button(dlg, _("~Cancel", term), B_ESC, search_dlg_cancel, NULL);
add_dlg_end(dlg, SEARCH_WIDGETS_COUNT);
add_to_ml(&ml, (void *) dlg, (void *) NULL);
do_dialog(term, dlg, ml);
}
/** Open the text-search dialog for @a doc_view.
 *
 * @param ses		The session owning the terminal the dialog is
 *			shown on.
 * @param doc_view	Unused here; part of the frame-event interface.
 * @param direction	Positive for a forward search, negative for a
 *			backward search; must be non-zero.
 *
 * @return FRAME_EVENT_OK always. */
enum frame_event_status
search_dlg(struct session *ses, struct document_view *doc_view, int direction)
{
	int forward;
	char *title;
	void *search_function;

	assert(direction);
	if_assert_failed return FRAME_EVENT_OK;

	/* Pick the title and the OK-button callback according to the
	 * search direction.  N_() only marks the strings for
	 * extraction; search_dlg_do() translates the title. */
	forward = (direction > 0);
	title = forward ? N_("Search") : N_("Search backward");
	search_function = forward ? search_for : search_for_back;

	search_dlg_do(ses->tab->term, NULL,
		      title, ses,
		      &search_history,
		      search_function);

	return FRAME_EVENT_OK;
}
/** Event hook for "periodic-saving": flushes the search history to its
 * on-disk file.  @a ap and @a data are unused.  Always returns
 * EVENT_HOOK_STATUS_NEXT so later hooks on the event still run. */
static enum evhook_status
search_history_write_hook(va_list ap, void *data)
{
	save_input_history(&search_history, SEARCH_HISTORY_FILENAME);

	return EVENT_HOOK_STATUS_NEXT;
}
/* Event hooks installed by the Search History module: persist the
 * history whenever the periodic-saving event fires.  The table is
 * terminated by NULL_EVENT_HOOK_INFO. */
static struct event_hook_info search_history_hooks[] = {
	{ "periodic-saving", 0, search_history_write_hook, NULL },

	NULL_EVENT_HOOK_INFO,
};
/** Module init: loads the saved search history from disk.
 * @a module is unused. */
static void
init_search_history(struct module *module)
{
	load_input_history(&search_history, SEARCH_HISTORY_FILENAME);
}
/** Module shutdown: saves the search history to disk and then frees
 * the in-memory entry list.  @a module is unused.  The save must
 * happen before the entries are freed. */
static void
done_search_history(struct module *module)
{
	save_input_history(&search_history, SEARCH_HISTORY_FILENAME);
	free_list(search_history.entries);
}
/* Module descriptor tying together the search-history lifecycle:
 * load on init, periodic + final save, free on shutdown. */
struct module search_history_module = struct_module(
	/* name: */		N_("Search History"),
	/* options: */		NULL,
	/* hooks: */		search_history_hooks,
	/* submodules: */	NULL,
	/* data: */		NULL,
	/* init: */		init_search_history,
	/* done: */		done_search_history
);