Bug 1060: Use libtre for regexp searches.

When the user tells ELinks to search for a regexp, ELinks 0.11.0
passes the regexp to regcomp() and the formatted document to
regexec(), both in the terminal charset.  This works OK for unibyte
ASCII-compatible charsets because the regexp metacharacters are all in
the ASCII range.  And ELinks 0.11.0 doesn't support multibyte or
ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no
big deal if regexp searches fail in such locales.

ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if
CONFIG_UTF8 is defined.  Then, struct search contains unicode_val_T c
rather than unsigned char c, and get_srch() and add_srch_chr()
together save UTF-32 values there if the terminal charset is UTF-8.
In plain-text searches, is_in_range_plain() compares those values
directly if the search is case sensitive, or folds them to lower case
if the search is case insensitive: with towlower() if the terminal
charset is UTF-8, or with tolower() otherwise.  In regexp searches
however, get_search_region_from_search_nodes() still truncates all
values to 8 bits in order to generate the string that
search_for_pattern() then passes to regexec().  In UTF-8 locales,
regexec() expects this string to be in UTF-8 and can't make sense of
the truncated characters.  There is also a possible conflict in
regcomp() if the locale is UTF-8 but the terminal charset is not, or
vice versa.

Rejected ways of fixing the charset mismatches:

* When the terminal charset is UTF-8, recode the formatted document
  from UTF-32 to UTF-8 for regexp searching.  This would work if the
  terminal and the locale both use UTF-8, or if both use unibyte
  ASCII-compatible charsets, but not if only one of them uses UTF-8.

* Convert both the regexp and the formatted document to the charset of
  the locale, as that is what regcomp() and regexec() expect.  ELinks
  would have to somehow keep track of which bytes in the converted
  string correspond to which characters in the document; not entirely
  trivial because convert_string() can replace a single unconvertible
  character with a string of ASCII characters.  If ELinks were
  eventually changed to use iconv() for unrecognized charsets, such
  tracking would become even harder.

* Temporarily switch to a locale that uses the charset of the
  terminal.  Unfortunately, it seems there is no portable way to
  construct a name for such a locale.  It is also possible that no
  suitable locale is available; especially on Windows, whose C library
  defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales.

Instead, this commit makes ELinks do the regexp matching with regwcomp()
and regwexec() from the TRE library.  This way, ELinks can losslessly
recode both the pattern and the document to Unicode and rely on the
regexp code in TRE decoding them properly, regardless of locale.

There are some possible problems though:

1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE
   uses wchar_t instead.  If wchar_t is UTF-16, as it is on Microsoft
   Windows, then TRE will misdecode the strings.  It wouldn't be too
   hard to make ELinks convert to UTF-16 in this case, but (a) TRE
   doesn't currently support UTF-16 either, and it seems possible that
   wchar_t-independent UTF-32 interfaces will be added to TRE; and (b)
   there seems to be little interest in using ELinks on Windows anyway.

2. The Citrus Project apparently wanted BSD to use a locale-dependent
   wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in
   others.  Regexp searches in ELinks now do not support the latter.

[ Adapted to elinks-0.12 from bug 1060 attachment 506.
  Commit message by me.  --KON ]
This commit is contained in:
Witold Filipczyk 2008-12-24 15:48:00 +02:00 committed by Kalle Olavi Niemitalo
parent 264a66fe4d
commit c5a7f87c43
5 changed files with 67 additions and 35 deletions

View File

@ -87,6 +87,8 @@ SEE_CFLAGS = @SEE_CFLAGS@

View File

@ -250,12 +250,6 @@ EL_CHECK_CODE([variadic macros], HAVE_VARIADIC_MACROS,
#define a(b,c...) printf(b,##c)],
# ===================================================================
# Check for POSIX <regex.h>
# ===================================================================
EL_CHECK_SYS_TYPE(regex_t, HAVE_REGEX_H, [#include <regex.h>])
# ===================================================================
# Checks for library functions.
# ===================================================================
@ -906,6 +900,24 @@ else
# ===================================================================
# Check for TRE library
# ===================================================================
if pkg-config tre; then
TRE_CFLAGS=`pkg-config --cflags tre`
TRE_LIBS=`pkg-config --libs tre`
if test "$cf_result" = yes; then
# ===================================================================
# Check for Ruby, optional even if installed.

View File

@ -389,7 +389,7 @@ static struct option_info config_options_info[] = {
N_("Whether the search should match the document text while maintaining\n"
"case sensitivity.")),
INIT_OPT_INT("document.browse.search", N_("Regular expressions"),
"regex", 0, 0, 2, 0,
N_("Enable searching with regular expressions:\n"

View File

@ -1,6 +1,8 @@
include $(top_builddir)/Makefile.config
OBJS-$(CONFIG_MARKS) += marks.o
OBJS = draw.o form.o link.o search.o textarea.o view.o vs.o

View File

@ -16,11 +16,11 @@
#include <sys/types.h> /* FreeBSD needs this before regex.h */
#include <regex.h>
#include <stdlib.h>
#include <string.h>
#include <tre/regex.h>
#include "elinks.h"
@ -54,10 +54,18 @@ static INIT_INPUT_HISTORY(search_history);
#undef UCHAR
#ifdef CONFIG_UTF8
#define UCHAR unicode_val_T
#define PATTERN const wchar_t
#define Regcomp regwcomp
#define Regexec regwexec
#define UCHAR unsigned char
#define PATTERN const char
#define Regcomp regcomp
#define Regexec regexec
static UCHAR *memacpy_u(unsigned char *text, int textlen, int utf8);
static inline void
add_srch_chr(struct document *document, UCHAR c, int x, int y, int nn)
@ -262,21 +270,21 @@ get_range(struct document *document, int y, int height, int l,
return 0;
/** Returns a string @c doc that is a copy of the text in the search
* nodes from @a s1 to (@a s1 + @a doclen - 1) with the space at the
* end of each line converted to a new-line character (LF). */
static unsigned char *
static UCHAR *
get_search_region_from_search_nodes(struct search *s1, struct search *s2,
int pattern_len, int *doclen)
unsigned char *doc;
UCHAR *doc;
int i;
*doclen = s2 - s1 + pattern_len;
if (!*doclen) return NULL;
doc = mem_alloc(*doclen + 1);
doc = mem_alloc((*doclen + 1) * sizeof(UCHAR));
if (!doc) {
*doclen = -1;
return NULL;
@ -301,11 +309,11 @@ struct regex_match_context {
int y1;
int y2;
int found;
unsigned char *pattern;
UCHAR *pattern;
static int
init_regex(regex_t *regex, unsigned char *pattern)
init_regex(regex_t *regex, UCHAR *pattern)
int regex_flags = REG_NEWLINE;
int reg_err;
@ -316,7 +324,7 @@ init_regex(regex_t *regex, unsigned char *pattern)
if (!get_opt_bool("document.browse.search.case"))
regex_flags |= REG_ICASE;
reg_err = regcomp(regex, pattern, regex_flags);
reg_err = Regcomp(regex, (PATTERN *)pattern, regex_flags);
if (reg_err) {
return 0;
@ -329,8 +337,8 @@ static void
search_for_pattern(struct regex_match_context *common_ctx, void *data,
void (*match)(struct regex_match_context *, void *))
unsigned char *doc;
unsigned char *doctmp;
UCHAR *doc;
UCHAR *doctmp;
int doclen;
int regexec_flags = 0;
regex_t regex;
@ -381,7 +389,7 @@ find_next:
save_c = doc[pos];
doc[pos] = 0;
while (*doctmp && !regexec(&regex, doctmp, 1, &regmatch, regexec_flags)) {
while (*doctmp && !Regexec(&regex, (PATTERN *)doctmp, 1, &regmatch, regexec_flags)) {
regexec_flags = REG_NOTBOL;
common_ctx->textlen = regmatch.rm_eo - regmatch.rm_so;
if (!common_ctx->textlen) { doc[pos] = save_c; common_ctx->found = 1; goto free_stuff; }
@ -432,10 +440,13 @@ static int
is_in_range_regex(struct document *document, int y, int height,
unsigned char *text, int textlen,
int *min, int *max,
struct search *s1, struct search *s2)
struct search *s1, struct search *s2, int utf8)
struct regex_match_context common_ctx;
struct is_in_range_regex_context ctx;
UCHAR *txt = memacpy_u(text, textlen, utf8);
if (!txt) return -1;
ctx.y = y;
ctx.min = min;
@ -445,15 +456,16 @@ is_in_range_regex(struct document *document, int y, int height,
common_ctx.textlen = textlen;
common_ctx.y1 = y - 1;
common_ctx.y2 = y + height;
common_ctx.pattern = text;
common_ctx.pattern = txt;
common_ctx.s1 = s1;
common_ctx.s2 = s2;
search_for_pattern(&common_ctx, &ctx, is_in_range_regex_match);
return common_ctx.found;
#endif /* HAVE_REGEX_H */
#endif /* HAVE_TRE_REGEX_H */
static UCHAR *
memacpy_u(unsigned char *text, int textlen, int utf8)
@ -590,10 +602,10 @@ is_in_range(struct document *document, int y, int height,
if (get_range(document, y, height, textlen, &s1, &s2))
return 0;
if (get_opt_int("document.browse.search.regex"))
return is_in_range_regex(document, y, height, text, textlen,
min, max, s1, s2);
min, max, s1, s2, utf8);
return is_in_range_plain(document, y, height, text, textlen,
min, max, s1, s2, utf8);
@ -669,7 +681,7 @@ srch_failed:
*pl = len;
struct get_searched_regex_context {
int xoffset;
int yoffset;
@ -709,10 +721,13 @@ get_searched_regex_match(struct regex_match_context *common_ctx, void *data)
static void
get_searched_regex(struct document_view *doc_view, struct point **pt, int *pl,
int textlen, struct search *s1, struct search *s2)
int textlen, struct search *s1, struct search *s2, int utf8)
struct regex_match_context common_ctx;
struct get_searched_regex_context ctx;
UCHAR *txt = memacpy_u(*doc_view->search_word, textlen, utf8);
if (!txt) return;
ctx.points = NULL;
ctx.len = 0;
@ -724,16 +739,17 @@ get_searched_regex(struct document_view *doc_view, struct point **pt, int *pl,
common_ctx.textlen = textlen;
common_ctx.y1 = doc_view->vs->y - 1;
common_ctx.y2 = doc_view->vs->y + ctx.box->height;
common_ctx.pattern = *doc_view->search_word;
common_ctx.pattern = txt;
common_ctx.s1 = s1;
common_ctx.s2 = s2;
search_for_pattern(&common_ctx, &ctx, get_searched_regex_match);
*pt = ctx.points;
*pl = ctx.len;
#endif /* HAVE_REGEX_H */
#endif /* HAVE_TRE_REGEX_H */
static void
get_searched(struct document_view *doc_view, struct point **pt, int *pl, int utf8)
@ -757,9 +773,9 @@ get_searched(struct document_view *doc_view, struct point **pt, int *pl, int utf
if (get_opt_int("document.browse.search.regex"))
get_searched_regex(doc_view, pt, pl, l, s1, s2);
get_searched_regex(doc_view, pt, pl, l, s1, s2, utf8);
get_searched_plain(doc_view, pt, pl, l, s1, s2, utf8);
@ -1576,7 +1592,7 @@ search_typeahead(struct session *ses, struct document_view *doc_view,
* a nice cleanup target ;-). --pasky */
enum search_option {
@ -1584,7 +1600,7 @@ enum search_option {
static struct option_resolver resolvers[] = {
{ SEARCH_OPT_REGEX, "regex" },
{ SEARCH_OPT_CASE, "case" },
@ -1651,7 +1667,7 @@ search_dlg_do(struct terminal *term, struct memory_list *ml,
hop->values, SEARCH_OPTIONS);
hop->data = data;
@ -1675,7 +1691,7 @@ search_dlg_do(struct terminal *term, struct memory_list *ml,
field = get_dialog_offset(dlg, SEARCH_WIDGETS_COUNT);
add_dlg_field(dlg, text, 0, 0, NULL, MAX_STR_LEN, field, history);
add_dlg_radio(dlg, _("Normal search", term), 1, 0, &hop->values[SEARCH_OPT_REGEX].number);
add_dlg_radio(dlg, _("Regexp search", term), 1, 1, &hop->values[SEARCH_OPT_REGEX].number);
add_dlg_radio(dlg, _("Extended regexp search", term), 1, 2, &hop->values[SEARCH_OPT_REGEX].number);