Bug 1060: Use libtre for regexp searches.

When the user tells ELinks to search for a regexp, ELinks 0.11.0 passes the regexp to regcomp() and the formatted document to regexec(), both in the terminal charset. This works OK for unibyte ASCII-compatible charsets because the regexp metacharacters are all in the ASCII range. And ELinks 0.11.0 doesn't support multibyte or ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no big deal if regexp searches fail in such locales. ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if CONFIG_UTF8 is defined. Then, struct search contains unicode_val_T c rather than unsigned char c, and get_srch() and add_srch_chr() together save UTF-32 values there if the terminal charset is UTF-8. In plain-text searches, is_in_range_plain() compares those values directly if the search is case sensitive, or folds them to lower case if the search is case insensitive: with towlower() if the terminal charset is UTF-8, or with tolower() otherwise. In regexp searches however, get_search_region_from_search_nodes() still truncates all values to 8 bits in order to generate the string that search_for_pattern() then passes to regexec(). In UTF-8 locales, regexec() expects this string to be in UTF-8 and can't make sense of the truncated characters. There is also a possible conflict in regcomp() if the locale is UTF-8 but the terminal charset is not, or vice versa. Rejected ways of fixing the charset mismatches: * When the terminal charset is UTF-8, recode the formatted document from UTF-32 to UTF-8 for regexp searching. This would work if the terminal and the locale both use UTF-8, or if both use unibyte ASCII-compatible charsets, but not if only one of them uses UTF-8. * Convert both the regexp and the formatted document to the charset of the locale, as that is what regcomp() and regexec() expect. 
ELinks would have to somehow keep track of which bytes in the converted string correspond to which characters in the document; not entirely trivial because convert_string() can replace a single unconvertible character with a string of ASCII characters. If ELinks were eventually changed to use iconv() for unrecognized charsets, such tracking would become even harder. * Temporarily switch to a locale that uses the charset of the terminal. Unfortunately, it seems there is no portable way to construct a name for such a locale. It is also possible that no suitable locale is available; especially on Windows, whose C library defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales. Instead, this commit makes ELinks do the regexp matching with regwcomp() and regwexec() from the TRE library. This way, ELinks can losslessly recode both the pattern and the document to Unicode and rely on the regexp code in TRE decoding them properly, regardless of locale. There are some possible problems though: 1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE uses wchar_t instead. If wchar_t is UTF-16, as it is on Microsoft Windows, then TRE will misdecode the strings. It wouldn't be too hard to make ELinks convert to UTF-16 in this case, but (a) TRE doesn't currently support UTF-16 either, and it seems possible that wchar_t-independent UTF-32 interfaces will be added to TRE; and (b) there seems to be little interest in using ELinks on Windows anyway. 2. The Citrus Project apparently wanted BSD to use a locale-dependent wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in others. Regexp searches in ELinks now do not support the latter. [ Adapted to elinks-0.12 from bug 1060 attachment 506. Commit message by me. --KON ]
2025-06-30 22:19:29 -04:00 · 2008-12-24 15:48:00 +02:00 · 2008-12-24 15:48:00 +02:00 · c5a7f87c43
commit c5a7f87c43
parent 264a66fe4d
5 changed files with 67 additions and 35 deletions
--- a/Makefile.config.in
+++ b/Makefile.config.in
@ -87,6 +87,8 @@ SEE_CFLAGS = @SEE_CFLAGS@
 SPARSE = @SPARSE@
 SPIDERMONKEY_CFLAGS = @SPIDERMONKEY_CFLAGS@
 SPIDERMONKEY_LIBS = @SPIDERMONKEY_LIBS@
+TRE_CFLAGS = @TRE_CFLAGS@
+TRE_LIBS = @TRE_LIBS@
 VERSION = @VERSION@
 XMLTO = @XMLTO@
 X_CFLAGS = @X_CFLAGS@
--- a/configure.in
+++ b/configure.in
@ -250,12 +250,6 @@ EL_CHECK_CODE([variadic macros], HAVE_VARIADIC_MACROS,
 		 #define a(b,c...) printf(b,##c)],
                [a("foo");a("%s","bar");a("%s%s","baz","quux");])

-# ===================================================================
-# Check for POSIX <regex.h>
-# ===================================================================
-
-EL_CHECK_SYS_TYPE(regex_t, HAVE_REGEX_H, [#include <regex.h>])
-
 # ===================================================================
 # Checks for library functions.
 # ===================================================================
@ -906,6 +900,24 @@ else
 	AC_SUBST(LUA_CFLAGS)
 fi

+# ===================================================================
+# Check for TRE library
+# ===================================================================
+AC_MSG_CHECKING([for TRE])
+cf_result=no
+if pkg-config tre; then
+	TRE_CFLAGS=`pkg-config --cflags tre`
+	TRE_LIBS=`pkg-config --libs tre`
+	AC_SUBST(TRE_CFLAGS)
+	AC_SUBST(TRE_LIBS)
+	CFLAGS="$TRE_CFLAGS $CFLAGS"
+	LIBS="$TRE_LIBS $LIBS"
+	cf_result=yes
+fi
+AC_MSG_RESULT($cf_result)
+if test "$cf_result" = yes; then
+	AC_CHECK_HEADERS(tre/regex.h)
+fi

 # ===================================================================
 # Check for Ruby, optional even if installed.
--- a/src/config/options.inc
+++ b/src/config/options.inc
@ -389,7 +389,7 @@ static struct option_info config_options_info[] = {
 		N_("Whether the search should match the document text while maintaining\n"
 		"case sensitivity.")),

-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	INIT_OPT_INT("document.browse.search", N_("Regular expressions"),
 		"regex", 0, 0, 2, 0,
 		N_("Enable searching with regular expressions:\n"
--- a/src/viewer/text/Makefile
+++ b/src/viewer/text/Makefile
@ -1,6 +1,8 @@
 top_builddir=../../..
 include $(top_builddir)/Makefile.config

+INCLUDES += $(TRE_CFLAGS)
+
 OBJS-$(CONFIG_MARKS) += marks.o

 OBJS = draw.o form.o link.o search.o textarea.o view.o vs.o
--- a/src/viewer/text/search.c
+++ b/src/viewer/text/search.c
@ -16,11 +16,11 @@
 #endif

 #include <sys/types.h> /* FreeBSD needs this before regex.h */
-#ifdef HAVE_REGEX_H
-#include <regex.h>
-#endif
 #include <stdlib.h>
 #include <string.h>
+#ifdef HAVE_TRE_REGEX_H
+#include <tre/regex.h>
+#endif

 #include "elinks.h"

@ -54,10 +54,18 @@ static INIT_INPUT_HISTORY(search_history);
 #undef UCHAR
 #ifdef CONFIG_UTF8
 #define UCHAR unicode_val_T
+#define PATTERN const wchar_t
+#define Regcomp regwcomp
+#define Regexec regwexec
 #else
 #define UCHAR unsigned char
+#define PATTERN const char
+#define Regcomp regcomp
+#define Regexec regexec
 #endif

+static UCHAR *memacpy_u(unsigned char *text, int textlen, int utf8);
+
 static inline void
 add_srch_chr(struct document *document, UCHAR c, int x, int y, int nn)
 {
@ -262,21 +270,21 @@ get_range(struct document *document, int y, int height, int l,
 	return 0;
 }

-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 /** Returns a string @c doc that is a copy of the text in the search
 * nodes from @a s1 to (@a s1 + @a doclen - 1) with the space at the
 * end of each line converted to a new-line character (LF). */
-static unsigned char *
+static UCHAR *
 get_search_region_from_search_nodes(struct search *s1, struct search *s2,
 				    int pattern_len, int *doclen)
 {
-	unsigned char *doc;
+	UCHAR *doc;
 	int i;

 	*doclen = s2 - s1 + pattern_len;
 	if (!*doclen) return NULL;

-	doc = mem_alloc(*doclen + 1);
+	doc = mem_alloc((*doclen + 1) * sizeof(UCHAR));
 	if (!doc) {
 		*doclen = -1;
 		return NULL;
@ -301,11 +309,11 @@ struct regex_match_context {
 	int y1;
 	int y2;
 	int found;
-	unsigned char *pattern;
+	UCHAR *pattern;
 };

 static int
-init_regex(regex_t *regex, unsigned char *pattern)
+init_regex(regex_t *regex, UCHAR *pattern)
 {
 	int regex_flags = REG_NEWLINE;
 	int reg_err;
@ -316,7 +324,7 @@ init_regex(regex_t *regex, unsigned char *pattern)
 	if (!get_opt_bool("document.browse.search.case"))
 		regex_flags |= REG_ICASE;

-	reg_err = regcomp(regex, pattern, regex_flags);
+	reg_err = Regcomp(regex, (PATTERN *)pattern, regex_flags);
 	if (reg_err) {
 		regfree(regex);
 		return 0;
@ -329,8 +337,8 @@ static void
 search_for_pattern(struct regex_match_context *common_ctx, void *data,
 		   void (*match)(struct regex_match_context *, void *))
 {
-	unsigned char *doc;
-	unsigned char *doctmp;
+	UCHAR *doc;
+	UCHAR *doctmp;
 	int doclen;
 	int regexec_flags = 0;
 	regex_t regex;
@ -381,7 +389,7 @@ find_next:
 	save_c = doc[pos];
 	doc[pos] = 0;

-	while (*doctmp && !regexec(&regex, doctmp, 1, &regmatch, regexec_flags)) {
+	while (*doctmp && !Regexec(&regex, (PATTERN *)doctmp, 1, &regmatch, regexec_flags)) {
 		regexec_flags = REG_NOTBOL;
 		common_ctx->textlen = regmatch.rm_eo - regmatch.rm_so;
 		if (!common_ctx->textlen) { doc[pos] = save_c; common_ctx->found = 1; goto free_stuff; }
@ -432,10 +440,13 @@ static int
 is_in_range_regex(struct document *document, int y, int height,
 		  unsigned char *text, int textlen,
 		  int *min, int *max,
-		  struct search *s1, struct search *s2)
+		  struct search *s1, struct search *s2, int utf8)
 {
 	struct regex_match_context common_ctx;
 	struct is_in_range_regex_context ctx;
+	UCHAR *txt = memacpy_u(text, textlen, utf8);
+
+	if (!txt) return -1;

 	ctx.y = y;
 	ctx.min = min;
@ -445,15 +456,16 @@ is_in_range_regex(struct document *document, int y, int height,
 	common_ctx.textlen = textlen;
 	common_ctx.y1 = y - 1;
 	common_ctx.y2 = y + height;
-	common_ctx.pattern = text;
+	common_ctx.pattern = txt;
 	common_ctx.s1 = s1;
 	common_ctx.s2 = s2;

 	search_for_pattern(&common_ctx, &ctx, is_in_range_regex_match);
+	mem_free(txt);

 	return common_ctx.found;
 }
-#endif /* HAVE_REGEX_H */
+#endif /* HAVE_TRE_REGEX_H */

 static UCHAR *
 memacpy_u(unsigned char *text, int textlen, int utf8)
@ -590,10 +602,10 @@ is_in_range(struct document *document, int y, int height,
 	if (get_range(document, y, height, textlen, &s1, &s2))
 		return 0;

-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	if (get_opt_int("document.browse.search.regex"))
 		return is_in_range_regex(document, y, height, text, textlen,
-					 min, max, s1, s2);
+					 min, max, s1, s2, utf8);
 #endif
 	return is_in_range_plain(document, y, height, text, textlen,
 				 min, max, s1, s2, utf8);
@ -669,7 +681,7 @@ srch_failed:
 	*pl = len;
 }

-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 struct get_searched_regex_context {
 	int xoffset;
 	int yoffset;
@ -709,10 +721,13 @@ get_searched_regex_match(struct regex_match_context *common_ctx, void *data)

 static void
 get_searched_regex(struct document_view *doc_view, struct point **pt, int *pl,
-		   int textlen, struct search *s1, struct search *s2)
+		   int textlen, struct search *s1, struct search *s2, int utf8)
 {
 	struct regex_match_context common_ctx;
 	struct get_searched_regex_context ctx;
+	UCHAR *txt = memacpy_u(*doc_view->search_word, textlen, utf8);
+
+	if (!txt) return;

 	ctx.points = NULL;
 	ctx.len = 0;
@ -724,16 +739,17 @@ get_searched_regex(struct document_view *doc_view, struct point **pt, int *pl,
 	common_ctx.textlen = textlen;
 	common_ctx.y1 = doc_view->vs->y - 1;
 	common_ctx.y2 = doc_view->vs->y + ctx.box->height;
-	common_ctx.pattern = *doc_view->search_word;
+	common_ctx.pattern = txt;
 	common_ctx.s1 = s1;
 	common_ctx.s2 = s2;

 	search_for_pattern(&common_ctx, &ctx, get_searched_regex_match);

+	mem_free(txt);
 	*pt = ctx.points;
 	*pl = ctx.len;
 }
-#endif /* HAVE_REGEX_H */
+#endif /* HAVE_TRE_REGEX_H */

 static void
 get_searched(struct document_view *doc_view, struct point **pt, int *pl, int utf8)
@ -757,9 +773,9 @@ get_searched(struct document_view *doc_view, struct point **pt, int *pl, int utf
 		return;
 	}

-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	if (get_opt_int("document.browse.search.regex"))
-		get_searched_regex(doc_view, pt, pl, l, s1, s2);
+		get_searched_regex(doc_view, pt, pl, l, s1, s2, utf8);
 	else
 #endif
 		get_searched_plain(doc_view, pt, pl, l, s1, s2, utf8);
@ -1576,7 +1592,7 @@ search_typeahead(struct session *ses, struct document_view *doc_view,
 * a nice cleanup target ;-). --pasky */

 enum search_option {
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	SEARCH_OPT_REGEX,
 #endif
 	SEARCH_OPT_CASE,
@ -1584,7 +1600,7 @@ enum search_option {
 };

 static struct option_resolver resolvers[] = {
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	{ SEARCH_OPT_REGEX,	"regex" },
 #endif
 	{ SEARCH_OPT_CASE,	"case" },
@ -1651,7 +1667,7 @@ search_dlg_do(struct terminal *term, struct memory_list *ml,
 			       hop->values, SEARCH_OPTIONS);
 	hop->data = data;

-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 #define SEARCH_WIDGETS_COUNT 8
 #else
 #define SEARCH_WIDGETS_COUNT 5
@ -1675,7 +1691,7 @@ search_dlg_do(struct terminal *term, struct memory_list *ml,
 	field = get_dialog_offset(dlg, SEARCH_WIDGETS_COUNT);
 	add_dlg_field(dlg, text, 0, 0, NULL, MAX_STR_LEN, field, history);

-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	add_dlg_radio(dlg, _("Normal search", term), 1, 0, &hop->values[SEARCH_OPT_REGEX].number);
 	add_dlg_radio(dlg, _("Regexp search", term), 1, 1, &hop->values[SEARCH_OPT_REGEX].number);
 	add_dlg_radio(dlg, _("Extended regexp search", term), 1, 2, &hop->values[SEARCH_OPT_REGEX].number);