From c5a7f87c43afd214f028dd557730f7eb7da3cee4 Mon Sep 17 00:00:00 2001
From: Witold Filipczyk <witekfl@poczta.onet.pl>
Date: Wed, 24 Dec 2008 15:48:00 +0200
Subject: [PATCH] Bug 1060: Use libtre for regexp searches.

When the user tells ELinks to search for a regexp, ELinks 0.11.0
passes the regexp to regcomp() and the formatted document to
regexec(), both in the terminal charset.  This works OK for unibyte
ASCII-compatible charsets because the regexp metacharacters are all in
the ASCII range.  And ELinks 0.11.0 doesn't support multibyte or
ASCII-incompatible (e.g. EBCDIC) charsets in terminals, so it is no
big deal if regexp searches fail in such locales.

ELinks 0.12pre1 attempts to support UTF-8 as the terminal charset if
CONFIG_UTF8 is defined.  Then, struct search contains unicode_val_T c
rather than unsigned char c, and get_srch() and add_srch_chr()
together save UTF-32 values there if the terminal charset is UTF-8.
In plain-text searches, is_in_range_plain() compares those values
directly if the search is case sensitive, or folds them to lower case
if the search is case insensitive: with towlower() if the terminal
charset is UTF-8, or with tolower() otherwise.  In regexp searches
however, get_search_region_from_search_nodes() still truncates all
values to 8 bits in order to generate the string that
search_for_pattern() then passes to regexec().  In UTF-8 locales,
regexec() expects this string to be in UTF-8 and can't make sense of
the truncated characters.  There is also a possible conflict in
regcomp() if the locale is UTF-8 but the terminal charset is not, or
vice versa.

Rejected ways of fixing the charset mismatches:

* When the terminal charset is UTF-8, recode the formatted document
  from UTF-32 to UTF-8 for regexp searching.  This would work if the
  terminal and the locale both use UTF-8, or if both use unibyte
  ASCII-compatible charsets, but not if only one of them uses UTF-8.

* Convert both the regexp and the formatted document to the charset of
  the locale, as that is what regcomp() and regexec() expect.  ELinks
  would have to somehow keep track of which bytes in the converted
  string correspond to which characters in the document; not entirely
  trivial because convert_string() can replace a single unconvertible
  character with a string of ASCII characters.  If ELinks were
  eventually changed to use iconv() for unrecognized charsets, such
  tracking would become even harder.

* Temporarily switch to a locale that uses the charset of the
  terminal.  Unfortunately, it seems there is no portable way to
  construct a name for such a locale.  It is also possible that no
  suitable locale is available; especially on Windows, whose C library
  defines MB_LEN_MAX as 2 and thus cannot support UTF-8 locales.

Instead, this commit makes ELinks do the regexp matching with
regwcomp() and regwexec() from the TRE library.  This way, ELinks can
losslessly recode both the pattern and the document to Unicode and
rely on the regexp code in TRE to decode them properly, regardless of
locale.

There are some possible problems though:

1. ELinks stores strings as UTF-32 in arrays of unicode_val_T, but TRE
   uses wchar_t instead.  If wchar_t is UTF-16, as it is on Microsoft
   Windows, then TRE will misdecode the strings.  It wouldn't be too
   hard to make ELinks convert to UTF-16 in this case, but (a) TRE
   doesn't currently support UTF-16 either, and it seems possible that
   wchar_t-independent UTF-32 interfaces will be added to TRE; and (b)
   there seems to be little interest in using ELinks on Windows anyway.

2. The Citrus Project apparently wanted BSD to use a locale-dependent
   wchar_t: e.g. UTF-32 in some locales and an ISO 2022 derivative in
   others.  Regexp searches in ELinks now do not support the latter.

[ Adapted to elinks-0.12 from bug 1060 attachment 506.
  Commit message by me.  --KON ]
---
 Makefile.config.in       |  2 ++
 configure.in             | 24 ++++++++++----
 src/config/options.inc   |  2 +-
 src/viewer/text/Makefile |  2 ++
 src/viewer/text/search.c | 72 ++++++++++++++++++++++++----------------
 5 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/Makefile.config.in b/Makefile.config.in
index f8a1cd43..8da184ef 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in
@@ -87,6 +87,8 @@ SEE_CFLAGS = @SEE_CFLAGS@
 SPARSE = @SPARSE@
 SPIDERMONKEY_CFLAGS = @SPIDERMONKEY_CFLAGS@
 SPIDERMONKEY_LIBS = @SPIDERMONKEY_LIBS@
+TRE_CFLAGS = @TRE_CFLAGS@
+TRE_LIBS = @TRE_LIBS@
 VERSION = @VERSION@
 XMLTO = @XMLTO@
 X_CFLAGS = @X_CFLAGS@
diff --git a/configure.in b/configure.in
index 3910b543..7023c662 100644
--- a/configure.in
+++ b/configure.in
@@ -250,12 +250,6 @@ EL_CHECK_CODE([variadic macros], HAVE_VARIADIC_MACROS,
 		 #define a(b,c...) printf(b,##c)],
                 [a("foo");a("%s","bar");a("%s%s","baz","quux");])
 
-# ===================================================================
-# Check for POSIX <regex.h>
-# ===================================================================
-
-EL_CHECK_SYS_TYPE(regex_t, HAVE_REGEX_H, [#include <regex.h>])
-
 # ===================================================================
 # Checks for library functions.
 # ===================================================================
@@ -906,6 +900,24 @@ else
 	AC_SUBST(LUA_CFLAGS)
 fi
 
+# ===================================================================
+# Check for TRE library
+# ===================================================================
+AC_MSG_CHECKING([for TRE])
+cf_result=no
+if pkg-config tre; then
+	TRE_CFLAGS=`pkg-config --cflags tre`
+	TRE_LIBS=`pkg-config --libs tre`
+	AC_SUBST(TRE_CFLAGS)
+	AC_SUBST(TRE_LIBS)
+	CFLAGS="$TRE_CFLAGS $CFLAGS"
+	LIBS="$TRE_LIBS $LIBS"
+	cf_result=yes
+fi
+AC_MSG_RESULT($cf_result)
+if test "$cf_result" = yes; then
+	AC_CHECK_HEADERS(tre/regex.h)
+fi
 
 # ===================================================================
 # Check for Ruby, optional even if installed.
diff --git a/src/config/options.inc b/src/config/options.inc
index 0c508ae4..78de2bd5 100644
--- a/src/config/options.inc
+++ b/src/config/options.inc
@@ -389,7 +389,7 @@ static struct option_info config_options_info[] = {
 		N_("Whether the search should match the document text while maintaining\n"
 		"case sensitivity.")),
 
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	INIT_OPT_INT("document.browse.search", N_("Regular expressions"),
 		"regex", 0, 0, 2, 0,
 		N_("Enable searching with regular expressions:\n"
diff --git a/src/viewer/text/Makefile b/src/viewer/text/Makefile
index 06190f1e..3ed42624 100644
--- a/src/viewer/text/Makefile
+++ b/src/viewer/text/Makefile
@@ -1,6 +1,8 @@
 top_builddir=../../..
 include $(top_builddir)/Makefile.config
 
+INCLUDES += $(TRE_CFLAGS)
+
 OBJS-$(CONFIG_MARKS) += marks.o
 
 OBJS = draw.o form.o link.o search.o textarea.o view.o vs.o
diff --git a/src/viewer/text/search.c b/src/viewer/text/search.c
index 4c0201e0..4b9dc20e 100644
--- a/src/viewer/text/search.c
+++ b/src/viewer/text/search.c
@@ -16,11 +16,11 @@
 #endif
 
 #include <sys/types.h> /* FreeBSD needs this before regex.h */
-#ifdef HAVE_REGEX_H
-#include <regex.h>
-#endif
 #include <stdlib.h>
 #include <string.h>
+#ifdef HAVE_TRE_REGEX_H
+#include <tre/regex.h>
+#endif
 
 #include "elinks.h"
 
@@ -54,10 +54,18 @@ static INIT_INPUT_HISTORY(search_history);
 #undef UCHAR
 #ifdef CONFIG_UTF8
 #define UCHAR unicode_val_T
+#define PATTERN const wchar_t /* TRE wide-char API type; the casts assume wchar_t holds UTF-32 like unicode_val_T */
+#define Regcomp regwcomp /* wide-character pattern compiler from TRE */
+#define Regexec regwexec /* wide-character matcher from TRE */
 #else
 #define UCHAR unsigned char
+#define PATTERN const char /* unibyte build: plain byte strings */
+#define Regcomp regcomp /* byte-string pattern compiler */
+#define Regexec regexec /* byte-string matcher */
 #endif
 
+static UCHAR *memacpy_u(unsigned char *text, int textlen, int utf8);
+
 static inline void
 add_srch_chr(struct document *document, UCHAR c, int x, int y, int nn)
 {
@@ -262,21 +270,21 @@ get_range(struct document *document, int y, int height, int l,
 	return 0;
 }
 
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 /** Returns a string @c doc that is a copy of the text in the search
  * nodes from @a s1 to (@a s1 + @a doclen - 1) with the space at the
  * end of each line converted to a new-line character (LF). */
-static unsigned char *
+static UCHAR *
 get_search_region_from_search_nodes(struct search *s1, struct search *s2,
 				    int pattern_len, int *doclen)
 {
-	unsigned char *doc;
+	UCHAR *doc;
 	int i;
 
 	*doclen = s2 - s1 + pattern_len;
 	if (!*doclen) return NULL;
 
-	doc = mem_alloc(*doclen + 1);
+	doc = mem_alloc((*doclen + 1) * sizeof(UCHAR));
 	if (!doc) {
 		*doclen = -1;
 		return NULL;
@@ -301,11 +309,11 @@ struct regex_match_context {
 	int y1;
 	int y2;
 	int found;
-	unsigned char *pattern;
+	UCHAR *pattern;
 };
 
 static int
-init_regex(regex_t *regex, unsigned char *pattern)
+init_regex(regex_t *regex, UCHAR *pattern)
 {
 	int regex_flags = REG_NEWLINE;
 	int reg_err;
@@ -316,7 +324,7 @@ init_regex(regex_t *regex, unsigned char *pattern)
 	if (!get_opt_bool("document.browse.search.case"))
 		regex_flags |= REG_ICASE;
 
-	reg_err = regcomp(regex, pattern, regex_flags);
+	reg_err = Regcomp(regex, (PATTERN *)pattern, regex_flags);
 	if (reg_err) {
 		regfree(regex);
 		return 0;
@@ -329,8 +337,8 @@ static void
 search_for_pattern(struct regex_match_context *common_ctx, void *data,
 		   void (*match)(struct regex_match_context *, void *))
 {
-	unsigned char *doc;
-	unsigned char *doctmp;
+	UCHAR *doc;
+	UCHAR *doctmp;
 	int doclen;
 	int regexec_flags = 0;
 	regex_t regex;
@@ -381,7 +389,7 @@ find_next:
 	save_c = doc[pos];
 	doc[pos] = 0;
 
-	while (*doctmp && !regexec(&regex, doctmp, 1, &regmatch, regexec_flags)) {
+	while (*doctmp && !Regexec(&regex, (PATTERN *)doctmp, 1, &regmatch, regexec_flags)) {
 		regexec_flags = REG_NOTBOL;
 		common_ctx->textlen = regmatch.rm_eo - regmatch.rm_so;
 		if (!common_ctx->textlen) { doc[pos] = save_c; common_ctx->found = 1; goto free_stuff; }
@@ -432,10 +440,13 @@ static int
 is_in_range_regex(struct document *document, int y, int height,
 		  unsigned char *text, int textlen,
 		  int *min, int *max,
-		  struct search *s1, struct search *s2)
+		  struct search *s1, struct search *s2, int utf8)
 {
 	struct regex_match_context common_ctx;
 	struct is_in_range_regex_context ctx;
+	UCHAR *txt = memacpy_u(text, textlen, utf8);
+
+	if (!txt) return -1;
 
 	ctx.y = y;
 	ctx.min = min;
@@ -445,15 +456,16 @@ is_in_range_regex(struct document *document, int y, int height,
 	common_ctx.textlen = textlen;
 	common_ctx.y1 = y - 1;
 	common_ctx.y2 = y + height;
-	common_ctx.pattern = text;
+	common_ctx.pattern = txt;
 	common_ctx.s1 = s1;
 	common_ctx.s2 = s2;
 
 	search_for_pattern(&common_ctx, &ctx, is_in_range_regex_match);
+	mem_free(txt);
 
 	return common_ctx.found;
 }
-#endif /* HAVE_REGEX_H */
+#endif /* HAVE_TRE_REGEX_H */
 
 static UCHAR *
 memacpy_u(unsigned char *text, int textlen, int utf8)
@@ -590,10 +602,10 @@ is_in_range(struct document *document, int y, int height,
 	if (get_range(document, y, height, textlen, &s1, &s2))
 		return 0;
 
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	if (get_opt_int("document.browse.search.regex"))
 		return is_in_range_regex(document, y, height, text, textlen,
-					 min, max, s1, s2);
+					 min, max, s1, s2, utf8);
 #endif
 	return is_in_range_plain(document, y, height, text, textlen,
 				 min, max, s1, s2, utf8);
@@ -669,7 +681,7 @@ srch_failed:
 	*pl = len;
 }
 
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 struct get_searched_regex_context {
 	int xoffset;
 	int yoffset;
@@ -709,10 +721,13 @@ get_searched_regex_match(struct regex_match_context *common_ctx, void *data)
 
 static void
 get_searched_regex(struct document_view *doc_view, struct point **pt, int *pl,
-		   int textlen, struct search *s1, struct search *s2)
+		   int textlen, struct search *s1, struct search *s2, int utf8)
 {
 	struct regex_match_context common_ctx;
 	struct get_searched_regex_context ctx;
+	UCHAR *txt = memacpy_u(*doc_view->search_word, textlen, utf8);
+
+	if (!txt) return;
 
 	ctx.points = NULL;
 	ctx.len = 0;
@@ -724,16 +739,17 @@ get_searched_regex(struct document_view *doc_view, struct point **pt, int *pl,
 	common_ctx.textlen = textlen;
 	common_ctx.y1 = doc_view->vs->y - 1;
 	common_ctx.y2 = doc_view->vs->y + ctx.box->height;
-	common_ctx.pattern = *doc_view->search_word;
+	common_ctx.pattern = txt;
 	common_ctx.s1 = s1;
 	common_ctx.s2 = s2;
 
 	search_for_pattern(&common_ctx, &ctx, get_searched_regex_match);
 
+	mem_free(txt);
 	*pt = ctx.points;
 	*pl = ctx.len;
 }
-#endif /* HAVE_REGEX_H */
+#endif /* HAVE_TRE_REGEX_H */
 
 static void
 get_searched(struct document_view *doc_view, struct point **pt, int *pl, int utf8)
@@ -757,9 +773,9 @@ get_searched(struct document_view *doc_view, struct point **pt, int *pl, int utf
 		return;
 	}
 
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	if (get_opt_int("document.browse.search.regex"))
-		get_searched_regex(doc_view, pt, pl, l, s1, s2);
+		get_searched_regex(doc_view, pt, pl, l, s1, s2, utf8);
 	else
 #endif
 		get_searched_plain(doc_view, pt, pl, l, s1, s2, utf8);
@@ -1576,7 +1592,7 @@ search_typeahead(struct session *ses, struct document_view *doc_view,
  * a nice cleanup target ;-). --pasky */
 
 enum search_option {
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	SEARCH_OPT_REGEX,
 #endif
 	SEARCH_OPT_CASE,
@@ -1584,7 +1600,7 @@ enum search_option {
 };
 
 static struct option_resolver resolvers[] = {
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	{ SEARCH_OPT_REGEX,	"regex" },
 #endif
 	{ SEARCH_OPT_CASE,	"case" },
@@ -1651,7 +1667,7 @@ search_dlg_do(struct terminal *term, struct memory_list *ml,
 			       hop->values, SEARCH_OPTIONS);
 	hop->data = data;
 
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 #define SEARCH_WIDGETS_COUNT 8
 #else
 #define SEARCH_WIDGETS_COUNT 5
@@ -1675,7 +1691,7 @@ search_dlg_do(struct terminal *term, struct memory_list *ml,
 	field = get_dialog_offset(dlg, SEARCH_WIDGETS_COUNT);
 	add_dlg_field(dlg, text, 0, 0, NULL, MAX_STR_LEN, field, history);
 
-#ifdef HAVE_REGEX_H
+#ifdef HAVE_TRE_REGEX_H
 	add_dlg_radio(dlg, _("Normal search", term), 1, 0, &hop->values[SEARCH_OPT_REGEX].number);
 	add_dlg_radio(dlg, _("Regexp search", term), 1, 1, &hop->values[SEARCH_OPT_REGEX].number);
 	add_dlg_radio(dlg, _("Extended regexp search", term), 1, 2, &hop->values[SEARCH_OPT_REGEX].number);