From 79bbca4644cad7f2dee89c7ac6b8f9acc2c8b427 Mon Sep 17 00:00:00 2001 From: ailin-nemui Date: Thu, 16 Feb 2017 22:48:13 +0100 Subject: [PATCH] Refactor regex and implement UTF8 mode for GRegex - with non-unicode byte to Private Use Area A mapping - move all ifdefs to iregex.h file only --- configure.ac | 1 + src/core/Makefile.am | 8 ++ src/core/ignore.c | 40 ++------- src/core/ignore.h | 11 +-- src/core/iregex-gregex.c | 137 ++++++++++++++++++++++++++++++ src/core/iregex-regexh.c | 101 ++++++++++++++++++++++ src/core/iregex.h | 52 ++++++++++++ src/core/misc.c | 4 - src/fe-common/core/fe-ignore.c | 5 -- src/fe-common/core/hilight-text.c | 51 +++-------- src/fe-common/core/hilight-text.h | 12 +-- src/fe-text/textbuffer.c | 37 ++------ 12 files changed, 327 insertions(+), 132 deletions(-) create mode 100644 src/core/iregex-gregex.c create mode 100644 src/core/iregex-regexh.c create mode 100644 src/core/iregex.h diff --git a/configure.ac b/configure.ac index 3855ba4c..459d3a2d 100644 --- a/configure.ac +++ b/configure.ac @@ -505,6 +505,7 @@ AM_CONDITIONAL(BUILD_IRSSIBOT, test "$want_irssibot" = "yes") AM_CONDITIONAL(BUILD_IRSSIFUZZER, test "$want_irssifuzzer" = "yes") AM_CONDITIONAL(BUILD_IRSSIPROXY, test "$want_irssiproxy" = "yes") AM_CONDITIONAL(HAVE_PERL, test "$want_perl" != "no") +AM_CONDITIONAL(USE_GREGEX, test "x$want_gregex" = "xyes") # move LIBS to PROG_LIBS so they're not tried to be used when linking eg. 
perl libraries PROG_LIBS=$LIBS diff --git a/src/core/Makefile.am b/src/core/Makefile.am index 10bd035a..91daba3f 100644 --- a/src/core/Makefile.am +++ b/src/core/Makefile.am @@ -7,6 +7,12 @@ AM_CPPFLAGS = \ -DSYSCONFDIR=\""$(sysconfdir)"\" \ -DMODULEDIR=\""$(libdir)/irssi/modules"\" +if USE_GREGEX +regex_impl=iregex-gregex.c +else +regex_impl=iregex-regexh.c +endif + libcore_a_SOURCES = \ args.c \ channels.c \ @@ -45,6 +51,7 @@ libcore_a_SOURCES = \ signals.c \ special-vars.c \ utf8.c \ + $(regex_impl) \ wcwidth.c \ tls.c \ write-buffer.c @@ -97,6 +104,7 @@ pkginc_core_HEADERS = \ signals.h \ special-vars.h \ utf8.h \ + iregex.h \ window-item-def.h \ tls.h \ write-buffer.h \ diff --git a/src/core/ignore.c b/src/core/ignore.c index d4a92e3c..63a507f5 100644 --- a/src/core/ignore.c +++ b/src/core/ignore.c @@ -24,6 +24,7 @@ #include "levels.h" #include "lib-config/iconfig.h" #include "settings.h" +#include "iregex.h" #include "masks.h" #include "servers.h" @@ -67,13 +68,8 @@ static int ignore_match_pattern(IGNORE_REC *rec, const char *text) return FALSE; if (rec->regexp) { -#ifdef USE_GREGEX return rec->preg != NULL && - g_regex_match(rec->preg, text, 0, NULL); -#else - return rec->regexp_compiled && - regexec(&rec->preg, text, 0, NULL, 0) == 0; -#endif + i_regex_match(rec->preg, text, 0, NULL, NULL); } return rec->fullword ? 
@@ -327,41 +323,19 @@ static void ignore_remove_config(IGNORE_REC *rec) static void ignore_init_rec(IGNORE_REC *rec) { -#ifdef USE_GREGEX if (rec->preg != NULL) - g_regex_unref(rec->preg); + i_regex_unref(rec->preg); if (rec->regexp && rec->pattern != NULL) { GError *re_error = NULL; - rec->preg = g_regex_new(rec->pattern, G_REGEX_OPTIMIZE | G_REGEX_RAW | G_REGEX_CASELESS, 0, &re_error); + rec->preg = i_regex_new(rec->pattern, G_REGEX_OPTIMIZE | G_REGEX_CASELESS, 0, &re_error); if (rec->preg == NULL) { g_warning("Failed to compile regexp '%s': %s", rec->pattern, re_error->message); g_error_free(re_error); } } -#else - char *errbuf; - int errcode, errbuf_len; - - if (rec->regexp_compiled) regfree(&rec->preg); - rec->regexp_compiled = FALSE; - - if (rec->regexp && rec->pattern != NULL) { - errcode = regcomp(&rec->preg, rec->pattern, - REG_EXTENDED|REG_ICASE|REG_NOSUB); - if (errcode != 0) { - errbuf_len = regerror(errcode, &rec->preg, 0, 0); - errbuf = g_malloc(errbuf_len); - regerror(errcode, &rec->preg, errbuf, errbuf_len); - g_warning("Failed to compile regexp '%s': %s", rec->pattern, errbuf); - g_free(errbuf); - } else { - rec->regexp_compiled = TRUE; - } - } -#endif } void ignore_add_rec(IGNORE_REC *rec) @@ -381,11 +355,7 @@ static void ignore_destroy(IGNORE_REC *rec, int send_signal) if (send_signal) signal_emit("ignore destroyed", 1, rec); -#ifdef USE_GREGEX - if (rec->preg != NULL) g_regex_unref(rec->preg); -#else - if (rec->regexp_compiled) regfree(&rec->preg); -#endif + if (rec->preg != NULL) i_regex_unref(rec->preg); if (rec->channels != NULL) g_strfreev(rec->channels); g_free_not_null(rec->mask); g_free_not_null(rec->servertag); diff --git a/src/core/ignore.h b/src/core/ignore.h index 80ae1d12..e18be3c4 100644 --- a/src/core/ignore.h +++ b/src/core/ignore.h @@ -1,9 +1,7 @@ #ifndef __IGNORE_H #define __IGNORE_H -#ifndef USE_GREGEX -# include -#endif +#include "iregex.h" typedef struct _IGNORE_REC IGNORE_REC; @@ -20,12 +18,7 @@ struct _IGNORE_REC { unsigned 
int regexp:1;
 	unsigned int fullword:1;
 	unsigned int replies:1; /* ignore replies to nick in channel */
-#ifdef USE_GREGEX
-	GRegex *preg;
-#else
-	unsigned int regexp_compiled:1; /* should always be TRUE, unless regexp is invalid */
-	regex_t preg;
-#endif
+	Regex *preg;
 };
 
 extern GSList *ignores;
diff --git a/src/core/iregex-gregex.c b/src/core/iregex-gregex.c
new file mode 100644
index 00000000..0de11e64
--- /dev/null
+++ b/src/core/iregex-gregex.c
@@ -0,0 +1,184 @@
+#include <string.h>
+
+#include "iregex.h"
+
+/* Bytes that are not valid UTF-8 are mapped one-to-one into the
+   Supplementary Private Use Area-A: byte 0xNN becomes U+FFFNN. */
+#define PUA_BYTE_FIRST 0xfff00
+#define PUA_BYTE_LAST 0xfffff
+
+/* Returns text as valid UTF-8 so that it can be passed to GRegex in
+   UTF-8 mode. Valid input is returned unchanged and *free_ret is set
+   to FALSE. Otherwise a newly allocated copy is returned in which each
+   offending byte is replaced by its PUA-A code point and *free_ret is
+   set to TRUE; the caller must then g_free() the result. */
+const gchar *
+make_valid_utf8(const gchar *text, gboolean *free_ret)
+{
+	GString *str;
+	const gchar *ptr;
+
+	if (g_utf8_validate(text, -1, NULL)) {
+		if (free_ret)
+			*free_ret = FALSE;
+		return text;
+	}
+
+	str = g_string_sized_new(strlen(text) + 12);
+
+	ptr = text;
+	while (*ptr) {
+		gunichar c = g_utf8_get_char_validated(ptr, -1);
+		/* the unicode is invalid */
+		if (c == (gunichar)-1 || c == (gunichar)-2) {
+			/* encode the byte into PUA-A */
+			g_string_append_unichar(str, (gunichar) (PUA_BYTE_FIRST | (*ptr & 0xff)));
+			ptr++;
+		} else {
+			g_string_append_unichar(str, c);
+			ptr = g_utf8_next_char(ptr);
+		}
+	}
+
+	if (free_ret)
+		*free_ret = TRUE;
+	return g_string_free(str, FALSE);
+}
+
+/* Compiles pattern with GRegex. The pattern is converted with
+   make_valid_utf8() first, so raw non-UTF-8 bytes in it line up with
+   subjects converted the same way by i_regex_match. Returns NULL and
+   sets error when compilation fails. */
+Regex *
+i_regex_new (const gchar *pattern,
+             GRegexCompileFlags compile_options,
+             GRegexMatchFlags match_options,
+             GError **error)
+{
+	const gchar *valid_pattern;
+	gboolean free_valid_pattern;
+	Regex *ret = NULL;
+
+	valid_pattern = make_valid_utf8(pattern, &free_valid_pattern);
+	ret = g_regex_new(valid_pattern, compile_options, match_options, error);
+
+	if (free_valid_pattern)
+		g_free_not_null((gchar *)valid_pattern);
+
+	return ret;
+}
+
+/* Drops one reference to regex. */
+void
+i_regex_unref (Regex *regex)
+{
+	g_regex_unref(regex);
+}
+
+/* Matches string against regex. If string had to be converted to valid
+   UTF-8, the converted copy is handed back through new_string and the
+   caller must g_free() it; *new_string is set to NULL when no
+   conversion was needed. If new_string is NULL the copy is freed here
+   and g_match_info_get_string must not be used on match_info. */
+gboolean
+i_regex_match (const Regex *regex,
+               const gchar *string,
+               GRegexMatchFlags match_options,
+               MatchInfo **match_info,
+               const gchar **new_string)
+{
+	gboolean ret;
+	gboolean free_valid_string;
+	const gchar *valid_string;
+
+	/* give both outputs a defined value so callers can free them
+	   unconditionally */
+	if (match_info)
+		*match_info = NULL;
+	if (new_string)
+		*new_string = NULL;
+
+	valid_string = make_valid_utf8(string, &free_valid_string);
+
+	ret = g_regex_match(regex, valid_string, match_options, match_info);
+	if (free_valid_string) {
+		if (new_string)
+			*new_string = valid_string;
+		else
+			g_free_not_null((gchar *)valid_string);
+	}
+	return ret;
+}
+
+/* Returns the byte length str had before make_valid_utf8(): a PUA-A
+   encoded byte counts as one byte, every other character counts with
+   its UTF-8 length. */
+gsize
+strlen_pua_oddly(const char *str)
+{
+	const gchar *ptr;
+	gsize ret = 0;
+	ptr = str;
+
+	while (*ptr) {
+		const gchar *old;
+		gunichar c = g_utf8_get_char(ptr);
+		old = ptr;
+		ptr = g_utf8_next_char(ptr);
+
+		/* it is our PUA encoded byte; compare the range, a bit mask
+		   would also catch U+10FF00..U+10FFFF */
+		if (c >= PUA_BYTE_FIRST && c <= PUA_BYTE_LAST)
+			ret++;
+		else
+			ret += ptr - old;
+	}
+
+	return ret;
+}
+
+/* Like g_match_info_fetch_pos(). When new_string (the converted subject
+   handed back by i_regex_match) is given, the positions are translated
+   to byte offsets in the unconverted subject. */
+gboolean
+i_match_info_fetch_pos (const MatchInfo *match_info,
+                        gint match_num,
+                        gint *start_pos,
+                        gint *end_pos,
+                        const gchar *new_string)
+{
+	gint tmp_start = -1, tmp_end = -1, new_start_pos;
+	gboolean ret;
+	gchar *to_start;
+
+	if (!new_string || (!start_pos && !end_pos))
+		return g_match_info_fetch_pos(match_info, match_num, start_pos, end_pos);
+
+	ret = g_match_info_fetch_pos(match_info, match_num, &tmp_start, &tmp_end);
+	if (!ret)
+		return FALSE; /* nothing fetched, leave the outputs alone */
+
+	if (tmp_start < 0 || tmp_end < tmp_start) {
+		/* the group did not take part in the match: pass the
+		   markers through, they must not be used as lengths */
+		if (start_pos)
+			*start_pos = tmp_start;
+		if (end_pos)
+			*end_pos = tmp_end;
+		return TRUE;
+	}
+
+	to_start = g_strndup(new_string, tmp_start);
+	new_start_pos = strlen_pua_oddly(to_start);
+	g_free_not_null(to_start);
+
+	if (start_pos)
+		*start_pos = new_start_pos;
+
+	if (end_pos) {
+		gchar *to_end = g_strndup(new_string + tmp_start, tmp_end - tmp_start);
+		*end_pos = new_start_pos + strlen_pua_oddly(to_end);
+		g_free_not_null(to_end);
+	}
+	return TRUE;
+}
diff --git a/src/core/iregex-regexh.c b/src/core/iregex-regexh.c
new file mode 100644
index 00000000..aabe44f6
--- /dev/null
+++ b/src/core/iregex-regexh.c
@@ -0,0 +1,135 @@
+#include "iregex.h"
+
+/* Compiles pattern as a POSIX extended regular expression. From
+   compile_options only G_REGEX_CASELESS and G_REGEX_MULTILINE are
+   honoured. match_options is not used: REG_NOTBOL and REG_NOTEOL are
+   regexec() flags and must not be given to regcomp(), pass them to
+   i_regex_match instead. Returns NULL and sets error on failure. */
+Regex *
+i_regex_new (const gchar *pattern,
+             GRegexCompileFlags compile_options,
+             GRegexMatchFlags match_options,
+             GError **error)
+{
+	Regex *regex;
+	char *errbuf;
+	int cflags;
+	int errcode;
+	size_t errbuf_len;
+
+	(void) match_options;
+
+	regex = g_new0(Regex, 1);
+	cflags = REG_EXTENDED;
+	if (compile_options & G_REGEX_CASELESS)
+		cflags |= REG_ICASE;
+	if (compile_options & G_REGEX_MULTILINE)
+		cflags |= REG_NEWLINE;
+
+	errcode = regcomp(regex, pattern, cflags);
+	if (errcode != 0) {
+		errbuf_len = regerror(errcode, regex, 0, 0);
+		errbuf = g_malloc(errbuf_len);
+		regerror(errcode, regex, errbuf, errbuf_len);
+		g_set_error(error, G_REGEX_ERROR, errcode, "%s", errbuf);
+		g_free(errbuf);
+		g_free(regex);
+		return NULL;
+	} else {
+		return regex;
+	}
+}
+
+/* Releases the compiled expression and the Regex itself. */
+void
+i_regex_unref (Regex *regex)
+{
+	regfree(regex);
+	g_free(regex);
+}
+
+/* Matches string against regex. When match_info is given, a newly
+   allocated array with one entry per group is stored there (free it
+   with i_match_info_free); after a failed match every entry is -1.
+   string is never converted here, so *new_string is always NULL. */
+gboolean
+i_regex_match (const Regex *regex,
+               const gchar *string,
+               GRegexMatchFlags match_options,
+               MatchInfo **match_info,
+               const gchar **new_string)
+{
+	int groups;
+	int eflags;
+	int i;
+	gboolean ret;
+
+	/* give both outputs a defined value on every path */
+	if (match_info != NULL)
+		*match_info = NULL;
+	if (new_string != NULL)
+		*new_string = NULL;
+
+	g_return_val_if_fail(regex != NULL, FALSE);
+
+	if (match_info != NULL) {
+		groups = 1 + regex->re_nsub;
+		*match_info = g_new0(MatchInfo, groups);
+	} else {
+		groups = 0;
+	}
+
+	eflags = 0;
+	if (match_options & G_REGEX_MATCH_NOTBOL)
+		eflags |= REG_NOTBOL;
+	if (match_options & G_REGEX_MATCH_NOTEOL)
+		eflags |= REG_NOTEOL;
+
+	ret = regexec(regex, string, groups, groups ? *match_info : NULL, eflags) == 0;
+	if (!ret) {
+		/* regexec() leaves the array unspecified on failure and the
+		   zeroed entries would look like a match at offset 0 */
+		for (i = 0; i < groups; i++) {
+			(*match_info)[i].rm_so = -1;
+			(*match_info)[i].rm_eo = -1;
+		}
+	}
+	return ret;
+}
+
+/* Stores the byte offsets of group match_num in start_pos and end_pos
+   (either may be NULL). match_num is not checked against the number of
+   groups in the expression. new_string is unused. */
+gboolean
+i_match_info_fetch_pos (const MatchInfo *match_info,
+                        gint match_num,
+                        gint *start_pos,
+                        gint *end_pos,
+                        const gchar *new_string)
+{
+	g_return_val_if_fail(match_info != NULL, FALSE);
+	g_return_val_if_fail(match_num >= 0, FALSE);
+
+	if (start_pos != NULL)
+		*start_pos = match_info[match_num].rm_so;
+	if (end_pos != NULL)
+		*end_pos = match_info[match_num].rm_eo;
+
+	return TRUE;
+}
+
+/* Returns TRUE if the match that filled match_info succeeded. */
+gboolean
+i_match_info_matches (const MatchInfo *match_info)
+{
+	g_return_val_if_fail(match_info != NULL, FALSE);
+
+	return match_info[0].rm_so != -1;
+}
+
+/* Frees the array allocated by i_regex_match. */
+void
+i_match_info_free (MatchInfo *match_info)
+{
+	g_free(match_info);
+}
diff --git a/src/core/iregex.h b/src/core/iregex.h
new file mode 100644
index 00000000..adeea987
--- /dev/null
+++ b/src/core/iregex.h
@@ -0,0 +1,60 @@
+#ifndef __IREGEX_H
+#define __IREGEX_H
+
+#include "common.h"
+
+#ifdef USE_GREGEX
+
+#include <glib.h>
+typedef GRegex Regex;
+typedef GMatchInfo MatchInfo;
+
+#define i_match_info_matches g_match_info_matches
+#define i_match_info_free g_match_info_free
+
+#else
+
+#include <regex.h>
+typedef regex_t Regex;
+typedef regmatch_t MatchInfo;
+
+/* TRUE if the match that filled match_info succeeded */
+gboolean
+i_match_info_matches (const MatchInfo *match_info);
+
+/* frees the match_info stored by i_regex_match */
+void
+i_match_info_free (MatchInfo *match_info);
+
+#endif
+
+/* compiles pattern, returns NULL and sets error on failure */
+Regex *
+i_regex_new (const gchar *pattern,
+             GRegexCompileFlags compile_options,
+             GRegexMatchFlags match_options,
+             GError **error);
+
+/* releases a Regex returned by i_regex_new */
+void
+i_regex_unref (Regex *regex);
+
+/* matches string against regex; match_info and new_string may be NULL,
+   otherwise free them with i_match_info_free and g_free */
+gboolean
+i_regex_match (const Regex *regex,
+               const gchar *string,
+               GRegexMatchFlags match_options,
+               MatchInfo **match_info,
+               const gchar **new_string);
+
+/* fetches the byte offsets of group match_num; pass the new_string
+   handed back by i_regex_match so they refer to the original text */
+gboolean
+i_match_info_fetch_pos (const MatchInfo *match_info,
+                        gint match_num,
+                        gint *start_pos,
+                        gint *end_pos,
+                        const gchar *new_string);
+
+#endif
diff --git a/src/core/misc.c b/src/core/misc.c
index 0f038cbb..4b1e72f6 100644
--- a/src/core/misc.c
+++ b/src/core/misc.c
@@ -22,10 +22,6 @@
 #include "misc.h"
 #include "commands.h"
 
-#ifndef USE_GREGEX
-# include <regex.h>
-#endif
-
 typedef struct {
 	int condition;
 	GInputFunction function;
diff --git a/src/fe-common/core/fe-ignore.c 
b/src/fe-common/core/fe-ignore.c index 800e881d..03fd4dd2 100644 --- a/src/fe-common/core/fe-ignore.c +++ b/src/fe-common/core/fe-ignore.c @@ -58,13 +58,8 @@ static void ignore_print(int index, IGNORE_REC *rec) g_string_append(options, "-regexp "); if (rec->pattern == NULL) g_string_append(options, "[INVALID! -pattern missing] "); -#ifdef USE_GREGEX else if (rec->preg == NULL) g_string_append(options, "[INVALID!] "); -#else - else if (!rec->regexp_compiled) - g_string_append(options, "[INVALID!] "); -#endif } if (rec->fullword) g_string_append(options, "-full "); if (rec->replies) g_string_append(options, "-replies "); diff --git a/src/fe-common/core/hilight-text.c b/src/fe-common/core/hilight-text.c index dd38be87..6a2c97dc 100644 --- a/src/fe-common/core/hilight-text.c +++ b/src/fe-common/core/hilight-text.c @@ -26,6 +26,7 @@ #include "misc.h" #include "lib-config/iconfig.h" #include "settings.h" +#include "iregex.h" #include "servers.h" #include "channels.h" @@ -101,11 +102,7 @@ static void hilight_destroy(HILIGHT_REC *rec) { g_return_if_fail(rec != NULL); -#ifdef USE_GREGEX - if (rec->preg != NULL) g_regex_unref(rec->preg); -#else - if (rec->regexp_compiled) regfree(&rec->preg); -#endif + if (rec->preg != NULL) i_regex_unref(rec->preg); if (rec->channels != NULL) g_strfreev(rec->channels); g_free_not_null(rec->color); g_free_not_null(rec->act_color); @@ -122,19 +119,10 @@ static void hilights_destroy_all(void) static void hilight_init_rec(HILIGHT_REC *rec) { -#ifdef USE_GREGEX if (rec->preg != NULL) - g_regex_unref(rec->preg); + i_regex_unref(rec->preg); - rec->preg = g_regex_new(rec->text, G_REGEX_OPTIMIZE | G_REGEX_RAW | G_REGEX_CASELESS, 0, NULL); -#else - if (rec->regexp_compiled) regfree(&rec->preg); - if (!rec->regexp) - rec->regexp_compiled = FALSE; - else - rec->regexp_compiled = regcomp(&rec->preg, rec->text, - rec->case_sensitive ? 
REG_EXTENDED : (REG_EXTENDED|REG_ICASE)) == 0; -#endif + rec->preg = i_regex_new(rec->text, G_REGEX_OPTIMIZE | G_REGEX_CASELESS, 0, NULL); } void hilight_create(HILIGHT_REC *rec) @@ -207,30 +195,18 @@ static gboolean hilight_match_text(HILIGHT_REC *rec, const char *text, gboolean ret = FALSE; if (rec->regexp) { -#ifdef USE_GREGEX if (rec->preg != NULL) { - GMatchInfo *match; + MatchInfo *match; + const char *new_text = NULL; - g_regex_match (rec->preg, text, 0, &match); + i_regex_match(rec->preg, text, 0, &match, &new_text); - if (g_match_info_matches(match)) - ret = g_match_info_fetch_pos(match, 0, match_beg, match_end); + if (i_match_info_matches(match)) + ret = i_match_info_fetch_pos(match, 0, match_beg, match_end, new_text); - g_match_info_free(match); + i_match_info_free(match); + g_free_not_null((char *)new_text); } -#else - regmatch_t rmatch[1]; - - if (rec->regexp_compiled && - regexec(&rec->preg, text, 1, rmatch, 0) == 0) { - if (rmatch[0].rm_so > 0 && - match_beg != NULL && match_end != NULL) { - *match_beg = rmatch[0].rm_so; - *match_end = rmatch[0].rm_eo; - } - ret = TRUE; - } -#endif } else { char *match; @@ -524,13 +500,8 @@ static void hilight_print(int index, HILIGHT_REC *rec) if (rec->case_sensitive) g_string_append(options, "-matchcase "); if (rec->regexp) { g_string_append(options, "-regexp "); -#ifdef USE_GREGEX if (rec->preg == NULL) g_string_append(options, "[INVALID!] "); -#else - if (!rec->regexp_compiled) - g_string_append(options, "[INVALID!] 
"); -#endif } if (rec->priority != 0) diff --git a/src/fe-common/core/hilight-text.h b/src/fe-common/core/hilight-text.h index 76beec1f..1d942f29 100644 --- a/src/fe-common/core/hilight-text.h +++ b/src/fe-common/core/hilight-text.h @@ -1,10 +1,7 @@ #ifndef __HILIGHT_TEXT_H #define __HILIGHT_TEXT_H -#ifndef USE_GREGEX -# include -#endif - +#include "iregex.h" #include "formats.h" struct _HILIGHT_REC { @@ -24,12 +21,7 @@ struct _HILIGHT_REC { unsigned int fullword:1; /* match `text' only for full words */ unsigned int regexp:1; /* `text' is a regular expression */ unsigned int case_sensitive:1;/* `text' must match case */ -#ifdef USE_GREGEX - GRegex *preg; -#else - unsigned int regexp_compiled:1; /* should always be TRUE, unless regexp is invalid */ - regex_t preg; -#endif + Regex *preg; char *servertag; }; diff --git a/src/fe-text/textbuffer.c b/src/fe-text/textbuffer.c index 3668f4c7..eb841096 100644 --- a/src/fe-text/textbuffer.c +++ b/src/fe-text/textbuffer.c @@ -24,13 +24,10 @@ #include "misc.h" #include "formats.h" #include "utf8.h" +#include "iregex.h" #include "textbuffer.h" -#ifndef USE_GREGEX -# include -#endif - #define TEXT_CHUNK_USABLE_SIZE (LINE_TEXT_CHUNK_SIZE-2-(int)sizeof(char*)) TEXT_BUFFER_REC *textbuffer_create(void) @@ -545,11 +542,7 @@ GList *textbuffer_find_text(TEXT_BUFFER_REC *buffer, LINE_REC *startline, int before, int after, int regexp, int fullword, int case_sensitive) { -#ifdef USE_GREGEX - GRegex *preg; -#else - regex_t preg; -#endif + Regex *preg; LINE_REC *line, *pre_line; GList *matches; GString *str; @@ -559,23 +552,14 @@ GList *textbuffer_find_text(TEXT_BUFFER_REC *buffer, LINE_REC *startline, g_return_val_if_fail(buffer != NULL, NULL); g_return_val_if_fail(text != NULL, NULL); -#ifdef USE_GREGEX preg = NULL; if (regexp) { - preg = g_regex_new(text, G_REGEX_RAW | (case_sensitive ? 0 : G_REGEX_CASELESS), 0, NULL); + preg = i_regex_new(text, case_sensitive ? 
0 : G_REGEX_CASELESS, 0, NULL); if (preg == NULL) return NULL; } -#else - if (regexp) { - int flags = REG_EXTENDED | REG_NOSUB | - (case_sensitive ? 0 : REG_ICASE); - if (regcomp(&preg, text, flags) != 0) - return NULL; - } -#endif matches = NULL; match_after = 0; str = g_string_new(NULL); @@ -592,17 +576,16 @@ GList *textbuffer_find_text(TEXT_BUFFER_REC *buffer, LINE_REC *startline, (line->info.level & nolevel) == 0; if (*text != '\0') { + const char *tmp = NULL; textbuffer_line2text(line, FALSE, str); if (line_matched) { line_matched = regexp ? -#ifdef USE_GREGEX - g_regex_match(preg, str->str, 0, NULL) -#else - regexec(&preg, str->str, 0, NULL, 0) == 0 -#endif + i_regex_match(preg, str->str, 0, NULL, &tmp) : match_func(str->str, text) != NULL; } + if (tmp && tmp != str->str) + g_free_not_null((char *)tmp); } if (line_matched) { @@ -631,12 +614,8 @@ GList *textbuffer_find_text(TEXT_BUFFER_REC *buffer, LINE_REC *startline, } } -#ifdef USE_GREGEX if (preg != NULL) - g_regex_unref(preg); -#else - if (regexp) regfree(&preg); -#endif + i_regex_unref(preg); g_string_free(str, TRUE); return matches; }