1
0
mirror of https://github.com/irssi/irssi.git synced 2024-09-01 04:14:16 -04:00

Refactor regex and implement UTF8 mode for GRegex

- with non-unicode byte to Private Use Area A mapping
- move all ifdefs to iregex.h file only
This commit is contained in:
ailin-nemui 2017-02-16 22:48:13 +01:00
parent 31b9d115b0
commit 79bbca4644
12 changed files with 327 additions and 132 deletions

View File

@ -505,6 +505,7 @@ AM_CONDITIONAL(BUILD_IRSSIBOT, test "$want_irssibot" = "yes")
AM_CONDITIONAL(BUILD_IRSSIFUZZER, test "$want_irssifuzzer" = "yes")
AM_CONDITIONAL(BUILD_IRSSIPROXY, test "$want_irssiproxy" = "yes")
AM_CONDITIONAL(HAVE_PERL, test "$want_perl" != "no")
AM_CONDITIONAL(USE_GREGEX, test "x$want_gregex" = "xyes")
# move LIBS to PROG_LIBS so they're not tried to be used when linking eg. perl libraries
PROG_LIBS=$LIBS

View File

@ -7,6 +7,12 @@ AM_CPPFLAGS = \
-DSYSCONFDIR=\""$(sysconfdir)"\" \
-DMODULEDIR=\""$(libdir)/irssi/modules"\"
if USE_GREGEX
regex_impl=iregex-gregex.c
else
regex_impl=iregex-regexh.c
endif
libcore_a_SOURCES = \
args.c \
channels.c \
@ -45,6 +51,7 @@ libcore_a_SOURCES = \
signals.c \
special-vars.c \
utf8.c \
$(regex_impl) \
wcwidth.c \
tls.c \
write-buffer.c
@ -97,6 +104,7 @@ pkginc_core_HEADERS = \
signals.h \
special-vars.h \
utf8.h \
iregex.h \
window-item-def.h \
tls.h \
write-buffer.h \

View File

@ -24,6 +24,7 @@
#include "levels.h"
#include "lib-config/iconfig.h"
#include "settings.h"
#include "iregex.h"
#include "masks.h"
#include "servers.h"
@ -67,13 +68,8 @@ static int ignore_match_pattern(IGNORE_REC *rec, const char *text)
return FALSE;
if (rec->regexp) {
#ifdef USE_GREGEX
return rec->preg != NULL &&
g_regex_match(rec->preg, text, 0, NULL);
#else
return rec->regexp_compiled &&
regexec(&rec->preg, text, 0, NULL, 0) == 0;
#endif
i_regex_match(rec->preg, text, 0, NULL, NULL);
}
return rec->fullword ?
@ -327,41 +323,19 @@ static void ignore_remove_config(IGNORE_REC *rec)
static void ignore_init_rec(IGNORE_REC *rec)
{
#ifdef USE_GREGEX
if (rec->preg != NULL)
g_regex_unref(rec->preg);
i_regex_unref(rec->preg);
if (rec->regexp && rec->pattern != NULL) {
GError *re_error = NULL;
rec->preg = g_regex_new(rec->pattern, G_REGEX_OPTIMIZE | G_REGEX_RAW | G_REGEX_CASELESS, 0, &re_error);
rec->preg = i_regex_new(rec->pattern, G_REGEX_OPTIMIZE | G_REGEX_CASELESS, 0, &re_error);
if (rec->preg == NULL) {
g_warning("Failed to compile regexp '%s': %s", rec->pattern, re_error->message);
g_error_free(re_error);
}
}
#else
char *errbuf;
int errcode, errbuf_len;
if (rec->regexp_compiled) regfree(&rec->preg);
rec->regexp_compiled = FALSE;
if (rec->regexp && rec->pattern != NULL) {
errcode = regcomp(&rec->preg, rec->pattern,
REG_EXTENDED|REG_ICASE|REG_NOSUB);
if (errcode != 0) {
errbuf_len = regerror(errcode, &rec->preg, 0, 0);
errbuf = g_malloc(errbuf_len);
regerror(errcode, &rec->preg, errbuf, errbuf_len);
g_warning("Failed to compile regexp '%s': %s", rec->pattern, errbuf);
g_free(errbuf);
} else {
rec->regexp_compiled = TRUE;
}
}
#endif
}
void ignore_add_rec(IGNORE_REC *rec)
@ -381,11 +355,7 @@ static void ignore_destroy(IGNORE_REC *rec, int send_signal)
if (send_signal)
signal_emit("ignore destroyed", 1, rec);
#ifdef USE_GREGEX
if (rec->preg != NULL) g_regex_unref(rec->preg);
#else
if (rec->regexp_compiled) regfree(&rec->preg);
#endif
if (rec->preg != NULL) i_regex_unref(rec->preg);
if (rec->channels != NULL) g_strfreev(rec->channels);
g_free_not_null(rec->mask);
g_free_not_null(rec->servertag);

View File

@ -1,9 +1,7 @@
#ifndef __IGNORE_H
#define __IGNORE_H
#ifndef USE_GREGEX
# include <regex.h>
#endif
#include "iregex.h"
typedef struct _IGNORE_REC IGNORE_REC;
@ -20,12 +18,7 @@ struct _IGNORE_REC {
unsigned int regexp:1;
unsigned int fullword:1;
unsigned int replies:1; /* ignore replies to nick in channel */
#ifdef USE_GREGEX
GRegex *preg;
#else
unsigned int regexp_compiled:1; /* should always be TRUE, unless regexp is invalid */
regex_t preg;
#endif
Regex *preg;
};
extern GSList *ignores;

137
src/core/iregex-gregex.c Normal file
View File

@ -0,0 +1,137 @@
#include <string.h>
#include "iregex.h"
/*
 * Return TEXT in a form that passes UTF-8 validation.
 *
 * When TEXT already validates it is returned unchanged and *free_ret
 * (if given) is set to FALSE.  Otherwise a newly allocated copy is
 * returned in which every byte that does not start a valid sequence is
 * replaced by the code point 0xfff00 | byte, and *free_ret (if given)
 * is set to TRUE: the caller then owns the returned string.
 */
const gchar *
make_valid_utf8(const gchar *text, gboolean *free_ret)
{
	GString *out;
	const gchar *cursor;
	gboolean already_valid;

	already_valid = g_utf8_validate(text, -1, NULL);
	if (free_ret != NULL)
		*free_ret = !already_valid;
	if (already_valid)
		return text;

	out = g_string_sized_new(strlen(text) + 12);

	for (cursor = text; *cursor != '\0'; ) {
		gunichar ch = g_utf8_get_char_validated(cursor, -1);

		if (ch != (gunichar)-1 && ch != (gunichar)-2) {
			/* well-formed sequence: copy it through as is */
			g_string_append_unichar(out, ch);
			cursor = g_utf8_next_char(cursor);
			continue;
		}

		/* invalid or truncated: encode this single byte into PUA-A */
		g_string_append_unichar(out, (gunichar) (0xfff00 | (*cursor & 0xff)));
		cursor++;
	}

	/* hand the character data to the caller, drop only the wrapper */
	return g_string_free(out, FALSE);
}
/*
 * Compile PATTERN with g_regex_new().  The pattern is first passed
 * through make_valid_utf8(); any temporary copy made for that purpose
 * is released before returning.  Returns whatever g_regex_new()
 * returned, with ERROR filled in by it.
 */
Regex *
i_regex_new (const gchar *pattern,
             GRegexCompileFlags compile_options,
             GRegexMatchFlags match_options,
             GError **error)
{
	gboolean owns_copy;
	const gchar *utf8_pattern = make_valid_utf8(pattern, &owns_copy);
	Regex *compiled = g_regex_new(utf8_pattern, compile_options, match_options, error);

	if (owns_copy)
		g_free_not_null((gchar *)utf8_pattern);

	return compiled;
}
/* Release a compiled regex by forwarding to g_regex_unref(), so that
   callers do not need backend-specific code. */
void
i_regex_unref (Regex *regex)
{
g_regex_unref(regex);
}
/*
 * Match STRING against REGEX via g_regex_match().
 *
 * STRING is first passed through make_valid_utf8().  If that produced a
 * converted copy:
 *   - with NEW_STRING given, the copy is stored in *new_string and the
 *     caller must free it (and should pass it to i_match_info_fetch_pos
 *     so offsets can be mapped back to the original bytes);
 *   - with NEW_STRING NULL, the copy is freed here, so
 *     g_match_info_get_string must not be used on *match_info.
 * If no copy was needed, *new_string is set to NULL, so the caller can
 * free it unconditionally without having pre-initialised it.
 *
 * Returns the result of g_regex_match().
 */
gboolean
i_regex_match (const Regex *regex,
               const gchar *string,
               GRegexMatchFlags match_options,
               MatchInfo **match_info,
               const gchar **new_string)
{
	gboolean ret;
	gboolean free_valid_string;
	const gchar *valid_string;

	/* never leave the out-parameter indeterminate */
	if (new_string != NULL)
		*new_string = NULL;

	valid_string = make_valid_utf8(string, &free_valid_string);

	ret = g_regex_match(regex, valid_string, match_options, match_info);

	if (free_valid_string) {
		if (new_string != NULL)
			*new_string = valid_string; /* ownership moves to the caller */
		else
			g_free_not_null((gchar *)valid_string);
	}

	return ret;
}
/*
 * Length of STR measured the "odd" way: a code point whose bits match
 * 0xfff00 (the form make_valid_utf8() uses for an escaped byte) counts
 * as 1, every other character counts as its encoded byte length.
 * STR is read with g_utf8_get_char(), i.e. it is expected to be valid
 * UTF-8.
 */
gsize
strlen_pua_oddly(const char *str)
{
	gsize total = 0;
	const gchar *cur = str;

	while (*cur != '\0') {
		gunichar ch = g_utf8_get_char(cur);
		const gchar *next = g_utf8_next_char(cur);

		if ((ch & 0xfff00) == 0xfff00) {
			/* it is our PUA encoded byte */
			total++;
		} else {
			total += next - cur;
		}
		cur = next;
	}

	return total;
}
gboolean
i_match_info_fetch_pos (const MatchInfo *match_info,
gint match_num,
gint *start_pos,
gint *end_pos,
const gchar *new_string)
{
gint tmp_start, tmp_end, new_start_pos;
gboolean ret;
if (!new_string || (!start_pos && !end_pos))
return g_match_info_fetch_pos(match_info, match_num, start_pos, end_pos);
ret = g_match_info_fetch_pos(match_info, match_num, &tmp_start, &tmp_end);
if (start_pos || end_pos) {
gchar *to_start = g_strndup(new_string, tmp_start);
new_start_pos = strlen_pua_oddly(to_start);
g_free_not_null(to_start);
if (start_pos)
*start_pos = new_start_pos;
if (end_pos) {
gchar *to_end = g_strndup(new_string + tmp_start, tmp_end - tmp_start);
*end_pos = new_start_pos + strlen_pua_oddly(to_end);
g_free_not_null(to_end);
}
}
return ret;
}

101
src/core/iregex-regexh.c Normal file
View File

@ -0,0 +1,101 @@
#include "iregex.h"
/*
 * Compile PATTERN as a POSIX extended regular expression.
 *
 * Only G_REGEX_CASELESS (-> REG_ICASE) and G_REGEX_MULTILINE
 * (-> REG_NEWLINE) of COMPILE_OPTIONS are honoured.
 *
 * MATCH_OPTIONS is ignored here: REG_NOTBOL and REG_NOTEOL are
 * regexec() flags, not regcomp() flags, and their bit values may
 * coincide with unrelated compile flags.  Pass such options to
 * i_regex_match() instead.
 *
 * Returns a newly allocated Regex to be released with i_regex_unref(),
 * or NULL with ERROR set (domain G_REGEX_ERROR, code = regcomp() error
 * code) when the pattern does not compile.
 */
Regex *
i_regex_new (const gchar *pattern,
             GRegexCompileFlags compile_options,
             GRegexMatchFlags match_options,
             GError **error)
{
	Regex *regex;
	char *errbuf;
	int cflags;
	int errcode;
	size_t errbuf_len;

	(void) match_options; /* execution-time only, see above */

	regex = g_new0(Regex, 1);

	cflags = REG_EXTENDED;
	if (compile_options & G_REGEX_CASELESS)
		cflags |= REG_ICASE;
	if (compile_options & G_REGEX_MULTILINE)
		cflags |= REG_NEWLINE;

	errcode = regcomp(regex, pattern, cflags);
	if (errcode != 0) {
		/* first call only asks for the required buffer size */
		errbuf_len = regerror(errcode, regex, 0, 0);
		errbuf = g_malloc(errbuf_len);
		regerror(errcode, regex, errbuf, errbuf_len);
		g_set_error(error, G_REGEX_ERROR, errcode, "%s", errbuf);
		g_free(errbuf);
		g_free(regex);
		return NULL;
	}

	return regex;
}
/*
 * Release a Regex created by i_regex_new(): frees the compiled
 * program with regfree() and then the structure itself.
 * A NULL REGEX is rejected with a warning instead of being handed to
 * regfree().
 */
void
i_regex_unref (Regex *regex)
{
	g_return_if_fail(regex != NULL);

	regfree(regex);
	g_free(regex);
}
/*
 * Match STRING against REGEX with regexec().
 *
 * G_REGEX_MATCH_NOTBOL / G_REGEX_MATCH_NOTEOL in MATCH_OPTIONS are
 * translated to REG_NOTBOL / REG_NOTEOL.
 *
 * If MATCH_INFO is given, *match_info receives a newly allocated array
 * of 1 + re_nsub entries, to be released with i_match_info_free().
 * When nothing matched, every entry is set to -1/-1 so that
 * i_match_info_matches() reports FALSE.  *match_info is NULL if REGEX
 * itself was NULL.
 *
 * This backend never converts the subject, so *new_string (if given)
 * is always set to NULL.
 *
 * Returns TRUE when the pattern matched.
 */
gboolean
i_regex_match (const Regex *regex,
               const gchar *string,
               GRegexMatchFlags match_options,
               MatchInfo **match_info,
               const gchar **new_string)
{
	int groups = 0;
	int eflags = 0;
	int i;
	gboolean matched;

	/* give the out-parameters a defined value on every path */
	if (match_info != NULL)
		*match_info = NULL;
	if (new_string != NULL)
		*new_string = NULL;

	g_return_val_if_fail(regex != NULL, FALSE);

	if (match_info != NULL) {
		groups = 1 + regex->re_nsub;
		*match_info = g_new0(MatchInfo, groups);
	}

	if (match_options & G_REGEX_MATCH_NOTBOL)
		eflags |= REG_NOTBOL;
	if (match_options & G_REGEX_MATCH_NOTEOL)
		eflags |= REG_NOTEOL;

	matched = regexec(regex, string, groups, groups ? *match_info : NULL, eflags) == 0;

	if (!matched) {
		/* the array was zero-filled; 0/0 would look like an
		   empty match at offset 0 */
		for (i = 0; i < groups; i++) {
			(*match_info)[i].rm_so = -1;
			(*match_info)[i].rm_eo = -1;
		}
	}

	return matched;
}
/*
 * Copy the rm_so / rm_eo offsets of entry MATCH_NUM into whichever of
 * START_POS and END_POS is non-NULL.  NEW_STRING is not used by this
 * backend.  Always returns TRUE.
 */
gboolean
i_match_info_fetch_pos (const MatchInfo *match_info,
                        gint match_num,
                        gint *start_pos,
                        gint *end_pos,
                        const gchar *new_string)
{
	if (start_pos)
		*start_pos = (match_info + match_num)->rm_so;

	if (end_pos)
		*end_pos = (match_info + match_num)->rm_eo;

	return TRUE;
}
gboolean
i_match_info_matches (const MatchInfo *match_info)
{
g_return_val_if_fail(match_info != NULL, FALSE);
return match_info[0].rm_so != -1;
}
/* Release a MatchInfo array (i_regex_match() allocates it with
   g_new0) by handing it to g_free(). */
void
i_match_info_free (MatchInfo *match_info)
{
g_free(match_info);
}

52
src/core/iregex.h Normal file
View File

@ -0,0 +1,52 @@
/*
 * iregex.h: one regex API over two backends.
 *
 * With USE_GREGEX the types map to GLib's GRegex / GMatchInfo,
 * otherwise to POSIX regex_t / regmatch_t.  Callers use the i_*
 * functions below and need no #ifdefs of their own.
 */
#ifndef __IREGEX_H
#define __IREGEX_H

#include "common.h"

#ifdef USE_GREGEX

#include <glib.h>

typedef GRegex Regex;
typedef GMatchInfo MatchInfo;

/* the GLib backend can use the GMatchInfo functions directly */
#define i_match_info_matches g_match_info_matches
#define i_match_info_free g_match_info_free

#else

#include <regex.h>

typedef regex_t Regex;
typedef regmatch_t MatchInfo;

/* TRUE if MATCH_INFO (as filled in by i_regex_match) holds a match */
gboolean
i_match_info_matches (const MatchInfo *match_info);

/* release the MatchInfo handed out by i_regex_match */
void
i_match_info_free (MatchInfo *match_info);

#endif

/* Compile PATTERN; returns NULL and sets ERROR if it does not compile.
   Release the result with i_regex_unref. */
Regex *
i_regex_new (const gchar *pattern,
             GRegexCompileFlags compile_options,
             GRegexMatchFlags match_options,
             GError **error);

/* Release a regex obtained from i_regex_new. */
void
i_regex_unref (Regex *regex);

/* Match STRING against REGEX.
   MATCH_INFO, if non-NULL, receives match data to be released with
   i_match_info_free.
   NEW_STRING, if non-NULL, may receive a converted copy of STRING that
   the caller must free and should pass on to i_match_info_fetch_pos. */
gboolean
i_regex_match (const Regex *regex,
               const gchar *string,
               GRegexMatchFlags match_options,
               MatchInfo **match_info,
               const gchar **new_string);

/* Fetch the start/end offsets of group MATCH_NUM.  NEW_STRING is the
   value i_regex_match stored in its new_string argument (may be NULL). */
gboolean
i_match_info_fetch_pos (const MatchInfo *match_info,
                        gint match_num,
                        gint *start_pos,
                        gint *end_pos,
                        const gchar *new_string);

#endif

View File

@ -22,10 +22,6 @@
#include "misc.h"
#include "commands.h"
#ifndef USE_GREGEX
# include <regex.h>
#endif
typedef struct {
int condition;
GInputFunction function;

View File

@ -58,13 +58,8 @@ static void ignore_print(int index, IGNORE_REC *rec)
g_string_append(options, "-regexp ");
if (rec->pattern == NULL)
g_string_append(options, "[INVALID! -pattern missing] ");
#ifdef USE_GREGEX
else if (rec->preg == NULL)
g_string_append(options, "[INVALID!] ");
#else
else if (!rec->regexp_compiled)
g_string_append(options, "[INVALID!] ");
#endif
}
if (rec->fullword) g_string_append(options, "-full ");
if (rec->replies) g_string_append(options, "-replies ");

View File

@ -26,6 +26,7 @@
#include "misc.h"
#include "lib-config/iconfig.h"
#include "settings.h"
#include "iregex.h"
#include "servers.h"
#include "channels.h"
@ -101,11 +102,7 @@ static void hilight_destroy(HILIGHT_REC *rec)
{
g_return_if_fail(rec != NULL);
#ifdef USE_GREGEX
if (rec->preg != NULL) g_regex_unref(rec->preg);
#else
if (rec->regexp_compiled) regfree(&rec->preg);
#endif
if (rec->preg != NULL) i_regex_unref(rec->preg);
if (rec->channels != NULL) g_strfreev(rec->channels);
g_free_not_null(rec->color);
g_free_not_null(rec->act_color);
@ -122,19 +119,10 @@ static void hilights_destroy_all(void)
static void hilight_init_rec(HILIGHT_REC *rec)
{
#ifdef USE_GREGEX
if (rec->preg != NULL)
g_regex_unref(rec->preg);
i_regex_unref(rec->preg);
rec->preg = g_regex_new(rec->text, G_REGEX_OPTIMIZE | G_REGEX_RAW | G_REGEX_CASELESS, 0, NULL);
#else
if (rec->regexp_compiled) regfree(&rec->preg);
if (!rec->regexp)
rec->regexp_compiled = FALSE;
else
rec->regexp_compiled = regcomp(&rec->preg, rec->text,
rec->case_sensitive ? REG_EXTENDED : (REG_EXTENDED|REG_ICASE)) == 0;
#endif
rec->preg = i_regex_new(rec->text, G_REGEX_OPTIMIZE | G_REGEX_CASELESS, 0, NULL);
}
void hilight_create(HILIGHT_REC *rec)
@ -207,30 +195,18 @@ static gboolean hilight_match_text(HILIGHT_REC *rec, const char *text,
gboolean ret = FALSE;
if (rec->regexp) {
#ifdef USE_GREGEX
if (rec->preg != NULL) {
GMatchInfo *match;
MatchInfo *match;
const char *new_text = NULL;
g_regex_match (rec->preg, text, 0, &match);
i_regex_match(rec->preg, text, 0, &match, &new_text);
if (g_match_info_matches(match))
ret = g_match_info_fetch_pos(match, 0, match_beg, match_end);
if (i_match_info_matches(match))
ret = i_match_info_fetch_pos(match, 0, match_beg, match_end, new_text);
g_match_info_free(match);
i_match_info_free(match);
g_free_not_null((char *)new_text);
}
#else
regmatch_t rmatch[1];
if (rec->regexp_compiled &&
regexec(&rec->preg, text, 1, rmatch, 0) == 0) {
if (rmatch[0].rm_so > 0 &&
match_beg != NULL && match_end != NULL) {
*match_beg = rmatch[0].rm_so;
*match_end = rmatch[0].rm_eo;
}
ret = TRUE;
}
#endif
} else {
char *match;
@ -524,13 +500,8 @@ static void hilight_print(int index, HILIGHT_REC *rec)
if (rec->case_sensitive) g_string_append(options, "-matchcase ");
if (rec->regexp) {
g_string_append(options, "-regexp ");
#ifdef USE_GREGEX
if (rec->preg == NULL)
g_string_append(options, "[INVALID!] ");
#else
if (!rec->regexp_compiled)
g_string_append(options, "[INVALID!] ");
#endif
}
if (rec->priority != 0)

View File

@ -1,10 +1,7 @@
#ifndef __HILIGHT_TEXT_H
#define __HILIGHT_TEXT_H
#ifndef USE_GREGEX
# include <regex.h>
#endif
#include "iregex.h"
#include "formats.h"
struct _HILIGHT_REC {
@ -24,12 +21,7 @@ struct _HILIGHT_REC {
unsigned int fullword:1; /* match `text' only for full words */
unsigned int regexp:1; /* `text' is a regular expression */
unsigned int case_sensitive:1;/* `text' must match case */
#ifdef USE_GREGEX
GRegex *preg;
#else
unsigned int regexp_compiled:1; /* should always be TRUE, unless regexp is invalid */
regex_t preg;
#endif
Regex *preg;
char *servertag;
};

View File

@ -24,13 +24,10 @@
#include "misc.h"
#include "formats.h"
#include "utf8.h"
#include "iregex.h"
#include "textbuffer.h"
#ifndef USE_GREGEX
# include <regex.h>
#endif
#define TEXT_CHUNK_USABLE_SIZE (LINE_TEXT_CHUNK_SIZE-2-(int)sizeof(char*))
TEXT_BUFFER_REC *textbuffer_create(void)
@ -545,11 +542,7 @@ GList *textbuffer_find_text(TEXT_BUFFER_REC *buffer, LINE_REC *startline,
int before, int after,
int regexp, int fullword, int case_sensitive)
{
#ifdef USE_GREGEX
GRegex *preg;
#else
regex_t preg;
#endif
Regex *preg;
LINE_REC *line, *pre_line;
GList *matches;
GString *str;
@ -559,23 +552,14 @@ GList *textbuffer_find_text(TEXT_BUFFER_REC *buffer, LINE_REC *startline,
g_return_val_if_fail(buffer != NULL, NULL);
g_return_val_if_fail(text != NULL, NULL);
#ifdef USE_GREGEX
preg = NULL;
if (regexp) {
preg = g_regex_new(text, G_REGEX_RAW | (case_sensitive ? 0 : G_REGEX_CASELESS), 0, NULL);
preg = i_regex_new(text, case_sensitive ? 0 : G_REGEX_CASELESS, 0, NULL);
if (preg == NULL)
return NULL;
}
#else
if (regexp) {
int flags = REG_EXTENDED | REG_NOSUB |
(case_sensitive ? 0 : REG_ICASE);
if (regcomp(&preg, text, flags) != 0)
return NULL;
}
#endif
matches = NULL; match_after = 0;
str = g_string_new(NULL);
@ -592,17 +576,16 @@ GList *textbuffer_find_text(TEXT_BUFFER_REC *buffer, LINE_REC *startline,
(line->info.level & nolevel) == 0;
if (*text != '\0') {
const char *tmp = NULL;
textbuffer_line2text(line, FALSE, str);
if (line_matched) {
line_matched = regexp ?
#ifdef USE_GREGEX
g_regex_match(preg, str->str, 0, NULL)
#else
regexec(&preg, str->str, 0, NULL, 0) == 0
#endif
i_regex_match(preg, str->str, 0, NULL, &tmp)
: match_func(str->str, text) != NULL;
}
if (tmp && tmp != str->str)
g_free_not_null((char *)tmp);
}
if (line_matched) {
@ -631,12 +614,8 @@ GList *textbuffer_find_text(TEXT_BUFFER_REC *buffer, LINE_REC *startline,
}
}
#ifdef USE_GREGEX
if (preg != NULL)
g_regex_unref(preg);
#else
if (regexp) regfree(&preg);
#endif
i_regex_unref(preg);
g_string_free(str, TRUE);
return matches;
}