forked from aniani/vim
updated for version 7.3.1011
Problem: New regexp engine is inefficient with multi-byte characters. Solution: Handle a character at a time instead of a byte at a time. Also make \Z partly work.
This commit is contained in:
159
src/regexp_nfa.c
159
src/regexp_nfa.c
@@ -46,9 +46,6 @@ enum
|
|||||||
NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
|
NFA_NCLOSE, /* End of subexpr. marked with \%( ... \) */
|
||||||
NFA_START_INVISIBLE,
|
NFA_START_INVISIBLE,
|
||||||
NFA_END_INVISIBLE,
|
NFA_END_INVISIBLE,
|
||||||
NFA_MULTIBYTE, /* Next nodes in NFA are part of the same
|
|
||||||
multibyte char */
|
|
||||||
NFA_END_MULTIBYTE, /* End of multibyte char in the NFA */
|
|
||||||
NFA_COMPOSING, /* Next nodes in NFA are part of the
|
NFA_COMPOSING, /* Next nodes in NFA are part of the
|
||||||
composing multibyte char */
|
composing multibyte char */
|
||||||
NFA_END_COMPOSING, /* End of a composing char in the NFA */
|
NFA_END_COMPOSING, /* End of a composing char in the NFA */
|
||||||
@@ -195,26 +192,6 @@ static long nfa_regexec_multi __ARGS((regmmatch_T *rmp, win_T *win, buf_T *buf,
|
|||||||
*post_ptr++ = c; \
|
*post_ptr++ = c; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define EMIT_MBYTE(c) \
|
|
||||||
len = (*mb_char2bytes)(c, buf); \
|
|
||||||
EMIT(buf[0]); \
|
|
||||||
for (i = 1; i < len; i++) \
|
|
||||||
{ \
|
|
||||||
EMIT(buf[i]); \
|
|
||||||
EMIT(NFA_CONCAT); \
|
|
||||||
} \
|
|
||||||
EMIT(NFA_MULTIBYTE);
|
|
||||||
|
|
||||||
#define EMIT_COMPOSING_UTF(input) \
|
|
||||||
len = utfc_ptr2len(input); \
|
|
||||||
EMIT(input[0]); \
|
|
||||||
for (i = 1; i < len; i++) \
|
|
||||||
{ \
|
|
||||||
EMIT(input[i]); \
|
|
||||||
EMIT(NFA_CONCAT); \
|
|
||||||
} \
|
|
||||||
EMIT(NFA_COMPOSING);
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Initialize internal variables before NFA compilation.
|
* Initialize internal variables before NFA compilation.
|
||||||
* Return OK on success, FAIL otherwise.
|
* Return OK on success, FAIL otherwise.
|
||||||
@@ -611,8 +588,6 @@ nfa_regatom()
|
|||||||
#ifdef FEAT_MBYTE
|
#ifdef FEAT_MBYTE
|
||||||
char_u *old_regparse = regparse;
|
char_u *old_regparse = regparse;
|
||||||
int clen;
|
int clen;
|
||||||
int len;
|
|
||||||
static char_u buf[30];
|
|
||||||
int i;
|
int i;
|
||||||
#endif
|
#endif
|
||||||
int extra = 0;
|
int extra = 0;
|
||||||
@@ -845,14 +820,7 @@ nfa_regatom()
|
|||||||
return FAIL;
|
return FAIL;
|
||||||
|
|
||||||
c = coll_get_char();
|
c = coll_get_char();
|
||||||
#ifdef FEAT_MBYTE
|
EMIT(c);
|
||||||
if ((*mb_char2len)(c) > 1)
|
|
||||||
{
|
|
||||||
EMIT_MBYTE(c);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
EMIT(c);
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
/* Catch \%^ and \%$ regardless of where they appear in the
|
/* Catch \%^ and \%$ regardless of where they appear in the
|
||||||
@@ -1135,12 +1103,7 @@ collection:
|
|||||||
* skip it. */
|
* skip it. */
|
||||||
for (c = startc + 1; c <= endc; c++)
|
for (c = startc + 1; c <= endc; c++)
|
||||||
{
|
{
|
||||||
if ((*mb_char2len)(c) > 1)
|
EMIT(c);
|
||||||
{
|
|
||||||
EMIT_MBYTE(c);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
EMIT(c);
|
|
||||||
TRY_NEG();
|
TRY_NEG();
|
||||||
EMIT_GLUE();
|
EMIT_GLUE();
|
||||||
}
|
}
|
||||||
@@ -1187,14 +1150,7 @@ collection:
|
|||||||
if (got_coll_char == TRUE && startc == 0)
|
if (got_coll_char == TRUE && startc == 0)
|
||||||
EMIT(0x0a);
|
EMIT(0x0a);
|
||||||
else
|
else
|
||||||
#ifdef FEAT_MBYTE
|
EMIT(startc);
|
||||||
if ((*mb_char2len)(startc) > 1)
|
|
||||||
{
|
|
||||||
EMIT_MBYTE(startc);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
|
||||||
EMIT(startc);
|
|
||||||
TRY_NEG();
|
TRY_NEG();
|
||||||
EMIT_GLUE();
|
EMIT_GLUE();
|
||||||
}
|
}
|
||||||
@@ -1242,30 +1198,30 @@ collection:
|
|||||||
int plen;
|
int plen;
|
||||||
|
|
||||||
nfa_do_multibyte:
|
nfa_do_multibyte:
|
||||||
/* length of current char, with composing chars,
|
/* Length of current char with composing chars. */
|
||||||
* from pointer */
|
if (enc_utf8 && clen != (plen = (*mb_ptr2len)(old_regparse)))
|
||||||
plen = (*mb_ptr2len)(old_regparse);
|
|
||||||
if (enc_utf8 && clen != plen)
|
|
||||||
{
|
{
|
||||||
/* A composing character is always handled as a
|
/* A base character plus composing characters.
|
||||||
* separate atom, surrounded by NFA_COMPOSING and
|
* This requires creating a separate atom as if enclosing
|
||||||
* NFA_END_COMPOSING. Note that right now we are
|
* the characters in (), where NFA_COMPOSING is the ( and
|
||||||
|
* NFA_END_COMPOSING is the ). Note that right now we are
|
||||||
* building the postfix form, not the NFA itself;
|
* building the postfix form, not the NFA itself;
|
||||||
* a composing char could be: a, b, c, NFA_COMPOSING
|
* a composing char could be: a, b, c, NFA_COMPOSING
|
||||||
* where 'a', 'b', 'c' are chars with codes > 256.
|
* where 'b' and 'c' are chars with codes > 256. */
|
||||||
*/
|
i = 0;
|
||||||
EMIT_COMPOSING_UTF(old_regparse);
|
for (;;)
|
||||||
|
{
|
||||||
|
EMIT(c);
|
||||||
|
if (i > 0)
|
||||||
|
EMIT(NFA_CONCAT);
|
||||||
|
/* "+=" binds looser than ">=": parenthesize so i advances by the byte
 * length of c before it is compared with plen. */
if ((i += utf_char2len(c)) >= plen)
|
||||||
|
break;
|
||||||
|
c = utf_ptr2char(old_regparse + i);
|
||||||
|
}
|
||||||
|
EMIT(NFA_COMPOSING);
|
||||||
regparse = old_regparse + plen;
|
regparse = old_regparse + plen;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
/* A multi-byte character is always handled as a
|
|
||||||
* separate atom, surrounded by NFA_MULTIBYTE and
|
|
||||||
* NFA_END_MULTIBYTE */
|
|
||||||
if (plen > 1)
|
|
||||||
{
|
|
||||||
EMIT_MBYTE(c);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
c = no_Magic(c);
|
c = no_Magic(c);
|
||||||
@@ -1702,9 +1658,6 @@ nfa_set_code(c)
|
|||||||
case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
|
case NFA_START_INVISIBLE: STRCPY(code, "NFA_START_INVISIBLE"); break;
|
||||||
case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
|
case NFA_END_INVISIBLE: STRCPY(code, "NFA_END_INVISIBLE"); break;
|
||||||
|
|
||||||
case NFA_MULTIBYTE: STRCPY(code, "NFA_MULTIBYTE"); break;
|
|
||||||
case NFA_END_MULTIBYTE: STRCPY(code, "NFA_END_MULTIBYTE"); break;
|
|
||||||
|
|
||||||
case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
|
case NFA_COMPOSING: STRCPY(code, "NFA_COMPOSING"); break;
|
||||||
case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
|
case NFA_END_COMPOSING: STRCPY(code, "NFA_END_COMPOSING"); break;
|
||||||
|
|
||||||
@@ -2194,7 +2147,7 @@ post2nfa(postfix, end, nfa_calc_size)
|
|||||||
}
|
}
|
||||||
e1 = POP();
|
e1 = POP();
|
||||||
e1.start->negated = TRUE;
|
e1.start->negated = TRUE;
|
||||||
if (e1.start->c == NFA_MULTIBYTE || e1.start->c == NFA_COMPOSING)
|
if (e1.start->c == NFA_COMPOSING)
|
||||||
e1.start->out1->negated = TRUE;
|
e1.start->out1->negated = TRUE;
|
||||||
PUSH(e1);
|
PUSH(e1);
|
||||||
break;
|
break;
|
||||||
@@ -2311,6 +2264,16 @@ post2nfa(postfix, end, nfa_calc_size)
|
|||||||
PUSH(frag(s, list1(&s1->out)));
|
PUSH(frag(s, list1(&s1->out)));
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case NFA_COMPOSING: /* char with composing char */
|
||||||
|
#if 0
|
||||||
|
/* TODO */
|
||||||
|
if (regflags & RF_ICOMBINE)
|
||||||
|
{
|
||||||
|
goto normalchar;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
/* FALLTHROUGH */
|
||||||
|
|
||||||
case NFA_MOPEN + 0: /* Submatch */
|
case NFA_MOPEN + 0: /* Submatch */
|
||||||
case NFA_MOPEN + 1:
|
case NFA_MOPEN + 1:
|
||||||
case NFA_MOPEN + 2:
|
case NFA_MOPEN + 2:
|
||||||
@@ -2322,8 +2285,6 @@ post2nfa(postfix, end, nfa_calc_size)
|
|||||||
case NFA_MOPEN + 8:
|
case NFA_MOPEN + 8:
|
||||||
case NFA_MOPEN + 9:
|
case NFA_MOPEN + 9:
|
||||||
case NFA_NOPEN: /* \%( "Invisible Submatch" */
|
case NFA_NOPEN: /* \%( "Invisible Submatch" */
|
||||||
case NFA_MULTIBYTE: /* mbyte char */
|
|
||||||
case NFA_COMPOSING: /* composing char */
|
|
||||||
if (nfa_calc_size == TRUE)
|
if (nfa_calc_size == TRUE)
|
||||||
{
|
{
|
||||||
nstate += 2;
|
nstate += 2;
|
||||||
@@ -2336,9 +2297,6 @@ post2nfa(postfix, end, nfa_calc_size)
|
|||||||
case NFA_NOPEN:
|
case NFA_NOPEN:
|
||||||
mclose = NFA_NCLOSE;
|
mclose = NFA_NCLOSE;
|
||||||
break;
|
break;
|
||||||
case NFA_MULTIBYTE:
|
|
||||||
mclose = NFA_END_MULTIBYTE;
|
|
||||||
break;
|
|
||||||
case NFA_COMPOSING:
|
case NFA_COMPOSING:
|
||||||
mclose = NFA_END_COMPOSING;
|
mclose = NFA_END_COMPOSING;
|
||||||
break;
|
break;
|
||||||
@@ -2377,9 +2335,8 @@ post2nfa(postfix, end, nfa_calc_size)
|
|||||||
goto theend;
|
goto theend;
|
||||||
patch(e.out, s1);
|
patch(e.out, s1);
|
||||||
|
|
||||||
if (mopen == NFA_MULTIBYTE || mopen == NFA_COMPOSING)
|
if (mopen == NFA_COMPOSING)
|
||||||
/* MULTIBYTE->out1 = END_MULTIBYTE
|
/* COMPOSING->out1 = END_COMPOSING */
|
||||||
* COMPOSING->out1 = END_COMPOSING */
|
|
||||||
patch(list1(&s->out1), s1);
|
patch(list1(&s->out1), s1);
|
||||||
|
|
||||||
PUSH(frag(s, list1(&s1->out)));
|
PUSH(frag(s, list1(&s1->out)));
|
||||||
@@ -2540,17 +2497,8 @@ addstate(l, state, m, off, lid, match)
|
|||||||
case NFA_COMPOSING:
|
case NFA_COMPOSING:
|
||||||
/* nfa_regmatch() will match all the bytes of this composing char. */
|
/* nfa_regmatch() will match all the bytes of this composing char. */
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case NFA_MULTIBYTE:
|
|
||||||
/* nfa_regmatch() will match all the bytes of this multibyte char. */
|
|
||||||
break;
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
case NFA_END_MULTIBYTE:
|
|
||||||
/* Successfully matched this mbyte char */
|
|
||||||
addstate(l, state->out, m, off, lid, match);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case NFA_NOPEN:
|
case NFA_NOPEN:
|
||||||
case NFA_NCLOSE:
|
case NFA_NCLOSE:
|
||||||
addstate(l, state->out, m, off, lid, match);
|
addstate(l, state->out, m, off, lid, match);
|
||||||
@@ -2841,7 +2789,7 @@ nfa_regmatch(start, submatch, m)
|
|||||||
regsub_T *submatch;
|
regsub_T *submatch;
|
||||||
regsub_T *m;
|
regsub_T *m;
|
||||||
{
|
{
|
||||||
int c = -1;
|
int c;
|
||||||
int n;
|
int n;
|
||||||
int i = 0;
|
int i = 0;
|
||||||
int result;
|
int result;
|
||||||
@@ -2859,7 +2807,6 @@ nfa_regmatch(start, submatch, m)
|
|||||||
List *listtbl[2][2];
|
List *listtbl[2][2];
|
||||||
List *ll;
|
List *ll;
|
||||||
int listid = 1;
|
int listid = 1;
|
||||||
int endnode;
|
|
||||||
List *thislist;
|
List *thislist;
|
||||||
List *nextlist;
|
List *nextlist;
|
||||||
List *neglist;
|
List *neglist;
|
||||||
@@ -3190,33 +3137,35 @@ nfa_regmatch(start, submatch, m)
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
case NFA_MULTIBYTE:
|
#ifdef FEAT_MBYTE
|
||||||
case NFA_COMPOSING:
|
case NFA_COMPOSING:
|
||||||
endnode = t->state->c + 1;
|
{
|
||||||
|
int mc = c;
|
||||||
|
|
||||||
result = OK;
|
result = OK;
|
||||||
sta = t->state->out;
|
sta = t->state->out;
|
||||||
len = 1;
|
len = 0;
|
||||||
while (sta->c != endnode && len <= n)
|
while (sta->c != NFA_END_COMPOSING && len < n)
|
||||||
{
|
{
|
||||||
if (reginput[len-1] != sta->c)
|
if (len > 0)
|
||||||
{
|
mc = mb_ptr2char(reginput + len);
|
||||||
result = FAIL;
|
if (mc != sta->c)
|
||||||
break;
|
break;
|
||||||
}
|
len += mb_char2len(mc);
|
||||||
len++;
|
|
||||||
sta = sta->out;
|
sta = sta->out;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if input char length doesn't match regexp char length */
|
/* if input char length doesn't match regexp char length */
|
||||||
if (len -1 < n || sta->c != endnode)
|
if (len < n || sta->c != NFA_END_COMPOSING)
|
||||||
result = FAIL;
|
result = FAIL;
|
||||||
end = t->state->out1; /* NFA_END_MULTIBYTE or
|
end = t->state->out1; /* NFA_END_COMPOSING */
|
||||||
NFA_END_COMPOSING */
|
|
||||||
/* If \Z was present, then ignore composing characters */
|
/* If \Z was present, then ignore composing characters */
|
||||||
if (ireg_icombine && endnode == NFA_END_COMPOSING)
|
if (ireg_icombine)
|
||||||
result = 1 ^ sta->negated;
|
result = 1 ^ sta->negated;
|
||||||
ADD_POS_NEG_STATE(end);
|
ADD_POS_NEG_STATE(end);
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
case NFA_NEWL:
|
case NFA_NEWL:
|
||||||
if (!reg_line_lbr && REG_MULTI
|
if (!reg_line_lbr && REG_MULTI
|
||||||
@@ -3425,6 +3374,14 @@ nfa_regmatch(start, submatch, m)
|
|||||||
if (!result)
|
if (!result)
|
||||||
result = ireg_ic == TRUE
|
result = ireg_ic == TRUE
|
||||||
&& MB_TOLOWER(t->state->c) == MB_TOLOWER(c);
|
&& MB_TOLOWER(t->state->c) == MB_TOLOWER(c);
|
||||||
|
#ifdef FEAT_MBYTE
|
||||||
|
/* If there is a composing character which is not being
|
||||||
|
* ignored there can be no match. Match with composing
|
||||||
|
* character uses NFA_COMPOSING above. */
|
||||||
|
if (result && enc_utf8 && !ireg_icombine
|
||||||
|
&& n != utf_char2len(c))
|
||||||
|
result = FALSE;
|
||||||
|
#endif
|
||||||
ADD_POS_NEG_STATE(t->state);
|
ADD_POS_NEG_STATE(t->state);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@@ -35,6 +35,10 @@ STARTTEST
|
|||||||
:call add(tl, ['\f\+', '&*fname ', 'fname'])
|
:call add(tl, ['\f\+', '&*fname ', 'fname'])
|
||||||
:call add(tl, ['\%#=1\f\+', '&*fname ', 'fname'])
|
:call add(tl, ['\%#=1\f\+', '&*fname ', 'fname'])
|
||||||
|
|
||||||
|
:"""" Test composing character matching
|
||||||
|
:call add(tl, ['.ม', 'xม่x yมy', 'yม'])
|
||||||
|
:call add(tl, ['.ม่', 'xม่x yมy', 'xม่'])
|
||||||
|
|
||||||
:"""" Test \Z
|
:"""" Test \Z
|
||||||
:call add(tl, ['ú\Z', 'x'])
|
:call add(tl, ['ú\Z', 'x'])
|
||||||
|
|
||||||
|
@@ -9,5 +9,7 @@ OK - \i\+
|
|||||||
OK - \%#=1\i\+
|
OK - \%#=1\i\+
|
||||||
OK - \f\+
|
OK - \f\+
|
||||||
OK - \%#=1\f\+
|
OK - \%#=1\f\+
|
||||||
|
OK - .ม
|
||||||
|
OK - .ม่
|
||||||
OK - ú\Z
|
OK - ú\Z
|
||||||
OK - [^[=a=]]\+
|
OK - [^[=a=]]\+
|
||||||
|
@@ -728,6 +728,8 @@ static char *(features[]) =
|
|||||||
|
|
||||||
static int included_patches[] =
|
static int included_patches[] =
|
||||||
{ /* Add new patch number below this line */
|
{ /* Add new patch number below this line */
|
||||||
|
/**/
|
||||||
|
1011,
|
||||||
/**/
|
/**/
|
||||||
1010,
|
1010,
|
||||||
/**/
|
/**/
|
||||||
|
Reference in New Issue
Block a user