diff --git a/runtime/doc/version9.txt b/runtime/doc/version9.txt
index e8e9194dc5..73e54a1cf3 100644
--- a/runtime/doc/version9.txt
+++ b/runtime/doc/version9.txt
@@ -1,4 +1,4 @@
-*version9.txt*  For Vim version 9.1.  Last change: 2024 Jul 28
+*version9.txt*  For Vim version 9.1.  Last change: 2024 Jul 30
 
 
 		  VIM REFERENCE MANUAL    by Bram Moolenaar
@@ -41590,6 +41590,8 @@ Changed~
   behaviour/inconsistency (see |d-special| and |cw|).
 - allow to specify additional attributes in the completion menu (allows to
   mark deprecated attributes from LSP server) |complete-items|
+- the regex engines match correctly case-insensitive multi-byte characters
+  (and apply proper case folding)
 
 							*added-9.2*
 Added ~
diff --git a/src/mbyte.c b/src/mbyte.c
index a68ba7be3d..d8c47acdd1 100644
--- a/src/mbyte.c
+++ b/src/mbyte.c
@@ -3800,6 +3800,21 @@ utf_strnicmp(
  * Returns zero if s1 and s2 are equal (ignoring case), the difference between
  * two characters otherwise.
  */
+    int
+mb_strnicmp2(char_u *s1, char_u *s2, size_t n1, size_t n2)
+{
+    // Only utf_strnicmp() accepts a separate length for each string, which is
+    // needed when case folding changes the byte length (e.g. 's' and 'ſ').
+    if (n1 == n2 || !enc_utf8)
+	return mb_strnicmp(s1, s2, n1);
+    else
+	return utf_strnicmp(s1, s2, n1, n2);
+}
+
+/*
+ * Compare "s1" and "s2" ignoring case, using the single length "nn" for both.
+ * Returns zero when the strings are equal.
+ */
     int
 mb_strnicmp(char_u *s1, char_u *s2, size_t nn)
 {
diff --git a/src/proto/mbyte.pro b/src/proto/mbyte.pro
index c57c94c8ae..bb976e3bf8 100644
--- a/src/proto/mbyte.pro
+++ b/src/proto/mbyte.pro
@@ -48,6 +48,7 @@ int utf_islower(int a);
 int utf_tolower(int a);
 int utf_isupper(int a);
 int mb_strnicmp(char_u *s1, char_u *s2, size_t nn);
+int mb_strnicmp2(char_u *s1, char_u *s2, size_t n1, size_t n2);
 void show_utf8(void);
 int latin_head_off(char_u *base, char_u *p);
 int dbcs_screen_head_off(char_u *base, char_u *p);
diff --git a/src/regexp.c b/src/regexp.c
index ff201d9ffe..a1b080e7d9 100644
--- a/src/regexp.c
+++ b/src/regexp.c
@@ -1729,7 +1729,9 @@ mb_decompose(int c, int *c1, int *c2, int *c3)
 /*
  * Compare two strings, ignore case if rex.reg_ic set.
  * Return 0 if strings match, non-zero otherwise.
- * Correct the length "*n" when composing characters are ignored.
+ * Correct the length "*n" when composing characters are ignored
+ * or for utf8 when both utf codepoints are considered equal because of
+ * case-folding but have different length (e.g. 's' and 'ſ')
  */
     static int
 cstrncmp(char_u *s1, char_u *s2, int *n)
@@ -1738,6 +1740,29 @@ cstrncmp(char_u *s1, char_u *s2, int *n)
 
     if (!rex.reg_ic)
 	result = STRNCMP(s1, s2, *n);
+    else if (enc_utf8)
+    {
+	char_u *p = s1;
+	size_t n2 = 0;
+	int n1 = *n;
+	// count the number of characters in the first "*n" bytes of s1
+	while (n1 > 0 && *p != NUL)
+	{
+	    n1 -= mb_ptr2len(p);
+	    MB_PTR_ADV(p);
+	    n2++;
+	}
+	// find the byte length of the same number of characters in s2
+	p = s2;
+	while (n2-- > 0 && *p != NUL)
+	    MB_PTR_ADV(p);
+
+	n2 = p - s2;
+
+	result = MB_STRNICMP2(s1, s2, *n, n2);
+	if (result == 0 && (int)n2 < *n)
+	    *n = (int)n2;
+    }
     else
 	result = MB_STRNICMP(s1, s2, *n);
 
@@ -1787,7 +1812,7 @@ cstrncmp(char_u *s1, char_u *s2, int *n)
 cstrchr(char_u *s, int c)
 {
     char_u	*p;
-    int		cc;
+    int		cc, lc;
 
     if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
 	return vim_strchr(s, c);
@@ -1796,26 +1821,35 @@ cstrchr(char_u *s, int c)
     // faster (esp. when using MS Visual C++!).
     // For UTF-8 need to use folded case.
     if (enc_utf8 && c > 0x80)
+    {
 	cc = utf_fold(c);
+	lc = cc;
+    }
     else
-	 if (MB_ISUPPER(c))
-	cc = MB_TOLOWER(c);
-    else if (MB_ISLOWER(c))
-	cc = MB_TOUPPER(c);
-    else
-	return vim_strchr(s, c);
+	if (MB_ISUPPER(c))
+	{
+	    cc = MB_TOLOWER(c);
+	    lc = cc;
+	}
+	else if (MB_ISLOWER(c))
+	{
+	    cc = MB_TOUPPER(c);
+	    lc = c;
+	}
+	else
+	    return vim_strchr(s, c);
 
     if (has_mbyte)
     {
 	for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
 	{
-	    if (enc_utf8 && c > 0x80)
+	    int uc = utf_ptr2char(p);
+	    if (enc_utf8 && (c > 0x80 || uc > 0x80))
 	    {
-		int uc = utf_ptr2char(p);
-
 		// Do not match an illegal byte.  E.g. 0xff matches 0xc3 0xbf,
 		// not 0xff.
-		if ((uc < 0x80 || uc != *p) && utf_fold(uc) == cc)
+		// compare with lower case of the character
+		if ((uc < 0x80 || uc != *p) && utf_fold(uc) == lc)
 		    return p;
 	    }
 	    else if (*p == c || *p == cc)
diff --git a/src/regexp_bt.c b/src/regexp_bt.c
index 5452dda0f6..16dac730de 100644
--- a/src/regexp_bt.c
+++ b/src/regexp_bt.c
@@ -3823,6 +3823,14 @@ regmatch(
 			}
 		    }
 		}
+		else if (enc_utf8)
+		{
+		    if (cstrncmp(opnd, rex.input, &len) != 0)
+		    {
+			status = RA_NOMATCH;
+			break;
+		    }
+		}
 		else
 		    for (i = 0; i < len; ++i)
 			if (opnd[i] != rex.input[i])
diff --git a/src/regexp_nfa.c b/src/regexp_nfa.c
index 4f07a21d5d..6db4134628 100644
--- a/src/regexp_nfa.c
+++ b/src/regexp_nfa.c
@@ -5666,7 +5666,12 @@ find_match_text(colnr_T *startcol, int regstart, char_u *match_text)
     for (;;)
     {
 	match = TRUE;
-	len2 = MB_CHAR2LEN(regstart); // skip regstart
+	// skip regstart
+	len2 = MB_CHAR2LEN(regstart);
+	if (enc_utf8 && len2 > 1 && MB_CHAR2LEN(PTR2CHAR(rex.line + col)) != len2)
+	    // because of case-folding of the previously matched text, we may need
+	    // to skip fewer bytes than mb_char2len(regstart)
+	    len2 = mb_char2len(utf_fold(regstart));
 	for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
 	{
 	    c1 = PTR2CHAR(match_text + len1);
@@ -7502,7 +7507,7 @@ nfa_regexec_both(
 
     // If match_text is set it contains the full text that must match.
     // Nothing else to try. Doesn't handle combining chars well.
-    if (prog->match_text != NULL && !rex.reg_icombine)
+    if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine)
     {
 	retval = find_match_text(&col, prog->regstart, prog->match_text);
 	if (REG_MULTI)
diff --git a/src/testdir/test_regexp_utf8.vim b/src/testdir/test_regexp_utf8.vim
index bc705441e7..51c0984adb 100644
--- a/src/testdir/test_regexp_utf8.vim
+++ b/src/testdir/test_regexp_utf8.vim
@@ -587,4 +587,37 @@ func Test_combining_chars_in_collection()
   bw!
 endfunc
 
+func Test_search_multibyte_match_ascii()
+  new
+  " Match single 'ſ' and 's'
+  call setline(1,  'das abc heraus abc ſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+  endfor
+  " Match several 'ſſ' and 'ss'
+  call setline(1,  'das abc herauss abc ſſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match3 = matchbufline('%', '\c[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match3 = matchbufline('%', '\C[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+
+    call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match3, "Ignorecase Collection Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match3, "No-Ignorecase Collection Regex-engine: " .. &re)
+  endfor
+  set regexpengine&
+  bw!
+endfunc
+
 " vim: shiftwidth=2 sts=2 expandtab
diff --git a/src/version.c b/src/version.c
index a2662861e3..f19e9415e9 100644
--- a/src/version.c
+++ b/src/version.c
@@ -704,6 +704,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    645,
 /**/
     644,
 /**/
diff --git a/src/vim.h b/src/vim.h
index c022f2e7f4..9c1434cc63 100644
--- a/src/vim.h
+++ b/src/vim.h
@@ -1769,6 +1769,7 @@ void *vim_memset(void *, int, size_t);
 
 # define MB_STRICMP(d, s)	mb_strnicmp((char_u *)(d), (char_u *)(s), (int)MAXCOL)
 # define MB_STRNICMP(d, s, n)	mb_strnicmp((char_u *)(d), (char_u *)(s), (int)(n))
+# define MB_STRNICMP2(d, s, n1, n2)	mb_strnicmp2((char_u *)(d), (char_u *)(s), (n1), (n2))
 
 #define STRCAT(d, s)	    strcat((char *)(d), (char *)(s))
 #define STRNCAT(d, s, n)    strncat((char *)(d), (char *)(s), (size_t)(n))