0
0
mirror of https://github.com/vim/vim.git synced 2025-10-24 08:54:47 -04:00

patch 9.1.1258: regexp: max \U and \%U value is limited by INT_MAX

Problem:  regexp: max \U and \%U value is limited by INT_MAX but gives a
          confusing error message (related: v8.1.0985).
Solution: give a better error message when the value reaches INT_MAX

When searching, Vim allows specifying up to 8 hex characters using the /\U
and /\%U regex atoms.  However, when using "/\UFFFFFFFF" the code point is
already above what an integer variable can hold, which is 2,147,483,647.

Since patch v8.1.0985, Vim already limited the max codepoint to INT_MAX
(otherwise it caused a crash in the nfa regex engine), but instead of
error'ing out it silently fell back to parse the number as a backslash
value and not as a codepoint value and as such this "/[\UFFFFFFFF]" will
happily find a "\" or a literal "F".  And this "/[\d127-\UFFFFFFFF]"
will error out as "reverse range in character class".

Interestingly, the max Unicode codepoint value is U+10FFFF which still
fits into an ordinary integer value,  which means, that we don't even
need to parse 8 hex characters, but 6 should have been enough.

However, let's not limit Vim to search for only max 6 hex characters
(which would be a backward incompatible change), but instead allow all 8
characters and only if the codepoint reaches INT_MAX, give a more
precise error message (about what the max unicode codepoint value is).
This allows searching for "/[\U7FFFFFFE]" (will likely return "E486
Pattern not found"), while "/[\U7FFFFFFF]" now errors with "E1541: Value
too large, max Unicode codepoint is U+10FFFF".

While this change is straightforward on architectures where long is 8
bytes, this is not so simple on Windows or 32bit architectures where long
is 4 bytes (and therefore the test fails there).  To account for that,
let's make use of the vimlong_T number type and make a few corresponding
changes in the regex engine code and cast the value to the expected data
type. This however may not work correctly on systems that don't have
the long long datatype (e.g. OpenVMS) and probably the test will fail
there.

fixes: #16949
closes: #16994

Signed-off-by: Christian Brabandt <cb@256bit.org>
This commit is contained in:
Christian Brabandt
2025-03-29 09:08:58 +01:00
parent 90e52490b3
commit f2b16986a1
9 changed files with 70 additions and 21 deletions

View File

@@ -1,4 +1,4 @@
*pattern.txt* For Vim version 9.1. Last change: 2025 Mar 21 *pattern.txt* For Vim version 9.1. Last change: 2025 Mar 28
VIM REFERENCE MANUAL by Bram Moolenaar VIM REFERENCE MANUAL by Bram Moolenaar
@@ -1222,7 +1222,8 @@ x A single character, with no special meaning, matches itself
\o40 octal number of character up to 0o377 \o40 octal number of character up to 0o377
\x20 hexadecimal number of character up to 0xff \x20 hexadecimal number of character up to 0xff
\u20AC hex. number of multibyte character up to 0xffff \u20AC hex. number of multibyte character up to 0xffff
\U1234 hex. number of multibyte character up to 0xffffffff \U1234 hex. number of multibyte character up to 8 characters
0xffffffff |E1541|
NOTE: The other backslash codes mentioned above do not work inside NOTE: The other backslash codes mentioned above do not work inside
[]! []!
- Matching with a collection can be slow, because each character in - Matching with a collection can be slow, because each character in
@@ -1263,7 +1264,8 @@ x A single character, with no special meaning, matches itself
\%u20AC Matches the character specified with up to four hexadecimal \%u20AC Matches the character specified with up to four hexadecimal
characters. characters.
\%U1234abcd Matches the character specified with up to eight hexadecimal \%U1234abcd Matches the character specified with up to eight hexadecimal
characters, up to 0x7fffffff characters, up to 0x7fffffff (the maximum allowed value is INT_MAX
|E1541|, but the maximum valid Unicode codepoint is U+10FFFF).
============================================================================== ==============================================================================
7. Ignoring case in a pattern */ignorecase* 7. Ignoring case in a pattern */ignorecase*

View File

@@ -4621,6 +4621,7 @@ E1538 eval.txt /*E1538*
E1539 vim9.txt /*E1539* E1539 vim9.txt /*E1539*
E154 helphelp.txt /*E154* E154 helphelp.txt /*E154*
E1540 eval.txt /*E1540* E1540 eval.txt /*E1540*
E1541 vi_diff.txt /*E1541*
E155 sign.txt /*E155* E155 sign.txt /*E155*
E156 sign.txt /*E156* E156 sign.txt /*E156*
E157 sign.txt /*E157* E157 sign.txt /*E157*

View File

@@ -1,4 +1,4 @@
*vi_diff.txt* For Vim version 9.1. Last change: 2024 Nov 10 *vi_diff.txt* For Vim version 9.1. Last change: 2025 Mar 28
VIM REFERENCE MANUAL by Bram Moolenaar VIM REFERENCE MANUAL by Bram Moolenaar
@@ -91,8 +91,11 @@ Maximum display width Unix and Win32: 1024 characters, otherwise 255
Maximum lhs of a mapping 50 characters. Maximum lhs of a mapping 50 characters.
Number of different highlighting types: over 30000 Number of different highlighting types: over 30000
Range of a Number variable: -2147483648 to 2147483647 (might be more on 64 Range of a Number variable: -2147483648 to 2147483647 (might be more on 64
bit systems) bit systems) See also: |v:numbermax|,
|v:numbermin| and |v:numbersize|
Maximum length of a line in a tags file: 512 bytes. Maximum length of a line in a tags file: 512 bytes.
*E1541*
Maximum value for |/\U| and |/\%U|: 2147483647 (for 32bit integer).
Information for undo and text in registers is kept in memory, thus when making Information for undo and text in registers is kept in memory, thus when making
(big) changes the amount of (virtual) memory available limits the number of (big) changes the amount of (virtual) memory available limits the number of

View File

@@ -3716,3 +3716,5 @@ EXTERN char e_variadic_tuple_must_end_with_list_type_str[]
EXTERN char e_cannot_use_variadic_tuple_in_concatenation[] EXTERN char e_cannot_use_variadic_tuple_in_concatenation[]
INIT(= N_("E1540: Cannot use a variadic tuple in concatenation")); INIT(= N_("E1540: Cannot use a variadic tuple in concatenation"));
#endif #endif
EXTERN char e_unicode_val_too_large[]
INIT(= N_("E1541: Value too large, max Unicode codepoint is U+10FFFF"));

View File

@@ -427,9 +427,9 @@ static void skipchr_keepstart(void);
static int peekchr(void); static int peekchr(void);
static void skipchr(void); static void skipchr(void);
static void ungetchr(void); static void ungetchr(void);
static long gethexchrs(int maxinputlen); static vimlong_T gethexchrs(int maxinputlen);
static long getoctchrs(void); static long getoctchrs(void);
static long getdecchrs(void); static vimlong_T getdecchrs(void);
static int coll_get_char(void); static int coll_get_char(void);
static int prog_magic_wrong(void); static int prog_magic_wrong(void);
static int cstrncmp(char_u *s1, char_u *s2, int *n); static int cstrncmp(char_u *s1, char_u *s2, int *n);
@@ -979,7 +979,7 @@ ungetchr(void)
* The parameter controls the maximum number of input characters. This will be * The parameter controls the maximum number of input characters. This will be
* 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence. * 2 when reading a \%x20 sequence and 4 when reading a \%u20AC sequence.
*/ */
static long static vimlong_T
gethexchrs(int maxinputlen) gethexchrs(int maxinputlen)
{ {
long_u nr = 0; long_u nr = 0;
@@ -998,14 +998,14 @@ gethexchrs(int maxinputlen)
if (i == 0) if (i == 0)
return -1; return -1;
return (long)nr; return nr;
} }
/* /*
* Get and return the value of the decimal string immediately after the * Get and return the value of the decimal string immediately after the
* current position. Return -1 for invalid. Consumes all digits. * current position. Return -1 for invalid. Consumes all digits.
*/ */
static long static vimlong_T
getdecchrs(void) getdecchrs(void)
{ {
long_u nr = 0; long_u nr = 0;
@@ -1025,7 +1025,7 @@ getdecchrs(void)
if (i == 0) if (i == 0)
return -1; return -1;
return (long)nr; return nr;
} }
/* /*

View File

@@ -1589,7 +1589,7 @@ regatom(int *flagp)
case 'u': // %uabcd hex 4 case 'u': // %uabcd hex 4
case 'U': // %U1234abcd hex 8 case 'U': // %U1234abcd hex 8
{ {
long i; vimlong_T i;
switch (c) switch (c)
{ {
@@ -1612,7 +1612,7 @@ regatom(int *flagp)
if (i == 0) if (i == 0)
regc(0x0a); regc(0x0a);
else else
regmbc(i); regmbc((int)i);
regc(NUL); regc(NUL);
*flagp |= HASWIDTH; *flagp |= HASWIDTH;
break; break;
@@ -1831,6 +1831,10 @@ collection:
|| *regparse == 'U') || *regparse == 'U')
{ {
startc = coll_get_char(); startc = coll_get_char();
// max UTF-8 Codepoint is U+10FFFF,
// but allow values until INT_MAX
if (startc == INT_MAX)
EMSG_RET_NULL(_(e_unicode_val_too_large));
if (startc == 0) if (startc == 0)
regc(0x0a); regc(0x0a);
else else
@@ -2131,7 +2135,7 @@ regpiece(int *flagp)
int lop = END; int lop = END;
long nr; long nr;
nr = getdecchrs(); nr = (long)getdecchrs();
switch (no_Magic(getchr())) switch (no_Magic(getchr()))
{ {
case '=': lop = MATCH; break; // \@= case '=': lop = MATCH; break; // \@=
@@ -2610,7 +2614,7 @@ vim_regcomp_had_eol(void)
static int static int
coll_get_char(void) coll_get_char(void)
{ {
long nr = -1; vimlong_T nr = -1;
switch (*regparse++) switch (*regparse++)
{ {
@@ -2620,13 +2624,15 @@ coll_get_char(void)
case 'u': nr = gethexchrs(4); break; case 'u': nr = gethexchrs(4); break;
case 'U': nr = gethexchrs(8); break; case 'U': nr = gethexchrs(8); break;
} }
if (nr < 0 || nr > INT_MAX) if (nr < 0)
{ {
// If getting the number fails be backwards compatible: the character // If getting the number fails be backwards compatible: the character
// is a backslash. // is a backslash.
--regparse; --regparse;
nr = '\\'; nr = '\\';
} }
if (nr > INT_MAX)
nr = INT_MAX;
return nr; return nr;
} }

View File

@@ -1560,7 +1560,7 @@ nfa_regatom(void)
case 'u': // %uabcd hex 4 case 'u': // %uabcd hex 4
case 'U': // %U1234abcd hex 8 case 'U': // %U1234abcd hex 8
{ {
long nr; vimlong_T nr;
switch (c) switch (c)
{ {
@@ -1577,7 +1577,7 @@ nfa_regatom(void)
reg_magic == MAGIC_ALL); reg_magic == MAGIC_ALL);
// A NUL is stored in the text as NL // A NUL is stored in the text as NL
// TODO: what if a composing character follows? // TODO: what if a composing character follows?
EMIT(nr == 0 ? 0x0a : nr); EMIT(nr == 0 ? 0x0a : (long)nr);
} }
break; break;
@@ -1953,6 +1953,10 @@ collection:
{ {
// TODO(RE) This needs more testing // TODO(RE) This needs more testing
startc = coll_get_char(); startc = coll_get_char();
// max UTF-8 Codepoint is U+10FFFF,
// but allow values until INT_MAX
if (startc == INT_MAX)
EMSG_RET_FAIL(_(e_unicode_val_too_large));
got_coll_char = TRUE; got_coll_char = TRUE;
MB_PTR_BACK(old_regparse, regparse); MB_PTR_BACK(old_regparse, regparse);
} }
@@ -2218,7 +2222,7 @@ nfa_regpiece(void)
break; break;
case Magic('@'): case Magic('@'):
c2 = getdecchrs(); c2 = (long)getdecchrs();
op = no_Magic(getchr()); op = no_Magic(getchr());
i = 0; i = 0;
switch(op) switch(op)

View File

@@ -1541,17 +1541,46 @@ func Test_large_hex_chars2()
try try
/[\Ufffffc1f] /[\Ufffffc1f]
catch catch
call assert_match('E486:', v:exception) call assert_match('E1541:', v:exception)
endtry endtry
try try
set re=1 set re=1
/[\Ufffffc1f] /[\Ufffffc1f]
catch catch
call assert_match('E486:', v:exception) call assert_match('E1541:', v:exception)
endtry endtry
set re& set re&
endfunc endfunc
func Test_large_hex_chars3()
" Validate max number of Unicode char
" Each case runs a search inside try/catch and checks the error number of
" the exception that is caught.  Note that nothing is asserted when the
" search does not throw at all.
" NOTE(review): the \%#=1 and \%#=2 prefixes presumably force a specific
" regexp engine for that one pattern - confirm against the pattern docs.

" Eight hex digits, 0xFFFFFFFF, inside a collection: expect E1541.
try
/[\UFFFFFFFF]
catch
call assert_match('E1541:', v:exception)
endtry
" Seven hex digits, 0x0FFFFFFF: no "too large" error is expected here, only
" E486 from the search itself.
try
/[\UFFFFFFF]
catch
call assert_match('E486:', v:exception)
endtry
" Range ending in 0xFFFFFFFF with the \%#=2 prefix: expect E1541.
try
/\%#=2[\d32-\UFFFFFFFF]
catch
call assert_match('E1541:', v:exception)
endtry
" Single 0xFFFFFFFF item with the \%#=1 prefix: expect E1541.
try
/\%#=1[\UFFFFFFFF]
catch
call assert_match('E1541:', v:exception)
endtry
" Range ending in 0xFFFFFFFF with the \%#=1 prefix: expect E945 instead of
" E1541 (presumably the "reverse range in character class" error - confirm).
try
/\%#=1[\d32-\UFFFFFFFF]
catch
call assert_match('E945:', v:exception)
endtry
endfunc
func Test_one_error_msg() func Test_one_error_msg()
" This was also giving an internal error " This was also giving an internal error
call assert_fails('call search(" \\((\\v[[=P=]]){185}+ ")', 'E871:') call assert_fails('call search(" \\((\\v[[=P=]]){185}+ ")', 'E871:')

View File

@@ -704,6 +704,8 @@ static char *(features[]) =
static int included_patches[] = static int included_patches[] =
{ /* Add new patch number below this line */ { /* Add new patch number below this line */
/**/
1258,
/**/ /**/
1257, 1257,
/**/ /**/