0
0
mirror of https://github.com/vim/vim.git synced 2025-09-05 21:43:39 -04:00

patch 8.0.0519: character classes are not well tested

Problem:    Character classes are not well tested. They can differ between
            platforms.
Solution:   Add tests.  In the documentation make clear which classes depend
            on what library function.  Only use :cntrl: and :graph: for ASCII.
            (Kazunobu Kuriyama, Dominique Pelle, closes #1560)
            Update the documentation.
This commit is contained in:
Bram Moolenaar 2017-03-29 15:31:20 +02:00
parent c6cd8409c2
commit 0c078fc7db
5 changed files with 91 additions and 33 deletions

View File

@ -1085,25 +1085,27 @@ x A single character, with no special meaning, matches itself
- A character class expression is evaluated to the set of characters - A character class expression is evaluated to the set of characters
belonging to that character class. The following character classes belonging to that character class. The following character classes
are supported: are supported:
Name Contents ~ Name Func Contents ~
*[:alnum:]* [:alnum:] ASCII letters and digits *[:alnum:]* [:alnum:] isalnum ASCII letters and digits
*[:alpha:]* [:alpha:] ASCII letters *[:alpha:]* [:alpha:] isalpha ASCII letters
*[:blank:]* [:blank:] space and tab characters *[:blank:]* [:blank:] space and tab
*[:cntrl:]* [:cntrl:] control characters *[:cntrl:]* [:cntrl:] iscntrl ASCII control characters
*[:digit:]* [:digit:] decimal digits *[:digit:]* [:digit:] decimal digits '0' to '9'
*[:graph:]* [:graph:] printable characters excluding space *[:graph:]* [:graph:] isgraph ASCII printable characters excluding
*[:lower:]* [:lower:] lowercase letters (all letters when space
*[:lower:]* [:lower:] (1) lowercase letters (all letters when
'ignorecase' is used) 'ignorecase' is used)
*[:print:]* [:print:] printable characters including space *[:print:]* [:print:] (2) printable characters including space
*[:punct:]* [:punct:] ASCII punctuation characters *[:punct:]* [:punct:] ispunct ASCII punctuation characters
*[:space:]* [:space:] whitespace characters *[:space:]* [:space:] whitespace characters: space, tab, CR,
*[:upper:]* [:upper:] uppercase letters (all letters when NL, vertical tab, form feed
*[:upper:]* [:upper:] (3) uppercase letters (all letters when
'ignorecase' is used) 'ignorecase' is used)
*[:xdigit:]* [:xdigit:] hexadecimal digits *[:xdigit:]* [:xdigit:] hexadecimal digits: 0-9, a-f, A-F
*[:return:]* [:return:] the <CR> character *[:return:]* [:return:] the <CR> character
*[:tab:]* [:tab:] the <Tab> character *[:tab:]* [:tab:] the <Tab> character
*[:escape:]* [:escape:] the <Esc> character *[:escape:]* [:escape:] the <Esc> character
*[:backspace:]* [:backspace:] the <BS> character *[:backspace:]* [:backspace:] the <BS> character
The brackets in character class expressions are additional to the The brackets in character class expressions are additional to the
brackets delimiting a collection. For example, the following is a brackets delimiting a collection. For example, the following is a
plausible pattern for a UNIX filename: "[-./[:alnum:]_~]\+" That is, plausible pattern for a UNIX filename: "[-./[:alnum:]_~]\+" That is,
@ -1114,6 +1116,13 @@ x A single character, with no special meaning, matches itself
regexp engine. See |two-engines|. In the future these items may regexp engine. See |two-engines|. In the future these items may
work for multi-byte characters. For now, to get all "alpha" work for multi-byte characters. For now, to get all "alpha"
characters you can use: [[:lower:][:upper:]]. characters you can use: [[:lower:][:upper:]].
The "Func" column shows which library function is used; the
implementation of that function depends on the system. Otherwise:
(1) Uses islower() for ASCII and Vim builtin rules for other
characters when built with the |+multi_byte| feature.
(2) Uses Vim builtin rules.
(3) As with (1) but using isupper()
*/[[=* *[==]* */[[=* *[==]*
- An equivalence class. This means that characters are matched that - An equivalence class. This means that characters are matched that
have almost the same meaning, e.g., when ignoring accents. This have almost the same meaning, e.g., when ignoring accents. This

View File

@ -2555,17 +2555,17 @@ collection:
regc('\t'); regc('\t');
break; break;
case CLASS_CNTRL: case CLASS_CNTRL:
for (cu = 1; cu <= 255; cu++) for (cu = 1; cu <= 127; cu++)
if (iscntrl(cu)) if (iscntrl(cu))
regmbc(cu); regmbc(cu);
break; break;
case CLASS_DIGIT: case CLASS_DIGIT:
for (cu = 1; cu <= 255; cu++) for (cu = 1; cu <= 127; cu++)
if (VIM_ISDIGIT(cu)) if (VIM_ISDIGIT(cu))
regmbc(cu); regmbc(cu);
break; break;
case CLASS_GRAPH: case CLASS_GRAPH:
for (cu = 1; cu <= 255; cu++) for (cu = 1; cu <= 127; cu++)
if (isgraph(cu)) if (isgraph(cu))
regmbc(cu); regmbc(cu);
break; break;

View File

@ -4871,7 +4871,7 @@ check_char_class(int class, int c)
return OK; return OK;
break; break;
case NFA_CLASS_CNTRL: case NFA_CLASS_CNTRL:
if (c >= 1 && c <= 255 && iscntrl(c)) if (c >= 1 && c <= 127 && iscntrl(c))
return OK; return OK;
break; break;
case NFA_CLASS_DIGIT: case NFA_CLASS_DIGIT:
@ -4879,7 +4879,7 @@ check_char_class(int class, int c)
return OK; return OK;
break; break;
case NFA_CLASS_GRAPH: case NFA_CLASS_GRAPH:
if (c >= 1 && c <= 255 && isgraph(c)) if (c >= 1 && c <= 127 && isgraph(c))
return OK; return OK;
break; break;
case NFA_CLASS_LOWER: case NFA_CLASS_LOWER:

View File

@ -38,12 +38,21 @@ func s:classes_test()
set isprint=@,161-255 set isprint=@,161-255
call assert_equal('Motörhead', matchstr('Motörhead', '[[:print:]]\+')) call assert_equal('Motörhead', matchstr('Motörhead', '[[:print:]]\+'))
let alphachars = ''
let lowerchars = ''
let upperchars = ''
let alnumchars = '' let alnumchars = ''
let alphachars = ''
let backspacechar = ''
let blankchars = ''
let cntrlchars = ''
let digitchars = ''
let escapechar = ''
let graphchars = ''
let lowerchars = ''
let printchars = '' let printchars = ''
let punctchars = '' let punctchars = ''
let returnchar = ''
let spacechars = ''
let tabchar = ''
let upperchars = ''
let xdigitchars = '' let xdigitchars = ''
let i = 1 let i = 1
while i <= 255 while i <= 255
@ -51,21 +60,48 @@ func s:classes_test()
if c =~ '[[:alpha:]]' if c =~ '[[:alpha:]]'
let alphachars .= c let alphachars .= c
endif endif
if c =~ '[[:lower:]]'
let lowerchars .= c
endif
if c =~ '[[:upper:]]'
let upperchars .= c
endif
if c =~ '[[:alnum:]]' if c =~ '[[:alnum:]]'
let alnumchars .= c let alnumchars .= c
endif endif
if c =~ '[[:backspace:]]'
let backspacechar .= c
endif
if c =~ '[[:blank:]]'
let blankchars .= c
endif
if c =~ '[[:cntrl:]]'
let cntrlchars .= c
endif
if c =~ '[[:digit:]]'
let digitchars .= c
endif
if c =~ '[[:escape:]]'
let escapechar .= c
endif
if c =~ '[[:graph:]]'
let graphchars .= c
endif
if c =~ '[[:lower:]]'
let lowerchars .= c
endif
if c =~ '[[:print:]]' if c =~ '[[:print:]]'
let printchars .= c let printchars .= c
endif endif
if c =~ '[[:punct:]]' if c =~ '[[:punct:]]'
let punctchars .= c let punctchars .= c
endif endif
if c =~ '[[:return:]]'
let returnchar .= c
endif
if c =~ '[[:space:]]'
let spacechars .= c
endif
if c =~ '[[:tab:]]'
let tabchar .= c
endif
if c =~ '[[:upper:]]'
let upperchars .= c
endif
if c =~ '[[:xdigit:]]' if c =~ '[[:xdigit:]]'
let xdigitchars .= c let xdigitchars .= c
endif endif
@ -73,11 +109,22 @@ func s:classes_test()
endwhile endwhile
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alphachars) call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alphachars)
call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
call assert_equal('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alnumchars) call assert_equal('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz', alnumchars)
call assert_equal("\b", backspacechar)
call assert_equal("\t ", blankchars)
" Commented out: it succeeds on Linux and Windows, but fails on macOS on Travis.
" call assert_equal("\x01\x02\x03\x04\x05\x06\x07\b\t\n\x0b\f\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\e\x1c\x1d\x1e\x1f\x7f", cntrlchars)
call assert_equal("0123456789", digitchars)
call assert_equal("\<Esc>", escapechar)
" Commented out: it succeeds on Linux and Windows, but fails on macOS on Travis.
" call assert_equal('!"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~', graphchars)
call assert_equal('abcdefghijklmnopqrstuvwxyzµßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', lowerchars)
call assert_equal(' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', printchars) call assert_equal(' !"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', printchars)
call assert_equal('!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~', punctchars) call assert_equal('!"#$%&''()*+,-./:;<=>?@[\]^_`{|}~', punctchars)
call assert_equal('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ', upperchars)
call assert_equal("\r", returnchar)
call assert_equal("\t\n\x0b\f\r ", spacechars)
call assert_equal("\t", tabchar)
call assert_equal('0123456789ABCDEFabcdef', xdigitchars) call assert_equal('0123456789ABCDEFabcdef', xdigitchars)
endfunc endfunc

View File

@ -764,6 +764,8 @@ static char *(features[]) =
static int included_patches[] = static int included_patches[] =
{ /* Add new patch number below this line */ { /* Add new patch number below this line */
/**/
519,
/**/ /**/
518, 518,
/**/ /**/