1
0
mirror of https://github.com/rfivet/uemacs.git synced 2024-12-18 15:26:23 -05:00

Revise sanity check on UTF-8 keyboard input sequence.

This commit is contained in:
Renaud 2015-02-10 18:09:59 +08:00
parent 5401aec485
commit cbb6a26e33
2 changed files with 66 additions and 70 deletions

14
input.c
View File

@ -455,20 +455,18 @@ handle_CSI:
return CTLX | c;
}
/* Accept UTF-8 sequence */
if( c <= 0xC1 || c > 0xF4)
return c ;
else {
char utf[ 4] ;
char cc ;
utf[ 0] = c ;
if( (c & 0xE0) == 0xC0)
utf[ 1] = get1key() ;
else if( (c & 0xF0) == 0xE0) {
utf[ 1] = get1key() ;
utf[ 2] = get1key() ;
} else if( (c & 0xF8) == 0xF0) {
utf[ 1] = get1key() ;
utf[ 2] = get1key() ;
utf[ 1] = cc = get1key() ;
if( (c & 0x20) && ((cc & 0xC0) == 0x80)) { /* at least 3 bytes and a valid encoded char */
utf[ 2] = cc = get1key() ;
if( (c & 0x10) && ((cc & 0xC0) == 0x80)) /* at least 4 bytes and a valid encoded char */
utf[ 3] = get1key() ;
}

8
utf8.c
View File

@ -28,6 +28,7 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re
* 10xxxxxx is an invalid UTF-8 start byte; we assume it is Latin1
* 1100000x is the start of an overlong encoding sequence
* Sequences longer than 4 bytes are invalid
* The last valid code point is 0x10FFFF, whose encoding starts with 0xF4
*/
if( c <= 0xC1 || c > 0xF4)
return 1;
@ -40,10 +41,7 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re
mask >>= 1;
}
/* Invalid? Do it as a single byte Latin1 */
/* if (bytes > 6) * bytes is <= 4 as we limit c value to max 0xF4
return 1; *
*/
/* bytes is in range [2..4] */
len -= index;
if (bytes > len)
return 1;
@ -59,7 +57,7 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re
value = (value << 6) | (c & 0x3f);
}
if( value > 0x10FFFF)
if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */
return 1 ;
*res = value;