1
0
mirror of https://github.com/rfivet/uemacs.git synced 2024-06-09 05:20:42 +00:00

Revise sanity check on UTF-8 keyboard input sequence.

This commit is contained in:
Renaud 2015-02-10 18:09:59 +08:00
parent 5401aec485
commit cbb6a26e33
2 changed files with 66 additions and 70 deletions

16
input.c
View File

@ -455,21 +455,19 @@ handle_CSI:
return CTLX | c; return CTLX | c;
} }
/* Accept UTF-8 sequence */
if( c <= 0xC1 || c > 0xF4) if( c <= 0xC1 || c > 0xF4)
return c ; return c ;
else { else {
char utf[ 4] ; char utf[ 4] ;
char cc ;
utf[ 0] = c ; utf[ 0] = c ;
if( (c & 0xE0) == 0xC0) utf[ 1] = cc = get1key() ;
utf[ 1] = get1key() ; if( (c & 0x20) && ((cc & 0xC0) == 0x80)) { /* at least 3 bytes and a valid encoded char */
else if( (c & 0xF0) == 0xE0) { utf[ 2] = cc = get1key() ;
utf[ 1] = get1key() ; if( (c & 0x10) && ((cc & 0xC0) == 0x80)) /* at least 4 bytes and a valid encoded char */
utf[ 2] = get1key() ; utf[ 3] = get1key() ;
} else if( (c & 0xF8) == 0xF0) {
utf[ 1] = get1key() ;
utf[ 2] = get1key() ;
utf[ 3] = get1key() ;
} }
utf8_to_unicode( utf, 0, sizeof utf, (unicode_t *) &c) ; utf8_to_unicode( utf, 0, sizeof utf, (unicode_t *) &c) ;

120
utf8.c
View File

@ -17,62 +17,60 @@
*/ */
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res) unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{ {
unicode_t value ; unicode_t value ;
unsigned char c = line[index]; unsigned char c = line[index];
unsigned bytes, mask, i; unsigned bytes, mask, i;
*res = c; *res = c;
/* /*
* 0xxxxxxx is valid one byte utf8 * 0xxxxxxx is valid one byte utf8
* 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1 * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
* 1100000x is start of overlong encoding sequence * 1100000x is start of overlong encoding sequence
* Sequence longer than 4 bytes are invalid * Sequence longer than 4 bytes are invalid
*/ * Last valid code is 0x10FFFF, encoding start with 0xF4
if( c <= 0xC1 || c > 0xF4) */
return 1; if( c <= 0xC1 || c > 0xF4)
return 1;
/* Ok, it's 11xxxxxx, do a stupid decode */ /* Ok, it's 11xxxxxx, do a stupid decode */
mask = 0x20; mask = 0x20;
bytes = 2; bytes = 2;
while (c & mask) { while (c & mask) {
bytes++; bytes++;
mask >>= 1; mask >>= 1;
} }
/* Invalid? Do it as a single byte Latin1 */ /* bytes is in range [2..4] */
/* if (bytes > 6) * bytes is <= 4 as we limit c value to max 0xF4 len -= index;
return 1; * if (bytes > len)
*/ return 1;
len -= index;
if (bytes > len)
return 1;
value = c & (mask-1); value = c & (mask-1);
/* Ok, do the bytes */ /* Ok, do the bytes */
line += index; line += index;
for (i = 1; i < bytes; i++) { for (i = 1; i < bytes; i++) {
c = line[i]; c = line[i];
if ((c & 0xc0) != 0x80) if ((c & 0xc0) != 0x80)
return 1; return 1;
value = (value << 6) | (c & 0x3f); value = (value << 6) | (c & 0x3f);
} }
if( value > 0x10FFFF)
return 1 ;
*res = value; if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */
return bytes; return 1 ;
*res = value;
return bytes;
} }
static void reverse_string(char *begin, char *end) static void reverse_string(char *begin, char *end)
{ {
do { do {
char a = *begin, b = *end; char a = *begin, b = *end;
*end = a; *begin = b; *end = a; *begin = b;
begin++; end--; begin++; end--;
} while (begin < end); } while (begin < end);
} }
/* /*
@ -87,21 +85,21 @@ static void reverse_string(char *begin, char *end)
* overlong utf-8 sequences. * overlong utf-8 sequences.
*/ */
unsigned unicode_to_utf8( unicode_t c, char *utf8) { unsigned unicode_to_utf8( unicode_t c, char *utf8) {
int bytes = 1 ; int bytes = 1 ;
assert( c <= 0x10FFFF) ; assert( c <= 0x10FFFF) ;
*utf8 = c ; *utf8 = c ;
if (c > 0x7f) { if (c > 0x7f) {
int prefix = 0x40; int prefix = 0x40;
char *p = utf8; char *p = utf8;
do { do {
*p++ = 0x80 + (c & 0x3f); *p++ = 0x80 + (c & 0x3f);
bytes++; bytes++;
prefix >>= 1; prefix >>= 1;
c >>= 6; c >>= 6;
} while( c >= prefix) ; } while( c >= prefix) ;
*p = c - 2*prefix; *p = c - 2*prefix;
reverse_string(utf8, p); reverse_string(utf8, p);
} }
return bytes; return bytes;
} }