mirror of
https://github.com/rfivet/uemacs.git
synced 2024-12-18 23:36:23 -05:00
Revise sanity check on UTF-8 keyboard input sequence.
This commit is contained in:
parent
5401aec485
commit
cbb6a26e33
16
input.c
16
input.c
@ -455,21 +455,19 @@ handle_CSI:
|
|||||||
return CTLX | c;
|
return CTLX | c;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Accept UTF-8 sequence */
|
||||||
if( c <= 0xC1 || c > 0xF4)
|
if( c <= 0xC1 || c > 0xF4)
|
||||||
return c ;
|
return c ;
|
||||||
else {
|
else {
|
||||||
char utf[ 4] ;
|
char utf[ 4] ;
|
||||||
|
char cc ;
|
||||||
|
|
||||||
utf[ 0] = c ;
|
utf[ 0] = c ;
|
||||||
if( (c & 0xE0) == 0xC0)
|
utf[ 1] = cc = get1key() ;
|
||||||
utf[ 1] = get1key() ;
|
if( (c & 0x20) && ((cc & 0xC0) == 0x80)) { /* at least 3 bytes and a valid encoded char */
|
||||||
else if( (c & 0xF0) == 0xE0) {
|
utf[ 2] = cc = get1key() ;
|
||||||
utf[ 1] = get1key() ;
|
if( (c & 0x10) && ((cc & 0xC0) == 0x80)) /* at least 4 bytes and a valid encoded char */
|
||||||
utf[ 2] = get1key() ;
|
utf[ 3] = get1key() ;
|
||||||
} else if( (c & 0xF8) == 0xF0) {
|
|
||||||
utf[ 1] = get1key() ;
|
|
||||||
utf[ 2] = get1key() ;
|
|
||||||
utf[ 3] = get1key() ;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
utf8_to_unicode( utf, 0, sizeof utf, (unicode_t *) &c) ;
|
utf8_to_unicode( utf, 0, sizeof utf, (unicode_t *) &c) ;
|
||||||
|
118
utf8.c
118
utf8.c
@ -17,62 +17,60 @@
|
|||||||
*/
|
*/
|
||||||
/*
 * Decode one character starting at line[index] and store it in *res.
 *
 * line:  buffer holding the text
 * index: offset of the first byte to decode
 * len:   total number of bytes available in line
 * res:   receives the decoded value, or the raw first byte on fallback
 *
 * Returns the number of bytes used: 2 to 4 for an accepted multi-byte
 * sequence, 1 in every other case (the first byte is then passed through
 * unchanged in *res).
 */
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	const unsigned char *seq = (const unsigned char *) line + index;
	unsigned char lead = seq[0];
	unsigned count, k;
	unicode_t code;

	/* Default result: the first byte taken as-is */
	*res = lead;

	/*
	 * Lead bytes that are passed through as a single byte:
	 *   0xxxxxxx  valid one byte UTF-8
	 *   10xxxxxx  invalid UTF-8 start byte, assumed to be Latin1
	 *   1100000x  start of an overlong encoding sequence
	 *   above F4  sequences longer than 4 bytes are invalid, and the
	 *             last valid code 0x10FFFF is encoded starting with 0xF4
	 */
	if (lead < 0xC2 || lead >= 0xF5)
		return 1;

	/* The lead byte fixes the sequence length and its payload bits */
	if (lead < 0xE0) {
		count = 2;
		code = lead & 0x1F;
	} else if (lead < 0xF0) {
		count = 3;
		code = lead & 0x0F;
	} else {
		count = 4;
		code = lead & 0x07;
	}

	/* Not enough bytes left after index: fall back to a single byte */
	if (count > len - index)
		return 1;

	/* Every following byte must be 10xxxxxx and adds six payload bits */
	for (k = 1; k < count; k++) {
		unsigned char cont = seq[k];

		if ((cont & 0xC0) != 0x80)
			return 1;

		code = (code << 6) | (cont & 0x3F);
	}

	/* A 0xF4 lead byte can still encode 0x110000 - 0x13FFFF: refuse those */
	if (code > 0x10FFFF)
		return 1;

	*res = code;
	return count;
}
|
||||||
|
|
||||||
/*
 * Reverse, in place, the characters between begin and end, both inclusive.
 *
 * begin: first character of the span
 * end:   last character of the span
 *
 * A span of one character (begin == end) or an empty span (end < begin)
 * is left untouched; the loop condition is tested before any access so
 * nothing outside the span is ever read or written.
 */
static void reverse_string(char *begin, char *end)
{
	while (begin < end) {
		char tmp = *begin;

		*begin++ = *end;
		*end-- = tmp;
	}
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -87,21 +85,21 @@ static void reverse_string(char *begin, char *end)
|
|||||||
* overlong utf-8 sequences.
|
* overlong utf-8 sequences.
|
||||||
*/
|
*/
|
||||||
/*
 * Encode c into the buffer utf8 and return the number of bytes written.
 *
 * c:    value to encode, asserted to be at most 0x10FFFF
 * utf8: destination buffer; no terminating NUL is written
 *
 * Values up to 0x7F are stored as a single byte. Larger values are built
 * low-order bits first and then reversed into their final order.
 */
unsigned unicode_to_utf8( unicode_t c, char *utf8) {
	char *out = utf8 ;
	int limit = 0x40 ;	/* halved for each continuation byte emitted */
	int count = 1 ;

	assert( c <= 0x10FFFF) ;

	/* Single byte case: store the value unchanged */
	if( c <= 0x7F) {
		*utf8 = c ;
		return count ;
	}

	/* Emit 10xxxxxx bytes, least significant six bits first, until the
	 * remaining bits fit in the payload of the lead byte */
	do {
		*out++ = 0x80 + (c & 0x3F) ;
		c >>= 6 ;
		limit >>= 1 ;
		count++ ;
	} while( c >= limit) ;

	/* Lead byte: once truncated to a byte, subtracting 2 * limit sets the
	 * marker bits (0xC0, 0xE0 or 0xF0) above the remaining payload */
	*out = c - 2 * limit ;

	/* Bytes were produced last-to-first: put them in final order */
	reverse_string( utf8, out) ;
	return count ;
}
|
||||||
|
Loading…
Reference in New Issue
Block a user