Revise sanity check on UTF-8 keyboard input sequence.

2025-07-05 16:37:38 -04:00 · 2015-02-10 18:09:59 +08:00 · 2015-02-10 18:09:59 +08:00 · cbb6a26e33
commit cbb6a26e33
parent 5401aec485
2 changed files with 66 additions and 70 deletions
--- a/input.c
+++ b/input.c
@@ -455,21 +455,19 @@ handle_CSI:
        return CTLX | c;
    }
 	/* Accept UTF-8 sequence */
 	if( c <= 0xC1 || c > 0xF4)
 		return c ;
 	else {
 		char utf[ 4] ;
 		char cc ;
 		utf[ 0] = c ;
-		if( (c & 0xE0) == 0xC0)
+		utf[ 1] = cc = get1key() ;
-			utf[ 1] = get1key() ;
+		if( (c & 0x20) && ((cc & 0xC0) == 0x80)) { /* at least 3 bytes and a valid encoded char */
-		else if( (c & 0xF0) == 0xE0) {
+			utf[ 2] = cc = get1key() ;
-			utf[ 1] = get1key() ;
+			if( (c & 0x10) && ((cc & 0xC0) == 0x80)) /* at least 4 bytes and a valid encoded char */
-			utf[ 2] = get1key() ;
+				utf[ 3] = get1key() ;
 		} else if( (c & 0xF8) == 0xF0) {
 			utf[ 1] = get1key() ;
 			utf[ 2] = get1key() ;
 			utf[ 3] = get1key() ;
 		}
 		utf8_to_unicode( utf, 0, sizeof utf, (unicode_t *) &c) ;
--- a/utf8.c
+++ b/utf8.c
@@ -17,62 +17,60 @@
 */
 unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
 {
-	unicode_t	value ;
+    unicode_t   value ;
-	unsigned char c = line[index];
+    unsigned char c = line[index];
-	unsigned bytes, mask, i;
+    unsigned bytes, mask, i;
-	*res = c;
+    *res = c;
-	/*
+    /*
-	 * 0xxxxxxx is valid one byte utf8
+     * 0xxxxxxx is valid one byte utf8
-	 * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
+     * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
-	 * 1100000x is start of overlong encoding sequence
+     * 1100000x is start of overlong encoding sequence
-	 * Sequence longer than 4 bytes are invalid
+     * Sequence longer than 4 bytes are invalid
-	 */
+     * Last valid code is 0x10FFFF, encoding start with 0xF4
-	if( c <= 0xC1 || c > 0xF4)
+     */
-		return 1;
+    if( c <= 0xC1 || c > 0xF4)
        return 1;
-	/* Ok, it's 11xxxxxx, do a stupid decode */
+    /* Ok, it's 11xxxxxx, do a stupid decode */
-	mask = 0x20;
+    mask = 0x20;
-	bytes = 2;
+    bytes = 2;
-	while (c & mask) {
+    while (c & mask) {
-		bytes++;
+        bytes++;
-		mask >>= 1;
+        mask >>= 1;
-	}
+    }
-	/* Invalid? Do it as a single byte Latin1 */
+	/* bytes is in range [2..4] */
-/*	if (bytes > 6)	* bytes is <= 4 as we limit c value to max 0xF4
+    len -= index;
-		return 1;	*
+    if (bytes > len)
-*/
+        return 1;
 	len -= index;
 	if (bytes > len)
 		return 1;
-	value = c & (mask-1);
+    value = c & (mask-1);
-	/* Ok, do the bytes */
+    /* Ok, do the bytes */
-	line += index;
+    line += index;
-	for (i = 1; i < bytes; i++) {
+    for (i = 1; i < bytes; i++) {
-		c = line[i];
+        c = line[i];
-		if ((c & 0xc0) != 0x80)
+        if ((c & 0xc0) != 0x80)
-			return 1;
+            return 1;
-		value = (value << 6) | (c & 0x3f);
+        value = (value << 6) | (c & 0x3f);
-	}
+    }
-	if( value > 0x10FFFF)
+    if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */
-		return 1 ;
+        return 1 ;
-	*res = value;
+    *res = value;
-	return bytes;
+    return bytes;
 }
 static void reverse_string(char *begin, char *end)
 {
-	do {
+    do {
-		char a = *begin, b = *end;
+        char a = *begin, b = *end;
-		*end = a; *begin = b;
+        *end = a; *begin = b;
-		begin++; end--;
+        begin++; end--;
-	} while (begin < end);
+    } while (begin < end);
 }
 /*
@@ -87,21 +85,21 @@ static void reverse_string(char *begin, char *end)
 * overlong utf-8 sequences.
 */
 unsigned unicode_to_utf8( unicode_t c, char *utf8) {
-	int bytes = 1 ;
+    int bytes = 1 ;
-	assert( c <= 0x10FFFF) ;
+    assert( c <= 0x10FFFF) ;
-	*utf8 = c ;
+    *utf8 = c ;
-	if (c > 0x7f) {
+    if (c > 0x7f) {
-		int prefix = 0x40;
+        int prefix = 0x40;
-		char *p = utf8;
+        char *p = utf8;
-		do {
+        do {
-			*p++ = 0x80 + (c & 0x3f);
+            *p++ = 0x80 + (c & 0x3f);
-			bytes++;
+            bytes++;
-			prefix >>= 1;
+            prefix >>= 1;
-			c >>= 6;
+            c >>= 6;
-		} while( c >= prefix) ;
+        } while( c >= prefix) ;
-		*p = c - 2*prefix;
+        *p = c - 2*prefix;
-		reverse_string(utf8, p);
+        reverse_string(utf8, p);
-	}
+    }
-	return bytes;
+    return bytes;
 }