Revise sanity check on UTF-8 keyboard input sequence.

2025-07-05 16:37:38 -04:00 · 2015-02-10 18:09:59 +08:00 · 2015-02-10 18:09:59 +08:00 · cbb6a26e33
commit cbb6a26e33
parent 5401aec485
2 changed files with 66 additions and 70 deletions
--- a/input.c
+++ b/input.c
@@ -455,20 +455,18 @@ handle_CSI:
        return CTLX | c;
    }

+	/* Accept UTF-8 sequence */
 	if( c <= 0xC1 || c > 0xF4)
 		return c ;
 	else {
 		char utf[ 4] ;
+		char cc ;

 		utf[ 0] = c ;
-		if( (c & 0xE0) == 0xC0)
-			utf[ 1] = get1key() ;
-		else if( (c & 0xF0) == 0xE0) {
-			utf[ 1] = get1key() ;
-			utf[ 2] = get1key() ;
-		} else if( (c & 0xF8) == 0xF0) {
-			utf[ 1] = get1key() ;
-			utf[ 2] = get1key() ;
+		utf[ 1] = cc = get1key() ;
+		if( (c & 0x20) && ((cc & 0xC0) == 0x80)) { /* at least 3 bytes and a valid encoded char */
+			utf[ 2] = cc = get1key() ;
+			if( (c & 0x10) && ((cc & 0xC0) == 0x80)) /* at least 4 bytes and a valid encoded char */
 				utf[ 3] = get1key() ;
 		}

--- a/utf8.c
+++ b/utf8.c
@@ -28,6 +28,7 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re
     * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
     * 1100000x is start of overlong encoding sequence
     * Sequence longer than 4 bytes are invalid
+     * Last valid code is 0x10FFFF, encoding start with 0xF4
     */
    if( c <= 0xC1 || c > 0xF4)
        return 1;
@@ -40,10 +41,7 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re
        mask >>= 1;
    }

-	/* Invalid? Do it as a single byte Latin1 */
-/*	if (bytes > 6)	* bytes is <= 4 as we limit c value to max 0xF4
-		return 1;	*
-*/
+	/* bytes is in range [2..4] */
    len -= index;
    if (bytes > len)
        return 1;
@@ -59,7 +57,7 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re
        value = (value << 6) | (c & 0x3f);
    }

-	if( value > 0x10FFFF)
+    if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */
        return 1 ;

    *res = value;