Review cursor movement in presence of mixed latin1/unicode encoding.

2025-07-25 10:54:29 -04:00 · 2015-02-04 12:37:57 +08:00 · 2015-02-04 12:37:57 +08:00 · 4cbf1e9ae1
commit 4cbf1e9ae1
parent a2d1b54c61
4 changed files with 38 additions and 30 deletions
--- a/display.c
+++ b/display.c
@ -570,18 +570,14 @@ void updpos(void)
 	i = 0;
 	while (i < curwp->w_doto) {
 		unicode_t c;
-		int bytes;

-		bytes = utf8_to_unicode(lp->l_text, i, curwp->w_doto, &c);
-		i += bytes;
-		if (c == '\t')
-			curcol |= tabmask;
-		else if( bytes == 1) {
-			if( c < 0x20 || c == 0x7F)
+		i += utf8_to_unicode( lp->l_text, i, curwp->w_doto, &c) ;
+		if( c == '\t')
+			curcol |= tabmask ;
+		else if( c < 0x20 || c == 0x7F)
 				curcol += 1 ;	/* displayed as ^c */
-			else if( c >= 0x80 && c <= 0xA0)
+		else if( c >= 0x80 && c <= 0xA0)
 				curcol += 2 ;	/* displayed as \xx */
-		}

 		++curcol;
 	}
--- a/eval.c
+++ b/eval.c
@ -515,13 +515,18 @@ static char *gtfun( char *fname) {
 	case UFTRUTH:
 		retstr = ltos( atoi( argx) == 42) ;
 		break ;
-	case UFASCII:
-		retstr = i_to_a( (int) argx[ 0] & 0xFF) ;
+	case UFASCII: {
+		unicode_t	c ;
+		
+		utf8_to_unicode( argx, 0, 4, &c) ;
+		retstr = i_to_a( c) ;
+		}
+
 		break ;
 	case UFCHR:
-		result[0] = atoi(argx);
-		result[1] = 0;
-		retstr = result ;
+		sz = unicode_to_utf8( atoi( argx), result) ;
+		result[ sz] = 0 ;
+		retstr = result ;		
 		break ;
 	case UFGTKEY:
 		result[0] = tgetc();
--- a/line.c
+++ b/line.c
@ -141,13 +141,11 @@ int forwchar(int f, int n)
 			curwp->w_doto = 0;
 			curwp->w_flag |= WFMOVE;
 		} else {
-			do {
-				unsigned char c;
-				curwp->w_doto++;
-				c = lgetc(curwp->w_dotp, curwp->w_doto);
-				if (is_beginning_utf8(c))
-					break;
-			} while (curwp->w_doto < len);
+			unicode_t unc ;
+			unsigned bytes ;
+			
+			bytes = utf8_to_unicode( curwp->w_dotp->l_text, curwp->w_doto, len, &unc) ;
+			curwp->w_doto += bytes ;
 		}
 	}
 	return TRUE;
@ -257,6 +255,8 @@ int insspace(int f, int n)
 	return TRUE;
 }

+static int linsert_byte( int n, int c) ;
+
 /*
 * linstr -- Insert a string at the current point
 */
@ -269,7 +269,7 @@ int linstr( char *instr) {

 		while( (tmpc = *instr++ & 0xFF)) {
 			status =
-			    (tmpc == '\n' ? lnewline() : linsert( 1, tmpc)) ;
+			    (tmpc == '\n' ? lnewline() : linsert_byte( 1, tmpc)) ;

 			/* Insertion error? */
 			if( status != TRUE) {
--- a/utf8.c
+++ b/utf8.c
@ -15,19 +15,19 @@
 */
 unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
 {
-	unsigned value;
+	unicode_t	value ;
 	unsigned char c = line[index];
 	unsigned bytes, mask, i;

 	*res = c;
-	line += index;
-	len -= index;

 	/*
-	 * 0xxxxxxx is valid utf8
-	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
+	 * 0xxxxxxx is valid one byte utf8
+	 * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
+	 * 1100000x is the start of an overlong encoding sequence
+	 * Sequences longer than 4 bytes are invalid
 	 */
-	if (c < 0xc0)
+	if( c <= 0xc0 || c > 0xF4 || c == 0xC1)
 		return 1;

 	/* Ok, it's 11xxxxxx, do a stupid decode */
@ -39,20 +39,27 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re
 	}

 	/* Invalid? Do it as a single byte Latin1 */
-	if (bytes > 6)
-		return 1;
+/*	if (bytes > 6)	* bytes is <= 4 as we limit c value to max 0xF4
+		return 1;	*
+*/
+	len -= index;
 	if (bytes > len)
 		return 1;

 	value = c & (mask-1);

 	/* Ok, do the bytes */
+	line += index;
 	for (i = 1; i < bytes; i++) {
 		c = line[i];
 		if ((c & 0xc0) != 0x80)
 			return 1;
 		value = (value << 6) | (c & 0x3f);
 	}
+	
+	if( value > 0x10FFFF)
+		return 1 ;
+
 	*res = value;
 	return bytes;
 }