Review cursor movement in presence of mixed latin1/unicode encoding.

This commit is contained in:
Renaud 2015-02-04 12:37:57 +08:00
parent a2d1b54c61
commit 4cbf1e9ae1
4 changed files with 38 additions and 30 deletions

View File

@ -570,18 +570,14 @@ void updpos(void)
i = 0;
while (i < curwp->w_doto) {
unicode_t c;
int bytes;
bytes = utf8_to_unicode(lp->l_text, i, curwp->w_doto, &c);
i += bytes;
if (c == '\t')
curcol |= tabmask;
else if( bytes == 1) {
if( c < 0x20 || c == 0x7F)
i += utf8_to_unicode( lp->l_text, i, curwp->w_doto, &c) ;
if( c == '\t')
curcol |= tabmask ;
else if( c < 0x20 || c == 0x7F)
curcol += 1 ; /* displayed as ^c */
else if( c >= 0x80 && c <= 0xA0)
else if( c >= 0x80 && c <= 0xA0)
curcol += 2 ; /* displayed as \xx */
}
++curcol;
}

15
eval.c
View File

@ -515,13 +515,18 @@ static char *gtfun( char *fname) {
case UFTRUTH:
retstr = ltos( atoi( argx) == 42) ;
break ;
case UFASCII:
retstr = i_to_a( (int) argx[ 0] & 0xFF) ;
case UFASCII: {
unicode_t c ;
utf8_to_unicode( argx, 0, 4, &c) ;
retstr = i_to_a( c) ;
}
break ;
case UFCHR:
result[0] = atoi(argx);
result[1] = 0;
retstr = result ;
sz = unicode_to_utf8( atoi( argx), result) ;
result[ sz] = 0 ;
retstr = result ;
break ;
case UFGTKEY:
result[0] = tgetc();

16
line.c
View File

@ -141,13 +141,11 @@ int forwchar(int f, int n)
curwp->w_doto = 0;
curwp->w_flag |= WFMOVE;
} else {
do {
unsigned char c;
curwp->w_doto++;
c = lgetc(curwp->w_dotp, curwp->w_doto);
if (is_beginning_utf8(c))
break;
} while (curwp->w_doto < len);
unicode_t unc ;
unsigned bytes ;
bytes = utf8_to_unicode( curwp->w_dotp->l_text, curwp->w_doto, len, &unc) ;
curwp->w_doto += bytes ;
}
}
return TRUE;
@ -257,6 +255,8 @@ int insspace(int f, int n)
return TRUE;
}
static int linsert_byte( int n, int c) ;
/*
* linstr -- Insert a string at the current point
*/
@ -269,7 +269,7 @@ int linstr( char *instr) {
while( (tmpc = *instr++ & 0xFF)) {
status =
(tmpc == '\n' ? lnewline() : linsert( 1, tmpc)) ;
(tmpc == '\n' ? lnewline() : linsert_byte( 1, tmpc)) ;
/* Insertion error? */
if( status != TRUE) {

23
utf8.c
View File

@ -15,19 +15,19 @@
*/
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
unsigned value;
unicode_t value ;
unsigned char c = line[index];
unsigned bytes, mask, i;
*res = c;
line += index;
len -= index;
/*
* 0xxxxxxx is valid utf8
* 10xxxxxx is invalid UTF-8, we assume it is Latin1
* 0xxxxxxx is valid one byte utf8
* 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
* 1100000x is start of overlong encoding sequence
* Sequences longer than 4 bytes are invalid
*/
if (c < 0xc0)
if( c <= 0xc0 || c > 0xF4 || c == 0xC1)
return 1;
/* Ok, it's 11xxxxxx, do a stupid decode */
@ -39,20 +39,27 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re
}
/* Invalid? Do it as a single byte Latin1 */
if (bytes > 6)
return 1;
/* The former check
 *     if (bytes > 6) return 1;
 * is no longer needed: bytes is <= 4 as we limit c value to max 0xF4. */
len -= index;
if (bytes > len)
return 1;
value = c & (mask-1);
/* Ok, do the bytes */
line += index;
for (i = 1; i < bytes; i++) {
c = line[i];
if ((c & 0xc0) != 0x80)
return 1;
value = (value << 6) | (c & 0x3f);
}
if( value > 0x10FFFF)
return 1 ;
*res = value;
return bytes;
}