diff --git a/display.c b/display.c index 55b5696..62f939b 100644 --- a/display.c +++ b/display.c @@ -570,18 +570,14 @@ void updpos(void) i = 0; while (i < curwp->w_doto) { unicode_t c; - int bytes; - bytes = utf8_to_unicode(lp->l_text, i, curwp->w_doto, &c); - i += bytes; - if (c == '\t') - curcol |= tabmask; - else if( bytes == 1) { - if( c < 0x20 || c == 0x7F) + i += utf8_to_unicode( lp->l_text, i, curwp->w_doto, &c) ; + if( c == '\t') + curcol |= tabmask ; + else if( c < 0x20 || c == 0x7F) curcol += 1 ; /* displayed as ^c */ - else if( c >= 0x80 && c <= 0xA0) + else if( c >= 0x80 && c <= 0xA0) curcol += 2 ; /* displayed as \xx */ - } ++curcol; } diff --git a/eval.c b/eval.c index 112c230..cb16a3f 100644 --- a/eval.c +++ b/eval.c @@ -515,13 +515,18 @@ static char *gtfun( char *fname) { case UFTRUTH: retstr = ltos( atoi( argx) == 42) ; break ; - case UFASCII: - retstr = i_to_a( (int) argx[ 0] & 0xFF) ; + case UFASCII: { + unicode_t c ; + + utf8_to_unicode( argx, 0, 4, &c) ; + retstr = i_to_a( c) ; + } + break ; case UFCHR: - result[0] = atoi(argx); - result[1] = 0; - retstr = result ; + sz = unicode_to_utf8( atoi( argx), result) ; + result[ sz] = 0 ; + retstr = result ; break ; case UFGTKEY: result[0] = tgetc(); diff --git a/line.c b/line.c index e6409df..8af7404 100644 --- a/line.c +++ b/line.c @@ -141,13 +141,11 @@ int forwchar(int f, int n) curwp->w_doto = 0; curwp->w_flag |= WFMOVE; } else { - do { - unsigned char c; - curwp->w_doto++; - c = lgetc(curwp->w_dotp, curwp->w_doto); - if (is_beginning_utf8(c)) - break; - } while (curwp->w_doto < len); + unicode_t unc ; + unsigned bytes ; + + bytes = utf8_to_unicode( curwp->w_dotp->l_text, curwp->w_doto, len, &unc) ; + curwp->w_doto += bytes ; } } return TRUE; @@ -257,6 +255,8 @@ int insspace(int f, int n) return TRUE; } +static int linsert_byte( int n, int c) ; + /* * linstr -- Insert a string at the current point */ @@ -269,7 +269,7 @@ int linstr( char *instr) { while( (tmpc = *instr++ & 0xFF)) { status = - (tmpc == '\n' ? lnewline() : linsert( 1, tmpc)) ; + (tmpc == '\n' ? lnewline() : linsert_byte( 1, tmpc)) ; /* Insertion error? */ if( status != TRUE) { diff --git a/utf8.c b/utf8.c index 3bffd1e..07ea48f 100644 --- a/utf8.c +++ b/utf8.c @@ -15,19 +15,19 @@ */ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res) { - unsigned value; + unicode_t value ; unsigned char c = line[index]; unsigned bytes, mask, i; *res = c; - line += index; - len -= index; /* - * 0xxxxxxx is valid utf8 - * 10xxxxxx is invalid UTF-8, we assume it is Latin1 + * 0xxxxxxx is valid one byte utf8 + * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1 + * 1100000x is start of overlong encoding sequence + * Sequence longer than 4 bytes are invalid */ - if (c < 0xc0) + if( c <= 0xc0 || c > 0xF4 || c == 0xC1) return 1; /* Ok, it's 11xxxxxx, do a stupid decode */ @@ -39,20 +39,27 @@ unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *re } /* Invalid? Do it as a single byte Latin1 */ - if (bytes > 6) - return 1; +/* if (bytes > 6) * bytes is <= 4 as we limit c value to max 0xF4 + return 1; * +*/ + len -= index; if (bytes > len) return 1; value = c & (mask-1); /* Ok, do the bytes */ + line += index; for (i = 1; i < bytes; i++) { c = line[i]; if ((c & 0xc0) != 0x80) return 1; value = (value << 6) | (c & 0x3f); } + + if( value > 0x10FFFF) + return 1 ; + *res = value; return bytes; }