From 6b793211c2aec69115dd2769892be0524801f7d8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 10 Jul 2012 16:40:36 -0700 Subject: [PATCH] Make cursor movement (largely) understand UTF-8 character boundaries Ok, so it may do odd things if it's not truly utf-8, and when moving up and down lines that have utf-8 the cursor moves oddly (because the byte offset within the line stays constant, rather than the character offset), but with this you can actually open the UTF8 example file and move around it, and at least some of the movement makes sense. Signed-off-by: Linus Torvalds --- basic.c | 26 +++++++++++++++++++++----- display.c | 11 +++++------ utf8.h | 5 +++++ 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/basic.c b/basic.c index 3bf0227..3a7d6f7 100644 --- a/basic.c +++ b/basic.c @@ -15,6 +15,7 @@ #include "edef.h" #include "efunc.h" #include "line.h" +#include "utf8.h" /* * This routine, given a pointer to a struct line, and the current cursor goal @@ -74,8 +75,15 @@ int backchar(int f, int n) curwp->w_dotp = lp; curwp->w_doto = llength(lp); curwp->w_flag |= WFMOVE; - } else - curwp->w_doto--; + } else { + do { + unsigned char c; + curwp->w_doto--; + c = lgetc(curwp->w_dotp, curwp->w_doto); + if (is_beginning_utf8(c)) + break; + } while (curwp->w_doto); + } } return TRUE; } @@ -100,14 +108,22 @@ int forwchar(int f, int n) if (n < 0) return backchar(f, -n); while (n--) { - if (curwp->w_doto == llength(curwp->w_dotp)) { + int len = llength(curwp->w_dotp); + if (curwp->w_doto == len) { if (curwp->w_dotp == curbp->b_linep) return FALSE; curwp->w_dotp = lforw(curwp->w_dotp); curwp->w_doto = 0; curwp->w_flag |= WFMOVE; - } else - curwp->w_doto++; + } else { + do { + unsigned char c; + curwp->w_doto++; + c = lgetc(curwp->w_dotp, curwp->w_doto); + if (is_beginning_utf8(c)) + break; + } while (curwp->w_doto < len); + } } return TRUE; } diff --git a/display.c b/display.c index 82b4f84..676514d 100644 --- a/display.c +++ b/display.c @@ -528,7 +528,6 @@ static void updall(struct window *wp) void updpos(void) { struct line *lp; - int c; int i; /* find the current row */ @@ -543,13 +542,13 @@ void updpos(void) curcol = 0; i = 0; while (i < curwp->w_doto) { - c = lgetc(lp, i++); + unicode_t c; + int bytes; + + bytes = utf8_to_unicode(lp->l_text, i, curwp->w_doto, &c); + i += bytes; if (c == '\t') curcol |= tabmask; - else if (c < 0x20 || c == 0x7f) - ++curcol; - else if (c >= 0x80 && c <= 0xa0) - curcol+=2; ++curcol; } diff --git a/utf8.h b/utf8.h index b60ccd2..c317a6a 100644 --- a/utf8.h +++ b/utf8.h @@ -6,4 +6,9 @@ typedef unsigned int unicode_t; unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res); unsigned unicode_to_utf8(unsigned int c, char *utf8); +static inline int is_beginning_utf8(unsigned char c) +{ + return (c & 0xc0) != 0x80; +} + #endif