Make cursor movement (largely) understand UTF-8 character boundaries

Ok, so it may do odd things if the text is not truly UTF-8, and when moving up and down across lines that contain UTF-8 the cursor moves oddly (because the byte offset within the line stays constant, rather than the character offset). But with this you can actually open the UTF-8 example file and move around in it, and at least some of the movement makes sense.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-07-10 16:40:36 -07:00 · 2012-07-10 16:40:36 -07:00 · 6b793211c2
parent e62cdf04cf
commit 6b793211c2
3 changed files with 31 additions and 11 deletions
--- a/basic.c
+++ b/basic.c
@@ -15,6 +15,7 @@
 #include "edef.h"
 #include "efunc.h"
 #include "line.h"
+#include "utf8.h"

 /*
 * This routine, given a pointer to a struct line, and the current cursor goal
@@ -74,8 +75,15 @@ int backchar(int f, int n)
 			curwp->w_dotp = lp;
 			curwp->w_doto = llength(lp);
 			curwp->w_flag |= WFMOVE;
-		} else
-			curwp->w_doto--;
+		} else {
+			do {
+				unsigned char c;
+				curwp->w_doto--;
+				c = lgetc(curwp->w_dotp, curwp->w_doto);
+				if (is_beginning_utf8(c))
+					break;
+			} while (curwp->w_doto);
+		}
 	}
 	return TRUE;
 }
@@ -100,14 +108,22 @@ int forwchar(int f, int n)
 	if (n < 0)
 		return backchar(f, -n);
 	while (n--) {
-		if (curwp->w_doto == llength(curwp->w_dotp)) {
+		int len = llength(curwp->w_dotp);
+		if (curwp->w_doto == len) {
 			if (curwp->w_dotp == curbp->b_linep)
 				return FALSE;
 			curwp->w_dotp = lforw(curwp->w_dotp);
 			curwp->w_doto = 0;
 			curwp->w_flag |= WFMOVE;
-		} else
-			curwp->w_doto++;
+		} else {
+			do {
+				unsigned char c;
+				curwp->w_doto++;
+				c = lgetc(curwp->w_dotp, curwp->w_doto);
+				if (is_beginning_utf8(c))
+					break;
+			} while (curwp->w_doto < len);
+		}
 	}
 	return TRUE;
 }
--- a/display.c
+++ b/display.c
@@ -528,7 +528,6 @@ static void updall(struct window *wp)
 void updpos(void)
 {
 	struct line *lp;
-	int c;
 	int i;

 	/* find the current row */
@@ -543,13 +542,13 @@ void updpos(void)
 	curcol = 0;
 	i = 0;
 	while (i < curwp->w_doto) {
-		c = lgetc(lp, i++);
+		unicode_t c;
+		int bytes;
+
+		bytes = utf8_to_unicode(lp->l_text, i, curwp->w_doto, &c);
+		i += bytes;
 		if (c == '\t')
 			curcol |= tabmask;
-		else if (c < 0x20 || c == 0x7f)
-			++curcol;
-		else if (c >= 0x80 && c <= 0xa0)
-			curcol+=2;

 		++curcol;
 	}
--- a/utf8.h
+++ b/utf8.h
@@ -6,4 +6,9 @@ typedef unsigned int unicode_t;
 unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res);
 unsigned unicode_to_utf8(unsigned int c, char *utf8);

+static inline int is_beginning_utf8(unsigned char c)	/* nonzero unless the top two bits of c are 10 */
+{
+	return (c & 0xc0) != 0x80;	/* 10xxxxxx is the UTF-8 continuation-byte pattern; any other byte can start a character */
+}
+
 #endif