Add support for a "utf-8" mode

NOTE! MicroEmacs is very much a byte-based editor, and the new utf-8 support is purely an issue of terminal input and output. The file contents themselves are in the 8-bit space. In that space, Unicode is the same as Latin1.

The new mode is called "utf-8", and is enabled automatically by the new emacs.rc when $LANG contains the substring "UTF-8".

I'm sure people would like to some day also edit real UTF-8 contents, rather than just edit old 8-bit Latin1 contents in a UTF-8 terminal. However, that's an independent (and much bigger and thornier) issue.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2025-07-26 11:24:26 -04:00 · 2005-10-02 17:47:28 -07:00 · 2005-10-02 17:47:28 -07:00 · f313bcf64a
commit f313bcf64a
parent 0fc43a8429
4 changed files with 103 additions and 5 deletions
--- a/emacs.rc
+++ b/emacs.rc
@ -282,4 +282,8 @@ bind-to-key	newline	^J
 	!endif
 !endif

+!if &gre &sin $LANG "UTF-8" 0
+	add-global-mode "utf-8"
+!endif
+
 set $discmd "TRUE"
--- a/estruct.h
+++ b/estruct.h
@ -493,7 +493,7 @@ typedef struct BUFFER {
 #define	BFTRUNC	0x04		/* buffer was truncated when read */

 /*	mode flags	*/
-#define	NUMMODES	9	/* # of defined modes           */
+#define	NUMMODES	10	/* # of defined modes           */

 #define	MDWRAP	0x0001		/* word wrap                    */
 #define	MDCMOD	0x0002		/* C indentation and fence match */
@ -504,6 +504,7 @@ typedef struct BUFFER {
 #define MDMAGIC	0x0040		/* regular expresions in search */
 #define	MDCRYPT	0x0080		/* encrytion mode active        */
 #define	MDASAVE	0x0100		/* auto-save mode               */
+#define MDUTF8  0x0200		/* UTF-8 input/output mode      */

 /*
 * The starting position of a region, and the size of the region in
--- a/globals.c
+++ b/globals.c
@ -13,13 +13,13 @@ int revexist = FALSE;		/* does reverse video exist?    */
 int flickcode = FALSE;		/* do flicker supression?       */
 char *modename[] = {		/* name of modes                */
 	"WRAP", "CMODE", "SPELL", "EXACT", "VIEW", "OVER",
-	"MAGIC", "CRYPT", "ASAVE"
+	"MAGIC", "CRYPT", "ASAVE", "UTF-8"
 };
 char *mode2name[] = {		/* name of modes                */
 	"Wrap", "Cmode", "Spell", "Exact", "View", "Over",
-	"Magic", "Crypt", "Asave"
+	"Magic", "Crypt", "Asave", "utf-8"
 };
-char modecode[] = "WCSEVOMYA";	/* letters to represent modes   */
+char modecode[] = "WCSEVOMYAU";	/* letters to represent modes   */
 int gmode = 0;			/* global editor mode           */
 int gflags = GFREAD;		/* global control flag          */
 #if	PKCODE & IBMPC
--- a/posix.c
+++ b/posix.c
@ -23,6 +23,20 @@
 #include <fcntl.h>
 #include <errno.h>

+/*
+ * NOTE NOTE NOTE!
+ *
+ * Uemacs is currently very much byte-oriented, and not at all UTF8-aware
+ * internally. However, this allows it to understand a _terminal_ that is
+ * in utf-8 mode, and will turn input into the 8-bit subset, and will turn
+ * things back into UTF8 on output.
+ *
+ * Do _not_ confuse this with the notion of actually being able to edit
+ * UTF-8 file _contents_. That's a totally different thing.
+ */
+#define utf8_mode() \
+	(curwp && curwp->w_bufp && (curwp->w_bufp->b_mode & MDUTF8))
+
 static int kbdflgs;			/* saved keyboard fd flags      */
 static int kbdpoll;			/* in O_NDELAY mode             */

@ -99,6 +113,22 @@ void ttclose(void)
 */
 int ttputc(int c)
 {
+	/*
+	 * We always represent things in 1 byte, but if we output
+	 * in UTF-8, we may need to expand that into 2 bytes..
+	 *
+	 * Some day we might even be able to handle UTF-8 _content_.
+	 *
+	 * That day is not today.
+	 */
+	if (utf8_mode()) {
+		c &= 0xff;
+		if (c >= 0x80) {
+			unsigned char first = (c >> 6) | 0xc0;
+			fputc(first, stdout);
+			c = (c & 0x3f) | 0x80;
+		}
+	}
 	fputc(c, stdout);
 	return (TRUE);
 }
@ -138,7 +168,70 @@ void ttflush(void)
 */
 int ttgetc(void)
 {
-	return (255 & fgetc(stdin));	/* 8BIT P.K. */
+	static unsigned char pending;
+	unsigned char c, second;
+	int n;
+
+	if (pending) {
+		c = pending;
+		pending = 0;
+		return c;
+	}
+
+	n = read(0, &c, 1);
+	if (n != 1)
+		return 0;
+
+	if (!utf8_mode())
+		return c;
+
+	/* Normal 7-bit? */
+	if (!(c & 0x80))
+		return c;
+
+	/*
+	 * Unexpected UTF-8 continuation character? Maybe
+	 * we're in non-UTF mode, or maybe it's a control
+	 * character.. Regardless, just pass it on.
+	 */
+	if (!(c & 0x40))
+		return c;
+
+	/*
+	 * Multi-byte sequences.. Right now we only
+	 * want to get characters that can be represented
+	 * in a single byte, so we're not interested in
+	 * anything else..
+	 */
+	if (c & 0x3c)
+		return c;
+
+	/*
+	 * Two-byte sequence representing 0x80-0xff.. We want
+	 * to do this read with a timeout.
+	 */
+	ntermios.c_cc[VMIN] = 1;
+	ntermios.c_cc[VTIME] = 10;		/* 1 second */
+	tcsetattr(0, TCSANOW, &ntermios);
+
+	n = read(0, &second, 1);
+
+	/* Undo timeout */
+	ntermios.c_cc[VTIME] = 0;
+	tcsetattr(0, TCSANOW, &ntermios);
+
+	if (n != 1)
+		return c;
+
+	if ((second & 0xc0) != 0x80) {
+		pending = second;
+		return c;
+	}
+
+	c = (c << 6) | (second & 0x3f);
+
+	/* Ok, real UTF-8 character */
+	return c;
 }

 /* typahead:	Check to see if any characters are already in the