Add support for a "utf-8" mode

NOTE! MicroEmacs is very much a byte-based editor, and the new utf-8 support is purely an issue of terminal input and output. The file contents themselves are in the 8-bit space. In that space, Unicode is the same as Latin1. The new mode is called "utf-8", and is enabled automatically by the new emacs.rc when $LANG contains the substring "UTF-8". I'm sure people would like to some day also edit real UTF-8 contents, rather than just edit old 8-bit Latin1 contents in a UTF-8 terminal. However, that's an independent (and much bigger and thornier) issue. Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2025-07-05 16:37:38 -04:00 · 2005-10-02 17:47:28 -07:00 · 2005-10-02 17:47:28 -07:00 · f313bcf64a
commit f313bcf64a
parent 0fc43a8429
4 changed files with 103 additions and 5 deletions
--- a/emacs.rc
+++ b/emacs.rc
@ -282,4 +282,8 @@ bind-to-key	newline	^J
 	!endif
 !endif
 ; Turn on "utf-8" mode globally when $LANG contains the substring "UTF-8".
 !if &gre &sin $LANG "UTF-8" 0
 	add-global-mode "utf-8"
 !endif
 set $discmd "TRUE"
--- a/estruct.h
+++ b/estruct.h
@ -493,7 +493,7 @@ typedef struct BUFFER {
 #define	BFTRUNC	0x04		/* buffer was truncated when read */
 /*	mode flags: one bit each; keep NUMMODES equal to the flag count	*/
-#define	NUMMODES	9	/* # of defined modes           */
+#define	NUMMODES	10	/* # of defined modes           */
 #define	MDWRAP	0x0001		/* word wrap                    */
 #define	MDCMOD	0x0002		/* C indentation and fence match */
@ -504,6 +504,7 @@ typedef struct BUFFER {
 #define MDMAGIC	0x0040		/* regular expressions in search */
 #define	MDCRYPT	0x0080		/* encryption mode active       */
 #define	MDASAVE	0x0100		/* auto-save mode               */
 #define MDUTF8  0x0200		/* UTF-8 input/output mode      */
 /*
 * The starting position of a region, and the size of the region in
--- a/globals.c
+++ b/globals.c
@ -13,13 +13,13 @@ int revexist = FALSE;		/* does reverse video exist?    */
 int flickcode = FALSE;		/* do flicker suppression?      */
 /*
  * Mode name tables. The three tables are parallel: entry i lines up with
  * the mode flag whose value is (1 << i) among the MD* flags shown in
  * estruct.h (e.g. index 9 <-> MDUTF8 0x0200), so a new mode is appended
  * to all three at once.
  */
 char *modename[] = {		/* name of modes                */
 	"WRAP", "CMODE", "SPELL", "EXACT", "VIEW", "OVER",
-	"MAGIC", "CRYPT", "ASAVE"
+	"MAGIC", "CRYPT", "ASAVE", "UTF-8"
 };
 char *mode2name[] = {		/* name of modes                */
 	"Wrap", "Cmode", "Spell", "Exact", "View", "Over",
-	"Magic", "Crypt", "Asave"
+	"Magic", "Crypt", "Asave", "utf-8"
 };
-char modecode[] = "WCSEVOMYA";	/* letters to represent modes   */
+char modecode[] = "WCSEVOMYAU";	/* letters to represent modes   */
 int gmode = 0;			/* global editor mode           */
 int gflags = GFREAD;		/* global control flag          */
 #if	PKCODE & IBMPC
--- a/posix.c
+++ b/posix.c
@ -23,6 +23,20 @@
 #include <fcntl.h>
 #include <errno.h>
 /*
 * NOTE NOTE NOTE!
 *
 * Uemacs is currently very much byte-oriented, and not at all UTF8-aware
 * internally. However, this allows it to understand a _terminal_ that is
 * in utf-8 mode, and will turn input into the 8-bit subset, and will turn
 * things back into UTF8 on output.
 *
 * Do _not_ confuse this with the notion of actually being able to edit
 * UTF-8 file _contents_. That's a totally different thing.
 */
 /* Non-zero when a current window with a buffer exists and that buffer has MDUTF8 set. */
 #define utf8_mode() \
 	(curwp && curwp->w_bufp && (curwp->w_bufp->b_mode & MDUTF8))
 static int kbdflgs;			/* saved keyboard fd flags      */
 static int kbdpoll;			/* in O_NDELAY mode             */
@ -99,6 +113,22 @@ void ttclose(void)
 */
 int ttputc(int c)
 {
 	int out = c;
 	/*
 	 * Internally every character is a single byte. When the
 	 * terminal is in UTF-8 mode the upper half (0x80-0xff) has
 	 * to go out as two bytes: a 110xxxxx lead byte carrying the
 	 * top two bits, then a 10xxxxxx continuation byte carrying
 	 * the low six bits. Real UTF-8 _content_ is still not
 	 * handled here.
 	 */
 	if (utf8_mode()) {
 		out &= 0xff;
 		if (out & 0x80) {
 			unsigned char lead = 0xc0 | (out >> 6);
 			fputc(lead, stdout);
 			out = 0x80 | (out & 0x3f);
 		}
 	}
 	fputc(out, stdout);
 	return (TRUE);
 }
@ -138,7 +168,70 @@ void ttflush(void)
 */
 int ttgetc(void)
 {
-	return (255 & fgetc(stdin));	/* 8BIT P.K. */
+	static unsigned char pending;
 	static int have_pending;	/* 'pending' holds an unread byte */
 	unsigned char c, second;
 	int n;
 	/*
 	 * Read one input character from fd 0. Returns the byte read,
 	 * or 0 when read() does not deliver exactly one byte. In
 	 * utf-8 mode a two-byte sequence that encodes 0x80-0xff is
 	 * folded into a single byte; everything else is passed
 	 * through unchanged.
 	 */
 	/*
 	 * A byte read ahead by the previous call goes out first.
 	 * A separate flag is used so that a read-ahead NUL byte
 	 * is not mistaken for "nothing pending" and lost.
 	 */
 	if (have_pending) {
 		have_pending = 0;
 		return pending;
 	}
 	n = read(0, &c, 1);
 	if (n != 1)
 		return 0;
 	if (!utf8_mode())
 		return c;
 	/* Normal 7-bit? */
 	if (!(c & 0x80))
 		return c;
 	/*
 	 * Unexpected UTF-8 continuation character? Maybe
 	 * we're in non-UTF mode, or maybe it's a control
 	 * character.. Regardless, just pass it on.
 	 */
 	if (!(c & 0x40))
 		return c;
 	/*
 	 * Multi-byte sequences.. Right now we only
 	 * want to get characters that can be represented
 	 * in a single byte, so we're not interested in
 	 * anything else..
 	 */
 	if (c & 0x3c)
 		return c;
 	/*
 	 * Two-byte sequence representing 0x80-0xff.. We want
 	 * to do this read with a timeout. With VMIN > 0 the
 	 * VTIME timer only starts once a first byte has arrived,
 	 * so the read could block forever; VMIN = 0 turns VTIME
 	 * into a real timeout for the whole read.
 	 */
 	ntermios.c_cc[VMIN] = 0;
 	ntermios.c_cc[VTIME] = 10;		/* 1 second */
 	tcsetattr(0, TCSANOW, &ntermios);
 	n = read(0, &second, 1);
 	/* Undo timeout: back to blocking single-byte reads */
 	ntermios.c_cc[VMIN] = 1;
 	ntermios.c_cc[VTIME] = 0;
 	tcsetattr(0, TCSANOW, &ntermios);
 	/* Timed out or failed: hand the lead byte through as-is */
 	if (n != 1)
 		return c;
 	/* Not a continuation byte: return the lead now, the other next call */
 	if ((second & 0xc0) != 0x80) {
 		pending = second;
 		have_pending = 1;
 		return c;
 	}
 	/* Ok, real UTF-8 character: the unsigned char keeps the low 8 bits */
 	c = (c << 6) | (second & 0x3f);
 	return c;
 }
 /* typahead:	Check to see if any characters are already in the