Add support for a "utf-8" mode

NOTE! MicroEmacs is very much a byte-based editor, and the new utf-8
support is purely an issue of terminal input and output.  The file
contents themselves are in the 8-bit space.  In that space, Unicode is
the same as Latin1.

The new mode is called "utf-8", and is enabled automatically by the
new emacs.rc when $LANG contains the substring "UTF-8".

I'm sure people would like to some day also edit real UTF-8 contents,
rather than just edit old 8-bit Latin1 contents in a UTF-8 terminal.
However, that's an independent (and much bigger and thornier) issue.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
Linus Torvalds 2005-10-02 17:47:28 -07:00
parent 0fc43a8429
commit f313bcf64a
4 changed files with 103 additions and 5 deletions

View File

@ -282,4 +282,8 @@ bind-to-key newline ^J
!endif
!endif
; Turn on the global "utf-8" mode when $LANG contains the substring "UTF-8"
!if &gre &sin $LANG "UTF-8" 0
add-global-mode "utf-8"
!endif
set $discmd "TRUE"

View File

@ -493,7 +493,7 @@ typedef struct BUFFER {
#define BFTRUNC 0x04 /* buffer was truncated when read */
/* mode flags */
/*
 * NUMMODES counts the MD* mode bits defined here (bit 0 = MDWRAP up to
 * bit 9 = MDUTF8).  It must be defined exactly once: a second #define
 * with a different value is an incompatible macro redefinition.
 */
#define NUMMODES 10 /* # of defined modes */
#define MDWRAP 0x0001 /* word wrap */
#define MDCMOD 0x0002 /* C indentation and fence match */
@ -504,6 +504,7 @@ typedef struct BUFFER {
#define MDMAGIC 0x0040 /* regular expressions in search */
#define MDCRYPT 0x0080 /* encryption mode active */
#define MDASAVE 0x0100 /* auto-save mode */
#define MDUTF8 0x0200 /* UTF-8 input/output mode */
/*
* The starting position of a region, and the size of the region in

View File

@ -13,13 +13,13 @@ int revexist = FALSE; /* does reverse video exist? */
int flickcode = FALSE; /* do flicker suppression? */
/*
 * Upper-case names of the editor modes.  The order follows the MD* flag
 * bits: entry 0 is MDWRAP (0x0001) and entry 9 is MDUTF8 (0x0200).
 *
 * Each initializer row must end up with a comma between every pair of
 * strings; two adjacent string literals without one are concatenated by
 * the compiler and silently shift every later entry.
 */
char *modename[] = {		/* name of modes */
	"WRAP", "CMODE", "SPELL", "EXACT", "VIEW", "OVER",
	"MAGIC", "CRYPT", "ASAVE", "UTF-8"
};
/*
 * Mixed-case spellings of the same mode names, in the same order as
 * modename[] (entry 0 = "Wrap", entry 9 = "utf-8").
 *
 * As with any string table, every pair of entries needs a separating
 * comma; adjacent literals would be merged into one entry.
 */
char *mode2name[] = {		/* mixed-case name of modes */
	"Wrap", "Cmode", "Spell", "Exact", "View", "Over",
	"Magic", "Crypt", "Asave", "utf-8"
};
/*
 * One letter per mode, in the same order as modename[]:
 * Wrap, Cmode, Spell, Exact, View, Over, Magic, crYpt, Asave, Utf-8.
 * Defined exactly once; a second initialized definition of the same
 * file-scope object does not compile.
 */
char modecode[] = "WCSEVOMYAU";	/* letters to represent modes */
int gmode = 0; /* global editor mode */
int gflags = GFREAD; /* global control flag */
#if PKCODE & IBMPC

95
posix.c
View File

@ -23,6 +23,20 @@
#include <fcntl.h>
#include <errno.h>
/*
* NOTE NOTE NOTE!
*
* Uemacs is currently very much byte-oriented, and not at all UTF8-aware
* internally. However, this allows it to understand a _terminal_ that is
* in utf-8 mode, and will turn input into the 8-bit subset, and will turn
* things back into UTF8 on output.
*
* Do _not_ confuse this with the notion of actually being able to edit
* UTF-8 file _contents_. That's a totally different thing.
*/
/*
 * Non-zero when curwp and curwp->w_bufp are both non-NULL and that
 * buffer's b_mode has the MDUTF8 bit set; zero otherwise.
 */
#define utf8_mode() \
(curwp && curwp->w_bufp && (curwp->w_bufp->b_mode & MDUTF8))
static int kbdflgs; /* saved keyboard fd flags */
static int kbdpoll; /* in O_NDELAY mode */
@ -99,6 +113,22 @@ void ttclose(void)
*/
int ttputc(int c)
{
	/*
	 * Write one character to stdout; always returns TRUE.
	 *
	 * Characters are held internally as single bytes.  When utf-8 mode
	 * is on, a byte in 0x80-0xff is sent to the terminal as the two-byte
	 * UTF-8 encoding of the code point with the same value; 7-bit bytes
	 * go out unchanged.  Outside utf-8 mode c is written as given.
	 *
	 * Handling real UTF-8 file _content_ is out of scope here.
	 */
	if (utf8_mode()) {
		int byte = c & 0xff;

		if (byte < 0x80) {
			c = byte;
		} else {
			/* lead byte: 110000xx carrying the top two bits */
			fputc(0xc0 | (byte >> 6), stdout);
			/* trail byte: 10xxxxxx carrying the low six bits */
			c = 0x80 | (byte & 0x3f);
		}
	}
	fputc(c, stdout);
	return (TRUE);
}
@ -138,7 +168,70 @@ void ttflush(void)
*/
int ttgetc(void)
{
	/*
	 * Read one character from file descriptor 0.
	 *
	 * Outside utf-8 mode the raw byte is returned.  In utf-8 mode a
	 * two-byte UTF-8 sequence that encodes 0x80-0xff is folded back
	 * into the single byte it stands for; any other byte is passed
	 * through untouched.
	 *
	 * Returns a value in 0..255, or 0 when read() does not deliver
	 * exactly one byte.
	 */

	/*
	 * Byte that was read ahead while looking for a continuation byte
	 * but turned out not to be one.  -1 means "nothing pending", so a
	 * pushed-back NUL byte is not mistaken for an empty slot.
	 */
	static int pending = -1;
	unsigned char c, second;
	int n;

	if (pending >= 0) {
		c = pending;
		pending = -1;
		return c;
	}

	n = read(0, &c, 1);
	if (n != 1)
		return 0;

	if (!utf8_mode())
		return c;

	/* Normal 7-bit? */
	if (!(c & 0x80))
		return c;

	/*
	 * Unexpected UTF-8 continuation character? Maybe
	 * we're in non-UTF mode, or maybe it's a control
	 * character.. Regardless, just pass it on.
	 */
	if (!(c & 0x40))
		return c;

	/*
	 * Multi-byte sequences.. Right now we only
	 * want to get characters that can be represented
	 * in a single byte, so we're not interested in
	 * anything else..  Only lead bytes of the form
	 * 110000xx get past this test.
	 */
	if (c & 0x3c)
		return c;

	/*
	 * Two-byte sequence representing 0x80-0xff.. We want
	 * to do this read with a timeout.
	 *
	 * That needs VMIN == 0: with VMIN > 0 the VTIME timer is an
	 * inter-byte timer that only starts once a first byte has
	 * arrived, so the read would block forever waiting for it.
	 */
	ntermios.c_cc[VMIN] = 0;
	ntermios.c_cc[VTIME] = 10;	/* 1 second, in tenths */
	tcsetattr(0, TCSANOW, &ntermios);

	n = read(0, &second, 1);

	/* Undo timeout: back to blocking, one byte at a time */
	ntermios.c_cc[VMIN] = 1;
	ntermios.c_cc[VTIME] = 0;
	tcsetattr(0, TCSANOW, &ntermios);

	/* Timed out (or failed): hand back the lone lead byte */
	if (n != 1)
		return c;

	/* Not a continuation byte: return it on the next call */
	if ((second & 0xc0) != 0x80) {
		pending = second;
		return c;
	}

	/*
	 * Ok, real UTF-8 character.  Storing into an unsigned char
	 * keeps only the low 8 bits, i.e. the two payload bits of the
	 * lead byte followed by the six of the continuation byte.
	 */
	c = (c << 6) | (second & 0x3f);
	return c;
}
/* typahead: Check to see if any characters are already in the