mirror of
https://github.com/rfivet/uemacs.git
synced 2024-12-18 07:16:23 -05:00
Add support for a "utf-8" mode
NOTE! MicroEmacs is very much a byte-based editor, and the new utf-8 support is purely an issue of terminal input and output. The file contents themselves are in the 8-bit space. In that space, Unicode is the same as Latin1. The new mode is called "utf-8", and is enabled automatically by the new emacs.rc when $LANG contains the substring "UTF-8". I'm sure people would like to some day also edit real UTF-8 contents, rather than just edit old 8-bit Latin1 contents in a UTF-8 terminal. However, that's an independent (and much bigger and thornier) issue. Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
parent
0fc43a8429
commit
f313bcf64a
4
emacs.rc
4
emacs.rc
@ -282,4 +282,8 @@ bind-to-key newline ^J
|
||||
!endif
|
||||
!endif
|
||||
|
||||
!if &gre &sin $LANG "UTF-8" 0
|
||||
add-global-mode "utf-8"
|
||||
!endif
|
||||
|
||||
set $discmd "TRUE"
|
||||
|
@ -493,7 +493,7 @@ typedef struct BUFFER {
|
||||
#define BFTRUNC 0x04 /* buffer was truncated when read */
|
||||
|
||||
/* mode flags */
|
||||
#define NUMMODES 9 /* # of defined modes */
|
||||
#define NUMMODES 10 /* # of defined modes */
|
||||
|
||||
#define MDWRAP 0x0001 /* word wrap */
|
||||
#define MDCMOD 0x0002 /* C indentation and fence match */
|
||||
@ -504,6 +504,7 @@ typedef struct BUFFER {
|
||||
#define MDMAGIC 0x0040 /* regular expressions in search */
|
||||
#define MDCRYPT 0x0080 /* encryption mode active */
|
||||
#define MDASAVE 0x0100 /* auto-save mode */
|
||||
#define MDUTF8 0x0200 /* UTF-8 input/output mode */
|
||||
|
||||
/*
|
||||
* The starting position of a region, and the size of the region in
|
||||
|
@ -13,13 +13,13 @@ int revexist = FALSE; /* does reverse video exist? */
|
||||
int flickcode = FALSE; /* do flicker suppression? */
|
||||
char *modename[] = { /* name of modes */
|
||||
"WRAP", "CMODE", "SPELL", "EXACT", "VIEW", "OVER",
|
||||
"MAGIC", "CRYPT", "ASAVE"
|
||||
"MAGIC", "CRYPT", "ASAVE", "UTF-8"
|
||||
};
|
||||
char *mode2name[] = { /* name of modes */
|
||||
"Wrap", "Cmode", "Spell", "Exact", "View", "Over",
|
||||
"Magic", "Crypt", "Asave"
|
||||
"Magic", "Crypt", "Asave", "utf-8"
|
||||
};
|
||||
char modecode[] = "WCSEVOMYA"; /* letters to represent modes */
|
||||
char modecode[] = "WCSEVOMYAU"; /* letters to represent modes */
|
||||
int gmode = 0; /* global editor mode */
|
||||
int gflags = GFREAD; /* global control flag */
|
||||
#if PKCODE & IBMPC
|
||||
|
95
posix.c
95
posix.c
@ -23,6 +23,20 @@
|
||||
#include <fcntl.h>
|
||||
#include <errno.h>
|
||||
|
||||
/*
 * IMPORTANT:
 *
 * Internally uemacs works on single bytes and knows nothing about UTF-8.
 * The only thing handled here is a _terminal_ running in utf-8 mode:
 * keyboard input is folded down into the 8-bit subset, and characters
 * are expanded back into UTF-8 when they are written out.
 *
 * This is _not_ the same as editing UTF-8 file _contents_; that is a
 * separate problem which this code does not attempt to solve.
 */

/*
 * Evaluates to 1 when there is a current window, that window has a
 * buffer, and the buffer's mode word has the MDUTF8 bit set; 0 otherwise.
 */
#define utf8_mode()					\
	(curwp && curwp->w_bufp &&			\
	 (curwp->w_bufp->b_mode & MDUTF8) != 0)
|
||||
|
||||
static int kbdflgs; /* saved keyboard fd flags */
|
||||
static int kbdpoll; /* in O_NDELAY mode */
|
||||
|
||||
@ -99,6 +113,22 @@ void ttclose(void)
|
||||
*/
|
||||
int ttputc(int c)
|
||||
{
|
||||
/*
|
||||
* We always represent things in 1 byte, but if we output
|
||||
* in UTF-8, we may need to expand that into 2 bytes..
|
||||
*
|
||||
* Some day we might even be able to handle UTF-8 _content_.
|
||||
*
|
||||
* That day is not today.
|
||||
*/
|
||||
if (utf8_mode()) {
|
||||
c &= 0xff;
|
||||
if (c >= 0x80) {
|
||||
unsigned char first = (c >> 6) | 0xc0;
|
||||
fputc(first, stdout);
|
||||
c = (c & 0x3f) | 0x80;
|
||||
}
|
||||
}
|
||||
fputc(c, stdout);
|
||||
return (TRUE);
|
||||
}
|
||||
@ -138,7 +168,70 @@ void ttflush(void)
|
||||
*/
|
||||
int ttgetc(void)
|
||||
{
|
||||
return (255 & fgetc(stdin)); /* 8BIT P.K. */
|
||||
static unsigned char pending;
|
||||
unsigned char c, second;
|
||||
int n;
|
||||
|
||||
if (pending) {
|
||||
c = pending;
|
||||
pending = 0;
|
||||
return c;
|
||||
}
|
||||
|
||||
n = read(0, &c, 1);
|
||||
if (n != 1)
|
||||
return 0;
|
||||
|
||||
if (!utf8_mode())
|
||||
return c;
|
||||
|
||||
/* Normal 7-bit? */
|
||||
if (!(c & 0x80))
|
||||
return c;
|
||||
|
||||
/*
|
||||
* Unexpected UTF-8 continuation character? Maybe
|
||||
* we're in non-UTF mode, or maybe it's a control
|
||||
* character.. Regardless, just pass it on.
|
||||
*/
|
||||
if (!(c & 0x40))
|
||||
return c;
|
||||
|
||||
/*
|
||||
* Multi-byte sequences.. Right now we only
|
||||
* want to get characters that can be represented
|
||||
* in a single byte, so we're not interested in
|
||||
* anything else..
|
||||
*/
|
||||
if (c & 0x3c)
|
||||
return c;
|
||||
|
||||
/*
|
||||
* Two-byte sequence representing 0x80-0xff.. We want
|
||||
* to do this read with a timeout.
|
||||
*/
|
||||
ntermios.c_cc[VMIN] = 1;
|
||||
ntermios.c_cc[VTIME] = 10; /* 1 second */
|
||||
tcsetattr(0, TCSANOW, &ntermios);
|
||||
|
||||
n = read(0, &second, 1);
|
||||
|
||||
/* Undo timeout */
|
||||
ntermios.c_cc[VTIME] = 0;
|
||||
tcsetattr(0, TCSANOW, &ntermios);
|
||||
|
||||
if (n != 1)
|
||||
return c;
|
||||
|
||||
if ((second & 0xc0) != 0x80) {
|
||||
pending = second;
|
||||
return c;
|
||||
}
|
||||
|
||||
c = (c << 6) | (second & 0x3f);
|
||||
|
||||
/* Ok, real UTF-8 character */
|
||||
return c;
|
||||
}
|
||||
|
||||
/* typahead: Check to see if any characters are already in the
|
||||
|
Loading…
Reference in New Issue
Block a user