mirror of
https://github.com/rfivet/uemacs.git
synced 2024-12-18 15:26:23 -05:00
Add support for a "utf-8" mode
NOTE! MicroEmacs is very much a byte-based editor, and the new utf-8 support is purely an issue of terminal input and output. The file contents themselves are in the 8-bit space. In that space, Unicode is the same as Latin1. The new mode is called "utf-8", and is enabled automatically by the new emacs.rc when $LANG contains the substring "UTF-8". I'm sure people would like to some day also edit real UTF-8 contents, rather than just edit old 8-bit Latin1 contents in a UTF-8 terminal. However, that's an independent (and much bigger and thornier) issue. Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
parent
0fc43a8429
commit
f313bcf64a
4
emacs.rc
4
emacs.rc
@ -282,4 +282,8 @@ bind-to-key newline ^J
|
|||||||
!endif
|
!endif
|
||||||
!endif
|
!endif
|
||||||
|
|
||||||
|
!if &gre &sin $LANG "UTF-8" 0
|
||||||
|
add-global-mode "utf-8"
|
||||||
|
!endif
|
||||||
|
|
||||||
set $discmd "TRUE"
|
set $discmd "TRUE"
|
||||||
|
@ -493,7 +493,7 @@ typedef struct BUFFER {
|
|||||||
#define BFTRUNC 0x04 /* buffer was truncated when read */
|
#define BFTRUNC 0x04 /* buffer was truncated when read */
|
||||||
|
|
||||||
/* mode flags */
|
/* mode flags */
|
||||||
#define NUMMODES 9 /* # of defined modes */
|
#define NUMMODES 10 /* # of defined modes */
|
||||||
|
|
||||||
#define MDWRAP 0x0001 /* word wrap */
|
#define MDWRAP 0x0001 /* word wrap */
|
||||||
#define MDCMOD 0x0002 /* C indentation and fence match */
|
#define MDCMOD 0x0002 /* C indentation and fence match */
|
||||||
@ -504,6 +504,7 @@ typedef struct BUFFER {
|
|||||||
#define MDMAGIC 0x0040 /* regular expressions in search */
|
#define MDMAGIC 0x0040 /* regular expressions in search */
|
||||||
#define MDCRYPT 0x0080 /* encryption mode active */
|
#define MDCRYPT 0x0080 /* encryption mode active */
|
||||||
#define MDASAVE 0x0100 /* auto-save mode */
|
#define MDASAVE 0x0100 /* auto-save mode */
|
||||||
|
#define MDUTF8 0x0200 /* UTF-8 input/output mode */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The starting position of a region, and the size of the region in
|
* The starting position of a region, and the size of the region in
|
||||||
|
@ -13,13 +13,13 @@ int revexist = FALSE; /* does reverse video exist? */
|
|||||||
int flickcode = FALSE; /* do flicker suppression? */
|
int flickcode = FALSE; /* do flicker suppression? */
|
||||||
char *modename[] = { /* name of modes */
|
char *modename[] = { /* name of modes */
|
||||||
"WRAP", "CMODE", "SPELL", "EXACT", "VIEW", "OVER",
|
"WRAP", "CMODE", "SPELL", "EXACT", "VIEW", "OVER",
|
||||||
"MAGIC", "CRYPT", "ASAVE"
|
"MAGIC", "CRYPT", "ASAVE", "UTF-8"
|
||||||
};
|
};
|
||||||
char *mode2name[] = { /* name of modes */
|
char *mode2name[] = { /* name of modes */
|
||||||
"Wrap", "Cmode", "Spell", "Exact", "View", "Over",
|
"Wrap", "Cmode", "Spell", "Exact", "View", "Over",
|
||||||
"Magic", "Crypt", "Asave"
|
"Magic", "Crypt", "Asave", "utf-8"
|
||||||
};
|
};
|
||||||
char modecode[] = "WCSEVOMYA"; /* letters to represent modes */
|
char modecode[] = "WCSEVOMYAU"; /* letters to represent modes */
|
||||||
int gmode = 0; /* global editor mode */
|
int gmode = 0; /* global editor mode */
|
||||||
int gflags = GFREAD; /* global control flag */
|
int gflags = GFREAD; /* global control flag */
|
||||||
#if PKCODE & IBMPC
|
#if PKCODE & IBMPC
|
||||||
|
95
posix.c
95
posix.c
@ -23,6 +23,20 @@
|
|||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* NOTE NOTE NOTE!
|
||||||
|
*
|
||||||
|
* Uemacs is currently very much byte-oriented, and not at all UTF8-aware
|
||||||
|
* internally. However, this allows it to understand a _terminal_ that is
|
||||||
|
* in utf-8 mode, and will turn input into the 8-bit subset, and will turn
|
||||||
|
* things back into UTF8 on output.
|
||||||
|
*
|
||||||
|
* Do _not_ confuse this with the notion of actually being able to edit
|
||||||
|
* UTF-8 file _contents_. That's a totally different thing.
|
||||||
|
*/
|
||||||
|
#define utf8_mode() \
|
||||||
|
(curwp && curwp->w_bufp && (curwp->w_bufp->b_mode & MDUTF8))
|
||||||
|
|
||||||
static int kbdflgs; /* saved keyboard fd flags */
|
static int kbdflgs; /* saved keyboard fd flags */
|
||||||
static int kbdpoll; /* in O_NDELAY mode */
|
static int kbdpoll; /* in O_NDELAY mode */
|
||||||
|
|
||||||
@ -99,6 +113,22 @@ void ttclose(void)
|
|||||||
*/
|
*/
|
||||||
int ttputc(int c)
|
int ttputc(int c)
|
||||||
{
|
{
|
||||||
|
/*
|
||||||
|
* We always represent things in 1 byte, but if we output
|
||||||
|
* in UTF-8, we may need to expand that into 2 bytes..
|
||||||
|
*
|
||||||
|
* Some day we might even be able to handle UTF-8 _content_.
|
||||||
|
*
|
||||||
|
* That day is not today.
|
||||||
|
*/
|
||||||
|
if (utf8_mode()) {
|
||||||
|
c &= 0xff;
|
||||||
|
if (c >= 0x80) {
|
||||||
|
unsigned char first = (c >> 6) | 0xc0;
|
||||||
|
fputc(first, stdout);
|
||||||
|
c = (c & 0x3f) | 0x80;
|
||||||
|
}
|
||||||
|
}
|
||||||
fputc(c, stdout);
|
fputc(c, stdout);
|
||||||
return (TRUE);
|
return (TRUE);
|
||||||
}
|
}
|
||||||
@ -138,7 +168,70 @@ void ttflush(void)
|
|||||||
*/
|
*/
|
||||||
int ttgetc(void)
|
int ttgetc(void)
|
||||||
{
|
{
|
||||||
return (255 & fgetc(stdin)); /* 8BIT P.K. */
|
static unsigned char pending;
|
||||||
|
unsigned char c, second;
|
||||||
|
int n;
|
||||||
|
|
||||||
|
if (pending) {
|
||||||
|
c = pending;
|
||||||
|
pending = 0;
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
n = read(0, &c, 1);
|
||||||
|
if (n != 1)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (!utf8_mode())
|
||||||
|
return c;
|
||||||
|
|
||||||
|
/* Normal 7-bit? */
|
||||||
|
if (!(c & 0x80))
|
||||||
|
return c;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Unexpected UTF-8 continuation character? Maybe
|
||||||
|
* we're in non-UTF mode, or maybe it's a control
|
||||||
|
* character.. Regardless, just pass it on.
|
||||||
|
*/
|
||||||
|
if (!(c & 0x40))
|
||||||
|
return c;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Multi-byte sequences.. Right now we only
|
||||||
|
* want to get characters that can be represented
|
||||||
|
* in a single byte, so we're not interested in
|
||||||
|
* anything else..
|
||||||
|
*/
|
||||||
|
if (c & 0x3c)
|
||||||
|
return c;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Two-byte sequence representing 0x80-0xff.. We want
|
||||||
|
* to do this read with a timeout.
|
||||||
|
*/
|
||||||
|
ntermios.c_cc[VMIN] = 1;
|
||||||
|
ntermios.c_cc[VTIME] = 10; /* 1 second */
|
||||||
|
tcsetattr(0, TCSANOW, &ntermios);
|
||||||
|
|
||||||
|
n = read(0, &second, 1);
|
||||||
|
|
||||||
|
/* Undo timeout */
|
||||||
|
ntermios.c_cc[VTIME] = 0;
|
||||||
|
tcsetattr(0, TCSANOW, &ntermios);
|
||||||
|
|
||||||
|
if (n != 1)
|
||||||
|
return c;
|
||||||
|
|
||||||
|
if ((second & 0xc0) != 0x80) {
|
||||||
|
pending = second;
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
c = (c << 6) | (second & 0x3f);
|
||||||
|
|
||||||
|
/* Ok, real UTF-8 character */
|
||||||
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* typahead: Check to see if any characters are already in the
|
/* typahead: Check to see if any characters are already in the
|
||||||
|
Loading…
Reference in New Issue
Block a user