mirror of
https://github.com/rfivet/uemacs.git
synced 2025-02-20 06:57:11 -05:00
Split up the utf8 helper functions into a file of their own
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
12e4647deb
commit
e62cdf04cf
8
Makefile
8
Makefile
@ -20,13 +20,13 @@ SRC=ansi.c basic.c bind.c buffer.c crypt.c display.c eval.c exec.c \
|
||||
file.c fileio.c ibmpc.c input.c isearch.c line.c lock.c main.c \
|
||||
pklock.c posix.c random.c region.c search.c spawn.c tcap.c \
|
||||
termio.c vmsvt.c vt52.c window.c word.c names.c globals.c version.c \
|
||||
usage.c wrapper.c
|
||||
usage.c wrapper.c utf8.c
|
||||
|
||||
OBJ=ansi.o basic.o bind.o buffer.o crypt.o display.o eval.o exec.o \
|
||||
file.o fileio.o ibmpc.o input.o isearch.o line.o lock.o main.o \
|
||||
pklock.o posix.o random.o region.o search.o spawn.o tcap.o \
|
||||
termio.o vmsvt.o vt52.o window.o word.o names.o globals.o version.o \
|
||||
usage.o wrapper.o
|
||||
usage.o wrapper.o utf8.o
|
||||
|
||||
HDR=ebind.h edef.h efunc.h epath.h estruct.h evar.h util.h version.h
|
||||
|
||||
@ -132,7 +132,7 @@ basic.o: basic.c estruct.h edef.h
|
||||
bind.o: bind.c estruct.h edef.h epath.h
|
||||
buffer.o: buffer.c estruct.h edef.h
|
||||
crypt.o: crypt.c estruct.h edef.h
|
||||
display.o: display.c estruct.h edef.h
|
||||
display.o: display.c estruct.h edef.h utf8.h
|
||||
eval.o: eval.c estruct.h edef.h evar.h
|
||||
exec.o: exec.c estruct.h edef.h
|
||||
file.o: file.c estruct.h edef.h
|
||||
@ -144,12 +144,14 @@ line.o: line.c estruct.h edef.h
|
||||
lock.o: lock.c estruct.h edef.h
|
||||
main.o: main.c estruct.h efunc.h edef.h ebind.h
|
||||
pklock.o: pklock.c estruct.h
|
||||
posix.o: posix.c estruct.h utf8.h
|
||||
random.o: random.c estruct.h edef.h
|
||||
region.o: region.c estruct.h edef.h
|
||||
search.o: search.c estruct.h edef.h
|
||||
spawn.o: spawn.c estruct.h edef.h
|
||||
tcap.o: tcap.c estruct.h edef.h
|
||||
termio.o: termio.c estruct.h edef.h
|
||||
utf8.o: utf8.c utf8.h
|
||||
vmsvt.o: vmsvt.c estruct.h edef.h
|
||||
vt52.o: vt52.c estruct.h edef.h
|
||||
window.o: window.c estruct.h edef.h
|
||||
|
47
display.c
47
display.c
@ -19,8 +19,7 @@
|
||||
#include "line.h"
|
||||
#include "version.h"
|
||||
#include "wrapper.h"
|
||||
|
||||
typedef unsigned int unicode_t;
|
||||
#include "utf8.h"
|
||||
|
||||
struct video {
|
||||
int v_flag; /* Flags */
|
||||
@ -434,50 +433,6 @@ static int reframe(struct window *wp)
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
 * Decode the UTF-8 sequence that starts at line[index].
 *
 * line:  buffer holding the text
 * index: offset of the first byte of the sequence; must be < len
 * len:   total number of bytes in line
 * res:   receives the decoded value
 *
 * Returns the number of bytes consumed. Anything that is not a complete,
 * well-formed multi-byte sequence (a stray continuation byte, a bad lead
 * byte, a sequence cut short by the end of the buffer) is returned as a
 * single byte with *res set to that byte, i.e. it is treated as Latin1.
 * Overlong forms are not rejected.
 */
static unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	unsigned value;
	unsigned char c = line[index];
	unsigned bytes, mask, i;

	*res = c;
	line += index;
	len -= index;	/* from here on: bytes available starting at line[0] */

	/*
	 * 0xxxxxxx is valid utf8
	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
	 */
	if (c < 0xc0)
		return 1;

	/* Ok, it's 11xxxxxx: count the leading one bits to get the length */
	mask = 0x20;
	bytes = 2;
	while (c & mask) {
		bytes++;
		mask >>= 1;
	}

	/*
	 * Invalid lead byte, or the sequence would run past the end of the
	 * buffer? Do it as a single byte Latin1. Checking the length up
	 * front guarantees the loop below never reads line[len] or beyond.
	 */
	if (bytes > 6 || bytes > len)
		return 1;

	/* Payload bits of the lead byte are the ones below the terminating 0 */
	value = c & (mask - 1);

	/* Ok, do the continuation bytes */
	for (i = 1; i < bytes; i++) {
		c = line[i];
		if ((c & 0xc0) != 0x80)
			return 1;
		value = (value << 6) | (c & 0x3f);
	}
	*res = value;
	return bytes;
}
|
||||
|
||||
static void show_line(struct line *lp)
|
||||
{
|
||||
unsigned i = 0, len = llength(lp);
|
||||
|
22
posix.c
22
posix.c
@ -22,6 +22,7 @@
|
||||
#include "estruct.h"
|
||||
#include "edef.h"
|
||||
#include "efunc.h"
|
||||
#include "utf8.h"
|
||||
|
||||
/* Since Mac OS X's termios.h doesn't have the following 2 macros, define them.
|
||||
*/
|
||||
@ -106,24 +107,11 @@ void ttclose(void)
|
||||
*/
|
||||
int ttputc(int c)
|
||||
{
|
||||
unsigned char utf8[6], *p = utf8+5;
|
||||
int bytes = 1;
|
||||
char utf8[6];
|
||||
int bytes;
|
||||
|
||||
if (c < 0)
|
||||
return 0;
|
||||
*p = c;
|
||||
if (c > 0x7f) {
|
||||
int prefix = 0x40;
|
||||
do {
|
||||
*p = 0x80 + (c & 0x3f);
|
||||
--p;
|
||||
bytes++;
|
||||
prefix >>= 1;
|
||||
c >>= 6;
|
||||
} while (c > prefix);
|
||||
*p = c - 2*prefix;
|
||||
}
|
||||
fwrite(p, 1, bytes, stdout);
|
||||
bytes = unicode_to_utf8(c, utf8);
|
||||
fwrite(utf8, 1, bytes, stdout);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
98
utf8.c
Normal file
98
utf8.c
Normal file
@ -0,0 +1,98 @@
|
||||
#include "utf8.h"
|
||||
|
||||
/*
 * utf8_to_unicode()
 *
 * Convert the UTF-8 sequence starting at line[index] to its unicode value,
 * and return the length of the sequence in bytes.
 *
 * line:  buffer holding the text
 * index: offset of the first byte of the sequence; must be < len
 * len:   total number of bytes in line
 * res:   receives the decoded value
 *
 * NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can
 * either use it as-is (ie as Latin1) or you can check for invalid UTF-8
 * by checking for a length of 1 and a result > 127. A multi-byte sequence
 * that is cut short by the end of the buffer counts as invalid too; no
 * byte at or beyond line[len] is ever read.
 *
 * NOTE 2! This does *not* verify things like minimality. So overlong forms
 * are happily accepted and decoded, as are the various "invalid values".
 */
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	unsigned value;
	unsigned char c = line[index];
	unsigned bytes, mask, i;

	*res = c;
	line += index;
	len -= index;	/* from here on: bytes available starting at line[0] */

	/*
	 * 0xxxxxxx is valid utf8
	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
	 */
	if (c < 0xc0)
		return 1;

	/* Ok, it's 11xxxxxx: count the leading one bits to get the length */
	mask = 0x20;
	bytes = 2;
	while (c & mask) {
		bytes++;
		mask >>= 1;
	}

	/*
	 * Invalid lead byte, or the sequence would run past the end of the
	 * buffer? Do it as a single byte Latin1. Checking the length up
	 * front guarantees the loop below stays inside the buffer.
	 */
	if (bytes > 6 || bytes > len)
		return 1;

	/* Payload bits of the lead byte are the ones below the terminating 0 */
	value = c & (mask - 1);

	/* Ok, do the continuation bytes */
	for (i = 1; i < bytes; i++) {
		c = line[i];
		if ((c & 0xc0) != 0x80)
			return 1;
		value = (value << 6) | (c & 0x3f);
	}
	*res = value;
	return bytes;
}
|
||||
|
||||
/*
 * Reverse the bytes in the inclusive range [begin, end] in place.
 * At least one swap is always performed, so callers must pass
 * begin <= end.
 */
static void reverse_string(char *begin, char *end)
{
	do {
		char a = *begin, b = *end;
		*end = a; *begin = b;
		begin++; end--;
	} while (begin < end);
}

/*
 * unicode_to_utf8()
 *
 * Convert a unicode value to its canonical utf-8 sequence.
 *
 * c:    value to encode
 * utf8: output buffer; not NUL-terminated. It needs room for 6 bytes for
 *       values up to 0x7fffffff. Larger values are not rejected and
 *       produce a 7-byte sequence.
 *
 * Returns the number of bytes written.
 *
 * NOTE! This does not check for - or care about - the "invalid" unicode
 * values. Also, converting a utf-8 sequence to unicode and back does
 * *not* guarantee the same sequence, since this generates the shortest
 * possible sequence, while utf8_to_unicode() accepts both Latin1 and
 * overlong utf-8 sequences.
 */
unsigned unicode_to_utf8(unsigned int c, char *utf8)
{
	unsigned bytes = 1;

	*utf8 = (char)c;
	if (c > 0x7f) {
		/*
		 * The sequence is built backwards (low six bits first) and
		 * reversed at the end. "prefix" tracks the first value that
		 * no longer fits in the lead byte for the current length:
		 * 0x20 for two bytes, 0x10 for three, and so on.
		 */
		unsigned prefix = 0x40;
		char *p = utf8;

		do {
			*p++ = (char)(0x80 + (c & 0x3f));
			bytes++;
			prefix >>= 1;
			c >>= 6;
			/*
			 * ">=" and not ">": a remainder equal to prefix does
			 * not fit the lead byte and needs one more
			 * continuation byte (e.g. U+0800, U+10000).
			 */
		} while (c >= prefix);

		/* Lead byte: high marker bits (256 - 2*prefix) plus payload */
		*p = (char)(c - 2 * prefix);
		reverse_string(utf8, p);
	}
	return bytes;
}
|
Loading…
x
Reference in New Issue
Block a user