1
0
mirror of https://github.com/rfivet/uemacs.git synced 2025-02-20 06:57:11 -05:00

Split up the utf8 helper functions into a file of their own

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Linus Torvalds 2012-07-10 16:21:35 -07:00
parent 12e4647deb
commit e62cdf04cf
5 changed files with 118 additions and 66 deletions

View File

@ -20,13 +20,13 @@ SRC=ansi.c basic.c bind.c buffer.c crypt.c display.c eval.c exec.c \
file.c fileio.c ibmpc.c input.c isearch.c line.c lock.c main.c \
pklock.c posix.c random.c region.c search.c spawn.c tcap.c \
termio.c vmsvt.c vt52.c window.c word.c names.c globals.c version.c \
usage.c wrapper.c
usage.c wrapper.c utf8.c
OBJ=ansi.o basic.o bind.o buffer.o crypt.o display.o eval.o exec.o \
file.o fileio.o ibmpc.o input.o isearch.o line.o lock.o main.o \
pklock.o posix.o random.o region.o search.o spawn.o tcap.o \
termio.o vmsvt.o vt52.o window.o word.o names.o globals.o version.o \
usage.o wrapper.o
usage.o wrapper.o utf8.o
HDR=ebind.h edef.h efunc.h epath.h estruct.h evar.h util.h version.h
@ -132,7 +132,7 @@ basic.o: basic.c estruct.h edef.h
bind.o: bind.c estruct.h edef.h epath.h
buffer.o: buffer.c estruct.h edef.h
crypt.o: crypt.c estruct.h edef.h
display.o: display.c estruct.h edef.h
display.o: display.c estruct.h edef.h utf8.h
eval.o: eval.c estruct.h edef.h evar.h
exec.o: exec.c estruct.h edef.h
file.o: file.c estruct.h edef.h
@ -144,12 +144,14 @@ line.o: line.c estruct.h edef.h
lock.o: lock.c estruct.h edef.h
main.o: main.c estruct.h efunc.h edef.h ebind.h
pklock.o: pklock.c estruct.h
posix.o: posix.c estruct.h utf8.h
random.o: random.c estruct.h edef.h
region.o: region.c estruct.h edef.h
search.o: search.c estruct.h edef.h
spawn.o: spawn.c estruct.h edef.h
tcap.o: tcap.c estruct.h edef.h
termio.o: termio.c estruct.h edef.h
utf8.o: utf8.c utf8.h
vmsvt.o: vmsvt.c estruct.h edef.h
vt52.o: vt52.c estruct.h edef.h
window.o: window.c estruct.h edef.h

View File

@ -19,8 +19,7 @@
#include "line.h"
#include "version.h"
#include "wrapper.h"
typedef unsigned int unicode_t;
#include "utf8.h"
struct video {
int v_flag; /* Flags */
@ -434,50 +433,6 @@ static int reframe(struct window *wp)
return TRUE;
}
/*
 * utf8_to_unicode()
 *
 * Decode the UTF-8 sequence that starts at line[index] and store the
 * resulting value in *res.  'len' is the total number of bytes in 'line';
 * the caller must pass index < len, since line[index] is read
 * unconditionally.
 *
 * Returns the number of bytes consumed.  Anything that cannot be decoded
 * (a stray continuation byte, a bad lead byte, a bad continuation byte or
 * a sequence cut short by the end of the buffer) is returned as a single
 * byte with *res set to that byte's value, i.e. it is treated as Latin1.
 */
static unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	unsigned value;
	unsigned char c = line[index];
	unsigned bytes, mask, i;

	/* Default result: the first byte taken as-is */
	*res = c;
	line += index;
	len -= index;		/* from here on: bytes left, starting at line[0] */

	/*
	 * 0xxxxxxx is valid utf8
	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
	 */
	if (c < 0xc0)
		return 1;

	/* Ok, it's 11xxxxxx: count the leading one bits to get the length */
	mask = 0x20;
	bytes = 2;
	while (c & mask) {
		bytes++;
		mask >>= 1;
	}

	/* Invalid? Do it as a single byte Latin1 */
	if (bytes > 6)
		return 1;

	/*
	 * Sequence runs past the end of the buffer?  Checking this up front
	 * guarantees that line[1..bytes-1] below never reads beyond 'len'.
	 */
	if (bytes > len)
		return 1;

	/* Payload bits of the lead byte are the ones below the first zero bit */
	value = c & (mask-1);

	/* Ok, do the continuation bytes: each must look like 10xxxxxx */
	for (i = 1; i < bytes; i++) {
		c = line[i];
		if ((c & 0xc0) != 0x80)
			return 1;
		value = (value << 6) | (c & 0x3f);
	}
	*res = value;
	return bytes;
}
static void show_line(struct line *lp)
{
unsigned i = 0, len = llength(lp);

22
posix.c
View File

@ -22,6 +22,7 @@
#include "estruct.h"
#include "edef.h"
#include "efunc.h"
#include "utf8.h"
/* Since Mac OS X's termios.h doesn't have the following 2 macros, define them.
*/
@ -106,24 +107,11 @@ void ttclose(void)
*/
int ttputc(int c)
{
/*
 * Write the character c to stdout as a UTF-8 byte sequence.  A negative
 * c is ignored.  Every return path yields 0.
 *
 * NOTE(review): this span is a rendered diff in which removed and added
 * lines are interleaved, so utf8 and bytes each appear to be declared
 * twice.  Presumably the lines that use 'p' are the old hand-rolled
 * encoder and the lines that call unicode_to_utf8() are its replacement;
 * confirm against the real posix.c.
 */
unsigned char utf8[6], *p = utf8+5;
int bytes = 1;
char utf8[6];
int bytes;
if (c < 0)
return 0;
/* Encoder that fills the buffer backwards, starting at its last byte */
*p = c;
if (c > 0x7f) {
int prefix = 0x40;
do {
/* Emit the low six bits as a 10xxxxxx continuation byte */
*p = 0x80 + (c & 0x3f);
--p;
bytes++;
prefix >>= 1;
c >>= 6;
/* NOTE(review): stops when c == prefix, e.g. for 0x800 - verify */
} while (c > prefix);
/* Lead byte: the remaining bits plus the length marker bits */
*p = c - 2*prefix;
}
fwrite(p, 1, bytes, stdout);
/* Encode through the shared helper, then write the bytes it produced */
bytes = unicode_to_utf8(c, utf8);
fwrite(utf8, 1, bytes, stdout);
return 0;
}

98
utf8.c Normal file
View File

@ -0,0 +1,98 @@
#include "utf8.h"
/*
 * utf8_to_unicode()
 *
 * Convert the UTF-8 sequence starting at line[index] to its unicode value
 * (stored in *res), and return the length of the sequence in bytes.
 * 'len' is the total number of bytes in 'line'; the caller must pass
 * index < len, since line[index] is read unconditionally.
 *
 * NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can
 * either use it as-is (i.e. as Latin1) or you can check for invalid UTF-8
 * by checking for a length of 1 and a result > 127.  A multi-byte sequence
 * that is cut short by the end of the buffer counts as invalid.
 *
 * NOTE 2! This does *not* verify things like minimality. So overlong forms
 * are happily accepted and decoded, as are the various "invalid values".
 */
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	unsigned value;
	unsigned char c = line[index];
	unsigned bytes, mask, i;

	/* Default result: the first byte taken as-is */
	*res = c;
	line += index;
	len -= index;		/* from here on: bytes left, starting at line[0] */

	/*
	 * 0xxxxxxx is valid utf8
	 * 10xxxxxx is invalid UTF-8, we assume it is Latin1
	 */
	if (c < 0xc0)
		return 1;

	/* Ok, it's 11xxxxxx: count the leading one bits to get the length */
	mask = 0x20;
	bytes = 2;
	while (c & mask) {
		bytes++;
		mask >>= 1;
	}

	/* Invalid? Do it as a single byte Latin1 */
	if (bytes > 6)
		return 1;

	/*
	 * Sequence runs past the end of the buffer?  Checking this up front
	 * guarantees that line[1..bytes-1] below never reads beyond 'len'.
	 */
	if (bytes > len)
		return 1;

	/* Payload bits of the lead byte are the ones below the first zero bit */
	value = c & (mask-1);

	/* Ok, do the continuation bytes: each must look like 10xxxxxx */
	for (i = 1; i < bytes; i++) {
		c = line[i];
		if ((c & 0xc0) != 0x80)
			return 1;
		value = (value << 6) | (c & 0x3f);
	}
	*res = value;
	return bytes;
}
/*
 * reverse_string()
 *
 * Reverse, in place, the bytes in the inclusive range [begin, end].
 * The body always runs at least once, so the caller must pass
 * begin <= end.
 */
static void reverse_string(char *begin, char *end)
{
	do {
		char a = *begin, b = *end;
		*end = a; *begin = b;
		begin++; end--;
	} while (begin < end);
}

/*
 * unicode_to_utf8()
 *
 * Convert a unicode value to its canonical utf-8 sequence, store it in
 * 'utf8' and return the number of bytes written.  No NUL terminator is
 * added.  The buffer must have room for 6 bytes, the longest sequence
 * this can produce.
 *
 * Values above 0x7fffffff do not fit in 6 bytes; for those nothing is
 * written and 0 is returned.
 *
 * NOTE! This does not check for - or care about - the "invalid" unicode
 * values. Also, converting a utf-8 sequence to unicode and back does
 * *not* guarantee the same sequence, since this generates the shortest
 * possible sequence, while utf8_to_unicode() accepts both Latin1 and
 * overlong utf-8 sequences.
 */
unsigned unicode_to_utf8(unsigned int c, char *utf8)
{
	unsigned bytes = 1;

	/* More than 31 bits would need a 7th byte and overrun the buffer */
	if (c > 0x7fffffff)
		return 0;

	*utf8 = (char)c;
	if (c > 0x7f) {
		/* Smallest value that no longer fits in the current lead byte */
		unsigned prefix = 0x40;
		char *p = utf8;

		/*
		 * Emit continuation bytes lowest-order first; the sequence is
		 * put in the right order at the end.  Each extra byte costs
		 * the lead byte one payload bit, hence the halving of prefix.
		 * The loop has to go on while c == prefix too: that value
		 * would collide with the length marker bits of the lead byte.
		 */
		do {
			*p++ = (char)(0x80 + (c & 0x3f));
			bytes++;
			prefix >>= 1;
			c >>= 6;
		} while (c >= prefix);

		/* Lead byte: length marker bits (-2*prefix mod 256) plus payload */
		*p = (char)(c - 2*prefix);
		reverse_string(utf8, p);
	}
	return bytes;
}

9
utf8.h Normal file
View File

@ -0,0 +1,9 @@
#ifndef UTF8_H
#define UTF8_H

/* Holds one decoded character value. */
typedef unsigned int unicode_t;

/*
 * Decode the sequence starting at line[index] (line holds len bytes) into
 * *res; returns the number of bytes the sequence occupies.
 */
unsigned utf8_to_unicode(char *line,
			 unsigned index,
			 unsigned len,
			 unicode_t *res);

/* Encode c into the buffer utf8; returns the number of bytes produced. */
unsigned unicode_to_utf8(unsigned int c,
			 char *utf8);

#endif /* UTF8_H */