/* uemacs/utf8.c */

#include "utf8.h"

/*
 * utf8_to_unicode()
 *
 * Decode the UTF-8 sequence that starts at line[index], store its unicode
 * value in *res and return the length of the sequence in bytes (1 to 4).
 *
 *   line  - buffer holding the text.  line[index] is always read, so the
 *           caller must make sure that byte is accessible.
 *   index - offset of the first byte of the sequence.
 *   len   - total number of valid bytes in line (not the number remaining).
 *   res   - receives the decoded value, or the raw byte line[index] when
 *           the sequence is rejected.
 *
 * NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can
 * either use it as-is (ie as Latin1) or you can check for invalid UTF-8
 * by checking for a length of 1 and a result > 127.
 *
 * NOTE 2! Validation is only partial.  Lead bytes 0xC0 and 0xC1, lead bytes
 * above 0xF4 and decoded values above 0x10FFFF are rejected, but overlong
 * three- and four-byte forms and surrogate values are still decoded.
 */
unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)
{
	unicode_t	value ;
	unsigned char c = line[index];
	unsigned bytes, mask, i;
	unsigned avail;

	/* Default result: the byte itself, used for every "return 1" below. */
	*res = c;

	/*
	 * 0xxxxxxx is valid one byte utf8
	 * 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
	 * 1100000x is start of overlong encoding sequence
	 * Sequence longer than 4 bytes are invalid
	 */
	if( c <= 0xc0 || c > 0xF4 || c == 0xC1)
		return 1;

	/*
	 * Ok, it's 11xxxxxx: count the leading one bits to get the sequence
	 * length.  Since c <= 0xF4 the loop stops at 4 bytes at most, and
	 * mask ends up on the first zero bit of the lead byte.
	 */
	mask = 0x20;
	bytes = 2;
	while (c & mask) {
		bytes++;
		mask >>= 1;
	}

	/*
	 * Bytes available from index to the end of the buffer.  The
	 * subtraction is unsigned, so index > len must not be allowed to wrap
	 * around to a huge count that would defeat the truncation check.
	 */
	avail = (index < len) ? len - index : 0;
	if (bytes > avail)
		return 1;

	/* Payload bits of the lead byte are the ones below the zero bit. */
	value = c & (mask-1);

	/* Ok, do the bytes: each continuation must look like 10xxxxxx */
	line += index;
	for (i = 1; i < bytes; i++) {
		c = line[i];
		if ((c & 0xc0) != 0x80)
			return 1;
		value = (value << 6) | (c & 0x3f);
	}

	/* A 0xF4 lead byte can still encode values past the unicode range. */
	if( value > 0x10FFFF)
		return 1 ;

	*res = value;
	return bytes;
}

/*
 * Reverse, in place, the characters from begin up to and including end.
 * The first pair is always exchanged, so callers are expected to pass
 * begin <= end.
 */
static void reverse_string(char *begin, char *end)
{
	for (;;) {
		char tmp = *begin;

		*begin++ = *end;
		*end-- = tmp;

		/* Done once the two cursors have met or crossed. */
		if (begin >= end)
			break;
	}
}

/*
 * unicode_to_utf8()
 *
 * Encode the unicode value c as its shortest UTF-8 sequence, store the
 * bytes at utf8 and return how many were stored.  No terminating NUL is
 * written.
 *
 * NOTE! No validation of c is done here: "invalid" unicode values are
 * encoded like any other.  A round trip through utf8_to_unicode() and back
 * is also *not* guaranteed to give the same bytes, because the output here
 * is always minimal while the decoder also accepts Latin1 bytes and
 * overlong sequences.
 */
unsigned unicode_to_utf8(unsigned int c, char *utf8)
{
	unsigned int lead_limit = 0x40;
	char *tail = utf8;
	int count = 1;

	if (c <= 0x7f) {
		/* Plain ASCII is stored unchanged as a single byte. */
		*utf8 = c;
		return count;
	}

	/*
	 * Peel off six bits at a time, least significant group first, and
	 * store each group as a 10xxxxxx continuation byte.  Every extra
	 * byte halves the room left in the lead byte; stop as soon as the
	 * remaining bits fit there.
	 */
	for (;;) {
		*tail++ = 0x80 + (c & 0x3f);
		count++;
		lead_limit >>= 1;
		c >>= 6;
		if (c < lead_limit)
			break;
	}

	/* Lead byte: length marker bits on top, remaining payload below. */
	*tail = c - 2 * lead_limit;

	/* The bytes were produced last-first; put them in stream order. */
	reverse_string(utf8, tail);
	return count;
}
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 23:21:35 +00:00			`#include "utf8.h"`

			`/*`
			`* utf8_to_unicode()`
			`*`
			`* Convert a UTF-8 sequence to its unicode value, and return the length of`
			`* the sequence in bytes.`
			`*`
			`* NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can`
			`* either use it as-is (ie as Latin1) or you can check for invalid UTF-8`
			`* by checking for a length of 1 and a result > 127.`
			`*`
			`* NOTE 2! This does not verify things like minimality. So overlong forms`
			`* are happily accepted and decoded, as are the various "invalid values".`
			`*/`
			`unsigned utf8_to_unicode(char *line, unsigned index, unsigned len, unicode_t *res)`
			`{`
Review cursor movement in presence of mixed latin1/unicode encoding. 2015-02-04 04:37:57 +00:00			`unicode_t value ;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 23:21:35 +00:00			`unsigned char c = line[index];`
			`unsigned bytes, mask, i;`

			`*res = c;`

			`/*`
Review cursor movement in presence of mixed latin1/unicode encoding. 2015-02-04 04:37:57 +00:00			`* 0xxxxxxx is valid one byte utf8`
			`* 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1`
			`* 1100000x is start of overlong encoding sequence`
			`* Sequence longer than 4 bytes are invalid`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 23:21:35 +00:00			`*/`
Review cursor movement in presence of mixed latin1/unicode encoding. 2015-02-04 04:37:57 +00:00			`if( c <= 0xc0 \|\| c > 0xF4 \|\| c == 0xC1)`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 23:21:35 +00:00			`return 1;`

			`/* Ok, it's 11xxxxxx, do a stupid decode */`
			`mask = 0x20;`
			`bytes = 2;`
			`while (c & mask) {`
			`bytes++;`
			`mask >>= 1;`
			`}`

			`/* Invalid? Do it as a single byte Latin1 */`
Review cursor movement in presence of mixed latin1/unicode encoding. 2015-02-04 04:37:57 +00:00			`/* if (bytes > 6) * bytes is <= 4 as we limit c value to max 0xF4`
			`return 1; *`
			`*/`
			`len -= index;`
utf8: make sure to honor the array length properly Right now the input side can give partial utf8 input, and that showed that we didn't properly handle that case. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-11 00:56:53 +00:00			`if (bytes > len)`
			`return 1;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 23:21:35 +00:00
			`value = c & (mask-1);`

			`/* Ok, do the bytes */`
Review cursor movement in presence of mixed latin1/unicode encoding. 2015-02-04 04:37:57 +00:00			`line += index;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 23:21:35 +00:00			`for (i = 1; i < bytes; i++) {`
			`c = line[i];`
			`if ((c & 0xc0) != 0x80)`
			`return 1;`
			`value = (value << 6) \| (c & 0x3f);`
			`}`
Review cursor movement in presence of mixed latin1/unicode encoding. 2015-02-04 04:37:57 +00:00
			`if( value > 0x10FFFF)`
			`return 1 ;`

Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 23:21:35 +00:00			`*res = value;`
			`return bytes;`
			`}`

			`static void reverse_string(char *begin, char *end)`
			`{`
			`do {`
			`char a = *begin, b = *end;`
			`*end = a; *begin = b;`
			`begin++; end--;`
			`} while (begin < end);`
			`}`

			`/*`
			`* unicode_to_utf8()`
			`*`
			`* Convert a unicode value to its canonical utf-8 sequence.`
			`*`
			`* NOTE! This does not check for - or care about - the "invalid" unicode`
			`* values. Also, converting a utf-8 sequence to unicode and back does`
			`* not guarantee the same sequence, since this generates the shortest`
			`* possible sequence, while utf8_to_unicode() accepts both Latin1 and`
			`* overlong utf-8 sequences.`
			`*/`
			`unsigned unicode_to_utf8(unsigned int c, char *utf8)`
			`{`
			`int bytes = 1;`

			`*utf8 = c;`
			`if (c > 0x7f) {`
			`int prefix = 0x40;`
			`char *p = utf8;`
			`do {`
			`*p++ = 0x80 + (c & 0x3f);`
			`bytes++;`
			`prefix >>= 1;`
			`c >>= 6;`
Insure correct UTF-8 encoding: asc( chr( 0x800)) == 0x800. buffer-position displays unicode value of character under cursor instead of first byte of unicode sequence. 2015-02-06 05:20:51 +00:00			`} while( c >= prefix) ;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 23:21:35 +00:00			`*p = c - 2*prefix;`
			`reverse_string(utf8, p);`
			`}`
			`return bytes;`
			`}`