uemacs/utf8.c

/* utf8.c -- implements utf8.h, converts between unicode and UTF-8 */

#define _XOPEN_SOURCE	/* wcwidth in wchar.h */

#include "utf8.h"

#include <assert.h>
#include <wchar.h>

/*
 * utf8_width()
 *
 * Return the display width of unicode character c, as reported by
 * wcwidth().
 *
 * When built with CYGWIN set, wchar_t is asserted to be 2 bytes wide; code
 * points from 0x10000 upward are then never handed to wcwidth() and -1 is
 * returned for them instead.
 */
int utf8_width( unicode_t c) {
#if CYGWIN
	assert( sizeof( wchar_t) == 2) ;	/* wcwidth only supports UTF-16 */
	if( c >= 0x10000)					/* does not fit in one wchar_t */
		return -1 ;
#endif

	return wcwidth( (wchar_t) c) ;
}

/*
 * utf8_to_unicode()
 *
 * Decode the UTF-8 sequence starting at line[ index] (len being the total
 * number of bytes in line), store the decoded value in *res and return the
 * number of bytes consumed.
 *
 * Returns 0, leaving *res untouched, when index is not below len.
 *
 * Returns 1, with *res set to the value of the single byte at
 * line[ index], when that byte is plain ASCII or when no sequence can be
 * decoded from there, that is when:
 *  - the byte is not a lead byte in the range C2..F4 (continuation bytes,
 *    the overlong leads C0 and C1, and leads of sequences that would
 *    exceed 4 bytes or 0x10FFFF),
 *  - the sequence announced by the lead byte runs past len,
 *  - one of the following bytes is not of the form 10xxxxxx,
 *  - the decoded value is above 0x10FFFF.
 * The caller can use such a byte as-is (ie as Latin1) or detect the
 * failure by checking for a length of 1 and a result > 127.
 *
 * Otherwise returns 2, 3 or 4 with *res set to the decoded value.
 *
 * NOTE! No other check is made on the decoded value: in particular 3 and
 * 4 bytes sequences are not tested for minimality.
 */
unsigned utf8_to_unicode( const char *line, unsigned index, unsigned len,
						  unicode_t *res) {
	const char *seq ;		/* start of the sequence being decoded */
	unicode_t code ;		/* value accumulated so far */
	unsigned lead ;			/* first byte of the sequence */
	unsigned size ;			/* sequence length announced by lead */
	unsigned k ;

	if( index >= len)
		return 0 ;

	seq = &line[ index] ;
	lead = seq[ 0] & 0xFFU ;

	/* Whatever goes wrong below, the caller gets the raw byte */
	*res = lead ;

	/* Only C2..F4 can start a multi-byte sequence */
	if( lead < 0xC2 || lead > 0xF4)
		return 1 ;

	/* Length and payload bits carried by the lead byte */
	if( lead < 0xE0) {			/* 110xxxxx */
		size = 2 ;
		code = lead & 0x1F ;
	} else if( lead < 0xF0) {	/* 1110xxxx */
		size = 3 ;
		code = lead & 0x0F ;
	} else {					/* 11110xxx */
		size = 4 ;
		code = lead & 0x07 ;
	}

	/* Sequence must fit in what is left of the line */
	if( size > len - index)
		return 1 ;

	/* Each following byte must be 10xxxxxx and brings 6 more bits */
	for( k = 1 ; k < size ; k++) {
		unsigned trail = seq[ k] & 0xFFU ;

		if( (trail & 0xC0) != 0x80)
			return 1 ;

		code = (code << 6) | (trail & 0x3F) ;
	}

	if( code > 0x10FFFF)	/* Avoid 110000 - 13FFFF */
		return 1 ;

	*res = code ;
	return size ;
}

/*
 * unicode_to_utf8()
 *
 * Write the shortest UTF-8 sequence encoding c at utf8 and return its
 * length in bytes (1 to 4).  Nothing is written past that length.
 *
 * c is asserted not to exceed 0x10FFFF.  When assertions are compiled out
 * (NDEBUG), a larger value is assumed to result from sign extension and
 * only its low 8 bits are encoded.
 *
 * NOTE! No other check is made on c, the "invalid" unicode values are
 * encoded like any other.  Also, decoding bytes with utf8_to_unicode() and
 * encoding the result here does *not* guarantee the same bytes, since
 * utf8_to_unicode() also hands back bytes it could not decode as values of
 * their own while this always generates the shortest sequence.
 */
unsigned unicode_to_utf8( unicode_t c, char *utf8) {
	assert( c <= 0x10FFFF) ;

#ifdef NDEBUG
	if( c > 0x10FFFF)	/* Let's assume this is due to sign extension */
		c &= 0xFF ;
#endif

	if( c <= 0x7F) {			/* 0xxxxxxx */
		utf8[ 0] = (char) c ;
		return 1 ;
	}

	if( c <= 0x7FF) {			/* 110xxxxx 10xxxxxx */
		utf8[ 0] = (char) (0xC0 | (c >> 6)) ;
		utf8[ 1] = (char) (0x80 | (c & 0x3F)) ;
		return 2 ;
	}

	if( c <= 0xFFFF) {			/* 1110xxxx 10xxxxxx 10xxxxxx */
		utf8[ 0] = (char) (0xE0 | (c >> 12)) ;
		utf8[ 1] = (char) (0x80 | ((c >> 6) & 0x3F)) ;
		utf8[ 2] = (char) (0x80 | (c & 0x3F)) ;
		return 3 ;
	}

	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
	utf8[ 0] = (char) (0xF0 | (c >> 18)) ;
	utf8[ 1] = (char) (0x80 | ((c >> 12) & 0x3F)) ;
	utf8[ 2] = (char) (0x80 | ((c >> 6) & 0x3F)) ;
	utf8[ 3] = (char) (0x80 | (c & 0x3F)) ;
	return 4 ;
}

/*
 * utf8_revdelta()
 *
 * p points at a byte of a buffer and pos tells how many bytes may be read
 * before it.  When that byte ends a 2, 3 or 4 bytes UTF-8 sequence, return
 * the distance back to the lead byte of the sequence (1, 2 or 3).
 * Return 0 otherwise, including when
 *  - *p is not a continuation byte (10xxxxxx),
 *  - no lead byte is found where expected,
 *  - the lead byte would sit further back than pos allows.
 *
 * Bytes before p are never read beyond pos, so pos == 0 is safe and
 * yields 0.
 *
 * NOTE! Lead bytes are recognised by their high bits only (110xxxxx,
 * 1110xxxx, 11110xxx), the decoded value is not examined.
 */
unsigned utf8_revdelta( unsigned char *p, unsigned pos) {
	/* Nothing to step back over, or nothing readable before p */
	if( pos == 0 || (p[ 0] & 0xC0) != 0x80)
		return 0 ;

	if( (p[ -1] & 0xE0) == 0xC0)	/* valid 2 bytes unicode seq */
		return 1 ;

	/* Going further needs a second continuation byte and room for it */
	if( pos < 2 || (p[ -1] & 0xC0) != 0x80)
		return 0 ;

	if( (p[ -2] & 0xF0) == 0xE0)	/* valid 3 bytes unicode seq */
		return 2 ;

	/* Same again for the third continuation byte */
	if( pos < 3 || (p[ -2] & 0xC0) != 0x80)
		return 0 ;

	if( (p[ -3] & 0xF8) == 0xF0)	/* valid 4 bytes unicode seq */
		return 3 ;

	return 0 ;
}


/* end of utf8.c */
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00			`/* utf8.c -- implements utf8.h, converts between unicode and UTF-8 */`

Handle wide character display based on wcwidth implementation (UTF-16 ready). 2019-11-05 22:24:18 -05:00			`#define _XOPEN_SOURCE /* wcwidth in wchar.h */`

Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00			`#include "utf8.h"`

Assert that unicode are limited to 0-10FFFF. 2015-02-08 01:26:07 -05:00			`#include <assert.h>`
Handle wide character display based on wcwidth implementation (UTF-16 ready). 2019-11-05 22:24:18 -05:00			`#include <wchar.h>`

			`/*`
			`* Display width of UTF-8 character`
			`*/`
Clean up handling of unicode character width (non printable are displayed as \u) and insure modeline displays filename including double and zero width characters. 2020-06-24 04:38:03 -04:00			`int utf8_width( unicode_t c) {`
Merge NetBSD adaptation. 2020-06-16 02:09:44 -04:00			`#if CYGWIN`
Clean up handling of unicode character width (non printable are displayed as \u) and insure modeline displays filename including double and zero width characters. 2020-06-24 04:38:03 -04:00			`assert( sizeof( wchar_t) == 2) ; /* wcwidth only supports UTF-16 */`
			`return (c < 0x10000) ? wcwidth( (wchar_t) c) : -1 ;`
Cater to wcwidth implementation difference (Cygwin: UTF-16, Linux: UTF-32). 2019-11-06 00:31:05 -05:00			`#else`
Clean up handling of unicode character width (non printable are displayed as \u) and insure modeline displays filename including double and zero width characters. 2020-06-24 04:38:03 -04:00			`return wcwidth( (wchar_t) c) ;`
Cater to wcwidth implementation difference (Cygwin: UTF-16, Linux: UTF-32). 2019-11-06 00:31:05 -05:00			`#endif`
Handle wide character display based on wcwidth implementation (UTF-16 ready). 2019-11-05 22:24:18 -05:00			`}`
Assert that unicode are limited to 0-10FFFF. 2015-02-08 01:26:07 -05:00
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00			`/*`
			`* utf8_to_unicode()`
			`*`
			`* Convert a UTF-8 sequence to its unicode value, and return the length of`
			`* the sequence in bytes.`
			`*`
			`* NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can`
			`* either use it as-is (ie as Latin1) or you can check for invalid UTF-8`
			`* by checking for a length of 1 and a result > 127.`
			`*`
			`* NOTE 2! This does not verify things like minimality. So overlong forms`
			`* are happily accepted and decoded, as are the various "invalid values".`
			`*/`
Display UTF-8 on the modeline [buffer name, file name]. 2019-08-12 21:14:08 -04:00			`unsigned utf8_to_unicode( const char *line, unsigned index, unsigned len,`
&left and &mid handling of UTF-8 encoded characters. 2017-05-16 00:13:12 -04:00			`unicode_t *res) {`
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`unicode_t value ;`
Correct column position when displaying double width unicode character (assumed in range \u3000-\u3FFF). 2017-05-07 02:05:47 -04:00			`unsigned c ;`
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00			`unsigned bytes, mask, i;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00
Correct column position when displaying double width unicode character (assumed in range \u3000-\u3FFF). 2017-05-07 02:05:47 -04:00			`if( index >= len)`
			`return 0 ;`
&left and &mid handling of UTF-8 encoded characters. 2017-05-16 00:13:12 -04:00
Correct column position when displaying double width unicode character (assumed in range \u3000-\u3FFF). 2017-05-07 02:05:47 -04:00			`*res = c = line[ index] & 0xFFU ;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`/*`
			`* 0xxxxxxx is valid one byte utf8`
			`* 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1`
			`* 1100000x is start of overlong encoding sequence`
			`* Sequence longer than 4 bytes are invalid`
			`* Last valid code is 0x10FFFF, encoding start with 0xF4`
			`*/`
			`if( c <= 0xC1 \|\| c > 0xF4)`
			`return 1;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`/* Ok, it's 11xxxxxx, do a stupid decode */`
			`mask = 0x20;`
			`bytes = 2;`
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00			`while( (c & mask) != 0) {`
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`bytes++;`
			`mask >>= 1;`
			`}`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00			`/* bytes is in range [2..4] as c was in range [C2..F4] */`
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`len -= index;`
			`if (bytes > len)`
			`return 1;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`value = c & (mask-1);`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`/* Ok, do the bytes */`
			`line += index;`
			`for (i = 1; i < bytes; i++) {`
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00			`c = line[i] & 0xFFU ;`
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`if ((c & 0xc0) != 0x80)`
			`return 1;`
			`value = (value << 6) \| (c & 0x3f);`
			`}`
Review cursor movement in presence of mixed latin1/unicode encoding. 2015-02-03 23:37:57 -05:00
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */`
			`return 1 ;`

			`*res = value;`
			`return bytes;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00			`}`

			`/*`
			`* unicode_to_utf8()`
			`*`
			`* Convert a unicode value to its canonical utf-8 sequence.`
			`*`
			`* NOTE! This does not check for - or care about - the "invalid" unicode`
			`* values. Also, converting a utf-8 sequence to unicode and back does`
			`* not guarantee the same sequence, since this generates the shortest`
			`* possible sequence, while utf8_to_unicode() accepts both Latin1 and`
			`* overlong utf-8 sequences.`
			`*/`
Assert that unicode are limited to 0-10FFFF. 2015-02-08 01:26:07 -05:00			`unsigned unicode_to_utf8( unicode_t c, char *utf8) {`
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00			`unsigned bytes = 1 ;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`assert( c <= 0x10FFFF) ;`
Version 4.2 Consistent display of µEMACS as program name among - ue --version - on status bar - insert-string $progname - write-message $progname (FIX). 2015-02-12 00:15:45 -05:00
			`#ifdef NDEBUG`
			`if( c > 0x10FFFF) /* Let's assume this is due to sign extension */`
			`c &= 0xFF ;`
			`#endif`

Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00			`if( c <= 0x7f)`
			`*utf8 = (char) c ;`
			`else {`
			`unsigned prefix = 0x40 ;`
			`char *p = utf8 ;`
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`do {`
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00			`*p++ = (char) (0x80 + (c & 0x3f)) ;`
			`bytes++ ;`
			`prefix >>= 1 ;`
			`c >>= 6 ;`
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`} while( c >= prefix) ;`
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00
Consistent unicode handling in buffer-position (CTL-X =) and $curchar. 2017-05-04 22:49:02 -04:00			`*p-- = *utf8 ;`
			`*utf8++ = (char) (c - 2 * prefix) ;`
			`if( utf8 < p) { /* swap middle two bytes if 4 bytes utf-8 code */`
			`char c = *p ;`
			`*p = *utf8 ;`
			`*utf8 = c ;`
			`}`
Revise sanity check on UTF-8 keyboard input sequence. 2015-02-10 05:09:59 -05:00			`}`
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00
			`return bytes ;`
Split up the utf8 helper functions into a file of their own Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2012-07-10 19:21:35 -04:00			`}`
Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00
Improve support of utf8 in filename completion and key in of command arguments. 2019-08-12 09:41:51 -04:00			`unsigned utf8_revdelta( unsigned char *p, unsigned pos) {`
			`unsigned delta = 0 ;`

			`if( (*p & 0xC0) == 0x80) {`
			`unsigned char c ;`

			`c = *--p ;`
			`if( (c & 0xE0) == 0xC0) /* valid 2 bytes unicode seq */`
			`delta = 1 ;`
			`else if( ((c & 0xC0) == 0x80) && (pos > 1)) {`
			`c = *--p ;`
			`if( (c & 0xF0) == 0xE0) /* valid 3 bytes unicode seq */`
			`delta = 2 ;`
			`else if( ((c & 0xC0) == 0x80) && (pos > 2))`
			`if( (p[ -1] & 0xF8) == 0xF0) /* valid 4 bytes unicode seq */`
			`delta = 3 ;`
			`}`
			`}`

			`return delta ;`
			`}`

Clean up splint warnings: - Inconsistencies between defines.h and estruct.h. - Review scope of termio local variables. - Type mismatch in utf8. 2015-02-16 00:41:59 -05:00
			`/* end of utf8.c */`