2021-08-15 23:05:24 -04:00
|
|
|
/* utf8.c -- implements utf8.h, conversion between unicode and UTF-8 */
|
2012-07-10 19:21:35 -04:00
|
|
|
#include "utf8.h"
|
|
|
|
|
2015-02-08 01:26:07 -05:00
|
|
|
#include <assert.h>
|
2021-08-17 21:37:47 -04:00
|
|
|
#include <wchar.h> /* either _XOPEN_SOURCE or _GNU_SOURCE */
|
2019-11-05 22:24:18 -05:00
|
|
|
|
2021-08-14 21:41:35 -04:00
|
|
|
|
|
|
|
/* Display width of UTF-8 character */
|
|
|
|
int _utf8_width( unicode_t c) {
|
2021-08-17 21:37:47 -04:00
|
|
|
#if __SIZEOF_WCHAR_T__ == 2 /* wcwidth only supports UTF-16 */
|
2021-08-15 23:05:24 -04:00
|
|
|
return (c < 0x10000) ? wcwidth( (wchar_t) c) : -1 ;
|
2019-11-06 00:31:05 -05:00
|
|
|
#else
|
2021-08-15 23:05:24 -04:00
|
|
|
return wcwidth( (wchar_t) c) ;
|
2019-11-06 00:31:05 -05:00
|
|
|
#endif
|
2019-11-05 22:24:18 -05:00
|
|
|
}
|
2015-02-08 01:26:07 -05:00
|
|
|
|
2021-08-14 21:41:35 -04:00
|
|
|
|
|
|
|
/* utf8_width()
 *
 * Number of columns to reserve for displaying c: the width computed by
 * _utf8_width() when it is not negative, 2 otherwise.
 */
int utf8_width( unicode_t c) {
	int width = _utf8_width( c) ;
	if( width < 0)
		width = 2 ;	/* display \u if can't figure out width */

	return width ;
}
|
|
|
|
|
|
|
|
|
|
|
|
/* utf8_to_unicode()
|
2012-07-10 19:21:35 -04:00
|
|
|
*
|
|
|
|
* Convert a UTF-8 sequence to its unicode value, and return the length of
|
|
|
|
* the sequence in bytes.
|
|
|
|
*
|
|
|
|
* NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can
|
|
|
|
* either use it as-is (ie as Latin1) or you can check for invalid UTF-8
|
|
|
|
* by checking for a length of 1 and a result > 127.
|
|
|
|
*
|
|
|
|
* NOTE 2! This does *not* verify things like minimality. So overlong forms
|
|
|
|
* are happily accepted and decoded, as are the various "invalid values".
|
|
|
|
*/
|
2019-08-12 21:14:08 -04:00
|
|
|
unsigned utf8_to_unicode( const char *line, unsigned index, unsigned len,
|
2021-08-15 23:05:24 -04:00
|
|
|
unicode_t *res) {
|
|
|
|
assert( index < len) ;
|
|
|
|
unsigned c = *res = (unsigned char) line[ index] ;
|
2012-07-10 19:21:35 -04:00
|
|
|
|
2021-08-15 23:05:24 -04:00
|
|
|
/* 0xxxxxxx is valid one byte utf8
|
2015-02-10 05:09:59 -05:00
|
|
|
* 10xxxxxx is invalid UTF-8 start byte, we assume it is Latin1
|
|
|
|
* 1100000x is start of overlong encoding sequence
|
|
|
|
* Sequence longer than 4 bytes are invalid
|
|
|
|
* Last valid code is 0x10FFFF, encoding start with 0xF4
|
|
|
|
*/
|
|
|
|
if( c <= 0xC1 || c > 0xF4)
|
2021-08-15 23:05:24 -04:00
|
|
|
return 1 ;
|
2012-07-10 19:21:35 -04:00
|
|
|
|
2015-02-10 05:09:59 -05:00
|
|
|
/* Ok, it's 11xxxxxx, do a stupid decode */
|
2021-08-15 23:05:24 -04:00
|
|
|
unsigned mask = 0x20 ;
|
|
|
|
unsigned bytes = 2 ;
|
2015-02-16 00:41:59 -05:00
|
|
|
while( (c & mask) != 0) {
|
2021-08-15 23:05:24 -04:00
|
|
|
bytes++ ;
|
|
|
|
mask >>= 1 ;
|
2015-02-10 05:09:59 -05:00
|
|
|
}
|
2012-07-10 19:21:35 -04:00
|
|
|
|
2021-08-15 23:05:24 -04:00
|
|
|
/* bytes is in range [2..4] as c was in range [C2..F4] */
|
|
|
|
len -= index ;
|
|
|
|
if( bytes > len)
|
|
|
|
return 1 ;
|
2012-07-10 19:21:35 -04:00
|
|
|
|
2021-08-15 23:05:24 -04:00
|
|
|
unicode_t value = c & (mask - 1) ;
|
2012-07-10 19:21:35 -04:00
|
|
|
|
2015-02-10 05:09:59 -05:00
|
|
|
/* Ok, do the bytes */
|
2021-08-15 23:05:24 -04:00
|
|
|
line += index ;
|
|
|
|
for( unsigned i = 2 ; i <= bytes ; i++) {
|
|
|
|
c = (unsigned char) *++line ;
|
|
|
|
if( (c & 0xc0) != 0x80)
|
|
|
|
return 1 ;
|
|
|
|
|
|
|
|
value = (value << 6) | (c & 0x3f) ;
|
2015-02-10 05:09:59 -05:00
|
|
|
}
|
2015-02-03 23:37:57 -05:00
|
|
|
|
2015-02-10 05:09:59 -05:00
|
|
|
if( value > 0x10FFFF) /* Avoid 110000 - 13FFFF */
|
|
|
|
return 1 ;
|
|
|
|
|
2021-08-15 23:05:24 -04:00
|
|
|
*res = value ;
|
|
|
|
return bytes ;
|
2012-07-10 19:21:35 -04:00
|
|
|
}
|
|
|
|
|
2021-08-14 21:41:35 -04:00
|
|
|
|
|
|
|
/* unicode_to_utf8()
|
2012-07-10 19:21:35 -04:00
|
|
|
*
|
|
|
|
* Convert a unicode value to its canonical utf-8 sequence.
|
|
|
|
*
|
|
|
|
* NOTE! This does not check for - or care about - the "invalid" unicode
|
|
|
|
* values. Also, converting a utf-8 sequence to unicode and back does
|
|
|
|
* *not* guarantee the same sequence, since this generates the shortest
|
|
|
|
* possible sequence, while utf8_to_unicode() accepts both Latin1 and
|
|
|
|
* overlong utf-8 sequences.
|
|
|
|
*/
|
2015-02-08 01:26:07 -05:00
|
|
|
unsigned unicode_to_utf8( unicode_t c, char *utf8) {
|
2015-02-16 00:41:59 -05:00
|
|
|
unsigned bytes = 1 ;
|
2012-07-10 19:21:35 -04:00
|
|
|
|
2015-02-10 05:09:59 -05:00
|
|
|
assert( c <= 0x10FFFF) ;
|
2015-02-12 00:15:45 -05:00
|
|
|
|
|
|
|
#ifdef NDEBUG
|
2021-08-15 23:05:24 -04:00
|
|
|
if( c > 0x10FFFF) /* Let's assume this is due to sign extension */
|
|
|
|
c &= 0xFF ;
|
2015-02-12 00:15:45 -05:00
|
|
|
#endif
|
|
|
|
|
2015-02-16 00:41:59 -05:00
|
|
|
if( c <= 0x7f)
|
2021-08-15 23:05:24 -04:00
|
|
|
*utf8 = (char) c ;
|
2015-02-16 00:41:59 -05:00
|
|
|
else {
|
|
|
|
unsigned prefix = 0x40 ;
|
|
|
|
char *p = utf8 ;
|
2015-02-10 05:09:59 -05:00
|
|
|
do {
|
2015-02-16 00:41:59 -05:00
|
|
|
*p++ = (char) (0x80 + (c & 0x3f)) ;
|
|
|
|
bytes++ ;
|
|
|
|
prefix >>= 1 ;
|
|
|
|
c >>= 6 ;
|
2015-02-10 05:09:59 -05:00
|
|
|
} while( c >= prefix) ;
|
2015-02-16 00:41:59 -05:00
|
|
|
|
2021-08-15 23:05:24 -04:00
|
|
|
*p-- = *utf8 ;
|
|
|
|
*utf8++ = (char) (c - 2 * prefix) ;
|
|
|
|
if( utf8 < p) { /* swap middle two bytes if 4 bytes utf-8 code */
|
|
|
|
char c = *p ;
|
|
|
|
*p = *utf8 ;
|
|
|
|
*utf8 = c ;
|
|
|
|
}
|
2015-02-10 05:09:59 -05:00
|
|
|
}
|
2015-02-16 00:41:59 -05:00
|
|
|
|
|
|
|
return bytes ;
|
2012-07-10 19:21:35 -04:00
|
|
|
}
|
2015-02-16 00:41:59 -05:00
|
|
|
|
2019-08-12 09:41:51 -04:00
|
|
|
/* utf8_revdelta()
 *
 * Given p pointing at the last byte of a character, return how many bytes
 * to step back to reach the start byte of that character: 1, 2 or 3 when p
 * ends a well formed 2, 3 or 4 bytes sequence, 0 otherwise (single byte or
 * not a recognised sequence).
 *
 *	p	points at the byte to examine
 *	pos	number of bytes that can be read before p; from the guards
 *		below this looks like the offset of p in its buffer -
 *		confirm against callers
 *
 * No byte before p - pos is ever read. In particular nothing is read
 * before p when pos is 0.
 */
unsigned utf8_revdelta( unsigned char *p, unsigned pos) {
	/* Only a continuation byte 10xxxxxx can end a multi-byte sequence,
	 * and there must be at least one byte before it.
	 */
	if( (*p & 0xC0) != 0x80 || pos == 0)
		return 0 ;

	unsigned char c = p[ -1] ;
	if( (c & 0xE0) == 0xC0)		/* valid 2 bytes unicode seq */
		return 1 ;

	/* Going further back needs another continuation byte and room */
	if( (c & 0xC0) != 0x80 || pos < 2)
		return 0 ;

	c = p[ -2] ;
	if( (c & 0xF0) == 0xE0)		/* valid 3 bytes unicode seq */
		return 2 ;

	if( (c & 0xC0) == 0x80 && pos > 2
	&&  (p[ -3] & 0xF8) == 0xF0)	/* valid 4 bytes unicode seq */
		return 3 ;

	return 0 ;
}
|
|
|
|
|
2015-02-16 00:41:59 -05:00
|
|
|
|
|
|
|
/* end of utf8.c */
|