/* utf8.c -- implements utf8.h, conversion between unicode and UTF-8 */
#include "utf8.h"

#define _XOPEN_SOURCE   /* wcwidth in wchar.h */
#include <assert.h>
#include <wchar.h>

/* Display width of UTF-8 character */
int _utf8_width( unicode_t c) {
#if CYGWIN
    assert( sizeof( wchar_t) == 2) ;    /* wcwidth only supports UTF-16 */
    return (c < 0x10000) ? wcwidth( (wchar_t) c) : -1 ;
#else
    return wcwidth( (wchar_t) c) ;
#endif
}

int utf8_width( unicode_t c) {
    int w = _utf8_width( c) ;
    return (w < 0) ? 2 : w ;    /* display as \u sequence if the width is unknown */
}

/* utf8_to_unicode()
 *
 * Convert a UTF-8 sequence to its unicode value, and return the length of
 * the sequence in bytes.
 *
 * NOTE! Invalid UTF-8 will be converted to a one-byte sequence, so you can
 * either use it as-is (ie as Latin1) or you can check for invalid UTF-8
 * by checking for a length of 1 and a result > 127.
 *
 * NOTE 2! This does *not* verify things like minimality.  So overlong forms
 * are happily accepted and decoded, as are the various "invalid values".
 */
unsigned utf8_to_unicode( const char *line, unsigned index, unsigned len,
                                                        unicode_t *res) {
    assert( index < len) ;
    unsigned c = *res = (unsigned char) line[ index] ;

/*  0xxxxxxx is a valid one byte utf8 sequence
    10xxxxxx is an invalid UTF-8 start byte, we assume it is Latin1
    1100000x is the start of an overlong encoding
    Sequences longer than 4 bytes are invalid
    The last valid code point is 0x10FFFF, whose encoding starts with 0xF4
*/
    if( c <= 0xC1 || c > 0xF4)
        return 1 ;

/* Ok, it's 11xxxxxx, do a stupid decode */
    unsigned mask = 0x20 ;
    unsigned bytes = 2 ;
    while( (c & mask) != 0) {
        bytes++ ;
        mask >>= 1 ;
    }

/* bytes is in range [2..4] as c was in range [C2..F4] */
    len -= index ;
    if( bytes > len)
        return 1 ;

    unicode_t value = c & (mask - 1) ;

/* Ok, do the bytes */
    line += index ;
    for( unsigned i = 2 ; i <= bytes ; i++) {
        c = (unsigned char) *++line ;
        if( (c & 0xC0) != 0x80)
            return 1 ;

        value = (value << 6) | (c & 0x3F) ;
    }

    if( value > 0x10FFFF)   /* reject 0x110000..0x13FFFF (0xF4 lead byte) */
        return 1 ;

    *res = value ;
    return bytes ;
}

/* unicode_to_utf8()
 *
 * Convert a unicode value to its canonical utf-8 sequence.
 *
 * NOTE! This does not check for - or care about - the "invalid" unicode
 * values.  Also, converting a utf-8 sequence to unicode and back does
 * *not* guarantee the same sequence, since this generates the shortest
 * possible sequence, while utf8_to_unicode() accepts both Latin1 and
 * overlong utf-8 sequences.
 */
unsigned unicode_to_utf8( unicode_t c, char *utf8) {
    unsigned bytes = 1 ;

    assert( c <= 0x10FFFF) ;
#ifdef NDEBUG   /* assert() is disabled, sanitize out-of-range input */
    if( c > 0x10FFFF)   /* Let's assume this is due to sign extension */
        c &= 0xFF ;
#endif
    if( c <= 0x7F)
        *utf8 = (char) c ;
    else {
        unsigned prefix = 0x40 ;
        char *p = utf8 ;

    /* emit the continuation bytes, least significant 6 bits first */
        do {
            *p++ = (char) (0x80 + (c & 0x3F)) ;
            bytes++ ;
            prefix >>= 1 ;
            c >>= 6 ;
        } while( c >= prefix) ;

    /* move the first continuation byte to the end and write the lead
       byte: c < prefix here, so (c - 2 * prefix) wraps around to the
       proper tag (0xC0 | c for 2 bytes, 0xE0 | c for 3, 0xF0 | c for 4) */
        *p-- = *utf8 ;
        *utf8++ = (char) (c - 2 * prefix) ;
        if( utf8 < p) { /* swap middle two bytes if 4 bytes utf-8 code */
            char c = *p ;
            *p = *utf8 ;
            *utf8 = c ;
        }
    }

    return bytes ;
}

/* utf8_revdelta()
 *
 * Return how many bytes to step back from position pos, which points at a
 * UTF-8 continuation byte, to reach the lead byte of a valid multi-byte
 * sequence; return 0 if the preceding bytes do not form one.
 */
unsigned utf8_revdelta( unsigned char *p, unsigned pos) {
    unsigned delta = 0 ;

    if( (*p & 0xC0) == 0x80) {  /* continuation byte, scan backward */
        unsigned char c = *--p ;
        if( (c & 0xE0) == 0xC0) /* valid 2 bytes unicode seq */
            delta = 1 ;
        else if( ((c & 0xC0) == 0x80) && (pos > 1)) {
            c = *--p ;
            if( (c & 0xF0) == 0xE0) /* valid 3 bytes unicode seq */
                delta = 2 ;
            else if( ((c & 0xC0) == 0x80) && (pos > 2))
                if( (p[ -1] & 0xF8) == 0xF0)    /* valid 4 bytes unicode seq */
                    delta = 3 ;
        }
    }

    return delta ;
}

/* end of utf8.c */
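
/* Appended usage sketch (not part of the original file): a minimal demo of
 * the routines above.  The UTF8_DEMO guard and the sample code points are
 * illustrative choices, not part of the utf8.h interface; build this
 * translation unit with -DUTF8_DEMO to get a main().
 */
#ifdef UTF8_DEMO
#include <locale.h>
#include <stdio.h>

int main( void) {
/* wcwidth(), hence utf8_width(), is locale dependent */
    setlocale( LC_CTYPE, "") ;

/* Decode: U+00E9 is the two-byte sequence C3 A9 */
    const char *s = "\xC3\xA9" ;
    unicode_t code ;
    unsigned n = utf8_to_unicode( s, 0, 2, &code) ;
    printf( "decoded U+%04X from %u byte(s), width %d\n",
                (unsigned) code, n, utf8_width( code)) ;   /* width 1 in a UTF-8 locale */

/* Encode: U+1F600 takes four bytes: F0 9F 98 80 */
    char buf[ 4] ;
    n = unicode_to_utf8( 0x1F600, buf) ;
    printf( "encoded %u byte(s):", n) ;
    for( unsigned i = 0 ; i < n ; i++)
        printf( " %02X", (unsigned char) buf[ i]) ;
    printf( "\n") ;

/* Cursor motion: from the last continuation byte of that four byte
   sequence, utf8_revdelta() says to step back 3 to reach the lead byte */
    unsigned char ln[] = { 'a', 0xF0, 0x9F, 0x98, 0x80 } ;
    printf( "revdelta = %u\n", utf8_revdelta( &ln[ 4], 4)) ;    /* 3 */
    return 0 ;
}
#endif /* UTF8_DEMO */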