134 lines
3.7 KiB
C++
134 lines
3.7 KiB
C++
|
|
||
|
///////////////////////////////////////////////////////////////////////
|
||
|
// Simple conversion routines between UTF-8 byte sequences and full
// Unicode code points (held in an int).
|
||
|
//
|
||
|
|
||
|
// Only when needed
|
||
|
#if !defined(SQPLUS_USE_LATIN1) || SQPLUS_USE_LATIN1==0
|
||
|
|
||
|
// Lookup table indexed by byte value: the length (1..6) of the UTF-8
// sequence that the byte starts, or -1 if the byte cannot start a sequence.
// Declared explicitly signed: plain char is unsigned on some ABIs, where a
// stored -1 would read back as 255 and defeat the callers' "< 1" checks.
static signed char g_utf8_length[256];

// Non-zero once g_utf8_length has been filled in.
static int g_did_init_length;

// Fill in g_utf8_length for all 256 possible byte values and mark the
// table as initialized.
void sqplus_init_utf8_lengths() {
    for( int lb=0; lb<256; lb++ ){
        int l = -1;                          // 10xxxxxx, 0xFE, 0xFF: not a lead byte
        if( !(lb&0x80) ) l=1;                // 0xxxxxxx
        else if( (lb&0xE0)==0xC0 ) l=2;      // 110xxxxx
        else if( (lb&0xF0)==0xE0 ) l=3;      // 1110xxxx
        else if( (lb&0xF8)==0xF0 ) l=4;      // 11110xxx
        else if( (lb&0xFC)==0xF8 ) l=5;      // 111110xx
        else if( (lb&0xFE)==0xFC ) l=6;      // 1111110x
        g_utf8_length[lb] = (signed char)l;
    }
    g_did_init_length = 1;
}
|
||
|
|
||
|
// Length of a UTF8 encoded Unicode character
|
||
|
int sqplus_utf8_len(int lead_byte){
|
||
|
if( !(lead_byte&0x80) ) return 1; // Special case, make faster
|
||
|
if( !g_did_init_length )
|
||
|
sqplus_init_utf8_lengths();
|
||
|
|
||
|
return g_utf8_length[(unsigned char)lead_byte];
|
||
|
}
|
||
|
|
||
|
int sqplus_utf8_len_first(const char* pc){
|
||
|
int lb = *(unsigned char*)pc;
|
||
|
if( !(lb&0x80) ) return 1; // Special case, make faster
|
||
|
if( !g_did_init_length )
|
||
|
sqplus_init_utf8_lengths();
|
||
|
|
||
|
int l = g_utf8_length[(unsigned char)lb];
|
||
|
if( l>0 ) return l;
|
||
|
|
||
|
// Invalid UTF8 lead byte. Look for next valid character.
|
||
|
const char *pc1 = pc+1;
|
||
|
while( ((*pc1)&0xC0)==0x80 )
|
||
|
pc1++;
|
||
|
return int(pc1 - pc);
|
||
|
}
|
||
|
|
||
|
|
||
|
// Length of a UTF8 encoded Unicode string (number of Unicode characters)
|
||
|
int sqplus_utf8_strlen(const char *str) {
|
||
|
if( !str ) return 0;
|
||
|
int l, tl=0;
|
||
|
while( *str ){
|
||
|
l = sqplus_utf8_len_first(str);
|
||
|
str += l;
|
||
|
tl++;
|
||
|
}
|
||
|
return tl;
|
||
|
}
|
||
|
|
||
|
// Convert one UTF8 encoded character to Unicode point
|
||
|
int sqplus_utf8_to_wchar(int *result, const char *string){
|
||
|
int res=-1;
|
||
|
|
||
|
// Assume argument pointers to be OK
|
||
|
unsigned char ch = *string;
|
||
|
int l = sqplus_utf8_len(ch);
|
||
|
|
||
|
if( l<1 ) return -1;
|
||
|
int wc = l>1 ? (ch&(0x7F>>l)) : ch;
|
||
|
while( l>1 ){
|
||
|
wc = (wc<<6) + (*++string & 0x3F);
|
||
|
l--;
|
||
|
}
|
||
|
*result = wc;
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
// Convert one Unicode point to its UTF8 encoded version.
//   s:    output buffer; receives 1..6 bytes, no NUL terminator is written.
//   wc:   code point, 0 .. 0x7FFFFFFF (31 bits).
//   size: capacity of s in bytes. The check is coarse: 1 byte is enough
//         for wc < 0x80, 4 bytes are required for anything up to 0x1FFFFF,
//         and 6 bytes for larger values.
// Returns the number of bytes written, or a negative value on error:
//   -1  size < 1
//   -2  wc is negative / does not fit in 31 bits
//   -3  buffer too small for this code point (see size above)
// Nothing is written to s when an error is returned.
int sqplus_wchar_to_utf8(char *s, int wc, int size){
    if( size<1 ) return -1;
    if( (unsigned int)wc>=0x80000000u ) return -2;

    // Single byte case
    if( wc<0x80 ){
        *s = (char)wc;
        return 1;
    }
    if( size<4 ) return -3;

    // Two or more UTF8 bytes. p is the index of the last UTF8 byte, i.e.
    // the number of continuation bytes.
    int p;
    if( wc<=0x7FF )          p = 1;   // 11 bits -> 2 bytes
    else if( wc<=0xFFFF )    p = 2;   // 16 bits -> 3 bytes
    else if( wc<=0x1FFFFF )  p = 3;   // 21 bits -> 4 bytes
    else if( wc<=0x3FFFFFF ) p = 4;   // 26 bits -> 5 bytes
    else                     p = 5;   // 31 bits -> 6 bytes (upper bound
                                      // already enforced by the -2 check)

    // Five and six byte forms need the larger buffer
    if( p>=4 && size<6 ) return -3;

    // Continuation bytes, last byte first: 6 payload bits each
    for( int i=p; i>0; i-- ){
        s[i] = (char)(unsigned char)(0x80 | ((wc >> ((p-i)*6)) & 0x3F));
    }

    // Lead byte: (p+1) high marker bits followed by the remaining top bits
    s[0] = (char)(unsigned char)(((0xFC << (5-p)) | (wc >> (p*6))) & 0xFF);

    return p+1;
}
|
||
|
|
||
|
#endif // #if !defined(SQPLUS_USE_LATIN1) || SQPLUS_USE_LATIN1==0
|
||
|
|