cuberite-2a/squirrel_3_0_1_stable/sqplus/SqPlusUtf8.cpp


///////////////////////////////////////////////////////////////////////
// Simple conversion routines to and from UTF8 for full Unicode characters
// (each character held in an int).
// 

// Only when needed 
#if !defined(SQPLUS_USE_LATIN1) || SQPLUS_USE_LATIN1==0

// Lookup table indexed by lead byte: the number of bytes in the UTF8
// sequence started by that byte (1..6), or -1 when the byte cannot start
// a sequence. Declared 'signed char' explicitly so the -1 marker stays
// negative on platforms where plain 'char' is unsigned.
static signed char g_utf8_length[256];
// Non-zero once g_utf8_length has been filled in.
static int g_did_init_length;

// Fills g_utf8_length for all 256 byte values and flags the table as ready.
void sqplus_init_utf8_lengths() {
    for( int lb=0; lb<256; lb++ ){
        // Default covers continuation bytes (10xxxxxx) as well as 0xFE/0xFF
        int l = -1;
        if( !(lb&0x80) ) l=1;               // 0xxxxxxx
        else if( (lb&0xE0)==0xC0 ) l=2;     // 110xxxxx
        else if( (lb&0xF0)==0xE0 ) l=3;     // 1110xxxx
        else if( (lb&0xF8)==0xF0 ) l=4;     // 11110xxx
        else if( (lb&0xFC)==0xF8 ) l=5;     // 111110xx
        else if( (lb&0xFE)==0xFC ) l=6;     // 1111110x
        g_utf8_length[lb] = (signed char)l;
    }
    g_did_init_length = 1;
}

// Byte length of the UTF8 sequence introduced by lead_byte.
// Bytes with bit 7 clear are answered directly with 1; every other byte is
// looked up in g_utf8_length (built on first use), whose entry is set to -1
// by sqplus_init_utf8_lengths for bytes that cannot start a sequence.
int sqplus_utf8_len(int lead_byte){
    // Fast path: plain 7-bit character, no table needed
    if( (lead_byte&0x80)==0 )
        return 1;

    // Build the lookup table the first time it is required
    if( !g_did_init_length )
        sqplus_init_utf8_lengths();

    const unsigned char index = (unsigned char)lead_byte;
    return g_utf8_length[index];
}

// Number of bytes occupied by the character that starts at pc.
// For a byte the table accepts as a lead byte this is the table length.
// Otherwise the byte at pc plus every directly following continuation
// byte (10xxxxxx) is counted, so the caller lands on the next byte that
// is not a continuation byte.
int sqplus_utf8_len_first(const char* pc){
    const unsigned char lead = *(const unsigned char*)pc;

    // Fast path: plain 7-bit character
    if( (lead&0x80)==0 )
        return 1;

    // Build the lookup table the first time it is required
    if( !g_did_init_length )
        sqplus_init_utf8_lengths();

    const int table_len = g_utf8_length[lead];
    if( table_len>0 )
        return table_len;

    // Not a usable lead byte: step over it and any continuation bytes
    const char *next = pc;
    do {
        next++;
    } while( ((*next)&0xC0)==0x80 );
    return (int)(next - pc);
}


// Length of a UTF8 encoded Unicode string (number of Unicode characters).
// A NULL pointer counts as an empty string. Each step consumes the number
// of bytes reported by sqplus_utf8_len_first, but never moves beyond the
// terminating NUL: a multi-byte sequence cut short at the end of the string
// counts as one character and ends the scan instead of reading past the
// buffer.
int sqplus_utf8_strlen(const char *str) {
    if( !str ) return 0;

    int count = 0;
    while( *str ){
        int l = sqplus_utf8_len_first(str);
        // *str is non-zero here, so the first byte can always be consumed;
        // the remaining bytes of the sequence only while they exist.
        do {
            str++;
        } while( --l>0 && *str );
        count++;
    }
    return count;
}

// Convert one UTF8 encoded character to a Unicode code point.
//   result - receives the decoded value on success; untouched on failure
//   string - points at the first byte of the encoded character
// Returns 0 on success. Returns -1 when the first byte is not accepted as
// a lead byte by sqplus_utf8_len, or when one of the following bytes is
// not a continuation byte (10xxxxxx). The latter check also stops decoding
// at a terminating NUL instead of reading beyond a truncated sequence.
// Both pointers are assumed to be valid.
int sqplus_utf8_to_wchar(int *result, const char *string){
    const unsigned char *bytes = (const unsigned char*)string;
    const unsigned char lead = bytes[0];

    int l = sqplus_utf8_len(lead);
    if( l<1 ) return -1;

    // The lead byte of an l-byte sequence carries its payload in the low
    // (7-l) bits; a single byte character is taken as is.
    int wc = l>1 ? (lead&(0x7F>>l)) : lead;

    // Every continuation byte contributes six more bits
    for( int i=1; i<l; i++ ){
        const unsigned char cb = bytes[i];
        if( (cb&0xC0)!=0x80 ) return -1;
        wc = (wc<<6) + (cb&0x3F);
    }
    *result = wc;

    return 0;
}

// Convert one Unicode code point to its UTF8 encoded version.
//   s    - output buffer (no terminating NUL is written)
//   wc   - value to encode, 0..0x7FFFFFFF
//   size - number of bytes available at s
// Returns the number of bytes written (1..6) or a negative error code:
//   -1  size is smaller than 1
//   -2  wc is negative (would need more than 31 bits)
//   -3  buffer too small: any multi-byte result requires size>=4, and
//       five or six byte results require size>=6
// The full 31 bit range is encodable in six bytes, so no further error
// codes are produced for large values.
int sqplus_wchar_to_utf8(char *s, int wc, int size){
    if( size<1 ) return -1;
    if( (unsigned int)wc>=0x80000000u ) return -2;

    // Single byte case
    if( wc<0x80 ){
        *s = (char)wc;
        return 1;
    }
    if( size<4 ) return -3;

    // p is the index of the last UTF8 byte (byte count minus one)
    int p;
    if( wc<=0x7FF ) p = 1;             // up to 11 bits
    else if( wc<=0xFFFF ) p = 2;       // up to 16 bits
    else if( wc<=0x1FFFFF ) p = 3;     // up to 21 bits
    else if( wc<=0x3FFFFFF ) p = 4;    // up to 26 bits
    else p = 5;                        // up to 31 bits
    if( p>=4 && size<6 ) return -3;

    // Continuation bytes, last one first, six payload bits each
    int rest = wc;
    for( int i=p; i>0; i-- ){
        s[i] = (char)(unsigned char)(0x80 | (rest&0x3F));
        rest >>= 6;
    }

    // Lead byte: p+1 leading one bits, a zero bit, then the remaining payload
    s[0] = (char)(unsigned char)(((0xFC << (5-p)) & 0xFF) | rest);

    return p+1;
}

#endif // #if !defined(SQPLUS_USE_LATIN1) || SQPLUS_USE_LATIN1==0
It's a Squirrel!! In SquirrelBindings.h use #define USE_SQUIRREL 1 to enable squirrel git-svn-id: http://mc-server.googlecode.com/svn/trunk@76 0a769ca7-a7f5-676a-18bf-c427514a06d6 2011-11-07 20:25:01 -05:00
			`///////////////////////////////////////////////////////////////////////`
			`// Simple conversion routines, to, from UTF8 and full Unicode character`
			`// (using int).`
			`//`

			`// Only when needed`
			`#if !defined(SQPLUS_USE_LATIN1) \|\| SQPLUS_USE_LATIN1==0`

			`static char g_utf8_length[256];`
			`static int g_did_init_length;`

			`void sqplus_init_utf8_lengths() {`
			`// Fill in lengths in array above`
			`for( int lb=0; lb<256; lb++ ){`
			`int l = -1;`
			`if( !(lb&0x80) ) l=1;`
			`else if( (lb&0xE0)==0xC0 ) l=2;`
			`else if( (lb&0xF0)==0xE0 ) l=3;`
			`else if( (lb&0xF8)==0xF0 ) l=4;`
			`else if( (lb&0xFC)==0xF8 ) l=5;`
			`else if( (lb&0xFE)==0xFC ) l=6;`
			`g_utf8_length[lb] = l;`
			`}`
			`g_did_init_length = 1;`
			`}`

			`// Length of a UTF8 encoded Unicode character`
			`int sqplus_utf8_len(int lead_byte){`
			`if( !(lead_byte&0x80) ) return 1; // Special case, make faster`
			`if( !g_did_init_length )`
			`sqplus_init_utf8_lengths();`

			`return g_utf8_length[(unsigned char)lead_byte];`
			`}`

			`int sqplus_utf8_len_first(const char* pc){`
			`int lb = *(unsigned char*)pc;`
			`if( !(lb&0x80) ) return 1; // Special case, make faster`
			`if( !g_did_init_length )`
			`sqplus_init_utf8_lengths();`

			`int l = g_utf8_length[(unsigned char)lb];`
			`if( l>0 ) return l;`

			`// Invalid UTF8 lead byte. Look for next valid character.`
			`const char *pc1 = pc+1;`
			`while( ((*pc1)&0xC0)==0x80 )`
			`pc1++;`
			`return int(pc1 - pc);`
			`}`


			`// Length of a UTF8 encoded Unicode string (number of Unicode characters)`
			`int sqplus_utf8_strlen(const char *str) {`
			`if( !str ) return 0;`
			`int l, tl=0;`
			`while( *str ){`
			`l = sqplus_utf8_len_first(str);`
			`str += l;`
			`tl++;`
			`}`
			`return tl;`
			`}`

			`// Convert one UTF8 encoded character to Unicode point`
			`int sqplus_utf8_to_wchar(int *result, const char *string){`
			`int res=-1;`

			`// Assume argument pointers to be OK`
			`unsigned char ch = *string;`
			`int l = sqplus_utf8_len(ch);`

			`if( l<1 ) return -1;`
			`int wc = l>1 ? (ch&(0x7F>>l)) : ch;`
			`while( l>1 ){`
			`wc = (wc<<6) + (*++string & 0x3F);`
			`l--;`
			`}`
			`*result = wc;`

			`return 0;`
			`}`

			`// Convert one Unicode point to UTF8 encoded version.`
			`// Checks if output fits in 1/4/6 bytes buffer.`
			`int sqplus_wchar_to_utf8(char *s, int wc, int size){`
			`if( size<1 ) return -1;`
			`if( (unsigned int)wc>=0x80000000 ) return -2;`

			`// Single byte case`
			`if( wc<0x80 ){`
			`*s = (char)wc;`
			`//*s = (char)wc&0x7F;`
			`return 1;`
			`}`
			`if( size<4 ) return -3;`

			`// Two or more UTF8 bytes`
			`int p = 1; // Index of last UTF8 byte`
			`if( wc>0x7FF ){ // 11 bits`
			`// Three or more UTF8 bytes`
			`p++; // p>=2`
			`if( wc>0xFFFF ){ // 16 bits`
			`// Four or more UTF8 bytes`
			`p++; // p>=3`
			`if( wc>0x1FFFFF ){ // 21 bits`
			`// Five or more UTF8 bytes`
			`if( size<6 ) return -3;`
			`p++; // p>=4 UTF8 bytes`
			`if( wc>0x3FFFFFF ){ // 26 bits`
			`// Six UTF8 bytes`
			`p++; // p>=5`
			`if( (unsigned int)wc>(unsigned int)0x7FFFFFF ){ // 31 bits`
			`// Would need 7 UTF8 bytes. Not supported.`
			`return -10;`
			`}`
			`s[p-4] = 0x80 \| ((wc>>24)&0x3F); // Bit 24..29`
			`}`
			`s[p-3] = 0x80 \| ((wc>>18)&0x3F); // Bit 18..23`
			`}`
			`s[p-2] = 0x80 \| ((wc>>12)&0x3F); // Bit 12..17`
			`}`
			`s[p-1] = 0x80 \| ((wc>>6)&0x3F); // Bit 6..11`
			`}`
			`s[p] = 0x80 \| (wc&0x3F); // Bit 0..5`
			`s[0] = (0xFC << (5-p)) \| (wc>>(p*6));`

			`return p+1;`
			`}`

			`#endif // #if !defined(SQPLUS_USE_LATIN1) \|\| SQPLUS_USE_LATIN1==0`