/* utf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 2002 Timo Sirainen
 *
 * Based on GLib code by
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
|
|
|
|
|
|
|
|
#include "module.h"
|
2002-01-28 20:33:41 -05:00
|
|
|
|
|
|
|
/*
 * UTF8_COMPUTE(Char, Mask, Len)
 *
 * Classify Char as the lead byte of a UTF-8 sequence.
 *
 *   Char - lead byte to inspect; the "< 128" test assumes it is an
 *          unsigned byte value. It is evaluated several times, so it
 *          must not have side effects.
 *   Mask - lvalue; receives the bit mask that extracts the payload bits
 *          from the lead byte.
 *   Len  - lvalue; receives the total sequence length in bytes (1..6),
 *          or -1 when Char matches none of the lead-byte patterns. Mask
 *          is left untouched in that case.
 *
 * The expansion is wrapped in do { } while (0) so that the macro behaves
 * as exactly one statement, e.g. as the unbraced body of an if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
	do { \
		if ((Char) < 128) { \
			(Len) = 1; \
			(Mask) = 0x7f; \
		} else if (((Char) & 0xe0) == 0xc0) { \
			(Len) = 2; \
			(Mask) = 0x1f; \
		} else if (((Char) & 0xf0) == 0xe0) { \
			(Len) = 3; \
			(Mask) = 0x0f; \
		} else if (((Char) & 0xf8) == 0xf0) { \
			(Len) = 4; \
			(Mask) = 0x07; \
		} else if (((Char) & 0xfc) == 0xf8) { \
			(Len) = 5; \
			(Mask) = 0x03; \
		} else if (((Char) & 0xfe) == 0xfc) { \
			(Len) = 6; \
			(Mask) = 0x01; \
		} else { \
			(Len) = -1; \
		} \
	} while (0)
|
|
|
|
|
|
|
|
/*
 * UTF8_GET(Result, Chars, Count, Mask, Len)
 *
 * Decode the Len-byte sequence starting at Chars[0] into Result.
 *
 *   Result - lvalue; receives the decoded value, or -1 if one of the
 *            bytes after the lead byte does not have the 10xxxxxx form.
 *   Chars  - indexable byte sequence; evaluated several times.
 *            NOTE(review): presumably unsigned bytes - confirm at callers.
 *   Count  - lvalue used as the loop index. Afterwards it equals Len on
 *            success, or the index of the offending byte on failure.
 *   Mask   - payload mask for the lead byte (see UTF8_COMPUTE).
 *   Len    - total number of bytes in the sequence.
 *
 * Each byte after the lead byte contributes its low six bits. The
 * expansion is wrapped in do { } while (0) so the initial assignment and
 * the loop always execute together as a single statement; the inner
 * "break" still only leaves the for loop.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
	do { \
		(Result) = (Chars)[0] & (Mask); \
		for ((Count) = 1; (Count) < (Len); ++(Count)) { \
			if (((Chars)[(Count)] & 0xc0) != 0x80) { \
				(Result) = -1; \
				break; \
			} \
			(Result) <<= 6; \
			(Result) |= ((Chars)[(Count)] & 0x3f); \
		} \
	} while (0)
|
|
|
|
|
2002-02-15 08:38:24 -05:00
|
|
|
unichar get_utf8_char(const unsigned char **ptr, int len)
|
2002-01-28 20:33:41 -05:00
|
|
|
{
|
2002-02-15 08:38:24 -05:00
|
|
|
int i, result, mask, chrlen;
|
2002-01-28 20:33:41 -05:00
|
|
|
|
2002-02-15 08:38:24 -05:00
|
|
|
mask = 0;
|
|
|
|
UTF8_COMPUTE(**ptr, mask, chrlen);
|
2002-02-15 09:09:10 -05:00
|
|
|
if (chrlen == -1)
|
2002-02-15 08:38:24 -05:00
|
|
|
return (unichar) -2;
|
|
|
|
|
|
|
|
if (chrlen > len)
|
|
|
|
return (unichar) -1;
|
2002-01-28 20:33:41 -05:00
|
|
|
|
2002-02-15 09:09:10 -05:00
|
|
|
UTF8_GET(result, *ptr, i, mask, chrlen);
|
2002-01-28 20:33:41 -05:00
|
|
|
if (result == -1)
|
2002-02-15 08:38:24 -05:00
|
|
|
return (unichar) -2;
|
|
|
|
|
2002-02-15 09:09:10 -05:00
|
|
|
*ptr += chrlen-1;
|
2002-02-15 08:38:24 -05:00
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Count the UTF-8 sequences in the NUL-terminated string str.
 *
 * Counting stops at the terminating NUL, or as soon as get_utf8_char()
 * returns a value that is not greater than zero. The number of sequences
 * decoded up to that point is returned.
 *
 * NOTE(review): whether the error values of get_utf8_char() end the loop
 * depends on the signedness of unichar, which is declared elsewhere -
 * confirm.
 */
int strlen_utf8(const char *str)
{
	const unsigned char *cursor = (const unsigned char *) str;
	int count = 0;

	for (;;) {
		if (*cursor == '\0')
			break;

		/* 6 is the longest sequence length UTF8_COMPUTE reports. */
		if (get_utf8_char(&cursor, 6) <= 0)
			break;

		count++;
		/* get_utf8_char() stops on the last byte; step past it. */
		cursor++;
	}

	return count;
}
|
|
|
|
|
|
|
|
/*
 * Encode the character c as a UTF-8 byte sequence.
 *
 *   c      - character value to encode.
 *   outbuf - destination for the encoded bytes, or a null pointer to only
 *            compute the length. When non-null it must have room for the
 *            returned number of bytes; no terminating NUL is written.
 *
 * Returns the length of the encoding in bytes (1..6).
 */
int utf16_char_to_utf8(unichar c, unsigned char *outbuf)
{
	int nbytes, lead, pos;

	/* Pick the sequence length and lead-byte marker, largest range first. */
	if (c >= 0x4000000) {
		lead = 0xfc;
		nbytes = 6;
	} else if (c >= 0x200000) {
		lead = 0xf8;
		nbytes = 5;
	} else if (c >= 0x10000) {
		lead = 0xf0;
		nbytes = 4;
	} else if (c >= 0x800) {
		lead = 0xe0;
		nbytes = 3;
	} else if (c >= 0x80) {
		lead = 0xc0;
		nbytes = 2;
	} else {
		lead = 0;
		nbytes = 1;
	}

	if (!outbuf)
		return nbytes;

	/* Fill the trailing bytes back to front, six payload bits each. */
	pos = nbytes;
	while (--pos > 0) {
		outbuf[pos] = (c & 0x3f) | 0x80;
		c >>= 6;
	}

	/* Whatever bits remain go into the lead byte. */
	outbuf[0] = c | lead;

	return nbytes;
}
|
|
|
|
|
|
|
|
/*
 * Convert the NUL-terminated UTF-8 string str into an array of unichar
 * values written to out, followed by a terminating zero element.
 *
 * Conversion stops silently at the first byte that is not a valid lead
 * byte, or at the first sequence with a malformed following byte; whatever
 * was decoded up to that point is kept and terminated.
 *
 * No output size is passed in, so out must be large enough for every
 * decoded character plus the terminator.
 */
void utf8_to_utf16(const char *str, unichar *out)
{
	const unsigned char *src = (const unsigned char *) str;
	int idx, code, lead_mask, seq_len;

	/* src only advances after a sequence has been decoded successfully. */
	for (; *src != '\0'; src += seq_len) {
		lead_mask = 0;
		UTF8_COMPUTE(*src, lead_mask, seq_len);
		if (seq_len == -1)
			break;

		UTF8_GET(code, src, idx, lead_mask, seq_len);
		if (code == -1)
			break;

		*out++ = code;
	}

	*out = '\0';
}
|
|
|
|
|
|
|
|
/*
 * Convert the zero-terminated unichar string str to UTF-8, writing the
 * bytes to out followed by a terminating NUL.
 *
 * Each character is encoded with utf16_char_to_utf8(). No output size is
 * passed in, so out must be large enough for the whole encoding plus the
 * terminator.
 */
void utf16_to_utf8(const unichar *str, char *out)
{
	int len;

	while (*str != '\0') {
		/* The encoder takes unsigned bytes; convert the pointer
		   explicitly instead of relying on an implicit
		   char * -> unsigned char * conversion. */
		len = utf16_char_to_utf8(*str, (unsigned char *) out);
		out += len;

		str++;
	}
	*out = '\0';
}
|