irssi/src/fe-text/utf8.c

/* utf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 2002 Timo Sirainen
 *
 * Based on GLib code by
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "module.h"

/*
 * UTF8_COMPUTE: classify the lead byte of a UTF-8 sequence.
 *
 *   Char - value of the first byte of the sequence
 *   Mask - lvalue; receives the bit mask that selects the payload bits
 *          of the lead byte
 *   Len  - lvalue; receives the sequence length in bytes (1..6), or -1
 *          when Char cannot start a sequence (0x80..0xbf, 0xfe, 0xff).
 *          Mask is left untouched in that case.
 *
 * Char is expanded several times, so it must not have side effects.
 * The body is wrapped in do { } while (0) so that "UTF8_COMPUTE(...);"
 * is exactly one statement and can be used as an un-braced if/else body.
 * Every argument is parenthesized so expressions such as "a | b" are
 * handled correctly.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
	{ \
	  (Len) = 1; \
	  (Mask) = 0x7f; \
	} \
      else if (((Char) & 0xe0) == 0xc0) \
	{ \
	  (Len) = 2; \
	  (Mask) = 0x1f; \
	} \
      else if (((Char) & 0xf0) == 0xe0) \
	{ \
	  (Len) = 3; \
	  (Mask) = 0x0f; \
	} \
      else if (((Char) & 0xf8) == 0xf0) \
	{ \
	  (Len) = 4; \
	  (Mask) = 0x07; \
	} \
      else if (((Char) & 0xfc) == 0xf8) \
	{ \
	  (Len) = 5; \
	  (Mask) = 0x03; \
	} \
      else if (((Char) & 0xfe) == 0xfc) \
	{ \
	  (Len) = 6; \
	  (Mask) = 0x01; \
	} \
      else \
	(Len) = -1; \
    } \
  while (0)

/*
 * UTF8_GET: decode one UTF-8 sequence into a single value.
 *
 *   Result - lvalue; receives the decoded value, or -1 if any of the
 *            bytes Chars[1] .. Chars[Len-1] is not a continuation byte
 *            (bit pattern 10xxxxxx)
 *   Chars  - indexable byte sequence; Chars[0] is the lead byte
 *   Count  - scratch lvalue used as the loop index
 *   Mask   - payload mask for the lead byte (see UTF8_COMPUTE)
 *   Len    - number of bytes in the sequence (see UTF8_COMPUTE)
 *
 * Decoding stops at the first byte that is not a continuation byte, so a
 * terminating NUL inside the expected range ends the loop with Result -1.
 * The body is wrapped in do { } while (0) so that "UTF8_GET(...);" is
 * exactly one statement; without the wrapper the for loop would escape an
 * un-braced if body.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
	{ \
	  if (((Chars)[(Count)] & 0xc0) != 0x80) \
	    { \
	      (Result) = -1; \
	      break; \
	    } \
	  (Result) <<= 6; \
	  (Result) |= ((Chars)[(Count)] & 0x3f); \
	} \
    } \
  while (0)

/*
 * Decode the UTF-8 sequence whose lead byte is at **ptr.
 *
 *   ptr - in/out. Points at the lead byte on entry. On success it is
 *         advanced to the LAST byte of the sequence (chrlen-1 bytes), so
 *         the caller steps one more byte to reach the next character.
 *         It is left unchanged when an error value is returned.
 *   len - upper bound on the number of bytes the sequence may occupy.
 *
 * Returns the decoded character, (unichar) -1 if the sequence needs more
 * than len bytes, or (unichar) -2 if the lead byte or one of the
 * continuation bytes is invalid.
 */
unichar get_utf8_char(const unsigned char **ptr, int len)
{
	int i, result, mask, chrlen;

	mask = 0;
	UTF8_COMPUTE(**ptr, mask, chrlen);
	/* The validity check applies to the length derived from the lead
	   byte, not to the caller-supplied limit. */
	if (chrlen == -1)
		return (unichar) -2;

	if (chrlen > len)
		return (unichar) -1;

	/* Decode exactly chrlen bytes; using the caller's limit here would
	   demand continuation bytes that the sequence does not have. */
	UTF8_GET(result, *ptr, i, mask, chrlen);
	if (result == -1)
		return (unichar) -2;

	*ptr += chrlen-1;
	return result;
}

/*
 * Count the characters in the NUL-terminated string str.
 *
 * Each step asks get_utf8_char() to decode one character with a limit of
 * 6 bytes and then moves one byte further. Counting ends at the NUL
 * terminator or as soon as get_utf8_char() returns a value that is not
 * greater than zero.
 */
int strlen_utf8(const char *str)
{
	const unsigned char *cursor = (const unsigned char *) str;
	int count = 0;

	for (;;) {
		if (*cursor == '\0')
			break;
		if (!(get_utf8_char(&cursor, 6) > 0))
			break;

		count++;
		cursor++;
	}

	return count;
}

/*
 * Encode the character c as a UTF-8 byte sequence.
 *
 *   c      - character value to encode
 *   outbuf - receives the encoded bytes (no terminator is written); may
 *            be a null pointer, in which case nothing is stored and only
 *            the length is computed
 *
 * Returns the number of bytes the encoding occupies (1..6).
 */
int utf16_char_to_utf8(unichar c, unsigned char *outbuf)
{
	int nbytes, lead, pos;

	/* Pick the sequence length and the marker bits of the lead byte,
	   testing the largest range first. */
	if (c >= 0x4000000) {
		lead = 0xfc;
		nbytes = 6;
	} else if (c >= 0x200000) {
		lead = 0xf8;
		nbytes = 5;
	} else if (c >= 0x10000) {
		lead = 0xf0;
		nbytes = 4;
	} else if (c >= 0x800) {
		lead = 0xe0;
		nbytes = 3;
	} else if (c >= 0x80) {
		lead = 0xc0;
		nbytes = 2;
	} else {
		lead = 0;
		nbytes = 1;
	}

	if (!outbuf)
		return nbytes;

	/* Fill the continuation bytes from the end, six payload bits each;
	   whatever remains of c goes into the lead byte. */
	pos = nbytes;
	while (--pos > 0) {
		outbuf[pos] = (c & 0x3f) | 0x80;
		c >>= 6;
	}
	outbuf[0] = c | lead;

	return nbytes;
}

/*
 * Decode the NUL-terminated UTF-8 string str into the unichar array out.
 *
 *   str - input bytes
 *   out - receives one element per decoded character followed by a
 *         terminating 0 element; no size is passed in, so the caller
 *         must supply a buffer that is large enough
 *
 * Conversion stops silently at the first invalid lead byte or broken
 * sequence; everything decoded up to that point is kept and terminated.
 */
void utf8_to_utf16(const char *str, unichar *out)
{
	const unsigned char *src = (const unsigned char *) str;
	unichar *dst = out;
	int idx, code, lead_mask, seq_len;

	for (; *src != '\0'; src += seq_len) {
		lead_mask = 0;
		UTF8_COMPUTE(*src, lead_mask, seq_len);
		if (seq_len == -1)
			break;

		UTF8_GET(code, src, idx, lead_mask, seq_len);
		if (code == -1)
			break;

		*dst++ = code;
	}

	*dst = '\0';
}

/*
 * Encode the 0-terminated unichar string str as UTF-8 into out.
 *
 *   str - input characters, ending at the first 0 element
 *   out - receives the encoded bytes followed by a '\0' terminator; no
 *         size is passed in, so the caller must supply a buffer that is
 *         large enough (utf16_char_to_utf8() reports up to 6 bytes per
 *         character)
 */
void utf16_to_utf8(const unichar *str, char *out)
{
	for (; *str != '\0'; str++) {
		/* The encoder takes unsigned char *; convert explicitly
		   instead of relying on an implicit pointer-signedness
		   conversion. */
		out += utf16_char_to_utf8(*str, (unsigned char *) out);
	}
	*out = '\0';
}
/SET term_type 8bit\|utf8\|big5 - Removed --with-big5 configure option, it's now included the same way as utf8, though i'm not sure if it really works but at least it should partially :) Input line is now internally using 32bit chars but it's converted to 8bit chars for external use. Text buffer supports only 8bit + utf8 for now. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2448 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-02-15 08:38:24 -05:00			`/* utf8.c - Operations on UTF-8 strings.`
			`*`
			`* Copyright (C) 2002 Timo Sirainen`
			`*`
			`* Based on GLib code by`
			`*`
			`* Copyright (C) 1999 Tom Tromey`
			`* Copyright (C) 2000 Red Hat, Inc.`
			`*`
			`* This library is free software; you can redistribute it and/or`
			`* modify it under the terms of the GNU Lesser General Public`
			`* License as published by the Free Software Foundation; either`
			`* version 2 of the License, or (at your option) any later version.`
			`*`
			`* This library is distributed in the hope that it will be useful,`
			`* but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`* Lesser General Public License for more details.`
			`*`
			`* You should have received a copy of the GNU Lesser General Public`
			`* License along with this library; if not, write to the`
			`* Free Software Foundation, Inc., 59 Temple Place - Suite 330,`
			`* Boston, MA 02111-1307, USA.`
			`*/`

			`#include "module.h"`
Preliminary support for UTF8 with /SET term_utf8 ON. Input line is still messed up, but lines should wrap properly in text buffer. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2354 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-01-28 20:33:41 -05:00
			`#define UTF8_COMPUTE(Char, Mask, Len) \`
			`if (Char < 128) \`
			`{ \`
			`Len = 1; \`
			`Mask = 0x7f; \`
			`} \`
			`else if ((Char & 0xe0) == 0xc0) \`
			`{ \`
			`Len = 2; \`
			`Mask = 0x1f; \`
			`} \`
			`else if ((Char & 0xf0) == 0xe0) \`
			`{ \`
			`Len = 3; \`
			`Mask = 0x0f; \`
			`} \`
			`else if ((Char & 0xf8) == 0xf0) \`
			`{ \`
			`Len = 4; \`
			`Mask = 0x07; \`
			`} \`
			`else if ((Char & 0xfc) == 0xf8) \`
			`{ \`
			`Len = 5; \`
			`Mask = 0x03; \`
			`} \`
			`else if ((Char & 0xfe) == 0xfc) \`
			`{ \`
			`Len = 6; \`
			`Mask = 0x01; \`
			`} \`
			`else \`
			`Len = -1;`

			`#define UTF8_GET(Result, Chars, Count, Mask, Len) \`
			`(Result) = (Chars)[0] & (Mask); \`
			`for ((Count) = 1; (Count) < (Len); ++(Count)) \`
			`{ \`
			`if (((Chars)[(Count)] & 0xc0) != 0x80) \`
			`{ \`
			`(Result) = -1; \`
			`break; \`
			`} \`
			`(Result) <<= 6; \`
			`(Result) \|= ((Chars)[(Count)] & 0x3f); \`
			`}`

/SET term_type 8bit\|utf8\|big5 - Removed --with-big5 configure option, it's now included the same way as utf8, though i'm not sure if it really works but at least it should partially :) Input line is now internally using 32bit chars but it's converted to 8bit chars for external use. Text buffer supports only 8bit + utf8 for now. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2448 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-02-15 08:38:24 -05:00			`unichar get_utf8_char(const unsigned char **ptr, int len)`
Preliminary support for UTF8 with /SET term_utf8 ON. Input line is still messed up, but lines should wrap properly in text buffer. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2354 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-01-28 20:33:41 -05:00			`{`
/SET term_type 8bit\|utf8\|big5 - Removed --with-big5 configure option, it's now included the same way as utf8, though i'm not sure if it really works but at least it should partially :) Input line is now internally using 32bit chars but it's converted to 8bit chars for external use. Text buffer supports only 8bit + utf8 for now. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2448 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-02-15 08:38:24 -05:00			`int i, result, mask, chrlen;`
Preliminary support for UTF8 with /SET term_utf8 ON. Input line is still messed up, but lines should wrap properly in text buffer. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2354 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-01-28 20:33:41 -05:00
/SET term_type 8bit\|utf8\|big5 - Removed --with-big5 configure option, it's now included the same way as utf8, though i'm not sure if it really works but at least it should partially :) Input line is now internally using 32bit chars but it's converted to 8bit chars for external use. Text buffer supports only 8bit + utf8 for now. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2448 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-02-15 08:38:24 -05:00			`mask = 0;`
			`UTF8_COMPUTE(**ptr, mask, chrlen);`
Preliminary support for UTF8 with /SET term_utf8 ON. Input line is still messed up, but lines should wrap properly in text buffer. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2354 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-01-28 20:33:41 -05:00			`if (len == -1)`
/SET term_type 8bit\|utf8\|big5 - Removed --with-big5 configure option, it's now included the same way as utf8, though i'm not sure if it really works but at least it should partially :) Input line is now internally using 32bit chars but it's converted to 8bit chars for external use. Text buffer supports only 8bit + utf8 for now. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2448 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-02-15 08:38:24 -05:00			`return (unichar) -2;`

			`if (chrlen > len)`
			`return (unichar) -1;`
Preliminary support for UTF8 with /SET term_utf8 ON. Input line is still messed up, but lines should wrap properly in text buffer. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2354 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-01-28 20:33:41 -05:00
			`UTF8_GET(result, *ptr, i, mask, len);`
			`if (result == -1)`
/SET term_type 8bit\|utf8\|big5 - Removed --with-big5 configure option, it's now included the same way as utf8, though i'm not sure if it really works but at least it should partially :) Input line is now internally using 32bit chars but it's converted to 8bit chars for external use. Text buffer supports only 8bit + utf8 for now. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2448 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-02-15 08:38:24 -05:00			`return (unichar) -2;`

			`*ptr += len-1;`
			`return result;`
			`}`

			`int strlen_utf8(const char *str)`
			`{`
			`const unsigned char p = (const unsigned char ) str;`
			`int len;`

			`len = 0;`
			`while (*p != '\0' && get_utf8_char(&p, 6) > 0) {`
			`len++;`
			`p++;`
			`}`
			`return len;`
			`}`

			`int utf16_char_to_utf8(unichar c, unsigned char *outbuf)`
			`{`
			`int len, i, first;`

			`len = 0;`
			`if (c < 0x80) {`
			`first = 0;`
			`len = 1;`
			`} else if (c < 0x800) {`
			`first = 0xc0;`
			`len = 2;`
			`} else if (c < 0x10000) {`
			`first = 0xe0;`
			`len = 3;`
			`} else if (c < 0x200000) {`
			`first = 0xf0;`
			`len = 4;`
			`} else if (c < 0x4000000) {`
			`first = 0xf8;`
			`len = 5;`
			`} else {`
			`first = 0xfc;`
			`len = 6;`
			`}`

			`if (outbuf) {`
			`for (i = len - 1; i > 0; --i) {`
			`outbuf[i] = (c & 0x3f) \| 0x80;`
			`c >>= 6;`
			`}`
			`outbuf[0] = c \| first;`
			`}`

			`return len;`
			`}`

			`void utf8_to_utf16(const char str, unichar out)`
			`{`
			`const unsigned char p = (const unsigned char ) str;`
			`int i, result, mask, len;`

			`while (*p != '\0') {`
			`mask = 0;`
			`UTF8_COMPUTE(*p, mask, len);`
			`if (len == -1)`
			`break;`

			`UTF8_GET(result, p, i, mask, len);`
			`if (result == -1)`
			`break;`

			`p += len;`
			`*out++ = result;`
			`}`

			`*out = '\0';`
			`}`

			`void utf16_to_utf8(const unichar str, char out)`
			`{`
			`int len;`

			`while (*str != '\0') {`
			`len = utf16_char_to_utf8(*str, out);`
			`out += len;`
Preliminary support for UTF8 with /SET term_utf8 ON. Input line is still messed up, but lines should wrap properly in text buffer. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2354 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-01-28 20:33:41 -05:00
/SET term_type 8bit\|utf8\|big5 - Removed --with-big5 configure option, it's now included the same way as utf8, though i'm not sure if it really works but at least it should partially :) Input line is now internally using 32bit chars but it's converted to 8bit chars for external use. Text buffer supports only 8bit + utf8 for now. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2448 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-02-15 08:38:24 -05:00			`str++;`
			`}`
			`*out = '\0';`
Preliminary support for UTF8 with /SET term_utf8 ON. Input line is still messed up, but lines should wrap properly in text buffer. git-svn-id: http://svn.irssi.org/repos/irssi/trunk@2354 dbcabf3a-b0e7-0310-adc4-f8d773084564 2002-01-28 20:33:41 -05:00			`}`