From 6d567bb8ce9b77eed0d7ad3a17250178c455804a Mon Sep 17 00:00:00 2001 From: Witold Filipczyk Date: Fri, 23 Jul 2010 19:59:59 +0200 Subject: [PATCH] Support for multibyte encodings using iconv. New charsets must be added by analogy to Big5. --- src/intl/charsets.c | 112 +++++++++++++++++++++++++++++++++++++++++- src/intl/charsets.h | 1 + src/intl/codepage.inc | 15 +++++- 3 files changed, 126 insertions(+), 2 deletions(-) diff --git a/src/intl/charsets.c b/src/intl/charsets.c index a9bf9da0d..310fe1ed1 100644 --- a/src/intl/charsets.c +++ b/src/intl/charsets.c @@ -18,6 +18,11 @@ #include #endif +#ifdef HAVE_ICONV +#include <errno.h> +#include <iconv.h> +#endif + #include "elinks.h" #include "document/options.h" @@ -122,6 +127,10 @@ static const char strings[256][2] = { "\370", "\371", "\372", "\373", "\374", "\375", "\376", "\377", }; +#ifdef HAVE_ICONV +static iconv_t iconv_cd = (iconv_t)-1; +#endif + static void free_translation_table(struct conv_table *p) { @@ -157,6 +166,7 @@ new_translation_table(struct conv_table *p) p[i].t = 0; p[i].u.str = no_str; } + p->iconv_cp = -1; } #define BIN_SEARCH(table, entry, entries, key, result) \ @@ -952,6 +962,12 @@ free_conv_table(void) first = 0; } new_translation_table(table); +#ifdef HAVE_ICONV + if (iconv_cd != (iconv_t)-1) { + iconv_close(iconv_cd); + iconv_cd = (iconv_t)-1; + } +#endif } @@ -967,6 +983,14 @@ get_translation_table(int from, int to) memset(table, 0, sizeof(table)); first = 0; } + + if (codepages[from].iconv) { + struct conv_table *table = get_translation_table_to_utf8(34); + + if (table) table->iconv_cp = from; + return table; + } + if (/*from == to ||*/ from == -1 || to == -1) return NULL; if (is_cp_ptr_utf8(&codepages[to])) @@ -1242,7 +1266,7 @@ end: unsigned char * convert_string(struct conv_table *convert_table, - unsigned char *chars, int charslen, int cp, + unsigned char *chars2, int charslen2, int cp, enum convert_string_mode mode, int *length, void (*callback)(void *data, unsigned char *buf, int buflen), void 
*callback_data) @@ -1250,6 +1274,19 @@ convert_string(struct conv_table *convert_table, unsigned char *buffer; int bufferpos = 0; int charspos = 0; + unsigned char *chars = chars2; + int charslen = charslen2; + +#ifdef HAVE_ICONV + static char iconv_input[256]; + static char iconv_output[256 * 8]; + static size_t iconv_offset; + static int iconv_cp; + static size_t iconv_inleft; + size_t iconv_outleft = 256 * 8; + int loop = 0; + int is_iconv = 0; + int chars_offset = 0; if (!convert_table && !memchr(chars, '&', charslen)) { if (callback) { @@ -1260,11 +1297,81 @@ convert_string(struct conv_table *convert_table, } } + if (cp >= 0) { + if (convert_table && convert_table->iconv_cp > 0) { + is_iconv = 1; + cp = convert_table->iconv_cp; + } else { + is_iconv = codepages[cp & ~SYSTEM_CHARSET_FLAG].iconv; + } + } +#endif + /* Buffer allocation */ buffer = mem_alloc(ALLOC_GR + 1 /* trailing \0 */); if (!buffer) return NULL; +#ifdef HAVE_ICONV + if (is_iconv) { + int v; + size_t before, to_copy; + char *outp, *inp; + + if (iconv_cd >= 0) { + if (cp != iconv_cp) { + iconv_close(iconv_cd); + iconv_cd = (iconv_t)-1; + } + } + if (iconv_cd == (iconv_t)-1) { + iconv_offset = 0; + iconv_cd = iconv_open("utf-8", get_cp_mime_name(cp)); + if (iconv_cd == (iconv_t)-1) { + mem_free(buffer); + return NULL; + } + iconv_cp = cp; + } +repeat: + to_copy = charslen2 - chars_offset; + if (to_copy > 256 - iconv_offset) to_copy = 256 - iconv_offset; + memcpy(iconv_input + iconv_offset, chars + chars_offset, to_copy); + iconv_outleft = 256 * 8; + iconv_inleft = iconv_offset + to_copy; + inp = iconv_input; + outp = iconv_output; + before = iconv_inleft; +again: + v = iconv(iconv_cd, &inp, &iconv_inleft, &outp, &iconv_outleft); + chars_offset += before - iconv_inleft; + charslen = 256 * 8 - iconv_outleft; + + chars = (unsigned char *)iconv_output; + charspos = 0; + + if (v == -1) { + switch (errno) { + case EINVAL: + memcpy(iconv_input, inp, iconv_inleft); + iconv_offset = iconv_inleft; + break; 
+ case EILSEQ: + chars_offset++; + iconv_inleft--; + inp++; + goto again; + break; + default: + iconv_offset = 0; + } + } else { + iconv_offset = 0; + } + + loop = chars_offset < charslen2; + } +#endif /* Iterate ;-) */ while (charspos < charslen) { @@ -1359,6 +1466,9 @@ flush: #undef PUTC } +#ifdef HAVE_ICONV + if (loop) goto repeat; +#endif /* Say bye */ buffer[bufferpos] = 0; diff --git a/src/intl/charsets.h b/src/intl/charsets.h index 714fae034..7c4df2a5c 100644 --- a/src/intl/charsets.h +++ b/src/intl/charsets.h @@ -74,6 +74,7 @@ struct conv_table { * table owns the nested conversion table. */ struct conv_table *tbl; } u; + int iconv_cp; }; enum convert_string_mode { diff --git a/src/intl/codepage.inc b/src/intl/codepage.inc index 14b540381..992a247b2 100644 --- a/src/intl/codepage.inc +++ b/src/intl/codepage.inc @@ -4884,6 +4884,18 @@ unsigned char *const aliases_utf8 [] = { NULL }; +/*** Big5 ***/ + +#define highhalf_big5 highhalf_NULL + +#define table_big5 table_NULL + +unsigned char *const aliases_big5 [] = { + "Big5", + NULL +}; + + /*** NULL ***/ @@ -4947,7 +4959,8 @@ const struct codepage_desc codepages [] = { {"TCVN-5712", aliases_tcvn5712, highhalf_tcvn5712, table_tcvn5712, 0}, {"VISCII", aliases_viscii, highhalf_viscii, table_viscii, 0}, {"Unicode UTF-8", aliases_utf8, highhalf_utf8, table_utf8, 0}, + {"Big5", aliases_big5, highhalf_big5, table_big5, 1}, {NULL, NULL, NULL, 0} }; -#define N_CODEPAGES 35 +#define N_CODEPAGES 36