diff --git a/configure.in b/configure.in index 061ae9cb..87d5ed23 100644 --- a/configure.in +++ b/configure.in @@ -305,7 +305,7 @@ AC_CHECK_FUNCS(setenv putenv, HAVE_SETENV_OR_PUTENV=yes) AC_CHECK_FUNCS(getuid, HAVE_GETUID=yes) AC_CHECK_FUNCS(geteuid, HAVE_GETEUID=yes) -AC_CHECK_FUNCS(wcwidth) +AC_CHECK_FUNCS(wcwidth, HAVE_WCWIDTH=yes) dnl These aren't probably needed now, as they are commented in links.h. dnl I've no idea about their historical background, but I keep them here @@ -1338,6 +1338,9 @@ EL_ARG_ENABLE(CONFIG_SMALL, small, [Small binary], EL_ARG_ENABLE(CONFIG_UTF8, utf-8, [UTF-8], [ --disable-utf-8 disable UTF-8 support]) +EL_ARG_DEPEND(CONFIG_COMBINE, combining, [CONFIG_UTF8:yes HAVE_WCWIDTH:yes], [Combining characters], + [ --enable-combining support Unicode combining characters (experimental)]) + AC_ARG_ENABLE(weehoofooboomookerchoo, [ diff --git a/features.conf b/features.conf index ad253c54..caf19ad3 100644 --- a/features.conf +++ b/features.conf @@ -621,7 +621,7 @@ CONFIG_SMALL=no # support for double-width characters (like Japanese, etc.). # # Some features of Unicode are not handled at all. Combining characters is -# most visible absence. +# most visible absence; but see CONFIG_COMBINE below. # Some features are partially supported. Like line breaking between # double-width characters. There is no other detection for determining when to # break or not. @@ -633,6 +633,42 @@ CONFIG_SMALL=no CONFIG_UTF8=yes +### Unicode combining characters support +# +# Extends CONFIG_UTF8 with spotty support for combining characters +# such as U+0303 COMBINING TILDE. +# +# This feature is experimental and has been filed as enhancement 824. +# Known bugs and weaknesses: +# +# - It assumes wcwidth(wc)==0 means wc is a combining character. +# However, wcwidth also returns 0 for various control characters +# (e.g. U+200E LEFT-TO-RIGHT MARK), and apparently returns -1 if +# LC_CTYPE does not support the wide character. Besides, wchar_t +# might not be Unicode at all. ELinks should instead use Unicode +# character properties, perhaps via ICU. +# +# - It assumes all combining characters are nonspacing. +# +# - It works only if the terminal is using the UTF-8 charset. +# +# - It allocates an internal code for each different combining +# character sequence. A malicious web page could easily use up all +# the available codes, and the ELinks process would thenceforth be +# unable to display any new sequences. +# +# - It does not understand canonical equivalences. +# +# - Combining characters work only in HTML text. They do not work in +# HTML forms, HTML links, HTML document titles, plain text, menus, +# dialog boxes, or keymaps. +# +# - Combining characters at the end of the document do not take effect. +# +# Default: disabled + +CONFIG_COMBINE=no + ### Back-trace Printing # diff --git a/src/config/options.inc b/src/config/options.inc index 61211142..5ed890ac 100644 --- a/src/config/options.inc +++ b/src/config/options.inc @@ -864,10 +864,12 @@ static struct option_info config_options_info[] = { "only the subset of UTF-8 according to terminal codepage is used.\n" "ELinks ignores this option if the terminal codepage is UTF-8.")), +#ifdef CONFIG_COMBINE INIT_OPT_BOOL("terminal._template_", N_("Combining characters"), "combine", 0, 0, N_("Enable combining characters. It works only with " "the xterm in UTF-8 mode.")), +#endif INIT_OPT_BOOL("terminal._template_", N_("Restrict frames in cp850/852"), "restrict_852", 0, 0, diff --git a/src/dialogs/options.c b/src/dialogs/options.c index ebb343bd..4f7fb8c8 100644 --- a/src/dialogs/options.c +++ b/src/dialogs/options.c @@ -91,7 +91,9 @@ enum termopt { TERM_OPT_UTF_8_IO, TERM_OPT_TRANSPARENCY, TERM_OPT_UNDERLINE, +#ifdef CONFIG_COMBINE TERM_OPT_COMBINE, +#endif TERM_OPTIONS, }; @@ -105,7 +107,9 @@ static struct option_resolver resolvers[] = { { TERM_OPT_TRANSPARENCY, "transparency" }, { TERM_OPT_UTF_8_IO, "utf_8_io" }, { TERM_OPT_UNDERLINE, "underline" }, +#ifdef CONFIG_COMBINE { TERM_OPT_COMBINE, "combine" }, +#endif }; static widget_handler_status_T @@ -151,7 +155,7 @@ push_save_button(struct dialog_data *dlg_data, struct widget_data *button) #define RADIO_TRUE 0 #endif -#define TERMOPT_WIDGETS_COUNT (20 + RADIO_88 + RADIO_256 + RADIO_TRUE) +#define TERMOPT_WIDGETS_COUNT (12 + TERM_OPTIONS + RADIO_88 + RADIO_256 + RADIO_TRUE) #define TERM_OPTION_VALUE_SIZE (sizeof(union option_value) * TERM_OPTIONS) @@ -232,7 +236,9 @@ terminal_options(struct terminal *term, void *xxx, struct session *ses) add_dlg_checkbox(dlg, _("Transparency", term), &values[TERM_OPT_TRANSPARENCY].number); add_dlg_checkbox(dlg, _("Underline", term), &values[TERM_OPT_UNDERLINE].number); add_dlg_checkbox(dlg, _("UTF-8 I/O", term), &values[TERM_OPT_UTF_8_IO].number); +#ifdef CONFIG_COMBINE add_dlg_checkbox(dlg, _("Combining characters", term), &values[TERM_OPT_COMBINE].number); +#endif add_dlg_button(dlg, _("~OK", term), B_ENTER, push_ok_button, NULL); if (!anonymous) diff --git a/src/document/document.c b/src/document/document.c index fbbc2eef..52b93030 100644 --- a/src/document/document.c +++ b/src/document/document.c @@ -57,7 +57,7 @@ init_document(struct cache_entry *cached, struct document_options *options) init_list(document->onload_snippets); #endif -#ifdef CONFIG_UTF8 +#ifdef CONFIG_COMBINE document->comb_x = -1; document->comb_y = -1; #endif diff --git a/src/document/document.h b/src/document/document.h index 51905bb1..a9beb791 100644 --- a/src/document/document.h +++ b/src/document/document.h @@ -201,6 +201,8 @@ struct document { #ifdef CONFIG_UTF8 unsigned char buf[7]; unsigned char buf_length; +#endif +#ifdef CONFIG_COMBINE /* base char + 5 combining chars = 6 */ unicode_val_T combi[UCS_MAX_LENGTH_COMBINED]; /* the number of combining characters. The base char is not counted. */ diff --git a/src/document/html/renderer.c b/src/document/html/renderer.c index f855b3af..f3979e9e 100644 --- a/src/document/html/renderer.c +++ b/src/document/html/renderer.c @@ -4,15 +4,18 @@ #include "config.h" #endif -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) -#define _XOPEN_SOURCE 500 +/* Our current implementation of combining characters requires + * wcwidth(). Therefore the configure script should have disabled + * CONFIG_COMBINE if wcwidth() doesn't exist. */ +#ifdef CONFIG_COMBINE +#define _XOPEN_SOURCE 500 /* for wcwidth */ #endif #include #include #include -#if defined(HAVE_WCHAR_H) && defined(HAVE_WCWIDTH) +#ifdef HAVE_WCHAR_H #include #endif @@ -489,7 +492,8 @@ good_char: if (data == UCS_NO_BREAK_SPACE && html_context->options->wrap_nbsp) data = UCS_SPACE; -#ifdef HAVE_WCWIDTH + +#ifdef CONFIG_COMBINE if (wcwidth((wchar_t)data)) { if (document->combi_length) { if (document->comb_x != -1) { @@ -521,7 +525,7 @@ good_char: part->char_width[x] = unicode_to_cell(data); schar->data = (unicode_val_T)data; } -#ifdef HAVE_WCWIDTH +#ifdef CONFIG_COMBINE document->comb_x = x; document->comb_y = y; #endif diff --git a/src/intl/charsets.c b/src/intl/charsets.c index fb1a0b39..8e1d02aa 100644 --- a/src/intl/charsets.c +++ b/src/intl/charsets.c @@ -770,8 +770,10 @@ cp_to_unicode(int codepage, unsigned char **string, unsigned char *end) ++*string; return ret; } +#endif /* CONFIG_UTF8 */ +#ifdef CONFIG_COMBINE unicode_val_T last_combined = UCS_BEGIN_COMBINED - 1; unicode_val_T **combined; struct hash *combined_hash; @@ -830,7 +832,8 @@ free_combined() mem_free(combined[i]); mem_free_if(combined); } -#endif /* CONFIG_UTF8 */ +#endif /* CONFIG_COMBINE */ + static void add_utf8(struct conv_table *ct, unicode_val_T u, const unsigned char *str) @@ -1540,4 +1543,3 @@ is_cp_utf8(int cp_index) cp_index &= ~SYSTEM_CHARSET_FLAG; return is_cp_ptr_utf8(&codepages[cp_index]); } - diff --git a/src/intl/charsets.h b/src/intl/charsets.h index 8eb06fca..e137fa56 100644 --- a/src/intl/charsets.h +++ b/src/intl/charsets.h @@ -27,12 +27,14 @@ typedef uint32_t unicode_val_T; * for the second cell of a double-cell character. */ #define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD) +#ifdef CONFIG_COMBINE #define UCS_END_COMBINED ((unicode_val_T) 0xFFFFFFFC) #define UCS_BEGIN_COMBINED ((unicode_val_T) (UCS_END_COMBINED - (unicode_val_T) 10000)) /* Base character and up to 5 combining characters. */ #define UCS_MAX_LENGTH_COMBINED 6 +#endif /* CONFIG_COMBINE */ /* If ELinks should display a double-cell character but there is only * one cell available, it displays this character instead. This must @@ -154,15 +156,15 @@ unicode_val_T unicode_fold_label_case(unicode_val_T); inline int strlen_utf8(unsigned char **); inline unicode_val_T utf8_to_unicode(unsigned char **, const unsigned char *); unicode_val_T cp_to_unicode(int, unsigned char **, unsigned char *); +#endif /* CONFIG_UTF8 */ - +#ifdef CONFIG_COMBINE extern unicode_val_T last_combined; extern unicode_val_T **combined; extern struct hash *combined_hash; unicode_val_T get_combined(unicode_val_T *, int); void free_combined(); - -#endif /* CONFIG_UTF8 */ +#endif /* CONFIG_COMBINE */ unicode_val_T cp2u(int, unsigned char); const unsigned char *cp2utf8(int, int); diff --git a/src/main/main.c b/src/main/main.c index cdea245b..d791641a 100644 --- a/src/main/main.c +++ b/src/main/main.c @@ -303,7 +303,7 @@ terminate_all_subsystems(void) done_options(); done_event(); terminate_osdep(); -#ifdef CONFIG_UTF8 +#ifdef CONFIG_COMBINE free_combined(); #endif } diff --git a/src/main/version.c b/src/main/version.c index dd8be192..b6506710 100644 --- a/src/main/version.c +++ b/src/main/version.c @@ -142,6 +142,9 @@ get_dyn_full_version(struct terminal *term, int more) #endif #ifdef CONFIG_UTF8 comma, "UTF-8", +#endif +#ifdef CONFIG_COMBINE + comma, _("Combining characters", term), #endif comma, (unsigned char *) NULL diff --git a/src/terminal/screen.c b/src/terminal/screen.c index 162c454b..6553d777 100644 --- a/src/terminal/screen.c +++ b/src/terminal/screen.c @@ -223,10 +223,12 @@ struct screen_driver { * is the same as is_cp_utf8(charsets[0]), except the * latter might crash if UTF-8 I/O is disabled. */ unsigned int utf8_cp:1; +#endif /* CONFIG_UTF8 */ +#ifdef CONFIG_COMBINE /* Whether the terminal supports combining characters. */ unsigned int combine:1; -#endif /* CONFIG_UTF8 */ +#endif /* CONFIG_COMBINE */ } opt; /* The terminal._template_ name. */ @@ -243,8 +245,10 @@ static const struct screen_driver_opt dumb_screen_driver_opt = { /* transparent: */ 1, #ifdef CONFIG_UTF8 /* utf8_cp: */ 0, - /* combine */ 0, #endif /* CONFIG_UTF8 */ +#ifdef CONFIG_COMBINE + /* combine */ 0, +#endif /* CONFIG_COMBINE */ }; /** Default options for ::TERM_VT100. */ @@ -257,8 +261,10 @@ static const struct screen_driver_opt vt100_screen_driver_opt = { /* transparent: */ 1, #ifdef CONFIG_UTF8 /* utf8_cp: */ 0, - /* combine */ 0, #endif /* CONFIG_UTF8 */ +#ifdef CONFIG_COMBINE + /* combine */ 0, +#endif /* CONFIG_COMBINE */ }; /** Default options for ::TERM_LINUX. */ @@ -271,8 +277,10 @@ static const struct screen_driver_opt linux_screen_driver_opt = { /* transparent: */ 1, #ifdef CONFIG_UTF8 /* utf8_cp: */ 0, - /* combine */ 0, #endif /* CONFIG_UTF8 */ +#ifdef CONFIG_COMBINE + /* combine */ 0, +#endif /* CONFIG_COMBINE */ }; /** Default options for ::TERM_KOI8. */ @@ -285,8 +293,10 @@ static const struct screen_driver_opt koi8_screen_driver_opt = { /* transparent: */ 1, #ifdef CONFIG_UTF8 /* utf8_cp: */ 0, - /* combine */ 0, #endif /* CONFIG_UTF8 */ +#ifdef CONFIG_COMBINE + /* combine */ 0, +#endif /* CONFIG_COMBINE */ }; /** Default options for ::TERM_FREEBSD. */ @@ -299,8 +309,10 @@ static const struct screen_driver_opt freebsd_screen_driver_opt = { /* transparent: */ 1, #ifdef CONFIG_UTF8 /* utf8_cp: */ 0, - /* combine */ 0, #endif /* CONFIG_UTF8 */ +#ifdef CONFIG_COMBINE + /* combine */ 0, +#endif /* CONFIG_COMBINE */ }; /** Default options for all the different types of terminals. @@ -332,8 +344,10 @@ set_screen_driver_opt(struct screen_driver *driver, struct option *term_spec) * function need not carefully restore options one by one. */ copy_struct(&driver->opt, screen_driver_opts[driver->type]); -#ifdef CONFIG_UTF8 +#ifdef CONFIG_COMBINE driver->opt.combine = get_opt_bool_tree(term_spec, "combine", NULL); +#endif /* CONFIG_COMBINE */ +#ifdef CONFIG_UTF8 /* Force UTF-8 I/O if the UTF-8 charset is selected. Various * places assume that the terminal's charset is unibyte if * UTF-8 I/O is disabled. (bug 827) */ @@ -648,6 +662,7 @@ add_char_data(struct string *screen, struct screen_driver *driver, } if (data == UCS_NO_CHAR) return; +#ifdef CONFIG_COMBINE if (data >= UCS_BEGIN_COMBINED && data <= last_combined) { unicode_val_T *text = combined[data - UCS_BEGIN_COMBINED]; @@ -663,6 +678,7 @@ add_char_data(struct string *screen, struct screen_driver *driver, data = *text; } } +#endif /* CONFIG_COMBINE */ if (!isscreensafe_ucs(data)) data = UCS_SPACE; add_to_string(screen, encode_utf8(data));