diff --git a/src/document/html/renderer.c b/src/document/html/renderer.c index dfd7f6258..440203191 100644 --- a/src/document/html/renderer.c +++ b/src/document/html/renderer.c @@ -406,6 +406,21 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen, if (realloc_spaces(part, x + charslen)) return 0; + /* U+00AD SOFT HYPHEN characters in HTML documents are + * supposed to be displayed only if the word is broken at that + * point. ELinks currently does not use them, so it should + * not display them. If the input @chars is in UTF-8, then + * set_hline() discards the characters. If the input is in + * some other charset, then set_hline() does not know which + * byte that charset uses for U+00AD, so it cannot discard + * the characters; instead, the translation table used by + * convert_string() has already discarded the characters. + * + * Likewise, if the input @chars is in UTF-8, then it may + * contain U+00A0 NO-BREAK SPACE characters; but if the input + * is in some other charset, then the translation table + * has mapped those characters to NBSP_CHAR. */ + if (part->document) { /* Reallocate LINE(y).chars[] to large enough. The * last parameter of realloc_line is the index of the @@ -424,7 +439,7 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen, if (orig_length < 0) /* error */ return 0; if (utf8) { - unsigned char *end = chars + charslen; + unsigned char *const end = chars + charslen; unicode_val_T data; if (part->document->buf_length) { @@ -459,7 +474,7 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen, } } - for (; chars < end; x++) { + while (chars < end) { /* ELinks does not use NBSP_CHAR in UTF-8. */ data = utf8_to_unicode(&chars, end); @@ -473,7 +488,7 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen, schar->attr = SCREEN_ATTR_FRAME; copy_screen_chars(&POS(x, y), schar, 1); schar->attr = attr; - part->char_width[x] = 0; + part->char_width[x++] = 0; continue; } else { unsigned char i; @@ -486,6 +501,8 @@ set_hline(struct html_context *html_context, unsigned char *chars, int charslen, } } else { good_char: + if (data == UCS_SOFT_HYPHEN) + continue; if (data == UCS_NO_BREAK_SPACE && html_context->options->wrap_nbsp) data = UCS_SPACE; @@ -502,8 +519,8 @@ good_char: schar->data = (unicode_val_T)data; } } - copy_screen_chars(&POS(x, y), schar, 1); - } + copy_screen_chars(&POS(x++, y), schar, 1); + } /* while chars < end */ } else { /* not UTF-8 */ for (; charslen > 0; charslen--, x++, chars++) { part->char_width[x] = 1; @@ -535,13 +552,15 @@ good_char: len = x - x2; } else { /* part->document == NULL */ if (utf8) { - unsigned char *end; + unsigned char *const end = chars + charslen; - for (end = chars + charslen; chars < end; x++) { + while (chars < end) { unicode_val_T data; - part->spaces[x] = (*chars == ' '); data = utf8_to_unicode(&chars, end); + if (data == UCS_SOFT_HYPHEN) + continue; + part->spaces[x] = (data == UCS_SPACE); part->char_width[x] = unicode_to_cell(data); if (part->char_width[x] == 2) { x++; @@ -552,7 +571,8 @@ good_char: /* this is at the end only */ return x - x2; } - } + x++; + } /* while chars < end */ len = x - x2; } else { /* not UTF-8 */ for (; charslen > 0; charslen--, x++, chars++) {