1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-12-04 14:46:47 -05:00

UTF-8: Split UCS_REPLACEMENT_CHARACTER off UCS_NO_CHAR.

In the previous version, invalid UTF-8 from a terminal caused
UCS_NO_CHAR (0xFFFFFFFD) to be stored in a term_event_key_T, resulting
in -3, which was then incidentally treated as an unassigned special key.

Now, invalid UTF-8 is instead mapped to UCS_REPLACEMENT_CHARACTER
and treated as a character.  The fact that handle_interlink_event
calls term_send_ucs when it receives invalid UTF-8 makes it pretty
clear that this is how it was intended.

src/viewer/text/link.c (not changed in this commit) already referred
to UCS_REPLACEMENT_CHARACTER in a comment even though it was not
previously defined.
This commit is contained in:
Kalle Olavi Niemitalo 2006-08-19 13:29:37 +03:00 committed by Kalle Olavi Niemitalo
parent fa1859c0f1
commit 0748ee8c92
3 changed files with 21 additions and 10 deletions

View File

@ -491,7 +491,7 @@ cp2u_shared(const struct codepage_desc *from, unsigned char c)
if (from->table[j].c == c)
return from->table[j].u;
return UCS_NO_CHAR;
return UCS_REPLACEMENT_CHARACTER;
}
/* Slow algorithm, used for converting input from the terminal. */
@ -503,7 +503,7 @@ cp2u(int from, unsigned char c)
/* UTF-8 is a multibyte codepage and cannot be handled with
* this function. */
assert(codepages[from].table != table_utf_8);
if_assert_failed return UCS_NO_CHAR;
if_assert_failed return UCS_REPLACEMENT_CHARACTER;
if (c < 0x80) return c;
else return cp2u_shared(&codepages[from], c);

View File

@ -3,7 +3,17 @@
typedef uint32_t unicode_val_T;
/* UCS/Unicode replacement character. */
/* U+FFFD REPLACEMENT CHARACTER. Used when no Unicode mapping is
* known for a byte in a codepage, or when invalid UTF-8 is received
* from a terminal. After generating the character, ELinks then
* treats it like any other Unicode character. The user can also type
* this character directly, and it can occur in documents. */
#define UCS_REPLACEMENT_CHARACTER ((unicode_val_T) 0xFFFD)
/* A special value that fits in unicode_val_T but is outside the range
* of Unicode characters. utf_8_to_unicode and cp_to_unicode return
* this if the input is too short. This is also used as a placeholder
* for the second cell of a double-cell character. */
#define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD)
/* &nbsp; replacement character. See u2cp(). */

View File

@ -338,20 +338,21 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
unicode_val_T u = interlink->utf_8.ucs;
if (u < interlink->utf_8.min)
u = UCS_NO_CHAR;
u = UCS_REPLACEMENT_CHARACTER;
term_send_ucs(term, u,
term->interlink->utf_8.modifier);
}
break;
} else {
/* The byte sequence for this character is
* ending prematurely. Send UCS_NO_CHAR for the
* terminated character, but don't break; let
* this byte be handled below. */
/* The byte sequence for this character
* is ending prematurely. Send
* UCS_REPLACEMENT_CHARACTER for the
* terminated character, but don't break;
* let this byte be handled below. */
interlink->utf_8.len = 0;
term_send_ucs(term, UCS_NO_CHAR,
term_send_ucs(term, UCS_REPLACEMENT_CHARACTER,
term->interlink->utf_8.modifier);
}
}
@ -404,7 +405,7 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
break;
}
term_send_ucs(term, UCS_NO_CHAR, modifier);
term_send_ucs(term, UCS_REPLACEMENT_CHARACTER, modifier);
break;
}