mirror of
https://github.com/rkd77/elinks.git
synced 2024-12-04 14:46:47 -05:00
UTF-8: Split UCS_REPLACEMENT_CHARACTER off UCS_NO_CHAR.
Previously, invalid UTF-8 from a terminal caused UCS_NO_CHAR (0xFFFFFFFD) to be stored in a term_event_key_T; interpreted as a signed value this is -3, which was then incidentally treated as an unassigned special key. Now, invalid UTF-8 is instead mapped to UCS_REPLACEMENT_CHARACTER (U+FFFD) and treated as an ordinary character. The fact that handle_interlink_event calls term_send_ucs when it receives invalid UTF-8 strongly suggests that this was the intended behavior. src/viewer/text/link.c (not changed in this commit) already referred to UCS_REPLACEMENT_CHARACTER in a comment even though the macro was not previously defined.
This commit is contained in:
parent
fa1859c0f1
commit
0748ee8c92
@ -491,7 +491,7 @@ cp2u_shared(const struct codepage_desc *from, unsigned char c)
|
||||
if (from->table[j].c == c)
|
||||
return from->table[j].u;
|
||||
|
||||
return UCS_NO_CHAR;
|
||||
return UCS_REPLACEMENT_CHARACTER;
|
||||
}
|
||||
|
||||
/* Slow algorithm, used for converting input from the terminal. */
|
||||
@ -503,7 +503,7 @@ cp2u(int from, unsigned char c)
|
||||
/* UTF-8 is a multibyte codepage and cannot be handled with
|
||||
* this function. */
|
||||
assert(codepages[from].table != table_utf_8);
|
||||
if_assert_failed return UCS_NO_CHAR;
|
||||
if_assert_failed return UCS_REPLACEMENT_CHARACTER;
|
||||
|
||||
if (c < 0x80) return c;
|
||||
else return cp2u_shared(&codepages[from], c);
|
||||
|
@ -3,7 +3,17 @@
|
||||
|
||||
typedef uint32_t unicode_val_T;
|
||||
|
||||
/* UCS/Unicode replacement character. */
|
||||
/* U+FFFD REPLACEMENT CHARACTER. Used when no Unicode mapping is
|
||||
* known for a byte in a codepage, or when invalid UTF-8 is received
|
||||
* from a terminal. After generating the character, ELinks then
|
||||
* treats it like any other Unicode character. The user can also type
|
||||
* this character directly, and it can occur in documents. */
|
||||
#define UCS_REPLACEMENT_CHARACTER ((unicode_val_T) 0xFFFD)
|
||||
|
||||
/* A special value that fits in unicode_val_T but is outside the range
|
||||
* of Unicode characters. utf_8_to_unicode and cp_to_unicode return
|
||||
* this if the input is too short. This is also used as a placeholder
|
||||
* for the second cell of a double-cell character. */
|
||||
#define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD)
|
||||
|
||||
/* &nbsp; replacement character. See u2cp(). */
|
||||
|
@ -338,20 +338,21 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
|
||||
unicode_val_T u = interlink->utf_8.ucs;
|
||||
|
||||
if (u < interlink->utf_8.min)
|
||||
u = UCS_NO_CHAR;
|
||||
u = UCS_REPLACEMENT_CHARACTER;
|
||||
term_send_ucs(term, u,
|
||||
term->interlink->utf_8.modifier);
|
||||
}
|
||||
break;
|
||||
|
||||
} else {
|
||||
/* The byte sequence for this character is
|
||||
* ending prematurely. Send UCS_NO_CHAR for the
|
||||
* terminated character, but don't break; let
|
||||
* this byte be handled below. */
|
||||
/* The byte sequence for this character
|
||||
* is ending prematurely. Send
|
||||
* UCS_REPLACEMENT_CHARACTER for the
|
||||
* terminated character, but don't break;
|
||||
* let this byte be handled below. */
|
||||
|
||||
interlink->utf_8.len = 0;
|
||||
term_send_ucs(term, UCS_NO_CHAR,
|
||||
term_send_ucs(term, UCS_REPLACEMENT_CHARACTER,
|
||||
term->interlink->utf_8.modifier);
|
||||
}
|
||||
}
|
||||
@ -404,7 +405,7 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
|
||||
break;
|
||||
}
|
||||
|
||||
term_send_ucs(term, UCS_NO_CHAR, modifier);
|
||||
term_send_ucs(term, UCS_REPLACEMENT_CHARACTER, modifier);
|
||||
break;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user