UTF-8: Split UCS_REPLACEMENT_CHARACTER off UCS_NO_CHAR.

In the previous version, invalid UTF-8 from a terminal caused UCS_NO_CHAR (0xFFFFFFFD) to be stored in a term_event_key_T, resulting in -3, which was then incidentally treated as an unassigned special key. Now, invalid UTF-8 is instead mapped to UCS_REPLACEMENT_CHARACTER (U+FFFD) and treated as a character. The fact that handle_interlink_event calls term_send_ucs when it receives invalid UTF-8 makes it pretty clear that this is the behavior that was originally intended. src/viewer/text/link.c (not changed in this commit) already referred to UCS_REPLACEMENT_CHARACTER in a comment even though it was not previously defined.
2025-06-30 22:19:29 -04:00 · 2006-08-19 13:29:37 +03:00 · 2006-08-19 13:29:37 +03:00 · 0748ee8c92
commit 0748ee8c92
parent fa1859c0f1
3 changed files with 21 additions and 10 deletions
--- a/src/intl/charsets.c
+++ b/src/intl/charsets.c
@@ -491,7 +491,7 @@ cp2u_shared(const struct codepage_desc *from, unsigned char c)
 		if (from->table[j].c == c)
 			return from->table[j].u;

-	return UCS_NO_CHAR;
+	return UCS_REPLACEMENT_CHARACTER;
 }

 /* Slow algorithm, used for converting input from the terminal.  */
@@ -503,7 +503,7 @@ cp2u(int from, unsigned char c)
 	/* UTF-8 is a multibyte codepage and cannot be handled with
 	 * this function.  */
 	assert(codepages[from].table != table_utf_8);
-	if_assert_failed return UCS_NO_CHAR;
+	if_assert_failed return UCS_REPLACEMENT_CHARACTER;

 	if (c < 0x80) return c;
 	else return cp2u_shared(&codepages[from], c);
--- a/src/intl/charsets.h
+++ b/src/intl/charsets.h
@@ -3,7 +3,17 @@

 typedef uint32_t unicode_val_T;

-/* UCS/Unicode replacement character. */
+/* U+FFFD REPLACEMENT CHARACTER.  Used when no Unicode mapping is
+ * known for a byte in a codepage, or when invalid UTF-8 is received
+ * from a terminal.  After generating the character, ELinks then
+ * treats it like any other Unicode character.  The user can also type
+ * this character directly, and it can occur in documents.  */
+#define UCS_REPLACEMENT_CHARACTER ((unicode_val_T) 0xFFFD)
+
+/* A special value that fits in unicode_val_T but is outside the range
+ * of Unicode characters.  utf_8_to_unicode and cp_to_unicode return
+ * this if the input is too short.  This is also used as a placeholder
+ * for the second cell of a double-cell character.  */
 #define UCS_NO_CHAR ((unicode_val_T) 0xFFFFFFFD)

 /* &nbsp; replacement character. See u2cp(). */
--- a/src/terminal/event.c
+++ b/src/terminal/event.c
@@ -338,20 +338,21 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
 					unicode_val_T u = interlink->utf_8.ucs;

 					if (u < interlink->utf_8.min)
-						u = UCS_NO_CHAR;
+						u = UCS_REPLACEMENT_CHARACTER;
 					term_send_ucs(term, u,
 						      term->interlink->utf_8.modifier);
 				}
 				break;

 			} else {
-				/* The byte sequence for this character is
-				 * ending prematurely.  Send UCS_NO_CHAR for the
-				 * terminated character, but don't break; let
-				 * this byte be handled below. */
+				/* The byte sequence for this character
+				 * is ending prematurely.  Send
+				 * UCS_REPLACEMENT_CHARACTER for the
+				 * terminated character, but don't break;
+				 * let this byte be handled below. */

 				interlink->utf_8.len = 0;
-				term_send_ucs(term, UCS_NO_CHAR,
+				term_send_ucs(term, UCS_REPLACEMENT_CHARACTER,
 					      term->interlink->utf_8.modifier);
 			}
 		}
@@ -404,7 +405,7 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
 			break;
 		}

-		term_send_ucs(term, UCS_NO_CHAR, modifier);
+		term_send_ucs(term, UCS_REPLACEMENT_CHARACTER, modifier);
 		break;
 	}