0
0
mirror of https://github.com/rkd77/elinks.git synced 2025-06-30 22:19:29 -04:00

[utf] Debug REPLACEMENT_CHARACTER. Refs #249

This commit is contained in:
Witold Filipczyk 2023-07-29 09:38:08 +02:00
parent 4ef70a1cfa
commit 5cd66e06ba
4 changed files with 51 additions and 13 deletions

View File

@ -748,66 +748,88 @@ utf8_to_unicode(char **string, const char *end)
if (str[0] >= 0x80) {
invalid_utf8:
++*string;
fprintf(stderr, "%s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
return UCS_REPLACEMENT_CHARACTER;
}
u = str[0];
break;
case 2: /* U+0080 to U+07FF */
if ((str[1] & 0xc0) != 0x80)
if ((str[1] & 0xc0) != 0x80) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
u = (str[0] & 0x1f) << 6;
u += (str[1] & 0x3f);
if (u < 0x80)
if (u < 0x80) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
break;
case 3: /* U+0800 to U+FFFF, except surrogates */
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
u = (str[0] & 0x0f) << 12;
u += ((str[1] & 0x3f) << 6);
u += (str[2] & 0x3f);
if (u < 0x800 || is_utf16_surrogate(u))
if (u < 0x800 || is_utf16_surrogate(u)) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
break;
case 4: /* U+10000 to U+1FFFFF */
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80)
|| (str[3] & 0xc0) != 0x80) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
u = (str[0] & 0x0f) << 18;
u += ((str[1] & 0x3f) << 12);
u += ((str[2] & 0x3f) << 6);
u += (str[3] & 0x3f);
if (u < 0x10000)
if (u < 0x10000) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
break;
case 5: /* U+200000 to U+3FFFFFF */
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
u = (str[0] & 0x0f) << 24;
u += ((str[1] & 0x3f) << 18);
u += ((str[2] & 0x3f) << 12);
u += ((str[3] & 0x3f) << 6);
u += (str[4] & 0x3f);
if (u < 0x200000)
if (u < 0x200000) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
break;
case 6: /* U+4000000 to U+7FFFFFFF */
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
|| (str[5] & 0xc0) != 0x80)
|| (str[5] & 0xc0) != 0x80) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
u = (str[0] & 0x01) << 30;
u += ((str[1] & 0x3f) << 24);
u += ((str[2] & 0x3f) << 18);
u += ((str[3] & 0x3f) << 12);
u += ((str[4] & 0x3f) << 6);
u += (str[5] & 0x3f);
if (u < 0x4000000)
if (u < 0x4000000) {
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
break;
default:
INTERNAL("utf8char_len_tab out of range");
fprintf(stderr, "goto %s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
goto invalid_utf8;
}
*string = (char *)(str + length);
@ -820,7 +842,10 @@ cp2u_shared(const struct codepage_desc *from, unsigned char c)
{
unicode_val_T u = from->highhalf[c - 0x80];
if (u == 0xFFFF) u = UCS_REPLACEMENT_CHARACTER;
if (u == 0xFFFF) {
u = UCS_REPLACEMENT_CHARACTER;
fprintf(stderr, "%s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
}
return u;
}
@ -833,7 +858,10 @@ cp2u(int from, unsigned char c)
/* UTF-8 is a multibyte codepage and cannot be handled with
* this function. */
assert(!is_cp_ptr_utf8(&codepages[from]));
if_assert_failed return UCS_REPLACEMENT_CHARACTER;
if_assert_failed {
fprintf(stderr, "%s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
return UCS_REPLACEMENT_CHARACTER;
}
if (c < 0x80) return c;
else return cp2u_shared(&codepages[from], c);

View File

@ -26,8 +26,11 @@
#include <js/SourceText.h>
#include <js/Warnings.h>
#include <stdio.h>
#define SMJS_HOOKS_FILENAME "hooks.js"
JSContext *smjs_ctx;
JSObject *smjs_elinks_object;
struct session *smjs_ses;
@ -236,6 +239,7 @@ add_jschars_to_utf8_string(struct string *utf8,
unicode = join_utf16_surrogates(unicode,
utf16[pos++]);
} else {
fprintf(stderr, "%s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
unicode = UCS_REPLACEMENT_CHARACTER;
}
}

View File

@ -38,6 +38,7 @@
#include "viewer/text/textarea.h"
#include "viewer/timer.h"
#include <stdio.h>
/** Information used for communication between ELinks instances */
struct terminal_interlink {
@ -362,8 +363,10 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
/* UTF-8 allows neither overlong
* sequences nor surrogates. */
if (u < interlink->utf8.min
|| is_utf16_surrogate(u))
|| is_utf16_surrogate(u)) {
u = UCS_REPLACEMENT_CHARACTER;
fprintf(stderr, "%s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
}
term_send_ucs(term, u,
term->interlink->utf8.modifier);
}
@ -377,6 +380,7 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
* let this byte be handled below. */
interlink->utf8.len = 0;
fprintf(stderr, "%s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
term_send_ucs(term, UCS_REPLACEMENT_CHARACTER,
term->interlink->utf8.modifier);
}
@ -447,6 +451,7 @@ handle_interlink_event(struct terminal *term, struct interlink_event *ilev)
}
invalid_utf8_start_byte:
fprintf(stderr, "%s:%d:%s\n", __FILE__, __LINE__, __FUNCTION__);
term_send_ucs(term, UCS_REPLACEMENT_CHARACTER, modifier);
break;
}

1
test/chars.txt Normal file
View File

@ -0,0 +1 @@
U+00C0 À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï