1
0
mirror of https://github.com/rkd77/elinks.git synced 2025-01-03 14:57:44 -05:00

utf8_to_unicode: Reject invalid sequences, such as overlong encodings.

Each byte of an invalid sequence is converted to UCS_REPLACEMENT_CHARACTER.
This may not be the optimal solution, but at least it ought to be safe.
Also raise an internal error if the value read from utf8char_len_tab[] is
out of range.

Note that ELinks is still using the RFC 2279 definition of UTF-8 and
thus allows characters up to 0x7FFFFFFF, even though RFC 3629 has
changed the maximum to 0x10FFFF.
This commit is contained in:
Kalle Olavi Niemitalo 2006-12-19 09:31:55 +02:00 committed by Kalle Olavi Niemitalo
parent 47f7ba24c6
commit 114ce8c833

View File

@ -640,39 +640,70 @@ utf8_to_unicode(unsigned char **string, unsigned char *end)
switch (length) { switch (length) {
case 1: case 1:
if (str[0] >= 0x80) {
invalid_utf8:
++*string;
return UCS_REPLACEMENT_CHARACTER;
}
u = str[0]; u = str[0];
break; break;
case 2: case 2:
if ((str[1] & 0xc0) != 0x80)
goto invalid_utf8;
u = (str[0] & 0x1f) << 6; u = (str[0] & 0x1f) << 6;
u += (str[1] & 0x3f); u += (str[1] & 0x3f);
if (u < 0x80)
goto invalid_utf8;
break; break;
case 3: case 3:
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80)
goto invalid_utf8;
u = (str[0] & 0x0f) << 12; u = (str[0] & 0x0f) << 12;
u += ((str[1] & 0x3f) << 6); u += ((str[1] & 0x3f) << 6);
u += (str[2] & 0x3f); u += (str[2] & 0x3f);
if (u < 0x800)
goto invalid_utf8;
break; break;
case 4: case 4:
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80)
goto invalid_utf8;
u = (str[0] & 0x0f) << 18; u = (str[0] & 0x0f) << 18;
u += ((str[1] & 0x3f) << 12); u += ((str[1] & 0x3f) << 12);
u += ((str[2] & 0x3f) << 6); u += ((str[2] & 0x3f) << 6);
u += (str[3] & 0x3f); u += (str[3] & 0x3f);
if (u < 0x10000)
goto invalid_utf8;
break; break;
case 5: case 5:
if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80)
goto invalid_utf8;
u = (str[0] & 0x0f) << 24; u = (str[0] & 0x0f) << 24;
u += ((str[1] & 0x3f) << 18); u += ((str[1] & 0x3f) << 18);
u += ((str[2] & 0x3f) << 12); u += ((str[2] & 0x3f) << 12);
u += ((str[3] & 0x3f) << 6); u += ((str[3] & 0x3f) << 6);
u += (str[4] & 0x3f); u += (str[4] & 0x3f);
if (u < 0x200000)
goto invalid_utf8;
break; break;
case 6: case 6:
default: if ((str[1] & 0xc0) != 0x80 || (str[2] & 0xc0) != 0x80
|| (str[3] & 0xc0) != 0x80 || (str[4] & 0xc0) != 0x80
|| (str[5] & 0xc0) != 0x80)
goto invalid_utf8;
u = (str[0] & 0x01) << 30; u = (str[0] & 0x01) << 30;
u += ((str[1] & 0x3f) << 24); u += ((str[1] & 0x3f) << 24);
u += ((str[2] & 0x3f) << 18); u += ((str[2] & 0x3f) << 18);
u += ((str[3] & 0x3f) << 12); u += ((str[3] & 0x3f) << 12);
u += ((str[4] & 0x3f) << 6); u += ((str[4] & 0x3f) << 6);
u += (str[5] & 0x3f); u += (str[5] & 0x3f);
if (u < 0x4000000)
goto invalid_utf8;
break; break;
default:
INTERNAL("utf8char_len_tab out of range");
goto invalid_utf8;
} }
*string = str + length; *string = str + length;
return u; return u;