1
0
mirror of https://github.com/rkd77/elinks.git synced 2025-02-02 15:09:23 -05:00

Define and use macros for handling UTF-16 surrogates.

This commit is contained in:
Kalle Olavi Niemitalo 2006-08-24 23:30:41 +03:00 committed by Kalle Olavi Niemitalo
parent b9d66bd9bd
commit 38fe5b72f7
3 changed files with 26 additions and 16 deletions

View File

@ -111,9 +111,9 @@ append_unicode_to_SEE_string(struct SEE_interpreter *interp,
/* TODO: Should this reject code points in the
* surrogate range? */
SEE_string_addch(str, u);
} else if (u <= 0x10FFFF) {
SEE_string_addch(str, 0xD800 + ((u - 0x10000) >> 10));
SEE_string_addch(str, 0xDC00 + (u & 0x3FF));
} else if (needs_utf16_surrogates(u)) {
SEE_string_addch(str, get_utf16_high_surrogate(u));
SEE_string_addch(str, get_utf16_low_surrogate(u));
} else {
/* str->interpreter exists but is not documented, so don't
* use it; use a separate @interp parameter instead.
@ -131,14 +131,12 @@ SEE_string_to_unicode(struct SEE_interpreter *interp, struct SEE_string *S)
if (S->length < 1) {
SEE_error_throw(interp, interp->Error,
"String is empty");
} else if (S->data[0] < 0xD800 || S->data[0] > 0xDFFF) {
} else if (!is_utf16_surrogate(S->data[0])) {
return S->data[0];
} else if (S->length >= 2
&& S->data[0] >= 0xD800 && S->data[0] <= 0xDBFF
&& S->data[1] >= 0xDC00 && S->data[1] <= 0xDFFF) {
return 0x10000
+ ((S->data[0] & 0x3FF) << 10)
+ (S->data[1] & 0x3FF);
&& is_utf16_high_surrogate(S->data[0])
&& is_utf16_low_surrogate(S->data[1])) {
return join_utf16_surrogates(S->data[0], S->data[1]);
} else {
SEE_error_throw(interp, interp->Error,
"Invalid UTF-16 sequence");

View File

@ -998,9 +998,9 @@ unicode_to_jsstring(JSContext *ctx, unicode_val_T u)
* surrogate range? */
buf[0] = u;
return JS_NewUCStringCopyN(ctx, buf, 1);
} else if (u <= 0x10FFFF) {
buf[0] = 0xD800 + ((u - 0x10000) >> 10);
buf[1] = 0xDC00 + (u & 0x3FF);
} else if (needs_utf16_surrogates(u)) {
buf[0] = get_utf16_high_surrogate(u);
buf[1] = get_utf16_low_surrogate(u);
return JS_NewUCStringCopyN(ctx, buf, 2);
} else {
return NULL;
@ -1024,12 +1024,12 @@ jsval_to_accesskey(JSContext *ctx, jsval *vp)
/* This implementation ignores extra characters in the string. */
if (len < 1)
return 0; /* which means no access key */
if (chr[0] < 0xD800 || chr[0] > 0xDFFF)
if (!is_utf16_surrogate(chr[0]))
return chr[0];
if (len >= 2
&& chr[0] >= 0xD800 && chr[0] <= 0xDBFF
&& chr[1] >= 0xDC00 && chr[1] <= 0xDFFF)
return 0x10000 + ((chr[0] & 0x3FF) << 10) + (chr[1] & 0x3FF);
&& is_utf16_high_surrogate(chr[0])
&& is_utf16_low_surrogate(chr[1]))
return join_utf16_surrogates(chr[0], chr[1]);
JS_ReportError(ctx, "Invalid UTF-16 sequence");
return UCS_NO_CHAR; /* which the caller will reject */
}

View File

@ -88,4 +88,16 @@ unsigned char *u2cp_(unicode_val_T, int, int no_nbsp_hack);
void init_charsets_lookup(void);
void free_charsets_lookup(void);
/* UTF-16 encodes each Unicode character U+0000...U+FFFF as a single
 * 16-bit code unit, and each character U+10000...U+10FFFF as a pair
 * of two code units: a high surrogate followed by a low surrogate.
 * The range U+D800...U+DFFF is reserved for these surrogates.
 *
 * Every macro below mentions each of its arguments exactly once in
 * its expansion.  None of them validates its input; the is_* and
 * needs_* predicates exist so that callers can do that first. */

/* Nonzero if u is any surrogate code unit.  The mask keeps bits
 * 11...31, so the test holds exactly for 0xD800...0xDFFF (assuming
 * u is no wider than 32 bits). */
#define is_utf16_surrogate(u) (((u) & 0xFFFFF800) == 0xD800)
/* Nonzero if u is a high (leading) surrogate.  The mask keeps bits
 * 10...31, so the test holds exactly for 0xD800...0xDBFF. */
#define is_utf16_high_surrogate(u) (((u) & 0xFFFFFC00) == 0xD800)
/* Nonzero if u is a low (trailing) surrogate, 0xDC00...0xDFFF. */
#define is_utf16_low_surrogate(u) (((u) & 0xFFFFFC00) == 0xDC00)
/* Combine a high and a low surrogate into the code point they
 * encode: the offset of high from 0xD800 supplies the upper 10 bits,
 * the offset of low from 0xDC00 supplies the lower 10 bits, and
 * 0x10000 is added to the result.  The L suffix makes the
 * subtraction and the shift happen in at least type long.
 * NOTE(review): presumably so that the shift cannot overflow where
 * int is only 16 bits wide -- confirm. */
#define join_utf16_surrogates(high,low) (0x10000 + (((high) - 0xD800L) << 10) + ((low) - 0xDC00))
/* Nonzero if u is in U+10000...U+10FFFF, i.e. must be encoded as a
 * surrogate pair.  After the conversion to uint32_t, a u below
 * 0x10000 has wrapped around to a large value, so the single
 * comparison checks both ends of the range.  uint32_t must be
 * declared wherever this macro is used. */
#define needs_utf16_surrogates(u) ((uint32_t) ((u) - 0x10000) < 0x100000)
/* High surrogate for u: 0xD800 plus the bits above bit 9 of
 * (u - 0x10000).  The result is a valid high surrogate only when
 * needs_utf16_surrogates(u) holds. */
#define get_utf16_high_surrogate(u) (0xD800 + (((u) - 0x10000) >> 10))
/* Low surrogate for u: 0xDC00 plus the low 10 bits of u.
 * Subtracting 0x10000 first is unnecessary because it does not
 * change those bits. */
#define get_utf16_low_surrogate(u) (0xDC00 + ((u) & 0x3FF))
#endif