diff --git a/src/ecmascript/see/input.c b/src/ecmascript/see/input.c index 5d3b91f53..cba243551 100644 --- a/src/ecmascript/see/input.c +++ b/src/ecmascript/see/input.c @@ -111,9 +111,9 @@ append_unicode_to_SEE_string(struct SEE_interpreter *interp, /* TODO: Should this reject code points in the * surrogate range? */ SEE_string_addch(str, u); - } else if (u <= 0x10FFFF) { - SEE_string_addch(str, 0xD800 + ((u - 0x10000) >> 10)); - SEE_string_addch(str, 0xDC00 + (u & 0x3FF)); + } else if (needs_utf16_surrogates(u)) { + SEE_string_addch(str, get_utf16_high_surrogate(u)); + SEE_string_addch(str, get_utf16_low_surrogate(u)); } else { /* str->interpreter exists but is not documented, so don't * use it; use a separate @interp parameter instead. @@ -131,14 +131,12 @@ SEE_string_to_unicode(struct SEE_interpreter *interp, struct SEE_string *S) if (S->length < 1) { SEE_error_throw(interp, interp->Error, "String is empty"); - } else if (S->data[0] < 0xD800 || S->data[0] > 0xDFFF) { + } else if (!is_utf16_surrogate(S->data[0])) { return S->data[0]; } else if (S->length >= 2 - && S->data[0] >= 0xD800 && S->data[0] <= 0xDBFF - && S->data[1] >= 0xDC00 && S->data[1] <= 0xDFFF) { - return 0x10000 - + ((S->data[0] & 0x3FF) << 10) - + (S->data[1] & 0x3FF); + && is_utf16_high_surrogate(S->data[0]) + && is_utf16_low_surrogate(S->data[1])) { + return join_utf16_surrogates(S->data[0], S->data[1]); } else { SEE_error_throw(interp, interp->Error, "Invalid UTF-16 sequence"); diff --git a/src/ecmascript/spidermonkey/form.c b/src/ecmascript/spidermonkey/form.c index 643503a07..6da9b49c2 100644 --- a/src/ecmascript/spidermonkey/form.c +++ b/src/ecmascript/spidermonkey/form.c @@ -998,9 +998,9 @@ unicode_to_jsstring(JSContext *ctx, unicode_val_T u) * surrogate range? 
*/ buf[0] = u; return JS_NewUCStringCopyN(ctx, buf, 1); - } else if (u <= 0x10FFFF) { - buf[0] = 0xD800 + ((u - 0x10000) >> 10); - buf[1] = 0xDC00 + (u & 0x3FF); + } else if (needs_utf16_surrogates(u)) { + buf[0] = get_utf16_high_surrogate(u); + buf[1] = get_utf16_low_surrogate(u); return JS_NewUCStringCopyN(ctx, buf, 2); } else { return NULL; @@ -1024,12 +1024,12 @@ jsval_to_accesskey(JSContext *ctx, jsval *vp) /* This implementation ignores extra characters in the string. */ if (len < 1) return 0; /* which means no access key */ - if (chr[0] < 0xD800 || chr[0] > 0xDFFF) + if (!is_utf16_surrogate(chr[0])) return chr[0]; if (len >= 2 - && chr[0] >= 0xD800 && chr[0] <= 0xDBFF - && chr[1] >= 0xDC00 && chr[1] <= 0xDFFF) - return 0x10000 + ((chr[0] & 0x3FF) << 10) + (chr[1] & 0x3FF); + && is_utf16_high_surrogate(chr[0]) + && is_utf16_low_surrogate(chr[1])) + return join_utf16_surrogates(chr[0], chr[1]); JS_ReportError(ctx, "Invalid UTF-16 sequence"); return UCS_NO_CHAR; /* which the caller will reject */ } diff --git a/src/intl/charsets.h b/src/intl/charsets.h index 095fe48e3..c0452aff7 100644 --- a/src/intl/charsets.h +++ b/src/intl/charsets.h @@ -88,4 +88,16 @@ unsigned char *u2cp_(unicode_val_T, int, int no_nbsp_hack); void init_charsets_lookup(void); void free_charsets_lookup(void); +/* UTF-16 encodes each Unicode character U+0000...U+FFFF as a single + * 16-bit code unit, and each character U+10000...U+10FFFF as a pair + * of two code units: a high surrogate followed by a low surrogate. + * The range U+D800...U+DFFF is reserved for these surrogates. 
*/ +#define is_utf16_surrogate(u) (((u) & 0xFFFFF800) == 0xD800) /* true when every bit above the low 11 matches 0xD800, i.e. u is in 0xD800...0xDFFF; u is evaluated once */ +#define is_utf16_high_surrogate(u) (((u) & 0xFFFFFC00) == 0xD800) /* true when every bit above the low 10 matches 0xD800, i.e. u is in 0xD800...0xDBFF */ +#define is_utf16_low_surrogate(u) (((u) & 0xFFFFFC00) == 0xDC00) /* true when every bit above the low 10 matches 0xDC00, i.e. u is in 0xDC00...0xDFFF */ +#define join_utf16_surrogates(high,low) (0x10000 + (((high) - 0xD800L) << 10) + ((low) - 0xDC00)) /* rebuilds the code point from a surrogate pair; performs no validation, so check both halves with the predicates above first */ +#define needs_utf16_surrogates(u) ((uint32_t) ((u) - 0x10000) < 0x100000) /* true for 0x10000...0x10FFFF; after the uint32_t cast, values below 0x10000 wrap to a large number and fail this single comparison */ +#define get_utf16_high_surrogate(u) (0xD800 + (((u) - 0x10000) >> 10)) /* first code unit of the pair; no range check here, meaningful only when needs_utf16_surrogates(u) holds */ +#define get_utf16_low_surrogate(u) (0xDC00 + ((u) & 0x3FF)) /* second code unit of the pair: the low 10 bits of u offset by 0xDC00, so the result is always in 0xDC00...0xDFFF */ + #endif