From b5c62bbe77751fbc81db8f0f69946038f5ddf024 Mon Sep 17 00:00:00 2001 From: Chuck Lee Date: Wed, 20 Feb 2013 11:20:54 +0800 Subject: [PATCH] Bug 828782 - 0001. Add inflate function for replacing invalid byte while decoding UTF-8 character. r=terrence --- js/src/jsstr.cpp | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ js/src/jsstr.h | 10 ++++++ 2 files changed, 102 insertions(+) diff --git a/js/src/jsstr.cpp b/js/src/jsstr.cpp index b0e4c14659f..ae78de0aec2 100644 --- a/js/src/jsstr.cpp +++ b/js/src/jsstr.cpp @@ -100,6 +100,7 @@ static JSBool str_encodeURI_Component(JSContext *cx, unsigned argc, Value *vp); static const uint32_t INVALID_UTF8 = UINT32_MAX; +static const uint32_t REPLACE_UTF8 = 0xFFFD; static uint32_t Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length); @@ -3932,6 +3933,97 @@ js::InflateStringToBuffer(JSContext *maybecx, const char *src, size_t srclen, return JS_TRUE; } +bool +js::InflateUTF8StringToBufferReplaceInvalid(JSContext *cx, const char *src, + size_t srclen, jschar *dst, + size_t *dstlenp) +{ + mozilla::Maybe suppress; + if (cx) + suppress.construct(cx); + + size_t dstlen, origDstlen, offset, j, n; + uint32_t v; + + dstlen = dst ? *dstlenp : (size_t) -1; + origDstlen = dstlen; + offset = 0; + + while (srclen) { + v = (uint8_t) *src; + n = 1; + if (v & 0x80) { + while (v & (0x80 >> n)) + n++; + if (n > srclen || n == 1 || n > 4) { + /* Incorrect length for decoding. */ + v = REPLACE_UTF8; + n = 1; + goto appendCharacter; + } + + /* + * Check for invalid second byte. + * + * @From Unicode Standard v6.2, Table 3-7 Well-Formed UTF-8 Byte Sequences. + */ + if ((v == 0xE0 && ((uint8_t)src[1] & 0xE0) != 0xA0) || // E0 A0~BF + (v == 0xED && ((uint8_t)src[1] & 0xE0) != 0x80) || // ED 80~9F + (v == 0xF0 && ((uint8_t)src[1] & 0xF0) == 0x80) || // F0 90~BF + (v == 0xF4 && ((uint8_t)src[1] & 0xF0) != 0x80)) // F4 80~8F + { + v = REPLACE_UTF8; + n = 1; + goto appendCharacter; + } + + for (j = 1; j < n; j++) { + if ((src[j] & 0xC0) != 0x80) { + /* Invalid sub-sequence. */ + v = REPLACE_UTF8; + n = j; + goto appendCharacter; + } + } + + v = Utf8ToOneUcs4Char((uint8_t *)src, n); + if (v >= 0x10000) { + v -= 0x10000; + if (v > 0xFFFFF || dstlen < 2) { + /* Incorrect code point. */ + v = REPLACE_UTF8; + n = 1; + goto appendCharacter; + } + if (dst) { + *dst++ = (jschar)((v >> 10) + 0xD800); + v = (jschar)((v & 0x3FF) + 0xDC00); + } + dstlen--; + } + } +appendCharacter: + if (!dstlen) + goto bufferTooSmall; + if (dst) + *dst++ = (jschar) v; + dstlen--; + offset += n; + src += n; + srclen -= n; + } + *dstlenp = (origDstlen - dstlen); + return true; + +bufferTooSmall: + *dstlenp = (origDstlen - dstlen); + if (cx) { + JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL, + JSMSG_BUFFER_TOO_SMALL); + } + return false; +} + bool js::InflateUTF8StringToBuffer(JSContext *cx, const char *src, size_t srclen, jschar *dst, size_t *dstlenp) diff --git a/js/src/jsstr.h b/js/src/jsstr.h index 068570aa447..e6ecc0b813e 100644 --- a/js/src/jsstr.h +++ b/js/src/jsstr.h @@ -247,6 +247,16 @@ extern bool InflateUTF8StringToBuffer(JSContext *cx, const char *bytes, size_t length, jschar *chars, size_t *charsLength); +/* + * The same as InflateUTF8StringToBuffer(), except that any malformed UTF-8 + * characters will be replaced by \uFFFD. No exception will be thrown for + * malformed UTF-8 input. + */ +extern bool +InflateUTF8StringToBufferReplaceInvalid(JSContext *cx, const char *bytes, + size_t length, jschar *chars, + size_t *charsLength); + /* * Deflate JS chars to bytes into a buffer. 'bytes' must be large enough for * 'length chars. The buffer is NOT null-terminated. The destination length