Bug 828782 - 0001. Add inflate function for replacing invalid byte while decoding UTF-8 character. r=terrence

This commit is contained in:
Chuck Lee 2013-02-20 11:20:54 +08:00
parent e4f8bd3055
commit b5c62bbe77
2 changed files with 102 additions and 0 deletions

View File

@ -100,6 +100,7 @@ static JSBool
str_encodeURI_Component(JSContext *cx, unsigned argc, Value *vp);
static const uint32_t INVALID_UTF8 = UINT32_MAX;
static const uint32_t REPLACE_UTF8 = 0xFFFD;
static uint32_t
Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length);
@ -3932,6 +3933,97 @@ js::InflateStringToBuffer(JSContext *maybecx, const char *src, size_t srclen,
return JS_TRUE;
}
bool
js::InflateUTF8StringToBufferReplaceInvalid(JSContext *cx, const char *src,
size_t srclen, jschar *dst,
size_t *dstlenp)
{
mozilla::Maybe<AutoSuppressGC> suppress;
if (cx)
suppress.construct(cx);
size_t dstlen, origDstlen, offset, j, n;
uint32_t v;
dstlen = dst ? *dstlenp : (size_t) -1;
origDstlen = dstlen;
offset = 0;
while (srclen) {
v = (uint8_t) *src;
n = 1;
if (v & 0x80) {
while (v & (0x80 >> n))
n++;
if (n > srclen || n == 1 || n > 4) {
/* Incorrect length for decoding. */
v = REPLACE_UTF8;
n = 1;
goto appendCharacter;
}
/*
* Check for invalid second byte.
*
* @From Unicode Standard v6.2, Table 3-7 Well-Formed UTF-8 Byte Sequences.
*/
if ((v == 0xE0 && ((uint8_t)src[1] & 0xE0) != 0xA0) || // E0 A0~BF
(v == 0xED && ((uint8_t)src[1] & 0xE0) != 0x80) || // ED 80~9F
(v == 0xF0 && ((uint8_t)src[1] & 0xF0) == 0x80) || // F0 90~BF
(v == 0xF4 && ((uint8_t)src[1] & 0xF0) != 0x80)) // F4 80~8F
{
v = REPLACE_UTF8;
n = 1;
goto appendCharacter;
}
for (j = 1; j < n; j++) {
if ((src[j] & 0xC0) != 0x80) {
/* Invalid sub-sequence. */
v = REPLACE_UTF8;
n = j;
goto appendCharacter;
}
}
v = Utf8ToOneUcs4Char((uint8_t *)src, n);
if (v >= 0x10000) {
v -= 0x10000;
if (v > 0xFFFFF || dstlen < 2) {
/* Incorrect code point. */
v = REPLACE_UTF8;
n = 1;
goto appendCharacter;
}
if (dst) {
*dst++ = (jschar)((v >> 10) + 0xD800);
v = (jschar)((v & 0x3FF) + 0xDC00);
}
dstlen--;
}
}
appendCharacter:
if (!dstlen)
goto bufferTooSmall;
if (dst)
*dst++ = (jschar) v;
dstlen--;
offset += n;
src += n;
srclen -= n;
}
*dstlenp = (origDstlen - dstlen);
return true;
bufferTooSmall:
*dstlenp = (origDstlen - dstlen);
if (cx) {
JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL,
JSMSG_BUFFER_TOO_SMALL);
}
return false;
}
bool
js::InflateUTF8StringToBuffer(JSContext *cx, const char *src, size_t srclen,
jschar *dst, size_t *dstlenp)

View File

@ -247,6 +247,16 @@ extern bool
InflateUTF8StringToBuffer(JSContext *cx, const char *bytes, size_t length,
jschar *chars, size_t *charsLength);
/*
* The same as InflateUTF8StringToBuffer(), except that any malformed UTF-8
* characters will be replaced by \uFFFD. No exception will be thrown for
* malformed UTF-8 input.
*/
extern bool
InflateUTF8StringToBufferReplaceInvalid(JSContext *cx, const char *bytes,
size_t length, jschar *chars,
size_t *charsLength);
/*
* Deflate JS chars to bytes into a buffer. 'bytes' must be large enough for
* 'length chars. The buffer is NOT null-terminated. The destination length