mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Treat unpaired surrogate characters in UTF-16 as illegal characters. Bug 317216. Patch by me and Peter Annema <jag@tty.nl>, r=me, jag, jshin; sr=dveditz
This commit is contained in:
parent
c4850d0d07
commit
c1e02f694e
109
intl/uconv/tests/unit/test_bug317216.js
Normal file
109
intl/uconv/tests/unit/test_bug317216.js
Normal file
@ -0,0 +1,109 @@
|
||||
/* Test case for bug 317216
|
||||
*
|
||||
* Uses nsIConverterInputStream to decode UTF-16 text with valid surrogate
|
||||
* pairs and lone surrogate characters
|
||||
*
|
||||
* Sample text is: MATHEMATICAL BOLD SMALL G (U+1D420), i.e. the UTF-16 surrogate pair D835 DC20
|
||||
*
|
||||
* The test uses buffers of 4 different lengths to test end of buffer in mid-
|
||||
* UTF16 character and mid-surrogate pair
|
||||
*/
|
||||
|
||||
// Table of test cases. Each entry is a two-element array:
//   [0] the input as %-escaped big-endian UTF-16 bytes (two escapes per
//       16-bit code unit; the final entry deliberately ends on a half unit),
//   [1] the string the decoder is expected to produce.
// "%00%2D" is "-", and "%D8%35%DC%20" is the surrogate pair \uD835\uDC20.
const test = [
  // 0: Valid surrogate pair
  ["%00%2D%00%2D%D8%35%DC%20%00%2D%00%2D",
   // expected: surrogate pair
   "--\uD835\uDC20--"],

  // 1: Lone high surrogate
  ["%00%2D%00%2D%D8%35%00%2D%00%2D",
   // expected: one replacement char
   "--\uFFFD--"],

  // 2: Lone low surrogate
  ["%00%2D%00%2D%DC%20%00%2D%00%2D",
   // expected: one replacement char
   "--\uFFFD--"],

  // 3: Two high surrogates
  ["%00%2D%00%2D%D8%35%D8%35%00%2D%00%2D",
   // expected: two replacement chars
   "--\uFFFD\uFFFD--"],

  // 4: Two low surrogates
  ["%00%2D%00%2D%DC%20%DC%20%00%2D%00%2D",
   // expected: two replacement chars
   "--\uFFFD\uFFFD--"],

  // 5: Low surrogate followed by high surrogate
  ["%00%2D%00%2D%DC%20%D8%35%00%2D%00%2D",
   // expected: two replacement chars
   "--\uFFFD\uFFFD--"],

  // 6: Lone high surrogate followed by valid surrogate pair
  ["%00%2D%00%2D%D8%35%D8%35%DC%20%00%2D%00%2D",
   // expected: replacement char followed by surrogate pair
   "--\uFFFD\uD835\uDC20--"],

  // 7: Lone low surrogate followed by valid surrogate pair
  ["%00%2D%00%2D%DC%20%D8%35%DC%20%00%2D%00%2D",
   // expected: replacement char followed by surrogate pair
   "--\uFFFD\uD835\uDC20--"],

  // 8: Valid surrogate pair followed by lone high surrogate
  ["%00%2D%00%2D%D8%35%DC%20%D8%35%00%2D%00%2D",
   // expected: surrogate pair followed by replacement char
   "--\uD835\uDC20\uFFFD--"],

  // 9: Valid surrogate pair followed by lone low surrogate
  ["%00%2D%00%2D%D8%35%DC%20%DC%20%00%2D%00%2D",
   // expected: surrogate pair followed by replacement char
   "--\uD835\uDC20\uFFFD--"],

  // 10: Lone high surrogate at the end of the input
  // (the input must end exactly on the surrogate's second byte; a dangling
  // "%" would add an extra odd byte and turn this into a different case)
  ["%00%2D%00%2D%00%2D%00%2D%D8%35",
   // expected: nothing
   "----"],

  // 11: Half code unit at the end of the input
  ["%00%2D%00%2D%00%2D%00%2D%D8",
   // expected: nothing
   "----"]];
|
||||
|
||||
// Constructor wrapper for the network IO service component.
const IOService =
  Components.Constructor("@mozilla.org/network/io-service;1", "nsIIOService");

// Constructor wrapper for converter input streams; the third argument names
// the method ("init") that receives the arguments given to |new|.
const ConverterInputStream =
  Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                         "nsIConverterInputStream", "init");

// Single IO service instance shared by every test case in this file.
const ios = new IOService();
|
||||
|
||||
/**
 * Decode one sample through a converter input stream and compare the result
 * with the expected string.
 *
 * @param testText      %-escaped input bytes; appended verbatim to a
 *                      "data:text/plain" URI
 * @param expectedText  string the decoded output is compared against
 * @param bufferLength  buffer size handed to the converter input stream
 * @param charset       charset name, used both in the data: URI and as the
 *                      converter's charset
 * @throws Error if the converter stream is not an nsIUnicharLineInputStream
 */
function testCase(testText, expectedText, bufferLength, charset)
{
  var dataURI = "data:text/plain;charset=" + charset + "," + testText;

  var channel = ios.newChannel(dataURI, "", null);
  var testInputStream = channel.open();
  // 0xFFFD is the fourth init argument; the expected strings in the test
  // table use \uFFFD wherever an illegal sequence is replaced.
  var testConverter = new ConverterInputStream(testInputStream,
                                               charset,
                                               bufferLength,
                                               0xFFFD);

  // Throw a real Error (not a bare string) so the failure carries a stack.
  if (!(testConverter instanceof
        Components.interfaces.nsIUnicharLineInputStream))
    throw new Error("not line input stream");

  var outStr = "";
  var more;
  do {
    // read the line and check for eof
    var line = {};
    more = testConverter.readLine(line);
    outStr += line.value;
  } while (more);

  // escape the strings before comparing for better readability
  do_check_eq(escape(outStr), escape(expectedText));
}
|
||||
|
||||
// Byte-swap %-encoded utf-16
|
||||
function flip(str) {
  // Walk the string two %XX escapes at a time and emit each pair in the
  // opposite order; a trailing unpaired escape is left where it is.
  return str.replace(/(%..)(%..)/g, function (pair, first, second) {
    return second + first;
  });
}
|
||||
|
||||
/**
 * Run every entry of the |test| table through testCase(), once as UTF-16BE
 * and once (byte-swapped with flip()) as UTF-16LE, for each buffer length
 * from 4 to 7 so that buffer boundaries fall at different positions in the
 * input.
 */
function run_test()
{
  // Iterate over the whole table rather than a hard-coded count, so adding
  // or removing a case cannot silently skip entries or index past the end.
  for (var i = 0; i < test.length; ++i) {
    for (var bufferLength = 4; bufferLength < 8; ++bufferLength) {
      testCase(test[i][0], test[i][1], bufferLength, "UTF-16BE");
      testCase(flip(test[i][0]), test[i][1], bufferLength, "UTF-16LE");
    }
  }
}
|
@ -63,7 +63,21 @@ function testCase(withBOM, charset, charsetDec, decoder, bufferLength)
|
||||
outStr += line.value;
|
||||
} while (more);
|
||||
|
||||
do_check_eq(outStr, expected);
|
||||
if (outStr != expected) {
|
||||
dump("Failed with BOM = " + withBOM + "; charset = " + charset +
|
||||
"; charset declaration = " + charsetDec + "; decoder = " + decoder +
|
||||
"; bufferLength = " + bufferLength + "\n");
|
||||
if (outStr.length == expected.length) {
|
||||
for (i = 0; i < outStr.length; ++i) {
|
||||
if (outStr.charCodeAt(i) != expected.charCodeAt(i)) {
|
||||
dump(i + ": " + outStr.charCodeAt(i).toString(16) + " != " + expected.charCodeAt(i).toString(16) + "\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// escape the strings before comparing for better readability
|
||||
do_check_eq(escape(outStr), escape(expected));
|
||||
}
|
||||
|
||||
function run_test()
|
||||
|
@ -38,6 +38,7 @@
|
||||
#include "nsUCConstructors.h"
|
||||
#include "nsUCS2BEToUnicode.h"
|
||||
#include "nsUCvLatinDll.h"
|
||||
#include "nsCharTraits.h"
|
||||
#include <string.h>
|
||||
#include "prtypes.h"
|
||||
|
||||
@ -46,11 +47,12 @@
|
||||
#define STATE_FIRST_CALL 2
|
||||
#define STATE_FOUND_BOM 3
|
||||
|
||||
// XXX : illegal surrogate code points are just passed through !!
|
||||
static nsresult
|
||||
UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aData, const char * aSrc,
|
||||
UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aOddByte,
|
||||
PRUnichar& aOddHighSurrogate, const char * aSrc,
|
||||
PRInt32 * aSrcLength, PRUnichar * aDest,
|
||||
PRInt32 * aDestLength)
|
||||
PRInt32 * aDestLength,
|
||||
PRBool aSwapBytes)
|
||||
{
|
||||
const char* src = aSrc;
|
||||
const char* srcEnd = aSrc + *aSrcLength;
|
||||
@ -81,41 +83,80 @@ UTF16ConvertToUnicode(PRUint8& aState, PRUint8& aData, const char * aSrc,
|
||||
aState = STATE_NORMAL;
|
||||
}
|
||||
|
||||
PRInt32 copybytes;
|
||||
if (src == srcEnd) {
|
||||
*aDestLength = 0;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
if((STATE_HALF_CODE_POINT == aState) && (src < srcEnd))
|
||||
{
|
||||
if(dest >= destEnd)
|
||||
PRUnichar oddHighSurrogate = aOddHighSurrogate;
|
||||
|
||||
const char* srcEvenEnd;
|
||||
|
||||
PRUnichar u;
|
||||
if (aState == STATE_HALF_CODE_POINT) {
|
||||
// the 1st byte of a 16-bit code unit was stored in |aOddByte| in the
|
||||
// previous run while the 2nd byte has to come from |*src|.
|
||||
aState = STATE_NORMAL;
|
||||
#ifdef IS_BIG_ENDIAN
|
||||
u = (aOddByte << 8) | *src++; // safe, we know we have at least one byte.
|
||||
#else
|
||||
u = (*src++ << 8) | aOddByte; // safe, we know we have at least one byte.
|
||||
#endif
|
||||
srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
|
||||
goto have_codepoint;
|
||||
} else {
|
||||
srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
|
||||
}
|
||||
|
||||
while (src != srcEvenEnd) {
|
||||
if (dest == destEnd)
|
||||
goto error;
|
||||
|
||||
char tmpbuf[2];
|
||||
u = *(const PRUnichar*)src;
|
||||
src += 2;
|
||||
|
||||
// the 1st byte of a 16-bit code unit was stored in |aData| in the previous
|
||||
// run while the 2nd byte has to come from |*src|. We just have to copy
|
||||
// 'byte-by-byte'. Byte-swapping, if necessary, will be done in |Convert| of
|
||||
// LE and BE converters.
|
||||
PRUnichar * up = (PRUnichar*) &tmpbuf[0];
|
||||
tmpbuf[0]= aData;
|
||||
tmpbuf[1]= *src++;
|
||||
*dest++ = *up;
|
||||
have_codepoint:
|
||||
if (aSwapBytes)
|
||||
u = u << 8 | u >> 8;
|
||||
|
||||
if (!IS_SURROGATE(u)) {
|
||||
if (oddHighSurrogate) {
|
||||
*dest++ = UCS2_REPLACEMENT_CHAR;
|
||||
if (dest == destEnd)
|
||||
goto error;
|
||||
oddHighSurrogate = 0;
|
||||
}
|
||||
*dest++ = u;
|
||||
} else if (NS_IS_HIGH_SURROGATE(u)) {
|
||||
if (oddHighSurrogate) {
|
||||
*dest++ = UCS2_REPLACEMENT_CHAR;
|
||||
if (dest == destEnd)
|
||||
goto error;
|
||||
}
|
||||
oddHighSurrogate = u;
|
||||
}
|
||||
else /* if (NS_IS_LOW_SURROGATE(u)) */ {
|
||||
if (oddHighSurrogate) {
|
||||
if (dest == destEnd - 1) {
|
||||
*dest++ = UCS2_REPLACEMENT_CHAR;
|
||||
goto error;
|
||||
}
|
||||
*dest++ = oddHighSurrogate;
|
||||
*dest++ = u;
|
||||
oddHighSurrogate = 0;
|
||||
} else {
|
||||
*dest++ = UCS2_REPLACEMENT_CHAR;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
copybytes = (destEnd-dest)*2;
|
||||
// if |srcEnd-src| is odd, we copy one fewer bytes.
|
||||
if(copybytes > (~1 & (srcEnd - src)))
|
||||
copybytes = ~1 & (srcEnd - src);
|
||||
memcpy(dest,src,copybytes);
|
||||
src +=copybytes;
|
||||
dest +=(copybytes/2);
|
||||
if(srcEnd == src) { // srcLength was even.
|
||||
aState = STATE_NORMAL;
|
||||
} else if(1 == (srcEnd - src) ) { // srcLength was odd.
|
||||
aState = STATE_HALF_CODE_POINT;
|
||||
aData = *src++; // store the lead byte of a 16-bit unit for the next run.
|
||||
} else {
|
||||
goto error;
|
||||
if (src != srcEnd) {
|
||||
// store the lead byte of a 16-bit unit for the next run.
|
||||
aOddByte = *src++;
|
||||
aState = STATE_HALF_CODE_POINT;
|
||||
}
|
||||
|
||||
|
||||
aOddHighSurrogate = oddHighSurrogate;
|
||||
|
||||
*aDestLength = dest - aDest;
|
||||
*aSrcLength = src - aSrc;
|
||||
return NS_OK;
|
||||
@ -126,18 +167,12 @@ error:
|
||||
return NS_OK_UDEC_MOREOUTPUT;
|
||||
}
|
||||
|
||||
static void
|
||||
SwapBytes(PRUnichar *aDest, PRInt32 aLen)
|
||||
{
|
||||
for (PRUnichar *p = aDest; aLen > 0; ++p, --aLen)
|
||||
*p = ((*p & 0xff) << 8) | ((*p >> 8) & 0xff);
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUTF16ToUnicodeBase::Reset()
|
||||
{
|
||||
mState = STATE_FIRST_CALL;
|
||||
mData = 0;
|
||||
mOddByte = 0;
|
||||
mOddHighSurrogate = 0;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
@ -145,8 +180,10 @@ NS_IMETHODIMP
|
||||
nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, PRInt32 aSrcLength,
|
||||
PRInt32 * aDestLength)
|
||||
{
|
||||
// the left-over byte of the previous run has to be taken into account.
|
||||
*aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT == mState) ? 1 : 0)) / 2;
|
||||
// the left-over data of the previous run have to be taken into account.
|
||||
*aDestLength = (aSrcLength +
|
||||
((STATE_HALF_CODE_POINT == mState) ? 1 : 0)) / 2 +
|
||||
((mOddHighSurrogate != 0) ? 1 : 0);
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
@ -174,12 +211,14 @@ nsUTF16BEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
|
||||
}
|
||||
#endif
|
||||
|
||||
nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength,
|
||||
aDest, aDestLength);
|
||||
|
||||
nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
|
||||
aSrc, aSrcLength, aDest, aDestLength,
|
||||
#ifdef IS_LITTLE_ENDIAN
|
||||
SwapBytes(aDest, *aDestLength);
|
||||
PR_TRUE
|
||||
#else
|
||||
PR_FALSE
|
||||
#endif
|
||||
);
|
||||
return rv;
|
||||
}
|
||||
|
||||
@ -206,12 +245,14 @@ nsUTF16LEToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
|
||||
}
|
||||
#endif
|
||||
|
||||
nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength, aDest,
|
||||
aDestLength);
|
||||
|
||||
nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
|
||||
aSrc, aSrcLength, aDest, aDestLength,
|
||||
#ifdef IS_BIG_ENDIAN
|
||||
SwapBytes(aDest, *aDestLength);
|
||||
PR_TRUE
|
||||
#else
|
||||
PR_FALSE
|
||||
#endif
|
||||
);
|
||||
return rv;
|
||||
}
|
||||
|
||||
@ -262,17 +303,16 @@ nsUTF16ToUnicode::Convert(const char * aSrc, PRInt32 * aSrcLength,
|
||||
}
|
||||
}
|
||||
|
||||
nsresult rv = UTF16ConvertToUnicode(mState, mData, aSrc, aSrcLength, aDest,
|
||||
aDestLength);
|
||||
|
||||
nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
|
||||
aSrc, aSrcLength, aDest, aDestLength,
|
||||
#ifdef IS_BIG_ENDIAN
|
||||
if (mEndian == kLittleEndian)
|
||||
(mEndian == kLittleEndian)
|
||||
#elif defined(IS_LITTLE_ENDIAN)
|
||||
if (mEndian == kBigEndian)
|
||||
(mEndian == kBigEndian)
|
||||
#else
|
||||
#error "Unknown endianness"
|
||||
#endif
|
||||
SwapBytes(aDest, *aDestLength);
|
||||
);
|
||||
|
||||
// If BOM is not found and we're to return NS_OK, signal that BOM
|
||||
// is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
|
||||
|
@ -58,7 +58,10 @@ public:
|
||||
|
||||
protected:
|
||||
PRUint8 mState;
|
||||
PRUint8 mData;
|
||||
// to store an odd byte left over between runs
|
||||
PRUint8 mOddByte;
|
||||
// to store an odd high surrogate left over between runs
|
||||
PRUnichar mOddHighSurrogate;
|
||||
};
|
||||
|
||||
// UTF-16 big endian
|
||||
|
Loading…
Reference in New Issue
Block a user