Various fixes to multi-byte Unicode decoders. Bug 715319, r=emk

This commit is contained in:
Simon Montagu 2012-03-16 13:41:41 -07:00
parent 29722aef5f
commit 068e9f59a2
5 changed files with 74 additions and 45 deletions

View File

@ -294,7 +294,7 @@ cp932=Shift_JIS
# #
# Aliases for EUC_JP # Aliases for EUC_JP
# #
cseucjpkdfmtjapanese=EUC-JP cseucpkdfmtjapanese=EUC-JP
x-euc-jp=EUC-JP x-euc-jp=EUC-JP
# #
# Aliases for ISO-2022-JP # Aliases for ISO-2022-JP

View File

@ -87,6 +87,7 @@ x-mac-farsi.isXSSVulnerable = true
x-mac-hebrew.isXSSVulnerable = true x-mac-hebrew.isXSSVulnerable = true
x-imap4-modified-utf7.isXSSVulnerable = true x-imap4-modified-utf7.isXSSVulnerable = true
utf-7.isXSSVulnerable = true utf-7.isXSSVulnerable = true
t.61-8bit.isXSSVulnerable = true
t.61-8bit.notForOutgoing = true t.61-8bit.notForOutgoing = true
utf-7.notForOutgoing = true utf-7.notForOutgoing = true

View File

@ -217,11 +217,16 @@ NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
} }
} }
} }
aSrc += 4;
i += 3;
} else { } else {
*aDest = UCS2_NO_MAPPING; *aDest = UCS2_NO_MAPPING;
// If the third and fourth bytes are not in the legal ranges for
// a four-byte sequence, resynchronize on the second byte
// (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
// 0x30-0x39)
aSrc++;
} }
aSrc += 4;
i+=3;
} }
else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 ) else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 )
{ {

View File

@ -46,9 +46,14 @@
* http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html * http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
* and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html. * and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
* *
* In an effort to match the similar extended capability of Microsoft Internet Explorer * Earlier versions of the converter said:
* 5.0. We also accept the 8-bit GB encoded chars mixed in a HZ string. * "In an effort to match the similar extended capability of Microsoft
* But this should not be a recommendedd practice for HTML authors. * Internet Explorer 5.0. We also accept the 8-bit GB encoded chars
* mixed in a HZ string.
* But this should not be a recommended practice for HTML authors."
* However, testing in current versions of IE shows that it only accepts
* 8-bit characters when the converter is in GB state, and when in ASCII
* state each single 8-bit character is converted to U+FFFD
* *
* The priority of converting are as follows: first convert 8-bit GB code; then, * The priority of converting are as follows: first convert 8-bit GB code; then,
* consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current * consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
@ -74,7 +79,6 @@
#define HZLEAD1 '~' #define HZLEAD1 '~'
#define HZLEAD2 '{' #define HZLEAD2 '{'
#define HZLEAD3 '}' #define HZLEAD3 '}'
#define HZLEAD4 '\n'
#define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG)) #define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
#define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG)) #define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))
@ -107,23 +111,28 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
char srcByte = *aSrc++; char srcByte = *aSrc++;
(*aSrcLength)++; (*aSrcLength)++;
if (!HZ_ODD_BYTE_STATE) { if (!HZ_ODD_BYTE_STATE) {
if (srcByte & 0x80 || srcByte == HZLEAD1 || HZ_ENCODING_STATE == HZ_STATE_GB) { if (srcByte == HZLEAD1 ||
(HZ_ENCODING_STATE == HZ_STATE_GB &&
(UINT8_IN_RANGE(0x21, srcByte, 0x7E) ||
UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) {
oddByte = srcByte; oddByte = srcByte;
mHZState |= HZ_STATE_ODD_BYTE_FLAG; mHZState |= HZ_STATE_ODD_BYTE_FLAG;
} else { } else {
*aDest++ = CAST_CHAR_TO_UNICHAR(srcByte); *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING :
CAST_CHAR_TO_UNICHAR(srcByte);
iDestlen++; iDestlen++;
} }
} else { } else {
if (oddByte & 0x80) { // if it is a 8-bit byte if (oddByte & 0x80) {
if (UINT8_IN_RANGE(0x81, oddByte, 0xFE) && // Accept legal 8-bit GB 2312-80 sequences in GB mode only
UINT8_IN_RANGE(0x40, srcByte, 0xFE)) { NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB,
// The source is a 8-bit GBCode "Invalid lead byte in ASCII mode");
*aDest++ = mUtil.GBKCharToUnicode(oddByte, srcByte); *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
} else { UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ?
*aDest++ = UCS2_NO_MAPPING; mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING;
} mRunLength++;
iDestlen++; iDestlen++;
// otherwise, it is a 7-bit byte // otherwise, it is a 7-bit byte
// The source will be an ASCII or a 7-bit HZ code depending on oddByte // The source will be an ASCII or a 7-bit HZ code depending on oddByte
@ -132,14 +141,14 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
case HZLEAD2: case HZLEAD2:
// we got a '~{' // we got a '~{'
// we are switching to HZ state // we are switching to HZ state
mHZState = HZ_STATE_GB | HZ_ODD_BYTE_STATE; mHZState = HZ_STATE_GB;
mRunLength = 0; mRunLength = 0;
break; break;
case HZLEAD3: case HZLEAD3:
// we got a '~}' // we got a '~}'
// we are switching to ASCII state // we are switching to ASCII state
mHZState = HZ_STATE_ASCII | HZ_ODD_BYTE_STATE; mHZState = HZ_STATE_ASCII;
if (mRunLength == 0) { if (mRunLength == 0) {
*aDest++ = UCS2_NO_MAPPING; *aDest++ = UCS2_NO_MAPPING;
iDestlen++; iDestlen++;
@ -154,25 +163,28 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
mRunLength++; mRunLength++;
break; break;
case HZLEAD4:
// we got a "~\n", it means maintain double byte mode cross lines,
// ignore the '~' itself
// mHZState = HZ_STATE_GB;
// I find that "~\n" should interpreted as line continuation
// without mode change
// It should not be interpreted as line continuation with double
// byte mode on
break;
default: default:
// undefined ESC sequence '~X' are ignored since this is an // Undefined ESC sequence '~X': treat as an error if X is a
// illegal combination // printable character or we are in ASCII mode, and resynchronize
*aDest++ = UCS2_NO_MAPPING; // on the second character.
//
// N.B. For compatibility with other implementations, we treat '~\n'
// as an illegal sequence even though RFC1843 permits it, and for
// the same reason we pass through control characters including '\n'
// and ' ' even in GB mode.
if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) {
*aDest++ = UCS2_NO_MAPPING;
}
aSrc--;
(*aSrcLength)--;
iDestlen++; iDestlen++;
break; break;
} }
} else if (HZ_ENCODING_STATE == HZ_STATE_GB) { } else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
*aDest++ = mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80); *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) &&
UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ?
mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) :
UCS2_NO_MAPPING;
mRunLength++; mRunLength++;
iDestlen++; iDestlen++;
} else { } else {

View File

@ -63,6 +63,8 @@ static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CI
#define JIS0212_INDEX gJIS0212Index #define JIS0212_INDEX gJIS0212Index
#define SJIS_UNMAPPED 0x30fb #define SJIS_UNMAPPED 0x30fb
#define UNICODE_REPLACEMENT_CHARACTER 0xfffd #define UNICODE_REPLACEMENT_CHARACTER 0xfffd
#define IN_GR_RANGE(b) \
((PRUint8(0xa1) <= PRUint8(b)) && (PRUint8(b) <= PRUint8(0xfe)))
NS_IMETHODIMP nsShiftJISToUnicode::Convert( NS_IMETHODIMP nsShiftJISToUnicode::Convert(
const char * aSrc, PRInt32 * aSrcLen, const char * aSrc, PRInt32 * aSrcLen,
@ -345,7 +347,7 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
case 3: // JIS 0212 case 3: // JIS 0212
{ {
if(*src & 0x80) if (IN_GR_RANGE(*src))
{ {
mData = JIS0212_INDEX[*src & 0x7F]; mData = JIS0212_INDEX[*src & 0x7F];
if(mData != 0xFFFD ) if(mData != 0xFFFD )
@ -355,30 +357,39 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
mState = 5; // error mState = 5; // error
} }
} else { } else {
mState = 5; // error // First "JIS 0212" byte is not in the valid GR range: save it
if (mErrBehavior == kOnError_Signal)
goto error_invalidchar;
*dest++ = 0xFFFD;
--src;
mState = 0;
if(dest >= destEnd)
goto error1;
} }
} }
break; break;
case 4: case 4:
{ {
PRUint8 off = sbIdx[*src]; PRUint8 off = sbIdx[*src];
if(0xFF == off) { if(0xFF != off) {
if (mErrBehavior == kOnError_Signal) *dest++ = gJapaneseMap[mData+off];
goto error_invalidchar; mState = 0;
*dest++ = 0xFFFD; if(dest >= destEnd)
} else { goto error1;
*dest++ = gJapaneseMap[mData+off]; break;
} }
mState = 0; // else fall through to error handler
if(dest >= destEnd)
goto error1;
} }
break;
case 5: // two bytes undefined case 5: // two bytes undefined
{ {
if (mErrBehavior == kOnError_Signal) if (mErrBehavior == kOnError_Signal)
goto error_invalidchar; goto error_invalidchar;
*dest++ = 0xFFFD; *dest++ = 0xFFFD;
// Undefined JIS 0212 two byte sequence. If the second byte is in
// the valid range for a two byte sequence (0xa1 - 0xfe) consume
// both bytes. Otherwise resynchronize on the second byte.
if (!IN_GR_RANGE(*src))
--src;
mState = 0; mState = 0;
if(dest >= destEnd) if(dest >= destEnd)
goto error1; goto error1;