mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Various fixes to multi-byte Unicode decoders. Bug 715319, r=emk
This commit is contained in:
parent
29722aef5f
commit
068e9f59a2
@ -294,7 +294,7 @@ cp932=Shift_JIS
|
|||||||
#
|
#
|
||||||
# Aliases for EUC_JP
|
# Aliases for EUC_JP
|
||||||
#
|
#
|
||||||
cseucjpkdfmtjapanese=EUC-JP
|
cseucpkdfmtjapanese=EUC-JP
|
||||||
x-euc-jp=EUC-JP
|
x-euc-jp=EUC-JP
|
||||||
#
|
#
|
||||||
# Aliases for ISO-2022-JP
|
# Aliases for ISO-2022-JP
|
||||||
|
@ -87,6 +87,7 @@ x-mac-farsi.isXSSVulnerable = true
|
|||||||
x-mac-hebrew.isXSSVulnerable = true
|
x-mac-hebrew.isXSSVulnerable = true
|
||||||
x-imap4-modified-utf7.isXSSVulnerable = true
|
x-imap4-modified-utf7.isXSSVulnerable = true
|
||||||
utf-7.isXSSVulnerable = true
|
utf-7.isXSSVulnerable = true
|
||||||
|
t.61-8bit.isXSSVulnerable = true
|
||||||
|
|
||||||
t.61-8bit.notForOutgoing = true
|
t.61-8bit.notForOutgoing = true
|
||||||
utf-7.notForOutgoing = true
|
utf-7.notForOutgoing = true
|
||||||
|
@ -217,11 +217,16 @@ NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
aSrc += 4;
|
||||||
|
i += 3;
|
||||||
} else {
|
} else {
|
||||||
*aDest = UCS2_NO_MAPPING;
|
*aDest = UCS2_NO_MAPPING;
|
||||||
|
// If the third and fourth bytes are not in the legal ranges for
|
||||||
|
// a four-byte sequence, resynchronize on the second byte
|
||||||
|
// (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
|
||||||
|
// 0x30-0x39)
|
||||||
|
aSrc++;
|
||||||
}
|
}
|
||||||
aSrc += 4;
|
|
||||||
i+=3;
|
|
||||||
}
|
}
|
||||||
else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 )
|
else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 )
|
||||||
{
|
{
|
||||||
|
@ -46,9 +46,14 @@
|
|||||||
* http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
|
* http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
|
||||||
* and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
|
* and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
|
||||||
*
|
*
|
||||||
* In an effort to match the similar extended capability of Microsoft Internet Explorer
|
* Earlier versions of the converter said:
|
||||||
* 5.0. We also accept the 8-bit GB encoded chars mixed in a HZ string.
|
* "In an effort to match the similar extended capability of Microsoft
|
||||||
* But this should not be a recommended practice for HTML authors.
|
* Internet Explorer 5.0. We also accept the 8-bit GB encoded chars
|
||||||
|
* mixed in a HZ string.
|
||||||
|
* But this should not be a recommended practice for HTML authors."
|
||||||
|
* However, testing in current versions of IE shows that it only accepts
|
||||||
|
* 8-bit characters when the converter is in GB state, and when in ASCII
|
||||||
|
* state each single 8-bit character is converted to U+FFFD
|
||||||
*
|
*
|
||||||
* The priority of conversion is as follows: first convert 8-bit GB code; then,
|
* The priority of conversion is as follows: first convert 8-bit GB code; then,
|
||||||
* consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
|
* consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
|
||||||
@ -74,7 +79,6 @@
|
|||||||
#define HZLEAD1 '~'
|
#define HZLEAD1 '~'
|
||||||
#define HZLEAD2 '{'
|
#define HZLEAD2 '{'
|
||||||
#define HZLEAD3 '}'
|
#define HZLEAD3 '}'
|
||||||
#define HZLEAD4 '\n'
|
|
||||||
#define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
|
#define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
|
||||||
#define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))
|
#define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))
|
||||||
|
|
||||||
@ -107,23 +111,28 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
|
|||||||
|
|
||||||
char srcByte = *aSrc++;
|
char srcByte = *aSrc++;
|
||||||
(*aSrcLength)++;
|
(*aSrcLength)++;
|
||||||
|
|
||||||
if (!HZ_ODD_BYTE_STATE) {
|
if (!HZ_ODD_BYTE_STATE) {
|
||||||
if (srcByte & 0x80 || srcByte == HZLEAD1 || HZ_ENCODING_STATE == HZ_STATE_GB) {
|
if (srcByte == HZLEAD1 ||
|
||||||
|
(HZ_ENCODING_STATE == HZ_STATE_GB &&
|
||||||
|
(UINT8_IN_RANGE(0x21, srcByte, 0x7E) ||
|
||||||
|
UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) {
|
||||||
oddByte = srcByte;
|
oddByte = srcByte;
|
||||||
mHZState |= HZ_STATE_ODD_BYTE_FLAG;
|
mHZState |= HZ_STATE_ODD_BYTE_FLAG;
|
||||||
} else {
|
} else {
|
||||||
*aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
|
*aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING :
|
||||||
|
CAST_CHAR_TO_UNICHAR(srcByte);
|
||||||
iDestlen++;
|
iDestlen++;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (oddByte & 0x80) { // if it is a 8-bit byte
|
if (oddByte & 0x80) {
|
||||||
if (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
|
// Accept legal 8-bit GB 2312-80 sequences in GB mode only
|
||||||
UINT8_IN_RANGE(0x40, srcByte, 0xFE)) {
|
NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB,
|
||||||
// The source is a 8-bit GBCode
|
"Invalid lead byte in ASCII mode");
|
||||||
*aDest++ = mUtil.GBKCharToUnicode(oddByte, srcByte);
|
*aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
|
||||||
} else {
|
UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ?
|
||||||
*aDest++ = UCS2_NO_MAPPING;
|
mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING;
|
||||||
}
|
mRunLength++;
|
||||||
iDestlen++;
|
iDestlen++;
|
||||||
// otherwise, it is a 7-bit byte
|
// otherwise, it is a 7-bit byte
|
||||||
// The source will be an ASCII or a 7-bit HZ code depending on oddByte
|
// The source will be an ASCII or a 7-bit HZ code depending on oddByte
|
||||||
@ -132,14 +141,14 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
|
|||||||
case HZLEAD2:
|
case HZLEAD2:
|
||||||
// we got a '~{'
|
// we got a '~{'
|
||||||
// we are switching to HZ state
|
// we are switching to HZ state
|
||||||
mHZState = HZ_STATE_GB | HZ_ODD_BYTE_STATE;
|
mHZState = HZ_STATE_GB;
|
||||||
mRunLength = 0;
|
mRunLength = 0;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case HZLEAD3:
|
case HZLEAD3:
|
||||||
// we got a '~}'
|
// we got a '~}'
|
||||||
// we are switching to ASCII state
|
// we are switching to ASCII state
|
||||||
mHZState = HZ_STATE_ASCII | HZ_ODD_BYTE_STATE;
|
mHZState = HZ_STATE_ASCII;
|
||||||
if (mRunLength == 0) {
|
if (mRunLength == 0) {
|
||||||
*aDest++ = UCS2_NO_MAPPING;
|
*aDest++ = UCS2_NO_MAPPING;
|
||||||
iDestlen++;
|
iDestlen++;
|
||||||
@ -154,25 +163,28 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
|
|||||||
mRunLength++;
|
mRunLength++;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case HZLEAD4:
|
|
||||||
// we got a "~\n", it means maintain double byte mode cross lines,
|
|
||||||
// ignore the '~' itself
|
|
||||||
// mHZState = HZ_STATE_GB;
|
|
||||||
// I find that "~\n" should interpreted as line continuation
|
|
||||||
// without mode change
|
|
||||||
// It should not be interpreted as line continuation with double
|
|
||||||
// byte mode on
|
|
||||||
break;
|
|
||||||
|
|
||||||
default:
|
default:
|
||||||
// undefined ESC sequence '~X' are ignored since this is an
|
// Undefined ESC sequence '~X': treat as an error if X is a
|
||||||
// illegal combination
|
// printable character or we are in ASCII mode, and resynchronize
|
||||||
*aDest++ = UCS2_NO_MAPPING;
|
// on the second character.
|
||||||
|
//
|
||||||
|
// N.B. For compatibility with other implementations, we treat '~\n'
|
||||||
|
// as an illegal sequence even though RFC1843 permits it, and for
|
||||||
|
// the same reason we pass through control characters including '\n'
|
||||||
|
// and ' ' even in GB mode.
|
||||||
|
if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) {
|
||||||
|
*aDest++ = UCS2_NO_MAPPING;
|
||||||
|
}
|
||||||
|
aSrc--;
|
||||||
|
(*aSrcLength)--;
|
||||||
iDestlen++;
|
iDestlen++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
|
} else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
|
||||||
*aDest++ = mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80);
|
*aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) &&
|
||||||
|
UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ?
|
||||||
|
mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) :
|
||||||
|
UCS2_NO_MAPPING;
|
||||||
mRunLength++;
|
mRunLength++;
|
||||||
iDestlen++;
|
iDestlen++;
|
||||||
} else {
|
} else {
|
||||||
|
@ -63,6 +63,8 @@ static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CI
|
|||||||
#define JIS0212_INDEX gJIS0212Index
|
#define JIS0212_INDEX gJIS0212Index
|
||||||
#define SJIS_UNMAPPED 0x30fb
|
#define SJIS_UNMAPPED 0x30fb
|
||||||
#define UNICODE_REPLACEMENT_CHARACTER 0xfffd
|
#define UNICODE_REPLACEMENT_CHARACTER 0xfffd
|
||||||
|
#define IN_GR_RANGE(b) \
|
||||||
|
((PRUint8(0xa1) <= PRUint8(b)) && (PRUint8(b) <= PRUint8(0xfe)))
|
||||||
|
|
||||||
NS_IMETHODIMP nsShiftJISToUnicode::Convert(
|
NS_IMETHODIMP nsShiftJISToUnicode::Convert(
|
||||||
const char * aSrc, PRInt32 * aSrcLen,
|
const char * aSrc, PRInt32 * aSrcLen,
|
||||||
@ -345,7 +347,7 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
|
|||||||
|
|
||||||
case 3: // JIS 0212
|
case 3: // JIS 0212
|
||||||
{
|
{
|
||||||
if(*src & 0x80)
|
if (IN_GR_RANGE(*src))
|
||||||
{
|
{
|
||||||
mData = JIS0212_INDEX[*src & 0x7F];
|
mData = JIS0212_INDEX[*src & 0x7F];
|
||||||
if(mData != 0xFFFD )
|
if(mData != 0xFFFD )
|
||||||
@ -355,30 +357,39 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
|
|||||||
mState = 5; // error
|
mState = 5; // error
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
mState = 5; // error
|
// First "JIS 0212" byte is not in the valid GR range: save it
|
||||||
|
if (mErrBehavior == kOnError_Signal)
|
||||||
|
goto error_invalidchar;
|
||||||
|
*dest++ = 0xFFFD;
|
||||||
|
--src;
|
||||||
|
mState = 0;
|
||||||
|
if(dest >= destEnd)
|
||||||
|
goto error1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 4:
|
case 4:
|
||||||
{
|
{
|
||||||
PRUint8 off = sbIdx[*src];
|
PRUint8 off = sbIdx[*src];
|
||||||
if(0xFF == off) {
|
if(0xFF != off) {
|
||||||
if (mErrBehavior == kOnError_Signal)
|
*dest++ = gJapaneseMap[mData+off];
|
||||||
goto error_invalidchar;
|
mState = 0;
|
||||||
*dest++ = 0xFFFD;
|
if(dest >= destEnd)
|
||||||
} else {
|
goto error1;
|
||||||
*dest++ = gJapaneseMap[mData+off];
|
break;
|
||||||
}
|
}
|
||||||
mState = 0;
|
// else fall through to error handler
|
||||||
if(dest >= destEnd)
|
|
||||||
goto error1;
|
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
case 5: // two bytes undefined
|
case 5: // two bytes undefined
|
||||||
{
|
{
|
||||||
if (mErrBehavior == kOnError_Signal)
|
if (mErrBehavior == kOnError_Signal)
|
||||||
goto error_invalidchar;
|
goto error_invalidchar;
|
||||||
*dest++ = 0xFFFD;
|
*dest++ = 0xFFFD;
|
||||||
|
// Undefined JIS 0212 two byte sequence. If the second byte is in
|
||||||
|
// the valid range for a two byte sequence (0xa1 - 0xfe) consume
|
||||||
|
// both bytes. Otherwise resynchronize on the second byte.
|
||||||
|
if (!IN_GR_RANGE(*src))
|
||||||
|
--src;
|
||||||
mState = 0;
|
mState = 0;
|
||||||
if(dest >= destEnd)
|
if(dest >= destEnd)
|
||||||
goto error1;
|
goto error1;
|
||||||
|
Loading…
Reference in New Issue
Block a user