Various fixes to multi-byte Unicode decoders. Bug 715319, r=emk

2024-09-13 09:24:08 -07:00 · 2012-03-16 13:41:41 -07:00 · 2012-03-16 13:41:41 -07:00 · 068e9f59a2
commit 068e9f59a2
parent 29722aef5f
5 changed files with 74 additions and 45 deletions
--- a/intl/locale/src/charsetalias.properties
+++ b/intl/locale/src/charsetalias.properties
@ -294,7 +294,7 @@ cp932=Shift_JIS
 #
 # Aliases for EUC_JP
 #
-cseucjpkdfmtjapanese=EUC-JP
+cseucpkdfmtjapanese=EUC-JP
 x-euc-jp=EUC-JP
 #
 # Aliases for ISO-2022-JP
--- a/intl/uconv/src/charsetData.properties
+++ b/intl/uconv/src/charsetData.properties
@ -87,6 +87,7 @@ x-mac-farsi.isXSSVulnerable             = true
 x-mac-hebrew.isXSSVulnerable            = true
 x-imap4-modified-utf7.isXSSVulnerable   = true
 utf-7.isXSSVulnerable                   = true
 t.61-8bit.isXSSVulnerable               = true
 t.61-8bit.notForOutgoing             = true
 utf-7.notForOutgoing                 = true
--- a/intl/uconv/ucvcn/nsGBKToUnicode.cpp
+++ b/intl/uconv/ucvcn/nsGBKToUnicode.cpp
@ -217,11 +217,16 @@ NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
               }
             }
           }
           aSrc += 4;
           i += 3;
        } else {
          *aDest = UCS2_NO_MAPPING; 
          // If the third and fourth bytes are not in the legal ranges for
          // a four-byte sequence, resynchronize on the second byte
          // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
          //  0x30-0x39)
          aSrc++;
        }
        aSrc += 4;
        i+=3;
      }
      else if ((PRUint8) aSrc[0] == (PRUint8)0xA0 )
      {
--- a/intl/uconv/ucvcn/nsHZToUnicode.cpp
+++ b/intl/uconv/ucvcn/nsHZToUnicode.cpp
@ -46,9 +46,14 @@
 *       http://www.cis.ohio-state.edu/htbin/rfc/rfc1843.html
 *       and RFC1842 available at http://www.cis.ohio-state.edu/htbin/rfc/rfc1842.html.
 *        
- *       In an effort to match the similar extended capability of Microsoft Internet Explorer
+ *       Earlier versions of the converter said:
- *       5.0. We also accept the 8-bit GB encoded chars mixed in a HZ string.
+ *        "In an effort to match the similar extended capability of Microsoft 
- *       But this should not be a recommendedd practice for HTML authors.
+ *         Internet Explorer 5.0. We also accept the 8-bit GB encoded chars
 *         mixed in a HZ string. 
 *         But this should not be a recommended practice for HTML authors."
 *       However, testing in current versions of IE shows that it only accepts
 *       8-bit characters when the converter is in GB state, and when in ASCII
 *       state each single 8-bit character is converted to U+FFFD
 *
 *       The conversion priority is as follows: first convert 8-bit GB code; then,
 *       consume HZ ESC sequences such as '~{', '~}', '~~'; then, depending on the current
@ -74,7 +79,6 @@
 // Bytes that make up HZ escape sequences. HZLEAD1 ('~') introduces an
 // escape; HZLEAD2, HZLEAD3 and HZLEAD4 are the second bytes of the
 // sequences "~{", "~}" and "~\n" that the converter switches on.
 #define HZLEAD1 '~'
 #define HZLEAD2 '{'
 #define HZLEAD3 '}'
 #define HZLEAD4 '\n'
 // Both macros below read the variable mHZState, so they can only be used
 // where a variable of that name is in scope.
 // HZ_ODD_BYTE_STATE is non-zero when HZ_STATE_ODD_BYTE_FLAG is set in
 // mHZState, i.e. a first byte has been stashed and its partner is awaited.
 #define HZ_ODD_BYTE_STATE (mHZState & (HZ_STATE_ODD_BYTE_FLAG))
 // HZ_ENCODING_STATE is mHZState with the odd-byte flag masked out; the
 // converter compares it against HZ_STATE_GB and HZ_STATE_ASCII.
 #define HZ_ENCODING_STATE (mHZState & ~(HZ_STATE_ODD_BYTE_FLAG))
@ -107,23 +111,28 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
    char srcByte = *aSrc++;
    (*aSrcLength)++;
    if (!HZ_ODD_BYTE_STATE) {
-      if (srcByte & 0x80 || srcByte == HZLEAD1 || HZ_ENCODING_STATE == HZ_STATE_GB) { 
+      if (srcByte == HZLEAD1 || 
          (HZ_ENCODING_STATE == HZ_STATE_GB && 
           (UINT8_IN_RANGE(0x21, srcByte, 0x7E) ||
            UINT8_IN_RANGE(0x81, srcByte, 0xFE)))) {
        oddByte = srcByte;
        mHZState |= HZ_STATE_ODD_BYTE_FLAG;
      } else {
-        *aDest++ = CAST_CHAR_TO_UNICHAR(srcByte);
+        *aDest++ = (srcByte & 0x80) ? UCS2_NO_MAPPING :
                                      CAST_CHAR_TO_UNICHAR(srcByte);
        iDestlen++;
      }
    } else {
-      if (oddByte & 0x80) { // if it is a 8-bit byte
+      if (oddByte & 0x80) {
-        if (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
+        // Accept legal 8-bit GB 2312-80 sequences in GB mode only
-            UINT8_IN_RANGE(0x40, srcByte, 0xFE)) {
+        NS_ASSERTION(HZ_ENCODING_STATE == HZ_STATE_GB,
-          // The source is a 8-bit GBCode
+                     "Invalid lead byte in ASCII mode");                    
-          *aDest++ = mUtil.GBKCharToUnicode(oddByte, srcByte);
+        *aDest++ = (UINT8_IN_RANGE(0x81, oddByte, 0xFE) &&
-        } else {
+                    UINT8_IN_RANGE(0x40, srcByte, 0xFE)) ?
-          *aDest++ = UCS2_NO_MAPPING;
+                     mUtil.GBKCharToUnicode(oddByte, srcByte) : UCS2_NO_MAPPING;
-        }
+        mRunLength++;
        iDestlen++;
      // otherwise, it is a 7-bit byte 
      // The source will be an ASCII or a 7-bit HZ code depending on oddByte
@ -132,14 +141,14 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
          case HZLEAD2: 
            // we got a '~{'
            // we are switching to HZ state
-            mHZState = HZ_STATE_GB | HZ_ODD_BYTE_STATE;
+            mHZState = HZ_STATE_GB;
            mRunLength = 0;
            break;
          case HZLEAD3: 
            // we got a '~}'
            // we are switching to ASCII state
-            mHZState = HZ_STATE_ASCII | HZ_ODD_BYTE_STATE;
+            mHZState = HZ_STATE_ASCII;
            if (mRunLength == 0) {
              *aDest++ = UCS2_NO_MAPPING;
              iDestlen++;
@ -154,25 +163,28 @@ NS_IMETHODIMP nsHZToUnicode::ConvertNoBuff(
            mRunLength++;
            break;
          case HZLEAD4:   
            // we got a "~\n", it means maintain double byte mode cross lines,
            // ignore the '~' itself
            //  mHZState = HZ_STATE_GB; 
            // I find that "~\n" should be interpreted as line continuation
            // without mode change
            // It should not be interpreted as line continuation with double
            // byte mode on
            break;
          default:
-            // undefined ESC sequence '~X' are ignored since this is an
+            // Undefined ESC sequence '~X': treat as an error if X is a
-            // illegal combination 
+            // printable character or we are in ASCII mode, and resynchronize
-            *aDest++ = UCS2_NO_MAPPING;
+            // on the second character.
            // 
            // N.B. For compatibility with other implementations, we treat '~\n'
            // as an illegal sequence even though RFC1843 permits it, and for
            // the same reason we pass through control characters including '\n'
            // and ' ' even in GB mode.
            if (srcByte > 0x20 || HZ_ENCODING_STATE == HZ_STATE_ASCII) {
              *aDest++ = UCS2_NO_MAPPING;
            }
            aSrc--;
            (*aSrcLength)--;
            iDestlen++;
            break;
        }
      } else if (HZ_ENCODING_STATE == HZ_STATE_GB) {
-        *aDest++ = mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80);
+        *aDest++ = (UINT8_IN_RANGE(0x21, oddByte, 0x7E) &&
                    UINT8_IN_RANGE(0x21, srcByte, 0x7E)) ?
                     mUtil.GBKCharToUnicode(oddByte|0x80, srcByte|0x80) :
                     UCS2_NO_MAPPING;
        mRunLength++;
        iDestlen++;
      } else {
--- a/intl/uconv/ucvja/nsJapaneseToUnicode.cpp
+++ b/intl/uconv/ucvja/nsJapaneseToUnicode.cpp
@ -63,6 +63,8 @@ static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CI
 // Alias for the gJIS0212Index table; the EUC-JP decoder indexes it with
 // the lead byte masked to 7 bits (*src & 0x7F).
 #define JIS0212_INDEX gJIS0212Index
 // NOTE(review): presumably the code unit emitted for unmapped Shift_JIS
 // input -- confirm against its use sites, which are outside this view.
 #define SJIS_UNMAPPED	0x30fb
 // U+FFFD, the value written to the output for undecodable input.
 #define UNICODE_REPLACEMENT_CHARACTER 0xfffd
 // True when b, converted to PRUint8, lies in the inclusive range
 // 0xa1..0xfe (the valid byte range for a two-byte sequence).
 // The argument is evaluated twice, so it must not have side effects.
 #define IN_GR_RANGE(b) \
  ((PRUint8(0xa1) <= PRUint8(b)) && (PRUint8(b) <= PRUint8(0xfe)))
 NS_IMETHODIMP nsShiftJISToUnicode::Convert(
   const char * aSrc, PRInt32 * aSrcLen,
@ -345,7 +347,7 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
          case 3: // JIS 0212
          {
-            if(*src & 0x80)
+            if (IN_GR_RANGE(*src))
            {
              mData = JIS0212_INDEX[*src & 0x7F];
              if(mData != 0xFFFD )
@ -355,30 +357,39 @@ NS_IMETHODIMP nsEUCJPToUnicodeV2::Convert(
                 mState = 5; // error
              }
            } else {
-              mState = 5; // error
+              // First "JIS 0212" byte is not in the valid GR range: reprocess it
              if (mErrBehavior == kOnError_Signal)
                goto error_invalidchar;
              *dest++ = 0xFFFD;
              --src;
              mState = 0;
              if(dest >= destEnd)
                goto error1;
            }
          }
          break;
          case 4:
          {
            PRUint8 off = sbIdx[*src];
-            if(0xFF == off) {
+            if(0xFF != off) {
-              if (mErrBehavior == kOnError_Signal)
+              *dest++ = gJapaneseMap[mData+off];
-                goto error_invalidchar;
+              mState = 0;
-               *dest++ = 0xFFFD;
+              if(dest >= destEnd)
-            } else {
+                goto error1;
-               *dest++ = gJapaneseMap[mData+off];
+              break;
            }
-            mState = 0;
+            // else fall through to error handler
            if(dest >= destEnd)
              goto error1;
          }
          break;
          case 5: // two bytes undefined
          {
            if (mErrBehavior == kOnError_Signal)
              goto error_invalidchar;
            *dest++ = 0xFFFD;
            // Undefined JIS 0212 two byte sequence. If the second byte is in
            // the valid range for a two byte sequence (0xa1 - 0xfe) consume
            // both bytes. Otherwise resynchronize on the second byte.
            if (!IN_GR_RANGE(*src))
              --src;
            mState = 0;
            if(dest >= destEnd)
              goto error1;