Bug 782412 - Part 4: Implement inputErrorBehavior. r=smontagu

2024-09-13 09:24:08 -07:00 · 2012-11-26 20:38:20 -05:00 · 2012-11-26 20:38:20 -05:00 · 292564df3b
commit 292564df3b
parent 6a679006de
3 changed files with 52 additions and 42 deletions
--- a/dom/encoding/test/unit/test_singlebytes.js
+++ b/dom/encoding/test/unit/test_singlebytes.js
@ -181,11 +181,10 @@ test(
      { encoding: 'utf-8', input: [0xE0, 0x80, 0xC0] }, // invalid trail
      { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80] }, // > 0x10FFFF
      { encoding: 'utf-16', input: [0x00] }, // truncated code unit
-      //TODO: Gecko doesn't throw for unpaired surrogates
-      //{ encoding: 'utf-16', input: [0x00, 0xd8] }, // surrogate half
-      //{ encoding: 'utf-16', input: [0x00, 0xd8, 0x00, 0x00] }, // surrogate half
-      //{ encoding: 'utf-16', input: [0x00, 0xdc, 0x00, 0x00] }, // trail surrogate
-      //{ encoding: 'utf-16', input: [0x00, 0xdc, 0x00, 0xd8] }  // swapped surrogates
+      { encoding: 'utf-16', input: [0x00, 0xd8] }, // surrogate half
+      { encoding: 'utf-16', input: [0x00, 0xd8, 0x00, 0x00] }, // surrogate half
+      { encoding: 'utf-16', input: [0x00, 0xdc, 0x00, 0x00] }, // trail surrogate
+      { encoding: 'utf-16', input: [0x00, 0xdc, 0x00, 0xd8] }  // swapped surrogates
      // TODO: Single byte encoding cases
    ];

--- a/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp
+++ b/intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp
@ -18,13 +18,12 @@ enum {
  STATE_ODD_SURROGATE_PAIR = 4
 };

-static nsresult
-UTF16ConvertToUnicode(uint8_t& aState, uint8_t& aOddByte,
-                      PRUnichar& aOddHighSurrogate, PRUnichar& aOddLowSurrogate,
-                      const char * aSrc,
-                      int32_t * aSrcLength, PRUnichar * aDest,
-                      int32_t * aDestLength,
-                      bool aSwapBytes)
+nsresult
+nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
+                                            int32_t * aSrcLength,
+                                            PRUnichar * aDest,
+                                            int32_t * aDestLength,
+                                            bool aSwapBytes)
 {
  const char* src = aSrc;
  const char* srcEnd = aSrc + *aSrcLength;
@ -32,27 +31,27 @@ UTF16ConvertToUnicode(uint8_t& aState, uint8_t& aOddByte,
  PRUnichar* destEnd = aDest + *aDestLength;
  PRUnichar oddHighSurrogate;

-  switch(aState) {
+  switch(mState) {
    case STATE_FIRST_CALL:
      NS_ASSERTION(*aSrcLength > 1, "buffer too short");
      src+=2;
-      aState = STATE_NORMAL;
+      mState = STATE_NORMAL;
      break;

    case STATE_SECOND_BYTE:
      NS_ASSERTION(*aSrcLength > 0, "buffer too short");
      src++;
-      aState = STATE_NORMAL;
+      mState = STATE_NORMAL;
      break;

    case STATE_ODD_SURROGATE_PAIR:
      if (*aDestLength < 2)
        goto error;
      else {
-        *dest++ = aOddHighSurrogate;
-        *dest++ = aOddLowSurrogate;
-        aOddHighSurrogate = aOddLowSurrogate = 0;
-        aState = STATE_NORMAL;
+        *dest++ = mOddHighSurrogate;
+        *dest++ = mOddLowSurrogate;
+        mOddHighSurrogate = mOddLowSurrogate = 0;
+        mState = STATE_NORMAL;
      }
      break;

@ -62,28 +61,28 @@ UTF16ConvertToUnicode(uint8_t& aState, uint8_t& aOddByte,
      break;
  }

-  oddHighSurrogate = aOddHighSurrogate;
+  oddHighSurrogate = mOddHighSurrogate;

  if (src == srcEnd) {
    *aDestLength = dest - aDest;
-    return (aState != STATE_NORMAL || oddHighSurrogate) ?
+    return (mState != STATE_NORMAL || oddHighSurrogate) ?
           NS_OK_UDEC_MOREINPUT : NS_OK;
  }

  const char* srcEvenEnd;

  PRUnichar u;
-  if (aState == STATE_HALF_CODE_POINT) {
+  if (mState == STATE_HALF_CODE_POINT) {
    if (dest == destEnd)
      goto error;

-    // the 1st byte of a 16-bit code unit was stored in |aOddByte| in the
+    // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
    // previous run while the 2nd byte has to come from |*src|.
-    aState = STATE_NORMAL;
+    mState = STATE_NORMAL;
 #ifdef IS_BIG_ENDIAN
-    u = (aOddByte << 8) | *src++; // safe, we know we have at least one byte.
+    u = (mOddByte << 8) | *src++; // safe, we know we have at least one byte.
 #else
-    u = (*src++ << 8) | aOddByte; // safe, we know we have at least one byte.
+    u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte.
 #endif
    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
    goto have_codepoint;
@ -108,6 +107,9 @@ have_codepoint:

    if (!IS_SURROGATE(u)) {
      if (oddHighSurrogate) {
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
        if (dest == destEnd)
          goto error;
@ -116,6 +118,9 @@ have_codepoint:
      *dest++ = u;
    } else if (NS_IS_HIGH_SURROGATE(u)) {
      if (oddHighSurrogate) {
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
        if (dest == destEnd)
          goto error;
@ -125,14 +130,17 @@ have_codepoint:
    else /* if (NS_IS_LOW_SURROGATE(u)) */ {
      if (oddHighSurrogate && *aDestLength > 1) {
        if (dest + 1 >= destEnd) {
-          aOddLowSurrogate = u;
-          aOddHighSurrogate = oddHighSurrogate;
-          aState = STATE_ODD_SURROGATE_PAIR;
+          mOddLowSurrogate = u;
+          mOddHighSurrogate = oddHighSurrogate;
+          mState = STATE_ODD_SURROGATE_PAIR;
          goto error;
        }
        *dest++ = oddHighSurrogate;
        *dest++ = u;
      } else {
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
        *dest++ = UCS2_REPLACEMENT_CHAR;
      }
      oddHighSurrogate = 0;
@ -140,21 +148,26 @@ have_codepoint:
  }
  if (src != srcEnd) {
    // store the lead byte of a 16-bit unit for the next run.
-    aOddByte = *src++;
-    aState = STATE_HALF_CODE_POINT;
+    mOddByte = *src++;
+    mState = STATE_HALF_CODE_POINT;
  }

-  aOddHighSurrogate = oddHighSurrogate;
+  mOddHighSurrogate = oddHighSurrogate;

  *aDestLength = dest - aDest;
  *aSrcLength =  src  - aSrc; 
-  return (aState != STATE_NORMAL || oddHighSurrogate) ?
+  return (mState != STATE_NORMAL || oddHighSurrogate) ?
         NS_OK_UDEC_MOREINPUT : NS_OK;

 error:
  *aDestLength = dest - aDest;
  *aSrcLength =  src  - aSrc; 
  return  NS_OK_UDEC_MOREOUTPUT;
+
+error2:
+  *aDestLength = dest - aDest;
+  *aSrcLength = --src - aSrc; 
+  return  NS_ERROR_ILLEGAL_INPUT;
 }

 NS_IMETHODIMP
@ -224,9 +237,7 @@ nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
      break;
  }

-  nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
-                                      mOddLowSurrogate,
-                                      aSrc, aSrcLength, aDest, aDestLength,
+  nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_LITTLE_ENDIAN
                                      true
 #else
@ -279,9 +290,7 @@ nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
      break;
  }

-  nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
-                                      mOddLowSurrogate,
-                                      aSrc, aSrcLength, aDest, aDestLength,
+  nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_BIG_ENDIAN
                                      true
 #else
@ -342,9 +351,7 @@ nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
      }
    }
    
-    nsresult rv = UTF16ConvertToUnicode(mState, mOddByte, mOddHighSurrogate,
-                                        mOddLowSurrogate,
-                                        aSrc, aSrcLength, aDest, aDestLength,
+    nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
 #ifdef IS_BIG_ENDIAN
                                        (mEndian == kLittleEndian)
 #elif defined(IS_LITTLE_ENDIAN)
--- a/intl/uconv/ucvlatin/nsUTF16ToUnicode.h
+++ b/intl/uconv/ucvlatin/nsUTF16ToUnicode.h
@ -16,6 +16,10 @@ protected:
  // ctor accessible only by child classes
  nsUTF16ToUnicodeBase() { Reset();}

+  nsresult UTF16ConvertToUnicode(const char * aSrc,
+                                 int32_t * aSrcLength, PRUnichar * aDest,
+                                 int32_t * aDestLength, bool aSwapBytes);
+
 public: 
  //--------------------------------------------------------------------
  // Subclassing of nsDecoderSupport class [declaration]