Bug 638379 - Part 1: Implement kOnError_Recover to the UTF-8 decoder. r=smontagu

2024-09-13 09:24:08 -07:00 · 2012-12-10 09:10:28 -05:00 · 2012-12-10 09:10:28 -05:00 · 7e6d22b429
commit 7e6d22b429
parent 89a4119635
3 changed files with 69 additions and 31 deletions
--- a/intl/locale/public/nsCharsetAlias.h
+++ b/intl/locale/public/nsCharsetAlias.h
@ -10,10 +10,12 @@
 #include "nsStringGlue.h"

 class nsCharsetConverterManager;
+class nsScriptableUnicodeConverter;

 class nsCharsetAlias
 {
   friend class nsCharsetConverterManager;
+   friend class nsScriptableUnicodeConverter;
   static nsresult GetPreferredInternal(const nsACString& aAlias, nsACString& aResult);
 public:
   static nsresult GetPreferred(const nsACString& aAlias, nsACString& aResult);
--- a/intl/uconv/src/nsScriptableUConv.cpp
+++ b/intl/uconv/src/nsScriptableUConv.cpp
@ -13,6 +13,7 @@
 #include "nsIStringStream.h"
 #include "nsCRT.h"
 #include "nsComponentManagerUtils.h"
+#include "nsCharsetAlias.h"

 static int32_t          gInstanceCount = 0;

@ -257,22 +258,39 @@ nsScriptableUnicodeConverter::InitConverter()
  mEncoder = nullptr;

  nsCOMPtr<nsICharsetConverterManager> ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
+  if (NS_FAILED(rv) || !ccm) {
+    return rv;
+  }

-  if (NS_SUCCEEDED(rv) && ccm) {
-    // get charset atom due to getting unicode converter
-    
-    // get an unicode converter
-    rv = ccm->GetUnicodeEncoder(mCharset.get(), getter_AddRefs(mEncoder));
-    if(NS_SUCCEEDED(rv)) {
-      rv = mEncoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nullptr, (PRUnichar)'?');
-      if(NS_SUCCEEDED(rv)) {
-        rv = mIsInternal ?
-          ccm->GetUnicodeDecoderInternal(mCharset.get(),
-                                         getter_AddRefs(mDecoder)) :
-          ccm->GetUnicodeDecoder(mCharset.get(),
-                                 getter_AddRefs(mDecoder));
-      }
-    }
+  // Get a Unicode encoder for mCharset.
+  rv = ccm->GetUnicodeEncoder(mCharset.get(), getter_AddRefs(mEncoder));
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  rv = mEncoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nullptr, (PRUnichar)'?');
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  nsAutoCString charset;
+  rv = mIsInternal ? nsCharsetAlias::GetPreferredInternal(mCharset, charset)
+                   : nsCharsetAlias::GetPreferred(mCharset, charset);
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  rv = ccm->GetUnicodeDecoderRaw(charset.get(), getter_AddRefs(mDecoder));
+  if (NS_FAILED(rv)) {
+    return rv;
+  }
+
+  // The UTF-8 decoder used to throw regardless of the error behavior.
+  // Simulate the old behavior for compatibility with legacy callers
+  // (including add-ons). If callers want control over the behavior,
+  // they should switch to TextDecoder.
+  if (charset.EqualsLiteral("UTF-8")) {
+    mDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal);
  }

  return rv ;
--- a/intl/uconv/src/nsUTF8ToUnicode.cpp
+++ b/intl/uconv/src/nsUTF8ToUnicode.cpp
@ -188,12 +188,11 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,

  out = aDest;
  if (mState == 0xFF) {
-    // Emit supplementary character left over from previous iteration. If the
-    // buffer size is insufficient, treat it as an illegal character.
+    // Emit supplementary character left over from previous iteration. It is
+    // the caller's responsibility to provide a sufficiently large buffer.
    if (aDestLen < 2) {
-      NS_ERROR("Output buffer insufficient to hold supplementary character");
-      mState = 0;
-      return NS_ERROR_ILLEGAL_INPUT;
+      *aSrcLength = *aDestLength = 0;
+      return NS_OK_UDEC_MOREOUTPUT;
    }
    out = EmitSurrogatePair(mUcs4, out);
    mUcs4 = 0;
@ -225,8 +224,12 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
        mBytes = 1;
      } else if (c < 0xC2) {  // C0/C1
        // Overlong 2 octet sequence
-        res = NS_ERROR_ILLEGAL_INPUT;
-        break;
+        if (mErrBehavior == kOnError_Signal) {
+          res = NS_ERROR_ILLEGAL_INPUT;
+          break;
+        }
+        *out++ = UCS2_REPLACEMENT_CHAR;
+        mFirst = false;
      } else if (c < 0xE0) {  // C2..DF
        // First octet of 2 octet sequence
        mUcs4 = c;
@ -248,12 +251,16 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
      } else {  // F5..FF
        /* Current octet is neither in the US-ASCII range nor a legal first
         * octet of a multi-octet sequence.
-         *
-         * Return an error condition. Caller is responsible for flushing and
-         * refilling the buffer and resetting state.
         */
-        res = NS_ERROR_ILLEGAL_INPUT;
-        break;
+        if (mErrBehavior == kOnError_Signal) {
+          /* Return an error condition. Caller is responsible for flushing and
+           * refilling the buffer and resetting state.
+           */
+          res = NS_ERROR_ILLEGAL_INPUT;
+          break;
+        }
+        *out++ = UCS2_REPLACEMENT_CHAR;
+        mFirst = false;
      }
    } else {
      // When mState is non-zero, we expect a continuation of the multi-octet
@ -270,8 +277,14 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
                              mUcs4 == 0x100000 && c > 0x8F)) {  // F4 90..BF
            // illegal sequences or sequences converted into illegal ranges.
            in--;
-            res = NS_ERROR_ILLEGAL_INPUT;
-            break;
+            if (mErrBehavior == kOnError_Signal) {
+              res = NS_ERROR_ILLEGAL_INPUT;
+              break;
+            }
+            *out++ = UCS2_REPLACEMENT_CHAR;
+            mState = 0;
+            mFirst = false;
+            continue;
          }
        }

@ -315,8 +328,13 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc,
         * for flushing and refilling the buffer and resetting state.
         */
        in--;
-        res = NS_ERROR_ILLEGAL_INPUT;
-        break;
+        if (mErrBehavior == kOnError_Signal) {
+          res = NS_ERROR_ILLEGAL_INPUT;
+          break;
+        }
+        *out++ = UCS2_REPLACEMENT_CHAR;
+        mState = 0;
+        mFirst = false;
      }
    }
  }