From 7e6d22b4290303f6678a9e406b6a2b767a9a8893 Mon Sep 17 00:00:00 2001 From: Masatoshi Kimura Date: Mon, 10 Dec 2012 09:10:28 -0500 Subject: [PATCH] Bug 638379 - Part 1: Implement kOnError_Recover to the UTF-8 decoder. r=smontagu --- intl/locale/public/nsCharsetAlias.h | 2 ++ intl/uconv/src/nsScriptableUConv.cpp | 48 +++++++++++++++++--------- intl/uconv/src/nsUTF8ToUnicode.cpp | 50 +++++++++++++++++++--------- 3 files changed, 69 insertions(+), 31 deletions(-) diff --git a/intl/locale/public/nsCharsetAlias.h b/intl/locale/public/nsCharsetAlias.h index 94b03762954..17419043b69 100644 --- a/intl/locale/public/nsCharsetAlias.h +++ b/intl/locale/public/nsCharsetAlias.h @@ -10,10 +10,12 @@ #include "nsStringGlue.h" class nsCharsetConverterManager; +class nsScriptableUnicodeConverter; class nsCharsetAlias { friend class nsCharsetConverterManager; + friend class nsScriptableUnicodeConverter; static nsresult GetPreferredInternal(const nsACString& aAlias, nsACString& aResult); public: static nsresult GetPreferred(const nsACString& aAlias, nsACString& aResult); diff --git a/intl/uconv/src/nsScriptableUConv.cpp b/intl/uconv/src/nsScriptableUConv.cpp index 0a03f67fa40..1dcc0c12878 100644 --- a/intl/uconv/src/nsScriptableUConv.cpp +++ b/intl/uconv/src/nsScriptableUConv.cpp @@ -13,6 +13,7 @@ #include "nsIStringStream.h" #include "nsCRT.h" #include "nsComponentManagerUtils.h" +#include "nsCharsetAlias.h" static int32_t gInstanceCount = 0; @@ -257,22 +258,39 @@ nsScriptableUnicodeConverter::InitConverter() mEncoder = nullptr; nsCOMPtr ccm = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv); + if (NS_FAILED(rv) || !ccm) { + return rv; + } - if (NS_SUCCEEDED(rv) && ccm) { - // get charset atom due to getting unicode converter - - // get an unicode converter - rv = ccm->GetUnicodeEncoder(mCharset.get(), getter_AddRefs(mEncoder)); - if(NS_SUCCEEDED(rv)) { - rv = mEncoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nullptr, (PRUnichar)'?'); - if(NS_SUCCEEDED(rv)) { - rv = mIsInternal ? - ccm->GetUnicodeDecoderInternal(mCharset.get(), - getter_AddRefs(mDecoder)) : - ccm->GetUnicodeDecoder(mCharset.get(), - getter_AddRefs(mDecoder)); - } - } + // get an unicode converter + rv = ccm->GetUnicodeEncoder(mCharset.get(), getter_AddRefs(mEncoder)); + if (NS_FAILED(rv)) { + return rv; + } + + rv = mEncoder->SetOutputErrorBehavior(nsIUnicodeEncoder::kOnError_Replace, nullptr, (PRUnichar)'?'); + if (NS_FAILED(rv)) { + return rv; + } + + nsAutoCString charset; + rv = mIsInternal ? nsCharsetAlias::GetPreferredInternal(mCharset, charset) + : nsCharsetAlias::GetPreferred(mCharset, charset); + if (NS_FAILED(rv)) { + return rv; + } + + rv = ccm->GetUnicodeDecoderRaw(charset.get(), getter_AddRefs(mDecoder)); + if (NS_FAILED(rv)) { + return rv; + } + + // The UTF-8 decoder used to throw regardless of the error behavior. + // Simulating the old behavior for compatibility with legacy callers + // (including addons). If callers want a control over the behavior, + // they should switch to TextDecoder. + if (charset.EqualsLiteral("UTF-8")) { + mDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal); } return rv ; diff --git a/intl/uconv/src/nsUTF8ToUnicode.cpp b/intl/uconv/src/nsUTF8ToUnicode.cpp index fc76259c26b..cdac0547cc3 100644 --- a/intl/uconv/src/nsUTF8ToUnicode.cpp +++ b/intl/uconv/src/nsUTF8ToUnicode.cpp @@ -188,12 +188,11 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, out = aDest; if (mState == 0xFF) { - // Emit supplementary character left over from previous iteration. If the - // buffer size is insufficient, treat it as an illegal character. + // Emit supplementary character left over from previous iteration. It is + // caller's responsibility to keep a sufficient buffer. if (aDestLen < 2) { - NS_ERROR("Output buffer insufficient to hold supplementary character"); - mState = 0; - return NS_ERROR_ILLEGAL_INPUT; + *aSrcLength = *aDestLength = 0; + return NS_OK_UDEC_MOREOUTPUT; } out = EmitSurrogatePair(mUcs4, out); mUcs4 = 0; @@ -225,8 +224,12 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, mBytes = 1; } else if (c < 0xC2) { // C0/C1 // Overlong 2 octet sequence - res = NS_ERROR_ILLEGAL_INPUT; - break; + if (mErrBehavior == kOnError_Signal) { + res = NS_ERROR_ILLEGAL_INPUT; + break; + } + *out++ = UCS2_REPLACEMENT_CHAR; + mFirst = false; } else if (c < 0xE0) { // C2..DF // First octet of 2 octet sequence mUcs4 = c; @@ -248,12 +251,16 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, } else { // F5..FF /* Current octet is neither in the US-ASCII range nor a legal first * octet of a multi-octet sequence. - * - * Return an error condition. Caller is responsible for flushing and - * refilling the buffer and resetting state. */ - res = NS_ERROR_ILLEGAL_INPUT; - break; + if (mErrBehavior == kOnError_Signal) { + /* Return an error condition. Caller is responsible for flushing and + * refilling the buffer and resetting state. + */ + res = NS_ERROR_ILLEGAL_INPUT; + break; + } + *out++ = UCS2_REPLACEMENT_CHAR; + mFirst = false; } } else { // When mState is non-zero, we expect a continuation of the multi-octet @@ -270,8 +277,14 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, mUcs4 == 0x100000 && c > 0x8F)) { // F4 90..BF // illegal sequences or sequences converted into illegal ranges. in--; - res = NS_ERROR_ILLEGAL_INPUT; - break; + if (mErrBehavior == kOnError_Signal) { + res = NS_ERROR_ILLEGAL_INPUT; + break; + } + *out++ = UCS2_REPLACEMENT_CHAR; + mState = 0; + mFirst = false; + continue; } } @@ -315,8 +328,13 @@ NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, * for flushing and refilling the buffer and resetting state. */ in--; - res = NS_ERROR_ILLEGAL_INPUT; - break; + if (mErrBehavior == kOnError_Signal) { + res = NS_ERROR_ILLEGAL_INPUT; + break; + } + *out++ = UCS2_REPLACEMENT_CHAR; + mState = 0; + mFirst = false; } } }