Bug 801402 - Use FindEncodingForLabel from HTML parser. r=hsivonen

This commit is contained in:
Masatoshi Kimura 2012-11-07 18:04:22 -05:00
parent 20a07dfd8e
commit 6846f5bc5b
5 changed files with 34 additions and 64 deletions

View File

@ -14,7 +14,6 @@ EncMetaUnsupported=An unsupported character encoding was declared for the HTML d
EncProtocolUnsupported=An unsupported character encoding was declared on the transfer protocol level. The declaration was ignored.
EncBomlessUtf16=Detected UTF-16-encoded Basic Latin-only text without a byte order mark and without a transfer protocol-level declaration. Encoding this content in UTF-16 is inefficient and the character encoding should have been declared in any case.
EncMetaUtf16=A meta tag was used to declare the character encoding as UTF-16. This was interpreted as a UTF-8 declaration instead.
EncMetaNonRoughSuperset=A meta tag was used to declare a character encoding that does not encode the Basic Latin range roughly like US-ASCII. The declaration was ignored.
# The bulk of the messages below are derived from
# http://hg.mozilla.org/projects/htmlparser/file/1f633cef7de7/src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java

View File

@ -4,10 +4,12 @@
#include "nsICharsetConverterManager.h"
#include "nsServiceManagerUtils.h"
#include "nsCharsetAlias.h"
#include "nsEncoderDecoderUtils.h"
#include "nsTraceRefcnt.h"
#include "mozilla/dom/EncodingUtils.h"
using mozilla::dom::EncodingUtils;
void
nsHtml5MetaScanner::sniff(nsHtml5ByteReadable* bytes, nsIUnicodeDecoder** decoder, nsACString& charset)
@ -48,8 +50,7 @@ nsHtml5MetaScanner::tryCharset(nsString* charset)
return true;
}
nsAutoCString preferred;
res = nsCharsetAlias::GetPreferred(encoding, preferred);
if (NS_FAILED(res)) {
if (!EncodingUtils::FindEncodingForLabel(encoding, preferred)) {
return false;
}
if (preferred.LowerCaseEqualsLiteral("utf-16") ||

View File

@ -6,7 +6,6 @@
#include "nsHtml5StreamParser.h"
#include "nsICharsetConverterManager.h"
#include "nsCharsetAlias.h"
#include "nsServiceManagerUtils.h"
#include "nsEncoderDecoderUtils.h"
#include "nsContentUtils.h"
@ -26,8 +25,10 @@
#include "nsCharsetSource.h"
#include "nsIWyciwygChannel.h"
using namespace mozilla;
#include "mozilla/dom/EncodingUtils.h"
using namespace mozilla;
using mozilla::dom::EncodingUtils;
int32_t nsHtml5StreamParser::sTimerInitialDelay = 120;
int32_t nsHtml5StreamParser::sTimerSubsequentDelay = 120;
@ -1193,28 +1194,25 @@ nsHtml5StreamParser::OnDataAvailable(nsIRequest* aRequest,
bool
nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
{
nsAutoCString newEncoding(aEncoding);
newEncoding.Trim(" \t\r\n\f");
if (newEncoding.LowerCaseEqualsLiteral("utf-16") ||
newEncoding.LowerCaseEqualsLiteral("utf-16be") ||
newEncoding.LowerCaseEqualsLiteral("utf-16le")) {
mTreeBuilder->MaybeComplainAboutCharset("EncMetaUtf16",
true,
mTokenizer->getLineNumber());
newEncoding.Assign("UTF-8");
}
nsresult rv = NS_OK;
bool eq;
rv = nsCharsetAlias::Equals(newEncoding, mCharset, &eq);
if (NS_FAILED(rv)) {
nsAutoCString newEncoding;
if (!EncodingUtils::FindEncodingForLabel(aEncoding, newEncoding)) {
// the encoding name is bogus
mTreeBuilder->MaybeComplainAboutCharset("EncMetaUnsupported",
true,
mTokenizer->getLineNumber());
return false;
}
if (eq) {
if (newEncoding.EqualsLiteral("UTF-16") ||
newEncoding.EqualsLiteral("UTF-16BE") ||
newEncoding.EqualsLiteral("UTF-16LE")) {
mTreeBuilder->MaybeComplainAboutCharset("EncMetaUtf16",
true,
mTokenizer->getLineNumber());
newEncoding.Assign("UTF-8");
}
if (newEncoding.Equals(mCharset)) {
if (mCharsetSource < kCharsetFromMetaPrescan) {
if (mInitialEncodingWasFromParentFrame) {
mTreeBuilder->MaybeComplainAboutCharset("EncLateMetaFrame",
@ -1231,36 +1229,7 @@ nsHtml5StreamParser::PreferredForInternalEncodingDecl(nsACString& aEncoding)
return false;
}
// XXX check HTML5 non-IANA aliases here
nsAutoCString preferred;
rv = nsCharsetAlias::GetPreferred(newEncoding, preferred);
if (NS_FAILED(rv)) {
// This charset has been blacklisted for permitting XSS smuggling.
// EncMetaNonRoughSuperset is a reasonable approximation to the
// right error message.
mTreeBuilder->MaybeComplainAboutCharset("EncMetaNonRoughSuperset",
true,
mTokenizer->getLineNumber());
return false;
}
// ??? Explicit further blacklist of character sets that are not
// "rough supersets" of ASCII. Some of these are handled above (utf-16),
// some by the XSS smuggling blacklist in charsetData.properties,
// maybe all of the remainder should also be blacklisted there.
if (preferred.LowerCaseEqualsLiteral("utf-16") ||
preferred.LowerCaseEqualsLiteral("utf-16be") ||
preferred.LowerCaseEqualsLiteral("utf-16le") ||
preferred.LowerCaseEqualsLiteral("utf-7") ||
preferred.LowerCaseEqualsLiteral("x-imap4-modified-utf7")) {
// Not a rough ASCII superset
mTreeBuilder->MaybeComplainAboutCharset("EncMetaNonRoughSuperset",
true,
mTokenizer->getLineNumber());
return false;
}
aEncoding.Assign(preferred);
aEncoding.Assign(newEncoding);
return true;
}

View File

@ -14,7 +14,6 @@
#include "nsIChannel.h"
#include "nsICachingChannel.h"
#include "nsICacheEntryDescriptor.h"
#include "nsCharsetAlias.h"
#include "nsICharsetConverterManager.h"
#include "nsIInputStream.h"
#include "CNavDTD.h"
@ -43,7 +42,10 @@
#include "nsCharsetSource.h"
#include "nsContentUtils.h"
#include "mozilla/dom/EncodingUtils.h"
using namespace mozilla;
using mozilla::dom::EncodingUtils;
#define NS_PARSER_FLAG_PARSER_ENABLED 0x00000002
#define NS_PARSER_FLAG_OBSERVERS_ENABLED 0x00000004
@ -1840,8 +1842,7 @@ ParserWriteFunc(nsIInputStream* in,
nsAutoCString declCharset;
if (ExtractCharsetFromXmlDeclaration(buf, count, declCharset)) {
nsresult rv = nsCharsetAlias::GetPreferred(declCharset, maybePrefer);
if (NS_SUCCEEDED(rv)) {
if (EncodingUtils::FindEncodingForLabel(declCharset, maybePrefer)) {
preferred.Assign(maybePrefer);
source = kCharsetFromMetaTag;
}

View File

@ -10,7 +10,6 @@
#include "nsDebug.h"
#include "nsIServiceManager.h"
#include "nsICharsetConverterManager.h"
#include "nsCharsetAlias.h"
#include "nsReadableUtils.h"
#include "nsIInputStream.h"
#include "nsIFile.h"
@ -20,6 +19,10 @@
#include "nsParser.h"
#include "nsCharsetSource.h"
#include "mozilla/dom/EncodingUtils.h"
using mozilla::dom::EncodingUtils;
// We replace NUL characters with this character.
static PRUnichar sInvalid = UCS2_REPLACEMENT_CHAR;
@ -118,12 +121,12 @@ nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSou
if (aSource < mCharsetSource) // priority is lower than the current one, just
return NS_OK;
nsresult res = NS_OK;
nsCString charsetName;
bool valid = EncodingUtils::FindEncodingForLabel(aCharset, charsetName);
MOZ_ASSERT(valid, "Should never call with a bogus aCharset.");
if (!mCharset.IsEmpty())
{
bool same;
res = nsCharsetAlias::Equals(aCharset, mCharset, &same);
if(NS_SUCCEEDED(res) && same)
if (charsetName.Equals(mCharset))
{
mCharsetSource = aSource;
return NS_OK; // no difference, don't change it
@ -131,9 +134,6 @@ nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSou
}
// different, need to change it
nsCString charsetName;
res = nsCharsetAlias::GetPreferred(aCharset, charsetName);
MOZ_ASSERT(NS_SUCCEEDED(res), "Should never call with a bogus aCharset.");
mCharset.Assign(charsetName);
@ -142,7 +142,7 @@ nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSou
NS_ASSERTION(nsParser::GetCharsetConverterManager(),
"Must have the charset converter manager!");
res = nsParser::GetCharsetConverterManager()->
nsresult res = nsParser::GetCharsetConverterManager()->
GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
if (NS_SUCCEEDED(res) && mUnicodeDecoder)
{