Bug 863728 - Implement the replacement encoding. r=emk.

This commit is contained in:
Henri Sivonen 2013-11-25 10:06:56 +02:00
parent 15eb0861a8
commit d0c0e04f69
36 changed files with 264 additions and 157 deletions

View File

@ -268,7 +268,8 @@ EventSource::Init(nsISupports* aOwner,
do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
NS_ENSURE_SUCCESS(rv, rv);
rv = convManager->GetUnicodeDecoder("UTF-8", getter_AddRefs(mUnicodeDecoder));
rv = convManager->GetUnicodeDecoderRaw("UTF-8",
getter_AddRefs(mUnicodeDecoder));
NS_ENSURE_SUCCESS(rv, rv);
// the constructor should throw a SYNTAX_ERROR only if it fails resolving the

View File

@ -540,7 +540,8 @@ nsDOMFileReader::ConvertStream(const char *aFileData,
NS_ENSURE_SUCCESS(rv, rv);
nsCOMPtr<nsIUnicodeDecoder> unicodeDecoder;
rv = charsetConverter->GetUnicodeDecoder(aCharset, getter_AddRefs(unicodeDecoder));
rv = charsetConverter->GetUnicodeDecoderRaw(aCharset,
getter_AddRefs(unicodeDecoder));
NS_ENSURE_SUCCESS(rv, rv);
int32_t destLength;

View File

@ -3289,16 +3289,12 @@ nsDocument::GetBaseTarget(nsAString &aBaseTarget)
void
nsDocument::SetDocumentCharacterSet(const nsACString& aCharSetID)
{
// XXX it would be a good idea to assert the sanity of the argument,
// but before we figure out what to do about non-Encoding Standard
// encodings in the charset menu and in mailnews, assertions are futile.
if (!mCharacterSet.Equals(aCharSetID)) {
mCharacterSet = aCharSetID;
#ifdef DEBUG
nsAutoCString canonicalName;
nsCharsetAlias::GetPreferred(aCharSetID, canonicalName);
NS_ASSERTION(canonicalName.Equals(aCharSetID),
"charset name must be canonical");
#endif
int32_t n = mCharSetObservers.Length();
for (int32_t i = 0; i < n; i++) {

View File

@ -48,6 +48,7 @@
#include "nsSandboxFlags.h"
#include "nsContentTypeParser.h"
#include "nsINetworkSeer.h"
#include "mozilla/dom/EncodingUtils.h"
#include "mozilla/CORSMode.h"
#include "mozilla/Attributes.h"
@ -1197,15 +1198,15 @@ nsScriptLoader::ConvertToUTF16(nsIChannel* aChannel, const uint8_t* aData,
if (!unicodeDecoder &&
aChannel &&
NS_SUCCEEDED(aChannel->GetContentCharset(charset)) &&
!charset.IsEmpty()) {
charsetConv->GetUnicodeDecoder(charset.get(),
getter_AddRefs(unicodeDecoder));
EncodingUtils::FindEncodingForLabel(charset, charset)) {
charsetConv->GetUnicodeDecoderRaw(charset.get(),
getter_AddRefs(unicodeDecoder));
}
if (!unicodeDecoder && !aHintCharset.IsEmpty()) {
CopyUTF16toUTF8(aHintCharset, charset);
charsetConv->GetUnicodeDecoder(charset.get(),
getter_AddRefs(unicodeDecoder));
if (!unicodeDecoder &&
EncodingUtils::FindEncodingForLabel(aHintCharset, charset)) {
charsetConv->GetUnicodeDecoderRaw(charset.get(),
getter_AddRefs(unicodeDecoder));
}
if (!unicodeDecoder && aDocument) {

View File

@ -38,6 +38,7 @@ EncodingUtils::IsAsciiCompatible(const nsACString& aPreferredName)
return !(aPreferredName.LowerCaseEqualsLiteral("utf-16") ||
aPreferredName.LowerCaseEqualsLiteral("utf-16be") ||
aPreferredName.LowerCaseEqualsLiteral("utf-16le") ||
aPreferredName.LowerCaseEqualsLiteral("replacement") ||
aPreferredName.LowerCaseEqualsLiteral("utf-7") ||
aPreferredName.LowerCaseEqualsLiteral("x-imap4-modified-utf7"));
}

View File

@ -21,8 +21,9 @@ TextDecoder::Init(const nsAString& aEncoding, const bool aFatal,
EncodingUtils::TrimSpaceCharacters(label);
// Let encoding be the result of getting an encoding from label.
// If encoding is failure, throw a TypeError.
if (!EncodingUtils::FindEncodingForLabel(label, mEncoding)) {
// If encoding is failure or replacement, throw a TypeError.
if (!EncodingUtils::FindEncodingForLabel(label, mEncoding) ||
mEncoding.EqualsLiteral("replacement")) {
aRv.ThrowTypeError(MSG_ENCODING_NOT_SUPPORTED, &label);
return;
}

View File

@ -215,8 +215,10 @@ ks_c_5601-1989=EUC-KR
ksc5601=EUC-KR
ksc_5601=EUC-KR
windows-949=EUC-KR
csiso2022kr=ISO-2022-KR
iso-2022-kr=ISO-2022-KR
csiso2022kr=replacement
iso-2022-kr=replacement
iso-2022-cn=replacement
iso-2022-cn-ext=replacement
utf-16=UTF-16LE
utf-16le=UTF-16LE
utf-16be=UTF-16BE

View File

@ -0,0 +1 @@
<meta charset=utf-8><EFBFBD>

View File

@ -0,0 +1 @@
<meta charset=iso-2022-cn>

View File

@ -0,0 +1,3 @@
<!DOCTYPE html>
<meta charset=utf-8>
<iframe src="data:text/html;charset=utf-8,<2C><iframe src='data:text/html;charset=utf-8,PASS'></iframe>" width=400 height=200></iframe>

View File

@ -0,0 +1,17 @@
<!DOCTYPE html>
<html class=reftest-wait>
<meta charset=utf-8>
<script>
// Adds a second iframe inside the document of the first child frame and, once
// that nested iframe has loaded, removes the root element's class attribute
// (the "reftest-wait" marker set on <html>).
function runTest() {
  var rootElement = document.documentElement;
  var childDocument = window[0].document;
  var nestedFrame = childDocument.createElement("iframe");
  nestedFrame.src = "data:text/html,PASS";
  nestedFrame.onload = function() {
    rootElement.removeAttribute("class");
  };
  childDocument.body.appendChild(nestedFrame);
}
</script>
<body onload="runTest();">
<iframe src="bug863728-1.html" width=400 height=200></iframe>

View File

@ -0,0 +1,5 @@
<link rel=stylesheet href="data:text/css;charset=iso-2022-kr,html { background-color: red }">
<link rel=stylesheet href="data:text/css,html { background-color: red }" charset="iso-2022-cn-ext">
<link rel=stylesheet href='data:text/css,@charset "csiso2022kr"; html { background-color: red }'>
<script src="data:text/javascript;charset=iso-2022-kr,document.write('FAIL');"></script>
<script src="data:text/javascript,document.write('FAIL');" charset="iso-2022-kr"></script>

View File

@ -0,0 +1,3 @@
== bug863728-1.html bug863728-1-ref.html
== bug863728-2.html bug863728-2-ref.html
== bug863728-3.html bug863728-3-ref.html

View File

@ -349,11 +349,10 @@ function testDecoderGetEncoding()
{encoding: "iso-2022-jp", labels: ["csiso2022jp", "iso-2022-jp"]},
{encoding: "shift_jis", labels: ["csshiftjis", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis"]},
{encoding: "euc-kr", labels: ["cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949"]},
{encoding: "iso-2022-kr", labels: ["csiso2022kr", "iso-2022-kr"]},
{encoding: "utf-16le", labels: ["utf-16", "utf-16le"]},
{encoding: "utf-16be", labels: ["utf-16be"]},
{encoding: "x-user-defined", labels: ["x-user-defined"]},
{error: "TypeError", labels: ["x-windows-949", "\u0130SO-8859-1"]},
{error: "TypeError", labels: ["x-windows-949", "\u0130SO-8859-1", "csiso2022kr", "iso-2022-kr", "iso-2022-cn", "iso-2022-cn-ext", "replacement"]},
];
for (var le of labelEncodings) {

View File

@ -24,7 +24,6 @@ setup({explicit_timeout: true});
<script type="text/javascript" src="unit/test_iso-2022-jp.js"></script>
<script type="text/javascript" src="unit/test_shift_jis.js"></script>
<script type="text/javascript" src="unit/test_euc-kr.js"></script>
<script type="text/javascript" src="unit/test_iso-2022-kr.js"></script>
</body>
</html>

View File

@ -6,6 +6,5 @@
[test_gbk.js]
[test_hz-gb-2312.js]
[test_iso-2022-jp.js]
[test_iso-2022-kr.js]
[test_shift_jis.js]
[test_singlebytes.js]

File diff suppressed because one or more lines are too long

View File

@ -297,7 +297,7 @@ test(
test(
function () {
var encodings = ["utf-8", "ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "iso-2022-kr", "x-user-defined"];
var encodings = ["utf-8", "ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "x-user-defined"];
encodings.forEach(function (encoding) {
var string = '', bytes = [];
@ -308,8 +308,6 @@ test(
continue;
if (encoding === "iso-2022-jp" && i === 0x1B)
continue;
if (encoding === "iso-2022-kr" && (i === 0x0E || i === 0x0F || i === 0x1B))
continue;
string += String.fromCharCode(i);
bytes.push(i);
@ -344,7 +342,7 @@ test(
var utf_encodings = ["utf-8", "utf-16le", "utf-16be"];
var legacy_encodings = ["ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "iso-2022-kr", "x-user-defined"];
var legacy_encodings = ["ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "x-user-defined"];
utf_encodings.forEach(function(encoding) {
assert_equals(TextDecoder(encoding).encoding, encoding);

View File

@ -8,6 +8,5 @@ tail =
[test_gbk.js]
[test_hz-gb-2312.js]
[test_iso-2022-jp.js]
[test_iso-2022-kr.js]
[test_shift_jis.js]
[test_singlebytes.js]

View File

@ -185,5 +185,5 @@ const SequenceModel TIS620ThaiModel =
ThaiLangModel,
(float)0.926386,
false,
"TIS-620"
"windows-874"
};

View File

@ -22,7 +22,7 @@ https://bugzilla.mozilla.org/show_bug.cgi?id=488426
<script class="testbody" type="text/javascript">
/** Test for Bug 488426 **/
CharsetDetectionTests("bug488426_text.html",
"TIS-620",
"windows-874",
new Array("universal_charset_detector"));
</script>
</pre>

View File

@ -87,6 +87,7 @@ x-mac-hebrew.notForBrowser = true
x-imap4-modified-utf7.notForBrowser = true
utf-7.notForBrowser = true
ibm864.notForBrowser = true
replacement.notForBrowser = true
x-mac-arabic.isInternal = true
x-mac-farsi.isInternal = true
@ -95,6 +96,7 @@ x-imap4-modified-utf7.isInternal = true
utf-7.isInternal = true
t.61-8bit.isInternal = true
ibm864.isInternal = true
replacement.isInternal = true
t.61-8bit.notForOutgoing = true
utf-7.notForOutgoing = true
@ -110,7 +112,7 @@ iso-8859-8-e.notForOutgoing = true
iso-8859-8.notForOutgoing = true
iso-2022-kr.notForOutgoing = true
x-johab.notForOutgoing = true
replacement.notForOutgoing = true
// XXX : there are some entries only necessary for Gtk/Xlib builds
// to map XLFD registry-encoding pairs to langGroups. they can be
@ -119,6 +121,7 @@ x-johab.notForOutgoing = true
// XXX : todo: move to something based on BCP 47 (RFC 5646);
// these should primarily specify script (and sometimes region),
// but NOT language.
// See also https://bugzilla.mozilla.org/show_bug.cgi?id=756022
// e.g. x-western -> *-Latn-155 (Western Europe)
// x-central-euro -> *-Latn-151 (Eastern Europe)
// x-baltic -> *-Latn-154 (Northern Europe)
@ -194,6 +197,7 @@ utf-16be.LangGroup = x-unicode
utf-16le.LangGroup = x-unicode
utf-7.LangGroup = x-unicode
x-imap4-modified-utf7.LangGroup = x-unicode
replacement.LangGroup = x-unicode
viscii.LangGroup = x-western
x-viet-tcvn5712.LangGroup = x-western
x-viet-vps.LangGroup = x-western
@ -244,3 +248,4 @@ euc-kr.isMultibyte = true
x-johab.isMultibyte = true
utf-7.isMultibyte = true
utf-8.isMultibyte = true
replacement.isMultibyte = true

View File

@ -11,6 +11,7 @@ UNIFIED_SOURCES += [
'nsCP1252ToUnicode.cpp',
'nsISO88591ToUnicode.cpp',
'nsMacRomanToUnicode.cpp',
'nsReplacementToUnicode.cpp',
'nsScriptableUConv.cpp',
'nsTextToSubURI.cpp',
'nsUConvModule.cpp',

View File

@ -0,0 +1,56 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsReplacementToUnicode.h"
// A freshly constructed decoder has not yet reacted to any input.
nsReplacementToUnicode::nsReplacementToUnicode()
  : mSeenByte(false)
{
}
/**
 * Decodes a chunk of input for the "replacement" encoding.
 *
 * The actual bytes in aSrc are never inspected. The first call that carries
 * non-empty input either reports an error (when mErrBehavior is
 * kOnError_Signal) or writes a single U+FFFD to aDest; every later call
 * produces no output until Reset() is called.
 *
 * @param aSrc        input buffer (unused)
 * @param aSrcLength  in: number of input bytes; set to 0 when an error is
 *                    signalled and to -1 when there is no output room,
 *                    otherwise left untouched
 * @param aDest       output buffer; receives at most one code unit
 * @param aDestLength in: capacity of aDest; out: number of code units written
 *                    (left untouched when there is no output room)
 * @return NS_PARTIAL_MORE_INPUT, NS_PARTIAL_MORE_OUTPUT or
 *         NS_ERROR_ILLEGAL_INPUT as described inline
 */
NS_IMETHODIMP
nsReplacementToUnicode::Convert(const char* aSrc,
                                int32_t* aSrcLength,
                                PRUnichar* aDest,
                                int32_t* aDestLength)
{
  const bool inputIsEmpty = (*aSrcLength == 0);
  if (inputIsEmpty || mSeenByte) {
    // Either there is nothing to react to, or this decoder has already
    // reacted to input since the last Reset(): emit nothing.
    *aDestLength = 0;
    return NS_PARTIAL_MORE_INPUT;
  }

  // NOTE(review): mErrBehavior / kOnError_Signal are not declared in this
  // file; presumably inherited from nsBasicDecoderSupport -- confirm.
  const bool signalInsteadOfReplace = (mErrBehavior == kOnError_Signal);
  if (signalInsteadOfReplace) {
    // Report the error once, with nothing consumed and nothing written.
    mSeenByte = true;
    *aSrcLength = 0;
    *aDestLength = 0;
    return NS_ERROR_ILLEGAL_INPUT;
  }

  if (*aDestLength == 0) {
    // No room for the replacement character; mSeenByte stays false so that a
    // later call with output space can still emit it.
    *aSrcLength = -1;
    return NS_PARTIAL_MORE_OUTPUT;
  }

  // First non-empty input with room in the output: write exactly one U+FFFD.
  mSeenByte = true;
  *aDest = 0xFFFD;
  *aDestLength = 1;
  return NS_PARTIAL_MORE_INPUT;
}
/**
 * Reports how many UTF-16 code units decoding the given input would produce.
 *
 * @param aSrc        input buffer (unused)
 * @param aSrcLength  number of input bytes
 * @param aDestLength out: 1 if the input is non-empty and this decoder has not
 *                    yet reacted to input since the last Reset(), otherwise 0
 * @return NS_EXACT_LENGTH always
 */
NS_IMETHODIMP
nsReplacementToUnicode::GetMaxLength(const char* aSrc,
                                     int32_t aSrcLength,
                                     int32_t* aDestLength)
{
  const bool wouldEmitReplacement = !mSeenByte && aSrcLength > 0;
  *aDestLength = wouldEmitReplacement ? 1 : 0;
  return NS_EXACT_LENGTH;
}
/**
 * Returns the decoder to its initial state so that the next non-empty input
 * passed to Convert() is reacted to again.
 *
 * @return NS_OK always
 */
NS_IMETHODIMP
nsReplacementToUnicode::Reset()
{
  mSeenByte = false;
  return NS_OK;
}

View File

@ -0,0 +1,37 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsReplacementToUnicode_h_
#define nsReplacementToUnicode_h_
#include "nsUCSupport.h"
#define NS_REPLACEMENTTOUNICODE_CID \
{ 0xd24b24da, 0xc607, 0x489a, \
{ 0xb5, 0xf0, 0x67, 0x91, 0xf4, 0x45, 0x45, 0x6d } }
#define NS_REPLACEMENTTOUNICODE_CONTRACTID \
"@mozilla.org/intl/unicode/decoder;1?charset=replacement"
/**
 * Unicode decoder for the "replacement" charset (see
 * NS_REPLACEMENTTOUNICODE_CONTRACTID). It ignores the content of its input:
 * the first non-empty input yields a single U+FFFD (or an error, depending on
 * the configured error behavior) and all further input yields nothing until
 * Reset() is called.
 */
class nsReplacementToUnicode : public nsBasicDecoderSupport
{
public:
  nsReplacementToUnicode();

  // Emits at most one U+FFFD per stream; see the implementation for the exact
  // return codes and length updates.
  NS_IMETHOD Convert(const char* aSrc,
                     int32_t* aSrcLength,
                     PRUnichar* aDest,
                     int32_t* aDestLength);

  // Sets *aDestLength to 1 or 0 depending on whether Convert() would still
  // emit the replacement character for non-empty input.
  NS_IMETHOD GetMaxLength(const char* aSrc,
                          int32_t aSrcLength,
                          int32_t* aDestLength);

  // Clears mSeenByte so the decoder can be reused for a new stream.
  NS_IMETHOD Reset();

private:
  // Set by Convert() once it has reacted to non-empty input (by emitting
  // U+FFFD or signalling an error); cleared by Reset().
  bool mSeenByte;
};
#endif // nsReplacementToUnicode_h_

View File

@ -21,6 +21,7 @@
#include "nsISO88591ToUnicode.h"
#include "nsCP1252ToUnicode.h"
#include "nsMacRomanToUnicode.h"
#include "nsReplacementToUnicode.h"
#include "nsUTF8ToUnicode.h"
#include "nsUnicodeToISO88591.h"
#include "nsUnicodeToCP1252.h"
@ -219,6 +220,7 @@ NS_UCONV_REG_UNREG("ISO-8859-1", NS_ISO88591TOUNICODE_CID, NS_UNICODETOISO88591_
NS_UCONV_REG_UNREG("windows-1252", NS_CP1252TOUNICODE_CID, NS_UNICODETOCP1252_CID)
NS_UCONV_REG_UNREG("macintosh", NS_MACROMANTOUNICODE_CID, NS_UNICODETOMACROMAN_CID)
NS_UCONV_REG_UNREG("UTF-8", NS_UTF8TOUNICODE_CID, NS_UNICODETOUTF8_CID)
NS_UCONV_REG_UNREG("replacement", NS_REPLACEMENTTOUNICODE_CID, NS_UNICODETOUTF8_CID)
// ucvlatin
NS_UCONV_REG_UNREG("us-ascii", NS_ASCIITOUNICODE_CID, NS_UNICODETOASCII_CID)
@ -332,6 +334,7 @@ NS_CONVERTER_REGISTRY_END
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToUTF8)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF8ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsReplacementToUnicode)
// ucvlatin
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUTF7ToUnicode)
@ -506,6 +509,7 @@ NS_DEFINE_NAMED_CID(NS_ISO88591TOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_CP1252TOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_MACROMANTOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_UTF8TOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_REPLACEMENTTOUNICODE_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOISO88591_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOCP1252_CID);
NS_DEFINE_NAMED_CID(NS_UNICODETOMACROMAN_CID);
@ -690,6 +694,7 @@ static const mozilla::Module::CIDEntry kUConvCIDs[] = {
{ &kNS_ISO88591TOUNICODE_CID, false, nullptr, nsISO88591ToUnicodeConstructor },
{ &kNS_CP1252TOUNICODE_CID, false, nullptr, nsCP1252ToUnicodeConstructor },
{ &kNS_MACROMANTOUNICODE_CID, false, nullptr, nsMacRomanToUnicodeConstructor },
{ &kNS_REPLACEMENTTOUNICODE_CID, false, nullptr, nsReplacementToUnicodeConstructor },
{ &kNS_UTF8TOUNICODE_CID, false, nullptr, nsUTF8ToUnicodeConstructor },
{ &kNS_UNICODETOISO88591_CID, false, nullptr, nsUnicodeToISO88591Constructor },
{ &kNS_UNICODETOCP1252_CID, false, nullptr, nsUnicodeToCP1252Constructor },
@ -877,6 +882,7 @@ static const mozilla::Module::ContractIDEntry kUConvContracts[] = {
{ NS_ISO88591TOUNICODE_CONTRACTID, &kNS_ISO88591TOUNICODE_CID },
{ NS_CP1252TOUNICODE_CONTRACTID, &kNS_CP1252TOUNICODE_CID },
{ NS_MACROMANTOUNICODE_CONTRACTID, &kNS_MACROMANTOUNICODE_CID },
{ NS_REPLACEMENTTOUNICODE_CONTRACTID, &kNS_REPLACEMENTTOUNICODE_CID },
{ NS_UTF8TOUNICODE_CONTRACTID, &kNS_UTF8TOUNICODE_CID },
{ NS_UNICODETOISO88591_CONTRACTID, &kNS_UNICODETOISO88591_CID },
{ NS_UNICODETOCP1252_CONTRACTID, &kNS_UNICODETOCP1252_CID },

View File

@ -34,7 +34,10 @@ while (decoderList.hasMore()) {
// Skip UTF-16 variants. (Other non-ASCII compatible encodings will be
// ignored anyway because of bug 601429
if (decoder.substring(0, 6) == "UTF-16")
if (decoder.substring(0, 6) == "UTF-16" ||
decoder == "replacement" ||
decoder == "ISO-2022-KR" ||
decoder == "ISO-2022-CN")
continue;
data = encodeURI(testContent);

View File

@ -99,6 +99,7 @@ var encoderList = [
"gbk",
"HZ-GB-2312",
"gb18030",
"replacement",
];
var decoderList = [
@ -183,6 +184,7 @@ var decoderList = [
"gb18030",
"ISO-2022-KR",
"ISO-2022-CN",
"replacement",
];
function verifyList(aEnumerator, aList)

View File

@ -32,6 +32,10 @@ while (decoderList.hasMore()) {
data = encodeUTF16BE(testContent);
else if (decoder == "UTF-16" || decoder == "UTF-16LE")
data = encodeUTF16LE(testContent);
else if (decoder == "replacement" ||
decoder == "ISO-2022-KR" ||
decoder == "ISO-2022-CN")
continue;
else
data = encodeURI(testContent);
var dataURI = "data:text/html;charset=" + decoder + "," + data;

View File

@ -343,3 +343,6 @@ skip-if(B2G) include box-sizing/reftest.list
# invalidation - only run on B2G
skip-if(!B2G) include invalidation/reftest.list
# encodings
include ../../dom/encoding/test/reftest/reftest.list

View File

@ -181,7 +181,16 @@ nsUnicharStreamLoader::DetermineCharset()
do_GetService(kCharsetConverterManagerCID, &rv);
if (NS_FAILED(rv)) return rv;
rv = ccm->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mDecoder));
// Sadly, nsIUnicharStreamLoader is exposed to extensions, so we can't
// assume mozilla::css::Loader to be the only caller. Since legacy
// charset alias code doesn't know about the replacement encoding,
// special-case it here, but let other stuff go through legacy alias
// resolution for now.
if (mCharset.EqualsLiteral("replacement")) {
rv = ccm->GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mDecoder));
} else {
rv = ccm->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mDecoder));
}
if (NS_FAILED(rv)) return rv;
// Process the data into mBuffer

View File

@ -2,8 +2,6 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsICharsetConverterManager.h"
#include "nsServiceManagerUtils.h"
#include "nsEncoderDecoderUtils.h"
#include "nsTraceRefcnt.h"
@ -12,15 +10,12 @@
using mozilla::dom::EncodingUtils;
void
nsHtml5MetaScanner::sniff(nsHtml5ByteReadable* bytes, nsIUnicodeDecoder** decoder, nsACString& charset)
nsHtml5MetaScanner::sniff(nsHtml5ByteReadable* bytes, nsACString& charset)
{
readable = bytes;
stateLoop(stateSave);
readable = nullptr;
if (mUnicodeDecoder) {
mUnicodeDecoder.forget(decoder);
charset.Assign(mCharset);
}
charset.Assign(mCharset);
}
bool
@ -29,47 +24,17 @@ nsHtml5MetaScanner::tryCharset(nsString* charset)
// This code needs to stay in sync with
// nsHtml5StreamParser::internalEncodingDeclaration. Unfortunately, the
// trickery with member fields here leads to some copy-paste reuse. :-(
nsresult res = NS_OK;
nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &res);
if (NS_FAILED(res)) {
NS_ERROR("Could not get CharsetConverterManager service.");
return false;
}
nsAutoCString label;
CopyUTF16toUTF8(*charset, label);
nsAutoCString encoding;
CopyUTF16toUTF8(*charset, encoding);
encoding.Trim(" \t\r\n\f");
if (encoding.LowerCaseEqualsLiteral("utf-16") ||
encoding.LowerCaseEqualsLiteral("utf-16be") ||
encoding.LowerCaseEqualsLiteral("utf-16le")) {
if (!EncodingUtils::FindEncodingForLabel(label, encoding)) {
return false;
}
if (encoding.EqualsLiteral("UTF-16BE") ||
encoding.EqualsLiteral("UTF-16LE")) {
mCharset.Assign("UTF-8");
res = convManager->GetUnicodeDecoderRaw(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
if (NS_FAILED(res)) {
NS_ERROR("Could not get decoder for UTF-8.");
return false;
}
return true;
}
nsAutoCString preferred;
if (!EncodingUtils::FindEncodingForLabel(encoding, preferred)) {
return false;
}
if (preferred.LowerCaseEqualsLiteral("utf-16") ||
preferred.LowerCaseEqualsLiteral("utf-16be") ||
preferred.LowerCaseEqualsLiteral("utf-16le") ||
preferred.LowerCaseEqualsLiteral("utf-7") ||
preferred.LowerCaseEqualsLiteral("x-imap4-modified-utf7")) {
return false;
}
res = convManager->GetUnicodeDecoderRaw(preferred.get(), getter_AddRefs(mUnicodeDecoder));
if (res == NS_ERROR_UCONV_NOCONV) {
return false;
} else if (NS_FAILED(res)) {
NS_ERROR("Getting an encoding decoder failed in a bad way.");
mUnicodeDecoder = nullptr;
return false;
} else {
NS_ASSERTION(mUnicodeDecoder, "Getter nsresult and object don't match.");
mCharset.Assign(preferred);
return true;
}
mCharset.Assign(encoding);
return true;
}

View File

@ -3,10 +3,9 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
private:
nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder;
nsCString mCharset;
inline int32_t read() {
return readable->read();
}
public:
void sniff(nsHtml5ByteReadable* bytes, nsIUnicodeDecoder** decoder, nsACString& charset);
void sniff(nsHtml5ByteReadable* bytes, nsACString& charset);

View File

@ -238,8 +238,16 @@ nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
NS_ASSERTION(IsParserThread(), "Wrong thread!");
if (aConf == eBestAnswer || aConf == eSureAnswer) {
mFeedChardet = false; // just in case
nsAutoCString encoding;
if (!EncodingUtils::FindEncodingForLabel(nsDependentCString(aCharset),
encoding)) {
return NS_OK;
}
if (encoding.EqualsLiteral("replacement")) {
return NS_OK;
}
if (HasDecoder()) {
if (mCharset.Equals(aCharset)) {
if (mCharset.Equals(encoding)) {
NS_ASSERTION(mCharsetSource < kCharsetFromAutoDetection,
"Why are we running chardet at all?");
mCharsetSource = kCharsetFromAutoDetection;
@ -247,8 +255,7 @@ nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
} else {
// We've already committed to a decoder. Request a reload from the
// docshell.
nsAutoCString charset(aCharset);
mTreeBuilder->NeedsCharsetSwitchTo(charset,
mTreeBuilder->NeedsCharsetSwitchTo(encoding,
kCharsetFromAutoDetection,
0);
FlushTreeOpsAndDisarmTimer();
@ -257,7 +264,7 @@ nsHtml5StreamParser::Notify(const char* aCharset, nsDetectionConfident aConf)
} else {
// Got a confident answer from the sniffing buffer. That code will
// take care of setting up the decoder.
mCharset.Assign(aCharset);
mCharset.Assign(encoding);
mCharsetSource = kCharsetFromAutoDetection;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
}
@ -299,7 +306,8 @@ nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const
nsresult rv = NS_OK;
nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
NS_ENSURE_SUCCESS(rv, rv);
rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
rv = convManager->GetUnicodeDecoderRaw(mCharset.get(),
getter_AddRefs(mUnicodeDecoder));
if (rv == NS_ERROR_UCONV_NOCONV) {
mCharset.AssignLiteral("windows-1252"); // lower case is the raw form
mCharsetSource = kCharsetFromFallback;
@ -307,16 +315,6 @@ nsHtml5StreamParser::SetupDecodingAndWriteSniffingBufferAndCurrentSegment(const
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
}
NS_ENSURE_SUCCESS(rv, rv);
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount, aWriteCount);
}
nsresult
nsHtml5StreamParser::WriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment, // can be null
uint32_t aCount,
uint32_t* aWriteCount)
{
NS_ASSERTION(IsParserThread(), "Wrong thread!");
nsresult rv = NS_OK;
if (mSniffingBuffer) {
uint32_t writeCount;
rv = WriteStreamBytes(mSniffingBuffer, mSniffingLength, &writeCount);
@ -710,29 +708,22 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
// if we get here, there either was no BOM or the BOM sniffing isn't complete
// yet
MOZ_ASSERT(mCharsetSource != kCharsetFromByteOrderMark,
"Should not come here if BOM was found.");
MOZ_ASSERT(mCharsetSource != kCharsetFromOtherComponent,
"kCharsetFromOtherComponent is for XSLT.");
if (mBomState == BOM_SNIFFING_OVER &&
mCharsetSource >= kCharsetFromChannel) {
// There was no BOM and the charset came from channel or higher. mCharset
// still contains the charset from the channel or higher as set by an
mCharsetSource == kCharsetFromChannel) {
// There was no BOM and the charset came from channel. mCharset
// still contains the charset from the channel as set by an
// earlier call to SetDocumentCharset(), since we didn't find a BOM and
// overwrite mCharset.
nsCOMPtr<nsICharsetConverterManager> convManager =
do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID);
convManager->GetUnicodeDecoder(mCharset.get(),
getter_AddRefs(mUnicodeDecoder));
if (mUnicodeDecoder) {
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nullptr;
return WriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount,
aWriteCount);
} else {
// nsHTMLDocument is supposed to make sure this does not happen. Let's
// deal with this anyway, since who knows how kCharsetFromOtherComponent
// is used.
mCharsetSource = kCharsetFromFallback;
}
// overwrite mCharset. (Note that if the user has overridden the charset,
// we don't come here but check <meta> for XSS-dangerous charsets first.)
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount, aWriteCount);
}
if (!mMetaScanner && (mMode == NORMAL ||
@ -748,17 +739,31 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
nsHtml5ByteReadable readable(aFromSegment, aFromSegment +
countToSniffingLimit);
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
if (mUnicodeDecoder) {
// meta scan successful
nsAutoCString encoding;
mMetaScanner->sniff(&readable, encoding);
if (!encoding.IsEmpty()) {
// meta scan successful; honor overrides unless meta is XSS-dangerous
if ((mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced) &&
EncodingUtils::IsAsciiCompatible(encoding)) {
// Honor override
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment, aCount, aWriteCount);
}
mCharset.Assign(encoding);
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nullptr;
return WriteSniffingBufferAndCurrentSegment(aFromSegment, aCount,
aWriteCount);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment, aCount, aWriteCount);
}
}
if (mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced) {
// meta not found, honor override
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(
aFromSegment, aCount, aWriteCount);
}
return FinalizeSniffing(aFromSegment, aCount, aWriteCount,
countToSniffingLimit);
}
@ -766,16 +771,23 @@ nsHtml5StreamParser::SniffStreamBytes(const uint8_t* aFromSegment,
// not the last buffer
if (mMode == NORMAL || mMode == VIEW_SOURCE_HTML || mMode == LOAD_AS_DATA) {
nsHtml5ByteReadable readable(aFromSegment, aFromSegment + aCount);
mMetaScanner->sniff(&readable, getter_AddRefs(mUnicodeDecoder), mCharset);
if (mUnicodeDecoder) {
// meta scan successful
nsAutoCString encoding;
mMetaScanner->sniff(&readable, encoding);
if (!encoding.IsEmpty()) {
// meta scan successful; honor overrides unless meta is XSS-dangerous
if ((mCharsetSource == kCharsetFromParentForced ||
mCharsetSource == kCharsetFromUserForced) &&
EncodingUtils::IsAsciiCompatible(encoding)) {
// Honor override
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount, aWriteCount);
}
mCharset.Assign(encoding);
mCharsetSource = kCharsetFromMetaPrescan;
mFeedChardet = false;
mTreeBuilder->SetDocumentCharset(mCharset, mCharsetSource);
mMetaScanner = nullptr;
return WriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount,
aWriteCount);
return SetupDecodingAndWriteSniffingBufferAndCurrentSegment(aFromSegment,
aCount, aWriteCount);
}
}
@ -975,9 +987,11 @@ nsHtml5StreamParser::OnStartRequest(nsIRequest* aRequest, nsISupports* aContext)
mFeedChardet = false;
// Instantiate the converter here to avoid BOM sniffing.
nsCOMPtr<nsICharsetConverterManager> convManager = do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
nsCOMPtr<nsICharsetConverterManager> convManager =
do_GetService(NS_CHARSETCONVERTERMANAGER_CONTRACTID, &rv);
NS_ENSURE_SUCCESS(rv, rv);
rv = convManager->GetUnicodeDecoder(mCharset.get(), getter_AddRefs(mUnicodeDecoder));
rv = convManager->GetUnicodeDecoderRaw(mCharset.get(),
getter_AddRefs(mUnicodeDecoder));
// if we failed to get a decoder, there will be fallback, so don't propagate
// the error.
if (NS_FAILED(rv)) {

View File

@ -322,21 +322,6 @@ class nsHtml5StreamParser : public nsIStreamListener,
uint32_t aCount,
uint32_t* aWriteCount);
/**
* Write the sniffing buffer into the Unicode decoder followed by the
* current network buffer.
*
* @param aFromSegment The current network buffer or null if the sniffing
* buffer is being flushed due to network stream ending.
* @param aCount The number of bytes in aFromSegment (ignored if
* aFromSegment is null)
* @param aWriteCount Return value for how many bytes got read from the
* buffer.
*/
nsresult WriteSniffingBufferAndCurrentSegment(const uint8_t* aFromSegment,
uint32_t aCount,
uint32_t* aWriteCount);
/**
* Initialize the Unicode decoder, mark the BOM as the source and
* drop the sniffer.