Bug 910211 - Guess the fallback encoding from the top-level domain when feasible. r=emk.

This commit is contained in:
Henri Sivonen 2014-02-06 11:08:01 +02:00
parent fa18f78ac7
commit c1e3afbe3c
15 changed files with 432 additions and 13 deletions

View File

@ -205,3 +205,10 @@ https://www2.w3c-test.org:443
https://xn--n8j6ds53lwwkrqhv28a.w3c-test.org:443
https://xn--lve-6lad.w3c-test.org:443
http://test.w3.org:80
# Hosts for testing TLD-based fallback encoding
http://example.tw:80 privileged
http://example.cn:80 privileged
http://example.co.jp:80 privileged
http://example.fi:80 privileged

View File

@ -435,6 +435,66 @@ nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell,
}
}
/**
 * Tries to pick a fallback encoding based on the top-level domain of the
 * document URI.
 *
 * Does nothing when the charset source is already at least
 * kCharsetFromTopLevelDomain, when TLD-based guessing is turned off, when
 * there is no document URI, or when the host does not have a usable,
 * participating, non-numeric last label.
 *
 * @param aCharsetSource in/out: the current charset source; set to
 *                       kCharsetFromTopLevelDomain when a guess is made
 * @param aCharset       out: filled in by
 *                       FallbackEncoding::FromTopLevelDomain() when a guess
 *                       is made; untouched otherwise
 */
void
nsHTMLDocument::TryTLD(int32_t& aCharsetSource, nsACString& aCharset)
{
  // Bail out early if a source of equal or higher priority already decided,
  // if the feature is disabled, or if there is no URI to inspect.
  if (aCharsetSource >= kCharsetFromTopLevelDomain ||
      !FallbackEncoding::sGuessFallbackFromTopLevelDomain ||
      !mDocumentURI) {
    return;
  }

  nsAutoCString asciiHost;
  mDocumentURI->GetAsciiHost(asciiHost);
  if (asciiHost.IsEmpty()) {
    return;
  }

  // A DNS-absolute host ends with a single dot: drop it. If nothing is left,
  // or if yet another dot is now at the end, the host is too weird to use.
  if (asciiHost.Last() == '.') {
    asciiHost.SetLength(asciiHost.Length() - 1);
    if (asciiHost.IsEmpty() || asciiHost.Last() == '.') {
      return;
    }
  }

  const int32_t lastDot = asciiHost.RFindChar('.');
  if (lastDot == kNotFound) {
    // Intranet host, Gecko-internal URL or IPv6 address.
    return;
  }

  // The host does not end with a dot, so at least one character follows
  // the last dot and the substring below is non-empty.
  const int32_t labelStart = lastDot + 1;
  nsAutoCString lastLabel;
  ToLowerCase(Substring(asciiHost, labelStart,
                        asciiHost.Length() - labelStart),
              lastLabel);

  // Generic TLDs and country TLDs that need more research are excluded.
  if (!FallbackEncoding::IsParticipatingTopLevelDomain(lastLabel)) {
    return;
  }

  // An all-digit last label means we are looking at an IPv4 address.
  size_t pos = 0;
  while (pos < lastLabel.Length()) {
    const char ch = lastLabel.CharAt(pos);
    if (ch < '0' || ch > '9') {
      break;
    }
    ++pos;
  }
  if (pos == lastLabel.Length()) {
    return;
  }

  aCharsetSource = kCharsetFromTopLevelDomain;
  FallbackEncoding::FromTopLevelDomain(lastLabel, aCharset);
}
void
nsHTMLDocument::TryFallback(int32_t& aCharsetSource, nsACString& aCharset)
{
@ -661,6 +721,7 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
TryCacheCharset(cachingChan, charsetSource, charset);
}
TryTLD(charsetSource, charset);
TryFallback(charsetSource, charset);
if (wyciwygChannel) {

View File

@ -313,6 +313,7 @@ protected:
nsACString& aCharset);
void TryParentCharset(nsIDocShell* aDocShell,
int32_t& charsetSource, nsACString& aCharset);
// Guesses the encoding from the top-level domain of the document URI and,
// when a guess is made, updates aCharsetSource and aCharset accordingly.
void TryTLD(int32_t& aCharsetSource, nsACString& aCharset);
static void TryFallback(int32_t& aCharsetSource, nsACString& aCharset);
// Override so we can munge the charset on our wyciwyg channel as needed.

View File

@ -1994,6 +1994,10 @@ nsDocShell::GatherCharsetMenuTelemetry()
int32_t charsetSource = doc->GetDocumentCharacterSetSource();
switch (charsetSource) {
case kCharsetFromTopLevelDomain:
// Unlabeled doc on a domain that we map to a fallback encoding
Telemetry::Accumulate(Telemetry::CHARSET_OVERRIDE_SITUATION, 7);
break;
case kCharsetFromFallback:
case kCharsetFromDocTypeDefault:
case kCharsetFromCache:

View File

@ -17,7 +17,16 @@ static const char* localesFallbacks[][3] = {
#include "localesfallbacks.properties.h"
};
// Table of top-level domains and their associated fallback encodings,
// pulled in from the header generated from domainsfallbacks.properties.
// Searched by FromTopLevelDomain().
static const char* domainsFallbacks[][3] = {
#include "domainsfallbacks.properties.h"
};
// Table of top-level domains that are excluded from TLD-based guessing,
// pulled in from the header generated from
// nonparticipatingdomains.properties. Only the presence of a domain matters
// to IsParticipatingTopLevelDomain(); the looked-up value is discarded.
static const char* nonParticipatingDomains[][3] = {
#include "nonparticipatingdomains.properties.h"
};
FallbackEncoding* FallbackEncoding::sInstance = nullptr;
// Starts out true; Initialize() registers it as a bool var cache for the
// "intl.charset.fallback.tld" pref.
bool FallbackEncoding::sGuessFallbackFromTopLevelDomain = true;
FallbackEncoding::FallbackEncoding()
{
@ -121,6 +130,8 @@ FallbackEncoding::Initialize()
Preferences::RegisterCallback(FallbackEncoding::PrefChanged,
"general.useragent.locale",
nullptr);
Preferences::AddBoolVarCache(&sGuessFallbackFromTopLevelDomain,
"intl.charset.fallback.tld");
}
void
@ -132,5 +143,26 @@ FallbackEncoding::Shutdown()
FallbackEncoding::sInstance = nullptr;
}
bool
FallbackEncoding::IsParticipatingTopLevelDomain(const nsACString& aTLD)
{
nsAutoCString dummy;
return NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
nonParticipatingDomains,
ArrayLength(nonParticipatingDomains),
aTLD,
dummy));
}
// Looks up aTLD in the domainsFallbacks table and writes the associated
// encoding to aFallback. Domains without an entry get windows-1252.
void
FallbackEncoding::FromTopLevelDomain(const nsACString& aTLD,
                                     nsACString& aFallback)
{
  const bool missing = NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
    domainsFallbacks, ArrayLength(domainsFallbacks), aTLD, aFallback));
  if (!missing) {
    // The search already filled in aFallback.
    return;
  }
  aFallback.AssignLiteral("windows-1252");
}
} // namespace dom
} // namespace mozilla

View File

@ -14,6 +14,11 @@ class FallbackEncoding
{
public:
/**
* Whether FromTopLevelDomain() should be used.
*/
static bool sGuessFallbackFromTopLevelDomain;
/**
* Gets the locale-dependent fallback encoding for legacy HTML and plain
* text content.
@ -22,6 +27,23 @@ public:
*/
static void FromLocale(nsACString& aFallback);
/**
* Checks if it is appropriate to call FromTopLevelDomain() for a given TLD.
*
* @param aTLD the top-level domain (in Punycode)
* @return true if OK to call FromTopLevelDomain()
*/
static bool IsParticipatingTopLevelDomain(const nsACString& aTLD);
/**
* Gets a top-level domain-dependent fallback encoding for legacy HTML
* and plain text content
*
* @param aTLD the top-level domain (in Punycode)
* @param aFallback the outparam for the fallback encoding
*/
static void FromTopLevelDomain(const nsACString& aTLD, nsACString& aFallback);
// public API ends here!
/**

View File

@ -9,3 +9,7 @@ labelsencodings.properties.h: $(PROPS2ARRAYS) labelsencodings.properties
$(PYTHON) $^ $@
localesfallbacks.properties.h: $(PROPS2ARRAYS) localesfallbacks.properties
$(PYTHON) $^ $@
# Generate the C array headers for the TLD-based fallback tables by running
# the $(PROPS2ARRAYS) script on the corresponding .properties files.
domainsfallbacks.properties.h: $(PROPS2ARRAYS) domainsfallbacks.properties
$(PYTHON) $^ $@
nonparticipatingdomains.properties.h: $(PROPS2ARRAYS) nonparticipatingdomains.properties
$(PYTHON) $^ $@

View File

@ -0,0 +1,167 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# This file contains educated guesses about which top-level domains are
# likely to host legacy content that assumes a non-windows-1252 encoding.
# Punycode TLDs are included on the theory that legacy content might appear
# behind those relatively new TLDs if DNS just points to a legacy server.
#
# Domains for which a confident-enough educated guess is missing are
# listed in nonparticipatingdomains.properties. Domains that are listed
# neither there nor here get windows-1252 as the associated fallback.
#
# The list below includes Arabic-script TLDs not on IANA list but on the
# ICANN list:
# http://www.icann.org/en/resources/idn/fast-track/string-evaluation-completion
# Otherwise, the list includes non-windows-1252-affiliated country TLDs from
# https://data.iana.org/TLD/tlds-alpha-by-domain.txt
#
# The guesses are assigned as follows:
# * If the country has a dominant country-affiliated language and that language
# is part of the languages to fallbacks mapping, use the encoding for that
# language from that mapping.
# * Use windows-1256 for countries that have a dominant Arabic-script
# language or all of whose languages are Arabic-script languages.
# * Use windows-1251 likewise but for Cyrillic script.
ae=windows-1256
xn--mgbaam7a8h=windows-1256
af=windows-1256
bg=windows-1251
bh=windows-1256
by=windows-1251
cn=gbk
xn--fiqs8s=gbk
# Assume that Traditional Chinese TLD is meant to work if URL input happens to
# be in the traditional mode. Expect content to be simplified anyway.
xn--fiqz9s=gbk
cz=windows-1250
dz=windows-1256
xn--lgbbat1ad8j=windows-1256
ee=windows-1257
eg=windows-1256
xn--wgbh1c=windows-1256
gr=ISO-8859-7
hk=Big5-HKSCS
xn--j6w193g=Big5-HKSCS
hr=windows-1250
hu=ISO-8859-2
iq=windows-1256
ir=windows-1256
xn--mgba3a4f16a=windows-1256
jo=windows-1256
xn--mgbayh7gpa=windows-1256
jp=Shift_JIS
kg=windows-1251
kp=EUC-KR
kr=EUC-KR
xn--3e0b707e=EUC-KR
kw=windows-1256
kz=windows-1251
xn--80ao21a=windows-1251
lb=windows-1256
lt=windows-1257
lv=windows-1257
ma=windows-1256
xn--mgbc0a9azcg=windows-1256
mk=windows-1251
mn=windows-1251
xn--l1acc=windows-1251
mo=Big5
# my is listed in nonparticipatingdomains.properties
xn--mgbx4cd0ab=windows-1256
om=windows-1256
xn--mgb9awbf=windows-1256
# pk is listed in nonparticipatingdomains.properties
xn--mgbai9azgqp6j=windows-1256
pl=ISO-8859-2
ps=windows-1256
xn--ygbi2ammx=windows-1256
qa=windows-1256
xn--wgbl6a=windows-1256
rs=windows-1251
xn--90a3ac=windows-1251
ru=windows-1251
xn--p1ai=windows-1251
sa=windows-1256
xn--mgberp4a5d4ar=windows-1256
sd=windows-1256
xn--mgbpl2fh=windows-1256
sg=gbk
xn--yfro4i67o=gbk
si=ISO-8859-2
sk=windows-1250
su=windows-1251
sy=windows-1256
xn--mgbtf8fl=windows-1256
th=windows-874
xn--o3cw4h=windows-874
tj=windows-1251
tn=windows-1256
xn--pgbs0dh=windows-1256
tr=windows-1254
tw=Big5
# Assume that the Simplified Chinese TLD is meant to work when URL input
# happens in the simplified mode. Assume content is traditional anyway.
xn--kprw13d=Big5
xn--kpry57d=Big5
ua=windows-1251
xn--j1amh=windows-1251
uz=windows-1251
vn=windows-1258
ye=windows-1256
xn--mgb2ddes=windows-1256

View File

@ -28,6 +28,8 @@ LOCAL_INCLUDES += [
]
GENERATED_FILES += [
'domainsfallbacks.properties.h',
'labelsencodings.properties.h',
'localesfallbacks.properties.h',
'nonparticipatingdomains.properties.h',
]

View File

@ -0,0 +1,51 @@
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Top-level domains listed here do not participate in TLD-based guessing.
#
# We should do Web crawls to see if domains listed here can migrate to
# domainsfallbacks.properties.
#
# The value to the right of the = sign is ignored and serves as a placeholder.
# Generic
com=windows-1252
net=windows-1252
org=windows-1252
# No Firefox localization for Azeri
az=windows-1254
# windows-1251 or windows-1250?
ba=???
# ISO-8859-7 or windows-1254?
cy=???
# Is there enough unlabeled windows-1256 content for a windows-1255 fallback
# to break too much?
il=windows-1255
# Out-of-country English use
ly=windows-1256
# Out-of-country English use
# md=windows-1250
# Out-of-country English use
# me=windows-1251
# Malaysia has an Arabic-script TLD, official script is Latin, possibly Chinese-script publications
my=???
# No Firefox localization for Urdu; potential for minority-language sites
# relying on windows-1252 hacks.
pk=windows-1256
# The Romanian localization says windows-1252, even though the Windows legacy
# differs.
ro=windows-1250
tm=windows-1250

View File

@ -0,0 +1,7 @@
<!DOCTYPE html>
<script>
// Posts the encoding this document ended up with to the parent window.
function report() {
  var detectedEncoding = document.characterSet;
  window.parent.postMessage(detectedEncoding, "*");
}
</script>
<body onload="report();">

View File

@ -7,6 +7,7 @@ support-files =
file_utf16_le_bom.js
file_utf16_le_bom.xhtml
file_utf16_le_nobom.xhtml
file_TLD.html
worker_helper.js
[test_BOMEncoding.js]
@ -16,4 +17,5 @@ support-files =
[test_TextEncoder.js]
[test_stringencoding.html]
[test_submit_euckr.html]
[test_TLD.html]
[test_utf16_files.html]

View File

@ -0,0 +1,57 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=910211
-->
<head>
<meta charset="utf-8">
<title>Test for Bug 910211</title>
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
<script type="application/javascript">
/** Test for Bug 910211 **/
SimpleTest.waitForExplicitFinish();
// Each entry pairs a host suffix (appended to "http://example." by next())
// with the encoding that is compared against the message posted back by the
// loaded frame.
var tlds = [
{'tld': 'tw', 'encoding': 'Big5'},
{'tld': 'cn', 'encoding': 'gbk'},
{'tld': 'co.jp', 'encoding': 'Shift_JIS'},
{'tld': 'fi', 'encoding': 'windows-1252'},
];
// The <iframe> used to load each test URL; assigned in runTest().
var iframe = null;
// The entry of tlds whose result is currently awaited; assigned in next().
var current = null;
// Entry point (body onload): wires up the message listener and starts the
// first load.
function runTest() {
  var frames = document.getElementsByTagName("iframe");
  iframe = frames[0];
  // Every "message" event is handled by next(), which checks the result and
  // moves on to the following entry.
  window.addEventListener("message", next);
  // No result to check yet, so pass null.
  next(null);
}
// Checks the result of the previous load (if any) and then either loads the
// next entry of tlds into the iframe or finishes the test.
function next(event) {
  if (event) {
    // event.data is the value posted by the frame that was just loaded.
    is(event.data, current.encoding, "Got bad encoding for " + current.tld);
  }
  current = tlds.shift();
  if (current) {
    iframe.src = "http://example." + current.tld + "/tests/dom/encoding/test/file_TLD.html";
    return;
  }
  // The list is exhausted.
  SimpleTest.finish();
}
</script>
</head>
<body onload="runTest();">
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=910211">Mozilla Bug 910211</a>
<p id="display"></p>
<div id="content" style="display: none">
<iframe></iframe>
</div>
<pre id="test">
</pre>
</body>
</html>

View File

@ -1392,6 +1392,7 @@ pref("intl.charsetmenu.composer.cache", "");
pref("intl.charsetmenu.browser.cache.size", 5);
pref("intl.charset.detector", "chrome://global/locale/intl.properties");
pref("intl.charset.fallback.override", "");
// Whether to guess the fallback encoding from the top-level domain.
pref("intl.charset.fallback.tld", true);
pref("intl.ellipsis", "chrome://global-platform/locale/intl.properties");
pref("intl.locale.matchOS", false);
// fallback charset list for Unicode conversion (converting from Unicode)

View File

@ -8,18 +8,19 @@
// note: the value order defines the priority; higher numbers take priority
#define kCharsetUninitialized 0
#define kCharsetFromFallback 1
#define kCharsetFromDocTypeDefault 2 // This and up confident for XHR
#define kCharsetFromCache 3
#define kCharsetFromParentFrame 4
#define kCharsetFromAutoDetection 5
#define kCharsetFromHintPrevDoc 6
#define kCharsetFromMetaPrescan 7 // this one and smaller: HTML5 Tentative
#define kCharsetFromMetaTag 8 // this one and greater: HTML5 Confident
#define kCharsetFromIrreversibleAutoDetection 9
#define kCharsetFromChannel 10
#define kCharsetFromOtherComponent 11
#define kCharsetFromParentForced 12 // propagates to child frames
#define kCharsetFromUserForced 13 // propagates to child frames
#define kCharsetFromByteOrderMark 14
#define kCharsetFromTopLevelDomain 2 // fallback guessed from the top-level domain
#define kCharsetFromDocTypeDefault 3 // This and up confident for XHR
#define kCharsetFromCache 4
#define kCharsetFromParentFrame 5
#define kCharsetFromAutoDetection 6
#define kCharsetFromHintPrevDoc 7
#define kCharsetFromMetaPrescan 8 // this one and smaller: HTML5 Tentative
#define kCharsetFromMetaTag 9 // this one and greater: HTML5 Confident
#define kCharsetFromIrreversibleAutoDetection 10
#define kCharsetFromChannel 11
#define kCharsetFromOtherComponent 12
#define kCharsetFromParentForced 13 // propagates to child frames
#define kCharsetFromUserForced 14 // propagates to child frames
#define kCharsetFromByteOrderMark 15
#endif /* nsCharsetSource_h_ */