mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Bug 910211 - Guess the fallback encoding from the top-level domain when feasible. r=emk.
This commit is contained in:
parent
fa18f78ac7
commit
c1e3afbe3c
@ -205,3 +205,10 @@ https://www2.w3c-test.org:443
|
||||
https://xn--n8j6ds53lwwkrqhv28a.w3c-test.org:443
|
||||
https://xn--lve-6lad.w3c-test.org:443
|
||||
http://test.w3.org:80
|
||||
|
||||
# Hosts for testing TLD-based fallback encoding
|
||||
http://example.tw:80 privileged
|
||||
http://example.cn:80 privileged
|
||||
http://example.co.jp:80 privileged
|
||||
http://example.fi:80 privileged
|
||||
|
||||
|
@ -435,6 +435,66 @@ nsHTMLDocument::TryParentCharset(nsIDocShell* aDocShell,
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
nsHTMLDocument::TryTLD(int32_t& aCharsetSource, nsACString& aCharset)
|
||||
{
|
||||
if (aCharsetSource >= kCharsetFromTopLevelDomain) {
|
||||
return;
|
||||
}
|
||||
if (!FallbackEncoding::sGuessFallbackFromTopLevelDomain) {
|
||||
return;
|
||||
}
|
||||
if (!mDocumentURI) {
|
||||
return;
|
||||
}
|
||||
nsAutoCString host;
|
||||
mDocumentURI->GetAsciiHost(host);
|
||||
if (host.IsEmpty()) {
|
||||
return;
|
||||
}
|
||||
// First let's see if the host is DNS-absolute and ends with a dot and
|
||||
// get rid of that one.
|
||||
if (host.Last() == '.') {
|
||||
host.SetLength(host.Length() - 1);
|
||||
if (host.IsEmpty()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
// If we still have a dot, the host is weird, so let's continue only
|
||||
// if we have something other than a dot now.
|
||||
if (host.Last() == '.') {
|
||||
return;
|
||||
}
|
||||
int32_t index = host.RFindChar('.');
|
||||
if (index == kNotFound) {
|
||||
// We have an intranet host, Gecko-internal URL or an IPv6 address.
|
||||
return;
|
||||
}
|
||||
// Since the string didn't end with a dot and we found a dot,
|
||||
// there is at least one character between the dot and the end of
|
||||
// the string, so taking the substring below is safe.
|
||||
nsAutoCString tld;
|
||||
ToLowerCase(Substring(host, index + 1, host.Length() - (index + 1)), tld);
|
||||
// Reject generic TLDs and country TLDs that need more research
|
||||
if (!FallbackEncoding::IsParticipatingTopLevelDomain(tld)) {
|
||||
return;
|
||||
}
|
||||
// Check if we have an IPv4 address
|
||||
bool seenNonDigit = false;
|
||||
for (size_t i = 0; i < tld.Length(); ++i) {
|
||||
char c = tld.CharAt(i);
|
||||
if (c < '0' || c > '9') {
|
||||
seenNonDigit = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!seenNonDigit) {
|
||||
return;
|
||||
}
|
||||
aCharsetSource = kCharsetFromTopLevelDomain;
|
||||
FallbackEncoding::FromTopLevelDomain(tld, aCharset);
|
||||
}
|
||||
|
||||
void
|
||||
nsHTMLDocument::TryFallback(int32_t& aCharsetSource, nsACString& aCharset)
|
||||
{
|
||||
@ -661,6 +721,7 @@ nsHTMLDocument::StartDocumentLoad(const char* aCommand,
|
||||
TryCacheCharset(cachingChan, charsetSource, charset);
|
||||
}
|
||||
|
||||
TryTLD(charsetSource, charset);
|
||||
TryFallback(charsetSource, charset);
|
||||
|
||||
if (wyciwygChannel) {
|
||||
|
@ -313,6 +313,7 @@ protected:
|
||||
nsACString& aCharset);
|
||||
void TryParentCharset(nsIDocShell* aDocShell,
|
||||
int32_t& charsetSource, nsACString& aCharset);
|
||||
void TryTLD(int32_t& aCharsetSource, nsACString& aCharset);
|
||||
static void TryFallback(int32_t& aCharsetSource, nsACString& aCharset);
|
||||
|
||||
// Override so we can munge the charset on our wyciwyg channel as needed.
|
||||
|
@ -1994,6 +1994,10 @@ nsDocShell::GatherCharsetMenuTelemetry()
|
||||
|
||||
int32_t charsetSource = doc->GetDocumentCharacterSetSource();
|
||||
switch (charsetSource) {
|
||||
case kCharsetFromTopLevelDomain:
|
||||
// Unlabeled doc on a domain that we map to a fallback encoding
|
||||
Telemetry::Accumulate(Telemetry::CHARSET_OVERRIDE_SITUATION, 7);
|
||||
break;
|
||||
case kCharsetFromFallback:
|
||||
case kCharsetFromDocTypeDefault:
|
||||
case kCharsetFromCache:
|
||||
|
@ -17,7 +17,16 @@ static const char* localesFallbacks[][3] = {
|
||||
#include "localesfallbacks.properties.h"
|
||||
};
|
||||
|
||||
static const char* domainsFallbacks[][3] = {
|
||||
#include "domainsfallbacks.properties.h"
|
||||
};
|
||||
|
||||
static const char* nonParticipatingDomains[][3] = {
|
||||
#include "nonparticipatingdomains.properties.h"
|
||||
};
|
||||
|
||||
FallbackEncoding* FallbackEncoding::sInstance = nullptr;
|
||||
bool FallbackEncoding::sGuessFallbackFromTopLevelDomain = true;
|
||||
|
||||
FallbackEncoding::FallbackEncoding()
|
||||
{
|
||||
@ -121,6 +130,8 @@ FallbackEncoding::Initialize()
|
||||
Preferences::RegisterCallback(FallbackEncoding::PrefChanged,
|
||||
"general.useragent.locale",
|
||||
nullptr);
|
||||
Preferences::AddBoolVarCache(&sGuessFallbackFromTopLevelDomain,
|
||||
"intl.charset.fallback.tld");
|
||||
}
|
||||
|
||||
void
|
||||
@ -132,5 +143,26 @@ FallbackEncoding::Shutdown()
|
||||
FallbackEncoding::sInstance = nullptr;
|
||||
}
|
||||
|
||||
bool
|
||||
FallbackEncoding::IsParticipatingTopLevelDomain(const nsACString& aTLD)
|
||||
{
|
||||
nsAutoCString dummy;
|
||||
return NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
|
||||
nonParticipatingDomains,
|
||||
ArrayLength(nonParticipatingDomains),
|
||||
aTLD,
|
||||
dummy));
|
||||
}
|
||||
|
||||
void
|
||||
FallbackEncoding::FromTopLevelDomain(const nsACString& aTLD,
|
||||
nsACString& aFallback)
|
||||
{
|
||||
if (NS_FAILED(nsUConvPropertySearch::SearchPropertyValue(
|
||||
domainsFallbacks, ArrayLength(domainsFallbacks), aTLD, aFallback))) {
|
||||
aFallback.AssignLiteral("windows-1252");
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace dom
|
||||
} // namespace mozilla
|
||||
|
@ -14,6 +14,11 @@ class FallbackEncoding
|
||||
{
|
||||
public:
|
||||
|
||||
/**
|
||||
* Whether FromTopLevelDomain() should be used.
|
||||
*/
|
||||
static bool sGuessFallbackFromTopLevelDomain;
|
||||
|
||||
/**
|
||||
* Gets the locale-dependent fallback encoding for legacy HTML and plain
|
||||
* text content.
|
||||
@ -22,6 +27,23 @@ public:
|
||||
*/
|
||||
static void FromLocale(nsACString& aFallback);
|
||||
|
||||
/**
|
||||
* Checks if it is appropriate to call FromTopLevelDomain() for a given TLD.
|
||||
*
|
||||
* @param aTLD the top-level domain (in Punycode)
|
||||
* @return true if OK to call FromTopLevelDomain()
|
||||
*/
|
||||
static bool IsParticipatingTopLevelDomain(const nsACString& aTLD);
|
||||
|
||||
/**
|
||||
* Gets a top-level domain-dependent fallback encoding for legacy HTML
|
||||
* and plain text content
|
||||
*
|
||||
* @param aTLD the top-level domain (in Punycode)
|
||||
* @param aFallback the outparam for the fallback encoding
|
||||
*/
|
||||
static void FromTopLevelDomain(const nsACString& aTLD, nsACString& aFallback);
|
||||
|
||||
// public API ends here!
|
||||
|
||||
/**
|
||||
|
@ -9,3 +9,7 @@ labelsencodings.properties.h: $(PROPS2ARRAYS) labelsencodings.properties
|
||||
$(PYTHON) $^ $@
|
||||
localesfallbacks.properties.h: $(PROPS2ARRAYS) localesfallbacks.properties
|
||||
$(PYTHON) $^ $@
|
||||
domainsfallbacks.properties.h: $(PROPS2ARRAYS) domainsfallbacks.properties
|
||||
$(PYTHON) $^ $@
|
||||
nonparticipatingdomains.properties.h: $(PROPS2ARRAYS) nonparticipatingdomains.properties
|
||||
$(PYTHON) $^ $@
|
||||
|
167
dom/encoding/domainsfallbacks.properties
Normal file
167
dom/encoding/domainsfallbacks.properties
Normal file
@ -0,0 +1,167 @@
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# This file contains educated guesses about which top-level domains are
|
||||
# likely to host legacy content that assumes a non-windows-1252 encoding.
|
||||
# Punycode TLDs are included on the theory that legacy content might appear
|
||||
# behind those relatively new TLDs if DNS just points to a legacy server.
|
||||
#
|
||||
# Domains for which a confident-enough educated guess is missing are
|
||||
# listed in nonparticipatingdomains.properties. Domains that are listed
|
||||
# neither there nor here get windows-1252 as the associated fallback.
|
||||
#
|
||||
# The list below includes Arabic-script TLDs not on IANA list but on the
|
||||
# ICANN list:
|
||||
# http://www.icann.org/en/resources/idn/fast-track/string-evaluation-completion
|
||||
# Otherwise, the list includes non-windows-1252-affiliated country TLDs from
|
||||
# https://data.iana.org/TLD/tlds-alpha-by-domain.txt
|
||||
#
|
||||
# The guesses are assigned as follows:
|
||||
# * If the country has a dominant country-affiliated language and that language
|
||||
# is part of the languages to fallbacks mapping, use the encoding for that
|
||||
# language from that mapping.
|
||||
# * Use windows-1256 for countries that have a dominant Arabic-script
|
||||
# language or whose all languages are Arabic-script languages.
|
||||
# * Use windows-1251 likewise but for Cyrillic script.
|
||||
|
||||
ae=windows-1256
|
||||
xn--mgbaam7a8h=windows-1256
|
||||
|
||||
af=windows-1256
|
||||
|
||||
bg=windows-1251
|
||||
|
||||
bh=windows-1256
|
||||
|
||||
by=windows-1251
|
||||
|
||||
cn=gbk
|
||||
xn--fiqs8s=gbk
|
||||
# Assume that Traditional Chinese TLD is meant to work if URL input happens to
|
||||
# be in the traditional mode. Expect content to be simplified anyway.
|
||||
xn--fiqz9s=gbk
|
||||
|
||||
cz=windows-1250
|
||||
|
||||
dz=windows-1256
|
||||
xn--lgbbat1ad8j=windows-1256
|
||||
|
||||
ee=windows-1257
|
||||
|
||||
eg=windows-1256
|
||||
xn--wgbh1c=windows-1256
|
||||
|
||||
gr=ISO-8859-7
|
||||
|
||||
hk=Big5-HKSCS
|
||||
xn--j6w193g=Big5-HKSCS
|
||||
|
||||
hr=windows-1250
|
||||
|
||||
hu=ISO-8859-2
|
||||
|
||||
iq=windows-1256
|
||||
|
||||
ir=windows-1256
|
||||
xn--mgba3a4f16a=windows-1256
|
||||
|
||||
jo=windows-1256
|
||||
xn--mgbayh7gpa=windows-1256
|
||||
|
||||
jp=Shift_JIS
|
||||
|
||||
kg=windows-1251
|
||||
|
||||
kp=EUC-KR
|
||||
|
||||
kr=EUC-KR
|
||||
xn--3e0b707e=EUC-KR
|
||||
|
||||
kw=windows-1256
|
||||
|
||||
kz=windows-1251
|
||||
xn--80ao21a=windows-1251
|
||||
|
||||
lb=windows-1256
|
||||
|
||||
lt=windows-1257
|
||||
|
||||
lv=windows-1257
|
||||
|
||||
ma=windows-1256
|
||||
xn--mgbc0a9azcg=windows-1256
|
||||
|
||||
mk=windows-1251
|
||||
|
||||
mn=windows-1251
|
||||
xn--l1acc=windows-1251
|
||||
|
||||
mo=Big5
|
||||
|
||||
# my
|
||||
xn--mgbx4cd0ab=windows-1256
|
||||
|
||||
om=windows-1256
|
||||
xn--mgb9awbf=windows-1256
|
||||
|
||||
#pk
|
||||
xn--mgbai9azgqp6j=windows-1256
|
||||
|
||||
pl=ISO-8859-2
|
||||
|
||||
ps=windows-1256
|
||||
xn--ygbi2ammx=windows-1256
|
||||
|
||||
qa=windows-1256
|
||||
xn--wgbl6a=windows-1256
|
||||
|
||||
rs=windows-1251
|
||||
xn--90a3ac=windows-1251
|
||||
|
||||
ru=windows-1251
|
||||
xn--p1ai=windows-1251
|
||||
|
||||
sa=windows-1256
|
||||
xn--mgberp4a5d4ar=windows-1256
|
||||
|
||||
sd=windows-1256
|
||||
xn--mgbpl2fh=windows-1256
|
||||
|
||||
sg=gbk
|
||||
xn--yfro4i67o=gbk
|
||||
|
||||
si=ISO-8859-2
|
||||
|
||||
sk=windows-1250
|
||||
|
||||
su=windows-1251
|
||||
|
||||
sy=windows-1256
|
||||
xn--mgbtf8fl=windows-1256
|
||||
|
||||
th=windows-874
|
||||
xn--o3cw4h=windows-874
|
||||
|
||||
tj=windows-1251
|
||||
|
||||
tn=windows-1256
|
||||
xn--pgbs0dh=windows-1256
|
||||
|
||||
tr=windows-1254
|
||||
|
||||
tw=Big5
|
||||
# Assume that the Simplified Chinese TLD is meant to work when URL input
|
||||
# happens in the simplified mode. Assume content is traditional anyway.
|
||||
xn--kprw13d=Big5
|
||||
xn--kpry57d=Big5
|
||||
|
||||
ua=windows-1251
|
||||
xn--j1amh=windows-1251
|
||||
|
||||
uz=windows-1251
|
||||
|
||||
vn=windows-1258
|
||||
|
||||
ye=windows-1256
|
||||
xn--mgb2ddes=windows-1256
|
@ -28,6 +28,8 @@ LOCAL_INCLUDES += [
|
||||
]
|
||||
|
||||
GENERATED_FILES += [
|
||||
'domainsfallbacks.properties.h',
|
||||
'labelsencodings.properties.h',
|
||||
'localesfallbacks.properties.h',
|
||||
'nonparticipatingdomains.properties.h',
|
||||
]
|
||||
|
51
dom/encoding/nonparticipatingdomains.properties
Normal file
51
dom/encoding/nonparticipatingdomains.properties
Normal file
@ -0,0 +1,51 @@
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# Top-level domains listed here do not participate in TLD-based guessing.
|
||||
#
|
||||
# We should do Web crawls to see if domains listed here can migrate to
|
||||
# domainsfallbacks.properties.
|
||||
#
|
||||
# The value to the right of the = sign is ignored and serves as a placeholder.
|
||||
|
||||
# Generic
|
||||
com=windows-1252
|
||||
net=windows-1252
|
||||
org=windows-1252
|
||||
|
||||
# No Firefox localization for Azeri
|
||||
az=windows-1254
|
||||
|
||||
# windows-1251 or windows-1250?
|
||||
ba=???
|
||||
|
||||
# ISO-8859-7 or windows-1254?
|
||||
cy=???
|
||||
|
||||
# Is there enough unlabeled windows-1256 content for a windows-1255 fallback to break
|
||||
# too much?
|
||||
il=windows-1255
|
||||
|
||||
# Out-of-country English use
|
||||
ly=windows-1256
|
||||
|
||||
# Out-of-country English use
|
||||
# md=windows-1250
|
||||
|
||||
# Out-of-country English use
|
||||
# me=windows-1251
|
||||
|
||||
# Malaysia has an Arabic-script TLD, official script is Latin, possibly Chinese-script publications
|
||||
my=???
|
||||
|
||||
# No Firefox localization for Urdu; potential for minority-language sites
|
||||
# relying on windows-1252 hacks.
|
||||
pk=windows-1256
|
||||
|
||||
# The Romanian localization says windows-1252, even though the Windows legacy
|
||||
# differs.
|
||||
ro=windows-1250
|
||||
|
||||
tm=windows-1250
|
||||
|
7
dom/encoding/test/file_TLD.html
Normal file
7
dom/encoding/test/file_TLD.html
Normal file
@ -0,0 +1,7 @@
|
||||
<!DOCTYPE html>
|
||||
<script>
|
||||
// Sends this document's character set to the parent window.
function report() {
  var detectedCharset = document.characterSet;
  window.parent.postMessage(detectedCharset, "*");
}
|
||||
</script>
|
||||
<body onload="report();">
|
@ -7,6 +7,7 @@ support-files =
|
||||
file_utf16_le_bom.js
|
||||
file_utf16_le_bom.xhtml
|
||||
file_utf16_le_nobom.xhtml
|
||||
file_TLD.html
|
||||
worker_helper.js
|
||||
|
||||
[test_BOMEncoding.js]
|
||||
@ -16,4 +17,5 @@ support-files =
|
||||
[test_TextEncoder.js]
|
||||
[test_stringencoding.html]
|
||||
[test_submit_euckr.html]
|
||||
[test_TLD.html]
|
||||
[test_utf16_files.html]
|
||||
|
57
dom/encoding/test/test_TLD.html
Normal file
57
dom/encoding/test/test_TLD.html
Normal file
@ -0,0 +1,57 @@
|
||||
<!DOCTYPE HTML>
|
||||
<html>
|
||||
<!--
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=910211
|
||||
-->
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Test for Bug 910211</title>
|
||||
<script type="application/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
|
||||
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
|
||||
<script type="application/javascript">
|
||||
|
||||
/** Test for Bug 910211 **/
|
||||
|
||||
SimpleTest.waitForExplicitFinish();
|
||||
|
||||
var tlds = [
|
||||
{'tld': 'tw', 'encoding': 'Big5'},
|
||||
{'tld': 'cn', 'encoding': 'gbk'},
|
||||
{'tld': 'co.jp', 'encoding': 'Shift_JIS'},
|
||||
{'tld': 'fi', 'encoding': 'windows-1252'},
|
||||
];
|
||||
|
||||
var iframe = null;
|
||||
|
||||
var current = null;
|
||||
|
||||
// Grabs the first iframe on the page, registers next() as the "message"
// listener, and kicks off the first step.
function runTest() {
  var frames = document.getElementsByTagName("iframe");
  iframe = frames[0];
  window.addEventListener("message", next);
  next(null);
}
|
||||
|
||||
// Checks the encoding reported for the current entry (when called with a
// message event), then loads the next entry into the iframe, or finishes
// the test once the list is exhausted.
function next(event) {
  if (event) {
    is(event.data, current.encoding, "Got bad encoding for " + current.tld);
  }

  current = tlds.shift();
  if (current) {
    iframe.src = "http://example." + current.tld + "/tests/dom/encoding/test/file_TLD.html";
    return;
  }

  SimpleTest.finish();
}
|
||||
|
||||
</script>
|
||||
</head>
|
||||
<body onload="runTest();">
|
||||
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=910211">Mozilla Bug 910211</a>
|
||||
<p id="display"></p>
|
||||
<div id="content" style="display: none">
|
||||
<iframe></iframe>
|
||||
</div>
|
||||
<pre id="test">
|
||||
</pre>
|
||||
</body>
|
||||
</html>
|
@ -1392,6 +1392,7 @@ pref("intl.charsetmenu.composer.cache", "");
|
||||
pref("intl.charsetmenu.browser.cache.size", 5);
|
||||
pref("intl.charset.detector", "chrome://global/locale/intl.properties");
|
||||
pref("intl.charset.fallback.override", "");
|
||||
pref("intl.charset.fallback.tld", true);
|
||||
pref("intl.ellipsis", "chrome://global-platform/locale/intl.properties");
|
||||
pref("intl.locale.matchOS", false);
|
||||
// fallback charset list for Unicode conversion (converting from Unicode)
|
||||
|
@ -8,18 +8,19 @@
|
||||
// note: the value order defines the priority; higher numbers take priority
|
||||
#define kCharsetUninitialized 0
|
||||
#define kCharsetFromFallback 1
|
||||
#define kCharsetFromDocTypeDefault 2 // This and up confident for XHR
|
||||
#define kCharsetFromCache 3
|
||||
#define kCharsetFromParentFrame 4
|
||||
#define kCharsetFromAutoDetection 5
|
||||
#define kCharsetFromHintPrevDoc 6
|
||||
#define kCharsetFromMetaPrescan 7 // this one and smaller: HTML5 Tentative
|
||||
#define kCharsetFromMetaTag 8 // this one and greater: HTML5 Confident
|
||||
#define kCharsetFromIrreversibleAutoDetection 9
|
||||
#define kCharsetFromChannel 10
|
||||
#define kCharsetFromOtherComponent 11
|
||||
#define kCharsetFromParentForced 12 // propagates to child frames
|
||||
#define kCharsetFromUserForced 13 // propagates to child frames
|
||||
#define kCharsetFromByteOrderMark 14
|
||||
#define kCharsetFromTopLevelDomain 2
|
||||
#define kCharsetFromDocTypeDefault 3 // This and up confident for XHR
|
||||
#define kCharsetFromCache 4
|
||||
#define kCharsetFromParentFrame 5
|
||||
#define kCharsetFromAutoDetection 6
|
||||
#define kCharsetFromHintPrevDoc 7
|
||||
#define kCharsetFromMetaPrescan 8 // this one and smaller: HTML5 Tentative
|
||||
#define kCharsetFromMetaTag 9 // this one and greater: HTML5 Confident
|
||||
#define kCharsetFromIrreversibleAutoDetection 10
|
||||
#define kCharsetFromChannel 11
|
||||
#define kCharsetFromOtherComponent 12
|
||||
#define kCharsetFromParentForced 13 // propagates to child frames
|
||||
#define kCharsetFromUserForced 14 // propagates to child frames
|
||||
#define kCharsetFromByteOrderMark 15
|
||||
|
||||
#endif /* nsCharsetSource_h_ */
|
||||
|
Loading…
Reference in New Issue
Block a user