gecko/dom/encoding/EncodingUtils.cpp

338 lines
9.2 KiB
C++

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "mozilla/dom/EncodingUtils.h"
#include "nsAutoPtr.h"
#include "nsContentUtils.h"
namespace mozilla {
namespace dom {
// Process-wide EncodingUtils instance. It starts out null, is created on
// first use by EncodingUtils::GetOrCreate(), and is released by
// EncodingUtils::Shutdown().
EncodingUtils* gEncodings = nullptr;
// One row of the static label table: a label string paired with the name of
// the encoding that label refers to.
struct LabelEncoding
{
// Label text; used as the lookup key when the hashtable is populated.
const char* mLabel;
// Name of the encoding that mLabel maps to; stored as the hashtable value.
const char* mEncoding;
};
// Static table mapping every recognized encoding label to the name of the
// encoding it selects. The EncodingUtils constructor copies each row into
// mLabelsEncodings, and FindEncodingForLabel() lowercases its input before
// looking it up, so all labels here are written in lowercase.
//
// Note that several labels map to an encoding with a different name: for
// example "iso-8859-1", "ascii" and "latin1" all map to "windows-1252",
// and "utf-16" maps to "utf-16le".
static const LabelEncoding labelsEncodings[] = {
{"unicode-1-1-utf-8", "utf-8"},
{"utf-8", "utf-8"},
{"utf8", "utf-8"},
{"866", "ibm866"},
{"cp866", "ibm866"},
{"csibm866", "ibm866"},
{"ibm866", "ibm866"},
{"csisolatin2", "iso-8859-2"},
{"iso-8859-2", "iso-8859-2"},
{"iso-ir-101", "iso-8859-2"},
{"iso8859-2", "iso-8859-2"},
{"iso88592", "iso-8859-2"},
{"iso_8859-2", "iso-8859-2"},
{"iso_8859-2:1987", "iso-8859-2"},
{"l2", "iso-8859-2"},
{"latin2", "iso-8859-2"},
{"csisolatin3", "iso-8859-3"},
{"iso-8859-3", "iso-8859-3"},
{"iso-ir-109", "iso-8859-3"},
{"iso8859-3", "iso-8859-3"},
{"iso88593", "iso-8859-3"},
{"iso_8859-3", "iso-8859-3"},
{"iso_8859-3:1988", "iso-8859-3"},
{"l3", "iso-8859-3"},
{"latin3", "iso-8859-3"},
{"csisolatin4", "iso-8859-4"},
{"iso-8859-4", "iso-8859-4"},
{"iso-ir-110", "iso-8859-4"},
{"iso8859-4", "iso-8859-4"},
{"iso88594", "iso-8859-4"},
{"iso_8859-4", "iso-8859-4"},
{"iso_8859-4:1988", "iso-8859-4"},
{"l4", "iso-8859-4"},
{"latin4", "iso-8859-4"},
{"csisolatincyrillic", "iso-8859-5"},
{"cyrillic", "iso-8859-5"},
{"iso-8859-5", "iso-8859-5"},
{"iso-ir-144", "iso-8859-5"},
{"iso8859-5", "iso-8859-5"},
{"iso88595", "iso-8859-5"},
{"iso_8859-5", "iso-8859-5"},
{"iso_8859-5:1988", "iso-8859-5"},
{"arabic", "iso-8859-6"},
{"asmo-708", "iso-8859-6"},
{"csiso88596e", "iso-8859-6"},
{"csiso88596i", "iso-8859-6"},
{"csisolatinarabic", "iso-8859-6"},
{"ecma-114", "iso-8859-6"},
{"iso-8859-6", "iso-8859-6"},
{"iso-8859-6-e", "iso-8859-6"},
{"iso-8859-6-i", "iso-8859-6"},
{"iso-ir-127", "iso-8859-6"},
{"iso8859-6", "iso-8859-6"},
{"iso88596", "iso-8859-6"},
{"iso_8859-6", "iso-8859-6"},
{"iso_8859-6:1987", "iso-8859-6"},
{"csisolatingreek", "iso-8859-7"},
{"ecma-118", "iso-8859-7"},
{"elot_928", "iso-8859-7"},
{"greek", "iso-8859-7"},
{"greek8", "iso-8859-7"},
{"iso-8859-7", "iso-8859-7"},
{"iso-ir-126", "iso-8859-7"},
{"iso8859-7", "iso-8859-7"},
{"iso88597", "iso-8859-7"},
{"iso_8859-7", "iso-8859-7"},
{"iso_8859-7:1987", "iso-8859-7"},
{"sun_eu_greek", "iso-8859-7"},
{"csiso88598e", "iso-8859-8"},
{"csisolatinhebrew", "iso-8859-8"},
{"hebrew", "iso-8859-8"},
{"iso-8859-8", "iso-8859-8"},
{"iso-8859-8-e", "iso-8859-8"},
{"iso-ir-138", "iso-8859-8"},
{"iso8859-8", "iso-8859-8"},
{"iso88598", "iso-8859-8"},
{"iso_8859-8", "iso-8859-8"},
{"iso_8859-8:1988", "iso-8859-8"},
{"visual", "iso-8859-8"},
{"csiso88598i", "iso-8859-8-i"},
{"iso-8859-8-i", "iso-8859-8-i"},
{"logical", "iso-8859-8-i"},
{"csisolatin6", "iso-8859-10"},
{"iso-8859-10", "iso-8859-10"},
{"iso-ir-157", "iso-8859-10"},
{"iso8859-10", "iso-8859-10"},
{"iso885910", "iso-8859-10"},
{"l6", "iso-8859-10"},
{"latin6", "iso-8859-10"},
{"iso-8859-13", "iso-8859-13"},
{"iso8859-13", "iso-8859-13"},
{"iso885913", "iso-8859-13"},
{"iso-8859-14", "iso-8859-14"},
{"iso8859-14", "iso-8859-14"},
{"iso885914", "iso-8859-14"},
{"csisolatin9", "iso-8859-15"},
{"iso-8859-15", "iso-8859-15"},
{"iso8859-15", "iso-8859-15"},
{"iso885915", "iso-8859-15"},
{"iso_8859-15", "iso-8859-15"},
{"l9", "iso-8859-15"},
{"iso-8859-16", "iso-8859-16"},
{"cskoi8r", "koi8-r"},
{"koi", "koi8-r"},
{"koi8", "koi8-r"},
{"koi8-r", "koi8-r"},
{"koi8_r", "koi8-r"},
{"koi8-u", "koi8-u"},
{"csmacintosh", "macintosh"},
{"mac", "macintosh"},
{"macintosh", "macintosh"},
{"x-mac-roman", "macintosh"},
{"dos-874", "windows-874"},
{"iso-8859-11", "windows-874"},
{"iso8859-11", "windows-874"},
{"iso885911", "windows-874"},
{"tis-620", "windows-874"},
{"windows-874", "windows-874"},
{"cp1250", "windows-1250"},
{"windows-1250", "windows-1250"},
{"x-cp1250", "windows-1250"},
{"cp1251", "windows-1251"},
{"windows-1251", "windows-1251"},
{"x-cp1251", "windows-1251"},
{"ansi_x3.4-1968", "windows-1252"},
{"ascii", "windows-1252"},
{"cp1252", "windows-1252"},
{"cp819", "windows-1252"},
{"csisolatin1", "windows-1252"},
{"ibm819", "windows-1252"},
{"iso-8859-1", "windows-1252"},
{"iso-ir-100", "windows-1252"},
{"iso8859-1", "windows-1252"},
{"iso88591", "windows-1252"},
{"iso_8859-1", "windows-1252"},
{"iso_8859-1:1987", "windows-1252"},
{"l1", "windows-1252"},
{"latin1", "windows-1252"},
{"us-ascii", "windows-1252"},
{"windows-1252", "windows-1252"},
{"x-cp1252", "windows-1252"},
{"cp1253", "windows-1253"},
{"windows-1253", "windows-1253"},
{"x-cp1253", "windows-1253"},
{"cp1254", "windows-1254"},
{"csisolatin5", "windows-1254"},
{"iso-8859-9", "windows-1254"},
{"iso-ir-148", "windows-1254"},
{"iso8859-9", "windows-1254"},
{"iso88599", "windows-1254"},
{"iso_8859-9", "windows-1254"},
{"iso_8859-9:1989", "windows-1254"},
{"l5", "windows-1254"},
{"latin5", "windows-1254"},
{"windows-1254", "windows-1254"},
{"x-cp1254", "windows-1254"},
{"cp1255", "windows-1255"},
{"windows-1255", "windows-1255"},
{"x-cp1255", "windows-1255"},
{"cp1256", "windows-1256"},
{"windows-1256", "windows-1256"},
{"x-cp1256", "windows-1256"},
{"cp1257", "windows-1257"},
{"windows-1257", "windows-1257"},
{"x-cp1257", "windows-1257"},
{"cp1258", "windows-1258"},
{"windows-1258", "windows-1258"},
{"x-cp1258", "windows-1258"},
{"x-mac-cyrillic", "x-mac-cyrillic"},
{"x-mac-ukrainian", "x-mac-cyrillic"},
{"chinese", "gbk"},
{"csgb2312", "gbk"},
{"csiso58gb231280", "gbk"},
{"gb2312", "gbk"},
{"gb_2312", "gbk"},
{"gb_2312-80", "gbk"},
{"gbk", "gbk"},
{"iso-ir-58", "gbk"},
{"x-gbk", "gbk"},
{"gb18030", "gb18030"},
{"hz-gb-2312", "hz-gb-2312"},
{"big5", "big5"},
{"big5-hkscs", "big5"},
{"cn-big5", "big5"},
{"csbig5", "big5"},
{"x-x-big5", "big5"},
{"cseucpkdfmtjapanese", "euc-jp"},
{"euc-jp", "euc-jp"},
{"x-euc-jp", "euc-jp"},
{"csiso2022jp", "iso-2022-jp"},
{"iso-2022-jp", "iso-2022-jp"},
{"csshiftjis", "shift_jis"},
{"ms_kanji", "shift_jis"},
{"shift-jis", "shift_jis"},
{"shift_jis", "shift_jis"},
{"sjis", "shift_jis"},
{"windows-31j", "shift_jis"},
{"x-sjis", "shift_jis"},
{"cseuckr", "euc-kr"},
{"csksc56011987", "euc-kr"},
{"euc-kr", "euc-kr"},
{"iso-ir-149", "euc-kr"},
{"korean", "euc-kr"},
{"ks_c_5601-1987", "euc-kr"},
{"ks_c_5601-1989", "euc-kr"},
{"ksc5601", "euc-kr"},
{"ksc_5601", "euc-kr"},
{"windows-949", "euc-kr"},
{"csiso2022kr", "iso-2022-kr"},
{"iso-2022-kr", "iso-2022-kr"},
{"utf-16", "utf-16le"},
{"utf-16le", "utf-16le"},
{"utf-16be", "utf-16be"},
{"x-user-defined", "x-user-defined"},
};
// Builds the label -> encoding hashtable from the static labelsEncodings
// table. Labels are widened from ASCII to UTF-16 so they can be used as
// hashtable keys; the encoding names are stored as the raw const char*
// pointers from the table.
EncodingUtils::EncodingUtils()
{
  // The global pointer is assigned by the creator only after construction
  // finishes, so it must still be unset here.
  MOZ_ASSERT(!gEncodings);

  const uint32_t entryCount = ArrayLength(labelsEncodings);
  mLabelsEncodings.Init(entryCount);

  // Walk the table front to back, inserting one hashtable entry per row.
  const LabelEncoding* const tableEnd = labelsEncodings + entryCount;
  for (const LabelEncoding* row = labelsEncodings; row != tableEnd; ++row) {
    mLabelsEncodings.Put(NS_ConvertASCIItoUTF16(row->mLabel),
                         row->mEncoding);
  }
}
// Destructor. Asserts that the instance being destroyed is the one the
// global gEncodings pointer currently refers to, i.e. that the global has
// been set and no other instance is being torn down.
EncodingUtils::~EncodingUtils()
{
MOZ_ASSERT(gEncodings && gEncodings == this);
}
// Releases the reference that GetOrCreate() took on behalf of the global
// gEncodings pointer. NS_IF_RELEASE only acts when the pointer is non-null,
// so calling this before any instance was created does nothing.
void
EncodingUtils::Shutdown()
{
NS_IF_RELEASE(gEncodings);
}
// Returns the shared EncodingUtils instance with a reference added for the
// caller, creating the instance first if it does not exist yet.
already_AddRefed<EncodingUtils>
EncodingUtils::GetOrCreate()
{
  // Common path: the instance already exists, so only the caller's
  // reference has to be added.
  if (gEncodings) {
    NS_ADDREF(gEncodings);
    return gEncodings;
  }

  // First use: create the instance and take two references.
  gEncodings = new EncodingUtils();
  // Reference owned by the global pointer; dropped again in Shutdown().
  NS_ADDREF(gEncodings);
  // Reference handed to the caller.
  NS_ADDREF(gEncodings);
  return gEncodings;
}
// Detects a Unicode Byte Order Mark at the start of a byte buffer.
//
// aData    - start of the input bytes; at least aLength bytes are read-safe.
// aLength  - number of bytes available in aData.
// aRetval  - set to "utf-16le", "utf-16be" or "utf-8" when the matching BOM
//            is found, otherwise set to the empty string.
//
// Returns the length of the BOM in bytes (2 or 3), i.e. the offset at which
// the actual data starts, or 0 when no BOM is recognized.
uint32_t
EncodingUtils::IdentifyDataOffset(const char* aData,
                                  const uint32_t aLength,
                                  const char*& aRetval)
{
  // Start from the "no BOM found" result so every failure path can simply
  // fall through to the final return.
  aRetval = "";

  // The shortest BOM (UTF-16) is two bytes long.
  if (aLength >= 2) {
    const char first = aData[0];
    const char second = aData[1];

    if (first == '\xFF' && second == '\xFE') {
      aRetval = "utf-16le";
      return 2;
    }
    if (first == '\xFE' && second == '\xFF') {
      aRetval = "utf-16be";
      return 2;
    }

    // The UTF-8 BOM needs a third byte.
    if (aLength >= 3) {
      const char third = aData[2];
      if (first == '\xEF' && second == '\xBB' && third == '\xBF') {
        aRetval = "utf-8";
        return 3;
      }
    }
  }

  return 0;
}
bool
EncodingUtils::FindEncodingForLabel(const nsAString& aLabel,
const char*& aOutEncoding)
{
nsRefPtr<EncodingUtils> self = EncodingUtils::GetOrCreate();
MOZ_ASSERT(self);
// Save aLabel first because it may be the same as aOutEncoding.
nsString label(aLabel);
// Truncating to clear aOutEncoding in case of failure.
aOutEncoding = EmptyCString().get();
EncodingUtils::TrimSpaceCharacters(label);
if (label.IsEmpty()) {
return false;
}
nsContentUtils::ASCIIToLower(label);
const char* encoding = self->mLabelsEncodings.Get(label);
if (!encoding) {
return false;
}
aOutEncoding = encoding;
return true;
}
} // namespace dom
} // namespace mozilla