Bug 912470 part 2 - Implement Encoding Standard-compliant big5 encoder. r=emk.

This commit is contained in:
Henri Sivonen 2015-09-03 15:21:57 +03:00
parent 7de0878383
commit 60d9d8a2f0
23 changed files with 549 additions and 6770 deletions

View File

@ -5,7 +5,6 @@
# x-unicode is assumed for encodings not listed here
Big5=zh-TW
Big5-HKSCS=zh=HK
EUC-JP=ja
EUC-KR=ko
gb18030=zh-CN

View File

@ -27,7 +27,6 @@ EXPORTS += [
'ucvja/nsUCVJACID.h',
'ucvko/nsUCvKOCID.h',
'ucvlatin/nsUCvLatinCID.h',
'ucvtw/nsUCvTWCID.h',
]
UNIFIED_SOURCES += [
@ -137,6 +136,7 @@ UNIFIED_SOURCES += [
]
UNIFIED_SOURCES += [
'ucvtw/nsBIG5Data.cpp',
'ucvtw/nsBIG5ToUnicode.cpp',
'ucvtw/nsUnicodeToBIG5.cpp',
]

View File

@ -96,6 +96,12 @@ public:
* the first of a surrogate pair.
* NS_ERROR_UENC_NOMAPPING if character without mapping
* was encountered and the behavior was set to "signal".
* In the case of an unmappable BMP character, aSrcLength
* must indicate that the unmappable character was
* consumed by the encoder (unlike in the decode API!).
* In the case of an unmappable astral character,
* aSrcLength must indicate that the high surrogate was
* consumed by the encoder but the low surrogate was not.
*/
NS_IMETHOD Convert(const char16_t * aSrc, int32_t * aSrcLength,
char * aDest, int32_t * aDestLength) = 0;

View File

@ -107,8 +107,6 @@
#include "nsUnicodeToISO2022JP.h"
// ucvtw
#include "nsUCvTWCID.h"
#include "nsUCvTWDll.h"
#include "nsBIG5ToUnicode.h"
#include "nsUnicodeToBIG5.h"
@ -212,6 +210,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToISO2022JP)
// ucvtw
NS_GENERIC_FACTORY_CONSTRUCTOR(nsBIG5ToUnicode)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToBIG5)
// ucvko
@ -245,11 +244,6 @@ const uint16_t g_ASCIIMappingTable[] = {
0x0001, 0x0004, 0x0005, 0x0008, 0x0000, 0x0000, 0x007F, 0x0000
};
// ucvtw
const uint16_t g_ufBig5Mapping[] = {
#include "big5.uf"
};
// ucvko
const uint16_t g_utKSC5601Mapping[] = {
#include "u20kscgl.ut"

View File

@ -12,3 +12,4 @@ skip-if = toolkit == 'android' #bug 775227
[test_unicode_noncharacters_gb18030.html]
[test_unicode_noncharacters_utf8.html]
[test_utf8_overconsumption.html]
[test_big5_encoder.html]

View File

@ -0,0 +1,43 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=912470
-->
<head>
<meta http-equiv="Content-type" content="text/html; charset=UTF-8">
<title>Test for the big5 encoder</title>
<script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
</head>
<body onload="test()">
<pre id="test">
<script class="testbody" type="text/javascript">
/* NOTE:
* When we make our data: URL origin work as in Blink, this test will fail.
* Hopefully, by that time our URL parser has become spec-compliant, so that
* we'll pass the Web Platform Test for the big5 encoder
* (testing/web-platform/tests/encoding/big5-encoder.html) and this test can
* simply be removed.
*/
SimpleTest.waitForExplicitFinish();
// Submits the form hosted in the big5-encoded data: URL iframe and, once the
// iframe has navigated, compares the query string that the form submission
// produced against the expected percent-encoded big5 bytes.
function test() {
  var frame = document.getElementsByTagName("iframe")[0];
  var queryMarker = "?foo=";
  var expected = "h%26%2340614%3Bi%26%23156267%3Bj%A1%40k%A3%E1l%A4%40m%C8%A4n%C8%CDo%FE%FEp%26%238365%3Bq%FDjr%F9%F9s%26%23128169%3Bt";

  frame.onload = function() {
    // Everything after "?foo=" is the encoded value of the form field.
    var url = frame.contentWindow.location.href;
    var actual = url.substring(url.indexOf(queryMarker) + queryMarker.length);
    is(actual, expected, "Should have gotten the expected encode.");
    SimpleTest.finish();
  };

  // The load handler must be installed before the submission is triggered.
  frame.contentDocument.forms[0].submit();
}
</script>
</pre>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=912470">Mozilla Bug 912470</a>
<p id="display"></p>
<div id="content" style="display: none"><iframe src="data:text/html;charset=big5,<form><input name=foo value=h&amp;%23x9EA6;i&amp;%23x2626B;j&amp;%23x3000;k&amp;%23x20AC;l&amp;%23x4E00;m&amp;%23x27607;n&amp;%23xFFE2;o&amp;%23x79D4;p&amp;%23x20AD;q&amp;%23x203B5;r&amp;%23x2550;s&amp;%23x1F4A9;t></form>">
</div>
</body>
</html>

View File

@ -11,7 +11,6 @@ function run_test() {
// this list excludes codepages that can represent all Unicode
var encoders = [
"Big5",
"Big5-HKSCS",
"EUC-JP",
"EUC-KR",
"gbk",

View File

@ -5,12 +5,12 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Adapted from
# https://hg.mozilla.org/projects/htmlparser/file/3ac10f9e8612/generate-encoding-data.py
# https://hg.mozilla.org/projects/htmlparser/file/0d906fb1ab90/generate-encoding-data.py
# indexes.json comes from
# https://encoding.spec.whatwg.org/indexes.json
# i.e.
# https://github.com/whatwg/encoding/blob/a5215d07106e250dfef34908b99b3e4a576be2f6/indexes.json
# https://github.com/whatwg/encoding/blob/ce4e83d0df5b5efec0697fc76e66699737e033a3/indexes.json
import json
@ -75,8 +75,8 @@ for codePoint in index:
astralRanges = invertRanges(gaps, cap)
includeFile = open("../ucvtw/nsBIG5DecoderData.h", "w")
includeFile.write('''/* This Source Code Form is subject to the terms of the Mozilla Public
classFile = open("../ucvtw/nsBIG5Data.cpp", "w")
classFile.write('''/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
@ -85,14 +85,16 @@ includeFile.write('''/* This Source Code Form is subject to the terms of the Moz
* Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
*/
#include "nsBIG5Data.h"
static const char16_t kBig5LowBitsTable[] = {
''')
for (low, high) in ranges:
for i in xrange(low, high):
includeFile.write(' 0x%04X,\n' % (index[i] & 0xFFFF))
classFile.write(' 0x%04X,\n' % (index[i] & 0xFFFF))
includeFile.write('''};
classFile.write('''};
static const uint32_t kBig5AstralnessTable[] = {
''')
@ -112,20 +114,20 @@ while i < len(bits):
accu = 0
for j in xrange(32):
accu |= bits[i + j] << j
includeFile.write(' 0x%08X,\n' % accu)
classFile.write(' 0x%08X,\n' % accu)
i += 32
includeFile.write('''};
classFile.write('''};
// static
char16_t
nsBIG5ToUnicode::LowBits(size_t aPointer)
nsBIG5Data::LowBits(size_t aPointer)
{
''')
base = 0
for (low, high) in ranges:
includeFile.write(''' if (aPointer < %d) {
classFile.write(''' if (aPointer < %d) {
return 0;
}
if (aPointer < %d) {
@ -134,19 +136,19 @@ for (low, high) in ranges:
''' % (low, high, base, low))
base += (high - low)
includeFile.write(''' return 0;
classFile.write(''' return 0;
}
// static
bool
nsBIG5ToUnicode::IsAstral(size_t aPointer)
nsBIG5Data::IsAstral(size_t aPointer)
{
''')
base = 0
for (low, high) in astralRanges:
if high - low == 1:
includeFile.write(''' if (aPointer < %d) {
classFile.write(''' if (aPointer < %d) {
return false;
}
if (aPointer == %d) {
@ -154,7 +156,7 @@ for (low, high) in astralRanges:
}
''' % (low, low))
else:
includeFile.write(''' if (aPointer < %d) {
classFile.write(''' if (aPointer < %d) {
return false;
}
if (aPointer < %d) {
@ -164,7 +166,88 @@ for (low, high) in astralRanges:
''' % (low, high, base, low))
base += (high - low)
includeFile.write(''' return false;
classFile.write(''' return false;
}
//static
size_t
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
{
if (!aIsAstral) {
switch (aLowBits) {
''')
hkscsBound = (0xA1 - 0x81) * 157
preferLast = [
0x2550,
0x255E,
0x2561,
0x256A,
0x5341,
0x5345,
]
for codePoint in preferLast:
# Python lists don't have .rindex() :-(
for i in xrange(len(index) - 1, -1, -1):
candidate = index[i]
if candidate == codePoint:
classFile.write(''' case 0x%04X:
return %d;
''' % (codePoint, i))
break
classFile.write(''' default:
break;
}
}''')
base = 0
start = 0
for (low, high) in ranges:
if low <= hkscsBound and hkscsBound < high:
# This is the first range we don't ignore and the
# range that contains the first non-HKSCS pointer.
# Avoid searching HKSCS.
start = base + hkscsBound - low
break
base += (high - low)
classFile.write('''
for (size_t i = %d; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {
if (kBig5LowBitsTable[i] == aLowBits) {
size_t pointer;
''' % start)
base = 0
prevLow = 0
prevHigh = 0
prevBase = 0
writing = False
for (low, high) in ranges:
if writing:
classFile.write('''if (i < %d) {
pointer = i + %d;
} else ''' % ((prevBase + prevHigh - prevLow), (prevLow - prevBase)))
prevLow = low
prevHigh = high
prevBase = base
if high > hkscsBound:
writing = True
base += (high - low)
classFile.write('''{
pointer = i + %d;
}''' % (prevLow - prevBase))
classFile.write('''
if (aIsAstral == IsAstral(pointer)) {
return pointer;
}
}
}
return 0;
}
''')
includeFile.close()
classFile.close()

View File

@ -19,7 +19,7 @@
"iso-8859-15":[128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,8364,165,352,167,353,169,170,171,172,173,174,175,176,177,178,179,381,181,182,183,382,185,186,187,338,339,376,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255],
"iso-8859-16":[128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,260,261,321,8364,8222,352,167,353,169,536,171,377,173,378,379,176,177,268,322,381,8221,182,183,382,269,537,187,338,339,376,380,192,193,194,258,196,262,198,199,200,201,202,203,204,205,206,207,272,323,210,211,212,336,214,346,368,217,218,219,220,280,538,223,224,225,226,259,228,263,230,231,232,233,234,235,236,237,238,239,273,324,242,243,244,337,246,347,369,249,250,251,252,281,539,255],
"koi8-r":[9472,9474,9484,9488,9492,9496,9500,9508,9516,9524,9532,9600,9604,9608,9612,9616,9617,9618,9619,8992,9632,8729,8730,8776,8804,8805,160,8993,176,178,183,247,9552,9553,9554,1105,9555,9556,9557,9558,9559,9560,9561,9562,9563,9564,9565,9566,9567,9568,9569,1025,9570,9571,9572,9573,9574,9575,9576,9577,9578,9579,9580,169,1102,1072,1073,1094,1076,1077,1092,1075,1093,1080,1081,1082,1083,1084,1085,1086,1087,1103,1088,1089,1090,1091,1078,1074,1100,1099,1079,1096,1101,1097,1095,1098,1070,1040,1041,1062,1044,1045,1060,1043,1061,1048,1049,1050,1051,1052,1053,1054,1055,1071,1056,1057,1058,1059,1046,1042,1068,1067,1047,1064,1069,1065,1063,1066],
"koi8-u":[9472,9474,9484,9488,9492,9496,9500,9508,9516,9524,9532,9600,9604,9608,9612,9616,9617,9618,9619,8992,9632,8729,8730,8776,8804,8805,160,8993,176,178,183,247,9552,9553,9554,1105,1108,9556,1110,1111,9559,9560,9561,9562,9563,1169,9565,9566,9567,9568,9569,1025,1028,9571,1030,1031,9574,9575,9576,9577,9578,1168,9580,169,1102,1072,1073,1094,1076,1077,1092,1075,1093,1080,1081,1082,1083,1084,1085,1086,1087,1103,1088,1089,1090,1091,1078,1074,1100,1099,1079,1096,1101,1097,1095,1098,1070,1040,1041,1062,1044,1045,1060,1043,1061,1048,1049,1050,1051,1052,1053,1054,1055,1071,1056,1057,1058,1059,1046,1042,1068,1067,1047,1064,1069,1065,1063,1066],
"koi8-u":[9472,9474,9484,9488,9492,9496,9500,9508,9516,9524,9532,9600,9604,9608,9612,9616,9617,9618,9619,8992,9632,8729,8730,8776,8804,8805,160,8993,176,178,183,247,9552,9553,9554,1105,1108,9556,1110,1111,9559,9560,9561,9562,9563,1169,1118,9566,9567,9568,9569,1025,1028,9571,1030,1031,9574,9575,9576,9577,9578,1168,1038,169,1102,1072,1073,1094,1076,1077,1092,1075,1093,1080,1081,1082,1083,1084,1085,1086,1087,1103,1088,1089,1090,1091,1078,1074,1100,1099,1079,1096,1101,1097,1095,1098,1070,1040,1041,1062,1044,1045,1060,1043,1061,1048,1049,1050,1051,1052,1053,1054,1055,1071,1056,1057,1058,1059,1046,1042,1068,1067,1047,1064,1069,1065,1063,1066],
"macintosh":[196,197,199,201,209,214,220,225,224,226,228,227,229,231,233,232,234,235,237,236,238,239,241,243,242,244,246,245,250,249,251,252,8224,176,162,163,167,8226,182,223,174,169,8482,180,168,8800,198,216,8734,177,8804,8805,165,181,8706,8721,8719,960,8747,170,186,937,230,248,191,161,172,8730,402,8776,8710,171,187,8230,160,192,195,213,338,339,8211,8212,8220,8221,8216,8217,247,9674,255,376,8260,8364,8249,8250,64257,64258,8225,183,8218,8222,8240,194,202,193,203,200,205,206,207,204,211,212,63743,210,218,219,217,305,710,732,175,728,729,730,184,733,731,711],
"windows-874":[8364,129,130,131,132,8230,134,135,136,137,138,139,140,141,142,143,144,8216,8217,8220,8221,8226,8211,8212,152,153,154,155,156,157,158,159,160,3585,3586,3587,3588,3589,3590,3591,3592,3593,3594,3595,3596,3597,3598,3599,3600,3601,3602,3603,3604,3605,3606,3607,3608,3609,3610,3611,3612,3613,3614,3615,3616,3617,3618,3619,3620,3621,3622,3623,3624,3625,3626,3627,3628,3629,3630,3631,3632,3633,3634,3635,3636,3637,3638,3639,3640,3641,3642,null,null,null,null,3647,3648,3649,3650,3651,3652,3653,3654,3655,3656,3657,3658,3659,3660,3661,3662,3663,3664,3665,3666,3667,3668,3669,3670,3671,3672,3673,3674,3675,null,null,null,null],
"windows-1250":[8364,129,8218,131,8222,8230,8224,8225,136,8240,352,8249,346,356,381,377,144,8216,8217,8220,8221,8226,8211,8212,152,8482,353,8250,347,357,382,378,160,711,728,321,164,260,166,167,168,169,350,171,172,173,174,379,176,177,731,322,180,181,182,183,184,261,351,187,317,733,318,380,340,193,194,258,196,313,262,199,268,201,280,203,282,205,206,270,272,323,327,211,212,336,214,215,344,366,218,368,220,221,354,223,341,225,226,259,228,314,263,231,269,233,281,235,283,237,238,271,273,324,328,243,244,337,246,247,345,367,250,369,252,253,355,729],

File diff suppressed because it is too large Load Diff

View File

@ -7,6 +7,8 @@
* Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
*/
#include "nsBIG5Data.h"
static const char16_t kBig5LowBitsTable[] = {
0x43F0,
0x4C32,
@ -18807,7 +18809,7 @@ static const uint32_t kBig5AstralnessTable[] = {
// static
char16_t
nsBIG5ToUnicode::LowBits(size_t aPointer)
nsBIG5Data::LowBits(size_t aPointer)
{
if (aPointer < 942) {
return 0;
@ -18844,7 +18846,7 @@ nsBIG5ToUnicode::LowBits(size_t aPointer)
// static
bool
nsBIG5ToUnicode::IsAstral(size_t aPointer)
nsBIG5Data::IsAstral(size_t aPointer)
{
if (aPointer < 947) {
return false;
@ -18910,3 +18912,43 @@ nsBIG5ToUnicode::IsAstral(size_t aPointer)
}
return false;
}
//static
size_t
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
{
if (!aIsAstral) {
switch (aLowBits) {
case 0x2550:
return 18991;
case 0x255E:
return 18975;
case 0x2561:
return 18977;
case 0x256A:
return 18976;
case 0x5341:
return 5512;
case 0x5345:
return 5599;
default:
break;
}
}
for (size_t i = 3967; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {
if (kBig5LowBitsTable[i] == aLowBits) {
size_t pointer;
if (i < 4409) {
pointer = i + 1057;
} else if (i < 10128) {
pointer = i + 1086;
} else {
pointer = i + 1126;
}
if (aIsAstral == IsAstral(pointer)) {
return pointer;
}
}
}
return 0;
}

View File

@ -3,10 +3,16 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUCvTWDll_h_
#define nsUCvTWDll_h_
#ifndef nsBIG5Data_h_
#define nsBIG5Data_h_
extern const uint16_t g_ufBig5Mapping[];
extern const uint16_t g_ASCIIMappingTable[];
// Static lookups over the generated Big5 index data that are shared by the
// Big5 decoder (nsBIG5ToUnicode) and the Big5 encoder (nsUnicodeToBIG5).
class nsBIG5Data
{
public:
// Returns the low 16 bits of the code point that the Big5 pointer aPointer
// maps to, or 0 if the pointer has no mapping.
static char16_t LowBits(size_t aPointer);
// Returns true if the code point that aPointer maps to is a Plane 2
// character (i.e. the code point is LowBits(aPointer) | 0x20000) and false
// if it is a BMP character.
static bool IsAstral(size_t aPointer);
// Encoder-side lookup: returns the pointer for the code point whose low 16
// bits are aLowBits and that is in Plane 2 (aIsAstral == true) or in the
// BMP (aIsAstral == false). Returns 0 if no pointer was found. The search
// skips the HKSCS pointers below (0xA1 - 0x81) * 157.
static size_t FindPointer(char16_t aLowBits, bool aIsAstral);
};
#endif /* nsBIG5Data_h_ */
#endif /* nsUCvTWDll_h_ */

View File

@ -6,8 +6,7 @@
#include "nsBIG5ToUnicode.h"
#include "mozilla/BinarySearch.h"
#include "mozilla/ArrayUtils.h"
#include "nsBIG5DecoderData.h"
#include "nsBIG5Data.h"
nsBIG5ToUnicode::nsBIG5ToUnicode()
: mPendingTrail(0)
@ -90,7 +89,7 @@ nsBIG5ToUnicode::Convert(const char* aSrc,
outTrail = 0x030C;
break;
default:
char16_t lowBits = LowBits(pointer);
char16_t lowBits = nsBIG5Data::LowBits(pointer);
if (!lowBits) {
if (b <= 0x7F) {
// prepend byte to stream
@ -107,7 +106,7 @@ nsBIG5ToUnicode::Convert(const char* aSrc,
*out++ = 0xFFFD;
continue;
}
if (IsAstral(pointer)) {
if (nsBIG5Data::IsAstral(pointer)) {
uint32_t codePoint = uint32_t(lowBits) | 0x20000;
*out++ = char16_t(0xD7C0 + (codePoint >> 10));
outTrail = char16_t(0xDC00 + (codePoint & 0x3FF));

View File

@ -32,9 +32,6 @@ public:
NS_IMETHOD Reset();
private:
static char16_t LowBits(size_t aPointer);
static bool IsAstral(size_t aPointer);
char16_t mPendingTrail;
uint8_t mBig5Lead;
};

View File

@ -1,16 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUCvTWCID_h___
#define nsUCvTWCID_h___
#include "nsISupports.h"
// Class ID for our UnicodeToBIG5 charset converter
// {EFC323E2-EC62-11d2-8AAC-00600811A836}
#define NS_UNICODETOBIG5_CID \
{ 0xefc323e2, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
#endif /* nsUCvTWCID_h___ */

View File

@ -4,35 +4,248 @@
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsUnicodeToBIG5.h"
#include "nsUCvTWDll.h"
#include "nsUCConstructors.h"
//----------------------------------------------------------------------
// Global functions and data [declaration]
NS_IMPL_ADDREF(nsUnicodeToBIG5)
NS_IMPL_RELEASE(nsUnicodeToBIG5)
NS_IMPL_QUERY_INTERFACE(nsUnicodeToBIG5,
nsIUnicodeEncoder)
static const uint16_t *g_Big5MappingTable[2] = {
g_ASCIIMappingTable,
g_ufBig5Mapping
};
static const uScanClassID g_Big5ScanClassIDs[2] = {
u1ByteCharset,
u2BytesCharset
};
//----------------------------------------------------------------------
// Class nsUnicodeToBIG5 [implementation]
nsresult
nsUnicodeToBIG5Constructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult)
nsUnicodeToBIG5::nsUnicodeToBIG5()
: mUtf16Lead(0)
, mPendingTrail(0)
, mSignal(true) // as in nsEncoderSupport
{
return CreateMultiTableEncoder(2,
(uScanClassID*) &g_Big5ScanClassIDs,
(uMappingTable**) &g_Big5MappingTable,
2 /* max length = src * 2 */,
aOuter, aIID, aResult);
}
// Encodes UTF-16 code units from aSrc into Big5 bytes written to aDest.
//
// On entry, *aSrcLength is the number of code units available in aSrc and
// *aDestLength is the capacity of aDest in bytes. On return, *aSrcLength is
// the number of code units consumed and *aDestLength is the number of bytes
// written.
//
// Returns NS_OK_UENC_MOREINPUT when all of the input has been consumed,
// NS_OK_UENC_MOREOUTPUT when the output buffer filled up first, and
// NS_ERROR_UENC_NOMAPPING when mSignal is set and an unpaired surrogate or a
// character without a Big5 mapping was encountered. When mSignal is not set,
// such input is replaced with '?' instead.
//
// State carried across calls: mUtf16Lead holds a high surrogate that is still
// waiting for its low surrogate, and mPendingTrail holds a Big5 trail byte
// that did not fit into the previous output buffer.
NS_IMETHODIMP
nsUnicodeToBIG5::Convert(const char16_t* aSrc,
int32_t* aSrcLength,
char* aDest,
int32_t * aDestLength)
{
const char16_t* in = aSrc;
const char16_t* inEnd = in + *aSrcLength;
uint8_t* out = reinterpret_cast<uint8_t*>(aDest);
uint8_t* outEnd = out + *aDestLength;
MOZ_ASSERT(!(mPendingTrail && mUtf16Lead),
"Can't have both pending output and pending input.");
// Flush the trail byte left over from the previous call before reading any
// new input.
if (mPendingTrail) {
if (out == outEnd) {
*aSrcLength = 0;
*aDestLength = 0;
return NS_OK_UENC_MOREOUTPUT;
}
*out++ = mPendingTrail;
mPendingTrail = 0;
}
for (;;) {
if (in == inEnd) {
*aSrcLength = in - aSrc;
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
return NS_OK_UENC_MOREINPUT;
}
// From here on there is room for at least one output byte, which is what
// the '?' replacement and ASCII paths below rely on.
if (out == outEnd) {
*aSrcLength = in - aSrc;
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
return NS_OK_UENC_MOREOUTPUT;
}
bool isAstral; // true means Plane 2, false means BMP
char16_t lowBits; // The low 16 bits of the code point
char16_t codeUnit = *in++;
// The top six bits tell high surrogates (0xD800..0xDBFF) and low
// surrogates (0xDC00..0xDFFF) apart from all other code units.
size_t highBits = (codeUnit & 0xFC00);
if (highBits == 0xD800) {
// high surrogate
if (mUtf16Lead) {
// High surrogate follows another high surrogate. The
// *previous* code unit is in error.
if (mSignal) {
mUtf16Lead = 0;
// NOTE: Encode API differs from decode API!
--in;
*aSrcLength = in - aSrc;
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
return NS_ERROR_UENC_NOMAPPING;
}
*out++ = '?';
}
// Remember the lead and wait for the next code unit to complete the pair.
mUtf16Lead = codeUnit;
continue;
}
if (highBits == 0xDC00) {
// low surrogate
if (!mUtf16Lead) {
// Got low surrogate without a previous high surrogate
if (mSignal) {
// NOTE: Encode API differs from decode API!
*aSrcLength = in - aSrc;
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
return NS_ERROR_UENC_NOMAPPING;
}
*out++ = '?';
continue;
}
// Combine the pending high surrogate and this low surrogate into a
// code point.
size_t codePoint = (mUtf16Lead << 10) + codeUnit -
(((0xD800 << 10) - 0x10000) + 0xDC00);
mUtf16Lead = 0;
// Plane 2 is the only astral plane that has potentially
// Big5-encodable characters.
if ((0xFF0000 & codePoint) != 0x20000) {
if (mSignal) {
// NOTE: Encode API differs from decode API!
// nsSaveAsCharset wants us to back up one step in the case of a
// surrogate pair.
--in;
*aSrcLength = in - aSrc;
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
return NS_ERROR_UENC_NOMAPPING;
}
*out++ = '?';
continue;
}
isAstral = true;
lowBits = (char16_t)(codePoint & 0xFFFF);
} else {
// not a surrogate
if (mUtf16Lead) {
// Non-surrogate follows a high surrogate. The *previous*
// code unit is in error.
mUtf16Lead = 0;
if (mSignal) {
// NOTE: Encode API differs from decode API!
--in;
*aSrcLength = in - aSrc;
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
return NS_ERROR_UENC_NOMAPPING;
}
*out++ = '?';
// Let's unconsume this code unit and reloop in order to
// re-check if the output buffer still has space.
--in;
continue;
}
isAstral = false;
lowBits = codeUnit;
}
// isAstral now tells us if we have a Plane 2 or a BMP character.
// lowBits tells us the low 16 bits.
// After all the above setup to deal with UTF-16, we are now
// finally ready to follow the spec.
// ASCII is passed through as a single byte.
if (!isAstral && lowBits <= 0x7F) {
*out++ = (uint8_t)lowBits;
continue;
}
// A pointer of 0 means that no Big5 mapping was found.
size_t pointer = nsBIG5Data::FindPointer(lowBits, isAstral);
if (!pointer) {
if (mSignal) {
// NOTE: Encode API differs from decode API!
if (isAstral) {
// nsSaveAsCharset wants us to back up one step in the case of a
// surrogate pair.
--in;
}
*aSrcLength = in - aSrc;
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
return NS_ERROR_UENC_NOMAPPING;
}
*out++ = '?';
continue;
}
// Split the pointer into a lead byte and a trail byte; each lead byte
// covers 157 pointers. Trail values below 0x3F land in 0x40..0x7E and the
// rest land in 0xA1..0xFE.
uint8_t lead = (uint8_t)(pointer / 157 + 0x81);
uint8_t trail = (uint8_t)(pointer % 157);
if (trail < 0x3F) {
trail += 0x40;
} else {
trail += 0x62;
}
*out++ = lead;
if (out == outEnd) {
// The lead byte took the last free slot. Stash the trail byte so that the
// next Convert() call or Finish() can write it.
mPendingTrail = trail;
*aSrcLength = in - aSrc;
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
return NS_OK_UENC_MOREOUTPUT;
}
*out++ = trail;
continue;
}
}
// Flushes whatever state is still buffered at the end of the input: either a
// trail byte that did not fit into the last output buffer or a single '?' for
// a high surrogate that never got its low surrogate.
//
// On entry *aDestLength is the capacity of aDest; on return it is the number
// of bytes written (0 or 1). Returns NS_OK_UENC_MOREOUTPUT, without changing
// any state, if a byte is pending but aDest has no room for it.
NS_IMETHODIMP
nsUnicodeToBIG5::Finish(char* aDest,
                        int32_t* aDestLength)
{
  MOZ_ASSERT(!(mPendingTrail && mUtf16Lead),
             "Can't have both pending output and pending input.");

  const bool havePendingTrail = mPendingTrail != 0;
  if (!havePendingTrail && !mUtf16Lead) {
    // Nothing is buffered, so there is nothing to flush.
    *aDestLength = 0;
    return NS_OK;
  }

  if (*aDestLength < 1) {
    *aDestLength = 0;
    return NS_OK_UENC_MOREOUTPUT;
  }

  uint8_t flushed;
  if (havePendingTrail) {
    flushed = mPendingTrail;
    mPendingTrail = 0;
  } else {
    // The API doesn't support signaling an error. It pretends that malformed
    // input doesn't exist. The UTF-8 encoder outputs the replacement character
    // unconditionally.
    flushed = '?';
    mUtf16Lead = 0;
  }

  *reinterpret_cast<uint8_t*>(aDest) = flushed;
  *aDestLength = 1;
  return NS_OK;
}
// Computes an upper bound for the number of bytes that encoding aSrcLength
// UTF-16 code units can produce, taking the state buffered from previous
// Convert() calls into account.
//
// aSrc is not inspected. On success the bound is stored in *aDestLength and
// NS_OK is returned. If the bound cannot be represented in an int32_t,
// NS_ERROR_OUT_OF_MEMORY is returned and *aDestLength is left untouched.
NS_IMETHODIMP
nsUnicodeToBIG5::GetMaxLength(const char16_t* aSrc,
                              int32_t aSrcLength,
                              int32_t* aDestLength)
{
  // Each code unit produces at most two bytes and up to two more bytes are
  // added below. Reject lengths for which that signed arithmetic would
  // overflow, which is undefined behavior for int32_t.
  if (aSrcLength > (INT32_MAX - 2) / 2) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  int32_t maxLength = aSrcLength * 2;
  if (mPendingTrail) {
    // A trail byte from the previous call is written first.
    ++maxLength;
  }
  if (mUtf16Lead) {
    // If the lead ends up being paired, the bytes produced
    // are already included above.
    // If not, it produces a single '?'.
    ++maxLength;
  }

  *aDestLength = maxLength;
  return NS_OK;
}
// Returns the encoder to its initial conversion state by dropping any
// buffered trail byte and any pending high surrogate. The error behavior
// (mSignal) is left as it is.
NS_IMETHODIMP
nsUnicodeToBIG5::Reset()
{
  mPendingTrail = 0;
  mUtf16Lead = 0;
  return NS_OK;
}
// Selects what Convert() does with input that cannot be encoded:
// kOnError_Signal makes it return an error, kOnError_Replace makes it emit
// '?'. aEncoder is ignored, and the only replacement character supported is
// '?'. Always returns NS_OK.
NS_IMETHODIMP
nsUnicodeToBIG5::SetOutputErrorBehavior(int32_t aBehavior,
                                        nsIUnicharEncoder* aEncoder,
                                        char16_t aChar)
{
  if (aBehavior == kOnError_Signal) {
    mSignal = true;
  } else if (aBehavior == kOnError_Replace) {
    mSignal = false;
    MOZ_ASSERT(aChar == '?', "Unsupported replacement.");
  } else if (aBehavior == kOnError_CallBack) {
    MOZ_ASSERT_UNREACHABLE("kOnError_CallBack is supposed to be unused.");
  } else {
    MOZ_ASSERT_UNREACHABLE("Non-existent enum item.");
  }
  return NS_OK;
}

View File

@ -3,19 +3,48 @@
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#ifndef nsUnicodeToBIG5_h___
#define nsUnicodeToBIG5_h___
#ifndef nsUnicodeToBIG5_h_
#define nsUnicodeToBIG5_h_
#include "nsISupports.h"
#include "nsIUnicodeEncoder.h"
/**
* A character set converter from Unicode to BIG5.
*
* @created 06/Apr/1999
* @author Catalin Rotaru [CATA]
*/
nsresult
nsUnicodeToBIG5Constructor(nsISupports *aOuter, REFNSIID aIID,
void **aResult);
#define NS_UNICODETOBIG5_CID \
{ 0xefc323e2, 0xec62, 0x11d2, \
{ 0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36 } }
#endif /* nsUnicodeToBIG5_h___ */
// Streaming encoder from UTF-16 to Big5. Input and output may be split at
// arbitrary points across Convert() calls; the state needed to resume is kept
// in mUtf16Lead and mPendingTrail.
class nsUnicodeToBIG5 : public nsIUnicodeEncoder
{
public:
// Encoders probably shouldn't use the thread-safe variant, but we should
// make a systematic change instead of making this class different.
NS_DECL_THREADSAFE_ISUPPORTS
nsUnicodeToBIG5();
// Encodes *aSrcLength code units from aSrc into at most *aDestLength bytes
// in aDest; both lengths are updated to the amounts consumed and written.
NS_IMETHOD Convert(const char16_t* aSrc,
int32_t* aSrcLength,
char* aDest,
int32_t * aDestLength);
// Writes out whatever is still buffered at the end of the input (at most
// one byte).
NS_IMETHOD Finish(char* aDest,
int32_t* aDestLength);
// Stores in *aDestLength an upper bound for the number of bytes that
// encoding aSrcLength code units can produce.
MOZ_WARN_UNUSED_RESULT NS_IMETHOD GetMaxLength(const char16_t* aSrc,
int32_t aSrcLength,
int32_t* aDestLength);
// Drops the buffered conversion state (mUtf16Lead and mPendingTrail).
NS_IMETHOD Reset();
// Chooses between signaling unencodable input (kOnError_Signal) and
// replacing it with '?' (kOnError_Replace).
NS_IMETHOD SetOutputErrorBehavior(int32_t aBehavior,
nsIUnicharEncoder* aEncoder,
char16_t aChar);
private:
virtual ~nsUnicodeToBIG5(){};
// High surrogate that has been read but not yet paired with a low
// surrogate, or 0 if there is none.
char16_t mUtf16Lead;
// Big5 trail byte that did not fit into the output buffer of the previous
// Convert() call, or 0 if there is none.
uint8_t mPendingTrail;
// true: Convert() returns NS_ERROR_UENC_NOMAPPING for unencodable input.
// false: Convert() writes '?' instead.
bool mSignal;
};
#endif /* nsUnicodeToBIG5_h_ */

View File

@ -1 +0,0 @@
We should put Big5 converter into this directory/dll

View File

@ -29051,7 +29051,16 @@
},
"local_changes": {
"deleted": [],
"items": {},
"items": {
"testharness": {
"encoding/big5-encoder.html": [
{
"path": "encoding/big5-encoder.html",
"url": "/encoding/big5-encoder.html"
}
]
}
},
"reftest_nodes": {}
},
"reftest_nodes": {

View File

@ -444,12 +444,6 @@
[Name "hz-gb-2312" has label "hz-gb-2312" (inputEncoding)]
expected: FAIL
[Name "big5" has label "big5-hkscs" (characterSet)]
expected: FAIL
[Name "big5" has label "big5-hkscs" (inputEncoding)]
expected: FAIL
[Name "replacement" has label "csiso2022kr" (characterSet)]
expected: FAIL

View File

@ -0,0 +1,14 @@
[big5-encoder.html]
type: testharness
[big5 encoder: Highest-pointer BMP character excluded from encoder]
expected: FAIL
[big5 encoder: Highest-pointer character excluded from encoder]
expected: FAIL
[big5 encoder: The canonical BMP test character that is not in the index]
expected: FAIL
[big5 encoder: The canonical astral test character that is not in the index]
expected: FAIL

View File

@ -1,53 +0,0 @@
[textdecoder-labels.html]
type: testharness
[name=big5 label=big5-hkscs]
expected: FAIL
["big5-hkscs" => "big5"]
expected: FAIL
[" big5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs " => "big5"]
expected: FAIL
[" big5-hkscs " => "big5"]
expected: FAIL
["\\tbig5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs\\t" => "big5"]
expected: FAIL
["\\tbig5-hkscs\\t" => "big5"]
expected: FAIL
["\\nbig5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs\\n" => "big5"]
expected: FAIL
["\\nbig5-hkscs\\n" => "big5"]
expected: FAIL
["\\fbig5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs\\f" => "big5"]
expected: FAIL
["\\fbig5-hkscs\\f" => "big5"]
expected: FAIL
["\\rbig5-hkscs" => "big5"]
expected: FAIL
["big5-hkscs\\r" => "big5"]
expected: FAIL
["\\rbig5-hkscs\\r" => "big5"]
expected: FAIL

View File

@ -0,0 +1,33 @@
<!doctype html>
<meta charset=big5> <!-- test breaks if the server overrides this -->
<script src=/resources/testharness.js></script>
<script src=/resources/testharnessreport.js></script>
<div id=log></div>
<script>
// Registers one testharness test that checks how the document's encoding
// (big5) encodes `input` when it appears in the query of a URL.
// `output` is the expected encoded form and `desc` completes the test name.
function encode(input, output, desc) {
  var testName = "big5 encoder: " + desc;
  test(function() {
    // <a> uses document encoding for URL's query
    var anchor = document.createElement("a");
    // Append and prepend X to test for off-by-one errors
    anchor.href = "https://example.com/?X" + input + "X";
    var query = anchor.search.substr(1); // remove leading "?"
    assert_equals(query, "X" + output + "X");
  }, testName);
}
encode("ab", "ab", "very basic")
// edge cases
encode("\u9EA6", "%26%2340614%3B", "Highest-pointer BMP character excluded from encoder");
encode("\uD858\uDE6B", "%26%23156267%3B", "Highest-pointer character excluded from encoder");
encode("\u3000", "%A1@", "Lowest-pointer character included in encoder");
encode("\u20AC", "%A3%E1", "Euro; the highest-pointer character before a range of 30 unmapped pointers");
encode("\u4E00", "%A4@", "The lowest-pointer character after the range of 30 unmapped pointers");
encode("\uD85D\uDE07", "%C8%A4", "The highest-pointer character before a range of 41 unmapped pointers");
encode("\uFFE2", "%C8%CD", "The lowest-pointer character after the range of 41 unmapped pointers");
encode("\u79D4", "%FE%FE", "The last character in the index");
// not in index
encode("\u2603", "%26%239731%3B", "The canonical BMP test character that is not in the index");
encode("\uD83D\uDCA9", "%26%23128169%3B", "The canonical astral test character that is not in the index");
// duplicate low bits
encode("\uD840\uDFB5", "%FDj", "A Plane 2 character whose low 16 bits match a BMP character that has a lower pointer");
// prefer last
encode("\u2550", "%F9%F9", "A duplicate-mapped code point that prefers the highest pointer in the encoder");
</script>