mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Bug 912470 part 2 - Implement Encoding Standard-compliant big5 encoder. r=emk.
This commit is contained in:
parent
3672b55134
commit
8b06149c69
@ -5,7 +5,6 @@
|
||||
# x-unicode is assumed for encodings not listed here
|
||||
|
||||
Big5=zh-TW
|
||||
Big5-HKSCS=zh=HK
|
||||
EUC-JP=ja
|
||||
EUC-KR=ko
|
||||
gb18030=zh-CN
|
||||
|
@ -27,7 +27,6 @@ EXPORTS += [
|
||||
'ucvja/nsUCVJACID.h',
|
||||
'ucvko/nsUCvKOCID.h',
|
||||
'ucvlatin/nsUCvLatinCID.h',
|
||||
'ucvtw/nsUCvTWCID.h',
|
||||
]
|
||||
|
||||
UNIFIED_SOURCES += [
|
||||
@ -137,6 +136,7 @@ UNIFIED_SOURCES += [
|
||||
]
|
||||
|
||||
UNIFIED_SOURCES += [
|
||||
'ucvtw/nsBIG5Data.cpp',
|
||||
'ucvtw/nsBIG5ToUnicode.cpp',
|
||||
'ucvtw/nsUnicodeToBIG5.cpp',
|
||||
]
|
||||
|
@ -96,6 +96,12 @@ public:
|
||||
* the first of a surrogate pair.
|
||||
* NS_ERROR_UENC_NOMAPPING if character without mapping
|
||||
* was encountered and the behavior was set to "signal".
|
||||
* In the case of an unmappable BMP character, aSrcLength
|
||||
* must indicate that the unmappable character was
|
||||
* consumed by the encoder (unlike in the decode API!).
|
||||
* In the case of an unmappable astral character,
|
||||
* aSrcLength must indicate that the high surrogate was
|
||||
* consumed by the encoder but the low surrogate was not.
|
||||
*/
|
||||
NS_IMETHOD Convert(const char16_t * aSrc, int32_t * aSrcLength,
|
||||
char * aDest, int32_t * aDestLength) = 0;
|
||||
|
@ -107,8 +107,6 @@
|
||||
#include "nsUnicodeToISO2022JP.h"
|
||||
|
||||
// ucvtw
|
||||
#include "nsUCvTWCID.h"
|
||||
#include "nsUCvTWDll.h"
|
||||
#include "nsBIG5ToUnicode.h"
|
||||
#include "nsUnicodeToBIG5.h"
|
||||
|
||||
@ -212,6 +210,7 @@ NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToISO2022JP)
|
||||
|
||||
// ucvtw
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsBIG5ToUnicode)
|
||||
NS_GENERIC_FACTORY_CONSTRUCTOR(nsUnicodeToBIG5)
|
||||
|
||||
// ucvko
|
||||
|
||||
@ -245,11 +244,6 @@ const uint16_t g_ASCIIMappingTable[] = {
|
||||
0x0001, 0x0004, 0x0005, 0x0008, 0x0000, 0x0000, 0x007F, 0x0000
|
||||
};
|
||||
|
||||
// ucvtw
|
||||
const uint16_t g_ufBig5Mapping[] = {
|
||||
#include "big5.uf"
|
||||
};
|
||||
|
||||
// ucvko
|
||||
const uint16_t g_utKSC5601Mapping[] = {
|
||||
#include "u20kscgl.ut"
|
||||
|
@ -12,3 +12,4 @@ skip-if = toolkit == 'android' #bug 775227
|
||||
[test_unicode_noncharacters_gb18030.html]
|
||||
[test_unicode_noncharacters_utf8.html]
|
||||
[test_utf8_overconsumption.html]
|
||||
[test_big5_encoder.html]
|
||||
|
43
intl/uconv/tests/test_big5_encoder.html
Normal file
43
intl/uconv/tests/test_big5_encoder.html
Normal file
@ -0,0 +1,43 @@
|
||||
<!DOCTYPE HTML>
|
||||
<html>
|
||||
<!--
|
||||
https://bugzilla.mozilla.org/show_bug.cgi?id=912470
|
||||
-->
|
||||
<head>
|
||||
<meta http-equiv="Content-type" content="text/html; charset=UTF-8">
|
||||
<title>Test for Unicode non-characters</title>
|
||||
<script type="text/javascript" src="/tests/SimpleTest/SimpleTest.js"></script>
|
||||
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css" />
|
||||
</head>
|
||||
<body onload="test()">
|
||||
<pre id="test">
|
||||
<script class="testbody" type="text/javascript">
|
||||
/* NOTE:
|
||||
* When we make our data: URL origin work as in Blink, this test will fail.
|
||||
* Hopefully, by that time our URL parser has become spec-compliant, so that
|
||||
* we'll pass the Web Platform Test for the big5 encoder
|
||||
* (testing/web-platform/tests/encoding/big5-encoder.html) and this test can
|
||||
* simply be removed.
|
||||
*/
|
||||
SimpleTest.waitForExplicitFinish();
|
||||
|
||||
function test() {
|
||||
var f = document.getElementsByTagName("iframe")[0];
|
||||
f.onload = function() {
|
||||
var href = f.contentWindow.location.href;
|
||||
var index = href.indexOf("?foo=");
|
||||
var actual = href.substring(index + 5);
|
||||
var expected = "h%26%2340614%3Bi%26%23156267%3Bj%A1%40k%A3%E1l%A4%40m%C8%A4n%C8%CDo%FE%FEp%26%238365%3Bq%FDjr%F9%F9s%26%23128169%3Bt";
|
||||
is(actual, expected, "Should have gotten the expected encode.");
|
||||
SimpleTest.finish();
|
||||
}
|
||||
f.contentDocument.forms[0].submit();
|
||||
}
|
||||
</script>
|
||||
</pre>
|
||||
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=912470">Mozilla Bug 912470</a>
|
||||
<p id="display"></p>
|
||||
<div id="content" style="display: none"><iframe src="data:text/html;charset=big5,<form><input name=foo value=h&%23x9EA6;i&%23x2626B;j&%23x3000;k&%23x20AC;l&%23x4E00;m&%23x27607;n&%23xFFE2;o&%23x79D4;p&%23x20AD;q&%23x203B5;r&%23x2550;s&%23x1F4A9;t></form>">
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
@ -11,7 +11,6 @@ function run_test() {
|
||||
// this list excludes codepages that can represent all Unicode
|
||||
var encoders = [
|
||||
"Big5",
|
||||
"Big5-HKSCS",
|
||||
"EUC-JP",
|
||||
"EUC-KR",
|
||||
"gbk",
|
||||
|
@ -5,12 +5,12 @@
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
# Adapted from
|
||||
# https://hg.mozilla.org/projects/htmlparser/file/3ac10f9e8612/generate-encoding-data.py
|
||||
# https://hg.mozilla.org/projects/htmlparser/file/0d906fb1ab90/generate-encoding-data.py
|
||||
|
||||
# indexes.json comes from
|
||||
# https://encoding.spec.whatwg.org/indexes.json
|
||||
# i.e.
|
||||
# https://github.com/whatwg/encoding/blob/a5215d07106e250dfef34908b99b3e4a576be2f6/indexes.json
|
||||
# https://github.com/whatwg/encoding/blob/ce4e83d0df5b5efec0697fc76e66699737e033a3/indexes.json
|
||||
|
||||
import json
|
||||
|
||||
@ -75,8 +75,8 @@ for codePoint in index:
|
||||
astralRanges = invertRanges(gaps, cap)
|
||||
|
||||
|
||||
includeFile = open("../ucvtw/nsBIG5DecoderData.h", "w")
|
||||
includeFile.write('''/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
classFile = open("../ucvtw/nsBIG5Data.cpp", "w")
|
||||
classFile.write('''/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
@ -85,14 +85,16 @@ includeFile.write('''/* This Source Code Form is subject to the terms of the Moz
|
||||
* Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
|
||||
*/
|
||||
|
||||
#include "nsBIG5Data.h"
|
||||
|
||||
static const char16_t kBig5LowBitsTable[] = {
|
||||
''')
|
||||
|
||||
for (low, high) in ranges:
|
||||
for i in xrange(low, high):
|
||||
includeFile.write(' 0x%04X,\n' % (index[i] & 0xFFFF))
|
||||
classFile.write(' 0x%04X,\n' % (index[i] & 0xFFFF))
|
||||
|
||||
includeFile.write('''};
|
||||
classFile.write('''};
|
||||
|
||||
static const uint32_t kBig5AstralnessTable[] = {
|
||||
''')
|
||||
@ -112,20 +114,20 @@ while i < len(bits):
|
||||
accu = 0
|
||||
for j in xrange(32):
|
||||
accu |= bits[i + j] << j
|
||||
includeFile.write(' 0x%08X,\n' % accu)
|
||||
classFile.write(' 0x%08X,\n' % accu)
|
||||
i += 32
|
||||
|
||||
includeFile.write('''};
|
||||
classFile.write('''};
|
||||
|
||||
// static
|
||||
char16_t
|
||||
nsBIG5ToUnicode::LowBits(size_t aPointer)
|
||||
nsBIG5Data::LowBits(size_t aPointer)
|
||||
{
|
||||
''')
|
||||
|
||||
base = 0
|
||||
for (low, high) in ranges:
|
||||
includeFile.write(''' if (aPointer < %d) {
|
||||
classFile.write(''' if (aPointer < %d) {
|
||||
return 0;
|
||||
}
|
||||
if (aPointer < %d) {
|
||||
@ -134,19 +136,19 @@ for (low, high) in ranges:
|
||||
''' % (low, high, base, low))
|
||||
base += (high - low)
|
||||
|
||||
includeFile.write(''' return 0;
|
||||
classFile.write(''' return 0;
|
||||
}
|
||||
|
||||
// static
|
||||
bool
|
||||
nsBIG5ToUnicode::IsAstral(size_t aPointer)
|
||||
nsBIG5Data::IsAstral(size_t aPointer)
|
||||
{
|
||||
''')
|
||||
|
||||
base = 0
|
||||
for (low, high) in astralRanges:
|
||||
if high - low == 1:
|
||||
includeFile.write(''' if (aPointer < %d) {
|
||||
classFile.write(''' if (aPointer < %d) {
|
||||
return false;
|
||||
}
|
||||
if (aPointer == %d) {
|
||||
@ -154,7 +156,7 @@ for (low, high) in astralRanges:
|
||||
}
|
||||
''' % (low, low))
|
||||
else:
|
||||
includeFile.write(''' if (aPointer < %d) {
|
||||
classFile.write(''' if (aPointer < %d) {
|
||||
return false;
|
||||
}
|
||||
if (aPointer < %d) {
|
||||
@ -164,7 +166,88 @@ for (low, high) in astralRanges:
|
||||
''' % (low, high, base, low))
|
||||
base += (high - low)
|
||||
|
||||
includeFile.write(''' return false;
|
||||
classFile.write(''' return false;
|
||||
}
|
||||
|
||||
//static
|
||||
size_t
|
||||
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
|
||||
{
|
||||
if (!aIsAstral) {
|
||||
switch (aLowBits) {
|
||||
''')
|
||||
|
||||
hkscsBound = (0xA1 - 0x81) * 157
|
||||
|
||||
preferLast = [
|
||||
0x2550,
|
||||
0x255E,
|
||||
0x2561,
|
||||
0x256A,
|
||||
0x5341,
|
||||
0x5345,
|
||||
]
|
||||
|
||||
for codePoint in preferLast:
|
||||
# Python lists don't have .rindex() :-(
|
||||
for i in xrange(len(index) - 1, -1, -1):
|
||||
candidate = index[i]
|
||||
if candidate == codePoint:
|
||||
classFile.write(''' case 0x%04X:
|
||||
return %d;
|
||||
''' % (codePoint, i))
|
||||
break
|
||||
|
||||
classFile.write(''' default:
|
||||
break;
|
||||
}
|
||||
}''')
|
||||
|
||||
base = 0
|
||||
start = 0
|
||||
for (low, high) in ranges:
|
||||
if low <= hkscsBound and hkscsBound < high:
|
||||
# This is the first range we don't ignore and the
|
||||
# range that contains the first non-HKSCS pointer.
|
||||
# Avoid searching HKSCS.
|
||||
start = base + hkscsBound - low
|
||||
break
|
||||
base += (high - low)
|
||||
|
||||
classFile.write('''
|
||||
for (size_t i = %d; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {
|
||||
if (kBig5LowBitsTable[i] == aLowBits) {
|
||||
size_t pointer;
|
||||
''' % start)
|
||||
|
||||
base = 0
|
||||
prevLow = 0
|
||||
prevHigh = 0
|
||||
prevBase = 0
|
||||
writing = False
|
||||
for (low, high) in ranges:
|
||||
if writing:
|
||||
classFile.write('''if (i < %d) {
|
||||
pointer = i + %d;
|
||||
} else ''' % ((prevBase + prevHigh - prevLow), (prevLow - prevBase)))
|
||||
prevLow = low
|
||||
prevHigh = high
|
||||
prevBase = base
|
||||
if high > hkscsBound:
|
||||
writing = True
|
||||
base += (high - low)
|
||||
|
||||
classFile.write('''{
|
||||
pointer = i + %d;
|
||||
}''' % (prevLow - prevBase))
|
||||
|
||||
classFile.write('''
|
||||
if (aIsAstral == IsAstral(pointer)) {
|
||||
return pointer;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
''')
|
||||
includeFile.close()
|
||||
classFile.close()
|
||||
|
@ -19,7 +19,7 @@
|
||||
"iso-8859-15":[128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,162,163,8364,165,352,167,353,169,170,171,172,173,174,175,176,177,178,179,381,181,182,183,382,185,186,187,338,339,376,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255],
|
||||
"iso-8859-16":[128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,260,261,321,8364,8222,352,167,353,169,536,171,377,173,378,379,176,177,268,322,381,8221,182,183,382,269,537,187,338,339,376,380,192,193,194,258,196,262,198,199,200,201,202,203,204,205,206,207,272,323,210,211,212,336,214,346,368,217,218,219,220,280,538,223,224,225,226,259,228,263,230,231,232,233,234,235,236,237,238,239,273,324,242,243,244,337,246,347,369,249,250,251,252,281,539,255],
|
||||
"koi8-r":[9472,9474,9484,9488,9492,9496,9500,9508,9516,9524,9532,9600,9604,9608,9612,9616,9617,9618,9619,8992,9632,8729,8730,8776,8804,8805,160,8993,176,178,183,247,9552,9553,9554,1105,9555,9556,9557,9558,9559,9560,9561,9562,9563,9564,9565,9566,9567,9568,9569,1025,9570,9571,9572,9573,9574,9575,9576,9577,9578,9579,9580,169,1102,1072,1073,1094,1076,1077,1092,1075,1093,1080,1081,1082,1083,1084,1085,1086,1087,1103,1088,1089,1090,1091,1078,1074,1100,1099,1079,1096,1101,1097,1095,1098,1070,1040,1041,1062,1044,1045,1060,1043,1061,1048,1049,1050,1051,1052,1053,1054,1055,1071,1056,1057,1058,1059,1046,1042,1068,1067,1047,1064,1069,1065,1063,1066],
|
||||
"koi8-u":[9472,9474,9484,9488,9492,9496,9500,9508,9516,9524,9532,9600,9604,9608,9612,9616,9617,9618,9619,8992,9632,8729,8730,8776,8804,8805,160,8993,176,178,183,247,9552,9553,9554,1105,1108,9556,1110,1111,9559,9560,9561,9562,9563,1169,9565,9566,9567,9568,9569,1025,1028,9571,1030,1031,9574,9575,9576,9577,9578,1168,9580,169,1102,1072,1073,1094,1076,1077,1092,1075,1093,1080,1081,1082,1083,1084,1085,1086,1087,1103,1088,1089,1090,1091,1078,1074,1100,1099,1079,1096,1101,1097,1095,1098,1070,1040,1041,1062,1044,1045,1060,1043,1061,1048,1049,1050,1051,1052,1053,1054,1055,1071,1056,1057,1058,1059,1046,1042,1068,1067,1047,1064,1069,1065,1063,1066],
|
||||
"koi8-u":[9472,9474,9484,9488,9492,9496,9500,9508,9516,9524,9532,9600,9604,9608,9612,9616,9617,9618,9619,8992,9632,8729,8730,8776,8804,8805,160,8993,176,178,183,247,9552,9553,9554,1105,1108,9556,1110,1111,9559,9560,9561,9562,9563,1169,1118,9566,9567,9568,9569,1025,1028,9571,1030,1031,9574,9575,9576,9577,9578,1168,1038,169,1102,1072,1073,1094,1076,1077,1092,1075,1093,1080,1081,1082,1083,1084,1085,1086,1087,1103,1088,1089,1090,1091,1078,1074,1100,1099,1079,1096,1101,1097,1095,1098,1070,1040,1041,1062,1044,1045,1060,1043,1061,1048,1049,1050,1051,1052,1053,1054,1055,1071,1056,1057,1058,1059,1046,1042,1068,1067,1047,1064,1069,1065,1063,1066],
|
||||
"macintosh":[196,197,199,201,209,214,220,225,224,226,228,227,229,231,233,232,234,235,237,236,238,239,241,243,242,244,246,245,250,249,251,252,8224,176,162,163,167,8226,182,223,174,169,8482,180,168,8800,198,216,8734,177,8804,8805,165,181,8706,8721,8719,960,8747,170,186,937,230,248,191,161,172,8730,402,8776,8710,171,187,8230,160,192,195,213,338,339,8211,8212,8220,8221,8216,8217,247,9674,255,376,8260,8364,8249,8250,64257,64258,8225,183,8218,8222,8240,194,202,193,203,200,205,206,207,204,211,212,63743,210,218,219,217,305,710,732,175,728,729,730,184,733,731,711],
|
||||
"windows-874":[8364,129,130,131,132,8230,134,135,136,137,138,139,140,141,142,143,144,8216,8217,8220,8221,8226,8211,8212,152,153,154,155,156,157,158,159,160,3585,3586,3587,3588,3589,3590,3591,3592,3593,3594,3595,3596,3597,3598,3599,3600,3601,3602,3603,3604,3605,3606,3607,3608,3609,3610,3611,3612,3613,3614,3615,3616,3617,3618,3619,3620,3621,3622,3623,3624,3625,3626,3627,3628,3629,3630,3631,3632,3633,3634,3635,3636,3637,3638,3639,3640,3641,3642,null,null,null,null,3647,3648,3649,3650,3651,3652,3653,3654,3655,3656,3657,3658,3659,3660,3661,3662,3663,3664,3665,3666,3667,3668,3669,3670,3671,3672,3673,3674,3675,null,null,null,null],
|
||||
"windows-1250":[8364,129,8218,131,8222,8230,8224,8225,136,8240,352,8249,346,356,381,377,144,8216,8217,8220,8221,8226,8211,8212,152,8482,353,8250,347,357,382,378,160,711,728,321,164,260,166,167,168,169,350,171,172,173,174,379,176,177,731,322,180,181,182,183,184,261,351,187,317,733,318,380,340,193,194,258,196,313,262,199,268,201,280,203,282,205,206,270,272,323,327,211,212,336,214,215,344,366,218,368,220,221,354,223,341,225,226,259,228,314,263,231,269,233,281,235,283,237,238,271,273,324,328,243,244,337,246,247,345,367,250,369,252,253,355,729],
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -7,6 +7,8 @@
|
||||
* Instead, please regenerate using intl/uconv/tools/gen-big5-data.py
|
||||
*/
|
||||
|
||||
#include "nsBIG5Data.h"
|
||||
|
||||
static const char16_t kBig5LowBitsTable[] = {
|
||||
0x43F0,
|
||||
0x4C32,
|
||||
@ -18807,7 +18809,7 @@ static const uint32_t kBig5AstralnessTable[] = {
|
||||
|
||||
// static
|
||||
char16_t
|
||||
nsBIG5ToUnicode::LowBits(size_t aPointer)
|
||||
nsBIG5Data::LowBits(size_t aPointer)
|
||||
{
|
||||
if (aPointer < 942) {
|
||||
return 0;
|
||||
@ -18844,7 +18846,7 @@ nsBIG5ToUnicode::LowBits(size_t aPointer)
|
||||
|
||||
// static
|
||||
bool
|
||||
nsBIG5ToUnicode::IsAstral(size_t aPointer)
|
||||
nsBIG5Data::IsAstral(size_t aPointer)
|
||||
{
|
||||
if (aPointer < 947) {
|
||||
return false;
|
||||
@ -18910,3 +18912,43 @@ nsBIG5ToUnicode::IsAstral(size_t aPointer)
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//static
|
||||
size_t
|
||||
nsBIG5Data::FindPointer(char16_t aLowBits, bool aIsAstral)
|
||||
{
|
||||
if (!aIsAstral) {
|
||||
switch (aLowBits) {
|
||||
case 0x2550:
|
||||
return 18991;
|
||||
case 0x255E:
|
||||
return 18975;
|
||||
case 0x2561:
|
||||
return 18977;
|
||||
case 0x256A:
|
||||
return 18976;
|
||||
case 0x5341:
|
||||
return 5512;
|
||||
case 0x5345:
|
||||
return 5599;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
for (size_t i = 3967; i < MOZ_ARRAY_LENGTH(kBig5LowBitsTable); ++i) {
|
||||
if (kBig5LowBitsTable[i] == aLowBits) {
|
||||
size_t pointer;
|
||||
if (i < 4409) {
|
||||
pointer = i + 1057;
|
||||
} else if (i < 10128) {
|
||||
pointer = i + 1086;
|
||||
} else {
|
||||
pointer = i + 1126;
|
||||
}
|
||||
if (aIsAstral == IsAstral(pointer)) {
|
||||
return pointer;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
@ -3,10 +3,16 @@
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsUCvTWDll_h_
|
||||
#define nsUCvTWDll_h_
|
||||
#ifndef nsBIG5Data_h_
|
||||
#define nsBIG5Data_h_
|
||||
|
||||
extern const uint16_t g_ufBig5Mapping[];
|
||||
extern const uint16_t g_ASCIIMappingTable[];
|
||||
class nsBIG5Data
|
||||
{
|
||||
public:
|
||||
static char16_t LowBits(size_t aPointer);
|
||||
static bool IsAstral(size_t aPointer);
|
||||
static size_t FindPointer(char16_t aLowBits, bool aIsAstral);
|
||||
};
|
||||
|
||||
#endif /* nsBIG5Data_h_ */
|
||||
|
||||
#endif /* nsUCvTWDll_h_ */
|
@ -6,8 +6,7 @@
|
||||
#include "nsBIG5ToUnicode.h"
|
||||
#include "mozilla/BinarySearch.h"
|
||||
#include "mozilla/ArrayUtils.h"
|
||||
|
||||
#include "nsBIG5DecoderData.h"
|
||||
#include "nsBIG5Data.h"
|
||||
|
||||
nsBIG5ToUnicode::nsBIG5ToUnicode()
|
||||
: mPendingTrail(0)
|
||||
@ -90,7 +89,7 @@ nsBIG5ToUnicode::Convert(const char* aSrc,
|
||||
outTrail = 0x030C;
|
||||
break;
|
||||
default:
|
||||
char16_t lowBits = LowBits(pointer);
|
||||
char16_t lowBits = nsBIG5Data::LowBits(pointer);
|
||||
if (!lowBits) {
|
||||
if (b <= 0x7F) {
|
||||
// prepend byte to stream
|
||||
@ -107,7 +106,7 @@ nsBIG5ToUnicode::Convert(const char* aSrc,
|
||||
*out++ = 0xFFFD;
|
||||
continue;
|
||||
}
|
||||
if (IsAstral(pointer)) {
|
||||
if (nsBIG5Data::IsAstral(pointer)) {
|
||||
uint32_t codePoint = uint32_t(lowBits) | 0x20000;
|
||||
*out++ = char16_t(0xD7C0 + (codePoint >> 10));
|
||||
outTrail = char16_t(0xDC00 + (codePoint & 0x3FF));
|
||||
|
@ -32,9 +32,6 @@ public:
|
||||
NS_IMETHOD Reset();
|
||||
|
||||
private:
|
||||
static char16_t LowBits(size_t aPointer);
|
||||
static bool IsAstral(size_t aPointer);
|
||||
|
||||
char16_t mPendingTrail;
|
||||
uint8_t mBig5Lead;
|
||||
};
|
||||
|
@ -1,16 +0,0 @@
|
||||
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsUCvTWCID_h___
|
||||
#define nsUCvTWCID_h___
|
||||
|
||||
#include "nsISupports.h"
|
||||
|
||||
// Class ID for our UnicodeToBIG5 charset converter
|
||||
// {EFC323E2-EC62-11d2-8AAC-00600811A836}
|
||||
#define NS_UNICODETOBIG5_CID \
|
||||
{ 0xefc323e2, 0xec62, 0x11d2, {0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36}}
|
||||
|
||||
#endif /* nsUCvTWCID_h___ */
|
@ -4,35 +4,248 @@
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "nsUnicodeToBIG5.h"
|
||||
#include "nsUCvTWDll.h"
|
||||
#include "nsUCConstructors.h"
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Global functions and data [declaration]
|
||||
NS_IMPL_ADDREF(nsUnicodeToBIG5)
|
||||
NS_IMPL_RELEASE(nsUnicodeToBIG5)
|
||||
NS_IMPL_QUERY_INTERFACE(nsUnicodeToBIG5,
|
||||
nsIUnicodeEncoder)
|
||||
|
||||
|
||||
static const uint16_t *g_Big5MappingTable[2] = {
|
||||
g_ASCIIMappingTable,
|
||||
g_ufBig5Mapping
|
||||
};
|
||||
|
||||
static const uScanClassID g_Big5ScanClassIDs[2] = {
|
||||
u1ByteCharset,
|
||||
u2BytesCharset
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Class nsUnicodeToBIG5 [implementation]
|
||||
|
||||
nsresult
|
||||
nsUnicodeToBIG5Constructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult)
|
||||
nsUnicodeToBIG5::nsUnicodeToBIG5()
|
||||
: mUtf16Lead(0)
|
||||
, mPendingTrail(0)
|
||||
, mSignal(true) // as in nsEncoderSupport
|
||||
{
|
||||
|
||||
return CreateMultiTableEncoder(2,
|
||||
(uScanClassID*) &g_Big5ScanClassIDs,
|
||||
(uMappingTable**) &g_Big5MappingTable,
|
||||
2 /* max length = src * 2 */,
|
||||
aOuter, aIID, aResult);
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::Convert(const char16_t* aSrc,
|
||||
int32_t* aSrcLength,
|
||||
char* aDest,
|
||||
int32_t * aDestLength)
|
||||
{
|
||||
const char16_t* in = aSrc;
|
||||
const char16_t* inEnd = in + *aSrcLength;
|
||||
uint8_t* out = reinterpret_cast<uint8_t*>(aDest);
|
||||
uint8_t* outEnd = out + *aDestLength;
|
||||
|
||||
MOZ_ASSERT(!(mPendingTrail && mUtf16Lead),
|
||||
"Can't have both pending output and pending input.");
|
||||
|
||||
if (mPendingTrail) {
|
||||
if (out == outEnd) {
|
||||
*aSrcLength = 0;
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
*out++ = mPendingTrail;
|
||||
mPendingTrail = 0;
|
||||
}
|
||||
for (;;) {
|
||||
if (in == inEnd) {
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_OK_UENC_MOREINPUT;
|
||||
}
|
||||
if (out == outEnd) {
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
bool isAstral; // true means Plane 2, false means BMP
|
||||
char16_t lowBits; // The low 16 bits of the code point
|
||||
char16_t codeUnit = *in++;
|
||||
size_t highBits = (codeUnit & 0xFC00);
|
||||
if (highBits == 0xD800) {
|
||||
// high surrogate
|
||||
if (mUtf16Lead) {
|
||||
// High surrogate follows another high surrogate. The
|
||||
// *previous* code unit is in error.
|
||||
if (mSignal) {
|
||||
mUtf16Lead = 0;
|
||||
// NOTE: Encode API differs from decode API!
|
||||
--in;
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
}
|
||||
mUtf16Lead = codeUnit;
|
||||
continue;
|
||||
}
|
||||
if (highBits == 0xDC00) {
|
||||
// low surrogate
|
||||
if (!mUtf16Lead) {
|
||||
// Got low surrogate without a previous high surrogate
|
||||
if (mSignal) {
|
||||
// NOTE: Encode API differs from decode API!
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
continue;
|
||||
}
|
||||
size_t codePoint = (mUtf16Lead << 10) + codeUnit -
|
||||
(((0xD800 << 10) - 0x10000) + 0xDC00);
|
||||
mUtf16Lead = 0;
|
||||
// Plane 2 is the only astral plane that has potentially
|
||||
// Big5-encodable characters.
|
||||
if ((0xFF0000 & codePoint) != 0x20000) {
|
||||
if (mSignal) {
|
||||
// NOTE: Encode API differs from decode API!
|
||||
// nsSaveAsCharset wants us to back up one step in the case of a
|
||||
// surrogate pair.
|
||||
--in;
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
continue;
|
||||
}
|
||||
isAstral = true;
|
||||
lowBits = (char16_t)(codePoint & 0xFFFF);
|
||||
} else {
|
||||
// not a surrogate
|
||||
if (mUtf16Lead) {
|
||||
// Non-surrogate follows a high surrogate. The *previous*
|
||||
// code unit is in error.
|
||||
mUtf16Lead = 0;
|
||||
if (mSignal) {
|
||||
// NOTE: Encode API differs from decode API!
|
||||
--in;
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
// Let's unconsume this code unit and reloop in order to
|
||||
// re-check if the output buffer still has space.
|
||||
--in;
|
||||
continue;
|
||||
}
|
||||
isAstral = false;
|
||||
lowBits = codeUnit;
|
||||
}
|
||||
// isAstral now tells us if we have a Plane 2 or a BMP character.
|
||||
// lowBits tells us the low 16 bits.
|
||||
// After all the above setup to deal with UTF-16, we are now
|
||||
// finally ready to follow the spec.
|
||||
if (!isAstral && lowBits <= 0x7F) {
|
||||
*out++ = (uint8_t)lowBits;
|
||||
continue;
|
||||
}
|
||||
size_t pointer = nsBIG5Data::FindPointer(lowBits, isAstral);
|
||||
if (!pointer) {
|
||||
if (mSignal) {
|
||||
// NOTE: Encode API differs from decode API!
|
||||
if (isAstral) {
|
||||
// nsSaveAsCharset wants us to back up one step in the case of a
|
||||
// surrogate pair.
|
||||
--in;
|
||||
}
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_ERROR_UENC_NOMAPPING;
|
||||
}
|
||||
*out++ = '?';
|
||||
continue;
|
||||
}
|
||||
uint8_t lead = (uint8_t)(pointer / 157 + 0x81);
|
||||
uint8_t trail = (uint8_t)(pointer % 157);
|
||||
if (trail < 0x3F) {
|
||||
trail += 0x40;
|
||||
} else {
|
||||
trail += 0x62;
|
||||
}
|
||||
*out++ = lead;
|
||||
if (out == outEnd) {
|
||||
mPendingTrail = trail;
|
||||
*aSrcLength = in - aSrc;
|
||||
*aDestLength = out - reinterpret_cast<uint8_t*>(aDest);
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
*out++ = trail;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::Finish(char* aDest,
|
||||
int32_t* aDestLength)
|
||||
{
|
||||
MOZ_ASSERT(!(mPendingTrail && mUtf16Lead),
|
||||
"Can't have both pending output and pending input.");
|
||||
uint8_t* out = reinterpret_cast<uint8_t*>(aDest);
|
||||
if (mPendingTrail) {
|
||||
if (*aDestLength < 1) {
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
*out = mPendingTrail;
|
||||
mPendingTrail = 0;
|
||||
*aDestLength = 1;
|
||||
return NS_OK;
|
||||
}
|
||||
if (mUtf16Lead) {
|
||||
if (*aDestLength < 1) {
|
||||
*aDestLength = 0;
|
||||
return NS_OK_UENC_MOREOUTPUT;
|
||||
}
|
||||
// The API doesn't support signaling an error. It pretends that malformed
|
||||
// input doesn't exist. The UTF-8 encoder outputs the replacement character
|
||||
// unconditionally.
|
||||
mUtf16Lead = 0;
|
||||
*out = '?';
|
||||
*aDestLength = 1;
|
||||
return NS_OK;
|
||||
}
|
||||
*aDestLength = 0;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::GetMaxLength(const char16_t* aSrc,
|
||||
int32_t aSrcLength,
|
||||
int32_t* aDestLength)
|
||||
{
|
||||
*aDestLength = (aSrcLength * 2) +
|
||||
(mPendingTrail ? 1 : 0) +
|
||||
// If the lead ends up being paired, the bytes produced
|
||||
// are already included above.
|
||||
// If not, it produces a single '?'.
|
||||
(mUtf16Lead ? 1 : 0);
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::Reset()
|
||||
{
|
||||
mUtf16Lead = 0;
|
||||
mPendingTrail = 0;
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsUnicodeToBIG5::SetOutputErrorBehavior(int32_t aBehavior,
|
||||
nsIUnicharEncoder* aEncoder,
|
||||
char16_t aChar)
|
||||
{
|
||||
switch (aBehavior) {
|
||||
case kOnError_Signal:
|
||||
mSignal = true;
|
||||
break;
|
||||
case kOnError_Replace:
|
||||
mSignal = false;
|
||||
MOZ_ASSERT(aChar == '?', "Unsupported replacement.");
|
||||
break;
|
||||
case kOnError_CallBack:
|
||||
MOZ_ASSERT_UNREACHABLE("kOnError_CallBack is supposed to be unused.");
|
||||
break;
|
||||
default:
|
||||
MOZ_ASSERT_UNREACHABLE("Non-existent enum item.");
|
||||
break;
|
||||
}
|
||||
return NS_OK;
|
||||
}
|
||||
|
@ -3,19 +3,48 @@
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#ifndef nsUnicodeToBIG5_h___
|
||||
#define nsUnicodeToBIG5_h___
|
||||
#ifndef nsUnicodeToBIG5_h_
|
||||
#define nsUnicodeToBIG5_h_
|
||||
|
||||
#include "nsISupports.h"
|
||||
#include "nsIUnicodeEncoder.h"
|
||||
|
||||
/**
|
||||
* A character set converter from Unicode to BIG5.
|
||||
*
|
||||
* @created 06/Apr/1999
|
||||
* @author Catalin Rotaru [CATA]
|
||||
*/
|
||||
nsresult
|
||||
nsUnicodeToBIG5Constructor(nsISupports *aOuter, REFNSIID aIID,
|
||||
void **aResult);
|
||||
#define NS_UNICODETOBIG5_CID \
|
||||
{ 0xefc323e2, 0xec62, 0x11d2, \
|
||||
{ 0x8a, 0xac, 0x0, 0x60, 0x8, 0x11, 0xa8, 0x36 } }
|
||||
|
||||
#endif /* nsUnicodeToBIG5_h___ */
|
||||
class nsUnicodeToBIG5 : public nsIUnicodeEncoder
|
||||
{
|
||||
public:
|
||||
// Encoders probably shouldn't use the thread-safe variant, but we should
|
||||
// make a systematic change instead of making this class different.
|
||||
NS_DECL_THREADSAFE_ISUPPORTS
|
||||
|
||||
nsUnicodeToBIG5();
|
||||
|
||||
NS_IMETHOD Convert(const char16_t* aSrc,
|
||||
int32_t* aSrcLength,
|
||||
char* aDest,
|
||||
int32_t * aDestLength);
|
||||
|
||||
NS_IMETHOD Finish(char* aDest,
|
||||
int32_t* aDestLength);
|
||||
|
||||
MOZ_WARN_UNUSED_RESULT NS_IMETHOD GetMaxLength(const char16_t* aSrc,
|
||||
int32_t aSrcLength,
|
||||
int32_t* aDestLength);
|
||||
|
||||
NS_IMETHOD Reset();
|
||||
|
||||
NS_IMETHOD SetOutputErrorBehavior(int32_t aBehavior,
|
||||
nsIUnicharEncoder* aEncoder,
|
||||
char16_t aChar);
|
||||
|
||||
private:
|
||||
virtual ~nsUnicodeToBIG5(){};
|
||||
|
||||
char16_t mUtf16Lead;
|
||||
uint8_t mPendingTrail;
|
||||
bool mSignal;
|
||||
};
|
||||
|
||||
#endif /* nsUnicodeToBIG5_h_ */
|
||||
|
@ -1 +0,0 @@
|
||||
We should put Big5 converter into this directory/dll
|
@ -29051,7 +29051,16 @@
|
||||
},
|
||||
"local_changes": {
|
||||
"deleted": [],
|
||||
"items": {},
|
||||
"items": {
|
||||
"testharness": {
|
||||
"encoding/big5-encoder.html": [
|
||||
{
|
||||
"path": "encoding/big5-encoder.html",
|
||||
"url": "/encoding/big5-encoder.html"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"reftest_nodes": {}
|
||||
},
|
||||
"reftest_nodes": {
|
||||
|
@ -444,12 +444,6 @@
|
||||
[Name "hz-gb-2312" has label "hz-gb-2312" (inputEncoding)]
|
||||
expected: FAIL
|
||||
|
||||
[Name "big5" has label "big5-hkscs" (characterSet)]
|
||||
expected: FAIL
|
||||
|
||||
[Name "big5" has label "big5-hkscs" (inputEncoding)]
|
||||
expected: FAIL
|
||||
|
||||
[Name "replacement" has label "csiso2022kr" (characterSet)]
|
||||
expected: FAIL
|
||||
|
||||
|
14
testing/web-platform/meta/encoding/big5-encoder.html.ini
Normal file
14
testing/web-platform/meta/encoding/big5-encoder.html.ini
Normal file
@ -0,0 +1,14 @@
|
||||
[big5-encoder.html]
|
||||
type: testharness
|
||||
[big5 encoder: Highest-pointer BMP character excluded from encoder]
|
||||
expected: FAIL
|
||||
|
||||
[big5 encoder: Highest-pointer character excluded from encoder]
|
||||
expected: FAIL
|
||||
|
||||
[big5 encoder: The canonical BMP test character that is not in the index]
|
||||
expected: FAIL
|
||||
|
||||
[big5 encoder: The canonical astral test character that is not in the index]
|
||||
expected: FAIL
|
||||
|
@ -1,53 +0,0 @@
|
||||
[textdecoder-labels.html]
|
||||
type: testharness
|
||||
[name=big5 label=big5-hkscs]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
[" big5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs " => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
[" big5-hkscs " => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\tbig5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs\\t" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\tbig5-hkscs\\t" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\nbig5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs\\n" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\nbig5-hkscs\\n" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\fbig5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs\\f" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\fbig5-hkscs\\f" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\rbig5-hkscs" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["big5-hkscs\\r" => "big5"]
|
||||
expected: FAIL
|
||||
|
||||
["\\rbig5-hkscs\\r" => "big5"]
|
||||
expected: FAIL
|
||||
|
33
testing/web-platform/tests/encoding/big5-encoder.html
Normal file
33
testing/web-platform/tests/encoding/big5-encoder.html
Normal file
@ -0,0 +1,33 @@
|
||||
<!doctype html>
|
||||
<meta charset=big5> <!-- test breaks if the server overrides this -->
|
||||
<script src=/resources/testharness.js></script>
|
||||
<script src=/resources/testharnessreport.js></script>
|
||||
<div id=log></div>
|
||||
<script>
|
||||
function encode(input, output, desc) {
|
||||
test(function() {
|
||||
var a = document.createElement("a"); // <a> uses document encoding for URL's query
|
||||
// Append and prepend X to test for off-by-one errors
|
||||
a.href = "https://example.com/?X" + input + "X";
|
||||
assert_equals(a.search.substr(1), "X" + output + "X"); // remove leading "?"
|
||||
}, "big5 encoder: " + desc);
|
||||
}
|
||||
|
||||
encode("ab", "ab", "very basic")
|
||||
// edge cases
|
||||
encode("\u9EA6", "%26%2340614%3B", "Highest-pointer BMP character excluded from encoder");
|
||||
encode("\uD858\uDE6B", "%26%23156267%3B", "Highest-pointer character excluded from encoder");
|
||||
encode("\u3000", "%A1@", "Lowest-pointer character included in encoder");
|
||||
encode("\u20AC", "%A3%E1", "Euro; the highest-pointer character before a range of 30 unmapped pointers");
|
||||
encode("\u4E00", "%A4@", "The lowest-pointer character after the range of 30 unmapped pointers");
|
||||
encode("\uD85D\uDE07", "%C8%A4", "The highest-pointer character before a range of 41 unmapped pointers");
|
||||
encode("\uFFE2", "%C8%CD", "The lowest-pointer character after the range of 41 unmapped pointers");
|
||||
encode("\u79D4", "%FE%FE", "The last character in the index");
|
||||
// not in index
|
||||
encode("\u2603", "%26%239731%3B", "The canonical BMP test character that is not in the index");
|
||||
encode("\uD83D\uDCA9", "%26%23128169%3B", "The canonical astral test character that is not in the index");
|
||||
// duplicate low bits
|
||||
encode("\uD840\uDFB5", "%FDj", "A Plane 2 character whose low 16 bits match a BMP character that has a lower pointer");
|
||||
// prefer last
|
||||
encode("\u2550", "%F9%F9", "A duplicate-mapped code point that prefers the highest pointer in the encoder");
|
||||
</script>
|
Loading…
Reference in New Issue
Block a user