Do Hankaku to Zenkaku conversion in the ISO-2022-JP encoder. Bug 563283, r=Masatoshi Kimura <VYV03354@nifty.ne.jp>

This commit is contained in:
Simon Montagu 2010-06-21 15:34:11 +03:00
parent 2007687e8b
commit 1650207417
10 changed files with 155 additions and 298 deletions

View File

@ -75,8 +75,6 @@ static const nsModuleComponentInfo components[] =
NS_ENTITYCONVERTER_CONTRACTID, nsEntityConverterConstructor },
{ "Unicode To Charset Converter", NS_SAVEASCHARSET_CID,
NS_SAVEASCHARSET_CONTRACTID, nsSaveAsCharsetConstructor},
{ "Japanese Hankaku To Zenkaku", NS_HANKAKUTOZENKAKU_CID,
NS_HANKAKUTOZENKAKU_CONTRACTID, CreateNewHankakuToZenkaku},
{ "Unicode Normlization", NS_UNICODE_NORMALIZER_CID,
NS_UNICODE_NORMALIZER_CONTRACTID, nsUnicodeNormalizerConstructor},

View File

@ -0,0 +1,52 @@
// Tests conversion from Unicode to ISO-2022-JP with Hankaku characters
load('CharsetConversionTests.js');
// Unicode inputs for the encoder test. Each entry is a string literal
// preceded by a comment showing the characters it contains and, where the
// Hankaku (halfwidth) input has a Zenkaku (fullwidth) equivalent, followed by
// a comment showing that equivalent. Entry i is paired with
// expectedStrings[i] by run_test().
const inStrings = [
// All halfwidth punctuation and katakana, U+FF61..U+FF9F:
// 。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゙゚
"\uFF61\uFF62\uFF63\uFF64\uFF65\uFF66\uFF67\uFF68\uFF69\uFF6A\uFF6B\uFF6C\uFF6D\uFF6E\uFF6F\uFF70\uFF71\uFF72\uFF73\uFF74\uFF75\uFF76\uFF77\uFF78\uFF79\uFF7A\uFF7B\uFF7C\uFF7D\uFF7E\uFF7F\uFF80\uFF81\uFF82\uFF83\uFF84\uFF85\uFF86\uFF87\uFF88\uFF89\uFF8A\uFF8B\uFF8C\uFF8D\uFF8E\uFF8F\uFF90\uFF91\uFF92\uFF93\uFF94\uFF95\uFF96\uFF97\uFF98\uFF99\uFF9A\uFF9B\uFF9C\uFF9D\uFF9E\uFF9F",
// equivalent to
// 。「」、・ヲァィゥェォャュョッーアイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワン゛゜
// \u3002\u300c\u300d\u3001\u30fb\u30f2\u30a1\u30a3\u30a5\u30a7\u30a9\u30e3\u30e5\u30e7\u30c3\u30fc\u30a2\u30a4\u30a6\u30a8\u30aa\u30ab\u30ad\u30af\u30b1\u30b3\u30b5\u30b7\u30b9\u30bb\u30bd\u30bf\u30c1\u30c4\u30c6\u30c8\u30ca\u30cb\u30cc\u30cd\u30ce\u30cf\u30d2\u30d5\u30d8\u30db\u30de\u30df\u30e0\u30e1\u30e2\u30e4\u30e6\u30e8\u30e9\u30ea\u30eb\u30ec\u30ed\u30ef\u30f3\u309b\u309c
// Halfwidth katakana each followed by the halfwidth voiced mark (U+FF9E):
// ガギグゲゴザジズゼゾダヂヅデドバビブベボ
"\uFF76\uFF9E\uFF77\uFF9E\uFF78\uFF9E\uFF79\uFF9E\uFF7A\uFF9E\uFF7B\uFF9E\uFF7C\uFF9E\uFF7D\uFF9E\uFF7E\uFF9E\uFF7F\uFF9E\uFF80\uFF9E\uFF81\uFF9E\uFF82\uFF9E\uFF83\uFF9E\uFF84\uFF9E\uFF8A\uFF9E\uFF8B\uFF9E\uFF8C\uFF9E\uFF8D\uFF9E\uFF8E\uFF9E",
// equivalent to
// ガギグゲゴザジズゼゾダヂヅデドバビブベボ
// \u30AC\u30AE\u30B0\u30B2\u30B4\u30B6\u30B8\u30BA\u30BC\u30BE\u30C0\u30C2\u30C5\u30C7\u30C9\u30D0\u30D3\u30D6\u30D9\u30DC
// Halfwidth katakana each followed by the halfwidth semi-voiced mark (U+FF9F):
// パピプペポ
"\uFF8A\uFF9F\uFF8B\uFF9F\uFF8C\uFF9F\uFF8D\uFF9F\uFF8E\uFF9F",
// equivalent to
// パピプペポ
// \u30D1\u30D4\u30D7\u30DA\u30DD
// Hankaku preceded and followed by regular Katakana (no change of charset)
// フランツ・ヨーゼフ・ハイドン
"\u30D5\u30E9\u30F3\u30C4\u30FB\uFF96\uFF70\uFF7E\uFF9E\uFF8C\u30FB\u30CF\u30A4\u30C9\u30F3",
// Hankaku preceded and followed by Roman (charset change)
// Mozilla (モジラ) Foundation
"Mozilla (\uFF93\uFF7C\uFF9E\uFF97) Foundation",
// Hankaku preceded and followed by unencodable characters
// दिल्ली・デリー・ਦਿੱਲੀ
"\u0926\u093F\u0932\u094D\u0932\u0940\uFF65\uFF83\uFF9E\uFF98\uFF70\uFF65\u0A26\u0A3F\u0A71\u0A32\u0A40"
];
// Expected ISO-2022-JP output for each entry of inStrings, in the same order
// (run_test() compares inStrings[i] against expectedStrings[i]).
// In ISO-2022-JP (RFC 1468) "\x1B$B" is the escape sequence that selects
// JIS X 0208 and "\x1B(B" the one that selects ASCII; the two-byte pairs in
// between are the fullwidth characters.
const expectedStrings = [
// all halfwidth punctuation and katakana, converted to fullwidth
"\x1B$B!#!V!W!\x22!&%r%!%#%%%'%)%c%e%g%C!<%\x22%$%&%(%*%+%-%/%1%3%5%7%9%;%=%?%A%D%F%H%J%K%L%M%N%O%R%U%X%[%^%_%`%a%b%d%f%h%i%j%k%l%m%o%s!+!,\x1B(B",
// katakana + voiced mark, each pair merged into one fullwidth character
"\x1B$B%,%.%0%2%4%6%8%:%<%>%@%B%E%G%I%P%S%V%Y%\x5C\x1B(B",
// katakana + semi-voiced mark, each pair merged into one fullwidth character
"\x1B$B%Q%T%W%Z%]\x1B(B",
// Hankaku inside regular Katakana: a single escape pair around the whole run
"\x1B$B%U%i%s%D!&%h!<%<%U!&%O%$%I%s\x1B(B",
// Hankaku inside Roman text: escapes only around the converted run
"Mozilla (\x1B$B%b%8%i\x1B(B) Foundation",
// unencodable characters on both sides come out as one "?" each
"??????\x1B$B!&%G%j!<!&\x1B(B?????"
];
// Test entry point: encodes every entry of inStrings to ISO-2022-JP with a
// freshly created converter and checks it against the entry at the same
// index in expectedStrings.
function run_test()
{
  inStrings.forEach(function (unicodeText, index) {
    var converter = CreateScriptableConverter();
    checkEncode(converter, "ISO-2022-JP",
                unicodeText, expectedStrings[index]);
  });
}

View File

@ -43,6 +43,41 @@
//----------------------------------------------------------------------
// Global functions and data [declaration]
// Basic mapping from Hankaku (halfwidth) to Zenkaku (fullwidth) forms.
// The table is indexed by (code point - 0xff60); each "// 0xffNN" marker
// below names the source code point of the first entry on the following row.
// Nigori and Maru are taken care of outside this basic mapping: the code
// that looks a character up here adds NIGORI_MODIFIER or MARU_MODIFIER to
// the result when the character is followed by the matching mark.
static const PRUnichar gBasicMapping[0x40] =
{
// 0xff60 (this first entry maps to itself; IS_HANKAKU below starts at 0xff61)
0xff60,0x3002,0x300c,0x300d,0x3001,0x30fb,0x30f2,0x30a1,
// 0xff68
0x30a3,0x30a5,0x30a7,0x30a9,0x30e3,0x30e5,0x30e7,0x30c3,
// 0xff70
0x30fc,0x30a2,0x30a4,0x30a6,0x30a8,0x30aa,0x30ab,0x30ad,
// 0xff78
0x30af,0x30b1,0x30b3,0x30b5,0x30b7,0x30b9,0x30bb,0x30bd,
// 0xff80
0x30bf,0x30c1,0x30c4,0x30c6,0x30c8,0x30ca,0x30cb,0x30cc,
// 0xff88
0x30cd,0x30ce,0x30cf,0x30d2,0x30d5,0x30d8,0x30db,0x30de,
// 0xff90
0x30df,0x30e0,0x30e1,0x30e2,0x30e4,0x30e6,0x30e8,0x30e9,
// 0xff98 (the last two entries are the standalone Nigori and Maru marks)
0x30ea,0x30eb,0x30ec,0x30ed,0x30ef,0x30f3,0x309b,0x309c
};
// Do we need to check for Nigori for the next unicode ?
// True when u is in 0xff76..0xff84 or 0xff8a..0xff8e, i.e. the characters
// whose mapped value may have NIGORI_MODIFIER added.
#define NEED_TO_CHECK_NIGORI(u) (((0xff76<=(u))&&((u)<=0xff84))||((0xff8a<=(u))&&((u)<=0xff8e)))
// Do we need to check for Maru for the next unicode ?
// True only when u is in 0xff8a..0xff8e.
#define NEED_TO_CHECK_MARU(u) ((0xff8a<=(u))&&((u)<=0xff8e))
// The unicode is in Katakana Hankaku block (0xff61..0xff9f inclusive), so
// (u - 0xff60) is always a valid index into gBasicMapping.
#define IS_HANKAKU(u) ((0xff61 <= (u)) && ((u) <= 0xff9f))
// The unicode is the halfwidth Nigori (voiced) mark.
#define IS_NIGORI(u) (0xff9e == (u))
// The unicode is the halfwidth Maru (semi-voiced) mark.
#define IS_MARU(u) (0xff9f == (u))
// Offsets added to the basic mapping of a character that is followed by a
// mark. Example from the table above: 0xff8a maps to 0x30cf; +1 gives 0x30d0
// (the Nigori form) and +2 gives 0x30d1 (the Maru form).
#define NIGORI_MODIFIER 1
#define MARU_MODIFIER 2
static const PRUint16 g_ufAsciiMapping [] = {
0x0001, 0x0004, 0x0005, 0x0008, 0x0000, 0x0000, 0x007F, 0x0000
};
@ -63,6 +98,7 @@ static const uScanClassID g_ufScanClassIDs[SIZE_OF_TABLES] = {
u2BytesCharset, // JIS X 0208- cp932 ext
u2BytesCharset, // JIS X 0208-1978 ISOREG 42
};
#define JIS_X_208_INDEX 2
//----------------------------------------------------------------------
// Class nsUnicodeToISO2022JP [implementation]
@ -135,6 +171,60 @@ nsresult nsUnicodeToISO2022JP::ChangeCharset(PRInt32 aCharset,
return NS_OK;
}
/**
 * Encode a leading run of Hankaku (halfwidth) characters as their Zenkaku
 * (fullwidth) equivalents using the JIS X 0208 table.
 *
 * The charset is first switched to JIS_X_208_INDEX via ChangeCharset(). Then
 * each character for which IS_HANKAKU() holds is looked up in gBasicMapping;
 * if it can take a Nigori/Maru mark and the next input character is that
 * mark, the two are merged into one output character. The run ends at the
 * first non-Hankaku character, at the end of the input, or on the first
 * non-NS_OK result.
 *
 * @param aSrc         [in]     input characters
 * @param aSrcLength   [in/out] in: number of characters available at aSrc;
 *                              out: number of characters consumed
 * @param aDest        [out]    output buffer
 * @param aDestLength  [in/out] in: size of aDest in bytes;
 *                              out: number of bytes written
 * @return NS_OK, or the first non-NS_OK result from ChangeCharset() or
 *         nsUnicodeEncodeHelper::ConvertByTable().
 *
 * Both length outputs are now set on every return path. The caller advances
 * its cursors by them unconditionally, so leaving them at their input values
 * after a failed ChangeCharset() would claim all input was consumed and all
 * output written.
 *
 * NOTE(review): a base character that is the last one in aSrc is emitted
 * without looking for a mark that might arrive in a later call - confirm
 * whether callers can split input at that point.
 */
nsresult nsUnicodeToISO2022JP::ConvertHankaku(const PRUnichar * aSrc,
                                              PRInt32 * aSrcLength,
                                              char * aDest,
                                              PRInt32 * aDestLength)
{
  nsresult res = NS_OK;
  const PRUnichar * src = aSrc;
  const PRUnichar * srcEnd = aSrc + *aSrcLength;
  char * dest = aDest;
  char * destEnd = aDest + *aDestLength;
  PRUnichar srcChar, tempChar;
  PRInt32 bcr, bcw;

  bcw = destEnd - dest;
  res = ChangeCharset(JIS_X_208_INDEX, dest, &bcw);
  dest += bcw;

  // If the charset switch failed the loop is skipped entirely and we fall
  // through to report zero characters consumed.
  while (res == NS_OK && src < srcEnd) {
    srcChar = *src;
    if (!IS_HANKAKU(srcChar)) {
      break;
    }
    // Remember where this character starts so it can be handed back to the
    // caller if it cannot be written.
    const PRUnichar * charStart = src;
    ++src;
    tempChar = gBasicMapping[(srcChar) - 0xff60];

    if (src < srcEnd) {
      // if the character could take a modifier, and the next char
      // is a modifier, modify it and eat one PRUnichar
      if (NEED_TO_CHECK_NIGORI(srcChar) && IS_NIGORI(*src)) {
        tempChar += NIGORI_MODIFIER;
        ++src;
      } else if (NEED_TO_CHECK_MARU(srcChar) && IS_MARU(*src)) {
        tempChar += MARU_MODIFIER;
        ++src;
      }
    }

    bcr = 1;
    bcw = destEnd - dest;
    res = nsUnicodeEncodeHelper::ConvertByTable(
             &tempChar, &bcr, dest, &bcw, g_ufScanClassIDs[JIS_X_208_INDEX],
             nsnull, (uMappingTable *) g_ufMappingTables[JIS_X_208_INDEX]);
    dest += bcw;

    // Assumes ConvertByTable reports in bcr how many input characters it
    // consumed (it is passed as an in/out length) - confirm against
    // nsUnicodeEncodeHelper. If it failed without consuming the character,
    // un-read the base character and any merged modifier so the input is
    // not reported as consumed when nothing was written for it.
    if (res != NS_OK && bcr == 0) {
      src = charStart;
    }
  }

  *aDestLength = dest - aDest;
  *aSrcLength = src - aSrc;
  return res;
}
//----------------------------------------------------------------------
// Subclassing of nsTableEncoderSupport class [implementation]
@ -170,8 +260,17 @@ NS_IMETHODIMP nsUnicodeToISO2022JP::ConvertNoBuffNoErr(
}
if ( i>= SIZE_OF_TABLES) {
res = NS_ERROR_UENC_NOMAPPING;
src++;
if (IS_HANKAKU(*src)) {
bcr = srcEnd - src;
bcw = destEnd - dest;
res = ConvertHankaku(src, &bcr, dest, &bcw);
dest += bcw;
src += bcr;
if (res == NS_OK) continue;
} else {
res = NS_ERROR_UENC_NOMAPPING;
src++;
}
}
if (res != NS_OK) break;

View File

@ -69,6 +69,8 @@ protected:
nsresult ChangeCharset(PRInt32 aCharset, char * aDest,
PRInt32 * aDestLength);
nsresult ConvertHankaku(const PRUnichar *aSrc, PRInt32 * aSrcLength,
char *aDest, PRInt32 * aDestLength);
//--------------------------------------------------------------------
// Subclassing of nsEncoderSupport class [declaration]

View File

@ -49,7 +49,6 @@ EXPORTS = \
nsITextTransform.h \
nsIUGenCategory.h \
nsUnicharUtilCIID.h \
nsHankakuToZenkakuCID.h \
$(NULL)
include $(topsrcdir)/config/rules.mk

View File

@ -1,53 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either of the GNU General Public License Version 2 or later (the "GPL"),
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsHankakuToZenkakuCID_h__
#define nsHankakuToZenkakuCID_h__
#include "nsITextTransform.h"
#include "nsISupports.h"
#include "nscore.h"
// {8F666A11-04A0-11d3-B3B9-00805F8A6670}
#define NS_HANKAKUTOZENKAKU_CID \
{ 0x8f666a11, 0x4a0, 0x11d3, \
{ 0xb3, 0xb9, 0x0, 0x80, 0x5f, 0x8a, 0x66, 0x70 } }
#define NS_HANKAKUTOZENKAKU_CONTRACTID NS_TEXTTRANSFORM_CONTRACTID_BASE "hankakutozenkaku"
#endif

View File

@ -51,7 +51,6 @@ LIBXUL_LIBRARY = 1
CPPSRCS = \
nsCaseConversionImp2.cpp \
nsCategoryImp.cpp \
nsHankakuToZenkaku.cpp \
nsEntityConverter.cpp \
nsSaveAsCharset.cpp \
nsUnicodeNormalizer.cpp \

View File

@ -1,189 +0,0 @@
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1999
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either of the GNU General Public License Version 2 or later (the "GPL"),
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "nsITextTransform.h"
#include "pratom.h"
#include "nsUUDll.h"
#include "nsTextTransformFactory.h"
// Basic mapping from Hankaku (halfwidth) to Zenkaku (fullwidth) forms,
// indexed by (code point - 0xff60). Each "// 0xffNN" marker names the source
// code point of the first entry on the following row.
// Nigori and Maru are taken care of outside this basic mapping.
static const PRUnichar gBasicMapping[0x40] =
{
  // 0xff60
  0xff60,0x3002,0x300c,0x300d,0x3001,0x30fb,0x30f2,0x30a1,
  // 0xff68
  0x30a3,0x30a5,0x30a7,0x30a9,0x30e3,0x30e5,0x30e7,0x30c3,
  // 0xff70
  0x30fc,0x30a2,0x30a4,0x30a6,0x30a8,0x30aa,0x30ab,0x30ad,
  // 0xff78
  0x30af,0x30b1,0x30b3,0x30b5,0x30b7,0x30b9,0x30bb,0x30bd,
  // 0xff80
  0x30bf,0x30c1,0x30c4,0x30c6,0x30c8,0x30ca,0x30cb,0x30cc,
  // 0xff88
  0x30cd,0x30ce,0x30cf,0x30d2,0x30d5,0x30d8,0x30db,0x30de,
  // 0xff90
  0x30df,0x30e0,0x30e1,0x30e2,0x30e4,0x30e6,0x30e8,0x30e9,
  // 0xff98
  0x30ea,0x30eb,0x30ec,0x30ed,0x30ef,0x30f3,0x309b,0x309c
};

// Do we need to check for Nigori for the next unicode ?
#define NEED_TO_CHECK_NIGORI(u) (((0xff76<=(u))&&((u)<=0xff84))||((0xff8a<=(u))&&((u)<=0xff8e)))

// Do we need to check for Maru for the next unicode ?
#define NEED_TO_CHECK_MARU(u) ((0xff8a<=(u))&&((u)<=0xff8e))

// The unicode is in Katakana Hankaku block (0xff60..0xff9f).
// The whole expansion is parenthesised so that the macro stays a single
// operand when used under '!' or next to '&&'; without the outer
// parentheses "!IS_HANKAKU(c)" would negate only the first comparison.
#define IS_HANKAKU(u) ((0xff60==((u) & 0xffe0)) || (0xff80==((u)&0xffe0)))
#define IS_NIGORI(u) (0xff9e == (u))
#define IS_MARU(u) (0xff9f == (u))

// Offsets added to the basic mapping when a character is followed by the
// Nigori or Maru mark.
#define NIGORI_MODIFIER 1
#define MARU_MODIFIER 2
// function prototype
void HankakuToZenkaku (
const PRUnichar* aSrc, PRInt32 aLen,
PRUnichar* aDest, PRInt32 aDestLen, PRInt32* oLen);
void HankakuToZenkaku (
const PRUnichar* aSrc, PRInt32 aLen,
PRUnichar* aDest, PRInt32 aDestLen, PRInt32* oLen)
{
// XXX aDestLen is never checked, assumed to be as long as aLen
NS_ASSERTION(aDestLen >= aLen, "aDest must be as long as aSrc");
PRInt32 i,j;
if ( aLen == 0) {
*oLen = 0;
return;
}
// loop from the first to the last char except the last one.
for(i = j = 0; i < (aLen-1); i++,j++,aSrc++, aDest++)
{
if(IS_HANKAKU(*aSrc)) {
// if it is in hankaku - do basic mapping first
*aDest = gBasicMapping[(*aSrc) - 0xff60];
// if is some char could be modifier, and the next char
// is a modifier, modify it and eat one byte
if(IS_NIGORI(*(aSrc+1)) && NEED_TO_CHECK_NIGORI(*aSrc))
{
*aDest += NIGORI_MODIFIER;
i++; aSrc++;
}
else if(IS_MARU(*(aSrc+1)) && NEED_TO_CHECK_MARU(*aSrc))
{
*aDest += MARU_MODIFIER;
i++; aSrc++;
}
}
else
{
// not in hankaku block, just copy
*aDest = *aSrc;
}
}
// handle the last character
if(IS_HANKAKU(*aSrc))
*aDest = gBasicMapping[(*aSrc) - 0xff60];
else
*aDest = *aSrc;
*oLen = j+1;
}
// nsITextTransform implementation whose Change() methods run their input
// through HankakuToZenkaku() and store the converted text in aResult.
class nsHankakuToZenkaku : public nsITextTransform {
NS_DECL_ISUPPORTS
public:
nsHankakuToZenkaku() ;
virtual ~nsHankakuToZenkaku() ;
// Convert aTextLength characters starting at aText into aResult.
NS_IMETHOD Change( const PRUnichar* aText, PRInt32 aTextLength, nsString& aResult);
// Convert the contents of aText into aResult.
NS_IMETHOD Change( nsString& aText, nsString& aResult);
};
NS_IMPL_ISUPPORTS1(nsHankakuToZenkaku, nsITextTransform)
// The class has no data members visible here, so construction and
// destruction have nothing to do.
nsHankakuToZenkaku::nsHankakuToZenkaku()
{
}
nsHankakuToZenkaku::~nsHankakuToZenkaku()
{
}
/**
 * Convert aTextLength characters starting at aText from Hankaku to Zenkaku
 * and store the converted text in aResult.
 *
 * aResult is first sized to aTextLength characters, which is the most the
 * conversion can produce, and afterwards trimmed to the length reported by
 * HankakuToZenkaku().
 *
 * @return NS_OK on success, NS_ERROR_OUT_OF_MEMORY if aResult could not be
 *         sized to aTextLength.
 */
NS_IMETHODIMP nsHankakuToZenkaku::Change( const PRUnichar* aText, PRInt32 aTextLength, nsString& aResult)
{
  if (EnsureStringLength(aResult, aTextLength)) {
    PRInt32 convertedLength;
    HankakuToZenkaku(aText, aTextLength,
                     aResult.BeginWriting(), aTextLength,
                     &convertedLength);
    aResult.SetLength(convertedLength);
    return NS_OK;
  }
  return NS_ERROR_OUT_OF_MEMORY;
}
/**
 * Convert the contents of aText from Hankaku to Zenkaku and store the
 * converted text in aResult.
 *
 * aText is copied into aResult and the copy is converted in place, then
 * trimmed to the length reported by HankakuToZenkaku().
 *
 * The previous version obtained the buffer with aResult.get(), which yields
 * a const pointer, and cast the const away to write through it. This version
 * gets its writable buffer the same way the pointer overload of Change()
 * does: EnsureStringLength() followed by BeginWriting().
 * NOTE(review): presumably get() may expose storage that is still shared
 * with aText after the assignment - confirm against the string class.
 *
 * @return NS_OK on success, NS_ERROR_OUT_OF_MEMORY if aResult could not be
 *         sized for writing.
 */
NS_IMETHODIMP nsHankakuToZenkaku::Change( nsString& aText, nsString& aResult)
{
  aResult = aText;

  const PRInt32 length = aResult.Length();
  if (!EnsureStringLength(aResult, length))
    return NS_ERROR_OUT_OF_MEMORY;

  // Source and destination are the same buffer: the output is never longer
  // than the input, so the conversion can run in place.
  PRUnichar* buffer = aResult.BeginWriting();
  PRInt32 convertedLength;
  HankakuToZenkaku(buffer, length, buffer, length, &convertedLength);
  aResult.SetLength(convertedLength);
  return NS_OK;
}
/**
 * Create a new nsHankakuToZenkaku and hand it back through oResult with one
 * reference added.
 *
 * @param oResult  receives the new object (or null if allocation failed)
 * @return NS_ERROR_NULL_POINTER if oResult is null,
 *         NS_ERROR_OUT_OF_MEMORY if the object could not be allocated,
 *         NS_OK otherwise.
 */
nsresult NS_NewHankakuToZenkaku(nsISupports** oResult)
{
  if (oResult) {
    *oResult = new nsHankakuToZenkaku();
    if (!*oResult) {
      return NS_ERROR_OUT_OF_MEMORY;
    }
    NS_ADDREF(*oResult);
    return NS_OK;
  }
  return NS_ERROR_NULL_POINTER;
}

View File

@ -1,46 +0,0 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is Mozilla Communicator client code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* Alternatively, the contents of this file may be used under the terms of
* either of the GNU General Public License Version 2 or later (the "GPL"),
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsTextTransformFactory_h__
#define nsTextTransformFactory_h__
#include "nsIFactory.h"
nsresult NS_NewHankakuToZenkaku(nsISupports** oResult);
#endif /* nsTextTransformFactory_h__ */

View File

@ -41,8 +41,6 @@
#include "nsUnicharUtilCIID.h"
#include "nsCaseConversionImp2.h"
#include "nsCategoryImp.h"
#include "nsHankakuToZenkakuCID.h"
#include "nsTextTransformFactory.h"
#include "nsICaseConversion.h"
#include "nsEntityConverter.h"
#include "nsSaveAsCharset.h"
@ -79,8 +77,6 @@ CreateNew##_name(nsISupports* aOuter, REFNSIID aIID, void **aResult) \
}
UNICHARUTIL_MAKE_CTOR(HankakuToZenkaku)
NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsCaseConversionImp2,
nsCaseConversionImp2::GetInstance)
NS_GENERIC_FACTORY_SINGLETON_CONSTRUCTOR(nsCategoryImp,