921 lines
22 KiB
C#
Raw Normal View History

/*
* CP932.cs - Japanese (Shift-JIS) code page.
*
* Copyright (c) 2002 Southern Storm Software, Pty Ltd
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
//
// Copyright (C) 2005-2006 Novell, Inc.
//
namespace I18N.CJK
{
using System;
using System.Text;
using I18N.Common;
#if DISABLE_UNSAFE
using MonoEncoder = I18N.Common.MonoSafeEncoder;
using MonoEncoding = I18N.Common.MonoSafeEncoding;
#endif
[Serializable]
public class CP932 : MonoEncoding
{
// Magic number used by Windows for the Shift-JIS code page.
private const int SHIFTJIS_CODE_PAGE = 932;
// Constructor.
public CP932() : base(SHIFTJIS_CODE_PAGE)
{
}
#if !DISABLE_UNSAFE
// Get the number of bytes needed to encode a character buffer.
public unsafe override int GetByteCountImpl (char* chars, int count)
{
int index = 0;
// Determine the length of the final output.
int length = 0;
int ch, value;
#if __PNET__
byte *cjkToJis = JISConvert.Convert.cjkToJis;
byte *extraToJis = JISConvert.Convert.extraToJis;
#else
byte[] cjkToJis = JISConvert.Convert.cjkToJis;
byte[] extraToJis = JISConvert.Convert.extraToJis;
#endif
while(count > 0)
{
ch = chars[index++];
--count;
++length;
if(ch < 0x0080)
{
// Character maps to itself.
continue;
}
else if(ch < 0x0100)
{
// Check for special Latin 1 characters that
// can be mapped to double-byte code points.
if(ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
ch == 0x00D7 || ch == 0x00F7)
{
++length;
}
}
else if(ch >= 0x0391 && ch <= 0x0451)
{
// Greek subset characters.
++length;
}
else if(ch >= 0x2010 && ch <= 0x9FA5)
{
// This range contains the bulk of the CJK set.
value = (ch - 0x2010) * 2;
value = ((int)(cjkToJis[value])) |
(((int)(cjkToJis[value + 1])) << 8);
if(value >= 0x0100)
{
++length;
}
}
else if(ch >= 0xE000 && ch <= 0xE757)
// PrivateUse
++length;
else if(ch >= 0xFF01 && ch <= 0xFFEF)
{
// This range contains extra characters,
// including half-width katakana.
value = (ch - 0xFF01) * 2;
value = ((int)(extraToJis[value])) |
(((int)(extraToJis[value + 1])) << 8);
if(value >= 0x0100)
{
++length;
}
}
}
// Return the length to the caller.
return length;
}
// Get the bytes that result from encoding a character buffer.
public unsafe override int GetBytesImpl (
char* chars, int charCount, byte* bytes, int byteCount)
{
int charIndex = 0;
int byteIndex = 0;
EncoderFallbackBuffer buffer = null;
// Convert the characters into their byte form.
int posn = byteIndex;
int end = charCount;
int byteLength = byteCount;
int ch, value;
#if __PNET__
byte *cjkToJis = JISConvert.Convert.cjkToJis;
byte *greekToJis = JISConvert.Convert.greekToJis;
byte *extraToJis = JISConvert.Convert.extraToJis;
#else
byte[] cjkToJis = JISConvert.Convert.cjkToJis;
byte[] greekToJis = JISConvert.Convert.greekToJis;
byte[] extraToJis = JISConvert.Convert.extraToJis;
#endif
for (int i = charIndex; i < end; i++, charCount--)
{
ch = chars[i];
if(posn >= byteLength)
{
throw new ArgumentException
(Strings.GetString("Arg_InsufficientSpace"),
"bytes");
}
if(ch < 0x0080)
{
// Character maps to itself.
bytes[posn++] = (byte)ch;
continue;
}
else if(ch < 0x0100)
{
// Check for special Latin 1 characters that
// can be mapped to double-byte code points.
if(ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
ch == 0x00D7 || ch == 0x00F7)
{
if((posn + 1) >= byteLength)
{
throw new ArgumentException
(Strings.GetString
("Arg_InsufficientSpace"), "bytes");
}
switch(ch)
{
case 0x00A2:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x91;
break;
case 0x00A3:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x92;
break;
case 0x00A7:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x98;
break;
case 0x00A8:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x4E;
break;
case 0x00AC:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0xCA;
break;
case 0x00B0:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x8B;
break;
case 0x00B1:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x7D;
break;
case 0x00B4:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x4C;
break;
case 0x00B6:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0xF7;
break;
case 0x00D7:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x7E;
break;
case 0x00F7:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x80;
break;
}
}
else if(ch == 0x00A5)
{
// Yen sign.
bytes[posn++] = (byte)0x5C;
}
else
{
HandleFallback (ref buffer,
chars, ref charIndex, ref charCount,
bytes, ref posn, ref byteCount, null);
}
continue;
}
else if(ch >= 0x0391 && ch <= 0x0451)
{
// Greek subset characters.
value = (ch - 0x0391) * 2;
value = ((int)(greekToJis[value])) |
(((int)(greekToJis[value + 1])) << 8);
}
else if(ch >= 0x2010 && ch <= 0x9FA5)
{
// This range contains the bulk of the CJK set.
value = (ch - 0x2010) * 2;
value = ((int)(cjkToJis[value])) |
(((int)(cjkToJis[value + 1])) << 8);
}
else if(ch >= 0xE000 && ch <= 0xE757)
{
// PrivateUse
int diff = ch - 0xE000;
value = ((int) (diff / 0xBC) << 8)
+ (diff % 0xBC)
+ 0xF040;
if (value % 0x100 >= 0x7F)
value++;
}
else if(ch >= 0xFF01 && ch <= 0xFF60)
{
value = (ch - 0xFF01) * 2;
value = ((int)(extraToJis[value])) |
(((int)(extraToJis[value + 1])) << 8);
}
else if(ch >= 0xFF60 && ch <= 0xFFA0)
{
value = ch - 0xFF60 + 0xA0;
}
else
{
// Invalid character.
value = 0;
}
if(value == 0)
{
HandleFallback (ref buffer,
chars, ref charIndex, ref charCount,
bytes, ref posn, ref byteCount, null);
}
else if(value < 0x0100)
{
bytes[posn++] = (byte)value;
}
else if((posn + 1) >= byteLength)
{
throw new ArgumentException
(Strings.GetString("Arg_InsufficientSpace"),
"bytes");
}
else if(value < 0x8000)
{
// JIS X 0208 character.
value -= 0x0100;
ch = (value / 0xBC);
value = (value % 0xBC) + 0x40;
if(value >= 0x7F)
{
++value;
}
if(ch < (0x9F - 0x80))
{
bytes[posn++] = (byte)(ch + 0x81);
}
else
{
bytes[posn++] = (byte)(ch - (0x9F - 0x80) + 0xE0);
}
bytes[posn++] = (byte)value;
}
else if (value >= 0xF040 && value <= 0xF9FC)
{
// PrivateUse
bytes[posn++] = (byte) (value / 0x100);
bytes[posn++] = (byte) (value % 0x100);
}
else
{
// JIS X 0212 character, which Shift-JIS doesn't
// support, but we've already allocated two slots.
bytes[posn++] = (byte)'?';
bytes[posn++] = (byte)'?';
}
}
// Return the final length to the caller.
return posn - byteIndex;
}
#else
// Get the number of bytes needed to encode a character buffer.
public override int GetByteCount(char[] chars, int index, int count)
{
// Determine the length of the final output.
int length = 0;
int ch, value;
byte[] cjkToJis = JISConvert.Convert.cjkToJis;
byte[] extraToJis = JISConvert.Convert.extraToJis;
while (count > 0)
{
ch = chars[index++];
--count;
++length;
if (ch < 0x0080)
{
// Character maps to itself.
continue;
}
else if (ch < 0x0100)
{
// Check for special Latin 1 characters that
// can be mapped to double-byte code points.
if (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
ch == 0x00D7 || ch == 0x00F7)
{
++length;
}
}
else if (ch >= 0x0391 && ch <= 0x0451)
{
// Greek subset characters.
++length;
}
else if (ch >= 0x2010 && ch <= 0x9FA5)
{
// This range contains the bulk of the CJK set.
value = (ch - 0x2010) * 2;
value = ((int)(cjkToJis[value])) |
(((int)(cjkToJis[value + 1])) << 8);
if (value >= 0x0100)
{
++length;
}
}
else if (ch >= 0xE000 && ch <= 0xE757)
// PrivateUse
++length;
else if (ch >= 0xFF01 && ch <= 0xFFEF)
{
// This range contains extra characters,
// including half-width katakana.
value = (ch - 0xFF01) * 2;
value = ((int)(extraToJis[value])) |
(((int)(extraToJis[value + 1])) << 8);
if (value >= 0x0100)
{
++length;
}
}
}
// Return the length to the caller.
return length;
}
// Get the bytes that result from encoding a character buffer.
public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
{
int byteCount = bytes.Length;
EncoderFallbackBuffer buffer = null;
// Convert the characters into their byte form.
int posn = byteIndex;
int end = charIndex + charCount;
int byteLength = byteCount;
int /*ch,*/ value;
byte[] cjkToJis = JISConvert.Convert.cjkToJis;
byte[] greekToJis = JISConvert.Convert.greekToJis;
byte[] extraToJis = JISConvert.Convert.extraToJis;
for (int i = charIndex; i < end; i++, charCount--)
{
int ch = chars[i];
if (posn >= byteLength)
{
throw new ArgumentException
(Strings.GetString("Arg_InsufficientSpace"),
"bytes");
}
if (ch < 0x0080)
{
// Character maps to itself.
bytes[posn++] = (byte)ch;
continue;
}
else if (ch < 0x0100)
{
// Check for special Latin 1 characters that
// can be mapped to double-byte code points.
if (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
ch == 0x00D7 || ch == 0x00F7)
{
if ((posn + 1) >= byteLength)
{
throw new ArgumentException
(Strings.GetString
("Arg_InsufficientSpace"), "bytes");
}
switch (ch)
{
case 0x00A2:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x91;
break;
case 0x00A3:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x92;
break;
case 0x00A7:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x98;
break;
case 0x00A8:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x4E;
break;
case 0x00AC:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0xCA;
break;
case 0x00B0:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x8B;
break;
case 0x00B1:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x7D;
break;
case 0x00B4:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x4C;
break;
case 0x00B6:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0xF7;
break;
case 0x00D7:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x7E;
break;
case 0x00F7:
bytes[posn++] = (byte)0x81;
bytes[posn++] = (byte)0x80;
break;
}
}
else if (ch == 0x00A5)
{
// Yen sign.
bytes[posn++] = (byte)0x5C;
}
else
{
HandleFallback (ref buffer, chars, ref i, ref charCount, bytes,
ref byteIndex, ref byteCount, null);
}
continue;
}
else if (ch >= 0x0391 && ch <= 0x0451)
{
// Greek subset characters.
value = (ch - 0x0391) * 2;
value = ((int)(greekToJis[value])) |
(((int)(greekToJis[value + 1])) << 8);
}
else if (ch >= 0x2010 && ch <= 0x9FA5)
{
// This range contains the bulk of the CJK set.
value = (ch - 0x2010) * 2;
value = ((int)(cjkToJis[value])) |
(((int)(cjkToJis[value + 1])) << 8);
}
else if (ch >= 0xE000 && ch <= 0xE757)
{
// PrivateUse
int diff = ch - 0xE000;
value = ((int)(diff / 0xBC) << 8)
+ (diff % 0xBC)
+ 0xF040;
if (value % 0x100 >= 0x7F)
value++;
}
else if (ch >= 0xFF01 && ch <= 0xFF60)
{
value = (ch - 0xFF01) * 2;
value = ((int)(extraToJis[value])) |
(((int)(extraToJis[value + 1])) << 8);
}
else if (ch >= 0xFF60 && ch <= 0xFFA0)
{
value = ch - 0xFF60 + 0xA0;
}
else
{
// Invalid character.
value = 0;
}
if (value == 0)
{
HandleFallback (ref buffer, chars, ref charIndex, ref charCount,
bytes, ref posn, ref byteCount, null);
}
else if (value < 0x0100)
{
bytes[posn++] = (byte)value;
}
else if ((posn + 1) >= byteLength)
{
throw new ArgumentException
(Strings.GetString("Arg_InsufficientSpace"),
"bytes");
}
else if (value < 0x8000)
{
// JIS X 0208 character.
value -= 0x0100;
ch = (value / 0xBC);
value = (value % 0xBC) + 0x40;
if (value >= 0x7F)
{
++value;
}
if (ch < (0x9F - 0x80))
{
bytes[posn++] = (byte)(ch + 0x81);
}
else
{
bytes[posn++] = (byte)(ch - (0x9F - 0x80) + 0xE0);
}
bytes[posn++] = (byte)value;
}
else if (value >= 0xF040 && value <= 0xF9FC)
{
// PrivateUse
bytes[posn++] = (byte)(value / 0x100);
bytes[posn++] = (byte)(value % 0x100);
}
else
{
// JIS X 0212 character, which Shift-JIS doesn't
// support, but we've already allocated two slots.
bytes[posn++] = (byte)'?';
bytes[posn++] = (byte)'?';
}
}
// Return the final length to the caller.
return posn - byteIndex;
}
#endif
public override int GetCharCount (byte [] bytes, int index, int count)
{
return new CP932Decoder (JISConvert.Convert).GetCharCount (
bytes, index, count, true);
}
public override int GetChars (
byte [] bytes, int byteIndex, int byteCount,
char [] chars, int charIndex)
{
return new CP932Decoder (JISConvert.Convert).GetChars (bytes,
byteIndex, byteCount, chars, charIndex,
true);
}
// Get the maximum number of bytes needed to encode a
// specified number of characters.
public override int GetMaxByteCount(int charCount)
{
if(charCount < 0)
{
throw new ArgumentOutOfRangeException
("charCount",
Strings.GetString("ArgRange_NonNegative"));
}
return charCount * 2;
}
// Get the maximum number of characters needed to decode a
// specified number of bytes.
public override int GetMaxCharCount(int byteCount)
{
if(byteCount < 0)
{
throw new ArgumentOutOfRangeException
("byteCount",
Strings.GetString("ArgRange_NonNegative"));
}
return byteCount;
}
// Get a decoder that handles a rolling Shift-JIS state.
public override Decoder GetDecoder()
{
return new CP932Decoder(JISConvert.Convert);
}
#if !ECMA_COMPAT
// Get the mail body name for this encoding.
public override String BodyName {
get { return "iso-2022-jp"; }
}
// Get the human-readable name for this encoding.
public override String EncodingName {
get { return "Japanese (Shift-JIS)"; }
}
// Get the mail agent header name for this encoding.
public override String HeaderName {
get { return "iso-2022-jp"; }
}
// Determine if this encoding can be displayed in a Web browser.
public override bool IsBrowserDisplay {
get { return true; }
}
// Determine if this encoding can be saved from a Web browser.
public override bool IsBrowserSave {
get { return true; }
}
// Determine if this encoding can be displayed in a mail/news agent.
public override bool IsMailNewsDisplay {
get { return true; }
}
// Determine if this encoding can be saved from a mail/news agent.
public override bool IsMailNewsSave {
get { return true; }
}
// Get the IANA-preferred Web name for this encoding.
public override String WebName {
get { return "shift_jis"; }
}
// Get the Windows code page represented by this object.
public override int WindowsCodePage {
get { return SHIFTJIS_CODE_PAGE; }
}
// FIXME: This doesn't make sense, but without declaring this override
// System.XML regresses at Encoder.Convert() in
// MonoTests.System.Xml.XmlWriterSettingsTests.EncodingTest.
public override Encoder GetEncoder ()
{
return new MonoEncodingDefaultEncoder (this);
}
}; // class CP932
#endif // !ECMA_COMPAT
// Decoder that handles a rolling Shift-JIS state.
sealed class CP932Decoder : DbcsEncoding.DbcsDecoder
{
private new JISConvert convert;
private int last_byte_count;
private int last_byte_chars;
// Constructor.
public CP932Decoder(JISConvert convert)
: base (null)
{
this.convert = convert;
}
// Override inherited methods.
public override int GetCharCount (
byte [] bytes, int index, int count)
{
return GetCharCount (bytes, index, count, false);
}
public
override
int GetCharCount (byte [] bytes, int index, int count, bool refresh)
{
CheckRange (bytes, index, count);
// Determine the total length of the converted string.
int length = 0;
int byteval;
int last = last_byte_count;
while(count > 0)
{
byteval = bytes[index++];
--count;
if(last == 0)
{
if((byteval >= 0x81 && byteval <= 0x9F) ||
(byteval >= 0xE0 && byteval <= 0xEF))
{
// First byte in a double-byte sequence.
last = byteval;
}
++length;
}
else
{
// Second byte in a double-byte sequence.
last = 0;
}
}
if (refresh) {
if (last != 0)
length++;
last_byte_count = '\0';
}
else
last_byte_count = last;
// Return the total length.
return length;
}
public override int GetChars (
byte [] bytes, int byteIndex, int byteCount,
char [] chars, int charIndex)
{
return GetChars (bytes, byteIndex, byteCount,
chars, charIndex, false);
}
public
override
int GetChars (
byte [] bytes, int byteIndex, int byteCount,
char [] chars, int charIndex, bool refresh)
{
CheckRange (bytes, byteIndex, byteCount,
chars, charIndex);
// Decode the bytes in the buffer.
int posn = charIndex;
int charLength = chars.Length;
int byteval, value;
int last = last_byte_chars;
#if __PNET__
byte *table = convert.jisx0208ToUnicode;
#else
byte[] table = convert.jisx0208ToUnicode;
#endif
while(byteCount > 0)
{
byteval = bytes[byteIndex++];
--byteCount;
if(last == 0)
{
if(posn >= charLength)
{
throw new ArgumentException
(Strings.GetString
("Arg_InsufficientSpace"), "chars");
}
if((byteval >= 0x81 && byteval <= 0x9F) ||
(byteval >= 0xE0 && byteval <= 0xEF))
{
// First byte in a double-byte sequence.
last = byteval;
}
else if(byteval < 0x80)
{
// Ordinary ASCII/Latin1 character.
chars[posn++] = (char)byteval;
}
else if(byteval >= 0xA1 && byteval <= 0xDF)
{
// Half-width katakana character.
chars[posn++] = (char)(byteval - 0xA1 + 0xFF61);
}
else
{
// Invalid first byte.
chars[posn++] = '?';
}
}
else
{
// Second byte in a double-byte sequence.
if(last >= 0x81 && last <= 0x9F)
{
value = (last - 0x81) * 0xBC;
}
else if (last >= 0xF0 && last <= 0xFC && byteval <= 0xFC)
{
// PrivateUse
value = 0xE000 + (last - 0xF0) * 0xBC + byteval;
if (byteval > 0x7F)
value--;
}
else
{
value = (last - 0xE0 + (0xA0 - 0x81)) * 0xBC;
}
last = 0;
if(byteval >= 0x40 && byteval <= 0x7E)
{
value += (byteval - 0x40);
}
else if(byteval >= 0x80 && byteval <= 0xFC)
{
value += (byteval - 0x80 + 0x3F);
}
else
{
// Invalid second byte.
chars[posn++] = '?';
continue;
}
value *= 2;
value = ((int)(table[value])) |
(((int)(table[value + 1])) << 8);
if(value != 0)
{
chars[posn++] = (char)value;
}
else
{
chars[posn++] = '?';
}
}
}
if (refresh) {
if (last != 0)
chars[posn++] = '\u30FB';
last_byte_chars = '\0';
}
else
last_byte_chars = last;
// Return the final length to the caller.
return posn - charIndex;
}
} // class CP932Decoder
[Serializable]
public class ENCshift_jis : CP932
{
public ENCshift_jis() : base() {}
}; // class ENCshift_jis
}; // namespace I18N.CJK