Jo Shields a575963da9 Imported Upstream version 3.6.0
Former-commit-id: da6be194a6b1221998fc28233f2503bd61dd9d14
2014-08-13 10:39:27 +01:00

372 lines
10 KiB
C#

//
// GB18030Encoding.cs
//
// Author:
// Atsushi Enomoto <atsushi@ximian.com>
//
using System;
using System.Globalization;
using System.Reflection;
using System.Text;
using System.Runtime.InteropServices;
#if BUILD_GENERATOR
using System.IO;
using System.Xml;
#endif
namespace I18N.CJK
{
internal unsafe class GB18030Source
{
class GB18030Map
{
public readonly int UStart;
public readonly int UEnd;
public readonly long GStart;
public readonly long GEnd;
public readonly bool Dummy; // This range is actually not usable.
public GB18030Map (
int ustart, int uend, long gstart, long gend, bool dummy)
{
this.UStart = ustart;
this.UEnd = uend;
this.GStart = gstart;
this.GEnd = gend;
this.Dummy = dummy;
}
}
private GB18030Source ()
{
}
static readonly byte *gbx2uni;
static readonly byte *uni2gbx;
static readonly int gbx2uniSize, uni2gbxSize;
static GB18030Source ()
{
MethodInfo mi = typeof (Assembly).GetMethod (
"GetManifestResourceInternal",
BindingFlags.NonPublic | BindingFlags.Instance);
int size = 0;
Module mod = null;
IntPtr ret = IntPtr.Zero;
if (mi != null)
{
ret = (IntPtr)mi.Invoke(
Assembly.GetExecutingAssembly(),
new object[] { "gb18030.table", size, mod });
}
else
{
// DotNet's way ;)
using (var ms = Assembly.GetExecutingAssembly()
.GetManifestResourceStream("gb18030.table"))
{
var len = (int)ms.Length;
byte* buf = (byte*)Marshal.AllocHGlobal(sizeof(byte) * len);
for (int i = 0; i < len; i++)
buf[i] = (byte)ms.ReadByte();
ret = (IntPtr)buf;
}
}
if (ret != IntPtr.Zero) {
gbx2uni = (byte*) ((void*) ret);
gbx2uniSize =
(gbx2uni [0] << 24) + (gbx2uni [1] << 16) +
(gbx2uni [2] << 8) + (gbx2uni [3]);
gbx2uni += 4;
uni2gbx = gbx2uni + gbx2uniSize;
uni2gbxSize =
(uni2gbx [0] << 24) + (uni2gbx [1] << 16) +
(uni2gbx [2] << 8) + (uni2gbx [3]);
uni2gbx += 4;
}
}
static readonly long gbxBase =
FromGBXRaw (0x81, 0x30, 0x81, 0x30, false);
static readonly long gbxSuppBase =
FromGBXRaw (0x90, 0x30, 0x81, 0x30, false);
// See http://icu.sourceforge.net/docs/papers/gb18030.html
// and referenced XML mapping table.
static readonly GB18030Map [] ranges = new GB18030Map [] {
// rawmap: 0x0080-0x0451
new GB18030Map (0x0452, 0x200F, FromGBXRaw (0x81, 0x30, 0xD3, 0x30, false), FromGBXRaw (0x81, 0x36, 0xA5, 0x31, false), false),
// rawmap: 0x2010-0x2642
new GB18030Map (0x2643, 0x2E80, FromGBXRaw (0x81, 0x37, 0xA8, 0x39, false), FromGBXRaw (0x81, 0x38, 0xFD, 0x38, false), false),
// rawmap: 0x2E81-0x361A
new GB18030Map (0x361B, 0x3917, FromGBXRaw (0x82, 0x30, 0xA6, 0x33, false), FromGBXRaw (0x82, 0x30, 0xF2, 0x37, false), false),
// rawmap: 0x3918-0x3CE0
new GB18030Map (0x3CE1, 0x4055, FromGBXRaw (0x82, 0x31, 0xD4, 0x38, false), FromGBXRaw (0x82, 0x32, 0xAF, 0x32, false), false),
// rawmap: 0x4056-0x415F
new GB18030Map (0x4160, 0x4336, FromGBXRaw (0x82, 0x32, 0xC9, 0x37, false), FromGBXRaw (0x82, 0x32, 0xF8, 0x37, false), false),
// rawmap: 4337-0x44D6
new GB18030Map (0x44D7, 0x464B, FromGBXRaw (0x82, 0x33, 0xA3, 0x39, false), FromGBXRaw (0x82, 0x33, 0xC9, 0x31, false), false),
// rawmap: 0x464C-0x478D
new GB18030Map (0x478E, 0x4946, FromGBXRaw (0x82, 0x33, 0xE8, 0x38, false), FromGBXRaw (0x82, 0x34, 0x96, 0x38, false), false),
// rawmap: 0x4947-0x49B7
new GB18030Map (0x49B8, 0x4C76, FromGBXRaw (0x82, 0x34, 0xA1, 0x31, false), FromGBXRaw (0x82, 0x34, 0xE7, 0x33, false), false),
// rawmap: 0x4C77-0x4DFF
// 4E00-9FA5 are all mapped in GB2312
new GB18030Map (0x4E00, 0x9FA5, 0, 0, true),
new GB18030Map (0x9FA6, 0xD7FF, FromGBXRaw (0x82, 0x35, 0x8F, 0x33, false), FromGBXRaw (0x83, 0x36, 0xC7, 0x38, false), false),
// D800-DFFF are ignored (surrogate)
// E000-E76B are all mapped in GB2312.
new GB18030Map (0xD800, 0xE76B, 0, 0, true),
// rawmap: 0xE76C-E884
new GB18030Map (0xE865, 0xF92B, FromGBXRaw (0x83, 0x36, 0xD0, 0x30, false), FromGBXRaw (0x84, 0x30, 0x85, 0x34, false), false),
// rawmap: 0xF92C-FA29
new GB18030Map (0xFA2A, 0xFE2F, FromGBXRaw (0x84, 0x30, 0x9C, 0x38, false), FromGBXRaw (0x84, 0x31, 0x85, 0x37, false), false),
// rawmap: 0xFE30-FFE5
new GB18030Map (0xFFE6, 0xFFFF, FromGBXRaw (0x84, 0x31, 0xA2, 0x34, false), FromGBXRaw (0x84, 0x31, 0xA4, 0x39, false), false),
};
public static void Unlinear (byte [] bytes, int start, long gbx)
{
fixed (byte* bptr = bytes) {
Unlinear (bptr + start, gbx);
}
}
public static unsafe void Unlinear (byte* bytes, long gbx)
{
bytes [3] = (byte) (gbx % 10 + 0x30);
gbx /= 10;
bytes [2] = (byte) (gbx % 126 + 0x81);
gbx /= 126;
bytes [1] = (byte) (gbx % 10 + 0x30);
gbx /= 10;
bytes [0] = (byte) (gbx + 0x81);
}
// negative (invalid) or positive (valid)
public static long FromGBX (byte [] bytes, int start)
{
byte b1 = bytes [start];
byte b2 = bytes [start + 1];
byte b3 = bytes [start + 2];
byte b4 = bytes [start + 3];
if (b1 < 0x81 || b1 == 0xFF)
return -1;
if (b2 < 0x30 || b2 > 0x39)
return -2;
if (b3 < 0x81 || b3 == 0xFF)
return -3;
if (b4 < 0x30 || b4 > 0x39)
return -4;
if (b1 >= 0x90)
return FromGBXRaw (b1, b2, b3, b4, true);
long linear = FromGBXRaw (b1, b2, b3, b4, false);
long rawOffset = 0;
long startIgnore = 0;
for (int i = 0; i < ranges.Length; i++) {
GB18030Map m = ranges [i];
if (linear < m.GStart)
return ToUcsRaw ((int) (linear
- startIgnore + rawOffset));
if (linear <= m.GEnd)
return linear - gbxBase - m.GStart
+ m.UStart;
if (m.GStart != 0) {
rawOffset += m.GStart - startIgnore;
startIgnore = m.GEnd + 1;
}
}
// return ToUcsRaw ((int) (linear - gbxBase));
throw new SystemException (String.Format ("GB18030 INTERNAL ERROR (should not happen): GBX {0:x02} {1:x02} {2:x02} {3:x02}", b1, b2, b3, b4));
}
public static long FromUCSSurrogate (int cp)
{
return cp + gbxSuppBase;
}
public static long FromUCS (int cp)
{
long rawOffset = 0;
long startIgnore = 0x80;
for (int i = 0; i < ranges.Length; i++) {
GB18030Map m = ranges [i];
if (cp < m.UStart)
return ToGbxRaw ((int) (cp
- startIgnore + rawOffset));
if (cp <= m.UEnd)
return cp - m.UStart + m.GStart;
if (m.GStart != 0) {
rawOffset += m.UStart - startIgnore;
startIgnore = m.UEnd + 1;
}
}
throw new SystemException (String.Format ("GB18030 INTERNAL ERROR (should not happen): UCS {0:x06}", cp));
}
static long FromGBXRaw (
byte b1, byte b2, byte b3, byte b4, bool supp)
{
// 126 = 0xFE - 0x80
return (((b1 - (supp ? 0x90 : 0x81)) * 10 +
(b2 - 0x30)) * 126 +
(b3 - 0x81)) * 10 +
b4 - 0x30 + (supp ? 0x10000 : 0);
}
static int ToUcsRaw (int idx)
{
return gbx2uni [idx * 2] * 0x100 +
gbx2uni [idx * 2 + 1];
}
static long ToGbxRaw (int idx)
{
if (idx < 0 || idx * 2 + 1 >= uni2gbxSize)
return -1;
return gbxBase + uni2gbx [idx * 2] * 0x100 + uni2gbx [idx * 2 + 1];
}
#if BUILD_GENERATOR
public static void Main ()
{
new GB18030Source ().Run ();
}
byte [] uni2gbxMap;
byte [] gbx2uniMap;
void Run ()
{
int ustart = 0x80;
long gstart = 0;
int ucount = 0;
long gcount = 0;
bool skip = false;
for (int i = 0; i < ranges.Length; i++) {
GB18030Map m = ranges [i];
if (!skip) {
//Console.WriteLine ("---- adding {0:X04} umap. {1:X04} gmap, skip range between {2:X04} and {3:X04}", m.UStart - ustart, m.GStart != 0 ? m.GStart - gstart : 0, m.UStart, m.UEnd);
ucount += m.UStart - ustart;
}
if (m.GStart != 0)
gcount += m.GStart - gstart;
skip = m.GStart == 0;
ustart = m.UEnd + 1;
if (m.GStart != 0)
gstart = m.GEnd + 1;
}
Console.Error.WriteLine ("Total UCS codepoints: {0} ({1:X04})", ucount, ucount);
Console.Error.WriteLine ("Total GBX codepoints: {0} ({1:X04})", gcount, gcount);
uni2gbxMap = new byte [ucount * 2];
gbx2uniMap = new byte [gcount * 2];
XmlDocument doc = new XmlDocument ();
doc.XmlResolver = null;
doc.Load ("gb-18030-2000.xml");
foreach (XmlElement e in doc.SelectNodes (
"/characterMapping/assignments/a"))
AddMap (e);
using (FileStream fs = File.Create ("gb18030.table")) {
byte [] size = new byte [4];
for (int i = 0, len = gbx2uniMap.Length;
i < 4; i++, len >>= 8)
size [3 - i] = (byte) (len % 0x100);
fs.Write (size, 0, 4);
fs.Write (gbx2uniMap, 0, gbx2uniMap.Length);
fs.Write (uni2gbxMap, 0, uni2gbxMap.Length);
}
Console.WriteLine ("done.");
}
void AddMap (XmlElement e)
{
int u = int.Parse (e.GetAttribute ("u"),
NumberStyles.HexNumber);
byte [] b = new byte [4];
int idx = 0;
foreach (string s in e.GetAttribute ("b").Split (' '))
b [idx++] =
byte.Parse (s, NumberStyles.HexNumber);
if (idx != 4)
return;
AddMap (u, b);
}
void AddMap (int u, byte [] b)
{
int gbx = (int) (FromGBXRaw (
b [0], b [1], b [2], b [3], false) - gbxBase);
if (u > 0x10000 || gbx > 0x10000)
throw new Exception (String.Format (
"should not happen: {0:X04} {1:X04}",
u, gbx));
int uidx = IndexForUcs (u);
//Console.WriteLine ("U: {0:x04} for {1:x04} [{2:x02} {3:x02}]", uidx, u, (byte) (gbx / 0x100), (byte) (gbx % 0x100));
uni2gbxMap [uidx * 2] = (byte) (gbx / 0x100);
uni2gbxMap [uidx * 2 + 1] = (byte) (gbx % 0x100);
int gidx = IndexForGbx (gbx);
//Console.WriteLine ("G: {0:x04} for {1:x04} ({2:x02} {3:x02} {4:x02} {5:x02})", gidx, gbx, b [0], b [1], b [2], b [3]);
gbx2uniMap [gidx * 2] = (byte) (u / 0x100);
gbx2uniMap [gidx * 2 + 1] = (byte) (u % 0x100);
}
static int IndexForUcs (int ucs)
{
int start = 0x80;
int count = 0;
bool skip = false;
for (int i = 0; i < ranges.Length; i++) {
GB18030Map m = ranges [i];
if (!skip) {
if (ucs < m.UStart)
return count + ucs - start;
count += m.UStart - start;
}
skip = m.GStart == 0;
start = m.UEnd + 1;
}
return -1;
}
static int IndexForGbx (int gbx)
{
long start = 0;
long count = 0;
for (int i = 0; i < ranges.Length; i++) {
GB18030Map m = ranges [i];
if (m.GStart == 0)
continue;
if (gbx < m.GStart)
return (int) (count + gbx - start);
count += m.GStart - start;
start = m.GEnd + 1;
}
return -1;
}
#endif
}
}