Xamarin Public Jenkins 6992685b86 Imported Upstream version 4.2.0.179
Former-commit-id: 0a113cb3a6feb7873f632839b1307cc6033cd595
2015-11-10 14:54:39 +00:00

575 lines
14 KiB
C#

using System;
using System.Globalization;
using System.Text;
using System.Runtime.CompilerServices;
using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
namespace System.Text
{
// Result of the per-character quick check performed by Normalization.QuickCheck.
internal enum NormalizationCheck {
// The character is acceptable as-is in the requested form.
Yes,
// The character can never appear in the requested form.
No,
// The character may or may not be acceptable depending on its context;
// QuickCheck only produces this for the composing forms (NFC / NFKC).
Maybe
}
internal unsafe class Normalization
{
// Bit flags that are tested against the value returned by PropValue().
// The No*/Maybe* flags drive QuickCheck for the corresponding form.
public const int NoNfd = 1;
public const int NoNfkd = 2;
public const int NoNfc = 4;
public const int MaybeNfc = 8;
public const int NoNfkc = 16;
public const int MaybeNfkc = 32;
// Tested in TryCompose: a composite carrying this flag is never produced.
public const int FullCompositionExclusion = 64;
// Only referenced from the commented-out CanBePrimaryComposite helper.
public const int IsUnsafe = 128;
// public const int ExpandOnNfd = 256;
// public const int ExpandOnNfc = 512;
// public const int ExpandOnNfkd = 1024;
// public const int ExpandOnNfkc = 2048;
// Looks up the normalization property flags (NoNfd, NoNfc, ...) for a code point.
static uint PropValue (int cp)
{
    int slot = NUtil.PropIdx (cp);
    return props [slot];
}
// Returns the offset into mappedChars where the decomposition of cp starts.
// GetCanonical treats an offset of 0 as "no decomposition".
static int CharMapIdx (int cp)
{
    int slot = NUtil.MapIdx (cp);
    return charMapIndex [slot];
}
// Returns the combining class stored for code point c (0 is treated as a starter
// by the callers in this class).
static byte GetCombiningClass (int c)
{
    int slot = NUtil.Combining.ToIndex (c);
    return combiningClass [slot];
}
// Maps an offset into mappedChars back to the composite character whose
// decomposition is stored at that offset.
static int GetPrimaryCompositeFromMapIndex (int src)
{
    int slot = NUtil.Composite.ToIndex (src);
    return mapIdxToComposite [slot];
}
// Returns the offset into mappedChars of the first decomposition that begins
// with cp; callers treat 0 as "cp starts no decomposition".
static int GetPrimaryCompositeHelperIndex (int cp)
{
    int slot = NUtil.Helper.ToIndex (cp);
    return helperIndex [slot];
}
// Produces the composed form (NFC, or NFKC when checkType is 2) of source.
// Returns the original string instance when nothing had to change.
private static string Compose (string source, int checkType)
{
    // Decompose to NFD or NFKD depending on our target.
    int decomposeType = (checkType == 2) ? 3 : 1;

    StringBuilder sb = null;
    Decompose (source, ref sb, decomposeType);

    if (sb != null) {
        // Decomposition already produced a working buffer; compose in place.
        Combine (sb, 0, checkType);
        return sb.ToString ();
    }

    // Nothing was decomposed: only allocate if composition is actually needed.
    sb = Combine (source, 0, checkType);
    if (sb == null)
        return source;
    return sb.ToString ();
}
// Composes source if it contains any character whose quick check is not Yes.
// Returns the composed buffer, or null when every character passes the quick
// check and no buffer was needed. The start parameter is not consulted: the
// scan always begins at index 0.
private static StringBuilder Combine (string source, int start, int checkType)
{
    int first = 0;
    while (first < source.Length &&
           QuickCheck (source [first], checkType) == NormalizationCheck.Yes)
        first++;

    if (first == source.Length)
        return null;

    // Leave a little headroom over the input length.
    StringBuilder sb = new StringBuilder (source.Length + source.Length / 10);
    sb.Append (source);
    Combine (sb, first, checkType);
    return sb;
}
/*
private static bool CanBePrimaryComposite (int i)
{
if (i >= 0x3400 && i <= 0x9FBB)
return GetPrimaryCompositeHelperIndex (i) != 0;
return (PropValue (i) & IsUnsafe) != 0;
}
*/
// Composes the contents of sb in place, starting at index i.
private static void Combine (StringBuilder sb, int i, int checkType)
{
    // Back off one character as we may be looking at a V or T jamo.
    int hangulStart = i;
    if (i > 0)
        hangulStart = i - 1;
    CombineHangul (sb, null, hangulStart);

    int pos = i;
    while (pos < sb.Length) {
        bool passes = QuickCheck (sb [pos], checkType) == NormalizationCheck.Yes;
        // Characters that fail the quick check are offered to the preceding
        // starter; the helper returns the index at which to resume.
        pos = passes ? pos + 1 : TryComposeWithPreviousStarter (sb, null, pos);
    }
}
// Composes conjoining Hangul jamo (L+V and LV+T) into precomposed syllables,
// scanning from index 'current' to the end of the text.
//
// Exactly one of sb / s is expected to be used: when sb is non-null the text
// is rewritten in place; when sb is null the text is read from s and nothing
// is modified ("check only" mode).
//
// Returns -1 in check-only mode as soon as a composable pair is found,
// otherwise the length of the text after all compositions were applied.
private static int CombineHangul (StringBuilder sb, string s, int current)
{
int length = sb != null ? sb.Length : s.Length;
// 'last' is the character preceding position i (possibly already a composed syllable).
int last = Fetch (sb, s, current);
for (int i = current + 1; i < length; ++i) {
int ch = Fetch (sb, s, i);
// 1. check to see if two current characters are L and V
int LIndex = last - HangulLBase;
if (0 <= LIndex && LIndex < HangulLCount) {
int VIndex = ch - HangulVBase;
if (0 <= VIndex && VIndex < HangulVCount) {
if (sb == null)
return -1;
// make syllable of form LV
last = HangulSBase + (LIndex * HangulVCount + VIndex) * HangulTCount;
sb [i - 1] = (char) last; // reset last
sb.Remove (i, 1);
// One character was removed: step back so the ++i of the loop
// re-reads the character that slid into position i.
i--; length--;
continue; // discard ch
}
}
// 2. check to see if two current characters are LV and T
int SIndex = last - HangulSBase;
// SIndex % HangulTCount == 0 means the syllable has no trailing consonant yet.
if (0 <= SIndex && SIndex < HangulSCount && (SIndex % HangulTCount) == 0) {
int TIndex = ch - HangulTBase;
// TIndex 0 is excluded: HangulTBase itself is not a trailing consonant.
if (0 < TIndex && TIndex < HangulTCount) {
if (sb == null)
return -1;
// make syllable of form LVT
last += TIndex;
sb [i - 1] = (char) last; // reset last
sb.Remove (i, 1);
i--; length--;
continue; // discard ch
}
}
// if neither case was true, just add the character
last = ch;
}
return length;
}
// Reads the UTF-16 code unit at index i from sb when it is non-null,
// otherwise from s. Lets the composition helpers work on either a mutable
// buffer or a read-only string.
static int Fetch (StringBuilder sb, string s, int i)
{
    if (sb == null)
        return s [i];
    return sb [i];
}
// Cf. figure 7, section 1.3 of http://unicode.org/reports/tr15/.
//
// Tries to compose the nearest preceding starter (combining class 0) with a
// following character. The text is read from sb when it is non-null (and is
// then modified in place), otherwise from s in "check only" mode.
//
// Return value:
//   -1           check-only mode and a composition would be possible;
//   current      a composition was applied to sb (one character was removed,
//                so the same index must be examined again);
//   otherwise    the index at which the caller should resume scanning.
static int TryComposeWithPreviousStarter (StringBuilder sb, string s, int current)
{
// Backtrack to previous starter.
int i = current - 1;
if (GetCombiningClass (Fetch (sb, s, current)) == 0) {
// 'current' is itself a starter: it can only compose with an
// immediately preceding starter.
if (i < 0 || GetCombiningClass (Fetch (sb, s, i)) != 0)
return current + 1;
} else {
while (i >= 0 && GetCombiningClass (Fetch (sb, s, i)) != 0)
i--;
// No starter precedes this combining sequence.
if (i < 0)
return current + 1;
}
int starter = Fetch (sb, s, i);
// The various decompositions involving starter follow this index.
int comp_idx = GetPrimaryCompositeHelperIndex (starter);
// 0 means no decomposition begins with this starter, so nothing can compose.
if (comp_idx == 0)
return current + 1;
int length = (sb != null ? sb.Length : s.Length);
int prevCombiningClass = -1;
for (int j = i + 1; j < length; j++) {
int candidate = Fetch (sb, s, j);
int combiningClass = GetCombiningClass (candidate);
if (combiningClass == prevCombiningClass)
// We skipped over a guy with the same class, without
// combining. Skip this one, too.
continue;
int composed = TryCompose (comp_idx, starter, candidate);
if (composed != 0) {
if (sb == null)
// Not normalized, and we are only checking.
return -1;
// Full Unicode warning: This will break when the underlying
// tables are extended.
// (The composite is stored as a single char, so it must fit in 16 bits.)
sb [i] = (char) composed;
sb.Remove (j, 1);
return current;
}
// Gray box. We're done.
// (A starter that did not compose ends the search for this starter.)
if (combiningClass == 0)
return j + 1;
prevCombiningClass = combiningClass;
}
return length;
}
// Walks the zero-terminated decomposition entries in mappedChars that begin
// at offset i and whose first element is 'starter', looking for the
// two-element entry { starter, candidate }. Returns the matching composite
// unless it carries the FullCompositionExclusion flag; returns 0 when no
// usable composite exists.
static int TryCompose (int i, int starter, int candidate)
{
    int entry = i;
    while (mappedChars [entry] == starter) {
        bool isExactPair = mappedChars [entry + 1] == candidate &&
                           mappedChars [entry + 2] == 0;
        if (isExactPair) {
            int composite = GetPrimaryCompositeFromMapIndex (entry);
            uint flags = PropValue (composite);
            if ((flags & FullCompositionExclusion) == 0)
                return composite;
        }

        // Skip this entry: advance to its terminator, then one past it.
        while (mappedChars [entry] != 0)
            entry++;
        entry++;
    }
    return 0;
}
// Returns the decomposed form of source for the given check type, or the
// original string instance when no change was necessary.
static string Decompose (string source, int checkType)
{
    StringBuilder result = null;
    Decompose (source, ref result, checkType);
    if (result == null)
        return source;
    return result.ToString ();
}
// Decomposes source into sb. sb is created lazily: it stays null when no
// character needed decomposing and no reordering was required.
static void Decompose (string source,
    ref StringBuilder sb, int checkType)
{
    int [] scratch = null;
    // Index of the first source character not yet copied into sb.
    int pending = 0;

    int pos = 0;
    while (pos < source.Length) {
        if (QuickCheck (source [pos], checkType) == NormalizationCheck.No) {
            DecomposeChar (ref sb, ref scratch, source,
                pos, checkType, ref pending);
        }
        pos++;
    }

    // Flush the untouched tail once a buffer exists.
    if (sb != null) {
        sb.Append (source, pending, source.Length - pending);
    }

    ReorderCanonical (source, ref sb, 1);
}
// Sorts runs of combining marks into non-decreasing combining-class order.
// When sb is null, src is scanned first and a buffer is only allocated if an
// out-of-order pair is found; otherwise sb is reordered in place from 'start'.
static void ReorderCanonical (string src, ref StringBuilder sb, int start)
{
    if (sb == null) {
        // Check only with src: find the first mark that follows a
        // higher-class character.
        int firstMisordered = -1;
        for (int k = 1; k < src.Length; k++) {
            int cc = GetCombiningClass (src [k]);
            if (cc != 0 && GetCombiningClass (src [k - 1]) > cc) {
                firstMisordered = k;
                break;
            }
        }
        if (firstMisordered < 0)
            return;

        sb = new StringBuilder (src.Length);
        sb.Append (src, 0, src.Length);
        start = firstMisordered;
    }

    // Work on sb from here on.
    int pos = start;
    while (pos < sb.Length) {
        int cc = GetCombiningClass (sb [pos]);
        if (cc != 0 && GetCombiningClass (sb [pos - 1]) > cc) {
            char previous = sb [pos - 1];
            sb [pos - 1] = sb [pos];
            sb [pos] = previous;
            // The swapped mark may have to move further left; re-check
            // one position back (but never below index 1).
            if (pos > 1)
                pos--;
        } else {
            pos++;
        }
    }
}
// Appends the decomposition of s[i] to sb.
//
// sb    - output buffer; created on first use.
// buf   - scratch array for the decomposed code points; created on first use
//         (19 slots; presumably sized for the longest decomposition plus a
//         terminator - confirm against the generated tables).
// s, i  - source string and the index of the character to decompose.
// start - index of the first source character not yet copied to sb; the
//         untouched run [start, i) is copied first, then start is advanced
//         past i.
static void DecomposeChar (ref StringBuilder sb,
    ref int [] buf, string s, int i, int checkType, ref int start)
{
    if (sb == null)
        sb = new StringBuilder (s.Length + 100);
    sb.Append (s, start, i - start);
    if (buf == null)
        buf = new int [19];
    int n = GetCanonical (s [i], buf, 0, checkType);
    for (int x = 0; x < n; x++) {
        int cp = buf [x];
        if (cp <= char.MaxValue) {
            // BMP code point (U+FFFF included): a single UTF-16 code unit.
            sb.Append ((char) cp);
        } else {
            // Supplementary code point: encode as a UTF-16 surrogate pair.
            // Subtract 0x10000, then split the remaining 20 bits into the
            // high 10 bits (lead surrogate) and low 10 bits (trail surrogate).
            int offset = cp - 0x10000;
            sb.Append ((char) ((offset >> 10) + 0xD800));
            sb.Append ((char) ((offset & 0x3FF) + 0xDC00));
        }
    }
    start = i + 1;
}
// Per-character quick check for a normalization form.
// type: 1 = NFD, 2 = NFKC, 3 = NFKD; any other value is treated as NFC.
// The decomposing forms only ever answer Yes or No; the composing forms may
// also answer Maybe.
public static NormalizationCheck QuickCheck (char c, int type)
{
    if (type == 1 || type == 3) {
        // Precomposed Hangul syllables always decompose.
        if (c >= '\uAC00' && c <= '\uD7A3')
            return NormalizationCheck.No;

        uint noBit = (type == 1) ? (uint) NoNfd : (uint) NoNfkd;
        if ((PropValue ((int) c) & noBit) != 0)
            return NormalizationCheck.No;
        return NormalizationCheck.Yes;
    }

    // Composing forms: NFKC for type 2, NFC for everything else.
    uint flags = PropValue ((int) c);
    uint no = (type == 2) ? (uint) NoNfkc : (uint) NoNfc;
    uint maybe = (type == 2) ? (uint) MaybeNfkc : (uint) MaybeNfc;

    if ((flags & no) != 0)
        return NormalizationCheck.No;
    if ((flags & maybe) != 0)
        return NormalizationCheck.Maybe;
    return NormalizationCheck.Yes;
}
/* for now we don't use FC_NFKC closure
public static bool IsMultiForm (char c)
{
return (PropValue ((int) c) & 0xF0000000) != 0;
}
public static char SingleForm (char c)
{
uint v = PropValue ((int) c);
int idx = (int) ((v & 0x7FFF0000) >> 16);
return (char) singleNorm [idx];
}
public static void MultiForm (char c, char [] buf, int index)
{
// FIXME: handle surrogate
uint v = PropValue ((int) c);
int midx = (int) ((v & 0x7FFF0000) >> 16);
buf [index] = (char) multiNorm [midx];
buf [index + 1] = (char) multiNorm [midx + 1];
buf [index + 2] = (char) multiNorm [midx + 2];
buf [index + 3] = (char) multiNorm [midx + 3];
if (buf [index + 3] != 0)
buf [index + 4] = (char) 0; // zero termination
}
*/
// Constants for arithmetic Hangul syllable composition / decomposition.
// SBase is the first precomposed syllable; LBase and VBase are the first
// leading-consonant and vowel jamo. TBase is one below the first trailing
// consonant, so a T index of 0 means "no trailing consonant" (see the
// 0 < TIndex test in CombineHangul and T != HangulTBase in GetCanonicalHangul).
const int HangulSBase = 0xAC00, HangulLBase = 0x1100,
HangulVBase = 0x1161, HangulTBase = 0x11A7,
HangulLCount = 19, HangulVCount = 21, HangulTCount = 28,
HangulNCount = HangulVCount * HangulTCount, // 588
HangulSCount = HangulLCount * HangulNCount; // 11172
// Decomposes a precomposed Hangul syllable into its L, V and optional T jamo,
// writing them to buf starting at bufIdx followed by a 0 terminator.
// Returns the index just past the last jamo written, or bufIdx unchanged when
// s is not a Hangul syllable.
private static int GetCanonicalHangul (int s, int [] buf, int bufIdx)
{
    int syllable = s - HangulSBase;
    bool isSyllable = syllable >= 0 && syllable < HangulSCount;
    if (!isSyllable)
        return bufIdx;

    int lead = syllable / HangulNCount;
    int vowel = (syllable % HangulNCount) / HangulTCount;
    int trail = syllable % HangulTCount;

    buf [bufIdx] = HangulLBase + lead;
    buf [bufIdx + 1] = HangulVBase + vowel;
    int next = bufIdx + 2;

    // A trail index of 0 means the syllable has no trailing consonant.
    if (trail != 0) {
        buf [next] = HangulTBase + trail;
        next++;
    }

    buf [next] = 0;
    return next;
}
// Writes the full decomposition of code point c into buf starting at bufIdx
// and returns the index just past the last element written.
static int GetCanonical (int c, int [] buf, int bufIdx, int checkType)
{
    // Hangul syllables are decomposed arithmetically.
    int afterHangul = GetCanonicalHangul (c, buf, bufIdx);
    if (afterHangul > bufIdx)
        return afterHangul;

    int mapPos = CharMapIdx (c);
    if (mapPos == 0 || mappedChars [mapPos] == c) {
        // No mapping (or it maps to itself): emit c unchanged.
        buf [bufIdx] = c;
        return bufIdx + 1;
    }

    // Character c maps to one or more decomposed chars.
    while (mappedChars [mapPos] != 0) {
        int part = mappedChars [mapPos];
        // http://www.unicode.org/reports/tr15/tr15-31.html, 1.3:
        // Full decomposition involves recursive application of the
        // Decomposition_Mapping values. Note that QuickCheck does
        // not currently support astral plane codepoints.
        bool isFinal = part <= 0xffff &&
                       QuickCheck ((char) part, checkType) == NormalizationCheck.Yes;
        if (isFinal)
            buf [bufIdx++] = part;
        else
            bufIdx = GetCanonical (part, buf, bufIdx, checkType);
        mapPos++;
    }
    return bufIdx;
}
// Reports whether source is already in the given normalization form.
// FormD, FormKC and FormKD map to internal types 1, 2 and 3; every other
// value (including FormC) is handled as type 0.
public static bool IsNormalized (string source, NormalizationForm normalizationForm)
{
    int type;
    if (normalizationForm == NormalizationForm.FormD)
        type = 1;
    else if (normalizationForm == NormalizationForm.FormKC)
        type = 2;
    else if (normalizationForm == NormalizationForm.FormKD)
        type = 3;
    else
        type = 0;
    return IsNormalized (source, type);
}
// Reports whether source is already normalized for the given internal type
// (1 = NFD, 2 = NFKC, 3 = NFKD; any other value is treated as NFC, matching
// QuickCheck and Normalize).
//
// Returns false as soon as combining marks are found out of canonical order
// or a character fails the quick check outright. A Maybe result cannot be
// resolved per character, so the whole string is normalized and compared.
public static bool IsNormalized (string source, int type)
{
    int prevCC = -1;
    for (int i = 0; i < source.Length; i++) {
        int cc = GetCombiningClass (source [i]);
        // A non-starter with a lower class than its predecessor is misordered.
        if (cc != 0 && cc < prevCC)
            return false;
        prevCC = cc;

        switch (QuickCheck (source [i], type)) {
        case NormalizationCheck.No:
            return false;
        case NormalizationCheck.Maybe:
            // Maybe is only produced for composing forms, and Normalize
            // treats every type other than 1 and 3 as composing, so this
            // comparison is valid for any type that can reach here. The
            // previous per-character fallback indexed past the end of the
            // string for type values outside 0..3.
            return source == Normalize (source, type);
        }
    }
    return true;
}
// Normalizes source to the given form. FormD, FormKC and FormKD map to
// internal types 1, 2 and 3; every other value (including FormC) is handled
// as type 0.
public static string Normalize (string source, NormalizationForm normalizationForm)
{
    int type;
    if (normalizationForm == NormalizationForm.FormD)
        type = 1;
    else if (normalizationForm == NormalizationForm.FormKC)
        type = 2;
    else if (normalizationForm == NormalizationForm.FormKD)
        type = 3;
    else
        type = 0;
    return Normalization.Normalize (source, type);
}
// Normalizes source for the given internal type: 1 (NFD) and 3 (NFKD) only
// decompose; every other value goes through decomposition plus composition.
public static string Normalize (string source, int type)
{
    bool decomposeOnly = type == 1 || type == 3;
    if (decomposeOnly)
        return Decompose (source, type);
    return Compose (source, type);
}
// Raw pointers to the normalization tables; all are assigned once in the
// static constructor.
// Property flags per code point (read by PropValue).
static byte* props;
// Zero-terminated decomposition sequences (read by TryCompose / GetCanonical).
static int* mappedChars;
// Per code point offset into mappedChars (read by CharMapIdx).
static short* charMapIndex;
// Per starter offset into mappedChars (read by GetPrimaryCompositeHelperIndex).
static short* helperIndex;
// mappedChars offset -> composite character (read by GetPrimaryCompositeFromMapIndex).
static ushort* mapIdxToComposite;
// Combining class per code point (read by GetCombiningClass).
static byte* combiningClass;
#if GENERATE_TABLE
// Table-generator build: the tables are managed arrays (propsArr etc. are not
// declared in this part of the file; presumably they come from the
// autogenerated code appended below), so they are always available.
public static readonly bool IsReady = true; // always
// Captures a raw pointer to each table array.
// NOTE(review): each pointer is stored in a static field and used after its
// 'fixed' statement has ended, i.e. after the array is no longer pinned.
// Confirm that these arrays cannot be relocated by the GC in this build.
static Normalization ()
{
fixed (byte* tmp = propsArr) {
props = tmp;
}
fixed (int* tmp = mappedCharsArr) {
mappedChars = tmp;
}
fixed (short* tmp = charMapIndexArr) {
charMapIndex = tmp;
}
fixed (short* tmp = helperIndexArr) {
helperIndex = tmp;
}
fixed (ushort* tmp = mapIdxToCompositeArr) {
mapIdxToComposite = tmp;
}
fixed (byte* tmp = combiningClassArr) {
combiningClass = tmp;
}
}
#else
// Lock object held by the static constructor while the table pointers are loaded.
static object forLock = new object ();
// Set to true at the end of the static constructor, after the tables are loaded.
public static readonly bool isReady;
// True once the normalization tables have been loaded.
public static bool IsReady {
get { return isReady; }
}
// Runtime-internal call (implemented outside this file) that returns the
// addresses of the six normalization tables through its out parameters.
[MethodImpl (MethodImplOptions.InternalCall)]
static extern void load_normalization_resource (
out IntPtr props, out IntPtr mappedChars,
out IntPtr charMapIndex, out IntPtr helperIndex,
out IntPtr mapIdxToComposite, out IntPtr combiningClass);
// Loads the table addresses from the runtime and publishes them through the
// static pointer fields, then marks the class as ready.
static Normalization ()
{
    IntPtr propsPtr, mappedCharsPtr, charMapIndexPtr;
    IntPtr helperIndexPtr, compositePtr, combiningClassPtr;

    lock (forLock) {
        load_normalization_resource (
            out propsPtr, out mappedCharsPtr,
            out charMapIndexPtr, out helperIndexPtr,
            out compositePtr, out combiningClassPtr);

        props = (byte*) propsPtr;
        mappedChars = (int*) mappedCharsPtr;
        charMapIndex = (short*) charMapIndexPtr;
        helperIndex = (short*) helperIndexPtr;
        mapIdxToComposite = (ushort*) compositePtr;
        combiningClass = (byte*) combiningClassPtr;
    }

    isReady = true;
}
}
}
#endif
//
// autogenerated code or icall to fill array runs here
//