Xamarin Public Jenkins (auto-signing) e79aa3c0ed Imported Upstream version 4.6.0.125
Former-commit-id: a2155e9bd80020e49e72e86c44da02a8ac0e57a4
2016-08-03 10:59:49 +00:00

1365 lines
69 KiB
C#

//------------------------------------------------------------------------------
// <copyright file="RegexCharClass.cs" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
// This RegexCharClass class provides the "set of Unicode chars" functionality
// used by the regexp engine.
// The main function of RegexCharClass is as a builder to turn ranges, characters and
// Unicode categories into a single string. This string is used as a black box
// representation of a character class by the rest of Regex. The format is as follows.
//
// Char index   Use
//      0       Flags - currently this only holds the "negate" flag
//      1       Length of the string representing the "set" portion, e.g. [a-z0-9] only has a "set"
//      2       Length of the string representing the "category" portion, e.g. [\p{Lu}] only has a "category"
//      3...m   The set. These are a series of ranges which define the characters included in the set.
//              To determine if a given character is in the set, we binary search over this set of ranges
//              and see where the character should go. Based on whether the ending index is odd or even,
//              we know if the character is in the set.
//      m+1...n The categories. This is a list of UnicodeCategory enum values which describe categories
//              included in this class.
namespace System.Text.RegularExpressions {
using System.Collections;
using System.Collections.Generic;
using System.Globalization;
using System.Diagnostics;
internal sealed class RegexCharClass {
// instance data
// List of ranges held by this class (the "ranges" argument of the private constructor).
private List<SingleRange> _rangelist;
// Category text held by this class (the "categories" argument of the private constructor).
private StringBuilder _categories;
// Set to true by both constructors.
// NOTE(review): presumably tracks whether _rangelist is already sorted/merged - confirm against the code that clears it.
private bool _canonical;
// Negation flag; written by the Negate setter and read by CanMerge.
private bool _negate;
// Optional class passed as "subtraction" to the private constructor; CanMerge is false when this is non-null.
private RegexCharClass _subtractor;
// Constants
// Indexes into the class string whose layout is described in the file header:
// char 0 = flags, char 1 = set length, char 2 = category length, char 3 = first char of the set.
private const int FLAGS = 0;
private const int SETLENGTH = 1;
private const int CATEGORYLENGTH = 2;
private const int SETSTART = 3;
// Lowest and highest UTF-16 code unit values.
private const char Nullchar = '\0';
private const char Lastchar = '\uFFFF';
// Delimiter (char 0) that the static constructor places at both ends of a multi-category group string.
private const char GroupChar = (char) 0;
// 100 and -100; the Space category string below is the single char 0x64 (= 100).
// NOTE(review): the code consuming these two constants is outside this view - confirm their exact role.
private const short SpaceConst = 100;
private const short NotSpaceConst = -100;
// U+200D and U+200C.
private const char ZeroWidthJoiner = '\u200D';
private const char ZeroWidthNonJoiner = '\u200C';
// Dictionary key under which the static constructor registers the {Ll, Lt, Lu} group.
private static readonly String InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__";
// One-char category string with value 0x64 (100, the same value as SpaceConst).
private static readonly String Space = "\x64";
// Result of NegateCategory(Space); NegateCategory is defined elsewhere in this class.
private static readonly String NotSpace = NegateCategory(Space);
// Word and NotWord are assigned in the static constructor.
private static readonly String Word;
private static readonly String NotWord;
// Complete class strings (flags, set length, category length, payload); all six are assigned in the static constructor.
internal static readonly String SpaceClass;
internal static readonly String NotSpaceClass;
internal static readonly String WordClass;
internal static readonly String NotWordClass;
internal static readonly String DigitClass;
internal static readonly String NotDigitClass;
// "Set" payloads for the ECMA classes, written as consecutive boundary chars
// (e.g. the digit set is '\u0030' followed by '\u003A', the char just past '9').
// The NotECMA*Set variants are the same boundaries with a leading '\0'.
// NOTE(review): the NotECMA*Set constants are not referenced in the visible part of the file - confirm their callers.
private const String ECMASpaceSet = "\u0009\u000E\u0020\u0021";
private const String NotECMASpaceSet = "\0\u0009\u000E\u0020\u0021";
private const String ECMAWordSet = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
private const String NotECMAWordSet = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
private const String ECMADigitSet = "\u0030\u003A";
private const String NotECMADigitSet = "\0\u0030\u003A";
// Complete ECMA class strings: three header chars (flags, set length, category length = 0)
// followed by the set. The Not* variants differ only in the flags char (\x01 instead of \x00).
internal const String ECMASpaceClass = "\x00\x04\x00" + ECMASpaceSet;
internal const String NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceSet;
internal const String ECMAWordClass = "\x00\x0A\x00" + ECMAWordSet;
internal const String NotECMAWordClass = "\x01\x0A\x00" + ECMAWordSet;
internal const String ECMADigitClass = "\x00\x02\x00" + ECMADigitSet;
internal const String NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet;
// AnyClass: flags 0, set length 1, category length 0, set = the single boundary char '\0'.
internal const String AnyClass = "\x00\x01\x00\x00";
// EmptyClass: flags 0, zero-length set, zero-length categories.
internal const String EmptyClass = "\x00\x00\x00";
// Category name ("Lu", "L", ...) -> encoded category string; assigned once, at the end of the static constructor.
static Dictionary<String, String> _definedCategories;
/*
* The property table contains all the block definitions defined in the
* XML schema spec (http://www.w3.org/TR/2001/PR-xmlschema-2-20010316/#charcter-classes), the Unicode 4.0 spec (www.unicode.org),
* and Perl 5.6 (see Programming Perl, 3rd edition, page 167). Three blocks defined by Perl (and here) may
* not be in the Unicode spec: IsHighPrivateUseSurrogates, IsHighSurrogates, and IsLowSurrogates.
*
**/
// The rows have to be sorted by the first column (the name) in ordinal order; DBG builds assert this in the static constructor.
private static readonly String[,] _propTable = {
{"IsAlphabeticPresentationForms", "\uFB00\uFB50"},
{"IsArabic", "\u0600\u0700"},
{"IsArabicPresentationForms-A", "\uFB50\uFE00"},
{"IsArabicPresentationForms-B", "\uFE70\uFF00"},
{"IsArmenian", "\u0530\u0590"},
{"IsArrows", "\u2190\u2200"},
{"IsBasicLatin", "\u0000\u0080"},
{"IsBengali", "\u0980\u0A00"},
{"IsBlockElements", "\u2580\u25A0"},
{"IsBopomofo", "\u3100\u3130"},
{"IsBopomofoExtended", "\u31A0\u31C0"},
{"IsBoxDrawing", "\u2500\u2580"},
{"IsBraillePatterns", "\u2800\u2900"},
{"IsBuhid", "\u1740\u1760"},
{"IsCJKCompatibility", "\u3300\u3400"},
{"IsCJKCompatibilityForms", "\uFE30\uFE50"},
{"IsCJKCompatibilityIdeographs", "\uF900\uFB00"},
{"IsCJKRadicalsSupplement", "\u2E80\u2F00"},
{"IsCJKSymbolsandPunctuation", "\u3000\u3040"},
{"IsCJKUnifiedIdeographs", "\u4E00\uA000"},
{"IsCJKUnifiedIdeographsExtensionA", "\u3400\u4DC0"},
{"IsCherokee", "\u13A0\u1400"},
{"IsCombiningDiacriticalMarks", "\u0300\u0370"},
{"IsCombiningDiacriticalMarksforSymbols","\u20D0\u2100"},
{"IsCombiningHalfMarks", "\uFE20\uFE30"},
{"IsCombiningMarksforSymbols", "\u20D0\u2100"},
{"IsControlPictures", "\u2400\u2440"},
{"IsCurrencySymbols", "\u20A0\u20D0"},
{"IsCyrillic", "\u0400\u0500"},
{"IsCyrillicSupplement", "\u0500\u0530"},
{"IsDevanagari", "\u0900\u0980"},
{"IsDingbats", "\u2700\u27C0"},
{"IsEnclosedAlphanumerics", "\u2460\u2500"},
{"IsEnclosedCJKLettersandMonths", "\u3200\u3300"},
{"IsEthiopic", "\u1200\u1380"},
{"IsGeneralPunctuation", "\u2000\u2070"},
{"IsGeometricShapes", "\u25A0\u2600"},
{"IsGeorgian", "\u10A0\u1100"},
{"IsGreek", "\u0370\u0400"},
{"IsGreekExtended", "\u1F00\u2000"},
{"IsGreekandCoptic", "\u0370\u0400"},
{"IsGujarati", "\u0A80\u0B00"},
{"IsGurmukhi", "\u0A00\u0A80"},
{"IsHalfwidthandFullwidthForms", "\uFF00\uFFF0"},
{"IsHangulCompatibilityJamo", "\u3130\u3190"},
{"IsHangulJamo", "\u1100\u1200"},
{"IsHangulSyllables", "\uAC00\uD7B0"},
{"IsHanunoo", "\u1720\u1740"},
{"IsHebrew", "\u0590\u0600"},
{"IsHighPrivateUseSurrogates", "\uDB80\uDC00"},
{"IsHighSurrogates", "\uD800\uDB80"},
{"IsHiragana", "\u3040\u30A0"},
{"IsIPAExtensions", "\u0250\u02B0"},
{"IsIdeographicDescriptionCharacters", "\u2FF0\u3000"},
{"IsKanbun", "\u3190\u31A0"},
{"IsKangxiRadicals", "\u2F00\u2FE0"},
{"IsKannada", "\u0C80\u0D00"},
{"IsKatakana", "\u30A0\u3100"},
{"IsKatakanaPhoneticExtensions", "\u31F0\u3200"},
{"IsKhmer", "\u1780\u1800"},
{"IsKhmerSymbols", "\u19E0\u1A00"},
{"IsLao", "\u0E80\u0F00"},
{"IsLatin-1Supplement", "\u0080\u0100"},
{"IsLatinExtended-A", "\u0100\u0180"},
{"IsLatinExtended-B", "\u0180\u0250"},
{"IsLatinExtendedAdditional", "\u1E00\u1F00"},
{"IsLetterlikeSymbols", "\u2100\u2150"},
{"IsLimbu", "\u1900\u1950"},
{"IsLowSurrogates", "\uDC00\uE000"},
{"IsMalayalam", "\u0D00\u0D80"},
{"IsMathematicalOperators", "\u2200\u2300"},
{"IsMiscellaneousMathematicalSymbols-A","\u27C0\u27F0"},
{"IsMiscellaneousMathematicalSymbols-B","\u2980\u2A00"},
{"IsMiscellaneousSymbols", "\u2600\u2700"},
{"IsMiscellaneousSymbolsandArrows", "\u2B00\u2C00"},
{"IsMiscellaneousTechnical", "\u2300\u2400"},
{"IsMongolian", "\u1800\u18B0"},
{"IsMyanmar", "\u1000\u10A0"},
{"IsNumberForms", "\u2150\u2190"},
{"IsOgham", "\u1680\u16A0"},
{"IsOpticalCharacterRecognition", "\u2440\u2460"},
{"IsOriya", "\u0B00\u0B80"},
{"IsPhoneticExtensions", "\u1D00\u1D80"},
{"IsPrivateUse", "\uE000\uF900"},
{"IsPrivateUseArea", "\uE000\uF900"},
{"IsRunic", "\u16A0\u1700"},
{"IsSinhala", "\u0D80\u0E00"},
{"IsSmallFormVariants", "\uFE50\uFE70"},
{"IsSpacingModifierLetters", "\u02B0\u0300"},
{"IsSpecials", "\uFFF0"},
{"IsSuperscriptsandSubscripts", "\u2070\u20A0"},
{"IsSupplementalArrows-A", "\u27F0\u2800"},
{"IsSupplementalArrows-B", "\u2900\u2980"},
{"IsSupplementalMathematicalOperators", "\u2A00\u2B00"},
{"IsSyriac", "\u0700\u0750"},
{"IsTagalog", "\u1700\u1720"},
{"IsTagbanwa", "\u1760\u1780"},
{"IsTaiLe", "\u1950\u1980"},
{"IsTamil", "\u0B80\u0C00"},
{"IsTelugu", "\u0C00\u0C80"},
{"IsThaana", "\u0780\u07C0"},
{"IsThai", "\u0E00\u0E80"},
{"IsTibetan", "\u0F00\u1000"},
{"IsUnifiedCanadianAboriginalSyllabics","\u1400\u1680"},
{"IsVariationSelectors", "\uFE00\uFE10"},
{"IsYiRadicals", "\uA490\uA4D0"},
{"IsYiSyllables", "\uA000\uA490"},
{"IsYijingHexagramSymbols", "\u4DC0\u4E00"},
{"_xmlC", /* Name Char */ "\u002D\u002F\u0030\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00B7\u00B8\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u02D0\u02D2\u0300\u0346\u0360\u0362\u0386\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0483\u0487\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0653\u0660\u066A\u0670\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06E9\u06EA\u06EE\u06F0\u06FA\u0901\u0904\u0905\u093A\u093C\u094E\u0951\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC"
+"\u09DE\u09DF\u09E4\u09E6\u09F2\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B70\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF0\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2"
+"\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0E01\u0E2F\u0E30\u0E3B\u0E40\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0F18\u0F1A\u0F20\u0F2A\u0F35\u0F36\u0F37\u0F38\u0F39\u0F3A\u0F3E\u0F48\u0F49\u0F6A\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F96\u0F97\u0F98\u0F99\u0FAE\u0FB1\u0FB8\u0FB9\u0FBA\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A9\u11AB\u11AC\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00"
+"\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u20D0\u20DD\u20E1\u20E2\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3005\u3006\u3007\u3008\u3021\u3030\u3031\u3036\u3041\u3095\u3099\u309B\u309D\u309F\u30A1\u30FB\u30FC\u30FF\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"},
{"_xmlD", "\u0030\u003A\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF0\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F2A\u1040\u104A\u1369\u1372\u17E0\u17EA\u1810\u181A\uFF10\uFF1A"},
{"_xmlI", /* Start Name Char */ "\u003A\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u0386\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0641\u064B\u0671\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06D6\u06E5\u06E7\u0905\u093A\u093D\u093E\u0958\u0962\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09DC\u09DE\u09DF\u09E2\u09F0\u09F2\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A59\u0A5D\u0A5E\u0A5F\u0A72\u0A75\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABD\u0ABE\u0AE0\u0AE1\u0B05\u0B0D\u0B0F"
+"\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3D\u0B3E\u0B5C\u0B5E\u0B5F\u0B62\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C60\u0C62\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CDE\u0CDF\u0CE0\u0CE2\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D60\u0D62\u0E01\u0E2F\u0E30\u0E31\u0E32\u0E34\u0E40\u0E46\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EB1\u0EB2\u0EB4\u0EBD\u0EBE\u0EC0\u0EC5\u0F40\u0F48\u0F49\u0F6A\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A9\u11AB\u11AC"
+"\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3007\u3008\u3021\u302A\u3041\u3095\u30A1\u30FB\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"},
{"_xmlW", "\u0024\u0025\u002B\u002C\u0030\u003A\u003C\u003F\u0041\u005B\u005E\u005F\u0060\u007B\u007C\u007D\u007E\u007F\u00A2\u00AB\u00AC\u00AD\u00AE\u00B7\u00B8\u00BB\u00BC\u00BF\u00C0\u0221\u0222\u0234\u0250\u02AE\u02B0\u02EF\u0300\u0350\u0360\u0370\u0374\u0376\u037A\u037B\u0384\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03F7\u0400\u0487\u0488\u04CF\u04D0\u04F6\u04F8\u04FA\u0500\u0510\u0531\u0557\u0559\u055A\u0561\u0588\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0656\u0660\u066A\u066E\u06D4\u06D5\u06DD\u06DE\u06EE\u06F0\u06FF\u0710\u072D\u0730\u074B\u0780\u07B2\u0901\u0904\u0905\u093A\u093C\u094E\u0950\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC\u09DE\u09DF\u09E4\u09E6\u09FB\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35"
+"\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AD0\u0AD1\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B71\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF3\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49"
+"\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0D82\u0D84\u0D85\u0D97\u0D9A\u0DB2\u0DB3\u0DBC\u0DBD\u0DBE\u0DC0\u0DC7\u0DCA\u0DCB\u0DCF\u0DD5\u0DD6\u0DD7\u0DD8\u0DE0\u0DF2\u0DF4\u0E01\u0E3B\u0E3F\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0EDC\u0EDE\u0F00\u0F04\u0F13\u0F3A\u0F3E\u0F48\u0F49\u0F6B\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F98\u0F99\u0FBD\u0FBE\u0FCD\u0FCF\u0FD0\u1000\u1022\u1023\u1028\u1029\u102B\u102C\u1033\u1036\u103A\u1040\u104A\u1050\u105A\u10A0\u10C6\u10D0\u10F9\u1100\u115A\u115F\u11A3\u11A8\u11FA\u1200\u1207\u1208\u1247\u1248\u1249\u124A\u124E\u1250\u1257\u1258\u1259\u125A\u125E\u1260\u1287\u1288\u1289\u128A\u128E\u1290\u12AF\u12B0\u12B1\u12B2\u12B6\u12B8\u12BF\u12C0\u12C1\u12C2\u12C6\u12C8\u12CF\u12D0\u12D7\u12D8\u12EF\u12F0\u130F\u1310\u1311\u1312\u1316\u1318\u131F\u1320\u1347\u1348\u135B\u1369\u137D\u13A0"
+"\u13F5\u1401\u166D\u166F\u1677\u1681\u169B\u16A0\u16EB\u16EE\u16F1\u1700\u170D\u170E\u1715\u1720\u1735\u1740\u1754\u1760\u176D\u176E\u1771\u1772\u1774\u1780\u17D4\u17D7\u17D8\u17DB\u17DD\u17E0\u17EA\u180B\u180E\u1810\u181A\u1820\u1878\u1880\u18AA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FC5\u1FC6\u1FD4\u1FD6\u1FDC\u1FDD\u1FF0\u1FF2\u1FF5\u1FF6\u1FFF\u2044\u2045\u2052\u2053\u2070\u2072\u2074\u207D\u207F\u208D\u20A0\u20B2\u20D0\u20EB\u2100\u213B\u213D\u214C\u2153\u2184\u2190\u2329\u232B\u23B4\u23B7\u23CF\u2400\u2427\u2440\u244B\u2460\u24FF\u2500\u2614\u2616\u2618\u2619\u267E\u2680\u268A\u2701\u2705\u2706\u270A\u270C\u2728\u2729\u274C\u274D\u274E\u274F\u2753\u2756\u2757\u2758\u275F\u2761\u2768\u2776\u2795\u2798\u27B0\u27B1\u27BF\u27D0\u27E6\u27F0\u2983\u2999\u29D8\u29DC\u29FC\u29FE\u2B00\u2E80\u2E9A\u2E9B\u2EF4\u2F00\u2FD6\u2FF0\u2FFC\u3004\u3008\u3012\u3014\u3020\u3030\u3031\u303D\u303E\u3040"
+"\u3041\u3097\u3099\u30A0\u30A1\u30FB\u30FC\u3100\u3105\u312D\u3131\u318F\u3190\u31B8\u31F0\u321D\u3220\u3244\u3251\u327C\u327F\u32CC\u32D0\u32FF\u3300\u3377\u337B\u33DE\u33E0\u33FF\u3400\u4DB6\u4E00\u9FA6\uA000\uA48D\uA490\uA4C7\uAC00\uD7A4\uF900\uFA2E\uFA30\uFA6B\uFB00\uFB07\uFB13\uFB18\uFB1D\uFB37\uFB38\uFB3D\uFB3E\uFB3F\uFB40\uFB42\uFB43\uFB45\uFB46\uFBB2\uFBD3\uFD3E\uFD50\uFD90\uFD92\uFDC8\uFDF0\uFDFD\uFE00\uFE10\uFE20\uFE24\uFE62\uFE63\uFE64\uFE67\uFE69\uFE6A\uFE70\uFE75\uFE76\uFEFD\uFF04\uFF05\uFF0B\uFF0C\uFF10\uFF1A\uFF1C\uFF1F\uFF21\uFF3B\uFF3E\uFF3F\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF66\uFFBF\uFFC2\uFFC8\uFFCA\uFFD0\uFFD2\uFFD8\uFFDA\uFFDD\uFFE0\uFFE7\uFFE8\uFFEF\uFFFC\uFFFE"},
};
/**************************************************************************
Let U be the set of Unicode character values and let L be the lowercase
function, mapping from U to U. To perform case insensitive matching of
character sets, we need to be able to map an interval I in U, say
I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
to a set A such that A contains L(I) and A is contained in the union of
I and L(I).
The table below partitions U into intervals on which L is non-decreasing.
Thus, for any interval J = [a, b] contained in one of these intervals,
L(J) is contained in [L(a), L(b)].
It is also true that for any such J, [L(a), L(b)] is contained in the
union of J and L(J). This does not follow from L being non-decreasing on
these intervals; it follows from the form that L takes on each interval.
On each interval, L has one of the following forms:
(1) L(ch) = constant (LowercaseSet)
(2) L(ch) = ch + offset (LowercaseAdd)
(3) L(ch) = ch | 1 (LowercaseBor)
(4) L(ch) = ch + (ch & 1) (LowercaseBad)
It is easy to verify that for any of these forms [L(a), L(b)] is
contained in the union of [a, b] and L([a, b]).
***************************************************************************/
// Operation codes used in the third argument of each _lcTable entry. They correspond to the
// four forms of the lowercase function L listed in the comment above.
private const int LowercaseSet = 0; // Set to arg.
private const int LowercaseAdd = 1; // Add arg.
private const int LowercaseBor = 2; // Bitwise or with 1.
private const int LowercaseBad = 3; // Bitwise and with 1 and add original.
// Lowercase mapping table. Each entry is written as (first char, last char, operation, argument);
// entries are listed in ascending order of their first char.
// NOTE(review): the LowerCaseMapping type is declared outside this view - presumably the first two
// arguments are the inclusive interval bounds and the last is the operand for Set/Add (unused, 0, for Bor/Bad).
private static readonly LowerCaseMapping[] _lcTable = new LowerCaseMapping[]
{
new LowerCaseMapping('\u0041', '\u005A', LowercaseAdd, 32),
new LowerCaseMapping('\u00C0', '\u00DE', LowercaseAdd, 32),
new LowerCaseMapping('\u0100', '\u012E', LowercaseBor, 0),
new LowerCaseMapping('\u0130', '\u0130', LowercaseSet, 0x0069),
new LowerCaseMapping('\u0132', '\u0136', LowercaseBor, 0),
new LowerCaseMapping('\u0139', '\u0147', LowercaseBad, 0),
new LowerCaseMapping('\u014A', '\u0176', LowercaseBor, 0),
new LowerCaseMapping('\u0178', '\u0178', LowercaseSet, 0x00FF),
new LowerCaseMapping('\u0179', '\u017D', LowercaseBad, 0),
new LowerCaseMapping('\u0181', '\u0181', LowercaseSet, 0x0253),
new LowerCaseMapping('\u0182', '\u0184', LowercaseBor, 0),
new LowerCaseMapping('\u0186', '\u0186', LowercaseSet, 0x0254),
new LowerCaseMapping('\u0187', '\u0187', LowercaseSet, 0x0188),
new LowerCaseMapping('\u0189', '\u018A', LowercaseAdd, 205),
new LowerCaseMapping('\u018B', '\u018B', LowercaseSet, 0x018C),
new LowerCaseMapping('\u018E', '\u018E', LowercaseSet, 0x01DD),
new LowerCaseMapping('\u018F', '\u018F', LowercaseSet, 0x0259),
new LowerCaseMapping('\u0190', '\u0190', LowercaseSet, 0x025B),
new LowerCaseMapping('\u0191', '\u0191', LowercaseSet, 0x0192),
new LowerCaseMapping('\u0193', '\u0193', LowercaseSet, 0x0260),
new LowerCaseMapping('\u0194', '\u0194', LowercaseSet, 0x0263),
new LowerCaseMapping('\u0196', '\u0196', LowercaseSet, 0x0269),
new LowerCaseMapping('\u0197', '\u0197', LowercaseSet, 0x0268),
new LowerCaseMapping('\u0198', '\u0198', LowercaseSet, 0x0199),
new LowerCaseMapping('\u019C', '\u019C', LowercaseSet, 0x026F),
new LowerCaseMapping('\u019D', '\u019D', LowercaseSet, 0x0272),
new LowerCaseMapping('\u019F', '\u019F', LowercaseSet, 0x0275),
new LowerCaseMapping('\u01A0', '\u01A4', LowercaseBor, 0),
new LowerCaseMapping('\u01A7', '\u01A7', LowercaseSet, 0x01A8),
new LowerCaseMapping('\u01A9', '\u01A9', LowercaseSet, 0x0283),
new LowerCaseMapping('\u01AC', '\u01AC', LowercaseSet, 0x01AD),
new LowerCaseMapping('\u01AE', '\u01AE', LowercaseSet, 0x0288),
new LowerCaseMapping('\u01AF', '\u01AF', LowercaseSet, 0x01B0),
new LowerCaseMapping('\u01B1', '\u01B2', LowercaseAdd, 217),
new LowerCaseMapping('\u01B3', '\u01B5', LowercaseBad, 0),
new LowerCaseMapping('\u01B7', '\u01B7', LowercaseSet, 0x0292),
new LowerCaseMapping('\u01B8', '\u01B8', LowercaseSet, 0x01B9),
new LowerCaseMapping('\u01BC', '\u01BC', LowercaseSet, 0x01BD),
new LowerCaseMapping('\u01C4', '\u01C5', LowercaseSet, 0x01C6),
new LowerCaseMapping('\u01C7', '\u01C8', LowercaseSet, 0x01C9),
new LowerCaseMapping('\u01CA', '\u01CB', LowercaseSet, 0x01CC),
new LowerCaseMapping('\u01CD', '\u01DB', LowercaseBad, 0),
new LowerCaseMapping('\u01DE', '\u01EE', LowercaseBor, 0),
new LowerCaseMapping('\u01F1', '\u01F2', LowercaseSet, 0x01F3),
new LowerCaseMapping('\u01F4', '\u01F4', LowercaseSet, 0x01F5),
new LowerCaseMapping('\u01FA', '\u0216', LowercaseBor, 0),
new LowerCaseMapping('\u0386', '\u0386', LowercaseSet, 0x03AC),
new LowerCaseMapping('\u0388', '\u038A', LowercaseAdd, 37),
new LowerCaseMapping('\u038C', '\u038C', LowercaseSet, 0x03CC),
new LowerCaseMapping('\u038E', '\u038F', LowercaseAdd, 63),
new LowerCaseMapping('\u0391', '\u03AB', LowercaseAdd, 32),
new LowerCaseMapping('\u03E2', '\u03EE', LowercaseBor, 0),
new LowerCaseMapping('\u0401', '\u040F', LowercaseAdd, 80),
new LowerCaseMapping('\u0410', '\u042F', LowercaseAdd, 32),
new LowerCaseMapping('\u0460', '\u0480', LowercaseBor, 0),
new LowerCaseMapping('\u0490', '\u04BE', LowercaseBor, 0),
new LowerCaseMapping('\u04C1', '\u04C3', LowercaseBad, 0),
new LowerCaseMapping('\u04C7', '\u04C7', LowercaseSet, 0x04C8),
new LowerCaseMapping('\u04CB', '\u04CB', LowercaseSet, 0x04CC),
new LowerCaseMapping('\u04D0', '\u04EA', LowercaseBor, 0),
new LowerCaseMapping('\u04EE', '\u04F4', LowercaseBor, 0),
new LowerCaseMapping('\u04F8', '\u04F8', LowercaseSet, 0x04F9),
new LowerCaseMapping('\u0531', '\u0556', LowercaseAdd, 48),
new LowerCaseMapping('\u10A0', '\u10C5', LowercaseAdd, 48),
new LowerCaseMapping('\u1E00', '\u1EF8', LowercaseBor, 0),
new LowerCaseMapping('\u1F08', '\u1F0F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F18', '\u1F1F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F28', '\u1F2F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F38', '\u1F3F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F48', '\u1F4D', LowercaseAdd, -8),
new LowerCaseMapping('\u1F59', '\u1F59', LowercaseSet, 0x1F51),
new LowerCaseMapping('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53),
new LowerCaseMapping('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55),
new LowerCaseMapping('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57),
new LowerCaseMapping('\u1F68', '\u1F6F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F88', '\u1F8F', LowercaseAdd, -8),
new LowerCaseMapping('\u1F98', '\u1F9F', LowercaseAdd, -8),
new LowerCaseMapping('\u1FA8', '\u1FAF', LowercaseAdd, -8),
new LowerCaseMapping('\u1FB8', '\u1FB9', LowercaseAdd, -8),
new LowerCaseMapping('\u1FBA', '\u1FBB', LowercaseAdd, -74),
new LowerCaseMapping('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3),
new LowerCaseMapping('\u1FC8', '\u1FCB', LowercaseAdd, -86),
new LowerCaseMapping('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3),
new LowerCaseMapping('\u1FD8', '\u1FD9', LowercaseAdd, -8),
new LowerCaseMapping('\u1FDA', '\u1FDB', LowercaseAdd, -100),
new LowerCaseMapping('\u1FE8', '\u1FE9', LowercaseAdd, -8),
new LowerCaseMapping('\u1FEA', '\u1FEB', LowercaseAdd, -112),
new LowerCaseMapping('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5),
new LowerCaseMapping('\u1FF8', '\u1FF9', LowercaseAdd, -128),
new LowerCaseMapping('\u1FFA', '\u1FFB', LowercaseAdd, -126),
new LowerCaseMapping('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3),
new LowerCaseMapping('\u2160', '\u216F', LowercaseAdd, 16),
// NOTE(review): 0x24B6 + 26 = 0x24D0, so the upper bound of this interval is itself the image of
// its lower bound - confirm whether '\u24CF' was the intended last char.
new LowerCaseMapping('\u24B6', '\u24D0', LowercaseAdd, 26),
new LowerCaseMapping('\uFF21', '\uFF3A', LowercaseAdd, 32),
};
// Static constructor. Builds the dictionary that maps Unicode general-category names to their
// encoded category strings, and derives from it the Word/NotWord category strings and the
// predefined Space/Word/Digit class strings.
//
// Encoding used below:
//   - a two-letter category ("Lu", "Nd", ...) maps to a one-char string whose value is
//     (int) UnicodeCategory.X + 1;
//   - a one-letter group ("L", "N", ...) maps to GroupChar + the chars of its members + GroupChar.
//
// The "groups" array is a scratch buffer reused for every group: slot 0 always holds GroupChar,
// slots 1..k are overwritten with the members of the current group, and slot k+1 must hold
// GroupChar before the group string is cut out with new String(groups, 0, k+2). The statement
// order therefore matters.
static RegexCharClass() {
// addressing Dictionary versus Hashtable thread safety difference by using
// a temp Dictionary. Note that this is just a theoretical concern since this
// is a static ctor and getter methods aren't called until after this is
// done; this is just to avoid the long-term possibility of thread safety
// problems.
Dictionary<String, String> tempCategories = new Dictionary<String, String>(32);
char[] groups = new char[9];
// "word" accumulates the category string behind Word: GroupChar, the five letter categories,
// Mn, Nd, Pc, GroupChar (10 chars in total).
StringBuilder word = new StringBuilder(11);
word.Append(GroupChar);
groups[0] = GroupChar;
// We need the UnicodeCategory enum values as a char so we can put them in a string
// in the dictionary. In order to get there, we first must cast to an int,
// then cast to a char
// Also need to distinguish between positive and negative values. UnicodeCategory is zero
// based, so we add one to each value and subtract it off later
// Others
groups[1] = (char) ((int) UnicodeCategory.Control + 1);
tempCategories["Cc"] = groups[1].ToString(); // Control
groups[2] = (char) ((int) UnicodeCategory.Format + 1);
tempCategories["Cf"] = groups[2].ToString(); // Format
groups[3] = (char) ((int) UnicodeCategory.OtherNotAssigned + 1);
tempCategories["Cn"] = groups[3].ToString(); // Not assigned
groups[4] = (char) ((int) UnicodeCategory.PrivateUse + 1);
tempCategories["Co"] = groups[4].ToString(); // Private use
groups[5] = (char) ((int) UnicodeCategory.Surrogate + 1);
tempCategories["Cs"] = groups[5].ToString(); // Surrogate
groups[6] = GroupChar;
tempCategories["C"] = new String(groups, 0, 7);
// Letters
groups[1] = (char) ((int) UnicodeCategory.LowercaseLetter + 1);
tempCategories["Ll"] = groups[1].ToString(); // Lowercase
groups[2] = (char) ((int) UnicodeCategory.ModifierLetter + 1);
tempCategories["Lm"] = groups[2].ToString(); // Modifier
groups[3] = (char) ((int) UnicodeCategory.OtherLetter + 1);
tempCategories["Lo"] = groups[3].ToString(); // Other
groups[4] = (char) ((int) UnicodeCategory.TitlecaseLetter + 1);
tempCategories["Lt"] = groups[4].ToString(); // Titlecase
groups[5] = (char) ((int) UnicodeCategory.UppercaseLetter + 1);
tempCategories["Lu"] = groups[5].ToString(); // Uppercase
// groups[6] still holds GroupChar from the "C" group above, so the assignment below is not needed.
//groups[6] = GroupChar;
tempCategories["L"] = new String(groups, 0, 7);
// All five letter categories (slots 1..5) are part of the word set.
word.Append(new String(groups, 1, 5));
// InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter}
// !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!!
// groups[6] supplies the closing GroupChar here.
tempCategories[InternalRegexIgnoreCase] = String.Format(CultureInfo.InvariantCulture, "{0}{1}{2}{3}{4}", GroupChar, groups[1], groups[4], groups[5], groups[6]);
// Marks
groups[1] = (char) ((int) UnicodeCategory.SpacingCombiningMark + 1);
tempCategories["Mc"] = groups[1].ToString(); // Spacing combining
groups[2] = (char) ((int) UnicodeCategory.EnclosingMark + 1);
tempCategories["Me"] = groups[2].ToString(); // Enclosing
groups[3] = (char) ((int) UnicodeCategory.NonSpacingMark + 1);
tempCategories["Mn"] = groups[3].ToString(); // Non-spacing
groups[4] = GroupChar;
tempCategories["M"] = new String(groups, 0, 5);
// Of the marks, only Mn (groups[3]) is appended to the word set; Mc (groups[1]) is not.
//word.Append(groups[1]);
word.Append(groups[3]);
// Numbers
groups[1] = (char) ((int) UnicodeCategory.DecimalDigitNumber + 1);
tempCategories["Nd"] = groups[1].ToString(); // Decimal digit
groups[2] = (char) ((int) UnicodeCategory.LetterNumber + 1);
tempCategories["Nl"] = groups[2].ToString(); // Letter
groups[3] = (char) ((int) UnicodeCategory.OtherNumber + 1);
tempCategories["No"] = groups[3].ToString(); // Other
// groups[4] still holds GroupChar from the "M" group above, so the assignment below is not needed.
//groups[4] = GroupChar;
tempCategories["N"] = new String(groups, 0, 5);
// Of the numbers, only Nd (groups[1]) is appended to the word set; Nl and No are not.
word.Append(groups[1]);
//word.Append(new String(groups, 1, 3));
// Punctuation
groups[1] = (char) ((int) UnicodeCategory.ConnectorPunctuation + 1);
tempCategories["Pc"] = groups[1].ToString(); // Connector
groups[2] = (char) ((int) UnicodeCategory.DashPunctuation + 1);
tempCategories["Pd"] = groups[2].ToString(); // Dash
groups[3] = (char) ((int) UnicodeCategory.ClosePunctuation + 1);
tempCategories["Pe"] = groups[3].ToString(); // Close
groups[4] = (char) ((int) UnicodeCategory.OtherPunctuation + 1);
tempCategories["Po"] = groups[4].ToString(); // Other
groups[5] = (char) ((int) UnicodeCategory.OpenPunctuation + 1);
tempCategories["Ps"] = groups[5].ToString(); // Open
groups[6] = (char) ((int) UnicodeCategory.FinalQuotePunctuation + 1);
tempCategories["Pf"] = groups[6].ToString(); // Final quote
groups[7] = (char) ((int) UnicodeCategory.InitialQuotePunctuation + 1);
tempCategories["Pi"] = groups[7].ToString(); // Initial quote
groups[8] = GroupChar;
tempCategories["P"] = new String(groups, 0, 9);
// Of the punctuation, only Pc (groups[1]) is appended to the word set.
word.Append(groups[1]);
// Symbols
groups[1] = (char) ((int) UnicodeCategory.CurrencySymbol + 1);
tempCategories["Sc"] = groups[1].ToString(); // Currency
groups[2] = (char) ((int) UnicodeCategory.ModifierSymbol + 1);
tempCategories["Sk"] = groups[2].ToString(); // Modifier
groups[3] = (char) ((int) UnicodeCategory.MathSymbol + 1);
tempCategories["Sm"] = groups[3].ToString(); // Math
groups[4] = (char) ((int) UnicodeCategory.OtherSymbol + 1);
tempCategories["So"] = groups[4].ToString(); // Other
groups[5] = GroupChar;
tempCategories["S"] = new String(groups, 0, 6);
// Separators
groups[1] = (char) ((int) UnicodeCategory.LineSeparator + 1);
tempCategories["Zl"] = groups[1].ToString(); // Line
groups[2] = (char) ((int) UnicodeCategory.ParagraphSeparator + 1);
tempCategories["Zp"] = groups[2].ToString(); // Paragraph
groups[3] = (char) ((int) UnicodeCategory.SpaceSeparator + 1);
tempCategories["Zs"] = groups[3].ToString(); // Space
groups[4] = GroupChar;
tempCategories["Z"] = new String(groups, 0, 5);
// Close the word group and publish it together with its negation
// (NegateCategory is defined elsewhere in this class).
word.Append(GroupChar);
Word = word.ToString();
NotWord = NegateCategory(Word);
// Predefined class strings. Each starts with three header chars - flags, set length,
// category length - followed by the category payload; the set portion is empty (length 0).
// The Not* space and word classes differ from their positive forms only in the flags char (\x01).
SpaceClass = "\x00\x00\x01" + Space;
NotSpaceClass = "\x01\x00\x01" + Space;
WordClass = "\x00\x00" + (char) Word.Length + Word;
NotWordClass = "\x01\x00" + (char) Word.Length + Word;;
DigitClass = "\x00\x00\x01" + (char) ((int) UnicodeCategory.DecimalDigitNumber + 1);
// Unlike the two Not* classes above, the flags char stays \x00 here; the category value itself
// is negated instead (the unchecked cast wraps the negative int into a char).
NotDigitClass = "\x00\x00\x01" + unchecked ((char) (- ((int) UnicodeCategory.DecimalDigitNumber + 1)) );
#if DBG
// make sure the _propTable is correctly ordered
int len = _propTable.GetLength(0);
for (int i=0; i<len-1; i++)
Debug.Assert(String.Compare(_propTable[i,0], _propTable[i+1,0], StringComparison.Ordinal) < 0, "RegexCharClass _propTable is out of order at (" + _propTable[i,0] +", " + _propTable[i+1,0] + ")");
#endif
// Publish the fully populated dictionary in a single assignment.
_definedCategories = tempCategories;
}
/*
* RegexCharClass()
*
* Creates an empty character class.
*/
internal RegexCharClass() {
    // A class with no ranges is trivially in canonical (sorted, merged) form.
    _canonical = true;
    _categories = new StringBuilder();
    // Start with room for a handful of ranges.
    _rangelist = new List<SingleRange>(6);
}
// Builds a class directly from already-prepared parts. The supplied list and
// builder are stored as-is (no copies are made) and the class is marked canonical.
private RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories, RegexCharClass subtraction) {
    _negate = negate;
    _rangelist = ranges;
    _categories = categories;
    _subtractor = subtraction;
    _canonical = true;
}
// True when this class is neither negated nor carries a subtraction.
internal bool CanMerge {
    get {
        if (_negate)
            return false;
        return _subtractor == null;
    }
}
// Write-only switch for the negation flag of this class.
internal bool Negate {
    set {
        _negate = value;
    }
}
// Adds a single character, expressed as the one-element range [c, c].
internal void AddChar(char c) {
    AddRange(c, c);
}
/*
* AddCharClass()
*
* Adds a regex char class
*/
internal void AddCharClass(RegexCharClass cc) {
    Debug.Assert(cc.CanMerge && this.CanMerge, "Both character classes added together must be able to merge" );

    // We can only remain canonical if we are canonical now, the incoming class
    // is canonical, and its first range starts after our last range ends.
    if (_canonical) {
        bool staysCanonical = cc._canonical;
        if (staysCanonical && RangeCount() > 0 && cc.RangeCount() > 0) {
            staysCanonical = cc.GetRangeAt(0)._first > GetRangeAt(RangeCount() - 1)._last;
        }
        _canonical = staysCanonical;
    }

    // Append the incoming ranges one by one (the count is re-read on every pass).
    int index = 0;
    while (index < cc.RangeCount()) {
        _rangelist.Add(cc.GetRangeAt(index));
        index++;
    }

    _categories.Append(cc._categories.ToString());
}
/*
* AddSet()
*
* Adds a set (specified by its string representation) to the class.
*/
private void AddSet(String set) {
    int length = set.Length;

    // If the first incoming character does not lie beyond our current last
    // range, the list can no longer be assumed to be sorted and disjoint.
    if (_canonical && length > 0 && RangeCount() > 0) {
        char previousLast = GetRangeAt(RangeCount() - 1)._last;
        if (set[0] <= previousLast)
            _canonical = false;
    }

    // The string is a sequence of (inclusive start, exclusive end) pairs.
    int pos = 0;
    while (pos + 1 < length) {
        char lower = set[pos];
        char upperExclusive = set[pos + 1];
        _rangelist.Add(new SingleRange(lower, (char)(upperExclusive - 1)));
        pos += 2;
    }

    // A trailing unpaired start character opens a range that runs to Lastchar.
    if (pos < length)
        _rangelist.Add(new SingleRange(set[pos], Lastchar));
}
// Attaches the class whose members are to be subtracted from this one.
// Only a single subtraction may be attached (asserted in debug builds).
internal void AddSubtraction(RegexCharClass sub) {
    bool noSubtractionYet = (_subtractor == null);
    Debug.Assert(noSubtractionYet, "Can't add two subtractions to a char class. ");

    _subtractor = sub;
}
/*
* AddRange()
*
* Adds a single range of characters to the class.
*/
internal void AddRange(char first, char last) {
    // The canonical check must look at the range that was last in the list
    // *before* the new one is appended. Checking after the append compares the
    // new range with itself, which cleared _canonical for every well-formed
    // range and forced a re-sort on each ToStringClass() call.
    //
    // A range that overlaps or directly abuts the previous one (first <= last + 1)
    // still needs Canonicalize() so the two get merged; anything strictly
    // beyond that keeps the list sorted and disjoint.
    int count = _rangelist.Count;
    if (_canonical && count > 0 &&
        first <= _rangelist[count - 1]._last + 1) {
        _canonical = false;
    }

    _rangelist.Add(new SingleRange(first, last));
}
// Adds the category or named property called categoryName.
//   categoryName    - name to look up, first among the defined categories, then in the property table
//   invert          - true to add the complement
//   caseInsensitive - true to widen Ll/Lu/Lt to the combined ignore-case group
//   pattern         - forwarded to SetFromProperty together with the name
internal void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern) {
    String definition;
    _definedCategories.TryGetValue(categoryName, out definition);

    // Unknown names, and the internal ignore-case pseudo category, are resolved
    // through the property table instead.
    if (definition == null || categoryName.Equals(InternalRegexIgnoreCase)) {
        AddSet(SetFromProperty(categoryName, invert, pattern));
        return;
    }

    // when RegexOptions.IgnoreCase is specified then {Ll}, {Lu}, and {Lt} cases should all match
    if (caseInsensitive &&
        (categoryName.Equals("Ll") || categoryName.Equals("Lu") || categoryName.Equals("Lt"))) {
        definition = _definedCategories[InternalRegexIgnoreCase];
    }

    if (invert)
        definition = NegateCategory(definition);

    _categories.Append(definition);
}
// Appends an already-encoded category string to the category portion of the class.
private void AddCategory(string category) {
    _categories.Append(category);
}
/*
* AddLowercase()
*
* Adds to the class any lowercase versions of characters already
* in the class. Used for case-insensitivity.
*/
internal void AddLowercase(CultureInfo culture) {
    // Ranges are about to be rewritten and appended, so ordering is no longer guaranteed.
    _canonical = false;

    // Only visit the ranges present on entry; AddLowercaseRange may append more.
    int originalCount = _rangelist.Count;
    for (int index = 0; index < originalCount; index++) {
        SingleRange current = _rangelist[index];

        if (current._first != current._last) {
            AddLowercaseRange(current._first, current._last, culture);
            continue;
        }

        // Single character: replace it in place with its lowercase form.
        char lowered = Char.ToLower(current._first, culture);
        current._last = lowered;
        current._first = lowered;
    }
}
/*
* AddLowercaseRange()
*
* For a single range that's in the set, adds any additional ranges
* necessary to ensure that lowercase equivalents are also included.
*/
private void AddLowercaseRange(char chMin, char chMax, CultureInfo culture) {
// chMin/chMax - inclusive bounds of the range whose lowercase equivalents are wanted.
// culture     - not referenced in this body; the mapping comes from _lcTable.
int i, iMax, iMid;
char chMinT, chMaxT;
LowerCaseMapping lc;
// Binary search for the first table entry whose _chMax is not below chMin.
for (i = 0, iMax = _lcTable.Length; i < iMax; ) {
iMid = (i + iMax) / 2;
if (_lcTable[iMid]._chMax < chMin)
i = iMid + 1;
else
iMax = iMid;
}
// Every table entry ends before chMin: nothing to add.
if (i >= _lcTable.Length)
return;
// Walk the consecutive table entries that start at or before chMax.
for ( ; i < _lcTable.Length && (lc = _lcTable[i])._chMin <= chMax; i++) {
// Clamp the entry to its intersection with [chMin, chMax].
if ((chMinT = lc._chMin) < chMin)
chMinT = chMin;
if ((chMaxT = lc._chMax) > chMax)
chMaxT = chMax;
// Transform the clamped bounds according to the entry's operation.
switch (lc._lcOp) {
case LowercaseSet:
// Both bounds become the single character stored in _data.
chMinT = (char)lc._data;
chMaxT = (char)lc._data;
break;
case LowercaseAdd:
// Both bounds are shifted by _data (char arithmetic).
chMinT += (char)lc._data;
chMaxT += (char)lc._data;
break;
case LowercaseBor:
// Bit 0 is set on both bounds.
chMinT |= (char)1;
chMaxT |= (char)1;
break;
case LowercaseBad:
// Each bound is increased by its own low bit (odd values move up by one).
chMinT += (char)(chMinT & 1);
chMaxT += (char)(chMaxT & 1);
break;
}
// Only add the mapped range if it reaches outside the original range.
if (chMinT < chMin || chMaxT > chMax)
AddRange(chMinT, chMaxT);
}
}
// Adds the word class or, when negate is true, its complement.
// ECMA mode uses the predefined ECMA range sets; otherwise the word categories are used.
internal void AddWord(bool ecma, bool negate) {
    if (ecma) {
        AddSet(negate ? RegexCharClass.NotECMAWordSet : RegexCharClass.ECMAWordSet);
    }
    else {
        AddCategory(negate ? RegexCharClass.NotWord : RegexCharClass.Word);
    }
}
// Adds the whitespace class or, when negate is true, its complement.
// ECMA mode uses the predefined ECMA range sets; otherwise the space category is used.
internal void AddSpace(bool ecma, bool negate) {
    if (ecma) {
        AddSet(negate ? RegexCharClass.NotECMASpaceSet : RegexCharClass.ECMASpaceSet);
    }
    else {
        AddCategory(negate ? RegexCharClass.NotSpace : RegexCharClass.Space);
    }
}
// Adds the digit class or, when negate is true, its complement.
// Outside ECMA mode this is the "Nd" category, looked up by name
// (pattern is passed along to that lookup).
internal void AddDigit(bool ecma, bool negate, string pattern) {
    if (!ecma) {
        AddCategoryFromName("Nd", negate, false, pattern);
        return;
    }

    AddSet(negate ? RegexCharClass.NotECMADigitSet : RegexCharClass.ECMADigitSet);
}
// Combines a separate set string and category string into the single-string
// class format: flags char, set length, category length, set, categories.
// A set that begins with two '\0' characters is emitted with the flag char
// set to 1 and those two leading characters removed.
internal static string ConvertOldStringsToClass(string set, string category) {
    bool flagged = set.Length >= 2 && set[0] == '\0' && set[1] == '\0';
    string ranges = flagged ? set.Substring(2) : set;

    StringBuilder result = new StringBuilder(set.Length + category.Length + 3);
    result.Append(flagged ? (char) 0x1 : (char) 0x0);
    result.Append((char) ranges.Length);
    result.Append((char) category.Length);
    result.Append(ranges);
    result.Append(category);
    return result.ToString();
}
/*
* SingletonChar()
*
* Returns the char
*/
internal static char SingletonChar(String set) {
    Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");

    // The one character is the first character of the set portion.
    char only = set[SETSTART];
    return only;
}
// True when the encoded class is neither negated nor has a subtraction appended.
internal static bool IsMergeable(string charClass) {
    if (IsNegated(charClass))
        return false;
    return !IsSubtraction(charClass);
}
// True when the encoded class has no categories, a zero flags char,
// no ranges and no subtraction.
internal static bool IsEmpty(String charClass) {
    return charClass[CATEGORYLENGTH] == 0
        && charClass[FLAGS] == 0
        && charClass[SETLENGTH] == 0
        && !IsSubtraction(charClass);
}
/*
* IsSingleton()
*
* True if the set contains a single character only
*/
internal static bool IsSingleton(String set) {
    // Shape check: flags char 0, no categories, exactly two set chars, no subtraction.
    bool plainPair = set[FLAGS] == 0 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 && !IsSubtraction(set);
    if (!plainPair)
        return false;

    // The pair is (start, exclusive end); one character means end == start + 1.
    char low = set[SETSTART];
    return low == Lastchar || low + 1 == set[SETSTART + 1];
}
// True if the set is the negation of a single character: same shape test as
// IsSingleton, but with the flags char equal to 1.
internal static bool IsSingletonInverse(String set) {
    bool negatedPair = set[FLAGS] == 1 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 && !IsSubtraction(set);
    if (!negatedPair)
        return false;

    char low = set[SETSTART];
    return low == Lastchar || low + 1 == set[SETSTART + 1];
}
// A class string that is longer than its header, set and categories together
// has a subtracted class appended to it.
private static bool IsSubtraction(string charClass) {
    int totalLength = charClass.Length;
    int declaredLength = SETSTART + charClass[SETLENGTH] + charClass[CATEGORYLENGTH];
    return totalLength > declaredLength;
}
// True when set is non-null and its flags char equals 1.
internal static bool IsNegated(string set) {
    if (set == null)
        return false;
    return set[FLAGS] == 1;
}
internal static bool IsECMAWordChar(char ch) {
    // ECMA-262 specifies that \s, \S, ., ^ and $ follow the Unicode notions of
    // whitespace and newline, whereas \d, \D, \w, \W, \b and \B are ASCII-only.
    // Consequently ECMA word boundaries give no special treatment to
    // ZERO WIDTH NON-JOINER (U+200C) or ZERO WIDTH JOINER (U+200D).
    bool isWord = CharInClass(ch, ECMAWordClass);
    return isWord;
}
internal static bool IsWordChar(char ch) {
    // UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/),
    // RL 1.4 Simple Word Boundaries: <word_character> covers all Alphabetic values
    // from the Unicode character database (UnicodeData.txt [UData]) plus
    // U+200C ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
    if (CharInClass(ch, WordClass))
        return true;
    if (ch == ZeroWidthJoiner)
        return true;
    return ch == ZeroWidthNonJoiner;
}
// Tests whether ch belongs to the encoded class, evaluating from offset 0.
internal static bool CharInClass(char ch, String set) {
    const int classStart = 0;
    return CharInClassRecursive(ch, set, classStart);
}
// Tests membership of ch in the class encoded at offset start of set,
// taking any appended (subtracted) class into account.
internal static bool CharInClassRecursive(char ch, String set, int start) {
    int setLength = set[start + SETLENGTH];
    int categoryLength = set[start + CATEGORYLENGTH];
    int nextClass = start + SETSTART + setLength + categoryLength;

    // Anything after this class's own data is a subtracted class; evaluate it first.
    bool removedBySubtraction = set.Length > nextClass && CharInClassRecursive(ch, set, nextClass);

    bool member = CharInClassInternal(ch, set, start, setLength, categoryLength);

    // Negation is applied before the subtraction: it belongs to this class
    // alone, not to the result of the whole subtraction.
    if (set[start + FLAGS] == 1)
        member = !member;

    return member && !removedBySubtraction;
}
/*
* CharInClassInternal()
*
* Determines a character's membership in a character class (via the
* string representation of the class).
*/
private static bool CharInClassInternal(char ch, string set, int start, int mySetLength, int myCategoryLength) {
    // Binary search the set portion for the insertion point of ch,
    // i.e. the first boundary character that is greater than ch.
    int lo = start + SETSTART;
    int hi = lo + mySetLength;
    while (lo != hi) {
        int probe = (lo + hi) / 2;
        if (ch >= set[probe])
            lo = probe + 1;
        else
            hi = probe;
    }

    // Whether an odd or an even insertion point means "inside a range" depends
    // on where the set starts. The set begins at start+SETSTART and SETSTART is
    // odd, so it is enough to compare the parity of the insertion point with
    // the parity of start. If SETSTART ever becomes even this test must flip.
    Debug.Assert((SETSTART & 0x1) == 1, "If SETSTART is not odd, the calculation below this will be reversed");

    bool insideRange = (lo & 0x1) == (start & 0x1);
    if (insideRange)
        return true;

    // Not in any range: fall back to the categories, if there are any.
    if (myCategoryLength == 0)
        return false;

    return CharInCategory(ch, set, start, mySetLength, myCategoryLength);
}
// Tests ch against the category portion of the class encoded at offset start.
// Returns true as soon as any category entry accepts the character.
private static bool CharInCategory(char ch, string set, int start, int mySetLength, int myCategoryLength) {
    UnicodeCategory actual = char.GetUnicodeCategory(ch);
    int pos = start + SETSTART + mySetLength;
    int stop = pos + myCategoryLength;

    for (; pos < stop; pos++) {
        int code = (short) set[pos];

        if (code == 0) {
            // zero marks a group of categories, which is evaluated as a unit;
            // the helper leaves pos on the group's closing marker
            if (CharInCategoryGroup(ch, actual, set, ref pos))
                return true;
        }
        else if (code > 0) {
            // positive entry: the character must be in the category
            if (code == SpaceConst) {
                if (Char.IsWhiteSpace(ch))
                    return true;
            }
            else if (actual == (UnicodeCategory) (code - 1)) {
                return true;
            }
        }
        else {
            // negative entry: the character must NOT be in the category
            if (code == NotSpaceConst) {
                if (!Char.IsWhiteSpace(ch))
                    return true;
            }
            else if (actual != (UnicodeCategory) (-1 - code)) {
                return true;
            }
        }
    }

    return false;
}
/*
* CharInCategoryGroup
* This is used for categories which are composed of other categories - L, N, Z, W...
* These groups need special treatment when they are negated
*/
// On entry i indexes the zero that opens the group; on return it indexes the
// zero that closes it. The sign of the first entry decides how the group is read:
//   positive - the character must be in ANY of the listed categories
//   otherwise - the character must be in NONE of the listed categories
private static bool CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i) {
    i++;
    int code = (short) category[i];

    bool positiveGroup = code > 0;
    bool hit = false;

    // Always scan to the closing marker so i ends up in the right place,
    // but stop comparing once a category has matched.
    while (code != 0) {
        if (!hit) {
            int decoded = positiveGroup ? code - 1 : -1 - code;
            if (chcategory == (UnicodeCategory) decoded)
                hit = true;
        }
        i++;
        code = (short) category[i];
    }

    return positiveGroup ? hit : !hit;
}
// Returns a copy of the category string in which every character, read as a
// signed 16-bit value, has been arithmetically negated. Null yields null.
private static string NegateCategory(string category) {
    if (category == null)
        return null;

    StringBuilder negated = new StringBuilder(category.Length);
    foreach (char code in category) {
        short signedCode = (short) code;
        negated.Append((char) (-signedCode));
    }
    return negated.ToString();
}
// Rebuilds a RegexCharClass from its string encoding, reading from offset 0.
internal static RegexCharClass Parse(string charClass) {
    const int classStart = 0;
    return ParseRecursive(charClass, classStart);
}
// Decodes the class that begins at offset start of charClass, then
// recursively decodes any subtracted class that follows it.
private static RegexCharClass ParseRecursive(string charClass, int start) {
    int setLength = charClass[start + SETLENGTH];
    int categoryLength = charClass[start + CATEGORYLENGTH];
    int setBegin = start + SETSTART;
    int setEnd = setBegin + setLength;
    int classEnd = setEnd + categoryLength;

    // The set is a list of (inclusive start, exclusive end) pairs; a trailing
    // unpaired start denotes a range that extends to Lastchar.
    List<SingleRange> ranges = new List<SingleRange>(setLength);
    for (int pos = setBegin; pos < setEnd; pos += 2) {
        char first = charClass[pos];
        char last = (pos + 1 < setEnd) ? (char) (charClass[pos + 1] - 1) : Lastchar;
        ranges.Add(new SingleRange(first, last));
    }

    // Anything beyond this class's own data is the subtracted class.
    RegexCharClass sub = null;
    if (charClass.Length > classEnd)
        sub = ParseRecursive(charClass, classEnd);

    bool negated = charClass[start + FLAGS] == 1;
    StringBuilder categories = new StringBuilder(charClass.Substring(setEnd, categoryLength));
    return new RegexCharClass(negated, ranges, categories, sub);
}
/*
* RangeCount()
*
* The number of single ranges that have been accumulated so far.
*/
private int RangeCount() {
    // simply the current size of the range list
    int count = _rangelist.Count;
    return count;
}
/*
* ToStringClass()
*
* Constructs the string representation of the class.
*/
internal String ToStringClass() {
    if (!_canonical)
        Canonicalize();

    // Initial estimate of the set length: two chars per range. The real value
    // is patched in below, because a range that ends at Lastchar is written
    // with only its start char.
    int estimatedSetLength = _rangelist.Count * 2;
    StringBuilder result = new StringBuilder(estimatedSetLength + _categories.Length + 3);

    // Header: flags, set length (provisional), category length.
    result.Append((char) (_negate ? 1 : 0));
    result.Append((char) estimatedSetLength);
    result.Append((char) _categories.Length);

    // Each range is written as its first char followed by the char after its last.
    foreach (SingleRange range in _rangelist) {
        result.Append(range._first);
        if (range._last != Lastchar)
            result.Append((char) (range._last + 1));
    }

    // Now that the set has been written, record its true length.
    result[SETLENGTH] = (char) (result.Length - SETSTART);

    result.Append(_categories);

    // A subtracted class is serialised recursively and appended at the end.
    if (_subtractor != null)
        result.Append(_subtractor.ToStringClass());

    return result.ToString();
}
/*
* GetRangeAt(int i)
*
* The ith range.
*/
private SingleRange GetRangeAt(int i) {
    // direct index into the range list
    SingleRange range = _rangelist[i];
    return range;
}
/*
* Canonicalize()
*
* Logic to reduce a character class to a unique, sorted form.
*/
private void Canonicalize() {
// Sorts the range list by first character and merges overlapping or abutting
// ranges in place, then marks the class as canonical.
SingleRange CurrentRange;
// i is the read index (next range to examine); j is the write index (merged range being built).
int i;
int j;
// Running end of the merged range currently being built at position j.
char last;
// Set once the input is exhausted or a merged range has reached Lastchar.
bool Done;
_canonical = true;
// Order ranges by their first character (see SingleRangeComparer).
_rangelist.Sort(0, _rangelist.Count, new SingleRangeComparer());
//
// Find and eliminate overlapping or abutting ranges
//
if (_rangelist.Count > 1) {
Done = false;
for (i = 1, j = 0; ; i++) {
// Absorb following ranges into the one at j for as long as they overlap or abut it.
for (last = _rangelist[j]._last; ; i++) {
// Out of input, or the merged range already extends to Lastchar: finish up.
if (i == _rangelist.Count || last == Lastchar) {
Done = true;
break;
}
// Gap before the next range: the range at j is complete.
if ((CurrentRange = _rangelist[i])._first > last + 1)
break;
// Overlapping or abutting: extend the merged range if this one reaches further.
if (last < CurrentRange._last)
last = CurrentRange._last;
}
// Store the merged end in the range object at j and advance the write index.
_rangelist[j]._last = last;
j++;
if (Done)
break;
// Move the next unmerged range down to the write position.
if (j < i)
_rangelist[j] = _rangelist[i];
}
// Drop the entries past the last merged range.
_rangelist.RemoveRange(j, _rangelist.Count - j);
}
}
// Looks capname up in _propTable (binary search on column 0, ordinal
// comparison) and returns the set string stored in column 1.
//   invert  - when true, the set is returned with a leading Nullchar toggled
//             (removed if present, prepended if absent)
//   pattern - only used to build the exception message
// Throws ArgumentException when the name is not in the table.
private static String SetFromProperty(String capname, bool invert, string pattern) {
    int lo = 0;
    int hi = _propTable.GetLength(0);

    while (lo != hi) {
        int probe = (lo + hi) / 2;
        int order = String.Compare(capname, _propTable[probe,0], StringComparison.Ordinal);

        if (order > 0) {
            lo = probe + 1;
            continue;
        }
        if (order < 0) {
            hi = probe;
            continue;
        }

        // Exact match.
        String set = _propTable[probe,1];
        Debug.Assert(!String.IsNullOrEmpty(set), "Found a null/empty element in RegexCharClass prop table");

        if (!invert)
            return set;

        return set[0] == Nullchar ? set.Substring(1) : Nullchar + set;
    }

    throw new ArgumentException(SR.GetString(SR.MakeException, pattern, SR.GetString(SR.UnknownProperty, capname)));
}
#if DBG
/*
* SetDescription()
*
* Produces a human-readable description for a set string.
*/
// Produces a human-readable, bracketed description of an encoded class:
// '^' for negation, the ranges, the categories, and "-[...]" for a subtraction.
//   set - the string encoding of a character class
// Returns the description text.
internal static String SetDescription(String set) {
    int mySetLength = set[SETLENGTH];
    int myCategoryLength = set[CATEGORYLENGTH];
    int setEnd = SETSTART + mySetLength;
    int myEndPosition = setEnd + myCategoryLength;
    StringBuilder desc = new StringBuilder("[");
    int index = SETSTART;
    char ch1;
    char ch2;

    if (IsNegated(set))
        desc.Append('^');

    // Ranges: (inclusive start, exclusive end) pairs.
    while (index < setEnd) {
        ch1 = set[index];
        // The end of a range must come from within the set portion. Bounding by
        // the whole string length would read a category (or subtraction) char as
        // the range end when the last range is open-ended (odd set length).
        if (index + 1 < setEnd)
            ch2 = (char)(set[index + 1] - 1);
        else
            ch2 = Lastchar;
        desc.Append(CharDescription(ch1));
        if (ch2 != ch1) {
            if (ch1 + 1 != ch2)
                desc.Append('-');
            desc.Append(CharDescription(ch2));
        }
        index += 2;
    }

    // With an odd set length the loop above steps one past the set; realign so
    // the first category entry is not skipped.
    index = setEnd;

    // Categories: single entries, or groups delimited by GroupChar.
    while (index < myEndPosition) {
        ch1 = set[index];
        if (ch1 == 0) {
            bool found = false;
            int lastindex = set.IndexOf(GroupChar, index+1);
            string group = set.Substring(index,lastindex-index + 1);
            // Try to name the group by matching it against the defined categories.
            IDictionaryEnumerator en = _definedCategories.GetEnumerator();
            while(en.MoveNext()) {
                if (group.Equals(en.Value)) {
                    if ((short) set[index+1] > 0)
                        desc.Append("\\p{" + en.Key + "}");
                    else
                        desc.Append("\\P{" + en.Key + "}");
                    found = true;
                    break;
                }
            }
            if (!found) {
                if (group.Equals(Word))
                    desc.Append("\\w");
                else if (group.Equals(NotWord))
                    desc.Append("\\W");
                else
                    Debug.Assert(false, "Couldn't find a goup to match '" + group + "'");
            }
            // Continue after the group's closing marker.
            index = lastindex;
        }
        else {
            desc.Append(CategoryDescription(ch1));
        }
        index++;
    }

    // Whatever follows this class's data is a subtracted class.
    if (set.Length > myEndPosition) {
        desc.Append('-');
        desc.Append(SetDescription(set.Substring(myEndPosition)));
    }
    desc.Append(']');
    return desc.ToString();
}
// Lowercase hexadecimal digits, indexed by nibble value (0-15).
internal static readonly char[] Hex = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
};
// Two-letter names of the Unicode general categories, indexed by the numeric
// value of the matching UnicodeCategory member (CategoryDescription looks
// names up that way). The table must therefore hold exactly one entry per
// enum member, in enum order. The ignore-case pseudo category is a group,
// not a UnicodeCategory value, so it has no slot here; listing it after "Lo"
// shifted every later name by one.
internal static readonly string[] Categories = new string[] {"Lu", "Ll", "Lt", "Lm", "Lo",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
    "Cn" };
/*
* CharDescription()
*
* Produces a human-readable description for a single character.
*/
internal static String CharDescription(char ch) {
    // A backslash is shown escaped.
    if (ch == '\\')
        return "\\\\";

    StringBuilder text = new StringBuilder();

    // Characters from ' ' through '~' are shown as themselves.
    if (ch >= ' ' && ch <= '~') {
        text.Append(ch);
        return text.ToString();
    }

    // Everything else becomes \xHH (below 256) or \uHHHH.
    int digitCount;
    if (ch < 256) {
        text.Append("\\x");
        digitCount = 2;
    }
    else {
        text.Append("\\u");
        digitCount = 4;
    }

    // Emit the hex digits from the most significant nibble down.
    for (int shift = (digitCount - 1) * 4; shift >= 0; shift -= 4) {
        text.Append(Hex[(ch >> shift) & 0xF]);
    }

    return text.ToString();
}
// Describes a single category entry: \s or \S for the whitespace markers,
// otherwise \p{..} for a positive entry and \P{..} for a negative one, with
// the name taken from the Categories table.
private static String CategoryDescription(char ch) {
    if (ch == SpaceConst)
        return "\\s";

    short code = (short) ch;
    if (code == NotSpaceConst)
        return "\\S";

    bool negated = code < 0;
    int tableIndex = negated ? (-code - 1) : (ch - 1);
    string prefix = negated ? "\\P{" : "\\p{";
    return prefix + Categories[tableIndex] + "}";
}
#endif
// Lower case mapping descriptor.
private struct LowerCaseMapping {
    // Inclusive bounds of the character range this entry applies to.
    internal char _chMin;
    internal char _chMax;

    // Operation selector and its operand.
    internal int _lcOp;
    internal int _data;

    // Stores the four values unchanged in the corresponding fields.
    internal LowerCaseMapping(char chMin, char chMax, int lcOp, int data) {
        _data = data;
        _lcOp = lcOp;
        _chMax = chMax;
        _chMin = chMin;
    }
}
/*
* SingleRangeComparer
*
* For sorting ranges; compare based on the first char in the range.
*/
private sealed class SingleRangeComparer : IComparer<SingleRange> {
    // Orders two ranges by their first character only: -1, 0 or 1.
    public int Compare(SingleRange x, SingleRange y) {
        char left = x._first;
        char right = y._first;

        if (left < right)
            return -1;
        if (left > right)
            return 1;
        return 0;
    }
}
/*
* SingleRange
*
* A first/last pair representing a single range of characters.
*/
private sealed class SingleRange {
    // First and last character of the range.
    internal char _first;
    internal char _last;

    internal SingleRange(char first, char last) {
        this._first = first;
        this._last = last;
    }
}
}
}