//------------------------------------------------------------------------------
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//
//------------------------------------------------------------------------------

// This RegexParser class is internal to the Regex package.
// It builds a tree of RegexNodes from a regular expression

// Implementation notes:
//
// It would be nice to get rid of the comment modes, since the
// ScanBlank() calls are just kind of duct-taped in.

namespace System.Text.RegularExpressions {

    using System.Collections;
    using System.Collections.Generic;
    using System.Globalization;

    internal sealed class RegexParser {
        // Parse-tree construction state.
        internal RegexNode _stack;
        internal RegexNode _group;
        internal RegexNode _alternation;
        internal RegexNode _concatenation;
        internal RegexNode _unit;

        // Pattern text and the current scan position within it.
        internal String _pattern;
        internal int _currentPos;
        internal CultureInfo _culture;

        // Capture bookkeeping filled in by CountCaptures()/NoteCaptures().
        internal int _autocap;
        internal int _capcount;
        internal int _captop;
        internal int _capsize;
#if SILVERLIGHT
        internal Dictionary<Int32, Int32> _caps;
        internal Dictionary<String, Int32> _capnames;
#else
        internal Hashtable _caps;
        internal Hashtable _capnames;
#endif
        internal Int32[] _capnumlist;
        internal List<String> _capnamelist;

        internal RegexOptions _options;
        internal List<RegexOptions> _optionsStack;

        internal bool _ignoreNextParen = false;

        // Overflow guards used while accumulating decimal numbers digit by digit.
        internal const int MaxValueDiv10 = Int32.MaxValue / 10;
        internal const int MaxValueMod10 = Int32.MaxValue % 10;

        /*
         * This static call constructs a RegexTree from a regular expression
         * pattern string and an option string.
         *
         * The method creates, drives, and drops a parser instance: it runs the
         * CountCaptures() prescan, resets the parser, then runs ScanRegex().
         */
        internal static RegexTree Parse(String re, RegexOptions op) {
            RegexParser p;
            RegexNode root;
            String[] capnamelist;

            p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture);

            p._options = op;

            p.SetPattern(re);
            p.CountCaptures();
            p.Reset(op);
            root = p.ScanRegex();

            if (p._capnamelist == null)
                capnamelist = null;
            else
                capnamelist = p._capnamelist.ToArray();

            return new RegexTree(root, p._caps, p._capnumlist, p._captop, p._capnames, capnamelist, op);
        }

        /*
         * This static call constructs a flat concatenation node given
         * a replacement pattern. The capture tables are supplied by the
         * caller (see NoteCaptures) instead of being prescanned.
         */
#if SILVERLIGHT
        internal static RegexReplacement ParseReplacement(String rep, Dictionary<Int32, Int32> caps, int capsize, Dictionary<String, Int32> capnames, RegexOptions op) {
#else
        internal static RegexReplacement ParseReplacement(String rep, Hashtable caps, int capsize, Hashtable capnames, RegexOptions op) {
#endif
            RegexParser p;
            RegexNode root;

            p = new RegexParser((op & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture);

            p._options = op;

            p.NoteCaptures(caps, capsize, capnames);
            p.SetPattern(rep);
            root = p.ScanReplacement();

            return new RegexReplacement(rep, root, caps);
        }

        /*
         * Escapes all metacharacters (including |,(,),[,{,|,^,$,*,+,?,\, spaces and #)
         *
         * Every character for which IsMetachar() is true is prefixed with a
         * backslash; \n, \r, \t and \f are written as their letter escapes.
         * Returns the input instance itself when nothing needs escaping.
         */
        internal static String Escape(String input) {
            for (int i = 0; i < input.Length; i++) {
                if (IsMetachar(input[i])) {
                    StringBuilder sb = new StringBuilder();
                    char ch = input[i];
                    int lastpos;

                    // copy the untouched prefix, then alternate between one
                    // escaped metachar and the run of ordinary chars after it
                    sb.Append(input, 0, i);
                    do {
                        sb.Append('\\');
                        switch (ch) {
                            case '\n':
                                ch = 'n';
                                break;
                            case '\r':
                                ch = 'r';
                                break;
                            case '\t':
                                ch = 't';
                                break;
                            case '\f':
                                ch = 'f';
                                break;
                        }
                        sb.Append(ch);
                        i++;
                        lastpos = i;

                        while (i < input.Length) {
                            ch = input[i];
                            if (IsMetachar(ch))
                                break;

                            i++;
                        }

                        sb.Append(input, lastpos, i - lastpos);

                    } while (i < input.Length);

                    return sb.ToString();
                }
            }

            return input;
        }

        /*
         * Unescapes all backslash escapes: each '\' sequence is replaced by
         * the character that ScanCharEscape() decodes for it.
         *
         * Returns the input instance itself when it contains no backslash.
         * A backslash that is the very last character is dropped (nothing is
         * appended for it).
         */
        internal static String Unescape(String input) {
            for (int i = 0; i < input.Length; i++) {
                if (input[i] == '\\') {
                    StringBuilder sb = new StringBuilder();
                    RegexParser p = new RegexParser(CultureInfo.InvariantCulture);
                    int lastpos;
                    p.SetPattern(input);

                    sb.Append(input, 0, i);
                    do {
                        i++;
                        // position the helper parser just after the backslash
                        p.Textto(i);
                        if (i < input.Length)
                            sb.Append(p.ScanCharEscape());
                        i = p.Textpos();
                        lastpos = i;
                        while (i < input.Length && input[i] != '\\')
                            i++;
                        sb.Append(input, lastpos, i - lastpos);

                    } while (i < input.Length);

                    return sb.ToString();
                }
            }

            return input;
        }

        /*
         * Private constructor.
         */
        private RegexParser(CultureInfo culture) {
            _culture = culture;
            _optionsStack = new List<RegexOptions>();
#if SILVERLIGHT
            _caps = new Dictionary<Int32, Int32>();
#else
            _caps = new Hashtable();
#endif
        }

        /*
         * Drops a string into the pattern buffer (null is treated as the
         * empty string) and rewinds the scan position.
         */
        internal void SetPattern(String Re) {
            if (Re == null)
                Re = String.Empty;
            _pattern = Re;
            _currentPos = 0;
        }

        /*
         * Resets parsing to the beginning of the pattern.
         */
        internal void Reset(RegexOptions topopts) {
            _currentPos = 0;
            _autocap = 1;
            _ignoreNextParen = false;

            // Drop every saved option set. The previous
            // RemoveRange(0, Count - 1) call left the last entry behind
            // whenever the CountCaptures() prescan ended with unmatched
            // '(' pushes still on the stack.
            _optionsStack.Clear();

            _options = topopts;
            _stack = null;
        }

        /*
         * The main parsing function. Scans the whole pattern and returns the
         * root node; throws (via MakeException) on malformed input.
         */
        internal RegexNode ScanRegex() {
            char ch = '@'; // nonspecial ch, means at beginning
            bool isQuantifier = false;

            StartGroup(new RegexNode(RegexNode.Capture, _options, 0, -1));

            while (CharsRight() > 0) {
                bool wasPrevQuantifier = isQuantifier;
                isQuantifier = false;

                ScanBlank();

                int startpos = Textpos();

                // move past all of the normal characters.  We'll stop when we hit some kind of control character,
                // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
                if (UseOptionX())
                    while (CharsRight() > 0 && (!IsStopperX(ch = RightChar()) || ch == '{' && !IsTrueQuantifier()))
                        MoveRight();
                else
                    while (CharsRight() > 0 && (!IsSpecial(ch = RightChar()) || ch == '{' && !IsTrueQuantifier()))
                        MoveRight();

                int endpos = Textpos();

                ScanBlank();

                if (CharsRight() == 0)
                    ch = '!'; // nonspecial, means at end
                else if (IsSpecial(ch = RightChar())) {
                    isQuantifier = IsQuantifier(ch);
                    MoveRight();
                }
                else
                    ch = ' '; // nonspecial, means at ordinary char

                if (startpos < endpos) {
                    // a quantifier binds only to the last literal char, so
                    // that char is split off as its own unit
                    int cchUnquantified = endpos - startpos - (isQuantifier ? 1 : 0);

                    wasPrevQuantifier = false;

                    if (cchUnquantified > 0)
                        AddConcatenate(startpos, cchUnquantified, false);

                    if (isQuantifier)
                        AddUnitOne(CharAt(endpos - 1));
                }

                switch (ch) {
                    case '!':
                        goto BreakOuterScan;

                    case ' ':
                        goto ContinueOuterScan;

                    case '[':
                        AddUnitSet(ScanCharClass(UseOptionI()).ToStringClass());
                        break;

                    case '(': {
                            RegexNode grouper;

                            PushOptions();

                            if (null == (grouper = ScanGroupOpen())) {
                                PopKeepOptions();
                            }
                            else {
                                PushGroup();
                                StartGroup(grouper);
                            }
                        }
                        continue;

                    case '|':
                        AddAlternate();
                        goto ContinueOuterScan;

                    case ')':
                        if (EmptyStack())
                            throw MakeException(SR.GetString(SR.TooManyParens));

                        AddGroup();
                        PopGroup();
                        PopOptions();

                        if (Unit() == null)
                            goto ContinueOuterScan;
                        break;

                    case '\\':
                        AddUnitNode(ScanBackslash());
                        break;

                    case '^':
                        AddUnitType(UseOptionM() ? RegexNode.Bol : RegexNode.Beginning);
                        break;

                    case '$':
                        AddUnitType(UseOptionM() ? RegexNode.Eol : RegexNode.EndZ);
                        break;

                    case '.':
                        if (UseOptionS())
                            AddUnitSet(RegexCharClass.AnyClass);
                        else
                            AddUnitNotone('\n');
                        break;

                    case '{':
                    case '*':
                    case '+':
                    case '?':
                        if (Unit() == null)
                            throw MakeException(wasPrevQuantifier ?
                                                SR.GetString(SR.NestedQuantify, ch.ToString()) :
                                                SR.GetString(SR.QuantifyAfterNothing));
                        MoveLeft();
                        break;

                    default:
                        throw MakeException(SR.GetString(SR.InternalError));
                }

                ScanBlank();

                if (CharsRight() == 0 || !(isQuantifier = IsTrueQuantifier())) {
                    AddConcatenate();
                    goto ContinueOuterScan;
                }

                ch = MoveRightGetChar();

                // Handle quantifiers
                while (Unit() != null) {
                    int min;
                    int max;
                    bool lazy;

                    switch (ch) {
                        case '*':
                            min = 0;
                            max = Int32.MaxValue;
                            break;

                        case '?':
                            min = 0;
                            max = 1;
                            break;

                        case '+':
                            min = 1;
                            max = Int32.MaxValue;
                            break;

                        case '{': {
                                startpos = Textpos();
                                max = min = ScanDecimal();
                                if (startpos < Textpos()) {
                                    if (CharsRight() > 0 && RightChar() == ',') {
                                        MoveRight();
                                        if (CharsRight() == 0 || RightChar() == '}')
                                            max = Int32.MaxValue;
                                        else
                                            max = ScanDecimal();
                                    }
                                }

                                // not a well-formed {n}, {n,} or {n,m}: rewind to
                                // the '{' and let it be scanned as a literal
                                if (startpos == Textpos() || CharsRight() == 0 || MoveRightGetChar() != '}') {
                                    AddConcatenate();
                                    Textto(startpos - 1);
                                    goto ContinueOuterScan;
                                }
                            }

                            break;

                        default:
                            throw MakeException(SR.GetString(SR.InternalError));
                    }

                    ScanBlank();

                    if (CharsRight() == 0 || RightChar() != '?')
                        lazy = false;
                    else {
                        MoveRight();
                        lazy = true;
                    }

                    if (min > max)
                        throw MakeException(SR.GetString(SR.IllegalRange));

                    AddConcatenate(lazy, min, max);
                }

                ContinueOuterScan:
                ;
            }

            BreakOuterScan:
            ;

            if (!EmptyStack())
                throw MakeException(SR.GetString(SR.NotEnoughParens));

            AddGroup();

            return Unit();
        }

        /*
         * Simple parsing for replacement patterns: literal runs are
         * concatenated as-is and every '$' is handed to ScanDollar().
         */
        internal RegexNode ScanReplacement() {
            int c;
            int startpos;

            _concatenation = new RegexNode(RegexNode.Concatenate, _options);

            for (;;) {
                c = CharsRight();
                if (c == 0)
                    break;

                startpos = Textpos();

                while (c > 0 && RightChar() != '$') {
                    MoveRight();
                    c--;
                }

                AddConcatenate(startpos, Textpos() - startpos, true);

                if (c > 0) {
                    if (MoveRightGetChar() == '$')
                        AddUnitNode(ScanDollar());
                    AddConcatenate();
                }
            }

            return _concatenation;
        }

        /*
         * Scans contents of [] (not including []'s), and converts to a
         * RegexCharClass.
*/ internal RegexCharClass ScanCharClass(bool caseInsensitive) { return ScanCharClass(caseInsensitive, false); } /* * Scans contents of [] (not including []'s), and converts to a * RegexCharClass. */ internal RegexCharClass ScanCharClass(bool caseInsensitive, bool scanOnly) { char ch = '\0'; char chPrev = '\0'; bool inRange = false; bool firstChar = true; bool closed = false; RegexCharClass cc; cc = scanOnly ? null : new RegexCharClass(); if (CharsRight() > 0 && RightChar() == '^') { MoveRight(); if (!scanOnly) cc.Negate = true; } for ( ; CharsRight() > 0; firstChar = false) { bool fTranslatedChar = false; ch = MoveRightGetChar(); if (ch == ']') { if (!firstChar) { closed = true; break; } } else if (ch == '\\' && CharsRight() > 0) { switch (ch = MoveRightGetChar()) { case 'D': case 'd': if (!scanOnly) { if (inRange) throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString())); cc.AddDigit(UseOptionE(), ch == 'D', _pattern); } continue; case 'S': case 's': if (!scanOnly) { if (inRange) throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString())); cc.AddSpace(UseOptionE(), ch == 'S'); } continue; case 'W': case 'w': if (!scanOnly) { if (inRange) throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString())); cc.AddWord(UseOptionE(), ch == 'W'); } continue; case 'p': case 'P': if (!scanOnly) { if (inRange) throw MakeException(SR.GetString(SR.BadClassInCharRange, ch.ToString())); cc.AddCategoryFromName(ParseProperty(), (ch != 'p'), caseInsensitive, _pattern); } else ParseProperty(); continue; case '-': if (!scanOnly) cc.AddRange(ch, ch); continue; default: MoveLeft(); ch = ScanCharEscape(); // non-literal character fTranslatedChar = true; break; // this break will only break out of the switch } } else if (ch == '[') { // This is code for Posix style properties - [:Ll:] or [:IsTibetan:]. // It currently doesn't do anything other than skip the whole thing! 
if (CharsRight() > 0 && RightChar() == ':' && !inRange) { // String name; int savePos = Textpos(); MoveRight(); /* name = */ ScanCapname(); if (CharsRight() < 2 || MoveRightGetChar() != ':' || MoveRightGetChar() != ']') Textto(savePos); // else lookup name (nyi) } } if (inRange) { inRange = false; if (!scanOnly) { if (ch == '[' && !fTranslatedChar && !firstChar) { // We thought we were in a range, but we're actually starting a subtraction. // In that case, we'll add chPrev to our char class, skip the opening [, and // scan the new character class recursively. cc.AddChar(chPrev); cc.AddSubtraction(ScanCharClass(caseInsensitive, false)); if (CharsRight() > 0 && RightChar() != ']') throw MakeException(SR.GetString(SR.SubtractionMustBeLast)); } else { // a regular range, like a-z if (chPrev > ch) throw MakeException(SR.GetString(SR.ReversedCharRange)); cc.AddRange(chPrev, ch); } } } else if (CharsRight() >= 2 && RightChar() == '-' && RightChar(1) != ']') { // this could be the start of a range chPrev = ch; inRange = true; MoveRight(); } else if (CharsRight() >= 1 && ch == '-' && !fTranslatedChar && RightChar() == '[' && !firstChar) { // we aren't in a range, and now there is a subtraction. Usually this happens // only when a subtraction follows a range, like [a-z-[b]] if (!scanOnly) { MoveRight(1); cc.AddSubtraction(ScanCharClass(caseInsensitive, false)); if (CharsRight() > 0 && RightChar() != ']') throw MakeException(SR.GetString(SR.SubtractionMustBeLast)); } else { MoveRight(1); ScanCharClass(caseInsensitive, true); } } else { if (!scanOnly) cc.AddRange(ch, ch); } } if (!closed) throw MakeException(SR.GetString(SR.UnterminatedBracket)); if (!scanOnly && caseInsensitive) cc.AddLowercase(_culture); return cc; } /* * Scans chars following a '(' (not counting the '('), and returns * a RegexNode for the type of group scanned, or null if the group * simply changed options (?cimsx-cimsx) or was a comment (#...). 
*/ internal RegexNode ScanGroupOpen() { char ch = '\0'; int NodeType; char close = '>'; // just return a RegexNode if we have: // 1. "(" followed by nothing // 2. "(x" where x != ? // 3. "(?)" if (CharsRight() == 0 || RightChar() != '?' || (RightChar() == '?' && (CharsRight() > 1 && RightChar(1) == ')'))) { if (UseOptionN() || _ignoreNextParen) { _ignoreNextParen = false; return new RegexNode(RegexNode.Group, _options); } else return new RegexNode(RegexNode.Capture, _options, _autocap++, -1); } MoveRight(); for (;;) { if (CharsRight() == 0) break; switch (ch = MoveRightGetChar()) { case ':': NodeType = RegexNode.Group; break; case '=': _options &= ~(RegexOptions.RightToLeft); NodeType = RegexNode.Require; break; case '!': _options &= ~(RegexOptions.RightToLeft); NodeType = RegexNode.Prevent; break; case '>': NodeType = RegexNode.Greedy; break; case '\'': close = '\''; goto case '<'; // fallthrough case '<': if (CharsRight() == 0) goto BreakRecognize; switch (ch = MoveRightGetChar()) { case '=': if (close == '\'') goto BreakRecognize; _options |= RegexOptions.RightToLeft; NodeType = RegexNode.Require; break; case '!': if (close == '\'') goto BreakRecognize; _options |= RegexOptions.RightToLeft; NodeType = RegexNode.Prevent; break; default: MoveLeft(); int capnum = -1; int uncapnum = -1; bool proceed = false; // grab part before - if (ch >= '0' && ch <= '9') { capnum = ScanDecimal(); if (!IsCaptureSlot(capnum)) capnum = -1; // check if we have bogus characters after the number if (CharsRight() > 0 && !(RightChar() == close || RightChar() == '-')) throw MakeException(SR.GetString(SR.InvalidGroupName)); if (capnum == 0) throw MakeException(SR.GetString(SR.CapnumNotZero)); } else if (RegexCharClass.IsWordChar(ch)) { String capname = ScanCapname(); if (IsCaptureName(capname)) capnum = CaptureSlotFromName(capname); // check if we have bogus character after the name if (CharsRight() > 0 && !(RightChar() == close || RightChar() == '-')) throw 
MakeException(SR.GetString(SR.InvalidGroupName)); } else if (ch == '-') { proceed = true; } else { // bad group name - starts with something other than a word character and isn't a number throw MakeException(SR.GetString(SR.InvalidGroupName)); } // grab part after - if any if ((capnum != -1 || proceed == true) && CharsRight() > 0 && RightChar() == '-') { MoveRight(); ch = RightChar(); if (ch >= '0' && ch <= '9') { uncapnum = ScanDecimal(); if (!IsCaptureSlot(uncapnum)) throw MakeException(SR.GetString(SR.UndefinedBackref, uncapnum)); // check if we have bogus characters after the number if (CharsRight() > 0 && RightChar() != close) throw MakeException(SR.GetString(SR.InvalidGroupName)); } else if (RegexCharClass.IsWordChar(ch)) { String uncapname = ScanCapname(); if (IsCaptureName(uncapname)) uncapnum = CaptureSlotFromName(uncapname); else throw MakeException(SR.GetString(SR.UndefinedNameRef, uncapname)); // check if we have bogus character after the name if (CharsRight() > 0 && RightChar() != close) throw MakeException(SR.GetString(SR.InvalidGroupName)); } else { // bad group name - starts with something other than a word character and isn't a number throw MakeException(SR.GetString(SR.InvalidGroupName)); } } // actually make the node if ((capnum != -1 || uncapnum != -1) && CharsRight() > 0 && MoveRightGetChar() == close) { return new RegexNode(RegexNode.Capture, _options, capnum, uncapnum); } goto BreakRecognize; } break; case '(': // alternation construct (?(...) 
| ) int parenPos = Textpos(); if (CharsRight() > 0) { ch = RightChar(); // check if the alternation condition is a backref if (ch >= '0' && ch <= '9') { int capnum = ScanDecimal(); if (CharsRight() > 0 && MoveRightGetChar() == ')') { if (IsCaptureSlot(capnum)) return new RegexNode(RegexNode.Testref, _options, capnum); else throw MakeException(SR.GetString(SR.UndefinedReference, capnum.ToString(CultureInfo.CurrentCulture))); } else throw MakeException(SR.GetString(SR.MalformedReference, capnum.ToString(CultureInfo.CurrentCulture))); } else if (RegexCharClass.IsWordChar(ch)) { String capname = ScanCapname(); if (IsCaptureName(capname) && CharsRight() > 0 && MoveRightGetChar() == ')') return new RegexNode(RegexNode.Testref, _options, CaptureSlotFromName(capname)); } } // not a backref NodeType = RegexNode.Testgroup; Textto(parenPos - 1); // jump to the start of the parentheses _ignoreNextParen = true; // but make sure we don't try to capture the insides int charsRight = CharsRight(); if (charsRight >= 3 && RightChar(1) == '?') { char rightchar2 = RightChar(2); // disallow comments in the condition if (rightchar2 == '#') throw MakeException(SR.GetString(SR.AlternationCantHaveComment)); // disallow named capture group (?<..>..) in the condition if (rightchar2 == '\'' ) throw MakeException(SR.GetString(SR.AlternationCantCapture)); else { if (charsRight >=4 && (rightchar2 == '<' && RightChar(3) != '!' && RightChar(3) != '=')) throw MakeException(SR.GetString(SR.AlternationCantCapture)); } } break; default: MoveLeft(); NodeType = RegexNode.Group; ScanOptions(); if (CharsRight() == 0) goto BreakRecognize; if ((ch = MoveRightGetChar()) == ')') return null; if (ch != ':') goto BreakRecognize; break; } return new RegexNode(NodeType, _options); } BreakRecognize: ; // break Recognize comes here throw MakeException(SR.GetString(SR.UnrecognizedGrouping)); } /* * Scans whitespace or x-mode comments. 
*/ internal void ScanBlank() { if (UseOptionX()) { for (;;) { while (CharsRight() > 0 && IsSpace(RightChar())) MoveRight(); if (CharsRight() == 0) break; if (RightChar() == '#') { while (CharsRight() > 0 && RightChar() != '\n') MoveRight(); } else if (CharsRight() >= 3 && RightChar(2) == '#' && RightChar(1) == '?' && RightChar() == '(') { while (CharsRight() > 0 && RightChar() != ')') MoveRight(); if (CharsRight() == 0) throw MakeException(SR.GetString(SR.UnterminatedComment)); MoveRight(); } else break; } } else { for (;;) { if (CharsRight() < 3 || RightChar(2) != '#' || RightChar(1) != '?' || RightChar() != '(') return; while (CharsRight() > 0 && RightChar() != ')') MoveRight(); if (CharsRight() == 0) throw MakeException(SR.GetString(SR.UnterminatedComment)); MoveRight(); } } } /* * Scans chars following a '\' (not counting the '\'), and returns * a RegexNode for the type of atom scanned. */ internal RegexNode ScanBackslash() { char ch; RegexCharClass cc; if (CharsRight() == 0) throw MakeException(SR.GetString(SR.IllegalEndEscape)); switch (ch = RightChar()) { case 'b': case 'B': case 'A': case 'G': case 'Z': case 'z': MoveRight(); return new RegexNode(TypeFromCode(ch), _options); case 'w': MoveRight(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMAWordClass); return new RegexNode(RegexNode.Set, _options, RegexCharClass.WordClass); case 'W': MoveRight(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMAWordClass); return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotWordClass); case 's': MoveRight(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMASpaceClass); return new RegexNode(RegexNode.Set, _options, RegexCharClass.SpaceClass); case 'S': MoveRight(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMASpaceClass); return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotSpaceClass); case 'd': 
MoveRight(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.ECMADigitClass); return new RegexNode(RegexNode.Set, _options, RegexCharClass.DigitClass); case 'D': MoveRight(); if (UseOptionE()) return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotECMADigitClass); return new RegexNode(RegexNode.Set, _options, RegexCharClass.NotDigitClass); case 'p': case 'P': MoveRight(); cc = new RegexCharClass(); cc.AddCategoryFromName(ParseProperty(), (ch != 'p'), UseOptionI(), _pattern); if (UseOptionI()) cc.AddLowercase(_culture); return new RegexNode(RegexNode.Set, _options, cc.ToStringClass()); default: return ScanBasicBackslash(); } } /* * Scans \-style backreferences and character escapes */ internal RegexNode ScanBasicBackslash() { if (CharsRight() == 0) throw MakeException(SR.GetString(SR.IllegalEndEscape)); char ch; bool angled = false; char close = '\0'; int backpos; backpos = Textpos(); ch = RightChar(); // allow \k instead of \, which is now deprecated if (ch == 'k') { if (CharsRight() >= 2) { MoveRight(); ch = MoveRightGetChar(); if (ch == '<' || ch == '\'') { angled = true; close = (ch == '\'') ? '\'' : '>'; } } if (!angled || CharsRight() <= 0) throw MakeException(SR.GetString(SR.MalformedNameRef)); ch = RightChar(); } // Note angle without \g < else if ((ch == '<' || ch == '\'') && CharsRight() > 1) { angled = true; close = (ch == '\'') ? 
'\'' : '>'; MoveRight(); ch = RightChar(); } // Try to parse backreference: \<1> or \ if (angled && ch >= '0' && ch <= '9') { int capnum = ScanDecimal(); if (CharsRight() > 0 && MoveRightGetChar() == close) { if (IsCaptureSlot(capnum)) return new RegexNode(RegexNode.Ref, _options, capnum); else throw MakeException(SR.GetString(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture))); } } // Try to parse backreference or octal: \1 else if (!angled && ch >= '1' && ch <= '9') { if (UseOptionE()) { int capnum = -1; int newcapnum = (int)(ch - '0'); int pos = Textpos() - 1; while (newcapnum <= _captop) { if (IsCaptureSlot(newcapnum) && (_caps == null || (int)_caps[newcapnum] < pos)) capnum = newcapnum; MoveRight(); if (CharsRight() == 0 || (ch = RightChar()) < '0' || ch > '9') break; newcapnum = newcapnum * 10 + (int)(ch - '0'); } if (capnum >= 0) return new RegexNode(RegexNode.Ref, _options, capnum); } else { int capnum = ScanDecimal(); if (IsCaptureSlot(capnum)) return new RegexNode(RegexNode.Ref, _options, capnum); else if (capnum <= 9) throw MakeException(SR.GetString(SR.UndefinedBackref, capnum.ToString(CultureInfo.CurrentCulture))); } } else if (angled && RegexCharClass.IsWordChar(ch)) { String capname = ScanCapname(); if (CharsRight() > 0 && MoveRightGetChar() == close) { if (IsCaptureName(capname)) return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname)); else throw MakeException(SR.GetString(SR.UndefinedNameRef, capname)); } } // Not backreference: must be char code Textto(backpos); ch = ScanCharEscape(); if (UseOptionI()) ch = Char.ToLower(ch, _culture); return new RegexNode(RegexNode.One, _options, ch); } /* * Scans $ patterns recognized within replacment patterns */ internal RegexNode ScanDollar() { if (CharsRight() == 0) return new RegexNode(RegexNode.One, _options, '$'); char ch = RightChar(); bool angled; int backpos = Textpos(); int lastEndPos = backpos; // Note angle if (ch == '{' && CharsRight() > 1) { angled = true; 
MoveRight(); ch = RightChar(); } else { angled = false; } // Try to parse backreference: \1 or \{1} or \{cap} if (ch >= '0' && ch <= '9') { if (!angled && UseOptionE()) { int capnum = -1; int newcapnum = (int)(ch - '0'); MoveRight(); if (IsCaptureSlot(newcapnum)) { capnum = newcapnum; lastEndPos = Textpos(); } while (CharsRight() > 0 && (ch = RightChar()) >= '0' && ch <= '9') { int digit = (int)(ch - '0'); if (newcapnum > (MaxValueDiv10) || (newcapnum == (MaxValueDiv10) && digit > (MaxValueMod10))) throw MakeException(SR.GetString(SR.CaptureGroupOutOfRange)); newcapnum = newcapnum * 10 + digit; MoveRight(); if (IsCaptureSlot(newcapnum)) { capnum = newcapnum; lastEndPos = Textpos(); } } Textto(lastEndPos); if (capnum >= 0) return new RegexNode(RegexNode.Ref, _options, capnum); } else { int capnum = ScanDecimal(); if (!angled || CharsRight() > 0 && MoveRightGetChar() == '}') { if (IsCaptureSlot(capnum)) return new RegexNode(RegexNode.Ref, _options, capnum); } } } else if (angled && RegexCharClass.IsWordChar(ch)) { String capname = ScanCapname(); if (CharsRight() > 0 && MoveRightGetChar() == '}') { if (IsCaptureName(capname)) return new RegexNode(RegexNode.Ref, _options, CaptureSlotFromName(capname)); } } else if (!angled) { int capnum = 1; switch (ch) { case '$': MoveRight(); return new RegexNode(RegexNode.One, _options, '$'); case '&': capnum = 0; break; case '`': capnum = RegexReplacement.LeftPortion; break; case '\'': capnum = RegexReplacement.RightPortion; break; case '+': capnum = RegexReplacement.LastGroup; break; case '_': capnum = RegexReplacement.WholeString; break; } if (capnum != 1) { MoveRight(); return new RegexNode(RegexNode.Ref, _options, capnum); } } // unrecognized $: literalize Textto(backpos); return new RegexNode(RegexNode.One, _options, '$'); } /* * Scans a capture name: consumes word chars */ internal String ScanCapname() { int startpos = Textpos(); while (CharsRight() > 0) { if (!RegexCharClass.IsWordChar(MoveRightGetChar())) { MoveLeft(); 
break; } } return _pattern.Substring(startpos, Textpos() - startpos); } /* * Scans up to three octal digits (stops before exceeding 0377). */ internal char ScanOctal() { int d; int i; int c; // Consume octal chars only up to 3 digits and value 0377 c = 3; if (c > CharsRight()) c = CharsRight(); for (i = 0; c > 0 && (uint)(d = RightChar() - '0') <= 7; c -= 1) { MoveRight(); i *= 8; i += d; if (UseOptionE() && i >= 0x20) break; } // Octal codes only go up to 255. Any larger and the behavior that Perl follows // is simply to truncate the high bits. i &= 0xFF; return(char)i; } /* * Scans any number of decimal digits (pegs value at 2^31-1 if too large) */ internal int ScanDecimal() { int i = 0; int d; while (CharsRight() > 0 && (uint)(d = (char)(RightChar() - '0')) <= 9) { MoveRight(); if (i > (MaxValueDiv10) || (i == (MaxValueDiv10) && d > (MaxValueMod10))) throw MakeException(SR.GetString(SR.CaptureGroupOutOfRange)); i *= 10; i += d; } return i; } /* * Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) */ internal char ScanHex(int c) { int i; int d; i = 0; if (CharsRight() >= c) { for (; c > 0 && ((d = HexDigit(MoveRightGetChar())) >= 0); c -= 1) { i *= 0x10; i += d; } } if (c > 0) throw MakeException(SR.GetString(SR.TooFewHex)); return(char)i; } /* * Returns n <= 0xF for a hex digit. 
*/ internal static int HexDigit(char ch) { int d; if ((uint)(d = ch - '0') <= 9) return d; if ((uint)(d = ch - 'a') <= 5) return d + 0xa; if ((uint)(d = ch - 'A') <= 5) return d + 0xa; return -1; } /* * Grabs and converts an ascii control character */ internal char ScanControl() { char ch; if (CharsRight() <= 0) throw MakeException(SR.GetString(SR.MissingControl)); ch = MoveRightGetChar(); // \ca interpreted as \cA if (ch >= 'a' && ch <= 'z') ch = (char)(ch - ('a' - 'A')); if ((ch = (char)(ch - '@')) < ' ') return ch; throw MakeException(SR.GetString(SR.UnrecognizedControl)); } /* * Returns true for options allowed only at the top level */ internal bool IsOnlyTopOption(RegexOptions option) { return(option == RegexOptions.RightToLeft #if !(SILVERLIGHT||FULL_AOT_RUNTIME) || option == RegexOptions.Compiled #endif || option == RegexOptions.CultureInvariant || option == RegexOptions.ECMAScript ); } /* * Scans cimsx-cimsx option string, stops at the first unrecognized char. */ internal void ScanOptions() { char ch; bool off; RegexOptions option; for (off = false; CharsRight() > 0; MoveRight()) { ch = RightChar(); if (ch == '-') { off = true; } else if (ch == '+') { off = false; } else { option = OptionFromCode(ch); if (option == 0 || IsOnlyTopOption(option)) return; if (off) _options &= ~option; else _options |= option; } } } /* * Scans \ code for escape codes that map to single unicode chars. 
*/ internal char ScanCharEscape() { char ch; ch = MoveRightGetChar(); if (ch >= '0' && ch <= '7') { MoveLeft(); return ScanOctal(); } switch (ch) { case 'x': return ScanHex(2); case 'u': return ScanHex(4); case 'a': return '\u0007'; case 'b': return '\b'; case 'e': return '\u001B'; case 'f': return '\f'; case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; case 'v': return '\u000B'; case 'c': return ScanControl(); default: if (!UseOptionE() && RegexCharClass.IsWordChar(ch)) throw MakeException(SR.GetString(SR.UnrecognizedEscape, ch.ToString())); return ch; } } /* * Scans X for \p{X} or \P{X} */ internal String ParseProperty() { if (CharsRight() < 3) { throw MakeException(SR.GetString(SR.IncompleteSlashP)); } char ch = MoveRightGetChar(); if (ch != '{') { throw MakeException(SR.GetString(SR.MalformedSlashP)); } int startpos = Textpos(); while (CharsRight() > 0) { ch = MoveRightGetChar(); if (!(RegexCharClass.IsWordChar(ch) || ch == '-')) { MoveLeft(); break; } } String capname = _pattern.Substring(startpos, Textpos() - startpos); if (CharsRight() == 0 || MoveRightGetChar() != '}') throw MakeException(SR.GetString(SR.IncompleteSlashP)); return capname; } /* * Returns ReNode type for zero-length assertions with a \ code. */ internal int TypeFromCode(char ch) { switch (ch) { case 'b': return UseOptionE() ? RegexNode.ECMABoundary : RegexNode.Boundary; case 'B': return UseOptionE() ? RegexNode.NonECMABoundary : RegexNode.Nonboundary; case 'A': return RegexNode.Beginning; case 'G': return RegexNode.Start; case 'Z': return RegexNode.EndZ; case 'z': return RegexNode.End; default: return RegexNode.Nothing; } } /* * Returns option bit from single-char (?cimsx) code. 
*/
        internal static RegexOptions OptionFromCode(char ch) {
            // fold 'A'..'Z' onto 'a'..'z' so the lookup is case-insensitive
            if (ch >= 'A' && ch <= 'Z')
                ch = (char)(ch + ('a' - 'A'));

            switch (ch) {
#if !(SILVERLIGHT||FULL_AOT_RUNTIME)
                case 'c':
                    return RegexOptions.Compiled;
#endif
                case 'i':
                    return RegexOptions.IgnoreCase;
                case 'r':
                    return RegexOptions.RightToLeft;
                case 'm':
                    return RegexOptions.Multiline;
                case 'n':
                    return RegexOptions.ExplicitCapture;
                case 's':
                    return RegexOptions.Singleline;
                case 'x':
                    return RegexOptions.IgnorePatternWhitespace;
#if DBG
                case 'd':
                    return RegexOptions.Debug;
#endif
                case 'e':
                    return RegexOptions.ECMAScript;
                default:
                    // not an option letter
                    return RegexOptions.None;
            }
        }

        /*
         * A prescanner that walks the pattern once, doing only a partial
         * tokenization, to work out which capture slots and capture names
         * the pattern uses. Finishes by calling AssignNameSlots().
         */
        internal void CountCaptures() {
            // slot 0 is always noted, at pattern position 0
            NoteCaptureSlot(0, 0);
            _autocap = 1;

            while (CharsRight() > 0) {
                int startPos = Textpos();
                char current = MoveRightGetChar();

                if (current == '\\') {
                    // skip the escaped character, if there is one
                    if (CharsRight() > 0)
                        MoveRight();
                }
                else if (current == '#') {
                    // under the X option, back up onto the '#' and hand it to ScanBlank
                    if (UseOptionX()) {
                        MoveLeft();
                        ScanBlank();
                    }
                }
                else if (current == '[') {
                    ScanCharClass(false, true);
                }
                else if (current == ')') {
                    if (!EmptyOptionsStack())
                        PopOptions();
                }
                else if (current == '(') {
                    _ignoreNextParen = ScanOpenParenForCaptures(startPos);
                }
            }

            AssignNameSlots();
        }

        /*
         * Handles one '(' for CountCaptures. The parsing position is just
         * after the '('; pos is the position of the '(' itself.
         *
         * Returns the value CountCaptures must store in _ignoreNextParen:
         * true only when the paren opens an alternation construct
         * (?(foo)yes|no), false in every other case.
         */
        private bool ScanOpenParenForCaptures(int pos) {
            if (CharsRight() >= 2 && RightChar(1) == '#' && RightChar() == '?') {
                // (?# : back up onto the '(' and hand it to ScanBlank
                MoveLeft();
                ScanBlank();
                return false;
            }

            PushOptions();

            if (CharsRight() <= 0 || RightChar() != '?') {
                // plain '(' : takes the next automatic slot unless the N option
                // is on or this paren was flagged to be ignored
                if (!UseOptionN() && !_ignoreNextParen) {
                    int slot = _autocap;
                    _autocap = slot + 1;
                    NoteCaptureSlot(slot, pos);
                }
                return false;
            }

            // we have (?...
            MoveRight();

            if (CharsRight() > 1 && (RightChar() == '<' || RightChar() == '\'')) {
                // named group: (?<... or (?'...
                MoveRight();
                char first = RightChar();

                if (first != '0' && RegexCharClass.IsWordChar(first)) {
                    //if (_ignoreNextParen)
                    //    throw MakeException(SR.GetString(SR.AlternationCantCapture));
                    if (first >= '1' && first <= '9')
                        NoteCaptureSlot(ScanDecimal(), pos);
                    else
                        NoteCaptureName(ScanCapname(), pos);
                }
                return false;
            }

            // (?...
            // get the options if it's an option construct (?cimsx-cimsx...)
            ScanOptions();

            if (CharsRight() > 0) {
                char following = RightChar();

                if (following == ')') {
                    // (?cimsx-cimsx)
                    MoveRight();
                    PopKeepOptions();
                }
                else if (following == '(') {
                    // alternation construct: (?(foo)yes|no)
                    // ignore the next paren so we don't capture the condition
                    return true;
                }
            }

            return false;
        }

        /*
         * Notes a used capture slot. A slot that was already noted is left
         * untouched; a new one bumps _capcount and, if needed, raises _captop
         * to one past the slot number (clamped at Int32.MaxValue).
         */
        internal void NoteCaptureSlot(int i, int pos) {
            if (_caps.ContainsKey(i))
                return;

            // the rhs of the hashtable isn't used in the parser
            _caps.Add(i, pos);
            _capcount++;

            if (_captop <= i)
                _captop = (i == Int32.MaxValue) ? i : i + 1;
        }

        /*
         * Notes a used capture name, creating the name table and the name
         * list on first use. A name that was already noted is ignored.
         */
        internal void NoteCaptureName(String name, int pos) {
            if (_capnames == null) {
#if SILVERLIGHT
                _capnames = new Dictionary();
#else
                _capnames = new Hashtable();
#endif
                _capnamelist = new List();
            }

            if (_capnames.ContainsKey(name))
                return;

            _capnames.Add(name, pos);
            _capnamelist.Add(name);
        }

        /*
         * For when all the used captures are known: note them all at once
         */
#if SILVERLIGHT
        internal void NoteCaptures(Dictionary caps, int capsize, Dictionary capnames) {
#else
        internal void NoteCaptures(Hashtable caps, int capsize, Hashtable capnames) {
#endif
            _capsize = capsize;
            _caps = caps;
            _capnames = capnames;
        }

        /*
         * Assigns unused slot numbers to the capture names, builds the sorted
         * list of used slot numbers when the numbering has gaps, and then
         * rebuilds the name list so that every slot has a name.
         */
        internal void AssignNameSlots() {
            if (_capnames != null) {
                foreach (string name in _capnamelist) {
                    // advance to the first slot number that is still free
                    while (IsCaptureSlot(_autocap))
                        _autocap++;

                    // until now the table held the pattern position of the name
                    int pos = (int)_capnames[name];
                    _capnames[name] = _autocap;
                    NoteCaptureSlot(_autocap, pos);

                    _autocap++;
                }
            }

            // if the caps array has at least one gap, construct the list of used slots
            if (_capcount < _captop) {
                _capnumlist = new Int32[_capcount];

                int filled = 0;
                IDictionaryEnumerator slots = _caps.GetEnumerator();
                while (slots.MoveNext()) {
                    _capnumlist[filled] = (int)slots.Key;
                    filled++;
                }

                System.Array.Sort(_capnumlist, Comparer.Default);
            }

            // merge capsnumlist into capnamelist
            if (_capnames == null && _capnumlist == null)
                return;

            List declaredNames;
            int nextNamedSlot;
            int namedIndex = 0;

            if (_capnames == null) {
                declaredNames = null;
#if SILVERLIGHT
                _capnames = new Dictionary();
#else
                _capnames = new Hashtable();
#endif
                _capnamelist = new List();
                nextNamedSlot = -1;
            }
            else {
                declaredNames = _capnamelist;
                _capnamelist = new List();
                nextNamedSlot = (int)_capnames[declaredNames[0]];
            }

            for (int i = 0; i < _capcount; i++) {
                int slot = (_capnumlist == null) ? i : _capnumlist[i];

                if (slot == nextNamedSlot) {
                    // this slot belongs to the next declared name
                    _capnamelist.Add(declaredNames[namedIndex]);
                    namedIndex++;

                    if (namedIndex == declaredNames.Count)
                        nextNamedSlot = -1;
                    else
                        nextNamedSlot = (int)_capnames[declaredNames[namedIndex]];
                }
                else {
                    // unnamed slot: its name is its number
                    String numberName = Convert.ToString(slot, _culture);
                    _capnamelist.Add(numberName);
                    _capnames[numberName] = slot;
                }
            }
        }

        /*
         * Looks up the slot number for a given name
         */
        internal int CaptureSlotFromName(String capname) {
            return (int)_capnames[capname];
        }

        /*
         * True if the capture slot was noted. Without a slot table, any
         * slot in the range [0, _capsize) counts as noted.
         */
        internal bool IsCaptureSlot(int i) {
            if (_caps == null)
                return i >= 0 && i < _capsize;

            return _caps.ContainsKey(i);
        }

        /*
         * True if the capture name was noted
         */
        internal bool IsCaptureName(String capname) {
            return _capnames != null && _capnames.ContainsKey(capname);
        }

        /*
         * True if N option disabling '(' autocapture is on.
         */
        internal bool UseOptionN() {
            return (_options & RegexOptions.ExplicitCapture) != RegexOptions.None;
        }

        /*
         * True if I option enabling case-insensitivity is on.
         */
        internal bool UseOptionI() {
            return (_options & RegexOptions.IgnoreCase) != RegexOptions.None;
        }

        /*
         * True if M option altering meaning of $ and ^ is on.
         */
        internal bool UseOptionM() {
            return (_options & RegexOptions.Multiline) != RegexOptions.None;
        }

        /*
         * True if S option altering meaning of . is on.
         */
        internal bool UseOptionS() {
            return (_options & RegexOptions.Singleline) != RegexOptions.None;
        }

        /*
         * True if X option enabling whitespace/comment mode is on.
*/
        internal bool UseOptionX() {
            RegexOptions flag = _options & RegexOptions.IgnorePatternWhitespace;
            return flag != RegexOptions.None;
        }

        /*
         * True if E option enabling ECMAScript behavior is on.
         */
        internal bool UseOptionE() {
            RegexOptions flag = _options & RegexOptions.ECMAScript;
            return flag != RegexOptions.None;
        }

        // Character categories held in _category. The values are ordered so
        // that the predicates below can test a whole group with one ">=".
        internal const byte Q = 5;    // quantifier
        internal const byte S = 4;    // ordinary stopper
        internal const byte Z = 3;    // ScanBlank stopper
        internal const byte X = 2;    // whitespace
        internal const byte E = 1;    // should be escaped

        /*
         * For categorizing ascii characters: one entry per char code 0..127.
         */
        internal static readonly byte[] _category = new byte[] {
            // 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F
               0,0,0,0,0,0,0,0,0,X,X,0,X,X,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
            //   ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
               X,0,0,Z,S,0,0,0,S,S,Q,Q,0,0,S,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Q,
            // @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
               0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,S,S,0,S,0,
            // ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
               0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Q,S,0,0,0};

        /*
         * Returns true for those characters that terminate a string of
         * ordinary chars: every char up to '|' whose category is S or Q.
         */
        internal static bool IsSpecial(char ch) {
            // nothing above '|' has a stopper category
            if (ch > '|')
                return false;

            return _category[ch] >= S;
        }

        /*
         * Same test as IsSpecial but with the lower threshold X, so that
         * whitespace and '#' (category Z) also count as stoppers.
         */
        internal static bool IsStopperX(char ch) {
            if (ch > '|')
                return false;

            return _category[ch] >= X;
        }

        /*
         * Returns true for those characters that begin a quantifier.
*/ internal static bool IsQuantifier(char ch) { return(ch <= '{' && _category[ch] >= Q); } /* * Returns true if the text at the current parsing position really is a quantifier: either a quantifier character other than '{', or a complete brace form {n}, {n,} or {n,m} made of decimal digits. Returns false when nothing is left to scan, when '{' is not followed by a digit, or when the pattern ends before the closing '}'. Does not move the parsing position. */ internal bool IsTrueQuantifier() { int nChars = CharsRight(); if (nChars == 0) return false; int startpos = Textpos(); char ch = CharAt(startpos); if (ch != '{') return ch <= '{' && _category[ch] >= Q; int pos = startpos; while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ; if (nChars == 0 || pos - startpos == 1) return false; if (ch == '}') return true; if (ch != ',') return false; while (--nChars > 0 && (ch = CharAt(++pos)) >= '0' && ch <= '9') ; return nChars > 0 && ch == '}'; } /* * Returns true for whitespace: a char no greater than ' ' whose category is exactly X. */ internal static bool IsSpace(char ch) { return(ch <= ' ' && _category[ch] == X); } /* * Returns true for chars that should be escaped: any char up to '|' whose category is at least E. */ internal static bool IsMetachar(char ch) { return(ch <= '|' && _category[ch] >= E); } /* * Add a string to the last concatenate. Does nothing when cch is 0. NOTE(review): in this copy of the file the text after "for (int i=0; i" is missing up to "2 || _group.ChildCount() > 3)", so the rest of this method and the start of the code that follows it cannot be read here - confirm against the upstream source before changing anything in this region. */ internal void AddConcatenate(int pos, int cch, bool isReplacement) { RegexNode node; if (cch == 0) return; if (cch > 1) { String str = _pattern.Substring(pos, cch); if (UseOptionI() && !isReplacement) { // We do the ToLower character by character for consistency. With surrogate chars, doing // a ToLower on the entire string could actually change the surrogate pair. This is more correct // linguistically, but since Regex doesn't support surrogates, it's more important to be // consistent. StringBuilder sb = new StringBuilder(str.Length); for (int i=0; i 2 || _group.ChildCount() > 3) throw MakeException(SR.GetString(SR.TooManyAlternates)); } else { _alternation.AddChild(_concatenation.ReverseLeft()); _group.AddChild(_alternation); } _unit = _group; } /* * Saves the current options on the options stack. */ internal void PushOptions() { _optionsStack.Add(_options); } /* * Recalls options from the stack: the top entry becomes the current options and is removed from the stack. */ internal void PopOptions() { _options = _optionsStack[_optionsStack.Count - 1]; _optionsStack.RemoveAt(_optionsStack.Count - 1); } /* * True if options stack is empty.
*/
        internal bool EmptyOptionsStack() {
            int depth = _optionsStack.Count;
            return depth == 0;
        }

        /*
         * Pops the option stack, but keeps the current options unchanged.
         */
        internal void PopKeepOptions() {
            int top = _optionsStack.Count - 1;
            _optionsStack.RemoveAt(top);
        }

        /*
         * Builds (without throwing) an ArgumentException whose message is
         * what SR.GetString returns for SR.MakeException, the pattern and
         * the given message.
         */
        internal ArgumentException MakeException(String message) {
            String text = SR.GetString(SR.MakeException, _pattern, message);
            return new ArgumentException(text);
        }

        /*
         * Returns the current parsing position.
         */
        internal int Textpos() {
            return _currentPos;
        }

        /*
         * Zaps to a specific parsing position.
         */
        internal void Textto(int pos) {
            _currentPos = pos;
        }

        /*
         * Returns the char at the right of the current parsing position and advances to the right.
         */
        internal char MoveRightGetChar() {
            // advance first, then read, so the position has moved even if the read throws
            int at = _currentPos;
            _currentPos = at + 1;
            return _pattern[at];
        }

        /*
         * Moves the current position one to the right.
         */
        internal void MoveRight() {
            _currentPos++;
        }

        /*
         * Moves the current position i to the right.
         */
        internal void MoveRight(int i) {
            _currentPos = _currentPos + i;
        }

        /*
         * Moves the current parsing position one to the left.
         */
        internal void MoveLeft() {
            _currentPos--;
        }

        /*
         * Returns the char at index i of the pattern, whatever the
         * current parsing position is.
         */
        internal char CharAt(int i) {
            return _pattern[i];
        }

        /*
         * Returns the char right of the current parsing position.
         */
        internal char RightChar() {
            return _pattern[_currentPos];
        }

        /*
         * Returns the char i chars right of the current parsing position.
         */
        internal char RightChar(int i) {
            int index = _currentPos + i;
            return _pattern[index];
        }

        /*
         * Number of characters to the right of the current parsing position.
         */
        internal int CharsRight() {
            int length = _pattern.Length;
            return length - _currentPos;
        }
    }
}