//------------------------------------------------------------------------------ // // Copyright (c) Microsoft Corporation. All rights reserved. // //------------------------------------------------------------------------------ // This RegexFCD class is internal to the Regex package. // It builds a bunch of FC information (RegexFC) about // the regex for optimization purposes. // Implementation notes: // // This step is as simple as walking the tree and emitting // sequences of codes. namespace System.Text.RegularExpressions { using System.Collections; using System.Globalization; internal sealed class RegexFCD { private int[] _intStack; private int _intDepth; private RegexFC[] _fcStack; private int _fcDepth; private bool _skipAllChildren; // don't process any more children at the current level private bool _skipchild; // don't process the current child. private bool _failed = false; private const int BeforeChild = 64; private const int AfterChild = 128; // where the regex can be pegged internal const int Beginning = 0x0001; internal const int Bol = 0x0002; internal const int Start = 0x0004; internal const int Eol = 0x0008; internal const int EndZ = 0x0010; internal const int End = 0x0020; internal const int Boundary = 0x0040; internal const int ECMABoundary = 0x0080; /* * This is the one of the only two functions that should be called from outside. * It takes a RegexTree and computes the set of chars that can start it. */ internal static RegexPrefix FirstChars(RegexTree t) { RegexFCD s = new RegexFCD(); RegexFC fc = s.RegexFCFromRegexTree(t); if (fc == null || fc._nullable) return null; CultureInfo culture = ((t._options & RegexOptions.CultureInvariant) != 0) ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; return new RegexPrefix(fc.GetFirstChars(culture), fc.IsCaseInsensitive()); } /* * This is a related computation: it takes a RegexTree and computes the * leading substring if it see one. It's quite trivial and gives up easily. */ internal static RegexPrefix Prefix(RegexTree tree) { RegexNode curNode; RegexNode concatNode = null; int nextChild = 0; curNode = tree._root; for (;;) { switch (curNode._type) { case RegexNode.Concatenate: if (curNode.ChildCount() > 0) { concatNode = curNode; nextChild = 0; } break; case RegexNode.Greedy: case RegexNode.Capture: curNode = curNode.Child(0); concatNode = null; continue; case RegexNode.Oneloop: case RegexNode.Onelazy: if (curNode._m > 0) { string pref = String.Empty.PadRight(curNode._m, curNode._ch); return new RegexPrefix(pref, 0 != (curNode._options & RegexOptions.IgnoreCase)); } else return RegexPrefix.Empty; case RegexNode.One: return new RegexPrefix(curNode._ch.ToString(CultureInfo.InvariantCulture), 0 != (curNode._options & RegexOptions.IgnoreCase)); case RegexNode.Multi: return new RegexPrefix(curNode._str, 0 != (curNode._options & RegexOptions.IgnoreCase)); case RegexNode.Bol: case RegexNode.Eol: case RegexNode.Boundary: case RegexNode.ECMABoundary: case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: case RegexNode.End: case RegexNode.Empty: case RegexNode.Require: case RegexNode.Prevent: break; default: return RegexPrefix.Empty; } if (concatNode == null || nextChild >= concatNode.ChildCount()) return RegexPrefix.Empty; curNode = concatNode.Child(nextChild++); } } /* * Yet another related computation: it takes a RegexTree and computes the * leading anchors that it encounters. */ internal static int Anchors(RegexTree tree) { RegexNode curNode; RegexNode concatNode = null; int nextChild = 0; int result = 0; curNode = tree._root; for (;;) { switch (curNode._type) { case RegexNode.Concatenate: if (curNode.ChildCount() > 0) { concatNode = curNode; nextChild = 0; } break; case RegexNode.Greedy: case RegexNode.Capture: curNode = curNode.Child(0); concatNode = null; continue; case RegexNode.Bol: case RegexNode.Eol: case RegexNode.Boundary: case RegexNode.ECMABoundary: case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: case RegexNode.End: return result | AnchorFromType(curNode._type); case RegexNode.Empty: case RegexNode.Require: case RegexNode.Prevent: break; default: return result; } if (concatNode == null || nextChild >= concatNode.ChildCount()) return result; curNode = concatNode.Child(nextChild++); } } /* * Convert anchor type to anchor bit. */ private static int AnchorFromType(int type) { switch (type) { case RegexNode.Bol: return Bol; case RegexNode.Eol: return Eol; case RegexNode.Boundary: return Boundary; case RegexNode.ECMABoundary: return ECMABoundary; case RegexNode.Beginning: return Beginning; case RegexNode.Start: return Start; case RegexNode.EndZ: return EndZ; case RegexNode.End: return End; default: return 0; } } #if DBG internal static String AnchorDescription(int anchors) { StringBuilder sb = new StringBuilder(); if (0 != (anchors & Beginning)) sb.Append(", Beginning"); if (0 != (anchors & Start)) sb.Append(", Start"); if (0 != (anchors & Bol)) sb.Append(", Bol"); if (0 != (anchors & Boundary)) sb.Append(", Boundary"); if (0 != (anchors & ECMABoundary)) sb.Append(", ECMABoundary"); if (0 != (anchors & Eol)) sb.Append(", Eol"); if (0 != (anchors & End)) sb.Append(", End"); if (0 != (anchors & EndZ)) sb.Append(", EndZ"); if (sb.Length >= 2) return(sb.ToString(2, sb.Length - 2)); return "None"; } #endif /* * private constructor; can't be created outside */ private RegexFCD() { _fcStack = new RegexFC[32]; _intStack = new int[32]; } /* * To avoid recursion, we use a simple integer stack. * This is the push. */ private void PushInt(int I) { if (_intDepth >= _intStack.Length) { int [] expanded = new int[_intDepth * 2]; System.Array.Copy(_intStack, 0, expanded, 0, _intDepth); _intStack = expanded; } _intStack[_intDepth++] = I; } /* * True if the stack is empty. */ private bool IntIsEmpty() { return _intDepth == 0; } /* * This is the pop. */ private int PopInt() { return _intStack[--_intDepth]; } /* * We also use a stack of RegexFC objects. * This is the push. */ private void PushFC(RegexFC fc) { if (_fcDepth >= _fcStack.Length) { RegexFC[] expanded = new RegexFC[_fcDepth * 2]; System.Array.Copy(_fcStack, 0, expanded, 0, _fcDepth); _fcStack = expanded; } _fcStack[_fcDepth++] = fc; } /* * True if the stack is empty. */ private bool FCIsEmpty() { return _fcDepth == 0; } /* * This is the pop. */ private RegexFC PopFC() { return _fcStack[--_fcDepth]; } /* * This is the top. */ private RegexFC TopFC() { return _fcStack[_fcDepth - 1]; } /* * The main FC computation. It does a shortcutted depth-first walk * through the tree and calls CalculateFC to emits code before * and after each child of an interior node, and at each leaf. */ private RegexFC RegexFCFromRegexTree(RegexTree tree) { RegexNode curNode; int curChild; curNode = tree._root; curChild = 0; for (;;) { if (curNode._children == null) { // This is a leaf node CalculateFC(curNode._type, curNode, 0); } else if (curChild < curNode._children.Count && !_skipAllChildren) { // This is an interior node, and we have more children to analyze CalculateFC(curNode._type | BeforeChild, curNode, curChild); if (!_skipchild) { curNode = (RegexNode)curNode._children[curChild]; // this stack is how we get a depth first walk of the tree. PushInt(curChild); curChild = 0; } else { curChild++; _skipchild = false; } continue; } // This is an interior node where we've finished analyzing all the children, or // the end of a leaf node. _skipAllChildren = false; if (IntIsEmpty()) break; curChild = PopInt(); curNode = curNode._next; CalculateFC(curNode._type | AfterChild, curNode, curChild); if (_failed) return null; curChild++; } if (FCIsEmpty()) return null; return PopFC(); } /* * Called in Beforechild to prevent further processing of the current child */ private void SkipChild() { _skipchild = true; } /* * FC computation and shortcut cases for each node type */ private void CalculateFC(int NodeType, RegexNode node, int CurIndex) { bool ci = false; bool rtl = false; if (NodeType <= RegexNode.Ref) { if ((node._options & RegexOptions.IgnoreCase) != 0) ci = true; if ((node._options & RegexOptions.RightToLeft) != 0) rtl = true; } switch (NodeType) { case RegexNode.Concatenate | BeforeChild: case RegexNode.Alternate | BeforeChild: case RegexNode.Testref | BeforeChild: case RegexNode.Loop | BeforeChild: case RegexNode.Lazyloop | BeforeChild: break; case RegexNode.Testgroup | BeforeChild: if (CurIndex == 0) SkipChild(); break; case RegexNode.Empty: PushFC(new RegexFC(true)); break; case RegexNode.Concatenate | AfterChild: if (CurIndex != 0) { RegexFC child = PopFC(); RegexFC cumul = TopFC(); _failed = !cumul.AddFC(child, true); } if (!TopFC()._nullable) _skipAllChildren = true; break; case RegexNode.Testgroup | AfterChild: if (CurIndex > 1) { RegexFC child = PopFC(); RegexFC cumul = TopFC(); _failed = !cumul.AddFC(child, false); } break; case RegexNode.Alternate | AfterChild: case RegexNode.Testref | AfterChild: if (CurIndex != 0) { RegexFC child = PopFC(); RegexFC cumul = TopFC(); _failed = !cumul.AddFC(child, false); } break; case RegexNode.Loop | AfterChild: case RegexNode.Lazyloop | AfterChild: if (node._m == 0) TopFC()._nullable = true; break; case RegexNode.Group | BeforeChild: case RegexNode.Group | AfterChild: case RegexNode.Capture | BeforeChild: case RegexNode.Capture | AfterChild: case RegexNode.Greedy | BeforeChild: case RegexNode.Greedy | AfterChild: break; case RegexNode.Require | BeforeChild: case RegexNode.Prevent | BeforeChild: SkipChild(); PushFC(new RegexFC(true)); break; case RegexNode.Require | AfterChild: case RegexNode.Prevent | AfterChild: break; case RegexNode.One: case RegexNode.Notone: PushFC(new RegexFC(node._ch, NodeType == RegexNode.Notone, false, ci)); break; case RegexNode.Oneloop: case RegexNode.Onelazy: PushFC(new RegexFC(node._ch, false, node._m == 0, ci)); break; case RegexNode.Notoneloop: case RegexNode.Notonelazy: PushFC(new RegexFC(node._ch, true, node._m == 0, ci)); break; case RegexNode.Multi: if (node._str.Length == 0) PushFC(new RegexFC(true)); else if (!rtl) PushFC(new RegexFC(node._str[0], false, false, ci)); else PushFC(new RegexFC(node._str[node._str.Length - 1], false, false, ci)); break; case RegexNode.Set: PushFC(new RegexFC(node._str, false, ci)); break; case RegexNode.Setloop: case RegexNode.Setlazy: PushFC(new RegexFC(node._str, node._m == 0, ci)); break; case RegexNode.Ref: PushFC(new RegexFC(RegexCharClass.AnyClass, true, false)); break; case RegexNode.Nothing: case RegexNode.Bol: case RegexNode.Eol: case RegexNode.Boundary: case RegexNode.Nonboundary: case RegexNode.ECMABoundary: case RegexNode.NonECMABoundary: case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: case RegexNode.End: PushFC(new RegexFC(true)); break; default: throw new ArgumentException(SR.GetString(SR.UnexpectedOpcode, NodeType.ToString(CultureInfo.CurrentCulture))); } } } internal sealed class RegexFC { internal RegexCharClass _cc; internal bool _nullable; internal bool _caseInsensitive; internal RegexFC(bool nullable) { _cc = new RegexCharClass(); _nullable = nullable; } internal RegexFC(char ch, bool not, bool nullable, bool caseInsensitive) { _cc = new RegexCharClass(); if (not) { if (ch > 0) _cc.AddRange('\0', (char)(ch - 1)); if (ch < 0xFFFF) _cc.AddRange((char)(ch + 1), '\uFFFF'); } else { _cc.AddRange(ch, ch); } _caseInsensitive = caseInsensitive; _nullable = nullable; } internal RegexFC(String charClass, bool nullable, bool caseInsensitive) { _cc = RegexCharClass.Parse(charClass); _nullable = nullable; _caseInsensitive = caseInsensitive; } internal bool AddFC(RegexFC fc, bool concatenate) { if (!_cc.CanMerge || !fc._cc.CanMerge) { return false; } if (concatenate) { if (!_nullable) return true; if (!fc._nullable) _nullable = false; } else { if (fc._nullable) _nullable = true; } _caseInsensitive |= fc._caseInsensitive; _cc.AddCharClass(fc._cc); return true; } internal String GetFirstChars(CultureInfo culture) { if (_caseInsensitive) _cc.AddLowercase(culture); return _cc.ToStringClass(); } internal bool IsCaseInsensitive() { return _caseInsensitive; } } internal sealed class RegexPrefix { internal String _prefix; internal bool _caseInsensitive; internal static RegexPrefix _empty = new RegexPrefix(String.Empty, false); internal RegexPrefix(String prefix, bool ci) { _prefix = prefix; _caseInsensitive = ci; } internal String Prefix { get { return _prefix; } } internal bool CaseInsensitive { get { return _caseInsensitive; } } internal static RegexPrefix Empty { get { return _empty; } } } }