592 lines
20 KiB
C#
592 lines
20 KiB
C#
|
//------------------------------------------------------------------------------
|
||
|
// <copyright file="RegexFCD.cs" company="Microsoft">
|
||
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||
|
// </copyright>
|
||
|
//------------------------------------------------------------------------------
|
||
|
|
||
|
// This RegexFCD class is internal to the Regex package.
|
||
|
// It builds a bunch of FC information (RegexFC) about
|
||
|
// the regex for optimization purposes.
|
||
|
|
||
|
// Implementation notes:
|
||
|
//
|
||
|
// This step is as simple as walking the tree and emitting
|
||
|
// sequences of codes.
|
||
|
|
||
|
namespace System.Text.RegularExpressions {
|
||
|
|
||
|
using System.Collections;
|
||
|
using System.Globalization;
|
||
|
|
||
|
internal sealed class RegexFCD {
|
||
|
// Explicit stacks used by RegexFCFromRegexTree in place of recursion.
private int[] _intStack;        // saved child indexes
private int _intDepth;          // number of live entries in _intStack
private RegexFC[] _fcStack;     // RegexFC results pushed by CalculateFC
private int _fcDepth;           // number of live entries in _fcStack

// Walk-control flags written by CalculateFC and read by RegexFCFromRegexTree.
private bool _skipAllChildren;  // don't process any more children at the current level
private bool _skipchild;        // don't process the current child
private bool _failed;           // starts false; set when RegexFC.AddFC refuses a merge

// Phase markers OR-ed into a node type before it is handed to CalculateFC.
private const int BeforeChild = 0x40;
private const int AfterChild = 0x80;

// Anchor bits: where the regex can be pegged.
internal const int Beginning = 1 << 0;
internal const int Bol = 1 << 1;
internal const int Start = 1 << 2;
internal const int Eol = 1 << 3;
internal const int EndZ = 1 << 4;
internal const int End = 1 << 5;
internal const int Boundary = 1 << 6;
internal const int ECMABoundary = 1 << 7;
|
||
|
|
||
|
/*
 * One of the entry points meant to be called from outside this class.
 * It takes a RegexTree and computes the set of chars that can start it.
 *
 * Returns null when the analysis produced no result or when the result
 * is nullable; otherwise returns a RegexPrefix holding the first-char
 * class string and its case-insensitivity flag.
 */
internal static RegexPrefix FirstChars(RegexTree t) {
    RegexFC firstChars = new RegexFCD().RegexFCFromRegexTree(t);

    if (firstChars == null || firstChars._nullable) {
        return null;
    }

    // The culture is only used when the result is case-insensitive
    // (see RegexFC.GetFirstChars).
    bool invariant = (t._options & RegexOptions.CultureInvariant) != 0;
    CultureInfo culture = invariant ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture;

    string charClass = firstChars.GetFirstChars(culture);
    return new RegexPrefix(charClass, firstChars.IsCaseInsensitive());
}
|
||
|
|
||
|
/*
 * A related computation: it takes a RegexTree and computes the leading
 * substring if it sees one. It's quite trivial and gives up easily,
 * returning RegexPrefix.Empty whenever no leading literal is found.
 */
internal static RegexPrefix Prefix(RegexTree tree) {
    RegexNode node = tree._root;
    RegexNode concat = null;    // concatenation whose children are being scanned, if any
    int childIndex = 0;         // next child of 'concat' to visit

    while (true) {
        switch (node._type) {
            case RegexNode.Concatenate:
                // Scan this concatenation's children from the first one.
                if (node.ChildCount() > 0) {
                    concat = node;
                    childIndex = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Step into the first child and forget any concatenation
                // that was being scanned.
                node = node.Child(0);
                concat = null;
                continue;

            case RegexNode.Oneloop:
            case RegexNode.Onelazy: {
                if (node._m <= 0) {
                    return RegexPrefix.Empty;
                }

                // The prefix is _m copies of the loop character.
                // NOTE(review): _m has no upper bound here, so a very large
                // value allocates an equally large string -- confirm that is acceptable.
                string repeated = new string(node._ch, node._m);
                bool loopIgnoreCase = (node._options & RegexOptions.IgnoreCase) != 0;
                return new RegexPrefix(repeated, loopIgnoreCase);
            }

            case RegexNode.One: {
                string single = node._ch.ToString(CultureInfo.InvariantCulture);
                bool oneIgnoreCase = (node._options & RegexOptions.IgnoreCase) != 0;
                return new RegexPrefix(single, oneIgnoreCase);
            }

            case RegexNode.Multi: {
                bool multiIgnoreCase = (node._options & RegexOptions.IgnoreCase) != 0;
                return new RegexPrefix(node._str, multiIgnoreCase);
            }

            // These node types are skipped over; scanning continues with
            // the next child of the current concatenation.
            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                break;

            default:
                return RegexPrefix.Empty;
        }

        // Advance to the next sibling, or give up when there is none.
        bool exhausted = concat == null || childIndex >= concat.ChildCount();
        if (exhausted) {
            return RegexPrefix.Empty;
        }

        node = concat.Child(childIndex);
        childIndex++;
    }
}
|
||
|
|
||
|
/*
 * Yet another related computation: it takes a RegexTree and computes the
 * leading anchor that it encounters.
 *
 * The walk returns as soon as the first anchor node is reached, so the
 * result is either 0 or the single bit produced by AnchorFromType.
 */
internal static int Anchors(RegexTree tree) {
    RegexNode node = tree._root;
    RegexNode concat = null;    // concatenation whose children are being scanned, if any
    int childIndex = 0;         // next child of 'concat' to visit

    while (true) {
        switch (node._type) {
            case RegexNode.Concatenate:
                // Scan this concatenation's children from the first one.
                if (node.ChildCount() > 0) {
                    concat = node;
                    childIndex = 0;
                }
                break;

            case RegexNode.Greedy:
            case RegexNode.Capture:
                // Step into the first child and forget any concatenation
                // that was being scanned.
                node = node.Child(0);
                concat = null;
                continue;

            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.ECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
                return AnchorFromType(node._type);

            // Skipped over; scanning continues with the next sibling.
            case RegexNode.Empty:
            case RegexNode.Require:
            case RegexNode.Prevent:
                break;

            default:
                return 0;
        }

        // Advance to the next sibling, or stop when there is none.
        bool exhausted = concat == null || childIndex >= concat.ChildCount();
        if (exhausted) {
            return 0;
        }

        node = concat.Child(childIndex);
        childIndex++;
    }
}
|
||
|
|
||
|
/*
 * Maps an anchor node type to the matching anchor bit.
 * Any other node type maps to 0.
 */
private static int AnchorFromType(int type) {
    int anchorBit;

    switch (type) {
        case RegexNode.Bol:
            anchorBit = Bol;
            break;
        case RegexNode.Eol:
            anchorBit = Eol;
            break;
        case RegexNode.Boundary:
            anchorBit = Boundary;
            break;
        case RegexNode.ECMABoundary:
            anchorBit = ECMABoundary;
            break;
        case RegexNode.Beginning:
            anchorBit = Beginning;
            break;
        case RegexNode.Start:
            anchorBit = Start;
            break;
        case RegexNode.EndZ:
            anchorBit = EndZ;
            break;
        case RegexNode.End:
            anchorBit = End;
            break;
        default:
            anchorBit = 0;
            break;
    }

    return anchorBit;
}
|
||
|
|
||
|
#if DBG
/*
 * Debug-only helper: renders a set of anchor bits as a comma-separated
 * list of names, or "None" when no known bit is set.
 */
internal static String AnchorDescription(int anchors) {
    // Parallel tables, in the order the names are emitted.
    int[] bits = { Beginning, Start, Bol, Boundary, ECMABoundary, Eol, End, EndZ };
    string[] labels = {
        ", Beginning", ", Start", ", Bol", ", Boundary",
        ", ECMABoundary", ", Eol", ", End", ", EndZ"
    };

    StringBuilder text = new StringBuilder();

    for (int i = 0; i < bits.Length; i++) {
        if ((anchors & bits[i]) != 0) {
            text.Append(labels[i]);
        }
    }

    if (text.Length < 2) {
        return "None";
    }

    // Drop the leading ", " of the first label.
    return text.ToString(2, text.Length - 2);
}
#endif
|
||
|
|
||
|
/*
 * Private constructor; instances can't be created from outside.
 * Allocates both work stacks with room for 32 entries each.
 */
private RegexFCD() {
    const int InitialStackSize = 32;

    _intStack = new int[InitialStackSize];
    _fcStack = new RegexFC[InitialStackSize];
}
|
||
|
|
||
|
/*
 * To avoid recursion, we use a simple integer stack.
 * This is the push; the backing array doubles when it is full.
 */
private void PushInt(int I) {
    if (_intDepth >= _intStack.Length) {
        // Allocates an array of twice the depth and copies the existing entries.
        System.Array.Resize(ref _intStack, _intDepth * 2);
    }

    _intStack[_intDepth] = I;
    _intDepth++;
}
|
||
|
|
||
|
/*
 * True if the integer stack holds no entries.
 */
private bool IntIsEmpty() {
    bool empty = (_intDepth == 0);
    return empty;
}
|
||
|
|
||
|
/*
 * This is the pop: removes and returns the most recently pushed integer.
 */
private int PopInt() {
    _intDepth--;
    return _intStack[_intDepth];
}
|
||
|
|
||
|
/*
 * We also use a stack of RegexFC objects.
 * This is the push; the backing array doubles when it is full.
 */
private void PushFC(RegexFC fc) {
    if (_fcDepth >= _fcStack.Length) {
        // Allocates an array of twice the depth and copies the existing entries.
        System.Array.Resize(ref _fcStack, _fcDepth * 2);
    }

    _fcStack[_fcDepth] = fc;
    _fcDepth++;
}
|
||
|
|
||
|
/*
 * True if the RegexFC stack holds no entries.
 */
private bool FCIsEmpty() {
    bool empty = (_fcDepth == 0);
    return empty;
}
|
||
|
|
||
|
/*
 * This is the pop: removes and returns the most recently pushed RegexFC.
 */
private RegexFC PopFC() {
    _fcDepth--;
    return _fcStack[_fcDepth];
}
|
||
|
|
||
|
/*
 * This is the top: returns the most recently pushed RegexFC
 * without removing it.
 */
private RegexFC TopFC() {
    int topIndex = _fcDepth - 1;
    return _fcStack[topIndex];
}
|
||
|
|
||
|
/*
 * The main FC computation. It does a shortcutted depth-first walk
 * through the tree and calls CalculateFC before and after each child
 * of an interior node, and at each leaf.
 *
 * Returns the RegexFC left on the stack when the walk completes, or
 * null if a merge failed or nothing was produced.
 */
private RegexFC RegexFCFromRegexTree(RegexTree tree) {
    RegexNode node = tree._root;
    int childIndex = 0;

    while (true) {
        if (node._children == null) {
            // Leaf node.
            CalculateFC(node._type, node, 0);
        }
        else if (childIndex < node._children.Count && !_skipAllChildren) {
            // Interior node with more children to analyze.
            CalculateFC(node._type | BeforeChild, node, childIndex);

            if (_skipchild) {
                // CalculateFC asked to skip this child: move on to the next one.
                childIndex++;
                _skipchild = false;
            }
            else {
                // Descend. The saved index on the int stack is what makes
                // this a depth-first walk.
                node = (RegexNode)node._children[childIndex];
                PushInt(childIndex);
                childIndex = 0;
            }

            continue;
        }

        // Either a leaf was just handled, or an interior node has no more
        // children to analyze.
        _skipAllChildren = false;

        if (IntIsEmpty()) {
            break;
        }

        // Resume in the node reached through _next, at the child index
        // that was saved when we descended.
        childIndex = PopInt();
        node = node._next;

        CalculateFC(node._type | AfterChild, node, childIndex);
        if (_failed) {
            return null;
        }

        childIndex++;
    }

    return FCIsEmpty() ? null : PopFC();
}
|
||
|
|
||
|
/*
 * Called from a BeforeChild step to prevent further processing of the
 * current child; RegexFCFromRegexTree clears the flag once it has
 * skipped that child.
 */
private void SkipChild() {
    this._skipchild = true;
}
|
||
|
|
||
|
/*
 * FC computation and shortcut cases for each node type.
 *
 * NodeType is the node's type, optionally OR-ed with BeforeChild or
 * AfterChild; CurIndex is the index of the child concerned (0 for leaves).
 * Leaves push a RegexFC; AfterChild steps merge the pushed results.
 * Throws ArgumentException for a NodeType that is not handled.
 */
private void CalculateFC(int NodeType, RegexNode node, int CurIndex) {
    bool ignoreCase = false;
    bool rightToLeft = false;

    // The options are only consulted for plain node types up to RegexNode.Ref.
    if (NodeType <= RegexNode.Ref) {
        ignoreCase = (node._options & RegexOptions.IgnoreCase) != 0;
        rightToLeft = (node._options & RegexOptions.RightToLeft) != 0;
    }

    switch (NodeType) {
        // Steps that need no work.
        case RegexNode.Concatenate | BeforeChild:
        case RegexNode.Alternate | BeforeChild:
        case RegexNode.Testref | BeforeChild:
        case RegexNode.Loop | BeforeChild:
        case RegexNode.Lazyloop | BeforeChild:
        case RegexNode.Group | BeforeChild:
        case RegexNode.Group | AfterChild:
        case RegexNode.Capture | BeforeChild:
        case RegexNode.Capture | AfterChild:
        case RegexNode.Greedy | BeforeChild:
        case RegexNode.Greedy | AfterChild:
        case RegexNode.Require | AfterChild:
        case RegexNode.Prevent | AfterChild:
            break;

        case RegexNode.Testgroup | BeforeChild:
            // The first child of a Testgroup is not analyzed.
            if (CurIndex == 0) {
                SkipChild();
            }
            break;

        case RegexNode.Require | BeforeChild:
        case RegexNode.Prevent | BeforeChild:
            // The child is not analyzed; a nullable, empty RegexFC stands in for it.
            SkipChild();
            PushFC(new RegexFC(true));
            break;

        case RegexNode.Concatenate | AfterChild:
            if (CurIndex != 0) {
                MergeTopFC(true);
            }

            // Once the accumulated result is not nullable, the remaining
            // children are not processed.
            if (!TopFC()._nullable) {
                _skipAllChildren = true;
            }
            break;

        case RegexNode.Testgroup | AfterChild:
            // Child 0 was skipped, so merging starts after child 1.
            if (CurIndex > 1) {
                MergeTopFC(false);
            }
            break;

        case RegexNode.Alternate | AfterChild:
        case RegexNode.Testref | AfterChild:
            if (CurIndex != 0) {
                MergeTopFC(false);
            }
            break;

        case RegexNode.Loop | AfterChild:
        case RegexNode.Lazyloop | AfterChild:
            if (node._m == 0) {
                TopFC()._nullable = true;
            }
            break;

        case RegexNode.One:
        case RegexNode.Notone: {
            bool negated = (NodeType == RegexNode.Notone);
            PushFC(new RegexFC(node._ch, negated, false, ignoreCase));
            break;
        }

        case RegexNode.Oneloop:
        case RegexNode.Onelazy:
            PushFC(new RegexFC(node._ch, false, node._m == 0, ignoreCase));
            break;

        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy:
            PushFC(new RegexFC(node._ch, true, node._m == 0, ignoreCase));
            break;

        case RegexNode.Multi: {
            string text = node._str;

            if (text.Length == 0) {
                PushFC(new RegexFC(true));
            }
            else {
                // Right-to-left matching starts from the last character.
                char lead = rightToLeft ? text[text.Length - 1] : text[0];
                PushFC(new RegexFC(lead, false, false, ignoreCase));
            }
            break;
        }

        case RegexNode.Set:
            PushFC(new RegexFC(node._str, false, ignoreCase));
            break;

        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            PushFC(new RegexFC(node._str, node._m == 0, ignoreCase));
            break;

        case RegexNode.Ref:
            // Pushed as the "any" class, nullable, not case-insensitive.
            PushFC(new RegexFC(RegexCharClass.AnyClass, true, false));
            break;

        // These leaves all push a nullable RegexFC with an empty class.
        case RegexNode.Empty:
        case RegexNode.Nothing:
        case RegexNode.Bol:
        case RegexNode.Eol:
        case RegexNode.Boundary:
        case RegexNode.Nonboundary:
        case RegexNode.ECMABoundary:
        case RegexNode.NonECMABoundary:
        case RegexNode.Beginning:
        case RegexNode.Start:
        case RegexNode.EndZ:
        case RegexNode.End:
            PushFC(new RegexFC(true));
            break;

        default:
            throw new ArgumentException(SR.GetString(SR.UnexpectedOpcode, NodeType.ToString(CultureInfo.CurrentCulture)));
    }
}

/*
 * Pops the newest RegexFC and merges it into the one beneath it,
 * recording in _failed whether the merge was refused.
 */
private void MergeTopFC(bool concatenate) {
    RegexFC child = PopFC();
    RegexFC cumul = TopFC();

    _failed = !cumul.AddFC(child, concatenate);
}
|
||
|
}
|
||
|
|
||
|
/*
 * First-char information for one piece of a regex: a character class,
 * a nullable flag and a case-insensitivity flag.
 */
internal sealed class RegexFC {
    internal RegexCharClass _cc;
    internal bool _nullable;
    internal bool _caseInsensitive;

    /*
     * Creates an entry with an empty character class and the given
     * nullable flag.
     */
    internal RegexFC(bool nullable) {
        _nullable = nullable;
        _cc = new RegexCharClass();
    }

    /*
     * Creates an entry for a single character, or, when 'not' is true,
     * for every character except 'ch'.
     */
    internal RegexFC(char ch, bool not, bool nullable, bool caseInsensitive) {
        _cc = new RegexCharClass();

        if (!not) {
            _cc.AddRange(ch, ch);
        }
        else {
            // Everything below ch, then everything above it; each range is
            // skipped when ch sits at that end of the char range.
            if (ch > char.MinValue) {
                _cc.AddRange(char.MinValue, (char)(ch - 1));
            }
            if (ch < char.MaxValue) {
                _cc.AddRange((char)(ch + 1), char.MaxValue);
            }
        }

        _nullable = nullable;
        _caseInsensitive = caseInsensitive;
    }

    /*
     * Creates an entry from a character-class string, which is parsed
     * with RegexCharClass.Parse.
     */
    internal RegexFC(String charClass, bool nullable, bool caseInsensitive) {
        _cc = RegexCharClass.Parse(charClass);
        _caseInsensitive = caseInsensitive;
        _nullable = nullable;
    }

    /*
     * Merges 'fc' into this entry. 'concatenate' selects between
     * sequence semantics (true) and alternative semantics (false).
     * Returns false, leaving this entry untouched, when either
     * character class reports that it cannot be merged.
     */
    internal bool AddFC(RegexFC fc, bool concatenate) {
        bool mergeable = _cc.CanMerge && fc._cc.CanMerge;
        if (!mergeable) {
            return false;
        }

        if (concatenate) {
            // A non-nullable entry is not affected by what follows it.
            if (!_nullable) {
                return true;
            }

            // _nullable is true here, so the sequence is nullable only if
            // the added entry is as well.
            _nullable = fc._nullable;
        }
        else {
            // An alternative is nullable if either side is.
            _nullable |= fc._nullable;
        }

        _caseInsensitive |= fc._caseInsensitive;
        _cc.AddCharClass(fc._cc);
        return true;
    }

    /*
     * Returns the character class in string form. For a case-insensitive
     * entry, AddLowercase is applied to the class first using 'culture'.
     */
    internal String GetFirstChars(CultureInfo culture) {
        if (_caseInsensitive) {
            _cc.AddLowercase(culture);
        }

        return _cc.ToStringClass();
    }

    /*
     * True if this entry is marked case-insensitive.
     */
    internal bool IsCaseInsensitive() {
        return _caseInsensitive;
    }
}
|
||
|
|
||
|
/*
 * A prefix string paired with a flag saying whether it is to be
 * treated case-insensitively.
 */
internal sealed class RegexPrefix {
    // Shared instance with an empty prefix and case-insensitivity off.
    internal static RegexPrefix _empty = new RegexPrefix(String.Empty, false);

    internal String _prefix;
    internal bool _caseInsensitive;

    /*
     * Stores the prefix text and the case-insensitivity flag as given.
     */
    internal RegexPrefix(String prefix, bool ci) {
        _caseInsensitive = ci;
        _prefix = prefix;
    }

    // The shared empty prefix.
    internal static RegexPrefix Empty {
        get { return _empty; }
    }

    // The prefix text.
    internal String Prefix {
        get { return _prefix; }
    }

    // Whether the prefix is case-insensitive.
    internal bool CaseInsensitive {
        get { return _caseInsensitive; }
    }
}
|
||
|
}
|