e79aa3c0ed
Former-commit-id: a2155e9bd80020e49e72e86c44da02a8ac0e57a4
694 lines
24 KiB
C#
694 lines
24 KiB
C#
//------------------------------------------------------------------------------
|
|
// <copyright file="RegexNode.cs" company="Microsoft">
|
|
// Copyright (c) Microsoft Corporation. All rights reserved.
|
|
// </copyright>
|
|
//------------------------------------------------------------------------------
|
|
|
|
// This RegexNode class is internal to the Regex package.
|
|
// It is built into a parsed tree for a regular expression.
|
|
|
|
// Implementation notes:
|
|
//
|
|
// Since the node tree is a temporary data structure only used
|
|
// during compilation of the regexp to integer codes, it's
|
|
// designed for clarity and convenience rather than
|
|
// space efficiency.
|
|
//
|
|
// RegexNodes are built into a tree, linked by the _children list.
|
|
// Each node also has a _parent and _ichild member indicating
|
|
// its parent and which child # it is in its parent's list.
|
|
//
|
|
// RegexNodes come in as many types as there are constructs in
|
|
// a regular expression, for example, "concatenate", "alternate",
|
|
// "one", "rept", "group". There are also node types for basic
|
|
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
|
|
//
|
|
// Because perl 5 allows "lookback" groups that scan backwards,
|
|
// each node also gets a "direction". Normally the value of
|
|
// boolean _backward = false.
|
|
//
|
|
// During parsing, top-level nodes are also stacked onto a parse
|
|
// stack (a stack of trees). For this purpose we have a _next
|
|
// pointer. [Note that to save a few bytes, we could overload the
|
|
// _parent pointer instead.]
|
|
//
|
|
// On the parse stack, each tree has a "role" - basically, the
|
|
// nonterminal in the grammar that the parser has currently
|
|
// assigned to the tree. That code is stored in _role.
|
|
//
|
|
// Finally, some of the different kinds of nodes have data.
|
|
// Two integers (for the looping constructs) are stored in
|
|
// _operands, an an object (either a string or a set)
|
|
// is stored in _data
|
|
|
|
|
|
namespace System.Text.RegularExpressions {
|
|
|
|
using System.Collections;
|
|
using System.Collections.Generic;
|
|
using System.Diagnostics;
|
|
using System.Globalization;
|
|
|
|
internal sealed class RegexNode {
|
|
/*
|
|
* RegexNode types
|
|
*/
|
|
|
|
// the following are leaves, and correspond to primitive operations
|
|
|
|
// static final int Onerep = RegexCode.Onerep; // c,n a {n}
|
|
// static final int Notonerep = RegexCode.Notonerep; // c,n .{n}
|
|
// static final int Setrep = RegexCode.Setrep; // set,n \d {n}
|
|
|
|
internal const int Oneloop = RegexCode.Oneloop; // c,n a*
|
|
internal const int Notoneloop = RegexCode.Notoneloop; // c,n .*
|
|
internal const int Setloop = RegexCode.Setloop; // set,n \d*
|
|
|
|
internal const int Onelazy = RegexCode.Onelazy; // c,n a*?
|
|
internal const int Notonelazy = RegexCode.Notonelazy; // c,n .*?
|
|
internal const int Setlazy = RegexCode.Setlazy; // set,n \d*?
|
|
|
|
internal const int One = RegexCode.One; // char a
|
|
internal const int Notone = RegexCode.Notone; // char . [^a]
|
|
internal const int Set = RegexCode.Set; // set [a-z] \w \s \d
|
|
|
|
internal const int Multi = RegexCode.Multi; // string abcdef
|
|
internal const int Ref = RegexCode.Ref; // index \1
|
|
|
|
internal const int Bol = RegexCode.Bol; // ^
|
|
internal const int Eol = RegexCode.Eol; // $
|
|
internal const int Boundary = RegexCode.Boundary; // \b
|
|
internal const int Nonboundary= RegexCode.Nonboundary;// \B
|
|
internal const int ECMABoundary = RegexCode.ECMABoundary; // \b
|
|
internal const int NonECMABoundary= RegexCode.NonECMABoundary; // \B
|
|
internal const int Beginning = RegexCode.Beginning; // \A
|
|
internal const int Start = RegexCode.Start; // \G
|
|
internal const int EndZ = RegexCode.EndZ; // \Z
|
|
internal const int End = RegexCode.End; // \z
|
|
|
|
// (note: End = 21;)
|
|
|
|
// interior nodes do not correpond to primitive operations, but
|
|
// control structures compositing other operations
|
|
|
|
// concat and alternate take n children, and can run forward or backwards
|
|
|
|
internal const int Nothing = 22; // []
|
|
internal const int Empty = 23; // ()
|
|
|
|
internal const int Alternate = 24; // a|b
|
|
internal const int Concatenate= 25; // ab
|
|
|
|
internal const int Loop = 26; // m,x * + ? {,}
|
|
internal const int Lazyloop = 27; // m,x *? +? ?? {,}?
|
|
|
|
internal const int Capture = 28; // n ()
|
|
internal const int Group = 29; // (?:)
|
|
internal const int Require = 30; // (?=) (?<=)
|
|
internal const int Prevent = 31; // (?!) (?<!)
|
|
internal const int Greedy = 32; // (?>) (?<)
|
|
internal const int Testref = 33; // (?(n) | )
|
|
internal const int Testgroup = 34; // (?(...) | )
|
|
|
|
/*
|
|
* RegexNode data members
|
|
*
|
|
*/
|
|
|
|
internal int _type;
|
|
|
|
internal List<RegexNode> _children;
|
|
|
|
internal String _str;
|
|
internal char _ch;
|
|
internal int _m;
|
|
internal int _n;
|
|
internal RegexOptions _options;
|
|
|
|
internal RegexNode _next;
|
|
|
|
internal RegexNode(int type, RegexOptions options) {
|
|
_type = type;
|
|
_options = options;
|
|
}
|
|
|
|
internal RegexNode(int type, RegexOptions options, char ch) {
|
|
_type = type;
|
|
_options = options;
|
|
_ch = ch;
|
|
}
|
|
|
|
internal RegexNode(int type, RegexOptions options, String str) {
|
|
_type = type;
|
|
_options = options;
|
|
_str = str;
|
|
}
|
|
|
|
internal RegexNode(int type, RegexOptions options, int m) {
|
|
_type = type;
|
|
_options = options;
|
|
_m = m;
|
|
}
|
|
|
|
internal RegexNode(int type, RegexOptions options, int m, int n) {
|
|
_type = type;
|
|
_options = options;
|
|
_m = m;
|
|
_n = n;
|
|
}
|
|
|
|
internal bool UseOptionR() {
|
|
return(_options & RegexOptions.RightToLeft) != 0;
|
|
}
|
|
|
|
internal RegexNode ReverseLeft() {
|
|
if (UseOptionR() && _type == Concatenate && _children != null) {
|
|
_children.Reverse(0, _children.Count);
|
|
}
|
|
|
|
return this;
|
|
}
|
|
|
|
|
|
// Pass type as OneLazy or OneLoop
|
|
internal void MakeRep(int type, int min, int max) {
|
|
_type += (type - One);
|
|
_m = min;
|
|
_n = max;
|
|
}
|
|
|
|
/*
|
|
* Reduce
|
|
*
|
|
* Removes redundant nodes from the subtree, and returns a reduced subtree.
|
|
*/
|
|
internal RegexNode Reduce() {
|
|
RegexNode n;
|
|
|
|
switch (Type()) {
|
|
case Alternate:
|
|
n = ReduceAlternation();
|
|
break;
|
|
|
|
case Concatenate:
|
|
n = ReduceConcatenation();
|
|
break;
|
|
|
|
case Loop:
|
|
case Lazyloop:
|
|
n = ReduceRep();
|
|
break;
|
|
|
|
case Group:
|
|
n = ReduceGroup();
|
|
break;
|
|
|
|
case Set:
|
|
case Setloop:
|
|
n = ReduceSet();
|
|
break;
|
|
|
|
default:
|
|
n = this;
|
|
break;
|
|
}
|
|
|
|
return n;
|
|
}
|
|
|
|
|
|
/*
|
|
* StripEnation:
|
|
*
|
|
* Simple optimization. If a concatenation or alternation has only
|
|
* one child strip out the intermediate node. If it has zero children,
|
|
* turn it into an empty.
|
|
*
|
|
*/
|
|
|
|
internal RegexNode StripEnation(int emptyType) {
|
|
switch (ChildCount()) {
|
|
case 0:
|
|
return new RegexNode(emptyType, _options);
|
|
case 1:
|
|
return Child(0);
|
|
default:
|
|
return this;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* ReduceGroup:
|
|
*
|
|
* Simple optimization. Once parsed into a tree, noncapturing groups
|
|
* serve no function, so strip them out.
|
|
*/
|
|
|
|
internal RegexNode ReduceGroup() {
|
|
RegexNode u;
|
|
|
|
for (u = this; u.Type() == Group; )
|
|
u = u.Child(0);
|
|
|
|
return u;
|
|
}
|
|
|
|
/*
|
|
* ReduceRep:
|
|
*
|
|
* Nested repeaters just get multiplied with each other if they're not
|
|
* too lumpy
|
|
*/
|
|
|
|
internal RegexNode ReduceRep() {
|
|
RegexNode u;
|
|
RegexNode child;
|
|
int type;
|
|
int min;
|
|
int max;
|
|
|
|
u = this;
|
|
type = Type();
|
|
min = _m;
|
|
max = _n;
|
|
|
|
for (;;) {
|
|
if (u.ChildCount() == 0)
|
|
break;
|
|
|
|
child = u.Child(0);
|
|
|
|
// multiply reps of the same type only
|
|
if (child.Type() != type) {
|
|
int childType = child.Type();
|
|
|
|
if (!(childType >= Oneloop && childType <= Setloop && type == Loop ||
|
|
childType >= Onelazy && childType <= Setlazy && type == Lazyloop))
|
|
break;
|
|
}
|
|
|
|
// child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
|
|
// [but things like (a {2,})+ are not too lumpy...]
|
|
if (u._m == 0 && child._m > 1 || child._n < child._m * 2)
|
|
break;
|
|
|
|
u = child;
|
|
if (u._m > 0)
|
|
u._m = min = ((Int32.MaxValue - 1) / u._m < min) ? Int32.MaxValue : u._m * min;
|
|
if (u._n > 0)
|
|
u._n = max = ((Int32.MaxValue - 1) / u._n < max) ? Int32.MaxValue : u._n * max;
|
|
}
|
|
|
|
return min == Int32.MaxValue ? new RegexNode(Nothing, _options) : u;
|
|
}
|
|
|
|
/*
|
|
* ReduceSet:
|
|
*
|
|
* Simple optimization. If a set is a singleton, an inverse singleton,
|
|
* or empty, it's transformed accordingly.
|
|
*/
|
|
|
|
internal RegexNode ReduceSet() {
|
|
// Extract empty-set, one and not-one case as special
|
|
|
|
if (RegexCharClass.IsEmpty(_str)) {
|
|
_type = Nothing;
|
|
_str = null;
|
|
}
|
|
else if (RegexCharClass.IsSingleton(_str)) {
|
|
_ch = RegexCharClass.SingletonChar(_str);
|
|
_str = null;
|
|
_type += (One - Set);
|
|
}
|
|
else if (RegexCharClass.IsSingletonInverse(_str)) {
|
|
_ch = RegexCharClass.SingletonChar(_str);
|
|
_str = null;
|
|
_type += (Notone - Set);
|
|
}
|
|
|
|
return this;
|
|
}
|
|
|
|
/*
|
|
* ReduceAlternation:
|
|
*
|
|
* Basic optimization. Single-letter alternations can be replaced
|
|
* by faster set specifications, and nested alternations with no
|
|
* intervening operators can be flattened:
|
|
*
|
|
* a|b|c|def|g|h -> [a-c]|def|[gh]
|
|
* apple|(?:orange|pear)|grape -> apple|orange|pear|grape
|
|
*
|
|
* <
|
|
*/
|
|
|
|
internal RegexNode ReduceAlternation() {
|
|
// Combine adjacent sets/chars
|
|
|
|
bool wasLastSet;
|
|
bool lastNodeCannotMerge;
|
|
RegexOptions optionsLast;
|
|
RegexOptions optionsAt;
|
|
int i;
|
|
int j;
|
|
RegexNode at;
|
|
RegexNode prev;
|
|
|
|
if (_children == null)
|
|
return new RegexNode(RegexNode.Nothing, _options);
|
|
|
|
wasLastSet = false;
|
|
lastNodeCannotMerge = false;
|
|
optionsLast = 0;
|
|
|
|
for (i = 0, j = 0; i < _children.Count; i++, j++) {
|
|
at = _children[i];
|
|
|
|
if (j < i)
|
|
_children[j] = at;
|
|
|
|
for (;;) {
|
|
if (at._type == Alternate) {
|
|
for (int k = 0; k < at._children.Count; k++)
|
|
at._children[k]._next = this;
|
|
|
|
_children.InsertRange(i + 1, at._children);
|
|
j--;
|
|
}
|
|
else if (at._type == Set || at._type == One) {
|
|
// Cannot merge sets if L or I options differ, or if either are negated.
|
|
optionsAt = at._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
|
|
|
|
|
|
if (at._type == Set) {
|
|
if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !RegexCharClass.IsMergeable(at._str)) {
|
|
wasLastSet = true;
|
|
lastNodeCannotMerge = !RegexCharClass.IsMergeable(at._str);
|
|
optionsLast = optionsAt;
|
|
break;
|
|
}
|
|
}
|
|
else if (!wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge) {
|
|
wasLastSet = true;
|
|
lastNodeCannotMerge = false;
|
|
optionsLast = optionsAt;
|
|
break;
|
|
}
|
|
|
|
|
|
// The last node was a Set or a One, we're a Set or One and our options are the same.
|
|
// Merge the two nodes.
|
|
j--;
|
|
prev = _children[j];
|
|
|
|
RegexCharClass prevCharClass;
|
|
if (prev._type == RegexNode.One) {
|
|
prevCharClass = new RegexCharClass();
|
|
prevCharClass.AddChar(prev._ch);
|
|
}
|
|
else {
|
|
prevCharClass = RegexCharClass.Parse(prev._str);
|
|
}
|
|
|
|
if (at._type == RegexNode.One) {
|
|
prevCharClass.AddChar(at._ch);
|
|
}
|
|
else {
|
|
RegexCharClass atCharClass = RegexCharClass.Parse(at._str);
|
|
prevCharClass.AddCharClass(atCharClass);
|
|
}
|
|
|
|
prev._type = RegexNode.Set;
|
|
prev._str = prevCharClass.ToStringClass();
|
|
|
|
}
|
|
else if (at._type == RegexNode.Nothing) {
|
|
j--;
|
|
}
|
|
else {
|
|
wasLastSet = false;
|
|
lastNodeCannotMerge = false;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (j < i)
|
|
_children.RemoveRange(j, i - j);
|
|
|
|
return StripEnation(RegexNode.Nothing);
|
|
}
|
|
|
|
/*
|
|
* ReduceConcatenation:
|
|
*
|
|
* Basic optimization. Adjacent strings can be concatenated.
|
|
*
|
|
* (?:abc)(?:def) -> abcdef
|
|
*/
|
|
|
|
internal RegexNode ReduceConcatenation() {
|
|
// Eliminate empties and concat adjacent strings/chars
|
|
|
|
bool wasLastString;
|
|
RegexOptions optionsLast;
|
|
RegexOptions optionsAt;
|
|
int i;
|
|
int j;
|
|
|
|
if (_children == null)
|
|
return new RegexNode(RegexNode.Empty, _options);
|
|
|
|
wasLastString = false;
|
|
optionsLast = 0;
|
|
|
|
for (i = 0, j = 0; i < _children.Count; i++, j++) {
|
|
RegexNode at;
|
|
RegexNode prev;
|
|
|
|
at = _children[i];
|
|
|
|
if (j < i)
|
|
_children[j] = at;
|
|
|
|
if (at._type == RegexNode.Concatenate &&
|
|
((at._options & RegexOptions.RightToLeft) == (_options & RegexOptions.RightToLeft))) {
|
|
for (int k = 0; k < at._children.Count; k++)
|
|
at._children[k]._next = this;
|
|
|
|
_children.InsertRange(i + 1, at._children);
|
|
j--;
|
|
}
|
|
else if (at._type == RegexNode.Multi ||
|
|
at._type == RegexNode.One) {
|
|
// Cannot merge strings if L or I options differ
|
|
optionsAt = at._options & (RegexOptions.RightToLeft | RegexOptions.IgnoreCase);
|
|
|
|
if (!wasLastString || optionsLast != optionsAt) {
|
|
wasLastString = true;
|
|
optionsLast = optionsAt;
|
|
continue;
|
|
}
|
|
|
|
prev = _children[--j];
|
|
|
|
if (prev._type == RegexNode.One) {
|
|
prev._type = RegexNode.Multi;
|
|
prev._str = Convert.ToString(prev._ch, CultureInfo.InvariantCulture);
|
|
}
|
|
|
|
if ((optionsAt & RegexOptions.RightToLeft) == 0) {
|
|
if (at._type == RegexNode.One)
|
|
prev._str += at._ch.ToString();
|
|
else
|
|
prev._str += at._str;
|
|
}
|
|
else {
|
|
if (at._type == RegexNode.One)
|
|
prev._str = at._ch.ToString() + prev._str;
|
|
else
|
|
prev._str = at._str + prev._str;
|
|
}
|
|
|
|
}
|
|
else if (at._type == RegexNode.Empty) {
|
|
j--;
|
|
}
|
|
else {
|
|
wasLastString = false;
|
|
}
|
|
}
|
|
|
|
if (j < i)
|
|
_children.RemoveRange(j, i - j);
|
|
|
|
return StripEnation(RegexNode.Empty);
|
|
}
|
|
|
|
internal RegexNode MakeQuantifier(bool lazy, int min, int max) {
|
|
RegexNode result;
|
|
|
|
if (min == 0 && max == 0)
|
|
return new RegexNode(RegexNode.Empty, _options);
|
|
|
|
if (min == 1 && max == 1)
|
|
return this;
|
|
|
|
switch (_type) {
|
|
case RegexNode.One:
|
|
case RegexNode.Notone:
|
|
case RegexNode.Set:
|
|
|
|
MakeRep(lazy ? RegexNode.Onelazy : RegexNode.Oneloop, min, max);
|
|
return this;
|
|
|
|
default:
|
|
result = new RegexNode(lazy ? RegexNode.Lazyloop : RegexNode.Loop, _options, min, max);
|
|
result.AddChild(this);
|
|
return result;
|
|
}
|
|
}
|
|
|
|
internal void AddChild(RegexNode newChild) {
|
|
RegexNode reducedChild;
|
|
|
|
if (_children == null)
|
|
_children = new List<RegexNode>(4);
|
|
|
|
reducedChild = newChild.Reduce();
|
|
|
|
_children.Add(reducedChild);
|
|
reducedChild._next = this;
|
|
}
|
|
internal RegexNode Child(int i) {
|
|
return _children[i];
|
|
}
|
|
|
|
internal int ChildCount() {
|
|
return _children == null ? 0 : _children.Count;
|
|
}
|
|
|
|
internal int Type() {
|
|
return _type;
|
|
}
|
|
|
|
#if DBG
|
|
internal static String[] TypeStr = new String[] {
|
|
"Onerep", "Notonerep", "Setrep",
|
|
"Oneloop", "Notoneloop", "Setloop",
|
|
"Onelazy", "Notonelazy", "Setlazy",
|
|
"One", "Notone", "Set",
|
|
"Multi", "Ref",
|
|
"Bol", "Eol", "Boundary", "Nonboundary",
|
|
"ECMABoundary", "NonECMABoundary",
|
|
"Beginning", "Start", "EndZ", "End",
|
|
"Nothing", "Empty",
|
|
"Alternate", "Concatenate",
|
|
"Loop", "Lazyloop",
|
|
"Capture", "Group", "Require", "Prevent", "Greedy",
|
|
"Testref", "Testgroup"};
|
|
|
|
internal String Description() {
|
|
|
|
StringBuilder ArgSb = new StringBuilder();
|
|
|
|
ArgSb.Append(TypeStr[_type]);
|
|
|
|
if ((_options & RegexOptions.ExplicitCapture) != 0)
|
|
ArgSb.Append("-C");
|
|
if ((_options & RegexOptions.IgnoreCase) != 0)
|
|
ArgSb.Append("-I");
|
|
if ((_options & RegexOptions.RightToLeft) != 0)
|
|
ArgSb.Append("-L");
|
|
if ((_options & RegexOptions.Multiline) != 0)
|
|
ArgSb.Append("-M");
|
|
if ((_options & RegexOptions.Singleline) != 0)
|
|
ArgSb.Append("-S");
|
|
if ((_options & RegexOptions.IgnorePatternWhitespace) != 0)
|
|
ArgSb.Append("-X");
|
|
if ((_options & RegexOptions.ECMAScript) != 0)
|
|
ArgSb.Append("-E");
|
|
|
|
switch (_type) {
|
|
case Oneloop:
|
|
case Notoneloop:
|
|
case Onelazy:
|
|
case Notonelazy:
|
|
case One:
|
|
case Notone:
|
|
ArgSb.Append("(Ch = " + RegexCharClass.CharDescription(_ch) + ")");
|
|
break;
|
|
case Capture:
|
|
ArgSb.Append("(index = " + _m.ToString(CultureInfo.InvariantCulture) + ", unindex = " + _n.ToString(CultureInfo.InvariantCulture) + ")");
|
|
break;
|
|
case Ref:
|
|
case Testref:
|
|
ArgSb.Append("(index = " + _m.ToString(CultureInfo.InvariantCulture) + ")");
|
|
break;
|
|
case Multi:
|
|
ArgSb.Append("(String = " + _str + ")");
|
|
break;
|
|
case Set:
|
|
case Setloop:
|
|
case Setlazy:
|
|
ArgSb.Append("(Set = " + RegexCharClass.SetDescription(_str) + ")");
|
|
break;
|
|
}
|
|
|
|
switch (_type) {
|
|
case Oneloop:
|
|
case Notoneloop:
|
|
case Onelazy:
|
|
case Notonelazy:
|
|
case Setloop:
|
|
case Setlazy:
|
|
case Loop:
|
|
case Lazyloop:
|
|
ArgSb.Append("(Min = " + _m.ToString(CultureInfo.InvariantCulture) + ", Max = " + (_n == Int32.MaxValue ? "inf" : Convert.ToString(_n, CultureInfo.InvariantCulture)) + ")");
|
|
break;
|
|
}
|
|
|
|
return ArgSb.ToString();
|
|
}
|
|
|
|
internal const String Space = " ";
|
|
|
|
internal void Dump() {
|
|
List<int> Stack = new List<int>();
|
|
RegexNode CurNode;
|
|
int CurChild;
|
|
|
|
CurNode = this;
|
|
CurChild = 0;
|
|
|
|
Debug.WriteLine(CurNode.Description());
|
|
|
|
for (;;) {
|
|
if (CurNode._children != null && CurChild < CurNode._children.Count) {
|
|
Stack.Add(CurChild + 1);
|
|
CurNode = CurNode._children[CurChild];
|
|
CurChild = 0;
|
|
|
|
int Depth = Stack.Count;
|
|
if (Depth > 32)
|
|
Depth = 32;
|
|
|
|
Debug.WriteLine(Space.Substring(0, Depth) + CurNode.Description());
|
|
}
|
|
else {
|
|
if (Stack.Count == 0)
|
|
break;
|
|
|
|
CurChild = Stack[Stack.Count - 1];
|
|
Stack.RemoveAt(Stack.Count - 1);
|
|
CurNode = CurNode._next;
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
}
|
|
|
|
}
|