618 lines
22 KiB
C#
Raw Normal View History

//------------------------------------------------------------------------------
// <copyright file="RegexRunner.cs" company="Microsoft">
// Copyright (c) Microsoft Corporation. All rights reserved.
// </copyright>
//------------------------------------------------------------------------------
// This RegexRunner class is a base class for compiled regex code.
// Implementation notes:
//
// RegexRunner provides a common calling convention and a common
// runtime environment for the interpreter and the compiled code.
//
// It provides the driver code that call's the subclass's Go()
// method for either scanning or direct execution.
//
// It also maintains memory allocation for the backtracking stack,
// the grouping stack and the longjump crawlstack, and provides
// methods to push new subpattern match results into (or remove
// backtracked results from) the Match instance.
namespace System.Text.RegularExpressions {
using System.Collections;
using System.Diagnostics;
using System.ComponentModel;
using System.Globalization;
/// <internalonly/>
//
#if !SILVERLIGHT
[EditorBrowsable(EditorBrowsableState.Never)]
#endif
#if !SILVERLIGHT
abstract public class RegexRunner {
#else
abstract internal class RegexRunner {
#endif
protected internal int runtextbeg; // beginning of text to search
protected internal int runtextend; // end of text to search
protected internal int runtextstart; // starting point for search
protected internal String runtext; // text to search
protected internal int runtextpos; // current position in text
protected internal int [] runtrack; // The backtracking stack. Opcodes use this to store data regarding
protected internal int runtrackpos; // what they have matched and where to backtrack to. Each "frame" on
// the stack takes the form of [CodePosition Data1 Data2...], where
// CodePosition is the position of the current opcode and
// the data values are all optional. The CodePosition can be negative, and
// these values (also called "back2") are used by the BranchMark family of opcodes
// to indicate whether they are backtracking after a successful or failed
// match.
// When we backtrack, we pop the CodePosition off the stack, set the current
// instruction pointer to that code position, and mark the opcode
// with a backtracking flag ("Back"). Each opcode then knows how to
// handle its own data.
protected internal int [] runstack; // This stack is used to track text positions across different opcodes.
protected internal int runstackpos; // For example, in /(a*b)+/, the parentheses result in a SetMark/CaptureMark
// pair. SetMark records the text position before we match a*b. Then
// CaptureMark uses that position to figure out where the capture starts.
// Opcodes which push onto this stack are always paired with other opcodes
// which will pop the value from it later. A successful match should mean
// that this stack is empty.
protected internal int [] runcrawl; // The crawl stack is used to keep track of captures. Every time a group
protected internal int runcrawlpos; // has a capture, we push its group number onto the runcrawl stack. In
// the case of a balanced match, we push BOTH groups onto the stack.
protected internal int runtrackcount; // count of states that may do backtracking
protected internal Match runmatch; // result object
protected internal Regex runregex; // regex object
private Int32 timeout; // timeout in millisecs (needed for actual)
private bool ignoreTimeout;
private Int32 timeoutOccursAt;
// GPaperin: We have determined this value in a series of experiments where x86 retail
// builds (ono-lab-optimised) were run on different pattern/input pairs. Larger values
// of TimeoutCheckFrequency did not tend to increase performance; smaller values
// of TimeoutCheckFrequency tended to slow down the execution.
private const int TimeoutCheckFrequency = 1000;
private int timeoutChecksToSkip;
protected internal RegexRunner() { }
/*
* Scans the string to find the first match. Uses the Match object
* both to feed text in and as a place to store matches that come out.
*
* All the action is in the abstract Go() method defined by subclasses. Our
* responsibility is to load up the class members (as done here) before
* calling Go.
*
* <
*/
protected internal Match Scan(Regex regex, String text, int textbeg, int textend, int textstart, int prevlen, bool quick) {
return Scan(regex, text, textbeg, textend, textstart, prevlen, quick, regex.MatchTimeout);
}
#if !SILVERLIGHT
protected internal
#else
internal
#endif
Match Scan(Regex regex, String text, int textbeg, int textend, int textstart, int prevlen, bool quick, TimeSpan timeout) {
int bump;
int stoppos;
bool initted = false;
// We need to re-validate timeout here because Scan is historically protected and
// thus there is a possibility it is called from user code:
Regex.ValidateMatchTimeout(timeout);
this.ignoreTimeout = (Regex.InfiniteMatchTimeout == timeout);
this.timeout = this.ignoreTimeout
? (Int32) Regex.InfiniteMatchTimeout.TotalMilliseconds
: (Int32) (timeout.TotalMilliseconds + 0.5); // Round
runregex = regex;
runtext = text;
runtextbeg = textbeg;
runtextend = textend;
runtextstart = textstart;
bump = runregex.RightToLeft ? -1 : 1;
stoppos = runregex.RightToLeft ? runtextbeg : runtextend;
runtextpos = textstart;
// If previous match was empty or failed, advance by one before matching
if (prevlen == 0) {
if (runtextpos == stoppos)
return Match.Empty;
runtextpos += bump;
}
StartTimeoutWatch();
for (; ; ) {
#if DBG
if (runregex.Debug) {
Debug.WriteLine("");
Debug.WriteLine("Search range: from " + runtextbeg.ToString(CultureInfo.InvariantCulture) + " to " + runtextend.ToString(CultureInfo.InvariantCulture));
Debug.WriteLine("Firstchar search starting at " + runtextpos.ToString(CultureInfo.InvariantCulture) + " stopping at " + stoppos.ToString(CultureInfo.InvariantCulture));
}
#endif
if (FindFirstChar()) {
CheckTimeout();
if (!initted) {
InitMatch();
initted = true;
}
#if DBG
if (runregex.Debug) {
Debug.WriteLine("Executing engine starting at " + runtextpos.ToString(CultureInfo.InvariantCulture));
Debug.WriteLine("");
}
#endif
Go();
if (runmatch._matchcount [0] > 0) {
// <
return TidyMatch(quick);
}
// reset state for another go
runtrackpos = runtrack.Length;
runstackpos = runstack.Length;
runcrawlpos = runcrawl.Length;
}
// failure!
if (runtextpos == stoppos) {
TidyMatch(true);
return Match.Empty;
}
// <
// Bump by one and start again
runtextpos += bump;
}
// We never get here
}
private void StartTimeoutWatch() {
if (ignoreTimeout)
return;
timeoutChecksToSkip = TimeoutCheckFrequency;
// We are using Environment.TickCount and not Timewatch for performance reasons.
// Environment.TickCount is an int that cycles. We intentionally let timeoutOccursAt
// overflow it will still stay ahead of Environment.TickCount for comparisons made
// in DoCheckTimeout():
unchecked {
timeoutOccursAt = Environment.TickCount + timeout;
}
}
#if !SILVERLIGHT
protected
#else
internal
#endif
void CheckTimeout() {
if (ignoreTimeout)
return;
if (--timeoutChecksToSkip != 0)
return;
timeoutChecksToSkip = TimeoutCheckFrequency;
DoCheckTimeout();
}
private void DoCheckTimeout() {
// Note that both, Environment.TickCount and timeoutOccursAt are ints and can overflow and become negative.
// See the comment in StartTimeoutWatch().
int currentMillis = Environment.TickCount;
if (currentMillis < timeoutOccursAt)
return;
if (0 > timeoutOccursAt && 0 < currentMillis)
return;
#if DBG
if (runregex.Debug) {
Debug.WriteLine("");
Debug.WriteLine("RegEx match timeout occurred!");
Debug.WriteLine("Specified timeout: " + TimeSpan.FromMilliseconds(timeout).ToString());
Debug.WriteLine("Timeout check frequency: " + TimeoutCheckFrequency);
Debug.WriteLine("Search pattern: " + runregex.pattern);
Debug.WriteLine("Input: " + runtext);
Debug.WriteLine("About to throw RegexMatchTimeoutException.");
}
#endif
throw new RegexMatchTimeoutException(runtext, runregex.pattern, TimeSpan.FromMilliseconds(timeout));
}
/*
* The responsibility of Go() is to run the regular expression at
* runtextpos and call Capture() on all the captured subexpressions,
* then to leave runtextpos at the ending position. It should leave
* runtextpos where it started if there was no match.
*/
protected abstract void Go();
/*
* The responsibility of FindFirstChar() is to advance runtextpos
* until it is at the next position which is a candidate for the
* beginning of a successful match.
*/
protected abstract bool FindFirstChar();
/*
* InitTrackCount must initialize the runtrackcount field; this is
* used to know how large the initial runtrack and runstack arrays
* must be.
*/
protected abstract void InitTrackCount();
/*
* Initializes all the data members that are used by Go()
*/
private void InitMatch() {
// Use a hashtable'ed Match object if the capture numbers are sparse
if (runmatch == null) {
if (runregex.caps != null)
runmatch = new MatchSparse(runregex, runregex.caps, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart);
else
runmatch = new Match (runregex, runregex.capsize, runtext, runtextbeg, runtextend - runtextbeg, runtextstart);
} else {
runmatch.Reset(runregex, runtext, runtextbeg, runtextend, runtextstart);
}
// note we test runcrawl, because it is the last one to be allocated
// If there is an alloc failure in the middle of the three allocations,
// we may still return to reuse this instance, and we want to behave
// as if the allocations didn't occur. (we used to test _trackcount != 0)
if (runcrawl != null) {
runtrackpos = runtrack.Length;
runstackpos = runstack.Length;
runcrawlpos = runcrawl.Length;
return;
}
InitTrackCount();
int tracksize = runtrackcount * 8;
int stacksize = runtrackcount * 8;
if (tracksize < 32)
tracksize = 32;
if (stacksize < 16)
stacksize = 16;
runtrack = new int [tracksize];
runtrackpos = tracksize;
runstack = new int [stacksize];
runstackpos = stacksize;
runcrawl = new int [32];
runcrawlpos = 32;
}
/*
* Put match in its canonical form before returning it.
*/
private Match TidyMatch(bool quick) {
if (!quick) {
Match match = runmatch;
runmatch = null;
match.Tidy(runtextpos);
return match;
} else {
// in quick mode, a successful match returns null, and
// the allocated match object is left in the cache
return null;
}
}
/*
* Called by the implemenation of Go() to increase the size of storage
*/
protected void EnsureStorage() {
if (runstackpos < runtrackcount * 4)
DoubleStack();
if (runtrackpos < runtrackcount * 4)
DoubleTrack();
}
/*
* Called by the implemenation of Go() to decide whether the pos
* at the specified index is a boundary or not. It's just not worth
* emitting inline code for this logic.
*/
protected bool IsBoundary(int index, int startpos, int endpos) {
return (index > startpos && RegexCharClass.IsWordChar(runtext [index - 1])) !=
(index < endpos && RegexCharClass.IsWordChar(runtext [index]));
}
protected bool IsECMABoundary(int index, int startpos, int endpos) {
return (index > startpos && RegexCharClass.IsECMAWordChar(runtext [index - 1])) !=
(index < endpos && RegexCharClass.IsECMAWordChar(runtext [index]));
}
protected static bool CharInSet(char ch, String set, String category) {
string charClass = RegexCharClass.ConvertOldStringsToClass(set, category);
return RegexCharClass.CharInClass(ch, charClass);
}
protected static bool CharInClass(char ch, String charClass) {
return RegexCharClass.CharInClass(ch, charClass);
}
/*
* Called by the implemenation of Go() to increase the size of the
* backtracking stack.
*/
protected void DoubleTrack() {
int [] newtrack;
newtrack = new int [runtrack.Length * 2];
System.Array.Copy(runtrack, 0, newtrack, runtrack.Length, runtrack.Length);
runtrackpos += runtrack.Length;
runtrack = newtrack;
}
/*
* Called by the implemenation of Go() to increase the size of the
* grouping stack.
*/
protected void DoubleStack() {
int [] newstack;
newstack = new int [runstack.Length * 2];
System.Array.Copy(runstack, 0, newstack, runstack.Length, runstack.Length);
runstackpos += runstack.Length;
runstack = newstack;
}
/*
* Increases the size of the longjump unrolling stack.
*/
protected void DoubleCrawl() {
int [] newcrawl;
newcrawl = new int [runcrawl.Length * 2];
System.Array.Copy(runcrawl, 0, newcrawl, runcrawl.Length, runcrawl.Length);
runcrawlpos += runcrawl.Length;
runcrawl = newcrawl;
}
/*
* Save a number on the longjump unrolling stack
*/
protected void Crawl(int i) {
if (runcrawlpos == 0)
DoubleCrawl();
runcrawl [--runcrawlpos] = i;
}
/*
* Remove a number from the longjump unrolling stack
*/
protected int Popcrawl() {
return runcrawl [runcrawlpos++];
}
/*
* Get the height of the stack
*/
protected int Crawlpos() {
return runcrawl.Length - runcrawlpos;
}
/*
* Called by Go() to capture a subexpression. Note that the
* capnum used here has already been mapped to a non-sparse
* index (by the code generator RegexWriter).
*/
protected void Capture(int capnum, int start, int end) {
if (end < start) {
int T;
T = end;
end = start;
start = T;
}
Crawl(capnum);
runmatch.AddMatch(capnum, start, end - start);
}
/*
* Called by Go() to capture a subexpression. Note that the
* capnum used here has already been mapped to a non-sparse
* index (by the code generator RegexWriter).
*/
protected void TransferCapture(int capnum, int uncapnum, int start, int end) {
int start2;
int end2;
// these are the two intervals that are cancelling each other
if (end < start) {
int T;
T = end;
end = start;
start = T;
}
start2 = MatchIndex(uncapnum);
end2 = start2 + MatchLength(uncapnum);
// The new capture gets the innermost defined interval
if (start >= end2) {
end = start;
start = end2;
} else if (end <= start2) {
start = start2;
} else {
if (end > end2)
end = end2;
if (start2 > start)
start = start2;
}
Crawl(uncapnum);
runmatch.BalanceMatch(uncapnum);
if (capnum != -1) {
Crawl(capnum);
runmatch.AddMatch(capnum, start, end - start);
}
}
/*
* Called by Go() to revert the last capture
*/
protected void Uncapture() {
int capnum = Popcrawl();
runmatch.RemoveMatch(capnum);
}
/*
* Call out to runmatch to get around visibility issues
*/
protected bool IsMatched(int cap) {
return runmatch.IsMatched(cap);
}
/*
* Call out to runmatch to get around visibility issues
*/
protected int MatchIndex(int cap) {
return runmatch.MatchIndex(cap);
}
/*
* Call out to runmatch to get around visibility issues
*/
protected int MatchLength(int cap) {
return runmatch.MatchLength(cap);
}
#if DBG
/*
* Dump the current state
*/
internal virtual void DumpState() {
Debug.WriteLine("Text: " + TextposDescription());
Debug.WriteLine("Track: " + StackDescription(runtrack, runtrackpos));
Debug.WriteLine("Stack: " + StackDescription(runstack, runstackpos));
}
internal static String StackDescription(int [] A, int Index) {
StringBuilder Sb = new StringBuilder();
Sb.Append(A.Length - Index);
Sb.Append('/');
Sb.Append(A.Length);
if (Sb.Length < 8)
Sb.Append(' ', 8 - Sb.Length);
Sb.Append("(");
for (int i = Index; i < A.Length; i++) {
if (i > Index)
Sb.Append(' ');
Sb.Append(A [i]);
}
Sb.Append(')');
return Sb.ToString();
}
internal virtual String TextposDescription() {
StringBuilder Sb = new StringBuilder();
int remaining;
Sb.Append(runtextpos);
if (Sb.Length < 8)
Sb.Append(' ', 8 - Sb.Length);
if (runtextpos > runtextbeg)
Sb.Append(RegexCharClass.CharDescription(runtext [runtextpos - 1]));
else
Sb.Append('^');
Sb.Append('>');
remaining = runtextend - runtextpos;
for (int i = runtextpos; i < runtextend; i++) {
Sb.Append(RegexCharClass.CharDescription(runtext [i]));
}
if (Sb.Length >= 64) {
Sb.Length = 61;
Sb.Append("...");
} else {
Sb.Append('$');
}
return Sb.ToString();
}
#endif
}
}