//
|
||
|
// assembly: System
|
||
|
// namespace: System.Text.RegularExpressions
|
||
|
// file: arch.cs
|
||
|
//
|
||
|
// author: Dan Lewis (dlewis@gmx.co.uk)
|
||
|
// (c) 2002
|
||
|
|
||
|
//
|
||
|
// Permission is hereby granted, free of charge, to any person obtaining
|
||
|
// a copy of this software and associated documentation files (the
|
||
|
// "Software"), to deal in the Software without restriction, including
|
||
|
// without limitation the rights to use, copy, modify, merge, publish,
|
||
|
// distribute, sublicense, and/or sell copies of the Software, and to
|
||
|
// permit persons to whom the Software is furnished to do so, subject to
|
||
|
// the following conditions:
|
||
|
//
|
||
|
// The above copyright notice and this permission notice shall be
|
||
|
// included in all copies or substantial portions of the Software.
|
||
|
//
|
||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||
|
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
||
|
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||
|
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||
|
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||
|
//
|
||
|
|
||
|
using System;
|
||
|
using System.Collections;
|
||
|
|
||
|
namespace System.Text.RegularExpressions {
|
||
|
|
||
|
// Operation codes of the regular-expression instruction set. The operand
// layout of the individual instructions is described in the comment block
// further down in this file.
//
// Only False carries an explicit value; every other member is numbered
// implicitly from its position, so reordering members changes their numeric
// values.
enum OpCode : ushort {
    False = 0,      // always fails
    True,           // always succeeds

    // matching

    Position,       // zero-width position assertion
    String,         // match string literal
    Reference,      // back reference

    // character matching

    Character,      // match character exactly
    Category,       // match character from category
    NotCategory,    // match character _not_ from category
    Range,          // match character from range
    Set,            // match character from set
    In,             // match character from group of tests

    // capturing

    Open,           // open group
    Close,          // close group
    Balance,        // balance groups
    BalanceStart,   // track balance group length

    // control flow

    IfDefined,      // conditional on capture
    Sub,            // non-backtracking subexpression
    Test,           // non-backtracking lookahead/behind
    Branch,         // alternative expression
    Jump,           // unconditional goto
    Repeat,         // new repeat context
    Until,          // repeat subexpression within context
    FastRepeat,     // repeat simple subexpression
    Anchor,         // anchoring expression

    // miscellaneous

    Info            // pattern information
}
|
||
|
|
||
|
// Modifier flags for instructions. Each flag is a single bit in the range
// 0x100-0x800, i.e. above every value defined by OpCode (0-24).
// NOTE(review): presumably this lets a flag set be OR-ed together with an
// OpCode in one ushort slot -- confirm against the compiler/interpreter.
[Flags]
enum OpFlags : ushort {
    None        = 0x000,
    Negate      = 0x100,    // succeed on mismatch
    IgnoreCase  = 0x200,    // case insensitive matching
    RightToLeft = 0x400,    // right-to-left matching
    Lazy        = 0x800     // minimizing repeat
}
|
||
|
|
||
|
// Kinds of zero-width position assertion; this is the POS operand of the
// Position instruction. Members are numbered implicitly from Any = 0, so
// reordering them changes their numeric values.
//
// NOTE(review): Start and StartOfString carry the same description below;
// the difference between the two is not visible in this file -- confirm
// against the interpreter before relying on either.
enum Position : ushort {
    Any,            // anywhere
    Start,          // start of string                  \A
    StartOfString,  // start of string                  \A
    StartOfLine,    // start of line                    ^
    StartOfScan,    // start of scan                    \G
    End,            // end or before newline at end     \Z
    EndOfString,    // end of string                    \z
    EndOfLine,      // end of line                      $
    Boundary,       // word boundary                    \b
    NonBoundary     // not word boundary                \B
}
|
||
|
|
||
|
// see category.cs for Category enum
|
||
|
|
||
|
// Operations exposed by a regex machine: scanning for a match, splitting,
// replacing, and expanding a replacement against an existing match.
// NOTE(review): the count/startat parameters presumably mirror the public
// Regex.Split / Regex.Replace overloads -- confirm against the callers.
interface IMachine {
    // Returns the Match produced for `regex` over `text`, given the
    // positions `start` and `end`.
    Match Scan (Regex regex, string text, int start, int end);

    // Returns the pieces of `input` produced by splitting with `regex`.
    string [] Split (Regex regex, string input, int count, int startat);

    // Returns `input` with matches of `regex` substituted by `replacement`.
    string Replace (Regex regex, string input, string replacement, int count, int startat);

    // Returns the string obtained by applying `replacement` to `match`.
    string Result (string replacement, Match match);
}
|
||
|
|
||
|
// Factory for IMachine instances, together with the group metadata
// (mapping, count, gap, names) that is exposed alongside them.
interface IMachineFactory {
    // Returns an IMachine.
    // NOTE(review): the name suggests a fresh instance per call -- confirm
    // against the implementations.
    IMachine NewInstance ();

    // Group mapping dictionary.
    // NOTE(review): key/value types are not visible in this file.
    IDictionary Mapping { get; set; }

    // Number of groups (read-only).
    int GroupCount { get; }

    // Index of first group whose number differs from its index, or 1+GroupCount
    int Gap { get; set; }

    // Group names as an array.
    // NOTE(review): indexing convention is not visible in this file.
    string [] NamesMapping { get; set; }
}
|
||
|
|
||
|
// Anchor SKIP OFFSET
|
||
|
//
|
||
|
// Flags: [RightToLeft] ??
|
||
|
// SKIP: relative address of tail expression
|
||
|
// OFFSET: offset of anchor from start of pattern
|
||
|
//
|
||
|
// Usage:
|
||
|
//
|
||
|
// Anchor :1 OFFSET
|
||
|
// <expr>
|
||
|
// True
|
||
|
// 1: <tail>
|
||
|
//
|
||
|
// Notes:
|
||
|
//
|
||
|
// In practice, the anchoring expression is only going to be
|
||
|
// Position (StartOfString, StartOfLine, StartOfScan) or String.
|
||
|
// This is because the optimizer looks for position anchors at the
|
||
|
// start of the expression, and if that fails it looks for the
|
||
|
// longest substring. If an expression has neither a position
|
||
|
// anchor nor a longest substring anchor, then the anchoring expression
|
||
|
// is left empty. Since an empty expression will anchor at any
|
||
|
// position in any string, the entire input string will be scanned.
|
||
|
|
||
|
// String LEN STR...
|
||
|
//
|
||
|
// Flags: [RightToLeft, IgnoreCase]
|
||
|
// LEN: length of string
|
||
|
// STR: string characters
|
||
|
|
||
|
// Branch SKIP
|
||
|
//
|
||
|
// SKIP: relative address of next branch
|
||
|
//
|
||
|
// Branch :1
|
||
|
// <alt expr 1>
|
||
|
// Jump :4
|
||
|
// 1: Branch :2
|
||
|
// <alt expr 2>
|
||
|
// Jump :4
|
||
|
// 2: Branch :3
|
||
|
// <alt expr 3>
|
||
|
// Jump :4
|
||
|
// 3: False
|
||
|
// 4: <tail>
|
||
|
|
||
|
// Repeat SKIP MIN MAX
|
||
|
//
|
||
|
// Flags: [Lazy]
|
||
|
// SKIP: relative address of Until instruction
|
||
|
// MIN: minimum iterations (2 slots)
|
||
|
// MAX: maximum iterations (2 slots, 0x7fffffff is infinity)
|
||
|
//
|
||
|
// Repeat :1 MIN MAX
|
||
|
// <expr>
|
||
|
// Until
|
||
|
// 1: <tail>
|
||
|
|
||
|
// FastRepeat SKIP MIN MAX
|
||
|
//
|
||
|
// Flags: [Lazy]
|
||
|
// SKIP: relative address of tail expression
|
||
|
// MIN: minimum iterations (2 slots)
|
||
|
// MAX: maximum iterations (2 slots, 0x7fffffff is infinity)
|
||
|
//
|
||
|
// FastRepeat :1 MIN MAX
|
||
|
// <expr>
|
||
|
// True
|
||
|
// 1: <tail>
|
||
|
//
|
||
|
// Notes:
|
||
|
//
|
||
|
// The subexpression of a FastRepeat construct must not contain any
|
||
|
// complex operators. These include: Open, Close, Balance, Repeat,
|
||
|
// FastRepeat, Sub, Test. In addition, the subexpression must have
|
||
|
// been determined to have a fixed width.
|
||
|
|
||
|
// Sub SKIP
|
||
|
//
|
||
|
// SKIP: relative address of tail expression
|
||
|
//
|
||
|
// Sub :1
|
||
|
// <expr>
|
||
|
// 1: <tail>
|
||
|
//
|
||
|
// Notes:
|
||
|
//
|
||
|
// The Sub operator invokes an independent subexpression. This means
|
||
|
// that the subexpression will match only once and so will not
|
||
|
// participate in any backtracking.
|
||
|
|
||
|
// Test TSKIP FSKIP
|
||
|
//
|
||
|
// TSKIP: relative address of true expression
|
||
|
// FSKIP: relative address of false expression
|
||
|
//
|
||
|
// Usage: (?(?=test)true|false)
|
||
|
//
|
||
|
// Test :1 :2
|
||
|
// <test expr>
|
||
|
// 1: <true expr>
|
||
|
// Jump :3
// 2: <false expr>
// 3: <tail>
|
||
|
//
|
||
|
// Usage: (?(?=test)true)
|
||
|
//
|
||
|
// Test :1 :2
|
||
|
// <test expr>
|
||
|
// 1: <true expr>
|
||
|
// 2: <tail>
|
||
|
//
|
||
|
// Usage: (?=test)
|
||
|
//
|
||
|
// Test :1 :2
|
||
|
// <test expr>
|
||
|
// 1: <true expr>
|
||
|
// Jump :3
|
||
|
// 2: False
|
||
|
// 3: <tail>
|
||
|
//
|
||
|
// Notes:
|
||
|
//
|
||
|
// For negative lookaheads, just swap the values of TSKIP and
|
||
|
// FSKIP. For lookbehinds, the test expression must be compiled
|
||
|
// in reverse. The test expression is always executed as an
|
||
|
// independent subexpression, so its behaviour is non-backtracking
|
||
|
// (like a Sub clause.)
|
||
|
|
||
|
// IfDefined SKIP GID
|
||
|
//
|
||
|
// SKIP: relative address of else expression
|
||
|
// GID: number of group to check
|
||
|
//
|
||
|
// Usage: (?(gid)true)
|
||
|
//
|
||
|
// IfDefined :1 GID
|
||
|
// <true expr>
|
||
|
// 1: <tail>
|
||
|
//
|
||
|
// Usage: (?(gid)true|false)
|
||
|
//
|
||
|
// IfDefined :1 GID
|
||
|
// <true expr>
|
||
|
// Jump :2
|
||
|
// 1: <false expr>
|
||
|
// 2: <tail>
|
||
|
|
||
|
// Jump SKIP
|
||
|
//
|
||
|
// SKIP: relative address of target expression
|
||
|
//
|
||
|
// Jump :1
|
||
|
// ...
|
||
|
// 1: <target expr>
|
||
|
|
||
|
// Character CHAR
|
||
|
//
|
||
|
// Flags: [Negate, IgnoreCase, RightToLeft]
|
||
|
// CHAR: exact character to match
|
||
|
|
||
|
// Category CAT
|
||
|
//
|
||
|
// Flags: [Negate, RightToLeft]
|
||
|
// CAT: category to match (see Category enum)
|
||
|
|
||
|
// Range LO HI
|
||
|
//
|
||
|
// Flags: [Negate, IgnoreCase, RightToLeft]
|
||
|
// LO: lowest character in range
|
||
|
// HI: highest character in range
|
||
|
|
||
|
// Set LO LEN SET...
|
||
|
//
|
||
|
// Flags: [Negate, IgnoreCase, RightToLeft]
|
||
|
// LO: lowest character in set
|
||
|
// LEN: number of words in set
|
||
|
// SET: bit array representing characters in set
|
||
|
//
|
||
|
// Notes:
|
||
|
//
|
||
|
// Each word in the set represents 16 characters, so the first word
|
||
|
// defines membership for characters LO to LO + 15, the second for
|
||
|
// LO + 16 to LO + 31, and so on up to LO + (LEN * 16 - 1). It is
|
||
|
// up to the compiler to provide a compact representation for sparse
|
||
|
// unicode sets. The simple way is to use Set 0 4096. Other methods
|
||
|
// involve partitioning the set and placing the components into an
|
||
|
// In block.
|
||
|
|
||
|
// In SKIP
|
||
|
//
|
||
|
// SKIP: relative address of tail expression
|
||
|
//
|
||
|
// Usage: [expr]
|
||
|
//
|
||
|
// In :1
|
||
|
// <expr>
|
||
|
// True
|
||
|
// 1: <tail>
|
||
|
//
|
||
|
// Usage: [^expr]
|
||
|
//
|
||
|
// In :1
|
||
|
// <expr>
|
||
|
// False
|
||
|
// 1: <tail>
|
||
|
//
|
||
|
// Notes:
|
||
|
//
|
||
|
// The In instruction consumes a single character, using the flags
|
||
|
// of the first instruction in the subexpression to determine its
|
||
|
// IgnoreCase and RightToLeft properties. The subexpression is then
|
||
|
// applied to the single character as a disjunction. If any instruction
|
||
|
// in the subexpression succeeds, the entire In construct succeeds
|
||
|
// and matching continues with the tail.
|
||
|
|
||
|
// Position POS
|
||
|
//
|
||
|
// POS: position to match (see Position enum)
|
||
|
|
||
|
// Open GID
|
||
|
//
|
||
|
// GID: number of group to open
|
||
|
|
||
|
// Close GID
|
||
|
//
|
||
|
// GID: number of group to close
|
||
|
|
||
|
// Balance GID BAL
|
||
|
//
|
||
|
// GID: number of capturing group (0 if none)
|
||
|
// BAL: number of group to undefine
|
||
|
|
||
|
// Info GROUPS MIN MAX
|
||
|
//
|
||
|
// GROUPS: number of capturing groups (2 slots)
|
||
|
// MIN: minimum width of pattern (2 slots)
|
||
|
// MAX: maximum width of pattern (2 slots, 0x7fffffff means undefined)
|
||
|
|
||
|
// False
|
||
|
|
||
|
// True
|
||
|
|
||
|
// Reference GID
|
||
|
//
|
||
|
// Flags: [IgnoreCase, RightToLeft]
|
||
|
// GID: number of group to reference
|
||
|
}
|