1245 lines
29 KiB
C#
1245 lines
29 KiB
C#
|
//
|
||
|
// assembly: System
|
||
|
// namespace: System.Text.RegularExpressions
|
||
|
// file: parser.cs
|
||
|
//
|
||
|
// author: Dan Lewis (dlewis@gmx.co.uk)
|
||
|
// (c) 2002
|
||
|
|
||
|
//
|
||
|
// Permission is hereby granted, free of charge, to any person obtaining
|
||
|
// a copy of this software and associated documentation files (the
|
||
|
// "Software"), to deal in the Software without restriction, including
|
||
|
// without limitation the rights to use, copy, modify, merge, publish,
|
||
|
// distribute, sublicense, and/or sell copies of the Software, and to
|
||
|
// permit persons to whom the Software is furnished to do so, subject to
|
||
|
// the following conditions:
|
||
|
//
|
||
|
// The above copyright notice and this permission notice shall be
|
||
|
// included in all copies or substantial portions of the Software.
|
||
|
//
|
||
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||
|
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
||
|
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||
|
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||
|
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||
|
//
|
||
|
|
||
|
using System;
|
||
|
using System.Collections;
|
||
|
using System.Globalization;
|
||
|
|
||
|
namespace System.Text.RegularExpressions.Syntax {
|
||
|
|
||
|
class Parser {
|
||
|
public static int ParseDecimal (string str, ref int ptr) {
|
||
|
return ParseNumber (str, ref ptr, 10, 1, Int32.MaxValue);
|
||
|
}
|
||
|
|
||
|
public static int ParseOctal (string str, ref int ptr) {
|
||
|
return ParseNumber (str, ref ptr, 8, 1, 3);
|
||
|
}
|
||
|
|
||
|
public static int ParseHex (string str, ref int ptr, int digits) {
|
||
|
return ParseNumber (str, ref ptr, 16, digits, digits);
|
||
|
}
|
||
|
|
||
|
public static int ParseNumber (string str, ref int ptr, int b, int min, int max) {
|
||
|
int p = ptr, n = 0, digits = 0, d;
|
||
|
if (max < min)
|
||
|
max = Int32.MaxValue;
|
||
|
|
||
|
while (digits < max && p < str.Length) {
|
||
|
d = ParseDigit (str[p ++], b, digits);
|
||
|
if (d < 0) {
|
||
|
-- p;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
n = n * b + d;
|
||
|
++ digits;
|
||
|
}
|
||
|
|
||
|
if (digits < min)
|
||
|
return -1;
|
||
|
|
||
|
ptr = p;
|
||
|
return n;
|
||
|
}
|
||
|
|
||
|
public static string ParseName (string str, ref int ptr) {
|
||
|
if (Char.IsDigit (str[ptr])) {
|
||
|
int gid = ParseNumber (str, ref ptr, 10, 1, 0);
|
||
|
if (gid > 0)
|
||
|
return gid.ToString ();
|
||
|
|
||
|
return null;
|
||
|
}
|
||
|
|
||
|
int start = ptr;
|
||
|
for (;;) {
|
||
|
if (!IsNameChar (str[ptr]))
|
||
|
break;
|
||
|
++ ptr;
|
||
|
}
|
||
|
|
||
|
if (ptr - start > 0)
|
||
|
return str.Substring (start, ptr - start);
|
||
|
|
||
|
return null;
|
||
|
}
|
||
|
|
||
|
public static string Escape (string str) {
|
||
|
string result = "";
|
||
|
for (int i = 0; i < str.Length; ++ i) {
|
||
|
char c = str[i];
|
||
|
switch (c) {
|
||
|
case '\\': case '*': case '+': case '?': case '|':
|
||
|
case '{': case '[': case '(': case ')': case '^':
|
||
|
case '$': case '.': case '#': case ' ':
|
||
|
result += "\\" + c;
|
||
|
break;
|
||
|
|
||
|
case '\t': result += "\\t"; break;
|
||
|
case '\n': result += "\\n"; break;
|
||
|
case '\r': result += "\\r"; break;
|
||
|
case '\f': result += "\\f"; break;
|
||
|
|
||
|
default: result += c; break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
public static string Unescape (string str) {
|
||
|
if (str.IndexOf ('\\') == -1)
|
||
|
return str;
|
||
|
return new Parser ().ParseString (str);
|
||
|
}
|
||
|
|
||
|
// public instance
|
||
|
|
||
|
public Parser () {
|
||
|
this.caps = new ArrayList ();
|
||
|
this.refs = new Hashtable ();
|
||
|
}
|
||
|
|
||
|
public RegularExpression ParseRegularExpression (string pattern, RegexOptions options) {
|
||
|
this.pattern = pattern;
|
||
|
this.ptr = 0;
|
||
|
|
||
|
caps.Clear ();
|
||
|
refs.Clear ();
|
||
|
this.num_groups = 0;
|
||
|
|
||
|
try {
|
||
|
RegularExpression re = new RegularExpression ();
|
||
|
ParseGroup (re, options, null);
|
||
|
ResolveReferences ();
|
||
|
|
||
|
re.GroupCount = num_groups;
|
||
|
|
||
|
return re;
|
||
|
}
|
||
|
catch (IndexOutOfRangeException) {
|
||
|
throw NewParseException ("Unexpected end of pattern.");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
public int GetMapping (Hashtable mapping)
|
||
|
{
|
||
|
int end = caps.Count;
|
||
|
mapping.Add ("0", 0);
|
||
|
for (int i = 0; i < end; i++) {
|
||
|
CapturingGroup group = (CapturingGroup) caps [i];
|
||
|
string name = group.Name != null ? group.Name : group.Index.ToString ();
|
||
|
if (mapping.Contains (name)) {
|
||
|
if ((int) mapping [name] != group.Index)
|
||
|
throw new SystemException ("invalid state");
|
||
|
continue;
|
||
|
}
|
||
|
mapping.Add (name, group.Index);
|
||
|
}
|
||
|
|
||
|
return gap;
|
||
|
}
|
||
|
|
||
|
// private methods
|
||
|
|
||
|
private void ParseGroup (Group group, RegexOptions options, Assertion assertion) {
|
||
|
bool is_top_level = group is RegularExpression;
|
||
|
|
||
|
Alternation alternation = null;
|
||
|
string literal = null;
|
||
|
|
||
|
Group current = new Group ();
|
||
|
Expression expr = null;
|
||
|
bool closed = false;
|
||
|
|
||
|
while (true) {
|
||
|
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
|
||
|
if (ptr >= pattern.Length)
|
||
|
break;
|
||
|
|
||
|
// (1) Parse for Expressions
|
||
|
|
||
|
char ch = pattern[ptr ++];
|
||
|
|
||
|
switch (ch) {
|
||
|
case '^': {
|
||
|
Position pos =
|
||
|
IsMultiline (options) ? Position.StartOfLine : Position.Start;
|
||
|
expr = new PositionAssertion (pos);
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case '$': {
|
||
|
Position pos =
|
||
|
IsMultiline (options) ? Position.EndOfLine : Position.End;
|
||
|
expr = new PositionAssertion (pos);
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case '.': {
|
||
|
Category cat =
|
||
|
IsSingleline (options) ? Category.AnySingleline : Category.Any;
|
||
|
expr = new CharacterClass (cat, false);
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case '\\': {
|
||
|
int c = ParseEscape (false);
|
||
|
if (c >= 0)
|
||
|
ch = (char)c;
|
||
|
else {
|
||
|
expr = ParseSpecial (options);
|
||
|
|
||
|
if (expr == null)
|
||
|
ch = pattern[ptr ++]; // default escape
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case '[': {
|
||
|
expr = ParseCharacterClass (options);
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case '(': {
|
||
|
bool ignore = IsIgnoreCase (options);
|
||
|
expr = ParseGroupingConstruct (ref options);
|
||
|
if (expr == null) {
|
||
|
if (literal != null && IsIgnoreCase (options) != ignore) {
|
||
|
current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
|
||
|
literal = null;
|
||
|
}
|
||
|
|
||
|
continue;
|
||
|
}
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case ')': {
|
||
|
closed = true;
|
||
|
goto EndOfGroup;
|
||
|
}
|
||
|
|
||
|
case '|': {
|
||
|
if (literal != null) {
|
||
|
current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
|
||
|
literal = null;
|
||
|
}
|
||
|
|
||
|
if (assertion != null) {
|
||
|
if (assertion.TrueExpression == null)
|
||
|
assertion.TrueExpression = current;
|
||
|
else if (assertion.FalseExpression == null)
|
||
|
assertion.FalseExpression = current;
|
||
|
else
|
||
|
throw NewParseException ("Too many | in (?()|).");
|
||
|
}
|
||
|
else {
|
||
|
if (alternation == null)
|
||
|
alternation = new Alternation ();
|
||
|
|
||
|
alternation.AddAlternative (current);
|
||
|
}
|
||
|
|
||
|
current = new Group ();
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
case '*': case '+': case '?': {
|
||
|
throw NewParseException ("Bad quantifier.");
|
||
|
}
|
||
|
|
||
|
default:
|
||
|
break; // literal character
|
||
|
}
|
||
|
|
||
|
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
|
||
|
|
||
|
// (2) Check for Repetitions
|
||
|
|
||
|
if (ptr < pattern.Length) {
|
||
|
char k = pattern[ptr];
|
||
|
int min = 0, max = 0;
|
||
|
bool lazy = false;
|
||
|
bool haveRep = false;
|
||
|
|
||
|
|
||
|
if (k == '?' || k == '*' || k == '+') {
|
||
|
++ ptr;
|
||
|
haveRep = true;
|
||
|
|
||
|
switch (k) {
|
||
|
case '?': min = 0; max = 1; break;
|
||
|
case '*': min = 0; max = 0x7fffffff; break;
|
||
|
case '+': min = 1; max = 0x7fffffff; break;
|
||
|
}
|
||
|
} else if (k == '{' && ptr + 1 < pattern.Length) {
|
||
|
int saved_ptr = ptr;
|
||
|
++ptr;
|
||
|
haveRep = ParseRepetitionBounds (out min, out max, options);
|
||
|
if (!haveRep)
|
||
|
ptr = saved_ptr;
|
||
|
}
|
||
|
|
||
|
if (haveRep) {
|
||
|
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
|
||
|
if (ptr < pattern.Length && pattern[ptr] == '?') {
|
||
|
++ ptr;
|
||
|
lazy = true;
|
||
|
}
|
||
|
|
||
|
//It doesn't make sense to assert a given position more than once.
|
||
|
bool ignore_repetition = false;
|
||
|
if (expr is PositionAssertion) {
|
||
|
ignore_repetition = min > 0 && !lazy;
|
||
|
max = 1;
|
||
|
}
|
||
|
|
||
|
if (!ignore_repetition) {
|
||
|
Repetition repetition = new Repetition (min, max, lazy);
|
||
|
|
||
|
if (expr == null)
|
||
|
repetition.Expression = new Literal (ch.ToString (), IsIgnoreCase (options));
|
||
|
else
|
||
|
repetition.Expression = expr;
|
||
|
|
||
|
expr = repetition;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// (3) Append Expression and/or Literal
|
||
|
|
||
|
if (expr == null) {
|
||
|
if (literal == null)
|
||
|
literal = "";
|
||
|
literal += ch;
|
||
|
}
|
||
|
else {
|
||
|
if (literal != null) {
|
||
|
current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
|
||
|
literal = null;
|
||
|
}
|
||
|
|
||
|
current.AppendExpression (expr);
|
||
|
expr = null;
|
||
|
}
|
||
|
|
||
|
if (is_top_level && ptr >= pattern.Length)
|
||
|
goto EndOfGroup;
|
||
|
}
|
||
|
|
||
|
EndOfGroup:
|
||
|
if (is_top_level && closed)
|
||
|
throw NewParseException ("Too many )'s.");
|
||
|
if (!is_top_level && !closed)
|
||
|
throw NewParseException ("Not enough )'s.");
|
||
|
|
||
|
|
||
|
// clean up literals and alternations
|
||
|
|
||
|
if (literal != null)
|
||
|
current.AppendExpression (new Literal (literal, IsIgnoreCase (options)));
|
||
|
|
||
|
if (assertion != null) {
|
||
|
if (assertion.TrueExpression == null)
|
||
|
assertion.TrueExpression = current;
|
||
|
else
|
||
|
assertion.FalseExpression = current;
|
||
|
|
||
|
group.AppendExpression (assertion);
|
||
|
}
|
||
|
else if (alternation != null) {
|
||
|
alternation.AddAlternative (current);
|
||
|
group.AppendExpression (alternation);
|
||
|
}
|
||
|
else
|
||
|
group.AppendExpression (current);
|
||
|
}
|
||
|
|
||
|
private Expression ParseGroupingConstruct (ref RegexOptions options) {
|
||
|
if (pattern[ptr] != '?') {
|
||
|
Group group;
|
||
|
|
||
|
if (IsExplicitCapture (options))
|
||
|
group = new Group ();
|
||
|
else {
|
||
|
group = new CapturingGroup ();
|
||
|
caps.Add (group);
|
||
|
}
|
||
|
|
||
|
ParseGroup (group, options, null);
|
||
|
return group;
|
||
|
}
|
||
|
else
|
||
|
++ ptr;
|
||
|
|
||
|
switch (pattern[ptr]) {
|
||
|
case ':': { // non-capturing group
|
||
|
++ ptr;
|
||
|
Group group = new Group ();
|
||
|
ParseGroup (group, options, null);
|
||
|
|
||
|
return group;
|
||
|
}
|
||
|
|
||
|
case '>': { // non-backtracking group
|
||
|
++ ptr;
|
||
|
Group group = new NonBacktrackingGroup ();
|
||
|
ParseGroup (group, options, null);
|
||
|
|
||
|
return group;
|
||
|
}
|
||
|
|
||
|
case 'i': case 'm': case 'n':
|
||
|
case 's': case 'x': case '-': { // options
|
||
|
RegexOptions o = options;
|
||
|
ParseOptions (ref o, false);
|
||
|
if (pattern[ptr] == '-') {
|
||
|
++ ptr;
|
||
|
ParseOptions (ref o, true);
|
||
|
}
|
||
|
|
||
|
if (pattern[ptr] == ':') { // pass options to child group
|
||
|
++ ptr;
|
||
|
Group group = new Group ();
|
||
|
ParseGroup (group, o, null);
|
||
|
return group;
|
||
|
}
|
||
|
else if (pattern[ptr] == ')') { // change options of enclosing group
|
||
|
++ ptr;
|
||
|
options = o;
|
||
|
return null;
|
||
|
}
|
||
|
else
|
||
|
throw NewParseException ("Bad options");
|
||
|
}
|
||
|
|
||
|
case '<': case '=': case '!': { // lookahead/lookbehind
|
||
|
ExpressionAssertion asn = new ExpressionAssertion ();
|
||
|
if (!ParseAssertionType (asn))
|
||
|
goto case '\''; // it's a (?<name> ) construct
|
||
|
|
||
|
Group test = new Group ();
|
||
|
ParseGroup (test, options, null);
|
||
|
|
||
|
asn.TestExpression = test;
|
||
|
return asn;
|
||
|
}
|
||
|
|
||
|
case '\'': { // named/balancing group
|
||
|
char delim;
|
||
|
if (pattern[ptr] == '<')
|
||
|
delim = '>';
|
||
|
else
|
||
|
delim = '\'';
|
||
|
|
||
|
++ ptr;
|
||
|
string name = ParseName ();
|
||
|
|
||
|
if (pattern[ptr] == delim) {
|
||
|
// capturing group
|
||
|
|
||
|
if (name == null)
|
||
|
throw NewParseException ("Bad group name.");
|
||
|
|
||
|
++ ptr;
|
||
|
CapturingGroup cap = new CapturingGroup ();
|
||
|
cap.Name = name;
|
||
|
caps.Add (cap);
|
||
|
ParseGroup (cap, options, null);
|
||
|
|
||
|
return cap;
|
||
|
}
|
||
|
else if (pattern[ptr] == '-') {
|
||
|
// balancing group
|
||
|
|
||
|
++ ptr;
|
||
|
string balance_name = ParseName ();
|
||
|
if (balance_name == null || pattern[ptr] != delim)
|
||
|
throw NewParseException ("Bad balancing group name.");
|
||
|
|
||
|
++ ptr;
|
||
|
BalancingGroup bal = new BalancingGroup ();
|
||
|
bal.Name = name;
|
||
|
|
||
|
if(bal.IsNamed) {
|
||
|
caps.Add (bal);
|
||
|
}
|
||
|
|
||
|
refs.Add (bal, balance_name);
|
||
|
|
||
|
ParseGroup (bal, options, null);
|
||
|
|
||
|
return bal;
|
||
|
}
|
||
|
else
|
||
|
throw NewParseException ("Bad group name.");
|
||
|
}
|
||
|
|
||
|
case '(': { // expression/capture test
|
||
|
Assertion asn;
|
||
|
|
||
|
++ ptr;
|
||
|
int p = ptr;
|
||
|
string name = ParseName ();
|
||
|
if (name == null || pattern[ptr] != ')') { // expression test
|
||
|
// FIXME MS implementation doesn't seem to
|
||
|
// implement this version of (?(x) ...)
|
||
|
|
||
|
ptr = p;
|
||
|
ExpressionAssertion expr_asn = new ExpressionAssertion ();
|
||
|
|
||
|
if (pattern[ptr] == '?') {
|
||
|
++ ptr;
|
||
|
if (!ParseAssertionType (expr_asn))
|
||
|
throw NewParseException ("Bad conditional.");
|
||
|
}
|
||
|
else {
|
||
|
expr_asn.Negate = false;
|
||
|
expr_asn.Reverse = false;
|
||
|
}
|
||
|
|
||
|
Group test = new Group ();
|
||
|
ParseGroup (test, options, null);
|
||
|
expr_asn.TestExpression = test;
|
||
|
asn = expr_asn;
|
||
|
}
|
||
|
else { // capture test
|
||
|
++ ptr;
|
||
|
asn = new CaptureAssertion (new Literal (name, IsIgnoreCase (options)));
|
||
|
refs.Add (asn, name);
|
||
|
}
|
||
|
|
||
|
Group group = new Group ();
|
||
|
ParseGroup (group, options, asn);
|
||
|
return group;
|
||
|
}
|
||
|
|
||
|
case '#': { // comment
|
||
|
++ ptr;
|
||
|
while (pattern[ptr ++] != ')') {
|
||
|
if (ptr >= pattern.Length)
|
||
|
throw NewParseException ("Unterminated (?#...) comment.");
|
||
|
}
|
||
|
return null;
|
||
|
}
|
||
|
|
||
|
default: // error
|
||
|
throw NewParseException ("Bad grouping construct.");
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private bool ParseAssertionType (ExpressionAssertion assertion) {
|
||
|
if (pattern[ptr] == '<') {
|
||
|
switch (pattern[ptr + 1]) {
|
||
|
case '=':
|
||
|
assertion.Negate = false;
|
||
|
break;
|
||
|
case '!':
|
||
|
assertion.Negate = true;
|
||
|
break;
|
||
|
default:
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
assertion.Reverse = true;
|
||
|
ptr += 2;
|
||
|
}
|
||
|
else {
|
||
|
switch (pattern[ptr]) {
|
||
|
case '=':
|
||
|
assertion.Negate = false;
|
||
|
break;
|
||
|
case '!':
|
||
|
assertion.Negate = true;
|
||
|
break;
|
||
|
default:
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
assertion.Reverse = false;
|
||
|
ptr += 1;
|
||
|
}
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
private void ParseOptions (ref RegexOptions options, bool negate) {
|
||
|
for (;;) {
|
||
|
switch (pattern[ptr]) {
|
||
|
case 'i':
|
||
|
if (negate)
|
||
|
options &= ~RegexOptions.IgnoreCase;
|
||
|
else
|
||
|
options |= RegexOptions.IgnoreCase;
|
||
|
break;
|
||
|
|
||
|
case 'm':
|
||
|
if (negate)
|
||
|
options &= ~RegexOptions.Multiline;
|
||
|
else
|
||
|
options |= RegexOptions.Multiline;
|
||
|
break;
|
||
|
|
||
|
case 'n':
|
||
|
if (negate)
|
||
|
options &= ~RegexOptions.ExplicitCapture;
|
||
|
else
|
||
|
options |= RegexOptions.ExplicitCapture;
|
||
|
break;
|
||
|
|
||
|
case 's':
|
||
|
if (negate)
|
||
|
options &= ~RegexOptions.Singleline;
|
||
|
else
|
||
|
options |= RegexOptions.Singleline;
|
||
|
break;
|
||
|
|
||
|
case 'x':
|
||
|
if (negate)
|
||
|
options &= ~RegexOptions.IgnorePatternWhitespace;
|
||
|
else
|
||
|
options |= RegexOptions.IgnorePatternWhitespace;
|
||
|
break;
|
||
|
|
||
|
default:
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
++ ptr;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private Expression ParseCharacterClass (RegexOptions options) {
|
||
|
bool negate = false;
|
||
|
if (pattern[ptr] == '^') {
|
||
|
negate = true;
|
||
|
++ ptr;
|
||
|
}
|
||
|
|
||
|
bool ecma = IsECMAScript (options);
|
||
|
CharacterClass cls = new CharacterClass (negate, IsIgnoreCase (options));
|
||
|
|
||
|
if (pattern[ptr] == ']') {
|
||
|
cls.AddCharacter (']');
|
||
|
++ ptr;
|
||
|
}
|
||
|
|
||
|
int c = -1;
|
||
|
int last = -1;
|
||
|
bool range = false;
|
||
|
bool closed = false;
|
||
|
while (ptr < pattern.Length) {
|
||
|
c = pattern[ptr ++];
|
||
|
|
||
|
if (c == ']') {
|
||
|
closed = true;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (c == '-' && last >= 0 && !range) {
|
||
|
range = true;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if (c == '\\') {
|
||
|
c = ParseEscape (true);
|
||
|
if (c >= 0)
|
||
|
goto char_recognized;
|
||
|
|
||
|
// didn't recognize escape
|
||
|
c = pattern [ptr ++];
|
||
|
switch (c) {
|
||
|
case 'b':
|
||
|
c = '\b';
|
||
|
goto char_recognized;
|
||
|
|
||
|
case 'd': case 'D':
|
||
|
cls.AddCategory (ecma ? Category.EcmaDigit : Category.Digit, c == 'D');
|
||
|
break;
|
||
|
|
||
|
case 'w': case 'W':
|
||
|
cls.AddCategory (ecma ? Category.EcmaWord : Category.Word, c == 'W');
|
||
|
break;
|
||
|
|
||
|
case 's': case 'S':
|
||
|
cls.AddCategory (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, c == 'S');
|
||
|
break;
|
||
|
|
||
|
case 'p': case 'P':
|
||
|
cls.AddCategory (ParseUnicodeCategory (), c == 'P'); // ignore ecma
|
||
|
break;
|
||
|
|
||
|
default: // add escaped character
|
||
|
goto char_recognized;
|
||
|
}
|
||
|
|
||
|
// if the pattern looks like [a-\s] ...
|
||
|
if (range)
|
||
|
throw NewParseException ("character range cannot have category \\" + c);
|
||
|
|
||
|
last = -1;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
char_recognized:
|
||
|
if (range) {
|
||
|
// if 'range' is true, we know that 'last >= 0'
|
||
|
if (c < last)
|
||
|
throw NewParseException ("[" + last + "-" + c + "] range in reverse order.");
|
||
|
cls.AddRange ((char)last, (char)c);
|
||
|
last = -1;
|
||
|
range = false;
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
cls.AddCharacter ((char)c);
|
||
|
last = c;
|
||
|
}
|
||
|
|
||
|
if (!closed)
|
||
|
throw NewParseException ("Unterminated [] set.");
|
||
|
|
||
|
if (range)
|
||
|
cls.AddCharacter ('-');
|
||
|
|
||
|
return cls;
|
||
|
}
|
||
|
|
||
|
private bool ParseRepetitionBounds (out int min, out int max, RegexOptions options) {
|
||
|
int n, m;
|
||
|
min = max = 0;
|
||
|
|
||
|
/* check syntax */
|
||
|
|
||
|
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
|
||
|
|
||
|
if (pattern[ptr] == ',') {
|
||
|
n = -1;
|
||
|
} else {
|
||
|
n = ParseNumber (10, 1, 0);
|
||
|
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
|
||
|
}
|
||
|
|
||
|
switch (pattern[ptr ++]) {
|
||
|
case '}':
|
||
|
m = n;
|
||
|
break;
|
||
|
case ',':
|
||
|
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
|
||
|
m = ParseNumber (10, 1, 0);
|
||
|
ConsumeWhitespace (IsIgnorePatternWhitespace (options));
|
||
|
if (pattern[ptr ++] != '}')
|
||
|
return false;
|
||
|
break;
|
||
|
default:
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
/* check bounds and ordering */
|
||
|
|
||
|
if (n > 0x7fffffff || m > 0x7fffffff)
|
||
|
throw NewParseException ("Illegal {x, y} - maximum of 2147483647.");
|
||
|
if (m >= 0 && m < n)
|
||
|
throw NewParseException ("Illegal {x, y} with x > y.");
|
||
|
|
||
|
/* assign min and max */
|
||
|
|
||
|
min = n;
|
||
|
if (m > 0)
|
||
|
max = m;
|
||
|
else
|
||
|
max = 0x7fffffff;
|
||
|
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
private Category ParseUnicodeCategory () {
|
||
|
if (pattern[ptr ++] != '{')
|
||
|
throw NewParseException ("Incomplete \\p{X} character escape.");
|
||
|
|
||
|
string name = ParseName (pattern, ref ptr);
|
||
|
if (name == null)
|
||
|
throw NewParseException ("Incomplete \\p{X} character escape.");
|
||
|
|
||
|
Category cat = CategoryUtils.CategoryFromName (name);
|
||
|
if (cat == Category.None)
|
||
|
throw NewParseException ("Unknown property '" + name + "'.");
|
||
|
|
||
|
if (pattern[ptr ++] != '}')
|
||
|
throw NewParseException ("Incomplete \\p{X} character escape.");
|
||
|
|
||
|
return cat;
|
||
|
}
|
||
|
|
||
|
private Expression ParseSpecial (RegexOptions options) {
|
||
|
int p = ptr;
|
||
|
bool ecma = IsECMAScript (options);
|
||
|
Expression expr = null;
|
||
|
|
||
|
switch (pattern[ptr ++]) {
|
||
|
|
||
|
// categories
|
||
|
|
||
|
case 'd':
|
||
|
expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, false);
|
||
|
break;
|
||
|
|
||
|
case 'w':
|
||
|
expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, false);
|
||
|
break;
|
||
|
|
||
|
case 's':
|
||
|
expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, false);
|
||
|
break;
|
||
|
|
||
|
case 'p':
|
||
|
// this is odd - ECMAScript isn't supposed to support Unicode,
|
||
|
// yet \p{..} compiles and runs under the MS implementation
|
||
|
// identically to canonical mode. That's why I'm ignoring the
|
||
|
// value of ecma here.
|
||
|
|
||
|
expr = new CharacterClass (ParseUnicodeCategory (), false);
|
||
|
break;
|
||
|
|
||
|
case 'D':
|
||
|
expr = new CharacterClass (ecma ? Category.EcmaDigit : Category.Digit, true);
|
||
|
break;
|
||
|
|
||
|
case 'W':
|
||
|
expr = new CharacterClass (ecma ? Category.EcmaWord : Category.Word, true);
|
||
|
break;
|
||
|
|
||
|
case 'S':
|
||
|
expr = new CharacterClass (ecma ? Category.EcmaWhiteSpace : Category.WhiteSpace, true);
|
||
|
break;
|
||
|
|
||
|
case 'P':
|
||
|
expr = new CharacterClass (ParseUnicodeCategory (), true);
|
||
|
break;
|
||
|
|
||
|
// positions
|
||
|
|
||
|
case 'A': expr = new PositionAssertion (Position.StartOfString); break;
|
||
|
case 'Z': expr = new PositionAssertion (Position.End); break;
|
||
|
case 'z': expr = new PositionAssertion (Position.EndOfString); break;
|
||
|
case 'G': expr = new PositionAssertion (Position.StartOfScan); break;
|
||
|
case 'b': expr = new PositionAssertion (Position.Boundary); break;
|
||
|
case 'B': expr = new PositionAssertion (Position.NonBoundary); break;
|
||
|
|
||
|
// references
|
||
|
|
||
|
case '1': case '2': case '3': case '4': case '5':
|
||
|
case '6': case '7': case '8': case '9': {
|
||
|
ptr --;
|
||
|
int n = ParseNumber (10, 1, 0);
|
||
|
if (n < 0) {
|
||
|
ptr = p;
|
||
|
return null;
|
||
|
}
|
||
|
|
||
|
// FIXME test if number is within number of assigned groups
|
||
|
// this may present a problem for right-to-left matching
|
||
|
|
||
|
Reference reference = new BackslashNumber (IsIgnoreCase (options), ecma);
|
||
|
refs.Add (reference, n.ToString ());
|
||
|
expr = reference;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
case 'k': {
|
||
|
char delim = pattern[ptr ++];
|
||
|
if (delim == '<')
|
||
|
delim = '>';
|
||
|
else if (delim != '\'')
|
||
|
throw NewParseException ("Malformed \\k<...> named backreference.");
|
||
|
|
||
|
string name = ParseName ();
|
||
|
if (name == null || pattern[ptr] != delim)
|
||
|
throw NewParseException ("Malformed \\k<...> named backreference.");
|
||
|
|
||
|
++ ptr;
|
||
|
Reference reference = new Reference (IsIgnoreCase (options));
|
||
|
refs.Add (reference, name);
|
||
|
expr = reference;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
default:
|
||
|
expr = null;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (expr == null)
|
||
|
ptr = p;
|
||
|
|
||
|
return expr;
|
||
|
}
|
||
|
|
||
|
private int ParseEscape (bool inCharacterClass) {
|
||
|
int p = ptr;
|
||
|
int c;
|
||
|
|
||
|
if (p >= pattern.Length)
|
||
|
throw new ArgumentException (
|
||
|
String.Format ("Parsing \"{0}\" - Illegal \\ at end of " +
|
||
|
"pattern.", pattern), pattern);
|
||
|
|
||
|
switch (pattern[ptr ++]) {
|
||
|
|
||
|
// standard escapes (except \b)
|
||
|
|
||
|
case 'a': return '\u0007';
|
||
|
case 't': return '\u0009';
|
||
|
case 'r': return '\u000d';
|
||
|
case 'v': return '\u000b';
|
||
|
case 'f': return '\u000c';
|
||
|
case 'n': return '\u000a';
|
||
|
case 'e': return '\u001b';
|
||
|
case '\\': return '\\';
|
||
|
|
||
|
// character codes
|
||
|
|
||
|
case '0':
|
||
|
|
||
|
//
|
||
|
// Turns out that octal values can be specified
|
||
|
// without a leading zero. But also the limit
|
||
|
// of three character should include this first
|
||
|
// one.
|
||
|
//
|
||
|
ptr--;
|
||
|
int prevptr = ptr;
|
||
|
int result = ParseOctal (pattern, ref ptr);
|
||
|
if (result == -1 && prevptr == ptr)
|
||
|
return 0;
|
||
|
|
||
|
return result;
|
||
|
|
||
|
case '1': case '2': case '3': case '4': case '5':
|
||
|
case '6': case '7':
|
||
|
if (inCharacterClass){
|
||
|
ptr--;
|
||
|
return ParseOctal (pattern, ref ptr);
|
||
|
}
|
||
|
break;
|
||
|
|
||
|
case 'x':
|
||
|
c = ParseHex (pattern, ref ptr, 2);
|
||
|
if (c < 0)
|
||
|
throw NewParseException ("Insufficient hex digits");
|
||
|
|
||
|
return c;
|
||
|
|
||
|
case 'u':
|
||
|
c = ParseHex (pattern, ref ptr, 4);
|
||
|
if (c < 0)
|
||
|
throw NewParseException ("Insufficient hex digits");
|
||
|
|
||
|
return c;
|
||
|
|
||
|
// control characters
|
||
|
|
||
|
case 'c':
|
||
|
c = pattern[ptr ++];
|
||
|
if (c >= '@' && c <= '_')
|
||
|
return c - '@';
|
||
|
else
|
||
|
throw NewParseException ("Unrecognized control character.");
|
||
|
}
|
||
|
|
||
|
// unknown escape
|
||
|
ptr = p;
|
||
|
return -1;
|
||
|
}
|
||
|
|
||
|
private string ParseName () {
|
||
|
return Parser.ParseName (pattern, ref ptr);
|
||
|
}
|
||
|
|
||
|
private static bool IsNameChar (char c) {
|
||
|
UnicodeCategory cat = Char.GetUnicodeCategory (c);
|
||
|
if (cat == UnicodeCategory.ModifierLetter)
|
||
|
return false;
|
||
|
if (cat == UnicodeCategory.ConnectorPunctuation)
|
||
|
return true;
|
||
|
return Char.IsLetterOrDigit (c);
|
||
|
}
|
||
|
|
||
|
private int ParseNumber (int b, int min, int max) {
|
||
|
return Parser.ParseNumber (pattern, ref ptr, b, min, max);
|
||
|
}
|
||
|
|
||
|
private static int ParseDigit (char c, int b, int n) {
|
||
|
switch (b) {
|
||
|
case 8:
|
||
|
if (c >= '0' && c <= '7')
|
||
|
return c - '0';
|
||
|
else
|
||
|
return -1;
|
||
|
case 10:
|
||
|
if (c >= '0' && c <= '9')
|
||
|
return c - '0';
|
||
|
else
|
||
|
return -1;
|
||
|
case 16:
|
||
|
if (c >= '0' && c <= '9')
|
||
|
return c - '0';
|
||
|
else if (c >= 'a' && c <= 'f')
|
||
|
return 10 + c - 'a';
|
||
|
else if (c >= 'A' && c <= 'F')
|
||
|
return 10 + c - 'A';
|
||
|
else
|
||
|
return -1;
|
||
|
default:
|
||
|
return -1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private void ConsumeWhitespace (bool ignore) {
|
||
|
while (ptr < pattern.Length) {
|
||
|
if (pattern[ptr] == '(') {
|
||
|
if (ptr + 3 >= pattern.Length)
|
||
|
return;
|
||
|
|
||
|
if (pattern[ptr + 1] != '?' || pattern[ptr + 2] != '#')
|
||
|
return;
|
||
|
|
||
|
ptr += 3;
|
||
|
while (ptr < pattern.Length && pattern[ptr ++] != ')')
|
||
|
/* ignore */ ;
|
||
|
}
|
||
|
else if (ignore && pattern[ptr] == '#') {
|
||
|
while (ptr < pattern.Length && pattern[ptr ++] != '\n')
|
||
|
/* ignore */ ;
|
||
|
}
|
||
|
else if (ignore && Char.IsWhiteSpace (pattern[ptr])) {
|
||
|
while (ptr < pattern.Length && Char.IsWhiteSpace (pattern[ptr]))
|
||
|
++ ptr;
|
||
|
}
|
||
|
else
|
||
|
return;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private string ParseString (string pattern) {
|
||
|
this.pattern = pattern;
|
||
|
this.ptr = 0;
|
||
|
|
||
|
StringBuilder result = new StringBuilder (pattern.Length);
|
||
|
while (ptr < pattern.Length) {
|
||
|
int c = pattern[ptr ++];
|
||
|
if (c == '\\') {
|
||
|
c = ParseEscape (false);
|
||
|
|
||
|
if(c < 0) {
|
||
|
c = pattern[ptr ++];
|
||
|
if(c == 'b')
|
||
|
c = '\b';
|
||
|
}
|
||
|
}
|
||
|
result.Append ((char) c);
|
||
|
}
|
||
|
|
||
|
return result.ToString ();
|
||
|
}
|
||
|
|
||
|
private void ResolveReferences ()
|
||
|
{
|
||
|
int gid = 1;
|
||
|
Hashtable dict = new Hashtable ();
|
||
|
ArrayList explicit_numeric_groups = null;
|
||
|
|
||
|
// number unnamed groups
|
||
|
|
||
|
foreach (CapturingGroup group in caps) {
|
||
|
if (group.Name != null)
|
||
|
continue;
|
||
|
|
||
|
dict.Add (gid.ToString (), group);
|
||
|
group.Index = gid ++;
|
||
|
++ num_groups;
|
||
|
}
|
||
|
|
||
|
// number named groups
|
||
|
|
||
|
foreach (CapturingGroup group in caps) {
|
||
|
if (group.Name == null)
|
||
|
continue;
|
||
|
|
||
|
if (dict.Contains (group.Name)) {
|
||
|
CapturingGroup prev = (CapturingGroup) dict [group.Name];
|
||
|
group.Index = prev.Index;
|
||
|
|
||
|
if (group.Index == gid)
|
||
|
gid ++;
|
||
|
else if (group.Index > gid)
|
||
|
explicit_numeric_groups.Add (group);
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
if (Char.IsDigit (group.Name [0])) {
|
||
|
int ptr = 0;
|
||
|
int group_gid = ParseDecimal (group.Name, ref ptr);
|
||
|
if (ptr == group.Name.Length) {
|
||
|
group.Index = group_gid;
|
||
|
dict.Add (group.Name, group);
|
||
|
++ num_groups;
|
||
|
|
||
|
if (group_gid == gid) {
|
||
|
gid ++;
|
||
|
} else {
|
||
|
// all numbers before 'gid' are already in the dictionary. So, we know group_gid > gid
|
||
|
if (explicit_numeric_groups == null)
|
||
|
explicit_numeric_groups = new ArrayList (4);
|
||
|
explicit_numeric_groups.Add (group);
|
||
|
}
|
||
|
|
||
|
continue;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
string gid_s = gid.ToString ();
|
||
|
while (dict.Contains (gid_s))
|
||
|
gid_s = (++gid).ToString ();
|
||
|
|
||
|
dict.Add (gid_s, group);
|
||
|
dict.Add (group.Name, group);
|
||
|
group.Index = gid ++;
|
||
|
++ num_groups;
|
||
|
}
|
||
|
|
||
|
gap = gid; // == 1 + num_groups, if explicit_numeric_groups == null
|
||
|
|
||
|
if (explicit_numeric_groups != null)
|
||
|
HandleExplicitNumericGroups (explicit_numeric_groups);
|
||
|
|
||
|
// resolve references
|
||
|
|
||
|
foreach (Expression expr in refs.Keys) {
|
||
|
string name = (string) refs [expr];
|
||
|
if (!dict.Contains (name)) {
|
||
|
if (expr is CaptureAssertion && !Char.IsDigit (name [0]))
|
||
|
continue;
|
||
|
BackslashNumber bn = expr as BackslashNumber;
|
||
|
if (bn != null && bn.ResolveReference (name, dict))
|
||
|
continue;
|
||
|
throw NewParseException ("Reference to undefined group " +
|
||
|
(Char.IsDigit (name[0]) ? "number " : "name ") +
|
||
|
name);
|
||
|
}
|
||
|
|
||
|
CapturingGroup group = (CapturingGroup)dict[name];
|
||
|
if (expr is Reference)
|
||
|
((Reference)expr).CapturingGroup = group;
|
||
|
else if (expr is CaptureAssertion)
|
||
|
((CaptureAssertion)expr).CapturingGroup = group;
|
||
|
else if (expr is BalancingGroup)
|
||
|
((BalancingGroup)expr).Balance = group;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
private void HandleExplicitNumericGroups (ArrayList explicit_numeric_groups)
|
||
|
{
|
||
|
int gid = gap;
|
||
|
int i = 0;
|
||
|
int n_explicit = explicit_numeric_groups.Count;
|
||
|
|
||
|
explicit_numeric_groups.Sort ();
|
||
|
|
||
|
// move 'gap' forward to skip over all explicit groups that
|
||
|
// turn out to match their index
|
||
|
for (; i < n_explicit; ++i) {
|
||
|
CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
|
||
|
if (g.Index > gid)
|
||
|
break;
|
||
|
if (g.Index == gid)
|
||
|
gid ++;
|
||
|
}
|
||
|
|
||
|
gap = gid;
|
||
|
|
||
|
// re-index all further groups so that the indexes are contiguous
|
||
|
int prev = gid;
|
||
|
for (; i < n_explicit; ++i) {
|
||
|
CapturingGroup g = (CapturingGroup) explicit_numeric_groups [i];
|
||
|
if (g.Index == prev) {
|
||
|
g.Index = gid - 1;
|
||
|
} else {
|
||
|
prev = g.Index;
|
||
|
g.Index = gid ++;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// flag helper functions
|
||
|
|
||
|
private static bool IsIgnoreCase (RegexOptions options) {
|
||
|
return (options & RegexOptions.IgnoreCase) != 0;
|
||
|
}
|
||
|
|
||
|
private static bool IsMultiline (RegexOptions options) {
|
||
|
return (options & RegexOptions.Multiline) != 0;
|
||
|
}
|
||
|
|
||
|
private static bool IsExplicitCapture (RegexOptions options) {
|
||
|
return (options & RegexOptions.ExplicitCapture) != 0;
|
||
|
}
|
||
|
|
||
|
private static bool IsSingleline (RegexOptions options) {
|
||
|
return (options & RegexOptions.Singleline) != 0;
|
||
|
}
|
||
|
|
||
|
private static bool IsIgnorePatternWhitespace (RegexOptions options) {
|
||
|
return (options & RegexOptions.IgnorePatternWhitespace) != 0;
|
||
|
}
|
||
|
|
||
|
private static bool IsECMAScript (RegexOptions options) {
|
||
|
return (options & RegexOptions.ECMAScript) != 0;
|
||
|
}
|
||
|
|
||
|
// exception creation
|
||
|
|
||
|
private ArgumentException NewParseException (string msg) {
|
||
|
msg = "parsing \"" + pattern + "\" - " + msg;
|
||
|
return new ArgumentException (msg, pattern);
|
||
|
}
|
||
|
|
||
|
private string pattern;
|
||
|
private int ptr;
|
||
|
|
||
|
private ArrayList caps;
|
||
|
private Hashtable refs;
|
||
|
private int num_groups;
|
||
|
private int gap;
|
||
|
}
|
||
|
}
|