You've already forked linux-packaging-mono
769 lines
24 KiB
C#
769 lines
24 KiB
C#
using System;
|
|
using System.Collections;
|
|
using System.Globalization;
|
|
using System.Reflection;
|
|
using System.Reflection.Emit;
|
|
|
|
namespace System.Text.RegularExpressions {
|
|
|
|
/*
 * Behaves like a growing list of (base, offsetpos) tuples, stored flat:
 * even slots hold the start of a branch instruction, odd slots hold the
 * position where that instruction's jump offset is stored.
 */
class RxLinkRef: LinkRef {
	// Flat tuple storage; see the class comment for the slot layout.
	public int[] offsets;
	// Number of slots of `offsets` currently in use.
	public int current = 0;

	public RxLinkRef ()
	{
		offsets = new int [8];
	}

	// Records the start of the branch instruction in the program stream.
	// Must be paired with a following PushOffsetPosition call; throws
	// InvalidOperationException if the previous tuple is still incomplete.
	public void PushInstructionBase (int offset)
	{
		if ((current & 1) != 0)
			throw new InvalidOperationException ("PushInstructionBase called while an offset position is still pending.");
		// Grow by doubling. Growth only happens here, at an even index,
		// so the slot for the matching offset position is always
		// available as long as the array length stays even.
		if (current == offsets.Length)
			Array.Resize (ref offsets, offsets.Length * 2);
		offsets [current++] = offset;
	}

	// Records the position in the program stream where the jump offset
	// is stored. Completes the tuple started by PushInstructionBase;
	// throws InvalidOperationException if no tuple is open.
	public void PushOffsetPosition (int offset)
	{
		if ((current & 1) == 0)
			throw new InvalidOperationException ("PushOffsetPosition called without a preceding PushInstructionBase.");
		offsets [current++] = offset;
	}
}
|
|
|
|
class RxCompiler : ICompiler {
|
|
// Bytecode buffer; MakeRoom replaces it with a larger array when needed.
protected byte[] program = new byte [32];
// Index in `program` where the next emitted byte is written.
protected int curpos = 0;

// Nothing to set up beyond the field initializers.
public RxCompiler ()
{
}
|
|
|
|
// Makes sure `bytes` more bytes fit at curpos, doubling the program
// buffer (and copying its contents) until they do.
void MakeRoom (int bytes)
{
	int required = curpos + bytes;
	while (program.Length < required) {
		byte[] grown = new byte [program.Length * 2];
		Buffer.BlockCopy (program, 0, grown, 0, program.Length);
		program = grown;
	}
}
|
|
|
|
// Appends a single byte to the program and advances curpos.
void Emit (byte val)
{
	MakeRoom (1);
	program [curpos++] = val;
}
|
|
|
|
// Appends an opcode, encoded as one byte.
void Emit (RxOp opcode)
{
	byte encoded = (byte)opcode;
	Emit (encoded);
}
|
|
|
|
// Appends a 16-bit value, low byte first.
void Emit (ushort val)
{
	MakeRoom (2);
	program [curpos++] = (byte)val;
	program [curpos++] = (byte)(val >> 8);
}
|
|
|
|
// Appends a 32-bit value, low byte first.
void Emit (int val)
{
	MakeRoom (4);
	for (int shift = 0; shift < 32; shift += 8)
		program [curpos++] = (byte)(val >> shift);
}
|
|
|
|
// Records the current program position in the link as the start of the
// instruction about to be emitted.
void BeginLink (LinkRef lref)
{
	(lref as RxLinkRef).PushInstructionBase (curpos);
}
|
|
|
|
// Records the current program position in the link as the place where
// the jump offset lives, then reserves two zero bytes there for
// ResolveLink to patch.
void EmitLink (LinkRef lref)
{
	(lref as RxLinkRef).PushOffsetPosition (curpos);
	ushort placeholder = 0;
	Emit (placeholder);
}
|
|
|
|
// ICompiler implementation
|
|
// Rewinds the write position to the start of the program. The buffer
// itself is kept and overwritten by later emits.
public void Reset ()
{
	curpos = 0;
}
|
|
|
|
// Snapshots the bytes emitted so far into an exactly-sized array and
// wraps them in an interpreter factory (no eval delegate).
public IMachineFactory GetMachineFactory ()
{
	byte[] snapshot = new byte [curpos];
	Array.Copy (program, snapshot, curpos);
	return new RxInterpreterFactory (snapshot, null);
}
|
|
|
|
// Emits the False opcode.
public void EmitFalse ()
{
	RxOp op = RxOp.False;
	Emit (op);
}
|
|
|
|
// Emits the True opcode.
public void EmitTrue ()
{
	RxOp op = RxOp.True;
	Emit (op);
}
|
|
|
|
/*
 * Overridden by CILCompiler.
 *
 * Emits the variant of `op` selected by the three flags: the opcode is
 * the base value plus 1 for negate, 2 for ignore and 4 for reverse.
 */
public virtual void EmitOp (RxOp op, bool negate, bool ignore, bool reverse)
{
	int variant = (negate ? 1 : 0) | (ignore ? 2 : 0) | (reverse ? 4 : 0);
	Emit ((RxOp)((int)op + variant));
}
|
|
|
|
// Emits the variant of `op` selected by two flags: the opcode is the
// base value plus 1 for ignore and 2 for reverse.
public virtual void EmitOpIgnoreReverse (RxOp op, bool ignore, bool reverse)
{
	int variant = (ignore ? 1 : 0) | (reverse ? 2 : 0);
	Emit ((RxOp)((int)op + variant));
}
|
|
|
|
// Emits the variant of `op` selected by two flags: the opcode is the
// base value plus 1 for negate and 2 for reverse.
public virtual void EmitOpNegateReverse (RxOp op, bool negate, bool reverse)
{
	int variant = (negate ? 1 : 0) | (reverse ? 2 : 0);
	Emit ((RxOp)((int)op + variant));
}
|
|
|
|
// Emits a single-character match. With `ignore` set the character is
// lower-cased first. Characters below 256 use the Char op with a
// one-byte operand; all others use UnicodeChar with a two-byte operand.
public void EmitCharacter (char c, bool negate, bool ignore, bool reverse)
{
	if (ignore)
		c = Char.ToLower (c);

	bool fitsInByte = c < 256;
	EmitOp (fitsInByte ? RxOp.Char : RxOp.UnicodeChar, negate, ignore, reverse);
	if (fitsInByte)
		Emit ((byte)c);
	else
		Emit ((ushort)c);
}
|
|
|
|
// Emits a CategoryUnicode op followed by the UnicodeCategory value as
// a single byte.
void EmitUniCat (UnicodeCategory cat, bool negate, bool reverse)
{
	byte operand = (byte)cat;
	EmitOpNegateReverse (RxOp.CategoryUnicode, negate, reverse);
	Emit (operand);
}
|
|
|
|
// Emits a CategoryGeneral op followed by the Category value as a
// single byte.
void EmitCatGeneral (Category cat, bool negate, bool reverse)
{
	byte operand = (byte)cat;
	EmitOpNegateReverse (RxOp.CategoryGeneral, negate, reverse);
	Emit (operand);
}
|
|
|
|
// Emits the instruction(s) matching one character of category `cat`.
// Depending on the category this is a dedicated Category* op, a
// CategoryUnicode op (EmitUniCat), a CategoryGeneral op (EmitCatGeneral)
// or a case-sensitive character range (EmitRange with ignore = false).
// Throws NotImplementedException for a category with no case below.
public void EmitCategory (Category cat, bool negate, bool reverse)
{
	switch (cat) {
	// Categories with a dedicated opcode
	case Category.Any:
	case Category.EcmaAny:
		EmitOpNegateReverse (RxOp.CategoryAny, negate, reverse); break;
	case Category.AnySingleline: EmitOpNegateReverse (RxOp.CategoryAnySingleline, negate, reverse); break;
	case Category.Word: EmitOpNegateReverse (RxOp.CategoryWord, negate, reverse); break;
	case Category.Digit: EmitOpNegateReverse (RxOp.CategoryDigit, negate, reverse); break;
	case Category.WhiteSpace: EmitOpNegateReverse (RxOp.CategoryWhiteSpace, negate, reverse); break;
	/* FIXME: translate EcmaWord, EcmaWhiteSpace into Bitmaps? EcmaWhiteSpace will fit very well with the IL engine */
	case Category.EcmaWord: EmitOpNegateReverse (RxOp.CategoryEcmaWord, negate, reverse); break;
	case Category.EcmaDigit: EmitRange ('0', '9', negate, false, reverse); break;
	case Category.EcmaWhiteSpace: EmitOpNegateReverse (RxOp.CategoryEcmaWhiteSpace, negate, reverse); break;
	case Category.UnicodeSpecials: EmitOpNegateReverse (RxOp.CategoryUnicodeSpecials, negate, reverse); break;

	// Unicode categories: letter
	case Category.UnicodeLu: EmitUniCat (UnicodeCategory.UppercaseLetter, negate, reverse); break;
	case Category.UnicodeLl: EmitUniCat (UnicodeCategory.LowercaseLetter, negate, reverse); break;
	case Category.UnicodeLt: EmitUniCat (UnicodeCategory.TitlecaseLetter, negate, reverse); break;
	case Category.UnicodeLm: EmitUniCat (UnicodeCategory.ModifierLetter, negate, reverse); break;
	case Category.UnicodeLo: EmitUniCat (UnicodeCategory.OtherLetter, negate, reverse); break;

	// Unicode categories: mark
	case Category.UnicodeMn: EmitUniCat (UnicodeCategory.NonSpacingMark, negate, reverse); break;
	case Category.UnicodeMe: EmitUniCat (UnicodeCategory.EnclosingMark, negate, reverse); break;
	case Category.UnicodeMc: EmitUniCat (UnicodeCategory.SpacingCombiningMark, negate, reverse); break;

	// Unicode categories: number
	case Category.UnicodeNd: EmitUniCat (UnicodeCategory.DecimalDigitNumber, negate, reverse); break;
	case Category.UnicodeNl: EmitUniCat (UnicodeCategory.LetterNumber, negate, reverse); break;
	case Category.UnicodeNo: EmitUniCat (UnicodeCategory.OtherNumber, negate, reverse); break;

	// Unicode categories: separator
	case Category.UnicodeZs: EmitUniCat (UnicodeCategory.SpaceSeparator, negate, reverse); break;
	case Category.UnicodeZl: EmitUniCat (UnicodeCategory.LineSeparator, negate, reverse); break;
	case Category.UnicodeZp: EmitUniCat (UnicodeCategory.ParagraphSeparator, negate, reverse); break;

	// Unicode categories: punctuation
	case Category.UnicodePd: EmitUniCat (UnicodeCategory.DashPunctuation, negate, reverse); break;
	case Category.UnicodePs: EmitUniCat (UnicodeCategory.OpenPunctuation, negate, reverse); break;
	case Category.UnicodePi: EmitUniCat (UnicodeCategory.InitialQuotePunctuation, negate, reverse); break;
	case Category.UnicodePe: EmitUniCat (UnicodeCategory.ClosePunctuation, negate, reverse); break;
	case Category.UnicodePf: EmitUniCat (UnicodeCategory.FinalQuotePunctuation, negate, reverse); break;
	case Category.UnicodePc: EmitUniCat (UnicodeCategory.ConnectorPunctuation, negate, reverse); break;
	case Category.UnicodePo: EmitUniCat (UnicodeCategory.OtherPunctuation, negate, reverse); break;

	// Unicode categories: symbol
	case Category.UnicodeSm: EmitUniCat (UnicodeCategory.MathSymbol, negate, reverse); break;
	case Category.UnicodeSc: EmitUniCat (UnicodeCategory.CurrencySymbol, negate, reverse); break;
	case Category.UnicodeSk: EmitUniCat (UnicodeCategory.ModifierSymbol, negate, reverse); break;
	case Category.UnicodeSo: EmitUniCat (UnicodeCategory.OtherSymbol, negate, reverse); break;

	// Unicode categories: other
	case Category.UnicodeCc: EmitUniCat (UnicodeCategory.Control, negate, reverse); break;
	case Category.UnicodeCf: EmitUniCat (UnicodeCategory.Format, negate, reverse); break;
	case Category.UnicodeCo: EmitUniCat (UnicodeCategory.PrivateUse, negate, reverse); break;
	case Category.UnicodeCs: EmitUniCat (UnicodeCategory.Surrogate, negate, reverse); break;
	case Category.UnicodeCn: EmitUniCat (UnicodeCategory.OtherNotAssigned, negate, reverse); break;

	// Unicode block ranges
	case Category.UnicodeBasicLatin: EmitRange ('\u0000', '\u007F', negate, false, reverse); break;
	case Category.UnicodeLatin1Supplement: EmitRange ('\u0080', '\u00FF', negate, false, reverse); break;
	case Category.UnicodeLatinExtendedA: EmitRange ('\u0100', '\u017F', negate, false, reverse); break;
	case Category.UnicodeLatinExtendedB: EmitRange ('\u0180', '\u024F', negate, false, reverse); break;
	case Category.UnicodeIPAExtensions: EmitRange ('\u0250', '\u02AF', negate, false, reverse); break;
	case Category.UnicodeSpacingModifierLetters: EmitRange ('\u02B0', '\u02FF', negate, false, reverse); break;
	case Category.UnicodeCombiningDiacriticalMarks: EmitRange ('\u0300', '\u036F', negate, false, reverse); break;
	case Category.UnicodeGreek: EmitRange ('\u0370', '\u03FF', negate, false, reverse); break;
	case Category.UnicodeCyrillic: EmitRange ('\u0400', '\u04FF', negate, false, reverse); break;
	case Category.UnicodeArmenian: EmitRange ('\u0530', '\u058F', negate, false, reverse); break;
	case Category.UnicodeHebrew: EmitRange ('\u0590', '\u05FF', negate, false, reverse); break;
	case Category.UnicodeArabic: EmitRange ('\u0600', '\u06FF', negate, false, reverse); break;
	case Category.UnicodeSyriac: EmitRange ('\u0700', '\u074F', negate, false, reverse); break;
	case Category.UnicodeThaana: EmitRange ('\u0780', '\u07BF', negate, false, reverse); break;
	case Category.UnicodeDevanagari: EmitRange ('\u0900', '\u097F', negate, false, reverse); break;
	case Category.UnicodeBengali: EmitRange ('\u0980', '\u09FF', negate, false, reverse); break;
	case Category.UnicodeGurmukhi: EmitRange ('\u0A00', '\u0A7F', negate, false, reverse); break;
	case Category.UnicodeGujarati: EmitRange ('\u0A80', '\u0AFF', negate, false, reverse); break;
	case Category.UnicodeOriya: EmitRange ('\u0B00', '\u0B7F', negate, false, reverse); break;
	case Category.UnicodeTamil: EmitRange ('\u0B80', '\u0BFF', negate, false, reverse); break;
	case Category.UnicodeTelugu: EmitRange ('\u0C00', '\u0C7F', negate, false, reverse); break;
	case Category.UnicodeKannada: EmitRange ('\u0C80', '\u0CFF', negate, false, reverse); break;
	case Category.UnicodeMalayalam: EmitRange ('\u0D00', '\u0D7F', negate, false, reverse); break;
	case Category.UnicodeSinhala: EmitRange ('\u0D80', '\u0DFF', negate, false, reverse); break;
	case Category.UnicodeThai: EmitRange ('\u0E00', '\u0E7F', negate, false, reverse); break;
	case Category.UnicodeLao: EmitRange ('\u0E80', '\u0EFF', negate, false, reverse); break;
	case Category.UnicodeTibetan: EmitRange ('\u0F00', '\u0FFF', negate, false, reverse); break;
	case Category.UnicodeMyanmar: EmitRange ('\u1000', '\u109F', negate, false, reverse); break;
	case Category.UnicodeGeorgian: EmitRange ('\u10A0', '\u10FF', negate, false, reverse); break;
	case Category.UnicodeHangulJamo: EmitRange ('\u1100', '\u11FF', negate, false, reverse); break;
	case Category.UnicodeEthiopic: EmitRange ('\u1200', '\u137F', negate, false, reverse); break;
	case Category.UnicodeCherokee: EmitRange ('\u13A0', '\u13FF', negate, false, reverse); break;
	case Category.UnicodeUnifiedCanadianAboriginalSyllabics: EmitRange ('\u1400', '\u167F', negate, false, reverse); break;
	case Category.UnicodeOgham: EmitRange ('\u1680', '\u169F', negate, false, reverse); break;
	case Category.UnicodeRunic: EmitRange ('\u16A0', '\u16FF', negate, false, reverse); break;
	case Category.UnicodeKhmer: EmitRange ('\u1780', '\u17FF', negate, false, reverse); break;
	case Category.UnicodeMongolian: EmitRange ('\u1800', '\u18AF', negate, false, reverse); break;
	case Category.UnicodeLatinExtendedAdditional: EmitRange ('\u1E00', '\u1EFF', negate, false, reverse); break;
	case Category.UnicodeGreekExtended: EmitRange ('\u1F00', '\u1FFF', negate, false, reverse); break;
	case Category.UnicodeGeneralPunctuation: EmitRange ('\u2000', '\u206F', negate, false, reverse); break;
	case Category.UnicodeSuperscriptsandSubscripts: EmitRange ('\u2070', '\u209F', negate, false, reverse); break;
	case Category.UnicodeCurrencySymbols: EmitRange ('\u20A0', '\u20CF', negate, false, reverse); break;
	case Category.UnicodeCombiningMarksforSymbols: EmitRange ('\u20D0', '\u20FF', negate, false, reverse); break;
	case Category.UnicodeLetterlikeSymbols: EmitRange ('\u2100', '\u214F', negate, false, reverse); break;
	case Category.UnicodeNumberForms: EmitRange ('\u2150', '\u218F', negate, false, reverse); break;
	case Category.UnicodeArrows: EmitRange ('\u2190', '\u21FF', negate, false, reverse); break;
	case Category.UnicodeMathematicalOperators: EmitRange ('\u2200', '\u22FF', negate, false, reverse); break;
	case Category.UnicodeMiscellaneousTechnical: EmitRange ('\u2300', '\u23FF', negate, false, reverse); break;
	case Category.UnicodeControlPictures: EmitRange ('\u2400', '\u243F', negate, false, reverse); break;
	case Category.UnicodeOpticalCharacterRecognition: EmitRange ('\u2440', '\u245F', negate, false, reverse); break;
	case Category.UnicodeEnclosedAlphanumerics: EmitRange ('\u2460', '\u24FF', negate, false, reverse); break;
	case Category.UnicodeBoxDrawing: EmitRange ('\u2500', '\u257F', negate, false, reverse); break;
	case Category.UnicodeBlockElements: EmitRange ('\u2580', '\u259F', negate, false, reverse); break;
	case Category.UnicodeGeometricShapes: EmitRange ('\u25A0', '\u25FF', negate, false, reverse); break;
	case Category.UnicodeMiscellaneousSymbols: EmitRange ('\u2600', '\u26FF', negate, false, reverse); break;
	case Category.UnicodeDingbats: EmitRange ('\u2700', '\u27BF', negate, false, reverse); break;
	case Category.UnicodeBraillePatterns: EmitRange ('\u2800', '\u28FF', negate, false, reverse); break;
	case Category.UnicodeCJKRadicalsSupplement: EmitRange ('\u2E80', '\u2EFF', negate, false, reverse); break;
	case Category.UnicodeKangxiRadicals: EmitRange ('\u2F00', '\u2FDF', negate, false, reverse); break;
	case Category.UnicodeIdeographicDescriptionCharacters: EmitRange ('\u2FF0', '\u2FFF', negate, false, reverse); break;
	case Category.UnicodeCJKSymbolsandPunctuation: EmitRange ('\u3000', '\u303F', negate, false, reverse); break;
	case Category.UnicodeHiragana: EmitRange ('\u3040', '\u309F', negate, false, reverse); break;
	case Category.UnicodeKatakana: EmitRange ('\u30A0', '\u30FF', negate, false, reverse); break;
	case Category.UnicodeBopomofo: EmitRange ('\u3100', '\u312F', negate, false, reverse); break;
	case Category.UnicodeHangulCompatibilityJamo: EmitRange ('\u3130', '\u318F', negate, false, reverse); break;
	case Category.UnicodeKanbun: EmitRange ('\u3190', '\u319F', negate, false, reverse); break;
	case Category.UnicodeBopomofoExtended: EmitRange ('\u31A0', '\u31BF', negate, false, reverse); break;
	case Category.UnicodeEnclosedCJKLettersandMonths: EmitRange ('\u3200', '\u32FF', negate, false, reverse); break;
	case Category.UnicodeCJKCompatibility: EmitRange ('\u3300', '\u33FF', negate, false, reverse); break;
	case Category.UnicodeCJKUnifiedIdeographsExtensionA: EmitRange ('\u3400', '\u4DB5', negate, false, reverse); break;
	case Category.UnicodeCJKUnifiedIdeographs: EmitRange ('\u4E00', '\u9FFF', negate, false, reverse); break;
	case Category.UnicodeYiSyllables: EmitRange ('\uA000', '\uA48F', negate, false, reverse); break;
	case Category.UnicodeYiRadicals: EmitRange ('\uA490', '\uA4CF', negate, false, reverse); break;
	case Category.UnicodeHangulSyllables: EmitRange ('\uAC00', '\uD7A3', negate, false, reverse); break;
	case Category.UnicodeHighSurrogates: EmitRange ('\uD800', '\uDB7F', negate, false, reverse); break;
	case Category.UnicodeHighPrivateUseSurrogates: EmitRange ('\uDB80', '\uDBFF', negate, false, reverse); break;
	case Category.UnicodeLowSurrogates: EmitRange ('\uDC00', '\uDFFF', negate, false, reverse); break;
	case Category.UnicodePrivateUse: EmitRange ('\uE000', '\uF8FF', negate, false, reverse); break;
	case Category.UnicodeCJKCompatibilityIdeographs: EmitRange ('\uF900', '\uFAFF', negate, false, reverse); break;
	case Category.UnicodeAlphabeticPresentationForms: EmitRange ('\uFB00', '\uFB4F', negate, false, reverse); break;
	case Category.UnicodeArabicPresentationFormsA: EmitRange ('\uFB50', '\uFDFF', negate, false, reverse); break;
	case Category.UnicodeCombiningHalfMarks: EmitRange ('\uFE20', '\uFE2F', negate, false, reverse); break;
	case Category.UnicodeCJKCompatibilityForms: EmitRange ('\uFE30', '\uFE4F', negate, false, reverse); break;
	case Category.UnicodeSmallFormVariants: EmitRange ('\uFE50', '\uFE6F', negate, false, reverse); break;
	case Category.UnicodeArabicPresentationFormsB: EmitRange ('\uFE70', '\uFEFE', negate, false, reverse); break;
	case Category.UnicodeHalfwidthandFullwidthForms: EmitRange ('\uFF00', '\uFFEF', negate, false, reverse); break;

	// Complex categories
	case Category.UnicodeL:
	case Category.UnicodeM:
	case Category.UnicodeN:
	case Category.UnicodeZ:
	case Category.UnicodeP:
	case Category.UnicodeS:
	case Category.UnicodeC:
		EmitCatGeneral (cat, negate, reverse); break;

	default:
		throw new NotImplementedException ("Missing category: " + cat);
	}
}
|
|
|
|
// Emits the complement of EmitCategory: forwards with `negate` flipped.
// (It is unclear why the compiler needs this as a separate interface
// function.)
public void EmitNotCategory (Category cat, bool negate, bool reverse)
{
	EmitCategory (cat, !negate, reverse);
}
|
|
|
|
// Emits a character-range match for [lo, hi]. When both bounds are
// below 256 the Range op with one-byte operands is used; otherwise
// UnicodeRange with two-byte operands.
public void EmitRange (char lo, char hi, bool negate, bool ignore, bool reverse)
{
	bool narrow = lo < 256 && hi < 256;
	if (!narrow) {
		EmitOp (RxOp.UnicodeRange, negate, ignore, reverse);
		Emit ((ushort)lo);
		Emit ((ushort)hi);
		return;
	}

	EmitOp (RxOp.Range, negate, ignore, reverse);
	Emit ((byte)lo);
	Emit ((byte)hi);
}
|
|
|
|
// Emits a bitmap character-set match. The header is the op, then `lo`,
// then the bitmap length in bytes (one-byte fields for Bitmap when both
// values are below 256, two-byte fields for UnicodeBitmap otherwise).
// The bitmap follows, eight set entries per byte, with entry k of each
// group stored in bit k; a trailing partial byte is zero-padded.
public void EmitSet (char lo, BitArray set, bool negate, bool ignore, bool reverse)
{
	int byteCount = (set.Length + 7) >> 3;

	bool narrow = lo < 256 && byteCount < 256;
	if (narrow) {
		EmitOp (RxOp.Bitmap, negate, ignore, reverse);
		Emit ((byte)lo);
		Emit ((byte)byteCount);
	} else {
		EmitOp (RxOp.UnicodeBitmap, negate, ignore, reverse);
		Emit ((ushort)lo);
		Emit ((ushort)byteCount);
	}

	// emit the bitmap bytes
	for (int n = 0; n < byteCount; ++n) {
		int first = n * 8;
		int packed = 0;
		for (int bit = 0; bit < 8 && first + bit < set.Length; ++bit) {
			if (set [first + bit])
				packed |= 1 << bit;
		}
		Emit ((byte)packed);
	}
}
|
|
|
|
// Emits a literal-string match. With `ignore` set the string is
// lower-cased first. Strings shorter than 256 characters whose
// characters are all below 256 use the String op (one-byte length,
// one byte per character); everything else uses UnicodeString (two-byte
// length, two bytes per character).
//
// Throws NotSupportedException if the string is too long for the
// two-byte length field. The check runs before anything is written, so
// a rejected string leaves the program stream untouched.
public void EmitString (string str, bool ignore, bool reverse)
{
	int i;

	if (ignore)
		str = str.ToLower ();

	bool islatin1 = str.Length < 256;
	for (i = 0; islatin1 && i < str.Length; ++i) {
		if (str [i] >= 256)
			islatin1 = false;
	}

	if (islatin1) {
		EmitOpIgnoreReverse (RxOp.String, ignore, reverse);
		Emit ((byte)str.Length);
		for (i = 0; i < str.Length; ++i)
			Emit ((byte)str [i]);
	} else {
		if (str.Length > ushort.MaxValue)
			throw new NotSupportedException ();
		EmitOpIgnoreReverse (RxOp.UnicodeString, ignore, reverse);
		Emit ((ushort)str.Length);
		for (i = 0; i < str.Length; ++i)
			Emit ((ushort)str [i]);
	}
}
|
|
|
|
// Emits the position-assertion opcode corresponding to `pos`.
// Position.Start and Position.StartOfString both map to StartOfString.
// Throws NotSupportedException for any position not listed below.
public void EmitPosition (Position pos)
{
	RxOp op;

	switch (pos) {
	case Position.Any: op = RxOp.AnyPosition; break;
	case Position.Start:
	case Position.StartOfString: op = RxOp.StartOfString; break;
	case Position.StartOfLine: op = RxOp.StartOfLine; break;
	case Position.StartOfScan: op = RxOp.StartOfScan; break;
	case Position.End: op = RxOp.End; break;
	case Position.EndOfString: op = RxOp.EndOfString; break;
	case Position.EndOfLine: op = RxOp.EndOfLine; break;
	case Position.Boundary: op = RxOp.WordBoundary; break;
	case Position.NonBoundary: op = RxOp.NoWordBoundary; break;
	default:
		throw new NotSupportedException ();
	}

	Emit (op);
}
|
|
|
|
// Emits OpenGroup followed by the group id as a two-byte operand.
// Throws NotSupportedException if the id does not fit in 16 bits.
public void EmitOpen (int gid)
{
	if (gid > ushort.MaxValue) {
		throw new NotSupportedException ();
	}

	ushort group = (ushort)gid;
	Emit (RxOp.OpenGroup);
	Emit (group);
}
|
|
|
|
// Emits CloseGroup followed by the group id as a two-byte operand.
// Throws NotSupportedException if the id does not fit in 16 bits.
public void EmitClose (int gid)
{
	if (gid > ushort.MaxValue) {
		throw new NotSupportedException ();
	}

	ushort group = (ushort)gid;
	Emit (RxOp.CloseGroup);
	Emit (group);
}
|
|
|
|
// Emits a BalanceStart instruction linked to `tail`. Layout: opcode,
// gid (2 bytes), balance (2 bytes), capture flag (1 byte, 1 or 0),
// then the two-byte link placeholder.
public void EmitBalanceStart(int gid, int balance, bool capture, LinkRef tail)
{
	byte captureFlag = capture ? (byte)1 : (byte)0;

	BeginLink (tail);
	Emit (RxOp.BalanceStart);
	Emit ((ushort)gid);
	Emit ((ushort)balance);
	Emit (captureFlag);
	EmitLink (tail);
}
|
|
|
|
// Emits the Balance opcode.
public void EmitBalance ()
{
	RxOp op = RxOp.Balance;
	Emit (op);
}
|
|
|
|
// Emits a Reference op (variant chosen by ignore/reverse) followed by
// the group id as a two-byte operand. Throws NotSupportedException if
// the id does not fit in 16 bits.
public void EmitReference (int gid, bool ignore, bool reverse)
{
	if (gid > ushort.MaxValue) {
		throw new NotSupportedException ();
	}

	ushort group = (ushort)gid;
	EmitOpIgnoreReverse (RxOp.Reference, ignore, reverse);
	Emit (group);
}
|
|
|
|
// Emits an IfDefined instruction linked to `tail`. Layout: opcode,
// two-byte link placeholder, then the group id (2 bytes). Throws
// NotSupportedException, before emitting anything, if the id does not
// fit in 16 bits.
public void EmitIfDefined (int gid, LinkRef tail)
{
	if (gid > ushort.MaxValue) {
		throw new NotSupportedException ();
	}

	ushort group = (ushort)gid;
	BeginLink (tail);
	Emit (RxOp.IfDefined);
	EmitLink (tail);
	Emit (group);
}
|
|
|
|
// Emits a SubExpression instruction linked to `tail`: opcode followed
// by the two-byte link placeholder.
public void EmitSub (LinkRef tail)
{
	RxOp op = RxOp.SubExpression;
	BeginLink (tail);
	Emit (op);
	EmitLink (tail);
}
|
|
|
|
// Emits a Test instruction carrying two links: opcode, then the
// placeholder for `yes`, then the placeholder for `tail`. Both links
// take this instruction's start as their base.
public void EmitTest (LinkRef yes, LinkRef tail)
{
	RxOp op = RxOp.Test;
	BeginLink (yes);
	BeginLink (tail);
	Emit (op);
	EmitLink (yes);
	EmitLink (tail);
}
|
|
|
|
// Emits a Branch instruction linked to `next`: opcode followed by the
// two-byte link placeholder.
public void EmitBranch (LinkRef next)
{
	RxOp op = RxOp.Branch;
	BeginLink (next);
	Emit (op);
	EmitLink (next);
}
|
|
|
|
// Emits a Jump instruction linked to `target`: opcode followed by the
// two-byte link placeholder.
public void EmitJump (LinkRef target)
{
	RxOp op = RxOp.Jump;
	BeginLink (target);
	Emit (op);
	EmitLink (target);
}
|
|
|
|
// Emits a TestCharGroup instruction linked to `tail`: opcode followed
// by the two-byte link placeholder.
// Emitted for things like [\dabcfh].
public void EmitIn (LinkRef tail)
{
	RxOp op = RxOp.TestCharGroup;
	BeginLink (tail);
	Emit (op);
	EmitLink (tail);
}
|
|
|
|
// Emits a Repeat (or RepeatLazy when `lazy`) instruction linked to
// `until`. Layout: opcode, two-byte link placeholder, min (4 bytes),
// max (4 bytes).
public void EmitRepeat (int min, int max, bool lazy, LinkRef until)
{
	RxOp op = lazy ? RxOp.RepeatLazy : RxOp.Repeat;

	BeginLink (until);
	Emit (op);
	EmitLink (until);
	Emit (min);
	Emit (max);
}
|
|
|
|
// Patches every jump recorded in `repeat` to land on the current
// position, then emits the Until opcode at that position.
public void EmitUntil (LinkRef repeat)
{
	ResolveLink (repeat);
	RxOp op = RxOp.Until;
	Emit (op);
}
|
|
|
|
// Emits the Info instruction. Layout: opcode, count (2 bytes),
// min (4 bytes), max (4 bytes).
//
// Throws NotSupportedException if `count` does not fit in 16 bits. The
// check runs before the opcode is written, so a rejected call leaves
// the program stream untouched.
public void EmitInfo (int count, int min, int max)
{
	if (count > ushort.MaxValue)
		throw new NotSupportedException ();

	Emit (RxOp.Info);
	Emit ((ushort)count);
	Emit (min);
	Emit (max);
}
|
|
|
|
// Emits a FastRepeat (or FastRepeatLazy when `lazy`) instruction linked
// to `tail`. Layout: opcode, two-byte link placeholder, min (4 bytes),
// max (4 bytes).
public void EmitFastRepeat (int min, int max, bool lazy, LinkRef tail)
{
	RxOp op = lazy ? RxOp.FastRepeatLazy : RxOp.FastRepeat;

	BeginLink (tail);
	Emit (op);
	EmitLink (tail);
	Emit (min);
	Emit (max);
}
|
|
|
|
// Emits an Anchor (or AnchorReverse when `reverse`) instruction linked
// to `tail`. Layout: opcode, two-byte link placeholder, offset (2 bytes).
//
// Throws NotSupportedException if `offset` does not fit in 16 bits. The
// check runs first, so a rejected call neither records a half-finished
// entry in `tail` nor leaves a truncated instruction in the program.
public void EmitAnchor (bool reverse, int offset, LinkRef tail)
{
	if (offset > ushort.MaxValue)
		throw new NotSupportedException ();

	BeginLink (tail);
	if (reverse)
		Emit (RxOp.AnchorReverse);
	else
		Emit (RxOp.Anchor);
	EmitLink (tail);
	Emit ((ushort)offset);
}
|
|
|
|
// Event for the CILCompiler; this compiler emits nothing for it.
public void EmitBranchEnd ()
{
	// intentionally empty
}
|
|
|
|
// This compiler emits nothing at the end of an alternation.
public void EmitAlternationEnd ()
{
	// intentionally empty
}
|
|
|
|
// Creates a fresh, empty link for use with the Emit*/ResolveLink calls.
public LinkRef NewLink ()
{
	RxLinkRef link = new RxLinkRef ();
	return link;
}
|
|
|
|
// Patches every (instruction base, offset position) pair recorded in
// `link` so the jump lands on the current program position: the
// distance from the instruction base to curpos is written, low byte
// first, into the two bytes at the offset position.
// Throws NotSupportedException if a distance does not fit in 16 bits.
public void ResolveLink (LinkRef link)
{
	RxLinkRef l = link as RxLinkRef;
	int used = l.current;
	int[] slots = l.offsets;

	for (int pair = 0; pair < used; pair += 2) {
		int distance = curpos - slots [pair];
		if (distance > ushort.MaxValue)
			throw new NotSupportedException ();

		int patchAt = slots [pair + 1];
		program [patchAt] = (byte)distance;
		program [patchAt + 1] = (byte)(distance >> 8);
	}
}
|
|
|
|
}
|
|
|
|
// Machine factory that hands out RxInterpreter instances sharing one
// compiled program and one eval delegate.
class RxInterpreterFactory : IMachineFactory {
	// Compiled bytecode passed to every interpreter instance.
	private byte[] program;
	// Delegate passed through to every interpreter instance.
	private EvalDelegate eval_del;
	// Backing stores for the plain get/set properties below.
	private IDictionary mapping;
	private string[] namesMapping;
	private int gap;

	public RxInterpreterFactory (byte[] program, EvalDelegate eval_del)
	{
		this.eval_del = eval_del;
		this.program = program;
	}

	// Creates a new interpreter over the shared program.
	public IMachine NewInstance ()
	{
		return new RxInterpreter (program, eval_del);
	}

	// 16-bit value stored low byte first in program [1] and program [2].
	// NOTE(review): presumably the count operand of a leading Info
	// instruction - confirm that Info is always emitted first.
	public int GroupCount {
		get {
			int low = program [1];
			int high = program [2];
			return low | (high << 8);
		}
	}

	public int Gap {
		get { return gap; }
		set { gap = value; }
	}

	public IDictionary Mapping {
		get { return mapping; }
		set { mapping = value; }
	}

	public string [] NamesMapping {
		get { return namesMapping; }
		set { namesMapping = value; }
	}
}
|
|
|
|
}
|
|
|