//------------------------------------------------------------------------------ // // Copyright (c) Microsoft Corporation. All rights reserved. // //------------------------------------------------------------------------------ // This RegexWriter class is internal to the Regex package. // It builds a block of regular expression codes (RegexCode) // from a RegexTree parse tree. // Implementation notes: // // This step is as simple as walking the tree and emitting // sequences of codes. // namespace System.Text.RegularExpressions { using System.Collections.Generic; using System.Collections; using System.Globalization; internal sealed class RegexWriter { internal int[] _intStack; internal int _depth; internal int[] _emitted; internal int _curpos; internal Dictionary _stringhash; internal List _stringtable; // not used! internal int _stringcount; internal bool _counting; internal int _count; internal int _trackcount; #if SILVERLIGHT internal Dictionary _caps; #else internal Hashtable _caps; #endif internal const int BeforeChild = 64; internal const int AfterChild = 128; /* * This is the only function that should be called from outside. * It takes a RegexTree and creates a corresponding RegexCode. */ internal static RegexCode Write(RegexTree t) { RegexWriter w = new RegexWriter(); RegexCode retval = w.RegexCodeFromRegexTree(t); #if DBG if (t.Debug) { t.Dump(); retval.Dump(); } #endif return retval; } /* * private constructor; can't be created outside */ private RegexWriter() { _intStack = new int[32]; _emitted = new int[32]; _stringhash = new Dictionary(); _stringtable = new List(); } /* * To avoid recursion, we use a simple integer stack. * This is the push. */ internal void PushInt(int I) { if (_depth >= _intStack.Length) { int [] expanded = new int[_depth * 2]; System.Array.Copy(_intStack, 0, expanded, 0, _depth); _intStack = expanded; } _intStack[_depth++] = I; } /* * True if the stack is empty. */ internal bool EmptyStack() { return _depth == 0; } /* * This is the pop. */ internal int PopInt() { return _intStack[--_depth]; } /* * Returns the current position in the emitted code. */ internal int CurPos() { return _curpos; } /* * Fixes up a jump instruction at the specified offset * so that it jumps to the specified jumpDest. */ internal void PatchJump(int Offset, int jumpDest) { _emitted[Offset + 1] = jumpDest; } /* * Emits a zero-argument operation. Note that the emit * functions all run in two modes: they can emit code, or * they can just count the size of the code. */ internal void Emit(int op) { if (_counting) { _count += 1; if (RegexCode.OpcodeBacktracks(op)) _trackcount += 1; return; } _emitted[_curpos++] = op; } /* * Emits a one-argument operation. */ internal void Emit(int op, int opd1) { if (_counting) { _count += 2; if (RegexCode.OpcodeBacktracks(op)) _trackcount += 1; return; } _emitted[_curpos++] = op; _emitted[_curpos++] = opd1; } /* * Emits a two-argument operation. */ internal void Emit(int op, int opd1, int opd2) { if (_counting) { _count += 3; if (RegexCode.OpcodeBacktracks(op)) _trackcount += 1; return; } _emitted[_curpos++] = op; _emitted[_curpos++] = opd1; _emitted[_curpos++] = opd2; } /* * Returns an index in the string table for a string; * uses a hashtable to eliminate duplicates. */ internal int StringCode(String str) { Int32 i; if (_counting) return 0; if (str == null) str = String.Empty; if (_stringhash.ContainsKey(str)) { i = (Int32)_stringhash[str]; } else { i = _stringtable.Count; _stringhash[str] = i; _stringtable.Add(str); } return i; } /* * Just returns an exception; should be dead code */ internal ArgumentException MakeException(String message) { return new ArgumentException(message); } /* * When generating code on a regex that uses a sparse set * of capture slots, we hash them to a dense set of indices * for an array of capture slots. Instead of doing the hash * at match time, it's done at compile time, here. */ internal int MapCapnum(int capnum) { if (capnum == -1) return -1; if (_caps != null) #if SILVERLIGHT return _caps[capnum]; #else return(Int32)_caps[capnum]; #endif else return capnum; } /* * The top level RegexCode generator. It does a depth-first walk * through the tree and calls EmitFragment to emits code before * and after each child of an interior node, and at each leaf. * * It runs two passes, first to count the size of the generated * code, and second to generate the code. * * < */ internal RegexCode RegexCodeFromRegexTree(RegexTree tree) { RegexNode curNode; int curChild; int capsize; RegexPrefix fcPrefix; RegexPrefix prefix; int anchors; RegexBoyerMoore bmPrefix; bool rtl; // construct sparse capnum mapping if some numbers are unused if (tree._capnumlist == null || tree._captop == tree._capnumlist.Length) { capsize = tree._captop; _caps = null; } else { capsize = tree._capnumlist.Length; _caps = tree._caps; for (int i = 0; i < tree._capnumlist.Length; i++) _caps[tree._capnumlist[i]] = i; } _counting = true; for (;;) { if (!_counting) _emitted = new int[_count]; curNode = tree._root; curChild = 0; Emit(RegexCode.Lazybranch, 0); for (;;) { if (curNode._children == null) { EmitFragment(curNode._type, curNode, 0); } else if (curChild < curNode._children.Count) { EmitFragment(curNode._type | BeforeChild, curNode, curChild); curNode = (RegexNode)curNode._children[curChild]; PushInt(curChild); curChild = 0; continue; } if (EmptyStack()) break; curChild = PopInt(); curNode = curNode._next; EmitFragment(curNode._type | AfterChild, curNode, curChild); curChild++; } PatchJump(0, CurPos()); Emit(RegexCode.Stop); if (!_counting) break; _counting = false; } fcPrefix = RegexFCD.FirstChars(tree); prefix = RegexFCD.Prefix(tree); rtl = ((tree._options & RegexOptions.RightToLeft) != 0); CultureInfo culture = (tree._options & RegexOptions.CultureInvariant) != 0 ? CultureInfo.InvariantCulture : CultureInfo.CurrentCulture; if (prefix != null && prefix.Prefix.Length > 0) bmPrefix = new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rtl, culture); else bmPrefix = null; anchors = RegexFCD.Anchors(tree); return new RegexCode(_emitted, _stringtable, _trackcount, _caps, capsize, bmPrefix, fcPrefix, anchors, rtl); } /* * The main RegexCode generator. It does a depth-first walk * through the tree and calls EmitFragment to emits code before * and after each child of an interior node, and at each leaf. */ internal void EmitFragment(int nodetype, RegexNode node, int CurIndex) { int bits = 0; if (nodetype <= RegexNode.Ref) { if (node.UseOptionR()) bits |= RegexCode.Rtl; if ((node._options & RegexOptions.IgnoreCase) != 0) bits |= RegexCode.Ci; } switch (nodetype) { case RegexNode.Concatenate | BeforeChild: case RegexNode.Concatenate | AfterChild: case RegexNode.Empty: break; case RegexNode.Alternate | BeforeChild: if (CurIndex < node._children.Count - 1) { PushInt(CurPos()); Emit(RegexCode.Lazybranch, 0); } break; case RegexNode.Alternate | AfterChild: { if (CurIndex < node._children.Count - 1) { int LBPos = PopInt(); PushInt(CurPos()); Emit(RegexCode.Goto, 0); PatchJump(LBPos, CurPos()); } else { int I; for (I = 0; I < CurIndex; I++) { PatchJump(PopInt(), CurPos()); } } break; } case RegexNode.Testref | BeforeChild: switch (CurIndex) { case 0: Emit(RegexCode.Setjump); PushInt(CurPos()); Emit(RegexCode.Lazybranch, 0); Emit(RegexCode.Testref, MapCapnum(node._m)); Emit(RegexCode.Forejump); break; } break; case RegexNode.Testref | AfterChild: switch (CurIndex) { case 0: { int Branchpos = PopInt(); PushInt(CurPos()); Emit(RegexCode.Goto, 0); PatchJump(Branchpos, CurPos()); Emit(RegexCode.Forejump); if (node._children.Count > 1) break; // else fallthrough goto case 1; } case 1: PatchJump(PopInt(), CurPos()); break; } break; case RegexNode.Testgroup | BeforeChild: switch (CurIndex) { case 0: Emit(RegexCode.Setjump); Emit(RegexCode.Setmark); PushInt(CurPos()); Emit(RegexCode.Lazybranch, 0); break; } break; case RegexNode.Testgroup | AfterChild: switch (CurIndex) { case 0: Emit(RegexCode.Getmark); Emit(RegexCode.Forejump); break; case 1: int Branchpos = PopInt(); PushInt(CurPos()); Emit(RegexCode.Goto, 0); PatchJump(Branchpos, CurPos()); Emit(RegexCode.Getmark); Emit(RegexCode.Forejump); if (node._children.Count > 2) break; // else fallthrough goto case 2; case 2: PatchJump(PopInt(), CurPos()); break; } break; case RegexNode.Loop | BeforeChild: case RegexNode.Lazyloop | BeforeChild: if (node._n < Int32.MaxValue || node._m > 1) Emit(node._m == 0 ? RegexCode.Nullcount : RegexCode.Setcount, node._m == 0 ? 0 : 1 - node._m); else Emit(node._m == 0 ? RegexCode.Nullmark : RegexCode.Setmark); if (node._m == 0) { PushInt(CurPos()); Emit(RegexCode.Goto, 0); } PushInt(CurPos()); break; case RegexNode.Loop | AfterChild: case RegexNode.Lazyloop | AfterChild: { int StartJumpPos = CurPos(); int Lazy = (nodetype - (RegexNode.Loop | AfterChild)); if (node._n < Int32.MaxValue || node._m > 1) Emit(RegexCode.Branchcount + Lazy, PopInt(), node._n == Int32.MaxValue ? Int32.MaxValue : node._n - node._m); else Emit(RegexCode.Branchmark + Lazy, PopInt()); if (node._m == 0) PatchJump(PopInt(), StartJumpPos); } break; case RegexNode.Group | BeforeChild: case RegexNode.Group | AfterChild: break; case RegexNode.Capture | BeforeChild: Emit(RegexCode.Setmark); break; case RegexNode.Capture | AfterChild: Emit(RegexCode.Capturemark, MapCapnum(node._m), MapCapnum(node._n)); break; case RegexNode.Require | BeforeChild: // NOTE: the following line causes lookahead/lookbehind to be // NON-BACKTRACKING. It can be commented out with (*) Emit(RegexCode.Setjump); Emit(RegexCode.Setmark); break; case RegexNode.Require | AfterChild: Emit(RegexCode.Getmark); // NOTE: the following line causes lookahead/lookbehind to be // NON-BACKTRACKING. It can be commented out with (*) Emit(RegexCode.Forejump); break; case RegexNode.Prevent | BeforeChild: Emit(RegexCode.Setjump); PushInt(CurPos()); Emit(RegexCode.Lazybranch, 0); break; case RegexNode.Prevent | AfterChild: Emit(RegexCode.Backjump); PatchJump(PopInt(), CurPos()); Emit(RegexCode.Forejump); break; case RegexNode.Greedy | BeforeChild: Emit(RegexCode.Setjump); break; case RegexNode.Greedy | AfterChild: Emit(RegexCode.Forejump); break; case RegexNode.One: case RegexNode.Notone: Emit(node._type | bits, (int)node._ch); break; case RegexNode.Notoneloop: case RegexNode.Notonelazy: case RegexNode.Oneloop: case RegexNode.Onelazy: if (node._m > 0) Emit(((node._type == RegexNode.Oneloop || node._type == RegexNode.Onelazy) ? RegexCode.Onerep : RegexCode.Notonerep) | bits, (int)node._ch, node._m); if (node._n > node._m) Emit(node._type | bits, (int)node._ch, node._n == Int32.MaxValue ? Int32.MaxValue : node._n - node._m); break; case RegexNode.Setloop: case RegexNode.Setlazy: if (node._m > 0) Emit(RegexCode.Setrep | bits, StringCode(node._str), node._m); if (node._n > node._m) Emit(node._type | bits, StringCode(node._str), (node._n == Int32.MaxValue) ? Int32.MaxValue : node._n - node._m); break; case RegexNode.Multi: Emit(node._type | bits, StringCode(node._str)); break; case RegexNode.Set: Emit(node._type | bits, StringCode(node._str)); break; case RegexNode.Ref: Emit(node._type | bits, MapCapnum(node._m)); break; case RegexNode.Nothing: case RegexNode.Bol: case RegexNode.Eol: case RegexNode.Boundary: case RegexNode.Nonboundary: case RegexNode.ECMABoundary: case RegexNode.NonECMABoundary: case RegexNode.Beginning: case RegexNode.Start: case RegexNode.EndZ: case RegexNode.End: Emit(node._type); break; default: throw MakeException(SR.GetString(SR.UnexpectedOpcode, nodetype.ToString(CultureInfo.CurrentCulture))); } } } }