//------------------------------------------------------------------------------
//
//     Copyright (c) Microsoft Corporation. All rights reserved.
//
// [....]
// http://www.w3.org/TR/xpath#exprlex
//------------------------------------------------------------------------------

using System.Diagnostics;

namespace System.Xml.Xsl.XPath {
    using Res = System.Xml.Utils.Res;

    // Kinds of lexemes reported by XPathScanner.
    //
    // Members from Unknown through Eof are numbered sequentially starting at zero. The
    // single-character punctuation members at the end take the character code of the
    // character they stand for, which lets the scanner produce them with a plain
    // (LexKind) cast of the current character.
    //
    // Extends XPathOperator enumeration
    internal enum LexKind {
        Unknown,        // Unknown lexeme
        Or,             // Operator 'or'
        And,            // Operator 'and'
        Eq,             // Operator '='
        Ne,             // Operator '!='
        Lt,             // Operator '<'
        Le,             // Operator '<='
        Gt,             // Operator '>'
        Ge,             // Operator '>='
        Plus,           // Operator '+'
        Minus,          // Operator '-'
        Multiply,       // Operator '*'
        Divide,         // Operator 'div'
        Modulo,         // Operator 'mod'
        UnaryMinus,     // Not used
        Union,          // Operator '|'
        LastOperator = Union,

        DotDot,         // '..'
        ColonColon,     // '::'
        SlashSlash,     // Operator '//'
        Number,         // Number (numeric literal)
        Axis,           // AxisName

        Name,           // NameTest, NodeType, FunctionName, AxisName, second part of VariableReference
        String,         // Literal (string literal)
        Eof,            // End of the expression

        FirstStringable = Name,     // Smallest value accepted by XPathScanner.LexKindToString
        LastNonChar     = Eof,      // Every value above this one is a character code

        LParens     = '(',
        RParens     = ')',
        LBracket    = '[',
        RBracket    = ']',
        Dot         = '.',
        At          = '@',
        Comma       = ',',

        Star        = '*',      // NameTest
        Slash       = '/',      // Operator '/'
        Dollar      = '$',      // First part of VariableReference
        RBrace      = '}',      // Used for AVTs
    };

    // Splits an XPath expression string into lexemes.
    //
    // The constructor positions the scanner and scans the first lexeme; every call to
    // NextLex() replaces the current lexeme with the following one. The properties
    // describe the current lexeme only, and several of them assert that the current
    // lexeme has the kind they apply to.
    internal sealed class XPathScanner {
        private readonly string xpathExpr;  // Expression being scanned
        private int             curIndex;   // Index of curChar; equals xpathExpr.Length at the end
        private char            curChar;    // xpathExpr[curIndex], or '\0' past the end of the expression
        private LexKind         kind;       // Kind of the current lexeme
        private string          name;       // Local part of the current Name lexeme ("*" for "prefix:*")
        private string          prefix;     // Prefix of the current Name lexeme, or string.Empty
        private string          stringValue;    // Contents of the current String lexeme, without the quotes
        private bool            canBeFunction;  // Whether the current Name lexeme is followed by '('
        private int             lexStart;   // Index of the first character of the current lexeme
        private int             prevLexEnd; // Value of curIndex at the moment NextLex() was entered
        private LexKind         prevKind;   // Kind of the lexeme preceding the current one
        private XPathAxis       axis;       // Axis of the current Axis lexeme

        private XmlCharType     xmlCharType = XmlCharType.Instance;

        // Creates a scanner over xpathExpr starting at its first character.
        public XPathScanner(string xpathExpr) : this(xpathExpr, 0) {}

        // Creates a scanner over xpathExpr starting at index startFrom and scans the
        // first lexeme, so an XPathCompileException may be thrown from the constructor.
        public XPathScanner(string xpathExpr, int startFrom) {
            Debug.Assert(xpathExpr != null);
            this.xpathExpr = xpathExpr;
            // Unknown as the initial kind becomes prevKind of the first lexeme, which makes
            // CheckOperator() refuse to treat a leading '*' or name as an operator.
            this.kind = LexKind.Unknown;
            SetSourceIndex(startFrom);
            NextLex();
        }

        // The complete expression string passed to the constructor.
        public string Source { get { return xpathExpr; } }

        // Kind of the current lexeme.
        public LexKind Kind { get { return kind; } }

        // Index in Source at which the current lexeme starts.
        public int LexStart { get { return lexStart; } }

        // Number of characters between LexStart and the current scan position.
        public int LexSize { get { return curIndex - lexStart; } }

        // Scan position recorded when the current lexeme started to be scanned,
        // i.e. before the whitespace preceding it was skipped.
        public int PrevLexEnd { get { return prevLexEnd; } }

        // Moves the scan position to index and loads the character found there.
        private void SetSourceIndex(int index) {
            Debug.Assert(0 <= index && index <= xpathExpr.Length);
            // NextChar() pre-increments, so step back by one first.
            curIndex = index - 1;
            NextChar();
        }

        // Advances the scan position by one character. Past the end of the expression
        // curChar becomes '\0', which NextLex() reports as Eof.
        private void NextChar() {
            Debug.Assert(-1 <= curIndex && curIndex < xpathExpr.Length);
            curIndex++;
            if (curIndex < xpathExpr.Length) {
                curChar = xpathExpr[curIndex];
            } else {
                Debug.Assert(curIndex == xpathExpr.Length);
                curChar = '\0';
            }
        }

#if XML10_FIFTH_EDITION
        // Returns the character after curChar without advancing, or '\0' if there is none.
        private char PeekNextChar() {
            Debug.Assert(-1 <= curIndex && curIndex <= xpathExpr.Length);
            if (curIndex + 1 < xpathExpr.Length) {
                return xpathExpr[curIndex + 1];
            }
            else {
                return '\0';
            }
        }
#endif

        // Local part of the current Name lexeme; "*" for a "prefix:*" name test.
        public string Name {
            get {
                Debug.Assert(kind == LexKind.Name);
                Debug.Assert(name != null);
                return name;
            }
        }

        // Prefix of the current Name lexeme, or string.Empty when it has none.
        public string Prefix {
            get {
                Debug.Assert(kind == LexKind.Name);
                Debug.Assert(prefix != null);
                return prefix;
            }
        }

        // Text of the current lexeme as it should appear in diagnostics: the Eof
        // placeholder at the end of the expression, otherwise the source characters
        // from LexStart up to the current scan position.
        public string RawValue {
            get {
                if (kind == LexKind.Eof) {
                    return LexKindToString(kind);
                } else {
                    return xpathExpr.Substring(lexStart, curIndex - lexStart);
                }
            }
        }

        // Contents of the current String lexeme, without the enclosing quotes.
        public string StringValue {
            get {
                Debug.Assert(kind == LexKind.String);
                Debug.Assert(stringValue != null);
                return stringValue;
            }
        }

        // Returns true if the character following an QName (possibly after intervening
        // ExprWhitespace) is '('. In this case the token must be recognized as a NodeType
        // or a FunctionName unless it is an OperatorName. This distinction cannot be done
        // without knowing the previous lexeme. For example, "or" in "... or (1 != 0)" may
        // be an OperatorName or a FunctionName.
        public bool CanBeFunction {
            get {
                Debug.Assert(kind == LexKind.Name);
                return canBeFunction;
            }
        }

        // Axis named by the current Axis lexeme.
        public XPathAxis Axis {
            get {
                Debug.Assert(kind == LexKind.Axis);
                Debug.Assert(axis != XPathAxis.Unknown);
                return axis;
            }
        }

        // Advances past any whitespace at the current scan position.
        private void SkipSpace() {
            while (xmlCharType.IsWhiteSpace(curChar)) {
                NextChar();
            }
        }

        // True for '0'..'9'. Characters below '0' wrap around to a large value in the
        // unsigned subtraction, so a single comparison covers both bounds.
        private static bool IsAsciiDigit(char ch) {
            return (uint)(ch - '0') <= 9;
        }

        // Scans the next lexeme and makes it the current one.
        // Throws XPathCompileException for an unclosed string literal or a number
        // written with an exponent (see ScanString and ScanNumber).
        public void NextLex() {
            prevLexEnd = curIndex;
            prevKind = kind;
            SkipSpace();
            lexStart = curIndex;

            switch (curChar) {
            case '\0':
                // Do not advance: every further call keeps reporting Eof.
                kind = LexKind.Eof;
                return;
            case '(': case ')': case '[': case ']':
            case '@': case ',': case '$': case '}':
                // These LexKind members are defined as the character codes themselves.
                kind = (LexKind)curChar;
                NextChar();
                break;
            case '.':
                NextChar();
                if (curChar == '.') {
                    kind = LexKind.DotDot;
                    NextChar();
                } else if (IsAsciiDigit(curChar)) {
                    // ".5" is a number: rewind to the dot and scan it as one.
                    SetSourceIndex(lexStart);
                    goto case '0';
                } else {
                    kind = LexKind.Dot;
                }
                break;
            case ':':
                NextChar();
                if (curChar == ':') {
                    kind = LexKind.ColonColon;
                    NextChar();
                } else {
                    // A lone ':' is not a lexeme on its own.
                    kind = LexKind.Unknown;
                }
                break;
            case '*':
                kind = LexKind.Star;
                NextChar();
                // May turn the Star name test into the Multiply operator.
                CheckOperator(true);
                break;
            case '/':
                NextChar();
                if (curChar == '/') {
                    kind = LexKind.SlashSlash;
                    NextChar();
                } else {
                    kind = LexKind.Slash;
                }
                break;
            case '|':
                kind = LexKind.Union;
                NextChar();
                break;
            case '+':
                kind = LexKind.Plus;
                NextChar();
                break;
            case '-':
                kind = LexKind.Minus;
                NextChar();
                break;
            case '=':
                kind = LexKind.Eq;
                NextChar();
                break;
            case '!':
                NextChar();
                if (curChar == '=') {
                    kind = LexKind.Ne;
                    NextChar();
                } else {
                    // '!' is only valid as the first character of '!='.
                    kind = LexKind.Unknown;
                }
                break;
            case '<':
                NextChar();
                if (curChar == '=') {
                    kind = LexKind.Le;
                    NextChar();
                } else {
                    kind = LexKind.Lt;
                }
                break;
            case '>':
                NextChar();
                if (curChar == '=') {
                    kind = LexKind.Ge;
                    NextChar();
                } else {
                    kind = LexKind.Gt;
                }
                break;
            case '"':
            case '\'':
                kind = LexKind.String;
                ScanString();
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case '8': case '9':
                kind = LexKind.Number;
                ScanNumber();
                break;
            default:
                if (xmlCharType.IsStartNCNameSingleChar(curChar)
#if XML10_FIFTH_EDITION
                    || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                    ) {
                    kind = LexKind.Name;
                    this.name = ScanNCName();
                    this.prefix = string.Empty;
                    this.canBeFunction = false;
                    this.axis = XPathAxis.Unknown;
                    bool colonColon = false;
                    int saveSourceIndex = curIndex;

                    // "foo:bar" or "foo:*" -- one lexeme (no spaces allowed)
                    // "foo::" or "foo ::"  -- two lexemes, reported as one (AxisName)
                    // "foo:?" or "foo :?"  -- lexeme "foo" reported
                    if (curChar == ':') {
                        NextChar();
                        if (curChar == ':') {   // "foo::" -> OperatorName, AxisName
                            NextChar();
                            colonColon = true;
                            // Rewind to just after the name; the "::" itself is rescanned
                            // by the next NextLex() call.
                            SetSourceIndex(saveSourceIndex);
                        } else {                // "foo:bar", "foo:*" or "foo:?"
                            if (curChar == '*') {
                                NextChar();
                                this.prefix = this.name;
                                this.name = "*";
                            } else if (xmlCharType.IsStartNCNameSingleChar(curChar)
#if XML10_FIFTH_EDITION
                                || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                                ) {
                                this.prefix = this.name;
                                this.name = ScanNCName();
                                // Look ahead for '(' to determine whether QName can be a FunctionName
                                saveSourceIndex = curIndex;
                                SkipSpace();
                                this.canBeFunction = (curChar == '(');
                                SetSourceIndex(saveSourceIndex);
                            } else {            // "foo:?" -> OperatorName, NameTest
                                // Return "foo" and leave ":" to be reported later as an unknown lexeme
                                SetSourceIndex(saveSourceIndex);
                            }
                        }
                    } else {
                        SkipSpace();
                        if (curChar == ':') {   // "foo ::" or "foo :?"
                            NextChar();
                            if (curChar == ':') {
                                NextChar();
                                colonColon = true;
                            }
                            SetSourceIndex(saveSourceIndex);
                        } else {
                            this.canBeFunction = (curChar == '(');
                        }
                    }
                    // An operator name wins over an axis name; only a non-operator name
                    // followed by "::" is looked up as an axis.
                    if (!CheckOperator(false) && colonColon) {
                        this.axis = CheckAxis();
                    }
                } else {
                    // Consume the offending character so that scanning can continue.
                    kind = LexKind.Unknown;
                    NextChar();
                }
                break;
            }
        }

        // Decides whether the lexeme just scanned is an operator and, if so, changes the
        // current kind accordingly and returns true. With star == true the candidate is
        // '*' (Multiply); otherwise it is the current unprefixed name, which can be one
        // of "or", "and", "div" and "mod".
        private bool CheckOperator(bool star) {
            LexKind opKind;

            if (star) {
                opKind = LexKind.Multiply;
            } else {
                // Quick rejection: operator names have no prefix and at most three characters.
                if (prefix.Length != 0 || name.Length > 3)
                    return false;

                switch (name) {
                case "or" : opKind = LexKind.Or;      break;
                case "and": opKind = LexKind.And;     break;
                case "div": opKind = LexKind.Divide;  break;
                case "mod": opKind = LexKind.Modulo;  break;
                default   : return false;
                }
            }

            // If there is a preceding token and the preceding token is not one of '@', '::', '(', '[', ',' or an Operator,
            // then a '*' must be recognized as a MultiplyOperator and an NCName must be recognized as an OperatorName.
            // Unknown (the kind in effect before the first lexeme) is also <= LastOperator.
            if (prevKind <= LexKind.LastOperator)
                return false;

            switch (prevKind) {
            case LexKind.Slash:
            case LexKind.SlashSlash:
            case LexKind.At:
            case LexKind.ColonColon:
            case LexKind.LParens:
            case LexKind.LBracket:
            case LexKind.Comma:
            case LexKind.Dollar:
                return false;
            }

            this.kind = opKind;
            return true;
        }

        // Maps the current name to an axis. On success the current kind becomes Axis;
        // for an unrecognized name the kind stays Name and XPathAxis.Unknown is returned.
        private XPathAxis CheckAxis() {
            this.kind = LexKind.Axis;
            switch (name) {
            case "ancestor"           : return XPathAxis.Ancestor;
            case "ancestor-or-self"   : return XPathAxis.AncestorOrSelf;
            case "attribute"          : return XPathAxis.Attribute;
            case "child"              : return XPathAxis.Child;
            case "descendant"         : return XPathAxis.Descendant;
            case "descendant-or-self" : return XPathAxis.DescendantOrSelf;
            case "following"          : return XPathAxis.Following;
            case "following-sibling"  : return XPathAxis.FollowingSibling;
            case "namespace"          : return XPathAxis.Namespace;
            case "parent"             : return XPathAxis.Parent;
            case "preceding"          : return XPathAxis.Preceding;
            case "preceding-sibling"  : return XPathAxis.PrecedingSibling;
            case "self"               : return XPathAxis.Self;
            default                   : this.kind = LexKind.Name; return XPathAxis.Unknown;
            }
        }

        // Advances past a numeric literal: digits, optionally followed by '.' and more
        // digits. The value itself is not converted here; it is available through RawValue.
        // Throws XPathCompileException (Res.XPath_ScientificNotation) if an exponent follows.
        private void ScanNumber() {
            Debug.Assert(IsAsciiDigit(curChar) || curChar == '.');
            while (IsAsciiDigit(curChar)) {
                NextChar();
            }
            if (curChar == '.') {
                NextChar();
                while (IsAsciiDigit(curChar)) {
                    NextChar();
                }
            }
            // Clearing bit 0x20 folds 'e' onto 'E'.
            if ((curChar & (~0x20)) == 'E') {
                // Consume the whole exponent before throwing so that the position range
                // handed to the exception covers the complete literal.
                NextChar();
                if (curChar == '+' || curChar == '-') {
                    NextChar();
                }
                while (IsAsciiDigit(curChar)) {
                    NextChar();
                }
                throw CreateException(Res.XPath_ScientificNotation);
            }
        }

        // Scans a string literal delimited by the quote character at the current
        // position and stores its contents in stringValue.
        // Throws XPathCompileException (Res.XPath_UnclosedString) if the closing quote is missing.
        private void ScanString() {
            int startIdx = curIndex + 1;
            // The literal ends at the next occurrence of the same quote character.
            int endIdx = xpathExpr.IndexOf(curChar, startIdx);

            if (endIdx < 0) {
                // Move to the end of the expression so that the position range handed to
                // the exception runs from the opening quote to the end.
                SetSourceIndex(xpathExpr.Length);
                throw CreateException(Res.XPath_UnclosedString);
            }

            this.stringValue = xpathExpr.Substring(startIdx, endIdx - startIdx);
            SetSourceIndex(endIdx + 1);
        }

        // Advances past an NCName starting at the current position and returns its text.
        private string ScanNCName() {
            Debug.Assert(xmlCharType.IsStartNCNameSingleChar(curChar)
#if XML10_FIFTH_EDITION
                || xmlCharType.IsNCNameHighSurrogateChar(curChar)
#endif
                );
            int start = curIndex;
            for (;;) {
                if (xmlCharType.IsNCNameSingleChar(curChar)) {
                    NextChar();
                }
#if XML10_FIFTH_EDITION
                else if (xmlCharType.IsNCNameSurrogateChar(PeekNextChar(), curChar)) {
                    // A surrogate pair occupies two chars.
                    NextChar();
                    NextChar();
                }
#endif
                else {
                    break;
                }
            }
            return xpathExpr.Substring(start, curIndex - start);
        }

        // Verifies that the current lexeme has kind t (see CheckToken) and then moves
        // on to the next lexeme.
        public void PassToken(LexKind t) {
            CheckToken(t);
            NextLex();
        }

        // Throws XPathCompileException unless the current lexeme has kind t. The
        // message reports the text actually found and, unless Eof was expected, the
        // expected token as well.
        public void CheckToken(LexKind t) {
            Debug.Assert(LexKind.FirstStringable <= t);
            if (kind != t) {
                if (t == LexKind.Eof) {
                    throw CreateException(Res.XPath_EofExpected, RawValue);
                } else {
                    throw CreateException(Res.XPath_TokenExpected, LexKindToString(t), RawValue);
                }
            }
        }

        // Returns the text used for a lexeme kind in diagnostics: the character itself
        // for the single-character kinds, and an angle-bracketed placeholder for Name,
        // String and Eof, which have no fixed spelling. The placeholders are distinct and
        // non-empty so that a message built from them tells the reader what was expected
        // or found.
        //
        // May be called for the following tokens: Name, String, Eof, Comma, LParens, RParens, LBracket, RBracket, RBrace
        private string LexKindToString(LexKind t) {
            Debug.Assert(LexKind.FirstStringable <= t);

            if (LexKind.LastNonChar < t) {
                Debug.Assert("()[].@,*/$}".IndexOf((char)t) >= 0);
                return new String((char)t, 1);
            }

            switch (t) {
            case LexKind.Name   : return "<name>";
            case LexKind.String : return "<string literal>";
            case LexKind.Eof    : return "<eof>";
            default:
                Debug.Fail("Unexpected LexKind: " + t.ToString());
                return string.Empty;
            }
        }

        // Builds (but does not throw) an XPathCompileException for resource resId with
        // message arguments args, passing along the expression, the start of the current
        // lexeme and the current scan position.
        public XPathCompileException CreateException(string resId, params string[] args) {
            return new XPathCompileException(xpathExpr, lexStart, curIndex, resId, args);
        }
    }
}