522 lines
12 KiB
C#
522 lines
12 KiB
C#
//
|
|
// System.Xml.XPath.Tokenizer.cs / Mono.Xml.Xsl/PatternTokenizer.cs
|
|
//
|
|
// Author:
|
|
// Piers Haken (piersh@friskit.com)
|
|
// Atsushi Enomoto (atsushi@ximian.com)
|
|
//
|
|
// (C) 2002 Piers Haken
|
|
// (C) 2005 Novell Inc,
|
|
//
|
|
// IMPORTANT:
|
|
//
|
|
// Do not edit PatternTokenizer.cs. It is autogenerated.
|
|
//
|
|
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining
|
|
// a copy of this software and associated documentation files (the
|
|
// "Software"), to deal in the Software without restriction, including
|
|
// without limitation the rights to use, copy, modify, merge, publish,
|
|
// distribute, sublicense, and/or sell copies of the Software, and to
|
|
// permit persons to whom the Software is furnished to do so, subject to
|
|
// the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be
|
|
// included in all copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
//
|
|
using System;
|
|
using System.Globalization;
|
|
using System.IO;
|
|
using System.Text;
|
|
using System.Collections;
|
|
using System.Xml;
|
|
using System.Xml.XPath;
|
|
using Mono.Xml.XPath;
|
|
|
|
#if XSLT_PATTERN
|
|
namespace Mono.Xml.Xsl
|
|
#else
|
|
namespace Mono.Xml.XPath
|
|
#endif
|
|
{
|
|
internal class Tokenizer : yyParser.yyInput
|
|
{
|
|
// Sentinel character returned by Peek/GetChar once the input is exhausted.
private const char EOL = '\0';

// Keyword table: (token id, spelling) pairs laid out flat.
// The static constructor loads it into s_mapTokens keyed by spelling.
private static readonly object [] s_rgTokenMap =
{
	// operator names
	Token.AND, "and",
	Token.OR, "or",
	Token.DIV, "div",
	Token.MOD, "mod",
	// axis names
	Token.ANCESTOR, "ancestor",
	Token.ANCESTOR_OR_SELF, "ancestor-or-self",
	Token.ATTRIBUTE, "attribute",
	Token.CHILD, "child",
	Token.DESCENDANT, "descendant",
	Token.DESCENDANT_OR_SELF, "descendant-or-self",
	Token.FOLLOWING, "following",
	Token.FOLLOWING_SIBLING, "following-sibling",
	Token.NAMESPACE, "namespace",
	Token.PARENT, "parent",
	Token.PRECEDING, "preceding",
	Token.PRECEDING_SIBLING, "preceding-sibling",
	Token.SELF, "self",
	// node types
	Token.COMMENT, "comment",
	Token.TEXT, "text",
	Token.PROCESSING_INSTRUCTION, "processing-instruction",
	Token.NODE, "node",
};

// Spelling -> boxed token id lookup used by ParseIdentifier.
private static readonly Hashtable s_mapTokens = new Hashtable ();

// Input text, index of the next unread character, and input length.
private string m_rgchInput;
private int m_ich;
private int m_cch;

// Token produced by the latest advance() and the value attached to it.
private int m_iToken;
private object m_objToken;

// Token from the previous advance(); Token.EOF means "no token read yet"
// (see IsFirstToken).
private int m_iTokenPrev = Token.EOF;

// Operator flags for the previous token and for the token being scanned;
// they decide whether '*' and NCNames are read as operators.
private bool m_fPrevWasOperator;
private bool m_fThisIsOperator;
|
|
|
|
// Builds the spelling -> token id lookup from the flat s_rgTokenMap table,
// which stores its entries as consecutive (id, spelling) pairs.
static Tokenizer ()
{
	int pairCount = s_rgTokenMap.Length / 2;
	for (int pair = 0; pair < pairCount; pair++)
	{
		Object spelling = s_rgTokenMap [2 * pair + 1];
		Object tokenId = s_rgTokenMap [2 * pair];
		s_mapTokens.Add (spelling, tokenId);
	}
}
|
|
|
|
// Creates a tokenizer over the given expression text and positions it on
// the first non-whitespace character.
//
// strInput: the expression to tokenize; must not be null.
// Throws ArgumentNullException when strInput is null (previously this
// surfaced as a NullReferenceException from strInput.Length).
public Tokenizer (string strInput)
{
	if (strInput == null)
		throw new ArgumentNullException ("strInput");

	m_rgchInput = strInput;
	m_ich = 0;
	m_cch = strInput.Length;

	// Leading whitespace is never part of a token.
	SkipWhitespace ();
}
|
|
|
|
// Returns the character iOffset positions past the current one without
// consuming anything, or EOL when that position is at or beyond the end
// of the input.
private char Peek (int iOffset)
{
	int pos = m_ich + iOffset;
	return (pos < m_cch) ? m_rgchInput [pos] : EOL;
}
|
|
|
|
// Returns the current character without consuming it (EOL at end of input).
private char Peek ()
{
	const int currentPosition = 0;
	return Peek (currentPosition);
}
|
|
|
|
// Consumes and returns the current character. At end of input nothing is
// consumed and EOL is returned.
private char GetChar ()
{
	return (m_ich < m_cch) ? m_rgchInput [m_ich++] : EOL;
}
|
|
|
|
// Steps the read position back by one character and returns the character
// that is now current. Throws XPathException when already at the start of
// the input.
private char PutBack ()
{
	if (m_ich != 0)
	{
		m_ich -= 1;
		return m_rgchInput [m_ich];
	}
	throw new XPathException ("XPath parser returned an error status: invalid tokenizer state.");
}
|
|
|
|
// Consumes a run of whitespace at the current position.
// Returns true if at least one whitespace character was skipped.
private bool SkipWhitespace ()
{
	bool skippedAny = false;
	while (IsWhitespace (Peek ()))
	{
		GetChar ();
		skippedAny = true;
	}
	return skippedAny;
}
|
|
|
|
// Scans a number of the form Digits ('.' Digits?)? or '.' Digits starting
// at the current position, stores its value (a double parsed with the
// invariant number format) in m_objToken and returns Token.NUMBER.
private int ParseNumber ()
{
	int start = m_ich;

	// integer part (empty when the literal starts with '.')
	while (IsDigit (Peek ()))
		GetChar ();

	// Optional fraction. A trailing '.' with no digits ("3.") is legal
	// XPath number syntax, so it is not treated as an error.
	if (Peek () == '.')
	{
		GetChar ();
		while (IsDigit (Peek ()))
			GetChar ();
	}

	string text = m_rgchInput.Substring (start, m_ich - start);
	m_objToken = Double.Parse (text, NumberFormatInfo.InvariantInfo);
	return Token.NUMBER;
}
|
|
|
|
// Scans a quoted string literal. The current character is taken as the
// opening quote and everything up to the next occurrence of that same
// character becomes the literal text (stored in m_objToken).
// Returns Token.LITERAL; throws XPathException if EOL is reached before
// the closing quote.
private int ParseLiteral ()
{
	char quote = GetChar ();
	StringBuilder text = new StringBuilder ();

	for (;;)
	{
		char c = Peek ();
		if (c == quote)
			break;
		if (c == EOL)
			throw new XPathException ("unmatched "+quote+" in expression");
		GetChar ();
		text.Append (c);
	}

	GetChar ();	// consume the closing quote
	m_objToken = text.ToString ();
	return Token.LITERAL;
}
|
|
|
|
// Reads a name at the current position: a letter or '_' followed by any
// mix of letters, digits, '_', '-' and '.'. Whitespace following the name
// is skipped as well.
// Returns the name, or null (consuming nothing) when the current character
// cannot start a name.
private string ReadIdentifier ()
{
	char first = Peek ();
	if (first != '_' && !Char.IsLetter (first))
		return null;

	int start = m_ich;
	GetChar ();

	for (;;)
	{
		char c = Peek ();
		bool isNamePart = c == '_' || c == '-' || c == '.' || Char.IsLetterOrDigit (c);
		if (!isNamePart)
			break;
		GetChar ();
	}

	string name = m_rgchInput.Substring (start, m_ich - start);
	SkipWhitespace ();
	return name;
}
|
|
|
|
// Scans a name and classifies it from its context: axis name, prefixed
// QName / function name, operator name, node type, function name or plain
// QName. The token value is left in m_objToken: the raw name string for
// axis names, operator names and node types, an XmlQualifiedName otherwise.
// Throws XPathException when the name is not valid in its position.
private int ParseIdentifier ()
{
	string name = ReadIdentifier ();
	Object mapped = s_mapTokens [name];
	bool isKeyword = mapped != null;
	int keywordToken = isKeyword ? (int) mapped : Token.QName;

	m_objToken = name;

	char next = Peek ();
	if (next == ':')
	{
		if (Peek (1) == ':')
		{
			// If the two characters following an NCName (possibly
			// after intervening ExprWhitespace) are ::, then the
			// token must be recognized as an AxisName.
			if (isKeyword && IsAxisName (keywordToken))
				return keywordToken;
			throw new XPathException ("invalid axis name: '"+name+"'");
		}

		// Single ':' - the name read so far is a prefix.
		GetChar ();
		SkipWhitespace ();
		next = Peek ();

		if (next == '*')
		{
			// "prefix:*" - empty local name, prefix kept in the namespace slot
			GetChar ();
			m_objToken = new XmlQualifiedName ("", name);
			return Token.QName;
		}

		string localName = ReadIdentifier ();
		if (localName == null)
			throw new XPathException ("invalid QName: "+name+":"+next);

		m_objToken = new XmlQualifiedName (localName, name);
		return (Peek () == '(') ? Token.FUNCTION_NAME : Token.QName;
	}

	// If there is a preceding token and the preceding
	// token is not one of @, ::, (, [, , or an Operator,
	// then a * must be recognized as a MultiplyOperator
	// and an NCName must be recognized as an OperatorName.
	bool operatorExpected = !(IsFirstToken || m_fPrevWasOperator);
	if (operatorExpected)
	{
		if (isKeyword && IsOperatorName (keywordToken))
			return keywordToken;
		throw new XPathException ("invalid operator name: '"+name+"'");
	}

	if (next == '(')
	{
		// If the character following an NCName (possibly
		// after intervening ExprWhitespace) is (, then the
		// token must be recognized as a NodeType or a FunctionName.
		if (!isKeyword)
		{
			m_objToken = new XmlQualifiedName (name, "");
			return Token.FUNCTION_NAME;
		}
		if (!IsNodeType (keywordToken))
			throw new XPathException ("invalid function name: '"+name+"'");
		return keywordToken;
	}

	// Anything else is an unprefixed name test.
	m_objToken = new XmlQualifiedName (name, "");
	return Token.QName;
}
|
|
|
|
// True for the four whitespace characters the tokenizer recognizes:
// space, tab, line feed and carriage return. Deliberately narrower than
// Char.IsWhiteSpace.
private static bool IsWhitespace (char ch)
{
	switch (ch)
	{
	case ' ':
	case '\t':
	case '\n':
	case '\r':
		return true;
	default:
		return false;
	}
}
|
|
|
|
// True only for the ASCII digits '0'..'9'. Deliberately narrower than
// Char.IsDigit.
private static bool IsDigit (char ch)
{
	if (ch < '0')
		return false;
	return ch <= '9';
}
|
|
|
|
|
|
// Scans the token that starts at the current character and returns its
// token id, leaving any associated value in m_objToken.
// m_fThisIsOperator is raised for the tokens after which a following '*'
// or NCName is NOT to be read as an operator (see ParseIdentifier and the
// '*' case below).
// Returns Token.EOF at end of input and Token.ERROR for a lone ':'.
// Throws XPathException for a character that cannot start a token.
int ParseToken ()
{
	char ch = Peek ();
	switch (ch)
	{
	case EOL:
		return Token.EOF;

	case '/':
		GetChar ();
		m_fThisIsOperator = true;
		if (Peek () != '/')
			return Token.SLASH;
		GetChar ();
		return Token.SLASH2;

	case '.':
		GetChar ();
		if (Peek () == '.')
		{
			GetChar ();
			return Token.DOT2;
		}
		if (!IsDigit (Peek ()))
			return Token.DOT;
		// ".5" style number: hand the dot back so ParseNumber sees it
		PutBack ();
		return ParseNumber ();

	case ':':
		GetChar ();
		if (Peek () != ':')
			return Token.ERROR;
		GetChar ();
		m_fThisIsOperator = true;
		return Token.COLON2;

	case ',':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.COMMA;

	case '@':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.AT;

	case '[':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.BRACKET_OPEN;

	case ']':
		GetChar ();
		return Token.BRACKET_CLOSE;

	case '(':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.PAREN_OPEN;

	case ')':
		GetChar ();
		return Token.PAREN_CLOSE;

	case '+':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.PLUS;

	case '-':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.MINUS;

	case '*':
		GetChar ();
		// A '*' is a name test at the start of the expression or
		// right after an operator-like token; otherwise it multiplies.
		if (IsFirstToken || m_fPrevWasOperator)
			return Token.ASTERISK;
		m_fThisIsOperator = true;
		return Token.MULTIPLY;

	case '$':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.DOLLAR;

	case '|':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.BAR;

	case '=':
		GetChar ();
		m_fThisIsOperator = true;
		return Token.EQ;

	case '!':
		GetChar ();
		if (Peek () != '=')
			break;	// a lone '!' is reported as an invalid token below
		GetChar ();
		m_fThisIsOperator = true;
		return Token.NE;

	case '>':
		GetChar ();
		m_fThisIsOperator = true;
		if (Peek () != '=')
			return Token.GT;
		GetChar ();
		return Token.GE;

	case '<':
		GetChar ();
		m_fThisIsOperator = true;
		if (Peek () != '=')
			return Token.LT;
		GetChar ();
		return Token.LE;

	case '\'':
	case '\"':
		return ParseLiteral ();

	default:
		if (IsDigit (ch))
			return ParseNumber ();

		if (ch == '_' || Char.IsLetter (ch)) // NCName
		{
			int nameToken = ParseIdentifier ();
			if (IsOperatorName (nameToken))
				m_fThisIsOperator = true;
			return nameToken;
		}
		break;
	}

	throw new XPathException ("invalid token: '"+ch+"'");
}
|
|
|
|
///////////////////////////
|
|
// yyParser.yyInput methods
|
|
///////////////////////////
|
|
|
|
/** scans the next token and makes it the current one.
    Whitespace following the token is consumed as well, and the
    previous-token state used for operator disambiguation is updated.
    @return false once the end of the input (Token.EOF) is reached.
    @throws XPathException if ParseToken rejects the input.
  */
public bool advance ()
{
	// reset per-token state before scanning
	m_objToken = null;
	m_fThisIsOperator = false;

	int scanned = ParseToken ();
	m_iToken = scanned;
	SkipWhitespace ();

	// remember this token for the next call
	m_fPrevWasOperator = m_fThisIsOperator;
	m_iTokenPrev = scanned;

	return scanned != Token.EOF;
}
|
|
|
|
/** token id of the current token.
    Should not be called if advance() returned false.
    @return the id produced by the most recent advance().
  */
public int token ()
{
	int current = m_iToken;
	return current;
}
|
|
|
|
/** value attached to the current token.
    Should not be called if advance() returned false.
    @return the object stored for token() by the most recent advance(),
            or null when the token carries no value.
  */
public Object value ()
{
	Object current = m_objToken;
	return current;
}
|
|
// True while no token has been produced yet: m_iTokenPrev still holds its
// initial Token.EOF value.
private bool IsFirstToken
{
	get
	{
		return Token.EOF == m_iTokenPrev;
	}
}
|
|
|
|
// True when iToken is one of the node-type keywords:
// comment, text, processing-instruction or node.
private bool IsNodeType (int iToken)
{
	return iToken == Token.COMMENT
		|| iToken == Token.TEXT
		|| iToken == Token.PROCESSING_INSTRUCTION
		|| iToken == Token.NODE;
}
|
|
// True when iToken is one of the operator keywords: and, or, mod or div.
private bool IsOperatorName (int iToken)
{
	return iToken == Token.AND
		|| iToken == Token.OR
		|| iToken == Token.MOD
		|| iToken == Token.DIV;
}
|
|
// True when iToken is one of the thirteen axis keywords
// (attribute, ancestor, ancestor-or-self, child, descendant,
// descendant-or-self, following, following-sibling, namespace, parent,
// preceding, preceding-sibling, self).
private bool IsAxisName (int iToken)
{
	return iToken == Token.ATTRIBUTE
		|| iToken == Token.ANCESTOR
		|| iToken == Token.ANCESTOR_OR_SELF
		|| iToken == Token.CHILD
		|| iToken == Token.DESCENDANT
		|| iToken == Token.DESCENDANT_OR_SELF
		|| iToken == Token.FOLLOWING
		|| iToken == Token.FOLLOWING_SIBLING
		|| iToken == Token.NAMESPACE
		|| iToken == Token.PARENT
		|| iToken == Token.PRECEDING
		|| iToken == Token.PRECEDING_SIBLING
		|| iToken == Token.SELF;
}
|
|
}
|
|
}
|