a575963da9
Former-commit-id: da6be194a6b1221998fc28233f2503bd61dd9d14
1962 lines
70 KiB
C#
1962 lines
70 KiB
C#
// HtmlAgilityPack V1.0 - Simon Mourier <simon underscore mourier at hotmail dot com>
|
|
using System;
|
|
using System.Collections;
|
|
using System.Collections.Generic;
|
|
using System.IO;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using System.Xml;
|
|
using System.Xml.XPath;
|
|
|
|
namespace HtmlAgilityPack
|
|
{
|
|
/// <summary>
|
|
/// Represents a complete HTML document.
|
|
/// </summary>
|
|
public class HtmlDocument : IXPathNavigable
|
|
{
|
|
#region Fields
|
|
|
|
private int _c;
|
|
private Crc32 _crc32;
|
|
private HtmlAttribute _currentattribute;
|
|
private HtmlNode _currentnode;
|
|
private Encoding _declaredencoding;
|
|
private HtmlNode _documentnode;
|
|
private bool _fullcomment;
|
|
private int _index;
|
|
internal Hashtable _lastnodes = new Hashtable();
|
|
private HtmlNode _lastparentnode;
|
|
private int _line;
|
|
private int _lineposition, _maxlineposition;
|
|
internal Hashtable _nodesid;
|
|
private ParseState _oldstate;
|
|
private bool _onlyDetectEncoding;
|
|
internal Hashtable _openednodes;
|
|
private List<HtmlParseError> _parseerrors = new List<HtmlParseError>();
|
|
private string _remainder;
|
|
private int _remainderOffset;
|
|
private ParseState _state;
|
|
private Encoding _streamencoding;
|
|
internal string _text;
|
|
|
|
// public props
|
|
|
|
/// <summary>
|
|
/// Adds Debugging attributes to node. Default is false.
|
|
/// </summary>
|
|
public bool OptionAddDebuggingAttributes;
|
|
|
|
/// <summary>
|
|
/// Defines if closing for non closed nodes must be done at the end or directly in the document.
|
|
/// Setting this to true can actually change how browsers render the page. Default is false.
|
|
/// </summary>
|
|
public bool OptionAutoCloseOnEnd; // close errors at the end
|
|
|
|
/// <summary>
|
|
/// Defines if non closed nodes will be checked at the end of parsing. Default is true.
|
|
/// </summary>
|
|
public bool OptionCheckSyntax = true;
|
|
|
|
/// <summary>
|
|
/// Defines if a checksum must be computed for the document while parsing. Default is false.
|
|
/// </summary>
|
|
public bool OptionComputeChecksum;
|
|
|
|
/// <summary>
|
|
/// Defines the default stream encoding to use. Default is System.Text.Encoding.Default.
|
|
/// </summary>
|
|
public Encoding OptionDefaultStreamEncoding = Encoding.Default;
|
|
|
|
/// <summary>
|
|
/// Defines if source text must be extracted while parsing errors.
|
|
/// If the document has a lot of errors, or cascading errors, parsing performance can be dramatically affected if set to true.
|
|
/// Default is false.
|
|
/// </summary>
|
|
public bool OptionExtractErrorSourceText;
|
|
|
|
// turning this on can dramatically slow performance if a lot of errors are detected
|
|
|
|
/// <summary>
|
|
/// Defines the maximum length of source text or parse errors. Default is 100.
|
|
/// </summary>
|
|
public int OptionExtractErrorSourceTextMaxLength = 100;
|
|
|
|
/// <summary>
|
|
/// Defines if LI, TR, TH, TD tags must be partially fixed when nesting errors are detected. Default is false.
|
|
/// </summary>
|
|
public bool OptionFixNestedTags; // fix li, tr, th, td tags
|
|
|
|
/// <summary>
|
|
/// Defines if output must conform to XML, instead of HTML.
|
|
/// </summary>
|
|
public bool OptionOutputAsXml;
|
|
|
|
/// <summary>
|
|
/// Defines if attribute value output must be optimized (not bound with double quotes if it is possible). Default is false.
|
|
/// </summary>
|
|
public bool OptionOutputOptimizeAttributeValues;
|
|
|
|
/// <summary>
|
|
/// Defines if name must be output with its original case. Useful for asp.net tags and attributes.
|
|
/// </summary>
|
|
public bool OptionOutputOriginalCase;
|
|
|
|
/// <summary>
|
|
/// Defines if name must be output in uppercase. Default is false.
|
|
/// </summary>
|
|
public bool OptionOutputUpperCase;
|
|
|
|
/// <summary>
|
|
/// Defines if declared encoding must be read from the document.
|
|
/// Declared encoding is determined using the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
|
|
/// Default is true.
|
|
/// </summary>
|
|
public bool OptionReadEncoding = true;
|
|
|
|
/// <summary>
|
|
/// Defines the name of a node that will throw the StopperNodeException when found as an end node. Default is null.
|
|
/// </summary>
|
|
public string OptionStopperNodeName;
|
|
|
|
/// <summary>
|
|
/// Defines if the 'id' attribute must be specifically used. Default is true.
|
|
/// </summary>
|
|
public bool OptionUseIdAttribute = true;
|
|
|
|
/// <summary>
|
|
/// Defines if empty nodes must be written as closed during output. Default is false.
|
|
/// </summary>
|
|
public bool OptionWriteEmptyNodes;
|
|
|
|
#endregion
|
|
|
|
#region Static Members
|
|
|
|
internal static readonly string HtmlExceptionRefNotChild = "Reference node must be a child of this node";
|
|
|
|
internal static readonly string HtmlExceptionUseIdAttributeFalse =
|
|
"You need to set UseIdAttribute property to true to enable this feature";
|
|
|
|
#endregion
|
|
|
|
#region Constructors
|
|
|
|
/// <summary>
/// Initializes a new, empty HTML document.
/// </summary>
public HtmlDocument()
{
    // Every document starts with a single root node of type Document, created at index 0.
    HtmlNode root = CreateNode(HtmlNodeType.Document, 0);
    _documentnode = root;
}
|
|
|
|
#endregion
|
|
|
|
#region Properties
|
|
|
|
/// <summary>
/// Gets the CRC32 checksum of the document, or 0 when no checksum object is available
/// (OptionComputeChecksum must be set to true before parsing for one to be computed).
/// </summary>
public int CheckSum
{
    get
    {
        // No CRC accumulator means nothing was computed: report 0.
        return (_crc32 != null) ? (int) _crc32.CheckSum : 0;
    }
}
|
|
|
|
/// <summary>
/// Gets the encoding declared by the document itself.
/// The declared encoding comes from the meta http-equiv="content-type" content="text/html;charset=XXXXX" html node.
/// </summary>
public Encoding DeclaredEncoding
{
    get
    {
        return this._declaredencoding;
    }
}
|
|
|
|
/// <summary>
/// Gets the document's root node.
/// </summary>
public HtmlNode DocumentNode
{
    get
    {
        return this._documentnode;
    }
}
|
|
|
|
/// <summary>
/// Gets the encoding used when the document is written out.
/// </summary>
public Encoding Encoding
{
    get
    {
        // The selection logic lives in GetOutEncoding.
        Encoding output = GetOutEncoding();
        return output;
    }
}
|
|
|
|
/// <summary>
/// Gets the parse errors recorded for the document.
/// </summary>
public IEnumerable<HtmlParseError> ParseErrors
{
    get
    {
        // The backing list itself is exposed, typed as a read-only sequence.
        IEnumerable<HtmlParseError> errors = _parseerrors;
        return errors;
    }
}
|
|
|
|
/// <summary>
/// Gets the text remaining after parsing stopped.
/// Will always be null if OptionStopperNodeName is null.
/// </summary>
public string Remainder
{
    get
    {
        return this._remainder;
    }
}
|
|
|
|
/// <summary>
/// Gets the position of Remainder within the original Html text.
/// If OptionStopperNodeName is null, this will return the length of the original Html text.
/// </summary>
public int RemainderOffset
{
    get
    {
        return this._remainderOffset;
    }
}
|
|
|
|
/// <summary>
/// Gets the encoding of the stream the document was read from
/// (null when the document was not read through a StreamReader).
/// </summary>
public Encoding StreamEncoding
{
    get
    {
        return this._streamencoding;
    }
}
|
|
|
|
#endregion
|
|
|
|
#region IXPathNavigable Members
|
|
|
|
/// <summary>
/// Creates a new XPathNavigator object for navigating this HTML document.
/// </summary>
/// <returns>An XPathNavigator positioned on the root node of the document.</returns>
public XPathNavigator CreateNavigator()
{
    HtmlNodeNavigator navigator = new HtmlNodeNavigator(this, _documentnode);
    return navigator;
}
|
|
|
|
#endregion
|
|
|
|
#region Public Methods
|
|
|
|
/// <summary>
/// Gets a valid XML name.
/// </summary>
/// <remarks>
/// Characters in [a-z], [0-9], '_', '-' and '.' are copied as is. Any other character is
/// replaced by the lowercase hexadecimal form of its UTF-8 bytes followed by '_'. When at
/// least one character had to be replaced, the whole result is prefixed with '_'.
/// </remarks>
/// <param name="name">Any text. May not be null.</param>
/// <returns>A string that is a valid XML name.</returns>
/// <exception cref="ArgumentNullException"><paramref name="name"/> is null.</exception>
public static string GetXmlName(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // A StringBuilder avoids re-allocating the whole result for every appended character.
    StringBuilder xmlname = new StringBuilder(name.Length + 1);
    bool nameisok = true;
    char[] single = new char[1];

    for (int i = 0; i < name.Length; i++)
    {
        char c = name[i];

        // names are lcase
        // note: ':' is deliberately not accepted here
        bool allowed = ((c >= 'a') && (c <= 'z')) ||
                       ((c >= '0') && (c <= '9')) ||
                       (c == '_') || (c == '-') || (c == '.');
        if (allowed)
        {
            xmlname.Append(c);
            continue;
        }

        // Escape the character as the hex dump of its UTF-8 encoding, terminated by '_'.
        nameisok = false;
        single[0] = c;
        byte[] bytes = Encoding.UTF8.GetBytes(single);
        for (int j = 0; j < bytes.Length; j++)
        {
            xmlname.Append(bytes[j].ToString("x2"));
        }
        xmlname.Append('_');
    }

    if (nameisok)
    {
        return xmlname.ToString();
    }

    // Mark escaped names with a leading underscore.
    return "_" + xmlname.ToString();
}
|
|
|
|
/// <summary>
/// Applies HTML encoding to a specified string.
/// </summary>
/// <remarks>
/// Ampersands, angle brackets and double quotes are replaced by their named entities.
/// An ampersand that already starts one of the four entities produced here
/// (amp, lt, gt, quot - matched case-insensitively) is left alone, so text that is
/// already encoded is not encoded a second time.
/// </remarks>
/// <param name="html">The input string to encode. May not be null.</param>
/// <returns>The encoded string.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public static string HtmlEncode(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // replace & by &amp; but only once!
    // The static Regex.Replace overload goes through the framework's regex cache,
    // so the pattern is not rebuilt on every call.
    string encoded = Regex.Replace(
        html,
        "&(?!(amp;)|(lt;)|(gt;)|(quot;))",
        "&amp;",
        RegexOptions.IgnoreCase);

    return encoded
        .Replace("<", "&lt;")
        .Replace(">", "&gt;")
        .Replace("\"", "&quot;");
}
|
|
|
|
/// <summary>
/// Determines if the specified character is considered as a whitespace character.
/// </summary>
/// <param name="c">The character code to check.</param>
/// <returns>true if the character is a tab (9), line feed (10), carriage return (13) or space (32); false otherwise.</returns>
public static bool IsWhiteSpace(int c)
{
    switch (c)
    {
        case 9:  // tab
        case 10: // line feed
        case 13: // carriage return
        case 32: // space
            return true;

        default:
            return false;
    }
}
|
|
|
|
/// <summary>
/// Creates an HTML attribute owned by this document and gives it the specified name.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <returns>The new HTML attribute.</returns>
public HtmlAttribute CreateAttribute(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlAttribute attribute = CreateAttribute();
    attribute.Name = name;
    return attribute;
}
|
|
|
|
/// <summary>
/// Creates an HTML attribute with the specified name and value.
/// </summary>
/// <param name="name">The name of the attribute. May not be null.</param>
/// <param name="value">The value of the attribute.</param>
/// <returns>The new HTML attribute.</returns>
public HtmlAttribute CreateAttribute(string name, string value)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    // Build the named attribute first, then attach the value.
    HtmlAttribute attribute = CreateAttribute(name);
    attribute.Value = value;
    return attribute;
}
|
|
|
|
/// <summary>
/// Creates an empty HTML comment node owned by this document.
/// </summary>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment()
{
    HtmlNode node = CreateNode(HtmlNodeType.Comment);
    return (HtmlCommentNode) node;
}
|
|
|
|
/// <summary>
/// Creates an HTML comment node holding the specified comment text.
/// </summary>
/// <param name="comment">The comment text. May not be null.</param>
/// <returns>The new HTML comment node.</returns>
public HtmlCommentNode CreateComment(string comment)
{
    if (comment == null)
    {
        throw new ArgumentNullException("comment");
    }

    HtmlCommentNode node = CreateComment();
    node.Comment = comment;
    return node;
}
|
|
|
|
/// <summary>
/// Creates an HTML element node owned by this document and gives it the specified name.
/// </summary>
/// <param name="name">The qualified name of the element. May not be null.</param>
/// <returns>The new HTML node.</returns>
public HtmlNode CreateElement(string name)
{
    if (name == null)
    {
        throw new ArgumentNullException("name");
    }

    HtmlNode element = CreateNode(HtmlNodeType.Element);
    element.Name = name;
    return element;
}
|
|
|
|
/// <summary>
/// Creates an empty HTML text node owned by this document.
/// </summary>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode()
{
    HtmlNode node = CreateNode(HtmlNodeType.Text);
    return (HtmlTextNode) node;
}
|
|
|
|
/// <summary>
/// Creates an HTML text node holding the specified text.
/// </summary>
/// <param name="text">The text of the node. May not be null.</param>
/// <returns>The new HTML text node.</returns>
public HtmlTextNode CreateTextNode(string text)
{
    if (text == null)
    {
        throw new ArgumentNullException("text");
    }

    HtmlTextNode node = CreateTextNode();
    node.Text = text;
    return node;
}
|
|
|
|
/// <summary>
/// Detects the encoding of an HTML stream.
/// </summary>
/// <param name="stream">The input stream. May not be null.</param>
/// <returns>The detected encoding, or null if none was found.</returns>
public Encoding DetectEncoding(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    // The reader is not disposed here: disposing a StreamReader also closes the stream it wraps,
    // and that stream belongs to the caller.
    TextReader reader = new StreamReader(stream);
    return DetectEncoding(reader);
}
|
|
|
|
/// <summary>
/// Detects the encoding of an HTML file.
/// </summary>
/// <remarks>
/// The file is opened with OptionDefaultStreamEncoding and is always closed again,
/// even when detection throws.
/// </remarks>
/// <param name="path">Path for the file containing the HTML document to detect. May not be null.</param>
/// <returns>The detected encoding, or null if none was found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public Encoding DetectEncoding(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // 'using' guarantees the file handle is released on every exit path.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        return DetectEncoding(sr);
    }
}
|
|
|
|
/// <summary>
/// Detects the encoding of an HTML text provided on a TextReader.
/// </summary>
/// <remarks>
/// This resets the document state (text, root node, id and opened-node tables) and runs the
/// parser in "detect only" mode. The reader is consumed to its end.
/// </remarks>
/// <param name="reader">The TextReader used to feed the HTML. May not be null.</param>
/// <returns>The detected encoding, or null if parsing finished without finding one.</returns>
public Encoding DetectEncoding(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = true;

    // These bookkeeping tables only exist when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    // Only a StreamReader can report the encoding of the underlying stream.
    StreamReader streamReader = reader as StreamReader;
    _streamencoding = (streamReader != null) ? streamReader.CurrentEncoding : null;
    _declaredencoding = null;

    _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);

    // this is almost a hack, but it allows us not to muck with the original parsing code:
    // the result comes back through an EncodingFoundException raised during Parse.
    Encoding detected = null;
    try
    {
        Parse();
    }
    catch (EncodingFoundException found)
    {
        detected = found.Encoding;
    }
    return detected;
}
|
|
|
|
/// <summary>
/// Detects the encoding of an HTML document from a file first, and then loads the file.
/// Equivalent to calling the two-argument overload with detection enabled.
/// </summary>
/// <param name="path">The complete file path to be read.</param>
public void DetectEncodingAndLoad(string path)
{
    const bool detect = true;
    DetectEncodingAndLoad(path, detect);
}
|
|
|
|
/// <summary>
/// Optionally detects the encoding of an HTML document from a file, and then loads the file.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncoding">true to detect encoding, false otherwise.</param>
public void DetectEncodingAndLoad(string path, bool detectEncoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    Encoding detected = detectEncoding ? DetectEncoding(path) : null;

    if (detected != null)
    {
        // An encoding was found: load the file with it.
        Load(path, detected);
        return;
    }

    // Nothing detected (or detection disabled): use the default loading path.
    Load(path);
}
|
|
|
|
/// <summary>
/// Detects the encoding of an HTML text.
/// </summary>
/// <param name="html">The input html text. May not be null.</param>
/// <returns>The detected encoding, or null if none was found.</returns>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public Encoding DetectEncodingHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // 'using' disposes the reader on every exit path, including when detection throws.
    using (StringReader sr = new StringReader(html))
    {
        return DetectEncoding(sr);
    }
}
|
|
|
|
/// <summary>
/// Gets the HTML node with the specified 'id' attribute value.
/// The lookup is done on the lowercased id.
/// </summary>
/// <param name="id">The attribute id to match. May not be null.</param>
/// <returns>The HTML node with the matching id or null if not found.</returns>
public HtmlNode GetElementbyId(string id)
{
    if (id == null)
    {
        throw new ArgumentNullException("id");
    }

    // The id table is only built when OptionUseIdAttribute is enabled at load time.
    if (_nodesid == null)
    {
        throw new Exception(HtmlExceptionUseIdAttributeFalse);
    }

    string key = id.ToLower();
    object found = _nodesid[key];
    return found as HtmlNode;
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a stream, reading it with OptionDefaultStreamEncoding.
/// </summary>
/// <param name="stream">The input stream.</param>
public void Load(Stream stream)
{
    // The reader is left undisposed so the caller's stream is not closed.
    TextReader reader = new StreamReader(stream, OptionDefaultStreamEncoding);
    Load(reader);
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a stream, optionally honoring byte order marks.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, bool detectEncodingFromByteOrderMarks)
{
    // The reader is left undisposed so the caller's stream is not closed.
    TextReader reader = new StreamReader(stream, detectEncodingFromByteOrderMarks);
    Load(reader);
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a stream using the given character encoding.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
public void Load(Stream stream, Encoding encoding)
{
    // The reader is left undisposed so the caller's stream is not closed.
    TextReader reader = new StreamReader(stream, encoding);
    Load(reader);
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a stream using the given character encoding,
/// optionally honoring byte order marks.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    // The reader is left undisposed so the caller's stream is not closed.
    TextReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks);
    Load(reader);
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a stream using the given character encoding and buffer size,
/// optionally honoring byte order marks.
/// </summary>
/// <param name="stream">The input stream.</param>
/// <param name="encoding">The character encoding to use.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the stream.</param>
/// <param name="buffersize">The minimum buffer size.</param>
public void Load(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    // The reader is left undisposed so the caller's stream is not closed.
    TextReader reader = new StreamReader(stream, encoding, detectEncodingFromByteOrderMarks, buffersize);
    Load(reader);
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a file, reading it with OptionDefaultStreamEncoding.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // 'using' guarantees the file handle is released on every exit path.
    using (StreamReader sr = new StreamReader(path, OptionDefaultStreamEncoding))
    {
        Load(sr);
    }
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a file, optionally honoring byte order marks.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> is null.</exception>
public void Load(string path, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }

    // 'using' guarantees the file handle is released on every exit path.
    using (StreamReader sr = new StreamReader(path, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a file using the given character encoding.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // 'using' guarantees the file handle is released on every exit path.
    using (StreamReader sr = new StreamReader(path, encoding))
    {
        Load(sr);
    }
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a file using the given character encoding,
/// optionally honoring byte order marks.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // 'using' guarantees the file handle is released on every exit path.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks))
    {
        Load(sr);
    }
}
|
|
|
|
/// <summary>
/// Loads an HTML document from a file using the given character encoding and buffer size,
/// optionally honoring byte order marks.
/// The file is always closed again, even when loading throws.
/// </summary>
/// <param name="path">The complete file path to be read. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <param name="detectEncodingFromByteOrderMarks">Indicates whether to look for byte order marks at the beginning of the file.</param>
/// <param name="buffersize">The minimum buffer size.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
public void Load(string path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int buffersize)
{
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // 'using' guarantees the file handle is released on every exit path.
    using (StreamReader sr = new StreamReader(path, encoding, detectEncodingFromByteOrderMarks, buffersize))
    {
        Load(sr);
    }
}
|
|
|
|
/// <summary>
/// Loads the HTML document from the specified TextReader.
/// </summary>
/// <remarks>
/// All other Load overloads end up here. The reader is consumed to its end, the document
/// state is reset, the text is parsed and, when OptionCheckSyntax is set, one TagNotClosed
/// error is recorded for every node that is still open after parsing.
/// </remarks>
/// <param name="reader">The TextReader used to feed the HTML data into the document. May not be null.</param>
public void Load(TextReader reader)
{
    if (reader == null)
    {
        throw new ArgumentNullException("reader");
    }

    _onlyDetectEncoding = false;

    // These bookkeeping tables only exist when the matching option is enabled.
    _openednodes = OptionCheckSyntax ? new Hashtable() : null;
    _nodesid = OptionUseIdAttribute ? new Hashtable() : null;

    // Only a StreamReader can report the encoding of the underlying stream.
    StreamReader streamReader = reader as StreamReader;
    if (streamReader == null)
    {
        _streamencoding = null;
    }
    else
    {
        try
        {
            // trigger bom read if needed, so CurrentEncoding reflects it
            streamReader.Peek();
        }
        // ReSharper disable EmptyGeneralCatchClause
        catch (Exception)
        // ReSharper restore EmptyGeneralCatchClause
        {
            // void on purpose: a failing peek must not prevent the load attempt
        }
        _streamencoding = streamReader.CurrentEncoding;
    }
    _declaredencoding = null;

    _text = reader.ReadToEnd();
    _documentnode = CreateNode(HtmlNodeType.Document, 0);
    Parse();

    if (!OptionCheckSyntax)
    {
        return;
    }

    foreach (HtmlNode opened in _openednodes.Values)
    {
        // Nodes without a start tag were already reported.
        if (opened._starttag)
        {
            // Source text is only extracted on request, and capped to the configured length.
            string sourceText = string.Empty;
            if (OptionExtractErrorSourceText)
            {
                sourceText = opened.OuterHtml;
                if (sourceText.Length > OptionExtractErrorSourceTextMaxLength)
                {
                    sourceText = sourceText.Substring(0, OptionExtractErrorSourceTextMaxLength);
                }
            }

            AddError(
                HtmlParseErrorCode.TagNotClosed,
                opened._line,
                opened._lineposition,
                opened._streamposition,
                sourceText,
                "End tag </" + opened.Name + "> was not found");
        }
    }

    // we don't need this anymore
    _openednodes.Clear();
}
|
|
|
|
/// <summary>
/// Loads the HTML document from the specified string.
/// </summary>
/// <param name="html">String containing the HTML document to load. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
public void LoadHtml(string html)
{
    if (html == null)
    {
        throw new ArgumentNullException("html");
    }

    // 'using' disposes the reader on every exit path, including when loading throws.
    using (StringReader sr = new StringReader(html))
    {
        Load(sr);
    }
}
|
|
|
|
/// <summary>
/// Saves the HTML document to the specified stream, using the document's output encoding.
/// </summary>
/// <remarks>
/// The stream is flushed into but left open; closing it remains the caller's responsibility.
/// </remarks>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> is null.</exception>
public void Save(Stream outStream)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }

    StreamWriter sw = new StreamWriter(outStream, GetOutEncoding());
    Save(sw);

    // StreamWriter buffers its output. Without an explicit flush the tail of the document
    // (or all of it, for small documents) would never reach the stream, because the writer
    // is deliberately not disposed here - disposing it would close the caller's stream.
    sw.Flush();
}
|
|
|
|
/// <summary>
/// Saves the HTML document to the specified stream, using the given character encoding.
/// </summary>
/// <remarks>
/// The stream is flushed into but left open; closing it remains the caller's responsibility.
/// </remarks>
/// <param name="outStream">The stream to which you want to save. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="outStream"/> or <paramref name="encoding"/> is null.</exception>
public void Save(Stream outStream, Encoding encoding)
{
    if (outStream == null)
    {
        throw new ArgumentNullException("outStream");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    StreamWriter sw = new StreamWriter(outStream, encoding);
    Save(sw);

    // StreamWriter buffers its output. Without an explicit flush the tail of the document
    // (or all of it, for small documents) would never reach the stream, because the writer
    // is deliberately not disposed here - disposing it would close the caller's stream.
    sw.Flush();
}
|
|
|
|
/// <summary>
/// Saves the mixed document to the specified file, using the document's output encoding.
/// An existing file is overwritten. The file is always closed again, even when saving throws.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document.</param>
public void Save(string filename)
{
    // 'using' flushes and closes the writer on every exit path, so a failure while
    // writing no longer leaves the file handle open.
    using (StreamWriter sw = new StreamWriter(filename, false, GetOutEncoding()))
    {
        Save(sw);
    }
}
|
|
|
|
/// <summary>
/// Saves the mixed document to the specified file, using the given character encoding.
/// An existing file is overwritten. The file is always closed again, even when saving throws.
/// </summary>
/// <param name="filename">The location of the file where you want to save the document. May not be null.</param>
/// <param name="encoding">The character encoding to use. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="filename"/> or <paramref name="encoding"/> is null.</exception>
public void Save(string filename, Encoding encoding)
{
    if (filename == null)
    {
        throw new ArgumentNullException("filename");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }

    // 'using' flushes and closes the writer on every exit path, so a failure while
    // writing no longer leaves the file handle open.
    using (StreamWriter sw = new StreamWriter(filename, false, encoding))
    {
        Save(sw);
    }
}
|
|
|
|
/// <summary>
/// Saves the HTML document to the specified StreamWriter.
/// Forwards to the TextWriter overload.
/// </summary>
/// <param name="writer">The StreamWriter to which you want to save.</param>
public void Save(StreamWriter writer)
{
    // Widen the static type explicitly so the TextWriter overload is selected
    // instead of this method calling itself.
    TextWriter target = writer;
    Save(target);
}
|
|
|
|
/// <summary>
/// Saves the HTML document to the specified TextWriter and flushes the writer.
/// The writer is not closed.
/// </summary>
/// <param name="writer">The TextWriter to which you want to save. May not be null.</param>
/// <exception cref="ArgumentNullException"><paramref name="writer"/> is null.</exception>
public void Save(TextWriter writer)
{
    if (writer == null)
    {
        throw new ArgumentNullException("writer");
    }

    DocumentNode.WriteTo(writer);

    // Push buffered output through, mirroring what the XmlWriter overload does, so that
    // callers that never dispose the writer still get the complete document.
    writer.Flush();
}
|
|
|
|
/// <summary>
/// Saves the HTML document to the specified XmlWriter and flushes the writer.
/// </summary>
/// <param name="writer">The XmlWriter to which you want to save.</param>
public void Save(XmlWriter writer)
{
    HtmlNode root = DocumentNode;
    root.WriteTo(writer);
    writer.Flush();
}
|
|
|
|
#endregion
|
|
|
|
#region Internal Methods
|
|
|
|
/// <summary>
/// Creates an unnamed attribute owned by this document.
/// </summary>
internal HtmlAttribute CreateAttribute()
{
    HtmlAttribute attribute = new HtmlAttribute(this);
    return attribute;
}
|
|
|
|
/// <summary>
/// Creates a node of the given type owned by this document, using -1 as its index.
/// </summary>
internal HtmlNode CreateNode(HtmlNodeType type)
{
    const int noIndex = -1;
    return CreateNode(type, noIndex);
}
|
|
|
|
/// <summary>
/// Creates a node of the given type owned by this document.
/// Comment and text types get their dedicated node classes; every other type is a plain HtmlNode.
/// </summary>
internal HtmlNode CreateNode(HtmlNodeType type, int index)
{
    if (type == HtmlNodeType.Comment)
    {
        return new HtmlCommentNode(this, index);
    }

    if (type == HtmlNodeType.Text)
    {
        return new HtmlTextNode(this, index);
    }

    return new HtmlNode(type, this, index);
}
|
|
|
|
/// <summary>
/// Picks the encoding used for output: the declared encoding when there is one,
/// otherwise the stream encoding, otherwise OptionDefaultStreamEncoding.
/// </summary>
internal Encoding GetOutEncoding()
{
    // The encoding declared inside the document wins.
    if (_declaredencoding != null)
    {
        return _declaredencoding;
    }

    // Next best: the encoding of the stream the document was read from.
    if (_streamencoding != null)
    {
        return _streamencoding;
    }

    return OptionDefaultStreamEncoding;
}
|
|
|
|
/// <summary>
/// Returns the first direct child of the document node named "?xml", or null when there is none.
/// </summary>
internal HtmlNode GetXmlDeclaration()
{
    if (_documentnode.HasChildNodes)
    {
        foreach (HtmlNode child in _documentnode._childnodes)
        {
            // it's ok, names are case sensitive
            if (child.Name == "?xml")
            {
                return child;
            }
        }
    }

    return null;
}
|
|
|
|
/// <summary>
/// Registers <paramref name="node"/> under the lowercased <paramref name="id"/> in the id table,
/// or removes that entry when <paramref name="node"/> is null.
/// Does nothing when id tracking is disabled, the table does not exist, or the id is null.
/// </summary>
internal void SetIdForNode(HtmlNode node, string id)
{
    if (!OptionUseIdAttribute || (_nodesid == null) || (id == null))
    {
        return;
    }

    string key = id.ToLower();
    if (node != null)
    {
        _nodesid[key] = node;
    }
    else
    {
        _nodesid.Remove(key);
    }
}
|
|
|
|
/// <summary>
/// Moves the "last parent" cursor up to the nearest node (itself or an ancestor) that is not
/// closed. When no such node exists, the cursor falls back to the document node.
/// </summary>
internal void UpdateLastParentNode()
{
    // Testing for null before dereferencing makes the method safe to call when the
    // cursor has not been set yet; it then simply lands on the document node.
    while ((_lastparentnode != null) && (_lastparentnode.Closed))
    {
        _lastparentnode = _lastparentnode.ParentNode;
    }

    if (_lastparentnode == null)
    {
        _lastparentnode = _documentnode;
    }
}
|
|
|
|
#endregion
|
|
|
|
#region Private Methods
|
|
|
|
/// <summary>
/// Builds a parse error from the given details, appends it to the document's error list
/// and returns it.
/// </summary>
private HtmlParseError AddError(
    HtmlParseErrorCode code,
    int line,
    int linePosition,
    int streamPosition,
    string sourceText,
    string reason)
{
    HtmlParseError parseError =
        new HtmlParseError(code, line, linePosition, streamPosition, sourceText, reason);

    _parseerrors.Add(parseError);
    return parseError;
}
|
|
|
|
/// <summary>
/// Handles the current node as an end tag: pairs it with the last tracked node of the same
/// name when there is one, otherwise applies the fallback for its kind of element
/// (closed element, overlapping element, empty element, or unmatched end tag).
/// Parse errors are recorded where the end tag is not valid.
/// </summary>
private void CloseCurrentNode()
{
    // text or document are by def closed
    if (_currentnode.Closed)
    {
        return;
    }

    bool failed = false;

    // find last node of this kind
    HtmlNode opener = (HtmlNode) _lastnodes[_currentnode.Name];

    if (opener != null)
    {
        // With nested-tag fixing on, a resetter node found for the opener makes this end tag invalid.
        if (OptionFixNestedTags && FindResetterNodes(opener, GetResetters(_currentnode.Name)))
        {
            AddError(
                HtmlParseErrorCode.EndTagInvalidHere,
                _currentnode._line,
                _currentnode._lineposition,
                _currentnode._streamposition,
                _currentnode.OuterHtml,
                "End tag </" + _currentnode.Name + "> invalid here");
            failed = true;
        }

        if (!failed)
        {
            // Step the "last node with this name" entry back by one and close the opener.
            _lastnodes[_currentnode.Name] = opener._prevwithsamename;
            opener.CloseNode(_currentnode);
        }
    }
    else if (HtmlNode.IsClosedElement(_currentnode.Name))
    {
        // </br> will be seen as <br>
        _currentnode.CloseNode(_currentnode);

        // add to parent node
        if (_lastparentnode != null)
        {
            // Walk the parent's children from last to first, looking for a childless node
            // of the same name and remembering every node passed on the way.
            HtmlNode match = null;
            Stack<HtmlNode> skipped = new Stack<HtmlNode>();
            HtmlNode candidate = _lastparentnode.LastChild;
            while (candidate != null)
            {
                if ((candidate.Name == _currentnode.Name) && !candidate.HasChildNodes)
                {
                    match = candidate;
                    break;
                }
                skipped.Push(candidate);
                candidate = candidate.PreviousSibling;
            }

            if (match == null)
            {
                _lastparentnode.AppendChild(_currentnode);
            }
            else
            {
                // Re-parent the skipped nodes under the match; popping restores document order.
                while (skipped.Count > 0)
                {
                    HtmlNode moved = skipped.Pop();
                    _lastparentnode.RemoveChild(moved);
                    match.AppendChild(moved);
                }
            }
        }
    }
    else if (HtmlNode.CanOverlapElement(_currentnode.Name))
    {
        // this is a hack: add it as a text node covering the same outer span, lowercased
        HtmlNode closeNode = CreateNode(HtmlNodeType.Text, _currentnode._outerstartindex);
        closeNode._outerlength = _currentnode._outerlength;

        HtmlTextNode closeText = (HtmlTextNode) closeNode;
        closeText.Text = closeText.Text.ToLower();

        if (_lastparentnode != null)
        {
            _lastparentnode.AppendChild(closeNode);
        }
    }
    else if (HtmlNode.IsEmptyElement(_currentnode.Name))
    {
        // Reported, but not treated as a failure.
        AddError(
            HtmlParseErrorCode.EndTagNotRequired,
            _currentnode._line,
            _currentnode._lineposition,
            _currentnode._streamposition,
            _currentnode.OuterHtml,
            "End tag </" + _currentnode.Name + "> is not required");
    }
    else
    {
        // node cannot overlap, node is not empty
        AddError(
            HtmlParseErrorCode.TagNotOpened,
            _currentnode._line,
            _currentnode._lineposition,
            _currentnode._streamposition,
            _currentnode.OuterHtml,
            "Start tag <" + _currentnode.Name + "> was not found");
        failed = true;
    }

    if (failed)
    {
        return;
    }

    // we close this node, get grandparent
    if ((_lastparentnode != null) &&
        (!HtmlNode.IsClosedElement(_currentnode.Name) || _currentnode._starttag))
    {
        UpdateLastParentNode();
    }
}
|
|
|
|
/// <summary>
/// Reads the name of the attribute currently being parsed straight from the document text.
/// </summary>
/// <returns>The slice of <c>_text</c> described by the current attribute's name start index and name length.</returns>
private string CurrentAttributeName()
{
    int start = _currentattribute._namestartindex;
    int length = _currentattribute._namelength;
    return _text.Substring(start, length);
}
|
|
|
|
/// <summary>
/// Reads the value of the attribute currently being parsed straight from the document text.
/// </summary>
/// <returns>The slice of <c>_text</c> described by the current attribute's value start index and value length.</returns>
private string CurrentAttributeValue()
{
    int start = _currentattribute._valuestartindex;
    int length = _currentattribute._valuelength;
    return _text.Substring(start, length);
}
|
|
|
|
/// <summary>
/// Reads the inner text span of the node currently being parsed.
/// </summary>
/// <returns>The slice of <c>_text</c> described by the current node's inner start index and inner length.</returns>
private string CurrentNodeInner()
{
    int start = _currentnode._innerstartindex;
    int length = _currentnode._innerlength;
    return _text.Substring(start, length);
}
|
|
|
|
/// <summary>
/// Reads the name of the node currently being parsed, exactly as it is written in the document text.
/// </summary>
/// <returns>The slice of <c>_text</c> described by the current node's name start index and name length (casing untouched).</returns>
private string CurrentNodeName()
{
    int start = _currentnode._namestartindex;
    int length = _currentnode._namelength;
    return _text.Substring(start, length);
}
|
|
|
|
/// <summary>
/// Reads the outer text span of the node currently being parsed.
/// </summary>
/// <returns>The slice of <c>_text</c> described by the current node's outer start index and outer length.</returns>
private string CurrentNodeOuter()
{
    int start = _currentnode._outerstartindex;
    int length = _currentnode._outerlength;
    return _text.Substring(start, length);
}
|
|
|
|
|
|
/// <summary>
/// Moves the parser one character backwards, keeping the line / column counters in step.
/// </summary>
private void DecrementPosition()
{
    _index--;

    if (_lineposition != 1)
    {
        // still on the same line: just step one column back
        _lineposition--;
        return;
    }

    // we were on the first column: go back to the previous line, restoring
    // the column that IncrementPosition saved in _maxlineposition
    _lineposition = _maxlineposition;
    _line--;
}
|
|
|
|
/// <summary>
/// Looks up the last node registered under <paramref name="name"/> and returns it only if it
/// can act as a resetter for <paramref name="node"/>.
/// </summary>
/// <param name="node">The node a resetter is searched for.</param>
/// <param name="name">The name of the candidate resetter element.</param>
/// <returns>
/// The last node recorded under <paramref name="name"/> when it exists, is not closed and its stream
/// position is not before the one of <paramref name="node"/>; otherwise <c>null</c>.
/// </returns>
private HtmlNode FindResetterNode(HtmlNode node, string name)
{
    HtmlNode candidate = (HtmlNode) _lastnodes[name];

    bool usable = (candidate != null)
                  && !candidate.Closed
                  && !(candidate._streamposition < node._streamposition);

    return usable ? candidate : null;
}
|
|
|
|
/// <summary>
/// Tells whether at least one of the given element names currently provides a resetter for a node.
/// </summary>
/// <param name="node">The node a resetter is searched for.</param>
/// <param name="names">The candidate resetter element names; may be <c>null</c>.</param>
/// <returns><c>true</c> as soon as <see cref="FindResetterNode"/> succeeds for one name; <c>false</c> otherwise or when <paramref name="names"/> is <c>null</c>.</returns>
private bool FindResetterNodes(HtmlNode node, string[] names)
{
    if (names != null)
    {
        foreach (string candidateName in names)
        {
            if (FindResetterNode(node, candidateName) != null)
            {
                return true;
            }
        }
    }

    return false;
}
|
|
|
|
/// <summary>
/// Closes the previous, still open node named <paramref name="name"/> unless a resetter node
/// has been opened since.
/// </summary>
/// <param name="name">The element name used to look up the previous node in <c>_lastnodes</c>.</param>
/// <param name="resetters">The element names that prevent the automatic close; when <c>null</c> nothing is done.</param>
private void FixNestedTag(string name, string[] resetters)
{
    if (resetters == null)
    {
        return;
    }

    HtmlNode previous = (HtmlNode) _lastnodes[name];

    // nothing to fix when there is no previous node of that name or it is already closed
    if ((previous == null) || previous.Closed)
    {
        return;
    }

    // a resetter was found after the previous node: leave it alone
    if (FindResetterNodes(previous, resetters))
    {
        return;
    }

    // close the previous node with a fake closer node (index -1) that is its own end node
    HtmlNode fakeCloser = new HtmlNode(previous.NodeType, this, -1);
    fakeCloser._endnode = fakeCloser;
    previous.CloseNode(fakeCloser);
}
|
|
|
|
/// <summary>
/// Applies the nested tag fix to the node currently being parsed, if it is a start tag.
/// </summary>
/// <remarks>
/// The tag name is lower-cased before it is used: the resetter table in <see cref="GetResetters"/>
/// is written in lower case and <c>_lastnodes</c> is keyed by the lower-case node name, so passing
/// the raw document casing (e.g. <c>LI</c>, <c>TD</c>) would silently skip the fix.
/// </remarks>
private void FixNestedTags()
{
    // we are only interested in start tags, not closing tags
    if (!_currentnode._starttag)
    {
        return;
    }

    // read the name from the text rather than from _currentnode.Name, and normalise its casing
    string name = CurrentNodeName().ToLowerInvariant();
    FixNestedTag(name, GetResetters(name));
}
|
|
|
|
/// <summary>
/// Gets the names of the elements that "reset" the nesting of a given element, i.e. the containers
/// inside which a new element of that name may legitimately appear without closing the previous one.
/// </summary>
/// <param name="name">The lower-case element name.</param>
/// <returns>The resetter element names, or <c>null</c> when the element takes no part in nested tag fixing.</returns>
private string[] GetResetters(string name)
{
    switch (name)
    {
        case "li":
            // both list containers start a new nesting level: an <li> inside a nested <ol>
            // must not close the <li> that contains that <ol>
            return new string[] {"ul", "ol"};

        case "tr":
            return new string[] {"table"};

        case "th":
        case "td":
            return new string[] {"tr", "table"};

        default:
            return null;
    }
}
|
|
|
|
/// <summary>
/// Moves the parser one character forward, feeding the current character to the checksum
/// (when one is being computed) and keeping the line / column counters in step.
/// </summary>
private void IncrementPosition()
{
    // REVIEW: DecrementPosition does not undo this checksum contribution
    if (_crc32 != null)
    {
        _crc32.AddToCRC32(_c);
    }

    _index++;

    // remember the column we are leaving so DecrementPosition can step back across a line break
    _maxlineposition = _lineposition;

    bool atLineFeed = (_c == 10);
    if (atLineFeed)
    {
        _line++;
        _lineposition = 1;
    }
    else
    {
        _lineposition++;
    }
}
|
|
|
|
/// <summary>
/// Checks whether the current character opens a new construct (<c>&lt;</c>) and, if so, switches the
/// parser to the matching state: server side code (<c>&lt;%</c>), comment (<c>&lt;!</c>) or a tag.
/// </summary>
/// <returns>
/// <c>false</c> when the current character is not <c>&lt;</c>; <c>true</c> when it was handled here
/// (including the case where parsing was stopped by <see cref="PushNodeEnd"/>).
/// </returns>
private bool NewCheck()
{
    if (_c != '<')
    {
        return false;
    }

    // _index already points at the character following '<'
    if (_index < _text.Length)
    {
        if (_text[_index] == '%')
        {
            // server side code may start in the middle of a tag: open the pending
            // attribute value / attribute name / node name at the '<' position
            switch (_state)
            {
                case ParseState.AttributeAfterEquals:
                    PushAttributeValueStart(_index - 1);
                    break;

                case ParseState.BetweenAttributes:
                    PushAttributeNameStart(_index - 1);
                    break;

                case ParseState.WhichTag:
                    PushNodeNameStart(true, _index - 1);
                    _state = ParseState.Tag;
                    break;
            }

            // remember where to come back to once "%>" is met
            _oldstate = _state;
            _state = ParseState.ServerSideCode;
            return true;
        }
    }

    // end the node in progress just before the '<'
    if (!PushNodeEnd(_index - 1, true))
    {
        // stop parsing
        _index = _text.Length;
        return true;
    }

    _state = ParseState.WhichTag;
    if ((_index - 1) <= (_text.Length - 2))
    {
        if (_text[_index] == '!')
        {
            PushNodeStart(HtmlNodeType.Comment, _index - 1);
            PushNodeNameStart(true, _index);
            PushNodeNameEnd(_index + 1);
            _state = ParseState.Comment;

            // a "full" comment starts with "<!--"; always assign the flag so that a value
            // left over from a previous comment cannot leak when the text is too short to tell
            _fullcomment = (_index < (_text.Length - 2)) &&
                           (_text[_index + 1] == '-') &&
                           (_text[_index + 2] == '-');
            return true;
        }
    }

    PushNodeStart(HtmlNodeType.Element, _index - 1);
    return true;
}
|
|
|
|
/// <summary>
/// Parses <c>_text</c> character by character with a state machine (see <see cref="ParseState"/>),
/// building nodes and attributes through the <c>Push*</c> helpers.
/// </summary>
/// <remarks>
/// Each iteration reads the character at <c>_index</c> into <c>_c</c> and then advances the position,
/// so inside the <c>switch</c> the current character is at <c>_index - 1</c> and <c>_index</c> is the
/// next one. Parsing stops early when <see cref="PushNodeEnd"/> returns <c>false</c>.
/// </remarks>
private void Parse()
{
    int lastquote = 0;
    if (OptionComputeChecksum)
    {
        _crc32 = new Crc32();
    }

    // reset the parsing state
    _lastnodes = new Hashtable();
    _c = 0;
    _fullcomment = false;
    _parseerrors = new List<HtmlParseError>();
    _line = 1;
    _lineposition = 1;
    _maxlineposition = 1;

    _state = ParseState.Text;
    _oldstate = _state;
    _documentnode._innerlength = _text.Length;
    _documentnode._outerlength = _text.Length;
    _remainderOffset = _text.Length;

    _lastparentnode = _documentnode;
    _currentnode = CreateNode(HtmlNodeType.Text, 0);
    _currentattribute = null;

    _index = 0;
    PushNodeStart(HtmlNodeType.Text, 0);
    while (_index < _text.Length)
    {
        _c = _text[_index];
        IncrementPosition();

        switch (_state)
        {
            case ParseState.Text:
                if (NewCheck())
                {
                    continue;
                }
                break;

            case ParseState.WhichTag:
                if (NewCheck())
                {
                    continue;
                }
                if (_c == '/')
                {
                    // closing tag: the name starts after the '/'
                    PushNodeNameStart(false, _index);
                }
                else
                {
                    // opening tag: the name starts at the current character, re-read it in Tag state
                    PushNodeNameStart(true, _index - 1);
                    DecrementPosition();
                }
                _state = ParseState.Tag;
                break;

            case ParseState.Tag:
                if (NewCheck())
                {
                    continue;
                }
                if (IsWhiteSpace(_c))
                {
                    PushNodeNameEnd(_index - 1);
                    if (_state != ParseState.Tag)
                    {
                        continue;
                    }
                    _state = ParseState.BetweenAttributes;
                    continue;
                }
                if (_c == '/')
                {
                    PushNodeNameEnd(_index - 1);
                    if (_state != ParseState.Tag)
                    {
                        continue;
                    }
                    _state = ParseState.EmptyTag;
                    continue;
                }
                if (_c == '>')
                {
                    PushNodeNameEnd(_index - 1);
                    if (_state != ParseState.Tag)
                    {
                        continue;
                    }
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.Tag)
                    {
                        continue;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                }
                break;

            case ParseState.BetweenAttributes:
                if (NewCheck())
                {
                    continue;
                }

                if (IsWhiteSpace(_c))
                {
                    continue;
                }

                if ((_c == '/') || (_c == '?'))
                {
                    _state = ParseState.EmptyTag;
                    continue;
                }

                if (_c == '>')
                {
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }

                    if (_state != ParseState.BetweenAttributes)
                    {
                        continue;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }

                PushAttributeNameStart(_index - 1);
                _state = ParseState.AttributeName;
                break;

            case ParseState.EmptyTag:
                if (NewCheck())
                {
                    continue;
                }

                if (_c == '>')
                {
                    if (!PushNodeEnd(_index, true))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }

                    if (_state != ParseState.EmptyTag)
                    {
                        continue;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                _state = ParseState.BetweenAttributes;
                break;

            case ParseState.AttributeName:
                if (NewCheck())
                {
                    continue;
                }

                if (IsWhiteSpace(_c))
                {
                    PushAttributeNameEnd(_index - 1);
                    _state = ParseState.AttributeBeforeEquals;
                    continue;
                }
                if (_c == '=')
                {
                    PushAttributeNameEnd(_index - 1);
                    _state = ParseState.AttributeAfterEquals;
                    continue;
                }
                if (_c == '>')
                {
                    PushAttributeNameEnd(_index - 1);
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.AttributeName)
                    {
                        continue;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.AttributeBeforeEquals:
                if (NewCheck())
                {
                    continue;
                }

                if (IsWhiteSpace(_c))
                {
                    continue;
                }
                if (_c == '>')
                {
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.AttributeBeforeEquals)
                    {
                        continue;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                if (_c == '=')
                {
                    _state = ParseState.AttributeAfterEquals;
                    continue;
                }
                // no equals, no whitespace, it's a new attribute starting:
                // step back so the character is re-read in BetweenAttributes state
                _state = ParseState.BetweenAttributes;
                DecrementPosition();
                break;

            case ParseState.AttributeAfterEquals:
                if (NewCheck())
                {
                    continue;
                }

                if (IsWhiteSpace(_c))
                {
                    continue;
                }

                if ((_c == '\'') || (_c == '"'))
                {
                    _state = ParseState.QuotedAttributeValue;
                    PushAttributeValueStart(_index, _c);
                    // remember which quote must close the value
                    lastquote = _c;
                    continue;
                }
                if (_c == '>')
                {
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.AttributeAfterEquals)
                    {
                        continue;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                PushAttributeValueStart(_index - 1);
                _state = ParseState.AttributeValue;
                break;

            case ParseState.AttributeValue:
                if (NewCheck())
                {
                    continue;
                }

                if (IsWhiteSpace(_c))
                {
                    PushAttributeValueEnd(_index - 1);
                    _state = ParseState.BetweenAttributes;
                    continue;
                }

                if (_c == '>')
                {
                    PushAttributeValueEnd(_index - 1);
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    if (_state != ParseState.AttributeValue)
                    {
                        continue;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.QuotedAttributeValue:
                if (_c == lastquote)
                {
                    PushAttributeValueEnd(_index - 1);
                    _state = ParseState.BetweenAttributes;
                    continue;
                }
                if (_c == '<')
                {
                    if (_index < _text.Length)
                    {
                        if (_text[_index] == '%')
                        {
                            // server side code inside a quoted value
                            _oldstate = _state;
                            _state = ParseState.ServerSideCode;
                            continue;
                        }
                    }
                }
                break;

            case ParseState.Comment:
                if (_c == '>')
                {
                    if (_fullcomment)
                    {
                        // a "<!--" comment only ends on "-->"
                        if ((_text[_index - 2] != '-') ||
                            (_text[_index - 3] != '-'))
                        {
                            continue;
                        }
                    }
                    if (!PushNodeEnd(_index, false))
                    {
                        // stop parsing
                        _index = _text.Length;
                        break;
                    }
                    _state = ParseState.Text;
                    PushNodeStart(HtmlNodeType.Text, _index);
                    continue;
                }
                break;

            case ParseState.ServerSideCode:
                if (_c == '%')
                {
                    if (_index < _text.Length)
                    {
                        if (_text[_index] == '>')
                        {
                            // "%>" found: go back to the state we were in before the "<%"
                            switch (_oldstate)
                            {
                                case ParseState.AttributeAfterEquals:
                                    _state = ParseState.AttributeValue;
                                    break;

                                case ParseState.BetweenAttributes:
                                    PushAttributeNameEnd(_index + 1);
                                    _state = ParseState.BetweenAttributes;
                                    break;

                                default:
                                    _state = _oldstate;
                                    break;
                            }
                            // skip the '>'
                            IncrementPosition();
                        }
                    }
                }
                break;

            case ParseState.PcData:
                // look for </tag + 1 char

                // the closing tag can only start on a '<': bail out early so that no
                // temporary strings are allocated for every other character of the content
                if (_c != '<')
                {
                    break;
                }

                // check buffer end
                if ((_currentnode._namelength + 3) <= (_text.Length - (_index - 1)))
                {
                    if (string.Compare(_text.Substring(_index - 1, _currentnode._namelength + 2),
                                       "</" + _currentnode.Name, true) == 0)
                    {
                        int c = _text[_index - 1 + 2 + _currentnode.Name.Length];
                        if ((c == '>') || (IsWhiteSpace(c)))
                        {
                            // add the script as a text node
                            HtmlNode script = CreateNode(HtmlNodeType.Text,
                                                         _currentnode._outerstartindex +
                                                         _currentnode._outerlength);
                            script._outerlength = _index - 1 - script._outerstartindex;
                            _currentnode.AppendChild(script);

                            // start the closing tag node; its name begins after "</"
                            PushNodeStart(HtmlNodeType.Element, _index - 1);
                            PushNodeNameStart(false, _index - 1 + 2);
                            _state = ParseState.Tag;
                            IncrementPosition();
                        }
                    }
                }
                break;
        }
    }

    // finish the current work
    if (_currentnode._namestartindex > 0)
    {
        PushNodeNameEnd(_index);
    }
    PushNodeEnd(_index, false);

    // we don't need this anymore
    _lastnodes.Clear();
}
|
|
|
|
/// <summary>
/// Marks the end of the current attribute's name and adds the attribute to the current node.
/// </summary>
/// <param name="index">The index in the text where the attribute name ends; the name length is computed from it.</param>
private void PushAttributeNameEnd(int index)
{
    int nameLength = index - _currentattribute._namestartindex;
    _currentattribute._namelength = nameLength;

    _currentnode.Attributes.Append(_currentattribute);
}
|
|
|
|
/// <summary>
/// Starts a new attribute whose name begins at the given index and makes it the current attribute.
/// </summary>
/// <param name="index">The index in the text where the attribute name starts; also recorded as its stream position.</param>
private void PushAttributeNameStart(int index)
{
    _currentattribute = CreateAttribute();

    // where the attribute lives in the text ...
    _currentattribute._namestartindex = index;
    // ... and where the parser currently is in terms of line / column
    _currentattribute.Line = _line;
    _currentattribute._lineposition = _lineposition;
    _currentattribute._streamposition = index;
}
|
|
|
|
/// <summary>
/// Marks the end of the current attribute's value.
/// </summary>
/// <param name="index">The index in the text where the value ends; the value length is computed from it.</param>
private void PushAttributeValueEnd(int index)
{
    int valueLength = index - _currentattribute._valuestartindex;
    _currentattribute._valuelength = valueLength;
}
|
|
|
|
/// <summary>
/// Marks the start of the current attribute's value when no quote character introduces it.
/// </summary>
/// <param name="index">The index in the text where the value starts.</param>
private void PushAttributeValueStart(int index)
{
    // 0 stands for "no quote character"
    const int noQuote = 0;
    PushAttributeValueStart(index, noQuote);
}
|
|
|
|
/// <summary>
/// Marks the start of the current attribute's value and records a single quote delimiter.
/// </summary>
/// <param name="index">The index in the text where the value starts.</param>
/// <param name="quote">The character that introduced the value; only a single quote changes the attribute's quote type.</param>
private void PushAttributeValueStart(int index, int quote)
{
    _currentattribute._valuestartindex = index;

    bool singleQuoted = (quote == '\'');
    if (singleQuoted)
    {
        _currentattribute.QuoteType = AttributeValueQuote.SingleQuote;
    }
}
|
|
|
|
/// <summary>
/// Ends the node currently being parsed: computes its outer length, attaches it to the last parent
/// node and, when required, closes it.
/// </summary>
/// <param name="index">The index in the text where the node ends.</param>
/// <param name="close"><c>true</c> to close the node even if it is a start tag.</param>
/// <returns>
/// <c>false</c> when the node matches <c>OptionStopperNodeName</c> and parsing must stop
/// (the rest of the text is stored in <c>_remainder</c>); <c>true</c> otherwise.
/// </returns>
private bool PushNodeEnd(int index, bool close)
{
    _currentnode._outerlength = index - _currentnode._outerstartindex;

    switch (_currentnode._nodetype)
    {
        case HtmlNodeType.Text:
        case HtmlNodeType.Comment:
            // text and comments have identical inner and outer spans; empty ones are dropped
            if (_currentnode._outerlength > 0)
            {
                _currentnode._innerlength = _currentnode._outerlength;
                _currentnode._innerstartindex = _currentnode._outerstartindex;
                if (_lastparentnode != null)
                {
                    _lastparentnode.AppendChild(_currentnode);
                }
            }
            break;

        default:
            if ((_currentnode._starttag) && (_lastparentnode != _currentnode))
            {
                // attach the new start tag to its parent
                if (_lastparentnode != null)
                {
                    _lastparentnode.AppendChild(_currentnode);
                }

                ReadDocumentEncoding(_currentnode);

                // chain this node to the previous one of the same name and make it the latest
                HtmlNode previousSameName = (HtmlNode) _lastnodes[_currentnode.Name];
                _currentnode._prevwithsamename = previousSameName;
                _lastnodes[_currentnode.Name] = _currentnode;

                // documents and elements become the parent of what follows
                if ((_currentnode.NodeType == HtmlNodeType.Document) ||
                    (_currentnode.NodeType == HtmlNodeType.Element))
                {
                    _lastparentnode = _currentnode;
                }

                // CDATA elements: switch to PcData and leave the node open
                if (HtmlNode.IsCDataElement(CurrentNodeName()))
                {
                    _state = ParseState.PcData;
                    return true;
                }

                // closed and empty elements are closed right away
                if ((HtmlNode.IsClosedElement(_currentnode.Name)) ||
                    (HtmlNode.IsEmptyElement(_currentnode.Name)))
                {
                    close = true;
                }
            }
            break;
    }

    bool mustClose = (close) || (!_currentnode._starttag);
    if (!mustClose)
    {
        return true;
    }

    bool isStopper = (OptionStopperNodeName != null) && (_remainder == null) &&
                     (string.Compare(_currentnode.Name, OptionStopperNodeName, true) == 0);
    if (isStopper)
    {
        // keep everything from here on as the remainder
        _remainderOffset = index;
        _remainder = _text.Substring(_remainderOffset);
    }

    CloseCurrentNode();

    // false tells the caller to stop parsing
    return !isStopper;
}
|
|
|
|
/// <summary>
/// Marks the end of the current node's name and, when <c>OptionFixNestedTags</c> is set,
/// runs the nested tag fix.
/// </summary>
/// <param name="index">The index in the text where the node name ends; the name length is computed from it.</param>
private void PushNodeNameEnd(int index)
{
    int nameLength = index - _currentnode._namestartindex;
    _currentnode._namelength = nameLength;

    if (!OptionFixNestedTags)
    {
        return;
    }

    FixNestedTags();
}
|
|
|
|
/// <summary>
/// Marks the start of the current node's name and records whether the node is a start tag.
/// </summary>
/// <param name="starttag"><c>true</c> for a start tag, <c>false</c> for a closing tag.</param>
/// <param name="index">The index in the text where the node name starts.</param>
private void PushNodeNameStart(bool starttag, int index)
{
    _currentnode._namestartindex = index;
    _currentnode._starttag = starttag;
}
|
|
|
|
/// <summary>
/// Creates a new node starting at the given index, makes it the current node and records
/// the parser's line / column / stream position on it.
/// </summary>
/// <param name="type">The type of node to create.</param>
/// <param name="index">The index in the text where the node starts; also recorded as its stream position.</param>
private void PushNodeStart(HtmlNodeType type, int index)
{
    _currentnode = CreateNode(type, index);
    _currentnode._streamposition = index;
    _currentnode._line = _line;

    // elements are reported one column before the current line position
    if (type == HtmlNodeType.Element)
    {
        _currentnode._lineposition = _lineposition - 1;
    }
    else
    {
        _currentnode._lineposition = _lineposition;
    }
}
|
|
|
|
/// <summary>
/// Reads the encoding declared by a
/// <c>&lt;meta http-equiv="content-type" content="text/html;charset=..." /&gt;</c> node into
/// <c>_declaredencoding</c>, and reports a mismatch with the stream encoding as a parse error.
/// </summary>
/// <remarks>
/// Does nothing when <c>OptionReadEncoding</c> is off, when the node is not such a meta node, or when
/// the declared charset is empty or not recognised by <see cref="Encoding.GetEncoding(string)"/>
/// (a bogus charset in the document must not abort the whole parse).
/// </remarks>
/// <param name="node">The node that has just been ended; its attributes are already populated.</param>
/// <exception cref="EncodingFoundException">
/// Thrown with the declared encoding when only encoding detection was requested (<c>_onlyDetectEncoding</c>).
/// </exception>
private void ReadDocumentEncoding(HtmlNode node)
{
    if (!OptionReadEncoding)
    {
        return;
    }

    // quick check on the name length first, avoids a string allocation
    if (node._namelength != 4)
    {
        return;
    }

    // all node names are lowercase
    if (node.Name != "meta")
    {
        return;
    }

    HtmlAttribute att = node.Attributes["http-equiv"];
    if (att == null)
    {
        return;
    }

    if (string.Compare(att.Value, "content-type", true) != 0)
    {
        return;
    }

    HtmlAttribute content = node.Attributes["content"];
    if (content == null)
    {
        return;
    }

    string charset = NameValuePairList.GetNameValuePairsValue(content.Value, "charset");
    if (charset == null)
    {
        return;
    }

    charset = charset.Trim();
    if (charset.Length == 0)
    {
        return;
    }

    Encoding declared;
    try
    {
        declared = Encoding.GetEncoding(charset);
    }
    catch (ArgumentException)
    {
        // unknown or unsupported charset name: ignore the declaration
        return;
    }

    _declaredencoding = declared;

    if (_onlyDetectEncoding)
    {
        throw new EncodingFoundException(_declaredencoding);
    }

    if (_streamencoding == null)
    {
        return;
    }

    if (_declaredencoding.WindowsCodePage != _streamencoding.WindowsCodePage)
    {
        AddError(
            HtmlParseErrorCode.CharsetMismatch,
            _line, _lineposition,
            _index, node.OuterHtml,
            "Encoding mismatch between StreamEncoding: " +
            _streamencoding.WebName + " and DeclaredEncoding: " +
            _declaredencoding.WebName);
    }
}
|
|
|
|
#endregion
|
|
|
|
#region Nested type: ParseState
|
|
|
|
/// <summary>
/// The states of the character-by-character parser.
/// </summary>
private enum ParseState
{
    // outside of any markup
    Text,

    // a '<' has just been read, the kind of tag is not known yet
    WhichTag,

    // reading a tag name
    Tag,

    // inside a tag, between two attributes
    BetweenAttributes,

    // a '/' or '?' has been read inside a tag, a '>' is expected
    EmptyTag,

    // reading an attribute name
    AttributeName,

    // whitespace has been read after an attribute name, an '=' may follow
    AttributeBeforeEquals,

    // an '=' has been read, the attribute value is expected
    AttributeAfterEquals,

    // reading an unquoted attribute value
    AttributeValue,

    // inside a "<!" construct
    Comment,

    // reading an attribute value delimited by quotes
    QuotedAttributeValue,

    // inside a "<%" ... "%>" block
    ServerSideCode,

    // inside the content of a CDATA element, looking for its closing tag
    PcData
}
|
|
|
|
#endregion
|
|
}
|
|
} |