gecko/parser/htmlparser/public/nsHTMLTokens.h
Nicholas Nethercote 1c40305ba0 Bug 847248 (part 9) - Remove nsFixedSizeAllocator from parser/htmlparser/. r=hsivonen.
--HG--
extra : rebase_source : 9a9bbdeb87a2d7a3739777ad152937c09caa82d1
2013-03-05 16:51:40 -08:00

377 lines
12 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/**
* MODULE NOTES:
* @update gess 4/1/98
*
* This file contains the declarations for all the HTML specific token types that
* our DTDs understand. In fact, the same set of token types is used for XML.
* Currently we have tokens for text, comments, start and end tags, entities,
* attributes, style, script and skipped content. Whitespace and newlines also
* have their own token types, but don't count on them to stay forever.
*
* If you're looking for the html tags, they're in a file called nsHTMLTags.h/cpp.
*
* Most of the token types have a similar API. They have methods to get the type
* of token (GetTokenType); those that represent HTML tags also have a method to
* get the tag type (GetTypeID). In addition, most have a method called Consume
* that lets the token help in the parsing process. We've also thrown in a
* few standard debugging methods.
*/
#ifndef HTMLTOKENS_H
#define HTMLTOKENS_H
#include "nsToken.h"
#include "nsHTMLTags.h"
#include "nsString.h"
#include "nsScannerString.h"
class nsScanner;
/*******************************************************************
* This enum defines the set of token types that we currently support.
*******************************************************************/
enum eHTMLTokenTypes {
  eToken_unknown = 0,

  // Concrete token kinds; numbering starts at 1 and increases by one.
  eToken_start = 1,
  eToken_end,
  eToken_comment,
  eToken_entity,
  eToken_whitespace,
  eToken_newline,
  eToken_text,
  eToken_attribute,
  eToken_instruction,
  eToken_cdatasection,
  eToken_doctypeDecl,
  eToken_markupDecl,

  // Sentinel: must remain the final enumerator.
  eToken_last
};
// Free-function declarations; their definitions are not part of this header.
nsresult ConsumeQuotedString(PRUnichar aChar,
                             nsString& aString,
                             nsScanner& aScanner);
nsresult ConsumeAttributeText(PRUnichar aChar,
                              nsString& aString,
                              nsScanner& aScanner);
const PRUnichar* GetTagName(int32_t aTag);
//int32_t FindEntityIndex(nsString& aString,int32_t aCount=-1);
/**
* This declares the basic token type used in the HTML DTD's.
* @update gess 3/25/98
*/
class CHTMLToken : public CToken {
public:
  virtual ~CHTMLToken();
  CHTMLToken(eHTMLTags aTag);

  // Base-class defaults for container info: always reports eFormUnknown,
  // and the setter is a no-op. Subclasses may override both.
  virtual eContainerInfo GetContainerInfo() const { return eFormUnknown; }
  virtual void SetContainerInfo(eContainerInfo aInfo) {}

protected:
};
/**
* This declares start tokens, which always take the form <xxxx>.
* This class also knows how to consume related attributes.
*
* @update gess 3/25/98
*/
class CStartToken : public CHTMLToken {
public:
  CStartToken(eHTMLTags aTag = eHTMLTag_unknown);
  CStartToken(const nsAString& aString);
  CStartToken(const nsAString& aName, eHTMLTags aTag);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTypeID();
  virtual int32_t GetTokenType();

  virtual bool IsEmpty();
  virtual void SetEmpty(bool aValue);

  virtual const nsSubstring& GetStringValue();
  virtual void GetSource(nsString& anOutputString);
  virtual void AppendSourceTo(nsAString& anOutputString);

  // Well-formedness state carried by start tags.
  virtual eContainerInfo GetContainerInfo() const { return mContainerInfo; }

  // Stores aContainerInfo only while the current state is still
  // eFormUnknown; once a value has been recorded, later calls are ignored.
  virtual void SetContainerInfo(eContainerInfo aContainerInfo) {
    if (mContainerInfo != eFormUnknown) {
      return;
    }
    mContainerInfo = aContainerInfo;
  }

  // True only when the recorded container info is eWellFormed.
  virtual bool IsWellFormed() const {
    return mContainerInfo == eWellFormed;
  }

  nsString mTextValue;

protected:
  eContainerInfo mContainerInfo;
  bool mEmpty;
#ifdef DEBUG
  // Present in DEBUG builds only.
  bool mAttributed;
#endif
};
/**
* This declares end tokens, which always take the
* form </xxxx>. This class also knows how to consume
* related attributes.
*
* @update gess 3/25/98
*/
class CEndToken : public CHTMLToken {
public:
  // Construction: by tag id, by string, or by name plus tag id.
  CEndToken(eHTMLTags aTag);
  CEndToken(const nsAString& aString);
  CEndToken(const nsAString& aName, eHTMLTags aTag);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTypeID();
  virtual int32_t GetTokenType();

  virtual const nsSubstring& GetStringValue();
  virtual void GetSource(nsString& anOutputString);
  virtual void AppendSourceTo(nsAString& anOutputString);

protected:
  nsString mTextValue;
};
/**
* This declares comment tokens. Comments are not usually
* thought of as tokens, but we treat them that way
* here so that the parser can have a consistent view
* of all tokens.
*
* @update gess 3/25/98
*/
class CCommentToken : public CHTMLToken {
public:
  CCommentToken();
  CCommentToken(const nsAString& aString);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();
  virtual const nsSubstring& GetStringValue();
  virtual void AppendSourceTo(nsAString& anOutputString);

  // Two non-virtual consumption entry points, one per comment-parsing mode.
  nsresult ConsumeStrictComment(nsScanner& aScanner);
  nsresult ConsumeQuirksComment(nsScanner& aScanner);

protected:
  // Comment text without the MDO & MDC delimiters.
  nsScannerSubstring mComment;
  // Comment text with the MDO & MDC delimiters included.
  nsScannerSubstring mCommentDecl;
};
/**
* This class declares entity tokens, which always take
* the form &xxxx;. This class also offers a few utility
* methods that allow you to easily reduce entities.
*
* @update gess 3/25/98
*/
class CEntityToken : public CHTMLToken {
public:
  CEntityToken();
  CEntityToken(const nsAString& aString);

  virtual int32_t GetTokenType();

  // Instance overload; a static overload taking an explicit value is
  // declared further down.
  int32_t TranslateToUnicodeStr(nsString& aString);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);

  // Static helpers usable without a token instance.
  static nsresult ConsumeEntity(PRUnichar aChar,
                                nsString& aString,
                                nsScanner& aScanner);
  static int32_t TranslateToUnicodeStr(int32_t aValue, nsString& aString);

  virtual const nsSubstring& GetStringValue();
  virtual void GetSource(nsString& anOutputString);
  virtual void AppendSourceTo(nsAString& anOutputString);

protected:
  nsString mTextValue;
};
/**
* Whitespace tokens are used where whitespace can be
* detected as distinct from text. This allows us to
* easily skip leading/trailing whitespace when desired.
*
* @update gess 3/25/98
*/
class CWhitespaceToken : public CHTMLToken {
public:
  CWhitespaceToken();
  CWhitespaceToken(const nsAString& aString);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();
  virtual const nsSubstring& GetStringValue();

protected:
  // Token text, held as a scanner shared substring.
  nsScannerSharedSubstring mTextValue;
};
/**
* Text tokens contain the normalized form of html text.
* These tokens are guaranteed not to contain entities,
* start or end tags, or newlines.
*
* @update gess 3/25/98
*/
class CTextToken : public CHTMLToken {
public:
  CTextToken();
  CTextToken(const nsAString& aString);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();
  virtual int32_t GetTextLength();
  virtual void CopyTo(nsAString& aStr);
  virtual const nsSubstring& GetStringValue();

  // Bind overloads: one takes a scanner plus an iterator pair, the other
  // takes a string.
  virtual void Bind(nsScanner* aScanner,
                    nsScannerIterator& aStart,
                    nsScannerIterator& aEnd);
  virtual void Bind(const nsAString& aStr);

  // Non-virtual consumers; each reports a result through its trailing
  // bool& out-parameter in addition to the nsresult return value.
  nsresult ConsumeCharacterData(bool aIgnoreComments,
                                nsScanner& aScanner,
                                const nsAString& aEndTagName,
                                int32_t aFlag,
                                bool& aFlushTokens);
  nsresult ConsumeParsedCharacterData(bool aDiscardFirstNewline,
                                      bool aConservativeConsume,
                                      nsScanner& aScanner,
                                      const nsAString& aEndTagName,
                                      int32_t aFlag,
                                      bool& aFound);

protected:
  nsScannerSubstring mTextValue;
};
/**
* CDATASection tokens contain raw unescaped text content delimited by
* a ![CDATA[ and ]].
* XXX Not really a HTML construct - maybe we need a separation
*
* @update vidur 11/12/98
*/
class CCDATASectionToken : public CHTMLToken {
public:
  // The tag-based constructor defaults to eHTMLTag_unknown.
  CCDATASectionToken(eHTMLTags aTag = eHTMLTag_unknown);
  CCDATASectionToken(const nsAString& aString);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();
  virtual const nsSubstring& GetStringValue();

protected:
  nsString mTextValue;
};
/**
* Declaration tokens contain raw unescaped text content (not really, but
* right now we use this only for view source).
* XXX Not really a HTML construct - maybe we need a separation
*
*/
class CMarkupDeclToken : public CHTMLToken {
public:
  CMarkupDeclToken();
  CMarkupDeclToken(const nsAString& aString);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();
  virtual const nsSubstring& GetStringValue();

protected:
  // Token text, held as a scanner substring.
  nsScannerSubstring mTextValue;
};
/**
* Attribute tokens are used to contain attribute key/value
* pairs wherever they may occur. Typically, they should
* occur only in start tokens. However, we may expand that
* ability when XML tokens become commonplace.
*
* @update gess 3/25/98
*/
class CAttributeToken : public CHTMLToken {
public:
  CAttributeToken();
  CAttributeToken(const nsAString& aString);
  CAttributeToken(const nsAString& aKey, const nsAString& aString);
  ~CAttributeToken() {}

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();

  // Key access: the getter exposes mTextKey as a string; the key can be
  // assigned from a string (SetKey) or from a scanner iterator pair (BindKey).
  const nsSubstring& GetKey() { return mTextKey.AsString(); }
  virtual void SetKey(const nsAString& aKey);
  virtual void BindKey(nsScanner* aScanner,
                       nsScannerIterator& aStart,
                       nsScannerIterator& aEnd);

  // Value access: exposes the string held by mTextValue.
  const nsSubstring& GetValue() { return mTextValue.str(); }

  virtual const nsSubstring& GetStringValue();
  virtual void GetSource(nsString& anOutputString);
  virtual void AppendSourceTo(nsAString& anOutputString);

  // Public flag; not assigned anywhere in this header.
  bool mHasEqualWithoutValue;

protected:
  nsScannerSharedSubstring mTextValue;
  nsScannerSubstring mTextKey;
};
/**
* Newline tokens contain, you guessed it, newlines.
* They consume newline (CR/LF) either alone or in pairs.
*
* @update gess 3/25/98
*/
class CNewlineToken : public CHTMLToken {
public:
  CNewlineToken();

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();
  virtual const nsSubstring& GetStringValue();

  // Class-level (static) allocate/free pair; this class declares no
  // per-instance data members.
  static void AllocNewline();
  static void FreeNewline();
};
/**
* Instruction tokens correspond to the eToken_instruction token type
* (presumably processing instructions of the form <?...?> -- TODO confirm
* against the tokenizer).
*
* @update gess 3/25/98
*/
class CInstructionToken : public CHTMLToken {
public:
  CInstructionToken();
  CInstructionToken(const nsAString& aString);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();
  virtual const nsSubstring& GetStringValue();

protected:
  // Token text, owned as an nsString.
  nsString mTextValue;
};
/**
* This token is generated by the HTML and Expat tokenizers
* when they see the doctype declaration ("<!DOCTYPE ... >")
*
*/
class CDoctypeDeclToken : public CHTMLToken {
public:
  // Both constructors default the tag to eHTMLTag_unknown.
  CDoctypeDeclToken(eHTMLTags aTag = eHTMLTag_unknown);
  CDoctypeDeclToken(const nsAString& aString,
                    eHTMLTags aTag = eHTMLTag_unknown);

  virtual nsresult Consume(PRUnichar aChar, nsScanner& aScanner, int32_t aMode);
  virtual int32_t GetTokenType();

  // String value accessor and mutator.
  virtual const nsSubstring& GetStringValue();
  virtual void SetStringValue(const nsAString& aStr);

protected:
  nsString mTextValue;
};
#endif