Bug 650784 part 1 - Introduce a new API for converting HTML to plain text. r=smaug.

This commit is contained in:
Henri Sivonen 2012-02-27 13:57:48 +02:00
parent 84b0480002
commit 3aacb86e06
14 changed files with 160 additions and 100 deletions

View File

@ -1144,12 +1144,34 @@ public:
* @param aSourceBuffer the string to parse as an HTML document
* @param aTargetDocument the document object to parse into. Must not have
* child nodes.
* @param aScriptingEnabledForNoscriptParsing whether <noscript> is parsed
* as if scripting was enabled
* @return NS_ERROR_DOM_INVALID_STATE_ERR if a re-entrant attempt to parse
* fragments is made, NS_ERROR_OUT_OF_MEMORY if aSourceBuffer is too
* long and NS_OK otherwise.
*/
static nsresult ParseDocumentHTML(const nsAString& aSourceBuffer,
nsIDocument* aTargetDocument);
nsIDocument* aTargetDocument,
bool aScriptingEnabledForNoscriptParsing);
/**
* Converts HTML source to plain text by parsing the source into a freshly
* created HTML document and running the plain text serializer
* (nsIDocumentEncoder for "text/plain") on the resulting tree.
*
* @param aSourceBuffer the string to parse as an HTML document
* @param aResultBuffer the string where the plain text result appears;
* may be the same string as aSourceBuffer
* @param aFlags Flags from nsIDocumentEncoder. If
* nsIDocumentEncoder::OutputNoScriptContent is set, <noscript>
* is parsed as if scripting was disabled; otherwise it is
* parsed as if scripting was enabled.
* @param aWrapCol Number of columns after which to line wrap; 0 for no
* auto-wrapping
* @return NS_ERROR_DOM_INVALID_STATE_ERR if a re-entrant attempt to parse
* fragments is made, NS_ERROR_OUT_OF_MEMORY if aSourceBuffer is too
* long, the failure code reported by document creation or by the
* serializer if one of those steps fails, and NS_OK otherwise.
*/
static nsresult ConvertToPlainText(const nsAString& aSourceBuffer,
nsAString& aResultBuffer,
PRUint32 aFlags,
PRUint32 aWrapCol);
/**
* Creates a new XML document, which is marked to be loaded as data.

View File

@ -3927,7 +3927,8 @@ nsContentUtils::ParseFragmentHTML(const nsAString& aSourceBuffer,
/* static */
nsresult
nsContentUtils::ParseDocumentHTML(const nsAString& aSourceBuffer,
nsIDocument* aTargetDocument)
nsIDocument* aTargetDocument,
bool aScriptingEnabledForNoscriptParsing)
{
if (nsContentUtils::sFragmentParsingActive) {
NS_NOTREACHED("Re-entrant fragment parsing attempted.");
@ -3941,7 +3942,8 @@ nsContentUtils::ParseDocumentHTML(const nsAString& aSourceBuffer,
}
nsresult rv =
sHTMLFragmentParser->ParseDocument(aSourceBuffer,
aTargetDocument);
aTargetDocument,
aScriptingEnabledForNoscriptParsing);
return rv;
}
@ -3992,6 +3994,44 @@ nsContentUtils::ParseFragmentXML(const nsAString& aSourceBuffer,
return rv;
}
// Converts HTML source text to plain text.
//
// The source is parsed into a throw-away HTML document (about:blank URI,
// null principal) and that document is then serialized with the
// "text/plain" document encoder.
//
// aSourceBuffer  the HTML source to parse
// aResultBuffer  receives the plain text; only written by the final encoding
//                step, after parsing has finished
// aFlags         nsIDocumentEncoder output flags; OutputNoScriptContent also
//                decides how <noscript> is parsed (see below)
// aWrapCol       wrap column handed to the encoder
//
// Returns the first failure reported by any step (URI creation, principal or
// encoder instantiation, document creation, parsing, encoder initialization,
// encoding), NS_ERROR_FAILURE if the created document does not expose
// nsIDocument, and otherwise the result of EncodeToString().
/* static */
nsresult
nsContentUtils::ConvertToPlainText(const nsAString& aSourceBuffer,
                                   nsAString& aResultBuffer,
                                   PRUint32 aFlags,
                                   PRUint32 aWrapCol)
{
  nsCOMPtr<nsIURI> uri;
  nsresult rv = NS_NewURI(getter_AddRefs(uri), "about:blank");
  NS_ENSURE_SUCCESS(rv, rv);

  nsCOMPtr<nsIPrincipal> principal =
    do_CreateInstance("@mozilla.org/nullprincipal;1", &rv);
  NS_ENSURE_SUCCESS(rv, rv);

  nsCOMPtr<nsIDOMDocument> domDocument;
  rv = nsContentUtils::CreateDocument(EmptyString(),
                                      EmptyString(),
                                      nsnull,
                                      uri,
                                      uri,
                                      principal,
                                      nsnull,
                                      DocumentFlavorHTML,
                                      getter_AddRefs(domDocument));
  NS_ENSURE_SUCCESS(rv, rv);

  // The parser dereferences the target document, so never hand it null.
  nsCOMPtr<nsIDocument> document = do_QueryInterface(domDocument);
  NS_ENSURE_TRUE(document, NS_ERROR_FAILURE);

  // When the caller asks for <noscript> content to be omitted from the
  // output, parse <noscript> as if scripting was disabled; otherwise parse
  // it as if scripting was enabled.
  rv = nsContentUtils::ParseDocumentHTML(aSourceBuffer, document,
    !(aFlags & nsIDocumentEncoder::OutputNoScriptContent));
  NS_ENSURE_SUCCESS(rv, rv);

  nsCOMPtr<nsIDocumentEncoder> encoder = do_CreateInstance(
    "@mozilla.org/layout/documentEncoder;1?type=text/plain", &rv);
  NS_ENSURE_SUCCESS(rv, rv);

  rv = encoder->Init(domDocument, NS_LITERAL_STRING("text/plain"), aFlags);
  NS_ENSURE_SUCCESS(rv, rv);

  // The result of SetWrapColumn is deliberately not checked, matching the
  // previous behavior of this function.
  encoder->SetWrapColumn(aWrapCol);

  return encoder->EncodeToString(aResultBuffer);
}
/* static */
nsresult

View File

@ -102,7 +102,7 @@ nsDOMParser::ParseFromString(const PRUnichar *str,
NS_ENSURE_SUCCESS(rv, rv);
nsCOMPtr<nsIDocument> document = do_QueryInterface(domDocument);
nsDependentString sourceBuffer(str);
rv = nsContentUtils::ParseDocumentHTML(sourceBuffer, document);
rv = nsContentUtils::ParseDocumentHTML(sourceBuffer, document, false);
NS_ENSURE_SUCCESS(rv, rv);
// Keep the XULXBL state, base URL and principal setting in sync with the

View File

@ -37,37 +37,19 @@
#include "TestHarness.h"
#include "nsIParser.h"
#include "nsIHTMLToTextSink.h"
#include "nsIParser.h"
#include "nsIContentSink.h"
#include "nsIParserService.h"
#include "nsServiceManagerUtils.h"
#include "nsStringGlue.h"
#include "nsParserCIID.h"
#include "nsIDocumentEncoder.h"
#include "nsCRT.h"
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
#include "nsIParserUtils.h"
#include "nsToolkitCompsCID.h"
void
ConvertBufToPlainText(nsString &aConBuf, int aFlag)
{
nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID);
if (parser) {
nsCOMPtr<nsIContentSink> sink;
sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
if (sink) {
nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
if (textSink) {
nsAutoString convertedText;
textSink->Initialize(&convertedText, aFlag, 72);
parser->SetContentSink(sink);
parser->Parse(aConBuf, 0, NS_LITERAL_CSTRING("text/html"), true);
aConBuf = convertedText;
}
}
}
nsCOMPtr<nsIParserUtils> utils =
do_GetService(NS_PARSERUTILS_CONTRACTID);
utils->ConvertToPlainText(aConBuf, aFlag, 72, aConBuf);
}
// Test for ASCII with format=flowed; delsp=yes

View File

@ -45,6 +45,9 @@ MODULE = html5
LIBRARY_NAME = html5p_s
LIBXUL_LIBRARY = 1
XPIDLSRCS = \
nsIParserUtils.idl \
$(NULL)
EXPORTS = \
jArray.h \

View File

@ -102,7 +102,8 @@ nsHtml5StringParser::ParseFragment(const nsAString& aSourceBuffer,
nsresult
nsHtml5StringParser::ParseDocument(const nsAString& aSourceBuffer,
nsIDocument* aTargetDoc)
nsIDocument* aTargetDoc,
bool aScriptingEnabledForNoscriptParsing)
{
MOZ_ASSERT(!aTargetDoc->GetFirstChild());
@ -116,7 +117,7 @@ nsHtml5StringParser::ParseDocument(const nsAString& aSourceBuffer,
mExecutor->PreventScriptExecution();
Tokenize(aSourceBuffer, aTargetDoc, false);
Tokenize(aSourceBuffer, aTargetDoc, aScriptingEnabledForNoscriptParsing);
return NS_OK;
}

View File

@ -86,7 +86,8 @@ class nsHtml5StringParser : public nsParserBase
*
*/
nsresult ParseDocument(const nsAString& aSourceBuffer,
nsIDocument* aTargetDoc);
nsIDocument* aTargetDoc,
bool aScriptingEnabledForNoscriptParsing);
private:

View File

@ -0,0 +1,25 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
* You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsISupports.idl"
/**
* Provides non-Web HTML parser functionality to Firefox extensions and
* XULRunner apps.
* Don't use this from within Gecko--use nsContentUtils directly instead.
*/
[scriptable, uuid(290f49bb-0619-4bda-8006-ab31bec7231a)]
interface nsIParserUtils : nsISupports
{
/**
* Convert HTML to plain text.
*
* @param src the HTML source to parse (C++ callers are allowed but not
* required to use the same string for the return value.)
* @param flags conversion option flags defined in nsIDocumentEncoder
* @param wrapCol number of characters per line; 0 for no auto-wrapping
* @return the plain text produced from src
*/
AString convertToPlainText(in AString src,
in unsigned long flags,
in unsigned long wrapCol);
};

View File

@ -96,6 +96,9 @@
#define NS_URLCLASSIFIERHASHCOMPLETER_CONTRACTID \
"@mozilla.org/url-classifier/hashcompleter;1"
#define NS_PARSERUTILS_CONTRACTID \
"@mozilla.org/parserutils;1"
#define NS_SCRIPTABLEUNESCAPEHTML_CONTRACTID "@mozilla.org/feed-unescapehtml;1"
#define NS_NAVHISTORYSERVICE_CONTRACTID \
@ -179,6 +182,9 @@
#define NS_URLCLASSIFIERUTILS_CID \
{ 0xb7b2ccec, 0x7912, 0x4ea6, { 0xa5, 0x48, 0xb0, 0x38, 0x44, 0x70, 0x04, 0xbd} }
#define NS_PARSERUTILS_CID \
{ 0xaf7b24cb, 0x893f, 0x41bb, { 0x96, 0x1f, 0x5a, 0x69, 0x38, 0x8e, 0x27, 0xc3 } }
// {10f2f5f0-f103-4901-980f-ba11bd70d60d}
#define NS_SCRIPTABLEUNESCAPEHTML_CID \
{ 0x10f2f5f0, 0xf103, 0x4901, { 0x98, 0x0f, 0xba, 0x11, 0xbd, 0x70, 0xd6, 0x0d} }

View File

@ -140,6 +140,7 @@ NS_DEFINE_NAMED_CID(NS_URLCLASSIFIERSTREAMUPDATER_CID);
NS_DEFINE_NAMED_CID(NS_URLCLASSIFIERUTILS_CID);
#endif
#ifdef MOZ_FEEDS
NS_DEFINE_NAMED_CID(NS_PARSERUTILS_CID);
NS_DEFINE_NAMED_CID(NS_SCRIPTABLEUNESCAPEHTML_CID);
#endif
NS_DEFINE_NAMED_CID(NS_BROWSERSTATUSFILTER_CID);
@ -165,6 +166,7 @@ static const mozilla::Module::CIDEntry kToolkitCIDs[] = {
{ &kNS_URLCLASSIFIERUTILS_CID, false, NULL, nsUrlClassifierUtilsConstructor },
#endif
#ifdef MOZ_FEEDS
{ &kNS_PARSERUTILS_CID, false, NULL, nsScriptableUnescapeHTMLConstructor },
{ &kNS_SCRIPTABLEUNESCAPEHTML_CID, false, NULL, nsScriptableUnescapeHTMLConstructor },
#endif
{ &kNS_BROWSERSTATUSFILTER_CID, false, NULL, nsBrowserStatusFilterConstructor },
@ -193,6 +195,7 @@ static const mozilla::Module::ContractIDEntry kToolkitContracts[] = {
{ NS_URLCLASSIFIERUTILS_CONTRACTID, &kNS_URLCLASSIFIERUTILS_CID },
#endif
#ifdef MOZ_FEEDS
{ NS_PARSERUTILS_CONTRACTID, &kNS_PARSERUTILS_CID },
{ NS_SCRIPTABLEUNESCAPEHTML_CONTRACTID, &kNS_SCRIPTABLEUNESCAPEHTML_CID },
#endif
{ NS_BROWSERSTATUSFILTER_CONTRACTID, &kNS_BROWSERSTATUSFILTER_CID },

View File

@ -41,20 +41,31 @@ interface nsIDOMDocumentFragment;
interface nsIURI;
/**
* A utility class that unescapes HTML strings.
* A utility class for HTML parsing in the feed processor.
*/
[scriptable, uuid(3ab244a9-f09d-44da-9e3f-ee4d67367f2d)]
interface nsIScriptableUnescapeHTML : nsISupports
{
/**
* Converts all entities to Unicode.
* Converts HTML to plain text. This is equivalent to calling
* nsIParserUtils::convertToPlainText(src,
* nsIDocumentEncoder::OutputSelectionOnly |
* nsIDocumentEncoder::OutputAbsoluteLinks, 0).
*
* @param src The HTML string to escape.
* You should most likely call nsIParserUtils::convertToPlainText()
* instead of calling this method.
*
* @param src The HTML string to convert to plain text.
*/
AString unescape(in AString src);
/**
* Appends the text to the element.
* Parses markup into a sanitized document fragment.
*
* @param fragment the input markup
* @param isXML true if |fragment| is XML and false if HTML
* @param baseURI the base URL for this fragment
* @param element the context node for the fragment parsing algorithm
*/
nsIDOMDocumentFragment parseFragment(in AString fragment,
in boolean isXML,

View File

@ -49,7 +49,6 @@
#include "nsParserCIID.h"
#include "nsContentUtils.h"
#include "nsIContentSink.h"
#include "nsIHTMLToTextSink.h"
#include "nsIDocumentEncoder.h"
#include "nsIDOMDocumentFragment.h"
#include "nsIFragmentContentSink.h"
@ -70,42 +69,35 @@
#define XHTML_DIV_TAG "div xmlns=\"http://www.w3.org/1999/xhtml\""
NS_IMPL_ISUPPORTS1(nsScriptableUnescapeHTML, nsIScriptableUnescapeHTML)
NS_IMPL_ISUPPORTS2(nsScriptableUnescapeHTML,
nsIScriptableUnescapeHTML,
nsIParserUtils)
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
// From /widget/HTMLConverter
//
// Takes HTML and converts it to plain text but in unicode.
//
NS_IMETHODIMP
nsScriptableUnescapeHTML::ConvertToPlainText(const nsAString & aFromStr,
                                             PRUint32 aFlags,
                                             PRUint32 aWrapCol,
                                             nsAString & aToStr)
{
  // nsIParserUtils entry point: forwards to the nsContentUtils helper.
  // Note the argument order differs: the helper takes the output string
  // second, while this method receives it last.
  const nsresult conversionResult =
    nsContentUtils::ConvertToPlainText(aFromStr, aToStr, aFlags, aWrapCol);
  return conversionResult;
}
NS_IMETHODIMP
nsScriptableUnescapeHTML::Unescape(const nsAString & aFromStr,
nsAString & aToStr)
{
// create the parser to do the conversion.
aToStr.SetLength(0);
nsresult rv;
nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID, &rv);
if (NS_FAILED(rv)) return rv;
// convert it!
nsCOMPtr<nsIContentSink> sink;
sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE);
nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE);
textSink->Initialize(&aToStr, nsIDocumentEncoder::OutputSelectionOnly
| nsIDocumentEncoder::OutputAbsoluteLinks, 0);
parser->SetContentSink(sink);
parser->Parse(aFromStr, 0, NS_LITERAL_CSTRING("text/html"),
true, eDTDMode_fragment);
return NS_OK;
return nsContentUtils::ConvertToPlainText(aFromStr,
aToStr,
nsIDocumentEncoder::OutputSelectionOnly |
nsIDocumentEncoder::OutputAbsoluteLinks,
0);
}
// The feed version of nsContentUtils::CreateContextualFragment It

View File

@ -38,12 +38,15 @@
#define nsScriptableHTMLUnescape_h__
#include "nsIScriptableUnescapeHTML.h"
#include "nsIParserUtils.h"
class nsScriptableUnescapeHTML : public nsIScriptableUnescapeHTML
class nsScriptableUnescapeHTML : public nsIScriptableUnescapeHTML,
public nsIParserUtils
{
public:
NS_DECL_ISUPPORTS
NS_DECL_NSISCRIPTABLEUNESCAPEHTML
NS_DECL_NSIPARSERUTILS
};
#endif // nsScriptableHTMLUnescape_h__

View File

@ -49,15 +49,9 @@
#include "nsITransferable.h" // for mime defs, this is BAD
// HTML convertor stuff
#include "nsIParser.h"
#include "nsIDTD.h"
#include "nsParserCIID.h"
#include "nsIContentSink.h"
#include "nsPrimitiveHelpers.h"
#include "nsIDocumentEncoder.h"
#include "nsIHTMLToTextSink.h"
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
#include "nsContentUtils.h"
nsHTMLFormatConverter::nsHTMLFormatConverter()
{
@ -272,36 +266,13 @@ nsHTMLFormatConverter::Convert(const char *aFromDataFlavor, nsISupports *aFromDa
NS_IMETHODIMP
nsHTMLFormatConverter::ConvertFromHTMLToUnicode(const nsAutoString & aFromStr, nsAutoString & aToStr)
{
// create the parser to do the conversion.
aToStr.SetLength(0);
nsresult rv;
nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID, &rv);
if ( !parser )
return rv;
// convert it!
nsCOMPtr<nsIContentSink> sink;
sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE);
nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE);
// We set OutputNoScriptContent and OutputNoFramesContent unconditionally
// here because |aFromStr| is already filtered based on user preferences.
PRUint32 flags =
return nsContentUtils::ConvertToPlainText(aFromStr,
aToStr,
nsIDocumentEncoder::OutputSelectionOnly |
nsIDocumentEncoder::OutputAbsoluteLinks |
nsIDocumentEncoder::OutputNoScriptContent |
nsIDocumentEncoder::OutputNoFramesContent;
textSink->Initialize(&aToStr, flags, 0);
parser->SetContentSink(sink);
parser->Parse(aFromStr, 0, NS_LITERAL_CSTRING("text/html"), true, eDTDMode_fragment);
return NS_OK;
nsIDocumentEncoder::OutputNoFramesContent,
0);
} // ConvertFromHTMLToUnicode