mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Bug 650784 part 1 - Introduce a new API for converting HTML to plain text. r=smaug.
This commit is contained in:
parent
84b0480002
commit
3aacb86e06
@ -1144,12 +1144,34 @@ public:
|
||||
* @param aSourceBuffer the string to parse as an HTML document
|
||||
* @param aTargetDocument the document object to parse into. Must not have
|
||||
* child nodes.
|
||||
* @param aScriptingEnabledForNoscriptParsing whether <noscript> is parsed
|
||||
* as if scripting was enabled
|
||||
* @return NS_ERROR_DOM_INVALID_STATE_ERR if a re-entrant attempt to parse
|
||||
* fragments is made, NS_ERROR_OUT_OF_MEMORY if aSourceBuffer is too
|
||||
* long and NS_OK otherwise.
|
||||
*/
|
||||
static nsresult ParseDocumentHTML(const nsAString& aSourceBuffer,
|
||||
nsIDocument* aTargetDocument);
|
||||
nsIDocument* aTargetDocument,
|
||||
bool aScriptingEnabledForNoscriptParsing);
|
||||
|
||||
/**
|
||||
* Converts HTML source to plain text by parsing the source and using the
|
||||
* plain text serializer on the resulting tree.
|
||||
*
|
||||
* @param aSourceBuffer the string to parse as an HTML document
|
||||
* @param aResultBuffer the string where the plain text result appears;
|
||||
* may be the same string as aSourceBuffer
|
||||
* @param aFlags Flags from nsIDocumentEncoder.
|
||||
* @param aWrapCol Number of columns after which to line wrap; 0 for no
|
||||
* auto-wrapping
|
||||
* @return NS_ERROR_DOM_INVALID_STATE_ERR if a re-entrant attempt to parse
|
||||
* fragments is made, NS_ERROR_OUT_OF_MEMORY if aSourceBuffer is too
|
||||
* long and NS_OK otherwise.
|
||||
*/
|
||||
static nsresult ConvertToPlainText(const nsAString& aSourceBuffer,
|
||||
nsAString& aResultBuffer,
|
||||
PRUint32 aFlags,
|
||||
PRUint32 aWrapCol);
|
||||
|
||||
/**
|
||||
* Creates a new XML document, which is marked to be loaded as data.
|
||||
|
@ -3927,7 +3927,8 @@ nsContentUtils::ParseFragmentHTML(const nsAString& aSourceBuffer,
|
||||
/* static */
|
||||
nsresult
|
||||
nsContentUtils::ParseDocumentHTML(const nsAString& aSourceBuffer,
|
||||
nsIDocument* aTargetDocument)
|
||||
nsIDocument* aTargetDocument,
|
||||
bool aScriptingEnabledForNoscriptParsing)
|
||||
{
|
||||
if (nsContentUtils::sFragmentParsingActive) {
|
||||
NS_NOTREACHED("Re-entrant fragment parsing attempted.");
|
||||
@ -3941,7 +3942,8 @@ nsContentUtils::ParseDocumentHTML(const nsAString& aSourceBuffer,
|
||||
}
|
||||
nsresult rv =
|
||||
sHTMLFragmentParser->ParseDocument(aSourceBuffer,
|
||||
aTargetDocument);
|
||||
aTargetDocument,
|
||||
aScriptingEnabledForNoscriptParsing);
|
||||
return rv;
|
||||
}
|
||||
|
||||
@ -3992,6 +3994,44 @@ nsContentUtils::ParseFragmentXML(const nsAString& aSourceBuffer,
|
||||
return rv;
|
||||
}
|
||||
|
||||
/* static */
|
||||
nsresult
|
||||
nsContentUtils::ConvertToPlainText(const nsAString& aSourceBuffer,
|
||||
nsAString& aResultBuffer,
|
||||
PRUint32 aFlags,
|
||||
PRUint32 aWrapCol)
|
||||
{
|
||||
nsCOMPtr<nsIURI> uri;
|
||||
NS_NewURI(getter_AddRefs(uri), "about:blank");
|
||||
nsCOMPtr<nsIPrincipal> principal =
|
||||
do_CreateInstance("@mozilla.org/nullprincipal;1");
|
||||
nsCOMPtr<nsIDOMDocument> domDocument;
|
||||
nsresult rv = nsContentUtils::CreateDocument(EmptyString(),
|
||||
EmptyString(),
|
||||
nsnull,
|
||||
uri,
|
||||
uri,
|
||||
principal,
|
||||
nsnull,
|
||||
DocumentFlavorHTML,
|
||||
getter_AddRefs(domDocument));
|
||||
NS_ENSURE_SUCCESS(rv, rv);
|
||||
|
||||
nsCOMPtr<nsIDocument> document = do_QueryInterface(domDocument);
|
||||
rv = nsContentUtils::ParseDocumentHTML(aSourceBuffer, document,
|
||||
!(aFlags & nsIDocumentEncoder::OutputNoScriptContent));
|
||||
NS_ENSURE_SUCCESS(rv, rv);
|
||||
|
||||
nsCOMPtr<nsIDocumentEncoder> encoder = do_CreateInstance(
|
||||
"@mozilla.org/layout/documentEncoder;1?type=text/plain");
|
||||
|
||||
rv = encoder->Init(domDocument, NS_LITERAL_STRING("text/plain"), aFlags);
|
||||
NS_ENSURE_SUCCESS(rv, rv);
|
||||
|
||||
encoder->SetWrapColumn(aWrapCol);
|
||||
|
||||
return encoder->EncodeToString(aResultBuffer);
|
||||
}
|
||||
|
||||
/* static */
|
||||
nsresult
|
||||
|
@ -102,7 +102,7 @@ nsDOMParser::ParseFromString(const PRUnichar *str,
|
||||
NS_ENSURE_SUCCESS(rv, rv);
|
||||
nsCOMPtr<nsIDocument> document = do_QueryInterface(domDocument);
|
||||
nsDependentString sourceBuffer(str);
|
||||
rv = nsContentUtils::ParseDocumentHTML(sourceBuffer, document);
|
||||
rv = nsContentUtils::ParseDocumentHTML(sourceBuffer, document, false);
|
||||
NS_ENSURE_SUCCESS(rv, rv);
|
||||
|
||||
// Keep the XULXBL state, base URL and principal setting in sync with the
|
||||
|
@ -37,37 +37,19 @@
|
||||
|
||||
#include "TestHarness.h"
|
||||
|
||||
#include "nsIParser.h"
|
||||
#include "nsIHTMLToTextSink.h"
|
||||
#include "nsIParser.h"
|
||||
#include "nsIContentSink.h"
|
||||
#include "nsIParserService.h"
|
||||
#include "nsServiceManagerUtils.h"
|
||||
#include "nsStringGlue.h"
|
||||
#include "nsParserCIID.h"
|
||||
#include "nsIDocumentEncoder.h"
|
||||
#include "nsCRT.h"
|
||||
|
||||
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
|
||||
#include "nsIParserUtils.h"
|
||||
#include "nsToolkitCompsCID.h"
|
||||
|
||||
void
|
||||
ConvertBufToPlainText(nsString &aConBuf, int aFlag)
|
||||
{
|
||||
nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID);
|
||||
if (parser) {
|
||||
nsCOMPtr<nsIContentSink> sink;
|
||||
sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
|
||||
if (sink) {
|
||||
nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
|
||||
if (textSink) {
|
||||
nsAutoString convertedText;
|
||||
textSink->Initialize(&convertedText, aFlag, 72);
|
||||
parser->SetContentSink(sink);
|
||||
parser->Parse(aConBuf, 0, NS_LITERAL_CSTRING("text/html"), true);
|
||||
aConBuf = convertedText;
|
||||
}
|
||||
}
|
||||
}
|
||||
nsCOMPtr<nsIParserUtils> utils =
|
||||
do_GetService(NS_PARSERUTILS_CONTRACTID);
|
||||
utils->ConvertToPlainText(aConBuf, aFlag, 72, aConBuf);
|
||||
}
|
||||
|
||||
// Test for ASCII with format=flowed; delsp=yes
|
||||
|
@ -45,6 +45,9 @@ MODULE = html5
|
||||
LIBRARY_NAME = html5p_s
|
||||
LIBXUL_LIBRARY = 1
|
||||
|
||||
XPIDLSRCS = \
|
||||
nsIParserUtils.idl \
|
||||
$(NULL)
|
||||
|
||||
EXPORTS = \
|
||||
jArray.h \
|
||||
|
@ -102,7 +102,8 @@ nsHtml5StringParser::ParseFragment(const nsAString& aSourceBuffer,
|
||||
|
||||
nsresult
|
||||
nsHtml5StringParser::ParseDocument(const nsAString& aSourceBuffer,
|
||||
nsIDocument* aTargetDoc)
|
||||
nsIDocument* aTargetDoc,
|
||||
bool aScriptingEnabledForNoscriptParsing)
|
||||
{
|
||||
MOZ_ASSERT(!aTargetDoc->GetFirstChild());
|
||||
|
||||
@ -116,7 +117,7 @@ nsHtml5StringParser::ParseDocument(const nsAString& aSourceBuffer,
|
||||
|
||||
mExecutor->PreventScriptExecution();
|
||||
|
||||
Tokenize(aSourceBuffer, aTargetDoc, false);
|
||||
Tokenize(aSourceBuffer, aTargetDoc, aScriptingEnabledForNoscriptParsing);
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
|
@ -86,7 +86,8 @@ class nsHtml5StringParser : public nsParserBase
|
||||
*
|
||||
*/
|
||||
nsresult ParseDocument(const nsAString& aSourceBuffer,
|
||||
nsIDocument* aTargetDoc);
|
||||
nsIDocument* aTargetDoc,
|
||||
bool aScriptingEnabledForNoscriptParsing);
|
||||
|
||||
private:
|
||||
|
||||
|
25
parser/html/nsIParserUtils.idl
Normal file
25
parser/html/nsIParserUtils.idl
Normal file
@ -0,0 +1,25 @@
|
||||
/* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "nsISupports.idl"
|
||||
|
||||
/**
|
||||
* Exposes non-Web HTML parser functionality to Firefox extensions and XULRunner apps.
|
||||
* Don't use this from within Gecko--use nsContentUtils directly instead.
|
||||
*/
|
||||
[scriptable, uuid(290f49bb-0619-4bda-8006-ab31bec7231a)]
|
||||
interface nsIParserUtils : nsISupports
|
||||
{
|
||||
/**
|
||||
* Convert HTML to plain text.
|
||||
*
|
||||
* @param src the HTML source to parse (C++ callers are allowed but not
|
||||
* required to use the same string for the return value.)
|
||||
* @param flags conversion option flags defined in nsIDocumentEncoder
|
||||
* @param wrapCol number of characters per line; 0 for no auto-wrapping
|
||||
*/
|
||||
AString convertToPlainText(in AString src,
|
||||
in unsigned long flags,
|
||||
in unsigned long wrapCol);
|
||||
};
|
@ -96,6 +96,9 @@
|
||||
#define NS_URLCLASSIFIERHASHCOMPLETER_CONTRACTID \
|
||||
"@mozilla.org/url-classifier/hashcompleter;1"
|
||||
|
||||
#define NS_PARSERUTILS_CONTRACTID \
|
||||
"@mozilla.org/parserutils;1"
|
||||
|
||||
#define NS_SCRIPTABLEUNESCAPEHTML_CONTRACTID "@mozilla.org/feed-unescapehtml;1"
|
||||
|
||||
#define NS_NAVHISTORYSERVICE_CONTRACTID \
|
||||
@ -179,6 +182,9 @@
|
||||
#define NS_URLCLASSIFIERUTILS_CID \
|
||||
{ 0xb7b2ccec, 0x7912, 0x4ea6, { 0xa5, 0x48, 0xb0, 0x38, 0x44, 0x70, 0x04, 0xbd} }
|
||||
|
||||
#define NS_PARSERUTILS_CID \
|
||||
{ 0xaf7b24cb, 0x893f, 0x41bb, { 0x96, 0x1f, 0x5a, 0x69, 0x38, 0x8e, 0x27, 0xc3 } }
|
||||
|
||||
// {10f2f5f0-f103-4901-980f-ba11bd70d60d}
|
||||
#define NS_SCRIPTABLEUNESCAPEHTML_CID \
|
||||
{ 0x10f2f5f0, 0xf103, 0x4901, { 0x98, 0x0f, 0xba, 0x11, 0xbd, 0x70, 0xd6, 0x0d} }
|
||||
|
@ -140,6 +140,7 @@ NS_DEFINE_NAMED_CID(NS_URLCLASSIFIERSTREAMUPDATER_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_URLCLASSIFIERUTILS_CID);
|
||||
#endif
|
||||
#ifdef MOZ_FEEDS
|
||||
NS_DEFINE_NAMED_CID(NS_PARSERUTILS_CID);
|
||||
NS_DEFINE_NAMED_CID(NS_SCRIPTABLEUNESCAPEHTML_CID);
|
||||
#endif
|
||||
NS_DEFINE_NAMED_CID(NS_BROWSERSTATUSFILTER_CID);
|
||||
@ -165,6 +166,7 @@ static const mozilla::Module::CIDEntry kToolkitCIDs[] = {
|
||||
{ &kNS_URLCLASSIFIERUTILS_CID, false, NULL, nsUrlClassifierUtilsConstructor },
|
||||
#endif
|
||||
#ifdef MOZ_FEEDS
|
||||
{ &kNS_PARSERUTILS_CID, false, NULL, nsScriptableUnescapeHTMLConstructor },
|
||||
{ &kNS_SCRIPTABLEUNESCAPEHTML_CID, false, NULL, nsScriptableUnescapeHTMLConstructor },
|
||||
#endif
|
||||
{ &kNS_BROWSERSTATUSFILTER_CID, false, NULL, nsBrowserStatusFilterConstructor },
|
||||
@ -193,6 +195,7 @@ static const mozilla::Module::ContractIDEntry kToolkitContracts[] = {
|
||||
{ NS_URLCLASSIFIERUTILS_CONTRACTID, &kNS_URLCLASSIFIERUTILS_CID },
|
||||
#endif
|
||||
#ifdef MOZ_FEEDS
|
||||
{ NS_PARSERUTILS_CONTRACTID, &kNS_PARSERUTILS_CID },
|
||||
{ NS_SCRIPTABLEUNESCAPEHTML_CONTRACTID, &kNS_SCRIPTABLEUNESCAPEHTML_CID },
|
||||
#endif
|
||||
{ NS_BROWSERSTATUSFILTER_CONTRACTID, &kNS_BROWSERSTATUSFILTER_CID },
|
||||
|
@ -41,20 +41,31 @@ interface nsIDOMDocumentFragment;
|
||||
interface nsIURI;
|
||||
|
||||
/**
|
||||
* A utility class that unescapes HTML strings.
|
||||
* A utility class for HTML parsing in the feed processor.
|
||||
*/
|
||||
[scriptable, uuid(3ab244a9-f09d-44da-9e3f-ee4d67367f2d)]
|
||||
interface nsIScriptableUnescapeHTML : nsISupports
|
||||
{
|
||||
/**
|
||||
* Converts all entities to Unicode.
|
||||
* Converts HTML to plain text. This is equivalent to calling
|
||||
* nsIParserUtils::convertToPlainText(src,
|
||||
* nsIDocumentEncoder::OutputSelectionOnly |
|
||||
* nsIDocumentEncoder::OutputAbsoluteLinks, 0).
|
||||
*
|
||||
* @param src The HTML string to escape.
|
||||
* You should most likely call nsIParserUtils::convertToPlainText()
|
||||
* instead of calling this method.
|
||||
*
|
||||
* @param src The HTML string to convert to plain text.
|
||||
*/
|
||||
AString unescape(in AString src);
|
||||
|
||||
/**
|
||||
* Appends the text to the element.
|
||||
* Parses markup into a sanitized document fragment.
|
||||
*
|
||||
* @param fragment the input markup
|
||||
* @param isXML true if |fragment| is XML and false if HTML
|
||||
* @param baseURI the base URL for this fragment
|
||||
* @param element the context node for the fragment parsing algorithm
|
||||
*/
|
||||
nsIDOMDocumentFragment parseFragment(in AString fragment,
|
||||
in boolean isXML,
|
||||
|
@ -49,7 +49,6 @@
|
||||
#include "nsParserCIID.h"
|
||||
#include "nsContentUtils.h"
|
||||
#include "nsIContentSink.h"
|
||||
#include "nsIHTMLToTextSink.h"
|
||||
#include "nsIDocumentEncoder.h"
|
||||
#include "nsIDOMDocumentFragment.h"
|
||||
#include "nsIFragmentContentSink.h"
|
||||
@ -70,42 +69,35 @@
|
||||
|
||||
#define XHTML_DIV_TAG "div xmlns=\"http://www.w3.org/1999/xhtml\""
|
||||
|
||||
NS_IMPL_ISUPPORTS1(nsScriptableUnescapeHTML, nsIScriptableUnescapeHTML)
|
||||
NS_IMPL_ISUPPORTS2(nsScriptableUnescapeHTML,
|
||||
nsIScriptableUnescapeHTML,
|
||||
nsIParserUtils)
|
||||
|
||||
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
|
||||
|
||||
// From /widget/HTMLConverter
|
||||
//
|
||||
// Takes HTML and converts it to plain text but in unicode.
|
||||
//
|
||||
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsScriptableUnescapeHTML::Unescape(const nsAString & aFromStr,
|
||||
nsScriptableUnescapeHTML::ConvertToPlainText(const nsAString & aFromStr,
|
||||
PRUint32 aFlags,
|
||||
PRUint32 aWrapCol,
|
||||
nsAString & aToStr)
|
||||
{
|
||||
return nsContentUtils::ConvertToPlainText(aFromStr,
|
||||
aToStr,
|
||||
aFlags,
|
||||
aWrapCol);
|
||||
}
|
||||
|
||||
NS_IMETHODIMP
|
||||
nsScriptableUnescapeHTML::Unescape(const nsAString & aFromStr,
|
||||
nsAString & aToStr)
|
||||
{
|
||||
// create the parser to do the conversion.
|
||||
aToStr.SetLength(0);
|
||||
nsresult rv;
|
||||
nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID, &rv);
|
||||
if (NS_FAILED(rv)) return rv;
|
||||
|
||||
// convert it!
|
||||
nsCOMPtr<nsIContentSink> sink;
|
||||
|
||||
sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
|
||||
NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE);
|
||||
|
||||
nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
|
||||
NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE);
|
||||
|
||||
textSink->Initialize(&aToStr, nsIDocumentEncoder::OutputSelectionOnly
|
||||
| nsIDocumentEncoder::OutputAbsoluteLinks, 0);
|
||||
|
||||
parser->SetContentSink(sink);
|
||||
|
||||
parser->Parse(aFromStr, 0, NS_LITERAL_CSTRING("text/html"),
|
||||
true, eDTDMode_fragment);
|
||||
|
||||
return NS_OK;
|
||||
return nsContentUtils::ConvertToPlainText(aFromStr,
|
||||
aToStr,
|
||||
nsIDocumentEncoder::OutputSelectionOnly |
|
||||
nsIDocumentEncoder::OutputAbsoluteLinks,
|
||||
0);
|
||||
}
|
||||
|
||||
// The feed version of nsContentUtils::CreateContextualFragment It
|
||||
|
@ -38,12 +38,15 @@
|
||||
#define nsScriptableHTMLUnescape_h__
|
||||
|
||||
#include "nsIScriptableUnescapeHTML.h"
|
||||
#include "nsIParserUtils.h"
|
||||
|
||||
class nsScriptableUnescapeHTML : public nsIScriptableUnescapeHTML
|
||||
class nsScriptableUnescapeHTML : public nsIScriptableUnescapeHTML,
|
||||
public nsIParserUtils
|
||||
{
|
||||
public:
|
||||
NS_DECL_ISUPPORTS
|
||||
NS_DECL_NSISCRIPTABLEUNESCAPEHTML
|
||||
NS_DECL_NSIPARSERUTILS
|
||||
};
|
||||
|
||||
#endif // nsScriptableHTMLUnescape_h__
|
||||
|
@ -49,15 +49,9 @@
|
||||
#include "nsITransferable.h" // for mime defs, this is BAD
|
||||
|
||||
// HTML convertor stuff
|
||||
#include "nsIParser.h"
|
||||
#include "nsIDTD.h"
|
||||
#include "nsParserCIID.h"
|
||||
#include "nsIContentSink.h"
|
||||
#include "nsPrimitiveHelpers.h"
|
||||
#include "nsIDocumentEncoder.h"
|
||||
#include "nsIHTMLToTextSink.h"
|
||||
|
||||
static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
|
||||
#include "nsContentUtils.h"
|
||||
|
||||
nsHTMLFormatConverter::nsHTMLFormatConverter()
|
||||
{
|
||||
@ -272,36 +266,13 @@ nsHTMLFormatConverter::Convert(const char *aFromDataFlavor, nsISupports *aFromDa
|
||||
NS_IMETHODIMP
|
||||
nsHTMLFormatConverter::ConvertFromHTMLToUnicode(const nsAutoString & aFromStr, nsAutoString & aToStr)
|
||||
{
|
||||
// create the parser to do the conversion.
|
||||
aToStr.SetLength(0);
|
||||
nsresult rv;
|
||||
nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID, &rv);
|
||||
if ( !parser )
|
||||
return rv;
|
||||
|
||||
// convert it!
|
||||
nsCOMPtr<nsIContentSink> sink;
|
||||
|
||||
sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
|
||||
NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE);
|
||||
|
||||
nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
|
||||
NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE);
|
||||
|
||||
// We set OutputNoScriptContent and OutputNoFramesContent unconditionally
|
||||
// here because |aFromStr| is already filtered based on user preferences.
|
||||
PRUint32 flags =
|
||||
return nsContentUtils::ConvertToPlainText(aFromStr,
|
||||
aToStr,
|
||||
nsIDocumentEncoder::OutputSelectionOnly |
|
||||
nsIDocumentEncoder::OutputAbsoluteLinks |
|
||||
nsIDocumentEncoder::OutputNoScriptContent |
|
||||
nsIDocumentEncoder::OutputNoFramesContent;
|
||||
textSink->Initialize(&aToStr, flags, 0);
|
||||
|
||||
parser->SetContentSink(sink);
|
||||
|
||||
parser->Parse(aFromStr, 0, NS_LITERAL_CSTRING("text/html"), true, eDTDMode_fragment);
|
||||
|
||||
return NS_OK;
|
||||
nsIDocumentEncoder::OutputNoFramesContent,
|
||||
0);
|
||||
} // ConvertFromHTMLToUnicode
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user