Bug 650784 part 1 - Introduce a new API for converting HTML to plain text. r=smaug.

2024-09-13 09:24:08 -07:00 · 2012-02-27 13:57:48 +02:00 · 2012-02-27 13:57:48 +02:00 · 3aacb86e06
commit 3aacb86e06
parent 84b0480002
14 changed files with 160 additions and 100 deletions
--- a/content/base/public/nsContentUtils.h
+++ b/content/base/public/nsContentUtils.h
@ -1144,12 +1144,34 @@ public:
   * @param aSourceBuffer the string to parse as an HTML document
   * @param aTargetDocument the document object to parse into. Must not have
   *                        child nodes.
+   * @param aScriptingEnabledForNoscriptParsing whether <noscript> is parsed
+   *                                            as if scripting was enabled
   * @return NS_ERROR_DOM_INVALID_STATE_ERR if a re-entrant attempt to parse
   *         fragments is made, NS_ERROR_OUT_OF_MEMORY if aSourceBuffer is too
   *         long and NS_OK otherwise.
   */
  static nsresult ParseDocumentHTML(const nsAString& aSourceBuffer,
-                                    nsIDocument* aTargetDocument);
+                                    nsIDocument* aTargetDocument,
+                                    bool aScriptingEnabledForNoscriptParsing);
+
+  /**
+   * Converts HTML source to plain text by parsing the source and using the
+   * plain text serializer on the resulting tree.
+   *
+   * @param aSourceBuffer the string to parse as an HTML document
+   * @param aResultBuffer the string where the plain text result appears;
+   *                      may be the same string as aSourceBuffer
+   * @param aFlags Flags from nsIDocumentEncoder.
+   * @param aWrapCol Number of columns after which to line wrap; 0 for no
+   *                 auto-wrapping
+   * @return NS_ERROR_DOM_INVALID_STATE_ERR if a re-entrant attempt to parse
+   *         fragments is made, NS_ERROR_OUT_OF_MEMORY if aSourceBuffer is too
+   *         long, the failure code of any other failing step, else NS_OK.
+   */
+  static nsresult ConvertToPlainText(const nsAString& aSourceBuffer,
+                                     nsAString& aResultBuffer,
+                                     PRUint32 aFlags,
+                                     PRUint32 aWrapCol);

  /**
   * Creates a new XML document, which is marked to be loaded as data.
--- a/content/base/src/nsContentUtils.cpp
+++ b/content/base/src/nsContentUtils.cpp
@ -3927,7 +3927,8 @@ nsContentUtils::ParseFragmentHTML(const nsAString& aSourceBuffer,
 /* static */
 nsresult
 nsContentUtils::ParseDocumentHTML(const nsAString& aSourceBuffer,
-                                  nsIDocument* aTargetDocument)
+                                  nsIDocument* aTargetDocument,
+                                  bool aScriptingEnabledForNoscriptParsing)
 {
  if (nsContentUtils::sFragmentParsingActive) {
    NS_NOTREACHED("Re-entrant fragment parsing attempted.");
@ -3941,7 +3942,8 @@ nsContentUtils::ParseDocumentHTML(const nsAString& aSourceBuffer,
  }
  nsresult rv =
    sHTMLFragmentParser->ParseDocument(aSourceBuffer,
-                                       aTargetDocument);
+                                       aTargetDocument,
+                                       aScriptingEnabledForNoscriptParsing);
  return rv;
 }

@ -3992,6 +3994,44 @@ nsContentUtils::ParseFragmentXML(const nsAString& aSourceBuffer,
  return rv;
 }

+/* static */
+nsresult
+nsContentUtils::ConvertToPlainText(const nsAString& aSourceBuffer,
+                                   nsAString& aResultBuffer,
+                                   PRUint32 aFlags,
+                                   PRUint32 aWrapCol)
+{
+  // Parse into a local about:blank document that has a null principal.
+  nsCOMPtr<nsIURI> uri;
+  nsresult rv = NS_NewURI(getter_AddRefs(uri), "about:blank");
+  NS_ENSURE_SUCCESS(rv, rv);
+  nsCOMPtr<nsIPrincipal> principal =
+    do_CreateInstance("@mozilla.org/nullprincipal;1", &rv);
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  nsCOMPtr<nsIDOMDocument> domDocument;
+  rv = nsContentUtils::CreateDocument(EmptyString(), EmptyString(), nsnull,
+                                      uri, uri, principal, nsnull,
+                                      DocumentFlavorHTML,
+                                      getter_AddRefs(domDocument));
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  nsCOMPtr<nsIDocument> document = do_QueryInterface(domDocument);
+  NS_ENSURE_TRUE(document, NS_ERROR_FAILURE);
+  // With OutputNoScriptContent, <noscript> is parsed as if scripting is off.
+  rv = nsContentUtils::ParseDocumentHTML(aSourceBuffer, document,
+    !(aFlags & nsIDocumentEncoder::OutputNoScriptContent));
+  NS_ENSURE_SUCCESS(rv, rv);
+
+  // Fail cleanly instead of dereferencing null if the encoder is missing.
+  nsCOMPtr<nsIDocumentEncoder> encoder = do_CreateInstance(
+    "@mozilla.org/layout/documentEncoder;1?type=text/plain", &rv);
+  NS_ENSURE_SUCCESS(rv, rv);
+  rv = encoder->Init(domDocument, NS_LITERAL_STRING("text/plain"), aFlags);
+  NS_ENSURE_SUCCESS(rv, rv);
+  encoder->SetWrapColumn(aWrapCol);
+  return encoder->EncodeToString(aResultBuffer);
+}

 /* static */
 nsresult
--- a/content/base/src/nsDOMParser.cpp
+++ b/content/base/src/nsDOMParser.cpp
@ -102,7 +102,7 @@ nsDOMParser::ParseFromString(const PRUnichar *str,
    NS_ENSURE_SUCCESS(rv, rv);
    nsCOMPtr<nsIDocument> document = do_QueryInterface(domDocument);
    nsDependentString sourceBuffer(str);
-    rv = nsContentUtils::ParseDocumentHTML(sourceBuffer, document);
+    rv = nsContentUtils::ParseDocumentHTML(sourceBuffer, document, false);
    NS_ENSURE_SUCCESS(rv, rv);

    // Keep the XULXBL state, base URL and principal setting in sync with the
--- a/content/base/test/TestPlainTextSerializer.cpp
+++ b/content/base/test/TestPlainTextSerializer.cpp
@ -37,37 +37,19 @@

 #include "TestHarness.h"

-#include "nsIParser.h"
-#include "nsIHTMLToTextSink.h"
-#include "nsIParser.h"
-#include "nsIContentSink.h"
-#include "nsIParserService.h"
 #include "nsServiceManagerUtils.h"
 #include "nsStringGlue.h"
-#include "nsParserCIID.h"
 #include "nsIDocumentEncoder.h"
 #include "nsCRT.h"
-
-static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
+#include "nsIParserUtils.h"
+#include "nsToolkitCompsCID.h"

 void
 ConvertBufToPlainText(nsString &aConBuf, int aFlag)
 {
-  nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID);
-  if (parser) {
-    nsCOMPtr<nsIContentSink> sink;
-    sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
-    if (sink) {
-      nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
-      if (textSink) {
-        nsAutoString convertedText;
-        textSink->Initialize(&convertedText, aFlag, 72);
-        parser->SetContentSink(sink);
-        parser->Parse(aConBuf, 0, NS_LITERAL_CSTRING("text/html"), true);
-        aConBuf = convertedText;
-      }
-    }
-  }
+  nsCOMPtr<nsIParserUtils> utils = do_GetService(NS_PARSERUTILS_CONTRACTID);
+  // If the service is unavailable, leave aConBuf untouched (no null deref).
+  if (utils) utils->ConvertToPlainText(aConBuf, aFlag, 72, aConBuf);
 }

 // Test for ASCII with format=flowed; delsp=yes
--- a/parser/html/Makefile.in
+++ b/parser/html/Makefile.in
@ -45,6 +45,9 @@ MODULE		= html5
 LIBRARY_NAME	= html5p_s
 LIBXUL_LIBRARY	= 1

+XPIDLSRCS = \
+  nsIParserUtils.idl \
+  $(NULL)

 EXPORTS		= \
 		jArray.h \
--- a/parser/html/nsHtml5StringParser.cpp
+++ b/parser/html/nsHtml5StringParser.cpp
@ -102,7 +102,8 @@ nsHtml5StringParser::ParseFragment(const nsAString& aSourceBuffer,

 nsresult
 nsHtml5StringParser::ParseDocument(const nsAString& aSourceBuffer,
-                                   nsIDocument* aTargetDoc)
+                                   nsIDocument* aTargetDoc,
+                                   bool aScriptingEnabledForNoscriptParsing)
 {
  MOZ_ASSERT(!aTargetDoc->GetFirstChild());

@ -116,7 +117,7 @@ nsHtml5StringParser::ParseDocument(const nsAString& aSourceBuffer,

  mExecutor->PreventScriptExecution();

-  Tokenize(aSourceBuffer, aTargetDoc, false);
+  Tokenize(aSourceBuffer, aTargetDoc, aScriptingEnabledForNoscriptParsing);
  return NS_OK;
 }

--- a/parser/html/nsHtml5StringParser.h
+++ b/parser/html/nsHtml5StringParser.h
@ -86,7 +86,8 @@ class nsHtml5StringParser : public nsParserBase
     *
     */
    nsresult ParseDocument(const nsAString& aSourceBuffer,
-                           nsIDocument* aTargetDoc);
+                           nsIDocument* aTargetDoc,
+                           bool aScriptingEnabledForNoscriptParsing);

  private:

--- a/parser/html/nsIParserUtils.idl
+++ b/parser/html/nsIParserUtils.idl
@ -0,0 +1,25 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "nsISupports.idl"
+
+/**
+ * Provides non-Web HTML parser functionality to Firefox extensions and
+ * XULRunner apps. Within Gecko, use nsContentUtils directly instead.
+ */
+[scriptable, uuid(290f49bb-0619-4bda-8006-ab31bec7231a)]
+interface nsIParserUtils : nsISupports
+{
+  /**
+   * Convert HTML to plain text.
+   *
+   * @param src the HTML source to parse (C++ callers are allowed but not
+   *            required to use the same string for the return value.)
+   * @param flags conversion option flags defined in nsIDocumentEncoder
+   * @param wrapCol number of characters per line; 0 for no auto-wrapping
+   */
+  AString convertToPlainText(in AString src,
+                             in unsigned long flags,
+                             in unsigned long wrapCol);
+};
--- a/toolkit/components/build/nsToolkitCompsCID.h
+++ b/toolkit/components/build/nsToolkitCompsCID.h
@ -96,6 +96,9 @@
 #define NS_URLCLASSIFIERHASHCOMPLETER_CONTRACTID \
    "@mozilla.org/url-classifier/hashcompleter;1"

+#define NS_PARSERUTILS_CONTRACTID \
+    "@mozilla.org/parserutils;1"
+
 #define NS_SCRIPTABLEUNESCAPEHTML_CONTRACTID "@mozilla.org/feed-unescapehtml;1"

 #define NS_NAVHISTORYSERVICE_CONTRACTID \
@ -179,6 +182,9 @@
 #define NS_URLCLASSIFIERUTILS_CID \
 { 0xb7b2ccec, 0x7912, 0x4ea6, { 0xa5, 0x48, 0xb0, 0x38, 0x44, 0x70, 0x04, 0xbd} }

+#define NS_PARSERUTILS_CID  \
+{ 0xaf7b24cb, 0x893f, 0x41bb, { 0x96, 0x1f, 0x5a, 0x69, 0x38, 0x8e, 0x27, 0xc3 } }
+
 // {10f2f5f0-f103-4901-980f-ba11bd70d60d}
 #define NS_SCRIPTABLEUNESCAPEHTML_CID  \
 { 0x10f2f5f0, 0xf103, 0x4901, { 0x98, 0x0f, 0xba, 0x11, 0xbd, 0x70, 0xd6, 0x0d} }
--- a/toolkit/components/build/nsToolkitCompsModule.cpp
+++ b/toolkit/components/build/nsToolkitCompsModule.cpp
@ -140,6 +140,7 @@ NS_DEFINE_NAMED_CID(NS_URLCLASSIFIERSTREAMUPDATER_CID);
 NS_DEFINE_NAMED_CID(NS_URLCLASSIFIERUTILS_CID);
 #endif
 #ifdef MOZ_FEEDS
+NS_DEFINE_NAMED_CID(NS_PARSERUTILS_CID);
 NS_DEFINE_NAMED_CID(NS_SCRIPTABLEUNESCAPEHTML_CID);
 #endif
 NS_DEFINE_NAMED_CID(NS_BROWSERSTATUSFILTER_CID);
@ -165,6 +166,7 @@ static const mozilla::Module::CIDEntry kToolkitCIDs[] = {
  { &kNS_URLCLASSIFIERUTILS_CID, false, NULL, nsUrlClassifierUtilsConstructor },
 #endif
 #ifdef MOZ_FEEDS
+  { &kNS_PARSERUTILS_CID, false, NULL, nsScriptableUnescapeHTMLConstructor },
  { &kNS_SCRIPTABLEUNESCAPEHTML_CID, false, NULL, nsScriptableUnescapeHTMLConstructor },
 #endif
  { &kNS_BROWSERSTATUSFILTER_CID, false, NULL, nsBrowserStatusFilterConstructor },
@ -193,6 +195,7 @@ static const mozilla::Module::ContractIDEntry kToolkitContracts[] = {
  { NS_URLCLASSIFIERUTILS_CONTRACTID, &kNS_URLCLASSIFIERUTILS_CID },
 #endif
 #ifdef MOZ_FEEDS
+  { NS_PARSERUTILS_CONTRACTID, &kNS_PARSERUTILS_CID },
  { NS_SCRIPTABLEUNESCAPEHTML_CONTRACTID, &kNS_SCRIPTABLEUNESCAPEHTML_CID },
 #endif
  { NS_BROWSERSTATUSFILTER_CONTRACTID, &kNS_BROWSERSTATUSFILTER_CID },
--- a/toolkit/components/feeds/nsIScriptableUnescapeHTML.idl
+++ b/toolkit/components/feeds/nsIScriptableUnescapeHTML.idl
@ -41,20 +41,31 @@ interface nsIDOMDocumentFragment;
 interface nsIURI;

 /**
- * A utility class that unescapes HTML strings.
+ * A utility class for HTML parsing in the feed processor.
 */
 [scriptable, uuid(3ab244a9-f09d-44da-9e3f-ee4d67367f2d)]
 interface nsIScriptableUnescapeHTML : nsISupports 
 {
  /** 
-   * Converts all entities to Unicode.
+   * Converts HTML to plain text. This is equivalent to calling
+   * nsIParserUtils::convertToPlainText(src, 
+   *   nsIDocumentEncoder::OutputSelectionOnly |
+   *   nsIDocumentEncoder::OutputAbsoluteLinks, 0).
   *
-   * @param src The HTML string to escape.
+   * You should most likely call nsIParserUtils::convertToPlainText()
+   * instead of calling this method.
+   *
+   * @param src The HTML string to convert to plain text.
   */ 
  AString unescape(in AString src);
        
  /**
-   * Appends the text to the element.
+   * Parses markup into a sanitized document fragment.
+   *
+   * @param fragment the input markup
+   * @param isXML true if |fragment| is XML and false if HTML
+   * @param baseURI the base URL for this fragment
+   * @param element the context node for the fragment parsing algorithm
   */
  nsIDOMDocumentFragment parseFragment(in AString fragment,
                                       in boolean isXML,
--- a/toolkit/components/feeds/nsScriptableUnescapeHTML.cpp
+++ b/toolkit/components/feeds/nsScriptableUnescapeHTML.cpp
@ -49,7 +49,6 @@
 #include "nsParserCIID.h"
 #include "nsContentUtils.h"
 #include "nsIContentSink.h"
-#include "nsIHTMLToTextSink.h"
 #include "nsIDocumentEncoder.h"
 #include "nsIDOMDocumentFragment.h"
 #include "nsIFragmentContentSink.h"
@ -70,42 +69,35 @@

 #define XHTML_DIV_TAG "div xmlns=\"http://www.w3.org/1999/xhtml\""

-NS_IMPL_ISUPPORTS1(nsScriptableUnescapeHTML, nsIScriptableUnescapeHTML)
+NS_IMPL_ISUPPORTS2(nsScriptableUnescapeHTML,
+                   nsIScriptableUnescapeHTML,
+                   nsIParserUtils)

 static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);

-// From /widget/HTMLConverter
-//
-// Takes HTML and converts it to plain text but in unicode.
-//
+
+
+NS_IMETHODIMP
+nsScriptableUnescapeHTML::ConvertToPlainText(const nsAString & aFromStr,
+                                             PRUint32 aFlags,
+                                             PRUint32 aWrapCol,
+                                             nsAString & aToStr)
+{
+  return nsContentUtils::ConvertToPlainText(aFromStr,
+    aToStr,
+    aFlags,
+    aWrapCol);
+}
+
 NS_IMETHODIMP
 nsScriptableUnescapeHTML::Unescape(const nsAString & aFromStr,
                                   nsAString & aToStr)
 {
-  // create the parser to do the conversion.
-  aToStr.SetLength(0);
-  nsresult rv;
-  nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID, &rv);
-  if (NS_FAILED(rv)) return rv;
-
-  // convert it!
-  nsCOMPtr<nsIContentSink> sink;
-
-  sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
-  NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE);
-
-  nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
-  NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE);
-
-  textSink->Initialize(&aToStr, nsIDocumentEncoder::OutputSelectionOnly
-                       | nsIDocumentEncoder::OutputAbsoluteLinks, 0);
-
-  parser->SetContentSink(sink);
-
-  parser->Parse(aFromStr, 0, NS_LITERAL_CSTRING("text/html"),
-                true, eDTDMode_fragment);
-
-  return NS_OK;
+  return nsContentUtils::ConvertToPlainText(aFromStr,
+    aToStr,
+    nsIDocumentEncoder::OutputSelectionOnly |
+    nsIDocumentEncoder::OutputAbsoluteLinks,
+    0);
 }

 // The feed version of nsContentUtils::CreateContextualFragment It
--- a/toolkit/components/feeds/nsScriptableUnescapeHTML.h
+++ b/toolkit/components/feeds/nsScriptableUnescapeHTML.h
@ -38,12 +38,15 @@
 #define nsScriptableHTMLUnescape_h__

 #include "nsIScriptableUnescapeHTML.h"
+#include "nsIParserUtils.h"

-class nsScriptableUnescapeHTML : public nsIScriptableUnescapeHTML
+class nsScriptableUnescapeHTML : public nsIScriptableUnescapeHTML,
+                                 public nsIParserUtils
 {
 public:
  NS_DECL_ISUPPORTS
  NS_DECL_NSISCRIPTABLEUNESCAPEHTML
+  NS_DECL_NSIPARSERUTILS
 };

 #endif // nsScriptableHTMLUnescape_h__
--- a/widget/xpwidgets/nsHTMLFormatConverter.cpp
+++ b/widget/xpwidgets/nsHTMLFormatConverter.cpp
@ -49,15 +49,9 @@
 #include "nsITransferable.h" // for mime defs, this is BAD

 // HTML convertor stuff
-#include "nsIParser.h"
-#include "nsIDTD.h"
-#include "nsParserCIID.h"
-#include "nsIContentSink.h"
 #include "nsPrimitiveHelpers.h"
 #include "nsIDocumentEncoder.h"
-#include "nsIHTMLToTextSink.h"
-
-static NS_DEFINE_CID(kCParserCID, NS_PARSER_CID);
+#include "nsContentUtils.h"

 nsHTMLFormatConverter::nsHTMLFormatConverter()
 {
@ -272,36 +266,13 @@ nsHTMLFormatConverter::Convert(const char *aFromDataFlavor, nsISupports *aFromDa
 NS_IMETHODIMP
 nsHTMLFormatConverter::ConvertFromHTMLToUnicode(const nsAutoString & aFromStr, nsAutoString & aToStr)
 {
-  // create the parser to do the conversion.
-  aToStr.SetLength(0);
-  nsresult rv;
-  nsCOMPtr<nsIParser> parser = do_CreateInstance(kCParserCID, &rv);
-  if ( !parser )
-    return rv;
-
-  // convert it!
-  nsCOMPtr<nsIContentSink> sink;
-
-  sink = do_CreateInstance(NS_PLAINTEXTSINK_CONTRACTID);
-  NS_ENSURE_TRUE(sink, NS_ERROR_FAILURE);
-
-  nsCOMPtr<nsIHTMLToTextSink> textSink(do_QueryInterface(sink));
-  NS_ENSURE_TRUE(textSink, NS_ERROR_FAILURE);
-
-  // We set OutputNoScriptContent and OutputNoFramesContent unconditionally
-  // here because |aFromStr| is already filtered based on user preferences.
-  PRUint32 flags =
+  return nsContentUtils::ConvertToPlainText(aFromStr,
+    aToStr,
    nsIDocumentEncoder::OutputSelectionOnly |
    nsIDocumentEncoder::OutputAbsoluteLinks |
    nsIDocumentEncoder::OutputNoScriptContent |
-    nsIDocumentEncoder::OutputNoFramesContent;
-  textSink->Initialize(&aToStr, flags, 0);
-
-  parser->SetContentSink(sink);
-
-  parser->Parse(aFromStr, 0, NS_LITERAL_CSTRING("text/html"), true, eDTDMode_fragment);
-  
-  return NS_OK;
+    nsIDocumentEncoder::OutputNoFramesContent,
+    0);
 } // ConvertFromHTMLToUnicode