gecko/content/base/src/nsPlainTextSerializer.cpp

2076 lines
63 KiB
C++
Raw Normal View History

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 1998
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Daniel Bratell <bratell@lysator.liu.se>
* Ben Bucksch <mozilla@bucksch.org>
*
* Alternatively, the contents of this file may be used under the terms of
* either of the GNU General Public License Version 2 or later (the "GPL"),
* or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* nsIContentSerializer implementation that can be used with an
* nsIDocumentEncoder to convert a DOM into plaintext in a nice way
* (eg for copy/paste as plaintext).
*/
#include "nsPlainTextSerializer.h"
#include "nsLWBrkCIID.h"
#include "nsIServiceManager.h"
#include "nsGkAtoms.h"
#include "nsIDOMText.h"
#include "nsIDOMCDATASection.h"
#include "nsIDOMElement.h"
#include "nsINameSpaceManager.h"
#include "nsTextFragment.h"
#include "nsContentUtils.h"
#include "nsReadableUtils.h"
#include "nsUnicharUtils.h"
#include "nsCRT.h"
#include "nsIParserService.h"
// Preference names read by Init() when formatted output is requested.
#define PREF_STRUCTS "converter.html2txt.structs"
#define PREF_HEADER_STRATEGY "converter.html2txt.header_strategy"
// Base indentation unit, in columns (used for <dd> and non-cite <blockquote>).
static const PRInt32 kTabSize=4;
// Column budget for the number of an ordered-list item; feeds kIndentSizeList.
static const PRInt32 kOLNumberWidth = 3;
static const PRInt32 kIndentSizeHeaders = 2; /* Indentation of h1, if
mHeaderStrategy = 1 or = 2.
Indentation of other headers
is derived from that.
XXX center h1? */
static const PRInt32 kIndentIncrementHeaders = 2; /* If mHeaderStrategy = 1,
indent h(x+1) this many
columns more than h(x) */
static const PRInt32 kIndentSizeList = (kTabSize > kOLNumberWidth+3) ? kTabSize: kOLNumberWidth+3;
// Indentation of non-first lines of ul and ol
static const PRInt32 kIndentSizeDD = kTabSize; // Indentation of <dd>
// Non-breaking space (U+00A0) and the plain space Output() substitutes for it.
static const PRUnichar kNBSP = 160;
static const PRUnichar kSPACE = ' ';
// File-local helpers; their definitions are not in this part of the file.
static PRInt32 HeaderLevel(eHTMLTags aTag);
static PRInt32 GetUnicharWidth(PRUnichar ucs);
static PRInt32 GetUnicharStringWidth(const PRUnichar* pwcs, PRInt32 n);
// Capacities of the arrays allocated in the constructor for mTagStack and
// mOLStack respectively.
// Someday may want to make this non-const:
static const PRUint32 TagStackSize = 500;
static const PRUint32 OLStackSize = 100;
/**
 * Factory for nsPlainTextSerializer.
 *
 * @param aSerializer  out-param receiving the new object as an
 *                     nsIContentSerializer.
 * @return NS_ERROR_OUT_OF_MEMORY if allocation failed, otherwise the result
 *         of the QueryInterface call.
 */
nsresult NS_NewPlainTextSerializer(nsIContentSerializer** aSerializer)
{
  nsPlainTextSerializer* it = new nsPlainTextSerializer();
  if (!it) {
    return NS_ERROR_OUT_OF_MEMORY;
  }

  // Hold our own reference across the QueryInterface call. Without it the
  // freshly created object would be leaked if QueryInterface failed, because
  // nobody would ever Release() it.
  it->AddRef();
  nsresult rv = CallQueryInterface(it, aSerializer);
  // On success the caller now owns a reference through |*aSerializer|; on
  // failure this drops the last reference and destroys the object.
  it->Release();
  return rv;
}
/**
 * Puts the serializer into its initial state. The values marked "ditto"
 * below are defaults that Init() may replace with preference values.
 */
nsPlainTextSerializer::nsPlainTextSerializer()
  : kSpace(NS_LITERAL_STRING(" ")) // Init of "constant"
{
  // No output target until one of the Append*/Flush/Initialize calls sets it.
  mOutputString = nsnull;
  mStartedOutput = PR_FALSE;

  // Document position / nesting state.
  mInHead = PR_FALSE;
  mSpanLevel = 0;
  mCiteQuoteLevel = 0;
  mHasWrittenCiteBlockquote = PR_FALSE;
  mULCount = 0;

  // Formatting defaults.
  mStructs = PR_TRUE;                            // will be read from prefs later
  mHeaderStrategy = 1 /*indent increasingly*/;   // ditto
  mQuotesPreformatted = PR_FALSE;                // ditto
  mDontWrapAnyQuotes = PR_FALSE;                 // ditto
  mPreFormatted = PR_FALSE;

  // Clear every header counter slot (indices 0 through 6).
  for (PRInt32 level = 6; level >= 0; --level) {
    mHeaderCounter[level] = 0;
  }

  // Line breaker
  mWrapColumn = 72;     // XXX magic number, we expect someone to reset this
  mCurrentLineWidth = 0;
  mIndent = 0;
  mAtFirstColumn = PR_TRUE;

  // Flow
  mEmptyLines = 1;      // The start of the document is an "empty line" in itself,
  mInWhitespace = PR_TRUE;

  // Tag stack, initially empty; nothing is being ignored.
  mTagStack = new nsHTMLTag[TagStackSize];
  mTagStackIndex = 0;
  mIgnoreAboveIndex = (PRUint32)kNotFound;

  // OL stack, where numbers for ordered lists are kept.
  mOLStack = new PRInt32[OLStackSize];
  mOLStackIndex = 0;
}
/**
 * Frees the two arrays allocated by the constructor.
 */
nsPlainTextSerializer::~nsPlainTextSerializer()
{
  // Both were created with new[], so both must go through delete[].
  delete[] mOLStack;
  delete[] mTagStack;
}
// nsISupports implementation for nsPlainTextSerializer, listing the four
// interfaces named below.
NS_IMPL_ISUPPORTS4(nsPlainTextSerializer,
nsIContentSerializer,
nsIContentSink,
nsIHTMLContentSink,
nsIHTMLToTextSink)
/**
 * Configures the serializer for one serialization run.
 *
 * @param aFlags           nsIDocumentEncoder output flags.
 * @param aWrapColumn      column to wrap at; stored in mWrapColumn.
 * @param aCharSet         not used by this method.
 * @param aIsCopying       not used by this method.
 * @param aIsWholeDocument not used by this method.
 * @return NS_ERROR_UNEXPECTED if no parser service is available, else NS_OK.
 */
NS_IMETHODIMP
nsPlainTextSerializer::Init(PRUint32 aFlags, PRUint32 aWrapColumn,
                            const char* aCharSet, PRBool aIsCopying,
                            PRBool aIsWholeDocument)
{
#ifdef DEBUG
  // Check if the major control flags are set correctly.
  if(aFlags & nsIDocumentEncoder::OutputFormatFlowed) {
    NS_ASSERTION(aFlags & nsIDocumentEncoder::OutputFormatted,
                 "If you want format=flowed, you must combine it with "
                 "nsIDocumentEncoder::OutputFormatted");
  }

  if(aFlags & nsIDocumentEncoder::OutputFormatted) {
    NS_ASSERTION(!(aFlags & nsIDocumentEncoder::OutputPreformatted),
                 "Can't do formatted and preformatted output at the same time!");
  }
#endif

  NS_ENSURE_TRUE(nsContentUtils::GetParserService(), NS_ERROR_UNEXPECTED);

  mFlags = aFlags;
  mWrapColumn = aWrapColumn;

  // No line break is pending at the start of a run.
  mLineBreakDue = PR_FALSE;
  mFloatingLines = -1;

  // Only create a linebreaker if we will handle wrapping.
  if (MayWrap()) {
    mLineBreaker = nsContentUtils::LineBreaker();
  }

  // Pick the line break sequence from the CR/LF flags.
  const PRBool wantCR =
    (mFlags & nsIDocumentEncoder::OutputCRLineBreak) != 0;
  const PRBool wantLF =
    (mFlags & nsIDocumentEncoder::OutputLFLineBreak) != 0;

  if (wantCR && wantLF) {
    // Windows
    mLineBreak.AssignLiteral("\r\n");
  }
  else if (wantCR) {
    // Mac
    mLineBreak.Assign(PRUnichar('\r'));
  }
  else if (wantLF) {
    // Unix/DOM
    mLineBreak.Assign(PRUnichar('\n'));
  }
  else {
    // Platform/default
    mLineBreak.AssignLiteral(NS_LINEBREAK);
  }

  if (mFlags & nsIDocumentEncoder::OutputFormatted) {
    // Prefs that control formatted output; the current member values act
    // as the defaults.
    mStructs = nsContentUtils::GetBoolPref(PREF_STRUCTS, mStructs);

    mHeaderStrategy =
      nsContentUtils::GetIntPref(PREF_HEADER_STRATEGY, mHeaderStrategy);

    // The quotesPreformatted pref is a temporary measure. See bug 69638.
    mQuotesPreformatted =
      nsContentUtils::GetBoolPref("editor.quotesPreformatted",
                                  mQuotesPreformatted);

    // DontWrapAnyQuotes is set according to whether plaintext mail
    // is wrapping to window width -- see bug 134439.
    // We'll only want this if we're wrapping and formatted.
    if (mFlags & nsIDocumentEncoder::OutputWrap || mWrapColumn > 0) {
      mDontWrapAnyQuotes =
        nsContentUtils::GetBoolPref("mail.compose.wrap_to_window_width",
                                    mDontWrapAnyQuotes);
    }
  }

  // XXX We should let the caller pass this in.
  const PRBool framesEnabled =
    nsContentUtils::GetBoolPref("browser.frames.enabled");
  if (framesEnabled) {
    mFlags &= ~nsIDocumentEncoder::OutputNoFramesContent;
  }
  else {
    mFlags |= nsIDocumentEncoder::OutputNoFramesContent;
  }

  return NS_OK;
}
/**
 * Reads the top entry of a boolean stack stored in an nsVoidArray.
 *
 * @return the top value, or PR_FALSE when the stack is empty.
 */
PRBool
nsPlainTextSerializer::GetLastBool(const nsVoidArray& aStack)
{
  const PRUint32 count = aStack.Count();
  // Booleans are stored as pointer-sized values; anything that is not the
  // encoded PR_FALSE counts as true.
  return count != 0 &&
         aStack.ElementAt(count - 1) != reinterpret_cast<void*>(PR_FALSE);
}
/**
 * Overwrites the top entry of a boolean stack with aValue.
 * Reports an error (NS_ERROR) and does nothing if the stack is empty.
 */
void
nsPlainTextSerializer::SetLastBool(nsVoidArray& aStack, PRBool aValue)
{
  const PRUint32 count = aStack.Count();
  if (count == 0) {
    NS_ERROR("There is no \"Last\" value");
    return;
  }

  aStack.ReplaceElementAt(reinterpret_cast<void*>(aValue), count - 1);
}
/**
 * Pushes aValue onto a boolean stack stored in an nsVoidArray.
 */
void
nsPlainTextSerializer::PushBool(nsVoidArray& aStack, PRBool aValue)
{
  // The boolean is stored directly in the pointer-sized slot.
  void* const encoded = reinterpret_cast<void*>(aValue);
  aStack.AppendElement(encoded);
}
/**
 * Removes and returns the top entry of a boolean stack.
 *
 * @return the removed value, or PR_FALSE when the stack was already empty.
 */
PRBool
nsPlainTextSerializer::PopBool(nsVoidArray& aStack)
{
  const PRUint32 count = aStack.Count();
  if (count == 0) {
    return PR_FALSE;
  }

  const PRUint32 top = count - 1;
  const PRBool popped =
    (aStack.ElementAt(top) != reinterpret_cast<void*>(PR_FALSE));
  aStack.RemoveElementAt(top);
  return popped;
}
/**
 * Runs Init() with the given flags and wrap column, then directs all
 * output to aOutString.
 *
 * @return the failure code from Init(), or NS_OK.
 */
NS_IMETHODIMP
nsPlainTextSerializer::Initialize(nsAString* aOutString,
                                  PRUint32 aFlags, PRUint32 aWrapCol)
{
  // No charset; not copying; not a whole document.
  nsresult initStatus = Init(aFlags, aWrapCol, nsnull, PR_FALSE, PR_FALSE);
  NS_ENSURE_SUCCESS(initStatus, initStatus);

  // XXX This is wrong. It violates XPCOM string ownership rules.
  // We're only getting away with this because instances of this
  // class are restricted to single function scope.
  mOutputString = aOutString;

  return NS_OK;
}
/**
 * Serializes part of a DOM text node into aStr.
 *
 * The requested range is split at every '\n' or '\r': each run between line
 * breaks is passed to DoAddLeaf() as a text leaf and each line-break
 * character as a newline leaf.
 *
 * @param aText         the text node; must not be null.
 * @param aStartOffset  first character to output; must be >= 0.
 * @param aEndOffset    one past the last character to output, or -1 for
 *                      "to the end of the node". Values beyond the end of
 *                      the node are clamped to the node's length.
 * @param aStr          string receiving the output.
 * @return NS_OK (also when output is being ignored or the range is empty),
 *         NS_ERROR_INVALID_ARG for a negative start offset or null node,
 *         NS_ERROR_FAILURE if the node has no text fragment, or the first
 *         failure returned by DoAddLeaf().
 */
NS_IMETHODIMP
nsPlainTextSerializer::AppendText(nsIDOMText* aText,
                                  PRInt32 aStartOffset,
                                  PRInt32 aEndOffset,
                                  nsAString& aStr)
{
  if (mIgnoreAboveIndex != (PRUint32)kNotFound) {
    return NS_OK;
  }

  NS_ASSERTION(aStartOffset >= 0, "Negative start offset for text fragment!");
  if ( aStartOffset < 0 )
    return NS_ERROR_INVALID_ARG;

  NS_ENSURE_ARG(aText);

  nsresult rv = NS_OK;

  nsCOMPtr<nsIContent> content = do_QueryInterface(aText);
  const nsTextFragment* frag;
  if (!content || !(frag = content->GetText())) {
    return NS_ERROR_FAILURE;
  }

  // Never read past the end of the fragment, whatever the caller asked for.
  const PRInt32 fragLength = frag->GetLength();
  PRInt32 endoffset = aEndOffset;
  if (endoffset == -1 || endoffset > fragLength) {
    endoffset = fragLength;
  }
  NS_ASSERTION(aStartOffset <= endoffset, "A start offset is beyond the end of the text fragment!");

  const PRInt32 length = endoffset - aStartOffset;
  if (length <= 0) {
    return NS_OK;
  }

  // Copy the requested range into a two-byte string.
  nsAutoString textstr;
  if (frag->Is2b()) {
    textstr.Assign(frag->Get2b() + aStartOffset, length);
  }
  else {
    textstr.AssignWithConversion(frag->Get1b()+aStartOffset, length);
  }

  mOutputString = &aStr;

  // We have to split the string across newlines
  // to match parser behavior
  PRInt32 start = 0;
  PRInt32 offset = textstr.FindCharInSet("\n\r");
  while (offset != kNotFound) {

    if(offset>start) {
      // Pass in the line
      rv = DoAddLeaf(nsnull,
                     eHTMLTag_text,
                     Substring(textstr, start, offset-start));
      if (NS_FAILED(rv)) break;
    }

    // Pass in a newline
    rv = DoAddLeaf(nsnull, eHTMLTag_newline, mLineBreak);
    if (NS_FAILED(rv)) break;

    start = offset+1;
    offset = textstr.FindCharInSet("\n\r", start);
  }

  // Consume the last bit of the string if there's any left
  if (NS_SUCCEEDED(rv) && start < length) {
    if (start) {
      rv = DoAddLeaf(nsnull,
                     eHTMLTag_text,
                     Substring(textstr, start, length-start));
    }
    else {
      // No line break was found: hand over the whole string.
      rv = DoAddLeaf(nsnull, eHTMLTag_text, textstr);
    }
  }

  mOutputString = nsnull;

  return rv;
}
/**
 * Serializes a CDATA section exactly like a text node by delegating to
 * AppendText() with the same offsets and output string.
 */
NS_IMETHODIMP
nsPlainTextSerializer::AppendCDATASection(nsIDOMCDATASection* aCDATASection,
                                          PRInt32 aStartOffset,
                                          PRInt32 aEndOffset,
                                          nsAString& aStr)
{
  // View the CDATA section through its text-node interface.
  nsIDOMText* asTextNode = aCDATASection;
  return AppendText(asTextNode, aStartOffset, aEndOffset, aStr);
}
/**
 * Serializes the start of a DOM element into aStr.
 *
 * Container elements go through DoOpenContainer(); everything else is
 * treated as a leaf with empty text. mContent and mOutputString are set
 * only for the duration of that call.
 *
 * @return NS_ERROR_INVALID_ARG for a null element, NS_ERROR_FAILURE if the
 *         element is not an nsIContent, else the result of the Do* call.
 */
NS_IMETHODIMP
nsPlainTextSerializer::AppendElementStart(nsIDOMElement *aElement,
                                          nsIDOMElement *aOriginalElement,
                                          nsAString& aStr)
{
  NS_ENSURE_ARG(aElement);

  mContent = do_QueryInterface(aElement);
  if (!mContent) return NS_ERROR_FAILURE;

  const PRInt32 tagId = GetIdForContent(mContent);
  const PRBool opensContainer = IsContainer(tagId);

  mOutputString = &aStr;

  const nsresult result = opensContainer
                            ? DoOpenContainer(nsnull, tagId)
                            : DoAddLeaf(nsnull, tagId, EmptyString());

  mContent = 0;
  mOutputString = nsnull;

  // Remember that we are inside <head> from now on.
  if (tagId == eHTMLTag_head) {
    mInHead = PR_TRUE;
  }

  return result;
}
/**
 * Serializes the end of a DOM element into aStr.
 *
 * Only container elements produce any work (via DoCloseContainer()).
 * mContent and mOutputString are set only for the duration of that call.
 *
 * @return NS_ERROR_INVALID_ARG for a null element, NS_ERROR_FAILURE if the
 *         element is not an nsIContent, else NS_OK or the result of
 *         DoCloseContainer().
 */
NS_IMETHODIMP
nsPlainTextSerializer::AppendElementEnd(nsIDOMElement *aElement,
                                        nsAString& aStr)
{
  NS_ENSURE_ARG(aElement);

  mContent = do_QueryInterface(aElement);
  if (!mContent) return NS_ERROR_FAILURE;

  const PRInt32 tagId = GetIdForContent(mContent);
  const PRBool closesContainer = IsContainer(tagId);

  mOutputString = &aStr;

  nsresult result = NS_OK;
  if (closesContainer) {
    result = DoCloseContainer(tagId);
  }

  mContent = 0;
  mOutputString = nsnull;

  // Leaving <head>.
  if (tagId == eHTMLTag_head) {
    mInHead = PR_FALSE;
  }

  return result;
}
/**
 * Writes whatever is still buffered in the current line to aStr by
 * calling FlushLine(). Always returns NS_OK.
 */
NS_IMETHODIMP
nsPlainTextSerializer::Flush(nsAString& aStr)
{
// aStr is the output target only for the duration of the FlushLine() call.
mOutputString = &aStr;
FlushLine();
mOutputString = nsnull;
return NS_OK;
}
/**
 * Does nothing: no output is produced for the start of a document.
 * Always returns NS_OK.
 */
NS_IMETHODIMP
nsPlainTextSerializer::AppendDocumentStart(nsIDOMDocument *aDocument,
nsAString& aStr)
{
return NS_OK;
}
/**
 * Parser-sink entry point for the start of a container element.
 * <head> only sets mInHead; every other tag is passed to DoOpenContainer().
 */
NS_IMETHODIMP
nsPlainTextSerializer::OpenContainer(const nsIParserNode& aNode)
{
  const PRInt32 nodeType = aNode.GetNodeType();

  if (nodeType != eHTMLTag_head) {
    return DoOpenContainer(&aNode, nodeType);
  }

  mInHead = PR_TRUE;
  return NS_OK;
}
/**
 * Parser-sink entry point for the end of a container element.
 * </head> only clears mInHead; every other tag is passed to
 * DoCloseContainer().
 */
NS_IMETHODIMP
nsPlainTextSerializer::CloseContainer(const nsHTMLTag aTag)
{
  if (aTag != eHTMLTag_head) {
    return DoCloseContainer(aTag);
  }

  mInHead = PR_FALSE;
  return NS_OK;
}
/**
 * Parser-sink entry point for a leaf node.
 *
 * Does nothing while output is being ignored. Text, whitespace and newline
 * leaves have their text newline-normalized before being passed to
 * DoAddLeaf(); all other leaves are passed through with their text as is.
 */
NS_IMETHODIMP
nsPlainTextSerializer::AddLeaf(const nsIParserNode& aNode)
{
  if (mIgnoreAboveIndex != (PRUint32)kNotFound) {
    return NS_OK;
  }

  const eHTMLTags leafType = (eHTMLTags)aNode.GetNodeType();
  const nsAString& rawText = aNode.GetText();

  const PRBool isTextual = (leafType == eHTMLTag_text) ||
                           (leafType == eHTMLTag_whitespace) ||
                           (leafType == eHTMLTag_newline);
  if (!isTextual) {
    return DoAddLeaf(&aNode, leafType, rawText);
  }

  // Copy the text out, stripping out CRs
  nsAutoString normalized;
  normalized.SetCapacity(rawText.Length());
  nsReadingIterator<PRUnichar> from, to;
  const PRUint32 copied =
    nsContentUtils::CopyNewlineNormalizedUnicodeTo(rawText.BeginReading(from),
                                                   rawText.EndReading(to),
                                                   normalized);
  normalized.SetLength(copied);
  return DoAddLeaf(&aNode, leafType, normalized);
}
/**
 * Records that the sink is now inside <head> by setting mInHead.
 * Always returns NS_OK.
 */
NS_IMETHODIMP
nsPlainTextSerializer::OpenHead()
{
mInHead = PR_TRUE;
return NS_OK;
}
/**
 * Reports whether a tag counts as "enabled" for this sink.
 *
 * @param aTag     tag id (an nsHTMLTag value).
 * @param aReturn  set to true for <script> unless OutputNoScriptContent is
 *                 set, to true for <frameset> unless OutputNoFramesContent
 *                 is set, and to PR_FALSE for every other tag.
 * @return always NS_OK.
 */
NS_IMETHODIMP
nsPlainTextSerializer::IsEnabled(PRInt32 aTag, PRBool* aReturn)
{
  switch (nsHTMLTag(aTag)) {
    case eHTMLTag_script:
      *aReturn = !(mFlags & nsIDocumentEncoder::OutputNoScriptContent);
      break;

    case eHTMLTag_frameset:
      *aReturn = !(mFlags & nsIDocumentEncoder::OutputNoFramesContent);
      break;

    default:
      *aReturn = PR_FALSE;
      break;
  }

  return NS_OK;
}
/**
 * Handles the start of a container element.
 *
 * Does nothing in raw mode. Otherwise the tag is pushed on the tag stack
 * (if there is room) and, unless output is currently being ignored, the
 * vertical space, indentation, list markers and (in formatted mode only)
 * structure markers that belong in front of the element's content are
 * produced.
 *
 * aNode may be null when we're working with the DOM, but then mContent is
 * usable instead.
 *
 * @param aNode  parser node for the element, or null (see above).
 * @param aTag   tag id of the element (an eHTMLTags value).
 * @return always NS_OK.
 */
nsresult
nsPlainTextSerializer::DoOpenContainer(const nsIParserNode* aNode, PRInt32 aTag)
{
  if (mFlags & nsIDocumentEncoder::OutputRaw) {
    // Raw means raw.  Don't even think about doing anything fancy
    // here like indenting, adding line breaks or any other
    // characters such as list item bullets, quote characters
    // around <q>, etc.  I mean it!  Don't make me smack you!

    return NS_OK;
  }

  eHTMLTags type = (eHTMLTags)aTag;

  // Tags beyond the stack capacity are silently not recorded.
  if (mTagStackIndex < TagStackSize) {
    mTagStack[mTagStackIndex++] = type;
  }

  if (mIgnoreAboveIndex != (PRUint32)kNotFound) {
    return NS_OK;
  }

  // Reset this so that <blockquote type=cite> doesn't affect the whitespace
  // above random <pre>s below it.
  mHasWrittenCiteBlockquote = mHasWrittenCiteBlockquote && aTag == eHTMLTag_pre;

  PRBool isInCiteBlockquote = PR_FALSE;

  // XXX special-case <blockquote type=cite> so that we don't add additional
  // newlines before the text.
  if (aTag == eHTMLTag_blockquote) {
    nsAutoString value;
    nsresult rv = GetAttributeValue(aNode, nsGkAtoms::type, value);
    isInCiteBlockquote = NS_SUCCEEDED(rv) && value.EqualsIgnoreCase("cite");
  }

  if (mLineBreakDue && !isInCiteBlockquote)
    EnsureVerticalSpace(mFloatingLines);

  // Check if this tag's content should not be output
  if ((type == eHTMLTag_noscript &&
       !(mFlags & nsIDocumentEncoder::OutputNoScriptContent)) ||
      ((type == eHTMLTag_iframe || type == eHTMLTag_noframes) &&
       !(mFlags & nsIDocumentEncoder::OutputNoFramesContent))) {
    // Ignore everything that follows the current tag in
    // question until a matching end tag is encountered.
    mIgnoreAboveIndex = mTagStackIndex - 1;
    return NS_OK;
  }

  if (type == eHTMLTag_body) {
    // Try to figure out here whether we have a
    // preformatted style attribute.
    //
    // Trigger on the presence of a "-moz-pre-wrap" in the
    // style attribute. That's a very simplistic way to do
    // it, but better than nothing.
    // Also set mWrapColumn to the value given there
    // (which arguably we should only do if told to do so).
    nsAutoString style;
    PRInt32 whitespace;
    if(NS_SUCCEEDED(GetAttributeValue(aNode, nsGkAtoms::style, style)) &&
       (kNotFound != (whitespace = style.Find("white-space:")))) {

      if (kNotFound != style.Find("-moz-pre-wrap", PR_TRUE, whitespace)) {
#ifdef DEBUG_preformatted
        printf("Set mPreFormatted based on style moz-pre-wrap\n");
#endif
        mPreFormatted = PR_TRUE;
        PRInt32 widthOffset = style.Find("width:");
        if (widthOffset >= 0) {
          // We have to search for the ch before the semicolon,
          // not for the semicolon itself, because nsString::ToInteger()
          // considers 'c' to be a valid numeric char (even if radix=10)
          // but then gets confused if it sees it next to the number
          // when the radix specified was 10, and returns an error code.
          //
          // The second argument of Find(const char*, ...) is the
          // ignore-case flag, so the start offset has to be passed as the
          // third argument for the search to begin after "width:".
          PRInt32 semiOffset = style.Find("ch", PR_FALSE, widthOffset+6);
          PRInt32 length = (semiOffset > 0 ? semiOffset - widthOffset - 6
                            : style.Length() - widthOffset);
          nsAutoString widthstr;
          style.Mid(widthstr, widthOffset+6, length);
          PRInt32 err;
          PRInt32 col = widthstr.ToInteger(&err);

          if (NS_SUCCEEDED(err)) {
            mWrapColumn = (PRUint32)col;
#ifdef DEBUG_preformatted
            printf("Set wrap column to %d based on style\n", mWrapColumn);
#endif
          }
        }
      }
      else if (kNotFound != style.Find("pre", PR_TRUE, whitespace)) {
#ifdef DEBUG_preformatted
        printf("Set mPreFormatted based on style pre\n");
#endif
        mPreFormatted = PR_TRUE;
        mWrapColumn = 0;
      }
    }
    else {
      mPreFormatted = PR_FALSE;
    }

    return NS_OK;
  }

  if (!DoOutput()) {
    return NS_OK;
  }

  if (type == eHTMLTag_p)
    EnsureVerticalSpace(1);
  else if (type == eHTMLTag_pre) {
    if (GetLastBool(mIsInCiteBlockquote))
      EnsureVerticalSpace(0);
    else if (mHasWrittenCiteBlockquote) {
      EnsureVerticalSpace(0);
      mHasWrittenCiteBlockquote = PR_FALSE;
    }
    else
      EnsureVerticalSpace(1);
  }
  else if (type == eHTMLTag_tr) {
    PushBool(mHasWrittenCellsForRow, PR_FALSE);
  }
  else if (type == eHTMLTag_td || type == eHTMLTag_th) {
    // We must make sure that the content of two table cells get a
    // space between them.

    // To make the separation between cells most obvious and
    // importable, we use a TAB.
    if (GetLastBool(mHasWrittenCellsForRow)) {
      // Bypass |Write| so that the TAB isn't compressed away.
      AddToLine(NS_LITERAL_STRING("\t").get(), 1);
      mInWhitespace = PR_TRUE;
    }
    else if (mHasWrittenCellsForRow.Count() == 0) {
      // We don't always see a <tr> (nor a <table>) before the <td> if we're
      // copying part of a table
      PushBool(mHasWrittenCellsForRow, PR_TRUE); // will never be popped
    }
    else {
      SetLastBool(mHasWrittenCellsForRow, PR_TRUE);
    }
  }
  else if (type == eHTMLTag_ul) {
    // Indent here to support nested lists, which aren't included in li :-(
    // Only an outermost list gets an empty line in front of it.
    EnsureVerticalSpace(mULCount + mOLStackIndex == 0 ? 1 : 0);
    // Must end the current line before we change indentation
    mIndent += kIndentSizeList;
    mULCount++;
  }
  else if (type == eHTMLTag_ol) {
    EnsureVerticalSpace(mULCount + mOLStackIndex == 0 ? 1 : 0);
    // Must end the current line before we change indentation
    if (mOLStackIndex < OLStackSize) {
      // The list numbering starts at the "start" attribute if it parses
      // as an integer, else at 1.
      nsAutoString startAttr;
      PRInt32 startVal = 1;
      if(NS_SUCCEEDED(GetAttributeValue(aNode, nsGkAtoms::start, startAttr))){
        PRInt32 rv = 0;
        startVal = startAttr.ToInteger(&rv);
        if (NS_FAILED(rv))
          startVal = 1;
      }
      mOLStack[mOLStackIndex++] = startVal;
    }
    mIndent += kIndentSizeList;  // see ul
  }
  else if (type == eHTMLTag_li) {
    if (mTagStackIndex > 1 && IsInOL()) {
      if (mOLStackIndex > 0) {
        // A parsable "value" attribute overrides the running number.
        nsAutoString valueAttr;
        if(NS_SUCCEEDED(GetAttributeValue(aNode, nsGkAtoms::value, valueAttr))){
          PRInt32 rv = 0;
          PRInt32 valueAttrVal = valueAttr.ToInteger(&rv);
          if (NS_SUCCEEDED(rv))
            mOLStack[mOLStackIndex-1] = valueAttrVal;
        }
        // This is what nsBulletFrame does for OLs:
        mInIndentString.AppendInt(mOLStack[mOLStackIndex-1]++, 10);
      }
      else {
        mInIndentString.Append(PRUnichar('#'));
      }

      mInIndentString.Append(PRUnichar('.'));

    }
    else {
      // Unordered item: the bullet character cycles with the <ul> depth.
      static char bulletCharArray[] = "*o+#";
      PRUint32 index = mULCount > 0 ? (mULCount - 1) : 3;
      char bulletChar = bulletCharArray[index % 4];
      mInIndentString.Append(PRUnichar(bulletChar));
    }

    mInIndentString.Append(PRUnichar(' '));
  }
  else if (type == eHTMLTag_dl) {
    EnsureVerticalSpace(1);
  }
  else if (type == eHTMLTag_dt) {
    EnsureVerticalSpace(0);
  }
  else if (type == eHTMLTag_dd) {
    EnsureVerticalSpace(0);
    mIndent += kIndentSizeDD;
  }
  else if (type == eHTMLTag_span) {
    ++mSpanLevel;
  }
  else if (type == eHTMLTag_blockquote) {
    // Push
    PushBool(mIsInCiteBlockquote, isInCiteBlockquote);
    if (isInCiteBlockquote) {
      EnsureVerticalSpace(0);
      mCiteQuoteLevel++;
    }
    else {
      EnsureVerticalSpace(1);
      mIndent += kTabSize; // Check for some maximum value?
    }
  }
  else if (type == eHTMLTag_q) {
    Write(NS_LITERAL_STRING("\""));
  }

  // Else make sure we'll separate block level tags,
  // even if we're about to leave, before doing any other formatting.
  else if (IsBlockLevel(aTag)) {
    EnsureVerticalSpace(0);
  }

  //////////////////////////////////////////////////////////////
  if (!(mFlags & nsIDocumentEncoder::OutputFormatted)) {
    return NS_OK;
  }
  //////////////////////////////////////////////////////////////
  // The rest of this routine is formatted output stuff,
  // which we should skip if we're not formatted:
  //////////////////////////////////////////////////////////////

  // Push on stack
  PRBool currentNodeIsConverted = IsCurrentNodeConverted(aNode);
  PushBool(mCurrentNodeIsConverted, currentNodeIsConverted);

  if (type == eHTMLTag_h1 || type == eHTMLTag_h2 ||
      type == eHTMLTag_h3 || type == eHTMLTag_h4 ||
      type == eHTMLTag_h5 || type == eHTMLTag_h6)
  {
    EnsureVerticalSpace(2);
    if (mHeaderStrategy == 2) {  // numbered
      mIndent += kIndentSizeHeaders;
      // Caching
      PRInt32 level = HeaderLevel(type);
      // Increase counter for current level
      mHeaderCounter[level]++;
      // Reset all lower levels
      PRInt32 i;

      for (i = level + 1; i <= 6; i++) {
        mHeaderCounter[i] = 0;
      }

      // Construct numbers
      nsAutoString leadup;
      for (i = 1; i <= level; i++) {
        leadup.AppendInt(mHeaderCounter[i]);
        leadup.Append(PRUnichar('.'));
      }
      leadup.Append(PRUnichar(' '));
      Write(leadup);
    }
    else if (mHeaderStrategy == 1) { // indent increasingly
      mIndent += kIndentSizeHeaders;
      for (PRInt32 i = HeaderLevel(type); i > 1; i--) {
           // for h(x), run x-1 times
        mIndent += kIndentIncrementHeaders;
      }
    }
  }
  else if (type == eHTMLTag_a && !currentNodeIsConverted) {
    // Remember the link target; DoCloseContainer() appends it after the
    // link text.
    nsAutoString url;
    if (NS_SUCCEEDED(GetAttributeValue(aNode, nsGkAtoms::href, url))
        && !url.IsEmpty()) {
      mURL = url;
    }
  }
  else if (type == eHTMLTag_sup && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("^"));
  }
  else if (type == eHTMLTag_sub && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("_"));
  }
  else if (type == eHTMLTag_code && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("|"));
  }
  else if ((type == eHTMLTag_strong || type == eHTMLTag_b)
           && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("*"));
  }
  else if ((type == eHTMLTag_em || type == eHTMLTag_i)
           && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("/"));
  }
  else if (type == eHTMLTag_u && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("_"));
  }

  return NS_OK;
}
/**
 * Handles the end of a container element.
 *
 * Does nothing in raw mode. Otherwise the tag stack is popped and, unless
 * the element lies inside (or is) a region whose output is being ignored,
 * the indentation and list state set up by DoOpenContainer() are undone,
 * the vertical space that should follow the element is scheduled or
 * written, and (in formatted mode only) closing structure markers are
 * written.
 *
 * @param aTag  tag id of the element being closed (an eHTMLTags value).
 * @return always NS_OK.
 */
nsresult
nsPlainTextSerializer::DoCloseContainer(PRInt32 aTag)
{
  if (mFlags & nsIDocumentEncoder::OutputRaw) {
    // Raw means raw.  Don't even think about doing anything fancy
    // here like indenting, adding line breaks or any other
    // characters such as list item bullets, quote characters
    // around <q>, etc.  I mean it!  Don't make me smack you!

    return NS_OK;
  }

  if (mTagStackIndex > 0) {
    --mTagStackIndex;
  }

  if (mTagStackIndex >= mIgnoreAboveIndex) {
    if (mTagStackIndex == mIgnoreAboveIndex) {
      // We're dealing with the close tag whose matching
      // open tag had set the mIgnoreAboveIndex value.
      // Reset mIgnoreAboveIndex before discarding this tag.
      mIgnoreAboveIndex = (PRUint32)kNotFound;
    }
    return NS_OK;
  }

  eHTMLTags type = (eHTMLTags)aTag;
  // End current line if we're ending a block level tag
  if((type == eHTMLTag_body) || (type == eHTMLTag_html)) {
    // We want the output to end with a new line,
    // but in preformatted areas like text fields,
    // we can't emit newlines that weren't there.
    // So add the newline only in the case of formatted output.
    if (mFlags & nsIDocumentEncoder::OutputFormatted) {
      EnsureVerticalSpace(0);
    }
    else {
      FlushLine();
    }
    // We won't want to do anything with these in formatted mode either,
    // so just return now:
    return NS_OK;
  }
  else if (type == eHTMLTag_tr) {
    PopBool(mHasWrittenCellsForRow);
    // Should always end a line, but get no more whitespace
    if (mFloatingLines < 0)
      mFloatingLines = 0;
    mLineBreakDue = PR_TRUE;
  }
  else if ((type == eHTMLTag_li) ||
           (type == eHTMLTag_dt)) {
    // Items that should always end a line, but get no more whitespace
    if (mFloatingLines < 0)
      mFloatingLines = 0;
    mLineBreakDue = PR_TRUE;
  }
  else if (type == eHTMLTag_pre) {
    mFloatingLines = GetLastBool(mIsInCiteBlockquote) ? 0 : 1;
    mLineBreakDue = PR_TRUE;
  }
  else if (type == eHTMLTag_ul) {
    FlushLine();
    mIndent -= kIndentSizeList;
    // Only the outermost list is followed by an empty line.
    if (--mULCount + mOLStackIndex == 0) {
      mFloatingLines = 1;
      mLineBreakDue = PR_TRUE;
    }
  }
  else if (type == eHTMLTag_ol) {
    FlushLine(); // Doing this after decreasing OLStackIndex would be wrong.
    mIndent -= kIndentSizeList;
    mOLStackIndex--;
    if (mULCount + mOLStackIndex == 0) {
      mFloatingLines = 1;
      mLineBreakDue = PR_TRUE;
    }
  }
  else if (type == eHTMLTag_dl) {
    mFloatingLines = 1;
    mLineBreakDue = PR_TRUE;
  }
  else if (type == eHTMLTag_dd) {
    FlushLine();
    mIndent -= kIndentSizeDD;
  }
  else if (type == eHTMLTag_span) {
    --mSpanLevel;
  }
  else if (type == eHTMLTag_div) {
    if (mFloatingLines < 0)
      mFloatingLines = 0;
    mLineBreakDue = PR_TRUE;
  }
  else if (type == eHTMLTag_blockquote) {
    FlushLine();    // Is this needed?

    // Pop
    PRBool isInCiteBlockquote = PopBool(mIsInCiteBlockquote);

    if (isInCiteBlockquote) {
      mCiteQuoteLevel--;
      mFloatingLines = 0;
      mHasWrittenCiteBlockquote = PR_TRUE;
    }
    else {
      mIndent -= kTabSize;
      mFloatingLines = 1;
    }
    mLineBreakDue = PR_TRUE;
  }
  else if (type == eHTMLTag_q) {
    Write(NS_LITERAL_STRING("\""));
  }
  else if (IsBlockLevel(aTag)
           && type != eHTMLTag_script
           && type != eHTMLTag_doctypeDecl
           && type != eHTMLTag_markupDecl) {
    // All other blocks get 1 vertical space after them
    // in formatted mode, otherwise 0.
    // This is hard. Sometimes 0 is a better number, but
    // how to know?
    if (mFlags & nsIDocumentEncoder::OutputFormatted)
      EnsureVerticalSpace(1);
    else {
      if (mFloatingLines < 0)
        mFloatingLines = 0;
      mLineBreakDue = PR_TRUE;
    }
  }

  //////////////////////////////////////////////////////////////
  if (!(mFlags & nsIDocumentEncoder::OutputFormatted)) {
    return NS_OK;
  }
  //////////////////////////////////////////////////////////////
  // The rest of this routine is formatted output stuff,
  // which we should skip if we're not formatted:
  //////////////////////////////////////////////////////////////

  // Pop the currentConverted stack
  PRBool currentNodeIsConverted = PopBool(mCurrentNodeIsConverted);

  if (type == eHTMLTag_h1 || type == eHTMLTag_h2 ||
      type == eHTMLTag_h3 || type == eHTMLTag_h4 ||
      type == eHTMLTag_h5 || type == eHTMLTag_h6) {

    // Undo exactly what DoOpenContainer() added: the base header indent is
    // only applied for strategy 1 (indent increasingly) and 2 (numbered).
    // Testing for "non-zero" here would drive mIndent negative for any
    // other value of the header strategy pref.
    if (mHeaderStrategy == 1 || mHeaderStrategy == 2) {
      mIndent -= kIndentSizeHeaders;
    }
    if (mHeaderStrategy == 1 /*indent increasingly*/ ) {
      for (PRInt32 i = HeaderLevel(type); i > 1; i--) {
           // for h(x), run x-1 times
        mIndent -= kIndentIncrementHeaders;
      }
    }
    EnsureVerticalSpace(1);
  }
  else if (type == eHTMLTag_a && !currentNodeIsConverted && !mURL.IsEmpty()) {
    // Append the link target remembered by DoOpenContainer() as " <url>".
    nsAutoString temp;
    temp.AssignLiteral(" <");
    temp += mURL;
    temp.Append(PRUnichar('>'));
    Write(temp);
    mURL.Truncate();
  }
  else if ((type == eHTMLTag_sup || type == eHTMLTag_sub)
           && mStructs && !currentNodeIsConverted) {
    Write(kSpace);
  }
  else if (type == eHTMLTag_code && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("|"));
  }
  else if ((type == eHTMLTag_strong || type == eHTMLTag_b)
           && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("*"));
  }
  else if ((type == eHTMLTag_em || type == eHTMLTag_i)
           && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("/"));
  }
  else if (type == eHTMLTag_u && mStructs && !currentNodeIsConverted) {
    Write(NS_LITERAL_STRING("_"));
  }

  return NS_OK;
}
/**
 * Handles a leaf node: text, whitespace, newline, entity, <br>, <hr>
 * or <img>.
 *
 * Nothing is written when DoOutput() says output is disabled, or when the
 * leaf sits directly inside a <script>, or inside a <select> (as its child
 * or grandchild according to the tag stack).
 *
 * aNode may be null when we're working with the DOM, but then mContent is
 * usable instead.
 *
 * @param aNode  parser node for the leaf, or null (see above).
 * @param aTag   tag id of the leaf (an eHTMLTags value).
 * @param aText  text of the leaf (used for text, whitespace and entities).
 * @return always NS_OK.
 */
nsresult
nsPlainTextSerializer::DoAddLeaf(const nsIParserNode *aNode, PRInt32 aTag,
                                 const nsAString& aText)
{
  // If we don't want any output, just return
  if (!DoOutput()) {
    return NS_OK;
  }

  if (aTag != eHTMLTag_whitespace && aTag != eHTMLTag_newline) {
    // Make sure to reset this, since it's no longer true.
    mHasWrittenCiteBlockquote = PR_FALSE;
  }

  if (mLineBreakDue)
    EnsureVerticalSpace(mFloatingLines);

  eHTMLTags type = (eHTMLTags)aTag;

  if ((mTagStackIndex > 1 &&
       mTagStack[mTagStackIndex-2] == eHTMLTag_select) ||
      (mTagStackIndex > 0 &&
        mTagStack[mTagStackIndex-1] == eHTMLTag_select)) {
    // Don't output the contents of SELECT elements;
    // Might be nice, eventually, to output just the selected element.
    // Read more in bug 31994.
    return NS_OK;
  }
  else if (mTagStackIndex > 0 && mTagStack[mTagStackIndex-1] == eHTMLTag_script) {
    // Don't output the contents of <script> tags;
    return NS_OK;
  }
  else if (type == eHTMLTag_text) {
    /* Check, if we are in a link (symbolized with mURL containing the URL)
       and the text is equal to the URL. In that case we don't want to output
       the URL twice so we scrap the text in mURL. */
    if (!mURL.IsEmpty() && mURL.Equals(aText)) {
      mURL.Truncate();
    }
    Write(aText);
  }
  else if (type == eHTMLTag_entity) {
    nsIParserService* parserService = nsContentUtils::GetParserService();
    if (parserService) {
      nsAutoString str(aText);
      PRInt32 entity;
      parserService->HTMLConvertEntityToUnicode(str, &entity);
      // Not a named entity: if the text starts with '#', parse it as a
      // numeric character reference.
      if (entity == -1 &&
          !str.IsEmpty() &&
          str.First() == (PRUnichar) '#') {
        PRInt32 err = 0;
        entity = str.ToInteger(&err, kAutoDetect);  // NCR
      }
      nsAutoString temp;
      temp.Append(PRUnichar(entity));
      Write(temp);
    }
  }
  else if (type == eHTMLTag_br) {
    // Another egregious editor workaround, see bug 38194:
    // ignore the bogus br tags that the editor sticks here and there.
    nsAutoString typeAttr;
    if (NS_FAILED(GetAttributeValue(aNode, nsGkAtoms::type, typeAttr))
        || !typeAttr.EqualsLiteral("_moz")) {
      EnsureVerticalSpace(mEmptyLines+1);
    }
  }
  else if (type == eHTMLTag_whitespace) {
    // The only times we want to pass along whitespace from the original
    // html source are if we're forced into preformatted mode via flags,
    // or if we're prettyprinting and we're inside a <pre>.
    // Otherwise, either we're collapsing to minimal text, or we're
    // prettyprinting to mimic the html format, and in neither case
    // does the formatting of the html source help us.
    // One exception: at the very beginning of a selection,
    // we want to preserve whitespace.
    if (mFlags & nsIDocumentEncoder::OutputPreformatted ||
        (mPreFormatted && !mWrapColumn) ||
        IsInPre()) {
      Write(aText);
    }
    // The selection test must be a bitwise AND: with OR the expression is
    // non-zero for every value of mFlags, which would make the exception
    // above apply to all output instead of selections only.
    else if(!mInWhitespace ||
            (!mStartedOutput
             && (mFlags & nsIDocumentEncoder::OutputSelectionOnly))) {
      // Clear mInWhitespace around the Write() call and set it again
      // afterwards.
      mInWhitespace = PR_FALSE;
      Write(kSpace);
      mInWhitespace = PR_TRUE;
    }
  }
  else if (type == eHTMLTag_newline) {
    if (mFlags & nsIDocumentEncoder::OutputPreformatted ||
        (mPreFormatted && !mWrapColumn) ||
        IsInPre()) {
      EnsureVerticalSpace(mEmptyLines+1);
    }
    else {
      Write(kSpace);
    }
  }
  else if (type == eHTMLTag_hr &&
           (mFlags & nsIDocumentEncoder::OutputFormatted)) {
    EnsureVerticalSpace(0);

    // Make a line of dashes as wide as the wrap width
    // XXX honoring percentage would be nice
    nsAutoString line;
    PRUint32 width = (mWrapColumn > 0 ? mWrapColumn : 25);
    while (line.Length() < width) {
      line.Append(PRUnichar('-'));
    }
    Write(line);

    EnsureVerticalSpace(0);
  }
  else if (type == eHTMLTag_img) {
    /* Output (in decreasing order of preference)
       alt, title or nothing */
    // See <http://www.w3.org/TR/REC-html40/struct/objects.html#edef-IMG>
    nsAutoString imageDescription;
    if (NS_SUCCEEDED(GetAttributeValue(aNode,
                                       nsGkAtoms::alt,
                                       imageDescription))) {
      // If the alt attribute has an empty value (|alt=""|), output nothing
    }
    else if (NS_SUCCEEDED(GetAttributeValue(aNode,
                                            nsGkAtoms::title,
                                            imageDescription))
             && !imageDescription.IsEmpty()) {
      imageDescription = NS_LITERAL_STRING(" [") +
                         imageDescription +
                         NS_LITERAL_STRING("] ");
    }

    Write(imageDescription);
  }

  return NS_OK;
}
/**
 * Ends lines until at least |noOfRows| empty lines precede the current
 * position, then clears any pending line-break request.
 *
 * noOfRows = -1    : Being in the middle of some line of text
 * noOfRows =  0    : Being at the start of a line
 * noOfRows =  n>0  : Having n empty lines before the current line.
 */
void
nsPlainTextSerializer::EnsureVerticalSpace(PRInt32 noOfRows)
{
  // Pending indent text is not reflected in the empty-line count, so
  // without this extra EndLine() we would not notice that a new line has
  // to be started.
  const PRBool indentPending = !mInIndentString.IsEmpty();
  if (indentPending && noOfRows >= 0) {
    EndLine(PR_FALSE);
  }

  while (noOfRows > mEmptyLines) {
    EndLine(PR_FALSE);
  }

  // Whatever break was scheduled has now been taken care of.
  mFloatingLines = -1;
  mLineBreakDue = PR_FALSE;
}
/**
* This empties the current line cache without adding a NEWLINE.
* Should not be used if line wrapping is of importance since
* this function destroys the cache information.
*
* It will also write indentation and quotes if we believe us to be
* at the start of the line.
*/
void
nsPlainTextSerializer::FlushLine()
{
if(!mCurrentLine.IsEmpty()) {
if(mAtFirstColumn) {
OutputQuotesAndIndent(); // XXX: Should we always do this? Bug?
}
Output(mCurrentLine);
mAtFirstColumn = mAtFirstColumn && mCurrentLine.IsEmpty();
mCurrentLine.Truncate();
mCurrentLineWidth = 0;
}
}
/**
 * Appends |aString| to the output string (mOutputString).
 *
 * Unless the OutputPersistNBSP flag is set, every non-breaking space is
 * first replaced by an ordinary space, because most (all?) receivers of the
 * result won't understand nbsp and may even be confused by it. The
 * replacement is done in place, so the caller's string is modified.
 *
 * A non-empty string also marks output as started (mStartedOutput).
 */
void
nsPlainTextSerializer::Output(nsString& aString)
{
  const PRBool keepNBSP =
    (mFlags & nsIDocumentEncoder::OutputPersistNBSP) != 0;

  if (!aString.IsEmpty()) {
    mStartedOutput = PR_TRUE;
  }

  if (!keepNBSP) {
    // The unicode encoder won't turn nbsp into a space for us.
    aString.ReplaceChar(kNBSP, kSPACE);
  }

  mOutputString->Append(aString);
}
/**
* This function adds a piece of text to the current stored line. If we are
* wrapping text and the stored line will become too long, a suitable
* location to wrap will be found and the line that's complete will be
* output.
*/
void
nsPlainTextSerializer::AddToLine(const PRUnichar * aLineFragment,
PRInt32 aLineFragmentLength)
{
PRUint32 prefixwidth = (mCiteQuoteLevel > 0 ? mCiteQuoteLevel + 1:0)+mIndent;
if (mLineBreakDue)
EnsureVerticalSpace(mFloatingLines);
PRInt32 linelength = mCurrentLine.Length();
if(0 == linelength) {
if(0 == aLineFragmentLength) {
// Nothing at all. Are you kidding me?
return;
}
if(mFlags & nsIDocumentEncoder::OutputFormatFlowed) {
if(
(
'>' == aLineFragment[0] ||
' ' == aLineFragment[0] ||
kNBSP == aLineFragment[0] || // bug 215068
!nsCRT::strncmp(aLineFragment, NS_LITERAL_STRING("From ").get(), 5)
)
&& mCiteQuoteLevel == 0 // We space-stuff quoted lines anyway
)
{
// Space stuffing a la RFC 2646 (format=flowed).
mCurrentLine.Append(PRUnichar(' '));
if(MayWrap()) {
mCurrentLineWidth += GetUnicharWidth(' ');
#ifdef DEBUG_wrapping
NS_ASSERTION(GetUnicharStringWidth(mCurrentLine.get(),
mCurrentLine.Length()) ==
(PRInt32)mCurrentLineWidth,
"mCurrentLineWidth and reality out of sync!");
#endif
}
}
}
mEmptyLines=-1;
}
mCurrentLine.Append(aLineFragment, aLineFragmentLength);
if(MayWrap()) {
mCurrentLineWidth += GetUnicharStringWidth(aLineFragment,
aLineFragmentLength);
#ifdef DEBUG_wrapping
NS_ASSERTION(GetUnicharstringWidth(mCurrentLine.get(),
mCurrentLine.Length()) ==
(PRInt32)mCurrentLineWidth,
"mCurrentLineWidth and reality out of sync!");
#endif
}
linelength = mCurrentLine.Length();
// Wrap?
if(MayWrap())
{
#ifdef DEBUG_wrapping
NS_ASSERTION(GetUnicharstringWidth(mCurrentLine.get(),
mCurrentLine.Length()) ==
(PRInt32)mCurrentLineWidth,
"mCurrentLineWidth and reality out of sync!");
#endif
// Yes, wrap!
// The "+4" is to avoid wrap lines that only would be a couple
// of letters too long. We give this bonus only if the
// wrapcolumn is more than 20.
PRUint32 bonuswidth = (mWrapColumn > 20) ? 4 : 0;
// XXX: Should calculate prefixwidth with GetUnicharStringWidth
while(mCurrentLineWidth+prefixwidth > mWrapColumn+bonuswidth) {
// We go from the end removing one letter at a time until
// we have a reasonable width
PRInt32 goodSpace = mCurrentLine.Length();
PRUint32 width = mCurrentLineWidth;
while(goodSpace > 0 && (width+prefixwidth > mWrapColumn)) {
goodSpace--;
width -= GetUnicharWidth(mCurrentLine[goodSpace]);
}
goodSpace++;
if (mLineBreaker) {
goodSpace = mLineBreaker->Prev(mCurrentLine.get(),
mCurrentLine.Length(), goodSpace);
if (goodSpace != NS_LINEBREAKER_NEED_MORE_TEXT &&
nsCRT::IsAsciiSpace(mCurrentLine.CharAt(goodSpace-1))) {
--goodSpace; // adjust the position since line breaker returns a position next to space
}
}
// fallback if the line breaker is unavailable or failed
if (!mLineBreaker) {
goodSpace = mWrapColumn-prefixwidth;
while (goodSpace >= 0 &&
!nsCRT::IsAsciiSpace(mCurrentLine.CharAt(goodSpace))) {
goodSpace--;
}
}
nsAutoString restOfLine;
if (goodSpace == NS_LINEBREAKER_NEED_MORE_TEXT) {
// If we don't found a good place to break, accept long line and
// try to find another place to break
goodSpace=(prefixwidth>mWrapColumn+1)?1:mWrapColumn-prefixwidth+1;
if (mLineBreaker) {
goodSpace = mLineBreaker->Next(mCurrentLine.get(),
mCurrentLine.Length(), goodSpace);
if (goodSpace == NS_LINEBREAKER_NEED_MORE_TEXT)
goodSpace = mCurrentLine.Length();
}
// fallback if the line breaker is unavailable or failed
if (!mLineBreaker) {
goodSpace=(prefixwidth>mWrapColumn)?1:mWrapColumn-prefixwidth;
while (goodSpace < linelength &&
!nsCRT::IsAsciiSpace(mCurrentLine.CharAt(goodSpace))) {
goodSpace++;
}
}
}
if((goodSpace < linelength) && (goodSpace > 0)) {
// Found a place to break
// -1 (trim a char at the break position)
// only if the line break was a space.
if (nsCRT::IsAsciiSpace(mCurrentLine.CharAt(goodSpace))) {
mCurrentLine.Right(restOfLine, linelength-goodSpace-1);
}
else {
mCurrentLine.Right(restOfLine, linelength-goodSpace);
}
mCurrentLine.Truncate(goodSpace);
EndLine(PR_TRUE);
mCurrentLine.Truncate();
// Space stuff new line?
if(mFlags & nsIDocumentEncoder::OutputFormatFlowed) {
if(
!restOfLine.IsEmpty()
&&
(
restOfLine[0] == '>' ||
restOfLine[0] == ' ' ||
StringBeginsWith(restOfLine, NS_LITERAL_STRING("From "))
)
&& mCiteQuoteLevel == 0 // We space-stuff quoted lines anyway
)
{
// Space stuffing a la RFC 2646 (format=flowed).
mCurrentLine.Append(PRUnichar(' '));
//XXX doesn't seem to work correctly for ' '
}
}
mCurrentLine.Append(restOfLine);
mCurrentLineWidth = GetUnicharStringWidth(mCurrentLine.get(),
mCurrentLine.Length());
linelength = mCurrentLine.Length();
mEmptyLines = -1;
}
else {
// Nothing to do. Hopefully we get more data later
// to use for a place to break line
break;
}
}
}
else {
// No wrapping.
}
}
/**
 * Terminates the current line: writes mCurrentLine followed by mLineBreak
 * and resets the per-line state. Quotes and indentation are written first
 * if we are at the first column. Trailing spaces are stripped from the line
 * unless the output is preformatted.
 *
 * @param aSoftlinebreak  PR_TRUE for a soft (wrapping) break, PR_FALSE for
 *                        a hard break. A soft break on an empty line is
 *                        meaningless and ignored.
 */
void
nsPlainTextSerializer::EndLine(PRBool aSoftlinebreak)
{
  PRUint32 lineLen = mCurrentLine.Length();
  if (aSoftlinebreak && 0 == lineLen) {
    // No meaning
    return;
  }

  /* In non-preformatted mode, drop the spaces at the end of the line for
   * format=flowed compatibility. Two special lines are left untouched when
   * the break is a hard one:
   *   "-- "    the signature separator (RFC 2646), and
   *   "- -- "  the OpenPGP dash-escaped signature separator in inline
   *            signed messages (RFC 2440).
   */
  const PRBool preformattedOutput =
    (mFlags & nsIDocumentEncoder::OutputPreformatted) != 0;
  if (!preformattedOutput) {
    const PRBool isSignatureSeparator =
      mCurrentLine.EqualsLiteral("-- ") ||
      mCurrentLine.EqualsLiteral("- -- ");
    if (aSoftlinebreak || !isSignatureSeparator) {
      for (; lineLen > 0 && mCurrentLine[lineLen - 1] == ' '; --lineLen) {
        // just counting down past the trailing spaces
      }
      mCurrentLine.SetLength(lineLen);
    }
  }

  if (aSoftlinebreak) {
    // Add the soft part of the soft linebreak (RFC 2646 4.1).
    // Only done without indentation, since format=flowed lines and
    // indentation don't work well together.
    if ((mFlags & nsIDocumentEncoder::OutputFormatFlowed) && mIndent == 0) {
      mCurrentLine.Append(PRUnichar(' '));
    }
    mEmptyLines = 0;
  }
  else {
    // Hard break: a line with content restarts the empty-line count,
    // an empty one adds to it.
    const PRBool lineHasContent =
      !mCurrentLine.IsEmpty() || !mInIndentString.IsEmpty();
    if (lineHasContent) {
      mEmptyLines = 0;
    }
    else {
      ++mEmptyLines;
    }
  }

  if (mAtFirstColumn) {
    // If there is nothing "real" to output, the indent must not end in a
    // space since that would trick a format=flowed-aware receiver.
    OutputQuotesAndIndent(mCurrentLine.IsEmpty());
  }

  mCurrentLine.Append(mLineBreak);
  Output(mCurrentLine);

  // Start over with a fresh, empty line.
  mCurrentLine.Truncate();
  mCurrentLineWidth = 0;
  mAtFirstColumn = PR_TRUE;
  mInWhitespace = PR_TRUE;
  mLineBreakDue = PR_FALSE;
  mFloatingLines = -1;
}
/**
 * Writes the line prefix: the mail quote markers, the indentation, and any
 * text stored for the indentation (mInIndentString, e.g. list numbers).
 * The stored indentation text is cleared after it has been used.
 * mAtFirstColumn is cleared whenever any part of the prefix applies.
 *
 * @param stripTrailingSpaces  if true, spaces at the end of the assembled
 *                             prefix are removed before it is written
 */
void
nsPlainTextSerializer::OutputQuotesAndIndent(PRBool stripTrailingSpaces /* = PR_FALSE */)
{
  nsAutoString prefix;
  const PRBool lineHasText = !mCurrentLine.IsEmpty();

  // Mail quote "> " chars, if appropriate.
  if (mCiteQuoteLevel > 0) {
    for (int level = 0; level < mCiteQuoteLevel; ++level) {
      prefix.Append(PRUnichar('>'));
    }
    if (lineHasText) {
      /* No space after the markers on an empty line: a receiving
         f=f-aware UA could take it for a flowed line and join it with
         the following one, losing the empty line completely. */
      prefix.Append(PRUnichar(' '));
    }
    mAtFirstColumn = PR_FALSE;
  }

  // Indentation. The stored indentation text occupies part of the indent,
  // so only the remainder is padded with spaces.
  const PRBool haveIndentText = !mInIndentString.IsEmpty();
  PRInt32 padding = mIndent - mInIndentString.Length();
  if (padding > 0 && (lineHasText || haveIndentText)) {
    // (skipped for otherwise empty lines so they don't look flowed)
    while (padding-- > 0) {
      prefix.Append(PRUnichar(' '));
    }
    mAtFirstColumn = PR_FALSE;
  }

  if (haveIndentText) {
    prefix += mInIndentString;
    mAtFirstColumn = PR_FALSE;
    mInIndentString.Truncate();
  }

  if (stripTrailingSpaces) {
    PRInt32 keep = prefix.Length();
    while (keep > 0 && prefix[keep - 1] == ' ') {
      --keep;
    }
    prefix.SetLength(keep);
  }

  if (!prefix.IsEmpty()) {
    Output(prefix);
  }
}
/**
 * Write a string. This is the high-level function to use to get text output.
 * By using AddToLine, Output, EndLine and other functions it handles
 * quotation, line wrapping, indentation, whitespace compression and other
 * things.
 *
 * There are two code paths:
 *  - Preformatted text (and quoted lines that must not be rewrapped) is
 *    split at '\n' / '\r' and sent straight to Output(), with quotes and
 *    indentation put in front of every line.
 *  - Everything else is cut at whitespace and fed to AddToLine(), which
 *    takes care of wrapping. Runs of whitespace are collapsed to a single
 *    space unless preformatted output is requested.
 *
 * An empty string is ignored.
 *
 * @param aStr  the text to write
 */
void
nsPlainTextSerializer::Write(const nsAString& aStr)
{
  // XXX Copy necessary to use nsString methods and gain
  // access to underlying buffer
  nsAutoString str(aStr);

#ifdef DEBUG_wrapping
  printf("Write(%s): wrap col = %d\n",
         NS_ConvertUTF16toUTF8(str).get(), mWrapColumn);
#endif

  PRInt32 bol = 0;
  PRInt32 newline;

  PRInt32 totLen = str.Length();

  // If the string is empty, do nothing:
  if (totLen <= 0) return;

  // For Flowed text change nbsp-ses to spaces at end of lines to allow them
  // to be cut off along with usual spaces if required. (bug #125928)
  if (mFlags & nsIDocumentEncoder::OutputFormatFlowed) {
    PRUnichar nbsp = 160;
    // Walk backwards over the trailing whitespace only; stop at the first
    // character that is neither whitespace nor nbsp.
    for (PRInt32 i = totLen-1; i >= 0; i--) {
      PRUnichar c = str[i];
      if ('\n' == c || '\r' == c || ' ' == c || '\t' == c)
        continue;
      if (nbsp == c)
        str.Replace(i, 1, ' ');
      else
        break;
    }
  }

  // We have two major codepaths here. One that does preformatted text and one
  // that does normal formatted text. The one for preformatted text calls
  // Output directly while the other code path goes through AddToLine.
  if ((mPreFormatted && !mWrapColumn) || IsInPre()
      || ((((!mQuotesPreformatted && mSpanLevel > 0) || mDontWrapAnyQuotes))
          && mEmptyLines >= 0 && str.First() == PRUnichar('>'))) {
    // No intelligent wrapping.

    // This mustn't be mixed with intelligent wrapping without clearing
    // the mCurrentLine buffer before!!!
    NS_ASSERTION(mCurrentLine.IsEmpty(),
                 "Mixed wrapping data and nonwrapping data on the same line");
    if (!mCurrentLine.IsEmpty()) {
      FlushLine();
    }

    // Put the mail quote "> " chars in, if appropriate.
    // Have to put it in before every line.
    while(bol<totLen) {
      PRBool outputQuotes = mAtFirstColumn;
      PRBool atFirstColumn = mAtFirstColumn;
      PRBool outputLineBreak = PR_FALSE;

      // Find one of '\n' or '\r' using iterators since nsAString
      // doesn't have the old FindCharInSet function.
      nsAString::const_iterator iter;           str.BeginReading(iter);
      nsAString::const_iterator done_searching; str.EndReading(done_searching);
      iter.advance(bol);
      PRInt32 new_newline = bol;
      newline = kNotFound;
      while(iter != done_searching) {
        if('\n' == *iter || '\r' == *iter) {
          newline = new_newline;
          break;
        }
        ++new_newline;
        ++iter;
      }

      // Done searching
      if(newline == kNotFound) {
        // No new lines.
        nsAutoString stringpart(Substring(str, bol, totLen - bol));
        if(!stringpart.IsEmpty()) {
          // Remember whether this chunk ended in whitespace.
          PRUnichar lastchar = stringpart[stringpart.Length()-1];
          if((lastchar == '\t') || (lastchar == ' ') ||
             (lastchar == '\r') ||(lastchar == '\n')) {
            mInWhitespace = PR_TRUE;
          }
          else {
            mInWhitespace = PR_FALSE;
          }
        }
        mCurrentLine.Assign(stringpart);
        mEmptyLines=-1;
        atFirstColumn = mAtFirstColumn && (totLen-bol)==0;
        bol = totLen;
      }
      else {
        // There is a newline
        nsAutoString stringpart(Substring(str, bol, newline-bol));
        if (mFlags & nsIDocumentEncoder::OutputFormatFlowed)
          stringpart.Trim(" ", PR_FALSE, PR_TRUE, PR_TRUE);
        mInWhitespace = PR_TRUE;
        mCurrentLine.Assign(stringpart);
        outputLineBreak = PR_TRUE;
        mEmptyLines=0;
        atFirstColumn = PR_TRUE;
        bol = newline+1;
        if('\r' == *iter && bol < totLen && '\n' == *++iter) {
          // There was a CRLF in the input. This used to be illegal and
          // stripped by the parser. Apparently not anymore. Let's skip
          // over the LF.
          bol++;
        }
      }

      if(outputQuotes) {
        // Note: this call messes with mAtFirstColumn
        OutputQuotesAndIndent();
      }

      Output(mCurrentLine);
      if (outputLineBreak) {
        Output(mLineBreak);
      }
      // Restore the value decided above, since OutputQuotesAndIndent()
      // may have changed mAtFirstColumn.
      mAtFirstColumn = atFirstColumn;
    }

    // Reset mCurrentLine.
    mCurrentLine.Truncate();

#ifdef DEBUG_wrapping
    printf("No wrapping: newline is %d, totLen is %d\n",
           newline, totLen);
#endif
    return;
  }

  // Intelligent handling of text
  // If needed, strip out all "end of lines"
  // and multiple whitespace between words
  PRInt32 nextpos;
  const PRUnichar * offsetIntoBuffer = nsnull;

  while (bol < totLen) {    // Loop over lines
    // Find a place where we may have to do whitespace compression
    nextpos = str.FindCharInSet(" \t\n\r", bol);
#ifdef DEBUG_wrapping
    nsAutoString remaining;
    str.Right(remaining, totLen - bol);
    char* foo = ToNewCString(remaining);
    //    printf("Next line: bol = %d, newlinepos = %d, totLen = %d, string = '%s'\n",
    //           bol, nextpos, totLen, foo);
    nsMemory::Free(foo);
#endif

    if(nextpos == kNotFound) {
      // The rest of the string
      offsetIntoBuffer = str.get() + bol;
      AddToLine(offsetIntoBuffer, totLen-bol);
      bol=totLen;
      mInWhitespace=PR_FALSE;
    }
    else {
      // There's still whitespace left in the string
      // (the bounds test keeps the [-1] and [1] accesses inside the string)
      if (nextpos != 0 && (nextpos + 1) < totLen) {
        offsetIntoBuffer = str.get() + nextpos;
        // skip '\n' if it is between CJ chars
        if (offsetIntoBuffer[0] == '\n' && IS_CJ_CHAR(offsetIntoBuffer[-1]) && IS_CJ_CHAR(offsetIntoBuffer[1])) {
          offsetIntoBuffer = str.get() + bol;
          AddToLine(offsetIntoBuffer, nextpos-bol);
          bol = nextpos + 1;
          continue;
        }
      }
      // If we're already in whitespace and not preformatted, just skip it:
      if (mInWhitespace && (nextpos == bol) && !mPreFormatted &&
          !(mFlags & nsIDocumentEncoder::OutputPreformatted)) {
        // Skip whitespace
        bol++;
        continue;
      }

      if(nextpos == bol) {
        // Note that we are in whitespace.
        mInWhitespace = PR_TRUE;
        offsetIntoBuffer = str.get() + nextpos;
        AddToLine(offsetIntoBuffer, 1);
        bol++;
        continue;
      }

      mInWhitespace = PR_TRUE;

      offsetIntoBuffer = str.get() + bol;
      if(mPreFormatted || (mFlags & nsIDocumentEncoder::OutputPreformatted)) {
        // Preserve the real whitespace character
        nextpos++;
        AddToLine(offsetIntoBuffer, nextpos-bol);
        bol = nextpos;
      }
      else {
        // Replace the whitespace with a space
        AddToLine(offsetIntoBuffer, nextpos-bol);
        AddToLine(kSpace.get(),1);
        bol = nextpos + 1; // Let's eat the whitespace
      }
    }
  } // Continue looping over the string
}
/**
 * Looks up the value of the attribute |aName| and stores it in |aValueRet|.
 *
 * When mContent is set, the attribute is read from it and |aNode| is not
 * consulted. Otherwise the attributes of |aNode| (if non-null) are searched
 * for a key that matches the attribute name case-insensitively; the first
 * match wins.
 *
 * @return NS_OK if the attribute was found, NS_ERROR_NOT_AVAILABLE if no
 *         such attribute was specified.
 */
nsresult
nsPlainTextSerializer::GetAttributeValue(const nsIParserNode* aNode,
                                         nsIAtom* aName,
                                         nsString& aValueRet)
{
  if (mContent) {
    return mContent->GetAttr(kNameSpaceID_None, aName, aValueRet)
             ? NS_OK
             : NS_ERROR_NOT_AVAILABLE;
  }

  if (!aNode) {
    return NS_ERROR_NOT_AVAILABLE;
  }

  nsAutoString wanted;
  aName->ToString(wanted);

  const PRInt32 attrCount = aNode->GetAttributeCount();
  for (PRInt32 idx = 0; idx < attrCount; ++idx) {
    if (!aNode->GetKeyAt(idx).Equals(wanted,
                                     nsCaseInsensitiveStringComparator())) {
      continue;
    }
    aValueRet = aNode->GetValueAt(idx);
    return NS_OK;
  }

  return NS_ERROR_NOT_AVAILABLE;
}
/**
 * Returns true if the element was inserted by Moz' TXT->HTML converter,
 * in which case we should ignore it. Such elements are recognized by their
 * class attribute, which is checked against "moz-txt" (optionally preceded
 * by a double quote), ignoring case.
 */
PRBool
nsPlainTextSerializer::IsCurrentNodeConverted(const nsIParserNode* aNode)
{
  nsAutoString classValue;
  if (!NS_SUCCEEDED(GetAttributeValue(aNode, nsGkAtoms::_class, classValue))) {
    // No class attribute at all.
    return PR_FALSE;
  }

  return classValue.EqualsIgnoreCase("moz-txt", 7) ||
         classValue.EqualsIgnoreCase("\"moz-txt", 8);
}
/**
 * Maps a content node to its HTML tag id.
 *
 * @return the id the parser service reports for the node's tag, or
 *         eHTMLTag_unknown if the node is not an HTML node or no parser
 *         service is available.
 */
// static
PRInt32
nsPlainTextSerializer::GetIdForContent(nsIContent* aContent)
{
  if (aContent->IsNodeOfType(nsINode::eHTML)) {
    nsIParserService* parserService = nsContentUtils::GetParserService();
    if (parserService) {
      return parserService->HTMLAtomTagToId(aContent->Tag());
    }
  }

  return eHTMLTag_unknown;
}
/**
 * Returns true if the id represents an element of block type, as reported
 * by the parser service. Can be used to determine if a new paragraph
 * should be started. Returns false when no parser service is available.
 */
PRBool
nsPlainTextSerializer::IsBlockLevel(PRInt32 aId)
{
  nsIParserService* parserService = nsContentUtils::GetParserService();
  if (!parserService) {
    return PR_FALSE;
  }

  PRBool blockLevel = PR_FALSE;
  parserService->IsBlock(aId, blockLevel);
  return blockLevel;
}
/**
 * Returns true if the id represents a container, as reported by the parser
 * service. Returns false when no parser service is available.
 */
PRBool
nsPlainTextSerializer::IsContainer(PRInt32 aId)
{
  nsIParserService* parserService = nsContentUtils::GetParserService();
  if (!parserService) {
    return PR_FALSE;
  }

  PRBool container = PR_FALSE;
  parserService->IsContainer(aId, container);
  return container;
}
/**
 * Returns true if we currently are inside a <pre>.
 *
 * The tag stack is walked from the innermost entry outwards, looking for a
 * <pre>. The walk stops at the first block level tag, which is assumed to
 * override any <pre> further out in the stack. Doing this 100% correctly
 * would require access to style, which this converter doesn't support.
 */
PRBool
nsPlainTextSerializer::IsInPre()
{
  for (PRInt32 depth = mTagStackIndex; depth > 0; --depth) {
    if (mTagStack[depth - 1] == eHTMLTag_pre) {
      return PR_TRUE;
    }
    if (IsBlockLevel(mTagStack[depth - 1])) {
      // We assume that every other block overrides a <pre>
      return PR_FALSE;
    }
  }

  // Not a <pre> in the whole stack
  return PR_FALSE;
}
/**
 * This method is required only to identify LI's inside OL.
 *
 * Walks the tag stack from the innermost entry outwards and reports which
 * list tag is found first.
 *
 * @return PR_TRUE if an OL is reached first; PR_FALSE if a UL is reached
 *         first or the stack contains neither.
 */
PRBool
nsPlainTextSerializer::IsInOL()
{
  for (PRInt32 idx = mTagStackIndex - 1; idx >= 0; --idx) {
    if (mTagStack[idx] == eHTMLTag_ol) {
      return PR_TRUE;
    }
    if (mTagStack[idx] == eHTMLTag_ul) {
      // If a UL is reached first, LI belongs the UL nested in OL.
      return PR_FALSE;
    }
  }

  // We may reach here for orphan LI's.
  return PR_FALSE;
}
/*
  Maps a heading tag to its level.

  @return 0 = no header, 1 = h1, ..., 6 = h6
*/
PRInt32 HeaderLevel(eHTMLTags aTag)
{
  switch (aTag)
    {
    case eHTMLTag_h1: return 1;
    case eHTMLTag_h2: return 2;
    case eHTMLTag_h3: return 3;
    case eHTMLTag_h4: return 4;
    case eHTMLTag_h5: return 5;
    case eHTMLTag_h6: return 6;
    default:
      break;
    }

  // Any other tag is not a header.
  return 0;
}
/*
* This is an implementation of GetUnicharWidth() and
* GetUnicharStringWidth() as defined in
* "The Single UNIX Specification, Version 2, The Open Group, 1997"
* <http://www.UNIX-systems.org/online.html>
*
* Markus Kuhn -- 2000-02-08 -- public domain
*
* Minor alterations to fit Mozilla's data types by Daniel Bratell
*/
/* These functions define the column width of an ISO 10646 character
* as follows:
*
* - The null character (U+0000) has a column width of 0.
*
* - Other C0/C1 control characters and DEL will lead to a return
* value of -1.
*
* - Non-spacing and enclosing combining characters (general
* category code Mn or Me in the Unicode database) have a
* column width of 0.
*
* - Spacing characters in the East Asian Wide (W) or East Asian
* FullWidth (F) category as defined in Unicode Technical
* Report #11 have a column width of 2.
*
* - All remaining characters (including all printable
* ISO 8859-1 and WGL4 characters, Unicode control characters,
* etc.) have a column width of 1.
*
* This implementation assumes that wchar_t characters are encoded
* in ISO 10646.
*/
/*
 * Returns the column width of a single character:
 *    0  for U+0000 and for characters in the combining table below,
 *   -1  for the other C0/C1 control characters and DEL,
 *    2  for characters in the wide ranges listed at the end,
 *    1  for everything else.
 */
PRInt32 GetUnicharWidth(PRUnichar ucs)
{
  /* sorted list of non-overlapping intervals of non-spacing characters */
  static const struct interval {
    PRUint16 first;
    PRUint16 last;
  } combining[] = {
    { 0x0300, 0x034E }, { 0x0360, 0x0362 }, { 0x0483, 0x0486 },
    { 0x0488, 0x0489 }, { 0x0591, 0x05A1 }, { 0x05A3, 0x05B9 },
    { 0x05BB, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },
    { 0x05C4, 0x05C4 }, { 0x064B, 0x0655 }, { 0x0670, 0x0670 },
    { 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED },
    { 0x0711, 0x0711 }, { 0x0730, 0x074A }, { 0x07A6, 0x07B0 },
    { 0x0901, 0x0902 }, { 0x093C, 0x093C }, { 0x0941, 0x0948 },
    { 0x094D, 0x094D }, { 0x0951, 0x0954 }, { 0x0962, 0x0963 },
    { 0x0981, 0x0981 }, { 0x09BC, 0x09BC }, { 0x09C1, 0x09C4 },
    { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 }, { 0x0A02, 0x0A02 },
    { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 }, { 0x0A47, 0x0A48 },
    { 0x0A4B, 0x0A4D }, { 0x0A70, 0x0A71 }, { 0x0A81, 0x0A82 },
    { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
    { 0x0ACD, 0x0ACD }, { 0x0B01, 0x0B01 }, { 0x0B3C, 0x0B3C },
    { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 }, { 0x0B4D, 0x0B4D },
    { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 },
    { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 },
    { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 }, { 0x0CBF, 0x0CBF },
    { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD }, { 0x0D41, 0x0D43 },
    { 0x0D4D, 0x0D4D }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 },
    { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
    { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 },
    { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
    { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
    { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
    { 0x0F90, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
    { 0x102D, 0x1030 }, { 0x1032, 0x1032 }, { 0x1036, 0x1037 },
    { 0x1039, 0x1039 }, { 0x1058, 0x1059 }, { 0x17B7, 0x17BD },
    { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x18A9, 0x18A9 },
    { 0x20D0, 0x20E3 }, { 0x302A, 0x302F }, { 0x3099, 0x309A },
    { 0xFB1E, 0xFB1E }, { 0xFE20, 0xFE23 }
  };

  /* the null character takes no column */
  if (ucs == 0) {
    return 0;
  }

  /* 8-bit control characters (C0, DEL, C1) */
  if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) {
    return -1;
  }

  /* quick exit for Latin-1 etc.: everything below the first table entry */
  if (ucs < combining[0].first) {
    return 1;
  }

  /* binary search in the table of non-spacing characters */
  PRInt32 lo = 0;
  PRInt32 hi = sizeof(combining) / sizeof(struct interval) - 1;
  while (lo <= hi) {
    const PRInt32 probe = (lo + hi) / 2;
    if (ucs > combining[probe].last) {
      lo = probe + 1;
    }
    else if (ucs < combining[probe].first) {
      hi = probe - 1;
    }
    else {
      /* first <= ucs <= last: a non-spacing character */
      return 0;
    }
  }

  /* from here on, ucs is not a combining or C0/C1 control character */

  /* fast test for majority of non-wide scripts */
  if (ucs < 0x1100) {
    return 1;
  }

  const PRBool isWide =
    (ucs >= 0x1100 && ucs <= 0x115f) || /* Hangul Jamo */
    (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
     ucs != 0x303f) ||                  /* CJK ... Yi */
    (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
    (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */
    (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
    (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
    (ucs >= 0xffe0 && ucs <= 0xffe6);

  return isWide ? 2 : 1;
}
/*
 * Returns the total column width of a string.
 *
 * At most |n| characters of |pwcs| are examined; counting also stops at the
 * first null character. A character for which GetUnicharWidth() reports a
 * negative width (non-printable) is counted as one column, for bug# 94475.
 *
 * The remaining count is tested BEFORE the character is read, so a buffer
 * of exactly |n| characters without a terminating null is never read past
 * its end.
 */
PRInt32 GetUnicharStringWidth(const PRUnichar* pwcs, PRInt32 n)
{
  PRInt32 width = 0;

  for (; n-- > 0 && *pwcs; ++pwcs) {
    const PRInt32 w = GetUnicharWidth(*pwcs);
    width += (w < 0) ? 1 : w;
  }

  return width;
}