/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- * ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Novell code. * * The Initial Developer of the Original Code is Novell Corporation. * Portions created by the Initial Developer are Copyright (C) 2006 * the Initial Developer. All Rights Reserved. * * Contributor(s): * robert@ocallahan.org * * Alternatively, the contents of this file may be used under the terms of * either the GNU General Public License Version 2 or later (the "GPL"), or * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ #include "nsTextFrameUtils.h" #include "nsContentUtils.h" #include "nsIWordBreaker.h" #include "gfxFont.h" #include "nsTextTransformer.h" #include "nsCompressedCharMap.h" #include "nsUnicharUtils.h" // XXX TODO implement transform of backslash to yen that nsTextTransform does // when requested by PresContext->LanguageSpecificTransformType(). Do it with // a new factory type that just munges the input stream. But first, check // that we really still need this, it's only enabled via a hidden pref // which defaults false... // Replaced by precompiled CCMap (see bug 180266). To update the list // of characters, see one of files included below. As for the way // the original list of characters was obtained by Frank Tang, see bug 54467. // Updated to fix the regression (bug 263411). The list contains // characters of the following Unicode character classes : Ps, Pi, Po, Pf, Pe. // (ref.: http://www.w3.org/TR/2004/CR-CSS21-20040225/selector.html#first-letter) // Note that the file does NOT yet include non-BMP characters because // there's no point including them without fixing the way we identify // 'first-letter' currently working only with BMP characters. #include "punct_marks.ccmap" DEFINE_CCMAP(gPuncCharsCCMap, const); #define UNICODE_ZWSP 0x200B PRBool nsTextFrameUtils::IsPunctuationMark(PRUnichar aChar) { return CCMAP_HAS_CHAR(gPuncCharsCCMap, aChar); } static PRBool IsDiscardable(PRUnichar ch, PRUint32* aFlags) { // Unlike IS_DISCARDABLE, we don't discard \r. \r will be ignored by gfxTextRun // and discarding it would force us to copy text in many cases of preformatted // text containing \r\n. if (ch == CH_SHY) { *aFlags |= nsTextFrameUtils::TEXT_HAS_SHY; return PR_TRUE; } if ((ch & 0xFF00) != 0x2000) { // Not a Bidi control character return PR_FALSE; } return IS_BIDI_CONTROL(ch); } static PRBool IsDiscardable(PRUint8 ch, PRUint32* aFlags) { if (ch == CH_SHY) { *aFlags |= nsTextFrameUtils::TEXT_HAS_SHY; return PR_TRUE; } return PR_FALSE; } PRUnichar* nsTextFrameUtils::TransformText(const PRUnichar* aText, PRUint32 aLength, PRUnichar* aOutput, PRBool aCompressWhitespace, PRPackedBool* aIncomingWhitespace, gfxSkipCharsBuilder* aSkipChars, PRUint32* aAnalysisFlags) { PRUint32 flags = 0; PRUnichar* outputStart = aOutput; if (!aCompressWhitespace) { // Convert tabs and formfeeds to spaces and skip discardables. PRUint32 i; for (i = 0; i < aLength; ++i) { PRUnichar ch = *aText++; if (ch == '\t') { flags |= TEXT_HAS_TAB|TEXT_WAS_TRANSFORMED; aSkipChars->KeepChar(); *aOutput++ = ' '; } else if (IsDiscardable(ch, &flags)) { aSkipChars->SkipChar(); } else { aSkipChars->KeepChar(); if (ch == CH_NBSP) { ch = ' '; flags |= TEXT_WAS_TRANSFORMED; } *aOutput++ = ch; } } *aIncomingWhitespace = PR_FALSE; } else { PRBool inWhitespace = *aIncomingWhitespace; PRUint32 i; for (i = 0; i < aLength; ++i) { PRUnichar ch = *aText++; PRBool nowInWhitespace; if (ch == ' ' && (i + 1 >= aLength || !IsSpaceCombiningSequenceTail(aText, aLength - (i + 1)))) { nowInWhitespace = PR_TRUE; } else if (ch == '\n') { if (i > 0 && IS_CJ_CHAR(aText[-1]) && i + 1 < aLength && IS_CJ_CHAR(aText[1])) { // Discard newlines between CJK chars. // XXX this really requires more context to get right! aSkipChars->SkipChar(); continue; } nowInWhitespace = PR_TRUE; } else { nowInWhitespace = ch == '\t'; } if (!nowInWhitespace) { if (IsDiscardable(ch, &flags)) { aSkipChars->SkipChar(); nowInWhitespace = inWhitespace; } else { if (ch == CH_NBSP) { ch = ' '; flags |= TEXT_WAS_TRANSFORMED; } *aOutput++ = ch; aSkipChars->KeepChar(); } } else { if (inWhitespace) { aSkipChars->SkipChar(); } else { if (ch != ' ') { flags |= TEXT_WAS_TRANSFORMED; } *aOutput++ = ' '; aSkipChars->KeepChar(); } } inWhitespace = nowInWhitespace; } *aIncomingWhitespace = inWhitespace; } if (outputStart + aLength != aOutput) { flags |= TEXT_WAS_TRANSFORMED; } *aAnalysisFlags = flags; return aOutput; } PRUint8* nsTextFrameUtils::TransformText(const PRUint8* aText, PRUint32 aLength, PRUint8* aOutput, PRBool aCompressWhitespace, PRPackedBool* aIncomingWhitespace, gfxSkipCharsBuilder* aSkipChars, PRUint32* aAnalysisFlags) { PRUint32 flags = 0; PRUint8* outputStart = aOutput; if (!aCompressWhitespace) { // Convert tabs to spaces and skip discardables. PRUint32 i; for (i = 0; i < aLength; ++i) { PRUint8 ch = *aText++; if (ch == '\t') { flags |= TEXT_HAS_TAB|TEXT_WAS_TRANSFORMED; aSkipChars->KeepChar(); *aOutput++ = ' '; } else if (IsDiscardable(ch, &flags)) { aSkipChars->SkipChar(); } else { aSkipChars->KeepChar(); if (ch == CH_NBSP) { ch = ' '; flags |= TEXT_WAS_TRANSFORMED; } *aOutput++ = ch; } } *aIncomingWhitespace = PR_FALSE; } else { PRBool inWhitespace = *aIncomingWhitespace; PRUint32 i; for (i = 0; i < aLength; ++i) { PRUint8 ch = *aText++; PRBool nowInWhitespace = ch == ' ' || ch == '\t' || ch == '\n' || ch == '\f'; if (!nowInWhitespace) { if (IsDiscardable(ch, &flags)) { aSkipChars->SkipChar(); nowInWhitespace = inWhitespace; } else { if (ch == CH_NBSP) { ch = ' '; flags |= TEXT_WAS_TRANSFORMED; } *aOutput++ = ch; aSkipChars->KeepChar(); } } else { if (inWhitespace) { aSkipChars->SkipChar(); } else { if (ch != ' ') { flags |= TEXT_WAS_TRANSFORMED; } *aOutput++ = ' '; aSkipChars->KeepChar(); } } inWhitespace = nowInWhitespace; } *aIncomingWhitespace = inWhitespace; } if (outputStart + aLength != aOutput) { flags |= TEXT_WAS_TRANSFORMED; } *aAnalysisFlags = flags; return aOutput; } // TODO The wordbreaker needs to be fixed. It's buggy, for example, it doesn't // handle diacriticals combined with spaces enum SimpleCharClass { CLASS_ALNUM, CLASS_PUNCT, CLASS_SPACE }; // This is what nsSampleWordBreaker::GetClass considers whitespace static PRBool IsWordBreakerWhitespace(const PRUnichar* aChars, PRInt32 aLength) { NS_ASSERTION(aLength > 0, "Can't handle empty string"); PRUnichar ch = aChars[0]; if (ch == '\t' || ch == '\n' || ch == '\r') return PR_TRUE; if (ch == ' ' && !nsTextFrameUtils::IsSpaceCombiningSequenceTail(aChars + 1, aLength - 1)) return PR_TRUE; return PR_FALSE; } // like nsSampleWordBreaker::GetClass static SimpleCharClass Classify8BitChar(PRUint8 aChar) { if (aChar == ' ' || aChar == '\t' || aChar == '\n' || aChar == '\r') return CLASS_SPACE; if ((aChar >= 'a' && aChar <= 'z') || (aChar >= 'A' || aChar <= 'Z') || (aChar >= '0' && aChar <= '9') || (aChar >= 128)) return CLASS_ALNUM; return CLASS_PUNCT; } PRInt32 nsTextFrameUtils::FindWordBoundary(const nsTextFragment* aText, gfxTextRun* aTextRun, gfxSkipCharsIterator* aIterator, PRInt32 aOffset, PRInt32 aLength, PRInt32 aPosition, PRInt32 aDirection, PRBool aBreakBeforePunctuation, PRBool aBreakAfterPunctuation, PRBool* aWordIsWhitespace) { // A space followed by combining diacritical marks is not whitespace!! PRInt32 textLength = aText->GetLength(); NS_ASSERTION(aOffset + aLength <= textLength, "Substring out of bounds"); NS_ASSERTION(aPosition >= aOffset && aPosition < aOffset + aLength, "Position out of bounds"); *aWordIsWhitespace = aText->Is2b() ? IsWordBreakerWhitespace(aText->Get2b() + aPosition, textLength - aPosition) : Classify8BitChar(aText->Get1b()[aPosition]) == CLASS_SPACE; PRInt32 len = 0; // length of current "word", excluding first character at aPosition if (aText->Is2b()) { nsIWordBreaker* wordBreaker = nsContentUtils::WordBreaker(); const PRUnichar* text = aText->Get2b(); // XXX the wordbreaker currently isn't cluster-aware. We need to make // it cluster-aware. In the meantime, just reject any word breaks // inside clusters. for (;;) { if (aDirection > 0) { // Returns the index of the first character of the next word PRInt32 nextWordPos = wordBreaker->NextWord(text, textLength, aPosition + len); if (nextWordPos < 0) break; len = nextWordPos - aPosition - 1; } else { // Returns the index of the first character of the current word PRInt32 nextWordPos = wordBreaker->PrevWord(text, textLength, aPosition + len); if (nextWordPos < 0) break; len = aPosition - nextWordPos; } if (aTextRun->IsClusterStart(aIterator->ConvertOriginalToSkipped(aPosition + len*aDirection))) break; } } else { const char* text = aText->Get1b(); SimpleCharClass cl = Classify8BitChar(text[aPosition]); // There shouldn't be any clusters in 8bit text but we'll cover that // possibility anyway PRInt32 nextWordPos; do { ++len; nextWordPos = aPosition + aDirection*len; if (nextWordPos < aOffset || nextWordPos >= aOffset + aLength) break; } while (Classify8BitChar(text[nextWordPos]) == cl || !aTextRun->IsClusterStart(aIterator->ConvertOriginalToSkipped(nextWordPos))); } // Handle punctuation breaks PRInt32 i; PRBool punctPrev = IsPunctuationMark(aText->CharAt(aPosition)); for (i = 1; i < len; ++i) { PRInt32 pos = aPosition + i*aDirection; // See if there's a punctuation break between i-aDirection and i PRBool punct = IsPunctuationMark(aText->CharAt(pos)); if (punct != punctPrev && aTextRun->IsClusterStart(aIterator->ConvertOriginalToSkipped(pos))) { PRBool punctIsBefore = aDirection < 0 ? punct : punctPrev; if (punctIsBefore ? aBreakAfterPunctuation : aBreakBeforePunctuation) break; } punctPrev = punct; } PRInt32 pos = aPosition + i*aDirection; if (pos < aOffset || pos >= aOffset + aLength) return -1; return i; } PRBool nsSkipCharsRunIterator::NextRun() { do { if (mRunLength) { mIterator.AdvanceOriginal(mRunLength); NS_ASSERTION(mRunLength > 0, "No characters in run (initial length too large?)"); if (!mSkipped || mLengthIncludesSkipped) { mRemainingLength -= mRunLength; } } if (!mRemainingLength) return PR_FALSE; PRInt32 length; mSkipped = mIterator.IsOriginalCharSkipped(&length); mRunLength = PR_MIN(length, mRemainingLength); } while (!mVisitSkipped && mSkipped); return PR_TRUE; }