From ad3efbd71440f2fc015547ea59537cbe9a00c8a9 Mon Sep 17 00:00:00 2001 From: Zack Weinberg Date: Sat, 16 Feb 2013 18:27:53 -0500 Subject: [PATCH] Bug 543151, part A1: Preliminary cleanups to the scanner/parser interface and the organization of nsCSSScanner.cpp. r=heycam --- layout/style/nsCSSParser.cpp | 67 +--- layout/style/nsCSSScanner.cpp | 639 +++++++++++++++++----------------- layout/style/nsCSSScanner.h | 177 ++++++---- 3 files changed, 438 insertions(+), 445 deletions(-) diff --git a/layout/style/nsCSSParser.cpp b/layout/style/nsCSSParser.cpp index dbbb33c2cb2..b29e889691d 100644 --- a/layout/style/nsCSSParser.cpp +++ b/layout/style/nsCSSParser.cpp @@ -340,15 +340,6 @@ protected: bool GetToken(bool aSkipWS); void UngetToken(); - // get the part in paretheses of the url() function, which is really a - // part of a token in the CSS grammar, but we're using a combination - // of the parser and the scanner to do it to handle the backtracking - // required by the error handling of the tokenization (since if we - // fail to scan the full token, we should fall back to tokenizing as - // FUNCTION ... ')'). - // Note that this function WILL WRITE TO aURL IN SOME FAILURE CASES. - bool GetURLInParens(nsString& aURL); - bool ExpectSymbol(PRUnichar aSymbol, bool aSkipWS); bool ExpectEndProperty(); bool CheckEndProperty(); @@ -1437,44 +1428,13 @@ CSSParserImpl::EvaluateSupportsCondition(const nsAString& aDeclaration, bool CSSParserImpl::GetToken(bool aSkipWS) { - for (;;) { - if (!mHavePushBack) { - if (!mScanner->Next(mToken)) { - break; - } - } + if (mHavePushBack) { mHavePushBack = false; - if (aSkipWS && (eCSSToken_WhiteSpace == mToken.mType)) { - continue; + if (!aSkipWS || mToken.mType != eCSSToken_Whitespace) { + return true; } - return true; } - return false; -} - -bool -CSSParserImpl::GetURLInParens(nsString& aURL) -{ - NS_ASSERTION(!mHavePushBack, "mustn't have pushback at this point"); - if (! 
mScanner->NextURL(mToken)) { - // EOF - return false; - } - - aURL = mToken.mIdent; - - if (eCSSToken_URL != mToken.mType) { - // In the failure case (which gives a token of type - // eCSSToken_Bad_URL), we do not have to match parentheses *inside* - // the Bad_URL token, since this is now an invalid URL token. But - // we do need to match the closing parenthesis to match the 'url('. - NS_ABORT_IF_FALSE(mToken.mType == eCSSToken_Bad_URL, - "unexpected token type"); - SkipUntil(')'); - return false; - } - - return true; + return mScanner->Next(mToken, aSkipWS); } void @@ -2207,9 +2167,10 @@ CSSParserImpl::ParseMozDocumentRule(RuleAppendFunc aAppendFunc, void* aData) cur->func = css::DocumentRule::eDomain; } - nsAutoString url; - if (!GetURLInParens(url)) { + NS_ASSERTION(!mHavePushBack, "mustn't have pushback at this point"); + if (!mScanner->NextURL(mToken) || mToken.mType != eCSSToken_URL) { REPORT_UNEXPECTED_TOKEN(PEMozDocRuleNotURI); + SkipUntil(')'); delete urls; return false; } @@ -2217,7 +2178,7 @@ CSSParserImpl::ParseMozDocumentRule(RuleAppendFunc aAppendFunc, void* aData) // We could try to make the URL (as long as it's not domain()) // canonical and absolute with NS_NewURI and GetSpec, but I'm // inclined to think we shouldn't. 
- CopyUTF16toUTF8(url, cur->url); + CopyUTF16toUTF8(mToken.mIdent, cur->url); } } while (ExpectSymbol(',', true)); @@ -3039,7 +3000,7 @@ CSSParserImpl::ParseSelectorGroup(nsCSSSelectorList*& aList) } combinator = PRUnichar(0); - if (mToken.mType == eCSSToken_WhiteSpace) { + if (mToken.mType == eCSSToken_Whitespace) { if (!GetToken(true)) { break; // EOF ok here } @@ -4121,7 +4082,7 @@ CSSParserImpl::ParseColor(nsCSSValue& aValue) nscolor rgba; switch (tk->mType) { case eCSSToken_ID: - case eCSSToken_Ref: + case eCSSToken_Hash: // #xxyyzz if (NS_HexToRGB(tk->mIdent, &rgba)) { aValue.SetColorValue(rgba); @@ -5031,7 +4992,7 @@ CSSParserImpl::ParseVariant(nsCSSValue& aValue, if ((aVariantMask & VARIANT_COLOR) != 0) { if (mHashlessColorQuirk || // NONSTANDARD: Nav interprets 'xxyyzz' values even without '#' prefix (eCSSToken_ID == tk->mType) || - (eCSSToken_Ref == tk->mType) || + (eCSSToken_Hash == tk->mType) || (eCSSToken_Ident == tk->mType) || ((eCSSToken_Function == tk->mType) && (tk->mIdent.LowerCaseEqualsLiteral("rgb") || @@ -5748,7 +5709,7 @@ CSSParserImpl::IsLegacyGradientLine(const nsCSSTokenType& aType, } // fall through case eCSSToken_ID: - case eCSSToken_Ref: + case eCSSToken_Hash: // this is a color break; @@ -8021,7 +7982,7 @@ CSSParserImpl::RequireWhitespace() { if (!GetToken(false)) return false; - if (mToken.mType != eCSSToken_WhiteSpace) { + if (mToken.mType != eCSSToken_Whitespace) { UngetToken(); return false; } @@ -8427,7 +8388,7 @@ CSSParserImpl::ParseOneFamily(nsAString& aFamily, bool& aOneKeyword) if (eCSSToken_Ident == tk->mType) { aOneKeyword = false; aFamily.Append(tk->mIdent); - } else if (eCSSToken_WhiteSpace == tk->mType) { + } else if (eCSSToken_Whitespace == tk->mType) { // Lookahead one token and drop whitespace if we are ending the // font name. 
if (!GetToken(true)) diff --git a/layout/style/nsCSSScanner.cpp b/layout/style/nsCSSScanner.cpp index f9b19414dc4..e921f183f21 100644 --- a/layout/style/nsCSSScanner.cpp +++ b/layout/style/nsCSSScanner.cpp @@ -121,11 +121,6 @@ HexDigitValue(int32_t ch) } } -nsCSSToken::nsCSSToken() -{ - mType = eCSSToken_Symbol; -} - void nsCSSToken::AppendToString(nsString& aBuffer) const { @@ -140,7 +135,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const break; case eCSSToken_ID: - case eCSSToken_Ref: + case eCSSToken_Hash: aBuffer.Append('#'); nsStyleUtil::AppendEscapedCSSIdent(mIdent, aBuffer); break; @@ -200,7 +195,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const aBuffer.Append(mSymbol); break; - case eCSSToken_WhiteSpace: + case eCSSToken_Whitespace: aBuffer.Append(' '); break; @@ -232,7 +227,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const } nsCSSScanner::nsCSSScanner(const nsAString& aBuffer, uint32_t aLineNumber) - : mReadPointer(aBuffer.BeginReading()) + : mBuffer(aBuffer.BeginReading()) , mOffset(0) , mCount(aBuffer.Length()) , mPushback(mLocalPushback) @@ -259,6 +254,43 @@ nsCSSScanner::~nsCSSScanner() } } +void +nsCSSScanner::StartRecording() +{ + NS_ASSERTION(!mRecording, "already started recording"); + mRecording = true; + mRecordStartOffset = mOffset - mPushbackCount; +} + +void +nsCSSScanner::StopRecording() +{ + NS_ASSERTION(mRecording, "haven't started recording"); + mRecording = false; +} + +void +nsCSSScanner::StopRecording(nsString& aBuffer) +{ + NS_ASSERTION(mRecording, "haven't started recording"); + mRecording = false; + aBuffer.Append(mBuffer + mRecordStartOffset, + mOffset - mPushbackCount - mRecordStartOffset); +} + +nsDependentSubstring +nsCSSScanner::GetCurrentLine() const +{ + uint32_t end = mTokenOffset; + while (end < mCount && + mBuffer[end] != '\n' && mBuffer[end] != '\r' && + mBuffer[end] != '\f') { + end++; + } + return nsDependentSubstring(mBuffer + mTokenLineOffset, + mBuffer + end); +} + // Returns -1 on error or eof 
int32_t nsCSSScanner::Read() @@ -270,11 +302,11 @@ nsCSSScanner::Read() if (mOffset == mCount) { return -1; } - rv = int32_t(mReadPointer[mOffset++]); + rv = int32_t(mBuffer[mOffset++]); // There are four types of newlines in CSS: "\r", "\n", "\r\n", and "\f". // To simplify dealing with newlines, they are all normalized to "\n" here if (rv == '\r') { - if (mOffset < mCount && mReadPointer[mOffset] == '\n') { + if (mOffset < mCount && mBuffer[mOffset] == '\n') { mOffset++; } rv = '\n'; @@ -323,43 +355,6 @@ nsCSSScanner::Pushback(PRUnichar aChar) mPushback[mPushbackCount++] = aChar; } -void -nsCSSScanner::StartRecording() -{ - NS_ASSERTION(!mRecording, "already started recording"); - mRecording = true; - mRecordStartOffset = mOffset - mPushbackCount; -} - -void -nsCSSScanner::StopRecording() -{ - NS_ASSERTION(mRecording, "haven't started recording"); - mRecording = false; -} - -void -nsCSSScanner::StopRecording(nsString& aBuffer) -{ - NS_ASSERTION(mRecording, "haven't started recording"); - mRecording = false; - aBuffer.Append(mReadPointer + mRecordStartOffset, - mOffset - mPushbackCount - mRecordStartOffset); -} - -nsDependentSubstring -nsCSSScanner::GetCurrentLine() const -{ - uint32_t end = mTokenOffset; - while (end < mCount && - mReadPointer[end] != '\n' && mReadPointer[end] != '\r' && - mReadPointer[end] != '\f') { - end++; - } - return nsDependentSubstring(mReadPointer + mTokenLineOffset, - mReadPointer + end); -} - bool nsCSSScanner::LookAhead(PRUnichar aChar) { @@ -389,7 +384,7 @@ nsCSSScanner::LookAheadOrEOF(PRUnichar aChar) } void -nsCSSScanner::EatWhiteSpace() +nsCSSScanner::SkipWhitespace() { for (;;) { int32_t ch = Read(); @@ -403,233 +398,28 @@ nsCSSScanner::EatWhiteSpace() } } -bool -nsCSSScanner::Next(nsCSSToken& aToken) +void +nsCSSScanner::SkipComment() { - for (;;) { // Infinite loop so we can restart after comments. 
- mTokenOffset = mOffset; - mTokenLineOffset = mLineOffset; - mTokenLineNumber = mLineNumber; - - int32_t ch = Read(); - if (ch < 0) { - return false; - } - - // UNICODE-RANGE - if ((ch == 'u' || ch == 'U') && Peek() == '+') - return ParseURange(ch, aToken); - - // IDENT - if (StartsIdent(ch, Peek())) - return ParseIdent(ch, aToken); - - // AT_KEYWORD - if (ch == '@') { - return ParseAtKeyword(aToken); - } - - // NUMBER or DIM - if ((ch == '.') || (ch == '+') || (ch == '-')) { - int32_t nextChar = Peek(); - if (IsDigit(nextChar)) { - return ParseNumber(ch, aToken); - } - else if (('.' == nextChar) && ('.' != ch)) { - nextChar = Read(); - int32_t followingChar = Peek(); - Pushback(nextChar); - if (IsDigit(followingChar)) - return ParseNumber(ch, aToken); - } - } - if (IsDigit(ch)) { - return ParseNumber(ch, aToken); - } - - // ID - if (ch == '#') { - return ParseRef(ch, aToken); - } - - // STRING - if ((ch == '"') || (ch == '\'')) { - return ParseString(ch, aToken); - } - - // WS - if (IsWhitespace(ch)) { - aToken.mType = eCSSToken_WhiteSpace; - aToken.mIdent.Assign(PRUnichar(ch)); - EatWhiteSpace(); - return true; - } - if (ch == '/' && !IsSVGMode()) { - int32_t nextChar = Peek(); - if (nextChar == '*') { - Read(); - // FIXME: Editor wants comments to be preserved (bug 60290). 
- if (!SkipCComment()) { - return false; - } - continue; // start again at the beginning - } - } - if (ch == '<') { // consume HTML comment tags - if (LookAhead('!')) { - if (LookAhead('-')) { - if (LookAhead('-')) { - aToken.mType = eCSSToken_HTMLComment; - aToken.mIdent.AssignLiteral(""); - return true; - } - Pushback('-'); - } - } - - // INCLUDES ("~=") and DASHMATCH ("|=") - if (( ch == '|' ) || ( ch == '~' ) || ( ch == '^' ) || - ( ch == '$' ) || ( ch == '*' )) { - int32_t nextChar = Read(); - if ( nextChar == '=' ) { - if (ch == '~') { - aToken.mType = eCSSToken_Includes; - } - else if (ch == '|') { - aToken.mType = eCSSToken_Dashmatch; - } - else if (ch == '^') { - aToken.mType = eCSSToken_Beginsmatch; - } - else if (ch == '$') { - aToken.mType = eCSSToken_Endsmatch; - } - else if (ch == '*') { - aToken.mType = eCSSToken_Containsmatch; - } - return true; - } else if (nextChar >= 0) { - Pushback(nextChar); - } - } - aToken.mType = eCSSToken_Symbol; - aToken.mSymbol = ch; - return true; - } -} - -bool -nsCSSScanner::NextURL(nsCSSToken& aToken) -{ - EatWhiteSpace(); - - int32_t ch = Read(); - if (ch < 0) { - return false; - } - - // STRING - if ((ch == '"') || (ch == '\'')) { -#ifdef DEBUG - bool ok = -#endif - ParseString(ch, aToken); - NS_ABORT_IF_FALSE(ok, "ParseString should never fail, " - "since there's always something read"); - - NS_ABORT_IF_FALSE(aToken.mType == eCSSToken_String || - aToken.mType == eCSSToken_Bad_String, - "unexpected token type"); - if (MOZ_LIKELY(aToken.mType == eCSSToken_String)) { - EatWhiteSpace(); - if (LookAheadOrEOF(')')) { - aToken.mType = eCSSToken_URL; - } else { - aToken.mType = eCSSToken_Bad_URL; - } - } else { - aToken.mType = eCSSToken_Bad_URL; - } - return true; - } - - // Process a url lexical token. A CSS1 url token can contain - // characters beyond identifier characters (e.g. '/', ':', etc.) - // Because of this the normal rules for tokenizing the input don't - // apply very well. 
To simplify the parser and relax some of the - // requirements on the scanner we parse url's here. If we find a - // malformed URL then we emit a token of type "Bad_URL" so that - // the CSS1 parser can ignore the invalid input. The parser must - // treat a Bad_URL token like a Function token, and process - // tokens until a matching parenthesis. - - aToken.mType = eCSSToken_Bad_URL; - aToken.mSymbol = PRUnichar(0); - nsString& ident = aToken.mIdent; - ident.SetLength(0); - - // start of a non-quoted url (which may be empty) - bool ok = true; for (;;) { - if (IsURLChar(ch)) { - // A regular url character. - ident.Append(PRUnichar(ch)); - } else if (ch == ')') { - // All done - break; - } else if (IsWhitespace(ch)) { - // Whitespace is allowed at the end of the URL - EatWhiteSpace(); - // Consume the close paren if we have it; if not we're an invalid URL. - ok = LookAheadOrEOF(')'); - break; - } else if (ch == '\\') { - if (!ParseAndAppendEscape(ident, false)) { - ok = false; - Pushback(ch); - break; + int32_t ch = Read(); + if (ch < 0) break; + if (ch == '*') { + if (LookAhead('/')) { + return; } - } else { - // This is an invalid URL spec - ok = false; - Pushback(ch); // push it back so the parser can match tokens and - // then closing parenthesis - break; - } - - ch = Read(); - if (ch < 0) { - break; } } - // If the result of the above scanning is ok then change the token - // type to a useful one. - if (ok) { - aToken.mType = eCSSToken_URL; - } - return true; + mReporter->ReportUnexpectedEOF("PECommentEOF"); } - /** * Returns whether an escape was succesfully parsed; if it was not, * the backslash needs to be its own symbol token. 
*/ bool -nsCSSScanner::ParseAndAppendEscape(nsString& aOutput, bool aInString) +nsCSSScanner::GatherEscape(nsString& aOutput, bool aInString) { int32_t ch = Read(); if (ch < 0) { @@ -717,7 +507,7 @@ bool nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent) { if (aChar == '\\') { - if (!ParseAndAppendEscape(aIdent, false)) { + if (!GatherEscape(aIdent, false)) { return false; } } else { @@ -730,12 +520,12 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent) // See how much we can consume and append in one go uint32_t n = mOffset; // Count number of Ident characters that can be processed - while (n < mCount && IsIdent(mReadPointer[n])) { + while (n < mCount && IsIdent(mBuffer[n])) { ++n; } // Add to the token what we have so far if (n > mOffset) { - aIdent.Append(&mReadPointer[mOffset], n - mOffset); + aIdent.Append(&mBuffer[mOffset], n - mOffset); mOffset = n; } } @@ -743,7 +533,7 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent) aChar = Read(); if (aChar < 0) break; if (aChar == '\\') { - if (!ParseAndAppendEscape(aIdent, false)) { + if (!GatherEscape(aIdent, false)) { Pushback(aChar); break; } @@ -759,35 +549,7 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent) } bool -nsCSSScanner::ParseRef(int32_t aChar, nsCSSToken& aToken) -{ - // Fall back for when we don't have name characters following: - aToken.mType = eCSSToken_Symbol; - aToken.mSymbol = aChar; - - int32_t ch = Read(); - if (ch < 0) { - return true; - } - if (IsIdent(ch) || ch == '\\') { - // First char after the '#' is a valid ident char (or an escape), - // so it makes sense to keep going - nsCSSTokenType type = - StartsIdent(ch, Peek()) ? eCSSToken_ID : eCSSToken_Ref; - aToken.mIdent.SetLength(0); - if (GatherIdent(ch, aToken.mIdent)) { - aToken.mType = type; - return true; - } - } - - // No ident chars after the '#'. Just unread |ch| and get out of here. 
- Pushback(ch); - return true; -} - -bool -nsCSSScanner::ParseIdent(int32_t aChar, nsCSSToken& aToken) +nsCSSScanner::ScanIdent(int32_t aChar, nsCSSToken& aToken) { nsString& ident = aToken.mIdent; ident.SetLength(0); @@ -814,7 +576,7 @@ nsCSSScanner::ParseIdent(int32_t aChar, nsCSSToken& aToken) } bool -nsCSSScanner::ParseAtKeyword(nsCSSToken& aToken) +nsCSSScanner::ScanAtKeyword(nsCSSToken& aToken) { int32_t ch = Read(); if (StartsIdent(ch, Peek())) { @@ -833,7 +595,35 @@ nsCSSScanner::ParseAtKeyword(nsCSSToken& aToken) } bool -nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken) +nsCSSScanner::ScanHash(int32_t aChar, nsCSSToken& aToken) +{ + // Fall back for when we don't have name characters following: + aToken.mType = eCSSToken_Symbol; + aToken.mSymbol = aChar; + + int32_t ch = Read(); + if (ch < 0) { + return true; + } + if (IsIdent(ch) || ch == '\\') { + // First char after the '#' is a valid ident char (or an escape), + // so it makes sense to keep going + nsCSSTokenType type = + StartsIdent(ch, Peek()) ? eCSSToken_ID : eCSSToken_Hash; + aToken.mIdent.SetLength(0); + if (GatherIdent(ch, aToken.mIdent)) { + aToken.mType = type; + return true; + } + } + + // No ident chars after the '#'. Just unread |ch| and get out of here. + Pushback(ch); + return true; +} + +bool +nsCSSScanner::ScanNumber(int32_t c, nsCSSToken& aToken) { NS_PRECONDITION(c == '.' || c == '+' || c == '-' || IsDigit(c), "Why did we get called?"); @@ -867,7 +657,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken) bool gotDot = (c == '.'); if (!gotDot) { - // Parse the integer part of the mantisssa + // Scan the integer part of the mantisssa NS_ASSERTION(IsDigit(c), "Why did we get called?"); do { intPart = 10*intPart + DecimalDigitValue(c); @@ -879,7 +669,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken) } if (gotDot) { - // Parse the fractional part of the mantissa. + // Scan the fractional part of the mantissa. 
c = Read(); NS_ASSERTION(IsDigit(c), "How did we get here?"); // Power of ten by which we need to divide our next digit @@ -967,24 +757,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken) } bool -nsCSSScanner::SkipCComment() -{ - for (;;) { - int32_t ch = Read(); - if (ch < 0) break; - if (ch == '*') { - if (LookAhead('/')) { - return true; - } - } - } - - mReporter->ReportUnexpectedEOF("PECommentEOF"); - return false; -} - -bool -nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken) +nsCSSScanner::ScanString(int32_t aStop, nsCSSToken& aToken) { aToken.mIdent.SetLength(0); aToken.mType = eCSSToken_String; @@ -996,7 +769,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken) uint32_t n = mOffset; // Count number of characters that can be processed for (;n < mCount; ++n) { - PRUnichar nextChar = mReadPointer[n]; + PRUnichar nextChar = mBuffer[n]; if ((nextChar == aStop) || (nextChar == '\\') || (nextChar == '\n') || (nextChar == '\r') || (nextChar == '\f')) { break; @@ -1004,7 +777,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken) } // Add to the token what we have so far if (n > mOffset) { - aToken.mIdent.Append(&mReadPointer[mOffset], n - mOffset); + aToken.mIdent.Append(&mBuffer[mOffset], n - mOffset); mOffset = n; } } @@ -1018,10 +791,10 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken) break; } if (ch == '\\') { - if (!ParseAndAppendEscape(aToken.mIdent, true)) { + if (!GatherEscape(aToken.mIdent, true)) { aToken.mType = eCSSToken_Bad_String; Pushback(ch); - // For strings, the only case where ParseAndAppendEscape will + // For strings, the only case where GatherEscape will // return false is when there's a backslash to start an escape // immediately followed by end-of-stream. 
In that case, the // correct tokenization is badstring *followed* by a DELIM for @@ -1052,7 +825,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken) // are also decoded into mInteger and mInteger2, and mIntegerValid is set. bool -nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult) +nsCSSScanner::ScanURange(int32_t aChar, nsCSSToken& aResult) { int32_t intro2 = Read(); int32_t ch = Peek(); @@ -1069,7 +842,7 @@ nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult) if (!IsHexDigit(ch) && ch != '?') { Pushback(intro2); Pushback(aChar); - return ParseIdent(aChar, aResult); + return ScanIdent(aChar, aResult); } aResult.mIdent.Truncate(); @@ -1129,3 +902,223 @@ nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult) aResult.mType = eCSSToken_URange; return true; } + +bool +nsCSSScanner::NextURL(nsCSSToken& aToken) +{ + SkipWhitespace(); + + int32_t ch = Read(); + if (ch < 0) { + return false; + } + + // STRING + if ((ch == '"') || (ch == '\'')) { +#ifdef DEBUG + bool ok = +#endif + ScanString(ch, aToken); + NS_ABORT_IF_FALSE(ok, "ScanString should never fail, " + "since there's always something read"); + + NS_ABORT_IF_FALSE(aToken.mType == eCSSToken_String || + aToken.mType == eCSSToken_Bad_String, + "unexpected token type"); + if (MOZ_LIKELY(aToken.mType == eCSSToken_String)) { + SkipWhitespace(); + if (LookAheadOrEOF(')')) { + aToken.mType = eCSSToken_URL; + } else { + aToken.mType = eCSSToken_Bad_URL; + } + } else { + aToken.mType = eCSSToken_Bad_URL; + } + return true; + } + + // Process a url lexical token. A CSS1 url token can contain + // characters beyond identifier characters (e.g. '/', ':', etc.) + // Because of this the normal rules for tokenizing the input don't + // apply very well. To simplify the parser and relax some of the + // requirements on the scanner we parse url's here. If we find a + // malformed URL then we emit a token of type "Bad_URL" so that + // the CSS1 parser can ignore the invalid input. 
The parser must + // treat a Bad_URL token like a Function token, and process + // tokens until a matching parenthesis. + + aToken.mType = eCSSToken_Bad_URL; + aToken.mSymbol = PRUnichar(0); + nsString& ident = aToken.mIdent; + ident.SetLength(0); + + // start of a non-quoted url (which may be empty) + bool ok = true; + for (;;) { + if (IsURLChar(ch)) { + // A regular url character. + ident.Append(PRUnichar(ch)); + } else if (ch == ')') { + // All done + break; + } else if (IsWhitespace(ch)) { + // Whitespace is allowed at the end of the URL + SkipWhitespace(); + // Consume the close paren if we have it; if not we're an invalid URL. + ok = LookAheadOrEOF(')'); + break; + } else if (ch == '\\') { + if (!GatherEscape(ident, false)) { + ok = false; + Pushback(ch); + break; + } + } else { + // This is an invalid URL spec + ok = false; + Pushback(ch); // push it back so the parser can match tokens and + // then closing parenthesis + break; + } + + ch = Read(); + if (ch < 0) { + break; + } + } + + // If the result of the above scanning is ok then change the token + // type to a useful one. + if (ok) { + aToken.mType = eCSSToken_URL; + } + return true; +} + +bool +nsCSSScanner::Next(nsCSSToken& aToken, bool aSkipWS) +{ + for (;;) { // Infinite loop so we can restart after comments. + mTokenOffset = mOffset; + mTokenLineOffset = mLineOffset; + mTokenLineNumber = mLineNumber; + + int32_t ch = Read(); + if (ch < 0) { + return false; + } + + // UNICODE-RANGE + if ((ch == 'u' || ch == 'U') && Peek() == '+') + return ScanURange(ch, aToken); + + // IDENT + if (StartsIdent(ch, Peek())) + return ScanIdent(ch, aToken); + + // AT_KEYWORD + if (ch == '@') { + return ScanAtKeyword(aToken); + } + + // NUMBER or DIM + if ((ch == '.') || (ch == '+') || (ch == '-')) { + int32_t nextChar = Peek(); + if (IsDigit(nextChar)) { + return ScanNumber(ch, aToken); + } + else if (('.' == nextChar) && ('.' 
!= ch)) { + nextChar = Read(); + int32_t followingChar = Peek(); + Pushback(nextChar); + if (IsDigit(followingChar)) + return ScanNumber(ch, aToken); + } + } + if (IsDigit(ch)) { + return ScanNumber(ch, aToken); + } + + // ID + if (ch == '#') { + return ScanHash(ch, aToken); + } + + // STRING + if ((ch == '"') || (ch == '\'')) { + return ScanString(ch, aToken); + } + + // WS + if (IsWhitespace(ch)) { + SkipWhitespace(); + if (!aSkipWS) { + aToken.mType = eCSSToken_Whitespace; + return true; + } + continue; // start again at the beginning + } + if (ch == '/' && !IsSVGMode()) { + int32_t nextChar = Peek(); + if (nextChar == '*') { + Read(); + // FIXME: Editor wants comments to be preserved (bug 60290). + SkipComment(); + continue; // start again at the beginning + } + } + if (ch == '<') { // consume HTML comment tags + if (LookAhead('!')) { + if (LookAhead('-')) { + if (LookAhead('-')) { + aToken.mType = eCSSToken_HTMLComment; + aToken.mIdent.AssignLiteral(""); + return true; + } + Pushback('-'); + } + } + + // INCLUDES ("~=") and DASHMATCH ("|=") + if (( ch == '|' ) || ( ch == '~' ) || ( ch == '^' ) || + ( ch == '$' ) || ( ch == '*' )) { + int32_t nextChar = Read(); + if ( nextChar == '=' ) { + if (ch == '~') { + aToken.mType = eCSSToken_Includes; + } + else if (ch == '|') { + aToken.mType = eCSSToken_Dashmatch; + } + else if (ch == '^') { + aToken.mType = eCSSToken_Beginsmatch; + } + else if (ch == '$') { + aToken.mType = eCSSToken_Endsmatch; + } + else if (ch == '*') { + aToken.mType = eCSSToken_Containsmatch; + } + return true; + } else if (nextChar >= 0) { + Pushback(nextChar); + } + } + aToken.mType = eCSSToken_Symbol; + aToken.mSymbol = ch; + return true; + } +} diff --git a/layout/style/nsCSSScanner.h b/layout/style/nsCSSScanner.h index 30159bcea82..54e4be7c8f6 100644 --- a/layout/style/nsCSSScanner.h +++ b/layout/style/nsCSSScanner.h @@ -16,58 +16,88 @@ class ErrorReporter; } } -// Token types +// Token types; in close but not perfect correspondence to the 
token +// categorization in section 4.1.1 of CSS2.1. (The deviations are all +// the fault of css3-selectors, which has requirements that can only be +// met by changing the generic tokenization.) The comment on each line +// illustrates the form of each identifier. + enum nsCSSTokenType { - // A css identifier (e.g. foo) - eCSSToken_Ident, // mIdent + // White space of any kind. No value fields are used. Note that + // comments do *not* count as white space; comments separate tokens + // but are not themselves tokens. + eCSSToken_Whitespace, // - // A css at keyword (e.g. @foo) - eCSSToken_AtKeyword, // mIdent + // Identifier-like tokens. mIdent is the text of the identifier. + // The difference between ID and Hash is: if the text after the # + // would have been a valid Ident if the # hadn't been there, the + // scanner produces an ID token. Otherwise it produces a Hash token. + // (This distinction is required by css3-selectors.) + eCSSToken_Ident, // word + eCSSToken_Function, // word( + eCSSToken_AtKeyword, // @word + eCSSToken_ID, // #word + eCSSToken_Hash, // #0word - // A css number without a percentage or dimension; with percentage; - // without percentage but with a dimension - eCSSToken_Number, // mNumber - eCSSToken_Percentage, // mNumber - eCSSToken_Dimension, // mNumber + mIdent + // Numeric tokens. mNumber is the floating-point value of the + // number, and mHasSign indicates whether there was an explicit sign + // (+ or -) in front of the number. If mIntegerValid is true, the + // number had the lexical form of an integer, and mInteger is its + // integer value. Lexically integer values outside the range of a + // 32-bit signed number are clamped to the maximum values; mNumber + // will indicate a 'truer' value in that case. Percentage tokens + // are always considered not to be integers, even if their numeric + // value is integral (100% => mNumber = 1.0). For Dimension + // tokens, mIdent holds the text of the unit. 
+ eCSSToken_Number, // 1 -5 +2e3 3.14159 7.297352e-3 + eCSSToken_Dimension, // 24px 8.5in + eCSSToken_Percentage, // 85% 1280.4% - // A css string (e.g. "foo" or 'foo') - eCSSToken_String, // mSymbol + mIdent + mSymbol + // String-like tokens. In all cases, mIdent holds the text + // belonging to the string, and mSymbol holds the delimiter + // character, which may be ', ", or zero (only for unquoted URLs). + // Bad_String and Bad_URL tokens are emitted when the closing + // delimiter or parenthesis was missing. + eCSSToken_String, // 'foo bar' "foo bar" + eCSSToken_Bad_String, // 'foo bar + eCSSToken_URL, // url(foobar) url("foo bar") + eCSSToken_Bad_URL, // url(foo - // Whitespace (e.g. " " or "/* abc */") - eCSSToken_WhiteSpace, // mIdent + // Any one-character symbol. mSymbol holds the character. + eCSSToken_Symbol, // . ; { } ! * - // A css symbol (e.g. ':', ';', '+', etc.) - eCSSToken_Symbol, // mSymbol + // Match operators. These are single tokens rather than pairs of + // Symbol tokens because css3-selectors forbids the presence of + // comments between the two characters. No value fields are used; + // the token type indicates which operator. + eCSSToken_Includes, // ~= + eCSSToken_Dashmatch, // |= + eCSSToken_Beginsmatch, // ^= + eCSSToken_Endsmatch, // $= + eCSSToken_Containsmatch, // *= - // A css1 id (e.g. #foo3) - eCSSToken_ID, // mIdent - // Just like eCSSToken_ID, except the part following the '#' is not - // a valid CSS identifier (eg. starts with a digit, is the empty - // string, etc). - eCSSToken_Ref, // mIdent + // Unicode-range token: currently used only in @font-face. + // The lexical rule for this token includes several forms that are + // semantically invalid. Therefore, mIdent always holds the + // complete original text of the token (so we can print it + // accurately in diagnostics), and mIntegerValid is true iff the + // token is semantically valid. 
In that case, mInteger holds the + // lowest value included in the range, and mInteger2 holds the + // highest value included in the range. + eCSSToken_URange, // U+007e U+01?? U+2000-206F - eCSSToken_Function, // mIdent - - eCSSToken_URL, // mIdent + mSymbol - eCSSToken_Bad_URL, // mIdent + mSymbol - - eCSSToken_HTMLComment, // "<!--" or "-->" - - eCSSToken_Includes, // "~=" - eCSSToken_Dashmatch, // "|=" - eCSSToken_Beginsmatch, // "^=" - eCSSToken_Endsmatch, // "$=" - eCSSToken_Containsmatch, // "*=" - - eCSSToken_URange, // Low in mInteger, high in mInteger2; - // mIntegerValid is true if the token is a - // valid range; mIdent preserves the textual - // form of the token for error reporting - - // An unterminated string, which is always an error. - eCSSToken_Bad_String // mSymbol + mIdent + // HTML comment delimiters, ignored as a unit when they appear at + // the top level of a style sheet, for compatibility with websites + // written for compatibility with pre-CSS browsers. This token type + // subsumes the css2.1 CDO and CDC tokens, which are always treated + // the same by the parser. mIdent holds the text of the token, for + // diagnostics. + eCSSToken_HTMLComment, // <!-- --> }; +// A single token returned from the scanner. mType is always +// meaningful; comments above describe which other fields are +// meaningful for which token types.
struct nsCSSToken { nsAutoString mIdent NS_OKONHEAP; float mNumber; @@ -75,22 +105,24 @@ struct nsCSSToken { int32_t mInteger2; nsCSSTokenType mType; PRUnichar mSymbol; - bool mIntegerValid; // for number, dimension, urange - bool mHasSign; // for number, percentage, and dimension + bool mIntegerValid; + bool mHasSign; - nsCSSToken(); + nsCSSToken() + : mNumber(0), mInteger(0), mInteger2(0), mType(eCSSToken_Whitespace), + mSymbol('\0'), mIntegerValid(false), mHasSign(false) + {} - bool IsSymbol(PRUnichar aSymbol) { - return bool((eCSSToken_Symbol == mType) && (mSymbol == aSymbol)); + bool IsSymbol(PRUnichar aSymbol) const { + return mType == eCSSToken_Symbol && mSymbol == aSymbol; } void AppendToString(nsString& aBuffer) const; }; -// CSS Scanner API. Used to tokenize an input stream using the CSS -// forward compatible tokenization rules. This implementation is -// private to this package and is only used internally by the css -// parser. +// nsCSSScanner tokenizes an input stream using the CSS2.1 forward +// compatible tokenization rules. Used internally by nsCSSParser; +// not available for use by other code. class nsCSSScanner { public: // |aLineNumber == 1| is the beginning of a file, use |aLineNumber == 0| @@ -122,17 +154,23 @@ class nsCSSScanner { // the most recently processed token. nsDependentSubstring GetCurrentLine() const; - // Get the next token. Return false on EOF. aTokenResult - // is filled in with the data for the token. - bool Next(nsCSSToken& aTokenResult); + // Get the next token. Return false on EOF. aTokenResult is filled + // in with the data for the token. If aSkipWS is true, skip over + // eCSSToken_Whitespace tokens rather than returning them. + bool Next(nsCSSToken& aTokenResult, bool aSkipWS); - // Get the next token that may be a string or unquoted URL + // Get the body of an URL token (everything after the 'url('). 
+ // This is exposed for use by nsCSSParser::ParseMozDocumentRule, + // which, for historical reasons, must make additional function + // tokens behave like url(). Please do not add new uses to the + // parser. bool NextURL(nsCSSToken& aTokenResult); - // It's really ugly that we have to expose this, but it's the easiest - // way to do :nth-child() parsing sanely. (In particular, in - // :nth-child(2n-1), "2n-1" is a dimension, and we need to push the - // "-1" back so we can read it again as a number.) + // This is exposed for use by nsCSSParser::ParsePseudoClassWithNthPairArg, + // because "2n-1" is a single DIMENSION token, and "n-1" is a single + // IDENT token, but the :nth() selector syntax wants to interpret + // them the same as "2n -1" and "n -1" respectively. Please do not + // add new uses to the parser. void Pushback(PRUnichar aChar); // Starts recording the input stream from the current position. @@ -150,20 +188,21 @@ protected: int32_t Peek(); bool LookAhead(PRUnichar aChar); bool LookAheadOrEOF(PRUnichar aChar); // expect either aChar or EOF - void EatWhiteSpace(); - bool ParseAndAppendEscape(nsString& aOutput, bool aInString); - bool ParseIdent(int32_t aChar, nsCSSToken& aResult); - bool ParseAtKeyword(nsCSSToken& aResult); - bool ParseNumber(int32_t aChar, nsCSSToken& aResult); - bool ParseRef(int32_t aChar, nsCSSToken& aResult); - bool ParseString(int32_t aChar, nsCSSToken& aResult); - bool ParseURange(int32_t aChar, nsCSSToken& aResult); - bool SkipCComment(); + void SkipWhitespace(); + void SkipComment(); + bool GatherEscape(nsString& aOutput, bool aInString); bool GatherIdent(int32_t aChar, nsString& aIdent); - const PRUnichar *mReadPointer; + bool ScanIdent(int32_t aChar, nsCSSToken& aResult); + bool ScanAtKeyword(nsCSSToken& aResult); + bool ScanHash(int32_t aChar, nsCSSToken& aResult); + bool ScanNumber(int32_t aChar, nsCSSToken& aResult); + bool ScanString(int32_t aChar, nsCSSToken& aResult); + bool ScanURange(int32_t aChar, nsCSSToken& 
aResult); + + const PRUnichar *mBuffer; uint32_t mOffset; uint32_t mCount;