Bug 543151, part A1: Preliminary cleanups to the scanner/parser interface and the organization of nsCSSScanner.cpp. r=heycam

2024-09-13 09:24:08 -07:00 · 2013-02-16 18:27:53 -05:00 · 2013-02-16 18:27:53 -05:00 · ad3efbd714
commit ad3efbd714
parent 1162fa289b
3 changed files with 438 additions and 445 deletions
--- a/layout/style/nsCSSParser.cpp
+++ b/layout/style/nsCSSParser.cpp
@ -340,15 +340,6 @@ protected:
  bool GetToken(bool aSkipWS);
  void UngetToken();

-  // get the part in paretheses of the url() function, which is really a
-  // part of a token in the CSS grammar, but we're using a combination
-  // of the parser and the scanner to do it to handle the backtracking
-  // required by the error handling of the tokenization (since if we
-  // fail to scan the full token, we should fall back to tokenizing as
-  // FUNCTION ... ')').
-  // Note that this function WILL WRITE TO aURL IN SOME FAILURE CASES.
-  bool GetURLInParens(nsString& aURL);
-
  bool ExpectSymbol(PRUnichar aSymbol, bool aSkipWS);
  bool ExpectEndProperty();
  bool CheckEndProperty();
@ -1437,44 +1428,13 @@ CSSParserImpl::EvaluateSupportsCondition(const nsAString& aDeclaration,
 bool
 CSSParserImpl::GetToken(bool aSkipWS)
 {
-  for (;;) {
-    if (!mHavePushBack) {
-      if (!mScanner->Next(mToken)) {
-        break;
-      }
-    }
+  if (mHavePushBack) {
    mHavePushBack = false;
-    if (aSkipWS && (eCSSToken_WhiteSpace == mToken.mType)) {
-      continue;
+    if (!aSkipWS || mToken.mType != eCSSToken_Whitespace) {
+      return true;
    }
-    return true;
  }
-  return false;
-}
-
-bool
-CSSParserImpl::GetURLInParens(nsString& aURL)
-{
-  NS_ASSERTION(!mHavePushBack, "mustn't have pushback at this point");
-  if (! mScanner->NextURL(mToken)) {
-    // EOF
-    return false;
-  }
-
-  aURL = mToken.mIdent;
-
-  if (eCSSToken_URL != mToken.mType) {
-    // In the failure case (which gives a token of type
-    // eCSSToken_Bad_URL), we do not have to match parentheses *inside*
-    // the Bad_URL token, since this is now an invalid URL token.  But
-    // we do need to match the closing parenthesis to match the 'url('.
-    NS_ABORT_IF_FALSE(mToken.mType == eCSSToken_Bad_URL,
-                      "unexpected token type");
-    SkipUntil(')');
-    return false;
-  }
-
-  return true;
+  return mScanner->Next(mToken, aSkipWS);
 }

 void
@ -2207,9 +2167,10 @@ CSSParserImpl::ParseMozDocumentRule(RuleAppendFunc aAppendFunc, void* aData)
        cur->func = css::DocumentRule::eDomain;
      }

-      nsAutoString url;
-      if (!GetURLInParens(url)) {
+      NS_ASSERTION(!mHavePushBack, "mustn't have pushback at this point");
+      if (!mScanner->NextURL(mToken) || mToken.mType != eCSSToken_URL) {
        REPORT_UNEXPECTED_TOKEN(PEMozDocRuleNotURI);
+        SkipUntil(')');
        delete urls;
        return false;
      }
@ -2217,7 +2178,7 @@ CSSParserImpl::ParseMozDocumentRule(RuleAppendFunc aAppendFunc, void* aData)
      // We could try to make the URL (as long as it's not domain())
      // canonical and absolute with NS_NewURI and GetSpec, but I'm
      // inclined to think we shouldn't.
-      CopyUTF16toUTF8(url, cur->url);
+      CopyUTF16toUTF8(mToken.mIdent, cur->url);
    }
  } while (ExpectSymbol(',', true));

@ -3039,7 +3000,7 @@ CSSParserImpl::ParseSelectorGroup(nsCSSSelectorList*& aList)
    }

    combinator = PRUnichar(0);
-    if (mToken.mType == eCSSToken_WhiteSpace) {
+    if (mToken.mType == eCSSToken_Whitespace) {
      if (!GetToken(true)) {
        break; // EOF ok here
      }
@ -4121,7 +4082,7 @@ CSSParserImpl::ParseColor(nsCSSValue& aValue)
  nscolor rgba;
  switch (tk->mType) {
    case eCSSToken_ID:
-    case eCSSToken_Ref:
+    case eCSSToken_Hash:
      // #xxyyzz
      if (NS_HexToRGB(tk->mIdent, &rgba)) {
        aValue.SetColorValue(rgba);
@ -5031,7 +4992,7 @@ CSSParserImpl::ParseVariant(nsCSSValue& aValue,
  if ((aVariantMask & VARIANT_COLOR) != 0) {
    if (mHashlessColorQuirk || // NONSTANDARD: Nav interprets 'xxyyzz' values even without '#' prefix
        (eCSSToken_ID == tk->mType) ||
-        (eCSSToken_Ref == tk->mType) ||
+        (eCSSToken_Hash == tk->mType) ||
        (eCSSToken_Ident == tk->mType) ||
        ((eCSSToken_Function == tk->mType) &&
         (tk->mIdent.LowerCaseEqualsLiteral("rgb") ||
@ -5748,7 +5709,7 @@ CSSParserImpl::IsLegacyGradientLine(const nsCSSTokenType& aType,
    }
    // fall through
  case eCSSToken_ID:
-  case eCSSToken_Ref:
+  case eCSSToken_Hash:
    // this is a color
    break;

@ -8021,7 +7982,7 @@ CSSParserImpl::RequireWhitespace()
 {
  if (!GetToken(false))
    return false;
-  if (mToken.mType != eCSSToken_WhiteSpace) {
+  if (mToken.mType != eCSSToken_Whitespace) {
    UngetToken();
    return false;
  }
@ -8427,7 +8388,7 @@ CSSParserImpl::ParseOneFamily(nsAString& aFamily, bool& aOneKeyword)
      if (eCSSToken_Ident == tk->mType) {
        aOneKeyword = false;
        aFamily.Append(tk->mIdent);
-      } else if (eCSSToken_WhiteSpace == tk->mType) {
+      } else if (eCSSToken_Whitespace == tk->mType) {
        // Lookahead one token and drop whitespace if we are ending the
        // font name.
        if (!GetToken(true))
--- a/layout/style/nsCSSScanner.cpp
+++ b/layout/style/nsCSSScanner.cpp
@ -121,11 +121,6 @@ HexDigitValue(int32_t ch)
  }
 }

-nsCSSToken::nsCSSToken()
-{
-  mType = eCSSToken_Symbol;
-}
-
 void
 nsCSSToken::AppendToString(nsString& aBuffer) const
 {
@ -140,7 +135,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const
      break;

    case eCSSToken_ID:
-    case eCSSToken_Ref:
+    case eCSSToken_Hash:
      aBuffer.Append('#');
      nsStyleUtil::AppendEscapedCSSIdent(mIdent, aBuffer);
      break;
@ -200,7 +195,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const
      aBuffer.Append(mSymbol);
      break;

-    case eCSSToken_WhiteSpace:
+    case eCSSToken_Whitespace:
      aBuffer.Append(' ');
      break;

@ -232,7 +227,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const
 }

 nsCSSScanner::nsCSSScanner(const nsAString& aBuffer, uint32_t aLineNumber)
-  : mReadPointer(aBuffer.BeginReading())
+  : mBuffer(aBuffer.BeginReading())
  , mOffset(0)
  , mCount(aBuffer.Length())
  , mPushback(mLocalPushback)
@ -259,6 +254,43 @@ nsCSSScanner::~nsCSSScanner()
  }
 }

+void
+nsCSSScanner::StartRecording()
+{
+  NS_ASSERTION(!mRecording, "already started recording");
+  mRecording = true;
+  mRecordStartOffset = mOffset - mPushbackCount;
+}
+
+void
+nsCSSScanner::StopRecording()
+{
+  NS_ASSERTION(mRecording, "haven't started recording");
+  mRecording = false;
+}
+
+void
+nsCSSScanner::StopRecording(nsString& aBuffer)
+{
+  NS_ASSERTION(mRecording, "haven't started recording");
+  mRecording = false;
+  aBuffer.Append(mBuffer + mRecordStartOffset,
+                 mOffset - mPushbackCount - mRecordStartOffset);
+}
+
+nsDependentSubstring
+nsCSSScanner::GetCurrentLine() const
+{
+  uint32_t end = mTokenOffset;
+  while (end < mCount &&
+         mBuffer[end] != '\n' && mBuffer[end] != '\r' &&
+         mBuffer[end] != '\f') {
+    end++;
+  }
+  return nsDependentSubstring(mBuffer + mTokenLineOffset,
+                              mBuffer + end);
+}
+
 // Returns -1 on error or eof
 int32_t
 nsCSSScanner::Read()
@ -270,11 +302,11 @@ nsCSSScanner::Read()
    if (mOffset == mCount) {
      return -1;
    }
-    rv = int32_t(mReadPointer[mOffset++]);
+    rv = int32_t(mBuffer[mOffset++]);
    // There are four types of newlines in CSS: "\r", "\n", "\r\n", and "\f".
    // To simplify dealing with newlines, they are all normalized to "\n" here
    if (rv == '\r') {
-      if (mOffset < mCount && mReadPointer[mOffset] == '\n') {
+      if (mOffset < mCount && mBuffer[mOffset] == '\n') {
        mOffset++;
      }
      rv = '\n';
@ -323,43 +355,6 @@ nsCSSScanner::Pushback(PRUnichar aChar)
  mPushback[mPushbackCount++] = aChar;
 }

-void
-nsCSSScanner::StartRecording()
-{
-  NS_ASSERTION(!mRecording, "already started recording");
-  mRecording = true;
-  mRecordStartOffset = mOffset - mPushbackCount;
-}
-
-void
-nsCSSScanner::StopRecording()
-{
-  NS_ASSERTION(mRecording, "haven't started recording");
-  mRecording = false;
-}
-
-void
-nsCSSScanner::StopRecording(nsString& aBuffer)
-{
-  NS_ASSERTION(mRecording, "haven't started recording");
-  mRecording = false;
-  aBuffer.Append(mReadPointer + mRecordStartOffset,
-                 mOffset - mPushbackCount - mRecordStartOffset);
-}
-
-nsDependentSubstring
-nsCSSScanner::GetCurrentLine() const
-{
-  uint32_t end = mTokenOffset;
-  while (end < mCount &&
-         mReadPointer[end] != '\n' && mReadPointer[end] != '\r' &&
-         mReadPointer[end] != '\f') {
-    end++;
-  }
-  return nsDependentSubstring(mReadPointer + mTokenLineOffset,
-                              mReadPointer + end);
-}
-
 bool
 nsCSSScanner::LookAhead(PRUnichar aChar)
 {
@ -389,7 +384,7 @@ nsCSSScanner::LookAheadOrEOF(PRUnichar aChar)
 }

 void
-nsCSSScanner::EatWhiteSpace()
+nsCSSScanner::SkipWhitespace()
 {
  for (;;) {
    int32_t ch = Read();
@ -403,233 +398,28 @@ nsCSSScanner::EatWhiteSpace()
  }
 }

-bool
-nsCSSScanner::Next(nsCSSToken& aToken)
+void
+nsCSSScanner::SkipComment()
 {
-  for (;;) { // Infinite loop so we can restart after comments.
-    mTokenOffset = mOffset;
-    mTokenLineOffset = mLineOffset;
-    mTokenLineNumber = mLineNumber;
-
-    int32_t ch = Read();
-    if (ch < 0) {
-      return false;
-    }
-
-    // UNICODE-RANGE
-    if ((ch == 'u' || ch == 'U') && Peek() == '+')
-      return ParseURange(ch, aToken);
-
-    // IDENT
-    if (StartsIdent(ch, Peek()))
-      return ParseIdent(ch, aToken);
-
-    // AT_KEYWORD
-    if (ch == '@') {
-      return ParseAtKeyword(aToken);
-    }
-
-    // NUMBER or DIM
-    if ((ch == '.') || (ch == '+') || (ch == '-')) {
-      int32_t nextChar = Peek();
-      if (IsDigit(nextChar)) {
-        return ParseNumber(ch, aToken);
-      }
-      else if (('.' == nextChar) && ('.' != ch)) {
-        nextChar = Read();
-        int32_t followingChar = Peek();
-        Pushback(nextChar);
-        if (IsDigit(followingChar))
-          return ParseNumber(ch, aToken);
-      }
-    }
-    if (IsDigit(ch)) {
-      return ParseNumber(ch, aToken);
-    }
-
-    // ID
-    if (ch == '#') {
-      return ParseRef(ch, aToken);
-    }
-
-    // STRING
-    if ((ch == '"') || (ch == '\'')) {
-      return ParseString(ch, aToken);
-    }
-
-    // WS
-    if (IsWhitespace(ch)) {
-      aToken.mType = eCSSToken_WhiteSpace;
-      aToken.mIdent.Assign(PRUnichar(ch));
-      EatWhiteSpace();
-      return true;
-    }
-    if (ch == '/' && !IsSVGMode()) {
-      int32_t nextChar = Peek();
-      if (nextChar == '*') {
-        Read();
-        // FIXME: Editor wants comments to be preserved (bug 60290).
-        if (!SkipCComment()) {
-          return false;
-        }
-        continue; // start again at the beginning
-      }
-    }
-    if (ch == '<') {  // consume HTML comment tags
-      if (LookAhead('!')) {
-        if (LookAhead('-')) {
-          if (LookAhead('-')) {
-            aToken.mType = eCSSToken_HTMLComment;
-            aToken.mIdent.AssignLiteral("<!--");
-            return true;
-          }
-          Pushback('-');
-        }
-        Pushback('!');
-      }
-    }
-    if (ch == '-') {  // check for HTML comment end
-      if (LookAhead('-')) {
-        if (LookAhead('>')) {
-          aToken.mType = eCSSToken_HTMLComment;
-          aToken.mIdent.AssignLiteral("-->");
-          return true;
-        }
-        Pushback('-');
-      }
-    }
-
-    // INCLUDES ("~=") and DASHMATCH ("|=")
-    if (( ch == '|' ) || ( ch == '~' ) || ( ch == '^' ) ||
-        ( ch == '$' ) || ( ch == '*' )) {
-      int32_t nextChar = Read();
-      if ( nextChar == '=' ) {
-        if (ch == '~') {
-          aToken.mType = eCSSToken_Includes;
-        }
-        else if (ch == '|') {
-          aToken.mType = eCSSToken_Dashmatch;
-        }
-        else if (ch == '^') {
-          aToken.mType = eCSSToken_Beginsmatch;
-        }
-        else if (ch == '$') {
-          aToken.mType = eCSSToken_Endsmatch;
-        }
-        else if (ch == '*') {
-          aToken.mType = eCSSToken_Containsmatch;
-        }
-        return true;
-      } else if (nextChar >= 0) {
-        Pushback(nextChar);
-      }
-    }
-    aToken.mType = eCSSToken_Symbol;
-    aToken.mSymbol = ch;
-    return true;
-  }
-}
-
-bool
-nsCSSScanner::NextURL(nsCSSToken& aToken)
-{
-  EatWhiteSpace();
-
-  int32_t ch = Read();
-  if (ch < 0) {
-    return false;
-  }
-
-  // STRING
-  if ((ch == '"') || (ch == '\'')) {
-#ifdef DEBUG
-    bool ok =
-#endif
-      ParseString(ch, aToken);
-    NS_ABORT_IF_FALSE(ok, "ParseString should never fail, "
-                          "since there's always something read");
-
-    NS_ABORT_IF_FALSE(aToken.mType == eCSSToken_String ||
-                      aToken.mType == eCSSToken_Bad_String,
-                      "unexpected token type");
-    if (MOZ_LIKELY(aToken.mType == eCSSToken_String)) {
-      EatWhiteSpace();
-      if (LookAheadOrEOF(')')) {
-        aToken.mType = eCSSToken_URL;
-      } else {
-        aToken.mType = eCSSToken_Bad_URL;
-      }
-    } else {
-      aToken.mType = eCSSToken_Bad_URL;
-    }
-    return true;
-  }
-
-  // Process a url lexical token. A CSS1 url token can contain
-  // characters beyond identifier characters (e.g. '/', ':', etc.)
-  // Because of this the normal rules for tokenizing the input don't
-  // apply very well. To simplify the parser and relax some of the
-  // requirements on the scanner we parse url's here. If we find a
-  // malformed URL then we emit a token of type "Bad_URL" so that
-  // the CSS1 parser can ignore the invalid input.  The parser must
-  // treat a Bad_URL token like a Function token, and process
-  // tokens until a matching parenthesis.
-
-  aToken.mType = eCSSToken_Bad_URL;
-  aToken.mSymbol = PRUnichar(0);
-  nsString& ident = aToken.mIdent;
-  ident.SetLength(0);
-
-  // start of a non-quoted url (which may be empty)
-  bool ok = true;
  for (;;) {
-    if (IsURLChar(ch)) {
-      // A regular url character.
-      ident.Append(PRUnichar(ch));
-    } else if (ch == ')') {
-      // All done
-      break;
-    } else if (IsWhitespace(ch)) {
-      // Whitespace is allowed at the end of the URL
-      EatWhiteSpace();
-      // Consume the close paren if we have it; if not we're an invalid URL.
-      ok = LookAheadOrEOF(')');
-      break;
-    } else if (ch == '\\') {
-      if (!ParseAndAppendEscape(ident, false)) {
-        ok = false;
-        Pushback(ch);
-        break;
+    int32_t ch = Read();
+    if (ch < 0) break;
+    if (ch == '*') {
+      if (LookAhead('/')) {
+        return;
      }
-    } else {
-      // This is an invalid URL spec
-      ok = false;
-      Pushback(ch); // push it back so the parser can match tokens and
-                    // then closing parenthesis
-      break;
-    }
-
-    ch = Read();
-    if (ch < 0) {
-      break;
    }
  }

-  // If the result of the above scanning is ok then change the token
-  // type to a useful one.
-  if (ok) {
-    aToken.mType = eCSSToken_URL;
-  }
-  return true;
+  mReporter->ReportUnexpectedEOF("PECommentEOF");
 }

-
 /**
 * Returns whether an escape was succesfully parsed; if it was not,
 * the backslash needs to be its own symbol token.
 */
 bool
-nsCSSScanner::ParseAndAppendEscape(nsString& aOutput, bool aInString)
+nsCSSScanner::GatherEscape(nsString& aOutput, bool aInString)
 {
  int32_t ch = Read();
  if (ch < 0) {
@ -717,7 +507,7 @@ bool
 nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent)
 {
  if (aChar == '\\') {
-    if (!ParseAndAppendEscape(aIdent, false)) {
+    if (!GatherEscape(aIdent, false)) {
      return false;
    }
  } else {
@ -730,12 +520,12 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent)
      // See how much we can consume and append in one go
      uint32_t n = mOffset;
      // Count number of Ident characters that can be processed
-      while (n < mCount && IsIdent(mReadPointer[n])) {
+      while (n < mCount && IsIdent(mBuffer[n])) {
        ++n;
      }
      // Add to the token what we have so far
      if (n > mOffset) {
-        aIdent.Append(&mReadPointer[mOffset], n - mOffset);
+        aIdent.Append(&mBuffer[mOffset], n - mOffset);
        mOffset = n;
      }
    }
@ -743,7 +533,7 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent)
    aChar = Read();
    if (aChar < 0) break;
    if (aChar == '\\') {
-      if (!ParseAndAppendEscape(aIdent, false)) {
+      if (!GatherEscape(aIdent, false)) {
        Pushback(aChar);
        break;
      }
@ -759,35 +549,7 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent)
 }

 bool
-nsCSSScanner::ParseRef(int32_t aChar, nsCSSToken& aToken)
-{
-  // Fall back for when we don't have name characters following:
-  aToken.mType = eCSSToken_Symbol;
-  aToken.mSymbol = aChar;
-
-  int32_t ch = Read();
-  if (ch < 0) {
-    return true;
-  }
-  if (IsIdent(ch) || ch == '\\') {
-    // First char after the '#' is a valid ident char (or an escape),
-    // so it makes sense to keep going
-    nsCSSTokenType type =
-      StartsIdent(ch, Peek()) ? eCSSToken_ID : eCSSToken_Ref;
-    aToken.mIdent.SetLength(0);
-    if (GatherIdent(ch, aToken.mIdent)) {
-      aToken.mType = type;
-      return true;
-    }
-  }
-
-  // No ident chars after the '#'.  Just unread |ch| and get out of here.
-  Pushback(ch);
-  return true;
-}
-
-bool
-nsCSSScanner::ParseIdent(int32_t aChar, nsCSSToken& aToken)
+nsCSSScanner::ScanIdent(int32_t aChar, nsCSSToken& aToken)
 {
  nsString& ident = aToken.mIdent;
  ident.SetLength(0);
@ -814,7 +576,7 @@ nsCSSScanner::ParseIdent(int32_t aChar, nsCSSToken& aToken)
 }

 bool
-nsCSSScanner::ParseAtKeyword(nsCSSToken& aToken)
+nsCSSScanner::ScanAtKeyword(nsCSSToken& aToken)
 {
  int32_t ch = Read();
  if (StartsIdent(ch, Peek())) {
@ -833,7 +595,35 @@ nsCSSScanner::ParseAtKeyword(nsCSSToken& aToken)
 }

 bool
-nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken)
+nsCSSScanner::ScanHash(int32_t aChar, nsCSSToken& aToken)
+{
+  // Fall back for when we don't have name characters following:
+  aToken.mType = eCSSToken_Symbol;
+  aToken.mSymbol = aChar;
+
+  int32_t ch = Read();
+  if (ch < 0) {
+    return true;
+  }
+  if (IsIdent(ch) || ch == '\\') {
+    // First char after the '#' is a valid ident char (or an escape),
+    // so it makes sense to keep going
+    nsCSSTokenType type =
+      StartsIdent(ch, Peek()) ? eCSSToken_ID : eCSSToken_Hash;
+    aToken.mIdent.SetLength(0);
+    if (GatherIdent(ch, aToken.mIdent)) {
+      aToken.mType = type;
+      return true;
+    }
+  }
+
+  // No ident chars after the '#'.  Just unread |ch| and get out of here.
+  Pushback(ch);
+  return true;
+}
+
+bool
+nsCSSScanner::ScanNumber(int32_t c, nsCSSToken& aToken)
 {
  NS_PRECONDITION(c == '.' || c == '+' || c == '-' || IsDigit(c),
                  "Why did we get called?");
@ -867,7 +657,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken)
  bool gotDot = (c == '.');

  if (!gotDot) {
-    // Parse the integer part of the mantisssa
+    // Scan the integer part of the mantissa
    NS_ASSERTION(IsDigit(c), "Why did we get called?");
    do {
      intPart = 10*intPart + DecimalDigitValue(c);
@ -879,7 +669,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken)
  }

  if (gotDot) {
-    // Parse the fractional part of the mantissa.
+    // Scan the fractional part of the mantissa.
    c = Read();
    NS_ASSERTION(IsDigit(c), "How did we get here?");
    // Power of ten by which we need to divide our next digit
@ -967,24 +757,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken)
 }

 bool
-nsCSSScanner::SkipCComment()
-{
-  for (;;) {
-    int32_t ch = Read();
-    if (ch < 0) break;
-    if (ch == '*') {
-      if (LookAhead('/')) {
-        return true;
-      }
-    }
-  }
-
-  mReporter->ReportUnexpectedEOF("PECommentEOF");
-  return false;
-}
-
-bool
-nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
+nsCSSScanner::ScanString(int32_t aStop, nsCSSToken& aToken)
 {
  aToken.mIdent.SetLength(0);
  aToken.mType = eCSSToken_String;
@ -996,7 +769,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
      uint32_t n = mOffset;
      // Count number of characters that can be processed
      for (;n < mCount; ++n) {
-        PRUnichar nextChar = mReadPointer[n];
+        PRUnichar nextChar = mBuffer[n];
        if ((nextChar == aStop) || (nextChar == '\\') ||
            (nextChar == '\n') || (nextChar == '\r') || (nextChar == '\f')) {
          break;
@ -1004,7 +777,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
      }
      // Add to the token what we have so far
      if (n > mOffset) {
-        aToken.mIdent.Append(&mReadPointer[mOffset], n - mOffset);
+        aToken.mIdent.Append(&mBuffer[mOffset], n - mOffset);
        mOffset = n;
      }
    }
@ -1018,10 +791,10 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
      break;
    }
    if (ch == '\\') {
-      if (!ParseAndAppendEscape(aToken.mIdent, true)) {
+      if (!GatherEscape(aToken.mIdent, true)) {
        aToken.mType = eCSSToken_Bad_String;
        Pushback(ch);
-        // For strings, the only case where ParseAndAppendEscape will
+        // For strings, the only case where GatherEscape will
        // return false is when there's a backslash to start an escape
        // immediately followed by end-of-stream.  In that case, the
        // correct tokenization is badstring *followed* by a DELIM for
@ -1052,7 +825,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
 // are also decoded into mInteger and mInteger2, and mIntegerValid is set.

 bool
-nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult)
+nsCSSScanner::ScanURange(int32_t aChar, nsCSSToken& aResult)
 {
  int32_t intro2 = Read();
  int32_t ch = Peek();
@ -1069,7 +842,7 @@ nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult)
  if (!IsHexDigit(ch) && ch != '?') {
    Pushback(intro2);
    Pushback(aChar);
-    return ParseIdent(aChar, aResult);
+    return ScanIdent(aChar, aResult);
  }

  aResult.mIdent.Truncate();
@ -1129,3 +902,223 @@ nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult)
  aResult.mType = eCSSToken_URange;
  return true;
 }
+
+bool
+nsCSSScanner::NextURL(nsCSSToken& aToken)
+{
+  SkipWhitespace();
+
+  int32_t ch = Read();
+  if (ch < 0) {
+    return false;
+  }
+
+  // STRING
+  if ((ch == '"') || (ch == '\'')) {
+#ifdef DEBUG
+    bool ok =
+#endif
+      ScanString(ch, aToken);
+    NS_ABORT_IF_FALSE(ok, "ScanString should never fail, "
+                          "since there's always something read");
+
+    NS_ABORT_IF_FALSE(aToken.mType == eCSSToken_String ||
+                      aToken.mType == eCSSToken_Bad_String,
+                      "unexpected token type");
+    if (MOZ_LIKELY(aToken.mType == eCSSToken_String)) {
+      SkipWhitespace();
+      if (LookAheadOrEOF(')')) {
+        aToken.mType = eCSSToken_URL;
+      } else {
+        aToken.mType = eCSSToken_Bad_URL;
+      }
+    } else {
+      aToken.mType = eCSSToken_Bad_URL;
+    }
+    return true;
+  }
+
+  // Process a url lexical token. A CSS1 url token can contain
+  // characters beyond identifier characters (e.g. '/', ':', etc.)
+  // Because of this the normal rules for tokenizing the input don't
+  // apply very well. To simplify the parser and relax some of the
+  // requirements on the scanner we parse url's here. If we find a
+  // malformed URL then we emit a token of type "Bad_URL" so that
+  // the CSS1 parser can ignore the invalid input.  The parser must
+  // treat a Bad_URL token like a Function token, and process
+  // tokens until a matching parenthesis.
+
+  aToken.mType = eCSSToken_Bad_URL;
+  aToken.mSymbol = PRUnichar(0);
+  nsString& ident = aToken.mIdent;
+  ident.SetLength(0);
+
+  // start of a non-quoted url (which may be empty)
+  bool ok = true;
+  for (;;) {
+    if (IsURLChar(ch)) {
+      // A regular url character.
+      ident.Append(PRUnichar(ch));
+    } else if (ch == ')') {
+      // All done
+      break;
+    } else if (IsWhitespace(ch)) {
+      // Whitespace is allowed at the end of the URL
+      SkipWhitespace();
+      // Consume the close paren if we have it; if not we're an invalid URL.
+      ok = LookAheadOrEOF(')');
+      break;
+    } else if (ch == '\\') {
+      if (!GatherEscape(ident, false)) {
+        ok = false;
+        Pushback(ch);
+        break;
+      }
+    } else {
+      // This is an invalid URL spec
+      ok = false;
+      Pushback(ch); // push it back so the parser can match tokens and
+                    // then closing parenthesis
+      break;
+    }
+
+    ch = Read();
+    if (ch < 0) {
+      break;
+    }
+  }
+
+  // If the result of the above scanning is ok then change the token
+  // type to a useful one.
+  if (ok) {
+    aToken.mType = eCSSToken_URL;
+  }
+  return true;
+}
+
+bool
+nsCSSScanner::Next(nsCSSToken& aToken, bool aSkipWS)
+{
+  for (;;) { // Infinite loop so we can restart after comments.
+    mTokenOffset = mOffset;
+    mTokenLineOffset = mLineOffset;
+    mTokenLineNumber = mLineNumber;
+
+    int32_t ch = Read();
+    if (ch < 0) {
+      return false;
+    }
+
+    // UNICODE-RANGE
+    if ((ch == 'u' || ch == 'U') && Peek() == '+')
+      return ScanURange(ch, aToken);
+
+    // IDENT
+    if (StartsIdent(ch, Peek()))
+      return ScanIdent(ch, aToken);
+
+    // AT_KEYWORD
+    if (ch == '@') {
+      return ScanAtKeyword(aToken);
+    }
+
+    // NUMBER or DIM
+    if ((ch == '.') || (ch == '+') || (ch == '-')) {
+      int32_t nextChar = Peek();
+      if (IsDigit(nextChar)) {
+        return ScanNumber(ch, aToken);
+      }
+      else if (('.' == nextChar) && ('.' != ch)) {
+        nextChar = Read();
+        int32_t followingChar = Peek();
+        Pushback(nextChar);
+        if (IsDigit(followingChar))
+          return ScanNumber(ch, aToken);
+      }
+    }
+    if (IsDigit(ch)) {
+      return ScanNumber(ch, aToken);
+    }
+
+    // ID
+    if (ch == '#') {
+      return ScanHash(ch, aToken);
+    }
+
+    // STRING
+    if ((ch == '"') || (ch == '\'')) {
+      return ScanString(ch, aToken);
+    }
+
+    // WS
+    if (IsWhitespace(ch)) {
+      SkipWhitespace();
+      if (!aSkipWS) {
+        aToken.mType = eCSSToken_Whitespace;
+        return true;
+      }
+      continue; // start again at the beginning
+    }
+    if (ch == '/' && !IsSVGMode()) {
+      int32_t nextChar = Peek();
+      if (nextChar == '*') {
+        Read();
+        // FIXME: Editor wants comments to be preserved (bug 60290).
+        SkipComment();
+        continue; // start again at the beginning
+      }
+    }
+    if (ch == '<') {  // consume HTML comment tags
+      if (LookAhead('!')) {
+        if (LookAhead('-')) {
+          if (LookAhead('-')) {
+            aToken.mType = eCSSToken_HTMLComment;
+            aToken.mIdent.AssignLiteral("<!--");
+            return true;
+          }
+          Pushback('-');
+        }
+        Pushback('!');
+      }
+    }
+    if (ch == '-') {  // check for HTML comment end
+      if (LookAhead('-')) {
+        if (LookAhead('>')) {
+          aToken.mType = eCSSToken_HTMLComment;
+          aToken.mIdent.AssignLiteral("-->");
+          return true;
+        }
+        Pushback('-');
+      }
+    }
+
+    // Match operators: "~=", "|=", "^=", "$=", and "*="
+    if (( ch == '|' ) || ( ch == '~' ) || ( ch == '^' ) ||
+        ( ch == '$' ) || ( ch == '*' )) {
+      int32_t nextChar = Read();
+      if ( nextChar == '=' ) {
+        if (ch == '~') {
+          aToken.mType = eCSSToken_Includes;
+        }
+        else if (ch == '|') {
+          aToken.mType = eCSSToken_Dashmatch;
+        }
+        else if (ch == '^') {
+          aToken.mType = eCSSToken_Beginsmatch;
+        }
+        else if (ch == '$') {
+          aToken.mType = eCSSToken_Endsmatch;
+        }
+        else if (ch == '*') {
+          aToken.mType = eCSSToken_Containsmatch;
+        }
+        return true;
+      } else if (nextChar >= 0) {
+        Pushback(nextChar);
+      }
+    }
+    aToken.mType = eCSSToken_Symbol;
+    aToken.mSymbol = ch;
+    return true;
+  }
+}
--- a/layout/style/nsCSSScanner.h
+++ b/layout/style/nsCSSScanner.h
@ -16,58 +16,88 @@ class ErrorReporter;
 }
 }

-// Token types
+// Token types; in close but not perfect correspondence to the token
+// categorization in section 4.1.1 of CSS2.1.  (The deviations are all
+// the fault of css3-selectors, which has requirements that can only be
+// met by changing the generic tokenization.)  The comment on each line
+// illustrates the form of each identifier.
+
 enum nsCSSTokenType {
-  // A css identifier (e.g. foo)
-  eCSSToken_Ident,          // mIdent
+  // White space of any kind.  No value fields are used.  Note that
+  // comments do *not* count as white space; comments separate tokens
+  // but are not themselves tokens.
+  eCSSToken_Whitespace,     //

-  // A css at keyword (e.g. @foo)
-  eCSSToken_AtKeyword,      // mIdent
+  // Identifier-like tokens.  mIdent is the text of the identifier.
+  // The difference between ID and Hash is: if the text after the #
+  // would have been a valid Ident if the # hadn't been there, the
+  // scanner produces an ID token.  Otherwise it produces a Hash token.
+  // (This distinction is required by css3-selectors.)
+  eCSSToken_Ident,          // word
+  eCSSToken_Function,       // word(
+  eCSSToken_AtKeyword,      // @word
+  eCSSToken_ID,             // #word
+  eCSSToken_Hash,           // #0word

-  // A css number without a percentage or dimension; with percentage;
-  // without percentage but with a dimension
-  eCSSToken_Number,         // mNumber
-  eCSSToken_Percentage,     // mNumber
-  eCSSToken_Dimension,      // mNumber + mIdent
+  // Numeric tokens.  mNumber is the floating-point value of the
+  // number, and mHasSign indicates whether there was an explicit sign
+  // (+ or -) in front of the number.  If mIntegerValid is true, the
+  // number had the lexical form of an integer, and mInteger is its
+  // integer value.  Lexically integer values outside the range of a
+  // 32-bit signed number are clamped to the maximum values; mNumber
+  // will indicate a 'truer' value in that case.  Percentage tokens
+  // are always considered not to be integers, even if their numeric
+  // value is integral (100% => mNumber = 1.0).  For Dimension
+  // tokens, mIdent holds the text of the unit.
+  eCSSToken_Number,         // 1 -5 +2e3 3.14159 7.297352e-3
+  eCSSToken_Dimension,      // 24px 8.5in
+  eCSSToken_Percentage,     // 85% 1280.4%

-  // A css string (e.g. "foo" or 'foo')
-  eCSSToken_String,         // mSymbol + mIdent + mSymbol
+  // String-like tokens.  In all cases, mIdent holds the text
+  // belonging to the string, and mSymbol holds the delimiter
+  // character, which may be ', ", or zero (only for unquoted URLs).
+  // Bad_String and Bad_URL tokens are emitted when the closing
+  // delimiter or parenthesis was missing.
+  eCSSToken_String,         // 'foo bar' "foo bar"
+  eCSSToken_Bad_String,     // 'foo bar
+  eCSSToken_URL,            // url(foobar) url("foo bar")
+  eCSSToken_Bad_URL,        // url(foo

-  // Whitespace (e.g. " " or "/* abc */")
-  eCSSToken_WhiteSpace,     // mIdent
+  // Any one-character symbol.  mSymbol holds the character.
+  eCSSToken_Symbol,         // . ; { } ! *

-  // A css symbol (e.g. ':', ';', '+', etc.)
-  eCSSToken_Symbol,         // mSymbol
+  // Match operators.  These are single tokens rather than pairs of
+  // Symbol tokens because css3-selectors forbids the presence of
+  // comments between the two characters.  No value fields are used;
+  // the token type indicates which operator.
+  eCSSToken_Includes,       // ~=
+  eCSSToken_Dashmatch,      // |=
+  eCSSToken_Beginsmatch,    // ^=
+  eCSSToken_Endsmatch,      // $=
+  eCSSToken_Containsmatch,  // *=

-  // A css1 id (e.g. #foo3)
-  eCSSToken_ID,             // mIdent
-  // Just like eCSSToken_ID, except the part following the '#' is not
-  // a valid CSS identifier (eg. starts with a digit, is the empty
-  // string, etc).
-  eCSSToken_Ref,            // mIdent
+  // Unicode-range token: currently used only in @font-face.
+  // The lexical rule for this token includes several forms that are
+  // semantically invalid.  Therefore, mIdent always holds the
+  // complete original text of the token (so we can print it
+  // accurately in diagnostics), and mIntegerValid is true iff the
+  // token is semantically valid.  In that case, mInteger holds the
+  // lowest value included in the range, and mInteger2 holds the
+  // highest value included in the range.
+  eCSSToken_URange,         // U+007e U+01?? U+2000-206F

-  eCSSToken_Function,       // mIdent
-
-  eCSSToken_URL,            // mIdent + mSymbol
-  eCSSToken_Bad_URL,        // mIdent + mSymbol
-
-  eCSSToken_HTMLComment,    // "<!--" or "-->"
-
-  eCSSToken_Includes,       // "~="
-  eCSSToken_Dashmatch,      // "|="
-  eCSSToken_Beginsmatch,    // "^="
-  eCSSToken_Endsmatch,      // "$="
-  eCSSToken_Containsmatch,  // "*="
-
-  eCSSToken_URange,         // Low in mInteger, high in mInteger2;
-                            // mIntegerValid is true if the token is a
-                            // valid range; mIdent preserves the textual
-                            // form of the token for error reporting
-
-  // An unterminated string, which is always an error.
-  eCSSToken_Bad_String      // mSymbol + mIdent
+  // HTML comment delimiters, ignored as a unit when they appear at
+  // the top level of a style sheet, for compatibility with websites
+  // written to accommodate pre-CSS browsers.  This token type
+  // subsumes the css2.1 CDO and CDC tokens, which are always treated
+  // the same by the parser.  mIdent holds the text of the token, for
+  // diagnostics.
+  eCSSToken_HTMLComment,    // <!-- -->
 };

+// A single token returned from the scanner.  mType is always
+// meaningful; comments above describe which other fields are
+// meaningful for which token types.
 struct nsCSSToken {
  nsAutoString    mIdent NS_OKONHEAP;
  float           mNumber;
@ -75,22 +105,24 @@ struct nsCSSToken {
  int32_t         mInteger2;
  nsCSSTokenType  mType;
  PRUnichar       mSymbol;
-  bool            mIntegerValid; // for number, dimension, urange
-  bool            mHasSign; // for number, percentage, and dimension
+  bool            mIntegerValid;
+  bool            mHasSign;

-  nsCSSToken();
+  nsCSSToken()
+    : mNumber(0), mInteger(0), mInteger2(0), mType(eCSSToken_Whitespace),
+      mSymbol('\0'), mIntegerValid(false), mHasSign(false)
+  {}

-  bool IsSymbol(PRUnichar aSymbol) {
-    return bool((eCSSToken_Symbol == mType) && (mSymbol == aSymbol));
+  bool IsSymbol(PRUnichar aSymbol) const {
+    return mType == eCSSToken_Symbol && mSymbol == aSymbol;
  }

  void AppendToString(nsString& aBuffer) const;
 };

-// CSS Scanner API. Used to tokenize an input stream using the CSS
-// forward compatible tokenization rules. This implementation is
-// private to this package and is only used internally by the css
-// parser.
+// nsCSSScanner tokenizes an input stream using the CSS2.1
+// forward-compatible tokenization rules.  Used internally by
+// nsCSSParser; not available for use by other code.
 class nsCSSScanner {
  public:
  // |aLineNumber == 1| is the beginning of a file, use |aLineNumber == 0|
@ -122,17 +154,23 @@ class nsCSSScanner {
  // the most recently processed token.
  nsDependentSubstring GetCurrentLine() const;

-  // Get the next token. Return false on EOF. aTokenResult
-  // is filled in with the data for the token.
-  bool Next(nsCSSToken& aTokenResult);
+  // Get the next token.  Return false on EOF.  aTokenResult is filled
+  // in with the data for the token.  If aSkipWS is true, skip over
+  // eCSSToken_Whitespace tokens rather than returning them.
+  bool Next(nsCSSToken& aTokenResult, bool aSkipWS);

-  // Get the next token that may be a string or unquoted URL
+  // Get the body of a URL token (everything after the 'url(').
+  // This is exposed for use by nsCSSParser::ParseMozDocumentRule,
+  // which, for historical reasons, must make additional function
+  // tokens behave like url().  Please do not add new uses to the
+  // parser.
  bool NextURL(nsCSSToken& aTokenResult);

-  // It's really ugly that we have to expose this, but it's the easiest
-  // way to do :nth-child() parsing sanely.  (In particular, in
-  // :nth-child(2n-1), "2n-1" is a dimension, and we need to push the
-  // "-1" back so we can read it again as a number.)
+  // This is exposed for use by nsCSSParser::ParsePseudoClassWithNthPairArg,
+  // because "2n-1" is a single DIMENSION token, and "n-1" is a single
+  // IDENT token, but the :nth() selector syntax wants to interpret
+  // them the same as "2n -1" and "n -1" respectively.  Please do not
+  // add new uses to the parser.
  void Pushback(PRUnichar aChar);

  // Starts recording the input stream from the current position.
@ -150,20 +188,21 @@ protected:
  int32_t Peek();
  bool LookAhead(PRUnichar aChar);
  bool LookAheadOrEOF(PRUnichar aChar); // expect either aChar or EOF
-  void EatWhiteSpace();

-  bool ParseAndAppendEscape(nsString& aOutput, bool aInString);
-  bool ParseIdent(int32_t aChar, nsCSSToken& aResult);
-  bool ParseAtKeyword(nsCSSToken& aResult);
-  bool ParseNumber(int32_t aChar, nsCSSToken& aResult);
-  bool ParseRef(int32_t aChar, nsCSSToken& aResult);
-  bool ParseString(int32_t aChar, nsCSSToken& aResult);
-  bool ParseURange(int32_t aChar, nsCSSToken& aResult);
-  bool SkipCComment();
+  void SkipWhitespace();
+  void SkipComment();

+  bool GatherEscape(nsString& aOutput, bool aInString);
  bool GatherIdent(int32_t aChar, nsString& aIdent);

-  const PRUnichar *mReadPointer;
+  bool ScanIdent(int32_t aChar, nsCSSToken& aResult);
+  bool ScanAtKeyword(nsCSSToken& aResult);
+  bool ScanHash(int32_t aChar, nsCSSToken& aResult);
+  bool ScanNumber(int32_t aChar, nsCSSToken& aResult);
+  bool ScanString(int32_t aChar, nsCSSToken& aResult);
+  bool ScanURange(int32_t aChar, nsCSSToken& aResult);
+
+  const PRUnichar *mBuffer;
  uint32_t mOffset;
  uint32_t mCount;