From ad3efbd71440f2fc015547ea59537cbe9a00c8a9 Mon Sep 17 00:00:00 2001
From: Zack Weinberg <zackw@panix.com>
Date: Sat, 16 Feb 2013 18:27:53 -0500
Subject: [PATCH] Bug 543151, part A1: Preliminary cleanups to the
 scanner/parser interface and the organization of nsCSSScanner.cpp. r=heycam

---
 layout/style/nsCSSParser.cpp  |  67 +---
 layout/style/nsCSSScanner.cpp | 639 +++++++++++++++++-----------------
 layout/style/nsCSSScanner.h   | 177 ++++++----
 3 files changed, 438 insertions(+), 445 deletions(-)

diff --git a/layout/style/nsCSSParser.cpp b/layout/style/nsCSSParser.cpp
index dbbb33c2cb2..b29e889691d 100644
--- a/layout/style/nsCSSParser.cpp
+++ b/layout/style/nsCSSParser.cpp
@@ -340,15 +340,6 @@ protected:
   bool GetToken(bool aSkipWS);
   void UngetToken();
 
-  // get the part in paretheses of the url() function, which is really a
-  // part of a token in the CSS grammar, but we're using a combination
-  // of the parser and the scanner to do it to handle the backtracking
-  // required by the error handling of the tokenization (since if we
-  // fail to scan the full token, we should fall back to tokenizing as
-  // FUNCTION ... ')').
-  // Note that this function WILL WRITE TO aURL IN SOME FAILURE CASES.
-  bool GetURLInParens(nsString& aURL);
-
   bool ExpectSymbol(PRUnichar aSymbol, bool aSkipWS);
   bool ExpectEndProperty();
   bool CheckEndProperty();
@@ -1437,44 +1428,13 @@ CSSParserImpl::EvaluateSupportsCondition(const nsAString& aDeclaration,
 bool
 CSSParserImpl::GetToken(bool aSkipWS)
 {
-  for (;;) {
-    if (!mHavePushBack) {
-      if (!mScanner->Next(mToken)) {
-        break;
-      }
-    }
+  if (mHavePushBack) {
     mHavePushBack = false;
-    if (aSkipWS && (eCSSToken_WhiteSpace == mToken.mType)) {
-      continue;
+    if (!aSkipWS || mToken.mType != eCSSToken_Whitespace) {
+      return true;
     }
-    return true;
   }
-  return false;
-}
-
-bool
-CSSParserImpl::GetURLInParens(nsString& aURL)
-{
-  NS_ASSERTION(!mHavePushBack, "mustn't have pushback at this point");
-  if (! mScanner->NextURL(mToken)) {
-    // EOF
-    return false;
-  }
-
-  aURL = mToken.mIdent;
-
-  if (eCSSToken_URL != mToken.mType) {
-    // In the failure case (which gives a token of type
-    // eCSSToken_Bad_URL), we do not have to match parentheses *inside*
-    // the Bad_URL token, since this is now an invalid URL token.  But
-    // we do need to match the closing parenthesis to match the 'url('.
-    NS_ABORT_IF_FALSE(mToken.mType == eCSSToken_Bad_URL,
-                      "unexpected token type");
-    SkipUntil(')');
-    return false;
-  }
-
-  return true;
+  return mScanner->Next(mToken, aSkipWS);
 }
 
 void
@@ -2207,9 +2167,10 @@ CSSParserImpl::ParseMozDocumentRule(RuleAppendFunc aAppendFunc, void* aData)
         cur->func = css::DocumentRule::eDomain;
       }
 
-      nsAutoString url;
-      if (!GetURLInParens(url)) {
+      NS_ASSERTION(!mHavePushBack, "mustn't have pushback at this point");
+      if (!mScanner->NextURL(mToken) || mToken.mType != eCSSToken_URL) {
         REPORT_UNEXPECTED_TOKEN(PEMozDocRuleNotURI);
+        SkipUntil(')');
         delete urls;
         return false;
       }
@@ -2217,7 +2178,7 @@ CSSParserImpl::ParseMozDocumentRule(RuleAppendFunc aAppendFunc, void* aData)
       // We could try to make the URL (as long as it's not domain())
       // canonical and absolute with NS_NewURI and GetSpec, but I'm
       // inclined to think we shouldn't.
-      CopyUTF16toUTF8(url, cur->url);
+      CopyUTF16toUTF8(mToken.mIdent, cur->url);
     }
   } while (ExpectSymbol(',', true));
 
@@ -3039,7 +3000,7 @@ CSSParserImpl::ParseSelectorGroup(nsCSSSelectorList*& aList)
     }
 
     combinator = PRUnichar(0);
-    if (mToken.mType == eCSSToken_WhiteSpace) {
+    if (mToken.mType == eCSSToken_Whitespace) {
       if (!GetToken(true)) {
         break; // EOF ok here
       }
@@ -4121,7 +4082,7 @@ CSSParserImpl::ParseColor(nsCSSValue& aValue)
   nscolor rgba;
   switch (tk->mType) {
     case eCSSToken_ID:
-    case eCSSToken_Ref:
+    case eCSSToken_Hash:
       // #xxyyzz
       if (NS_HexToRGB(tk->mIdent, &rgba)) {
         aValue.SetColorValue(rgba);
@@ -5031,7 +4992,7 @@ CSSParserImpl::ParseVariant(nsCSSValue& aValue,
   if ((aVariantMask & VARIANT_COLOR) != 0) {
     if (mHashlessColorQuirk || // NONSTANDARD: Nav interprets 'xxyyzz' values even without '#' prefix
         (eCSSToken_ID == tk->mType) ||
-        (eCSSToken_Ref == tk->mType) ||
+        (eCSSToken_Hash == tk->mType) ||
         (eCSSToken_Ident == tk->mType) ||
         ((eCSSToken_Function == tk->mType) &&
          (tk->mIdent.LowerCaseEqualsLiteral("rgb") ||
@@ -5748,7 +5709,7 @@ CSSParserImpl::IsLegacyGradientLine(const nsCSSTokenType& aType,
     }
     // fall through
   case eCSSToken_ID:
-  case eCSSToken_Ref:
+  case eCSSToken_Hash:
     // this is a color
     break;
 
@@ -8021,7 +7982,7 @@ CSSParserImpl::RequireWhitespace()
 {
   if (!GetToken(false))
     return false;
-  if (mToken.mType != eCSSToken_WhiteSpace) {
+  if (mToken.mType != eCSSToken_Whitespace) {
     UngetToken();
     return false;
   }
@@ -8427,7 +8388,7 @@ CSSParserImpl::ParseOneFamily(nsAString& aFamily, bool& aOneKeyword)
       if (eCSSToken_Ident == tk->mType) {
         aOneKeyword = false;
         aFamily.Append(tk->mIdent);
-      } else if (eCSSToken_WhiteSpace == tk->mType) {
+      } else if (eCSSToken_Whitespace == tk->mType) {
         // Lookahead one token and drop whitespace if we are ending the
         // font name.
         if (!GetToken(true))
diff --git a/layout/style/nsCSSScanner.cpp b/layout/style/nsCSSScanner.cpp
index f9b19414dc4..e921f183f21 100644
--- a/layout/style/nsCSSScanner.cpp
+++ b/layout/style/nsCSSScanner.cpp
@@ -121,11 +121,6 @@ HexDigitValue(int32_t ch)
   }
 }
 
-nsCSSToken::nsCSSToken()
-{
-  mType = eCSSToken_Symbol;
-}
-
 void
 nsCSSToken::AppendToString(nsString& aBuffer) const
 {
@@ -140,7 +135,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const
       break;
 
     case eCSSToken_ID:
-    case eCSSToken_Ref:
+    case eCSSToken_Hash:
       aBuffer.Append('#');
       nsStyleUtil::AppendEscapedCSSIdent(mIdent, aBuffer);
       break;
@@ -200,7 +195,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const
       aBuffer.Append(mSymbol);
       break;
 
-    case eCSSToken_WhiteSpace:
+    case eCSSToken_Whitespace:
       aBuffer.Append(' ');
       break;
 
@@ -232,7 +227,7 @@ nsCSSToken::AppendToString(nsString& aBuffer) const
 }
 
 nsCSSScanner::nsCSSScanner(const nsAString& aBuffer, uint32_t aLineNumber)
-  : mReadPointer(aBuffer.BeginReading())
+  : mBuffer(aBuffer.BeginReading())
   , mOffset(0)
   , mCount(aBuffer.Length())
   , mPushback(mLocalPushback)
@@ -259,6 +254,43 @@ nsCSSScanner::~nsCSSScanner()
   }
 }
 
+void
+nsCSSScanner::StartRecording()
+{
+  NS_ASSERTION(!mRecording, "already started recording");
+  mRecording = true;
+  mRecordStartOffset = mOffset - mPushbackCount;
+}
+
+void
+nsCSSScanner::StopRecording()
+{
+  NS_ASSERTION(mRecording, "haven't started recording");
+  mRecording = false;
+}
+
+void
+nsCSSScanner::StopRecording(nsString& aBuffer)
+{
+  NS_ASSERTION(mRecording, "haven't started recording");
+  mRecording = false;
+  aBuffer.Append(mBuffer + mRecordStartOffset,
+                 mOffset - mPushbackCount - mRecordStartOffset);
+}
+
+nsDependentSubstring
+nsCSSScanner::GetCurrentLine() const
+{
+  uint32_t end = mTokenOffset;
+  while (end < mCount &&
+         mBuffer[end] != '\n' && mBuffer[end] != '\r' &&
+         mBuffer[end] != '\f') {
+    end++;
+  }
+  return nsDependentSubstring(mBuffer + mTokenLineOffset,
+                              mBuffer + end);
+}
+
 // Returns -1 on error or eof
 int32_t
 nsCSSScanner::Read()
@@ -270,11 +302,11 @@ nsCSSScanner::Read()
     if (mOffset == mCount) {
       return -1;
     }
-    rv = int32_t(mReadPointer[mOffset++]);
+    rv = int32_t(mBuffer[mOffset++]);
     // There are four types of newlines in CSS: "\r", "\n", "\r\n", and "\f".
     // To simplify dealing with newlines, they are all normalized to "\n" here
     if (rv == '\r') {
-      if (mOffset < mCount && mReadPointer[mOffset] == '\n') {
+      if (mOffset < mCount && mBuffer[mOffset] == '\n') {
         mOffset++;
       }
       rv = '\n';
@@ -323,43 +355,6 @@ nsCSSScanner::Pushback(PRUnichar aChar)
   mPushback[mPushbackCount++] = aChar;
 }
 
-void
-nsCSSScanner::StartRecording()
-{
-  NS_ASSERTION(!mRecording, "already started recording");
-  mRecording = true;
-  mRecordStartOffset = mOffset - mPushbackCount;
-}
-
-void
-nsCSSScanner::StopRecording()
-{
-  NS_ASSERTION(mRecording, "haven't started recording");
-  mRecording = false;
-}
-
-void
-nsCSSScanner::StopRecording(nsString& aBuffer)
-{
-  NS_ASSERTION(mRecording, "haven't started recording");
-  mRecording = false;
-  aBuffer.Append(mReadPointer + mRecordStartOffset,
-                 mOffset - mPushbackCount - mRecordStartOffset);
-}
-
-nsDependentSubstring
-nsCSSScanner::GetCurrentLine() const
-{
-  uint32_t end = mTokenOffset;
-  while (end < mCount &&
-         mReadPointer[end] != '\n' && mReadPointer[end] != '\r' &&
-         mReadPointer[end] != '\f') {
-    end++;
-  }
-  return nsDependentSubstring(mReadPointer + mTokenLineOffset,
-                              mReadPointer + end);
-}
-
 bool
 nsCSSScanner::LookAhead(PRUnichar aChar)
 {
@@ -389,7 +384,7 @@ nsCSSScanner::LookAheadOrEOF(PRUnichar aChar)
 }
 
 void
-nsCSSScanner::EatWhiteSpace()
+nsCSSScanner::SkipWhitespace()
 {
   for (;;) {
     int32_t ch = Read();
@@ -403,233 +398,28 @@ nsCSSScanner::EatWhiteSpace()
   }
 }
 
-bool
-nsCSSScanner::Next(nsCSSToken& aToken)
+void
+nsCSSScanner::SkipComment()
 {
-  for (;;) { // Infinite loop so we can restart after comments.
-    mTokenOffset = mOffset;
-    mTokenLineOffset = mLineOffset;
-    mTokenLineNumber = mLineNumber;
-
-    int32_t ch = Read();
-    if (ch < 0) {
-      return false;
-    }
-
-    // UNICODE-RANGE
-    if ((ch == 'u' || ch == 'U') && Peek() == '+')
-      return ParseURange(ch, aToken);
-
-    // IDENT
-    if (StartsIdent(ch, Peek()))
-      return ParseIdent(ch, aToken);
-
-    // AT_KEYWORD
-    if (ch == '@') {
-      return ParseAtKeyword(aToken);
-    }
-
-    // NUMBER or DIM
-    if ((ch == '.') || (ch == '+') || (ch == '-')) {
-      int32_t nextChar = Peek();
-      if (IsDigit(nextChar)) {
-        return ParseNumber(ch, aToken);
-      }
-      else if (('.' == nextChar) && ('.' != ch)) {
-        nextChar = Read();
-        int32_t followingChar = Peek();
-        Pushback(nextChar);
-        if (IsDigit(followingChar))
-          return ParseNumber(ch, aToken);
-      }
-    }
-    if (IsDigit(ch)) {
-      return ParseNumber(ch, aToken);
-    }
-
-    // ID
-    if (ch == '#') {
-      return ParseRef(ch, aToken);
-    }
-
-    // STRING
-    if ((ch == '"') || (ch == '\'')) {
-      return ParseString(ch, aToken);
-    }
-
-    // WS
-    if (IsWhitespace(ch)) {
-      aToken.mType = eCSSToken_WhiteSpace;
-      aToken.mIdent.Assign(PRUnichar(ch));
-      EatWhiteSpace();
-      return true;
-    }
-    if (ch == '/' && !IsSVGMode()) {
-      int32_t nextChar = Peek();
-      if (nextChar == '*') {
-        Read();
-        // FIXME: Editor wants comments to be preserved (bug 60290).
-        if (!SkipCComment()) {
-          return false;
-        }
-        continue; // start again at the beginning
-      }
-    }
-    if (ch == '<') {  // consume HTML comment tags
-      if (LookAhead('!')) {
-        if (LookAhead('-')) {
-          if (LookAhead('-')) {
-            aToken.mType = eCSSToken_HTMLComment;
-            aToken.mIdent.AssignLiteral("<!--");
-            return true;
-          }
-          Pushback('-');
-        }
-        Pushback('!');
-      }
-    }
-    if (ch == '-') {  // check for HTML comment end
-      if (LookAhead('-')) {
-        if (LookAhead('>')) {
-          aToken.mType = eCSSToken_HTMLComment;
-          aToken.mIdent.AssignLiteral("-->");
-          return true;
-        }
-        Pushback('-');
-      }
-    }
-
-    // INCLUDES ("~=") and DASHMATCH ("|=")
-    if (( ch == '|' ) || ( ch == '~' ) || ( ch == '^' ) ||
-        ( ch == '$' ) || ( ch == '*' )) {
-      int32_t nextChar = Read();
-      if ( nextChar == '=' ) {
-        if (ch == '~') {
-          aToken.mType = eCSSToken_Includes;
-        }
-        else if (ch == '|') {
-          aToken.mType = eCSSToken_Dashmatch;
-        }
-        else if (ch == '^') {
-          aToken.mType = eCSSToken_Beginsmatch;
-        }
-        else if (ch == '$') {
-          aToken.mType = eCSSToken_Endsmatch;
-        }
-        else if (ch == '*') {
-          aToken.mType = eCSSToken_Containsmatch;
-        }
-        return true;
-      } else if (nextChar >= 0) {
-        Pushback(nextChar);
-      }
-    }
-    aToken.mType = eCSSToken_Symbol;
-    aToken.mSymbol = ch;
-    return true;
-  }
-}
-
-bool
-nsCSSScanner::NextURL(nsCSSToken& aToken)
-{
-  EatWhiteSpace();
-
-  int32_t ch = Read();
-  if (ch < 0) {
-    return false;
-  }
-
-  // STRING
-  if ((ch == '"') || (ch == '\'')) {
-#ifdef DEBUG
-    bool ok =
-#endif
-      ParseString(ch, aToken);
-    NS_ABORT_IF_FALSE(ok, "ParseString should never fail, "
-                          "since there's always something read");
-
-    NS_ABORT_IF_FALSE(aToken.mType == eCSSToken_String ||
-                      aToken.mType == eCSSToken_Bad_String,
-                      "unexpected token type");
-    if (MOZ_LIKELY(aToken.mType == eCSSToken_String)) {
-      EatWhiteSpace();
-      if (LookAheadOrEOF(')')) {
-        aToken.mType = eCSSToken_URL;
-      } else {
-        aToken.mType = eCSSToken_Bad_URL;
-      }
-    } else {
-      aToken.mType = eCSSToken_Bad_URL;
-    }
-    return true;
-  }
-
-  // Process a url lexical token. A CSS1 url token can contain
-  // characters beyond identifier characters (e.g. '/', ':', etc.)
-  // Because of this the normal rules for tokenizing the input don't
-  // apply very well. To simplify the parser and relax some of the
-  // requirements on the scanner we parse url's here. If we find a
-  // malformed URL then we emit a token of type "Bad_URL" so that
-  // the CSS1 parser can ignore the invalid input.  The parser must
-  // treat a Bad_URL token like a Function token, and process
-  // tokens until a matching parenthesis.
-
-  aToken.mType = eCSSToken_Bad_URL;
-  aToken.mSymbol = PRUnichar(0);
-  nsString& ident = aToken.mIdent;
-  ident.SetLength(0);
-
-  // start of a non-quoted url (which may be empty)
-  bool ok = true;
   for (;;) {
-    if (IsURLChar(ch)) {
-      // A regular url character.
-      ident.Append(PRUnichar(ch));
-    } else if (ch == ')') {
-      // All done
-      break;
-    } else if (IsWhitespace(ch)) {
-      // Whitespace is allowed at the end of the URL
-      EatWhiteSpace();
-      // Consume the close paren if we have it; if not we're an invalid URL.
-      ok = LookAheadOrEOF(')');
-      break;
-    } else if (ch == '\\') {
-      if (!ParseAndAppendEscape(ident, false)) {
-        ok = false;
-        Pushback(ch);
-        break;
+    int32_t ch = Read();
+    if (ch < 0) break;
+    if (ch == '*') {
+      if (LookAhead('/')) {
+        return;
       }
-    } else {
-      // This is an invalid URL spec
-      ok = false;
-      Pushback(ch); // push it back so the parser can match tokens and
-                    // then closing parenthesis
-      break;
-    }
-
-    ch = Read();
-    if (ch < 0) {
-      break;
     }
   }
 
-  // If the result of the above scanning is ok then change the token
-  // type to a useful one.
-  if (ok) {
-    aToken.mType = eCSSToken_URL;
-  }
-  return true;
+  mReporter->ReportUnexpectedEOF("PECommentEOF");
 }
 
-
 /**
  * Returns whether an escape was succesfully parsed; if it was not,
  * the backslash needs to be its own symbol token.
  */
 bool
-nsCSSScanner::ParseAndAppendEscape(nsString& aOutput, bool aInString)
+nsCSSScanner::GatherEscape(nsString& aOutput, bool aInString)
 {
   int32_t ch = Read();
   if (ch < 0) {
@@ -717,7 +507,7 @@ bool
 nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent)
 {
   if (aChar == '\\') {
-    if (!ParseAndAppendEscape(aIdent, false)) {
+    if (!GatherEscape(aIdent, false)) {
       return false;
     }
   } else {
@@ -730,12 +520,12 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent)
       // See how much we can consume and append in one go
       uint32_t n = mOffset;
       // Count number of Ident characters that can be processed
-      while (n < mCount && IsIdent(mReadPointer[n])) {
+      while (n < mCount && IsIdent(mBuffer[n])) {
         ++n;
       }
       // Add to the token what we have so far
       if (n > mOffset) {
-        aIdent.Append(&mReadPointer[mOffset], n - mOffset);
+        aIdent.Append(&mBuffer[mOffset], n - mOffset);
         mOffset = n;
       }
     }
@@ -743,7 +533,7 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent)
     aChar = Read();
     if (aChar < 0) break;
     if (aChar == '\\') {
-      if (!ParseAndAppendEscape(aIdent, false)) {
+      if (!GatherEscape(aIdent, false)) {
         Pushback(aChar);
         break;
       }
@@ -759,35 +549,7 @@ nsCSSScanner::GatherIdent(int32_t aChar, nsString& aIdent)
 }
 
 bool
-nsCSSScanner::ParseRef(int32_t aChar, nsCSSToken& aToken)
-{
-  // Fall back for when we don't have name characters following:
-  aToken.mType = eCSSToken_Symbol;
-  aToken.mSymbol = aChar;
-
-  int32_t ch = Read();
-  if (ch < 0) {
-    return true;
-  }
-  if (IsIdent(ch) || ch == '\\') {
-    // First char after the '#' is a valid ident char (or an escape),
-    // so it makes sense to keep going
-    nsCSSTokenType type =
-      StartsIdent(ch, Peek()) ? eCSSToken_ID : eCSSToken_Ref;
-    aToken.mIdent.SetLength(0);
-    if (GatherIdent(ch, aToken.mIdent)) {
-      aToken.mType = type;
-      return true;
-    }
-  }
-
-  // No ident chars after the '#'.  Just unread |ch| and get out of here.
-  Pushback(ch);
-  return true;
-}
-
-bool
-nsCSSScanner::ParseIdent(int32_t aChar, nsCSSToken& aToken)
+nsCSSScanner::ScanIdent(int32_t aChar, nsCSSToken& aToken)
 {
   nsString& ident = aToken.mIdent;
   ident.SetLength(0);
@@ -814,7 +576,7 @@ nsCSSScanner::ParseIdent(int32_t aChar, nsCSSToken& aToken)
 }
 
 bool
-nsCSSScanner::ParseAtKeyword(nsCSSToken& aToken)
+nsCSSScanner::ScanAtKeyword(nsCSSToken& aToken)
 {
   int32_t ch = Read();
   if (StartsIdent(ch, Peek())) {
@@ -833,7 +595,35 @@ nsCSSScanner::ParseAtKeyword(nsCSSToken& aToken)
 }
 
 bool
-nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken)
+nsCSSScanner::ScanHash(int32_t aChar, nsCSSToken& aToken)
+{
+  // Fall back for when we don't have name characters following:
+  aToken.mType = eCSSToken_Symbol;
+  aToken.mSymbol = aChar;
+
+  int32_t ch = Read();
+  if (ch < 0) {
+    return true;
+  }
+  if (IsIdent(ch) || ch == '\\') {
+    // First char after the '#' is a valid ident char (or an escape),
+    // so it makes sense to keep going
+    nsCSSTokenType type =
+      StartsIdent(ch, Peek()) ? eCSSToken_ID : eCSSToken_Hash;
+    aToken.mIdent.SetLength(0);
+    if (GatherIdent(ch, aToken.mIdent)) {
+      aToken.mType = type;
+      return true;
+    }
+  }
+
+  // No ident chars after the '#'.  Just unread |ch| and get out of here.
+  Pushback(ch);
+  return true;
+}
+
+bool
+nsCSSScanner::ScanNumber(int32_t c, nsCSSToken& aToken)
 {
   NS_PRECONDITION(c == '.' || c == '+' || c == '-' || IsDigit(c),
                   "Why did we get called?");
@@ -867,7 +657,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken)
   bool gotDot = (c == '.');
 
   if (!gotDot) {
-    // Parse the integer part of the mantisssa
+    // Scan the integer part of the mantissa
     NS_ASSERTION(IsDigit(c), "Why did we get called?");
     do {
       intPart = 10*intPart + DecimalDigitValue(c);
@@ -879,7 +669,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken)
   }
 
   if (gotDot) {
-    // Parse the fractional part of the mantissa.
+    // Scan the fractional part of the mantissa.
     c = Read();
     NS_ASSERTION(IsDigit(c), "How did we get here?");
     // Power of ten by which we need to divide our next digit
@@ -967,24 +757,7 @@ nsCSSScanner::ParseNumber(int32_t c, nsCSSToken& aToken)
 }
 
 bool
-nsCSSScanner::SkipCComment()
-{
-  for (;;) {
-    int32_t ch = Read();
-    if (ch < 0) break;
-    if (ch == '*') {
-      if (LookAhead('/')) {
-        return true;
-      }
-    }
-  }
-
-  mReporter->ReportUnexpectedEOF("PECommentEOF");
-  return false;
-}
-
-bool
-nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
+nsCSSScanner::ScanString(int32_t aStop, nsCSSToken& aToken)
 {
   aToken.mIdent.SetLength(0);
   aToken.mType = eCSSToken_String;
@@ -996,7 +769,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
       uint32_t n = mOffset;
       // Count number of characters that can be processed
       for (;n < mCount; ++n) {
-        PRUnichar nextChar = mReadPointer[n];
+        PRUnichar nextChar = mBuffer[n];
         if ((nextChar == aStop) || (nextChar == '\\') ||
             (nextChar == '\n') || (nextChar == '\r') || (nextChar == '\f')) {
           break;
@@ -1004,7 +777,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
       }
       // Add to the token what we have so far
       if (n > mOffset) {
-        aToken.mIdent.Append(&mReadPointer[mOffset], n - mOffset);
+        aToken.mIdent.Append(&mBuffer[mOffset], n - mOffset);
         mOffset = n;
       }
     }
@@ -1018,10 +791,10 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
       break;
     }
     if (ch == '\\') {
-      if (!ParseAndAppendEscape(aToken.mIdent, true)) {
+      if (!GatherEscape(aToken.mIdent, true)) {
         aToken.mType = eCSSToken_Bad_String;
         Pushback(ch);
-        // For strings, the only case where ParseAndAppendEscape will
+        // For strings, the only case where GatherEscape will
         // return false is when there's a backslash to start an escape
         // immediately followed by end-of-stream.  In that case, the
         // correct tokenization is badstring *followed* by a DELIM for
@@ -1052,7 +825,7 @@ nsCSSScanner::ParseString(int32_t aStop, nsCSSToken& aToken)
 // are also decoded into mInteger and mInteger2, and mIntegerValid is set.
 
 bool
-nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult)
+nsCSSScanner::ScanURange(int32_t aChar, nsCSSToken& aResult)
 {
   int32_t intro2 = Read();
   int32_t ch = Peek();
@@ -1069,7 +842,7 @@ nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult)
   if (!IsHexDigit(ch) && ch != '?') {
     Pushback(intro2);
     Pushback(aChar);
-    return ParseIdent(aChar, aResult);
+    return ScanIdent(aChar, aResult);
   }
 
   aResult.mIdent.Truncate();
@@ -1129,3 +902,223 @@ nsCSSScanner::ParseURange(int32_t aChar, nsCSSToken& aResult)
   aResult.mType = eCSSToken_URange;
   return true;
 }
+
+bool
+nsCSSScanner::NextURL(nsCSSToken& aToken)
+{
+  SkipWhitespace();
+
+  int32_t ch = Read();
+  if (ch < 0) {
+    return false;
+  }
+
+  // STRING
+  if ((ch == '"') || (ch == '\'')) {
+#ifdef DEBUG
+    bool ok =
+#endif
+      ScanString(ch, aToken);
+    NS_ABORT_IF_FALSE(ok, "ScanString should never fail, "
+                          "since there's always something read");
+
+    NS_ABORT_IF_FALSE(aToken.mType == eCSSToken_String ||
+                      aToken.mType == eCSSToken_Bad_String,
+                      "unexpected token type");
+    if (MOZ_LIKELY(aToken.mType == eCSSToken_String)) {
+      SkipWhitespace();
+      if (LookAheadOrEOF(')')) {
+        aToken.mType = eCSSToken_URL;
+      } else {
+        aToken.mType = eCSSToken_Bad_URL;
+      }
+    } else {
+      aToken.mType = eCSSToken_Bad_URL;
+    }
+    return true;
+  }
+
+  // Process a url lexical token. A CSS1 url token can contain
+  // characters beyond identifier characters (e.g. '/', ':', etc.)
+  // Because of this the normal rules for tokenizing the input don't
+  // apply very well. To simplify the parser and relax some of the
+  // requirements on the scanner we parse URLs here. If we find a
+  // malformed URL then we emit a token of type "Bad_URL" so that
+  // the CSS1 parser can ignore the invalid input.  The parser must
+  // treat a Bad_URL token like a Function token, and process
+  // tokens until a matching parenthesis.
+
+  aToken.mType = eCSSToken_Bad_URL;
+  aToken.mSymbol = PRUnichar(0);
+  nsString& ident = aToken.mIdent;
+  ident.SetLength(0);
+
+  // start of a non-quoted url (which may be empty)
+  bool ok = true;
+  for (;;) {
+    if (IsURLChar(ch)) {
+      // A regular url character.
+      ident.Append(PRUnichar(ch));
+    } else if (ch == ')') {
+      // All done
+      break;
+    } else if (IsWhitespace(ch)) {
+      // Whitespace is allowed at the end of the URL
+      SkipWhitespace();
+      // Consume the close paren if we have it; if not we're an invalid URL.
+      ok = LookAheadOrEOF(')');
+      break;
+    } else if (ch == '\\') {
+      if (!GatherEscape(ident, false)) {
+        ok = false;
+        Pushback(ch);
+        break;
+      }
+    } else {
+      // This is an invalid URL spec
+      ok = false;
+      Pushback(ch); // push it back so the parser can match tokens and
+                    // then closing parenthesis
+      break;
+    }
+
+    ch = Read();
+    if (ch < 0) {
+      break;
+    }
+  }
+
+  // If the result of the above scanning is ok then change the token
+  // type to a useful one.
+  if (ok) {
+    aToken.mType = eCSSToken_URL;
+  }
+  return true;
+}
+
+bool
+nsCSSScanner::Next(nsCSSToken& aToken, bool aSkipWS)
+{
+  for (;;) { // Infinite loop so we can restart after comments.
+    mTokenOffset = mOffset;
+    mTokenLineOffset = mLineOffset;
+    mTokenLineNumber = mLineNumber;
+
+    int32_t ch = Read();
+    if (ch < 0) {
+      return false;
+    }
+
+    // UNICODE-RANGE
+    if ((ch == 'u' || ch == 'U') && Peek() == '+')
+      return ScanURange(ch, aToken);
+
+    // IDENT
+    if (StartsIdent(ch, Peek()))
+      return ScanIdent(ch, aToken);
+
+    // AT_KEYWORD
+    if (ch == '@') {
+      return ScanAtKeyword(aToken);
+    }
+
+    // NUMBER or DIM
+    if ((ch == '.') || (ch == '+') || (ch == '-')) {
+      int32_t nextChar = Peek();
+      if (IsDigit(nextChar)) {
+        return ScanNumber(ch, aToken);
+      }
+      else if (('.' == nextChar) && ('.' != ch)) {
+        nextChar = Read();
+        int32_t followingChar = Peek();
+        Pushback(nextChar);
+        if (IsDigit(followingChar))
+          return ScanNumber(ch, aToken);
+      }
+    }
+    if (IsDigit(ch)) {
+      return ScanNumber(ch, aToken);
+    }
+
+    // ID or HASH (or a bare '#' symbol)
+    if (ch == '#') {
+      return ScanHash(ch, aToken);
+    }
+
+    // STRING
+    if ((ch == '"') || (ch == '\'')) {
+      return ScanString(ch, aToken);
+    }
+
+    // WS
+    if (IsWhitespace(ch)) {
+      SkipWhitespace();
+      if (!aSkipWS) {
+        aToken.mType = eCSSToken_Whitespace;
+        return true;
+      }
+      continue; // start again at the beginning
+    }
+    if (ch == '/' && !IsSVGMode()) {
+      int32_t nextChar = Peek();
+      if (nextChar == '*') {
+        Read();
+        // FIXME: Editor wants comments to be preserved (bug 60290).
+        SkipComment();
+        continue; // start again at the beginning
+      }
+    }
+    if (ch == '<') {  // consume HTML comment tags
+      if (LookAhead('!')) {
+        if (LookAhead('-')) {
+          if (LookAhead('-')) {
+            aToken.mType = eCSSToken_HTMLComment;
+            aToken.mIdent.AssignLiteral("<!--");
+            return true;
+          }
+          Pushback('-');
+        }
+        Pushback('!');
+      }
+    }
+    if (ch == '-') {  // check for HTML comment end
+      if (LookAhead('-')) {
+        if (LookAhead('>')) {
+          aToken.mType = eCSSToken_HTMLComment;
+          aToken.mIdent.AssignLiteral("-->");
+          return true;
+        }
+        Pushback('-');
+      }
+    }
+
+    // INCLUDES ("~="), DASHMATCH ("|="), and "^=", "$=", "*=" matchers
+    if (( ch == '|' ) || ( ch == '~' ) || ( ch == '^' ) ||
+        ( ch == '$' ) || ( ch == '*' )) {
+      int32_t nextChar = Read();
+      if ( nextChar == '=' ) {
+        if (ch == '~') {
+          aToken.mType = eCSSToken_Includes;
+        }
+        else if (ch == '|') {
+          aToken.mType = eCSSToken_Dashmatch;
+        }
+        else if (ch == '^') {
+          aToken.mType = eCSSToken_Beginsmatch;
+        }
+        else if (ch == '$') {
+          aToken.mType = eCSSToken_Endsmatch;
+        }
+        else if (ch == '*') {
+          aToken.mType = eCSSToken_Containsmatch;
+        }
+        return true;
+      } else if (nextChar >= 0) {
+        Pushback(nextChar);
+      }
+    }
+    aToken.mType = eCSSToken_Symbol;
+    aToken.mSymbol = ch;
+    return true;
+  }
+}
diff --git a/layout/style/nsCSSScanner.h b/layout/style/nsCSSScanner.h
index 30159bcea82..54e4be7c8f6 100644
--- a/layout/style/nsCSSScanner.h
+++ b/layout/style/nsCSSScanner.h
@@ -16,58 +16,88 @@ class ErrorReporter;
 }
 }
 
-// Token types
+// Token types; in close but not perfect correspondence to the token
+// categorization in section 4.1.1 of CSS2.1.  (The deviations are all
+// the fault of css3-selectors, which has requirements that can only be
+// met by changing the generic tokenization.)  The comment on each line
+// illustrates the lexical form of the corresponding token type.
+
 enum nsCSSTokenType {
-  // A css identifier (e.g. foo)
-  eCSSToken_Ident,          // mIdent
+  // White space of any kind.  No value fields are used.  Note that
+  // comments do *not* count as white space; comments separate tokens
+  // but are not themselves tokens.
+  eCSSToken_Whitespace,     //
 
-  // A css at keyword (e.g. @foo)
-  eCSSToken_AtKeyword,      // mIdent
+  // Identifier-like tokens.  mIdent is the text of the identifier.
+  // The difference between ID and Hash is: if the text after the #
+  // would have been a valid Ident if the # hadn't been there, the
+  // scanner produces an ID token.  Otherwise it produces a Hash token.
+  // (This distinction is required by css3-selectors.)
+  eCSSToken_Ident,          // word
+  eCSSToken_Function,       // word(
+  eCSSToken_AtKeyword,      // @word
+  eCSSToken_ID,             // #word
+  eCSSToken_Hash,           // #0word
 
-  // A css number without a percentage or dimension; with percentage;
-  // without percentage but with a dimension
-  eCSSToken_Number,         // mNumber
-  eCSSToken_Percentage,     // mNumber
-  eCSSToken_Dimension,      // mNumber + mIdent
+  // Numeric tokens.  mNumber is the floating-point value of the
+  // number, and mHasSign indicates whether there was an explicit sign
+  // (+ or -) in front of the number.  If mIntegerValid is true, the
+  // number had the lexical form of an integer, and mInteger is its
+  // integer value.  Lexically integer values outside the range of a
+  // 32-bit signed number are clamped to the maximum values; mNumber
+  // will indicate a 'truer' value in that case.  Percentage tokens
+  // are always considered not to be integers, even if their numeric
+  // value is integral (100% => mNumber = 1.0).  For Dimension
+  // tokens, mIdent holds the text of the unit.
+  eCSSToken_Number,         // 1 -5 +2e3 3.14159 7.297352e-3
+  eCSSToken_Dimension,      // 24px 8.5in
+  eCSSToken_Percentage,     // 85% 1280.4%
 
-  // A css string (e.g. "foo" or 'foo')
-  eCSSToken_String,         // mSymbol + mIdent + mSymbol
+  // String-like tokens.  In all cases, mIdent holds the text
+  // belonging to the string, and mSymbol holds the delimiter
+  // character, which may be ', ", or zero (only for unquoted URLs).
+  // Bad_String and Bad_URL tokens are emitted when the closing
+  // delimiter or parenthesis was missing.
+  eCSSToken_String,         // 'foo bar' "foo bar"
+  eCSSToken_Bad_String,     // 'foo bar
+  eCSSToken_URL,            // url(foobar) url("foo bar")
+  eCSSToken_Bad_URL,        // url(foo
 
-  // Whitespace (e.g. " " or "/* abc */")
-  eCSSToken_WhiteSpace,     // mIdent
+  // Any one-character symbol.  mSymbol holds the character.
+  eCSSToken_Symbol,         // . ; { } ! *
 
-  // A css symbol (e.g. ':', ';', '+', etc.)
-  eCSSToken_Symbol,         // mSymbol
+  // Match operators.  These are single tokens rather than pairs of
+  // Symbol tokens because css3-selectors forbids the presence of
+  // comments between the two characters.  No value fields are used;
+  // the token type indicates which operator.
+  eCSSToken_Includes,       // ~=
+  eCSSToken_Dashmatch,      // |=
+  eCSSToken_Beginsmatch,    // ^=
+  eCSSToken_Endsmatch,      // $=
+  eCSSToken_Containsmatch,  // *=
 
-  // A css1 id (e.g. #foo3)
-  eCSSToken_ID,             // mIdent
-  // Just like eCSSToken_ID, except the part following the '#' is not
-  // a valid CSS identifier (eg. starts with a digit, is the empty
-  // string, etc).
-  eCSSToken_Ref,            // mIdent
+  // Unicode-range token: currently used only in @font-face.
+  // The lexical rule for this token includes several forms that are
+  // semantically invalid.  Therefore, mIdent always holds the
+  // complete original text of the token (so we can print it
+  // accurately in diagnostics), and mIntegerValid is true iff the
+  // token is semantically valid.  In that case, mInteger holds the
+  // lowest value included in the range, and mInteger2 holds the
+  // highest value included in the range.
+  eCSSToken_URange,         // U+007e U+01?? U+2000-206F
 
-  eCSSToken_Function,       // mIdent
-
-  eCSSToken_URL,            // mIdent + mSymbol
-  eCSSToken_Bad_URL,        // mIdent + mSymbol
-
-  eCSSToken_HTMLComment,    // "<!--" or "-->"
-
-  eCSSToken_Includes,       // "~="
-  eCSSToken_Dashmatch,      // "|="
-  eCSSToken_Beginsmatch,    // "^="
-  eCSSToken_Endsmatch,      // "$="
-  eCSSToken_Containsmatch,  // "*="
-
-  eCSSToken_URange,         // Low in mInteger, high in mInteger2;
-                            // mIntegerValid is true if the token is a
-                            // valid range; mIdent preserves the textual
-                            // form of the token for error reporting
-
-  // An unterminated string, which is always an error.
-  eCSSToken_Bad_String      // mSymbol + mIdent
+  // HTML comment delimiters, ignored as a unit when they appear at
+  // the top level of a style sheet, for the benefit of websites
+  // written to be compatible with pre-CSS browsers.  This token type
+  // subsumes the css2.1 CDO and CDC tokens, which are always treated
+  // the same by the parser.  mIdent holds the text of the token, for
+  // diagnostics.
+  eCSSToken_HTMLComment,    // <!-- -->
 };
 
+// A single token returned from the scanner.  mType is always
+// meaningful; comments above describe which other fields are
+// meaningful for which token types.
 struct nsCSSToken {
   nsAutoString    mIdent NS_OKONHEAP;
   float           mNumber;
@@ -75,22 +105,24 @@ struct nsCSSToken {
   int32_t         mInteger2;
   nsCSSTokenType  mType;
   PRUnichar       mSymbol;
-  bool            mIntegerValid; // for number, dimension, urange
-  bool            mHasSign; // for number, percentage, and dimension
+  bool            mIntegerValid;
+  bool            mHasSign;
 
-  nsCSSToken();
+  nsCSSToken()
+    : mNumber(0), mInteger(0), mInteger2(0), mType(eCSSToken_Whitespace),
+      mSymbol('\0'), mIntegerValid(false), mHasSign(false)
+  {}
 
-  bool IsSymbol(PRUnichar aSymbol) {
-    return bool((eCSSToken_Symbol == mType) && (mSymbol == aSymbol));
+  bool IsSymbol(PRUnichar aSymbol) const {
+    return mType == eCSSToken_Symbol && mSymbol == aSymbol;
   }
 
   void AppendToString(nsString& aBuffer) const;
 };
 
-// CSS Scanner API. Used to tokenize an input stream using the CSS
-// forward compatible tokenization rules. This implementation is
-// private to this package and is only used internally by the css
-// parser.
+// nsCSSScanner tokenizes an input stream using the CSS2.1 forward
+// compatible tokenization rules.  Used internally by nsCSSParser;
+// not available for use by other code.
 class nsCSSScanner {
   public:
   // |aLineNumber == 1| is the beginning of a file, use |aLineNumber == 0|
@@ -122,17 +154,23 @@ class nsCSSScanner {
   // the most recently processed token.
   nsDependentSubstring GetCurrentLine() const;
 
-  // Get the next token. Return false on EOF. aTokenResult
-  // is filled in with the data for the token.
-  bool Next(nsCSSToken& aTokenResult);
+  // Get the next token.  Return false on EOF.  aTokenResult is filled
+  // in with the data for the token.  If aSkipWS is true, skip over
+  // eCSSToken_Whitespace tokens rather than returning them.
+  bool Next(nsCSSToken& aTokenResult, bool aSkipWS);
 
-  // Get the next token that may be a string or unquoted URL
+  // Get the body of a URL token (everything after the 'url(').
+  // This is exposed for use by nsCSSParser::ParseMozDocumentRule,
+  // which, for historical reasons, must make additional function
+  // tokens behave like url().  Please do not add new uses to the
+  // parser.
   bool NextURL(nsCSSToken& aTokenResult);
 
-  // It's really ugly that we have to expose this, but it's the easiest
-  // way to do :nth-child() parsing sanely.  (In particular, in
-  // :nth-child(2n-1), "2n-1" is a dimension, and we need to push the
-  // "-1" back so we can read it again as a number.)
+  // This is exposed for use by nsCSSParser::ParsePseudoClassWithNthPairArg,
+  // because "2n-1" is a single DIMENSION token, and "n-1" is a single
+  // IDENT token, but the :nth() selector syntax wants to interpret
+  // them the same as "2n -1" and "n -1" respectively.  Please do not
+  // add new uses to the parser.
   void Pushback(PRUnichar aChar);
 
   // Starts recording the input stream from the current position.
@@ -150,20 +188,21 @@ protected:
   int32_t Peek();
   bool LookAhead(PRUnichar aChar);
   bool LookAheadOrEOF(PRUnichar aChar); // expect either aChar or EOF
-  void EatWhiteSpace();
 
-  bool ParseAndAppendEscape(nsString& aOutput, bool aInString);
-  bool ParseIdent(int32_t aChar, nsCSSToken& aResult);
-  bool ParseAtKeyword(nsCSSToken& aResult);
-  bool ParseNumber(int32_t aChar, nsCSSToken& aResult);
-  bool ParseRef(int32_t aChar, nsCSSToken& aResult);
-  bool ParseString(int32_t aChar, nsCSSToken& aResult);
-  bool ParseURange(int32_t aChar, nsCSSToken& aResult);
-  bool SkipCComment();
+  void SkipWhitespace();
+  void SkipComment();
 
+  bool GatherEscape(nsString& aOutput, bool aInString);
   bool GatherIdent(int32_t aChar, nsString& aIdent);
 
-  const PRUnichar *mReadPointer;
+  bool ScanIdent(int32_t aChar, nsCSSToken& aResult);
+  bool ScanAtKeyword(nsCSSToken& aResult);
+  bool ScanHash(int32_t aChar, nsCSSToken& aResult);
+  bool ScanNumber(int32_t aChar, nsCSSToken& aResult);
+  bool ScanString(int32_t aChar, nsCSSToken& aResult);
+  bool ScanURange(int32_t aChar, nsCSSToken& aResult);
+
+  const PRUnichar *mBuffer;
   uint32_t mOffset;
   uint32_t mCount;