Parse comments according to parsing.md

The tokenizer no longer generates comment tokens at all; comments are consumed and discarded inside the tokenizer, which makes the comment-handling states much simpler. R=eseidel@chromium.org Review URL: https://codereview.chromium.org/682893002
2026-03-30 11:09:55 -07:00 · 2014-10-27 17:04:52 -07:00
parent 36288fcde8
commit 710a717192
8 changed files with 54 additions and 176 deletions
@@ -83,12 +83,6 @@ public:
        return m_data;
    }

-    const String& comment() const
-    {
-        ASSERT(m_type == HTMLToken::Comment);
-        return m_data;
-    }
-
    explicit AtomicHTMLToken(HTMLToken& token)
        : m_type(token.type())
    {
@@ -109,7 +103,6 @@ public:
            break;
        }
        case HTMLToken::Character:
-        case HTMLToken::Comment:
            if (token.isAll8BitData())
                m_data = String::make8BitFrom16BitSource(token.data());
            else
@@ -141,7 +134,6 @@ public:
            m_name = AtomicString(token.data());
            break;
        case HTMLToken::Character:
-        case HTMLToken::Comment:
            m_data = token.data();
            break;
        }
@@ -175,7 +167,7 @@ private:
    // "name" for StartTag and EndTag
    AtomicString m_name;

-    // "data" for Comment, "characters" for Character
+    // "characters" for Character
    String m_data;

    // For StartTag and EndTag
@@ -59,7 +59,6 @@ CompactHTMLToken::CompactHTMLToken(const HTMLToken* token, const TextPosition& t
    case HTMLToken::EndTag:
        m_selfClosing = token->selfClosing();
        // Fall through!
-    case HTMLToken::Comment:
    case HTMLToken::Character: {
        m_isAll8BitData = token->isAll8BitData();
        m_data = attemptStaticStringCreation(token->data(), token->isAll8BitData() ? Force8Bit : Force16Bit);
@@ -50,7 +50,6 @@ public:
        Uninitialized,
        StartTag,
        EndTag,
-        Comment,
        Character,
        EndOfFile,
    };
@@ -114,7 +113,7 @@ public:

    const DataVector& data() const
    {
-        ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
+        ASSERT(m_type == Character || m_type == StartTag || m_type == EndTag);
        return m_data;
    }

@@ -298,28 +297,6 @@ public:
        m_data.appendVector(characters);
    }

-    /* Comment Tokens */
-
-    const DataVector& comment() const
-    {
-        ASSERT(m_type == Comment);
-        return m_data;
-    }
-
-    void beginComment()
-    {
-        ASSERT(m_type == Uninitialized);
-        m_type = Comment;
-    }
-
-    void appendToComment(UChar character)
-    {
-        ASSERT(character);
-        ASSERT(m_type == Comment);
-        m_data.append(character);
-        m_orAllData |= character;
-    }
-
 private:
    Type m_type;
    Attribute::Range m_range; // Always starts at zero.
@@ -235,21 +235,15 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)

    HTML_BEGIN_STATE(TagOpenState) {
        if (cc == '!')
-            HTML_ADVANCE_TO(MarkupDeclarationOpenState);
+            HTML_ADVANCE_TO(CommentStart1State);
        else if (cc == '/')
-            HTML_ADVANCE_TO(EndTagOpenState);
+            HTML_ADVANCE_TO(CloseTagState);
        else if (isASCIIUpper(cc)) {
            m_token->beginStartTag(toLowerCase(cc));
            HTML_ADVANCE_TO(TagNameState);
        } else if (isASCIILower(cc)) {
            m_token->beginStartTag(cc);
            HTML_ADVANCE_TO(TagNameState);
-        } else if (cc == '?') {
-            parseError();
-            // The spec consumes the current character before switching
-            // to the bogus comment state, but it's easier to implement
-            // if we reconsume the current character.
-            HTML_RECONSUME_IN(BogusCommentState);
        } else {
            parseError();
            bufferCharacter('<');
@@ -258,7 +252,7 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
    }
    END_STATE()

-    HTML_BEGIN_STATE(EndTagOpenState) {
+    HTML_BEGIN_STATE(CloseTagState) {
        if (isASCIIUpper(cc)) {
            m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
            m_appropriateEndTagName.clear();
@@ -268,16 +262,14 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
            m_appropriateEndTagName.clear();
            HTML_ADVANCE_TO(TagNameState);
        } else if (cc == '>') {
-            parseError();
+            bufferCharacter('<');
+            bufferCharacter('/');
+            bufferCharacter('>');
            HTML_ADVANCE_TO(DataState);
-        } else if (cc == kEndOfFileMarker) {
-            parseError();
+        } else {
            bufferCharacter('<');
            bufferCharacter('/');
            HTML_RECONSUME_IN(DataState);
-        } else {
-            parseError();
-            HTML_RECONSUME_IN(BogusCommentState);
        }
    }
    END_STATE()
@@ -571,144 +563,54 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
    }
    END_STATE()

-    HTML_BEGIN_STATE(BogusCommentState) {
-        m_token->beginComment();
-        HTML_RECONSUME_IN(ContinueBogusCommentState);
-    }
-    END_STATE()
-
-    HTML_BEGIN_STATE(ContinueBogusCommentState) {
-        if (cc == '>')
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        else if (cc == kEndOfFileMarker)
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        else {
-            m_token->appendToComment(cc);
-            HTML_ADVANCE_TO(ContinueBogusCommentState);
-        }
-    }
-    END_STATE()
-
-    HTML_BEGIN_STATE(MarkupDeclarationOpenState) {
+    HTML_BEGIN_STATE(CommentStart1State) {
        if (cc == '-') {
-            SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash);
-            if (result == SegmentedString::DidMatch) {
-                source.advanceAndASSERT('-');
-                source.advanceAndASSERT('-');
-                m_token->beginComment();
-                HTML_SWITCH_TO(CommentStartState);
-            } else if (result == SegmentedString::NotEnoughCharacters)
-                return haveBufferedCharacterToken();
-        }
-        parseError();
-        HTML_RECONSUME_IN(BogusCommentState);
-    }
-    END_STATE()
-
-    HTML_BEGIN_STATE(CommentStartState) {
-        if (cc == '-')
-            HTML_ADVANCE_TO(CommentStartDashState);
-        else if (cc == '>') {
-            parseError();
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        } else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
+            HTML_ADVANCE_TO(CommentStart2State);
        } else {
-            m_token->appendToComment(cc);
-            HTML_ADVANCE_TO(CommentState);
+            bufferCharacter('<');
+            bufferCharacter('!');
+            HTML_RECONSUME_IN(DataState);
        }
    }
    END_STATE()

-    HTML_BEGIN_STATE(CommentStartDashState) {
-        if (cc == '-')
-            HTML_ADVANCE_TO(CommentEndState);
-        else if (cc == '>') {
-            parseError();
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        } else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            m_token->appendToComment('-');
-            m_token->appendToComment(cc);
+    HTML_BEGIN_STATE(CommentStart2State) {
+        if (cc == '-') {
            HTML_ADVANCE_TO(CommentState);
+        } else {
+            bufferCharacter('<');
+            bufferCharacter('!');
+            bufferCharacter('-');
+            HTML_RECONSUME_IN(DataState);
        }
    }
    END_STATE()

    HTML_BEGIN_STATE(CommentState) {
        if (cc == '-')
-            HTML_ADVANCE_TO(CommentEndDashState);
-        else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            m_token->appendToComment(cc);
+            HTML_ADVANCE_TO(CommentEnd1State);
+        else
            HTML_ADVANCE_TO(CommentState);
-        }
    }
    END_STATE()

-    HTML_BEGIN_STATE(CommentEndDashState) {
+    HTML_BEGIN_STATE(CommentEnd1State) {
        if (cc == '-')
-            HTML_ADVANCE_TO(CommentEndState);
-        else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            m_token->appendToComment('-');
-            m_token->appendToComment(cc);
+            HTML_ADVANCE_TO(CommentEnd2State);
+        else
            HTML_ADVANCE_TO(CommentState);
-        }
    }
    END_STATE()

-    HTML_BEGIN_STATE(CommentEndState) {
-        if (cc == '>')
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        else if (cc == '!') {
-            parseError();
-            HTML_ADVANCE_TO(CommentEndBangState);
-        } else if (cc == '-') {
-            parseError();
-            m_token->appendToComment('-');
-            HTML_ADVANCE_TO(CommentEndState);
-        } else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            parseError();
-            m_token->appendToComment('-');
-            m_token->appendToComment('-');
-            m_token->appendToComment(cc);
+    HTML_BEGIN_STATE(CommentEnd2State) {
+        if (cc == '-')
+            HTML_ADVANCE_TO(CommentEnd2State);
+        else if (cc == '>')
+            HTML_ADVANCE_TO(DataState);
+        else
            HTML_ADVANCE_TO(CommentState);
-        }
    }
    END_STATE()
-
-    HTML_BEGIN_STATE(CommentEndBangState) {
-        if (cc == '-') {
-            m_token->appendToComment('-');
-            m_token->appendToComment('-');
-            m_token->appendToComment('!');
-            HTML_ADVANCE_TO(CommentEndDashState);
-        } else if (cc == '>')
-            return emitAndResumeIn(source, HTMLTokenizer::DataState);
-        else if (cc == kEndOfFileMarker) {
-            parseError();
-            return emitAndReconsumeIn(source, HTMLTokenizer::DataState);
-        } else {
-            m_token->appendToComment('-');
-            m_token->appendToComment('-');
-            m_token->appendToComment('!');
-            m_token->appendToComment(cc);
-            HTML_ADVANCE_TO(CommentState);
-        }
-    }
-    END_STATE()
-
    }

    ASSERT_NOT_REACHED();
@@ -49,7 +49,7 @@ public:
        CharacterReferenceInAttributeValueState,
        RAWTEXTState,
        TagOpenState,
-        EndTagOpenState,
+        CloseTagState,
        TagNameState,
        RAWTEXTLessThanSignState,
        RAWTEXTEndTagOpenState,
@@ -63,18 +63,11 @@ public:
        AttributeValueUnquotedState,
        AfterAttributeValueQuotedState,
        SelfClosingStartTagState,
-        BogusCommentState,
-        // The ContinueBogusCommentState is not in the HTML5 spec, but we use
-        // it internally to keep track of whether we've started the bogus
-        // comment token yet.
-        ContinueBogusCommentState,
-        MarkupDeclarationOpenState,
-        CommentStartState,
-        CommentStartDashState,
+        CommentStart1State,
+        CommentStart2State,
        CommentState,
-        CommentEndDashState,
-        CommentEndState,
-        CommentEndBangState,
+        CommentEnd1State,
+        CommentEnd2State,
    };

    // This function returns true if it emits a token. Otherwise, callers
@@ -128,8 +128,7 @@ void HTMLTreeBuilder::constructTree(AtomicHTMLToken* token)
    } else if (type == HTMLToken::EndOfFile) {
        processEndOfFile(token);
    } else {
-        // We ignore Comments.
-        ASSERT(type == HTMLToken::Comment);
+        ASSERT_NOT_REACHED();
    }

    m_tree.executeQueuedTasks();
@@ -0,0 +1 @@
+< <! <!- <!-> --> -> > </ </>
@@ -0,0 +1,15 @@
+<html>
+<link rel="import" href="../resources/dump-as-text.html" />
+<body>
+<
+<!
+<!-
+<!->
+<!-->aaa-->
+-->
+->
+>
+</
+</>
+</body>
+</html>