Bug 660747 - Reverting YARR begin characters optimization. r=dmandelin

2024-09-13 09:24:08 -07:00 · 2011-11-22 00:21:18 -05:00 · 2011-11-22 00:21:18 -05:00 · 6e53604b3e
commit 6e53604b3e
parent d19ea74612
4 changed files with 2 additions and 304 deletions
--- a/js/src/yarr/YarrInterpreter.cpp
+++ b/js/src/yarr/YarrInterpreter.cpp
@ -1046,39 +1046,10 @@ public:
        return JSRegExpErrorNoMatch;
    }

-    void lookupForBeginChars()
-    {
-        int character;
-        bool firstSingleCharFound;
-
-        while (true) {
-            if (input.isNotAvailableInput(2))
-                return;
-
-            firstSingleCharFound = false;
-
-            character = input.readPair();
-
-            for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
-                BeginChar bc = pattern->m_beginChars[i];
-
-                if (!firstSingleCharFound && bc.value <= 0xFFFF) {
-                    firstSingleCharFound = true;
-                    character &= 0xFFFF;
-                }
-
-                if ((character | bc.mask) == bc.value)
-                    return;
-            }
-
-            input.next();
-        }
-    }
-
 #define MATCH_NEXT() { ++context->term; goto matchAgain; }
 #define BACKTRACK() { --context->term; goto backtrack; }
 #define currentTerm() (disjunction->terms[context->term])
-    JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
+    JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
    {
        if (!--remainingMatchCount)
            return JSRegExpErrorHitLimit;
@ -1086,9 +1057,6 @@ public:
        if (btrack)
            BACKTRACK();

-        if (pattern->m_containsBeginChars && isBody)
-            lookupForBeginChars();
-
        context->matchBegin = input.getPos();
        context->term = 0;

@ -1266,9 +1234,6 @@ public:

            input.next();

-            if (pattern->m_containsBeginChars && isBody)
-                lookupForBeginChars();
-
            context->matchBegin = input.getPos();

            if (currentTerm().alternative.onceThrough)
@ -1397,7 +1362,7 @@ public:

        DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());

-        JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
+        JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
        if (result == JSRegExpMatch) {
            output[0] = context->matchBegin;
            output[1] = context->matchEnd;
--- a/js/src/yarr/YarrInterpreter.h
+++ b/js/src/yarr/YarrInterpreter.h
@ -335,7 +335,6 @@ public:
        : m_body(body)
        , m_ignoreCase(pattern.m_ignoreCase)
        , m_multiline(pattern.m_multiline)
-        , m_containsBeginChars(pattern.m_containsBeginChars)
        , m_allocator(allocator)
    {
        newlineCharacterClass = pattern.newlineCharacterClass();
@ -347,8 +346,6 @@ public:
        // array, so that it won't delete them on destruction.  We'll
        // take responsibility for that.
        pattern.m_userCharacterClasses.clear();
-
-        m_beginChars.append(pattern.m_beginChars);
    }

    ~BytecodePattern()
@ -360,7 +357,6 @@ public:
    OwnPtr<ByteDisjunction> m_body;
    bool m_ignoreCase;
    bool m_multiline;
-    bool m_containsBeginChars;
    // Each BytecodePattern is associated with a RegExp, each RegExp is associated
    // with a JSGlobalData.  Cache a pointer to out JSGlobalData's m_regExpAllocator.
    BumpPointerAllocator* m_allocator;
@ -368,8 +364,6 @@ public:
    CharacterClass* newlineCharacterClass;
    CharacterClass* wordcharCharacterClass;

-    Vector<BeginChar> m_beginChars;
-
 private:
    Vector<ByteDisjunction*> m_allParenthesesInfo;
    Vector<CharacterClass*> m_userCharacterClasses;
--- a/js/src/yarr/YarrPattern.cpp
+++ b/js/src/yarr/YarrPattern.cpp
@ -242,117 +242,11 @@ private:
    Vector<CharacterRange> m_rangesUnicode;
 };

-struct BeginCharHelper {
-    BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
-        : m_beginChars(beginChars)
-        , m_isCaseInsensitive(isCaseInsensitive)
-    {}
-
-    void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
-    {
-        if (quantityType == QuantifierFixedCount && quantityCount > 1) {
-            // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
-            beginChar.value |= beginChar.value << 16;
-            beginChar.mask |= beginChar.mask << 16;
-            addCharacter(beginChar);
-        } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
-            // In case of characters with fixed quantifier we should check the next character as well.
-            linkHotTerms(beginChar, hotTerms);
-        else
-            // In case of greedy matching the next character checking is unnecessary therefore we just store
-            // the first character.
-            addCharacter(beginChar);
-    }
-
-    // Merge two following BeginChars in the vector to reduce the number of character checks.
-    void merge(unsigned size)
-    {
-        for (unsigned i = 0; i < size; i++) {
-            BeginChar* curr = &m_beginChars->at(i);
-            BeginChar* next = &m_beginChars->at(i + 1);
-
-            // If the current and the next size of value is different we should skip the merge process
-            // because the 16bit and 32bit values are unmergable.
-            if (curr->value <= 0xFFFF && next->value > 0xFFFF)
-                continue;
-
-            unsigned diff = curr->value ^ next->value;
-
-            curr->mask |= diff;
-            curr->value |= curr->mask;
-
-            m_beginChars->remove(i + 1);
-            size--;
-        }
-    }
-
-private:
-    void addCharacter(BeginChar beginChar)
-    {
-        unsigned pos = 0;
-        unsigned range = m_beginChars->size();
-
-        // binary chop, find position to insert char.
-        while (range) {
-            unsigned index = range >> 1;
-
-            int val = m_beginChars->at(pos+index).value - beginChar.value;
-            if (!val)
-                return;
-            if (val < 0)
-                range = index;
-            else {
-                pos += (index+1);
-                range -= (index+1);
-            }
-        }
-
-        if (pos == m_beginChars->size())
-            m_beginChars->append(beginChar);
-        else
-            m_beginChars->insert(pos, beginChar);
-    }
-
-    // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
-    void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
-    {
-        for (unsigned i = 0; i < hotTerms->size(); i++) {
-            PatternTerm hotTerm = hotTerms->at(i).term;
-            ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
-
-            UChar characterNext = hotTerm.patternCharacter;
-
-            // Append a character to an existing BeginChar object.
-            if (characterNext <= 0x7f) {
-                unsigned mask = 0;
-
-                if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
-                    mask = 32;
-                    characterNext = toASCIILower(characterNext);
-                }
-
-                addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
-            } else {
-                UChar upper, lower;
-                if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
-                    addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
-                    addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
-                } else
-                    addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
-            }
-        }
-    }
-
-    Vector<BeginChar>* m_beginChars;
-    bool m_isCaseInsensitive;
-};
-
 class YarrPatternConstructor {
 public:
    YarrPatternConstructor(YarrPattern& pattern)
        : m_pattern(pattern)
        , m_characterClassConstructor(pattern.m_ignoreCase)
-        , m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
        , m_invertParentheticalAssertion(false)
    {
        m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
@ -789,144 +683,10 @@ public:
        }
    }

-    // This function collects the terms which are potentially matching the first number of depth characters in the result.
-    // If this function returns false then it found at least one term which makes the beginning character
-    // look-up optimization inefficient.
-    bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
-    {
-        for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
-            PatternAlternative* alternative = disjunction->m_alternatives[alt];
-
-            if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
-                return false;
-        }
-
-        return true;
-    }
-
-    bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
-    {
-        bool checkNext = true;
-        unsigned numTerms = alternative->m_terms.size();
-
-        while (checkNext && termIndex < numTerms) {
-            PatternTerm term = alternative->m_terms[termIndex];
-            checkNext = false;
-
-            switch (term.type) {
-            case PatternTerm::TypeAssertionBOL:
-            case PatternTerm::TypeAssertionEOL:
-            case PatternTerm::TypeAssertionWordBoundary:
-                return false;
-
-            case PatternTerm::TypeBackReference:
-            case PatternTerm::TypeForwardReference:
-                return false;
-
-            case PatternTerm::TypePatternCharacter:
-                if (termIndex != numTerms - 1) {
-                    beginTerms->append(TermChain(term));
-                    termIndex++;
-                    checkNext = true;
-                } else if (term.quantityType == QuantifierFixedCount) {
-                    beginTerms->append(TermChain(term));
-                    if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
-                        if (!setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1))
-                            return false;
-                }
-
-                break;
-
-            case PatternTerm::TypeCharacterClass:
-                return false;
-
-            case PatternTerm::TypeParentheticalAssertion:
-                if (term.invert())
-                    return false;
-
-            case PatternTerm::TypeParenthesesSubpattern:
-                if (term.quantityType != QuantifierFixedCount) {
-                    if (termIndex == numTerms - 1)
-                        break;
-
-                    termIndex++;
-                    checkNext = true;
-                }
-
-                if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
-                    return false;
-
-                break;
-            }
-        }
-
-        return true;
-    }
-
-    void setupBeginChars()
-    {
-        Vector<TermChain> beginTerms;
-        bool containsFixedCharacter = false;
-
-        if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
-                && setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
-            unsigned size = beginTerms.size();
-
-            // If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
-            if (!size)
-                return;
-
-            m_pattern.m_containsBeginChars = true;
-
-            for (unsigned i = 0; i < size; i++) {
-                PatternTerm term = beginTerms[i].term;
-
-                // We have just collected PatternCharacter terms, other terms are not allowed.
-                ASSERT(term.type == PatternTerm::TypePatternCharacter);
-
-                if (term.quantityType == QuantifierFixedCount)
-                    containsFixedCharacter = true;
-
-                UChar character = term.patternCharacter;
-                unsigned mask = 0;
-
-                if (character <= 0x7f) {
-                    if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
-                        mask = 32;
-                        character = toASCIILower(character);
-                    }
-
-                    m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
-                } else {
-                    UChar upper, lower;
-                    if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
-                        m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
-                        m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
-                    } else
-                        m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
-                }
-            }
-
-            // If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
-            if (!containsFixedCharacter) {
-                m_pattern.m_containsBeginChars = false;
-                return;
-            }
-
-            size = m_pattern.m_beginChars.size();
-
-            if (size > 2)
-                m_beginCharHelper.merge(size - 1);
-            else if (size <= 1)
-                m_pattern.m_containsBeginChars = false;
-        }
-    }
-
 private:
    YarrPattern& m_pattern;
    PatternAlternative* m_alternative;
    CharacterClassConstructor m_characterClassConstructor;
-    BeginCharHelper m_beginCharHelper;
    bool m_invertCharacterClass;
    bool m_invertParentheticalAssertion;
 };
@ -959,7 +719,6 @@ ErrorCode YarrPattern::compile(const UString& patternString)
    constructor.optimizeBOL();
        
    constructor.setupOffsets();
-    constructor.setupBeginChars();

    return NoError;
 }
@ -968,7 +727,6 @@ YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline
    : m_ignoreCase(ignoreCase)
    , m_multiline(multiline)
    , m_containsBackreferences(false)
-    , m_containsBeginChars(false)
    , m_containsBOL(false)
    , m_numSubpatterns(0)
    , m_maxBackReference(0)
--- a/js/src/yarr/YarrPattern.h
+++ b/js/src/yarr/YarrPattern.h
@ -332,21 +332,6 @@ struct TermChain {
    Vector<TermChain> hotTerms;
 };

-struct BeginChar {
-    BeginChar()
-        : value(0)
-        , mask(0)
-    {}
-
-    BeginChar(unsigned value, unsigned mask)
-        : value(value)
-        , mask(mask)
-    {}
-
-    unsigned value;
-    unsigned mask;
-};
-
 struct YarrPattern {
    YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, ErrorCode* error);

@ -362,7 +347,6 @@ struct YarrPattern {
        m_maxBackReference = 0;

        m_containsBackreferences = false;
-        m_containsBeginChars = false;
        m_containsBOL = false;

        newlineCached = 0;
@ -377,7 +361,6 @@ struct YarrPattern {
        m_disjunctions.clear();
        deleteAllValues(m_userCharacterClasses);
        m_userCharacterClasses.clear();
-        m_beginChars.clear();
    }

    bool containsIllegalBackReference()
@ -431,14 +414,12 @@ struct YarrPattern {
    bool m_ignoreCase : 1;
    bool m_multiline : 1;
    bool m_containsBackreferences : 1;
-    bool m_containsBeginChars : 1;
    bool m_containsBOL : 1;
    unsigned m_numSubpatterns;
    unsigned m_maxBackReference;
    PatternDisjunction* m_body;
    Vector<PatternDisjunction*, 4> m_disjunctions;
    Vector<CharacterClass*> m_userCharacterClasses;
-    Vector<BeginChar> m_beginChars;

 private:
    ErrorCode compile(const UString& patternString);