Bug 660747 - Reverting YARR begin characters optimization. r=dmandelin

This commit is contained in:
Andrew Paprocki 2011-11-22 00:21:18 -05:00
parent d19ea74612
commit 6e53604b3e
4 changed files with 2 additions and 304 deletions

View File

@ -1046,39 +1046,10 @@ public:
return JSRegExpErrorNoMatch;
}
void lookupForBeginChars()
{
int character;
bool firstSingleCharFound;
while (true) {
if (input.isNotAvailableInput(2))
return;
firstSingleCharFound = false;
character = input.readPair();
for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
BeginChar bc = pattern->m_beginChars[i];
if (!firstSingleCharFound && bc.value <= 0xFFFF) {
firstSingleCharFound = true;
character &= 0xFFFF;
}
if ((character | bc.mask) == bc.value)
return;
}
input.next();
}
}
// Advance context->term to the following term and jump to the matchAgain label.
#define MATCH_NEXT() { ++context->term; goto matchAgain; }
// Step context->term back to the preceding term and jump to the backtrack label.
#define BACKTRACK() { --context->term; goto backtrack; }
// The entry of disjunction->terms selected by context->term. Expects
// `disjunction` and `context` to be in scope where the macro is expanded.
#define currentTerm() (disjunction->terms[context->term])
JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
{
if (!--remainingMatchCount)
return JSRegExpErrorHitLimit;
@ -1086,9 +1057,6 @@ public:
if (btrack)
BACKTRACK();
if (pattern->m_containsBeginChars && isBody)
lookupForBeginChars();
context->matchBegin = input.getPos();
context->term = 0;
@ -1266,9 +1234,6 @@ public:
input.next();
if (pattern->m_containsBeginChars && isBody)
lookupForBeginChars();
context->matchBegin = input.getPos();
if (currentTerm().alternative.onceThrough)
@ -1397,7 +1362,7 @@ public:
DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
if (result == JSRegExpMatch) {
output[0] = context->matchBegin;
output[1] = context->matchEnd;

View File

@ -335,7 +335,6 @@ public:
: m_body(body)
, m_ignoreCase(pattern.m_ignoreCase)
, m_multiline(pattern.m_multiline)
, m_containsBeginChars(pattern.m_containsBeginChars)
, m_allocator(allocator)
{
newlineCharacterClass = pattern.newlineCharacterClass();
@ -347,8 +346,6 @@ public:
// array, so that it won't delete them on destruction. We'll
// take responsibility for that.
pattern.m_userCharacterClasses.clear();
m_beginChars.append(pattern.m_beginChars);
}
~BytecodePattern()
@ -360,7 +357,6 @@ public:
OwnPtr<ByteDisjunction> m_body;
bool m_ignoreCase;
bool m_multiline;
bool m_containsBeginChars;
// Each BytecodePattern is associated with a RegExp, each RegExp is associated
// with a JSGlobalData. Cache a pointer to our JSGlobalData's m_regExpAllocator.
BumpPointerAllocator* m_allocator;
@ -368,8 +364,6 @@ public:
CharacterClass* newlineCharacterClass;
CharacterClass* wordcharCharacterClass;
Vector<BeginChar> m_beginChars;
private:
Vector<ByteDisjunction*> m_allParenthesesInfo;
Vector<CharacterClass*> m_userCharacterClasses;

View File

@ -242,117 +242,11 @@ private:
Vector<CharacterRange> m_rangesUnicode;
};
// Builds the list of BeginChar entries used by the beginning-character
// look-up. Entries are kept sorted by value in descending order, so every
// entry whose value exceeds 0xFFFF (two packed characters) precedes every
// entry whose value fits in 16 bits (a single character).
struct BeginCharHelper {
    // beginChars: the vector that receives the entries (not owned).
    // isCaseInsensitive: whether characters taken from hot terms are folded.
    BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
        : m_beginChars(beginChars)
        , m_isCaseInsensitive(isCaseInsensitive)
    {}

    // Records beginChar, optionally widened to a two-character entry depending
    // on the quantifier of the term it came from and on its hot terms.
    void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
    {
        if (quantityType == QuantifierFixedCount && quantityCount > 1) {
            // We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
            beginChar.value |= beginChar.value << 16;
            beginChar.mask |= beginChar.mask << 16;
            addCharacter(beginChar);
        } else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
            // In case of characters with fixed quantifier we should check the next character as well.
            linkHotTerms(beginChar, hotTerms);
        else
            // In case of greedy matching the next character checking is unnecessary therefore we just store
            // the first character.
            addCharacter(beginChar);
    }

    // Merge two following BeginChars in the vector to reduce the number of character checks.
    // size is the number of adjacent pairs to consider (the entry count minus one).
    void merge(unsigned size)
    {
        for (unsigned i = 0; i < size; i++) {
            BeginChar* curr = &m_beginChars->at(i);
            BeginChar* next = &m_beginChars->at(i + 1);

            // A two-character (32bit) entry and a single-character (16bit)
            // entry are unmergable. The list is sorted in descending order, so
            // the wide entry is the first of such a pair; comparing the widths
            // for inequality covers either arrangement.
            if ((curr->value > 0xFFFF) != (next->value > 0xFFFF))
                continue;

            // The merged entry has to accept everything either entry accepted:
            // wildcard every bit in which the two values differ and keep the
            // wildcard bits of *both* entries. Leaving out next->mask would
            // reject input that next alone accepted.
            curr->mask |= (curr->value ^ next->value) | next->mask;
            curr->value |= curr->mask;

            m_beginChars->remove(i + 1);
            size--;
        }
    }

private:
    // Inserts beginChar at its sorted position (descending by value). An entry
    // whose value is already present is left untouched.
    void addCharacter(BeginChar beginChar)
    {
        unsigned pos = 0;
        unsigned range = m_beginChars->size();

        // binary chop, find position to insert char.
        while (range) {
            unsigned index = range >> 1;
            // Compare the unsigned values directly. Taking their difference as
            // an int wraps once the values are 2^31 or more apart, which puts
            // two-character entries with a high second character after the
            // single-character entries.
            unsigned existing = m_beginChars->at(pos + index).value;

            if (existing == beginChar.value)
                return;

            if (existing < beginChar.value)
                range = index;
            else {
                pos += (index + 1);
                range -= (index + 1);
            }
        }

        if (pos == m_beginChars->size())
            m_beginChars->append(beginChar);
        else
            m_beginChars->insert(pos, beginChar);
    }

    // Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
    void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
    {
        for (unsigned i = 0; i < hotTerms->size(); i++) {
            PatternTerm hotTerm = hotTerms->at(i).term;
            ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);

            UChar characterNext = hotTerm.patternCharacter;

            // Append a character to an existing BeginChar object. The shifts
            // below are done on unsigned operands so that a character with its
            // top bit set is never shifted as a (promoted) signed int.
            if (characterNext <= 0x7f) {
                unsigned mask = 0;

                if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
                    mask = 32;
                    characterNext = toASCIILower(characterNext);
                }

                addCharacter(BeginChar(beginChar.value | (static_cast<unsigned>(characterNext) << 16), beginChar.mask | (mask << 16)));
            } else {
                UChar upper, lower;
                if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
                    addCharacter(BeginChar(beginChar.value | (static_cast<unsigned>(upper) << 16), beginChar.mask));
                    addCharacter(BeginChar(beginChar.value | (static_cast<unsigned>(lower) << 16), beginChar.mask));
                } else
                    addCharacter(BeginChar(beginChar.value | (static_cast<unsigned>(characterNext) << 16), beginChar.mask));
            }
        }
    }

    Vector<BeginChar>* m_beginChars;
    bool m_isCaseInsensitive;
};
class YarrPatternConstructor {
public:
YarrPatternConstructor(YarrPattern& pattern)
: m_pattern(pattern)
, m_characterClassConstructor(pattern.m_ignoreCase)
, m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
, m_invertParentheticalAssertion(false)
{
m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
@ -789,144 +683,10 @@ public:
}
}
// Gathers into beginTerms, for every alternative of disjunction, the terms
// that can match at the very start of a result (depth is forwarded unchanged
// to setupAlternativeBeginTerms). Returns false as soon as one alternative
// contains a term that makes the beginning character look-up optimization
// inefficient; returns true when all alternatives were processed.
bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
{
    unsigned index = 0;
    while (index < disjunction->m_alternatives.size()) {
        PatternAlternative* candidate = disjunction->m_alternatives[index];
        const bool usable = setupAlternativeBeginTerms(candidate, beginTerms, 0, depth);
        if (!usable)
            return false;
        ++index;
    }

    return true;
}
// Walks the terms of alternative, starting at termIndex, and appends to
// beginTerms the TypePatternCharacter terms it accepts. Returns false when it
// meets a term type it rejects (BOL/EOL/word-boundary assertions, back and
// forward references, character classes, inverted parenthetical assertions)
// or when a nested call returns false; returns true otherwise.
bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
{
// checkNext is set by a case that wants the following term examined too; the
// loop stops after the first term that does not set it.
bool checkNext = true;
unsigned numTerms = alternative->m_terms.size();
while (checkNext && termIndex < numTerms) {
PatternTerm term = alternative->m_terms[termIndex];
checkNext = false;
switch (term.type) {
case PatternTerm::TypeAssertionBOL:
case PatternTerm::TypeAssertionEOL:
case PatternTerm::TypeAssertionWordBoundary:
return false;
case PatternTerm::TypeBackReference:
case PatternTerm::TypeForwardReference:
return false;
case PatternTerm::TypePatternCharacter:
// Any character that is not the last term is recorded and the next term is
// examined as well.
if (termIndex != numTerms - 1) {
beginTerms->append(TermChain(term));
termIndex++;
checkNext = true;
} else if (term.quantityType == QuantifierFixedCount) {
// Last term of the alternative: recorded only when its quantifier is fixed.
beginTerms->append(TermChain(term));
// NOTE(review): this branch is only reached when termIndex == numTerms - 1,
// so `termIndex < numTerms - 1` cannot hold here and the recursive hotTerms
// collection looks unreachable - confirm the intended condition.
if (depth < 2 && termIndex < numTerms - 1 && term.quantityCount == 1)
if (!setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1))
return false;
}
break;
case PatternTerm::TypeCharacterClass:
return false;
case PatternTerm::TypeParentheticalAssertion:
if (term.invert())
return false;
// A non-inverted assertion falls through to the subpattern handling below.
case PatternTerm::TypeParenthesesSubpattern:
if (term.quantityType != QuantifierFixedCount) {
// A subpattern without a fixed quantifier contributes nothing when it is the
// last term; otherwise the term after it is examined as well.
if (termIndex == numTerms - 1)
break;
termIndex++;
checkNext = true;
}
// Collect the begin terms of the nested disjunction at the same depth.
if (!setupDisjunctionBeginTerms(term.parentheses.disjunction, beginTerms, depth))
return false;
break;
}
}
return true;
}
void setupBeginChars()
{
Vector<TermChain> beginTerms;
bool containsFixedCharacter = false;
if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
&& setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
unsigned size = beginTerms.size();
// If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
if (!size)
return;
m_pattern.m_containsBeginChars = true;
for (unsigned i = 0; i < size; i++) {
PatternTerm term = beginTerms[i].term;
// We have just collected PatternCharacter terms, other terms are not allowed.
ASSERT(term.type == PatternTerm::TypePatternCharacter);
if (term.quantityType == QuantifierFixedCount)
containsFixedCharacter = true;
UChar character = term.patternCharacter;
unsigned mask = 0;
if (character <= 0x7f) {
if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
mask = 32;
character = toASCIILower(character);
}
m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
} else {
UChar upper, lower;
if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
} else
m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
}
}
// If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
if (!containsFixedCharacter) {
m_pattern.m_containsBeginChars = false;
return;
}
size = m_pattern.m_beginChars.size();
if (size > 2)
m_beginCharHelper.merge(size - 1);
else if (size <= 1)
m_pattern.m_containsBeginChars = false;
}
}
private:
YarrPattern& m_pattern;
PatternAlternative* m_alternative;
CharacterClassConstructor m_characterClassConstructor;
BeginCharHelper m_beginCharHelper;
bool m_invertCharacterClass;
bool m_invertParentheticalAssertion;
};
@ -959,7 +719,6 @@ ErrorCode YarrPattern::compile(const UString& patternString)
constructor.optimizeBOL();
constructor.setupOffsets();
constructor.setupBeginChars();
return NoError;
}
@ -968,7 +727,6 @@ YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline
: m_ignoreCase(ignoreCase)
, m_multiline(multiline)
, m_containsBackreferences(false)
, m_containsBeginChars(false)
, m_containsBOL(false)
, m_numSubpatterns(0)
, m_maxBackReference(0)

View File

@ -332,21 +332,6 @@ struct TermChain {
Vector<TermChain> hotTerms;
};
// One entry of the beginning-character table: a value to compare against and
// a mask of bits that are OR-ed into the input before the comparison
// (lookupForBeginChars accepts input when (input | mask) == value).
struct BeginChar {
    unsigned value;
    unsigned mask;

    // An empty entry: both fields zero.
    BeginChar()
        : value(0)
        , mask(0)
    {
    }

    // An entry with the given comparison value and wildcard mask.
    BeginChar(unsigned initialValue, unsigned initialMask)
        : value(initialValue)
        , mask(initialMask)
    {
    }
};
struct YarrPattern {
YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, ErrorCode* error);
@ -362,7 +347,6 @@ struct YarrPattern {
m_maxBackReference = 0;
m_containsBackreferences = false;
m_containsBeginChars = false;
m_containsBOL = false;
newlineCached = 0;
@ -377,7 +361,6 @@ struct YarrPattern {
m_disjunctions.clear();
deleteAllValues(m_userCharacterClasses);
m_userCharacterClasses.clear();
m_beginChars.clear();
}
bool containsIllegalBackReference()
@ -431,14 +414,12 @@ struct YarrPattern {
bool m_ignoreCase : 1;
bool m_multiline : 1;
bool m_containsBackreferences : 1;
bool m_containsBeginChars : 1;
bool m_containsBOL : 1;
unsigned m_numSubpatterns;
unsigned m_maxBackReference;
PatternDisjunction* m_body;
Vector<PatternDisjunction*, 4> m_disjunctions;
Vector<CharacterClass*> m_userCharacterClasses;
Vector<BeginChar> m_beginChars;
private:
ErrorCode compile(const UString& patternString);