mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Bug 660747 - Reverting YARR begin characters optimization. r=dmandelin
This commit is contained in:
parent
d19ea74612
commit
6e53604b3e
@ -1046,39 +1046,10 @@ public:
|
||||
return JSRegExpErrorNoMatch;
|
||||
}
|
||||
|
||||
void lookupForBeginChars()
|
||||
{
|
||||
int character;
|
||||
bool firstSingleCharFound;
|
||||
|
||||
while (true) {
|
||||
if (input.isNotAvailableInput(2))
|
||||
return;
|
||||
|
||||
firstSingleCharFound = false;
|
||||
|
||||
character = input.readPair();
|
||||
|
||||
for (unsigned i = 0; i < pattern->m_beginChars.size(); ++i) {
|
||||
BeginChar bc = pattern->m_beginChars[i];
|
||||
|
||||
if (!firstSingleCharFound && bc.value <= 0xFFFF) {
|
||||
firstSingleCharFound = true;
|
||||
character &= 0xFFFF;
|
||||
}
|
||||
|
||||
if ((character | bc.mask) == bc.value)
|
||||
return;
|
||||
}
|
||||
|
||||
input.next();
|
||||
}
|
||||
}
|
||||
|
||||
#define MATCH_NEXT() { ++context->term; goto matchAgain; }
|
||||
#define BACKTRACK() { --context->term; goto backtrack; }
|
||||
#define currentTerm() (disjunction->terms[context->term])
|
||||
JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false, bool isBody = false)
|
||||
JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
|
||||
{
|
||||
if (!--remainingMatchCount)
|
||||
return JSRegExpErrorHitLimit;
|
||||
@ -1086,9 +1057,6 @@ public:
|
||||
if (btrack)
|
||||
BACKTRACK();
|
||||
|
||||
if (pattern->m_containsBeginChars && isBody)
|
||||
lookupForBeginChars();
|
||||
|
||||
context->matchBegin = input.getPos();
|
||||
context->term = 0;
|
||||
|
||||
@ -1266,9 +1234,6 @@ public:
|
||||
|
||||
input.next();
|
||||
|
||||
if (pattern->m_containsBeginChars && isBody)
|
||||
lookupForBeginChars();
|
||||
|
||||
context->matchBegin = input.getPos();
|
||||
|
||||
if (currentTerm().alternative.onceThrough)
|
||||
@ -1397,7 +1362,7 @@ public:
|
||||
|
||||
DisjunctionContext* context = allocDisjunctionContext(pattern->m_body.get());
|
||||
|
||||
JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false, true);
|
||||
JSRegExpResult result = matchDisjunction(pattern->m_body.get(), context, false);
|
||||
if (result == JSRegExpMatch) {
|
||||
output[0] = context->matchBegin;
|
||||
output[1] = context->matchEnd;
|
||||
|
@ -335,7 +335,6 @@ public:
|
||||
: m_body(body)
|
||||
, m_ignoreCase(pattern.m_ignoreCase)
|
||||
, m_multiline(pattern.m_multiline)
|
||||
, m_containsBeginChars(pattern.m_containsBeginChars)
|
||||
, m_allocator(allocator)
|
||||
{
|
||||
newlineCharacterClass = pattern.newlineCharacterClass();
|
||||
@ -347,8 +346,6 @@ public:
|
||||
// array, so that it won't delete them on destruction. We'll
|
||||
// take responsibility for that.
|
||||
pattern.m_userCharacterClasses.clear();
|
||||
|
||||
m_beginChars.append(pattern.m_beginChars);
|
||||
}
|
||||
|
||||
~BytecodePattern()
|
||||
@ -360,7 +357,6 @@ public:
|
||||
OwnPtr<ByteDisjunction> m_body;
|
||||
bool m_ignoreCase;
|
||||
bool m_multiline;
|
||||
bool m_containsBeginChars;
|
||||
// Each BytecodePattern is associated with a RegExp, each RegExp is associated
|
||||
// with a JSGlobalData. Cache a pointer to our JSGlobalData's m_regExpAllocator.
|
||||
BumpPointerAllocator* m_allocator;
|
||||
@ -368,8 +364,6 @@ public:
|
||||
CharacterClass* newlineCharacterClass;
|
||||
CharacterClass* wordcharCharacterClass;
|
||||
|
||||
Vector<BeginChar> m_beginChars;
|
||||
|
||||
private:
|
||||
Vector<ByteDisjunction*> m_allParenthesesInfo;
|
||||
Vector<CharacterClass*> m_userCharacterClasses;
|
||||
|
@ -242,117 +242,11 @@ private:
|
||||
Vector<CharacterRange> m_rangesUnicode;
|
||||
};
|
||||
|
||||
struct BeginCharHelper {
|
||||
BeginCharHelper(Vector<BeginChar>* beginChars, bool isCaseInsensitive = false)
|
||||
: m_beginChars(beginChars)
|
||||
, m_isCaseInsensitive(isCaseInsensitive)
|
||||
{}
|
||||
|
||||
void addBeginChar(BeginChar beginChar, Vector<TermChain>* hotTerms, QuantifierType quantityType, unsigned quantityCount)
|
||||
{
|
||||
if (quantityType == QuantifierFixedCount && quantityCount > 1) {
|
||||
// We duplicate the first found character if the quantity of the term is more than one. eg.: /a{3}/
|
||||
beginChar.value |= beginChar.value << 16;
|
||||
beginChar.mask |= beginChar.mask << 16;
|
||||
addCharacter(beginChar);
|
||||
} else if (quantityType == QuantifierFixedCount && quantityCount == 1 && hotTerms->size())
|
||||
// In case of characters with fixed quantifier we should check the next character as well.
|
||||
linkHotTerms(beginChar, hotTerms);
|
||||
else
|
||||
// In case of greedy matching the next character checking is unnecessary therefore we just store
|
||||
// the first character.
|
||||
addCharacter(beginChar);
|
||||
}
|
||||
|
||||
// Merge two following BeginChars in the vector to reduce the number of character checks.
|
||||
void merge(unsigned size)
|
||||
{
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
BeginChar* curr = &m_beginChars->at(i);
|
||||
BeginChar* next = &m_beginChars->at(i + 1);
|
||||
|
||||
// If the current and the next size of value is different we should skip the merge process
|
||||
// because the 16bit and 32bit values are unmergable.
|
||||
if (curr->value <= 0xFFFF && next->value > 0xFFFF)
|
||||
continue;
|
||||
|
||||
unsigned diff = curr->value ^ next->value;
|
||||
|
||||
curr->mask |= diff;
|
||||
curr->value |= curr->mask;
|
||||
|
||||
m_beginChars->remove(i + 1);
|
||||
size--;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void addCharacter(BeginChar beginChar)
|
||||
{
|
||||
unsigned pos = 0;
|
||||
unsigned range = m_beginChars->size();
|
||||
|
||||
// binary chop, find position to insert char.
|
||||
while (range) {
|
||||
unsigned index = range >> 1;
|
||||
|
||||
int val = m_beginChars->at(pos+index).value - beginChar.value;
|
||||
if (!val)
|
||||
return;
|
||||
if (val < 0)
|
||||
range = index;
|
||||
else {
|
||||
pos += (index+1);
|
||||
range -= (index+1);
|
||||
}
|
||||
}
|
||||
|
||||
if (pos == m_beginChars->size())
|
||||
m_beginChars->append(beginChar);
|
||||
else
|
||||
m_beginChars->insert(pos, beginChar);
|
||||
}
|
||||
|
||||
// Create BeginChar objects by appending each terms from a hotTerms vector to an existing BeginChar object.
|
||||
void linkHotTerms(BeginChar beginChar, Vector<TermChain>* hotTerms)
|
||||
{
|
||||
for (unsigned i = 0; i < hotTerms->size(); i++) {
|
||||
PatternTerm hotTerm = hotTerms->at(i).term;
|
||||
ASSERT(hotTerm.type == PatternTerm::TypePatternCharacter);
|
||||
|
||||
UChar characterNext = hotTerm.patternCharacter;
|
||||
|
||||
// Append a character to an existing BeginChar object.
|
||||
if (characterNext <= 0x7f) {
|
||||
unsigned mask = 0;
|
||||
|
||||
if (m_isCaseInsensitive && isASCIIAlpha(characterNext)) {
|
||||
mask = 32;
|
||||
characterNext = toASCIILower(characterNext);
|
||||
}
|
||||
|
||||
addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask | (mask << 16)));
|
||||
} else {
|
||||
UChar upper, lower;
|
||||
if (m_isCaseInsensitive && ((upper = Unicode::toUpper(characterNext)) != (lower = Unicode::toLower(characterNext)))) {
|
||||
addCharacter(BeginChar(beginChar.value | (upper << 16), beginChar.mask));
|
||||
addCharacter(BeginChar(beginChar.value | (lower << 16), beginChar.mask));
|
||||
} else
|
||||
addCharacter(BeginChar(beginChar.value | (characterNext << 16), beginChar.mask));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Vector<BeginChar>* m_beginChars;
|
||||
bool m_isCaseInsensitive;
|
||||
};
|
||||
|
||||
class YarrPatternConstructor {
|
||||
public:
|
||||
YarrPatternConstructor(YarrPattern& pattern)
|
||||
: m_pattern(pattern)
|
||||
, m_characterClassConstructor(pattern.m_ignoreCase)
|
||||
, m_beginCharHelper(&pattern.m_beginChars, pattern.m_ignoreCase)
|
||||
, m_invertParentheticalAssertion(false)
|
||||
{
|
||||
m_pattern.m_body = js::OffTheBooks::new_<PatternDisjunction>();
|
||||
@ -789,144 +683,10 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
// This function collects the terms which are potentially matching the first number of depth characters in the result.
|
||||
// If this function returns false then it found at least one term which makes the beginning character
|
||||
// look-up optimization inefficient.
|
||||
bool setupDisjunctionBeginTerms(PatternDisjunction* disjunction, Vector<TermChain>* beginTerms, unsigned depth)
|
||||
{
|
||||
for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
|
||||
PatternAlternative* alternative = disjunction->m_alternatives[alt];
|
||||
|
||||
if (!setupAlternativeBeginTerms(alternative, beginTerms, 0, depth))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Scans the terms of `alternative`, starting at `termIndex`, and appends to
// `beginTerms` the pattern-character terms that may match the start of a
// result. Parenthesised groups are descended into via
// setupDisjunctionBeginTerms(). `depth` bounds the chaining of follow-up
// ("hot") terms.
// Returns false when a term is found that rules out the beginning-character
// optimization (assertions, back/forward references, character classes,
// inverted parenthetical assertions, or a nested scan that fails).
bool setupAlternativeBeginTerms(PatternAlternative* alternative, Vector<TermChain>* beginTerms, unsigned termIndex, unsigned depth)
{
    const unsigned termCount = alternative->m_terms.size();
    bool keepScanning = true;

    while (keepScanning && termIndex < termCount) {
        PatternTerm current = alternative->m_terms[termIndex];
        const bool isLastTerm = termIndex == termCount - 1;

        // Scanning stops after this term unless a case below re-arms it.
        keepScanning = false;

        switch (current.type) {
        // None of these term kinds can be expressed as a begin character.
        case PatternTerm::TypeAssertionBOL:
        case PatternTerm::TypeAssertionEOL:
        case PatternTerm::TypeAssertionWordBoundary:
        case PatternTerm::TypeBackReference:
        case PatternTerm::TypeForwardReference:
        case PatternTerm::TypeCharacterClass:
            return false;

        case PatternTerm::TypePatternCharacter:
            if (!isLastTerm) {
                beginTerms->append(TermChain(current));
                termIndex++;
                keepScanning = true;
            } else if (current.quantityType == QuantifierFixedCount) {
                beginTerms->append(TermChain(current));
                // NOTE(review): this branch is only reached for the last
                // term, so `termIndex < termCount - 1` looks like it can
                // never hold here and the hot-term recursion never runs --
                // confirm the intended branch order.
                if (depth < 2 && termIndex < termCount - 1 && current.quantityCount == 1) {
                    if (!setupAlternativeBeginTerms(alternative, &beginTerms->last().hotTerms, termIndex + 1, depth + 1))
                        return false;
                }
            }
            break;

        case PatternTerm::TypeParentheticalAssertion:
            if (current.invert())
                return false;
            // Fall through: a non-inverted assertion is scanned like a group.

        case PatternTerm::TypeParenthesesSubpattern:
            if (current.quantityType != QuantifierFixedCount) {
                // A group without a fixed count that ends the alternative
                // contributes nothing; otherwise the term after it is
                // scanned as well.
                if (isLastTerm)
                    break;

                termIndex++;
                keepScanning = true;
            }

            if (!setupDisjunctionBeginTerms(current.parentheses.disjunction, beginTerms, depth))
                return false;

            break;
        }
    }

    return true;
}
|
||||
|
||||
void setupBeginChars()
|
||||
{
|
||||
Vector<TermChain> beginTerms;
|
||||
bool containsFixedCharacter = false;
|
||||
|
||||
if ((!m_pattern.m_body->m_hasFixedSize || m_pattern.m_body->m_alternatives.size() > 1)
|
||||
&& setupDisjunctionBeginTerms(m_pattern.m_body, &beginTerms, 0)) {
|
||||
unsigned size = beginTerms.size();
|
||||
|
||||
// If we haven't collected any terms we should abort the preparation of beginning character look-up optimization.
|
||||
if (!size)
|
||||
return;
|
||||
|
||||
m_pattern.m_containsBeginChars = true;
|
||||
|
||||
for (unsigned i = 0; i < size; i++) {
|
||||
PatternTerm term = beginTerms[i].term;
|
||||
|
||||
// We have just collected PatternCharacter terms, other terms are not allowed.
|
||||
ASSERT(term.type == PatternTerm::TypePatternCharacter);
|
||||
|
||||
if (term.quantityType == QuantifierFixedCount)
|
||||
containsFixedCharacter = true;
|
||||
|
||||
UChar character = term.patternCharacter;
|
||||
unsigned mask = 0;
|
||||
|
||||
if (character <= 0x7f) {
|
||||
if (m_pattern.m_ignoreCase && isASCIIAlpha(character)) {
|
||||
mask = 32;
|
||||
character = toASCIILower(character);
|
||||
}
|
||||
|
||||
m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
|
||||
} else {
|
||||
UChar upper, lower;
|
||||
if (m_pattern.m_ignoreCase && ((upper = Unicode::toUpper(character)) != (lower = Unicode::toLower(character)))) {
|
||||
m_beginCharHelper.addBeginChar(BeginChar(upper, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
|
||||
m_beginCharHelper.addBeginChar(BeginChar(lower, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
|
||||
} else
|
||||
m_beginCharHelper.addBeginChar(BeginChar(character, mask), &beginTerms[i].hotTerms, term.quantityType, term.quantityCount);
|
||||
}
|
||||
}
|
||||
|
||||
// If the pattern doesn't contain terms with fixed quantifiers then the beginning character look-up optimization is inefficient.
|
||||
if (!containsFixedCharacter) {
|
||||
m_pattern.m_containsBeginChars = false;
|
||||
return;
|
||||
}
|
||||
|
||||
size = m_pattern.m_beginChars.size();
|
||||
|
||||
if (size > 2)
|
||||
m_beginCharHelper.merge(size - 1);
|
||||
else if (size <= 1)
|
||||
m_pattern.m_containsBeginChars = false;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
YarrPattern& m_pattern;
|
||||
PatternAlternative* m_alternative;
|
||||
CharacterClassConstructor m_characterClassConstructor;
|
||||
BeginCharHelper m_beginCharHelper;
|
||||
bool m_invertCharacterClass;
|
||||
bool m_invertParentheticalAssertion;
|
||||
};
|
||||
@ -959,7 +719,6 @@ ErrorCode YarrPattern::compile(const UString& patternString)
|
||||
constructor.optimizeBOL();
|
||||
|
||||
constructor.setupOffsets();
|
||||
constructor.setupBeginChars();
|
||||
|
||||
return NoError;
|
||||
}
|
||||
@ -968,7 +727,6 @@ YarrPattern::YarrPattern(const UString& pattern, bool ignoreCase, bool multiline
|
||||
: m_ignoreCase(ignoreCase)
|
||||
, m_multiline(multiline)
|
||||
, m_containsBackreferences(false)
|
||||
, m_containsBeginChars(false)
|
||||
, m_containsBOL(false)
|
||||
, m_numSubpatterns(0)
|
||||
, m_maxBackReference(0)
|
||||
|
@ -332,21 +332,6 @@ struct TermChain {
|
||||
Vector<TermChain> hotTerms;
|
||||
};
|
||||
|
||||
// A value/mask pair describing one candidate for the beginning of a match.
struct BeginChar {
    // Creates an entry with both fields zero.
    BeginChar()
        : value(0), mask(0)
    {
    }

    // Creates an entry from an explicit value and mask.
    BeginChar(unsigned initialValue, unsigned initialMask)
        : value(initialValue), mask(initialMask)
    {
    }

    unsigned value; // Character data to compare against.
    unsigned mask;  // Bits forced on in the input before comparing with value.
};
|
||||
|
||||
struct YarrPattern {
|
||||
YarrPattern(const UString& pattern, bool ignoreCase, bool multiline, ErrorCode* error);
|
||||
|
||||
@ -362,7 +347,6 @@ struct YarrPattern {
|
||||
m_maxBackReference = 0;
|
||||
|
||||
m_containsBackreferences = false;
|
||||
m_containsBeginChars = false;
|
||||
m_containsBOL = false;
|
||||
|
||||
newlineCached = 0;
|
||||
@ -377,7 +361,6 @@ struct YarrPattern {
|
||||
m_disjunctions.clear();
|
||||
deleteAllValues(m_userCharacterClasses);
|
||||
m_userCharacterClasses.clear();
|
||||
m_beginChars.clear();
|
||||
}
|
||||
|
||||
bool containsIllegalBackReference()
|
||||
@ -431,14 +414,12 @@ struct YarrPattern {
|
||||
bool m_ignoreCase : 1;
|
||||
bool m_multiline : 1;
|
||||
bool m_containsBackreferences : 1;
|
||||
bool m_containsBeginChars : 1;
|
||||
bool m_containsBOL : 1;
|
||||
unsigned m_numSubpatterns;
|
||||
unsigned m_maxBackReference;
|
||||
PatternDisjunction* m_body;
|
||||
Vector<PatternDisjunction*, 4> m_disjunctions;
|
||||
Vector<CharacterClass*> m_userCharacterClasses;
|
||||
Vector<BeginChar> m_beginChars;
|
||||
|
||||
private:
|
||||
ErrorCode compile(const UString& patternString);
|
||||
|
Loading…
Reference in New Issue
Block a user