Bug 1019585 part 2 - Make RopeMatch work with Latin1 strings. r=luke

This commit is contained in:
Jan de Mooij 2014-06-05 12:02:00 +02:00
parent 5a05eabb50
commit 42ff18f4cc
2 changed files with 128 additions and 63 deletions

View File

@ -20,3 +20,41 @@ function testSearchFlat() {
assertEq("fooBar12345\u1200".search("5\u1201"), -1);
}
testSearchFlat();
// Checks String.prototype.search on strings built by concatenation (presumably
// ropes), pairing Latin1/TwoByte text with Latin1/TwoByte patterns. Relies on
// the shell helpers toLatin1, isLatin1 and assertEq.
function testSearchRope() {
// Tests for the RopeMatch algorithm.
// 19 chars repeated 10 times, so s1 has length 190 and anything appended to it
// starts at index 190.
var s1 = "foobarbaz0123456789".repeat(10);
s1.indexOf("333"); // flatten
s1 = toLatin1(s1);
// The right-hand side contains \u1200, which does not fit in Latin1.
var ropeMixed = s1 + "abcdef\u1200";
assertEq(isLatin1(ropeMixed), false);
// Patterns explicitly converted with toLatin1.
var abc = toLatin1("abc");
var baz = toLatin1("baz");
// Mixed + Latin1
// "abc" only occurs in the appended part (index 190); "baz" first occurs inside
// the first "foobarbaz" (index 6).
assertEq(ropeMixed.search(abc), 190);
assertEq(ropeMixed.search(baz), 6);
// Mixed + TwoByte
// "def" starts 3 chars into the appended part: 190 + 3.
assertEq(ropeMixed.search("def\u1200"), 193);
// Latin1 + Latin1
// NOTE(review): unlike the first case, s1 is neither flattened nor passed
// through toLatin1 here, and the isLatin1 assertion below expects false --
// confirm this matches the intent of a "Latin1 + Latin1" case.
s1 = "foobarbaz0123456789".repeat(10);
var ropeLatin1 = s1 + toLatin1("abcdef\u00AA");
assertEq(isLatin1(ropeLatin1), false);
assertEq(ropeLatin1.search(abc), 190);
// Latin1 + TwoByte
// The pattern is "bc", obtained via substr from a literal containing \u1200;
// presumably this keeps the pattern TwoByte -- confirm. "bc" starts at 190 + 1.
assertEq(ropeLatin1.search("\u1200bc".substr(1)), 191);
// TwoByte + Latin1
// 20 chars (including \u11AA) repeated 10 times, so s1 has length 200.
s1 = "foobarbaz0123456789\u11AA".repeat(10);
var ropeTwoByte = s1 + "abcdef\u1200";
assertEq(ropeTwoByte.search(abc), 200);
// TwoByte + TwoByte
// "def" starts 3 chars into the appended part: 200 + 3.
assertEq(ropeTwoByte.search("def\u1200"), 203);
}
testSearchRope();

View File

@ -1296,21 +1296,79 @@ class StringSegmentRange
}
};
typedef Vector<JSLinearString *, 16, SystemAllocPolicy> LinearStringVector;
/*
 * Search the logical text formed by concatenating the entries of |strings|,
 * in order, for the pattern |pat| of |patLen| chars. Returns the index,
 * relative to the start of the logical text, at which a match begins, or -1
 * if no match is found.
 *
 * Every entry of |strings| is read as TextChar chars, so all entries must
 * share that char type. |patLen| must be non-zero: the first pattern char is
 * read unconditionally once a node has been scanned without an internal match.
 */
template <typename TextChar, typename PatChar>
static int
RopeMatchImpl(const AutoCheckCannotGC &nogc, LinearStringVector &strings,
              const PatChar *pat, size_t patLen)
{
    /* Index, within the logical text, of the first char of the current node. */
    int nodeStart = 0;

    for (JSLinearString **outerp = strings.begin(); outerp != strings.end(); ++outerp) {
        JSLinearString *outer = *outerp;
        const TextChar *chars = outer->chars<TextChar>(nogc);
        size_t len = outer->length();

        /* Case 1: the pattern lies entirely inside this node. */
        int matchResult = StringMatch(chars, len, pat, patLen);
        if (matchResult != -1)
            return nodeStart + matchResult;

        /*
         * Case 2: the pattern starts in this node and continues into later
         * ones. Only the last patLen - 1 chars of the node can start such a
         * match (or every char, when the node is shorter than the pattern).
         */
        const TextChar *const nodeEnd = chars + len;
        const TextChar *cursor = (patLen > len) ? chars : chars + (len - patLen + 1);
        const PatChar firstPatChar = *pat;
        const PatChar *const patRest = pat + 1;
        const PatChar *const patLimit = pat + patLen;

        while (cursor != nodeEnd) {
            const TextChar *candidate = cursor++;
            if (*candidate != firstPatChar)
                continue;

            /* Compare the rest of the pattern, hopping to following nodes as needed. */
            JSLinearString **innerp = outerp;
            const TextChar *segEnd = nodeEnd;
            const TextChar *seg = cursor;
            bool matched = true;
            for (const PatChar *pp = patRest; pp != patLimit; ++pp, ++seg) {
                /* A loop, not an if, so that empty nodes are skipped. */
                while (seg == segEnd) {
                    /*
                     * Ran out of text with pattern chars left over. Any later
                     * starting position would run out too, so give up.
                     */
                    if (++innerp == strings.end())
                        return -1;
                    JSLinearString *inner = *innerp;
                    seg = inner->chars<TextChar>(nogc);
                    segEnd = seg + inner->length();
                }
                if (*pp != *seg) {
                    matched = false;
                    break;
                }
            }

            if (matched)
                return nodeStart + (candidate - chars);
        }

        nodeStart += len;
    }

    return -1;
}
/*
* RopeMatch takes the text to search and the pattern to search for in the text.
* RopeMatch returns false on OOM and otherwise returns the match index through
* the 'match' outparam (-1 for not found).
*/
static bool
RopeMatch(JSContext *cx, JSString *textstr, const jschar *pat, uint32_t patLen, int *match)
RopeMatch(JSContext *cx, JSRope *text, JSLinearString *pat, int *match)
{
JS_ASSERT(textstr->isRope());
uint32_t patLen = pat->length();
if (patLen == 0) {
*match = 0;
return true;
}
if (textstr->length() < patLen) {
if (text->length() < patLen) {
*match = -1;
return true;
}
@ -1320,26 +1378,34 @@ RopeMatch(JSContext *cx, JSString *textstr, const jschar *pat, uint32_t patLen,
* append to this list, we can still fall back to StringMatch, so use the
* system allocator so we don't report OOM in that case.
*/
Vector<JSLinearString *, 16, SystemAllocPolicy> strs;
LinearStringVector strings;
/*
* We don't want to do rope matching if there is a poor node-to-char ratio,
* since this means spending a lot of time in the match loop below. We also
* need to build the list of leaf nodes. Do both here: iterate over the
* nodes so long as there are not too many.
*
* We also don't use rope matching if the rope contains both Latin1 and
* TwoByte nodes, to simplify the match algorithm.
*/
{
size_t textstrlen = textstr->length();
size_t threshold = textstrlen >> sRopeMatchThresholdRatioLog2;
size_t threshold = text->length() >> sRopeMatchThresholdRatioLog2;
StringSegmentRange r(cx);
if (!r.init(textstr))
if (!r.init(text))
return false;
bool textIsLatin1 = text->hasLatin1Chars();
while (!r.empty()) {
if (threshold-- == 0 || !strs.append(r.front())) {
const jschar *chars = textstr->getChars(cx);
if (!chars)
if (threshold-- == 0 ||
r.front()->hasLatin1Chars() != textIsLatin1 ||
!strings.append(r.front()))
{
JSLinearString *linear = text->ensureLinear(cx);
if (!linear)
return false;
*match = StringMatch(chars, textstrlen, pat, patLen);
*match = StringMatch(linear, pat);
return true;
}
if (!r.popFront())
@ -1347,57 +1413,19 @@ RopeMatch(JSContext *cx, JSString *textstr, const jschar *pat, uint32_t patLen,
}
}
/* Absolute offset from the beginning of the logical string textstr. */
int pos = 0;
for (JSLinearString **outerp = strs.begin(); outerp != strs.end(); ++outerp) {
/* Try to find a match within 'outer'. */
JSLinearString *outer = *outerp;
const jschar *chars = outer->chars();
size_t len = outer->length();
int matchResult = StringMatch(chars, len, pat, patLen);
if (matchResult != -1) {
/* Matched! */
*match = pos + matchResult;
return true;
}
/* Try to find a match starting in 'outer' and running into other nodes. */
const jschar *const text = chars + (patLen > len ? 0 : len - patLen + 1);
const jschar *const textend = chars + len;
const jschar p0 = *pat;
const jschar *const p1 = pat + 1;
const jschar *const patend = pat + patLen;
for (const jschar *t = text; t != textend; ) {
if (*t++ != p0)
continue;
JSLinearString **innerp = outerp;
const jschar *ttend = textend;
for (const jschar *pp = p1, *tt = t; pp != patend; ++pp, ++tt) {
while (tt == ttend) {
if (++innerp == strs.end()) {
*match = -1;
return true;
}
JSLinearString *inner = *innerp;
tt = inner->chars();
ttend = tt + inner->length();
}
if (*pp != *tt)
goto break_continue;
}
/* Matched! */
*match = pos + (t - chars) - 1; /* -1 because of *t++ above */
return true;
break_continue:;
}
pos += len;
AutoCheckCannotGC nogc;
if (text->hasLatin1Chars()) {
if (pat->hasLatin1Chars())
*match = RopeMatchImpl<Latin1Char>(nogc, strings, pat->latin1Chars(nogc), patLen);
else
*match = RopeMatchImpl<Latin1Char>(nogc, strings, pat->twoByteChars(nogc), patLen);
} else {
if (pat->hasLatin1Chars())
*match = RopeMatchImpl<jschar>(nogc, strings, pat->latin1Chars(nogc), patLen);
else
*match = RopeMatchImpl<jschar>(nogc, strings, pat->twoByteChars(nogc), patLen);
}
*match = -1;
return true;
}
@ -2002,8 +2030,7 @@ class MOZ_STACK_CLASS StringRegExpGuard
* long as possible.
*/
if (text->isRope()) {
const jschar *pat = fm.pat_->chars();
if (!RopeMatch(cx, text, pat, patLen, &fm.match_))
if (!RopeMatch(cx, &text->asRope(), fm.pat_, &fm.match_))
return nullptr;
} else {
fm.match_ = StringMatch(&text->asLinear(), fm.pat_, 0);