Bug 1135377 - Part 5: Support CharacterClassEscape in RegExp with unicode flag. r=till, f=anba

This commit is contained in:
Tooru Fujisawa 2015-08-07 08:11:56 +09:00
parent a0e5747f63
commit 06cfc7d389
4 changed files with 154 additions and 5 deletions

View File

@ -72,12 +72,30 @@ static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
0xFEFF, 0xFF00, 0x10000 };
static const int kSpaceRangeCount = ArrayLength(kSpaceRanges);
// Whitespace table with one extra entry covering every surrogate code unit
// (unicode::LeadSurrogateMin up to and including unicode::TrailSurrogateMax).
// Entries are written as (first, last + 1) pairs in ascending order and the
// list ends with 0x10000. AddClassEscapeUnicode passes this table to
// AddClassNegated for 'S'.
// NOTE(review): the exact pair/terminator contract is defined by
// AddClass/AddClassNegated, which are not visible here -- confirm.
static const int kSpaceAndSurrogateRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
0xFEFF, 0xFF00, 0x10000 };
// Number of ints in the table above, including the trailing 0x10000.
static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRanges);
// Word characters: ASCII digits, upper-case letters, '_' and lower-case
// letters, written as (first, last + 1) pairs and ending with 0x10000.
static const int kWordRanges[] = {
'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
// Number of ints in kWordRanges, including the trailing 0x10000.
static const int kWordRangeCount = ArrayLength(kWordRanges);
// Word-character table with one extra entry covering every surrogate code
// unit (unicode::LeadSurrogateMin up to and including
// unicode::TrailSurrogateMax). AddClassEscapeUnicode passes this table to
// AddClassNegated for 'W'.
static const int kWordAndSurrogateRanges[] = {
'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
0x10000 };
// Number of ints in the table above, including the trailing 0x10000.
static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges);
// ASCII digits '0'..'9' as a single (first, last + 1) pair, ending with
// 0x10000.
static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
// Number of ints in kDigitRanges, including the trailing 0x10000.
static const int kDigitRangeCount = ArrayLength(kDigitRanges);
static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
// Digit table with one extra entry covering every surrogate code unit
// (unicode::LeadSurrogateMin up to and including unicode::TrailSurrogateMax).
// AddClassEscapeUnicode passes this table to AddClassNegated for 'D'.
static const int kDigitAndSurrogateRanges[] = {
'0', '9' + 1,
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
0x10000 };
// Number of ints in the table above, including the trailing 0x10000.
static const int kDigitAndSurrogateRangeCount = ArrayLength(kDigitAndSurrogateRanges);
// The surrogate code units only, expressed with the named unicode:: bounds:
// one (first, last + 1) pair followed by the 0x10000 end marker.
static const int kSurrogateRanges[] = {
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
0x10000 };
// Number of ints in kSurrogateRanges, including the trailing 0x10000.
static const int kSurrogateRangeCount = ArrayLength(kSurrogateRanges);
// Line terminator code units written as (first, last + 1) pairs: 0x000A,
// 0x000D and 0x2028..0x2029, ending with 0x10000.
static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
0x2028, 0x202A, 0x10000 };
@ -164,6 +182,26 @@ CharacterRange::AddClassEscape(LifoAlloc* alloc, char16_t type,
}
}
// Add the BMP ranges of a class escape for a RegExp with the unicode flag,
// excluding the surrogate pair range.
//
// |type| is the CharacterClassEscape letter: one of d, D, s, S, w, W.
// |ranges| receives the resulting ranges.
//
// For the negated classes (D, S, W) the surrogate code units are left out of
// the result by negating a table that lists them alongside the class itself;
// the caller is expected to add lead/trail/non-BMP ranges separately.
//
// The non-negated classes (d, s, w) are forwarded to AddClassEscape: the
// callers pass every class letter to this function (e.g. for /[\d]/u), so
// they must not reach the crashing default case.
// NOTE(review): this assumes AddClassEscape accepts 'd', 's' and 'w' and
// that the tables it uses contain no surrogate code units -- confirm.
void
CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
                                      CharacterRangeVector* ranges)
{
    switch (type) {
      case 'd':
      case 's':
      case 'w':
        AddClassEscape(alloc, type, ranges);
        break;
      case 'S':
        AddClassNegated(kSpaceAndSurrogateRanges, kSpaceAndSurrogateRangeCount, ranges);
        break;
      case 'W':
        AddClassNegated(kWordAndSurrogateRanges, kWordAndSurrogateRangeCount, ranges);
        break;
      case 'D':
        AddClassNegated(kDigitAndSurrogateRanges, kDigitAndSurrogateRangeCount, ranges);
        break;
      default:
        MOZ_CRASH("Bad type!");
    }
}
// We need to check for the following characters: 0x39c 0x3bc 0x178.
static inline bool
RangeContainsLatin1Equivalents(CharacterRange range)

View File

@ -144,6 +144,8 @@ class CharacterRange
{}
static void AddClassEscape(LifoAlloc* alloc, char16_t type, CharacterRangeVector* ranges);
static void AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
CharacterRangeVector* ranges);
static inline CharacterRange Singleton(char16_t value) {
return CharacterRange(value, value);

View File

@ -611,9 +611,23 @@ AddCharOrEscapeUnicode(LifoAlloc* alloc,
char16_t char_class,
widechar c)
{
if (char_class != kNoCharClass)
CharacterRange::AddClassEscape(alloc, char_class, ranges);
else if (unicode::IsLeadSurrogate(c))
if (char_class != kNoCharClass) {
CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges);
switch (char_class) {
case 'S':
case 'W':
case 'D':
lead_ranges->append(LeadSurrogateRange());
trail_ranges->append(TrailSurrogateRange());
wide_ranges->append(NonBMPRange());
break;
case '.':
MOZ_CRASH("Bad char_class!");
}
return;
}
if (unicode::IsLeadSurrogate(c))
lead_ranges->append(CharacterRange::Singleton(c));
else if (unicode::IsTrailSurrogate(c))
trail_ranges->append(CharacterRange::Singleton(c));
@ -1213,6 +1227,18 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
return builder->ToRegExp();
}
// Build the RegExpTree for a single class escape (|char_class|) in a RegExp
// with the unicode flag.
//
// Four empty range vectors are allocated from |alloc|, filled by
// AddCharOrEscapeUnicode for |char_class|, and then combined into one tree by
// UnicodeRangesAtom.
RegExpTree*
UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class)
{
    // One vector per kind of range that AddCharOrEscapeUnicode can emit.
    CharacterRangeVector* bmpRanges = alloc->newInfallible<CharacterRangeVector>(*alloc);
    CharacterRangeVector* leadRanges = alloc->newInfallible<CharacterRangeVector>(*alloc);
    CharacterRangeVector* trailRanges = alloc->newInfallible<CharacterRangeVector>(*alloc);
    WideCharRangeVector* wideRanges = alloc->newInfallible<WideCharRangeVector>(*alloc);

    // A class escape carries no literal character; the visible part of
    // AddCharOrEscapeUnicode returns before reading it when a class is given.
    const widechar noChar = 0;
    AddCharOrEscapeUnicode(alloc, bmpRanges, leadRanges, trailRanges, wideRanges,
                           char_class, noChar);

    // NOTE(review): the trailing |false| is presumably the "negated" flag of
    // UnicodeRangesAtom -- confirm against its declaration.
    return UnicodeRangesAtom(alloc, bmpRanges, leadRanges, trailRanges, wideRanges, false);
}
// Disjunction ::
// Alternative
// Alternative | Disjunction
@ -1377,7 +1403,15 @@ RegExpParser<CharT>::ParseDisjunction()
//
// CharacterClassEscape :: one of
// d D s S w W
case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
case 'D': case 'S': case 'W':
if (unicode_) {
Advance();
builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current()));
Advance();
break;
}
// Fall through
case 'd': case 's': case 'w': {
widechar c = Next();
Advance(2);
CharacterRangeVector* ranges =

View File

@ -0,0 +1,75 @@
// Test header: bug number and summary printed at the start of the test.
var BUGNUMBER = 1135377;
var summary = "Implement RegExp unicode flag -- CharacterClassEscape.";
print(BUGNUMBER + ": " + summary);
// BMP
// Every assertion below runs against the same subject string: ASCII letters,
// digits, '_', a run of whitespace characters (\t \r \n \v \x0c \xa0 \uFEFF)
// and a trailing '*'. Each pattern uses the unicode flag and the expected
// value is the first run in the subject that the escape matches.

// Escapes used directly as atoms.
assertEqArray(/\d+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["0123456789"]);
assertEqArray(/\D+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["abcxyzABCXYZ"]);
assertEqArray(/\s+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["\t\r\n\v\x0c\xa0\uFEFF"]);
assertEqArray(/\S+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["abcxyzABCXYZ0123456789_"]);
assertEqArray(/\w+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["abcxyzABCXYZ0123456789_"]);
assertEqArray(/\W+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["\t\r\n\v\x0c\xa0\uFEFF*"]);
// \n is a control escape, not a class escape; it matches only the newline.
assertEqArray(/\n+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["\n"]);
// The same escapes wrapped in a character class; results must be identical.
assertEqArray(/[\d]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["0123456789"]);
assertEqArray(/[\D]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["abcxyzABCXYZ"]);
assertEqArray(/[\s]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["\t\r\n\v\x0c\xa0\uFEFF"]);
assertEqArray(/[\S]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["abcxyzABCXYZ0123456789_"]);
assertEqArray(/[\w]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["abcxyzABCXYZ0123456789_"]);
assertEqArray(/[\W]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["\t\r\n\v\x0c\xa0\uFEFF*"]);
assertEqArray(/[\n]+/u.exec("abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*"),
["\n"]);
// non-BMP
// Runs |re| against two-code-unit subjects around the surrogate boundaries
// and checks what a single match consumes: a lead surrogate followed by a
// trail surrogate is matched as one pair, anything else matches only the
// first code unit.
function testNonBMP(re) {
  // Each entry is [subject, expected match].
  var cases = [
    ["\uD83D\uDBFF", "\uD83D"],        // lead + lead: first unit only
    ["\uD83D\uDC00", "\uD83D\uDC00"],  // lead + lowest trail: whole pair
    ["\uD83D\uDFFF", "\uD83D\uDFFF"],  // lead + highest trail: whole pair
    ["\uD83D\uE000", "\uD83D"],        // lead + non-surrogate: first unit only
    ["\uD7FF\uDC38", "\uD7FF"],        // non-surrogate + trail: first unit only
    ["\uD800\uDC38", "\uD800\uDC38"],  // lowest lead + trail: whole pair
    ["\uDBFF\uDC38", "\uDBFF\uDC38"],  // highest lead + trail: whole pair
    ["\uDC00\uDC38", "\uDC00"]         // trail + trail: first unit only
  ];
  for (var i = 0; i < cases.length; i++) {
    var subject = cases[i][0];
    var expected = cases[i][1];
    assertEqArray(re.exec(subject), [expected]);
  }
}
// Run the surrogate-boundary checks for each negated class escape, first as
// a bare atom and then inside a character class.
testNonBMP(/\D/u);
testNonBMP(/\S/u);
testNonBMP(/\W/u);
testNonBMP(/[\D]/u);
testNonBMP(/[\S]/u);
testNonBMP(/[\W]/u);
// Report success when the surrounding harness provides reportCompare.
if (typeof reportCompare === "function")
reportCompare(true, true);