mirror of
https://gitlab.winehq.org/wine/wine-gecko.git
synced 2024-09-13 09:24:08 -07:00
Bug 1135377 - Part 5: Support CharacterClassEscape in RegExp with unicode flag. r=till, f=anba
This commit is contained in:
parent
a0e5747f63
commit
06cfc7d389
@ -72,12 +72,30 @@ static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
|
||||
0xFEFF, 0xFF00, 0x10000 };
|
||||
static const int kSpaceRangeCount = ArrayLength(kSpaceRanges);
|
||||
|
||||
static const int kSpaceAndSurrogateRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
|
||||
0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
|
||||
0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
|
||||
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
|
||||
0xFEFF, 0xFF00, 0x10000 };
|
||||
static const int kSpaceAndSurrogateRangeCount = ArrayLength(kSpaceAndSurrogateRanges);
|
||||
static const int kWordRanges[] = {
|
||||
'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
|
||||
static const int kWordRangeCount = ArrayLength(kWordRanges);
|
||||
static const int kWordAndSurrogateRanges[] = {
|
||||
'0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1,
|
||||
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
|
||||
0x10000 };
|
||||
static const int kWordAndSurrogateRangeCount = ArrayLength(kWordAndSurrogateRanges);
|
||||
static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
|
||||
static const int kDigitRangeCount = ArrayLength(kDigitRanges);
|
||||
static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
|
||||
static const int kDigitAndSurrogateRanges[] = {
|
||||
'0', '9' + 1,
|
||||
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
|
||||
0x10000 };
|
||||
static const int kDigitAndSurrogateRangeCount = ArrayLength(kDigitAndSurrogateRanges);
|
||||
static const int kSurrogateRanges[] = {
|
||||
unicode::LeadSurrogateMin, unicode::TrailSurrogateMax + 1,
|
||||
0x10000 };
|
||||
static const int kSurrogateRangeCount = ArrayLength(kSurrogateRanges);
|
||||
// Range table for the line terminator code units, as (first, last + 1) pairs
// followed by the 0x10000 entry that closes every table in this file.
static const int kLineTerminatorRanges[] = {
    0x000A, 0x000B,  // U+000A
    0x000D, 0x000E,  // U+000D
    0x2028, 0x202A,  // U+2028 and U+2029
    0x10000
};
|
||||
@ -164,6 +182,26 @@ CharacterRange::AddClassEscape(LifoAlloc* alloc, char16_t type,
|
||||
}
|
||||
}
|
||||
|
||||
// Add class escape, excluding surrogate pair range.
|
||||
void
|
||||
CharacterRange::AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
|
||||
CharacterRangeVector* ranges)
|
||||
{
|
||||
switch (type) {
|
||||
case 'S':
|
||||
AddClassNegated(kSpaceAndSurrogateRanges, kSpaceAndSurrogateRangeCount, ranges);
|
||||
break;
|
||||
case 'W':
|
||||
AddClassNegated(kWordAndSurrogateRanges, kWordAndSurrogateRangeCount, ranges);
|
||||
break;
|
||||
case 'D':
|
||||
AddClassNegated(kDigitAndSurrogateRanges, kDigitAndSurrogateRangeCount, ranges);
|
||||
break;
|
||||
default:
|
||||
MOZ_CRASH("Bad type!");
|
||||
}
|
||||
}
|
||||
|
||||
// We need to check for the following characters: 0x39c 0x3bc 0x178.
|
||||
static inline bool
|
||||
RangeContainsLatin1Equivalents(CharacterRange range)
|
||||
|
@ -144,6 +144,8 @@ class CharacterRange
|
||||
{}
|
||||
|
||||
static void AddClassEscape(LifoAlloc* alloc, char16_t type, CharacterRangeVector* ranges);
|
||||
static void AddClassEscapeUnicode(LifoAlloc* alloc, char16_t type,
|
||||
CharacterRangeVector* ranges);
|
||||
|
||||
static inline CharacterRange Singleton(char16_t value) {
|
||||
return CharacterRange(value, value);
|
||||
|
@ -611,9 +611,23 @@ AddCharOrEscapeUnicode(LifoAlloc* alloc,
|
||||
char16_t char_class,
|
||||
widechar c)
|
||||
{
|
||||
if (char_class != kNoCharClass)
|
||||
CharacterRange::AddClassEscape(alloc, char_class, ranges);
|
||||
else if (unicode::IsLeadSurrogate(c))
|
||||
if (char_class != kNoCharClass) {
|
||||
CharacterRange::AddClassEscapeUnicode(alloc, char_class, ranges);
|
||||
switch (char_class) {
|
||||
case 'S':
|
||||
case 'W':
|
||||
case 'D':
|
||||
lead_ranges->append(LeadSurrogateRange());
|
||||
trail_ranges->append(TrailSurrogateRange());
|
||||
wide_ranges->append(NonBMPRange());
|
||||
break;
|
||||
case '.':
|
||||
MOZ_CRASH("Bad char_class!");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (unicode::IsLeadSurrogate(c))
|
||||
lead_ranges->append(CharacterRange::Singleton(c));
|
||||
else if (unicode::IsTrailSurrogate(c))
|
||||
trail_ranges->append(CharacterRange::Singleton(c));
|
||||
@ -1213,6 +1227,18 @@ UnicodeEverythingAtom(LifoAlloc* alloc)
|
||||
return builder->ToRegExp();
|
||||
}
|
||||
|
||||
// Build the RegExpTree for a class escape (|char_class|) that appears as an
// atom in a RegExp with the unicode flag. Four range vectors are allocated
// from |alloc|, filled by AddCharOrEscapeUnicode and handed to
// UnicodeRangesAtom.
RegExpTree*
UnicodeCharacterClassEscapeAtom(LifoAlloc* alloc, char16_t char_class)
{
    // One output vector per category that AddCharOrEscapeUnicode fills in.
    CharacterRangeVector* bmpRanges = alloc->newInfallible<CharacterRangeVector>(*alloc);
    CharacterRangeVector* leadRanges = alloc->newInfallible<CharacterRangeVector>(*alloc);
    CharacterRangeVector* trailRanges = alloc->newInfallible<CharacterRangeVector>(*alloc);
    WideCharRangeVector* wideRanges = alloc->newInfallible<WideCharRangeVector>(*alloc);

    // When a class escape is given, AddCharOrEscapeUnicode returns before it
    // looks at the character argument, so a placeholder value is passed.
    const widechar placeholderChar = 0;
    AddCharOrEscapeUnicode(alloc, bmpRanges, leadRanges, trailRanges, wideRanges,
                           char_class, placeholderChar);

    // NOTE(review): the final |false| is presumably the "negated" flag of
    // UnicodeRangesAtom -- confirm against its definition.
    return UnicodeRangesAtom(alloc, bmpRanges, leadRanges, trailRanges, wideRanges, false);
}
|
||||
|
||||
// Disjunction ::
|
||||
// Alternative
|
||||
// Alternative | Disjunction
|
||||
@ -1377,7 +1403,15 @@ RegExpParser<CharT>::ParseDisjunction()
|
||||
//
|
||||
// CharacterClassEscape :: one of
|
||||
// d D s S w W
|
||||
case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {
|
||||
case 'D': case 'S': case 'W':
|
||||
if (unicode_) {
|
||||
Advance();
|
||||
builder->AddAtom(UnicodeCharacterClassEscapeAtom(alloc, current()));
|
||||
Advance();
|
||||
break;
|
||||
}
|
||||
// Fall through
|
||||
case 'd': case 's': case 'w': {
|
||||
widechar c = Next();
|
||||
Advance(2);
|
||||
CharacterRangeVector* ranges =
|
||||
|
75
js/src/tests/ecma_6/RegExp/unicode-character-class-escape.js
Normal file
75
js/src/tests/ecma_6/RegExp/unicode-character-class-escape.js
Normal file
@ -0,0 +1,75 @@
|
||||
var BUGNUMBER = 1135377;
|
||||
var summary = "Implement RegExp unicode flag -- CharacterClassEscape.";
|
||||
|
||||
print(BUGNUMBER + ": " + summary);
|
||||
|
||||
// BMP
|
||||
|
||||
// Every BMP check runs against the same subject string: letters, digits,
// underscore, a run of whitespace characters and a trailing '*'.
var bmpSubject = "abcxyzABCXYZ0123456789_\t\r\n\v\x0c\xa0\uFEFF*";

// Escapes used directly as atoms.
assertEqArray(/\d+/u.exec(bmpSubject), ["0123456789"]);
assertEqArray(/\D+/u.exec(bmpSubject), ["abcxyzABCXYZ"]);

assertEqArray(/\s+/u.exec(bmpSubject), ["\t\r\n\v\x0c\xa0\uFEFF"]);
assertEqArray(/\S+/u.exec(bmpSubject), ["abcxyzABCXYZ0123456789_"]);

assertEqArray(/\w+/u.exec(bmpSubject), ["abcxyzABCXYZ0123456789_"]);
assertEqArray(/\W+/u.exec(bmpSubject), ["\t\r\n\v\x0c\xa0\uFEFF*"]);

assertEqArray(/\n+/u.exec(bmpSubject), ["\n"]);

// The same escapes inside a character class.
assertEqArray(/[\d]+/u.exec(bmpSubject), ["0123456789"]);
assertEqArray(/[\D]+/u.exec(bmpSubject), ["abcxyzABCXYZ"]);

assertEqArray(/[\s]+/u.exec(bmpSubject), ["\t\r\n\v\x0c\xa0\uFEFF"]);
assertEqArray(/[\S]+/u.exec(bmpSubject), ["abcxyzABCXYZ0123456789_"]);

assertEqArray(/[\w]+/u.exec(bmpSubject), ["abcxyzABCXYZ0123456789_"]);
assertEqArray(/[\W]+/u.exec(bmpSubject), ["\t\r\n\v\x0c\xa0\uFEFF*"]);

assertEqArray(/[\n]+/u.exec(bmpSubject), ["\n"]);
|
||||
|
||||
// non-BMP
|
||||
|
||||
// Check that |re| (a negated class escape with the unicode flag) consumes a
// well-formed surrogate pair as one unit, and only a single code unit when
// the two code units do not form a pair.
function testNonBMP(re) {
  // Each entry is [subject, expected match]; entries are checked in order.
  var cases = [
    // Second code unit moves across the trail surrogate boundaries.
    ["\uD83D\uDBFF", "\uD83D"],
    ["\uD83D\uDC00", "\uD83D\uDC00"],
    ["\uD83D\uDFFF", "\uD83D\uDFFF"],
    ["\uD83D\uE000", "\uD83D"],

    // First code unit moves across the lead surrogate boundaries.
    ["\uD7FF\uDC38", "\uD7FF"],
    ["\uD800\uDC38", "\uD800\uDC38"],
    ["\uDBFF\uDC38", "\uDBFF\uDC38"],
    ["\uDC00\uDC38", "\uDC00"],
  ];

  cases.forEach(function (entry) {
    var subject = entry[0];
    var expected = entry[1];
    assertEqArray(re.exec(subject), [expected]);
  });
}
|
||||
|
||||
testNonBMP(/\D/u);
|
||||
testNonBMP(/\S/u);
|
||||
testNonBMP(/\W/u);
|
||||
|
||||
testNonBMP(/[\D]/u);
|
||||
testNonBMP(/[\S]/u);
|
||||
testNonBMP(/[\W]/u);
|
||||
|
||||
if (typeof reportCompare === "function")
|
||||
reportCompare(true, true);
|
Loading…
Reference in New Issue
Block a user