Bug 684559: bring back PCRE for JavaScript regexps on systems without YARR JIT because it's faster than YARR's interpreter. r=dmandelin

This commit is contained in:
Cameron Kaiser 2011-09-07 17:05:59 -07:00
parent 25316d6093
commit 1270c81714
17 changed files with 9291 additions and 19 deletions

View File

@ -391,22 +391,20 @@ CPPSRCS += checks.cc \
# END include sources for V8 dtoa
#############################################
# For architectures without YARR JIT, PCRE is faster than the YARR
# interpreter (bug 684559).
ifeq (,$(filter arm% sparc %86 x86_64,$(TARGET_CPU)))
VPATH += $(srcdir)/assembler \
$(srcdir)/assembler/wtf \
$(srcdir)/yarr\
VPATH += $(srcdir)/yarr/pcre \
$(NULL)
CPPSRCS += \
Assertions.cpp \
OSAllocatorOS2.cpp \
OSAllocatorPosix.cpp \
OSAllocatorWin.cpp \
PageBlock.cpp \
YarrInterpreter.cpp \
YarrPattern.cpp \
YarrSyntaxChecker.cpp \
pcre_compile.cpp \
pcre_exec.cpp \
pcre_tables.cpp \
pcre_xclass.cpp \
pcre_ucp_searchfuncs.cpp \
$(NULL)
else

View File

@ -194,6 +194,40 @@ js_ObjectIsRegExp(JSObject *obj)
* js::RegExp
*/
#if !ENABLE_YARR_JIT
/*
 * Report the PCRE failure code |error| on |cx| as a JavaScript error.
 *
 * |error| is either the nonzero code stored by jsRegExpCompile or a negative
 * jsRegExpExecute result other than "no match" (-1). Both visible callers
 * return false right after calling this, so every path through the switch
 * reports an error, including codes this table does not know about.
 */
void
RegExp::reportPCREError(JSContext *cx, int error)
{
/* Reports |msg_| and leaves the function; each use below ends its case. */
#define REPORT(msg_) \
    JS_ReportErrorFlagsAndNumberUC(cx, JSREPORT_ERROR, js_GetErrorMessage, NULL, msg_); \
    return
    switch (error) {
      /* jsRegExpExecute results declared in yarr/pcre/pcre.h. */
      case -4: REPORT(JSMSG_REGEXP_TOO_COMPLEX); /* JSRegExpErrorInternal */
      case -2: REPORT(JSMSG_REGEXP_TOO_COMPLEX); /* JSRegExpErrorHitLimit */
      /*
       * Zero means "no error". If JS_NOT_REACHED compiles away, control falls
       * through to case 1 and still reports something.
       */
      case 0: JS_NOT_REACHED("Precondition violation: an error must have occurred.");
      /* jsRegExpCompile error codes. */
      case 1: REPORT(JSMSG_TRAILING_SLASH);
      case 2: REPORT(JSMSG_TRAILING_SLASH);
      case 3: REPORT(JSMSG_REGEXP_TOO_COMPLEX);
      case 4: REPORT(JSMSG_BAD_QUANTIFIER);
      case 5: REPORT(JSMSG_BAD_QUANTIFIER);
      case 6: REPORT(JSMSG_BAD_CLASS_RANGE);
      case 7: REPORT(JSMSG_REGEXP_TOO_COMPLEX);
      case 8: REPORT(JSMSG_BAD_CLASS_RANGE);
      case 9: REPORT(JSMSG_BAD_QUANTIFIER);
      case 10: REPORT(JSMSG_UNMATCHED_RIGHT_PAREN);
      case 11: REPORT(JSMSG_REGEXP_TOO_COMPLEX);
      case 12: REPORT(JSMSG_UNMATCHED_RIGHT_PAREN);
      case 13: REPORT(JSMSG_REGEXP_TOO_COMPLEX);
      case 14: REPORT(JSMSG_MISSING_PAREN);
      case 15: REPORT(JSMSG_BAD_BACKREF);
      case 16: REPORT(JSMSG_REGEXP_TOO_COMPLEX);
      case 17: REPORT(JSMSG_REGEXP_TOO_COMPLEX);
      default:
        JS_NOT_REACHED("Precondition violation: unknown PCRE error code.");
        /*
         * Still report when the assertion is compiled out; otherwise the
         * caller would fail without any error having been reported.
         */
        REPORT(JSMSG_REGEXP_TOO_COMPLEX);
    }
#undef REPORT
}
#endif
void
RegExp::reportYarrError(JSContext *cx, TokenStream *ts, JSC::Yarr::ErrorCode error)
{

View File

@ -55,6 +55,8 @@
#include "yarr/Yarr.h"
#if ENABLE_YARR_JIT
#include "yarr/YarrJIT.h"
#else
#include "yarr/pcre/pcre.h"
#endif
namespace js {
@ -98,8 +100,10 @@ class RegExp
#if ENABLE_YARR_JIT
/* native code is valid only if codeBlock.isFallBack() == false */
JSC::Yarr::YarrCodeBlock codeBlock;
#endif
JSC::Yarr::BytecodePattern *byteCode;
#else
JSRegExp *compiled;
#endif
JSLinearString *source;
size_t refCount;
unsigned parenCount; /* Must be |unsigned| to interface with YARR. */
@ -115,8 +119,11 @@ class RegExp
:
#if ENABLE_YARR_JIT
codeBlock(),
byteCode(NULL),
#else
compiled(NULL),
#endif
byteCode(NULL), source(source), refCount(1), parenCount(0), flags(flags)
source(source), refCount(1), parenCount(0), flags(flags)
#ifdef DEBUG
, compartment(compartment)
#endif
@ -127,14 +134,20 @@ class RegExp
~RegExp() {
#if ENABLE_YARR_JIT
codeBlock.release();
#endif
if (byteCode)
Foreground::delete_<JSC::Yarr::BytecodePattern>(byteCode);
#else
if (compiled)
jsRegExpFree(compiled);
#endif
}
bool compileHelper(JSContext *cx, JSLinearString &pattern, TokenStream *ts);
bool compile(JSContext *cx, TokenStream *ts);
static const uint32 allFlags = JSREG_FOLD | JSREG_GLOB | JSREG_MULTILINE | JSREG_STICKY;
#if !ENABLE_YARR_JIT
void reportPCREError(JSContext *cx, int error);
#endif
void reportYarrError(JSContext *cx, TokenStream *ts, JSC::Yarr::ErrorCode error);
static inline bool initArena(JSContext *cx);
static inline void checkMatchPairs(JSString *input, int *buf, size_t matchItemCount);
@ -370,13 +383,20 @@ RegExp::executeInternal(JSContext *cx, RegExpStatics *res, JSString *inputstr,
else
result = JSC::Yarr::interpret(byteCode, chars, *lastIndex - inputOffset, len, buf);
#else
result = JSC::Yarr::interpret(byteCode, chars, *lastIndex - inputOffset, len, buf);
result = jsRegExpExecute(cx, compiled, chars, len, *lastIndex - inputOffset, buf, bufCount);
#endif
if (result == -1) {
*rval = NullValue();
return true;
}
#if !ENABLE_YARR_JIT
if (result < 0) {
reportPCREError(cx, result);
return false;
}
#endif
/*
* Adjust buf for the inputOffset. Use of sticky is rare and the matchItemCount is small, so
* just do another pass.
@ -472,6 +492,7 @@ EnableYarrJIT(JSContext *cx)
inline bool
RegExp::compileHelper(JSContext *cx, JSLinearString &pattern, TokenStream *ts)
{
#if ENABLE_YARR_JIT
JSC::Yarr::ErrorCode yarrError;
JSC::Yarr::YarrPattern yarrPattern(pattern, ignoreCase(), multiline(), &yarrError);
if (yarrError) {
@ -480,7 +501,7 @@ RegExp::compileHelper(JSContext *cx, JSLinearString &pattern, TokenStream *ts)
}
parenCount = yarrPattern.m_numSubpatterns;
#if ENABLE_YARR_JIT && defined(JS_METHODJIT)
#if defined(JS_METHODJIT)
if (EnableYarrJIT(cx) && !yarrPattern.m_containsBackreferences) {
bool ok = cx->compartment->ensureJaegerCompartmentExists(cx);
if (!ok)
@ -492,12 +513,21 @@ RegExp::compileHelper(JSContext *cx, JSLinearString &pattern, TokenStream *ts)
}
#endif
#if ENABLE_YARR_JIT
codeBlock.setFallBack(true);
#endif
byteCode = JSC::Yarr::byteCompile(yarrPattern, cx->compartment->regExpAllocator).get();
return true;
#else
int error = 0;
compiled = jsRegExpCompile(pattern.chars(), pattern.length(),
ignoreCase() ? JSRegExpIgnoreCase : JSRegExpDoNotIgnoreCase,
multiline() ? JSRegExpMultiline : JSRegExpSingleLine,
&parenCount, &error);
if (error) {
reportPCREError(cx, error);
return false;
}
return true;
#endif
}
inline bool

12
js/src/yarr/pcre/AUTHORS Normal file
View File

@ -0,0 +1,12 @@
Originally written by: Philip Hazel
Email local part: ph10
Email domain: cam.ac.uk
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2005 University of Cambridge. All rights reserved.
Adapted for JavaScriptCore and WebKit by Apple Inc.
Copyright (c) 2005, 2006, 2007 Apple Inc. All rights reserved.

35
js/src/yarr/pcre/COPYING Normal file
View File

@ -0,0 +1,35 @@
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
This is JavaScriptCore's variant of the PCRE library. While this library
started out as a copy of PCRE, many of the features of PCRE have been
removed.
Copyright (c) 1997-2005 University of Cambridge. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the name of Apple
Inc. nor the names of their contributors may be used to endorse or
promote products derived from this software without specific prior
written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,96 @@
/*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* This file is automatically written by the dftables auxiliary
program. If you edit it by hand, you might like to edit the Makefile to
prevent its ever being regenerated.
This file contains the default tables for characters with codes less than
128 (ASCII characters). These tables are used when no external tables are
passed to PCRE. */
const unsigned char jsc_pcre_default_tables[480] = {
/* This table is a lower casing table. */
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
/* This table is a case flipping table. */
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
/* This table contains bit maps for various character classes.
Each map is 32 bytes long and the bits run from the least
significant end of each byte. The classes are: space, digit, word. */
0x00, 0x3E, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03,
0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
/* This table identifies various classes of character by individual bits:
0x01 white space character
0x08 hexadecimal digit
0x10 alphanumeric or '_'
*/
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0- 7 */
0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, /* 8- 15 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 16- 23 */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 24- 31 */
0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* - ' */
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* ( - / */
0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, /* 0 - 7 */
0x18, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 8 - ? */
0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x10, /* @ - G */
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, /* H - O */
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, /* P - W */
0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, 0x10, /* X - _ */
0x00, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x10, /* ` - g */
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, /* h - o */
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, /* p - w */
0x10, 0x10, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00}; /* x -127 */
/* End of chartables.c */

273
js/src/yarr/pcre/dftables Normal file
View File

@ -0,0 +1,273 @@
#!/usr/bin/perl -w
#
# This is JavaScriptCore's variant of the PCRE library. While this library
# started out as a copy of PCRE, many of the features of PCRE have been
# removed. This library now supports only the regular expression features
# required by the JavaScript language specification, and has only the functions
# needed by JavaScriptCore and the rest of WebKit.
#
# Originally written by Philip Hazel
# Copyright (c) 1997-2006 University of Cambridge
# Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
#
# -----------------------------------------------------------------------------
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# * Neither the name of the University of Cambridge nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------
# This is a freestanding support program to generate a file containing
# character tables. The tables are built according to the default C
# locale.
use strict;
use File::Basename;
use File::Spec;
use File::Temp qw(tempfile);
use Getopt::Long;
sub readHeaderValues();
my %pcre_internal;
# Command line: dftables [--preprocessor=program] output-file
unless (@ARGV) {
    print STDERR "Usage: ", basename($0), " [--preprocessor=program] output-file\n";
    exit 1;
}

my $outputFile;
my $preprocessor;

# GetOptions strips the options it recognizes from @ARGV, leaving the output
# file name as the first remaining argument.
GetOptions('preprocessor=s' => \$preprocessor);
$preprocessor ||= "cpp";

$outputFile = $ARGV[0];
defined($outputFile) or die('Must specify output file.');

# Fill %pcre_internal before anything is written to the output file.
readHeaderValues();

open(OUT, ">", $outputFile) or die "$!";
binmode(OUT);
printf(OUT
"/*************************************************\n" .
"* Perl-Compatible Regular Expressions *\n" .
"*************************************************/\n\n" .
"/* This file is automatically written by the dftables auxiliary \n" .
"program. If you edit it by hand, you might like to edit the Makefile to \n" .
"prevent its ever being regenerated.\n\n");
printf(OUT
"This file contains the default tables for characters with codes less than\n" .
"128 (ASCII characters). These tables are used when no external tables are\n" .
"passed to PCRE. */\n\n" .
"const unsigned char jsc_pcre_default_tables[%d] = {\n\n" .
"/* This table is a lower casing table. */\n\n", $pcre_internal{tables_length});
if ($pcre_internal{lcc_offset} != 0) {
die "lcc_offset != 0";
}
printf(OUT " ");
for (my $i = 0; $i < 128; $i++) {
if (($i & 7) == 0 && $i != 0) {
printf(OUT "\n ");
}
printf(OUT "0x%02X", ord(lc(chr($i))));
if ($i != 127) {
printf(OUT ", ");
}
}
printf(OUT ",\n\n");
printf(OUT "/* This table is a case flipping table. */\n\n");
if ($pcre_internal{fcc_offset} != 128) {
die "fcc_offset != 128";
}
printf(OUT " ");
for (my $i = 0; $i < 128; $i++) {
if (($i & 7) == 0 && $i != 0) {
printf(OUT "\n ");
}
my $c = chr($i);
printf(OUT "0x%02X", $c =~ /[[:lower:]]/ ? ord(uc($c)) : ord(lc($c)));
if ($i != 127) {
printf(OUT ", ");
}
}
printf(OUT ",\n\n");
printf(OUT
"/* This table contains bit maps for various character classes.\n" .
"Each map is 32 bytes long and the bits run from the least\n" .
"significant end of each byte. The classes are: space, digit, word. */\n\n");
if ($pcre_internal{cbits_offset} != $pcre_internal{fcc_offset} + 128) {
die "cbits_offset != fcc_offset + 128";
}
# Build the class bitmaps: one bit per character code, least significant bit
# first within each byte, at the cbit_digit / cbit_word / cbit_space offsets.
my @cbit_table = (0) x $pcre_internal{cbit_length};

# Digit map: '0' through '9'.
for my $code (ord('0') .. ord('9')) {
    $cbit_table[$pcre_internal{cbit_digit} + int($code / 8)] |= 1 << ($code & 7);
}

# Word map: '_' plus every alphanumeric; space map: every [:space:] character.
$cbit_table[$pcre_internal{cbit_word} + int(ord('_') / 8)] |= 1 << (ord('_') & 7);
for my $code (0 .. 127) {
    my $char = chr($code);
    my $byteIndex = int($code / 8);
    my $bitMask = 1 << ($code & 7);
    $cbit_table[$pcre_internal{cbit_word} + $byteIndex] |= $bitMask if $char =~ /[[:alnum:]]/;
    $cbit_table[$pcre_internal{cbit_space} + $byteIndex] |= $bitMask if $char =~ /[[:space:]]/;
}
printf(OUT " ");
for (my $i = 0; $i < $pcre_internal{cbit_length}; $i++) {
if (($i & 7) == 0 && $i != 0) {
if (($i & 31) == 0) {
printf(OUT "\n");
}
printf(OUT "\n ");
}
printf(OUT "0x%02X", $cbit_table[$i]);
if ($i != $pcre_internal{cbit_length} - 1) {
printf(OUT ", ");
}
}
printf(OUT ",\n\n");
printf(OUT
"/* This table identifies various classes of character by individual bits:\n" .
" 0x%02x white space character\n" .
" 0x%02x hexadecimal digit\n" .
" 0x%02x alphanumeric or '_'\n*/\n\n",
$pcre_internal{ctype_space}, $pcre_internal{ctype_xdigit}, $pcre_internal{ctype_word});
if ($pcre_internal{ctypes_offset} != $pcre_internal{cbits_offset} + $pcre_internal{cbit_length}) {
die "ctypes_offset != cbits_offset + cbit_length";
}
printf(OUT " ");
for (my $i = 0; $i < 128; $i++) {
my $x = 0;
my $c = chr($i);
if ($c =~ /[[:space:]]/) {
$x += $pcre_internal{ctype_space};
}
if ($c =~ /[[:xdigit:]]/) {
$x += $pcre_internal{ctype_xdigit};
}
if ($c =~ /[[:alnum:]_]/) {
$x += $pcre_internal{ctype_word};
}
printf(OUT "0x%02X", $x);
if ($i != 127) {
printf(OUT ", ");
} else {
printf(OUT "};");
}
if (($i & 7) == 7) {
printf(OUT " /* ");
my $d = chr($i - 7);
if ($d =~ /[[:print:]]/) {
printf(OUT " %c -", $i - 7);
} else {
printf(OUT "%3d-", $i - 7);
}
if ($c =~ m/[[:print:]]/) {
printf(OUT " %c ", $i);
} else {
printf(OUT "%3d", $i);
}
printf(OUT " */\n");
if ($i != 127) {
printf(OUT " ");
}
}
}
if ($pcre_internal{tables_length} != $pcre_internal{ctypes_offset} + 128) {
die "tables_length != ctypes_offset + 128";
}
printf(OUT "\n\n/* End of chartables.c */\n");
close(OUT);
exit 0;
# Populate %pcre_internal with the offsets, lengths and flag bits that
# pcre_internal.h defines as macros.
#
# The header (looked up next to this script) is copied into a temporary file
# after a "#define DFTABLES" line, followed by one Perl assignment per name in
# @variables. Running that file through $preprocessor replaces each name with
# its macro value, and the preprocessed text is then evaluated as Perl.
#
# Dies if the header cannot be read, the preprocessor cannot be started or
# produces no output, or the preprocessed text is not valid Perl. The
# temporary file is removed on every path.
sub readHeaderValues()
{
    my @variables = qw(
        cbit_digit
        cbit_length
        cbit_space
        cbit_word
        cbits_offset
        ctype_space
        ctype_word
        ctype_xdigit
        ctypes_offset
        fcc_offset
        lcc_offset
        tables_length
    );

    # Slurp mode: each read below returns the whole stream at once.
    local $/ = undef;

    my $headerPath = File::Spec->catfile(dirname($0), "pcre_internal.h");

    my ($fh, $tempFile) = tempfile(
        basename($0) . "-XXXXXXXX",
        DIR => File::Spec->tmpdir(),
        SUFFIX => ".in",
        UNLINK => 0,
    );

    # Build and preprocess the temporary file inside an eval block so that the
    # file is removed below even when one of the steps dies.
    my $content = eval {
        print {$fh} "#define DFTABLES\n\n";

        open(my $header, "<", $headerPath) or die "Cannot read $headerPath: $!\n";
        print {$fh} readline($header);
        close($header);

        print {$fh} "\n\n";

        for my $v (@variables) {
            print {$fh} "\$pcre_internal{\"$v\"} = $v;\n";
        }

        close($fh) or die "Cannot write $tempFile: $!\n";

        # $preprocessor may carry its own arguments, so it is run via the shell.
        open(my $cpp, "$preprocessor \"$tempFile\" |") or die "Cannot run $preprocessor: $!\n";
        my $output = readline($cpp);
        close($cpp);
        my $status = $?;

        # Without this check an empty result would be evaluated silently and
        # the script would fail much later with a misleading offset error.
        die "Preprocessor '$preprocessor' produced no output (wait status $status)\n"
            unless defined($output) && length($output);

        $output;
    };
    my $failure = $@;

    unlink $tempFile;
    die $failure if $failure;

    eval $content;
    die "$@" if $@;
}

76
js/src/yarr/pcre/pcre.h Normal file
View File

@ -0,0 +1,76 @@
/* This is the public header file for JavaScriptCore's variant of the PCRE
library. While this library started out as a copy of PCRE, many of the
features of PCRE have been removed. This library now supports only the
regular expression features required by the JavaScript language
specification, and has only the functions needed by JavaScriptCore and the
rest of WebKit.
Copyright (c) 1997-2005 University of Cambridge
Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
// FIXME: This file needs to be renamed to JSRegExp.h; it's no longer PCRE.
#ifndef JSRegExp_h
#define JSRegExp_h
// Last remnants from the JSWTFBridge.
#include "assembler/wtf/Platform.h"
#include "jsstr.h"
#include "jsprvtd.h"
#include "jstl.h"
typedef jschar UChar;
typedef JSLinearString UString;
struct JSRegExp;
struct JSContext;
enum JSRegExpIgnoreCaseOption { JSRegExpDoNotIgnoreCase, JSRegExpIgnoreCase };
enum JSRegExpMultilineOption { JSRegExpSingleLine, JSRegExpMultiline };
/* jsRegExpExecute error codes */
const int JSRegExpErrorNoMatch = -1;
const int JSRegExpErrorHitLimit = -2;
const int JSRegExpErrorInternal = -4;
JSRegExp* jsRegExpCompile(
const UChar* pattern, int patternLength,
JSRegExpIgnoreCaseOption, JSRegExpMultilineOption,
unsigned* numSubpatterns, int *error);
int jsRegExpExecute(JSContext *, const JSRegExp*,
const UChar* subject, int subjectLength, int startOffset,
int* offsetsVector, int offsetsVectorLength);
void jsRegExpFree(JSRegExp*);
#endif

12
js/src/yarr/pcre/pcre.pri Normal file
View File

@ -0,0 +1,12 @@
# Perl Compatible Regular Expressions - Qt4 build info
VPATH += $$PWD
INCLUDEPATH += $$PWD $$OUTPUT_DIR/JavaScriptCore/tmp
DEPENDPATH += $$PWD
SOURCES += \
pcre_compile.cpp \
pcre_exec.cpp \
pcre_tables.cpp \
pcre_ucp_searchfuncs.cpp \
pcre_xclass.cpp

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,434 @@
/* This is JavaScriptCore's variant of the PCRE library. While this library
started out as a copy of PCRE, many of the features of PCRE have been
removed. This library now supports only the regular expression features
required by the JavaScript language specification, and has only the functions
needed by JavaScriptCore and the rest of WebKit.
Originally written by Philip Hazel
Copyright (c) 1997-2006 University of Cambridge
Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This header contains definitions that are shared between the different
modules, but which are not relevant to the exported API. This includes some
functions whose names all begin with "_pcre_". */
#ifndef PCRE_INTERNAL_H
#define PCRE_INTERNAL_H
/* Bit definitions for entries in the pcre_ctypes table. */
#define ctype_space 0x01
#define ctype_xdigit 0x08
#define ctype_word 0x10 /* alphameric or '_' */
/* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
of bits for a class map. Some classes are built by combining these tables. */
#define cbit_space 0 /* \s */
#define cbit_digit 32 /* \d */
#define cbit_word 64 /* \w */
#define cbit_length 96 /* Length of the cbits table */
/* Offsets of the various tables from the base tables pointer, and
total length. */
#define lcc_offset 0
#define fcc_offset 128
#define cbits_offset 256
#define ctypes_offset (cbits_offset + cbit_length)
#define tables_length (ctypes_offset + 128)
#ifndef DFTABLES
#include "pcre.h"
/* The value of LINK_SIZE determines the number of bytes used to store links as
offsets within the compiled regex. The default is 2, which allows for compiled
patterns up to 64K long. */
#define LINK_SIZE 3
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
inline, and there are *still* stupid compilers about that don't like indented
pre-processor statements, or at least there were when I first wrote this. After
all, it had only been about 10 years then... */
#ifdef DEBUG
#define DPRINTF(p) /*printf p; fflush(stdout);*/
#else
#define DPRINTF(p) /*nothing*/
#endif
/* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored
in big-endian order) by default. These are used, for example, to link from the
start of a subpattern to its alternatives and its end. The use of 2 bytes per
offset limits the size of the compiled regex to around 64K, which is big enough
for almost everybody. However, I received a request for an even bigger limit.
For this reason, and also to make the code easier to maintain, the storing and
loading of offsets from the byte string is now handled by the functions that are
defined here. */
/* PCRE uses some other 2-byte quantities that do not change when the size of
offsets changes. These are used for repeat counts and for other things such as
capturing parenthesis numbers in back references. */
/* Store |value| (0..0xFFFF) in opcodePtr[0..1], most significant byte first. */
static inline void put2ByteValue(unsigned char* opcodePtr, int value)
{
    JS_ASSERT(value >= 0 && value <= 0xFFFF);
    opcodePtr[1] = static_cast<unsigned char>(value);
    opcodePtr[0] = static_cast<unsigned char>(value >> 8);
}
/* Store |value| (0..0xFFFFFF) in opcodePtr[0..2], most significant byte first. */
static inline void put3ByteValue(unsigned char* opcodePtr, int value)
{
    JS_ASSERT(value >= 0 && value <= 0xFFFFFF);
    opcodePtr[2] = static_cast<unsigned char>(value);
    opcodePtr[1] = static_cast<unsigned char>(value >> 8);
    opcodePtr[0] = static_cast<unsigned char>(value >> 16);
}
/* Read the two bytes at opcodePtr[0..1] as one value, most significant byte first. */
static inline int get2ByteValue(const unsigned char* opcodePtr)
{
    const int highByte = opcodePtr[0];
    const int lowByte = opcodePtr[1];
    return (highByte << 8) | lowByte;
}
/* Read the three bytes at opcodePtr[0..2] as one value, most significant byte first. */
static inline int get3ByteValue(const unsigned char* opcodePtr)
{
    int value = opcodePtr[0];
    value = (value << 8) | opcodePtr[1];
    value = (value << 8) | opcodePtr[2];
    return value;
}
/* Write |value| as two bytes at |opcodePtr| and move |opcodePtr| past them. */
static inline void put2ByteValueAndAdvance(unsigned char*& opcodePtr, int value)
{
    unsigned char* const destination = opcodePtr;
    put2ByteValue(destination, value);
    opcodePtr = destination + 2;
}
/* Write |value| as three bytes at |opcodePtr| and move |opcodePtr| past them. */
static inline void put3ByteValueAndAdvance(unsigned char*& opcodePtr, int value)
{
    unsigned char* const destination = opcodePtr;
    put3ByteValue(destination, value);
    opcodePtr = destination + 3;
}
/*
 * Store a link offset of LINK_SIZE bytes at |opcodePtr|; zero is accepted.
 * Only LINK_SIZE values of 2 and 3 are supported.
 */
static inline void putLinkValueAllowZero(unsigned char* opcodePtr, int value)
{
#if LINK_SIZE == 2
    put2ByteValue(opcodePtr, value);
#elif LINK_SIZE == 3
    put3ByteValue(opcodePtr, value);
#else
#   error LINK_SIZE not supported.
#endif
}
/*
 * Read a link offset of LINK_SIZE bytes from |opcodePtr|; the result may be
 * zero. Only LINK_SIZE values of 2 and 3 are supported.
 */
static inline int getLinkValueAllowZero(const unsigned char* opcodePtr)
{
#if LINK_SIZE == 2
    return get2ByteValue(opcodePtr);
#elif LINK_SIZE == 3
    return get3ByteValue(opcodePtr);
#else
#   error LINK_SIZE not supported.
#endif
}
// Derived by empirical testing of compile time in PCRE and WREC.
// Parenthesized so the macro expands as a single value inside larger
// expressions (for example next to '%', '/' or a unary operator).
#define MAX_PATTERN_SIZE (4096 * 1024)
JS_STATIC_ASSERT(MAX_PATTERN_SIZE < (1 << (8 * LINK_SIZE)));
/* Store a link offset at |opcodePtr|; asserts that |value| is nonzero. */
static inline void putLinkValue(unsigned char* opcodePtr, int value)
{
JS_ASSERT(value);
putLinkValueAllowZero(opcodePtr, value);
}
/* Read the link offset at |opcodePtr|; asserts that the stored value is nonzero. */
static inline int getLinkValue(const unsigned char* opcodePtr)
{
int value = getLinkValueAllowZero(opcodePtr);
JS_ASSERT(value);
return value;
}
/* Write a nonzero link offset at |opcodePtr| and move |opcodePtr| LINK_SIZE bytes forward. */
static inline void putLinkValueAndAdvance(unsigned char*& opcodePtr, int value)
{
    unsigned char* const destination = opcodePtr;
    putLinkValue(destination, value);
    opcodePtr = destination + LINK_SIZE;
}
/* Write a link offset (zero allowed) at |opcodePtr| and move |opcodePtr| LINK_SIZE bytes forward. */
static inline void putLinkValueAllowZeroAndAdvance(unsigned char*& opcodePtr, int value)
{
    unsigned char* const destination = opcodePtr;
    putLinkValueAllowZero(destination, value);
    opcodePtr = destination + LINK_SIZE;
}
// FIXME: These are really more of a "compiled regexp state" than "regexp options"
/* Bit flags; each enumerator occupies its own bit so they can be OR-ed together. */
enum RegExpOptions {
    IgnoreCaseOption = 1 << 0,
    MatchAcrossMultipleLinesOption = 1 << 1,
    IsAnchoredOption = 1 << 25, /* can't use partial with this regex */
    UseMultiLineFirstByteOptimizationOption = 1 << 28, /* start after \n for multiline */
    UseRequiredByteOptimizationOption = 1 << 29, /* reqByte is set */
    UseFirstByteOptimizationOption = 1 << 30 /* firstByte is set */
};
/* Flags added to firstByte or reqByte; a "non-literal" item is either a
variable-length repeat, or anything other than literal characters. */
#define REQ_IGNORE_CASE 0x0100 /* indicates should ignore case */
#define REQ_VARY 0x0200 /* reqByte followed non-literal item */
/* Miscellaneous definitions */
/* Flag bits and data types for the extended class (OP_XCLASS) for classes that
contain UTF-8 characters with values greater than 255. */
#define XCL_NOT 0x01 /* Flag: this is a negative class */
#define XCL_MAP 0x02 /* Flag: a 32-byte map is present */
#define XCL_END 0 /* Marks end of individual items */
#define XCL_SINGLE 1 /* Single item (one multibyte char) follows */
#define XCL_RANGE 2 /* A range (two multibyte chars) follows */
/* These are escaped items that aren't just an encoding of a particular data
value such as \n. They must have non-zero values, as check_escape() returns
their negation. Also, they must appear in the same order as in the opcode
definitions below, up to ESC_w. The final one must be
ESC_REF as subsequent values are used for \1, \2, \3, etc. There are two
tests in the code for an escape > ESC_b and <= ESC_w to
detect the types that may be repeated. These are the types that consume
characters. If any new escapes are put in between that don't consume a
character, that code will have to change. */
enum { ESC_B = 1, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, ESC_REF };
/* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
OP_WORDCHAR must correspond in order to the list of escapes immediately above
(ESC_B .. ESC_w).
FOR_EACH_OPCODE is an X-macro: it expands 'macro(NAME)' once per opcode,
separated by commas, so the same list can generate the enum below.
Note that whenever this list is updated, the two macro definitions that follow
must also be updated to match.
NOTE(review): only OPCODE_ENUM_VALUE is visible right after the list here;
confirm which second macro this sentence refers to. */
#define FOR_EACH_OPCODE(macro) \
macro(END) \
\
, macro(NOT_WORD_BOUNDARY) \
, macro(WORD_BOUNDARY) \
, macro(NOT_DIGIT) \
, macro(DIGIT) \
, macro(NOT_WHITESPACE) \
, macro(WHITESPACE) \
, macro(NOT_WORDCHAR) \
, macro(WORDCHAR) \
\
, macro(NOT_NEWLINE) \
\
, macro(CIRC) \
, macro(DOLL) \
, macro(BOL) \
, macro(EOL) \
, macro(CHAR) \
, macro(CHAR_IGNORING_CASE) \
, macro(ASCII_CHAR) \
, macro(ASCII_LETTER_IGNORING_CASE) \
, macro(NOT) \
\
, macro(STAR) \
, macro(MINSTAR) \
, macro(PLUS) \
, macro(MINPLUS) \
, macro(QUERY) \
, macro(MINQUERY) \
, macro(UPTO) \
, macro(MINUPTO) \
, macro(EXACT) \
\
, macro(NOTSTAR) \
, macro(NOTMINSTAR) \
, macro(NOTPLUS) \
, macro(NOTMINPLUS) \
, macro(NOTQUERY) \
, macro(NOTMINQUERY) \
, macro(NOTUPTO) \
, macro(NOTMINUPTO) \
, macro(NOTEXACT) \
\
, macro(TYPESTAR) \
, macro(TYPEMINSTAR) \
, macro(TYPEPLUS) \
, macro(TYPEMINPLUS) \
, macro(TYPEQUERY) \
, macro(TYPEMINQUERY) \
, macro(TYPEUPTO) \
, macro(TYPEMINUPTO) \
, macro(TYPEEXACT) \
\
, macro(CRSTAR) \
, macro(CRMINSTAR) \
, macro(CRPLUS) \
, macro(CRMINPLUS) \
, macro(CRQUERY) \
, macro(CRMINQUERY) \
, macro(CRRANGE) \
, macro(CRMINRANGE) \
\
, macro(CLASS) \
, macro(NCLASS) \
, macro(XCLASS) \
\
, macro(REF) \
\
, macro(ALT) \
, macro(KET) \
, macro(KETRMAX) \
, macro(KETRMIN) \
\
, macro(ASSERT) \
, macro(ASSERT_NOT) \
\
, macro(BRAZERO) \
, macro(BRAMINZERO) \
, macro(BRANUMBER) \
, macro(BRA)
/* Turns an opcode name from the list into its enumerator, e.g. BRA -> OP_BRA. */
#define OPCODE_ENUM_VALUE(opcode) OP_##opcode
/* The opcode enumerators, numbered from OP_END == 0 in list order. */
enum { FOR_EACH_OPCODE(OPCODE_ENUM_VALUE) };
/* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and
study.c that all opcodes are less than 128 in value. This makes handling UTF-8
character sequences easier. */
/* The highest extraction number before we have to start using additional
bytes. (Originally PCRE didn't have support for extraction counts higher than
this number.) The value is limited by the number of opcodes left after OP_BRA,
i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional
opcodes. */
/* FIXME: Note that OP_BRA + 100 is > 128 (OP_BRA is the 68th entry of the
opcode list, i.e. 67), so the two comments above are in conflict! */
#define EXTRACT_BASIC_MAX 100
/* Header of a compiled regular expression.
The code vector runs on as long as necessary after the end, so the field
layout must not be changed casually. */
struct JSRegExp {
/* Bit flags. NOTE(review): presumably the ...Option enumerators declared
above (IsAnchoredOption, IgnoreCaseOption, ...) - confirm against the compiler. */
unsigned options;
/* NOTE(review): by their names, the highest bracket number and the highest
back reference seen in the pattern - confirm against the compiler. */
unsigned short topBracket;
unsigned short topBackref;
/* Per the REQ_* comment above, REQ_IGNORE_CASE / REQ_VARY may be added to
these two values. */
unsigned short firstByte;
unsigned short reqByte;
};
/* Internal shared data tables. These are tables that are used by more than one
of the exported public functions. They have to be "external" in the C sense,
but are not part of the PCRE public API. The data for the UTF-8 tables is in
the pcre_tables.cpp module, which also pulls in chartables.c.
NOTE(review): jsc_pcre_default_tables is presumably defined by chartables.c;
confirm. */
/* Number of entries in jsc_pcre_utf8_table1 (keep in sync with the array). */
#define jsc_pcre_utf8_table1_size 6
extern const int jsc_pcre_utf8_table1[6];
extern const int jsc_pcre_utf8_table2[6];
extern const int jsc_pcre_utf8_table3[6];
extern const unsigned char jsc_pcre_utf8_table4[0x40];
extern const unsigned char jsc_pcre_default_tables[tables_length];
/* Returns the entry for c in the sub-table that starts lcc_offset bytes into
jsc_pcre_default_tables. */
static inline unsigned char toLowerCase(unsigned char c)
{
    return (jsc_pcre_default_tables + lcc_offset)[c];
}
/* Returns the entry for c in the sub-table that starts fcc_offset bytes into
jsc_pcre_default_tables. */
static inline unsigned char flipCase(unsigned char c)
{
    const unsigned char* const caseFlipTable = jsc_pcre_default_tables + fcc_offset;
    return *(caseFlipTable + c);
}
/* Returns byte number c of the sub-table that starts cbits_offset bytes into
jsc_pcre_default_tables. */
static inline unsigned char classBitmapForChar(unsigned char c)
{
    const unsigned char* const bitmapBase = jsc_pcre_default_tables + cbits_offset;
    return bitmapBase[c];
}
/* Returns the ctype_* flag byte for c, read from the sub-table that starts
ctypes_offset bytes into jsc_pcre_default_tables. */
static inline unsigned char charTypeForChar(unsigned char c)
{
    return (jsc_pcre_default_tables + ctypes_offset)[c];
}
/* True when c is below 128 and its character-type entry has ctype_word set. */
static inline bool isWordChar(UChar c)
{
    if (c >= 128)
        return false;
    return (charTypeForChar(c) & ctype_word) != 0;
}
/* True for U+00A0, and for any c below 128 whose character-type entry has
ctype_space set. */
static inline bool isSpaceChar(UChar c)
{
    if (c == 0x00A0)
        return true;
    if (c >= 128)
        return false;
    return (charTypeForChar(c) & ctype_space) != 0;
}
/* True when nl is one of the four line terminators recognised here:
U+000A, U+000D, U+2028 or U+2029. */
static inline bool isNewline(UChar nl)
{
    switch (nl) {
    case 0xA:
    case 0xD:
    case 0x2028:
    case 0x2029:
        return true;
    default:
        return false;
    }
}
/* True for every opcode at or above OP_BRA, and also for the two assertion
opcodes OP_ASSERT and OP_ASSERT_NOT. */
static inline bool isBracketStartOpcode(unsigned char opcode)
{
    return opcode >= OP_BRA || opcode == OP_ASSERT || opcode == OP_ASSERT_NOT;
}
/* Moves opcodePtr forward by the link value stored just after the opcode it
points at, and keeps doing so while the new position is an OP_ALT. On entry
opcodePtr must point at a bracket-start opcode or at an OP_ALT (asserted). */
static inline void advanceToEndOfBracket(const unsigned char*& opcodePtr)
{
    JS_ASSERT(isBracketStartOpcode(*opcodePtr) || *opcodePtr == OP_ALT);

    /* The first hop is unconditional; later hops follow the OP_ALT chain. */
    opcodePtr += getLinkValue(opcodePtr + 1);
    while (*opcodePtr == OP_ALT) {
        opcodePtr += getLinkValue(opcodePtr + 1);
    }
}
/* Internal shared functions. These are functions that are used in more
than one of the source files. They have to have external linkage, but
are not part of the public API and so not exported from the library. */
/* Returns the other-case partner of a character, or -1 if it has none. */
extern int jsc_pcre_ucp_othercase(unsigned);
/* Returns true if the character matches the XCLASS data pointed to. */
extern bool jsc_pcre_xclass(int, const unsigned char*);
#endif
#endif
/* End of pcre_internal.h */

View File

@ -0,0 +1,71 @@
/* This is JavaScriptCore's variant of the PCRE library. While this library
started out as a copy of PCRE, many of the features of PCRE have been
removed. This library now supports only the regular expression features
required by the JavaScript language specification, and has only the functions
needed by JavaScriptCore and the rest of WebKit.
Originally written by Philip Hazel
Copyright (c) 1997-2006 University of Cambridge
Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains some fixed tables that are used by more than one of the
PCRE code modules. */
#include "pcre_internal.h"
/*************************************************
* Tables for UTF-8 support *
*************************************************/
/* Breakpoints between the different byte counts of a UTF-8 character:
entry i is the largest value that fits in i + 1 bytes. */
const int jsc_pcre_utf8_table1[6] = {
    0x0000007f, /* 1 byte  */
    0x000007ff, /* 2 bytes */
    0x0000ffff, /* 3 bytes */
    0x001fffff, /* 4 bytes */
    0x03ffffff, /* 5 bytes */
    0x7fffffff  /* 6 bytes */
};
/* Both tables are indexed by the number of additional bytes in the character.
table2 holds the indicator bits to set in the first byte. */
const int jsc_pcre_utf8_table2[6] = {
    0x00,
    0xc0,
    0xe0,
    0xf0,
    0xf8,
    0xfc
};
/* table3 holds the mask for the data bits of the first byte. */
const int jsc_pcre_utf8_table3[6] = {
    0xff,
    0x1f,
    0x0f,
    0x07,
    0x03,
    0x01
};
/* Number of extra characters that follow a lead byte, indexed by the lead byte
masked with 0x3f. The highest index a valid UTF-8 character can produce is in
fact 0x3d. */
const unsigned char jsc_pcre_utf8_table4[0x40] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x30 - 0x3f */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
#include "chartables.c"

View File

@ -0,0 +1,98 @@
/* This is JavaScriptCore's variant of the PCRE library. While this library
started out as a copy of PCRE, many of the features of PCRE have been
removed. This library now supports only the regular expression features
required by the JavaScript language specification, and has only the functions
needed by JavaScriptCore and the rest of WebKit.
Originally written by Philip Hazel
Copyright (c) 1997-2006 University of Cambridge
Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains code for searching the table of Unicode character
properties. */
#include "pcre_internal.h"
#include "ucpinternal.h" /* Internal table details */
#include "ucptable.cpp" /* The table itself */
/*************************************************
* Search table and return other case *
*************************************************/
/* Looks c up in ucp_table and returns its other-case partner.
Arguments:
c the character value
Returns: c plus the signed case offset stored in the matching table entry, or
-1 when c is not in the table, falls inside a range entry, or has a
zero offset (no partner).
*/
int jsc_pcre_ucp_othercase(unsigned c)
{
    int lo = 0;
    int hi = sizeof(ucp_table) / sizeof(cnode);
    int hit = -1;

    /* Binary chop over the table. The masked expressions are deliberately
    repeated rather than cached in locals: the original authors measured the
    cached form as slower (gcc 3.4.4 on Linux). */
    while (hit < 0 && lo < hi) {
        int probe = (lo + hi) >> 1;
        if (c == (ucp_table[probe].f0 & f0_charmask))
            hit = probe;
        else if (c < (ucp_table[probe].f0 & f0_charmask))
            hi = probe;
        else if ((ucp_table[probe].f0 & f0_rangeflag) && (c <= (ucp_table[probe].f0 & f0_charmask) + (ucp_table[probe].f1 & f1_rangemask)))
            hit = probe;
        else
            lo = probe + 1;
    }

    /* Search interval became empty: the character is not in the table. */
    if (hit < 0)
        return -1;

    /* Range entries carry a range length in f1, not a case offset. */
    if (ucp_table[hit].f0 & f0_rangeflag)
        return -1;

    /* Sign-extend the 16-bit case offset before applying it. */
    int delta = ucp_table[hit].f1 & f1_casemask;
    if (delta & f1_caseneg)
        delta |= f1_caseneg;
    return delta ? c + delta : -1;
}

View File

@ -0,0 +1,114 @@
/* This is JavaScriptCore's variant of the PCRE library. While this library
started out as a copy of PCRE, many of the features of PCRE have been
removed. This library now supports only the regular expression features
required by the JavaScript language specification, and has only the functions
needed by JavaScriptCore and the rest of WebKit.
Originally written by Philip Hazel
Copyright (c) 1997-2006 University of Cambridge
Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/* This module contains an internal function that is used to match an extended
class (one that contains characters whose values are > 255). */
#include "pcre_internal.h"
/*************************************************
* Match character against an XCLASS *
*************************************************/
/* This function is called to match a character against an extended class that
might contain values > 255.
Arguments:
c the character
data points to the flag byte of the XCLASS data
Returns: true if character matches, else false
*/
/* Decodes the UTF-8 character at subjectPtr into c and leaves subjectPtr just
past it. Only called when the data is known to be UTF-8; no validation is done
on the continuation bytes. */
static inline void getUTF8CharAndAdvancePointer(int& c, const unsigned char*& subjectPtr)
{
    c = *subjectPtr++;

    /* Anything without both top bits set is a complete one-byte character. */
    if ((c & 0xc0) != 0xc0)
        return;

    int extraBytes = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */
    int shift = 6 * extraBytes;
    c = (c & jsc_pcre_utf8_table3[extraBytes]) << shift;

    /* Each following byte contributes its low six bits, high part first. */
    for (; extraBytes > 0; --extraBytes) {
        shift -= 6;
        c |= (*subjectPtr++ & 0x3f) << shift;
    }
}
/* Tests a character against XCLASS data.
Arguments:
c the character
data points to the flag byte of the XCLASS data
Returns: true if the character matches the class (with XCL_NOT taken into
account), else false
*/
bool jsc_pcre_xclass(int c, const unsigned char* data)
{
    const unsigned char flags = *data++;
    const bool negated = (flags & XCL_NOT) != 0;

    /* When a bitmap is present it covers character values below 256. A miss
    there is not final, because the item list may hold ranges that start
    below 256. Either way the 32-byte map is skipped afterwards. */
    if ((flags & XCL_MAP) != 0) {
        if (c < 256 && (data[c / 8] & (1 << (c & 7))) != 0)
            return !negated; /* char found */
        data += 32;
    }

    /* Walk the item list until the XCL_END marker. */
    for (int itemType = *data++; itemType != XCL_END; itemType = *data++) {
        int low;
        int high;
        switch (itemType) {
        case XCL_SINGLE:
            getUTF8CharAndAdvancePointer(low, data);
            if (c == low)
                return !negated;
            break;
        case XCL_RANGE:
            getUTF8CharAndAdvancePointer(low, data);
            getUTF8CharAndAdvancePointer(high, data);
            if (c >= low && c <= high)
                return !negated;
            break;
        default:
            /* Any other item byte is stepped over without reading operands. */
            break;
        }
    }

    return negated; /* char did not match */
}

View File

@ -0,0 +1,126 @@
/* This is JavaScriptCore's variant of the PCRE library. While this library
started out as a copy of PCRE, many of the features of PCRE have been
removed. This library now supports only the regular expression features
required by the JavaScript language specification, and has only the functions
needed by JavaScriptCore and the rest of WebKit.
Originally written by Philip Hazel
Copyright (c) 1997-2006 University of Cambridge
Copyright (C) 2002, 2004, 2006, 2007 Apple Inc. All rights reserved.
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the University of Cambridge nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
-----------------------------------------------------------------------------
*/
/*************************************************
* Unicode Property Table handler *
*************************************************/
/* Internal header file defining the layout of the bits in each pair of 32-bit
words that form a data item in the table. */
/* One entry of the Unicode property table: a pair of unsigned words whose bit
layout is given by the f0_ and f1_ masks that follow. */
typedef struct cnode {
unsigned f0; /* script number, range flag and code point */
unsigned f1; /* character type, plus a range or other-case offset */
} cnode;
/* Things for the f0 field */
#define f0_scriptmask 0xff000000 /* Mask for script field */
#define f0_scriptshift 24 /* Shift for script value */
/* The range flag is the single 0x00800000 bit. The 0x00600000 bits are spare
and 0x001fffff is the code point, so the flag mask must not reach below bit 23.
It was previously 0x00f00000, which overlapped f0_charmask at bit 0x00100000
and would have classified a single-character entry at or above U+100000 as a
range. */
#define f0_rangeflag 0x00800000 /* Flag for a range item */
#define f0_charmask 0x001fffff /* Mask for code point value */
/* Things for the f1 field */
#define f1_typemask 0xfc000000 /* Mask for char type field */
#define f1_typeshift 26 /* Shift for the type field */
#define f1_rangemask 0x0000ffff /* Mask for a range offset */
#define f1_casemask 0x0000ffff /* Mask for a case offset */
/* OR-ing this in sign-extends a 16-bit case offset whose top bit (0x8000) is
set. */
#define f1_caseneg 0xffff8000 /* Bits for negation */
/* The data consists of a vector of structures of type cnode. The two unsigned
32-bit integers are used as follows:
(f0) (1) The most significant byte holds the script number. The numbers are
defined by the enum in ucp.h.
(2) The 0x00800000 bit is set if this entry defines a range of characters.
It is not set if this entry defines a single character
(3) The 0x00600000 bits are spare.
(4) The 0x001fffff bits contain the code point. No Unicode code point will
ever be greater than 0x0010ffff, so this should be OK for ever.
(f1) (1) The 0xfc000000 bits contain the character type number. The numbers are
defined by an enum in ucp.h.
(2) The 0x03ff0000 bits are spare.
(3) The 0x0000ffff bits contain EITHER the unsigned offset to the top of
range if this entry defines a range, OR the *signed* offset to the
character's "other case" partner if this entry defines a single
character. There is no partner if the value is zero.
-------------------------------------------------------------------------------
| script (8) |.|.|.| codepoint (21) || type (6) |.|.| spare (8) | offset (16) |
-------------------------------------------------------------------------------
| | | | |
| | |-> spare | |-> spare
| | |
| |-> spare |-> spare
|
|-> range flag
The upper/lower casing information is set only for characters that come in
pairs. The non-one-to-one mappings in the Unicode data are ignored.
When searching the data, proceed as follows:
(1) Set up for a binary chop search.
(2) If the top is not greater than the bottom, the character is not in the
table. Its type must therefore be "Cn" ("Undefined").
(3) Find the middle vector element.
(4) Extract the code point and compare. If equal, we are done.
(5) If the test character is smaller, set the top to the current point, and
goto (2).
(6) If the current entry defines a range, compute the last character by adding
the offset, and see if the test character is within the range. If it is,
we are done.
(7) Otherwise, set the bottom to one element past the current point and goto
(2).
*/
/* End of ucpinternal.h */

File diff suppressed because it is too large Load Diff