/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- * vim: set sw=4 ts=8 et tw=78: * * ***** BEGIN LICENSE BLOCK ***** * Version: MPL 1.1/GPL 2.0/LGPL 2.1 * * The contents of this file are subject to the Mozilla Public License Version * 1.1 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * http://www.mozilla.org/MPL/ * * Software distributed under the License is distributed on an "AS IS" basis, * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License * for the specific language governing rights and limitations under the * License. * * The Original Code is Mozilla Communicator client code, released * March 31, 1998. * * The Initial Developer of the Original Code is * Netscape Communications Corporation. * Portions created by the Initial Developer are Copyright (C) 1998 * the Initial Developer. All Rights Reserved. * * Contributor(s): * * Alternatively, the contents of this file may be used under the terms of * either of the GNU General Public License Version 2 or later (the "GPL"), * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), * in which case the provisions of the GPL or the LGPL are applicable instead * of those above. If you wish to allow use of your version of this file only * under the terms of either the GPL or the LGPL, and not to allow others to * use your version of this file under the terms of the MPL, indicate your * decision by deleting the provisions above and replace them with the notice * and other provisions required by the GPL or the LGPL. If you do not delete * the provisions above, a recipient may use your version of this file under * the terms of any one of the MPL, the GPL or the LGPL. * * ***** END LICENSE BLOCK ***** */ /* * JS regular expressions, after Perl. 
*/ #include #include #include #include "jstypes.h" #include "jsstdint.h" #include "jsarena.h" /* Added by JSIFY */ #include "jsutil.h" /* Added by JSIFY */ #include "jsapi.h" #include "jsarray.h" #include "jsatom.h" #include "jsbuiltins.h" #include "jscntxt.h" #include "jsversion.h" #include "jsfun.h" #include "jsgc.h" #include "jsinterp.h" #include "jslock.h" #include "jsnum.h" #include "jsobj.h" #include "jsopcode.h" #include "jsregexp.h" #include "jsscan.h" #include "jsscope.h" #include "jsstaticcheck.h" #include "jsstr.h" #include "jsvector.h" #ifdef JS_TRACER #include "jstracer.h" using namespace avmplus; using namespace nanojit; #endif #include "jsobjinlines.h" using namespace js; typedef enum REOp { #define REOP_DEF(opcode, name) opcode, #include "jsreops.tbl" #undef REOP_DEF REOP_LIMIT /* META: no operator >= to this */ } REOp; #define REOP_IS_SIMPLE(op) ((op) <= REOP_NCLASS) #ifdef REGEXP_DEBUG const char *reop_names[] = { #define REOP_DEF(opcode, name) name, #include "jsreops.tbl" #undef REOP_DEF NULL }; #endif #ifdef __GNUC__ static int re_debug(const char *fmt, ...) __attribute__ ((format(printf, 1, 2))); #endif #ifdef REGEXP_DEBUG static int re_debug(const char *fmt, ...) { va_list ap; int retval; va_start(ap, fmt); retval = vprintf(fmt, ap); va_end(ap); return retval; } static void re_debug_chars(const jschar *chrs, size_t length) { int i = 0; printf(" \""); while (*chrs && i++ < length) { putchar((char)*chrs++); } printf("\""); } #else /* !REGEXP_DEBUG */ /* This should be optimized to a no-op by our tier-1 compilers. */ static int re_debug(const char *fmt, ...) { return 0; } static void re_debug_chars(const jschar *chrs, size_t length) { } #endif /* !REGEXP_DEBUG */ struct RENode { REOp op; /* r.e. 
op bytecode */ RENode *next; /* next in concatenation order */ void *kid; /* first operand */ union { void *kid2; /* second operand */ jsint num; /* could be a number */ size_t parenIndex; /* or a parenthesis index */ struct { /* or a quantifier range */ uintN min; uintN max; JSPackedBool greedy; } range; struct { /* or a character class */ size_t startIndex; size_t kidlen; /* length of string at kid, in jschars */ size_t index; /* index into class list */ uint16 bmsize; /* bitmap size, based on max char code */ JSPackedBool sense; } ucclass; struct { /* or a literal sequence */ jschar chr; /* of one character */ size_t length; /* or many (via the kid) */ } flat; struct { RENode *kid2; /* second operand from ALT */ jschar ch1; /* match char for ALTPREREQ */ jschar ch2; /* ditto, or class index for ALTPREREQ2 */ } altprereq; } u; }; #define RE_IS_LETTER(c) (((c >= 'A') && (c <= 'Z')) || \ ((c >= 'a') && (c <= 'z')) ) #define RE_IS_LINE_TERM(c) ((c == '\n') || (c == '\r') || \ (c == LINE_SEPARATOR) || (c == PARA_SEPARATOR)) #define CLASS_CACHE_SIZE 4 typedef struct CompilerState { JSContext *context; TokenStream *tokenStream; /* For reporting errors */ const jschar *cpbegin; const jschar *cpend; const jschar *cp; size_t parenCount; size_t classCount; /* number of [] encountered */ size_t treeDepth; /* maximum depth of parse tree */ size_t progLength; /* estimated bytecode length */ RENode *result; size_t classBitmapsMem; /* memory to hold all class bitmaps */ struct { const jschar *start; /* small cache of class strings */ size_t length; /* since they're often the same */ size_t index; } classCache[CLASS_CACHE_SIZE]; uint16 flags; } CompilerState; typedef struct EmitStateStackEntry { jsbytecode *altHead; /* start of REOP_ALT* opcode */ jsbytecode *nextAltFixup; /* fixup pointer to next-alt offset */ jsbytecode *nextTermFixup; /* fixup ptr. to REOP_JUMP offset */ jsbytecode *endTermFixup; /* fixup ptr. 
to REOPT_ALTPREREQ* offset */ RENode *continueNode; /* original REOP_ALT* node being stacked */ jsbytecode continueOp; /* REOP_JUMP or REOP_ENDALT continuation */ JSPackedBool jumpToJumpFlag; /* true if we've patched jump-to-jump to avoid 16-bit unsigned offset overflow */ } EmitStateStackEntry; /* * Immediate operand sizes and getter/setters. Unlike the ones in jsopcode.h, * the getters and setters take the pc of the offset, not of the opcode before * the offset. */ #define ARG_LEN 2 #define GET_ARG(pc) ((uint16)(((pc)[0] << 8) | (pc)[1])) #define SET_ARG(pc, arg) ((pc)[0] = (jsbytecode) ((arg) >> 8), \ (pc)[1] = (jsbytecode) (arg)) #define OFFSET_LEN ARG_LEN #define OFFSET_MAX (JS_BIT(ARG_LEN * 8) - 1) #define GET_OFFSET(pc) GET_ARG(pc) /* * Maximum supported tree depth is maximum size of EmitStateStackEntry stack. * For sanity, we limit it to 2^24 bytes. */ #define TREE_DEPTH_MAX (JS_BIT(24) / sizeof(EmitStateStackEntry)) /* * The maximum memory that can be allocated for class bitmaps. * For sanity, we limit it to 2^24 bytes. */ #define CLASS_BITMAPS_MEM_LIMIT JS_BIT(24) /* * Functions to get size and write/read bytecode that represent small indexes * compactly. * Each byte in the code represent 7-bit chunk of the index. 8th bit when set * indicates that the following byte brings more bits to the index. Otherwise * this is the last byte in the index bytecode representing highest index bits. */ static size_t GetCompactIndexWidth(size_t index) { size_t width; for (width = 1; (index >>= 7) != 0; ++width) { } return width; } static JS_ALWAYS_INLINE jsbytecode * WriteCompactIndex(jsbytecode *pc, size_t index) { size_t next; while ((next = index >> 7) != 0) { *pc++ = (jsbytecode)(index | 0x80); index = next; } *pc++ = (jsbytecode)index; return pc; } static JS_ALWAYS_INLINE jsbytecode * ReadCompactIndex(jsbytecode *pc, size_t *result) { size_t nextByte; nextByte = *pc++; if ((nextByte & 0x80) == 0) { /* * Short-circuit the most common case when compact index <= 127. 
*/ *result = nextByte; } else { size_t shift = 7; *result = 0x7F & nextByte; do { nextByte = *pc++; *result |= (nextByte & 0x7F) << shift; shift += 7; } while ((nextByte & 0x80) != 0); } return pc; } typedef struct RECapture { ptrdiff_t index; /* start of contents, -1 for empty */ size_t length; /* length of capture */ } RECapture; typedef struct REMatchState { const jschar *cp; RECapture parens[1]; /* first of 're->parenCount' captures, allocated at end of this struct */ } REMatchState; struct REBackTrackData; typedef struct REProgState { jsbytecode *continue_pc; /* current continuation data */ jsbytecode continue_op; ptrdiff_t index; /* progress in text */ size_t parenSoFar; /* highest indexed paren started */ union { struct { uintN min; /* current quantifier limits */ uintN max; } quantifier; struct { size_t top; /* backtrack stack state */ size_t sz; } assertion; } u; } REProgState; typedef struct REBackTrackData { size_t sz; /* size of previous stack entry */ jsbytecode *backtrack_pc; /* where to backtrack to */ jsbytecode backtrack_op; const jschar *cp; /* index in text of match at backtrack */ size_t parenIndex; /* start index of saved paren contents */ size_t parenCount; /* # of saved paren contents */ size_t saveStateStackTop; /* number of parent states */ /* saved parent states follow */ /* saved paren contents follow */ } REBackTrackData; #define INITIAL_STATESTACK 100 #define INITIAL_BACKTRACK 8000 typedef struct REGlobalData { JSContext *cx; JSRegExp *regexp; /* the RE in execution */ JSBool ok; /* runtime error (out_of_memory only?) */ size_t start; /* offset to start at */ ptrdiff_t skipped; /* chars skipped anchoring this r.e. 
*/ const jschar *cpbegin; /* text base address */ const jschar *cpend; /* text limit address */ REProgState *stateStack; /* stack of state of current parents */ size_t stateStackTop; size_t stateStackLimit; REBackTrackData *backTrackStack;/* stack of matched-so-far positions */ REBackTrackData *backTrackSP; size_t backTrackStackSize; size_t cursz; /* size of current stack entry */ size_t backTrackCount; /* how many times we've backtracked */ size_t backTrackLimit; /* upper limit on backtrack states */ } REGlobalData; void JSRegExpStatics::clearRoots() { input = NULL; cx->runtime->gcPoke = JS_TRUE; } bool JSRegExpStatics::copy(const JSRegExpStatics& other) { clearRoots(); input = other.input; multiline = other.multiline; lastMatch = other.lastMatch; lastParen = other.lastParen; leftContext = other.leftContext; rightContext = other.rightContext; if (!parens.resize(other.parens.length())) return false; memcpy(parens.begin(), other.parens.begin(), sizeof(JSSubString) * parens.length()); return true; } void JSRegExpStatics::clear() { clearRoots(); multiline = false; lastMatch = lastParen = leftContext = rightContext = js_EmptySubString; parens.clear(); } /* * 1. If IgnoreCase is false, return ch. * 2. Let u be ch converted to upper case as if by calling * String.prototype.toUpperCase on the one-character string ch. * 3. If u does not consist of a single character, return ch. * 4. Let cu be u's character. * 5. If ch's code point value is greater than or equal to decimal 128 and cu's * code point value is less than decimal 128, then return ch. * 6. Return cu. */ static JS_ALWAYS_INLINE uintN upcase(uintN ch) { uintN cu; JS_ASSERT((uintN) (jschar) ch == ch); if (ch < 128) { if (ch - (uintN) 'a' <= (uintN) ('z' - 'a')) ch -= (uintN) ('a' - 'A'); return ch; } cu = JS_TOUPPER(ch); return (cu < 128) ? ch : cu; } /* * Return the 'canonical' inverse upcase of |ch|. 
That is the character * |lch| such that |upcase(lch) == ch| and (|lch| is the lower-case form * of |ch| or is |ch|). */ static inline jschar inverse_upcase(jschar ch) { jschar lch = JS_TOLOWER(ch); return (upcase(lch) == ch) ? lch : ch; } /* Construct and initialize an RENode, returning NULL for out-of-memory */ static RENode * NewRENode(CompilerState *state, REOp op) { JSContext *cx; RENode *ren; cx = state->context; JS_ARENA_ALLOCATE_CAST(ren, RENode *, &cx->tempPool, sizeof *ren); if (!ren) { js_ReportOutOfScriptQuota(cx); return NULL; } ren->op = op; ren->next = NULL; ren->kid = NULL; return ren; } /* * Validates and converts hex ascii value. */ static JSBool isASCIIHexDigit(jschar c, uintN *digit) { uintN cv = c; if (cv < '0') return JS_FALSE; if (cv <= '9') { *digit = cv - '0'; return JS_TRUE; } cv |= 0x20; if (cv >= 'a' && cv <= 'f') { *digit = cv - 'a' + 10; return JS_TRUE; } return JS_FALSE; } typedef struct { REOp op; const jschar *errPos; size_t parenIndex; } REOpData; static JSBool ReportRegExpErrorHelper(CompilerState *state, uintN flags, uintN errorNumber, const jschar *arg) { if (state->tokenStream) { return ReportCompileErrorNumber(state->context, state->tokenStream, NULL, JSREPORT_UC | flags, errorNumber, arg); } return JS_ReportErrorFlagsAndNumberUC(state->context, flags, js_GetErrorMessage, NULL, errorNumber, arg); } static JSBool ReportRegExpError(CompilerState *state, uintN flags, uintN errorNumber) { return ReportRegExpErrorHelper(state, flags, errorNumber, NULL); } /* * Process the op against the two top operands, reducing them to a single * operand in the penultimate slot. Update progLength and treeDepth. 
*/
/*
 * |operandSP| is the number of entries on |operandStack|; the operands used
 * are operandStack[operandSP - 2] and operandStack[operandSP - 1].  The result
 * is stored in slot operandSP - 2; the stack pointer itself is not changed
 * here, callers decrement it afterwards.
 *
 * Returns JS_TRUE on success.  Returns JS_FALSE after reporting an error when
 * a node cannot be allocated, when the tree depth limit is reached, or when
 * the operator is an open parenthesis/assertion that was never closed.
 *
 * NOTE(review): the angle-bracket placeholders in the bytecode-layout
 * comments below (<next>, <end>, <index>) were missing from the reviewed text
 * and are reconstructed from the byte counts added to progLength (opcodes are
 * 1 byte, offsets and chars ARG_LEN == 2 bytes) -- confirm against upstream.
 */
static JSBool
ProcessOp(CompilerState *state, REOpData *opData, RENode **operandStack,
          intN operandSP)
{
    RENode *result;

    switch (opData->op) {
      case REOP_ALT:
        result = NewRENode(state, REOP_ALT);
        if (!result)
            return JS_FALSE;
        result->kid = operandStack[operandSP - 2];
        result->u.kid2 = operandStack[operandSP - 1];
        operandStack[operandSP - 2] = result;

        if (state->treeDepth == TREE_DEPTH_MAX) {
            ReportRegExpError(state, JSREPORT_ERROR,
                              JSMSG_REGEXP_TOO_COMPLEX);
            return JS_FALSE;
        }
        ++state->treeDepth;

        /*
         * Look at both alternates to see if there's a FLAT or a CLASS at
         * the start of each. If so, use a prerequisite match.
         * The prerequisite forms are only chosen when JSREG_FOLD is not set.
         * u.altprereq.kid2 is the first member of its struct, as u.kid2 is of
         * the union, so storing ch1/ch2 below is presumably meant to leave
         * the second operand pointer intact.
         */
        if (((RENode *) result->kid)->op == REOP_FLAT &&
            ((RENode *) result->u.kid2)->op == REOP_FLAT &&
            (state->flags & JSREG_FOLD) == 0) {
            result->op = REOP_ALTPREREQ;
            result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
            result->u.altprereq.ch2 = ((RENode *) result->u.kid2)->u.flat.chr;
            /* ALTPREREQ, <end>, uch1, uch2, <next>, ...,
                                            JUMP, <end> ... ENDALT */
            state->progLength += 13;
        }
        else
        if (((RENode *) result->kid)->op == REOP_CLASS &&
            ((RENode *) result->kid)->u.ucclass.index < 256 &&
            ((RENode *) result->u.kid2)->op == REOP_FLAT &&
            (state->flags & JSREG_FOLD) == 0) {
            result->op = REOP_ALTPREREQ2;
            result->u.altprereq.ch1 = ((RENode *) result->u.kid2)->u.flat.chr;
            result->u.altprereq.ch2 = jschar(((RENode *) result->kid)->u.ucclass.index);
            /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
                                              JUMP, <end> ... ENDALT */
            state->progLength += 13;
        }
        else
        if (((RENode *) result->kid)->op == REOP_FLAT &&
            ((RENode *) result->u.kid2)->op == REOP_CLASS &&
            ((RENode *) result->u.kid2)->u.ucclass.index < 256 &&
            (state->flags & JSREG_FOLD) == 0) {
            result->op = REOP_ALTPREREQ2;
            result->u.altprereq.ch1 = ((RENode *) result->kid)->u.flat.chr;
            result->u.altprereq.ch2 = jschar(((RENode *) result->u.kid2)->u.ucclass.index);
            /* ALTPREREQ2, <end>, uch1, uch2, <next>, ...,
                                              JUMP, <end> ... ENDALT */
            state->progLength += 13;
        }
        else {
            /* ALT, <next>, ..., JUMP, <end> ... ENDALT */
            state->progLength += 7;
        }
        break;

      case REOP_CONCAT:
        /* Append the top operand to the end of the penultimate one's chain. */
        result = operandStack[operandSP - 2];
        while (result->next)
            result = result->next;
        result->next = operandStack[operandSP - 1];
        break;

      case REOP_ASSERT:
      case REOP_ASSERT_NOT:
      case REOP_LPARENNON:
      case REOP_LPAREN:
        /* These should have been processed by a close paren. */
        ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_MISSING_PAREN,
                                opData->errPos);
        return JS_FALSE;

      default:;
    }
    return JS_TRUE;
}

/*
 * Parser forward declarations.
 */
static JSBool ParseTerm(CompilerState *state);
static JSBool ParseQuantifier(CompilerState *state);
static intN ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues);

/*
 * Top-down regular expression grammar, based closely on Perl4.
 *
 *  regexp:     altern                  A regular expression is one or more
 *              altern '|' regexp       alternatives separated by vertical bar.
 */
#define INITIAL_STACK_SIZE  128

/*
 * Parse the text from state->cp to state->cpend into a tree of RENodes and
 * store its root in state->result.
 *
 * The parser keeps two heap-allocated stacks, one of operators (REOpData) and
 * one of operands (RENode *), each starting at INITIAL_STACK_SIZE entries and
 * doubled when full.  Every pass of the main loop first produces an operand
 * (first switch) and then looks at what follows it to decide on an operator
 * (second switch).  Both stacks are freed at |out| on every path that
 * allocated them.
 *
 * Returns JS_TRUE on success, JS_FALSE on failure.
 */
static JSBool
ParseRegExp(CompilerState *state)
{
    size_t parenIndex;
    RENode *operand;
    REOpData *operatorStack;
    RENode **operandStack;
    REOp op;
    intN i;
    JSBool result = JS_FALSE;

    intN operatorSP = 0, operatorStackSize = INITIAL_STACK_SIZE;
    intN operandSP = 0, operandStackSize = INITIAL_STACK_SIZE;

    /* Watch out for empty regexp */
    if (state->cp == state->cpend) {
        state->result = NewRENode(state, REOP_EMPTY);
        return (state->result != NULL);
    }

    operatorStack = (REOpData *)
        state->context->malloc(sizeof(REOpData) * operatorStackSize);
    if (!operatorStack)
        return JS_FALSE;

    /* On failure operandStack is NULL here, so |out| frees only the other. */
    operandStack = (RENode **)
        state->context->malloc(sizeof(RENode *) * operandStackSize);
    if (!operandStack)
        goto out;

    for (;;) {
        /* Index a capturing '(' seen in this pass would get. */
        parenIndex = state->parenCount;
        if (state->cp == state->cpend) {
            /*
             * If we are at the end of the regexp and we're short one or more
             * operands, the regexp must have the form /x|/ or some such, with
             * left parentheses making us short more than one operand.
             */
            if (operatorSP >= operandSP) {
                operand = NewRENode(state, REOP_EMPTY);
                if (!operand)
                    goto out;
                goto pushOperand;
            }
        } else {
            switch (*state->cp) {
              case '(':
                ++state->cp;
                if (state->cp + 1 < state->cpend &&
                    *state->cp == '?' &&
                    (state->cp[1] == '=' ||
                     state->cp[1] == '!' ||
                     state->cp[1] == ':')) {
                    switch (state->cp[1]) {
                      case '=':
                        op = REOP_ASSERT;
                        /* ASSERT, <next>, ... ASSERTTEST */
                        state->progLength += 4;
                        break;
                      case '!':
                        op = REOP_ASSERT_NOT;
                        /* ASSERTNOT, <next>, ... ASSERTNOTTEST */
                        state->progLength += 4;
                        break;
                      default:
                        op = REOP_LPARENNON;
                        break;
                    }
                    state->cp += 2;
                } else {
                    op = REOP_LPAREN;
                    /* LPAREN, <index>, ... RPAREN, <index> */
                    state->progLength
                        += 2 * (1 + GetCompactIndexWidth(parenIndex));
                    state->parenCount++;
                    if (state->parenCount == 65535) {
                        ReportRegExpError(state, JSREPORT_ERROR,
                                          JSMSG_TOO_MANY_PARENS);
                        goto out;
                    }
                }
                /* An open paren is an operator; no operand is produced. */
                goto pushOperator;

              case ')':
                /*
                 * If there's no stacked open parenthesis, throw syntax error.
                 */
                for (i = operatorSP - 1; ; i--) {
                    if (i < 0) {
                        ReportRegExpError(state, JSREPORT_ERROR,
                                          JSMSG_UNMATCHED_RIGHT_PAREN);
                        goto out;
                    }
                    if (operatorStack[i].op == REOP_ASSERT ||
                        operatorStack[i].op == REOP_ASSERT_NOT ||
                        operatorStack[i].op == REOP_LPARENNON ||
                        operatorStack[i].op == REOP_LPAREN) {
                        break;
                    }
                }
                /* FALL THROUGH */

              case '|':
                /* Expected an operand before these, so make an empty one */
                operand = NewRENode(state, REOP_EMPTY);
                if (!operand)
                    goto out;
                goto pushOperand;

              default:
                if (!ParseTerm(state))
                    goto out;
                operand = state->result;
pushOperand:
                /* Double the operand stack when it is full. */
                if (operandSP == operandStackSize) {
                    RENode **tmp;
                    operandStackSize += operandStackSize;
                    tmp = (RENode **)
                        state->context->realloc(operandStack,
                                                sizeof(RENode *) * operandStackSize);
                    if (!tmp)
                        goto out;
                    operandStack = tmp;
                }
                operandStack[operandSP++] = operand;
                break;
            }
        }

        /* At the end; process remaining operators. */
restartOperator:
        if (state->cp == state->cpend) {
            while (operatorSP) {
                --operatorSP;
                if (!ProcessOp(state, &operatorStack[operatorSP],
                               operandStack, operandSP))
                    goto out;
                --operandSP;
            }
            JS_ASSERT(operandSP == 1);
            state->result = operandStack[0];
            result = JS_TRUE;
            goto out;
        }

        switch (*state->cp) {
          case '|':
            /* Process any stacked 'concat' operators */
            ++state->cp;
            while (operatorSP &&
                   operatorStack[operatorSP - 1].op == REOP_CONCAT) {
                --operatorSP;
                if (!ProcessOp(state, &operatorStack[operatorSP],
                               operandStack, operandSP)) {
                    goto out;
                }
                --operandSP;
            }
            op = REOP_ALT;
            goto pushOperator;

          case ')':
            /*
             * If there's no stacked open parenthesis, throw syntax error.
             */
            for (i = operatorSP - 1; ; i--) {
                if (i < 0) {
                    ReportRegExpError(state, JSREPORT_ERROR,
                                      JSMSG_UNMATCHED_RIGHT_PAREN);
                    goto out;
                }
                if (operatorStack[i].op == REOP_ASSERT ||
                    operatorStack[i].op == REOP_ASSERT_NOT ||
                    operatorStack[i].op == REOP_LPARENNON ||
                    operatorStack[i].op == REOP_LPAREN) {
                    break;
                }
            }
            ++state->cp;

            /* Process everything on the stack until the open parenthesis. */
            for (;;) {
                JS_ASSERT(operatorSP);
                --operatorSP;
                switch (operatorStack[operatorSP].op) {
                  case REOP_ASSERT:
                  case REOP_ASSERT_NOT:
                  case REOP_LPAREN:
                    /* Wrap the group's contents in a node of the same op. */
                    operand = NewRENode(state, operatorStack[operatorSP].op);
                    if (!operand)
                        goto out;
                    operand->u.parenIndex =
                        operatorStack[operatorSP].parenIndex;
                    JS_ASSERT(operandSP);
                    operand->kid = operandStack[operandSP - 1];
                    operandStack[operandSP - 1] = operand;
                    if (state->treeDepth == TREE_DEPTH_MAX) {
                        ReportRegExpError(state, JSREPORT_ERROR,
                                          JSMSG_REGEXP_TOO_COMPLEX);
                        goto out;
                    }
                    ++state->treeDepth;
                    /* FALL THROUGH */

                  case REOP_LPARENNON:
                    /* The closed group may itself be quantified. */
                    state->result = operandStack[operandSP - 1];
                    if (!ParseQuantifier(state))
                        goto out;
                    operandStack[operandSP - 1] = state->result;
                    goto restartOperator;
                  default:
                    if (!ProcessOp(state, &operatorStack[operatorSP],
                                   operandStack, operandSP))
                        goto out;
                    --operandSP;
                    break;
                }
            }
            break;

          case '{':
          {
            const jschar *errp = state->cp;

            if (ParseMinMaxQuantifier(state, JS_TRUE) < 0) {
                /*
                 * This didn't even scan correctly as a quantifier, so we should
                 * treat it as flat.
                 */
                op = REOP_CONCAT;
                goto pushOperator;
            }

            /* Restore the position so the error below points at the '{'. */
            state->cp = errp;
            /* FALL THROUGH */
          }

          case '+':
          case '*':
          case '?':
            /* A quantifier in operator position has nothing to apply to. */
            ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_BAD_QUANTIFIER,
                                    state->cp);
            result = JS_FALSE;
            goto out;

          default:
            /* Anything else is the start of the next term. */
            op = REOP_CONCAT;
pushOperator:
            /* Double the operator stack when it is full. */
            if (operatorSP == operatorStackSize) {
                REOpData *tmp;
                operatorStackSize += operatorStackSize;
                tmp = (REOpData *)
                    state->context->realloc(operatorStack,
                                            sizeof(REOpData) * operatorStackSize);
                if (!tmp)
                    goto out;
                operatorStack = tmp;
            }
            operatorStack[operatorSP].op = op;
            operatorStack[operatorSP].errPos = state->cp;
            operatorStack[operatorSP++].parenIndex = parenIndex;
            break;
        }
    }
out:
    if (operatorStack)
        state->context->free(operatorStack);
    if (operandStack)
        state->context->free(operandStack);
    return result;
}

/*
 * Hack two bits in CompilerState.flags, for use within FindParenCount to flag
 * its being on the stack, and to propagate errors to its callers.
 */
#define JSREG_FIND_PAREN_COUNT  0x8000
#define JSREG_FIND_PAREN_ERROR  0x4000

/*
 * Magic return value from FindParenCount and GetDecimalValue, to indicate
 * overflow beyond GetDecimalValue's max parameter, or a computed maximum if
 * its findMax parameter is non-null.
 */
#define OVERFLOW_VALUE          ((uintN)-1)

/*
 * Count the capturing parentheses of the whole regexp by re-parsing it from
 * cpbegin on a copy of |state|.
 *
 * Returns the count, or OVERFLOW_VALUE if this function is already active
 * (JSREG_FIND_PAREN_COUNT is set in state->flags) or if the re-parse fails;
 * in the latter case JSREG_FIND_PAREN_ERROR is also set in state->flags.
 */
static uintN
FindParenCount(CompilerState *state)
{
    CompilerState temp;
    int i;

    if (state->flags & JSREG_FIND_PAREN_COUNT)
        return OVERFLOW_VALUE;

    /*
     * Copy state into temp, flag it so we never report an invalid backref,
     * and reset its members to parse the entire regexp.  This is obviously
     * suboptimal, but GetDecimalValue calls us only if a backref appears to
     * refer to a forward parenthetical, which is rare.
     */
    temp = *state;
    temp.flags |= JSREG_FIND_PAREN_COUNT;
    temp.cp = temp.cpbegin;
    temp.parenCount = 0;
    temp.classCount = 0;
    temp.progLength = 0;
    temp.treeDepth = 0;
    temp.classBitmapsMem = 0;
    for (i = 0; i < CLASS_CACHE_SIZE; i++)
        temp.classCache[i].start = NULL;

    if (!ParseRegExp(&temp)) {
        state->flags |= JSREG_FIND_PAREN_ERROR;
        return OVERFLOW_VALUE;
    }
    return temp.parenCount;
}

/*
 * Extract and return a decimal value at state->cp.  The initial character c
 * has already been read.  Return OVERFLOW_VALUE if the result exceeds max.
* Callers who pass a non-null findMax should test JSREG_FIND_PAREN_ERROR in
 * state->flags to discover whether an error occurred under findMax.
 */
static uintN
GetDecimalValue(jschar c, uintN max, uintN (*findMax)(CompilerState *state),
                CompilerState *state)
{
    uintN value = JS7_UNDEC(c);
    /* findMax is consulted only once the value already exceeds max. */
    JSBool overflow = (value > max && (!findMax || value > findMax(state)));

    /* The following restriction allows simpler overflow checks. */
    JS_ASSERT(max <= ((uintN)-1 - 9) / 10);
    while (state->cp < state->cpend) {
        c = *state->cp;
        if (!JS7_ISDEC(c))
            break;
        value = 10 * value + JS7_UNDEC(c);
        if (!overflow && value > max && (!findMax || value > findMax(state)))
            overflow = JS_TRUE;
        /* Digits keep being consumed even after overflow is detected. */
        ++state->cp;
    }
    return overflow ? OVERFLOW_VALUE : value;
}

/*
 * Calculate the total size of the bitmap required for a class expression.
 *
 * |src| .. |end| is the text between the brackets of a character class.  On
 * success target->u.ucclass.bmsize holds the largest character code the class
 * can name and target->u.ucclass.sense is JS_FALSE if the class starts with
 * '^', JS_TRUE otherwise.  Returns JS_FALSE after reporting
 * JSMSG_BAD_CLASS_RANGE for a reversed range or a class escape used as a
 * range end point.
 */
static JSBool
CalculateBitmapSize(CompilerState *state, RENode *target, const jschar *src,
                    const jschar *end)
{
    uintN max = 0;
    JSBool inRange = JS_FALSE;
    jschar c, rangeStart = 0;
    uintN n, digit, nDigits, i;

    target->u.ucclass.bmsize = 0;
    target->u.ucclass.sense = JS_TRUE;

    if (src == end)
        return JS_TRUE;

    if (*src == '^') {
        ++src;
        target->u.ucclass.sense = JS_FALSE;
    }

    while (src != end) {
        JSBool canStartRange = JS_TRUE;
        /* Largest character code contributed by the current class atom. */
        jschar localMax = 0;

        switch (*src) {
          case '\\':
            ++src;
            c = *src++;
            switch (c) {
              case 'b':
                localMax = 0x8;
                break;
              case 'f':
                localMax = 0xC;
                break;
              case 'n':
                localMax = 0xA;
                break;
              case 'r':
                localMax = 0xD;
                break;
              case 't':
                localMax = 0x9;
                break;
              case 'v':
                localMax = 0xB;
                break;
              case 'c':
                if (src < end && RE_IS_LETTER(*src)) {
                    localMax = (uintN) (*src++) & 0x1F;
                } else {
                    /* Not a control letter: rescan the 'c' as its own atom. */
                    --src;
                    localMax = '\\';
                }
                break;
              case 'x':
                nDigits = 2;
                goto lexHex;
              case 'u':
                nDigits = 4;
lexHex:
                n = 0;
                for (i = 0; (i < nDigits) && (src < end); i++) {
                    c = *src++;
                    if (!isASCIIHexDigit(c, &digit)) {
                        /*
                         * Back off to accepting the original
                         *'\' as a literal.
                         */
                        src -= i + 1;
                        n = '\\';
                        break;
                    }
                    n = (n << 4) | digit;
                }
                localMax = jschar(n);
                break;
              case 'd':
                canStartRange = JS_FALSE;
                if (inRange) {
                    JS_ReportErrorNumber(state->context,
                                         js_GetErrorMessage, NULL,
                                         JSMSG_BAD_CLASS_RANGE);
                    return JS_FALSE;
                }
                localMax = '9';
                break;
              case 'D':
              case 's':
              case 'S':
              case 'w':
              case 'W':
                canStartRange = JS_FALSE;
                if (inRange) {
                    JS_ReportErrorNumber(state->context,
                                         js_GetErrorMessage, NULL,
                                         JSMSG_BAD_CLASS_RANGE);
                    return JS_FALSE;
                }
                /* These escapes force the maximum possible bitmap size. */
                max = 65535;

                /*
                 * If this is the start of a range, ensure that it's less than
                 * the end.
                 */
                localMax = 0;
                break;
              case '0':
              case '1':
              case '2':
              case '3':
              case '4':
              case '5':
              case '6':
              case '7':
                /*
                 *  This is a non-ECMA extension - decimal escapes (in this
                 *  case, octal!) are supposed to be an error inside class
                 *  ranges, but supported here for backwards compatibility.
                 *
                 */
                n = JS7_UNDEC(c);
                c = *src;
                if ('0' <= c && c <= '7') {
                    src++;
                    n = 8 * n + JS7_UNDEC(c);
                    c = *src;
                    if ('0' <= c && c <= '7') {
                        src++;
                        i = 8 * n + JS7_UNDEC(c);
                        /* Take the third digit only if the value fits 0377. */
                        if (i <= 0377)
                            n = i;
                        else
                            src--;
                    }
                }
                localMax = jschar(n);
                break;

              default:
                localMax = c;
                break;
            }
            break;
          default:
            localMax = *src++;
            break;
        }

        if (inRange) {
            /* Throw a SyntaxError here, per ECMA-262, 15.10.2.15. */
            if (rangeStart > localMax) {
                JS_ReportErrorNumber(state->context,
                                     js_GetErrorMessage, NULL,
                                     JSMSG_BAD_CLASS_RANGE);
                return JS_FALSE;
            }
            inRange = JS_FALSE;
        } else {
            /* A '-' that is the last character of the class is a literal. */
            if (canStartRange && src < end - 1) {
                if (*src == '-') {
                    ++src;
                    inRange = JS_TRUE;
                    rangeStart = (jschar)localMax;
                    continue;
                }
            }
            if (state->flags & JSREG_FOLD)
                rangeStart = localMax;   /* one run of the uc/dc loop below */
        }

        if (state->flags & JSREG_FOLD) {
            /*
             * When folding case, the upper- and lower-case forms of every
             * character in the range also count towards the maximum.
             */
            jschar maxch = localMax;

            for (i = rangeStart; i <= localMax; i++) {
                jschar uch, dch;

                uch = jschar(upcase(i));
                dch = inverse_upcase(jschar(i));
                maxch = JS_MAX(maxch, uch);
                maxch = JS_MAX(maxch, dch);
            }
            localMax = maxch;
        }

        if (localMax > max)
            max = uintN(localMax);
    }
    target->u.ucclass.bmsize = uint16(max);
    return JS_TRUE;
}

/*
 *  item:       assertion               An item is either an assertion or
 *              quantatom               a quantified atom.
 *
 *  assertion:  '^'                     Assertions match beginning of string
 *                                      (or line if the class static property
 *                                      RegExp.multiline is true).
 *              '$'                     End of string (or line if the class
 *                                      static property RegExp.multiline is
 *                                      true).
 *              '\b'                    Word boundary (between \w and \W).
 *              '\B'                    Word non-boundary.
 *
 *  quantatom:  atom                    An unquantified atom.
 *              quantatom '{' n ',' m '}'
 *                                      Atom must occur between n and m times.
 *              quantatom '{' n ',' '}' Atom must occur at least n times.
 *              quantatom '{' n '}'     Atom must occur exactly n times.
 *              quantatom '*'           Zero or more times (same as {0,}).
 *              quantatom '+'           One or more times (same as {1,}).
 *              quantatom '?'           Zero or one time (same as {0,1}).
 *
 *              any of which can be optionally followed by '?' for ungreedy
 *
 *  atom:       '(' regexp ')'          A parenthesized regexp (what matched
 *                                      can be addressed using a backreference,
 *                                      see '\' n below).
 *              '.'                     Matches any char except '\n'.
 *              '[' classlist ']'       A character class.
 *              '[' '^' classlist ']'   A negated character class.
 *              '\f'                    Form Feed.
 *              '\n'                    Newline (Line Feed).
 *              '\r'                    Carriage Return.
 *              '\t'                    Horizontal Tab.
 *              '\v'                    Vertical Tab.
 *              '\d'                    A digit (same as [0-9]).
 *              '\D'                    A non-digit.
* '\w' A word character, [0-9a-z_A-Z]. * '\W' A non-word character. * '\s' A whitespace character, [ \b\f\n\r\t\v]. * '\S' A non-whitespace character. * '\' n A backreference to the nth (n decimal * and positive) parenthesized expression. * '\' octal An octal escape sequence (octal must be * two or three digits long, unless it is * 0 for the null character). * '\x' hex A hex escape (hex must be two digits). * '\u' unicode A unicode escape (must be four digits). * '\c' ctrl A control character, ctrl is a letter. * '\' literalatomchar Any character except one of the above * that follow '\' in an atom. * otheratomchar Any character not first among the other * atom right-hand sides. */ static JSBool ParseTerm(CompilerState *state) { jschar c = *state->cp++; uintN nDigits; uintN num, tmp, n, i; const jschar *termStart; switch (c) { /* assertions and atoms */ case '^': state->result = NewRENode(state, REOP_BOL); if (!state->result) return JS_FALSE; state->progLength++; return JS_TRUE; case '$': state->result = NewRENode(state, REOP_EOL); if (!state->result) return JS_FALSE; state->progLength++; return JS_TRUE; case '\\': if (state->cp >= state->cpend) { /* a trailing '\' is an error */ ReportRegExpError(state, JSREPORT_ERROR, JSMSG_TRAILING_SLASH); return JS_FALSE; } c = *state->cp++; switch (c) { /* assertion escapes */ case 'b' : state->result = NewRENode(state, REOP_WBDRY); if (!state->result) return JS_FALSE; state->progLength++; return JS_TRUE; case 'B': state->result = NewRENode(state, REOP_WNONBDRY); if (!state->result) return JS_FALSE; state->progLength++; return JS_TRUE; /* Decimal escape */ case '0': /* Give a strict warning. See also the note below. 
*/ if (!ReportRegExpError(state, JSREPORT_WARNING | JSREPORT_STRICT, JSMSG_INVALID_BACKREF)) { return JS_FALSE; } doOctal: num = 0; while (state->cp < state->cpend) { c = *state->cp; if (c < '0' || '7' < c) break; state->cp++; tmp = 8 * num + (uintN)JS7_UNDEC(c); if (tmp > 0377) break; num = tmp; } c = (jschar)num; doFlat: state->result = NewRENode(state, REOP_FLAT); if (!state->result) return JS_FALSE; state->result->u.flat.chr = c; state->result->u.flat.length = 1; state->progLength += 3; break; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': termStart = state->cp - 1; num = GetDecimalValue(c, state->parenCount, FindParenCount, state); if (state->flags & JSREG_FIND_PAREN_ERROR) return JS_FALSE; if (num == OVERFLOW_VALUE) { /* Give a strict mode warning. */ if (!ReportRegExpError(state, JSREPORT_WARNING | JSREPORT_STRICT, (c >= '8') ? JSMSG_INVALID_BACKREF : JSMSG_BAD_BACKREF)) { return JS_FALSE; } /* * Note: ECMA 262, 15.10.2.9 says that we should throw a syntax * error here. However, for compatibility with IE, we treat the * whole backref as flat if the first character in it is not a * valid octal character, and as an octal escape otherwise. */ state->cp = termStart; if (c >= '8') { /* Treat this as flat. termStart - 1 is the \. */ c = '\\'; goto asFlat; } /* Treat this as an octal escape. */ goto doOctal; } /* * When FindParenCount calls the regex parser recursively (to find * the number of backrefs) num can be arbitrary and the maximum * supported number of backrefs does not bound it. 
*/ JS_ASSERT_IF(!(state->flags & JSREG_FIND_PAREN_COUNT), 1 <= num && num <= 0x10000); state->result = NewRENode(state, REOP_BACKREF); if (!state->result) return JS_FALSE; state->result->u.parenIndex = num - 1; state->progLength += 1 + GetCompactIndexWidth(state->result->u.parenIndex); break; /* Control escape */ case 'f': c = 0xC; goto doFlat; case 'n': c = 0xA; goto doFlat; case 'r': c = 0xD; goto doFlat; case 't': c = 0x9; goto doFlat; case 'v': c = 0xB; goto doFlat; /* Control letter */ case 'c': if (state->cp < state->cpend && RE_IS_LETTER(*state->cp)) { c = (jschar) (*state->cp++ & 0x1F); } else { /* back off to accepting the original '\' as a literal */ --state->cp; c = '\\'; } goto doFlat; /* HexEscapeSequence */ case 'x': nDigits = 2; goto lexHex; /* UnicodeEscapeSequence */ case 'u': nDigits = 4; lexHex: n = 0; for (i = 0; i < nDigits && state->cp < state->cpend; i++) { uintN digit; c = *state->cp++; if (!isASCIIHexDigit(c, &digit)) { /* * Back off to accepting the original 'u' or 'x' as a * literal. 
*/ state->cp -= i + 2; n = *state->cp++; break; } n = (n << 4) | digit; } c = (jschar) n; goto doFlat; /* Character class escapes */ case 'd': state->result = NewRENode(state, REOP_DIGIT); doSimple: if (!state->result) return JS_FALSE; state->progLength++; break; case 'D': state->result = NewRENode(state, REOP_NONDIGIT); goto doSimple; case 's': state->result = NewRENode(state, REOP_SPACE); goto doSimple; case 'S': state->result = NewRENode(state, REOP_NONSPACE); goto doSimple; case 'w': state->result = NewRENode(state, REOP_ALNUM); goto doSimple; case 'W': state->result = NewRENode(state, REOP_NONALNUM); goto doSimple; /* IdentityEscape */ default: state->result = NewRENode(state, REOP_FLAT); if (!state->result) return JS_FALSE; state->result->u.flat.chr = c; state->result->u.flat.length = 1; state->result->kid = (void *) (state->cp - 1); state->progLength += 3; break; } break; case '[': state->result = NewRENode(state, REOP_CLASS); if (!state->result) return JS_FALSE; termStart = state->cp; state->result->u.ucclass.startIndex = termStart - state->cpbegin; for (;;) { if (state->cp == state->cpend) { ReportRegExpErrorHelper(state, JSREPORT_ERROR, JSMSG_UNTERM_CLASS, termStart); return JS_FALSE; } if (*state->cp == '\\') { state->cp++; if (state->cp != state->cpend) state->cp++; continue; } if (*state->cp == ']') { state->result->u.ucclass.kidlen = state->cp - termStart; break; } state->cp++; } for (i = 0; i < CLASS_CACHE_SIZE; i++) { if (!state->classCache[i].start) { state->classCache[i].start = termStart; state->classCache[i].length = state->result->u.ucclass.kidlen; state->classCache[i].index = state->classCount; break; } if (state->classCache[i].length == state->result->u.ucclass.kidlen) { for (n = 0; ; n++) { if (n == state->classCache[i].length) { state->result->u.ucclass.index = state->classCache[i].index; goto claim; } if (state->classCache[i].start[n] != termStart[n]) break; } } } state->result->u.ucclass.index = state->classCount++; claim: /* * Call 
CalculateBitmapSize now as we want any errors it finds
         * to be reported during the parse phase, not at execution.
         */
        if (!CalculateBitmapSize(state, state->result, termStart, state->cp++))
            return JS_FALSE;
        /*
         * Update classBitmapsMem with number of bytes to hold bmsize bits,
         * which is (bitsCount + 7) / 8 or (highest_bit + 1 + 7) / 8
         * or highest_bit / 8 + 1 where highest_bit is u.ucclass.bmsize.
         */
        n = (state->result->u.ucclass.bmsize >> 3) + 1;
        if (n > CLASS_BITMAPS_MEM_LIMIT - state->classBitmapsMem) {
            ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
            return JS_FALSE;
        }
        state->classBitmapsMem += n;
        /* CLASS, <index> */
        state->progLength
            += 1 + GetCompactIndexWidth(state->result->u.ucclass.index);
        break;

      case '.':
        state->result = NewRENode(state, REOP_DOT);
        goto doSimple;

      case '{':
      {
        /*
         * Probe for a well-formed {min,max} quantifier with nothing to apply
         * it to.  cp is restored afterwards in either case; a negative result
         * means the text is not a quantifier, so '{' is taken literally.
         */
        const jschar *errp = state->cp--;
        intN err;

        err = ParseMinMaxQuantifier(state, JS_TRUE);
        state->cp = errp;

        if (err < 0)
            goto asFlat;

        /* FALL THROUGH */
      }
      case '*':
      case '+':
      case '?':
        ReportRegExpErrorHelper(state, JSREPORT_ERROR,
                                JSMSG_BAD_QUANTIFIER, state->cp - 1);
        return JS_FALSE;
      default:
asFlat:
        state->result = NewRENode(state, REOP_FLAT);
        if (!state->result)
            return JS_FALSE;
        state->result->u.flat.chr = c;
        state->result->u.flat.length = 1;
        state->result->kid = (void *) (state->cp - 1);
        state->progLength += 3;
        break;
    }
    return ParseQuantifier(state);
}

/*
 * Parse an optional quantifier following the term held in state->result.
 *
 * If the next character is '+', '*', '?' or starts a valid {min,max} form, a
 * REOP_QUANT node is created, the term becomes its kid, and the node replaces
 * state->result.  A '?' directly after the quantifier makes it non-greedy.
 * If no quantifier follows, state->result is left alone.
 *
 * Returns JS_TRUE on success (with or without a quantifier).  Returns
 * JS_FALSE if node allocation fails, if the {min,max} form is malformed (an
 * error is reported), or if state->treeDepth has reached TREE_DEPTH_MAX.
 */
static JSBool
ParseQuantifier(CompilerState *state)
{
    RENode *term;
    term = state->result;
    if (state->cp < state->cpend) {
        switch (*state->cp) {
          case '+':
            state->result = NewRENode(state, REOP_QUANT);
            if (!state->result)
                return JS_FALSE;
            state->result->u.range.min = 1;
            state->result->u.range.max = (uintN)-1;
            /* , ... */
            state->progLength += 4;
            goto quantifier;
          case '*':
            state->result = NewRENode(state, REOP_QUANT);
            if (!state->result)
                return JS_FALSE;
            state->result->u.range.min = 0;
            state->result->u.range.max = (uintN)-1;
            /* , ...
*/
            state->progLength += 4;
            goto quantifier;
          case '?':
            state->result = NewRENode(state, REOP_QUANT);
            if (!state->result)
                return JS_FALSE;
            state->result->u.range.min = 0;
            state->result->u.range.max = 1;
            /* , ... */
            state->progLength += 4;
            goto quantifier;
          case '{':       /* balance '}' */
          {
            intN err;
            const jschar *errp = state->cp;

            err = ParseMinMaxQuantifier(state, JS_FALSE);
            /* 0: a QUANT node was built and cp is on the closing '}'. */
            if (err == 0)
                goto quantifier;
            /* -1: not a quantifier after all; cp was restored. */
            if (err == -1)
                return JS_TRUE;

            /* Anything else is a JSMSG_* error number to report. */
            ReportRegExpErrorHelper(state, JSREPORT_ERROR, err, errp);
            return JS_FALSE;
          }
          default:;
        }
    }
    return JS_TRUE;

quantifier:
    /* Each quantifier adds one level of nesting for the emitter's stack. */
    if (state->treeDepth == TREE_DEPTH_MAX) {
        ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
        return JS_FALSE;
    }

    ++state->treeDepth;
    /* Step over the quantifier's last character ('+', '*', '?' or '}'). */
    ++state->cp;
    state->result->kid = term;
    if (state->cp < state->cpend && *state->cp == '?') {
        ++state->cp;
        state->result->u.range.greedy = JS_FALSE;
    } else {
        state->result->u.range.greedy = JS_TRUE;
    }
    return JS_TRUE;
}

/*
 * Parse a {min}, {min,} or {min,max} quantifier.  On entry state->cp points
 * at the '{'.
 *
 * Returns:
 *    0  on success: state->result is a new REOP_QUANT node carrying min and
 *       max (max is (uintN)-1 when no upper bound was given), progLength has
 *       been updated, and state->cp is left on the closing '}'.
 *   -1  if the text is not a well-formed quantifier: state->cp is reset to
 *       the '{' so the caller can treat it as a literal.
 *   a JSMSG_* error number otherwise (bound too big, bounds out of order, or
 *       node allocation failure).
 *
 * When ignoreValues is true the overflow and ordering checks are skipped, so
 * only the syntax is validated.
 */
static intN
ParseMinMaxQuantifier(CompilerState *state, JSBool ignoreValues)
{
    uintN min, max;
    jschar c;
    /* Remember the '{' so cp can be restored on a syntax mismatch. */
    const jschar *errp = state->cp++;

    c = *state->cp;
    if (JS7_ISDEC(c)) {
        ++state->cp;
        min = GetDecimalValue(c, 0xFFFF, NULL, state);
        c = *state->cp;

        if (!ignoreValues && min == OVERFLOW_VALUE)
            return JSMSG_MIN_TOO_BIG;

        if (c == ',') {
            c = *++state->cp;
            if (JS7_ISDEC(c)) {
                ++state->cp;
                max = GetDecimalValue(c, 0xFFFF, NULL, state);
                c = *state->cp;
                if (!ignoreValues && max == OVERFLOW_VALUE)
                    return JSMSG_MAX_TOO_BIG;
                if (!ignoreValues && min > max)
                    return JSMSG_OUT_OF_ORDER;
            } else {
                /* "{min,}": no upper bound. */
                max = (uintN)-1;
            }
        } else {
            /* "{n}": exactly n repetitions. */
            max = min;
        }
        if (c == '}') {
            state->result = NewRENode(state, REOP_QUANT);
            if (!state->result)
                return JSMSG_OUT_OF_MEMORY;
            state->result->u.range.min = min;
            state->result->u.range.max = max;
            /*
             * QUANT, , , ...
             * where is written as compact(max+1) to make
             * (uintN)-1 sentinel to occupy 1 byte, not width_of(max)+1.
*/ state->progLength += (1 + GetCompactIndexWidth(min) + GetCompactIndexWidth(max + 1) +3); return 0; } } state->cp = errp; return -1; } static JSBool SetForwardJumpOffset(jsbytecode *jump, jsbytecode *target) { ptrdiff_t offset = target - jump; /* Check that target really points forward. */ JS_ASSERT(offset >= 2); if ((size_t)offset > OFFSET_MAX) return JS_FALSE; jump[0] = JUMP_OFFSET_HI(offset); jump[1] = JUMP_OFFSET_LO(offset); return JS_TRUE; } /* Copy the charset data from a character class node to the charset list * in the regexp object. */ static JS_ALWAYS_INLINE RECharSet * InitNodeCharSet(JSRegExp *re, RENode *node) { RECharSet *charSet = &re->classList[node->u.ucclass.index]; charSet->converted = JS_FALSE; charSet->length = node->u.ucclass.bmsize; charSet->u.src.startIndex = node->u.ucclass.startIndex; charSet->u.src.length = node->u.ucclass.kidlen; charSet->sense = node->u.ucclass.sense; return charSet; } /* * Generate bytecode for the tree rooted at t using an explicit stack instead * of recursion. 
*/
/*
 * Parameters:
 *   state      compiler state; supplies the context used for allocation and
 *              error reporting, the flags, and cpbegin for FLAT offsets.
 *   re         regexp being built; re->program is the start of the bytecode
 *              and re->classList receives character class descriptions.
 *   treeDepth  number of entries to allocate for the explicit stack; pushes
 *              are asserted never to exceed it.
 *   pc         where to start writing bytecode.
 *   t          root of the node tree to emit.
 *
 * Returns the address just past the last byte written.  Returns NULL if the
 * stack cannot be allocated, or if a forward jump does not fit in OFFSET_MAX
 * (in which case JSMSG_REGEXP_TOO_COMPLEX is reported).  The stack is freed
 * on every path that reaches the cleanup label.
 */
static jsbytecode *
EmitREBytecode(CompilerState *state, JSRegExp *re, size_t treeDepth,
               jsbytecode *pc, RENode *t)
{
    EmitStateStackEntry *emitStateSP, *emitStateStack;
    REOp op;

    if (treeDepth == 0) {
        emitStateStack = NULL;
    } else {
        emitStateStack =
            (EmitStateStackEntry *)
            state->context->malloc(sizeof(EmitStateStackEntry) * treeDepth);
        if (!emitStateStack)
            return NULL;
    }
    emitStateSP = emitStateStack;
    op = t->op;
    JS_ASSERT(op < REOP_LIMIT);

    for (;;) {
        /*
         * Write the opcode first; several cases below overwrite pc[-1] with a
         * more specific opcode or back it out again.
         */
        *pc++ = op;
        switch (op) {
          case REOP_EMPTY:
            /* Nothing to emit: take back the opcode byte. */
            --pc;
            break;

          case REOP_ALTPREREQ2:
          case REOP_ALTPREREQ:
            JS_ASSERT(emitStateSP);
            emitStateSP->altHead = pc - 1;
            emitStateSP->endTermFixup = pc;
            pc += OFFSET_LEN;
            SET_ARG(pc, t->u.altprereq.ch1);
            pc += ARG_LEN;
            SET_ARG(pc, t->u.altprereq.ch2);
            pc += ARG_LEN;

            emitStateSP->nextAltFixup = pc;    /* offset to next alternate */
            pc += OFFSET_LEN;

            /* Resume at this node with REOP_JUMP once the first kid is done. */
            emitStateSP->continueNode = t;
            emitStateSP->continueOp = REOP_JUMP;
            emitStateSP->jumpToJumpFlag = JS_FALSE;
            ++emitStateSP;
            JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
            t = (RENode *) t->kid;
            op = t->op;
            JS_ASSERT(op < REOP_LIMIT);
            continue;

          case REOP_JUMP:
            emitStateSP->nextTermFixup = pc;    /* offset to following term */
            pc += OFFSET_LEN;
            if (!SetForwardJumpOffset(emitStateSP->nextAltFixup, pc))
                goto jump_too_big;
            /* Re-push the same entry; the second kid ends with REOP_ENDALT. */
            emitStateSP->continueOp = REOP_ENDALT;
            ++emitStateSP;
            JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
            t = (RENode *) t->u.kid2;
            op = t->op;
            JS_ASSERT(op < REOP_LIMIT);
            continue;

          case REOP_ENDALT:
            /*
             * If we already patched emitStateSP->nextTermFixup to jump to
             * a nearer jump, to avoid 16-bit immediate offset overflow, we
             * are done here.
             */
            if (emitStateSP->jumpToJumpFlag)
                break;

            /*
             * Fix up the REOP_JUMP offset to go to the op after REOP_ENDALT.
             * REOP_ENDALT is executed only on successful match of the last
             * alternate in a group.
             */
            if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
                goto jump_too_big;
            if (t->op != REOP_ALT) {
                if (!SetForwardJumpOffset(emitStateSP->endTermFixup, pc))
                    goto jump_too_big;
            }

            /*
             * If the program is bigger than the REOP_JUMP offset range, then
             * we must check for alternates before this one that are part of
             * the same group, and fix up their jump offsets to target jumps
             * close enough to fit in a 16-bit unsigned offset immediate.
             */
            if ((size_t)(pc - re->program) > OFFSET_MAX &&
                emitStateSP > emitStateStack) {
                EmitStateStackEntry *esp, *esp2;
                jsbytecode *alt, *jump;
                ptrdiff_t span, header;

                esp2 = emitStateSP;
                alt = esp2->altHead;
                for (esp = esp2 - 1; esp >= emitStateStack; --esp) {
                    if (esp->continueOp == REOP_ENDALT &&
                        !esp->jumpToJumpFlag &&
                        esp->nextTermFixup + OFFSET_LEN == alt &&
                        (size_t)(pc - ((esp->continueNode->op != REOP_ALT)
                                       ? esp->endTermFixup
                                       : esp->nextTermFixup)) > OFFSET_MAX) {
                        alt = esp->altHead;
                        jump = esp->nextTermFixup;

                        /*
                         * The span must be 1 less than the distance from
                         * jump offset to jump offset, so we actually jump
                         * to a REOP_JUMP bytecode, not to its offset!
                         */
                        for (;;) {
                            JS_ASSERT(jump < esp2->nextTermFixup);
                            span = esp2->nextTermFixup - jump - 1;
                            if ((size_t)span <= OFFSET_MAX)
                                break;
                            /* Too far: retry with the nearest earlier ENDALT entry. */
                            do {
                                if (--esp2 == esp)
                                    goto jump_too_big;
                            } while (esp2->continueOp != REOP_ENDALT);
                        }

                        jump[0] = JUMP_OFFSET_HI(span);
                        jump[1] = JUMP_OFFSET_LO(span);

                        if (esp->continueNode->op != REOP_ALT) {
                            /*
                             * We must patch the offset at esp->endTermFixup
                             * as well, for the REOP_ALTPREREQ{,2} opcodes.
                             * If we're unlucky and endTermFixup is more than
                             * OFFSET_MAX bytes from its target, we cheat by
                             * jumping 6 bytes to the jump whose offset is at
                             * esp->nextTermFixup, which has the same target.
                             */
                            jump = esp->endTermFixup;
                            header = esp->nextTermFixup - jump;
                            span += header;
                            if ((size_t)span > OFFSET_MAX)
                                span = header;

                            jump[0] = JUMP_OFFSET_HI(span);
                            jump[1] = JUMP_OFFSET_LO(span);
                        }

                        esp->jumpToJumpFlag = JS_TRUE;
                    }
                }
            }
            break;

          case REOP_ALT:
            JS_ASSERT(emitStateSP);
            emitStateSP->altHead = pc - 1;
            emitStateSP->nextAltFixup = pc;     /* offset to next alternate */
            pc += OFFSET_LEN;
            emitStateSP->continueNode = t;
            emitStateSP->continueOp = REOP_JUMP;
            emitStateSP->jumpToJumpFlag = JS_FALSE;
            ++emitStateSP;
            JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
            t = (RENode *) t->kid;
            op = t->op;
            JS_ASSERT(op < REOP_LIMIT);
            continue;

          case REOP_FLAT:
            /*
             * Coalesce FLATs if possible and if it would not increase bytecode
             * beyond preallocated limit. The latter happens only when bytecode
             * size for coalesced string with offset p and length 2 exceeds 6
             * bytes preallocated for 2 single char nodes, i.e. when
             * 1 + GetCompactIndexWidth(p) + GetCompactIndexWidth(2) > 6 or
             * GetCompactIndexWidth(p) > 4.
             * Since when GetCompactIndexWidth(p) <= 4 coalescing of 3 or more
             * nodes strictly decreases bytecode size, the check has to be
             * done only for the first coalescing.
             */
            if (t->kid &&
                GetCompactIndexWidth((jschar *)t->kid - state->cpbegin) <= 4)
            {
                /* Merge following FLAT nodes whose text is adjacent in the source. */
                while (t->next &&
                       t->next->op == REOP_FLAT &&
                       (jschar*)t->kid + t->u.flat.length ==
                       (jschar*)t->next->kid) {
                    t->u.flat.length += t->next->u.flat.length;
                    t->next = t->next->next;
                }
            }
            if (t->kid && t->u.flat.length > 1) {
                /* Multi-character run: emit source offset and length. */
                pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLATi : REOP_FLAT;
                pc = WriteCompactIndex(pc, (jschar *)t->kid - state->cpbegin);
                pc = WriteCompactIndex(pc, t->u.flat.length);
            } else if (t->u.flat.chr < 256) {
                /* Single character that fits in one bytecode byte. */
                pc[-1] = (state->flags & JSREG_FOLD) ? REOP_FLAT1i : REOP_FLAT1;
                *pc++ = (jsbytecode) t->u.flat.chr;
            } else {
                pc[-1] = (state->flags & JSREG_FOLD)
                         ? REOP_UCFLAT1i
                         : REOP_UCFLAT1;
                SET_ARG(pc, t->u.flat.chr);
                pc += ARG_LEN;
            }
            break;

          case REOP_LPAREN:
            JS_ASSERT(emitStateSP);
            pc = WriteCompactIndex(pc, t->u.parenIndex);
            emitStateSP->continueNode = t;
            emitStateSP->continueOp = REOP_RPAREN;
            ++emitStateSP;
            JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
            t = (RENode *) t->kid;
            op = t->op;
            continue;

          case REOP_RPAREN:
            pc = WriteCompactIndex(pc, t->u.parenIndex);
            break;

          case REOP_BACKREF:
            pc = WriteCompactIndex(pc, t->u.parenIndex);
            break;

          case REOP_ASSERT:
            JS_ASSERT(emitStateSP);
            emitStateSP->nextTermFixup = pc;
            pc += OFFSET_LEN;
            emitStateSP->continueNode = t;
            emitStateSP->continueOp = REOP_ASSERTTEST;
            ++emitStateSP;
            JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
            t = (RENode *) t->kid;
            op = t->op;
            continue;

          case REOP_ASSERTTEST:
          case REOP_ASSERTNOTTEST:
            if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
                goto jump_too_big;
            break;

          case REOP_ASSERT_NOT:
            JS_ASSERT(emitStateSP);
            emitStateSP->nextTermFixup = pc;
            pc += OFFSET_LEN;
            emitStateSP->continueNode = t;
            emitStateSP->continueOp = REOP_ASSERTNOTTEST;
            ++emitStateSP;
            JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
            t = (RENode *) t->kid;
            op = t->op;
            continue;

          case REOP_QUANT:
            JS_ASSERT(emitStateSP);
            /* The common *, ? and + shapes get dedicated opcodes with no operands. */
            if (t->u.range.min == 0 && t->u.range.max == (uintN)-1) {
                pc[-1] = (t->u.range.greedy) ? REOP_STAR : REOP_MINIMALSTAR;
            } else if (t->u.range.min == 0 && t->u.range.max == 1) {
                pc[-1] = (t->u.range.greedy) ? REOP_OPT : REOP_MINIMALOPT;
            } else if (t->u.range.min == 1 && t->u.range.max == (uintN) -1) {
                pc[-1] = (t->u.range.greedy) ? REOP_PLUS : REOP_MINIMALPLUS;
            } else {
                if (!t->u.range.greedy)
                    pc[-1] = REOP_MINIMALQUANT;
                pc = WriteCompactIndex(pc, t->u.range.min);
                /*
                 * Write max + 1 to avoid using size_t(max) + 1 bytes
                 * for (uintN)-1 sentinel.
                 */
                pc = WriteCompactIndex(pc, t->u.range.max + 1);
            }
            emitStateSP->nextTermFixup = pc;
            pc += OFFSET_LEN;
            emitStateSP->continueNode = t;
            emitStateSP->continueOp = REOP_ENDCHILD;
            ++emitStateSP;
            JS_ASSERT((size_t)(emitStateSP - emitStateStack) <= treeDepth);
            t = (RENode *) t->kid;
            op = t->op;
            continue;

          case REOP_ENDCHILD:
            if (!SetForwardJumpOffset(emitStateSP->nextTermFixup, pc))
                goto jump_too_big;
            break;

          case REOP_CLASS:
            if (!t->u.ucclass.sense)
                pc[-1] = REOP_NCLASS;
            pc = WriteCompactIndex(pc, t->u.ucclass.index);
            InitNodeCharSet(re, t);
            break;

          default:
            break;
        }

        /*
         * Advance to the next sibling.  When a sibling list is exhausted, pop
         * the stack and resume the parent with its recorded continuation
         * opcode; an empty stack means the whole tree has been emitted.
         */
        t = t->next;
        if (t) {
            op = t->op;
        } else {
            if (emitStateSP == emitStateStack)
                break;
            --emitStateSP;
            t = emitStateSP->continueNode;
            op = (REOp) emitStateSP->continueOp;
        }
    }

  cleanup:
    if (emitStateStack)
        state->context->free(emitStateStack);
    return pc;

  jump_too_big:
    ReportRegExpError(state, JSREPORT_ERROR, JSMSG_REGEXP_TOO_COMPLEX);
    pc = NULL;
    goto cleanup;
}

/*
 * Initialise |state| for compiling |str| and build the node tree.
 *
 * state.cp/cpbegin/cpend are set from the characters returned by
 * js_UndependString, and all counters and the class cache are reset.  When
 * JSREG_FLAT is set and the string is non-empty, the whole string becomes a
 * single REOP_FLAT node without being parsed; otherwise the result comes
 * from ParseRegExp.  Returns JS_FALSE if the characters cannot be obtained,
 * if node allocation fails, or if ParseRegExp fails.
 */
static JSBool
CompileRegExpToAST(JSContext* cx, TokenStream* ts,
                   JSString* str, uintN flags, CompilerState& state)
{
    uintN i;
    size_t len;

    len = str->length();

    state.context = cx;
    state.tokenStream = ts;
    state.cp = js_UndependString(cx, str);
    if (!state.cp)
        return JS_FALSE;
    state.cpbegin = state.cp;
    state.cpend = state.cp + len;
    state.flags = uint16(flags);
    state.parenCount = 0;
    state.classCount = 0;
    state.progLength = 0;
    state.treeDepth = 0;
    state.classBitmapsMem = 0;
    /* A NULL start marks a class cache slot as unused. */
    for (i = 0; i < CLASS_CACHE_SIZE; i++)
        state.classCache[i].start = NULL;

    if (len != 0 && (flags & JSREG_FLAT)) {
        state.result = NewRENode(&state, REOP_FLAT);
        if (!state.result)
            return JS_FALSE;
        state.result->u.flat.chr = *state.cpbegin;
        state.result->u.flat.length = len;
        state.result->kid = (void *) state.cpbegin;
        /* Flat bytecode: REOP_FLAT compact(string_offset) compact(len).
*/ state.progLength += 1 + GetCompactIndexWidth(0) + GetCompactIndexWidth(len); return JS_TRUE; } return ParseRegExp(&state); } #ifdef JS_TRACER typedef js::Vector LInsList; namespace js { struct REFragment : public nanojit::Fragment { REFragment(const void* _ip verbose_only(, uint32_t profFragID)) : nanojit::Fragment(ip verbose_only(, profFragID)) {} }; } /* namespace js */ /* Return the cached fragment for the given regexp, or create one. */ static Fragment* LookupNativeRegExp(JSContext* cx, uint16 re_flags, const jschar* re_chars, size_t re_length) { TraceMonitor *tm = &JS_TRACE_MONITOR(cx); VMAllocator &alloc = *tm->dataAlloc; REHashMap &table = *tm->reFragments; REHashKey k(re_length, re_flags, re_chars); REFragment *frag = table.get(k); if (!frag) { verbose_only( uint32_t profFragID = (LogController.lcbits & LC_FragProfile) ? (++(tm->lastFragID)) : 0; ) frag = new (alloc) REFragment(0 verbose_only(, profFragID)); /* * Copy the re_chars portion of the hash key into the Allocator, so * its lifecycle is disconnected from the lifecycle of the * underlying regexp. */ k.re_chars = (const jschar*) new (alloc) jschar[re_length]; memcpy((void*) k.re_chars, re_chars, re_length * sizeof(jschar)); table.put(k, frag); } return frag; } static JSBool ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet); /* Utilities for the RegExpNativeCompiler */ namespace { /* * An efficient way to simultaneously statically guard that the sizeof(bool) is a * small power of 2 and take its log2. */ template struct StaticLog2 {}; template <> struct StaticLog2<1> { static const int result = 0; }; template <> struct StaticLog2<2> { static const int result = 1; }; template <> struct StaticLog2<4> { static const int result = 2; }; template <> struct StaticLog2<8> { static const int result = 3; }; } /* * This table allows efficient testing for the ASCII portion of \s during a * trace. 
ECMA-262 15.10.2.12 defines the following characters below 128 to be * whitespace: 0x9 (0), 0xA (10), 0xB (11), 0xC (12), 0xD (13), 0x20 (32). The * index must be <= 32. */ static const bool js_ws[] = { /* 0 1 2 3 4 5 5 7 8 9 */ /* 0 */ false, false, false, false, false, false, false, false, false, true, /* 1 */ true, true, true, true, false, false, false, false, false, false, /* 2 */ false, false, false, false, false, false, false, false, false, false, /* 3 */ false, false, true }; /* Sets of characters are described in terms of individuals and classes. */ class CharSet { public: CharSet() : charEnd(charBuf), classes(0) {} static const uintN sBufSize = 8; bool full() { return charEnd == charBuf + sBufSize; } /* Add a single char to the set. */ bool addChar(jschar c) { if (full()) return false; *charEnd++ = c; return true; } enum Class { LineTerms = 1 << 0, /* Line Terminators (E262 7.3) */ OtherSpace = 1 << 1, /* \s (E262 15.10.2.12) - LineTerms */ Digit = 1 << 2, /* \d (E262 15.10.2.12) */ OtherAlnum = 1 << 3, /* \w (E262 15,10.2.12) - Digit */ Other = 1 << 4, /* all other characters */ All = LineTerms | OtherSpace | Digit | OtherAlnum | Other, Space = LineTerms | OtherSpace, AlNum = Digit | OtherAlnum, Dot = All & ~LineTerms }; /* Add a set of chars to the set. */ void addClass(Class c) { classes |= c; } /* Return whether two sets of chars are disjoint. */ bool disjoint(const CharSet &) const; private: static bool disjoint(const jschar *beg, const jschar *end, uintN classes); mutable jschar charBuf[sBufSize]; jschar *charEnd; uintN classes; }; /* Appease the type checker. */ static inline CharSet::Class operator|(CharSet::Class c1, CharSet::Class c2) { return (CharSet::Class)(((int)c1) | ((int)c2)); } static inline CharSet::Class operator~(CharSet::Class c) { return (CharSet::Class)(~(int)c); } /* * Return whether the characters in the range [beg, end) fall within any of the * classes with a bit set in 'classes'. 
*/ bool CharSet::disjoint(const jschar *beg, const jschar *end, uintN classes) { for (const jschar *p = beg; p != end; ++p) { if (JS7_ISDEC(*p)) { if (classes & Digit) return false; } else if (JS_ISWORD(*p)) { if (classes & OtherAlnum) return false; } else if (RE_IS_LINE_TERM(*p)) { if (classes & LineTerms) return false; } else if (JS_ISSPACE(*p)) { if (classes & OtherSpace) return false; } else { if (classes & Other) return false; } } return true; } /* * Predicate version of the STL's set_intersection. Assumes both ranges are * sorted and thus runs in linear time. * * FIXME: This is a reusable algorithm, perhaps it should be put somewhere. */ template bool set_disjoint(InputIterator1 p1, InputIterator1 end1, InputIterator2 p2, InputIterator2 end2) { if (p1 == end1 || p2 == end2) return true; while (*p1 != *p2) { if (*p1 < *p2) { ++p1; if (p1 == end1) return true; } else if (*p2 < *p1) { ++p2; if (p2 == end2) return true; } } return false; } static JSBool CharCmp(void *arg, const void *a, const void *b, int *result) { jschar ca = *(jschar *)a, cb = *(jschar *)b; *result = ca - cb; return JS_TRUE; } bool CharSet::disjoint(const CharSet &other) const { /* Check overlap between classes. */ if (classes & other.classes) return false; /* * Check char-class overlap. Compare this->charBuf with other.classes and * vice versa with a loop. */ if (!disjoint(this->charBuf, this->charEnd, other.classes) || !disjoint(other.charBuf, other.charEnd, this->classes)) return false; /* Check char-char overlap. */ jschar tmp[CharSet::sBufSize]; js_MergeSort(charBuf, charEnd - charBuf, sizeof(jschar), CharCmp, 0, tmp); js_MergeSort(other.charBuf, other.charEnd - other.charBuf, sizeof(jschar), CharCmp, 0, tmp); return set_disjoint(charBuf, charEnd, other.charBuf, other.charEnd); } /* * Return true if the given subexpression may match the empty string. The * conservative answer is |true|. 
If |next| is true, then the subexpression is * considered to be |node| followed by the rest of |node->next|. Otherwise, the * subexpression is considered to be |node| by itself. */ static bool mayMatchEmpty(RENode *node, bool next = true) { if (!node) return true; switch (node->op) { case REOP_EMPTY: return true; case REOP_FLAT: return false; case REOP_CLASS: return false; case REOP_ALNUM: return false; case REOP_ALT: return (mayMatchEmpty((RENode *)node->kid) || mayMatchEmpty((RENode *)node->u.kid2)) && (!next || mayMatchEmpty(node->next)); case REOP_QUANT: return (node->u.range.min == 0 || mayMatchEmpty((RENode *)node->kid)) && (!next || mayMatchEmpty(node->next)); default: return true; } } /* * Enumerate the set of characters that may be consumed next by the given * subexpression in isolation. Return whether the enumeration was successful. */ static bool enumerateNextChars(JSContext *cx, RENode *node, CharSet &set) { JS_CHECK_RECURSION(cx, return JS_FALSE); if (!node) return true; switch (node->op) { /* Record as bitflags. */ case REOP_DOT: set.addClass(CharSet::Dot); return true; case REOP_DIGIT: set.addClass(CharSet::Digit); return true; case REOP_NONDIGIT: set.addClass(~CharSet::Digit); return true; case REOP_ALNUM: set.addClass(CharSet::AlNum); return true; case REOP_NONALNUM: set.addClass(~CharSet::AlNum); return true; case REOP_SPACE: set.addClass(CharSet::Space); return true; case REOP_NONSPACE: set.addClass(~CharSet::Space); return true; /* Record as individual characters. */ case REOP_FLAT: return set.addChar(node->u.flat.chr); /* Control structures. 
*/ case REOP_EMPTY: return true; case REOP_ALT: case REOP_ALTPREREQ: return enumerateNextChars(cx, (RENode *)node->kid, set) && enumerateNextChars(cx, (RENode *)node->u.kid2, set) && (!mayMatchEmpty(node, false) || enumerateNextChars(cx, (RENode *)node->next, set)); case REOP_QUANT: return enumerateNextChars(cx, (RENode *)node->kid, set) && (!mayMatchEmpty(node, false) || enumerateNextChars(cx, (RENode *)node->next, set)); /* Arbitrary character classes and oddities. */ default: return false; } } class RegExpNativeCompiler { private: VMAllocator& tempAlloc; JSContext* cx; JSRegExp* re; CompilerState* cs; /* RegExp to compile */ Fragment* fragment; LirWriter* lir; #ifdef DEBUG LirWriter* validate_writer; #endif #ifdef NJ_VERBOSE LirWriter* verbose_filter; #endif LirBufWriter* lirBufWriter; /* for skip */ LIns* state; LIns* start; LIns* cpend; LirBuffer* const lirbuf; bool outOfMemory() { return tempAlloc.outOfMemory() || JS_TRACE_MONITOR(cx).dataAlloc->outOfMemory(); } JSBool isCaseInsensitive() const { return (cs->flags & JSREG_FOLD) != 0; } void targetCurrentPoint(LIns *ins) { ins->setTarget(lir->ins0(LIR_label)); } void targetCurrentPoint(LInsList &fails) { LIns *fail = lir->ins0(LIR_label); for (size_t i = 0; i < fails.length(); ++i) { fails[i]->setTarget(fail); } fails.clear(); } /* * These functions return the new position after their match operation, * or NULL if there was an error. */ LIns* compileEmpty(RENode* node, LIns* pos, LInsList& fails) { return pos; } #if defined(AVMPLUS_ARM) || defined(AVMPLUS_SPARC) /* We can't do this on ARM or SPARC, since it relies on doing a 32-bit load from * a pointer which is only 2-byte aligned. 
*/ #undef USE_DOUBLE_CHAR_MATCH #else #define USE_DOUBLE_CHAR_MATCH #endif LIns* compileFlatSingleChar(jschar ch, LIns* pos, LInsList& fails) { LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_ltp, pos, cpend), 0); if (!fails.append(to_fail)) return NULL; LIns* text_ch = lir->insLoad(LIR_ldus2ui, pos, 0, ACC_READONLY); // Extra characters that need to be compared against when doing folding. struct extra { jschar ch; LIns *match; }; extra extras[5]; int nextras = 0; if (cs->flags & JSREG_FOLD) { ch = JS_TOUPPER(ch); jschar lch = inverse_upcase(ch); if (ch != lch) { if (L'A' <= ch && ch <= L'Z') { // Fast conversion of text character to lower case by OR-ing with 32. text_ch = lir->ins2(LIR_ori, text_ch, lir->insImmI(32)); // These ASCII letters have 2 lower-case forms. We put the ASCII one in // |extras| so it is tested first, because we expect that to be the common // case. Note that the code points of the non-ASCII forms both have the // 32 bit set, so it is OK to compare against the OR-32-converted text char. ch = lch; if (ch == L'i') { extras[nextras++].ch = ch; ch = 0x131; } else if (ch == L's') { extras[nextras++].ch = ch; ch = 0x17f; } goto gen; } else if (0x01c4 <= ch && ch <= 0x1e60) { // The following group of conditionals handles characters that have 1 or 2 // lower-case forms in addition to JS_TOLOWER(ch). 
if (ch <= 0x1f1) { // DZ,LJ,NJ if (ch == 0x01c4) { extras[nextras++].ch = 0x01c5; } else if (ch == 0x01c7) { extras[nextras++].ch = 0x01c8; } else if (ch == 0x01ca) { extras[nextras++].ch = 0x01cb; } else if (ch == 0x01f1) { extras[nextras++].ch = 0x01f2; } } else if (ch < 0x0392) { // no extra lower-case forms in this range } else if (ch <= 0x03a6) { // Greek if (ch == 0x0392) { extras[nextras++].ch = 0x03d0; } else if (ch == 0x0395) { extras[nextras++].ch = 0x03f5; } else if (ch == 0x0398) { extras[nextras++].ch = 0x03d1; } else if (ch == 0x0399) { extras[nextras++].ch = 0x0345; extras[nextras++].ch = 0x1fbe; } else if (ch == 0x039a) { extras[nextras++].ch = 0x03f0; } else if (ch == 0x039c) { extras[nextras++].ch = 0xb5; } else if (ch == 0x03a0) { extras[nextras++].ch = 0x03d6; } else if (ch == 0x03a1) { extras[nextras++].ch = 0x03f1; } else if (ch == 0x03a3) { extras[nextras++].ch = 0x03c2; } else if (ch == 0x03a6) { extras[nextras++].ch = 0x03d5; } } else if (ch == 0x1e60) { // S with dot above extras[nextras++].ch = 0x1e9b; } } extras[nextras++].ch = lch; } } gen: for (int i = 0; i < nextras; ++i) { LIns *test = lir->ins2(LIR_eqi, text_ch, lir->insImmI(extras[i].ch)); LIns *branch = lir->insBranch(LIR_jt, test, 0); extras[i].match = branch; } if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_eqi, text_ch, lir->insImmI(ch)), 0))) return NULL; for (int i = 0; i < nextras; ++i) targetCurrentPoint(extras[i].match); return lir->ins2(LIR_addp, pos, lir->insImmWord(2)); } JS_INLINE bool hasCases(jschar ch) { return JS_TOLOWER(ch) != JS_TOUPPER(ch); } LIns* compileFlatDoubleChar(jschar ch1, jschar ch2, LIns* pos, LInsList& fails) { #ifdef IS_BIG_ENDIAN uint32 word = (ch1 << 16) | ch2; #else uint32 word = (ch2 << 16) | ch1; #endif /* * Fast case-insensitive test for ASCII letters: convert text * char to lower case by bit-or-ing in 32 and compare. 
*/ JSBool useFastCI = JS_FALSE; union { jschar c[2]; uint32 i; } mask; if (cs->flags & JSREG_FOLD) { jschar uch1 = JS_TOUPPER(ch1); jschar uch2 = JS_TOUPPER(ch2); JSBool mask1 = (L'A' <= uch1 && uch1 <= L'Z' && uch1 != L'I' && uch1 != L'S'); JSBool mask2 = (L'A' <= uch2 && uch2 <= L'Z' && uch2 != L'I' && uch2 != L'S'); if ((!mask1 && hasCases(ch1)) || (!mask2 && hasCases(ch2))) { pos = compileFlatSingleChar(ch1, pos, fails); if (!pos) return NULL; return compileFlatSingleChar(ch2, pos, fails); } mask.c[0] = mask1 ? 0x0020 : 0x0; mask.c[1] = mask2 ? 0x0020 : 0x0; if (mask.i) { word |= mask.i; useFastCI = JS_TRUE; } } LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_ltp, pos, lir->ins2(LIR_addp, cpend, lir->insImmWord(-2))), 0); if (!fails.append(to_fail)) return NULL; LIns* text_word = lir->insLoad(LIR_ldi, pos, 0, ACC_OTHER); LIns* comp_word = useFastCI ? lir->ins2(LIR_ori, text_word, lir->insImmI(mask.i)) : text_word; if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_eqi, comp_word, lir->insImmI(word)), 0))) return NULL; return lir->ins2(LIR_addp, pos, lir->insImmWord(4)); } LIns* compileFlat(RENode *&node, LIns* pos, LInsList& fails) { #ifdef USE_DOUBLE_CHAR_MATCH if (node->u.flat.length == 1) { if (node->next && node->next->op == REOP_FLAT && node->next->u.flat.length == 1) { pos = compileFlatDoubleChar(node->u.flat.chr, node->next->u.flat.chr, pos, fails); node = node->next; } else { pos = compileFlatSingleChar(node->u.flat.chr, pos, fails); } return pos; } else { size_t i; for (i = 0; i < node->u.flat.length - 1; i += 2) { if (outOfMemory()) return 0; pos = compileFlatDoubleChar(((jschar*) node->kid)[i], ((jschar*) node->kid)[i+1], pos, fails); if (!pos) return 0; } JS_ASSERT(pos != 0); if (i == node->u.flat.length - 1) pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails); return pos; } #else if (node->u.flat.length == 1) { return compileFlatSingleChar(node->u.flat.chr, pos, fails); } else { for (size_t i = 0; i < node->u.flat.length; 
i++) { if (outOfMemory()) return 0; pos = compileFlatSingleChar(((jschar*) node->kid)[i], pos, fails); if (!pos) return 0; } return pos; } #endif } LIns* compileClass(RENode* node, LIns* pos, LInsList& fails) { if (!node->u.ucclass.sense) return JS_FALSE; /* * If we share generated native code, we need to make a copy * of the bitmap because the original regexp's copy is destroyed when * that regexp is. */ RECharSet *charSet = &re->classList[node->u.ucclass.index]; size_t bitmapLen = (charSet->length >> 3) + 1; /* Arbitrary size limit on bitmap. */ if (bitmapLen > 1024) return NULL; Allocator &alloc = *JS_TRACE_MONITOR(cx).dataAlloc; /* The following line allocates charSet.u.bits if successful. */ if (!charSet->converted && !ProcessCharSet(cx, re, charSet)) return NULL; void* bitmapData = alloc.alloc(bitmapLen); if (outOfMemory()) return NULL; memcpy(bitmapData, charSet->u.bits, bitmapLen); LIns* to_fail = lir->insBranch(LIR_jf, lir->ins2(LIR_ltp, pos, cpend), 0); if (!fails.append(to_fail)) return NULL; LIns* text_ch = lir->insLoad(LIR_ldus2ui, pos, 0, ACC_READONLY); if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_lei, text_ch, lir->insImmI(charSet->length)), 0))) { return NULL; } LIns* byteIndex = lir->insI2P(lir->ins2(LIR_rshi, text_ch, lir->insImmI(3))); LIns* bitmap = lir->insImmP(bitmapData); LIns* byte = lir->insLoad(LIR_lduc2ui, lir->ins2(LIR_addp, bitmap, byteIndex), (int) 0, ACC_READONLY); LIns* bitMask = lir->ins2(LIR_lshi, lir->insImmI(1), lir->ins2(LIR_andi, text_ch, lir->insImmI(0x7))); LIns* test = lir->ins2(LIR_eqi, lir->ins2(LIR_andi, byte, bitMask), lir->insImmI(0)); LIns* to_next = lir->insBranch(LIR_jt, test, 0); if (!fails.append(to_next)) return NULL; return lir->ins2(LIR_addp, pos, lir->insImmWord(2)); } /* Factor out common code to index js_alnum. 
*/
    LIns *compileTableRead(LIns *chr, const bool *tbl)
    {
        /*
         * The table is indexed by character value. When a bool is wider than
         * one byte the index must be scaled to a byte offset first; the shift
         * amount is log2(sizeof(bool)), computed at compile time.
         */
        if (sizeof(bool) != 1) {
            LIns *sizeLog2 = lir->insImmI(StaticLog2<sizeof(bool)>::result);
            chr = lir->ins2(LIR_lshi, chr, sizeLog2);
        }
        LIns *addr = lir->ins2(LIR_addp, lir->insImmP(tbl), lir->insUI2P(chr));
        return lir->insLoad(LIR_lduc2ui, addr, 0, ACC_READONLY);
    }

    /*
     * Compile a builtin character class (REOP_DOT, REOP_DIGIT, REOP_NONDIGIT,
     * REOP_ALNUM, REOP_NONALNUM, REOP_SPACE, REOP_NONSPACE).
     *
     * Emits a bounds check of 'pos' against 'cpend', loads one 16-bit character
     * and tests it. Every branch that must be taken on a failed match is
     * appended to 'fails' with a NULL target, to be patched by the caller.
     *
     * Returns the LIns for the position after the consumed character (pos + 2
     * bytes), or NULL if 'fails' could not be extended or node->op is not one
     * of the ops listed above.
     */
    LIns *compileBuiltinClass(RENode *node, LIns *pos, LInsList &fails)
    {
        /* All the builtins checked below consume one character. */
        if (!fails.append(lir->insBranch(LIR_jf, lir->ins2(LIR_ltp, pos, cpend), 0)))
            return NULL;
        LIns *chr = lir->insLoad(LIR_ldus2ui, pos, 0, ACC_READONLY);

        switch (node->op) {
          case REOP_DOT:
          {
            /* Accept any character except those in ECMA-262 15.10.2.8. */
            LIns *eq1 = lir->ins2(LIR_eqi, chr, lir->insImmI('\n'));
            if (!fails.append(lir->insBranch(LIR_jt, eq1, NULL)))
                return NULL;
            LIns *eq2 = lir->ins2(LIR_eqi, chr, lir->insImmI('\r'));
            if (!fails.append(lir->insBranch(LIR_jt, eq2, NULL)))
                return NULL;
            LIns *eq3 = lir->ins2(LIR_eqi, chr, lir->insImmI(LINE_SEPARATOR));
            if (!fails.append(lir->insBranch(LIR_jt, eq3, NULL)))
                return NULL;
            LIns *eq4 = lir->ins2(LIR_eqi, chr, lir->insImmI(PARA_SEPARATOR));
            if (!fails.append(lir->insBranch(LIR_jt, eq4, NULL)))
                return NULL;
            break;
          }
          case REOP_DIGIT:
          {
            /* Fail unless '0' <= chr <= '9'. */
            LIns *ge = lir->ins2(LIR_gei, chr, lir->insImmI('0'));
            if (!fails.append(lir->insBranch(LIR_jf, ge, NULL)))
                return NULL;
            LIns *le = lir->ins2(LIR_lei, chr, lir->insImmI('9'));
            if (!fails.append(lir->insBranch(LIR_jf, le, NULL)))
                return NULL;
            break;
          }
          case REOP_NONDIGIT:
          {
            /* Use 'and' to give a predictable branch for success path. */
            LIns *ge = lir->ins2(LIR_gei, chr, lir->insImmI('0'));
            LIns *le = lir->ins2(LIR_lei, chr, lir->insImmI('9'));
            LIns *both = lir->ins2(LIR_andi, ge, le);
            /* Fail when (ge & le) is non-zero, i.e. when chr is a digit. */
            if (!fails.append(lir->insBranch(LIR_jf, lir->insEqI_0(both), NULL)))
                return NULL;
            break;
          }
          case REOP_ALNUM:
          {
            /*
             * Compile the condition:
             *   ((uint)*cp) < 128 && js_alnum[(uint)*cp]
             */
            LIns *rangeCnd = lir->ins2(LIR_ltui, chr, lir->insImmI(128));
            if (!fails.append(lir->insBranch(LIR_jf, rangeCnd, NULL)))
                return NULL;
            LIns *tableVal = compileTableRead(chr, js_alnum);
            if (!fails.append(lir->insBranch(LIR_jt, lir->insEqI_0(tableVal), NULL)))
                return NULL;
            break;
          }
          case REOP_NONALNUM:
          {
            /*
             * Compile the condition:
             *   ((uint)*cp) >= 128 || !js_alnum[(uint)*cp]
             */
            LIns *rangeCnd = lir->ins2(LIR_geui, chr, lir->insImmI(128));
            /* Characters >= 128 succeed immediately and skip the table read. */
            LIns *rangeBr = lir->insBranch(LIR_jt, rangeCnd, NULL);
            LIns *tableVal = compileTableRead(chr, js_alnum);
            if (!fails.append(lir->insBranch(LIR_jf, lir->insEqI_0(tableVal), NULL)))
                return NULL;
            LIns *success = lir->ins0(LIR_label);
            rangeBr->setTarget(success);
            break;
          }
          case REOP_SPACE:
          case REOP_NONSPACE:
          {
            /*
             * ECMA-262 7.2, 7.3, and 15.10.2.12 define a bunch of Unicode code
             * points for whitespace. We optimize here for the common case of
             * ASCII characters using a table lookup for the lower block that
             * can actually contain spaces. For the rest, use a (more or less)
             * binary search to minimize tests.
             *
             *   [0000,0020]: 9, A, B, C, D, 20
             *   (0020,00A0): none
             *   [00A0,2000): A0, 1680, 180E
             *   [2000,200A]: all
             *   (200A, max): 2028, 2029, 202F, 205F, 3000
             */
            /* At or below 0x20? */
            LIns *tableRangeCnd = lir->ins2(LIR_leui, chr, lir->insImmI(0x20));
            LIns *tableRangeBr = lir->insBranch(LIR_jt, tableRangeCnd, NULL);
            /* Fall through means *chr > 0x20. */

            /* Handle (0x20,0xA0). */
            LIns *asciiCnd = lir->ins2(LIR_ltui, chr, lir->insImmI(0xA0));
            LIns *asciiMissBr = lir->insBranch(LIR_jt, asciiCnd, NULL);
            /* Fall through means *chr >= 0xA0. */

            /* Partition around [0x2000,0x200A]. */
            LIns *belowCnd = lir->ins2(LIR_ltui, chr, lir->insImmI(0x2000));
            LIns *belowBr = lir->insBranch(LIR_jt, belowCnd, NULL);
            LIns *aboveCnd = lir->ins2(LIR_gtui, chr, lir->insImmI(0x200A));
            LIns *aboveBr = lir->insBranch(LIR_jt, aboveCnd, NULL);
            LIns *intervalMatchBr = lir->insBranch(LIR_j, NULL, NULL);

            /* Handle [0xA0,0x2000). */
            LIns *belowLbl = lir->ins0(LIR_label);
            belowBr->setTarget(belowLbl);
            LIns *eq1Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0xA0));
            LIns *eq1Br = lir->insBranch(LIR_jt, eq1Cnd, NULL);
            LIns *eq2Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x1680));
            LIns *eq2Br = lir->insBranch(LIR_jt, eq2Cnd, NULL);
            LIns *eq3Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x180E));
            LIns *eq3Br = lir->insBranch(LIR_jt, eq3Cnd, NULL);
            LIns *belowMissBr = lir->insBranch(LIR_j, NULL, NULL);

            /* Handle (0x200A, max). */
            LIns *aboveLbl = lir->ins0(LIR_label);
            aboveBr->setTarget(aboveLbl);
            LIns *eq4Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x2028));
            LIns *eq4Br = lir->insBranch(LIR_jt, eq4Cnd, NULL);
            LIns *eq5Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x2029));
            LIns *eq5Br = lir->insBranch(LIR_jt, eq5Cnd, NULL);
            LIns *eq6Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x202F));
            LIns *eq6Br = lir->insBranch(LIR_jt, eq6Cnd, NULL);
            LIns *eq7Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x205F));
            LIns *eq7Br = lir->insBranch(LIR_jt, eq7Cnd, NULL);
            LIns *eq8Cnd = lir->ins2(LIR_eqi, chr, lir->insImmI(0x3000));
            LIns *eq8Br = lir->insBranch(LIR_jt, eq8Cnd, NULL);
            LIns *aboveMissBr = lir->insBranch(LIR_j, NULL, NULL);

            /* Handle [0,0x20]. */
            LIns *tableLbl = lir->ins0(LIR_label);
            tableRangeBr->setTarget(tableLbl);
            LIns *tableVal = compileTableRead(chr, js_ws);
            LIns *tableCnd = lir->insEqI_0(tableVal);
            LIns *tableMatchBr = lir->insBranch(LIR_jf, tableCnd, NULL);

            /* Collect misses. */
            LIns *missLbl = lir->ins0(LIR_label);
            asciiMissBr->setTarget(missLbl);
            belowMissBr->setTarget(missLbl);
            aboveMissBr->setTarget(missLbl);
            LIns *missBr = lir->insBranch(LIR_j, NULL, NULL);
            /* For \s a whitespace miss is a match failure. */
            if (node->op == REOP_SPACE) {
                if (!fails.append(missBr))
                    return NULL;
            }

            /* Collect matches. */
            LIns *matchLbl = lir->ins0(LIR_label);
            intervalMatchBr->setTarget(matchLbl);
            tableMatchBr->setTarget(matchLbl);
            eq1Br->setTarget(matchLbl); eq2Br->setTarget(matchLbl);
            eq3Br->setTarget(matchLbl); eq4Br->setTarget(matchLbl);
            eq5Br->setTarget(matchLbl); eq6Br->setTarget(matchLbl);
            eq7Br->setTarget(matchLbl); eq8Br->setTarget(matchLbl);
            /* For \S a whitespace hit is a match failure. */
            if (node->op == REOP_NONSPACE) {
                LIns *matchBr = lir->insBranch(LIR_j, NULL, NULL);
                if (!fails.append(matchBr))
                    return NULL;
            }
            /* Fall through means match == success. */

            /* Collect successes to fall through. */
            LIns *success = lir->ins0(LIR_label);
            if (node->op == REOP_NONSPACE)
                missBr->setTarget(success);
            break;
          }
          default:
            return NULL;
        }

        /* Advance past the single character that was consumed. */
        return lir->ins2(LIR_addp, pos, lir->insImmWord(2));
    }

    /*
     * Compile an alternation whose branches are node->kid (left) and
     * node->u.kid2 (right).
     *
     * 'atEnd' tells whether nothing follows the alternation; when it is false
     * the function refuses (returns NULL) unless it can show that no
     * backtracking between the branches is ever needed. Failures of the right
     * branch are appended to 'fails'; failures of the left branch are routed
     * to the start of the right branch.
     *
     * Returns the LIns holding the position after the alternation, or NULL if
     * the alternation cannot be compiled.
     */
    LIns *compileAlt(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
    {
        RENode *leftRe = (RENode *)node->kid, *rightRe = (RENode *)node->u.kid2;

        /*
         * If the RE continues after the alternative, we need to ensure that no
         * backtracking is required. Recursive calls to compileNode will fail
         * on capturing parens, so the only thing we have to check here is that,
         * if the left subexpression matches, we can keep going without later
         * deciding we need to try the right subexpression.
         */
        if (!atEnd) {
            /*
             * If there is no character overlap between left and right, then
             * there is only one possible path through the alternative.
             */
            CharSet leftSet, rightSet;
            if (!enumerateNextChars(cx, leftRe, leftSet) ||
                !enumerateNextChars(cx, rightRe, rightSet) ||
                !leftSet.disjoint(rightSet))
                return NULL;

            /*
             * If there is an empty path through either subexpression, the above
             * check is incomplete; we need to include |node->next| as well.
             */
            bool epsLeft = mayMatchEmpty(leftRe),
                 epsRight = mayMatchEmpty(rightRe);
            if (epsRight && epsLeft) {
                return NULL;
            } else if (epsLeft || epsRight) {
                CharSet nextSet;
                if (!enumerateNextChars(cx, node->next, nextSet) ||
                    (epsLeft && !nextSet.disjoint(rightSet)) ||
                    (epsRight && !nextSet.disjoint(leftSet))) {
                    return NULL;
                }
            }
        }

        /* Try left branch. */
        LInsList kidFails(cx);
        LIns *branchEnd = compileNode(leftRe, pos, atEnd, kidFails);
        if (!branchEnd)
            return NULL;

        /*
         * Since there are no phis, simulate by writing to and reading from
         * memory (REGlobalData::stateStack, since it is unused).
         */
        lir->insStore(branchEnd, state, offsetof(REGlobalData, stateStack), ACC_OTHER);
        LIns *leftSuccess = lir->insBranch(LIR_j, NULL, NULL);

        /* Try right branch. */
        targetCurrentPoint(kidFails);
        if (!(branchEnd = compileNode(rightRe, pos, atEnd, fails)))
            return NULL;
        lir->insStore(branchEnd, state, offsetof(REGlobalData, stateStack), ACC_OTHER);

        /* Land success on the left branch. */
        targetCurrentPoint(leftSuccess);
        return addName(fragment->lirbuf,
                       lir->insLoad(LIR_ldp, state,
                                    offsetof(REGlobalData, stateStack), ACC_OTHER),
                       "pos");
    }

    /*
     * Compile an optional body ('?'): try 'node' once and continue from the
     * original position if it fails.
     *
     * The body's failures are collected locally and targeted at the join
     * point, so nothing is appended to 'fails' by this function itself.
     * Returns the joined position, or NULL if the body cannot be compiled.
     */
    LIns *compileOpt(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
    {
        /*
         * Since there are no phis, simulate by writing to and reading from
         * memory (REGlobalData::stateStack, since it is unused).
         */
        lir->insStore(pos, state, offsetof(REGlobalData, stateStack), ACC_OTHER);

        /* Try ? body. */
        LInsList kidFails(cx);
        if (!(pos = compileNode(node, pos, atEnd, kidFails)))
            return NULL;
        lir->insStore(pos, state, offsetof(REGlobalData, stateStack), ACC_OTHER);

        /* Join success and failure and get new position. */
        targetCurrentPoint(kidFails);
        pos = addName(fragment->lirbuf,
                      lir->insLoad(LIR_ldp, state,
                                   offsetof(REGlobalData, stateStack), ACC_OTHER),
                      "pos");

        return pos;
    }

    /*
     * Compile a quantifier node. Only greedy '*', '+', '?' and {1,1} are
     * handled; anything else returns NULL.
     *
     * When 'atEnd' is false, the quantifier is compiled only if the characters
     * that can start the body are disjoint from those that can start
     * node->next. Returns the position after the quantifier, or NULL if it
     * cannot be compiled.
     */
    LIns *compileQuant(RENode *node, LIns *pos, bool atEnd, LInsList &fails)
    {
        /* Only support greedy *, +, ?. */
        if (!node->u.range.greedy ||
            node->u.range.min > 1 ||
            (node->u.range.max > 1 && node->u.range.max < (uintN)-1)) {
            return NULL;
        }

        RENode *bodyRe = (RENode *)node->kid;

        /*
         * If the RE continues after the quantifier, we need to ensure that no
         * backtracking is required. Recursive calls to compileNode will fail
         * on capturing parens, so the only thing we have to check here is that,
         * if the quantifier body matches, we can continue matching the body
         * without later deciding we need to undo the body matches.
         */
        if (!atEnd) {
            /*
             * If there is no character overlap between the body and
             * |node->next|, then all possible body matches are used.
             */
            CharSet bodySet, nextSet;
            if (!enumerateNextChars(cx, bodyRe, bodySet) ||
                !enumerateNextChars(cx, node->next, nextSet) ||
                !bodySet.disjoint(nextSet)) {
                return NULL;
            }
        }

        /* Fork off ? and {1,1}. */
        if (node->u.range.max == 1) {
            if (node->u.range.min == 1)
                return compileNode(bodyRe, pos, atEnd, fails);
            else
                return compileOpt(bodyRe, pos, atEnd, fails);
        }

        /* For +, compile a copy of the body where failure is real failure. */
        if (node->u.range.min == 1) {
            if (!(pos = compileNode(bodyRe, pos, atEnd, fails)))
                return NULL;
        }

        /*
         * Since there are no phis, simulate by writing to and reading from
         * memory (REGlobalData::stateStack, since it is unused).
         */
        lir->insStore(pos, state, offsetof(REGlobalData, stateStack), ACC_OTHER);

        /* Begin iteration: load loop variables. */
        LIns *loopTop = lir->ins0(LIR_label);
        LIns *iterBegin = addName(fragment->lirbuf,
                                  lir->insLoad(LIR_ldp, state,
                                               offsetof(REGlobalData, stateStack), ACC_OTHER),
                                  "pos");

        /* Match quantifier body. */
        LInsList kidFails(cx);
        LIns *iterEnd = compileNode(bodyRe, iterBegin, atEnd, kidFails);
        if (!iterEnd)
            return NULL;

        /*
         * If there is an epsilon path through the body then, when it is taken,
         * we need to abort the loop or else we will loop forever.
         */
        if (mayMatchEmpty(bodyRe)) {
            LIns *eqCnd = lir->ins2(LIR_eqp, iterBegin, iterEnd);
            if (!kidFails.append(lir->insBranch(LIR_jt, eqCnd, NULL)))
                return NULL;
        }

        /* End iteration: store loop variables, increment, jump */
        lir->insStore(iterEnd, state, offsetof(REGlobalData, stateStack), ACC_OTHER);
        lir->insBranch(LIR_j, NULL, loopTop);

        /*
         * Using '+' as branch, the intended control flow is:
         *
         *     ...
         * A -> |
         *      |<---.
         * B -> |    |
         *      +--. |
         * C -> |  | |
         *      +--. |
         * D -> |  | |
         *      +--|-'
         * X -> |  |
         *      |<-'
         * E -> |
         *     ...
         *
         * We are currently at point X. Since the regalloc makes a single,
         * linear, backwards sweep over the IR (going from E to A), point X
         * must tell the regalloc what LIR insns are live at the end of D.
         * Thus, we need to report *all* insns defined *before* the end of D
         * that may be used *after* D. This means insns defined in A, B, C, or
         * D and used in B, C, D, or E. Since insns in B, C, and D are
         * conditionally executed, and we (currently) don't have real phi
         * nodes, we need only consider insns defined in A and used in E.
         */
        lir->ins1(LIR_livep, state);
        lir->ins1(LIR_livep, cpend);
        lir->ins1(LIR_livep, start);

        /*
         * After the loop: body failures land here. The value handed back is
         * iterBegin, the position loaded from memory at the top of the
         * iteration whose body did not match.
         */
        targetCurrentPoint(kidFails);
        return iterBegin;
    }

    /*
     * Compile the regular expression rooted at 'node'. Return 0 on failed
     * compilation. Otherwise, generate code that falls through on success (the
     * returned LIns* is the current 'pos') and jumps to the end on failure (by
     * adding the guard LIns to 'fails').
*/ LIns *compileNode(RENode *node, LIns *pos, bool atEnd, LInsList &fails) { for (; pos && node; node = node->next) { if (outOfMemory()) return NULL; bool childNextIsEnd = atEnd && !node->next; switch (node->op) { case REOP_EMPTY: pos = compileEmpty(node, pos, fails); break; case REOP_FLAT: pos = compileFlat(node, pos, fails); break; case REOP_ALT: case REOP_ALTPREREQ: pos = compileAlt(node, pos, childNextIsEnd, fails); break; case REOP_QUANT: pos = compileQuant(node, pos, childNextIsEnd, fails); break; case REOP_CLASS: pos = compileClass(node, pos, fails); break; case REOP_DOT: case REOP_DIGIT: case REOP_NONDIGIT: case REOP_ALNUM: case REOP_NONALNUM: case REOP_SPACE: case REOP_NONSPACE: pos = compileBuiltinClass(node, pos, fails); break; default: return NULL; } } return pos; } /* * This function kicks off recursive compileNode compilation, finishes the * success path, and lets the failed-match path fall through. */ bool compileRootNode(RENode *root, LIns *pos, LIns *anchorFail) { /* Compile the regular expression body. */ LInsList fails(cx); pos = compileNode(root, pos, true, fails); if (!pos) return false; /* Fall-through from compileNode means success. */ lir->insStore(pos, state, offsetof(REGlobalData, stateStack), ACC_OTHER); lir->ins0(LIR_regfence); lir->ins1(LIR_reti, lir->insImmI(1)); /* Stick return here so we don't have to jump over it every time. */ if (anchorFail) { targetCurrentPoint(anchorFail); lir->ins0(LIR_regfence); lir->ins1(LIR_reti, lir->insImmI(0)); } /* Target failed matches. */ targetCurrentPoint(fails); return true; } /* Compile a regular expressions that can only match on the first char. */ bool compileSticky(RENode *root, LIns *start) { if (!compileRootNode(root, start, NULL)) return false; /* Failed to match on first character, so fail whole match. */ lir->ins0(LIR_regfence); lir->ins1(LIR_reti, lir->insImmI(0)); return !outOfMemory(); } /* Compile normal regular expressions that can match starting at any char. 
*/ bool compileAnchoring(RENode *root, LIns *start) { /* Guard outer anchoring loop. Use <= to allow empty regexp match. */ LIns *anchorFail = lir->insBranch(LIR_jf, lir->ins2(LIR_lep, start, cpend), 0); if (!compileRootNode(root, start, anchorFail)) return false; /* Outer loop increment. */ lir->insStore(lir->ins2(LIR_addp, start, lir->insImmWord(2)), state, offsetof(REGlobalData, skipped), ACC_OTHER); return !outOfMemory(); } inline LIns* addName(LirBuffer* lirbuf, LIns* ins, const char* name) { #ifdef NJ_VERBOSE debug_only_stmt(lirbuf->printer->lirNameMap->addName(ins, name);) #endif return ins; } /* * Insert the side exit and guard record for a compiled regexp. Most * of the fields are not used. The important part is the regexp source * and flags, which we use as the fragment lookup key. */ GuardRecord* insertGuard(LIns* loopLabel, const jschar* re_chars, size_t re_length) { if (loopLabel) { lir->insBranch(LIR_j, NULL, loopLabel); LirBuffer* lirbuf = fragment->lirbuf; lir->ins1(LIR_livep, lirbuf->state); lir->ins1(LIR_livep, lirbuf->param1); } Allocator &alloc = *JS_TRACE_MONITOR(cx).dataAlloc; /* Must only create a VMSideExit; see StackFilter::getTops. 
*/ size_t len = (sizeof(GuardRecord) + sizeof(VMSideExit) + (re_length-1) * sizeof(jschar)); GuardRecord* guard = (GuardRecord *) alloc.alloc(len); VMSideExit* exit = (VMSideExit*)(guard+1); guard->exit = exit; guard->exit->target = fragment; fragment->lastIns = lir->insGuard(LIR_x, NULL, guard); // guard->profCount is calloc'd to zero verbose_only( guard->profGuardID = fragment->guardNumberer++; guard->nextInFrag = fragment->guardsForFrag; fragment->guardsForFrag = guard; ) return guard; } public: RegExpNativeCompiler(JSContext* cx, JSRegExp* re, CompilerState* cs, Fragment* fragment) : tempAlloc(*JS_TRACE_MONITOR(cx).reTempAlloc), cx(cx), re(re), cs(cs), fragment(fragment), lir(NULL), lirBufWriter(NULL), lirbuf(new (tempAlloc) LirBuffer(tempAlloc)) { fragment->lirbuf = lirbuf; #ifdef DEBUG lirbuf->printer = new (tempAlloc) LInsPrinter(tempAlloc); #endif } ~RegExpNativeCompiler() { /* Purge the tempAlloc used during recording. */ tempAlloc.reset(); } JSBool compile() { GuardRecord* guard = NULL; const jschar* re_chars; size_t re_length; TraceMonitor* tm = &JS_TRACE_MONITOR(cx); Assembler *assm = tm->assembler; LIns* loopLabel = NULL; if (outOfMemory() || OverfullJITCache(tm)) return JS_FALSE; re->source->getCharsAndLength(re_chars, re_length); /* * If the regexp is too long nanojit will assert when we * try to insert the guard record. */ if (re_length > 1024) { re->flags |= JSREG_NOCOMPILE; return JS_FALSE; } /* At this point we have an empty fragment. */ LirBuffer* lirbuf = fragment->lirbuf; if (outOfMemory()) goto fail; /* FIXME Use bug 463260 smart pointer when available. */ lir = lirBufWriter = new LirBufWriter(lirbuf, nanojit::AvmCore::config); /* FIXME Use bug 463260 smart pointer when available. 
*/ #ifdef NJ_VERBOSE debug_only_stmt( if (LogController.lcbits & LC_TMRegexp) { lir = verbose_filter = new VerboseWriter(tempAlloc, lir, lirbuf->printer, &LogController); } ) #endif #ifdef DEBUG lir = validate_writer = new ValidateWriter(lir, lirbuf->printer, "regexp writer pipeline"); #endif /* * Although we could just load REGlobalData::cpend from 'state', by * passing it as a parameter, we avoid loading it every iteration. */ lir->ins0(LIR_start); for (int i = 0; i < NumSavedRegs; ++i) lir->insParam(i, 1); #ifdef DEBUG for (int i = 0; i < NumSavedRegs; ++i) addName(lirbuf, lirbuf->savedRegs[i], regNames[Assembler::savedRegs[i]]); #endif lirbuf->state = state = addName(lirbuf, lir->insParam(0, 0), "state"); lirbuf->param1 = cpend = addName(lirbuf, lir->insParam(1, 0), "cpend"); loopLabel = lir->ins0(LIR_label); // If profiling, record where the loop label is, so that the // assembler can insert a frag-entry-counter increment at that // point verbose_only( if (LogController.lcbits & LC_FragProfile) { NanoAssert(!fragment->loopLabel); fragment->loopLabel = loopLabel; }) start = addName(lirbuf, lir->insLoad(LIR_ldp, state, offsetof(REGlobalData, skipped), ACC_OTHER), "start"); if (cs->flags & JSREG_STICKY) { if (!compileSticky(cs->result, start)) goto fail; } else { if (!compileAnchoring(cs->result, start)) goto fail; } guard = insertGuard(loopLabel, re_chars, re_length); if (outOfMemory()) goto fail; /* * Deep in the nanojit compiler, the StackFilter is trying to throw * away stores above the VM interpreter/native stacks. We have no such * stacks, so rely on the fact that lirbuf->sp and lirbuf->rp are null * to ensure our stores are ignored. 
*/ JS_ASSERT(!lirbuf->sp && !lirbuf->rp); assm->compile(fragment, tempAlloc, /*optimize*/true verbose_only(, lirbuf->printer)); if (assm->error() != nanojit::None) goto fail; delete lirBufWriter; #ifdef DEBUG delete validate_writer; #endif #ifdef NJ_VERBOSE debug_only_stmt( if (LogController.lcbits & LC_TMRegexp) delete verbose_filter; ) #endif return JS_TRUE; fail: if (outOfMemory() || OverfullJITCache(tm)) { delete lirBufWriter; // recover profiling data from expiring Fragments verbose_only( REHashMap::Iter iter(*(tm->reFragments)); while (iter.next()) { nanojit::Fragment* frag = iter.value(); FragProfiling_FragFinalizer(frag, tm); } ) FlushJITCache(cx); } else { if (!guard) insertGuard(loopLabel, re_chars, re_length); re->flags |= JSREG_NOCOMPILE; delete lirBufWriter; } #ifdef DEBUG delete validate_writer; #endif #ifdef NJ_VERBOSE debug_only_stmt( if (LogController.lcbits & LC_TMRegexp) delete verbose_filter; ) #endif return JS_FALSE; } }; /* * Compile a regexp to native code in the given fragment. */ static inline JSBool CompileRegExpToNative(JSContext* cx, JSRegExp* re, Fragment* fragment) { JSBool rv = JS_FALSE; void* mark; CompilerState state; RegExpNativeCompiler rc(cx, re, &state, fragment); JS_ASSERT(!fragment->code()); mark = JS_ARENA_MARK(&cx->tempPool); if (!CompileRegExpToAST(cx, NULL, re->source, re->flags, state)) { goto out; } rv = rc.compile(); out: JS_ARENA_RELEASE(&cx->tempPool, mark); return rv; } /* Function type for a compiled native regexp. */ typedef void *(FASTCALL *NativeRegExp)(REGlobalData*, const jschar *); /* * Return a compiled native regexp if one already exists or can be created * now, or NULL otherwise. 
*/ static NativeRegExp GetNativeRegExp(JSContext* cx, JSRegExp* re) { const jschar *re_chars; size_t re_length; re->source->getCharsAndLength(re_chars, re_length); Fragment *fragment = LookupNativeRegExp(cx, re->flags, re_chars, re_length); JS_ASSERT(fragment); if (!fragment->code() && fragment->recordAttempts == 0) { fragment->recordAttempts++; if (!CompileRegExpToNative(cx, re, fragment)) return NULL; } union { NIns *code; NativeRegExp func; } u; u.code = fragment->code(); return u.func; } #endif JSRegExp * js_NewRegExp(JSContext *cx, TokenStream *ts, JSString *str, uintN flags, JSBool flat) { JSRegExp *re; void *mark; CompilerState state; size_t resize; jsbytecode *endPC; uintN i; re = NULL; mark = JS_ARENA_MARK(&cx->tempPool); /* * Parsing the string as flat is now expressed internally using * a flag, so that we keep this information in the JSRegExp, but * we keep the 'flat' parameter for now for compatibility. */ if (flat) flags |= JSREG_FLAT; if (!CompileRegExpToAST(cx, ts, str, flags, state)) goto out; resize = offsetof(JSRegExp, program) + state.progLength + 1; re = (JSRegExp *) cx->malloc(resize); if (!re) goto out; re->nrefs = 1; JS_ASSERT(state.classBitmapsMem <= CLASS_BITMAPS_MEM_LIMIT); re->classCount = state.classCount; if (re->classCount) { re->classList = (RECharSet *) cx->malloc(re->classCount * sizeof(RECharSet)); if (!re->classList) { js_DestroyRegExp(cx, re); re = NULL; goto out; } for (i = 0; i < re->classCount; i++) re->classList[i].converted = JS_FALSE; } else { re->classList = NULL; } /* Compile the bytecode version. */ endPC = EmitREBytecode(&state, re, state.treeDepth, re->program, state.result); if (!endPC) { js_DestroyRegExp(cx, re); re = NULL; goto out; } *endPC++ = REOP_END; /* * Check whether size was overestimated and shrink using realloc. * This is safe since no pointers to newly parsed regexp or its parts * besides re exist here. 
*/ if ((size_t)(endPC - re->program) != state.progLength + 1) { JSRegExp *tmp; JS_ASSERT((size_t)(endPC - re->program) < state.progLength + 1); resize = offsetof(JSRegExp, program) + (endPC - re->program); tmp = (JSRegExp *) cx->realloc(re, resize); if (tmp) re = tmp; } re->flags = uint16(flags); re->parenCount = state.parenCount; re->source = str; out: JS_ARENA_RELEASE(&cx->tempPool, mark); return re; } JSRegExp * js_NewRegExpOpt(JSContext *cx, JSString *str, JSString *opt, JSBool flat) { uintN flags; const jschar *s; size_t i, n; char charBuf[2]; flags = 0; if (opt) { opt->getCharsAndLength(s, n); for (i = 0; i < n; i++) { #define HANDLE_FLAG(name) \ JS_BEGIN_MACRO \ if (flags & (name)) \ goto bad_flag; \ flags |= (name); \ JS_END_MACRO switch (s[i]) { case 'g': HANDLE_FLAG(JSREG_GLOB); break; case 'i': HANDLE_FLAG(JSREG_FOLD); break; case 'm': HANDLE_FLAG(JSREG_MULTILINE); break; case 'y': HANDLE_FLAG(JSREG_STICKY); break; default: bad_flag: charBuf[0] = (char)s[i]; charBuf[1] = '\0'; JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, NULL, JSMSG_BAD_REGEXP_FLAG, charBuf); return NULL; } #undef HANDLE_FLAG } } return js_NewRegExp(cx, NULL, str, flags, flat); } /* * Save the current state of the match - the position in the input * text as well as the position in the bytecode. The state of any * parent expressions is also saved (preceding state). * Contents of parenCount parentheses from parenIndex are also saved. 
*/
static REBackTrackData *
PushBackTrackState(REGlobalData *gData, REOp op,
                   jsbytecode *target, REMatchState *x, const jschar *cp,
                   size_t parenIndex, size_t parenCount)
{
    /* The new frame sits directly after the current top frame. */
    REBackTrackData *frame =
        (REBackTrackData *) ((char *)gData->backTrackSP + gData->cursz);

    /* Header, then the saved state stack, then the saved captures. */
    size_t frameBytes = sizeof(REBackTrackData) +
                        gData->stateStackTop * sizeof(REProgState) +
                        parenCount * sizeof(RECapture);

    ptrdiff_t stackBytes = gData->backTrackStackSize;
    /* How far the end of the new frame overshoots the current allocation. */
    ptrdiff_t growBy = ((char *)frame + frameBytes) -
                       ((char *)gData->backTrackStack + stackBytes);

    re_debug("\tBT_Push: %lu,%lu",
             (unsigned long) parenIndex, (unsigned long) parenCount);

    if (growBy > 0) {
        /* Remember the frame as an offset: growing may move the stack. */
        ptrdiff_t frameOffset = (char *)frame - (char *)gData->backTrackStack;

        growBy = JS_ROUNDUP(growBy, stackBytes);
        JS_ARENA_GROW_CAST(gData->backTrackStack, REBackTrackData *,
                           &gData->cx->regexpPool, stackBytes, growBy);
        if (!gData->backTrackStack) {
            js_ReportOutOfScriptQuota(gData->cx);
            gData->ok = JS_FALSE;
            return NULL;
        }
        gData->backTrackStackSize = stackBytes + growBy;
        frame = (REBackTrackData *) ((char *)gData->backTrackStack + frameOffset);
    }

    /* Link the frame in: it records the size of the frame below it. */
    gData->backTrackSP = frame;
    frame->sz = gData->cursz;
    gData->cursz = frameBytes;

    frame->backtrack_op = op;
    frame->backtrack_pc = target;
    frame->cp = cp;
    frame->parenCount = parenCount;
    frame->parenIndex = parenIndex;

    frame->saveStateStackTop = gData->stateStackTop;
    JS_ASSERT(gData->stateStackTop);

    /* Snapshot the program state stack right behind the header. */
    char *payload = (char *)(frame + 1);
    memcpy(payload, gData->stateStack,
           sizeof(REProgState) * frame->saveStateStackTop);

    if (parenCount != 0) {
        /* Snapshot the captures after the state stack, then clear them. */
        RECapture *cap = &x->parens[parenIndex];
        RECapture *capEnd = cap + parenCount;

        memcpy(payload + sizeof(REProgState) * frame->saveStateStackTop,
               cap,
               sizeof(RECapture) * parenCount);
        for (; cap != capEnd; ++cap)
            cap->index = -1;
    }

    return frame;
}

/*
 *   Consecutive literal characters.
*/
#if 0
static REMatchState *
FlatNMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
             size_t length)
{
    size_t i;
    if (length > gData->cpend - x->cp)
        return NULL;
    for (i = 0; i != length; i++) {
        if (matchChars[i] != x->cp[i])
            return NULL;
    }
    x->cp += length;
    return x;
}
#endif

/*
 * Case-insensitive match of 'length' literal characters at the current
 * input position. Both sides are compared after upcase(). On success the
 * position in 'x' is advanced by 'length' and 'x' is returned; otherwise
 * NULL is returned and 'x' is left untouched.
 */
static JS_ALWAYS_INLINE REMatchState *
FlatNIMatcher(REGlobalData *gData, REMatchState *x, jschar *matchChars,
              size_t length)
{
    JS_ASSERT(gData->cpend >= x->cp);

    /* Not enough input left for the literal. */
    const size_t remaining = (size_t)(gData->cpend - x->cp);
    if (length > remaining)
        return NULL;

    for (size_t k = 0; k < length; ++k) {
        if (upcase(matchChars[k]) != upcase(x->cp[k]))
            return NULL;
    }

    x->cp += length;
    return x;
}

/*
 * 1. Evaluate DecimalEscape to obtain an EscapeValue E.
 * 2. If E is not a character then go to step 6.
 * 3. Let ch be E's character.
 * 4. Let A be a one-element RECharSet containing the character ch.
 * 5. Call CharacterSetMatcher(A, false) and return its Matcher result.
 * 6. E must be an integer. Let n be that integer.
 * 7. If n=0 or n>NCapturingParens then throw a SyntaxError exception.
 * 8. Return an internal Matcher closure that takes two arguments, a State x
 *    and a Continuation c, and performs the following:
 *     1. Let cap be x's captures internal array.
 *     2. Let s be cap[n].
 *     3. If s is undefined, then call c(x) and return its result.
 *     4. Let e be x's endIndex.
 *     5. Let len be s's length.
 *     6. Let f be e+len.
 *     7. If f>InputLength, return failure.
 *     8. If there exists an integer i between 0 (inclusive) and len (exclusive)
 *        such that Canonicalize(s[i]) is not the same character as
 *        Canonicalize(Input [e+i]), then return failure.
 *     9. Let y be the State (f, cap).
 *     10. Call c(y) and return its result.
*/ static REMatchState * BackrefMatcher(REGlobalData *gData, REMatchState *x, size_t parenIndex) { size_t len, i; const jschar *parenContent; RECapture *cap = &x->parens[parenIndex]; if (cap->index == -1) return x; len = cap->length; if (x->cp + len > gData->cpend) return NULL; parenContent = &gData->cpbegin[cap->index]; if (gData->regexp->flags & JSREG_FOLD) { for (i = 0; i < len; i++) { if (upcase(parenContent[i]) != upcase(x->cp[i])) return NULL; } } else { for (i = 0; i < len; i++) { if (parenContent[i] != x->cp[i]) return NULL; } } x->cp += len; return x; } /* Add a single character to the RECharSet */ static void AddCharacterToCharSet(RECharSet *cs, jschar c) { uintN byteIndex = (uintN)(c >> 3); JS_ASSERT(c <= cs->length); cs->u.bits[byteIndex] |= 1 << (c & 0x7); } /* Add a character range, c1 to c2 (inclusive) to the RECharSet */ static void AddCharacterRangeToCharSet(RECharSet *cs, uintN c1, uintN c2) { uintN i; uintN byteIndex1 = c1 >> 3; uintN byteIndex2 = c2 >> 3; JS_ASSERT(c2 <= cs->length && c1 <= c2); c1 &= 0x7; c2 &= 0x7; if (byteIndex1 == byteIndex2) { cs->u.bits[byteIndex1] |= ((uint8)0xFF >> (7 - (c2 - c1))) << c1; } else { cs->u.bits[byteIndex1] |= 0xFF << c1; for (i = byteIndex1 + 1; i < byteIndex2; i++) cs->u.bits[i] = 0xFF; cs->u.bits[byteIndex2] |= (uint8)0xFF >> (7 - c2); } } struct CharacterRange { jschar start; jschar end; }; /* * The following characters are taken from the ECMA-262 standard, section 7.2 * and 7.3, and the Unicode 3 standard, Table 6-1. 
*/
static const CharacterRange WhiteSpaceRanges[] = {
    /* TAB, LF, VT, FF, CR */
    { 0x0009, 0x000D },
    /* SPACE */
    { 0x0020, 0x0020 },
    /* NO-BREAK SPACE */
    { 0x00A0, 0x00A0 },
    /*
     * EN QUAD, EM QUAD, EN SPACE, EM SPACE, THREE-PER-EM SPACE, FOUR-PER-EM
     * SPACE, SIX-PER-EM SPACE, FIGURE SPACE, PUNCTUATION SPACE, THIN SPACE,
     * HAIR SPACE, ZERO WIDTH SPACE
     */
    { 0x2000, 0x200B },
    /* LS, PS */
    { 0x2028, 0x2029 },
    /* NARROW NO-BREAK SPACE */
    { 0x202F, 0x202F },
    /* IDEOGRAPHIC SPACE */
    { 0x3000, 0x3000 }
};

/* ECMA-262 standard, section 15.10.2.6. */
static const CharacterRange WordRanges[] = {
    { jschar('0'), jschar('9') },
    { jschar('A'), jschar('Z') },
    { jschar('_'), jschar('_') },
    { jschar('a'), jschar('z') }
};

/* Add every range in [range, end) to charSet. */
static void
AddCharacterRanges(RECharSet *charSet,
                   const CharacterRange *range,
                   const CharacterRange *end)
{
    for (; range < end; ++range)
        AddCharacterRangeToCharSet(charSet, range->start, range->end);
}

/*
 * Add the complement of the ranges in [range, end) to charSet: the gap
 * before each range, then everything from after the last range up to
 * charSet->length. The gap arithmetic (start - 1, end + 1) presumes the
 * ranges are ascending, non-adjacent, and do not start at 0 -- TODO confirm
 * for any new table passed here.
 */
static void
AddInvertedCharacterRanges(RECharSet *charSet,
                           const CharacterRange *range,
                           const CharacterRange *end)
{
    uint16 previous = 0;
    for (; range < end; ++range) {
        AddCharacterRangeToCharSet(charSet, previous, range->start - 1);
        previous = range->end + 1;
    }
    AddCharacterRangeToCharSet(charSet, previous, charSet->length);
}

/*
 * Compile the source of the class into a RECharSet.
 *
 * Reads the text between '[' and ']' located by charSet->u.src, allocates a
 * zeroed bitmap of (charSet->length >> 3) + 1 bytes and sets one bit per
 * member character. charSet->converted is set before the allocation is
 * attempted. Returns JS_FALSE (after reporting out of memory) only if the
 * bitmap cannot be allocated.
 */
static JSBool
ProcessCharSet(JSContext *cx, JSRegExp *re, RECharSet *charSet)
{
    const jschar *src, *end;
    JSBool inRange = JS_FALSE;
    jschar rangeStart = 0;
    uintN byteLength, n;
    jschar c, thisCh;
    intN nDigits, i;

    JS_ASSERT(!charSet->converted);
    /*
     * Assert that startIndex and length points to chars inside [] inside
     * source string.
     */
    JS_ASSERT(1 <= charSet->u.src.startIndex);
    JS_ASSERT(charSet->u.src.startIndex < re->source->length());
    JS_ASSERT(charSet->u.src.length <= re->source->length()
                                       - 1 - charSet->u.src.startIndex);

    charSet->converted = JS_TRUE;
    src = re->source->chars() + charSet->u.src.startIndex;
    end = src + charSet->u.src.length;
    JS_ASSERT(src[-1] == '[');
    JS_ASSERT(end[0] == ']');

    /* u.src and u.bits share storage: src/end were captured above first. */
    byteLength = (charSet->length >> 3) + 1;
    charSet->u.bits = (uint8 *)cx->malloc(byteLength);
    if (!charSet->u.bits) {
        JS_ReportOutOfMemory(cx);
        return JS_FALSE;
    }
    memset(charSet->u.bits, 0, byteLength);

    /* Empty class: the all-zero bitmap is the result. */
    if (src == end)
        return JS_TRUE;

    /* A leading '^' only affects charSet->sense, which is already set. */
    if (*src == '^') {
        JS_ASSERT(charSet->sense == JS_FALSE);
        ++src;
    } else {
        JS_ASSERT(charSet->sense == JS_TRUE);
    }

    while (src != end) {
        /* Decode one class atom into thisCh (or add a whole escape class). */
        switch (*src) {
          case '\\':
            ++src;
            c = *src++;
            switch (c) {
              case 'b':
                thisCh = 0x8;
                break;
              case 'f':
                thisCh = 0xC;
                break;
              case 'n':
                thisCh = 0xA;
                break;
              case 'r':
                thisCh = 0xD;
                break;
              case 't':
                thisCh = 0x9;
                break;
              case 'v':
                thisCh = 0xB;
                break;
              case 'c':
                /* \cX is a control character only when X is a word char. */
                if (src < end && JS_ISWORD(*src)) {
                    thisCh = (jschar)(*src++ & 0x1F);
                } else {
                    /* Otherwise treat the backslash literally and rescan 'c'. */
                    --src;
                    thisCh = '\\';
                }
                break;
              case 'x':
                nDigits = 2;
                goto lexHex;
              case 'u':
                nDigits = 4;
            lexHex:
                n = 0;
                for (i = 0; (i < nDigits) && (src < end); i++) {
                    uintN digit;
                    c = *src++;
                    if (!isASCIIHexDigit(c, &digit)) {
                        /*
                         * Back off to accepting the original '\'
                         * as a literal
                         */
                        src -= i + 1;
                        n = '\\';
                        break;
                    }
                    n = (n << 4) | digit;
                }
                thisCh = (jschar)n;
                break;
              case '0':
              case '1':
              case '2':
              case '3':
              case '4':
              case '5':
              case '6':
              case '7':
                /*
                 *  This is a non-ECMA extension - decimal escapes (in this
                 *  case, octal!) are supposed to be an error inside class
                 *  ranges, but supported here for backwards compatibility.
                 */
                n = JS7_UNDEC(c);
                c = *src;
                if ('0' <= c && c <= '7') {
                    src++;
                    n = 8 * n + JS7_UNDEC(c);
                    c = *src;
                    if ('0' <= c && c <= '7') {
                        src++;
                        i = 8 * n + JS7_UNDEC(c);
                        /* Keep the third digit only if the value fits in 0377. */
                        if (i <= 0377)
                            n = i;
                        else
                            src--;
                    }
                }
                thisCh = (jschar)n;
                break;

              case 'd':
                AddCharacterRangeToCharSet(charSet, '0', '9');
                continue;   /* don't need range processing */
              case 'D':
                AddCharacterRangeToCharSet(charSet, 0, '0' - 1);
                AddCharacterRangeToCharSet(charSet,
                                           (jschar)('9' + 1),
                                           (jschar)charSet->length);
                continue;
              case 's':
                AddCharacterRanges(charSet, WhiteSpaceRanges,
                                   WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges));
                continue;
              case 'S':
                AddInvertedCharacterRanges(charSet, WhiteSpaceRanges,
                                           WhiteSpaceRanges + JS_ARRAY_LENGTH(WhiteSpaceRanges));
                continue;
              case 'w':
                AddCharacterRanges(charSet, WordRanges,
                                   WordRanges + JS_ARRAY_LENGTH(WordRanges));
                continue;
              case 'W':
                AddInvertedCharacterRanges(charSet, WordRanges,
                                           WordRanges + JS_ARRAY_LENGTH(WordRanges));
                continue;
              default:
                /* Any other escaped character stands for itself. */
                thisCh = c;
                break;

            }
            break;

          default:
            thisCh = *src++;
            break;

        }
        if (inRange) {
            /* thisCh closes a range opened by a previous "x-". */
            if (re->flags & JSREG_FOLD) {
                int i;

                JS_ASSERT(rangeStart <= thisCh);
                /* Add each member plus its upcase and inverse_upcase forms. */
                for (i = rangeStart; i <= thisCh; i++) {
                    jschar uch, dch;

                    AddCharacterToCharSet(charSet, jschar(i));
                    uch = jschar(upcase(i));
                    dch = inverse_upcase(jschar(i));
                    if (i != uch)
                        AddCharacterToCharSet(charSet, uch);
                    if (i != dch)
                        AddCharacterToCharSet(charSet, dch);
                }
            } else {
                AddCharacterRangeToCharSet(charSet, rangeStart, thisCh);
            }
            inRange = JS_FALSE;
        } else {
            if (re->flags & JSREG_FOLD) {
                AddCharacterToCharSet(charSet, jschar(upcase(thisCh)));
                AddCharacterToCharSet(charSet, inverse_upcase(thisCh));
            } else {
                AddCharacterToCharSet(charSet, thisCh);
            }
            /* A '-' opens a range only if at least one character follows it. */
            if (src < end - 1) {
                if (*src == '-') {
                    ++src;
                    inRange = JS_TRUE;
                    rangeStart = thisCh;
                }
            }
        }
    }
    return JS_TRUE;
}

/*
 * ProcessCharSet wrapper for use during matching: on failure it also
 * clears gData->ok.
 */
static inline JSBool
MatcherProcessCharSet(REGlobalData *gData, RECharSet *charSet) {
    JSBool rv = ProcessCharSet(gData->cx, gData->regexp, charSet);
    if (!rv) gData->ok = JS_FALSE;
    return rv;
}

/*
 * Drop one reference to 're'. When the count reaches zero, free the bitmap
 * of every converted class, the class list, and the regexp itself.
 */
void
js_DestroyRegExp(JSContext *cx, JSRegExp *re)
{
    if (JS_ATOMIC_DECREMENT(&re->nrefs) == 0) {
        if (re->classList) {
            uintN i;
            for (i = 0; i < re->classCount; i++) {
                /* Only converted classes own a bitmap in u.bits. */
                if (re->classList[i].converted)
                    cx->free(re->classList[i].u.bits);
                re->classList[i].u.bits = NULL;
            }
            cx->free(re->classList);
        }
        cx->free(re);
    }
}

/*
 * Double the capacity of gData->stateStack in the context's regexpPool.
 * On failure reports out of script quota, clears gData->ok and returns
 * JS_FALSE.
 */
static JSBool
ReallocStateStack(REGlobalData *gData)
{
    size_t limit = gData->stateStackLimit;
    size_t sz = sizeof(REProgState) * limit;

    /* Grow by the current size, i.e. double it. */
    JS_ARENA_GROW_CAST(gData->stateStack, REProgState *,
                       &gData->cx->regexpPool, sz, sz);
    if (!gData->stateStack) {
        js_ReportOutOfScriptQuota(gData->cx);
        gData->ok = JS_FALSE;
        return JS_FALSE;
    }
    gData->stateStackLimit = limit + limit;
    return JS_TRUE;
}

/*
 * Bump the state stack top, growing the stack when it reaches the limit.
 * Expands to a 'return NULL' in the enclosing function if growing fails.
 */
#define PUSH_STATE_STACK(data)                                                \
    JS_BEGIN_MACRO                                                            \
        ++(data)->stateStackTop;                                              \
        if ((data)->stateStackTop == (data)->stateStackLimit &&               \
            !ReallocStateStack((data))) {                                     \
            return NULL;                                                      \
        }                                                                     \
    JS_END_MACRO

/*
 * Apply the current op against the given input to see if it's going to match
 * or fail. Return false if we don't get a match, true if we do. If updatecp is
 * true, then update the current state's cp. Always update startpc to the next
 * op.
*/
static JS_ALWAYS_INLINE REMatchState *
SimpleMatch(REGlobalData *gData, REMatchState *x, REOp op,
            jsbytecode **startpc, JSBool updatecp)
{
    /*
     * Returns x on a match and NULL otherwise.  *startpc is written back
     * only on a match; on failure x->cp is restored to its entry value.
     */
    REMatchState *result = NULL;
    jschar matchCh;
    size_t parenIndex;
    size_t offset, length, index;
    jsbytecode *pc = *startpc;  /* pc has already been incremented past op */
    jschar *source;
    const jschar *startcp = x->cp;  /* entry position, used to undo cp changes */
    jschar ch;
    RECharSet *charSet;

#ifdef REGEXP_DEBUG
    const char *opname = reop_names[op];
    re_debug("\n%06d: %*s%s", pc - gData->regexp->program,
             gData->stateStackTop * 2, "", opname);
#endif
    switch (op) {
      case REOP_EMPTY:
        result = x;
        break;
      case REOP_BOL:
        /*
         * Away from the start of input this matches only in multiline mode
         * (statics flag or regexp flag) and right after a line terminator.
         */
        if (x->cp != gData->cpbegin) {
            if (!gData->cx->regExpStatics.multiline &&
                !(gData->regexp->flags & JSREG_MULTILINE)) {
                break;
            }
            if (!RE_IS_LINE_TERM(x->cp[-1]))
                break;
        }
        result = x;
        break;
      case REOP_EOL:
        /* Mirror image of REOP_BOL, tested against the end of input. */
        if (x->cp != gData->cpend) {
            if (!gData->cx->regExpStatics.multiline &&
                !(gData->regexp->flags & JSREG_MULTILINE)) {
                break;
            }
            if (!RE_IS_LINE_TERM(*x->cp))
                break;
        }
        result = x;
        break;
      case REOP_WBDRY:
        /* Matches when exactly one side of cp is a word character. */
        if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
            !(x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
            result = x;
        }
        break;
      case REOP_WNONBDRY:
        if ((x->cp == gData->cpbegin || !JS_ISWORD(x->cp[-1])) ^
            (x->cp != gData->cpend && JS_ISWORD(*x->cp))) {
            result = x;
        }
        break;
      /* The single-character ops below consume one char when they match. */
      case REOP_DOT:
        if (x->cp != gData->cpend && !RE_IS_LINE_TERM(*x->cp)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_DIGIT:
        if (x->cp != gData->cpend && JS7_ISDEC(*x->cp)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_NONDIGIT:
        if (x->cp != gData->cpend && !JS7_ISDEC(*x->cp)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_ALNUM:
        if (x->cp != gData->cpend && JS_ISWORD(*x->cp)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_NONALNUM:
        if (x->cp != gData->cpend && !JS_ISWORD(*x->cp)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_SPACE:
        if (x->cp != gData->cpend && JS_ISSPACE(*x->cp)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_NONSPACE:
        if (x->cp != gData->cpend && !JS_ISSPACE(*x->cp)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_BACKREF:
        pc = ReadCompactIndex(pc, &parenIndex);
        JS_ASSERT(parenIndex < gData->regexp->parenCount);
        result = BackrefMatcher(gData, x, parenIndex);
        break;
      case REOP_FLAT:
        /* Operands: offset into the regexp source string, then length. */
        pc = ReadCompactIndex(pc, &offset);
        JS_ASSERT(offset < gData->regexp->source->length());
        pc = ReadCompactIndex(pc, &length);
        JS_ASSERT(1 <= length);
        JS_ASSERT(length <= gData->regexp->source->length() - offset);
        if (length <= (size_t)(gData->cpend - x->cp)) {
            source = gData->regexp->source->chars() + offset;
            re_debug_chars(source, length);
            for (index = 0; index != length; index++) {
                /* Direct return: cp has not been modified at this point. */
                if (source[index] != x->cp[index])
                    return NULL;
            }
            x->cp += length;
            result = x;
        }
        break;
      case REOP_FLAT1:
        matchCh = *pc++;
        re_debug(" '%c' == '%c'", (char)matchCh, (char)*x->cp);
        if (x->cp != gData->cpend && *x->cp == matchCh) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_FLATi:
        pc = ReadCompactIndex(pc, &offset);
        JS_ASSERT(offset < gData->regexp->source->length());
        pc = ReadCompactIndex(pc, &length);
        JS_ASSERT(1 <= length);
        JS_ASSERT(length <= gData->regexp->source->length() - offset);
        source = gData->regexp->source->chars();
        result = FlatNIMatcher(gData, x, source + offset, length);
        break;
      case REOP_FLAT1i:
        matchCh = *pc++;
        if (x->cp != gData->cpend && upcase(*x->cp) == upcase(matchCh)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_UCFLAT1:
        /* Like REOP_FLAT1, but the operand is read with GET_ARG. */
        matchCh = GET_ARG(pc);
        re_debug(" '%c' == '%c'", (char)matchCh, (char)*x->cp);
        pc += ARG_LEN;
        if (x->cp != gData->cpend && *x->cp == matchCh) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_UCFLAT1i:
        matchCh = GET_ARG(pc);
        pc += ARG_LEN;
        if (x->cp != gData->cpend && upcase(*x->cp) == upcase(matchCh)) {
            result = x;
            result->cp++;
        }
        break;
      case REOP_CLASS:
        pc = ReadCompactIndex(pc, &index);
        JS_ASSERT(index < gData->regexp->classCount);
        if (x->cp != gData->cpend) {
            charSet = &gData->regexp->classList[index];
            JS_ASSERT(charSet->converted);
            ch = *x->cp;
            /* index is reused here as the byte offset into the bitmap. */
            index = ch >> 3;
            if (ch <= charSet->length &&
                (charSet->u.bits[index] & (1 << (ch & 0x7)))) {
                result = x;
                result->cp++;
            }
        }
        break;
      case REOP_NCLASS:
        pc = ReadCompactIndex(pc, &index);
        JS_ASSERT(index < gData->regexp->classCount);
        if (x->cp != gData->cpend) {
            charSet = &gData->regexp->classList[index];
            JS_ASSERT(charSet->converted);
            ch = *x->cp;
            index = ch >> 3;
            /* Chars beyond charSet->length are treated as not in the set. */
            if (ch > charSet->length ||
                !(charSet->u.bits[index] & (1 << (ch & 0x7)))) {
                result = x;
                result->cp++;
            }
        }
        break;
      default:
        JS_ASSERT(JS_FALSE);
    }
    if (result) {
        /* A pure test (updatecp false) must leave the position untouched. */
        if (!updatecp)
            x->cp = startcp;
        *startpc = pc;
        re_debug(" * ");
        return result;
    }
    x->cp = startcp;
    return NULL;
}

static JS_ALWAYS_INLINE REMatchState *
ExecuteREBytecode(REGlobalData *gData, REMatchState *x)
{
    REMatchState *result = NULL;
    REBackTrackData *backTrackData;
    jsbytecode *nextpc, *testpc;
    REOp nextop;
    RECapture *cap;
    REProgState *curState;
    const jschar *startcp;
    size_t parenIndex, k;
    size_t parenSoFar = 0;

    jschar matchCh1, matchCh2;
    RECharSet *charSet;

    JSBool anchor;

    jsbytecode *pc = gData->regexp->program;
    REOp op = (REOp) *pc++;

    /*
     * If the first node is a simple match, step the index into the string
     * until that match is made, or fail if it can't be found at all.
*/
    if (REOP_IS_SIMPLE(op) && !(gData->regexp->flags & JSREG_STICKY)) {
        anchor = JS_FALSE;
        while (x->cp <= gData->cpend) {
            nextpc = pc;    /* reset back to start each time */
            result = SimpleMatch(gData, x, op, &nextpc, JS_TRUE);
            if (result) {
                anchor = JS_TRUE;
                x = result;
                pc = nextpc;    /* accept skip to next opcode */
                op = (REOp) *pc++;
                JS_ASSERT(op < REOP_LIMIT);
                break;
            }
            /* Record how far the start position has been advanced. */
            gData->skipped++;
            x->cp++;
        }
        if (!anchor)
            goto bad;
    }

    /*
     * Main interpreter loop.  `continue` dispatches the next op directly;
     * `break` out of the switch falls into the match/backtrack handling at
     * the bottom of the loop.
     */
    for (;;) {
#ifdef REGEXP_DEBUG
        const char *opname = reop_names[op];
        re_debug("\n%06d: %*s%s", pc - gData->regexp->program,
                 gData->stateStackTop * 2, "", opname);
#endif
        if (REOP_IS_SIMPLE(op)) {
            result = SimpleMatch(gData, x, op, &pc, JS_TRUE);
        } else {
            curState = &gData->stateStack[gData->stateStackTop];
            switch (op) {
              case REOP_END:
                goto good;
              case REOP_ALTPREREQ2:
                nextpc = pc + GET_OFFSET(pc);   /* start of next op */
                pc += ARG_LEN;
                matchCh2 = GET_ARG(pc);
                pc += ARG_LEN;
                k = GET_ARG(pc);
                pc += ARG_LEN;

                if (x->cp != gData->cpend) {
                    if (*x->cp == matchCh2)
                        goto doAlt;

                    charSet = &gData->regexp->classList[k];
                    if (!charSet->converted && !MatcherProcessCharSet(gData, charSet))
                        goto bad;
                    matchCh1 = *x->cp;
                    /* k is reused as the byte offset into the class bitmap. */
                    k = matchCh1 >> 3;
                    if ((matchCh1 > charSet->length ||
                         !(charSet->u.bits[k] & (1 << (matchCh1 & 0x7)))) ^
                        charSet->sense) {
                        goto doAlt;
                    }
                }
                result = NULL;
                break;

              case REOP_ALTPREREQ:
                nextpc = pc + GET_OFFSET(pc);   /* start of next op */
                pc += ARG_LEN;
                matchCh1 = GET_ARG(pc);
                pc += ARG_LEN;
                matchCh2 = GET_ARG(pc);
                pc += ARG_LEN;
                if (x->cp == gData->cpend ||
                    (*x->cp != matchCh1 && *x->cp != matchCh2)) {
                    result = NULL;
                    break;
                }
                /* else fall thru... */

              case REOP_ALT:
              doAlt:
                nextpc = pc + GET_OFFSET(pc);   /* start of next alternate */
                pc += ARG_LEN;                  /* start of this alternate */
                curState->parenSoFar = parenSoFar;
                PUSH_STATE_STACK(gData);
                op = (REOp) *pc++;
                startcp = x->cp;
                if (REOP_IS_SIMPLE(op)) {
                    /* First op of this alternate failed: go to the next one. */
                    if (!SimpleMatch(gData, x, op, &pc, JS_TRUE)) {
                        op = (REOp) *nextpc++;
                        pc = nextpc;
                        continue;
                    }
                    result = x;
                    op = (REOp) *pc++;
                }
                nextop = (REOp) *nextpc++;
                if (!PushBackTrackState(gData, nextop, nextpc, x, startcp, 0, 0))
                    goto bad;
                continue;

              /*
               * Occurs at (successful) end of REOP_ALT,
               */
              case REOP_JUMP:
                /*
                 * If we have not gotten a result here, it is because of an
                 * empty match.  Do the same thing REOP_EMPTY would do.
                 */
                if (!result)
                    result = x;

                --gData->stateStackTop;
                pc += GET_OFFSET(pc);
                op = (REOp) *pc++;
                continue;

              /*
               * Occurs at last (successful) end of REOP_ALT,
               */
              case REOP_ENDALT:
                /*
                 * If we have not gotten a result here, it is because of an
                 * empty match.  Do the same thing REOP_EMPTY would do.
                 */
                if (!result)
                    result = x;

                --gData->stateStackTop;
                op = (REOp) *pc++;
                continue;

              case REOP_LPAREN:
                pc = ReadCompactIndex(pc, &parenIndex);
                re_debug("[ %lu ]", (unsigned long) parenIndex);
                JS_ASSERT(parenIndex < gData->regexp->parenCount);
                if (parenIndex + 1 > parenSoFar)
                    parenSoFar = parenIndex + 1;
                x->parens[parenIndex].index = x->cp - gData->cpbegin;
                x->parens[parenIndex].length = 0;
                op = (REOp) *pc++;
                continue;

              case REOP_RPAREN:
              {
                ptrdiff_t delta;

                pc = ReadCompactIndex(pc, &parenIndex);
                JS_ASSERT(parenIndex < gData->regexp->parenCount);
                cap = &x->parens[parenIndex];
                delta = x->cp - (gData->cpbegin + cap->index);
                /* A negative span is clamped to an empty capture. */
                cap->length = (delta < 0) ? 0 : (size_t) delta;
                op = (REOp) *pc++;

                if (!result)
                    result = x;
                continue;
              }
              case REOP_ASSERT:
                nextpc = pc + GET_OFFSET(pc);  /* start of term after ASSERT */
                pc += ARG_LEN;                 /* start of ASSERT child */
                op = (REOp) *pc++;
                testpc = pc;
                /* Quick reject: test a simple child without consuming input. */
                if (REOP_IS_SIMPLE(op) &&
                    !SimpleMatch(gData, x, op, &testpc, JS_FALSE)) {
                    result = NULL;
                    break;
                }
                /* Save backtrack-stack position so ASSERTTEST can restore it. */
                curState->u.assertion.top =
                    (char *)gData->backTrackSP - (char *)gData->backTrackStack;
                curState->u.assertion.sz = gData->cursz;
                curState->index = x->cp - gData->cpbegin;
                curState->parenSoFar = parenSoFar;
                PUSH_STATE_STACK(gData);
                if (!PushBackTrackState(gData, REOP_ASSERTTEST,
                                        nextpc, x, x->cp, 0, 0)) {
                    goto bad;
                }
                continue;

              case REOP_ASSERT_NOT:
                nextpc = pc + GET_OFFSET(pc);
                pc += ARG_LEN;
                op = (REOp) *pc++;
                testpc = pc;
                if (REOP_IS_SIMPLE(op) /* Note - fail to fail! */ &&
                    SimpleMatch(gData, x, op, &testpc, JS_FALSE) &&
                    *testpc == REOP_ASSERTNOTTEST) {
                    result = NULL;
                    break;
                }
                curState->u.assertion.top
                    = (char *)gData->backTrackSP -
                      (char *)gData->backTrackStack;
                curState->u.assertion.sz = gData->cursz;
                curState->index = x->cp - gData->cpbegin;
                curState->parenSoFar = parenSoFar;
                PUSH_STATE_STACK(gData);
                if (!PushBackTrackState(gData, REOP_ASSERTNOTTEST,
                                        nextpc, x, x->cp, 0, 0)) {
                    goto bad;
                }
                continue;

              case REOP_ASSERTTEST:
                /* Pop the assertion state; restore cp and the backtrack stack. */
                --gData->stateStackTop;
                --curState;
                x->cp = gData->cpbegin + curState->index;
                gData->backTrackSP =
                    (REBackTrackData *) ((char *)gData->backTrackStack +
                                         curState->u.assertion.top);
                gData->cursz = curState->u.assertion.sz;
                if (result)
                    result = x;
                break;

              case REOP_ASSERTNOTTEST:
                --gData->stateStackTop;
                --curState;
                x->cp = gData->cpbegin + curState->index;
                gData->backTrackSP =
                    (REBackTrackData *) ((char *)gData->backTrackStack +
                                         curState->u.assertion.top);
                gData->cursz = curState->u.assertion.sz;
                /* Negated assertion: succeed exactly when the child failed. */
                result = (!result) ? x : NULL;
                break;
              case REOP_STAR:
                curState->u.quantifier.min = 0;
                curState->u.quantifier.max = (uintN)-1;
                goto quantcommon;
              case REOP_PLUS:
                curState->u.quantifier.min = 1;
                curState->u.quantifier.max = (uintN)-1;
                goto quantcommon;
              case REOP_OPT:
                curState->u.quantifier.min = 0;
                curState->u.quantifier.max = 1;
                goto quantcommon;
              case REOP_QUANT:
                pc = ReadCompactIndex(pc, &k);
                curState->u.quantifier.min = k;
                pc = ReadCompactIndex(pc, &k);
                /* max is k - 1 to use one byte for (uintN)-1 sentinel. */
                curState->u.quantifier.max = k - 1;
                JS_ASSERT(curState->u.quantifier.min <= curState->u.quantifier.max);
              quantcommon:
                if (curState->u.quantifier.max == 0) {
                    pc = pc + GET_OFFSET(pc);
                    op = (REOp) *pc++;
                    result = x;
                    continue;
                }
                /* Step over <next> */
                nextpc = pc + ARG_LEN;
                op = (REOp) *nextpc++;
                startcp = x->cp;
                if (REOP_IS_SIMPLE(op)) {
                    if (!SimpleMatch(gData, x, op, &nextpc, JS_TRUE)) {
                        if (curState->u.quantifier.min == 0)
                            result = x;
                        else
                            result = NULL;
                        pc = pc + GET_OFFSET(pc);
                        break;
                    }
                    op = (REOp) *nextpc++;
                    result = x;
                }
                curState->index = startcp - gData->cpbegin;
                curState->continue_op = REOP_REPEAT;
                curState->continue_pc = pc;
                curState->parenSoFar = parenSoFar;
                PUSH_STATE_STACK(gData);
                if (curState->u.quantifier.min == 0 &&
                    !PushBackTrackState(gData, REOP_REPEAT, pc, x, startcp,
                                        0, 0)) {
                    goto bad;
                }
                pc = nextpc;
                continue;

              case REOP_ENDCHILD: /* marks the end of a quantifier child */
                pc = curState[-1].continue_pc;
                op = (REOp) curState[-1].continue_op;

                if (!result)
                    result = x;
                continue;

              case REOP_REPEAT:
                --curState;
                do {
                    --gData->stateStackTop;
                    if (!result) {
                        /* Failed, see if we have enough children. */
                        if (curState->u.quantifier.min == 0)
                            goto repeatDone;
                        goto break_switch;
                    }
                    if (curState->u.quantifier.min == 0 &&
                        x->cp == gData->cpbegin + curState->index) {
                        /* matched an empty string, that'll get us nowhere */
                        result = NULL;
                        goto break_switch;
                    }
                    if (curState->u.quantifier.min != 0)
                        curState->u.quantifier.min--;
                    if (curState->u.quantifier.max != (uintN) -1)
                        curState->u.quantifier.max--;
                    if (curState->u.quantifier.max == 0)
                        goto repeatDone;
                    nextpc = pc + ARG_LEN;
                    nextop = (REOp) *nextpc;
                    startcp = x->cp;
                    if (REOP_IS_SIMPLE(nextop)) {
                        nextpc++;
                        if (!SimpleMatch(gData, x, nextop, &nextpc, JS_TRUE)) {
                            if (curState->u.quantifier.min == 0)
                                goto repeatDone;
                            result = NULL;
                            goto break_switch;
                        }
                        result = x;
                    }
                    curState->index = startcp - gData->cpbegin;
                    PUSH_STATE_STACK(gData);
                    if (curState->u.quantifier.min == 0 &&
                        !PushBackTrackState(gData, REOP_REPEAT,
                                            pc, x, startcp,
                                            curState->parenSoFar,
                                            parenSoFar -
                                            curState->parenSoFar)) {
                        goto bad;
                    }
                } while (*nextpc == REOP_ENDCHILD);
                pc = nextpc;
                op = (REOp) *pc++;
                parenSoFar = curState->parenSoFar;
                continue;

              repeatDone:
                result = x;
                pc += GET_OFFSET(pc);
                goto break_switch;

              case REOP_MINIMALSTAR:
                curState->u.quantifier.min = 0;
                curState->u.quantifier.max = (uintN)-1;
                goto minimalquantcommon;
              case REOP_MINIMALPLUS:
                curState->u.quantifier.min = 1;
                curState->u.quantifier.max = (uintN)-1;
                goto minimalquantcommon;
              case REOP_MINIMALOPT:
                curState->u.quantifier.min = 0;
                curState->u.quantifier.max = 1;
                goto minimalquantcommon;
              case REOP_MINIMALQUANT:
                pc = ReadCompactIndex(pc, &k);
                curState->u.quantifier.min = k;
                pc = ReadCompactIndex(pc, &k);
                /* See REOP_QUANT comments about k - 1. */
                curState->u.quantifier.max = k - 1;
                JS_ASSERT(curState->u.quantifier.min
                          <= curState->u.quantifier.max);
              minimalquantcommon:
                curState->index = x->cp - gData->cpbegin;
                curState->parenSoFar = parenSoFar;
                PUSH_STATE_STACK(gData);
                if (curState->u.quantifier.min != 0) {
                    curState->continue_op = REOP_MINIMALREPEAT;
                    curState->continue_pc = pc;
                    /* step over <next> */
                    pc += OFFSET_LEN;
                    op = (REOp) *pc++;
                } else {
                    /* min == 0: try the continuation first, child on backtrack. */
                    if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
                                            pc, x, x->cp, 0, 0)) {
                        goto bad;
                    }
                    --gData->stateStackTop;
                    pc = pc + GET_OFFSET(pc);
                    op = (REOp) *pc++;
                }
                continue;

              case REOP_MINIMALREPEAT:
                --gData->stateStackTop;
                --curState;

                re_debug("{%d,%d}", curState->u.quantifier.min,
                         curState->u.quantifier.max);
/* Set up one more iteration of the child: record state, reset captures. */
#define PREPARE_REPEAT() \
    JS_BEGIN_MACRO \
        curState->index = x->cp - gData->cpbegin; \
        curState->continue_op = REOP_MINIMALREPEAT; \
        curState->continue_pc = pc; \
        pc += ARG_LEN; \
        for (k = curState->parenSoFar; k < parenSoFar; k++) \
            x->parens[k].index = -1; \
        PUSH_STATE_STACK(gData); \
        op = (REOp) *pc++; \
        JS_ASSERT(op < REOP_LIMIT); \
    JS_END_MACRO

                if (!result) {
                    re_debug(" - ");
                    /*
                     * Non-greedy failure - try to consume another child.
                     */
                    if (curState->u.quantifier.max == (uintN) -1 ||
                        curState->u.quantifier.max > 0) {
                        PREPARE_REPEAT();
                        continue;
                    }
                    /* Don't need to adjust pc since we're going to pop. */
                    break;
                }
                if (curState->u.quantifier.min == 0 &&
                    x->cp == gData->cpbegin + curState->index) {
                    /* Matched an empty string, that'll get us nowhere. */
                    result = NULL;
                    break;
                }
                if (curState->u.quantifier.min != 0)
                    curState->u.quantifier.min--;
                if (curState->u.quantifier.max != (uintN) -1)
                    curState->u.quantifier.max--;
                if (curState->u.quantifier.min != 0) {
                    PREPARE_REPEAT();
                    continue;
                }
                curState->index = x->cp - gData->cpbegin;
                curState->parenSoFar = parenSoFar;
                PUSH_STATE_STACK(gData);
                if (!PushBackTrackState(gData, REOP_MINIMALREPEAT,
                                        pc, x, x->cp,
                                        curState->parenSoFar,
                                        parenSoFar - curState->parenSoFar)) {
                    goto bad;
                }
                --gData->stateStackTop;
                pc = pc + GET_OFFSET(pc);
                op = (REOp) *pc++;
                JS_ASSERT(op < REOP_LIMIT);
                continue;
              default:
                JS_ASSERT(JS_FALSE);
                result = NULL;
            }
          break_switch:;
        }

        /*
         *  If the match failed and there's a backtrack option, take it.
         *  Otherwise this is a complete and utter failure.
         */
        if (!result) {
            if (gData->cursz == 0)
                return NULL;
            if (!JS_CHECK_OPERATION_LIMIT(gData->cx)) {
                gData->ok = JS_FALSE;
                return NULL;
            }

            /* Potentially detect explosive regex here. */
            gData->backTrackCount++;
            if (gData->backTrackLimit &&
                gData->backTrackCount >= gData->backTrackLimit) {
                JS_ReportErrorNumber(gData->cx, js_GetErrorMessage, NULL,
                                     JSMSG_REGEXP_TOO_COMPLEX);
                gData->ok = JS_FALSE;
                return NULL;
            }

            /* Pop the top backtrack record and resume from what it saved. */
            backTrackData = gData->backTrackSP;
            gData->cursz = backTrackData->sz;
            gData->backTrackSP =
                (REBackTrackData *) ((char *)backTrackData - backTrackData->sz);
            x->cp = backTrackData->cp;
            pc = backTrackData->backtrack_pc;
            op = (REOp) backTrackData->backtrack_op;
            JS_ASSERT(op < REOP_LIMIT);
            gData->stateStackTop = backTrackData->saveStateStackTop;
            JS_ASSERT(gData->stateStackTop);

            /* The saved state stack is stored right after the record header. */
            memcpy(gData->stateStack, backTrackData + 1,
                   sizeof(REProgState) * backTrackData->saveStateStackTop);
            curState = &gData->stateStack[gData->stateStackTop - 1];

            if (backTrackData->parenCount) {
                /* Saved captures follow the saved state stack. */
                memcpy(&x->parens[backTrackData->parenIndex],
                       (char *)(backTrackData + 1) +
                       sizeof(REProgState) * backTrackData->saveStateStackTop,
                       sizeof(RECapture) * backTrackData->parenCount);
                parenSoFar = backTrackData->parenIndex + backTrackData->parenCount;
            } else {
                for (k = curState->parenSoFar; k < parenSoFar; k++)
                    x->parens[k].index = -1;
                parenSoFar = curState->parenSoFar;
            }

            re_debug("\tBT_Pop: %ld,%ld",
                     (unsigned long) backTrackData->parenIndex,
                     (unsigned long) backTrackData->parenCount);
            continue;
        }
        x = result;

        /*
         *  Continue with the expression.
         */
        op = (REOp)*pc++;
        JS_ASSERT(op < REOP_LIMIT);
    }

bad:
    re_debug("\n");
    return NULL;

good:
    re_debug("\n");
    return x;
}

static REMatchState *
MatchRegExp(REGlobalData *gData, REMatchState *x)
{
    /*
     * Try the regexp at successive start positions from x->cp.  Returns the
     * match state on success and NULL otherwise; callers must check
     * gData->ok to tell a mismatch from an error.
     */
    const jschar *cpOrig = x->cp;

#ifdef JS_TRACER
    NativeRegExp native;

    /* Run with native regexp if possible. */
    if (TRACING_ENABLED(gData->cx) &&
        !(gData->regexp->flags & JSREG_NOCOMPILE) &&
        (native = GetNativeRegExp(gData->cx, gData->regexp))) {

        /*
         * For efficient native execution, store offset as a direct pointer into
         * the buffer and convert back after execution finishes.
         */
        gData->skipped = (ptrdiff_t)cpOrig;

#ifdef JS_JIT_SPEW
        debug_only_stmt({
            VOUCH_DOES_NOT_REQUIRE_STACK();
            JSStackFrame *caller = (JS_ON_TRACE(gData->cx))
                                   ? NULL
                                   : js_GetScriptedCaller(gData->cx, NULL);
            debug_only_printf(LC_TMRegexp,
                              "entering REGEXP trace at %s:%u@%u, code: %p\n",
                              caller ? caller->script->filename : "<unknown>",
                              caller ? js_FramePCToLineNumber(gData->cx, caller) : 0,
                              caller ? FramePCOffset(gData->cx, caller) : 0,
                              JS_FUNC_TO_DATA_PTR(void *, native));
        })
#endif

        void *result;
#if defined(JS_NO_FASTCALL) && defined(NANOJIT_IA32)
        /*
         * Although a NativeRegExp takes one argument and SIMULATE_FASTCALL is
         * passing two, the second goes into 'edx' and can safely be ignored.
         */
        SIMULATE_FASTCALL(result, gData, gData->cpend, native);
#else
        result = native(gData, gData->cpend);
#endif
        debug_only_print0(LC_TMRegexp, "leaving REGEXP trace\n");
        if (!result)
            return NULL;

        /* Restore REGlobalData::skipped and fill REMatchState.
*/ x->cp = (const jschar *)gData->stateStack; gData->skipped = (const jschar *)gData->skipped - cpOrig; return x; } #endif /* * Have to include the position beyond the last character * in order to detect end-of-input/line condition. */ for (const jschar *p = cpOrig; p <= gData->cpend; p++) { gData->skipped = p - cpOrig; x->cp = p; for (uintN j = 0; j < gData->regexp->parenCount; j++) x->parens[j].index = -1; REMatchState *result = ExecuteREBytecode(gData, x); if (!gData->ok || result || (gData->regexp->flags & JSREG_STICKY)) return result; gData->backTrackSP = gData->backTrackStack; gData->cursz = 0; gData->stateStackTop = 0; p = cpOrig + gData->skipped; } return NULL; } #define MIN_BACKTRACK_LIMIT 400000 static REMatchState * InitMatch(JSContext *cx, REGlobalData *gData, JSRegExp *re, size_t length) { REMatchState *result; uintN i; gData->backTrackStackSize = INITIAL_BACKTRACK; JS_ARENA_ALLOCATE_CAST(gData->backTrackStack, REBackTrackData *, &cx->regexpPool, INITIAL_BACKTRACK); if (!gData->backTrackStack) goto bad; gData->backTrackSP = gData->backTrackStack; gData->cursz = 0; gData->backTrackCount = 0; gData->backTrackLimit = 0; if (JS_GetOptions(cx) & JSOPTION_RELIMIT) { gData->backTrackLimit = length * length * length; /* O(n^3) */ if (gData->backTrackLimit < MIN_BACKTRACK_LIMIT) gData->backTrackLimit = MIN_BACKTRACK_LIMIT; } gData->stateStackLimit = INITIAL_STATESTACK; JS_ARENA_ALLOCATE_CAST(gData->stateStack, REProgState *, &cx->regexpPool, sizeof(REProgState) * INITIAL_STATESTACK); if (!gData->stateStack) goto bad; gData->stateStackTop = 0; gData->cx = cx; gData->regexp = re; gData->ok = JS_TRUE; JS_ARENA_ALLOCATE_CAST(result, REMatchState *, &cx->regexpPool, offsetof(REMatchState, parens) + re->parenCount * sizeof(RECapture)); if (!result) goto bad; for (i = 0; i < re->classCount; i++) { if (!re->classList[i].converted && !MatcherProcessCharSet(gData, &re->classList[i])) { return NULL; } } return result; bad: js_ReportOutOfScriptQuota(cx); gData->ok = 
JS_FALSE; return NULL; } JSBool js_ExecuteRegExp(JSContext *cx, JSRegExp *re, JSString *str, size_t *indexp, JSBool test, jsval *rval) { REGlobalData gData; REMatchState *x, *result; const jschar *cp, *ep; size_t i, length, start; JSBool ok; JSRegExpStatics *res; ptrdiff_t matchlen; uintN num; JSString *parstr, *matchstr; JSObject *obj; RECapture *parsub = NULL; void *mark; int64 *timestamp; /* * It's safe to load from cp because JSStrings have a zero at the end, * and we never let cp get beyond cpend. */ start = *indexp; str->getCharsAndLength(cp, length); if (start > length) start = length; gData.cpbegin = cp; gData.cpend = cp + length; cp += start; gData.start = start; gData.skipped = 0; if (!cx->regexpPool.first.next) { /* * The first arena in the regexpPool must have a timestamp at its base. */ JS_ARENA_ALLOCATE_CAST(timestamp, int64 *, &cx->regexpPool, sizeof *timestamp); if (!timestamp) return JS_FALSE; *timestamp = JS_Now(); } mark = JS_ARENA_MARK(&cx->regexpPool); x = InitMatch(cx, &gData, re, length); if (!x) { ok = JS_FALSE; goto out; } x->cp = cp; /* * Call the recursive matcher to do the real work. Return null on mismatch * whether testing or not. On match, return an extended Array object. */ result = MatchRegExp(&gData, x); ok = gData.ok; if (!ok) goto out; if (!result) { *rval = JSVAL_NULL; goto out; } cp = result->cp; i = cp - gData.cpbegin; *indexp = i; matchlen = i - (start + gData.skipped); JS_ASSERT(matchlen >= 0); ep = cp; cp -= matchlen; if (test) { /* * Testing for a match and updating cx->regExpStatics: don't allocate * an array object, do return true. */ *rval = JSVAL_TRUE; /* Avoid warning. (gcc doesn't detect that obj is needed iff !test); */ obj = NULL; } else { /* * The array returned on match has element 0 bound to the matched * string, elements 1 through state.parenCount bound to the paren * matches, an index property telling the length of the left context, * and an input property referring to the input string. 
*/ obj = js_NewSlowArrayObject(cx); if (!obj) { ok = JS_FALSE; goto out; } *rval = OBJECT_TO_JSVAL(obj); #define DEFVAL(val, id) { \ ok = js_DefineProperty(cx, obj, id, val, \ JS_PropertyStub, JS_PropertyStub, \ JSPROP_ENUMERATE); \ if (!ok) \ goto out; \ } matchstr = js_NewDependentString(cx, str, cp - str->chars(), matchlen); if (!matchstr) { ok = JS_FALSE; goto out; } DEFVAL(STRING_TO_JSVAL(matchstr), INT_TO_JSID(0)); } res = &cx->regExpStatics; res->input = str; if (!res->parens.resize(re->parenCount)) { ok = JS_FALSE; goto out; } if (re->parenCount == 0) { res->lastParen = js_EmptySubString; } else { for (num = 0; num < re->parenCount; num++) { JSSubString *sub = &res->parens[num]; parsub = &result->parens[num]; if (parsub->index == -1) { sub->chars = NULL; sub->length = 0; } else { sub->chars = gData.cpbegin + parsub->index; sub->length = parsub->length; } if (test) continue; if (parsub->index == -1) { ok = js_DefineProperty(cx, obj, INT_TO_JSID(num + 1), JSVAL_VOID, NULL, NULL, JSPROP_ENUMERATE); } else { parstr = js_NewDependentString(cx, str, gData.cpbegin + parsub->index - str->chars(), parsub->length); if (!parstr) { ok = JS_FALSE; goto out; } ok = js_DefineProperty(cx, obj, INT_TO_JSID(num + 1), STRING_TO_JSVAL(parstr), NULL, NULL, JSPROP_ENUMERATE); } if (!ok) goto out; } if (parsub->index == -1) { res->lastParen = js_EmptySubString; } else { res->lastParen.chars = gData.cpbegin + parsub->index; res->lastParen.length = parsub->length; } } if (!test) { /* * Define the index and input properties last for better for/in loop * order (so they come after the elements). 
*/ DEFVAL(INT_TO_JSVAL(start + gData.skipped), ATOM_TO_JSID(cx->runtime->atomState.indexAtom)); DEFVAL(STRING_TO_JSVAL(str), ATOM_TO_JSID(cx->runtime->atomState.inputAtom)); } #undef DEFVAL res->lastMatch.chars = cp; res->lastMatch.length = matchlen; /* * For JS1.3 and ECMAv2, emulate Perl5 exactly: * * js1.3 "hi", "hi there" "hihitherehi therebye" */ res->leftContext.chars = str->chars(); res->leftContext.length = start + gData.skipped; res->rightContext.chars = ep; res->rightContext.length = gData.cpend - ep; out: JS_ARENA_RELEASE(&cx->regexpPool, mark); return ok; } /************************************************************************/ static JSBool SetRegExpLastIndex(JSContext *cx, JSObject *obj, jsdouble lastIndex) { JS_ASSERT(obj->isRegExp()); return JS_NewNumberValue(cx, lastIndex, obj->addressOfRegExpLastIndex()); } #define DEFINE_GETTER(name, code) \ static JSBool \ name(JSContext *cx, JSObject *obj, jsval id, jsval *vp) \ { \ while (obj->getClass() != &js_RegExpClass) { \ obj = obj->getProto(); \ if (!obj) \ return true; \ } \ JS_LOCK_OBJ(cx, obj); \ JSRegExp *re = (JSRegExp *) obj->getPrivate(); \ code; \ JS_UNLOCK_OBJ(cx, obj); \ return true; \ } /* lastIndex is stored in the object, re = re silences the compiler warning. 
*/
DEFINE_GETTER(lastIndex_getter,  re = re; *vp = obj->getRegExpLastIndex())
DEFINE_GETTER(source_getter,     *vp = STRING_TO_JSVAL(re->source))
DEFINE_GETTER(global_getter,     *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_GLOB) != 0))
DEFINE_GETTER(ignoreCase_getter, *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_FOLD) != 0))
DEFINE_GETTER(multiline_getter,  *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_MULTILINE) != 0))
DEFINE_GETTER(sticky_getter,     *vp = BOOLEAN_TO_JSVAL((re->flags & JSREG_STICKY) != 0))

/*
 * Setter for lastIndex.  Finds the first js_RegExpClass object on obj's
 * prototype chain (a no-op returning true if there is none), converts *vp
 * to a number, passes it through js_DoubleToInteger and stores the result.
 */
static JSBool
lastIndex_setter(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
{
    while (obj->getClass() != &js_RegExpClass) {
        obj = obj->getProto();
        if (!obj)
            return true;
    }
    jsdouble lastIndex;
    if (!JS_ValueToNumber(cx, *vp, &lastIndex))
        return false;
    lastIndex = js_DoubleToInteger(lastIndex);
    return SetRegExpLastIndex(cx, obj, lastIndex);
}

#define REGEXP_PROP_ATTRS     (JSPROP_PERMANENT | JSPROP_SHARED)
#define RO_REGEXP_PROP_ATTRS  (REGEXP_PROP_ATTRS | JSPROP_READONLY)

/* Per-instance RegExp properties; only lastIndex has a setter. */
static JSPropertySpec regexp_props[] = {
    {"source",     0, RO_REGEXP_PROP_ATTRS, source_getter,     NULL},
    {"global",     0, RO_REGEXP_PROP_ATTRS, global_getter,     NULL},
    {"ignoreCase", 0, RO_REGEXP_PROP_ATTRS, ignoreCase_getter, NULL},
    {"lastIndex",  0, REGEXP_PROP_ATTRS,    lastIndex_getter,  lastIndex_setter},
    {"multiline",  0, RO_REGEXP_PROP_ATTRS, multiline_getter,  NULL},
    {"sticky",     0, RO_REGEXP_PROP_ATTRS, sticky_getter,     NULL},
    {0,0,0,0,0}
};

/*
 * RegExp class static properties and their Perl counterparts:
 *
 *  RegExp.input                $_
 *  RegExp.multiline            $*
 *  RegExp.lastMatch            $&
 *  RegExp.lastParen            $+
 *  RegExp.leftContext          $`
 *  RegExp.rightContext         $'
 */

/* Initialize cx's regexp arena pool and clear its RegExp statics. */
void
js_InitRegExpStatics(JSContext *cx)
{
    /*
     * To avoid multiple allocations in InitMatch(), the arena size parameter
     * should be at least as big as:
     *   INITIAL_BACKTRACK
     *   + (sizeof(REProgState) * INITIAL_STATESTACK)
     *   + (offsetof(REMatchState, parens) + avgParenSize * sizeof(RECapture))
     */
    JS_InitArenaPool(&cx->regexpPool, "regexp",
                     12 * 1024 - 40,  /* FIXME: bug 421435 */
                     sizeof(void *), &cx->scriptStackQuota);

    JS_ClearRegExpStatics(cx);
}

/*
 * Copy cx's RegExp statics into *statics, hand the saved input string (if
 * any) to tvr, then clear the statics on cx.
 */
JS_FRIEND_API(void)
js_SaveAndClearRegExpStatics(JSContext *cx, JSRegExpStatics *statics,
                             AutoValueRooter *tvr)
{
    statics->copy(cx->regExpStatics);
    if (statics->input)
        tvr->setString(statics->input);
    JS_ClearRegExpStatics(cx);
}

/* Copy *statics back into cx's RegExp statics; tvr is not used here. */
JS_FRIEND_API(void)
js_RestoreRegExpStatics(JSContext *cx, JSRegExpStatics *statics,
                        AutoValueRooter *tvr)
{
    /* Clear/free any new JSRegExpStatics data before clobbering. */
    cx->regExpStatics.copy(*statics);
}

/* Trace the input string held by acx's RegExp statics, if there is one. */
void
js_TraceRegExpStatics(JSTracer *trc, JSContext *acx)
{
    JSRegExpStatics *res = &acx->regExpStatics;

    if (res->input)
        JS_CALL_STRING_TRACER(trc, res->input, "res->input");
}

/* Clear cx's RegExp statics and finish its regexp arena pool. */
void
js_FreeRegExpStatics(JSContext *cx)
{
    JS_ClearRegExpStatics(cx);
    JS_FinishArenaPool(&cx->regexpPool);
}

/*
 * Define a getter for a RegExp static property.  `code` runs with `res`
 * bound to cx->regExpStatics and must itself return the JSBool result.
 */
#define DEFINE_STATIC_GETTER(name, code) \
    static JSBool \
    name(JSContext *cx, JSObject *obj, jsval id, jsval *vp) \
    { \
        JSRegExpStatics *res = &cx->regExpStatics; \
        code; \
    }

/* Store a newly allocated copy of `sub` in *vp; false if allocation fails. */
static bool
MakeString(JSContext *cx, JSSubString *sub, jsval *vp)
{
    JSString *str = js_NewStringCopyN(cx, sub->chars, sub->length);
    if (!str)
        return false;
    *vp = STRING_TO_JSVAL(str);
    return true;
}

DEFINE_STATIC_GETTER(static_input_getter,
                     *vp = res->input ? STRING_TO_JSVAL(res->input)
                                      : JS_GetEmptyStringValue(cx);
                     return true)
DEFINE_STATIC_GETTER(static_multiline_getter,
                     *vp = BOOLEAN_TO_JSVAL(res->multiline);
                     return true)
DEFINE_STATIC_GETTER(static_lastMatch_getter,
                     return MakeString(cx, &res->lastMatch, vp))
DEFINE_STATIC_GETTER(static_lastParen_getter,
                     return MakeString(cx, &res->lastParen, vp))
DEFINE_STATIC_GETTER(static_leftContext_getter,
                     return MakeString(cx, &res->leftContext, vp))
DEFINE_STATIC_GETTER(static_rightContext_getter,
                     return MakeString(cx, &res->rightContext, vp))

/*
 * Store the n-th (zero-based) saved paren match in *vp, or the empty
 * substring when n is not below the number of saved parens.
 */
static bool
Paren(JSContext *cx, JSRegExpStatics *res, size_t n, jsval *vp)
{
    return MakeString(cx, n < res->parens.length() ? &res->parens[n] : &js_EmptySubString, vp);
}

/* $1..$9 map to zero-based paren indices 0..8. */
DEFINE_STATIC_GETTER(static_paren1_getter, return Paren(cx, res, 0, vp))
DEFINE_STATIC_GETTER(static_paren2_getter, return Paren(cx, res, 1, vp))
DEFINE_STATIC_GETTER(static_paren3_getter, return Paren(cx, res, 2, vp))
DEFINE_STATIC_GETTER(static_paren4_getter, return Paren(cx, res, 3, vp))
DEFINE_STATIC_GETTER(static_paren5_getter, return Paren(cx, res, 4, vp))
DEFINE_STATIC_GETTER(static_paren6_getter, return Paren(cx, res, 5, vp))
DEFINE_STATIC_GETTER(static_paren7_getter, return Paren(cx, res, 6, vp))
DEFINE_STATIC_GETTER(static_paren8_getter, return Paren(cx, res, 7, vp))
DEFINE_STATIC_GETTER(static_paren9_getter, return Paren(cx, res, 8, vp))

/*
 * Define a setter for a RegExp static property.  `code` runs with `res`
 * bound to cx->regExpStatics; the generated function returns true unless
 * `code` returns first.
 */
#define DEFINE_STATIC_SETTER(name, code) \
    static JSBool \
    name(JSContext *cx, JSObject *obj, jsval id, jsval *vp) \
    { \
        JSRegExpStatics *res = &cx->regExpStatics; \
        code; \
        return true; \
    }

DEFINE_STATIC_SETTER(static_input_setter,
                     if (!JSVAL_IS_STRING(*vp) && !JS_ConvertValue(cx, *vp, JSTYPE_STRING, vp))
                         return false;
                     res->input = JSVAL_TO_STRING(*vp))
DEFINE_STATIC_SETTER(static_multiline_setter,
                     if (!JSVAL_IS_BOOLEAN(*vp) && !JS_ConvertValue(cx, *vp, JSTYPE_BOOLEAN, vp))
                         return false;
                     res->multiline = JSVAL_TO_BOOLEAN(*vp))

#define REGEXP_STATIC_PROP_ATTRS    (REGEXP_PROP_ATTRS | JSPROP_ENUMERATE)
#define RO_REGEXP_STATIC_PROP_ATTRS (REGEXP_STATIC_PROP_ATTRS | JSPROP_READONLY)

/* Static RegExp properties; only input and multiline have setters. */
static JSPropertySpec regexp_static_props[] = {
    {"input",        0, REGEXP_STATIC_PROP_ATTRS,    static_input_getter,        static_input_setter},
    {"multiline",    0, REGEXP_STATIC_PROP_ATTRS,    static_multiline_getter,    static_multiline_setter},
    {"lastMatch",    0, RO_REGEXP_STATIC_PROP_ATTRS, static_lastMatch_getter,    NULL},
    {"lastParen",    0, RO_REGEXP_STATIC_PROP_ATTRS, static_lastParen_getter,    NULL},
    {"leftContext",  0, RO_REGEXP_STATIC_PROP_ATTRS, static_leftContext_getter,  NULL},
    {"rightContext", 0, RO_REGEXP_STATIC_PROP_ATTRS, static_rightContext_getter, NULL},
    {"$1",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren1_getter,       NULL},
    {"$2",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren2_getter,       NULL},
    {"$3",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren3_getter,       NULL},
    {"$4",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren4_getter,       NULL},
    {"$5",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren5_getter,       NULL},
    {"$6",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren6_getter,       NULL},
    {"$7",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren7_getter,       NULL},
    {"$8",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren8_getter,       NULL},
    {"$9",           0, RO_REGEXP_STATIC_PROP_ATTRS, static_paren9_getter,       NULL},
    {0,0,0,0,0}
};

/* Class finalizer: destroy the object's private JSRegExp, if it has one. */
static void
regexp_finalize(JSContext *cx, JSObject *obj)
{
    JSRegExp *re = (JSRegExp *) obj->getPrivate();
    if (!re)
        return;
    js_DestroyRegExp(cx, re);
}

/* Forward static prototype. */
static JSBool
regexp_exec_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
                JSBool test, jsval *rval);

/* Class call hook: runs exec (test == JS_FALSE) on the object in argv[-2]. */
static JSBool
regexp_call(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval)
{
    return regexp_exec_sub(cx, JSVAL_TO_OBJECT(argv[-2]), argc, argv, JS_FALSE, rval);
}

#if JS_HAS_XDR

#include "jsxdrapi.h"

/*
 * XDR hook for RegExp objects.  The serialized form is the source string
 * followed by the flags as a uint32.  When decoding, a new object with no
 * parent and no proto is created and stored in *objp.
 */
JSBool
js_XDRRegExpObject(JSXDRState *xdr, JSObject **objp)
{
    JSRegExp *re;
    JSString *source;
    uint32 flagsword;
    JSObject *obj;

    if (xdr->mode == JSXDR_ENCODE) {
        re = (JSRegExp *) (*objp)->getPrivate();
        if (!re)
            return JS_FALSE;
        source = re->source;
        flagsword = (uint32)re->flags;
    }
    /* NOTE(review): when decoding, source and flagsword are presumably
     * written (not read) by these calls -- confirm against the XDR API. */
    if (!JS_XDRString(xdr, &source) ||
        !JS_XDRUint32(xdr, &flagsword)) {
        return JS_FALSE;
    }
    if (xdr->mode == JSXDR_DECODE) {
        obj = NewObject(xdr->cx, &js_RegExpClass, NULL, NULL);
        if (!obj)
            return JS_FALSE;
        obj->clearParent();
        obj->clearProto();
        re = js_NewRegExp(xdr->cx, NULL, source, (uint8)flagsword, JS_FALSE);
        if (!re)
            return JS_FALSE;
        obj->setPrivate(re);
        obj->zeroRegExpLastIndex();
        *objp = obj;
    }
    return JS_TRUE;
}

#else  /* !JS_HAS_XDR */

#define js_XDRRegExpObject NULL

#endif /* !JS_HAS_XDR */

/* Class trace hook: trace the regexp's source string, if present. */
static void
regexp_trace(JSTracer *trc, JSObject *obj)
{
    JSRegExp *re = (JSRegExp *) obj->getPrivate();
    if (re && re->source)
        JS_CALL_STRING_TRACER(trc, re->source, "source");
}

JSClass js_RegExpClass = {
    js_RegExp_str,
    JSCLASS_HAS_PRIVATE |
    JSCLASS_HAS_RESERVED_SLOTS(JSObject::REGEXP_FIXED_RESERVED_SLOTS) |
    JSCLASS_MARK_IS_TRACE | JSCLASS_HAS_CACHED_PROTO(JSProto_RegExp),
    JS_PropertyStub,    JS_PropertyStub,
    JS_PropertyStub,    JS_PropertyStub,
    JS_EnumerateStub,   JS_ResolveStub,
    JS_ConvertStub,     regexp_finalize,
    NULL,               NULL,
    regexp_call,        NULL,
    js_XDRRegExpObject, NULL,
    JS_CLASS_TRACE(regexp_trace), 0
};

/* Text used by js_regexp_toString in place of an empty source. */
static const jschar empty_regexp_ucstr[] = {'(', '?', ':', ')', 0};

/*
 * Build the "/source/flags" string for a RegExp object and store it in *vp.
 * An object with no private JSRegExp yields the empty string; an empty
 * source is rendered as "(?:)".
 */
JSBool
js_regexp_toString(JSContext *cx, JSObject *obj, jsval *vp)
{
    JSRegExp *re;
    const jschar *source;
    jschar *chars;
    size_t length, nflags;
    uintN flags;
    JSString *str;

    if (!JS_InstanceOf(cx, obj, &js_RegExpClass, vp + 2))
        return JS_FALSE;
    JS_LOCK_OBJ(cx, obj);
    re = (JSRegExp *) obj->getPrivate();
    if (!re) {
        JS_UNLOCK_OBJ(cx, obj);
        *vp = STRING_TO_JSVAL(cx->runtime->emptyString);
        return JS_TRUE;
    }

    re->source->getCharsAndLength(source, length);
    if (length == 0) {
        source = empty_regexp_ucstr;
        length = JS_ARRAY_LENGTH(empty_regexp_ucstr) - 1;
    }
    /* Room for the two enclosing slashes. */
    length += 2;
    nflags = 0;
    /* Count the set bits in flags; each iteration clears the lowest one. */
    for (flags = re->flags; flags != 0; flags &= flags - 1)
        nflags++;
    chars = (jschar*) cx->malloc((length + nflags + 1) * sizeof(jschar));
    if (!chars) {
        JS_UNLOCK_OBJ(cx, obj);
        return JS_FALSE;
    }

    chars[0] = '/';
    js_strncpy(&chars[1], source, length - 2);
    chars[length-1] = '/';
    if (nflags) {
        if (re->flags & JSREG_GLOB)
            chars[length++] = 'g';
        if (re->flags & JSREG_FOLD)
            chars[length++] = 'i';
        if (re->flags & JSREG_MULTILINE)
            chars[length++] = 'm';
        if (re->flags & JSREG_STICKY)
            chars[length++] = 'y';
    }
    JS_UNLOCK_OBJ(cx, obj);
    chars[length] = 0;

    str = js_NewString(cx, chars, length);
    if (!str) {
        /* chars is freed here only because no string was created from it. */
        cx->free(chars);
        return JS_FALSE;
    }
    *vp = STRING_TO_JSVAL(str);
    return JS_TRUE;
}

/* Native toString method: applies js_regexp_toString to `this`. */
static JSBool
regexp_toString(JSContext *cx, uintN argc, jsval *vp)
{
    JSObject *obj;

    obj = JS_THIS_OBJECT(cx, vp);
    return obj && js_regexp_toString(cx, obj, vp);
}

static JSBool
regexp_compile_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, jsval *rval) { JSString *opt, *str; JSRegExp *oldre, *re; JSObject *obj2; size_t length, nbytes; const jschar *cp, *start, *end; jschar *nstart, *ncp, *tmp; if (!JS_InstanceOf(cx, obj, &js_RegExpClass, argv)) return JS_FALSE; opt = NULL; if (argc == 0) { str = cx->runtime->emptyString; } else { if (JSVAL_IS_OBJECT(argv[0])) { /* * If we get passed in a RegExp object we construct a new * RegExp that is a duplicate of it by re-compiling the * original source code. ECMA requires that it be an error * here if the flags are specified. (We must use the flags * from the original RegExp also). */ obj2 = JSVAL_TO_OBJECT(argv[0]); if (obj2 && obj2->getClass() == &js_RegExpClass) { if (argc >= 2 && !JSVAL_IS_VOID(argv[1])) { /* 'flags' passed */ JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL, JSMSG_NEWREGEXP_FLAGGED); return JS_FALSE; } JS_LOCK_OBJ(cx, obj2); re = (JSRegExp *) obj2->getPrivate(); if (!re) { JS_UNLOCK_OBJ(cx, obj2); return JS_FALSE; } re = js_NewRegExp(cx, NULL, re->source, re->flags, JS_FALSE); JS_UNLOCK_OBJ(cx, obj2); goto created; } } str = js_ValueToString(cx, argv[0]); if (!str) return JS_FALSE; argv[0] = STRING_TO_JSVAL(str); if (argc > 1) { if (JSVAL_IS_VOID(argv[1])) { opt = NULL; } else { opt = js_ValueToString(cx, argv[1]); if (!opt) return JS_FALSE; argv[1] = STRING_TO_JSVAL(opt); } } /* Escape any naked slashes in the regexp source. 
*/ str->getCharsAndLength(start, length); end = start + length; nstart = ncp = NULL; for (cp = start; cp < end; cp++) { if (*cp == '/' && (cp == start || cp[-1] != '\\')) { nbytes = (++length + 1) * sizeof(jschar); if (!nstart) { nstart = (jschar *) cx->malloc(nbytes); if (!nstart) return JS_FALSE; ncp = nstart + (cp - start); js_strncpy(nstart, start, cp - start); } else { tmp = (jschar *) cx->realloc(nstart, nbytes); if (!tmp) { cx->free(nstart); return JS_FALSE; } ncp = tmp + (ncp - nstart); nstart = tmp; } *ncp++ = '\\'; } if (nstart) *ncp++ = *cp; } if (nstart) { /* Don't forget to store the backstop after the new string. */ JS_ASSERT((size_t)(ncp - nstart) == length); *ncp = 0; str = js_NewString(cx, nstart, length); if (!str) { cx->free(nstart); return JS_FALSE; } argv[0] = STRING_TO_JSVAL(str); } } re = js_NewRegExpOpt(cx, str, opt, JS_FALSE); created: if (!re) return JS_FALSE; JS_LOCK_OBJ(cx, obj); oldre = (JSRegExp *) obj->getPrivate(); obj->setPrivate(re); obj->zeroRegExpLastIndex(); JS_UNLOCK_OBJ(cx, obj); if (oldre) js_DestroyRegExp(cx, oldre); *rval = OBJECT_TO_JSVAL(obj); return JS_TRUE; } static JSBool regexp_compile(JSContext *cx, uintN argc, jsval *vp) { JSObject *obj; obj = JS_THIS_OBJECT(cx, vp); return obj && regexp_compile_sub(cx, obj, argc, vp + 2, vp); } static JSBool regexp_exec_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, JSBool test, jsval *rval) { JSBool ok, sticky; JSRegExp *re; jsdouble lastIndex; JSString *str; size_t i; ok = JS_InstanceOf(cx, obj, &js_RegExpClass, argv); if (!ok) return JS_FALSE; JS_LOCK_OBJ(cx, obj); re = (JSRegExp *) obj->getPrivate(); if (!re) { JS_UNLOCK_OBJ(cx, obj); return JS_TRUE; } /* NB: we must reach out: after this paragraph, in order to drop re. 
*/ HOLD_REGEXP(cx, re); sticky = (re->flags & JSREG_STICKY) != 0; if (re->flags & (JSREG_GLOB | JSREG_STICKY)) { jsval v = obj->getRegExpLastIndex(); if (JSVAL_IS_INT(v)) { lastIndex = JSVAL_TO_INT(v); } else { JS_ASSERT(JSVAL_IS_DOUBLE(v)); lastIndex = *JSVAL_TO_DOUBLE(v); } } else { lastIndex = 0; } JS_UNLOCK_OBJ(cx, obj); /* Now that obj is unlocked, it's safe to (potentially) grab the GC lock. */ if (argc == 0) { str = cx->regExpStatics.input; if (!str) { const char *bytes = js_GetStringBytes(cx, re->source); if (bytes) { JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL, JSMSG_NO_INPUT, bytes, (re->flags & JSREG_GLOB) ? "g" : "", (re->flags & JSREG_FOLD) ? "i" : "", (re->flags & JSREG_MULTILINE) ? "m" : "", (re->flags & JSREG_STICKY) ? "y" : ""); } ok = JS_FALSE; goto out; } } else { str = js_ValueToString(cx, argv[0]); if (!str) { ok = JS_FALSE; goto out; } argv[0] = STRING_TO_JSVAL(str); } if (lastIndex < 0 || str->length() < lastIndex) { obj->zeroRegExpLastIndex(); *rval = JSVAL_NULL; } else { i = (size_t) lastIndex; ok = js_ExecuteRegExp(cx, re, str, &i, test, rval); if (ok && ((re->flags & JSREG_GLOB) || (*rval != JSVAL_NULL && sticky))) { if (*rval == JSVAL_NULL) obj->zeroRegExpLastIndex(); else ok = SetRegExpLastIndex(cx, obj, i); } } out: DROP_REGEXP(cx, re); return ok; } static JSBool regexp_exec(JSContext *cx, uintN argc, jsval *vp) { return regexp_exec_sub(cx, JS_THIS_OBJECT(cx, vp), argc, vp + 2, JS_FALSE, vp); } static JSBool regexp_test(JSContext *cx, uintN argc, jsval *vp) { if (!regexp_exec_sub(cx, JS_THIS_OBJECT(cx, vp), argc, vp + 2, JS_TRUE, vp)) return JS_FALSE; if (*vp != JSVAL_TRUE) *vp = JSVAL_FALSE; return JS_TRUE; } static JSFunctionSpec regexp_methods[] = { #if JS_HAS_TOSOURCE JS_FN(js_toSource_str, regexp_toString, 0,0), #endif JS_FN(js_toString_str, regexp_toString, 0,0), JS_FN("compile", regexp_compile, 2,0), JS_FN("exec", regexp_exec, 1,0), JS_FN("test", regexp_test, 1,0), JS_FS_END }; static JSBool RegExp(JSContext *cx, JSObject 
*obj, uintN argc, jsval *argv, jsval *rval) { if (!JS_IsConstructing(cx)) { /* * If first arg is regexp and no flags are given, just return the arg. * (regexp_compile_sub detects the regexp + flags case and throws a * TypeError.) See 10.15.3.1. */ if ((argc < 2 || JSVAL_IS_VOID(argv[1])) && !JSVAL_IS_PRIMITIVE(argv[0]) && JSVAL_TO_OBJECT(argv[0])->getClass() == &js_RegExpClass) { *rval = argv[0]; return JS_TRUE; } /* Otherwise, replace obj with a new RegExp object. */ obj = NewObject(cx, &js_RegExpClass, NULL, NULL); if (!obj) return JS_FALSE; /* * regexp_compile_sub does not use rval to root its temporaries so we * can use it to root obj. */ *rval = OBJECT_TO_JSVAL(obj); } return regexp_compile_sub(cx, obj, argc, argv, rval); } JSObject * js_InitRegExpClass(JSContext *cx, JSObject *obj) { JSObject *proto = js_InitClass(cx, obj, NULL, &js_RegExpClass, RegExp, 1, regexp_props, regexp_methods, regexp_static_props, NULL); if (!proto) return NULL; JSObject *ctor = JS_GetConstructor(cx, proto); if (!ctor) return NULL; /* Give RegExp.prototype private data so it matches the empty string. 
*/ jsval rval; if (!JS_AliasProperty(cx, ctor, "input", "$_") || !JS_AliasProperty(cx, ctor, "multiline", "$*") || !JS_AliasProperty(cx, ctor, "lastMatch", "$&") || !JS_AliasProperty(cx, ctor, "lastParen", "$+") || !JS_AliasProperty(cx, ctor, "leftContext", "$`") || !JS_AliasProperty(cx, ctor, "rightContext", "$'") || !regexp_compile_sub(cx, proto, 0, NULL, &rval)) { return NULL; } return proto; } JSObject * js_NewRegExpObject(JSContext *cx, TokenStream *ts, const jschar *chars, size_t length, uintN flags) { JSString *str; JSObject *obj; JSRegExp *re; str = js_NewStringCopyN(cx, chars, length); if (!str) return NULL; AutoValueRooter tvr(cx, str); re = js_NewRegExp(cx, ts, str, flags, JS_FALSE); if (!re) return NULL; obj = NewObject(cx, &js_RegExpClass, NULL, NULL); if (!obj) { js_DestroyRegExp(cx, re); return NULL; } obj->setPrivate(re); obj->zeroRegExpLastIndex(); return obj; } JSObject * JS_FASTCALL js_CloneRegExpObject(JSContext *cx, JSObject *obj, JSObject *proto) { JS_ASSERT(obj->getClass() == &js_RegExpClass); JS_ASSERT(proto); JS_ASSERT(proto->getClass() == &js_RegExpClass); JSObject *clone = NewObjectWithGivenProto(cx, &js_RegExpClass, proto, NULL); if (!clone) return NULL; JSRegExp *re = static_cast(obj->getPrivate()); clone->setPrivate(re); clone->zeroRegExpLastIndex(); HOLD_REGEXP(cx, re); return clone; } #ifdef JS_TRACER JS_DEFINE_CALLINFO_3(extern, OBJECT, js_CloneRegExpObject, CONTEXT, OBJECT, OBJECT, 0, ACC_STORE_ANY) #endif bool js_ContainsRegExpMetaChars(const jschar *chars, size_t length) { for (size_t i = 0; i < length; ++i) { jschar c = chars[i]; switch (c) { /* Taken from the PatternCharacter production in 15.10.1. */ case '^': case '$': case '\\': case '.': case '*': case '+': case '?': case '(': case ')': case '[': case ']': case '{': case '}': case '|': return true; default:; } } return false; } JSBool js_ObjectIsRegExp(JSObject *obj) { return obj->isRegExp(); }