diff --git a/js/src/nanojit/NativeX64.cpp b/js/src/nanojit/NativeX64.cpp new file mode 100644 index 00000000000..7645c561aff --- /dev/null +++ b/js/src/nanojit/NativeX64.cpp @@ -0,0 +1,1302 @@ +/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */ +/* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is [Open Source Virtual Machine]. + * + * The Initial Developer of the Original Code is + * Adobe System Incorporated. + * Portions created by the Initial Developer are Copyright (C) 2009 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Adobe AS3 Team + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#include "nanojit.h" + +// uncomment this to enable _vprof/_nvprof macros +//#define DOPROF +#include "../vprof/vprof.h" + +#if defined FEATURE_NANOJIT && defined NANOJIT_X64 + +/* +completion +- 64bit branch offsets +- finish cmov/qcmov with other conditions +- validate asm_cond with other conditions + +better code +- put R12 back in play as a base register +- no-disp addr modes (except RBP/R13) +- disp64 branch/call +- spill gp values to xmm registers? +- prefer xmm registers for copies since gprs are in higher demand? 
+- stack arg doubles +- stack based LIR_param + +tracing +- asm_loop +- asm_qjoin +- asm_qhi +- nFragExit + +*/ + +namespace nanojit +{ + const Register Assembler::retRegs[] = { RAX }; +#ifdef _MSC_VER + const Register Assembler::argRegs[] = { RCX, RDX, R8, R9 }; + const Register Assembler::savedRegs[] = { RBX, RSI, RDI, R12, R13, R14, R15 }; +#else + const Register Assembler::argRegs[] = { RDI, RSI, RDX, RCX, R8, R9 }; + const Register Assembler::savedRegs[] = { RBX, R12, R13, R14, R15 }; +#endif + + const char *regNames[] = { + "rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" + }; + +#ifdef _DEBUG + #define TODO(x) todo(#x) + static void todo(const char *s) { + verbose_only( avmplus::AvmLog("%s",s); ) + NanoAssertMsgf(false, "%s", s); + } +#else + #define TODO(x) +#endif + + // MODRM and restrictions: + // memory access modes != 11 require SIB if base&7 == 4 (RSP or R12) + // mode 00 with base&7 == 5 means RIP+disp32 (RBP or R13), use mode 01 disp8=0 instead + // rex prefix required to use RSP-R15 as 8bit registers in mod/rm8 modes. + + // take R12 out of play as a base register because it requires the SIB byte like ESP + const RegisterMask BaseRegs = GpRegs & ~rmask(R12); + + static inline int oplen(uint64_t op) { + return op & 255; + } + + // encode 2-register rex prefix. dropped if none of its bits are set. + static inline uint64_t rexrb(uint64_t op, Register r, Register b) { + int shift = 64 - 8*oplen(op); + uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3); + return rex != 0x40 ? op | rex << shift : op - 1; + } + + // encode 2-register rex prefix. dropped if none of its bits are set, but + // keep REX if b >= rsp, to allow uniform use of all 16 8bit registers + static inline uint64_t rexrb8(uint64_t op, Register r, Register b) { + int shift = 64 - 8*oplen(op); + uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3); + return ((rex | (b & ~3)) != 0x40) ? (op | (rex << shift)) : op - 1; + } + + // encode 2-register rex prefix that follows a manditory prefix (66,F2,F3) + // [prefix][rex][opcode] + static inline uint64_t rexprb(uint64_t op, Register r, Register b) { + int shift = 64 - 8*oplen(op) + 8; + uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3); + // to drop rex, we replace rex with manditory prefix, and decrement length + return rex != 0x40 ? 
op | rex << shift :
+            ((op & ~(255LL<<shift)) | (op>>(shift-8)&255) << shift) - 1;
+    }
+
+    // [rex][opcode][mod-rr]
+    static inline uint64_t mod_rr(uint64_t op, Register r, Register b) {
+        return op | uint64_t((r&7)<<3 | (b&7))<<56;
+    }
+
+    static inline uint64_t mod_disp32(uint64_t op, Register r, Register b, int32_t d) {
+        NanoAssert(IsGpReg(r) && IsGpReg(b));
+        NanoAssert((b & 7) != 4); // using RSP or R12 as base requires SIB
+        if (isS8(d)) {
+            // op is: 0x[disp32=0][mod=2:r:b][op][rex][len]
+            NanoAssert((((op>>24)&255)>>6) == 2); // disp32 mode
+            int len = oplen(op);
+            op = (op & ~0xff000000LL) | (0x40 | (r&7)<<3 | (b&7))<<24; // replace mod
+            return op<<24 | int64_t(d)<<56 | (len-3); // shrink disp, add disp8
+        } else {
+            // op is: 0x[disp32][mod][op][rex][len]
+            return op | int64_t(d)<<32 | uint64_t((r&7)<<3 | (b&7))<<24;
+        }
+    }
+
+    #ifdef NJ_VERBOSE
+    void Assembler::dis(NIns *p, int bytes) {
+        char b[32], *s = b; // room for 8 hex bytes plus null
+        *s++ = ' ';
+        for (NIns *end = p + bytes; p < end; p++) {
+            VMPI_sprintf(s, "%02x ", *p);
+            s += 3;
+        }
+        *s = 0;
+        asm_output("%s", b);
+    }
+    #endif
+
+    void Assembler::emit(uint64_t op) {
+        int len = oplen(op);
+        // we will only move nIns by -len bytes, but we write 8
+        // bytes. so need to protect 8 so we don't stomp the page
+        // header or the end of the preceding page (might segf)
+        underrunProtect(8);
+        ((int64_t*)_nIns)[-1] = op;
+        _nIns -= len; // move pointer by length encoded in opcode
+        _nvprof("x64-bytes", len);
+        verbose_only( if (_logc->lcbits & LC_Assembly) dis(_nIns, len); )
+    }
+
+    void Assembler::emit8(uint64_t op, int64_t v) {
+        NanoAssert(isS8(v));
+        emit(op | uint64_t(v)<<56);
+    }
+
+    void Assembler::emit32(uint64_t op, int64_t v) {
+        NanoAssert(isS32(v));
+        emit(op | uint64_t(uint32_t(v))<<32);
+    }
+
+    // 2-register modrm32 form
+    void Assembler::emitrr(uint64_t op, Register r, Register b) {
+        emit(rexrb(mod_rr(op, r, b), r, b));
+    }
+
+    // 2-register modrm8 form (8 bit operand size)
+    void Assembler::emitrr8(uint64_t op, Register r, Register b) {
+        emit(rexrb8(mod_rr(op, r, b), r, b));
+    }
+
+    // same as emitrr, but with a prefix byte
+    void Assembler::emitprr(uint64_t op, Register r, Register b) {
+        emit(rexprb(mod_rr(op, r, b), r, b));
+    }
+
+    // disp32 modrm form, when the disp fits in the instruction (opcode is 1-3 bytes)
+    void Assembler::emitrm(uint64_t op, Register r, int32_t d, Register b) {
+        emit(rexrb(mod_disp32(op, r, b, d), r, b));
+    }
+
+    // disp32 modrm form when the disp must be written separately (opcode is 4+ bytes)
+    void Assembler::emitprm(uint64_t op, Register r, int32_t d, Register b) {
+        if (isS8(d)) {
+            NanoAssert(((op>>56)&0xC0) == 0x80); // make sure mod bits == 2 == disp32 mode
+            underrunProtect(1+8);
+            *(--_nIns) = (NIns) d;
+            _nvprof("x64-bytes", 1);
+            op ^= 0xC000000000000000LL; // change mod bits to 1 == disp8 mode
+        } else {
+            underrunProtect(4+8); // room for displ plus fullsize op
+            *((int32_t*)(_nIns -= 4)) = d;
+            _nvprof("x64-bytes", 4);
+        }
+        emitprr(op, r, b);
+    }
+
+    void Assembler::emitrr_imm(uint64_t op, Register r, Register b, int32_t imm) {
+        NanoAssert(IsGpReg(r) && IsGpReg(b));
+        underrunProtect(4+8); // room for imm plus fullsize op
+        *((int32_t*)(_nIns -= 4)) = imm;
+        _nvprof("x86-bytes", 4);
+        emitrr(op, r, b);
+    }
+
+    // op = [rex][opcode][modrm][imm8]
+    void Assembler::emitr_imm8(uint64_t op, Register b, int32_t imm8) {
+        NanoAssert(IsGpReg(b) && isS8(imm8));
+        op |= uint64_t(imm8)<<56 | uint64_t(b&7)<<48; // modrm is 2nd to last byte
+        emit(rexrb(op, (Register)0, b));
+    }
+
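+    // Worked example of how the helpers above compose (illustrative comment only):
+    // MR(RAX, R8) below reduces to emitrr(X64_movqr, RAX, R8).
+    //   X64_movqr = 0xC08B480000000003: low byte 03 is the length, and the top
+    //               bytes 48 8B C0 are REX.W, the mov r64,r/m64 opcode, and an
+    //               empty mod/rm byte.
+    //   mod_rr()  ORs (r&7)<<3 | (b&7) into the mod/rm byte; both low fields are
+    //             zero here, so C0 is unchanged.
+    //   rexrb()   ORs the high register bits into the REX byte: 0x48 | ((R8&8)>>3) = 0x49.
+    // emit() then stores the 8-byte value and backs _nIns up by 3, so the bytes
+    // reaching the code stream are 49 8B C0, i.e. mov rax, r8.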
void Assembler::MR(Register d, Register s) { + NanoAssert(IsGpReg(d) && IsGpReg(s)); + emitrr(X64_movqr, d, s); + } + + void Assembler::JMP(NIns *target) { + if (!target || isS32(target - _nIns)) { + underrunProtect(8); // must do this before calculating offset + if (target && isS8(target - _nIns)) { + emit8(X64_jmp8, target - _nIns); + } else { + emit32(X64_jmp, target ? target - _nIns : 0); + } + } else { + TODO(jmp64); + } + } + + // register allocation for 2-address style ops of the form R = R (op) B + void Assembler::regalloc_binary(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb) { + rb = UnknownReg; + LIns *a = ins->oprnd1(); + LIns *b = ins->oprnd2(); + if (a != b) { + rb = findRegFor(b, allow); + allow &= ~rmask(rb); + } + rr = prepResultReg(ins, allow); + Reservation* rA = getresv(a); + // if this is last use of a in reg, we can re-use result reg + if (rA == 0 || (ra = rA->reg) == UnknownReg) { + ra = findSpecificRegFor(a, rr); + } else { + // rA already has a register assigned + } + if (a == b) { + rb = ra; + } + } + + void Assembler::asm_qbinop(LIns *ins) { + asm_arith(ins); + } + + void Assembler::asm_shift(LIns *ins) { + // shift require rcx for shift count + LIns *b = ins->oprnd2(); + if (b->isconst()) { + asm_shift_imm(ins); + return; + } + Register rr, ra; + if (b != ins->oprnd1()) { + findSpecificRegFor(b, RCX); + regalloc_unary(ins, GpRegs & ~rmask(RCX), rr, ra); + } else { + // a == b means both must be in RCX + regalloc_unary(ins, rmask(RCX), rr, ra); + } + X64Opcode xop; + switch (ins->opcode()) { + default: + TODO(asm_shift); + case LIR_qursh: xop = X64_shrq; break; + case LIR_qirsh: xop = X64_sarq; break; + case LIR_qilsh: xop = X64_shlq; break; + case LIR_ush: xop = X64_shr; break; + case LIR_rsh: xop = X64_sar; break; + case LIR_lsh: xop = X64_shl; break; + } + emitr(xop, rr); + if (rr != ra) + MR(rr, ra); + } + + void Assembler::asm_shift_imm(LIns *ins) { + Register rr, ra; + regalloc_unary(ins, GpRegs, rr, ra); + X64Opcode xop; + switch (ins->opcode()) { + default: TODO(shiftimm); + case LIR_qursh: xop = X64_shrqi; break; + case LIR_qirsh: xop = X64_sarqi; break; + case LIR_qilsh: xop = X64_shlqi; break; + case LIR_ush: xop = X64_shri; break; + case LIR_rsh: xop = X64_sari; break; + case LIR_lsh: xop = X64_shli; break; + } + int shift = ins->oprnd2()->imm32() & 255; + emit8(rexrb(xop | uint64_t(rr&7)<<48, (Register)0, rr), shift); + if (rr != ra) + MR(rr, ra); + } + + static bool isImm32(LIns *ins) { + return ins->isconst() || (ins->isconstq() && isS32(ins->imm64())); + } + static int32_t getImm32(LIns *ins) { + return ins->isconst() ? 
ins->imm32() : int32_t(ins->imm64()); + } + + // binary op, integer regs, rhs is int32 const + void Assembler::asm_arith_imm(LIns *ins) { + LIns *b = ins->oprnd2(); + int32_t imm = getImm32(b); + LOpcode op = ins->opcode(); + Register rr, ra; + if (op == LIR_mul) { + // imul has true 3-addr form, it doesn't clobber ra + rr = prepResultReg(ins, GpRegs); + LIns *a = ins->oprnd1(); + ra = findRegFor(a, GpRegs); + emitrr_imm(X64_imuli, rr, ra, imm); + return; + } + regalloc_unary(ins, GpRegs, rr, ra); + X64Opcode xop; + if (isS8(imm)) { + switch (ins->opcode()) { + default: TODO(arith_imm8); + case LIR_iaddp: + case LIR_add: xop = X64_addlr8; break; + case LIR_and: xop = X64_andlr8; break; + case LIR_or: xop = X64_orlr8; break; + case LIR_sub: xop = X64_sublr8; break; + case LIR_xor: xop = X64_xorlr8; break; + case LIR_qiadd: + case LIR_qaddp: xop = X64_addqr8; break; + case LIR_qiand: xop = X64_andqr8; break; + case LIR_qior: xop = X64_orqr8; break; + case LIR_qxor: xop = X64_xorqr8; break; + } + emitr_imm8(xop, rr, imm); + } else { + switch (ins->opcode()) { + default: TODO(arith_imm); + case LIR_iaddp: + case LIR_add: xop = X64_addlri; break; + case LIR_and: xop = X64_andlri; break; + case LIR_or: xop = X64_orlri; break; + case LIR_sub: xop = X64_sublri; break; + case LIR_xor: xop = X64_xorlri; break; + case LIR_qiadd: + case LIR_qaddp: xop = X64_addqri; break; + case LIR_qiand: xop = X64_andqri; break; + case LIR_qior: xop = X64_orqri; break; + case LIR_qxor: xop = X64_xorqri; break; + } + emitr_imm(xop, rr, imm); + } + if (rr != ra) + MR(rr, ra); + } + + // binary op with integer registers + void Assembler::asm_arith(LIns *ins) { + Register rr, ra, rb; + LOpcode op = ins->opcode(); + if ((op & ~LIR64) >= LIR_lsh && (op & ~LIR64) <= LIR_ush) { + asm_shift(ins); + return; + } + LIns *b = ins->oprnd2(); + if (isImm32(b)) { + asm_arith_imm(ins); + return; + } + regalloc_binary(ins, GpRegs, rr, ra, rb); + X64Opcode xop; + switch (ins->opcode()) { + default: + TODO(asm_arith); + case LIR_or: + xop = X64_orlrr; + break; + case LIR_sub: + xop = X64_subrr; + break; + case LIR_iaddp: + case LIR_add: + xop = X64_addrr; + break; + case LIR_and: + xop = X64_andrr; + break; + case LIR_xor: + xop = X64_xorrr; + break; + case LIR_mul: + xop = X64_imul; + break; + case LIR_qxor: + xop = X64_xorqrr; + break; + case LIR_qior: + xop = X64_orqrr; + break; + case LIR_qiand: + xop = X64_andqrr; + break; + case LIR_qiadd: + case LIR_qaddp: + xop = X64_addqrr; + break; + } + emitrr(xop, rr, rb); + if (rr != ra) + MR(rr,ra); + } + + // binary op with fp registers + void Assembler::asm_fop(LIns *ins) { + Register rr, ra, rb; + regalloc_binary(ins, FpRegs, rr, ra, rb); + X64Opcode xop; + switch (ins->opcode()) { + default: + TODO(asm_fop); + case LIR_fdiv: + xop = X64_divsd; + break; + case LIR_fmul: + xop = X64_mulsd; + break; + case LIR_fadd: + xop = X64_addsd; + break; + case LIR_fsub: + xop = X64_subsd; + break; + } + emitprr(xop, rr, rb); + if (rr != ra) { + asm_nongp_copy(rr, ra); + } + } + + void Assembler::asm_neg_not(LIns *ins) { + Register rr, ra; + regalloc_unary(ins, GpRegs, rr, ra); + NanoAssert(IsGpReg(ra)); + X64Opcode xop; + if (ins->isop(LIR_not)) { + xop = X64_not; + } else { + xop = X64_neg; + } + emitr(xop, rr); + if (rr != ra) + MR(rr, ra); + } + + void Assembler::asm_call(LIns *ins) { + const CallInfo *call = ins->callInfo(); + ArgSize sizes[MAXARGS]; + int argc = call->get_sizes(sizes); + + bool indirect = call->isIndirect(); + if (!indirect) { + verbose_only(if (_logc->lcbits & LC_Assembly) + 
outputf(" %p:", _nIns); + ) + NIns *target = (NIns*)call->_address; + // must do underrunProtect before calculating offset + underrunProtect(8); + if (isS32(target - _nIns)) { + emit32(X64_call, target - _nIns); + } else { + // can't reach target from here, load imm64 and do an indirect jump + emit(X64_callrax); + emit_quad(RAX, (uint64_t)target); + } + } else { + // Indirect call: we assign the address arg to RAX since it's not + // used for regular arguments, and is otherwise scratch since it's + // clobberred by the call. + asm_regarg(ARGSIZE_P, ins->arg(--argc), RAX); + emit(X64_callrax); + } + + #ifdef _MSC_VER + int stk_used = 32; // always reserve 32byte shadow area + #else + int stk_used = 0; + Register fr = XMM0; + #endif + int arg_index = 0; + for (int i = 0; i < argc; i++) { + int j = argc - i - 1; + ArgSize sz = sizes[j]; + LIns* arg = ins->arg(j); + if ((sz & ARGSIZE_MASK_INT) && arg_index < NumArgRegs) { + // gp arg + asm_regarg(sz, arg, argRegs[arg_index]); + arg_index++; + } + #ifdef _MSC_VER + else if (sz == ARGSIZE_F && arg_index < NumArgRegs) { + // double goes in XMM reg # based on overall arg_index + asm_regarg(sz, arg, Register(XMM0+arg_index)); + arg_index++; + } + #else + else if (sz == ARGSIZE_F && fr < XMM8) { + // double goes in next available XMM register + asm_regarg(sz, arg, fr); + fr = nextreg(fr); + } + #endif + else { + asm_stkarg(sz, arg, stk_used); + stk_used += sizeof(void*); + } + } + + if (stk_used > max_stk_used) + max_stk_used = stk_used; + } + + void Assembler::asm_regarg(ArgSize sz, LIns *p, Register r) { + if (sz == ARGSIZE_I) { + NanoAssert(!p->isQuad()); + if (p->isconst()) { + emit_quad(r, int64_t(p->imm32())); + return; + } + // sign extend int32 to int64 + emitrr(X64_movsxdr, r, r); + } else if (sz == ARGSIZE_U) { + NanoAssert(!p->isQuad()); + if (p->isconst()) { + emit_quad(r, uint64_t(uint32_t(p->imm32()))); + return; + } + // zero extend with 32bit mov, auto-zeros upper 32bits + emitrr(X64_movlr, r, r); + } + /* there is no point in folding an immediate here, because + * the argument register must be a scratch register and we're + * just before a call. Just reserving the register will cause + * the constant to be rematerialized nearby in asm_restore(), + * which is the same instruction we would otherwise emit right + * here, and moving it earlier in the stream provides more scheduling + * freedom to the cpu. 
*/ + findSpecificRegFor(p, r); + } + + void Assembler::asm_stkarg(ArgSize sz, LIns *p, int stk_off) { + NanoAssert(isS8(stk_off)); + if (sz & ARGSIZE_MASK_INT) { + Register r = findRegFor(p, GpRegs); + uint64_t xop = X64_movqspr | uint64_t(stk_off) << 56; // movq [rsp+d8], r + xop |= uint64_t((r&7)<<3) << 40 | uint64_t((r&8)>>1) << 24; // insert r into mod/rm and rex bytes + emit(xop); + if (sz == ARGSIZE_I) { + // extend int32 to int64 + NanoAssert(!p->isQuad()); + emitrr(X64_movsxdr, r, r); + } else if (sz == ARGSIZE_U) { + // extend uint32 to uint64 + NanoAssert(!p->isQuad()); + emitrr(X64_movlr, r, r); + } + } else { + TODO(asm_stkarg_non_int); + } + } + + void Assembler::asm_promote(LIns *ins) { + Register rr, ra; + regalloc_unary(ins, GpRegs, rr, ra); + NanoAssert(IsGpReg(ra)); + if (ins->isop(LIR_u2q)) { + emitrr(X64_movlr, rr, ra); // 32bit mov zeros the upper 32bits of the target + } else { + NanoAssert(ins->isop(LIR_i2q)); + emitrr(X64_movsxdr, rr, ra); // sign extend 32->64 + } + } + + // the CVTSI2SD instruction only writes to the low 64bits of the target + // XMM register, which hinders register renaming and makes dependence + // chains longer. So we precede with XORPS to clear the target register. + + void Assembler::asm_i2f(LIns *ins) { + Register r = prepResultReg(ins, FpRegs); + Register b = findRegFor(ins->oprnd1(), GpRegs); + emitprr(X64_cvtsi2sd, r, b); // cvtsi2sd xmmr, b only writes xmm:0:64 + emitprr(X64_xorps, r, r); // xorpd xmmr,xmmr to break dependency chains + } + + void Assembler::asm_u2f(LIns *ins) { + Register r = prepResultReg(ins, FpRegs); + Register b = findRegFor(ins->oprnd1(), GpRegs); + NanoAssert(!ins->oprnd1()->isQuad()); + // since oprnd1 value is 32bit, its okay to zero-extend the value without worrying about clobbering. + emitprr(X64_cvtsq2sd, r, b); // convert int64 to double + emitprr(X64_xorps, r, r); // xorpd xmmr,xmmr to break dependency chains + emitrr(X64_movlr, b, b); // zero extend u32 to int64 + } + + void Assembler::asm_cmov(LIns *ins) { + LIns* cond = ins->oprnd1(); + LIns* iftrue = ins->oprnd2(); + LIns* iffalse = ins->oprnd3(); + NanoAssert(cond->isCmp()); + NanoAssert((ins->isop(LIR_qcmov) && iftrue->isQuad() && iffalse->isQuad()) || + (ins->isop(LIR_cmov) && !iftrue->isQuad() && !iffalse->isQuad())); + + // this code assumes that neither LD nor MR nor MRcc set any of the condition flags. + // (This is true on Intel, is it true on all architectures?) + const Register rr = prepResultReg(ins, GpRegs); + const Register rf = findRegFor(iffalse, GpRegs & ~rmask(rr)); + X64Opcode xop; + switch (cond->opcode()) { + default: TODO(asm_cmov); + case LIR_qeq: + xop = X64_cmovqne; + break; + } + emitrr(xop, rr, rf); + /*const Register rt =*/ findSpecificRegFor(iftrue, rr); + asm_cmp(cond); + } + + NIns* Assembler::asm_branch(bool onFalse, LIns *cond, NIns *target) { + LOpcode condop = cond->opcode(); + if (condop >= LIR_feq && condop <= LIR_fge) + return asm_fbranch(onFalse, cond, target); + + // we must ensure there's room for the instr before calculating + // the offset. and the offset, determines the opcode (8bit or 32bit) + underrunProtect(8); + if (target && isS8(target - _nIns)) { + static const X64Opcode j8[] = { + X64_je8, // eq + X64_jl8, X64_jg8, X64_jle8, X64_jge8, // lt, gt, le, ge + X64_jb8, X64_ja8, X64_jbe8, X64_jae8 // ult, ugt, ule, uge + }; + uint64_t xop = j8[(condop & ~LIR64) - LIR_eq]; + xop ^= onFalse ? 
(uint64_t)X64_jneg8 : 0; + emit8(xop, target - _nIns); + } else { + static const X64Opcode j32[] = { + X64_je, // eq + X64_jl, X64_jg, X64_jle, X64_jge, // lt, gt, le, ge + X64_jb, X64_ja, X64_jbe, X64_jae // ult, ugt, ule, uge + }; + uint64_t xop = j32[(condop & ~LIR64) - LIR_eq]; + xop ^= onFalse ? (uint64_t)X64_jneg : 0; + emit32(xop, target ? target - _nIns : 0); + } + NIns *patch = _nIns; // addr of instr to patch + asm_cmp(cond); + return patch; + } + + void Assembler::asm_cmp(LIns *cond) { + LIns *b = cond->oprnd2(); + if (isImm32(b)) { + asm_cmp_imm(cond); + return; + } + LIns *a = cond->oprnd1(); + Register ra, rb; + if (a != b) { + Reservation *resva, *resvb; + findRegFor2(GpRegs, a, resva, b, resvb); + ra = resva->reg; + rb = resvb->reg; + } else { + // optimize-me: this will produce a const result! + ra = rb = findRegFor(a, GpRegs); + } + + LOpcode condop = cond->opcode(); + emitrr(condop & LIR64 ? X64_cmpqr : X64_cmplr, ra, rb); + } + + void Assembler::asm_cmp_imm(LIns *cond) { + LIns *a = cond->oprnd1(); + LIns *b = cond->oprnd2(); + Register ra = findRegFor(a, GpRegs); + int32_t imm = getImm32(b); + if (isS8(imm)) { + X64Opcode xop = (cond->opcode() & LIR64) ? X64_cmpqr8 : X64_cmplr8; + emitr_imm8(xop, ra, imm); + } else { + X64Opcode xop = (cond->opcode() & LIR64) ? X64_cmpqri : X64_cmplri; + emitr_imm(xop, ra, imm); + } + } + + // compiling floating point branches + // discussion in https://bugzilla.mozilla.org/show_bug.cgi?id=443886 + // + // fucom/p/pp: c3 c2 c0 jae ja jbe jb je jne + // ucomisd: Z P C !C !C&!Z C|Z C Z !Z + // -- -- -- -- ----- --- -- -- -- + // unordered 1 1 1 T T T + // greater > 0 0 0 T T T + // less < 0 0 1 T T T + // equal = 1 0 0 T T T + // + // here's the cases, using conditionals: + // + // branch >= > <= < = + // ------ --- --- --- --- --- + // LIR_jt jae ja swap+jae swap+ja jp over je + // LIR_jf jb jbe swap+jb swap+jbe jne+jp + + NIns* Assembler::asm_fbranch(bool onFalse, LIns *cond, NIns *target) { + LOpcode condop = cond->opcode(); + NIns *patch; + LIns *a = cond->oprnd1(); + LIns *b = cond->oprnd2(); + if (condop == LIR_feq) { + if (onFalse) { + // branch if unordered or != + underrunProtect(16); // 12 needed, round up for overhang + emit32(X64_jp, target ? target - _nIns : 0); + emit32(X64_jne, target ? target - _nIns : 0); + patch = _nIns; + } else { + // jp skip (2byte) + // jeq target + // skip: ... + underrunProtect(16); // 7 needed but we write 2 instr + NIns *skip = _nIns; + emit32(X64_je, target ? target - _nIns : 0); + patch = _nIns; + emit8(X64_jp8, skip - _nIns); + } + } + else { + if (condop == LIR_flt) { + condop = LIR_fgt; + LIns *t = a; a = b; b = t; + } else if (condop == LIR_fle) { + condop = LIR_fge; + LIns *t = a; a = b; b = t; + } + X64Opcode xop; + if (condop == LIR_fgt) + xop = onFalse ? X64_jbe : X64_ja; + else // LIR_fge + xop = onFalse ? X64_jb : X64_jae; + underrunProtect(8); + emit32(xop, target ? 
target - _nIns : 0); + patch = _nIns; + } + fcmp(a, b); + return patch; + } + + void Assembler::asm_fcond(LIns *ins) { + LOpcode op = ins->opcode(); + LIns *a = ins->oprnd1(); + LIns *b = ins->oprnd2(); + if (op == LIR_feq) { + // result = ZF & !PF, must do logic on flags + // r = al|bl|cl|dl, can only use rh without rex prefix + Register r = prepResultReg(ins, 1<reg, resvb->reg); + } + + void Assembler::asm_restore(LIns *ins, Reservation *resv, Register r) { + (void) r; + if (ins->isop(LIR_alloc)) { + int d = disp(resv); + emitrm(X64_leaqrm, r, d, FP); + } + else if (ins->isconst()) { + if (!resv->arIndex) { + ins->resv()->clear(); + } + // unsafe to use xor r,r for zero because it changes cc's + emit_int(r, ins->imm32()); + } + else if (ins->isconstq() && IsGpReg(r)) { + if (!resv->arIndex) { + ins->resv()->clear(); + } + // unsafe to use xor r,r for zero because it changes cc's + emit_quad(r, ins->imm64()); + } + else { + int d = findMemFor(ins); + if (IsFpReg(r)) { + NanoAssert(ins->isQuad()); + // load 64bits into XMM. don't know if double or int64, assume double. + emitprm(X64_movsdrm, r, d, FP); + } else if (ins->isQuad()) { + emitrm(X64_movqrm, r, d, FP); + } else { + emitrm(X64_movlrm, r, d, FP); + } + } + verbose_only( if (_logc->lcbits & LC_RegAlloc) { + outputForEOL(" <= restore %s", + _thisfrag->lirbuf->names->formatRef(ins)); } ) + } + + void Assembler::asm_cond(LIns *ins) { + LOpcode op = ins->opcode(); + // unlike x86-32, with a rex prefix we can use any GP register as an 8bit target + Register r = prepResultReg(ins, GpRegs); + // SETcc only sets low 8 bits, so extend + emitrr8(X64_movzx8, r, r); + X64Opcode xop; + switch (op) { + default: + TODO(cond); + case LIR_qeq: + case LIR_eq: xop = X64_sete; break; + case LIR_qlt: + case LIR_lt: xop = X64_setl; break; + case LIR_qle: + case LIR_le: xop = X64_setle; break; + case LIR_qgt: + case LIR_gt: xop = X64_setg; break; + case LIR_qge: + case LIR_ge: xop = X64_setge; break; + case LIR_qult: + case LIR_ult: xop = X64_setb; break; + case LIR_qule: + case LIR_ule: xop = X64_setbe; break; + case LIR_qugt: + case LIR_ugt: xop = X64_seta; break; + case LIR_quge: + case LIR_uge: xop = X64_setae; break; + case LIR_ov: xop = X64_seto; break; + } + emitr8(xop, r); + asm_cmp(ins); + } + + void Assembler::asm_ret(LIns *ins) { + JMP(_epilogue); + assignSavedRegs(); + LIns *value = ins->oprnd1(); + Register r = ins->isop(LIR_ret) ? RAX : XMM0; + findSpecificRegFor(value, r); + } + + void Assembler::asm_nongp_copy(Register d, Register s) { + if (!IsFpReg(d) && IsFpReg(s)) { + // gpr <- xmm: use movq r/m64, xmm (66 REX.W 0F 7E /r) + emitprr(X64_movqrx, s, d); + } else if (IsFpReg(d) && IsFpReg(s)) { + // xmm <- xmm: use movaps. 
movsd r,r causes partial register stall + emitrr(X64_movapsr, d, s); + } else { + // xmm <- gpr: use movq xmm, r/m64 (66 REX.W 0F 6E /r) + emitprr(X64_movqxr, d, s); + } + } + + void Assembler::regalloc_load(LIns *ins, Register &rr, int32_t &dr, Register &rb) { + dr = ins->disp(); + LIns *base = ins->oprnd1(); + rb = getBaseReg(base, dr, BaseRegs); + Reservation *resv = getresv(ins); + if (resv && (rr = resv->reg) != UnknownReg) { + // keep already assigned register + freeRsrcOf(ins, false); + } else { + // use a gpr in case we're copying a non-double + rr = prepResultReg(ins, GpRegs & ~rmask(rb)); + } + } + + void Assembler::asm_load64(LIns *ins) { + Register rr, rb; + int32_t dr; + regalloc_load(ins, rr, dr, rb); + if (IsGpReg(rr)) { + // general 64bit load, 32bit const displacement + emitrm(X64_movqrm, rr, dr, rb); + } else { + // load 64bits into XMM. don't know if double or int64, assume double. + emitprm(X64_movsdrm, rr, dr, rb); + } + } + + void Assembler::asm_ld(LIns *ins) { + NanoAssert(!ins->isQuad()); + Register r, b; + int32_t d; + regalloc_load(ins, r, d, b); + emitrm(X64_movlrm, r, d, b); + } + + void Assembler::asm_store64(LIns *value, int d, LIns *base) { + NanoAssert(value->isQuad()); + Register b = getBaseReg(base, d, BaseRegs); + + // if we have to choose a register, use a GPR, but not the base reg + Reservation *resv = getresv(value); + Register r; + if (!resv || (r = resv->reg) == UnknownReg) { + r = findRegFor(value, GpRegs & ~rmask(b)); + } + + if (IsGpReg(r)) { + // gpr store + emitrm(X64_movqmr, r, d, b); + } + else { + // xmm store + emitprm(X64_movsdmr, r, d, b); + } + } + + void Assembler::asm_store32(LIns *value, int d, LIns *base) { + NanoAssert(!value->isQuad()); + Register b = getBaseReg(base, d, BaseRegs); + Register r = findRegFor(value, GpRegs & ~rmask(b)); + + // store 32bits to 64bit addr. use rex so we can use all 16 regs + emitrm(X64_movlmr, r, d, b); + } + + // generate a 32bit constant, must not affect condition codes! + void Assembler::emit_int(Register r, int32_t v) { + NanoAssert(IsGpReg(r)); + emitr_imm(X64_movi, r, v); + } + + // generate a 64bit constant, must not affect condition codes! + void Assembler::emit_quad(Register r, uint64_t v) { + NanoAssert(IsGpReg(r)); + if (isU32(v)) { + emit_int(r, int32_t(v)); + return; + } + if (isS32(v)) { + // safe for sign-extension 32->64 + emitr_imm(X64_movqi32, r, int32_t(v)); + return; + } + underrunProtect(8+8); // imm64 + worst case instr len + ((uint64_t*)_nIns)[-1] = v; + _nIns -= 8; + _nvprof("x64-bytes", 8); + emitr(X64_movqi, r); + } + + void Assembler::asm_int(LIns *ins) { + Register r = prepResultReg(ins, GpRegs); + int32_t v = ins->imm32(); + if (v == 0) { + // special case for zero + emitrr(X64_xorrr, r, r); + return; + } + emit_int(r, v); + } + + void Assembler::asm_quad(LIns *ins) { + uint64_t v = ins->imm64(); + RegisterMask allow = v == 0 ? 
GpRegs|FpRegs : GpRegs; + Register r = prepResultReg(ins, allow); + if (v == 0) { + if (IsGpReg(r)) { + // special case for zero + emitrr(X64_xorrr, r, r); + } else { + // xorps for xmm + emitprr(X64_xorps, r, r); + } + } else { + emit_quad(r, v); + } + } + + void Assembler::asm_qjoin(LIns*) { + TODO(asm_qjoin); + } + + Register Assembler::asm_prep_fcall(Reservation*, LIns *ins) { + return prepResultReg(ins, rmask(XMM0)); + } + + void Assembler::asm_param(LIns *ins) { + uint32_t a = ins->paramArg(); + uint32_t kind = ins->paramKind(); + if (kind == 0) { + // ordinary param + // first six args always in registers for mac x64 + if (a < 6) { + // incoming arg in register + prepResultReg(ins, rmask(argRegs[a])); + } else { + // todo: support stack based args, arg 0 is at [FP+off] where off + // is the # of regs to be pushed in genProlog() + TODO(asm_param_stk); + } + } + else { + // saved param + prepResultReg(ins, rmask(savedRegs[a])); + } + } + + // register allocation for 2-address style unary ops of the form R = (op) R + void Assembler::regalloc_unary(LIns *ins, RegisterMask allow, Register &rr, Register &ra) { + LIns *a = ins->oprnd1(); + rr = prepResultReg(ins, allow); + Reservation* rA = getresv(a); + // if this is last use of a in reg, we can re-use result reg + if (rA == 0 || (ra = rA->reg) == UnknownReg) { + ra = findSpecificRegFor(a, rr); + } else { + // rA already has a register assigned. caller must emit a copy + // to rr once instr code is generated. (ie mov rr,ra ; op rr) + } + } + + static const AVMPLUS_ALIGN16(int64_t) negateMask[] = {0x8000000000000000LL,0}; + + void Assembler::asm_fneg(LIns *ins) { + Register rr, ra; + if (isS32((uintptr_t)negateMask) || isS32((NIns*)negateMask - _nIns)) { + regalloc_unary(ins, FpRegs, rr, ra); + if (isS32((uintptr_t)negateMask)) { + // builtin code is in bottom or top 2GB addr space, use absolute addressing + underrunProtect(4+8); + *((int32_t*)(_nIns -= 4)) = (int32_t)(uintptr_t)negateMask; + _nvprof("x64-bytes", 4); + uint64_t xop = X64_xorpsa | uint64_t((rr&7)<<3)<<48; // put rr[0:2] into mod/rm byte + xop = rexrb(xop, rr, (Register)0); // put rr[3] into rex byte + emit(xop); + } else { + // jit code is within +/-2GB of builtin code, use rip-relative + underrunProtect(4+8); + int32_t d = (int32_t) ((NIns*)negateMask - _nIns); + *((int32_t*)(_nIns -= 4)) = d; + _nvprof("x64-bytes", 4); + emitrr(X64_xorpsm, rr, (Register)0); + } + if (ra != rr) + asm_nongp_copy(rr,ra); + } else { + // this is just hideous - can't use RIP-relative load, can't use + // absolute-address load, and cant move imm64 const to XMM. + // so do it all in a GPR. hrmph. + rr = prepResultReg(ins, GpRegs); + ra = findRegFor(ins->oprnd1(), GpRegs & ~rmask(rr)); + emitrr(X64_xorqrr, rr, ra); // xor rr, ra + emit_quad(rr, negateMask[0]); // mov rr, 0x8000000000000000 + } + } + + void Assembler::asm_qhi(LIns*) { + TODO(asm_qhi); + } + + void Assembler::asm_qlo(LIns *ins) { + Register rr, ra; + regalloc_unary(ins, GpRegs, rr, ra); + NanoAssert(IsGpReg(ra)); + emitrr(X64_movlr, rr, ra); // 32bit mov zeros the upper 32bits of the target + } + + void Assembler::asm_spill(Register rr, int d, bool /*pop*/, bool quad) { + if (d) { + if (!IsFpReg(rr)) { + X64Opcode xop = quad ? 
X64_movqmr : X64_movlmr; + emitrm(xop, rr, d, FP); + } else { + // store 64bits from XMM to memory + NanoAssert(quad); + emitprm(X64_movsdmr, rr, d, FP); + } + } + } + + void Assembler::asm_loop(LIns*, NInsList&) { + TODO(asm_loop); + } + + NIns* Assembler::genPrologue() { + // activation frame is 4 bytes per entry even on 64bit machines + uint32_t stackNeeded = max_stk_used + _activation.highwatermark * 4; + + uint32_t stackPushed = + sizeof(void*) + // returnaddr + sizeof(void*); // ebp + uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK); + uint32_t amt = aligned - stackPushed; + + // Reserve stackNeeded bytes, padded + // to preserve NJ_ALIGN_STACK-byte alignment. + if (amt) { + if (isS8(amt)) + emitr_imm8(X64_subqr8, RSP, amt); + else + emitr_imm(X64_subqri, RSP, amt); + } + + verbose_only( outputAddr=true; asm_output("[patch entry]"); ) + NIns *patchEntry = _nIns; + MR(FP, RSP); // Establish our own FP. + emitr(X64_pushr, FP); // Save caller's FP. + + return patchEntry; + } + + NIns* Assembler::genEpilogue() { + // mov rsp, rbp + // pop rbp + // ret + max_stk_used = 0; + emit(X64_ret); + emitr(X64_popr, RBP); + MR(RSP, RBP); + return _nIns; + } + + void Assembler::nRegisterResetAll(RegAlloc &a) { + // add scratch registers to our free list for the allocator + a.clear(); + a.used = 0; +#ifdef _MSC_VER + a.free = 0x001fffcf; // rax-rbx, rsi, rdi, r8-r15, xmm0-xmm5 +#else + a.free = 0xffffffff & ~(1<,jp, for LIR_jf(feq) + // we just patched the jne, now patch the jp. + next += 6; + NanoAssert(((int32_t*)next)[-1] == 0); + NanoAssert(isS32(target - next)); + ((int32_t*)next)[-1] = int32_t(target - next); + } + } + + Register Assembler::nRegisterAllocFromSet(RegisterMask set) { + #if defined _WIN64 + DWORD tr; + _BitScanForward(&tr, set); + _allocator.free &= ~rmask((Register)tr); + return (Register) tr; + #else + // gcc asm syntax + Register r; + asm("bsf %1, %%eax\n\t" + "btr %%eax, %2\n\t" + "movl %%eax, %0\n\t" + : "=m"(r) : "m"(set), "m"(_allocator.free) : "%eax", "memory"); + (void)set; + return r; + #endif + } + + void Assembler::nFragExit(LIns*) { + TODO(nFragExit); + } + + void Assembler::nInit(AvmCore*) + {} + + void Assembler::underrunProtect(ptrdiff_t bytes) { + NanoAssertMsg(bytes<=LARGEST_UNDERRUN_PROT, "constant LARGEST_UNDERRUN_PROT is too small"); + NIns *pc = _nIns; + NIns *top = _inExit ? this->exitStart : this->codeStart; + + #if PEDANTIC + // pedanticTop is based on the last call to underrunProtect; any time we call + // underrunProtect and would use more than what's already protected, then insert + // a page break jump. Sometimes, this will be to a new page, usually it's just + // the next instruction + + NanoAssert(pedanticTop >= top); + if (pc - bytes < pedanticTop) { + // no page break required, but insert a far branch anyway just to be difficult + const int br_size = 8; // opcode + 32bit addr + if (pc - bytes - br_size < top) { + // really do need a page break + verbose_only(if (_logc->lcbits & LC_Assembly) outputf("newpage %p:", pc);) + codeAlloc(); + } + // now emit the jump, but make sure we won't need another page break. + // we're pedantic, but not *that* pedantic. + pedanticTop = _nIns - br_size; + JMP(pc); + pedanticTop = _nIns - bytes; + } + #else + if (pc - bytes < top) { + verbose_only(if (_logc->lcbits & LC_Assembly) outputf("newpage %p:", pc);) + codeAlloc(); + // this jump will call underrunProtect again, but since we're on a new + // page, nothing will happen. 
+ JMP(pc); + } + #endif + } + + RegisterMask Assembler::hint(LIns *, RegisterMask allow) { + return allow; + } + + void Assembler::nativePageSetup() { + if (!_nIns) { + codeAlloc(); + IF_PEDANTIC( pedanticTop = _nIns; ) + } + if (!_nExitIns) { + codeAlloc(true); + } + } + + void Assembler::nativePageReset() + {} + +} // namespace nanojit + +#endif // FEATURE_NANOJIT && NANOJIT_X64 diff --git a/js/src/nanojit/NativeX64.h b/js/src/nanojit/NativeX64.h new file mode 100644 index 00000000000..1e7ce3b086e --- /dev/null +++ b/js/src/nanojit/NativeX64.h @@ -0,0 +1,360 @@ +/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */ +/* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is [Open Source Virtual Machine]. + * + * The Initial Developer of the Original Code is + * Adobe System Incorporated. + * Portions created by the Initial Developer are Copyright (C) 2008 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Adobe AS3 Team + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. 
+ * + * ***** END LICENSE BLOCK ***** */ + +#ifndef __nanojit_NativeX64__ +#define __nanojit_NativeX64__ + +#ifndef NANOJIT_64BIT +#error "NANOJIT_64BIT must be defined for X64 backend" +#endif + +#ifdef PERFM +#define DOPROF +#include "../vprof/vprof.h" +#define count_instr() _nvprof("x64",1) +#define count_prolog() _nvprof("x64-prolog",1); count_instr(); +#define count_imt() _nvprof("x64-imt",1) count_instr() +#else +#define count_instr() +#define count_prolog() +#define count_imt() +#endif + +namespace nanojit +{ +#define NJ_MAX_STACK_ENTRY 256 +#define NJ_ALIGN_STACK 16 + + enum Register { + RAX = 0, // 1st int return, # of sse varargs + RCX = 1, // 4th int arg + RDX = 2, // 3rd int arg 2nd return + RBX = 3, // saved + RSP = 4, // stack ptr + RBP = 5, // frame ptr, saved, sib reqd + RSI = 6, // 2nd int arg + RDI = 7, // 1st int arg + R8 = 8, // 5th int arg + R9 = 9, // 6th int arg + R10 = 10, // scratch + R11 = 11, // scratch + R12 = 12, // saved + R13 = 13, // saved, sib reqd like rbp + R14 = 14, // saved + R15 = 15, // saved + + XMM0 = 16, // 1st double arg, return + XMM1 = 17, // 2nd double arg, return + XMM2 = 18, // 3rd double arg + XMM3 = 19, // 4th double arg + XMM4 = 20, // 5th double arg + XMM5 = 21, // 6th double arg + XMM6 = 22, // 7th double arg + XMM7 = 23, // 8th double arg + XMM8 = 24, // scratch + XMM9 = 25, // scratch + XMM10 = 26, // scratch + XMM11 = 27, // scratch + XMM12 = 28, // scratch + XMM13 = 29, // scratch + XMM14 = 30, // scratch + XMM15 = 31, // scratch + + FP = RBP, + UnknownReg = 32, + FirstReg = RAX, + LastReg = XMM15 + }; + +/* + * Micro-templating variable-length opcodes, idea first + * describe by Mike Pall of Luajit. + * + * X86-64 opcode encodings: LSB encodes the length of the + * opcode in bytes, remaining bytes are encoded as 1-7 bytes + * in a single uint64_t value. The value is written as a single + * store into the code stream, and the code pointer is decremented + * by the length. each successive instruction partially overlaps + * the previous one. + * + * emit methods below are able to encode mod/rm, sib, rex, and + * register and small immediate values into these opcode values + * without much branchy code. + * + * these opcodes encapsulate all the const parts of the instruction. + * for example, the alu-immediate opcodes (add, sub, etc) encode + * part of their opcode in the R field of the mod/rm byte; this + * hardcoded value is in the constant below, and the R argument + * to emitrr() is 0. In a few cases, a whole instruction is encoded + * this way (eg callrax). + * + * when a disp32, imm32, or imm64 suffix can't fit in an 8-byte + * opcode, then it is written into the code separately and not counted + * in the opcode length. 
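+ *
+ * For example, reading X64_addqrr = 0xC003480000000003 below under this
+ * scheme: the low byte (0x03) is the length, and the top three bytes give
+ * the instruction 48 03 C0, i.e. REX.W, the ADD r64,r/m64 opcode, and a
+ * mod/rm byte with mod=11 and empty register fields; emitrr() ORs the
+ * actual register numbers into the mod/rm and REX bytes before the value
+ * is stored. Likewise the 32bit jcc constants keep the condition code in
+ * a single byte, so XORing with X64_jneg flips, e.g., je (0x84) to jne
+ * (0x85) when a branch must be negated.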
+ */ + + enum X64Opcode +#if defined(_MSC_VER) && _MSC_VER >= 1400 +#pragma warning(disable:4480) // nonstandard extension used: specifying underlying type for enum + : uint64_t +#endif + { + // 64bit opcode constants + // msb lsb len + X64_addqrr = 0xC003480000000003LL, // 64bit add r += b + X64_addqri = 0xC081480000000003LL, // 64bit add r += int64(imm32) + X64_addqr8 = 0x00C0834800000004LL, // 64bit add r += int64(imm8) + X64_andqri = 0xE081480000000003LL, // 64bit and r &= int64(imm32) + X64_andqr8 = 0x00E0834800000004LL, // 64bit and r &= int64(imm8) + X64_orqri = 0xC881480000000003LL, // 64bit or r |= int64(imm32) + X64_orqr8 = 0x00C8834800000004LL, // 64bit or r |= int64(imm8) + X64_xorqri = 0xF081480000000003LL, // 64bit xor r ^= int64(imm32) + X64_xorqr8 = 0x00F0834800000004LL, // 64bit xor r ^= int64(imm8) + X64_addlri = 0xC081400000000003LL, // 32bit add r += imm32 + X64_addlr8 = 0x00C0834000000004LL, // 32bit add r += imm8 + X64_andlri = 0xE081400000000003LL, // 32bit and r &= imm32 + X64_andlr8 = 0x00E0834000000004LL, // 32bit and r &= imm8 + X64_orlri = 0xC881400000000003LL, // 32bit or r |= imm32 + X64_orlr8 = 0x00C8834000000004LL, // 32bit or r |= imm8 + X64_sublri = 0xE881400000000003LL, // 32bit sub r -= imm32 + X64_sublr8 = 0x00E8834000000004LL, // 32bit sub r -= imm8 + X64_xorlri = 0xF081400000000003LL, // 32bit xor r ^= imm32 + X64_xorlr8 = 0x00F0834000000004LL, // 32bit xor r ^= imm8 + X64_addrr = 0xC003400000000003LL, // 32bit add r += b + X64_andqrr = 0xC023480000000003LL, // 64bit and r &= b + X64_andrr = 0xC023400000000003LL, // 32bit and r &= b + X64_call = 0x00000000E8000005LL, // near call + X64_callrax = 0xD0FF000000000002LL, // indirect call to addr in rax (no REX) + X64_cmovqne = 0xC0450F4800000004LL, // 64bit conditional mov if (c) r = b + X64_cmplr = 0xC03B400000000003LL, // 32bit compare r,b + X64_cmpqr = 0xC03B480000000003LL, // 64bit compare r,b + X64_cmplri = 0xF881400000000003LL, // 32bit compare r,imm32 + X64_cmpqri = 0xF881480000000003LL, // 64bit compare r,int64(imm32) + X64_cmplr8 = 0x00F8834000000004LL, // 32bit compare r,imm8 + X64_cmpqr8 = 0x00F8834800000004LL, // 64bit compare r,int64(imm8) + X64_cvtsi2sd= 0xC02A0F40F2000005LL, // convert int32 to double r = (double) b + X64_cvtsq2sd= 0xC02A0F48F2000005LL, // convert int64 to double r = (double) b + X64_divsd = 0xC05E0F40F2000005LL, // divide scalar double r /= b + X64_mulsd = 0xC0590F40F2000005LL, // multiply scalar double r *= b + X64_addsd = 0xC0580F40F2000005LL, // add scalar double r += b + X64_imul = 0xC0AF0F4000000004LL, // 32bit signed mul r *= b + X64_imuli = 0xC069400000000003LL, // 32bit signed mul r = b * imm32 + X64_imul8 = 0x00C06B4000000004LL, // 32bit signed mul r = b * imm8 + X64_jmp = 0x00000000E9000005LL, // jump near rel32 + X64_jmp8 = 0x00EB000000000002LL, // jump near rel8 + X64_jb = 0x00000000820F0006LL, // jump near if below (uint <) + X64_jae = 0x00000000830F0006LL, // jump near if above or equal (uint >=) + X64_ja = 0x00000000870F0006LL, // jump near if above (uint >) + X64_jbe = 0x00000000860F0006LL, // jump near if below or equal (uint <=) + X64_je = 0x00000000840F0006LL, // near jump if equal + X64_jne = 0x00000000850F0006LL, // jump near if not equal + X64_jl = 0x000000008C0F0006LL, // jump near if less (int <) + X64_jge = 0x000000008D0F0006LL, // jump near if greater or equal (int >=) + X64_jg = 0x000000008F0F0006LL, // jump near if greater (int >) + X64_jle = 0x000000008E0F0006LL, // jump near if less or equal (int <=) + X64_jp = 0x000000008A0F0006LL, // jump 
near if parity (PF == 1) + X64_jnp = 0x000000008B0F0006LL, // jump near if not parity (PF == 0) + X64_jneg = 0x0000000001000000LL, // xor with this mask to negate the condition + X64_jb8 = 0x0072000000000002LL, // jump near if below (uint <) + X64_jae8 = 0x0073000000000002LL, // jump near if above or equal (uint >=) + X64_ja8 = 0x0077000000000002LL, // jump near if above (uint >) + X64_jbe8 = 0x0076000000000002LL, // jump near if below or equal (uint <=) + X64_je8 = 0x0074000000000002LL, // near jump if equal + X64_jne8 = 0x0075000000000002LL, // jump near if not equal + X64_jl8 = 0x007C000000000002LL, // jump near if less (int <) + X64_jge8 = 0x007D000000000002LL, // jump near if greater or equal (int >=) + X64_jg8 = 0x007F000000000002LL, // jump near if greater (int >) + X64_jle8 = 0x007E000000000002LL, // jump near if less or equal (int <=) + X64_jp8 = 0x007A000000000002LL, // jump near if parity (PF == 1) + X64_jnp8 = 0x007B000000000002LL, // jump near if not parity (PF == 0) + X64_jneg8 = 0x0001000000000000LL, // xor with this mask to negate the condition + X64_leaqrm = 0x00000000808D4807LL, // 64bit load effective addr reg <- disp32+base + X64_learm = 0x00000000808D4007LL, // 32bit load effective addr reg <- disp32+base + X64_movlr = 0xC08B400000000003LL, // 32bit mov r <- b + X64_movlmr = 0x0000000080894007LL, // 32bit store r -> [b+d32] + X64_movlrm = 0x00000000808B4007LL, // 32bit load r <- [b+d32] + X64_movqmr = 0x0000000080894807LL, // 64bit store gpr -> [b+d32] + X64_movqspr = 0x0024448948000005LL, // 64bit store gpr -> [rsp+d32] (sib required) + X64_movqr = 0xC08B480000000003LL, // 64bit mov r <- b + X64_movqi = 0xB848000000000002LL, // 64bit mov r <- imm64 + X64_movi = 0xB840000000000002LL, // 32bit mov r <- imm32 + X64_movqi32 = 0xC0C7480000000003LL, // 64bit mov r <- int64(imm32) + X64_movapsr = 0xC0280F4000000004LL, // 128bit mov xmm <- xmm + X64_movqrx = 0xC07E0F4866000005LL, // 64bit mov b <- xmm-r + X64_movqxr = 0xC06E0F4866000005LL, // 64bit mov b -> xmm-r + X64_movqrm = 0x00000000808B4807LL, // 64bit load r <- [b+d32] + X64_movsdrr = 0xC0100F40F2000005LL, // 64bit mov xmm-r <- xmm-b (upper 64bits unchanged) + X64_movsdrm = 0x80100F40F2000005LL, // 64bit load xmm-r <- [b+d32] (upper 64 cleared) + X64_movsdmr = 0x80110F40F2000005LL, // 64bit store xmm-r -> [b+d32] + X64_movsxdr = 0xC063480000000003LL, // sign extend i32 to i64 r = (int64)(int32) b + X64_movzx8 = 0xC0B60F4000000004LL, // zero extend i8 to i64 r = (uint64)(uint8) b + X64_neg = 0xD8F7400000000003LL, // 32bit two's compliment b = -b + X64_nop1 = 0x9000000000000001LL, // one byte NOP + X64_nop2 = 0x9066000000000002LL, // two byte NOP + X64_nop3 = 0x001F0F0000000003LL, // three byte NOP + X64_nop4 = 0x00401F0F00000004LL, // four byte NOP + X64_nop5 = 0x0000441F0F000005LL, // five byte NOP + X64_nop6 = 0x0000441F0F660006LL, // six byte NOP + X64_nop7 = 0x00000000801F0F07LL, // seven byte NOP + X64_not = 0xD0F7400000000003LL, // 32bit ones compliment b = ~b + X64_orlrr = 0xC00B400000000003LL, // 32bit or r |= b + X64_orqrr = 0xC00B480000000003LL, // 64bit or r |= b + X64_popr = 0x5840000000000002LL, // 64bit pop r <- [rsp++] + X64_pushr = 0x5040000000000002LL, // 64bit push r -> [--rsp] + X64_pxor = 0xC0EF0F4066000005LL, // 128bit xor xmm-r ^= xmm-b + X64_ret = 0xC300000000000001LL, // near return from called procedure + X64_sete = 0xC0940F4000000004LL, // set byte if equal (ZF == 1) + X64_seto = 0xC0900F4000000004LL, // set byte if overflow (OF == 1) + X64_setc = 0xC0920F4000000004LL, // set byte if carry (CF 
== 1) + X64_setl = 0xC09C0F4000000004LL, // set byte if less (int <) (SF != OF) + X64_setle = 0xC09E0F4000000004LL, // set byte if less or equal (int <=) (ZF == 1 || SF != OF) + X64_setg = 0xC09F0F4000000004LL, // set byte if greater (int >) (ZF == 0 && SF == OF) + X64_setge = 0xC09D0F4000000004LL, // set byte if greater or equal (int >=) (SF == OF) + X64_seta = 0xC0970F4000000004LL, // set byte if above (uint >) (CF == 0 && ZF == 0) + X64_setae = 0xC0930F4000000004LL, // set byte if above or equal (uint >=) (CF == 0) + X64_setb = 0xC0920F4000000004LL, // set byte if below (uint <) (CF == 1) + X64_setbe = 0xC0960F4000000004LL, // set byte if below or equal (uint <=) (ZF == 1 || CF == 1) + X64_subsd = 0xC05C0F40F2000005LL, // subtract scalar double r -= b + X64_shl = 0xE0D3400000000003LL, // 32bit left shift r <<= rcx + X64_shlq = 0xE0D3480000000003LL, // 64bit left shift r <<= rcx + X64_shr = 0xE8D3400000000003LL, // 32bit uint right shift r >>= rcx + X64_shrq = 0xE8D3480000000003LL, // 64bit uint right shift r >>= rcx + X64_sar = 0xF8D3400000000003LL, // 32bit int right shift r >>= rcx + X64_sarq = 0xF8D3480000000003LL, // 64bit int right shift r >>= rcx + X64_shli = 0x00E0C14000000004LL, // 32bit left shift r <<= imm8 + X64_shlqi = 0x00E0C14800000004LL, // 64bit left shift r <<= imm8 + X64_sari = 0x00F8C14000000004LL, // 32bit int right shift r >>= imm8 + X64_sarqi = 0x00F8C14800000004LL, // 64bit int right shift r >>= imm8 + X64_shri = 0x00E8C14000000004LL, // 32bit uint right shift r >>= imm8 + X64_shrqi = 0x00E8C14800000004LL, // 64bit uint right shift r >>= imm8 + X64_subqrr = 0xC02B480000000003LL, // 64bit sub r -= b + X64_subrr = 0xC02B400000000003LL, // 32bit sub r -= b + X64_subqri = 0xE881480000000003LL, // 64bit sub r -= int64(imm32) + X64_subqr8 = 0x00E8834800000004LL, // 64bit sub r -= int64(imm8) + X64_ucomisd = 0xC02E0F4066000005LL, // unordered compare scalar double + X64_xorqrr = 0xC033480000000003LL, // 64bit xor r &= b + X64_xorrr = 0xC033400000000003LL, // 32bit xor r &= b + X64_xorpd = 0xC0570F4066000005LL, // 128bit xor xmm (two packed doubles) + X64_xorps = 0xC0570F4000000004LL, // 128bit xor xmm (four packed singles), one byte shorter + X64_xorpsm = 0x05570F4000000004LL, // 128bit xor xmm, [rip+disp32] + X64_xorpsa = 0x2504570F40000005LL, // 128bit xor xmm, [disp32] + + X86_and8r = 0xC022000000000002LL, // and rl,rh + X86_sete = 0xC0940F0000000003LL, // no-rex version of X64_sete + X86_setnp = 0xC09B0F0000000003LL // no-rex set byte if odd parity (ordered fcmp result) (PF == 0) + }; + + typedef uint32_t RegisterMask; + + static const RegisterMask GpRegs = 0xffff; + static const RegisterMask FpRegs = 0xffff0000; + static const bool CalleeRegsNeedExplicitSaving = true; +#ifdef _MSC_VER + static const RegisterMask SavedRegs = 1<