/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
/* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is [Open Source Virtual Machine].
 *
 * The Initial Developer of the Original Code is
 * Adobe System Incorporated.
 * Portions created by the Initial Developer are Copyright (C) 2009
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Adobe AS3 Team
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include "nanojit.h"

// uncomment this to enable _vprof/_nvprof macros
//#define DOPROF
#include "../vprof/vprof.h"

#if defined FEATURE_NANOJIT && defined NANOJIT_X64

/*
completion
- 64bit branch offsets
- finish cmov/qcmov with other conditions
- validate asm_cond with other conditions

better code
- put R12 back in play as a base register
- no-disp addr modes (except RBP/R13)
- disp64 branch/call
- spill gp values to xmm registers?
- prefer xmm registers for copies since gprs are in higher demand?
- stack arg doubles
- stack based LIR_param

tracing
- asm_qjoin
- asm_qhi
- nFragExit
*/

namespace nanojit
{
    const Register Assembler::retRegs[] = { RAX };
#ifdef _MSC_VER
    const Register Assembler::argRegs[] = { RCX, RDX, R8, R9 };
    const Register Assembler::savedRegs[] = { RBX, RSI, RDI, R12, R13, R14, R15 };
#else
    const Register Assembler::argRegs[] = { RDI, RSI, RDX, RCX, R8, R9 };
    const Register Assembler::savedRegs[] = { RBX, R12, R13, R14, R15 };
#endif

    const char *regNames[] = {
        "rax",  "rcx",  "rdx",   "rbx",   "rsp",   "rbp",   "rsi",   "rdi",
        "r8",   "r9",   "r10",   "r11",   "r12",   "r13",   "r14",   "r15",
        "xmm0", "xmm1", "xmm2",  "xmm3",  "xmm4",  "xmm5",  "xmm6",  "xmm7",
        "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
    };

#ifdef _DEBUG
    #define TODO(x) todo(#x)
    static void todo(const char *s) {
        verbose_only( avmplus::AvmLog("%s", s); )
        NanoAssertMsgf(false, "%s", s);
    }
#else
    #define TODO(x)
#endif

    // MODRM and restrictions:
    // memory access modes != 11 require SIB if base&7 == 4 (RSP or R12)
    // mode 00 with base&7 == 5 means RIP+disp32 (RBP or R13), use mode 01 disp8=0 instead
    // rex prefix required to use RSP-R15 as 8bit registers in mod/rm8 modes.

    // take R12 out of play as a base register because it requires the SIB byte like ESP
    const RegisterMask BaseRegs = GpRegs & ~rmask(R12);
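    // Worked example of the restrictions above (standard x86-64 encodings, shown
    // only for illustration): with mod != 11, rm=100 selects an SIB byte rather
    // than a plain base register, and mod=00 with rm=101 means RIP+disp32, so
    // RSP/R12 and RBP/R13 need special handling as bases:
    //
    //   mov eax, [rsp+8]  =>  8B 44 24 08   ; modrm 44 = mod=01 reg=eax rm=100,
    //                                       ; sib 24 = no index, base=rsp, disp8=8
    //   mov eax, [rbp]    =>  8B 45 00      ; mod=01 rm=101 with disp8=0, because
    //                                       ; mod=00 rm=101 would mean [rip+disp32]
    //
    // That is why mod_disp32() below asserts (b & 7) != 4 and why BaseRegs
    // excludes R12.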
    static inline int oplen(uint64_t op) {
        return op & 255;
    }

    // encode 2-register rex prefix.  dropped if none of its bits are set.
    static inline uint64_t rexrb(uint64_t op, Register r, Register b) {
        int shift = 64 - 8*oplen(op);
        uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3);
        return rex != 0x40 ? op | rex << shift : op - 1;
    }

    // encode 2-register rex prefix.  dropped if none of its bits are set, but
    // keep REX if b >= rsp, to allow uniform use of all 16 8bit registers
    static inline uint64_t rexrb8(uint64_t op, Register r, Register b) {
        int shift = 64 - 8*oplen(op);
        uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3);
        return ((rex | (b & ~3)) != 0x40) ? (op | (rex << shift)) : op - 1;
    }

    // encode 2-register rex prefix that follows a mandatory prefix (66,F2,F3)
    // [prefix][rex][opcode]
    static inline uint64_t rexprb(uint64_t op, Register r, Register b) {
        int shift = 64 - 8*oplen(op) + 8;
        uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3);
        // to drop rex, we replace rex with the mandatory prefix, and decrement length
        return rex != 0x40 ? op | rex << shift :
            ((op & ~(255LL<<shift)) | (op>>(shift-8)&255) << shift) - 1;
    }

    // [rex][opcode][mod-rr]
    static inline uint64_t mod_rr(uint64_t op, Register r, Register b) {
        return op | uint64_t((r&7)<<3 | (b&7))<<56;
    }

    static inline uint64_t mod_disp32(uint64_t op, Register r, Register b, int32_t d) {
        NanoAssert(IsGpReg(r) && IsGpReg(b));
        NanoAssert((b & 7) != 4); // using RSP or R12 as base requires SIB
        if (isS8(d)) {
            // op is:  0x[disp32=0][mod=2:r:b][op][rex][len]
            NanoAssert((((op>>24)&255)>>6) == 2); // disp32 mode
            int len = oplen(op);
            op = (op & ~0xff000000LL) | (0x40 | (r&7)<<3 | (b&7))<<24; // replace mod
            return op<<24 | int64_t(d)<<56 | (len-3); // shrink disp, add disp8
        } else {
            // op is: 0x[disp32][mod][op][rex][len]
            return op | int64_t(d)<<32 | uint64_t((r&7)<<3 | (b&7))<<24;
        }
    }

#ifdef NJ_VERBOSE
    void Assembler::dis(NIns *p, int bytes) {
        char b[32], *s = b; // room for 8 hex bytes plus null
        *s++ = ' ';
        for (NIns *end = p + bytes; p < end; p++) {
            VMPI_sprintf(s, "%02x ", *p);
            s += 3;
        }
        *s = 0;
        asm_output("%s", b);
    }
#endif
    void Assembler::emit(uint64_t op) {
        int len = oplen(op);
        // we will only move nIns by -len bytes, but we write 8
        // bytes.  so need to protect 8 so we don't stomp the page
        // header or the end of the preceding page (might segf)
        underrunProtect(8);
        ((int64_t*)_nIns)[-1] = op;
        _nIns -= len; // move pointer by length encoded in opcode
        _nvprof("x64-bytes", len);
        verbose_only( if (_logc->lcbits & LC_Assembly) dis(_nIns, len); )
    }

    void Assembler::emit8(uint64_t op, int64_t v) {
        NanoAssert(isS8(v));
        emit(op | uint64_t(v)<<56);
    }

    void Assembler::emit32(uint64_t op, int64_t v) {
        NanoAssert(isS32(v));
        emit(op | uint64_t(uint32_t(v))<<32);
    }

    // 2-register modrm32 form
    void Assembler::emitrr(uint64_t op, Register r, Register b) {
        emit(rexrb(mod_rr(op, r, b), r, b));
    }

    // 2-register modrm8 form (8 bit operand size)
    void Assembler::emitrr8(uint64_t op, Register r, Register b) {
        emit(rexrb8(mod_rr(op, r, b), r, b));
    }

    // same as emitrr, but with a prefix byte
    void Assembler::emitprr(uint64_t op, Register r, Register b) {
        emit(rexprb(mod_rr(op, r, b), r, b));
    }

    // disp32 modrm form, when the disp fits in the instruction (opcode is 1-3 bytes)
    void Assembler::emitrm(uint64_t op, Register r, int32_t d, Register b) {
        emit(rexrb(mod_disp32(op, r, b, d), r, b));
    }

    // disp32 modrm form when the disp must be written separately (opcode is 4+ bytes)
    void Assembler::emitprm(uint64_t op, Register r, int32_t d, Register b) {
        if (isS8(d)) {
            NanoAssert(((op>>56)&0xC0) == 0x80); // make sure mod bits == 2 == disp32 mode
            underrunProtect(1+8);
            *(--_nIns) = (NIns) d;
            _nvprof("x64-bytes", 1);
            op ^= 0xC000000000000000LL; // change mod bits to 1 == disp8 mode
        } else {
            underrunProtect(4+8); // room for displ plus fullsize op
            *((int32_t*)(_nIns -= 4)) = d;
            _nvprof("x64-bytes", 4);
        }
        emitprr(op, r, b);
    }

    void Assembler::emitrr_imm(uint64_t op, Register r, Register b, int32_t imm) {
        NanoAssert(IsGpReg(r) && IsGpReg(b));
        underrunProtect(4+8); // room for imm plus fullsize op
        *((int32_t*)(_nIns -= 4)) = imm;
        _nvprof("x86-bytes", 4);
        emitrr(op, r, b);
    }

    // op = [rex][opcode][modrm][imm8]
    void Assembler::emitr_imm8(uint64_t op, Register b, int32_t imm8) {
        NanoAssert(IsGpReg(b) && isS8(imm8));
        op |= uint64_t(imm8)<<56 | uint64_t(b&7)<<48; // modrm is 2nd to last byte
        emit(rexrb(op, (Register)0, b));
    }

    void Assembler::MR(Register d, Register s) {
        NanoAssert(IsGpReg(d) && IsGpReg(s));
        emitrr(X64_movqr, d, s);
    }
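    // Sketch of how the uint64 opcode templates are laid out, derived from
    // emit(), mod_rr() and rexrb() above, using an illustrative 3-byte
    // (rex+opcode+modrm) instruction:
    //
    //   bits 63..56   55..48    47..40   ...   7..0
    //        [modrm]  [opcode]  [rex]          [len]
    //
    // emit() stores the whole 8-byte template little-endian at _nIns-8 and then
    // steps _nIns back by len, so only the top len bytes become part of the
    // instruction stream; the REX byte (lowest of those) executes first and the
    // modrm/imm byte (bits 63..56) last.  mod_rr() ORs the register fields into
    // bits 56..63, rexrb() ORs the R/B extension bits into the byte at position
    // 64-8*len, and the "op - 1" in the rex helpers shrinks len by one when the
    // REX prefix would be a redundant 0x40 and can be dropped.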
    void Assembler::JMP(NIns *target) {
        if (!target || isS32(target - _nIns)) {
            underrunProtect(8); // must do this before calculating offset
            if (target && isS8(target - _nIns)) {
                emit8(X64_jmp8, target - _nIns);
            } else {
                emit32(X64_jmp, target ? target - _nIns : 0);
            }
        } else {
            TODO(jmp64);
        }
    }

    // register allocation for 2-address style ops of the form R = R (op) B
    void Assembler::regalloc_binary(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb) {
        rb = UnknownReg;
        LIns *a = ins->oprnd1();
        LIns *b = ins->oprnd2();
        if (a != b) {
            rb = findRegFor(b, allow);
            allow &= ~rmask(rb);
        }
        rr = prepResultReg(ins, allow);
        Reservation* rA = getresv(a);
        // if this is last use of a in reg, we can re-use result reg
        if (rA == 0 || (ra = rA->reg) == UnknownReg) {
            ra = findSpecificRegFor(a, rr);
        } else {
            // rA already has a register assigned
        }
        if (a == b) {
            rb = ra;
        }
    }

    void Assembler::asm_qbinop(LIns *ins) {
        asm_arith(ins);
    }

    void Assembler::asm_shift(LIns *ins) {
        // shifts require rcx for the shift count
        LIns *b = ins->oprnd2();
        if (b->isconst()) {
            asm_shift_imm(ins);
            return;
        }
        Register rr, ra;
        if (b != ins->oprnd1()) {
            findSpecificRegFor(b, RCX);
            regalloc_unary(ins, GpRegs & ~rmask(RCX), rr, ra);
        } else {
            // a == b means both must be in RCX
            regalloc_unary(ins, rmask(RCX), rr, ra);
        }
        X64Opcode xop;
        switch (ins->opcode()) {
        default:
            TODO(asm_shift);
        case LIR_qursh: xop = X64_shrq;  break;
        case LIR_qirsh: xop = X64_sarq;  break;
        case LIR_qilsh: xop = X64_shlq;  break;
        case LIR_ush:   xop = X64_shr;   break;
        case LIR_rsh:   xop = X64_sar;   break;
        case LIR_lsh:   xop = X64_shl;   break;
        }
        emitr(xop, rr);
        if (rr != ra)
            MR(rr, ra);
    }

    void Assembler::asm_shift_imm(LIns *ins) {
        Register rr, ra;
        regalloc_unary(ins, GpRegs, rr, ra);
        X64Opcode xop;
        switch (ins->opcode()) {
        default:
            TODO(shiftimm);
        case LIR_qursh: xop = X64_shrqi; break;
        case LIR_qirsh: xop = X64_sarqi; break;
        case LIR_qilsh: xop = X64_shlqi; break;
        case LIR_ush:   xop = X64_shri;  break;
        case LIR_rsh:   xop = X64_sari;  break;
        case LIR_lsh:   xop = X64_shli;  break;
        }
        int shift = ins->oprnd2()->imm32() & 255;
        emit8(rexrb(xop | uint64_t(rr&7)<<48, (Register)0, rr), shift);
        if (rr != ra)
            MR(rr, ra);
    }
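    // Note on the "emit the op, then MR(rr, ra)" pattern used throughout this
    // file: code is generated backwards (each emit moves _nIns down), so for an
    // LIR node like  c = add a b  with ra != rr the final stream reads, in
    // execution order (register names here are purely illustrative):
    //
    //   mov rax, rsi      ; MR(rr, ra), emitted second, runs first
    //   add rax, rdx      ; emitrr(X64_addrr, rr, rb), emitted first
    //
    // i.e. the result register is first seeded with the left operand, and the
    // two-operand x86 instruction then combines it with the right operand.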
    static bool isImm32(LIns *ins) {
        return ins->isconst() || (ins->isconstq() && isS32(ins->imm64()));
    }

    static int32_t getImm32(LIns *ins) {
        return ins->isconst() ? ins->imm32() : int32_t(ins->imm64());
    }

    // binary op, integer regs, rhs is int32 const
    void Assembler::asm_arith_imm(LIns *ins) {
        LIns *b = ins->oprnd2();
        int32_t imm = getImm32(b);
        LOpcode op = ins->opcode();
        Register rr, ra;
        if (op == LIR_mul) {
            // imul has true 3-addr form, it doesn't clobber ra
            rr = prepResultReg(ins, GpRegs);
            LIns *a = ins->oprnd1();
            ra = findRegFor(a, GpRegs);
            emitrr_imm(X64_imuli, rr, ra, imm);
            return;
        }
        regalloc_unary(ins, GpRegs, rr, ra);
        X64Opcode xop;
        if (isS8(imm)) {
            switch (ins->opcode()) {
            default: TODO(arith_imm8);
            case LIR_iaddp:
            case LIR_add:   xop = X64_addlr8;   break;
            case LIR_and:   xop = X64_andlr8;   break;
            case LIR_or:    xop = X64_orlr8;    break;
            case LIR_sub:   xop = X64_sublr8;   break;
            case LIR_xor:   xop = X64_xorlr8;   break;
            case LIR_qiadd:
            case LIR_qaddp: xop = X64_addqr8;   break;
            case LIR_qiand: xop = X64_andqr8;   break;
            case LIR_qior:  xop = X64_orqr8;    break;
            case LIR_qxor:  xop = X64_xorqr8;   break;
            }
            emitr_imm8(xop, rr, imm);
        } else {
            switch (ins->opcode()) {
            default: TODO(arith_imm);
            case LIR_iaddp:
            case LIR_add:   xop = X64_addlri;   break;
            case LIR_and:   xop = X64_andlri;   break;
            case LIR_or:    xop = X64_orlri;    break;
            case LIR_sub:   xop = X64_sublri;   break;
            case LIR_xor:   xop = X64_xorlri;   break;
            case LIR_qiadd:
            case LIR_qaddp: xop = X64_addqri;   break;
            case LIR_qiand: xop = X64_andqri;   break;
            case LIR_qior:  xop = X64_orqri;    break;
            case LIR_qxor:  xop = X64_xorqri;   break;
            }
            emitr_imm(xop, rr, imm);
        }
        if (rr != ra)
            MR(rr, ra);
    }

    // binary op with integer registers
    void Assembler::asm_arith(LIns *ins) {
        Register rr, ra, rb;
        LOpcode op = ins->opcode();
        if ((op & ~LIR64) >= LIR_lsh && (op & ~LIR64) <= LIR_ush) {
            asm_shift(ins);
            return;
        }
        LIns *b = ins->oprnd2();
        if (isImm32(b)) {
            asm_arith_imm(ins);
            return;
        }
        regalloc_binary(ins, GpRegs, rr, ra, rb);
        X64Opcode xop;
        switch (ins->opcode()) {
        default:
            TODO(asm_arith);
        case LIR_or:    xop = X64_orlrr;    break;
        case LIR_sub:   xop = X64_subrr;    break;
        case LIR_iaddp:
        case LIR_add:   xop = X64_addrr;    break;
        case LIR_and:   xop = X64_andrr;    break;
        case LIR_xor:   xop = X64_xorrr;    break;
        case LIR_mul:   xop = X64_imul;     break;
        case LIR_qxor:  xop = X64_xorqrr;   break;
        case LIR_qior:  xop = X64_orqrr;    break;
        case LIR_qiand: xop = X64_andqrr;   break;
        case LIR_qiadd:
        case LIR_qaddp: xop = X64_addqrr;   break;
        }
        emitrr(xop, rr, rb);
        if (rr != ra)
            MR(rr, ra);
    }

    // binary op with fp registers
    void Assembler::asm_fop(LIns *ins) {
        Register rr, ra, rb;
        regalloc_binary(ins, FpRegs, rr, ra, rb);
        X64Opcode xop;
        switch (ins->opcode()) {
        default:
            TODO(asm_fop);
        case LIR_fdiv: xop = X64_divsd; break;
        case LIR_fmul: xop = X64_mulsd; break;
        case LIR_fadd: xop = X64_addsd; break;
        case LIR_fsub: xop = X64_subsd; break;
        }
        emitprr(xop, rr, rb);
        if (rr != ra) {
            asm_nongp_copy(rr, ra);
        }
    }

    void Assembler::asm_neg_not(LIns *ins) {
        Register rr, ra;
        regalloc_unary(ins, GpRegs, rr, ra);
        NanoAssert(IsGpReg(ra));
        X64Opcode xop;
        if (ins->isop(LIR_not)) {
            xop = X64_not;
        } else {
            xop = X64_neg;
        }
        emitr(xop, rr);
        if (rr != ra)
            MR(rr, ra);
    }
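    // asm_arith_imm above picks the sign-extended imm8 encodings when the
    // constant fits in a byte, which saves three bytes per instruction.  For
    // example (standard x86-64 encodings, shown only for illustration):
    //
    //   sub rax, 8       =>  48 83 E8 08            ; group-1 opcode 83 /5 ib
    //   sub rax, 1000    =>  48 81 E8 E8 03 00 00   ; group-1 opcode 81 /5 id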
    void Assembler::asm_call(LIns *ins) {
        const CallInfo *call = ins->callInfo();
        ArgSize sizes[MAXARGS];
        int argc = call->get_sizes(sizes);
        bool indirect = call->isIndirect();
        if (!indirect) {
            verbose_only(if (_logc->lcbits & LC_Assembly)
                outputf(" %p:", _nIns);
            )
            NIns *target = (NIns*)call->_address;
            // must do underrunProtect before calculating offset
            underrunProtect(8);
            if (isS32(target - _nIns)) {
                emit32(X64_call, target - _nIns);
            } else {
                // can't reach target from here, load imm64 and do an indirect jump
                emit(X64_callrax);
                emit_quad(RAX, (uint64_t)target);
            }
        } else {
            // Indirect call: we assign the address arg to RAX since it's not
            // used for regular arguments, and is otherwise scratch since it's
            // clobbered by the call.
            asm_regarg(ARGSIZE_P, ins->arg(--argc), RAX);
            emit(X64_callrax);
        }

#ifdef _MSC_VER
        int stk_used = 32; // always reserve 32byte shadow area
#else
        int stk_used = 0;
        Register fr = XMM0;
#endif

        int arg_index = 0;
        for (int i = 0; i < argc; i++) {
            int j = argc - i - 1;
            ArgSize sz = sizes[j];
            LIns* arg = ins->arg(j);
            if ((sz & ARGSIZE_MASK_INT) && arg_index < NumArgRegs) {
                // gp arg
                asm_regarg(sz, arg, argRegs[arg_index]);
                arg_index++;
            }
#ifdef _MSC_VER
            else if (sz == ARGSIZE_F && arg_index < NumArgRegs) {
                // double goes in XMM reg # based on overall arg_index
                asm_regarg(sz, arg, Register(XMM0+arg_index));
                arg_index++;
            }
#else
            else if (sz == ARGSIZE_F && fr < XMM8) {
                // double goes in next available XMM register
                asm_regarg(sz, arg, fr);
                fr = nextreg(fr);
            }
#endif
            else {
                asm_stkarg(sz, arg, stk_used);
                stk_used += sizeof(void*);
            }
        }

        if (stk_used > max_stk_used)
            max_stk_used = stk_used;
    }

    void Assembler::asm_regarg(ArgSize sz, LIns *p, Register r) {
        if (sz == ARGSIZE_I) {
            NanoAssert(!p->isQuad());
            if (p->isconst()) {
                emit_quad(r, int64_t(p->imm32()));
                return;
            }
            // sign extend int32 to int64
            emitrr(X64_movsxdr, r, r);
        } else if (sz == ARGSIZE_U) {
            NanoAssert(!p->isQuad());
            if (p->isconst()) {
                emit_quad(r, uint64_t(uint32_t(p->imm32())));
                return;
            }
            // zero extend with 32bit mov, auto-zeros upper 32bits
            emitrr(X64_movlr, r, r);
        }
        /* there is no point in folding an immediate here, because
         * the argument register must be a scratch register and we're
         * just before a call.  Just reserving the register will cause
         * the constant to be rematerialized nearby in asm_restore(),
         * which is the same instruction we would otherwise emit right
         * here, and moving it earlier in the stream provides more scheduling
         * freedom to the cpu. */
        findSpecificRegFor(p, r);
    }

    void Assembler::asm_stkarg(ArgSize sz, LIns *p, int stk_off) {
        NanoAssert(isS8(stk_off));
        if (sz & ARGSIZE_MASK_INT) {
            Register r = findRegFor(p, GpRegs);
            uint64_t xop = X64_movqspr | uint64_t(stk_off) << 56; // movq [rsp+d8], r
            xop |= uint64_t((r&7)<<3) << 40 | uint64_t((r&8)>>1) << 24; // insert r into mod/rm and rex bytes
            emit(xop);
            if (sz == ARGSIZE_I) {
                // extend int32 to int64
                NanoAssert(!p->isQuad());
                emitrr(X64_movsxdr, r, r);
            } else if (sz == ARGSIZE_U) {
                // extend uint32 to uint64
                NanoAssert(!p->isQuad());
                emitrr(X64_movlr, r, r);
            }
        } else {
            TODO(asm_stkarg_non_int);
        }
    }

    void Assembler::asm_promote(LIns *ins) {
        Register rr, ra;
        regalloc_unary(ins, GpRegs, rr, ra);
        NanoAssert(IsGpReg(ra));
        if (ins->isop(LIR_u2q)) {
            emitrr(X64_movlr, rr, ra);   // 32bit mov zeros the upper 32bits of the target
        } else {
            NanoAssert(ins->isop(LIR_i2q));
            emitrr(X64_movsxdr, rr, ra); // sign extend 32->64
        }
    }
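    // Argument-assignment sketch for asm_call/asm_regarg above, assuming the
    // standard ABIs this backend targets (SysV AMD64 on Mac/Linux, Microsoft
    // x64 on Windows).  For a call such as  f(int a, double b, int c):
    //
    //   SysV:   a -> RDI,  b -> XMM0,  c -> RSI
    //           (integer args consume argRegs[] in order, doubles consume
    //           XMM0..XMM7 independently, matching the separate
    //           arg_index/fr counters)
    //   Win64:  a -> RCX,  b -> XMM1,  c -> R8, plus the 32-byte shadow area
    //           (each slot is keyed by parameter position, matching the
    //           Register(XMM0+arg_index) logic, and stk_used starts at 32)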
    // the CVTSI2SD instruction only writes to the low 64bits of the target
    // XMM register, which hinders register renaming and makes dependence
    // chains longer.  So we precede with XORPS to clear the target register.

    void Assembler::asm_i2f(LIns *ins) {
        Register r = prepResultReg(ins, FpRegs);
        Register b = findRegFor(ins->oprnd1(), GpRegs);
        emitprr(X64_cvtsi2sd, r, b); // cvtsi2sd xmmr, b  only writes xmm:0:64
        emitprr(X64_xorps, r, r);    // xorps xmmr,xmmr to break dependency chains
    }

    void Assembler::asm_u2f(LIns *ins) {
        Register r = prepResultReg(ins, FpRegs);
        Register b = findRegFor(ins->oprnd1(), GpRegs);
        NanoAssert(!ins->oprnd1()->isQuad());
        // since oprnd1 value is 32bit, it's okay to zero-extend the value without worrying about clobbering.
        emitprr(X64_cvtsq2sd, r, b); // convert int64 to double
        emitprr(X64_xorps, r, r);    // xorps xmmr,xmmr to break dependency chains
        emitrr(X64_movlr, b, b);     // zero extend u32 to int64
    }

    void Assembler::asm_cmov(LIns *ins) {
        LIns* cond    = ins->oprnd1();
        LIns* iftrue  = ins->oprnd2();
        LIns* iffalse = ins->oprnd3();
        NanoAssert(cond->isCmp());
        NanoAssert((ins->isop(LIR_qcmov) && iftrue->isQuad() && iffalse->isQuad()) ||
                   (ins->isop(LIR_cmov) && !iftrue->isQuad() && !iffalse->isQuad()));
        // this code assumes that neither LD nor MR nor MRcc set any of the condition flags.
        // (This is true on Intel, is it true on all architectures?)
        const Register rr = prepResultReg(ins, GpRegs);
        const Register rf = findRegFor(iffalse, GpRegs & ~rmask(rr));
        X64Opcode xop;
        switch (cond->opcode()) {
        default: TODO(asm_cmov);
        case LIR_qeq:
            xop = X64_cmovqne;
            break;
        }
        emitrr(xop, rr, rf);
        /*const Register rt =*/ findSpecificRegFor(iftrue, rr);
        asm_cmp(cond);
    }

    NIns* Assembler::asm_branch(bool onFalse, LIns *cond, NIns *target) {
        LOpcode condop = cond->opcode();
        if (condop >= LIR_feq && condop <= LIR_fge)
            return asm_fbranch(onFalse, cond, target);

        // we must ensure there's room for the instr before calculating
        // the offset.  and the offset determines the opcode (8bit or 32bit)
        underrunProtect(8);
        if (target && isS8(target - _nIns)) {
            static const X64Opcode j8[] = {
                X64_je8,  // eq
                X64_jl8, X64_jg8, X64_jle8, X64_jge8,   // lt,  gt,  le,  ge
                X64_jb8, X64_ja8, X64_jbe8, X64_jae8    // ult, ugt, ule, uge
            };
            uint64_t xop = j8[(condop & ~LIR64) - LIR_eq];
            xop ^= onFalse ? (uint64_t)X64_jneg8 : 0;
            emit8(xop, target - _nIns);
        } else {
            static const X64Opcode j32[] = {
                X64_je,   // eq
                X64_jl, X64_jg, X64_jle, X64_jge,       // lt,  gt,  le,  ge
                X64_jb, X64_ja, X64_jbe, X64_jae        // ult, ugt, ule, uge
            };
            uint64_t xop = j32[(condop & ~LIR64) - LIR_eq];
            xop ^= onFalse ? (uint64_t)X64_jneg : 0;
            emit32(xop, target ? target - _nIns : 0);
        }
        NIns *patch = _nIns; // addr of instr to patch
        asm_cmp(cond);
        return patch;
    }

    void Assembler::asm_cmp(LIns *cond) {
        LIns *b = cond->oprnd2();
        if (isImm32(b)) {
            asm_cmp_imm(cond);
            return;
        }
        LIns *a = cond->oprnd1();
        Register ra, rb;
        if (a != b) {
            Reservation *resva, *resvb;
            findRegFor2(GpRegs, a, resva, b, resvb);
            ra = resva->reg;
            rb = resvb->reg;
        } else {
            // optimize-me: this will produce a const result!
            ra = rb = findRegFor(a, GpRegs);
        }
        LOpcode condop = cond->opcode();
        emitrr(condop & LIR64 ? X64_cmpqr : X64_cmplr, ra, rb);
    }
    void Assembler::asm_cmp_imm(LIns *cond) {
        LIns *a = cond->oprnd1();
        LIns *b = cond->oprnd2();
        Register ra = findRegFor(a, GpRegs);
        int32_t imm = getImm32(b);
        if (isS8(imm)) {
            X64Opcode xop = (cond->opcode() & LIR64) ? X64_cmpqr8 : X64_cmplr8;
            emitr_imm8(xop, ra, imm);
        } else {
            X64Opcode xop = (cond->opcode() & LIR64) ? X64_cmpqri : X64_cmplri;
            emitr_imm(xop, ra, imm);
        }
    }

    // compiling floating point branches
    // discussion in https://bugzilla.mozilla.org/show_bug.cgi?id=443886
    //
    //  fucom/p/pp: c3 c2 c0   jae ja    jbe jb je jne
    //  ucomisd:     Z  P  C   !C  !C&!Z C|Z C  Z  !Z
    //              -- -- --   --  ----- --- -- -- --
    //  unordered    1  1  1             T   T  T
    //  greater >    0  0  0   T   T               T
    //  less    <    0  0  1             T   T     T
    //  equal   =    1  0  0   T         T      T
    //
    //  here's the cases, using conditionals:
    //
    //  branch  >=  >   <=       <        =
    //  ------  --- --- ---      ---      ---
    //  LIR_jt  jae ja  swap+jae swap+ja  jp over je
    //  LIR_jf  jb  jbe swap+jb  swap+jbe jne+jp

    NIns* Assembler::asm_fbranch(bool onFalse, LIns *cond, NIns *target) {
        LOpcode condop = cond->opcode();
        NIns *patch;
        LIns *a = cond->oprnd1();
        LIns *b = cond->oprnd2();
        if (condop == LIR_feq) {
            if (onFalse) {
                // branch if unordered or !=
                underrunProtect(16); // 12 needed, round up for overhang
                emit32(X64_jp, target ? target - _nIns : 0);
                emit32(X64_jne, target ? target - _nIns : 0);
                patch = _nIns;
            } else {
                //     jp skip (2byte)
                //     jeq target
                // skip: ...
                underrunProtect(16); // 7 needed but we write 2 instr
                NIns *skip = _nIns;
                emit32(X64_je, target ? target - _nIns : 0);
                patch = _nIns;
                emit8(X64_jp8, skip - _nIns);
            }
        } else {
            if (condop == LIR_flt) {
                condop = LIR_fgt;
                LIns *t = a; a = b; b = t;
            } else if (condop == LIR_fle) {
                condop = LIR_fge;
                LIns *t = a; a = b; b = t;
            }
            X64Opcode xop;
            if (condop == LIR_fgt)
                xop = onFalse ? X64_jbe : X64_ja;
            else // LIR_fge
                xop = onFalse ? X64_jb : X64_jae;
            underrunProtect(8);
            emit32(xop, target ? target - _nIns : 0);
            patch = _nIns;
        }
        fcmp(a, b);
        return patch;
    }

    void Assembler::asm_fcond(LIns *ins) {
        LOpcode op = ins->opcode();
        LIns *a = ins->oprnd1();
        LIns *b = ins->oprnd2();
        if (op == LIR_feq) {
            // result = ZF & !PF, must do logic on flags
            // r = al|bl|cl|dl, can only use rh without rex prefix
            Register r = prepResultReg(ins, 1<<RAX|1<<RCX|1<<RDX|1<<RBX);
            // ...
        }
        fcmp(a, b);
    }

    void Assembler::fcmp(LIns *a, LIns *b) {
        Reservation *resva, *resvb;
        findRegFor2(FpRegs, a, resva, b, resvb);
        emitprr(X64_ucomisd, resva->reg, resvb->reg);
    }
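    // Emitted sequences for the LIR_feq branch cases handled above, shown in
    // execution order (remember emission runs backwards).  NaN makes ucomisd
    // set ZF, PF and CF, so the parity flag distinguishes "equal" from
    // "unordered":
    //
    //   LIR_jf(feq):   ucomisd ra, rb        LIR_jt(feq):   ucomisd ra, rb
    //                  jne     target                       jp      skip
    //                  jp      target                       je      target
    //                                                     skip:
    //
    // The jne/jp pair is the shape nPatchBranch() later looks for, so that both
    // displacements get retargeted when the branch is patched.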
    void Assembler::asm_restore(LIns *ins, Reservation *resv, Register r) {
        (void) r;
        if (ins->isop(LIR_alloc)) {
            int d = disp(resv);
            emitrm(X64_leaqrm, r, d, FP);
        }
        else if (ins->isconst()) {
            if (!resv->arIndex) {
                ins->resv()->clear();
            }
            // unsafe to use xor r,r for zero because it changes cc's
            emit_int(r, ins->imm32());
        }
        else if (ins->isconstq() && IsGpReg(r)) {
            if (!resv->arIndex) {
                ins->resv()->clear();
            }
            // unsafe to use xor r,r for zero because it changes cc's
            emit_quad(r, ins->imm64());
        }
        else {
            int d = findMemFor(ins);
            if (IsFpReg(r)) {
                NanoAssert(ins->isQuad());
                // load 64bits into XMM.  don't know if double or int64, assume double.
                emitprm(X64_movsdrm, r, d, FP);
            } else if (ins->isQuad()) {
                emitrm(X64_movqrm, r, d, FP);
            } else {
                emitrm(X64_movlrm, r, d, FP);
            }
        }
        verbose_only( if (_logc->lcbits & LC_RegAlloc) {
            outputForEOL(" <= restore %s", _thisfrag->lirbuf->names->formatRef(ins));
        })
    }

    void Assembler::asm_cond(LIns *ins) {
        LOpcode op = ins->opcode();
        // unlike x86-32, with a rex prefix we can use any GP register as an 8bit target
        Register r = prepResultReg(ins, GpRegs);
        // SETcc only sets low 8 bits, so extend
        emitrr8(X64_movzx8, r, r);
        X64Opcode xop;
        switch (op) {
        default:
            TODO(cond);
        case LIR_qeq:
        case LIR_eq:    xop = X64_sete;  break;
        case LIR_qlt:
        case LIR_lt:    xop = X64_setl;  break;
        case LIR_qle:
        case LIR_le:    xop = X64_setle; break;
        case LIR_qgt:
        case LIR_gt:    xop = X64_setg;  break;
        case LIR_qge:
        case LIR_ge:    xop = X64_setge; break;
        case LIR_qult:
        case LIR_ult:   xop = X64_setb;  break;
        case LIR_qule:
        case LIR_ule:   xop = X64_setbe; break;
        case LIR_qugt:
        case LIR_ugt:   xop = X64_seta;  break;
        case LIR_quge:
        case LIR_uge:   xop = X64_setae; break;
        case LIR_ov:    xop = X64_seto;  break;
        }
        emitr8(xop, r);
        asm_cmp(ins);
    }

    void Assembler::asm_ret(LIns *ins) {
        JMP(_epilogue);
        assignSavedRegs();
        LIns *value = ins->oprnd1();
        Register r = ins->isop(LIR_ret) ? RAX : XMM0;
        findSpecificRegFor(value, r);
    }

    void Assembler::asm_nongp_copy(Register d, Register s) {
        if (!IsFpReg(d) && IsFpReg(s)) {
            // gpr <- xmm: use movq r/m64, xmm (66 REX.W 0F 7E /r)
            emitprr(X64_movqrx, s, d);
        } else if (IsFpReg(d) && IsFpReg(s)) {
            // xmm <- xmm: use movaps. movsd r,r causes partial register stall
            emitrr(X64_movapsr, d, s);
        } else {
            // xmm <- gpr: use movq xmm, r/m64 (66 REX.W 0F 6E /r)
            emitprr(X64_movqxr, d, s);
        }
    }

    void Assembler::regalloc_load(LIns *ins, Register &rr, int32_t &dr, Register &rb) {
        dr = ins->disp();
        LIns *base = ins->oprnd1();
        rb = getBaseReg(base, dr, BaseRegs);
        Reservation *resv = getresv(ins);
        if (resv && (rr = resv->reg) != UnknownReg) {
            // keep already assigned register
            freeRsrcOf(ins, false);
        } else {
            // use a gpr in case we're copying a non-double
            rr = prepResultReg(ins, GpRegs & ~rmask(rb));
        }
    }

    void Assembler::asm_load64(LIns *ins) {
        Register rr, rb;
        int32_t dr;
        regalloc_load(ins, rr, dr, rb);
        if (IsGpReg(rr)) {
            // general 64bit load, 32bit const displacement
            emitrm(X64_movqrm, rr, dr, rb);
        } else {
            // load 64bits into XMM.  don't know if double or int64, assume double.
            emitprm(X64_movsdrm, rr, dr, rb);
        }
    }

    void Assembler::asm_ld(LIns *ins) {
        NanoAssert(!ins->isQuad());
        Register r, b;
        int32_t d;
        regalloc_load(ins, r, d, b);
        emitrm(X64_movlrm, r, d, b);
    }

    void Assembler::asm_store64(LIns *value, int d, LIns *base) {
        NanoAssert(value->isQuad());
        Register b = getBaseReg(base, d, BaseRegs);

        // if we have to choose a register, use a GPR, but not the base reg
        Reservation *resv = getresv(value);
        Register r;
        if (!resv || (r = resv->reg) == UnknownReg) {
            r = findRegFor(value, GpRegs & ~rmask(b));
        }

        if (IsGpReg(r)) {
            // gpr store
            emitrm(X64_movqmr, r, d, b);
        } else {
            // xmm store
            emitprm(X64_movsdmr, r, d, b);
        }
    }

    void Assembler::asm_store32(LIns *value, int d, LIns *base) {
        NanoAssert(!value->isQuad());
        Register b = getBaseReg(base, d, BaseRegs);
        Register r = findRegFor(value, GpRegs & ~rmask(b));

        // store 32bits to 64bit addr.  use rex so we can use all 16 regs
        emitrm(X64_movlmr, r, d, b);
    }

    // generate a 32bit constant, must not affect condition codes!
    void Assembler::emit_int(Register r, int32_t v) {
        NanoAssert(IsGpReg(r));
        emitr_imm(X64_movi, r, v);
    }

    // generate a 64bit constant, must not affect condition codes!
    void Assembler::emit_quad(Register r, uint64_t v) {
        NanoAssert(IsGpReg(r));
        if (isU32(v)) {
            emit_int(r, int32_t(v));
            return;
        }
        if (isS32(v)) {
            // safe for sign-extension 32->64
            emitr_imm(X64_movqi32, r, int32_t(v));
            return;
        }
        underrunProtect(8+8); // imm64 + worst case instr len
        ((uint64_t*)_nIns)[-1] = v;
        _nIns -= 8;
        _nvprof("x64-bytes", 8);
        emitr(X64_movqi, r);
    }
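    // emit_quad above picks the cheapest way to materialize a 64bit constant;
    // for example (values chosen only for illustration):
    //
    //   0x00000000CAFEBABE  ->  mov r32, imm32   (upper 32 bits auto-zeroed)
    //   0xFFFFFFFF80000000  ->  mov r64, imm32   (imm32 sign-extended to 64)
    //   0x0123456789ABCDEF  ->  mov r64, imm64   (10-byte form: opcode followed
    //                                             by a full 8-byte immediate)
    //
    // None of these paths clears the register with xor, preserving the
    // "must not affect condition codes" contract that asm_restore() relies on.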
    void Assembler::asm_int(LIns *ins) {
        Register r = prepResultReg(ins, GpRegs);
        int32_t v = ins->imm32();
        if (v == 0) {
            // special case for zero
            emitrr(X64_xorrr, r, r);
            return;
        }
        emit_int(r, v);
    }

    void Assembler::asm_quad(LIns *ins) {
        uint64_t v = ins->imm64();
        RegisterMask allow = v == 0 ? GpRegs|FpRegs : GpRegs;
        Register r = prepResultReg(ins, allow);
        if (v == 0) {
            if (IsGpReg(r)) {
                // special case for zero
                emitrr(X64_xorrr, r, r);
            } else {
                // xorps for xmm
                emitprr(X64_xorps, r, r);
            }
        } else {
            emit_quad(r, v);
        }
    }

    void Assembler::asm_qjoin(LIns*) {
        TODO(asm_qjoin);
    }

    Register Assembler::asm_prep_fcall(Reservation*, LIns *ins) {
        return prepResultReg(ins, rmask(XMM0));
    }

    void Assembler::asm_param(LIns *ins) {
        uint32_t a = ins->paramArg();
        uint32_t kind = ins->paramKind();
        if (kind == 0) {
            // ordinary param
            // first six args always in registers for mac x64
            if (a < 6) {
                // incoming arg in register
                prepResultReg(ins, rmask(argRegs[a]));
            } else {
                // todo: support stack based args, arg 0 is at [FP+off] where off
                // is the # of regs to be pushed in genProlog()
                TODO(asm_param_stk);
            }
        } else {
            // saved param
            prepResultReg(ins, rmask(savedRegs[a]));
        }
    }

    // register allocation for 2-address style unary ops of the form R = (op) R
    void Assembler::regalloc_unary(LIns *ins, RegisterMask allow, Register &rr, Register &ra) {
        LIns *a = ins->oprnd1();
        rr = prepResultReg(ins, allow);
        Reservation* rA = getresv(a);
        // if this is last use of a in reg, we can re-use result reg
        if (rA == 0 || (ra = rA->reg) == UnknownReg) {
            ra = findSpecificRegFor(a, rr);
        } else {
            // rA already has a register assigned.  caller must emit a copy
            // to rr once instr code is generated.  (ie mov rr,ra ; op rr)
        }
    }
    static const AVMPLUS_ALIGN16(int64_t) negateMask[] = { 0x8000000000000000LL, 0 };

    void Assembler::asm_fneg(LIns *ins) {
        Register rr, ra;
        if (isS32((uintptr_t)negateMask) || isS32((NIns*)negateMask - _nIns)) {
            regalloc_unary(ins, FpRegs, rr, ra);
            if (isS32((uintptr_t)negateMask)) {
                // builtin code is in bottom or top 2GB addr space, use absolute addressing
                underrunProtect(4+8);
                *((int32_t*)(_nIns -= 4)) = (int32_t)(uintptr_t)negateMask;
                _nvprof("x64-bytes", 4);
                uint64_t xop = X64_xorpsa | uint64_t((rr&7)<<3)<<48; // put rr[0:2] into mod/rm byte
                xop = rexrb(xop, rr, (Register)0);                   // put rr[3] into rex byte
                emit(xop);
            } else {
                // jit code is within +/-2GB of builtin code, use rip-relative
                underrunProtect(4+8);
                int32_t d = (int32_t) ((NIns*)negateMask - _nIns);
                *((int32_t*)(_nIns -= 4)) = d;
                _nvprof("x64-bytes", 4);
                emitrr(X64_xorpsm, rr, (Register)0);
            }
            if (ra != rr)
                asm_nongp_copy(rr, ra);
        } else {
            // this is just hideous - can't use RIP-relative load, can't use
            // absolute-address load, and can't move imm64 const to XMM.
            // so do it all in a GPR.  hrmph.
            rr = prepResultReg(ins, GpRegs);
            ra = findRegFor(ins->oprnd1(), GpRegs & ~rmask(rr));
            emitrr(X64_xorqrr, rr, ra);   // xor rr, ra
            emit_quad(rr, negateMask[0]); // mov rr, 0x8000000000000000
        }
    }

    void Assembler::asm_qhi(LIns*) {
        TODO(asm_qhi);
    }

    void Assembler::asm_qlo(LIns *ins) {
        Register rr, ra;
        regalloc_unary(ins, GpRegs, rr, ra);
        NanoAssert(IsGpReg(ra));
        emitrr(X64_movlr, rr, ra); // 32bit mov zeros the upper 32bits of the target
    }

    void Assembler::asm_spill(Register rr, int d, bool /*pop*/, bool quad) {
        if (d) {
            if (!IsFpReg(rr)) {
                X64Opcode xop = quad ? X64_movqmr : X64_movlmr;
                emitrm(xop, rr, d, FP);
            } else {
                // store 64bits from XMM to memory
                NanoAssert(quad);
                emitprm(X64_movsdmr, rr, d, FP);
            }
        }
    }

    NIns* Assembler::genPrologue() {
        // activation frame is 4 bytes per entry even on 64bit machines
        uint32_t stackNeeded = max_stk_used + _activation.highwatermark * 4;

        uint32_t stackPushed =
            sizeof(void*) + // returnaddr
            sizeof(void*);  // ebp
        uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
        uint32_t amt = aligned - stackPushed;

        // Reserve stackNeeded bytes, padded
        // to preserve NJ_ALIGN_STACK-byte alignment.
        if (amt) {
            if (isS8(amt))
                emitr_imm8(X64_subqr8, RSP, amt);
            else
                emitr_imm(X64_subqri, RSP, amt);
        }

        verbose_only( outputAddr=true; asm_output("[patch entry]"); )
        NIns *patchEntry = _nIns;
        MR(FP, RSP);          // Establish our own FP.
        emitr(X64_pushr, FP); // Save caller's FP.

        return patchEntry;
    }

    NIns* Assembler::genEpilogue() {
        // mov rsp, rbp
        // pop rbp
        // ret
        max_stk_used = 0;
        emit(X64_ret);
        emitr(X64_popr, RBP);
        MR(RSP, RBP);
        return _nIns;
    }
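    // Frame-size sketch for genPrologue/genEpilogue above, assuming
    // NJ_ALIGN_STACK is 16 on this target.  With max_stk_used plus activation
    // entries needing, say, 40 bytes:
    //
    //   stackNeeded = 40
    //   stackPushed = 16            (return address + saved rbp)
    //   aligned     = alignUp(56, 16) = 64
    //   amt         = 64 - 16 = 48  ->  sub rsp, 48
    //
    // Since amt stays a multiple of 16 and rsp is 16-byte aligned right after
    // "push rbp; mov rbp, rsp", the outgoing stack stays aligned for calls.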
    void Assembler::nRegisterResetAll(RegAlloc &a) {
        // add scratch registers to our free list for the allocator
        a.clear();
        a.used = 0;
#ifdef _MSC_VER
        a.free = 0x001fffcf; // rax-rbx, rsi, rdi, r8-r15, xmm0-xmm5
#else
        a.free = 0xffffffff & ~(1<<RSP | 1<<RBP);
#endif
    }

    void Assembler::nPatchBranch(NIns *patch, NIns *target) {
        NIns *next = 0;
        if (patch[0] == 0xE9) {
            // jmp disp32
            next = patch+5;
        } else if (patch[0] == 0x0F && (patch[1] & 0xF0) == 0x80) {
            // jcc disp32
            next = patch+6;
        } else {
            next = 0;
            TODO(unknown_patch);
        }
        NanoAssert(isS32(target - next));
        ((int32_t*)next)[-1] = int32_t(target - next);
        if (next[0] == 0x0F && next[1] == 0x8A) {
            // code is jne<target>,jp<target>, for LIR_jf(feq)
            // we just patched the jne, now patch the jp.
            next += 6;
            NanoAssert(((int32_t*)next)[-1] == 0);
            NanoAssert(isS32(target - next));
            ((int32_t*)next)[-1] = int32_t(target - next);
        }
    }

    Register Assembler::nRegisterAllocFromSet(RegisterMask set) {
#if defined _WIN64
        DWORD tr;
        _BitScanForward(&tr, set);
        _allocator.free &= ~rmask((Register)tr);
        return (Register) tr;
#else
        // gcc asm syntax
        Register r;
        asm("bsf %1, %%eax\n\t"
            "btr %%eax, %2\n\t"
            "movl %%eax, %0\n\t"
            : "=m"(r) : "m"(set), "m"(_allocator.free) : "%eax", "memory");
        (void)set;
        return r;
#endif
    }

    void Assembler::nFragExit(LIns*) {
        TODO(nFragExit);
    }

    void Assembler::nInit(AvmCore*) {
    }

    void Assembler::underrunProtect(ptrdiff_t bytes) {
        NanoAssertMsg(bytes<=LARGEST_UNDERRUN_PROT, "constant LARGEST_UNDERRUN_PROT is too small");
        NIns *pc = _nIns;
        NIns *top = _inExit ? this->exitStart : this->codeStart;

#if PEDANTIC
        // pedanticTop is based on the last call to underrunProtect; any time we call
        // underrunProtect and would use more than what's already protected, then insert
        // a page break jump.  Sometimes, this will be to a new page, usually it's just
        // the next instruction
        NanoAssert(pedanticTop >= top);
        if (pc - bytes < pedanticTop) {
            // no page break required, but insert a far branch anyway just to be difficult
            const int br_size = 8; // opcode + 32bit addr
            if (pc - bytes - br_size < top) {
                // really do need a page break
                verbose_only(if (_logc->lcbits & LC_Assembly) outputf("newpage %p:", pc);)
                codeAlloc();
            }
            // now emit the jump, but make sure we won't need another page break.
            // we're pedantic, but not *that* pedantic.
            pedanticTop = _nIns - br_size;
            JMP(pc);
            pedanticTop = _nIns - bytes;
        }
#else
        if (pc - bytes < top) {
            verbose_only(if (_logc->lcbits & LC_Assembly) outputf("newpage %p:", pc);)
            codeAlloc();
            // this jump will call underrunProtect again, but since we're on a new
            // page, nothing will happen.
            JMP(pc);
        }
#endif
    }

    RegisterMask Assembler::hint(LIns *, RegisterMask allow) {
        return allow;
    }

    void Assembler::nativePageSetup() {
        if (!_nIns) {
            codeAlloc();
            IF_PEDANTIC( pedanticTop = _nIns; )
        }
        if (!_nExitIns) {
            codeAlloc(true);
        }
    }

    void Assembler::nativePageReset() {}

} // namespace nanojit

#endif // FEATURE_NANOJIT && NANOJIT_X64