gecko/js/src/nanojit/NativeX64.cpp

/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
/* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is [Open Source Virtual Machine].
*
* The Initial Developer of the Original Code is
* Adobe System Incorporated.
* Portions created by the Initial Developer are Copyright (C) 2009
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Adobe AS3 Team
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#include "nanojit.h"
// uncomment this to enable _vprof/_nvprof macros
//#define DOPROF
#include "../vprof/vprof.h"
#if defined FEATURE_NANOJIT && defined NANOJIT_X64
/*
completion
- 64bit branch offsets
- finish cmov/qcmov with other conditions
- validate asm_cond with other conditions
better code
- put R12 back in play as a base register
- no-disp addr modes (except RBP/R13)
- disp64 branch/call
- spill gp values to xmm registers?
- prefer xmm registers for copies since gprs are in higher demand?
- stack arg doubles
- stack based LIR_param
tracing
- asm_qjoin
- asm_qhi
- nFragExit
*/
namespace nanojit
{
const Register Assembler::retRegs[] = { RAX };
#ifdef _MSC_VER
const Register Assembler::argRegs[] = { RCX, RDX, R8, R9 };
const Register Assembler::savedRegs[] = { RBX, RSI, RDI, R12, R13, R14, R15 };
#else
const Register Assembler::argRegs[] = { RDI, RSI, RDX, RCX, R8, R9 };
const Register Assembler::savedRegs[] = { RBX, R12, R13, R14, R15 };
#endif
const char *regNames[] = {
"rax", "rcx", "rdx", "rbx", "rsp", "rbp", "rsi", "rdi",
"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
};
#ifdef _DEBUG
#define TODO(x) todo(#x)
static void todo(const char *s) {
verbose_only( avmplus::AvmLog("%s",s); )
NanoAssertMsgf(false, "%s", s);
}
#else
#define TODO(x)
#endif
// MODRM and restrictions:
// memory access modes != 11 require SIB if base&7 == 4 (RSP or R12)
// mode 00 with base&7 == 5 means RIP+disp32 (RBP or R13), use mode 01 disp8=0 instead
// rex prefix required to use RSP-R15 as 8bit registers in mod/rm8 modes.
// take R12 out of play as a base register because it requires a SIB byte, like RSP
const RegisterMask BaseRegs = GpRegs & ~rmask(R12);
static inline int oplen(uint64_t op) {
return op & 255;
}
// encode 2-register rex prefix. dropped if none of its bits are set.
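// Illustrative note (not in the original source): the rex byte is 0100WRXB,
// so (r&8)>>1 supplies the R bit (0x04, extending the modrm reg field) and
// (b&8)>>3 supplies the B bit (0x01, extending the r/m field). For example,
// a 0x40 template with a hypothetical r = R9, b = RDX becomes 0x44; with both
// registers below R8 it stays 0x40 and the op - 1 path drops the prefix by
// decrementing the length byte held in the low 8 bits of op.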
static inline uint64_t rexrb(uint64_t op, Register r, Register b) {
int shift = 64 - 8*oplen(op);
uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3);
return rex != 0x40 ? op | rex << shift : op - 1;
}
// encode 2-register rex prefix. dropped if none of its bits are set, but
// keep REX if b >= rsp, to allow uniform use of all 16 8bit registers
static inline uint64_t rexrb8(uint64_t op, Register r, Register b) {
int shift = 64 - 8*oplen(op);
uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3);
return ((rex | (b & ~3)) != 0x40) ? (op | (rex << shift)) : op - 1;
}
// encode 2-register rex prefix that follows a mandatory prefix (66,F2,F3)
// [prefix][rex][opcode]
static inline uint64_t rexprb(uint64_t op, Register r, Register b) {
int shift = 64 - 8*oplen(op) + 8;
uint64_t rex = ((op >> shift) & 255) | ((r&8)>>1) | ((b&8)>>3);
// to drop rex, we replace rex with the mandatory prefix and decrement the length
return rex != 0x40 ? op | rex << shift :
((op & ~(255LL<<shift)) | (op>>(shift-8)&255) << shift) - 1;
}
// [rex][opcode][mod-rr]
static inline uint64_t mod_rr(uint64_t op, Register r, Register b) {
return op | uint64_t((r&7)<<3 | (b&7))<<56;
}
static inline uint64_t mod_disp32(uint64_t op, Register r, Register b, int32_t d) {
NanoAssert(IsGpReg(r) && IsGpReg(b));
NanoAssert((b & 7) != 4); // using RSP or R12 as base requires SIB
if (isS8(d)) {
// op is: 0x[disp32=0][mod=2:r:b][op][rex][len]
NanoAssert((((op>>24)&255)>>6) == 2); // disp32 mode
int len = oplen(op);
op = (op & ~0xff000000LL) | (0x40 | (r&7)<<3 | (b&7))<<24; // replace mod
return op<<24 | int64_t(d)<<56 | (len-3); // shrink disp, add disp8
} else {
// op is: 0x[disp32][mod][op][rex][len]
return op | int64_t(d)<<32 | uint64_t((r&7)<<3 | (b&7))<<24;
}
}
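// Illustrative note (not in the original source): opcodes are packed into a
// uint64_t with the encoded length in the low byte and the instruction bytes
// above it, the most-significant byte landing last in memory (emit() stores
// the value little-endian and then backs _nIns up by the length). For the
// 7-byte [rex][opcode][modrm][disp32] form used above, rex sits in bits 8..15,
// the opcode in 16..23, modrm in 24..31 and the disp32 in 32..63; the disp8
// path shifts everything up three bytes (op<<24), places the 8-bit
// displacement in the top byte, and records a length of len-3.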
#ifdef NJ_VERBOSE
void Assembler::dis(NIns *p, int bytes) {
char b[32], *s = b; // room for 8 hex bytes plus null
*s++ = ' ';
for (NIns *end = p + bytes; p < end; p++) {
VMPI_sprintf(s, "%02x ", *p);
s += 3;
}
*s = 0;
asm_output("%s", b);
}
#endif
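// emit() writes the whole packed uint64_t just below _nIns and then backs
// _nIns up by the encoded length, so only the top 'len' bytes become part of
// the (backwards-grown) code stream; the unused low bytes are overwritten by
// later emits. Sketch (illustrative): for a 3-byte instruction the store
// covers _nIns-8.._nIns-1 and, after _nIns -= 3, the instruction occupies the
// last three of those bytes.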
void Assembler::emit(uint64_t op) {
int len = oplen(op);
// we only move _nIns back by len bytes, but we write a full 8
// bytes, so we need to protect all 8 so we don't stomp the page
// header or the end of the preceding page (might segfault)
underrunProtect(8);
((int64_t*)_nIns)[-1] = op;
_nIns -= len; // move pointer by length encoded in opcode
_nvprof("x64-bytes", len);
verbose_only( if (_logc->lcbits & LC_Assembly) dis(_nIns, len); )
}
void Assembler::emit8(uint64_t op, int64_t v) {
NanoAssert(isS8(v));
emit(op | uint64_t(v)<<56);
}
void Assembler::emit32(uint64_t op, int64_t v) {
NanoAssert(isS32(v));
emit(op | uint64_t(uint32_t(v))<<32);
}
// 2-register modrm32 form
void Assembler::emitrr(uint64_t op, Register r, Register b) {
emit(rexrb(mod_rr(op, r, b), r, b));
}
// 2-register modrm8 form (8 bit operand size)
void Assembler::emitrr8(uint64_t op, Register r, Register b) {
emit(rexrb8(mod_rr(op, r, b), r, b));
}
// same as emitrr, but with a prefix byte
void Assembler::emitprr(uint64_t op, Register r, Register b) {
emit(rexprb(mod_rr(op, r, b), r, b));
}
// disp32 modrm form, when the disp fits in the instruction (opcode is 1-3 bytes)
void Assembler::emitrm(uint64_t op, Register r, int32_t d, Register b) {
emit(rexrb(mod_disp32(op, r, b, d), r, b));
}
// disp32 modrm form when the disp must be written separately (opcode is 4+ bytes)
void Assembler::emitprm(uint64_t op, Register r, int32_t d, Register b) {
if (isS8(d)) {
NanoAssert(((op>>56)&0xC0) == 0x80); // make sure mod bits == 2 == disp32 mode
underrunProtect(1+8);
*(--_nIns) = (NIns) d;
_nvprof("x64-bytes", 1);
op ^= 0xC000000000000000LL; // change mod bits to 1 == disp8 mode
} else {
underrunProtect(4+8); // room for displ plus fullsize op
*((int32_t*)(_nIns -= 4)) = d;
_nvprof("x64-bytes", 4);
}
emitprr(op, r, b);
}
void Assembler::emitrr_imm(uint64_t op, Register r, Register b, int32_t imm) {
NanoAssert(IsGpReg(r) && IsGpReg(b));
underrunProtect(4+8); // room for imm plus fullsize op
*((int32_t*)(_nIns -= 4)) = imm;
_nvprof("x64-bytes", 4);
emitrr(op, r, b);
}
// op = [rex][opcode][modrm][imm8]
void Assembler::emitr_imm8(uint64_t op, Register b, int32_t imm8) {
NanoAssert(IsGpReg(b) && isS8(imm8));
op |= uint64_t(imm8)<<56 | uint64_t(b&7)<<48; // modrm is 2nd to last byte
emit(rexrb(op, (Register)0, b));
}
void Assembler::MR(Register d, Register s) {
NanoAssert(IsGpReg(d) && IsGpReg(s));
emitrr(X64_movqr, d, s);
}
void Assembler::JMP(NIns *target) {
if (!target || isS32(target - _nIns)) {
underrunProtect(8); // must do this before calculating offset
if (target && isS8(target - _nIns)) {
emit8(X64_jmp8, target - _nIns);
} else {
emit32(X64_jmp, target ? target - _nIns : 0);
}
} else {
TODO(jmp64);
}
}
// register allocation for 2-address style ops of the form R = R (op) B
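// Sketch of the resulting code, in execution order (illustrative):
//     mov rr, ra      ; register copy, only emitted when rr != ra
//     op  rr, rb      ; the 2-address operation
// Because code is generated backwards, callers such as asm_arith and asm_fop
// emit the op first and the copy second.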
void Assembler::regalloc_binary(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb) {
rb = UnknownReg;
LIns *a = ins->oprnd1();
LIns *b = ins->oprnd2();
if (a != b) {
rb = findRegFor(b, allow);
allow &= ~rmask(rb);
}
rr = prepResultReg(ins, allow);
Reservation* rA = getresv(a);
// if this is last use of a in reg, we can re-use result reg
if (rA == 0 || (ra = rA->reg) == UnknownReg) {
ra = findSpecificRegFor(a, rr);
} else {
// rA already has a register assigned
}
if (a == b) {
rb = ra;
}
}
void Assembler::asm_qbinop(LIns *ins) {
asm_arith(ins);
}
void Assembler::asm_shift(LIns *ins) {
// shifts require RCX for the shift count
LIns *b = ins->oprnd2();
if (b->isconst()) {
asm_shift_imm(ins);
return;
}
Register rr, ra;
if (b != ins->oprnd1()) {
findSpecificRegFor(b, RCX);
regalloc_unary(ins, GpRegs & ~rmask(RCX), rr, ra);
} else {
// a == b means both must be in RCX
regalloc_unary(ins, rmask(RCX), rr, ra);
}
X64Opcode xop;
switch (ins->opcode()) {
default:
TODO(asm_shift);
case LIR_qursh: xop = X64_shrq; break;
case LIR_qirsh: xop = X64_sarq; break;
case LIR_qilsh: xop = X64_shlq; break;
case LIR_ush: xop = X64_shr; break;
case LIR_rsh: xop = X64_sar; break;
case LIR_lsh: xop = X64_shl; break;
}
emitr(xop, rr);
if (rr != ra)
MR(rr, ra);
}
void Assembler::asm_shift_imm(LIns *ins) {
Register rr, ra;
regalloc_unary(ins, GpRegs, rr, ra);
X64Opcode xop;
switch (ins->opcode()) {
default: TODO(shiftimm);
case LIR_qursh: xop = X64_shrqi; break;
case LIR_qirsh: xop = X64_sarqi; break;
case LIR_qilsh: xop = X64_shlqi; break;
case LIR_ush: xop = X64_shri; break;
case LIR_rsh: xop = X64_sari; break;
case LIR_lsh: xop = X64_shli; break;
}
int shift = ins->oprnd2()->imm32() & 255;
emit8(rexrb(xop | uint64_t(rr&7)<<48, (Register)0, rr), shift);
if (rr != ra)
MR(rr, ra);
}
static bool isImm32(LIns *ins) {
return ins->isconst() || (ins->isconstq() && isS32(ins->imm64()));
}
static int32_t getImm32(LIns *ins) {
return ins->isconst() ? ins->imm32() : int32_t(ins->imm64());
}
// binary op, integer regs, rhs is int32 const
void Assembler::asm_arith_imm(LIns *ins) {
LIns *b = ins->oprnd2();
int32_t imm = getImm32(b);
LOpcode op = ins->opcode();
Register rr, ra;
if (op == LIR_mul) {
// imul has true 3-addr form, it doesn't clobber ra
rr = prepResultReg(ins, GpRegs);
LIns *a = ins->oprnd1();
ra = findRegFor(a, GpRegs);
emitrr_imm(X64_imuli, rr, ra, imm);
return;
}
regalloc_unary(ins, GpRegs, rr, ra);
X64Opcode xop;
if (isS8(imm)) {
switch (ins->opcode()) {
default: TODO(arith_imm8);
case LIR_iaddp:
case LIR_add: xop = X64_addlr8; break;
case LIR_and: xop = X64_andlr8; break;
case LIR_or: xop = X64_orlr8; break;
case LIR_sub: xop = X64_sublr8; break;
case LIR_xor: xop = X64_xorlr8; break;
case LIR_qiadd:
case LIR_qaddp: xop = X64_addqr8; break;
case LIR_qiand: xop = X64_andqr8; break;
case LIR_qior: xop = X64_orqr8; break;
case LIR_qxor: xop = X64_xorqr8; break;
}
emitr_imm8(xop, rr, imm);
} else {
switch (ins->opcode()) {
default: TODO(arith_imm);
case LIR_iaddp:
case LIR_add: xop = X64_addlri; break;
case LIR_and: xop = X64_andlri; break;
case LIR_or: xop = X64_orlri; break;
case LIR_sub: xop = X64_sublri; break;
case LIR_xor: xop = X64_xorlri; break;
case LIR_qiadd:
case LIR_qaddp: xop = X64_addqri; break;
case LIR_qiand: xop = X64_andqri; break;
case LIR_qior: xop = X64_orqri; break;
case LIR_qxor: xop = X64_xorqri; break;
}
emitr_imm(xop, rr, imm);
}
if (rr != ra)
MR(rr, ra);
}
// binary op with integer registers
void Assembler::asm_arith(LIns *ins) {
Register rr, ra, rb;
LOpcode op = ins->opcode();
if ((op & ~LIR64) >= LIR_lsh && (op & ~LIR64) <= LIR_ush) {
asm_shift(ins);
return;
}
LIns *b = ins->oprnd2();
if (isImm32(b)) {
asm_arith_imm(ins);
return;
}
regalloc_binary(ins, GpRegs, rr, ra, rb);
X64Opcode xop;
switch (ins->opcode()) {
default:
TODO(asm_arith);
case LIR_or:
xop = X64_orlrr;
break;
case LIR_sub:
xop = X64_subrr;
break;
case LIR_iaddp:
case LIR_add:
xop = X64_addrr;
break;
case LIR_and:
xop = X64_andrr;
break;
case LIR_xor:
xop = X64_xorrr;
break;
case LIR_mul:
xop = X64_imul;
break;
case LIR_qxor:
xop = X64_xorqrr;
break;
case LIR_qior:
xop = X64_orqrr;
break;
case LIR_qiand:
xop = X64_andqrr;
break;
case LIR_qiadd:
case LIR_qaddp:
xop = X64_addqrr;
break;
}
emitrr(xop, rr, rb);
if (rr != ra)
MR(rr,ra);
}
// binary op with fp registers
void Assembler::asm_fop(LIns *ins) {
Register rr, ra, rb;
regalloc_binary(ins, FpRegs, rr, ra, rb);
X64Opcode xop;
switch (ins->opcode()) {
default:
TODO(asm_fop);
case LIR_fdiv:
xop = X64_divsd;
break;
case LIR_fmul:
xop = X64_mulsd;
break;
case LIR_fadd:
xop = X64_addsd;
break;
case LIR_fsub:
xop = X64_subsd;
break;
}
emitprr(xop, rr, rb);
if (rr != ra) {
asm_nongp_copy(rr, ra);
}
}
void Assembler::asm_neg_not(LIns *ins) {
Register rr, ra;
regalloc_unary(ins, GpRegs, rr, ra);
NanoAssert(IsGpReg(ra));
X64Opcode xop;
if (ins->isop(LIR_not)) {
xop = X64_not;
} else {
xop = X64_neg;
}
emitr(xop, rr);
if (rr != ra)
MR(rr, ra);
}
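// Note on asm_call below (not in the original source): code is generated
// backwards, so although the call (or the mov-rax + call-rax pair) is emitted
// first here, it lands after the argument set-up in the final instruction
// stream; at run time the argument registers and stack slots are populated
// before the call executes.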
void Assembler::asm_call(LIns *ins) {
const CallInfo *call = ins->callInfo();
ArgSize sizes[MAXARGS];
int argc = call->get_sizes(sizes);
bool indirect = call->isIndirect();
if (!indirect) {
verbose_only(if (_logc->lcbits & LC_Assembly)
outputf(" %p:", _nIns);
)
NIns *target = (NIns*)call->_address;
// must do underrunProtect before calculating offset
underrunProtect(8);
if (isS32(target - _nIns)) {
emit32(X64_call, target - _nIns);
} else {
// can't reach target from here, load imm64 and do an indirect jump
emit(X64_callrax);
emit_quad(RAX, (uint64_t)target);
}
} else {
// Indirect call: we assign the address arg to RAX since it's not
// used for regular arguments, and is otherwise scratch since it's
// clobbered by the call.
asm_regarg(ARGSIZE_P, ins->arg(--argc), RAX);
emit(X64_callrax);
}
#ifdef _MSC_VER
int stk_used = 32; // always reserve the 32-byte shadow area
#else
int stk_used = 0;
Register fr = XMM0;
#endif
int arg_index = 0;
for (int i = 0; i < argc; i++) {
int j = argc - i - 1;
ArgSize sz = sizes[j];
LIns* arg = ins->arg(j);
if ((sz & ARGSIZE_MASK_INT) && arg_index < NumArgRegs) {
// gp arg
asm_regarg(sz, arg, argRegs[arg_index]);
arg_index++;
}
#ifdef _MSC_VER
else if (sz == ARGSIZE_F && arg_index < NumArgRegs) {
// double goes in XMM reg # based on overall arg_index
asm_regarg(sz, arg, Register(XMM0+arg_index));
arg_index++;
}
#else
else if (sz == ARGSIZE_F && fr < XMM8) {
// double goes in next available XMM register
asm_regarg(sz, arg, fr);
fr = nextreg(fr);
}
#endif
else {
asm_stkarg(sz, arg, stk_used);
stk_used += sizeof(void*);
}
}
if (stk_used > max_stk_used)
max_stk_used = stk_used;
}
void Assembler::asm_regarg(ArgSize sz, LIns *p, Register r) {
if (sz == ARGSIZE_I) {
NanoAssert(!p->isQuad());
if (p->isconst()) {
emit_quad(r, int64_t(p->imm32()));
return;
}
// sign extend int32 to int64
emitrr(X64_movsxdr, r, r);
} else if (sz == ARGSIZE_U) {
NanoAssert(!p->isQuad());
if (p->isconst()) {
emit_quad(r, uint64_t(uint32_t(p->imm32())));
return;
}
// zero extend with 32bit mov, auto-zeros upper 32bits
emitrr(X64_movlr, r, r);
}
/* there is no point in folding an immediate here, because
* the argument register must be a scratch register and we're
* just before a call. Just reserving the register will cause
* the constant to be rematerialized nearby in asm_restore(),
* which is the same instruction we would otherwise emit right
* here, and moving it earlier in the stream provides more scheduling
* freedom to the cpu. */
findSpecificRegFor(p, r);
}
void Assembler::asm_stkarg(ArgSize sz, LIns *p, int stk_off) {
NanoAssert(isS8(stk_off));
if (sz & ARGSIZE_MASK_INT) {
Register r = findRegFor(p, GpRegs);
uint64_t xop = X64_movqspr | uint64_t(stk_off) << 56; // movq [rsp+d8], r
xop |= uint64_t((r&7)<<3) << 40 | uint64_t((r&8)>>1) << 24; // insert r into mod/rm and rex bytes
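// Byte positions (illustrative, assuming X64_movqspr packs rex/0x89/modrm/SIB/disp8
// into the top five bytes of the template): <<56 is the disp8 slot, <<40 puts
// r&7 into the modrm reg field, and <<24 sets the REX.R bit when r is R8..R15.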
emit(xop);
if (sz == ARGSIZE_I) {
// extend int32 to int64
NanoAssert(!p->isQuad());
emitrr(X64_movsxdr, r, r);
} else if (sz == ARGSIZE_U) {
// extend uint32 to uint64
NanoAssert(!p->isQuad());
emitrr(X64_movlr, r, r);
}
} else {
TODO(asm_stkarg_non_int);
}
}
void Assembler::asm_promote(LIns *ins) {
Register rr, ra;
regalloc_unary(ins, GpRegs, rr, ra);
NanoAssert(IsGpReg(ra));
if (ins->isop(LIR_u2q)) {
emitrr(X64_movlr, rr, ra); // 32bit mov zeros the upper 32bits of the target
} else {
NanoAssert(ins->isop(LIR_i2q));
emitrr(X64_movsxdr, rr, ra); // sign extend 32->64
}
}
// the CVTSI2SD instruction only writes to the low 64bits of the target
// XMM register, which hinders register renaming and makes dependence
// chains longer. So we precede with XORPS to clear the target register.
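// In execution order the generated sequence is therefore (illustrative):
//     xorps    xmmR, xmmR    ; clear the whole register first
//     cvtsi2sd xmmR, r32     ; then convert
// (the convert is emitted before the xorps only because code is generated backwards).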
void Assembler::asm_i2f(LIns *ins) {
Register r = prepResultReg(ins, FpRegs);
Register b = findRegFor(ins->oprnd1(), GpRegs);
emitprr(X64_cvtsi2sd, r, b); // cvtsi2sd xmmr, b only writes xmm:0:64
emitprr(X64_xorps, r, r); // xorps xmmr,xmmr to break dependency chains
}
void Assembler::asm_u2f(LIns *ins) {
Register r = prepResultReg(ins, FpRegs);
Register b = findRegFor(ins->oprnd1(), GpRegs);
NanoAssert(!ins->oprnd1()->isQuad());
// since the oprnd1 value is 32-bit, it's okay to zero-extend it without worrying about clobbering.
emitprr(X64_cvtsq2sd, r, b); // convert int64 to double
emitprr(X64_xorps, r, r); // xorps xmmr,xmmr to break dependency chains
emitrr(X64_movlr, b, b); // zero extend u32 to int64
}
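// Sketch of asm_cmov's output for the one condition currently handled
// (LIR_qeq), in execution order (illustrative):
//     cmp ...             ; from asm_cmp(cond)
//     <iftrue -> rr>      ; iftrue materialized into the result register
//     cmovne rr, rf       ; overwrite with iffalse when the operands differ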
void Assembler::asm_cmov(LIns *ins) {
LIns* cond = ins->oprnd1();
LIns* iftrue = ins->oprnd2();
LIns* iffalse = ins->oprnd3();
NanoAssert(cond->isCmp());
NanoAssert((ins->isop(LIR_qcmov) && iftrue->isQuad() && iffalse->isQuad()) ||
(ins->isop(LIR_cmov) && !iftrue->isQuad() && !iffalse->isQuad()));
// this code assumes that neither LD nor MR nor MRcc set any of the condition flags.
// (This is true on Intel, is it true on all architectures?)
const Register rr = prepResultReg(ins, GpRegs);
const Register rf = findRegFor(iffalse, GpRegs & ~rmask(rr));
X64Opcode xop;
switch (cond->opcode()) {
default: TODO(asm_cmov);
case LIR_qeq:
xop = X64_cmovqne;
break;
}
emitrr(xop, rr, rf);
/*const Register rt =*/ findSpecificRegFor(iftrue, rr);
asm_cmp(cond);
}
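// Note on asm_branch below (not in the original source): the returned pointer
// is the address of the jcc/jmp just emitted. When no target is known yet the
// branch is emitted with a 32-bit displacement of 0, and nPatchBranch()
// rewrites that displacement once the real target is available.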
NIns* Assembler::asm_branch(bool onFalse, LIns *cond, NIns *target) {
LOpcode condop = cond->opcode();
if (condop >= LIR_feq && condop <= LIR_fge)
return asm_fbranch(onFalse, cond, target);
// we must ensure there's room for the instr before calculating
// the offset; the offset determines the opcode (8-bit or 32-bit)
underrunProtect(8);
if (target && isS8(target - _nIns)) {
static const X64Opcode j8[] = {
X64_je8, // eq
X64_jl8, X64_jg8, X64_jle8, X64_jge8, // lt, gt, le, ge
X64_jb8, X64_ja8, X64_jbe8, X64_jae8 // ult, ugt, ule, uge
};
uint64_t xop = j8[(condop & ~LIR64) - LIR_eq];
xop ^= onFalse ? (uint64_t)X64_jneg8 : 0;
emit8(xop, target - _nIns);
} else {
static const X64Opcode j32[] = {
X64_je, // eq
X64_jl, X64_jg, X64_jle, X64_jge, // lt, gt, le, ge
X64_jb, X64_ja, X64_jbe, X64_jae // ult, ugt, ule, uge
};
uint64_t xop = j32[(condop & ~LIR64) - LIR_eq];
xop ^= onFalse ? (uint64_t)X64_jneg : 0;
emit32(xop, target ? target - _nIns : 0);
}
NIns *patch = _nIns; // addr of instr to patch
asm_cmp(cond);
return patch;
}
void Assembler::asm_cmp(LIns *cond) {
LIns *b = cond->oprnd2();
if (isImm32(b)) {
asm_cmp_imm(cond);
return;
}
LIns *a = cond->oprnd1();
Register ra, rb;
if (a != b) {
Reservation *resva, *resvb;
findRegFor2(GpRegs, a, resva, b, resvb);
ra = resva->reg;
rb = resvb->reg;
} else {
// optimize-me: this will produce a const result!
ra = rb = findRegFor(a, GpRegs);
}
LOpcode condop = cond->opcode();
emitrr(condop & LIR64 ? X64_cmpqr : X64_cmplr, ra, rb);
}
void Assembler::asm_cmp_imm(LIns *cond) {
LIns *a = cond->oprnd1();
LIns *b = cond->oprnd2();
Register ra = findRegFor(a, GpRegs);
int32_t imm = getImm32(b);
if (isS8(imm)) {
X64Opcode xop = (cond->opcode() & LIR64) ? X64_cmpqr8 : X64_cmplr8;
emitr_imm8(xop, ra, imm);
} else {
X64Opcode xop = (cond->opcode() & LIR64) ? X64_cmpqri : X64_cmplri;
emitr_imm(xop, ra, imm);
}
}
// compiling floating point branches
// discussion in https://bugzilla.mozilla.org/show_bug.cgi?id=443886
//
//  fucom/p/pp:  c3 c2 c0  jae ja    jbe jb je jne
//  ucomisd:      Z  P  C  !C  !C&!Z C|Z C  Z  !Z
//               -- -- --  --- ----- --- -- -- ---
//   unordered    1  1  1            T   T  T
//   greater >    0  0  0  T   T               T
//   less    <    0  0  1            T   T     T
//   equal   =    1  0  0  T         T      T
//
// here's the cases, using conditionals:
//
// branch  >=   >    <=        <         =
// ------  ---  ---  --------  --------  ----------
// LIR_jt  jae  ja   swap+jae  swap+ja   jp over je
// LIR_jf  jb   jbe  swap+jb   swap+jbe  jne+jp
NIns* Assembler::asm_fbranch(bool onFalse, LIns *cond, NIns *target) {
LOpcode condop = cond->opcode();
NIns *patch;
LIns *a = cond->oprnd1();
LIns *b = cond->oprnd2();
if (condop == LIR_feq) {
if (onFalse) {
// branch if unordered or !=
underrunProtect(16); // 12 needed, round up for overhang
emit32(X64_jp, target ? target - _nIns : 0);
emit32(X64_jne, target ? target - _nIns : 0);
patch = _nIns;
} else {
// jp skip (2byte)
// jeq target
// skip: ...
underrunProtect(16); // 7 needed but we write 2 instr
NIns *skip = _nIns;
emit32(X64_je, target ? target - _nIns : 0);
patch = _nIns;
emit8(X64_jp8, skip - _nIns);
}
}
else {
if (condop == LIR_flt) {
condop = LIR_fgt;
LIns *t = a; a = b; b = t;
} else if (condop == LIR_fle) {
condop = LIR_fge;
LIns *t = a; a = b; b = t;
}
X64Opcode xop;
if (condop == LIR_fgt)
xop = onFalse ? X64_jbe : X64_ja;
else // LIR_fge
xop = onFalse ? X64_jb : X64_jae;
underrunProtect(8);
emit32(xop, target ? target - _nIns : 0);
patch = _nIns;
}
fcmp(a, b);
return patch;
}
void Assembler::asm_fcond(LIns *ins) {
LOpcode op = ins->opcode();
LIns *a = ins->oprnd1();
LIns *b = ins->oprnd2();
if (op == LIR_feq) {
// result = ZF & !PF, must do logic on flags
// r = al|bl|cl|dl, can only use rh without rex prefix
Register r = prepResultReg(ins, 1<<RAX|1<<RCX|1<<RDX|1<<RBX);
emitrr8(X64_movzx8, r, r); // movzx8 r,rl r[8:63] = 0
emit(X86_and8r | uint64_t(r<<3|(r|4))<<56); // and rl,rh rl &= rh
emit(X86_setnp | uint64_t(r|4)<<56); // setnp rh rh = !PF
emit(X86_sete | uint64_t(r)<<56); // sete rl rl = ZF
} else {
if (op == LIR_flt) {
op = LIR_fgt;
LIns *t = a; a = b; b = t;
} else if (op == LIR_fle) {
op = LIR_fge;
LIns *t = a; a = b; b = t;
}
Register r = prepResultReg(ins, GpRegs); // x64 can use any GPR as setcc target
emitrr8(X64_movzx8, r, r);
emitr8(op == LIR_fgt ? X64_seta : X64_setae, r);
}
fcmp(a, b);
}
void Assembler::fcmp(LIns *a, LIns *b) {
Reservation *resva, *resvb;
findRegFor2(FpRegs, a, resva, b, resvb);
emitprr(X64_ucomisd, resva->reg, resvb->reg);
}
void Assembler::asm_restore(LIns *ins, Reservation *resv, Register r) {
(void) r;
if (ins->isop(LIR_alloc)) {
int d = disp(resv);
emitrm(X64_leaqrm, r, d, FP);
}
else if (ins->isconst()) {
if (!resv->arIndex) {
ins->resv()->clear();
}
// unsafe to use xor r,r for zero because it changes cc's
emit_int(r, ins->imm32());
}
else if (ins->isconstq() && IsGpReg(r)) {
if (!resv->arIndex) {
ins->resv()->clear();
}
// unsafe to use xor r,r for zero because it changes cc's
emit_quad(r, ins->imm64());
}
else {
int d = findMemFor(ins);
if (IsFpReg(r)) {
NanoAssert(ins->isQuad());
// load 64bits into XMM. don't know if double or int64, assume double.
emitprm(X64_movsdrm, r, d, FP);
} else if (ins->isQuad()) {
emitrm(X64_movqrm, r, d, FP);
} else {
emitrm(X64_movlrm, r, d, FP);
}
}
verbose_only( if (_logc->lcbits & LC_RegAlloc) {
outputForEOL(" <= restore %s",
_thisfrag->lirbuf->names->formatRef(ins)); } )
}
void Assembler::asm_cond(LIns *ins) {
LOpcode op = ins->opcode();
// unlike x86-32, with a rex prefix we can use any GP register as an 8bit target
Register r = prepResultReg(ins, GpRegs);
// SETcc only sets low 8 bits, so extend
emitrr8(X64_movzx8, r, r);
X64Opcode xop;
switch (op) {
default:
TODO(cond);
case LIR_qeq:
case LIR_eq: xop = X64_sete; break;
case LIR_qlt:
case LIR_lt: xop = X64_setl; break;
case LIR_qle:
case LIR_le: xop = X64_setle; break;
case LIR_qgt:
case LIR_gt: xop = X64_setg; break;
case LIR_qge:
case LIR_ge: xop = X64_setge; break;
case LIR_qult:
case LIR_ult: xop = X64_setb; break;
case LIR_qule:
case LIR_ule: xop = X64_setbe; break;
case LIR_qugt:
case LIR_ugt: xop = X64_seta; break;
case LIR_quge:
case LIR_uge: xop = X64_setae; break;
case LIR_ov: xop = X64_seto; break;
}
emitr8(xop, r);
asm_cmp(ins);
}
void Assembler::asm_ret(LIns *ins) {
JMP(_epilogue);
assignSavedRegs();
LIns *value = ins->oprnd1();
Register r = ins->isop(LIR_ret) ? RAX : XMM0;
findSpecificRegFor(value, r);
}
void Assembler::asm_nongp_copy(Register d, Register s) {
if (!IsFpReg(d) && IsFpReg(s)) {
// gpr <- xmm: use movq r/m64, xmm (66 REX.W 0F 7E /r)
emitprr(X64_movqrx, s, d);
} else if (IsFpReg(d) && IsFpReg(s)) {
// xmm <- xmm: use movaps. movsd r,r causes partial register stall
emitrr(X64_movapsr, d, s);
} else {
// xmm <- gpr: use movq xmm, r/m64 (66 REX.W 0F 6E /r)
emitprr(X64_movqxr, d, s);
}
}
void Assembler::regalloc_load(LIns *ins, Register &rr, int32_t &dr, Register &rb) {
dr = ins->disp();
LIns *base = ins->oprnd1();
rb = getBaseReg(base, dr, BaseRegs);
Reservation *resv = getresv(ins);
if (resv && (rr = resv->reg) != UnknownReg) {
// keep already assigned register
freeRsrcOf(ins, false);
} else {
// use a gpr in case we're copying a non-double
rr = prepResultReg(ins, GpRegs & ~rmask(rb));
}
}
void Assembler::asm_load64(LIns *ins) {
Register rr, rb;
int32_t dr;
regalloc_load(ins, rr, dr, rb);
if (IsGpReg(rr)) {
// general 64bit load, 32bit const displacement
emitrm(X64_movqrm, rr, dr, rb);
} else {
// load 64bits into XMM. don't know if double or int64, assume double.
emitprm(X64_movsdrm, rr, dr, rb);
}
}
void Assembler::asm_ld(LIns *ins) {
NanoAssert(!ins->isQuad());
Register r, b;
int32_t d;
regalloc_load(ins, r, d, b);
emitrm(X64_movlrm, r, d, b);
}
void Assembler::asm_store64(LIns *value, int d, LIns *base) {
NanoAssert(value->isQuad());
Register b = getBaseReg(base, d, BaseRegs);
// if we have to choose a register, use a GPR, but not the base reg
Reservation *resv = getresv(value);
Register r;
if (!resv || (r = resv->reg) == UnknownReg) {
r = findRegFor(value, GpRegs & ~rmask(b));
}
if (IsGpReg(r)) {
// gpr store
emitrm(X64_movqmr, r, d, b);
}
else {
// xmm store
emitprm(X64_movsdmr, r, d, b);
}
}
void Assembler::asm_store32(LIns *value, int d, LIns *base) {
NanoAssert(!value->isQuad());
Register b = getBaseReg(base, d, BaseRegs);
Register r = findRegFor(value, GpRegs & ~rmask(b));
// store 32bits to 64bit addr. use rex so we can use all 16 regs
emitrm(X64_movlmr, r, d, b);
}
// generate a 32bit constant, must not affect condition codes!
void Assembler::emit_int(Register r, int32_t v) {
NanoAssert(IsGpReg(r));
emitr_imm(X64_movi, r, v);
}
// generate a 64bit constant, must not affect condition codes!
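// Three cases, cheapest first (illustrative): a value that fits in 32 unsigned
// bits uses the plain 32-bit mov (the upper 32 bits are zeroed implicitly); a
// value that fits in 32 signed bits uses the sign-extending movq imm32 form;
// anything else writes the raw 8-byte immediate into the code stream and emits
// a mov r64, imm64 opcode immediately before it.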
void Assembler::emit_quad(Register r, uint64_t v) {
NanoAssert(IsGpReg(r));
if (isU32(v)) {
emit_int(r, int32_t(v));
return;
}
if (isS32(v)) {
// safe for sign-extension 32->64
emitr_imm(X64_movqi32, r, int32_t(v));
return;
}
underrunProtect(8+8); // imm64 + worst case instr len
((uint64_t*)_nIns)[-1] = v;
_nIns -= 8;
_nvprof("x64-bytes", 8);
emitr(X64_movqi, r);
}
void Assembler::asm_int(LIns *ins) {
Register r = prepResultReg(ins, GpRegs);
int32_t v = ins->imm32();
if (v == 0) {
// special case for zero
emitrr(X64_xorrr, r, r);
return;
}
emit_int(r, v);
}
void Assembler::asm_quad(LIns *ins) {
uint64_t v = ins->imm64();
RegisterMask allow = v == 0 ? GpRegs|FpRegs : GpRegs;
Register r = prepResultReg(ins, allow);
if (v == 0) {
if (IsGpReg(r)) {
// special case for zero
emitrr(X64_xorrr, r, r);
} else {
// xorps for xmm
emitprr(X64_xorps, r, r);
}
} else {
emit_quad(r, v);
}
}
void Assembler::asm_qjoin(LIns*) {
TODO(asm_qjoin);
}
Register Assembler::asm_prep_fcall(Reservation*, LIns *ins) {
return prepResultReg(ins, rmask(XMM0));
}
void Assembler::asm_param(LIns *ins) {
uint32_t a = ins->paramArg();
uint32_t kind = ins->paramKind();
if (kind == 0) {
// ordinary param
// the first NumArgRegs args are always passed in registers
// (six for SysV x86-64, four for Win64)
if (a < (uint32_t)NumArgRegs) {
// incoming arg in register
prepResultReg(ins, rmask(argRegs[a]));
} else {
// todo: support stack based args, arg 0 is at [FP+off] where off
// is the # of regs to be pushed in genProlog()
TODO(asm_param_stk);
}
}
else {
// saved param
prepResultReg(ins, rmask(savedRegs[a]));
}
}
// register allocation for 2-address style unary ops of the form R = (op) R
void Assembler::regalloc_unary(LIns *ins, RegisterMask allow, Register &rr, Register &ra) {
LIns *a = ins->oprnd1();
rr = prepResultReg(ins, allow);
Reservation* rA = getresv(a);
// if this is last use of a in reg, we can re-use result reg
if (rA == 0 || (ra = rA->reg) == UnknownReg) {
ra = findSpecificRegFor(a, rr);
} else {
// rA already has a register assigned. caller must emit a copy
// to rr once instr code is generated. (ie mov rr,ra ; op rr)
}
}
static const AVMPLUS_ALIGN16(int64_t) negateMask[] = {0x8000000000000000LL,0};
void Assembler::asm_fneg(LIns *ins) {
Register rr, ra;
if (isS32((uintptr_t)negateMask) || isS32((NIns*)negateMask - _nIns)) {
regalloc_unary(ins, FpRegs, rr, ra);
if (isS32((uintptr_t)negateMask)) {
// builtin code is in bottom or top 2GB addr space, use absolute addressing
underrunProtect(4+8);
*((int32_t*)(_nIns -= 4)) = (int32_t)(uintptr_t)negateMask;
_nvprof("x64-bytes", 4);
uint64_t xop = X64_xorpsa | uint64_t((rr&7)<<3)<<48; // put rr[0:2] into mod/rm byte
xop = rexrb(xop, rr, (Register)0); // put rr[3] into rex byte
emit(xop);
} else {
// jit code is within +/-2GB of builtin code, use rip-relative
underrunProtect(4+8);
int32_t d = (int32_t) ((NIns*)negateMask - _nIns);
*((int32_t*)(_nIns -= 4)) = d;
_nvprof("x64-bytes", 4);
emitrr(X64_xorpsm, rr, (Register)0);
}
if (ra != rr)
asm_nongp_copy(rr,ra);
} else {
// this is just hideous - can't use RIP-relative load, can't use
// absolute-address load, and can't move an imm64 const to XMM.
// so do it all in a GPR. hrmph.
rr = prepResultReg(ins, GpRegs);
ra = findRegFor(ins->oprnd1(), GpRegs & ~rmask(rr));
emitrr(X64_xorqrr, rr, ra); // xor rr, ra
emit_quad(rr, negateMask[0]); // mov rr, 0x8000000000000000
}
}
void Assembler::asm_qhi(LIns*) {
TODO(asm_qhi);
}
void Assembler::asm_qlo(LIns *ins) {
Register rr, ra;
regalloc_unary(ins, GpRegs, rr, ra);
NanoAssert(IsGpReg(ra));
emitrr(X64_movlr, rr, ra); // 32bit mov zeros the upper 32bits of the target
}
void Assembler::asm_spill(Register rr, int d, bool /*pop*/, bool quad) {
if (d) {
if (!IsFpReg(rr)) {
X64Opcode xop = quad ? X64_movqmr : X64_movlmr;
emitrm(xop, rr, d, FP);
} else {
// store 64bits from XMM to memory
NanoAssert(quad);
emitprm(X64_movsdmr, rr, d, FP);
}
}
}
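// Sketch of the prologue generated below, in execution order (illustrative):
//     push rbp            ; save caller's frame pointer
//     mov  rbp, rsp       ; establish our own FP
//   [patch entry]
//     sub  rsp, amt       ; reserve aligned stack space (imm8 or imm32 form)
// The sub is emitted first because code generation runs backwards.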
NIns* Assembler::genPrologue() {
// activation frame is 4 bytes per entry even on 64bit machines
uint32_t stackNeeded = max_stk_used + _activation.highwatermark * 4;
uint32_t stackPushed =
sizeof(void*) + // returnaddr
sizeof(void*); // ebp
uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
uint32_t amt = aligned - stackPushed;
// Reserve stackNeeded bytes, padded
// to preserve NJ_ALIGN_STACK-byte alignment.
if (amt) {
if (isS8(amt))
emitr_imm8(X64_subqr8, RSP, amt);
else
emitr_imm(X64_subqri, RSP, amt);
}
verbose_only( outputAddr=true; asm_output("[patch entry]"); )
NIns *patchEntry = _nIns;
MR(FP, RSP); // Establish our own FP.
emitr(X64_pushr, FP); // Save caller's FP.
return patchEntry;
}
NIns* Assembler::genEpilogue() {
// mov rsp, rbp
// pop rbp
// ret
max_stk_used = 0;
emit(X64_ret);
emitr(X64_popr, RBP);
MR(RSP, RBP);
return _nIns;
}
void Assembler::nRegisterResetAll(RegAlloc &a) {
// add scratch registers to our free list for the allocator
a.clear();
a.used = 0;
#ifdef _MSC_VER
a.free = 0x001fffcf; // rax-rbx, rsi, rdi, r8-r15, xmm0-xmm4
#else
a.free = 0xffffffff & ~(1<<RSP | 1<<RBP);
#endif
debug_only( a.managed = a.free; )
}
void Assembler::nPatchBranch(NIns *patch, NIns *target) {
NIns *next = 0;
if (patch[0] == 0xE9) {
// jmp disp32
next = patch+5;
} else if (patch[0] == 0x0F && (patch[1] & 0xF0) == 0x80) {
// jcc disp32
next = patch+6;
} else {
next = 0;
TODO(unknown_patch);
}
NanoAssert(((int32_t*)next)[-1] == 0);
NanoAssert(isS32(target - next));
((int32_t*)next)[-1] = int32_t(target - next);
if (next[0] == 0x0F && next[1] == 0x8A) {
// code is jne<target>,jp<target>, for LIR_jf(feq)
// we just patched the jne, now patch the jp.
next += 6;
NanoAssert(((int32_t*)next)[-1] == 0);
NanoAssert(isS32(target - next));
((int32_t*)next)[-1] = int32_t(target - next);
}
}
Register Assembler::nRegisterAllocFromSet(RegisterMask set) {
#if defined _WIN64
DWORD tr;
_BitScanForward(&tr, set);
_allocator.free &= ~rmask((Register)tr);
return (Register) tr;
#else
// gcc asm syntax
Register r;
asm("bsf %1, %%eax\n\t"
"btr %%eax, %2\n\t"
"movl %%eax, %0\n\t"
: "=m"(r) : "m"(set), "m"(_allocator.free) : "%eax", "memory");
(void)set;
return r;
#endif
}
void Assembler::nFragExit(LIns*) {
TODO(nFragExit);
}
void Assembler::nInit(AvmCore*)
{}
void Assembler::underrunProtect(ptrdiff_t bytes) {
NanoAssertMsg(bytes<=LARGEST_UNDERRUN_PROT, "constant LARGEST_UNDERRUN_PROT is too small");
NIns *pc = _nIns;
NIns *top = _inExit ? this->exitStart : this->codeStart;
#if PEDANTIC
// pedanticTop is based on the last call to underrunProtect; any time we call
// underrunProtect and would use more than what's already protected, we insert
// a page-break jump. Sometimes this jumps to a new page; usually it's just to
// the next instruction.
NanoAssert(pedanticTop >= top);
if (pc - bytes < pedanticTop) {
// no page break required, but insert a far branch anyway just to be difficult
const int br_size = 8; // opcode + 32bit addr
if (pc - bytes - br_size < top) {
// really do need a page break
verbose_only(if (_logc->lcbits & LC_Assembly) outputf("newpage %p:", pc);)
codeAlloc();
}
// now emit the jump, but make sure we won't need another page break.
// we're pedantic, but not *that* pedantic.
pedanticTop = _nIns - br_size;
JMP(pc);
pedanticTop = _nIns - bytes;
}
#else
if (pc - bytes < top) {
verbose_only(if (_logc->lcbits & LC_Assembly) outputf("newpage %p:", pc);)
codeAlloc();
// this jump will call underrunProtect again, but since we're on a new
// page, nothing will happen.
JMP(pc);
}
#endif
}
RegisterMask Assembler::hint(LIns *, RegisterMask allow) {
return allow;
}
void Assembler::nativePageSetup() {
if (!_nIns) {
codeAlloc();
IF_PEDANTIC( pedanticTop = _nIns; )
}
if (!_nExitIns) {
codeAlloc(true);
}
}
void Assembler::nativePageReset()
{}
} // namespace nanojit
#endif // FEATURE_NANOJIT && NANOJIT_X64