implement EXPANDED_LOADSTORE for ARM (code=vladimir,stejohns; r=vladimir,rreitmai; bug=534765)

--HG--
extra : convert_revision : 75f0d95c8bea8ceb0d9bb2dfd55aeb0d0d200bd1
Steven Johnson 2010-01-25 11:08:42 -08:00
parent dbeceb07e7
commit e0fe64337d
2 changed files with 357 additions and 142 deletions

nanojit/NativeARM.cpp

@ -1205,28 +1205,39 @@ Assembler::asm_qjoin(LIns *ins)
 void
 Assembler::asm_store32(LOpcode op, LIns *value, int dr, LIns *base)
 {
+    Register ra, rb;
+    getBaseReg2(GpRegs, value, ra, GpRegs, base, rb, dr);
+
     switch (op) {
         case LIR_sti:
-            // handled by mainline code below for now
-            break;
+            if (isU12(-dr) || isU12(dr)) {
+                STR(ra, rb, dr);
+            } else {
+                STR(ra, IP, 0);
+                asm_add_imm(IP, rb, dr);
+            }
+            return;
         case LIR_stb:
+            if (isU12(-dr) || isU12(dr)) {
+                STRB(ra, rb, dr);
+            } else {
+                STRB(ra, IP, 0);
+                asm_add_imm(IP, rb, dr);
+            }
+            return;
         case LIR_sts:
-            NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
+            // Similar to the sti/stb case, but the max offset is smaller.
+            if (isU8(-dr) || isU8(dr)) {
+                STRH(ra, rb, dr);
+            } else {
+                STRH(ra, IP, 0);
+                asm_add_imm(IP, rb, dr);
+            }
             return;
         default:
             NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
             return;
     }
-
-    Register ra, rb;
-    getBaseReg2(GpRegs, value, ra, GpRegs, base, rb, dr);
-
-    if (isU12(-dr) || isU12(dr)) {
-        STR(ra, rb, dr);
-    } else {
-        STR(ra, IP, 0);
-        asm_add_imm(IP, rb, dr);
-    }
 }
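
Note on the pattern above: when the displacement fits the instruction's
immediate field it is encoded directly; otherwise base+offset is computed
into the IP scratch register first (nanojit emits code backwards, so the
add is emitted after the store but executes before it). A sketch of the
range predicates this relies on; illustrative only, the real definitions
live in the ARM backend header:

    // Sketch (assumed definitions): true iff v fits the unsigned 12-bit /
    // 8-bit offset field of STR/STRB and STRH respectively.
    #define isU12(v) (((v) & 0xfff) == (v))
    #define isU8(v)  (((v) & 0xff) == (v))
    // Testing both isU12(-dr) and isU12(dr) accepts displacements of either
    // sign whose magnitude fits, e.g. dr == -4000 passes via isU12(4000).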
void
@ -1308,21 +1319,6 @@ Assembler::asm_load64(LInsp ins)
     //asm_output("<<< load64");

-    switch (ins->opcode()) {
-        case LIR_ldf:
-        case LIR_ldfc:
-            // handled by mainline code below for now
-            break;
-
-        case LIR_ld32f:
-        case LIR_ldc32f:
-            NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
-            return;
-
-        default:
-            NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
-            return;
-    }
-
     NanoAssert(ins->isF64());

     LIns* base = ins->oprnd1();
@ -1337,28 +1333,68 @@ Assembler::asm_load64(LInsp ins)
     //outputf("--- load64: Finished register allocation.");

-    if (ARM_VFP && isKnownReg(rr)) {
-        // VFP is enabled and the result will go into a register.
-        NanoAssert(IsFpReg(rr));
-
-        if (!isS8(offset >> 2) || (offset&3) != 0) {
-            FLDD(rr,IP,0);
-            asm_add_imm(IP, rb, offset);
-        } else {
-            FLDD(rr,rb,offset);
-        }
-    } else {
-        // Either VFP is not available or the result needs to go into memory;
-        // in either case, VFP instructions are not required. Note that the
-        // result will never be loaded into registers if VFP is not available.
-        NanoAssert(!isKnownReg(rr));
-        NanoAssert(d != 0);
-
-        // Check that the offset is 8-byte (64-bit) aligned.
-        NanoAssert((d & 0x7) == 0);
-
-        // *(uint64_t*)(FP+d) = *(uint64_t*)(rb+offset)
-        asm_mmq(FP, d, rb, offset);
-    }
+    switch (ins->opcode()) {
+        case LIR_ldf:
+        case LIR_ldfc:
+            if (ARM_VFP && isKnownReg(rr)) {
+                // VFP is enabled and the result will go into a register.
+                NanoAssert(IsFpReg(rr));
+
+                if (!isS8(offset >> 2) || (offset&3) != 0) {
+                    FLDD(rr,IP,0);
+                    asm_add_imm(IP, rb, offset);
+                } else {
+                    FLDD(rr,rb,offset);
+                }
+            } else {
+                // Either VFP is not available or the result needs to go into memory;
+                // in either case, VFP instructions are not required. Note that the
+                // result will never be loaded into registers if VFP is not available.
+                NanoAssert(!isKnownReg(rr));
+                NanoAssert(d != 0);
+
+                // Check that the offset is 8-byte (64-bit) aligned.
+                NanoAssert((d & 0x7) == 0);
+
+                // *(uint64_t*)(FP+d) = *(uint64_t*)(rb+offset)
+                asm_mmq(FP, d, rb, offset);
+            }
+            return;
+
+        case LIR_ld32f:
+        case LIR_ldc32f:
+            if (ARM_VFP) {
+                if (isKnownReg(rr)) {
+                    NanoAssert(IsFpReg(rr));
+                    FCVTDS(rr, S14);
+                } else {
+                    // Normally D7 isn't allowed to be used as an FP reg.
+                    // In this case we make an explicit exception.
+                    if (isS8(d)) {
+                        FSTD_allowD7(D7, FP, d, true);
+                    } else {
+                        FSTD_allowD7(D7, IP, 0, true);
+                        asm_add_imm(IP, FP, d);
+                    }
+                    FCVTDS_allowD7(D7, S14, true);
+                }
+
+                // always load into a VFP reg to do the conversion, and always use
+                // our S14 scratch reg
+                if (!isS8(offset >> 2) || (offset&3) != 0) {
+                    FLDS(S14, IP, 0);
+                    asm_add_imm(IP, rb, offset);
+                } else {
+                    FLDS(S14, rb, offset);
+                }
+            } else {
+                NanoAssertMsg(0, "ld32f not supported with non-VFP, fix me");
+            }
+            return;
+
+        default:
+            NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
+            return;
+    }

     //asm_output(">>> load64");
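
For LIR_ld32f with a register destination, the pair of VFP emissions above
produces, in execution order (calls emit backwards), roughly the following;
the trace is built from the asm_output strings in the header below, with
made-up register numbers:

    flds   s14, [r4, #8]    ; load the 32-bit float into the S14 scratch
    fcvtds d2, s14          ; widen to a 64-bit double in the result reg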
@ -1373,65 +1409,106 @@ Assembler::asm_store64(LOpcode op, LInsp value, int dr, LInsp base)
     switch (op) {
         case LIR_stfi:
-            // handled by mainline code below for now
-            break;
+            if (ARM_VFP) {
+                Register rb = findRegFor(base, GpRegs);
+
+                if (value->isconstq()) {
+                    underrunProtect(LD32_size*2 + 8);
+
+                    // XXX use another reg, get rid of dependency
+                    STR(IP, rb, dr);
+                    asm_ld_imm(IP, value->imm64_0(), false);
+                    STR(IP, rb, dr+4);
+                    asm_ld_imm(IP, value->imm64_1(), false);
+
+                    return;
+                }
+
+                Register rv = findRegFor(value, FpRegs);
+
+                NanoAssert(isKnownReg(rb));
+                NanoAssert(isKnownReg(rv));
+
+                Register baseReg = rb;
+                intptr_t baseOffset = dr;
+
+                if (!isS8(dr)) {
+                    baseReg = IP;
+                    baseOffset = 0;
+                }
+
+                FSTD(rv, baseReg, baseOffset);
+
+                if (!isS8(dr)) {
+                    asm_add_imm(IP, rb, dr);
+                }
+
+                // if it's a constant, make sure our baseReg/baseOffset location
+                // has the right value
+                if (value->isconstq()) {
+                    underrunProtect(4*4);
+                    asm_quad_nochk(rv, value->imm64_0(), value->imm64_1());
+                }
+            } else {
+                int da = findMemFor(value);
+                Register rb = findRegFor(base, GpRegs);
+
+                // *(uint64_t*)(rb+dr) = *(uint64_t*)(FP+da)
+                asm_mmq(rb, dr, FP, da);
+            }
+            return;
+
         case LIR_st32f:
-            NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
+            if (ARM_VFP) {
+                Register rb = findRegFor(base, GpRegs);
+
+                if (value->isconstq()) {
+                    underrunProtect(LD32_size*2 + 8);
+
+                    // XXX use another reg, get rid of dependency
+                    STR(IP, rb, dr);
+                    asm_ld_imm(IP, value->imm64_0(), false);
+                    STR(IP, rb, dr+4);
+                    asm_ld_imm(IP, value->imm64_1(), false);
+
+                    return;
+                }
+
+                Register rv = findRegFor(value, FpRegs);
+
+                NanoAssert(isKnownReg(rb));
+                NanoAssert(isKnownReg(rv));
+
+                Register baseReg = rb;
+                intptr_t baseOffset = dr;
+
+                if (!isS8(dr)) {
+                    baseReg = IP;
+                    baseOffset = 0;
+                }
+
+                FSTS(S14, baseReg, baseOffset);
+
+                if (!isS8(dr)) {
+                    asm_add_imm(IP, rb, dr);
+                }
+
+                FCVTSD(S14, rv);
+
+                // if it's a constant, make sure our baseReg/baseOffset location
+                // has the right value
+                if (value->isconstq()) {
+                    underrunProtect(4*4);
+                    asm_quad_nochk(rv, value->imm64_0(), value->imm64_1());
+                }
+            } else {
+                NanoAssertMsg(0, "st32f not supported with non-VFP, fix me");
+            }
             return;

         default:
             NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
             return;
     }
-
-    if (ARM_VFP) {
-        Register rb = findRegFor(base, GpRegs);
-
-        if (value->isconstq()) {
-            underrunProtect(LD32_size*2 + 8);
-
-            // XXX use another reg, get rid of dependency
-            STR(IP, rb, dr);
-            asm_ld_imm(IP, value->imm64_0(), false);
-            STR(IP, rb, dr+4);
-            asm_ld_imm(IP, value->imm64_1(), false);
-
-            return;
-        }
-
-        Register rv = findRegFor(value, FpRegs);
-
-        NanoAssert(isKnownReg(rb));
-        NanoAssert(isKnownReg(rv));
-
-        Register baseReg = rb;
-        intptr_t baseOffset = dr;
-
-        if (!isS8(dr)) {
-            baseReg = IP;
-            baseOffset = 0;
-        }
-
-        FSTD(rv, baseReg, baseOffset);
-
-        if (!isS8(dr)) {
-            asm_add_imm(IP, rb, dr);
-        }
-
-        // if it's a constant, make sure our baseReg/baseOffset location
-        // has the right value
-        if (value->isconstq()) {
-            underrunProtect(4*4);
-            asm_quad_nochk(rv, value->imm64_0(), value->imm64_1());
-        }
-    } else {
-        int da = findMemFor(value);
-        Register rb = findRegFor(base, GpRegs);
-
-        // *(uint64_t*)(rb+dr) = *(uint64_t*)(FP+da)
-        asm_mmq(rb, dr, FP, da);
-    }
//asm_output(">>> store64");
}
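
The st32f path is the mirror image: narrow the double through the S14
scratch register, then store 32 bits. In execution order (illustrative
registers, trace format taken from the header below):

    fcvtsd s14, d2          ; round the double in d2 to single precision
    fsts   s14, [r4, #8]    ; store the 32-bit result

One caveat: the isconstq() fast path under LIR_st32f stores both 32-bit
halves of the unconverted double, i.e. eight bytes for what is nominally a
32-bit store; that path looks suspect and deserves a second look.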
@ -2018,8 +2095,8 @@ Assembler::asm_i2f(LInsp ins)
     // todo: support int value in memory, as per x86
     NanoAssert(isKnownReg(srcr));

-    FSITOD(rr, FpSingleScratch);
-    FMSR(FpSingleScratch, srcr);
+    FSITOD(rr, S14);
+    FMSR(S14, srcr);
 }
void
@ -2031,8 +2108,8 @@ Assembler::asm_u2f(LInsp ins)
     // todo: support int value in memory, as per x86
     NanoAssert(isKnownReg(sr));

-    FUITOD(rr, FpSingleScratch);
-    FMSR(FpSingleScratch, sr);
+    FUITOD(rr, S14);
+    FMSR(S14, sr);
 }
void Assembler::asm_f2i(LInsp ins)
@ -2041,8 +2118,8 @@ void Assembler::asm_f2i(LInsp ins)
     Register rr = deprecated_prepResultReg(ins, GpRegs);
     Register sr = findRegFor(ins->oprnd1(), FpRegs);

-    FMRS(rr, FpSingleScratch);
-    FTOSID(FpSingleScratch, sr);
+    FMRS(rr, S14);
+    FTOSID(S14, sr);
 }
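
These three hunks are a pure rename (FpSingleScratch becomes S14); the
emitted code is unchanged. For reference, the i2f sequence in execution
order is (illustrative registers):

    fmsr   s14, r0          ; move the integer bits into S14
    fsitod d2, s14          ; signed int to double in the result register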
void
@ -2506,8 +2583,7 @@ Assembler::asm_load32(LInsp ins)
return;
case LIR_ldzs:
case LIR_ldcs:
-            // These are expected to be 2-byte aligned.  (Not all ARM machines
-            // can handle unaligned accesses.)
+            // Some ARM machines require 2-byte alignment here.
             // Similar to the ldcb/ldzb case, but the max offset is smaller.
if (isU8(-d) || isU8(d)) {
LDRH(rr, ra, d);
@ -2518,7 +2594,7 @@ Assembler::asm_load32(LInsp ins)
return;
case LIR_ld:
case LIR_ldc:
-            // These are expected to be 4-byte aligned.
+            // Some ARM machines require 4-byte alignment here.
if (isU12(-d) || isU12(d)) {
LDR(rr, ra, d);
} else {
@ -2527,10 +2603,22 @@ Assembler::asm_load32(LInsp ins)
}
return;
         case LIR_ldsb:
-        case LIR_ldss:
         case LIR_ldcsb:
+            if (isU8(-d) || isU8(d)) {
+                LDRSB(rr, ra, d);
+            } else {
+                LDRSB(rr, IP, 0);
+                asm_add_imm(IP, ra, d);
+            }
+            return;
+        case LIR_ldss:
         case LIR_ldcss:
-            NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
+            if (isU8(-d) || isU8(d)) {
+                LDRSH(rr, ra, d);
+            } else {
+                LDRSH(rr, IP, 0);
+                asm_add_imm(IP, ra, d);
+            }
             return;
         default:
             NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
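
Note the asymmetry called out in the header comments: LDRB takes a 12-bit
offset while the sign-extending forms LDRSB/LDRSH (like LDRH) only take
8-bit offsets, which is why these cases test isU8 rather than isU12 before
falling back to the IP-relative form. Illustrative sequences (the add is
shown schematically):

    ldrsb r0, [r2,#200]     ; |d| <= 255: offset fits directly
    add   ip, r2, #300      ; |d| > 255: compute the address first,
    ldrsb r0, [ip,#0]       ; then load with a zero offset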

nanojit/NativeARM.h

@ -79,7 +79,7 @@ namespace nanojit
 #define NJ_MAX_PARAMETERS               16
 #define NJ_ALIGN_STACK                  8
 #define NJ_JTBL_SUPPORTED               1
-#define NJ_EXPANDED_LOADSTORE_SUPPORTED 0
+#define NJ_EXPANDED_LOADSTORE_SUPPORTED 1
 #define NJ_F2I_SUPPORTED                1

 #define NJ_CONSTANT_POOLS
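
NJ_EXPANDED_LOADSTORE_SUPPORTED is the feature gate this patch flips: it
advertises to the platform-independent layer that the backend implements
the sub-word and single-precision opcodes (LIR_stb/LIR_sts, LIR_ldsb/
LIR_ldss, LIR_ld32f/LIR_st32f). A hypothetical sketch of how such a gate
is consumed by a front end:

    #if NJ_EXPANDED_LOADSTORE_SUPPORTED
        // emit LIR_stb and let the backend pick STRB
    #else
        // legalize in LIR: load word, mask/merge, store word
    #endif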
@ -131,8 +131,7 @@ typedef enum {
     LastReg = D6,
     deprecated_UnknownReg = 32,

-    // special value referring to S14
-    FpSingleScratch = 24
+    S14 = 24
 } Register;
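
A VFP banking fact that this rename leans on (implied rather than stated by
the patch): each double register aliases a pair of single registers, so S14
is the low half of D7. That overlap is why D7 is normally kept out of
allocation and why the _allowD7 escape hatches below exist:

    // Architectural aliasing (not from this patch):
    //   D7 == { S14, S15 }
    // Writing S14 clobbers the low half of D7, so code that uses S14 as a
    // scratch must occasionally be allowed to name D7 explicitly.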
/* ARM condition codes */
@ -621,13 +620,26 @@ enum {
asm_output("ldrb %s, [%s,#%d]", gpn(_d),gpn(_n),(_off)); \
} while(0)
// Load and sign-extend a half word (16 bits). The offset range is ±255, and
// must be aligned to two bytes on some architectures, but we never make
// unaligned accesses so a simple assertion is sufficient here.
#define LDRH(_d,_n,_off) do { \
/* TODO: This is actually LDRSH. Is this correct? */ \
// Load a byte (8 bits), sign-extend to 32 bits. The offset range is
// ±255 (different from LDRB, same as LDRH/LDRSH)
#define LDRSB(_d,_n,_off) do { \
NanoAssert(IsGpReg(_d) && IsGpReg(_n)); \
underrunProtect(4); \
if (_off < 0) { \
NanoAssert(isU8(-_off)); \
*(--_nIns) = (NIns)( COND_AL | (0x15<<20) | ((_n)<<16) | ((_d)<<12) | ((0xD)<<4) | (((-_off)&0xf0)<<4) | ((-_off)&0xf) ); \
} else { \
NanoAssert(isU8(_off)); \
*(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_n)<<16) | ((_d)<<12) | ((0xD)<<4) | (((_off)&0xf0)<<4) | ((_off)&0xf) ); \
} \
asm_output("ldrsb %s, [%s,#%d]", gpn(_d),gpn(_n),(_off)); \
} while(0)
// Load and sign-extend a half word (16 bits). The offset range is ±255, and
// must be aligned to two bytes on some architectures (the caller is responsible
// for ensuring appropriate alignment)
#define LDRH(_d,_n,_off) do { \
NanoAssert(IsGpReg(_d) && IsGpReg(_n)); \
NanoAssert(((_off) & ~1) == (_off)); \
underrunProtect(4); \
if (_off < 0) { \
NanoAssert(isU8(-_off)); \
@ -636,9 +648,26 @@ enum {
             NanoAssert(isU8(_off));                                     \
             *(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_n)<<16) | ((_d)<<12) | ((0xB)<<4) | (((_off)&0xf0)<<4) | ((_off)&0xf) ); \
         }                                                               \
         asm_output("ldrh %s, [%s,#%d]", gpn(_d),gpn(_n),(_off)); \
   } while(0)

+// Load and sign-extend a half word (16 bits). The offset range is ±255, and
+// must be aligned to two bytes on some architectures (the caller is responsible
+// for ensuring appropriate alignment)
+#define LDRSH(_d,_n,_off) do {                                          \
+        NanoAssert(IsGpReg(_d) && IsGpReg(_n));                         \
+        underrunProtect(4);                                             \
+        if (_off < 0) {                                                 \
+            NanoAssert(isU8(-_off));                                    \
+            *(--_nIns) = (NIns)( COND_AL | (0x15<<20) | ((_n)<<16) | ((_d)<<12) | ((0xF)<<4) | (((-_off)&0xf0)<<4) | ((-_off)&0xf) ); \
+        } else {                                                        \
+            NanoAssert(isU8(_off));                                     \
+            *(--_nIns) = (NIns)( COND_AL | (0x1D<<20) | ((_n)<<16) | ((_d)<<12) | ((0xF)<<4) | (((_off)&0xf0)<<4) | ((_off)&0xf) ); \
+        }                                                               \
+        asm_output("ldrsh %s, [%s,#%d]", gpn(_d),gpn(_n),(_off)); \
+  } while(0)
+
+// Valid offset for STR and STRB is +/- 4095, STRH only has +/- 255
 #define STR(_d,_n,_off) do {                                            \
         NanoAssert(IsGpReg(_d) && IsGpReg(_n));                         \
         NanoAssert(isU12(_off) || isU12(-_off));                        \
@ -648,6 +677,29 @@ enum {
asm_output("str %s, [%s, #%d]", gpn(_d), gpn(_n), (_off)); \
} while(0)
#define STRB(_d,_n,_off) do { \
NanoAssert(IsGpReg(_d) && IsGpReg(_n)); \
NanoAssert(isU12(_off) || isU12(-_off)); \
underrunProtect(4); \
if ((_off)<0) *(--_nIns) = (NIns)( COND_AL | (0x54<<20) | ((_n)<<16) | ((_d)<<12) | ((-(_off))&0xFFF) ); \
else *(--_nIns) = (NIns)( COND_AL | (0x5C<<20) | ((_n)<<16) | ((_d)<<12) | ((_off)&0xFFF) ); \
asm_output("strb %s, [%s, #%d]", gpn(_d), gpn(_n), (_off)); \
} while(0)
// Only +/- 255 range, unlike STRB/STR
#define STRH(_d,_n,_off) do { \
NanoAssert(IsGpReg(_d) && IsGpReg(_n)); \
underrunProtect(4); \
if ((_off)<0) { \
NanoAssert(isU8(-_off)); \
*(--_nIns) = (NIns)( COND_AL | (0x14<<20) | ((_n)<<16) | ((_d)<<12) | (((-(_off))&0xF0)<<4) | (0xB<<4) | ((-(_off))&0xF) ); \
} else { \
NanoAssert(isU8(_off)); \
*(--_nIns) = (NIns)( COND_AL | (0x1C<<20) | ((_n)<<16) | ((_d)<<12) | (((_off)&0xF0)<<4) | (0xB<<4) | ((_off)&0xF) ); \
} \
asm_output("strh %s, [%s, #%d]", gpn(_d), gpn(_n), (_off)); \
} while(0)
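
LDRH/LDRSH/LDRSB/STRH above all use the ARM "addressing mode 3" encoding,
whose 8-bit immediate is split across instruction bits [11:8] and [3:0];
the nibble-shuffling expressions implement exactly that split. Worked
example for _off == 0xAB:

    (((0xAB) & 0xF0) << 4) | ((0xAB) & 0x0F)
        == (0xA0 << 4) | 0xB
        == 0xA0B    // high nibble lands in bits [11:8], low in [3:0]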
// Encode a breakpoint. The ID is not important and is ignored by the
// processor, but it can be useful as a marker when debugging emitted code.
#define BKPT_insn ((NIns)( COND_AL | (0x12<<20) | (0x7<<4) ))
@ -813,11 +865,11 @@ enum {
asm_output("fmrdh %s,%s", gpn(_Rd), gpn(_Dn)); \
} while (0)
#define FSTD(_Dd,_Rn,_offs) do { \
#define FSTD_allowD7(_Dd,_Rn,_offs,_allowD7) do { \
underrunProtect(4); \
NanoAssert(ARM_VFP); \
NanoAssert((((_offs) & 3) == 0) && isS8((_offs) >> 2)); \
NanoAssert(IsFpReg(_Dd) && !IsFpReg(_Rn)); \
NanoAssert((IsFpReg(_Dd) || ((_allowD7) && (_Dd) == D7)) && !IsFpReg(_Rn)); \
int negflag = 1<<23; \
intptr_t offs = (_offs); \
if (_offs < 0) { \
@ -828,6 +880,9 @@ enum {
asm_output("fstd %s,%s(%d)", gpn(_Dd), gpn(_Rn), _offs); \
} while (0)
#define FSTD(_Dd,_Rn,_offs) \
FSTD_allowD7(_Dd,_Rn,_offs,0)
#define FLDD_chk(_Dd,_Rn,_offs,_chk) do { \
if(_chk) underrunProtect(4); \
NanoAssert(ARM_VFP); \
@ -844,31 +899,14 @@ enum {
     } while (0)

 #define FLDD(_Dd,_Rn,_offs) FLDD_chk(_Dd,_Rn,_offs,1)

-#define FSITOD(_Dd,_Sm) do {                                    \
-        underrunProtect(4);                                     \
-        NanoAssert(ARM_VFP);                                    \
-        NanoAssert(IsFpReg(_Dd) && ((_Sm) == FpSingleScratch)); \
-        *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2F<<6) | (0<<5) | (0x7) ); \
-        asm_output("fsitod %s,%s", gpn(_Dd), gpn(_Sm)); \
-    } while (0)
-
 #define FUITOD(_Dd,_Sm) do {                                    \
         underrunProtect(4);                                     \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(IsFpReg(_Dd) && ((_Sm) == FpSingleScratch)); \
+        NanoAssert(IsFpReg(_Dd) && ((_Sm) == S14));             \
         *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2D<<6) | (0<<5) | (0x7) ); \
         asm_output("fuitod %s,%s", gpn(_Dd), gpn(_Sm)); \
     } while (0)

-#define FMSR(_Sn,_Rd) do {                                      \
-        underrunProtect(4);                                     \
-        NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sn) == FpSingleScratch) && IsGpReg(_Rd)); \
-        *(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
-        asm_output("fmsr %s,%s", gpn(_Sn), gpn(_Rd)); \
-    } while (0)
-
#define FNEGD(_Dd,_Dm) do { \
underrunProtect(4); \
NanoAssert(ARM_VFP); \
@ -936,17 +974,106 @@ enum {
 #define FMRS(_Rd,_Sn) do {                                      \
         underrunProtect(4);                                     \
         NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sn) == FpSingleScratch) && IsGpReg(_Rd)); \
+        NanoAssert(((_Sn) == S14) && IsGpReg(_Rd));             \
         *(--_nIns) = (NIns)( COND_AL | (0xE1<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
         asm_output("fmrs %s,%s", gpn(_Rd), gpn(_Sn)); \
     } while (0)

-#define FTOSID(_Sd,_Dm) do {                                    \
-        underrunProtect(4);                                     \
-        NanoAssert(ARM_VFP);                                    \
-        NanoAssert(((_Sd) == FpSingleScratch) && IsFpReg(_Dm)); \
-        *(--_nIns) = (NIns)( COND_AL | (0xEBD<<16) | (0x7<<12) | (0xB4<<4) | FpRegNum(_Dm) ); \
-        asm_output("ftosid %s,%s", gpn(_Sd), gpn(_Dm)); \
-    } while (0)
+/*
+ * The following instructions can only be used with S14 as the
+ * single-precision register; that limitation can be removed if
+ * needed, but we'd have to teach NJ about all the single precision
+ * regs, and their encoding is strange (top 4 bits usually in a block,
+ * low bit elsewhere).
+ */
+
+#define FSITOD(_Dd,_Sm) do {                                    \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(IsFpReg(_Dd) && ((_Sm) == S14));             \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB8<<16) | (FpRegNum(_Dd)<<12) | (0x2F<<6) | (0<<5) | (0x7) ); \
+        asm_output("fsitod %s,%s", gpn(_Dd), gpn(_Sm)); \
+    } while (0)
+
+#define FMSR(_Sn,_Rd) do {                                      \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(((_Sn) == S14) && IsGpReg(_Rd));             \
+        *(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
+        asm_output("fmsr %s,%s", gpn(_Sn), gpn(_Rd)); \
+    } while (0)
+
+#define FMRS(_Rd,_Sn) do {                                      \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(((_Sn) == S14) && IsGpReg(_Rd));             \
+        *(--_nIns) = (NIns)( COND_AL | (0xE1<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
+        asm_output("fmrs %s,%s", gpn(_Rd), gpn(_Sn)); \
+    } while (0)
+
+#define FMSR(_Sn,_Rd) do {                                      \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(((_Sn) == S14) && IsGpReg(_Rd));             \
+        *(--_nIns) = (NIns)( COND_AL | (0xE0<<20) | (0x7<<16) | ((_Rd)<<12) | (0xA<<8) | (0<<7) | (0x1<<4) ); \
+        asm_output("fmsr %s,%s", gpn(_Sn), gpn(_Rd)); \
+    } while (0)
+
+#define FCVTSD(_Sd,_Dm) do {                                    \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(((_Sd) == S14) && IsFpReg(_Dm));             \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB7<<16) | (0x7<<12) | (0xBC<<4) | (FpRegNum(_Dm)) ); \
+        asm_output("[0x%08x] fcvtsd s14,%s", *_nIns, gpn(_Dm)); \
+    } while (0)
+
+#define FCVTDS_allowD7(_Dd,_Sm,_allowD7) do {                   \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(((_Sm) == S14) && (IsFpReg(_Dd) || ((_allowD7) && (_Dd) == D7))); \
+        *(--_nIns) = (NIns)( COND_AL | (0xEB7<<16) | (FpRegNum(_Dd)<<12) | (0xAC<<4) | (0x7) ); \
+        asm_output("[0x%08x] fcvtds %s,s14", *_nIns, gpn(_Dd)); \
+    } while(0)
+
+#define FCVTDS(_Dd,_Sm) \
+    FCVTDS_allowD7(_Dd,_Sm,0)
+
+#define FLDS(_Sd,_Rn,_offs) do {                                \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(((_Sd) == S14) && !IsFpReg(_Rn));            \
+        NanoAssert((((_offs) & 3) == 0) && isS8((_offs) >> 2)); \
+        int addflag = 1<<23;                                    \
+        intptr_t offs = (_offs);                                \
+        if (offs < 0) {                                         \
+            addflag = 0;                                        \
+            offs = -offs;                                       \
+        }                                                       \
+        *(--_nIns) = (NIns)( COND_AL | (0xD1<<20) | ((_Rn)<<16) | (0x7<<12) | (0xA << 8) | addflag | ((offs>>2)&0xff) ); \
+        asm_output("[0x%08x] flds s14, [%s, #%d]", *_nIns, gpn(_Rn), (_offs)); \
+    } while (0)
+
+#define FSTS(_Sd,_Rn,_offs) do {                                \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(((_Sd) == S14) && !IsFpReg(_Rn));            \
+        NanoAssert((((_offs) & 3) == 0) && isS8((_offs) >> 2)); \
+        int addflag = 1<<23;                                    \
+        intptr_t offs = (_offs);                                \
+        if (offs < 0) {                                         \
+            addflag = 0;                                        \
+            offs = -offs;                                       \
+        }                                                       \
+        *(--_nIns) = (NIns)( COND_AL | (0xD0<<20) | ((_Rn)<<16) | (0x7<<12) | (0xA << 8) | addflag | ((offs>>2)&0xff) ); \
+        asm_output("[0x%08x] fsts s14, [%s, #%d]", *_nIns, gpn(_Rn), (_offs)); \
+    } while (0)
+
+#define FTOSID(_Sd,_Dm) do {                                    \
+        underrunProtect(4);                                     \
+        NanoAssert(ARM_VFP);                                    \
+        NanoAssert(((_Sd) == S14) && IsFpReg(_Dm));             \
+        *(--_nIns) = (NIns)( COND_AL | (0xEBD<<16) | (0x7<<12) | (0xB4<<4) | FpRegNum(_Dm) ); \
+        asm_output("ftosid s14, %s", gpn(_Dm)); \
+    } while (0)
} // namespace nanojit
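
A closing note on the FLDS/FSTS (and FSTD/FLDD) offset checks: the VFP
encoding holds an unsigned 8-bit word count plus an add/subtract bit, so
the hardware can reach ±1020 bytes, while the isS8((_offs) >> 2) assertions
conservatively accept only about ±508. Callers therefore route larger or
unaligned displacements through IP, as asm_load64 does:

    // Pattern from the callers (copied from the ld32f path above):
    if (!isS8(offset >> 2) || (offset&3) != 0) {
        FLDS(S14, IP, 0);             // emitted first, executes second
        asm_add_imm(IP, rb, offset);  // emitted second, executes first
    } else {
        FLDS(S14, rb, offset);
    }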