irjit: Add flag to prefer Vec4, use for add/sub.

This lets the IR frontend emit Vec4 ops (with temps and blends) only where the backend can use SIMD, and uses that path for 3-lane add/sub.
2026-03-10 12:43:04 -07:00 · 2023-08-13 13:52:45 -07:00
parent e0be6858b8
commit 2e6dbab5fa
4 changed files with 32 additions and 12 deletions
@@ -336,7 +336,7 @@ namespace MIPSComp {
 		if (js.prefixD == 0)
 			return;

-		if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0) {
+		if (IsVec4(sz, regs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {
 			// Use temps for all, we'll blend in the end (keeping in Vec4.)
 			for (int i = 0; i < 4; ++i)
 				regs[i] = IRVTEMP_PFX_D + i;
@@ -378,7 +378,7 @@ namespace MIPSComp {
 	}

 	void IRFrontend::ApplyPrefixDMask(u8 *vregs, VectorSize sz, int vectorReg) {
-		if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0) {
+		if (IsVec4(sz, vregs) && js.VfpuWriteMask() != 0 && opts.preferVec4) {
 			u8 origV[4];
 			GetVectorRegs(origV, sz, vectorReg);

@@ -815,7 +815,7 @@ namespace MIPSComp {
 		}

 		// If all three are consecutive 4, we're safe regardless of if we use temps so we should not check that here.
-		if (allowSIMD && IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
+		if (allowSIMD) {
 			IROp opFunc = IROp::Nop;
 			switch (type) {
 			case VecDo3Op::VADD: // d[i] = s[i] + t[i]; break; //vadd
@@ -835,13 +835,24 @@ namespace MIPSComp {
 				break;
 			}

-			if (opFunc != IROp::Nop) {
-				ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);
-			} else {
-				DISABLE;
+			if (IsVec4(sz, dregs) && IsVec4(sz, sregs) && IsVec4(sz, tregs)) {
+				if (opFunc != IROp::Nop) {
+					ir.Write(opFunc, dregs[0], sregs[0], tregs[0]);
+				} else {
+					DISABLE;
+				}
+				ApplyPrefixD(dregs, sz, _VD);
+				return;
+			} else if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
+				// This is actually pretty common.  Use a temp + blend.
+				// We could post-process this, but it's easier to do it here.
+				if (opFunc == IROp::Nop)
+					DISABLE;
+				ir.Write(opFunc, IRVTEMP_0, sregs[0], tregs[0]);
+				ir.Write({ IROp::Vec4Blend, dregs[0], dregs[0], IRVTEMP_0, 0x7 });
+				ApplyPrefixD(dregs, sz, _VD);
+				return;
 			}
-			ApplyPrefixD(dregs, sz, _VD);
-			return;
 		}

 		if (type == VecDo3Op::VSGE || type == VecDo3Op::VSLT) {
@@ -1647,7 +1658,7 @@ namespace MIPSComp {
 		GetVectorRegsPrefixT(tregs, sz, _VT);
 		GetVectorRegsPrefixD(dregs, sz, _VD);

-		if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs)) {
+		if (IsVec3of4(sz, dregs) && IsVec3of4(sz, sregs) && IsVec3of4(sz, tregs) && opts.preferVec4) {
 			// Use Vec4 where we can.  First, apply shuffles.
 			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_S, sregs[0], VFPU_SWIZZLE(1, 2, 0, 3));
 			ir.Write(IROp::Vec4Shuffle, IRVTEMP_PFX_T, tregs[0], VFPU_SWIZZLE(2, 0, 1, 3));
@@ -385,6 +385,7 @@ private:
 struct IROptions {  // Settings that IRJit fills in and passes to the frontend via SetOptions().
 	uint32_t disableFlags;  // Copied from g_Config.uJitDisableFlags; tested against JitDisable bits.
 	bool unalignedLoadStore;  // Forced false on RISC-V; elsewhere true unless LSU_UNALIGNED is disabled.
+	bool preferVec4;  // When set, write-masked Vec4 and 3-of-4 lane ops are emitted as Vec4 plus temps/blend.
 };

 const IRMeta *GetIRMeta(IROp op);
@@ -50,9 +50,16 @@ IRJit::IRJit(MIPSState *mipsState) : frontend_(mipsState->HasDefaultPrefix()), m

 	IROptions opts{};
 	opts.disableFlags = g_Config.uJitDisableFlags;
-	// Assume that RISC-V always has very slow unaligned memory accesses.
-#if !PPSSPP_ARCH(RISCV64)
+#if PPSSPP_ARCH(RISCV64)
+	// Assume RISC-V always has very slow unaligned memory accesses.
+	opts.unalignedLoadStore = false;
+	opts.preferVec4 = cpu_info.RiscV_V;
+#elif PPSSPP_ARCH(ARM)
 	opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0;
+	opts.preferVec4 = cpu_info.bASIMD || cpu_info.bNEON;
+#else
+	opts.unalignedLoadStore = (opts.disableFlags & (uint32_t)JitDisable::LSU_UNALIGNED) == 0;
+	opts.preferVec4 = true;
 #endif
 	frontend_.SetOptions(opts);
 }