Submitting on behalf of Refik.Karic

ISPC templates integration #rb alex.mcadams, jeff.rous [CL 31018238 by alex mcadams in ue5-main branch]
2026-03-26 18:15:20 -07:00 · 2024-01-30 15:20:51 -05:00
parent 3c9e10eb06
commit 85516efd79
7 changed files with 1338 additions and 856 deletions
--- a/Engine/Source/Runtime/Core/Public/Math/Aossoa.isph
+++ b/Engine/Source/Runtime/Core/Public/Math/Aossoa.isph
--- a/Engine/Source/Runtime/Core/Public/Math/Matrix.isph
+++ b/Engine/Source/Runtime/Core/Public/Math/Matrix.isph
@@ -176,14 +176,8 @@ inline uniform FVector MatrixGetOrigin(const uniform FMatrix &M)
 	return SetVector(M.M[12], M.M[13], M.M[14]);
 }

-inline void MatrixGetScaledAxes(const uniform FMatrix44d &M, uniform FVector3d &X, uniform FVector3d &Y, uniform FVector3d &Z)
-{
-	X = SetVector(M.M[0], M.M[1], M.M[2]);
-	Y = SetVector(M.M[4], M.M[5], M.M[6]);
-	Z = SetVector(M.M[8], M.M[9], M.M[10]);
-}
-
-inline void MatrixGetScaledAxes(const uniform FMatrix44f &M, uniform FVector3f &X, uniform FVector3f &Y, uniform FVector3f &Z)
+template <typename T, typename V>
+inline void MatrixGetScaledAxes(const T& M, V &X, V &Y, V &Z)
 {
 	X = SetVector(M.M[0], M.M[1], M.M[2]);
 	Y = SetVector(M.M[4], M.M[5], M.M[6]);
@@ -271,41 +265,27 @@ inline uniform FMatrix MatrixTranspose(const uniform FMatrix& M)
 // we use __m128 to represent 2x2 matrix as A = | A0  A1 |
 //                                              | A2  A3 |
 // 2x2 row major Matrix multiply A*B
-static inline uniform FVector4d Mat2Mul(const uniform FVector4d& vec1, const uniform FVector4d& vec2)
-{
-	return
-		VectorAdd(VectorMultiply( vec1, VectorSwizzle(vec2, 0,3,0,3)),
-			VectorMultiply(VectorSwizzle(vec1, 1,0,3,2), VectorSwizzle(vec2, 2,1,2,1)));
-}
-static inline uniform FVector4f Mat2Mul(const uniform FVector4f& vec1, const uniform FVector4f& vec2)
+template <typename T> // T: vector type packing a 2x2 matrix as lanes (A0, A1, A2, A3); the overloads this replaces took FVector4f and FVector4d
+static inline uniform T Mat2Mul(const uniform T& vec1, const uniform T& vec2) // returns the row-major 2x2 product vec1 * vec2 in the same packed layout
 {
 	return
 		VectorAdd(VectorMultiply( vec1, VectorSwizzle(vec2, 0,3,0,3)), // (A0*B0, A1*B3, A2*B0, A3*B3), assuming VectorSwizzle(v, i,j,k,l) yields (v[i], v[j], v[k], v[l]) - TODO confirm
 			VectorMultiply(VectorSwizzle(vec1, 1,0,3,2), VectorSwizzle(vec2, 2,1,2,1))); // + (A1*B2, A0*B1, A3*B2, A2*B1)
 }
+
 // 2x2 row major Matrix adjugate multiply (A#)*B
-static inline uniform FVector4d Mat2AdjMul(const uniform FVector4d& vec1, const uniform FVector4d& vec2)
+template <typename T> // T: vector type packing a 2x2 matrix as lanes (A0, A1, A2, A3); the overloads this replaces took FVector4d and FVector4f
+static inline uniform T Mat2AdjMul(const uniform T& vec1, const uniform T& vec2) // returns adj(vec1) * vec2, where adj(A) = | A3 -A1 | / | -A2 A0 |
 {
 	return
 		VectorSubtract(VectorMultiply(VectorSwizzle(vec1, 3,3,0,0), vec2), // (A3*B0, A3*B1, A0*B2, A0*B3), assuming VectorSwizzle(v, i,j,k,l) yields (v[i], v[j], v[k], v[l]) - TODO confirm
 			VectorMultiply(VectorSwizzle(vec1, 1,1,2,2), VectorSwizzle(vec2, 2,3,0,1))); // - (A1*B2, A1*B3, A2*B0, A2*B1)

 }
-static inline uniform FVector4f Mat2AdjMul(const uniform FVector4f& vec1, const uniform FVector4f& vec2)
-{
-	return
-		VectorSubtract(VectorMultiply(VectorSwizzle(vec1, 3,3,0,0), vec2),
-			VectorMultiply(VectorSwizzle(vec1, 1,1,2,2), VectorSwizzle(vec2, 2,3,0,1)));

-}
 // 2x2 row major Matrix multiply adjugate A*(B#)
-static inline uniform FVector4d Mat2MulAdj(const uniform FVector4d& vec1, const uniform FVector4d& vec2)
-{
-	return
-		VectorSubtract(VectorMultiply( vec1, VectorSwizzle(vec2, 3,0,3,0)),
-			VectorMultiply(VectorSwizzle(vec1, 1,0,3,2), VectorSwizzle(vec2, 2,1,2,1)));
-}
-static inline uniform FVector4f Mat2MulAdj(const uniform FVector4f& vec1, const uniform FVector4f& vec2)
+template <typename T>
+static inline uniform T Mat2MulAdj(const uniform T& vec1, const uniform T& vec2)
 {
 	return
 		VectorSubtract(VectorMultiply( vec1, VectorSwizzle(vec2, 3,0,3,0)),
@@ -572,57 +552,10 @@ inline uniform FMatrix44f MatrixInverse(const uniform FMatrix44f& M)
 	return Result;
 }

-inline uniform FVector4d VectorTransformVector(const uniform FVector4d &VecP, const uniform FMatrix44d &M)
+template <typename T, typename V>
+inline T VectorTransformVector(const T& VecP, const V& M)
 {
-	uniform FVector4d VTempX, VTempY, VTempZ, VTempW;
-
-	// Splat x,y,z and w
-	VTempX = VectorReplicate(VecP, 0);
-	VTempY = VectorReplicate(VecP, 1);
-	VTempZ = VectorReplicate(VecP, 2);
-	VTempW = VectorReplicate(VecP, 3);
-
-	// Mul by the matrix
-	VTempX = VectorMultiply(VTempX, SetVector4(M.M[0], M.M[1], M.M[2], M.M[3]));
-	VTempY = VectorMultiply(VTempY, SetVector4(M.M[4], M.M[5], M.M[6], M.M[7]));
-	VTempZ = VectorMultiply(VTempZ, SetVector4(M.M[8], M.M[9], M.M[10], M.M[11]));
-	VTempW = VectorMultiply(VTempW, SetVector4(M.M[12], M.M[13], M.M[14], M.M[15]));
-
-	// Add them all together
-	VTempX = VectorAdd(VTempX, VTempY);
-	VTempZ = VectorAdd(VTempZ, VTempW);
-	VTempX = VectorAdd(VTempX, VTempZ);
-
-	return VTempX;
-}
-
-inline uniform FVector4f VectorTransformVector(const uniform FVector4f &VecP, const uniform FMatrix44f &M)
-{
-	uniform FVector4f VTempX, VTempY, VTempZ, VTempW;
-
-	// Splat x,y,z and w
-	VTempX = VectorReplicate(VecP, 0);
-	VTempY = VectorReplicate(VecP, 1);
-	VTempZ = VectorReplicate(VecP, 2);
-	VTempW = VectorReplicate(VecP, 3);
-
-	// Mul by the matrix
-	VTempX = VectorMultiply(VTempX, SetVector4(M.M[0], M.M[1], M.M[2], M.M[3]));
-	VTempY = VectorMultiply(VTempY, SetVector4(M.M[4], M.M[5], M.M[6], M.M[7]));
-	VTempZ = VectorMultiply(VTempZ, SetVector4(M.M[8], M.M[9], M.M[10], M.M[11]));
-	VTempW = VectorMultiply(VTempW, SetVector4(M.M[12], M.M[13], M.M[14], M.M[15]));
-
-	// Add them all together
-	VTempX = VectorAdd(VTempX, VTempY);
-	VTempZ = VectorAdd(VTempZ, VTempW);
-	VTempX = VectorAdd(VTempX, VTempZ);
-
-	return VTempX;
-}
-
-inline FVector4 VectorTransformVector(const FVector4 &VecP, const FMatrix &M)
-{
-	FVector4 VTempX, VTempY, VTempZ, VTempW;
+	T VTempX, VTempY, VTempZ, VTempW;

 	// Splat x,y,z and w
 	VTempX = VectorReplicate(VecP, 0);
@@ -715,6 +648,30 @@ inline FVector3f MatrixTransformPosition(const FVector3f &P, const uniform FMatr
 	return VTempX;
 }

+// Calculate homogeneous transform of a uniform position by a uniform matrix, W component assumed to be 1.0. Returns P.V[0]*M.M[0..2] + P.V[1]*M.M[4..6] + P.V[2]*M.M[8..10] + M.M[12..14].
+inline uniform FVector3f MatrixTransformPosition(const uniform FVector3f& P, const uniform FMatrix44f& M)
+{
+	uniform FVector3f VTempX, VTempY, VTempZ;
+
+	// Splat x,y,z: each temporary holds one component of P repeated three times
+	VTempX = SetVector(P.V[0], P.V[0], P.V[0]);
+	VTempY = SetVector(P.V[1], P.V[1], P.V[1]);
+	VTempZ = SetVector(P.V[2], P.V[2], P.V[2]);
+
+	// Mul by the matrix: scale the first three elements of M.M[0..3], M.M[4..7] and M.M[8..11] by x, y and z respectively
+	VTempX = VTempX * SetVector(M.M[0], M.M[1], M.M[2]);
+	VTempY = VTempY * SetVector(M.M[4], M.M[5], M.M[6]);
+	VTempZ = VTempZ * SetVector(M.M[8], M.M[9], M.M[10]);
+	const uniform FVector3f VTempW = SetVector(M.M[12], M.M[13], M.M[14]); // origin terms (same elements MatrixGetOrigin reads) are added unscaled because W is taken as 1.0
+
+	// Add them all together as (X + Y) + (Z + W)
+	VTempX = VTempX + VTempY;
+	VTempZ = VTempZ + VTempW;
+	VTempX = VTempX + VTempZ;
+
+	return VTempX;
+}
+
 // Calculate homogeneous transform. W component assumed to be 0.0
 inline FVector MatrixTransformVector(const FVector &P, const FMatrix &M)
 {
@@ -746,17 +703,8 @@ inline uniform FVector3f MatrixInverseTransformVector(const uniform FMatrix44f &
 	return SetVector(VectorTransformVector(SetVector4(V, FLOAT_ZERO), InvSelf));
 }

-inline uniform FMatrix44d MatrixReduceAdd(const varying FMatrix44d &M)
-{
-	return SetMatrix(
-		SetVector4(reduce_add(M.M[0]), reduce_add(M.M[1]), reduce_add(M.M[2]), reduce_add(M.M[3])),
-		SetVector4(reduce_add(M.M[4]), reduce_add(M.M[5]), reduce_add(M.M[6]), reduce_add(M.M[7])),
-		SetVector4(reduce_add(M.M[8]), reduce_add(M.M[9]), reduce_add(M.M[10]), reduce_add(M.M[11])),
-		SetVector4(reduce_add(M.M[12]), reduce_add(M.M[13]), reduce_add(M.M[14]), reduce_add(M.M[15]))
-		);
-}
-
-inline uniform FMatrix44f MatrixReduceAdd(const varying FMatrix44f &M)
+template <typename T>
+inline uniform T MatrixReduceAdd(const varying T& M)
 {
 	return SetMatrix(
 		SetVector4(reduce_add(M.M[0]), reduce_add(M.M[1]), reduce_add(M.M[2]), reduce_add(M.M[3])),
--- a/Engine/Source/Runtime/Core/Public/Math/Quat.isph
+++ b/Engine/Source/Runtime/Core/Public/Math/Quat.isph
@@ -80,19 +80,10 @@ inline uniform FVector4 MatrixToQuat(const uniform FMatrix &M)
 * @param Quat2	Pointer to the second quaternion
 * @return Quat1 * Quat2
 */
-inline FVector4 VectorQuaternionMultiply2( const FVector4& Quat1, const FVector4& Quat2 )
+template <typename T>
+inline T VectorQuaternionMultiply2(const T& Quat1, const T& Quat2)
 {
-	FVector4 Result = VectorReplicate(Quat1, 3) * Quat2;
-	Result = VectorMultiplyAdd((VectorReplicate(Quat1, 0) * VectorSwizzle(Quat2, 3,2,1,0)), QMULTI_SIGN_MASK0, Result);
-	Result = VectorMultiplyAdd((VectorReplicate(Quat1, 1) * VectorSwizzle(Quat2, 2,3,0,1)), QMULTI_SIGN_MASK1, Result);
-	Result = VectorMultiplyAdd((VectorReplicate(Quat1, 2) * VectorSwizzle(Quat2, 1,0,3,2)), QMULTI_SIGN_MASK2, Result);
-
-	return Result;
-}
-
-inline uniform FVector4 VectorQuaternionMultiply2( const uniform FVector4& Quat1, const uniform FVector4& Quat2 )
-{
-	uniform FVector4 Result = VectorReplicate(Quat1, 3) * Quat2;
+	T Result = VectorReplicate(Quat1, 3) * Quat2;
 	Result = VectorMultiplyAdd((VectorReplicate(Quat1, 0) * VectorSwizzle(Quat2, 3,2,1,0)), QMULTI_SIGN_MASK0, Result);
 	Result = VectorMultiplyAdd((VectorReplicate(Quat1, 1) * VectorSwizzle(Quat2, 2,3,0,1)), QMULTI_SIGN_MASK1, Result);
 	Result = VectorMultiplyAdd((VectorReplicate(Quat1, 2) * VectorSwizzle(Quat2, 1,0,3,2)), QMULTI_SIGN_MASK2, Result);
@@ -131,28 +122,13 @@ inline uniform FVector4f QuatInverse(const uniform FVector4f &Quat)
 	return Quat * FLOAT_QINV_SIGN_MASK;
 }

-inline FVector4d QuatFastLerp(const FVector4d& A, const FVector4d& B, const double Alpha)
+template <typename T, typename F> // T: quaternion vector type, F: scalar type; the overloads this replaces paired FVector4d/double, FVector4f/float and uniform FVector4/uniform FReal
+inline T QuatFastLerp(const T& A, const T& B, const F Alpha) // returns B*Alpha + A*(+/-(1 - Alpha)); no normalization is applied in this function
 {
 	// To ensure the 'shortest route', we make sure the dot product between both rotations is positive.
-	const double DotResult = VectorDot(A, B);
-	const double Bias = select(DotResult >= 0.d, 1.d, -1.d);
-	return (B * Alpha) + (A * (Bias * (1.d - Alpha)));
-}
-
-inline FVector4f QuatFastLerp(const FVector4f& A, const FVector4f& B, const float Alpha)
-{
-	// To ensure the 'shortest route', we make sure the dot product between the both rotations is positive.
-	const float DotResult = VectorDot(A, B);
-	const float Bias = select(DotResult >= 0.f, 1.f, -1.f);
-	return (B * Alpha) + (A * (Bias * (1.f - Alpha)));
-}
-
-inline uniform FVector4 QuatFastLerp(const uniform FVector4& A, const uniform FVector4& B, const uniform FReal Alpha)
-{
-	// To ensure the 'shortest route', we make sure the dot product between the both rotations is positive.
-	const uniform FReal DotResult = VectorDot(A, B);
-	const uniform FReal Bias = select(DotResult >= ZERO, ONE, -ONE);
-	return (B * Alpha) + (A * (Bias * (ONE - Alpha)));
+	const F DotResult = VectorDot(A, B);
+	const F Bias = select(DotResult >= 0, 1, -1); // +1 when the dot product is >= 0, otherwise -1; integer literals replace the typed 0.d/0.f/ZERO constants of the removed overloads
+	return (B * Alpha) + (A * (Bias * (1 - Alpha))); // A's weight (1 - Alpha) is negated when Bias is -1
 }

 // A and B are quaternions.  The result is A + (A.B >= 0 ? 1 : -1) * B
@@ -176,7 +152,8 @@ inline uniform FVector4 VectorAccumulateQuaternionShortestPath(const uniform FVe
 * @param VectorW0 Vector to rotate. W component must be zero.
 * @return Vector after rotation by Quat.
 */
-inline FVector4d VectorQuaternionRotateVector(const FVector4d& Quat, const FVector4d& VectorW0)
+template<typename T, typename V>
+inline V VectorQuaternionRotateVector(const T& Quat, const V& VectorW0)
 {
 	// Q * V * Q.Inverse
 	//const VectorRegister InverseRotation = VectorQuaternionInverse(Quat);
@@ -191,136 +168,17 @@ inline FVector4d VectorQuaternionRotateVector(const FVector4d& Quat, const FVect
 	// T = 2(Q x V);
 	// V' = V + w*(T) + (Q x T)

-	const FVector4d QW = VectorReplicate(Quat, 3);
-	FVector4d T = VectorCross(Quat, VectorW0);
-	T = VectorAdd(T, T);
-	const FVector4d VTemp0 = VectorMultiplyAdd(QW, T, VectorW0);
-	const FVector4d VTemp1 = VectorCross(Quat, T);
-	const FVector4d Rotated = VectorAdd(VTemp0, VTemp1);
+	const V QW = VectorReplicate(Quat, 3);
+	V Q = VectorCross(Quat, VectorW0);
+	Q = VectorAdd(Q, Q);
+	const V VTemp0 = VectorMultiplyAdd(QW, Q, VectorW0);
+	const V VTemp1 = VectorCross(Quat, Q);
+	const V Rotated = VectorAdd(VTemp0, VTemp1);
 	return Rotated;
 }

-inline FVector4f VectorQuaternionRotateVector(const FVector4f& Quat, const FVector4f& VectorW0)
-{
-	// Q * V * Q.Inverse
-	//const VectorRegister InverseRotation = VectorQuaternionInverse(Quat);
-	//const VectorRegister Temp = VectorQuaternionMultiply2(Quat, VectorW0);
-	//const VectorRegister Rotated = VectorQuaternionMultiply2(Temp, InverseRotation);
-
-	// Equivalence of above can be shown to be:
-	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
-	// V' = V + 2w(Q x V) + (2Q x (Q x V))
-	// refactor:
-	// V' = V + w(2(Q x V)) + (Q x (2(Q x V)))
-	// T = 2(Q x V);
-	// V' = V + w*(T) + (Q x T)
-
-	const FVector4f QW = VectorReplicate(Quat, 3);
-	FVector4f T = VectorCross(Quat, VectorW0);
-	T = VectorAdd(T, T);
-	const FVector4f VTemp0 = VectorMultiplyAdd(QW, T, VectorW0);
-	const FVector4f VTemp1 = VectorCross(Quat, T);
-	const FVector4f Rotated = VectorAdd(VTemp0, VTemp1);
-	return Rotated;
-}
-
-inline FVector4 VectorQuaternionRotateVector(const uniform FVector4& Quat, const FVector4& VectorW0)
-{
-	// Q * V * Q.Inverse
-	//const VectorRegister InverseRotation = VectorQuaternionInverse(Quat);
-	//const VectorRegister Temp = VectorQuaternionMultiply2(Quat, VectorW0);
-	//const VectorRegister Rotated = VectorQuaternionMultiply2(Temp, InverseRotation);
-
-	// Equivalence of above can be shown to be:
-	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
-	// V' = V + 2w(Q x V) + (2Q x (Q x V))
-	// refactor:
-	// V' = V + w(2(Q x V)) + (Q x (2(Q x V)))
-	// T = 2(Q x V);
-	// V' = V + w*(T) + (Q x T)
-
-	const uniform FVector4 QW = VectorReplicate(Quat, 3);
-	FVector4 T = VectorCross(Quat, VectorW0);
-	T = VectorAdd(T, T);
-	const FVector4 VTemp0 = VectorMultiplyAdd(QW, T, VectorW0);
-	const FVector4 VTemp1 = VectorCross(Quat, T);
-	const FVector4 Rotated = VectorAdd(VTemp0, VTemp1);
-	return Rotated;
-}
-
-inline FVector4f VectorQuaternionRotateVector(const uniform FVector4f& Quat, const FVector4f& VectorW0)
-{
-	// Q * V * Q.Inverse
-	//const VectorRegister InverseRotation = VectorQuaternionInverse(Quat);
-	//const VectorRegister Temp = VectorQuaternionMultiply2(Quat, VectorW0);
-	//const VectorRegister Rotated = VectorQuaternionMultiply2(Temp, InverseRotation);
-
-	// Equivalence of above can be shown to be:
-	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
-	// V' = V + 2w(Q x V) + (2Q x (Q x V))
-	// refactor:
-	// V' = V + w(2(Q x V)) + (Q x (2(Q x V)))
-	// T = 2(Q x V);
-	// V' = V + w*(T) + (Q x T)
-
-	const uniform FVector4f QW = VectorReplicate(Quat, 3);
-	FVector4f T = VectorCross(Quat, VectorW0);
-	T = VectorAdd(T, T);
-	const FVector4f VTemp0 = VectorMultiplyAdd(QW, T, VectorW0);
-	const FVector4f VTemp1 = VectorCross(Quat, T);
-	const FVector4f Rotated = VectorAdd(VTemp0, VTemp1);
-	return Rotated;
-}
-
-inline uniform FVector4d VectorQuaternionRotateVector(const uniform FVector4d& Quat, const uniform FVector4d& VectorW0)
-{
-	// Q * V * Q.Inverse
-	//const VectorRegister InverseRotation = VectorQuaternionInverse(Quat);
-	//const VectorRegister Temp = VectorQuaternionMultiply2(Quat, VectorW0);
-	//const VectorRegister Rotated = VectorQuaternionMultiply2(Temp, InverseRotation);
-
-	// Equivalence of above can be shown to be:
-	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
-	// V' = V + 2w(Q x V) + (2Q x (Q x V))
-	// refactor:
-	// V' = V + w(2(Q x V)) + (Q x (2(Q x V)))
-	// T = 2(Q x V);
-	// V' = V + w*(T) + (Q x T)
-
-	const uniform FVector4d QW = VectorReplicate(Quat, 3);
-	uniform FVector4d T = VectorCross(Quat, VectorW0);
-	T = VectorAdd(T, T);
-	const uniform FVector4d VTemp0 = VectorMultiplyAdd(QW, T, VectorW0);
-	const uniform FVector4d VTemp1 = VectorCross(Quat, T);
-	const uniform FVector4d Rotated = VectorAdd(VTemp0, VTemp1);
-	return Rotated;
-}
-
-inline uniform FVector4f VectorQuaternionRotateVector(const uniform FVector4f& Quat, const uniform FVector4f& VectorW0)
-{
-	// Q * V * Q.Inverse
-	//const VectorRegister InverseRotation = VectorQuaternionInverse(Quat);
-	//const VectorRegister Temp = VectorQuaternionMultiply2(Quat, VectorW0);
-	//const VectorRegister Rotated = VectorQuaternionMultiply2(Temp, InverseRotation);
-
-	// Equivalence of above can be shown to be:
-	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
-	// V' = V + 2w(Q x V) + (2Q x (Q x V))
-	// refactor:
-	// V' = V + w(2(Q x V)) + (Q x (2(Q x V)))
-	// T = 2(Q x V);
-	// V' = V + w*(T) + (Q x T)
-
-	const uniform FVector4f QW = VectorReplicate(Quat, 3);
-	uniform FVector4f T = VectorCross(Quat, VectorW0);
-	T = VectorAdd(T, T);
-	const uniform FVector4f VTemp0 = VectorMultiplyAdd(QW, T, VectorW0);
-	const uniform FVector4f VTemp1 = VectorCross(Quat, T);
-	const uniform FVector4f Rotated = VectorAdd(VTemp0, VTemp1);
-	return Rotated;
-}
-
-inline uniform FVector VectorQuaternionRotateVector(const uniform FVector4& Quat, const uniform FVector& V)
+template<>
+inline uniform FVector VectorQuaternionRotateVector<uniform FVector4, uniform FVector>(const uniform FVector4& Quat, const uniform FVector& V)
 {
 	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
 	// V' = V + 2w(Q x V) + (2Q x (Q x V))
@@ -335,7 +193,8 @@ inline uniform FVector VectorQuaternionRotateVector(const uniform FVector4& Quat
 	return Result;
 }

-inline FVector3d VectorQuaternionRotateVector(const FVector4d& Quat, const FVector3d& V)
+template<>
+inline FVector3d VectorQuaternionRotateVector<FVector4d, FVector3d>(const FVector4d& Quat, const FVector3d& V)
 {
 	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
 	// V' = V + 2w(Q x V) + (2Q x (Q x V))
@@ -350,7 +209,8 @@ inline FVector3d VectorQuaternionRotateVector(const FVector4d& Quat, const FVect
 	return Result;
 }

-inline FVector3f VectorQuaternionRotateVector(const FVector4f& Quat, const FVector3f& V)
+template<>
+inline FVector3f VectorQuaternionRotateVector<FVector4f, FVector3f>(const FVector4f& Quat, const FVector3f& V)
 {
 	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
 	// V' = V + 2w(Q x V) + (2Q x (Q x V))
@@ -365,7 +225,8 @@ inline FVector3f VectorQuaternionRotateVector(const FVector4f& Quat, const FVect
 	return Result;
 }

-inline FVector3d VectorQuaternionRotateVector(const uniform FVector4d& Quat, const FVector3d& V)
+template<>
+inline FVector3d VectorQuaternionRotateVector<uniform FVector4d, FVector3d>(const uniform FVector4d& Quat, const FVector3d& V)
 {
 	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
 	// V' = V + 2w(Q x V) + (2Q x (Q x V))
@@ -380,7 +241,8 @@ inline FVector3d VectorQuaternionRotateVector(const uniform FVector4d& Quat, con
 	return Result;
 }

-inline FVector3f VectorQuaternionRotateVector(const uniform FVector4f& Quat, const FVector3f& V)
+template<>
+inline FVector3f VectorQuaternionRotateVector<uniform FVector4f, FVector3f>(const uniform FVector4f& Quat, const FVector3f& V)
 {
 	// http://people.csail.mit.edu/bkph/articles/Quaternions.pdf
 	// V' = V + 2w(Q x V) + (2Q x (Q x V))
@@ -395,7 +257,8 @@ inline FVector3f VectorQuaternionRotateVector(const uniform FVector4f& Quat, con
 	return Result;
 }

-inline uniform FVector8 VectorQuaternionRotateVector(const uniform FVector8& Quat, const uniform FVector8& VectorW0)
+template<>
+inline uniform FVector8 VectorQuaternionRotateVector<uniform FVector8, uniform FVector8>(const uniform FVector8& Quat, const uniform FVector8& VectorW0)
 {
 	const uniform FVector8 QW = VectorReplicate(Quat, 3);
 	uniform FVector8 T = VectorCross(Quat, VectorW0);
--- a/Engine/Source/Runtime/Core/Public/Math/Soaaos.isph
+++ b/Engine/Source/Runtime/Core/Public/Math/Soaaos.isph
--- a/Engine/Source/Runtime/Core/Public/Math/Transform.isph
+++ b/Engine/Source/Runtime/Core/Public/Math/Transform.isph
@@ -347,7 +347,7 @@ inline FVector TransformPosition(const uniform FTransform &T, const FVector& V)
 	const FVector4 ScaledVec = VectorMultiply(T.Scale3D, InputVectorW0);
 	const FVector4 RotatedVec = VectorQuaternionRotateVector(T.Rotation, ScaledVec);

-	const FVector4 TranslatedVec = VectorAdd(RotatedVec, T.Translation);
+	const FVector4 TranslatedVec = VectorAdd(RotatedVec, (const varying FVector4)T.Translation);

 	return SetVector(TranslatedVec);
 }
@@ -363,7 +363,7 @@ inline FVector3f TransformPosition(const uniform FTransform3f &T, const FVector3
 	const FVector4f ScaledVec = VectorMultiply(T.Scale3D, InputVectorW0);
 	const FVector4f RotatedVec = VectorQuaternionRotateVector(T.Rotation, ScaledVec);

-	const FVector4f TranslatedVec = VectorAdd(RotatedVec, T.Translation);
+	const FVector4f TranslatedVec = VectorAdd(RotatedVec, (const varying FVector4f)SetVector4(T.Translation.V[0], T.Translation.V[1], T.Translation.V[2], 0.f));

 	return SetVector(TranslatedVec);
 }
--- a/Engine/Source/Runtime/Core/Public/Math/Vector.isph
+++ b/Engine/Source/Runtime/Core/Public/Math/Vector.isph
--- a/Engine/Source/Runtime/Core/Public/Math/WideVector.isph
+++ b/Engine/Source/Runtime/Core/Public/Math/WideVector.isph
@@ -86,22 +86,26 @@ unmasked inline uniform WideFVector4 VectorSwizzle(const uniform WideFVector4 &V
 	return Result;
 }

-unmasked inline uniform WideFVector4 VectorReplicate(const uniform WideFVector4 &Vec, const uniform int R)
+template<> // explicit specialization of VectorReplicate for uniform WideFVector4
+inline uniform WideFVector4 VectorReplicate<uniform WideFVector4>(const uniform WideFVector4 &Vec, const uniform int R) // R: element index replicated within each group of four lanes
 {
-#if TARGET_WIDTH == 4
-	const varying int vPerm = { R, R, R, R };
-#elif TARGET_WIDTH == 8
-	const varying int vPerm = { R, R, R, R, R+4, R+4, R+4, R+4 };
-#elif TARGET_WIDTH == 16
-	const varying int vPerm = { R, R, R, R, R+4, R+4, R+4, R+4, R+8, R+8, R+8, R+8, R+12, R+12, R+12, R+12 };
-#endif
+	unmasked // the function-level 'unmasked' qualifier of the previous version is now applied as a block around the whole body
+	{
+	#if TARGET_WIDTH == 4
+		const varying int vPerm = { R, R, R, R }; // every lane reads lane R
+	#elif TARGET_WIDTH == 8
+		const varying int vPerm = { R, R, R, R, R+4, R+4, R+4, R+4 }; // lanes 0-3 read lane R, lanes 4-7 read lane R+4
+	#elif TARGET_WIDTH == 16
+		const varying int vPerm = { R, R, R, R, R+4, R+4, R+4, R+4, R+8, R+8, R+8, R+8, R+12, R+12, R+12, R+12 }; // same pattern for four groups of four lanes
+	#endif // NOTE(review): no #else branch, so vPerm is undeclared for any other TARGET_WIDTH

-	const FReal V = Vec.V[programIndex];
-	const FReal S = shuffle(V, vPerm);
-	uniform WideFVector4 Result;
-	Result.V[programIndex] = S;
+		const FReal V = Vec.V[programIndex]; // each program instance loads the element at its own lane index
+		const FReal S = shuffle(V, vPerm); // each lane takes V from the lane named by vPerm
+		uniform WideFVector4 Result;
+		Result.V[programIndex] = S; // each program instance stores back to its own lane index

-	return Result;
+		return Result;
+	}
 }

 unmasked inline uniform WideFVector4 VectorCompareGE(const uniform WideFVector4 &A, const uniform WideFVector4 &B)