Files
UnrealEngineUWP/Engine/Shaders/Public/LaneVectorization.ush
guillaume abadie d59499b3a3 Concatenates the convolutions of the TSR rejection's blend and clamp to save perf
#rb none
#jira UE-179496

[CL 27386517 by guillaume abadie in ue5-main branch]
2023-08-25 14:41:22 -04:00

1246 lines
33 KiB
Plaintext

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
LaneVectorization.ush: Vectorize arbitrary number of processing per lane.
=============================================================================*/
#pragma once
#include "Platform.ush"
#include "WaveBroadcastIntrinsics.ush"
//------------------------------------------------------- DEFINE
// Register layout template used by TLaneVector2D for its storage.
// Define it before including this file to override the default.
#if !defined(TENSOR_REGISTER_LAYOUT)
	#define TENSOR_REGISTER_LAYOUT FRegisters_AoS_VectorArray
#endif

// When non-zero, TLaneVector2D::TightenRegisters() passes every register row through
// PackRegistersTightly(); when zero it does nothing.
#if !defined(TENSOR_REGISTER_TIGHTENING)
	#define TENSOR_REGISTER_TIGHTENING 0
#endif
//------------------------------------------------------- STANDARD SCALAR OPERATIONS
/**
 * Component-wise logical negation, available as a named function so it can be
 * instantiated for tensors through TVECTOR_FUNCTION_1PARAMS.
 */
bool not(bool x)
{
	const bool bNegated = !x;
	return bNegated;
}

bool2 not(bool2 x)
{
	const bool2 bNegated = !x;
	return bNegated;
}

bool3 not(bool3 x)
{
	const bool3 bNegated = !x;
	return bNegated;
}

bool4 not(bool4 x)
{
	const bool4 bNegated = !x;
	return bNegated;
}
/**
 * Builds a 2-component vector out of two scalars: a goes to .x and b goes to .y.
 * NOTE(review): the name suggests this mirrors a hardware 16-bit pack instruction; confirm against platform overrides.
 */
bool2 v_pack_b32_b16(bool a, bool b)
{
	bool2 Packed;
	Packed.x = a;
	Packed.y = b;
	return Packed;
}

float2 v_pack_b32_b16(float a, float b)
{
	float2 Packed;
	Packed.x = a;
	Packed.y = b;
	return Packed;
}

uint2 v_pack_b32_b16(uint a, uint b)
{
	uint2 Packed;
	Packed.x = a;
	Packed.y = b;
	return Packed;
}

int2 v_pack_b32_b16(int a, int b)
{
	int2 Packed;
	Packed.x = a;
	Packed.y = b;
	return Packed;
}
/**
 * Approximate sign(): scales x by the largest finite value of its type and clamps the product to [-1; 1].
 * Zero stays zero; inputs whose scaled magnitude is still below 1 are returned scaled rather than as +/-1.
 */
float fast_sign(float x)
{
	// 0x7f7fffff is the bit pattern of the largest finite 32-bit float.
	const float LargestFinite = asfloat(0x7f7fffff);
	const float Scaled = x * LargestFinite;
	return clamp(Scaled, float(-1.0), float(1.0));
}

float2 fast_sign(float2 x)
{
	const float LargestFinite = asfloat(0x7f7fffff);
	const float2 Scaled = x * LargestFinite;
	return clamp(Scaled, float(-1.0), float(1.0));
}

#if PLATFORM_SUPPORTS_REAL_TYPES

half fast_sign(half x)
{
	// 0x7bff is the bit pattern of the largest finite 16-bit float.
	const half LargestFinite = asfloat16(uint16_t(0x7bff));
	const half Scaled = x * LargestFinite;
	return clamp(Scaled, half(-1.0), half(1.0));
}

half2 fast_sign(half2 x)
{
	const half LargestFinite = asfloat16(uint16_t(0x7bff));
	const half2 Scaled = x * LargestFinite;
	return clamp(Scaled, half(-1.0), half(1.0));
}

#endif
/** Bitwise AND as a named function, so it can be instantiated for tensors through TVECTOR_FUNCTION_2PARAMS. */
uint bit_and(uint a, uint b)
{
	uint Result = a;
	Result &= b;
	return Result;
}

uint2 bit_and(uint2 a, uint2 b)
{
	uint2 Result = a;
	Result &= b;
	return Result;
}
/** Bitwise OR as a named function, so it can be instantiated for tensors through TVECTOR_FUNCTION_2PARAMS. */
uint bit_or(uint a, uint b)
{
	uint Result = a;
	Result |= b;
	return Result;
}

uint2 bit_or(uint2 a, uint2 b)
{
	uint2 Result = a;
	Result |= b;
	return Result;
}
#if PLATFORM_SUPPORTS_REAL_TYPES

/** 16-bit overloads of bit_and() / bit_or(), only compiled when PLATFORM_SUPPORTS_REAL_TYPES is set. */
uint16_t bit_and(uint16_t a, uint16_t b)
{
	uint16_t Result = a;
	Result &= b;
	return Result;
}

uint16_t2 bit_and(uint16_t2 a, uint16_t2 b)
{
	uint16_t2 Result = a;
	Result &= b;
	return Result;
}

uint16_t bit_or(uint16_t a, uint16_t b)
{
	uint16_t Result = a;
	Result |= b;
	return Result;
}

uint16_t2 bit_or(uint16_t2 a, uint16_t2 b)
{
	uint16_t2 Result = a;
	Result |= b;
	return Result;
}

#endif
/** Left shift of a by b bits as a named function, so it can be instantiated for tensors through TVECTOR_FUNCTION_2PARAMS. */
uint bit_shift_left(uint a, uint b)
{
	uint Result = a;
	Result <<= b;
	return Result;
}

uint2 bit_shift_left(uint2 a, uint2 b)
{
	uint2 Result = a;
	Result <<= b;
	return Result;
}
/** Right shift of a by b bits as a named function, so it can be instantiated for tensors through TVECTOR_FUNCTION_2PARAMS. */
uint bit_shift_right(uint a, uint b)
{
	uint Result = a;
	Result >>= b;
	return Result;
}

uint2 bit_shift_right(uint2 a, uint2 b)
{
	uint2 Result = a;
	Result >>= b;
	return Result;
}
#if PLATFORM_SUPPORTS_REAL_TYPES

/** 16-bit overloads of bit_shift_left() / bit_shift_right(), only compiled when PLATFORM_SUPPORTS_REAL_TYPES is set. */
uint16_t bit_shift_left(uint16_t a, uint16_t b)
{
	uint16_t Result = a;
	Result <<= b;
	return Result;
}

uint16_t2 bit_shift_left(uint16_t2 a, uint16_t2 b)
{
	uint16_t2 Result = a;
	Result <<= b;
	return Result;
}

uint16_t bit_shift_right(uint16_t a, uint16_t b)
{
	uint16_t Result = a;
	Result >>= b;
	return Result;
}

uint16_t2 bit_shift_right(uint16_t2 a, uint16_t2 b)
{
	uint16_t2 Result = a;
	Result >>= b;
	return Result;
}

#endif
//------------------------------------------------------- TIGHT REGISTERS
/**
 * PackRegistersTightly() for bool and 32-bit scalar/vector types: the value is returned unchanged.
 * All sixteen overloads share the same body, so they are generated from one local macro.
 */
#define PACK_REGISTERS_TIGHTLY_IDENTITY(Type) \
	Type PackRegistersTightly(Type v) \
	{ \
		return v; \
	}

PACK_REGISTERS_TIGHTLY_IDENTITY(bool)
PACK_REGISTERS_TIGHTLY_IDENTITY(bool2)
PACK_REGISTERS_TIGHTLY_IDENTITY(bool3)
PACK_REGISTERS_TIGHTLY_IDENTITY(bool4)

PACK_REGISTERS_TIGHTLY_IDENTITY(float)
PACK_REGISTERS_TIGHTLY_IDENTITY(float2)
PACK_REGISTERS_TIGHTLY_IDENTITY(float3)
PACK_REGISTERS_TIGHTLY_IDENTITY(float4)

PACK_REGISTERS_TIGHTLY_IDENTITY(uint)
PACK_REGISTERS_TIGHTLY_IDENTITY(uint2)
PACK_REGISTERS_TIGHTLY_IDENTITY(uint3)
PACK_REGISTERS_TIGHTLY_IDENTITY(uint4)

PACK_REGISTERS_TIGHTLY_IDENTITY(int)
PACK_REGISTERS_TIGHTLY_IDENTITY(int2)
PACK_REGISTERS_TIGHTLY_IDENTITY(int3)
PACK_REGISTERS_TIGHTLY_IDENTITY(int4)

#undef PACK_REGISTERS_TIGHTLY_IDENTITY
#if PLATFORM_SUPPORTS_REAL_TYPES

/**
 * PackRegistersTightly() for 16-bit types. Scalars are returned unchanged; vectors rebuild each
 * (x, y) and (z, w) pair through v_pack_b32_b16(), a trailing odd component is passed through as is.
 */
half PackRegistersTightly(half v)
{
	return v;
}

half2 PackRegistersTightly(half2 v)
{
	const half2 PackedXY = v_pack_b32_b16(v.x, v.y);
	return PackedXY;
}

half3 PackRegistersTightly(half3 v)
{
	const half2 PackedXY = v_pack_b32_b16(v.x, v.y);
	return half3(PackedXY, v.z);
}

half4 PackRegistersTightly(half4 v)
{
	const half2 PackedXY = v_pack_b32_b16(v.x, v.y);
	const half2 PackedZW = v_pack_b32_b16(v.z, v.w);
	return half4(PackedXY, PackedZW);
}

uint16_t PackRegistersTightly(uint16_t v)
{
	return v;
}

uint16_t2 PackRegistersTightly(uint16_t2 v)
{
	const uint16_t2 PackedXY = v_pack_b32_b16(v.x, v.y);
	return PackedXY;
}

uint16_t3 PackRegistersTightly(uint16_t3 v)
{
	const uint16_t2 PackedXY = v_pack_b32_b16(v.x, v.y);
	return uint16_t3(PackedXY, v.z);
}

uint16_t4 PackRegistersTightly(uint16_t4 v)
{
	const uint16_t2 PackedXY = v_pack_b32_b16(v.x, v.y);
	const uint16_t2 PackedZW = v_pack_b32_b16(v.z, v.w);
	return uint16_t4(PackedXY, PackedZW);
}

int16_t PackRegistersTightly(int16_t v)
{
	return v;
}

int16_t2 PackRegistersTightly(int16_t2 v)
{
	const int16_t2 PackedXY = v_pack_b32_b16(v.x, v.y);
	return PackedXY;
}

int16_t3 PackRegistersTightly(int16_t3 v)
{
	const int16_t2 PackedXY = v_pack_b32_b16(v.x, v.y);
	return int16_t3(PackedXY, v.z);
}

int16_t4 PackRegistersTightly(int16_t4 v)
{
	const int16_t2 PackedXY = v_pack_b32_b16(v.x, v.y);
	const int16_t2 PackedZW = v_pack_b32_b16(v.z, v.w);
	return int16_t4(PackedXY, PackedZW);
}

#endif // PLATFORM_SUPPORTS_REAL_TYPES
//------------------------------------------------------- REGISTERS LAYOUTS
/**
 * Register layout holding exactly one vector<>. InElementCount must be 1.
 * Every ElementIndex / RegisterRowIndex argument is ignored: there is only one element and one row.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_S_OneVector
{
	/** Number of components of the element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements: always 1, whatever InElementCount is. */
	static const uint ElementCount = 1;

	/** The single register row is the element itself. */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = 1;

	vector<ScalarType, VectorSize> E;

	// ------------- register rows

	/** Returns the only register row. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = E;
		return Row;
	}

	/** Overwrites the only register row. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		E = RegisterRow;
	}

	// ------------- elements

	/** Returns the only element. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Element = E;
		return Element;
	}

	/** Overwrites the only element. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		E = Element;
	}

	/** Returns the element after passing it through WaveBroadcast() with the given settings. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> LocalElement = GetElement(ElementIndex);
		return WaveBroadcast(BroadcastSettings, LocalElement);
	}

	// ------------- elements' components

	/** Reads component ComponentIndex of the element. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const ScalarType Component = E[ComponentIndex];
		return Component;
	}

	/** Writes component ComponentIndex of the element. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		E[ComponentIndex] = Component;
	}
}; // struct FRegisters_S_OneVector
/**
 * Register layout storing the elements as an array of structures: a plain array with one vector<> per element.
 * Each register row is exactly one element.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_VectorArray
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** One register row per element, each as wide as an element. */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = ElementCount;

	vector<ScalarType, VectorSize> Array[ElementCount];

	// ------------- register rows

	/** Returns register row RegisterRowIndex, i.e. the element with the same index. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = Array[RegisterRowIndex];
		return Row;
	}

	/** Overwrites register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		Array[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Returns element ElementIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Element = Array[ElementIndex];
		return Element;
	}

	/** Overwrites element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		Array[ElementIndex] = Element;
	}

	/** Returns element ElementIndex after passing it through WaveBroadcast() with the given settings. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> LocalElement = GetElement(ElementIndex);
		return WaveBroadcast(BroadcastSettings, LocalElement);
	}

	// ------------- elements' components

	/** Reads component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const ScalarType Component = Array[ElementIndex][ComponentIndex];
		return Component;
	}

	/** Writes component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		Array[ElementIndex][ComponentIndex] = Component;
	}
}; // struct FRegisters_AoS_VectorArray
/**
 * Register layout storing the elements as an array of structures inside a single matrix<>:
 * matrix row i is element i, so each register row is exactly one element.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_Matrix
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** One register row per element, each as wide as an element. */
	static const uint RegisterRowSize = VectorSize;
	static const uint RegisterRowCount = ElementCount;

	matrix<ScalarType, ElementCount, VectorSize> M;

	// ------------- register rows

	/** Returns register row RegisterRowIndex, i.e. the matrix row with the same index. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = M[RegisterRowIndex];
		return Row;
	}

	/** Overwrites register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Returns element ElementIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> Element = M[ElementIndex];
		return Element;
	}

	/** Overwrites element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		M[ElementIndex] = Element;
	}

	/** Returns element ElementIndex after passing it through WaveBroadcast() with the given settings. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		const vector<ScalarType, VectorSize> LocalElement = GetElement(ElementIndex);
		return WaveBroadcast(BroadcastSettings, LocalElement);
	}

	// ------------- elements' components

	/** Reads component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const ScalarType Component = M[ElementIndex][ComponentIndex];
		return Component;
	}

	/** Writes component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		M[ElementIndex][ComponentIndex] = Component;
	}
}; // struct FRegisters_AoS_Matrix
/**
 * Register layout storing the elements as a structure of arrays inside a single matrix<>:
 * matrix row c holds component c of every element, so each register row spans all elements.
 * Original note: requires InElementCount "a pair >= 2" (presumably an even count of at least 2; confirm).
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_SoA_Matrix
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** One register row per component, each as wide as the number of elements. */
	static const uint RegisterRowSize = ElementCount;
	static const uint RegisterRowCount = VectorSize;

	matrix<ScalarType, VectorSize, ElementCount> M;

	// ------------- register rows

	/** Returns register row RegisterRowIndex: that component of every element. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = M[RegisterRowIndex];
		return Row;
	}

	/** Overwrites register row RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Gathers element ElementIndex by reading column ElementIndex of every matrix row. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Gathered;

		UNROLL
		for (uint c = 0; c < VectorSize; c++)
		{
			Gathered[c] = GetElementComponent(ElementIndex, c);
		}

		return Gathered;
	}

	/** Scatters Element into column ElementIndex of every matrix row. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		UNROLL
		for (uint c = 0; c < VectorSize; c++)
		{
			SetElementComponent(ElementIndex, c, Element[c]);
		}
	}

	/** Passes each register row through WaveBroadcast() and gathers element ElementIndex from the results. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Gathered;

		UNROLL
		for (uint c = 0; c < VectorSize; c++)
		{
			// The whole row is broadcast; only the lane of the requested element is kept.
			Gathered[c] = WaveBroadcast(BroadcastSettings, GetRegisterRow(c))[ElementIndex];
		}

		return Gathered;
	}

	// ------------- elements' components

	/** Reads component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const ScalarType Component = M[ComponentIndex][ElementIndex];
		return Component;
	}

	/** Writes component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		M[ComponentIndex][ElementIndex] = Component;
	}
}; // struct FRegisters_SoA_Matrix
/**
 * Register layout storing the elements as an array of structures, flattened element-major
 * (all components of element 0, then all components of element 1, ...) into an array of
 * 2-component register rows. InElementCount * InVectorSize must be even, otherwise the last
 * component has no row to live in.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_AoS_PairArray
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** Register rows are pairs; there are as many as needed to hold every component. */
	static const uint RegisterRowSize = 2;
	static const uint RegisterRowCount = (ElementCount * VectorSize) / 2;

	vector<ScalarType, RegisterRowSize> M[RegisterRowCount];

	// ------------- register rows

	/** Returns pair RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		const vector<ScalarType, RegisterRowSize> Row = M[RegisterRowIndex];
		return Row;
	}

	/** Overwrites pair RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Gathers element ElementIndex one component at a time. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Gathered;

		UNROLL
		for (uint c = 0; c < VectorSize; c++)
		{
			Gathered[c] = GetElementComponent(ElementIndex, c);
		}

		return Gathered;
	}

	/** Scatters Element into the storage of element ElementIndex one component at a time. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		UNROLL
		for (uint c = 0; c < VectorSize; c++)
		{
			SetElementComponent(ElementIndex, c, Element[c]);
		}
	}

	/**
	 * Returns element ElementIndex with its components passed through WaveBroadcast().
	 * 4-byte scalars are broadcast component by component; any other scalar size broadcasts whole
	 * pairs (presumably so two 16-bit values share one broadcast; confirm) and then picks the
	 * element's components out of the broadcast pairs.
	 */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Gathered;

		if (sizeof(ScalarType) == 4)
		{
			UNROLL
			for (uint c = 0; c < VectorSize; c++)
			{
				const uint FlatIndex = c + VectorSize * ElementIndex;
				const uint PairIndex = FlatIndex / 2u;
				const uint LaneInPair = FlatIndex % 2u;
				Gathered[c] = WaveBroadcast(BroadcastSettings, M[PairIndex][LaneInPair]);
			}
		}
		else
		{
			// Flat index of the element's first component, the pair holding it, and its lane in that pair.
			const uint FirstFlatIndex = VectorSize * ElementIndex;
			const uint FirstPairIndex = FirstFlatIndex / 2u;
			const uint OffsetInFirstPair = FirstFlatIndex - FirstPairIndex * 2u;

			// Number of pairs broadcast for one element.
			const uint BroadcastPairCount = (VectorSize + 1u) / 2u;

			vector<ScalarType, RegisterRowSize> BroadcastPairs[(VectorSize + 1u) / 2u];

			UNROLL
			for (uint p = 0; p < BroadcastPairCount; p++)
			{
				BroadcastPairs[p] = WaveBroadcast(BroadcastSettings, M[FirstPairIndex + p]);
			}

			UNROLL
			for (uint c = 0; c < VectorSize; c++)
			{
				// Index of the component relative to the first broadcast pair.
				const uint LocalFlatIndex = c + OffsetInFirstPair;
				Gathered[c] = BroadcastPairs[LocalFlatIndex / 2u][LocalFlatIndex % 2u];
			}
		}

		return Gathered;
	}

	// ------------- elements' components

	/** Reads component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		const uint FlatIndex = ComponentIndex + VectorSize * ElementIndex;
		const uint PairIndex = FlatIndex / 2u;
		const uint LaneInPair = FlatIndex % 2u;
		return M[PairIndex][LaneInPair];
	}

	/** Writes component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		const uint FlatIndex = ComponentIndex + VectorSize * ElementIndex;
		const uint PairIndex = FlatIndex / 2u;
		const uint LaneInPair = FlatIndex % 2u;
		M[PairIndex][LaneInPair] = Component;
	}
}; // struct FRegisters_AoS_PairArray
/**
 * Register layout storing the elements as a structure of arrays, flattened component-major
 * (component 0 of every element, then component 1 of every element, ...) into an array of
 * 2-component register rows. InElementCount * InVectorSize must be even, otherwise the last
 * component has no row to live in.
 *
 * The flat index of (element, component) is ElementIndex + ElementCount * ComponentIndex.
 * The stride between two components is the number of elements; using VectorSize as the stride
 * makes distinct (element, component) pairs collide or index past the array whenever
 * VectorSize != ElementCount.
 */
template<typename ScalarType, uint InVectorSize, uint InElementCount>
struct FRegisters_SoA_PairArray
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements. */
	static const uint ElementCount = InElementCount;

	/** Register rows are pairs; there are as many as needed to hold every component. */
	static const uint RegisterRowSize = 2;
	static const uint RegisterRowCount = (ElementCount * VectorSize) / 2;

	vector<ScalarType, RegisterRowSize> M[RegisterRowCount];

	// ------------- register rows

	/** Returns pair RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, RegisterRowSize> GetRegisterRow(const uint RegisterRowIndex)
	{
		return M[RegisterRowIndex];
	}

	/** Overwrites pair RegisterRowIndex. */
	CALL_SITE_DEBUGLOC
	void SetRegisterRow(const uint RegisterRowIndex, vector<ScalarType, RegisterRowSize> RegisterRow)
	{
		M[RegisterRowIndex] = RegisterRow;
	}

	// ------------- elements

	/** Gathers element ElementIndex one component at a time. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;

		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			Element[ComponentIndex] = GetElementComponent(ElementIndex, ComponentIndex);
		}

		return Element;
	}

	/** Scatters Element into the storage of element ElementIndex one component at a time. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			SetElementComponent(ElementIndex, ComponentIndex, Element[ComponentIndex]);
		}
	}

	/** Passes the pair holding each component through WaveBroadcast() and gathers element ElementIndex from the results. */
	vector<ScalarType, VectorSize> WaveBroadcastElement(const FWaveBroadcastSettings BroadcastSettings, const uint ElementIndex)
	{
		vector<ScalarType, VectorSize> Element;

		UNROLL
		for (uint ComponentIndex = 0; ComponentIndex < VectorSize; ComponentIndex++)
		{
			// Component-major flat index: the stride between components is ElementCount.
			const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
			Element[ComponentIndex] = WaveBroadcast(BroadcastSettings, M[ArrayIndex / 2u])[ArrayIndex % 2u];
		}

		return Element;
	}

	// ------------- elements' components

	/** Reads component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	ScalarType GetElementComponent(const uint ElementIndex, const uint ComponentIndex)
	{
		// Component-major flat index: the stride between components is ElementCount.
		const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
		return M[ArrayIndex / 2u][ArrayIndex % 2u];
	}

	/** Writes component ComponentIndex of element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElementComponent(const uint ElementIndex, const uint ComponentIndex, ScalarType Component)
	{
		// Component-major flat index: the stride between components is ElementCount.
		const uint ArrayIndex = ElementIndex + ElementCount * ComponentIndex;
		M[ArrayIndex / 2u][ArrayIndex % 2u] = Component;
	}
}; // struct FRegisters_SoA_PairArray
//------------------------------------------------------- TENSORS
/**
 * Per-lane tensor of InSimdSizeX * InSimdSizeY elements, each a vector<ScalarType, InVectorSize>.
 * Storage and element-to-register mapping are delegated to the TENSOR_REGISTER_LAYOUT template.
 * Arithmetic and comparison operators work on whole register rows, so they do not depend on
 * how the layout maps elements to rows.
 */
template<typename ScalarType, uint InVectorSize, uint InSimdSizeX, uint InSimdSizeY>
struct TLaneVector2D
{
	/** Number of components of each element. */
	static const uint VectorSize = InVectorSize;

	/** Number of elements, in total and along each of the two dimensions. */
	static const uint SimdSize = InSimdSizeX * InSimdSizeY;
	static const uint SimdSizeX = InSimdSizeX;
	static const uint SimdSizeY = InSimdSizeY;

	/** Size and number of register rows, as reported by the register layout. */
	static const uint RegisterRowSize = TENSOR_REGISTER_LAYOUT<ScalarType, InVectorSize, InSimdSizeX * InSimdSizeY>::RegisterRowSize;
	static const uint RegisterRowCount = TENSOR_REGISTER_LAYOUT<ScalarType, InVectorSize, InSimdSizeX * InSimdSizeY>::RegisterRowCount;

	TENSOR_REGISTER_LAYOUT<ScalarType, InVectorSize, InSimdSizeX * InSimdSizeY> Registers;

	// ------------- getting and setting elements and components

	/** Returns element ElementIndex. */
	CALL_SITE_DEBUGLOC
	vector<ScalarType, VectorSize> GetElement(const uint ElementIndex)
	{
		return Registers.GetElement(ElementIndex);
	}

	/** Overwrites element ElementIndex. */
	CALL_SITE_DEBUGLOC
	void SetElement(const uint ElementIndex, vector<ScalarType, VectorSize> Element)
	{
		Registers.SetElement(ElementIndex, Element);
	}

	/** Overwrites every element with the same value. */
	CALL_SITE_DEBUGLOC
	void SetAllElements(vector<ScalarType, VectorSize> Element)
	{
		UNROLL
		for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++)
		{
			Registers.SetElement(ElementIndex, Element);
		}
	}

	/** Extracts component ComponentIndex of every element into a 1-component tensor. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> GetComponent(const uint ComponentIndex)
	{
		TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++)
		{
			R.SetElement(ElementIndex, Registers.GetElementComponent(ElementIndex, ComponentIndex));
		}

		return R;
	}

	/** Overwrites component ComponentIndex of every element from a 1-component tensor. */
	CALL_SITE_DEBUGLOC
	void SetComponent(uint ComponentIndex, TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> Comp)
	{
		UNROLL
		for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++)
		{
			Registers.SetElementComponent(ElementIndex, ComponentIndex, Comp.GetElement(ElementIndex));
		}
	}

	/** Read-only shorthand for GetComponent(): returns a copy, so it cannot be used to write a component. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> operator [](const uint ComponentIndex)
	{
		return GetComponent(ComponentIndex);
	}

	// ------------- constructors

	/** Casts a scalar into a vector<>: every element of A is converted to a VectorSize-component element. */
	CALL_SITE_DEBUGLOC
	static TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Vectorize(
		TLaneVector2D<ScalarType, 1, SimdSizeX, SimdSizeY> A)
	{
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++)
		{
			R.SetElement(ElementIndex, A.GetElement(ElementIndex));
		}

		return R;
	}

	/** Initializes all elements with a single same vector<>. */
	CALL_SITE_DEBUGLOC
	static TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> Const(
		vector<ScalarType, VectorSize> A)
	{
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++)
		{
			R.SetElement(ElementIndex, A);
		}

		return R;
	}

	/**
	 * Casts vector<A> to vector<B>: converts every element of A to this tensor's scalar type.
	 * The source vector size is deduced from the argument, so a source with VectorSize components
	 * is converted component by component, and a 1-component source (the only form previously
	 * accepted) is still accepted and goes through the same explicit vector cast as before.
	 */
	CALL_SITE_DEBUGLOC
	template<typename SourceScalarType, uint SourceVectorSize>
	static TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> CastFrom(
		TLaneVector2D<SourceScalarType, SourceVectorSize, SimdSizeX, SimdSizeY> A)
	{
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint ElementIndex = 0; ElementIndex < SimdSize; ElementIndex++)
		{
			R.SetElement(ElementIndex, vector<ScalarType, VectorSize>(A.GetElement(ElementIndex)));
		}

		return R;
	}

	/** Forces tight register packing. Does nothing unless TENSOR_REGISTER_TIGHTENING is non-zero. */
	CALL_SITE_DEBUGLOC
	void TightenRegisters()
	{
#if TENSOR_REGISTER_TIGHTENING
		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			Registers.SetRegisterRow(RegisterRowIndex, PackRegistersTightly(Registers.GetRegisterRow(RegisterRowIndex)));
		}
#endif
	}

	// ------------- binary operator +

	/** Element-wise sum with another tensor. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator + (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
	{
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) + B.Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}

	/** Adds the same vector<> to every element. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator + (vector<ScalarType, VectorSize> B)
	{
		// Expand B to a full tensor first so the operation can run on register rows whatever the layout is.
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> BV;
		BV.SetAllElements(B);

		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) + BV.Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}

	// ------------- unary operator -

	/** Element-wise negation. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator - ()
	{
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, -Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}

	// ------------- binary operator -

	/** Element-wise difference with another tensor. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator - (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
	{
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) - B.Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}

	/** Subtracts the same vector<> from every element. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator - (vector<ScalarType, VectorSize> B)
	{
		// Expand B to a full tensor first so the operation can run on register rows whatever the layout is.
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> BV;
		BV.SetAllElements(B);

		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) - BV.Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}

	// ------------- binary operator *

	/** Element-wise (component-wise) product with another tensor. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator * (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
	{
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) * B.Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}

	/** Multiplies every element component-wise by the same vector<>. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> operator * (vector<ScalarType, VectorSize> B)
	{
		// Expand B to a full tensor first so the operation can run on register rows whatever the layout is.
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> BV;
		BV.SetAllElements(B);

		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) * BV.Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}

	// ------------- comparison operators

	/** Component-wise "greater than", returned as a bool tensor of the same dimensions. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> operator > (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
	{
		TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) > B.Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}

	/** Component-wise "less than", returned as a bool tensor of the same dimensions. */
	CALL_SITE_DEBUGLOC
	TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> operator < (TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B)
	{
		TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> R;

		UNROLL
		for (uint RegisterRowIndex = 0; RegisterRowIndex < RegisterRowCount; RegisterRowIndex++)
		{
			R.Registers.SetRegisterRow(RegisterRowIndex, Registers.GetRegisterRow(RegisterRowIndex) < B.Registers.GetRegisterRow(RegisterRowIndex));
		}

		return R;
	}
}; // TLaneVector2D
/**
 * Concatenates A and B element by element into R: the components of A come first
 * (indices 0 .. VectorSizeA - 1), followed by the components of B.
 */
CALL_SITE_DEBUGLOC
template<typename ScalarType, uint VectorSizeA, uint VectorSizeB, uint SimdSizeX, uint SimdSizeY>
void Concatenate(
	TLaneVector2D<ScalarType, VectorSizeA, SimdSizeX, SimdSizeY> A,
	TLaneVector2D<ScalarType, VectorSizeB, SimdSizeX, SimdSizeY> B,
	out TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB, SimdSizeX, SimdSizeY> R)
{
	const uint TotalElementCount = SimdSizeX * SimdSizeY;

	UNROLL
	for (uint e = 0; e < TotalElementCount; e++)
	{
		// Leading components of the output come from A.
		UNROLL
		for (uint a = 0; a < VectorSizeA; a++)
		{
			const ScalarType FromA = A.Registers.GetElementComponent(e, a);
			R.Registers.SetElementComponent(e, a, FromA);
		}

		// Trailing components of the output come from B.
		UNROLL
		for (uint b = 0; b < VectorSizeB; b++)
		{
			const ScalarType FromB = B.Registers.GetElementComponent(e, b);
			R.Registers.SetElementComponent(e, VectorSizeA + b, FromB);
		}
	}
}
/** Value-returning form of Concatenate(): returns the tensor whose elements are A's components followed by B's. */
CALL_SITE_DEBUGLOC
template<typename ScalarType, uint VectorSizeA, uint VectorSizeB, uint SimdSizeX, uint SimdSizeY>
TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB, SimdSizeX, SimdSizeY> Concatenate(
	TLaneVector2D<ScalarType, VectorSizeA, SimdSizeX, SimdSizeY> A,
	TLaneVector2D<ScalarType, VectorSizeB, SimdSizeX, SimdSizeY> B)
{
	TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB, SimdSizeX, SimdSizeY> Joined;
	Concatenate(A, B, /* out */ Joined);
	return Joined;
}
/**
 * Inverse of Concatenate(): splits every element of M into its first VectorSizeA components (written to A)
 * and its remaining VectorSizeB components (written to B).
 */
CALL_SITE_DEBUGLOC
template<typename ScalarType, uint VectorSizeA, uint VectorSizeB, uint SimdSizeX, uint SimdSizeY>
void Deconcatenate(
	TLaneVector2D<ScalarType, VectorSizeA + VectorSizeB, SimdSizeX, SimdSizeY> M,
	out TLaneVector2D<ScalarType, VectorSizeA, SimdSizeX, SimdSizeY> A,
	out TLaneVector2D<ScalarType, VectorSizeB, SimdSizeX, SimdSizeY> B)
{
	const uint TotalElementCount = SimdSizeX * SimdSizeY;

	UNROLL
	for (uint e = 0; e < TotalElementCount; e++)
	{
		// Leading components of M go to A.
		UNROLL
		for (uint a = 0; a < VectorSizeA; a++)
		{
			const ScalarType ForA = M.Registers.GetElementComponent(e, a);
			A.Registers.SetElementComponent(e, a, ForA);
		}

		// Trailing components of M go to B.
		UNROLL
		for (uint b = 0; b < VectorSizeB; b++)
		{
			const ScalarType ForB = M.Registers.GetElementComponent(e, VectorSizeA + b);
			B.Registers.SetElementComponent(e, b, ForB);
		}
	}
}
// ------------- 1 parameter
/**
 * Declares FunctionName(TLaneVector2D) by applying the existing FunctionName() overload to every register row.
 * ReturnScalarType is the scalar type of the returned tensor; pass ScalarType to keep the input's scalar type.
 * The last line of the macro must NOT end with a backslash, otherwise the line that follows (the first
 * instantiation) becomes part of the macro body.
 */
#define TVECTOR_FUNCTION_1PARAMS(ReturnScalarType, FunctionName) \
	CALL_SITE_DEBUGLOC \
	template<typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY> \
	TLaneVector2D<ReturnScalarType, VectorSize, SimdSizeX, SimdSizeY> FunctionName( \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> A) \
	{ \
		TLaneVector2D<ReturnScalarType, VectorSize, SimdSizeX, SimdSizeY> R; \
		UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount; RegisterRowIndex++) { \
			R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex))); \
		} \
		return R; \
	}

TVECTOR_FUNCTION_1PARAMS(ScalarType, rcp);
TVECTOR_FUNCTION_1PARAMS(ScalarType, saturate);
TVECTOR_FUNCTION_1PARAMS(ScalarType, abs);
TVECTOR_FUNCTION_1PARAMS(ScalarType, floor);
TVECTOR_FUNCTION_1PARAMS(ScalarType, ceil);
TVECTOR_FUNCTION_1PARAMS(ScalarType, round);
TVECTOR_FUNCTION_1PARAMS(ScalarType, fast_sign);
TVECTOR_FUNCTION_1PARAMS(bool, not);
TVECTOR_FUNCTION_1PARAMS(uint, asuint);
#if PLATFORM_SUPPORTS_REAL_TYPES
TVECTOR_FUNCTION_1PARAMS(uint16_t, asuint16);
#endif
// ------------- 2 parameters
/**
 * Declares FunctionName(TLaneVector2D, TLaneVector2D) by applying the existing two-argument FunctionName()
 * overload to every pair of matching register rows. Both inputs and the result share the same tensor type.
 * The last line of the macro must NOT end with a backslash, otherwise the line that follows (the first
 * instantiation) becomes part of the macro body.
 */
#define TVECTOR_FUNCTION_2PARAMS(FunctionName) \
	CALL_SITE_DEBUGLOC \
	template<typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY> \
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> FunctionName( \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> A, \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B) \
	{ \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R; \
		UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount; RegisterRowIndex++) { \
			R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex), B.Registers.GetRegisterRow(RegisterRowIndex))); \
		} \
		return R; \
	}

TVECTOR_FUNCTION_2PARAMS(min);
TVECTOR_FUNCTION_2PARAMS(max);
TVECTOR_FUNCTION_2PARAMS(and_internal);
TVECTOR_FUNCTION_2PARAMS(or_internal);
TVECTOR_FUNCTION_2PARAMS(bit_and);
TVECTOR_FUNCTION_2PARAMS(bit_or);
TVECTOR_FUNCTION_2PARAMS(bit_shift_left);
TVECTOR_FUNCTION_2PARAMS(bit_shift_right);
// ------------- 2 parameters but different return dimension
/**
 * Declares FunctionName(TLaneVector2D, TLaneVector2D) for functions whose result has ReturnVectorSize
 * components instead of the inputs' VectorSize. Because the result size differs, the function is applied
 * element by element rather than register row by register row.
 * The last line of the macro must NOT end with a backslash, otherwise the line that follows (the
 * instantiation) becomes part of the macro body.
 */
#define TVECTOR_FUNCTION_2PARAMS_DIFF_RETURN(ReturnVectorSize, FunctionName) \
	CALL_SITE_DEBUGLOC \
	template<typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY> \
	TLaneVector2D<ScalarType, ReturnVectorSize, SimdSizeX, SimdSizeY> FunctionName( \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> A, \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B) \
	{ \
		TLaneVector2D<ScalarType, ReturnVectorSize, SimdSizeX, SimdSizeY> R; \
		UNROLL for (uint ElementIndex = 0; ElementIndex < SimdSizeX * SimdSizeY; ElementIndex++) { \
			R.SetElement(ElementIndex, FunctionName(A.GetElement(ElementIndex), B.GetElement(ElementIndex))); \
		} \
		return R; \
	}

TVECTOR_FUNCTION_2PARAMS_DIFF_RETURN(/* VectorSize = */ 1, dot);
// ------------- 3 parameters
/**
 * Declares FunctionName(TLaneVector2D, TLaneVector2D, TLaneVector2D) by applying the existing three-argument
 * FunctionName() overload to every triple of matching register rows. ScalarTypeA is the scalar type of the
 * first argument only (e.g. bool for a selection mask); pass ScalarType when all three arguments match.
 * The last line of the macro must NOT end with a backslash, otherwise the line that follows (the first
 * instantiation) becomes part of the macro body.
 */
#define TVECTOR_FUNCTION_3PARAMS(FunctionName, ScalarTypeA) \
	CALL_SITE_DEBUGLOC \
	template<typename ScalarType, uint VectorSize, uint SimdSizeX, uint SimdSizeY> \
	TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> FunctionName( \
		TLaneVector2D<ScalarTypeA, VectorSize, SimdSizeX, SimdSizeY> A, \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> B, \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> C) \
	{ \
		TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY> R; \
		UNROLL for (uint RegisterRowIndex = 0; RegisterRowIndex < TLaneVector2D<ScalarType, VectorSize, SimdSizeX, SimdSizeY>::RegisterRowCount; RegisterRowIndex++) { \
			R.Registers.SetRegisterRow(RegisterRowIndex, FunctionName(A.Registers.GetRegisterRow(RegisterRowIndex), B.Registers.GetRegisterRow(RegisterRowIndex), C.Registers.GetRegisterRow(RegisterRowIndex))); \
		} \
		return R; \
	}

TVECTOR_FUNCTION_3PARAMS(select_internal, bool);
TVECTOR_FUNCTION_3PARAMS(clamp, ScalarType);
TVECTOR_FUNCTION_3PARAMS(lerp, ScalarType);
TVECTOR_FUNCTION_3PARAMS(min3, ScalarType);
TVECTOR_FUNCTION_3PARAMS(max3, ScalarType);
#if COMPILER_SUPPORTS_MED3
TVECTOR_FUNCTION_3PARAMS(med3, ScalarType);
#endif
// ------------- AnyElement & AllElement
/** Returns true when at least one component of at least one element of A is true. */
CALL_SITE_DEBUGLOC
template<uint VectorSize, uint SimdSizeX, uint SimdSizeY>
bool AnyComponent(TLaneVector2D<bool, VectorSize, SimdSizeX, SimdSizeY> A)
{
	const uint TotalElementCount = SimdSizeX * SimdSizeY;

	bool bAnySet = false;

	UNROLL
	for (uint e = 0; e < TotalElementCount; e++)
	{
		bAnySet = bAnySet || any(A.GetElement(e));
	}

	return bAnySet;
}