// Copyright Epic Games, Inc. All Rights Reserved.

#include "VectorVM.h"
#include "VectorVMSerialization.h"
#include "Modules/ModuleManager.h"
#include "UObject/Class.h"
#include "UObject/Package.h"
#include "VectorVMPrivate.h"
#include "Stats/Stats.h"
#include "HAL/ConsoleManager.h"
#include "HAL/PlatformFileManager.h"
#include "GenericPlatform/GenericPlatformFile.h"
#include "Async/ParallelFor.h"
#include "Math/UnrealPlatformMathSSE.h"
#include 

IMPLEMENT_MODULE(FDefaultModuleImpl, VectorVM);

DECLARE_STATS_GROUP(TEXT("VectorVM"), STATGROUP_VectorVM, STATCAT_Advanced);
DECLARE_CYCLE_STAT(TEXT("VVM Execution"), STAT_VVMExec, STATGROUP_VectorVM);
DECLARE_CYCLE_STAT(TEXT("VVM Chunk"), STAT_VVMExecChunk, STATGROUP_VectorVM);

DEFINE_LOG_CATEGORY_STATIC(LogVectorVM, All, All);

//#define FREE_TABLE_LOCK_CONTENTION_WARNINGS (!UE_BUILD_SHIPPING)
#define FREE_TABLE_LOCK_CONTENTION_WARNINGS (0)

//I don't expect us to ever be waiting long
#define FREE_TABLE_LOCK_CONTENTION_WARN_THRESHOLD_MS (0.01)

#if UE_BUILD_DEBUG
#define VM_FORCEINLINE FORCENOINLINE
#else
#define VM_FORCEINLINE FORCEINLINE
#endif

#define OP_REGISTER (0)
#define OP0_CONST (1 << 0)
#define OP1_CONST (1 << 1)
#define OP2_CONST (1 << 2)

#define SRCOP_RRR (OP_REGISTER | OP_REGISTER | OP_REGISTER)
#define SRCOP_RRC (OP_REGISTER | OP_REGISTER | OP0_CONST)
#define SRCOP_RCR (OP_REGISTER | OP1_CONST | OP_REGISTER)
#define SRCOP_RCC (OP_REGISTER | OP1_CONST | OP0_CONST)
#define SRCOP_CRR (OP2_CONST | OP_REGISTER | OP_REGISTER)
#define SRCOP_CRC (OP2_CONST | OP_REGISTER | OP0_CONST)
#define SRCOP_CCR (OP2_CONST | OP1_CONST | OP_REGISTER)
#define SRCOP_CCC (OP2_CONST | OP1_CONST | OP0_CONST)

namespace VectorVMConstants
{
	static const VectorRegister4Int VectorStride = MakeVectorRegisterInt(VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS);

	// for generating shuffle masks given input {A, B, C, D}
	constexpr uint32 ShufMaskIgnore = 0xFFFFFFFF;
	constexpr uint32 ShufMaskA = 0x03020100;
	constexpr uint32 ShufMaskB = 0x07060504;
	constexpr uint32 ShufMaskC = 0x0B0A0908;
	constexpr uint32 ShufMaskD = 0x0F0E0D0C;

	static const VectorRegister4Int RegisterShuffleMask[] =
	{
		MakeVectorRegisterInt(ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 0000
		MakeVectorRegisterInt(ShufMaskA, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 0001
		MakeVectorRegisterInt(ShufMaskB, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 0010
		MakeVectorRegisterInt(ShufMaskA, ShufMaskB, ShufMaskIgnore, ShufMaskIgnore), // 0011
		MakeVectorRegisterInt(ShufMaskC, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 0100
		MakeVectorRegisterInt(ShufMaskA, ShufMaskC, ShufMaskIgnore, ShufMaskIgnore), // 0101
		MakeVectorRegisterInt(ShufMaskB, ShufMaskC, ShufMaskIgnore, ShufMaskIgnore), // 0110
		MakeVectorRegisterInt(ShufMaskA, ShufMaskB, ShufMaskC, ShufMaskIgnore), // 0111
		MakeVectorRegisterInt(ShufMaskD, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 1000
		MakeVectorRegisterInt(ShufMaskA, ShufMaskD, ShufMaskIgnore, ShufMaskIgnore), // 1001
		MakeVectorRegisterInt(ShufMaskB, ShufMaskD, ShufMaskIgnore, ShufMaskIgnore), // 1010
		MakeVectorRegisterInt(ShufMaskA, ShufMaskB, ShufMaskD, ShufMaskIgnore), // 1011
		MakeVectorRegisterInt(ShufMaskC, ShufMaskD, ShufMaskIgnore, ShufMaskIgnore), // 1100
		MakeVectorRegisterInt(ShufMaskA, ShufMaskC, ShufMaskD, ShufMaskIgnore), // 1101
		MakeVectorRegisterInt(ShufMaskB, ShufMaskC, ShufMaskD, ShufMaskIgnore), // 1110
		MakeVectorRegisterInt(ShufMaskA, ShufMaskB, ShufMaskC, ShufMaskD), // 1111
	};
};
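// Usage sketch: the table index is a 4-bit "keep" mask (bit 0 = lane A ... bit 3 = lane D). The matching
// shuffle packs the kept lanes to the front of the register, while the 0xFF mask bytes zero the remaining
// lanes on all of the VectorIntShuffle code paths below. For example:
//   VectorRegister4Int Packed = VectorIntShuffle(Input, VectorVMConstants::RegisterShuffleMask[0b0110]); // {A,B,C,D} -> {B,C,0,0}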
#define VM_USE_ACCURATE_VECTOR_FUNCTIONS (1)

namespace VectorVMAccuracy
{
	VM_FORCEINLINE static VectorRegister4Float Reciprocal(const VectorRegister4Float& Src)
	{
#if VM_USE_ACCURATE_VECTOR_FUNCTIONS
		return VectorReciprocalAccurate(Src);
#else
		return VectorReciprocal(Src);
#endif
	}

	VM_FORCEINLINE static VectorRegister4Float ReciprocalSqrt(const VectorRegister4Float& Src)
	{
#if VM_USE_ACCURATE_VECTOR_FUNCTIONS
		return VectorReciprocalSqrtAccurate(Src);
#else
		return VectorReciprocalSqrt(Src);
#endif
	}

	VM_FORCEINLINE static VectorRegister4Float Sqrt(const VectorRegister4Float& Src)
	{
		return Reciprocal(ReciprocalSqrt(Src));
	}
};

// helper function wrapping the SSE3 shuffle operation. Currently implemented for some platforms, the
// rest will just use the FPU version so as to not push the requirements up to SSE3 (currently SSE2)
#if PLATFORM_ENABLE_VECTORINTRINSICS && PLATFORM_ALWAYS_HAS_SSE4_1
	#define VectorIntShuffle( Vec, Mask ) _mm_shuffle_epi8( (Vec), (Mask) )
#elif PLATFORM_ENABLE_VECTORINTRINSICS_NEON
/**
 * Shuffles a VectorInt using a provided shuffle mask
 *
 * @param Vec	Source vector
 * @param Mask	Shuffle vector
 */
FORCEINLINE VectorRegister4Int VectorIntShuffle(const VectorRegister4Int& Vec, const VectorRegister4Int& Mask)
{
	uint8x8x2_t VecSplit = { { vget_low_u8(Vec), vget_high_u8(Vec) } };
	return vcombine_u8(vtbl2_u8(VecSplit, vget_low_u8(Mask)), vtbl2_u8(VecSplit, vget_high_u8(Mask)));
}
#else
// FPU fallback, mirroring pshufb semantics: a negative (high bit set) mask byte zeroes the destination
// byte, otherwise the mask byte (mod 16) selects which source byte to copy.
FORCEINLINE VectorRegister4Int VectorIntShuffle(const VectorRegister4Int& Vec, const VectorRegister4Int& Mask)
{
	VectorRegister4Int Result;
	const int8* VecBytes = reinterpret_cast<const int8*>(&Vec);
	const int8* MaskBytes = reinterpret_cast<const int8*>(&Mask);
	int8* ResultBytes = reinterpret_cast<int8*>(&Result);

	for (int32 i = 0; i < sizeof(VectorRegister4Int); ++i)
	{
		ResultBytes[i] = (MaskBytes[i] < 0) ? 0 : VecBytes[MaskBytes[i] % 16];
	}

	return Result;
}
#endif

#if VECTORVM_SUPPORTS_LEGACY

//Temporarily locking the free table until we can implement a lock free algorithm. UE-65856
FORCEINLINE void FDataSetMeta::LockFreeTable()
{
#if FREE_TABLE_LOCK_CONTENTION_WARNINGS
	uint64 StartCycles = FPlatformTime::Cycles64();
#endif

	FreeTableLock.Lock();

#if FREE_TABLE_LOCK_CONTENTION_WARNINGS
	uint64 EndCycles = FPlatformTime::Cycles64();
	double DurationMs = FPlatformTime::ToMilliseconds64(EndCycles - StartCycles);
	if (DurationMs >= FREE_TABLE_LOCK_CONTENTION_WARN_THRESHOLD_MS)
	{
		UE_LOG(LogVectorVM, Warning, TEXT("VectorVM Stalled in LockFreeTable()! %g ms"), DurationMs);
	}
#endif
}

FORCEINLINE void FDataSetMeta::UnlockFreeTable()
{
	FreeTableLock.Unlock();
}

#endif //VECTORVM_SUPPORTS_LEGACY

static int32 GbParallelVVM = 1;
static FAutoConsoleVariableRef CVarbParallelVVM(
	TEXT("vm.Parallel"),
	GbParallelVVM,
	TEXT("If > 0 vector VM chunk level parallelism will be enabled. \n"),
	ECVF_Default
);

static int32 GParallelVVMChunksPerBatch = 4;
static FAutoConsoleVariableRef CVarParallelVVMChunksPerBatch(
	TEXT("vm.ParallelChunksPerBatch"),
	GParallelVVMChunksPerBatch,
	TEXT("Number of chunks to process per task when running in parallel. \n"),
	ECVF_Default
);

//These are possibly too granular to enable for everyone.
static int32 GbDetailedVMScriptStats = 0;
static FAutoConsoleVariableRef CVarDetailedVMScriptStats(
	TEXT("vm.DetailedVMScriptStats"),
	GbDetailedVMScriptStats,
	TEXT("If > 0 the vector VM will emit stats for its internal module calls. \n"),
	ECVF_Default
);
\n"), ECVF_Default ); static int32 GParallelVVMInstancesPerChunk = 128; static FAutoConsoleVariableRef CVarParallelVVMInstancesPerChunk( TEXT("vm.InstancesPerChunk"), GParallelVVMInstancesPerChunk, TEXT("Number of instances per VM chunk. (default=128) \n"), ECVF_ReadOnly ); static int32 GVVMChunkSizeInBytes = 32768; static FAutoConsoleVariableRef CVarVVMChunkSizeInBytes( TEXT("vm.ChunkSizeInBytes"), GVVMChunkSizeInBytes, TEXT("Number of bytes per VM chunk Ideally <= L1 size. (default=32768) \n"), ECVF_Default ); static int32 GVVMMaxThreadsPerScript = 8; static FAutoConsoleVariableRef CVarVVMMaxThreadsPerScript( TEXT("vm.MaxThreadsPerScript"), GVVMMaxThreadsPerScript, TEXT("Maximum number of threads per script. Set 0 to mean 'as many as necessary'\n"), ECVF_Default ); static int32 GbOptimizeVMByteCode = 1; static FAutoConsoleVariableRef CVarbOptimizeVMByteCode( TEXT("vm.OptimizeVMByteCode"), GbOptimizeVMByteCode, TEXT("If > 0 vector VM code optimization will be enabled at runtime.\n"), ECVF_Default ); static int32 GbFreeUnoptimizedVMByteCode = 1; static FAutoConsoleVariableRef CVarbFreeUnoptimizedVMByteCode( TEXT("vm.FreeUnoptimizedByteCode"), GbFreeUnoptimizedVMByteCode, TEXT("When we have optimized the VM byte code should we free the original unoptimized byte code?"), ECVF_Default ); static int32 GbUseOptimizedVMByteCode = 1; static FAutoConsoleVariableRef CVarbUseOptimizedVMByteCode( TEXT("vm.UseOptimizedVMByteCode"), GbUseOptimizedVMByteCode, TEXT("If > 0 optimized vector VM code will be excuted at runtime.\n"), ECVF_Default ); static int32 GbSafeOptimizedKernels = 1; static FAutoConsoleVariableRef CVarbSafeOptimizedKernels( TEXT("vm.SafeOptimizedKernels"), GbSafeOptimizedKernels, TEXT("If > 0 optimized vector VM byte code will use safe versions of the kernels.\n"), ECVF_Default ); static int32 GbBatchVMInput = 0; static FAutoConsoleVariableRef CVarBatchVMInput( TEXT("vm.BatchVMInput"), GbBatchVMInput, TEXT("If > 0 input elements will be batched.\n"), ECVF_Default ); static int32 GbBatchVMOutput = 0; static FAutoConsoleVariableRef CVarBatchVMOutput( TEXT("vm.BatchVMOutput"), GbBatchVMOutput, TEXT("If > 0 output elements will be batched.\n"), ECVF_Default ); static int32 GbBatchPackVMOutput = 1; static FAutoConsoleVariableRef CVarbBatchPackVMOutput( TEXT("vm.BatchPackedVMOutput"), GbBatchPackVMOutput, TEXT("If > 0 output elements will be packed and batched branch free.\n"), ECVF_Default ); uint8 VectorVM::GetNumOpCodes() { return (uint8)EVectorVMOp::NumOpcodes; } #if WITH_EDITOR //UEnum* g_VectorVMEnumStateObj = nullptr; UEnum* g_VectorVMEnumOperandObj = nullptr; #define VVM_OP_XM(n, ...) 
uint8 VectorVM::GetNumOpCodes()
{
	return (uint8)EVectorVMOp::NumOpcodes;
}

#if WITH_EDITOR
//UEnum* g_VectorVMEnumStateObj = nullptr;
UEnum* g_VectorVMEnumOperandObj = nullptr;

#define VVM_OP_XM(n, ...) #n,
static const char *VVM_OP_NAMES[] {
	VVM_OP_XM_LIST
};
#undef VVM_OP_XM

FString VectorVM::GetOpName(EVectorVMOp Op)
{
	//check(g_VectorVMEnumStateObj);
	//
	//FString OpStr = g_VectorVMEnumStateObj->GetNameByValue((uint8)Op).ToString();
	//int32 LastIdx = 0;
	//OpStr.FindLastChar(TEXT(':'), LastIdx);
	//return OpStr.RightChop(LastIdx + 1);
	int OpIdx = (int)Op;
	if (OpIdx < 0 || OpIdx >= (int)EVectorVMOp::NumOpcodes)
	{
		OpIdx = 0;
	}
	FString OpStr(VVM_OP_NAMES[OpIdx]);
	return OpStr;
}

FString VectorVM::GetOperandLocationName(EVectorVMOperandLocation Location)
{
	check(g_VectorVMEnumOperandObj);

	FString LocStr = g_VectorVMEnumOperandObj->GetNameByValue((uint8)Location).ToString();
	int32 LastIdx = 0;
	LocStr.FindLastChar(TEXT(':'), LastIdx);
	return LocStr.RightChop(LastIdx + 1);
}
#endif

// Maps the three operand locations onto one of the SRCOP_* masks above, e.g. Type0 = Constant with
// Type1/Type2 = Register yields OP0_CONST, i.e. SRCOP_RRC.
uint8 VectorVM::CreateSrcOperandMask(EVectorVMOperandLocation Type0, EVectorVMOperandLocation Type1, EVectorVMOperandLocation Type2)
{
	return	(Type0 == EVectorVMOperandLocation::Constant ? OP0_CONST : OP_REGISTER) |
			(Type1 == EVectorVMOperandLocation::Constant ? OP1_CONST : OP_REGISTER) |
			(Type2 == EVectorVMOperandLocation::Constant ? OP2_CONST : OP_REGISTER);
}

#if VECTORVM_SUPPORTS_LEGACY
namespace VectorKernelNoiseImpl
{
	static void BuildNoiseTable();
}
#endif

void VectorVM::Init()
{
	static bool Inited = false;
	if (Inited == false)
	{
#if WITH_EDITOR
		//g_VectorVMEnumStateObj = StaticEnum<EVectorVMOp>();
		g_VectorVMEnumOperandObj = StaticEnum<EVectorVMOperandLocation>();
#endif

#if VECTORVM_SUPPORTS_LEGACY
		VectorKernelNoiseImpl::BuildNoiseTable();
#endif
		Inited = true;
	}
}

#if VECTORVM_SUPPORTS_LEGACY

//////////////////////////////////////////////////////////////////////////
// VM Code Optimizer Context

typedef void(*FVectorVMExecFunction)(FVectorVMContext&);

struct FVectorVMCodeOptimizerContext
{
	typedef bool(*OptimizeVMFunction)(EVectorVMOp, FVectorVMCodeOptimizerContext&);

	explicit FVectorVMCodeOptimizerContext(FVectorVMContext& InBaseContext, const uint8* ByteCode, TArray<uint8>& InOptimizedCode, TArrayView<uint8> InExternalFunctionRegisterCounts)
		: BaseContext(InBaseContext)
		, OptimizedCode(InOptimizedCode)
		, ExternalFunctionRegisterCounts(InExternalFunctionRegisterCounts)
	{
		BaseContext.PrepareForExec(0, 0, nullptr, nullptr, nullptr, nullptr, TArrayView<FDataSetMeta>(), 0, false);
		BaseContext.PrepareForChunk(ByteCode, 0, 0);

		// write out a jump table offset that we'll fill in when the table is encoded (EncodeJumpTable())
		Write<int32>(0);
	}
	FVectorVMCodeOptimizerContext(const FVectorVMCodeOptimizerContext&) = delete;
	FVectorVMCodeOptimizerContext(const FVectorVMCodeOptimizerContext&&) = delete;

	template<int32 InstancesPerOp>
	int32 GetNumLoops() const { return 0; }

	FORCEINLINE EVectorVMOp PeekOp() { return static_cast<EVectorVMOp>(*BaseContext.Code); }

	FORCEINLINE uint8 DecodeU8() { return BaseContext.DecodeU8(); }
	FORCEINLINE uint16 DecodeU16() { return BaseContext.DecodeU16(); }
	FORCEINLINE uint32 DecodeU32() { return BaseContext.DecodeU32(); }
	FORCEINLINE uint64 DecodeU64() { return BaseContext.DecodeU64(); }

	//-TODO: Support unaligned writes
	template<typename T>
	void Write(const T& v)
	{
		reinterpret_cast<T&>(OptimizedCode[OptimizedCode.AddUninitialized(sizeof(T))]) = v;
	}

	void WriteExecFunction(const FVectorVMExecFunction& Function)
	{
		const int32 JumpTableIndex = JumpTable.AddUnique(Function);
		check(JumpTableIndex <= TNumericLimits<uint8>::Max());
		Write<uint8>(JumpTableIndex);
	}

	struct FOptimizerCodeState
	{
		uint8 const* BaseContextCode;
		int32 OptimizedCodeLength;
	};

	FOptimizerCodeState CreateCodeState()
	{
		FOptimizerCodeState State;
		State.BaseContextCode = BaseContext.Code;
		State.OptimizedCodeLength = OptimizedCode.Num();
		return State;
	}

	void RollbackCodeState(const FOptimizerCodeState& State)
	{
		BaseContext.Code = State.BaseContextCode;
		OptimizedCode.SetNum(State.OptimizedCodeLength, EAllowShrinking::No);
	}

	// Jump table is encoded at the end of the optimized code, with the first int32 in the byte code
	// stream being the start offset
	static const FVectorVMExecFunction* DecodeJumpTable(const uint8*& OptimizedByteCode)
	{
		const uint32 JumpTableOffset = *reinterpret_cast<const uint32*>(OptimizedByteCode);
		const FVectorVMExecFunction* JumpTable = reinterpret_cast<const FVectorVMExecFunction*>(OptimizedByteCode + JumpTableOffset);
		OptimizedByteCode += sizeof(uint32);
		return JumpTable;
	}

	void EncodeJumpTable()
	{
		const int32 JumpTableCount = JumpTable.Num();
		check(JumpTableCount > 0);
		check(JumpTableCount <= TNumericLimits<uint8>::Max());
		if (JumpTableCount <= 0 || JumpTableCount > TNumericLimits<uint8>::Max())
		{
			// if the jump table is too big, then clear out the OptimizedCode and we'll just have to run the unoptimized path
			OptimizedCode.Reset();
			return;
		}

		// write the offset to the jump table into the reserved slot
		const uint32 JumpTableOffset = OptimizedCode.Num();
		*reinterpret_cast<uint32*>(OptimizedCode.GetData()) = JumpTableOffset;

		const int32 JumpTableSize = JumpTableCount * sizeof(FVectorVMExecFunction);
		OptimizedCode.AddUninitialized(JumpTableSize);
		FMemory::Memcpy(OptimizedCode.GetData() + JumpTableOffset, JumpTable.GetData(), JumpTableSize);
	}

	FVectorVMContext& BaseContext;
	TArray<uint8>& OptimizedCode;
	TArray<FVectorVMExecFunction> JumpTable;
	const TArrayView<uint8> ExternalFunctionRegisterCounts;
	const int32 StartInstance = 0;
};
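// Rough layout of the optimized byte code this context produces (inferred from Write/EncodeJumpTable/DecodeJumpTable):
//
//   [int32 JumpTableOffset]                     - reserved slot written in the constructor, patched by EncodeJumpTable()
//   [uint8 jump table index][operand words]...  - one entry per op, emitted by WriteExecFunction() plus the handlers' Optimize()
//   [FVectorVMExecFunction JumpTable[N]]        - appended at JumpTableOffset by EncodeJumpTable()
//
// DecodeJumpTable() reads the offset, returns the table pointer and leaves the read cursor on the first op entry.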
//////////////////////////////////////////////////////////////////////////
// Constant Handlers

struct FConstantHandlerBase
{
	const uint16 ConstantIndex;
	FConstantHandlerBase(FVectorVMContext& Context)
		: ConstantIndex(Context.DecodeU16())
	{}

	FORCEINLINE void Advance() { }

	static void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		Context.Write(Context.DecodeU16());
	}

	static void OptimizeSkip(FVectorVMCodeOptimizerContext& Context)
	{
		Context.DecodeU16();
	}
};

template<typename T>
struct FConstantHandler : public FConstantHandlerBase
{
	const T Constant;
	FConstantHandler(FVectorVMContext& Context)
		: FConstantHandlerBase(Context)
		, Constant(*Context.GetConstant<T>(ConstantIndex))
	{}
	FORCEINLINE const T& Get() const { return Constant; }
	FORCEINLINE const T& GetAndAdvance() { return Constant; }
};

template<>
struct FConstantHandler<VectorRegister4Float> : public FConstantHandlerBase
{
	static VectorRegister4Float LoadConstant(const FVectorVMContext& Context, uint16 ConstantIndex)
	{
		float ConstantValue = *Context.GetConstant<float>(ConstantIndex);
		return MakeVectorRegister(ConstantValue, ConstantValue, ConstantValue, ConstantValue);
	}

	const VectorRegister4Float Constant;

	FConstantHandler(FVectorVMContext& Context)
		: FConstantHandlerBase(Context)
		, Constant(LoadConstant(Context, ConstantIndex))
	{}

	FORCEINLINE const VectorRegister4Float Get() const { return Constant; }
	FORCEINLINE const VectorRegister4Float GetAndAdvance() { return Constant; }
};

template<>
struct FConstantHandler<VectorRegister4Int> : public FConstantHandlerBase
{
	static VectorRegister4Int LoadConstant(const FVectorVMContext& Context, uint16 ConstantIndex)
	{
		int32 ConstantValue = *Context.GetConstant<int32>(ConstantIndex);
		return MakeVectorRegisterInt(ConstantValue, ConstantValue, ConstantValue, ConstantValue);
	}

	const VectorRegister4Int Constant;

	FConstantHandler(FVectorVMContext& Context)
		: FConstantHandlerBase(Context)
		, Constant(LoadConstant(Context, ConstantIndex))
	{}

	FORCEINLINE const VectorRegister4Int Get() const { return Constant; }
	FORCEINLINE const VectorRegister4Int GetAndAdvance() { return Constant; }
};

//////////////////////////////////////////////////////////////////////////
// Register handlers.
// Handle reading of a register, advancing the pointer with each read.

struct FRegisterHandlerBase
{
	const int32 RegisterIndex;

	FORCEINLINE FRegisterHandlerBase(FVectorVMContext& Context)
		: RegisterIndex(Context.DecodeU16())
	{}

	static void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		Context.Write(Context.DecodeU16());
	}
};

template<typename T>
struct FRegisterHandler : public FRegisterHandlerBase
{
private:
	T* RESTRICT Register;
public:
	FORCEINLINE FRegisterHandler(FVectorVMContext& Context)
		: FRegisterHandlerBase(Context)
		, Register((T*)Context.GetTempRegister(RegisterIndex))
	{}

	FORCEINLINE const T Get() { return *Register; }
	FORCEINLINE T* GetDest() { return Register; }
	FORCEINLINE void Advance() { ++Register; }
	FORCEINLINE const T GetAndAdvance() { return *Register++; }
	FORCEINLINE T* GetDestAndAdvance() { return Register++; }
};
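// Note that FConstantHandler and FRegisterHandler expose the same Get()/GetAndAdvance()/Advance()
// interface for reads, so the kernel templates below can bind each source operand to either a register
// or a constant purely from the SRCOP_* mask, with no per-instance branching in the inner loop.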
//////////////////////////////////////////////////////////////////////////

FVectorVMContext::FVectorVMContext()
	: Code(nullptr)
	, ConstantTable(nullptr)
	, ExternalFunctionTable(nullptr)
	, UserPtrTable(nullptr)
	, NumInstances(0)
	, StartInstance(0)
	, TempRegisterSize(0)
	, TempBufferSize(0)
{
	RandStream.GenerateNewSeed();
}

void FVectorVMContext::PrepareForExec(
	int32 InNumTempRegisters,
	int32 InConstantTableCount,
	const uint8* const* InConstantTable,
	const int32* InConstantTableSizes,
	const FVMExternalFunction* const* InExternalFunctionTable,
	void** InUserPtrTable,
	TArrayView<FDataSetMeta> InDataSetMetaTable,
	int32 MaxNumInstances,
	bool bInParallelExecution
)
{
	NumTempRegisters = InNumTempRegisters;
	ConstantTableCount = InConstantTableCount;
	ConstantTableSizes = InConstantTableSizes;
	ConstantTable = InConstantTable;
	ExternalFunctionTable = InExternalFunctionTable;
	UserPtrTable = InUserPtrTable;

	TempRegisterSize = Align(MaxNumInstances * VectorVM::MaxInstanceSizeBytes, PLATFORM_CACHE_LINE_SIZE);
	TempBufferSize = TempRegisterSize * NumTempRegisters;
	TempRegTable.SetNumUninitialized(TempBufferSize, EAllowShrinking::No);

	DataSetMetaTable = InDataSetMetaTable;

	for (auto& TLSTempData : ThreadLocalTempData)
	{
		TLSTempData.Reset();
	}
	ThreadLocalTempData.SetNum(DataSetMetaTable.Num());

	bIsParallelExecution = bInParallelExecution;
}

#if STATS
void FVectorVMContext::SetStatScopes(TArrayView<FStatScopeData> InStatScopes)
{
	StatScopes = InStatScopes;
	StatCounterStack.Reserve(StatScopes.Num());

	ScopeExecCycles.Reset(StatScopes.Num());
	ScopeExecCycles.AddZeroed(StatScopes.Num());
}
#elif ENABLE_STATNAMEDEVENTS
void FVectorVMContext::SetStatNamedEventScopes(TArrayView<FString> InStatNamedEventScopes)
{
	StatNamedEventScopes = InStatNamedEventScopes;
}
#endif

void FVectorVMContext::FinishExec()
{
	//At the end of executing each chunk we can push any thread local temporary data out to the main storage with locks or atomics.
	check(ThreadLocalTempData.Num() == DataSetMetaTable.Num());
	for (int32 DataSetIndex = 0; DataSetIndex < DataSetMetaTable.Num(); ++DataSetIndex)
	{
		FDataSetThreadLocalTempData& RESTRICT Data = ThreadLocalTempData[DataSetIndex];

		if (Data.IDsToFree.Num() > 0)
		{
			TArray<int32>& RESTRICT FreeIDTable = *DataSetMetaTable[DataSetIndex].FreeIDTable;
			int32& RESTRICT NumFreeIDs = *DataSetMetaTable[DataSetIndex].NumFreeIDs;
			check(FreeIDTable.Num() >= NumFreeIDs + Data.IDsToFree.Num());

			//Temporarily locking the free table until we can implement something lock-free
			DataSetMetaTable[DataSetIndex].LockFreeTable();
			for (int32 IDToFree : Data.IDsToFree)
			{
				//UE_LOG(LogVectorVM, Warning, TEXT("AddFreeID: ID:%d | FreeTableIdx:%d."), IDToFree, NumFreeIDs);
				FreeIDTable[NumFreeIDs++] = IDToFree;
			}
			//Unlock the free table.
			DataSetMetaTable[DataSetIndex].UnlockFreeTable();

			Data.IDsToFree.Reset();
		}

		//Also update the max ID seen. This should be the ONLY place in the VM we update this max value.
		if (bIsParallelExecution)
		{
			volatile int32* MaxUsedID = DataSetMetaTable[DataSetIndex].MaxUsedID;
			int32 LocalMaxUsedID;
			do
			{
				LocalMaxUsedID = *MaxUsedID;
				if (LocalMaxUsedID >= Data.MaxID)
				{
					break;
				}
			} while (FPlatformAtomics::InterlockedCompareExchange(MaxUsedID, Data.MaxID, LocalMaxUsedID) != LocalMaxUsedID);

			*MaxUsedID = FMath::Max(*MaxUsedID, Data.MaxID);
		}
		else
		{
			int32* MaxUsedID = DataSetMetaTable[DataSetIndex].MaxUsedID;
			*MaxUsedID = FMath::Max(*MaxUsedID, Data.MaxID);
		}
	}

#if STATS
	check(ScopeExecCycles.Num() == StatScopes.Num());
	for (int i = 0; i < StatScopes.Num(); i++)
	{
		uint64 ExecTime = ScopeExecCycles[i];
		if (ExecTime > 0)
		{
			std::atomic_fetch_add(&StatScopes[i].ExecutionCycleCount, ExecTime);
		}
	}
	StatScopes = TArrayView<FStatScopeData>();
#elif ENABLE_STATNAMEDEVENTS
	StatNamedEventScopes = TArrayView<FString>();
#endif
}
//////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////
// Kernels

template<typename Kernel, typename DstHandler, typename Arg0Handler, int32 NumInstancesPerOp>
struct TUnaryKernelHandler
{
	static void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		Context.WriteExecFunction(Exec);
		Arg0Handler::Optimize(Context);
		DstHandler::Optimize(Context);
	}

	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		Arg0Handler Arg0(Context);
		DstHandler Dst(Context);

		const int32 Loops = Context.GetNumLoops<NumInstancesPerOp>();
		for (int32 i = 0; i < Loops; ++i)
		{
			Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance());
		}
	}
};

template<typename Kernel, typename DstHandler, typename Arg0Handler, typename Arg1Handler, int32 NumInstancesPerOp>
struct TBinaryKernelHandler
{
	static void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		Context.WriteExecFunction(Exec);
		Arg0Handler::Optimize(Context);
		Arg1Handler::Optimize(Context);
		DstHandler::Optimize(Context);
	}

	static void Exec(FVectorVMContext& Context)
	{
		Arg0Handler Arg0(Context);
		Arg1Handler Arg1(Context);
		DstHandler Dst(Context);

		const int32 Loops = Context.GetNumLoops<NumInstancesPerOp>();
		for (int32 i = 0; i < Loops; ++i)
		{
			Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance(), Arg1.GetAndAdvance());
		}
	}
};

template<typename Kernel, typename DstHandler, typename Arg0Handler, typename Arg1Handler, typename Arg2Handler, int32 NumInstancesPerOp>
struct TTrinaryKernelHandler
{
	static void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		Context.WriteExecFunction(Exec);
		Arg0Handler::Optimize(Context);
		Arg1Handler::Optimize(Context);
		Arg2Handler::Optimize(Context);
		DstHandler::Optimize(Context);
	}

	static void Exec(FVectorVMContext& Context)
	{
		Arg0Handler Arg0(Context);
		Arg1Handler Arg1(Context);
		Arg2Handler Arg2(Context);
		DstHandler Dst(Context);

		const int32 Loops = Context.GetNumLoops<NumInstancesPerOp>();
		for (int32 i = 0; i < Loops; ++i)
		{
			Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance(),
Arg1.GetAndAdvance(), Arg2.GetAndAdvance()); } } }; /** Base class of vector kernels with a single operand. */ template struct TUnaryKernel { static void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: TUnaryKernelHandler::Optimize(Context); break; case SRCOP_RRC: TUnaryKernelHandler::Optimize(Context); break; default: check(0); break; }; } static void Exec(FVectorVMContext& Context) { const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: TUnaryKernelHandler::Exec(Context); break; case SRCOP_RRC: TUnaryKernelHandler::Exec(Context); break; default: check(0); break; }; } }; template struct TUnaryScalarKernel : public TUnaryKernel, FConstantHandler, FRegisterHandler, 1> {}; template struct TUnaryVectorKernel : public TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {}; template struct TUnaryScalarIntKernel : public TUnaryKernel, FConstantHandler, FRegisterHandler, 1> {}; template struct TUnaryVectorIntKernel : public TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {}; /** Base class of Vector kernels with 2 operands. */ template struct TBinaryKernel { static void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: TBinaryKernelHandler::Optimize(Context); break; case SRCOP_RRC: TBinaryKernelHandler::Optimize(Context); break; case SRCOP_RCR: TBinaryKernelHandler::Optimize(Context); break; case SRCOP_RCC: TBinaryKernelHandler::Optimize(Context); break; default: check(0); break; }; } static void Exec(FVectorVMContext& Context) { const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: TBinaryKernelHandler::Exec(Context); break; case SRCOP_RRC: TBinaryKernelHandler::Exec(Context); break; case SRCOP_RCR: TBinaryKernelHandler::Exec(Context); break; case SRCOP_RCC: TBinaryKernelHandler::Exec(Context); break; default: check(0); break; }; } }; template struct TBinaryScalarKernel : public TBinaryKernel, FConstantHandler, FRegisterHandler, 1> {}; template struct TBinaryVectorKernel : public TBinaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {}; template struct TBinaryVectorIntKernel : public TBinaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {}; /** Base class of Vector kernels with 3 operands. 
*/ template struct TTrinaryKernel { static void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: TTrinaryKernelHandler::Optimize(Context); break; case SRCOP_RRC: TTrinaryKernelHandler::Optimize(Context); break; case SRCOP_RCR: TTrinaryKernelHandler::Optimize(Context); break; case SRCOP_RCC: TTrinaryKernelHandler::Optimize(Context); break; case SRCOP_CRR: TTrinaryKernelHandler::Optimize(Context); break; case SRCOP_CRC: TTrinaryKernelHandler::Optimize(Context); break; case SRCOP_CCR: TTrinaryKernelHandler::Optimize(Context); break; case SRCOP_CCC: TTrinaryKernelHandler::Optimize(Context); break; default: check(0); break; }; } static void Exec(FVectorVMContext& Context) { const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: TTrinaryKernelHandler::Exec(Context); break; case SRCOP_RRC: TTrinaryKernelHandler::Exec(Context); break; case SRCOP_RCR: TTrinaryKernelHandler::Exec(Context); break; case SRCOP_RCC: TTrinaryKernelHandler::Exec(Context); break; case SRCOP_CRR: TTrinaryKernelHandler::Exec(Context); break; case SRCOP_CRC: TTrinaryKernelHandler::Exec(Context); break; case SRCOP_CCR: TTrinaryKernelHandler::Exec(Context); break; case SRCOP_CCC: TTrinaryKernelHandler::Exec(Context); break; default: check(0); break; }; } }; template struct TTrinaryScalarKernel : public TTrinaryKernel, FConstantHandler, FRegisterHandler, 1> {}; template struct TTrinaryVectorKernel : public TTrinaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {}; template struct TTrinaryVectorIntKernel : public TTrinaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {}; /*------------------------------------------------------------------------------ Implementation of all kernel operations. 
------------------------------------------------------------------------------*/ struct FVectorKernelAdd : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1) { *Dst = VectorAdd(Src0, Src1); } }; struct FVectorKernelSub : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1) { *Dst = VectorSubtract(Src0, Src1); } }; struct FVectorKernelMul : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1) { *Dst = VectorMultiply(Src0, Src1); } }; struct FVectorKernelDiv : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorDivide(Src0, Src1); } }; struct FVectorKernelDivSafe : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { VectorRegister4Float ValidMask = VectorCompareGT(VectorAbs(Src1), GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorDivide(Src0, Src1), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelMad : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1,VectorRegister4Float Src2) { *Dst = VectorMultiplyAdd(Src0, Src1, Src2); } }; struct FVectorKernelLerp : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1,VectorRegister4Float Src2) { const VectorRegister4Float OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Src2); const VectorRegister4Float Tmp = VectorMultiply(Src0, OneMinusAlpha); *Dst = VectorMultiplyAdd(Src1, Src2, Tmp); } }; struct FVectorKernelRcp : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0) { *Dst = VectorVMAccuracy::Reciprocal(Src0); } }; // if the magnitude of the value is too small, then the result will be 0 (not NaN/Inf) struct FVectorKernelRcpSafe : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { VectorRegister4Float ValidMask = VectorCompareGT(VectorAbs(Src0), GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorVMAccuracy::Reciprocal(Src0), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelRsq : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0) { *Dst = VectorVMAccuracy::ReciprocalSqrt(Src0); } }; // if the value is very small or negative, then the result will be 0 (not NaN/Inf/imaginary) struct FVectorKernelRsqSafe : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { VectorRegister4Float ValidMask = VectorCompareGT(Src0, GlobalVectorConstants::SmallNumber); *Dst = 
VectorSelect(ValidMask, VectorVMAccuracy::ReciprocalSqrt(Src0), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelSqrt : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0) { *Dst = VectorVMAccuracy::Sqrt(Src0); } }; struct FVectorKernelSqrtSafe : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { VectorRegister4Float ValidMask = VectorCompareGT(Src0, GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorVMAccuracy::Sqrt(Src0), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelNeg : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0) { *Dst = VectorNegate(Src0); } }; struct FVectorKernelAbs : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0) { *Dst = VectorAbs(Src0); } }; struct FVectorKernelExp : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorExp(Src0); } }; struct FVectorKernelExp2 : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorExp2(Src0); } }; struct FVectorKernelLog : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorLog(Src0); } }; struct FVectorKernelLogSafe : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { VectorRegister4Float ValidMask = VectorCompareGT(Src0, GlobalVectorConstants::FloatZero); *Dst = VectorSelect(ValidMask, VectorLog(Src0), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelLog2 : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorLog2(Src0); } }; struct FVectorKernelStep : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorStep(VectorSubtract(Src1, Src0)); } }; struct FVectorKernelClamp : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1,VectorRegister4Float Src2) { const VectorRegister4Float Tmp = VectorMax(Src0, Src1); *Dst = VectorMin(Tmp, Src2); } }; struct FVectorKernelSin : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorSin(Src0); } }; struct FVectorKernelCos : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorCos(Src0); } }; struct FVectorKernelTan : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = 
VectorTan(Src0); } }; struct FVectorKernelASin : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorASin(Src0); } }; struct FVectorKernelACos : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorACos(Src0); } }; struct FVectorKernelATan : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorATan(Src0); } }; struct FVectorKernelATan2 : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorATan2(Src0, Src1); } }; struct FVectorKernelCeil : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorCeil(Src0); } }; struct FVectorKernelFloor : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorFloor(Src0); } }; struct FVectorKernelRound : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { //TODO: >SSE4 has direct ops for this. VectorRegister4Float Trunc = VectorTruncate(Src0); *Dst = VectorAdd(Trunc, VectorTruncate(VectorMultiply(VectorSubtract(Src0, Trunc), GlobalVectorConstants::FloatAlmostTwo()))); } }; struct FVectorKernelMod : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorMod(Src0, Src1); } }; struct FVectorKernelFrac : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorFractional(Src0); } }; struct FVectorKernelTrunc : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorTruncate(Src0); } }; struct FVectorKernelCompareLT : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorCompareLT(Src0, Src1); } }; struct FVectorKernelCompareLE : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorCompareLE(Src0, Src1); } }; struct FVectorKernelCompareGT : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorCompareGT(Src0, Src1); } }; struct FVectorKernelCompareGE : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorCompareGE(Src0, Src1); } }; struct FVectorKernelCompareEQ : public TBinaryVectorKernel { static void VM_FORCEINLINE 
DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorCompareEQ(Src0, Src1); } }; struct FVectorKernelCompareNEQ : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { *Dst = VectorCompareNE(Src0, Src1); } }; struct FVectorKernelSelect : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Mask, VectorRegister4Float A, VectorRegister4Float B) { *Dst = VectorSelect(Mask, A, B); } }; struct FVectorKernelExecutionIndex { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.WriteExecFunction(Exec); FRegisterHandler::Optimize(Context); } static void VM_FORCEINLINE Exec(FVectorVMContext& Context) { static_assert(VECTOR_WIDTH_FLOATS == 4, "Need to update this when upgrading the VM to support >SSE2"); VectorRegister4Int VectorStride = MakeVectorRegisterInt(VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS); VectorRegister4Int Index = MakeVectorRegisterInt(Context.StartInstance, Context.StartInstance + 1, Context.StartInstance + 2, Context.StartInstance + 3); FRegisterHandler Dest(Context); const int32 Loops = Context.GetNumLoops(); for (int32 i = 0; i < Loops; ++i) { *Dest.GetDestAndAdvance() = Index; Index = VectorIntAdd(Index, VectorStride); } } }; struct FVectorKernelEnterStatScope { static void Optimize(FVectorVMCodeOptimizerContext& Context) { #if STATS Context.WriteExecFunction(Exec); FConstantHandler::Optimize(Context); #elif ENABLE_STATNAMEDEVENTS if ( GbDetailedVMScriptStats ) { Context.WriteExecFunction(Exec); FConstantHandler::Optimize(Context); } else { FConstantHandler::OptimizeSkip(Context); } #else // just skip the op if we don't have stats enabled FConstantHandler::OptimizeSkip(Context); #endif } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { FConstantHandler ScopeIdx(Context); #if STATS if (GbDetailedVMScriptStats && Context.StatScopes.Num()) { int32 CounterIdx = Context.StatCounterStack.AddDefaulted(1); int32 ScopeIndex = ScopeIdx.Get(); Context.StatCounterStack[CounterIdx].CycleCounter.Start(Context.StatScopes[ScopeIndex].StatId); Context.StatCounterStack[CounterIdx].VmCycleCounter = { ScopeIndex, FPlatformTime::Cycles64() }; } #elif ENABLE_STATNAMEDEVENTS if (Context.StatNamedEventScopes.Num()) { FPlatformMisc::BeginNamedEvent(FColor::Red, *Context.StatNamedEventScopes[ScopeIdx.Get()]); } #endif } }; struct FVectorKernelExitStatScope { static void Optimize(FVectorVMCodeOptimizerContext& Context) { #if STATS Context.WriteExecFunction(Exec); #elif ENABLE_STATNAMEDEVENTS if (GbDetailedVMScriptStats) { Context.WriteExecFunction(Exec); } #endif } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { #if STATS if (GbDetailedVMScriptStats && Context.StatScopes.Num()) { FStatStackEntry& StackEntry = Context.StatCounterStack.Last(); StackEntry.CycleCounter.Stop(); Context.ScopeExecCycles[StackEntry.VmCycleCounter.ScopeIndex] += FPlatformTime::Cycles64() - StackEntry.VmCycleCounter.ScopeEnterCycles; Context.StatCounterStack.Pop(EAllowShrinking::No); } #elif ENABLE_STATNAMEDEVENTS if (Context.StatNamedEventScopes.Num()) { FPlatformMisc::EndNamedEvent(); } #endif } }; struct FVectorKernelRandom : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* 
RESTRICT Dst, VectorRegister4Float Src0) { //EEK!. Improve this. Implement GPU style seeded rand instead of this. VectorRegister4Float Result = MakeVectorRegister(Context.RandStream.GetFraction(), Context.RandStream.GetFraction(), Context.RandStream.GetFraction(), Context.RandStream.GetFraction()); *Dst = VectorMultiply(Result, Src0); } }; /* gaussian distribution random number (not working yet) */ struct FVectorKernelRandomGauss : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { VectorRegister4Float Result = MakeVectorRegister(Context.RandStream.GetFraction(), Context.RandStream.GetFraction(), Context.RandStream.GetFraction(), Context.RandStream.GetFraction()); Result = VectorSubtract(Result, GlobalVectorConstants::FloatOneHalf); Result = VectorMultiply(MakeVectorRegister(3.0f, 3.0f, 3.0f, 3.0f), Result); // taylor series gaussian approximation const VectorRegister4Float SPi2 = VectorReciprocal(VectorReciprocalSqrt(MakeVectorRegister(2 * PI, 2 * PI, 2 * PI, 2 * PI))); VectorRegister4Float Gauss = VectorReciprocal(SPi2); VectorRegister4Float Div = VectorMultiply(GlobalVectorConstants::FloatTwo, SPi2); Gauss = VectorSubtract(Gauss, VectorDivide(VectorMultiply(Result, Result), Div)); Div = VectorMultiply(MakeVectorRegister(8.0f, 8.0f, 8.0f, 8.0f), SPi2); Gauss = VectorAdd(Gauss, VectorDivide(VectorPow(MakeVectorRegister(4.0f, 4.0f, 4.0f, 4.0f), Result), Div)); Div = VectorMultiply(MakeVectorRegister(48.0f, 48.0f, 48.0f, 48.0f), SPi2); Gauss = VectorSubtract(Gauss, VectorDivide(VectorPow(MakeVectorRegister(6.0f, 6.0f, 6.0f, 6.0f), Result), Div)); Gauss = VectorDivide(Gauss, MakeVectorRegister(0.4f, 0.4f, 0.4f, 0.4f)); Gauss = VectorMultiply(Gauss, Src0); *Dst = Gauss; } }; struct FVectorKernelMin : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1) { *Dst = VectorMin(Src0, Src1); } }; struct FVectorKernelMax : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1) { *Dst = VectorMax(Src0, Src1); } }; struct FVectorKernelPow : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst,VectorRegister4Float Src0,VectorRegister4Float Src1) { *Dst = VectorPow(Src0, Src1); } }; // if the base is small, then the result will be 0 struct FVectorKernelPowSafe : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0, VectorRegister4Float Src1) { VectorRegister4Float ValidMask = VectorCompareGT(Src0, GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorPow(Src0, Src1), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelSign : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { *Dst = VectorSign(Src0); } }; namespace VectorVMNoise { int32 P[512] = { 151,160,137,91,90,15, 131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23, 190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33, 88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166, 
77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244, 102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196, 135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123, 5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42, 223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9, 129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228, 251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107, 49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254, 138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180, 151,160,137,91,90,15, 131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23, 190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33, 88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166, 77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244, 102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196, 135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123, 5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42, 223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9, 129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228, 251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107, 49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254, 138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180 }; static FORCEINLINE float Lerp(float X, float A, float B) { return A + X * (B - A); } static FORCEINLINE float Fade(float X) { return X * X * X * (X * (X * 6 - 15) + 10); } static FORCEINLINE float Grad(int32 hash, float x, float y, float z) { hash &= 15; float u = (hash < 8) ? x : y; float v = (hash < 4) ? y : ((hash == 12 || hash == 14) ? x : z); return ((hash & 1) == 0 ? u : -u) + ((hash & 2) == 0 ? 
v : -v); } struct FScalarKernelNoise3D_iNoise : TTrinaryScalarKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X, float Y, float Z) { float Xfl = FMath::FloorToFloat(X); float Yfl = FMath::FloorToFloat(Y); float Zfl = FMath::FloorToFloat(Z); int32 Xi = (int32)(Xfl) & 255; int32 Yi = (int32)(Yfl) & 255; int32 Zi = (int32)(Zfl) & 255; X -= Xfl; Y -= Yfl; Z -= Zfl; float Xm1 = X - 1.0f; float Ym1 = Y - 1.0f; float Zm1 = Z - 1.0f; int32 A = P[Xi] + Yi; int32 AA = P[A] + Zi; int32 AB = P[A + 1] + Zi; int32 B = P[Xi + 1] + Yi; int32 BA = P[B] + Zi; int32 BB = P[B + 1] + Zi; float U = Fade(X); float V = Fade(Y); float W = Fade(Z); *Dst = Lerp(W, Lerp(V, Lerp(U, Grad(P[AA], X, Y, Z), Grad(P[BA], Xm1, Y, Z)), Lerp(U, Grad(P[AB], X, Ym1, Z), Grad(P[BB], Xm1, Ym1, Z))), Lerp(V, Lerp(U, Grad(P[AA + 1], X, Y, Zm1), Grad(P[BA + 1], Xm1, Y, Zm1)), Lerp(U, Grad(P[AB + 1], X, Ym1, Zm1), Grad(P[BB + 1], Xm1, Ym1, Zm1)))); } }; struct FScalarKernelNoise2D_iNoise : TBinaryScalarKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X, float Y) { *Dst = 0.0f;//TODO } }; struct FScalarKernelNoise1D_iNoise : TUnaryScalarKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X) { *Dst = 0.0f;//TODO; } }; static void Noise1D(FVectorVMContext& Context) { FScalarKernelNoise1D_iNoise::Exec(Context); } static void Noise2D(FVectorVMContext& Context) { FScalarKernelNoise2D_iNoise::Exec(Context); } static void Noise3D(FVectorVMContext& Context) { //Basic scalar implementation of perlin's improved noise until I can spend some quality time exploring vectorized implementations of Marc O's noise from Random.ush. //http://mrl.nyu.edu/~perlin/noise/ FScalarKernelNoise3D_iNoise::Exec(Context); } static void Optimize_Noise1D(FVectorVMCodeOptimizerContext& Context) { FScalarKernelNoise1D_iNoise::Optimize(Context); } static void Optimize_Noise2D(FVectorVMCodeOptimizerContext& Context) { FScalarKernelNoise2D_iNoise::Optimize(Context); } static void Optimize_Noise3D(FVectorVMCodeOptimizerContext& Context) { FScalarKernelNoise3D_iNoise::Optimize(Context); } }; //Olaf's orginal curl noise. Needs updating for the new scalar VM and possibly calling Curl Noise to avoid confusion with regular noise? //Possibly needs to be a data interface as the VM can't output Vectors? 
struct FVectorKernelNoise : public TUnaryVectorKernel { static VectorRegister4Float RandomTable[17][17][17]; static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* RESTRICT Dst, VectorRegister4Float Src0) { const VectorRegister4Float VecSize = MakeVectorRegister(16.0f, 16.0f, 16.0f, 16.0f); *Dst = GlobalVectorConstants::FloatZero; for (uint32 i = 1; i < 2; i++) { float Di = 0.2f * (1.0f/(1<(&Coords); const int32 Cx = CoordPtr[0]; const int32 Cy = CoordPtr[1]; const int32 Cz = CoordPtr[2]; VectorRegister4Float Frac = VectorFractional(Coords); VectorRegister4Float Alpha = VectorReplicate(Frac, 0); VectorRegister4Float OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Alpha); VectorRegister4Float XV1 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz], OneMinusAlpha)); VectorRegister4Float XV2 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz], OneMinusAlpha)); VectorRegister4Float XV3 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz+1], OneMinusAlpha)); VectorRegister4Float XV4 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz+1], OneMinusAlpha)); Alpha = VectorReplicate(Frac, 1); OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Alpha); VectorRegister4Float YV1 = VectorMultiplyAdd(XV1, Alpha, VectorMultiply(XV2, OneMinusAlpha)); VectorRegister4Float YV2 = VectorMultiplyAdd(XV3, Alpha, VectorMultiply(XV4, OneMinusAlpha)); Alpha = VectorReplicate(Frac, 2); OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Alpha); VectorRegister4Float ZV = VectorMultiplyAdd(YV1, Alpha, VectorMultiply(YV2, OneMinusAlpha)); *Dst = VectorAdd(*Dst, ZV); } } }; namespace VectorKernelNoiseImpl { static void BuildNoiseTable() { // random noise float TempTable[17][17][17]; for (int z = 0; z < 17; z++) { for (int y = 0; y < 17; y++) { for (int x = 0; x < 17; x++) { float f1 = (float)FMath::FRandRange(-1.0f, 1.0f); TempTable[x][y][z] = f1; } } } // pad for (int i = 0; i < 17; i++) { for (int j = 0; j < 17; j++) { TempTable[i][j][16] = TempTable[i][j][0]; TempTable[i][16][j] = TempTable[i][0][j]; TempTable[16][j][i] = TempTable[0][j][i]; } } // compute gradients FVector3f TempTable2[17][17][17]; for (int z = 0; z < 16; z++) { for (int y = 0; y < 16; y++) { for (int x = 0; x < 16; x++) { FVector3f XGrad = FVector3f(1.0f, 0.0f, TempTable[x][y][z] - TempTable[x + 1][y][z]); FVector3f YGrad = FVector3f(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y + 1][z]); FVector3f ZGrad = FVector3f(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y][z + 1]); FVector3f Grad = FVector3f(XGrad.Z, YGrad.Z, ZGrad.Z); TempTable2[x][y][z] = Grad; } } } // pad for (int i = 0; i < 17; i++) { for (int j = 0; j < 17; j++) { TempTable2[i][j][16] = TempTable2[i][j][0]; TempTable2[i][16][j] = TempTable2[i][0][j]; TempTable2[16][j][i] = TempTable2[0][j][i]; } } // compute curl of gradient field for (int z = 0; z < 16; z++) { for (int y = 0; y < 16; y++) { for (int x = 0; x < 16; x++) { FVector3f Dy = TempTable2[x][y][z] - TempTable2[x][y + 1][z]; FVector3f Sy = TempTable2[x][y][z] + TempTable2[x][y + 1][z]; FVector3f Dx = TempTable2[x][y][z] - TempTable2[x + 1][y][z]; FVector3f Sx = TempTable2[x][y][z] + TempTable2[x + 1][y][z]; FVector3f Dz = TempTable2[x][y][z] - TempTable2[x][y][z + 1]; FVector3f Sz = TempTable2[x][y][z] + TempTable2[x][y][z + 1]; FVector3f Dir = FVector3f(Dy.Z - Sz.Y, Dz.X - Sx.Z, Dx.Y 
- Sy.X); FVectorKernelNoise::RandomTable[x][y][z] = MakeVectorRegister(Dir.X, Dir.Y, Dir.Z, 0.f); } } } } } VectorRegister4Float FVectorKernelNoise::RandomTable[17][17][17]; ////////////////////////////////////////////////////////////////////////// //Special Kernels. /** Special kernel for acquiring a new ID. TODO. Can be written as general RWBuffer ops when we support that. */ struct FScalarKernelAcquireID { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.WriteExecFunction(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // IDIndexReg Context.Write(Context.DecodeU16()); // IDTagReg } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const int32 DataSetIndex = Context.DecodeU16(); const TArrayView MetaTable = Context.DataSetMetaTable; TArray&RESTRICT FreeIDTable = *MetaTable[DataSetIndex].FreeIDTable; TArray&RESTRICT SpawnedIDsTable = *MetaTable[DataSetIndex].SpawnedIDsTable; const int32 Tag = MetaTable[DataSetIndex].IDAcquireTag; const int32 IDIndexReg = Context.DecodeU16(); int32*RESTRICT IDIndex = (int32*)(Context.GetTempRegister(IDIndexReg)); const int32 IDTagReg = Context.DecodeU16(); int32*RESTRICT IDTag = (int32*)(Context.GetTempRegister(IDTagReg)); int32& NumFreeIDs = *MetaTable[DataSetIndex].NumFreeIDs; //Temporarily using a lock to ensure thread safety for accessing the FreeIDTable until a lock free solution can be implemented. MetaTable[DataSetIndex].LockFreeTable(); check(FreeIDTable.Num() >= Context.NumInstances); check(NumFreeIDs >= Context.NumInstances); for (int32 i = 0; i < Context.NumInstances; ++i) { int32 FreeIDTableIndex = --NumFreeIDs; //Grab the value from the FreeIDTable. int32 AcquiredID = FreeIDTable[FreeIDTableIndex]; checkSlow(AcquiredID != INDEX_NONE); //UE_LOG(LogVectorVM, Warning, TEXT("AcquireID: ID:%d | FreeTableIdx:%d."), AcquiredID, FreeIDTableIndex); //Mark this entry in the FreeIDTable as invalid. FreeIDTable[FreeIDTableIndex] = INDEX_NONE; *IDIndex = AcquiredID; *IDTag = Tag; ++IDIndex; ++IDTag; SpawnedIDsTable.Add(AcquiredID); } MetaTable[DataSetIndex].UnlockFreeTable(); } }; /** Special kernel for updating a new ID. TODO. Can be written as general RWBuffer ops when we support that. 
*/ struct FScalarKernelUpdateID { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.WriteExecFunction(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // InstanceIDRegisterIndex Context.Write(Context.DecodeU16()); // InstanceIndexRegisterIndex } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const int32 DataSetIndex = Context.DecodeU16(); const int32 InstanceIDRegisterIndex = Context.DecodeU16(); const int32 InstanceIndexRegisterIndex = Context.DecodeU16(); const TArrayView MetaTable = Context.DataSetMetaTable; TArray&RESTRICT IDTable = *MetaTable[DataSetIndex].IDTable; const int32 InstanceOffset = MetaTable[DataSetIndex].InstanceOffset; const int32 AbsoluteStartInstance = InstanceOffset + Context.StartInstance; const int32*RESTRICT IDRegister = (int32*)(Context.GetTempRegister(InstanceIDRegisterIndex)); const int32*RESTRICT IndexRegister = (int32*)(Context.GetTempRegister(InstanceIndexRegisterIndex)); FDataSetThreadLocalTempData& DataSetTempData = Context.ThreadLocalTempData[DataSetIndex]; TArray&RESTRICT IDsToFree = DataSetTempData.IDsToFree; check(IDTable.Num() >= AbsoluteStartInstance + Context.NumInstances); for (int32 i = 0; i < Context.NumInstances; ++i) { int32 InstanceId = IDRegister[i]; int32 Index = IndexRegister[i]; if (Index == INDEX_NONE) { //Add the ID to a thread local list of IDs to free which are actually added to the list safely at the end of this chunk's execution. IDsToFree.Add(InstanceId); //UE_LOG(LogVectorVM, Warning, TEXT("FreeingID: InstanceID:%d."), InstanceId); } else { // Update the actual index for this ID. No thread safety is needed as this ID slot can only ever be written by this instance and so a single thread. // The index passed into this function is the same as that given to the OutputData*() functions (FScalarKernelWriteOutputIndexed kernel). // That value is local to the execution step (update or spawn), so we must offset it by the start instance number for the step, just like // GetOutputRegister() does. IDTable[InstanceId] = Index + InstanceOffset; //Update thread local max ID seen. We push this to the real value at the end of execution. DataSetTempData.MaxID = FMath::Max(DataSetTempData.MaxID, InstanceId); //UE_LOG(LogVectorVM, Warning, TEXT("UpdateID: RealIdx:%d | InstanceID:%d."), RealIdx, InstanceId); } } } }; /** Special kernel for reading from the main input dataset. */ template struct FVectorKernelReadInput { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.WriteExecFunction(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // InputRegisterIdx Context.Write(Context.DecodeU16()); // DestRegisterIdx } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { static const int32 InstancesPerVector = sizeof(VectorRegister4Float ) / sizeof(SourceType); const int32 DataSetIndex = Context.DecodeU16(); const int32 InputRegisterIdx = Context.DecodeU16(); const int32 DestRegisterIdx = Context.DecodeU16(); int32 Loops = Context.GetNumLoops(); VectorRegister4Float* DestReg = (VectorRegister4Float*)(Context.GetTempRegister(DestRegisterIdx)); VectorRegister4Float* InputReg = (VectorRegister4Float*)(Context.GetInputRegister(DataSetIndex, InputRegisterIdx) + Context.GetStartInstance()); //TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others. 
while(Loops > 0) { *DestReg = VectorLoad(InputReg); ++DestReg; ++InputReg; Loops--; } } }; template<> struct FVectorKernelReadInput { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.WriteExecFunction(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // InputRegisterIdx Context.Write(Context.DecodeU16()); // DestRegisterIdx } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const int32 DataSetIndex = Context.DecodeU16(); const int32 InputRegisterIdx = Context.DecodeU16(); const int32 DestRegisterIdx = Context.DecodeU16(); int32 Loops = Context.GetNumLoops<4>(); float* DestReg = (float*)(Context.GetTempRegister(DestRegisterIdx)); uint16* InputReg = (uint16*)(Context.GetInputRegister(DataSetIndex, InputRegisterIdx) + Context.GetStartInstance()); //TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others. while (Loops > 1) { FPlatformMath::WideVectorLoadHalf(DestReg, InputReg); DestReg += 8; InputReg += 8; Loops -= 2; } while (Loops > 0) { FPlatformMath::VectorLoadHalf(DestReg, InputReg); DestReg += 4; InputReg += 4; Loops -= 1; } } }; /** Special kernel for reading from an input dataset; non-advancing (reads same instance everytime). * this kernel splats the X component of the source register to all 4 dest components; it's meant to * use scalar data sets as the source (e.g. events) */ template struct FVectorKernelReadInputNoAdvance { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.WriteExecFunction(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // InputRegisterIdx Context.Write(Context.DecodeU16()); // DestRegisterIdx } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { static const int32 InstancesPerVector = sizeof(VectorRegister4Float) / sizeof(T); const int32 DataSetIndex = Context.DecodeU16(); const int32 InputRegisterIdx = Context.DecodeU16(); const int32 DestRegisterIdx = Context.DecodeU16(); const int32 Loops = Context.GetNumLoops(); VectorRegister4Float* DestReg = (VectorRegister4Float*)(Context.GetTempRegister(DestRegisterIdx)); VectorRegister4Float* InputReg = (VectorRegister4Float*)(Context.GetInputRegister(DataSetIndex, InputRegisterIdx)); //TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others. for (int32 i = 0; i < Loops; ++i) { *DestReg = VectorSwizzle(VectorLoad(InputReg), 0,0,0,0); ++DestReg; } } }; template<> struct FVectorKernelReadInputNoAdvance { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.WriteExecFunction(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // InputRegisterIdx Context.Write(Context.DecodeU16()); // DestRegisterIdx } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const int32 DataSetIndex = Context.DecodeU16(); const int32 InputRegisterIdx = Context.DecodeU16(); const int32 DestRegisterIdx = Context.DecodeU16(); const int32 Loops = Context.GetNumLoops<4>(); VectorRegister4Float* DestReg = (VectorRegister4Float*)(Context.GetTempRegister(DestRegisterIdx)); uint16* InputReg = (uint16*)(Context.GetInputRegister(DataSetIndex, InputRegisterIdx)); //TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others. 
for (int32 i = 0; i < Loops; ++i) { float flt = FPlatformMath::LoadHalf(InputReg); *DestReg = VectorLoadFloat1(&flt); ++DestReg; } } }; //TODO - Should be straight forwards to follow the input with a mix of the outputs direct indexing /** Special kernel for reading an specific location in an input register. */ // template // struct FScalarKernelReadInputIndexed // { // static VM_FORCEINLINE void Exec(FVectorVMContext& Context) // { // int32* IndexReg = (int32*)(Context.RegisterTable[DecodeU16(Context)]); // T* InputReg = (T*)(Context.RegisterTable[DecodeU16(Context)]); // T* DestReg = (T*)(Context.RegisterTable[DecodeU16(Context)]); // // //Has to be scalar as each instance can read from a different location in the input buffer. // for (int32 i = 0; i < Context.NumInstances; ++i) // { // T* ReadPtr = (*InputReg) + (*IndexReg); // *DestReg = (*ReadPtr); // ++IndexReg; // ++DestReg; // } // } // }; /** Special kernel for writing to a specific output register. */ template struct FScalarKernelWriteOutputIndexed { static VM_FORCEINLINE void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: Context.WriteExecFunction(DoKernel>); break; case SRCOP_RRC: Context.WriteExecFunction(DoKernel>); break; default: check(0); break; }; Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // DestIndexRegisterIdx Context.Write(Context.DecodeU16()); // DataHandlerType Context.Write(Context.DecodeU16()); // DestRegisterIdx } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: DoKernel>(Context); break; case SRCOP_RRC: DoKernel>(Context); break; default: check(0); break; }; } template static VM_FORCEINLINE void DoKernel(FVectorVMContext& Context) { const int32 DataSetIndex = Context.DecodeU16(); const int32 DestIndexRegisterIdx = Context.DecodeU16(); int32* RESTRICT DestIndexReg = (int32*)(Context.GetTempRegister(DestIndexRegisterIdx)); DataHandlerType DataHandler(Context); const int32 DestRegisterIdx = Context.DecodeU16(); DestType* RESTRICT DestReg = Context.GetOutputRegister(DataSetIndex, DestRegisterIdx); int NumInstances = Context.GetNumInstances(); for (int32 i = 0; i < NumInstances; ++i) { int32 DestIndex = *DestIndexReg; if (DestIndex != INDEX_NONE) { DestReg[DestIndex] = DataHandler.Get(); } ++DestIndexReg; DataHandler.Advance(); //We don't increment the dest as we index into it directly. 
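			// Example (illustrative values): with DestIndexReg = {0, INDEX_NONE, 1, INDEX_NONE} only instances 0 and 2
			// are written, landing in output slots 0 and 1. The indices come from a preceding acquireindex op, so the
			// surviving instances end up packed contiguously in the output data set.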
		}
	}
};

template<>
struct FScalarKernelWriteOutputIndexed
{
	static VM_FORCEINLINE void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes();
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: Context.WriteExecFunction(DoKernel<FRegisterHandler<float>>); break;
		case SRCOP_RRC: Context.WriteExecFunction(DoKernel<FConstantHandler<float>>); break;
		default: check(0); break;
		};

		Context.Write(Context.DecodeU16()); // DataSetIndex
		Context.Write(Context.DecodeU16()); // DestIndexRegisterIdx
		Context.Write(Context.DecodeU16()); // DataHandlerType
		Context.Write(Context.DecodeU16()); // DestRegisterIdx
	}

	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes();
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: DoKernel<FRegisterHandler<float>>(Context); break;
		case SRCOP_RRC: DoKernel<FConstantHandler<float>>(Context); break;
		default: check(0); break;
		};
	}

	template<typename DataHandlerType>
	static VM_FORCEINLINE void DoKernel(FVectorVMContext& Context)
	{
		const int32 DataSetIndex = Context.DecodeU16();
		const int32 DestIndexRegisterIdx = Context.DecodeU16();
		int32* RESTRICT DestIndexReg = (int32*)(Context.GetTempRegister(DestIndexRegisterIdx));
		DataHandlerType DataHandler(Context);
		const int32 DestRegisterIdx = Context.DecodeU16();
		uint16* RESTRICT DestReg = (uint16*)Context.GetOutputRegister(DataSetIndex, DestRegisterIdx);
		int NumInstances = Context.GetNumInstances();
		for (int32 i = 0; i < NumInstances; ++i)
		{
			int32 DestIndex = *DestIndexReg;
			if (DestIndex != INDEX_NONE)
			{
				FPlatformMath::StoreHalf(&DestReg[DestIndex], DataHandler.Get());
			}
			++DestIndexReg;
			DataHandler.Advance();
			//We don't increment the dest as we index into it directly.
		}
	}
};

struct FDataSetCounterHandler
{
	int32* Counter;
	FDataSetCounterHandler(FVectorVMContext& Context) : Counter(&Context.GetDataSetMeta(Context.DecodeU16()).DataSetAccessIndex) {}

	VM_FORCEINLINE void Advance() { }
	VM_FORCEINLINE int32* Get() { return Counter; }
	VM_FORCEINLINE int32* GetAndAdvance() { return Counter; }
	//VM_FORCEINLINE const int32* GetDest() { return Counter; } Should never be used as a dest. All kernels will read and write to this.

	static void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		Context.Write(Context.DecodeU16());
	}
};

struct FScalarKernelAcquireCounterIndex
{
	template<bool bThreadsafe>
	struct InternalKernel
	{
		static VM_FORCEINLINE void DoKernel(FVectorVMContext& Context, int32* RESTRICT Dst, int32* Index, int32 Valid)
		{
			if (Valid != 0)
			{
				*Dst = bThreadsafe ? FPlatformAtomics::InterlockedIncrement(Index) : ++(*Index);
			}
			else
			{
				*Dst = INDEX_NONE; // Subsequent DoKernel calls above will skip over INDEX_NONE register entries...
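				// Note: the bThreadsafe variant uses FPlatformAtomics::InterlockedIncrement because several chunks can
				// be acquiring indices from the same DataSetAccessIndex at once when the VM executes chunks via
				// ParallelFor; the plain increment is only safe on the single threaded path.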
} } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const uint32 SrcOpType = Context.DecodeSrcOperandTypes(); switch (SrcOpType) { case SRCOP_RRR: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FRegisterHandler, 1>::Exec(Context); break; case SRCOP_RRC: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FConstantHandler, 1>::Exec(Context); break; default: check(0); break; }; } }; template static void ExecOptimized(FVectorVMContext& Context) { if (Context.IsParallelExecution()) { switch (SrcOpType) { case SRCOP_RRR: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FRegisterHandler, 1>::Exec(Context); break; case SRCOP_RRC: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FConstantHandler, 1>::Exec(Context); break; default: check(0); break; } } else { switch (SrcOpType) { case SRCOP_RRR: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FRegisterHandler, 1>::Exec(Context); break; case SRCOP_RRC: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FConstantHandler, 1>::Exec(Context); break; default: check(0); break; } } } static void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpType = Context.BaseContext.DecodeSrcOperandTypes(); switch (SrcOpType) { case SRCOP_RRR: Context.WriteExecFunction(FScalarKernelAcquireCounterIndex::ExecOptimized); break; case SRCOP_RRC: Context.WriteExecFunction(FScalarKernelAcquireCounterIndex::ExecOptimized); break; default: check(0); break; } // Three registers, note we don't call Optimize on the Kernel since that will write the Exec and we are selecting based upon thread safe or not Context.Write(Context.DecodeU16()); Context.Write(Context.DecodeU16()); Context.Write(Context.DecodeU16()); } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { if ( Context.IsParallelExecution() ) { InternalKernel::Exec(Context); } else { InternalKernel::Exec(Context); } } }; //TODO: REWORK TO FUNCITON LIKE THE ABOVE. // /** Special kernel for decrementing a dataset counter. */ // struct FScalarKernelReleaseCounterIndex // { // static VM_FORCEINLINE void Exec(FVectorVMContext& Context) // { // int32* CounterPtr = (int32*)(Context.ConstantTable[DecodeU16(Context)]); // int32* DestReg = (int32*)(Context.RegisterTable[DecodeU16(Context)]); // // for (int32 i = 0; i < Context.NumInstances; ++i) // { // int32 Counter = (*CounterPtr--); // *DestReg = Counter >= 0 ? 
Counter : INDEX_NONE; // // ++DestReg; // } // } // }; ////////////////////////////////////////////////////////////////////////// //external_func_call struct FKernelExternalFunctionCall { static void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 ExternalFuncIdx = Context.DecodeU8(); Context.WriteExecFunction(Exec); Context.Write(ExternalFuncIdx); const int32 NumRegisters = Context.ExternalFunctionRegisterCounts[ExternalFuncIdx]; for ( int32 i=0; i < NumRegisters; ++i ) { Context.Write(Context.DecodeU16()); } } static void Exec(FVectorVMContext& Context) { #if VECTORVM_SUPPORTS_LEGACY const uint32 ExternalFuncIdx = Context.DecodeU8(); const FVMExternalFunction* ExternalFunction = Context.ExternalFunctionTable[ExternalFuncIdx]; check(ExternalFunction); if (ExternalFunction) { #if VECTORVM_SUPPORTS_EXPERIMENTAL FVectorVMExternalFunctionContext DataInterfaceFunctionContext(&Context); #else FVectorVMExternalFunctionContextLegacy DataInterfaceFunctionContext(&Context); #endif ExternalFunction->Execute(DataInterfaceFunctionContext); } #endif } }; ////////////////////////////////////////////////////////////////////////// //Integer operations //addi, struct FVectorIntKernelAdd : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntAdd(Src0, Src1); } }; //subi, struct FVectorIntKernelSubtract : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntSubtract(Src0, Src1); } }; //muli, struct FVectorIntKernelMultiply : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntMultiply(Src0, Src1); } }; //divi, struct FVectorIntKernelDivide : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { int32 TmpA[4]; VectorIntStore(Src0, TmpA); int32 TmpB[4]; VectorIntStore(Src1, TmpB); // No intrinsics exist for integer divide. Since div by zero causes crashes, we must be safe against that. 
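		// SafeIntDivide (below) covers the two cases where integer division is undefined:
		//   x / 0          -> 0
		//   MIN_int32 / -1 -> MAX_int32 (the true result overflows)
		// Everything else is ordinary truncating division.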
		int32 TmpDst[4];
		TmpDst[0] = SafeIntDivide(TmpA[0], TmpB[0]);
		TmpDst[1] = SafeIntDivide(TmpA[1], TmpB[1]);
		TmpDst[2] = SafeIntDivide(TmpA[2], TmpB[2]);
		TmpDst[3] = SafeIntDivide(TmpA[3], TmpB[3]);
		*Dst = MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3]);
	}

private:
	VM_FORCEINLINE static int32 SafeIntDivide(int32 Numerator, int32 Denominator)
	{
		static constexpr int32 MinIntValue = std::numeric_limits<int32>::min();
		static constexpr int32 MaxIntValue = std::numeric_limits<int32>::max();

		if (Denominator == 0)
		{
			return 0;
		}
		else if ((Denominator == -1) && (Numerator == MinIntValue))
		{
			return MaxIntValue;
		}
		return Numerator / Denominator;
	}
};

//clampi,
struct FVectorIntKernelClamp : TTrinaryVectorIntKernel
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1, VectorRegister4Int Src2)
	{
		*Dst = VectorIntMin(VectorIntMax(Src0, Src1), Src2);
	}
};

//mini,
struct FVectorIntKernelMin : TBinaryVectorIntKernel
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1)
	{
		*Dst = VectorIntMin(Src0, Src1);
	}
};

//maxi,
struct FVectorIntKernelMax : TBinaryVectorIntKernel
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1)
	{
		*Dst = VectorIntMax(Src0, Src1);
	}
};

//absi,
struct FVectorIntKernelAbs : TUnaryVectorIntKernel
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0)
	{
		*Dst = VectorIntAbs(Src0);
	}
};

//negi,
struct FVectorIntKernelNegate : TUnaryVectorIntKernel
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0)
	{
		*Dst = VectorIntNegate(Src0);
	}
};

//signi,
struct FVectorIntKernelSign : TUnaryVectorIntKernel
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0)
	{
		*Dst = VectorIntSign(Src0);
	}
};

//randomi,
//No good way to do this with SSE atm so just do it scalar.
struct FScalarIntKernelRandom : public TUnaryScalarIntKernel
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, int32* RESTRICT Dst, int32 Src0)
	{
		//EEK! Improve this. Implement GPU style seeded rand instead of this.
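		// RandStream.GetFraction() returns a float in [0, 1), so for a positive Src0 this yields an integer in [0, Src0).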
*Dst = static_cast(Context.RandStream.GetFraction() * Src0); } }; //cmplti, struct FVectorIntKernelCompareLT : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntCompareLT(Src0, Src1); } }; //cmplei, struct FVectorIntKernelCompareLE : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntCompareLE(Src0, Src1); } }; //cmpgti, struct FVectorIntKernelCompareGT : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntCompareGT(Src0, Src1); } }; //cmpgei, struct FVectorIntKernelCompareGE : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntCompareGE(Src0, Src1); } }; //cmpeqi, struct FVectorIntKernelCompareEQ : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntCompareEQ(Src0, Src1); } }; //cmpneqi, struct FVectorIntKernelCompareNEQ : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntCompareNEQ(Src0, Src1); } }; //bit_and, struct FVectorIntKernelBitAnd : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntAnd(Src0, Src1); } }; //bit_or, struct FVectorIntKernelBitOr : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntOr(Src0, Src1); } }; //bit_xor, struct FVectorIntKernelBitXor : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { *Dst = VectorIntXor(Src0, Src1); } }; //bit_not, struct FVectorIntKernelBitNot : TUnaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0) { *Dst = VectorIntNot(Src0); } }; // bit_lshift struct FVectorIntKernelBitLShift : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { int32 TmpA[4]; VectorIntStore(Src0, TmpA); int32 TmpB[4]; VectorIntStore(Src1, TmpB); int32 TmpDst[4]; TmpDst[0] = (TmpA[0] << TmpB[0]); TmpDst[1] = (TmpA[1] << TmpB[1]); TmpDst[2] = (TmpA[2] << TmpB[2]); TmpDst[3] = (TmpA[3] << TmpB[3]); *Dst = MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3] ); } }; // bit_rshift struct FVectorIntKernelBitRShift : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { int32 TmpA[4]; VectorIntStore(Src0, TmpA); int32 TmpB[4]; VectorIntStore(Src1, TmpB); int32 TmpDst[4]; TmpDst[0] = (TmpA[0] >> TmpB[0]); TmpDst[1] = (TmpA[1] >> TmpB[1]); TmpDst[2] = (TmpA[2] >> TmpB[2]); TmpDst[3] = (TmpA[3] >> TmpB[3]); *Dst = 
MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3]); } }; //"Boolean" ops. Currently handling bools as integers. //logic_and, struct FVectorIntKernelLogicAnd : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { //We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these? *Dst = VectorIntAnd(Src0, Src1); } }; //logic_or, struct FVectorIntKernelLogicOr : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { //We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these? *Dst = VectorIntOr(Src0, Src1); } }; //logic_xor, struct FVectorIntKernelLogicXor : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0, VectorRegister4Int Src1) { //We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these? *Dst = VectorIntXor(Src0, Src1); } }; //logic_not, struct FVectorIntKernelLogicNot : TUnaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0) { //We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these? *Dst = VectorIntNot(Src0); } }; //conversions //f2i, struct FVectorKernelFloatToInt : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Float Src0) { *Dst = VectorFloatToInt(Src0); } }; //i2f, struct FVectorKernelIntToFloat : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* Dst, VectorRegister4Int Src0) { *Dst = VectorIntToFloat(Src0); } }; //f2b, struct FVectorKernelFloatToBool : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* Dst, VectorRegister4Float Src0) { *Dst = VectorCompareGT(Src0, GlobalVectorConstants::FloatZero); } }; //b2f, struct FVectorKernelBoolToFloat : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* Dst, VectorRegister4Float Src0) { *Dst = VectorSelect(Src0, GlobalVectorConstants::FloatOne, GlobalVectorConstants::FloatZero); } }; //i2b, struct FVectorKernelIntToBool : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0) { *Dst = VectorIntCompareGT(Src0, GlobalVectorConstants::IntZero); } }; //b2i, struct FVectorKernelBoolToInt : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Int Src0) { *Dst = VectorIntSelect(Src0, GlobalVectorConstants::IntOne, GlobalVectorConstants::IntZero); } }; //reinterpret bits //fasi, struct FVectorKernelFloatAsInt : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Int* Dst, VectorRegister4Float 
Src0) { *Dst = VectorCastFloatToInt(Src0); } }; //iasf, struct FVectorKernelIntAsFloat : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister4Float* Dst, VectorRegister4Int Src0) { *Dst = VectorCastIntToFloat(Src0); } }; void VectorVM::Exec(FVectorVMExecArgs& Args, FVectorVMSerializeState *SerializeState) { //TRACE_CPUPROFILER_EVENT_SCOPE(VMExec); SCOPE_CYCLE_COUNTER(STAT_VVMExec); #if UE_BUILD_TEST const bool bNumInstancesEvent = GbDetailedVMScriptStats != 0; if (bNumInstancesEvent) { FPlatformMisc::BeginNamedEvent(FColor::Red, *FString::Printf(TEXT("STAT_VVMExec - %d"), Args.NumInstances)); } #endif const int32 MaxInstances = FMath::Min(GParallelVVMInstancesPerChunk, Args.NumInstances); const int32 NumChunks = (Args.NumInstances / GParallelVVMInstancesPerChunk) + 1; const int32 ChunksPerBatch = (GbParallelVVM != 0 && FApp::ShouldUseThreadingForPerformance()) ? GParallelVVMChunksPerBatch : NumChunks; const int32 NumBatches = FMath::DivideAndRoundUp(NumChunks, ChunksPerBatch); const bool bParallel = NumBatches > 1 && Args.bAllowParallel; # ifdef VVM_INCLUDE_SERIALIZATION const bool bUseOptimizedByteCode = false; //serializes the bytecode from the instructions, cannot use jump table # else //VVM_INCLUDE_SERIALIZATION const bool bUseOptimizedByteCode = (Args.OptimizedByteCode != nullptr) && GbUseOptimizedVMByteCode; # endif const FVectorVMExecFunction* OptimizedJumpTable = bUseOptimizedByteCode ? FVectorVMCodeOptimizerContext::DecodeJumpTable(Args.OptimizedByteCode) : nullptr; uint64 StartTime = FPlatformTime::Cycles64(); auto ExecChunkBatch = [&](int32 BatchIdx) { //SCOPE_CYCLE_COUNTER(STAT_VVMExecChunk); FVectorVMContext& Context = FVectorVMContext::Get(); Context.PrepareForExec(Args.NumTempRegisters, Args.ConstantTableCount, Args.ConstantTable, Args.ConstantTableSizes, Args.ExternalFunctionTable, Args.UserPtrTable, Args.DataSetMetaTable, MaxInstances, bParallel); #if STATS Context.SetStatScopes(Args.StatScopes); #elif ENABLE_STATNAMEDEVENTS Context.SetStatNamedEventScopes(Args.StatNamedEventsScopes); #endif // Process one chunk at a time. int32 ChunkIdx = BatchIdx * ChunksPerBatch; const int32 FirstInstance = ChunkIdx * GParallelVVMInstancesPerChunk; const int32 FinalInstance = FMath::Min(Args.NumInstances, FirstInstance + (ChunksPerBatch * GParallelVVMInstancesPerChunk)); int32 InstancesLeft = FinalInstance - FirstInstance; while (InstancesLeft > 0) { int32 NumInstancesThisChunk = FMath::Min(InstancesLeft, (int32)GParallelVVMInstancesPerChunk); int32 StartInstance = GParallelVVMInstancesPerChunk * ChunkIdx; // Execute optimized byte code version if ( bUseOptimizedByteCode ) { // Setup execution context. Context.PrepareForChunk(Args.OptimizedByteCode, NumInstancesThisChunk, StartInstance); while (true) { FVectorVMExecFunction ExecFunction = OptimizedJumpTable[Context.DecodeU8()]; if (ExecFunction == nullptr) { break; } ExecFunction(Context); } } else { // Setup execution context. Context.PrepareForChunk(Args.ByteCode, NumInstancesThisChunk, StartInstance); VVMSer_chunkStart(Context, ChunkIdx, BatchIdx); // Execute VM on all vectors in this chunk. EVectorVMOp Op = EVectorVMOp::done; do { VVMSer_insStart(Context); Op = Context.DecodeOp(); switch (Op) { // Dispatch kernel ops. 
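					// This is the unoptimized interpreter path: each instruction decodes one EVectorVMOp and dispatches
					// through this switch. The optimized path above instead walks the jump table of FVectorVMExecFunction
					// pointers emitted by VectorVM::OptimizeByteCode, avoiding this per-op decode and branch.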
case EVectorVMOp::add: FVectorKernelAdd::Exec(Context); break; case EVectorVMOp::sub: FVectorKernelSub::Exec(Context); break; case EVectorVMOp::mul: FVectorKernelMul::Exec(Context); break; case EVectorVMOp::div: FVectorKernelDivSafe::Exec(Context); break; case EVectorVMOp::mad: FVectorKernelMad::Exec(Context); break; case EVectorVMOp::lerp: FVectorKernelLerp::Exec(Context); break; case EVectorVMOp::rcp: FVectorKernelRcpSafe::Exec(Context); break; case EVectorVMOp::rsq: FVectorKernelRsqSafe::Exec(Context); break; case EVectorVMOp::sqrt: FVectorKernelSqrtSafe::Exec(Context); break; case EVectorVMOp::neg: FVectorKernelNeg::Exec(Context); break; case EVectorVMOp::abs: FVectorKernelAbs::Exec(Context); break; case EVectorVMOp::exp: FVectorKernelExp::Exec(Context); break; case EVectorVMOp::exp2: FVectorKernelExp2::Exec(Context); break; case EVectorVMOp::log: FVectorKernelLogSafe::Exec(Context); break; case EVectorVMOp::log2: FVectorKernelLog2::Exec(Context); break; case EVectorVMOp::sin: FVectorKernelSin::Exec(Context); break; case EVectorVMOp::cos: FVectorKernelCos::Exec(Context); break; case EVectorVMOp::tan: FVectorKernelTan::Exec(Context); break; case EVectorVMOp::asin: FVectorKernelASin::Exec(Context); break; case EVectorVMOp::acos: FVectorKernelACos::Exec(Context); break; case EVectorVMOp::atan: FVectorKernelATan::Exec(Context); break; case EVectorVMOp::atan2: FVectorKernelATan2::Exec(Context); break; case EVectorVMOp::ceil: FVectorKernelCeil::Exec(Context); break; case EVectorVMOp::floor: FVectorKernelFloor::Exec(Context); break; case EVectorVMOp::round: FVectorKernelRound::Exec(Context); break; case EVectorVMOp::fmod: FVectorKernelMod::Exec(Context); break; case EVectorVMOp::frac: FVectorKernelFrac::Exec(Context); break; case EVectorVMOp::trunc: FVectorKernelTrunc::Exec(Context); break; case EVectorVMOp::clamp: FVectorKernelClamp::Exec(Context); break; case EVectorVMOp::min: FVectorKernelMin::Exec(Context); break; case EVectorVMOp::max: FVectorKernelMax::Exec(Context); break; case EVectorVMOp::pow: FVectorKernelPowSafe::Exec(Context); break; case EVectorVMOp::sign: FVectorKernelSign::Exec(Context); break; case EVectorVMOp::step: FVectorKernelStep::Exec(Context); break; case EVectorVMOp::random: FVectorKernelRandom::Exec(Context); break; case EVectorVMOp::noise: VectorVMNoise::Noise1D(Context); break; case EVectorVMOp::noise2D: VectorVMNoise::Noise2D(Context); break; case EVectorVMOp::noise3D: VectorVMNoise::Noise3D(Context); break; case EVectorVMOp::cmplt: FVectorKernelCompareLT::Exec(Context); break; case EVectorVMOp::cmple: FVectorKernelCompareLE::Exec(Context); break; case EVectorVMOp::cmpgt: FVectorKernelCompareGT::Exec(Context); break; case EVectorVMOp::cmpge: FVectorKernelCompareGE::Exec(Context); break; case EVectorVMOp::cmpeq: FVectorKernelCompareEQ::Exec(Context); break; case EVectorVMOp::cmpneq: FVectorKernelCompareNEQ::Exec(Context); break; case EVectorVMOp::select: FVectorKernelSelect::Exec(Context); break; case EVectorVMOp::addi: FVectorIntKernelAdd::Exec(Context); break; case EVectorVMOp::subi: FVectorIntKernelSubtract::Exec(Context); break; case EVectorVMOp::muli: FVectorIntKernelMultiply::Exec(Context); break; case EVectorVMOp::divi: FVectorIntKernelDivide::Exec(Context); break; case EVectorVMOp::clampi: FVectorIntKernelClamp::Exec(Context); break; case EVectorVMOp::mini: FVectorIntKernelMin::Exec(Context); break; case EVectorVMOp::maxi: FVectorIntKernelMax::Exec(Context); break; case EVectorVMOp::absi: FVectorIntKernelAbs::Exec(Context); break; case EVectorVMOp::negi: 
FVectorIntKernelNegate::Exec(Context); break; case EVectorVMOp::signi: FVectorIntKernelSign::Exec(Context); break; case EVectorVMOp::randomi: FScalarIntKernelRandom::Exec(Context); break; case EVectorVMOp::cmplti: FVectorIntKernelCompareLT::Exec(Context); break; case EVectorVMOp::cmplei: FVectorIntKernelCompareLE::Exec(Context); break; case EVectorVMOp::cmpgti: FVectorIntKernelCompareGT::Exec(Context); break; case EVectorVMOp::cmpgei: FVectorIntKernelCompareGE::Exec(Context); break; case EVectorVMOp::cmpeqi: FVectorIntKernelCompareEQ::Exec(Context); break; case EVectorVMOp::cmpneqi: FVectorIntKernelCompareNEQ::Exec(Context); break; case EVectorVMOp::bit_and: FVectorIntKernelBitAnd::Exec(Context); break; case EVectorVMOp::bit_or: FVectorIntKernelBitOr::Exec(Context); break; case EVectorVMOp::bit_xor: FVectorIntKernelBitXor::Exec(Context); break; case EVectorVMOp::bit_not: FVectorIntKernelBitNot::Exec(Context); break; case EVectorVMOp::bit_lshift: FVectorIntKernelBitLShift::Exec(Context); break; case EVectorVMOp::bit_rshift: FVectorIntKernelBitRShift::Exec(Context); break; case EVectorVMOp::logic_and: FVectorIntKernelLogicAnd::Exec(Context); break; case EVectorVMOp::logic_or: FVectorIntKernelLogicOr::Exec(Context); break; case EVectorVMOp::logic_xor: FVectorIntKernelLogicXor::Exec(Context); break; case EVectorVMOp::logic_not: FVectorIntKernelLogicNot::Exec(Context); break; case EVectorVMOp::f2i: FVectorKernelFloatToInt::Exec(Context); break; case EVectorVMOp::i2f: FVectorKernelIntToFloat::Exec(Context); break; case EVectorVMOp::f2b: FVectorKernelFloatToBool::Exec(Context); break; case EVectorVMOp::b2f: FVectorKernelBoolToFloat::Exec(Context); break; case EVectorVMOp::i2b: FVectorKernelIntToBool::Exec(Context); break; case EVectorVMOp::b2i: FVectorKernelBoolToInt::Exec(Context); break; case EVectorVMOp::fasi: FVectorKernelFloatAsInt::Exec(Context); break; case EVectorVMOp::iasf: FVectorKernelIntAsFloat::Exec(Context); break; case EVectorVMOp::outputdata_half: FScalarKernelWriteOutputIndexed::Exec(Context); break; case EVectorVMOp::inputdata_half: FVectorKernelReadInput::Exec(Context); break; case EVectorVMOp::outputdata_int32: FScalarKernelWriteOutputIndexed::Exec(Context); break; case EVectorVMOp::inputdata_int32: FVectorKernelReadInput::Exec(Context); break; case EVectorVMOp::outputdata_float: FScalarKernelWriteOutputIndexed::Exec(Context); break; case EVectorVMOp::inputdata_float: FVectorKernelReadInput::Exec(Context); break; case EVectorVMOp::inputdata_noadvance_int32: FVectorKernelReadInputNoAdvance::Exec(Context); break; case EVectorVMOp::inputdata_noadvance_float: FVectorKernelReadInputNoAdvance::Exec(Context); break; case EVectorVMOp::inputdata_noadvance_half: FVectorKernelReadInputNoAdvance::Exec(Context); break; case EVectorVMOp::acquireindex: FScalarKernelAcquireCounterIndex::Exec(Context); break; case EVectorVMOp::external_func_call: FKernelExternalFunctionCall::Exec(Context); break; case EVectorVMOp::exec_index: FVectorKernelExecutionIndex::Exec(Context); break; case EVectorVMOp::enter_stat_scope: FVectorKernelEnterStatScope::Exec(Context); break; case EVectorVMOp::exit_stat_scope: FVectorKernelExitStatScope::Exec(Context); break; //Special case ops to handle unique IDs but this can be written as generalized buffer operations. TODO! case EVectorVMOp::update_id: FScalarKernelUpdateID::Exec(Context); break; case EVectorVMOp::acquire_id: FScalarKernelAcquireID::Exec(Context); break; // Execution always terminates with a "done" opcode. 
case EVectorVMOp::done: break; // Opcode not recognized / implemented. default: UE_LOG(LogVectorVM, Fatal, TEXT("Unknown op code 0x%02x"), (uint32)Op); return;//BAIL } VVMSer_insEnd(Context, (int)(VVMSerCtxStartInsCode - VVMSerStartCtxCode), (int)(Context.Code - VVMSerCtxStartInsCode)); } while (Op != EVectorVMOp::done); VVMSer_chunkEnd(SerializeState) } InstancesLeft -= GParallelVVMInstancesPerChunk; ++ChunkIdx; } Context.FinishExec(); }; if ( NumBatches > 1 ) { ParallelFor(NumBatches, ExecChunkBatch, GbParallelVVM == 0 || !bParallel); } else { ExecChunkBatch(0); } #if UE_BUILD_TEST if (bNumInstancesEvent) { FPlatformMisc::EndNamedEvent(); } #endif #if VECTORVM_SUPPORTS_SERIALIZATION uint64 EndTime = FPlatformTime::Cycles64(); if (SerializeState) { SerializeState->ExecDt = EndTime - StartTime; //NOTE: doesn't work if ParallelFor splits the work into multiple threads } #endif //VVM_INCLUDE_SERIALIZATION } void ExecBatchedOutput(FVectorVMContext& Context) { while (true) { FVectorVMExecFunction ExecFunction = reinterpret_cast(Context.DecodePtr()); if (ExecFunction == nullptr) { break; } ExecFunction(Context); } } void ConditionalAddOutputWrapper(FVectorVMCodeOptimizerContext& Context, bool& OuterAdded) { if (!OuterAdded) { Context.WriteExecFunction(ExecBatchedOutput); OuterAdded = true; } } EVectorVMOp BatchedOutputOptimization(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context) { if (!GbBatchVMOutput) { return Op; } bool OuterAdded = false; bool HasValidOp = true; while (HasValidOp) { switch (Op) { case EVectorVMOp::outputdata_half: ConditionalAddOutputWrapper(Context, OuterAdded); FScalarKernelWriteOutputIndexed::Optimize(Context); break; case EVectorVMOp::outputdata_int32: ConditionalAddOutputWrapper(Context, OuterAdded); FScalarKernelWriteOutputIndexed::Optimize(Context); break; case EVectorVMOp::outputdata_float: ConditionalAddOutputWrapper(Context, OuterAdded); FScalarKernelWriteOutputIndexed::Optimize(Context); break; default: HasValidOp = false; break; } if (HasValidOp) { Op = Context.BaseContext.DecodeOp(); } } if (OuterAdded) { Context.WriteExecFunction(nullptr); } return Op; } // Optimization managed by GbBatchPackVMOutput via PackedOutputOptimization() // Looks for the common pattern of an acquireindex op followed by a number of associated outputdata ops. The // stock operation is to write an index into a temporary register, and then have the different outputs streams // write into the indexed location. This optimization does a number of things: // -first we check if 'validity' is uniform or not, if it is we can have a fast path of both figuring out how many // indices we need, as well as how to write the output (if we find that they are all invalid, then we don't need to do anything!) // -if we need to evaluate the validity of each element we quickly count up the number (with vector intrinsics) and // grab a block of the indices (rather than one at a time) // -rather than storing the indices to use, we store a int8 mask which indicates a valid flag for each of the next 4 samples // -outputs are then written to depending on their source and their frequency: // -uniform sources will be splatted to all valid entries // -variable sources will be packed into the available slots struct FBatchedWriteIndexedOutput { // acquires a batch of indices from the provided CounterHandler. If we're running in parallel, then we'll need to use // atomics to guarantee our place in the list of indices. 
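	// Illustrative walk-through (assumed numbers, not from any particular script): with NumInstances == 8 and 5 of the
	// instances testing valid, HandleRegisterValidIndices below builds two 4-bit masks, counts 5 set bits and reserves 5
	// consecutive output slots in one call here. Under parallel execution the reservation uses InterlockedAdd so two
	// chunks cannot claim overlapping ranges; ValidInstanceIndexStart is StartIndex + 1 because the data set counter
	// starts out at INDEX_NONE (-1).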
template static VM_FORCEINLINE void AcquireCounterIndex(FVectorVMContext& Context, FDataSetCounterHandler& CounterHandler, int32 AcquireCount) { if (AcquireCount) { int32* CounterHandlerIndex = CounterHandler.Get(); int32 StartIndex = INDEX_NONE; if (bParallel) { StartIndex = FPlatformAtomics::InterlockedAdd(CounterHandlerIndex, AcquireCount); } else { StartIndex = *CounterHandlerIndex; *CounterHandlerIndex = StartIndex + AcquireCount; } // increment StartIndex, since CounterHandlerIndex starts at INDEX_NONE Context.ValidInstanceIndexStart = StartIndex + 1; } Context.ValidInstanceCount = AcquireCount; Context.ValidInstanceUniform = !AcquireCount || (Context.NumInstances == AcquireCount); } // evaluates a register to evaluate which instances are valid or not; will read 4 entries at a time and generate a // a mask for which entries are valid as well as an overall count template static void HandleRegisterValidIndices(FVectorVMContext& Context) { FDataSetCounterHandler CounterHandler(Context); FRegisterHandler ValidReader(Context); FRegisterHandler Dst(Context); int8* DestAddr = Dst.GetDest(); // we can process VECTOR_WIDTH_FLOATS entries at a time, generating a int8 mask for each set of 4 indicating // which are valid const int32 LoopCount = FMath::DivideAndRoundUp(Context.NumInstances, VECTOR_WIDTH_FLOATS); int32 Remainder = Context.NumInstances; int32 ValidCount = 0; for (int32 LoopIt = 0; LoopIt < LoopCount; ++LoopIt) { // input register needs to be padded to allow for 16 byte reads; but mask out the ones beyond NumInstances uint8 ValidMask = static_cast(VectorMaskBits(ValidReader.GetAndAdvance())); ValidMask &= ~(0xFF << FMath::Min(VECTOR_WIDTH_FLOATS, Remainder)); ValidCount += FMath::CountBits(ValidMask); DestAddr[LoopIt] = ValidMask; Remainder -= VECTOR_WIDTH_FLOATS; } // grab our batch of indices AcquireCounterIndex(Context, CounterHandler, ValidCount); } // evaluates the uniform check and grab the appropriate number of indices template static VM_FORCEINLINE void HandleUniformValidIndices(FVectorVMContext& Context) { FDataSetCounterHandler CounterHandler(Context); ValidReaderType ValidReader(Context); if (ValidReader.Get()) { AcquireCounterIndex(Context, CounterHandler, Context.NumInstances); } } template static VM_FORCEINLINE void IndexExecOptimized(FVectorVMContext& Context) { if (Context.IsParallelExecution()) { switch (SrcOpType) { case SRCOP_RRR: HandleRegisterValidIndices(Context); break; case SRCOP_RRC: HandleUniformValidIndices, true>(Context); break; default: check(0); break; } } else { switch (SrcOpType) { case SRCOP_RRR: HandleRegisterValidIndices(Context); break; case SRCOP_RRC: HandleUniformValidIndices, false>(Context); break; default: check(0); break; } } } void OptimizeAcquireIndex(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpType = Context.BaseContext.DecodeSrcOperandTypes(); AcquireIndexConstant = !!(SrcOpType & OP0_CONST); switch (SrcOpType) { case SRCOP_RRR: Context.WriteExecFunction(IndexExecOptimized); break; case SRCOP_RRC: Context.WriteExecFunction(IndexExecOptimized); break; default: check(0); break; } DataSetCounterIndex = Context.DecodeU16(); ValidTestRegisterIndex = Context.DecodeU16(); WorkingRegisterIndex = Context.DecodeU16(); Context.Write(DataSetCounterIndex); Context.Write(ValidTestRegisterIndex); // we only need the working register if we've got non-uniform data if (SrcOpType == SRCOP_RRR) { Context.Write(WorkingRegisterIndex); } } bool IsOutput(EVectorVMOp Op) const { return (Op == EVectorVMOp::outputdata_int32 || Op == 
EVectorVMOp::outputdata_float || Op == EVectorVMOp::outputdata_half); } bool IsValidEnd(EVectorVMOp Op) const { return (Op == EVectorVMOp::done || Op == EVectorVMOp::acquireindex); } static bool SkipIfEmpty(FVectorVMContext& Context) { if (!Context.ValidInstanceCount) { Context.SkipCode(2 * sizeof(uint16)); // don't need DataSetIndex or DestIndexRegisterIdx const uint16 AccumulatedOpCount = Context.DecodeU16(); Context.SkipCode(AccumulatedOpCount * 2 * sizeof(uint16)); return true; } return false; } VM_FORCEINLINE static void SplatElement(int32 OutputCount, int32 ElementValue, int32* RESTRICT OutputElements) { const int32 OutputCountWide = AlignDown(OutputCount, VECTOR_WIDTH_FLOATS); if (OutputCountWide) { const VectorRegister4Int SplatValue = MakeVectorRegisterInt(ElementValue, ElementValue, ElementValue, ElementValue); for (int32 i = 0; i < OutputCountWide; i += VECTOR_WIDTH_FLOATS) { VectorIntStore(SplatValue, OutputElements + i); } } for (int32 i = OutputCountWide; i < OutputCount; ++i) { OutputElements[i] = ElementValue; } } VM_FORCEINLINE static void SplatElement(int32 OutputCount, float ElementValue, float* RESTRICT OutputElements) { const int32 OutputCountWide = AlignDown(OutputCount, VECTOR_WIDTH_FLOATS); if (OutputCountWide) { const VectorRegister4Float SplatValue = MakeVectorRegister(ElementValue, ElementValue, ElementValue, ElementValue); for (int32 i = 0; i < OutputCountWide; i += VECTOR_WIDTH_FLOATS) { VectorStore(SplatValue, OutputElements + i); } } for (int32 i = OutputCountWide; i < OutputCount; ++i) { OutputElements[i] = ElementValue; } } VM_FORCEINLINE static void SplatElement(int32 OutputCount, float ElementValue, FFloat16* RESTRICT OutputElements) { uint16 TargetValue; FPlatformMath::StoreHalf(&TargetValue, ElementValue); uint32 TargetValue32 = uint32(TargetValue) | ((uint32(TargetValue) << 16)); const VectorRegister4Float SplatValue = MakeVectorRegister(TargetValue32, TargetValue32, TargetValue32, TargetValue32); while (OutputCount > 7) { VectorStore(SplatValue, (float*)OutputElements); // TODO: LWC: this was using a void* previously which always assumed float*, but revisit, doesn't seem correct either way... 
OutputElements += 8; OutputCount -= 8; } while (OutputCount > 0) { *((uint16*)OutputElements) = TargetValue; OutputElements++; OutputCount--; } } template VM_FORCEINLINE static void CopyElements(int32 ElementCount, const T* RESTRICT SourceElements, T* RESTRICT OutputElements) { memcpy(OutputElements, SourceElements, ElementCount * sizeof(T)); } VM_FORCEINLINE static void CopyElements(int32 ElementCount, const float* RESTRICT SourceElements, FFloat16* RESTRICT OutputElements) { const float* Src = SourceElements; uint16* Dst = (uint16*)OutputElements; while (ElementCount > 7) { FPlatformMath::WideVectorStoreHalf(Dst, Src); Dst += 8; Src += 8; ElementCount -= 8; } while (ElementCount > 3) { FPlatformMath::VectorStoreHalf(Dst, Src); Dst += 4; Src += 4; ElementCount -= 4; } while (ElementCount > 0) { FPlatformMath::StoreHalf(Dst, *Src); Dst += 1; Src += 1; ElementCount -= 1; } } template VM_FORCEINLINE static void ScalarShuffleElements(int32 ElementCount, const SourceType* RESTRICT SourceElements, const int8* RESTRICT ValidMask, TargetType* RESTRICT OutputElements) { constexpr int32 ElementsPerMask = 4; // scalar path that will read 4 values and write one at a time to the output based on the valid mask int32 MaskIt = 0; while (ElementCount) { const int8 ShuffleMask = ValidMask[MaskIt]; const int8 AdvanceCount = FMath::CountBits(ShuffleMask); check(AdvanceCount >= 0 && AdvanceCount <= 4); if (AdvanceCount) { for (int32 ScalarIt = 0; ScalarIt < ElementsPerMask; ++ScalarIt) { if (!!(ShuffleMask & (1 << ScalarIt))) { *OutputElements = SourceElements[MaskIt * ElementsPerMask + ScalarIt]; ++OutputElements; } } ElementCount -= AdvanceCount; } ++MaskIt; } } VM_FORCEINLINE static void ScalarShuffleElements(int32 ElementCount, const float* RESTRICT SourceElements, const int8* RESTRICT ValidMask, FFloat16* RESTRICT OutputElements) { constexpr int32 ElementsPerMask = 4; // scalar path that will read 4 values and write one at a time to the output based on the valid mask int32 MaskIt = 0; while (ElementCount) { checkSlow(ElementCount > 0); const int8 ShuffleMask = ValidMask[MaskIt]; const int8 AdvanceCount = FMath::CountBits(ShuffleMask); if (AdvanceCount) { for (int32 ScalarIt = 0; ScalarIt < ElementsPerMask; ++ScalarIt) { if (!!(ShuffleMask & (1 << ScalarIt))) { FPlatformMath::StoreHalf((uint16*)OutputElements, SourceElements[MaskIt * ElementsPerMask + ScalarIt]); ++OutputElements; } } ElementCount -= AdvanceCount; } ++MaskIt; } } VM_FORCEINLINE static void VectorShuffleElements(int32 ElementCount, const int32* RESTRICT SourceElements, const int8* RESTRICT ValidMask, int32* RESTRICT OutputElements) { int32 SourceIt = 0; const VectorRegister4Int* RESTRICT SourceVectors = reinterpret_cast(SourceElements); // vector shuffle path writes 4 at a time (though the trailing elements may not be valid) until we have to move over // to the scalar version for fear of overwriting our neighbors while (ElementCount >= VECTOR_WIDTH_FLOATS) { const int8 ShuffleMask = ValidMask[SourceIt]; const int8 AdvanceCount = FMath::CountBits(ShuffleMask); check(AdvanceCount >= 0 && AdvanceCount <= 4); // // VectorIntStore( - unaligned writes of 16 bytes to our Destination; note that this maneuver requires us to have // our output buffers padded out to 16 bytes! 
// VectorIntShuffle( - swizzle our source register to pack the valid entries at the beginning, with 0s at the end // Source, - source data // ShuffleMask), - result of the VectorMaskBits done in the acquireindex, int8/VectorRegister4Float of input // Destination); VectorIntStore(VectorIntShuffle(SourceVectors[SourceIt], VectorVMConstants::RegisterShuffleMask[ShuffleMask]), OutputElements); OutputElements += AdvanceCount; ElementCount -= AdvanceCount; ++SourceIt; } ScalarShuffleElements(ElementCount, SourceElements + VECTOR_WIDTH_FLOATS * SourceIt, ValidMask + SourceIt, OutputElements); } VM_FORCEINLINE static void VectorShuffleElements(int32 ElementCount, const float* RESTRICT SourceElements, const int8* RESTRICT ValidMask, FFloat16* RESTRICT OutputElements) { int32 SourceIt = 0; const VectorRegister4Int* RESTRICT SourceVectors = reinterpret_cast(SourceElements); // vector shuffle path writes 4 at a time (though the trailing elements may not be valid) until we have to move over // to the scalar version for fear of overwriting our neighbors while (ElementCount >= VECTOR_WIDTH_FLOATS) { const int8 ShuffleMask = ValidMask[SourceIt]; const int8 AdvanceCount = FMath::CountBits(ShuffleMask); check(AdvanceCount >= 0 && AdvanceCount <= 4); // VectorIntStore( - unaligned writes of 16 bytes to our Destination; note that this maneuver requires us to have // our output buffers padded out to 16 bytes! // VectorIntShuffle( - swizzle our source register to pack the valid entries at the beginning, with 0s at the end // Source, - source data // ShuffleMask), - result of the VectorMaskBits done in the acquireindex, int8/VectorRegister4Float of input // Destination); const VectorRegister4Int ShuffledFloats = VectorIntShuffle(SourceVectors[SourceIt], VectorVMConstants::RegisterShuffleMask[ShuffleMask]); FPlatformMath::VectorStoreHalf((uint16*)OutputElements, (float*)&ShuffledFloats); OutputElements += AdvanceCount; ElementCount -= AdvanceCount; ++SourceIt; } ScalarShuffleElements(ElementCount, SourceElements + VECTOR_WIDTH_FLOATS * SourceIt, ValidMask + SourceIt, OutputElements); } template static void CopyConstantToOutput(FVectorVMContext& Context) { if (SkipIfEmpty(Context)) { return; } const uint16 DataSetIndex = Context.DecodeU16(); Context.SkipCode(sizeof(uint16)); // don't need DestIndexRegisterIdx const uint16 AccumulatedOpCount = Context.DecodeU16(); FDataSetMeta& DataSetMeta = Context.GetDataSetMeta(DataSetIndex); for (uint16 OpIt = 0; OpIt < AccumulatedOpCount; ++OpIt) { FConstantHandler SourceHandler(Context); const uint16 DestRegisterIndex = Context.DecodeU16(); TargetType* RESTRICT OutputElements = Context.GetOutputRegister(DataSetIndex, DestRegisterIndex) + Context.ValidInstanceIndexStart; SplatElement(Context.ValidInstanceCount, SourceHandler.Constant, OutputElements); } } template static void CopyRegisterToOutput(FVectorVMContext& Context) { if (bCheckIfEmpty && SkipIfEmpty(Context)) { return; } const uint16 DataSetIndex = Context.DecodeU16(); Context.SkipCode(sizeof(uint16)); // don't need DestIndexRegisterIdx const uint16 AccumulatedOpCount = Context.DecodeU16(); check(Context.ValidInstanceCount == Context.NumInstances); FDataSetMeta& DataSetMeta = Context.GetDataSetMeta(DataSetIndex); for (uint16 OpIt = 0; OpIt < AccumulatedOpCount; ++OpIt) { FRegisterHandler SourceHandler(Context); const uint16 DestRegisterIndex = Context.DecodeU16(); TargetType* RESTRICT OutputElements = Context.GetOutputRegister(DataSetIndex, DestRegisterIndex) + Context.ValidInstanceIndexStart; 
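			// Every instance is valid on this path (checked above), so a straight contiguous copy into the reserved
			// output range suffices; the FFloat16 overload of CopyElements converts the float source to half via
			// StoreHalf/VectorStoreHalf as it copies.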
CopyElements(Context.ValidInstanceCount, SourceHandler.GetDest(), OutputElements); } } template static void ShuffleRegisterToOutput(FVectorVMContext& Context) { if (SkipIfEmpty(Context)) { return; } else if (Context.ValidInstanceUniform) { CopyRegisterToOutput(Context); return; } const uint16 DataSetIndex = Context.DecodeU16(); const uint16 DestIndexRegisterIdx = Context.DecodeU16(); const uint16 AccumulatedOpCount = Context.DecodeU16(); FDataSetMeta& DataSetMeta = Context.GetDataSetMeta(DataSetIndex); const int8* RESTRICT DestIndexReg = reinterpret_cast(Context.GetTempRegister(DestIndexRegisterIdx)); for (uint16 OpIt = 0; OpIt < AccumulatedOpCount; ++OpIt) { FRegisterHandler SourceRegister(Context); TargetType* RESTRICT DestReg = Context.GetOutputRegister(DataSetIndex, Context.DecodeU16()) + Context.ValidInstanceIndexStart; // the number of instances that we're expecting to write. it is important that we keep track of it because when we // get down to the end we need to switch from the shuffled approach to a scalar approach so that we don't // overwrite the indexed output that another parallel context might have written to VectorShuffleElements(Context.ValidInstanceCount, SourceRegister.GetDest(), DestIndexReg, DestReg); } } bool OptimizeBatch(FVectorVMCodeOptimizerContext& Context) { const int32 BatchedOpCount = BatchedOps.Num(); if (!BatchedOpCount) return false; for (const auto& BatchEntry : BatchedOps) { const uint16 AccumulatedOpCount = BatchEntry.Value.Num(); if (!AccumulatedOpCount) continue; // matrix of options based on the: // Op (are we outputting ints, floats or halfs) // SrcOpType (are we copying a constant or a register over) // additionally, if we know that the index is constant, then we know at worst we're doing a copy if (BatchEntry.Key.SrcOpType == SRCOP_RRC) { switch (BatchEntry.Key.Op) { case EVectorVMOp::outputdata_float: Context.WriteExecFunction(CopyConstantToOutput); break; case EVectorVMOp::outputdata_int32: Context.WriteExecFunction(CopyConstantToOutput); break; case EVectorVMOp::outputdata_half: Context.WriteExecFunction(CopyConstantToOutput); break; default: check(0); } } else if (AcquireIndexConstant) { switch (BatchEntry.Key.Op) { case EVectorVMOp::outputdata_float: Context.WriteExecFunction(CopyRegisterToOutput); break; case EVectorVMOp::outputdata_int32: Context.WriteExecFunction(CopyRegisterToOutput); break; case EVectorVMOp::outputdata_half: Context.WriteExecFunction(CopyRegisterToOutput); break; default: check(0); } } else { check(BatchEntry.Key.SrcOpType == SRCOP_RRR); switch (BatchEntry.Key.Op) { case EVectorVMOp::outputdata_float: Context.WriteExecFunction(ShuffleRegisterToOutput); break; case EVectorVMOp::outputdata_int32: Context.WriteExecFunction(ShuffleRegisterToOutput); break; case EVectorVMOp::outputdata_half: Context.WriteExecFunction(ShuffleRegisterToOutput); break; default: check(0); } } Context.Write(BatchEntry.Key.DataSetIndex); Context.Write(BatchEntry.Key.DestIndexRegisterIdx); Context.Write(AccumulatedOpCount); for (const FOpValue& OpValue : BatchEntry.Value) { Context.Write(OpValue.SourceRegisterIndex); Context.Write(OpValue.DestRegisterIdx); } } return true; } bool ExtractOp(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context) { FOpKey Key; Key.Op = Op; Key.SrcOpType = Context.BaseContext.DecodeSrcOperandTypes(); Key.DataSetIndex = Context.DecodeU16(); Key.DestIndexRegisterIdx = Context.DecodeU16(); if (Key.DestIndexRegisterIdx != WorkingRegisterIndex) { // if we've found an output node that is not related to the acquire index op, 
then just exit return false; } FOpValue Value; Value.SourceRegisterIndex = Context.DecodeU16(); Value.DestRegisterIdx = Context.DecodeU16(); TArray& ExistingOps = BatchedOps.FindOrAdd(Key); ExistingOps.Add(Value); return true; } private: using RegisterType = VectorRegister4Int; using ScalarType = int32; uint16 DataSetCounterIndex = 0; uint16 ValidTestRegisterIndex = 0; uint16 WorkingRegisterIndex = 0; bool AcquireIndexConstant = false; struct FOpKey { uint16 DestIndexRegisterIdx; uint16 DataSetIndex; uint8 SrcOpType; EVectorVMOp Op; }; struct FOpValue { uint16 SourceRegisterIndex; uint16 DestRegisterIdx; }; struct FOpKeyFuncs : public TDefaultMapKeyFuncs, false> { static VM_FORCEINLINE bool Matches(const FOpKey& A, const FOpKey& B) { return A.DestIndexRegisterIdx == B.DestIndexRegisterIdx && A.DataSetIndex == B.DataSetIndex && A.SrcOpType == B.SrcOpType && A.Op == B.Op; } static VM_FORCEINLINE uint32 GetKeyHash(const FOpKey& Key) { return HashCombine(Key.DestIndexRegisterIdx | (Key.DataSetIndex << 16), Key.SrcOpType | (static_cast(Key.Op) << 8)); } }; TMap, FDefaultSetAllocator, FOpKeyFuncs> BatchedOps; }; // look for the pattern of acquireindex followed by a bunch of outputs. bool PackedOutputOptimization(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context) { if (!GbBatchPackVMOutput) { return false; } if (Op == EVectorVMOp::acquireindex) { const auto RollbackState = Context.CreateCodeState(); FBatchedWriteIndexedOutput BatchedOutputOp; BatchedOutputOp.OptimizeAcquireIndex(Context); bool BatchValid = true; EVectorVMOp NextOp = Context.PeekOp(); while (BatchValid && BatchedOutputOp.IsOutput(NextOp)) { BatchValid = BatchedOutputOp.ExtractOp(Context.BaseContext.DecodeOp(), Context); NextOp = Context.PeekOp(); } // if there's nothing worth optimizing here, then just revert what we've parsed if (!BatchValid || !BatchedOutputOp.OptimizeBatch(Context)) { Context.RollbackCodeState(RollbackState); return false; } // We handled the existing op return true; } return false; } void ExecBatchedInput(FVectorVMContext& Context) { while (true) { FVectorVMExecFunction ExecFunction = reinterpret_cast(Context.DecodePtr()); if (ExecFunction == nullptr) { break; } ExecFunction(Context); } } void ConditionalAddInputWrapper(FVectorVMCodeOptimizerContext& Context, bool& OuterAdded) { if (!OuterAdded) { Context.WriteExecFunction(ExecBatchedInput); OuterAdded = true; } } EVectorVMOp BatchedInputOptimization(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context) { if (!GbBatchVMInput) { return Op; } bool OuterAdded = false; bool HasValidOp = true; while (HasValidOp) { switch (Op) { case EVectorVMOp::inputdata_half: ConditionalAddInputWrapper(Context, OuterAdded); FVectorKernelReadInput::Optimize(Context); break; case EVectorVMOp::inputdata_int32: ConditionalAddInputWrapper(Context, OuterAdded); FVectorKernelReadInput::Optimize(Context); break; case EVectorVMOp::inputdata_float: ConditionalAddInputWrapper(Context, OuterAdded); FVectorKernelReadInput::Optimize(Context); break; default: HasValidOp = false; break; } if (HasValidOp) { Op = Context.BaseContext.DecodeOp(); } } if (OuterAdded) { Context.WriteExecFunction(nullptr); } return Op; } bool SafeMathOptimization(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context) { if (!GbSafeOptimizedKernels) { return false; } switch (Op) { case EVectorVMOp::div: FVectorKernelDivSafe::Optimize(Context); return true; case EVectorVMOp::rcp: FVectorKernelRcpSafe::Optimize(Context); return true; case EVectorVMOp::rsq: FVectorKernelRsqSafe::Optimize(Context); return true; case 
EVectorVMOp::sqrt: FVectorKernelSqrtSafe::Optimize(Context); return true; case EVectorVMOp::log: FVectorKernelLogSafe::Optimize(Context); return true; case EVectorVMOp::pow: FVectorKernelPowSafe::Optimize(Context); return true; default: return false; } } void VectorVM::OptimizeByteCode(const uint8* ByteCode, TArray& OptimizedCode, TArrayView ExternalFunctionRegisterCounts) { OptimizedCode.Empty(); //-TODO:7 Support unaligned writes & little endian #if PLATFORM_SUPPORTS_UNALIGNED_LOADS && PLATFORM_LITTLE_ENDIAN if (!GbOptimizeVMByteCode || (ByteCode == nullptr)) { return; } FVectorVMCodeOptimizerContext Context(FVectorVMContext::Get(), ByteCode, OptimizedCode, ExternalFunctionRegisterCounts); // add any optimization filters in here, useful so what we can isolate optimizations with CVars FVectorVMCodeOptimizerContext::OptimizeVMFunction VMFilters[] = { //BatchedInputOptimization, //BatchedOutputOptimization, PackedOutputOptimization, SafeMathOptimization, }; EVectorVMOp Op = EVectorVMOp::done; do { Op = Context.BaseContext.DecodeOp(); // Filters allow us to modify a single or series of operations bool bOpWasFiltered = false; for (auto Filter : VMFilters) { if ( Filter(Op, Context) ) { bOpWasFiltered = true; break; } } if (bOpWasFiltered) { continue; } // Optimize op switch (Op) { case EVectorVMOp::add: FVectorKernelAdd::Optimize(Context); break; case EVectorVMOp::sub: FVectorKernelSub::Optimize(Context); break; case EVectorVMOp::mul: FVectorKernelMul::Optimize(Context); break; case EVectorVMOp::div: FVectorKernelDiv::Optimize(Context); break; case EVectorVMOp::mad: FVectorKernelMad::Optimize(Context); break; case EVectorVMOp::lerp: FVectorKernelLerp::Optimize(Context); break; case EVectorVMOp::rcp: FVectorKernelRcp::Optimize(Context); break; case EVectorVMOp::rsq: FVectorKernelRsq::Optimize(Context); break; case EVectorVMOp::sqrt: FVectorKernelSqrt::Optimize(Context); break; case EVectorVMOp::neg: FVectorKernelNeg::Optimize(Context); break; case EVectorVMOp::abs: FVectorKernelAbs::Optimize(Context); break; case EVectorVMOp::exp: FVectorKernelExp::Optimize(Context); break; case EVectorVMOp::exp2: FVectorKernelExp2::Optimize(Context); break; case EVectorVMOp::log: FVectorKernelLog::Optimize(Context); break; case EVectorVMOp::log2: FVectorKernelLog2::Optimize(Context); break; case EVectorVMOp::sin: FVectorKernelSin::Optimize(Context); break; case EVectorVMOp::cos: FVectorKernelCos::Optimize(Context); break; case EVectorVMOp::tan: FVectorKernelTan::Optimize(Context); break; case EVectorVMOp::asin: FVectorKernelASin::Optimize(Context); break; case EVectorVMOp::acos: FVectorKernelACos::Optimize(Context); break; case EVectorVMOp::atan: FVectorKernelATan::Optimize(Context); break; case EVectorVMOp::atan2: FVectorKernelATan2::Optimize(Context); break; case EVectorVMOp::ceil: FVectorKernelCeil::Optimize(Context); break; case EVectorVMOp::floor: FVectorKernelFloor::Optimize(Context); break; case EVectorVMOp::round: FVectorKernelRound::Optimize(Context); break; case EVectorVMOp::fmod: FVectorKernelMod::Optimize(Context); break; case EVectorVMOp::frac: FVectorKernelFrac::Optimize(Context); break; case EVectorVMOp::trunc: FVectorKernelTrunc::Optimize(Context); break; case EVectorVMOp::clamp: FVectorKernelClamp::Optimize(Context); break; case EVectorVMOp::min: FVectorKernelMin::Optimize(Context); break; case EVectorVMOp::max: FVectorKernelMax::Optimize(Context); break; case EVectorVMOp::pow: FVectorKernelPow::Optimize(Context); break; case EVectorVMOp::sign: FVectorKernelSign::Optimize(Context); break; 
// Swap fast math kernels for their range-checked "safe" variants when
// GbSafeOptimizedKernels is enabled.
bool SafeMathOptimization(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context)
{
	if (!GbSafeOptimizedKernels)
	{
		return false;
	}

	switch (Op)
	{
		case EVectorVMOp::div:  FVectorKernelDivSafe::Optimize(Context); return true;
		case EVectorVMOp::rcp:  FVectorKernelRcpSafe::Optimize(Context); return true;
		case EVectorVMOp::rsq:  FVectorKernelRsqSafe::Optimize(Context); return true;
		case EVectorVMOp::sqrt: FVectorKernelSqrtSafe::Optimize(Context); return true;
		case EVectorVMOp::log:  FVectorKernelLogSafe::Optimize(Context); return true;
		case EVectorVMOp::pow:  FVectorKernelPowSafe::Optimize(Context); return true;
		default: return false;
	}
}
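#if 0
// Editor's sketch, not engine code: the shape of a new optimization filter. It
// follows the bool-returning signature used by PackedOutputOptimization and
// SafeMathOptimization (assumed to match
// FVectorVMCodeOptimizerContext::OptimizeVMFunction); return true only when the
// filter consumed the op, so OptimizeByteCode skips its default handling. To enable
// it, append it to the VMFilters array below.
static bool ExampleFilter(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context)
{
	// Consume nothing; every op falls through to the default switch in OptimizeByteCode.
	return false;
}
#endif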
void VectorVM::OptimizeByteCode(const uint8* ByteCode, TArray<uint8>& OptimizedCode, TArrayView<uint8> ExternalFunctionRegisterCounts)
{
	OptimizedCode.Empty();

	//-TODO: Support unaligned writes & little endian
#if PLATFORM_SUPPORTS_UNALIGNED_LOADS && PLATFORM_LITTLE_ENDIAN
	if (!GbOptimizeVMByteCode || (ByteCode == nullptr))
	{
		return;
	}

	FVectorVMCodeOptimizerContext Context(FVectorVMContext::Get(), ByteCode, OptimizedCode, ExternalFunctionRegisterCounts);

	// Add any optimization filters in here; useful so we can isolate optimizations with CVars.
	FVectorVMCodeOptimizerContext::OptimizeVMFunction VMFilters[] =
	{
		//BatchedInputOptimization,
		//BatchedOutputOptimization,
		PackedOutputOptimization,
		SafeMathOptimization,
	};

	EVectorVMOp Op = EVectorVMOp::done;
	do
	{
		Op = Context.BaseContext.DecodeOp();

		// Filters allow us to modify a single op or a series of operations.
		bool bOpWasFiltered = false;
		for (auto Filter : VMFilters)
		{
			if (Filter(Op, Context))
			{
				bOpWasFiltered = true;
				break;
			}
		}
		if (bOpWasFiltered)
		{
			continue;
		}

		// Optimize op
		switch (Op)
		{
			case EVectorVMOp::add: FVectorKernelAdd::Optimize(Context); break;
			case EVectorVMOp::sub: FVectorKernelSub::Optimize(Context); break;
			case EVectorVMOp::mul: FVectorKernelMul::Optimize(Context); break;
			case EVectorVMOp::div: FVectorKernelDiv::Optimize(Context); break;
			case EVectorVMOp::mad: FVectorKernelMad::Optimize(Context); break;
			case EVectorVMOp::lerp: FVectorKernelLerp::Optimize(Context); break;
			case EVectorVMOp::rcp: FVectorKernelRcp::Optimize(Context); break;
			case EVectorVMOp::rsq: FVectorKernelRsq::Optimize(Context); break;
			case EVectorVMOp::sqrt: FVectorKernelSqrt::Optimize(Context); break;
			case EVectorVMOp::neg: FVectorKernelNeg::Optimize(Context); break;
			case EVectorVMOp::abs: FVectorKernelAbs::Optimize(Context); break;
			case EVectorVMOp::exp: FVectorKernelExp::Optimize(Context); break;
			case EVectorVMOp::exp2: FVectorKernelExp2::Optimize(Context); break;
			case EVectorVMOp::log: FVectorKernelLog::Optimize(Context); break;
			case EVectorVMOp::log2: FVectorKernelLog2::Optimize(Context); break;
			case EVectorVMOp::sin: FVectorKernelSin::Optimize(Context); break;
			case EVectorVMOp::cos: FVectorKernelCos::Optimize(Context); break;
			case EVectorVMOp::tan: FVectorKernelTan::Optimize(Context); break;
			case EVectorVMOp::asin: FVectorKernelASin::Optimize(Context); break;
			case EVectorVMOp::acos: FVectorKernelACos::Optimize(Context); break;
			case EVectorVMOp::atan: FVectorKernelATan::Optimize(Context); break;
			case EVectorVMOp::atan2: FVectorKernelATan2::Optimize(Context); break;
			case EVectorVMOp::ceil: FVectorKernelCeil::Optimize(Context); break;
			case EVectorVMOp::floor: FVectorKernelFloor::Optimize(Context); break;
			case EVectorVMOp::round: FVectorKernelRound::Optimize(Context); break;
			case EVectorVMOp::fmod: FVectorKernelMod::Optimize(Context); break;
			case EVectorVMOp::frac: FVectorKernelFrac::Optimize(Context); break;
			case EVectorVMOp::trunc: FVectorKernelTrunc::Optimize(Context); break;
			case EVectorVMOp::clamp: FVectorKernelClamp::Optimize(Context); break;
			case EVectorVMOp::min: FVectorKernelMin::Optimize(Context); break;
			case EVectorVMOp::max: FVectorKernelMax::Optimize(Context); break;
			case EVectorVMOp::pow: FVectorKernelPow::Optimize(Context); break;
			case EVectorVMOp::sign: FVectorKernelSign::Optimize(Context); break;
			case EVectorVMOp::step: FVectorKernelStep::Optimize(Context); break;
			case EVectorVMOp::random: FVectorKernelRandom::Optimize(Context); break;
			case EVectorVMOp::noise: VectorVMNoise::Optimize_Noise1D(Context); break;
			case EVectorVMOp::noise2D: VectorVMNoise::Optimize_Noise2D(Context); break;
			case EVectorVMOp::noise3D: VectorVMNoise::Optimize_Noise3D(Context); break;
			case EVectorVMOp::cmplt: FVectorKernelCompareLT::Optimize(Context); break;
			case EVectorVMOp::cmple: FVectorKernelCompareLE::Optimize(Context); break;
			case EVectorVMOp::cmpgt: FVectorKernelCompareGT::Optimize(Context); break;
			case EVectorVMOp::cmpge: FVectorKernelCompareGE::Optimize(Context); break;
			case EVectorVMOp::cmpeq: FVectorKernelCompareEQ::Optimize(Context); break;
			case EVectorVMOp::cmpneq: FVectorKernelCompareNEQ::Optimize(Context); break;
			case EVectorVMOp::select: FVectorKernelSelect::Optimize(Context); break;
			case EVectorVMOp::addi: FVectorIntKernelAdd::Optimize(Context); break;
			case EVectorVMOp::subi: FVectorIntKernelSubtract::Optimize(Context); break;
			case EVectorVMOp::muli: FVectorIntKernelMultiply::Optimize(Context); break;
			case EVectorVMOp::divi: FVectorIntKernelDivide::Optimize(Context); break;
			case EVectorVMOp::clampi: FVectorIntKernelClamp::Optimize(Context); break;
			case EVectorVMOp::mini: FVectorIntKernelMin::Optimize(Context); break;
			case EVectorVMOp::maxi: FVectorIntKernelMax::Optimize(Context); break;
			case EVectorVMOp::absi: FVectorIntKernelAbs::Optimize(Context); break;
			case EVectorVMOp::negi: FVectorIntKernelNegate::Optimize(Context); break;
			case EVectorVMOp::signi: FVectorIntKernelSign::Optimize(Context); break;
			case EVectorVMOp::randomi: FScalarIntKernelRandom::Optimize(Context); break;
			case EVectorVMOp::cmplti: FVectorIntKernelCompareLT::Optimize(Context); break;
			case EVectorVMOp::cmplei: FVectorIntKernelCompareLE::Optimize(Context); break;
			case EVectorVMOp::cmpgti: FVectorIntKernelCompareGT::Optimize(Context); break;
			case EVectorVMOp::cmpgei: FVectorIntKernelCompareGE::Optimize(Context); break;
			case EVectorVMOp::cmpeqi: FVectorIntKernelCompareEQ::Optimize(Context); break;
			case EVectorVMOp::cmpneqi: FVectorIntKernelCompareNEQ::Optimize(Context); break;
			case EVectorVMOp::bit_and: FVectorIntKernelBitAnd::Optimize(Context); break;
			case EVectorVMOp::bit_or: FVectorIntKernelBitOr::Optimize(Context); break;
			case EVectorVMOp::bit_xor: FVectorIntKernelBitXor::Optimize(Context); break;
			case EVectorVMOp::bit_not: FVectorIntKernelBitNot::Optimize(Context); break;
			case EVectorVMOp::bit_lshift: FVectorIntKernelBitLShift::Optimize(Context); break;
			case EVectorVMOp::bit_rshift: FVectorIntKernelBitRShift::Optimize(Context); break;
			case EVectorVMOp::logic_and: FVectorIntKernelLogicAnd::Optimize(Context); break;
			case EVectorVMOp::logic_or: FVectorIntKernelLogicOr::Optimize(Context); break;
			case EVectorVMOp::logic_xor: FVectorIntKernelLogicXor::Optimize(Context); break;
			case EVectorVMOp::logic_not: FVectorIntKernelLogicNot::Optimize(Context); break;
			case EVectorVMOp::f2i: FVectorKernelFloatToInt::Optimize(Context); break;
			case EVectorVMOp::i2f: FVectorKernelIntToFloat::Optimize(Context); break;
			case EVectorVMOp::f2b: FVectorKernelFloatToBool::Optimize(Context); break;
			case EVectorVMOp::b2f: FVectorKernelBoolToFloat::Optimize(Context); break;
			case EVectorVMOp::i2b: FVectorKernelIntToBool::Optimize(Context); break;
			case EVectorVMOp::b2i: FVectorKernelBoolToInt::Optimize(Context); break;
			case EVectorVMOp::fasi: FVectorKernelFloatAsInt::Optimize(Context); break;
			case EVectorVMOp::iasf: FVectorKernelIntAsFloat::Optimize(Context); break;
			case EVectorVMOp::outputdata_half: FScalarKernelWriteOutputIndexed<FFloat16>::Optimize(Context); break;
			case EVectorVMOp::inputdata_half: FVectorKernelReadInput<FFloat16>::Optimize(Context); break;
			case EVectorVMOp::outputdata_int32: FScalarKernelWriteOutputIndexed<int32>::Optimize(Context); break;
			case EVectorVMOp::inputdata_int32: FVectorKernelReadInput<int32>::Optimize(Context); break;
			case EVectorVMOp::outputdata_float: FScalarKernelWriteOutputIndexed<float>::Optimize(Context); break;
			case EVectorVMOp::inputdata_float: FVectorKernelReadInput<float>::Optimize(Context); break;
			case EVectorVMOp::inputdata_noadvance_int32: FVectorKernelReadInputNoAdvance<int32>::Optimize(Context); break;
			case EVectorVMOp::inputdata_noadvance_float: FVectorKernelReadInputNoAdvance<float>::Optimize(Context); break;
			case EVectorVMOp::inputdata_noadvance_half: FVectorKernelReadInputNoAdvance<FFloat16>::Optimize(Context); break;
			case EVectorVMOp::acquireindex: FScalarKernelAcquireCounterIndex::Optimize(Context); break;
			case EVectorVMOp::external_func_call: FKernelExternalFunctionCall::Optimize(Context); break;
			case EVectorVMOp::exec_index: FVectorKernelExecutionIndex::Optimize(Context); break;
			case EVectorVMOp::enter_stat_scope: FVectorKernelEnterStatScope::Optimize(Context); break;
			case EVectorVMOp::exit_stat_scope: FVectorKernelExitStatScope::Optimize(Context); break;

			// Special-case ops to handle unique IDs; these could be rewritten as generalized buffer operations. TODO!
			case EVectorVMOp::update_id: FScalarKernelUpdateID::Optimize(Context); break;
			case EVectorVMOp::acquire_id: FScalarKernelAcquireID::Optimize(Context); break;

			// Execution always terminates with a "done" opcode.
			case EVectorVMOp::done: break;

			// Opcode not recognized / implemented.
			default:
				UE_LOG(LogVectorVM, Fatal, TEXT("Unknown op code 0x%02x"), (uint32)Op);
				OptimizedCode.Empty();
				return; // BAIL
		}
	} while (Op != EVectorVMOp::done);

	Context.WriteExecFunction(nullptr);
	Context.EncodeJumpTable();
#endif //PLATFORM_SUPPORTS_UNALIGNED_LOADS && PLATFORM_LITTLE_ENDIAN
}

#endif // #if VECTORVM_SUPPORTS_LEGACY

#undef VM_FORCEINLINE
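#if 0
// Editor's usage sketch; PrepareScript and its caller are illustrative and not part
// of this file. OptimizeByteCode leaves OutOptimizedCode empty when
// GbOptimizeVMByteCode is disabled, when ByteCode is null, or when the platform
// lacks unaligned little-endian loads, so callers must keep the original byte code
// as a fallback execution path.
static void PrepareScript(const TArray<uint8>& ByteCode, TArrayView<uint8> ExternalFunctionRegisterCounts, TArray<uint8>& OutOptimizedCode)
{
	VectorVM::OptimizeByteCode(ByteCode.GetData(), OutOptimizedCode, ExternalFunctionRegisterCounts);
	if (OutOptimizedCode.Num() == 0)
	{
		// Fall back to executing the unoptimized byte code.
	}
}
#endif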