// Copyright 1998-2019 Epic Games, Inc. All Rights Reserved. #include "VectorVM.h" #include "Modules/ModuleManager.h" #include "UObject/Class.h" #include "UObject/Package.h" #include "VectorVMPrivate.h" #include "Stats/Stats.h" #include "HAL/ConsoleManager.h" #include "Async/ParallelFor.h" IMPLEMENT_MODULE(FDefaultModuleImpl, VectorVM); DECLARE_STATS_GROUP(TEXT("VectorVM"), STATGROUP_VectorVM, STATCAT_Advanced); DECLARE_CYCLE_STAT(TEXT("VVM Execution"), STAT_VVMExec, STATGROUP_VectorVM); DECLARE_CYCLE_STAT(TEXT("VVM Chunk"), STAT_VVMExecChunk, STATGROUP_VectorVM); DEFINE_LOG_CATEGORY_STATIC(LogVectorVM, All, All); //#define FREE_TABLE_LOCK_CONTENTION_WARNINGS (!UE_BUILD_SHIPPING) #define FREE_TABLE_LOCK_CONTENTION_WARNINGS (0) //I don't expect us to ever be waiting long #define FREE_TABLE_LOCK_CONTENTION_WARN_THRESHOLD_MS (0.01) //#define VM_FORCEINLINE #define VM_FORCEINLINE FORCEINLINE #define OP_REGISTER (0) #define OP0_CONST (1 << 0) #define OP1_CONST (1 << 1) #define OP2_CONST (1 << 2) #define SRCOP_RRR (OP_REGISTER | OP_REGISTER | OP_REGISTER) #define SRCOP_RRC (OP_REGISTER | OP_REGISTER | OP0_CONST) #define SRCOP_RCR (OP_REGISTER | OP1_CONST | OP_REGISTER) #define SRCOP_RCC (OP_REGISTER | OP1_CONST | OP0_CONST) #define SRCOP_CRR (OP2_CONST | OP_REGISTER | OP_REGISTER) #define SRCOP_CRC (OP2_CONST | OP_REGISTER | OP0_CONST) #define SRCOP_CCR (OP2_CONST | OP1_CONST | OP_REGISTER) #define SRCOP_CCC (OP2_CONST | OP1_CONST | OP0_CONST) namespace VectorVMConstants { static const VectorRegisterInt VectorStride = MakeVectorRegisterInt(VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS); // for generating shuffle masks given input {A, B, C, D} constexpr uint32 ShufMaskIgnore = 0xFFFFFFFF; constexpr uint32 ShufMaskA = 0x03020100; constexpr uint32 ShufMaskB = 0x07060504; constexpr uint32 ShufMaskC = 0x0B0A0908; constexpr uint32 ShufMaskD = 0x0F0E0D0C; static const VectorRegisterInt RegisterShuffleMask[] = { 
MakeVectorRegisterInt(ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 0000 MakeVectorRegisterInt(ShufMaskD, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 0001 MakeVectorRegisterInt(ShufMaskC, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 0010 MakeVectorRegisterInt(ShufMaskC, ShufMaskD, ShufMaskIgnore, ShufMaskIgnore), // 0011 MakeVectorRegisterInt(ShufMaskB, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 0100 MakeVectorRegisterInt(ShufMaskB, ShufMaskD, ShufMaskIgnore, ShufMaskIgnore), // 0101 MakeVectorRegisterInt(ShufMaskB, ShufMaskC, ShufMaskIgnore, ShufMaskIgnore), // 0110 MakeVectorRegisterInt(ShufMaskB, ShufMaskC, ShufMaskD, ShufMaskIgnore), // 0111 MakeVectorRegisterInt(ShufMaskA, ShufMaskIgnore, ShufMaskIgnore, ShufMaskIgnore), // 1000 MakeVectorRegisterInt(ShufMaskA, ShufMaskD, ShufMaskIgnore, ShufMaskIgnore), // 1001 MakeVectorRegisterInt(ShufMaskA, ShufMaskC, ShufMaskIgnore, ShufMaskIgnore), // 1010 MakeVectorRegisterInt(ShufMaskA, ShufMaskC, ShufMaskD, ShufMaskIgnore), // 1011 MakeVectorRegisterInt(ShufMaskA, ShufMaskB, ShufMaskIgnore, ShufMaskIgnore), // 1100 MakeVectorRegisterInt(ShufMaskA, ShufMaskB, ShufMaskD, ShufMaskIgnore), // 1101 MakeVectorRegisterInt(ShufMaskA, ShufMaskB, ShufMaskC, ShufMaskIgnore), // 1110 MakeVectorRegisterInt(ShufMaskA, ShufMaskB, ShufMaskC, ShufMaskD), // 1111 }; constexpr uint32 cOne = 0xFFFFFFFFU; constexpr uint32 cZero = 0x00000000U; static const VectorRegister RemainderMask[] = { MakeVectorRegister(cZero, cZero, cZero, cZero), // 0 remaining MakeVectorRegister(cOne, cZero, cZero, cZero), // 1 remaining MakeVectorRegister(cOne, cOne, cZero, cZero), // 2 remaining MakeVectorRegister(cOne, cOne, cOne, cZero), // 3 remaining MakeVectorRegister(cOne, cOne, cOne, cOne), // 4 remaining }; }; // helper function wrapping the SSE3 shuffle operation. 
Currently implemented for PS4/XB1/Neon, the // rest will just use the FPU version so as to not push the requirements up to SSE3 (currently SSE2) #if PLATFORM_ENABLE_VECTORINTRINSICS && (PLATFORM_PS4 || PLATFORM_XBOXONE) #define VectorIntShuffle( Vec, Mask ) _mm_shuffle_epi8( (Vec), (Mask) ) #elif PLATFORM_ENABLE_VECTORINTRINSICS_NEON /** * Shuffles a VectorInt using a provided shuffle mask * * @param Vec Source vector * @param Mask Shuffle vector */ FORCEINLINE VectorRegisterInt VectorIntShuffle(const VectorRegisterInt& Vec, const VectorRegisterInt& Mask) { uint8x8x2_t VecSplit = { { vget_low_u8(Vec), vget_high_u8(Vec) } }; return vcombine_u8(vtbl2_u8(VecSplit, vget_low_u8(Mask)), vtbl2_u8(VecSplit, vget_high_u8(Mask))); } #else FORCEINLINE VectorRegisterInt VectorIntShuffle(const VectorRegisterInt& Vec, const VectorRegisterInt& Mask) { VectorRegisterInt Result; const int8* VecBytes = reinterpret_cast(&Vec); const int8* MaskBytes = reinterpret_cast(&Mask); int8* ResultBytes = reinterpret_cast(&Result); for (int32 i = 0; i < sizeof(VectorRegisterInt); ++i) { ResultBytes[i] = (MaskBytes[i] < 0) ? 0 : VecBytes[MaskBytes[i] % 16]; } return Result; } #endif //Temporarily locking the free table until we can implement a lock free algorithm. UE-65856 FORCEINLINE void FDataSetMeta::LockFreeTable() { #if FREE_TABLE_LOCK_CONTENTION_WARNINGS uint64 StartCycles = FPlatformTime::Cycles64(); #endif FreeTableLock.Lock(); #if FREE_TABLE_LOCK_CONTENTION_WARNINGS uint64 EndCylces = FPlatformTime::Cycles64(); double DurationMs = FPlatformTime::ToMilliseconds64(EndCylces - StartCycles); if (DurationMs >= FREE_TABLE_LOCK_CONTENTION_WARN_THRESHOLD_MS) { UE_LOG(LogVectorVM, Warning, TEXT("VectorVM Stalled in LockFreeTable()! 
%g ms"), DurationMs); } #endif } FORCEINLINE void FDataSetMeta::UnlockFreeTable() { FreeTableLock.Unlock(); } static int32 GbParallelVVM = 1; static FAutoConsoleVariableRef CVarbParallelVVM( TEXT("vm.Parallel"), GbParallelVVM, TEXT("If > 0 vector VM chunk level paralellism will be enabled. \n"), ECVF_Default ); static int32 GParallelVVMChunksPerBatch = 4; static FAutoConsoleVariableRef CVarParallelVVMChunksPerBatch( TEXT("vm.ParallelChunksPerBatch"), GParallelVVMChunksPerBatch, TEXT("Number of chunks to process per task when running in parallel. \n"), ECVF_Default ); //These are possibly too granular to enable for everyone. static int32 GbDetailedVMScriptStats = 0; static FAutoConsoleVariableRef CVarDetailedVMScriptStats( TEXT("vm.DetailedVMScriptStats"), GbDetailedVMScriptStats, TEXT("If > 0 the vector VM will emit stats for it's internal module calls. \n"), ECVF_Default ); static int32 GParallelVVMInstancesPerChunk = 128; static FAutoConsoleVariableRef CVarParallelVVMInstancesPerChunk( TEXT("vm.InstancesPerChunk"), GParallelVVMInstancesPerChunk, TEXT("Number of instances per VM chunk. 
(default=128) \n"), ECVF_ReadOnly ); static int32 GbOptimizeVMByteCode = 0; static FAutoConsoleVariableRef CVarbOptimizeVMByteCode( TEXT("vm.OptimizeVMByteCode"), GbOptimizeVMByteCode, TEXT("If > 0 vector VM code optimization will be enabled at runtime.\n"), ECVF_Default ); static int32 GbUseOptimizedVMByteCode = 0; static FAutoConsoleVariableRef CVarbUseOptimizedVMByteCode( TEXT("vm.UseOptimizedVMByteCode"), GbUseOptimizedVMByteCode, TEXT("If > 0 optimized vector VM code will be excuted at runtime.\n"), ECVF_Default ); static int32 GbSafeOptimizedKernels = 1; static FAutoConsoleVariableRef CVarbSafeOptimizedKernels( TEXT("vm.SafeOptimizedKernels"), GbSafeOptimizedKernels, TEXT("If > 0 optimized vector VM byte code will use safe versions of the kernels.\n"), ECVF_Default ); static int32 GbBatchPackVMOutput = 1; static FAutoConsoleVariableRef CVarbBatchPackVMOutput( TEXT("vm.BatchPackedVMOutput"), GbBatchPackVMOutput, TEXT("If > 0 output elements will be packed and batched branch free.\n"), ECVF_Default ); ////////////////////////////////////////////////////////////////////////// // VM Code Optimizer Context typedef void(*FVectorVMExecFunction)(FVectorVMContext&); struct FVectorVMCodeOptimizerContext { typedef EVectorVMOp(*OptimizeVMFunction)(EVectorVMOp, FVectorVMCodeOptimizerContext&); explicit FVectorVMCodeOptimizerContext(FVectorVMContext& InBaseContext, const uint8* ByteCode, TArray& InOptimizedCode, TArrayView InExternalFunctionRegisterCounts) : BaseContext(InBaseContext) , OptimizedCode(InOptimizedCode) , ExternalFunctionRegisterCounts(InExternalFunctionRegisterCounts) { BaseContext.PrepareForExec(0, nullptr, nullptr, nullptr, TArrayView(), 0, false); BaseContext.PrepareForChunk(ByteCode, 0, 0); } FVectorVMCodeOptimizerContext(const FVectorVMCodeOptimizerContext&) = delete; FVectorVMCodeOptimizerContext(const FVectorVMCodeOptimizerContext&&) = delete; template int32 GetNumLoops() const { return 0; } FORCEINLINE uint8 DecodeU8() { return 
BaseContext.DecodeU8(); } FORCEINLINE uint16 DecodeU16() { return BaseContext.DecodeU16(); } FORCEINLINE uint32 DecodeU32() { return BaseContext.DecodeU32(); } FORCEINLINE uint64 DecodeU64() { return BaseContext.DecodeU64(); } //-TODO: Support unaligned writes template void Write(const T& v) { reinterpret_cast(OptimizedCode[OptimizedCode.AddUninitialized(sizeof(T))]) = v; } struct FOptimizerCodeState { uint8 const* BaseContextCode; int32 OptimizedCodeLength; }; FOptimizerCodeState CreateCodeState() { FOptimizerCodeState State; State.BaseContextCode = BaseContext.Code; State.OptimizedCodeLength = OptimizedCode.Num(); return State; } void RollbackCodeState(const FOptimizerCodeState& State) { BaseContext.Code = State.BaseContextCode; OptimizedCode.SetNum(State.OptimizedCodeLength, false /* allowShrink */); } FVectorVMContext& BaseContext; TArray& OptimizedCode; const TArrayView ExternalFunctionRegisterCounts; const int32 StartInstance = 0; }; ////////////////////////////////////////////////////////////////////////// // Constant Handlers struct FConstantHandlerBase { const uint16 ConstantIndex; FConstantHandlerBase(FVectorVMContext& Context) : ConstantIndex(Context.DecodeU16()) {} FORCEINLINE void Advance() { } static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Context.DecodeU16()); } }; template struct FConstantHandler : public FConstantHandlerBase { const T Constant; FConstantHandler(FVectorVMContext& Context) : FConstantHandlerBase(Context) , Constant(*((T*)(Context.ConstantTable + ConstantIndex))) {} FORCEINLINE const T& Get() { return Constant; } FORCEINLINE const T& GetAndAdvance() { return Constant; } }; template<> struct FConstantHandler : public FConstantHandlerBase { const VectorRegister Constant; FConstantHandler(FVectorVMContext& Context) : FConstantHandlerBase(Context) , Constant(VectorLoadFloat1(&Context.ConstantTable[ConstantIndex])) {} FORCEINLINE const VectorRegister Get() { return Constant; } FORCEINLINE const VectorRegister 
GetAndAdvance() { return Constant; } }; template<> struct FConstantHandler : public FConstantHandlerBase { const VectorRegisterInt Constant; FConstantHandler(FVectorVMContext& Context) : FConstantHandlerBase(Context) , Constant(VectorIntLoad1(&Context.ConstantTable[ConstantIndex])) {} FORCEINLINE const VectorRegisterInt Get() { return Constant; } FORCEINLINE const VectorRegisterInt GetAndAdvance() { return Constant; } }; ////////////////////////////////////////////////////////////////////////// // Register handlers. // Handle reading of a register, advancing the pointer with each read. struct FRegisterHandlerBase { const int32 RegisterIndex; FORCEINLINE FRegisterHandlerBase(FVectorVMContext& Context) : RegisterIndex(Context.DecodeU16()) {} static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Context.DecodeU16()); } }; template struct FRegisterHandler : public FRegisterHandlerBase { private: T * RESTRICT Register; public: FORCEINLINE FRegisterHandler(FVectorVMContext& Context) : FRegisterHandlerBase(Context) , Register((T*)Context.GetTempRegister(RegisterIndex)) {} FORCEINLINE const T Get() { return *Register; } FORCEINLINE T* GetDest() { return Register; } FORCEINLINE void Advance() { ++Register; } FORCEINLINE const T GetAndAdvance() { return *Register++; } FORCEINLINE T* GetDestAndAdvance() { return Register++; } }; ////////////////////////////////////////////////////////////////////////// FVectorVMContext::FVectorVMContext() : Code(nullptr) , ConstantTable(nullptr) , ExternalFunctionTable(nullptr) , UserPtrTable(nullptr) , NumInstances(0) , StartInstance(0) #if STATS , StatScopes(nullptr) #endif , TempRegisterSize(0) , TempBufferSize(0) { RandStream.GenerateNewSeed(); } void FVectorVMContext::PrepareForExec( int32 InNumTempRegisters, const uint8* InConstantTable, FVMExternalFunction* InExternalFunctionTable, void** InUserPtrTable, TArrayView InDataSetMetaTable, int32 MaxNumInstances, bool bInParallelExecution ) { NumTempRegisters = 
InNumTempRegisters; ConstantTable = InConstantTable; ExternalFunctionTable = InExternalFunctionTable; UserPtrTable = InUserPtrTable; TempRegisterSize = Align(MaxNumInstances * VectorVM::MaxInstanceSizeBytes, PLATFORM_CACHE_LINE_SIZE); TempBufferSize = TempRegisterSize * NumTempRegisters; TempRegTable.SetNumUninitialized(TempBufferSize, false); DataSetMetaTable = InDataSetMetaTable; for (auto& TLSTempData : ThreadLocalTempData) { TLSTempData.Reset(); } ThreadLocalTempData.SetNum(DataSetMetaTable.Num()); bIsParallelExecution = bInParallelExecution; } #if STATS void FVectorVMContext::SetStatScopes(const TArray* InStatScopes) { check(InStatScopes); StatScopes = InStatScopes; StatCounterStack.Reserve(StatScopes->Num()); } #endif void FVectorVMContext::FinishExec() { //At the end of executing each chunk we can push any thread local temporary data out to the main storage with locks or atomics. check(ThreadLocalTempData.Num() == DataSetMetaTable.Num()); for(int32 DataSetIndex=0; DataSetIndex < DataSetMetaTable.Num(); ++DataSetIndex) { FDataSetThreadLocalTempData&RESTRICT Data = ThreadLocalTempData[DataSetIndex]; if (Data.IDsToFree.Num() > 0) { TArray&RESTRICT FreeIDTable = *DataSetMetaTable[DataSetIndex].FreeIDTable; int32&RESTRICT NumFreeIDs = *DataSetMetaTable[DataSetIndex].NumFreeIDs; check(FreeIDTable.Num() >= NumFreeIDs + Data.IDsToFree.Num()); //Temporarily locking the free table until we can implement something lock-free DataSetMetaTable[DataSetIndex].LockFreeTable(); for (int32 IDToFree : Data.IDsToFree) { //UE_LOG(LogVectorVM, Warning, TEXT("AddFreeID: ID:%d | FreeTableIdx:%d."), IDToFree, NumFreeIDs); FreeIDTable[NumFreeIDs++] = IDToFree; } //Unlock the free table. DataSetMetaTable[DataSetIndex].UnlockFreeTable(); Data.IDsToFree.Reset(); } //Also update the max ID seen. This should be the ONLY place in the VM we update this max value. 
if ( bIsParallelExecution ) { volatile int32* MaxUsedID = DataSetMetaTable[DataSetIndex].MaxUsedID; int32 LocalMaxUsedID; do { LocalMaxUsedID = *MaxUsedID; if (LocalMaxUsedID >= Data.MaxID) { break; } } while (FPlatformAtomics::InterlockedCompareExchange(MaxUsedID, Data.MaxID, LocalMaxUsedID) != LocalMaxUsedID); *MaxUsedID = FMath::Max(*MaxUsedID, Data.MaxID); } else { int32* MaxUsedID = DataSetMetaTable[DataSetIndex].MaxUsedID; *MaxUsedID = FMath::Max(*MaxUsedID, Data.MaxID); } } } ////////////////////////////////////////////////////////////////////////// uint8 VectorVM::CreateSrcOperandMask(EVectorVMOperandLocation Type0, EVectorVMOperandLocation Type1, EVectorVMOperandLocation Type2) { return (Type0 == EVectorVMOperandLocation::Constant ? OP0_CONST : OP_REGISTER) | (Type1 == EVectorVMOperandLocation::Constant ? OP1_CONST : OP_REGISTER) | (Type2 == EVectorVMOperandLocation::Constant ? OP2_CONST : OP_REGISTER); } ////////////////////////////////////////////////////////////////////////// // Kernels template struct TUnaryKernelHandler { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Exec); Arg0Handler::Optimize(Context); DstHandler::Optimize(Context); } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { Arg0Handler Arg0(Context); DstHandler Dst(Context); const int32 Loops = Context.GetNumLoops(); for (int32 i = 0; i < Loops; ++i) { Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance()); } } }; template struct TBinaryKernelHandler { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Exec); Arg0Handler::Optimize(Context); Arg1Handler::Optimize(Context); DstHandler::Optimize(Context); } static void Exec(FVectorVMContext& Context) { Arg0Handler Arg0(Context); Arg1Handler Arg1(Context); DstHandler Dst(Context); const int32 Loops = Context.GetNumLoops(); for (int32 i = 0; i < Loops; ++i) { Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance(), Arg1.GetAndAdvance()); } } 
}; template struct TTrinaryKernelHandler { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Exec); Arg0Handler::Optimize(Context); Arg1Handler::Optimize(Context); Arg2Handler::Optimize(Context); DstHandler::Optimize(Context); } static void Exec(FVectorVMContext& Context) { Arg0Handler Arg0(Context); Arg1Handler Arg1(Context); Arg2Handler Arg2(Context); DstHandler Dst(Context); const int32 Loops = Context.GetNumLoops(); for (int32 i = 0; i < Loops; ++i) { Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance(), Arg1.GetAndAdvance(), Arg2.GetAndAdvance()); } } }; /** Base class of vector kernels with a single operand. */ template struct TUnaryKernel { static void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: TUnaryKernelHandler::Optimize(Context); break; case SRCOP_RRC: TUnaryKernelHandler::Optimize(Context); break; default: check(0); break; }; } static void Exec(FVectorVMContext& Context) { const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: TUnaryKernelHandler::Exec(Context); break; case SRCOP_RRC: TUnaryKernelHandler::Exec(Context); break; default: check(0); break; }; } }; template struct TUnaryScalarKernel : public TUnaryKernel, FConstantHandler, FRegisterHandler, 1> {}; template struct TUnaryVectorKernel : public TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {}; template struct TUnaryScalarIntKernel : public TUnaryKernel, FConstantHandler, FRegisterHandler, 1> {}; template struct TUnaryVectorIntKernel : public TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {}; /** Base class of Vector kernels with 2 operands. 
*/
// Decodes the source operand types and forwards to a TBinaryKernelHandler for that combination.
// Only the four combinations in which operand 2 is a register are handled (SRCOP_RRR, SRCOP_RRC,
// SRCOP_RCR, SRCOP_RCC); anything else trips check(0).
// NOTE(review): the template argument lists are missing in this copy of the file, so the four
// cases look identical here - presumably each one selects a different register/constant handler
// pairing. Confirm against source control.
template struct TBinaryKernel
{
	// Optimizer path: the operand types are read from the wrapped base context.
	static void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes();
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TBinaryKernelHandler::Optimize(Context); break;
		case SRCOP_RRC: TBinaryKernelHandler::Optimize(Context); break;
		case SRCOP_RCR: TBinaryKernelHandler::Optimize(Context); break;
		case SRCOP_RCC: TBinaryKernelHandler::Optimize(Context); break;
		default: check(0); break;
		};
	}

	// Execution path: same dispatch, reading the operand types from the executing context.
	static void Exec(FVectorVMContext& Context)
	{
		const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes();
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TBinaryKernelHandler::Exec(Context); break;
		case SRCOP_RRC: TBinaryKernelHandler::Exec(Context); break;
		case SRCOP_RCR: TBinaryKernelHandler::Exec(Context); break;
		case SRCOP_RCC: TBinaryKernelHandler::Exec(Context); break;
		default: check(0); break;
		};
	}
};

// Convenience bases binding TBinaryKernel to concrete handlers; the last argument is 1 for the
// scalar variant and VECTOR_WIDTH_FLOATS for the vector variants.
template struct TBinaryScalarKernel : public TBinaryKernel, FConstantHandler, FRegisterHandler, 1> {};
template struct TBinaryVectorKernel : public TBinaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {};
template struct TBinaryVectorIntKernel : public TBinaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {};

/** Base class of Vector kernels with 3 operands.
*/
// Decodes the source operand types and forwards to a TTrinaryKernelHandler for that combination.
// All eight register/constant combinations (SRCOP_RRR .. SRCOP_CCC) are handled; any other value
// trips check(0).
// NOTE(review): the template argument lists are missing in this copy of the file, so the eight
// cases look identical here - presumably each one selects a different register/constant handler
// combination. Confirm against source control.
template struct TTrinaryKernel
{
	// Optimizer path: the operand types are read from the wrapped base context.
	static void Optimize(FVectorVMCodeOptimizerContext& Context)
	{
		const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes();
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TTrinaryKernelHandler::Optimize(Context); break;
		case SRCOP_RRC: TTrinaryKernelHandler::Optimize(Context); break;
		case SRCOP_RCR: TTrinaryKernelHandler::Optimize(Context); break;
		case SRCOP_RCC: TTrinaryKernelHandler::Optimize(Context); break;
		case SRCOP_CRR: TTrinaryKernelHandler::Optimize(Context); break;
		case SRCOP_CRC: TTrinaryKernelHandler::Optimize(Context); break;
		case SRCOP_CCR: TTrinaryKernelHandler::Optimize(Context); break;
		case SRCOP_CCC: TTrinaryKernelHandler::Optimize(Context); break;
		default: check(0); break;
		};
	}

	// Execution path: same dispatch, reading the operand types from the executing context.
	static void Exec(FVectorVMContext& Context)
	{
		const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes();
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TTrinaryKernelHandler::Exec(Context); break;
		case SRCOP_RRC: TTrinaryKernelHandler::Exec(Context); break;
		case SRCOP_RCR: TTrinaryKernelHandler::Exec(Context); break;
		case SRCOP_RCC: TTrinaryKernelHandler::Exec(Context); break;
		case SRCOP_CRR: TTrinaryKernelHandler::Exec(Context); break;
		case SRCOP_CRC: TTrinaryKernelHandler::Exec(Context); break;
		case SRCOP_CCR: TTrinaryKernelHandler::Exec(Context); break;
		case SRCOP_CCC: TTrinaryKernelHandler::Exec(Context); break;
		default: check(0); break;
		};
	}
};

// Convenience bases binding TTrinaryKernel to concrete handlers; the last argument is 1 for the
// scalar variant and VECTOR_WIDTH_FLOATS for the vector variants.
template struct TTrinaryScalarKernel : public TTrinaryKernel, FConstantHandler, FRegisterHandler, 1> {};
template struct TTrinaryVectorKernel : public TTrinaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {};
template struct TTrinaryVectorIntKernel : public TTrinaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> {};

/*------------------------------------------------------------------------------
	Implementation of all kernel operations.
------------------------------------------------------------------------------*/ struct FVectorKernelAdd : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1) { *Dst = VectorAdd(Src0, Src1); } }; struct FVectorKernelSub : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1) { *Dst = VectorSubtract(Src0, Src1); } }; struct FVectorKernelMul : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1) { *Dst = VectorMultiply(Src0, Src1); } }; struct FVectorKernelDiv : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorDivide(Src0, Src1); } }; struct FVectorKernelDivSafe : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { VectorRegister ValidMask = VectorCompareGT(VectorAbs(Src1), GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorDivide(Src0, Src1), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelMad : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2) { *Dst = VectorMultiplyAdd(Src0, Src1, Src2); } }; struct FVectorKernelLerp : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2) { const VectorRegister OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Src2); const VectorRegister Tmp = VectorMultiply(Src0, 
OneMinusAlpha); *Dst = VectorMultiplyAdd(Src1, Src2, Tmp); } }; struct FVectorKernelRcp : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0) { *Dst = VectorReciprocal(Src0); } }; // if the magnitude of the value is too small, then the result will be 0 (not NaN/Inf) struct FVectorKernelRcpSafe : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { VectorRegister ValidMask = VectorCompareGT(VectorAbs(Src0), GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorReciprocal(Src0), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelRsq : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0) { *Dst = VectorReciprocalSqrt(Src0); } }; // if the value is very small or negative, then the result will be 0 (not NaN/Inf/imaginary) struct FVectorKernelRsqSafe : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { VectorRegister ValidMask = VectorCompareGT(Src0, GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorReciprocalSqrt(Src0), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelSqrt : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0) { // TODO: Need a SIMD sqrt! 
*Dst = VectorReciprocal(VectorReciprocalSqrt(Src0)); } }; struct FVectorKernelSqrtSafe : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { VectorRegister ValidMask = VectorCompareGT(Src0, GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorReciprocal(VectorReciprocalSqrt(Src0)), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelNeg : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0) { *Dst = VectorNegate(Src0); } }; struct FVectorKernelAbs : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0) { *Dst = VectorAbs(Src0); } }; struct FVectorKernelExp : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorExp(Src0); } }; struct FVectorKernelExp2 : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorExp2(Src0); } }; struct FVectorKernelLog : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorLog(Src0); } }; struct FVectorKernelLogSafe : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { VectorRegister ValidMask = VectorCompareGT(Src0, GlobalVectorConstants::FloatZero); *Dst = VectorSelect(ValidMask, VectorLog(Src0), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelLog2 : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorLog2(Src0); } }; struct 
FVectorKernelClamp : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2) { const VectorRegister Tmp = VectorMax(Src0, Src1); *Dst = VectorMin(Tmp, Src2); } }; struct FVectorKernelSin : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorSin(Src0); } }; struct FVectorKernelCos : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorCos(Src0); } }; struct FVectorKernelTan : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorTan(Src0); } }; struct FVectorKernelASin : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorASin(Src0); } }; struct FVectorKernelACos : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorACos(Src0); } }; struct FVectorKernelATan : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorATan(Src0); } }; struct FVectorKernelATan2 : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorATan2(Src0, Src1); } }; struct FVectorKernelCeil : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorCeil(Src0); } }; struct FVectorKernelFloor : public TUnaryVectorKernel { static void 
VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorFloor(Src0); } }; struct FVectorKernelRound : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { //TODO: >SSE4 has direct ops for this. VectorRegister Trunc = VectorTruncate(Src0); *Dst = VectorAdd(Trunc, VectorTruncate(VectorMultiply(VectorSubtract(Src0, Trunc), GlobalVectorConstants::FloatAlmostTwo))); } }; struct FVectorKernelMod : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorMod(Src0, Src1); } }; struct FVectorKernelFrac : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorFractional(Src0); } }; struct FVectorKernelTrunc : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorTruncate(Src0); } }; struct FVectorKernelCompareLT : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorCompareLT(Src0, Src1); } }; struct FVectorKernelCompareLE : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorCompareLE(Src0, Src1); } }; struct FVectorKernelCompareGT : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorCompareGT(Src0, Src1); } }; struct FVectorKernelCompareGE : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& 
Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorCompareGE(Src0, Src1); } }; struct FVectorKernelCompareEQ : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorCompareEQ(Src0, Src1); } }; struct FVectorKernelCompareNEQ : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { *Dst = VectorCompareNE(Src0, Src1); } }; struct FVectorKernelSelect : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Mask, VectorRegister A, VectorRegister B) { *Dst = VectorSelect(Mask, A, B); } }; struct FVectorKernelExecutionIndex { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Exec); FRegisterHandler::Optimize(Context); } static void VM_FORCEINLINE Exec(FVectorVMContext& Context) { static_assert(VECTOR_WIDTH_FLOATS == 4, "Need to update this when upgrading the VM to support >SSE2"); VectorRegisterInt VectorStride = MakeVectorRegisterInt(VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS); VectorRegisterInt Index = MakeVectorRegisterInt(Context.StartInstance, Context.StartInstance + 1, Context.StartInstance + 2, Context.StartInstance + 3); FRegisterHandler Dest(Context); const int32 Loops = Context.GetNumLoops(); for (int32 i = 0; i < Loops; ++i) { *Dest.GetDestAndAdvance() = Index; Index = VectorIntAdd(Index, VectorStride); } } }; struct FVectorKernelEnterStatScope { static void Optimize(FVectorVMCodeOptimizerContext& Context) { #if STATS Context.Write(Exec); FConstantHandler::Optimize(Context); #else // just skip the op if we don't have stats enabled FConstantHandler(Context.BaseContext); #endif } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { 
FConstantHandler ScopeIdx(Context); #if STATS if (GbDetailedVMScriptStats && Context.StatScopes) { int32 CounterIdx = Context.StatCounterStack.AddDefaulted(1); Context.StatCounterStack[CounterIdx].Start((*Context.StatScopes)[ScopeIdx.Get()]); } #endif } }; struct FVectorKernelExitStatScope { static void Optimize(FVectorVMCodeOptimizerContext& Context) { #if STATS Context.Write(Exec); #endif } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { #if STATS if (GbDetailedVMScriptStats) { Context.StatCounterStack.Last().Stop(); Context.StatCounterStack.Pop(false); } #endif } }; struct FVectorKernelRandom : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { const float rm = RAND_MAX; //EEK!. Improve this. Implement GPU style seeded rand instead of this. VectorRegister Result = MakeVectorRegister(Context.RandStream.GetFraction(), Context.RandStream.GetFraction(), Context.RandStream.GetFraction(), Context.RandStream.GetFraction()); *Dst = VectorMultiply(Result, Src0); } }; /* gaussian distribution random number (not working yet) */ struct FVectorKernelRandomGauss : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { const float rm = RAND_MAX; VectorRegister Result = MakeVectorRegister(Context.RandStream.GetFraction(), Context.RandStream.GetFraction(), Context.RandStream.GetFraction(), Context.RandStream.GetFraction()); Result = VectorSubtract(Result, GlobalVectorConstants::FloatOneHalf); Result = VectorMultiply(MakeVectorRegister(3.0f, 3.0f, 3.0f, 3.0f), Result); // taylor series gaussian approximation const VectorRegister SPi2 = VectorReciprocal(VectorReciprocalSqrt(MakeVectorRegister(2 * PI, 2 * PI, 2 * PI, 2 * PI))); VectorRegister Gauss = VectorReciprocal(SPi2); VectorRegister Div = VectorMultiply(GlobalVectorConstants::FloatTwo, SPi2); Gauss = 
VectorSubtract(Gauss, VectorDivide(VectorMultiply(Result, Result), Div)); Div = VectorMultiply(MakeVectorRegister(8.0f, 8.0f, 8.0f, 8.0f), SPi2); Gauss = VectorAdd(Gauss, VectorDivide(VectorPow(MakeVectorRegister(4.0f, 4.0f, 4.0f, 4.0f), Result), Div)); Div = VectorMultiply(MakeVectorRegister(48.0f, 48.0f, 48.0f, 48.0f), SPi2); Gauss = VectorSubtract(Gauss, VectorDivide(VectorPow(MakeVectorRegister(6.0f, 6.0f, 6.0f, 6.0f), Result), Div)); Gauss = VectorDivide(Gauss, MakeVectorRegister(0.4f, 0.4f, 0.4f, 0.4f)); Gauss = VectorMultiply(Gauss, Src0); *Dst = Gauss; } }; struct FVectorKernelMin : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1) { *Dst = VectorMin(Src0, Src1); } }; struct FVectorKernelMax : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1) { *Dst = VectorMax(Src0, Src1); } }; struct FVectorKernelPow : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1) { *Dst = VectorPow(Src0, Src1); } }; // if the base is small, then the result will be 0 struct FVectorKernelPowSafe : public TBinaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1) { VectorRegister ValidMask = VectorCompareGT(Src0, GlobalVectorConstants::SmallNumber); *Dst = VectorSelect(ValidMask, VectorPow(Src0, Src1), GlobalVectorConstants::FloatZero); } }; struct FVectorKernelSign : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorSign(Src0); } }; struct FVectorKernelStep : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, 
VectorRegister* RESTRICT Dst, VectorRegister Src0) { *Dst = VectorStep(Src0); } }; namespace VectorVMNoise { int32 P[512] = { 151,160,137,91,90,15, 131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23, 190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33, 88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166, 77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244, 102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196, 135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123, 5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42, 223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9, 129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228, 251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107, 49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254, 138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180, 151,160,137,91,90,15, 131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23, 190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33, 88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166, 77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244, 102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196, 135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123, 5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42, 223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9, 129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228, 251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107, 49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254, 
138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180 }; static FORCEINLINE float Lerp(float X, float A, float B) { return A + X * (B - A); } static FORCEINLINE float Fade(float X) { return X * X * X * (X * (X * 6 - 15) + 10); } static FORCEINLINE float Grad(int32 hash, float x, float y, float z) { hash &= 15; float u = (hash < 8) ? x : y; float v = (hash < 4) ? y : ((hash == 12 || hash == 14) ? x : z); return ((hash & 1) == 0 ? u : -u) + ((hash & 2) == 0 ? v : -v); } struct FScalarKernelNoise3D_iNoise : TTrinaryScalarKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X, float Y, float Z) { float Xfl = FMath::FloorToFloat(X); float Yfl = FMath::FloorToFloat(Y); float Zfl = FMath::FloorToFloat(Z); int32 Xi = (int32)(Xfl) & 255; int32 Yi = (int32)(Yfl) & 255; int32 Zi = (int32)(Zfl) & 255; X -= Xfl; Y -= Yfl; Z -= Zfl; float Xm1 = X - 1.0f; float Ym1 = Y - 1.0f; float Zm1 = Z - 1.0f; int32 A = P[Xi] + Yi; int32 AA = P[A] + Zi; int32 AB = P[A + 1] + Zi; int32 B = P[Xi + 1] + Yi; int32 BA = P[B] + Zi; int32 BB = P[B + 1] + Zi; float U = Fade(X); float V = Fade(Y); float W = Fade(Z); *Dst = Lerp(W, Lerp(V, Lerp(U, Grad(P[AA], X, Y, Z), Grad(P[BA], Xm1, Y, Z)), Lerp(U, Grad(P[AB], X, Ym1, Z), Grad(P[BB], Xm1, Ym1, Z))), Lerp(V, Lerp(U, Grad(P[AA + 1], X, Y, Zm1), Grad(P[BA + 1], Xm1, Y, Zm1)), Lerp(U, Grad(P[AB + 1], X, Ym1, Zm1), Grad(P[BB + 1], Xm1, Ym1, Zm1)))); } }; struct FScalarKernelNoise2D_iNoise : TBinaryScalarKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X, float Y) { *Dst = 0.0f;//TODO } }; struct FScalarKernelNoise1D_iNoise : TUnaryScalarKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X) { *Dst = 0.0f;//TODO; } }; static void Noise1D(FVectorVMContext& Context) { FScalarKernelNoise1D_iNoise::Exec(Context); } static void Noise2D(FVectorVMContext& Context) { 
FScalarKernelNoise2D_iNoise::Exec(Context); } static void Noise3D(FVectorVMContext& Context) { //Basic scalar implementation of perlin's improved noise until I can spend some quality time exploring vectorized implementations of Marc O's noise from Random.ush. //http://mrl.nyu.edu/~perlin/noise/ FScalarKernelNoise3D_iNoise::Exec(Context); } static void Optimize_Noise1D(FVectorVMCodeOptimizerContext& Context) { FScalarKernelNoise1D_iNoise::Optimize(Context); } static void Optimize_Noise2D(FVectorVMCodeOptimizerContext& Context) { FScalarKernelNoise2D_iNoise::Optimize(Context); } static void Optimize_Noise3D(FVectorVMCodeOptimizerContext& Context) { FScalarKernelNoise3D_iNoise::Optimize(Context); } }; //Olaf's orginal curl noise. Needs updating for the new scalar VM and possibly calling Curl Noise to avoid confusion with regular noise? //Possibly needs to be a data interface as the VM can't output Vectors? struct FVectorKernelNoise : public TUnaryVectorKernel { static VectorRegister RandomTable[17][17][17]; static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0) { const VectorRegister VecSize = MakeVectorRegister(16.0f, 16.0f, 16.0f, 16.0f); *Dst = GlobalVectorConstants::FloatZero; for (uint32 i = 1; i < 2; i++) { float Di = 0.2f * (1.0f/(1<(&Coords); const int32 Cx = CoordPtr[0]; const int32 Cy = CoordPtr[1]; const int32 Cz = CoordPtr[2]; VectorRegister Frac = VectorFractional(Coords); VectorRegister Alpha = VectorReplicate(Frac, 0); VectorRegister OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Alpha); VectorRegister XV1 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz], OneMinusAlpha)); VectorRegister XV2 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz], OneMinusAlpha)); VectorRegister XV3 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz+1], OneMinusAlpha)); 
VectorRegister XV4 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz+1], OneMinusAlpha)); Alpha = VectorReplicate(Frac, 1); OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Alpha); VectorRegister YV1 = VectorMultiplyAdd(XV1, Alpha, VectorMultiply(XV2, OneMinusAlpha)); VectorRegister YV2 = VectorMultiplyAdd(XV3, Alpha, VectorMultiply(XV4, OneMinusAlpha)); Alpha = VectorReplicate(Frac, 2); OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Alpha); VectorRegister ZV = VectorMultiplyAdd(YV1, Alpha, VectorMultiply(YV2, OneMinusAlpha)); *Dst = VectorAdd(*Dst, ZV); } } }; VectorRegister FVectorKernelNoise::RandomTable[17][17][17]; ////////////////////////////////////////////////////////////////////////// //Special Kernels. /** Special kernel for acquiring a new ID. TODO. Can be written as general RWBuffer ops when we support that. */ struct FScalarKernelAcquireID { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // IDIndexReg Context.Write(Context.DecodeU16()); // IDTagReg } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const int32 DataSetIndex = Context.DecodeU16(); const TArrayView MetaTable = Context.DataSetMetaTable; TArray&RESTRICT FreeIDTable = *MetaTable[DataSetIndex].FreeIDTable; const int32 Tag = MetaTable[DataSetIndex].IDAcquireTag; const int32 IDIndexReg = Context.DecodeU16(); int32*RESTRICT IDIndex = (int32*)(Context.GetTempRegister(IDIndexReg)); const int32 IDTagReg = Context.DecodeU16(); int32*RESTRICT IDTag = (int32*)(Context.GetTempRegister(IDTagReg)); int32& NumFreeIDs = *MetaTable[DataSetIndex].NumFreeIDs; //Temporarily using a lock to ensure thread safety for accessing the FreeIDTable until a lock free solution can be implemented. 
MetaTable[DataSetIndex].LockFreeTable(); check(FreeIDTable.Num() >= Context.NumInstances); check(NumFreeIDs >= Context.NumInstances); for (int32 i = 0; i < Context.NumInstances; ++i) { int32 FreeIDTableIndex = --NumFreeIDs; //Grab the value from the FreeIDTable. int32 AcquiredID = FreeIDTable[FreeIDTableIndex]; checkSlow(AcquiredID != INDEX_NONE); //UE_LOG(LogVectorVM, Warning, TEXT("AcquireID: ID:%d | FreeTableIdx:%d."), AcquiredID, FreeIDTableIndex); //Mark this entry in the FreeIDTable as invalid. FreeIDTable[FreeIDTableIndex] = INDEX_NONE; *IDIndex = AcquiredID; *IDTag = Tag; ++IDIndex; ++IDTag; } MetaTable[DataSetIndex].UnlockFreeTable(); } }; /** Special kernel for updating a new ID. TODO. Can be written as general RWBuffer ops when we support that. */ struct FScalarKernelUpdateID { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // InstanceIDRegisterIndex Context.Write(Context.DecodeU16()); // InstanceIndexRegisterIndex } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const int32 DataSetIndex = Context.DecodeU16(); const int32 InstanceIDRegisterIndex = Context.DecodeU16(); const int32 InstanceIndexRegisterIndex = Context.DecodeU16(); const TArrayView MetaTable = Context.DataSetMetaTable; TArray&RESTRICT IDTable = *MetaTable[DataSetIndex].IDTable; const int32 InstanceOffset = MetaTable[DataSetIndex].InstanceOffset + Context.StartInstance; const int32*RESTRICT IDRegister = (int32*)(Context.GetTempRegister(InstanceIDRegisterIndex)); const int32*RESTRICT IndexRegister = (int32*)(Context.GetTempRegister(InstanceIndexRegisterIndex)); FDataSetThreadLocalTempData& DataSetTempData = Context.ThreadLocalTempData[DataSetIndex]; TArray&RESTRICT IDsToFree = DataSetTempData.IDsToFree; check(IDTable.Num() >= InstanceOffset + Context.NumInstances); for (int32 i = 0; i < Context.NumInstances; ++i) { int32 InstanceId = IDRegister[i]; int32 
Index = IndexRegister[i]; if (Index == INDEX_NONE) { //Add the ID to a thread local list of IDs to free which are actually added to the list safely at the end of this chunk's execution. IDsToFree.Add(InstanceId); IDTable[InstanceId] = INDEX_NONE; //UE_LOG(LogVectorVM, Warning, TEXT("FreeingID: InstanceID:%d."), InstanceId); } else { //Update the actual index for this ID. No thread safety is needed as this ID slot can only ever be written by this instance and so a single thread. IDTable[InstanceId] = Index; //Update thread local max ID seen. We push this to the real value at the end of execution. DataSetTempData.MaxID = FMath::Max(DataSetTempData.MaxID, InstanceId); //UE_LOG(LogVectorVM, Warning, TEXT("UpdateID: RealIdx:%d | InstanceID:%d."), RealIdx, InstanceId); } } } }; /** Special kernel for reading from the main input dataset. */ template struct FVectorKernelReadInput { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // InputRegisterIdx Context.Write(Context.DecodeU16()); // DestRegisterIdx } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { static const int32 InstancesPerVector = sizeof(VectorRegister) / sizeof(T); const int32 DataSetIndex = Context.DecodeU16(); const int32 InputRegisterIdx = Context.DecodeU16(); const int32 DestRegisterIdx = Context.DecodeU16(); const int32 Loops = Context.GetNumLoops(); VectorRegister* DestReg = (VectorRegister*)(Context.GetTempRegister(DestRegisterIdx)); VectorRegister* InputReg = (VectorRegister*)(Context.GetInputRegister(DataSetIndex, InputRegisterIdx) + Context.GetStartInstance()); //TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others. 
for (int32 i = 0; i < Loops; ++i) { *DestReg = VectorLoad(InputReg); ++DestReg; ++InputReg; } } }; /** Special kernel for reading from an input dataset; non-advancing (reads same instance everytime). * this kernel splats the X component of the source register to all 4 dest components; it's meant to * use scalar data sets as the source (e.g. events) */ template struct FVectorKernelReadInputNoAdvance { static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Exec); Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // InputRegisterIdx Context.Write(Context.DecodeU16()); // DestRegisterIdx } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { static const int32 InstancesPerVector = sizeof(VectorRegister) / sizeof(T); const int32 DataSetIndex = Context.DecodeU16(); const int32 InputRegisterIdx = Context.DecodeU16(); const int32 DestRegisterIdx = Context.DecodeU16(); const int32 Loops = Context.GetNumLoops(); VectorRegister* DestReg = (VectorRegister*)(Context.GetTempRegister(DestRegisterIdx)); VectorRegister* InputReg = (VectorRegister*)(Context.GetInputRegister(DataSetIndex, InputRegisterIdx)); //TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others. for (int32 i = 0; i < Loops; ++i) { *DestReg = VectorSwizzle(VectorLoad(InputReg), 0,0,0,0); ++DestReg; } } }; //TODO - Should be straight forwards to follow the input with a mix of the outputs direct indexing /** Special kernel for reading an specific location in an input register. 
*/ // template // struct FScalarKernelReadInputIndexed // { // static VM_FORCEINLINE void Exec(FVectorVMContext& Context) // { // int32* IndexReg = (int32*)(Context.RegisterTable[DecodeU16(Context)]); // T* InputReg = (T*)(Context.RegisterTable[DecodeU16(Context)]); // T* DestReg = (T*)(Context.RegisterTable[DecodeU16(Context)]); // // //Has to be scalar as each instance can read from a different location in the input buffer. // for (int32 i = 0; i < Context.NumInstances; ++i) // { // T* ReadPtr = (*InputReg) + (*IndexReg); // *DestReg = (*ReadPtr); // ++IndexReg; // ++DestReg; // } // } // }; /** Special kernel for writing to a specific output register. */ template struct FScalarKernelWriteOutputIndexed { static VM_FORCEINLINE void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpTypes = Context.BaseContext.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: Context.Write(DoKernel>); break; case SRCOP_RRC: Context.Write(DoKernel>); break; default: check(0); break; }; Context.Write(Context.DecodeU16()); // DataSetIndex Context.Write(Context.DecodeU16()); // DestIndexRegisterIdx Context.Write(Context.DecodeU16()); // DataHandlerType Context.Write(Context.DecodeU16()); // DestRegisterIdx } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const uint32 SrcOpTypes = Context.DecodeSrcOperandTypes(); switch (SrcOpTypes) { case SRCOP_RRR: DoKernel>(Context); break; case SRCOP_RRC: DoKernel>(Context); break; default: check(0); break; }; } template static VM_FORCEINLINE void DoKernel(FVectorVMContext& Context) { const int32 DataSetIndex = Context.DecodeU16(); const int32 DestIndexRegisterIdx = Context.DecodeU16(); T* DestIndexReg = (T*)(Context.GetTempRegister(DestIndexRegisterIdx)); DataHandlerType DataHandler(Context); const int32 DestRegisterIdx = Context.DecodeU16(); T* DestReg = Context.GetOutputRegister(DataSetIndex, DestRegisterIdx); for (int32 i = 0; i < Context.GetNumInstances(); ++i) { int32 DestIndex = *DestIndexReg; if 
(DestIndex != INDEX_NONE) { DestReg[DestIndex] = DataHandler.Get(); } ++DestIndexReg; DataHandler.Advance(); //We don't increment the dest as we index into it directly. } } }; struct FDataSetCounterHandler { int32* Counter; FDataSetCounterHandler(FVectorVMContext& Context) : Counter(&Context.GetDataSetMeta(Context.DecodeU16()).DataSetAccessIndex) {} VM_FORCEINLINE void Advance() { } VM_FORCEINLINE int32* Get() { return Counter; } VM_FORCEINLINE int32* GetAndAdvance() { return Counter; } //VM_FORCEINLINE const int32* GetDest() { return Counter; }Should never use as a dest. All kernels with read and write to this. static void Optimize(FVectorVMCodeOptimizerContext& Context) { Context.Write(Context.DecodeU16()); } }; struct FScalarKernelAcquireCounterIndex { template struct InternalKernel { static VM_FORCEINLINE void DoKernel(FVectorVMContext& Context, int32* RESTRICT Dst, int32* Index, int32 Valid) { if (Valid != 0) { *Dst = bThreadsafe ? FPlatformAtomics::InterlockedIncrement(Index) : ++(*Index); } else { *Dst = INDEX_NONE; // Subsequent DoKernal calls above will skip over INDEX_NONE register entries... 
} } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { const uint32 SrcOpType = Context.DecodeSrcOperandTypes(); switch (SrcOpType) { case SRCOP_RRR: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FRegisterHandler, 1>::Exec(Context); break; case SRCOP_RRC: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FConstantHandler, 1>::Exec(Context); break; default: check(0); break; }; } }; template static void ExecOptimized(FVectorVMContext& Context) { if (Context.IsParallelExecution()) { switch (SrcOpType) { case SRCOP_RRR: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FRegisterHandler, 1>::Exec(Context); break; case SRCOP_RRC: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FConstantHandler, 1>::Exec(Context); break; default: check(0); break; } } else { switch (SrcOpType) { case SRCOP_RRR: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FRegisterHandler, 1>::Exec(Context); break; case SRCOP_RRC: TBinaryKernelHandler, FRegisterHandler, FDataSetCounterHandler, FConstantHandler, 1>::Exec(Context); break; default: check(0); break; } } } static void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpType = Context.BaseContext.DecodeSrcOperandTypes(); switch (SrcOpType) { case SRCOP_RRR: Context.Write(FScalarKernelAcquireCounterIndex::ExecOptimized); break; case SRCOP_RRC: Context.Write(FScalarKernelAcquireCounterIndex::ExecOptimized); break; default: check(0); break; } // Three registers, note we don't call Optimize on the Kernel since that will write the Exec and we are selecting based upon thread safe or not Context.Write(Context.DecodeU16()); Context.Write(Context.DecodeU16()); Context.Write(Context.DecodeU16()); } static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { if ( Context.IsParallelExecution() ) { InternalKernel::Exec(Context); } else { InternalKernel::Exec(Context); } } }; //TODO: REWORK TO FUNCITON LIKE THE ABOVE. 
// /** Special kernel for decrementing a dataset counter. */ // struct FScalarKernelReleaseCounterIndex // { // static VM_FORCEINLINE void Exec(FVectorVMContext& Context) // { // int32* CounterPtr = (int32*)(Context.ConstantTable[DecodeU16(Context)]); // int32* DestReg = (int32*)(Context.RegisterTable[DecodeU16(Context)]); // // for (int32 i = 0; i < Context.NumInstances; ++i) // { // int32 Counter = (*CounterPtr--); // *DestReg = Counter >= 0 ? Counter : INDEX_NONE; // // ++DestReg; // } // } // }; ////////////////////////////////////////////////////////////////////////// //external_func_call struct FKernelExternalFunctionCall { static void Optimize(FVectorVMCodeOptimizerContext& Context) { const uint32 ExternalFuncIdx = Context.DecodeU8(); Context.Write(Exec); Context.Write(ExternalFuncIdx); const int32 NumRegisters = Context.ExternalFunctionRegisterCounts[ExternalFuncIdx]; for ( int32 i=0; i < NumRegisters; ++i ) { Context.Write(Context.DecodeU16()); } } static void Exec(FVectorVMContext& Context) { const uint32 ExternalFuncIdx = Context.DecodeU8(); Context.ExternalFunctionTable[ExternalFuncIdx].Execute(Context); } }; ////////////////////////////////////////////////////////////////////////// //Integer operations //addi, struct FVectorIntKernelAdd : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntAdd(Src0, Src1); } }; //subi, struct FVectorIntKernelSubtract : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntSubtract(Src0, Src1); } }; //muli, struct FVectorIntKernelMultiply : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntMultiply(Src0, Src1); } }; //divi, struct 
FVectorIntKernelDivide : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { int32 TmpA[4]; VectorIntStore(Src0, TmpA); int32 TmpB[4]; VectorIntStore(Src1, TmpB); // No intrinsics exist for integer divide. Since div by zero causes crashes, we must be safe against that. int32 TmpDst[4]; TmpDst[0] = TmpB[0] != 0 ? (TmpA[0] / TmpB[0]) : 0; TmpDst[1] = TmpB[1] != 0 ? (TmpA[1] / TmpB[1]) : 0; TmpDst[2] = TmpB[2] != 0 ? (TmpA[2] / TmpB[2]) : 0; TmpDst[3] = TmpB[3] != 0 ? (TmpA[3] / TmpB[3]) : 0; *Dst = MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3]); } }; //clampi, struct FVectorIntKernelClamp : TTrinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1, VectorRegisterInt Src2) { *Dst = VectorIntMin(VectorIntMax(Src0, Src1), Src2); } }; //mini, struct FVectorIntKernelMin : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntMin(Src0, Src1); } }; //maxi, struct FVectorIntKernelMax : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntMax(Src0, Src1); } }; //absi, struct FVectorIntKernelAbs : TUnaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0) { *Dst = VectorIntAbs(Src0); } }; //negi, struct FVectorIntKernelNegate : TUnaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0) { *Dst = VectorIntNegate(Src0); } }; //signi, struct FVectorIntKernelSign : TUnaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, 
VectorRegisterInt* Dst, VectorRegisterInt Src0) { *Dst = VectorIntSign(Src0); } }; //randomi, //No good way to do this with SSE atm so just do it scalar. struct FScalarIntKernelRandom : public TUnaryScalarIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, int32* RESTRICT Dst, int32 Src0) { const float rm = RAND_MAX; //EEK!. Improve this. Implement GPU style seeded rand instead of this. *Dst = static_cast(Context.RandStream.GetFraction() * Src0); } }; //cmplti, struct FVectorIntKernelCompareLT : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntCompareLT(Src0, Src1); } }; //cmplei, struct FVectorIntKernelCompareLE : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntCompareLE(Src0, Src1); } }; //cmpgti, struct FVectorIntKernelCompareGT : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntCompareGT(Src0, Src1); } }; //cmpgei, struct FVectorIntKernelCompareGE : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntCompareGE(Src0, Src1); } }; //cmpeqi, struct FVectorIntKernelCompareEQ : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntCompareEQ(Src0, Src1); } }; //cmpneqi, struct FVectorIntKernelCompareNEQ : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntCompareNEQ(Src0, Src1); } }; //bit_and, 
struct FVectorIntKernelBitAnd : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntAnd(Src0, Src1); } }; //bit_or, struct FVectorIntKernelBitOr : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntOr(Src0, Src1); } }; //bit_xor, struct FVectorIntKernelBitXor : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { *Dst = VectorIntXor(Src0, Src1); } }; //bit_not, struct FVectorIntKernelBitNot : TUnaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0) { *Dst = VectorIntNot(Src0); } }; // bit_lshift struct FVectorIntKernelBitLShift : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { int32 TmpA[4]; VectorIntStore(Src0, TmpA); int32 TmpB[4]; VectorIntStore(Src1, TmpB); int32 TmpDst[4]; TmpDst[0] = (TmpA[0] << TmpB[0]); TmpDst[1] = (TmpA[1] << TmpB[1]); TmpDst[2] = (TmpA[2] << TmpB[2]); TmpDst[3] = (TmpA[3] << TmpB[3]); *Dst = MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3] ); } }; // bit_rshift struct FVectorIntKernelBitRShift : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { int32 TmpA[4]; VectorIntStore(Src0, TmpA); int32 TmpB[4]; VectorIntStore(Src1, TmpB); int32 TmpDst[4]; TmpDst[0] = (TmpA[0] >> TmpB[0]); TmpDst[1] = (TmpA[1] >> TmpB[1]); TmpDst[2] = (TmpA[2] >> TmpB[2]); TmpDst[3] = (TmpA[3] >> TmpB[3]); *Dst = MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3]); } }; //"Boolean" 
ops. Currently handling bools as integers. //logic_and, struct FVectorIntKernelLogicAnd : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { //We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these? *Dst = VectorIntAnd(Src0, Src1); } }; //logic_or, struct FVectorIntKernelLogicOr : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { //We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these? *Dst = VectorIntOr(Src0, Src1); } }; //logic_xor, struct FVectorIntKernelLogicXor : TBinaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1) { //We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these? *Dst = VectorIntXor(Src0, Src1); } }; //logic_not, struct FVectorIntKernelLogicNot : TUnaryVectorIntKernel { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0) { //We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these? 
*Dst = VectorIntNot(Src0); } }; //conversions //f2i, struct FVectorKernelFloatToInt : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegister Src0) { *Dst = VectorFloatToInt(Src0); } }; //i2f, struct FVectorKernelIntToFloat : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* Dst, VectorRegisterInt Src0) { *Dst = VectorIntToFloat(Src0); } }; //f2b, struct FVectorKernelFloatToBool : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* Dst, VectorRegister Src0) { *Dst = VectorCompareGT(Src0, GlobalVectorConstants::FloatZero); } }; //b2f, struct FVectorKernelBoolToFloat : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* Dst, VectorRegister Src0) { *Dst = VectorSelect(Src0, GlobalVectorConstants::FloatOne, GlobalVectorConstants::FloatZero); } }; //i2b, struct FVectorKernelIntToBool : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0) { *Dst = VectorIntCompareGT(Src0, GlobalVectorConstants::IntZero); } }; //b2i, struct FVectorKernelBoolToInt : TUnaryKernel, FConstantHandler, FRegisterHandler, VECTOR_WIDTH_FLOATS> { static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0) { *Dst = VectorIntSelect(Src0, GlobalVectorConstants::IntOne, GlobalVectorConstants::IntZero); } }; #if WITH_EDITOR UEnum* g_VectorVMEnumStateObj = nullptr; UEnum* g_VectorVMEnumOperandObj = nullptr; #endif void VectorVM::Init() { static bool Inited = false; if (Inited == false) { #if WITH_EDITOR 
g_VectorVMEnumStateObj = StaticEnum(); g_VectorVMEnumOperandObj = StaticEnum(); #endif // random noise float TempTable[17][17][17]; for (int z = 0; z < 17; z++) { for (int y = 0; y < 17; y++) { for (int x = 0; x < 17; x++) { float f1 = (float)FMath::FRandRange(-1.0f, 1.0f); TempTable[x][y][z] = f1; } } } // pad for (int i = 0; i < 17; i++) { for (int j = 0; j < 17; j++) { TempTable[i][j][16] = TempTable[i][j][0]; TempTable[i][16][j] = TempTable[i][0][j]; TempTable[16][j][i] = TempTable[0][j][i]; } } // compute gradients FVector TempTable2[17][17][17]; for (int z = 0; z < 16; z++) { for (int y = 0; y < 16; y++) { for (int x = 0; x < 16; x++) { FVector XGrad = FVector(1.0f, 0.0f, TempTable[x][y][z] - TempTable[x+1][y][z]); FVector YGrad = FVector(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y + 1][z]); FVector ZGrad = FVector(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y][z+1]); FVector Grad = FVector(XGrad.Z, YGrad.Z, ZGrad.Z); TempTable2[x][y][z] = Grad; } } } // pad for (int i = 0; i < 17; i++) { for (int j = 0; j < 17; j++) { TempTable2[i][j][16] = TempTable2[i][j][0]; TempTable2[i][16][j] = TempTable2[i][0][j]; TempTable2[16][j][i] = TempTable2[0][j][i]; } } // compute curl of gradient field for (int z = 0; z < 16; z++) { for (int y = 0; y < 16; y++) { for (int x = 0; x < 16; x++) { FVector Dy = TempTable2[x][y][z] - TempTable2[x][y + 1][z]; FVector Sy = TempTable2[x][y][z] + TempTable2[x][y + 1][z]; FVector Dx = TempTable2[x][y][z] - TempTable2[x + 1][y][z]; FVector Sx = TempTable2[x][y][z] + TempTable2[x + 1][y][z]; FVector Dz = TempTable2[x][y][z] - TempTable2[x][y][z + 1]; FVector Sz = TempTable2[x][y][z] + TempTable2[x][y][z + 1]; FVector Dir = FVector(Dy.Z - Sz.Y, Dz.X - Sx.Z, Dx.Y - Sy.X); FVectorKernelNoise::RandomTable[x][y][z] = MakeVectorRegister(Dir.X, Dir.Y, Dir.Z, 0.0f); } } } Inited = true; } } void VectorVM::Exec( uint8 const* ByteCode, uint8 const* OptimizedByteCode, int32 NumTempRegisters, uint8 const* ConstantTable, TArrayView 
DataSetMetaTable, FVMExternalFunction* ExternalFunctionTable, void** UserPtrTable, int32 NumInstances #if STATS , const TArray& StatScopes #endif ) { //TRACE_CPUPROFILER_EVENT_SCOPE("VMExec"); SCOPE_CYCLE_COUNTER(STAT_VVMExec); const int32 MaxInstances = FMath::Min(GParallelVVMInstancesPerChunk, NumInstances); const int32 NumChunks = (NumInstances / GParallelVVMInstancesPerChunk) + 1; const int32 ChunksPerBatch = (GbParallelVVM != 0 && FApp::ShouldUseThreadingForPerformance()) ? GParallelVVMChunksPerBatch : NumChunks; const int32 NumBatches = FMath::DivideAndRoundUp(NumChunks, ChunksPerBatch); const bool bParallel = NumBatches > 1; const bool bUseOptimizedByteCode = (OptimizedByteCode != nullptr) && GbUseOptimizedVMByteCode; auto ExecChunkBatch = [&](int32 BatchIdx) { //SCOPE_CYCLE_COUNTER(STAT_VVMExecChunk); FVectorVMContext& Context = FVectorVMContext::Get(); Context.PrepareForExec(NumTempRegisters, ConstantTable, ExternalFunctionTable, UserPtrTable, DataSetMetaTable, MaxInstances, bParallel); #if STATS Context.SetStatScopes(&StatScopes); #endif // Process one chunk at a time. int32 ChunkIdx = BatchIdx * ChunksPerBatch; const int32 FirstInstance = ChunkIdx * GParallelVVMInstancesPerChunk; const int32 FinalInstance = FMath::Min(NumInstances, FirstInstance + (ChunksPerBatch * GParallelVVMInstancesPerChunk)); int32 InstancesLeft = FinalInstance - FirstInstance; while (InstancesLeft > 0) { int32 NumInstancesThisChunk = FMath::Min(InstancesLeft, (int32)GParallelVVMInstancesPerChunk); int32 StartInstance = GParallelVVMInstancesPerChunk * ChunkIdx; // Execute optimized byte code version if ( bUseOptimizedByteCode ) { // Setup execution context. Context.PrepareForChunk(OptimizedByteCode, NumInstancesThisChunk, StartInstance); while (true) { FVectorVMExecFunction ExecFunction = reinterpret_cast(Context.DecodePtr()); if (ExecFunction == nullptr) { break; } ExecFunction(Context); } } else { // Setup execution context. 
Context.PrepareForChunk(ByteCode, NumInstancesThisChunk, StartInstance); // Execute VM on all vectors in this chunk. EVectorVMOp Op = EVectorVMOp::done; do { Op = Context.DecodeOp(); switch (Op) { // Dispatch kernel ops. case EVectorVMOp::add: FVectorKernelAdd::Exec(Context); break; case EVectorVMOp::sub: FVectorKernelSub::Exec(Context); break; case EVectorVMOp::mul: FVectorKernelMul::Exec(Context); break; case EVectorVMOp::div: FVectorKernelDivSafe::Exec(Context); break; case EVectorVMOp::mad: FVectorKernelMad::Exec(Context); break; case EVectorVMOp::lerp: FVectorKernelLerp::Exec(Context); break; case EVectorVMOp::rcp: FVectorKernelRcpSafe::Exec(Context); break; case EVectorVMOp::rsq: FVectorKernelRsqSafe::Exec(Context); break; case EVectorVMOp::sqrt: FVectorKernelSqrtSafe::Exec(Context); break; case EVectorVMOp::neg: FVectorKernelNeg::Exec(Context); break; case EVectorVMOp::abs: FVectorKernelAbs::Exec(Context); break; case EVectorVMOp::exp: FVectorKernelExp::Exec(Context); break; case EVectorVMOp::exp2: FVectorKernelExp2::Exec(Context); break; case EVectorVMOp::log: FVectorKernelLogSafe::Exec(Context); break; case EVectorVMOp::log2: FVectorKernelLog2::Exec(Context); break; case EVectorVMOp::sin: FVectorKernelSin::Exec(Context); break; case EVectorVMOp::cos: FVectorKernelCos::Exec(Context); break; case EVectorVMOp::tan: FVectorKernelTan::Exec(Context); break; case EVectorVMOp::asin: FVectorKernelASin::Exec(Context); break; case EVectorVMOp::acos: FVectorKernelACos::Exec(Context); break; case EVectorVMOp::atan: FVectorKernelATan::Exec(Context); break; case EVectorVMOp::atan2: FVectorKernelATan2::Exec(Context); break; case EVectorVMOp::ceil: FVectorKernelCeil::Exec(Context); break; case EVectorVMOp::floor: FVectorKernelFloor::Exec(Context); break; case EVectorVMOp::round: FVectorKernelRound::Exec(Context); break; case EVectorVMOp::fmod: FVectorKernelMod::Exec(Context); break; case EVectorVMOp::frac: FVectorKernelFrac::Exec(Context); break; case EVectorVMOp::trunc: 
FVectorKernelTrunc::Exec(Context); break; case EVectorVMOp::clamp: FVectorKernelClamp::Exec(Context); break; case EVectorVMOp::min: FVectorKernelMin::Exec(Context); break; case EVectorVMOp::max: FVectorKernelMax::Exec(Context); break; case EVectorVMOp::pow: FVectorKernelPowSafe::Exec(Context); break; case EVectorVMOp::sign: FVectorKernelSign::Exec(Context); break; case EVectorVMOp::step: FVectorKernelStep::Exec(Context); break; case EVectorVMOp::random: FVectorKernelRandom::Exec(Context); break; case EVectorVMOp::noise: VectorVMNoise::Noise1D(Context); break; case EVectorVMOp::noise2D: VectorVMNoise::Noise2D(Context); break; case EVectorVMOp::noise3D: VectorVMNoise::Noise3D(Context); break; case EVectorVMOp::cmplt: FVectorKernelCompareLT::Exec(Context); break; case EVectorVMOp::cmple: FVectorKernelCompareLE::Exec(Context); break; case EVectorVMOp::cmpgt: FVectorKernelCompareGT::Exec(Context); break; case EVectorVMOp::cmpge: FVectorKernelCompareGE::Exec(Context); break; case EVectorVMOp::cmpeq: FVectorKernelCompareEQ::Exec(Context); break; case EVectorVMOp::cmpneq: FVectorKernelCompareNEQ::Exec(Context); break; case EVectorVMOp::select: FVectorKernelSelect::Exec(Context); break; case EVectorVMOp::addi: FVectorIntKernelAdd::Exec(Context); break; case EVectorVMOp::subi: FVectorIntKernelSubtract::Exec(Context); break; case EVectorVMOp::muli: FVectorIntKernelMultiply::Exec(Context); break; case EVectorVMOp::divi: FVectorIntKernelDivide::Exec(Context); break; case EVectorVMOp::clampi: FVectorIntKernelClamp::Exec(Context); break; case EVectorVMOp::mini: FVectorIntKernelMin::Exec(Context); break; case EVectorVMOp::maxi: FVectorIntKernelMax::Exec(Context); break; case EVectorVMOp::absi: FVectorIntKernelAbs::Exec(Context); break; case EVectorVMOp::negi: FVectorIntKernelNegate::Exec(Context); break; case EVectorVMOp::signi: FVectorIntKernelSign::Exec(Context); break; case EVectorVMOp::randomi: FScalarIntKernelRandom::Exec(Context); break; case EVectorVMOp::cmplti: 
FVectorIntKernelCompareLT::Exec(Context); break; case EVectorVMOp::cmplei: FVectorIntKernelCompareLE::Exec(Context); break; case EVectorVMOp::cmpgti: FVectorIntKernelCompareGT::Exec(Context); break; case EVectorVMOp::cmpgei: FVectorIntKernelCompareGE::Exec(Context); break; case EVectorVMOp::cmpeqi: FVectorIntKernelCompareEQ::Exec(Context); break; case EVectorVMOp::cmpneqi: FVectorIntKernelCompareNEQ::Exec(Context); break; case EVectorVMOp::bit_and: FVectorIntKernelBitAnd::Exec(Context); break; case EVectorVMOp::bit_or: FVectorIntKernelBitOr::Exec(Context); break; case EVectorVMOp::bit_xor: FVectorIntKernelBitXor::Exec(Context); break; case EVectorVMOp::bit_not: FVectorIntKernelBitNot::Exec(Context); break; case EVectorVMOp::bit_lshift: FVectorIntKernelBitLShift::Exec(Context); break; case EVectorVMOp::bit_rshift: FVectorIntKernelBitRShift::Exec(Context); break; case EVectorVMOp::logic_and: FVectorIntKernelLogicAnd::Exec(Context); break; case EVectorVMOp::logic_or: FVectorIntKernelLogicOr::Exec(Context); break; case EVectorVMOp::logic_xor: FVectorIntKernelLogicXor::Exec(Context); break; case EVectorVMOp::logic_not: FVectorIntKernelLogicNot::Exec(Context); break; case EVectorVMOp::f2i: FVectorKernelFloatToInt::Exec(Context); break; case EVectorVMOp::i2f: FVectorKernelIntToFloat::Exec(Context); break; case EVectorVMOp::f2b: FVectorKernelFloatToBool::Exec(Context); break; case EVectorVMOp::b2f: FVectorKernelBoolToFloat::Exec(Context); break; case EVectorVMOp::i2b: FVectorKernelIntToBool::Exec(Context); break; case EVectorVMOp::b2i: FVectorKernelBoolToInt::Exec(Context); break; case EVectorVMOp::outputdata_32bit: FScalarKernelWriteOutputIndexed::Exec(Context); break; case EVectorVMOp::inputdata_32bit: FVectorKernelReadInput::Exec(Context); break; case EVectorVMOp::inputdata_noadvance_32bit: FVectorKernelReadInputNoAdvance::Exec(Context); break; case EVectorVMOp::acquireindex: FScalarKernelAcquireCounterIndex::Exec(Context); break; case EVectorVMOp::external_func_call: 
FKernelExternalFunctionCall::Exec(Context); break; case EVectorVMOp::exec_index: FVectorKernelExecutionIndex::Exec(Context); break; case EVectorVMOp::enter_stat_scope: FVectorKernelEnterStatScope::Exec(Context); break; case EVectorVMOp::exit_stat_scope: FVectorKernelExitStatScope::Exec(Context); break; //Special case ops to handle unique IDs but this can be written as generalized buffer operations. TODO! case EVectorVMOp::update_id: FScalarKernelUpdateID::Exec(Context); break; case EVectorVMOp::acquire_id: FScalarKernelAcquireID::Exec(Context); break; // Execution always terminates with a "done" opcode. case EVectorVMOp::done: break; // Opcode not recognized / implemented. default: UE_LOG(LogVectorVM, Fatal, TEXT("Unknown op code 0x%02x"), (uint32)Op); return;//BAIL } } while (Op != EVectorVMOp::done); } InstancesLeft -= GParallelVVMInstancesPerChunk; ++ChunkIdx; } Context.FinishExec(); }; if ( NumBatches > 1 ) { ParallelFor(NumBatches, ExecChunkBatch, GbParallelVVM == 0 || !bParallel); } else { ExecChunkBatch(0); } } uint8 VectorVM::GetNumOpCodes() { return (uint8)EVectorVMOp::NumOpcodes; } #if WITH_EDITOR FString VectorVM::GetOpName(EVectorVMOp Op) { check(g_VectorVMEnumStateObj); FString OpStr = g_VectorVMEnumStateObj->GetNameByValue((uint8)Op).ToString(); int32 LastIdx = 0; OpStr.FindLastChar(TEXT(':'),LastIdx); return OpStr.RightChop(LastIdx); } FString VectorVM::GetOperandLocationName(EVectorVMOperandLocation Location) { check(g_VectorVMEnumOperandObj); FString LocStr = g_VectorVMEnumOperandObj->GetNameByValue((uint8)Location).ToString(); int32 LastIdx = 0; LocStr.FindLastChar(TEXT(':'), LastIdx); return LocStr.RightChop(LastIdx); } #endif // local implementation of VectorIntShuffle for neon/directx/ // Optimization managed by GbBatchPackVMOutput via PackedOutputOptimization() // Looks for the common pattern of an acquireindex op followed by a number of associated outputdata_32bit ops. 
The // stock operation is to write an index into a temporary register, and then have the different outputs streams // write into the indexed location. This optimization does a number of things: // -first we check if 'validity' is uniform or not, if it is we can have a fast path of both figuring out how many // indices we need, as well as how to write the output (if we find that they are all invalid, then we don't need to do anything!) // -if we need to evaluate the validity of each element we quickly count up the number (with vector intrinsics) and // grab a block of the indices (rather than one at a time) // -rather than storing the indices to use, we store a int8 mask which indicates a valid flag for each of the next 4 samples // -outputs are then written to depending on their source and their frequency: // -uniform sources will be splatted to all valid entries // -variable sources will be packed into the available slots struct FBatchedWriteIndexedOutput { // functor for copying a source register to an output register struct FCopyOp { void VM_FORCEINLINE operator()(FVectorVMContext& Context, uint16 DataSetIndex) { FRegisterHandler SourceRegister(Context); const uint16 DestRegisterIdx = Context.DecodeU16(); int32* DestReg = Context.GetOutputRegister(DataSetIndex, DestRegisterIdx) + Context.ValidInstanceIndexStart; FMemory::StreamingMemcpy(DestReg, SourceRegister.GetDest(), sizeof(int32) * Context.ValidInstanceCount); } }; // functor for splatting a constant value to an output register template struct FSplatOp { void VM_FORCEINLINE operator()(FVectorVMContext& Context, uint16 DataSetIndex) { InputHandler SourceRegister(Context); const uint16 DestRegisterIdx = Context.DecodeU16(); int32* DestReg = Context.GetOutputRegister(DataSetIndex, DestRegisterIdx) + Context.ValidInstanceIndexStart; const int32 SourceValue = SourceRegister.Get(); const int32 InstanceVectorCount = FMath::DivideAndRoundDown(Context.ValidInstanceCount, VECTOR_WIDTH_FLOATS); if 
(InstanceVectorCount) { const VectorRegisterInt SplatValue = MakeVectorRegisterInt(SourceValue, SourceValue, SourceValue, SourceValue); for (int32 VectorIt = 0; VectorIt < InstanceVectorCount; ++VectorIt) { VectorIntStore(SplatValue, DestReg + VectorIt * VECTOR_WIDTH_FLOATS); } } for (int32 InstanceIt = InstanceVectorCount * VECTOR_WIDTH_FLOATS; InstanceIt < Context.ValidInstanceCount; ++InstanceIt) { DestReg[InstanceIt] = SourceValue; } } }; // performs the operation of copying data from a temporary register to an output register under the assumption // that the validity of each instance is uniform (valid or not). template static VM_FORCEINLINE void DoRegisterKernelFixedValid(FVectorVMContext& Context) { const uint16 DataSetIndex = Context.DecodeU16(); Context.DecodeU16(); // DestIndexRegisterIdx const uint16 AccumulatedOpCount = Context.DecodeU16(); // if none of the instances are valid, then don't bother writing anything if (!Context.ValidInstanceCount) { // todo we should early out of this case rather than keep parsing the code for (uint16 OpIt = 0; OpIt < AccumulatedOpCount; ++OpIt) { FRegisterHandler Dummy(Context); Context.DecodeU16(); // DestRegisterIdx } return; } // for each of our ops, copy the data from the working register to the output const int32 DataSize = sizeof(int32) * Context.ValidInstanceCount; for (uint16 OpIt = 0; OpIt < AccumulatedOpCount; ++OpIt) { PopulateOp()(Context, DataSetIndex); } } // performs the operation of copying data from a temporary register to an output register without foreknowledge // of the validity of individual instances static VM_FORCEINLINE void DoRegisterKernelVariableValid(FVectorVMContext& Context) { // if we found that all of the instances are valid, then just run the fixed version if (Context.ValidInstanceUniform) { DoRegisterKernelFixedValid(Context); return; } const uint16 DataSetIndex = Context.DecodeU16(); const uint16 DestIndexRegisterIdx = Context.DecodeU16(); const uint16 AccumulatedOpCount = 
Context.DecodeU16(); FDataSetMeta& DataSetMeta = Context.GetDataSetMeta(DataSetIndex); const int8* DestIndexReg = reinterpret_cast(Context.GetTempRegister(DestIndexRegisterIdx)); const int32 LoopCount = FMath::DivideAndRoundUp(Context.NumInstances, VECTOR_WIDTH_FLOATS); uint16 OpIt = 0; // // VectorIntStore( - unaligned writes of 16 bytes to our Destination; note that this maneuver requires us to have // our output buffers padded out to 16 bytes! // VectorIntShuffle( - swizzle our source register to pack the valid entries at the beginning, with 0s at the end // Source, - source data // ShuffleMask), - result of the VectorMaskBits done in the acquireindex, int8/VectorRegister of input // Destination); constexpr int32 OpsPerLoop = 4; for (OpIt = 0; (OpIt + OpsPerLoop) < AccumulatedOpCount; OpIt += OpsPerLoop) { const RegisterType* Source0 = FRegisterHandler(Context).GetDest(); int32* DestReg0 = Context.GetOutputRegister(DataSetIndex, Context.DecodeU16()) + Context.ValidInstanceIndexStart; const RegisterType* Source1 = FRegisterHandler(Context).GetDest(); int32* DestReg1 = Context.GetOutputRegister(DataSetIndex, Context.DecodeU16()) + Context.ValidInstanceIndexStart; const RegisterType* Source2 = FRegisterHandler(Context).GetDest(); int32* DestReg2 = Context.GetOutputRegister(DataSetIndex, Context.DecodeU16()) + Context.ValidInstanceIndexStart; const RegisterType* Source3 = FRegisterHandler(Context).GetDest(); int32* DestReg3 = Context.GetOutputRegister(DataSetIndex, Context.DecodeU16()) + Context.ValidInstanceIndexStart; for (int32 LoopIt = 0; LoopIt < LoopCount; ++LoopIt) { const int8 ShuffleMask = DestIndexReg[LoopIt]; const int8 AdvanceCount = FMath::CountBits(ShuffleMask); VectorIntStore(VectorIntShuffle(Source0[LoopIt], VectorVMConstants::RegisterShuffleMask[ShuffleMask]), DestReg0); VectorIntStore(VectorIntShuffle(Source1[LoopIt], VectorVMConstants::RegisterShuffleMask[ShuffleMask]), DestReg1); VectorIntStore(VectorIntShuffle(Source2[LoopIt], 
VectorVMConstants::RegisterShuffleMask[ShuffleMask]), DestReg2); VectorIntStore(VectorIntShuffle(Source3[LoopIt], VectorVMConstants::RegisterShuffleMask[ShuffleMask]), DestReg3); DestReg0 += AdvanceCount; DestReg1 += AdvanceCount; DestReg2 += AdvanceCount; DestReg3 += AdvanceCount; } } for (; OpIt < AccumulatedOpCount; ++OpIt) { const RegisterType* Source = FRegisterHandler(Context).GetDest(); int32* DestReg0 = Context.GetOutputRegister(DataSetIndex, Context.DecodeU16()) + Context.ValidInstanceIndexStart; for (int32 LoopIt = 0; LoopIt < LoopCount; ++LoopIt) { const int8 ShuffleMask = DestIndexReg[LoopIt]; const int8 AdvanceCount = FMath::CountBits(ShuffleMask); VectorIntStore(VectorIntShuffle(Source[LoopIt], VectorVMConstants::RegisterShuffleMask[ShuffleMask]), DestReg0); DestReg0 += AdvanceCount; } } } // acquires a batch of indices from the provided CounterHandler. If we're running in parallel, then we'll need to use // atomics to guarantee our place in the list of indices. template static VM_FORCEINLINE void AcquireCounterIndex(FVectorVMContext& Context, FDataSetCounterHandler& CounterHandler, int32 AcquireCount) { if (AcquireCount) { int32* CounterHandlerIndex = CounterHandler.Get(); int32 StartIndex = INDEX_NONE; if (bParallel) { StartIndex = FPlatformAtomics::InterlockedAdd(CounterHandlerIndex, AcquireCount); } else { StartIndex = *CounterHandlerIndex; *CounterHandlerIndex = StartIndex + AcquireCount; } // increment StartIndex, since CounterHandlerIndex starts at INDEX_NONE Context.ValidInstanceIndexStart = StartIndex + 1; } Context.ValidInstanceCount = AcquireCount; Context.ValidInstanceUniform = !AcquireCount || (Context.NumInstances == AcquireCount); } // evaluates a register to evaluate which instances are valid or not; will read 4 entries at a time and generate a // a mask for which entries are valid as well as an overall count template static void HandleRegisterValidIndices(FVectorVMContext& Context) { FDataSetCounterHandler CounterHandler(Context); 
FRegisterHandler ValidReader(Context); FRegisterHandler Dst(Context); int8* DestAddr = Dst.GetDest(); // we can process VECTOR_WIDTH_FLOATS entries at a time, generating a int8 mask for each set of 4 indicating // which are valid const int32 LoopCount = FMath::DivideAndRoundUp(Context.NumInstances, VECTOR_WIDTH_FLOATS); int32 Remainder = Context.NumInstances; int32 ValidCount = 0; for (int32 LoopIt = 0; LoopIt < LoopCount; ++LoopIt) { // input register needs to be padded to allow for 16 byte reads; but mask out the ones beyond NumInstances const VectorRegister Mask = VectorVMConstants::RemainderMask[FMath::Min(VECTOR_WIDTH_FLOATS, Remainder)]; const int8 ValidMask = static_cast(VectorMaskBits(VectorSelect(Mask, ValidReader.GetAndAdvance(), GlobalVectorConstants::FloatZero))); ValidCount += FMath::CountBits(ValidMask); DestAddr[LoopIt] = ValidMask; Remainder -= VECTOR_WIDTH_FLOATS; } // grab our batch of indices AcquireCounterIndex(Context, CounterHandler, ValidCount); } // evaluates the uniform check and grab the appropriate number of indices template static VM_FORCEINLINE void HandleUniformValidIndices(FVectorVMContext& Context) { FDataSetCounterHandler CounterHandler(Context); ValidReaderType ValidReader(Context); if (ValidReader.Get()) { AcquireCounterIndex(Context, CounterHandler, Context.NumInstances); } } template static VM_FORCEINLINE void IndexExecOptimized(FVectorVMContext& Context) { if (Context.IsParallelExecution()) { switch (SrcOpType) { case SRCOP_RRR: HandleRegisterValidIndices(Context); break; case SRCOP_RRC: HandleUniformValidIndices, true>(Context); break; default: check(0); break; } } else { switch (SrcOpType) { case SRCOP_RRR: HandleRegisterValidIndices(Context); break; case SRCOP_RRC: HandleUniformValidIndices, false>(Context); break; default: check(0); break; } } } void OptimizeAcquireIndex(FVectorVMCodeOptimizerContext& Context) { const uint32 SrcOpType = Context.BaseContext.DecodeSrcOperandTypes(); AcquireIndexConstant = !!(SrcOpType & 
OP0_CONST); switch (SrcOpType) { case SRCOP_RRR: Context.Write(IndexExecOptimized); break; case SRCOP_RRC: Context.Write(IndexExecOptimized); break; default: check(0); break; } DataSetCounterIndex = Context.DecodeU16(); ValidTestRegisterIndex = Context.DecodeU16(); WorkingRegisterIndex = Context.DecodeU16(); Context.Write(DataSetCounterIndex); Context.Write(ValidTestRegisterIndex); // we only need the working register if we've got non-uniform data if (SrcOpType == SRCOP_RRR) { Context.Write(WorkingRegisterIndex); } } bool OptimizeBatch(FVectorVMCodeOptimizerContext& Context) { const int32 BatchedOpCount = BatchedOps.Num(); if (!BatchedOpCount) return false; for (const auto& BatchEntry : BatchedOps) { const uint16 AccumulatedOpCount = BatchEntry.Value.Num(); if (!AccumulatedOpCount) continue; switch (BatchEntry.Key.SrcOpType) { case SRCOP_RRR: if (AcquireIndexConstant) { Context.Write(DoRegisterKernelFixedValid); } else { Context.Write(DoRegisterKernelVariableValid); } break; case SRCOP_RRC: Context.Write(DoRegisterKernelFixedValid>>); break; default: check(0); break; } Context.Write(BatchEntry.Key.DataSetIndex); Context.Write(BatchEntry.Key.DestIndexRegisterIdx); Context.Write(AccumulatedOpCount); for (const FOpValue& OpValue : BatchEntry.Value) { Context.Write(OpValue.SourceRegisterIndex); Context.Write(OpValue.DestRegisterIdx); } } return true; } bool ExtractOp(FVectorVMCodeOptimizerContext& Context) { FOpKey Key; Key.SrcOpType = Context.BaseContext.DecodeSrcOperandTypes(); Key.DataSetIndex = Context.DecodeU16(); Key.DestIndexRegisterIdx = Context.DecodeU16(); if (Key.DestIndexRegisterIdx != WorkingRegisterIndex) { // if we've found an output node that is not related to the acquire index op, then just exit return false; } FOpValue Value; Value.SourceRegisterIndex = Context.DecodeU16(); Value.DestRegisterIdx = Context.DecodeU16(); TArray& ExistingOps = BatchedOps.FindOrAdd(Key); ExistingOps.Add(Value); return true; } private: using RegisterType = 
VectorRegisterInt; using ScalarType = int32; uint16 DataSetCounterIndex = 0; uint16 ValidTestRegisterIndex = 0; uint16 WorkingRegisterIndex = 0; bool AcquireIndexConstant = false; struct FOpKey { uint16 DestIndexRegisterIdx; uint16 DataSetIndex; uint8 SrcOpType; }; struct FOpValue { uint16 SourceRegisterIndex; uint16 DestRegisterIdx; }; struct FOpKeyFuncs : public TDefaultMapKeyFuncs, false> { static VM_FORCEINLINE bool Matches(const FOpKey& A, const FOpKey& B) { return A.DestIndexRegisterIdx == B.DestIndexRegisterIdx && A.DataSetIndex == B.DataSetIndex && A.SrcOpType == B.SrcOpType; } static VM_FORCEINLINE uint32 GetKeyHash(const FOpKey& Key) { return HashCombine( HashCombine(GetTypeHash(Key.DestIndexRegisterIdx), GetTypeHash(Key.DataSetIndex)), GetTypeHash(Key.SrcOpType)); } }; TMap, FDefaultSetAllocator, FOpKeyFuncs> BatchedOps; }; // look for the pattern of acquireindex followed by a bunch of outputs. EVectorVMOp PackedOutputOptimization(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context) { if (!GbBatchPackVMOutput) { return Op; } if (Op == EVectorVMOp::acquireindex) { const auto RollbackState = Context.CreateCodeState(); FBatchedWriteIndexedOutput BatchedOutputOp; BatchedOutputOp.OptimizeAcquireIndex(Context); bool BatchValid = true; Op = Context.BaseContext.DecodeOp(); while (BatchValid && Op == EVectorVMOp::outputdata_32bit) { BatchValid = BatchedOutputOp.ExtractOp(Context); Op = Context.BaseContext.DecodeOp(); } // if there's nothing worth optimizing here, then just revert what we've parsed if (!BatchValid || !BatchedOutputOp.OptimizeBatch(Context)) { Context.RollbackCodeState(RollbackState); return EVectorVMOp::acquireindex; } } return Op; } EVectorVMOp SafeMathOptimization(EVectorVMOp Op, FVectorVMCodeOptimizerContext& Context) { if (!GbSafeOptimizedKernels) { return Op; } switch (Op) { case EVectorVMOp::div: FVectorKernelDivSafe::Optimize(Context); break; case EVectorVMOp::rcp: FVectorKernelRcpSafe::Optimize(Context); break; case EVectorVMOp::rsq: 
FVectorKernelRsqSafe::Optimize(Context); break; case EVectorVMOp::sqrt: FVectorKernelSqrtSafe::Optimize(Context); break; case EVectorVMOp::log: FVectorKernelLogSafe::Optimize(Context); break; case EVectorVMOp::pow: FVectorKernelPowSafe::Optimize(Context); break; default: return Op; } return Context.BaseContext.DecodeOp(); } void VectorVM::OptimizeByteCode(const uint8* ByteCode, TArray& OptimizedCode, TArrayView ExternalFunctionRegisterCounts) { OptimizedCode.Empty(); //-TODO: Support unaligned writes & little endian #if PLATFORM_SUPPORTS_UNALIGNED_LOADS && PLATFORM_LITTLE_ENDIAN if ( !GbOptimizeVMByteCode || (ByteCode == nullptr) ) { return; } FVectorVMCodeOptimizerContext Context(FVectorVMContext::Get(), ByteCode, OptimizedCode, ExternalFunctionRegisterCounts); // add any optimization filters in here, useful so what we can isolate optimizations with CVars FVectorVMCodeOptimizerContext::OptimizeVMFunction VMFilters[] = { PackedOutputOptimization, SafeMathOptimization, }; EVectorVMOp Op = EVectorVMOp::done; do { Op = Context.BaseContext.DecodeOp(); for (auto Filter : VMFilters) Op = Filter(Op, Context); switch (Op) { case EVectorVMOp::add: FVectorKernelAdd::Optimize(Context); break; case EVectorVMOp::sub: FVectorKernelSub::Optimize(Context); break; case EVectorVMOp::mul: FVectorKernelMul::Optimize(Context); break; case EVectorVMOp::div: FVectorKernelDiv::Optimize(Context); break; case EVectorVMOp::mad: FVectorKernelMad::Optimize(Context); break; case EVectorVMOp::lerp: FVectorKernelLerp::Optimize(Context); break; case EVectorVMOp::rcp: FVectorKernelRcp::Optimize(Context); break; case EVectorVMOp::rsq: FVectorKernelRsq::Optimize(Context); break; case EVectorVMOp::sqrt: FVectorKernelSqrt::Optimize(Context); break; case EVectorVMOp::neg: FVectorKernelNeg::Optimize(Context); break; case EVectorVMOp::abs: FVectorKernelAbs::Optimize(Context); break; case EVectorVMOp::exp: FVectorKernelExp::Optimize(Context); break; case EVectorVMOp::exp2: 
FVectorKernelExp2::Optimize(Context); break; case EVectorVMOp::log: FVectorKernelLog::Optimize(Context); break; case EVectorVMOp::log2: FVectorKernelLog2::Optimize(Context); break; case EVectorVMOp::sin: FVectorKernelSin::Optimize(Context); break; case EVectorVMOp::cos: FVectorKernelCos::Optimize(Context); break; case EVectorVMOp::tan: FVectorKernelTan::Optimize(Context); break; case EVectorVMOp::asin: FVectorKernelASin::Optimize(Context); break; case EVectorVMOp::acos: FVectorKernelACos::Optimize(Context); break; case EVectorVMOp::atan: FVectorKernelATan::Optimize(Context); break; case EVectorVMOp::atan2: FVectorKernelATan2::Optimize(Context); break; case EVectorVMOp::ceil: FVectorKernelCeil::Optimize(Context); break; case EVectorVMOp::floor: FVectorKernelFloor::Optimize(Context); break; case EVectorVMOp::round: FVectorKernelRound::Optimize(Context); break; case EVectorVMOp::fmod: FVectorKernelMod::Optimize(Context); break; case EVectorVMOp::frac: FVectorKernelFrac::Optimize(Context); break; case EVectorVMOp::trunc: FVectorKernelTrunc::Optimize(Context); break; case EVectorVMOp::clamp: FVectorKernelClamp::Optimize(Context); break; case EVectorVMOp::min: FVectorKernelMin::Optimize(Context); break; case EVectorVMOp::max: FVectorKernelMax::Optimize(Context); break; case EVectorVMOp::pow: FVectorKernelPow::Optimize(Context); break; case EVectorVMOp::sign: FVectorKernelSign::Optimize(Context); break; case EVectorVMOp::step: FVectorKernelStep::Optimize(Context); break; case EVectorVMOp::random: FVectorKernelRandom::Optimize(Context); break; case EVectorVMOp::noise: VectorVMNoise::Optimize_Noise1D(Context); break; case EVectorVMOp::noise2D: VectorVMNoise::Optimize_Noise2D(Context); break; case EVectorVMOp::noise3D: VectorVMNoise::Optimize_Noise3D(Context); break; case EVectorVMOp::cmplt: FVectorKernelCompareLT::Optimize(Context); break; case EVectorVMOp::cmple: FVectorKernelCompareLE::Optimize(Context); break; case EVectorVMOp::cmpgt: 
FVectorKernelCompareGT::Optimize(Context); break; case EVectorVMOp::cmpge: FVectorKernelCompareGE::Optimize(Context); break; case EVectorVMOp::cmpeq: FVectorKernelCompareEQ::Optimize(Context); break; case EVectorVMOp::cmpneq: FVectorKernelCompareNEQ::Optimize(Context); break; case EVectorVMOp::select: FVectorKernelSelect::Optimize(Context); break; case EVectorVMOp::addi: FVectorIntKernelAdd::Optimize(Context); break; case EVectorVMOp::subi: FVectorIntKernelSubtract::Optimize(Context); break; case EVectorVMOp::muli: FVectorIntKernelMultiply::Optimize(Context); break; case EVectorVMOp::divi: FVectorIntKernelDivide::Optimize(Context); break; case EVectorVMOp::clampi: FVectorIntKernelClamp::Optimize(Context); break; case EVectorVMOp::mini: FVectorIntKernelMin::Optimize(Context); break; case EVectorVMOp::maxi: FVectorIntKernelMax::Optimize(Context); break; case EVectorVMOp::absi: FVectorIntKernelAbs::Optimize(Context); break; case EVectorVMOp::negi: FVectorIntKernelNegate::Optimize(Context); break; case EVectorVMOp::signi: FVectorIntKernelSign::Optimize(Context); break; case EVectorVMOp::randomi: FScalarIntKernelRandom::Optimize(Context); break; case EVectorVMOp::cmplti: FVectorIntKernelCompareLT::Optimize(Context); break; case EVectorVMOp::cmplei: FVectorIntKernelCompareLE::Optimize(Context); break; case EVectorVMOp::cmpgti: FVectorIntKernelCompareGT::Optimize(Context); break; case EVectorVMOp::cmpgei: FVectorIntKernelCompareGE::Optimize(Context); break; case EVectorVMOp::cmpeqi: FVectorIntKernelCompareEQ::Optimize(Context); break; case EVectorVMOp::cmpneqi: FVectorIntKernelCompareNEQ::Optimize(Context); break; case EVectorVMOp::bit_and: FVectorIntKernelBitAnd::Optimize(Context); break; case EVectorVMOp::bit_or: FVectorIntKernelBitOr::Optimize(Context); break; case EVectorVMOp::bit_xor: FVectorIntKernelBitXor::Optimize(Context); break; case EVectorVMOp::bit_not: FVectorIntKernelBitNot::Optimize(Context); break; case EVectorVMOp::bit_lshift: 
FVectorIntKernelBitLShift::Optimize(Context); break; case EVectorVMOp::bit_rshift: FVectorIntKernelBitRShift::Optimize(Context); break; case EVectorVMOp::logic_and: FVectorIntKernelLogicAnd::Optimize(Context); break; case EVectorVMOp::logic_or: FVectorIntKernelLogicOr::Optimize(Context); break; case EVectorVMOp::logic_xor: FVectorIntKernelLogicXor::Optimize(Context); break; case EVectorVMOp::logic_not: FVectorIntKernelLogicNot::Optimize(Context); break; case EVectorVMOp::f2i: FVectorKernelFloatToInt::Optimize(Context); break; case EVectorVMOp::i2f: FVectorKernelIntToFloat::Optimize(Context); break; case EVectorVMOp::f2b: FVectorKernelFloatToBool::Optimize(Context); break; case EVectorVMOp::b2f: FVectorKernelBoolToFloat::Optimize(Context); break; case EVectorVMOp::i2b: FVectorKernelIntToBool::Optimize(Context); break; case EVectorVMOp::b2i: FVectorKernelBoolToInt::Optimize(Context); break; case EVectorVMOp::outputdata_32bit: FScalarKernelWriteOutputIndexed::Optimize(Context); break; case EVectorVMOp::inputdata_32bit: FVectorKernelReadInput::Optimize(Context); break; case EVectorVMOp::inputdata_noadvance_32bit: FVectorKernelReadInputNoAdvance::Optimize(Context); break; case EVectorVMOp::acquireindex: FScalarKernelAcquireCounterIndex::Optimize(Context); break; case EVectorVMOp::external_func_call: FKernelExternalFunctionCall::Optimize(Context); break; case EVectorVMOp::exec_index: FVectorKernelExecutionIndex::Optimize(Context); break; case EVectorVMOp::enter_stat_scope: FVectorKernelEnterStatScope::Optimize(Context); break; case EVectorVMOp::exit_stat_scope: FVectorKernelExitStatScope::Optimize(Context); break; //Special case ops to handle unique IDs but this can be written as generalized buffer operations. TODO! case EVectorVMOp::update_id: FScalarKernelUpdateID::Optimize(Context); break; case EVectorVMOp::acquire_id: FScalarKernelAcquireID::Optimize(Context); break; // Execution always terminates with a "done" opcode. 
case EVectorVMOp::done: break; // Opcode not recognized / implemented. default: UE_LOG(LogVectorVM, Fatal, TEXT("Unknown op code 0x%02x"), (uint32)Op); OptimizedCode.Empty(); return;//BAIL } } while (Op != EVectorVMOp::done); Context.Write(nullptr); #endif //PLATFORM_SUPPORTS_UNALIGNED_LOADS && PLATFORM_LITTLE_ENDIAN } #undef VM_FORCEINLINE