// Copyright 1998-2019 Epic Games, Inc. All Rights Reserved.

#include "VectorVM.h"
#include "Modules/ModuleManager.h"
#include "UObject/Class.h"
#include "UObject/Package.h"
#include "VectorVMPrivate.h"
#include "Stats/Stats.h"
#include "HAL/ConsoleManager.h"
#include "Async/ParallelFor.h"

IMPLEMENT_MODULE(FDefaultModuleImpl, VectorVM);

DECLARE_STATS_GROUP(TEXT("VectorVM"), STATGROUP_VectorVM, STATCAT_Advanced);
DECLARE_CYCLE_STAT(TEXT("VVM Execution"), STAT_VVMExec, STATGROUP_VectorVM);
DECLARE_CYCLE_STAT(TEXT("VVM Chunk"), STAT_VVMExecChunk, STATGROUP_VectorVM);

DEFINE_LOG_CATEGORY_STATIC(LogVectorVM, All, All);

//#define FREE_TABLE_LOCK_CONTENTION_WARNINGS (!UE_BUILD_SHIPPING)
#define FREE_TABLE_LOCK_CONTENTION_WARNINGS (0)

//I don't expect us to ever be waiting long
#define FREE_TABLE_LOCK_CONTENTION_WARN_THRESHOLD_MS (0.01)

//#define VM_FORCEINLINE
#define VM_FORCEINLINE FORCEINLINE

#define OP_REGISTER (0)
#define OP0_CONST (1 << 0)
#define OP1_CONST (1 << 1)
#define OP2_CONST (1 << 2)

#define SRCOP_RRR (OP_REGISTER | OP_REGISTER | OP_REGISTER)
#define SRCOP_RRC (OP_REGISTER | OP_REGISTER | OP0_CONST)
#define SRCOP_RCR (OP_REGISTER | OP1_CONST | OP_REGISTER)
#define SRCOP_RCC (OP_REGISTER | OP1_CONST | OP0_CONST)
#define SRCOP_CRR (OP2_CONST | OP_REGISTER | OP_REGISTER)
#define SRCOP_CRC (OP2_CONST | OP_REGISTER | OP0_CONST)
#define SRCOP_CCR (OP2_CONST | OP1_CONST | OP_REGISTER)
#define SRCOP_CCC (OP2_CONST | OP1_CONST | OP0_CONST)

//Temporarily locking the free table until we can implement a lock free algorithm. UE-65856
FORCEINLINE void FDataSetMeta::LockFreeTable()
{
#if FREE_TABLE_LOCK_CONTENTION_WARNINGS
	uint64 StartCycles = FPlatformTime::Cycles64();
#endif

	FreeTableLock.Lock();

#if FREE_TABLE_LOCK_CONTENTION_WARNINGS
	uint64 EndCycles = FPlatformTime::Cycles64();
	double DurationMs = FPlatformTime::ToMilliseconds64(EndCycles - StartCycles);
	if (DurationMs >= FREE_TABLE_LOCK_CONTENTION_WARN_THRESHOLD_MS)
	{
		UE_LOG(LogVectorVM, Warning, TEXT("VectorVM Stalled in LockFreeTable()! %g ms"), DurationMs);
	}
#endif
}

FORCEINLINE void FDataSetMeta::UnlockFreeTable()
{
	FreeTableLock.Unlock();
}

static int32 GbParallelVVM = 1;
static FAutoConsoleVariableRef CVarbParallelVVM(
	TEXT("vm.Parallel"),
	GbParallelVVM,
	TEXT("If > 0 vector VM chunk level parallelism will be enabled. \n"),
	ECVF_Default
);

static int32 GParallelVVMChunksPerBatch = 4;
static FAutoConsoleVariableRef CVarParallelVVMChunksPerBatch(
	TEXT("vm.ParallelChunksPerBatch"),
	GParallelVVMChunksPerBatch,
	TEXT("Number of chunks to process per task when running in parallel. \n"),
	ECVF_Default
);
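// Editor's illustration (addition, not part of the original source): these CVars can be
// toggled at runtime from the console ("vm.Parallel 0", "vm.ParallelChunksPerBatch 8"),
// or from code along these lines:
//
//   if (IConsoleVariable* CVar = IConsoleManager::Get().FindConsoleVariable(TEXT("vm.Parallel")))
//   {
//       CVar->Set(0); // force single-threaded chunk execution
//   }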
\n"), ECVF_Default ); ////////////////////////////////////////////////////////////////////////// // Constant Handlers struct FConstantHandlerBase { uint16 ConstantIndex; FConstantHandlerBase(FVectorVMContext& Context) : ConstantIndex(VectorVM::DecodeU16(Context)) {} FORCEINLINE void Advance() { } }; template struct FConstantHandler : public FConstantHandlerBase { T Constant; FConstantHandler(FVectorVMContext& Context) : FConstantHandlerBase(Context) , Constant(*((T*)(Context.ConstantTable + ConstantIndex))) {} FORCEINLINE const T& Get() { return Constant; } FORCEINLINE const T& GetAndAdvance() { return Constant; } }; struct FDataSetOffsetHandler : FConstantHandlerBase { uint32 Offset; FDataSetOffsetHandler(FVectorVMContext& Context) : FConstantHandlerBase(Context) , Offset(Context.DataSetOffsetTable[ConstantIndex]) {} FORCEINLINE const uint32 Get() { return Offset; } FORCEINLINE const uint32 GetAndAdvance() { return Offset; } }; template<> struct FConstantHandler : public FConstantHandlerBase { VectorRegister Constant; FConstantHandler(FVectorVMContext& Context) : FConstantHandlerBase(Context) , Constant(VectorLoadFloat1(&Context.ConstantTable[ConstantIndex])) {} FORCEINLINE const VectorRegister Get() { return Constant; } FORCEINLINE const VectorRegister GetAndAdvance() { return Constant; } }; template<> struct FConstantHandler : public FConstantHandlerBase { VectorRegisterInt Constant; FConstantHandler(FVectorVMContext& Context) : FConstantHandlerBase(Context) , Constant(VectorIntLoad1(&Context.ConstantTable[ConstantIndex])) {} FORCEINLINE const VectorRegisterInt Get() { return Constant; } FORCEINLINE const VectorRegisterInt GetAndAdvance() { return Constant; } }; ////////////////////////////////////////////////////////////////////////// // Register handlers. // Handle reading of a register, advancing the pointer with each read. struct FRegisterHandlerBase { int32 RegisterIndex; FORCEINLINE FRegisterHandlerBase(FVectorVMContext& Context) : RegisterIndex(VectorVM::DecodeU16(Context)) {} }; template struct FRegisterHandler : public FRegisterHandlerBase { private: T * RESTRICT Register; public: FORCEINLINE FRegisterHandler(FVectorVMContext& Context) : FRegisterHandlerBase(Context) , Register((T*)Context.RegisterTable[RegisterIndex]) {} FORCEINLINE const T Get() { return *Register; } FORCEINLINE T* GetDest() { return Register; } FORCEINLINE void Advance() { ++Register; } FORCEINLINE const T GetAndAdvance() { return *Register++; } FORCEINLINE T* GetDestAndAdvance() { return Register++; } }; ////////////////////////////////////////////////////////////////////////// FVectorVMContext::FVectorVMContext() : Code(nullptr) , ConstantTable(nullptr) , DataSetIndexTable(nullptr) , DataSetOffsetTable(nullptr) , NumSecondaryDataSets(0) , ExternalFunctionTable(nullptr) , UserPtrTable(nullptr) , NumInstances(0) , StartInstance(0) #if STATS , StatScopes(nullptr) #endif { uint32 TempRegisterSize = Align((VectorVM::InstancesPerChunk) * VectorVM::MaxInstanceSizeBytes, VECTOR_WIDTH_BYTES) + VECTOR_WIDTH_BYTES; TempRegTable.SetNumUninitialized(TempRegisterSize * VectorVM::NumTempRegisters); // Map temporary registers. 
//////////////////////////////////////////////////////////////////////////

FVectorVMContext::FVectorVMContext()
	: Code(nullptr)
	, ConstantTable(nullptr)
	, DataSetIndexTable(nullptr)
	, DataSetOffsetTable(nullptr)
	, NumSecondaryDataSets(0)
	, ExternalFunctionTable(nullptr)
	, UserPtrTable(nullptr)
	, NumInstances(0)
	, StartInstance(0)
#if STATS
	, StatScopes(nullptr)
#endif
{
	uint32 TempRegisterSize = Align((VectorVM::InstancesPerChunk) * VectorVM::MaxInstanceSizeBytes, VECTOR_WIDTH_BYTES) + VECTOR_WIDTH_BYTES;
	TempRegTable.SetNumUninitialized(TempRegisterSize * VectorVM::NumTempRegisters);

	// Map temporary registers.
	for (int32 i = 0; i < VectorVM::NumTempRegisters; ++i)
	{
		RegisterTable[i] = TempRegTable.GetData() + TempRegisterSize * i;
	}

	RandStream.GenerateNewSeed();
}

void FVectorVMContext::PrepareForExec(
	uint8*RESTRICT*RESTRICT InputRegisters,
	uint8*RESTRICT*RESTRICT OutputRegisters,
	int32 NumInputRegisters,
	int32 NumOutputRegisters,
	const uint8* InConstantTable,
	int32* InDataSetIndexTable,
	int32* InDataSetOffsetTable,
	int32 InNumSecondaryDatasets,
	FVMExternalFunction* InExternalFunctionTable,
	void** InUserPtrTable,
	TArray<FDataSetMeta>& RESTRICT InDataSetMetaTable
#if STATS
	, const TArray<TStatId>* InStatScopes
#endif
)
{
	ConstantTable = InConstantTable;
	DataSetIndexTable = InDataSetIndexTable;
	DataSetOffsetTable = InDataSetOffsetTable;
	NumSecondaryDataSets = InNumSecondaryDatasets;
	ExternalFunctionTable = InExternalFunctionTable;
	UserPtrTable = InUserPtrTable;
#if STATS
	check(InStatScopes);
	StatScopes = InStatScopes;
	StatCounterStack.Reserve(StatScopes->Num());
#endif

	//Map IO Registers
	for (int32 i = 0; i < NumInputRegisters; ++i)
	{
		RegisterTable[VectorVM::NumTempRegisters + i] = InputRegisters[i];
	}
	for (int32 i = 0; i < NumOutputRegisters; ++i)
	{
		RegisterTable[VectorVM::NumTempRegisters + VectorVM::MaxInputRegisters + i] = OutputRegisters[i];
	}

	DataSetMetaTable = &InDataSetMetaTable;

	ThreadLocalTempData.Reset(DataSetMetaTable->Num());
	ThreadLocalTempData.SetNum(DataSetMetaTable->Num());
}

void FVectorVMContext::FinishExec()
{
	//At the end of executing each chunk we can push any thread local temporary data out to the main storage with locks or atomics.

	TArray<FDataSetMeta>& MetaTable = *DataSetMetaTable;
	check(ThreadLocalTempData.Num() == MetaTable.Num());
	for (int32 DataSetIndex = 0; DataSetIndex < MetaTable.Num(); ++DataSetIndex)
	{
		FDataSetThreadLocalTempData&RESTRICT Data = ThreadLocalTempData[DataSetIndex];

		if (Data.IDsToFree.Num() > 0)
		{
			TArray<int32>&RESTRICT FreeIDTable = *MetaTable[DataSetIndex].FreeIDTable;
			int32&RESTRICT NumFreeIDs = *MetaTable[DataSetIndex].NumFreeIDs;
			check(FreeIDTable.Num() >= NumFreeIDs + Data.IDsToFree.Num());

			//Temporarily locking the free table until we can implement something lock-free
			MetaTable[DataSetIndex].LockFreeTable();
			for (int32 IDToFree : Data.IDsToFree)
			{
				//UE_LOG(LogVectorVM, Warning, TEXT("AddFreeID: ID:%d | FreeTableIdx:%d."), IDToFree, NumFreeIDs);
				FreeIDTable[NumFreeIDs++] = IDToFree;
			}
			//Unlock the free table.
			MetaTable[DataSetIndex].UnlockFreeTable();

			Data.IDsToFree.Reset();
		}

		//Also update the max ID seen. This should be the ONLY place in the VM we update this max value.
		//The CAS loop below already performs a thread-safe max; a plain store afterwards would
		//reintroduce the race it exists to avoid, so the update is done only via the exchange.
		volatile int32* MaxUsedID = MetaTable[DataSetIndex].MaxUsedID;
		int32 LocalMaxUsedID;
		do
		{
			LocalMaxUsedID = *MaxUsedID;
			if (LocalMaxUsedID >= Data.MaxID)
			{
				break;
			}
		} while (FPlatformAtomics::InterlockedCompareExchange(MaxUsedID, Data.MaxID, LocalMaxUsedID) != LocalMaxUsedID);
	}
}

//////////////////////////////////////////////////////////////////////////

uint8 VectorVM::CreateSrcOperandMask(EVectorVMOperandLocation Type0, EVectorVMOperandLocation Type1, EVectorVMOperandLocation Type2)
{
	return	(Type0 == EVectorVMOperandLocation::Constant ? OP0_CONST : OP_REGISTER) |
			(Type1 == EVectorVMOperandLocation::Constant ? OP1_CONST : OP_REGISTER) |
			(Type2 == EVectorVMOperandLocation::Constant ? OP2_CONST : OP_REGISTER);
}
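// Editor's illustration (addition): with OP0_CONST == 1, OP1_CONST == 2 and OP2_CONST == 4,
// an op whose first operand is a constant and whose others are registers dispatches as
// SRCOP_RRC. These compile-time checks document the bit layout:
static_assert(SRCOP_RRC == 1, "operand 0 constant, operands 1/2 in registers");
static_assert(SRCOP_CCC == (OP0_CONST | OP1_CONST | OP2_CONST), "all operands constant");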
//////////////////////////////////////////////////////////////////////////
// Kernels

template<typename Kernel, typename DstHandler, typename Arg0Handler, int32 NumInstancesPerOp>
struct TUnaryKernelHandler
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		Arg0Handler Arg0(Context);
		DstHandler Dst(Context);

		int32 LoopInstances = Align(Context.NumInstances, NumInstancesPerOp) / NumInstancesPerOp;
		for (int32 i = 0; i < LoopInstances; ++i)
		{
			Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance());
		}
	}
};

template<typename Kernel, typename DstHandler, typename Arg0Handler, typename Arg1Handler, int32 NumInstancesPerOp>
struct TBinaryKernelHandler
{
	static void Exec(FVectorVMContext& Context)
	{
		Arg0Handler Arg0(Context);
		Arg1Handler Arg1(Context);
		DstHandler Dst(Context);

		int32 LoopInstances = Align(Context.NumInstances, NumInstancesPerOp) / NumInstancesPerOp;
		for (int32 i = 0; i < LoopInstances; ++i)
		{
			Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance(), Arg1.GetAndAdvance());
		}
	}
};

template<typename Kernel, typename DstHandler, typename Arg0Handler, typename Arg1Handler, typename Arg2Handler, int32 NumInstancesPerOp>
struct TTrinaryKernelHandler
{
	static void Exec(FVectorVMContext& Context)
	{
		Arg0Handler Arg0(Context);
		Arg1Handler Arg1(Context);
		Arg2Handler Arg2(Context);
		DstHandler Dst(Context);

		int32 LoopInstances = Align(Context.NumInstances, NumInstancesPerOp) / NumInstancesPerOp;
		for (int32 i = 0; i < LoopInstances; ++i)
		{
			Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance(), Arg1.GetAndAdvance(), Arg2.GetAndAdvance());
		}
	}
};

template<typename Kernel, typename DstHandler, typename Arg0Handler, typename Arg1Handler, typename Arg2Handler, int32 NumInstancesPerOp>
struct TTrinaryOutputKernelHandler
{
	static void Exec(FVectorVMContext& Context)
	{
		Arg0Handler Arg0(Context);
		Arg1Handler Arg1(Context);
		Arg2Handler Arg2(Context);
		DstHandler Dst(Context, Arg0.Get());

		int32 LoopInstances = Align(Context.NumInstances, NumInstancesPerOp) / NumInstancesPerOp;
		for (int32 i = 0; i < LoopInstances; ++i)
		{
			Kernel::DoKernel(Context, Dst.GetDestAndAdvance(), Arg0.GetAndAdvance(), Arg1.GetAndAdvance(), Arg2.GetAndAdvance());
		}
	}
};

/** Base class of vector kernels with a single operand. */
template<typename Kernel, typename DstHandler, typename ConstHandler, typename RegisterHandler, int32 NumInstancesPerOp>
struct TUnaryKernel
{
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = VectorVM::DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TUnaryKernelHandler<Kernel, DstHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_RRC: TUnaryKernelHandler<Kernel, DstHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
		default: check(0); break;
		};
	}
};
template<typename Kernel>
struct TUnaryScalarKernel : public TUnaryKernel<Kernel, FRegisterHandler<float>, FConstantHandler<float>, FRegisterHandler<float>, 1> {};
template<typename Kernel>
struct TUnaryVectorKernel : public TUnaryKernel<Kernel, FRegisterHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS> {};
template<typename Kernel>
struct TUnaryScalarIntKernel : public TUnaryKernel<Kernel, FRegisterHandler<int32>, FConstantHandler<int32>, FRegisterHandler<int32>, 1> {};
template<typename Kernel>
struct TUnaryVectorIntKernel : public TUnaryKernel<Kernel, FRegisterHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS> {};

/** Base class of Vector kernels with 2 operands. */
template<typename Kernel, typename DstHandler, typename ConstHandler, typename RegisterHandler, int32 NumInstancesPerOp>
struct TBinaryKernel
{
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = VectorVM::DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TBinaryKernelHandler<Kernel, DstHandler, RegisterHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_RRC: TBinaryKernelHandler<Kernel, DstHandler, ConstHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_RCR: TBinaryKernelHandler<Kernel, DstHandler, RegisterHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_RCC: TBinaryKernelHandler<Kernel, DstHandler, ConstHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
		default: check(0); break;
		};
	}
};
template<typename Kernel>
struct TBinaryScalarKernel : public TBinaryKernel<Kernel, FRegisterHandler<float>, FConstantHandler<float>, FRegisterHandler<float>, 1> {};
template<typename Kernel>
struct TBinaryVectorKernel : public TBinaryKernel<Kernel, FRegisterHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS> {};
template<typename Kernel>
struct TBinaryVectorIntKernel : public TBinaryKernel<Kernel, FRegisterHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS> {};
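// Editor's note (illustrative): for a concrete op the dispatch above expands at compile
// time. E.g. "add" with operand 0 taken from the constant table (SRCOP_RRC) runs
//
//   TBinaryKernelHandler<FVectorKernelAdd,
//                        FRegisterHandler<VectorRegister>,   // dst
//                        FConstantHandler<VectorRegister>,   // operand 0 (constant)
//                        FRegisterHandler<VectorRegister>,   // operand 1 (register)
//                        VECTOR_WIDTH_FLOATS>::Exec(Context);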
/** Base class of Vector kernels with 3 operands. */
template<typename Kernel, typename DstHandler, typename ConstHandler, typename RegisterHandler, int32 NumInstancesPerOp>
struct TTrinaryKernel
{
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = VectorVM::DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TTrinaryKernelHandler<Kernel, DstHandler, RegisterHandler, RegisterHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_RRC: TTrinaryKernelHandler<Kernel, DstHandler, ConstHandler, RegisterHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_RCR: TTrinaryKernelHandler<Kernel, DstHandler, RegisterHandler, ConstHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_RCC: TTrinaryKernelHandler<Kernel, DstHandler, ConstHandler, ConstHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_CRR: TTrinaryKernelHandler<Kernel, DstHandler, RegisterHandler, RegisterHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_CRC: TTrinaryKernelHandler<Kernel, DstHandler, ConstHandler, RegisterHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_CCR: TTrinaryKernelHandler<Kernel, DstHandler, RegisterHandler, ConstHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
		case SRCOP_CCC: TTrinaryKernelHandler<Kernel, DstHandler, ConstHandler, ConstHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
		default: check(0); break;
		};
	}
};
template<typename Kernel>
struct TTrinaryScalarKernel : public TTrinaryKernel<Kernel, FRegisterHandler<float>, FConstantHandler<float>, FRegisterHandler<float>, 1> {};
template<typename Kernel>
struct TTrinaryVectorKernel : public TTrinaryKernel<Kernel, FRegisterHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS> {};
template<typename Kernel>
struct TTrinaryVectorIntKernel : public TTrinaryKernel<Kernel, FRegisterHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS> {};

/*------------------------------------------------------------------------------
	Implementation of all kernel operations.
------------------------------------------------------------------------------*/

struct FVectorKernelAdd : public TBinaryVectorKernel<FVectorKernelAdd>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorAdd(Src0, Src1);
	}
};

struct FVectorKernelSub : public TBinaryVectorKernel<FVectorKernelSub>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorSubtract(Src0, Src1);
	}
};

struct FVectorKernelMul : public TBinaryVectorKernel<FVectorKernelMul>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorMultiply(Src0, Src1);
	}
};

struct FVectorKernelDiv : public TBinaryVectorKernel<FVectorKernelDiv>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorDivide(Src0, Src1);
	}
};

struct FVectorKernelMad : public TTrinaryVectorKernel<FVectorKernelMad>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1, VectorRegister Src2)
	{
		*Dst = VectorMultiplyAdd(Src0, Src1, Src2);
	}
};

struct FVectorKernelLerp : public TTrinaryVectorKernel<FVectorKernelLerp>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1, VectorRegister Src2)
	{
		const VectorRegister OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Src2);
		const VectorRegister Tmp = VectorMultiply(Src0, OneMinusAlpha);
		*Dst = VectorMultiplyAdd(Src1, Src2, Tmp);
	}
};
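// Editor's worked example: Lerp evaluates Dst = Src1*Src2 + Src0*(1 - Src2), i.e. the
// usual lerp(a, b, t); with Src0 = 0, Src1 = 10 and Src2 = 0.25 every lane yields 2.5.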
struct FVectorKernelRcp : public TUnaryVectorKernel<FVectorKernelRcp>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorReciprocal(Src0);
	}
};

struct FVectorKernelRsq : public TUnaryVectorKernel<FVectorKernelRsq>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorReciprocalSqrt(Src0);
	}
};

struct FVectorKernelSqrt : public TUnaryVectorKernel<FVectorKernelSqrt>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		// TODO: Need a SIMD sqrt!
		*Dst = VectorReciprocal(VectorReciprocalSqrt(Src0));
	}
};

struct FVectorKernelNeg : public TUnaryVectorKernel<FVectorKernelNeg>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorNegate(Src0);
	}
};

struct FVectorKernelAbs : public TUnaryVectorKernel<FVectorKernelAbs>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorAbs(Src0);
	}
};

struct FVectorKernelExp : public TUnaryVectorKernel<FVectorKernelExp>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorExp(Src0);
	}
};

struct FVectorKernelExp2 : public TUnaryVectorKernel<FVectorKernelExp2>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorExp2(Src0);
	}
};

struct FVectorKernelLog : public TUnaryVectorKernel<FVectorKernelLog>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorLog(Src0);
	}
};

struct FVectorKernelLog2 : public TUnaryVectorKernel<FVectorKernelLog2>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorLog2(Src0);
	}
};

struct FVectorKernelClamp : public TTrinaryVectorKernel<FVectorKernelClamp>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1, VectorRegister Src2)
	{
		const VectorRegister Tmp = VectorMax(Src0, Src1);
		*Dst = VectorMin(Tmp, Src2);
	}
};

struct FVectorKernelSin : public TUnaryVectorKernel<FVectorKernelSin>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorSin(Src0);
	}
};

struct FVectorKernelCos : public TUnaryVectorKernel<FVectorKernelCos>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorCos(Src0);
	}
};

struct FVectorKernelTan : public TUnaryVectorKernel<FVectorKernelTan>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorTan(Src0);
	}
};

struct FVectorKernelASin : public TUnaryVectorKernel<FVectorKernelASin>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorASin(Src0);
	}
};

struct FVectorKernelACos : public TUnaryVectorKernel<FVectorKernelACos>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorACos(Src0);
	}
};

struct FVectorKernelATan : public TUnaryVectorKernel<FVectorKernelATan>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorATan(Src0);
	}
};

struct FVectorKernelATan2 : public TBinaryVectorKernel<FVectorKernelATan2>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorATan2(Src0, Src1);
	}
};

struct FVectorKernelCeil : public TUnaryVectorKernel<FVectorKernelCeil>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorCeil(Src0);
	}
};

struct FVectorKernelFloor : public TUnaryVectorKernel<FVectorKernelFloor>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorFloor(Src0);
	}
};
struct FVectorKernelRound : public TUnaryVectorKernel<FVectorKernelRound>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		//TODO: >SSE4 has direct ops for this.
		VectorRegister Trunc = VectorTruncate(Src0);
		*Dst = VectorAdd(Trunc, VectorTruncate(VectorMultiply(VectorSubtract(Src0, Trunc), GlobalVectorConstants::FloatAlmostTwo)));
	}
};

struct FVectorKernelMod : public TBinaryVectorKernel<FVectorKernelMod>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorMod(Src0, Src1);
	}
};

struct FVectorKernelFrac : public TUnaryVectorKernel<FVectorKernelFrac>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorFractional(Src0);
	}
};

struct FVectorKernelTrunc : public TUnaryVectorKernel<FVectorKernelTrunc>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorTruncate(Src0);
	}
};

struct FVectorKernelCompareLT : public TBinaryVectorKernel<FVectorKernelCompareLT>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorCompareLT(Src0, Src1);
	}
};

struct FVectorKernelCompareLE : public TBinaryVectorKernel<FVectorKernelCompareLE>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorCompareLE(Src0, Src1);
	}
};

struct FVectorKernelCompareGT : public TBinaryVectorKernel<FVectorKernelCompareGT>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorCompareGT(Src0, Src1);
	}
};

struct FVectorKernelCompareGE : public TBinaryVectorKernel<FVectorKernelCompareGE>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorCompareGE(Src0, Src1);
	}
};

struct FVectorKernelCompareEQ : public TBinaryVectorKernel<FVectorKernelCompareEQ>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorCompareEQ(Src0, Src1);
	}
};

struct FVectorKernelCompareNEQ : public TBinaryVectorKernel<FVectorKernelCompareNEQ>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorCompareNE(Src0, Src1);
	}
};

struct FVectorKernelSelect : public TTrinaryVectorKernel<FVectorKernelSelect>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Mask, VectorRegister A, VectorRegister B)
	{
		*Dst = VectorSelect(Mask, A, B);
	}
};

struct FVectorKernelExecutionIndex
{
	static void VM_FORCEINLINE Exec(FVectorVMContext& Context)
	{
		static_assert(VECTOR_WIDTH_FLOATS == 4, "Need to update this when upgrading the VM to support >SSE2");
		VectorRegisterInt VectorStride = MakeVectorRegisterInt(VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS);
		VectorRegisterInt Index = MakeVectorRegisterInt(Context.StartInstance, Context.StartInstance + 1, Context.StartInstance + 2, Context.StartInstance + 3);

		FRegisterHandler<VectorRegisterInt> Dest(Context);
		int32 Loops = Align(Context.NumInstances, VECTOR_WIDTH_FLOATS) / VECTOR_WIDTH_FLOATS;
		for (int32 i = 0; i < Loops; ++i)
		{
			*Dest.GetDestAndAdvance() = Index;
			Index = VectorIntAdd(Index, VectorStride);
		}
	}
};
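// Editor's worked example: with StartInstance == 8 and NumInstances == 6, exec_index
// writes {8,9,10,11} then {12,13,14,15}; the final two lanes land in the register's
// padded tail and are never read back for instances beyond NumInstances.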
struct FVectorKernelEnterStatScope
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		FConstantHandler<int32> ScopeIdx(Context);
#if STATS
		//int32 CounterIdx = Context.StatCounterStack.AddDefaulted(1);
		//Context.StatCounterStack[CounterIdx].Start((*Context.StatScopes)[ScopeIdx.Get()]);
#endif
	}
};

struct FVectorKernelExitStatScope
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
#if STATS
		//Context.StatCounterStack.Last().Stop();
		//Context.StatCounterStack.Pop(false);
#endif
	}
};

struct FVectorKernelRandom : public TUnaryVectorKernel<FVectorKernelRandom>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		const float rm = RAND_MAX;
		//EEK!. Improve this. Implement GPU style seeded rand instead of this.
		VectorRegister Result = MakeVectorRegister(
			Context.RandStream.GetFraction(),
			Context.RandStream.GetFraction(),
			Context.RandStream.GetFraction(),
			Context.RandStream.GetFraction());
		*Dst = VectorMultiply(Result, Src0);
	}
};

/* gaussian distribution random number (not working yet) */
struct FVectorKernelRandomGauss : public TBinaryVectorKernel<FVectorKernelRandomGauss>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		const float rm = RAND_MAX;
		VectorRegister Result = MakeVectorRegister(
			Context.RandStream.GetFraction(),
			Context.RandStream.GetFraction(),
			Context.RandStream.GetFraction(),
			Context.RandStream.GetFraction());

		Result = VectorSubtract(Result, MakeVectorRegister(0.5f, 0.5f, 0.5f, 0.5f));
		Result = VectorMultiply(MakeVectorRegister(3.0f, 3.0f, 3.0f, 3.0f), Result);

		// taylor series gaussian approximation
		const VectorRegister SPi2 = VectorReciprocal(VectorReciprocalSqrt(MakeVectorRegister(2 * PI, 2 * PI, 2 * PI, 2 * PI)));
		VectorRegister Gauss = VectorReciprocal(SPi2);
		VectorRegister Div = VectorMultiply(MakeVectorRegister(2.0f, 2.0f, 2.0f, 2.0f), SPi2);
		Gauss = VectorSubtract(Gauss, VectorDivide(VectorMultiply(Result, Result), Div));
		Div = VectorMultiply(MakeVectorRegister(8.0f, 8.0f, 8.0f, 8.0f), SPi2);
		Gauss = VectorAdd(Gauss, VectorDivide(VectorPow(MakeVectorRegister(4.0f, 4.0f, 4.0f, 4.0f), Result), Div));
		Div = VectorMultiply(MakeVectorRegister(48.0f, 48.0f, 48.0f, 48.0f), SPi2);
		Gauss = VectorSubtract(Gauss, VectorDivide(VectorPow(MakeVectorRegister(6.0f, 6.0f, 6.0f, 6.0f), Result), Div));

		Gauss = VectorDivide(Gauss, MakeVectorRegister(0.4f, 0.4f, 0.4f, 0.4f));
		Gauss = VectorMultiply(Gauss, Src0);
		*Dst = Gauss;
	}
};

struct FVectorKernelMin : public TBinaryVectorKernel<FVectorKernelMin>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorMin(Src0, Src1);
	}
};

struct FVectorKernelMax : public TBinaryVectorKernel<FVectorKernelMax>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorMax(Src0, Src1);
	}
};

struct FVectorKernelPow : public TBinaryVectorKernel<FVectorKernelPow>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorPow(Src0, Src1);
	}
};

struct FVectorKernelSign : public TUnaryVectorKernel<FVectorKernelSign>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorSign(Src0);
	}
};

struct FVectorKernelStep : public TUnaryVectorKernel<FVectorKernelStep>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorStep(Src0);
	}
};

namespace VectorVMNoise
{
	int32 P[512] =
	{
		151,160,137,91,90,15,
		131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
		190,
		6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
		88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
		77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
		102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
		135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
		5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
		223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9,
		129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
		251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
		49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,
		138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180,
		151,160,137,91,90,15,
		131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
		190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
		88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
		77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
		102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
		135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
		5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
		223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9,
		129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
		251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
		49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,
		138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180
	};

	static FORCEINLINE float Lerp(float X, float A, float B)
	{
		return A + X * (B - A);
	}

	static FORCEINLINE float Fade(float X)
	{
		return X * X * X * (X * (X * 6 - 15) + 10);
	}

	static FORCEINLINE float Grad(int32 hash, float x, float y, float z)
	{
		hash &= 15;
		float u = (hash < 8) ? x : y;
		float v = (hash < 4) ? y : ((hash == 12 || hash == 14) ? x : z);
		return ((hash & 1) == 0 ? u : -u) + ((hash & 2) == 0 ? v : -v);
	}
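	// Editor's note: Fade is Perlin's quintic 6t^5 - 15t^4 + 10t^3; e.g. Fade(0.5f) == 0.5f,
	// and its first and second derivatives vanish at t = 0 and t = 1, which is what removes
	// grid artefacts from the trilinear blend in the kernel below.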
	struct FScalarKernelNoise3D_iNoise : TTrinaryScalarKernel<FScalarKernelNoise3D_iNoise>
	{
		static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X, float Y, float Z)
		{
			float Xfl = FMath::FloorToFloat(X);
			float Yfl = FMath::FloorToFloat(Y);
			float Zfl = FMath::FloorToFloat(Z);
			int32 Xi = (int32)(Xfl) & 255;
			int32 Yi = (int32)(Yfl) & 255;
			int32 Zi = (int32)(Zfl) & 255;
			X -= Xfl;
			Y -= Yfl;
			Z -= Zfl;
			float Xm1 = X - 1.0f;
			float Ym1 = Y - 1.0f;
			float Zm1 = Z - 1.0f;

			int32 A = P[Xi] + Yi;
			int32 AA = P[A] + Zi;
			int32 AB = P[A + 1] + Zi;

			int32 B = P[Xi + 1] + Yi;
			int32 BA = P[B] + Zi;
			int32 BB = P[B + 1] + Zi;

			float U = Fade(X);
			float V = Fade(Y);
			float W = Fade(Z);

			*Dst =
				Lerp(W,
					Lerp(V,
						Lerp(U, Grad(P[AA], X, Y, Z), Grad(P[BA], Xm1, Y, Z)),
						Lerp(U, Grad(P[AB], X, Ym1, Z), Grad(P[BB], Xm1, Ym1, Z))),
					Lerp(V,
						Lerp(U, Grad(P[AA + 1], X, Y, Zm1), Grad(P[BA + 1], Xm1, Y, Zm1)),
						Lerp(U, Grad(P[AB + 1], X, Ym1, Zm1), Grad(P[BB + 1], Xm1, Ym1, Zm1))));
		}
	};

	struct FScalarKernelNoise2D_iNoise : TBinaryScalarKernel<FScalarKernelNoise2D_iNoise>
	{
		static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X, float Y)
		{
			*Dst = 0.0f;//TODO
		}
	};

	struct FScalarKernelNoise1D_iNoise : TUnaryScalarKernel<FScalarKernelNoise1D_iNoise>
	{
		static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, float* RESTRICT Dst, float X)
		{
			*Dst = 0.0f;//TODO;
		}
	};

	static void Noise1D(FVectorVMContext& Context) { FScalarKernelNoise1D_iNoise::Exec(Context); }
	static void Noise2D(FVectorVMContext& Context) { FScalarKernelNoise2D_iNoise::Exec(Context); }
	static void Noise3D(FVectorVMContext& Context)
	{
		//Basic scalar implementation of perlin's improved noise until I can spend some quality time exploring vectorized implementations of Marc O's noise from Random.ush.
		//http://mrl.nyu.edu/~perlin/noise/
		FScalarKernelNoise3D_iNoise::Exec(Context);
	}
};

//Olaf's original curl noise. Needs updating for the new scalar VM and possibly calling it Curl Noise to avoid confusion with regular noise?
//Possibly needs to be a data interface as the VM can't output Vectors?
struct FVectorKernelNoise : public TUnaryVectorKernel<FVectorKernelNoise>
{
	static VectorRegister RandomTable[17][17][17];

	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		const VectorRegister One = MakeVectorRegister(1.0f, 1.0f, 1.0f, 1.0f);
		const VectorRegister VecSize = MakeVectorRegister(16.0f, 16.0f, 16.0f, 16.0f);

		*Dst = MakeVectorRegister(0.0f, 0.0f, 0.0f, 0.0f);

		for (uint32 i = 1; i < 2; i++)
		{
			float Di = 0.2f * (1.0f / (1 << i));
			VectorRegister Div = MakeVectorRegister(Di, Di, Di, Di);
			VectorRegister Coords = VectorMod(VectorAbs(VectorMultiply(Src0, Div)), VecSize);
			const float* CoordPtr = reinterpret_cast<float const*>(&Coords);
			const int32 Cx = CoordPtr[0];
			const int32 Cy = CoordPtr[1];
			const int32 Cz = CoordPtr[2];

			VectorRegister Frac = VectorFractional(Coords);
			VectorRegister Alpha = VectorReplicate(Frac, 0);
			VectorRegister OneMinusAlpha = VectorSubtract(One, Alpha);

			VectorRegister XV1 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz], OneMinusAlpha));
			VectorRegister XV2 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz], OneMinusAlpha));
			VectorRegister XV3 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz+1], OneMinusAlpha));
			VectorRegister XV4 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz+1], OneMinusAlpha));

			Alpha = VectorReplicate(Frac, 1);
			OneMinusAlpha = VectorSubtract(One, Alpha);
			VectorRegister YV1 = VectorMultiplyAdd(XV1, Alpha, VectorMultiply(XV2, OneMinusAlpha));
			VectorRegister YV2 = VectorMultiplyAdd(XV3, Alpha, VectorMultiply(XV4, OneMinusAlpha));

			Alpha = VectorReplicate(Frac, 2);
			OneMinusAlpha = VectorSubtract(One, Alpha);
			VectorRegister ZV = VectorMultiplyAdd(YV1, Alpha, VectorMultiply(YV2, OneMinusAlpha));

			*Dst = VectorAdd(*Dst, ZV);
		}
	}
};

VectorRegister FVectorKernelNoise::RandomTable[17][17][17];

//////////////////////////////////////////////////////////////////////////
//Special Kernels.

/** Special kernel for acquiring a new ID. TODO. Can be written as general RWBuffer ops when we support that. */
struct FScalarKernelAcquireID
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		int32 DataSetIndex = VectorVM::DecodeU16(Context);
		TArray<FDataSetMeta>& MetaTable = *Context.DataSetMetaTable;
		TArray<int32>&RESTRICT FreeIDTable = *MetaTable[DataSetIndex].FreeIDTable;
		int32 Tag = MetaTable[DataSetIndex].IDAcquireTag;

		int32 IDIndexReg = VectorVM::DecodeU16(Context);
		int32*RESTRICT IDIndex = (int32*)(Context.RegisterTable[IDIndexReg]);

		int32 IDTagReg = VectorVM::DecodeU16(Context);
		int32*RESTRICT IDTag = (int32*)(Context.RegisterTable[IDTagReg]);

		int32& NumFreeIDs = *MetaTable[DataSetIndex].NumFreeIDs;

		//Temporarily using a lock to ensure thread safety for accessing the FreeIDTable until a lock free solution can be implemented.
		MetaTable[DataSetIndex].LockFreeTable();

		check(FreeIDTable.Num() >= Context.NumInstances);
		check(NumFreeIDs >= Context.NumInstances);

		for (int32 i = 0; i < Context.NumInstances; ++i)
		{
			int32 FreeIDTableIndex = --NumFreeIDs;

			//Grab the value from the FreeIDTable.
			int32 AcquiredID = FreeIDTable[FreeIDTableIndex];
			checkSlow(AcquiredID != INDEX_NONE);

			//UE_LOG(LogVectorVM, Warning, TEXT("AcquireID: ID:%d | FreeTableIdx:%d."), AcquiredID, FreeIDTableIndex);

			//Mark this entry in the FreeIDTable as invalid.
			FreeIDTable[FreeIDTableIndex] = INDEX_NONE;

			*IDIndex = AcquiredID;
			*IDTag = Tag;
			++IDIndex;
			++IDTag;
		}

		MetaTable[DataSetIndex].UnlockFreeTable();
	}
};
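// Editor's sketch of the ID lifecycle implemented by the kernels here (an assumption,
// condensed from the code): acquire_id pops a slot from FreeIDTable under the lock,
// update_id later writes the instance's final index into IDTable (or queues the ID for
// freeing when the instance died), and FinishExec pushes the queued frees back once per
// chunk. Per instance, roughly:
//
//   ID = FreeIDTable[--NumFreeIDs];          // acquire_id
//   IDTable[ID] = InstanceOffset + Index;    // update_id (instance alive)
//   IDsToFree.Add(ID);                       // update_id (instance dead) -> FinishExec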
/** Special kernel for updating a new ID. TODO. Can be written as general RWBuffer ops when we support that. */
struct FScalarKernelUpdateID
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		int32 DataSetIndex = VectorVM::DecodeU16(Context);
		int32 InstanceIDRegisterIndex = VectorVM::DecodeU16(Context);
		int32 InstanceIndexRegisterIndex = VectorVM::DecodeU16(Context);

		TArray<FDataSetMeta>& MetaTable = *Context.DataSetMetaTable;

		TArray<int32>&RESTRICT IDTable = *MetaTable[DataSetIndex].IDTable;
		int32 InstanceOffset = MetaTable[DataSetIndex].InstanceOffset + Context.StartInstance;

		int32*RESTRICT IDRegister = (int32*)(Context.RegisterTable[InstanceIDRegisterIndex]);
		int32*RESTRICT IndexRegister = (int32*)(Context.RegisterTable[InstanceIndexRegisterIndex]);

		FDataSetThreadLocalTempData& DataSetTempData = Context.ThreadLocalTempData[DataSetIndex];

		TArray<int32>&RESTRICT IDsToFree = DataSetTempData.IDsToFree;
		check(IDTable.Num() >= InstanceOffset + Context.NumInstances);
		for (int32 i = 0; i < Context.NumInstances; ++i)
		{
			int32 InstanceId = IDRegister[i];
			int32 Index = IndexRegister[i];

			if (Index == INDEX_NONE)
			{
				//Add the ID to a thread local list of IDs to free which are actually added to the list safely at the end of this chunk's execution.
				IDsToFree.Add(InstanceId);
				IDTable[InstanceId] = INDEX_NONE;
				//UE_LOG(LogVectorVM, Warning, TEXT("FreeingID: InstanceID:%d."), InstanceId);
			}
			else
			{
				//Update the actual index for this ID. No thread safety is needed as this ID slot can only ever be written by this instance and so a single thread.
				int32 RealIdx = InstanceOffset + Index;
				IDTable[InstanceId] = RealIdx;

				//Update thread local max ID seen. We push this to the real value at the end of execution.
				DataSetTempData.MaxID = FMath::Max(DataSetTempData.MaxID, InstanceId);

				//UE_LOG(LogVectorVM, Warning, TEXT("UpdateID: RealIdx:%d | InstanceID:%d."), RealIdx, InstanceId);
			}
		}
	}
};

/** Special kernel for reading from the main input dataset. */
template<typename T>
struct FVectorKernelReadInput
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		static const int32 InstancesPerVector = sizeof(VectorRegister) / sizeof(T);

		int32 DataSetIndex = VectorVM::DecodeU16(Context);
		int32 InputRegisterIdx = VectorVM::DecodeU16(Context);
		int32 DestRegisterIdx = VectorVM::DecodeU16(Context);
		int32 Loops = Align(Context.NumInstances, InstancesPerVector) / InstancesPerVector;

		VectorRegister* DestReg = (VectorRegister*)(Context.RegisterTable[DestRegisterIdx]);
		int32 DataSetOffset = Context.DataSetOffsetTable[DataSetIndex];	//TODO: we'll need a different way of doing this for a proper kernel; might need a specific offset handler
		VectorRegister* InputReg = (VectorRegister*)((T*)(Context.RegisterTable[InputRegisterIdx + DataSetOffset]) + Context.StartInstance);

		//TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others.
		for (int32 i = 0; i < Loops; ++i)
		{
			*DestReg = VectorLoad(InputReg);
			++DestReg;
			++InputReg;
		}
	}
};

/** Special kernel for reading from an input dataset; non-advancing (reads same instance every time).
 * this kernel splats the X component of the source register to all 4 dest components; it's meant to
 * use scalar data sets as the source (e.g. events) */
template<typename T>
struct FVectorKernelReadInputNoAdvance
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		static const int32 InstancesPerVector = sizeof(VectorRegister) / sizeof(T);

		int32 DataSetIndex = VectorVM::DecodeU16(Context);
		int32 InputRegisterIdx = VectorVM::DecodeU16(Context);
		int32 DestRegisterIdx = VectorVM::DecodeU16(Context);
		int32 Loops = Align(Context.NumInstances, InstancesPerVector) / InstancesPerVector;

		VectorRegister* DestReg = (VectorRegister*)(Context.RegisterTable[DestRegisterIdx]);
		int32 DataSetOffset = Context.DataSetOffsetTable[DataSetIndex];	//TODO: we'll need a different way of doing this for a proper kernel; might need a specific offset handler
		VectorRegister* InputReg = (VectorRegister*)((T*)(Context.RegisterTable[InputRegisterIdx + DataSetOffset]));

		//TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others.
		for (int32 i = 0; i < Loops; ++i)
		{
			*DestReg = VectorSwizzle(VectorLoad(InputReg), 0, 0, 0, 0);
			++DestReg;
		}
	}
};

//TODO - Should be straightforward to follow the input with a mix of the outputs direct indexing
/** Special kernel for reading a specific location in an input register. */
// template<typename T>
// struct FScalarKernelReadInputIndexed
// {
// 	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
// 	{
// 		int32* IndexReg = (int32*)(Context.RegisterTable[DecodeU16(Context)]);
// 		T* InputReg = (T*)(Context.RegisterTable[DecodeU16(Context)]);
// 		T* DestReg = (T*)(Context.RegisterTable[DecodeU16(Context)]);
//
// 		//Has to be scalar as each instance can read from a different location in the input buffer.
// 		for (int32 i = 0; i < Context.NumInstances; ++i)
// 		{
// 			T* ReadPtr = (*InputReg) + (*IndexReg);
// 			*DestReg = (*ReadPtr);
// 			++IndexReg;
// 			++DestReg;
// 		}
// 	}
// };

//Needs its own handler as the output registers are indexed absolutely rather than incrementing in Advance().
template<typename T>
struct FOutputRegisterHandler : public FRegisterHandlerBase
{
	T* Register;
	FOutputRegisterHandler(FVectorVMContext& Context, uint32 DataSetOffset)
		: FRegisterHandlerBase(Context)
		, Register((T*)Context.RegisterTable[RegisterIndex + DataSetOffset])
	{}
	VM_FORCEINLINE void Advance() { }
	VM_FORCEINLINE T Get() { return *Register; }
	VM_FORCEINLINE T*RESTRICT GetDest() { return Register; }
	VM_FORCEINLINE T*RESTRICT GetDestAndAdvance() { return Register; }
	VM_FORCEINLINE T GetAndAdvance() { return *Register; }
};

/** Special kernel for writing to a specific output register. */
template<typename T>
struct FScalarKernelWriteOutputIndexed
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = VectorVM::DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TTrinaryOutputKernelHandler<FScalarKernelWriteOutputIndexed, FOutputRegisterHandler<T>, FDataSetOffsetHandler, FRegisterHandler<int32>, FRegisterHandler<T>, 1>::Exec(Context); break;
		case SRCOP_RRC: TTrinaryOutputKernelHandler<FScalarKernelWriteOutputIndexed, FOutputRegisterHandler<T>, FDataSetOffsetHandler, FRegisterHandler<int32>, FConstantHandler<T>, 1>::Exec(Context); break;
		default: check(0); break;
		};
	}

	static VM_FORCEINLINE void DoKernel(FVectorVMContext& Context, T* RESTRICT Dst, int32 SetOffset, int32 Index, T Data)
	{
		if (Index != INDEX_NONE)
		{
			Dst[Index] = Data;//TODO: On sse4 we can use _mm_stream_ss here.
		}
	}
};
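// Editor's sketch (an assumption about the intended usage): acquireindex plus the indexed
// write above together implement a stream compaction. In scalar form, per instance:
//
//   if (bKeep)  { Slot = Counter++;  Out[DataSetOffset + Slot] = Value; }
//   else        { Slot = INDEX_NONE; /* the INDEX_NONE check above drops the write */ }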
struct FDataSetCounterHandler
{
	int32* Counter;
	FDataSetCounterHandler(FVectorVMContext& Context)
		: Counter(Context.DataSetIndexTable + VectorVM::DecodeU16(Context))
	{}
	VM_FORCEINLINE void Advance() { }
	VM_FORCEINLINE int32* Get() { return Counter; }
	VM_FORCEINLINE int32* GetAndAdvance() { return Counter; }
	//VM_FORCEINLINE const int32* GetDest() { return Counter; } Should never use as a dest. All kernels will read and write to this.
};

template<bool bThreadsafe>
struct FScalarKernelAcquireCounterIndex
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = VectorVM::DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRR: TBinaryKernelHandler<FScalarKernelAcquireCounterIndex, FRegisterHandler<int32>, FDataSetCounterHandler, FRegisterHandler<int32>, 1>::Exec(Context); break;
		case SRCOP_RRC: TBinaryKernelHandler<FScalarKernelAcquireCounterIndex, FRegisterHandler<int32>, FDataSetCounterHandler, FConstantHandler<int32>, 1>::Exec(Context); break;
		default: check(0); break;
		};
	}

	static VM_FORCEINLINE void DoKernel(FVectorVMContext& Context, int32* RESTRICT Dst, int32* Index, int32 Valid)
	{
		if (Valid != 0)
		{
			if (bThreadsafe)
			{
				*Dst = FPlatformAtomics::InterlockedIncrement(Index);
			}
			else
			{
				*Dst = ++(*Index);
			}
		}
		else
		{
			*Dst = INDEX_NONE;	// Subsequent DoKernel calls above will skip over INDEX_NONE register entries...
		}
	}
};

//TODO: REWORK TO FUNCTION LIKE THE ABOVE.
// /** Special kernel for decrementing a dataset counter. */
// struct FScalarKernelReleaseCounterIndex
// {
// 	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
// 	{
// 		int32* CounterPtr = (int32*)(Context.ConstantTable[DecodeU16(Context)]);
// 		int32* DestReg = (int32*)(Context.RegisterTable[DecodeU16(Context)]);
//
// 		for (int32 i = 0; i < Context.NumInstances; ++i)
// 		{
// 			int32 Counter = (*CounterPtr--);
// 			*DestReg = Counter >= 0 ? Counter : INDEX_NONE;
//
// 			++DestReg;
// 		}
// 	}
// };

//////////////////////////////////////////////////////////////////////////
//external_func_call

struct FKernelExternalFunctionCall
{
	static void Exec(FVectorVMContext& Context)
	{
		uint32 ExternalFuncIdx = VectorVM::DecodeU8(Context);
		Context.ExternalFunctionTable[ExternalFuncIdx].Execute(Context);
	}
};
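// Editor's sketch (an assumption about the delegate type; see FVMExternalFunction in
// VectorVM.h): external functions are bound by the host before Exec and invoked by index
// from the bytecode, roughly like:
//
//   FVMExternalFunction Func;
//   Func.BindLambda([](FVectorVMContext& Context)
//   {
//       FRegisterHandler<float> In(Context);   // decode operands exactly like a kernel
//       FRegisterHandler<float> Out(Context);
//       for (int32 i = 0; i < Context.NumInstances; ++i)
//       {
//           *Out.GetDestAndAdvance() = FMath::Sqrt(In.GetAndAdvance());
//       }
//   });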
//////////////////////////////////////////////////////////////////////////
//Integer operations

//addi,
struct FVectorIntKernelAdd : TBinaryVectorIntKernel<FVectorIntKernelAdd>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntAdd(Src0, Src1);
	}
};

//subi,
struct FVectorIntKernelSubtract : TBinaryVectorIntKernel<FVectorIntKernelSubtract>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntSubtract(Src0, Src1);
	}
};

//muli,
struct FVectorIntKernelMultiply : TBinaryVectorIntKernel<FVectorIntKernelMultiply>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntMultiply(Src0, Src1);
	}
};

//divi,
struct FVectorIntKernelDivide : TBinaryVectorIntKernel<FVectorIntKernelDivide>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		int32 TmpA[4];
		VectorIntStore(Src0, TmpA);

		int32 TmpB[4];
		VectorIntStore(Src1, TmpB);

		// No intrinsics exist for integer divide. Since div by zero causes crashes, we must be safe against that.
		int32 TmpDst[4];
		TmpDst[0] = TmpB[0] != 0 ? (TmpA[0] / TmpB[0]) : 0;
		TmpDst[1] = TmpB[1] != 0 ? (TmpA[1] / TmpB[1]) : 0;
		TmpDst[2] = TmpB[2] != 0 ? (TmpA[2] / TmpB[2]) : 0;
		TmpDst[3] = TmpB[3] != 0 ? (TmpA[3] / TmpB[3]) : 0;

		*Dst = MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3]);
	}
};

//clampi,
struct FVectorIntKernelClamp : TTrinaryVectorIntKernel<FVectorIntKernelClamp>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1, VectorRegisterInt Src2)
	{
		*Dst = VectorIntMin(VectorIntMax(Src0, Src1), Src2);
	}
};

//mini,
struct FVectorIntKernelMin : TBinaryVectorIntKernel<FVectorIntKernelMin>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntMin(Src0, Src1);
	}
};

//maxi,
struct FVectorIntKernelMax : TBinaryVectorIntKernel<FVectorIntKernelMax>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntMax(Src0, Src1);
	}
};

//absi,
struct FVectorIntKernelAbs : TUnaryVectorIntKernel<FVectorIntKernelAbs>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0)
	{
		*Dst = VectorIntAbs(Src0);
	}
};

//negi,
struct FVectorIntKernelNegate : TUnaryVectorIntKernel<FVectorIntKernelNegate>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0)
	{
		*Dst = VectorIntNegate(Src0);
	}
};

//signi,
struct FVectorIntKernelSign : TUnaryVectorIntKernel<FVectorIntKernelSign>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0)
	{
		*Dst = VectorIntSign(Src0);
	}
};
//randomi,
//No good way to do this with SSE atm so just do it scalar.
struct FScalarIntKernelRandom : public TUnaryScalarIntKernel<FScalarIntKernelRandom>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, int32* RESTRICT Dst, int32 Src0)
	{
		const float rm = RAND_MAX;
		//EEK!. Improve this. Implement GPU style seeded rand instead of this.
		*Dst = static_cast<int32>(Context.RandStream.GetFraction() * Src0);
	}
};

//cmplti,
struct FVectorIntKernelCompareLT : TBinaryVectorIntKernel<FVectorIntKernelCompareLT>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntCompareLT(Src0, Src1);
	}
};

//cmplei,
struct FVectorIntKernelCompareLE : TBinaryVectorIntKernel<FVectorIntKernelCompareLE>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntCompareLE(Src0, Src1);
	}
};

//cmpgti,
struct FVectorIntKernelCompareGT : TBinaryVectorIntKernel<FVectorIntKernelCompareGT>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntCompareGT(Src0, Src1);
	}
};

//cmpgei,
struct FVectorIntKernelCompareGE : TBinaryVectorIntKernel<FVectorIntKernelCompareGE>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntCompareGE(Src0, Src1);
	}
};

//cmpeqi,
struct FVectorIntKernelCompareEQ : TBinaryVectorIntKernel<FVectorIntKernelCompareEQ>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntCompareEQ(Src0, Src1);
	}
};

//cmpneqi,
struct FVectorIntKernelCompareNEQ : TBinaryVectorIntKernel<FVectorIntKernelCompareNEQ>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntCompareNEQ(Src0, Src1);
	}
};

//bit_and,
struct FVectorIntKernelBitAnd : TBinaryVectorIntKernel<FVectorIntKernelBitAnd>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntAnd(Src0, Src1);
	}
};

//bit_or,
struct FVectorIntKernelBitOr : TBinaryVectorIntKernel<FVectorIntKernelBitOr>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntOr(Src0, Src1);
	}
};

//bit_xor,
struct FVectorIntKernelBitXor : TBinaryVectorIntKernel<FVectorIntKernelBitXor>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		*Dst = VectorIntXor(Src0, Src1);
	}
};

//bit_not,
struct FVectorIntKernelBitNot : TUnaryVectorIntKernel<FVectorIntKernelBitNot>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0)
	{
		*Dst = VectorIntNot(Src0);
	}
};

// bit_lshift
struct FVectorIntKernelBitLShift : TBinaryVectorIntKernel<FVectorIntKernelBitLShift>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		int32 TmpA[4];
		VectorIntStore(Src0, TmpA);

		int32 TmpB[4];
		VectorIntStore(Src1, TmpB);

		int32 TmpDst[4];
		TmpDst[0] = (TmpA[0] << TmpB[0]);
		TmpDst[1] = (TmpA[1] << TmpB[1]);
		TmpDst[2] = (TmpA[2] << TmpB[2]);
		TmpDst[3] = (TmpA[3] << TmpB[3]);

		*Dst = MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3]);
	}
};

// bit_rshift
struct FVectorIntKernelBitRShift : TBinaryVectorIntKernel<FVectorIntKernelBitRShift>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		int32 TmpA[4];
		VectorIntStore(Src0, TmpA);

		int32 TmpB[4];
		VectorIntStore(Src1, TmpB);

		int32 TmpDst[4];
		TmpDst[0] = (TmpA[0] >> TmpB[0]);
		TmpDst[1] = (TmpA[1] >> TmpB[1]);
		TmpDst[2] = (TmpA[2] >> TmpB[2]);
		TmpDst[3] = (TmpA[3] >> TmpB[3]);

		*Dst = MakeVectorRegisterInt(TmpDst[0], TmpDst[1], TmpDst[2], TmpDst[3]);
	}
};
//"Boolean" ops. Currently handling bools as integers.
//logic_and,
struct FVectorIntKernelLogicAnd : TBinaryVectorIntKernel<FVectorIntKernelLogicAnd>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		//We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these?
		*Dst = VectorIntAnd(Src0, Src1);
	}
};

//logic_or,
struct FVectorIntKernelLogicOr : TBinaryVectorIntKernel<FVectorIntKernelLogicOr>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		//We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these?
		*Dst = VectorIntOr(Src0, Src1);
	}
};

//logic_xor,
struct FVectorIntKernelLogicXor : TBinaryVectorIntKernel<FVectorIntKernelLogicXor>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
	{
		//We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these?
		*Dst = VectorIntXor(Src0, Src1);
	}
};

//logic_not,
struct FVectorIntKernelLogicNot : TUnaryVectorIntKernel<FVectorIntKernelLogicNot>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0)
	{
		//We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these?
		*Dst = VectorIntNot(Src0);
	}
};

//conversions
//f2i,
struct FVectorKernelFloatToInt : TUnaryKernel<FVectorKernelFloatToInt, FRegisterHandler<VectorRegisterInt>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegister Src0)
	{
		*Dst = VectorFloatToInt(Src0);
	}
};

//i2f,
struct FVectorKernelIntToFloat : TUnaryKernel<FVectorKernelIntToFloat, FRegisterHandler<VectorRegister>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* Dst, VectorRegisterInt Src0)
	{
		*Dst = VectorIntToFloat(Src0);
	}
};

//f2b,
struct FVectorKernelFloatToBool : TUnaryKernel<FVectorKernelFloatToBool, FRegisterHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* Dst, VectorRegister Src0)
	{
		*Dst = VectorCompareGT(Src0, GlobalVectorConstants::FloatZero);
	}
};

//b2f,
struct FVectorKernelBoolToFloat : TUnaryKernel<FVectorKernelBoolToFloat, FRegisterHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegister* Dst, VectorRegister Src0)
	{
		*Dst = VectorSelect(Src0, GlobalVectorConstants::FloatOne, GlobalVectorConstants::FloatZero);
	}
};

//i2b,
struct FVectorKernelIntToBool : TUnaryKernel<FVectorKernelIntToBool, FRegisterHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0)
	{
		*Dst = VectorIntCompareGT(Src0, GlobalVectorConstants::IntZero);
	}
};

//b2i,
struct FVectorKernelBoolToInt : TUnaryKernel<FVectorKernelBoolToInt, FRegisterHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS>
{
	static void VM_FORCEINLINE DoKernel(FVectorVMContext& Context, VectorRegisterInt* Dst, VectorRegisterInt Src0)
	{
		*Dst = VectorIntSelect(Src0, GlobalVectorConstants::IntOne, GlobalVectorConstants::IntZero);
	}
};
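// Editor's note: the *2b / b2* conversions work on full lane masks, not 0/1 values.
// E.g. f2b(2.0f) yields 0xFFFFFFFF in that lane (VectorCompareGT against zero), and b2i
// collapses the mask back to integer 1 via VectorIntSelect, so per lane
// b2i(f2b(x)) == (x > 0.0f ? 1 : 0).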
UEnum* g_VectorVMEnumStateObj = nullptr;
UEnum* g_VectorVMEnumOperandObj = nullptr;

void VectorVM::Init()
{
	static bool Inited = false;
	if (Inited == false)
	{
		g_VectorVMEnumStateObj = StaticEnum<EVectorVMOp>();
		g_VectorVMEnumOperandObj = StaticEnum<EVectorVMOperandLocation>();

		// random noise
		float TempTable[17][17][17];
		for (int z = 0; z < 17; z++)
		{
			for (int y = 0; y < 17; y++)
			{
				for (int x = 0; x < 17; x++)
				{
					float f1 = (float)FMath::FRandRange(-1.0f, 1.0f);
					TempTable[x][y][z] = f1;
				}
			}
		}

		// pad
		for (int i = 0; i < 17; i++)
		{
			for (int j = 0; j < 17; j++)
			{
				TempTable[i][j][16] = TempTable[i][j][0];
				TempTable[i][16][j] = TempTable[i][0][j];
				TempTable[16][j][i] = TempTable[0][j][i];
			}
		}

		// compute gradients
		FVector TempTable2[17][17][17];
		for (int z = 0; z < 16; z++)
		{
			for (int y = 0; y < 16; y++)
			{
				for (int x = 0; x < 16; x++)
				{
					FVector XGrad = FVector(1.0f, 0.0f, TempTable[x][y][z] - TempTable[x + 1][y][z]);
					FVector YGrad = FVector(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y + 1][z]);
					FVector ZGrad = FVector(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y][z + 1]);
					FVector Grad = FVector(XGrad.Z, YGrad.Z, ZGrad.Z);
					TempTable2[x][y][z] = Grad;
				}
			}
		}

		// pad
		for (int i = 0; i < 17; i++)
		{
			for (int j = 0; j < 17; j++)
			{
				TempTable2[i][j][16] = TempTable2[i][j][0];
				TempTable2[i][16][j] = TempTable2[i][0][j];
				TempTable2[16][j][i] = TempTable2[0][j][i];
			}
		}

		// compute curl of gradient field
		for (int z = 0; z < 16; z++)
		{
			for (int y = 0; y < 16; y++)
			{
				for (int x = 0; x < 16; x++)
				{
					FVector Dy = TempTable2[x][y][z] - TempTable2[x][y + 1][z];
					FVector Sy = TempTable2[x][y][z] + TempTable2[x][y + 1][z];
					FVector Dx = TempTable2[x][y][z] - TempTable2[x + 1][y][z];
					FVector Sx = TempTable2[x][y][z] + TempTable2[x + 1][y][z];
					FVector Dz = TempTable2[x][y][z] - TempTable2[x][y][z + 1];
					FVector Sz = TempTable2[x][y][z] + TempTable2[x][y][z + 1];
					FVector Dir = FVector(Dy.Z - Sz.Y, Dz.X - Sx.Z, Dx.Y - Sy.X);

					FVectorKernelNoise::RandomTable[x][y][z] = MakeVectorRegister(Dir.X, Dir.Y, Dir.Z, 0.0f);
				}
			}
		}

		Inited = true;
	}
}

void VectorVM::Exec(
	uint8 const* Code,
	uint8** InputRegisters,
	int32 NumInputRegisters,
	uint8** OutputRegisters,
	int32 NumOutputRegisters,
	uint8 const* ConstantTable,
	TArray<FDataSetMeta>& DataSetMetaTable,
	FVMExternalFunction* ExternalFunctionTable,
	void** UserPtrTable,
	int32 NumInstances
#if STATS
	, const TArray<TStatId>& StatScopes
#endif
)
{
	SCOPE_CYCLE_COUNTER(STAT_VVMExec);

	// table of index counters, one for each data set
	TArray<int32, TInlineAllocator<16>> DataSetIndexTable;
	TArray<int32, TInlineAllocator<16>> DataSetOffsetTable;

	// map secondary data sets and fill in the offset table into the register table
	for (int32 Idx = 0; Idx < DataSetMetaTable.Num(); Idx++)
	{
		uint32 DataSetOffset = DataSetMetaTable[Idx].RegisterOffset;
		DataSetOffsetTable.Add(DataSetOffset);
		DataSetIndexTable.Add(DataSetMetaTable[Idx].DataSetAccessIndex);	// prime counter index table with the data set offset; will be incremented with every write for each instance
	}

	int32 NumChunks = (NumInstances / InstancesPerChunk) + 1;
	int32 ChunksPerBatch = (GbParallelVVM != 0 && FApp::ShouldUseThreadingForPerformance()) ? GParallelVVMChunksPerBatch : NumChunks;
	int32 NumBatches = FMath::DivideAndRoundUp(NumChunks, ChunksPerBatch);
	bool bParallel = NumBatches > 1;
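	// Editor's worked example (values assumed for illustration): with NumInstances = 1000
	// and InstancesPerChunk = 128, NumChunks = 1000/128 + 1 = 8; with
	// GParallelVVMChunksPerBatch = 4 that gives NumBatches = 2, so bParallel is true and
	// two tasks of 4 chunks each run via ParallelFor below.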
	auto ExecChunkBatch = [&](int32 BatchIdx)
	{
		SCOPE_CYCLE_COUNTER(STAT_VVMExecChunk);

		FVectorVMContext& Context = FVectorVMContext::Get();
		Context.PrepareForExec(InputRegisters, OutputRegisters, NumInputRegisters, NumOutputRegisters, ConstantTable,
			DataSetIndexTable.GetData(), DataSetOffsetTable.GetData(), DataSetOffsetTable.Num(), ExternalFunctionTable, UserPtrTable, DataSetMetaTable
#if STATS
			, &StatScopes
#endif
		);

		// Process one chunk at a time.
		int32 ChunkIdx = BatchIdx * ChunksPerBatch;
		int32 FirstInstance = ChunkIdx * InstancesPerChunk;
		int32 FinalInstance = FMath::Min(NumInstances, FirstInstance + (ChunksPerBatch * InstancesPerChunk));
		int32 InstancesLeft = FinalInstance - FirstInstance;
		while (InstancesLeft > 0)
		{
			int32 NumInstancesThisChunk = FMath::Min(InstancesLeft, (int32)InstancesPerChunk);
			int32 StartInstance = InstancesPerChunk * ChunkIdx;

			// Setup execution context.
			Context.PrepareForChunk(Code, NumInstancesThisChunk, StartInstance);

			EVectorVMOp Op = EVectorVMOp::done;

			// Execute VM on all vectors in this chunk.
			do
			{
				Op = DecodeOp(Context);
				switch (Op)
				{
				// Dispatch kernel ops.
				case EVectorVMOp::add: FVectorKernelAdd::Exec(Context); break;
				case EVectorVMOp::sub: FVectorKernelSub::Exec(Context); break;
				case EVectorVMOp::mul: FVectorKernelMul::Exec(Context); break;
				case EVectorVMOp::div: FVectorKernelDiv::Exec(Context); break;
				case EVectorVMOp::mad: FVectorKernelMad::Exec(Context); break;
				case EVectorVMOp::lerp: FVectorKernelLerp::Exec(Context); break;
				case EVectorVMOp::rcp: FVectorKernelRcp::Exec(Context); break;
				case EVectorVMOp::rsq: FVectorKernelRsq::Exec(Context); break;
				case EVectorVMOp::sqrt: FVectorKernelSqrt::Exec(Context); break;
				case EVectorVMOp::neg: FVectorKernelNeg::Exec(Context); break;
				case EVectorVMOp::abs: FVectorKernelAbs::Exec(Context); break;
				case EVectorVMOp::exp: FVectorKernelExp::Exec(Context); break;
				case EVectorVMOp::exp2: FVectorKernelExp2::Exec(Context); break;
				case EVectorVMOp::log: FVectorKernelLog::Exec(Context); break;
				case EVectorVMOp::log2: FVectorKernelLog2::Exec(Context); break;
				case EVectorVMOp::sin: FVectorKernelSin::Exec(Context); break;
				case EVectorVMOp::cos: FVectorKernelCos::Exec(Context); break;
				case EVectorVMOp::tan: FVectorKernelTan::Exec(Context); break;
				case EVectorVMOp::asin: FVectorKernelASin::Exec(Context); break;
				case EVectorVMOp::acos: FVectorKernelACos::Exec(Context); break;
				case EVectorVMOp::atan: FVectorKernelATan::Exec(Context); break;
				case EVectorVMOp::atan2: FVectorKernelATan2::Exec(Context); break;
				case EVectorVMOp::ceil: FVectorKernelCeil::Exec(Context); break;
				case EVectorVMOp::floor: FVectorKernelFloor::Exec(Context); break;
				case EVectorVMOp::round: FVectorKernelRound::Exec(Context); break;
				case EVectorVMOp::fmod: FVectorKernelMod::Exec(Context); break;
				case EVectorVMOp::frac: FVectorKernelFrac::Exec(Context); break;
				case EVectorVMOp::trunc: FVectorKernelTrunc::Exec(Context); break;
				case EVectorVMOp::clamp: FVectorKernelClamp::Exec(Context); break;
				case EVectorVMOp::min: FVectorKernelMin::Exec(Context); break;
				case EVectorVMOp::max: FVectorKernelMax::Exec(Context); break;
				case EVectorVMOp::pow: FVectorKernelPow::Exec(Context); break;
				case EVectorVMOp::sign: FVectorKernelSign::Exec(Context); break;
				case EVectorVMOp::step: FVectorKernelStep::Exec(Context); break;
				case EVectorVMOp::random: FVectorKernelRandom::Exec(Context); break;
				case EVectorVMOp::noise: VectorVMNoise::Noise1D(Context); break;
				case EVectorVMOp::noise2D: VectorVMNoise::Noise2D(Context); break;
				case EVectorVMOp::noise3D: VectorVMNoise::Noise3D(Context); break;
				case EVectorVMOp::cmplt: FVectorKernelCompareLT::Exec(Context); break;
				case EVectorVMOp::cmple: FVectorKernelCompareLE::Exec(Context); break;
				case EVectorVMOp::cmpgt: FVectorKernelCompareGT::Exec(Context); break;
				case EVectorVMOp::cmpge: FVectorKernelCompareGE::Exec(Context); break;
				case EVectorVMOp::cmpeq: FVectorKernelCompareEQ::Exec(Context); break;
				case EVectorVMOp::cmpneq: FVectorKernelCompareNEQ::Exec(Context); break;
					case EVectorVMOp::select: FVectorKernelSelect::Exec(Context); break;
					case EVectorVMOp::addi: FVectorIntKernelAdd::Exec(Context); break;
					case EVectorVMOp::subi: FVectorIntKernelSubtract::Exec(Context); break;
					case EVectorVMOp::muli: FVectorIntKernelMultiply::Exec(Context); break;
					case EVectorVMOp::divi: FVectorIntKernelDivide::Exec(Context); break;
					case EVectorVMOp::clampi: FVectorIntKernelClamp::Exec(Context); break;
					case EVectorVMOp::mini: FVectorIntKernelMin::Exec(Context); break;
					case EVectorVMOp::maxi: FVectorIntKernelMax::Exec(Context); break;
					case EVectorVMOp::absi: FVectorIntKernelAbs::Exec(Context); break;
					case EVectorVMOp::negi: FVectorIntKernelNegate::Exec(Context); break;
					case EVectorVMOp::signi: FVectorIntKernelSign::Exec(Context); break;
					case EVectorVMOp::randomi: FScalarIntKernelRandom::Exec(Context); break;
					case EVectorVMOp::cmplti: FVectorIntKernelCompareLT::Exec(Context); break;
					case EVectorVMOp::cmplei: FVectorIntKernelCompareLE::Exec(Context); break;
					case EVectorVMOp::cmpgti: FVectorIntKernelCompareGT::Exec(Context); break;
					case EVectorVMOp::cmpgei: FVectorIntKernelCompareGE::Exec(Context); break;
					case EVectorVMOp::cmpeqi: FVectorIntKernelCompareEQ::Exec(Context); break;
					case EVectorVMOp::cmpneqi: FVectorIntKernelCompareNEQ::Exec(Context); break;
					case EVectorVMOp::bit_and: FVectorIntKernelBitAnd::Exec(Context); break;
					case EVectorVMOp::bit_or: FVectorIntKernelBitOr::Exec(Context); break;
					case EVectorVMOp::bit_xor: FVectorIntKernelBitXor::Exec(Context); break;
					case EVectorVMOp::bit_not: FVectorIntKernelBitNot::Exec(Context); break;
					case EVectorVMOp::bit_lshift: FVectorIntKernelBitLShift::Exec(Context); break;
					case EVectorVMOp::bit_rshift: FVectorIntKernelBitRShift::Exec(Context); break;
					case EVectorVMOp::logic_and: FVectorIntKernelLogicAnd::Exec(Context); break;
					case EVectorVMOp::logic_or: FVectorIntKernelLogicOr::Exec(Context); break;
					case EVectorVMOp::logic_xor: FVectorIntKernelLogicXor::Exec(Context); break;
					case EVectorVMOp::logic_not: FVectorIntKernelLogicNot::Exec(Context); break;
					case EVectorVMOp::f2i: FVectorKernelFloatToInt::Exec(Context); break;
					case EVectorVMOp::i2f: FVectorKernelIntToFloat::Exec(Context); break;
					case EVectorVMOp::f2b: FVectorKernelFloatToBool::Exec(Context); break;
					case EVectorVMOp::b2f: FVectorKernelBoolToFloat::Exec(Context); break;
					case EVectorVMOp::i2b: FVectorKernelIntToBool::Exec(Context); break;
					case EVectorVMOp::b2i: FVectorKernelBoolToInt::Exec(Context); break;
					case EVectorVMOp::outputdata_32bit: FScalarKernelWriteOutputIndexed::Exec(Context); break;
					case EVectorVMOp::inputdata_32bit: FVectorKernelReadInput::Exec(Context); break;
					//case EVectorVMOp::inputdata_32bit: FVectorKernelReadInput32::Exec(Context); break;
					case EVectorVMOp::inputdata_noadvance_32bit: FVectorKernelReadInputNoAdvance::Exec(Context); break;
					case EVectorVMOp::acquireindex:
					{
						// The template argument selects a thread-safe counter acquire
						// when chunk batches may run concurrently.
						if (bParallel)
						{
							FScalarKernelAcquireCounterIndex<true>::Exec(Context);
						}
						else
						{
							FScalarKernelAcquireCounterIndex<false>::Exec(Context);
						}
					}
					break;
					case EVectorVMOp::external_func_call: FKernelExternalFunctionCall::Exec(Context); break;
					case EVectorVMOp::exec_index: FVectorKernelExecutionIndex::Exec(Context); break;
					case EVectorVMOp::enter_stat_scope: FVectorKernelEnterStatScope::Exec(Context); break;
					case EVectorVMOp::exit_stat_scope: FVectorKernelExitStatScope::Exec(Context); break;

					// Special case ops to handle unique IDs; these could be written as generalized buffer operations. TODO!
					case EVectorVMOp::update_id: FScalarKernelUpdateID::Exec(Context); break;
					case EVectorVMOp::acquire_id: FScalarKernelAcquireID::Exec(Context); break;

					// Execution always terminates with a "done" opcode.
					case EVectorVMOp::done:
						break;

					// Opcode not recognized / implemented.
					default:
						UE_LOG(LogVectorVM, Fatal, TEXT("Unknown op code 0x%02x"), (uint32)Op);
						return; // bail
				}
			} while (Op != EVectorVMOp::done);

			InstancesLeft -= InstancesPerChunk;
			++ChunkIdx;
		}
		Context.FinishExec();
	};

	ParallelFor(NumBatches, ExecChunkBatch, GbParallelVVM == 0 || !bParallel);

	// Write back data set access indices, so we know how much was written to each data set.
	for (int32 Idx = 0; Idx < DataSetMetaTable.Num(); Idx++)
	{
		DataSetMetaTable[Idx].DataSetAccessIndex = DataSetIndexTable[Idx];
	}
}

uint8 VectorVM::GetNumOpCodes()
{
	return (uint8)EVectorVMOp::NumOpcodes;
}

#if WITH_EDITOR
FString VectorVM::GetOpName(EVectorVMOp Op)
{
	check(g_VectorVMEnumStateObj);
	FString OpStr = g_VectorVMEnumStateObj->GetNameByValue((uint8)Op).ToString();
	int32 LastIdx = 0;
	OpStr.FindLastChar(TEXT(':'), LastIdx);
	return OpStr.RightChop(LastIdx);
}

FString VectorVM::GetOperandLocationName(EVectorVMOperandLocation Location)
{
	check(g_VectorVMEnumOperandObj);
	FString LocStr = g_VectorVMEnumOperandObj->GetNameByValue((uint8)Location).ToString();
	int32 LastIdx = 0;
	LocStr.FindLastChar(TEXT(':'), LastIdx);
	return LocStr.RightChop(LastIdx);
}
#endif

#undef VM_FORCEINLINE
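// A minimal calling sketch, for illustration only: in the engine the bytecode and
// all tables are produced by the Niagara script compiler, and every "My*" name
// below is a hypothetical placeholder, not real engine data.
//
//	uint8* Inputs[1] = { MyInputStream };    // per-instance input register streams
//	uint8* Outputs[1] = { MyOutputStream };  // per-instance output register streams
//	TArray<FDataSetMeta> DataSetMetaTable;   // one entry per data set
//	VectorVM::Exec(MyByteCode, Inputs, 1, Outputs, 1, MyConstantTable,
//		DataSetMetaTable, MyExternalFunctionTable, MyUserPtrTable, MyNumInstances
//	#if STATS
//		, MyStatScopes
//	#endif
//		);
//
// After Exec returns, DataSetMetaTable[Idx].DataSetAccessIndex reflects how far
// each data set's counter advanced, i.e. how much was written to that data set.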