// Copyright 1998-2015 Epic Games, Inc. All Rights Reserved. #include "VectorVMPrivate.h" #include "CurveVector.h" #include "VectorVMDataObject.h" #include "ModuleManager.h" IMPLEMENT_MODULE(FDefaultModuleImpl, VectorVM); DEFINE_LOG_CATEGORY_STATIC(LogVectorVM, All, All); //#define VM_FORCEINLINE #define VM_FORCEINLINE FORCEINLINE #define OP_REGISTER (0) #define OP0_CONST (1 << 0) #define OP1_CONST (1 << 1) #define OP2_CONST (1 << 2) #define OP3_CONST (1 << 3) #define OP0_DATAOBJ (1 << 4) #define OP1_DATAOBJ (1 << 5) #define OP2_DATAOBJ (1 << 6) #define OP3_DATAOBJ (1 << 7) #define SRCOP_RRRR (OP_REGISTER | OP_REGISTER | OP_REGISTER | OP_REGISTER) #define SRCOP_RRRC (OP_REGISTER | OP_REGISTER | OP_REGISTER | OP0_CONST) #define SRCOP_RRCR (OP_REGISTER | OP_REGISTER | OP1_CONST | OP_REGISTER) #define SRCOP_RRCC (OP_REGISTER | OP_REGISTER | OP1_CONST | OP0_CONST) #define SRCOP_RCRR (OP_REGISTER | OP2_CONST | OP_REGISTER | OP_REGISTER) #define SRCOP_RCRC (OP_REGISTER | OP2_CONST | OP_REGISTER | OP0_CONST) #define SRCOP_RCCR (OP_REGISTER | OP2_CONST | OP1_CONST | OP_REGISTER) #define SRCOP_RCCC (OP_REGISTER | OP2_CONST | OP1_CONST | OP0_CONST) #define SRCOP_CRRR (OP3_CONST | OP_REGISTER | OP_REGISTER | OP_REGISTER) #define SRCOP_CRRC (OP3_CONST | OP_REGISTER | OP_REGISTER | OP0_CONST) #define SRCOP_CRCR (OP3_CONST | OP_REGISTER | OP1_CONST | OP_REGISTER) #define SRCOP_CRCC (OP3_CONST | OP_REGISTER | OP1_CONST | OP0_CONST) #define SRCOP_CCRR (OP3_CONST | OP2_CONST | OP_REGISTER | OP_REGISTER) #define SRCOP_CCRC (OP3_CONST | OP2_CONST | OP_REGISTER | OP0_CONST) #define SRCOP_CCCR (OP3_CONST | OP2_CONST | OP1_CONST | OP_REGISTER) #define SRCOP_CCCC (OP3_CONST | OP2_CONST | OP1_CONST | OP0_CONST) #define SRCOP_RRRB (OP_REGISTER | OP_REGISTER | OP_REGISTER | OP0_DATAOBJ) #define SRCOP_RRBR (OP_REGISTER | OP_REGISTER | OP1_DATAOBJ | OP_REGISTER) #define SRCOP_RRBB (OP_REGISTER | OP_REGISTER | OP1_DATAOBJ | OP0_DATAOBJ) #define SRCOP_RRCB (OP_REGISTER | OP_REGISTER | OP1_CONST 
| OP0_DATAOBJ) uint8 VectorVM::CreateSrcOperandMask(EVectorVMOperandLocation Type1, EVectorVMOperandLocation Type2, EVectorVMOperandLocation Type3, EVectorVMOperandLocation Type4) { return (Type1 == EVectorVMOperandLocation::Constant ? OP0_CONST : OP_REGISTER) | (Type2 == EVectorVMOperandLocation::Constant ? OP1_CONST : OP_REGISTER) | (Type3 == EVectorVMOperandLocation::Constant ? OP2_CONST : OP_REGISTER) | (Type4 == EVectorVMOperandLocation::Constant ? OP3_CONST : OP_REGISTER) | (Type1 == EVectorVMOperandLocation::DataObjConstant ? OP0_DATAOBJ : OP_REGISTER) | (Type2 == EVectorVMOperandLocation::DataObjConstant ? OP1_DATAOBJ : OP_REGISTER) | (Type3 == EVectorVMOperandLocation::DataObjConstant ? OP2_DATAOBJ : OP_REGISTER) | (Type4 == EVectorVMOperandLocation::DataObjConstant ? OP3_DATAOBJ : OP_REGISTER); } UNiagaraDataObject::UNiagaraDataObject(const FObjectInitializer& ObjectInitializer) : Super(ObjectInitializer) { } UNiagaraCurveDataObject::UNiagaraCurveDataObject(const FObjectInitializer& ObjectInitializer) : Super(ObjectInitializer), CurveObj(nullptr) { } FVector4 UNiagaraCurveDataObject::Sample(const FVector4& InCoords) const { FVector Vec = CurveObj->GetVectorValue(InCoords.X); return FVector4(Vec, 0.0f); } UNiagaraSparseVolumeDataObject::UNiagaraSparseVolumeDataObject(const FObjectInitializer& ObjectInitializer) : Super(ObjectInitializer) { Size = 64; NumBuckets = Size*Size*Size; //Data.AddZeroed(NumBuckets); Data.Init(FVector4(0.1f, 0.1f, 0.1f, 0.1f), NumBuckets); } ////////////////////////////////////////////////////////////////////////// #if WITH_EDITOR TArray OpNames; TArray OperandLocationNames; #endif #if ENABLE_VM_DEBUGGING FVectorVMDebuggerImpl* AttachedDebugger = NULL; #endif /** * Context information passed around during VM execution. */ struct FVectorVMContext { /** Pointer to the next element in the byte code. */ uint8 const* RESTRICT Code; /** Pointer to the table of vector register arrays. 
*/ VectorRegister* RESTRICT * RESTRICT RegisterTable; /** Pointer to the constant table. */ FVector4 const* RESTRICT ConstantTable; /** Pointer to the data object constant table. */ UNiagaraDataObject * RESTRICT *DataObjConstantTable; /** Pointer to the shared data table. */ FVectorVMSharedDataView* RESTRICT SharedDataTable; /** The number of vectors to process. */ int32 NumVectors; /** The number of instances to process. */ int32 NumInstances; /** The Operation currently executing. */ EVectorVMOp CurrOp; /** The instance we're currently starting at. Advances with each chunk processed. */ int32 StartInstance; /** Initialization constructor. */ FVectorVMContext( const uint8* InCode, VectorRegister** InRegisterTable, const FVector4* InConstantTable, UNiagaraDataObject** InDataObjTable, FVectorVMSharedDataView* InSharedDataTable, int32 InNumVectors, int32 InNumInstances, int32 InStartInstance ) : Code(InCode) , RegisterTable(InRegisterTable) , ConstantTable(InConstantTable) , DataObjConstantTable(InDataObjTable) , SharedDataTable(InSharedDataTable) , NumVectors(InNumVectors) , NumInstances(InNumInstances) , CurrOp(EVectorVMOp::done) , StartInstance(InStartInstance) { } FORCEINLINE bool IsDebugging() { #if ENABLE_VM_DEBUGGING return AttachedDebugger != nullptr; #else return false; #endif } FORCEINLINE void SetOp(EVectorVMOp InOp){ CurrOp = InOp; } #if ENABLE_VM_DEBUGGING FORCEINLINE void BeginOp(VectorVM::EVMType InType, int32 InNumArgs, int32 InNumInstancesPerOp) { if (AttachedDebugger) { AttachedDebugger->BeginOp(*this, InType, InNumArgs, InNumInstancesPerOp); } } template FORCEINLINE void PreOp(DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1 = DummyHandler, Arg2Handler& Arg2 = DummyHandler, Arg3Handler& Arg3 = DummyHandler) { if (AttachedDebugger) { AttachedDebugger->PreOp(*this, Dst, Arg0, Arg1, Arg2, Arg3); } } template FORCEINLINE void PostOp(DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1 = DummyHandler, Arg2Handler& Arg2 = DummyHandler, Arg3Handler& 
Arg3 = DummyHandler) { if (AttachedDebugger) { AttachedDebugger->PostOp(*this, Dst, Arg0, Arg1, Arg2, Arg3); } } #else FORCEINLINE void BeginOp(VectorVM::EVMType InType, int32 InNumArgs, int32 InNumInstancesPerOp) { } template FORCEINLINE void PreOp(DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1, Arg2Handler& Arg2, Arg3Handler& Arg3){ } template FORCEINLINE void PostOp(DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1, Arg2Handler& Arg2, Arg3Handler& Arg3){ } template FORCEINLINE void PreOp(DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1, Arg2Handler& Arg2){ } template FORCEINLINE void PostOp(DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1, Arg2Handler& Arg2){ } template FORCEINLINE void PreOp(DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1){ } template FORCEINLINE void PostOp(DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1){ } template FORCEINLINE void PreOp(DstHandler& Dst, Arg0Handler& Arg0){ } template FORCEINLINE void PostOp(DstHandler& Dst, Arg0Handler& Arg0){ } #endif }; static VM_FORCEINLINE uint8 DecodeU8(FVectorVMContext& Context) { return *Context.Code++; } static VM_FORCEINLINE uint8 DecodeU16(FVectorVMContext& Context) { return (*((uint16*)Context.Code))++; } static VM_FORCEINLINE uint8 DecodeU32(FVectorVMContext& Context) { return (*((uint32*)Context.Code))++; } /** Decode the next operation contained in the bytecode. */ static VM_FORCEINLINE EVectorVMOp DecodeOp(FVectorVMContext& Context) { return static_cast(DecodeU8(Context)); } static VM_FORCEINLINE uint8 DecodeSrcOperandTypes(FVectorVMContext& Context) { return DecodeU8(Context); } ////////////////////////////////////////////////////////////////////////// /** Constant handler. 
*/
// Fetches a constant operand: decodes the constant-table index from the byte
// code once; Advance() is a no-op because constants do not vary per instance.
struct FConstantHandlerBase
{
	uint8 ConstantIndex;
	FConstantHandlerBase(FVectorVMContext& Context)
		: ConstantIndex(DecodeU8(Context))
	{}
	VM_FORCEINLINE void Advance(){ }
	VM_FORCEINLINE int32 GetLocationIndex(){ return ConstantIndex; }
	VM_FORCEINLINE int32 GetSecondaryIndex(){ return INDEX_NONE; }
	VM_FORCEINLINE int32 GetTertiaryIndex(){ return INDEX_NONE; }
	VM_FORCEINLINE EVectorVMOperandLocation GetLocation(){ return EVectorVMOperandLocation::Constant; }
};

// Typed constant handler: caches the value read from the constant table.
// NOTE(review): template parameter lists appear stripped from this file
// ("template" with no <...>); restore from original source before compiling.
template
struct FConstantHandler : public FConstantHandlerBase
{
	T Constant;
	FConstantHandler(FVectorVMContext& Context)
		: FConstantHandlerBase(Context)
		, Constant(Context.ConstantTable[ConstantIndex])
	{}
	VM_FORCEINLINE const T& Get(){ return Constant; }
};

// Specialization that loads the constant into a SIMD register once up front.
template<>
struct FConstantHandler : public FConstantHandlerBase
{
	VectorRegister Constant;
	FConstantHandler(FVectorVMContext& Context)
		: FConstantHandlerBase(Context)
		, Constant(VectorLoadAligned(&Context.ConstantTable[ConstantIndex]))
	{}
	VM_FORCEINLINE const VectorRegister Get(){ return Constant; }
};

typedef FConstantHandler FVectorConstantHandler;

//////////////////////////////////////////////////////////////////////////

// Fetches a UNiagaraDataObject operand from the data-object constant table;
// same interface as the other operand handlers, also a no-op on Advance().
struct FDataObjectConstantHandler
{
	int32 ConstantIndex;
	UNiagaraDataObject *Constant;
	FDataObjectConstantHandler(FVectorVMContext& Context)
		: ConstantIndex(DecodeU8(Context))
		, Constant(Context.DataObjConstantTable[ConstantIndex])
	{}
	VM_FORCEINLINE void Advance(){ }
	VM_FORCEINLINE int32 GetLocationIndex(){ return ConstantIndex; }
	VM_FORCEINLINE int32 GetSecondaryIndex(){ return INDEX_NONE; }
	VM_FORCEINLINE int32 GetTertiaryIndex(){ return INDEX_NONE; }
	VM_FORCEINLINE EVectorVMOperandLocation GetLocation(){ return EVectorVMOperandLocation::DataObjConstant; }
	VM_FORCEINLINE UNiagaraDataObject *Get(){ return Constant; }
};

//////////////////////////////////////////////////////////////////////////

// Register handlers.
// Handle reading of a register, advancing the pointer with each read.
// Decodes a raw register index and maps it into the temporary / input / output
// banks of the unified register table.
struct FRegisterHandlerBase
{
	int32 RegisterIndex;
	FRegisterHandlerBase(FVectorVMContext& Context)
		: RegisterIndex(DecodeU8(Context))
	{}
	// Index relative to the start of its own bank (temp, input or output).
	VM_FORCEINLINE int32 GetLocationIndex()
	{
		if (RegisterIndex < VectorVM::NumTempRegisters)
		{
			return RegisterIndex;
		}
		else if (RegisterIndex < VectorVM::NumTempRegisters + VectorVM::MaxInputRegisters)
		{
			return RegisterIndex - VectorVM::NumTempRegisters;
		}
		else
		{
			return RegisterIndex - (VectorVM::NumTempRegisters + VectorVM::MaxInputRegisters);
		}
	}
	VM_FORCEINLINE int32 GetSecondaryIndex(){ return INDEX_NONE; }
	VM_FORCEINLINE int32 GetTertiaryIndex(){ return INDEX_NONE; }
	// Which bank the raw index falls into, by the same range tests as above.
	VM_FORCEINLINE EVectorVMOperandLocation GetLocation()
	{
		if (RegisterIndex < VectorVM::NumTempRegisters)
		{
			return EVectorVMOperandLocation::TemporaryRegister;
		}
		else if (RegisterIndex < VectorVM::NumTempRegisters + VectorVM::MaxInputRegisters)
		{
			return EVectorVMOperandLocation::InputRegister;
		}
		else
		{
			return EVectorVMOperandLocation::OutputRegister;
		}
	}
};

// Reads successive values from a register array; Advance() steps the pointer
// by NumInstancesPerOp each iteration.
// NOTE(review): template parameter lists appear stripped ("template"); restore
// from original source before compiling.
template
struct FRegisterHandler : public FRegisterHandlerBase
{
	T* RESTRICT Register;
	FRegisterHandler(FVectorVMContext& Context)
		: FRegisterHandlerBase(Context)
		, Register((T*)Context.RegisterTable[RegisterIndex])
	{}
	VM_FORCEINLINE const T Get(){ return *Register; }
	VM_FORCEINLINE void Advance(){ Register += NumInstancesPerOp; }
};

// Specialization that uses an aligned SIMD load for VectorRegister operands.
template
struct FRegisterHandler : public FRegisterHandlerBase
{
	VectorRegister* RESTRICT Register;
	FRegisterHandler(FVectorVMContext& Context)
		: FRegisterHandlerBase(Context)
		, Register((VectorRegister*)Context.RegisterTable[RegisterIndex])
	{}
	VM_FORCEINLINE const VectorRegister Get(){ return VectorLoadAligned(Register); }
	VM_FORCEINLINE void Advance(){ Register += NumInstancesPerOp; }
};

typedef FRegisterHandler FVectorRegisterHandler;

/** Handles writing to a register, advancing the pointer with each write.
*/
// Destination-register handler: Get() returns a writable pointer into the
// register array rather than a value.
// NOTE(review): template parameter lists appear stripped from this file
// ("template" with no <...>); restore from original source before compiling.
template
struct FRegisterDestHandler : public FRegisterHandlerBase
{
	T* RESTRICT Register;
	FRegisterDestHandler(FVectorVMContext& Context)
		: FRegisterHandlerBase(Context)
		, Register((T*)Context.RegisterTable[RegisterIndex])
	{}
	VM_FORCEINLINE T* RESTRICT Get(){ return Register; }
	VM_FORCEINLINE T GetValue(){ return *Register; }
	VM_FORCEINLINE void Advance(){ Register += NumInstancesPerOp; }
};

// VectorRegister specialization; GetValue() reads back via an aligned load.
template
struct FRegisterDestHandler : public FRegisterHandlerBase
{
	VectorRegister* RESTRICT Register;
	FRegisterDestHandler(FVectorVMContext& Context)
		: FRegisterHandlerBase(Context)
		, Register((VectorRegister*)Context.RegisterTable[RegisterIndex])
	{}
	VM_FORCEINLINE VectorRegister* RESTRICT Get(){ return Register; }
	VM_FORCEINLINE VectorRegister GetValue(){ return VectorLoadAligned(Register); }
	VM_FORCEINLINE void Advance(){ Register += NumInstancesPerOp; }
};

//////////////////////////////////////////////////////////////////////////
// Kernels

// Generic execution loops. Each constructs its operand handlers (every handler
// decodes its own operand bytes from the stream, in destination-first order),
// then applies Kernel::DoKernel across all instances, advancing each handler
// by Kernel::NumInstancesPerOp per iteration. The Begin/Pre/PostOp calls are
// debugger hooks that compile to nothing when ENABLE_VM_DEBUGGING is off.
template
struct TUnaryKernel
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		DstHandler Dst(Context);
		Arg0Handler Arg0(Context);
		Context.BeginOp(Kernel::Type, 1, Kernel::NumInstancesPerOp);
		for (int32 i = 0; i < Context.NumInstances; i += Kernel::NumInstancesPerOp)
		{
			Context.PreOp(Dst, Arg0);
			Kernel::DoKernel(Dst.Get(), Arg0.Get());
			Context.PostOp(Dst, Arg0);
			Dst.Advance();
			Arg0.Advance();
		}
	}
};

template
struct TBinaryKernel
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		DstHandler Dst(Context);
		Arg0Handler Arg0(Context);
		Arg1Handler Arg1(Context);
		Context.BeginOp(Kernel::Type, 2, Kernel::NumInstancesPerOp);
		for (int32 i = 0; i < Context.NumInstances; i += Kernel::NumInstancesPerOp)
		{
			Context.PreOp(Dst, Arg0, Arg1);
			Kernel::DoKernel(Dst.Get(), Arg0.Get(), Arg1.Get());
			Context.PostOp(Dst, Arg0, Arg1);
			Dst.Advance();
			Arg0.Advance();
			Arg1.Advance();
		}
	}
};

template
struct TTrinaryKernel
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		DstHandler Dst(Context);
		Arg0Handler Arg0(Context);
		Arg1Handler Arg1(Context);
		Arg2Handler Arg2(Context);
		Context.BeginOp(Kernel::Type, 3, Kernel::NumInstancesPerOp);
		for (int32 i = 0; i < Context.NumInstances; i += Kernel::NumInstancesPerOp)
		{
			Context.PreOp(Dst, Arg0, Arg1, Arg2);
			Kernel::DoKernel(Dst.Get(), Arg0.Get(), Arg1.Get(), Arg2.Get());
			Context.PostOp(Dst, Arg0, Arg1, Arg2);
			Dst.Advance();
			Arg0.Advance();
			Arg1.Advance();
			Arg2.Advance();
		}
	}
};

template
struct TQuaternaryKernel
{
	static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
	{
		DstHandler Dst(Context);
		Arg0Handler Arg0(Context);
		Arg1Handler Arg1(Context);
		Arg2Handler Arg2(Context);
		Arg3Handler Arg3(Context);
		Context.BeginOp(Kernel::Type, 4, Kernel::NumInstancesPerOp);
		for (int32 i = 0; i < Context.NumInstances; i += Kernel::NumInstancesPerOp)
		{
			Context.PreOp(Dst, Arg0, Arg1, Arg2, Arg3);
			Kernel::DoKernel(Dst.Get(), Arg0.Get(), Arg1.Get(), Arg2.Get(), Arg3.Get());
			Context.PostOp(Dst, Arg0, Arg1, Arg2, Arg3);
			Dst.Advance();
			Arg0.Advance();
			Arg1.Advance();
			Arg2.Advance();
			Arg3.Advance();
		}
	}
};

/** Base class of vector kernels with a single operand. */
// Dispatches on the decoded source mask to pick register vs. constant handlers.
template >
struct TUnaryVectorKernel
{
	static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4;
	static const int32 NumInstancesPerOp = 1;
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRRR: TUnaryKernel::Exec(Context); break;
		case SRCOP_RRRC: TUnaryKernel::Exec(Context); break;
		default: check(0); break;
		};
	}
};

/** Base class of Vector kernels with 2 operands.
*/
// Two-operand dispatcher: selects the TBinaryKernel instantiation matching the
// decoded register/constant source combination.
// NOTE(review): template parameter/argument lists appear stripped from this
// file ("template >"); restore from original source before compiling.
template >
struct TBinaryVectorKernel
{
	static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4;
	static const int32 NumInstancesPerOp = 1;
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRRR: TBinaryKernel::Exec(Context); break;
		case SRCOP_RRRC: TBinaryKernel::Exec(Context); break;
		case SRCOP_RRCR: TBinaryKernel::Exec(Context); break;
		case SRCOP_RRCC: TBinaryKernel::Exec(Context); break;
		default: check(0); break;
		};
	}
};

/** Base class of Vector kernels with 2 operands, one of which can be a data object. */
template >
struct TBinaryVectorKernelData
{
	static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4;
	static const int32 NumInstancesPerOp = 1;
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRRB: TBinaryKernel::Exec(Context); break;
		case SRCOP_RRCB: TBinaryKernel::Exec(Context); break;
		default: check(0); break;
		};
	}
};

/** Base class of Vector kernels with 2 operands, one of which can be a data object. */
// NOTE(review): despite the (copy-pasted) comment above, this dispatcher takes
// three operands — the first being a data object.
template >
struct TTrinaryVectorKernelData
{
	static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4;
	static const int32 NumInstancesPerOp = 1;
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRRB: TTrinaryKernel::Exec(Context); break;
		default: check(0); break;
		};
	}
};

/** Base class of Vector kernels with 3 operands.
*/
// Three-operand dispatcher across all register/constant combinations of the
// three sources (the fourth mask bit is unused here).
// NOTE(review): template parameter/argument lists appear stripped from this
// file ("template >"); restore from original source before compiling.
template >
struct TTrinaryVectorKernel
{
	static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4;
	static const int32 NumInstancesPerOp = 1;
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRRR: TTrinaryKernel::Exec(Context); break;
		case SRCOP_RRRC: TTrinaryKernel::Exec(Context); break;
		case SRCOP_RRCR: TTrinaryKernel::Exec(Context); break;
		case SRCOP_RRCC: TTrinaryKernel::Exec(Context); break;
		case SRCOP_RCRR: TTrinaryKernel::Exec(Context); break;
		case SRCOP_RCRC: TTrinaryKernel::Exec(Context); break;
		case SRCOP_RCCR: TTrinaryKernel::Exec(Context); break;
		case SRCOP_RCCC: TTrinaryKernel::Exec(Context); break;
		default: check(0); break;
		};
	}
};

/** Base class of Vector kernels with 4 operands. */
// Four-operand dispatcher: all sixteen register/constant source combinations.
template >
struct TQuatenaryVectorKernel
{
	static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4;
	static const int32 NumInstancesPerOp = 1;
	static void Exec(FVectorVMContext& Context)
	{
		uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
		switch (SrcOpTypes)
		{
		case SRCOP_RRRR: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_RRRC: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_RRCR: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_RRCC: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_RCRR: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_RCRC: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_RCCR: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_RCCC: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_CRRR: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_CRRC: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_CRCR: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_CRCC: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_CCRR: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_CCRC: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_CCCR: TQuaternaryKernel::Exec(Context); break;
		case SRCOP_CCCC: TQuaternaryKernel::Exec(Context); break;
		default: check(0); break;
		};
	}
};

/*------------------------------------------------------------------------------
	Implementation of all kernel operations.
------------------------------------------------------------------------------*/

// Component-wise add.
struct FVectorKernelAdd : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
	{
		*Dst = VectorAdd(Src0, Src1);
	}
};

// Component-wise subtract.
struct FVectorKernelSub : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
	{
		*Dst = VectorSubtract(Src0, Src1);
	}
};

// Component-wise multiply.
struct FVectorKernelMul : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
	{
		*Dst = VectorMultiply(Src0, Src1);
	}
};

// Component-wise divide.
struct FVectorKernelDiv : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorDivide(Src0, Src1);
	}
};

// Multiply-add: Src0 * Src1 + Src2.
struct FVectorKernelMad : public TTrinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2)
	{
		*Dst = VectorMultiplyAdd(Src0, Src1, Src2);
	}
};

// Linear interpolation with alpha in Src2: Src0*(1-Src2) + Src1*Src2.
struct FVectorKernelLerp : public TTrinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2)
	{
		const VectorRegister OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Src2);
		const VectorRegister Tmp = VectorMultiply(Src0, OneMinusAlpha);
		*Dst = VectorMultiplyAdd(Src1, Src2, Tmp);
	}
};

// Component-wise reciprocal.
struct FVectorKernelRcp : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
	{
		*Dst = VectorReciprocal(Src0);
	}
};

// Component-wise reciprocal square root.
struct FVectorKernelRsq : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
	{
		*Dst =
VectorReciprocalSqrt(Src0);
	}
};

// Square root computed as rcp(rsqrt(x)) — see TODO below.
struct FVectorKernelSqrt : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
	{
		// TODO: Need a SIMD sqrt!
		*Dst = VectorReciprocal(VectorReciprocalSqrt(Src0));
	}
};

// Component-wise negate.
struct FVectorKernelNeg : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
	{
		*Dst = VectorNegate(Src0);
	}
};

// Component-wise absolute value.
struct FVectorKernelAbs : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
	{
		*Dst = VectorAbs(Src0);
	}
};

// Component-wise e^x.
struct FVectorKernelExp : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorExp(Src0);
	}
};

// Component-wise 2^x.
struct FVectorKernelExp2 : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorExp2(Src0);
	}
};

// Component-wise natural log.
struct FVectorKernelLog : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorLog(Src0);
	}
};

// Component-wise log base 2.
struct FVectorKernelLog2 : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorLog2(Src0);
	}
};

// Clamp Src0 to [Src1, Src2] via max-then-min.
struct FVectorKernelClamp : public TTrinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2)
	{
		const VectorRegister Tmp = VectorMax(Src0, Src1);
		*Dst = VectorMin(Tmp, Src2);
	}
};

// Sine; the x2pi multiply means inputs are in normalized turns, not radians.
struct FVectorKernelSin : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorSin(VectorMultiply(Src0, GlobalVectorConstants::TwoPi));
	}
};

// Cosine; input in normalized turns (scaled by 2*pi) like FVectorKernelSin.
struct FVectorKernelCos : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorCos(VectorMultiply(Src0, GlobalVectorConstants::TwoPi));
	}
};

// Tangent; input in normalized turns (scaled by 2*pi).
struct FVectorKernelTan : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorTan(VectorMultiply(Src0, GlobalVectorConstants::TwoPi));
	}
};

// Arc sine; radians scaled back to normalized turns by 1/(2*pi).
struct FVectorKernelASin : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorMultiply(VectorASin(Src0), GlobalVectorConstants::OneOverTwoPi);
	}
};

// Arc cosine; result scaled to normalized turns.
struct FVectorKernelACos : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorMultiply(VectorACos(Src0), GlobalVectorConstants::OneOverTwoPi);
	}
};

// Arc tangent; result scaled to normalized turns.
struct FVectorKernelATan : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorMultiply(VectorATan(Src0), GlobalVectorConstants::OneOverTwoPi);
	}
};

// Two-argument arc tangent; result scaled to normalized turns.
struct FVectorKernelATan2 : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorMultiply(VectorATan2(Src0, Src1), GlobalVectorConstants::OneOverTwoPi);
	}
};

// Component-wise ceiling.
struct FVectorKernelCeil : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorCeil(Src0);
	}
};

// Component-wise floor.
struct FVectorKernelFloor : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorFloor(Src0);
	}
};

// Component-wise modulo.
struct FVectorKernelMod : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorMod(Src0, Src1);
	}
};

// Component-wise fractional part.
struct FVectorKernelFrac : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorFractional(Src0);
	}
};

// Component-wise truncate toward zero.
struct FVectorKernelTrunc : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister*
RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorTruncate(Src0);
	}
};

// Soft less-than: produces ~1.0 where Src0 < Src1 and ~0.0 otherwise by
// scaling the difference with a large constant and clamping to [0, 1].
struct FVectorKernelLessThan : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		VectorRegister Tmp = VectorSubtract(Src1, Src0);
		Tmp = VectorMultiply(Tmp, GlobalVectorConstants::BigNumber);
		Tmp = VectorMin(Tmp, GlobalVectorConstants::FloatOne);
		*Dst = VectorMax(Tmp, GlobalVectorConstants::FloatZero);
	}
};

// Per-component select: A where Mask > 0, otherwise B.
struct FVectorKernelSelect : public TTrinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Mask, VectorRegister A, VectorRegister B)
	{
		//Currently works by cmpgt 0 to match the current, all vector float vm/scripts but with int support, this should probably change to direct use of a mask.
		*Dst = VectorSelect(VectorCompareGT(Mask, GlobalVectorConstants::FloatZero), A, B);
	}
};

// Sample a data object (e.g. curve or volume) at the coordinates in Src1.
// Leaves *Dst untouched when the data object is null.
// NOTE(review): "reinterpret_cast(...)" has had its target type stripped
// (presumably float const*); restore from original source before compiling.
struct FVectorKernelSample : public TBinaryVectorKernelData
{
	static void FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, UNiagaraDataObject *Src0, VectorRegister Src1)
	{
		if (Src0)
		{
			float const* FloatSrc1 = reinterpret_cast(&Src1);
			FVector4 Tmp = Src0->Sample(FVector4(FloatSrc1[0], FloatSrc1[1], FloatSrc1[2], FloatSrc1[3]));
			*Dst = VectorLoad(&Tmp);
		}
	}
};

// Write Src2 into a data object at the coordinates in Src1; the written value
// is echoed to *Dst. Skipped entirely when the data object is null.
struct FVectorKernelBufferWrite : public TTrinaryVectorKernelData
{
	static void FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, UNiagaraDataObject *Src0, VectorRegister Src1, VectorRegister Src2)
	{
		if (Src0)
		{
			float const* FloatSrc1 = reinterpret_cast(&Src1);	// Coords
			float const* FloatSrc2 = reinterpret_cast(&Src2);	// Value
			FVector4 Tmp = Src0->Write(FVector4(FloatSrc1[0], FloatSrc1[1], FloatSrc1[2], FloatSrc1[3]), FVector4(FloatSrc2[0], FloatSrc2[1], FloatSrc2[2], FloatSrc2[3]));
			*Dst = VectorLoad(&Tmp);
		}
	}
};

// 4-component dot product, replicated across all lanes of the result.
struct FVectorKernelDot : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorDot4(Src0, Src1);
	}
};

struct
// Vector length computed as rcp(rcp_len); avoids a direct sqrt.
FVectorKernelLength : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		VectorRegister Temp = VectorReciprocalLen(Src0);
		*Dst = VectorReciprocal(Temp);
	}
};

// 3-component cross product.
struct FVectorKernelCross : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		*Dst = VectorCross(Src0, Src1);
	}
};

// Normalize to unit length.
struct FVectorKernelNormalize : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorNormalize(Src0);
	}
};

// Uniform random in [0, Src0) per component, driven by FMath::Rand().
// NOTE(review): "static_cast(...)" has had its target type stripped
// (presumably float); restore from original source before compiling. Also note
// FMath::Rand() is shared global state — not deterministic per instance.
struct FVectorKernelRandom : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		const float rm = RAND_MAX;
		VectorRegister Result = MakeVectorRegister(static_cast(FMath::Rand()) / rm,
			static_cast(FMath::Rand()) / rm,
			static_cast(FMath::Rand()) / rm,
			static_cast(FMath::Rand()) / rm);
		*Dst = VectorMultiply(Result, Src0);
	}
};

/* gaussian distribution random number (not working yet) */
struct FVectorKernelRandomGauss : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
	{
		const float rm = RAND_MAX;
		VectorRegister Result = MakeVectorRegister(static_cast(FMath::Rand()) / rm,
			static_cast(FMath::Rand()) / rm,
			static_cast(FMath::Rand()) / rm,
			static_cast(FMath::Rand()) / rm);

		// Center on zero and widen to roughly [-1.5, 1.5].
		Result = VectorSubtract(Result, MakeVectorRegister(0.5f, 0.5f, 0.5f, 0.5f));
		Result = VectorMultiply(MakeVectorRegister(3.0f, 3.0f, 3.0f, 3.0f), Result);

		// taylor series gaussian approximation
		const VectorRegister SPi2 = VectorReciprocal(VectorReciprocalSqrt(MakeVectorRegister(2 * PI, 2 * PI, 2 * PI, 2 * PI)));
		VectorRegister Gauss = VectorReciprocal(SPi2);
		VectorRegister Div = VectorMultiply(MakeVectorRegister(2.0f, 2.0f, 2.0f, 2.0f), SPi2);
		Gauss = VectorSubtract(Gauss, VectorDivide(VectorMultiply(Result, Result), Div));
		Div =
VectorMultiply(MakeVectorRegister(8.0f, 8.0f, 8.0f, 8.0f), SPi2);
		Gauss = VectorAdd(Gauss, VectorDivide(VectorPow(MakeVectorRegister(4.0f, 4.0f, 4.0f, 4.0f), Result), Div));
		Div = VectorMultiply(MakeVectorRegister(48.0f, 48.0f, 48.0f, 48.0f), SPi2);
		Gauss = VectorSubtract(Gauss, VectorDivide(VectorPow(MakeVectorRegister(6.0f, 6.0f, 6.0f, 6.0f), Result), Div));
		Gauss = VectorDivide(Gauss, MakeVectorRegister(0.4f, 0.4f, 0.4f, 0.4f));
		Gauss = VectorMultiply(Gauss, Src0);
		*Dst = Gauss;
	}
};

// Component-wise minimum.
struct FVectorKernelMin : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
	{
		*Dst = VectorMin(Src0, Src1);
	}
};

// Component-wise maximum.
struct FVectorKernelMax : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
	{
		*Dst = VectorMax(Src0, Src1);
	}
};

// Component-wise power.
struct FVectorKernelPow : public TBinaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
	{
		*Dst = VectorPow(Src0, Src1);
	}
};

// Component-wise sign.
struct FVectorKernelSign : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorSign(Src0);
	}
};

// Component-wise step.
struct FVectorKernelStep : public TUnaryVectorKernel
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorStep(Src0);
	}
};

// Value noise: trilinearly interpolates a 17^3 lattice of random vectors.
// NOTE(review): the loop body below is visibly garbled/truncated in this copy
// of the file ("(1.0f/(1<(&Coords);") — text between "1<" and "(&Coords)" has
// been lost, presumably including the octave scale, the wrap of Src0 into
// lattice coordinates, and the reinterpret_cast target type. Restore from the
// original source before compiling; the bytes are preserved here as found.
struct FVectorKernelNoise : public TUnaryVectorKernel
{
	static VectorRegister RandomTable[17][17][17];

	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		const VectorRegister One = MakeVectorRegister(1.0f, 1.0f, 1.0f, 1.0f);
		const VectorRegister VecSize = MakeVectorRegister(16.0f, 16.0f, 16.0f, 16.0f);
		*Dst = MakeVectorRegister(0.0f, 0.0f, 0.0f, 0.0f);
		for (uint32 i = 1; i < 2; i++)
		{
			float Di = 0.2f * (1.0f/(1<(&Coords);
			const int32 Cx = CoordPtr[0];
			const int32 Cy = CoordPtr[1];
			const int32 Cz = CoordPtr[2];
			// Trilinear interpolation of the 8 surrounding lattice samples:
			// first along X, then Y, then Z, using the fractional coordinate.
			VectorRegister Frac = VectorFractional(Coords);
			VectorRegister Alpha = VectorReplicate(Frac, 0);
			VectorRegister OneMinusAlpha = VectorSubtract(One, Alpha);

			VectorRegister XV1 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz], OneMinusAlpha));
			VectorRegister XV2 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz], OneMinusAlpha));
			VectorRegister XV3 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz+1], OneMinusAlpha));
			VectorRegister XV4 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz+1], OneMinusAlpha));

			Alpha = VectorReplicate(Frac, 1);
			OneMinusAlpha = VectorSubtract(One, Alpha);
			VectorRegister YV1 = VectorMultiplyAdd(XV1, Alpha, VectorMultiply(XV2, OneMinusAlpha));
			VectorRegister YV2 = VectorMultiplyAdd(XV3, Alpha, VectorMultiply(XV4, OneMinusAlpha));

			Alpha = VectorReplicate(Frac, 2);
			OneMinusAlpha = VectorSubtract(One, Alpha);
			VectorRegister ZV = VectorMultiplyAdd(YV1, Alpha, VectorMultiply(YV2, OneMinusAlpha));

			*Dst = VectorAdd(*Dst, ZV);
		}
	}
};

// Static definition for the noise lattice declared in FVectorKernelNoise.
VectorRegister FVectorKernelNoise::RandomTable[17][17][17];

// Broadcast one component of the source across all four lanes.
// NOTE(review): template parameter/argument lists appear stripped from this
// file ("TUnaryVectorKernel>"); restore from original source before compiling.
template
struct FVectorKernelSplat : public TUnaryVectorKernel>
{
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
	{
		*Dst = VectorReplicate(Src0, Component);
	}
};

// Compose a vector by picking one component (Cmp0..Cmp3) from each of the four
// sources.
template
struct FVectorKernelCompose : public TQuatenaryVectorKernel>
{
	//Passing as const refs as some compilers cant handle > 3 aligned vectorregister params.
	//inlined so shouldn't impact perf
	//Todo: ^^^^ test this
	static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, const VectorRegister& Src0, const VectorRegister& Src1, const VectorRegister& Src2, const VectorRegister& Src3)
	{
		//TODO - There's probably a faster way to do this.
VectorRegister Tmp0 = VectorShuffle(Src0, Src1, Cmp0, Cmp0, Cmp1, Cmp1); VectorRegister Tmp1 = VectorShuffle(Src2, Src3, Cmp2, Cmp2, Cmp3, Cmp3); *Dst = VectorShuffle(Tmp0, Tmp1, 0, 2, 0, 2); } }; // Ken Perlin's smootherstep function (zero first and second order derivatives at 0 and 1) // calculated separately for each channel of Src2 struct FVectorKernelEaseIn : public TTrinaryVectorKernel { static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, const VectorRegister& Src0, const VectorRegister& Src1, const VectorRegister& Src2) { VectorRegister X = VectorMin( VectorDivide(VectorSubtract(Src2, Src0), VectorSubtract(Src1, Src0)), MakeVectorRegister(1.0f, 1.0f, 1.0f, 1.0f) ); X = VectorMax(X, MakeVectorRegister(0.0f, 0.0f, 0.0f, 0.0f)); VectorRegister X3 = VectorMultiply( VectorMultiply(X, X), X); VectorRegister N6 = MakeVectorRegister(6.0f, 6.0f, 6.0f, 6.0f); VectorRegister N15 = MakeVectorRegister(15.0f, 15.0f, 15.0f, 15.0f); VectorRegister T = VectorSubtract( VectorMultiply(X, N6), N15 ); T = VectorAdd(VectorMultiply(X, T), MakeVectorRegister(10.0f, 10.0f, 10.0f, 10.0f)); *Dst = VectorMultiply(X3, T); } }; // smoothly runs 0->1->0 struct FVectorKernelEaseInOut : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, const VectorRegister& Src0) { VectorRegister T = VectorMultiply(Src0, MakeVectorRegister(2.0f, 2.0f, 2.0f, 2.0f)); T = VectorSubtract(T, MakeVectorRegister(1.0f, 1.0f, 1.0f, 1.0f)); VectorRegister X2 = VectorMultiply(T, T); VectorRegister R = VectorMultiply(X2, MakeVectorRegister(0.9604f, 0.9604f, 0.9604f, 0.9604f)); R = VectorSubtract(R, MakeVectorRegister(1.96f, 1.96f, 1.96f, 1.96f)); R = VectorMultiply(R, X2); *Dst = VectorAdd(R, MakeVectorRegister(1.0f, 1.0f, 1.0f, 1.0f)); } }; struct FVectorKernelOutputStreamed : public TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(VectorRegister* Dst, VectorRegister Src0) { VectorStoreAlignedStreamed(Src0, Dst); } }; struct FVectorKernelOutput : public 
TUnaryVectorKernel { static void VM_FORCEINLINE DoKernel(VectorRegister* Dst, VectorRegister Src0) { VectorStoreAligned(Src0, Dst); } }; ////////////////////////////////////////////////////////////////////////// //Shared data struct FSharedDataHandlerBase { int32 SharedDataIdx; FVectorVMSharedDataView& SharedData; int32 VarIndex; FRegisterHandler IndexRegisterHandler; FSharedDataHandlerBase(FVectorVMContext& Context) : SharedDataIdx(DecodeU8(Context)) , SharedData(Context.SharedDataTable[SharedDataIdx]) , VarIndex(DecodeU8(Context)) , IndexRegisterHandler(Context) { } VM_FORCEINLINE void Advance(){ IndexRegisterHandler.Advance(); } VM_FORCEINLINE int32 GetSharedDataIndex(){ return SharedDataIdx; } VM_FORCEINLINE int32 GetVarIndex(){ return VarIndex; } VM_FORCEINLINE EVectorVMOperandLocation GetLocation(){ return EVectorVMOperandLocation::SharedData; } VM_FORCEINLINE int32 GetDataIndex(){ return IndexRegisterHandler.Get(); } VM_FORCEINLINE int32 GetLocationIndex(){ return GetSharedDataIndex(); } VM_FORCEINLINE int32 GetSecondaryIndex(){ return GetVarIndex(); } VM_FORCEINLINE int32 GetTertiaryIndex(){ return GetDataIndex(); } }; struct FSharedDataHandler : public FSharedDataHandlerBase { FSharedDataHandler(FVectorVMContext& Context) : FSharedDataHandlerBase(Context) { } VM_FORCEINLINE VectorRegister Get() { return VectorLoad((VectorRegister*)SharedData.GetReadBuffer(VarIndex, IndexRegisterHandler.Get())); } }; struct FSharedDataDestHandler : public FSharedDataHandlerBase { FSharedDataDestHandler(FVectorVMContext& Context) : FSharedDataHandlerBase(Context) { } VM_FORCEINLINE VectorRegister* Get() { return (VectorRegister*)SharedData.GetWriteBuffer(VarIndex, IndexRegisterHandler.Get()); } VM_FORCEINLINE VectorRegister GetValue() { return VectorLoadAligned(Get()); } }; /** Temporary Vector only version of this. 
*/ struct FVectorKernelSharedDataWrite : public TUnaryKernel> { static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4; static const int32 NumInstancesPerOp = 1; static void VM_FORCEINLINE DoKernel(VectorRegister* Buffer, VectorRegister Data) { VectorStoreAlignedStreamed(Data, (float*)(Buffer)); } }; /** Temporary Vector only version of this. */ struct FVectorKernelSharedDataRead : public TUnaryKernel, FSharedDataHandler> { static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4; static const int32 NumInstancesPerOp = 1; static void VM_FORCEINLINE DoKernel(VectorRegister* Dst, VectorRegister Data) { *Dst = Data; } }; struct FSharedDataIndexHandlerBase { int32 SharedDataIndex; int32 CurrIndex; FVectorVMSharedDataView& SharedData; VM_FORCEINLINE void Advance(){} VM_FORCEINLINE EVectorVMOperandLocation GetLocation(){ return EVectorVMOperandLocation::Undefined; } VM_FORCEINLINE int32 GetLocationIndex(){ return SharedDataIndex; } VM_FORCEINLINE int32 GetSecondaryIndex(){ return INDEX_NONE; } VM_FORCEINLINE int32 GetTertiaryIndex(){ return INDEX_NONE; } int32 Get(){ return CurrIndex; } FSharedDataIndexHandlerBase(FVectorVMContext& Context) : SharedDataIndex(DecodeU8(Context)) , SharedData(Context.SharedDataTable[SharedDataIndex]) { } }; struct FSharedDataIndexHandler_Acquire : public FSharedDataIndexHandlerBase { FSharedDataIndexHandler_Acquire(FVectorVMContext& Context) : FSharedDataIndexHandlerBase(Context) { } int32 GetNextIndex(){ CurrIndex = SharedData.AcquireIndexWrap(); return CurrIndex; } }; struct FSharedDataIndexHandler_AcquireWrap : public FSharedDataIndexHandlerBase { FSharedDataIndexHandler_AcquireWrap(FVectorVMContext& Context) : FSharedDataIndexHandlerBase(Context) { } int32 GetNextIndex(){ CurrIndex = SharedData.AcquireIndexWrap(); return CurrIndex; } }; struct FSharedDataIndexHandler_Consume : public FSharedDataIndexHandlerBase { FSharedDataIndexHandler_Consume(FVectorVMContext& Context) : FSharedDataIndexHandlerBase(Context) { } int32 
GetNextIndex(){ CurrIndex = SharedData.ConsumeIndex(); return CurrIndex; } }; struct FSharedDataIndexHandler_ConsumeWrap : public FSharedDataIndexHandlerBase { FSharedDataIndexHandler_ConsumeWrap(FVectorVMContext& Context) : FSharedDataIndexHandlerBase(Context) { } int32 GetNextIndex(){ CurrIndex = SharedData.ConsumeIndexWrap(); return CurrIndex; } }; //Temporary until after scalarization and we can store the size and counter of shared data in constant data. template struct FKernelSharedDataGetAppendIndexBase { static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4; static const int32 NumInstancesPerOp = 1; static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { FRegisterDestHandler IndexDest(Context); FRegisterHandler ValidSrc(Context); IndexHandler IdxHandler(Context); int32 NumInstances = Context.NumInstances; Context.BeginOp(VectorVM::EVMType::Vector4, 2, 1); for (int32 i = 0; i < NumInstances; ++i) { Context.PreOp, FRegisterHandler, IndexHandler>(IndexDest, ValidSrc, IdxHandler); int32 Index = ValidSrc.Get() > 0.0f ? IdxHandler.GetNextIndex() : INDEX_NONE; int32* Dest = IndexDest.Get(); Dest[0] = Index; Dest[1] = Index; Dest[2] = Index; Dest[3] = Index; Context.PostOp, FRegisterHandler, IndexHandler>(IndexDest, ValidSrc, IdxHandler); ValidSrc.Advance(); IndexDest.Advance(); IdxHandler.Advance(); } } }; typedef FKernelSharedDataGetAppendIndexBase FKernelSharedDataGetAppendIndex; typedef FKernelSharedDataGetAppendIndexBase FKernelSharedDataGetAppendIndex_Wrap; //Temporary until after scalarization and we can store the size and counter of shared data in constant data. 
template struct FKernelSharedDataGetConsumeIndexBase { static const VectorVM::EVMType Type = VectorVM::EVMType::Vector4; static const int32 NumInstancesPerOp = 1; static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { FRegisterDestHandler IndexDest(Context); IndexHandler IdxHandler(Context); int32 NumInstances = Context.NumInstances; Context.BeginOp(VectorVM::EVMType::Vector4, 1, 1); for (int32 i = 0; i < NumInstances; ++i) { Context.PreOp, IndexHandler>(IndexDest, IdxHandler); int32 Index = IdxHandler.GetNextIndex(); //Better to just stay in int pipeline? *IndexDest.Get() = Index;//Only need index in X; Context.PostOp, IndexHandler>(IndexDest, IdxHandler); IndexDest.Advance(); IdxHandler.Advance(); } } }; typedef FKernelSharedDataGetConsumeIndexBase FKernelSharedDataGetConsumeIndex; typedef FKernelSharedDataGetConsumeIndexBase FKernelSharedDataGetConsumeIndex_Wrap; //Temporary until after scalarization and we can store the size and counter of shared data in constant data. struct FKernelSharedDataIndexValid { static VM_FORCEINLINE void Exec(FVectorVMContext& Context) { FRegisterDestHandler ValidDest(Context); FRegisterHandler IndexSrc(Context); FVectorVMSharedDataView& SharedData = Context.SharedDataTable[DecodeU8(Context)]; int32 NumInstances = Context.NumInstances; Context.BeginOp(VectorVM::EVMType::Vector4, 1, 1); for (int32 i = 0; i < NumInstances; ++i) { Context.PreOp, FRegisterHandler>(ValidDest, IndexSrc); int32 Index = IndexSrc.Get(); float Valid = SharedData.ValidIndex(Index) ? 1.0f : 0.0f; //Better to just stay in int pipeline? 
float* Dst = ValidDest.Get(); Dst[0] = Valid; Dst[1] = Valid; Dst[2] = Valid; Dst[3] = Valid; Context.PostOp, FRegisterHandler>(ValidDest, IndexSrc); ValidDest.Advance(); IndexSrc.Advance();; } } }; void VectorVM::Init() { static bool Inited = false; if (Inited == false) { // random noise float TempTable[17][17][17]; for (int z = 0; z < 17; z++) { for (int y = 0; y < 17; y++) { for (int x = 0; x < 17; x++) { float f1 = (float)FMath::FRandRange(-1.0f, 1.0f); TempTable[x][y][z] = f1; } } } // pad for (int i = 0; i < 17; i++) { for (int j = 0; j < 17; j++) { TempTable[i][j][16] = TempTable[i][j][0]; TempTable[i][16][j] = TempTable[i][0][j]; TempTable[16][j][i] = TempTable[0][j][i]; } } // compute gradients FVector TempTable2[17][17][17]; for (int z = 0; z < 16; z++) { for (int y = 0; y < 16; y++) { for (int x = 0; x < 16; x++) { FVector XGrad = FVector(1.0f, 0.0f, TempTable[x][y][z] - TempTable[x+1][y][z]); FVector YGrad = FVector(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y + 1][z]); FVector ZGrad = FVector(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y][z+1]); FVector Grad = FVector(XGrad.Z, YGrad.Z, ZGrad.Z); TempTable2[x][y][z] = Grad; } } } // pad for (int i = 0; i < 17; i++) { for (int j = 0; j < 17; j++) { TempTable2[i][j][16] = TempTable2[i][j][0]; TempTable2[i][16][j] = TempTable2[i][0][j]; TempTable2[16][j][i] = TempTable2[0][j][i]; } } // compute curl of gradient field for (int z = 0; z < 16; z++) { for (int y = 0; y < 16; y++) { for (int x = 0; x < 16; x++) { FVector Dy = TempTable2[x][y][z] - TempTable2[x][y + 1][z]; FVector Sy = TempTable2[x][y][z] + TempTable2[x][y + 1][z]; FVector Dx = TempTable2[x][y][z] - TempTable2[x + 1][y][z]; FVector Sx = TempTable2[x][y][z] + TempTable2[x + 1][y][z]; FVector Dz = TempTable2[x][y][z] - TempTable2[x][y][z + 1]; FVector Sz = TempTable2[x][y][z] + TempTable2[x][y][z + 1]; FVector Dir = FVector(Dy.Z - Sz.Y, Dz.X - Sx.Z, Dx.Y - Sy.X); FVectorKernelNoise::RandomTable[x][y][z] = MakeVectorRegister(Dir.X, Dir.Y, 
Dir.Z, 0.0f); } } } #if WITH_EDITOR OpNames.AddDefaulted((int32)EVectorVMOp::NumOpcodes); if (UEnum* EnumStateObj = FindObject(ANY_PACKAGE, TEXT("EVectorVMOp"), true)) { for (int32 i = 0; i < (int32)EVectorVMOp::NumOpcodes; ++i) { OpNames[i] = EnumStateObj->GetDisplayNameText(i).ToString(); } } OperandLocationNames.AddDefaulted((int32)EVectorVMOperandLocation::Num); if (UEnum* EnumStateObj = FindObject(ANY_PACKAGE, TEXT("EVectorVMOperandLocation"), true)) { for (int32 i = 0; i < (int32)EVectorVMOperandLocation::Num; ++i) { OperandLocationNames[i] = EnumStateObj->GetDisplayNameText(i).ToString(); } } #endif Inited = true; } } void VectorVM::Exec( uint8 const* Code, VectorRegister** InputRegisters, int32 NumInputRegisters, VectorRegister** OutputRegisters, int32 NumOutputRegisters, FVector4 const* ConstantTable, UNiagaraDataObject* *DataObjConstTable, FVectorVMSharedDataView* SharedDataTable, int32 NumVectors ) { VectorRegister TempRegisters[NumTempRegisters][VectorsPerChunk]; VectorRegister* RegisterTable[MaxRegisters] = {0}; // Map temporary registers. for (int32 i = 0; i < NumTempRegisters; ++i) { RegisterTable[i] = TempRegisters[i]; } // Process one chunk at a time. int32 NumChunks = (NumVectors + VectorsPerChunk - 1) / VectorsPerChunk; for (int32 ChunkIndex = 0; ChunkIndex < NumChunks; ++ChunkIndex) { // Map input and output registers. for (int32 i = 0; i < NumInputRegisters; ++i) { RegisterTable[NumTempRegisters + i] = InputRegisters[i] + ChunkIndex * VectorsPerChunk; } for (int32 i = 0; i < NumOutputRegisters; ++i) { RegisterTable[NumTempRegisters + MaxInputRegisters + i] = OutputRegisters[i] + ChunkIndex * VectorsPerChunk; } // Setup execution context. 
int32 VectorsThisChunk = FMath::Min(NumVectors, VectorsPerChunk); int32 InstancesThisChunk = VectorsThisChunk; FVectorVMContext Context(Code, RegisterTable, ConstantTable, DataObjConstTable, SharedDataTable, VectorsThisChunk, InstancesThisChunk, VectorsPerChunk * ChunkIndex); EVectorVMOp Op = EVectorVMOp::done; // Execute VM on all vectors in this chunk. do { Op = DecodeOp(Context); Context.SetOp(Op); switch (Op) { // Dispatch kernel ops. case EVectorVMOp::add: FVectorKernelAdd::Exec(Context); break; case EVectorVMOp::sub: FVectorKernelSub::Exec(Context); break; case EVectorVMOp::mul: FVectorKernelMul::Exec(Context); break; case EVectorVMOp::div: FVectorKernelDiv::Exec(Context); break; case EVectorVMOp::mad: FVectorKernelMad::Exec(Context); break; case EVectorVMOp::lerp: FVectorKernelLerp::Exec(Context); break; case EVectorVMOp::rcp: FVectorKernelRcp::Exec(Context); break; case EVectorVMOp::rsq: FVectorKernelRsq::Exec(Context); break; case EVectorVMOp::sqrt: FVectorKernelSqrt::Exec(Context); break; case EVectorVMOp::neg: FVectorKernelNeg::Exec(Context); break; case EVectorVMOp::abs: FVectorKernelAbs::Exec(Context); break; case EVectorVMOp::exp: FVectorKernelExp::Exec(Context); break; case EVectorVMOp::exp2: FVectorKernelExp2::Exec(Context); break; case EVectorVMOp::log: FVectorKernelLog::Exec(Context); break; case EVectorVMOp::log2: FVectorKernelLog2::Exec(Context); break; case EVectorVMOp::sin: FVectorKernelSin::Exec(Context); break; case EVectorVMOp::cos: FVectorKernelCos::Exec(Context); break; case EVectorVMOp::tan: FVectorKernelTan::Exec(Context); break; case EVectorVMOp::asin: FVectorKernelASin::Exec(Context); break; case EVectorVMOp::acos: FVectorKernelACos::Exec(Context); break; case EVectorVMOp::atan: FVectorKernelATan::Exec(Context); break; case EVectorVMOp::atan2: FVectorKernelATan2::Exec(Context); break; case EVectorVMOp::ceil: FVectorKernelCeil::Exec(Context); break; case EVectorVMOp::floor: FVectorKernelFloor::Exec(Context); break; case 
EVectorVMOp::fmod: FVectorKernelMod::Exec(Context); break; case EVectorVMOp::frac: FVectorKernelFrac::Exec(Context); break; case EVectorVMOp::trunc: FVectorKernelTrunc::Exec(Context); break; case EVectorVMOp::clamp: FVectorKernelClamp::Exec(Context); break; case EVectorVMOp::min: FVectorKernelMin::Exec(Context); break; case EVectorVMOp::max: FVectorKernelMax::Exec(Context); break; case EVectorVMOp::pow: FVectorKernelPow::Exec(Context); break; case EVectorVMOp::sign: FVectorKernelSign::Exec(Context); break; case EVectorVMOp::step: FVectorKernelStep::Exec(Context); break; case EVectorVMOp::dot: FVectorKernelDot::Exec(Context); break; case EVectorVMOp::cross: FVectorKernelCross::Exec(Context); break; case EVectorVMOp::normalize: FVectorKernelNormalize::Exec(Context); break; case EVectorVMOp::random: FVectorKernelRandom::Exec(Context); break; case EVectorVMOp::length: FVectorKernelLength::Exec(Context); break; case EVectorVMOp::noise: FVectorKernelNoise::Exec(Context); break; case EVectorVMOp::splatx: FVectorKernelSplat<0>::Exec(Context); break; case EVectorVMOp::splaty: FVectorKernelSplat<1>::Exec(Context); break; case EVectorVMOp::splatz: FVectorKernelSplat<2>::Exec(Context); break; case EVectorVMOp::splatw: FVectorKernelSplat<3>::Exec(Context); break; case EVectorVMOp::compose: FVectorKernelCompose<0,1,2,3>::Exec(Context); break; case EVectorVMOp::composex: FVectorKernelCompose<0, 0, 0, 0>::Exec(Context); break; case EVectorVMOp::composey: FVectorKernelCompose<1, 1, 1, 1>::Exec(Context); break; case EVectorVMOp::composez: FVectorKernelCompose<2, 2, 2, 2>::Exec(Context); break; case EVectorVMOp::composew: FVectorKernelCompose<3, 3, 3, 3>::Exec(Context); break; case EVectorVMOp::lessthan: FVectorKernelLessThan::Exec(Context); break; case EVectorVMOp::sample: FVectorKernelSample::Exec(Context); break; case EVectorVMOp::bufferwrite: FVectorKernelBufferWrite::Exec(Context); break; case EVectorVMOp::easein: FVectorKernelEaseIn::Exec(Context); break; case 
EVectorVMOp::easeinout: FVectorKernelEaseInOut::Exec(Context); break; case EVectorVMOp::aquireshareddataindex: FKernelSharedDataGetAppendIndex::Exec(Context); break; case EVectorVMOp::aquireshareddataindexwrap: FKernelSharedDataGetAppendIndex_Wrap::Exec(Context); break; case EVectorVMOp::consumeshareddataindex: FKernelSharedDataGetConsumeIndex::Exec(Context); break; case EVectorVMOp::consumeshareddataindexwrap: FKernelSharedDataGetConsumeIndex_Wrap::Exec(Context); break; case EVectorVMOp::shareddataread: FVectorKernelSharedDataRead::Exec(Context); break; case EVectorVMOp::shareddatawrite: FVectorKernelSharedDataWrite::Exec(Context); break; case EVectorVMOp::shareddataindexvalid: FKernelSharedDataIndexValid::Exec(Context); break; case EVectorVMOp::select: FVectorKernelSelect::Exec(Context); break; case EVectorVMOp::output: { if ((ENABLE_VM_DEBUGGING && Context.IsDebugging()) == false) { FVectorKernelOutput::Exec(Context); } else { FVectorKernelOutputStreamed::Exec(Context); } } break; // Execution always terminates with a "done" opcode. case EVectorVMOp::done: break; // Opcode not recognized / implemented. 
default: UE_LOG(LogVectorVM, Error, TEXT("Unknown op code 0x%02x"), (uint32)Op); return;//BAIL } } while (Op != EVectorVMOp::done); NumVectors -= VectorsPerChunk; } } uint8 VectorVM::GetNumOpCodes() { return (uint8)EVectorVMOp::NumOpcodes; } #if WITH_EDITOR FString VectorVM::GetOpName(EVectorVMOp Op) { return OpNames[(int32)Op]; } FString VectorVM::GetOperandLocationName(EVectorVMOperandLocation Location) { return OperandLocationNames[(int32)Location]; } #endif ////////////////////////////////////////////////////////////////////////// #if ENABLE_VM_DEBUGGING void FVectorVMDebuggerImpl::BeginOp(FVectorVMContext& Context, VectorVM::EVMType InType, int32 InNumArgs, int32 InNumInstancesPerOp) { CurrOp = Context.CurrOp; OpType = InType; CurrNumArgs = InNumArgs; NumInstancesPerOp = InNumInstancesPerOp; CurrInstanceBase = StartInstance + Context.StartInstance; } template void FVectorVMDebuggerImpl::PreOp(FVectorVMContext& Context, DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1, Arg2Handler& Arg2, Arg3Handler& Arg3) { CachedPreOpData[(int32)VectorVM::EKernelArgs::Dest].Set(Dst.GetValue()); CachedPreOpData[(int32)VectorVM::EKernelArgs::Arg0].Set(Arg0.Get()); CachedPreOpData[(int32)VectorVM::EKernelArgs::Arg1].Set(Arg1.Get()); CachedPreOpData[(int32)VectorVM::EKernelArgs::Arg2].Set(Arg2.Get()); CachedPreOpData[(int32)VectorVM::EKernelArgs::Arg3].Set(Arg3.Get()); } template void FVectorVMDebuggerImpl::PostOp(FVectorVMContext& Context, DstHandler& Dst, Arg0Handler& Arg0, Arg1Handler& Arg1, Arg2Handler& Arg2, Arg3Handler& Arg3) { for (int32 i = 0; i < NumInstancesPerOp; ++i) { if (TArray* Info = DebugInfo.Find(CurrInstanceBase + i)) { TArray& InfoArray = *Info; int32 NewIdx = InfoArray.AddUninitialized(); VectorVM::FOpDebugInfo& NewInfo = InfoArray[NewIdx]; NewInfo.NumArgs = CurrNumArgs; NewInfo.Op = CurrOp; NewInfo.OpType = OpType; NewInfo.LocationInfo[(int32)VectorVM::EKernelArgs::Dest] = VectorVM::FDataLocationInfo(Context, Dst.GetLocation(), Dst.GetLocationIndex(), 
Dst.GetSecondaryIndex(), Dst.GetTertiaryIndex()); NewInfo.LocationInfo[(int32)VectorVM::EKernelArgs::Arg0] = VectorVM::FDataLocationInfo(Context, Arg0.GetLocation(), Arg0.GetLocationIndex(), Arg0.GetSecondaryIndex(), Arg0.GetTertiaryIndex());; NewInfo.LocationInfo[(int32)VectorVM::EKernelArgs::Arg1] = VectorVM::FDataLocationInfo(Context, Arg1.GetLocation(), Arg1.GetLocationIndex(), Arg1.GetSecondaryIndex(), Arg1.GetTertiaryIndex());; NewInfo.LocationInfo[(int32)VectorVM::EKernelArgs::Arg2] = VectorVM::FDataLocationInfo(Context, Arg2.GetLocation(), Arg2.GetLocationIndex(), Arg2.GetSecondaryIndex(), Arg2.GetTertiaryIndex());; NewInfo.LocationInfo[(int32)VectorVM::EKernelArgs::Arg3] = VectorVM::FDataLocationInfo(Context, Arg3.GetLocation(), Arg3.GetLocationIndex(), Arg3.GetSecondaryIndex(), Arg3.GetTertiaryIndex());; NewInfo.PreOpValues[(int32)VectorVM::EKernelArgs::Dest] = CachedPreOpData[(int32)VectorVM::EKernelArgs::Dest]; NewInfo.PreOpValues[(int32)VectorVM::EKernelArgs::Arg0] = CachedPreOpData[(int32)VectorVM::EKernelArgs::Arg0]; NewInfo.PreOpValues[(int32)VectorVM::EKernelArgs::Arg1] = CachedPreOpData[(int32)VectorVM::EKernelArgs::Arg1]; NewInfo.PreOpValues[(int32)VectorVM::EKernelArgs::Arg2] = CachedPreOpData[(int32)VectorVM::EKernelArgs::Arg2]; NewInfo.PreOpValues[(int32)VectorVM::EKernelArgs::Arg3] = CachedPreOpData[(int32)VectorVM::EKernelArgs::Arg3]; NewInfo.PostOpValues[(int32)VectorVM::EKernelArgs::Dest].Set(Dst.GetValue()); NewInfo.PostOpValues[(int32)VectorVM::EKernelArgs::Arg0].Set(Arg0.Get()); NewInfo.PostOpValues[(int32)VectorVM::EKernelArgs::Arg1].Set(Arg1.Get()); NewInfo.PostOpValues[(int32)VectorVM::EKernelArgs::Arg2].Set(Arg2.Get()); NewInfo.PostOpValues[(int32)VectorVM::EKernelArgs::Arg3].Set(Arg3.Get()); } } CurrInstanceBase += NumInstancesPerOp; } void VectorVM::AttachDebugger(VectorVM::FVectorVMDebugger* Debugger) { check(IsInGameThread()); check(Debugger); check(AttachedDebugger == NULL); AttachedDebugger = Debugger->GetImpl(); } void 
VectorVM::DetachDebugger(VectorVM::FVectorVMDebugger* Debugger) { check(IsInGameThread()); check(Debugger); check(AttachedDebugger == Debugger->GetImpl()); AttachedDebugger = NULL; } VectorVM::FVectorVMDebugger::FVectorVMDebugger() : Impl(new FVectorVMDebuggerImpl()) { } VectorVM::FVectorVMDebugger::~FVectorVMDebugger() { if (Impl) { delete Impl; } } void VectorVM::FVectorVMDebugger::AddInstanceToGather(int32 Instance) { check(Impl); Impl->DebugInfo.Add(Instance); } const TArray* VectorVM::FVectorVMDebugger::GetDebugInfoForInstance(int32 Instance) { check(Impl); return Impl->DebugInfo.Find(Instance); } FVectorVMDebuggerImpl* VectorVM::FVectorVMDebugger::GetImpl() { return Impl; } void VectorVM::FVectorVMDebugger::InitForScriptRun(int32 StartInstance, const TArray InstancesToDebug) { Impl->InitForScriptRun(StartInstance, InstancesToDebug); } VectorVM::FDataLocationInfo::FDataLocationInfo(FVectorVMContext& Context, EVectorVMOperandLocation InLocation, int32 InPrimaryIndex, int32 InSecondaryIndex, int32 InTertiaryIndex) : Location(InLocation) , PrimaryLocationIndex(InPrimaryIndex) , SecondaryLocationIndex(InSecondaryIndex) , TertiaryLocationIndex(InTertiaryIndex) { switch (Location) { case EVectorVMOperandLocation::Constant: MemoryAddress = &Context.ConstantTable[PrimaryLocationIndex]; break; case EVectorVMOperandLocation::TemporaryRegister: MemoryAddress = &Context.RegisterTable[PrimaryLocationIndex]; break; case EVectorVMOperandLocation::InputRegister: MemoryAddress = &Context.RegisterTable[NumTempRegisters + PrimaryLocationIndex]; break; case EVectorVMOperandLocation::OutputRegister: MemoryAddress = &Context.ConstantTable[NumTempRegisters + MaxInputRegisters + PrimaryLocationIndex]; break; case EVectorVMOperandLocation::SharedData: { FVectorVMSharedDataView& SharedData = Context.SharedDataTable[PrimaryLocationIndex]; MemoryAddress = SharedData.GetReadBuffer(SecondaryLocationIndex, TertiaryLocationIndex); } break; default: MemoryAddress = nullptr; break; }; } #endif 
#undef VM_FORCEINLINE // limit the force-inline helper macro (defined at the top of this file) to this translation unit