You've already forked UnrealEngineUWP
mirror of
https://github.com/izzy2lost/UnrealEngineUWP.git
synced 2026-03-26 18:15:20 -07:00
#lockdown Nick.Penwarden #rb none ========================== MAJOR FEATURES + CHANGES ========================== Change3244756on 2017/01/03 by Marcus.Wassmer Copying //Tasks/UE4/Dev-Niagara@3244743 to Dev-Rendering (//UE4/Dev-Rendering) Change 3248667 on 2017/01/05 by Olaf.Piesche Resaving default asset because of engine verison issue; maybe unnecessary, but resaving niagara engine content to be sure #jira UE-40160 Change 3249324 on 2017/01/06 by Marcus.Wassmer Resave with an actual version to stop cook warning Change 3249611 on 2017/01/06 by Marcus.Wassmer Just remove warning-causing niagara data for now. Change 3308052 on 2017/02/16 by Rolando.Caloca DR - Check for Vulkan SDK, and only use it if it's newer or the same as the headers we distribute Change 3308109 on 2017/02/16 by Rolando.Caloca DR - Upgrade glslang to 1.0.39.1 Change 3308111 on 2017/02/16 by Rolando.Caloca DR - Update Vulkan distribution to 1.0.39.1 Change 3308153 on 2017/02/16 by Rolando.Caloca DR - Updated glslang libs Change 3308842 on 2017/02/17 by Rolando.Caloca DR - Fixed copy/paste Change 3310007 on 2017/02/17 by Chris.Bunner Back out CL 3221219 - causing MIC generation issues and superseded by CL 3273971. #jira UE-37792 Change 3310154 on 2017/02/17 by Chris.Bunner Assert when attempting to add a custom material attribute already in the base attributes list. Change 3310155 on 2017/02/17 by Chris.Bunner PR #3231: Validate material index before accessing (Contributed by projectgheist) #jira UE-41774, UE-41788 Change 3310162 on 2017/02/17 by Chris.Bunner PR #3252: Added MobileMaterialInterface to UsedMaterials (Contributed by projectgheist) #jira UE-41823, UE-41950 Change 3310176 on 2017/02/17 by Chris.Bunner Merging CL 3233886: AMD HDR support (requires r.AMDSupportsHDRDisplayOutput=1 in ini). Update to AGS 5.0.5. Partial code tidy up. Change 3310187 on 2017/02/17 by Chris.Bunner Preserve constant expressions rather than always casting after translating a material attribute. Losing the notion of constant means we can't correctly detect used properties and falsely enable e.g. PDO. Happened because of the incorrect component masks in BreakMaterialNodes which then had to be downcast to the correct type which is done as an inline fragment rather than swizzle expression. #jira UE-41594 Change 3310215 on 2017/02/17 by Chris.Bunner Prevent SpeedTree node compiling for skeletal meshes (not supported as uses more UV sets than available). More descriptive error for missing Cubemap UV input on TextureSample material node . #jira UE-33098 Change 3310838 on 2017/02/18 by Joe.Graf Moved some private functions to public for a licensee #CodeReview: matt.kuhlenschmidt #rb: n/a Change 3311876 on 2017/02/20 by Rolando.Caloca DR - Expose skin cache cvar r.SkinCache.AccumulationBufferSizeInKB #jira UE-42014 Change 3314139 on 2017/02/21 by Rolando.Caloca DR - Minor cleanup pass - Remove FVulkanPendingState - Renamed some classes for clarity - Hoist pending UAVs for flush out to pending compute state Change 3314642 on 2017/02/21 by Rolando.Caloca DR - Some more renaming Change3315431on 2017/02/21 by Ben.Salem Properly set default values for test time out and tick. We now will default to ticking once per second, and tracking the macro stats of GPU/Render/Game thread time. #tests Ran showdown demo several times Change 3316710 on 2017/02/22 by Rolando.Caloca DR - hlslcc - Fix refract intrinsic Change 3316718 on 2017/02/22 by Rolando.Caloca DR - hlslcc - Built libs to pick up change from 3316710 - refract fix Change 3316820 on 2017/02/22 by Benjamin.Hyder updating Tm-TrigNodes map Change 3317192 on 2017/02/22 by Benjamin.Hyder Updating QA-Decals map Change 3317528 on 2017/02/22 by Benjamin.Hyder Updating QA-Decals map Change 3317639 on 2017/02/22 by Benjamin.Hyder Updating Decal on Complex Mesh example in QA-Decals Change 3317764 on 2017/02/22 by Benjamin.Hyder Final updates to QA-Decals Change 3318319 on 2017/02/22 by Rolando.Caloca DR - minor reorg/rename Change 3318379 on 2017/02/22 by Rolando.Caloca DR - more cleanup Change 3321181 on 2017/02/24 by Rolando.Caloca DR - Fix GL bug Change 3321247 on 2017/02/24 by Rolando.Caloca DR - Fix misc bugs Change 3321898 on 2017/02/24 by Chris.Bunner Only issue clear TLV dispatch if required. #jira UERNDR-193 Change 3321904 on 2017/02/24 by Chris.Bunner Added comment for potential future optimization. Change 3322013 on 2017/02/24 by Uriel.Doyon Fixed separate translucency being affected by Gaussian DOF #jira UE-40489 Change 3322517 on 2017/02/24 by Uriel.Doyon Fixed issue with InvestigateTexture command removing budget limit. Fixed StreamingBounds show flag not working. It nows shows the streaming bound for the currently selected textures. #jira UE-40485 Change 3323470 on 2017/02/27 by Chad.Garyet Removing DDC job from dev-rendering Change 3323479 on 2017/02/27 by Chad.Garyet Removing RDU agent type Change 3323519 on 2017/02/27 by Chad.Garyet removing NCL/LHR/SEA agent types to clean up space Change 3323639 on 2017/02/27 by Benjamin.Hyder More updates to QA-Decals Change 3324207 on 2017/02/27 by Uriel.Doyon Fixed typo ScaleTexturesByGlobalMyBias -> ScaleTexturesByGlobalMipBias Removed bad merge in FStreamingTextureLevelContext::GetBuildDataIndexRef Change 3324396 on 2017/02/27 by Uriel.Doyon Fixed an issue with the Streaming Bounds show flag interferring with the static level data initialization #jira UE-40485 Change 3325227 on 2017/02/28 by Chris.Bunner Fix-up AMD AGS libs. Change 3325566 on 2017/02/28 by Uriel.Doyon Fixed possible out-of-bound access in GetUsedTexture() when passing ERHIFeatureLevel::Num Change 3326009 on 2017/02/28 by Uriel.Doyon Better fix for 3325566, as the previous fix would ignore the material instance overrides. Change 3327058 on 2017/03/01 by Benjamin.Hyder Preparing TM_Shadermodels map for automation Change 3328222 on 2017/03/01 by Chris.Bunner Prevent decals from drawing in separate translucency pass. Whilst user control and material relevance were already removed, if the flag was checked before being disabled (by swapping to decal domain) this was still being read in the render loop, now explicitly ignores decals. #jira UE-42449, UE-42446 Change 3329848 on 2017/03/02 by Uriel.Doyon Added some extra logs to help track UE-42168 Change 3329977 on 2017/03/02 by Rolando.Caloca DR - Fix bad clear value Change 3330008 on 2017/03/02 by Benjamin.Hyder More preparations for QA-Decals automation Change 3330754 on 2017/03/02 by Daniel.Wright Prominent comment explaining reflection env async compute usage and why it's not overlapped with anything Change 3331451 on 2017/03/03 by Marc.Olano Manually unroll simplex noise loop to avoid PSO bug on AMD/Metal Change 3331839 on 2017/03/03 by Rolando.Caloca DR - hlslcc - add missing file to project Change 3332247 on 2017/03/03 by Rolando.Caloca DR - Fix for integrated intel PR #3305 #jira UE-42393 Change 3332259 on 2017/03/03 by Rolando.Caloca DR - Fix bad index into pixel formats PR #3237 #jira UE-41855 Change 3332305 on 2017/03/03 by Rolando.Caloca DR - OpenGL SRV for index buffers PR #3271 #jira UE-32618 Change 3332313 on 2017/03/03 by Rolando.Caloca DR - Fix for integrated intel (properly) PR #3305 #jira UE-42393 Change 3332317 on 2017/03/03 by Rolando.Caloca DR - OpenGL SRV for index buffers (properly) PR #3271 #jira UE-32618 Change 3332368 on 2017/03/03 by Rolando.Caloca DR - Minor fixes so -sm4 and -sm5 can be used on windows with OpenGL/Vulkan Change 3333690 on 2017/03/06 by Daniel.Wright [Copy] Changing movable skylight properties no longer affects static draw lists Change 3333693 on 2017/03/06 by Daniel.Wright [Copy] Added 'r.AOListMeshDistanceFields' which dumps out mesh distance fields sorted by memory size, useful for directing content optimizations Change3333705on 2017/03/06 by Daniel.Wright [Copy] Mesh distance fields are now 8 bit fixed point by default, but can be changed back to 16 bit floating piont with a project setting. * 8 bit uses half memory but introduces error for thin surfaces or large meshes. Change 3333721 on 2017/03/06 by David.Hill DecalProxy: Copy float FadeScreenSize to FDeferredDecalProxy for use in the render thread. This avoids pointer chasing to the UDecalComponent (game thread component). Change 3333772 on 2017/03/06 by Daniel.Wright [Copy] Scene motion blur data is only updated for the main renderer frames. Fixes scene captures and planar reflections breaking object motion blur. Change 3333790 on 2017/03/06 by Daniel.Wright [Copy] Mesh distance field generation uses Embree, for a 2.5x speedup * Can switch back to old kDOP generation with 'r.DistanceFieldBuild.UseEmbree 0' for debugging Change 3333822 on 2017/03/06 by Daniel.Wright [Copy] Moved mesh distance field code into MeshDistanceFieldUtilities.cpp * Moved FMeshUtilities to its own header so the 8k line MeshUtilites.cpp file can be further split up Change 3333827 on 2017/03/06 by Daniel.Wright [Copy] Range compress 8bit distance fields - gets one extra bit of precision on average Change 3333828 on 2017/03/06 by Daniel.Wright [Copy] Raised High ShadowQuality to 2048 as 1024 for CSM is way too low Change 3333831 on 2017/03/06 by Daniel.Wright Non-editor compile fix Change 3333836 on 2017/03/06 by Daniel.Wright [Copy] Workaround for gobal distance field volume textures being bloated by 4x on PS4 due to the recommended tiling modes. They now use a 2d tiling mode which avoids the bloat, saving 96Mb. Change 3333843 on 2017/03/06 by Daniel.Wright [Copy] Added OcclusionExponent to skylight component * Useful for brightening up indoors without losing contact shadows as MinOcclusion does Change 3333845 on 2017/03/06 by Daniel.Wright [Copy] Capsule shadow BP functions Change 3333850 on 2017/03/06 by Daniel.Wright [Copy] Added OcclusionCombineMode to skylight component Change 3333854 on 2017/03/06 by Daniel.Wright [Copy] Gnm properly registers clears as GPU work so those events show up in profilegpu Change 3333857 on 2017/03/06 by Daniel.Wright [Copy] Clear light attenuation for local lights with a quad covering their screen extents * Clearing the entire light attenuation buffer costs .1ms on PS4. This optimization lowers the minimum cost of a shadow casting light from .15ms -> .03ms. * Shadowed lights in Fortnite with 25 lights 3.7ms -> 1.42ms on PS4 Change 3333860 on 2017/03/06 by Daniel.Wright [Copy] Flush deferred deletes when reallocating distance field atlas to reduce peak memory Change 3333861 on 2017/03/06 by Daniel.Wright [Copy] Disable all distance field features on Intel cards as HD 4000 hangs in the RHICreateTexture3D call to allocate the large atlas Change 3333869 on 2017/03/06 by Daniel.Wright [Copy] Volumetric Fog using a volume texture mapped to the camera frustum * Volumetric fog can be enabled on an Exponential Height Fog component with additional controls * Lights have a VolumetricScatteringIntensity * New cvars r.VolumetricFog, r.VolumetricFog.GridPixelSize, r.VolumetricFog.GridSizeZ, r.VolumetricFog.DepthDistributionScale * Lighting features supported: * Directional light with CSM and a light function * Point / spot lights without shadows / light functions / IES profiles * Skylight with occlusion from distance fields * Analytical height fog covers the view range past where the volumetric fog ends * Temporal reprojection is used on the volumetric fog scattering and extinction to achieve stability * Translucency integrates properly into volumetric fog * Height fog StartDistance is not supported by volumetric fog and should be set to 0. Change 3333894 on 2017/03/06 by Daniel.Wright [Copy] Initialize GDummyVolumetricFogGlobalDataUniformBuffer outside of parallel rendering Change 3333902 on 2017/03/06 by Daniel.Wright [Copy] Better handling of volumetric fog enabled with distance of 0 Change 3333903 on 2017/03/06 by Daniel.Wright [Copy] Fixed volumetric fog trying to render light functions for a point light Change 3333908 on 2017/03/06 by Daniel.Wright [Copy] Volumetric materials * Added new material domain Volume, which can output Scattering, Absorption and Emissive. All properties are in world space densities. * Particle systems using the Volume domain are voxelized based on their ParticlePosition and ParticleRadius * Volumetric fog integration is now energy conservative - scattering is integrated against transmission over the depth of each slice. * Added bOverrideLightColorsWithFogInscatteringColors to exponential height fog, which can be enabled to make Volumetric Fog match Height fog more closely Change 3334134 on 2017/03/06 by Daniel.Wright [Copy from Michael Trepka] Added Embree 2.14.0 and changed MeshUtilities to use it as this solves issues with Embree leaking TLS keys. UnrealLightmass is still using older Embree 2.7.0 until we can find time to properly test it with the new version. Also, invalidated distance field DDC to force it to rebuild with updated Embree. Change 3334420 on 2017/03/06 by Daniel.Wright Fixed RTDF shadows Change 3335467 on 2017/03/07 by Benjamin.Hyder Initial submission of QA-Decals map to EngineTest Change 3335556 on 2017/03/07 by Daniel.Wright Changed mesh distance field default format back to R16f Change 3338020 on 2017/03/08 by Daniel.Wright Disable volumetric fog in vertex shaders for feature levels which don't support it Change 3339394 on 2017/03/09 by Chris.Bunner Correctly handle material texture translation error edge case. #jira UE-42579, UE-42670 Change 3339992 on 2017/03/09 by Daniel.Wright Only compile volumetric fog shaders on supporting platforms Change 3341858 on 2017/03/10 by Arne.Schober Copying //UE4/Dev-Rendering-PSO to Dev-Rendering (//UE4/Dev-Rendering) #RB Rolando.Caloca, Marcus.Wassmer, Daniel.Wright, Nick.Penwarden, Mark.Satterthwaite Change 3342004 on 2017/03/10 by Arne.Schober Copying //UE4/Dev-Rendering-PSO to Dev-Rendering (//UE4/Dev-Rendering) Fix unity build #RB Marcus.Wassmer Change 3343307 on 2017/03/13 by Marcus.Wassmer Update showflags when we are guaranteed it will happen in all possible ways to spawn the scenecapture. (drag into editor, PIE, -game, etc) Change 3343732 on 2017/03/13 by Rolando.Caloca DR - Vulkan compute pipeline & refactor Change 3344846 on 2017/03/14 by Rolando.Caloca DR - Android compile fixes Change 3344883 on 2017/03/14 by Rolando.Caloca DR - Add missing stencil load/store to PSO initializer Change 3344985 on 2017/03/14 by Rolando.Caloca DR - Made load/store actions uint8 Change 3345141 on 2017/03/14 by Rolando.Caloca DR - vk - Rework render pass hash Change 3345304 on 2017/03/14 by Benjamin.Hyder Updating TM-Distancefields map to include TemplateFloor mesh Change 3345387 on 2017/03/14 by Rolando.Caloca DR - Add _RenderThread calls for Create*Shader so RHIs can choose not to stall when creating Change 3345388 on 2017/03/14 by Rolando.Caloca DR - Do not stall when creating shaders on Vulkan Change 3345722 on 2017/03/14 by Chris.Bunner PR #3357: MinimalAPI add to many material expressions (Contributed by DeanoC) #jira UE-42752 Change 3345723 on 2017/03/14 by Chris.Bunner Reduce log verbosity causing spamming during landscape editing. #jira UE-42714 Change 3345725 on 2017/03/14 by Chris.Bunner [Duplicate 3341860] Fixed material translation error with multiple connections from custom interpolator nodes. Change 3345726 on 2017/03/14 by Chris.Bunner Typo fixes. Change 3345732 on 2017/03/14 by Rolando.Caloca DR - Decouple vertex declaration off BSS Change 3345746 on 2017/03/14 by Chris.Bunner Added sign() intrinsic material graph node and delisted material function workaround. Change 3346042 on 2017/03/14 by Chris.Bunner Implement missing size query interface for FRenderTargetResources. #jira UE-41672 Change 3346387 on 2017/03/14 by Daniel.Wright [Copy] Added VolumetricScatteringIntensity to particle lights Change 3346389 on 2017/03/14 by Daniel.Wright [Copy] Clamp Volumetric material attributes to fp16 range to avoid INFs Disable volumetric fog when the fog show flag is disabled Change 3346392 on 2017/03/14 by Daniel.Wright [Copy] Fixed skylight being much too bright on volumetric fog Change 3346406 on 2017/03/14 by Daniel.Wright [Copy] CSM resolution is now controlled by r.Shadow.MaxCSMResolution. * Changed HighPC to use 1024 MaxShadowResolution (max for all non-CSM shadows), saves 60Mb in Fortnite Change 3346412 on 2017/03/14 by Daniel.Wright [Copy] TexCreate_ReduceMemoryWithTilingMode for translucency lighting 3d textures, saves 13Mb Change3346414on 2017/03/14 by Daniel.Wright [Copy] TexCreate_ReduceMemoryWithTilingMode for volumetric fog 3d textures, saves 13Mb Change 3346415 on 2017/03/14 by Daniel.Wright [Copy] Missing file from cl 3338451 Change 3346421 on 2017/03/14 by Daniel.Wright [Copy] Fixed NaNs in volumetric fog due to rendering when height fog is disabled * Volumetric fog converts NaNs to black now so they don't spread Change 3346422 on 2017/03/14 by Daniel.Wright [Copy] Fixed NaN in volumetric fog with low density values Change 3346423 on 2017/03/14 by Daniel.Wright [Copy] Changed default VolumetricFogScatteringDistribution to .2 Change 3346430 on 2017/03/14 by Daniel.Wright [Copy] New translucent material option to compute fog per pixel instead of the default per vertex Change 3346432 on 2017/03/14 by Daniel.Wright [Copy] Moved Volumetric Fog parameters to view uniform buffer for translucency pass Fixed lifetimes of temporary Volumetric Fog render targets Change 3346526 on 2017/03/14 by Daniel.Wright [Copy] Volumetric Fog supports point and spot light shadows * These lights are injected separately so that per-light resources can be bound (shadow depth map, static shadow depth map) * Forward lighting of local lights can be forced with 'r.VolumetricFog.InjectShadowedLightsSeparately 0' * Shadowed lights come at a cost: 2.9ms for volumetric fog on 970 -> 4.2ms with shadowing Change 3347053 on 2017/03/15 by Rolando.Caloca DR - android compile fix Change 3347384 on 2017/03/15 by Rolando.Caloca DR - Fix merge issue Change 3347643 on 2017/03/15 by Marcus.Wassmer Fix some bugs with the 'disable stationary skylight ffor the project' feature. Fixes lighting in Persona on Paragon. Change 3347979 on 2017/03/15 by Rolando.Caloca DR - Allow to automatically apply cached rendertargets to PSO initializer Change 3348024 on 2017/03/15 by Rolando.Caloca DR - Remove NullPS on Vulkan to avoid deadlock Change 3348303 on 2017/03/15 by Rolando.Caloca DR - Fix for debugging SCW with material SRT Change 3348357 on 2017/03/15 by Marcus.Wassmer Fix stencildither and a stencilref bug that was probably breaking decals sometimes. Change 3348549 on 2017/03/15 by Marcus.Wassmer Hopefully fix static analysis for potential nullptr access. Change 3348614 on 2017/03/15 by Marcus.Wassmer Duplicate some switch changes to fix crash on launch. Change 3349369 on 2017/03/16 by Gil.Gribb Fixed botched merge Change 3349947 on 2017/03/16 by Rolando.Caloca DR - Fix for mismatched primitive type Change 3349956 on 2017/03/16 by Benjamin.Hyder initial updates to TM-DistanceFields map Change 3350151 on 2017/03/16 by Rolando.Caloca DR - Fix UT compile issue Change 3350155 on 2017/03/16 by Rolando.Caloca DR - Catch mismatched primitive type on PSOs on D3D11 Change 3350192 on 2017/03/16 by Daniel.Wright Fix for point light shadow depths rendering with wrong cull mode due to PSO refactor Change 3350736 on 2017/03/16 by Daniel.Wright Fixed formatting from merge Change 3350881 on 2017/03/16 by Rolando.Caloca DR - Fix texture arrays as UAVs on Metal Change 3350927 on 2017/03/16 by Rolando.Caloca DR - Fix warning Change 3350935 on 2017/03/16 by Daniel.Wright Fix for materials with non-Surface domains being skipped in mesh passes Change 3351583 on 2017/03/17 by Marcus.Wassmer Fix clang platforms Change 3351917 on 2017/03/17 by Marcus.Wassmer Fix linux compile Change 3351973 on 2017/03/17 by Marcus.Wassmer Fix mismatched rendertargetformat Change 3352038 on 2017/03/17 by Daniel.Wright Enabled GetAndOrCreateGraphicsPipelineState ensures in Development for testing Change 3352110 on 2017/03/17 by Marcus.Wassmer Fix missing RT PSO apply Change 3352695 on 2017/03/17 by Arne.Schober DR - Remove PSO Rendertarget check in DX12 Resolve with Shader. #RB Rolando.Caloca Change 3352960 on 2017/03/17 by Arne.Schober DR - Fix some things that slipped trough the PSO merge #RB none Change 3353150 on 2017/03/18 by Rolando.Caloca DR - compile fix Change 3353205 on 2017/03/18 by Arne.Schober DR - Fix Incremental Compile and PS4 runtime error where CMASK is not allowed for ThickTile Mode #RB none Change 3353207 on 2017/03/18 by Arne.Schober DR - Fix Confusion #RB none Change 3355183 on 2017/03/20 by Nick.Bullard Fixed up Content orginzation for Decals automation tests in EngineTest Change 3355627 on 2017/03/20 by Arne.Schober DR - [UE-43094] - removed ensure in comporiton graph as control of the clear color cannot be gurantueed. Change 3356342 on 2017/03/21 by Marcus.Wassmer Fix clang errors Change 3356591 on 2017/03/21 by Arne.Schober DR - Fix ensure message #RB none Change 3356873 on 2017/03/21 by Arne.Schober DR - Fix comparission of undefined values in RendertargetApply Check Change 3357261 on 2017/03/21 by Marcus.Wassmer Fix LinuxEditor compile Change 3357294 on 2017/03/21 by Marcus.Wassmer Add missing SSE functions Change 3357351 on 2017/03/21 by Frank.Fella Fix win32 and linux compiler errors Change 3357370 on 2017/03/21 by Arne.Schober DR - disable ensure in test builds #RB Marcus.Wassmer [CL 3357449 by Marcus Wassmer in Main branch]
1560 lines
55 KiB
C++
1560 lines
55 KiB
C++
// Copyright 1998-2017 Epic Games, Inc. All Rights Reserved.
|
|
|
|
#include "VectorVM.h"
|
|
#include "Modules/ModuleManager.h"
|
|
#include "UObject/Class.h"
|
|
#include "UObject/Package.h"
|
|
#include "VectorVMPrivate.h"
|
|
|
|
IMPLEMENT_MODULE(FDefaultModuleImpl, VectorVM);
|
|
|
|
DEFINE_LOG_CATEGORY_STATIC(LogVectorVM, All, All);
|
|
|
|
//#define VM_FORCEINLINE
|
|
#define VM_FORCEINLINE FORCEINLINE
|
|
|
|
#define OP_REGISTER (0)
|
|
#define OP0_CONST (1 << 0)
|
|
#define OP1_CONST (1 << 1)
|
|
#define OP2_CONST (1 << 2)
|
|
|
|
#define SRCOP_RRR (OP_REGISTER | OP_REGISTER | OP_REGISTER)
|
|
#define SRCOP_RRC (OP_REGISTER | OP_REGISTER | OP0_CONST)
|
|
#define SRCOP_RCR (OP_REGISTER | OP1_CONST | OP_REGISTER)
|
|
#define SRCOP_RCC (OP_REGISTER | OP1_CONST | OP0_CONST)
|
|
#define SRCOP_CRR (OP2_CONST | OP_REGISTER | OP_REGISTER)
|
|
#define SRCOP_CRC (OP2_CONST | OP_REGISTER | OP0_CONST)
|
|
#define SRCOP_CCR (OP2_CONST | OP1_CONST | OP_REGISTER)
|
|
#define SRCOP_CCC (OP2_CONST | OP1_CONST | OP0_CONST)
|
|
|
|
uint8 VectorVM::CreateSrcOperandMask(EVectorVMOperandLocation Type0, EVectorVMOperandLocation Type1, EVectorVMOperandLocation Type2)
|
|
{
|
|
return (Type0 == EVectorVMOperandLocation::Constant ? OP0_CONST : OP_REGISTER) |
|
|
(Type1 == EVectorVMOperandLocation::Constant ? OP1_CONST : OP_REGISTER) |
|
|
(Type2 == EVectorVMOperandLocation::Constant ? OP2_CONST : OP_REGISTER);
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// Kernels
|
|
template<typename Kernel, typename DstHandler, typename Arg0Handler, uint32 NumInstancesPerOp>
|
|
struct TUnaryKernelHandler
|
|
{
|
|
static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
{
|
|
Arg0Handler Arg0(Context);
|
|
DstHandler Dst(Context);
|
|
|
|
int32 LoopInstances = Align(Context.NumInstances, NumInstancesPerOp) / NumInstancesPerOp;
|
|
for (int32 i = 0; i < LoopInstances; ++i)
|
|
{
|
|
Kernel::DoKernel(Dst.GetDest(), Arg0.Get());
|
|
Dst.Advance(); Arg0.Advance();
|
|
}
|
|
}
|
|
};
|
|
|
|
template<typename Kernel, typename DstHandler, typename Arg0Handler, typename Arg1Handler, int32 NumInstancesPerOp>
|
|
struct TBinaryKernelHandler
|
|
{
|
|
static void Exec(FVectorVMContext& Context)
|
|
{
|
|
Arg0Handler Arg0(Context);
|
|
Arg1Handler Arg1(Context);
|
|
|
|
DstHandler Dst(Context);
|
|
|
|
int32 LoopInstances = Align(Context.NumInstances, NumInstancesPerOp) / NumInstancesPerOp;
|
|
for (int32 i = 0; i < LoopInstances; ++i)
|
|
{
|
|
Kernel::DoKernel(Dst.GetDest(), Arg0.Get(), Arg1.Get());
|
|
Dst.Advance(); Arg0.Advance(); Arg1.Advance();
|
|
}
|
|
}
|
|
};
|
|
|
|
template<typename Kernel, typename DstHandler, typename Arg0Handler, typename Arg1Handler, typename Arg2Handler, int32 NumInstancesPerOp>
|
|
struct TTrinaryKernelHandler
|
|
{
|
|
static void Exec(FVectorVMContext& Context)
|
|
{
|
|
Arg0Handler Arg0(Context);
|
|
Arg1Handler Arg1(Context);
|
|
Arg2Handler Arg2(Context);
|
|
|
|
DstHandler Dst(Context);
|
|
|
|
int32 LoopInstances = Align(Context.NumInstances, NumInstancesPerOp) / NumInstancesPerOp;
|
|
for (int32 i = 0; i < LoopInstances; ++i)
|
|
{
|
|
Kernel::DoKernel(Dst.GetDest(), Arg0.Get(), Arg1.Get(), Arg2.Get());
|
|
Dst.Advance(); Arg0.Advance(); Arg1.Advance(); Arg2.Advance();
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
template<typename Kernel, typename DstHandler, typename Arg0Handler, typename Arg1Handler, typename Arg2Handler, int32 NumInstancesPerOp>
|
|
struct TTrinaryOutputKernelHandler
|
|
{
|
|
static void Exec(FVectorVMContext& Context)
|
|
{
|
|
Arg0Handler Arg0(Context);
|
|
Arg1Handler Arg1(Context);
|
|
Arg2Handler Arg2(Context);
|
|
|
|
DstHandler Dst(Context, Arg0.Get());
|
|
|
|
int32 LoopInstances = Align(Context.NumInstances, NumInstancesPerOp) / NumInstancesPerOp;
|
|
for (int32 i = 0; i < LoopInstances; ++i)
|
|
{
|
|
Kernel::DoKernel(Dst.GetDest(), Arg0.Get(), Arg1.Get(), Arg2.Get());
|
|
Dst.Advance(); Arg0.Advance(); Arg1.Advance(); Arg2.Advance();
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
/** Base class of vector kernels with a single operand. */
|
|
template <typename Kernel, typename DstHandler, typename ConstHandler, typename RegisterHandler, int32 NumInstancesPerOp>
|
|
struct TUnaryKernel
|
|
{
|
|
static void Exec(FVectorVMContext& Context)
|
|
{
|
|
uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
|
|
switch (SrcOpTypes)
|
|
{
|
|
case SRCOP_RRR: TUnaryKernelHandler<Kernel, DstHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_RRC: TUnaryKernelHandler<Kernel, DstHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
default: check(0); break;
|
|
};
|
|
}
|
|
};
|
|
template<typename Kernel>
|
|
struct TUnaryScalarKernel : public TUnaryKernel<Kernel, FRegisterHandler<float>, FConstantHandler<float>, FRegisterHandler<float>, 1> {};
|
|
template<typename Kernel>
|
|
struct TUnaryVectorKernel : public TUnaryKernel<Kernel, FRegisterDestHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS> {};
|
|
template<typename Kernel>
|
|
struct TUnaryVectorIntKernel : public TUnaryKernel<Kernel, FRegisterDestHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS> {};
|
|
|
|
/** Base class of Vector kernels with 2 operands. */
|
|
template <typename Kernel, typename DstHandler, typename ConstHandler, typename RegisterHandler, uint32 NumInstancesPerOp>
|
|
struct TBinaryKernel
|
|
{
|
|
static void Exec(FVectorVMContext& Context)
|
|
{
|
|
uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
|
|
switch (SrcOpTypes)
|
|
{
|
|
case SRCOP_RRR: TBinaryKernelHandler<Kernel, DstHandler, RegisterHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_RRC: TBinaryKernelHandler<Kernel, DstHandler, ConstHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_RCR: TBinaryKernelHandler<Kernel, DstHandler, RegisterHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_RCC: TBinaryKernelHandler<Kernel, DstHandler, ConstHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
default: check(0); break;
|
|
};
|
|
}
|
|
};
|
|
template<typename Kernel>
|
|
struct TBinaryScalarKernel : public TBinaryKernel<Kernel, FRegisterHandler<float>, FConstantHandler<float>, FRegisterHandler<float>, 1> {};
|
|
template<typename Kernel>
|
|
struct TBinaryVectorKernel : public TBinaryKernel<Kernel, FRegisterDestHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS> {};
|
|
template<typename Kernel>
|
|
struct TBinaryVectorIntKernel : public TBinaryKernel<Kernel, FRegisterDestHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS> {};
|
|
|
|
/** Base class of Vector kernels with 3 operands. */
|
|
template <typename Kernel, typename DstHandler, typename ConstHandler, typename RegisterHandler, uint32 NumInstancesPerOp>
|
|
struct TTrinaryKernel
|
|
{
|
|
static void Exec(FVectorVMContext& Context)
|
|
{
|
|
uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
|
|
switch (SrcOpTypes)
|
|
{
|
|
case SRCOP_RRR: TTrinaryKernelHandler<Kernel, DstHandler, RegisterHandler, RegisterHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_RRC: TTrinaryKernelHandler<Kernel, DstHandler, ConstHandler, RegisterHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_RCR: TTrinaryKernelHandler<Kernel, DstHandler, RegisterHandler, ConstHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_RCC: TTrinaryKernelHandler<Kernel, DstHandler, ConstHandler, ConstHandler, RegisterHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_CRR: TTrinaryKernelHandler<Kernel, DstHandler, RegisterHandler, RegisterHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_CRC: TTrinaryKernelHandler<Kernel, DstHandler, ConstHandler, RegisterHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_CCR: TTrinaryKernelHandler<Kernel, DstHandler, RegisterHandler, ConstHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
case SRCOP_CCC: TTrinaryKernelHandler<Kernel, DstHandler, ConstHandler, ConstHandler, ConstHandler, NumInstancesPerOp>::Exec(Context); break;
|
|
default: check(0); break;
|
|
};
|
|
}
|
|
};
|
|
|
|
template<typename Kernel>
|
|
struct TTrinaryScalarKernel : public TTrinaryKernel<Kernel, FRegisterHandler<float>, FConstantHandler<float>, FRegisterHandler<float>, 1> {};
|
|
template<typename Kernel>
|
|
struct TTrinaryVectorKernel : public TTrinaryKernel<Kernel, FRegisterDestHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS> {};
|
|
template<typename Kernel>
|
|
struct TTrinaryVectorIntKernel : public TTrinaryKernel<Kernel, FRegisterDestHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS> {};
|
|
|
|
|
|
/*------------------------------------------------------------------------------
|
|
Implementation of all kernel operations.
|
|
------------------------------------------------------------------------------*/
|
|
|
|
struct FVectorKernelAdd : public TBinaryVectorKernel<FVectorKernelAdd>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
|
|
{
|
|
*Dst = VectorAdd(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelSub : public TBinaryVectorKernel<FVectorKernelSub>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
|
|
{
|
|
*Dst = VectorSubtract(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelMul : public TBinaryVectorKernel<FVectorKernelMul>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
|
|
{
|
|
*Dst = VectorMultiply(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelDiv : public TBinaryVectorKernel<FVectorKernelDiv>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorDivide(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
|
|
struct FVectorKernelMad : public TTrinaryVectorKernel<FVectorKernelMad>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2)
|
|
{
|
|
*Dst = VectorMultiplyAdd(Src0, Src1, Src2);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelLerp : public TTrinaryVectorKernel<FVectorKernelLerp>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2)
|
|
{
|
|
const VectorRegister OneMinusAlpha = VectorSubtract(GlobalVectorConstants::FloatOne, Src2);
|
|
const VectorRegister Tmp = VectorMultiply(Src0, OneMinusAlpha);
|
|
*Dst = VectorMultiplyAdd(Src1, Src2, Tmp);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelRcp : public TUnaryVectorKernel<FVectorKernelRcp>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
|
|
{
|
|
*Dst = VectorReciprocal(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelRsq : public TUnaryVectorKernel<FVectorKernelRsq>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
|
|
{
|
|
*Dst = VectorReciprocalSqrt(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelSqrt : public TUnaryVectorKernel<FVectorKernelSqrt>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
|
|
{
|
|
// TODO: Need a SIMD sqrt!
|
|
*Dst = VectorReciprocal(VectorReciprocalSqrt(Src0));
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelNeg : public TUnaryVectorKernel<FVectorKernelNeg>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
|
|
{
|
|
*Dst = VectorNegate(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelAbs : public TUnaryVectorKernel<FVectorKernelAbs>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0)
|
|
{
|
|
*Dst = VectorAbs(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelExp : public TUnaryVectorKernel<FVectorKernelExp>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorExp(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelExp2 : public TUnaryVectorKernel<FVectorKernelExp2>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorExp2(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelLog : public TUnaryVectorKernel<FVectorKernelLog>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorLog(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelLog2 : public TUnaryVectorKernel<FVectorKernelLog2>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorLog2(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelClamp : public TTrinaryVectorKernel<FVectorKernelClamp>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1,VectorRegister Src2)
|
|
{
|
|
const VectorRegister Tmp = VectorMax(Src0, Src1);
|
|
*Dst = VectorMin(Tmp, Src2);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelSin : public TUnaryVectorKernel<FVectorKernelSin>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorSin(VectorMultiply(Src0, GlobalVectorConstants::TwoPi));
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelCos : public TUnaryVectorKernel<FVectorKernelCos>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorCos(VectorMultiply(Src0, GlobalVectorConstants::TwoPi));
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelTan : public TUnaryVectorKernel<FVectorKernelTan>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorTan(VectorMultiply(Src0, GlobalVectorConstants::TwoPi));
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelASin : public TUnaryVectorKernel<FVectorKernelASin>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorMultiply(VectorASin(Src0), GlobalVectorConstants::OneOverTwoPi);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelACos : public TUnaryVectorKernel<FVectorKernelACos>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorMultiply(VectorACos(Src0), GlobalVectorConstants::OneOverTwoPi);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelATan : public TUnaryVectorKernel<FVectorKernelATan>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorMultiply(VectorATan(Src0), GlobalVectorConstants::OneOverTwoPi);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelATan2 : public TBinaryVectorKernel<FVectorKernelATan2>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorMultiply(VectorATan2(Src0, Src1), GlobalVectorConstants::OneOverTwoPi);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelCeil : public TUnaryVectorKernel<FVectorKernelCeil>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorCeil(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelFloor : public TUnaryVectorKernel<FVectorKernelFloor>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorFloor(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelRound : public TUnaryVectorKernel<FVectorKernelRound>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
//TODO: >SSE4 has direct ops for this.
|
|
VectorRegister Trunc = VectorTruncate(Src0);
|
|
*Dst = VectorAdd(Trunc, VectorTruncate(VectorMultiply(VectorSubtract(Src0, Trunc), GlobalVectorConstants::FloatAlmostTwo)));
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelMod : public TBinaryVectorKernel<FVectorKernelMod>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorMod(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelFrac : public TUnaryVectorKernel<FVectorKernelFrac>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorFractional(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelTrunc : public TUnaryVectorKernel<FVectorKernelTrunc>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorTruncate(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelCompareLT : public TBinaryVectorKernel<FVectorKernelCompareLT>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorCompareLT(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelCompareLE : public TBinaryVectorKernel<FVectorKernelCompareLE>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorCompareLE(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelCompareGT : public TBinaryVectorKernel<FVectorKernelCompareGT>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorCompareGT(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelCompareGE : public TBinaryVectorKernel<FVectorKernelCompareGE>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorCompareGE(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelCompareEQ : public TBinaryVectorKernel<FVectorKernelCompareEQ>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorCompareEQ(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelCompareNEQ : public TBinaryVectorKernel<FVectorKernelCompareNEQ>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
*Dst = VectorCompareNE(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelSelect : public TTrinaryVectorKernel<FVectorKernelSelect>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Mask, VectorRegister A, VectorRegister B)
|
|
{
|
|
*Dst = VectorSelect(Mask, A, B);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelExecutionIndex
|
|
{
|
|
static void VM_FORCEINLINE Exec(FVectorVMContext& Context)
|
|
{
|
|
static_assert(VECTOR_WIDTH_FLOATS == 4, "Need to update this when upgrading the VM to support >SSE2");
|
|
VectorRegisterInt VectorStride = MakeVectorRegisterInt(VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS, VECTOR_WIDTH_FLOATS);
|
|
VectorRegisterInt Index = MakeVectorRegisterInt(Context.StartInstance, Context.StartInstance + 1, Context.StartInstance + 2, Context.StartInstance + 3);
|
|
|
|
FRegisterDestHandler<VectorRegisterInt> Dest(Context);
|
|
int32 Loops = Align(Context.NumInstances, VECTOR_WIDTH_FLOATS) / VECTOR_WIDTH_FLOATS;
|
|
for (int32 i = 0; i < Loops; ++i)
|
|
{
|
|
*Dest.GetDest() = Index;
|
|
Dest.Advance();
|
|
Index = VectorIntAdd(Index, VectorStride);
|
|
}
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelEnterStatScope
|
|
{
|
|
static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
{
|
|
#if STATS
|
|
FConstantHandler<int32> ScopeIdx(Context);
|
|
int32 CounterIdx = Context.StatCounterStack.AddDefaulted(1);
|
|
Context.StatCounterStack[CounterIdx].Start(Context.StatScopes[ScopeIdx.Get()]);
|
|
#endif
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelExitStatScope
|
|
{
|
|
static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
{
|
|
#if STATS
|
|
Context.StatCounterStack.Last().Stop();
|
|
Context.StatCounterStack.Pop();
|
|
#endif
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelRandom : public TUnaryVectorKernel<FVectorKernelRandom>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
const float rm = RAND_MAX;
|
|
VectorRegister Result = MakeVectorRegister(static_cast<float>(FMath::Rand()) / rm,
|
|
static_cast<float>(FMath::Rand()) / rm,
|
|
static_cast<float>(FMath::Rand()) / rm,
|
|
static_cast<float>(FMath::Rand()) / rm);
|
|
*Dst = VectorMultiply(Result, Src0);
|
|
}
|
|
};
|
|
|
|
|
|
/* gaussian distribution random number (not working yet) */
|
|
struct FVectorKernelRandomGauss : public TBinaryVectorKernel<FVectorKernelRandomGauss>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0, VectorRegister Src1)
|
|
{
|
|
const float rm = RAND_MAX;
|
|
VectorRegister Result = MakeVectorRegister(static_cast<float>(FMath::Rand()) / rm,
|
|
static_cast<float>(FMath::Rand()) / rm,
|
|
static_cast<float>(FMath::Rand()) / rm,
|
|
static_cast<float>(FMath::Rand()) / rm);
|
|
|
|
Result = VectorSubtract(Result, MakeVectorRegister(0.5f, 0.5f, 0.5f, 0.5f));
|
|
Result = VectorMultiply(MakeVectorRegister(3.0f, 3.0f, 3.0f, 3.0f), Result);
|
|
|
|
// taylor series gaussian approximation
|
|
const VectorRegister SPi2 = VectorReciprocal(VectorReciprocalSqrt(MakeVectorRegister(2 * PI, 2 * PI, 2 * PI, 2 * PI)));
|
|
VectorRegister Gauss = VectorReciprocal(SPi2);
|
|
VectorRegister Div = VectorMultiply(MakeVectorRegister(2.0f, 2.0f, 2.0f, 2.0f), SPi2);
|
|
Gauss = VectorSubtract(Gauss, VectorDivide(VectorMultiply(Result, Result), Div));
|
|
Div = VectorMultiply(MakeVectorRegister(8.0f, 8.0f, 8.0f, 8.0f), SPi2);
|
|
Gauss = VectorAdd(Gauss, VectorDivide(VectorPow(MakeVectorRegister(4.0f, 4.0f, 4.0f, 4.0f), Result), Div));
|
|
Div = VectorMultiply(MakeVectorRegister(48.0f, 48.0f, 48.0f, 48.0f), SPi2);
|
|
Gauss = VectorSubtract(Gauss, VectorDivide(VectorPow(MakeVectorRegister(6.0f, 6.0f, 6.0f, 6.0f), Result), Div));
|
|
|
|
Gauss = VectorDivide(Gauss, MakeVectorRegister(0.4f, 0.4f, 0.4f, 0.4f));
|
|
Gauss = VectorMultiply(Gauss, Src0);
|
|
*Dst = Gauss;
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelMin : public TBinaryVectorKernel<FVectorKernelMin>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
|
|
{
|
|
*Dst = VectorMin(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelMax : public TBinaryVectorKernel<FVectorKernelMax>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
|
|
{
|
|
*Dst = VectorMax(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelPow : public TBinaryVectorKernel<FVectorKernelPow>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst,VectorRegister Src0,VectorRegister Src1)
|
|
{
|
|
*Dst = VectorPow(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelSign : public TUnaryVectorKernel<FVectorKernelSign>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorSign(Src0);
|
|
}
|
|
};
|
|
|
|
struct FVectorKernelStep : public TUnaryVectorKernel<FVectorKernelStep>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorStep(Src0);
|
|
}
|
|
};
|
|
|
|
namespace VectorVMNoise
|
|
{
|
|
int32 P[512] =
|
|
{
|
|
151,160,137,91,90,15,
|
|
131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
|
|
190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
|
|
88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
|
|
77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
|
|
102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
|
|
135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
|
|
5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
|
|
223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9,
|
|
129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
|
|
251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
|
|
49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,
|
|
138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180,
|
|
151,160,137,91,90,15,
|
|
131,13,201,95,96,53,194,233,7,225,140,36,103,30,69,142,8,99,37,240,21,10,23,
|
|
190, 6,148,247,120,234,75,0,26,197,62,94,252,219,203,117,35,11,32,57,177,33,
|
|
88,237,149,56,87,174,20,125,136,171,168, 68,175,74,165,71,134,139,48,27,166,
|
|
77,146,158,231,83,111,229,122,60,211,133,230,220,105,92,41,55,46,245,40,244,
|
|
102,143,54, 65,25,63,161, 1,216,80,73,209,76,132,187,208, 89,18,169,200,196,
|
|
135,130,116,188,159,86,164,100,109,198,173,186, 3,64,52,217,226,250,124,123,
|
|
5,202,38,147,118,126,255,82,85,212,207,206,59,227,47,16,58,17,182,189,28,42,
|
|
223,183,170,213,119,248,152, 2,44,154,163, 70,221,153,101,155,167, 43,172,9,
|
|
129,22,39,253, 19,98,108,110,79,113,224,232,178,185, 112,104,218,246,97,228,
|
|
251,34,242,193,238,210,144,12,191,179,162,241, 81,51,145,235,249,14,239,107,
|
|
49,192,214, 31,181,199,106,157,184, 84,204,176,115,121,50,45,127, 4,150,254,
|
|
138,236,205,93,222,114,67,29,24,72,243,141,128,195,78,66,215,61,156,180
|
|
};
|
|
|
|
static FORCEINLINE float Lerp(float X, float A, float B)
|
|
{
|
|
return A + X * (B - A);
|
|
}
|
|
|
|
static FORCEINLINE float Fade(float X)
|
|
{
|
|
return X * X * X * (X * (X * 6 - 15) + 10);
|
|
}
|
|
|
|
static FORCEINLINE float Grad(int32 hash, float x, float y, float z)
|
|
{
|
|
float u = (hash < 8) ? x : y;
|
|
float v = (hash < 4) ? y : ((hash == 12 || hash == 14) ? x : z);
|
|
return ((hash & 1) == 0 ? u : -u) + ((hash & 2) == 0 ? v : -v);
|
|
}
|
|
|
|
struct FScalarKernelNoise3D_iNoise : TTrinaryScalarKernel<FScalarKernelNoise3D_iNoise>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(float* RESTRICT Dst, float X, float Y, float Z)
|
|
{
|
|
float Xfl = FMath::FloorToFloat(X);
|
|
float Yfl = FMath::FloorToFloat(Y);
|
|
float Zfl = FMath::FloorToFloat(Z);
|
|
int32 Xi = (int32)(Xfl) & 255;
|
|
int32 Yi = (int32)(Yfl) & 255;
|
|
int32 Zi = (int32)(Zfl) & 255;
|
|
X -= Xfl;
|
|
Y -= Yfl;
|
|
Z -= Zfl;
|
|
float Xm1 = X - 1.0f;
|
|
float Ym1 = Y - 1.0f;
|
|
float Zm1 = Z - 1.0f;
|
|
|
|
int32 A = P[Xi] + Yi;
|
|
int32 AA = P[A] + Zi; int32 AB = P[A + 1] + Zi;
|
|
|
|
int32 B = P[Xi + 1] + Yi;
|
|
int32 BA = P[B] + Zi; int32 BB = P[B + 1] + Zi;
|
|
|
|
float U = Fade(X);
|
|
float V = Fade(Y);
|
|
float W = Fade(Z);
|
|
|
|
*Dst =
|
|
Lerp(W,
|
|
Lerp(V,
|
|
Lerp(U,
|
|
Grad(P[AA], X, Y, Z),
|
|
Grad(P[BA], Xm1, Y, Z)),
|
|
Lerp(U,
|
|
Grad(P[AB], X, Ym1, Z),
|
|
Grad(P[BB], Xm1, Ym1, Z))),
|
|
Lerp(V,
|
|
Lerp(U,
|
|
Grad(P[AA + 1], X, Y, Zm1),
|
|
Grad(P[BA + 1], Xm1, Y, Zm1)),
|
|
Lerp(U,
|
|
Grad(P[AB + 1], X, Ym1, Zm1),
|
|
Grad(P[BB + 1], Xm1, Ym1, Zm1))));
|
|
}
|
|
};
|
|
|
|
struct FScalarKernelNoise2D_iNoise : TBinaryScalarKernel<FScalarKernelNoise2D_iNoise>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(float* RESTRICT Dst, float X, float Y)
|
|
{
|
|
*Dst = 0.0f;//TODO
|
|
}
|
|
};
|
|
|
|
struct FScalarKernelNoise1D_iNoise : TUnaryScalarKernel<FScalarKernelNoise1D_iNoise>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(float* RESTRICT Dst, float X)
|
|
{
|
|
*Dst = 0.0f;//TODO;
|
|
}
|
|
};
|
|
|
|
static void Noise1D(FVectorVMContext& Context) { FScalarKernelNoise1D_iNoise::Exec(Context); }
|
|
static void Noise2D(FVectorVMContext& Context) { FScalarKernelNoise2D_iNoise::Exec(Context); }
|
|
static void Noise3D(FVectorVMContext& Context)
|
|
{
|
|
//Basic scalar implementation of perlin's improved noise until I can spend some quality time exploring vectorized implementations of Marc O's noise from Random.usf.
|
|
//http://mrl.nyu.edu/~perlin/noise/
|
|
FScalarKernelNoise3D_iNoise::Exec(Context);
|
|
}
|
|
};
|
|
|
|
//Olaf's orginal curl noise. Needs updating for the new scalar VM and possibly calling Curl Noise to avoid confusion with regular noise?
|
|
//Possibly needs to be a data interface as the VM can't output Vectors?
|
|
struct FVectorKernelNoise : public TUnaryVectorKernel<FVectorKernelNoise>
|
|
{
|
|
static VectorRegister RandomTable[17][17][17];
|
|
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* RESTRICT Dst, VectorRegister Src0)
|
|
{
|
|
const VectorRegister One = MakeVectorRegister(1.0f, 1.0f, 1.0f, 1.0f);
|
|
const VectorRegister VecSize = MakeVectorRegister(16.0f, 16.0f, 16.0f, 16.0f);
|
|
|
|
*Dst = MakeVectorRegister(0.0f, 0.0f, 0.0f, 0.0f);
|
|
|
|
for (uint32 i = 1; i < 2; i++)
|
|
{
|
|
float Di = 0.2f * (1.0f/(1<<i));
|
|
VectorRegister Div = MakeVectorRegister(Di, Di, Di, Di);
|
|
VectorRegister Coords = VectorMod( VectorAbs( VectorMultiply(Src0, Div) ), VecSize );
|
|
const float *CoordPtr = reinterpret_cast<float const*>(&Coords);
|
|
const int32 Cx = CoordPtr[0];
|
|
const int32 Cy = CoordPtr[1];
|
|
const int32 Cz = CoordPtr[2];
|
|
|
|
VectorRegister Frac = VectorFractional(Coords);
|
|
VectorRegister Alpha = VectorReplicate(Frac, 0);
|
|
VectorRegister OneMinusAlpha = VectorSubtract(One, Alpha);
|
|
|
|
VectorRegister XV1 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz], OneMinusAlpha));
|
|
VectorRegister XV2 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz], OneMinusAlpha));
|
|
VectorRegister XV3 = VectorMultiplyAdd(RandomTable[Cx][Cy][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy][Cz+1], OneMinusAlpha));
|
|
VectorRegister XV4 = VectorMultiplyAdd(RandomTable[Cx][Cy+1][Cz+1], Alpha, VectorMultiply(RandomTable[Cx+1][Cy+1][Cz+1], OneMinusAlpha));
|
|
|
|
Alpha = VectorReplicate(Frac, 1);
|
|
OneMinusAlpha = VectorSubtract(One, Alpha);
|
|
VectorRegister YV1 = VectorMultiplyAdd(XV1, Alpha, VectorMultiply(XV2, OneMinusAlpha));
|
|
VectorRegister YV2 = VectorMultiplyAdd(XV3, Alpha, VectorMultiply(XV4, OneMinusAlpha));
|
|
|
|
Alpha = VectorReplicate(Frac, 2);
|
|
OneMinusAlpha = VectorSubtract(One, Alpha);
|
|
VectorRegister ZV = VectorMultiplyAdd(YV1, Alpha, VectorMultiply(YV2, OneMinusAlpha));
|
|
|
|
*Dst = VectorAdd(*Dst, ZV);
|
|
}
|
|
}
|
|
};
|
|
|
|
VectorRegister FVectorKernelNoise::RandomTable[17][17][17];
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
//Special Kernels.
|
|
|
|
/** Special kernel for reading from the main input dataset. */
|
|
template<typename T>
|
|
struct FVectorKernelReadInput
|
|
{
|
|
static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
{
|
|
static const int32 InstancesPerVector = sizeof(VectorRegister) / sizeof(T);
|
|
|
|
int32 DataSetIndex = DecodeU16(Context);
|
|
int32 InputRegisterIdx = DecodeU16(Context);
|
|
int32 DestRegisterIdx = DecodeU16(Context);
|
|
int32 Loops = Align(Context.NumInstances, InstancesPerVector) / InstancesPerVector;
|
|
|
|
VectorRegister* DestReg = (VectorRegister*)(Context.RegisterTable[DestRegisterIdx]);
|
|
int32 DataSetOffset = Context.DataSetOffsetTable[DataSetIndex]; //TODO: we'll need a different way of doing this for a proper kernel; might need a specific offset handler
|
|
VectorRegister* InputReg = (VectorRegister*)((T*)(Context.RegisterTable[InputRegisterIdx + DataSetOffset]) + Context.StartInstance);
|
|
|
|
//TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others.
|
|
for (int32 i = 0; i < Loops; ++i)
|
|
{
|
|
*DestReg = VectorLoad(InputReg);
|
|
++DestReg;
|
|
++InputReg;
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
|
|
/** Special kernel for reading from an input dataset; non-advancing (reads same instance everytime).
|
|
* this kernel splats the X component of the source register to all 4 dest components; it's meant to
|
|
* use scalar data sets as the source (e.g. events)
|
|
*/
|
|
template<typename T>
|
|
struct FVectorKernelReadInputNoAdvance
|
|
{
|
|
static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
{
|
|
static const int32 InstancesPerVector = sizeof(VectorRegister) / sizeof(T);
|
|
|
|
int32 DataSetIndex = DecodeU16(Context);
|
|
int32 InputRegisterIdx = DecodeU16(Context);
|
|
int32 DestRegisterIdx = DecodeU16(Context);
|
|
int32 Loops = Align(Context.NumInstances, InstancesPerVector) / InstancesPerVector;
|
|
|
|
VectorRegister* DestReg = (VectorRegister*)(Context.RegisterTable[DestRegisterIdx]);
|
|
int32 DataSetOffset = Context.DataSetOffsetTable[DataSetIndex]; //TODO: we'll need a different way of doing this for a proper kernel; might need a specific offset handler
|
|
VectorRegister* InputReg = (VectorRegister*)((T*)(Context.RegisterTable[InputRegisterIdx + DataSetOffset]) );
|
|
|
|
//TODO: We can actually do some scalar loads into the first and final vectors to get around alignment issues and then use the aligned load for all others.
|
|
for (int32 i = 0; i < Loops; ++i)
|
|
{
|
|
*DestReg = VectorSwizzle(VectorLoad(InputReg), 0,0,0,0);
|
|
++DestReg;
|
|
}
|
|
}
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
//TODO - Should be straight forwards to follow the input with a mix of the outputs direct indexing
|
|
/** Special kernel for reading an specific location in an input register. */
|
|
// template<typename T>
|
|
// struct FScalarKernelReadInputIndexed
|
|
// {
|
|
// static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
// {
|
|
// int32* IndexReg = (int32*)(Context.RegisterTable[DecodeU16(Context)]);
|
|
// T* InputReg = (T*)(Context.RegisterTable[DecodeU16(Context)]);
|
|
// T* DestReg = (T*)(Context.RegisterTable[DecodeU16(Context)]);
|
|
//
|
|
// //Has to be scalar as each instance can read from a different location in the input buffer.
|
|
// for (int32 i = 0; i < Context.NumInstances; ++i)
|
|
// {
|
|
// T* ReadPtr = (*InputReg) + (*IndexReg);
|
|
// *DestReg = (*ReadPtr);
|
|
// ++IndexReg;
|
|
// ++DestReg;
|
|
// }
|
|
// }
|
|
// };
|
|
|
|
//Needs it's own handler as the output registers are indexed absolutely rather than incrementing in advance().
|
|
template<typename T>
|
|
struct FOutputRegisterHandler : public FRegisterHandlerBase
|
|
{
|
|
T* Register;
|
|
FOutputRegisterHandler(FVectorVMContext& Context, uint32 DataSetOffset)
|
|
: FRegisterHandlerBase(Context)
|
|
, Register((T*)Context.RegisterTable[RegisterIndex+DataSetOffset])
|
|
{}
|
|
|
|
VM_FORCEINLINE void Advance() { }
|
|
VM_FORCEINLINE T Get() { return *Register; }
|
|
VM_FORCEINLINE T* GetDest() { return Register; }
|
|
};
|
|
|
|
/** Special kernel for writing to a specific output register. */
|
|
template<typename T>
|
|
struct FScalarKernelWriteOutputIndexed
|
|
{
|
|
static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
{
|
|
uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
|
|
switch (SrcOpTypes)
|
|
{
|
|
case SRCOP_RRR: TTrinaryOutputKernelHandler<FScalarKernelWriteOutputIndexed, FOutputRegisterHandler<T>, FDataSetOffsetHandler, FRegisterHandler<int32>, FRegisterHandler<T>, 1>::Exec(Context); break;
|
|
case SRCOP_RRC: TTrinaryOutputKernelHandler<FScalarKernelWriteOutputIndexed, FOutputRegisterHandler<T>, FDataSetOffsetHandler, FRegisterHandler<int32>, FConstantHandler<T>, 1>::Exec(Context); break;
|
|
default: check(0); break;
|
|
};
|
|
}
|
|
|
|
static VM_FORCEINLINE void DoKernel(T* RESTRICT Dst, int32 SetOffset, int32 Index, T Data)
|
|
{
|
|
if (Index != INDEX_NONE)
|
|
{
|
|
Dst[Index] = Data;//TODO: On sse4 we can use _mm_stream_ss here.
|
|
}
|
|
}
|
|
};
|
|
|
|
struct FDataSetCounterHandler
|
|
{
|
|
int32* Counter;
|
|
FDataSetCounterHandler(FVectorVMContext& Context)
|
|
: Counter(Context.DataSetIndexTable + DecodeU16(Context))
|
|
{}
|
|
|
|
VM_FORCEINLINE void Advance() { }
|
|
VM_FORCEINLINE int32* Get() { return Counter; }
|
|
//VM_FORCEINLINE const int32* GetDest() { return Counter; }Should never use as a dest. All kernels with read and write to this.
|
|
};
|
|
|
|
struct FScalarKernelAcquireCounterIndex
|
|
{
|
|
static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
{
|
|
uint32 SrcOpTypes = DecodeSrcOperandTypes(Context);
|
|
switch (SrcOpTypes)
|
|
{
|
|
case SRCOP_RRR: TBinaryKernelHandler<FScalarKernelAcquireCounterIndex, FRegisterDestHandler<int32>, FDataSetCounterHandler, FRegisterHandler<int32>, 1>::Exec(Context); break;
|
|
case SRCOP_RRC: TBinaryKernelHandler<FScalarKernelAcquireCounterIndex, FRegisterDestHandler<int32>, FDataSetCounterHandler, FConstantHandler<int32>, 1>::Exec(Context); break;
|
|
default: check(0); break;
|
|
};
|
|
}
|
|
|
|
static VM_FORCEINLINE void DoKernel(int32* RESTRICT Dst, int32* Index, int32 Valid)
|
|
{
|
|
if (*Index != INDEX_NONE && Valid != 0)
|
|
{
|
|
*Dst = (*Index)++;
|
|
}
|
|
else
|
|
{
|
|
*Dst = INDEX_NONE; // Subsequent DoKernal calls above will skip over INDEX_NONE register entries...
|
|
}
|
|
}
|
|
};
|
|
|
|
//TODO: REWORK TO FUNCITON LIKE THE ABOVE.
|
|
// /** Special kernel for decrementing a dataset counter. */
|
|
// struct FScalarKernelReleaseCounterIndex
|
|
// {
|
|
// static VM_FORCEINLINE void Exec(FVectorVMContext& Context)
|
|
// {
|
|
// int32* CounterPtr = (int32*)(Context.ConstantTable[DecodeU16(Context)]);
|
|
// int32* DestReg = (int32*)(Context.RegisterTable[DecodeU16(Context)]);
|
|
//
|
|
// for (int32 i = 0; i < Context.NumInstances; ++i)
|
|
// {
|
|
// int32 Counter = (*CounterPtr--);
|
|
// *DestReg = Counter >= 0 ? Counter : INDEX_NONE;
|
|
//
|
|
// ++DestReg;
|
|
// }
|
|
// }
|
|
// };
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
//external_func_call
|
|
|
|
struct FKernelExternalFunctionCall
|
|
{
|
|
static void Exec(FVectorVMContext& Context)
|
|
{
|
|
uint32 ExternalFuncIdx = DecodeU8(Context);
|
|
Context.ExternalFunctionTable[ExternalFuncIdx].Execute(Context);
|
|
}
|
|
};
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
//Integer operations
|
|
|
|
//addi,
|
|
struct FVectorIntKernelAdd : TBinaryVectorIntKernel<FVectorIntKernelAdd>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntAdd(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//subi,
|
|
struct FVectorIntKernelSubtract : TBinaryVectorIntKernel<FVectorIntKernelSubtract>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntSubtract(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//muli,
|
|
struct FVectorIntKernelMultiply : TBinaryVectorIntKernel<FVectorIntKernelMultiply>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntMultiply(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//clampi,
|
|
struct FVectorIntKernelClamp : TTrinaryVectorIntKernel<FVectorIntKernelClamp>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1, VectorRegisterInt Src2)
|
|
{
|
|
*Dst = VectorIntMin(VectorIntMax(Src0, Src1), Src2);
|
|
}
|
|
};
|
|
|
|
//mini,
|
|
struct FVectorIntKernelMin : TBinaryVectorIntKernel<FVectorIntKernelMin>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntMin(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//maxi,
|
|
struct FVectorIntKernelMax : TBinaryVectorIntKernel<FVectorIntKernelMax>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntMax(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//absi,
|
|
struct FVectorIntKernelAbs : TUnaryVectorIntKernel<FVectorIntKernelAbs>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0)
|
|
{
|
|
*Dst = VectorIntAbs(Src0);
|
|
}
|
|
};
|
|
|
|
//negi,
|
|
struct FVectorIntKernelNegate : TUnaryVectorIntKernel<FVectorIntKernelNegate>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0)
|
|
{
|
|
*Dst = VectorIntNegate(Src0);
|
|
}
|
|
};
|
|
|
|
//signi,
|
|
struct FVectorIntKernelSign : TUnaryVectorIntKernel<FVectorIntKernelSign>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0)
|
|
{
|
|
*Dst = VectorIntSign(Src0);
|
|
}
|
|
};
|
|
|
|
//cmplti,
|
|
struct FVectorIntKernelCompareLT : TBinaryVectorIntKernel<FVectorIntKernelCompareLT>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntCompareLT(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//cmplei,
|
|
struct FVectorIntKernelCompareLE : TBinaryVectorIntKernel<FVectorIntKernelCompareLE>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntCompareLE(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//cmpgti,
|
|
struct FVectorIntKernelCompareGT : TBinaryVectorIntKernel<FVectorIntKernelCompareGT>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntCompareGT(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//cmpgei,
|
|
struct FVectorIntKernelCompareGE : TBinaryVectorIntKernel<FVectorIntKernelCompareGE>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntCompareGE(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//cmpeqi,
|
|
struct FVectorIntKernelCompareEQ : TBinaryVectorIntKernel<FVectorIntKernelCompareEQ>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntCompareEQ(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//cmpneqi,
|
|
struct FVectorIntKernelCompareNEQ : TBinaryVectorIntKernel<FVectorIntKernelCompareNEQ>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntCompareNEQ(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//bit_and,
|
|
struct FVectorIntKernelBitAnd : TBinaryVectorIntKernel<FVectorIntKernelBitAnd>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntAnd(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//bit_or,
|
|
struct FVectorIntKernelBitOr : TBinaryVectorIntKernel<FVectorIntKernelBitOr>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntOr(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//bit_xor,
|
|
struct FVectorIntKernelBitXor : TBinaryVectorIntKernel<FVectorIntKernelBitXor>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
*Dst = VectorIntXor(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//bit_not,
|
|
struct FVectorIntKernelBitNot : TUnaryVectorIntKernel<FVectorIntKernelBitNot>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0)
|
|
{
|
|
*Dst = VectorIntNot(Src0);
|
|
}
|
|
};
|
|
|
|
//"Boolean" ops. Currently handling bools as integers.
|
|
//logic_and,
|
|
struct FVectorIntKernelLogicAnd : TBinaryVectorIntKernel<FVectorIntKernelLogicAnd>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
//We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these?
|
|
*Dst = VectorIntAnd(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//logic_or,
|
|
struct FVectorIntKernelLogicOr : TBinaryVectorIntKernel<FVectorIntKernelLogicOr>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
//We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these?
|
|
*Dst = VectorIntOr(Src0, Src1);
|
|
}
|
|
};
|
|
//logic_xor,
|
|
struct FVectorIntKernelLogicXor : TBinaryVectorIntKernel<FVectorIntKernelLogicXor>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0, VectorRegisterInt Src1)
|
|
{
|
|
//We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these?
|
|
*Dst = VectorIntXor(Src0, Src1);
|
|
}
|
|
};
|
|
|
|
//logic_not,
|
|
struct FVectorIntKernelLogicNot : TUnaryVectorIntKernel<FVectorIntKernelLogicNot>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0)
|
|
{
|
|
//We need to assume a mask input and produce a mask output so just bitwise ops actually fine for these?
|
|
*Dst = VectorIntNot(Src0);
|
|
}
|
|
};
|
|
|
|
//conversions
|
|
//f2i,
|
|
struct FVectorKernelFloatToInt : TUnaryKernel<FVectorKernelFloatToInt, FRegisterDestHandler<VectorRegisterInt>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorFloatToInt(Src0);
|
|
}
|
|
};
|
|
|
|
//i2f,
|
|
struct FVectorKernelIntToFloat : TUnaryKernel<FVectorKernelIntToFloat, FRegisterDestHandler<VectorRegister>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* Dst, VectorRegisterInt Src0)
|
|
{
|
|
*Dst = VectorIntToFloat(Src0);
|
|
}
|
|
};
|
|
|
|
//f2b,
|
|
struct FVectorKernelFloatToBool : TUnaryKernel<FVectorKernelFloatToBool, FRegisterDestHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorCompareGT(Src0, GlobalVectorConstants::FloatZero);
|
|
}
|
|
};
|
|
|
|
//b2f,
|
|
struct FVectorKernelBoolToFloat : TUnaryKernel<FVectorKernelBoolToFloat, FRegisterDestHandler<VectorRegister>, FConstantHandler<VectorRegister>, FRegisterHandler<VectorRegister>, VECTOR_WIDTH_FLOATS>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegister* Dst, VectorRegister Src0)
|
|
{
|
|
*Dst = VectorSelect(Src0, GlobalVectorConstants::FloatOne, GlobalVectorConstants::FloatZero);
|
|
}
|
|
};
|
|
|
|
//i2b,
|
|
struct FVectorKernelIntToBool : TUnaryKernel<FVectorKernelIntToBool, FRegisterDestHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0)
|
|
{
|
|
*Dst = VectorIntCompareGT(Src0, GlobalVectorConstants::IntZero);
|
|
}
|
|
};
|
|
|
|
//b2i,
|
|
struct FVectorKernelBoolToInt : TUnaryKernel<FVectorKernelBoolToInt, FRegisterDestHandler<VectorRegisterInt>, FConstantHandler<VectorRegisterInt>, FRegisterHandler<VectorRegisterInt>, VECTOR_WIDTH_FLOATS>
|
|
{
|
|
static void VM_FORCEINLINE DoKernel(VectorRegisterInt* Dst, VectorRegisterInt Src0)
|
|
{
|
|
*Dst = VectorIntSelect(Src0, GlobalVectorConstants::IntOne, GlobalVectorConstants::IntZero);
|
|
}
|
|
};
|
|
|
|
|
|
void VectorVM::Init()
|
|
{
|
|
static bool Inited = false;
|
|
if (Inited == false)
|
|
{
|
|
// random noise
|
|
float TempTable[17][17][17];
|
|
for (int z = 0; z < 17; z++)
|
|
{
|
|
for (int y = 0; y < 17; y++)
|
|
{
|
|
for (int x = 0; x < 17; x++)
|
|
{
|
|
float f1 = (float)FMath::FRandRange(-1.0f, 1.0f);
|
|
TempTable[x][y][z] = f1;
|
|
}
|
|
}
|
|
}
|
|
|
|
// pad
|
|
for (int i = 0; i < 17; i++)
|
|
{
|
|
for (int j = 0; j < 17; j++)
|
|
{
|
|
TempTable[i][j][16] = TempTable[i][j][0];
|
|
TempTable[i][16][j] = TempTable[i][0][j];
|
|
TempTable[16][j][i] = TempTable[0][j][i];
|
|
}
|
|
}
|
|
|
|
// compute gradients
|
|
FVector TempTable2[17][17][17];
|
|
for (int z = 0; z < 16; z++)
|
|
{
|
|
for (int y = 0; y < 16; y++)
|
|
{
|
|
for (int x = 0; x < 16; x++)
|
|
{
|
|
FVector XGrad = FVector(1.0f, 0.0f, TempTable[x][y][z] - TempTable[x+1][y][z]);
|
|
FVector YGrad = FVector(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y + 1][z]);
|
|
FVector ZGrad = FVector(0.0f, 1.0f, TempTable[x][y][z] - TempTable[x][y][z+1]);
|
|
|
|
FVector Grad = FVector(XGrad.Z, YGrad.Z, ZGrad.Z);
|
|
TempTable2[x][y][z] = Grad;
|
|
}
|
|
}
|
|
}
|
|
|
|
// pad
|
|
for (int i = 0; i < 17; i++)
|
|
{
|
|
for (int j = 0; j < 17; j++)
|
|
{
|
|
TempTable2[i][j][16] = TempTable2[i][j][0];
|
|
TempTable2[i][16][j] = TempTable2[i][0][j];
|
|
TempTable2[16][j][i] = TempTable2[0][j][i];
|
|
}
|
|
}
|
|
|
|
|
|
// compute curl of gradient field
|
|
for (int z = 0; z < 16; z++)
|
|
{
|
|
for (int y = 0; y < 16; y++)
|
|
{
|
|
for (int x = 0; x < 16; x++)
|
|
{
|
|
FVector Dy = TempTable2[x][y][z] - TempTable2[x][y + 1][z];
|
|
FVector Sy = TempTable2[x][y][z] + TempTable2[x][y + 1][z];
|
|
FVector Dx = TempTable2[x][y][z] - TempTable2[x + 1][y][z];
|
|
FVector Sx = TempTable2[x][y][z] + TempTable2[x + 1][y][z];
|
|
FVector Dz = TempTable2[x][y][z] - TempTable2[x][y][z + 1];
|
|
FVector Sz = TempTable2[x][y][z] + TempTable2[x][y][z + 1];
|
|
FVector Dir = FVector(Dy.Z - Sz.Y, Dz.X - Sx.Z, Dx.Y - Sy.X);
|
|
|
|
FVectorKernelNoise::RandomTable[x][y][z] = MakeVectorRegister(Dir.X, Dir.Y, Dir.Z, 0.0f);
|
|
}
|
|
}
|
|
}
|
|
|
|
Inited = true;
|
|
}
|
|
}
|
|
|
|
void VectorVM::Exec(
|
|
uint8 const* Code,
|
|
uint8** InputRegisters,
|
|
uint8* InputRegisterSizes,
|
|
int32 NumInputRegisters,
|
|
uint8** OutputRegisters,
|
|
uint8* OutputRegisterSizes,
|
|
int32 NumOutputRegisters,
|
|
uint8 const* ConstantTable,
|
|
TArray<FDataSetMeta> &DataSetMetaTable,
|
|
FVMExternalFunction* ExternalFunctionTable,
|
|
int32 NumInstances
|
|
|
|
#if STATS
|
|
, TArray<TStatId>& StatScopes
|
|
#endif
|
|
)
|
|
{
|
|
uint32 TempRegisterSize = Align((InstancesPerChunk) * MaxInstanceSizeBytes, VECTOR_WIDTH_BYTES) + VECTOR_WIDTH_BYTES;
|
|
//TODO: Refactor this so VMs are a persistent object with growing buffers. Once spun up, there are no allocs.
|
|
//Can be pooled and used for threading and branching.
|
|
TArray<uint8,TAlignedHeapAllocator<VECTOR_WIDTH_BYTES>> TempRegTable;
|
|
TempRegTable.SetNumUninitialized(TempRegisterSize * NumTempRegisters);
|
|
uint8* RegisterTable[MaxRegisters] = {0};
|
|
|
|
|
|
// Map temporary registers.
|
|
for (int32 i = 0; i < NumTempRegisters; ++i)
|
|
{
|
|
RegisterTable[i] = TempRegTable.GetData() + TempRegisterSize * i;
|
|
}
|
|
|
|
// Map input and output registers.
|
|
//Input and output registers are indexed absolutely directly in their kernels.
|
|
//TODO: No need for these to be in the same table now.
|
|
//TODO: Also no need for the i/o size table as the ops will deal with that now.
|
|
for (int32 i = 0; i < NumInputRegisters; ++i)
|
|
{
|
|
RegisterTable[NumTempRegisters + i] = InputRegisters[i];
|
|
}
|
|
for (int32 i = 0; i < NumOutputRegisters; ++i)
|
|
{
|
|
RegisterTable[NumTempRegisters + MaxInputRegisters + i] = OutputRegisters[i];
|
|
}
|
|
|
|
// table of index counters, one for each data set
|
|
TArray<int32> DataSetIndexTable;
|
|
|
|
// map secondary data sets and fill in the offset table into the register table
|
|
//
|
|
TArray<int32> DataSetOffsetTable;
|
|
for (int32 Idx = 0; Idx < DataSetMetaTable.Num(); Idx++)
|
|
{
|
|
uint32 DataSetOffset = DataSetMetaTable[Idx].NumVariables;
|
|
DataSetOffsetTable.Add(DataSetOffset);
|
|
DataSetIndexTable.Add(DataSetMetaTable[Idx].DataSetAccessIndex); // prime counter index table with the data set offset; will be incremented with every write for each instance
|
|
}
|
|
|
|
|
|
|
|
// Process one chunk at a time.
|
|
int32 InstancesLeft = NumInstances;
|
|
int32 ChunkIdx = 0;
|
|
while (InstancesLeft > 0)
|
|
{
|
|
// Setup execution context.
|
|
FVectorVMContext Context(Code, RegisterTable, ConstantTable, DataSetIndexTable.GetData(), DataSetOffsetTable.GetData(),
|
|
ExternalFunctionTable, FMath::Min(InstancesLeft, (int32)InstancesPerChunk), InstancesPerChunk * ChunkIdx
|
|
#if STATS
|
|
, StatScopes
|
|
#endif
|
|
);
|
|
Context.NumSecondaryDataSets = DataSetOffsetTable.Num();
|
|
EVectorVMOp Op = EVectorVMOp::done;
|
|
|
|
// Execute VM on all vectors in this chunk.
|
|
do
|
|
{
|
|
Op = DecodeOp(Context);
|
|
switch (Op)
|
|
{
|
|
// Dispatch kernel ops.
|
|
case EVectorVMOp::add: FVectorKernelAdd::Exec(Context); break;
|
|
case EVectorVMOp::sub: FVectorKernelSub::Exec(Context); break;
|
|
case EVectorVMOp::mul: FVectorKernelMul::Exec(Context); break;
|
|
case EVectorVMOp::div: FVectorKernelDiv::Exec(Context); break;
|
|
case EVectorVMOp::mad: FVectorKernelMad::Exec(Context); break;
|
|
case EVectorVMOp::lerp: FVectorKernelLerp::Exec(Context); break;
|
|
case EVectorVMOp::rcp: FVectorKernelRcp::Exec(Context); break;
|
|
case EVectorVMOp::rsq: FVectorKernelRsq::Exec(Context); break;
|
|
case EVectorVMOp::sqrt: FVectorKernelSqrt::Exec(Context); break;
|
|
case EVectorVMOp::neg: FVectorKernelNeg::Exec(Context); break;
|
|
case EVectorVMOp::abs: FVectorKernelAbs::Exec(Context); break;
|
|
case EVectorVMOp::exp: FVectorKernelExp::Exec(Context); break;
|
|
case EVectorVMOp::exp2: FVectorKernelExp2::Exec(Context); break;
|
|
case EVectorVMOp::log: FVectorKernelLog::Exec(Context); break;
|
|
case EVectorVMOp::log2: FVectorKernelLog2::Exec(Context); break;
|
|
case EVectorVMOp::sin: FVectorKernelSin::Exec(Context); break;
|
|
case EVectorVMOp::cos: FVectorKernelCos::Exec(Context); break;
|
|
case EVectorVMOp::tan: FVectorKernelTan::Exec(Context); break;
|
|
case EVectorVMOp::asin: FVectorKernelASin::Exec(Context); break;
|
|
case EVectorVMOp::acos: FVectorKernelACos::Exec(Context); break;
|
|
case EVectorVMOp::atan: FVectorKernelATan::Exec(Context); break;
|
|
case EVectorVMOp::atan2: FVectorKernelATan2::Exec(Context); break;
|
|
case EVectorVMOp::ceil: FVectorKernelCeil::Exec(Context); break;
|
|
case EVectorVMOp::floor: FVectorKernelFloor::Exec(Context); break;
|
|
case EVectorVMOp::round: FVectorKernelRound::Exec(Context); break;
|
|
case EVectorVMOp::fmod: FVectorKernelMod::Exec(Context); break;
|
|
case EVectorVMOp::frac: FVectorKernelFrac::Exec(Context); break;
|
|
case EVectorVMOp::trunc: FVectorKernelTrunc::Exec(Context); break;
|
|
case EVectorVMOp::clamp: FVectorKernelClamp::Exec(Context); break;
|
|
case EVectorVMOp::min: FVectorKernelMin::Exec(Context); break;
|
|
case EVectorVMOp::max: FVectorKernelMax::Exec(Context); break;
|
|
case EVectorVMOp::pow: FVectorKernelPow::Exec(Context); break;
|
|
case EVectorVMOp::sign: FVectorKernelSign::Exec(Context); break;
|
|
case EVectorVMOp::step: FVectorKernelStep::Exec(Context); break;
|
|
case EVectorVMOp::random: FVectorKernelRandom::Exec(Context); break;
|
|
case EVectorVMOp::noise: VectorVMNoise::Noise1D(Context); break;
|
|
case EVectorVMOp::noise2D: VectorVMNoise::Noise2D(Context); break;
|
|
case EVectorVMOp::noise3D: VectorVMNoise::Noise3D(Context); break;
|
|
|
|
case EVectorVMOp::cmplt: FVectorKernelCompareLT::Exec(Context); break;
|
|
case EVectorVMOp::cmple: FVectorKernelCompareLE::Exec(Context); break;
|
|
case EVectorVMOp::cmpgt: FVectorKernelCompareGT::Exec(Context); break;
|
|
case EVectorVMOp::cmpge: FVectorKernelCompareGE::Exec(Context); break;
|
|
case EVectorVMOp::cmpeq: FVectorKernelCompareEQ::Exec(Context); break;
|
|
case EVectorVMOp::cmpneq: FVectorKernelCompareNEQ::Exec(Context); break;
|
|
case EVectorVMOp::select: FVectorKernelSelect::Exec(Context); break;
|
|
|
|
case EVectorVMOp::addi: FVectorIntKernelAdd::Exec(Context); break;
|
|
case EVectorVMOp::subi: FVectorIntKernelSubtract::Exec(Context); break;
|
|
case EVectorVMOp::muli: FVectorIntKernelMultiply::Exec(Context); break;
|
|
case EVectorVMOp::clampi: FVectorIntKernelClamp::Exec(Context); break;
|
|
case EVectorVMOp::mini: FVectorIntKernelMin::Exec(Context); break;
|
|
case EVectorVMOp::maxi: FVectorIntKernelMax::Exec(Context); break;
|
|
case EVectorVMOp::absi: FVectorIntKernelAbs::Exec(Context); break;
|
|
case EVectorVMOp::negi: FVectorIntKernelNegate::Exec(Context); break;
|
|
case EVectorVMOp::signi: FVectorIntKernelSign::Exec(Context); break;
|
|
case EVectorVMOp::cmplti: FVectorIntKernelCompareLT::Exec(Context); break;
|
|
case EVectorVMOp::cmplei: FVectorIntKernelCompareLE::Exec(Context); break;
|
|
case EVectorVMOp::cmpgti: FVectorIntKernelCompareGT::Exec(Context); break;
|
|
case EVectorVMOp::cmpgei: FVectorIntKernelCompareGE::Exec(Context); break;
|
|
case EVectorVMOp::cmpeqi: FVectorIntKernelCompareEQ::Exec(Context); break;
|
|
case EVectorVMOp::cmpneqi: FVectorIntKernelCompareNEQ::Exec(Context); break;
|
|
case EVectorVMOp::bit_and: FVectorIntKernelBitAnd::Exec(Context); break;
|
|
case EVectorVMOp::bit_or: FVectorIntKernelBitOr::Exec(Context); break;
|
|
case EVectorVMOp::bit_xor: FVectorIntKernelBitXor::Exec(Context); break;
|
|
case EVectorVMOp::bit_not: FVectorIntKernelBitNot::Exec(Context); break;
|
|
case EVectorVMOp::logic_and: FVectorIntKernelLogicAnd::Exec(Context); break;
|
|
case EVectorVMOp::logic_or: FVectorIntKernelLogicOr::Exec(Context); break;
|
|
case EVectorVMOp::logic_xor: FVectorIntKernelLogicXor::Exec(Context); break;
|
|
case EVectorVMOp::logic_not: FVectorIntKernelLogicNot::Exec(Context); break;
|
|
case EVectorVMOp::f2i: FVectorKernelFloatToInt::Exec(Context); break;
|
|
case EVectorVMOp::i2f: FVectorKernelIntToFloat::Exec(Context); break;
|
|
case EVectorVMOp::f2b: FVectorKernelFloatToBool::Exec(Context); break;
|
|
case EVectorVMOp::b2f: FVectorKernelBoolToFloat::Exec(Context); break;
|
|
case EVectorVMOp::i2b: FVectorKernelIntToBool::Exec(Context); break;
|
|
case EVectorVMOp::b2i: FVectorKernelBoolToInt::Exec(Context); break;
|
|
|
|
case EVectorVMOp::outputdata_32bit: FScalarKernelWriteOutputIndexed<int32>::Exec(Context); break;
|
|
case EVectorVMOp::inputdata_32bit: FVectorKernelReadInput<int32>::Exec(Context); break;
|
|
//case EVectorVMOp::inputdata_32bit: FVectorKernelReadInput32::Exec(Context); break;
|
|
case EVectorVMOp::inputdata_noadvance_32bit: FVectorKernelReadInputNoAdvance<int32>::Exec(Context); break;
|
|
case EVectorVMOp::acquireindex: FScalarKernelAcquireCounterIndex::Exec(Context); break;
|
|
case EVectorVMOp::external_func_call: FKernelExternalFunctionCall::Exec(Context); break;
|
|
|
|
case EVectorVMOp::exec_index: FVectorKernelExecutionIndex::Exec(Context); break;
|
|
|
|
case EVectorVMOp::enter_stat_scope: FVectorKernelEnterStatScope::Exec(Context); break;
|
|
case EVectorVMOp::exit_stat_scope: FVectorKernelExitStatScope::Exec(Context); break;
|
|
|
|
// Execution always terminates with a "done" opcode.
|
|
case EVectorVMOp::done:
|
|
break;
|
|
|
|
// Opcode not recognized / implemented.
|
|
default:
|
|
UE_LOG(LogVectorVM, Error, TEXT("Unknown op code 0x%02x"), (uint32)Op);
|
|
return;//BAIL
|
|
}
|
|
} while (Op != EVectorVMOp::done);
|
|
|
|
InstancesLeft -= InstancesPerChunk;
|
|
++ChunkIdx;
|
|
}
|
|
|
|
// write back data set access indices, so we know how much was written to each data set
|
|
for (int32 Idx = 0; Idx < DataSetMetaTable.Num(); Idx++)
|
|
{
|
|
DataSetMetaTable[Idx].DataSetAccessIndex = DataSetIndexTable[Idx];
|
|
}
|
|
|
|
}
|
|
|
|
uint8 VectorVM::GetNumOpCodes()
|
|
{
|
|
return (uint8)EVectorVMOp::NumOpcodes;
|
|
}
|
|
|
|
#if WITH_EDITOR
|
|
FString VectorVM::GetOpName(EVectorVMOp Op)
|
|
{
|
|
static UEnum* EnumStateObj = FindObject<UEnum>(ANY_PACKAGE, TEXT("EVectorVMOp"), true);
|
|
check(EnumStateObj);
|
|
|
|
FString OpStr = EnumStateObj->GetNameByValue((uint8)Op).ToString();
|
|
int32 LastIdx = 0;
|
|
OpStr.FindLastChar(TEXT(':'),LastIdx);
|
|
return OpStr.RightChop(LastIdx);
|
|
}
|
|
|
|
FString VectorVM::GetOperandLocationName(EVectorVMOperandLocation Location)
|
|
{
|
|
static UEnum* EnumStateObj = FindObject<UEnum>(ANY_PACKAGE, TEXT("EVectorVMOperandLocation"), true);
|
|
check(EnumStateObj);
|
|
|
|
FString LocStr = EnumStateObj->GetNameByValue((uint8)Location).ToString();
|
|
int32 LastIdx = 0;
|
|
LocStr.FindLastChar(TEXT(':'), LastIdx);
|
|
return LocStr.RightChop(LastIdx);
|
|
}
|
|
#endif
|
|
|
|
#undef VM_FORCEINLINE
|