// Files
// UnrealEngineUWP/Engine/Shaders/Private/SceneData.ush
//
// 1190 lines
// 49 KiB
// Plaintext
// Raw Normal View History

// Copyright Epic Games, Inc. All Rights Reserved.
#pragma once
// Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat().
// The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
// 2021-10-12 13:31:00 -04:00
#include "LargeWorldCoordinates.ush"
#include "OctahedralCommon.ush"
#include "/Engine/Shared/NaniteDefinitions.h"
// Optional feature switches. Each one defaults to 0 when the including shader has not
// defined it, so later #if tests on these names always see a defined value.
#ifndef USE_GLOBAL_GPU_SCENE_DATA
#define USE_GLOBAL_GPU_SCENE_DATA 0
#endif
#ifndef USE_GLOBAL_GPU_SCENE_DATA_RW
#define USE_GLOBAL_GPU_SCENE_DATA_RW 0
#endif
#ifndef USES_PER_INSTANCE_CUSTOM_DATA
#define USES_PER_INSTANCE_CUSTOM_DATA 0
#endif
#ifndef USES_PER_INSTANCE_RANDOM
#define USES_PER_INSTANCE_RANDOM 0
#endif
#ifndef NEEDS_LIGHTMAP_COORDINATE
#define NEEDS_LIGHTMAP_COORDINATE 0
#endif
#ifndef VF_REQUIRES_PER_INSTANCE_CUSTOM_DATA
#define VF_REQUIRES_PER_INSTANCE_CUSTOM_DATA 0
#endif
// Per-instance custom data is enabled when either the USES_ or the VF_REQUIRES_ switch is non-zero.
#define ENABLE_PER_INSTANCE_CUSTOM_DATA (USES_PER_INSTANCE_CUSTOM_DATA || VF_REQUIRES_PER_INSTANCE_CUSTOM_DATA)
// When transforms come from the InstanceSceneData buffer, indicates whether or not the transforms are compressed (ignored on mobile)
// TODO: Global setting/define
#define INSTANCE_SCENE_DATA_COMPRESSED_TRANSFORMS 1
// Whether to fetch primitive values (eg LocalToWorld) by dynamically indexing a scene-wide buffer, or to reference a single Primitive uniform buffer
// Resulting values of VF_USE_PRIMITIVE_SCENE_DATA:
//   0 - VF_SUPPORTS_PRIMITIVE_SCENE_DATA is false
//   1 - VF_SUPPORTS_PRIMITIVE_SCENE_DATA is true and the feature level is not ES3_1
//   2 - VF_SUPPORTS_PRIMITIVE_SCENE_DATA is true and the feature level is ES3_1
#if VF_SUPPORTS_PRIMITIVE_SCENE_DATA
#if FEATURE_LEVEL == FEATURE_LEVEL_ES3_1
#define VF_USE_PRIMITIVE_SCENE_DATA 2
#else
#define VF_USE_PRIMITIVE_SCENE_DATA 1
#endif
#else
#define VF_USE_PRIMITIVE_SCENE_DATA 0
#endif
// Must match PrimitiveUniformShaderParameters.h
// Single-bit masks, one per primitive property, occupying bits 0 through 25.
// NOTE(review): presumably these are tested against FPrimitiveSceneData::Flags - confirm at the use sites.
#define PRIMITIVE_SCENE_DATA_FLAG_CAST_SHADOWS 0x1
#define PRIMITIVE_SCENE_DATA_FLAG_USE_SINGLE_SAMPLE_SHADOW_SL 0x2
#define PRIMITIVE_SCENE_DATA_FLAG_USE_VOLUMETRIC_LM_SHADOW_SL 0x4
#define PRIMITIVE_SCENE_DATA_FLAG_DECAL_RECEIVER 0x8
#define PRIMITIVE_SCENE_DATA_FLAG_SHOULD_CACHE_SHADOW 0x10
#define PRIMITIVE_SCENE_DATA_FLAG_OUTPUT_VELOCITY 0x20
#define PRIMITIVE_SCENE_DATA_FLAG_DETERMINANT_SIGN 0x40
#define PRIMITIVE_SCENE_DATA_FLAG_HAS_CAPSULE_REPRESENTATION 0x80
#define PRIMITIVE_SCENE_DATA_FLAG_HAS_CAST_CONTACT_SHADOW 0x100
#define PRIMITIVE_SCENE_DATA_FLAG_HAS_PRIMITIVE_CUSTOM_DATA 0x200
#define PRIMITIVE_SCENE_DATA_FLAG_LIGHTING_CHANNEL_0 0x400
#define PRIMITIVE_SCENE_DATA_FLAG_LIGHTING_CHANNEL_1 0x800
#define PRIMITIVE_SCENE_DATA_FLAG_LIGHTING_CHANNEL_2 0x1000
#define PRIMITIVE_SCENE_DATA_FLAG_HAS_INSTANCE_LOCAL_BOUNDS 0x2000
#define PRIMITIVE_SCENE_DATA_FLAG_HAS_NANITE_IMPOSTER 0x4000
#define PRIMITIVE_SCENE_DATA_FLAG_VISIBLE_IN_GAME 0x8000
#define PRIMITIVE_SCENE_DATA_FLAG_VISIBLE_IN_EDITOR 0x10000
#define PRIMITIVE_SCENE_DATA_FLAG_VISIBLE_IN_REFLECTION_CAPTURES 0x20000
#define PRIMITIVE_SCENE_DATA_FLAG_VISIBLE_IN_REAL_TIME_SKY_CAPTURES 0x40000
#define PRIMITIVE_SCENE_DATA_FLAG_VISIBLE_IN_RAY_TRACING 0x80000
#define PRIMITIVE_SCENE_DATA_FLAG_VISIBLE_IN_SCENE_CAPTURE_ONLY 0x100000
#define PRIMITIVE_SCENE_DATA_FLAG_HIDDEN_IN_SCENE_CAPTURE 0x200000
#define PRIMITIVE_SCENE_DATA_FLAG_FORCE_HIDDEN 0x400000
#define PRIMITIVE_SCENE_DATA_FLAG_CAST_HIDDEN_SHADOW 0x800000
#define PRIMITIVE_SCENE_DATA_FLAG_EVALUATE_WORLD_POSITION_OFFSET 0x1000000
#define PRIMITIVE_SCENE_DATA_FLAG_CAMERA_DISTANCE_CULL 0x2000000
// GPUCULL_TODO: Eventually we need to remove this workaround
// Top bit (bit 31) of a 32-bit value.
#define VF_TREAT_INSTANCE_ID_OFFSET_AS_PRIMITIVE_ID_FLAG (1U << 31U)
// Bit widths of packed ID fields. PRIMITIVE_ID_NUM_BITS + INSTANCE_SCENE_DATA_FLAGS_NUM_BITS and
// INSTANCE_RELATIVE_ID_NUM_BITS + INSTANCE_CUSTOM_DATA_COUNT_NUM_BITS each total 32 bits.
// NOTE(review): presumably each pair shares one packed uint - confirm against the packing code.
#define PRIMITIVE_ID_NUM_BITS (20u) // Max of 1,048,576 primitives
#define INSTANCE_SCENE_DATA_FLAGS_NUM_BITS (12u) // Max of 12 flags
#define INSTANCE_RELATIVE_ID_NUM_BITS (24u) // Max of 16,777,216 instances per primitive
#define INSTANCE_CUSTOM_DATA_COUNT_NUM_BITS (8u) // Max of 255 custom data floats per instance
// Masks with the low N bits set, derived from the widths above.
#define PRIMITIVE_ID_MASK ((1u << PRIMITIVE_ID_NUM_BITS) - 1u)
#define INSTANCE_RELATIVE_ID_MASK ((1u << INSTANCE_RELATIVE_ID_NUM_BITS) - 1u)
// The all-ones primitive ID (equal to the mask) is used as the invalid sentinel.
#define INVALID_PRIMITIVE_ID PRIMITIVE_ID_MASK
#define INVALID_INSTANCE_PAYLOAD_OFFSET (0xFFFFFFFFu)
// Must match InstanceUniformShaderParameters.h
// Single-bit masks describing per-instance properties; the values here occupy bits 0 through 6.
#define INSTANCE_SCENE_DATA_FLAG_DETERMINANT_SIGN 0x1
#define INSTANCE_SCENE_DATA_FLAG_HAS_RANDOM 0x2
#define INSTANCE_SCENE_DATA_FLAG_HAS_CUSTOM_DATA 0x4
#define INSTANCE_SCENE_DATA_FLAG_HAS_DYNAMIC_DATA 0x8
#define INSTANCE_SCENE_DATA_FLAG_HAS_LIGHTSHADOW_UV_BIAS 0x10
#define INSTANCE_SCENE_DATA_FLAG_HAS_HIERARCHY_OFFSET 0x20
#define INSTANCE_SCENE_DATA_FLAG_HAS_LOCAL_BOUNDS 0x40
// Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
// 2021-12-03 13:43:00 -05:00
// Further INSTANCE_SCENE_DATA_FLAG_* single-bit masks (bits 7 and 8).
#define INSTANCE_SCENE_DATA_FLAG_HAS_EDITOR_DATA 0x80
#define INSTANCE_SCENE_DATA_FLAG_IS_RAYTRACING_FAR_FIELD 0x100
// Number of float4 elements taken by one instance transform; depends on whether
// INSTANCE_SCENE_DATA_COMPRESSED_TRANSFORMS is enabled.
#if INSTANCE_SCENE_DATA_COMPRESSED_TRANSFORMS
static const uint InstanceTransformSizeFloat4Count = 2u; // compressed transform
#else
// NOTE(review): a uint4 plus a float3 fits in two float4s, so the trailing comment on the next line
// looks like it describes the compressed layout rather than this three-float4 one - confirm.
static const uint InstanceTransformSizeFloat4Count = 3u; // encoded scale/rotation (uint4) and translation (float3)
#endif
#define NUM_CUSTOM_PRIMITIVE_DATA 9 // Num float4s used for custom data. Must match FCustomPrimitiveData::NumCustomPrimitiveDataFloat4s in SceneTypes.h
// Must match FPrimitiveUniformShaderParameters in C++
// Shader-side description of a single primitive. It is filled either from the Primitive uniform
// buffer (GetPrimitiveDataFromUniformBuffer) or from the scene primitive buffer (GetPrimitiveData).
// Member order is intentionally left untouched because of the matching requirement above.
// NOTE(review): ObjectBoundsX/Y/Z are interleaved after float3 members, presumably to fill out
// float4 slots - confirm against the C++ layout before moving anything.
struct FPrimitiveSceneData
{
	// NOTE(review): presumably a bitmask of PRIMITIVE_SCENE_DATA_FLAG_* values - confirm at the use sites.
	uint		Flags; // TODO: Use 16 bits?
	int			InstanceSceneDataOffset; // Link to the range of instances that belong to this primitive
	int			NumInstanceSceneDataEntries;
	int			PersistentPrimitiveIndex;
	uint		SingleCaptureIndex; // TODO: Use 16 bits? 8 bits?
	// Tile coordinate; GetPrimitiveDataFromUniformBuffer passes it to every MakeLWC* call that
	// builds the FLWC* members below.
	float3		TilePosition;
	uint		PrimitiveComponentId; // TODO: Refactor to use PersistentPrimitiveIndex, ENGINE USE ONLY - will be removed
	FLWCMatrix	LocalToWorld;
	FLWCInverseMatrix WorldToLocal;
	FLWCMatrix	PreviousLocalToWorld;
	FLWCInverseMatrix PreviousWorldToLocal;
	float3		InvNonUniformScale;
	float		ObjectBoundsX;
	FLWCVector3	ObjectWorldPosition;
	FLWCVector3	ActorWorldPosition;
	float		ObjectRadius;
	uint		LightmapUVIndex; // TODO: Use 16 bits? // TODO: Move into associated array that disappears if static lighting is disabled
	float3		ObjectOrientation; // TODO: More efficient representation?
	uint		LightmapDataIndex; // TODO: Use 16 bits? // TODO: Move into associated array that disappears if static lighting is disabled
	float4		NonUniformScale;
	float3		PreSkinnedLocalBoundsMin;
	uint		NaniteResourceID;
	float3		PreSkinnedLocalBoundsMax;
	uint		NaniteHierarchyOffset;
	float3		LocalObjectBoundsMin;
	float		ObjectBoundsY;
	float3		LocalObjectBoundsMax;
	float		ObjectBoundsZ;
	uint		InstancePayloadDataOffset;
	uint		InstancePayloadDataStride; // TODO: Use 16 bits? 8 bits?
	float3		InstanceLocalBoundsCenter;
	float3		InstanceLocalBoundsExtent;
	float3		WireframeColor; // TODO: Should refactor out all editor data into a separate buffer
	float3		LevelColor; // TODO: Should refactor out all editor data into a separate buffer
	// The uniform-buffer path unpacks the next two members from one packed value
	// (NaniteImposterIndexAndFilterFlags): the masked low bits and the shifted high bits.
	uint		NaniteImposterIndex;
	uint		NaniteFilterFlags;
	float2		CameraDistanceCullMinMaxSquared;
	float4		CustomPrimitiveData[NUM_CUSTOM_PRIMITIVE_DATA]; // TODO: Move to associated array to shrink primitive data and pack cachelines more effectively
};
// Fetch from Primitive uniform buffer
// Copies the bound Primitive uniform buffer into an FPrimitiveSceneData, the same structure that
// GetPrimitiveData returns for the scene primitive buffer.
FPrimitiveSceneData GetPrimitiveDataFromUniformBuffer()
{
	// Start from an all-zero struct (same pattern as GetPrimitiveData) so that members which are
	// not assigned below are never returned uninitialized.
	// NOTE(review): InstanceLocalBoundsCenter / InstanceLocalBoundsExtent are not copied in this
	// function and therefore stay zero; if the Primitive uniform buffer exposes matching members
	// they should be copied here - confirm against the uniform buffer declaration.
	FPrimitiveSceneData PrimitiveData = (FPrimitiveSceneData)0;

	PrimitiveData.Flags = Primitive.Flags;
	PrimitiveData.InstanceSceneDataOffset = Primitive.InstanceSceneDataOffset;
	PrimitiveData.NumInstanceSceneDataEntries = Primitive.NumInstanceSceneDataEntries;
	PrimitiveData.SingleCaptureIndex = Primitive.SingleCaptureIndex;
	PrimitiveData.TilePosition = Primitive.TilePosition;
	PrimitiveData.PrimitiveComponentId = Primitive.PrimitiveComponentId;

	// The uniform buffer stores tile-relative transforms and positions; combine each one with the
	// tile position to form the LWC value.
	PrimitiveData.LocalToWorld = MakeLWCMatrix4x3(Primitive.TilePosition, Primitive.LocalToRelativeWorld);
	PrimitiveData.WorldToLocal = MakeLWCInverseMatrix4x3(Primitive.TilePosition, Primitive.RelativeWorldToLocal);
	PrimitiveData.PreviousLocalToWorld = MakeLWCMatrix4x3(Primitive.TilePosition, Primitive.PreviousLocalToRelativeWorld);
	PrimitiveData.PreviousWorldToLocal = MakeLWCInverseMatrix4x3(Primitive.TilePosition, Primitive.PreviousRelativeWorldToLocal);
	PrimitiveData.InvNonUniformScale = Primitive.InvNonUniformScale;
	PrimitiveData.ObjectBoundsX = Primitive.ObjectBoundsX;
	// xyz carries the tile-relative object position, w carries the radius.
	PrimitiveData.ObjectWorldPosition = MakeLWCVector3(Primitive.TilePosition, Primitive.ObjectRelativeWorldPositionAndRadius.xyz);
	PrimitiveData.ObjectRadius = Primitive.ObjectRelativeWorldPositionAndRadius.w;
	PrimitiveData.ActorWorldPosition = MakeLWCVector3(Primitive.TilePosition, Primitive.ActorRelativeWorldPosition);

	PrimitiveData.LightmapUVIndex = Primitive.LightmapUVIndex;
	PrimitiveData.ObjectOrientation = Primitive.ObjectOrientation;
	PrimitiveData.LightmapDataIndex = Primitive.LightmapDataIndex;
	PrimitiveData.NonUniformScale = Primitive.NonUniformScale;
	PrimitiveData.PreSkinnedLocalBoundsMin = Primitive.PreSkinnedLocalBoundsMin;
	PrimitiveData.NaniteResourceID = Primitive.NaniteResourceID;
	PrimitiveData.PreSkinnedLocalBoundsMax = Primitive.PreSkinnedLocalBoundsMax;
	PrimitiveData.NaniteHierarchyOffset = Primitive.NaniteHierarchyOffset;
	PrimitiveData.LocalObjectBoundsMin = Primitive.LocalObjectBoundsMin;
	PrimitiveData.ObjectBoundsY = Primitive.ObjectBoundsY;
	PrimitiveData.LocalObjectBoundsMax = Primitive.LocalObjectBoundsMax;
	PrimitiveData.ObjectBoundsZ = Primitive.ObjectBoundsZ;
	PrimitiveData.InstancePayloadDataOffset = Primitive.InstancePayloadDataOffset;
	PrimitiveData.InstancePayloadDataStride = Primitive.InstancePayloadDataStride;
	PrimitiveData.WireframeColor = Primitive.WireframeColor;
	PrimitiveData.LevelColor = Primitive.LevelColor;

	// Imposter index and filter flags share one packed uint: low bits are the index, the bits
	// above NANITE_IMPOSTER_INDEX_NUM_BITS are the filter flags.
	PrimitiveData.NaniteImposterIndex = Primitive.NaniteImposterIndexAndFilterFlags & NANITE_IMPOSTER_INDEX_MASK;
	PrimitiveData.NaniteFilterFlags = Primitive.NaniteImposterIndexAndFilterFlags >> NANITE_IMPOSTER_INDEX_NUM_BITS;
	PrimitiveData.CameraDistanceCullMinMaxSquared = Primitive.CameraDistanceCullMinMaxSquared;
	PrimitiveData.PersistentPrimitiveIndex = Primitive.PersistentPrimitiveIndex;

	UNROLL
	for (int DataIndex = 0; DataIndex < NUM_CUSTOM_PRIMITIVE_DATA; ++DataIndex)
	{
		PrimitiveData.CustomPrimitiveData[DataIndex] = Primitive.CustomPrimitiveData[DataIndex];
	}

	return PrimitiveData;
}
#if VF_USE_PRIMITIVE_SCENE_DATA
// Optional global primitive-data buffer: read-only when USE_GLOBAL_GPU_SCENE_DATA is set,
// read-write when USE_GLOBAL_GPU_SCENE_DATA_RW is set. When neither is set no buffer is declared
// here and LoadPrimitivePrimitiveSceneDataElement reads View.PrimitiveSceneData instead.
#if USE_GLOBAL_GPU_SCENE_DATA
StructuredBuffer<float4> GPUScenePrimitiveSceneData;
#elif USE_GLOBAL_GPU_SCENE_DATA_RW
RWStructuredBuffer<float4> GPUScenePrimitiveSceneDataRW;
#endif
// Stride of a single primitive's data in float4's, must match C++
#define PRIMITIVE_SCENE_DATA_STRIDE 41
// Loads one float4 of primitive scene data from whichever buffer is active for this shader.
// The element read is PrimitiveIndex + ItemIndex.
// NOTE(review): presumably PrimitiveIndex is the primitive's base float4 offset
// (PrimitiveId * PRIMITIVE_SCENE_DATA_STRIDE) rather than a raw primitive ID - confirm at the call sites.
float4 LoadPrimitivePrimitiveSceneDataElement(uint PrimitiveIndex, uint ItemIndex)
{
	uint TargetIdx = PrimitiveIndex + ItemIndex;
	float4 Element;

#if USE_GLOBAL_GPU_SCENE_DATA
	// Read-only global GPU scene buffer
	checkStructuredBufferAccessSlow(GPUScenePrimitiveSceneData, TargetIdx);
	Element = GPUScenePrimitiveSceneData[TargetIdx];
#elif USE_GLOBAL_GPU_SCENE_DATA_RW
	// Read-write global GPU scene buffer
	checkStructuredBufferAccessSlow(GPUScenePrimitiveSceneDataRW, TargetIdx);
	Element = GPUScenePrimitiveSceneDataRW[TargetIdx];
#else
	// Buffer reached through the View uniform buffer
	checkStructuredBufferAccessSlow(View.PrimitiveSceneData, TargetIdx);
	Element = View.PrimitiveSceneData[TargetIdx];
#endif

	return Element;
}
/**
 * Fetch from scene primitive buffer.
 *
 * Unpacks the float4 elements that belong to PrimitiveId (PRIMITIVE_SCENE_DATA_STRIDE elements per primitive)
 * into an FPrimitiveSceneData. The number after each load is the element index inside the primitive's block;
 * several elements carry a float3 in .xyz and an unrelated scalar in .w.
 *
 * Large world coordinates: element 1 holds the primitive's tile position. The four transforms and the
 * object / actor positions are stored as plain floats and are combined with that tile through
 * MakeLWCMatrix4x3 / MakeLWCInverseMatrix4x3 / MakeLWCVector3.
 *
 * @param PrimitiveId	Index of the primitive in the scene primitive buffer.
 * @return				The unpacked primitive data; zero-initialised data for ES3.1 vertex shaders (see below).
 */
FPrimitiveSceneData GetPrimitiveData(uint PrimitiveId)
{
#if (FEATURE_LEVEL == FEATURE_LEVEL_ES3_1 && VERTEXSHADER)
	// Vertex shaders do not have access to GPUScene on mobile. Use GetPrimitiveData(FVertexFactoryIntermediates Intermediates)
	// TODO: need a way to report invalid usage, after all dead code elimination
	return (FPrimitiveSceneData)0;
#else
	FPrimitiveSceneData PrimitiveData = (FPrimitiveSceneData)0;

	// Note: layout must match FPrimitiveSceneShaderData in C++
	// Relying on optimizer to remove unused loads
	uint PrimitiveIndex = PrimitiveId * PRIMITIVE_SCENE_DATA_STRIDE;

	// Tile coordinate shared by every LWC value of this primitive
	float3 TilePosition = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 1).xyz;

	// Each transform is stored as four float3 rows; the fourth column (0,0,0,1) is rebuilt here
	float4x4 LocalToWorld;
	LocalToWorld[0] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 2).xyz, 0.0f);
	LocalToWorld[1] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 3).xyz, 0.0f);
	LocalToWorld[2] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 4).xyz, 0.0f);
	LocalToWorld[3] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 5).xyz, 1.0f);

	float4x4 PreviousLocalToWorld;
	PreviousLocalToWorld[0] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 10).xyz, 0.0f);
	PreviousLocalToWorld[1] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 11).xyz, 0.0f);
	PreviousLocalToWorld[2] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 12).xyz, 0.0f);
	PreviousLocalToWorld[3] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 13).xyz, 1.0f);

	float4x4 WorldToLocal;
	WorldToLocal[0] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 6).xyz, 0.0f);
	WorldToLocal[1] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 7).xyz, 0.0f);
	WorldToLocal[2] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 8).xyz, 0.0f);
	WorldToLocal[3] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 9).xyz, 1.0f);

	float4x4 PreviousWorldToLocal;
	PreviousWorldToLocal[0] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 14).xyz, 0.0f);
	PreviousWorldToLocal[1] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 15).xyz, 0.0f);
	PreviousWorldToLocal[2] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 16).xyz, 0.0f);
	PreviousWorldToLocal[3] = float4(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 17).xyz, 1.0f);

	float4 ObjectWorldPositionAndRadius = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 19);

	// Element 0: four uints stored bit-for-bit in a float4
	PrimitiveData.Flags = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 0).x);
	PrimitiveData.InstanceSceneDataOffset = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 0).y);
	PrimitiveData.NumInstanceSceneDataEntries = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 0).z);
	PrimitiveData.SingleCaptureIndex = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 0).w);
	PrimitiveData.TilePosition = TilePosition; // 1.xyz
	PrimitiveData.PrimitiveComponentId = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 1).w);

	// Attach the tile to the float transforms to form the LWC matrices
	PrimitiveData.LocalToWorld = MakeLWCMatrix4x3(TilePosition, LocalToWorld);
	PrimitiveData.WorldToLocal = MakeLWCInverseMatrix4x3(TilePosition, WorldToLocal);
	PrimitiveData.PreviousLocalToWorld = MakeLWCMatrix4x3(TilePosition, PreviousLocalToWorld);
	PrimitiveData.PreviousWorldToLocal = MakeLWCInverseMatrix4x3(TilePosition, PreviousWorldToLocal);

	PrimitiveData.InvNonUniformScale = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 18).xyz;
	PrimitiveData.ObjectBoundsX = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 18).w;
	PrimitiveData.ObjectWorldPosition = MakeLWCVector3(TilePosition, ObjectWorldPositionAndRadius.xyz);
	PrimitiveData.ObjectRadius = ObjectWorldPositionAndRadius.w;
	PrimitiveData.ActorWorldPosition = MakeLWCVector3(TilePosition, LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 20).xyz);
	PrimitiveData.LightmapUVIndex = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 20).w);
	PrimitiveData.ObjectOrientation = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 21).xyz;
	PrimitiveData.LightmapDataIndex = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 21).w);
	PrimitiveData.NonUniformScale = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 22);
	PrimitiveData.PreSkinnedLocalBoundsMin = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 23).xyz;
	PrimitiveData.NaniteResourceID = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 23).w);
	PrimitiveData.PreSkinnedLocalBoundsMax = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 24).xyz;
	PrimitiveData.NaniteHierarchyOffset = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 24).w);
	PrimitiveData.LocalObjectBoundsMin = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 25).xyz;
	PrimitiveData.ObjectBoundsY = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 25).w;
	PrimitiveData.LocalObjectBoundsMax = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 26).xyz;
	PrimitiveData.ObjectBoundsZ = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 26).w;
	PrimitiveData.InstanceLocalBoundsCenter = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 27).xyz;
	PrimitiveData.InstancePayloadDataOffset = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 27).w);
	PrimitiveData.InstanceLocalBoundsExtent = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 28).xyz;
	PrimitiveData.InstancePayloadDataStride = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 28).w);
	PrimitiveData.WireframeColor = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 29).xyz;
	PrimitiveData.LevelColor = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 30).xyz;
	PrimitiveData.PersistentPrimitiveIndex = asint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 30).w);
	PrimitiveData.CameraDistanceCullMinMaxSquared = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 31).xy;

	// 29.w packs the imposter index in the low NANITE_IMPOSTER_INDEX_NUM_BITS bits and the filter flags above them
	uint NaniteImposterIndexAndFilterFlags = asuint(LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 29).w);
	PrimitiveData.NaniteFilterFlags = NaniteImposterIndexAndFilterFlags >> NANITE_IMPOSTER_INDEX_NUM_BITS;
	PrimitiveData.NaniteImposterIndex = NaniteImposterIndexAndFilterFlags & NANITE_IMPOSTER_INDEX_MASK;
	// An index with every mask bit set is remapped to the invalid marker
	if (PrimitiveData.NaniteImposterIndex == NANITE_IMPOSTER_INDEX_MASK)
	{
		PrimitiveData.NaniteImposterIndex = INVALID_NANITE_IMPOSTER_INDEX;
	}

	// TODO: Move to associated array (and editor data) to shrink primitive data and better pack cachelines
	UNROLL
	for (int DataIndex = 0; DataIndex < NUM_CUSTOM_PRIMITIVE_DATA; ++DataIndex)
	{
		PrimitiveData.CustomPrimitiveData[DataIndex] = LoadPrimitivePrimitiveSceneDataElement(PrimitiveIndex, 32 + DataIndex);
	}

	return PrimitiveData;
#endif
}
#else // !VF_USE_PRIMITIVE_SCENE_DATA
// Fallback used when VF_USE_PRIMITIVE_SCENE_DATA is off: PrimitiveId is ignored and the
// data is taken from GetPrimitiveDataFromUniformBuffer().
FPrimitiveSceneData GetPrimitiveData(uint PrimitiveId)
{
	const FPrimitiveSceneData UniformBufferPrimitive = GetPrimitiveDataFromUniformBuffer();
	return UniformBufferPrimitive;
}
#endif // VF_USE_PRIMITIVE_SCENE_DATA
// Derives the determinant sign from the primitive flags: CondMask picks between -1.0f and 1.0f
// based on PRIMITIVE_SCENE_DATA_FLAG_DETERMINANT_SIGN (presumably -1.0f when the bit is set - see CondMask).
float GetPrimitive_DeterminantSign_FromFlags(uint Flags)
{
	const float DeterminantSign = CondMask(Flags & PRIMITIVE_SCENE_DATA_FLAG_DETERMINANT_SIGN, -1.0f, 1.0f);
	return DeterminantSign;
}
// Looks the primitive up by id and returns the determinant sign derived from its Flags.
float GetPrimitive_DeterminantSign(uint PrimitiveId)
{
	const uint PrimitiveFlags = GetPrimitiveData(PrimitiveId).Flags;
	return GetPrimitive_DeterminantSign_FromFlags(PrimitiveFlags);
}
#if VF_USE_PRIMITIVE_SCENE_DATA
// Overload for callers that already hold the primitive data; avoids another lookup by id.
float GetPrimitive_DeterminantSign(FPrimitiveSceneData Primitive)
{
	const uint PrimitiveFlags = Primitive.Flags;
	return GetPrimitive_DeterminantSign_FromFlags(PrimitiveFlags);
}
#endif
// Packs two primitive flags into a single float: (2 * HasCapsuleRepresentation + CastContactShadow) / 3.
// Each term is the 1.0f / 0.0f value CondMask returns for the corresponding flag bit.
float GetPrimitive_PerObjectGBufferData_FromFlags(uint Flags)
{
	const float HasCapsule = CondMask(Flags & PRIMITIVE_SCENE_DATA_FLAG_HAS_CAPSULE_REPRESENTATION, 1.0f, 0.0f);
	const float HasContactShadow = CondMask(Flags & PRIMITIVE_SCENE_DATA_FLAG_HAS_CAST_CONTACT_SHADOW, 1.0f, 0.0f);

	// Capsule representation takes the weight-2 slot, contact shadow the weight-1 slot
	const float PackedBits = 2.0f * HasCapsule + HasContactShadow;
	return PackedBits / 3.0f;
}
// Looks the primitive up by id and returns the per-object GBuffer value derived from its Flags.
float GetPrimitive_PerObjectGBufferData(uint PrimitiveId)
{
	const uint PrimitiveFlags = GetPrimitiveData(PrimitiveId).Flags;
	return GetPrimitive_PerObjectGBufferData_FromFlags(PrimitiveFlags);
}
#if VF_USE_PRIMITIVE_SCENE_DATA
// Overload for callers that already hold the primitive data; avoids another lookup by id.
float GetPrimitive_PerObjectGBufferData(FPrimitiveSceneData Primitive)
{
	const uint PrimitiveFlags = Primitive.Flags;
	return GetPrimitive_PerObjectGBufferData_FromFlags(PrimitiveFlags);
}
#endif
// Builds a 3-bit lighting channel mask from the primitive flags:
// bit 0 <- LIGHTING_CHANNEL_0, bit 1 <- LIGHTING_CHANNEL_1, bit 2 <- LIGHTING_CHANNEL_2.
uint GetPrimitive_LightingChannelMask_FromFlags(uint Flags)
{
	const uint Bit0 = CondMask(Flags & PRIMITIVE_SCENE_DATA_FLAG_LIGHTING_CHANNEL_0, 1u, 0u);
	const uint Bit1 = CondMask(Flags & PRIMITIVE_SCENE_DATA_FLAG_LIGHTING_CHANNEL_1, 1u, 0u);
	const uint Bit2 = CondMask(Flags & PRIMITIVE_SCENE_DATA_FLAG_LIGHTING_CHANNEL_2, 1u, 0u);

	uint ChannelMask = Bit0;
	ChannelMask |= Bit1 << 1u;
	ChannelMask |= Bit2 << 2u;
	return ChannelMask;
}
// Looks the primitive up by id and returns the lighting channel mask derived from its Flags.
uint GetPrimitive_LightingChannelMask(uint PrimitiveId)
{
	const uint PrimitiveFlags = GetPrimitiveData(PrimitiveId).Flags;
	return GetPrimitive_LightingChannelMask_FromFlags(PrimitiveFlags);
}
#if VF_USE_PRIMITIVE_SCENE_DATA
// Overload for callers that already hold the primitive data; avoids another lookup by id.
uint GetPrimitive_LightingChannelMask(FPrimitiveSceneData Primitive)
{
	const uint PrimitiveFlags = Primitive.Flags;
	return GetPrimitive_LightingChannelMask_FromFlags(PrimitiveFlags);
}
#endif
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
// Unpacked AoS layout - see FInstanceSceneShaderData::Setup() for SoA packed layout.
Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
2021-12-03 13:43:00 -05:00
#if USE_EDITOR_SHADERS
// Per-instance data that only exists in editor shader builds (the declaration is guarded by USE_EDITOR_SHADERS).
struct FInstanceSceneEditorData
{
	// Hit proxy id as a float3 - presumably the result of UnpackHitProxyId(HitProxyPacked); confirm at the fill site
	float3 HitProxyId;
	// Hit proxy id in its packed single-dword form
	uint HitProxyPacked;
	// Selection state of the instance
	bool bIsSelected;
};
#endif
/**
 * Expands a packed hit proxy id into three normalised floats.
 *
 * Bits 0-7 go to .x, bits 8-15 to .y and bits 16-23 to .z; each byte is scaled by 1/255.
 * Bits 24-31 are ignored.
 * NOTE(review): which colour channel each byte represents depends on the packing side - confirm there.
 */
float3 UnpackHitProxyId(uint HitProxyPacked)
{
	// Shift every byte of interest down to the low 8 bits, one byte per lane
	const uint3 ShiftedLanes = uint3(HitProxyPacked, HitProxyPacked >> 8u, HitProxyPacked >> 16u);
	const uint3 ByteLanes = ShiftedLanes & 0xFF;

	return float3(ByteLanes) * (1.0f / 255.0f);
}
// Set of offsets, one per optional kind of instance payload data.
// NOTE(review): units, base address and the value used for "not present" are defined by the code
// that fills this struct (not visible here) - confirm there before relying on them.
struct FInstancePayloadDataOffsets
{
// Offset of the hierarchy offset entry
uint HierarchyOffset;
// Offset of the editor data (presumably only meaningful with USE_EDITOR_SHADERS - confirm)
uint EditorData;
// Offset of the local bounds
uint LocalBounds;
// Offset of the dynamic data
uint DynamicData;
// Offset of the light/shadow UV bias
uint LightShadowUVBias;
// Offset of the custom data
uint CustomData;
};
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
struct FInstanceSceneData
{
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
FLWCMatrix LocalToWorld;
FLWCMatrix PrevLocalToWorld;
FLWCInverseMatrix WorldToLocal;
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
float4 NonUniformScale;
float3 InvNonUniformScale;
float DeterminantSign;
float3 LocalBoundsCenter;
uint PrimitiveId;
uint RelativeId;
uint PayloadDataOffset;
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
float3 LocalBoundsExtent;
uint LastUpdateSceneFrameNumber;
uint NaniteRuntimeResourceID;
uint NaniteHierarchyOffset;
Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
2021-12-03 13:43:00 -05:00
#if 1//USES_PER_INSTANCE_RANDOM
float RandomID;
#endif
#if ENABLE_PER_INSTANCE_CUSTOM_DATA
uint CustomDataOffset;
uint CustomDataCount;
#endif
#if 1 //NEEDS_LIGHTMAP_COORDINATE // TODO: Fix Me
float4 LightMapAndShadowMapUVBias;
#endif
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
bool ValidInstance;
uint Flags;
Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
2021-12-03 13:43:00 -05:00
#if USE_EDITOR_SHADERS
FInstanceSceneEditorData EditorData;
#endif
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
};
// Global GPU-Scene resource bindings.
// Exactly one of the two configurations below may be active; when neither is defined no
// globals are declared here and the loaders in this file read the View.* resources instead.
#if USE_GLOBAL_GPU_SCENE_DATA
// Read-only bindings
StructuredBuffer<float4> GPUSceneInstanceSceneData;
StructuredBuffer<float4> GPUSceneInstancePayloadData;
uint GPUSceneFrameNumber;
#elif USE_GLOBAL_GPU_SCENE_DATA_RW
// Read-write bindings (note the RW suffix on the buffer names)
RWStructuredBuffer<float4> GPUSceneInstanceSceneDataRW;
RWStructuredBuffer<float4> GPUSceneInstancePayloadDataRW;
uint GPUSceneFrameNumber;
#endif
// Returns the frame number associated with the GPU-Scene data being read.
// Both global configurations (read-only and RW) declare the GPUSceneFrameNumber uniform, so
// both return it; only when no global GPU-Scene binding is active do we fall back to the view.
uint GetGPUSceneFrameNumber()
{
#if USE_GLOBAL_GPU_SCENE_DATA || USE_GLOBAL_GPU_SCENE_DATA_RW
	return GPUSceneFrameNumber;
#else
	return View.FrameNumber;
#endif
}
// Loads a single float4 element of instance scene data.
// Index is an element index into whichever instance scene data buffer this shader was
// compiled against: the global read-only buffer, the global RW buffer, or the view's buffer.
float4 LoadInstanceSceneDataElement(uint Index)
{
#if USE_GLOBAL_GPU_SCENE_DATA
	return GPUSceneInstanceSceneData[Index];
#elif USE_GLOBAL_GPU_SCENE_DATA_RW
	return GPUSceneInstanceSceneDataRW[Index];
#else
	return View.InstanceSceneData[Index];
#endif
}
// Loads a single float4 element of instance payload data from the buffer selected at
// compile time (global read-only, global RW, or the view's buffer).
float4 LoadInstancePayloadDataElement(uint Index)
{
	float4 Element;
#if USE_GLOBAL_GPU_SCENE_DATA
	Element = GPUSceneInstancePayloadData[Index];
#elif USE_GLOBAL_GPU_SCENE_DATA_RW
	Element = GPUSceneInstancePayloadDataRW[Index];
#else
	Element = View.InstancePayloadData[Index];
#endif
	return Element;
}
// Loads the Float4Index'th float4 of an instance's custom data.
// Returns zero when custom data is compiled out, when the instance has no custom data
// (offset == 0xFFFFFFFF), or when Float4Index is past the end of the instance's data.
float4 LoadInstanceCustomDataElement(FInstanceSceneData SceneData, uint Float4Index)
{
	float4 Result = (float4)0.0f;
#if ENABLE_PER_INSTANCE_CUSTOM_DATA
	// Custom data is counted in floats; round up to whole float4s
	const uint Float4Count = (SceneData.CustomDataCount + 3u) / 4u;
	const bool bHasCustomData = SceneData.CustomDataOffset != 0xFFFFFFFFu;
	if (bHasCustomData && Float4Index < Float4Count)
	{
		Result = LoadInstancePayloadDataElement(SceneData.CustomDataOffset + Float4Index);
	}
#endif
	return Result;
}
// Loads a single custom data float for an instance, addressed by float index.
// Returns zero when custom data is compiled out or the index is out of range
// (see LoadInstanceCustomDataElement).
float LoadInstanceCustomDataFloat(FInstanceSceneData SceneData, uint FloatIndex)
{
#if ENABLE_PER_INSTANCE_CUSTOM_DATA
	// Four floats per payload element: the high bits select the float4, the low two bits the component
	const float4 PackedFloats = LoadInstanceCustomDataElement(SceneData, FloatIndex / 4u);
	return PackedFloats[FloatIndex & 3u];
#else
	return 0.0f;
#endif
}
// [Frisvad 2012, "Building an Orthonormal Basis from a 3D Unit Vector Without Normalization"]
// Writes two vectors to BasisX / BasisY derived from BasisZ. The previous contents of BasisX
// and BasisY are ignored. Note the division by (1 + BasisZ.z): BasisZ.z == -1 is singular.
void GetHemiOrthoBasis( inout float3 BasisX, inout float3 BasisY, float3 BasisZ )
{
	const float InvOnePlusZ = 1.0f / ( 1.0f + BasisZ.z );
	const float OffDiagonal = -BasisZ.x * BasisZ.y * InvOnePlusZ;

	BasisX.x = 1.0f - BasisZ.x * BasisZ.x * InvOnePlusZ;
	BasisX.y = OffDiagonal;
	BasisX.z = -BasisZ.x;

	BasisY.x = OffDiagonal;
	BasisY.y = 1.0f - BasisZ.y * BasisZ.y * InvOnePlusZ;
	BasisY.z = -BasisZ.y;
}
// Packs a per-axis scale and a rotation (given as three axis rows) into a uint4.
// Layout produced by the code below:
//   Output.x : hemi-octahedral encoding of Axis[2], 16 bits per component, biased by 32768
//   Output.y : bits 0-14 quantized Spin0 (biased by 16384), bit 15 = bSpinIsX
//   Output.z : Scale.x mantissa in bits 0-15, Scale.y mantissa in bits 16-31 (each biased by 1 << MantissaBits)
//   Output.w : Scale.z mantissa in bits 0-15, shared exponent from bit 16 upwards
// Scale and Axis are by-value copies; the sign flips applied below fold axis orientation into the scale signs.
// DecodeTransform() is the matching decoder.
uint4 EncodeScaleAndRotation(float3 Scale, float3x3 Axis)
{
const uint ExpBits = 8;
const uint ExpBias = ( 1u << (ExpBits - 1) ) - 1;
const uint SignMantissaBits = 16;
// NOTE: SignMantissaMask is not referenced in this function
const uint SignMantissaMask = (1u << SignMantissaBits) - 1;
const uint MantissaBits = SignMantissaBits - 1;
const float Sqrt2 = 1.41421356f;
uint4 Output;
// Rotation
{
// Force the Z axis into the +Z hemisphere, compensating by negating the Z scale
if( Axis[2].z < 0.0f )
{
Axis[2] *= -1.0f;
Scale.z *= -1.0f;
}
float2 OctZ = UnitVectorToHemiOctahedron( Axis[2] );
// Express Axis[0] in the basis generated from the Z axis
float3 BasisX, BasisY;
GetHemiOrthoBasis( BasisX, BasisY, Axis[2] );
float X = dot(Axis[0], BasisX);
float Y = dot(Axis[0], BasisY);
float aX = abs( X );
float aY = abs( Y );
// Spin0 is the smaller-magnitude component and is the one that gets stored;
// Spin1 is the larger one and only its sign is kept (folded into the scale below)
bool bSpinIsX = aX < aY;
float Spin0 = bSpinIsX ? X : Y;
float Spin1 = bSpinIsX ? Y : X;
float Sign1 = Spin1 < 0.0f ? -1.0f : 1.0f;
//Axis[0] *= Sign1;
Scale.x *= Sign1;
Spin0 *= Sign1;
// Fold the handedness of the Y axis (relative to cross(Z, X)) into the sign of Scale.y
float3 GeneratedY = cross(Axis[2], Axis[0]);
Scale.y *= dot( Axis[1], GeneratedY ) < 0.0f ? -Sign1 : Sign1;
// Avoid sign extension in shader by biasing
Output.x = (((int)round( OctZ.x * 32767.0f ) + 32768) & 0xFFFF) << 0;
Output.x |= (((int)round( OctZ.y * 32767.0f ) + 32768) & 0xFFFF) << 16;
// NOTE: Masking the bits with `& 0x7FFF` below causes the whole int to be optimized to 0 on some shader platforms.
// This is okay, as long as Spin0 is in [0, 1], which it should be.
// NOTE(review): because the value is unmasked, it must stay below 1 << 15 so it cannot collide with the bSpinIsX bit set on the next line
Output.y = ((int)round( Spin0 * 16383.0f * Sqrt2 ) + 16384); // & 0x7FFF;
Output.y |= bSpinIsX ? (1u << 15) : 0;
}
// Scale
{
// All three components share one exponent, derived from the largest magnitude
float MaxComponent = max3(abs(Scale.x), abs(Scale.y), abs(Scale.z));
// Raw 8-bit exponent field of the float
uint MaxComponentExponent = (asuint(MaxComponent) & 0x7f800000u) >> 23;
// Need +1 because of losing the implicit leading bit of mantissa
// TODO assumes ExpBits == 8
// TODO clamp to expressible range
uint SharedExp = MaxComponentExponent + 1;
// Build a power-of-two float directly from its exponent bits
float ExpScale = asfloat(((127 + ExpBias + MantissaBits - SharedExp) & 0xFFu) << 23);
if( (uint)round( MaxComponent * ExpScale ) == (1u << MantissaBits) )
{
// Mantissa rounded up
SharedExp++;
ExpScale *= 0.5f;
}
// Signed mantissas are stored biased by 1 << MantissaBits, 16 bits each
Output.z = (((int)round( Scale.x * ExpScale ) + (1u << MantissaBits)) & 0xFFFFu) << 0;
Output.z |= (((int)round( Scale.y * ExpScale ) + (1u << MantissaBits)) & 0xFFFFu) << 16;
Output.w = (((int)round( Scale.z * ExpScale ) + (1u << MantissaBits)) & 0xFFFFu) << 0;
Output.w |= SharedExp << 16;
}
return Output;
}
// Convenience overload: splits a 3x3 transform into per-row lengths (scale) and the rows
// divided by those lengths (axes), then packs them with the (Scale, Axis) overload.
// A zero-length row results in a division by zero; no guard is applied here.
uint4 EncodeScaleAndRotation( float3x3 InTransform )
{
	const float3 Row0 = InTransform[0];
	const float3 Row1 = InTransform[1];
	const float3 Row2 = InTransform[2];

	const float3 RowLengths = float3( length(Row0), length(Row1), length(Row2) );
	const float3x3 UnitRows = float3x3(
		Row0 / RowLengths.x,
		Row1 / RowLengths.y,
		Row2 / RowLengths.z );

	return EncodeScaleAndRotation( RowLengths, UnitRows );
}
// Splits a 4x4 transform into a packed rotation/scale (from the upper 3x3) and a plain
// float3 translation (row 3). Both outputs are fully overwritten.
void EncodeTransform( float4x4 InTransform, inout uint4 OutRotationScale, inout float3 OutTranslation )
{
	const float3x3 Upper3x3 = (float3x3)InTransform;

	OutTranslation = InTransform[3].xyz;
	OutRotationScale = EncodeScaleAndRotation( Upper3x3 );
}
// Rebuilds a float4x4 from a packed rotation/scale (see EncodeScaleAndRotation for the bit layout)
// and a translation.
//   RotationScale : packed uint4
//   Translation   : written to row 3 of the result (with w = 1)
//   Scale         : overwritten with the decoded per-axis signed scale
// Returns the matrix whose rows 0-2 are the decoded axes multiplied by the decoded scale.
float4x4 DecodeTransform( uint4 RotationScale, float3 Translation, inout float3 Scale )
{
float4x4 M = 0.0;
M[3].xyz = Translation;
M[3].w = 1.0;
// Rotation
{
// Extract the raw bit fields: two 16-bit octahedral components and the 15-bit spin value
float3 Rotation =
{
( RotationScale[0] >> 0 ) & 0xffff,
( RotationScale[0] >> 16 ) & 0xffff,
( RotationScale[1] >> 0 ) & 0x7fff
};
// Remove the biases applied by the encoder and rescale
float2 OctZ = ( Rotation.xy - 32768 ) * (1.0f / 32767.0f);
float Spin0 = ( Rotation.z - 16384 ) * (0.70710678f / 16383.0f); // rsqrt(2)
// Bit 15 records which of the X / Y components was stored as Spin0
bool bSpinIsX = RotationScale[1] & 0x8000;
M[2].xyz = HemiOctahedronToUnitVector( OctZ );
float3 BasisX, BasisY;
GetHemiOrthoBasis( BasisX, BasisY, M[2].xyz );
// The other component is reconstructed as the non-negative root; the encoder folds its sign into the scale
float Spin1 = sqrt( 1.0f - Spin0 * Spin0 );
float X = bSpinIsX ? Spin0 : Spin1;
float Y = bSpinIsX ? Spin1 : Spin0;
M[0].xyz = BasisX * X + BasisY * Y;
// The Y axis is regenerated from Z and X; the encoder folds its handedness into the sign of Scale.y
M[1].xyz = cross( M[2].xyz, M[0].xyz );
}
// Scale
{
const uint SignMantissaBits = 16;
const uint SignMantissaMask = (1u << SignMantissaBits) - 1;
const uint MantissaBits = SignMantissaBits - 1;
// Disabled alternative bit layout, kept for reference; the #else branch is the active one
#if 0
uint SharedExp = RotationScale[3] >> 22;
float ExpScale = asfloat( ( SharedExp - MantissaBits ) << 23 );
int3 Mantissa =
{
( RotationScale[2] >> 0 ),
( RotationScale[2] >> 18 ) | ( RotationScale[3] << 14 ),
( RotationScale[3] >> 4 )
};
#else
// Shared exponent lives in the upper 16 bits of RotationScale[3]
uint SharedExp = RotationScale[3] >> 16;
// Build a power-of-two float directly from its exponent bits
float ExpScale = asfloat( ( SharedExp - MantissaBits ) << 23 );
uint3 Mantissa =
{
RotationScale[2] >> 0,
RotationScale[2] >> 16,
RotationScale[3] >> 0
};
#endif
// Keep the low 16 bits of each field, then remove the 1 << MantissaBits bias to recover the signed value
Mantissa &= SignMantissaMask;
Scale = Mantissa;
Scale -= 1u << MantissaBits;
Scale *= ExpScale;
// Apply the scale to each axis row (the w component of these rows is zero)
M[0] *= Scale[0];
M[1] *= Scale[1];
M[2] *= Scale[2];
}
return M;
}
// Helpers to pack/unpack the primitive ID and flags for the specified instance, which are packed together in a uint

// Splits the packed value: the primitive ID occupies the low PRIMITIVE_ID_NUM_BITS bits and
// the instance flags the INSTANCE_SCENE_DATA_FLAGS_NUM_BITS bits directly above it.
void UnpackPrimitiveIdAndInstanceFlags(uint PackedPrimitiveIdAndFlags, inout uint OutPrimitiveId, inout uint OutInstanceFlags)
{
	OutInstanceFlags = BitFieldExtractU32(
		PackedPrimitiveIdAndFlags,
		INSTANCE_SCENE_DATA_FLAGS_NUM_BITS,
		PRIMITIVE_ID_NUM_BITS);

	OutPrimitiveId = BitFieldExtractU32(
		PackedPrimitiveIdAndFlags,
		PRIMITIVE_ID_NUM_BITS,
		0);
}
// Packs the primitive ID (masked with PRIMITIVE_ID_MASK) into the low bits and the instance
// flags into the bits above PRIMITIVE_ID_NUM_BITS. The flags are shifted but not masked.
uint PackPrimitiveIdAndInstanceFlags(uint PrimitiveId, uint InstanceFlags)
{
	const uint IdBits = PrimitiveId & PRIMITIVE_ID_MASK;
	const uint FlagBits = InstanceFlags << PRIMITIVE_ID_NUM_BITS;
	return IdBits | FlagBits;
}
// Loads and unpacks the primitive ID and instance flags of the given instance.
// They are stored in the .x component of the instance's first scene data element.
void LoadInstancePrimitiveIdAndFlags(uint InstanceId, uint SOAStride, inout uint OutPrimitiveId, inout uint OutInstanceFlags)
{
	const uint ElementIndex = 0 * SOAStride + InstanceId;
	const float4 Element = LoadInstanceSceneDataElement(ElementIndex);
	UnpackPrimitiveIdAndInstanceFlags(asuint(Element.x), OutPrimitiveId, OutInstanceFlags);
}
// Helpers to pack/unpack the instance relative ID and custom data count for the specified instance, which are packed together in a uint

// Splits the packed value: the relative ID occupies the low INSTANCE_RELATIVE_ID_NUM_BITS bits
// and the custom data count the INSTANCE_CUSTOM_DATA_COUNT_NUM_BITS bits directly above it.
void UnpackInstanceRelativeIdAndCustomDataCount(uint PackedRelativeIdAndCustomDataCount, inout uint OutRelativeId, inout uint OutCustomDataCount)
{
	OutCustomDataCount = BitFieldExtractU32(
		PackedRelativeIdAndCustomDataCount,
		INSTANCE_CUSTOM_DATA_COUNT_NUM_BITS,
		INSTANCE_RELATIVE_ID_NUM_BITS);

	OutRelativeId = BitFieldExtractU32(
		PackedRelativeIdAndCustomDataCount,
		INSTANCE_RELATIVE_ID_NUM_BITS,
		0);
}
// Packs the relative ID (masked with INSTANCE_RELATIVE_ID_MASK) into the low bits and the
// custom data count into the bits above INSTANCE_RELATIVE_ID_NUM_BITS. The count is shifted but not masked.
uint PackInstanceRelativeIdAndCustomDataCount(uint RelativeId, uint CustomDataCount)
{
	const uint RelativeIdBits = RelativeId & INSTANCE_RELATIVE_ID_MASK;
	const uint CountBits = CustomDataCount << INSTANCE_RELATIVE_ID_NUM_BITS;
	return RelativeIdBits | CountBits;
}
// Loads and unpacks the relative ID and custom data count of the given instance.
// They are stored in the .y component of the instance's first scene data element.
//   OutRelativeId      : receives the instance's relative ID
//   OutCustomDataCount : receives the instance's custom data count
void LoadInstanceRelativeIdAndCustomDataCount(uint InstanceId, uint SOAStride, inout uint OutRelativeId, inout uint OutCustomDataCount)
{
	const uint PackedRelativeIdAndCustomDataCount = asuint(LoadInstanceSceneDataElement(0 * SOAStride + InstanceId).y);
	UnpackInstanceRelativeIdAndCustomDataCount(PackedRelativeIdAndCustomDataCount, OutRelativeId, OutCustomDataCount);
}
// Helpers for getting/setting the instance determinant sign from instance data flags

// Returns -1 when the determinant-sign flag is set in Flags, +1 otherwise.
float GetInstanceDeterminantSignFromFlags(uint Flags)
{
	// Scale.x * Scale.y * Scale.z < 0.0 ? -1.0 : 1.0;
	const bool bNegativeDeterminant = (Flags & INSTANCE_SCENE_DATA_FLAG_DETERMINANT_SIGN) != 0u;
	return CondMask(bNegativeDeterminant, -1.0f, 1.0f);
}
// Sets the determinant-sign flag in Flags when Determinant is negative and clears it otherwise.
// All other flag bits are left untouched.
void SetInstanceDeterminantSignFlag(float Determinant, inout uint Flags)
{
	const bool bNegative = Determinant < 0.0f;
	Flags = bNegative
		? (Flags | INSTANCE_SCENE_DATA_FLAG_DETERMINANT_SIGN)
		: (Flags & ~INSTANCE_SCENE_DATA_FLAG_DETERMINANT_SIGN);
}
// Determine the offsets into the payload data buffer for the given instance
// Each offset is in float4s; sections the instance does not have (per Flags) are reported as
// INVALID_INSTANCE_PAYLOAD_OFFSET. Editor data is only considered when USE_EDITOR_SHADERS is set.
FInstancePayloadDataOffsets GetInstancePayloadDataOffsets(uint PrimitiveId, uint Flags, uint InstanceRelativeId)
{
	// Which optional sections does this instance carry?
	const bool bHasHierarchyOffset		= (Flags & INSTANCE_SCENE_DATA_FLAG_HAS_HIERARCHY_OFFSET) != 0u;
	const bool bHasLocalBounds			= (Flags & INSTANCE_SCENE_DATA_FLAG_HAS_LOCAL_BOUNDS) != 0u;
	const bool bHasDynamicData			= (Flags & INSTANCE_SCENE_DATA_FLAG_HAS_DYNAMIC_DATA) != 0u;
	const bool bHasLightShadowUVBias	= (Flags & INSTANCE_SCENE_DATA_FLAG_HAS_LIGHTSHADOW_UV_BIAS) != 0u;
	const bool bHasCustomData			= (Flags & INSTANCE_SCENE_DATA_FLAG_HAS_CUSTOM_DATA) != 0u;
#if USE_EDITOR_SHADERS
	const bool bHasEditorData			= (Flags & INSTANCE_SCENE_DATA_FLAG_HAS_EDITOR_DATA) != 0u;
#else
	const bool bHasEditorData			= false;
#endif

	// Start of this instance's payload: primitive base offset plus a per-instance stride
	const uint InstanceBase =
		InstanceRelativeId * GetPrimitiveData(PrimitiveId).InstancePayloadDataStride +
		GetPrimitiveData(PrimitiveId).InstancePayloadDataOffset;

	// Offsets are in float4s
	uint Cursor = InstanceBase;
	FInstancePayloadDataOffsets Result;

	// The first three sections share the leading float4(s), so they all start at the same offset:
	// Hierarchy Offset -> float0.x
	// EditorData       -> float0.y
	// LocalBounds      -> float0.zw & float1.xyzw
	Result.HierarchyOffset	= bHasHierarchyOffset	? Cursor : INVALID_INSTANCE_PAYLOAD_OFFSET;
	Result.EditorData		= bHasEditorData		? Cursor : INVALID_INSTANCE_PAYLOAD_OFFSET;
	Result.LocalBounds		= bHasLocalBounds		? Cursor : INVALID_INSTANCE_PAYLOAD_OFFSET;

	// Local bounds spill into a second float4; otherwise one float4 is consumed only if anything was stored
	if (bHasLocalBounds)
	{
		Cursor += 2u;
	}
	else if (bHasHierarchyOffset || bHasEditorData)
	{
		Cursor += 1u;
	}

	Result.DynamicData = INVALID_INSTANCE_PAYLOAD_OFFSET;
	if (bHasDynamicData)
	{
		Result.DynamicData = Cursor;
		Cursor += InstanceTransformSizeFloat4Count;
	}

	Result.LightShadowUVBias = INVALID_INSTANCE_PAYLOAD_OFFSET;
	if (bHasLightShadowUVBias)
	{
		Result.LightShadowUVBias = Cursor;
		Cursor += 1u;
	}

	// Custom data is the last section, so the cursor is not advanced past it
	Result.CustomData = bHasCustomData ? Cursor : INVALID_INSTANCE_PAYLOAD_OFFSET;

	return Result;
}
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
// Fills in the InstanceData members that are derived rather than loaded:
// NonUniformScale / InvNonUniformScale (only on the paths where decoding did not already produce them),
// DeterminantSign (from InstanceData.Flags) and WorldToLocal (from LocalToRelativeWorld and TilePosition).
void ComputeInstanceDerivedData(inout FInstanceSceneData InstanceData, float3 TilePosition, float4x4 LocalToRelativeWorld)
{
	//
	// Do not put any load operations here!
	//

#if (VF_USE_PRIMITIVE_SCENE_DATA == 2) || !INSTANCE_SCENE_DATA_COMPRESSED_TRANSFORMS
	// Non-uniform scale must be computed from the transform because it was not already computed when decoding it (see below in GetInstanceSceneData)
	const float3 AxisLengthSq = float3(
		length2(LocalToRelativeWorld[0].xyz),
		length2(LocalToRelativeWorld[1].xyz),
		length2(LocalToRelativeWorld[2].xyz));

	InstanceData.InvNonUniformScale = rsqrt(AxisLengthSq);
	// |axis|^2 * (1 / |axis|) == |axis|
	InstanceData.NonUniformScale.xyz = AxisLengthSq * InstanceData.InvNonUniformScale;
#endif

	InstanceData.DeterminantSign = GetInstanceDeterminantSignFromFlags(InstanceData.Flags);

	// .w carries the largest of the three per-axis scales
	InstanceData.NonUniformScale.w = max3( InstanceData.NonUniformScale.x, InstanceData.NonUniformScale.y, InstanceData.NonUniformScale.z );

	// Build the inverse transform without a general matrix inverse:
	// scale each axis row by the squared inverse scale, drop the translation, then transpose.
	// NOTE(review): this matches a true inverse only when the three axes are mutually orthogonal (no shear) - confirm with callers.
	const float3 Translation = LocalToRelativeWorld[3].xyz;

	float4x4 RelativeWorldToLocal = LocalToRelativeWorld;
	RelativeWorldToLocal[0].xyz = LocalToRelativeWorld[0].xyz * Pow2(InstanceData.InvNonUniformScale.x);
	RelativeWorldToLocal[1].xyz = LocalToRelativeWorld[1].xyz * Pow2(InstanceData.InvNonUniformScale.y);
	RelativeWorldToLocal[2].xyz = LocalToRelativeWorld[2].xyz * Pow2(InstanceData.InvNonUniformScale.z);
	RelativeWorldToLocal[3].xyz = 0.0f;
	RelativeWorldToLocal = transpose(RelativeWorldToLocal);

	// Inverse translation: the negated original translation pushed through the inverse just built (w = 0, so the translation row is ignored)
	RelativeWorldToLocal[3].xyz = mul(float4(-Translation, 0.0f), RelativeWorldToLocal).xyz;

	InstanceData.WorldToLocal = MakeLWCInverseMatrix(TilePosition, RelativeWorldToLocal);
}
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
// Fetch from scene primitive buffer
FInstanceSceneData GetInstanceSceneData(uint InstanceId, uint SOAStride, bool bCheckValid = true)
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
{
FInstanceSceneData InstanceData = (FInstanceSceneData)0;
//
// NOTE: When changing the packed data layout, ensure that GPUScene/GPUSceneWriter.ush is kept in sync!
// Also, please update the GetInstanceSceneData function in GPUScene.cpp for validation purposes.
//
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
// Only process valid instances
LoadInstancePrimitiveIdAndFlags(InstanceId, SOAStride, InstanceData.PrimitiveId, InstanceData.Flags);
InstanceData.ValidInstance = InstanceData.PrimitiveId != INVALID_PRIMITIVE_ID;
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
// Payload Data Layout
// NOTE: Per-instance local bounds and hierarchy offset are always mutually inclusive, so pack together.
// Random ID <packed inline>
// Custom Data Count <packed inline>
// HierarchyOffset float0.x
// LocalBounds Center float0.yzw
// LocalBounds Extent float1.xyz
// __UNUSED float1.w
#if INSTANCE_SCENE_DATA_COMPRESSED_TRANSFORMS
// Previous Transform[0] float2.xyzw
// Previous Transform[1] float3.xyzw
// LM/SM Scale Bias float4.xyzw
// Custom Data Float4s float5.xyzw ... floatN.xyzw
#else
// Previous Transform[0] float2.xyzw
// Previous Transform[1] float3.xyzw
// Previous Transform[2] float4.xyzw
// LM/SM Scale Bias float5.xyzw
// Custom Data Float4s float6.xyzw ... floatN.xyzw
#endif
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
BRANCH
if (!bCheckValid || InstanceData.ValidInstance)
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
{
uint CustomDataCount;
LoadInstanceRelativeIdAndCustomDataCount(InstanceId, SOAStride, InstanceData.RelativeId, CustomDataCount);
FInstancePayloadDataOffsets Offsets = GetInstancePayloadDataOffsets(InstanceData.PrimitiveId, InstanceData.Flags, InstanceData.RelativeId);
#if ENABLE_PER_INSTANCE_CUSTOM_DATA
InstanceData.CustomDataCount = CustomDataCount;
InstanceData.CustomDataOffset = Offsets.CustomData;
#endif
InstanceData.LastUpdateSceneFrameNumber = asuint(LoadInstanceSceneDataElement(0 * SOAStride + InstanceId).z);
Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
2021-12-03 13:43:00 -05:00
#if 1//USES_PER_INSTANCE_RANDOM
InstanceData.RandomID = LoadInstanceSceneDataElement(0 * SOAStride + InstanceId).w;
#endif
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
float3 TilePosition = GetPrimitiveData(InstanceData.PrimitiveId).TilePosition;
#if INSTANCE_SCENE_DATA_COMPRESSED_TRANSFORMS
uint4 RotationScale = asuint(LoadInstanceSceneDataElement(1 * SOAStride + InstanceId));
float3 Translation = LoadInstanceSceneDataElement(2 * SOAStride + InstanceId).xyz;
float3 Scale = 0;
float4x4 LocalToRelativeWorld = DecodeTransform( RotationScale, Translation, Scale );
uint4 PrevRotationScale = asuint(LoadInstanceSceneDataElement(3 * SOAStride + InstanceId));
float3 PrevTranslation = LoadInstanceSceneDataElement(4 * SOAStride + InstanceId).xyz;
float3 PrevScale = 0;
float4x4 PrevLocalToRelativeWorld = DecodeTransform( PrevRotationScale, PrevTranslation, PrevScale );
InstanceData.NonUniformScale.xyz = abs(Scale);
InstanceData.InvNonUniformScale = rcp(InstanceData.NonUniformScale.xyz);
#else
float4x4 LocalToRelativeWorld = transpose(float4x4(LoadInstanceSceneDataElement(1 * SOAStride + InstanceId),
LoadInstanceSceneDataElement(2 * SOAStride + InstanceId),
LoadInstanceSceneDataElement(3 * SOAStride + InstanceId),
float4(0.0f, 0.0f, 0.0f, 1.0f)));
float4x4 PrevLocalToRelativeWorld = transpose(float4x4(LoadInstanceSceneDataElement(4 * SOAStride + InstanceId),
LoadInstanceSceneDataElement(5 * SOAStride + InstanceId),
LoadInstanceSceneDataElement(6 * SOAStride + InstanceId),
float4(0.0f, 0.0f, 0.0f, 1.0f)));
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
#endif
InstanceData.LocalToWorld = MakeLWCMatrix(TilePosition, LocalToRelativeWorld);
InstanceData.PrevLocalToWorld = MakeLWCMatrix(TilePosition, PrevLocalToRelativeWorld);
ComputeInstanceDerivedData(InstanceData, TilePosition, LocalToRelativeWorld);
InstanceData.NaniteRuntimeResourceID = GetPrimitiveData(InstanceData.PrimitiveId).NaniteResourceID;
InstanceData.NaniteHierarchyOffset = GetPrimitiveData(InstanceData.PrimitiveId).NaniteHierarchyOffset;
BRANCH
if (Offsets.HierarchyOffset != INVALID_INSTANCE_PAYLOAD_OFFSET)
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
{
const uint HierarchyRootOffset = asuint(LoadInstancePayloadDataElement(Offsets.HierarchyOffset)).x;
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
// Combine this instance's hierarchy offset with the primitive's root hierarchy offset
InstanceData.NaniteHierarchyOffset += HierarchyRootOffset;
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
}
#if USE_EDITOR_SHADERS
Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
2021-12-03 13:43:00 -05:00
BRANCH
if (Offsets.EditorData != INVALID_INSTANCE_PAYLOAD_OFFSET)
Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
2021-12-03 13:43:00 -05:00
{
const uint PackedEditorData = asuint(LoadInstancePayloadDataElement(Offsets.EditorData)).y;
Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
2021-12-03 13:43:00 -05:00
InstanceData.EditorData.bIsSelected = (PackedEditorData >> 24u) != 0;
InstanceData.EditorData.HitProxyPacked = PackedEditorData & 0x00FFFFFFu;
InstanceData.EditorData.HitProxyId = UnpackHitProxyId(InstanceData.EditorData.HitProxyPacked);
Make LocalVF ISM/HISM not need any instance attributes except in editor, move remaining data to GPU-Scene. - USE_EDITOR_SHADERS moved MaterialTemplate.ush -> Common.ush to allow use in global shaders as well as in includes that happen earlier in MaterialTemplate.ush - PLATFORM_SUPPORTS_EDITOR_SHADERS default define moved to Platform.ush to enable use in Common.ush - Fetch vertex shader custom data from GPU-Scene whenever GPU-scene instance culling path is used. - Plumb through per-instance editor data to upload in the payload buffer of GPU-Scene - Skip uploading per-instance vertex attributes and creating VF uniform buffer when GPU-Scene is active (ISM) - Fetch all instance attributes from GPU-Scene, custom data and editor data, and only do so for editor platforms (LocalVF) - When doing UpdateInstances only recreate mesh draw commands if the instance count changed. Since we no longer have the per instance render data buffers they no longer are in the MDC and don't need to update. The only thing left in the MDC that needs updating is the instance count. - Skip creating/flushing per-instance render data buffers for ISM proxies. #rb Ola.Olsson, Jason.Nadro #preflight 6171d98395715b0001872c00, 6171e2a94d6efa00018197af #lockdown Michal.Valient #ddcfill: 61729e14c33baf00011f4e8b #ROBOMERGE-OWNER: jason.nadro #ROBOMERGE-AUTHOR: jason.nadro #ROBOMERGE-SOURCE: CL 17896781 via CL 18006634 via CL 18370361 via CL 18370432 #ROBOMERGE-BOT: STARSHIP (Release-Engine-Staging -> Release-Engine-Test) (v895-18170469) [CL 18370505 by jason nadro in ue5-release-engine-test branch]
2021-12-03 13:43:00 -05:00
}
#endif
BRANCH
if (Offsets.LocalBounds != INVALID_INSTANCE_PAYLOAD_OFFSET)
{
InstanceData.LocalBoundsCenter = float3(LoadInstancePayloadDataElement(Offsets.LocalBounds + 0).zw, LoadInstancePayloadDataElement(Offsets.LocalBounds + 1).x);
InstanceData.LocalBoundsExtent = LoadInstancePayloadDataElement(Offsets.LocalBounds + 1).yzw;
}
else
{
InstanceData.LocalBoundsCenter = GetPrimitiveData(InstanceData.PrimitiveId).InstanceLocalBoundsCenter;
InstanceData.LocalBoundsExtent = GetPrimitiveData(InstanceData.PrimitiveId).InstanceLocalBoundsExtent;
}
BRANCH
if (Offsets.DynamicData != INVALID_INSTANCE_PAYLOAD_OFFSET)
{
#if INSTANCE_SCENE_DATA_COMPRESSED_TRANSFORMS
uint4 PrevRotationScale = asuint(LoadInstancePayloadDataElement(Offsets.DynamicData + 0));
float3 PrevTranslation = LoadInstancePayloadDataElement(Offsets.DynamicData + 1).xyz;
float3 PrevScale = 0;
float4x4 PrevLocalToRelativeWorld = DecodeTransform(PrevRotationScale, PrevTranslation, PrevScale);
#else
float4x4 PrevLocalToRelativeWorld = transpose(float4x4(LoadInstancePayloadDataElement(Offsets.DynamicData + 0),
LoadInstancePayloadDataElement(Offsets.DynamicData + 1),
LoadInstancePayloadDataElement(Offsets.DynamicData + 2),
float4(0.0f, 0.0f, 0.0f, 1.0f)));
#endif
float3 TilePosition = GetPrimitiveData(InstanceData.PrimitiveId).TilePosition;
InstanceData.PrevLocalToWorld = MakeLWCMatrix(TilePosition, PrevLocalToRelativeWorld);
}
else
{
#if INSTANCE_SCENE_DATA_COMPRESSED_TRANSFORMS
// TODO: Temporary PrevVelocityHack
uint4 PrevRotationScale = asuint(LoadInstanceSceneDataElement(3 * SOAStride + InstanceId));
float3 PrevTranslation = LoadInstanceSceneDataElement(4 * SOAStride + InstanceId).xyz;
float3 PrevScale = 0;
float4x4 PrevLocalToRelativeWorld = DecodeTransform(PrevRotationScale, PrevTranslation, PrevScale);
#else
float4x4 PrevLocalToRelativeWorld = InstanceData.LocalToWorld;
#endif
float3 TilePosition = GetPrimitiveData(InstanceData.PrimitiveId).TilePosition;
InstanceData.PrevLocalToWorld = MakeLWCMatrix(TilePosition, PrevLocalToRelativeWorld);
}
#if 1 //NEEDS_LIGHTMAP_COORDINATE
BRANCH
if (Offsets.LightShadowUVBias != INVALID_INSTANCE_PAYLOAD_OFFSET)
{
InstanceData.LightMapAndShadowMapUVBias = LoadInstancePayloadDataElement(Offsets.LightShadowUVBias);
}
#endif
More instance data memory and CPU optimizations, and general refactoring. * Moved Nanite resource ID and hierarchy offset back to primitive data (indirection cost is largely noise these days, and memory savings is more important - GetInstanceData() fetches this data and stores in the pre-existing registers so all the callsites don't have to worry about it). * Removed FNaniteInfo from FPrimitiveInfo (and the type itself) in favor of just having a 32bit per-instance hierarchy root offset. Hierarchy offset and runtime resource ID just sourced from primitive. * Removed PrimitiveID from CPU instance data (GPU still needs it). This value was redundantly stored per-instance, and now the GPU Scene upload just blits it over from primitive when the instance data is uploaded. * Added new virtual GetNaniteResourceInfo on FPrimitiveSceneProxy that retrieves the Nanite resource ID and hierarchy offset stored on a given primitive (needed now that FPrimitiveInstance doesn't cache a copy). The non-virtual FPrimitiveSceneProxy::IsNaniteMesh() helper is used to avoid this virtual function call for non-Nanite proxies. * Removed deprecated and no longer referenced Nanite::FResources array on FNaniteGeometryCollectionSceneProxy * Removed expensive per-instance checks() in a few spots * Stubbed out upcoming per instance payload allocator offset * Removed deprecated and never referenced non-GPUScene GetInstanceData uniform buffer path * Significantly cleaned up and optimized a lot of ISM instance data retrieval, including avoiding unnecessary FRenderTransform <-> FMatrix conversions. * Removed GetInstanceShaderValues API, it was inefficient and annoying to require retrieval of all parameters when often only one or two of them were ever used. In some cases even the origin was unused but .w was needed because of the per-instance random packing. 
* Implemented explicit GetInstanceRandomID() accessor to fetch (under current packing) Origin.W without exposing this to the callsites * Implemented public API for GetInstanceLightMapData(), since the internal version was only accessible by GetInstanceShaderValues() * Renamed GetInstanceShaderCustomDataValues() -> GetInstanceCustomDataValues() for API consistency #rb jason.nadro, krzysztof.narkowicz [FYI] rune.stubbe, brian.karis #ROBOMERGE-OWNER: graham.wihlidal #ROBOMERGE-AUTHOR: graham.wihlidal #ROBOMERGE-SOURCE: CL 16583156 #ROBOMERGE-BOT: (v828-16531559) #ROBOMERGE-CONFLICT from-shelf [CL 16583193 by graham wihlidal in ue5-main branch]
2021-06-07 23:55:28 -04:00
}
return InstanceData;
}
// Bundle of scene data resolved once per vertex-factory invocation by GetSceneDataIntermediates().
struct FSceneDataIntermediates
{
// ID of the primitive that owns the instance (copied from InstanceData.PrimitiveId in the GPU-Scene path).
uint PrimitiveId;
// Index of the instance whose scene data was loaded.
uint InstanceId;
// Index of the view within the current render pass; 0 when the instance ID offset is treated as a primitive ID.
uint ViewIndex;
// Index from which we load the instance info (InstanceIdOffset + DrawInstanceId in the GPU-Scene path).
uint InstanceIdLoadIndex;
// Instance scene data loaded for InstanceId.
FInstanceSceneData InstanceData;
// Primitive scene data of the owning primitive.
FPrimitiveSceneData Primitive;
};
/**
* Loads the scene data once, given the required inputs.
* InstanceIdOffset - supplied as a vertex stream with 0 instance step rate (constant for all instances)
* DrawInstanceId - the instance ID (SV_InstanceID) in the current draw
*/
#if (VF_USE_PRIMITIVE_SCENE_DATA == 1)
FSceneDataIntermediates GetSceneDataIntermediates(uint InstanceIdOffset, uint DrawInstanceId)
{
	FSceneDataIntermediates Result = (FSceneDataIntermediates)0;
	Result.InstanceIdLoadIndex = InstanceIdOffset + DrawInstanceId;

	// GPUCULL_TODO: workaround for the fact that DrawDynamicMeshPassPrivate et al. don't work with GPU-Scene instancing
	//               instead they mark the top bit in the primitive ID and disable auto instancing such that there is an 1:1:1
	//               drawcmd:primitive:instance. Then we can just look up the primitive and fetch the instance data index.
	// GPUCULL_TODO: Workaround also used for ray tracing interfacing with the VFs, that also supply a DrawInstanceId.
	const bool bOffsetIsPrimitiveId = (InstanceIdOffset & VF_TREAT_INSTANCE_ID_OFFSET_AS_PRIMITIVE_ID_FLAG) != 0U;

	if (!bOffsetIsPrimitiveId)
	{
		// Regular path: each entry of the instance ID buffer packs the instance ID in the low 28 bits
		// and the view index in the top four bits (usable for instanced stereo or other multi-view).
		// Note: the view index refers to the views of this render pass, not the view ID in the culling manager.
		const uint ViewIndexShift = 28U;
		const uint InstanceIdMask = (1U << ViewIndexShift) - 1U;
		const uint PackedEntry = InstanceCulling.InstanceIdsBuffer[Result.InstanceIdLoadIndex];

		Result.InstanceId = PackedEntry & InstanceIdMask;
		Result.ViewIndex = PackedEntry >> ViewIndexShift;
	}
	else
	{
		// Dynamic draw passes mark the primitive ID with the top bit: strip the flag, then offset into
		// the primitive's range of instance scene data.
		const uint FlagMask = VF_TREAT_INSTANCE_ID_OFFSET_AS_PRIMITIVE_ID_FLAG - 1U;
		const uint PrimitiveID = InstanceIdOffset & FlagMask;

		Result.InstanceId = GetPrimitiveData(PrimitiveID).InstanceSceneDataOffset + DrawInstanceId;
		Result.ViewIndex = 0;
	}

	// Resolve the instance first; the owning primitive is looked up from the instance's primitive ID.
	Result.InstanceData = GetInstanceSceneData(Result.InstanceId, View.InstanceSceneDataSOAStride);
	Result.PrimitiveId = Result.InstanceData.PrimitiveId;
	Result.Primitive = GetPrimitiveData(Result.PrimitiveId);

	return Result;
}
#elif (VF_USE_PRIMITIVE_SCENE_DATA == 2)
// Must match PackLocalBoundsCenter and PackLocalBoundsExtent
float3 UnpackLocalBoundsCenter(float2 PackedCenter)
{
	// Uses 21 bits for each component, rounded to a 1 unit.
	// X and Y are read from the bits above the low 11 of each packed word;
	// Z is reassembled from the low 11 bits of both words.
	const uint LowBitsMask = (1u << 11u) - 1u;
	// Bias subtracted from every unpacked component.
	const float Bias = (1u << 20u) - 1u;

	const uint WordX = asuint(PackedCenter.x);
	const uint WordY = asuint(PackedCenter.y);
	const uint BitsZ = (WordX & LowBitsMask) | ((WordY & LowBitsMask) << 11u);

	return float3(float(WordX >> 11u) - Bias,
				  float(WordY >> 11u) - Bias,
				  float(BitsZ) - Bias);
}
float3 UnpackLocalBoundsExtent(float2 PackedExtent)
{
	// Uses 21 bits for each component, rounded to a 1 unit.
	// X and Y are read from the bits above the low 11 of each packed word;
	// Z is reassembled from the low 11 bits of both words. No bias is applied to the extent.
	const uint LowBitsMask = (1u << 11u) - 1u;

	const uint WordX = asuint(PackedExtent.x);
	const uint WordY = asuint(PackedExtent.y);
	const uint BitsZ = (WordX & LowBitsMask) | ((WordY & LowBitsMask) << 11u);

	return float3(float(WordX >> 11u),
				  float(WordY >> 11u),
				  float(BitsZ));
}
FSceneDataIntermediates GetSceneDataIntermediates(uint DrawInstanceId, float4 InstanceOrigin, float4 InstanceTransform1, float4 InstanceTransform2, float4 InstanceTransform3, float4 InstanceAuxData)
{
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
// Not all mobile devices can access storage buffers from a vertex shader,
// so we supply some of the primitive data using per-instance vertex data, and the rest of the primitive data comes from the Primitive UB.
// If the vertex shader uses any of the Primitive UB data, the associated draw calls will not auto-instance.
FPrimitiveSceneData Primitive = GetPrimitiveDataFromUniformBuffer();
// TODO: add support for LWC, we should pack it as integer tile coordinates
float3 TilePosition = float3(0,0,0);
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
// TODO: pack important primitive and instance flags here
const uint PrimitiveFlags = asuint(InstanceTransform1.w);
const uint InstanceFlags = (PrimitiveFlags >> 16);
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
// Reconstruct InstanceData from the packed per-instance data
FInstanceSceneData InstanceData = (FInstanceSceneData)0;
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
float4x4 LocalToRelativeWorld = float4x4(
float4(InstanceTransform1.xyz, 0.0f),
float4(InstanceTransform2.xyz, 0.0f),
float4(InstanceTransform3.xyz, 0.0f),
float4(InstanceOrigin.xyz, 1.0f));
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
InstanceData.LocalToWorld = MakeLWCMatrix(TilePosition, LocalToRelativeWorld);
InstanceData.Flags = InstanceFlags;
InstanceData.PrimitiveId = asuint(InstanceOrigin.w);
#if USES_PER_INSTANCE_RANDOM
InstanceData.RandomID = InstanceTransform3.w;
#endif
ComputeInstanceDerivedData(InstanceData, TilePosition, LocalToRelativeWorld);
//
FSceneDataIntermediates Intermediates = (FSceneDataIntermediates)0;
Intermediates.InstanceData = InstanceData;
Intermediates.PrimitiveId = InstanceData.PrimitiveId;
Intermediates.InstanceId = 0;
Intermediates.ViewIndex = 0;
Intermediates.InstanceIdLoadIndex = 0;
Merging Dev-LWCRendering into Main, this includes initial work to support rendering with LWC-scale position Basic approach is to add HLSL types FLWCScalar, FLWCMatrix, FLWCVector, etc. Inside shaders, absolute world space position values should be represented as FLWCVector3. Matrices that transform *into* absolute world space become FLWCMatrix. Matrices that transform *from* world space become FLWCInverseMatrix. Generally LWC values work by extending the regular 'float' value with an additional tile coordinate. Final tile size will be a trade-off between scale/accuracy; I'm using 256k for now, but may need to be adjusted. Value represented by a FLWCVector thus becomes V.Tile * TileSize + V.Offset. Most operations can be performed directly on LWC values. There are HLSL functions like LWCAdd, LWCSub, LWCMultiply, LWCDivide (operator overloading would be really nice here). The goal is to stay with LWC values for as long as needed, then convert to regular float values when possible. One thing that comes up a lot is working in translated (rather than absolute) world space. WorldSpace + View.PrevPreViewTranslation = TranslatedWorldspace. Except 'View.PrevPreViewTranslation' is now a FLWCVector3, and WorldSpace quantities should be as well. So that becomes LWCAdd(WorldSpace, View.PrevPreViewTranslation) = TranslatedWorldspace. Assuming that we're talking about a position that's "reasonably close" to the camera, it should be safe to convert the translated WS value to float. The 'tile' coordinate of the 2 LWC values should cancel out when added together in this case. I've done some work throughout the shader code to do this. Materials are fully supporting LWC-values as well. Projective texturing and vertex animation materials that I've tested work correctly even when positioned "far away" from the origin. Lots of work remains to fully convert all of our shader code. There's a function LWCHackToFloat(), which is a simple wrapper for LWCToFloat(). 
The idea of HackToFloat is to mark places that need further attention, where I'm simply converting absolute WS positions to float, to get shaders to compile. Shaders converted in this way should continue to work for all existing content (without LWC-scale values), but they will break if positions get too large. General overview of changed files: LargeWorldCoordinates.ush - This defines the FLWC types and operations GPUScene.cpp, SceneData.ush - Primitives add an extra 'float3' tile coordinate. Instance data is unchanged, so instances need to stay within single-precision range of the primitive origin. Could potentially split instances behind the scenes (I think) if we don't want this limitation HLSLMaterialDerivativeAutogen.cpp, HLSLMaterialTranslator.cpp, Preshader.cpp - Translated materials to use LWC values SceneView.cpp, SceneRelativeViewMatrices.cpp, ShaderCompiler.cpp, InstancedStereo.ush - View uniform buffer includes LWC values where appropriate #jira UE-117101 #rb arne.schober, Michael.Galetzka #ROBOMERGE-AUTHOR: ben.ingram #ROBOMERGE-SOURCE: CL 17787435 in //UE5/Main/... #ROBOMERGE-BOT: STARSHIP (Main -> Release-Engine-Test) (v881-17767770) [CL 17787478 by ben ingram in ue5-release-engine-test branch]
2021-10-12 13:31:00 -04:00
Intermediates.Primitive = Primitive;
// Primitive data that comes from per-instance vertex data
Intermediates.Primitive.Flags = PrimitiveFlags;
Intermediates.Primitive.LocalToWorld = Intermediates.InstanceData.LocalToWorld;
Intermediates.Primitive.InvNonUniformScale = Intermediates.InstanceData.InvNonUniformScale;
Intermediates.Primitive.WorldToLocal = Intermediates.InstanceData.WorldToLocal;
Intermediates.Primitive.NonUniformScale = Intermediates.InstanceData.NonUniformScale;
#if ALLOW_STATIC_LIGHTING
Intermediates.Primitive.LightmapDataIndex = asuint(InstanceTransform2.w);
InstanceData.LightMapAndShadowMapUVBias = float4(
UnpackSnorm2x16(asuint(InstanceAuxData.y)),
UnpackSnorm2x16(asuint(InstanceAuxData.w)));
#else
InstanceData.LocalBoundsCenter = UnpackLocalBoundsCenter(InstanceAuxData.xy);
InstanceData.LocalBoundsExtent = UnpackLocalBoundsExtent(InstanceAuxData.zw);
Intermediates.Primitive.LocalObjectBoundsMin = InstanceData.LocalBoundsCenter - InstanceData.LocalBoundsExtent;
Intermediates.Primitive.LocalObjectBoundsMax = InstanceData.LocalBoundsCenter + InstanceData.LocalBoundsExtent;
float3 ObjectRelativeWorldPosition = mul(float4(InstanceData.LocalBoundsCenter.xyz, 0.0f), LocalToRelativeWorld).xyz;
Intermediates.Primitive.ObjectWorldPosition = MakeLWCVector3(TilePosition, ObjectRelativeWorldPosition);
Intermediates.Primitive.ObjectRadius = length(InstanceData.LocalBoundsExtent * InstanceData.NonUniformScale.xyz);
#endif
return Intermediates;
}
#else
// Builds the scene data intermediates for the path where no per-instance scene data is available.
// All primitive data is read from the Primitive uniform buffer, and the instance data is filled in
// from that same primitive data. View, primitive and instance indices are all zero on this path.
FSceneDataIntermediates GetSceneDataIntermediates()
{
	// Single source of truth for this path: the Primitive uniform buffer
	const FPrimitiveSceneData PrimitiveData = GetPrimitiveDataFromUniformBuffer();

	// Mirror the primitive's transforms, scale and bounds into the instance data
	FInstanceSceneData PerInstance = (FInstanceSceneData)0;
	PerInstance.LocalToWorld       = PrimitiveData.LocalToWorld;
	PerInstance.PrevLocalToWorld   = PrimitiveData.PreviousLocalToWorld;
	PerInstance.WorldToLocal       = PrimitiveData.WorldToLocal;
	PerInstance.NonUniformScale    = PrimitiveData.NonUniformScale;
	PerInstance.InvNonUniformScale = PrimitiveData.InvNonUniformScale;
	PerInstance.DeterminantSign    = GetPrimitive_DeterminantSign_FromFlags(PrimitiveData.Flags);
	// Center/extent form of the primitive's local object bounds (min/max form)
	PerInstance.LocalBoundsCenter  = 0.5f * (PrimitiveData.LocalObjectBoundsMin + PrimitiveData.LocalObjectBoundsMax);
	PerInstance.LocalBoundsExtent  = 0.5f * (PrimitiveData.LocalObjectBoundsMax - PrimitiveData.LocalObjectBoundsMin);
	PerInstance.ValidInstance      = true;

	// Assemble the result; any field not written below stays zero-initialized
	FSceneDataIntermediates Result = (FSceneDataIntermediates)0;
	Result.ViewIndex    = 0U;
	Result.PrimitiveId  = 0U;
	Result.InstanceId   = 0U;
	Result.Primitive    = PrimitiveData;
	Result.InstanceData = PerInstance;
	return Result;
}
#endif //VF_USE_PRIMITIVE_SCENE_DATA