Files
UnrealEngineUWP/Engine/Shaders/Private/DistanceFieldObjectCulling.usf
Daniel Wright 697b5b9cf6 Sparse, narrow band, streamed Mesh Signed Distance Fields
* SDFs are now generated, allocated from the atlas and uploaded in 8^3 bricks (7^3 unique data, half voxel padding).
 * Tracing must load the brick index from the indirection table, and only bricks near the surface are stored
 * 3 mips are now generated, with the lowest resolution always loaded and the other 2 streamed
 * SDFs are now G8 narrow band.  Lower resolution mips must be traversed when querying distance to nearest surface far away from the surface
 * The Distance Field Brick Atlas is now stored for each FScene and dynamically resized based on needs with a GPU memcopy
 * Brick atlas uses a 1d pooled allocator which has no fragmentation and greatly reduces packing waste over the 3d allocator
 * Added new indirection for Distance Field Asset data, so that only a single entry needs to be updated when a mip is streamed in or out in scenes with millions of instances
 * Compute shaders operating on distance field instances generate streaming requests, which are async read back to CPU, turned into IO requests, which are polled and when complete uploaded to atlases
 * Any mesh instance inside the Global SDF extent (200m) requests mip1, and at 50m requests mip2
 * Now using a batched compute scatter to upload to the distance field atlas instead of RHIUpdateTexture3d, to bypass alignment restrictions and per-upload overhead
 * Distance Field streaming uses an async task to move Memcpy and IO request overhead off of the Rendering Thread
 * Distance Field Visualization now computes a normal from the SDF gradient and does simple lighting to better visualize the scene representation
 * Increased r.DistanceFields.MaxPerMeshResolution from 128 to 512, to better represent large objects
 * Mesh SDF generation now uses an Embree point query to calculate closest unsigned distance, and then a much smaller set of rays to count backfaces for negative region determination, for an 11x speedup
 * Upgraded mesh utilities to Embree 3.12.2 to get point queries
 * Fixed wrong transform used for SDF normals in Lumen, causing non-uniformly scaled meshes to have incorrect Surface Cache interpolation
 * Fixed Static Mesh materials not getting PostLoaded before SDF build, causing their blend modes to be wrong for the build, which corrupts the DDC.  Also included those blend modes in the DDC key.

Original costs on 1080 GTX (full updates on everything and no screen traces)
10.60ms   UpdateGlobalDistanceField
3.62ms   LumenReflectiveTest.DirectionalLight_1 Shadowmap 1
1.73ms   VoxelizeCards Clipmaps=[0,1,2,3]
0.38ms   TraceCards 1 dispatch 1 groups
0.51ms   TraceCards 1 dispatch 1 groups

Sparse SDF costs
12.06ms   UpdateGlobalDistanceField
4.35ms   LumenReflectiveTest.DirectionalLight_1 Shadowmap 1
2.30ms   VoxelizeCards Clipmaps=[0,1,2,3]
0.69ms   TraceCards 1 dispatch 1 groups
0.77ms   TraceCards 1 dispatch 1 groups

Tested: TopazEntry PC, Reverb PC and PS5, EngineTests, QAGame, Rift, Frosty P_Construct_WP, FortGPUTestbed

#rb Krzysztof.Narkowicz

#ROBOMERGE-OWNER: Daniel.Wright
#ROBOMERGE-AUTHOR: daniel.wright
#ROBOMERGE-SOURCE: CL 15784493 in //UE5/Release-5.0-EarlyAccess/...
#ROBOMERGE-BOT: STARSHIP (Release-5.0-EarlyAccess -> Main) (v783-15756269)
#ROBOMERGE-CONFLICT from-shelf

[CL 15790658 by Daniel Wright in ue5-main branch]
2021-03-23 22:40:05 -04:00

422 lines
16 KiB
Plaintext

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
DistanceFieldObjectCulling.usf
=============================================================================*/
#include "Common.ush"
#include "DeferredShadingCommon.ush"
#include "DistanceFieldLightingShared.ush"
#include "DistanceFieldAOShared.ush"
#include "GlobalDistanceFieldShared.ush"
// Written to slot 0 (IndexCount) of RWObjectIndirectArguments by CullObjectsForViewCS.
uint ObjectBoundingGeometryIndexCount;
// Number of objects in the current thread group that passed culling; reset by thread 0 of the group and incremented atomically.
groupshared uint NumGroupObjects;
// First output slot for this group's surviving objects: the value RWObjectIndirectArguments[1] held before the group's InterlockedAdd.
groupshared uint GroupBaseIndex;
// Scene object indices of this group's survivors, stored at the slots handed out by the NumGroupObjects atomic.
groupshared uint GroupObjectIndices[UPDATEOBJECTS_THREADGROUP_SIZE];
/**
 * Copies one scene object into the culled object buffers.
 *
 * @param DestIndex		Slot in RWCulledObjectBounds / RWCulledObjectData to write.
 * @param SourceIndex	Object slot in SceneObjectBounds / SceneObjectData to read.
 */
void CopyCulledObjectData(uint DestIndex, uint SourceIndex)
{
	const uint SourceDataBase = SourceIndex * OBJECT_DATA_STRIDE;
	const uint DestDataBase = DestIndex * CULLED_OBJECT_DATA_STRIDE;

	// Bounds: only the first vector of the source bounds record is carried over
	RWCulledObjectBounds[DestIndex] = SceneObjectBounds[SourceIndex * OBJECT_BOUNDS_STRIDE + 0];

	// Data: only the first CULLED_OBJECT_DATA_STRIDE vectors of the original object data are copied
	UNROLL
	for (uint Offset = 0; Offset < CULLED_OBJECT_DATA_STRIDE; Offset++)
	{
		RWCulledObjectData[DestDataBase + Offset] = SceneObjectData[SourceDataBase + Offset];
	}
}
/**
 * Culls scene distance field objects for the view and compacts the survivors into the culled object buffers.
 * One thread handles one scene object (ObjectIndex = DispatchThreadId.x).
 *
 * With USE_FRUSTUM_CULLING the work is done in three phases separated by group barriers:
 *  1. Each thread tests its object and, if visible, claims a slot in the groupshared list.
 *  2. Thread 0 of the group reserves a contiguous output range with a single global atomic.
 *  3. The first NumGroupObjects threads copy the surviving objects into that range.
 */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void CullObjectsForViewCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint ObjectIndex = DispatchThreadId.x;

#define USE_FRUSTUM_CULLING 1
#if USE_FRUSTUM_CULLING

	const uint LocalThread = GroupThreadId.x;
	const bool bFirstThreadInGroup = (LocalThread == 0);

	if (DispatchThreadId.x == 0)
	{
		// RWObjectIndirectArguments is zeroed by a clear before this shader, only need to set things that are non-zero (and are not read by this shader as that would be a race condition)
		// IndexCount, NumInstances, StartIndex, BaseVertexIndex, FirstInstance
		RWObjectIndirectArguments[0] = ObjectBoundingGeometryIndexCount;
	}

	if (bFirstThreadInGroup)
	{
		NumGroupObjects = 0;
	}

	// The counter reset must be visible before any thread claims a slot
	GroupMemoryBarrierWithGroupSync();

	if (ObjectIndex < NumSceneObjects)
	{
		FDFObjectBounds Bounds = LoadDFObjectBounds(ObjectIndex);
		float ViewDistanceSq = dot(View.WorldCameraOrigin - Bounds.Center, View.WorldCameraOrigin - Bounds.Center);

		// Coarse rejection from the bounding sphere alone
		const bool bWithinAODistance = ViewDistanceSq < Square(AOMaxViewDistance + Bounds.SphereRadius);
		const bool bTouchesFrustum = ViewFrustumIntersectSphere(Bounds.Center, Bounds.SphereRadius + AOObjectMaxDistance);

		if (bWithinAODistance && bTouchesFrustum)
		{
			FDFObjectData ObjectData = LoadDFObjectData(ObjectIndex);

			// Draw distance limits are compared against the squared view distance; a limit below 0.0001 is treated as "no limit"
			const bool bPastMinDrawDistance = ObjectData.MinMaxDrawDistance2.x < 0.0001 || ViewDistanceSq > ObjectData.MinMaxDrawDistance2.x;
			const bool bBeforeMaxDrawDistance = ObjectData.MinMaxDrawDistance2.y < 0.0001 || ViewDistanceSq < ObjectData.MinMaxDrawDistance2.y;

			if (bPastMinDrawDistance && bBeforeMaxDrawDistance)
			{
				uint GroupSlot;
				InterlockedAdd(NumGroupObjects, 1U, GroupSlot);
				GroupObjectIndices[GroupSlot] = ObjectIndex;
			}
		}
	}

	// All survivors of this group must be recorded before the output range is reserved
	GroupMemoryBarrierWithGroupSync();

	if (bFirstThreadInGroup)
	{
		// Slot 1 (NumInstances) doubles as the global survivor counter
		InterlockedAdd(RWObjectIndirectArguments[1], NumGroupObjects, GroupBaseIndex);
	}

	// GroupBaseIndex must be visible to every thread that copies
	GroupMemoryBarrierWithGroupSync();

	if (LocalThread < NumGroupObjects)
	{
		CopyCulledObjectData(GroupBaseIndex + LocalThread, GroupObjectIndices[LocalThread]);
	}

#else

	if (DispatchThreadId.x == 0)
	{
		// IndexCount, NumInstances, StartIndex, BaseVertexIndex, FirstInstance
		RWObjectIndirectArguments[0] = ObjectBoundingGeometryIndexCount;
		RWObjectIndirectArguments[1] = NumSceneObjects;
	}

	GroupMemoryBarrierWithGroupSync();

	if (ObjectIndex < NumSceneObjects)
	{
		// No culling: every object keeps its own slot
		CopyCulledObjectData(ObjectIndex, ObjectIndex);
	}

#endif
}
/** Min and Max depth for this tile, stored as the uint bit pattern of the float depth so InterlockedMin / InterlockedMax can be used on them. */
groupshared uint IntegerTileMinZ;
groupshared uint IntegerTileMaxZ;
/** Inner Min and Max depth for this tile: the min of depths at or beyond the tile's half depth, and the max of depths at or before it. */
groupshared uint IntegerTileMinZ2;
groupshared uint IntegerTileMaxZ2;
/** View rect min in xy, max in zw. */
uint4 ViewDimensions;
/** Number of tiles in x and y. Used as the divisor when computing tile corners and as the row pitch when computing a linear tile index. */
float2 NumGroups;
/** Per-tile output of BuildTileConesMain: cone axis in xyz, cosine of the cone angle in w. */
RWBuffer<float4> RWTileConeAxisAndCos;
/** Per-tile output of BuildTileConesMain: two depth ranges along the cone axis, xy = (min, inner max) and zw = (inner min, max). */
RWBuffer<float4> RWTileConeDepthRanges;
/**
 * Builds tile depth ranges and bounding cones.
 *
 * One thread group handles one tile and one thread handles one downsampled depth sample.
 * Depth bounds are reduced with groupshared atomics on the uint bit pattern of the depth,
 * then thread 0 derives the tile's bounding cone and writes one entry to
 * RWTileConeAxisAndCos and RWTileConeDepthRanges.
 *
 * Every tile produces two depth ranges, split at the midpoint of the full range:
 * xy = (MinTileZ, MaxTileZ2) and zw = (MinTileZ2, MaxTileZ), both remapped to distance along the cone axis.
 */
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void BuildTileConesMain(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint ThreadIndex = GroupThreadId.y * THREADGROUP_SIZEX + GroupThreadId.x;

	float2 BaseLevelScreenUV = (DispatchThreadId.xy + float2(.5f, .5f)) * DOWNSAMPLE_FACTOR * View.BufferSizeAndInvSize.zw;
	float SceneDepth = GetDownsampledDepth(BaseLevelScreenUV);

	//float2 ScreenUV = (DispatchThreadId.xy * DOWNSAMPLE_FACTOR + View.ViewRectMin.xy + float2(.5f, .5f)) * View.BufferSizeAndInvSize.zw;
	//float SceneDepth = CalcSceneDepth(ScreenUV);

	// Initialize per-tile variables
	if (ThreadIndex == 0)
	{
		// 0x7F7FFFFF is the bit pattern of the largest finite float
		IntegerTileMinZ = 0x7F7FFFFF;
		IntegerTileMaxZ = 0;
		IntegerTileMinZ2 = 0x7F7FFFFF;
		IntegerTileMaxZ2 = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	// Use shared memory atomics to build the depth bounds for this tile
	// Each thread is assigned to a pixel at this point
	if (SceneDepth < AOMaxViewDistance)
	{
		InterlockedMin(IntegerTileMinZ, asuint(SceneDepth));
		InterlockedMax(IntegerTileMaxZ, asuint(SceneDepth));
	}

	GroupMemoryBarrierWithGroupSync();

	float MinTileZ = asfloat(IntegerTileMinZ);
	float MaxTileZ = asfloat(IntegerTileMaxZ);
	float HalfZ = .5f * (MinTileZ + MaxTileZ);

	// Compute a second min and max Z, clipped by HalfZ, so that we get two depth bounds per tile
	// This results in more conservative tile depth bounds and fewer intersections
	if (SceneDepth >= HalfZ && SceneDepth < AOMaxViewDistance)
	{
		InterlockedMin(IntegerTileMinZ2, asuint(SceneDepth));
	}

	if (SceneDepth <= HalfZ)
	{
		InterlockedMax(IntegerTileMaxZ2, asuint(SceneDepth));
	}

	GroupMemoryBarrierWithGroupSync();

	float MinTileZ2 = asfloat(IntegerTileMinZ2);
	float MaxTileZ2 = asfloat(IntegerTileMaxZ2);

	if (ThreadIndex == 0)
	{
		float3 TileConeAxis;
		float TileConeAngleCos;
		float TileConeAngleSin;
		float4 ConeAxisDepthRanges;

		{
			float2 ViewSize = float2(1 / View.ViewToClip[0][0], 1 / View.ViewToClip[1][1]);

			// Tile edges on the view space Z = 1 plane. Each edge is shared by two corners, so compute it once.
			float TileX0 = (GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x;
			float TileX1 = (GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x;
			float TileY0 = ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2;
			float TileY1 = ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2;

			float3 TileCorner00 = normalize(float3(TileX0, TileY0, 1));
			float3 TileCorner10 = normalize(float3(TileX1, TileY0, 1));
			float3 TileCorner01 = normalize(float3(TileX0, TileY1, 1));
			float3 TileCorner11 = normalize(float3(TileX1, TileY1, 1));

			// Cone axis is the normalized sum of the corner directions; the cone angle is measured to corner 00
			TileConeAxis = normalize(TileCorner00 + TileCorner10 + TileCorner01 + TileCorner11);
			TileConeAngleCos = dot(TileConeAxis, TileCorner00);
			TileConeAngleSin = sqrt(1 - TileConeAngleCos * TileConeAngleCos);

			float TileConeAngleTan = TileConeAngleSin / TileConeAngleCos;
			float ConeExpandDistance = 0;
			// Distance the cone vertex would have to be pulled back along the axis to contain potential samples.
			// It is 0 while ConeExpandDistance is 0, so the cone vertex stays at the view origin and is not stored;
			// ObjectCullPS relies on this by using a zero cone vertex.
			float VertexPullbackLength = ConeExpandDistance / TileConeAngleTan;
			float DistanceToNearPlane = length(TileConeAxis / TileConeAxis.z * View.NearPlane);

			// 1 / cos(AngleBetweenTileCenterAndViewForward)
			float InvCosTileAngle = 1.0f / TileConeAxis.z;
			float ConeAxisDistanceMultiply = InvCosTileAngle;
			float ConeAxisDistanceAdd = VertexPullbackLength + DistanceToNearPlane;

			// Remap view space depths to distances along the cone axis
			ConeAxisDepthRanges.x = ConeAxisDistanceMultiply * (MinTileZ - ConeExpandDistance) + ConeAxisDistanceAdd;
			ConeAxisDepthRanges.y = ConeAxisDistanceMultiply * (MaxTileZ2 + ConeExpandDistance) + ConeAxisDistanceAdd;
			ConeAxisDepthRanges.z = ConeAxisDistanceMultiply * (MinTileZ2 - ConeExpandDistance) + ConeAxisDistanceAdd;
			ConeAxisDepthRanges.w = ConeAxisDistanceMultiply * (MaxTileZ + ConeExpandDistance) + ConeAxisDistanceAdd;
		}

		uint TileIndex = GroupId.y * NumGroups.x + GroupId.x;

		if (IntegerTileMinZ > IntegerTileMaxZ)
		{
			// Guard against IntegerTileMinZ never getting updated (no sample in the tile was closer than AOMaxViewDistance)
			RWTileConeAxisAndCos[TileIndex] = float4(0, 0, 0, 1);
			RWTileConeDepthRanges[TileIndex] = 0;
		}
		else
		{
			RWTileConeAxisAndCos[TileIndex] = float4(TileConeAxis, TileConeAngleCos);
			RWTileConeDepthRanges[TileIndex] = ConeAxisDepthRanges;
		}
	}
}
/** Per-instance data passed from ObjectCullVS to ObjectCullPS. Both members depend only on the instance index, hence nointerpolation. */
struct FObjectCullVertexOutput
{
// Object bounding sphere as returned by LoadObjectPositionAndRadius (xyz = position, w = radius), without the AO distance expansion.
nointerpolation float4 PositionAndRadius : TEXCOORD0;
// Index of the object being rasterized, taken from SV_InstanceID.
nointerpolation uint ObjectIndex : TEXCOORD1;
};
/** Multiplier applied by ObjectCullVS to the AO-expanded object radius before scaling the bounding geometry. */
float ConservativeRadiusScale;
/**
 * Used when culling objects into screenspace tile lists.
 * Scales the bounding geometry by the object's AO-expanded radius, places it at the object's position,
 * and forwards the unexpanded sphere and the object index to the pixel shader.
 */
void ObjectCullVS(
	float4 InPosition : ATTRIBUTE0,
	uint ObjectIndex : SV_InstanceID,
	out FObjectCullVertexOutput Output,
	out float4 OutPosition : SV_POSITION
	)
{
	//@todo - implement ConservativelyBoundSphere
	const float4 SphereCenterAndRadius = LoadObjectPositionAndRadius(ObjectIndex);

	// The pixel shader receives the original sphere, not the expanded one used for rasterization
	Output.ObjectIndex = ObjectIndex;
	Output.PositionAndRadius = SphereCenterAndRadius;

	//@todo - expand to handle conservative rasterization
	const float ScaledRadius = (SphereCenterAndRadius.w + AOObjectMaxDistance) * ConservativeRadiusScale;
	const float3 VertexWorldPosition = InPosition.xyz * ScaledRadius + SphereCenterAndRadius.xyz;

	OutPosition = mul(float4(VertexWorldPosition, 1), View.WorldToClip);
}
/**
 * Used for object <-> tile culling.
 * Tests one object against one depth range of a tile cone.
 *
 * @param ConeDepthRange			Start (x) and end (y) of the depth range, measured along the cone axis.
 * @param ConeAxisDistanceMinMax	Extent of the object's sphere along the cone axis. The overlap test passes when
 *									.x is past the start of the range and .y is before its end.
 * @param ObjectIndex				Index passed to LoadCulledDFObjectData.
 * @return true if the object may affect the tile within this depth range.
 */
bool IntersectObjectWithConeDepthRange(
	float3 TileConeVertex,
	float3 TileConeAxis,
	float TileConeAngleCos,
	float TileConeAngleSin,
	float2 ConeDepthRange,
	float2 ConeAxisDistanceMinMax,
	uint ObjectIndex)
{
	BRANCH
	if (ConeAxisDistanceMinMax.x > ConeDepthRange.x && ConeAxisDistanceMinMax.y < ConeDepthRange.y)
	{
#define USE_DISTANCE_FIELD_FOR_OBJECT_CULLING 1
#if USE_DISTANCE_FIELD_FOR_OBJECT_CULLING
		FDFObjectData ObjectData = LoadCulledDFObjectData(ObjectIndex);

		const float RangeStart = ConeDepthRange.x;
		const float RangeEnd = ConeDepthRange.y;

		// Bound this depth range of the tile with a sphere centered halfway along the range
		float3 SphereCenterView = TileConeVertex + TileConeAxis * (.5f * (RangeStart + RangeEnd));
		float3 SphereCenterWorld = mul(float4(SphereCenterView.xyz, 1), View.ViewToTranslatedWorld).xyz - View.PreViewTranslation.xyz;

		// Radius reaches from the center to the edge of the cone at the far end of the range
		float HalfRangeLength = .5f * (RangeEnd - RangeStart);
		float FarEdgeOffset = RangeEnd * TileConeAngleSin / TileConeAngleCos;
		float SphereRadius = sqrt(HalfRangeLength * HalfRangeLength + FarEdgeOffset * FarEdgeOffset);

		// An object affects the tile when it is closer than this to the sphere center
		const float InfluenceDistance = SphereRadius + AOObjectMaxDistance;

		float3 SphereCenterVolume = mul(float4(SphereCenterWorld, 1), ObjectData.WorldToVolume).xyz;

		// Cheap test against the object's volume box first
		float DistanceToBox = ComputeDistanceFromBoxToPoint(-ObjectData.VolumePositionExtent, ObjectData.VolumePositionExtent, SphereCenterVolume) * ObjectData.VolumeScale;

		BRANCH
		if (DistanceToBox < InfluenceDistance)
		{
			// Sample the distance field at the closest point inside the volume and add the distance travelled to get there
			float3 SamplePosition = clamp(SphereCenterVolume, -ObjectData.VolumePositionExtent, ObjectData.VolumePositionExtent);
			float OutsideDistance = length(SphereCenterVolume - SamplePosition);
			float DistanceToSurface = (DistanceToMeshSurfaceStandalone(SamplePosition, ObjectData) + OutsideDistance) * ObjectData.VolumeScale;

			BRANCH
			if (DistanceToSurface < InfluenceDistance)
			{
				return true;
			}
		}
#else
		return true;
#endif
	}

	return false;
}
// Per-tile cone axis (xyz) and cosine of the cone angle (w), read by ObjectCullPS.
Buffer<float4> TileConeAxisAndCos;
// Per-tile pair of depth ranges along the cone axis; ObjectCullPS tests xy first, then zw.
Buffer<float4> TileConeDepthRanges;
// Per-object count of intersecting tiles, incremented atomically by ObjectCullPS.
RWBuffer<uint> RWNumCulledTilesArray;
// Per-object start offset into the culled tile data, written by ComputeCulledTilesStartOffsetCS.
RWBuffer<uint> RWCulledTilesStartOffsetArray;
// Culled tile records of CULLED_TILE_DATA_STRIDE uints each: element 0 is the tile index, element 1 the object index.
RWBuffer<uint> RWCulledTileDataArray;
/**
 * Intersects a single object with the tile and adds to the intersection list if needed.
 *
 * Runs once per tile covered by the object's bounding geometry. The tile's cone and depth ranges are
 * loaded from TileConeAxisAndCos / TileConeDepthRanges using the pixel position as the tile coordinate.
 *
 * With SCATTER_CULLING_COUNT_PASS only the per-object tile count is incremented. Otherwise the count is
 * incremented and a (TileIndex, ObjectIndex) record is written at the object's start offset.
 */
void ObjectCullPS(
	FObjectCullVertexOutput Input,
	in float4 SVPos : SV_POSITION,
	out float4 OutColor : SV_Target0)
{
	OutColor = 0;

	uint2 TilePosition = (uint2)SVPos.xy;
	uint TileIndex = TilePosition.y * NumGroups.x + TilePosition.x;
	float4 ConeAxisAndCos = TileConeAxisAndCos.Load(TileIndex);
	float4 ConeAxisDepthRanges = TileConeDepthRanges.Load(TileIndex);

	// The cone vertex is taken to be the view space origin
	float3 TileConeVertex = 0;
	float3 TileConeAxis = ConeAxisAndCos.xyz;
	float TileConeAngleCos = ConeAxisAndCos.w;
	float TileConeAngleSin = sqrt(1 - TileConeAngleCos * TileConeAngleCos);

	float4 WorldSphereCenterAndRadius = Input.PositionAndRadius;
	float3 ViewSpaceSphereCenter = mul(float4(WorldSphereCenterAndRadius.xyz + View.PreViewTranslation.xyz, 1), View.TranslatedWorldToView).xyz;

	// A value of 1 is conservative, but has a huge impact on performance
	float RadiusScale = .5f;
	float4 SphereCenterAndRadius = float4(ViewSpaceSphereCenter, WorldSphereCenterAndRadius.w + RadiusScale * AOObjectMaxDistance);

	if (SphereIntersectCone(SphereCenterAndRadius, TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin))
	{
		float ConeAxisDistance = dot(SphereCenterAndRadius.xyz - TileConeVertex, TileConeAxis);
		// Note: despite the name, x holds the far extent (distance + radius) and y the near extent (distance - radius),
		// which is the order IntersectObjectWithConeDepthRange compares them in
		float2 ConeAxisDistanceMinMax = float2(ConeAxisDistance + SphereCenterAndRadius.w, ConeAxisDistance - SphereCenterAndRadius.w);

		// Test the first depth range of the tile, and only fall back to the second when the first misses
		bool bTileIntersectsObject = IntersectObjectWithConeDepthRange(TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin, ConeAxisDepthRanges.xy, ConeAxisDistanceMinMax, Input.ObjectIndex);

		if (!bTileIntersectsObject)
		{
			bTileIntersectsObject = IntersectObjectWithConeDepthRange(TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin, ConeAxisDepthRanges.zw, ConeAxisDistanceMinMax, Input.ObjectIndex);
		}

		if (bTileIntersectsObject)
		{
#if SCATTER_CULLING_COUNT_PASS
			InterlockedAdd(RWNumCulledTilesArray[Input.ObjectIndex], 1);
#else
			// The previous count is this tile's slot within the object's range
			uint CulledTileIndex;
			InterlockedAdd(RWNumCulledTilesArray[Input.ObjectIndex], 1, CulledTileIndex);

			uint CulledTileDataStart = CulledTilesStartOffsetArray[Input.ObjectIndex];

			RWCulledTileDataArray[(CulledTileDataStart + CulledTileIndex) * CULLED_TILE_DATA_STRIDE + 0] = TileIndex;
			RWCulledTileDataArray[(CulledTileDataStart + CulledTileIndex) * CULLED_TILE_DATA_STRIDE + 1] = Input.ObjectIndex;
#endif
		}
	}
}
// Indirect arguments filled by ComputeCulledTilesStartOffsetCS: element 0 accumulates the total number of cone trace thread groups, elements 1 and 2 are set to 1.
RWBuffer<uint> RWObjectTilesIndirectArguments;
// Per-object count of intersecting tiles, read by ComputeCulledTilesStartOffsetCS.
Buffer<uint> NumCulledTilesArray;
// Thread group size of ComputeCulledTilesStartOffsetCS; falls back to 1 when not defined before this point.
#ifndef COMPUTE_START_OFFSET_GROUP_SIZE
#define COMPUTE_START_OFFSET_GROUP_SIZE 1
#endif
/**
 * Assigns each culled object a range in the culled tile data array, one thread per object.
 * Ranges are sized in whole cone trace thread groups (CONE_TRACE_TILES_PER_THREADGROUP tiles each);
 * the unused tail of the last group is filled with INVALID_TILE_INDEX records.
 * Also accumulates the total thread group count into RWObjectTilesIndirectArguments[0].
 */
[numthreads(COMPUTE_START_OFFSET_GROUP_SIZE, 1, 1)]
void ComputeCulledTilesStartOffsetCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	const uint CulledObjectIndex = DispatchThreadId.x;

	if (CulledObjectIndex < GetCulledNumObjects())
	{
		const uint TileCount = NumCulledTilesArray[CulledObjectIndex];

		// Round the tile count up to a whole number of cone trace thread groups
		const uint GroupCount = (TileCount + CONE_TRACE_TILES_PER_THREADGROUP - 1) / CONE_TRACE_TILES_PER_THREADGROUP;
		const uint PaddedTileCount = GroupCount * CONE_TRACE_TILES_PER_THREADGROUP;

		// Reserve the groups; the previous total is where this object's range begins
		uint FirstGroup;
		InterlockedAdd(RWObjectTilesIndirectArguments[0], GroupCount, FirstGroup);

		const uint FirstTileSlot = FirstGroup * CONE_TRACE_TILES_PER_THREADGROUP;
		RWCulledTilesStartOffsetArray[CulledObjectIndex] = FirstTileSlot;

		// Pad remaining entries with INVALID_TILE_INDEX so we can skip computing them in the cone tracing pass
		for (uint PadSlot = TileCount; PadSlot < PaddedTileCount; PadSlot++)
		{
			const uint RecordBase = (FirstTileSlot + PadSlot) * CULLED_TILE_DATA_STRIDE;
			RWCulledTileDataArray[RecordBase + 0] = INVALID_TILE_INDEX;
			RWCulledTileDataArray[RecordBase + 1] = CulledObjectIndex;
		}
	}

	if (DispatchThreadId.x == 0)
	{
		// Remaining two indirect argument slots are fixed at 1
		RWObjectTilesIndirectArguments[1] = 1;
		RWObjectTilesIndirectArguments[2] = 1;
	}
}