Files
UnrealEngineUWP/Engine/Shaders/DistanceFieldSurfaceCacheLightingCompute.usf
Daniel Wright 7da79fd159 DFAO optimizations for PS4
* DF lighting GBuffer dependencies packed into 2 textures (saves .1ms)
* Reverted CULL_OBJECTS_TO_RECORD as performance savings are inconsistent
* Added low quality gap fill pass that just does a bilateral filter (saves .4ms)
* Enabled interpolation depth testing (saves .2ms)
* Disabled the temporal stability filter (saves .2ms)
* Overall empty scene cost on PS4 3.77ms -> 2.83ms

[CL 2501856 by Daniel Wright in Main branch]
2015-04-03 18:56:44 -04:00

1192 lines
42 KiB
Plaintext

// Copyright 1998-2015 Epic Games, Inc. All Rights Reserved.
/*=============================================================================
DistanceFieldSurfaceCacheLightingCompute.usf
=============================================================================*/
#include "Common.usf"
#include "DeferredShadingCommon.usf"
#include "ReflectionEnvironmentShared.usf"
#include "DistanceFieldLightingShared.usf"
#include "DistanceFieldAOShared.usf"
#include "MonteCarlo.usf"
// Number of upload operations; UploadObjectsToBufferCS threads whose index is at or beyond this do nothing.
uint NumUploadOperations;
// Destination object index for each upload operation.
Buffer<uint> UploadOperationIndices;
// Source payload: each upload operation occupies UPLOAD_DATA_STRIDE float4's,
// the bounds vector first, followed by OBJECT_DATA_STRIDE object data vectors.
Buffer<float4> UploadOperationData;
// In float4's. Must match equivalent C++ variables.
#define UPLOAD_DATA_STRIDE (1 + OBJECT_DATA_STRIDE)
/**
 * Copies one float4 from UploadOperationData into RWObjectData.
 * RWObjectData is addressed in scalar floats, so the float4 lands in 4 consecutive elements.
 *
 * @param DestIndex		Destination index in float4 units.
 * @param UploadIndex	Source index into UploadOperationData (float4 units).
 */
void UploadDataFloat4(uint DestIndex, uint UploadIndex)
{
	const float4 Source = UploadOperationData[UploadIndex];
	const uint DestBase = 4 * DestIndex;

	RWObjectData[DestBase + 0] = Source.x;
	RWObjectData[DestBase + 1] = Source.y;
	RWObjectData[DestBase + 2] = Source.z;
	RWObjectData[DestBase + 3] = Source.w;
}
/**
 * Scatters uploaded objects into the object buffers, one thread per upload operation.
 * Each operation writes the bounds float4 followed by OBJECT_DATA_STRIDE data float4's
 * to the slot named by UploadOperationIndices.
 */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void UploadObjectsToBufferCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	const uint OperationIndex = DispatchThreadId.x;

	// Threads past the end of the operation list have nothing to upload
	if (OperationIndex >= NumUploadOperations)
	{
		return;
	}

	const uint ObjectSlot = UploadOperationIndices[OperationIndex];
	// First float4 of this operation's payload
	const uint PayloadBase = OperationIndex * UPLOAD_DATA_STRIDE;

	// The payload starts with the bounds vector
	const float4 Bounds = UploadOperationData[PayloadBase + 0];
	const uint BoundsBase = 4 * ObjectSlot;
	RWObjectBounds[BoundsBase + 0] = Bounds.x;
	RWObjectBounds[BoundsBase + 1] = Bounds.y;
	RWObjectBounds[BoundsBase + 2] = Bounds.z;
	RWObjectBounds[BoundsBase + 3] = Bounds.w;

	// The remaining vectors are the object data, offset by one to skip the bounds
	UNROLL
	for (uint VectorOffset = 0; VectorOffset < OBJECT_DATA_STRIDE; VectorOffset++)
	{
		UploadDataFloat4(ObjectSlot * OBJECT_DATA_STRIDE + VectorOffset, PayloadBase + VectorOffset + 1);
	}
}
// Destinations of CopyObjectBufferCS / CopyDataFloat4, addressed in scalar floats (4 per float4 element).
RWBuffer<float> RWCopyObjectBounds;
RWBuffer<float> RWCopyObjectData;
/**
 * Copies one float4 worth of scalars from ObjectData to RWCopyObjectData.
 *
 * @param DestIndex		Destination index in float4 units.
 * @param SourceIndex	Source index in float4 units.
 */
void CopyDataFloat4(uint DestIndex, uint SourceIndex)
{
	const uint DestBase = 4 * DestIndex;
	const uint SourceBase = 4 * SourceIndex;

	RWCopyObjectData[DestBase + 0] = ObjectData[SourceBase + 0];
	RWCopyObjectData[DestBase + 1] = ObjectData[SourceBase + 1];
	RWCopyObjectData[DestBase + 2] = ObjectData[SourceBase + 2];
	RWCopyObjectData[DestBase + 3] = ObjectData[SourceBase + 3];
}
/**
 * Copies the bounds and data of every scene object into the RWCopy* buffers at the same index,
 * one thread per object.
 */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void CopyObjectBufferCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	const uint ObjectSlot = DispatchThreadId.x;

	// Threads past the end of the scene object list have nothing to copy
	if (ObjectSlot >= NumSceneObjects)
	{
		return;
	}

	// Bounds: one float4 per object, stored as 4 consecutive floats
	const uint BoundsBase = 4 * ObjectSlot;
	RWCopyObjectBounds[BoundsBase + 0] = ObjectBounds[BoundsBase + 0];
	RWCopyObjectBounds[BoundsBase + 1] = ObjectBounds[BoundsBase + 1];
	RWCopyObjectBounds[BoundsBase + 2] = ObjectBounds[BoundsBase + 2];
	RWCopyObjectBounds[BoundsBase + 3] = ObjectBounds[BoundsBase + 3];

	// Data: OBJECT_DATA_STRIDE float4's per object, source and destination slots are the same
	const uint FirstVector = ObjectSlot * OBJECT_DATA_STRIDE;

	UNROLL
	for (uint VectorOffset = 0; VectorOffset < OBJECT_DATA_STRIDE; VectorOffset++)
	{
		CopyDataFloat4(FirstVector + VectorOffset, FirstVector + VectorOffset);
	}
}
// Number of entries of RemoveOperationIndices processed by RemoveObjectsFromBufferCS.
uint NumRemoveOperations;
// Per remove operation: x is the slot that gets overwritten, y is the slot whose contents are moved into it.
// z and w are not read by RemoveObjectsFromBufferCS.
Buffer<uint4> RemoveOperationIndices;
// Selects where the moved element is read from: the same RW buffers that are being written,
// or separate read-only buffers (ObjectBounds2 / ObjectData2).
#if REMOVE_FROM_SAME_BUFFER
#define RWBoundsRemoveSource RWObjectBounds
#define RWDataRemoveSource RWObjectData
#else
Buffer<float> ObjectBounds2;
Buffer<float> ObjectData2;
#define RWBoundsRemoveSource ObjectBounds2
#define RWDataRemoveSource ObjectData2
#endif
/**
 * Moves one float4 worth of scalars from the remove source buffer (RWDataRemoveSource) into RWObjectData.
 *
 * @param DestIndex		Destination index in float4 units.
 * @param SourceIndex	Source index in float4 units.
 */
void WriteDataFloat4(uint DestIndex, uint SourceIndex)
{
	const uint DestBase = 4 * DestIndex;
	const uint SourceBase = 4 * SourceIndex;

	RWObjectData[DestBase + 0] = RWDataRemoveSource[SourceBase + 0];
	RWObjectData[DestBase + 1] = RWDataRemoveSource[SourceBase + 1];
	RWObjectData[DestBase + 2] = RWDataRemoveSource[SourceBase + 2];
	RWObjectData[DestBase + 3] = RWDataRemoveSource[SourceBase + 3];
}
/**
 * Applies remove operations to the object buffers, one thread per operation.
 * Each operation is a RemoveAtSwap: the element at the operation's y index overwrites the element at its x index.
 */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void RemoveObjectsFromBufferCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	const uint OperationIndex = DispatchThreadId.x;

	// Threads past the end of the operation list have nothing to do
	if (OperationIndex >= NumRemoveOperations)
	{
		return;
	}

	// RemoveAtSwap: y names the element being moved, x the slot it replaces
	const uint4 Operation = RemoveOperationIndices[OperationIndex];
	const uint MovedFromSlot = Operation.y;
	const uint RemovedSlot = Operation.x;

	const uint DestBoundsBase = 4 * RemovedSlot;
	const uint SourceBoundsBase = 4 * MovedFromSlot;
	RWObjectBounds[DestBoundsBase + 0] = RWBoundsRemoveSource[SourceBoundsBase + 0];
	RWObjectBounds[DestBoundsBase + 1] = RWBoundsRemoveSource[SourceBoundsBase + 1];
	RWObjectBounds[DestBoundsBase + 2] = RWBoundsRemoveSource[SourceBoundsBase + 2];
	RWObjectBounds[DestBoundsBase + 3] = RWBoundsRemoveSource[SourceBoundsBase + 3];

	UNROLL
	for (uint VectorOffset = 0; VectorOffset < OBJECT_DATA_STRIDE; VectorOffset++)
	{
		WriteDataFloat4(RemovedSlot * OBJECT_DATA_STRIDE + VectorOffset, MovedFromSlot * OBJECT_DATA_STRIDE + VectorOffset);
	}
}
// Compacted per-view outputs written by CopyCulledObjectData.
RWBuffer<float4> RWCulledObjectBounds;
RWBuffer<float4> RWCulledObjectData;
// NOTE(review): not written by any code visible in this part of the file.
RWBuffer<float4> RWCulledObjectBoxBounds;
// Stored into RWObjectIndirectArguments[0] (IndexCount) by CullObjectsForViewCS.
uint ObjectBoundingGeometryIndexCount;
// Per-group scratch for CullObjectsForViewCS:
// number of objects of this group that passed culling,
groupshared uint NumGroupObjects;
// first output slot reserved for this group,
groupshared uint GroupBaseIndex;
// and the scene indices of the objects that passed (each thread appends at most one entry).
groupshared uint GroupObjectIndices[UPDATEOBJECTS_THREADGROUP_SIZE];
/**
 * Reassembles a float4 from 4 consecutive scalars of ObjectData.
 *
 * @param SourceIndex	Index of the vector in float4 units.
 * @return				The 4 scalars starting at 4 * SourceIndex.
 */
float4 FetchObjectDataFloat4(uint SourceIndex)
{
	const uint SourceBase = 4 * SourceIndex;

	float4 Result;
	Result.x = ObjectData[SourceBase + 0];
	Result.y = ObjectData[SourceBase + 1];
	Result.z = ObjectData[SourceBase + 2];
	Result.w = ObjectData[SourceBase + 3];
	return Result;
}
/**
 * Copies one object's bounds and the leading part of its data into the culled object buffers.
 *
 * @param DestIndex		Slot in the culled buffers.
 * @param SourceIndex	Slot in the scene object buffers.
 */
void CopyCulledObjectData(uint DestIndex, uint SourceIndex)
{
	const uint BoundsBase = 4 * SourceIndex;
	const float4 Bounds = float4(
		ObjectBounds[BoundsBase + 0],
		ObjectBounds[BoundsBase + 1],
		ObjectBounds[BoundsBase + 2],
		ObjectBounds[BoundsBase + 3]);
	RWCulledObjectBounds[DestIndex] = Bounds;

	// The source is laid out with OBJECT_DATA_STRIDE vectors per object, the destination with CULLED_OBJECT_DATA_STRIDE
	const uint SourceVectorBase = SourceIndex * OBJECT_DATA_STRIDE;
	const uint DestVectorBase = DestIndex * CULLED_OBJECT_DATA_STRIDE;

	// Note: only the first CULLED_OBJECT_DATA_STRIDE vectors of the original object data are carried over
	UNROLL
	for (uint VectorOffset = 0; VectorOffset < CULLED_OBJECT_DATA_STRIDE; VectorOffset++)
	{
		RWCulledObjectData[DestVectorBase + VectorOffset] = FetchObjectDataFloat4(SourceVectorBase + VectorOffset);
	}
}
/**
 * Culls scene objects against the view and compacts the survivors into the RWCulled* buffers, one thread per scene object.
 * Survivors are first gathered per thread group in groupshared memory, then each group reserves a contiguous output
 * range with a single global atomic on RWObjectIndirectArguments[1] (NumInstances).
 * Output order follows the values returned by the atomics, so it is not tied to the scene order.
 */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void CullObjectsForViewCS(
uint3 GroupId : SV_GroupID,
uint3 DispatchThreadId : SV_DispatchThreadID,
uint3 GroupThreadId : SV_GroupThreadID)
{
uint ObjectIndex = DispatchThreadId.x;
// Defined unconditionally right here, so the #else path below is compiled out unless this line is edited
#define USE_FRUSTUM_CULLING 1
#if USE_FRUSTUM_CULLING
if (DispatchThreadId.x == 0)
{
// RWObjectIndirectArguments is zeroed by a clear before this shader, only need to set things that are non-zero (and are not read by this shader as that would be a race condition)
// IndexCount, NumInstances, StartIndex, BaseVertexIndex, FirstInstance
RWObjectIndirectArguments[0] = ObjectBoundingGeometryIndexCount;
}
// Reset the per-group survivor counter before any thread appends to it
if (GroupThreadId.x == 0)
{
NumGroupObjects = 0;
}
GroupMemoryBarrierWithGroupSync();
if (ObjectIndex < NumSceneObjects)
{
uint SourceIndex = ObjectIndex;
// xyz is used as the sphere center and w as its radius by the tests below
float4 ObjectBoundingSphere = float4(ObjectBounds[4 * SourceIndex + 0], ObjectBounds[4 * SourceIndex + 1], ObjectBounds[4 * SourceIndex + 2], ObjectBounds[4 * SourceIndex + 3]);
float DistanceToViewSq = dot(View.ViewOrigin.xyz - ObjectBoundingSphere.xyz, View.ViewOrigin.xyz - ObjectBoundingSphere.xyz);
// Keep the object when it is within AOMaxViewDistance of the view origin (radius included)
// and its sphere, grown by AOMaxDistance, passes the view frustum test
if (DistanceToViewSq < Square(AOMaxViewDistance + ObjectBoundingSphere.w)
&& ViewFrustumIntersectSphere(ObjectBoundingSphere.xyz, ObjectBoundingSphere.w + AOMaxDistance))
{
// Append to the group local list; each thread adds at most one entry so the list cannot exceed the group size
uint DestIndex;
InterlockedAdd(NumGroupObjects, 1U, DestIndex);
GroupObjectIndices[DestIndex] = SourceIndex;
}
}
// All appends must be visible before the group size is used
GroupMemoryBarrierWithGroupSync();
if (GroupThreadId.x == 0)
{
// Reserve NumGroupObjects consecutive output slots; the previous instance count becomes this group's base
InterlockedAdd(RWObjectIndirectArguments[1], NumGroupObjects, GroupBaseIndex);
}
// GroupBaseIndex must be visible to the whole group before copying
GroupMemoryBarrierWithGroupSync();
// The first NumGroupObjects threads each copy one survivor to its reserved slot
if (GroupThreadId.x < NumGroupObjects)
{
uint SourceIndex = GroupObjectIndices[GroupThreadId.x];
uint DestIndex = GroupBaseIndex + GroupThreadId.x;
CopyCulledObjectData(DestIndex, SourceIndex);
}
#else
// No culling: every scene object is copied to the same index in the culled buffers
if (DispatchThreadId.x == 0)
{
// IndexCount, NumInstances, StartIndex, BaseVertexIndex, FirstInstance
RWObjectIndirectArguments[0] = ObjectBoundingGeometryIndexCount;
RWObjectIndirectArguments[1] = NumSceneObjects;
}
GroupMemoryBarrierWithGroupSync();
if (ObjectIndex < NumSceneObjects)
{
uint SourceIndex = ObjectIndex;
uint DestIndex = ObjectIndex;
CopyCulledObjectData(DestIndex, SourceIndex);
}
#endif
}
/** Min and Max depth for this tile. Stored as the uint bit pattern of a float so that integer atomics can be used. */
groupshared uint IntegerTileMinZ;
groupshared uint IntegerTileMaxZ;
/** Inner Min and Max depth for this tile: min of the depths at or beyond the tile's half depth, max of the depths at or before it. */
groupshared uint IntegerTileMinZ2;
groupshared uint IntegerTileMaxZ2;
/** View rect min in xy, max in zw. */
uint4 ViewDimensions;
/** Number of tiles (thread groups) in x and y; used to place tile corners and to linearize the tile index. */
float2 NumGroups;
/** Per tile outputs of BuildTileConesMain. */
RWBuffer<float4> RWTileConeAxisAndCos;
RWBuffer<float4> RWTileConeDepthRanges;
RWBuffer<uint> RWTileHeadDataUnpacked;
// Fallback capacity of the per-tile groupshared object lists when the define is not supplied from outside this file
#ifndef MAX_OBJECTS_PER_TILE
#define MAX_OBJECTS_PER_TILE 1
#endif
/**
 * Builds tile depth ranges and bounding cones.
 * One thread group per tile, one thread per downsampled depth sample. The group reduces its depths to two
 * [min, max] ranges split at the tile's half depth, then thread 0 derives the tile's view space bounding cone
 * and writes the cone, the depth ranges (as distances along the cone axis) and a reset tile head.
 */
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void BuildTileConesMain(
uint3 GroupId : SV_GroupID,
uint3 DispatchThreadId : SV_DispatchThreadID,
uint3 GroupThreadId : SV_GroupThreadID)
{
uint ThreadIndex = GroupThreadId.y * THREADGROUP_SIZEX + GroupThreadId.x;
float2 BaseLevelScreenUV = (DispatchThreadId.xy + float2(.5f, .5f)) * DOWNSAMPLE_FACTOR * View.ViewSizeAndSceneTexelSize.zw;
float SceneDepth = GetDownsampledDepth(BaseLevelScreenUV);
//float2 ScreenUV = (DispatchThreadId.xy * DOWNSAMPLE_FACTOR + View.ViewRectMin.xy + float2(.5f, .5f)) * View.ViewSizeAndSceneTexelSize.zw;
//float SceneDepth = CalcSceneDepth(ScreenUV);
// Initialize per-tile variables
// 0x7F7FFFFF is the bit pattern of the largest finite float, 0 is the bit pattern of 0.0f
if (ThreadIndex == 0)
{
IntegerTileMinZ = 0x7F7FFFFF;
IntegerTileMaxZ = 0;
IntegerTileMinZ2 = 0x7F7FFFFF;
IntegerTileMaxZ2 = 0;
}
GroupMemoryBarrierWithGroupSync();
// Use shared memory atomics to build the depth bounds for this tile
// Each thread is assigned to a pixel at this point
// Comparing the uint bit patterns orders non-negative floats correctly; assumes SceneDepth is never negative - TODO confirm
InterlockedMin(IntegerTileMinZ, asuint(SceneDepth));
InterlockedMax(IntegerTileMaxZ, asuint(SceneDepth));
GroupMemoryBarrierWithGroupSync();
float MinTileZ = asfloat(IntegerTileMinZ);
float MaxTileZ = asfloat(IntegerTileMaxZ);
float HalfZ = .5f * (MinTileZ + MaxTileZ);
// Compute a second min and max Z, clipped by HalfZ, so that we get two depth bounds per tile
// This results in more conservative tile depth bounds and fewer intersections
// A sample exactly at HalfZ contributes to both inner bounds
if (SceneDepth >= HalfZ)
{
InterlockedMin(IntegerTileMinZ2, asuint(SceneDepth));
}
if (SceneDepth <= HalfZ)
{
InterlockedMax(IntegerTileMaxZ2, asuint(SceneDepth));
}
GroupMemoryBarrierWithGroupSync();
float MinTileZ2 = asfloat(IntegerTileMinZ2);
float MaxTileZ2 = asfloat(IntegerTileMaxZ2);
// Only one thread per tile computes and writes the results
if (ThreadIndex == 0)
{
float3 TileConeVertex;
float3 TileConeAxis;
float TileConeAngleCos;
float TileConeAngleSin;
float4 ConeAxisDepthRanges;
{
// Half extents of the view at view space z = 1, taken from the projection matrix diagonal
float2 ViewSize = float2(1 / View.ViewToClip[0][0], 1 / View.ViewToClip[1][1]);
// Normalized view space directions through the four corners of this tile
float3 TileCorner00 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
float3 TileCorner10 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
float3 TileCorner01 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));
float3 TileCorner11 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));
// Cone axis is the normalized sum of the corner directions; the cone angle is measured against corner 00
TileConeAxis = normalize(TileCorner00 + TileCorner10 + TileCorner01 + TileCorner11);
TileConeAngleCos = dot(TileConeAxis, TileCorner00);
TileConeAngleSin = sqrt(1 - TileConeAngleCos * TileConeAngleCos);
float TileConeAngleTan = TileConeAngleSin / TileConeAngleCos;
// Not used by the rest of this function
float3 ViewSpaceSampleDirection = mul(float3(0, 0, 1), (float3x3)View.TranslatedWorldToView);
// With an expand distance of 0 the pullback below evaluates to 0 (for a non-zero tangent), leaving the cone vertex at the view origin
float ConeExpandDistance = 0;
float VertexPullbackLength = ConeExpandDistance / TileConeAngleTan;
float DistanceToNearPlane = length(TileConeAxis / TileConeAxis.z * View.NearPlane);
// 1 / cos(AngleBetweenTileCenterAndViewForward)
float InvCosTileAngle = 1.0f / TileConeAxis.z;
float ConeAxisDistanceMultiply = InvCosTileAngle;
float ConeAxisDistanceAdd = VertexPullbackLength + DistanceToNearPlane;
// xy holds the range [MinTileZ, MaxTileZ2], zw the range [MinTileZ2, MaxTileZ], both converted to distance along the cone axis
ConeAxisDepthRanges.x = ConeAxisDistanceMultiply * (MinTileZ - ConeExpandDistance) + ConeAxisDistanceAdd;
ConeAxisDepthRanges.y = ConeAxisDistanceMultiply * (MaxTileZ2 + ConeExpandDistance) + ConeAxisDistanceAdd;
ConeAxisDepthRanges.z = ConeAxisDistanceMultiply * (MinTileZ2 - ConeExpandDistance) + ConeAxisDistanceAdd;
ConeAxisDepthRanges.w = ConeAxisDistanceMultiply * (MaxTileZ + ConeExpandDistance) + ConeAxisDistanceAdd;
// Pull back cone vertex to contain potential samples
// TileConeVertex is assigned but not written to any output by this function
TileConeVertex = float3(0, 0, 0) - TileConeAxis * VertexPullbackLength;
}
// NOTE(review): NumGroups is a float2, so this index is evaluated in floating point and then converted to uint,
// whereas DistanceFieldAOBuildTileListMain casts NumGroups.x to uint first - confirm NumGroups always holds whole numbers
uint TileIndex = GroupId.y * NumGroups.x + GroupId.x;
RWTileConeAxisAndCos[TileIndex] = float4(TileConeAxis, TileConeAngleCos);
RWTileConeDepthRanges[TileIndex] = ConeAxisDepthRanges;
// Reset the tile head: element 0 holds the tile index, the other three are cleared
RWTileHeadDataUnpacked[TileIndex * 4 + 0] = TileIndex;
RWTileHeadDataUnpacked[TileIndex * 4 + 1] = 0;
RWTileHeadDataUnpacked[TileIndex * 4 + 2] = 0;
RWTileHeadDataUnpacked[TileIndex * 4 + 3] = 0;
}
}
// Per-tile object lists built by DistanceFieldAOBuildTileListMain, one per sample radius bucket
// (an object goes into the first of small / medium / large whose expanded bounds test passes).
groupshared uint SmallTileObjectIndices[MAX_OBJECTS_PER_TILE];
groupshared uint MediumTileObjectIndices[MAX_OBJECTS_PER_TILE];
groupshared uint LargeTileObjectIndices[MAX_OBJECTS_PER_TILE];
// Number of objects appended to each list (clamped to MAX_OBJECTS_PER_TILE before being written out)
groupshared uint SmallTileNumObjects;
groupshared uint MediumTileNumObjects;
groupshared uint LargeTileNumObjects;
// First element of RWTileArrayData reserved for this tile
groupshared uint TileArrayDataStart;
// Concatenated small, medium and large lists of every tile
RWBuffer<uint> RWTileArrayData;
// Element 0 is the running allocation counter into RWTileArrayData
RWBuffer<uint> RWTileArrayNextAllocation;
// Per tile: x = start in RWTileArrayData, yzw = small / medium / large object counts
RWBuffer<uint4> RWTileHeadData;
/**
 * Builds the per-tile lists of culled objects that can affect a screen tile.
 * One thread group per tile. The group first reduces its scene depths to two [min, max] ranges split at the
 * tile's half depth and derives the tile's view space bounding cone. Its threads then stride over all culled
 * objects and append each object to the first of the small / medium / large lists whose expanded bounding
 * sphere intersects the cone and depth ranges. Finally the group reserves space in RWTileArrayData, writes the
 * tile head (start, small count, medium count, large count) and copies the three lists out back to back.
 *
 * Each list holds at most MAX_OBJECTS_PER_TILE entries; objects beyond that capacity are dropped.
 */
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void DistanceFieldAOBuildTileListMain(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint ThreadIndex = GroupThreadId.y * THREADGROUP_SIZEX + GroupThreadId.x;

	float2 ScreenUV = (DispatchThreadId.xy * DOWNSAMPLE_FACTOR + View.ViewRectMin.xy + float2(.5f, .5f)) * View.ViewSizeAndSceneTexelSize.zw;
	float SceneDepth = CalcSceneDepth(ScreenUV);

	// Initialize per-tile variables
	// 0x7F7FFFFF is the bit pattern of the largest finite float, 0 is the bit pattern of 0.0f
	if (ThreadIndex == 0)
	{
		IntegerTileMinZ = 0x7F7FFFFF;
		IntegerTileMaxZ = 0;
		IntegerTileMinZ2 = 0x7F7FFFFF;
		IntegerTileMaxZ2 = 0;
		SmallTileNumObjects = 0;
		MediumTileNumObjects = 0;
		LargeTileNumObjects = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	// Use shared memory atomics to build the depth bounds for this tile
	// Each thread is assigned to a pixel at this point
	InterlockedMin(IntegerTileMinZ, asuint(SceneDepth));
	InterlockedMax(IntegerTileMaxZ, asuint(SceneDepth));

	GroupMemoryBarrierWithGroupSync();

	float MinTileZ = asfloat(IntegerTileMinZ);
	float MaxTileZ = asfloat(IntegerTileMaxZ);
	float HalfZ = .5f * (MinTileZ + MaxTileZ);

	// Compute a second min and max Z, clipped by HalfZ, so that we get two depth bounds per tile
	// This results in more conservative tile depth bounds and fewer intersections
	if (SceneDepth >= HalfZ)
	{
		InterlockedMin(IntegerTileMinZ2, asuint(SceneDepth));
	}

	if (SceneDepth <= HalfZ)
	{
		InterlockedMax(IntegerTileMaxZ2, asuint(SceneDepth));
	}

	GroupMemoryBarrierWithGroupSync();

	float MinTileZ2 = asfloat(IntegerTileMinZ2);
	float MaxTileZ2 = asfloat(IntegerTileMaxZ2);

	float3 TileConeVertex;
	float3 TileConeAxis;
	float TileConeAngleCos;
	float TileConeAngleSin;
	float4 ConeAxisDepthRanges;

	{
		// Half extents of the view at view space z = 1, taken from the projection matrix diagonal
		float2 ViewSize = float2(1 / View.ViewToClip[0][0], 1 / View.ViewToClip[1][1]);
		// Normalized view space directions through the four corners of this tile
		float3 TileCorner00 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
		float3 TileCorner10 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
		float3 TileCorner01 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));
		float3 TileCorner11 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));

		// Cone axis is the normalized sum of the corner directions; the cone angle is measured against corner 00
		TileConeAxis = normalize(TileCorner00 + TileCorner10 + TileCorner01 + TileCorner11);
		TileConeAngleCos = dot(TileConeAxis, TileCorner00);
		TileConeAngleSin = sqrt(1 - TileConeAngleCos * TileConeAngleCos);
		float TileConeAngleTan = TileConeAngleSin / TileConeAngleCos;

		float ConeExpandDistance = 0;
		float VertexPullbackLength = ConeExpandDistance / TileConeAngleTan;
		float DistanceToNearPlane = length(TileConeAxis / TileConeAxis.z * View.NearPlane);
		// 1 / cos(AngleBetweenTileCenterAndViewForward)
		float InvCosTileAngle = 1.0f / TileConeAxis.z;
		float ConeAxisDistanceMultiply = InvCosTileAngle;
		float ConeAxisDistanceAdd = VertexPullbackLength + DistanceToNearPlane;

		// xy holds the range [MinTileZ, MaxTileZ2], zw the range [MinTileZ2, MaxTileZ], both converted to distance along the cone axis
		ConeAxisDepthRanges.x = ConeAxisDistanceMultiply * (MinTileZ - ConeExpandDistance) + ConeAxisDistanceAdd;
		ConeAxisDepthRanges.y = ConeAxisDistanceMultiply * (MaxTileZ2 + ConeExpandDistance) + ConeAxisDistanceAdd;
		ConeAxisDepthRanges.z = ConeAxisDistanceMultiply * (MinTileZ2 - ConeExpandDistance) + ConeAxisDistanceAdd;
		ConeAxisDepthRanges.w = ConeAxisDistanceMultiply * (MaxTileZ + ConeExpandDistance) + ConeAxisDistanceAdd;

		// Pull back cone vertex to contain potential samples
		//@todo - only expand in sky direction
		TileConeVertex = float3(0, 0, 0) - TileConeAxis * VertexPullbackLength;
	}

	// A value of 1 is conservative, but has huge impact on performance
	float RadiusScale = .5f;

	// Bounds expansion used for each list, derived from the last step of phases 0, 1 and 2
	float SmallGroupMaxSampleRadius;
	{
		uint StartIndex;
		uint EndIndex;
		GetPhaseParameters(0, StartIndex, EndIndex);
		SmallGroupMaxSampleRadius = GetStepOffset(EndIndex) * 2 * RadiusScale;
	}

	float MediumGroupMaxSampleRadius;
	{
		uint StartIndex;
		uint EndIndex;
		GetPhaseParameters(1, StartIndex, EndIndex);
		MediumGroupMaxSampleRadius = GetStepOffset(EndIndex) * 2 * RadiusScale;
	}

	float LargeGroupMaxSampleRadius;
	{
		uint StartIndex;
		uint EndIndex;
		GetPhaseParameters(2, StartIndex, EndIndex);
		LargeGroupMaxSampleRadius = GetStepOffset(EndIndex) * 2 * RadiusScale;
	}

	uint NumCulledObjects = GetCulledNumObjects();

	// Compute per-tile lists of affecting objects through bounds culling
	// Each thread now operates on an object instead of a pixel
	LOOP
	for (uint ObjectIndex = ThreadIndex; ObjectIndex < NumCulledObjects; ObjectIndex += THREADGROUP_TOTALSIZE)
	{
		float4 SphereCenterAndRadius = LoadObjectPositionAndRadius(ObjectIndex);
		float3 ViewSpaceSphereCenter = mul(float4(SphereCenterAndRadius.xyz + View.PreViewTranslation.xyz, 1), View.TranslatedWorldToView).xyz;

		// An object is added to the first list whose expanded bounds test passes.
		// The counters keep counting past the list capacity (they are clamped after the loop), but the
		// groupshared arrays only have MAX_OBJECTS_PER_TILE entries, so the store itself must be guarded.
		if (SphereIntersectConeWithDepthRanges(float4(ViewSpaceSphereCenter, SphereCenterAndRadius.w + SmallGroupMaxSampleRadius), TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin, ConeAxisDepthRanges))
		{
			uint ListIndex;
			InterlockedAdd(SmallTileNumObjects, 1U, ListIndex);

			if (ListIndex < (uint)(MAX_OBJECTS_PER_TILE))
			{
				SmallTileObjectIndices[ListIndex] = ObjectIndex;
			}
		}
		else if (SphereIntersectConeWithDepthRanges(float4(ViewSpaceSphereCenter, SphereCenterAndRadius.w + MediumGroupMaxSampleRadius), TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin, ConeAxisDepthRanges))
		{
			uint ListIndex;
			InterlockedAdd(MediumTileNumObjects, 1U, ListIndex);

			if (ListIndex < (uint)(MAX_OBJECTS_PER_TILE))
			{
				MediumTileObjectIndices[ListIndex] = ObjectIndex;
			}
		}
		else if (SphereIntersectConeWithDepthRanges(float4(ViewSpaceSphereCenter, SphereCenterAndRadius.w + LargeGroupMaxSampleRadius), TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin, ConeAxisDepthRanges))
		{
			uint ListIndex;
			InterlockedAdd(LargeTileNumObjects, 1U, ListIndex);

			if (ListIndex < (uint)(MAX_OBJECTS_PER_TILE))
			{
				LargeTileObjectIndices[ListIndex] = ObjectIndex;
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	if (ThreadIndex == 0)
	{
		// Clamp the counts to what was actually stored in the lists
		SmallTileNumObjects = min(SmallTileNumObjects, MAX_OBJECTS_PER_TILE);
		MediumTileNumObjects = min(MediumTileNumObjects, MAX_OBJECTS_PER_TILE);
		LargeTileNumObjects = min(LargeTileNumObjects, MAX_OBJECTS_PER_TILE);

		// Reserve a contiguous range of RWTileArrayData for the three lists of this tile
		uint ArrayStart;
		uint NumObjectsIntersecting = SmallTileNumObjects + MediumTileNumObjects + LargeTileNumObjects;
		InterlockedAdd(RWTileArrayNextAllocation[0], NumObjectsIntersecting, ArrayStart);
		TileArrayDataStart = ArrayStart;

		RWTileHeadData[GroupId.y * (uint)NumGroups.x + GroupId.x] = uint4(TileArrayDataStart, SmallTileNumObjects, MediumTileNumObjects, LargeTileNumObjects);
	}

	GroupMemoryBarrierWithGroupSync();

	// Write the small, medium and large lists back to back, starting at the reserved offset
	uint ArrayDataStart = TileArrayDataStart;

	LOOP
	for (uint SmallListIndex = ThreadIndex; SmallListIndex < SmallTileNumObjects; SmallListIndex += THREADGROUP_TOTALSIZE)
	{
		RWTileArrayData[ArrayDataStart + SmallListIndex] = SmallTileObjectIndices[SmallListIndex];
	}

	ArrayDataStart += SmallTileNumObjects;

	LOOP
	for (uint MediumListIndex = ThreadIndex; MediumListIndex < MediumTileNumObjects; MediumListIndex += THREADGROUP_TOTALSIZE)
	{
		RWTileArrayData[ArrayDataStart + MediumListIndex] = MediumTileObjectIndices[MediumListIndex];
	}

	ArrayDataStart += MediumTileNumObjects;

	LOOP
	for (uint LargeListIndex = ThreadIndex; LargeListIndex < LargeTileNumObjects; LargeListIndex += THREADGROUP_TOTALSIZE)
	{
		RWTileArrayData[ArrayDataStart + LargeListIndex] = LargeTileObjectIndices[LargeListIndex];
	}
}
/**
 * NOTE(review): the comment previously here ("View rect min in xy, max in zw") matches the uint4 ViewDimensions
 * declared earlier in this file and does not fit this float2. Presumably a scale from thread coordinates to
 * culled tile coordinates - confirm against the C++ side; it is not referenced by the code visible here.
 */
float2 ThreadToCulledTile;
/** PopulateCacheCS writes NumCircleSections * 3 as the vertex count per instance of the scatter draw. */
uint NumCircleSections;
/** Per record outputs of PopulateCacheCS. */
RWBuffer<float4> RWIrradianceCachePositionRadius;
RWBuffer<float4> RWIrradianceCacheNormal;
/** Draw arguments filled by PopulateCacheCS: [0] vertex count per instance, [1] record count (atomically incremented), [2] and [3] zero. */
RWBuffer<uint> RWScatterDrawParameters;
RWBuffer<uint2> RWIrradianceCacheTileCoordinate;
/** Sampled by PopulateCacheCS; a w below .0001f is required before a new record is created at that location. */
Texture2D IrradianceCacheSplatTexture;
SamplerState IrradianceCacheSplatSampler;
/** Group local staging for the records created by one thread group (each thread queues at most one). */
groupshared float4 CachedIrradianceCachePositionRadius[THREADGROUP_TOTALSIZE];
groupshared float4 CachedIrradianceCacheNormal[THREADGROUP_TOTALSIZE];
groupshared uint2 CachedIrradianceCacheTileCoordinate[THREADGROUP_TOTALSIZE];
/** Number of records staged by this group, and the first global record index reserved for them. */
groupshared uint NumQueuedIrradianceCacheRecords;
groupshared uint BaseRecordIndex;
/**
 * Creates new surface cache records for sample points that don't have valid coverage from existing surface cache records.
 * One thread per sample of the current level. Qualifying threads stage a record in groupshared memory, the group then
 * reserves a contiguous range of global records with one atomic on RWScatterDrawParameters[1] and copies the staged
 * records out. The very first thread of the dispatch also fills the remaining scatter draw arguments.
 */
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void PopulateCacheCS(
uint3 GroupId : SV_GroupID,
uint3 DispatchThreadId : SV_DispatchThreadID,
uint3 GroupThreadId : SV_GroupThreadID)
{
uint ThreadIndex = GroupThreadId.y * THREADGROUP_SIZEX + GroupThreadId.x;
// Reset the group's staged record count before any thread queues a record
if (ThreadIndex == 0)
{
NumQueuedIrradianceCacheRecords = 0;
}
GroupMemoryBarrierWithGroupSync();
float2 ScreenUV = (DispatchThreadId.xy * CurrentLevelDownsampleFactor + View.ViewRectMin.xy + float2(.5f, .5f)) * View.ViewSizeAndSceneTexelSize.zw;
float2 ScreenPosition = (ScreenUV.xy - View.ScreenPositionScaleBias.wz) / View.ScreenPositionScaleBias.xy;
float2 DownsampledScreenUV = (DispatchThreadId.xy + float2(.5f, .5f)) / AOBufferSize;
FGBufferData GBufferData = GetGBufferData(ScreenUV);
float4 IrradianceCacheSplat = Texture2DSampleLevel(IrradianceCacheSplatTexture, IrradianceCacheSplatSampler, DownsampledScreenUV, 0);
// A record is only created for a non-zero shading model, where the splat texture's w is (near) zero,
// and for threads that fall inside the AO buffer
BRANCH
if (GBufferData.ShadingModelID > 0
&& IrradianceCacheSplat.w < .0001f
&& all((float2)DispatchThreadId.xy < AOBufferSize))
{
float SceneDepth = CalcSceneDepth(ScreenUV);
BRANCH
if (SceneDepth < AOMaxViewDistance)
{
uint2 TileCoordinate = DispatchThreadId.xy * DownsampleFactorToBaseLevel / uint2(THREADGROUP_SIZEX, THREADGROUP_SIZEY);
float4 HomogeneousWorldPosition = mul(float4(ScreenPosition * SceneDepth, SceneDepth, 1), View.ScreenToWorld);
float3 OpaqueWorldPosition = HomogeneousWorldPosition.xyz / HomogeneousWorldPosition.w;
float2 BaseLevelScreenUV = (DispatchThreadId.xy * DownsampleFactorToBaseLevel + float2(.5f, .5f)) * BaseLevelTexelSize;
float3 WorldNormal;
float Unused;
bool bHasDistanceFieldRepresentation;
bool bHasHeightfieldRepresentation;
GetDownsampledGBuffer(BaseLevelScreenUV, WorldNormal, Unused, bHasDistanceFieldRepresentation, bHasHeightfieldRepresentation);
//@todo - offset shading position along normal to avoid incorrect self-occlusion?
float3 WorldShadingPosition = OpaqueWorldPosition;
// For debugging
//if (all(DispatchThreadId.xy == uint2(4,3)))
{
// Allocate a new record and store off attributes of the created record
// Each thread queues at most one record, so the index stays below THREADGROUP_TOTALSIZE
uint NextSampleIndex;
InterlockedAdd(NumQueuedIrradianceCacheRecords, 1u, NextSampleIndex);
// W stores max allowed radius, used to limit overdraw from nearby samples placed in the high resolution passes
// NOTE(review): bHasDistanceFieldRepresentation is a bool; the "> 0" comparison relies on its implicit conversion
float MaxRadiusScale = bHasDistanceFieldRepresentation > 0 ? .005f : .0005f;
// Sign of W stores whether the sample is fading in or out
float RadiusSign = 1;
CachedIrradianceCachePositionRadius[NextSampleIndex] = float4(WorldShadingPosition, RadiusSign * SceneDepth * CurrentLevelDownsampleFactor * MaxRadiusScale);
// abs(w) - 1 stores fade in amount (shifted away from 0 to retain sign when fade amount is 0)
// sign(w) stores bHasHeightfieldRepresentation
CachedIrradianceCacheNormal[NextSampleIndex] = float4(WorldNormal, 1 * (bHasHeightfieldRepresentation ? 1 : -1));
CachedIrradianceCacheTileCoordinate[NextSampleIndex] = TileCoordinate;
}
}
}
// All staged records must be visible before the group reserves its output range
GroupMemoryBarrierWithGroupSync();
if (ThreadIndex == 0)
{
// Reserve a contiguous range of global records; the previous count becomes this group's base index
InterlockedAdd(RWScatterDrawParameters[1], NumQueuedIrradianceCacheRecords, BaseRecordIndex);
}
// BaseRecordIndex must be visible to the whole group before copying
GroupMemoryBarrierWithGroupSync();
// Copy the staged records to their reserved global slots
LOOP
for (uint LocalRecordIndex = ThreadIndex; LocalRecordIndex < NumQueuedIrradianceCacheRecords; LocalRecordIndex += THREADGROUP_TOTALSIZE)
{
// NOTE(review): signed int holding the sum of two uints
int SampleIndex = BaseRecordIndex + LocalRecordIndex;
RWIrradianceCachePositionRadius[SampleIndex] = CachedIrradianceCachePositionRadius[LocalRecordIndex];
RWIrradianceCacheNormal[SampleIndex] = CachedIrradianceCacheNormal[LocalRecordIndex];
uint2 TileCoordinate = CachedIrradianceCacheTileCoordinate[LocalRecordIndex];
RWIrradianceCacheTileCoordinate[SampleIndex] = TileCoordinate;
}
// Only the first thread of the whole dispatch fills the draw arguments that are not accumulated atomically
if (all(DispatchThreadId == 0))
{
// VertexCountPerInstance
RWScatterDrawParameters[0] = NumCircleSections * 3;
// StartVertexLocation
RWScatterDrawParameters[2] = 0;
// StartInstanceLocation
RWScatterDrawParameters[3] = 0;
}
}
// Multiplied by the step offset along a cone to get the sphere radius at that step in ConeTraceOcclusionCS
float TanConeHalfAngle;
float RecordRadiusScale;
RWBuffer<float> RWOccluderRadius;
RWBuffer<float> RWRecordConeVisibility;
RWBuffer<float> RWRecordConeData;
RWBuffer<float4> RWDebugBuffer;
// Have to disable surface caching dependencies for this to work
// When non-zero, ConeTraceOcclusionCS replaces every cone direction with one fixed debug direction
#define VISUALIZE_ONE_CONE 0
// Enforce one thread per cone direction
// Number of objects traced at the same time by one thread group (stride of the object loop in ConeTraceOcclusionCS)
#define SIMULTANEOUSLY_TRACED_OBJECTS (FINAL_GATHER_THREADGROUP_SIZE / NUM_CONE_DIRECTIONS)
// Equals NUM_CONE_DIRECTIONS when FINAL_GATHER_THREADGROUP_SIZE is a multiple of it; integer division otherwise - TODO confirm the sizes always divide evenly
#define THREADS_PER_OBJECT (FINAL_GATHER_THREADGROUP_SIZE / SIMULTANEOUSLY_TRACED_OBJECTS)
// Group shared accumulators holding float bit patterns; ConeTraceOcclusionCS initializes the visibilities to asuint(1.0f)
groupshared uint SharedConeVisibility[NUM_CONE_DIRECTIONS];
groupshared uint SharedConeRawVisibility[NUM_CONE_DIRECTIONS][NUM_VISIBILITY_STEPS];
// Initialized to asuint(AOMaxDistance) by ConeTraceOcclusionCS
groupshared uint SharedMinOcclusionDistance;
groupshared float3 SharedGatheredLighting[FINAL_GATHER_THREADGROUP_SIZE];
// Capacity of the per-record culled object list, which is only filled when CULL_OBJECTS_TO_RECORD is enabled
#define MAX_RECORD_CULLED_OBJECTS 128
groupshared uint SharedCulledObjectList[MAX_RECORD_CULLED_OBJECTS];
groupshared uint NumRecordCulledObjects;
// Inconsistent performance savings + overflow of shared memory array
// (the culling path appends to SharedCulledObjectList without checking MAX_RECORD_CULLED_OBJECTS)
#define CULL_OBJECTS_TO_RECORD 0
/** Computes ambient occlusion for a surface cache record by cone stepping through the nearby object distance fields. */
[numthreads(FINAL_GATHER_THREADGROUP_SIZE, 1, 1)]
void ConeTraceOcclusionCS(
uint3 GroupId : SV_GroupID,
uint3 DispatchThreadId : SV_DispatchThreadID,
uint3 GroupThreadId : SV_GroupThreadID)
{
uint ThreadIndex = GroupThreadId.x;
uint ObjectOffsetIndex = ThreadIndex / THREADS_PER_OBJECT;
if (ThreadIndex == 0)
{
for (uint ConeIndex = 0; ConeIndex < NUM_CONE_DIRECTIONS; ConeIndex++)
{
SharedConeVisibility[ConeIndex] = asuint(1.0f);
#if SUPPORT_IRRADIANCE
UNROLL
for (uint i = 0; i < NUM_VISIBILITY_STEPS; i++)
{
SharedConeRawVisibility[ConeIndex][i] = asuint(1.0f);
}
#endif
}
SharedMinOcclusionDistance = asuint(AOMaxDistance);
NumRecordCulledObjects = 0;
}
GroupMemoryBarrierWithGroupSync();
uint StartIndex = SavedStartIndex[0];
uint NumRecords = ScatterDrawParameters[1];
uint RecordIndex = StartIndex + GroupId.x;
float3 TangentX;
float3 TangentY;
float3 WorldNormal;
{
WorldNormal = IrradianceCacheNormal[RecordIndex].xyz;
float3 WorldShadingPosition = IrradianceCachePositionRadius[RecordIndex].xyz;
uint2 TileCoordinate = IrradianceCacheTileCoordinate[RecordIndex];
uint4 TileHead = GetTileHead(TileCoordinate);
uint NumObjectsAffectingTile = TileHead.y + TileHead.z + TileHead.w;
uint NumCulledObjects = NumObjectsAffectingTile;
#if CULL_OBJECTS_TO_RECORD
LOOP
for (uint ListObjectIndex = ThreadIndex; ListObjectIndex < NumObjectsAffectingTile; ListObjectIndex += FINAL_GATHER_THREADGROUP_SIZE)
{
if (ListObjectIndex < NumObjectsAffectingTile)
{
uint ListIndex = 0;
uint ArrayIndex = ListObjectIndex;
FLATTEN
if (ListObjectIndex >= TileHead.y + TileHead.z)
{
ListIndex = 2;
ArrayIndex = ListObjectIndex - TileHead.y - TileHead.z;
}
else if (ListObjectIndex >= TileHead.y)
{
ListIndex = 1;
ArrayIndex = ListObjectIndex - TileHead.y;
}
uint ObjectIndex = TileArrayData.Load((ArrayIndex * TileListGroupSize.x * TileListGroupSize.y + TileHead.x) * NUM_CULLED_OBJECT_LISTS + ListIndex);
float4 ObjectPositionAndRadius = LoadObjectPositionAndRadius(ObjectIndex);
float ObjectDistanceSq = dot(ObjectPositionAndRadius.xyz - WorldShadingPosition, ObjectPositionAndRadius.xyz - WorldShadingPosition);
BRANCH
// Skip tracing objects with a small projected angle
if (ObjectPositionAndRadius.w * ObjectPositionAndRadius.w / ObjectDistanceSq > Square(.25f)
// Skip tracing objects outside the max occlusion distance
//@todo - box distance
&& ObjectDistanceSq < Square(ObjectPositionAndRadius.w + AOMaxDistance))
{
uint DestIndex;
InterlockedAdd(NumRecordCulledObjects, 1U, DestIndex);
SharedCulledObjectList[DestIndex] = ObjectIndex;
}
}
}
GroupMemoryBarrierWithGroupSync();
NumCulledObjects = NumRecordCulledObjects;
#endif
FindBestAxisVectors2(WorldNormal, TangentX, TangentY);
float3 DebugConeDirection = normalize(float3(.3f, .5f, .4f));
uint ConeIndex = ThreadIndex % THREADS_PER_OBJECT;
float3 ConeDirection = AOSamples2.SampleDirections[ConeIndex].xyz;
float3 RotatedConeDirection = ConeDirection.x * TangentX + ConeDirection.y * TangentY + ConeDirection.z * WorldNormal;
#if VISUALIZE_ONE_CONE
RotatedConeDirection = DebugConeDirection;
#endif
float MinVisibility = 1;
float MinRawVisibility[NUM_VISIBILITY_STEPS];
float MinWorldDistanceToOccluder = AOMaxDistance;
float ConeDistanceAtClosestToOccluder = 0;
float MaxWorldStepOffset = GetStepOffset(NUM_CONE_STEPS);
#if SUPPORT_IRRADIANCE
UNROLL
for (uint i = 0; i < NUM_VISIBILITY_STEPS; i++)
{
MinRawVisibility[i] = 1;
}
#endif
LOOP
for (uint ListObjectIndex = 0; ListObjectIndex < NumCulledObjects; ListObjectIndex += SIMULTANEOUSLY_TRACED_OBJECTS)
{
uint EffectiveListObjectIndex = ListObjectIndex + ObjectOffsetIndex;
if (EffectiveListObjectIndex < NumCulledObjects
// Ignore extra threads
&& ObjectOffsetIndex < SIMULTANEOUSLY_TRACED_OBJECTS)
{
#if CULL_OBJECTS_TO_RECORD
uint ObjectIndex = SharedCulledObjectList[EffectiveListObjectIndex];
{
#else
uint ListIndex = 0;
uint ArrayIndex = EffectiveListObjectIndex;
FLATTEN
if (EffectiveListObjectIndex >= TileHead.y + TileHead.z)
{
ListIndex = 2;
ArrayIndex = EffectiveListObjectIndex - TileHead.y - TileHead.z;
}
else if (EffectiveListObjectIndex >= TileHead.y)
{
ListIndex = 1;
ArrayIndex = EffectiveListObjectIndex - TileHead.y;
}
uint ObjectIndex = TileArrayData.Load((ArrayIndex * TileListGroupSize.x * TileListGroupSize.y + TileHead.x) * NUM_CULLED_OBJECT_LISTS + ListIndex);
float4 ObjectPositionAndRadius = LoadObjectPositionAndRadius(ObjectIndex);
float ObjectDistanceSq = dot(ObjectPositionAndRadius.xyz - WorldShadingPosition, ObjectPositionAndRadius.xyz - WorldShadingPosition);
BRANCH
// Skip tracing objects with a small projected angle
if (ObjectPositionAndRadius.w * ObjectPositionAndRadius.w / ObjectDistanceSq > Square(.25f))
{
#endif
float3 LocalPositionExtent = LoadObjectLocalPositionExtent(ObjectIndex);
float4x4 WorldToVolume = LoadObjectWorldToVolume(ObjectIndex);
bool bGeneratedAsTwoSided;
float4 UVScaleAndVolumeScale = LoadObjectUVScale(ObjectIndex, bGeneratedAsTwoSided);
float3 VolumeShadingPosition = mul(float4(WorldShadingPosition, 1), WorldToVolume).xyz;
float ObjectOccluderRadius = length(LocalPositionExtent) * .5f * UVScaleAndVolumeScale.w;
float BoxDistance = ComputeDistanceFromBoxToPoint(-LocalPositionExtent, LocalPositionExtent, VolumeShadingPosition) * UVScaleAndVolumeScale.w;
BRANCH
if (BoxDistance < AOMaxDistance)
{
float3 UVAdd = LoadObjectUVAdd(ObjectIndex);
uint StartStepIndex = 0;
#if !CULL_OBJECTS_TO_RECORD
FLATTEN
if (EffectiveListObjectIndex >= TileHead.y + TileHead.z)
{
StartStepIndex = 8;
}
else if (EffectiveListObjectIndex >= TileHead.y)
{
StartStepIndex = 5;
}
#endif
float WorldStepOffset = GetStepOffset(StartStepIndex);
LOOP
for (uint StepIndex = StartStepIndex; StepIndex < NUM_CONE_STEPS && WorldStepOffset < MaxWorldStepOffset; StepIndex++)
{
float3 WorldSamplePosition = WorldShadingPosition + RotatedConeDirection * WorldStepOffset;
float3 StepSamplePosition = mul(float4(WorldSamplePosition, 1), WorldToVolume).xyz;
float3 ClampedSamplePosition = clamp(StepSamplePosition, -LocalPositionExtent, LocalPositionExtent);
float DistanceToClamped = length(StepSamplePosition - ClampedSamplePosition);
float3 StepVolumeUV = DistanceFieldVolumePositionToUV(ClampedSamplePosition, UVScaleAndVolumeScale.xyz, UVAdd);
float DistanceToOccluder = (Texture3DSampleLevel(DistanceFieldTexture, DistanceFieldSampler, StepVolumeUV, 0).x + DistanceToClamped) * UVScaleAndVolumeScale.w;
float SphereRadius = WorldStepOffset * TanConeHalfAngle;
//@todo - have to bias away from surface further for this to work
float ShadingSphereRadius = SphereRadius * 1.0f;
// Derive visibility from 1d intersection
float Visibility = saturate(DistanceToOccluder / ShadingSphereRadius);
// Don't allow small objects to fully occlude a cone step
Visibility = max(Visibility, 1 - saturate(ObjectOccluderRadius / SphereRadius));
float OccluderDistanceFraction = (WorldStepOffset + DistanceToOccluder) / AOMaxDistance;
#if SUPPORT_IRRADIANCE
uint VisibilityIndex = NUM_VISIBILITY_STEPS * WorldStepOffset / AOMaxDistance;
// Less GI occlusion for two sided meshes, which can't separate self-occlusion
//@todo - expose
float TwoSidedVisibilityScale = bGeneratedAsTwoSided ? 100 : 1;
// Track raw visibility before the distance fade for GI shadowing
MinRawVisibility[VisibilityIndex] = min(MinRawVisibility[VisibilityIndex], TwoSidedVisibilityScale * Visibility);
#endif
// Fade out occlusion based on distance to occluder to avoid a discontinuity at the max AO distance
Visibility = max(Visibility, saturate(OccluderDistanceFraction * OccluderDistanceFraction * .8f));
MinVisibility = min(MinVisibility, Visibility);
if (DistanceToOccluder < .9f * SphereRadius)
{
// Assuming occluder is straight forward along the cone
float WorldDistanceToOccluder = WorldStepOffset + DistanceToOccluder;
MinWorldDistanceToOccluder = min(MinWorldDistanceToOccluder, WorldDistanceToOccluder);
}
float MinStepSize = .6f * (GetStepOffset(StepIndex + 1) - GetStepOffset(StepIndex));
WorldStepOffset += max(DistanceToOccluder, MinStepSize);
}
}
}
}
}
InterlockedMin(SharedConeVisibility[ConeIndex], asuint(MinVisibility));
InterlockedMin(SharedMinOcclusionDistance, asuint(max(MinWorldDistanceToOccluder, 0)));
#if SUPPORT_IRRADIANCE
UNROLL
for (uint i = 0; i < NUM_VISIBILITY_STEPS; i++)
{
InterlockedMin(SharedConeRawVisibility[ConeIndex][i], asuint(MinRawVisibility[i]));
}
GroupMemoryBarrierWithGroupSync();
// Only need one thread per cone direction to write
if (ThreadIndex < NUM_CONE_DIRECTIONS)
{
uint RecordConeDataIndex = (GroupId.x * NUM_CONE_DIRECTIONS + ConeIndex) * RECORD_CONE_DATA_STRIDE;
float MinStepVisibility = 1;
UNROLL
for (uint i = 0; i < NUM_VISIBILITY_STEPS; i++)
{
float StepVisibility = asfloat(SharedConeRawVisibility[ConeIndex][i]);
// Propagate min visibility down the cone
MinStepVisibility = min(MinStepVisibility, StepVisibility);
RWRecordConeData[RecordConeDataIndex + i] = MinStepVisibility;
}
}
#endif
}
GroupMemoryBarrierWithGroupSync();
if (ThreadIndex == 0)
{
for (uint ConeIndex = 0; ConeIndex < NUM_CONE_DIRECTIONS; ConeIndex++)
{
float ConeVisibility = asfloat(SharedConeVisibility[ConeIndex]);
RWRecordConeVisibility[GroupId.x * NUM_CONE_DIRECTIONS + ConeIndex] = ConeVisibility;
}
float RecordRadius = RecordRadiusScale * asfloat(SharedMinOcclusionDistance);
RWOccluderRadius[RecordIndex] = RecordRadius;
}
}
// Output of CombineConesCS: one float4 per irradiance cache record, xyz = the direction returned by ComputeBentNormal, w = 0
RWBuffer<float4> RWIrradianceCacheBentNormal;
/**
 * Runs one thread per irradiance cache record, starting at the record index stored in SavedStartIndex[0].
 * Each thread reads the record's world normal, asks ComputeBentNormal for the unoccluded direction
 * and stores it in RWIrradianceCacheBentNormal (w is written as 0).
 */
[numthreads(FINAL_GATHER_THREADGROUP_SIZE, 1, 1)]
void CombineConesCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	// Offset of this thread's record from the first record being processed
	uint RelativeRecordIndex = DispatchThreadId.x;
	uint AbsoluteRecordIndex = SavedStartIndex[0] + RelativeRecordIndex;
	uint RecordCount = ScatterDrawParameters[1];

	// Threads that land past the last record have nothing to do
	if (AbsoluteRecordIndex < RecordCount)
	{
		float3 WorldNormal = IrradianceCacheNormal[AbsoluteRecordIndex].xyz;
		// Note that ComputeBentNormal is given the relative index, not the absolute one
		float3 BentNormal = ComputeBentNormal(WorldNormal, RelativeRecordIndex);

		RWIrradianceCacheBentNormal[AbsoluteRecordIndex] = float4(BentNormal, 0);
	}
}
// NOTE(review): COMPACT_THREADGROUP_SIZEX is not referenced by the shaders that follow in this part of the file - confirm it is still needed
#define COMPACT_THREADGROUP_SIZEX 64
// Element [1] is read as the irradiance cache record count by the setup and copy shaders below
Buffer<uint> DrawParameters;
// Indirect dispatch arguments: the setup shaders below write the thread group counts to elements [0], [1] and [2]
RWBuffer<uint> RWDispatchParameters;
// Element [0] receives the value of DrawParameters[1] in SaveStartIndexCS
RWBuffer<uint> RWSavedStartIndex;
/** Single-thread pass that stores the current value of DrawParameters[1] in RWSavedStartIndex[0]. */
[numthreads(1, 1, 1)]
void SaveStartIndexCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint CurrentRecordCount = DrawParameters[1];
	RWSavedStartIndex[0] = CurrentRecordCount;
}
/**
 * Single-thread pass that fills RWDispatchParameters with the thread group counts needed to process
 * the records in the range [SavedStartIndex[0], DrawParameters[1]).
 */
[numthreads(1, 1, 1)]
void SetupFinalGatherIndirectArgumentsCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint FirstRecord = SavedStartIndex[0];
	uint TotalRecords = DrawParameters[1];
	uint NumRecordsToProcess = TotalRecords - FirstRecord;

#if ONE_GROUP_PER_RECORD
	// Every record gets a thread group of its own
	uint NumGroupsX = NumRecordsToProcess;
#else
	// Every record gets a single thread, so the group count is the record count divided by the group size, rounded up
	uint NumGroupsX = (NumRecordsToProcess + FINAL_GATHER_THREADGROUP_SIZE - 1) / FINAL_GATHER_THREADGROUP_SIZE;
#endif

	RWDispatchParameters[0] = NumGroupsX;
	RWDispatchParameters[1] = 1;
	RWDispatchParameters[2] = 1;
}
// Thread group size of CopyIrradianceCacheSamplesCS; SetupCopyIndirectArgumentsCS divides by it to get the group count
#define COPY_THREADGROUP_SIZE 256
// Fraction of the records, counted from index 0, that the copy pass drops when FADE_RECORDS_OVER_TIME is disabled
float TrimFraction;
/**
 * Single-thread pass that fills RWDispatchParameters with the thread group counts for CopyIrradianceCacheSamplesCS.
 *
 * With FADE_RECORDS_OVER_TIME one thread is spawned per existing record and RWScatterDrawParameters[1]
 * is cleared so the copy pass can accumulate the surviving record count into it.
 * Otherwise one thread is spawned per record that survives the trim.
 */
[numthreads(1, 1, 1)]
void SetupCopyIndirectArgumentsCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
#if FADE_RECORDS_OVER_TIME
	// Spawn a thread per record
	RWDispatchParameters[0] = (DrawParameters[1] + COPY_THREADGROUP_SIZE - 1) / COPY_THREADGROUP_SIZE;
#else
	uint NumRecords = DrawParameters[1];

	// Derive the number of copied records exactly the way CopyIrradianceCacheSamplesCS does.
	// Sizing the dispatch from (1 - TrimFraction) * NumRecords truncates differently from
	// NumRecords - (uint)(NumRecords * TrimFraction), and could come up one thread short whenever
	// the truncated value was a multiple of COPY_THREADGROUP_SIZE, leaving the last record uncopied.
	uint StartIndex = max(NumRecords * TrimFraction, 1);
	StartIndex = min(StartIndex, NumRecords);
	uint NumRecordsToCopy = NumRecords - StartIndex;

	// Make sure we spawn at least one group so RWScatterDrawParameters gets written to in the next pass
	RWDispatchParameters[0] = max((NumRecordsToCopy + COPY_THREADGROUP_SIZE - 1) / COPY_THREADGROUP_SIZE, 1);
#endif

	RWDispatchParameters[1] = 1;
	RWDispatchParameters[2] = 1;

#if FADE_RECORDS_OVER_TIME
	if (DispatchThreadId.x == 0)
	{
		// Clear to 0 to prepare for accumulation
		RWScatterDrawParameters[1] = 0;
	}
#endif
}
// Destination buffers written by CopyIrradianceCacheSamplesCS, one per irradiance cache record attribute
RWBuffer<float4> RWCopyIrradianceCachePositionRadius;
RWBuffer<float4> RWCopyIrradianceCacheNormal;
RWBuffer<float> RWCopyOccluderRadius;
RWBuffer<float4> RWCopyIrradianceCacheBentNormal;
// Only written when SUPPORT_IRRADIANCE is enabled
RWBuffer<float4> RWCopyIrradianceCacheIrradiance;
RWBuffer<uint2> RWCopyIrradianceCacheTileCoordinate;
// Source of the values copied into RWCopyOccluderRadius
Buffer<float> OccluderRadius;
/**
 * Copies irradiance cache records from the source buffers into the RWCopy* buffers and leaves the
 * resulting record count in RWScatterDrawParameters[1].
 *
 * With FADE_RECORDS_OVER_TIME: one thread per source record. The record's fade (stored in the w of its normal)
 * is advanced, and the record is appended to the output only while the new fade is below 1.
 * Output slots are handed out by an atomic counter.
 *
 * Otherwise: the records before a start index derived from TrimFraction are dropped and the remaining
 * ones are copied, thread N writing destination slot N.
 */
[numthreads(COPY_THREADGROUP_SIZE, 1, 1)]
void CopyIrradianceCacheSamplesCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint NumRecords = DrawParameters[1];

#if FADE_RECORDS_OVER_TIME

	uint SourceIndex = DispatchThreadId.x;

	if (SourceIndex < NumRecords)
	{
		float4 PositionRadius = IrradianceCachePositionRadius[SourceIndex];
		float4 NormalFade = IrradianceCacheNormal[SourceIndex];

		// Advance the fade by the (clamped) general purpose tweak value
		float FadeStep = clamp(View.GeneralPurposeTweak, .001f, 1);
		float UpdatedFade = abs(NormalFade.w) - 1 + FadeStep;

		// Records whose fade has reached 1 are dropped, everything else is written out
		// (disabled alternative condition: SourceIndex > (uint)(NumRecords * TrimFraction))
		if (UpdatedFade < 1)
		{
			// Disabled: always trim at least one to handle dynamic scene changes not accounted for with irradiance cache misses
			//if (SourceIndex <= (uint)(NumRecords * TrimFraction) && PositionRadius.w > 0)
			//{
			//	// Mark as fading out
			//	PositionRadius.w *= -1;
			//	// Reset fade to 0
			//	UpdatedFade = 0;
			//}

			// Store the fade back with the +1 offset, keeping the sign of the original value
			NormalFade.w = (UpdatedFade + 1) * sign(NormalFade.w);

			// Grab the next free slot in the output
			uint DestIndex;
			InterlockedAdd(RWScatterDrawParameters[1], 1U, DestIndex);

			RWCopyIrradianceCachePositionRadius[DestIndex] = PositionRadius;
			RWCopyIrradianceCacheNormal[DestIndex] = NormalFade;
			RWCopyOccluderRadius[DestIndex] = OccluderRadius[SourceIndex];
			RWCopyIrradianceCacheBentNormal[DestIndex] = IrradianceCacheBentNormal[SourceIndex];
#if SUPPORT_IRRADIANCE
			RWCopyIrradianceCacheIrradiance[DestIndex] = IrradianceCacheIrradiance[SourceIndex];
#endif
			RWCopyIrradianceCacheTileCoordinate[DestIndex] = IrradianceCacheTileCoordinate[SourceIndex];
		}
	}

#else

	// Always trim at least one to handle dynamic scene changes not accounted for with irradiance cache misses
	uint FirstKeptRecord = max(NumRecords * TrimFraction, 1);
	// Never start past the end of the record list
	FirstKeptRecord = min(FirstKeptRecord, NumRecords);

	uint DestIndex = DispatchThreadId.x;
	uint SourceIndex = FirstKeptRecord + DestIndex;

	if (SourceIndex < NumRecords)
	{
		RWCopyIrradianceCachePositionRadius[DestIndex] = IrradianceCachePositionRadius[SourceIndex];
		RWCopyIrradianceCacheNormal[DestIndex] = IrradianceCacheNormal[SourceIndex];
		RWCopyOccluderRadius[DestIndex] = OccluderRadius[SourceIndex];
		RWCopyIrradianceCacheBentNormal[DestIndex] = IrradianceCacheBentNormal[SourceIndex];
#if SUPPORT_IRRADIANCE
		RWCopyIrradianceCacheIrradiance[DestIndex] = IrradianceCacheIrradiance[SourceIndex];
#endif
		RWCopyIrradianceCacheTileCoordinate[DestIndex] = IrradianceCacheTileCoordinate[SourceIndex];
	}

	// The first thread publishes how many records remain after the trim
	if (DestIndex == 0)
	{
		RWScatterDrawParameters[1] = NumRecords - FirstKeptRecord;
	}

#endif
}