// Copyright 1998-2015 Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	DistanceFieldSurfaceCacheLightingCompute.usf
=============================================================================*/

#include "Common.usf"
#include "DeferredShadingCommon.usf"
#include "ReflectionEnvironmentShared.usf"
#include "DistanceFieldLightingShared.usf"
#include "DistanceFieldAOShared.usf"
#include "MonteCarlo.usf"

// NOTE(review): the Buffer / RWBuffer declarations in this file carry no explicit element type
// (e.g. Buffer<uint>). Confirm against the upstream file that template arguments were not lost.

uint NumUploadOperations;
Buffer UploadOperationIndices;
Buffer UploadOperationData;

// In float4's. Must match equivalent C++ variables.
#define UPLOAD_DATA_STRIDE (1 + OBJECT_DATA_STRIDE)

/** Writes the float4 at UploadOperationData[UploadIndex] into four consecutive scalars of RWObjectData, starting at 4 * DestIndex. */
void UploadDataFloat4(uint DestIndex, uint UploadIndex)
{
	const uint DestBase = 4 * DestIndex;
	const float4 Packed = UploadOperationData[UploadIndex];

	RWObjectData[DestBase + 0] = Packed.x;
	RWObjectData[DestBase + 1] = Packed.y;
	RWObjectData[DestBase + 2] = Packed.z;
	RWObjectData[DestBase + 3] = Packed.w;
}

/**
 * One thread per upload operation: writes the operation's bounds (first float4 of the upload record)
 * and its OBJECT_DATA_STRIDE float4s of object data into the destination slot named by UploadOperationIndices.
 */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void UploadObjectsToBufferCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	const uint OperationIndex = DispatchThreadId.x;

	// Threads past the end of the operation list have nothing to do; no barrier follows, so leaving early is safe
	if (OperationIndex >= NumUploadOperations)
	{
		return;
	}

	const uint ObjectSlot = UploadOperationIndices[OperationIndex];
	// Index of the first float4 belonging to this operation in UploadOperationData
	const uint UploadBase = OperationIndex * UPLOAD_DATA_STRIDE;

	const float4 Bounds = UploadOperationData[UploadBase + 0];
	const uint BoundsBase = 4 * ObjectSlot;
	RWObjectBounds[BoundsBase + 0] = Bounds.x;
	RWObjectBounds[BoundsBase + 1] = Bounds.y;
	RWObjectBounds[BoundsBase + 2] = Bounds.z;
	RWObjectBounds[BoundsBase + 3] = Bounds.w;

	// The object data follows the bounds in the upload record, hence the + 1 on the source side
	UNROLL
	for (uint VectorIndex = 0; VectorIndex < OBJECT_DATA_STRIDE; VectorIndex++)
	{
		UploadDataFloat4(ObjectSlot * OBJECT_DATA_STRIDE + VectorIndex, UploadBase + VectorIndex + 1);
	}
}

RWBuffer RWCopyObjectBounds;
RWBuffer RWCopyObjectData;

/** Copies four consecutive scalars (one float4's worth) from ObjectData to RWCopyObjectData. */
void CopyDataFloat4(uint DestIndex, uint SourceIndex)
{
	UNROLL
	for (uint Component = 0; Component < 4; Component++)
	{
		RWCopyObjectData[4 * DestIndex + Component] = ObjectData[4 * SourceIndex + Component];
	}
}

/** One thread per scene object: copies the object's bounds and data into the RWCopy* buffers at the same index. */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void CopyObjectBufferCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	// Source and destination slots are identical for a straight copy
	const uint ObjectSlot = DispatchThreadId.x;

	if (ObjectSlot >= NumSceneObjects)
	{
		return;
	}

	UNROLL
	for (uint Component = 0; Component < 4; Component++)
	{
		RWCopyObjectBounds[4 * ObjectSlot + Component] = ObjectBounds[4 * ObjectSlot + Component];
	}

	UNROLL
	for (uint VectorIndex = 0; VectorIndex < OBJECT_DATA_STRIDE; VectorIndex++)
	{
		const uint Float4Index = ObjectSlot * OBJECT_DATA_STRIDE + VectorIndex;
		CopyDataFloat4(Float4Index, Float4Index);
	}
}

uint NumRemoveOperations;
Buffer RemoveOperationIndices;

// The remove pass reads either from the buffers it writes to, or from a second read-only pair
#if REMOVE_FROM_SAME_BUFFER
	#define RWBoundsRemoveSource RWObjectBounds
	#define RWDataRemoveSource RWObjectData
#else
	Buffer ObjectBounds2;
	Buffer ObjectData2;
	#define RWBoundsRemoveSource ObjectBounds2
	#define RWDataRemoveSource ObjectData2
#endif

/** Copies four consecutive scalars from RWDataRemoveSource into RWObjectData. */
void WriteDataFloat4(uint DestIndex, uint SourceIndex)
{
	UNROLL
	for (uint Component = 0; Component < 4; Component++)
	{
		RWObjectData[4 * DestIndex + Component] = RWDataRemoveSource[4 * SourceIndex + Component];
	}
}

/** One thread per remove operation: overwrites the slot in .x with the object stored in the slot in .y. */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void RemoveObjectsFromBufferCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	const uint OperationIndex = DispatchThreadId.x;

	if (OperationIndex >= NumRemoveOperations)
	{
		return;
	}

	// RemoveAtSwap
	const uint MovedFromSlot = RemoveOperationIndices[OperationIndex].y;
	const uint RemovedSlot = RemoveOperationIndices[OperationIndex].x;

	UNROLL
	for (uint Component = 0; Component < 4; Component++)
	{
		RWObjectBounds[4 * RemovedSlot + Component] = RWBoundsRemoveSource[4 * MovedFromSlot + Component];
	}

	UNROLL
	for (uint VectorIndex = 0; VectorIndex < OBJECT_DATA_STRIDE; VectorIndex++)
	{
		WriteDataFloat4(RemovedSlot * OBJECT_DATA_STRIDE + VectorIndex, MovedFromSlot * OBJECT_DATA_STRIDE + VectorIndex);
	}
}

RWBuffer RWCulledObjectBounds;
RWBuffer RWCulledObjectData;
RWBuffer RWCulledObjectBoxBounds;

uint ObjectBoundingGeometryIndexCount;

groupshared uint NumGroupObjects;
groupshared uint GroupBaseIndex;
groupshared uint GroupObjectIndices[UPDATEOBJECTS_THREADGROUP_SIZE];

/** Reassembles a float4 from four consecutive scalars of ObjectData. */
float4 FetchObjectDataFloat4(uint SourceIndex)
{
	const uint SourceBase = 4 * SourceIndex;

	return float4(
		ObjectData[SourceBase + 0],
		ObjectData[SourceBase + 1],
		ObjectData[SourceBase + 2],
		ObjectData[SourceBase + 3]);
}

/** Copies one object's bounds and the leading part of its data into the culled object buffers. */
void CopyCulledObjectData(uint DestIndex, uint SourceIndex)
{
	const uint BoundsBase = 4 * SourceIndex;

	RWCulledObjectBounds[DestIndex] = float4(
		ObjectBounds[BoundsBase + 0],
		ObjectBounds[BoundsBase + 1],
		ObjectBounds[BoundsBase + 2],
		ObjectBounds[BoundsBase + 3]);

	// Note: only copying the first CULLED_OBJECT_DATA_STRIDE of the original object data
	UNROLL
	for (uint VectorIndex = 0; VectorIndex < CULLED_OBJECT_DATA_STRIDE; VectorIndex++)
	{
		RWCulledObjectData[DestIndex * CULLED_OBJECT_DATA_STRIDE + VectorIndex] = FetchObjectDataFloat4(SourceIndex * OBJECT_DATA_STRIDE + VectorIndex);
	}
}

/**
 * One thread per scene object. With frustum culling enabled, surviving objects are first gathered per group
 * in groupshared memory, then each group reserves a contiguous range of the culled buffers with a single atomic.
 */
[numthreads(UPDATEOBJECTS_THREADGROUP_SIZE, 1, 1)]
void CullObjectsForViewCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint ObjectIndex = DispatchThreadId.x;

#define USE_FRUSTUM_CULLING 1
#if USE_FRUSTUM_CULLING

	if (DispatchThreadId.x == 0)
	{
		// RWObjectIndirectArguments is zeroed by a clear before this shader, only need to set things that are non-zero (and are not read by this shader as that would be a race condition)
		// IndexCount, NumInstances, StartIndex, BaseVertexIndex, FirstInstance
		RWObjectIndirectArguments[0] = ObjectBoundingGeometryIndexCount;
	}

	if (GroupThreadId.x == 0)
	{
		NumGroupObjects = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	if (ObjectIndex < NumSceneObjects)
	{
		const uint BoundsBase = 4 * ObjectIndex;
		const float4 BoundingSphere = float4(
			ObjectBounds[BoundsBase + 0],
			ObjectBounds[BoundsBase + 1],
			ObjectBounds[BoundsBase + 2],
			ObjectBounds[BoundsBase + 3]);

		const float3 ObjectToView = View.ViewOrigin.xyz - BoundingSphere.xyz;
		const float DistanceToViewSq = dot(ObjectToView, ObjectToView);

		// Keep objects within the AO view distance whose sphere, grown by the AO distance, touches the view frustum
		if (DistanceToViewSq < Square(AOMaxViewDistance + BoundingSphere.w)
			&& ViewFrustumIntersectSphere(BoundingSphere.xyz, BoundingSphere.w + AOMaxDistance))
		{
			uint GroupSlot;
			InterlockedAdd(NumGroupObjects, 1U, GroupSlot);
			GroupObjectIndices[GroupSlot] = ObjectIndex;
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// One global atomic per group reserves the output range for everything this group kept
	if (GroupThreadId.x == 0)
	{
		InterlockedAdd(RWObjectIndirectArguments[1], NumGroupObjects, GroupBaseIndex);
	}

	GroupMemoryBarrierWithGroupSync();

	if (GroupThreadId.x < NumGroupObjects)
	{
		const uint KeptObjectIndex = GroupObjectIndices[GroupThreadId.x];
		const uint CulledSlot = GroupBaseIndex + GroupThreadId.x;
		CopyCulledObjectData(CulledSlot, KeptObjectIndex);
	}

#else

	if (DispatchThreadId.x == 0)
	{
		// IndexCount, NumInstances, StartIndex, BaseVertexIndex, FirstInstance
		RWObjectIndirectArguments[0] = ObjectBoundingGeometryIndexCount;
		RWObjectIndirectArguments[1] = NumSceneObjects;
	}

	GroupMemoryBarrierWithGroupSync();

	if (ObjectIndex < NumSceneObjects)
	{
		// No culling: every object keeps its own index
		CopyCulledObjectData(ObjectIndex, ObjectIndex);
	}

#endif
}

/** Min and Max depth for
this tile. */
groupshared uint IntegerTileMinZ;
groupshared uint IntegerTileMaxZ;

/** Inner Min and Max depth for this tile. */
groupshared uint IntegerTileMinZ2;
groupshared uint IntegerTileMaxZ2;

/** View rect min in xy, max in zw. */
uint4 ViewDimensions;
float2 NumGroups;

RWBuffer RWTileConeAxisAndCos;
RWBuffer RWTileConeDepthRanges;
RWBuffer RWTileHeadDataUnpacked;

#ifndef MAX_OBJECTS_PER_TILE
#define MAX_OBJECTS_PER_TILE 1
#endif

/**
 * Builds tile depth ranges and bounding cones.
 * One thread group per tile, one thread per downsampled pixel. Thread 0 of each group writes the tile's
 * cone axis / cosine, its two depth ranges along the cone axis, and a tile head entry.
 */
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void BuildTileConesMain(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint ThreadIndex = GroupThreadId.y * THREADGROUP_SIZEX + GroupThreadId.x;

	float2 BaseLevelScreenUV = (DispatchThreadId.xy + float2(.5f, .5f)) * DOWNSAMPLE_FACTOR * View.ViewSizeAndSceneTexelSize.zw;
	float SceneDepth = GetDownsampledDepth(BaseLevelScreenUV);

	//float2 ScreenUV = (DispatchThreadId.xy * DOWNSAMPLE_FACTOR + View.ViewRectMin.xy + float2(.5f, .5f)) * View.ViewSizeAndSceneTexelSize.zw;
	//float SceneDepth = CalcSceneDepth(ScreenUV);

	// Initialize per-tile variables
	if (ThreadIndex == 0)
	{
		// 0x7F7FFFFF is the bit pattern of the largest finite float
		IntegerTileMinZ = 0x7F7FFFFF;
		IntegerTileMaxZ = 0;
		IntegerTileMinZ2 = 0x7F7FFFFF;
		IntegerTileMaxZ2 = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	// Use shared memory atomics to build the depth bounds for this tile
	// Each thread is assigned to a pixel at this point
	// NOTE(review): comparing float bits as uint assumes SceneDepth is non-negative - confirm
	InterlockedMin(IntegerTileMinZ, asuint(SceneDepth));
	InterlockedMax(IntegerTileMaxZ, asuint(SceneDepth));

	GroupMemoryBarrierWithGroupSync();

	float MinTileZ = asfloat(IntegerTileMinZ);
	float MaxTileZ = asfloat(IntegerTileMaxZ);

	float HalfZ = .5f * (MinTileZ + MaxTileZ);

	// Compute a second min and max Z, clipped by HalfZ, so that we get two depth bounds per tile
	// This results in more conservative tile depth bounds and fewer intersections
	if (SceneDepth >= HalfZ)
	{
		InterlockedMin(IntegerTileMinZ2, asuint(SceneDepth));
	}

	if (SceneDepth <= HalfZ)
	{
		InterlockedMax(IntegerTileMaxZ2, asuint(SceneDepth));
	}

	GroupMemoryBarrierWithGroupSync();

	float MinTileZ2 = asfloat(IntegerTileMinZ2);
	float MaxTileZ2 = asfloat(IntegerTileMaxZ2);

	if (ThreadIndex == 0)
	{
		float3 TileConeAxis;
		float TileConeAngleCos;
		float TileConeAngleSin;
		float4 ConeAxisDepthRanges;

		{
			// Half extents of the view at unit depth, derived from the projection matrix
			float2 ViewSize = float2(1 / View.ViewToClip[0][0], 1 / View.ViewToClip[1][1]);
			float3 TileCorner00 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
			float3 TileCorner10 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
			float3 TileCorner01 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));
			float3 TileCorner11 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));

			// The cone axis is the normalized sum of the four corner directions; its angle is measured to corner 00
			TileConeAxis = normalize(TileCorner00 + TileCorner10 + TileCorner01 + TileCorner11);
			TileConeAngleCos = dot(TileConeAxis, TileCorner00);
			TileConeAngleSin = sqrt(1 - TileConeAngleCos * TileConeAngleCos);
			float TileConeAngleTan = TileConeAngleSin / TileConeAngleCos;

			float ConeExpandDistance = 0;
			float VertexPullbackLength = ConeExpandDistance / TileConeAngleTan;
			float DistanceToNearPlane = length(TileConeAxis / TileConeAxis.z * View.NearPlane);
			// 1 / cos(AngleBetweenTileCenterAndViewForward)
			float InvCosTileAngle = 1.0f / TileConeAxis.z;
			float ConeAxisDistanceMultiply = InvCosTileAngle;
			float ConeAxisDistanceAdd = VertexPullbackLength + DistanceToNearPlane;

			// xy is the near depth range [MinTileZ, MaxTileZ2], zw is the far depth range [MinTileZ2, MaxTileZ]
			ConeAxisDepthRanges.x = ConeAxisDistanceMultiply * (MinTileZ - ConeExpandDistance) + ConeAxisDistanceAdd;
			ConeAxisDepthRanges.y = ConeAxisDistanceMultiply * (MaxTileZ2 + ConeExpandDistance) + ConeAxisDistanceAdd;
			ConeAxisDepthRanges.z = ConeAxisDistanceMultiply * (MinTileZ2 - ConeExpandDistance) + ConeAxisDistanceAdd;
			ConeAxisDepthRanges.w = ConeAxisDistanceMultiply * (MaxTileZ + ConeExpandDistance) + ConeAxisDistanceAdd;
		}

		uint TileIndex = GroupId.y * NumGroups.x + GroupId.x;
		RWTileConeAxisAndCos[TileIndex] = float4(TileConeAxis, TileConeAngleCos);
		RWTileConeDepthRanges[TileIndex] = ConeAxisDepthRanges;

		RWTileHeadDataUnpacked[TileIndex * 4 + 0] = TileIndex;
		RWTileHeadDataUnpacked[TileIndex * 4 + 1] = 0;
		RWTileHeadDataUnpacked[TileIndex * 4 + 2] = 0;
		RWTileHeadDataUnpacked[TileIndex * 4 + 3] = 0;
	}
}

groupshared uint SmallTileObjectIndices[MAX_OBJECTS_PER_TILE];
groupshared uint MediumTileObjectIndices[MAX_OBJECTS_PER_TILE];
groupshared uint LargeTileObjectIndices[MAX_OBJECTS_PER_TILE];
groupshared uint SmallTileNumObjects;
groupshared uint MediumTileNumObjects;
groupshared uint LargeTileNumObjects;
groupshared uint TileArrayDataStart;

RWBuffer RWTileArrayData;
RWBuffer RWTileArrayNextAllocation;
RWBuffer RWTileHeadData;

/**
 * Builds, for each tile, three lists of culled objects whose bounding sphere (grown by the small, medium or
 * large sample radius) intersects the tile's bounding cone and depth ranges.
 * One thread group per tile. The lists are gathered in groupshared memory, then appended to RWTileArrayData
 * in small / medium / large order, with the start offset and per-list counts stored in RWTileHeadData.
 * Each list holds at most MAX_OBJECTS_PER_TILE entries; further objects are dropped.
 */
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void DistanceFieldAOBuildTileListMain(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint ThreadIndex = GroupThreadId.y * THREADGROUP_SIZEX + GroupThreadId.x;

	float2 ScreenUV = (DispatchThreadId.xy * DOWNSAMPLE_FACTOR + View.ViewRectMin.xy + float2(.5f, .5f)) * View.ViewSizeAndSceneTexelSize.zw;
	float SceneDepth = CalcSceneDepth(ScreenUV);

	// Initialize per-tile variables
	if (ThreadIndex == 0)
	{
		// 0x7F7FFFFF is the bit pattern of the largest finite float
		IntegerTileMinZ = 0x7F7FFFFF;
		IntegerTileMaxZ = 0;
		IntegerTileMinZ2 = 0x7F7FFFFF;
		IntegerTileMaxZ2 = 0;
		SmallTileNumObjects = 0;
		MediumTileNumObjects = 0;
		LargeTileNumObjects = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	// Use shared memory atomics to build the depth bounds for this tile
	// Each thread is assigned to a pixel at this point
	// NOTE(review): comparing float bits as uint assumes SceneDepth is non-negative - confirm
	InterlockedMin(IntegerTileMinZ, asuint(SceneDepth));
	InterlockedMax(IntegerTileMaxZ, asuint(SceneDepth));

	GroupMemoryBarrierWithGroupSync();

	float MinTileZ = asfloat(IntegerTileMinZ);
	float MaxTileZ = asfloat(IntegerTileMaxZ);

	float HalfZ = .5f * (MinTileZ + MaxTileZ);

	// Compute a second min and max Z, clipped by HalfZ, so that we get two depth bounds per tile
	// This results in more conservative tile depth bounds and fewer intersections
	if (SceneDepth >= HalfZ)
	{
		InterlockedMin(IntegerTileMinZ2, asuint(SceneDepth));
	}

	if (SceneDepth <= HalfZ)
	{
		InterlockedMax(IntegerTileMaxZ2, asuint(SceneDepth));
	}

	GroupMemoryBarrierWithGroupSync();

	float MinTileZ2 = asfloat(IntegerTileMinZ2);
	float MaxTileZ2 = asfloat(IntegerTileMaxZ2);

	float3 TileConeVertex;
	float3 TileConeAxis;
	float TileConeAngleCos;
	float TileConeAngleSin;
	float4 ConeAxisDepthRanges;

	{
		// Half extents of the view at unit depth, derived from the projection matrix
		float2 ViewSize = float2(1 / View.ViewToClip[0][0], 1 / View.ViewToClip[1][1]);
		float3 TileCorner00 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
		float3 TileCorner10 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 0) / NumGroups.y * ViewSize.y * 2, 1));
		float3 TileCorner01 = normalize(float3((GroupId.x + 0) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));
		float3 TileCorner11 = normalize(float3((GroupId.x + 1) / NumGroups.x * ViewSize.x * 2 - ViewSize.x, ViewSize.y - (GroupId.y + 1) / NumGroups.y * ViewSize.y * 2, 1));

		// The cone axis is the normalized sum of the four corner directions; its angle is measured to corner 00
		TileConeAxis = normalize(TileCorner00 + TileCorner10 + TileCorner01 + TileCorner11);
		TileConeAngleCos = dot(TileConeAxis, TileCorner00);
		TileConeAngleSin = sqrt(1 - TileConeAngleCos * TileConeAngleCos);
		float TileConeAngleTan = TileConeAngleSin / TileConeAngleCos;

		float ConeExpandDistance = 0;
		float VertexPullbackLength = ConeExpandDistance / TileConeAngleTan;
		float DistanceToNearPlane = length(TileConeAxis / TileConeAxis.z * View.NearPlane);
		// 1 / cos(AngleBetweenTileCenterAndViewForward)
		float InvCosTileAngle = 1.0f / TileConeAxis.z;
		float ConeAxisDistanceMultiply = InvCosTileAngle;
		float ConeAxisDistanceAdd = VertexPullbackLength + DistanceToNearPlane;

		// xy is the near depth range [MinTileZ, MaxTileZ2], zw is the far depth range [MinTileZ2, MaxTileZ]
		ConeAxisDepthRanges.x = ConeAxisDistanceMultiply * (MinTileZ - ConeExpandDistance) + ConeAxisDistanceAdd;
		ConeAxisDepthRanges.y = ConeAxisDistanceMultiply * (MaxTileZ2 + ConeExpandDistance) + ConeAxisDistanceAdd;
		ConeAxisDepthRanges.z = ConeAxisDistanceMultiply * (MinTileZ2 - ConeExpandDistance) + ConeAxisDistanceAdd;
		ConeAxisDepthRanges.w = ConeAxisDistanceMultiply * (MaxTileZ + ConeExpandDistance) + ConeAxisDistanceAdd;

		// Pull back cone vertex to contain potential samples
		//@todo - only expand in sky direction
		TileConeVertex = float3(0, 0, 0) - TileConeAxis * VertexPullbackLength;
	}

	// A value of 1 is conservative, but has huge impact on performance
	float RadiusScale = .5f;

	float SmallGroupMaxSampleRadius;
	{
		uint StartIndex;
		uint EndIndex;
		GetPhaseParameters(0, StartIndex, EndIndex);
		SmallGroupMaxSampleRadius = GetStepOffset(EndIndex) * 2 * RadiusScale;
	}

	float MediumGroupMaxSampleRadius;
	{
		uint StartIndex;
		uint EndIndex;
		GetPhaseParameters(1, StartIndex, EndIndex);
		MediumGroupMaxSampleRadius = GetStepOffset(EndIndex) * 2 * RadiusScale;
	}

	float LargeGroupMaxSampleRadius;
	{
		uint StartIndex;
		uint EndIndex;
		GetPhaseParameters(2, StartIndex, EndIndex);
		LargeGroupMaxSampleRadius = GetStepOffset(EndIndex) * 2 * RadiusScale;
	}

	uint NumCulledObjects = GetCulledNumObjects();

	// Compute per-tile lists of affecting objects through bounds culling
	// Each thread now operates on a sample instead of a pixel
	LOOP
	for (uint ObjectIndex = ThreadIndex; ObjectIndex < NumCulledObjects; ObjectIndex += THREADGROUP_TOTALSIZE)
	{
		float4 SphereCenterAndRadius = LoadObjectPositionAndRadius(ObjectIndex);
		float3 ViewSpaceSphereCenter = mul(float4(SphereCenterAndRadius.xyz + View.PreViewTranslation.xyz, 1), View.TranslatedWorldToView).xyz;

		// An object lands in the first (smallest radius) list whose grown sphere touches the tile
		if (SphereIntersectConeWithDepthRanges(float4(ViewSpaceSphereCenter, SphereCenterAndRadius.w + SmallGroupMaxSampleRadius), TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin, ConeAxisDepthRanges))
		{
			uint ListIndex;
			InterlockedAdd(SmallTileNumObjects, 1U, ListIndex);

			// The counter may run past the array size (it is clamped after the loop), so never write
			// out of bounds: that would leave the contents of groupshared memory undefined
			if (ListIndex < MAX_OBJECTS_PER_TILE)
			{
				SmallTileObjectIndices[ListIndex] = ObjectIndex;
			}
		}
		else if (SphereIntersectConeWithDepthRanges(float4(ViewSpaceSphereCenter, SphereCenterAndRadius.w + MediumGroupMaxSampleRadius), TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin, ConeAxisDepthRanges))
		{
			uint ListIndex;
			InterlockedAdd(MediumTileNumObjects, 1U, ListIndex);

			if (ListIndex < MAX_OBJECTS_PER_TILE)
			{
				MediumTileObjectIndices[ListIndex] = ObjectIndex;
			}
		}
		else if (SphereIntersectConeWithDepthRanges(float4(ViewSpaceSphereCenter, SphereCenterAndRadius.w + LargeGroupMaxSampleRadius), TileConeVertex, TileConeAxis, TileConeAngleCos, TileConeAngleSin, ConeAxisDepthRanges))
		{
			uint ListIndex;
			InterlockedAdd(LargeTileNumObjects, 1U, ListIndex);

			if (ListIndex < MAX_OBJECTS_PER_TILE)
			{
				LargeTileObjectIndices[ListIndex] = ObjectIndex;
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	if (ThreadIndex == 0)
	{
		// Clamp the counts to what was actually stored in the groupshared lists
		SmallTileNumObjects = min(SmallTileNumObjects, MAX_OBJECTS_PER_TILE);
		MediumTileNumObjects = min(MediumTileNumObjects, MAX_OBJECTS_PER_TILE);
		LargeTileNumObjects = min(LargeTileNumObjects, MAX_OBJECTS_PER_TILE);

		// Reserve a contiguous range of RWTileArrayData for all three lists of this tile
		uint ArrayStart;
		uint NumObjectsIntersecting = SmallTileNumObjects + MediumTileNumObjects + LargeTileNumObjects;
		InterlockedAdd(RWTileArrayNextAllocation[0], NumObjectsIntersecting, ArrayStart);
		TileArrayDataStart = ArrayStart;
		RWTileHeadData[GroupId.y * (uint)NumGroups.x + GroupId.x] = uint4(TileArrayDataStart, SmallTileNumObjects, MediumTileNumObjects, LargeTileNumObjects);
	}

	GroupMemoryBarrierWithGroupSync();

	uint ArrayDataStart = TileArrayDataStart;

	LOOP
	for (uint SmallListIndex = ThreadIndex; SmallListIndex < SmallTileNumObjects; SmallListIndex += THREADGROUP_TOTALSIZE)
	{
		RWTileArrayData[ArrayDataStart + SmallListIndex] = SmallTileObjectIndices[SmallListIndex];
	}

	ArrayDataStart += SmallTileNumObjects;

	LOOP
	for (uint MediumListIndex = ThreadIndex; MediumListIndex < MediumTileNumObjects; MediumListIndex += THREADGROUP_TOTALSIZE)
	{
		RWTileArrayData[ArrayDataStart + MediumListIndex] = MediumTileObjectIndices[MediumListIndex];
	}

	ArrayDataStart += MediumTileNumObjects;

	LOOP
	for (uint LargeListIndex = ThreadIndex; LargeListIndex < LargeTileNumObjects; LargeListIndex += THREADGROUP_TOTALSIZE)
	{
		RWTileArrayData[ArrayDataStart + LargeListIndex] = LargeTileObjectIndices[LargeListIndex];
	}
}

// NOTE(review): the original comment here read "View rect min in xy, max in zw", which looks copy-pasted
// from ViewDimensions (ThreadToCulledTile is only a float2) - confirm the intended meaning
float2 ThreadToCulledTile;
uint NumCircleSections;

RWBuffer RWIrradianceCachePositionRadius;
RWBuffer RWIrradianceCacheNormal;
RWBuffer RWScatterDrawParameters;
RWBuffer RWIrradianceCacheTileCoordinate;

Texture2D IrradianceCacheSplatTexture;
SamplerState IrradianceCacheSplatSampler;

groupshared float4 CachedIrradianceCachePositionRadius[THREADGROUP_TOTALSIZE];
groupshared float4 CachedIrradianceCacheNormal[THREADGROUP_TOTALSIZE];
groupshared uint2 CachedIrradianceCacheTileCoordinate[THREADGROUP_TOTALSIZE];
groupshared uint NumQueuedIrradianceCacheRecords;
groupshared uint BaseRecordIndex;

/** Creates new surface cache records for sample points that don't have valid coverage from existing surface cache records.
*/
[numthreads(THREADGROUP_SIZEX, THREADGROUP_SIZEY, 1)]
void PopulateCacheCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	// Flattened index of this thread within its group
	const uint ThreadIndex = GroupThreadId.y * THREADGROUP_SIZEX + GroupThreadId.x;

	// Reset the per-group record queue
	if (ThreadIndex == 0)
	{
		NumQueuedIrradianceCacheRecords = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	const float2 ScreenUV = (DispatchThreadId.xy * CurrentLevelDownsampleFactor + View.ViewRectMin.xy + float2(.5f, .5f)) * View.ViewSizeAndSceneTexelSize.zw;
	const float2 ScreenPosition = (ScreenUV.xy - View.ScreenPositionScaleBias.wz) / View.ScreenPositionScaleBias.xy;
	const float2 DownsampledScreenUV = (DispatchThreadId.xy + float2(.5f, .5f)) / AOBufferSize;

	const FGBufferData GBufferData = GetGBufferData(ScreenUV);
	const float4 ExistingSplat = Texture2DSampleLevel(IrradianceCacheSplatTexture, IrradianceCacheSplatSampler, DownsampledScreenUV, 0);

	// A record is only created for a shaded pixel inside the AO buffer whose splat weight is (nearly) zero
	const bool bShadedPixel = GBufferData.ShadingModelID > 0;
	const bool bNoExistingCoverage = ExistingSplat.w < .0001f;
	const bool bInsideAOBuffer = all((float2)DispatchThreadId.xy < AOBufferSize);

	BRANCH
	if (bShadedPixel && bNoExistingCoverage && bInsideAOBuffer)
	{
		const float SceneDepth = CalcSceneDepth(ScreenUV);

		BRANCH
		if (SceneDepth < AOMaxViewDistance)
		{
			const uint2 TileCoordinate = DispatchThreadId.xy * DownsampleFactorToBaseLevel / uint2(THREADGROUP_SIZEX, THREADGROUP_SIZEY);

			const float4 HomogeneousWorldPosition = mul(float4(ScreenPosition * SceneDepth, SceneDepth, 1), View.ScreenToWorld);
			const float3 OpaqueWorldPosition = HomogeneousWorldPosition.xyz / HomogeneousWorldPosition.w;

			const float2 BaseLevelScreenUV = (DispatchThreadId.xy * DownsampleFactorToBaseLevel + float2(.5f, .5f)) * BaseLevelTexelSize;

			float3 WorldNormal;
			float UnusedDepth;
			bool bHasDistanceFieldRepresentation;
			bool bHasHeightfieldRepresentation;
			GetDownsampledGBuffer(BaseLevelScreenUV, WorldNormal, UnusedDepth, bHasDistanceFieldRepresentation, bHasHeightfieldRepresentation);

			//@todo - offset shading position along normal to avoid incorrect self-occlusion?
			const float3 WorldShadingPosition = OpaqueWorldPosition;

			// For debugging
			//if (all(DispatchThreadId.xy == uint2(4,3)))
			{
				// Allocate a new record and store off attributes of the created record
				uint QueueSlot;
				InterlockedAdd(NumQueuedIrradianceCacheRecords, 1u, QueueSlot);

				// W stores max allowed radius, used to limit overdraw from nearby samples placed in the high resolution passes
				const float MaxRadiusScale = bHasDistanceFieldRepresentation ? .005f : .0005f;
				// Sign of W stores whether the sample is fading in or out
				const float RadiusSign = 1;
				const float SignedMaxRadius = RadiusSign * SceneDepth * CurrentLevelDownsampleFactor * MaxRadiusScale;

				CachedIrradianceCachePositionRadius[QueueSlot] = float4(WorldShadingPosition, SignedMaxRadius);

				// abs(w) - 1 stores fade in amount (shifted away from 0 to retain sign when fade amount is 0)
				// sign(w) stores bHasHeightfieldRepresentation
				CachedIrradianceCacheNormal[QueueSlot] = float4(WorldNormal, bHasHeightfieldRepresentation ? 1 : -1);
				CachedIrradianceCacheTileCoordinate[QueueSlot] = TileCoordinate;
			}
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// One global atomic per group reserves the output range for all records queued by this group
	if (ThreadIndex == 0)
	{
		InterlockedAdd(RWScatterDrawParameters[1], NumQueuedIrradianceCacheRecords, BaseRecordIndex);
	}

	GroupMemoryBarrierWithGroupSync();

	// Flush the queued records from groupshared memory to the output buffers
	LOOP
	for (uint QueuedIndex = ThreadIndex; QueuedIndex < NumQueuedIrradianceCacheRecords; QueuedIndex += THREADGROUP_TOTALSIZE)
	{
		int OutputRecordIndex = BaseRecordIndex + QueuedIndex;

		RWIrradianceCachePositionRadius[OutputRecordIndex] = CachedIrradianceCachePositionRadius[QueuedIndex];
		RWIrradianceCacheNormal[OutputRecordIndex] = CachedIrradianceCacheNormal[QueuedIndex];
		RWIrradianceCacheTileCoordinate[OutputRecordIndex] = CachedIrradianceCacheTileCoordinate[QueuedIndex];
	}

	// A single thread of the whole dispatch fills in the remaining draw arguments
	if (all(DispatchThreadId == 0))
	{
		// VertexCountPerInstance
		RWScatterDrawParameters[0] = NumCircleSections * 3;
		// StartVertexLocation
		RWScatterDrawParameters[2] = 0;
		// StartInstanceLocation
		RWScatterDrawParameters[3] = 0;
	}
}

float
TanConeHalfAngle;
// ^ ConeTraceOcclusionCS multiplies this by the world step offset to get the sample sphere radius of a cone step
float RecordRadiusScale;

// Output buffers of the cone trace pass
// NOTE(review): no element types are visible on these RWBuffer declarations - confirm against the upstream file
RWBuffer RWOccluderRadius;
RWBuffer RWRecordConeVisibility;
RWBuffer RWRecordConeData;
RWBuffer RWDebugBuffer;

// Have to disable surface caching dependencies for this to work
// When non-zero, ConeTraceOcclusionCS replaces every rotated cone direction with one fixed debug direction
#define VISUALIZE_ONE_CONE 0

// Enforce one thread per cone direction
// SIMULTANEOUSLY_TRACED_OBJECTS: how many objects one thread group traces at once
// THREADS_PER_OBJECT: threads assigned to each of those objects (ThreadIndex % THREADS_PER_OBJECT selects the cone)
#define SIMULTANEOUSLY_TRACED_OBJECTS (FINAL_GATHER_THREADGROUP_SIZE / NUM_CONE_DIRECTIONS)
#define THREADS_PER_OBJECT (FINAL_GATHER_THREADGROUP_SIZE / SIMULTANEOUSLY_TRACED_OBJECTS)

// Per-cone minimum visibility, stored as the uint bit pattern of a float (asuint) so InterlockedMin can be used on it
groupshared uint SharedConeVisibility[NUM_CONE_DIRECTIONS];
// Per-cone, per-visibility-step minimum, same float-bits-as-uint encoding; only touched when SUPPORT_IRRADIANCE is set
groupshared uint SharedConeRawVisibility[NUM_CONE_DIRECTIONS][NUM_VISIBILITY_STEPS];
// Minimum world distance to an occluder, same encoding; initialized to asuint(AOMaxDistance)
groupshared uint SharedMinOcclusionDistance;
groupshared float3 SharedGatheredLighting[FINAL_GATHER_THREADGROUP_SIZE];

// Capacity of the per-record culled object list below
#define MAX_RECORD_CULLED_OBJECTS 128
// Only filled when CULL_OBJECTS_TO_RECORD is enabled; the writes in ConeTraceOcclusionCS are not
// bounds-checked against MAX_RECORD_CULLED_OBJECTS
groupshared uint SharedCulledObjectList[MAX_RECORD_CULLED_OBJECTS];
groupshared uint NumRecordCulledObjects;

// Inconsistent performance savings + overflow of shared memory array
#define CULL_OBJECTS_TO_RECORD 0

/** Computes ambient occlusion for a surface cache record by cone stepping through the nearby object distance fields. 
*/ [numthreads(FINAL_GATHER_THREADGROUP_SIZE, 1, 1)] void ConeTraceOcclusionCS( uint3 GroupId : SV_GroupID, uint3 DispatchThreadId : SV_DispatchThreadID, uint3 GroupThreadId : SV_GroupThreadID) { uint ThreadIndex = GroupThreadId.x; uint ObjectOffsetIndex = ThreadIndex / THREADS_PER_OBJECT; if (ThreadIndex == 0) { for (uint ConeIndex = 0; ConeIndex < NUM_CONE_DIRECTIONS; ConeIndex++) { SharedConeVisibility[ConeIndex] = asuint(1.0f); #if SUPPORT_IRRADIANCE UNROLL for (uint i = 0; i < NUM_VISIBILITY_STEPS; i++) { SharedConeRawVisibility[ConeIndex][i] = asuint(1.0f); } #endif } SharedMinOcclusionDistance = asuint(AOMaxDistance); NumRecordCulledObjects = 0; } GroupMemoryBarrierWithGroupSync(); uint StartIndex = SavedStartIndex[0]; uint NumRecords = ScatterDrawParameters[1]; uint RecordIndex = StartIndex + GroupId.x; float3 TangentX; float3 TangentY; float3 WorldNormal; { WorldNormal = IrradianceCacheNormal[RecordIndex].xyz; float3 WorldShadingPosition = IrradianceCachePositionRadius[RecordIndex].xyz; uint2 TileCoordinate = IrradianceCacheTileCoordinate[RecordIndex]; uint4 TileHead = GetTileHead(TileCoordinate); uint NumObjectsAffectingTile = TileHead.y + TileHead.z + TileHead.w; uint NumCulledObjects = NumObjectsAffectingTile; #if CULL_OBJECTS_TO_RECORD LOOP for (uint ListObjectIndex = ThreadIndex; ListObjectIndex < NumObjectsAffectingTile; ListObjectIndex += FINAL_GATHER_THREADGROUP_SIZE) { if (ListObjectIndex < NumObjectsAffectingTile) { uint ListIndex = 0; uint ArrayIndex = ListObjectIndex; FLATTEN if (ListObjectIndex >= TileHead.y + TileHead.z) { ListIndex = 2; ArrayIndex = ListObjectIndex - TileHead.y - TileHead.z; } else if (ListObjectIndex >= TileHead.y) { ListIndex = 1; ArrayIndex = ListObjectIndex - TileHead.y; } uint ObjectIndex = TileArrayData.Load((ArrayIndex * TileListGroupSize.x * TileListGroupSize.y + TileHead.x) * NUM_CULLED_OBJECT_LISTS + ListIndex); float4 ObjectPositionAndRadius = LoadObjectPositionAndRadius(ObjectIndex); float ObjectDistanceSq = 
dot(ObjectPositionAndRadius.xyz - WorldShadingPosition, ObjectPositionAndRadius.xyz - WorldShadingPosition); BRANCH // Skip tracing objects with a small projected angle if (ObjectPositionAndRadius.w * ObjectPositionAndRadius.w / ObjectDistanceSq > Square(.25f) // Skip tracing objects outside the max occlusion distance //@todo - box distance && ObjectDistanceSq < Square(ObjectPositionAndRadius.w + AOMaxDistance)) { uint DestIndex; InterlockedAdd(NumRecordCulledObjects, 1U, DestIndex); SharedCulledObjectList[DestIndex] = ObjectIndex; } } } GroupMemoryBarrierWithGroupSync(); NumCulledObjects = NumRecordCulledObjects; #endif FindBestAxisVectors2(WorldNormal, TangentX, TangentY); float3 DebugConeDirection = normalize(float3(.3f, .5f, .4f)); uint ConeIndex = ThreadIndex % THREADS_PER_OBJECT; float3 ConeDirection = AOSamples2.SampleDirections[ConeIndex].xyz; float3 RotatedConeDirection = ConeDirection.x * TangentX + ConeDirection.y * TangentY + ConeDirection.z * WorldNormal; #if VISUALIZE_ONE_CONE RotatedConeDirection = DebugConeDirection; #endif float MinVisibility = 1; float MinRawVisibility[NUM_VISIBILITY_STEPS]; float MinWorldDistanceToOccluder = AOMaxDistance; float ConeDistanceAtClosestToOccluder = 0; float MaxWorldStepOffset = GetStepOffset(NUM_CONE_STEPS); #if SUPPORT_IRRADIANCE UNROLL for (uint i = 0; i < NUM_VISIBILITY_STEPS; i++) { MinRawVisibility[i] = 1; } #endif LOOP for (uint ListObjectIndex = 0; ListObjectIndex < NumCulledObjects; ListObjectIndex += SIMULTANEOUSLY_TRACED_OBJECTS) { uint EffectiveListObjectIndex = ListObjectIndex + ObjectOffsetIndex; if (EffectiveListObjectIndex < NumCulledObjects // Ignore extra threads && ObjectOffsetIndex < SIMULTANEOUSLY_TRACED_OBJECTS) { #if CULL_OBJECTS_TO_RECORD uint ObjectIndex = SharedCulledObjectList[EffectiveListObjectIndex]; { #else uint ListIndex = 0; uint ArrayIndex = EffectiveListObjectIndex; FLATTEN if (EffectiveListObjectIndex >= TileHead.y + TileHead.z) { ListIndex = 2; ArrayIndex = 
EffectiveListObjectIndex - TileHead.y - TileHead.z; } else if (EffectiveListObjectIndex >= TileHead.y) { ListIndex = 1; ArrayIndex = EffectiveListObjectIndex - TileHead.y; } uint ObjectIndex = TileArrayData.Load((ArrayIndex * TileListGroupSize.x * TileListGroupSize.y + TileHead.x) * NUM_CULLED_OBJECT_LISTS + ListIndex); float4 ObjectPositionAndRadius = LoadObjectPositionAndRadius(ObjectIndex); float ObjectDistanceSq = dot(ObjectPositionAndRadius.xyz - WorldShadingPosition, ObjectPositionAndRadius.xyz - WorldShadingPosition); BRANCH // Skip tracing objects with a small projected angle if (ObjectPositionAndRadius.w * ObjectPositionAndRadius.w / ObjectDistanceSq > Square(.25f)) { #endif float3 LocalPositionExtent = LoadObjectLocalPositionExtent(ObjectIndex); float4x4 WorldToVolume = LoadObjectWorldToVolume(ObjectIndex); bool bGeneratedAsTwoSided; float4 UVScaleAndVolumeScale = LoadObjectUVScale(ObjectIndex, bGeneratedAsTwoSided); float3 VolumeShadingPosition = mul(float4(WorldShadingPosition, 1), WorldToVolume).xyz; float ObjectOccluderRadius = length(LocalPositionExtent) * .5f * UVScaleAndVolumeScale.w; float BoxDistance = ComputeDistanceFromBoxToPoint(-LocalPositionExtent, LocalPositionExtent, VolumeShadingPosition) * UVScaleAndVolumeScale.w; BRANCH if (BoxDistance < AOMaxDistance) { float3 UVAdd = LoadObjectUVAdd(ObjectIndex); uint StartStepIndex = 0; #if !CULL_OBJECTS_TO_RECORD FLATTEN if (EffectiveListObjectIndex >= TileHead.y + TileHead.z) { StartStepIndex = 8; } else if (EffectiveListObjectIndex >= TileHead.y) { StartStepIndex = 5; } #endif float WorldStepOffset = GetStepOffset(StartStepIndex); LOOP for (uint StepIndex = StartStepIndex; StepIndex < NUM_CONE_STEPS && WorldStepOffset < MaxWorldStepOffset; StepIndex++) { float3 WorldSamplePosition = WorldShadingPosition + RotatedConeDirection * WorldStepOffset; float3 StepSamplePosition = mul(float4(WorldSamplePosition, 1), WorldToVolume).xyz; float3 ClampedSamplePosition = clamp(StepSamplePosition, 
-LocalPositionExtent, LocalPositionExtent); float DistanceToClamped = length(StepSamplePosition - ClampedSamplePosition); float3 StepVolumeUV = DistanceFieldVolumePositionToUV(ClampedSamplePosition, UVScaleAndVolumeScale.xyz, UVAdd); float DistanceToOccluder = (Texture3DSampleLevel(DistanceFieldTexture, DistanceFieldSampler, StepVolumeUV, 0).x + DistanceToClamped) * UVScaleAndVolumeScale.w; float SphereRadius = WorldStepOffset * TanConeHalfAngle; //@todo - have to bias away from surface further for this to work float ShadingSphereRadius = SphereRadius * 1.0f; // Derive visibility from 1d intersection float Visibility = saturate(DistanceToOccluder / ShadingSphereRadius); // Don't allow small objects to fully occlude a cone step Visibility = max(Visibility, 1 - saturate(ObjectOccluderRadius / SphereRadius)); float OccluderDistanceFraction = (WorldStepOffset + DistanceToOccluder) / AOMaxDistance; #if SUPPORT_IRRADIANCE uint VisibilityIndex = NUM_VISIBILITY_STEPS * WorldStepOffset / AOMaxDistance; // Less GI occlusion for two sided meshes, which can't separate self-occlusion //@todo - expose float TwoSidedVisibilityScale = bGeneratedAsTwoSided ? 
100 : 1; // Track raw visibility before the distance fade for GI shadowing MinRawVisibility[VisibilityIndex] = min(MinRawVisibility[VisibilityIndex], TwoSidedVisibilityScale * Visibility); #endif // Fade out occlusion based on distance to occluder to avoid a discontinuity at the max AO distance Visibility = max(Visibility, saturate(OccluderDistanceFraction * OccluderDistanceFraction * .8f)); MinVisibility = min(MinVisibility, Visibility); if (DistanceToOccluder < .9f * SphereRadius) { // Assuming occluder is straight forward along the cone float WorldDistanceToOccluder = WorldStepOffset + DistanceToOccluder; MinWorldDistanceToOccluder = min(MinWorldDistanceToOccluder, WorldDistanceToOccluder); } float MinStepSize = .6f * (GetStepOffset(StepIndex + 1) - GetStepOffset(StepIndex)); WorldStepOffset += max(DistanceToOccluder, MinStepSize); } } } } } InterlockedMin(SharedConeVisibility[ConeIndex], asuint(MinVisibility)); InterlockedMin(SharedMinOcclusionDistance, asuint(max(MinWorldDistanceToOccluder, 0))); #if SUPPORT_IRRADIANCE UNROLL for (uint i = 0; i < NUM_VISIBILITY_STEPS; i++) { InterlockedMin(SharedConeRawVisibility[ConeIndex][i], asuint(MinRawVisibility[i])); } GroupMemoryBarrierWithGroupSync(); // Only need one thread per cone direction to write if (ThreadIndex < NUM_CONE_DIRECTIONS) { uint RecordConeDataIndex = (GroupId.x * NUM_CONE_DIRECTIONS + ConeIndex) * RECORD_CONE_DATA_STRIDE; float MinStepVisibility = 1; UNROLL for (uint i = 0; i < NUM_VISIBILITY_STEPS; i++) { float StepVisibility = asfloat(SharedConeRawVisibility[ConeIndex][i]); // Propagate min visibility down the cone MinStepVisibility = min(MinStepVisibility, StepVisibility); RWRecordConeData[RecordConeDataIndex + i] = MinStepVisibility; } } #endif } GroupMemoryBarrierWithGroupSync(); if (ThreadIndex == 0) { for (uint ConeIndex = 0; ConeIndex < NUM_CONE_DIRECTIONS; ConeIndex++) { float ConeVisibility = asfloat(SharedConeVisibility[ConeIndex]); RWRecordConeVisibility[GroupId.x * NUM_CONE_DIRECTIONS + 
ConeIndex] = ConeVisibility;
		}

		float RecordRadius = RecordRadiusScale * asfloat(SharedMinOcclusionDistance);
		RWOccluderRadius[RecordIndex] = RecordRadius;
	}
}

// NOTE(review): the Buffer / RWBuffer declarations in this section carry no element type
// template arguments in this copy of the file - confirm against the C++ side bindings.
RWBuffer RWIrradianceCacheBentNormal;

/**
 * One thread per record: for every record index in [SavedStartIndex[0], ScatterDrawParameters[1])
 * stores the direction returned by ComputeBentNormal into RWIrradianceCacheBentNormal (w is written as 0).
 */
[numthreads(FINAL_GATHER_THREADGROUP_SIZE, 1, 1)]
void CombineConesCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint StartIndex = SavedStartIndex[0];
	uint NumRecords = ScatterDrawParameters[1];
	uint RecordIndex = StartIndex + DispatchThreadId.x;

	// Threads past the end of the record range have nothing to do
	if (RecordIndex < NumRecords)
	{
		float3 RecordWorldNormal = IrradianceCacheNormal[RecordIndex].xyz;
		float3 UnoccludedDirection = ComputeBentNormal(RecordWorldNormal, DispatchThreadId.x);

		RWIrradianceCacheBentNormal[RecordIndex] = float4(UnoccludedDirection, 0);
	}
}

#define COMPACT_THREADGROUP_SIZEX 64

Buffer DrawParameters;
RWBuffer RWDispatchParameters;
RWBuffer RWSavedStartIndex;

/** Single thread: copies the current value of DrawParameters[1] into RWSavedStartIndex[0]. */
[numthreads(1, 1, 1)]
void SaveStartIndexCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	RWSavedStartIndex[0] = DrawParameters[1];
}

/**
 * Single thread: writes the thread group counts for the final gather passes into RWDispatchParameters.
 * The work range is the number of records between SavedStartIndex[0] and DrawParameters[1].
 * X is either one group per record (ONE_GROUP_PER_RECORD) or one thread per record, rounded up
 * to whole groups of FINAL_GATHER_THREADGROUP_SIZE. Y and Z are always 1.
 */
[numthreads(1, 1, 1)]
void SetupFinalGatherIndirectArgumentsCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint StartIndex = SavedStartIndex[0];
	uint NumRecords = DrawParameters[1];
	uint WorkRange = NumRecords - StartIndex;

#if ONE_GROUP_PER_RECORD
	// One thread group per record
	RWDispatchParameters[0] = WorkRange;
#else
	// One thread per record, divide and round up
	RWDispatchParameters[0] = (WorkRange + FINAL_GATHER_THREADGROUP_SIZE - 1) / FINAL_GATHER_THREADGROUP_SIZE;
#endif

	RWDispatchParameters[1] = 1;
	RWDispatchParameters[2] = 1;
}

#define COPY_THREADGROUP_SIZE 256

float TrimFraction;

/**
 * Single thread: writes the thread group counts for CopyIrradianceCacheSamplesCS into RWDispatchParameters.
 * With FADE_RECORDS_OVER_TIME one thread is spawned per existing record and RWScatterDrawParameters[1]
 * is cleared so the copy pass can accumulate into it.
 * Otherwise one thread is spawned per record that survives trimming, with a minimum of one group.
 */
[numthreads(1, 1, 1)]
void SetupCopyIndirectArgumentsCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
#if FADE_RECORDS_OVER_TIME
	// Spawn a thread per record
	RWDispatchParameters[0] = (DrawParameters[1] + COPY_THREADGROUP_SIZE - 1) / COPY_THREADGROUP_SIZE;
#else
	// Derive the surviving record count with exactly the same expressions CopyIrradianceCacheSamplesCS uses
	// for its StartIndex. Using (1 - TrimFraction) * Count here instead can round to one record fewer than
	// the copy pass expects, and when that smaller count is an exact multiple of COPY_THREADGROUP_SIZE
	// the last record would never be copied even though it is counted in RWScatterDrawParameters[1].
	uint TotalRecords = DrawParameters[1];
	uint StartIndex = max(TotalRecords * TrimFraction, 1);
	StartIndex = min(StartIndex, TotalRecords);
	uint NumRecords = TotalRecords - StartIndex;

	// Make sure we spawn at least one group so RWScatterDrawParameters gets written to in the next pass
	RWDispatchParameters[0] = max((NumRecords + COPY_THREADGROUP_SIZE - 1) / COPY_THREADGROUP_SIZE, 1);
#endif

	RWDispatchParameters[1] = 1;
	RWDispatchParameters[2] = 1;

#if FADE_RECORDS_OVER_TIME
	if (DispatchThreadId.x == 0)
	{
		// Clear to 0 to prepare for accumulation
		RWScatterDrawParameters[1] = 0;
	}
#endif
}

RWBuffer RWCopyIrradianceCachePositionRadius;
RWBuffer RWCopyIrradianceCacheNormal;
RWBuffer RWCopyOccluderRadius;
RWBuffer RWCopyIrradianceCacheBentNormal;
RWBuffer RWCopyIrradianceCacheIrradiance;
RWBuffer RWCopyIrradianceCacheTileCoordinate;
Buffer OccluderRadius;

/**
 * Copies irradiance cache records into the RWCopy* buffers and writes the resulting record count
 * to RWScatterDrawParameters[1].
 *
 * FADE_RECORDS_OVER_TIME: every record has its fade value (stored in the w component of the normal)
 * advanced; records whose new fade is still below 1 are appended to the destination through an atomic
 * counter, the rest are dropped.
 *
 * Otherwise: the first StartIndex records are trimmed and the remainder is copied down to index 0.
 */
[numthreads(COPY_THREADGROUP_SIZE, 1, 1)]
void CopyIrradianceCacheSamplesCS(
	uint3 GroupId : SV_GroupID,
	uint3 DispatchThreadId : SV_DispatchThreadID,
	uint3 GroupThreadId : SV_GroupThreadID)
{
	uint NumRecords = DrawParameters[1];

#if FADE_RECORDS_OVER_TIME

	uint RecordIndex = DispatchThreadId.x;

	if (RecordIndex < NumRecords)
	{
		float4 PositionAndPackedRadius = IrradianceCachePositionRadius[RecordIndex];
		float4 NormalAndFade = IrradianceCacheNormal[RecordIndex];

		// Update fade
		// The stored magnitude is (fade + 1), the sign of w is carried through unchanged below
		float NewFade = abs(NormalAndFade.w) - 1 + clamp(View.GeneralPurposeTweak, .001f, 1);

		// Only write out if still alive
		if (NewFade < 1 /*RecordIndex > (uint)(NumRecords * TrimFraction)*/)
		{
			/*
			// Always trim at least one to handle dynamic scene changes not accounted for with irradiance cache misses
			if (RecordIndex <= (uint)(NumRecords * TrimFraction) && PositionAndPackedRadius.w > 0)
			{
				// Mark as fading out
				PositionAndPackedRadius.w *= -1;
				// Reset fade to 0
				NewFade = 0;
			}*/

			NormalAndFade.w = (NewFade + 1) * sign(NormalAndFade.w);

			// Allocate a destination slot; the counter was cleared by SetupCopyIndirectArgumentsCS
			uint DestIndex;
			InterlockedAdd(RWScatterDrawParameters[1], 1U, DestIndex);

			RWCopyIrradianceCachePositionRadius[DestIndex] = PositionAndPackedRadius;
			RWCopyIrradianceCacheNormal[DestIndex] = NormalAndFade;
			RWCopyOccluderRadius[DestIndex] = OccluderRadius[RecordIndex];
			RWCopyIrradianceCacheBentNormal[DestIndex] = IrradianceCacheBentNormal[RecordIndex];
#if SUPPORT_IRRADIANCE
			RWCopyIrradianceCacheIrradiance[DestIndex] = IrradianceCacheIrradiance[RecordIndex];
#endif
			RWCopyIrradianceCacheTileCoordinate[DestIndex] = IrradianceCacheTileCoordinate[RecordIndex];
		}
	}

#else

	// Always trim at least one to handle dynamic scene changes not accounted for with irradiance cache misses
	uint StartIndex = max(NumRecords * TrimFraction, 1);
	// Never start past the end, which also keeps the count written below from underflowing
	StartIndex = min(StartIndex, NumRecords);

	uint SourceIndex = StartIndex + DispatchThreadId.x;
	uint DestIndex = DispatchThreadId.x;

	if (SourceIndex < NumRecords)
	{
		RWCopyIrradianceCachePositionRadius[DestIndex] = IrradianceCachePositionRadius[SourceIndex];
		RWCopyIrradianceCacheNormal[DestIndex] = IrradianceCacheNormal[SourceIndex];
		RWCopyOccluderRadius[DestIndex] = OccluderRadius[SourceIndex];
		RWCopyIrradianceCacheBentNormal[DestIndex] = IrradianceCacheBentNormal[SourceIndex];
#if SUPPORT_IRRADIANCE
		RWCopyIrradianceCacheIrradiance[DestIndex] = IrradianceCacheIrradiance[SourceIndex];
#endif
		RWCopyIrradianceCacheTileCoordinate[DestIndex] = IrradianceCacheTileCoordinate[SourceIndex];
	}

	if (DispatchThreadId.x == 0)
	{
		// Number of records remaining after the trim
		RWScatterDrawParameters[1] = NumRecords - StartIndex;
	}

#endif
}