// Copyright Epic Games, Inc. All Rights Reserved.

/*=============================================================================
	VirtualShadowMapBuildPerPageDrawCommands.usf:
=============================================================================*/

#include "../Common.ush"
#include "VirtualShadowMapPageOverlap.ush"
#include "VirtualShadowMapProjectionCommon.ush"
#include "../Nanite/NaniteCullingCommon.ush"
#include "../Nanite/NaniteDataDecode.ush"
#include "../InstanceCulling/InstanceCullingCommon.ush"
#include "../InstanceCulling/InstanceCullingLoadBalancer.ush"
#include "../WaveOpUtil.ush"
#include "/Engine/Shared/VirtualShadowMapDefinitions.h"
#include "VirtualShadowMapStats.ush"
#include "VirtualShadowMapPageCacheCommon.ush"

RWStructuredBuffer<FVSMVisibleInstanceCmd> VisibleInstancesOut;
RWStructuredBuffer<uint> VisibleInstanceCountBufferOut;
uint TotalPrimaryViews;
uint VisibleInstancesBufferNum;

#if ENABLE_BATCH_MODE
StructuredBuffer<FVSMCullingBatchInfo> VSMCullingBatchInfos;
#else // !ENABLE_BATCH_MODE
uint FirstPrimaryView;
uint NumPrimaryViews;
uint DynamicInstanceIdOffset;
uint DynamicInstanceIdMax;
#endif // ENABLE_BATCH_MODE

StructuredBuffer<uint> DrawCommandDescs;

RWBuffer<uint> DrawIndirectArgsBufferOut;

StructuredBuffer<uint> PrimitiveRevealedMask;
uint PrimitiveRevealedNum;

RWStructuredBuffer<uint> OutDirtyPageFlags;

#if USE_HZB_OCCLUSION
// Mode 1 == previous frame HZB,
// Mode 2 == this frame HZB (Nanite only).
uint HZBMode;
#endif

// TODO: Move to common header
// Get the area of an "inclusive" rect (which means that the max is inside the rect), also guards against negative area (where min > max)
uint GetInclusiveRectArea(uint4 Rect)
{
	if (all(Rect.zw >= Rect.xy))
	{
		uint2 Size = Rect.zw - Rect.xy;
		return (Size.x + 1) * (Size.y + 1);
	}
	return 0;
}

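// Returns true if the persistent primitive was marked as "revealed" (transitioned from culled to visible on the CPU)
// in this frame's revealed-primitive bitmask. IDs past the end of the mask simply return false.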
bool WasPrimitiveRevealed(FVSMCullingBatchInfo VSMCullingBatchInfo, uint PersistentPrimitiveId)
{
	// Used to filter out transient primitives as well as to opt out of the test (if e.g., caching is off)
	if (PersistentPrimitiveId >= VSMCullingBatchInfo.PrimitiveRevealedNum)
	{
		return false;
	}

	uint PrimitiveIdWordOffset = VSMCullingBatchInfo.PrimitiveRevealedOffset + PersistentPrimitiveId / 32U;
	uint PrimitiveIdWordMask = 1U << (PersistentPrimitiveId % 32U);

	return (PrimitiveRevealedMask[PrimitiveIdWordOffset] & PrimitiveIdWordMask) != 0U;
}

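// Appends a visible-instance command (instance ID + culling flags, target mip view and indirect arg index) to VisibleInstancesOut.
// The count is bumped unconditionally; the write itself is skipped if the buffer would overflow.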
void WriteCmd(uint MipViewId, uint InstanceId, uint IndirectArgIndex, uint CullingFlags, bool bStaticPage)
{
	FPageInfo PageInfo;
	PageInfo.ViewId = MipViewId;
	PageInfo.bStaticPage = bStaticPage;

	FVSMVisibleInstanceCmd VisibleInstanceCmd;
	VisibleInstanceCmd.PackedPageInfo = PackPageInfo(PageInfo);
	VisibleInstanceCmd.InstanceIdAndFlags = (CullingFlags << INSTANCE_ID_NUM_BITS) | InstanceId;
	VisibleInstanceCmd.IndirectArgIndex = IndirectArgIndex;

	uint VisibleInstanceOutputOffset = 0U;
	WaveInterlockedAddScalar_(VisibleInstanceCountBufferOut[0], 1U, VisibleInstanceOutputOffset);
	if (VisibleInstanceOutputOffset < VisibleInstancesBufferNum)
	{
		// TODO: Flag & feedback on overflow.
		VisibleInstancesOut[VisibleInstanceOutputOffset] = VisibleInstanceCmd;
	}
}

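// Page rects whose area is at most MAX_SINGLE_THREAD_MARKING_AREA are marked immediately by the owning thread;
// larger rects are packed into a small group-shared job queue and processed cooperatively by the whole group
// at the end of CullPerPageDrawCommandsCs.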
#define MAX_SINGLE_THREAD_MARKING_AREA (8U)
#define MARKING_JOB_QUEUE_SIZE (NUM_THREADS_PER_GROUP * 2U)

struct FSharedMarkingJob
{
	uint4 RectPages;
	uint VirtualShadowMapId;
	uint MipLevel;
	bool bMaterialInvalidates;
	bool bShouldCacheAsStatic;
	bool bHasMoved;
	bool bIsViewUncached;
};

groupshared uint NumSharedMarkingJobs;
groupshared uint2 SharedMarkingJobs[MARKING_JOB_QUEUE_SIZE];

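// Pack/unpack a marking job into two uints for group-shared storage:
//   Packed.x = RectPages (min x/y, max x/y), 8 bits per component
//   Packed.y = [VirtualShadowMapId : bits 7..31][MipLevel : bits 4..6][bIsViewUncached : bit 3][bHasMoved : bit 2][bMaterialInvalidates : bit 1][bShouldCacheAsStatic : bit 0]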
uint2 PackJob(FSharedMarkingJob Job)
{
	uint2 Packed;
	Packed.x = (Job.RectPages.x << 24U) | (Job.RectPages.y << 16U) | (Job.RectPages.z << 8U) | Job.RectPages.w;
	Packed.y = (Job.VirtualShadowMapId << 7U)
		| (Job.MipLevel << 4U)
		| (Job.bIsViewUncached ? (1U << 3) : 0U)
		| (Job.bHasMoved ? (1U << 2) : 0U)
		| (Job.bMaterialInvalidates ? (1U << 1) : 0U)
		| (Job.bShouldCacheAsStatic ? 1U : 0U);
	return Packed;
}

FSharedMarkingJob UnpackJob(uint2 Packed)
{
	FSharedMarkingJob Job;
	Job.RectPages.x = Packed.x >> 24U;
	Job.RectPages.y = (Packed.x >> 16U) & 0xFF;
	Job.RectPages.z = (Packed.x >> 8U) & 0xFF;
	Job.RectPages.w = Packed.x & 0xFF;

	Job.VirtualShadowMapId = Packed.y >> 7U;
	Job.MipLevel = (Packed.y >> 4U) & 0x7;
	Job.bIsViewUncached = (Packed.y & (1U << 3)) != 0U;
	Job.bHasMoved = (Packed.y & (1U << 2)) != 0U;
	Job.bMaterialInvalidates = (Packed.y & (1U << 1)) != 0U;
	Job.bShouldCacheAsStatic = (Packed.y & 0x1) != 0U;

	return Job;
}

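// If the page at (vPage, MipLevel) has all the bits in PageFlagMask set and maps to a physical page that is
// valid for this LOD, mark that physical page dirty (and, when the material invalidates, also set its
// invalidation flag, stored after the dirty flags in OutDirtyPageFlags). Returns true in that case.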
bool MarkPageDirty(FVirtualSMLevelOffset PageTableLevelOffset, uint2 vPage, uint MipLevel, bool bMaterialInvalidates, bool bShouldCacheAsStatic, bool bIsViewUncached, uint PageFlagMask)
{
	uint PageFlagOffset = CalcPageOffset(PageTableLevelOffset, MipLevel, vPage);
	uint PageFlag = VirtualShadowMap.PageFlags[PageFlagOffset];
	// TODO: store page flags in the page table entry and only fetch that.
	if ((PageFlag & PageFlagMask) == PageFlagMask)
	{
		FShadowPhysicalPage PhysPage = ShadowGetPhysicalPage(PageFlagOffset);
		if (PhysPage.bThisLODValid)
		{
			// Mark the page dirty so we regenerate HZB, etc.
			uint PhysPageIndex = VSMPhysicalPageAddressToIndex(PhysPage.PhysicalAddress);
			if (bShouldCacheAsStatic || bIsViewUncached)
			{
				OutDirtyPageFlags[PhysPageIndex] = 1U;
			}
			if (bMaterialInvalidates)
			{
				uint Offset = VirtualShadowMap.MaxPhysicalPages * (bShouldCacheAsStatic ? 2U : 1U);
				// Store invalidation flags after the dirty flags.
				OutDirtyPageFlags[Offset + PhysPageIndex] = 1U;
			}
			return true;
		}
	}
	return false;
}

#if VSM_GENERATE_STATS
uint NumPageAreaDiagnosticSlots;
uint LargeInstancePageAreaThreshold;
#endif

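// Main per-instance culling pass: each work item is one (instance, draw command) pair produced by the instance
// culling load balancer. For every primary view and each of its mip/clipmap levels the instance is tested against
// the page table; instances overlapping a valid page emit a visible-instance command and mark the touched pages
// dirty (small rects inline, large rects via the group-shared job queue processed at the end of the kernel).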
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void CullPerPageDrawCommandsCs(uint3 GroupId : SV_GroupID, int GroupThreadIndex : SV_GroupIndex)
{
	uint DispatchGroupId = GetUnWrappedDispatchGroupId(GroupId);

	if (DispatchGroupId >= InstanceCullingLoadBalancer_GetNumBatches())
	{
		return;
	}

	if (GroupThreadIndex == 0)
	{
		NumSharedMarkingJobs = 0U;
	}
	GroupMemoryBarrierWithGroupSync();

#if ENABLE_BATCH_MODE
	// Load Instance culling context batch info, indirection per group
	FContextBatchInfo BatchInfo = BatchInfos[BatchInds[DispatchGroupId]];
	FVSMCullingBatchInfo VSMCullingBatchInfo = VSMCullingBatchInfos[BatchInds[DispatchGroupId]];
#else // !ENABLE_BATCH_MODE
	// Single Instance culling context batch in the call, set up batch from the kernel parameters
	FContextBatchInfo BatchInfo = (FContextBatchInfo)0;
	BatchInfo.DynamicInstanceIdOffset = DynamicInstanceIdOffset;
	BatchInfo.DynamicInstanceIdMax = DynamicInstanceIdMax;

	FVSMCullingBatchInfo VSMCullingBatchInfo;
	VSMCullingBatchInfo.FirstPrimaryView = FirstPrimaryView;
	VSMCullingBatchInfo.NumPrimaryViews = NumPrimaryViews;
	VSMCullingBatchInfo.PrimitiveRevealedOffset = 0U;
	VSMCullingBatchInfo.PrimitiveRevealedNum = PrimitiveRevealedNum;

#endif // ENABLE_BATCH_MODE

	uint CurrentBatchProcessingMode = 0U;
	FInstanceWorkSetup WorkSetup = InstanceCullingLoadBalancer_Setup(GroupId, GroupThreadIndex, GetItemDataOffset(BatchInfo, CurrentBatchProcessingMode));
	if (WorkSetup.bValid)
	{
		const FInstanceCullingPayload Payload = LoadInstanceCullingPayload(WorkSetup.Item.Payload, BatchInfo);

		uint InstanceDataOffset = WorkSetup.Item.InstanceDataOffset;

		if (Payload.bDynamicInstanceDataOffset)
		{
			InstanceDataOffset += BatchInfo.DynamicInstanceIdOffset;
			checkSlow(InstanceDataOffset + uint(WorkSetup.LocalItemIndex) < BatchInfo.DynamicInstanceIdMax);
		}

		uint InstanceId = InstanceDataOffset + uint(WorkSetup.LocalItemIndex);
		FDrawCommandDesc DrawCommandDesc = UnpackDrawCommandDesc(DrawCommandDescs[Payload.IndirectArgIndex]);

		// Load relevant instance data
		FInstanceSceneData InstanceData = GetInstanceSceneData(InstanceId, Scene.GPUScene.InstanceDataSOAStride);
		FPrimitiveSceneData PrimitiveData = GetPrimitiveData(InstanceData.PrimitiveId);

		uint ThreadTotalForAllViews = 0;
#if VSM_GENERATE_STATS
		uint TotalPageArea = 0U;
		float NumVisibleLevels = 0.0f;
#endif
		// Loop over views and output visible instances (i.e., those that overlap a valid page)
		for (uint PrimaryViewId = VSMCullingBatchInfo.FirstPrimaryView; PrimaryViewId < VSMCullingBatchInfo.FirstPrimaryView + VSMCullingBatchInfo.NumPrimaryViews; ++PrimaryViewId)
		{
			FNaniteView NaniteView = GetNaniteView(PrimaryViewId);

			uint CullingFlags = INSTANCE_CULLING_FLAGS_DEFAULT;
			bool bEnableWPO = DrawCommandDesc.bMaterialUsesWorldPositionOffset;

			FInstanceDynamicData DynamicData = CalculateInstanceDynamicData(NaniteView, InstanceData);

			FBoxCull Cull;
			Cull.Init(
				NaniteView,
				InstanceData.LocalBoundsCenter,
				InstanceData.LocalBoundsExtent,
				InstanceData.NonUniformScale.xyz,
				DynamicData.LocalToTranslatedWorld,
				DynamicData.PrevLocalToTranslatedWorld );

			Cull.bSkipWPODisableDistance |= DrawCommandDesc.bMaterialAlwaysEvaluatesWorldPositionOffset;

			bEnableWPO = Cull.Distance( PrimitiveData ) && bEnableWPO;

			// NOTE: Doing this for just the primary view (not mip views) is fine for current logic
			const bool bAllowWPO = VirtualShadowMapIsWPOAllowed(PrimitiveData, NaniteView.TargetLayerIndex);
			bEnableWPO = bEnableWPO && bAllowWPO;
			bool bCacheAsStatic = ShouldCacheInstanceAsStatic(InstanceId, (NaniteView.Flags & NANITE_VIEW_FLAG_UNCACHED), bAllowWPO);

			if (!bEnableWPO)
			{
				// Disable the Evaluate WPO culling flag if WPO was disabled
				CullingFlags &= ~INSTANCE_CULLING_FLAG_EVALUATE_WPO;
			}

			Cull.ScreenSize(DrawCommandDesc.MinScreenSize, DrawCommandDesc.MaxScreenSize);

			Cull.GlobalClipPlane();

			const bool bMaterialInvalidates = ShouldMaterialInvalidateShadowCache(PrimitiveData, bEnableWPO);

			// A thing is considered to have moved (which causes it to render into cached pages) if:
			// it was uploaded this frame, OR
			const bool bHasMoved = GetGPUSceneFrameNumber() == InstanceData.LastUpdateSceneFrameNumber
				// it has a material that uses WPO, OR
				|| bMaterialInvalidates
				// the primitive was "revealed" this frame - i.e., transitioned from culled -> visible on CPU
				|| WasPrimitiveRevealed(VSMCullingBatchInfo, PrimitiveData.PersistentPrimitiveIndex);

			FFrustumCullData FrustumCull = (FFrustumCullData)0;
			BRANCH
			if( Cull.bIsVisible )
			{
				FrustumCull = Cull.Frustum();
			}

			StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_TOTAL, NaniteView.TargetNumMipLevels);

			BRANCH
			if (Cull.bIsVisible)
			{
				// Compute estimated footprint in the VSM base level - note that we don't use the actual footprint because it changes under rotation and also with edge clipping, which makes it unstable when used for culling.
				float PixelEstRadius = CalcClipSpaceRadiusEstimate(Cull.bIsOrtho, InstanceData, Cull.LocalToTranslatedWorld, NaniteView.ViewToClip) * float(VSM_VIRTUAL_MAX_RESOLUTION_XY);

				uint BaseFlagMask = GetPageFlagMaskForRendering(bCacheAsStatic, bHasMoved);

				// Loop over mip levels and count number of output visible instances
				for (uint MipLevel = 0U; MipLevel < uint(NaniteView.TargetNumMipLevels); ++MipLevel)
				{
					uint MipViewId = MipLevel * TotalPrimaryViews + PrimaryViewId;
					FNaniteView MipView = GetNaniteView(MipViewId);
					uint VirtualShadowMapId = uint(MipView.TargetLayerIndex);

					FScreenRect Rect = GetScreenRect(MipView.ViewRect, FrustumCull, 4);

					uint FlagMask = BaseFlagMask | GetDetailGeometryFlagMaskForCulling(bCacheAsStatic, false, PixelEstRadius);
					// Update for next clip level
					PixelEstRadius *= 0.5f;

					uint4 RectPages = GetPageRect(Rect, VirtualShadowMapId, MipLevel);

					if (OverlapsAnyValidPage(VirtualShadowMapId, MipLevel, RectPages, FlagMask))
					{
#if VSM_GENERATE_STATS
						TotalPageArea += float(GetInclusiveRectArea(RectPages));
						NumVisibleLevels += 1.0f;
#endif
#if USE_HZB_OCCLUSION
						if (Cull.bViewHZB)
						{
							// Assume:
							// 1. Virtual address space has not changed (if so it'd have been invalidated)
							// 2. Physical pages that are cached have not changed (guaranteed by persistent caching)
							// OR (mode 2): Using this frame's HZB from Nanite shadow passes.
							// Then we can:
							// 1. Iterate the clipped page rect
							// 2. Test against only valid pages, which enables
							// 3. rejecting any that don't pass HZB at least once (or touch an uncached page)

							// In Mode 2 we are using the current-frame HZB and can use any page whether it is cached or not.
							// TODO: Figure out which flag(s) we actually want here. Also need to check the non-nanite one?
							// TODO: for mode 1, we need to use the previous view transform and rect.
							uint VisiblePageMask = HZBMode == 1 ? (VSM_DYNAMIC_UNCACHED_FLAG | VSM_STATIC_UNCACHED_FLAG) : 0U;
							if (!IsVisibleMaskedHZB(HZBMode == 1 ? MipView.TargetPrevLayerIndex : VirtualShadowMapId, MipLevel, Rect, true, HZBMode == 2, VisiblePageMask))
							{
								StatsBufferInterlockedInc(VSM_STAT_NON_NANITE_INSTANCES_HZB_CULLED);
								continue;
							}
						}
#endif // USE_HZB_OCCLUSION

						uint NumMappedPages = 0U;
						{
							uint2 RectPagesSize = (RectPages.zw + 1u) - RectPages.xy;
							if (RectPagesSize.x * RectPagesSize.y <= MAX_SINGLE_THREAD_MARKING_AREA)
							{
								FVirtualSMLevelOffset PageTableLevelOffset = CalcPageTableLevelOffset(VirtualShadowMapId, MipLevel);

								for (uint Y = RectPages.y; Y <= RectPages.w; ++Y)
								{
									for (uint X = RectPages.x; X <= RectPages.z; ++X)
									{
										if (MarkPageDirty(PageTableLevelOffset, uint2(X, Y), MipLevel, bMaterialInvalidates, bCacheAsStatic, Cull.bIsViewUncached, FlagMask))
										{
											// Count pages if we're touching all of them anyway
											++NumMappedPages;
										}
									}
								}
							}
							else
							{
								// defer the work and use all threads in group
								uint JobIndex = 0U;
								InterlockedAdd(NumSharedMarkingJobs, 1U, JobIndex);
								FSharedMarkingJob Job;
								Job.RectPages = RectPages;
								Job.VirtualShadowMapId = VirtualShadowMapId;
								Job.MipLevel = MipLevel;
								Job.bMaterialInvalidates = bMaterialInvalidates;
								Job.bShouldCacheAsStatic = bCacheAsStatic;
								Job.bIsViewUncached = Cull.bIsViewUncached;
								Job.bHasMoved = bHasMoved;
								if (JobIndex < MARKING_JOB_QUEUE_SIZE)
								{
									SharedMarkingJobs[JobIndex] = PackJob(Job);
								}
								// Must assume we have mapped pages (or defer the cmd write also...)
								NumMappedPages = 1U;
							}
						}

						if (NumMappedPages > 0U)
						{
							++ThreadTotalForAllViews;
							WriteCmd(MipViewId, InstanceId, Payload.IndirectArgIndex, CullingFlags, bCacheAsStatic);
						}
					}
					else
					{
						StatsBufferInterlockedInc(VSM_STAT_NON_NANITE_INSTANCES_PAGE_MASK_CULLED);
					}
				}
			}
			else
			{
				StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_FRUSTUM_CULLED, NaniteView.TargetNumMipLevels);
			}
		}

		StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_DRAWN, ThreadTotalForAllViews);

		// Accumulate the total number of instances for each indirect argument; this is also used to allocate space and output a compact range of instances later
		InterlockedAdd(DrawIndirectArgsBufferOut[Payload.IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1], ThreadTotalForAllViews);

#if VSM_GENERATE_STATS
		if (NumPageAreaDiagnosticSlots > 0U && TotalPageArea > LargeInstancePageAreaThreshold)
		{
			for (uint Index = 0U; Index < NumPageAreaDiagnosticSlots; ++Index)
			{
				uint PrevArea = 0U;
				// Store these after the stats slots
				InterlockedMax(OutStatsBuffer[VSM_NUM_STATS + 2U * Index], TotalPageArea, PrevArea);
				// If the area was greater, then we store the persistent Primitive ID after the area.
				// This obviously has a data race, so numbers & IDs will only be approximately correct; could use 64-bit atomics instead...
				if (PrevArea < TotalPageArea)
				{
					OutStatsBuffer[VSM_NUM_STATS + 2U * Index + 1U] = PrimitiveData.PersistentPrimitiveIndex;
					break;
				}
			}
		}
#endif
	}
	GroupMemoryBarrierWithGroupSync();
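	// Cooperatively process the marking jobs that were too large for a single thread: each job's page rect is
	// split across all threads in the group, which mark the covered pages dirty.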
	uint NumMarkingJobs = NumSharedMarkingJobs;
	for (uint JobIndex = 0U; JobIndex < min(MARKING_JOB_QUEUE_SIZE, NumMarkingJobs); ++JobIndex)
	{
		FSharedMarkingJob Job = UnpackJob(SharedMarkingJobs[JobIndex]);
		FVirtualSMLevelOffset PageTableLevelOffset = CalcPageTableLevelOffset(Job.VirtualShadowMapId, Job.MipLevel);
		uint2 RectPagesSize = (Job.RectPages.zw + 1u) - Job.RectPages.xy;
		for (uint Index = GroupThreadIndex; Index < RectPagesSize.x * RectPagesSize.y; Index += NUM_THREADS_PER_GROUP)
		{
			uint LocalPageY = uint(floor((float(Index) + 0.5f) * rcp(RectPagesSize.x)));
			uint2 vPage = Job.RectPages.xy + uint2(Index - RectPagesSize.x * LocalPageY, LocalPageY);
			uint FlagMask = GetPageFlagMaskForRendering(Job.bShouldCacheAsStatic, Job.bHasMoved);

			// We don't need to add the correct flag mask here, as we don't look at the VSM_DETAIL_GEOMETRY_FLAG for invalidation processing
			MarkPageDirty(PageTableLevelOffset, vPage, Job.MipLevel, Job.bMaterialInvalidates, Job.bShouldCacheAsStatic, Job.bIsViewUncached, FlagMask);
		}
	}
}

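// Inputs/outputs for the allocation pass (AllocateCommandInstanceOutputSpaceCs) below.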
Buffer<uint> DrawIndirectArgsBuffer;
RWBuffer<uint> InstanceIdOffsetBufferOut;
RWStructuredBuffer<uint> OutputOffsetBufferOut;
RWStructuredBuffer<uint> TmpInstanceIdOffsetBufferOut;
uint NumIndirectArgs;

/**
 * Separate pass to allocate space; needs to run once the final space requirements are known. We buffer the page/instance-draw info and reshuffle later.
 * TODO: Possibly just re-run the culling process in the output pass, saves storing stuff, but may cost more and runs the risk of the passes disagreeing e.g., due to rounding or whatever.
 */
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void AllocateCommandInstanceOutputSpaceCs(uint IndirectArgIndex : SV_DispatchThreadID)
{
	if (IndirectArgIndex < NumIndirectArgs)
	{
		uint CommandInstanceCount = DrawIndirectArgsBuffer[IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1];
		uint CommandInstanceOffset = 0U;
		if (CommandInstanceCount > 0U)
		{
			InterlockedAdd(OutputOffsetBufferOut[0], CommandInstanceCount, CommandInstanceOffset);
		}
		InstanceIdOffsetBufferOut[IndirectArgIndex] = CommandInstanceOffset;
		// Store second copy for use during output pass (as we need the first offset buffer during the actual rendering)
		TmpInstanceIdOffsetBufferOut[IndirectArgIndex] = CommandInstanceOffset;
	}

	// Also set up indirect dispatch args for the output pass (OutputCommandInstanceLists)
	//if (IndirectArgIndex == 0)
	//{
	//	uint NumVisibleInstances = VisibleInstanceCountBuffer[0];
	//	// ...dispatch args to process all the visible instances
	//}
}

StructuredBuffer<FVSMVisibleInstanceCmd> VisibleInstances;
StructuredBuffer<uint> VisibleInstanceCountBuffer;
//RWStructuredBuffer<uint> TmpInstanceIdOffsetBufferOut;
RWStructuredBuffer<uint> InstanceIdsBufferOut;
RWStructuredBuffer<uint> PageInfoBufferOut;

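// Output/scatter pass: one thread per buffered visible-instance command. Writes the packed instance ID and
// page info into the per-draw-command output range allocated above, using TmpInstanceIdOffsetBufferOut as a
// running write cursor per indirect arg.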
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void OutputCommandInstanceListsCs(uint VisibleInstanceIndex : SV_DispatchThreadID)
{
	uint NumVisibleInstances = VisibleInstanceCountBuffer[0];

	if (VisibleInstanceIndex < NumVisibleInstances)
	{
		FVSMVisibleInstanceCmd VisibleInstanceCmd = VisibleInstances[VisibleInstanceIndex];
		const uint InstanceId = VisibleInstanceCmd.InstanceIdAndFlags & INSTANCE_ID_MASK;
		const uint CullingFlags = VisibleInstanceCmd.InstanceIdAndFlags >> INSTANCE_ID_NUM_BITS;

		// Scatter the instance ID & other data.
		uint InstanceIdOutputOffset = 0;
		InterlockedAdd(TmpInstanceIdOffsetBufferOut[VisibleInstanceCmd.IndirectArgIndex], 1U, InstanceIdOutputOffset);
		// TODO: maybe repack as uint2 since that might be better for these types of presumably scalar loads.
		InstanceIdsBufferOut[InstanceIdOutputOffset] = PackInstanceCullingOutput(InstanceId, 0u, CullingFlags);
		PageInfoBufferOut[InstanceIdOutputOffset] = VisibleInstanceCmd.PackedPageInfo;
	}
}