You've already forked UnrealEngineUWP
mirror of
https://github.com/izzy2lost/UnrealEngineUWP.git
synced 2026-03-26 18:15:20 -07:00
- Avoid gotchas with max texture size when static separate enabled - Simplify addressing logic in a number of places - Avoid allocating extra HZB that we never use Details: - Support rendering/sampling to 2D depth texture array in Nanite and virtual shadow map pass - Remove some unnecessary HZB-related cvars - Remove unused permutations from VSM HW raster #preflight 624f4e5611261bc7b2171208 #rb jamie.hayes #ROBOMERGE-AUTHOR: andrew.lauritzen #ROBOMERGE-SOURCE: CL 19679616 via CL 19679656 via CL 19679706 #ROBOMERGE-BOT: UE5 (Release-Engine-Staging -> Main) (v938-19570697) [CL 19680680 by andrew lauritzen in ue5-main branch]
346 lines
14 KiB
Plaintext
346 lines
14 KiB
Plaintext
// Copyright Epic Games, Inc. All Rights Reserved.
|
|
|
|
/*=============================================================================
|
|
VirtualShadowMapBuildPerPageDrawCommands.usf:
|
|
=============================================================================*/
|
|
|
|
#include "../Common.ush"
|
|
#include "VirtualShadowMapPageOverlap.ush"
|
|
#include "VirtualShadowMapProjectionCommon.ush"
|
|
#include "../Nanite/NaniteDataDecode.ush"
|
|
#include "../InstanceCulling/InstanceCullingCommon.ush"
|
|
#include "../InstanceCulling/InstanceCullingLoadBalancer.ush"
|
|
#include "../WaveOpUtil.ush"
|
|
#include "VirtualShadowMapStats.ush"
|
|
#include "VirtualShadowMapPageCacheCommon.ush"
|
|
|
|
/**
 * One (instance, mip view) pair that survived culling.
 * Commands are appended in arbitrary order by the culling pass and are regrouped
 * per indirect draw argument by a later pass.
 */
struct FVisibleInstanceCmd
{
	// Result of PackPageInfo() for the target mip view / static-page flag.
	uint PackedPageInfo;
	// Instance index as passed to GetInstanceSceneData().
	uint InstanceId;
	// Index of the draw command (indirect argument set) this instance belongs to.
	uint IndirectArgIndex;
};
|
|
|
|
// Destination for the commands appended by WriteCmd().
RWStructuredBuffer<FVisibleInstanceCmd> VisibleInstancesOut;
// Element [0] is the running number of append attempts made by WriteCmd().
RWStructuredBuffer<uint> VisibleInstanceCountBufferOut;
// Stride handed to GetInstanceSceneData() when loading instance data.
uint InstanceSceneDataSOAStride;
// Multiplier used to address mip views: MipViewId = MipLevel * TotalPrimaryViews + PrimaryViewId.
uint TotalPrimaryViews;
// Capacity (in elements) that WriteCmd() checks before storing into VisibleInstancesOut.
uint VisibleInstancesBufferNum;
|
|
|
|
/**
 * Per-batch culling parameters: the range of primary views to process and the
 * culling view origin split into tile + offset (combined with MakeLWCVector3()).
 * NOTE(review): field order presumably mirrors a CPU-side struct - keep in sync.
 */
struct FVSMCullingBatchInfo
{
	float3 CullingViewOriginOffset;
	// First primary view index processed by the batch.
	uint FirstPrimaryView;
	float3 CullingViewOriginTile;
	// Number of consecutive primary views, starting at FirstPrimaryView.
	uint NumPrimaryViews;
};
|
|
|
|
#if ENABLE_BATCH_MODE

// Batched dispatch: per-batch parameters are fetched from this buffer by the culling kernel.
StructuredBuffer<FVSMCullingBatchInfo> VSMCullingBatchInfos;

#else // !ENABLE_BATCH_MODE

// Single-batch dispatch: the same parameters arrive as loose uniforms and are
// copied into FContextBatchInfo / FVSMCullingBatchInfo by the culling kernel.
uint FirstPrimaryView;
uint NumPrimaryViews;
uint DynamicInstanceIdOffset;
uint DynamicInstanceIdMax;

float3 CullingViewOriginOffset;
float3 CullingViewOriginTile;

#endif // ENABLE_BATCH_MODE
|
|
|
|
|
|
// Distance limit for material-driven invalidation; the distance test is skipped when negative.
float MaxMaterialPositionInvalidationRange;

// Per draw command description, indexed by the payload's IndirectArgIndex.
StructuredBuffer<FDrawCommandDesc> DrawCommandDescs;

// Indirect draw arguments; word [Index * INDIRECT_ARGS_NUM_WORDS + 1] accumulates the instance count.
RWBuffer<uint> DrawIndirectArgsBufferOut;

// Layout used by MarkInstanceForInvalidation():
//   [0]                                  number of queued instance IDs
//   [1 .. NumInvalidatingInstanceSlots]  list of queued instance IDs
//   [1 + NumInvalidatingInstanceSlots..] one bit per instance, set once it has been queued
RWStructuredBuffer<uint> OutInvalidatingInstances;
uint NumInvalidatingInstanceSlots;

// Bit mask (32 primitives per word) tested by WasPrimitiveRevealed().
StructuredBuffer<uint> PrimitiveRevealedMask;
// Number of valid bits in PrimitiveRevealedMask; IDs at or above this are never "revealed".
uint PrimitiveRevealedNum;

#if USE_HZB_OCCLUSION
// Mode 1 == previous frame HZB,
// Mode 2 == this frame HZB (Nanite only).
uint HZBMode;
#endif
|
|
|
|
|
|
/**
 * Looks up the "revealed" bit of a primitive in PrimitiveRevealedMask.
 *
 * @param PersistentPrimitiveId  Bit index into the mask.
 * @return true when the primitive's bit is set; always false for IDs >= PrimitiveRevealedNum.
 */
bool WasPrimitiveRevealed(uint PersistentPrimitiveId)
{
	// The range check doubles as a filter for transient primitives and as an opt-out
	// of the whole test (e.g. when caching is off and PrimitiveRevealedNum is zero).
	const bool bHasMaskBit = PersistentPrimitiveId < PrimitiveRevealedNum;
	if (bHasMaskBit)
	{
		// 32 primitives per mask word.
		const uint WordIndex = PersistentPrimitiveId >> 5U;
		const uint BitInWord = 1U << (PersistentPrimitiveId & 31U);
		return (PrimitiveRevealedMask[WordIndex] & BitInWord) != 0U;
	}

	return false;
}
|
|
|
|
/**
 * Queues an instance for invalidation, at most once.
 *
 * OutInvalidatingInstances holds a counter at [0], the list of queued IDs after it,
 * and a one-bit-per-instance mask starting at [1 + NumInvalidatingInstanceSlots].
 * The mask bit decides whether this call is the first to queue the instance.
 *
 * @param InstanceId  Instance to queue; IDs >= NumInvalidatingInstanceSlots are ignored.
 */
void MarkInstanceForInvalidation(uint InstanceId)
{
	// InstanceId may exceed the slot count for dynamic primitives, whose instances are added
	// later in the frame. Ignoring those is fine: being transient, they invalidate on removal.
	if (InstanceId < NumInvalidatingInstanceSlots)
	{
		const uint BitInWord = 1U << (InstanceId % 32U);
		const uint MaskWordIndex = 1U + NumInvalidatingInstanceSlots + (InstanceId / 32U);

		uint WordBeforeOr = 0U;
		InterlockedOr(OutInvalidatingInstances[MaskWordIndex], BitInWord, WordBeforeOr);

		// Only the caller that flipped the bit from 0 to 1 appends the ID to the list.
		if ((WordBeforeOr & BitInWord) == 0U)
		{
			uint ListSlot = 0U;
			WaveInterlockedAddScalar_(OutInvalidatingInstances[0], 1U, ListSlot);
			OutInvalidatingInstances[1U + ListSlot] = InstanceId;
		}
	}
}
|
|
|
|
/**
 * Appends one visible-instance command to VisibleInstancesOut.
 *
 * A slot is always reserved in VisibleInstanceCountBufferOut[0], even when the buffer is
 * already full, so that counter reflects append attempts rather than stored commands.
 *
 * @param MipViewId         View index stored in the packed page info.
 * @param InstanceId        Instance the command refers to.
 * @param IndirectArgIndex  Draw command (indirect argument set) of the instance.
 * @param bStaticPage       Stored in the packed page info.
 * @return true if the command was stored; false if the reserved slot was beyond
 *         VisibleInstancesBufferNum and the command was dropped.
 */
bool WriteCmd(uint MipViewId, uint InstanceId, uint IndirectArgIndex, bool bStaticPage)
{
	FPageInfo PageInfo;
	PageInfo.ViewId = MipViewId;
	PageInfo.bStaticPage = bStaticPage;

	FVisibleInstanceCmd VisibleInstanceCmd;
	VisibleInstanceCmd.PackedPageInfo = PackPageInfo(PageInfo);
	VisibleInstanceCmd.InstanceId = InstanceId;
	VisibleInstanceCmd.IndirectArgIndex = IndirectArgIndex;

	uint VisibleInstanceOutputOffset = 0U;
	WaveInterlockedAddScalar_(VisibleInstanceCountBufferOut[0], 1U, VisibleInstanceOutputOffset);

	// TODO: Flag & feedback on overflow.
	const bool bStored = VisibleInstanceOutputOffset < VisibleInstancesBufferNum;
	if (bStored)
	{
		VisibleInstancesOut[VisibleInstanceOutputOffset] = VisibleInstanceCmd;
	}
	return bStored;
}

/**
 * Culling kernel for non-Nanite instances.
 *
 * Each valid thread takes one instance from the instance culling load balancer, frustum-culls
 * it against every primary view of its batch, and for each mip level of a visible view checks
 * whether the instance's page rect overlaps any valid page (plus an optional HZB test).
 * Every surviving (instance, mip view) pair is appended through WriteCmd(), and the number of
 * commands that were actually stored is added to the instance count of the instance's
 * indirect draw arguments.
 *
 * Instances whose material may modify position and that were visible in at least one mip view
 * are additionally queued through MarkInstanceForInvalidation().
 *
 * @param GroupId           Thread group ID, unwrapped with GetUnWrappedDispatchGroupId().
 * @param GroupThreadIndex  Flattened thread index inside the group, used by the load balancer.
 */
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void CullPerPageDrawCommandsCs(uint3 GroupId : SV_GroupID, int GroupThreadIndex : SV_GroupIndex)
{
	uint DispatchGroupId = GetUnWrappedDispatchGroupId(GroupId);

	if (DispatchGroupId >= InstanceCullingLoadBalancer_GetNumBatches())
	{
		return;
	}

#if ENABLE_BATCH_MODE
	// Load Instance culling context batch info, indirection per group
	FContextBatchInfo BatchInfo = BatchInfos[BatchInds[DispatchGroupId]];
	FVSMCullingBatchInfo VSMCullingBatchInfo = VSMCullingBatchInfos[BatchInds[DispatchGroupId]];
#else // !ENABLE_BATCH_MODE
	// Single Instance culling context batch in the call, set up batch from the kernel parameters
	FContextBatchInfo BatchInfo = (FContextBatchInfo)0;
	BatchInfo.DynamicInstanceIdOffset = DynamicInstanceIdOffset;
	BatchInfo.DynamicInstanceIdMax = DynamicInstanceIdMax;

	FVSMCullingBatchInfo VSMCullingBatchInfo;
	VSMCullingBatchInfo.FirstPrimaryView = FirstPrimaryView;
	VSMCullingBatchInfo.NumPrimaryViews = NumPrimaryViews;
	VSMCullingBatchInfo.CullingViewOriginOffset = CullingViewOriginOffset;
	VSMCullingBatchInfo.CullingViewOriginTile = CullingViewOriginTile;
#endif // ENABLE_BATCH_MODE

	FLWCVector3 CullingViewWorldOrigin = MakeLWCVector3(VSMCullingBatchInfo.CullingViewOriginTile, VSMCullingBatchInfo.CullingViewOriginOffset);

	uint CurrentBatchProcessingMode = 0U;
	FInstanceWorkSetup WorkSetup = InstanceCullingLoadBalancer_Setup(GroupId, GroupThreadIndex, GetItemDataOffset(BatchInfo, CurrentBatchProcessingMode));
	if (!WorkSetup.bValid)
	{
		return;
	}

	const FInstanceCullingPayload Payload = LoadInstanceCullingPayload(WorkSetup.Item.Payload, BatchInfo);

	uint InstanceDataOffset = WorkSetup.Item.InstanceDataOffset;

	if (Payload.bDynamicInstanceDataOffset)
	{
		InstanceDataOffset += BatchInfo.DynamicInstanceIdOffset;
		checkSlow(InstanceDataOffset + uint(WorkSetup.LocalItemIndex) < BatchInfo.DynamicInstanceIdMax);
	}

	uint InstanceId = InstanceDataOffset + uint(WorkSetup.LocalItemIndex);
	FDrawCommandDesc DrawCommandDesc = DrawCommandDescs[Payload.IndirectArgIndex];

	// Load relevant instance data
	FInstanceSceneData InstanceData = GetInstanceSceneData(InstanceId, InstanceSceneDataSOAStride);

	// Stop invalidations from material if further away than the threshold
	bool bMaterialInvalidates = DrawCommandDesc.bMaterialMayModifyPosition;
	BRANCH
	if (MaxMaterialPositionInvalidationRange >= 0.0f && bMaterialInvalidates)
	{
		float DistSq = length2(LWCToFloat(LWCSubtract(LWCMultiply(InstanceData.LocalBoundsCenter, InstanceData.LocalToWorld), CullingViewWorldOrigin)));
		bMaterialInvalidates = DistSq < Square(MaxMaterialPositionInvalidationRange);
	}

	// A thing is considered to have moved (which causes it to render into cached pages) if:
	// It was uploaded this frame, OR
	const bool bHasMoved = GetGPUSceneFrameNumber() == InstanceData.LastUpdateSceneFrameNumber
		// it has a material that uses WPO or PDO, OR
		|| bMaterialInvalidates
		// the primitive was "revealed" this frame - i.e., transitioned from culled -> visible on CPU
		|| WasPrimitiveRevealed(GetPrimitiveData(InstanceData.PrimitiveId).PersistentPrimitiveIndex);

	// Number of mip views in which this instance survived culling (whether or not the command fit).
	uint NumVisibleMipViews = 0U;
	// Number of commands that WriteCmd() actually stored; only these may be counted as instances
	// in the indirect args, otherwise draws would cover commands that do not exist.
	uint NumStoredCmds = 0U;

	// Loop over views and output visible instance (i.e., those that overlap a valid page)
	const uint EndPrimaryView = VSMCullingBatchInfo.FirstPrimaryView + VSMCullingBatchInfo.NumPrimaryViews;
	for (uint PrimaryViewId = VSMCullingBatchInfo.FirstPrimaryView; PrimaryViewId < EndPrimaryView; ++PrimaryViewId)
	{
		FNaniteView NaniteView = GetNaniteView(PrimaryViewId);

		const bool bNearClip = (NaniteView.Flags & NANITE_VIEW_FLAG_NEAR_CLIP) != 0u;

		float4x4 LocalToTranslatedWorld = LWCMultiplyTranslation(InstanceData.LocalToWorld, NaniteView.PreViewTranslation);
		float4x4 LocalToClip = mul(LocalToTranslatedWorld, NaniteView.TranslatedWorldToClip);

		FFrustumCullData Cull = BoxCullFrustum(InstanceData.LocalBoundsCenter, InstanceData.LocalBoundsExtent, LocalToClip, bNearClip, false);

#if USE_HZB_OCCLUSION
		const bool bViewHZB = (NaniteView.Flags & NANITE_VIEW_FLAG_HZBTEST) != 0 || HZBMode == 2;
#endif

		StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_TOTAL, NaniteView.TargetNumMipLevels);

		if (Cull.bIsVisible)
		{
			// Loop over mip levels and emit one command per mip view in which the instance is visible
			for (uint MipLevel = 0U; MipLevel < uint(NaniteView.TargetNumMipLevels); ++MipLevel)
			{
				uint MipViewId = MipLevel * TotalPrimaryViews + PrimaryViewId;
				FNaniteView MipView = GetNaniteView(MipViewId);
				uint VirtualShadowMapId = uint(MipView.TargetLayerIndex);

				FScreenRect Rect = GetScreenRect(MipView.ViewRect, Cull, 4);

				uint FlagMask = VSM_NON_NANITE_FLAG | GetPageFlagMaskForRendering(InstanceData, bHasMoved);

				uint4 RectPages = GetPageRect(Rect, VirtualShadowMapId, MipLevel);

				if (OverlapsAnyValidPage(VirtualShadowMapId, MipLevel, RectPages, FlagMask))
				{
#if USE_HZB_OCCLUSION
					if (bViewHZB)
					{
						// Assume:
						//  1. Virtual address space has not changed (if so it'd have been invalidated)
						//  2. Physical pages that are cached have not changed (guaranteed by persistent caching)
						//  OR (mode 2): Using this frame's HZB from Nanite shadow passes.
						// Then we can:
						//  1. Iterate the clipped page rect
						//  2. Test against only valid pages which enables
						//  3. rejecting any that don't pass HZB at least once (or touches an uncached page)

						// In Mode 2 we are using the current-frame HZB and can use any page whether it is cached or not.
						// TODO: Figure out which flag(s) we actually want here. Also need to check the non-nanite one?
						// TODO: for mode 1, we need to use the previous view transform and rect.
						uint VisiblePageMask = HZBMode == 1 ? (VSM_DYNAMIC_UNCACHED_FLAG | VSM_STATIC_UNCACHED_FLAG) : 0U;
						if (!IsVisibleMaskedHZB(HZBMode == 1 ? MipView.TargetPrevLayerIndex : VirtualShadowMapId, MipLevel, Rect, true, HZBMode == 2, VisiblePageMask))
						{
							StatsBufferInterlockedInc(VSM_STAT_NON_NANITE_INSTANCES_HZB_CULLED);
							continue;
						}
					}
#endif // USE_HZB_OCCLUSION

					++NumVisibleMipViews;

					// The command is dropped when the visible-instance buffer is full; count it
					// towards the draw only if it really was stored.
					const bool bCmdStored = WriteCmd(MipViewId, InstanceId, Payload.IndirectArgIndex, ShouldCacheInstanceAsStatic(InstanceData));
					NumStoredCmds += bCmdStored ? 1U : 0U;
				}
				else
				{
					StatsBufferInterlockedInc(VSM_STAT_NON_NANITE_INSTANCES_PAGE_MASK_CULLED);
				}
			}
		}
		else
		{
			StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_FRUSTUM_CULLED, NaniteView.TargetNumMipLevels);
		}
	}

	// If the instance has WPO or PDO and was visible in any mip, queue for invalidation. This will be processed
	// before the next frame (after a new HZB is built) and invalidate any visible pages.
	// Uses the visibility count (not the stored count) so buffer overflow does not change invalidation.
	if (NumVisibleMipViews > 0U && bMaterialInvalidates)
	{
		MarkInstanceForInvalidation(InstanceId);
	}

	StatsBufferInterlockedAdd(VSM_STAT_NON_NANITE_INSTANCES_DRAWN, NumStoredCmds);

	// Accumulate total number of instances for each indirect argument, is also used to allocate space and output compact range of instances later
	InterlockedAdd(DrawIndirectArgsBufferOut[Payload.IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1], NumStoredCmds);
}
|
|
|
|
// Read-only view of the indirect draw arguments; word [Index * INDIRECT_ARGS_NUM_WORDS + 1] is the instance count.
Buffer<uint> DrawIndirectArgsBuffer;
// Per draw command: first slot of its range in the compacted instance output.
RWBuffer<uint> InstanceIdOffsetBufferOut;
// Element [0] is the running allocation cursor for the compacted instance output.
RWStructuredBuffer<uint> OutputOffsetBufferOut;
// Working copy of the per-command offsets; advanced while instances are scattered in the output pass.
RWStructuredBuffer<uint> TmpInstanceIdOffsetBufferOut;
// Number of draw commands (indirect argument sets) to allocate for.
uint NumIndirectArgs;
|
|
|
|
/**
 * Allocation pass: one thread per draw command reserves a contiguous range in the compacted
 * instance output, sized by the instance count accumulated in the indirect args. It has to run
 * after culling, once the final counts are known; the buffered page/instance-draw info is
 * reshuffled into these ranges by the output pass.
 * TODO: Possibly just re-run the culling process in the output pass, saves storing stuff, but may cost more
 *       and runs the risk of the passes disagreeing e.g., due to rounding or whatever.
 */
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void AllocateCommandInstanceOutputSpaceCs(uint IndirectArgIndex : SV_DispatchThreadID)
{
	// Threads past the last draw command have nothing to allocate.
	if (IndirectArgIndex >= NumIndirectArgs)
	{
		return;
	}

	const uint NumInstancesForCommand = DrawIndirectArgsBuffer[IndirectArgIndex * INDIRECT_ARGS_NUM_WORDS + 1];

	// Commands without instances keep offset zero and do not touch the shared cursor.
	uint FirstOutputSlot = 0U;
	if (NumInstancesForCommand != 0U)
	{
		InterlockedAdd(OutputOffsetBufferOut[0], NumInstancesForCommand, FirstOutputSlot);
	}

	InstanceIdOffsetBufferOut[IndirectArgIndex] = FirstOutputSlot;
	// Second copy for the output pass, which advances it while scattering (the first copy is needed during rendering).
	TmpInstanceIdOffsetBufferOut[IndirectArgIndex] = FirstOutputSlot;

	// Not implemented here: setting up indirect dispatch args for the output pass
	// (OutputCommandInstanceLists) from VisibleInstanceCountBuffer[0].
}
|
|
|
|
// Commands produced by the culling pass, read back in arbitrary order.
StructuredBuffer<FVisibleInstanceCmd> VisibleInstances;
// Element [0] bounds the number of commands processed by the output pass.
StructuredBuffer<uint> VisibleInstanceCountBuffer;
// TmpInstanceIdOffsetBufferOut is declared with the allocation pass resources and reused here.
//RWStructuredBuffer<uint> TmpInstanceIdOffsetBufferOut;
// Compacted outputs, written at the same slot for a given command.
RWStructuredBuffer<uint> InstanceIdsBufferOut;
RWStructuredBuffer<uint> PageInfoBufferOut;
|
|
|
|
|
|
/**
 * Output pass: one thread per visible-instance command. Each thread claims the next free slot
 * of its draw command's range (by bumping the per-command cursor) and writes the instance ID
 * and packed page info there.
 *
 * NOTE(review): the count read here is incremented by the culling pass even for commands it
 * could not store, so it may exceed the number of valid entries in VisibleInstances - confirm
 * whether it should be clamped to the buffer capacity.
 */
[numthreads(NUM_THREADS_PER_GROUP, 1, 1)]
void OutputCommandInstanceListsCs(uint VisibleInstanceIndex : SV_DispatchThreadID)
{
	const uint CommandCount = VisibleInstanceCountBuffer[0];
	if (VisibleInstanceIndex >= CommandCount)
	{
		return;
	}

	const FVisibleInstanceCmd Cmd = VisibleInstances[VisibleInstanceIndex];

	// Scatter the instance ID & other data.
	uint OutputSlot = 0;
	InterlockedAdd(TmpInstanceIdOffsetBufferOut[Cmd.IndirectArgIndex], 1U, OutputSlot);

	// TODO: maybe repack as uint2 since that might be better for these type of presumably scalar loads.
	InstanceIdsBufferOut[OutputSlot] = Cmd.InstanceId;
	PageInfoBufferOut[OutputSlot] = Cmd.PackedPageInfo;
}
|
|
|