Immediate mode patch rasterizer optimizations:

-Implemented vertex caching to evaluate ~1 vertex per triangle instead of 3
-Added a separate immediate mode tessellation table that is constrained to one vert + one tri per lane
-Disabled material range merging in raster binning, so UVDensities can be made scalar in the immediate mode rasterizer
-Used wide LDS loads instead of permutes for some properties to reduce DS pressure

Patch rasterizer optimizations:
-Reduced max tess factor from 16 to 14 to increase occupancy of patch rasterizer
-Moved UVDensities to per-patch work in patch rasterizer

Packed FTessellatedPatch data to fit in fewer registers to reduce VGPR/DS pressure
Added debug code to output SVG of tessellation pattern

#jira UE-197833
#rb brian.karis
[FYI] graham.wihlidal, jamie.hayes

[CL 30896683 by rune stubbe in ue5-main branch]
This commit is contained in:
rune stubbe
2024-01-25 14:54:04 -05:00
parent 29b235b080
commit 2c750e1101
12 changed files with 463 additions and 158 deletions
@@ -223,6 +223,11 @@ FInstanceDynamicData WaveReadLaneAt(FInstanceDynamicData In, uint SrcIndex)
return Result;
}
// Maps a linear Z value to clip-space Z using only the two Z terms of the view-to-clip matrix:
// a scale taken from ViewToClip[2][2] and an offset taken from ViewToClip[3][2].
// NOTE(review): presumably relies on the X/Y rows of ViewToClip not contributing to clip Z - confirm.
// The result is not divided by W; callers that need post-projection depth do that themselves.
float ClipZFromLinearZ(FNaniteView NaniteView, float LinearZ)
{
	// TODO: Pack coefficients into single load?
	const float ZScale = NaniteView.ViewToClip[2][2];
	const float ZOffset = NaniteView.ViewToClip[3][2];
	return LinearZ * ZScale + ZOffset;
}
// Packs a (PageIndex, ClusterIndex) pair into a flat index based on max clusters per page.
uint PackPoolClusterRef(uint PageIndex, uint ClusterIndex)
{
+99 -11
View File
@@ -7,6 +7,8 @@
#if NANITE_TESSELLATION
#define NANITE_TESSELLATION_DICE_USE_LDS 1
void RasterizeDicedTri(
FRasterTri Tri,
FRaster Raster,
@@ -44,12 +46,16 @@ void RasterizeDicedTri(
}
}
groupshared float4 GroupPointPackedClip[ THREADGROUP_SIZE ]; // TODO: Convert to PackedClip ?
groupshared float4 GroupNormalPackedClip[ THREADGROUP_SIZE ];
struct FDiceTask
{
FRaster Raster;
FMaterialShader Shader;
uint PixelValue;
uint2 VisualizeValues;
float4 UVDensities;
bool bReverseWinding;
FNaniteTransformedVert Vert;
@@ -59,17 +65,18 @@ struct FDiceTask
uint PatchData;
// Vertex cache
//uint FirstVert;
//uint& NumCached;
float3 CachedPackedSubpixelPosition;
void Init( float3 TessFactors, uint3 VertIndexes, uint TriIndex )
{
TessellatedPatch.Init( TessFactors, VertIndexes );
TessellatedPatch.Init( TessFactors, VertIndexes, true );
PatchData = VertIndexes.x << 0;
PatchData |= VertIndexes.y << 8;
PatchData |= VertIndexes.z << 16;
PatchData |= TriIndex << 24;
CachedPackedSubpixelPosition = 0.0f;
}
FDiceTask CreateChild( uint ParentLaneIndex )
@@ -80,12 +87,12 @@ struct FDiceTask
ChildTask.Shader = Shader;
ChildTask.PixelValue = PixelValue;
ChildTask.VisualizeValues = VisualizeValues;
ChildTask.UVDensities = UVDensities;
ChildTask.bReverseWinding = bReverseWinding;
ChildTask.TessellatedPatch = WaveReadLaneAt( TessellatedPatch, ParentLaneIndex );
ChildTask.Encoded = WaveReadLaneAt( Encoded, ParentLaneIndex );
ChildTask.PatchData = WaveReadLaneAt( PatchData, ParentLaneIndex );
//ChildTask.FirstVert = WaveReadLaneAt( FirstVert, ParentLaneIndex );
uint3 PatchVertIndexes;
PatchVertIndexes.x = ( ChildTask.PatchData >> 0 ) & 0xff;
@@ -93,18 +100,42 @@ struct FDiceTask
PatchVertIndexes.z = ( ChildTask.PatchData >> 16 ) & 0xff;
FNaniteTransformedTri TransformedTri = MakeTransformedNaniteTriangle( Vert, PatchVertIndexes );
#if NANITE_TESSELLATION_DICE_USE_LDS
// Alleviate DS pressure by using wide LDS loads instead of single component permutes
for (uint Corner = 0; Corner < 3; ++Corner)
{
const uint SourceIndex = PatchVertIndexes[ Corner ];
TransformedTri.Verts[ Corner ].PointClip = GroupPointPackedClip[ SourceIndex ];
TransformedTri.Verts[ Corner ].NormalClip = GroupNormalPackedClip[ SourceIndex ];
}
#endif
ChildTask.Shader.TransformedTri = TransformedTri; // TODO mutable. This is weird
return ChildTask;
}
void CacheToLDS()
{
#if NANITE_TESSELLATION_DICE_USE_LDS
const uint LaneIndex = WaveGetLaneIndex();
void RunChild( uint LocalItemIndex );
GroupPointPackedClip[LaneIndex] = Vert.PointClip;
GroupNormalPackedClip[LaneIndex] = Vert.NormalClip;
GroupMemoryBarrierWithGroupSync();
#endif
}
void RunChild( inout FDiceTask ParentTask, bool bActive, uint LocalItemIndex );
};
//groupshared float3 VertexCache[ THREADGROUP_SIZE ];
//#define VertCache(x) VertexCache[ QueueOffset + ( (x) & ( LaneCount - 1 ) ) ]
void FDiceTask::RunChild( uint LocalItemIndex )
void FDiceTask::RunChild( inout FDiceTask ParentTask, bool bActive, uint LocalItemIndex )
{
uint PatchIndex = PatchData >> 24;
@@ -114,8 +145,63 @@ void FDiceTask::RunChild( uint LocalItemIndex )
VertIndexes.yz = VertIndexes.zy;
float4 Verts[3];
#if 1
const float3 TessFactors = TessellatedPatch.GetTessFactors();
FBarycentrics Barycentrics;
Barycentrics.Value = TessellatedPatch.GetVert( LocalItemIndex );
Barycentrics.Value_dx = 0; // float3( -1, 1, 0 ) / TessFactors.x;
Barycentrics.Value_dy = 0; // float3( 0, -1, 1 ) / TessFactors.y;
#if 0
const bool bOrtho = IsOrthoProjection( Shader.NaniteView.ViewToClip );
float3 CornerPackedSubpixel0; // sub-pixel xy, linear z
// TODO: Unify these paths by having EvaluateDomain operate directly in PackedClip space
BRANCH
if (bOrtho)
{
// Optimize out .w work and lane permutes
const float3 PointPackedClip = Shader.EvaluateDomain( UVDensities, Barycentrics ).xyz;
CornerPackedSubpixel0 = PointPackedClip.xyz;
}
else
{
// Optimize out .z work and lane permutes
const float3 PointPackedClip = Shader.EvaluateDomain( UVDensities, Barycentrics ).xyw;
CornerPackedSubpixel0 = float3( PointPackedClip.xy / PointPackedClip.z, PointPackedClip.z );
}
CornerPackedSubpixel0.xy = CornerPackedSubpixel0.xy * Raster.ViewportScale + Raster.ViewportBias;
CornerPackedSubpixel0.xy = floor( CornerPackedSubpixel0.xy );
const int3 RelativeVertIndexes = WaveGetLaneIndex() - ( LocalItemIndex - VertIndexes ); // Relative to thread 0 in wave
const uint3 ReadLaneIndex = uint3( RelativeVertIndexes ) & ( WaveGetLaneCount() - 1u );
float3 CornerPackedSubpixel1 = WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, ReadLaneIndex.y );
float3 CornerPackedSubpixel2 = WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, ReadLaneIndex.z );
ParentTask.CachedPackedSubpixelPosition = CornerPackedSubpixel0;
CornerPackedSubpixel1 = select( RelativeVertIndexes.y >= 0, WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, ReadLaneIndex.y ), CornerPackedSubpixel1 );
CornerPackedSubpixel2 = select( RelativeVertIndexes.z >= 0, WaveReadLaneAt( ParentTask.CachedPackedSubpixelPosition, ReadLaneIndex.z ), CornerPackedSubpixel2 );
BRANCH
if (bOrtho)
{
Verts[0] = float4( CornerPackedSubpixel0, 1.0f );
Verts[1] = float4( CornerPackedSubpixel1, 1.0f );
Verts[2] = float4( CornerPackedSubpixel2, 1.0f );
}
else
{
Verts[0] = float4( CornerPackedSubpixel0.xy, float2( ClipZFromLinearZ( Shader.NaniteView, CornerPackedSubpixel0.z ), 1.0f ) / CornerPackedSubpixel0.z );
Verts[1] = float4( CornerPackedSubpixel1.xy, float2( ClipZFromLinearZ( Shader.NaniteView, CornerPackedSubpixel1.z ), 1.0f ) / CornerPackedSubpixel1.z );
Verts[2] = float4( CornerPackedSubpixel2.xy, float2( ClipZFromLinearZ( Shader.NaniteView, CornerPackedSubpixel2.z ), 1.0f ) / CornerPackedSubpixel2.z );
}
#elif 0
// Grab what's there for this triangle before updating cache. Otherwise cache would need to be double size.
bool3 VertRead = false;
@@ -175,7 +261,7 @@ void FDiceTask::RunChild( uint LocalItemIndex )
FRasterTri Tri = SetupTriangle< NANITE_SUBPIXEL_SAMPLES, !NANITE_TWO_SIDED >( Raster.ScissorRect, Verts );
// Immediate dicing doesn't do near plane culling
if( min3( Verts[0].w, Verts[1].w, Verts[2].w ) < 0 )
if( !bActive || min3( Verts[0].w, Verts[1].w, Verts[2].w ) < 0 )
Tri.bIsValid = false;
if( Tri.bIsValid )
@@ -202,7 +288,7 @@ struct FClusterSplitTask
void Init( float3 TessFactors, uint VisibleClusterIndex, uint TriIndex );
FClusterSplitTask CreateChild( uint ParentLaneIndex );
void RunChild( uint LocalItemIndex );
void RunChild( inout FClusterSplitTask ParentTask, bool bActive, uint LocalItemIndex );
};
void FClusterSplitTask::Init( float3 TessFactors, uint VisibleClusterIndex, uint TriIndex )
@@ -212,7 +298,7 @@ void FClusterSplitTask::Init( float3 TessFactors, uint VisibleClusterIndex, uint
Encoded.z = BarycentricMax << 16;
Encoded.w = 0;
TessellatedPatch.Init( TessFactors, Encoded.yzw );
TessellatedPatch.Init( TessFactors, Encoded.yzw, false );
}
FClusterSplitTask FClusterSplitTask::CreateChild( uint ParentLaneIndex )
@@ -223,8 +309,10 @@ FClusterSplitTask FClusterSplitTask::CreateChild( uint ParentLaneIndex )
return ChildTask;
}
void FClusterSplitTask::RunChild( uint LocalItemIndex )
void FClusterSplitTask::RunChild( inout FClusterSplitTask ParentTask, bool bActive, uint LocalItemIndex )
{
if( !bActive )
return;
#if 0
Encoded.yzw = TessellatedPatch.GetTriangleEncoded( LocalItemIndex );
#else
@@ -425,6 +425,7 @@ void RasterBinBuild(uint RelativeClusterIndex : SV_DispatchThreadID, uint GroupT
FCluster Cluster = GetCluster(VisibleCluster.PageIndex, VisibleCluster.ClusterIndex);
const bool bWPOEnabled = (VisibleCluster.Flags & NANITE_CULLING_FLAG_ENABLE_WPO) != 0;
const bool bSecondaryBin = !bWPOEnabled; // use secondary bin where applicable when WPO disabled
const bool bMergeRanges = false; // Merging ranges is incompatible with tessellation. Cluster rasterizer assumes UVDensities is uniform. // TODO: Reenable it for non-tessellation materials?
BRANCH
if (IsMaterialFastPath(Cluster))
@@ -464,30 +465,32 @@ void RasterBinBuild(uint RelativeClusterIndex : SV_DispatchThreadID, uint GroupT
RasterBin2 = GetRemappedRasterBinFromIndex(Cluster.Material2Index, InstanceData.PrimitiveId, RegularMaterialRasterBinCount, RenderFlags, bSecondaryBin);
}
if (RasterBin0 == RasterBin1 && RasterBin0 == RasterBin2)
{
RasterLen0 = Cluster.NumTris;
BatchCount0 += BatchCount1 + BatchCount2;
}
else if (RasterBin0 == RasterBin1)
{
RasterLen0 = (Cluster.Material0Length + Cluster.Material1Length);
RasterLen2 = Material2Length;
BatchCount0 += BatchCount1;
}
else if (RasterBin1 == RasterBin2)
{
RasterLen0 = Cluster.Material0Length;
RasterLen1 = Cluster.NumTris - Cluster.Material0Length;
BatchCount1 += BatchCount2;
}
else
{
RasterLen0 = Cluster.Material0Length;
RasterLen1 = Cluster.Material1Length;
RasterLen2 = Material2Length;
}
RasterLen0 = Cluster.Material0Length;
RasterLen1 = Cluster.Material1Length;
RasterLen2 = Material2Length;
BRANCH
if (bMergeRanges)
{
if (RasterBin0 == RasterBin1 && RasterBin0 == RasterBin2)
{
RasterLen0 = Cluster.NumTris;
BatchCount0 += BatchCount1 + BatchCount2;
}
else if (RasterBin0 == RasterBin1)
{
RasterLen0 = (Cluster.Material0Length + Cluster.Material1Length);
RasterLen2 = Material2Length;
BatchCount0 += BatchCount1;
}
else if (RasterBin1 == RasterBin2)
{
RasterLen0 = Cluster.Material0Length;
RasterLen1 = Cluster.NumTris - Cluster.Material0Length;
BatchCount1 += BatchCount2;
}
}
// The 0th material range is always non-zero length
{
ExportRasterBin(RasterBin0, VisibleClusterIndex, 0u, RasterLen0, BatchCount0, BatchInfoOffset0, Cluster, true, bSoftware);
@@ -535,8 +538,7 @@ void RasterBinBuild(uint RelativeClusterIndex : SV_DispatchThreadID, uint GroupT
const uint BatchCount = BitStreamReader_Read_RO(ClusterPageData, BatchCountStreamState, 4, 4);
// Check if raster slot matches the current run, and that the triangle range is contiguous.
const bool bMergeRange = (RasterBinN == CurrentRangeBin);
if (bMergeRange)
if (bMergeRanges && (RasterBinN == CurrentRangeBin))
{
// Update current range
CurrentRangeEnd = TriStart + TriLength;
@@ -475,12 +475,6 @@ struct FMaterialShader
return PointClip;
}
float4 EvaluateDomain( uint TriIndex, FBarycentrics Barycentrics )
{
const float4 UVDensities = GetMaterialUVDensities( Cluster, InstanceData.PrimitiveId, TriIndex );
return EvaluateDomain( UVDensities, Barycentrics );
}
#endif
bool EvaluatePixel( FBarycentrics Barycentrics, float4 SvPosition, inout FVisBufferPixel Pixel )
@@ -234,7 +234,7 @@ FRaster CreateRaster( FNaniteView NaniteView, FInstanceSceneData InstanceData, F
}
#if PATCHES
#define VERTEX_CACHE_SIZE 153
#define VERTEX_CACHE_SIZE 120 // (MaxTessFactor+1)*(MaxTessFactor+2)/2
#else
#define VERTEX_CACHE_SIZE 256
#endif
@@ -311,9 +311,10 @@ void ClusterRasterize( uint VisibleIndex, uint GroupThreadIndex )
#endif
#if NANITE_TESSELLATION
#if USES_DISPLACEMENT
MaterialShader.DisplacementCenter = RasterBinMeta[ActiveRasterBin].MaterialDisplacementCenter;
MaterialShader.DisplacementMagnitude = RasterBinMeta[ActiveRasterBin].MaterialDisplacementMagnitude;
#endif
uint TriIndex = TriRange.Start + GroupThreadIndex;
bool bTriValid = GroupThreadIndex < TriRange.Num;
@@ -345,20 +346,23 @@ void ClusterRasterize( uint VisibleIndex, uint GroupThreadIndex )
float3 TessFactors = GetTessFactors( NaniteView, TriPointView );
const uint ImmediateDiceLimit = 8;
const uint ImmediateSplitLimit = 8;
bool bCanDice = max3( TessFactors.x, TessFactors.y, TessFactors.z ) <= ImmediateDiceLimit;
bool bCanDice = max3( TessFactors.x, TessFactors.y, TessFactors.z ) <= NANITE_TESSELLATION_TABLE_IMMEDIATE_SIZE;
if( WaveActiveAnyTrue( bCanDice ) )
{
FDiceTask DiceTask;
DiceTask.Raster = Raster;
DiceTask.Shader = MaterialShader;
DiceTask.PixelValue = ( VisibleIndex + 1 ) << 7;
DiceTask.VisualizeValues = GetVisualizeValues();
DiceTask.UVDensities = GetMaterialUVDensities( Cluster, InstanceData.PrimitiveId, TriRange.Start );
DiceTask.bReverseWinding = bReverseWindingOrder;
DiceTask.Vert = Vert;
DiceTask.CacheToLDS();
uint NumVerts = 0;
uint NumTris = 0;
if( bTriValid && bCanDice )
@@ -368,9 +372,6 @@ void ClusterRasterize( uint VisibleIndex, uint GroupThreadIndex )
NumTris = DiceTask.TessellatedPatch.GetNumTris();
}
//DiceTask.FirstVert = WavePrefixSum( NumVerts );
//DiceTask.NumCached = 0;
BRANCH
if ((RenderFlags & NANITE_RENDER_FLAG_WRITE_STATS) != 0u)
{
@@ -603,6 +604,7 @@ void PatchRasterize( uint GroupID, uint GroupThreadIndex )
FTessellatedPatch Patches_TessellatedPatch;
FNaniteVertTransforms Patches_VertTransforms;
FNaniteTransformedVert Patches_Verts;
float4 Patches_UVDensities;
if (GroupThreadIndex < NumPatches * 3u)
{
@@ -657,10 +659,10 @@ void PatchRasterize( uint GroupID, uint GroupThreadIndex )
const uint3 VertIndexes = DecodeTriangleIndices(Cluster, Patches_SplitPatch.TriIndex);
Patches_Verts = FetchTransformedNaniteVertex(PrimitiveData, Patches_InstanceData, Patches_VertTransforms, Cluster, VertIndexes[PatchCornerIndex], bEvaluateWPO);
Patches_UVDensities = GetMaterialUVDensities(Cluster, Patches_InstanceData.PrimitiveId, Patches_SplitPatch.TriIndex);
#if NANITE_TESSELLATION_PATCH_REFS
const uint Pattern = VisiblePatch.y;
Patches_TessellatedPatch.Pattern = Pattern;
Patches_TessellatedPatch.TableOffsets = TessellationTable_Offsets.Load4(4 * 2 * (Pattern & 0xfff));
Patches_TessellatedPatch.Init(VisiblePatch.y, false);
#else
const float3 OuterPatchCornersView = mul(float4(Patches_Verts.PointWorld, 1), NaniteView.TranslatedWorldToView).xyz;
@@ -675,7 +677,7 @@ void PatchRasterize( uint GroupID, uint GroupThreadIndex )
CornersView[2] = WaveReadLaneAt(InnerPatchCornersView, PatchStartLane + 2);
const float3 TessFactors = GetTessFactors(NaniteView, CornersView);
Patches_TessellatedPatch.Init(TessFactors, Patches_EncodedPatch.yzw);
Patches_TessellatedPatch.Init(TessFactors, Patches_EncodedPatch.yzw, false);
Patches_SplitPatch.Decode(Patches_EncodedPatch);
#endif
}
@@ -688,6 +690,7 @@ void PatchRasterize( uint GroupID, uint GroupThreadIndex )
const bool bReverseWindingOrder = WaveReadLaneAt(Patches_bReverseWindingOrders, PatchStartLane);
const FSplitPatch SplitPatch = WaveReadLaneAt(Patches_SplitPatch, PatchStartLane);
const FTessellatedPatch TessellatedPatch = WaveReadLaneAt(Patches_TessellatedPatch, PatchStartLane);
const float4 UVDensities = WaveReadLaneAt(Patches_UVDensities, PatchStartLane);
// The following values can be used in a shader, but will most likely be dead code eliminated
const FInstanceSceneData InstanceData = WaveReadLaneAt(Patches_InstanceData, PatchStartLane);
@@ -717,8 +720,10 @@ void PatchRasterize( uint GroupID, uint GroupThreadIndex )
#endif
MaterialShader.TransformedTri = MakeTransformedNaniteTriangle(Patches_Verts, PatchStartLane + uint3(0, 1, 2));
#if USES_DISPLACEMENT
MaterialShader.DisplacementCenter = RasterBinMeta[ActiveRasterBin].MaterialDisplacementCenter;
MaterialShader.DisplacementMagnitude = RasterBinMeta[ActiveRasterBin].MaterialDisplacementMagnitude;
#endif
uint PixelValue = (SplitPatch.VisibleClusterIndex + 1) << 7;
@@ -745,7 +750,6 @@ void PatchRasterize( uint GroupID, uint GroupThreadIndex )
}
#endif
const float4 UVDensities = GetMaterialUVDensities( Cluster, InstanceData.PrimitiveId, SplitPatch.TriIndex );
for( uint VertIndex = GroupThreadIndex; VertIndex < NumVerts; VertIndex += ThreadGroupSize )
{
FBarycentrics Barycentrics;
@@ -877,7 +881,7 @@ VSOut CommonRasterizerVS(FNaniteView NaniteView, FPrimitiveSceneData PrimitiveDa
MaterialShader.NaniteView = NaniteView;
MaterialShader.Cluster = Cluster;
#if NANITE_TESSELLATION
#if NANITE_TESSELLATION && USES_DISPLACEMENT
MaterialShader.DisplacementCenter = RasterBinMeta[ActiveRasterBin].MaterialDisplacementCenter;
MaterialShader.DisplacementMagnitude = RasterBinMeta[ActiveRasterBin].MaterialDisplacementMagnitude;
#endif
@@ -63,7 +63,7 @@ struct FSplitTask
uint Run();
FSplitTask CreateChild( uint ParentLaneIndex );
void RunChild( uint LocalItemIndex );
void RunChild( inout FSplitTask ParentTask, bool bActive, uint LocalItemIndex );
};
void FSplitTask::Init( uint VisibleClusterIndex, uint TriIndex )
@@ -222,7 +222,7 @@ uint FSplitTask::Run()
}
#endif
bool bNeedsSplitting = max3( TessFactors.x, TessFactors.y, TessFactors.z ) > TessellationTable_Size;
bool bNeedsSplitting = max3( TessFactors.x, TessFactors.y, TessFactors.z ) > NANITE_TESSELLATION_TABLE_SIZE;
if( Cull.bWasOccluded )
{
@@ -250,11 +250,11 @@ uint FSplitTask::Run()
if( WriteOffset < VisiblePatchesSize )
{
#if NANITE_TESSELLATION_PATCH_REFS || WRITE_STATS
TessellatedPatch.Init( TessFactors, Encoded.yzw );
TessellatedPatch.Init( TessFactors, Encoded.yzw, false );
#endif
#if NANITE_TESSELLATION_PATCH_REFS || WRITE_STATS
RWVisiblePatches.Store2( WriteOffset * 8, uint2( QueueOffset, TessellatedPatch.Pattern ) );
RWVisiblePatches.Store2( WriteOffset * 8, uint2( QueueOffset, TessellatedPatch.GetPattern() ) );
SplitWorkQueue.DataBuffer.Store4( QueueOffset * 16, Encoded );
QueueOffset = ~0u;
#else
@@ -269,7 +269,7 @@ uint FSplitTask::Run()
else
{
float3 SplitFactors = min( GetSplitFactors( TessFactors ), 8 );
TessellatedPatch.Init( SplitFactors, Encoded.yzw );
TessellatedPatch.Init( SplitFactors, Encoded.yzw, false );
return TessellatedPatch.GetNumTris();
}
}
@@ -285,8 +285,11 @@ FSplitTask FSplitTask::CreateChild( uint ParentLaneIndex )
return ChildTask;
}
void FSplitTask::RunChild( uint LocalItemIndex )
void FSplitTask::RunChild( inout FSplitTask ParentTask, bool bActive, uint LocalItemIndex )
{
if ( !bActive )
return;
FSplitPatch Patch;
Patch.Decode( Encoded );
@@ -43,9 +43,7 @@ float3 DecodeBarycentrics( uint Encoded )
ByteAddressBuffer TessellationTable_Offsets;
ByteAddressBuffer TessellationTable_Verts;
ByteAddressBuffer TessellationTable_Indexes;
static const uint TessellationTable_Size = 16;
ByteAddressBuffer TessellationTable_VertsAndIndexes;
float InvDiceRate;
uint MaxPatchesPerGroup;
@@ -91,7 +89,8 @@ float3 GetTessFactors( FNaniteView NaniteView, float3 PointView[3] )
float3 GetSplitFactors( float3 TessFactors )
{
#if 1
return TessFactors / TessellationTable_Size;
return TessFactors / NANITE_TESSELLATION_TABLE_SIZE;
// TODO: Fix other modes to work with NANITE_TESSELLATION_TABLE_SIZE
#elif 1
const float SizeLog2 = log2( TessellationTable_Size );
@@ -123,18 +122,18 @@ float2 CalculateUVDerivativeForDomainPoint(
struct FTessellatedPatch
{
uint4 TableOffsets;
uint Pattern;
uint Swizzle;
uint TableOffset;
uint Pattern_NumVerts_NumTris;
//uint Swizzle;
void Init( float3 TessFactors, inout uint3 VertData )
void Init( float3 TessFactors, inout uint3 VertData, bool bImmediateTable )
{
TessFactors = ceil( TessFactors );
TessFactors = clamp( TessFactors, 1, TessellationTable_Size );
TessFactors = clamp( TessFactors, 1, NANITE_TESSELLATION_TABLE_SIZE );
float MaxTessFactor = max3( TessFactors.x, TessFactors.y, TessFactors.z );
Swizzle = 0b011010;
//Swizzle = 0b011010;
// TessFactors in descending order to reduce size of table.
// Rotate patch so TessFactors.x == MaxTessFactor
@@ -142,13 +141,13 @@ struct FTessellatedPatch
{
VertData = VertData.yzx;
TessFactors = TessFactors.yzx;
Swizzle = 0b110100;
//Swizzle = 0b110100;
}
else if( TessFactors.z == MaxTessFactor )
{
VertData = VertData.zxy;
TessFactors = TessFactors.zxy;
Swizzle = 0b101001;
//Swizzle = 0b101001;
}
// Sorting can flip winding which we need to undo later.
@@ -158,34 +157,43 @@ struct FTessellatedPatch
VertData.xy = VertData.yx;
TessFactors.yz = TessFactors.zy;
bFlipWinding = true;
Swizzle ^= 0b000111;
//Swizzle ^= 0b000111;
}
Pattern = uint(
uint Pattern = uint(
TessFactors.x +
TessFactors.y * TessellationTable_Size +
TessFactors.z * TessellationTable_Size * TessellationTable_Size -
(1 + TessellationTable_Size + TessellationTable_Size * TessellationTable_Size) );
TessFactors.y * NANITE_TESSELLATION_TABLE_PO2_SIZE +
TessFactors.z * NANITE_TESSELLATION_TABLE_PO2_SIZE * NANITE_TESSELLATION_TABLE_PO2_SIZE -
(1 + NANITE_TESSELLATION_TABLE_PO2_SIZE + NANITE_TESSELLATION_TABLE_PO2_SIZE * NANITE_TESSELLATION_TABLE_PO2_SIZE) );
TableOffsets = TessellationTable_Offsets.Load4( 4*2 * Pattern );
Pattern |= bFlipWinding ? 0x1000 : 0;
//Pattern |= Swizzle << 25;
Init( Pattern, bImmediateTable );
}
uint GetNumVerts() { return TableOffsets[2] - TableOffsets[0]; }
uint GetNumTris() { return TableOffsets[3] - TableOffsets[1]; }
// Initializes the patch from an already-encoded pattern word.
//   InPattern       - low 12 bits select the entry in TessellationTable_Offsets; the whole value
//                     (including any flag bits above them) is OR'ed into the packed member.
//   bImmediateTable - when true, the lookup is shifted by PO2_SIZE^3 entries to reach the
//                     second (immediate mode) section of the offsets table.
// Each table entry is two dwords: .x becomes TableOffset, .y is merged with the pattern.
// NOTE(review): .y presumably holds NumVerts/NumTris pre-shifted above the pattern bits - confirm against the table builder.
void Init( uint InPattern, bool bImmediateTable)
{
	const uint ImmediateSectionStart = NANITE_TESSELLATION_TABLE_PO2_SIZE * NANITE_TESSELLATION_TABLE_PO2_SIZE * NANITE_TESSELLATION_TABLE_PO2_SIZE;

	uint EntryIndex = InPattern & 0xfff;
	EntryIndex += bImmediateTable ? ImmediateSectionStart : 0u;

	// 8 bytes per entry (two uints).
	const uint2 Entry = TessellationTable_Offsets.Load2( 4 * 2 * EntryIndex );

	TableOffset = Entry.x;
	Pattern_NumVerts_NumTris = Entry.y | InPattern;
}
// Returns the low 13 bits of the packed word: the 12-bit pattern index plus the 0x1000 bit above it.
uint GetPattern()
{
	const uint PatternMask = 0x1FFFu;
	return Pattern_NumVerts_NumTris & PatternMask;
}
// Returns the 9-bit vertex count stored in bits [13, 21] of the packed word.
uint GetNumVerts()
{
	const uint Shifted = Pattern_NumVerts_NumTris >> 13;
	return Shifted & 0x1FFu;
}
// Returns the triangle count stored in the top bits (22 and up) of the packed word; no mask is needed.
uint GetNumTris()
{
	const uint NumTris = Pattern_NumVerts_NumTris >> 22;
	return NumTris;
}
float3 GetVert( uint VertIndex )
{
uint BarycentricsEncoded = TessellationTable_Verts.Load( 4 * ( TableOffsets.x + VertIndex ) );
uint BarycentricsEncoded = TessellationTable_VertsAndIndexes.Load( 4 * ( TableOffset + VertIndex ) );
return DecodeBarycentrics( BarycentricsEncoded );
}
uint3 GetIndexes( uint TriIndex )
{
uint PackedIndexes = TessellationTable_Indexes.Load( 4 * ( TableOffsets.y + TriIndex ) );
uint PackedIndexes = TessellationTable_VertsAndIndexes.Load( 4 * ( TableOffset + GetNumVerts() + TriIndex ) );
uint3 VertIndexes;
VertIndexes[0] = ( PackedIndexes >> 0 ) & 1023;
@@ -194,7 +202,7 @@ struct FTessellatedPatch
// Sorting TessFactors might have flipped the patch winding.
//bool bFlipWinding = Pattern > 0xfff;
bool bFlipWinding = Pattern & 0x1000;
bool bFlipWinding = Pattern_NumVerts_NumTris & 0x1000;
if( bFlipWinding )
VertIndexes.yz = VertIndexes.zy;
@@ -205,6 +213,7 @@ struct FTessellatedPatch
{
uint3 VertIndexes = GetIndexes( TriIndex );
uint Pattern = GetPattern();
uint3 Shift, Mask;
for( int i = 0; i < 3; i++ )
{
@@ -216,7 +225,7 @@ struct FTessellatedPatch
uint3 Encoded;
for( int i = 0; i < 3; i++ )
{
uint BarycentricsEncoded = TessellationTable_Verts.Load( 4 * ( TableOffsets.x + VertIndexes[i] ) );
uint BarycentricsEncoded = TessellationTable_VertsAndIndexes.Load( 4 * ( TableOffset + VertIndexes[i] ) );
uint3 Barycentrics;
Barycentrics.x = BarycentricsEncoded & 0xffff;
@@ -231,14 +240,14 @@ struct FTessellatedPatch
uint3 GetTessFactors()
{
uint Packed = Pattern & 0xfff;
Packed -= (1 + TessellationTable_Size + TessellationTable_Size * TessellationTable_Size);
uint Packed = GetPattern() & 0xfff;
Packed -= (1 + NANITE_TESSELLATION_TABLE_PO2_SIZE + NANITE_TESSELLATION_TABLE_PO2_SIZE * NANITE_TESSELLATION_TABLE_PO2_SIZE);
uint3 TessFactors;
for( int i = 0; i < 3; i++ )
{
TessFactors[i] = Packed % TessellationTable_Size;
Packed /= TessellationTable_Size;
TessFactors[i] = Packed % NANITE_TESSELLATION_TABLE_PO2_SIZE;
Packed /= NANITE_TESSELLATION_TABLE_PO2_SIZE;
}
return TessFactors;
}
@@ -247,9 +256,8 @@ struct FTessellatedPatch
FTessellatedPatch WaveReadLaneAt( FTessellatedPatch In, uint SrcIndex )
{
FTessellatedPatch Out;
Out.TableOffsets= WaveReadLaneAt( In.TableOffsets, SrcIndex );
Out.Pattern = WaveReadLaneAt( In.Pattern, SrcIndex );
Out.Swizzle = WaveReadLaneAt( In.Swizzle, SrcIndex );
Out.TableOffset = WaveReadLaneAt( In.TableOffset, SrcIndex );
Out.Pattern_NumVerts_NumTris = WaveReadLaneAt( In.Pattern_NumVerts_NumTris, SrcIndex );
return Out;
}
+2 -4
View File
@@ -290,10 +290,8 @@ void DistributeWork( FTask Task, uint GroupIndex, uint NumWorkItems )
FTask ChildTask = Task.CreateChild( SourceIndex );
if( WorkHead + LaneIndex < WorkTail )
{
ChildTask.RunChild( LocalItemIndex );
}
bool bActive = ( WorkHead + LaneIndex < WorkTail );
ChildTask.RunChild( Task, bActive, LocalItemIndex );
// Did 1 wave of work
WorkHead += LaneCount;
@@ -101,6 +101,10 @@
#define NANITE_STREAMING_PRIORITY_CATEGORY_MASK ((1 << NANITE_NUM_STREAMING_PRIORITY_CATEGORY_BITS) - 1)
#define NANITE_MAX_PRIORITY_BEFORE_PARENTS 0xFFFFFFE0u
// Maximum tessellation factor per patch edge; FTessellatedPatch::Init clamps factors to [1, this value].
#define NANITE_TESSELLATION_TABLE_SIZE 14
// Stride used when packing the three tess factors into a pattern index (x + y * stride + z * stride^2).
// NOTE(review): presumably kept at a power of two >= NANITE_TESSELLATION_TABLE_SIZE so the pattern still fits in 12 bits - confirm.
#define NANITE_TESSELLATION_TABLE_PO2_SIZE 16
// Largest tess factor for which the cluster rasterizer will dice a triangle immediately.
#define NANITE_TESSELLATION_TABLE_IMMEDIATE_SIZE 8
#define NANITE_VIEW_FLAG_HZBTEST 0x1
#define NANITE_VIEW_FLAG_NEAR_CLIP 0x2
#define NANITE_VIEW_FLAG_DISTANCE_CULL 0x4