Files
UnrealEngineUWP/Engine/Shaders/Private/ComputeShaderOutputCommon.ush
rune stubbe 799ece44dd Nanite material optimizations
-Use groupsize=32 for evaluation when running wave32
-Use morton order during binning to align thread group footprint with DCC block footprint
-Double-ended binning that allocates aligned full tiles with a single material from one end and any loose quads/pixels from the other to minimize the number of partial DCC writes during material evaluation
-Added MAX_OCCUPANCY hint to compute materials

#rb graham.wihlidal
[FYI] brian.karis, jamie.hayes, ben.woodhouse
#jira UE-187341

[CL 29394185 by rune stubbe in ue5-main branch]
2023-11-02 22:27:01 -04:00

357 lines
10 KiB
Plaintext

// Copyright Epic Games, Inc. All Rights Reserved.
/*=============================================================================
ComputeShaderOutputCommon.ush: Allows CS input/output to be passed into functions
through a single struct, allowing for more readable code
(fewer #ifdefs, reducing the boolean hell)
=============================================================================*/
COMPILER_ALLOW_CS_DERIVATIVES
#if IS_NANITE_PASS
MAX_OCCUPANCY
#endif
#include "ShaderOutputCommon.ush"
#include "GammaCorrectionCommon.ush"
#include "VariableRateShading/VRSShadingRateCommon.ush"
#include "Nanite/NaniteShadeCommon.ush"
#if PLATFORM_SUPPORTS_SHADER_ROOT_CONSTANTS
#include "/Engine/Shared/HLSLReservedSpaces.h"
// Root constants supplied per dispatch when PLATFORM_SUPPORTS_SHADER_ROOT_CONSTANTS is set.
// MainCS reads these instead of the PassData uniform used on other platforms.
struct FUERootConstants
{
// Shading bin index for this dispatch (read as ShadingBin in MainCS).
uint RecordIndex;
// .x = quad binning flag (non-zero enables it), .y = DataByteOffset, .z = unused.
uint3 PassData;
};
// RecordIndex = ShadingBin
// PassData .x = Quad Binning Flag, .y = DataByteOffset, .z = Unused
ConstantBuffer<FUERootConstants> UERootConstants : UE_HLSL_REGISTER(b, 0, UE_HLSL_SPACE_SHADER_ROOT_CONSTANTS);
#else
// Fallback when root constants are unavailable: the same values packed into one uniform.
// Note the shading bin occupies .x here, so the remaining fields are shifted by one
// component relative to FUERootConstants::PassData.
// .x = shading bin, .y = Quad Binning Flag, .z = DataByteOffset, .w = Unused
uint4 PassData;
#endif
// TODO: Is this ever used? (see bHighPrecisionGBuffers)
// Hard-coded to 0 here, so ShadePixel always applies the sRGB encode to MRT[3].
#define HIGH_PRECISION_GBUFFERS 0
// Compile-time sanity checks for the Substrate path: ExportPixel writes exactly three
// slices of the uint array UAV, and the top layer target naming below only handles
// up to PIXELSHADEROUTPUT_MRT3.
#if SUBSTRATE_OPAQUE_DEFERRED
#if SUBSTRATE_BASE_PASS_MRT_OUTPUT_COUNT != 3
#error Substrate SUBSTRATE_BASE_PASS_MRT_OUTPUT_COUNT has been updated but not the uint MRTs
#endif
#if PIXELSHADEROUTPUT_MRT4
#error Substrate cannot map to such a case
#endif
#endif
// Output UAVs, one per enabled PIXELSHADEROUTPUT_MRTn; ExportPixel writes them per pixel.
// For OutTarget0/OutTarget1 both preprocessor branches declare the same resource; the
// dual-source variant only differs by its slot comment.
#if PIXELSHADEROUTPUT_MRT0
#if DUAL_SOURCE_COLOR_BLENDING_ENABLED && MATERIAL_WORKS_WITH_DUAL_SOURCE_COLOR_BLENDING
RWTexture2D<float4> OutTarget0; // DUAL_SOURCE_BLENDING_SLOT(0)
#else
RWTexture2D<float4> OutTarget0;
#endif
#endif
#if PIXELSHADEROUTPUT_MRT1
#if DUAL_SOURCE_COLOR_BLENDING_ENABLED && MATERIAL_WORKS_WITH_DUAL_SOURCE_COLOR_BLENDING
RWTexture2D<float4> OutTarget1; // DUAL_SOURCE_BLENDING_SLOT(1)
#else
RWTexture2D<float4> OutTarget1;
#endif
#endif
#if PIXELSHADEROUTPUT_MRT2
RWTexture2D<float4> OutTarget2;
#endif
#if PIXELSHADEROUTPUT_MRT3
RWTexture2D<float4> OutTarget3;
#endif
#if SUBSTRATE_OPAQUE_DEFERRED
// Substrate payload: indexed by uint3(PixelPos, Slice) in ConditionalExport.
RWTexture2DArray<uint> OutTargets;
// Substrate top layer data takes the first OutTargetN name after the highest enabled
// float4 target, hence the cascade on the highest enabled PIXELSHADEROUTPUT_MRTn.
#if PIXELSHADEROUTPUT_MRT3
RWTexture2D<SUBSTRATE_TOP_LAYER_TYPE> OutTarget4;
#elif PIXELSHADEROUTPUT_MRT2
RWTexture2D<SUBSTRATE_TOP_LAYER_TYPE> OutTarget3;
#elif PIXELSHADEROUTPUT_MRT1
RWTexture2D<SUBSTRATE_TOP_LAYER_TYPE> OutTarget2;
#else
RWTexture2D<SUBSTRATE_TOP_LAYER_TYPE> OutTarget1;
#endif
#else // SUBSTRATE_OPAQUE_DEFERRED
// Without Substrate the remaining targets are plain float4 outputs.
#if PIXELSHADEROUTPUT_MRT4
RWTexture2D<float4> OutTarget4;
#endif
#if PIXELSHADEROUTPUT_MRT5
RWTexture2D<float4> OutTarget5;
#endif
#if PIXELSHADEROUTPUT_MRT6
RWTexture2D<float4> OutTarget6;
#endif
#if PIXELSHADEROUTPUT_MRT7
RWTexture2D<float4> OutTarget7;
#endif
#endif // SUBSTRATE_OPAQUE_DEFERRED
// Shades a single pixel and returns the pixel shader outputs without writing anything.
//   SVPositionXY - XY of the SvPosition handed to the pixel shader entry point.
//   QuadIndex    - stored in the Nanite interpolants' TileIndex for Nanite passes; otherwise unused here.
// Returns the FPixelShaderOut filled in by FPixelShaderInOut_MainPS (only invoked when
// PIXELSHADEROUTPUT_BASEPASS is set; otherwise the output stays zero-initialized).
FPixelShaderOut ShadePixel(const float2 SVPositionXY, uint QuadIndex)
{
#if PIXELSHADEROUTPUT_INTERPOLANTS || PIXELSHADEROUTPUT_BASEPASS
#if IS_NANITE_PASS
FNaniteFullscreenVSToPS NaniteInterpolants = (FNaniteFullscreenVSToPS)0;
NaniteInterpolants.TileIndex = QuadIndex;
#else
FVertexFactoryInterpolantsVSToPS Interpolants = (FVertexFactoryInterpolantsVSToPS)0;
#endif
#endif
// There is no rasterizer here, so the position is synthesized with z = 0 and w = 1.
const float4 SvPosition = float4(SVPositionXY, 0.0f, 1.0f);
#if IS_NANITE_PASS && (PIXELSHADEROUTPUT_INTERPOLANTS || PIXELSHADEROUTPUT_BASEPASS)
FVertexFactoryInterpolantsVSToPS Interpolants = (FVertexFactoryInterpolantsVSToPS)0;
// NaniteInterpolants was zero-initialized above and only TileIndex was set, so this copies 0.
Interpolants.ViewIndex = NaniteInterpolants.ViewIndex; // TODO: NANITE_MATERIAL_MULTIVIEW
#if INSTANCED_STEREO
// Revisit if we need to support > 1 instanced view or non side-by-side views
// Eye 1 is selected when the pixel lies at or beyond ViewRectMin.x + ViewSizeAndInvSize.x.
Interpolants.EyeIndex = (SvPosition.x >= (View.ViewRectMin.x + View.ViewSizeAndInvSize.x)) ? 1 : 0;
#endif
#endif
FPixelShaderIn PixelShaderIn = (FPixelShaderIn)0;
FPixelShaderOut PixelShaderOut = (FPixelShaderOut)0;
PixelShaderIn.SvPosition = SvPosition;
// Nanite does not support OPTIONAL_IsFrontFace, Instead, Nanite determines this in GetMaterialPixelParameters().
PixelShaderIn.bIsFrontFace = false;
#if PIXELSHADEROUTPUT_BASEPASS
FBasePassInterpolantsVSToPS BasePassInterpolants = (FBasePassInterpolantsVSToPS)0;
FPixelShaderInOut_MainPS(Interpolants, BasePassInterpolants, PixelShaderIn, PixelShaderOut);
#endif
#if !HIGH_PRECISION_GBUFFERS
// Encode manually since the result is stored through a UAV rather than a render target.
// NOTE(review): presumably the render target path relied on hardware sRGB conversion - confirm.
PixelShaderOut.MRT[3] = float4(LinearToSrgb(PixelShaderOut.MRT[3].rgb), PixelShaderOut.MRT[3].a); // BaseColor is sRGB
#endif
return PixelShaderOut;
}
#define CONDITIONAL_EXPORT 1
#if SUBSTRATE_OPAQUE_DEFERRED
// Stores slice 'Index' of the shaded pixel's Substrate output into the OutTargets array UAV.
// With CONDITIONAL_EXPORT enabled, the store is skipped when the value is zero, unless
// 'Forced' requests an unconditional write.
void ConditionalExport(bool Forced, uint2 PixelPos, FPixelShaderOut ShadedPixel, uint Index)
{
#if CONDITIONAL_EXPORT
	// Early out: nothing worth writing and the caller did not insist on a write.
	BRANCH
	if (!Forced && ShadedPixel.SubstrateOutput[Index] == 0)
	{
		return;
	}
#endif

	const uint3 SliceCoord = uint3(PixelPos, Index);
	OutTargets[SliceCoord] = ShadedPixel.SubstrateOutput[Index];
}
// Stores the Substrate top layer data for one pixel into the supplied UAV.
// With CONDITIONAL_EXPORT enabled, the store is skipped when every component of the
// data is zero, unless 'Forced' requests an unconditional write.
void ConditionalExport(bool Forced, RWTexture2D<SUBSTRATE_TOP_LAYER_TYPE> OutTarget, uint2 PixelPos, SUBSTRATE_TOP_LAYER_TYPE TopLayerData)
{
#if CONDITIONAL_EXPORT
	// Early out: no component is set and the caller did not insist on a write.
	BRANCH
	if (!Forced && !any(TopLayerData != 0))
	{
		return;
	}
#endif

	OutTarget[PixelPos] = TopLayerData;
}
#endif
// Writes the outputs of one shaded pixel to every enabled output UAV at PixelPos.
//   PixelPos    - destination texel coordinate, used for all targets.
//   ShadedPixel - outputs produced by ShadePixel.
// Coverage and depth outputs are not exported yet (see the TODOs below).
void ExportPixel(const uint2 PixelPos, FPixelShaderOut ShadedPixel)
{
#if PIXELSHADEROUTPUT_COVERAGE || PIXELSHADEROUTPUT_A2C
// TODO: OutCoverage = PixelShaderOut.Coverage;
#endif
#if OUTPUT_PIXEL_DEPTH_OFFSET
// TODO: OutDepth = PixelShaderOut.Depth;
#endif
#if PIXELSHADEROUTPUT_MRT0
OutTarget0[PixelPos] = ShadedPixel.MRT[0];
#endif
#if PIXELSHADEROUTPUT_MRT1
OutTarget1[PixelPos] = ShadedPixel.MRT[1];
#endif
#if PIXELSHADEROUTPUT_MRT2
OutTarget2[PixelPos] = ShadedPixel.MRT[2];
#endif
#if PIXELSHADEROUTPUT_MRT3
OutTarget3[PixelPos] = ShadedPixel.MRT[3];
#endif
#if SUBSTRATE_OPAQUE_DEFERRED
// In this case, here is the gbuffer pattern
// MRT0 is pixel color
// MRT1 is velocity if enabled or precomputed shadow if velocity is disabled and precomputed shadow enabled
// MRT2 is precomputed shadow if both velocity and prec shadow are enabled.
// After, Substrate top layer data appended. Remaining Substrate outputs are in the 2d array UAV
// Slice 0 is always written; slices 1 and 2 are written only when non-zero
// (while CONDITIONAL_EXPORT is enabled).
ConditionalExport(true, PixelPos, ShadedPixel, 0);
ConditionalExport(false, PixelPos, ShadedPixel, 1);
ConditionalExport(false, PixelPos, ShadedPixel, 2);
// The top layer data is always written; the target name follows the same cascade as its declaration.
#if PIXELSHADEROUTPUT_MRT3
ConditionalExport(true, OutTarget4, PixelPos, ShadedPixel.SubstrateTopLayerData);
#elif PIXELSHADEROUTPUT_MRT2
ConditionalExport(true, OutTarget3, PixelPos, ShadedPixel.SubstrateTopLayerData);
#elif PIXELSHADEROUTPUT_MRT1
ConditionalExport(true, OutTarget2, PixelPos, ShadedPixel.SubstrateTopLayerData);
#else
ConditionalExport(true, OutTarget1, PixelPos, ShadedPixel.SubstrateTopLayerData);
#endif
#else // SUBSTRATE_OPAQUE_DEFERRED
#if PIXELSHADEROUTPUT_MRT4
OutTarget4[PixelPos] = ShadedPixel.MRT[4];
#endif
#if PIXELSHADEROUTPUT_MRT5
OutTarget5[PixelPos] = ShadedPixel.MRT[5];
#endif
#if PIXELSHADEROUTPUT_MRT6
OutTarget6[PixelPos] = ShadedPixel.MRT[6];
#endif
#if PIXELSHADEROUTPUT_MRT7
OutTarget7[PixelPos] = ShadedPixel.MRT[7];
#endif
#endif // SUBSTRATE_OPAQUE_DEFERRED
}
#define VIS_HELPER_LANES 0
// Shades one pixel and exports the result to up to four pixel positions.
//   ShadingBin      - only referenced by a disabled debug visualization below.
//   PixelPos        - position of the shaded pixel; the other exports are offsets from it.
//   SVPositionXY    - position passed to ShadePixel.
//   QuadIndex       - forwarded to ShadePixel.
//   DispatchIndex   - only referenced by a disabled debug visualization below.
//   PixelWriteMask  - bit 0: write PixelPos, bit 1: also write (+1, 0), bit 2: also write (0, +1),
//                     bit 3: also write (+1, +1). The same shaded value is copied to each.
//   HelperLaneCount - only used by the VIS_HELPER_LANES visualization.
void ProcessPixel(uint ShadingBin, const uint2 PixelPos, const float2 SVPositionXY, uint QuadIndex, uint DispatchIndex, uint PixelWriteMask, uint HelperLaneCount)
{
// All lanes shade (regardless of export - so ddx/ddy are valid)
FPixelShaderOut ShadedPixel = ShadePixel(SVPositionXY, QuadIndex);
// Debug visualizations: at most one branch is compiled in, and each overrides MRT[3].rgb
// after ShadePixel has already applied its sRGB encode.
#if VIS_HELPER_LANES
ShadedPixel.MRT[3].rgb = ColorMapTurbo(float(HelperLaneCount) / 4.0f);
#elif 0
ShadedPixel.MRT[3].rgb = ColorMapTurbo(50.0);
#elif 0
// Coherency vis
float R = (DispatchIndex & 0xFFu) / 255.0f;
float G = ((DispatchIndex & 0xFF00u) >> 8u) / 255.0f;
float B = ((DispatchIndex & 0xFF0000u) >> 16u) / 255.0f;
ShadedPixel.MRT[3].rgb = float3(R, G, B);
#elif 0
ShadedPixel.MRT[3].rgb = IntToColor(ShadingBin);
#elif 0
ShadedPixel.MRT[3].rgb = IntToColor(QuadIndex);
#elif 0
// NOTE(review): ShadingRate is not declared in this function - this branch would need it
// passed in (or derived) before it can be enabled.
ShadedPixel.MRT[3].rgb = VisualizeShadingRate(ShadingRate).rgb;
#endif
// Disable helper lanes from final export
BRANCH
if (PixelWriteMask & 1u)
{
ExportPixel(PixelPos, ShadedPixel);
}
BRANCH
if (PixelWriteMask & 2u) // Copy H
{
ExportPixel(PixelPos + uint2(1, 0), ShadedPixel);
}
BRANCH
if (PixelWriteMask & 4u) // Copy V
{
ExportPixel(PixelPos + uint2(0, 1), ShadedPixel);
}
BRANCH
if (PixelWriteMask & 8u) // Copy D
{
ExportPixel(PixelPos + uint2(1, 1), ShadedPixel);
}
}
// Compute entry point: each thread fetches one packed element of the current shading bin,
// decodes its pixel position / VRS shift / write mask, then shades and exports it.
//   ThreadIndex - flat index of the thread inside its group.
//   GroupID     - index of the thread group inside the dispatch.
// With quad binning, every element is a 2x2 quad handled by four consecutive threads
// (element index = PixelIndex >> 2, corner = ThreadIndex & 3); otherwise there is one
// element per thread.
[numthreads(COMPUTE_MATERIAL_GROUP_SIZE, 1, 1)]
void MainCS(uint ThreadIndex : SV_GroupIndex, uint GroupID : SV_GroupID)
{
#if PLATFORM_SUPPORTS_SHADER_ROOT_CONSTANTS
	const uint ShadingBin = UERootConstants.RecordIndex;
	const bool bQuadBinning = UERootConstants.PassData.x != 0u;
	const uint DataByteOffset = UERootConstants.PassData.y;
#else
	const uint ShadingBin = PassData.x;
	const bool bQuadBinning = PassData.y != 0u;
	const uint DataByteOffset = PassData.z;
#endif

	const uint PixelIndex = (GroupID * COMPUTE_MATERIAL_GROUP_SIZE) + ThreadIndex;

	// .x = number of elements in the bin; .z * 4 is used below as the byte offset of the
	// bin's first element relative to DataByteOffset.
	const uint3 ShadingBinMeta = Nanite.ShadingBinData.Load3(ShadingBin * NANITE_SHADING_BIN_META_BYTES);
	const uint ElementCount = ShadingBinMeta.x;
	const uint ElementIndex = bQuadBinning ? (PixelIndex >> 2) : PixelIndex;

	// Threads past the end of the bin have nothing to do. With quad binning, the four
	// threads sharing an element index leave together.
	BRANCH
	if (ElementIndex >= ElementCount)
	{
		return;
	}

	uint2 PixelPos;
	uint2 VRSShift;
	uint PixelWriteMask;
	// Zero by default: the non-quad path has no helper lanes, and the value is handed to
	// ProcessPixel on both paths, so it must never be left uninitialized.
	uint HelperLaneCount = 0u;

	BRANCH
	if (bQuadBinning)
	{
		// 8 bytes per quad.
		// .x: 14-bit X at bit 0, 14-bit Y at bit 14, 1-bit VRS shift X at bit 28, Y at bit 29.
		// .y: four 4-bit write masks, one per quad corner.
		// (Assumes BitFieldExtractU32(Value, NumBits, Offset) - TODO confirm.)
		const uint2 PackedElement = Nanite.ShadingBinData.Load2(DataByteOffset + (ShadingBinMeta.z * 4 + ElementIndex * 8));
		const uint CornerIndex = (ThreadIndex & 3u);
		const uint2 TopLeft = uint2(BitFieldExtractU32(PackedElement.x, 14, 0), BitFieldExtractU32(PackedElement.x, 14, 14));

		VRSShift = uint2(BitFieldExtractU32(PackedElement.x, 1, 28), BitFieldExtractU32(PackedElement.x, 1, 29));
		PixelWriteMask = BitFieldExtractU32(PackedElement.y, 4, CornerIndex * 4u);

		// Corner offset within the quad, scaled up by the VRS shift.
		PixelPos = TopLeft + (uint2(CornerIndex & 1u, CornerIndex >> 1u) << VRSShift);

		// Only needed for the VIS_HELPER_LANES debug view.
		HelperLaneCount = VIS_HELPER_LANES ? (4u - countbits(PixelWriteMask)) : 0u;
	}
	else
	{
		// 4 bytes per pixel: 13-bit X at bit 0, 13-bit Y at bit 13, 1-bit VRS shift X at
		// bit 26, Y at bit 27, write mask in the top 4 bits.
		const uint PackedElement = Nanite.ShadingBinData.Load(DataByteOffset + (ShadingBinMeta.z * 4 + ElementIndex * 4));

		PixelPos = uint2(BitFieldExtractU32(PackedElement, 13, 0), BitFieldExtractU32(PackedElement, 13, 13));
		VRSShift = uint2(BitFieldExtractU32(PackedElement, 1, 26), BitFieldExtractU32(PackedElement, 1, 27));
		PixelWriteMask = PackedElement >> 28;
	}

	// Shade at the center of the footprint: half of (1 << VRSShift) pixels per axis.
	const float2 SVPositionXY = PixelPos + ((1u << VRSShift) * 0.5f);

	ProcessPixel(ShadingBin, PixelPos, SVPositionXY, ElementIndex, PixelIndex, PixelWriteMask, HelperLaneCount);
}