Files
UnrealEngineUWP/Engine/Shaders/Private/RayTracing/MaterialSort.usf
Marc Audy 11f5b21210 Merging //UE5/Release-Engine-Staging @ 13752110 to Main (//UE5/Main)
#rnx

[CL 13753156 by Marc Audy in ue5-main branch]
2020-06-23 18:40:00 -04:00

138 lines
3.2 KiB
Plaintext

#include "../Common.ush"
#include "RayTracingCommon.ush"
#define MATERIAL_SORT_SHADER_VERSION 0xd74cbc88 // Change to force shader compilation of this shader
// DIM_WAVE_OPS chooses between the wave-intrinsic prefix sum (non-zero) and the
// plain per-thread loop (zero) in MaterialSortLocal. Defaults to 0 when not defined.
#ifndef DIM_WAVE_OPS
#define DIM_WAVE_OPS 0
#endif // DIM_WAVE_OPS
// DIM_SORT_SIZE selects NUM_ELEMENTS, the number of consecutive MaterialBuffer entries
// that one thread group sorts (its "sort window"): 256, 512, 1024, 2048 or 4096.
// Any other value is rejected at compile time.
#if DIM_SORT_SIZE == 0
#define NUM_ELEMENTS 256
#elif DIM_SORT_SIZE == 1
#define NUM_ELEMENTS 512
#elif DIM_SORT_SIZE == 2
#define NUM_ELEMENTS 1024
#elif DIM_SORT_SIZE == 3
#define NUM_ELEMENTS 2048
#elif DIM_SORT_SIZE == 4
#define NUM_ELEMENTS 4096
#else
#error Unknown sorting size
#endif
// Number of histogram bins; an entry's bin is SortKey % NUM_BINS.
#define NUM_BINS 128 // Max sort window (4096) / min threads per wave (32)
// Per-group histogram: Bins[b] counts the entries of the current window that fall in bin b.
groupshared uint Bins[NUM_BINS];
#if DIM_WAVE_OPS
// Smallest wave size the wave-ops path is sized for; WaveSums has one slot per wave
// needed to cover NUM_BINS threads at that wave size.
#define MIN_SUPPORTED_WAVE_SIZE 32
#define NUM_WAVE_SUMS ((NUM_BINS+MIN_SUPPORTED_WAVE_SIZE-1) / MIN_SUPPORTED_WAVE_SIZE)
// Per-wave totals of the bin counts, later replaced in place by their prefix sums.
groupshared uint WaveSums[NUM_WAVE_SUMS];
#define Offsets Bins // No need for a separate array of offsets as prefix sum is computed in-place
#else // DIM_WAVE_OPS
// Start offset of each bin inside the group's window (prefix sum over Bins).
groupshared uint Offsets[NUM_BINS];
#endif // DIM_WAVE_OPS
// Entries at buffer index >= NumTotalEntries are skipped by the sort.
// NOTE(review): declared int but compared against uint indices - presumably never negative; confirm with the dispatching code.
int NumTotalEntries;
// Buffer that is sorted in place: each thread group reads its window from it and writes the reordered entries back.
RWStructuredBuffer<FDeferredMaterialPayload> MaterialBuffer;
// Thread group width: 256 threads for windows below 1024 entries, 512 otherwise.
// Each thread therefore handles NUM_ELEMENTS / NUM_THREADS_X entries.
#if NUM_ELEMENTS < 1024
#define NUM_THREADS_X 256
#else
#define NUM_THREADS_X 512
#endif
// Simple local bin sort based on shared atomics
//
// Each thread group reorders its own window of NUM_ELEMENTS entries of MaterialBuffer in place
// so that entries with the same bin (SortKey % NUM_BINS) become contiguous, with bins laid out
// in ascending order. The position of an entry inside its bin comes from the value returned by
// the atomic increment, so it is not guaranteed to match the input order.
//
// GroupThread - index of the thread within its group (SV_GroupThreadID).
// GroupIndex  - index of the thread group (SV_GroupID); selects which window is sorted.
//
// Phases, each separated by a group-wide barrier:
//   1. clear the shared counters
//   2. load entries and build the per-bin histogram
//   3. turn the histogram into per-bin start offsets (prefix sum)
//   4. scatter the loaded entries to their sorted positions
[numthreads(NUM_THREADS_X, 1, 1)]
void MaterialSortLocal(uint GroupThread : SV_GroupThreadID, uint GroupIndex : SV_GroupID)
{
// First buffer index of the window owned by this group
const uint GroupBase = GroupIndex * NUM_ELEMENTS;
// One thread per bin clears the shared counters. With DIM_WAVE_OPS, Offsets is an alias of Bins,
// so the second store writes the same location again.
if (GroupThread < NUM_BINS)
{
Bins[GroupThread] = 0;
Offsets[GroupThread] = 0;
}
GroupMemoryBarrierWithGroupSync();
// Per-thread copies of the entries this thread handles and the slot each one received inside its bin
FDeferredMaterialPayload Data[NUM_ELEMENTS / NUM_THREADS_X];
uint Slot[NUM_ELEMENTS / NUM_THREADS_X];
// Compute histogram
// Thread t handles window elements t, t + NUM_THREADS_X, ...; i / NUM_THREADS_X is the
// matching index into the per-thread arrays.
for (int i = GroupThread; i < NUM_ELEMENTS ; i += NUM_THREADS_X)
{
uint Index = GroupBase + i;
// Elements past the end of the valid range are neither loaded nor counted
if (Index < NumTotalEntries)
{
Data[i/ NUM_THREADS_X] = MaterialBuffer[Index];
uint Bin = Data[i / NUM_THREADS_X].SortKey % NUM_BINS;
// The count before the increment becomes this entry's slot within the bin
InterlockedAdd(Bins[Bin], 1, Slot[i/ NUM_THREADS_X]);
}
}
GroupMemoryBarrierWithGroupSync();
// Compute prefix sum
#if DIM_WAVE_OPS
{
// NOTE(review): assumes lanes of a wave map to consecutive GroupThread values - confirm for target platforms
const uint WaveIndex = GroupThread / WaveGetLaneCount();
const uint LaneIndex = WaveGetLaneIndex();
uint Value = 0;
// Step 1: threads that own a bin read its count, and each wave stores the total of its counts.
// Value is kept in a register because Bins is overwritten with offsets in step 3.
if (GroupThread < NUM_BINS)
{
Value = Bins[GroupThread];
uint ThisWaveSum = WaveActiveSum(Value);
if (LaneIndex == 0)
{
WaveSums[WaveIndex] = ThisWaveSum;
}
}
GroupMemoryBarrierWithGroupSync();
// Step 2: the first wave replaces the per-wave totals with their prefix sums,
// i.e. the combined count of all preceding waves.
if (WaveIndex == 0 && LaneIndex < NUM_WAVE_SUMS)
{
WaveSums[LaneIndex] = WavePrefixSum(WaveSums[LaneIndex]);
}
GroupMemoryBarrierWithGroupSync();
// Step 3: bin offset = count of all preceding waves + count of preceding bins in this wave.
// Offsets aliases Bins here, so the result overwrites the bin's count in place.
if (GroupThread < NUM_BINS)
{
Offsets[GroupThread] = WaveSums[WaveIndex] + WavePrefixSum(Value);
}
}
#else // DIM_WAVE_OPS
// One thread per bin sums the counts of all lower-numbered bins into Offsets (cleared at the start)
if (GroupThread < NUM_BINS)
{
// Naive implementation that trades instructions for fewer barriers
for (int i = 0; i < GroupThread; i++)
{
Offsets[GroupThread] += Bins[i];
}
}
#endif // DIM_WAVE_OPS
GroupMemoryBarrierWithGroupSync();
// Write data
// Uses the copies loaded during the histogram pass; MaterialBuffer is not read again.
for (int i = GroupThread; i < NUM_ELEMENTS; i += NUM_THREADS_X)
{
uint Index = GroupBase + i;
if (Index < NumTotalEntries)
{
uint Bin = Data[i / NUM_THREADS_X].SortKey % NUM_BINS;
// Destination = start of the window + start of the bin + slot within the bin
uint DestIndex = GroupBase + Offsets[Bin] + Slot[i / NUM_THREADS_X];
MaterialBuffer[DestIndex] = Data[i / NUM_THREADS_X];
}
}
}