UnrealEngineUWP/Engine/Shaders/Private/RayTracing/MaterialSort.usf

#include "../Common.ush"
#include "RayTracingCommon.ush"

#define MATERIAL_SORT_SHADER_VERSION 0xd74cbc88 // Change to force shader compilation of this shader

// DIM_WAVE_OPS selects the wave-intrinsic prefix sum in MaterialSortLocal.
// When the permutation does not define it, fall back to the groupshared-only implementation.
#ifndef DIM_WAVE_OPS
	#define DIM_WAVE_OPS 0
#endif // DIM_WAVE_OPS

// DIM_SORT_SIZE selects NUM_ELEMENTS: the number of consecutive MaterialBuffer entries
// (the sort window) that a single thread group bins in MaterialSortLocal.
#if DIM_SORT_SIZE == 0
	#define NUM_ELEMENTS 256

#elif DIM_SORT_SIZE == 1
	#define NUM_ELEMENTS 512

#elif DIM_SORT_SIZE == 2
	#define NUM_ELEMENTS 1024

#elif DIM_SORT_SIZE == 3
#define NUM_ELEMENTS 2048

#elif DIM_SORT_SIZE == 4
#define NUM_ELEMENTS 4096


#else
	// Any other DIM_SORT_SIZE value is rejected at compile time.
	#error Unknown sorting size
#endif

#define NUM_BINS 128 // Max sort window (4096) / min threads per wave (32)
// Per-bin entry counters for the current sort window (bin = SortKey % NUM_BINS).
groupshared uint Bins[NUM_BINS];

#if DIM_WAVE_OPS
// Smallest wave width the wave-ops path is sized for; WaveSums holds one partial sum per
// wave needed to cover NUM_BINS threads at that width.
#define MIN_SUPPORTED_WAVE_SIZE 32
#define NUM_WAVE_SUMS ((NUM_BINS+MIN_SUPPORTED_WAVE_SIZE-1) / MIN_SUPPORTED_WAVE_SIZE)
groupshared uint WaveSums[NUM_WAVE_SUMS];
#define Offsets Bins // No need for a separate array of offsets as prefix sum is computed in-place
#else // DIM_WAVE_OPS
// Exclusive prefix sum of Bins: index of the first output slot of each bin within the sort window.
groupshared uint Offsets[NUM_BINS];
#endif // DIM_WAVE_OPS

// Number of valid entries in MaterialBuffer; entries at or beyond this index are skipped by the sort.
int NumTotalEntries;
// Payloads to sort. MaterialSortLocal reads each sort window and writes it back in place.
RWStructuredBuffer<FDeferredMaterialPayload> MaterialBuffer;

// Thread group width. Each thread handles NUM_ELEMENTS / NUM_THREADS_X entries
// (between 1 and 8 across the supported sort sizes).
#if NUM_ELEMENTS < 1024
#define NUM_THREADS_X 256
#else
#define NUM_THREADS_X  512
#endif

// Simple local bin sort based on shared atomics.
//
// Each thread group processes one window of NUM_ELEMENTS consecutive MaterialBuffer entries,
// starting at GroupIndex * NUM_ELEMENTS, in three phases separated by group barriers:
//   1. Histogram:  every entry atomically increments Bins[SortKey % NUM_BINS]; the pre-increment
//                  value returned by the atomic is kept as the entry's slot inside its bin.
//   2. Prefix sum: an exclusive scan of Bins gives the first output position of every bin.
//   3. Scatter:    every entry is written back to GroupBase + Offsets[Bin] + Slot.
//
// Entries whose index is at or beyond NumTotalEntries are neither read nor written.
// Entries end up grouped by bin (SortKey % NUM_BINS); the order inside a bin follows the order
// in which the atomics happened to execute.
//
// GroupThread - thread index inside the group (SV_GroupThreadID).
// GroupIndex  - index of the thread group, i.e. of the sort window (SV_GroupID).
[numthreads(NUM_THREADS_X, 1, 1)]
void MaterialSortLocal(uint GroupThread : SV_GroupThreadID, uint GroupIndex : SV_GroupID)
{
	const uint GroupBase = GroupIndex * NUM_ELEMENTS;

	// Clear the histogram. Offsets needs no separate clear: with wave ops it aliases Bins,
	// and otherwise every element is fully overwritten by the prefix sum before it is read.
	if (GroupThread < NUM_BINS)
	{
		Bins[GroupThread] = 0;
	}

	GroupMemoryBarrierWithGroupSync();

	// Per-thread copies of the entries this thread owns and the slot each one got inside its bin.
	FDeferredMaterialPayload Data[NUM_ELEMENTS / NUM_THREADS_X];
	uint Slot[NUM_ELEMENTS / NUM_THREADS_X];

	// Compute histogram

	for (uint ReadOffset = GroupThread; ReadOffset < NUM_ELEMENTS; ReadOffset += NUM_THREADS_X)
	{
		const uint Index = GroupBase + ReadOffset;
		if (Index < (uint)NumTotalEntries)
		{
			const uint Local = ReadOffset / NUM_THREADS_X;

			Data[Local] = MaterialBuffer[Index];
			const uint Bin = Data[Local].SortKey % NUM_BINS;

			// The original (pre-add) counter value is this entry's position inside the bin.
			InterlockedAdd(Bins[Bin], 1, Slot[Local]);
		}
	}

	GroupMemoryBarrierWithGroupSync();

	// Compute prefix sum

#if DIM_WAVE_OPS
	{
		// NOTE(review): assumes threads map to waves in linear GroupThread order — confirm for target platforms.
		const uint WaveIndex = GroupThread / WaveGetLaneCount();
		const uint LaneIndex = WaveGetLaneIndex();
		uint Value = 0;
		if (GroupThread < NUM_BINS)
		{
			// Keep the bin count in a local so Bins can later be overwritten in place (Offsets aliases Bins).
			Value = Bins[GroupThread];
			uint ThisWaveSum = WaveActiveSum(Value);
			if (LaneIndex == 0)
			{
				WaveSums[WaveIndex] = ThisWaveSum;
			}
		}
		GroupMemoryBarrierWithGroupSync();
		// Turn the per-wave totals into the starting offset of each wave.
		if (WaveIndex == 0 && LaneIndex < NUM_WAVE_SUMS)
		{
			WaveSums[LaneIndex] = WavePrefixSum(WaveSums[LaneIndex]);
		}
		GroupMemoryBarrierWithGroupSync();
		if (GroupThread < NUM_BINS)
		{
			// Offset of the bin = start of its wave + exclusive prefix sum inside the wave.
			Offsets[GroupThread] = WaveSums[WaveIndex] + WavePrefixSum(Value);
		}
	}
#else // DIM_WAVE_OPS
	if (GroupThread < NUM_BINS)
	{
		// Naive implementation that trades instructions for fewer barriers.
		// The sum is accumulated in a local and stored once rather than updating groupshared memory per iteration.
		uint BinStart = 0;
		for (uint PrevBin = 0; PrevBin < GroupThread; PrevBin++)
		{
			BinStart += Bins[PrevBin];
		}
		Offsets[GroupThread] = BinStart;
	}
#endif // DIM_WAVE_OPS

	GroupMemoryBarrierWithGroupSync();

	// Write data

	for (uint WriteOffset = GroupThread; WriteOffset < NUM_ELEMENTS; WriteOffset += NUM_THREADS_X)
	{
		const uint Index = GroupBase + WriteOffset;
		if (Index < (uint)NumTotalEntries)
		{
			const uint Local = WriteOffset / NUM_THREADS_X;
			const uint Bin = Data[Local].SortKey % NUM_BINS;
			const uint DestIndex = GroupBase + Offsets[Bin] + Slot[Local];

			MaterialBuffer[DestIndex] = Data[Local];
		}
	}
}