#jira MH-8602

CPU performance improvements for using half edge buffers in deformer graph data interface.
The half edge buffer was being uploaded per frame. Now it is stored in a resource owned by the data provider and uploaded once.
This is still less than ideal. We want the resource to be owned by the skel mesh, so that it is cooked instead of created at runtime. But that is work for another day.
Also test if we have any compute worker jobs before kicking off RDG graph. This allows for a fairer CPU comparison between skin cache and deformer graph. (We don't want to add RDG overhead to skin cache only work).
Also added support for reordering the compute work for optimal GPU execution. Ordering by kernel index instead of graph index allows greater overlap of work on GPU when multiple compute graphs are running.
Added a per graph sort priority so that work sorting doesn't cause any setup graphs to run later than execution graphs.
#preflight 63f40694977ceed915769bfe

[CL 24343458 by jeremy moore in ue5-main branch]
This commit is contained in:
jeremy moore
2023-02-21 13:06:29 -05:00
parent e399b60a90
commit ca0bbb5de5
10 changed files with 382 additions and 207 deletions
@@ -11,6 +11,7 @@
class FComputeDataProviderRenderProxy;
class FComputeGraphRenderProxy;
class FComputeKernelResource;
class FComputeKernelShader;
/**
* Class that manages the scheduling of Compute Graph work.
@@ -23,10 +24,14 @@ public:
void Enqueue(
FName InExecutionGroupName,
FName InOwnerName,
uint8 InGraphSortPriority,
FComputeGraphRenderProxy const* InGraphRenderProxy,
TArray<FComputeDataProviderRenderProxy*> InDataProviderRenderProxies,
FSimpleDelegate InFallbackDelegate);
/** Has enqueued compute graph work. */
bool HasWork(FName InExecutionGroupName) const override;
/** Submit enqueued compute graph work. */
void SubmitWork(
FRDGBuilder& GraphBuilder,
@@ -39,6 +44,8 @@ private:
{
/** Name of owner object that invoked the graph. */
FName OwnerName;
/** Priority used when sorting work. */
uint8 GraphSortPriority = 0;
/** Graph render proxy. */
FComputeGraphRenderProxy const* GraphRenderProxy = nullptr;
/** Data provider render proxies. */
@@ -51,4 +58,38 @@ private:
/** Map of enqueued work per execution group. */
TMap<FName, TArray<FGraphInvocation> > GraphInvocationsPerGroup;
/** Description of a single dispatch group to be submitted. */
struct FSubmitDescription
{
/**
* Sort key allows us to sort dispatches for optimum scheduling.
* Syncing is usually required between consecutive kernels in a graph.
* So we schedule the first kernels of all the graphs, before all of the second kernels.
* That reduces sync time, at the expense of memory pressure for buffers that need to stay alive.
* In future we may want to add a limit to the number of graphs in flight to avoid memory pressure.
*/
union
{
uint32 PackedSortKey = 0;
struct
{
uint32 GraphIndex : 12; // Graph index.
uint32 KernelIndex : 12; // Kernel index within the graph.
uint32 GraphSortPriority : 8; // Externally defined sort priority to maintain inter-graph dependencies.
};
};
/** Track the index into our collected shader array. */
uint32 ShaderIndex : 15;
/** Track if this is Unified Dispatch. */
uint32 bIsUnified : 1;
};
// These arrays could be local to FComputeGraphTaskWorker::SubmitWork() but we
// store them with class and Reset() them at each usage to avoid per frame array allocations.
TArray<FSubmitDescription> SubmitDescs;
TArray<TShaderRef<FComputeKernelShader>> Shaders;
TArray<int32> PermutationIds;
TArray<FIntVector> ThreadCounts;
};
@@ -71,10 +71,10 @@ bool FComputeGraphInstance::EnqueueWork(UComputeGraph* InComputeGraph, FSceneInt
}
ENQUEUE_RENDER_COMMAND(ComputeFrameworkEnqueueExecutionCommand)(
[ComputeGraphWorker, InExecutionGroupName, InOwnerName, GraphRenderProxy, MovedDataProviderRenderProxies = MoveTemp(DataProviderRenderProxies), InFallbackDelegate](FRHICommandListImmediate& RHICmdList)
[ComputeGraphWorker, InExecutionGroupName, InOwnerName, SortPriority = GraphSortPriority, GraphRenderProxy, MovedDataProviderRenderProxies = MoveTemp(DataProviderRenderProxies), InFallbackDelegate](FRHICommandListImmediate& RHICmdList)
{
// Compute graph scheduler will take ownership of the provider proxies.
ComputeGraphWorker->Enqueue(InExecutionGroupName, InOwnerName, GraphRenderProxy, MovedDataProviderRenderProxies, InFallbackDelegate);
ComputeGraphWorker->Enqueue(InExecutionGroupName, InOwnerName, SortPriority, GraphRenderProxy, MovedDataProviderRenderProxies, InFallbackDelegate);
});
return true;
@@ -2,6 +2,7 @@
#include "ComputeFramework/ComputeGraphWorker.h"
#include "Algo/Sort.h"
#include "ComputeFramework/ComputeKernel.h"
#include "ComputeFramework/ComputeKernelPermutationVector.h"
#include "ComputeFramework/ComputeKernelShader.h"
@@ -14,196 +15,234 @@
DECLARE_GPU_STAT_NAMED(ComputeFramework_ExecuteBatches, TEXT("ComputeFramework::ExecuteBatches"));
void FComputeGraphTaskWorker::Enqueue(FName InExecutionGroupName, FName InOwnerName, FComputeGraphRenderProxy const* InGraphRenderProxy, TArray<FComputeDataProviderRenderProxy*> InDataProviderRenderProxies, FSimpleDelegate InFallbackDelegate)
static TAutoConsoleVariable<int32> CVarComputeFrameworkSortSubmit(
TEXT("r.ComputeFramework.SortSubmit"),
1,
TEXT("Sort submission of work to GPU for optimal scheduling."),
ECVF_RenderThreadSafe
);
void FComputeGraphTaskWorker::Enqueue(
FName InExecutionGroupName,
FName InOwnerName,
uint8 InGraphSortPriority,
FComputeGraphRenderProxy const* InGraphRenderProxy,
TArray<FComputeDataProviderRenderProxy*> InDataProviderRenderProxies,
FSimpleDelegate InFallbackDelegate)
{
FGraphInvocation& GraphInvocation = GraphInvocationsPerGroup.FindOrAdd(InExecutionGroupName).AddDefaulted_GetRef();
GraphInvocation.OwnerName = InOwnerName;
GraphInvocation.GraphSortPriority = InGraphSortPriority;
GraphInvocation.GraphRenderProxy = InGraphRenderProxy;
GraphInvocation.DataProviderRenderProxies = MoveTemp(InDataProviderRenderProxies);
GraphInvocation.FallbackDelegate = InFallbackDelegate;
}
bool FComputeGraphTaskWorker::HasWork(FName InExecutionGroupName) const
{
TArray<FGraphInvocation> const* GraphInvocations = GraphInvocationsPerGroup.Find(InExecutionGroupName);
return GraphInvocations != nullptr && GraphInvocations->Num();
}
void FComputeGraphTaskWorker::SubmitWork(FRDGBuilder& GraphBuilder, FName InExecutionGroupName, ERHIFeatureLevel::Type FeatureLevel)
{
TArray<FGraphInvocation>& GraphInvocations = GraphInvocationsPerGroup.FindOrAdd(InExecutionGroupName);
if (GraphInvocations.IsEmpty())
TRACE_CPUPROFILER_EVENT_SCOPE("ComputeFramework::ExecuteBatches");
RDG_EVENT_SCOPE(GraphBuilder, "ComputeFramework::ExecuteBatches");
RDG_GPU_STAT_SCOPE(GraphBuilder, ComputeFramework_ExecuteBatches);
// Reset our scratch memory arrays.
SubmitDescs.Reset();
Shaders.Reset();
TArray<FGraphInvocation> const& GraphInvocations = GraphInvocationsPerGroup.FindChecked(InExecutionGroupName);
for (int32 GraphIndex = 0; GraphIndex < GraphInvocations.Num(); ++GraphIndex)
{
return;
FGraphInvocation const& GraphInvocation = GraphInvocations[GraphIndex];
FComputeGraphRenderProxy const* GraphRenderProxy = GraphInvocation.GraphRenderProxy;
const int32 NumKernels = GraphRenderProxy->KernelInvocations.Num();
const int32 BaseSubmitDescIndex = SubmitDescs.Num();
SubmitDescs.Reserve(BaseSubmitDescIndex + NumKernels);
const int32 BaseShaderIndex = Shaders.Num();
// Gather shaders and validate the DataInterfaces.
// If validation fails or shaders are awaiting compilation we will not run the graph.
bool bIsValid = true;
for (int32 KernelIndex = 0; bIsValid && KernelIndex < NumKernels; ++KernelIndex)
{
FSubmitDescription& SubmitDesc = SubmitDescs.AddZeroed_GetRef();
SubmitDesc.GraphIndex = GraphIndex;
SubmitDesc.KernelIndex = KernelIndex;
SubmitDesc.GraphSortPriority = GraphInvocation.GraphSortPriority;
SubmitDesc.ShaderIndex = Shaders.Num();
FComputeGraphRenderProxy::FKernelInvocation const& KernelInvocation = GraphRenderProxy->KernelInvocations[KernelIndex];
// Reset our scratch memory arrays.
PermutationIds.Reset();
ThreadCounts.Reset();
const int32 NumSubInvocations = GraphInvocation.DataProviderRenderProxies[KernelInvocation.ExecutionProviderIndex]->GetDispatchThreadCount(ThreadCounts);
// Iterate shader parameter members to fill the dispatch data structures.
// We assume that the members were filled out with a single data interface per member, and that the
// order is the same one defined in the KernelInvocation.BoundProviderIndices.
TArray<FShaderParametersMetadata::FMember> const& ParamMembers = KernelInvocation.ShaderParameterMetadata->GetMembers();
FComputeDataProviderRenderProxy::FPermutationData PermutationData{ NumSubInvocations, GraphRenderProxy->ShaderPermutationVectors[KernelIndex], MoveTemp(PermutationIds) };
PermutationData.PermutationIds.SetNumZeroed(NumSubInvocations);
for (int32 MemberIndex = 0; bIsValid && MemberIndex < ParamMembers.Num(); ++MemberIndex)
{
FShaderParametersMetadata::FMember const& Member = ParamMembers[MemberIndex];
if (ensure(Member.GetBaseType() == EUniformBufferBaseType::UBMT_NESTED_STRUCT))
{
const int32 DataProviderIndex = KernelInvocation.BoundProviderIndices[MemberIndex];
FComputeDataProviderRenderProxy* DataProvider = GraphInvocation.DataProviderRenderProxies[DataProviderIndex];
if (ensure(DataProvider != nullptr))
{
FComputeDataProviderRenderProxy::FValidationData ValidationData{ NumSubInvocations, (int32)Member.GetStructMetadata()->GetSize() };
bIsValid &= DataProvider->IsValid(ValidationData);
if (bIsValid)
{
DataProvider->GatherPermutations(PermutationData);
}
}
}
}
// Get shader. This can fail if compilation is pending.
for (int32 SubInvocationIndex = 0; bIsValid && SubInvocationIndex < NumSubInvocations; ++SubInvocationIndex)
{
TShaderRef<FComputeKernelShader> Shader = KernelInvocation.KernelResource->GetShader(PermutationData.PermutationIds[SubInvocationIndex]);
bIsValid &= Shader.IsValid();
Shaders.Add(Shader);
}
// Check if we can do unified dispatch and apply that if we can.
if (bIsValid && KernelInvocation.bSupportsUnifiedDispatch && NumSubInvocations > 1)
{
bool bSupportsUnifiedDispatch = true;
for (int32 SubInvocationIndex = 1; bSupportsUnifiedDispatch && SubInvocationIndex < NumSubInvocations; ++SubInvocationIndex)
{
bSupportsUnifiedDispatch &= Shaders[SubmitDesc.ShaderIndex + SubInvocationIndex] == Shaders[SubmitDesc.ShaderIndex];
}
if (bSupportsUnifiedDispatch)
{
SubmitDesc.bIsUnified = true;
Shaders.SetNum(SubmitDesc.ShaderIndex + 1, /*bAllowShrinking*/false);
}
}
// Move our scratch array back for subsequent reuse.
PermutationIds = MoveTemp(PermutationData.PermutationIds);
}
// If we can't run the graph for any reason, back out now and apply fallback logic.
if (!bIsValid)
{
SubmitDescs.SetNum(BaseSubmitDescIndex, /*bAllowShrinking*/false);
Shaders.SetNum(BaseShaderIndex, /*bAllowShrinking*/false);
GraphInvocation.FallbackDelegate.ExecuteIfBound();
continue;
}
// Allocate RDG resources for all the data providers in the graph.
for (int32 DataProviderIndex = 0; DataProviderIndex < GraphInvocation.DataProviderRenderProxies.Num(); ++DataProviderIndex)
{
FComputeDataProviderRenderProxy* DataProvider = GraphInvocation.DataProviderRenderProxies[DataProviderIndex];
if (DataProvider != nullptr)
{
DataProvider->AllocateResources(GraphBuilder);
}
}
}
if (CVarComputeFrameworkSortSubmit.GetValueOnRenderThread() != 0)
{
SCOPED_DRAW_EVENTF(GraphBuilder.RHICmdList, ComputeFramework_ExecuteBatches, TEXT("ComputeFramework::ExecuteBatches"));
SCOPED_GPU_STAT(GraphBuilder.RHICmdList, ComputeFramework_ExecuteBatches);
// Sort for optimal dispatch.
Algo::Sort(SubmitDescs, [](const FSubmitDescription& LHS, const FSubmitDescription& RHS) { return LHS.PackedSortKey < RHS.PackedSortKey; });
}
for (int32 GraphIndex = 0; GraphIndex < GraphInvocations.Num(); ++GraphIndex)
for (FSubmitDescription const& SubmitDesc : SubmitDescs)
{
const int32 GraphIndex = SubmitDesc.GraphIndex;
FGraphInvocation const& GraphInvocation = GraphInvocations[GraphIndex];
FComputeGraphRenderProxy const* GraphRenderProxy = GraphInvocation.GraphRenderProxy;
const int32 KernelIndex = SubmitDesc.KernelIndex;
FComputeGraphRenderProxy::FKernelInvocation const& KernelInvocation = GraphRenderProxy->KernelInvocations[KernelIndex];
RDG_EVENT_SCOPE(GraphBuilder, "%s:%s:%s", *GraphInvocation.OwnerName.ToString(), *GraphRenderProxy->GraphName.ToString(), *KernelInvocation.KernelName);
//todo[CF]: GetDispatchThreadCount() should take the bIsUnified flag directly.
ThreadCounts.Reset();
int32 NumSubInvocations = GraphInvocation.DataProviderRenderProxies[KernelInvocation.ExecutionProviderIndex]->GetDispatchThreadCount(ThreadCounts);
bool bIsUnifiedDispatch = SubmitDesc.bIsUnified;
if (bIsUnifiedDispatch)
{
FGraphInvocation const& GraphInvocation = GraphInvocations[GraphIndex];
FComputeGraphRenderProxy const* GraphRenderProxy = GraphInvocation.GraphRenderProxy;
TArray<TShaderRef<FComputeKernelShader>> Shaders;
TArray<int32> ShaderStartPerKernel;
ShaderStartPerKernel.Reserve(GraphRenderProxy->KernelInvocations.Num());
TArray<bool> UnifiedDispatchPerKernel;
UnifiedDispatchPerKernel.SetNumZeroed(GraphRenderProxy->KernelInvocations.Num());
// Validation phase.
// Check if all DataInterfaces are valid.
// At the same time gather the permutation id so that we can validate if shader is compiled.
bool bIsValid = true;
for (int32 KernelIndex = 0; bIsValid && KernelIndex < GraphRenderProxy->KernelInvocations.Num(); ++KernelIndex)
for (int32 SubInvocationIndex = 1; SubInvocationIndex < NumSubInvocations; ++SubInvocationIndex)
{
FComputeGraphRenderProxy::FKernelInvocation const& KernelInvocation = GraphRenderProxy->KernelInvocations[KernelIndex];
TArray<FIntVector> ThreadCounts;
const int32 NumSubInvocations = GraphInvocation.DataProviderRenderProxies[KernelInvocation.ExecutionProviderIndex]->GetDispatchThreadCount(ThreadCounts);
// Iterate shader parameter members to fill the dispatch data structures.
// We assume that the members were filled out with a single data interface per member, and that the
// order is the same one defined in the KernelInvocation.BoundProviderIndices.
TArray<FShaderParametersMetadata::FMember> const& ParamMembers = KernelInvocation.ShaderParameterMetadata->GetMembers();
FComputeDataProviderRenderProxy::FPermutationData PermutationData{ NumSubInvocations, GraphRenderProxy->ShaderPermutationVectors[KernelIndex] };
PermutationData.PermutationIds.AddZeroed(NumSubInvocations);
for (int32 MemberIndex = 0; bIsValid && MemberIndex < ParamMembers.Num(); ++MemberIndex)
{
FShaderParametersMetadata::FMember const& Member = ParamMembers[MemberIndex];
if (ensure(Member.GetBaseType() == EUniformBufferBaseType::UBMT_NESTED_STRUCT))
{
const int32 DataProviderIndex = KernelInvocation.BoundProviderIndices[MemberIndex];
FComputeDataProviderRenderProxy* DataProvider = GraphInvocation.DataProviderRenderProxies[DataProviderIndex];
if (ensure(DataProvider != nullptr))
{
FComputeDataProviderRenderProxy::FValidationData ValidationData{ NumSubInvocations, (int32)Member.GetStructMetadata()->GetSize() };
bIsValid &= DataProvider->IsValid(ValidationData);
if (bIsValid)
{
DataProvider->GatherPermutations(PermutationData);
}
}
}
}
// Get shader. This can fail if compilation is pending.
ShaderStartPerKernel.Add(Shaders.Num());
Shaders.Reserve(Shaders.Num() + NumSubInvocations);
for (int32 SubInvocationIndex = 0; bIsValid && SubInvocationIndex < NumSubInvocations; ++SubInvocationIndex)
{
TShaderRef<FComputeKernelShader> Shader = KernelInvocation.KernelResource->GetShader(PermutationData.PermutationIds[SubInvocationIndex]);
bIsValid &= Shader.IsValid();
Shaders.Add(Shader);
}
// Check if we can do unified dispatch and apply that if we can.
if (bIsValid && KernelInvocation.bSupportsUnifiedDispatch && NumSubInvocations > 1)
{
bool bSupportsUnifiedDispatch = true;
for (int32 SubInvocationIndex = 1; bSupportsUnifiedDispatch && SubInvocationIndex < NumSubInvocations; ++SubInvocationIndex)
{
bSupportsUnifiedDispatch &= Shaders[ShaderStartPerKernel[KernelIndex] + SubInvocationIndex] == Shaders[ShaderStartPerKernel[KernelIndex]];
}
if (bSupportsUnifiedDispatch)
{
UnifiedDispatchPerKernel[KernelIndex] = true;
Shaders.SetNum(Shaders.Num() - NumSubInvocations + 1);
}
}
ThreadCounts[0].X += ThreadCounts[SubInvocationIndex].X;
}
ThreadCounts.SetNum(1);
NumSubInvocations = 1;
}
// If we can't run the graph for any reason, back out now and apply fallback logic.
if (!bIsValid)
{
GraphInvocation.FallbackDelegate.ExecuteIfBound();
continue;
}
// From here on we are committed to submitting the work to the GPU.
RDG_EVENT_SCOPE(GraphBuilder, "%s:%s", *GraphInvocation.OwnerName.ToString(), *GraphRenderProxy->GraphName.ToString());
// Do resource allocation for all the data providers in the graph.
for (int32 DataProviderIndex = 0; DataProviderIndex < GraphInvocation.DataProviderRenderProxies.Num(); ++DataProviderIndex)
// Allocate parameters buffer and fill from data providers.
TStridedView<FComputeKernelShader::FParameters> ParameterArray = GraphBuilder.AllocParameters<FComputeKernelShader::FParameters>(KernelInvocation.ShaderParameterMetadata, NumSubInvocations);
FComputeDataProviderRenderProxy::FDispatchData DispatchData{ NumSubInvocations, bIsUnifiedDispatch, 0, 0, ParameterArray.GetStride(), reinterpret_cast<uint8*>(&ParameterArray[0]) };
// Iterate shader parameter members to fill the dispatch data structures.
// We assume that the members were filled out with a single data interface per member, and that the
// order is the same one defined in the KernelInvocation.BoundProviderIndices.
TArray<FShaderParametersMetadata::FMember> const& ParamMembers = KernelInvocation.ShaderParameterMetadata->GetMembers();
for (int32 MemberIndex = 0; MemberIndex < ParamMembers.Num(); ++MemberIndex)
{
FShaderParametersMetadata::FMember const& Member = ParamMembers[MemberIndex];
if (ensure(Member.GetBaseType() == EUniformBufferBaseType::UBMT_NESTED_STRUCT))
{
const int32 DataProviderIndex = KernelInvocation.BoundProviderIndices[MemberIndex];
FComputeDataProviderRenderProxy* DataProvider = GraphInvocation.DataProviderRenderProxies[DataProviderIndex];
if (DataProvider != nullptr)
if (ensure(DataProvider != nullptr))
{
DataProvider->AllocateResources(GraphBuilder);
}
}
// Iterate the graph kernels to collect shader bindings and dispatch work.
for (int32 KernelIndex = 0; KernelIndex < GraphRenderProxy->KernelInvocations.Num(); ++KernelIndex)
{
FComputeGraphRenderProxy::FKernelInvocation const& KernelInvocation = GraphRenderProxy->KernelInvocations[KernelIndex];
RDG_EVENT_SCOPE(GraphBuilder, "%s", *KernelInvocation.KernelName);
TArray<FIntVector> ThreadCounts;
int32 NumSubInvocations = GraphInvocation.DataProviderRenderProxies[KernelInvocation.ExecutionProviderIndex]->GetDispatchThreadCount(ThreadCounts);
bool bIsUnifiedDispatch = UnifiedDispatchPerKernel[KernelIndex];
if (bIsUnifiedDispatch)
{
for (int32 SubInvocationIndex = 1; SubInvocationIndex < NumSubInvocations; ++SubInvocationIndex)
{
ThreadCounts[0].X += ThreadCounts[SubInvocationIndex].X;
}
ThreadCounts.SetNum(1);
NumSubInvocations = 1;
}
TStridedView<FComputeKernelShader::FParameters> ParameterArray = GraphBuilder.AllocParameters<FComputeKernelShader::FParameters>(KernelInvocation.ShaderParameterMetadata, NumSubInvocations);
FComputeDataProviderRenderProxy::FDispatchData DispatchData{ NumSubInvocations, bIsUnifiedDispatch, 0, 0, ParameterArray.GetStride(), reinterpret_cast<uint8*>(&ParameterArray[0]) };
// Iterate shader parameter members to fill the dispatch data structures.
// We assume that the members were filled out with a single data interface per member, and that the
// order is the same one defined in the KernelInvocation.BoundProviderIndices.
TArray<FShaderParametersMetadata::FMember> const& ParamMembers = KernelInvocation.ShaderParameterMetadata->GetMembers();
for (int32 MemberIndex = 0; MemberIndex < ParamMembers.Num(); ++MemberIndex)
{
FShaderParametersMetadata::FMember const& Member = ParamMembers[MemberIndex];
if (ensure(Member.GetBaseType() == EUniformBufferBaseType::UBMT_NESTED_STRUCT))
{
const int32 DataProviderIndex = KernelInvocation.BoundProviderIndices[MemberIndex];
FComputeDataProviderRenderProxy* DataProvider = GraphInvocation.DataProviderRenderProxies[DataProviderIndex];
if (ensure(DataProvider != nullptr))
{
DispatchData.ParameterStructSize = Member.GetStructMetadata()->GetSize();
DispatchData.ParameterBufferOffset = Member.GetOffset();
DataProvider->GatherDispatchData(DispatchData);
}
}
}
// Dispatch work to the render graph.
for (int32 SubInvocationIndex = 0; SubInvocationIndex < NumSubInvocations; ++SubInvocationIndex)
{
TShaderRef<FComputeKernelShader> Shader = Shaders[ShaderStartPerKernel[KernelIndex] + SubInvocationIndex];
const FIntVector GroupCount = FComputeShaderUtils::GetGroupCount(ThreadCounts[SubInvocationIndex], KernelInvocation.KernelGroupSize);
FComputeShaderUtils::AddPass(
GraphBuilder,
{},
ERDGPassFlags::Compute | ERDGPassFlags::NeverCull,
Shader,
KernelInvocation.ShaderParameterMetadata,
&ParameterArray[SubInvocationIndex],
GroupCount
);
DispatchData.ParameterStructSize = Member.GetStructMetadata()->GetSize();
DispatchData.ParameterBufferOffset = Member.GetOffset();
DataProvider->GatherDispatchData(DispatchData);
}
}
}
// Release any graph resources at the end of graph execution.
GraphBuilder.AddPass(
RDG_EVENT_NAME("Release Data Providers"),
ERDGPassFlags::None,
[this, InExecutionGroupName](FRHICommandList&)
{
GraphInvocationsPerGroup.FindChecked(InExecutionGroupName).Reset();
});
// Dispatch work to the render graph.
for (int32 SubInvocationIndex = 0; SubInvocationIndex < NumSubInvocations; ++SubInvocationIndex)
{
TShaderRef<FComputeKernelShader> Shader = Shaders[SubmitDesc.ShaderIndex + SubInvocationIndex];
const FIntVector GroupCount = FComputeShaderUtils::GetGroupCount(ThreadCounts[SubInvocationIndex], KernelInvocation.KernelGroupSize);
FComputeShaderUtils::AddPass(
GraphBuilder,
{},
ERDGPassFlags::Compute | ERDGPassFlags::NeverCull,
Shader,
KernelInvocation.ShaderParameterMetadata,
&ParameterArray[SubInvocationIndex],
GroupCount
);
}
}
// Release any graph resources at the end of graph execution.
GraphBuilder.AddPass(
{},
ERDGPassFlags::None,
[this, InExecutionGroupName](FRHICommandList&)
{
GraphInvocationsPerGroup.FindChecked(InExecutionGroupName).Reset();
});
}
FComputeGraphTaskWorker::FGraphInvocation::~FGraphInvocation()
@@ -19,6 +19,12 @@ struct COMPUTEFRAMEWORK_API FComputeGraphInstance
GENERATED_USTRUCT_BODY();
public:
/**
* Set the priority used when sorting work.
* Kernels in instances with a lower sort priority will always be submitted first.
*/
void SetGraphSortPriority(uint8 InPriority) { GraphSortPriority = InPriority; }
/**
* Create the Data Provider objects for a single binding of a ComputeGraph.
* The type of binding object is expected to match the associated Binding on the UComputeGraph.
@@ -38,4 +44,7 @@ private:
/** The currently bound Data Provider objects. */
UPROPERTY(Transient)
TArray< TObjectPtr<UComputeDataProvider> > DataProviders;
/** Priority used when sorting work. */
uint8 GraphSortPriority = 0;
};