Files

503 lines
20 KiB
C++
Raw Permalink Normal View History

// Copyright Epic Games, Inc. All Rights Reserved.
#include "RenderGraphTrace.h"
#include "RenderGraphBuilder.h"
#include "RenderGraphPrivate.h"
#include "Trace/Trace.inl"
#if RDG_ENABLE_TRACE
// Trace channel for the render graph (RDG) events declared below. The UE_TRACE_LOG calls visible
// in this file pass it as their channel, and FRDGTrace's constructor samples its enabled state.
UE_TRACE_CHANNEL_DEFINE(RDGChannel)
// GraphMessage: per-graph summary event logged by FRDGTrace::OutputGraphEnd.
// It carries the builder name, the cycle counter captured in OutputGraphBegin (StartCycles) and
// the counter read at log time (EndCycles), the number of passes, and three parallel arrays
// with one entry per transient memory range (commit size, capacity, flags).
UE_TRACE_EVENT_BEGIN(RDGTrace, GraphMessage)
UE_TRACE_EVENT_FIELD(UE::Trace::WideString, Name)
UE_TRACE_EVENT_FIELD(uint64, StartCycles)
UE_TRACE_EVENT_FIELD(uint64, EndCycles)
UE_TRACE_EVENT_FIELD(uint32, PassCount)
UE_TRACE_EVENT_FIELD(uint64[], TransientMemoryCommitSizes)
UE_TRACE_EVENT_FIELD(uint64[], TransientMemoryCapacities)
UE_TRACE_EVENT_FIELD(uint8[], TransientMemoryFlags)
UE_TRACE_EVENT_END()
// GraphEndMessage: event with no fields. No UE_TRACE_LOG for it appears in the portion of the
// file reviewed here; presumably it marks the end of one graph's message stream — TODO confirm.
UE_TRACE_EVENT_BEGIN(RDGTrace, GraphEndMessage)
UE_TRACE_EVENT_END()
// PassMessage: per-pass event logged by FRDGTrace::OutputGraphEnd for every pass handle.
// Textures / Buffers receive Pass->TraceTextures / Pass->TraceBuffers reinterpreted as uint32
// arrays; the bool fields mirror the pass's state flags and task mode.
// IsHandleType32Bits is always logged as true by the visible emitter.
// NOTE(review): StartCycles and EndCycles are declared here, but the PassMessage UE_TRACE_LOG in
// OutputGraphEnd never writes them — confirm whether that is intentional.
UE_TRACE_EVENT_BEGIN(RDGTrace, PassMessage)
UE_TRACE_EVENT_FIELD(UE::Trace::WideString, Name)
UE_TRACE_EVENT_FIELD(uint64, StartCycles)
UE_TRACE_EVENT_FIELD(uint64, EndCycles)
UE_TRACE_EVENT_FIELD(uint32, Handle)
UE_TRACE_EVENT_FIELD(uint32, GraphicsForkPass)
UE_TRACE_EVENT_FIELD(uint32, GraphicsJoinPass)
UE_TRACE_EVENT_FIELD(uint32[], Textures)
UE_TRACE_EVENT_FIELD(uint32[], Buffers)
UE_TRACE_EVENT_FIELD(uint16, Flags)
UE_TRACE_EVENT_FIELD(uint16, Pipeline)
UE_TRACE_EVENT_FIELD(bool, IsCulled)
UE_TRACE_EVENT_FIELD(bool, IsAsyncComputeBegin)
UE_TRACE_EVENT_FIELD(bool, IsAsyncComputeEnd)
UE_TRACE_EVENT_FIELD(bool, SkipRenderPassBegin)
UE_TRACE_EVENT_FIELD(bool, SkipRenderPassEnd)
UE_TRACE_EVENT_FIELD(bool, IsParallelExecuteBegin)
UE_TRACE_EVENT_FIELD(bool, IsParallelExecuteEnd)
UE_TRACE_EVENT_FIELD(bool, IsParallelExecute)
UE_TRACE_EVENT_FIELD(bool, IsParallelExecuteAllowed)
UE_TRACE_EVENT_FIELD(bool, IsParallelExecuteAsyncAllowed)
UE_TRACE_EVENT_FIELD(bool, IsHandleType32Bits)
UE_TRACE_EVENT_END()
// BufferMessage: per-buffer event. Its handle, pass-list, transient-allocation and state-flag
// fields have the same names and types as those of TextureMessage; UsageFlags, BytesPerElement
// and NumElements are the buffer-specific fields.
// The UE_TRACE_LOG that emits it is not in the portion of the file reviewed here.
UE_TRACE_EVENT_BEGIN(RDGTrace, BufferMessage)
UE_TRACE_EVENT_FIELD(UE::Trace::WideString, Name)
UE_TRACE_EVENT_FIELD(uint32, UsageFlags)
UE_TRACE_EVENT_FIELD(uint32, BytesPerElement)
UE_TRACE_EVENT_FIELD(uint32, NumElements)
UE_TRACE_EVENT_FIELD(uint32, Handle)
UE_TRACE_EVENT_FIELD(uint32, NextOwnerHandle)
UE_TRACE_EVENT_FIELD(uint32, Order)
UE_TRACE_EVENT_FIELD(uint32[], Passes)
UE_TRACE_EVENT_FIELD(uint64[], TransientAllocationOffsetMins)
UE_TRACE_EVENT_FIELD(uint64[], TransientAllocationOffsetMaxs)
UE_TRACE_EVENT_FIELD(uint16[], TransientAllocationMemoryRanges)
UE_TRACE_EVENT_FIELD(FRDGPassHandle::IndexType, TransientAcquirePass)
UE_TRACE_EVENT_FIELD(FRDGPassHandle::IndexType, TransientDiscardPass)
UE_TRACE_EVENT_FIELD(bool, IsExternal)
UE_TRACE_EVENT_FIELD(bool, IsExtracted)
UE_TRACE_EVENT_FIELD(bool, IsCulled)
UE_TRACE_EVENT_FIELD(bool, IsTrackingSkipped)
UE_TRACE_EVENT_FIELD(bool, IsTransient)
UE_TRACE_EVENT_FIELD(bool, IsTransientUntracked)
UE_TRACE_EVENT_FIELD(bool, IsTransientCacheHit)
UE_TRACE_EVENT_FIELD(bool, IsHandleType32Bits)
UE_TRACE_EVENT_END()
// TextureMessage: per-texture event logged by FRDGTrace::OutputGraphEnd in its loop over
// GraphBuilder.Textures. The three TransientAllocation* arrays are parallel (one entry per
// transient allocation of the texture); TransientAcquirePass / TransientDiscardPass are pass
// handle indices written with GetIndexUnchecked().
UE_TRACE_EVENT_BEGIN(RDGTrace, TextureMessage)
UE_TRACE_EVENT_FIELD(UE::Trace::WideString, Name)
UE_TRACE_EVENT_FIELD(uint64, StartCycles)
UE_TRACE_EVENT_FIELD(uint64, EndCycles)
UE_TRACE_EVENT_FIELD(uint32, Handle)
UE_TRACE_EVENT_FIELD(uint32, NextOwnerHandle)
UE_TRACE_EVENT_FIELD(uint32, Order)
UE_TRACE_EVENT_FIELD(uint32[], Passes)
UE_TRACE_EVENT_FIELD(uint64[], TransientAllocationOffsetMins)
UE_TRACE_EVENT_FIELD(uint64[], TransientAllocationOffsetMaxs)
UE_TRACE_EVENT_FIELD(uint16[], TransientAllocationMemoryRanges)
UE_TRACE_EVENT_FIELD(FRDGPassHandle::IndexType, TransientAcquirePass)
UE_TRACE_EVENT_FIELD(FRDGPassHandle::IndexType, TransientDiscardPass)
UE_TRACE_EVENT_FIELD(uint64, SizeInBytes)
UE_TRACE_EVENT_FIELD(uint64, CreateFlags)
UE_TRACE_EVENT_FIELD(uint32, Dimension)
UE_TRACE_EVENT_FIELD(uint32, Format)
UE_TRACE_EVENT_FIELD(uint32, ExtentX)
UE_TRACE_EVENT_FIELD(uint32, ExtentY)
UE_TRACE_EVENT_FIELD(uint16, Depth)
UE_TRACE_EVENT_FIELD(uint16, ArraySize)
UE_TRACE_EVENT_FIELD(uint8, NumMips)
UE_TRACE_EVENT_FIELD(uint8, NumSamples)
UE_TRACE_EVENT_FIELD(bool, IsExternal)
UE_TRACE_EVENT_FIELD(bool, IsExtracted)
UE_TRACE_EVENT_FIELD(bool, IsCulled)
UE_TRACE_EVENT_FIELD(bool, IsTrackingSkipped)
UE_TRACE_EVENT_FIELD(bool, IsTransient)
UE_TRACE_EVENT_FIELD(bool, IsTransientUntracked)
UE_TRACE_EVENT_FIELD(bool, IsTransientCacheHit)
UE_TRACE_EVENT_FIELD(bool, IsHandleType32Bits)
UE_TRACE_EVENT_END()
// ScopeMessage: per-scope event logged from the DumpScopes lambda in FRDGTrace::OutputGraphEnd
// (compiled only under RDG_EVENTS). It is emitted for FRDGScope_RHI scopes that have both
// CPUFirstPass and CPULastPass set. FirstPass / LastPass are the handle indices of those passes.
// Depth counts the scopes meeting the same criteria when walking from the scope itself up its
// Parent chain, stopping before the parentless root.
UE_TRACE_EVENT_BEGIN(RDGTrace, ScopeMessage)
UE_TRACE_EVENT_FIELD(UE::Trace::WideString, Name)
UE_TRACE_EVENT_FIELD(uint32, FirstPass)
UE_TRACE_EVENT_FIELD(uint32, LastPass)
UE_TRACE_EVENT_FIELD(uint16, Depth)
UE_TRACE_EVENT_FIELD(bool, IsHandleType32Bits)
UE_TRACE_EVENT_END()
// Compile-time guarantee that pass, texture and buffer handles are exactly 32 bits wide.
// OutputGraphEnd casts handle-array storage (e.g. Pass->TraceTextures, Texture->TracePasses) to
// const uint32* when logging — presumably the reason for these checks; confirm element types.
static_assert(sizeof(FRDGPassHandle) == sizeof(uint32), "Expected 32 bit pass handles.");
static_assert(sizeof(FRDGTextureHandle) == sizeof(uint32), "Expected 32 bit texture handles.");
static_assert(sizeof(FRDGBufferHandle) == sizeof(uint32), "Expected 32 bit buffer handles.");
// Samples the trace state once, at construction: tracing is enabled only when RDGChannel is
// enabled and IsImmediateMode() returns false. The result is cached in bEnabled, so a later
// change to the channel state is not observed by this instance.
FRDGTrace::FRDGTrace()
: bEnabled(UE_TRACE_CHANNELEXPR_IS_ENABLED(RDGChannel) && !IsImmediateMode())
{}
// Returns the enabled flag that was computed in the constructor.
bool FRDGTrace::IsEnabled() const
{
return bEnabled;
}
// Records the graph's start timestamp. Does nothing when tracing is disabled.
// The stored value is later written as GraphMessage.StartCycles by OutputGraphEnd.
void FRDGTrace::OutputGraphBegin()
{
	if (IsEnabled())
	{
		GraphStartCycles = FPlatformTime::Cycles64();
	}
}
void FRDGTrace::OutputGraphEnd(const FRDGBuilder& GraphBuilder)
{
if (!IsEnabled())
{
return;
}
TRACE_CPUPROFILER_EVENT_SCOPE(FRDGTrace::OutputGraphEnd);
const FRDGPassHandle ProloguePassHandle = GraphBuilder.GetProloguePassHandle();
const auto& Passes = GraphBuilder.Passes;
const auto& Textures = GraphBuilder.Textures;
const auto& Buffers = GraphBuilder.Buffers;
{
const TCHAR* Name = GraphBuilder.BuilderName.GetTCHAR();
TArray<uint64> TransientMemoryCommitSizes;
TArray<uint64> TransientMemoryCapacities;
TArray<uint8> TransientMemoryFlags;
TransientMemoryCommitSizes.Reserve(TransientAllocationStats.MemoryRanges.Num());
TransientMemoryCapacities.Reserve(TransientAllocationStats.MemoryRanges.Num());
TransientMemoryFlags.Reserve(TransientAllocationStats.MemoryRanges.Num());
for (const auto& MemoryRange : TransientAllocationStats.MemoryRanges)
{
TransientMemoryCommitSizes.Emplace(MemoryRange.CommitSize);
TransientMemoryCapacities.Emplace(MemoryRange.Capacity);
TransientMemoryFlags.Emplace((uint8)MemoryRange.Flags);
}
UE_TRACE_LOG(RDGTrace, GraphMessage, RDGChannel)
<< GraphMessage.Name(Name, uint16(FCString::Strlen(Name)))
<< GraphMessage.StartCycles(GraphStartCycles)
<< GraphMessage.EndCycles(FPlatformTime::Cycles64())
<< GraphMessage.PassCount(uint32(Passes.Num()))
<< GraphMessage.TransientMemoryCommitSizes(TransientMemoryCommitSizes.GetData(), (uint16)TransientMemoryCommitSizes.Num())
<< GraphMessage.TransientMemoryCapacities(TransientMemoryCapacities.GetData(), (uint16)TransientMemoryCapacities.Num())
<< GraphMessage.TransientMemoryFlags(TransientMemoryFlags.GetData(), (uint16)TransientMemoryFlags.Num());
}
for (FRDGPassHandle Handle = Passes.Begin(); Handle != Passes.End(); ++Handle)
{
const FRDGPass* Pass = Passes[Handle];
const TCHAR* Name = Pass->GetEventName().GetTCHAR();
UE_TRACE_LOG(RDGTrace, PassMessage, RDGChannel)
<< PassMessage.Name(Name, uint16(FCString::Strlen(Name)))
<< PassMessage.Handle(Handle.GetIndex())
<< PassMessage.GraphicsForkPass(Pass->GetGraphicsForkPass().GetIndexUnchecked())
<< PassMessage.GraphicsJoinPass(Pass->GetGraphicsJoinPass().GetIndexUnchecked())
<< PassMessage.Textures((const uint32*)Pass->TraceTextures.GetData(), (uint32)Pass->TraceTextures.Num())
<< PassMessage.Buffers((const uint32*)Pass->TraceBuffers.GetData(), (uint32)Pass->TraceBuffers.Num())
<< PassMessage.Flags(uint16(Pass->GetFlags()))
<< PassMessage.Pipeline(uint16(Pass->GetPipeline()))
<< PassMessage.IsCulled(Pass->bCulled != 0)
<< PassMessage.IsAsyncComputeBegin(Pass->bAsyncComputeBegin != 0)
<< PassMessage.IsAsyncComputeEnd(Pass->bAsyncComputeEnd != 0)
<< PassMessage.SkipRenderPassBegin(Pass->bSkipRenderPassBegin != 0)
<< PassMessage.SkipRenderPassEnd(Pass->bSkipRenderPassEnd != 0)
<< PassMessage.IsParallelExecuteBegin(Pass->bParallelExecuteBegin != 0)
<< PassMessage.IsParallelExecuteEnd(Pass->bParallelExecuteEnd != 0)
<< PassMessage.IsParallelExecute(Pass->bParallelExecute != 0)
<< PassMessage.IsParallelExecuteAllowed(Pass->TaskMode != ERDGPassTaskMode::Inline)
<< PassMessage.IsParallelExecuteAsyncAllowed(Pass->TaskMode == ERDGPassTaskMode::Async)
<< PassMessage.IsHandleType32Bits(true);
}
#if RDG_EVENTS
{
Merging //UE5/Dev-ParallelRendering/... (up to CL 30965645) to //UE5/Main/... (base CL 30962637) Significant refactor of RHI command list management and submission, and RHI breadcrumbs / RenderGraph (RDG) scopes, to allow for parallel translation of most RHI command lists. See individual changelists in //UE5/Dev-ParallelRendering for details. A summary of the changes is as follows: This work's primary goal was to allow as many RHI command lists as possible to be parallel translated, to make more efficient use of many-core systems. To achieve this: - The submission code paths for the immediate and parallel RHI command lists have been merged into a single function: FRHICommandListExecutor::Submit(). - A "dispatch thread" (which is simply a series of chained task graph tasks) is used to decide which command lists are batched together in a single parallel translate job. - Individual command lists can disable parallel translate, which forces them to be executed on the RHI thread. This happens automatically if an RHI command list performs an operation that is not thread safe (e.g. buffer lock, or low-level resource transition). One of the primary blockers for parallel translation was the RHI breadcrumb system, and the way RDG builds scopes. This was also refactored to remove these limitations: - RDG could only push/pop events on the immediate command list, which resulted in parallel and immediate work being interleaved, breaking any opportunity for parallelism. - Platform RHI implementations of breadcrumbs (e.g. in D3D12 RHI) was not correct across multiple RHI contexts. Push/pop operations aren't necessarily balanced within any one RHI context given that RDG builds "parallel pass sets" containing arbitrary ranges of renderer passes. A summary of the new RHI breadcrumb system is as follows: - A tree of breadcrumb nodes is built by the render thread and RDG. Each node contains the node name, and pointers to the parent and next nodes. 
When fully built, the nodes form a depth-first linked list which is used for traversing the tree for GPU crash debugging. - The memory for breadcrumb nodes is provided by ref-counted allocator objects. These allocators are pipelined through the RHI, allowing the platform RHI implementation to extend their lifetime for GPU crash debugging purposes. - RHIPushEvent / RHIPopEvent have been removed, replaced with RHIBeginBreadcrumbGPU / RHIEndBreadcrumbGPU. Platform RHIs implement these functions to perform GPU immediate writes using the unique ID of each node, for tracking GPU progress. - Format string arguments are captured by-value to remove the cost of string formatting while building the breadcrumb tree. String formatting only occurs when the actual formatted string is required (e.g. during GPU crash breadcrumb stack traversal, or when calling platform GPU profiling APIs). RenderGraph scopes have been simplified: - The separate scope trees / arrays of ops have been combined. There is now a single tree of RDG scopes containing all types. - Each RDG pass holds a pointer to the scope it was created under. - BeginCPU / EndCPU is called on each RDG scope as the various RDG threads enter / exit them. This allows us to mark-up each worker thread with the relevant Unreal Insights scopes. Other changes include: - Fixes for bugs uncovered when parallel translate was enabled. - Adjusted platform affinities necessary due to the new layout of thread tasks in the renderer. - Refactored RHI draw call stats to better fit the new pipeline design. #rb jeannoe.morissette, zach.bethel #jira UE-139543 [CL 30973133 by Luke Thatcher in ue5-main branch]
2024-01-29 12:47:28 -05:00
auto DumpScopes = [&](FRDGScope* Current, auto& DumpScopes)
{
Merging //UE5/Dev-ParallelRendering/... (up to CL 30965645) to //UE5/Main/... (base CL 30962637) Significant refactor of RHI command list management and submission, and RHI breadcrumbs / RenderGraph (RDG) scopes, to allow for parallel translation of most RHI command lists. See individual changelists in //UE5/Dev-ParallelRendering for details. A summary of the changes is as follows: This work's primary goal was to allow as many RHI command lists as possible to be parallel translated, to make more efficient use of many-core systems. To achieve this: - The submission code paths for the immediate and parallel RHI command lists have been merged into a single function: FRHICommandListExecutor::Submit(). - A "dispatch thread" (which is simply a series of chained task graph tasks) is used to decide which command lists are batched together in a single parallel translate job. - Individual command lists can disable parallel translate, which forces them to be executed on the RHI thread. This happens automatically if an RHI command list performs an operation that is not thread safe (e.g. buffer lock, or low-level resource transition). One of the primary blockers for parallel translation was the RHI breadcrumb system, and the way RDG builds scopes. This was also refactored to remove these limitations: - RDG could only push/pop events on the immediate command list, which resulted in parallel and immediate work being interleaved, breaking any opportunity for parallelism. - Platform RHI implementations of breadcrumbs (e.g. in D3D12 RHI) was not correct across multiple RHI contexts. Push/pop operations aren't necessarily balanced within any one RHI context given that RDG builds "parallel pass sets" containing arbitrary ranges of renderer passes. A summary of the new RHI breadcrumb system is as follows: - A tree of breadcrumb nodes is built by the render thread and RDG. Each node contains the node name, and pointers to the parent and next nodes. 
When fully built, the nodes form a depth-first linked list which is used for traversing the tree for GPU crash debugging. - The memory for breadcrumb nodes is provided by ref-counted allocator objects. These allocators are pipelined through the RHI, allowing the platform RHI implementation to extend their lifetime for GPU crash debugging purposes. - RHIPushEvent / RHIPopEvent have been removed, replaced with RHIBeginBreadcrumbGPU / RHIEndBreadcrumbGPU. Platform RHIs implement these functions to perform GPU immediate writes using the unique ID of each node, for tracking GPU progress. - Format string arguments are captured by-value to remove the cost of string formatting while building the breadcrumb tree. String formatting only occurs when the actual formatted string is required (e.g. during GPU crash breadcrumb stack traversal, or when calling platform GPU profiling APIs). RenderGraph scopes have been simplified: - The separate scope trees / arrays of ops have been combined. There is now a single tree of RDG scopes containing all types. - Each RDG pass holds a pointer to the scope it was created under. - BeginCPU / EndCPU is called on each RDG scope as the various RDG threads enter / exit them. This allows us to mark-up each worker thread with the relevant Unreal Insights scopes. Other changes include: - Fixes for bugs uncovered when parallel translate was enabled. - Adjusted platform affinities necessary due to the new layout of thread tasks in the renderer. - Refactored RHI draw call stats to better fit the new pipeline design. #rb jeannoe.morissette, zach.bethel #jira UE-139543 [CL 30973133 by Luke Thatcher in ue5-main branch]
2024-01-29 12:47:28 -05:00
if (!Current || Current->bVisited)
return;
Merging //UE5/Dev-ParallelRendering/... (up to CL 30965645) to //UE5/Main/... (base CL 30962637) Significant refactor of RHI command list management and submission, and RHI breadcrumbs / RenderGraph (RDG) scopes, to allow for parallel translation of most RHI command lists. See individual changelists in //UE5/Dev-ParallelRendering for details. A summary of the changes is as follows: This work's primary goal was to allow as many RHI command lists as possible to be parallel translated, to make more efficient use of many-core systems. To achieve this: - The submission code paths for the immediate and parallel RHI command lists have been merged into a single function: FRHICommandListExecutor::Submit(). - A "dispatch thread" (which is simply a series of chained task graph tasks) is used to decide which command lists are batched together in a single parallel translate job. - Individual command lists can disable parallel translate, which forces them to be executed on the RHI thread. This happens automatically if an RHI command list performs an operation that is not thread safe (e.g. buffer lock, or low-level resource transition). One of the primary blockers for parallel translation was the RHI breadcrumb system, and the way RDG builds scopes. This was also refactored to remove these limitations: - RDG could only push/pop events on the immediate command list, which resulted in parallel and immediate work being interleaved, breaking any opportunity for parallelism. - Platform RHI implementations of breadcrumbs (e.g. in D3D12 RHI) was not correct across multiple RHI contexts. Push/pop operations aren't necessarily balanced within any one RHI context given that RDG builds "parallel pass sets" containing arbitrary ranges of renderer passes. A summary of the new RHI breadcrumb system is as follows: - A tree of breadcrumb nodes is built by the render thread and RDG. Each node contains the node name, and pointers to the parent and next nodes. 
When fully built, the nodes form a depth-first linked list which is used for traversing the tree for GPU crash debugging. - The memory for breadcrumb nodes is provided by ref-counted allocator objects. These allocators are pipelined through the RHI, allowing the platform RHI implementation to extend their lifetime for GPU crash debugging purposes. - RHIPushEvent / RHIPopEvent have been removed, replaced with RHIBeginBreadcrumbGPU / RHIEndBreadcrumbGPU. Platform RHIs implement these functions to perform GPU immediate writes using the unique ID of each node, for tracking GPU progress. - Format string arguments are captured by-value to remove the cost of string formatting while building the breadcrumb tree. String formatting only occurs when the actual formatted string is required (e.g. during GPU crash breadcrumb stack traversal, or when calling platform GPU profiling APIs). RenderGraph scopes have been simplified: - The separate scope trees / arrays of ops have been combined. There is now a single tree of RDG scopes containing all types. - Each RDG pass holds a pointer to the scope it was created under. - BeginCPU / EndCPU is called on each RDG scope as the various RDG threads enter / exit them. This allows us to mark-up each worker thread with the relevant Unreal Insights scopes. Other changes include: - Fixes for bugs uncovered when parallel translate was enabled. - Adjusted platform affinities necessary due to the new layout of thread tasks in the renderer. - Refactored RHI draw call stats to better fit the new pipeline design. #rb jeannoe.morissette, zach.bethel #jira UE-139543 [CL 30973133 by Luke Thatcher in ue5-main branch]
2024-01-29 12:47:28 -05:00
Current->bVisited = true;
DumpScopes(Current->Parent, DumpScopes);
if (FRDGScope_RHI* RHIScope = Current->Get<FRDGScope_RHI>())
{
if (Current->CPUFirstPass && Current->CPULastPass)
{
const TCHAR* Name = RHIScope->Name.GetTCHAR();
uint32 Depth = 0;
for (FRDGScope* Scope = Current; Scope->Parent; Scope = Scope->Parent)
{
if (Scope->Get<FRDGScope_RHI>())
{
if (Scope->CPUFirstPass && Scope->CPULastPass)
{
Depth++;
}
}
}
Merging //UE5/Dev-ParallelRendering/... (up to CL 30965645) to //UE5/Main/... (base CL 30962637) Significant refactor of RHI command list management and submission, and RHI breadcrumbs / RenderGraph (RDG) scopes, to allow for parallel translation of most RHI command lists. See individual changelists in //UE5/Dev-ParallelRendering for details. A summary of the changes is as follows: This work's primary goal was to allow as many RHI command lists as possible to be parallel translated, to make more efficient use of many-core systems. To achieve this: - The submission code paths for the immediate and parallel RHI command lists have been merged into a single function: FRHICommandListExecutor::Submit(). - A "dispatch thread" (which is simply a series of chained task graph tasks) is used to decide which command lists are batched together in a single parallel translate job. - Individual command lists can disable parallel translate, which forces them to be executed on the RHI thread. This happens automatically if an RHI command list performs an operation that is not thread safe (e.g. buffer lock, or low-level resource transition). One of the primary blockers for parallel translation was the RHI breadcrumb system, and the way RDG builds scopes. This was also refactored to remove these limitations: - RDG could only push/pop events on the immediate command list, which resulted in parallel and immediate work being interleaved, breaking any opportunity for parallelism. - Platform RHI implementations of breadcrumbs (e.g. in D3D12 RHI) was not correct across multiple RHI contexts. Push/pop operations aren't necessarily balanced within any one RHI context given that RDG builds "parallel pass sets" containing arbitrary ranges of renderer passes. A summary of the new RHI breadcrumb system is as follows: - A tree of breadcrumb nodes is built by the render thread and RDG. Each node contains the node name, and pointers to the parent and next nodes. 
When fully built, the nodes form a depth-first linked list which is used for traversing the tree for GPU crash debugging. - The memory for breadcrumb nodes is provided by ref-counted allocator objects. These allocators are pipelined through the RHI, allowing the platform RHI implementation to extend their lifetime for GPU crash debugging purposes. - RHIPushEvent / RHIPopEvent have been removed, replaced with RHIBeginBreadcrumbGPU / RHIEndBreadcrumbGPU. Platform RHIs implement these functions to perform GPU immediate writes using the unique ID of each node, for tracking GPU progress. - Format string arguments are captured by-value to remove the cost of string formatting while building the breadcrumb tree. String formatting only occurs when the actual formatted string is required (e.g. during GPU crash breadcrumb stack traversal, or when calling platform GPU profiling APIs). RenderGraph scopes have been simplified: - The separate scope trees / arrays of ops have been combined. There is now a single tree of RDG scopes containing all types. - Each RDG pass holds a pointer to the scope it was created under. - BeginCPU / EndCPU is called on each RDG scope as the various RDG threads enter / exit them. This allows us to mark-up each worker thread with the relevant Unreal Insights scopes. Other changes include: - Fixes for bugs uncovered when parallel translate was enabled. - Adjusted platform affinities necessary due to the new layout of thread tasks in the renderer. - Refactored RHI draw call stats to better fit the new pipeline design. #rb jeannoe.morissette, zach.bethel #jira UE-139543 [CL 30973133 by Luke Thatcher in ue5-main branch]
2024-01-29 12:47:28 -05:00
UE_TRACE_LOG(RDGTrace, ScopeMessage, RDGChannel)
<< ScopeMessage.Name(Name, uint16(FCString::Strlen(Name)))
<< ScopeMessage.FirstPass(Current->CPUFirstPass->GetHandle().GetIndexUnchecked())
<< ScopeMessage.LastPass(Current->CPULastPass->GetHandle().GetIndexUnchecked())
<< ScopeMessage.Depth(Depth)
<< ScopeMessage.IsHandleType32Bits(true);
Merging //UE5/Dev-ParallelRendering/... (up to CL 30965645) to //UE5/Main/... (base CL 30962637) Significant refactor of RHI command list management and submission, and RHI breadcrumbs / RenderGraph (RDG) scopes, to allow for parallel translation of most RHI command lists. See individual changelists in //UE5/Dev-ParallelRendering for details. A summary of the changes is as follows: This work's primary goal was to allow as many RHI command lists as possible to be parallel translated, to make more efficient use of many-core systems. To achieve this: - The submission code paths for the immediate and parallel RHI command lists have been merged into a single function: FRHICommandListExecutor::Submit(). - A "dispatch thread" (which is simply a series of chained task graph tasks) is used to decide which command lists are batched together in a single parallel translate job. - Individual command lists can disable parallel translate, which forces them to be executed on the RHI thread. This happens automatically if an RHI command list performs an operation that is not thread safe (e.g. buffer lock, or low-level resource transition). One of the primary blockers for parallel translation was the RHI breadcrumb system, and the way RDG builds scopes. This was also refactored to remove these limitations: - RDG could only push/pop events on the immediate command list, which resulted in parallel and immediate work being interleaved, breaking any opportunity for parallelism. - Platform RHI implementations of breadcrumbs (e.g. in D3D12 RHI) was not correct across multiple RHI contexts. Push/pop operations aren't necessarily balanced within any one RHI context given that RDG builds "parallel pass sets" containing arbitrary ranges of renderer passes. A summary of the new RHI breadcrumb system is as follows: - A tree of breadcrumb nodes is built by the render thread and RDG. Each node contains the node name, and pointers to the parent and next nodes. 
When fully built, the nodes form a depth-first linked list which is used for traversing the tree for GPU crash debugging. - The memory for breadcrumb nodes is provided by ref-counted allocator objects. These allocators are pipelined through the RHI, allowing the platform RHI implementation to extend their lifetime for GPU crash debugging purposes. - RHIPushEvent / RHIPopEvent have been removed, replaced with RHIBeginBreadcrumbGPU / RHIEndBreadcrumbGPU. Platform RHIs implement these functions to perform GPU immediate writes using the unique ID of each node, for tracking GPU progress. - Format string arguments are captured by-value to remove the cost of string formatting while building the breadcrumb tree. String formatting only occurs when the actual formatted string is required (e.g. during GPU crash breadcrumb stack traversal, or when calling platform GPU profiling APIs). RenderGraph scopes have been simplified: - The separate scope trees / arrays of ops have been combined. There is now a single tree of RDG scopes containing all types. - Each RDG pass holds a pointer to the scope it was created under. - BeginCPU / EndCPU is called on each RDG scope as the various RDG threads enter / exit them. This allows us to mark-up each worker thread with the relevant Unreal Insights scopes. Other changes include: - Fixes for bugs uncovered when parallel translate was enabled. - Adjusted platform affinities necessary due to the new layout of thread tasks in the renderer. - Refactored RHI draw call stats to better fit the new pipeline design. #rb jeannoe.morissette, zach.bethel #jira UE-139543 [CL 30973133 by Luke Thatcher in ue5-main branch]
2024-01-29 12:47:28 -05:00
}
}
};
for (FRDGPassHandle Handle = Passes.Begin(); Handle != Passes.End(); ++Handle)
{
const FRDGPass* Pass = Passes[Handle];
Merging //UE5/Dev-ParallelRendering/... (up to CL 30965645) to //UE5/Main/... (base CL 30962637) Significant refactor of RHI command list management and submission, and RHI breadcrumbs / RenderGraph (RDG) scopes, to allow for parallel translation of most RHI command lists. See individual changelists in //UE5/Dev-ParallelRendering for details. A summary of the changes is as follows: This work's primary goal was to allow as many RHI command lists as possible to be parallel translated, to make more efficient use of many-core systems. To achieve this: - The submission code paths for the immediate and parallel RHI command lists have been merged into a single function: FRHICommandListExecutor::Submit(). - A "dispatch thread" (which is simply a series of chained task graph tasks) is used to decide which command lists are batched together in a single parallel translate job. - Individual command lists can disable parallel translate, which forces them to be executed on the RHI thread. This happens automatically if an RHI command list performs an operation that is not thread safe (e.g. buffer lock, or low-level resource transition). One of the primary blockers for parallel translation was the RHI breadcrumb system, and the way RDG builds scopes. This was also refactored to remove these limitations: - RDG could only push/pop events on the immediate command list, which resulted in parallel and immediate work being interleaved, breaking any opportunity for parallelism. - Platform RHI implementations of breadcrumbs (e.g. in D3D12 RHI) was not correct across multiple RHI contexts. Push/pop operations aren't necessarily balanced within any one RHI context given that RDG builds "parallel pass sets" containing arbitrary ranges of renderer passes. A summary of the new RHI breadcrumb system is as follows: - A tree of breadcrumb nodes is built by the render thread and RDG. Each node contains the node name, and pointers to the parent and next nodes. 
When fully built, the nodes form a depth-first linked list which is used for traversing the tree for GPU crash debugging. - The memory for breadcrumb nodes is provided by ref-counted allocator objects. These allocators are pipelined through the RHI, allowing the platform RHI implementation to extend their lifetime for GPU crash debugging purposes. - RHIPushEvent / RHIPopEvent have been removed, replaced with RHIBeginBreadcrumbGPU / RHIEndBreadcrumbGPU. Platform RHIs implement these functions to perform GPU immediate writes using the unique ID of each node, for tracking GPU progress. - Format string arguments are captured by-value to remove the cost of string formatting while building the breadcrumb tree. String formatting only occurs when the actual formatted string is required (e.g. during GPU crash breadcrumb stack traversal, or when calling platform GPU profiling APIs). RenderGraph scopes have been simplified: - The separate scope trees / arrays of ops have been combined. There is now a single tree of RDG scopes containing all types. - Each RDG pass holds a pointer to the scope it was created under. - BeginCPU / EndCPU is called on each RDG scope as the various RDG threads enter / exit them. This allows us to mark-up each worker thread with the relevant Unreal Insights scopes. Other changes include: - Fixes for bugs uncovered when parallel translate was enabled. - Adjusted platform affinities necessary due to the new layout of thread tasks in the renderer. - Refactored RHI draw call stats to better fit the new pipeline design. #rb jeannoe.morissette, zach.bethel #jira UE-139543 [CL 30973133 by Luke Thatcher in ue5-main branch]
2024-01-29 12:47:28 -05:00
DumpScopes(Pass->Scope, DumpScopes);
}
}
#endif
// Scratch storage for one resource's transient allocation data, kept as parallel arrays so that
// each array can be passed straight to the matching trace array field. The single instance
// declared here (TransientAllocation) is cleared and refilled for every resource.
struct FTransientAllocation
{
	TArray<uint64> OffsetMins;
	TArray<uint64> OffsetMaxs;
	TArray<uint16> MemoryRanges;
	bool bCacheHit = false;

	// Empties all three arrays and clears the cache-hit flag so the instance can be reused.
	void Reset()
	{
		bCacheHit = false;
		MemoryRanges.Reset();
		OffsetMaxs.Reset();
		OffsetMins.Reset();
	}

	// Appends one entry per allocation recorded for Resource in Stats.Resources. The lookup uses
	// FindChecked, i.e. Resource is expected to be present in the map. Does not clear previous
	// contents; callers invoke Reset() first.
	void Fill(const FRHITransientAllocationStats& Stats, const FRHITransientResource* Resource)
	{
		for (const FRHITransientAllocationStats::FAllocation& Entry : Stats.Resources.FindChecked(Resource))
		{
			OffsetMins.Emplace(Entry.OffsetMin);
			OffsetMaxs.Emplace(Entry.OffsetMax);
			MemoryRanges.Emplace(Entry.MemoryRangeIndex);
		}

		// A resource acquired more than once is reported as a cache hit.
		bCacheHit = Resource->GetAcquireCount() > 1;
	}
} TransientAllocation;
// Acquire / discard pass handles for the resource most recently given to
// FillTransientResourceArrays. They stay default-constructed when the resource is null or the
// corresponding pass index equals FRHITransientResource::kInvalidPassIndex.
FRDGPassHandle TransientAcquirePass;
FRDGPassHandle TransientDiscardPass;

// Refreshes TransientAllocation and the two handles above for Resource.
// All three are always reset first, so a null Resource leaves them empty / default.
// When bRemoveFromStats is true, the resource's entry is removed from
// TransientAllocationStats.Resources after it has been read.
const auto FillTransientResourceArrays = [&](const FRHITransientResource* Resource, bool bRemoveFromStats)
{
	TransientAcquirePass = {};
	TransientDiscardPass = {};
	TransientAllocation.Reset();

	if (!Resource)
	{
		return;
	}

	TransientAllocation.Fill(TransientAllocationStats, Resource);

	if (Resource->GetAcquirePass() != FRHITransientResource::kInvalidPassIndex)
	{
		TransientAcquirePass = FRDGPassHandle(Resource->GetAcquirePass());
	}

	if (Resource->GetDiscardPass() != FRHITransientResource::kInvalidPassIndex)
	{
		TransientDiscardPass = FRDGPassHandle(Resource->GetDiscardPass());
	}

	if (bRemoveFromStats)
	{
		TransientAllocationStats.Resources.Remove(Resource);
	}
};
// Emit one TextureMessage trace event for every texture in the Textures registry.
for (FRDGTextureHandle Handle = Textures.Begin(); Handle != Textures.End(); ++Handle)
{
const FRDGTexture* Texture = Textures[Handle];
// Stays 0 when the texture has no RHI resource (GetRHIUnchecked() returned null).
uint64 SizeInBytes = 0;
if (FRHITexture* TextureRHI = Texture->GetRHIUnchecked())
{
if (Texture->TransientTexture)
{
// Transient textures report the size stored on the transient resource.
SizeInBytes = Texture->TransientTexture->GetSize();
}
else
{
// Otherwise the size is computed from the RHI texture.
SizeInBytes = RHIComputeMemorySize(TextureRHI);
}
}
// Refresh TransientAllocation / TransientAcquirePass / TransientDiscardPass for this texture (they are all
// reset when TransientTexture is null) and remove its entry from TransientAllocationStats.Resources, so the
// later loop over the remaining stats entries does not visit this resource again.
const bool bRemoveFromStats = true;
FillTransientResourceArrays(Texture->TransientTexture, bRemoveFromStats);
// IsCulled is derived from a zero ReferenceCount. IsTransientUntracked is always false in this loop.
UE_TRACE_LOG(RDGTrace, TextureMessage, RDGChannel)
<< TextureMessage.Name(Texture->Name, uint16(FCString::Strlen(Texture->Name)))
<< TextureMessage.Handle(Handle.GetIndex())
<< TextureMessage.NextOwnerHandle(Texture->NextOwner.GetIndexUnchecked())
<< TextureMessage.Order(Texture->TraceOrder)
<< TextureMessage.Passes((const uint32*)Texture->TracePasses.GetData(), (uint32)Texture->TracePasses.Num())
<< TextureMessage.TransientAllocationOffsetMins(TransientAllocation.OffsetMins.GetData(), TransientAllocation.OffsetMins.Num())
<< TextureMessage.TransientAllocationOffsetMaxs(TransientAllocation.OffsetMaxs.GetData(), TransientAllocation.OffsetMaxs.Num())
<< TextureMessage.TransientAllocationMemoryRanges(TransientAllocation.MemoryRanges.GetData(), TransientAllocation.MemoryRanges.Num())
<< TextureMessage.TransientAcquirePass(TransientAcquirePass.GetIndexUnchecked())
<< TextureMessage.TransientDiscardPass(TransientDiscardPass.GetIndexUnchecked())
<< TextureMessage.SizeInBytes(SizeInBytes)
<< TextureMessage.CreateFlags(uint32(Texture->Desc.Flags))
<< TextureMessage.Dimension(uint32(Texture->Desc.Dimension))
<< TextureMessage.Format(uint32(Texture->Desc.Format))
<< TextureMessage.ExtentX(Texture->Desc.Extent.X)
<< TextureMessage.ExtentY(Texture->Desc.Extent.Y)
<< TextureMessage.Depth(Texture->Desc.Depth)
<< TextureMessage.ArraySize(Texture->Desc.ArraySize)
<< TextureMessage.NumMips(Texture->Desc.NumMips)
<< TextureMessage.NumSamples(Texture->Desc.NumSamples)
<< TextureMessage.IsExternal(bool(Texture->bExternal))
<< TextureMessage.IsExtracted(bool(Texture->bExtracted))
<< TextureMessage.IsCulled(bool(Texture->ReferenceCount == 0))
<< TextureMessage.IsTrackingSkipped(EnumHasAnyFlags(Texture->Flags, ERDGTextureFlags::SkipTracking))
<< TextureMessage.IsTransient(bool(Texture->bTransient))
<< TextureMessage.IsTransientUntracked(false)
<< TextureMessage.IsTransientCacheHit(TransientAllocation.bCacheHit)
<< TextureMessage.IsHandleType32Bits(true);
}
// Emit one BufferMessage trace event for every buffer in the Buffers registry.
for (FRDGBufferHandle Handle = Buffers.Begin(); Handle != Buffers.End(); ++Handle)
{
const FRDGBuffer* Buffer = Buffers[Handle];
// Refresh TransientAllocation / TransientAcquirePass / TransientDiscardPass for this buffer (they are all
// reset when TransientBuffer is null) and remove its entry from TransientAllocationStats.Resources, so the
// later loop over the remaining stats entries does not visit this resource again.
const bool bRemoveFromStats = true;
FillTransientResourceArrays(Buffer->TransientBuffer, bRemoveFromStats);
// The traced handle is read from Buffer->Handle rather than the loop variable.
// IsCulled is derived from a zero ReferenceCount. IsTransientUntracked is always false in this loop.
UE_TRACE_LOG(RDGTrace, BufferMessage, RDGChannel)
<< BufferMessage.Name(Buffer->Name, uint16(FCString::Strlen(Buffer->Name)))
<< BufferMessage.Handle(Buffer->Handle.GetIndex())
<< BufferMessage.NextOwnerHandle(Buffer->NextOwner.GetIndexUnchecked())
<< BufferMessage.Order(Buffer->TraceOrder)
<< BufferMessage.Passes((const uint32*)Buffer->TracePasses.GetData(), (uint32)Buffer->TracePasses.Num())
<< BufferMessage.TransientAllocationOffsetMins(TransientAllocation.OffsetMins.GetData(), TransientAllocation.OffsetMins.Num())
<< BufferMessage.TransientAllocationOffsetMaxs(TransientAllocation.OffsetMaxs.GetData(), TransientAllocation.OffsetMaxs.Num())
<< BufferMessage.TransientAllocationMemoryRanges(TransientAllocation.MemoryRanges.GetData(), TransientAllocation.MemoryRanges.Num())
<< BufferMessage.TransientAcquirePass(TransientAcquirePass.GetIndexUnchecked())
<< BufferMessage.TransientDiscardPass(TransientDiscardPass.GetIndexUnchecked())
<< BufferMessage.UsageFlags(uint32(Buffer->Desc.Usage))
<< BufferMessage.BytesPerElement(Buffer->Desc.BytesPerElement)
<< BufferMessage.NumElements(Buffer->Desc.NumElements)
<< BufferMessage.IsExternal(bool(Buffer->bExternal))
<< BufferMessage.IsExtracted(bool(Buffer->bExtracted))
<< BufferMessage.IsCulled(bool(Buffer->ReferenceCount == 0))
<< BufferMessage.IsTrackingSkipped(EnumHasAnyFlags(Buffer->Flags, ERDGBufferFlags::SkipTracking))
<< BufferMessage.IsTransient(bool(Buffer->bTransient))
<< BufferMessage.IsTransientUntracked(false)
<< BufferMessage.IsTransientCacheHit(TransientAllocation.bCacheHit)
<< BufferMessage.IsHandleType32Bits(true);
}
// Entries still present in TransientAllocationStats.Resources at this point were not removed by the
// texture / buffer loops above. They are traced as "untracked" transient resources, with synthetic
// handle indices that continue after the last registry index of each kind.
int32 TextureIndex = Textures.Num();
int32 BufferIndex = Buffers.Num();

// Iterate by const reference: taking the pair by value would copy the resource's allocation array on
// every iteration. The map is not modified inside the loop (bRemoveFromStats is false), so the
// reference stays valid.
for (const auto& KeyValue : TransientAllocationStats.Resources)
{
	const FRHITransientResource* Resource = KeyValue.Key;

	// Only resources that are currently acquired are reported.
	if (!Resource->IsAcquired())
	{
		continue;
	}

	// Must not remove from the map while it is being iterated.
	const bool bRemoveFromStats = false;
	FillTransientResourceArrays(Resource, bRemoveFromStats);

	if (Resource->GetResourceType() == ERHITransientResourceType::Texture)
	{
		const FRHITransientTexture* Texture = static_cast<const FRHITransientTexture*>(Resource);

		// Fields that only exist on RDG textures (next owner, order, passes) are not written here.
		UE_TRACE_LOG(RDGTrace, TextureMessage, RDGChannel)
			<< TextureMessage.Name(Texture->GetName(), uint16(FCString::Strlen(Texture->GetName())))
			<< TextureMessage.Handle(TextureIndex)
			<< TextureMessage.TransientAllocationOffsetMins(TransientAllocation.OffsetMins.GetData(), TransientAllocation.OffsetMins.Num())
			<< TextureMessage.TransientAllocationOffsetMaxs(TransientAllocation.OffsetMaxs.GetData(), TransientAllocation.OffsetMaxs.Num())
			<< TextureMessage.TransientAllocationMemoryRanges(TransientAllocation.MemoryRanges.GetData(), TransientAllocation.MemoryRanges.Num())
			<< TextureMessage.TransientAcquirePass(TransientAcquirePass.GetIndexUnchecked())
			<< TextureMessage.TransientDiscardPass(TransientDiscardPass.GetIndexUnchecked())
			<< TextureMessage.SizeInBytes(Resource->GetSize())
			<< TextureMessage.CreateFlags(uint32(Texture->CreateInfo.Flags))
			<< TextureMessage.Dimension(uint32(Texture->CreateInfo.Dimension))
			<< TextureMessage.Format(uint32(Texture->CreateInfo.Format))
			<< TextureMessage.ExtentX(Texture->CreateInfo.Extent.X)
			<< TextureMessage.ExtentY(Texture->CreateInfo.Extent.Y)
			<< TextureMessage.Depth(Texture->CreateInfo.Depth)
			<< TextureMessage.ArraySize(Texture->CreateInfo.ArraySize)
			<< TextureMessage.NumMips(Texture->CreateInfo.NumMips)
			<< TextureMessage.NumSamples(Texture->CreateInfo.NumSamples)
			<< TextureMessage.IsExternal(false)
			<< TextureMessage.IsExtracted(false)
			<< TextureMessage.IsCulled(false)
			<< TextureMessage.IsTrackingSkipped(false)
			<< TextureMessage.IsTransient(true)
			<< TextureMessage.IsTransientUntracked(true)
			<< TextureMessage.IsTransientCacheHit(TransientAllocation.bCacheHit)
			<< TextureMessage.IsHandleType32Bits(true);

		TextureIndex++;
	}
	else
	{
		const FRHITransientBuffer* Buffer = static_cast<const FRHITransientBuffer*>(Resource);

		// Derive the element count from size / stride, guarding against a zero stride so a buffer
		// created without one cannot trigger an integer division by zero; it is reported with 0 elements.
		// NOTE(review): whether a transient buffer can actually have a zero stride is not visible here.
		const auto Stride = Buffer->CreateInfo.Stride;
		const uint32 NumElements = (Stride != 0) ? uint32(Buffer->CreateInfo.Size / Stride) : 0u;

		// Fields that only exist on RDG buffers (next owner, order, passes) are not written here.
		UE_TRACE_LOG(RDGTrace, BufferMessage, RDGChannel)
			<< BufferMessage.Name(Buffer->GetName(), uint16(FCString::Strlen(Buffer->GetName())))
			<< BufferMessage.Handle(BufferIndex)
			<< BufferMessage.TransientAllocationOffsetMins(TransientAllocation.OffsetMins.GetData(), TransientAllocation.OffsetMins.Num())
			<< BufferMessage.TransientAllocationOffsetMaxs(TransientAllocation.OffsetMaxs.GetData(), TransientAllocation.OffsetMaxs.Num())
			<< BufferMessage.TransientAllocationMemoryRanges(TransientAllocation.MemoryRanges.GetData(), TransientAllocation.MemoryRanges.Num())
			<< BufferMessage.TransientAcquirePass(TransientAcquirePass.GetIndexUnchecked())
			<< BufferMessage.TransientDiscardPass(TransientDiscardPass.GetIndexUnchecked())
			<< BufferMessage.UsageFlags(uint32(Buffer->CreateInfo.Usage))
			<< BufferMessage.BytesPerElement(Stride)
			<< BufferMessage.NumElements(NumElements)
			<< BufferMessage.IsExternal(false)
			<< BufferMessage.IsExtracted(false)
			<< BufferMessage.IsCulled(false)
			<< BufferMessage.IsTrackingSkipped(false)
			<< BufferMessage.IsTransient(true)
			<< BufferMessage.IsTransientUntracked(true)
			<< BufferMessage.IsTransientCacheHit(TransientAllocation.bCacheHit)
			<< BufferMessage.IsHandleType32Bits(true);

		BufferIndex++;
	}
}
UE_TRACE_LOG(RDGTrace, GraphEndMessage, RDGChannel);
}
Added RHI tracked access API to remove Unknown transitions. - New RHI command list SetTrackedAccess method for the user to supply a current whole-resource state. - New RHI command context GetTrackedAccess method for querying the tracked access in RHIBeginTransitions / RHIEndTransitions on the RHI thread. - Hooked RHICmdList.Transition and FRHICommandListExecutor::Transition to assign tracked state automatically. - Refactored RDG and resource pools to use new RHI tracking. - FRDGPooledBuffer / FRDGPooledTexture no longer contain tracked state. RDG temp-allocates state through the graph allocator instead. - All prologue transitions are 'Unknown', and all epilogue transitions coalesce into a whole resource state. - Implemented platform support for patching the 'before' state with the tracked state. - Implemented various RHI validation checks: - Asserts that the user assigned tracked state matches RHI validation tracked state, for all subresources. - Asserts that tracked state is not assigned or queried from a parallel translation context. - Added FRHIViewableResource and FRHIView base classes to RHI. FRHIView contains a pointer to an FRHIViewableResource. This is currently a raw pointer, but should be extended to a full reference in a later CL. NOTE on RHI thread constraint: Transition evaluation is now restricted to the RHI thread (i.e. no parallel translation contexts). Transitions aren't performed in parallel translate contexts anyway, so this is not a problem. If, however, we decide to refactor parallel translation to be more general, this implementation could be extended to track the state per context and update from the 'dispatch' thread. #preflight 6233b4396666d7e753a16aaf #rb kenzo.terelst [CL 19513316 by zach bethel in ue5-main branch]
2022-03-25 11:19:10 -04:00
/** Stamps the resource with the current value of the ResourceOrder counter and advances the counter. */
void FRDGTrace::AddResource(FRDGViewableResource* Resource)
{
	// Runs unconditionally; there is no IsEnabled() check here.
	const auto AssignedOrder = ResourceOrder++;
	Resource->TraceOrder = AssignedOrder;
}
/**
 * Records that Pass uses Texture, in both directions: the texture handle is appended to the pass's
 * trace list and the pass handle to the texture's trace list. Does nothing when IsEnabled() is false.
 */
void FRDGTrace::AddTexturePassDependency(FRDGTexture* Texture, FRDGPass* Pass)
{
	if (IsEnabled())
	{
		Pass->TraceTextures.Add(Texture->Handle);
		Texture->TracePasses.Add(Pass->Handle);
	}
}
/**
 * Records that Pass uses Buffer, in both directions: the buffer handle is appended to the pass's
 * trace list and the pass handle to the buffer's trace list. Does nothing when IsEnabled() is false.
 */
void FRDGTrace::AddBufferPassDependency(FRDGBuffer* Buffer, FRDGPass* Pass)
{
	if (IsEnabled())
	{
		Pass->TraceBuffers.Add(Buffer->Handle);
		Buffer->TracePasses.Add(Pass->Handle);
	}
}
#endif