From 211b4c86d2ee1a1ce3d84719a55f5ccc384669e3 Mon Sep 17 00:00:00 2001 From: Sauraen Date: Tue, 27 Feb 2024 21:30:23 -0800 Subject: [PATCH] Before dispatch change --- Makefile | 14 ++--- cpu/counters.c | 150 +++++++++++++++++++++++++++++++------------------ f3dex3.s | 44 ++++++++------- 3 files changed, 126 insertions(+), 82 deletions(-) diff --git a/Makefile b/Makefile index b5a0569..da16bc2 100644 --- a/Makefile +++ b/Makefile @@ -8,10 +8,12 @@ default: F3DEX3_BrZ F3DEX3_BrW ALL_OPTIONS := \ CFG_G_BRANCH_W \ CFG_DEBUG_NORMALS \ - CFG_GCLK_SAMPLE + CFG_PROFILING_A \ + CFG_PROFILING_B \ + CFG_PROFILING_C ARMIPS ?= armips -PARENT_OUTPUT_DIR ?= ../test +PARENT_OUTPUT_DIR ?= ./build ifeq ($(PARENT_OUTPUT_DIR),.) $(error Cannot build directly in repo directory; see Makefile for details.) # The problem is that we want to be able to have targets like F3DEX2_2.08, @@ -129,17 +131,15 @@ $(eval $(call reset_vars)) NAME := F3DEX3_BrZ DESCRIPTION := Will make you want to finally ditch HLE (G_BRANCH_Z version) ID_STR := F3DEX3 by Sauraen & Nintendo, G_BRANCH_Z version______________________ -# Add options you want here, e.g. CFG_GCLK_SAMPLE +# Add options you want here, e.g. CFG_PROFILING_A OPTIONS := $(eval $(call ucode_rule)) NAME := F3DEX3_BrW DESCRIPTION := Will make you want to finally ditch HLE (G_BRANCH_W version) ID_STR := F3DEX3 by Sauraen & Nintendo, G_BRANCH_W version______________________ -# Add options you want here, e.g. CFG_GCLK_SAMPLE -OPTIONS := \ - CFG_GCLK_SAMPLE \ - CFG_G_BRANCH_W +# Add options you want here, e.g. CFG_PROFILING_A +OPTIONS := CFG_PROFILING_A CFG_G_BRANCH_W $(eval $(call ucode_rule)) .PHONY: default ok all clean diff --git a/cpu/counters.c b/cpu/counters.c index a4e177e..4a5223c 100644 --- a/cpu/counters.c +++ b/cpu/counters.c @@ -1,27 +1,83 @@ -/* This example code is for HackerOoT. The F3DEX3PerfCounters struct and the -method of reading it will be the same for any other game. */ +/* This example code is for HackerOoT. 
The structs and the general method of +reading the counters will be the same for any game. + +Build the microcode with one of the CFG_PROFILING_* options below to select one +of these sets of performance counters, or without any CFG_PROFILING_* option for +the default set. You can even include all the microcode versions in your game, +and let the player/developer swap which one is used for a given frame in order +to switch which set of performance counters they're seeing. You only need to +keep the currently used one in RDRAM; you can load a different one from the cart +over it when the user swaps. + +For the options other than the default, the microcode uses the RDP's CLK counter +for its own timing. You should clear this counter just before launching F3DEX3 +on the RSP (in the graphics task setup); usually you'd also read the counter +value after the RDP is finished, e.g. to print it on screen. Make sure not +to clear/modify the CLK counter while the RSP is running, or the profiling +results may be garbage. 
+*/ + +/* In some header, needs to be accessible to variables.h */ +typedef struct { /* Default performance counters, if no CFG_PROFILING_* is enabled */ + /* Number of vertices processed by the RSP */ + u16 vertexCount; + /* Number of tris actually drawn, after clipping and all types of culling */ + u16 rdpOutTriCount; + /* Number of tris which processing started on the RSP (before clipping / culling) */ + u32 rspInTriCount:18; + /* Number of fill rects and tex rects drawn */ + u32 rectCount:14; + u32 stallRDPFifoFullCycles; + u32 dummy; +} F3DEX3ProfilingDefault; + +typedef struct { /* Counters for CFG_PROFILING_A */ + u32 vertexProcCycles; + u16 fetchedDLCommandCount; + u16 dlCommandCount; + u32 stallRDPFifoFullCycles; + u32 triProcCycles; +} F3DEX3ProfilingA; + +typedef struct { /* Counters for CFG_PROFILING_B */ + u16 vertexCount; + u16 litVertexCount; + u32 smallRDPCommandCount:18; /* All RDP commands except tris */ + u32 clippedTriCount:14; /* Number of RSP/input triangles which got clipped */ + u32 allOverlayLoadCount:18; + u32 lightingOverlayLoadCount:14; + u32 clippingOverlayLoadCount:18; + u32 miscOverlayLoadCount:14; +} F3DEX3ProfilingB; + +typedef struct { /* Counters for CFG_PROFILING_C */ + /* Total cycles F3DEX3 believes it was running, not including SPLoadUcode */ + u32 ex3UcodeCycles; + /* The "GCLK is alive" bit of the RDP status is sampled once every time a + display list command is started. This counts the number of times that bit + was 1. Divide by dlCommandCount to get an approximate measurement of the + percentage of time the RDP was doing useful work, as opposed to waiting + for framebuffer / Z buffer memory transactions to complete. 
*/ + u16 commandsSampledGclkActive; + u16 dlCommandCount; + u32 stallRDPFifoFullCycles; + u32 stallDMACycles; +} F3DEX3ProfilingC; + +typedef struct { + union { + F3DEX3ProfilingDefault def; + F3DEX3ProfilingA a; + F3DEX3ProfilingB b; + F3DEX3ProfilingC c; + u64 dummy_alignment[2]; + }; + u32 taskdataptr; /* Not a perf counter, can ignore */ + u32 ucode; /* Not a perf counter, can ignore */ +} F3DEX3YieldDataFooter; /* In variables.h with the ENABLE_SPEEDMETER section */ -extern volatile u32 gRSPGfxRDPWaitCycles; -extern volatile u16 gRSPGfxCommandsSampledGclkActive; -extern volatile u16 gRSPGfxCommandCount; -extern volatile u16 gRSPGfxVertexCount; -extern volatile u16 gRSPGfxTriDrawCount; -extern volatile u32 gRSPGfxTriRequestCount; -extern volatile u16 gRSPGfxRectCount; - -/* In sched.c somewhere before Sched_TaskComplete, or in some header */ -typedef struct { - u32 rdpWaitCycles; - u16 commandsSampledGclkActive; - u16 commandCount; - u16 vertexCount; - u16 triDrawCount; - u32 triRequestCount:18; - u32 rectCount:14; - u32 taskdataptr; /* Not a perf counter */ - u32 ucode; /* Not a perf counter */ -} F3DEX3YieldDataFooter; +extern volatile F3DEX3YieldDataFooter gRSPProfilingResults; /* In the true codepath of Sched_TaskComplete: */ #ifdef ENABLE_SPEEDMETER @@ -31,35 +87,12 @@ typedef struct { (u8*)gGfxSPTaskYieldBuffer + OS_YIELD_DATA_SIZE - sizeof(F3DEX3YieldDataFooter)); osInvalDCache(footer, sizeof(F3DEX3YieldDataFooter)); - gRSPGfxRDPWaitCycles = footer->rdpWaitCycles; - gRSPGfxCommandsSampledGclkActive = footer->commandsSampledGclkActive; - gRSPGfxCommandCount = footer->commandCount; - gRSPGfxVertexCount = footer->vertexCount; - gRSPGfxTriDrawCount = footer->triDrawCount; - gRSPGfxTriRequestCount = footer->triRequestCount; - gRSPGfxRectCount = footer->rectCount; + bcopy(footer, &gRSPProfilingResults, sizeof(F3DEX3YieldDataFooter)); } #endif /* In speed_meter.c */ -/* Number of cycles the RSP is waiting for space in the RDP FIFO in DRAM */ -volatile u32 
gRSPGfxRDPWaitCycles; -/* If CFG_GCLK_SAMPLE is enabled, the "GCLK is alive" bit of the RDP status is -sampled once every time a display list command is started. This counts the -number of times that bit was 1. */ -volatile u16 gRSPGfxCommandsSampledGclkActive; -/* Number of display list commands the microcode processed. If CFG_GCLK_SAMPLE -is disabled, this will be zero, so be careful about dividing the glck cycles -above by this. */ -volatile u16 gRSPGfxCommandCount; -/* Number of vertices processed by the RSP */ -volatile u16 gRSPGfxVertexCount; -/* Number of tris actually drawn, after clipping and all types of culling */ -volatile u16 gRSPGfxTriDrawCount; -/* Number of tris which processing started on the RSP (before clipping / culling) */ -volatile u32 gRSPGfxTriRequestCount; -/* Number of fill rects and tex rects drawn */ -volatile u16 gRSPGfxRectCount; +volatile F3DEX3YieldDataFooter gRSPProfilingResults; /* You can display them on screen however you wish. Here is an example, in SpeedMeter_DrawTimeEntries */ @@ -74,14 +107,21 @@ gSPDisplayList(OVERLAY_DISP++, gfx); GfxPrint_Open(&printer, gfx); GfxPrint_SetColor(&printer, 255, 100, 0, 255); -GfxPrint_SetPos(&printer, 33, 25); -GfxPrint_Printf(&printer, "%5dV", gRSPGfxVertexCount); -GfxPrint_SetPos(&printer, 33, 26); -GfxPrint_Printf(&printer, "%5dt", gRSPGfxTriRequestCount); -GfxPrint_SetPos(&printer, 33, 27); -GfxPrint_Printf(&printer, "%5dT", gRSPGfxTriDrawCount); -GfxPrint_SetPos(&printer, 33, 28); -GfxPrint_Printf(&printer, "%5dR", gRSPGfxRectCount); +if(f3dex3_version_CFG_PROFILING_A){ + +}else if(f3dex3_version_CFG_PROFILING_B){ + ... +}else if(f3dex3_version_CFG_PROFILING_C){ + ... 
+}else{ + GfxPrint_SetPos(&printer, 33, 25); + GfxPrint_Printf(&printer, "%5dV", gRSPProfilingResults.def.vertexCount); + GfxPrint_SetPos(&printer, 33, 26); + GfxPrint_Printf(&printer, "%5dt", gRSPProfilingResults.def.rspInTriCount); + GfxPrint_SetPos(&printer, 33, 27); + GfxPrint_Printf(&printer, "%5dT", gRSPProfilingResults.def.rdpOutTriCount); + ... +} gfx = GfxPrint_Close(&printer); gSPEndDisplayList(gfx++); diff --git a/f3dex3.s b/f3dex3.s index ec6ee72..a0e985c 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -63,11 +63,6 @@ ACC_LOWER equ 2 // are removed, i.e. G_LIGHTTORDP behaves as a no-op and all tris are smooth // shaded. // -ENABLE_PROFILING equ 0 -COUNTER_A_UPPER_VERTEX_COUNT equ 0 -COUNTER_B_LOWER_CMD_COUNT equ 0 -COUNTER_C_FIFO_FULL equ 1 -NEED_START_COUNTER_DMEM equ 0 // Config A TODO // perfCounterA: @@ -80,10 +75,14 @@ NEED_START_COUNTER_DMEM equ 0 // perfCounterD: // cycles RSP spent processing triangle commands (incl. buffer flushes) .if CFG_PROFILING_A -ENABLE_PROFILING equ 1 -COUNTER_B_LOWER_CMD_COUNT equ 1 -NEED_START_COUNTER_DMEM equ 1 +.if CFG_PROFILING_B || CFG_PROFILING_C +.error "At most one CFG_PROFILING_ option can be enabled at a time" .endif +ENABLE_PROFILING equ 1 +COUNTER_A_UPPER_VERTEX_COUNT equ 0 +COUNTER_B_LOWER_CMD_COUNT equ 1 +COUNTER_C_FIFO_FULL equ 1 +NEED_START_COUNTER_DMEM equ 1 // Config B TODO // perfCounterA: @@ -98,18 +97,19 @@ NEED_START_COUNTER_DMEM equ 1 // perfCounterD: // upper 18 bits: overlay 3 (clipping) load count TODO // lower 14 bits: overlay 4 (misc) load count TODO -.if CFG_PROFILING_B -.if ENABLE_PROFILING +.elseif CFG_PROFILING_B +.if CFG_PROFILING_C .error "At most one CFG_PROFILING_ option can be enabled at a time" .endif ENABLE_PROFILING equ 1 -COUNTER_C_FIFO_FULL equ 0 COUNTER_A_UPPER_VERTEX_COUNT equ 1 -.endif +COUNTER_B_LOWER_CMD_COUNT equ 0 +COUNTER_C_FIFO_FULL equ 0 +NEED_START_COUNTER_DMEM equ 0 // Config C TODO // perfCounterA: -// cycles RSP believes it was running +// cycles RSP believes it was 
running (this ucode only) // perfCounterB: // upper 16 bits: samples GCLK was alive (sampled once per DL command count) // lower 16 bits: DL command count @@ -117,14 +117,12 @@ COUNTER_A_UPPER_VERTEX_COUNT equ 1 // cycles RSP was stalled because RDP FIFO was full // perfCounterD: // cycles RSP was stalled waiting for miscellaneous DMAs to finish -.if CFG_PROFILING_C -.if ENABLE_PROFILING -.error "At most one CFG_PROFILING_ option can be enabled at a time" -.endif +.elseif CFG_PROFILING_C ENABLE_PROFILING equ 1 +COUNTER_A_UPPER_VERTEX_COUNT equ 0 COUNTER_B_LOWER_CMD_COUNT equ 1 +COUNTER_C_FIFO_FULL equ 1 NEED_START_COUNTER_DMEM equ 1 -.endif // Default (extra profiling disabled) // perfCounterA: @@ -137,9 +135,15 @@ NEED_START_COUNTER_DMEM equ 1 // cycles RSP was stalled because RDP FIFO was full // perfCounterD: // unused/zero -.if !ENABLE_PROFILING +.else +ENABLE_PROFILING equ 0 COUNTER_A_UPPER_VERTEX_COUNT equ 1 +COUNTER_B_LOWER_CMD_COUNT equ 0 +COUNTER_C_FIFO_FULL equ 1 +NEED_START_COUNTER_DMEM equ 0 + .endif +.warning "TODO matrix count" /* There are two different memory spaces for the overlays: (a) IMEM and (b) the @@ -915,7 +919,7 @@ call_ret_common: j displaylist_dma_with_count sb $1, displayListStackLength -.if !CFG_GCLK_SAMPLE +.if !ENABLE_PROFILING G_LIGHTTORDP_handler: lbu $11, numLightsxSize // Ambient light lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size