Before dispatch change

This commit is contained in:
Sauraen
2024-02-27 21:30:23 -08:00
parent aa5acc439f
commit 211b4c86d2
3 changed files with 126 additions and 82 deletions

View File

@@ -8,10 +8,12 @@ default: F3DEX3_BrZ F3DEX3_BrW
ALL_OPTIONS := \
CFG_G_BRANCH_W \
CFG_DEBUG_NORMALS \
CFG_GCLK_SAMPLE
CFG_PROFILING_A \
CFG_PROFILING_B \
CFG_PROFILING_C
ARMIPS ?= armips
PARENT_OUTPUT_DIR ?= ../test
PARENT_OUTPUT_DIR ?= ./build
ifeq ($(PARENT_OUTPUT_DIR),.)
$(error Cannot build directly in repo directory; see Makefile for details.)
# The problem is that we want to be able to have targets like F3DEX2_2.08,
@@ -129,17 +131,15 @@ $(eval $(call reset_vars))
NAME := F3DEX3_BrZ
DESCRIPTION := Will make you want to finally ditch HLE (G_BRANCH_Z version)
ID_STR := F3DEX3 by Sauraen & Nintendo, G_BRANCH_Z version______________________
# Add options you want here, e.g. CFG_GCLK_SAMPLE
# Add options you want here, e.g. CFG_PROFILING_A
OPTIONS :=
$(eval $(call ucode_rule))
NAME := F3DEX3_BrW
DESCRIPTION := Will make you want to finally ditch HLE (G_BRANCH_W version)
ID_STR := F3DEX3 by Sauraen & Nintendo, G_BRANCH_W version______________________
# Add options you want here, e.g. CFG_GCLK_SAMPLE
OPTIONS := \
CFG_GCLK_SAMPLE \
CFG_G_BRANCH_W
# Add options you want here, e.g. CFG_PROFILING_A
OPTIONS := CFG_PROFILING_A CFG_G_BRANCH_W
$(eval $(call ucode_rule))
.PHONY: default ok all clean

View File

@@ -1,27 +1,83 @@
/* This example code is for HackerOoT. The F3DEX3PerfCounters struct and the
method of reading it will be the same for any other game. */
/* This example code is for HackerOoT. The structs and the general method of
reading the counters will be the same for any game.
Build the microcode with one of the CFG_PROFILING_* options below to select one
of these sets of performance counters, or without any CFG_PROFILING_* option for
the default set. You can even include all the microcode versions in your game,
and let the player/developer swap which one is used for a given frame in order
to switch which set of performance counters they're seeing. You only need to
keep the currently used one in RDRAM; you can load a different one from the cart
over it when the user swaps.
For the options other than the default, the microcode uses the RDP's CLK counter
for its own timing. You should clear this counter just before launching F3DEX3
on the RSP (in the graphics task setup); usually you'd also read the counter
value, to optionally print on screen, after the RDP is finished. Make sure not
to clear/modify the CLK counter while the RSP is running, or the profiling
results may be garbage.
*/
/* In some header, needs to be accessible to variables.h */
/* View of the four 32-bit performance counter words as written by a microcode
   build with no CFG_PROFILING_* option. Field order and bitfield widths must
   match what the microcode writes, so do not reorder or resize them. */
typedef struct { /* Default performance counters, if no CFG_PROFILING_* is enabled */
/* Number of vertices processed by the RSP */
u16 vertexCount;
/* Number of tris actually drawn, after clipping and all types of culling */
u16 rdpOutTriCount;
/* Number of tris for which processing started on the RSP (before clipping / culling) */
u32 rspInTriCount:18;
/* Number of fill rects and tex rects drawn */
u32 rectCount:14;
/* Number of cycles the RSP was stalled because the RDP FIFO was full */
u32 stallRDPFifoFullCycles;
/* Fourth counter word; the microcode's notes list it as unused/zero in the
   default configuration. Kept so all configurations have the same size. */
u32 dummy;
} F3DEX3ProfilingDefault;
/* View of the four 32-bit performance counter words as written by a microcode
   build with CFG_PROFILING_A. Field order must match what the microcode
   writes, so do not reorder or resize the fields. */
typedef struct { /* Counters for CFG_PROFILING_A */
/* NOTE(review): presumably cycles the RSP spent processing vertex commands;
   confirm against the microcode source. */
u32 vertexProcCycles;
/* NOTE(review): presumably the number of display list commands fetched into
   the input buffer, as opposed to actually processed; confirm against the
   microcode source. */
u16 fetchedDLCommandCount;
/* Number of display list commands the microcode processed */
u16 dlCommandCount;
/* Number of cycles the RSP was stalled because the RDP FIFO was full */
u32 stallRDPFifoFullCycles;
/* Cycles the RSP spent processing triangle commands (including buffer flushes) */
u32 triProcCycles;
} F3DEX3ProfilingA;
/* View of the four 32-bit performance counter words as written by a microcode
   build with CFG_PROFILING_B. Three of the words are split into an 18-bit and
   a 14-bit counter; field order and bitfield widths must match what the
   microcode writes, so do not reorder or resize them. */
typedef struct { /* Counters for CFG_PROFILING_B */
/* Number of vertices processed by the RSP */
u16 vertexCount;
/* NOTE(review): presumably the number of those vertices that went through
   lighting; confirm against the microcode source. */
u16 litVertexCount;
u32 smallRDPCommandCount:18; /* All RDP commands except tris */
u32 clippedTriCount:14; /* Number of RSP/input triangles which got clipped */
/* NOTE(review): presumably the total number of overlay loads of any kind;
   confirm against the microcode source. */
u32 allOverlayLoadCount:18;
/* NOTE(review): presumably the number of times the lighting overlay was
   loaded; confirm against the microcode source. */
u32 lightingOverlayLoadCount:14;
/* Number of times overlay 3 (clipping) was loaded */
u32 clippingOverlayLoadCount:18;
/* Number of times overlay 4 (misc) was loaded */
u32 miscOverlayLoadCount:14;
} F3DEX3ProfilingB;
/* View of the four 32-bit performance counter words as written by a microcode
   build with CFG_PROFILING_C. Field order must match what the microcode
   writes, so do not reorder or resize the fields. */
typedef struct { /* Counters for CFG_PROFILING_C */
/* Total cycles F3DEX3 believes it was running, not including SPLoadUcode */
u32 ex3UcodeCycles;
/* The "GCLK is alive" bit of the RDP status is sampled once every time a
display list command is started. This counts the number of times that bit
was 1. Divide by dlCommandCount to get an approximate measurement of the
percentage of time the RDP was doing useful work, as opposed to waiting
for framebuffer / Z buffer memory transactions to complete. */
u16 commandsSampledGclkActive;
/* Number of display list commands the microcode processed; this is the
   denominator for commandsSampledGclkActive. */
u16 dlCommandCount;
/* Number of cycles the RSP was stalled because the RDP FIFO was full */
u32 stallRDPFifoFullCycles;
/* Number of cycles the RSP was stalled waiting for miscellaneous DMAs to finish */
u32 stallDMACycles;
} F3DEX3ProfilingC;
/* Layout of the last sizeof(F3DEX3YieldDataFooter) bytes of the graphics
   task's yield data buffer, which the example code reads after the task
   completes. Which union member is meaningful depends on which
   CFG_PROFILING_* option (if any) the running microcode was built with. */
typedef struct {
/* The four 32-bit counter words, viewed according to the microcode build. */
union {
F3DEX3ProfilingDefault def;
F3DEX3ProfilingA a;
F3DEX3ProfilingB b;
F3DEX3ProfilingC c;
/* NOTE(review): presumably present only to force 8-byte alignment and a
   16-byte size for the union; not a counter. */
u64 dummy_alignment[2];
};
u32 taskdataptr; /* Not a perf counter, can ignore */
u32 ucode; /* Not a perf counter, can ignore */
} F3DEX3YieldDataFooter;
/* In variables.h with the ENABLE_SPEEDMETER section */
extern volatile u32 gRSPGfxRDPWaitCycles;
extern volatile u16 gRSPGfxCommandsSampledGclkActive;
extern volatile u16 gRSPGfxCommandCount;
extern volatile u16 gRSPGfxVertexCount;
extern volatile u16 gRSPGfxTriDrawCount;
extern volatile u32 gRSPGfxTriRequestCount;
extern volatile u16 gRSPGfxRectCount;
/* In sched.c somewhere before Sched_TaskComplete, or in some header */
/* Layout of the last sizeof(F3DEX3YieldDataFooter) bytes of the graphics
   task's yield data buffer, which the example code reads after the task
   completes. Field order and bitfield widths must match what the microcode
   writes, so do not reorder or resize them. */
typedef struct {
/* Number of cycles the RSP is waiting for space in the RDP FIFO in DRAM */
u32 rdpWaitCycles;
/* If CFG_GCLK_SAMPLE is enabled, the number of display list commands at
   whose start the "GCLK is alive" bit of the RDP status was 1 */
u16 commandsSampledGclkActive;
/* Number of display list commands the microcode processed; zero if
   CFG_GCLK_SAMPLE is disabled, so guard any division by it */
u16 commandCount;
/* Number of vertices processed by the RSP */
u16 vertexCount;
/* Number of tris actually drawn, after clipping and all types of culling */
u16 triDrawCount;
/* Number of tris for which processing started on the RSP (before clipping / culling) */
u32 triRequestCount:18;
/* Number of fill rects and tex rects drawn */
u32 rectCount:14;
u32 taskdataptr; /* Not a perf counter */
u32 ucode; /* Not a perf counter */
} F3DEX3YieldDataFooter;
extern volatile F3DEX3YieldDataFooter gRSPProfilingResults;
/* In the true codepath of Sched_TaskComplete: */
#ifdef ENABLE_SPEEDMETER
@@ -31,35 +87,12 @@ typedef struct {
(u8*)gGfxSPTaskYieldBuffer +
OS_YIELD_DATA_SIZE - sizeof(F3DEX3YieldDataFooter));
osInvalDCache(footer, sizeof(F3DEX3YieldDataFooter));
gRSPGfxRDPWaitCycles = footer->rdpWaitCycles;
gRSPGfxCommandsSampledGclkActive = footer->commandsSampledGclkActive;
gRSPGfxCommandCount = footer->commandCount;
gRSPGfxVertexCount = footer->vertexCount;
gRSPGfxTriDrawCount = footer->triDrawCount;
gRSPGfxTriRequestCount = footer->triRequestCount;
gRSPGfxRectCount = footer->rectCount;
bcopy(footer, &gRSPProfilingResults, sizeof(F3DEX3YieldDataFooter));
}
#endif
/* In speed_meter.c */
/* Number of cycles the RSP is waiting for space in the RDP FIFO in DRAM */
volatile u32 gRSPGfxRDPWaitCycles;
/* If CFG_GCLK_SAMPLE is enabled, the "GCLK is alive" bit of the RDP status is
sampled once every time a display list command is started. This counts the
number of times that bit was 1. */
volatile u16 gRSPGfxCommandsSampledGclkActive;
/* Number of display list commands the microcode processed. If CFG_GCLK_SAMPLE
is disabled, this will be zero, so be careful about dividing the GCLK sample
count above by this. */
volatile u16 gRSPGfxCommandCount;
/* Number of vertices processed by the RSP */
volatile u16 gRSPGfxVertexCount;
/* Number of tris actually drawn, after clipping and all types of culling */
volatile u16 gRSPGfxTriDrawCount;
/* Number of tris which processing started on the RSP (before clipping / culling) */
volatile u32 gRSPGfxTriRequestCount;
/* Number of fill rects and tex rects drawn */
volatile u16 gRSPGfxRectCount;
volatile F3DEX3YieldDataFooter gRSPProfilingResults;
/* You can display them on screen however you wish. Here is an example, in
SpeedMeter_DrawTimeEntries */
@@ -74,14 +107,21 @@ gSPDisplayList(OVERLAY_DISP++, gfx);
GfxPrint_Open(&printer, gfx);
GfxPrint_SetColor(&printer, 255, 100, 0, 255);
GfxPrint_SetPos(&printer, 33, 25);
GfxPrint_Printf(&printer, "%5dV", gRSPGfxVertexCount);
GfxPrint_SetPos(&printer, 33, 26);
GfxPrint_Printf(&printer, "%5dt", gRSPGfxTriRequestCount);
GfxPrint_SetPos(&printer, 33, 27);
GfxPrint_Printf(&printer, "%5dT", gRSPGfxTriDrawCount);
GfxPrint_SetPos(&printer, 33, 28);
GfxPrint_Printf(&printer, "%5dR", gRSPGfxRectCount);
if(f3dex3_version_CFG_PROFILING_A){
}else if(f3dex3_version_CFG_PROFILING_B){
...
}else if(f3dex3_version_CFG_PROFILING_C){
...
}else{
GfxPrint_SetPos(&printer, 33, 25);
GfxPrint_Printf(&printer, "%5dV", gRSPProfilingResults.def.vertexCount);
GfxPrint_SetPos(&printer, 33, 26);
GfxPrint_Printf(&printer, "%5dt", gRSPProfilingResults.def.rspInTriCount);
GfxPrint_SetPos(&printer, 33, 27);
GfxPrint_Printf(&printer, "%5dT", gRSPProfilingResults.def.rdpOutTriCount);
...
}
gfx = GfxPrint_Close(&printer);
gSPEndDisplayList(gfx++);

View File

@@ -63,11 +63,6 @@ ACC_LOWER equ 2
// are removed, i.e. G_LIGHTTORDP behaves as a no-op and all tris are smooth
// shaded.
//
ENABLE_PROFILING equ 0
COUNTER_A_UPPER_VERTEX_COUNT equ 0
COUNTER_B_LOWER_CMD_COUNT equ 0
COUNTER_C_FIFO_FULL equ 1
NEED_START_COUNTER_DMEM equ 0
// Config A TODO
// perfCounterA:
@@ -80,10 +75,14 @@ NEED_START_COUNTER_DMEM equ 0
// perfCounterD:
// cycles RSP spent processing triangle commands (incl. buffer flushes)
.if CFG_PROFILING_A
ENABLE_PROFILING equ 1
COUNTER_B_LOWER_CMD_COUNT equ 1
NEED_START_COUNTER_DMEM equ 1
.if CFG_PROFILING_B || CFG_PROFILING_C
.error "At most one CFG_PROFILING_ option can be enabled at a time"
.endif
ENABLE_PROFILING equ 1
COUNTER_A_UPPER_VERTEX_COUNT equ 0
COUNTER_B_LOWER_CMD_COUNT equ 1
COUNTER_C_FIFO_FULL equ 1
NEED_START_COUNTER_DMEM equ 1
// Config B TODO
// perfCounterA:
@@ -98,18 +97,19 @@ NEED_START_COUNTER_DMEM equ 1
// perfCounterD:
// upper 18 bits: overlay 3 (clipping) load count TODO
// lower 14 bits: overlay 4 (misc) load count TODO
.if CFG_PROFILING_B
.if ENABLE_PROFILING
.elseif CFG_PROFILING_B
.if CFG_PROFILING_C
.error "At most one CFG_PROFILING_ option can be enabled at a time"
.endif
ENABLE_PROFILING equ 1
COUNTER_C_FIFO_FULL equ 0
COUNTER_A_UPPER_VERTEX_COUNT equ 1
.endif
COUNTER_B_LOWER_CMD_COUNT equ 0
COUNTER_C_FIFO_FULL equ 0
NEED_START_COUNTER_DMEM equ 0
// Config C TODO
// perfCounterA:
// cycles RSP believes it was running
// cycles RSP believes it was running (this ucode only)
// perfCounterB:
// upper 16 bits: samples GCLK was alive (sampled once per DL command count)
// lower 16 bits: DL command count
@@ -117,14 +117,12 @@ COUNTER_A_UPPER_VERTEX_COUNT equ 1
// cycles RSP was stalled because RDP FIFO was full
// perfCounterD:
// cycles RSP was stalled waiting for miscellaneous DMAs to finish
.if CFG_PROFILING_C
.if ENABLE_PROFILING
.error "At most one CFG_PROFILING_ option can be enabled at a time"
.endif
.elseif CFG_PROFILING_C
ENABLE_PROFILING equ 1
COUNTER_A_UPPER_VERTEX_COUNT equ 0
COUNTER_B_LOWER_CMD_COUNT equ 1
COUNTER_C_FIFO_FULL equ 1
NEED_START_COUNTER_DMEM equ 1
.endif
// Default (extra profiling disabled)
// perfCounterA:
@@ -137,9 +135,15 @@ NEED_START_COUNTER_DMEM equ 1
// cycles RSP was stalled because RDP FIFO was full
// perfCounterD:
// unused/zero
.if !ENABLE_PROFILING
.else
ENABLE_PROFILING equ 0
COUNTER_A_UPPER_VERTEX_COUNT equ 1
COUNTER_B_LOWER_CMD_COUNT equ 0
COUNTER_C_FIFO_FULL equ 1
NEED_START_COUNTER_DMEM equ 0
.endif
.warning "TODO matrix count"
/*
There are two different memory spaces for the overlays: (a) IMEM and (b) the
@@ -915,7 +919,7 @@ call_ret_common:
j displaylist_dma_with_count
sb $1, displayListStackLength
.if !CFG_GCLK_SAMPLE
.if !ENABLE_PROFILING
G_LIGHTTORDP_handler:
lbu $11, numLightsxSize // Ambient light
lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size