mirror of
https://github.com/HackerN64/F3DEX3.git
synced 2026-01-21 10:37:45 -08:00
Added num prims perf counter
This commit is contained in:
18
README.md
18
README.md
@@ -97,6 +97,12 @@ you should expect crashes and graphical issues.**
|
||||
slightly improves RDP draw time for large tris (max of about 500 us per frame,
|
||||
usually much less or zero).
|
||||
|
||||
### Miscellaneous
|
||||
|
||||
- Microcode counts the number of primitives (tris and tex rects) actually sent
|
||||
to the RDP (after culling and clipping), which can be accessed after the task
|
||||
is finished as a performance counter.
|
||||
|
||||
|
||||
## Porting Your Romhack Codebase to F3DEX3
|
||||
|
||||
@@ -189,6 +195,18 @@ similar for other games):
|
||||
are typically drawn between each material change. For more information, see
|
||||
the GBI documentation near this define.
|
||||
|
||||
To read the number-of-primitives counter in OoT, in the `true` codepath of
|
||||
`Sched_TaskComplete`, add this code:
|
||||
```
|
||||
// Fetch number of primitives drawn from yield data
|
||||
if(task->list.t.type == M_GFXTASK){
|
||||
u16* counterAddress = (u16*)((u8*)gGfxSPTaskYieldBuffer + OS_YIELD_DATA_SIZE - 0xA);
|
||||
osInvalDCache(counterAddress, sizeof(u16));
|
||||
gRSPGfxNumPrimsDrawn = *counterAddress;
|
||||
}
|
||||
```
|
||||
with `volatile u16 gRSPGfxNumPrimsDrawn` defined somewhere globally.
|
||||
|
||||
### Recommended Changes (Lighting)
|
||||
|
||||
- Change your game engine lighting code to load all lights in one DMA transfer
|
||||
|
||||
76
f3dex3.s
76
f3dex3.s
@@ -445,9 +445,12 @@ clipPoly2: // \ / \ / \
|
||||
vertexBuffer:
|
||||
.skip (G_MAX_VERTS * vtxSize)
|
||||
|
||||
.if . > OS_YIELD_DATA_SIZE - 8
|
||||
// OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved; the last two words are
|
||||
// the ucode and the DL pointer. Make sure anything past there is temporary.
|
||||
.if . > OS_YIELD_DATA_SIZE - 0xA
|
||||
// OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved; the last data in that is:
|
||||
// -0xA: numPrimsDrawn
|
||||
// -0x8: taskDataPtr
|
||||
// -0x4: ucode
|
||||
// So anything after this must be temporary.
|
||||
// (Input buffer will be reloaded from next instruction in the source DL.)
|
||||
.error "Important things in DMEM will not be saved at yield!"
|
||||
.endif
|
||||
@@ -583,6 +586,7 @@ cmd_w1_dram equ $24 // DL command word 1, which is also DMA DRAM addr
|
||||
cmd_w0 equ $25 // DL command word 0, also holds next tris info
|
||||
taskDataPtr equ $26 // Task data (display list) DRAM pointer
|
||||
inputBufferPos equ $27 // DMEM position within display list input buffer, relative to end
|
||||
numPrimsDrawn equ $28 // Number of tris + tex rects sent to RDP
|
||||
// $ra // Return address
|
||||
|
||||
// Misc scalar regs:
|
||||
@@ -679,15 +683,18 @@ start: // This is at IMEM 0x1080, not the start of IMEM
|
||||
li rdpCmdBufEnd, rdpCmdBuffer1End
|
||||
lw $11, rdpFifoPos
|
||||
lw $12, OSTask + OSTask_flags
|
||||
li $1, SP_CLR_SIG2 | SP_CLR_SIG1 // task done and yielded signals
|
||||
beqz $11, task_init
|
||||
li $1, SP_CLR_SIG2 | SP_CLR_SIG1 // Clear task done and yielded signals
|
||||
beqz $11, initialize_rdp // If RDP FIFO not set up yet, do so
|
||||
mtc0 $1, SP_STATUS
|
||||
andi $12, $12, OS_TASK_YIELDED
|
||||
beqz $12, load_task_ptr // skip init if resumed from yield?
|
||||
sw $zero, OSTask + OSTask_flags
|
||||
j load_overlay1_init // Skip the initialization and go straight to loading overlay 1
|
||||
lw taskDataPtr, OS_YIELD_DATA_SIZE - 8 // Was previously saved here at yield time
|
||||
task_init:
|
||||
andi $12, $12, OS_TASK_YIELDED // Resumed from yield or came from called ucode?
|
||||
beqz $12, continue_from_os_task // If latter, load DL (task data) pointer from OSTask
|
||||
sw $zero, OSTask + OSTask_flags // Clear all task flags, incl. yielded
|
||||
continue_from_yield:
|
||||
lhu numPrimsDrawn, OS_YIELD_DATA_SIZE - 0xA // Load value saved at yield
|
||||
j finish_setup
|
||||
lw taskDataPtr, OS_YIELD_DATA_SIZE - 8 // load DL pointer from yield data
|
||||
|
||||
initialize_rdp:
|
||||
mfc0 $11, DPC_STATUS
|
||||
andi $11, $11, DPC_STATUS_XBUS_DMA
|
||||
bnez $11, wait_dpc_start_valid
|
||||
@@ -713,13 +720,14 @@ wait_dpc_start_valid:
|
||||
mtc0 $2, DPC_END
|
||||
f3dzex_0000111C:
|
||||
sw $2, rdpFifoPos
|
||||
lw $11, matrixStackPtr
|
||||
bnez $11, load_task_ptr
|
||||
lw $11, matrixStackPtr // Initialize matrix stack pointer from OSTask
|
||||
bnez $11, continue_from_os_task // if not yet initialized
|
||||
lw $11, OSTask + OSTask_dram_stack
|
||||
sw $11, matrixStackPtr
|
||||
load_task_ptr:
|
||||
continue_from_os_task:
|
||||
lhu numPrimsDrawn, OSTask + OSTask_type // Upper two bytes of type
|
||||
lw taskDataPtr, OSTask + OSTask_data_ptr
|
||||
load_overlay1_init:
|
||||
finish_setup:
|
||||
li inputBufferPos, 0
|
||||
li cmd_w1_dram, orga(ovl1_start)
|
||||
j load_overlays_0_1
|
||||
@@ -1781,6 +1789,7 @@ tDaDyI equ $v7
|
||||
vmadh tDaDxI, tDaDxI, $v24[1]
|
||||
add rdpCmdBufPtr, rdpCmdBufPtr, $11 // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set
|
||||
vmudl $v29, tDaDyF, $v23[1]
|
||||
addi numPrimsDrawn, numPrimsDrawn, 1
|
||||
vmadm $v29, tDaDyI, $v23[1]
|
||||
vmadn tDaDyF, tDaDyF, $v24[1]
|
||||
sdv tDaDxF[0], 0x0018($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional)
|
||||
@@ -1892,24 +1901,25 @@ dma_write:
|
||||
|
||||
.headersize 0x00001000 - orga()
|
||||
|
||||
// Overlay 0 controls the RDP and also stops the RSP when work is done
|
||||
// Overlay 0 handles three cases of stopping the current microcode.
|
||||
// The action here is controlled by $1. If yielding, $1 > 0. If this was
|
||||
// G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0.
|
||||
ovl0_start:
|
||||
sub $11, rdpCmdBufPtr, rdpCmdBufEnd
|
||||
addi $12, $11, RDP_CMD_BUFSIZE - 1
|
||||
bgezal $12, flush_rdp_buffer
|
||||
nop
|
||||
sh numPrimsDrawn, OS_YIELD_DATA_SIZE - 0xA // Stored here for yield and done
|
||||
jal while_wait_dma_busy
|
||||
lw $24, rdpFifoPos
|
||||
bltz $1, taskdone_and_break // $1 < 0 = Got to the end of the parent DL
|
||||
bltz $1, task_done // $1 < 0 = Got to the end of the parent DL
|
||||
mtc0 $24, DPC_END // Set the end pointer of the RDP so that it starts the task
|
||||
bnez $1, task_yield // $1 > 0 = CPU requested yield
|
||||
add taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr was the DL position just after the currently loaded chunk
|
||||
// If here, G_LOAD_UCODE was executed.
|
||||
load_ucode:
|
||||
lw cmd_w1_dram, (inputBufferEnd - 0x04)(inputBufferPos) // word 1 = ucode code DRAM addr
|
||||
sw taskDataPtr, OSTask + OSTask_data_ptr // Store where we are in the DL
|
||||
sw cmd_w1_dram, OSTask + OSTask_ucode // Store pointer to new ucode about to execute
|
||||
sh numPrimsDrawn, OSTask + OSTask_type // Stored here only when switching ucodes
|
||||
li dmemAddr, start // Beginning of overwritable part of IMEM
|
||||
jal dma_read_write // DMA DRAM read -> IMEM write
|
||||
li dmaLen, (while_wait_dma_busy - start) - 1 // End of overwritable part of IMEM
|
||||
@@ -1928,23 +1938,28 @@ ovl0_start:
|
||||
.error "ovl0_start does not fit within the space before the start of the ucode loaded with G_LOAD_UCODE"
|
||||
.endif
|
||||
|
||||
ucode equ $11
|
||||
status equ $12
|
||||
task_yield:
|
||||
lw ucode, OSTask + OSTask_ucode
|
||||
sw taskDataPtr, OS_YIELD_DATA_SIZE - 8
|
||||
sw ucode, OS_YIELD_DATA_SIZE - 4
|
||||
li status, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals
|
||||
lw $11, OSTask + OSTask_ucode
|
||||
sw taskDataPtr, OS_YIELD_DATA_SIZE - 8 // numPrimsDrawn was saved above
|
||||
sw $11, OS_YIELD_DATA_SIZE - 4
|
||||
li $12, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals
|
||||
lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr
|
||||
li dmemAddr, 0x8000 // 0, but negative = write
|
||||
li dmaLen, OS_YIELD_DATA_SIZE - 1
|
||||
j dma_read_write
|
||||
li $ra, break
|
||||
li $ra, set_status_and_break
|
||||
|
||||
taskdone_and_break:
|
||||
li status, SP_SET_SIG2 // task done signal
|
||||
break:
|
||||
mtc0 status, SP_STATUS
|
||||
task_done:
|
||||
// Copy just the part of the yield data that has the numPrimsDrawn counter.
|
||||
DONE_PART_OF_YIELD_DATA equ 0x10
|
||||
lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr
|
||||
addi cmd_w1_dram, cmd_w1_dram, OS_YIELD_DATA_SIZE - DONE_PART_OF_YIELD_DATA
|
||||
li dmemAddr, 0x8000 | (OS_YIELD_DATA_SIZE - DONE_PART_OF_YIELD_DATA) // negative = write
|
||||
li dmaLen, DONE_PART_OF_YIELD_DATA - 1
|
||||
jal dma_read_write
|
||||
li $12, SP_SET_SIG2 // task done signal
|
||||
set_status_and_break: // $12 is the status to set
|
||||
mtc0 $12, SP_STATUS
|
||||
break 0
|
||||
nop
|
||||
|
||||
@@ -2048,6 +2063,7 @@ G_RDPHALF_2_handler:
|
||||
ldv $v29[0], (texrectWord1)($zero)
|
||||
lw cmd_w0, rdpHalf1Val // load the RDPHALF1 value into w0
|
||||
addi rdpCmdBufPtr, rdpCmdBufPtr, 8
|
||||
addi numPrimsDrawn, numPrimsDrawn, 1
|
||||
j G_RDP_handler
|
||||
sdv $v29[0], -8(rdpCmdBufPtr)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user