Main reorganization but crashes soon

This commit is contained in:
Sauraen
2024-08-21 22:06:55 -07:00
parent 3865db1366
commit 8a6efe3ff8
2 changed files with 142 additions and 129 deletions

View File

@@ -27,8 +27,9 @@ all dual-issue conditions. Instruction alignment is sometimes taken into
account, otherwise assumed to be optimal.
Vertex / lighting numbers assume no special features (texgen, packed normals,
etc.) Tri numbers assume texture, shade, and Z. All numbers assume default
profiling configuration. Empty cells are "not measured yet".
etc.). Tri numbers assume texture, shade, and Z, and no flush of the RDP command buffer.
All numbers assume default profiling configuration. Empty cells are "not
measured yet".
| | F3DEX2 | F3DEX3_LVP_NOC | F3DEX3_LVP | F3DEX3_NOC | F3DEX3 |
|----------------------------|--------|----------------|------------|------------|--------|
@@ -44,6 +45,7 @@ profiling configuration. Empty cells are "not measured yet".
| Vtx pair, 8 dir lts | Can't | 119 | 145 | 385 | 404 |
| Vtx pair, 9 dir lts | Can't | 126 | 152 | 414 | 433 |
| Command dispatch | 12 | 12 | 12 | 12 | 12 |
| Small RDP command | 14 | 5 | 5 | 5 | 5 |
| Only/2nd tri to offscreen | 27 | 29 | 29 | 29 | 29 |
| 1st tri to offscreen | 28 | 29 | 29 | 29 | 29 |
| Only/2nd tri to clip | 32 | 31 | 31 | 31 | 31 |
@@ -54,13 +56,13 @@ profiling configuration. Empty cells are "not measured yet".
| 1st tri to degenerate | 43 | 42 | 42 | 42 | 42 |
| Only/2nd tri to occluded | Can't | Can't | 49 | Can't | 49 |
| 1st tri to occluded | Can't | Can't | 49 | Can't | 49 |
| Only/2nd tri to draw | 172 | 170 | 171 | 170 | 171 |
| 1st tri to draw | 173 | 170 | 171 | 170 | 171 |
| Only/2nd tri to draw | 172 | 166 | 167 | 166 | 167 |
| 1st tri to draw | 173 | 166 | 167 | 166 | 167 |
Tri numbers are measured from the first cycle of the command handler inclusive,
to the first cycle of whatever is after the return exclusive. This is in order
to capture the extra mfc0 to mfc0 stall due to return_routine in F3DEX2.
to the first cycle of whatever runs after the jump to $ra, exclusive. This is in
order to capture the extra latency and stalls in F3DEX2.
Vertex processing time as reported by the performance counter in the `PA`

257
f3dex3.s
View File

@@ -1153,11 +1153,33 @@ displaylist_dma:
wait_for_dma_and_run_next_command:
G_POPMTX_end:
G_MOVEMEM_end:
jal while_wait_dma_busy // wait for the DMA read to finish
.if !CFG_PROFILING_A
j while_wait_dma_busy // wait for the DMA read to finish
li $ra, run_next_DL_command
.if !CFG_LEGACY_VTX_PIPE
G_DMA_IO_handler:
G_BRANCH_WZ_handler:
G_MEMSET_handler:
j ovl234_ovl4_entrypoint // Delay slot is harmless
.endif
load_cmds_handler:
lb $3, materialCullMode
bltz $3, run_next_DL_command // If cull mode is < 0, in mat second time, skip the load
G_RDP_handler:
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Add the second word of the command to the RDP command buffer
G_SYNC_handler:
.if CFG_PROFILING_C
addi perfCounterC, perfCounterC, 0x4000 // Increment small RDP command count
.endif
sw cmd_w0, 0(rdpCmdBufPtr) // Add the command word to the RDP command buffer
addi rdpCmdBufPtr, rdpCmdBufPtr, 8 // Increment the next RDP command pointer by 2 words
check_rdp_buffer_full_and_run_next_cmd:
sub $8, rdpCmdBufPtr, rdpCmdBufEndP1
bgezal $8, flush_rdp_buffer
// $1 on next instr survives flush_rdp_buffer
.if CFG_NO_OCCLUSION_PLANE && CFG_LEGACY_VTX_PIPE && !CFG_PROFILING_A
vertex_end:
.endif
lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store
.if !CFG_PROFILING_A
tri_end:
.endif
@@ -1166,7 +1188,7 @@ G_LIGHTTORDP_handler:
.endif
G_SPNOOP_handler:
run_next_DL_command:
mfc0 $1, SP_STATUS // load the status word into register $1
mfc0 $1, SP_STATUS // load the status word into register $1
lw cmd_w0, (inputBufferEnd)(inputBufferPos) // load the command word into cmd_w0
beqz inputBufferPos, displaylist_dma // load more DL commands if none are left
andi $1, $1, SP_STATUS_SIG0 // check if the task should yield
@@ -1265,19 +1287,6 @@ G_TRI1_handler:
j tri_main
li $ra, tri_end // After done with this tri, exit tri processing
G_VTX_handler:
lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10
lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode
srl $2, cmd_w0, 11 // n << 1
sub $2, cmd_w0, $2 // v0 << 1
sb $2, (inputBufferEnd - 0x06)(inputBufferPos) // Store v0 << 1 as byte 2
.if COUNTER_A_UPPER_VERTEX_COUNT
sll $11, $1, 12 // Vtx count * 0x10000
add perfCounterA, perfCounterA, $11 // Add to vertex count
.endif
j vtx_addrs_from_cmd // v0 << 1 is elem 2, (v0 + n) << 1 is elem 3 = $10
li $11, vtx_return_from_addrs
.if !ENABLE_PROFILING
G_LIGHTTORDP_handler:
lbu $11, numLightsxSize // Ambient light
@@ -1315,84 +1324,22 @@ G_BRANCH_WZ_handler:
G_MEMSET_handler:
instantiate_memset
.else
G_DMA_IO_handler:
G_BRANCH_WZ_handler:
G_MEMSET_handler:
j ovl234_ovl4_entrypoint // Delay slot is harmless
.endif
load_cmds_handler:
lb $3, materialCullMode
bltz $3, run_next_DL_command // If cull mode is < 0, in mat second time, skip the load
G_RDP_handler:
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Add the second word of the command to the RDP command buffer
G_SYNC_handler:
.if CFG_PROFILING_C
addi perfCounterC, perfCounterC, 0x4000 // Increment small RDP command count
.endif
sw cmd_w0, 0(rdpCmdBufPtr) // Add the command word to the RDP command buffer
addi rdpCmdBufPtr, rdpCmdBufPtr, 8 // Increment the next RDP command pointer by 2 words
check_rdp_buffer_full_and_run_next_cmd:
li $ra, run_next_DL_command // Set up running the next DL command as the return address
check_rdp_buffer_full:
sub $11, rdpCmdBufPtr, rdpCmdBufEndP1
bltz $11, return_routine // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
flush_rdp_buffer:
mfc0 $10, SP_DMA_BUSY // Check if any DMA is in flight
lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write
addi dmaLen, $11, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy
.if CFG_PROFILING_C
// This is a wait for DMA busy loop, but written inline to avoid overwriting ra.
addi perfCounterD, perfCounterD, 10 // 6 instr + 2 between end load and mfc + 0 taken branch overlaps with last + 2 between mfc and load
.endif
bnez $10, flush_rdp_buffer // Wait until no DMAs are active
lw $10, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr)
mtc0 cmd_w1_dram, DPC_END // Set RDP to execute until FIFO end (buf pushed last time)
add $11, cmd_w1_dram, dmaLen // $11 = future FIFO pointer if we append this new buffer
sub $10, $10, $11 // $10 = FIFO end addr - future pointer
bgez $10, @@has_room // Branch if we can fit this
@@await_rdp_dblbuf_avail:
mfc0 $11, DPC_STATUS // Read RDP status
andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf
bnez $11, @@await_rdp_dblbuf_avail // Wait until double buffered start/end available
.if COUNTER_C_FIFO_FULL
addi perfCounterC, perfCounterC, 7 // 4 instr + 2 after mfc + 1 taken branch
.endif
lw cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO
@@await_past_first_instr:
mfc0 $11, DPC_CURRENT // Load RDP current pointer
beq $11, cmd_w1_dram, @@await_past_first_instr // Wait until RDP moved past start
.if COUNTER_C_FIFO_FULL
addi perfCounterC, perfCounterC, 6 // 3 instr + 2 after mfc + 1 taken branch
.else
nop
.endif
// Start was previously the start of the FIFO, unless this is the first buffer,
// in which case it was the end of the FIFO. Normally, when the RDP gets to end, if we
// have a new end value waiting (END_VALID), it'll load end but leave current. By
// setting start here, it will also load current with start.
mtc0 cmd_w1_dram, DPC_START // Set RDP start to start of FIFO
@@keep_waiting:
.if COUNTER_C_FIFO_FULL
// This is here so we only count it when stalling below or on FIFO end codepath
addi perfCounterC, perfCounterC, 10 // 7 instr + 2 after mfc + 1 taken branch
.endif
@@has_room:
mfc0 $11, DPC_CURRENT // Load RDP current pointer
sub $11, $11, cmd_w1_dram // Current - current end (rdpFifoPos or start)
blez $11, @@copy_buffer // Current is behind or at current end, can do copy
sub $11, $11, dmaLen // If amount current is ahead of current end
blez $11, @@keep_waiting // is <= size of buffer to copy, keep waiting
@@copy_buffer:
add $11, cmd_w1_dram, dmaLen // New end is current end + buffer size
sw $11, rdpFifoPos
// Set up the DMA from DMEM to the RDP fifo in RDRAM
addi dmaLen, dmaLen, -1 // subtract 1 from the length
addi dmemAddr, rdpCmdBufEndP1, -(0x2000 | (RDP_CMD_BUFSIZE + 8)) // The 0x2000 is meaningless, negative means write
xori rdpCmdBufEndP1, rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word ^ rdpCmdBuffer2EndPlus1Word // Swap between the two RDP command buffers
j dma_read_write
addi rdpCmdBufPtr, rdpCmdBufEndP1, -(RDP_CMD_BUFSIZE + 8)
// G_VTX command handler: front end of vertex loading.
// Reads the byte size of the vertex data from the command in the input
// buffer, loads the middle geometry mode bytes into $5, rewrites byte 2 of the
// command with v0 << 1, then jumps to vtx_addrs_from_cmd.
// $11 is loaded with vtx_return_from_addrs in the jump's delay slot;
// presumably vtx_addrs_from_cmd uses $11 as its return address (confirm there).
G_VTX_handler:
lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10
lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode
srl $2, cmd_w0, 11 // n << 1
sub $2, cmd_w0, $2 // v0 << 1
sb $2, (inputBufferEnd - 0x06)(inputBufferPos) // Store v0 << 1 as byte 2
.if COUNTER_A_UPPER_VERTEX_COUNT
// $1 is vtx count * 0x10, so shifting left by 12 gives vtx count * 0x10000,
// i.e. the count is accumulated in the upper halfword of perfCounterA.
sll $11, $1, 12 // Vtx count * 0x10000
add perfCounterA, perfCounterA, $11 // Add to vertex count
.endif
j vtx_addrs_from_cmd // v0 << 1 is elem 2, (v0 + n) << 1 is elem 3 = $10
li $11, vtx_return_from_addrs // Delay slot: runs before the jump target; overwrites the temporary $11 used above
.warning "TODO improve vertex address setup"
vtx_return_from_addrs:
andi $10, $10, 0xFFF8 // Round down end addr to DMA word; one input vtx still fits in one internal vtx
@@ -1730,8 +1677,9 @@ vtx_epilogue:
ssv sKPF[12], (VTX_SCR_Z_FRAC )(secondVtxPos)
bltz $ra, clip_after_vtx_store
slv sKPF[2], (VTX_SCR_Z )($19)
sh $10, (VTX_CLIP )($19) // Store first vertex flags
j vertex_end
sh $10, (VTX_CLIP )($19) // Store first vertex flags
lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store
.else // end of new LVP_NOC
@@ -2088,6 +2036,31 @@ skip_return_to_lt_or_loop:
.endif // New LVP_NOC
// Non-profiling end of vertex processing, assembled here for every
// configuration except (CFG_NO_OCCLUSION_PLANE && CFG_LEGACY_VTX_PIPE).
// Jumps to run_next_DL_command and reloads $v30 in the jump's delay slot.
.if !CFG_PROFILING_A && (!CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE)
vertex_end:
j run_next_DL_command
lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store
.endif
// CFG_PROFILING_A end-of-command accounting for vertex and triangle commands.
// vertex_end: entry after vertex processing; sets $ra = 0 as a "came from vtx"
//   flag and falls through into tri_end.
// tri_end:    entry after triangle processing; $ra is expected to be nonzero.
// Both paths compute elapsed = DPC_CLOCK - startCounterTime in $11 and then go
// to run_next_DL_command. Vertex time is added to perfCounterA; triangle time,
// minus the RDP FIFO stall measured via perfCounterC, is added to perfCounterD.
// Clobbers $10 and $11 (and $ra on the vertex path).
.if CFG_PROFILING_A
vertex_end:
li $ra, 0 // Flag for coming from vtx
.if !CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE
// In the other configuration (both flags set) this restore is not assembled
// here; presumably it is done before reaching vertex_end -- confirm at the caller.
lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store
.endif
tri_end:
mfc0 $11, DPC_CLOCK
lw $10, startCounterTime
sub $11, $11, $10 // $11 = DPC_CLOCK now - startCounterTime
beqz $ra, run_next_DL_command // $ra != 0 if from tri cmds
// Delay slot: executes whether or not the branch is taken, hence the undo below on the tri path.
add perfCounterA, perfCounterA, $11 // Add to vert cycles perf counter
sub perfCounterA, perfCounterA, $11 // From tris, undo add to vert perf counter
// NOTE(review): $4 presumably holds the perfCounterC value saved at the start of this command -- confirm where it is set.
sub $10, perfCounterC, $4 // How long we stalled for RDP FIFO during this cmd
sub $11, $11, $10 // Subtract that from the tri cycles
j run_next_DL_command
add perfCounterD, perfCounterD, $11 // Add to tri cycles perf counter
.endif
.if CFG_LEGACY_VTX_PIPE || CFG_NO_OCCLUSION_PLANE
G_MTX_end:
instantiate_mtx_end_begin
@@ -2376,23 +2349,6 @@ ovl3_padded_end:
.orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga())
ovl234_end:
.if CFG_PROFILING_A
vertex_end:
li $ra, 0 // Flag for coming from vtx
lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store
tri_end:
mfc0 $11, DPC_CLOCK
lw $10, startCounterTime
sub $11, $11, $10
beqz $ra, run_next_DL_command // $ra != 0 if from tri cmds
add perfCounterA, perfCounterA, $11 // Add to vert cycles perf counter
sub perfCounterA, perfCounterA, $11 // From tris, undo add to vert perf counter
sub $10, perfCounterC, $4 // How long we stalled for RDP FIFO during this cmd
sub $11, $11, $10 // Subtract that from the tri cycles
j run_next_DL_command
add perfCounterD, perfCounterD, $11 // Add to tri cycles perf counter
.endif
tV1AtF equ $v5
tV2AtF equ $v7
tV3AtF equ $v9
@@ -2400,10 +2356,6 @@ tV1AtI equ $v18
tV2AtI equ $v19
tV3AtI equ $v21
.if (. & 4)
.warning "tri_main not aligned"
.endif
tri_main:
vmudn $v29, vOne, $v30[0] // Address of vertex buffer
lw $6, geometryModeLabel // Load full geometry mode word
@@ -2629,6 +2581,9 @@ tri_skip_alpha_compare_cull:
vmrg tV3AtF, tV3AtF, $v13 // Merge S, T, W into elems 4-6
tri_skip_tex:
// 109 cycles
.if !ENABLE_PROFILING
addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
.endif
vmudl $v29, $v16, $v23
lsv tV1AtF[14], VTX_SCR_Z_FRAC($1)
vmadm $v29, $v17, $v23
@@ -2692,9 +2647,7 @@ tDaDyI equ $v7
vmudl $v29, tDaDyF, $v23[1]
add rdpCmdBufPtr, rdpCmdBufPtr, $11 // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set
vmadm $v29, tDaDyI, $v23[1]
.if !ENABLE_PROFILING
addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
.endif
sub $8, rdpCmdBufPtr, rdpCmdBufEndP1 // Check if we need to write out to RDP
vmadn tDaDyF, tDaDyF, $v24[1]
sdv tDaDxF[0], 0x0018($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional)
vmadh tDaDyI, tDaDyI, $v24[1]
@@ -2751,9 +2704,65 @@ tV1AtFF equ $v10
ssv tDaDyF[14], 0x0E($10)
ssv tDaDyI[14], 0x0C($10)
ssv tV1AtF[14], 0x02($10)
j check_rdp_buffer_full // eventually returns to $ra, which is next cmd, second tri in TRI2, or middle of clipping
ssv tV1AtI[14], 0x00($10)
tri_end_check_rdp_buffer_full:
bltz $8, return_routine // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
ssv tV1AtI[14], 0x00($10) // If returning from no-Z, this is okay b/c $10 is at end
// 162 cycles
// flush_rdp_buffer: copy the current DMEM RDP command buffer to the RDP FIFO in
// RDRAM and switch to the other DMEM command buffer.
// In:  $8 = rdpCmdBufPtr - rdpCmdBufEndP1 (must be set up by the caller).
// Steps:
//   1. Spin until SP_DMA_BUSY reads zero.
//   2. Write the current rdpFifoPos to DPC_END.
//   3. If the new data would run past the FIFO end address, wait for the RDP
//      double-buffered start to be free, wait for the RDP to move past the FIFO
//      start, and wrap by writing the FIFO start to DPC_START.
//   4. Wait until DPC_CURRENT is not inside the region about to be written.
//   5. Store the new rdpFifoPos, set up dmemAddr / dmaLen, swap the DMEM
//      buffers, and tail-jump to dma_read_write.
// Clobbers: $10, $11, cmd_w1_dram, dmaLen, dmemAddr; updates rdpCmdBufEndP1,
// rdpCmdBufPtr and rdpFifoPos. $ra is not written here; presumably
// dma_read_write returns through it (confirm in dma_read_write).
flush_rdp_buffer: // $8 = rdpCmdBufPtr - rdpCmdBufEndP1
mfc0 $10, SP_DMA_BUSY // Check if any DMA is in flight
lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write
addi dmaLen, $8, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy
.if CFG_PROFILING_C
// This is a wait for DMA busy loop, but written inline to avoid overwriting ra.
addi perfCounterD, perfCounterD, 10 // 6 instr + 2 between end load and mfc + 0 taken branch overlaps with last + 2 between mfc and load
.endif
bnez $10, flush_rdp_buffer // Wait until no DMAs are active
// Delay slot: runs on every loop iteration; $10 is reloaded by the mfc0 at the loop top if the branch is taken.
lw $10, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr)
mtc0 cmd_w1_dram, DPC_END // Set RDP to execute until FIFO end (buf pushed last time)
add $11, cmd_w1_dram, dmaLen // $11 = future FIFO pointer if we append this new buffer
sub $10, $10, $11 // $10 = FIFO end addr - future pointer
bgez $10, @@has_room // Branch if we can fit this
// The mfc0 below is also the delay slot of the bgez above; @@has_room overwrites $11 before using it.
@@await_rdp_dblbuf_avail:
mfc0 $11, DPC_STATUS // Read RDP status
andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf
bnez $11, @@await_rdp_dblbuf_avail // Wait until double buffered start/end available
// Delay slot of the bnez: the counter increment when COUNTER_C_FIFO_FULL is set, otherwise the lw below.
.if COUNTER_C_FIFO_FULL
addi perfCounterC, perfCounterC, 7 // 4 instr + 2 after mfc + 1 taken branch
.endif
lw cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO
@@await_past_first_instr:
mfc0 $11, DPC_CURRENT // Load RDP current pointer
beq $11, cmd_w1_dram, @@await_past_first_instr // Wait until RDP moved past start
// Delay slot of the beq: the counter increment, or an explicit nop when the counter is disabled.
.if COUNTER_C_FIFO_FULL
addi perfCounterC, perfCounterC, 6 // 3 instr + 2 after mfc + 1 taken branch
.else
nop
.endif
// Start was previously the start of the FIFO, unless this is the first buffer,
// in which case it was the end of the FIFO. Normally, when the RDP gets to end, if we
// have a new end value waiting (END_VALID), it'll load end but leave current. By
// setting start here, it will also load current with start.
mtc0 cmd_w1_dram, DPC_START // Set RDP start to start of FIFO
@@keep_waiting:
.if COUNTER_C_FIFO_FULL
// This is here so we only count it when stalling below or on FIFO end codepath
addi perfCounterC, perfCounterC, 10 // 7 instr + 2 after mfc + 1 taken branch
.endif
@@has_room:
mfc0 $11, DPC_CURRENT // Load RDP current pointer
sub $11, $11, cmd_w1_dram // Current - current end (rdpFifoPos or start)
blez $11, @@copy_buffer // Current is behind or at current end, can do copy
// Delay slot of the first blez; @@copy_buffer recomputes $11, so this is harmless when that branch is taken.
sub $11, $11, dmaLen // If amount current is ahead of current end
blez $11, @@keep_waiting // is <= size of buffer to copy, keep waiting
// The add below is also the delay slot of the second blez; @@has_room reloads $11 when looping back.
@@copy_buffer:
add $11, cmd_w1_dram, dmaLen // New end is current end + buffer size
sw $11, rdpFifoPos
// Set up the DMA from DMEM to the RDP fifo in RDRAM
addi dmaLen, dmaLen, -1 // subtract 1 from the length
addi dmemAddr, rdpCmdBufEndP1, -(0x2000 | (RDP_CMD_BUFSIZE + 8)) // The 0x2000 is meaningless, negative means write
xori rdpCmdBufEndP1, rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word ^ rdpCmdBuffer2EndPlus1Word // Swap between the two RDP command buffers
j dma_read_write
// Delay slot: point rdpCmdBufPtr at the start of the newly selected (swapped) buffer.
addi rdpCmdBufPtr, rdpCmdBufEndP1, -(RDP_CMD_BUFSIZE + 8)
.if CFG_NO_OCCLUSION_PLANE || CFG_LEGACY_VTX_PIPE
// If we have room for the extra instructions. Z disabled is rare, so the
@@ -2762,7 +2771,7 @@ no_z_buffer:
sdv tV1AtF[0], 0x0010($2) // Store RGBA shade color (fractional)
sdv tV1AtI[0], 0x0000($2) // Store RGBA shade color (integer)
sdv tV1AtF[8], 0x0010($1) // Store S, T, W texture coefficients (fractional)
j check_rdp_buffer_full
j tri_end_check_rdp_buffer_full
sdv tV1AtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer)
.endif
@@ -2780,6 +2789,8 @@ return_routine:
jr $ra
nop
.warning "TODO move the empty space to here"
load_overlay_0_and_enter:
li postOvlRA, 0x1000 // Sets up return address
li cmd_w1_dram, orga(ovl0_start) // Sets up ovl0 table address
@@ -2881,8 +2892,8 @@ dma_write:
// The action here is controlled by $1. If yielding, $1 > 0. If this was
// G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0.
ovl0_start:
sub $11, rdpCmdBufPtr, rdpCmdBufEndP1
addi $10, $11, (RDP_CMD_BUFSIZE + 8) - 1 // Does the current buffer contain anything?
sub $8, rdpCmdBufPtr, rdpCmdBufEndP1
addi $10, $8, (RDP_CMD_BUFSIZE + 8) - 1 // Does the current buffer contain anything?
bgezal $10, flush_rdp_buffer // - 1 because there is no bgtzal instruction
add taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr was where in the DL after the current chunk loaded
jal while_wait_dma_busy // Wait for possible RDP flush to finish