This commit is contained in:
Sauraen
2024-08-27 21:39:12 -07:00
parent ad20d8c1b1
commit 02cb0386ae
2 changed files with 115 additions and 125 deletions

View File

@@ -33,6 +33,9 @@ measured yet".
| | F3DEX2 | F3DEX3_LVP_NOC | F3DEX3_LVP | F3DEX3_NOC | F3DEX3 |
|----------------------------|--------|----------------|------------|------------|--------|
| Command dispatch | 12 | 12 | 12 | 12 | 12 |
| Small RDP command | 14 | 5 | 5 | 5 | 5 |
| Vtx before DMA start | 16 | 17 | 17 | 17 | 17 |
| Vtx pair, no lighting | 54 | 54 | 81 | 79 | 98 |
| Vtx pair, 0 dir lts | Can't | 64 | | | |
| Vtx pair, 1 dir lt | 73 | 70 | 96 | 182 | 201 |
@@ -44,20 +47,18 @@ measured yet".
| Vtx pair, 7 dir lts | 118 | 112 | 138 | 356 | 375 |
| Vtx pair, 8 dir lts | Can't | 119 | 145 | 385 | 404 |
| Vtx pair, 9 dir lts | Can't | 126 | 152 | 414 | 433 |
| Command dispatch | 12 | 12 | 12 | 12 | 12 |
| Small RDP command | 14 | 5 | 5 | 5 | 5 |
| Only/2nd tri to offscreen | 27 | 29 | 29 | 29 | 29 |
| 1st tri to offscreen | 28 | 29 | 29 | 29 | 29 |
| Only/2nd tri to offscreen | 27 | 26 | 26 | 26 | 26 |
| 1st tri to offscreen | 28 | 27 | 27 | 27 | 27 |
| Only/2nd tri to clip | 32 | 31 | 31 | 31 | 31 |
| 1st tri to clip | 33 | 31 | 31 | 31 | 31 |
| Only/2nd tri to backface | 38 | 40 | 40 | 40 | 40 |
| 1st tri to backface | 39 | 40 | 40 | 40 | 40 |
| Only/2nd tri to degenerate | 42 | 42 | 42 | 42 | 42 |
| 1st tri to degenerate | 43 | 42 | 42 | 42 | 42 |
| 1st tri to clip | 33 | 32 | 32 | 32 | 32 |
| Only/2nd tri to backface | 38 | 38 | 38 | 38 | 38 |
| 1st tri to backface | 39 | 39 | 39 | 39 | 39 |
| Only/2nd tri to degenerate | 42 | 40 | 40 | 40 | 40 |
| 1st tri to degenerate | 43 | 41 | 41 | 41 | 41 |
| Only/2nd tri to occluded | Can't | Can't | 49 | Can't | 49 |
| 1st tri to occluded | Can't | Can't | 49 | Can't | 49 |
| Only/2nd tri to draw | 172 | 166 | 167 | 166 | 167 |
| 1st tri to draw | 173 | 166 | 167 | 166 | 167 |
| 1st tri to occluded | Can't | Can't | 50 | Can't | 50 |
| Only/2nd tri to draw | 172 | 165 | 168 | 165 | 168 |
| 1st tri to draw | 173 | 165 | 168 | 165 | 168 |
Tri numbers are measured from the first cycle of the command handler inclusive,

215
f3dex3.s
View File

@@ -455,6 +455,9 @@ normalsMode:
lastMatDLPhyAddr:
.dw 0
// Halfword mask of clip flags that are allowed to send a tri to the clipper.
// tri_noinit loads this (lhu) and ANDs it with the OR of the three vertices'
// clip flags; a nonzero result branches to ovl234_clipping_entrypoint.
// clip_draw_tris stores zero here so the tris generated by clipping are not
// clipped again, and clip_done restores CLIP_SCAL_NPXY | CLIP_CAMPLANE.
activeClipPlanes:
.dh CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri write, set to zero when clipping
// Constants for clipping algorithm
clipCondShifts:
.db CLIP_SCAL_NY_SHIFT
@@ -1043,11 +1046,11 @@ tempPrevVtxGarbage equ 0x50 // Up to 2 * 0x26 = 0x4C used -> to 0x9C
j wait_for_dma_and_run_next_command
// Delay slot harmless
@@clamp_to_memset_buffer:
addi $11, cmd_w0, -memsetBufferSize // Is more than a whole buffer left?
bltz $11, return_routine
move $2, cmd_w0 // No, use partial buffer
addi $11, cmd_w0, -memsetBufferSize // $2 = min(cmd_w0, memsetBufferSize)
sra $10, $11, 31
and $11, $11, $10
jr $ra
li $2, memsetBufferSize
addi $2, $11, memsetBufferSize
.endmacro
@@ -1189,7 +1192,7 @@ check_rdp_buffer_full_and_run_next_cmd:
vertex_end:
.endif
.if !CFG_PROFILING_A
tri_end:
tris_end:
.endif
.if ENABLE_PROFILING
G_LIGHTTORDP_handler:
@@ -1292,8 +1295,7 @@ G_VTX_handler:
lhu dmemAddr, (vertexTable)(cmd_w0) // (v0 + n) end address; up to 56 inclusive
jal segmented_to_physical // Convert address in cmd_w1_dram to physical
lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10
andi dmemAddr, dmemAddr, 0xFFF8 // Round down end addr to DMA word; one input vtx still fits in one internal vtx
sub dmemAddr, dmemAddr, $1 // Start addr = end addr - size
sub dmemAddr, dmemAddr, $1 // Start addr = end addr - size. Rounded down to DMA word by H/W
addi dmaLen, $1, -1 // DMA length is always offset by -1
j dma_read_write
li $ra, 0x8000 | vtx_after_dma // Delay slot. Negative $ra = flag not to return to clipping in vtx_setup_constants
@@ -1302,30 +1304,22 @@ G_TRIFAN_handler:
li $1, 0x8000 // $ra negative = flag for G_TRIFAN
G_TRISTRIP_handler:
addi $ra, $1, tri_strip_fan_loop // otherwise $1 == 0
addi cmd_w0, inputBufferPos, inputBufferEnd - 12 // Start pointing so elems 5-7 are tris 1-3
addi cmd_w0, inputBufferPos, inputBufferEnd - 8 // Start pointing to cmd byte
tri_strip_fan_loop:
lb $3, (7)(cmd_w0) // Load signed index of last of 3 tris
bgez $ra, @@skip_copy_1 // Skip if G_TRISTRIP
lbu $1, (inputBufferEnd - 7)(inputBufferPos) // Load tri 1 index
sb $1, (5)(cmd_w0) // Store as first tri of the three current tris
@@skip_copy_1:
bltz $3, tri_end // If third tri index is negative, exit
addi $11, inputBufferPos, inputBufferEnd - 7 // Off end of command
beq $11, cmd_w0, tri_end // If off end of command, exit
lpv $v27[0], (0)(cmd_w0) // Load the three tris to elems 5-7
bltz $ra, tri_main // Draw if G_TRIFAN
addi cmd_w0, cmd_w0, 1 // Increment
andi $11, cmd_w0, 1 // If odd after increment, this is the 1st/3rd/5th tri
bnez $11, tri_main // in that case draw directly
sll $3, $3, 8 // Move tri 3 index into bits 15:8
vmov $v27[7], $v27[6] // Move tri 2 to tri 3
lw cmd_w1_dram, 0(cmd_w0) // Load tri indices to lower 3 bytes of word
addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command
beq $11, cmd_w0, tris_end // If off end of command, exit
sll $10, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit
bltz $10, tris_end // If negative, exit
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices
bltz $ra, tri_fan_store // Finish handling G_TRIFAN
addi cmd_w0, cmd_w0, 1 // Increment
andi $11, cmd_w0, 1 // If odd, this is the 1st/3rd/5th tri
bnez $11, tri_main // Draw as is
srl $10, cmd_w1_dram, 8 // Move vtx 2 to LSBs
sb cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2
j tri_main
mtc2 $3, $v27[12] // Move tri 3 to tri 2
.if (. & 4)
.warning "One instruction of padding before tri handler"
.endif
.align 8
sb $10, 7(rdpCmdBufPtr) // Store vtx 2 to spot for 3
tV1AtF equ $v5
tV2AtF equ $v7
@@ -1337,79 +1331,77 @@ tV3AtI equ $v21
G_TRI2_handler:
G_QUAD_handler:
jal tri_main // Send second tri; return here for first tri
lpv $v27[0], (inputBufferEndSgn - 8)(inputBufferPos) // Second tri idxs elems 5, 6, 7
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store second tri indices
G_TRI1_handler:
lpv $v27[4], (inputBufferEndSgn - 8)(inputBufferPos) // First tri idxs elems 5, 6, 7
j tri_main
li $ra, tri_end // After done with this tri, exit tri processing
li $ra, tris_end // After done with this tri, exit tri processing
sw cmd_w0, 4(rdpCmdBufPtr) // Store first tri indices
tri_main:
vmudn $v29, vOne, $v30[0] // Address of vertex buffer
lw $6, geometryModeLabel // Load full geometry mode word
vmadl $v27, $v27, $v30[1] // Plus vtx indices times length
sb $zero, materialCullMode // This covers all tri cmds
vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6
li $24, CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri draw, check clipping
lpv $v27[0], 0(rdpCmdBufPtr) // To vector unit
lbu $1, 5(rdpCmdBufPtr)
lbu $2, 6(rdpCmdBufPtr)
lbu $3, 7(rdpCmdBufPtr)
vclr vZero
sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull
// vnop
sh $ra, tempTriRA // For tri cmds; where to go after clipping
mfc2 $1, $v27[10]
mfc2 $2, $v27[12]
lhu $1, (vertexTable)($1)
vmudn $v29, vOne, $v30[0] // Address of vertex buffer
lhu $2, (vertexTable)($2)
vmadl $v27, $v27, $v30[1] // Plus vtx indices times length
lhu $3, (vertexTable)($3)
vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6
.if !ENABLE_PROFILING
addi perfCounterB, perfCounterB, 0x4000 // Increment number of tris requested
move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading
.endif
vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr
mfc2 $3, $v27[14]
vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr
tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
vnxor tV1AtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding
llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
vnxor tV2AtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding
llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr
llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8
vnxor tV3AtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding
lhu $5, VTX_CLIP($1)
vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1
vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr
lhu $7, VTX_CLIP($2)
vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr)
// vnop
lhu $8, VTX_CLIP($3)
vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr)
vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1
andi $11, $5, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr)
vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr)
and $11, $11, $7
vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y
vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr)
and $11, $11, $8
vmrg $v14, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
bnez $11, return_routine // Then the whole tri is offscreen, cull
// 24 cycles
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr)
vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y
bnez $11, return_and_end_mat // Then the whole tri is offscreen, cull
// 22 cycles
vmrg $v14, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
lhu $24, activeClipPlanes
vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
or $10, $5, $7
lw $6, geometryModeLabel // Load full geometry mode word
vge $v2, $v2, $v4[1] // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y
or $10, $10, $8 // $10 = all clip bits which are true for any verts
or $10, $5, $7
vmrg $v10, $v6, $v4 // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2)
and $10, $10, $24 // If clipping is enabled, check clip flags
or $10, $10, $8 // $10 = all clip bits which are true for any verts
vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y
bnez $10, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip
// 29 cycles
mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended
and $10, $10, $24 // If clipping is enabled, check clip flags
vmrg $v4, $v14, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3)
and $5, $5, $7
mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended
vmrg $v14, $v8, $v14 // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2)
and $5, $5, $8
bnez $10, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip
// 30 cycles
sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull
vlt $v29, $v6, $v2 // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor
sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
vmrg $v2, $v4, $v10 // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2)
bltz $11, return_routine // Cull if bit is set (culled based on facing)
// 35 cycles
bltz $11, return_and_end_mat // Cull if bit is set (culled based on facing)
// 34 cycles
vmrg $v10, $v10, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3)
vmudn $v4, $v14, $v31[5] // 0x4000
beqz $9, return_routine // If cross product is 0, tri is degenerate (zero area), cull.
// 37 cycles
beqz $9, return_and_end_mat // If cross product is 0, tri is degenerate (zero area), cull.
// 36 cycles
mfc2 $1, $v14[12] // $v14 = lowest Y value = highest on screen (x, y, addr)
vsub $v6, $v2, $v14
mfc2 $2, $v2[12] // $v2 = mid vertex (x, y, addr)
@@ -1423,6 +1415,8 @@ tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
mfc2 $3, $v10[12] // $v10 = highest Y value = lowest on screen (x, y, addr)
vsub $v15, $v10, $v2
.if !CFG_NO_OCCLUSION_PLANE
and $5, $5, $7
and $5, $5, $8
andi $5, $5, CLIP_OCCLUDED
.endif
vmudh $v29, $v6, $v8[0]
@@ -1456,7 +1450,7 @@ tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
vmrg tV3AtI, $v25, tV3AtI // RGB from $4, alpha from $3
tri_skip_flat_shading:
.endif
// 53 cycles
// 52 cycles
vrcp $v20[2], $v6[1]
lb $20, (alphaCompareCullMode)($zero)
vrcph $v22[2], $v6[1]
@@ -1487,11 +1481,11 @@ tri_skip_flat_shading:
mfc2 $24, $v26[6]
sub $24, $24, $19 // sign bit set if (max/min) < thresh
xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull
bltz $24, return_routine // if max < thresh or if min >= thresh.
bltz $24, return_and_end_mat // if max < thresh or if min >= thresh.
tri_skip_alpha_compare_cull:
// 64 cycles
// 63 cycles
vmadm $v22, $v22, $v30[7] // 0x0020
sub $11, $5, $8
sub $11, $5, $8 // Four instr: $5 = max($5, $8)
vmadn $v20, $v31, $v31[2] // 0
sra $10, $11, 31
vmudm $v25, $v15, $v30[2] // 0x1000
@@ -1535,10 +1529,10 @@ tri_skip_alpha_compare_cull:
vmadm $v29, $v15, $v4[1]
sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
vmadn $v2, $v22, $v26[1]
sb $zero, materialCullMode // This covers tri write out
vmadh $v3, $v15, $v26[1]
beqz $9, tri_skip_tex // If textures are not enabled, skip texture coefficient calculation
vmadh $v3, $v15, $v26[1]
// 88 cycles
vrcph $v29[0], $v27[0]
vrcph $v29[0], $v27[0]
vrcpl $v10[0], $v27[1]
vmudh $v14, vOne, $v13[1q]
vrcph $v27[0], $v31[2] // 0
@@ -1567,10 +1561,10 @@ tri_skip_alpha_compare_cull:
ldv tV1AtF[8], 0x0028(rdpCmdBufPtr) // 8
vmrg tV3AtF, tV3AtF, $v13 // Merge S, T, W into elems 4-6
tri_skip_tex:
// 109 cycles
.if !ENABLE_PROFILING
addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
.endif
// 108 cycles
vmudl $v29, $v16, $v23
lsv tV1AtF[14], VTX_SCR_Z_FRAC($1)
vmadm $v29, $v17, $v23
@@ -1642,7 +1636,7 @@ tDaDyI equ $v7
// DaDe = DaDx * factor
tDaDeF equ $v8
tDaDeI equ $v9
// 137 cycles
// 136 cycles
vmadl $v29, tDaDxF, $v20[3]
sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional)
vmadm $v29, tDaDxI, $v20[3]
@@ -1692,9 +1686,9 @@ tV1AtFF equ $v10
ssv tDaDyI[14], 0x0C($10)
ssv tV1AtF[14], 0x02($10)
tri_end_check_rdp_buffer_full:
bltz $8, return_routine // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
bltz $8, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
ssv tV1AtI[14], 0x00($10) // If returning from no-Z, this is okay b/c $10 is at end
// 162 cycles
// 161 cycles
flush_rdp_buffer: // $8 = rdpCmdBufPtr - rdpCmdBufEndP1
mfc0 $10, SP_DMA_BUSY // Check if any DMA is in flight
lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write
@@ -1762,20 +1756,19 @@ no_z_buffer:
sdv tV1AtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer)
.endif
tri_culled_by_occlusion_plane:
.if CFG_PROFILING_B
tri_culled_by_occlusion_plane:
jr $ra
addi perfCounterB, perfCounterB, 0x4000
addi perfCounterB, perfCounterB, 0x4000
.endif
// Shared return path: jumps back to the address in $ra and, in the delay slot
// of that jump, stores zero to the materialCullMode byte. Used as the target of
// the conditional cull branches in the tri code (offscreen, facing, degenerate,
// alpha compare cull, RDP buffer not full) and of the fog-disabled branch in
// vertex processing.
return_and_end_mat:
jr $ra
sb $zero, materialCullMode // Delay slot. This covers all tri early exits except clipping
// G_TRIFAN tail of tri_strip_fan_loop, reached via `bltz $ra, tri_fan_store`
// ($ra negative = flag for G_TRIFAN). Replaces the first vertex index of the
// index word previously stored at 4(rdpCmdBufPtr) with the command's vtx 1
// byte, then continues to tri_main, which reads the indices back from bytes
// 5..7 of rdpCmdBufPtr.
tri_fan_store:
lb $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1
j tri_main
sb $11, 5(rdpCmdBufPtr) // Delay slot. Store vtx 1
// This routine is used to return via conditional branch
// (a conditional branch cannot jump to $ra directly, so it targets this label
// and the jr below performs the actual return).
// When CFG_PROFILING_B is off, tri_culled_by_occlusion_plane is only an alias
// for this plain return; when it is on, that label is defined separately with
// its own jr $ra that also adds 0x4000 to perfCounterB.
.if !CFG_PROFILING_B
tri_culled_by_occlusion_plane:
.endif
return_routine:
jr $ra
nop // Delay slot, nothing to do
.if (. & 4)
.warning "One instruction of padding before ovl234"
.endif
@@ -1809,9 +1802,11 @@ ovl234_ovl4_entrypoint_ovl3ver: // same IMEM address as ovl234_ovl4_e
// Jump here to do clipping. If overlay 3 is loaded (this code), directly starts
// the clipping code.
// On entry $ra is the tri's return address; it is saved to tempTriRA because
// the jal below overwrites $ra. clip_done reloads it with `lh $ra, tempTriRA`.
ovl234_clipping_entrypoint:
sh $ra, tempTriRA // Tri return after clipping
.if CFG_PROFILING_B
addi perfCounterB, perfCounterB, 1 // Increment clipped (input) tris count
.endif
sb $zero, materialCullMode // In case only/all tri(s) clip then offscreen
// The jal leaves a positive $ra; the setup code tests `bgtz $ra, clip_after_constants`
// to come back to the label immediately following this block.
jal vtx_setup_constants
li clipMaskIdx, 4 // Delay slot of the jal: clipMaskIdx = 4
clip_after_constants:
@@ -2029,6 +2024,7 @@ clip_nextcond:
clip_draw_tris:
vclr vZero // TODO may not need this
sh $zero, activeClipPlanes
lqv $v30, (v30Value)($zero)
// Current polygon starts 6 (3 verts) below clipPolySelect, ends 2 (1 vert) below clipPolyWrite
// Draws verts in pattern like 0-1-4, 1-2-4, 2-3-4
@@ -2036,16 +2032,15 @@ clip_draw_tris_loop:
lhu $1, (clipPoly - 6)(clipPolySelect)
lhu $2, (clipPoly - 4)(clipPolySelect)
lhu $3, (clipPoly - 2)(clipPolyWrite)
mtc2 $1, $v6[12] // Addresses go in vector regs too
mtc2 $1, $v27[10] // Addresses go in vector regs too
mtc2 $2, $v4[12]
lw $6, geometryModeLabel // Load full geometry mode word
sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull
li $24, 0 // Init clipping flags for tri draw--no repeat clipping
jal tri_noinit
mtc2 $3, $v8[12]
mtc2 $3, $v27[14]
bne clipPolyWrite, clipPolySelect, clip_draw_tris_loop
addi clipPolySelect, clipPolySelect, 2
clip_done:
li $11, CLIP_SCAL_NPXY | CLIP_CAMPLANE
sh $11, activeClipPlanes
lqv $v30, (v30Value)($zero) // Need this repeated here in case we exited early
lh $ra, tempTriRA
@@ -2079,20 +2074,12 @@ ovl3_padded_end:
.orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga())
ovl234_end:
vtx_after_dma:
andi inputVtxPos, dmemAddr, 0xFFF8 // Round down input start addr to DMA word
lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode
srl $2, cmd_w0, 11 // n << 1
sub $2, cmd_w0, $2 // = v0 << 1
lhu outputVtxPos, (vertexTable)($2) // Address of start
sb $zero, materialCullMode // This covers vtx
lhu outputVtxPos, (vertexTable)($2) // Address of output start
.if COUNTER_A_UPPER_VERTEX_COUNT
sll $11, $1, 12 // Vtx count * 0x10000
add perfCounterA, perfCounterA, $11 // Add to vertex count
@@ -2167,7 +2154,7 @@ vtx_after_mtx_multiply:
skip_vtx_mvp:
andi $11, $5, G_LIGHTING >> 8
bnez $11, ovl234_lighting_entrypoint // Lighting setup, incl. transform
move inputVtxPos, dmemAddr // Must be before overlay load
sb $zero, materialCullMode // Vtx ends material
vtx_after_lt_setup:
lqv vM0I, (mITMatrix + 0x00)($zero) // Load MVP matrix
lqv vM2I, (mITMatrix + 0x10)($zero)
@@ -2197,6 +2184,7 @@ vtx_after_lt_setup:
@@skipzeroao:
bgtz $ra, clip_after_constants // Return to clipping if from there
sqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Store viewport scale
sb $zero, materialCullMode // Vtx ends material
lqv vM0I, (mMatrix + 0x00)($zero) // Load M matrix
lqv vM2I, (mMatrix + 0x10)($zero)
lqv vM0F, (mMatrix + 0x20)($zero)
@@ -2214,7 +2202,6 @@ vtx_after_lt_setup:
srl $7, $5, 9 // G_LIGHTING in bit 1
and $7, $7, $11 // If lighting enabled and need to update matrix,
and $7, $7, $10 // and computing mIT,
move inputVtxPos, dmemAddr // this must be before overlay load, can be clobbered
ldv vM3F[0], (mMatrix + 0x38)($zero)
ldv vM0I[8], (mMatrix + 0x00)($zero)
ldv vM2I[8], (mMatrix + 0x10)($zero)
@@ -2247,7 +2234,7 @@ vtx_after_matrix_load:
addi $19, rdpCmdBufEndP1, vtxSize // Temp mem; fog writes up to vtxSize before
jal while_wait_dma_busy // Wait for vertex load to finish
move secondVtxPos, $19 // for first pre-loop, same for secondVtxPos
andi $11, $5, G_LIGHTING >> 8
andi $11, $5, G_LIGHTING >> 8 // Must be after the DMA wait b/c modifies $ra
beqz $11, @@skip_lighting
li $ra, vtx_loop_no_lighting
li $ra, lt_vtx_pair
@@ -2609,7 +2596,7 @@ skip_return_to_lt_or_loop:
ssv sCLZ[4], (VTX_SCR_Z )($19)
// sOUTF = vPairPosF is $v21, or vPairTPosF is $v23
vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords
beqz $7, return_routine // fog disabled
beqz $7, return_and_end_mat // fog disabled
// sOUTI = vPairPosI is $v20, or vPairTPosI is $v24
vmadh sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords
sbv sFOG[15], (VTX_COLOR_A )(secondVtxPos)
@@ -2796,7 +2783,7 @@ vertex_end:
.if !CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE
lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store
.endif
tri_end:
tris_end:
mfc0 $11, DPC_CLOCK
lw $10, startCounterTime
sub $11, $11, $10
@@ -3214,6 +3201,7 @@ ovl234_ovl4_entrypoint_ovl2ver: // same IMEM address as ovl234_ovl4_e
// Jump here to do clipping. If overlay 2 is loaded (this code), loads overlay 3
// and jumps to right here, which is now in the new code.
ovl234_clipping_entrypoint_ovl2ver: // same IMEM address as ovl234_clipping_entrypoint
sh $ra, tempTriRA // Tri return after clipping
.if CFG_PROFILING_B
addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load
.endif
@@ -3756,6 +3744,7 @@ G_MTX_end:
// Jump here to do clipping. If overlay 4 is loaded (this code), loads overlay 3
// and jumps to right here, which is now in the new code.
ovl234_clipping_entrypoint_ovl4ver: // same IMEM address as ovl234_clipping_entrypoint
sh $ra, tempTriRA // Tri return after clipping
.if CFG_PROFILING_B
addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load
.endif