Working on optimizations

This commit is contained in:
Sauraen
2025-08-03 18:10:50 -07:00
parent 84fc8d3786
commit 86be1a09b9
2 changed files with 107 additions and 102 deletions

View File

@@ -18,8 +18,8 @@ into account, but in some cases it is assumed to be optimal.
All numbers assume default profiling configuration. Tri numbers assume texture,
shade, and Z, and not flushing the buffer. Tri numbers are measured from the
first cycle of the command handler inclusive, to the first cycle of whatever is
after $ra exclusive; this is in order to capture the extra latency and stalls in
F3DEX2.
after $ra exclusive; this is in order to capture an extra stall cycle in F3DEX2
when finishing a triangle and going to the next command.
Vertex numbers assume no extra F3DEX3 features (packed normals, ambient
occlusion, etc.). These features are listed below as the number of extra cycles
@@ -33,19 +33,19 @@ even to an odd number of lights adds a different time than vice versa.
|----------------------------|--------|------------|--------|
| Command dispatch | 12 | 12 | 12 |
| Small RDP command | 14 | 5 | 5 |
| Only/2nd tri to offscreen | 27 | 26 | 26 |
| 1st tri to offscreen | 28 | 27 | 27 |
| Only/2nd tri to clip | 32 | 31 | 31 |
| 1st tri to clip | 33 | 32 | 32 |
| Only/2nd tri to backface | 38 | 38 | 38 |
| 1st tri to backface | 39 | 39 | 39 |
| Only/2nd tri to degenerate | 42 | 40 | 40 |
| 1st tri to degenerate | 43 | 41 | 41 |
| Only/2nd tri to occluded | Can't | Can't | 49 |
| 1st tri to occluded | Can't | Can't | 50 |
| Only/2nd tri to draw | 172 | 159 | 162 |
| 1st tri to draw | 173 | 160 | 163 |
| Extra per tri from snake | Can't | 10 | 10 |
| Only/2nd tri to offscreen | 27 | 25 | 25 |
| 1st tri to offscreen | 28 | 26 | 26 |
| Only/2nd tri to clip | 32 | 30 | 30 |
| 1st tri to clip | 33 | 31 | 31 |
| Only/2nd tri to backface | 38 | 36 | 36 |
| 1st tri to backface | 39 | 37 | 37 |
| Only/2nd tri to degenerate | 42 | 38 | 38 |
| 1st tri to degenerate | 43 | 39 | 39 |
| Only/2nd tri to occluded | Can't | Can't | 42 |
| 1st tri to occluded | Can't | Can't | 43 |
| Only/2nd tri to draw | 172 | 156 | 159 |
| 1st tri to draw | 173 | 157 | 160 |
| Extra per tri from snake | Can't | 9 | 9 |
| Vtx before DMA start | 16 | 17 | 17 |
| Vtx pair, no lighting | 54 | 54 | 70 |
| Vtx pair, 0 dir lts | Can't | 65 | 81 |

179
f3dex3.s
View File

@@ -786,6 +786,7 @@ $ra return address, sometimes sign bit is flag -------------------------------
*/
// Global scalar regs:
vGeomMid equ $5 // Middle two bytes of geometry mode in lower 16 bits
perfCounterD equ $12 // Performance counter D (functions depend on config)
altBaseReg equ $13 // Alternate base address register for vector loads
rdpCmdBufEndP1 equ $22 // Pointer to one command word past "end" (middle) of RDP command buf
@@ -799,7 +800,6 @@ perfCounterC equ $30 // Performance counter C (functions depend on config)
// Vertex write:
vtxLeft equ $1 // Number of vertices left to process * 0x10
vLoopRet equ $3 // Return address at end of vtx loop = top of loop or misc lighting
vGeomMid equ $5 // Middle two bytes of geometry mode
fogFlag equ $7 // 8 if fog enabled, else 0
outVtx2 equ $8 // Pointer to second or dummy (= outVtx1) transformed vert
inVtx equ $14 // Pointer to loaded vertex to transform; < 0 means from clipping.
@@ -1098,6 +1098,7 @@ finish_setup:
sw $11, startCounterTime
.endif
sh $zero, mvpValid // and dirLightsXfrmValid
lhu vGeomMid, geometryModeLabel + 1
li inputBufferPos, 0
li cmd_w1_dram, orga(ovl1_start)
j load_overlays_0_1
@@ -1277,20 +1278,20 @@ G_MODIFYVTX_handler:
// cmd_w0 + inputBufferEnd
G_TRISNAKE_handler:
sw cmd_w0, rdpHalf1Val // Store indices a, b, c
addi inputBufferPos, inputBufferPos, -5 // Point to byte 3, index c of 1st tri
addi inputBufferPos, inputBufferPos, -6 // Point to byte 2, index b of 1st tri
li $ra, tri_snake_loop // For tri_main
tri_snake_loop:
lh $3, (inputBufferEnd - 1)(inputBufferPos) // Load indices b and c
lh $3, (inputBufferEnd)(inputBufferPos) // Load indices b and c
addi inputBufferPos, inputBufferPos, 1 // Increment indices being read
tri_snake_loop_from_input_buffer:
lb $2, rdpHalf1Val + 1 // Old v1; == index b, except when bridging between old and new load
li $ra, tri_snake_loop // For tri_main
bltz $3, tri_snake_end // Upper bit of real index b set = done
andi $11, $3, 1 // Get direction flag from index c
beqz inputBufferPos, tri_snake_over_input_buffer // == 0 at end of input buffer
andi $3, $3, 0x7E // Mask out flags from index c
sb $3, rdpHalf1Val + 1 // Store index c as vertex 1
sb $2, (rdpHalf1Val + 2)($11) // Store old v1 as 2 if dir clear or 3 if set
j tri_main
addi inputBufferPos, inputBufferPos, 1 // Increment indices being read
sb $2, (rdpHalf1Val + 2)($11) // Store old v1 as 2 if dir clear or 3 if set
// H = highest on screen = lowest Y value; then M = mid, L = low
tHAtF equ $v5
@@ -1317,128 +1318,130 @@ tri_main:
lpv $v27[4], (rdpHalf1Val)($zero) // To vector unit in elems 5-7
lbu $1, rdpHalf1Val + 1
lbu $2, rdpHalf1Val + 2
lbu $3, rdpHalf1Val + 3
vclr vZero
lhu $1, (vertexTable)($1)
lbu $3, rdpHalf1Val + 3
vmudn $v29, vOne, vTRC_VB // Address of vertex buffer
lhu $2, (vertexTable)($2)
lhu $1, (vertexTable)($1)
vmadl $v27, $v27, vTRC_VS // Plus vtx indices times length
lhu $2, (vertexTable)($2)
vmadl $v6, $v31, $v31[2] // 0; vtx 1 addr in $v6 elem 5
lhu $3, (vertexTable)($3)
vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6
.if !ENABLE_PROFILING
// vnop
addi perfCounterB, perfCounterB, 0x4000 // Increment number of tris requested
move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading
.endif
tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
vnxor tHAtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding
vmov $v4[5], $v27[6] // elem 5 of v4 = vertex 2 addr
llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
vnxor tMAtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding
vmov $v8[5], $v27[7] // elem 5 of v8 = vertex 3 addr
llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr
vnxor tMAtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding
llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8
vnxor tLAtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding
lhu $16, VTX_CLIP($1)
vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr
lhu $7, VTX_CLIP($2)
// vnop
lhu $8, VTX_CLIP($3)
vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1
andi $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
lhu $7, VTX_CLIP($2)
vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr)
and $11, $11, $7
lhu $8, VTX_CLIP($3)
vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr)
and $11, $11, $8
andi $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr)
and $11, $11, $7
vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y
and $11, $11, $8
vmrg tHPos, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
bnez $11, return_and_end_mat // Then the whole tri is offscreen, cull
// 22 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
vmrg tHPos, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
lhu $24, activeClipPlanes
// 21 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
lw $6, geometryModeLabel // Load full geometry mode word
lhu $24, activeClipPlanes
vge $v2, $v2, $v4[1] // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y
or $10, $16, $7
sll $20, vGeomMid, 29 // Original bit 10 (now bit 2) in the sign bit, for facing cull
vmrg tLPos, $v6, $v4 // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2)
or $10, $10, $8 // $10 = all clip bits which are true for any verts
or $10, $16, $7
vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y
and $10, $10, $24 // If clipping is enabled, check clip flags
or $10, $10, $8 // $10 = all clip bits which are true for any verts
vmrg $v4, tHPos, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3)
mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended
vmrg tHPos, $v8, tHPos // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2)
bnez $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
// 30 cycles
sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull
and $10, $10, $24 // If clipping is enabled, check clip flags
vlt $v29, $v6, $v2 // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
bnez $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
// 29 cycles
srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor
sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
vmrg tMPos, $v4, tLPos // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2)
bltz $11, return_and_end_mat // Cull if bit is set (culled based on facing)
// 34 cycles
// 32 cycles
vmrg tLPos, tLPos, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3)
tSubPxHF equ $v4
tSubPxHI equ $v26
vmudn tSubPxHF, tHPos, $v31[5] // 0x4000
beqz $9, return_and_end_mat // If cross product is 0, tri is degenerate (zero area), cull.
// 36 cycles
mfc2 $1, tHPos[12] // tHPos = lowest Y value = highest on screen (x, y, addr)
// 34 cycles
.if !CFG_NO_OCCLUSION_PLANE
and $16, $16, $7
.endif
vsub tPosMmH, tMPos, tHPos
.if !CFG_NO_OCCLUSION_PLANE
and $16, $16, $8
.endif
vsub tPosLmH, tLPos, tHPos
.if !CFG_NO_OCCLUSION_PLANE
andi $16, $16, CLIP_OCCLUDED
bnez $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
// 38 cycles
.endif
mfc2 $1, tHPos[10] // tHPos = lowest Y value = highest on screen (x, y, addr)
// 36 cycles if NOC (39 if occlusion plane)
vsub tPosHmM, tHPos, tMPos
mfc2 $2, tMPos[10] // tMPos = mid vertex (x, y, addr)
tPosCatI equ $v15 // 0 X L-M; 1 Y L-M; 2 X M-H; 3 X L-H; 4-7 garbage
tPosCatF equ $v25
vsub tPosMmH, tMPos, tHPos
mfc2 $2, tMPos[12] // tMPos = mid vertex (x, y, addr)
vsub tPosLmH, tLPos, tHPos
.if !ENABLE_PROFILING
sll $11, $6, 10 // Moves the value of G_SHADING_SMOOTH into the sign bit
.endif
vsub tPosHmM, tHPos, tMPos
andi $6, $6, (G_SHADE | G_ZBUFFER)
vsub tPosCatI, tLPos, tMPos
mfc2 $3, tLPos[12] // tLPos = highest Y value = lowest on screen (x, y, addr)
vmov tPosCatI[2], tPosMmH[0]
.if !CFG_NO_OCCLUSION_PLANE
and $16, $16, $7
and $16, $16, $8
andi $16, $16, CLIP_OCCLUDED
.if !ENABLE_PROFILING
andi $11, vGeomMid, G_SHADING_SMOOTH >> 8
.endif
tXPF equ $v16 // Triangle cross product
tXPI equ $v17
tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
tXPRcpI equ $v24
vmov tPosCatI[2], tPosMmH[0]
lbu $6, geometryModeLabel + 3 // Load lowest byte for G_SHADE, G_ZBUFFER. Also has G_ATTROFFSET_ST_ENABLE, but G_TRI_FILL will get OR'd into it and force that set.
vmudh $v29, tPosMmH, tPosLmH[0]
mfc2 $3, tLPos[10] // tLPos = highest Y value = lowest on screen (x, y, addr)
t1WI equ $v13 // elems 0, 4, 6
t1WF equ $v14
vmudh $v29, tPosMmH, tPosLmH[0]
.if !CFG_NO_OCCLUSION_PLANE
bnez $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
.endif
llv t1WI[0], VTX_INV_W_VEC($1)
vmadh $v29, tPosLmH, tPosHmM[0]
lpv tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
llv t1WI[0], VTX_INV_W_VEC($1)
tXPF equ $v16 // Triangle cross product
tXPI equ $v17
vreadacc tXPI, ACC_UPPER
lpv tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
lpv tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
vreadacc tXPF, ACC_MIDDLE
lpv tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
vrcp $v20[0], tPosCatI[1]
.if !ENABLE_PROFILING
lpv $v25[0], VTX_COLOR_VEC($4) // Load RGB from vertex 4 (flat shading vtx)
.endif
vmov tPosCatI[3], tPosLmH[0]
llv t1WI[8], VTX_INV_W_VEC($2)
vrcph $v22[0], tXPI[1]
vrcp $v20[0], tPosCatI[1]
lpv tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
vmov tPosCatI[3], tPosLmH[0]
llv t1WI[12], VTX_INV_W_VEC($3)
vrcph $v22[0], tXPI[1]
lpv tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
tXPRcpI equ $v24
vrcpl tXPRcpF[1], tXPF[1]
.if !ENABLE_PROFILING
bltz $11, tri_skip_flat_shading // Branch if G_SHADING_SMOOTH is set
.endif
vrcph tXPRcpI[1], $v31[2] // 0
.if !ENABLE_PROFILING
lbu $10, rdpHalf1Val + 1 // Original vertex 1
lhu $10, (vertexTable)($10)
lpv $v25[0], VTX_COLOR_VEC($10) // Load RGB from vertex 4 (flat shading vtx)
vlt $v29, $v31, $v31[3] // Set vcc to 11100000
vmrg tHAtI, $v25, tHAtI // RGB from $4, alpha from $1
vmrg tMAtI, $v25, tMAtI // RGB from $4, alpha from $2
vmrg tLAtI, $v25, tLAtI // RGB from $4, alpha from $3
tri_skip_flat_shading:
.endif
// 52 cycles
// 49 cycles
vrcp $v20[2], tPosMmH[1]
lb $20, (alphaCompareCullMode)($zero)
vrcph $v22[2], tPosMmH[1]
@@ -1471,7 +1474,7 @@ tri_skip_flat_shading:
xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull
bltz $24, return_and_end_mat // if max < thresh or if min >= thresh.
tri_skip_alpha_compare_cull:
// 63 cycles
// 60 cycles
vmudm tPosCatF, tPosCatI, vTRC_1000
// no nop if tri_skip_alpha_compare_cull was unaligned
vmadn tPosCatI, $v31, $v31[2] // 0
@@ -1494,7 +1497,7 @@ tMx1W equ $v27
vmadm $v29, tXPRcpI, tXPF
mfc2 $16, tXPI[1]
vmadn tXPF, tXPRcpF, tXPI
lbu $7, textureSettings1 + 2
vmadh tXPI, tXPRcpI, tXPI
lsv tMAtI[14], VTX_SCR_Z($2)
vand $v22, $v20, vTRC_FFF8
@@ -1504,11 +1507,8 @@ tMx1W equ $v27
vmudh $v29, vOne, $v31[4] // 4
lsv tLAtF[14], VTX_SCR_Z_FRAC($3)
vmadn tXPF, tXPF, $v31[0] // -4
ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
vmadh tXPI, tXPI, $v31[0] // -4
or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
vmudn $v29, $v3, tHPos[0]
sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
vmadl $v29, $v22, tSubPxHF[1]
ssv tLPos[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient
vmadm $v29, tPosCatI, tSubPxHF[1]
@@ -1516,17 +1516,17 @@ tMx1W equ $v27
vmadn $v2, $v22, tSubPxHI[1]
ssv tHPos[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient
vmadh $v3, tPosCatI, tSubPxHI[1]
lw $19, otherMode1
ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
tMnWI equ $v27
tMnWF equ $v10
vrcph $v29[0], tMx1W[0] // Reciprocal of max 1/W = min W
andi $10, $16, 0x0080 // Extract the left major flag from $16
or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
vrcpl tMnWF[0], tMx1W[1]
or $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
lbu $7, textureSettings1 + 2
vmudh t1WF, vOne, t1WI[1q]
sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
vrcph tMnWI[0], $v31[2] // 0
sb $zero, materialCullMode // This covers tri write out
lw $19, otherMode1
tSTWHMI equ $v22 // H = elems 0-2, M = elems 4-6; init W = 7FFF
tSTWHMF equ $v25
vmudh tSTWHMI, vOne, $v31[7] // 0x7FFF
@@ -1550,8 +1550,11 @@ tSTWLF equ $v13
vmadh tSTWHMI, tSTWHMI, t1WI[0h]
ldv tPosLmH[8], 0x0030(rdpCmdBufPtr) // MmHY -> e4, LmHX -> e5, HmMX -> e6
vmadn tSTWHMF, $v31, $v31[2] // 0
andi $10, $16, 0x0080 // Extract the left major flag from $16
vmudm $v29, tSTWLI, t1WF[6] // (S, T, 7FFF) * (1 or <1) for L
or $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
vmadh tSTWLI, tSTWLI, t1WI[6]
sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
vmadn tSTWLF, $v31, $v31[2] // 0
sdv tSTWHMI[0], 0x0020(rdpCmdBufPtr) // Move S, T, W Hi Int to temp mem
vmrg tMAtI, tMAtI, tSTWHMI // Merge S, T, W Mid into elems 4-6
@@ -1564,7 +1567,7 @@ tSTWLF equ $v13
.if !ENABLE_PROFILING
addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
.endif
// 106 cycles
// 103 cycles
vmudl $v29, tXPF, tXPRcpF
lsv tHAtF[14], VTX_SCR_Z_FRAC($1)
vmadm $v29, tXPI, tXPRcpF
@@ -1574,14 +1577,15 @@ tSTWLF equ $v13
vmadh tXPRcpI, tXPI, tXPRcpI
addi $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients)
vmudh tPosLmH, tPosLmH, $v31[0h] // e1 LmHY * -4 = 4*HmLY; e456 MmHY,LmHX,HmMX *= 4
andi $3, $6, G_SHADE
tAtLmHF equ $v10
tAtLmHI equ $v9
tAtMmHF equ $v13
tAtMmHI equ $v7
vsubc tAtLmHF, tLAtF, tHAtF
andi $3, $6, G_SHADE
vsub tAtLmHI, tLAtI, tHAtI
sll $1, $1, 14
vsub tAtLmHI, tLAtI, tHAtI
sb $zero, materialCullMode // This covers tri write out
vsubc tAtMmHF, tMAtF, tHAtF
sw $1, 0x0008(rdpCmdBufPtr) // Store XL edge coefficient
vsub tAtMmHI, tMAtI, tHAtI
@@ -1636,7 +1640,7 @@ tDaDyI equ $v7
// DaDe = DaDx * factor
tDaDeF equ $v8
tDaDeI equ $v9
// 135 cycles
// 132 cycles
vmadl $v29, tDaDxF, $v20[3]
sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional)
vmadm $v29, tDaDxI, $v20[3]
@@ -1677,7 +1681,7 @@ tri_return_from_decal_fix_z:
slv tDaDeI[12], 0x08($10) // DzDeI:F
bltz dmemAddr, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
slv $v10[12], 0x00($10) // ZI:F
// 156 cycles
// 153 cycles
flush_rdp_buffer: // Prereq: dmemAddr = rdpCmdBufPtr - rdpCmdBufEndP1, or dmemAddr = large neg num -> only wait and set DPC_END
mfc0 $11, SP_DMA_BUSY // Check if any DMA is in flight
lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write
@@ -2140,7 +2144,6 @@ vtx_after_dma:
vtx_constants_for_clip:
// Sets up constants needed for vertex loop, including during clipping.
// Results fill vPerm1:4. Uses misc temps.
lhu vGeomMid, geometryModeLabel + 1 // Load middle 2 bytes of geom mode
.if CFG_NO_OCCLUSION_PLANE
llv sFOG[0], (fogFactor - altBase)(altBaseReg) // Load fog multiplier 0 and offset 1
ldv sVPO[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7
@@ -2612,6 +2615,7 @@ tri_snake_over_input_buffer:
j displaylist_dma_tri_snake // inputBufferPos is now 0; load whole buffer
li nextRA, tri_snake_ret_from_input_buffer
tri_snake_ret_from_input_buffer:
li $ra, tri_snake_loop // Clobbered by DMA. Putting this in the loop saves an instruction but loop takes 1 more cycle per tri.
j tri_snake_loop_from_input_buffer // inputBufferPos pointing to first byte loaded
lbu $3, (inputBufferEnd)(inputBufferPos) // Load c; clear real index b sign bit -> don't exit
@@ -2913,12 +2917,13 @@ G_SETSCISSOR_handler: // $1 is 0 if jumped here
j G_RDP_handler // Send the command to the RDP
sw cmd_w1_dram, (scissorBottomRight)($1) // otherMode1 = scissorBottomRight + 8
G_GEOMETRYMODE_handler: // 5; $7 = G_GEOMETRYMODE (as negative) if jumped here
lw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value
G_GEOMETRYMODE_handler:
lw $11, geometryModeLabel // load the geometry mode value
and $11, $11, cmd_w0 // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
or $11, $11, cmd_w1_dram // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
sw $11, geometryModeLabel // update the geometry mode value
j run_next_DL_command // run the next DL command
sw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // update the geometry mode value
lsr vGeomMid, $11, 8 // Middle 2 bytes of geom mode to lower 16 bits
G_TEXTURE_handler: // 4
li $11, textureSettings1 - (texrectWord1 - G_TEXRECTFLIP_handler) // Calculate the offset from texrectWord1 and $11 for saving to textureSettings