Working on optimizations

2026-01-21 10:37:45 -08:00 · 2025-08-03 18:10:50 -07:00
parent 84fc8d3786
commit 86be1a09b9
2 changed files with 107 additions and 102 deletions
--- a/docs/Documentation/Performance.md
+++ b/docs/Documentation/Performance.md
@@ -18,8 +18,8 @@ into account, but in some cases it is assumed to be optimal.
 All numbers assume default profiling configuration. Tri numbers assume texture,
 shade, and Z, and not flushing the buffer. Tri numbers are measured from the
 first cycle of the command handler inclusive, to the first cycle of whatever is
-after $ra exclusive; this is in order to capture the extra latency and stalls in
-F3DEX2.
+after $ra exclusive; this is in order to capture an extra stall cycle in F3DEX2
+when finishing a triangle and going to the next command.

 Vertex numbers assume no extra F3DEX3 features (packed normals, ambient
 occlusion, etc.). These features are listed below as the number of extra cycles
@@ -33,19 +33,19 @@ even to an odd number of lights adds a different time than vice versa.
 |----------------------------|--------|------------|--------|
 | Command dispatch           | 12     | 12         | 12     |
 | Small RDP command          | 14     | 5          | 5      |
-| Only/2nd tri to offscreen  | 27     | 26         | 26     |
-| 1st tri to offscreen       | 28     | 27         | 27     |
-| Only/2nd tri to clip       | 32     | 31         | 31     |
-| 1st tri to clip            | 33     | 32         | 32     |
-| Only/2nd tri to backface   | 38     | 38         | 38     |
-| 1st tri to backface        | 39     | 39         | 39     |
-| Only/2nd tri to degenerate | 42     | 40         | 40     |
-| 1st tri to degenerate      | 43     | 41         | 41     |
-| Only/2nd tri to occluded   | Can't  | Can't      | 49     |
-| 1st tri to occluded        | Can't  | Can't      | 50     |
-| Only/2nd tri to draw       | 172    | 159        | 162    |
-| 1st tri to draw            | 173    | 160        | 163    |
-| Extra per tri from snake   | Can't  | 10         | 10     |
+| Only/2nd tri to offscreen  | 27     | 25         | 25     |
+| 1st tri to offscreen       | 28     | 26         | 26     |
+| Only/2nd tri to clip       | 32     | 30         | 30     |
+| 1st tri to clip            | 33     | 31         | 31     |
+| Only/2nd tri to backface   | 38     | 36         | 36     |
+| 1st tri to backface        | 39     | 37         | 37     |
+| Only/2nd tri to degenerate | 42     | 38         | 38     |
+| 1st tri to degenerate      | 43     | 39         | 39     |
+| Only/2nd tri to occluded   | Can't  | Can't      | 42     |
+| 1st tri to occluded        | Can't  | Can't      | 43     |
+| Only/2nd tri to draw       | 172    | 156        | 159    |
+| 1st tri to draw            | 173    | 157        | 160    |
+| Extra per tri from snake   | Can't  | 9          | 9      |
 | Vtx before DMA start       | 16     | 17         | 17     |
 | Vtx pair, no lighting      | 54     | 54         | 70     |
 | Vtx pair, 0 dir lts        | Can't  | 65         | 81     |
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -786,6 +786,7 @@ $ra   return address, sometimes sign bit is flag -------------------------------
 */

 // Global scalar regs:
+vGeomMid       equ $5    // Middle two bytes of geometry mode in lower 16 bits
 perfCounterD   equ $12   // Performance counter D (functions depend on config)
 altBaseReg     equ $13   // Alternate base address register for vector loads
 rdpCmdBufEndP1 equ $22   // Pointer to one command word past "end" (middle) of RDP command buf
@@ -799,7 +800,6 @@ perfCounterC   equ $30   // Performance counter C (functions depend on config)
 // Vertex write:
 vtxLeft        equ $1    // Number of vertices left to process * 0x10
 vLoopRet       equ $3    // Return address at end of vtx loop = top of loop or misc lighting
-vGeomMid       equ $5    // Middle two bytes of geometry mode
 fogFlag        equ $7    // 8 if fog enabled, else 0
 outVtx2        equ $8    // Pointer to second or dummy (= outVtx1) transformed vert
 inVtx          equ $14   // Pointer to loaded vertex to transform; < 0 means from clipping.
@@ -1098,6 +1098,7 @@ finish_setup:
    sw      $11, startCounterTime
 .endif
    sh      $zero, mvpValid  // and dirLightsXfrmValid
+    lhu     vGeomMid, geometryModeLabel + 1
    li      inputBufferPos, 0
    li      cmd_w1_dram, orga(ovl1_start)
    j       load_overlays_0_1
@@ -1277,20 +1278,20 @@ G_MODIFYVTX_handler:
 //                  cmd_w0 + inputBufferEnd
 G_TRISNAKE_handler:
    sw      cmd_w0, rdpHalf1Val          // Store indices a, b, c
-    addi    inputBufferPos, inputBufferPos, -5 // Point to byte 3, index c of 1st tri
+    addi    inputBufferPos, inputBufferPos, -6 // Point to byte 2, index b of 1st tri (incremented to c right after the load below)
+    li      $ra, tri_snake_loop          // Return address for tri_main; set once here instead of on every loop iteration
 tri_snake_loop:
-    lh      $3, (inputBufferEnd - 1)(inputBufferPos) // Load indices b and c
+    lh      $3, (inputBufferEnd)(inputBufferPos) // Load indices b (upper byte, sign = end flag) and c (lower byte)
+    addi    inputBufferPos, inputBufferPos, 1  // Advance to index c; the input-buffer refill path enters below and skips this increment
 tri_snake_loop_from_input_buffer:
    lb      $2, rdpHalf1Val + 1          // Old v1; == index b, except when bridging between old and new load
-    li      $ra, tri_snake_loop          // For tri_main
    bltz    $3, tri_snake_end            // Upper bit of real index b set = done
     andi   $11, $3, 1                   // Get direction flag from index c
    beqz    inputBufferPos, tri_snake_over_input_buffer // == 0 at end of input buffer
     andi   $3, $3, 0x7E                 // Mask out flags from index c
    sb      $3, rdpHalf1Val + 1          // Store index c as vertex 1
-    sb      $2, (rdpHalf1Val + 2)($11)   // Store old v1 as 2 if dir clear or 3 if set
    j       tri_main
-     addi   inputBufferPos, inputBufferPos, 1  // Increment indices being read
+     sb     $2, (rdpHalf1Val + 2)($11)   // Delay slot: store old v1 as vertex 2 if dir clear or vertex 3 if set

 // H = highest on screen = lowest Y value; then M = mid, L = low
 tHAtF equ $v5
@@ -1317,128 +1318,130 @@ tri_main:
    lpv     $v27[4], (rdpHalf1Val)($zero) // To vector unit in elems 5-7
    lbu     $1, rdpHalf1Val + 1
    lbu     $2, rdpHalf1Val + 2
-    lbu     $3, rdpHalf1Val + 3
    vclr    vZero
-    lhu     $1, (vertexTable)($1)
+    lbu     $3, rdpHalf1Val + 3
    vmudn   $v29, vOne, vTRC_VB    // Address of vertex buffer
-    lhu     $2, (vertexTable)($2)
+    lhu     $1, (vertexTable)($1)
    vmadl   $v27, $v27, vTRC_VS    // Plus vtx indices times length
+    lhu     $2, (vertexTable)($2)
+    vmadl   $v6, $v31, $v31[2]    // 0; vtx 1 addr in $v6 elem 5
    lhu     $3, (vertexTable)($3)
-    vmadl   $v4, $v31, $v31[2]    // 0; vtx 2 addr in $v4 elem 6
 .if !ENABLE_PROFILING
+    // vnop
    addi    perfCounterB, perfCounterB, 0x4000  // Increment number of tris requested
-    move    $4, $1                // Save original vertex 1 addr (pre-shuffle) for flat shading
 .endif
 tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
    vnxor   tHAtF, vZero, $v31[7]  // v5 = 0x8000; init frac value for attrs for rounding
+    vmov    $v4[5], $v27[6]         // elem 5 of v4 = vertex 2 addr
    llv     $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
-    vnxor   tMAtF, vZero, $v31[7]  // v7 = 0x8000; init frac value for attrs for rounding
+    vmov    $v8[5], $v27[7]         // elem 5 of v8 = vertex 3 addr
    llv     $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
-    vmov    $v6[6], $v27[5]         // elem 6 of v6 = vertex 1 addr
+    vnxor   tMAtF, vZero, $v31[7]  // v7 = 0x8000; init frac value for attrs for rounding
    llv     $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8
    vnxor   tLAtF, vZero, $v31[7]  // v9 = 0x8000; init frac value for attrs for rounding
    lhu     $16, VTX_CLIP($1)
-    vmov    $v8[6], $v27[7]         // elem 6 of v8 = vertex 3 addr
-    lhu     $7, VTX_CLIP($2)
-    // vnop
-    lhu     $8, VTX_CLIP($3)
    vmudh   $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1
-    andi    $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
+    lhu     $7, VTX_CLIP($2)
    vsub    $v10, $v6, $v4    // v10 = vertex 1 - vertex 2 (x, y, addr)
-    and     $11, $11, $7
+    lhu     $8, VTX_CLIP($3)
    vsub    $v12, $v6, $v8    // v12 = vertex 1 - vertex 3 (x, y, addr)
-    and     $11, $11, $8
+    andi    $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
    vsub    $v11, $v4, $v6    // v11 = vertex 2 - vertex 1 (x, y, addr)
+    and     $11, $11, $7
    vlt     $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y
+    and     $11, $11, $8
+    vmrg    tHPos, $v6, $v4   // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
    bnez    $11, return_and_end_mat // Then the whole tri is offscreen, cull
-     // 22 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
-     vmrg   tHPos, $v6, $v4   // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
-    vmudh   $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ... 
-    lhu     $24, activeClipPlanes
+     // 21 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
+     vmudh  $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ... 
    vmadh   $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
-    lw      $6, geometryModeLabel // Load full geometry mode word
+    lhu     $24, activeClipPlanes
    vge     $v2, $v2, $v4[1]  // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y
-    or      $10, $16, $7
+    sll     $20, vGeomMid, 29 // Original bit 10 (now bit 2) in the sign bit, for facing cull
    vmrg    tLPos, $v6, $v4   // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2)
-    or      $10, $10, $8      // $10 = all clip bits which are true for any verts
+    or      $10, $16, $7
    vge     $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y
-    and     $10, $10, $24     // If clipping is enabled, check clip flags
+    or      $10, $10, $8      // $10 = all clip bits which are true for any verts
    vmrg    $v4, tHPos, $v8   // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3)
    mfc2    $9, $v26[0]       // elem 0 = x = cross product => lower 16 bits, sign extended
    vmrg    tHPos, $v8, tHPos // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2)
-    bnez    $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
-     // 30 cycles
-     sll    $20, $6, 21       // Bit 10 in the sign bit, for facing cull
+    and     $10, $10, $24     // If clipping is enabled, check clip flags
    vlt     $v29, $v6, $v2    // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
-    srl     $11, $9, 31       // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
+    bnez    $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
+     // 29 cycles
+     srl    $11, $9, 31       // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
    vmudh   $v3, vOne, $v31[5] // 0x4000; some rounding factor
    sllv    $11, $20, $11     // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
    vmrg    tMPos, $v4, tLPos // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2)
    bltz    $11, return_and_end_mat // Cull if bit is set (culled based on facing)
-     // 34 cycles
+     // 32 cycles
     vmrg   tLPos, tLPos, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3)
 tSubPxHF equ $v4
 tSubPxHI equ $v26
    vmudn   tSubPxHF, tHPos, $v31[5] // 0x4000
    beqz    $9, return_and_end_mat  // If cross product is 0, tri is degenerate (zero area), cull.
-     // 36 cycles
-     mfc2   $1, tHPos[12]     // tHPos = lowest Y value = highest on screen (x, y, addr)
+     // 34 cycles
+.if !CFG_NO_OCCLUSION_PLANE
+     and    $16, $16, $7
+.endif
+     vsub   tPosMmH, tMPos, tHPos
+.if !CFG_NO_OCCLUSION_PLANE
+    and     $16, $16, $8
+.endif
+    vsub    tPosLmH, tLPos, tHPos
+.if !CFG_NO_OCCLUSION_PLANE
+    andi    $16, $16, CLIP_OCCLUDED
+    bnez    $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
+     // 38 cycles
+.endif
+     mfc2   $1, tHPos[10]     // tHPos = lowest Y value = highest on screen (x, y, addr)
+    // 36 cycles if NOC (39 if occlusion plane)
+    vsub    tPosHmM, tHPos, tMPos
+    mfc2    $2, tMPos[10]     // tMPos = mid vertex (x, y, addr)
 tPosCatI equ $v15 // 0 X L-M; 1 Y L-M; 2 X M-H; 3 X L-H; 4-7 garbage
 tPosCatF equ $v25
-    vsub    tPosMmH, tMPos, tHPos
-    mfc2    $2, tMPos[12]     // tMPos = mid vertex (x, y, addr)
-    vsub    tPosLmH, tLPos, tHPos
-.if !ENABLE_PROFILING
-    sll     $11, $6, 10       // Moves the value of G_SHADING_SMOOTH into the sign bit
-.endif
-    vsub    tPosHmM, tHPos, tMPos
-    andi    $6, $6, (G_SHADE | G_ZBUFFER)
    vsub    tPosCatI, tLPos, tMPos
-    mfc2    $3, tLPos[12]     // tLPos = highest Y value = lowest on screen (x, y, addr)
-    vmov    tPosCatI[2], tPosMmH[0]
-.if !CFG_NO_OCCLUSION_PLANE
-    and     $16, $16, $7
-    and     $16, $16, $8
-    andi    $16, $16, CLIP_OCCLUDED
+.if !ENABLE_PROFILING
+    sll     $11, vGeomMid, 18 // G_SHADING_SMOOTH (geom mode bit 21 = bit 13 of vGeomMid) into the sign bit, tested by bltz before flat shading
 .endif
-tXPF equ $v16 // Triangle cross product
-tXPI equ $v17
-tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
-tXPRcpI equ $v24
+    vmov    tPosCatI[2], tPosMmH[0]
+    lbu     $6, geometryModeLabel + 3 // Load lowest byte for G_SHADE, G_ZBUFFER. Also has G_ATTROFFSET_ST_ENABLE, but G_TRI_FILL will get OR'd into it and force that set.
+    vmudh   $v29, tPosMmH, tPosLmH[0]
+    mfc2    $3, tLPos[10]     // tLPos = highest Y value = lowest on screen (x, y, addr)
 t1WI equ $v13 // elems 0, 4, 6
 t1WF equ $v14
-    vmudh   $v29, tPosMmH, tPosLmH[0]
-.if !CFG_NO_OCCLUSION_PLANE
-    bnez    $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
-.endif
-    llv     t1WI[0], VTX_INV_W_VEC($1)
    vmadh   $v29, tPosLmH, tPosHmM[0]
-    lpv     tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
+    llv     t1WI[0], VTX_INV_W_VEC($1)
+tXPF equ $v16 // Triangle cross product
+tXPI equ $v17
    vreadacc tXPI, ACC_UPPER
-    lpv     tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
+    lpv     tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
    vreadacc tXPF, ACC_MIDDLE
-    lpv     tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
-    vrcp    $v20[0], tPosCatI[1]
-.if !ENABLE_PROFILING
-    lpv     $v25[0], VTX_COLOR_VEC($4)  // Load RGB from vertex 4 (flat shading vtx)
-.endif
-    vmov    tPosCatI[3], tPosLmH[0]
    llv     t1WI[8], VTX_INV_W_VEC($2)
-    vrcph   $v22[0], tXPI[1]
+    vrcp    $v20[0], tPosCatI[1]
+    lpv     tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
+    vmov    tPosCatI[3], tPosLmH[0]
    llv     t1WI[12], VTX_INV_W_VEC($3)
+    vrcph   $v22[0], tXPI[1]
+    lpv     tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
+tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
+tXPRcpI equ $v24
    vrcpl   tXPRcpF[1], tXPF[1]
 .if !ENABLE_PROFILING
    bltz    $11, tri_skip_flat_shading  // Branch if G_SHADING_SMOOTH is set
 .endif
     vrcph  tXPRcpI[1], $v31[2]            // 0
 .if !ENABLE_PROFILING
+    lbu     $10, rdpHalf1Val + 1         // Index byte of original (pre-sort) vertex 1
+    lhu     $10, (vertexTable)($10)      // Look up that vertex's address in the vertex table
+    lpv     $v25[0], VTX_COLOR_VEC($10)  // Load color of the flat shading vtx (original vertex 1, address in $10) into $v25
    vlt     $v29, $v31, $v31[3]         // Set vcc to 11100000
    vmrg    tHAtI, $v25, tHAtI        // RGB from $4, alpha from $1
    vmrg    tMAtI, $v25, tMAtI        // RGB from $4, alpha from $2
    vmrg    tLAtI, $v25, tLAtI        // RGB from $4, alpha from $3
 tri_skip_flat_shading:
 .endif
-    // 52 cycles
+    // 49 cycles
    vrcp    $v20[2], tPosMmH[1]
    lb      $20, (alphaCompareCullMode)($zero)
    vrcph   $v22[2], tPosMmH[1]
@@ -1471,7 +1474,7 @@ tri_skip_flat_shading:
    xor     $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull
    bltz    $24, return_and_end_mat // if max < thresh or if min >= thresh.
 tri_skip_alpha_compare_cull:
-    // 63 cycles
+    // 60 cycles
    vmudm   tPosCatF, tPosCatI, vTRC_1000
    // no nop if tri_skip_alpha_compare_cull was unaligned
    vmadn   tPosCatI, $v31, $v31[2] // 0
@@ -1494,7 +1497,7 @@ tMx1W equ $v27
    vmadm   $v29, tXPRcpI, tXPF
    mfc2    $16, tXPI[1]
    vmadn   tXPF, tXPRcpF, tXPI
-    lbu     $7, textureSettings1 + 2
+
    vmadh   tXPI, tXPRcpI, tXPI
    lsv     tMAtI[14], VTX_SCR_Z($2)
    vand    $v22, $v20, vTRC_FFF8
@@ -1504,11 +1507,8 @@ tMx1W equ $v27
    vmudh   $v29, vOne, $v31[4] // 4
    lsv     tLAtF[14], VTX_SCR_Z_FRAC($3)
    vmadn   tXPF, tXPF, $v31[0] // -4
-    ori     $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
    vmadh   tXPI, tXPI, $v31[0] // -4
-    or      $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
    vmudn   $v29, $v3, tHPos[0]
-    sb      $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
    vmadl   $v29, $v22, tSubPxHF[1]
    ssv     tLPos[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient
    vmadm   $v29, tPosCatI, tSubPxHF[1]
@@ -1516,17 +1516,17 @@ tMx1W equ $v27
    vmadn   $v2, $v22, tSubPxHI[1]
    ssv     tHPos[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient
    vmadh   $v3, tPosCatI, tSubPxHI[1]
-    lw      $19, otherMode1
+    ori     $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
 tMnWI equ $v27
 tMnWF equ $v10
    vrcph   $v29[0], tMx1W[0] // Reciprocal of max 1/W = min W
-    andi    $10, $16, 0x0080 // Extract the left major flag from $16
+    or      $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
    vrcpl   tMnWF[0], tMx1W[1]
-    or      $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
+    lbu     $7, textureSettings1 + 2
    vmudh   t1WF, vOne, t1WI[1q]
-    sb      $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
+    sb      $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
    vrcph   tMnWI[0], $v31[2]     // 0
-    sb      $zero, materialCullMode // This covers tri write out
+    lw      $19, otherMode1
 tSTWHMI equ $v22 // H = elems 0-2, M = elems 4-6; init W = 7FFF
 tSTWHMF equ $v25
    vmudh   tSTWHMI, vOne, $v31[7]  // 0x7FFF
@@ -1550,8 +1550,11 @@ tSTWLF equ $v13
    vmadh   tSTWHMI, tSTWHMI, t1WI[0h]
    ldv     tPosLmH[8], 0x0030(rdpCmdBufPtr) // MmHY -> e4, LmHX -> e5, HmMX -> e6
    vmadn   tSTWHMF, $v31, $v31[2]  // 0
+    andi    $10, $16, 0x0080 // Extract the left major flag from $16
    vmudm   $v29, tSTWLI, t1WF[6]  // (S, T, 7FFF) * (1 or <1) for L
+    or      $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
    vmadh   tSTWLI, tSTWLI, t1WI[6]
+    sb      $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
    vmadn   tSTWLF, $v31, $v31[2]  // 0
    sdv     tSTWHMI[0], 0x0020(rdpCmdBufPtr) // Move S, T, W Hi Int to temp mem
    vmrg    tMAtI, tMAtI, tSTWHMI // Merge S, T, W Mid into elems 4-6
@@ -1564,7 +1567,7 @@ tSTWLF equ $v13
 .if !ENABLE_PROFILING
    addi    perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
 .endif
-    // 106 cycles
+    // 103 cycles
    vmudl   $v29, tXPF, tXPRcpF
    lsv     tHAtF[14], VTX_SCR_Z_FRAC($1)
    vmadm   $v29, tXPI, tXPRcpF
@@ -1574,14 +1577,15 @@ tSTWLF equ $v13
    vmadh   tXPRcpI, tXPI, tXPRcpI
    addi    $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients)
    vmudh   tPosLmH, tPosLmH, $v31[0h] // e1 LmHY * -4 = 4*HmLY; e456 MmHY,LmHX,HmMX *= 4
+    andi    $3, $6, G_SHADE
 tAtLmHF equ $v10
 tAtLmHI equ $v9
 tAtMmHF equ $v13
 tAtMmHI equ $v7
    vsubc   tAtLmHF, tLAtF, tHAtF
-    andi    $3, $6, G_SHADE
-    vsub    tAtLmHI, tLAtI, tHAtI
    sll     $1, $1, 14
+    vsub    tAtLmHI, tLAtI, tHAtI
+    sb      $zero, materialCullMode // This covers tri write out
    vsubc   tAtMmHF, tMAtF, tHAtF
    sw      $1, 0x0008(rdpCmdBufPtr)         // Store XL edge coefficient
    vsub    tAtMmHI, tMAtI, tHAtI
@@ -1636,7 +1640,7 @@ tDaDyI equ $v7
 // DaDe = DaDx * factor
 tDaDeF equ $v8
 tDaDeI equ $v9
-    // 135 cycles
+    // 132 cycles
    vmadl   $v29, tDaDxF, $v20[3]
    sdv     tDaDxF[8], 0x0018($1)   // Store DsDx, DtDx, DwDx texture coefficients (fractional)
    vmadm   $v29, tDaDxI, $v20[3]
@@ -1677,7 +1681,7 @@ tri_return_from_decal_fix_z:
    slv     tDaDeI[12], 0x08($10)  // DzDeI:F
    bltz    dmemAddr, return_and_end_mat     // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
     slv    $v10[12], 0x00($10)   // ZI:F
-     // 156 cycles
+     // 153 cycles
 flush_rdp_buffer: // Prereq: dmemAddr = rdpCmdBufPtr - rdpCmdBufEndP1, or dmemAddr = large neg num -> only wait and set DPC_END
    mfc0    $11, SP_DMA_BUSY                 // Check if any DMA is in flight
    lw      cmd_w1_dram, rdpFifoPos          // FIFO pointer = end of RDP read, start of RSP write
@@ -2140,7 +2144,6 @@ vtx_after_dma:
 vtx_constants_for_clip:
    // Sets up constants needed for vertex loop, including during clipping.
    // Results fill vPerm1:4. Uses misc temps.
-    lhu     vGeomMid, geometryModeLabel + 1       // Load middle 2 bytes of geom mode
 .if CFG_NO_OCCLUSION_PLANE
    llv     sFOG[0], (fogFactor - altBase)(altBaseReg) // Load fog multiplier 0 and offset 1
    ldv     sVPO[0], (viewport + 8)($zero)        // Load vtrans duplicated in 0-3 and 4-7
@@ -2612,6 +2615,7 @@ tri_snake_over_input_buffer:
    j       displaylist_dma_tri_snake    // inputBufferPos is now 0; load whole buffer
     li     nextRA, tri_snake_ret_from_input_buffer
 tri_snake_ret_from_input_buffer:
+    li      $ra, tri_snake_loop          // Clobbered by DMA. Putting this in the loop saves an instruction but loop takes 1 more cycle per tri.
    j       tri_snake_loop_from_input_buffer // inputBufferPos pointing to first byte loaded
     lbu    $3, (inputBufferEnd)(inputBufferPos) // Load c; clear real index b sign bit -> don't exit

@@ -2913,12 +2917,13 @@ G_SETSCISSOR_handler:  // $1 is 0 if jumped here
    j       G_RDP_handler                // Send the command to the RDP
     sw     cmd_w1_dram, (scissorBottomRight)($1) // otherMode1 = scissorBottomRight + 8

-G_GEOMETRYMODE_handler: // 5; $7 = G_GEOMETRYMODE (as negative) if jumped here
-    lw      $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value
+G_GEOMETRYMODE_handler:
+    lw      $11, geometryModeLabel  // load the geometry mode value
    and     $11, $11, cmd_w0        // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
    or      $11, $11, cmd_w1_dram   // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
+    sw      $11, geometryModeLabel  // update the geometry mode value
    j       run_next_DL_command     // run the next DL command
-     sw     $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7)  // update the geometry mode value
+     srl    vGeomMid, $11, 8        // Delay slot: refresh cached geom mode >> 8; middle 2 bytes in bits 0-15 (top byte stays in bits 16-23)

 G_TEXTURE_handler: // 4
    li      $11, textureSettings1 - (texrectWord1 - G_TEXRECTFLIP_handler)  // Calculate the offset from texrectWord1 and $11 for saving to textureSettings