From 86be1a09b9b83359e43aea35846321ee711ae134 Mon Sep 17 00:00:00 2001
From: Sauraen <sauraen@gmail.com>
Date: Sun, 3 Aug 2025 18:10:50 -0700
Subject: [PATCH] Working on optimizations

---
 docs/Documentation/Performance.md |  30 ++---
 f3dex3.s                          | 179 +++++++++++++++---------------
 2 files changed, 107 insertions(+), 102 deletions(-)

diff --git a/docs/Documentation/Performance.md b/docs/Documentation/Performance.md
index b318231..21cda28 100644
--- a/docs/Documentation/Performance.md
+++ b/docs/Documentation/Performance.md
@@ -18,8 +18,8 @@ into account, but in some cases it is assumed to be optimal.
 All numbers assume default profiling configuration. Tri numbers assume texture,
 shade, and Z, and not flushing the buffer. Tri numbers are measured from the
 first cycle of the command handler inclusive, to the first cycle of whatever is
-after $ra exclusive; this is in order to capture the extra latency and stalls in
-F3DEX2.
+after $ra exclusive; this is in order to capture an extra stall cycle in F3DEX2
+when finishing a triangle and going to the next command.
 
 Vertex numbers assume no extra F3DEX3 features (packed normals, ambient
 occlusion, etc.). These features are listed below as the number of extra cycles
@@ -33,19 +33,19 @@ even to an odd number of lights adds a different time than vice versa.
 |----------------------------|--------|------------|--------|
 | Command dispatch           | 12     | 12         | 12     |
 | Small RDP command          | 14     | 5          | 5      |
-| Only/2nd tri to offscreen  | 27     | 26         | 26     |
-| 1st tri to offscreen       | 28     | 27         | 27     |
-| Only/2nd tri to clip       | 32     | 31         | 31     |
-| 1st tri to clip            | 33     | 32         | 32     |
-| Only/2nd tri to backface   | 38     | 38         | 38     |
-| 1st tri to backface        | 39     | 39         | 39     |
-| Only/2nd tri to degenerate | 42     | 40         | 40     |
-| 1st tri to degenerate      | 43     | 41         | 41     |
-| Only/2nd tri to occluded   | Can't  | Can't      | 49     |
-| 1st tri to occluded        | Can't  | Can't      | 50     |
-| Only/2nd tri to draw       | 172    | 159        | 162    |
-| 1st tri to draw            | 173    | 160        | 163    |
-| Extra per tri from snake   | Can't  | 10         | 10     |
+| Only/2nd tri to offscreen  | 27     | 25         | 25     |
+| 1st tri to offscreen       | 28     | 26         | 26     |
+| Only/2nd tri to clip       | 32     | 30         | 30     |
+| 1st tri to clip            | 33     | 31         | 31     |
+| Only/2nd tri to backface   | 38     | 36         | 36     |
+| 1st tri to backface        | 39     | 37         | 37     |
+| Only/2nd tri to degenerate | 42     | 38         | 38     |
+| 1st tri to degenerate      | 43     | 39         | 39     |
+| Only/2nd tri to occluded   | Can't  | Can't      | 42     |
+| 1st tri to occluded        | Can't  | Can't      | 43     |
+| Only/2nd tri to draw       | 172    | 156        | 159    |
+| 1st tri to draw            | 173    | 157        | 160    |
+| Extra per tri from snake   | Can't  | 9          | 9      |
 | Vtx before DMA start       | 16     | 17         | 17     |
 | Vtx pair, no lighting      | 54     | 54         | 70     |
 | Vtx pair, 0 dir lts        | Can't  | 65         | 81     |
diff --git a/f3dex3.s b/f3dex3.s
index f37c1f1..bf61229 100644
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -786,6 +786,7 @@ $ra   return address, sometimes sign bit is flag -------------------------------
 */
 
 // Global scalar regs:
+vGeomMid       equ $5    // Middle two bytes of geometry mode in lower 16 bits
 perfCounterD   equ $12   // Performance counter D (functions depend on config)
 altBaseReg     equ $13   // Alternate base address register for vector loads
 rdpCmdBufEndP1 equ $22   // Pointer to one command word past "end" (middle) of RDP command buf
@@ -799,7 +800,6 @@ perfCounterC   equ $30   // Performance counter C (functions depend on config)
 // Vertex write:
 vtxLeft        equ $1    // Number of vertices left to process * 0x10
 vLoopRet       equ $3    // Return address at end of vtx loop = top of loop or misc lighting
-vGeomMid       equ $5    // Middle two bytes of geometry mode
 fogFlag        equ $7    // 8 if fog enabled, else 0
 outVtx2        equ $8    // Pointer to second or dummy (= outVtx1) transformed vert
 inVtx          equ $14   // Pointer to loaded vertex to transform; < 0 means from clipping.
@@ -1098,6 +1098,7 @@ finish_setup:
     sw      $11, startCounterTime
 .endif
     sh      $zero, mvpValid  // and dirLightsXfrmValid
+    lhu     vGeomMid, geometryModeLabel + 1
     li      inputBufferPos, 0
     li      cmd_w1_dram, orga(ovl1_start)
     j       load_overlays_0_1
@@ -1277,20 +1278,20 @@ G_MODIFYVTX_handler:
 //                  cmd_w0 + inputBufferEnd
 G_TRISNAKE_handler:
     sw      cmd_w0, rdpHalf1Val          // Store indices a, b, c
-    addi    inputBufferPos, inputBufferPos, -5 // Point to byte 3, index c of 1st tri
+    addi    inputBufferPos, inputBufferPos, -6 // Point to byte 2, index b of 1st tri
+    li      $ra, tri_snake_loop          // Return address for tri_main; set once here, outside the loop
 tri_snake_loop:
-    lh      $3, (inputBufferEnd - 1)(inputBufferPos) // Load indices b and c
+    lh      $3, (inputBufferEnd)(inputBufferPos) // Load indices b and c
+    addi    inputBufferPos, inputBufferPos, 1  // Increment indices being read
 tri_snake_loop_from_input_buffer:
     lb      $2, rdpHalf1Val + 1          // Old v1; == index b, except when bridging between old and new load
-    li      $ra, tri_snake_loop          // For tri_main
     bltz    $3, tri_snake_end            // Upper bit of real index b set = done
      andi   $11, $3, 1                   // Get direction flag from index c
     beqz    inputBufferPos, tri_snake_over_input_buffer // == 0 at end of input buffer
      andi   $3, $3, 0x7E                 // Mask out flags from index c
     sb      $3, rdpHalf1Val + 1          // Store index c as vertex 1
-    sb      $2, (rdpHalf1Val + 2)($11)   // Store old v1 as 2 if dir clear or 3 if set
     j       tri_main
-     addi   inputBufferPos, inputBufferPos, 1  // Increment indices being read
+     sb     $2, (rdpHalf1Val + 2)($11)   // Store old v1 as 2 if dir clear or 3 if set
 
 // H = highest on screen = lowest Y value; then M = mid, L = low
 tHAtF equ $v5
@@ -1317,128 +1318,130 @@ tri_main:
     lpv     $v27[4], (rdpHalf1Val)($zero) // To vector unit in elems 5-7
     lbu     $1, rdpHalf1Val + 1
     lbu     $2, rdpHalf1Val + 2
-    lbu     $3, rdpHalf1Val + 3
     vclr    vZero
-    lhu     $1, (vertexTable)($1)
+    lbu     $3, rdpHalf1Val + 3
     vmudn   $v29, vOne, vTRC_VB    // Address of vertex buffer
-    lhu     $2, (vertexTable)($2)
+    lhu     $1, (vertexTable)($1)
     vmadl   $v27, $v27, vTRC_VS    // Plus vtx indices times length
+    lhu     $2, (vertexTable)($2)
+    vmadl   $v6, $v31, $v31[2]    // 0; vtx 1 addr in $v6 elem 5
     lhu     $3, (vertexTable)($3)
-    vmadl   $v4, $v31, $v31[2]    // 0; vtx 2 addr in $v4 elem 6
 .if !ENABLE_PROFILING
+    // vnop
     addi    perfCounterB, perfCounterB, 0x4000  // Increment number of tris requested
-    move    $4, $1                // Save original vertex 1 addr (pre-shuffle) for flat shading
 .endif
 tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
     vnxor   tHAtF, vZero, $v31[7]  // v5 = 0x8000; init frac value for attrs for rounding
+    vmov    $v4[5], $v27[6]         // elem 5 of v4 = vertex 2 addr
     llv     $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
-    vnxor   tMAtF, vZero, $v31[7]  // v7 = 0x8000; init frac value for attrs for rounding
+    vmov    $v8[5], $v27[7]         // elem 5 of v8 = vertex 3 addr
     llv     $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
-    vmov    $v6[6], $v27[5]         // elem 6 of v6 = vertex 1 addr
+    vnxor   tMAtF, vZero, $v31[7]  // v7 = 0x8000; init frac value for attrs for rounding
     llv     $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8
     vnxor   tLAtF, vZero, $v31[7]  // v9 = 0x8000; init frac value for attrs for rounding
     lhu     $16, VTX_CLIP($1)
-    vmov    $v8[6], $v27[7]         // elem 6 of v8 = vertex 3 addr
-    lhu     $7, VTX_CLIP($2)
-    // vnop
-    lhu     $8, VTX_CLIP($3)
     vmudh   $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1
-    andi    $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
+    lhu     $7, VTX_CLIP($2)
     vsub    $v10, $v6, $v4    // v10 = vertex 1 - vertex 2 (x, y, addr)
-    and     $11, $11, $7
+    lhu     $8, VTX_CLIP($3)
     vsub    $v12, $v6, $v8    // v12 = vertex 1 - vertex 3 (x, y, addr)
-    and     $11, $11, $8
+    andi    $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
     vsub    $v11, $v4, $v6    // v11 = vertex 2 - vertex 1 (x, y, addr)
+    and     $11, $11, $7
     vlt     $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y
+    and     $11, $11, $8
+    vmrg    tHPos, $v6, $v4   // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
     bnez    $11, return_and_end_mat // Then the whole tri is offscreen, cull
-     // 22 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
-     vmrg   tHPos, $v6, $v4   // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
-    vmudh   $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ... 
-    lhu     $24, activeClipPlanes
+     // 21 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
+     vmudh  $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ... 
     vmadh   $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
-    lw      $6, geometryModeLabel // Load full geometry mode word
+    lhu     $24, activeClipPlanes
     vge     $v2, $v2, $v4[1]  // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y
-    or      $10, $16, $7
+    sll     $20, vGeomMid, 29 // Original bit 10 (now bit 2) in the sign bit, for facing cull
     vmrg    tLPos, $v6, $v4   // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2)
-    or      $10, $10, $8      // $10 = all clip bits which are true for any verts
+    or      $10, $16, $7
     vge     $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y
-    and     $10, $10, $24     // If clipping is enabled, check clip flags
+    or      $10, $10, $8      // $10 = all clip bits which are true for any verts
     vmrg    $v4, tHPos, $v8   // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3)
     mfc2    $9, $v26[0]       // elem 0 = x = cross product => lower 16 bits, sign extended
     vmrg    tHPos, $v8, tHPos // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2)
-    bnez    $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
-     // 30 cycles
-     sll    $20, $6, 21       // Bit 10 in the sign bit, for facing cull
+    and     $10, $10, $24     // If clipping is enabled, check clip flags
     vlt     $v29, $v6, $v2    // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
-    srl     $11, $9, 31       // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
+    bnez    $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
+     // 29 cycles
+     srl    $11, $9, 31       // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
     vmudh   $v3, vOne, $v31[5] // 0x4000; some rounding factor
     sllv    $11, $20, $11     // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
     vmrg    tMPos, $v4, tLPos // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2)
     bltz    $11, return_and_end_mat // Cull if bit is set (culled based on facing)
-     // 34 cycles
+     // 32 cycles
      vmrg   tLPos, tLPos, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3)
 tSubPxHF equ $v4
 tSubPxHI equ $v26
     vmudn   tSubPxHF, tHPos, $v31[5] // 0x4000
     beqz    $9, return_and_end_mat  // If cross product is 0, tri is degenerate (zero area), cull.
-     // 36 cycles
-     mfc2   $1, tHPos[12]     // tHPos = lowest Y value = highest on screen (x, y, addr)
+     // 34 cycles
+.if !CFG_NO_OCCLUSION_PLANE
+     and    $16, $16, $7
+.endif
+     vsub   tPosMmH, tMPos, tHPos
+.if !CFG_NO_OCCLUSION_PLANE
+    and     $16, $16, $8
+.endif
+    vsub    tPosLmH, tLPos, tHPos
+.if !CFG_NO_OCCLUSION_PLANE
+    andi    $16, $16, CLIP_OCCLUDED
+    bnez    $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
+     // 38 cycles
+.endif
+     mfc2   $1, tHPos[10]     // tHPos = lowest Y value = highest on screen (x, y, addr)
+    // 36 cycles if NOC (39 if occlusion plane)
+    vsub    tPosHmM, tHPos, tMPos
+    mfc2    $2, tMPos[10]     // tMPos = mid vertex (x, y, addr)
 tPosCatI equ $v15 // 0 X L-M; 1 Y L-M; 2 X M-H; 3 X L-H; 4-7 garbage
 tPosCatF equ $v25
-    vsub    tPosMmH, tMPos, tHPos
-    mfc2    $2, tMPos[12]     // tMPos = mid vertex (x, y, addr)
-    vsub    tPosLmH, tLPos, tHPos
-.if !ENABLE_PROFILING
-    sll     $11, $6, 10       // Moves the value of G_SHADING_SMOOTH into the sign bit
-.endif
-    vsub    tPosHmM, tHPos, tMPos
-    andi    $6, $6, (G_SHADE | G_ZBUFFER)
     vsub    tPosCatI, tLPos, tMPos
-    mfc2    $3, tLPos[12]     // tLPos = highest Y value = lowest on screen (x, y, addr)
-    vmov    tPosCatI[2], tPosMmH[0]
-.if !CFG_NO_OCCLUSION_PLANE
-    and     $16, $16, $7
-    and     $16, $16, $8
-    andi    $16, $16, CLIP_OCCLUDED
+.if !ENABLE_PROFILING
+    sll     $11, vGeomMid, 18 // G_SHADING_SMOOTH (geom mode bit 21 = vGeomMid bit 13) into the sign bit, for bltz below
 .endif
-tXPF equ $v16 // Triangle cross product
-tXPI equ $v17
-tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
-tXPRcpI equ $v24
+    vmov    tPosCatI[2], tPosMmH[0]
+    lbu     $6, geometryModeLabel + 3 // Load lowest byte for G_SHADE, G_ZBUFFER. Also has G_ATTROFFSET_ST_ENABLE, but G_TRI_FILL will get OR'd into it and force that set.
+    vmudh   $v29, tPosMmH, tPosLmH[0]
+    mfc2    $3, tLPos[10]     // tLPos = highest Y value = lowest on screen (x, y, addr)
 t1WI equ $v13 // elems 0, 4, 6
 t1WF equ $v14
-    vmudh   $v29, tPosMmH, tPosLmH[0]
-.if !CFG_NO_OCCLUSION_PLANE
-    bnez    $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
-.endif
-    llv     t1WI[0], VTX_INV_W_VEC($1)
     vmadh   $v29, tPosLmH, tPosHmM[0]
-    lpv     tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
+    llv     t1WI[0], VTX_INV_W_VEC($1)
+tXPF equ $v16 // Triangle cross product
+tXPI equ $v17
     vreadacc tXPI, ACC_UPPER
-    lpv     tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
+    lpv     tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
     vreadacc tXPF, ACC_MIDDLE
-    lpv     tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
-    vrcp    $v20[0], tPosCatI[1]
-.if !ENABLE_PROFILING
-    lpv     $v25[0], VTX_COLOR_VEC($4)  // Load RGB from vertex 4 (flat shading vtx)
-.endif
-    vmov    tPosCatI[3], tPosLmH[0]
     llv     t1WI[8], VTX_INV_W_VEC($2)
-    vrcph   $v22[0], tXPI[1]
+    vrcp    $v20[0], tPosCatI[1]
+    lpv     tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
+    vmov    tPosCatI[3], tPosLmH[0]
     llv     t1WI[12], VTX_INV_W_VEC($3)
+    vrcph   $v22[0], tXPI[1]
+    lpv     tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
+tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
+tXPRcpI equ $v24
     vrcpl   tXPRcpF[1], tXPF[1]
 .if !ENABLE_PROFILING
     bltz    $11, tri_skip_flat_shading  // Branch if G_SHADING_SMOOTH is set
 .endif
      vrcph  tXPRcpI[1], $v31[2]            // 0
 .if !ENABLE_PROFILING
+    lbu     $10, rdpHalf1Val + 1         // Index of original (pre-shuffle) vertex 1
+    lhu     $10, (vertexTable)($10)      // Convert index to vertex address, same lookup as in tri_main
+    lpv     $v25[0], VTX_COLOR_VEC($10)  // Load RGB from original vertex 1 (flat shading vtx; formerly kept in $4)
     vlt     $v29, $v31, $v31[3]         // Set vcc to 11100000
     vmrg    tHAtI, $v25, tHAtI        // RGB from $4, alpha from $1
     vmrg    tMAtI, $v25, tMAtI        // RGB from $4, alpha from $2
     vmrg    tLAtI, $v25, tLAtI        // RGB from $4, alpha from $3
 tri_skip_flat_shading:
 .endif
-    // 52 cycles
+    // 49 cycles
     vrcp    $v20[2], tPosMmH[1]
     lb      $20, (alphaCompareCullMode)($zero)
     vrcph   $v22[2], tPosMmH[1]
@@ -1471,7 +1474,7 @@ tri_skip_flat_shading:
     xor     $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull
     bltz    $24, return_and_end_mat // if max < thresh or if min >= thresh.
 tri_skip_alpha_compare_cull:
-    // 63 cycles
+    // 60 cycles
     vmudm   tPosCatF, tPosCatI, vTRC_1000
     // no nop if tri_skip_alpha_compare_cull was unaligned
     vmadn   tPosCatI, $v31, $v31[2] // 0
@@ -1494,7 +1497,7 @@ tMx1W equ $v27
     vmadm   $v29, tXPRcpI, tXPF
     mfc2    $16, tXPI[1]
     vmadn   tXPF, tXPRcpF, tXPI
-    lbu     $7, textureSettings1 + 2
+
     vmadh   tXPI, tXPRcpI, tXPI
     lsv     tMAtI[14], VTX_SCR_Z($2)
     vand    $v22, $v20, vTRC_FFF8
@@ -1504,11 +1507,8 @@ tMx1W equ $v27
     vmudh   $v29, vOne, $v31[4] // 4
     lsv     tLAtF[14], VTX_SCR_Z_FRAC($3)
     vmadn   tXPF, tXPF, $v31[0] // -4
-    ori     $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
     vmadh   tXPI, tXPI, $v31[0] // -4
-    or      $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
     vmudn   $v29, $v3, tHPos[0]
-    sb      $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
     vmadl   $v29, $v22, tSubPxHF[1]
     ssv     tLPos[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient
     vmadm   $v29, tPosCatI, tSubPxHF[1]
@@ -1516,17 +1516,17 @@ tMx1W equ $v27
     vmadn   $v2, $v22, tSubPxHI[1]
     ssv     tHPos[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient
     vmadh   $v3, tPosCatI, tSubPxHI[1]
-    lw      $19, otherMode1
+    ori     $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
 tMnWI equ $v27
 tMnWF equ $v10
     vrcph   $v29[0], tMx1W[0] // Reciprocal of max 1/W = min W
-    andi    $10, $16, 0x0080 // Extract the left major flag from $16
+    or      $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
     vrcpl   tMnWF[0], tMx1W[1]
-    or      $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
+    lbu     $7, textureSettings1 + 2
     vmudh   t1WF, vOne, t1WI[1q]
-    sb      $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
+    sb      $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
     vrcph   tMnWI[0], $v31[2]     // 0
-    sb      $zero, materialCullMode // This covers tri write out
+    lw      $19, otherMode1
 tSTWHMI equ $v22 // H = elems 0-2, M = elems 4-6; init W = 7FFF
 tSTWHMF equ $v25
     vmudh   tSTWHMI, vOne, $v31[7]  // 0x7FFF
@@ -1550,8 +1550,11 @@ tSTWLF equ $v13
     vmadh   tSTWHMI, tSTWHMI, t1WI[0h]
     ldv     tPosLmH[8], 0x0030(rdpCmdBufPtr) // MmHY -> e4, LmHX -> e5, HmMX -> e6
     vmadn   tSTWHMF, $v31, $v31[2]  // 0
+    andi    $10, $16, 0x0080 // Extract the left major flag from $16
     vmudm   $v29, tSTWLI, t1WF[6]  // (S, T, 7FFF) * (1 or <1) for L
+    or      $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
     vmadh   tSTWLI, tSTWLI, t1WI[6]
+    sb      $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
     vmadn   tSTWLF, $v31, $v31[2]  // 0
     sdv     tSTWHMI[0], 0x0020(rdpCmdBufPtr) // Move S, T, W Hi Int to temp mem
     vmrg    tMAtI, tMAtI, tSTWHMI // Merge S, T, W Mid into elems 4-6
@@ -1564,7 +1567,7 @@ tSTWLF equ $v13
 .if !ENABLE_PROFILING
     addi    perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
 .endif
-    // 106 cycles
+    // 103 cycles
     vmudl   $v29, tXPF, tXPRcpF
     lsv     tHAtF[14], VTX_SCR_Z_FRAC($1)
     vmadm   $v29, tXPI, tXPRcpF
@@ -1574,14 +1577,15 @@ tSTWLF equ $v13
     vmadh   tXPRcpI, tXPI, tXPRcpI
     addi    $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients)
     vmudh   tPosLmH, tPosLmH, $v31[0h] // e1 LmHY * -4 = 4*HmLY; e456 MmHY,LmHX,HmMX *= 4
+    andi    $3, $6, G_SHADE
 tAtLmHF equ $v10
 tAtLmHI equ $v9
 tAtMmHF equ $v13
 tAtMmHI equ $v7
     vsubc   tAtLmHF, tLAtF, tHAtF
-    andi    $3, $6, G_SHADE
-    vsub    tAtLmHI, tLAtI, tHAtI
     sll     $1, $1, 14
+    vsub    tAtLmHI, tLAtI, tHAtI
+    sb      $zero, materialCullMode // This covers tri write out
     vsubc   tAtMmHF, tMAtF, tHAtF
     sw      $1, 0x0008(rdpCmdBufPtr)         // Store XL edge coefficient
     vsub    tAtMmHI, tMAtI, tHAtI
@@ -1636,7 +1640,7 @@ tDaDyI equ $v7
 // DaDe = DaDx * factor
 tDaDeF equ $v8
 tDaDeI equ $v9
-    // 135 cycles
+    // 132 cycles
     vmadl   $v29, tDaDxF, $v20[3]
     sdv     tDaDxF[8], 0x0018($1)   // Store DsDx, DtDx, DwDx texture coefficients (fractional)
     vmadm   $v29, tDaDxI, $v20[3]
@@ -1677,7 +1681,7 @@ tri_return_from_decal_fix_z:
     slv     tDaDeI[12], 0x08($10)  // DzDeI:F
     bltz    dmemAddr, return_and_end_mat     // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
      slv    $v10[12], 0x00($10)   // ZI:F
-     // 156 cycles
+     // 153 cycles
 flush_rdp_buffer: // Prereq: dmemAddr = rdpCmdBufPtr - rdpCmdBufEndP1, or dmemAddr = large neg num -> only wait and set DPC_END
     mfc0    $11, SP_DMA_BUSY                 // Check if any DMA is in flight
     lw      cmd_w1_dram, rdpFifoPos          // FIFO pointer = end of RDP read, start of RSP write
@@ -2140,7 +2144,6 @@ vtx_after_dma:
 vtx_constants_for_clip:
     // Sets up constants needed for vertex loop, including during clipping.
     // Results fill vPerm1:4. Uses misc temps.
-    lhu     vGeomMid, geometryModeLabel + 1       // Load middle 2 bytes of geom mode
 .if CFG_NO_OCCLUSION_PLANE
     llv     sFOG[0], (fogFactor - altBase)(altBaseReg) // Load fog multiplier 0 and offset 1
     ldv     sVPO[0], (viewport + 8)($zero)        // Load vtrans duplicated in 0-3 and 4-7
@@ -2612,6 +2615,7 @@ tri_snake_over_input_buffer:
     j       displaylist_dma_tri_snake    // inputBufferPos is now 0; load whole buffer
      li     nextRA, tri_snake_ret_from_input_buffer
 tri_snake_ret_from_input_buffer:
+    li      $ra, tri_snake_loop          // Clobbered by DMA. Putting this in the loop saves an instruction but loop takes 1 more cycle per tri.
     j       tri_snake_loop_from_input_buffer // inputBufferPos pointing to first byte loaded
      lbu    $3, (inputBufferEnd)(inputBufferPos) // Load c; clear real index b sign bit -> don't exit
 
@@ -2913,12 +2917,13 @@ G_SETSCISSOR_handler:  // $1 is 0 if jumped here
     j       G_RDP_handler                // Send the command to the RDP
      sw     cmd_w1_dram, (scissorBottomRight)($1) // otherMode1 = scissorBottomRight + 8
 
-G_GEOMETRYMODE_handler: // 5; $7 = G_GEOMETRYMODE (as negative) if jumped here
-    lw      $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value
+G_GEOMETRYMODE_handler: // geometry mode = (old & cmd_w0) | cmd_w1_dram; also refreshes global vGeomMid
+    lw      $11, geometryModeLabel  // load the geometry mode value
     and     $11, $11, cmd_w0        // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
     or      $11, $11, cmd_w1_dram   // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
+    sw      $11, geometryModeLabel  // update the geometry mode value
     j       run_next_DL_command     // run the next DL command
-     sw     $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7)  // update the geometry mode value
+     lhu    vGeomMid, geometryModeLabel + 1 // Reload middle 2 bytes, zero-extended, same as in finish_setup
 
 G_TEXTURE_handler: // 4
     li      $11, textureSettings1 - (texrectWord1 - G_TEXRECTFLIP_handler)  // Calculate the offset from texrectWord1 and $11 for saving to textureSettings