diff --git a/docs/Documentation/Performance.md b/docs/Documentation/Performance.md
index d4d8db6..639511b 100644
--- a/docs/Documentation/Performance.md
+++ b/docs/Documentation/Performance.md
@@ -45,7 +45,7 @@ even to an odd number of lights adds a different time than vice versa.
 | 1st tri to occluded        | Can't  | Can't      | 43     |
 | Only/2nd tri to draw       | 172    | 156        | 158    |
 | 1st tri to draw            | 173    | 157        | 159    |
-| Extra per tri from snake   | Can't  | 9          | 9      |
+| Tri snake                  | Can't  | *          | *      |
 | Vtx before DMA start       | 16     | 17         | 17     |
 | Vtx pair, no lighting      | 54     | 54         | 70     |
 | Vtx pair, 0 dir lts        | Can't  | 65         | 81     |
@@ -88,3 +88,113 @@ even to an odd number of lights adds a different time than vice versa.
 | Light dir xfrm, 7 dir lts  | 375    | 170        | 170    |
 | Light dir xfrm, 8 dir lts  | Can't  | 171        | 171    |
 | Light dir xfrm, 9 dir lts  | Can't  | 196        | 196    |
+
+## Triangle Snake Cycle Counts
+
+### Very Long Snakes
+
+For this section, we assume almost all tris are contained in very long snakes,
+so the overhead of starting and ending snakes is negligible. This overhead is
+discussed in the next section.
+
+We are assuming that the same set of tris is being drawn with or without snakes.
+Thus, the cycles from `tri_main_from_snake` up to (but not including) the
+instruction after the return are not counted here, as they are the same
+regardless of which method is being used.
+
+For a pair of tris drawn without snakes, i.e. with a single `SP2Triangles`
+command, the cycles are:
+- Command dispatch: 12
+- First tri up to `tri_main_from_snake`: 5
+- Second tri up to `tri_main_from_snake`: 4
+- Total: 21
+
+For a pair of tris which are part of a long snake, the cycles are:
+- Each tri up to `tri_main_from_snake`: 11
+- Total: 22
+
+However, there are also memory bandwidth savings. The `SP2Triangles` command
+is 8 bytes and the two tris in a long snake are 2 bytes, so switching to snakes
+saves 6 bytes of bandwidth. Testing has shown that RSP DMAs on average transfer
+about 2.2 bytes per cycle, though it depends on the length. So this is a savings
+of about 6 / 2.2 = 2.7 cycles of RDRAM / RDP time. Since the DMAs loading this
+data are input buffer loads, and the RSP stalls waiting for input buffer loads
+(it does not do useful work during this time), this is also 2.7 cycles of RSP
+time. This more than offsets the 1 extra cycle of processing the tri pair above.
+
+Therefore, switching to snakes (assuming very long snakes) saves about 2.7
+cycles of RDRAM / RDP time and 2.7 - 1 = 1.7 cycles of RSP time per two tris,
+or about 1.4 RDRAM cycles and 0.9 RSP cycles per tri.
+
+### Starting a Snake
+
+Since a `SPTriSnake` command encodes 5 triangles, for comparison to
+`SP2Triangles` we will consider the overhead for 10 triangles total / two snake
+starts.
+
+For `SPTriSnake`, this is 2 x (12 cycles command dispatch + 4 cycles snake
+initialization + 5 tris x 11 cycles per tri as discussed above) = 142 RSP
+cycles. It is also 16 bytes of loads = 7.3 cycles of RDRAM / RDP time, during
+which the RSP stalls. So the total cost is 149.3 RSP and 7.3 RDRAM cycles.
+
+For `SP2Triangles`, this is 5 x (21 cycles as discussed above) = 105 RSP cycles.
+It is also 40 bytes of loads = 18.2 cycles of RDRAM / RDP time, during which
+the RSP stalls. So the total cost is 123.2 RSP and 18.2 RDRAM cycles.
+
+Compared to `SP2Triangles`, the two snake starts cost 26.1 more RSP cycles and
+save 10.9 RDRAM cycles. But drawing those 10 tris in very long snakes would
+have saved 8.5 RSP and 13.5 RDRAM cycles. So the cost of two start-of-snakes
+relative to very long snakes is 26.1 + 8.5 = 34.6 RSP cycles and 13.5 - 10.9 =
+2.6 RDRAM cycles, i.e. 17.3 RSP and 1.3 RDRAM cycles per start-of-snake.
+
+### Ending a Snake
+
+Ending a snake costs 12 cycles of RSP time and has no direct impact on memory
+traffic. However, calculating the overall performance is more complicated: the
+snake can end after 1-8 bytes of the `SPContinueSnake` command, and the
+remaining bytes are "wasted" in that they do not contribute to drawing tris
+with memory bandwidth savings.
+
+From a mesh optimization standpoint, this is not an issue. If you have a snake
+which has filled 8 bytes of the previous `SPContinueSnake` command, and you have
+another triangle to draw, there are only two cases. If that tri can't be
+appended to the snake, you have to draw it with a `SP1Triangle` command either
+way, so there is no performance difference. If it can be appended to the snake,
+doing so will take 8 bytes of memory traffic--the same as the `SP1Triangle`
+command. The snake end penalty will have to be paid whether before or after this
+tri. And it's 11 RSP cycles to draw one more tri in an existing snake, whereas
+the command dispatch plus second tri code for `SP1Triangle` is 16 cycles. So
+it's better to continue a snake than to stop it early and use non-snake
+commands, even if this leads to a mostly empty `SPContinueSnake` command. Of
+course, if you can fill up even more tris in the command, the performance
+benefit increases.
+
+Assuming snake lengths are uniformly distributed, on average a snake will end
+after 4.5 bytes (and thus 4.5 triangles) of a `SPContinueSnake` command.
+In this case, the command will take 4.5 tris x 11 cycles per tri + 12 cycle end
+snake penalty = 61.5 RSP cycles, and 8 bytes of memory traffic = 3.6 RDRAM
+cycles. If these 4.5 tris were instead drawn with `SP2Triangles` commands, that
+would be 2.25 commands = 47.3 RSP cycles and 18 bytes = 8.2 RDRAM cycles. Thus
+on average, the snake end costs 14.2 RSP cycles and saves 4.6 RDRAM cycles
+compared to `SP2Triangles` commands. But drawing those 4.5 tris as part of very
+long snakes would have saved 3.9 RSP cycles and 6.1 RDRAM cycles. So the average
+cost of ending a snake relative to very long snakes is 18.1 RSP and 1.5 RDRAM
+cycles.
+
+### Example
+
+Suppose there are 4000 tris on screen. Suppose that 90% of them have been
+encoded with snakes--the rest are disconnected single tris or tri pairs (quads).
+That remaining 10% is encoded with `SP2Triangles` commands, which perform the
+same with or without snakes, so we ignore those tris, and there are 3600
+"snakeable" tris in the scene.
+
+Suppose that the average snake length is 16, to account for some objects with
+more contiguous tris with the same material, and others with smaller disjoint
+parts. Thus, for 3600 tris, there are 225 snakes.
+
+Switching the 3600 tris from `SP2Triangles` commands to long snakes saves
+4860 RDRAM cycles and 3060 RSP cycles. However, the 225 snake starts and ends
+cost 630 RDRAM and 7965 RSP cycles relative to this. So the net effect of
+switching to snakes in this case (at 62.5 MHz) is that the RDRAM / RDP goes
+faster by 4230 cycles = 68 us, but the RSP goes slower by 4905 cycles = 78 us.
diff --git a/f3dex3.s b/f3dex3.s
index 72d6ca9..1facf14 100644
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -755,7 +755,7 @@ $zero ---------------------- Hardwired zero ------------------------------------
 $1    v1 texptr   <------------- vtxLeft ------------------------------>  temp, init 0
 $2    v2 shdptr   clipVNext -------> <----- lbPostAo   laPtr                  temp
 $3    v3 shdflg   clipVLastOfsc  vLoopRet ---------> laVtxLeft                temp
-$4                                    ~ unused! ~
+$4    <--------- origV1Idx -------->
 $5    ------------------------- vGeomMid ---------------------------------------------
 $6    geom mode   clipMaskIdx -----> <-- lbTexgenOrRet laSTKept
 $7    v2flag tile <------------- fogFlag ---------->  laPacked  mtx valid   cmd byte
@@ -797,6 +797,12 @@ perfCounterA   equ $28   // Performance counter A (functions depend on config)
 perfCounterB   equ $29   // Performance counter B (functions depend on config)
 perfCounterC   equ $30   // Performance counter C (functions depend on config)
 
+// Tri write:
+origV1Idx      equ $4    // Original / current vertex 1 index (not address)
+
+// Vertex init:
+viLtFlag       equ $9    // Holds pointLightFlag or dirLightsXfrmValid
+
 // Vertex write:
 vtxLeft        equ $1    // Number of vertices left to process * 0x10
 vLoopRet       equ $3    // Return address at end of vtx loop = top of loop or misc lighting
@@ -826,7 +832,7 @@ laSpecFres     equ $16   // Nonzero if doing ltadv_normal_to_vertex for specular
 laL2A          equ $19   // Nonzero if light-to-alpha (cel shading) enabled
 laTexgen       equ $20   // Nonzero if texgen enabled
 
-// Clipping
+// Clipping:
 clipVNext      equ $2    // Next vertex (vertex at forward end of current edge)
 clipVLastOfsc  equ $3    // Last vertex / offscreen vertex
 clipVOnsc      equ $19   // Onscreen vertex
@@ -837,10 +843,7 @@ clipPolyRead   equ $17   // Read pointer within current polygon being clipped
 clipPolySelect equ $18   // Clip poly double buffer selection
 clipPolyWrite  equ $21   // Write pointer within current polygon being clipped
 
-// Vertex init
-viLtFlag       equ $9    // Holds pointLightFlag or dirLightsXfrmValid
-
-// Misc
+// Misc:
 nextRA         equ $10   // Address to return to after overlay load
 ovlInitClock   equ $16   // Temp for profiling
 dmaLen         equ $19   // DMA length in bytes minus 1
@@ -1292,18 +1295,24 @@ G_TRISNAKE_handler:
     sw      cmd_w0, rdpHalf1Val          // Store indices a, b, c
     addi    inputBufferPos, inputBufferPos, -6 // Point to byte 2, index b of 1st tri
     li      $ra, tri_snake_loop          // For tri_main
+    lbu     origV1Idx, rdpHalf1Val + 1   // Initial value, normally carried over
 tri_snake_loop:
     lh      $3, (inputBufferEnd)(inputBufferPos) // Load indices b and c
     addi    inputBufferPos, inputBufferPos, 1  // Increment indices being read
-tri_snake_loop_from_input_buffer:
-    lb      $2, rdpHalf1Val + 1          // Old v1; == index b, except when bridging between old and new load
-    bltz    $3, tri_snake_end            // Upper bit of real index b set = done
-     andi   $11, $3, 1                   // Get direction flag from index c
     beqz    inputBufferPos, tri_snake_over_input_buffer // == 0 at end of input buffer
-     andi   $3, $3, 0x7E                 // Mask out flags from index c
-    sb      $3, rdpHalf1Val + 1          // Store index c as vertex 1
-    j       tri_main
-     sb     $2, (rdpHalf1Val + 2)($11)   // Store old v1 as 2 if dir clear or 3 if set
+tri_snake_loop_from_input_buffer:
+     andi   $11, $3, 1                   // Get direction flag from index c (beqz delay slot; also entry point after input buffer reload)
+    bltz    $3, tri_snake_end            // Upper bit of real index b set = done
+     sb     origV1Idx, (rdpHalf1Val + 2)($11) // Store old v1 as 2 if dir clear or 3 if set (bltz delay slot, so also runs on exit)
+    andi    origV1Idx, $3, 0x7E          // New v1 = index c with flag bits masked out
+    sb      origV1Idx, rdpHalf1Val + 1   // Store index c as vertex 1
+    j       tri_main_from_snake          // Enter tri_main after its lpv + lbu origV1Idx; origV1Idx is already set
+     lpv    $v27[4], (rdpHalf1Val)($zero) // Copy of tri_main's first instr: indices to vector unit in elems 5-7
+
+tri_snake_ret_from_input_buffer:
+    li      $ra, tri_snake_loop          // Restore tri_main return addr, clobbered by DMA. Done here, not in the loop, to save a cycle per tri.
+    j       tri_snake_loop_from_input_buffer // Re-enter loop past the lh; inputBufferPos pointing to first byte loaded
+     lbu    $3, (inputBufferEnd)(inputBufferPos) // Load c zero-extended; real index b sign bit clear -> bltz doesn't exit
 
 // H = highest on screen = lowest Y value; then M = mid, L = low
 tHAtF equ $v5
@@ -1319,6 +1328,8 @@ tPosMmH equ $v6
 tPosLmH equ $v8
 tPosHmM equ $v11
 
+align_with_warning 8, "One instruction of padding before tris"
+
 G_TRI2_handler:
 G_QUAD_handler:
     jal     tri_main                     // Send second tri; return here for first tri
@@ -1328,12 +1339,13 @@ G_TRI1_handler:
     sw      cmd_w0, rdpHalf1Val          // Store first tri indices
 tri_main:
     lpv     $v27[4], (rdpHalf1Val)($zero) // To vector unit in elems 5-7
-    lbu     $1, rdpHalf1Val + 1
+    lbu     origV1Idx, rdpHalf1Val + 1
+tri_main_from_snake:
     lbu     $2, rdpHalf1Val + 2
     vclr    vZero
     lbu     $3, rdpHalf1Val + 3
     vmudn   $v29, vOne, vTRC_VB    // Address of vertex buffer
-    lhu     $1, (vertexTable)($1)
+    lhu     $1, (vertexTable)(origV1Idx)
     vmadl   $v27, $v27, vTRC_VS    // Plus vtx indices times length
     lhu     $2, (vertexTable)($2)
     vmadl   $v6, $v31, $v31[2]    // 0; vtx 1 addr in $v6 elem 5
@@ -1420,9 +1432,7 @@ tPosCatF equ $v25
     andi    $11, vGeomMid, G_SHADING_SMOOTH >> 8
 .endif
     vmudh   $v29, tPosMmH, tPosLmH[0]
-.if !ENABLE_PROFILING
-    lbu     $10, rdpHalf1Val + 1        // Original vertex 1 before shuffle and clipping
-.endif
+    // nop
 t1WI equ $v13 // elems 0, 4, 6
     vmadh   $v29, tPosLmH, tPosHmM[0]
     mfc2    $3, tLPos[10]     // tLPos = highest Y value = lowest on screen (x, y, addr)
@@ -1432,7 +1442,7 @@ tXPI equ $v17
     lpv     tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
     vreadacc tXPF, ACC_MIDDLE
 .if !ENABLE_PROFILING
-    lhu     $10, (vertexTable)($10)
+    lhu     $10, (vertexTable)(origV1Idx)
 .endif
     vrcp    $v20[0], tPosCatI[1]
     lpv     tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
@@ -1776,13 +1786,8 @@ return_and_end_mat:
      sb     $zero, materialCullMode // This covers all tri early exits except clipping
 
 tri_snake_over_input_buffer:
-    j       displaylist_dma_tri_snake    // inputBufferPos is now 0; load whole buffer
-     li     nextRA, tri_snake_ret_from_input_buffer
-tri_snake_ret_from_input_buffer:
-    li      $ra, tri_snake_loop          // Clobbered by DMA. Putting this in the loop saves an instruction but loop takes 1 more cycle per tri.
-    j       tri_snake_loop_from_input_buffer // inputBufferPos pointing to first byte loaded
-     lbu    $3, (inputBufferEnd)(inputBufferPos) // Load c; clear real index b sign bit -> don't exit
-
+    bgez    $3, displaylist_dma_tri_snake // $3 >= 0: snake continues, load more input. $3 < 0: last tri flag set, fall through to end
+     li     nextRA, tri_snake_ret_from_input_buffer // Where to resume after the load. inputBufferPos is now 0; load whole buffer
 tri_snake_end:
     addi    inputBufferPos, inputBufferPos, 7 // Round up to whole input command
     addi    $11, $zero, 0xFFF8               // Sign-extend; andi is zero-extend!