New changes working

2026-01-21 10:37:45 -08:00 · 2024-04-10 22:14:32 -07:00
parent e49161f5c4
commit 74d67ae914
2 changed files with 131 additions and 145 deletions
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -1133,11 +1133,14 @@ sSTO equ $v11 // not supported on legacy vtx pipe, but register allocated for it
 sSTS equ $v10
 .else
 sVPO equ $v17
+.if CFG_NO_OCCLUSION_PLANE
+sVPS equ $v26
+.else
 sVPS equ $v16
+.endif
 sSTO equ $v26
 sSTS equ $v25
 .endif
-sCOL equ $v17
 .if CFG_LEGACY_VTX_PIPE
 sOUTF equ vPairTPosF
 sOUTI equ vPairTPosI
@@ -1353,32 +1356,19 @@ vtx_after_calc_mit:
    ldv     vVP0F[8], (vpMatrix  + 0x20)($zero)
    ldv     vVP2F[8], (vpMatrix  + 0x30)($zero)
 .endif
-    andi    $16, $5, G_LIGHTING >> 8  // This is clipFlags, but not touched during vtx_store
+    andi    $11, $5, G_LIGHTING >> 8      // Nonzero if lighting enabled
+    beqz    $11, @@skip_lighting          // Lighting off: keep the value $16 gets in the delay slot
+    li      $16, vtx_return_from_lighting // Branch delay slot, always runs. $16 is clipFlags, but not modified during vtx_store
+     li     $16, lt_vtx_pair              // Lighting on: override. NOTE(review): order is right, but delay-slot indent belongs on the line above
+@@skip_lighting:
    andi    $7, $5, G_FOG >> 8    // Nonzero if fog enabled
    jal     while_wait_dma_busy  // Wait for vertex load to finish
     li     $19, clipTempVerts                 // Temp mem we can freely overwrite replaces outputVtxPos
-    jal     middle_of_vtx_store  // Sets $ra to vtx_load_loop
+    j       middle_of_vtx_store
     move   secondVtxPos, $19                  // for first pre-loop, same for secondVtxPos
-    
-vtx_load_loop:
-    vlt     $v29, $v31, $v31[4]            // Set VCC to 11110000
-.if CFG_LEGACY_VTX_PIPE
-    blez    $1, vertex_end
-.endif
-     nop    // TODO
-.if CFG_LEGACY_VTX_PIPE
-    vmrg    vPairNrml, vPairNrml, vDDD     // Merge normals
-    addi    $1, $1, -2*inputVtxSize        // Counter of remaining verts * inputVtxSize
-.endif
-    vmrg    vPairRGBA, sCOL, vPairRGBA     // Merge colors
-    bnez    $16, lt_vtx_pair               // Lighting enabled in clipFlags
-.if CFG_LEGACY_VTX_PIPE
-     addi   outputVtxPos, outputVtxPos, 2*vtxSize
-.else
-     // Elems 0-1 get bytes 6-7 of the following vertex (0)
-     lpv    vAAA[2],      (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2
-.endif
+
 vtx_return_from_lighting:
+    li      $ra, vertex_end
 .if CFG_LEGACY_VTX_PIPE
    vmudm   vPairST, vPairST, sSTS      // Scale ST; must be after texgen
@@skipsecond:
@@ -1398,22 +1388,17 @@ vtx_return_from_lighting:
    vmadn   $v29, vVP0F, vPairPosI[0h]
    vmadh   $v29, vVP0I, vPairPosI[0h]
    vmadl   $v29, vVP1F, vPairPosF[1h]
-    addi    outputVtxPos, outputVtxPos, 2*vtxSize
    vmadm   $v29, vVP1I, vPairPosF[1h]
-    addi    $1, $1, -2*inputVtxSize     // Counter of remaining verts * inputVtxSize
    vmadn   $v29, vVP1F, vPairPosI[1h]
    vmadh   $v29, vVP1I, vPairPosI[1h]
    vmadl   $v29, vVP2F, vPairPosF[2h]
    vmadm   $v29, vVP2I, vPairPosF[2h]
    vmadn   vPairTPosF, vVP2F, vPairPosI[2h]
-    li      $ra, vertex_end             // Done with vertex processing...
    vmadh   vPairTPosI, vVP2I, vPairPosI[2h]
-    blez    $1, @@skiploop              // ...if <= 0 verts remain, ...
-     vmudm  $v29, vPairST, sSTS         // Scale ST; must be after texgen
-    li      $ra, vtx_load_loop          // ...otherwise keep looping
-@@skiploop:
+    vmudm   $v29, vPairST, sSTS         // Scale ST; must be after texgen
    vmadh   vPairST, sSTO, vOne         // + 1 * (ST offset or zero)
 .endif
+    addi    outputVtxPos, outputVtxPos, 2*vtxSize
 vtx_store:
    // Inputs: vPairTPosI, vPairTPosF, vPairST, vPairRGBA
    // Locals: $v20, $v21, $v25, $v26, $v16, $v17 ($v29 is temp). Also vPairST and
@@ -1425,14 +1410,13 @@ s1WH equ $v16 // vtx_store 1/W High
 s1WL equ $v17 // vtx_store 1/W Low
 sWRL equ $v25 // vtx_store W Reciprocal Low  | IMPORTANT: Can be the same reg as sWRH, but
 sWRH equ $v26 // vtx_store W Reciprocal High | using different ones saves one cycle delay
-    move    secondVtxPos, outputVtxPos          // Second and output vertices write to same mem...
    vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
+    move    secondVtxPos, outputVtxPos          // Second and output vertices write to same mem...
+    vmadm  s1WH, vPairTPosI, $v30[3] // Persp norm
    bltz    $1, @@skipsecond                    // ...if < 0 verts remain, ...
-     vmadm  s1WH, vPairTPosI, $v30[3] // Persp norm
+     vmadn  s1WL, $v31, $v31[2] // 0
    addi    secondVtxPos, outputVtxPos, vtxSize // ...otherwise, second vtx is next vtx
@@skipsecond:
-    vmadn   s1WL, $v31, $v31[2] // 0
-    addi    inputVtxPos, inputVtxPos, 2*inputVtxSize
    vch     $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high
    suv     vPairRGBA[4],     (VTX_COLOR_VEC )(secondVtxPos)
    vcl     $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low
@@ -1460,6 +1444,7 @@ sSCH equ $v21 // vtx_store Scaled Clipping High
 .if CFG_NO_OCCLUSION_PLANE

    vmadn   s1WL, s1WL, sWRH[3h]
+    addi    inputVtxPos, inputVtxPos, 2*inputVtxSize
    vmadh   s1WH, s1WH, sWRH[3h]
    srl     $24, $10, 4            // Shift second vertex screen clipping to first slots
    vch     $v29, vPairTPosI, sSCH[3h] // Clip scaled high
@@ -1480,10 +1465,10 @@ sSCH equ $v21 // vtx_store Scaled Clipping High
    lsv     vPairTPosF[6],  (VTX_Z_FRAC    )(outputVtxPos) // load Z into W slot, will be for fog below
    vmadn   s1WL, s1WL, sWRH[3h]
 middle_of_vtx_store:
-// vPairPosI is $v20
-    ldv     vPairPosI[8],      (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
+// vPairST is $v22
+    ldv     vPairST[0],   (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3
    vmadh   s1WH, s1WH, sWRH[3h] // s1WH:s1WL is 1/W
-    ldv     vPairPosI[0],      (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos)
+    llv     vPairST[8],   (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
    // vnop
    sll     $11, $20, 4            // Shift first vertex scaled clipping to second slots
    // vnop
@@ -1497,59 +1482,63 @@ middle_of_vtx_store:
    vmadh   vPairTPosI, vPairTPosI, s1WH[3h] // pos * 1/W
    ssv     s1WH[6],          (VTX_INV_W_INT )($19)
    // vnop
-.if !CFG_LEGACY_VTX_PIPE
+    slv     vPairST[4],   (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Replace vtx 1 ST with vtx 0 RGBA
+    // vnop
+.if CFG_LEGACY_VTX_PIPE
+    lpv     $v14[7],      (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Y to elem 0, 4
+.else
 // sVPO is $v17 // vtx_store ViewPort Offset
    lqv     sVPO, (0x10)(rdpCmdBufEndP1) // Load viewport offset from temp mem
 .endif
-    // vnop
-.if !CFG_LEGACY_VTX_PIPE
-// sVPS is $v16 // vtx_store ViewPort Scale
+    vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
+.if CFG_LEGACY_VTX_PIPE
+    lpv     $v15[6],      (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Z to elem 0, 4
+.else
+// sVPS is $v26 // vtx_store ViewPort Scale
    lqv     sVPS, (0x00)(rdpCmdBufEndP1) // Load viewport scale from temp mem
 .endif
-    vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
-// vPairST is $v22
-    llv     vPairST[0],   (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1
    vmadm   vPairTPosI, vPairTPosI, $v30[3] // Persp norm
-    llv     vPairST[8],   (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
+// vPairPosI is $v20
+    ldv     vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos)
    vmadn   vPairTPosF, $v31, $v31[2] // 0
-// vDDD is $v26
-    lpv     vDDD[0],          (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
+    ldv     vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
    // vnop
 // vPairRGBA is $v27
-    luv     vPairRGBA[0],     (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
+    luv     vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair RGBA
    // vnop
+// vPairNrml is $v16
+    lpv     vPairNrml[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair normals
    vmudh   $v29, sVPO, vOne // offset * 1
    andi    $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
    vmadn   vPairTPosF, vPairTPosF, sVPS // + XYZ * scale
-// sCOL is $v17, defined at start of loop
-    luv     sCOL[4],          (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Colors as unsigned, lower 4
-    vmadh   vPairTPosI, vPairTPosI, sVPS
    or      $24, $24, $20          // Combine results for second vertex
+    vmadh   vPairTPosI, vPairTPosI, sVPS
+    or      $10, $10, $11          // Combine results for first vertex
 // sFOG is $v25
    vmadh   sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
-    or      $10, $10, $11          // Combine results for first vertex
-    // vnop
    sh      $24,              (VTX_CLIP      )(secondVtxPos) // Store second vertex clip flags
    // vnop
    sh      $10,              (VTX_CLIP      )($19)          // Store first vertex results
+    // vnop
+    ssv     vPairTPosF[12],   (VTX_SCR_Z_FRAC)(secondVtxPos)
 // sCLZ is $v21 // vtx_store CLipped Z
    vge     sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
-    slv     vPairTPosI[8],    (VTX_SCR_VEC   )(secondVtxPos)
-    vge     sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
-    slv     vPairTPosI[0],    (VTX_SCR_VEC   )($19)
-    vmudn   $v29, vM3F, vOne
-    ssv     vPairTPosF[12],   (VTX_SCR_Z_FRAC)(secondVtxPos)
-    vmadh   $v29, vM3I, vOne
    ssv     vPairTPosF[4],    (VTX_SCR_Z_FRAC)($19)
-    vmadn   $v29, vM0F, vPairPosI[0h]
-    // nop TODO
+    vge     sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
+    slv     vPairTPosI[8],    (VTX_SCR_VEC   )(secondVtxPos)
+    vmudn   $v29, vM3F, vOne
+    slv     vPairTPosI[0],    (VTX_SCR_VEC   )($19)
+    vmadh   $v29, vM3I, vOne
+    blez    $1, skip_return_to_lt_or_loop  // Counter $1 <= 0: $ra left as vertex_end or clipping
+     vmadn  $v29, vM0F, vPairPosI[0h]
+    move    $ra, $16                    // Normally $ra = target saved in $16 (lt_vtx_pair or vtx_return_from_lighting)
+skip_return_to_lt_or_loop:
    vmadh   $v29, vM0I, vPairPosI[0h]
-    ssv     sCLZ[12],         (VTX_SCR_Z     )(secondVtxPos)
+    addi    $1, $1, -2*inputVtxSize     // Counter of remaining verts * inputVtxSize
    vmadn   $v29, vM1F, vPairPosI[1h]
-    ssv     sCLZ[4],          (VTX_SCR_Z     )($19)
+    ssv     sCLZ[12],         (VTX_SCR_Z     )(secondVtxPos)
    vmadh   $v29, vM1I, vPairPosI[1h]
-// vPairNrml is $v16
-    lpv     vPairNrml[4],     (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4
+    ssv     sCLZ[4],          (VTX_SCR_Z     )($19)
 // sOUTF = vPairPosF is $v21, or vPairTPosF is $v23
    vmadn   sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords
    beqz    $7, return_routine // fog disabled
@@ -1576,6 +1565,7 @@ middle_of_vtx_store:
    vmadn   s1WL, s1WL, $v31[0] // -4
    ori     $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts
    vmadh   s1WH, s1WH, $v31[0] // -4
+    addi    inputVtxPos, inputVtxPos, 2*inputVtxSize
    vmudn   $v29, vPairTPosF, sOCM // X * kx, Y * ky, Z * kz
    vmadh   $v29, vPairTPosI, sOCM // Int * int
    lsv     vPairTPosF[14], (VTX_Z_FRAC    )(secondVtxPos) // load Z into W slot, will be for fog below
@@ -1589,40 +1579,43 @@ middle_of_vtx_store:
    vmadn   s1WL, s1WL, sWRH[3h]
    sll     $11, $20, 4            // Shift first vertex scaled clipping to second slots
    vmadh   s1WH, s1WH, sWRH[3h] // s1WH:s1WL is 1/W
-    andi    $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about
-    veq     $v29, $v31, $v31[3h] // Set VCC to 00010001
    andi    $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
-    vmrg    sOC1, sOCM, sOC1  // Put constant factor in elems 3, 7
-    or      $24, $24, $20          // Combine results for second vertex
+    veq     $v29, $v31, $v31[3h] // Set VCC to 00010001
+    blez    $1, skip_return_to_lt_or_loop  // Counter $1 <= 0: $ra left as vertex_end or clipping
+     vmrg   sOC1, sOCM, sOC1  // Put constant factor in elems 3, 7
+middle_of_vtx_store:
+    move    $ra, $16                    // Normally $ra = target saved in $16; entry at the label above skips the blez, so this always runs then
+skip_return_to_lt_or_loop:
    vmudl   $v29, vPairTPosF, s1WL[3h]  // W must be overwritten with Z before here
    ssv     s1WL[14],         (VTX_INV_W_FRAC)(secondVtxPos)
    vmadm   $v29, vPairTPosI, s1WL[3h]
-    ssv     s1WL[6],          (VTX_INV_W_FRAC)(outputVtxPos)
+    ssv     s1WL[6],          (VTX_INV_W_FRAC)($19)
    vmadn   vPairTPosF, vPairTPosF, s1WH[3h]
    ssv     s1WH[14],         (VTX_INV_W_INT )(secondVtxPos)
    vmadh   vPairTPosI, vPairTPosI, s1WH[3h] // pos * 1/W
-    ssv     s1WH[6],          (VTX_INV_W_INT )(outputVtxPos)
+    ssv     s1WH[6],          (VTX_INV_W_INT )($19)
    vadd    sOC1, sOC1, sOC1[0q] // Add pairs upwards
 .if !CFG_LEGACY_VTX_PIPE
 // sVPO is $v17 // vtx_store ViewPort Offset
    lqv     sVPO, (0x10)(rdpCmdBufEndP1) // Load viewport offset from temp mem
 .endif
    // vnop
-.if !CFG_LEGACY_VTX_PIPE
+.if CFG_LEGACY_VTX_PIPE
+    addi    $1, $1, -2*inputVtxSize     // Counter of remaining verts * inputVtxSize
+.else
 // sVPS is $v16 // vtx_store ViewPort Scale
    lqv     sVPS, (0x00)(rdpCmdBufEndP1) // Load viewport scale from temp mem
 .endif
    vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
-middle_of_vtx_store:
+// vPairST is $v22
+    ldv     vPairST[0],   (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3
+    vmadm   vPairTPosI, vPairTPosI, $v30[3] // Persp norm
+    llv     vPairST[8],   (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
+    vmadn   vPairTPosF, $v31, $v31[2] // 0
 // vPairPosI is $v20
    ldv     vPairPosI[0],      (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos)
-    vmadm   vPairTPosI, vPairTPosI, $v30[3] // Persp norm
-    ldv     vPairPosI[8],      (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
-    vmadn   vPairTPosF, $v31, $v31[2] // 0
-// vPairST is $v22
-    llv     vPairST[0],   (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1
    vadd    sOC1, sOC1, sOC1[1h] // Add elems 1, 5 to 3, 7
-    llv     vPairST[8],   (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
+    ldv     vPairPosI[8],      (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
    // vnop
 // sO03 is $v26 // vtx_store Occlusion coeffs 0-3
    ldv     sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3
@@ -1634,17 +1627,19 @@ middle_of_vtx_store:
    lqv     sOPM, (0x20)(rdpCmdBufEndP1) // Load occlusion plane -/+4000 constants
 .endif
    vmadh   vPairTPosI, vPairTPosI, sVPS
+    andi    $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about
 // sFOG is $v16
    vmadh   sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
    or      $10, $10, $11          // Combine results for first vertex
    vlt     $v29, sOC1, $v31[2] // Occlusion plane equation < 0 in elems 3, 7
+    slv     vPairST[4],   (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Replace vtx 1 ST with vtx 0 RGBA
    // vnop
    cfc2    $11, $vcc // Load occlusion plane mid results to bits 3 and 7
 // sOSC is $v21 // vtx_store Occlusion SCaled up
    vmudh   sOSC, vPairTPosI, $v31[4] // 4; scale up x and y
    ssv     vPairTPosF[12],   (VTX_SCR_Z_FRAC)(secondVtxPos)
    vge     sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
-    // nop TODO
+    or      $24, $24, $20          // Combine results for second vertex
 // sCLZ is $v25 // vtx_store CLipped Z
    vge     sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
    ssv     vPairTPosF[4],    (VTX_SCR_Z_FRAC)($19)
@@ -1665,23 +1660,27 @@ middle_of_vtx_store:
    veq     $v29, $v31, $v31[0q]       // Set VCC to 10101010
    slv     vPairTPosI[0],    (VTX_SCR_VEC   )($19)
    vmrg    sOC2, sOC2, sOC3           // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
-// vPairNrml is $v16
-    lpv     vPairNrml[4],     (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4
+.if CFG_LEGACY_VTX_PIPE
+    lpv     $v14[7],          (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Y to elem 0, 4
+.else
+    addi    $1, $1, -2*inputVtxSize     // Counter of remaining verts * inputVtxSize
+.endif
    // vnop
    ssv     sCLZ[12],         (VTX_SCR_Z     )(secondVtxPos)
    // vnop
-// vDDD is $v26
-    lpv     vDDD[0],          (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
+.if CFG_LEGACY_VTX_PIPE
+    lpv     $v15[6],          (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Z to elem 0, 4
+.endif
    // vnop
    ssv     sCLZ[4],          (VTX_SCR_Z     )($19)
    vge     $v29, sOC2, sO47           // Each compare to coeffs 4-7
-// sCOL is $v17, defined at start of loop
-    luv     sCOL[4],          (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Colors as unsigned, lower 4
+// vPairNrml is $v16
+    lpv     vPairNrml[0],     (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair normals
    vmudn   $v29, vM3F, vOne
    cfc2    $20, $vcc
    vmadh   $v29, vM3I, vOne
 // vPairRGBA is $v27
-    luv     vPairRGBA[0],     (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
+    luv     vPairRGBA[0],     (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair colors
    vmadn   $v29, vM0F, vPairPosI[0h]
    andi    $11, $11, CLIP_OCCLUDED | (CLIP_OCCLUDED >> 4) // Only bits 3, 7 from occlusion
    vmadh   $v29, vM0I, vPairPosI[0h]
@@ -1876,6 +1875,7 @@ clip_skipxy:
    vsubc   $v29, vClDiffF, vOne[0]       // frac part - 1 for carry
    vge     vClDiffI, vClDiffI, $v31[2]   // 0; If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001),
    vmrg    vClFade1, vClDiffF, vOne[0]   // keep frac part of factor, else set to 1 (min val)
+    addi    inputVtxPos, rdpCmdBufEndP1, 0x30 // Temp mem; vtx load writes a few bytes here
    vmudn   vClFade2, vClFade1, $v31[1]   // signed x * -1 = 0xFFFF - unsigned x! v2[3] is fade factor for on screen vert
    lhu     $5, geometryModeLabel + 1     // Load middle 2 bytes of geom mode, incl fog setting
    // Fade between attributes for on screen and off screen vert
@@ -2779,6 +2779,7 @@ lt_continue_setup:
    lb      $11, dirLightsXfrmValid
    li      $10, -1                   // To mark lights valid
    addi    $3, $3, altBase           // Point to ambient light; stored through vtx proc
+    andi    $17, $5, G_TEXTURE_GEN >> 8 // This is clipPolyRead, but not touched in vtx_store
    and     $11, $11, $7              // Zero if either matrix or lights invalid
    bnez    $11, lt_setup_skip_xfrm
     sb     $10, dirLightsXfrmValid
@@ -2840,11 +2841,8 @@ xfrm_light_post:
     li     $20, OSTask + OSTask_ucode_data_size
 lt_setup_skip_xfrm:
    // Load first light direction to $v13, which is not used throughout vtx processing.
-    lpv     $v4[4], (ltBufOfs + 0 - lightSize)($3) // Xfrmed dir in elems 0-2
-    vlt     $v29, $v31, $v31[4] // Set VCC to 11110000
-    lpv     $v5[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6
    j       vtx_after_lt_setup
-     vmrg   $v13, $v4, $v5  // $v13 = first light direction
+     lpv    $v13[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6

 xfrm_single_dir:
    vmudn   $v29, $v16, $v3[0]
@@ -2877,77 +2875,44 @@ lt_vtx_pair:
 //
 // LVP main lighting
 //
+    vmulf   $v29, vPairNrml, $v13[4] // Normals X elems 0, 4 * first light dir
    luv     vPairLt,     (ltBufOfs + 0)($3)  // Total light level, init to ambient
-    vmulf   vAAA, $v13, vPairNrml            // First light dir * normals
+    vmacf   $v29, $v14, $v13[5] // Normals Y elems 0, 4 * first light dir
+    lpv     vDDD[0],     (ltBufOfs + 8 - 2*lightSize)($3) // Xfrmed dir in elems 4-6
+    vmacf   vAAA, $v15, $v13[6] // Normals Z elems 0, 4 * first light dir
 .if CFG_PROFILING_B
    addi    perfCounterA, perfCounterA, 2    // Increment lit vertex count by 2
 .endif
-    move    curLight, $3                     // Point to ambient light
-lt_loop:
-    // vnop; vnop
-    vmudh   $v29, vOne, vAAA[0h] // Sum components of dot product
-    lpv     vCCC[4],     (ltBufOfs + 0 - 2*lightSize)(curLight) // Xfrmed dir in elems 0-2
-    vmadh   $v29, vOne, vAAA[1h]
-    lpv     vDDD[0],     (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6
-    vmadh   vAAA, vOne, vAAA[2h]
-    beq     curLight, altBaseReg, lt_post
-     luv    vBBB,        (ltBufOfs + 0 - lightSize)(curLight) // Light color
-    vlt     $v29, $v31, $v31[4] // Set VCC to 11110000
-    addi    curLight, curLight, -lightSize
-    vmrg    vCCC, vCCC, vDDD  // vCCC = light direction
-    vge     vAAA, vAAA, $v31[2] // 0; clamp dot product to >= 0
-    // vnop; vnop
-    vmudh   $v29, vOne, vPairLt // Load accum mid with current light level
-    vmacf   vPairLt, vBBB, vAAA[0h] // + light color * dot product
-    bne     curLight, altBaseReg, lt_loop
-     vmulf  vAAA, vCCC, vPairNrml // Light dir * normals
-
-
-/*
-.if CFG_PROFILING_B
-    addi    perfCounterA, perfCounterA, 2    // Increment lit vertex count by 2
-.endif
-    vmulf   $v29, $v14, $v13[4] // Normals X elems 0, 4 * first light dir
-    luv     vPairLt,     (ltBufOfs + 0)($3)  // Total light level, init to ambient
-    vmacf   $v29, $v15, $v13[5] // Normals Y elems 0, 4 * first light dir
-    lpv     vDDD[0],     (ltBufOfs + 8 - XXX)(curLight) // Xfrmed dir in elems 4-6
-    vmacf   vAAA, vPairNrml, $v13[6] // Normals Z elems 0, 4 * first light dir
    beq     $3, altBaseReg, lt_post
-     move   curLight, $3                     // Point to ambient light
+     lpv    $v18[0],     (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut
+    move    curLight, $3                     // Point to ambient light
+    // vnop
 lt_loop:
    vge     vCCC, vAAA, $v31[2] // 0; clamp dot product to >= 0
-    vmulf   $v29, $v14, vDDD[4] // Normals X elems 0, 4
+    vmulf   $v29, vPairNrml, vDDD[4] // Normals X elems 0, 4
+    luv     vBBB,        (ltBufOfs + 0 - 1*lightSize)(curLight) // Light color
+    vmacf   $v29, $v14, vDDD[5] // Normals Y elems 0, 4
    addi    curLight, curLight, -lightSize
-    vmacf   $v29, $v15, vDDD[5] // Normals Y elems 0, 4
-    luv     vBBB,        (ltBufOfs + 0 - XXX)(curLight) // Light color
-    vmacf   vAAA, vPairNrml, vDDD[6] // Normals Z elems 0, 4
-    lpv     vDDD[0],     (ltBufOfs + 8 - XXX)(curLight) // Xfrmed dir in elems 4-6
+    vmacf   vAAA, $v15, vDDD[6] // Normals Z elems 0, 4
+    lpv     vDDD[0],     (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6
    vmudh   $v29, vOne, vPairLt // Load accum mid with current light level
    bne     curLight, altBaseReg, lt_loop
     vmacf  vPairLt, vBBB, vCCC[0h] // + light color * dot product
-*/
-
 lt_post:
-    andi    $11, $5, G_TEXTURE_GEN >> 8
    vne     $v29, $v31, $v31[3h]           // Set VCC to 11101110
-    beqz    $11, vtx_return_from_lighting
+    beqz    $17, vtx_return_from_lighting
     vmrg   vPairRGBA, vPairLt, vPairRGBA  // RGB = light, A = vtx alpha
 .endif
 // These definitions are shared by both versions
 vLookat1 equ vAAA
 vLookat0 equ vPairLt
 .if CFG_LEGACY_VTX_PIPE
-    lpv     vCCC[0],     (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, lookat 1 in 4-6
-    vlt     $v29, $v31, $v31[4] // Set VCC to 11110000
-    lpv     vLookat0[4], (xfrmLookatDirs + 0)($zero) // Garbage  in 0-2, lookat 0 in 4-6
-    lpv     vLookat1[4], (xfrmLookatDirs - 8)($zero) // Lookat 1 in 0-2, garbage  in 4-6
-    vmrg    vLookat0, vCCC, vLookat0       // Lookat 0 in 0-2, 4-6
-    vmrg    vLookat1, vLookat1, vCCC       // Lookat 1 in 0-2, 4-6
-    vmulf   vLookat0, vPairNrml, vLookat0  // Normal * lookat 0 dir
-    vmulf   vLookat1, vPairNrml, vLookat1  // Normal * lookat 1 dir
-    vmudh   $v29, vOne, vLookat0[0h]       // Sum components of dot product
-    vmadh   $v29, vOne, vLookat0[1h]
-    vmadh   vLookat0, vOne, vLookat0[2h]   // vLookat0 = dot product 0
+    vmulf   $v29, vPairNrml, $v18[0] // Normals X elems 0, 4 * lookat 0 X
+    vmacf   $v29, $v14, $v18[1]      // Normals Y elems 0, 4 * lookat 0 Y
+    vmacf   vLookat0, $v15, $v18[2]  // Normals Z elems 0, 4 * lookat 0 Z
+    vmulf   $v29, vPairNrml, $v18[4] // Normals X elems 0, 4 * lookat 1 X
+    vmacf   $v29, $v14, $v18[5]      // Normals Y elems 0, 4 * lookat 1 Y
+    vmacf   vLookat1, $v15, $v18[6]  // Normals Z elems 0, 4 * lookat 1 Z
    // Continue to rest of texgen shared by both versions.
 .endif
    
@@ -2961,10 +2926,12 @@ vLookat0 equ vPairLt
    // Outputs: leave alone vPairPosI/F; update vPairRGBA, vPairST 
    // Locals: vAAA and vBBB after merge and normals selection, vCCC, vDDD, vPairLt, vNrmOut
    // New available locals: $6 (existing: $11, $10, $20, $24)
-    vmrg    vPairNrml, vPairNrml, vDDD       // Merge normals
    beqz    $11, lt_skip_packed_normals
     // Elems 4-5 get bytes 6-7 of the following vertex (1)
     lpv    vBBB[6],      (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // Upper 2 in 4:5
+    // Elems 0-1 get bytes 6-7 of the following vertex (0)
+    lpv     vAAA[2],      (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2
+    vlt     $v29, $v31, $v31[4]              // Set VCC to 11110000
    vmrg    vAAA, vAAA, vBBB                 // Merge packed normals
    // Packed normals algorithm. This produces a vector (one for each input vertex)
    // in vPairNrml such that |X| + |Y| + |Z| = 0x7F00 (called L1 norm), in the
@@ -3149,11 +3116,11 @@ lt_skip_fresnel:
    vlt     $v29, $v31, $v31[4]            // Set VCC to 11110000
    vmrg    vLookat1, vLookat1, vDDD       // vLookat1 = lookat 1 dir
    vmulf   vLookat1, vPairNrml, vLookat1  // Normal * lookat 1 dir
-.endif
-    // Rest of texgen shared by F3DEX3 native and LVP
    vmudh   $v29, vOne, vLookat1[0h]
    vmadh   $v29, vOne, vLookat1[1h]
    vmadh   vLookat1, vOne, vLookat1[2h]
+.endif
+    // Rest of texgen shared by F3DEX3 native and LVP
    vne     $v29, $v31, $v31[1h]           // Set VCC to 10111011
    andi    $11, $5, G_TEXTURE_GEN_LINEAR >> 8
    vmrg    vLookat0, vLookat0, vLookat1[0h] // Dot products in elements 0, 1, 4, 5
--- a/notes.s
+++ b/notes.s
@@ -17,3 +17,22 @@ vmadh   vOut, vOne, vIn[2h]
 vmudh   $v29, $v31, $v31[2] // 0; clear whole accumulator
 vadd    $v29, vIn, vIn[1h] // accum lo 0 = 0 + 1, 4 = 4 + 5
 vmadn   vOut, vOne, vIn[2h] // + 2,6; built-in saturation (clamping) ends up not a problem
+
+// lpv element/byte patterns; luv follows the same patterns. These assume $11 is
+// 8-byte aligned (16-byte alignment is not required).
+//                       Elem  0  1  2  3  4  5  6  7
+lpv     $v27[1], (-8)($11) // 07 F8 F9 FA FB FC FD FE  Byte
+lpv     $v27[2], (-8)($11) // 06 07 F8 F9 FA FB FC FD  addr
+lpv     $v27[3], (-8)($11) // 05 06 07 F8 F9 FA FB FC  relative
+lpv     $v27[4], (-8)($11) // 04 05 06 07 F8 F9 FA FB  to
+lpv     $v27[5], (-8)($11) // 03 04 05 06 07 F8 F9 FA  $11
+lpv     $v27[6], (-8)($11) // 02 03 04 05 06 07 F8 F9
+lpv     $v27[7], (-8)($11) // 01 02 03 04 05 06 07 F8
+lpv     $v27[0], ( 0)($11) // 00 01 02 03 04 05 06 07
+lpv     $v27[1], ( 0)($11) // 0F 00 01 02 03 04 05 06
+lpv     $v27[2], ( 0)($11) // 0E 0F 00 01 02 03 04 05
+lpv     $v27[3], ( 0)($11) // 0D 0E 0F 00 01 02 03 04
+lpv     $v27[4], ( 0)($11) // 0C 0D 0E 0F 00 01 02 03
+lpv     $v27[5], ( 0)($11) // 0B 0C 0D 0E 0F 00 01 02
+lpv     $v27[6], ( 0)($11) // 0A 0B 0C 0D 0E 0F 00 01
+lpv     $v27[7], ( 0)($11) // 09 0A 0B 0C 0D 0E 0F 00