Middle of specular

2026-01-21 10:37:45 -08:00 · 2023-11-24 22:35:20 -08:00
parent 68f9725130
commit f247fecdbb
3 changed files with 171 additions and 134 deletions
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -588,35 +588,16 @@ vVP1F  equ $v13
 vVP2F  equ $v14
 vVP3F  equ $v15
 // Remaining regs sometimes valid in vertex and lighting, also used as temps
-vPairNrml  equ $v16 // Vertex pair normals (model then world space)
-vPairLt    equ $v17 // Vertex pair total light color/intensity (RGB-RGB-)
 vPairPosI  equ $v20 // Vertex pair model / world space position int/frac
 vPairPosF  equ $v21
 vPairST    equ $v22 // Vertex pair ST texture coordinates
 vPairTPosF equ $v23 // Vertex pair transformed (clip / screen) space position frac/int
 vPairTPosI equ $v24
-// $v25: temp
-// $v26: temp
 vPairRGBA  equ $v27 // Vertex pair color
 vOne       equ $v28 // Global, all elements = 1
 // $v29: permanent temp register, also write results here to discard
 // $v31: Only global constant vector register

-// Some extra defines for lighting:
-vPackPXY   equ $v23 // Positive X and Y in packed normals
-vPackZ     equ $v24 // Z in packed normals
-vLtRGBOut  equ $v25 // Light / effects RGB output
-vLtAOut    equ $v26 // Light / effects alpha output
-vLtColor   equ $v26 // Light color
-vLookat1   equ $v16 // Lookat direction 1
-vLookat0   equ $v17 // Lookat direction 0
-// M inverse transpose matrix in regs briefly:
-vLtMIT0I   equ $v26
-vLtMIT1I   equ $v25
-vLtMIT2I   equ $v23
-vLtMIT0F   equ $v29
-vLtMIT1F   equ $v17
-vLtMIT2F   equ $v24

 // Other vector regs defines:
 vZero equ $v0  // all elements = 0, for tri write only
@@ -1344,6 +1325,32 @@ vtx_setup_constants:
     sqv    $v25, (0x00)(rdpCmdBufEndP1)          // Store viewport scale to temp mem
    jal     while_wait_dma_busy                   // Wait for vertex load to finish
 vtx_load_loop:
+// Some extra defines for lighting. Several names below map to the same register.
+vPairNrml  equ $v16 // Vertex pair normals (model then world space)
+vPairLt    equ $v17 // Vertex pair total light color/intensity (RGB-RGB-)
+vPackPXY   equ $v23 // Positive X and Y in packed normals
+vPackZ     equ $v24 // Z in packed normals
+vLtRGBOut  equ $v25 // Light / effects RGB output
+vLtAOut    equ $v26 // Light / effects alpha output
+vLtColor   equ $v26 // Light color
+vLookat1   equ $v16 // Lookat direction 1
+vLookat0   equ $v17 // Lookat direction 0
+// M inverse transpose matrix in regs briefly:
+vLtMIT0I   equ $v26
+vLtMIT1I   equ $v25
+vLtMIT2I   equ $v23
+vLtMIT0F   equ $v29
+vLtMIT1F   equ $v17
+vLtMIT2F   equ $v24
+// Generic names used by the lighting code in place of raw register numbers:
+vNNN equ $v16 // Same register as vPairNrml / vLookat1
+vLLL equ $v17 // Same register as vPairLt / vLookat0 / vLtMIT1F
+vSSS equ $v18 // Specular direction (multiplied with vLtDir in the specular path)
+vLtDir equ $v19 // Light direction; also the output of lt_normalize
+vAAA equ $v23 // Same register as vPackPXY / vLtMIT2I; lt_normalize input int part
+vBBB equ $v24 // Same register as vPackZ / vLtMIT2F; lt_normalize input frac part
+vCCC equ $v25 // Same register as vLtRGBOut / vLtMIT1I
+vDDD equ $v26 // Same register as vLtAOut / vLtColor / vLtMIT0I
     vlt    $v29, $v31, $v31[4]                   // Set VCC to 11110000
    ldv     vPairPosI[8],      (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
    ldv     vPairPosI[0],      (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos)
@@ -1355,11 +1362,11 @@ vtx_load_loop:
    // 0-3 get bytes C-F of the vertex (which is what we want).
    luv     vPairRGBA[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Colors as unsigned, lower 4
    vmadh   $v29, vM3I, vOne
-    luv     $v25[0],      (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
+    luv     vCCC[0],      (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
    vmadn   $v29, vM0F, vPairPosI[0h]
-    lpv     $v16[4],      (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4
+    lpv     vNNN[4],      (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4
    vmadh   $v29, vM0I, vPairPosI[0h]
-    lpv     $v26[0],      (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
+    lpv     vDDD[0],      (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
    vmadn   $v29, vM1F, vPairPosI[1h]
    llv     vPairST[0],   (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1
    vmadh   $v29, vM1I, vPairPosI[1h]
@@ -1368,11 +1375,11 @@ vtx_load_loop:
    andi    $11, $5, G_LIGHTING >> 8
    vmadh   vPairPosI, vM2I, vPairPosI[2h] // vPairPosI/F = vertices world coords
    // Elems 0-1 get bytes 6-7 of the following vertex (0)
-    lpv     $v17[2],      (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2
-    vmrg    vPairRGBA, vPairRGBA, $v25 // Merge colors
+    lpv     vLLL[2],      (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2
+    vmrg    vPairRGBA, vPairRGBA, vCCC // Merge colors
    bnez    $11, ovl234_lighting_entrypoint
     // Elems 4-5 get bytes 6-7 of the following vertex (1)
-     lpv    $v25[6],      (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // Upper 2 in 4:5
+     lpv    vCCC[6],      (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // Upper 2 in 4:5
 vtx_return_from_lighting:
    vclr    $v26
    andi    $11, $5, G_ATTROFFSET_ST_ENABLE >> 8
@@ -2175,7 +2182,7 @@ ovl1_padded_end:

 ovl2_start:
 ovl234_lighting_entrypoint:
-    vmrg    vPairNrml, $v16, $v26          // Merge normals
+    vmrg    vPairNrml, vNNN, vDDD          // Merge normals
    j       lt_continue_setup
     andi   $11, $5, G_PACKED_NORMALS >> 8

@@ -2192,28 +2199,28 @@ ovl234_clipping_entrypoint_ovl2ver:  // same IMEM address as ovl234_clipping_ent

 lt_continue_setup:
    // Inputs: vPairPosI/F vertices pos world int:frac, vPairRGBA, vPairST,
-    // $v16 vPairNrml, $v17:$v25 (to be merged) packed normals
+    // vNNN vPairNrml, vLLL:vCCC (to be merged) packed normals
    // Outputs: vPairRGBA, vPairST, must leave alone vPairPosI/F
-    // Locals: $v29 temp, $v23 (will be vPairTPosF), $v24 (will be vPairTPosI),
-    // $v25 (after merge), $v26, whichever of $v16 or $v17 is unused
+    // Locals: $v29 temp, vAAA (will be vPairTPosF), vBBB (will be vPairTPosI),
+    // vCCC (after merge), vDDD, whichever of vNNN or vLLL is unused
    // New available locals: $6, $7 (existing: $11, $12, $20, $24)
    beqz    $11, lt_skip_packed_normals
-     vmrg   $v17, $v17, $v25          // Merge packed normals
+     vmrg   vLLL, vLLL, vCCC          // Merge packed normals
    // Packed normals algorithm. This produces a vector (one for each input vertex)
    // in vPairNrml such that |X| + |Y| + |Z| = 0x7F00 (called L1 norm), in the
    // same direction as the standard normal vector. The length is not "correct"
    // compared to the standard normal, but it's is normalized anyway after the M
    // matrix transform.
-    vand    vPackPXY, $v17, $v31[6]       // 0x7F00; positive X, Y
+    vand    vPackPXY, vLLL, $v31[6]       // 0x7F00; positive X, Y
    vclr    $v29                         // Zero
    vaddc   vPackZ, vPackPXY, vPackPXY[1q]    // elem 0, 4: pos X + pos Y, no clamping
-    vadd    $v26, $v29, $v29             // Save carry bit, indicates use 0x7F00 - x and y
+    vadd    vDDD, $v29, $v29             // Save carry bit, indicates use 0x7F00 - x and y
    vxor    vPairNrml, vPackPXY, $v31[6]   // 0x7F00 - x, 0x7F00 - y
    vxor    vPackZ, vPackZ, $v31[6]            // 0x7F00 - +X - +Y in elems 0, 4
-    vne     $v29, $v29, $v26[0h]         // set 0-3, 4-7 vcc if (+X + +Y) overflowed, discard result
+    vne     $v29, $v29, vDDD[0h]         // set 0-3, 4-7 vcc if (+X + +Y) overflowed, discard result
    vmrg    vPairNrml, vPairNrml, vPackPXY  // If so, use 0x7F00 - +X, else +X (same for Y)
    vne     $v29, $v31, $v31[2h]         // set VCC to 11011101
-    vabs    vPairNrml, $v17, vPairNrml     // Apply sign of original X and Y to new X and Y
+    vabs    vPairNrml, vLLL, vPairNrml     // Apply sign of original X and Y to new X and Y
    vmrg    vPairNrml, vPairNrml, vPackZ[0h]  // Move Z to elements 2, 6
 lt_skip_packed_normals:
    // Transform normals by M, in case normalsMode = G_NORMALSMODE_FAST.
@@ -2222,12 +2229,14 @@ lt_skip_packed_normals:
    vmudn   $v29, vM0F, vPairNrml[0h]
    lbu     $11, normalsMode($zero)
    vmadh   $v29, vM0I, vPairNrml[0h]
+    andi    $6, $5, G_SHADING_SPECULAR >> 8
    vmadn   $v29, vM1F, vPairNrml[1h]
    addi    curLight, curLight, altBase // Point to ambient light
    vmadh   $v29, vM1I, vPairNrml[1h]
-    vmadn   $v24, vM2F, vPairNrml[2h] // $v24 = normals frac
+    andi    $12, $5, (G_SHADING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
+    vmadn   vBBB, vM2F, vPairNrml[2h] // vBBB = normals frac
    beqz    $11, lt_after_xfrm_normals // Skip if G_NORMALSMODE_FAST
-     vmadh  $v23, vM2I, vPairNrml[2h] // $v23 = normals int
+     vmadh  vAAA, vM2I, vPairNrml[2h] // vAAA = normals int
    // Transform normals by M inverse transpose, for G_NORMALSMODE_AUTO or G_NORMALSMODE_MANUAL
    lqv     vLtMIT0I,    (mITMatrix + 0x00)($zero) // x int, y int
    lqv     vLtMIT2I,    (mITMatrix + 0x10)($zero) // z int, x frac
@@ -2247,41 +2256,71 @@ lt_skip_packed_normals:
    vmadh   $v29, vLtMIT0I, vPairNrml[0h]
    vmadn   $v29, vLtMIT1F, vPairNrml[1h]
    vmadh   $v29, vLtMIT1I, vPairNrml[1h]
-    vmadn   $v24, vLtMIT2F, vPairNrml[2h] // vLtMIT2F = $v24 = normals frac
-    vmadh   $v23, vLtMIT2I, vPairNrml[2h] // vLtMIT2I = $v23 = normals int
+    vmadn   vBBB, vLtMIT2F, vPairNrml[2h] // vLtMIT2F = vBBB = normals frac
+    vmadh   vAAA, vLtMIT2I, vPairNrml[2h] // vLtMIT2I = vAAA = normals int
 lt_after_xfrm_normals:
-    // Normalize normals; in $v23:$v24 i/f, out $v23
+    // Normalize normals; in vAAA:vBBB i/f, out vLtDir
    jal     lt_normalize
     luv    vPairLt, (ltBufOfs + 0)(curLight) // Total light level, init to ambient
    // Set up ambient occlusion: light *= (factor * (alpha - 1) + 1)
    vmudh   $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
-    vmadm   $v25, vPairRGBA, $v30[0] // + (alpha - 1) * aoAmb factor; elems 3, 7
-    vcopy   vPairNrml, $v23
-    vmulf   vPairLt, vPairLt, $v25[3h] // light color *= ambient factor
+    vmadm   vCCC, vPairRGBA, $v30[0] // + (alpha - 1) * aoAmb factor; elems 3, 7
+    vcopy   vPairNrml, vLtDir
+    beqz    $12, lt_loop
+     vmulf  vPairLt, vPairLt, vCCC[3h] // light color *= ambient factor
+    // Get (vPairNrml dot (camera - vertex))
+    ldv     vAAA[0], (cameraWorldPos - altBase)(altBaseReg) // Camera world pos
+    j       lt_normal_to_vertex
+     ldv    vAAA[8], (cameraWorldPos - altBase)(altBaseReg)
+lt_after_camera: // output in vAAA
+    // TODO
 lt_loop:
-    // vPairPosI/F, vPairST, $v23 light pos/dir (then local), $v24 $v25 locals,
+    // vPairPosI/F, vPairST, vAAA light pos/dir (then local), vBBB vCCC locals,
    // vLtColor, vPairRGBA, vPairNrml, $v29 temp, vPairLt
-    lpv     $v23[0], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 0-2
+    lpv     vAAA[0], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 0-2
    vlt     $v29, $v31, $v31[4] // Set VCC to 11110000
-    lpv     $v25[4], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 4-6
+    lpv     vCCC[4], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 4-6
    lbu     $11,     (ltBufOfs + 3 - lightSize)(curLight) // Light type / constant attenuation
    beq     curLight, altBaseReg, lt_post
-     vmrg   $v23, $v23, $v25                              // $v23 = light direction
+     vmrg   vLtDir, vAAA, vCCC                            // vLtDir = light direction
    bnez    $11, lt_point
     luv    vLtColor,    (ltBufOfs + 0 - lightSize)(curLight) // Light color
-    vmulf   $v23, $v23, vPairNrml // Light dir * normalized normals
+    vmulf   vAAA, vLtDir, vPairNrml // Light dir * normalized normals
    vmudh   $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
-    vmadm   $v24, vPairRGBA, $v30[1] // + (alpha - 1) * aoDir factor; elems 3, 7
-    vmudh   $v29, vOne, $v23[0h] // Sum components of dot product as signed
-    vmadh   $v29, vOne, $v23[1h]
-    vmadh   $v23, vOne, $v23[2h]
-    vmulf   vLtColor, vLtColor, $v24[3h] // light color *= ambient or point light factor
-    vge     $v23, $v23, $v31[2] // 0; clamp dot product to >= 0
+    vmadm   vCCC, vPairRGBA, $v30[1] // + (alpha - 1) * aoDir factor; elems 3, 7
+    vcopy   vBBB, vOne // Directional light dot scaling = 0001.0001, approx == 1.0
+    vmudh   $v29, vOne, vAAA[0h] // Sum components of dot product as signed
+    vmadh   $v29, vOne, vAAA[1h]
+    vmadh   vAAA, vOne, vAAA[2h]
 lt_finish_light:
+    // vAAA is unclamped dot product, vBBB is point light scaling on dot product,
+    // vCCC is amb occ factor, vLtColor = vDDD is light color
+    vge     vAAA, vAAA, $v31[2] // 0; clamp dot product to >= 0
+    vmulf   vLtColor, vLtColor, vCCC[3h] // light color *= dir or point light factor
+    vmudm   $v29, vAAA, vBBB[2h] // Dot product int * rcp frac
+    vmadh   vAAA, vAAA, vBBB[3h] // Dot product int * rcp int, clamp to 0x7FFF
    addi    curLight, curLight, -lightSize
    vmudh   $v29, vOne, vPairLt // Load accum mid with current light level
+    beqz    $6, lt_loop
+     vmacf  vPairLt, vLtColor, vAAA[0h] // + light color * dot product
+    // Specular: same accumulation as diffuse above, but using the dot product of the light dir with vSSS
+    vmulf   vAAA, vLtDir, vSSS // Light dir * specular direction
+    lb      $20, (ltBufOfs + 0xF)(curLight) // Light size factor (signed byte); curLight already decremented above, so no "- lightSize"
+    vmudh   $v29, vOne, vAAA[0h] // Sum components of dot product as signed
+    vmadh   $v29, vOne, vAAA[1h]
+    addi    $20, $20, 2 // Default of 0 -> 2
+    vmadh   vAAA, vOne, vAAA[2h]
+    mtc2    $20, vCCC[0]        // Light size factor into elem 0 (overwrites that lane of vCCC)
+    vxor    vAAA, vAAA, $v31[7] // = 0x7FFF - dot product
+    vmudh   vAAA, vAAA, vCCC[0] // * size factor
+    vxor    vAAA, vAAA, $v31[7] // = 0x7FFF - (0x7FFF - dot product) * size factor
+    vge     vAAA, vAAA, $v31[2] // 0; clamp the rescaled dot product to >= 0
+    vmudm   $v29, vAAA, vBBB[2h] // Dot product int * rcp frac
+    vmadh   vAAA, vAAA, vBBB[3h] // Dot product int * rcp int, clamp to 0x7FFF
+    vmudh   $v29, vOne, vPairLt // Load accum mid with current light level
    j       lt_loop
-     vmacf  vPairLt, vLtColor, $v23[0h] // + light color * dot product
+     vmacf  vPairLt, vLtColor, vAAA[0h] // + light color * dot product
+    

 lt_post:
    vadd    vPairRGBA, vPairRGBA, $v31[7] // 0x7FFF; undo change for ambient occlusion
@@ -2302,38 +2341,36 @@ lt_skip_cel:
     andi   $24, $5, G_FRESNEL_ALPHA >> 8
    vcopy   vLtRGBOut, vPairLt                // If no packed normals, base output is just light
 lt_skip_novtxcolor:
-    vmulf   vLookat0, vPairNrml, $v23        // Normal * lookat 0 dir; vLookat0 = $v17 = vPairLt
+    vmulf   vLookat0, vPairNrml, vLtDir      // Normal * lookat 0 dir (lt_loop merges it into vLtDir, not vAAA); vLookat0 = vLLL = vPairLt
    beqz    $24, lt_skip_fresnel
     vmrg   vPairRGBA, vLtRGBOut, vLtAOut       // Merge base output and alpha output
-    // Fresnel: call point lighting; camera pos in $v23
-    ldv     $v23[0], (cameraWorldPos - altBase)(altBaseReg) // Camera world pos
-    j       lt_normal_to_vertex
-     ldv    $v23[8], (cameraWorldPos - altBase)(altBaseReg)
-lt_finish_fresnel: // output in $v23
-    vabs    $v23, $v23, $v23            // Absolute value
-    vmudn   $v26, $v31, $v30[7]         // TODO this is wrong, 0x0100 is not available Elem 4 = low part of 0x0100 * scale
-    vmadh   $v25, $v31, $v31[2]         // + 0; elem 4 = high part of 0x0100 * scale
-    vsub    $v23, $v23, $v30[6]         // Subtract offset
-    vmudl   $v29, $v23, $v26[4]         // Unsigned Fresnel value * low part shifted scale
-    vmadn   $v23, $v23, $v25[4]         // Alpha = unsigned Fresnel value * high part
-    vmrg    vPairRGBA, vPairRGBA, $v23  // Merge base output and alpha output
+    // Fresnel: call point lighting; camera pos in vAAA
+    /*
+    vabs    vAAA, vAAA, vAAA            // Absolute value
+    vmudn   vDDD, $v31, $v30[7]         // TODO this is wrong, 0x0100 is not available Elem 4 = low part of 0x0100 * scale
+    vmadh   vCCC, $v31, $v31[2]         // + 0; elem 4 = high part of 0x0100 * scale
+    vsub    vAAA, vAAA, $v30[6]         // Subtract offset
+    vmudl   $v29, vAAA, vDDD[4]         // Unsigned Fresnel value * low part shifted scale
+    vmadn   vAAA, vAAA, vCCC[4]         // Alpha = unsigned Fresnel value * high part
+    vmrg    vPairRGBA, vPairRGBA, vAAA  // Merge base output and alpha output
+    */
 lt_skip_fresnel:
    beqz    $12, vtx_return_from_lighting // no texgen
-    // Texgen: vLookat0, vLookat1, locals $v25, $v26, $v23
+    // Texgen: vLookat0, vLookat1, locals vCCC, vDDD, vAAA
    // Output: vPairST; have to leave vPairPosI/F, vPairRGBA
     vmudh  $v29, vOne, vLookat0[0h]
    lpv     vLookat1[4], (ltBufOfs + 0 - lightSize)(curLight) // Lookat 1 dir in elems 0-2
    vmadh   $v29, vOne, vLookat0[1h]
-    lpv     $v26[0], (ltBufOfs + 8 - lightSize)(curLight) // Lookat 1 dir in elems 4-6
+    lpv     vDDD[0], (ltBufOfs + 8 - lightSize)(curLight) // Lookat 1 dir in elems 4-6
    vmadh   vLookat0, vOne, vLookat0[2h]      // vLookat0 = dot product 0
    vlt     $v29, $v31, $v31[4]         // Set VCC to 11110000
-    vmrg    vLookat1, vLookat1, $v26            // vLookat1 = lookat 1 dir
+    vmrg    vLookat1, vLookat1, vDDD            // vLookat1 = lookat 1 dir
    vmulf   vLookat1, vPairNrml, vLookat1        // Normal * lookat 1 dir
    vmudh   $v29, vOne, vLookat1[0h]
    vmadh   $v29, vOne, vLookat1[1h]
    vmadh   vLookat1, vOne, vLookat1[2h]      // vLookat1 = dot product 1
    vne     $v29, $v31, $v31[1h] // Set VCC to 10111011
-    llv     $v23[0], (texgenLinearCoeffs - altBase)(altBaseReg)
+    llv     vAAA[0], (texgenLinearCoeffs - altBase)(altBaseReg)
    vmrg    vLookat0, vLookat0, vLookat1[0h]  // Dot products in elements 0, 1, 4, 5
    andi    $11, $5, G_TEXTURE_GEN_LINEAR >> 8
    vmudh   $v29, vOne, $v31[5]  // 1 * 0x4000
@@ -2341,13 +2378,13 @@ lt_skip_fresnel:
     vmacf  vPairST, vLookat0, $v31[5] // + dot products * 0x4000 ( / 2)
    // Texgen_Linear:
    vmulf   vPairST, vLookat0, $v31[5] // dot products * 0x4000 ( / 2)
-    vmulf   $v26, vPairST, vPairST // ST squared
-    vmulf   $v25, vPairST, $v31[7] // Move ST to accumulator (0x7FFF = 1)
-    vmacf   $v25, vPairST, $v23[1] // + ST * 0x6CB3
+    vmulf   vDDD, vPairST, vPairST // ST squared
+    vmulf   vCCC, vPairST, $v31[7] // Move ST to accumulator (0x7FFF = 1)
+    vmacf   vCCC, vPairST, vAAA[1] // + ST * 0x6CB3
    vmudh   $v29, vOne, $v31[5] // 1 * 0x4000
-    vmacf   vPairST, vPairST, $v23[0] // + ST * 0x44D3
+    vmacf   vPairST, vPairST, vAAA[0] // + ST * 0x44D3
    j       vtx_return_from_lighting
-     vmacf  vPairST, $v26, $v25 // + ST squared * (ST + ST * coeff)
+     vmacf  vPairST, vDDD, vCCC // + ST squared * (ST + ST * coeff)
     
 lt_point:
    /*
@@ -2358,84 +2395,84 @@ lt_point:
    Input vector 1 elem size 0010.0000 -> len^2 00000100 -> 1/len 07FF.FC00 -> vec  7FFF.C000
    Input vector 1 elem size 0001.0000 -> len^2 00000001 -> 1/len 7FFF.C000 -> vec  7FFF.C000
    */
-    ldv     $v23[0], (ltBufOfs + 8 - lightSize)(curLight) // Light position int part 0-3
-    ldv     $v23[8], (ltBufOfs + 8 - lightSize)(curLight) // 4-7
+    ldv     vAAA[0], (ltBufOfs + 8 - lightSize)(curLight) // Light position int part 0-3
+    ldv     vAAA[8], (ltBufOfs + 8 - lightSize)(curLight) // 4-7
 lt_normal_to_vertex:
    // This reused for fresnel; scalar unit stuff all garbage in that case
-    // Input point (light / camera) in $v23; computes $v23 = (vPairNrml dot (input - vertex))
-    // Uses temps $v24, $v25, $v26, $v29
-    vclr    $v24                         // Zero light pos frac part
-    vsubc   $v24, $v24, vPairPosF             // Vector from vertex to light, frac
-    lbu     $20,     (ltBufOfs + 7 - lightSize)(curLight) // Linear factor
-    vsub    $v23, $v23, vPairPosI             // Int
-    jal     lt_normalize
-     lbu    $24,     (ltBufOfs + 0xE - lightSize)(curLight) // Quadratic factor
-    // $v23 = normalized vector from vertex to light, $v29[0h:1h] = 1/len, $v25 = len^2
-    vmudm   $v24, $v25, $v29[1h] // len^2 int * 1/len frac
-    vmadn   $v24, $v26, $v29[0h] // len^2 frac * 1/len int = len frac
-    mtc2    $20, vPairLt[14] // Quadratic int part in elem 7
-    vmadh   $v29, $v25, $v29[0h] // len^2 int * 1/len int = len int
-    vmulf   $v23, $v23, vPairNrml // Normalized light dir * normalized normals
-    vmudl   $v24, $v24, vPairNrml[7]   //   len frac * linear factor frac
-    vmadm   $v24, $v29, vPairNrml[7]   // + len int * linear factor frac
-    vmadm   $v24, vOne, vPairNrml[3] // + 1 * constant factor frac
-    vmadl   $v24, $v26, vPairLt[3]     // + len^2 frac * quadratic factor frac
-    vmadm   $v24, $v25, vPairLt[3]     // + len^2 int * quadratic factor frac
-    vmadn   $v29, $v26, vPairLt[7]     // + len^2 frac * quadratic factor int
-    vmadh   $v25, $v25, vPairLt[7]     // + len^2 int * quadratic factor int
-    luv     vLtColor,    (ltBufOfs + 0 - lightSize)(curLight) // vLtColor = $v26
-    vmudh   $v24, vOne, $v23[0h] // Sum components of dot product as signed
-    vmadh   $v24, vOne, $v23[1h]
-    beq     curLight, altBaseReg, lt_finish_fresnel // If finished light loop, is fresnel
-     vmadh  $v23, vOne, $v23[2h]
-    vrcph   $v24[1], $v25[0] // 1/(2*light factor), input of 0000.8000 -> no change normals
-    vrcpl   $v24[2], $v29[0] // Light factor 0001.0000 -> normals /= 2
-    vrcph   $v24[3], $v25[4] // Light factor 0000.1000 -> normals *= 8 (with clamping)
-    vrcpl   $v24[6], $v29[4] // Light factor 0010.0000 -> normals /= 32
-    vrcph   $v24[7], $v31[2] // 0
-    vge     $v23, $v23, $v31[2] // 0; clamp dot product to >= 0
-    vmudm   $v29, $v23, $v24[2h] // Dot product int * rcp frac
+    // Input point (light / camera) in vAAA; computes vAAA = (vPairNrml dot (input - vertex))
+    // Uses temps vBBB, vCCC, vDDD, $v29
+    vclr    vBBB                       // Both: Zero input frac part
+    vsubc   vBBB, vBBB, vPairPosF      // Both: Vector from vertex to input, frac
+    lbu     $20,     (ltBufOfs + 7 - lightSize)(curLight) // PL: Linear factor
+    vsub    vAAA, vAAA, vPairPosI      // Both: Int
+    jal     lt_normalize               // Both: Input vAAA:vBBB; output vLtDir
+     lbu    $24,     (ltBufOfs + 0xE - lightSize)(curLight) // PL: Quadratic factor
+    // vLtDir = normalized vector from vertex to light, $v29[0h:1h] = 1/len, vCCC = len^2
+    vmudm   vBBB, vCCC, $v29[1h]       // PL: len^2 int * 1/len frac
+    vmadn   vBBB, vDDD, $v29[0h]       // PL: len^2 frac * 1/len int = len frac
+    mtc2    $20, vPairLt[14]           // PL: Quadratic int part in elem 7
+    vmadh   $v29, vCCC, $v29[0h]       // PL: len^2 int * 1/len int = len int
+    vmulf   vAAA, vLtDir, vPairNrml    // Both: Normalized light dir * normalized normals
+    vmudl   vBBB, vBBB, vPairNrml[7]   // PL:   len frac * linear factor frac
+    vmadm   vBBB, $v29, vPairNrml[7]   // PL: + len int * linear factor frac
+    vmadm   vBBB, vOne, vPairNrml[3]   // PL: + 1 * constant factor frac
+    vmadl   vBBB, vDDD, vPairLt[3]     // PL: + len^2 frac * quadratic factor frac
+    vmadm   vBBB, vCCC, vPairLt[3]     // PL: + len^2 int * quadratic factor frac
+    vmadn   $v29, vDDD, vPairLt[7]     // PL: + len^2 frac * quadratic factor int = $v29 frac
+    vmadh   vCCC, vCCC, vPairLt[7]     // PL: + len^2 int * quadratic factor int  = vCCC int
+    luv     vLtColor,    (ltBufOfs + 0 - lightSize)(curLight) // PL: vLtColor = vDDD
+    vmudh   vBBB, vOne, vAAA[0h]       // Both: Sum components of dot product as signed
+    vmadh   vBBB, vOne, vAAA[1h]       // Both:
+    beq     $zero, $zero, lt_after_camera // TODO TODO: always taken, so the point light code after the delay slot is currently unreachable
+     vmadh  vAAA, vOne, vAAA[2h]       // Both: vAAA dot product
+    vrcph   vBBB[1], vCCC[0]     // 1/(2*light factor), input of 0000.8000 -> no change normals
+    vrcpl   vBBB[2], $v29[0]     // Light factor 0001.0000 -> normals /= 2
+    vrcph   vBBB[3], vCCC[4]     // Light factor 0000.1000 -> normals *= 8 (with clamping)
+    vrcpl   vBBB[6], $v29[4]     // Light factor 0010.0000 -> normals /= 32
+    vrcph   vBBB[7], $v31[2]     // 0
+    vmudh   $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
    j       lt_finish_light
-     vmadh  $v23, $v23, $v24[3h] // Dot product int * rcp int, clamp to 0x7FFF
+     vmadm  vCCC, vPairRGBA, $v30[2] // + (alpha - 1) * aoPoint factor; elems 3, 7

 lt_normalize:
-    // Normalize vector in $v23:$v24 i/f, output in $v23. Also continue point
-    // light scalar unit stuff. Uses temps $v25, $v26, $v29, also $11, $20, $24
+    // Normalize vector in vAAA:vBBB i/f, output in vLtDir. Secondary outputs for
+    // point lighting in $v29[0h:1h] and vCCC. Also uses temps vDDD, $11, $20, $24
+    // Doing point light scalar stuff too.
    // Also overwrites vPairNrml and vPairLt elems 3, 7
-    vmudm   $v29, $v23, $v24             // Squared. Don't care about frac*frac term
+    vmudm   $v29, vAAA, vBBB             // Squared. Don't care about frac*frac term
    sll     $11, $11, 8                  // Constant factor, 00000100 - 0000FF00
-    vmadn   $v29, $v24, $v23
+    vmadn   $v29, vBBB, vAAA
    sll     $20, $20, 6                  // Linear factor, 00000040 - 00003FC0
-    vmadh   $v29, $v23, $v23
-    vreadacc $v26, ACC_MIDDLE
-    vreadacc $v25, ACC_UPPER
+    vmadh   $v29, vAAA, vAAA
+    vreadacc vDDD, ACC_MIDDLE
+    vreadacc vCCC, ACC_UPPER
    mtc2    $11, vPairNrml[6] // Constant frac part in elem 3
-    vmudm   $v29, vOne, $v26[2h] // Sum of squared components
-    vmadh   $v29, vOne, $v25[2h]
+    vmudm   $v29, vOne, vDDD[2h] // Sum of squared components
+    vmadh   $v29, vOne, vCCC[2h]
    srl     $11, $24, 5 // Top 3 bits
-    vmadm   $v29, vOne, $v26[1h]
+    vmadm   $v29, vOne, vDDD[1h]
    mtc2    $20, vPairNrml[14] // Linear frac part in elem 7
-    vmadh   $v29, vOne, $v25[1h]
+    vmadh   $v29, vOne, vCCC[1h]
    andi    $20, $24, 0x1F // Bottom 5 bits
-    vmadn   $v26, $v26, vOne // elem 0; swapped so we can do vmadn and get result
+    vmadn   vDDD, vDDD, vOne // elem 0; swapped so we can do vmadn and get result
    ori     $20, $20, 0x20 // Append leading 1 to mantissa
-    vmadh   $v25, $v25, vOne
+    vmadh   vCCC, vCCC, vOne
    sllv    $20, $20, $11 // Left shift to create floating point
-    vrsqh   $v29[2], $v25[0] // High input, garbage output
+    vrsqh   $v29[2], vCCC[0] // High input, garbage output
    sll     $20, $20, 8 // Min range 00002000, 00002100... 00003F00, max 00100000...001F8000
-    vrsql   $v29[1], $v26[0] // Low input, low output
+    vrsql   $v29[1], vDDD[0] // Low input, low output
    bnez    $24, @@skip // If original value is zero, set to zero
-     vrsqh  $v29[0], $v25[4] // High input, high output
+     vrsqh  $v29[0], vCCC[4] // High input, high output
    li      $20, 0
@@skip:
-    vrsql   $v29[5], $v26[4] // Low input, low output
+    vrsql   $v29[5], vDDD[4] // Low input, low output
    vrsqh   $v29[4], $v31[2] // 0 input, high output
    mtc2    $20, vPairLt[6] // Quadratic frac part in elem 3
-    vmudn   $v24, $v24, $v29[0h] // Vec frac * int scaling, discard result
+    vmudn   vBBB, vBBB, $v29[0h] // Vec frac * int scaling, discard result
    srl     $20, $20, 16
-    vmadm   $v24, $v23, $v29[1h] // Vec int * frac scaling, discard result
+    vmadm   vBBB, vAAA, $v29[1h] // Vec int * frac scaling, discard result
    jr      $ra
-     vmadh  $v23, $v23, $v29[0h] // Vec int * int scaling
+     vmadh  vLtDir, vAAA, $v29[0h] // Vec int * int scaling

 ovl2_end:
 .align 8
--- a/gbi.h
+++ b/gbi.h
@@ -129,7 +129,7 @@
 #define G_CULL_BOTH             0x00000600  /* useless but supported */
 #define G_PACKED_NORMALS        0x00000800
 #define G_LIGHTTOALPHA          0x00001000
-#define G_SHADING_SPECULAR     0x00002000
+#define G_SHADING_SPECULAR      0x00002000
 #define G_FRESNEL_COLOR         0x00004000
 #define G_FRESNEL_ALPHA         0x00008000
 #define G_FOG                   0x00010000
--- a/rsp/gbi.inc
+++ b/rsp/gbi.inc
@@ -17,7 +17,7 @@ G_CULL_BACK             equ 0x00000400
 G_CULL_BOTH             equ 0x00000600
 G_PACKED_NORMALS        equ 0x00000800
 G_LIGHTTOALPHA          equ 0x00001000
-G_LIGHTING_SPECULAR     equ 0x00002000
+G_SHADING_SPECULAR      equ 0x00002000
 G_FRESNEL_COLOR         equ 0x00004000
 G_FRESNEL_ALPHA         equ 0x00008000
 G_FOG                   equ 0x00010000