diff --git a/f3dex3.s b/f3dex3.s
index e7cc640..ab53503 100644
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -2878,16 +2878,12 @@ lt_vtx_pair:
 .endif
     move    curLight, $3                     // Point to ambient light
 lt_loop:
-    /* TODO try this, this saves one vnop cycle before
     vmudh   $v29, $v31, $v31[2] // 0; clear whole accumulator
-    vadd    $v29, vAAA, vAAA[1h] // accum lo 0 = 0 + 1, 4 = 4 + 5
-    vmadn   vAAA, vOne, vAAA[2h] // + 2,6; built-in saturation (clamping) ends up not a problem
-    */
-    vmudh   $v29, vOne, vAAA[0h] // Sum components of dot product as signed
     lpv     vCCC[4],     (ltBufOfs + 0 - 2*lightSize)(curLight) // Xfrmed dir in elems 0-2
-    vmadh   $v29, vOne, vAAA[1h]
+    // vnop
+    vadd    $v29, vAAA, vAAA[1h] // accum lo elem 0 = vAAA[0] + vAAA[1], elem 4 = vAAA[4] + vAAA[5]
     lpv     vDDD[0],     (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6
-    vmadh   vAAA, vOne, vAAA[2h]
+    vmadn   vAAA, vOne, vAAA[2h] // + vAAA[2] (elem 0), + vAAA[6] (elem 4); built-in saturation (clamping) ends up not a problem
     beq     curLight, altBaseReg, lt_post
      luv    vBBB,        (ltBufOfs + 0 - lightSize)(curLight) // Light color
     vlt     $v29, $v31, $v31[4] // Set VCC to 11110000
@@ -3022,13 +3018,13 @@ lt_loop:
      vmrg   vAAA, vAAA, vCCC                            // vAAA = light direction
     bnez    $11, lt_point
      luv    vDDD,    (ltBufOfs + 0 - lightSize)(curLight) // Light color
+    vcopy   vBBB, vOne // Directional light dot scaling = 0001.0001, approx == 1.0
     vmulf   vAAA, vAAA, vPairNrml // Light dir * normalized normals
     vmudh   $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
     vmadm   vCCC, vPairRGBA, $v30[1] // + (alpha - 1) * aoDir factor; elems 3, 7
-    vcopy   vBBB, vOne // Directional light dot scaling = 0001.0001, approx == 1.0
-    vmudh   $v29, vOne, vAAA[0h] // Sum components of dot product as signed
-    vmadh   $v29, vOne, vAAA[1h]
-    vmadh   vAAA, vOne, vAAA[2h]
+    vmudh   $v29, $v31, $v31[2]  // 0; clear whole accumulator before summing the dot product components
+    vadd    $v29, vAAA, vAAA[1h] // accum lo elem 0 = vAAA[0] + vAAA[1], elem 4 = vAAA[4] + vAAA[5]
+    vmadn   vAAA, vOne, vAAA[2h] // + vAAA[2] (elem 0), + vAAA[6] (elem 4); built-in saturation (clamping) ends up not a problem
 lt_finish_light:
     // vAAA is unclamped dot product, vBBB[2h:3h] is point light scaling on dot product,
     // vCCC is amb occ factor, vDDD is light color
@@ -3094,17 +3090,17 @@ lt_skip_novtxcolor:
 lt_skip_fresnel:
     beqz    $10, vtx_return_from_lighting  // no texgen
     // Texgen: vLookat0, vPairNrml, have to leave vPairPosI/F, vPairRGBA; output vPairST
-     vmudh  $v29, vOne, vLookat0[0h]
+     vmudh  $v29, $v31, $v31[2]            // 0; clear whole accumulator before summing dot product 0
     lpv     vLookat1[4], (ltBufOfs + 0 - lightSize)(curLight) // Lookat 1 dir in elems 0-2
-    vmadh   $v29, vOne, vLookat0[1h]
+    vadd    $v29, vLookat0, vLookat0[1h]   // accum lo elem 0 = vLookat0[0] + vLookat0[1], elem 4 = vLookat0[4] + vLookat0[5]
     lpv     vDDD[0],     (ltBufOfs + 8 - lightSize)(curLight) // Lookat 1 dir in elems 4-6
-    vmadh   vLookat0, vOne, vLookat0[2h]   // vLookat0 = dot product 0
+    vmadn   vLookat0, vOne, vLookat0[2h]   // + vLookat0[2] (elem 0), + vLookat0[6] (elem 4); vLookat0 = dot product 0
     vlt     $v29, $v31, $v31[4]            // Set VCC to 11110000
     vmrg    vLookat1, vLookat1, vDDD       // vLookat1 = lookat 1 dir
     vmulf   vLookat1, vPairNrml, vLookat1  // Normal * lookat 1 dir
-    vmudh   $v29, vOne, vLookat1[0h]
-    vmadh   $v29, vOne, vLookat1[1h]
-    vmadh   vLookat1, vOne, vLookat1[2h]   // vLookat1 = dot product 1
+    vmudh   $v29, $v31, $v31[2]            // 0; clear whole accumulator before summing dot product 1
+    vadd    $v29, vLookat1, vLookat1[1h]   // accum lo elem 0 = vLookat1[0] + vLookat1[1], elem 4 = vLookat1[4] + vLookat1[5]
+    vmadn   vLookat1, vOne, vLookat1[2h]   // + vLookat1[2] (elem 0), + vLookat1[6] (elem 4); vLookat1 = dot product 1
     vne     $v29, $v31, $v31[1h]           // Set VCC to 10111011
     andi    $11, $5, G_TEXTURE_GEN_LINEAR >> 8
     vmrg    vLookat0, vLookat0, vLookat1[0h] // Dot products in elements 0, 1, 4, 5
@@ -3156,10 +3152,10 @@ lt_normal_to_vertex:
     vmadm   vBBB, vCCC, vCCC[3]        // PL: + len^2 int * quadratic factor frac
     vmadn   $v29, vDDD, vCCC[7]        // PL: + len^2 frac * quadratic factor int = $v29 frac
     vmadh   vCCC, vCCC, vCCC[7]        // PL: + len^2 int * quadratic factor int  = vCCC int
-    vmudh   vBBB, vOne, vAAA[0h]       // Both: Sum components of dot product as signed
-    vmadh   vBBB, vOne, vAAA[1h]       // Both:
+    vmudh   vBBB, $v31, $v31[2]        // Both: Clear accumulator (sum dot product components); dest is vBBB, not $v29, which vrcpl reads below
+    vadd    vBBB, vAAA, vAAA[1h]       // Both: accum lo elem 0 = vAAA[0] + vAAA[1], elem 4 = vAAA[4] + vAAA[5]
     bnez    $10, lt_after_camera       // $10 set if computing specular or fresnel
-     vmadh  vAAA, vOne, vAAA[2h]       // Both: vAAA dot product
+     vmadn  vAAA, vOne, vAAA[2h]       // Both: + vAAA[2] (elem 0), + vAAA[6] (elem 4); vAAA dot product
     vrcph   vBBB[1], vCCC[0]     // 1/(2*light factor), input of 0000.8000 -> no change normals
     luv     vDDD,    (ltBufOfs + 0 - lightSize)(curLight) // vDDD = light color
     vrcpl   vBBB[2], $v29[0]     // Light factor 0001.0000 -> normals /= 2