Non working draft which matches/beats EX2

2026-01-21 10:37:45 -08:00 · 2024-08-03 16:47:55 -07:00
parent f20a00f591
commit df2d86a789
3 changed files with 143 additions and 78 deletions
--- a/docs/Documentation/Performance.md
+++ b/docs/Documentation/Performance.md
@@ -4,15 +4,15 @@

 Vertex pipeline cycles per **vertex pair** in steady state (lower is better).
 Hand-counted timings taking into account all pipeline stalls and all dual-issue
-conditions except for instruction alignment.
+conditions. Instruction alignment is only taken into account for LVP_NOC.

 | Microcode      | No Lighting | First Dir Lt | Total for 1 Dir Lt | Extra Dir Lts |
 |----------------|-------------|--------------|--------------------|---------------|
 | F3DEX3         | 98          | 103          | 201                | 29            |
 | F3DEX3_NOC     | 79          | 103          | 182                | 29            |
 | F3DEX3_LVP     | 81          | 15           | 96                 | 7             |
-| F3DEX3_LVP_NOC | 62          | 15           | 77                 | 7             |
-| F3DEX2         | 54          | 19           | 73                 | 3 then 12     |
+| F3DEX3_LVP_NOC | 54          | 17           | 71                 | 7, 7, 7, 7, ...   |
+| F3DEX2         | 54          | 19           | 73                 | 3, 12, 3, 12, ... |

 Vertex processing time as reported by the performance counter in the `PA`
 configuration.
@@ -26,7 +26,7 @@ configuration.
 | F3DEX3         | 7.64ms  | 3.13ms  | 2.37ms  |
 | F3DEX3_NOC     | 7.07ms  | 2.89ms  | 2.14ms  |
 | F3DEX3_LVP     | 4.57ms  | 1.77ms  | 1.67ms  |
-| F3DEX3_LVP_NOC | 3.96ms  | 1.52ms  | 1.41ms  |
+| F3DEX3_LVP_NOC | Outdated  | | |
 | F3DEX2         | No*     | No*     | No*     |
 | Vertex count   | 3664    | 1608    | 1608    |

--- a/f3dex3.s
+++ b/f3dex3.s
@@ -1525,111 +1525,128 @@ vtx_after_calc_mit:

    srl     $7, $7, 5  // 8 if G_FOG is set, 0 otherwise

-vertices_store:
-    vge     XV3, XV25, $v31[2]              // 0; clamp Z to >= 0
+sKPI equ $v11 // vtx_store Keep Int (keep across pipelining)
+sKPF equ $v12 // vtx_store Keep Frac
+
+.align 8
+vtx_loop_no_lighting:
+    vmadn   vPairTPosF, vM2F, vPairPosI[2h]
+    or      $10, $10, $11          // Combine results for first vertex
+    vmadh   vPairTPosI, vM2I, vPairPosI[2h]
+    sh      $10,              (VTX_CLIP      )($19) // Store first vertex flags
+sKPG equ vBBB  // = $v21
+    vge     sKPG, sKPI, $v31[6]  // Clamp W/fog to >= 0x7F00 (low byte is used)
+    luv     vPairRGBA[0],    (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
+sCLZ equ $v19
+    vge     sCLZ, sKPI, $v31[2]              // 0; clamp Z to >= 0
    addi    $1, $1, -2*inputVtxSize         // Decrement vertex count by 2
+vtx_return_from_lighting:
    vmudl   $v29, vPairTPosF, $v30[3]       // Persp norm
    sub     $11, secondVtxPos, $7           // Points 8 before secondVtxPos if fog, else 0
-    vmadm   XV2, vPairTPosI, $v30[3]        // Persp norm
-    sbv     XV27[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
-    vmadn   XV21, $v31, $v31[2]             // 0
-    sbv     XV27[7],  (VTX_COLOR_A + 8 - vtxSize)($11) // ...which gets overwritten below
-    vmov    XV26[1], XV3[2]
-    ssv     XV3[12],  (VTX_SCR_Z      )(secondVtxPos)
-    vmudn   XV7, vPairTPosF, $v31[3]        // W * clip ratio for scaled clipping
-    slv     XV25[8],  (VTX_SCR_VEC    )(secondVtxPos)
-    vmadh   XV6, vPairTPosI, $v31[3]        // W * clip ratio for scaled clipping
-    sdv     XV25[0],  (VTX_SCR_VEC    )($19)
-    vrcph   $v29[0], XV2[3]
-    ssv     XV26[12], (VTX_SCR_Z_FRAC )(secondVtxPos)
-    vrcpl   XV5[3], XV21[3]
-    slv     XV26[2],  (VTX_SCR_Z      )($19)
-    vrcph   XV4[3], XV2[7]
+// s1WH is $v16 // vtx_store 1/W High
+    vmadm   s1WH, vPairTPosI, $v30[3]        // Persp norm
    addi    outputVtxPos, outputVtxPos, 2*vtxSize // Points to SECOND output vtx
-    vrcpl   XV5[7], XV21[7]
-    sra     $11, $1, 31        // All 1s if on last iter
-    vrcph   XV4[7], $v31[2] // 0
-    andi    $11, $11, vtxSize  // vtxSize if on last iter, else normally 0
+// s1WL is $v17 // vtx_store 1/W Low
+    vmadn   s1WL, $v31, $v31[2]             // 0
+    sbv     sKPG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
+    vmov    sKPF[1], sCLZ[2]
+    sbv     sKPG[7],  (VTX_COLOR_A + 8 - vtxSize)($11) // ...which gets overwritten below
+// sSCL is $v20 // vtx_store Scaled Clipping Low
+    vmudn   sSCL, vPairTPosF, $v31[3]        // W * clip ratio for scaled clipping
+    ssv     sCLZ[12],  (VTX_SCR_Z      )(secondVtxPos)
+// sSCH is $v21 // vtx_store Scaled Clipping High
+    vmadh   sSCH, vPairTPosI, $v31[3]        // W * clip ratio for scaled clipping
+    slv     sKPI[8],  (VTX_SCR_VEC    )(secondVtxPos)
+    vrcph   $v29[0], s1WH[3]
+    slv     sKPI[0],  (VTX_SCR_VEC    )($19)
+// sWRL is $v25 // vtx_store W Reciprocal Low
+    vrcpl   sWRL[3], s1WL[3]
+    ssv     sKPF[12], (VTX_SCR_Z_FRAC )(secondVtxPos)
+// sWRH is $v26 // vtx_store W Reciprocal High
+    vrcph   sWRH[3], s1WH[7]
+    slv     sKPF[2],  (VTX_SCR_Z      )($19)
+    vrcpl   sWRL[7], s1WL[7]
+    sra     $24, $1, 31        // All 1s if on last iter
+    vrcph   sWRH[7], $v31[2] // 0
+    andi    $24, $24, vtxSize  // vtxSize if on last iter, else normally 0
    vch     $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high
-    sub     secondVtxPos, outputVtxPos, $11 // First output vtx on last iter, else second
+    sub     secondVtxPos, outputVtxPos, $24 // First output vtx on last iter, else second
    vcl     $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low
    addi    $19, outputVtxPos, -vtxSize  // First output vtx always
-    vmudl   $v29, XV21, XV5
+    vmudl   $v29, s1WL, sWRL
    cfc2    $10, $vcc                   // Screen clip results
-    vmadm   $v29, XV2, XV5
+    vmadm   $v29, s1WH, sWRL
    sdv     vPairTPosF[8],  (VTX_FRAC_VEC  )(secondVtxPos)
-    vmadn   XV21, XV21, XV4
-    ldv     XTEMPST[0],   (VTX_IN_TC + 2 * inputVtxSize)(inputVtxPos) // ST in 0:1, RGBA in 2:3
-    vmadh   XV2, XV2, XV4
+    vmadn   s1WL, s1WL, sWRH
+sTCL equ $v19
+    ldv     sTCL[0],   (VTX_IN_TC + 2 * inputVtxSize)(inputVtxPos) // ST in 0:1, RGBA in 2:3
+    vmadh   s1WH, s1WH, sWRH
    sdv     vPairTPosF[0],  (VTX_FRAC_VEC  )($19)
    vge     $v29, vPairTPosI, $v31[2]  // Int position XYZW >= 0
    lsv     vPairTPosF[14], (VTX_Z_FRAC    )(secondVtxPos) // load Z into W slot, will be for fog below
    vmudh   $v29, vOne, $v31[4]  // 4
    sdv     vPairTPosI[8],  (VTX_INT_VEC   )(secondVtxPos)
-    vmadn   XV26, XV21, $v31[0]  // -4
+    vmadn   sKPF, s1WL, $v31[0]  // -4
    lsv     vPairTPosF[6],  (VTX_Z_FRAC    )($19) // load Z into W slot, will be for fog below
-    vmadh   XV25, XV2, $v31[0]  // -4
+    vmadh   sKPI, s1WH, $v31[0]  // -4
    sdv     vPairTPosI[0],  (VTX_INT_VEC   )($19)
-    vmrg    XV2, vZero, $v31[7] // Set to 0 where positive, 0x7FFF where negative
-    ldv     XTEMPST[8],   (VTX_IN_TC + 3 * inputVtxSize)(inputVtxPos) // ST in 4:5, RGBA in 6:7
-    vch     $v29, vPairTPosI, XV6[3h] // Clip scaled high
-    slv     vPairRGBA[0],   (VTX_COLOR_VEC )(secondVtxPos) // Store RGBA for first vector
-    vmudl   $v29, XV26, XV5
+sWNG equ $v16 // vtx_store W NeGative
+    vmrg    sWNG, vZero, $v31[7] // Set to 0 where positive, 0x7FFF where negative
+    ldv     sTCL[8],   (VTX_IN_TC + 3 * inputVtxSize)(inputVtxPos) // ST in 4:5, RGBA in 6:7
+    vch     $v29, vPairTPosI, sSCH[3h] // Clip scaled high
+    suv     vPairRGBA[4],   (VTX_COLOR_VEC )(secondVtxPos) // Store RGBA for second vtx
+    vmudl   $v29, sKPF, sWRL
    lsv     vPairTPosI[14], (VTX_Z_INT     )(secondVtxPos) // load Z into W slot, will be for fog below
-    vmadm   $v29, XV25, XV5
-    slv     vPairRGBA[4],   (VTX_COLOR_VEC )($19) // Store RGBA for second vector
-    vmadn   XV5, XV26, XV4
+    vmadm   $v29, sKPI, sWRL
+    suv     vPairRGBA[0],   (VTX_COLOR_VEC )($19) // Store RGBA for first vtx
+    vmadn   sWRL, sKPF, sWRH
    lsv     vPairTPosI[6],  (VTX_Z_INT     )($19) // load Z into W slot, will be for fog below
-    vmadh   XV4, XV25, XV4
+    vmadh   sWRH, sKPI, sWRH
    srl     $24, $10, 4            // Shift second vertex screen clipping to first slots
-    vmadh   XV2, XV2, $v31[7]           // Makes screen coords a large number if W < 0
+    vmadh   sWNG, sWNG, $v31[7]           // Makes screen coords a large number if W < 0
    andi    $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
-    vcl     $v29, vPairTPosF, XV7[3h] // Clip scaled low
+    vcl     $v29, vPairTPosF, sSCL[3h] // Clip scaled low
    cfc2    $20, $vcc                   // Scaled clip results
-    vmudl   $v29, vPairTPosF, XV5[3h] // Pos times inv W
-    ssv     XV5[14],          (VTX_INV_W_FRAC)(secondVtxPos)
-    vmadm   $v29, vPairTPosI, XV5[3h] // Pos times inv W
+    vmudl   $v29, vPairTPosF, sWRL[3h] // Pos times inv W
+    ssv     sWRL[14],          (VTX_INV_W_FRAC)(secondVtxPos)
+    vmadm   $v29, vPairTPosI, sWRL[3h] // Pos times inv W
+// vPairPosI is $v20
    ldv     vPairPosI[0], (VTX_IN_OB + 2 * inputVtxSize)(inputVtxPos) // Pos of 1st vector for next iteration
-    vmadn   XV26, vPairTPosF, XV2[3h] // Makes screen coords a large number if W < 0
+    vmadn   sKPF, vPairTPosF, sWNG[3h] // Makes screen coords a large number if W < 0
    ldv     vPairPosI[8], (VTX_IN_OB + 3 * inputVtxSize)(inputVtxPos) // Pos of 2nd vector on next iteration
-    vmadh   XV25, vPairTPosI, XV2[3h] // XV25:XV26 = pos times inv W
+    vmadh   sKPI, vPairTPosI, sWNG[3h] // sKPI:sKPF = pos times inv W
    addi    inputVtxPos, inputVtxPos, (2 * inputVtxSize) // Advance two positions forward in the input vertices
-    vmudm   XV3, vPairST, sSTS       // Scale ST
+sST2 equ $v21
+    vmudm   sST2, vPairST, sSTS       // Scale ST
    andi    $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
-    vcopy   vPairST, XTEMPST
+    vcopy   vPairST, sTCL
    sll     $11, $20, 4            // Shift first vertex scaled clipping to second slots
-    vmudl   $v29, XV26, $v30[3] // Persp norm
-    ssv     XV5[6],           (VTX_INV_W_FRAC)($19)
-    vmadm   XV25, XV25, $v30[3] // Persp norm
-    ssv     XV4[14],          (VTX_INV_W_INT )(secondVtxPos)
-    vmadn   XV26, $v31, $v31[2] // 0; Now XV26:XV25 = projected position
-    ssv     XV4[6],           (VTX_INV_W_INT )($19)
-    vmov    XTEMPST[4], vPairST[2]
-    slv     XV3[4],           (VTX_TC_VEC    )(secondVtxPos) // Store scaled S, T vertex 1
-    vmov    XTEMPST[5], vPairST[3]
-    slv     XV3[12],          (VTX_TC_VEC    )($19) // Store scaled S, T vertex 2
+    vmudl   $v29, sKPF, $v30[3] // Persp norm
+    ssv     sWRL[6],           (VTX_INV_W_FRAC)($19)
+    vmadm   sKPI, sKPI, $v30[3] // Persp norm
+    ssv     sWRH[14],          (VTX_INV_W_INT )(secondVtxPos)
+    vmadn   sKPF, $v31, $v31[2] // 0; Now sKPI:sKPF = projected position
+    ssv     sWRH[6],           (VTX_INV_W_INT )($19)
+    vmov    sTCL[4], vPairST[2] // First vtx RG to elem 4
+    slv     sST2[4],           (VTX_TC_VEC    )(secondVtxPos) // Store scaled S, T vertex 1
+    vmov    sTCL[5], vPairST[3] // First vtx BA to elem 5
+    slv     sST2[12],          (VTX_TC_VEC    )($19) // Store scaled S, T vertex 2
    vmudh   $v29, sVPO, vOne // offset * 1
    andi    $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about
-    vmadn   XV26, XV26, sVPS      // + pos frac * scale
-    andi    $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
-    vmadh   XV25, XV25, sVPS      // int part, XV25:XV26 is now screen space pos
+    vmadn   sKPF, sKPF, sVPS      // + pos frac * scale
    or      $24, $24, $20          // Combine results for second vertex
+    vmadh   sKPI, sKPI, sVPS      // int part, sKPI:sKPF is now screen space pos
+    andi    $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
    vmudn   $v29, vM3F, vOne
-    or      $10, $10, $11          // Combine results for first vertex
-    vmadh   $v29, vM3I, vOne
    sh      $24,              (VTX_CLIP      )(secondVtxPos) // Store second vertex clip flags
-    vmadn   $v29, vM0F, vPairPosI[0h]
-    sh      $10,              (VTX_CLIP      )($19)          // Store first vertex results
-    vmadh   $v29, vM0I, vPairPosI[0h]
+    vmadh   $v29, vM3I, vOne
    blez    $1, vtx_epilogue
-     vmadn  $v29, vM1F, vPairPosI[1h]
-    vmadh   $v29, vM1I, vPairPosI[1h]
-    sdv     XTEMPST[8],      (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA
-    vmadn   vPairTPosF, vM2F, vPairPosI[2h]
-    luv     vPairRGBA[0],    (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
-    vmadh   vPairTPosI, vM2I, vPairPosI[2h]
+     vmadn  $v29, vM0F, vPairPosI[0h]
+    vmadh   $v29, vM0I, vPairPosI[0h]
+    sdv     sTCL[8],      (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order
+    vmadn   $v29, vM1F, vPairPosI[1h]
    jr      $ra
-     vge    XV27, XV25, $v31[6]  // Clamp W/fog to >= 0x7F00 (low byte is used)
+     vmadh  $v29, vM1I, vPairPosI[1h]
     

 vtx_epilogue:
@@ -1806,7 +1823,7 @@ middle_of_vtx_store:
 // vPairNrml is $v16
    vmudn   vPairNrml, vPairRGBA, $v31[3] // 2; left shift RGBA without clamp; vtx pair normals
    ssv     vPairTPosF[12],   (VTX_SCR_Z_FRAC)(secondVtxPos)
-// sCLZ is $v21 // vtx_store CLipped Z
+// sCLZ is $v21 // vtx_store CLamped Z
    vge     sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
    ssv     vPairTPosF[4],    (VTX_SCR_Z_FRAC)($19)
    vge     sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
@@ -1932,7 +1949,7 @@ skip_return_to_lt_or_loop:
    ssv     vPairTPosF[12],   (VTX_SCR_Z_FRAC)(secondVtxPos)
    vge     sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
    or      $24, $24, $20          // Combine results for second vertex
-// sCLZ is $v25 // vtx_store CLipped Z
+// sCLZ is $v25 // vtx_store CLamped Z
    vge     sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
    ssv     vPairTPosF[4],    (VTX_SCR_Z_FRAC)($19)
    vmulf   $v29, sOPM, vPairTPosI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2
@@ -3113,10 +3130,23 @@ xfrm_single_dir:
     sw     $11, (0)($20)                  // Store 3 (4) bytes to target address
     // This clobbers the specular size
    
+.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE
+.align 8
+.endif
 lt_vtx_pair:
 //
 // LVP main lighting
 //
+.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
+    vmadn   vPairTPosF, vM2F, vPairPosI[2h]
+    lpv     vPairNrml[0],     (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair normals
+    vmadh   vPairTPosI, vM2I, vPairPosI[2h]
+    lpv     $v14[7],      (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4
+    // vnop
+    lpv     $v15[6],      (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4
+    // vnop
+    or      $10, $10, $11          // Combine results for first vertex
+.endif
    vmulf   $v29, vPairNrml, $v13[4] // Normals X elems 0, 4 * first light dir
    luv     vPairLt,     (ltBufOfs + 0)($3)  // Total light level, init to ambient
    vmacf   $v29, $v14, $v13[5] // Normals Y elems 0, 4 * first light dir
@@ -3127,8 +3157,10 @@ lt_vtx_pair:
 .endif
    beq     $3, altBaseReg, lt_post
     lpv    $v18[0],     (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut
+    // vnop
    move    curLight, $3                     // Point to ambient light
    // vnop
+    // nop
 lt_loop:
    vge     vCCC, vAAA, $v31[2] // 0; clamp dot product to >= 0
    vmulf   $v29, vPairNrml, vDDD[4] // Normals X elems 0, 4
@@ -3141,12 +3173,24 @@ lt_loop:
    bne     curLight, altBaseReg, lt_loop
     vmacf  vPairLt, vBBB, vCCC[0h] // + light color * dot product
 lt_post:
+.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
+    vge     sKPG, sKPI, $v31[6]  // Clamp W/fog to >= 0x7F00 (low byte is used)
+    sh      $10, (VTX_CLIP      )($19) // Store first vertex flags
+    vge     sCLZ, sKPI, $v31[2]              // 0; clamp Z to >= 0
+    addi    $1, $1, -2*inputVtxSize         // Decrement vertex count by 2
+    vne     $v29, $v31, $v31[3h]           // Set VCC to 11101110
+    beqz    $17, vtx_return_from_lighting
+     vmrg   vPairRGBA, vPairLt, vPairRGBA  // RGB = light, A = vtx alpha
+.else
    beqz    $17, vtx_early_return_from_lighting
     vne    $v29, $v31, $v31[3h]           // Set VCC to 11101110
 .endif
+
+.endif
 // These definitions are shared by both versions
 vLookat1 equ vAAA
 vLookat0 equ vPairLt
+// Texgen uses these, vCCC:vDDD, and of course vPairST.
 .if CFG_LEGACY_VTX_PIPE
    vmulf   $v29, vPairNrml, $v18[0] // Normals X elems 0, 4 * lookat 0 X
    vmacf   $v29, $v14, $v18[1]      // Normals Y elems 0, 4 * lookat 0 Y
--- a/notes.s
+++ b/notes.s
@@ -36,3 +36,24 @@ lpv     $v27[4], ( 0)($11) // 0C 0D 0E 0F 00 01 02 03
 lpv     $v27[5], ( 0)($11) // 0B 0C 0D 0E 0F 00 01 02
 lpv     $v27[6], ( 0)($11) // 0A 0B 0C 0D 0E 0F 00 01
 lpv     $v27[7], ( 0)($11) // 09 0A 0B 0C 0D 0E 0F 00
+
+// spv and suv always store to the 8 bytes at/after the scalar reg + offset.
+// What is stored starts at lane e, increments and wraps at lane 8. However for
+// the lanes >= 8, the functionality of spv and suv swaps.
+//        Mem addr rel to $11  0  1  2  3  4  5  6  7
+spv     $v27[0], ( 0)($11) // P0 P1 P2 P3 P4 P5 P6 P7  Lane number
+spv     $v27[1], ( 0)($11) // P1 P2 P3 P4 P5 P6 P7 U0  P = packed (top 8 bits)
+spv     $v27[2], ( 0)($11) // P2 P3 P4 P5 P6 P7 U0 U1  U = unsigned (bits 14:7)
+spv     $v27[3], ( 0)($11) // P3 P4 P5 P6 P7 U0 U1 U2
+spv     $v27[4], ( 0)($11) // P4 P5 P6 P7 U0 U1 U2 U3
+spv     $v27[5], ( 0)($11) // P5 P6 P7 U0 U1 U2 U3 U4
+spv     $v27[6], ( 0)($11) // P6 P7 U0 U1 U2 U3 U4 U5
+spv     $v27[7], ( 0)($11) // P7 U0 U1 U2 U3 U4 U5 U6
+suv     $v27[0], ( 0)($11) // U0 U1 U2 U3 U4 U5 U6 U7 = spv $v27[8], (0)($11)
+suv     $v27[1], ( 0)($11) // U1 U2 U3 U4 U5 U6 U7 P0
+suv     $v27[2], ( 0)($11) // U2 U3 U4 U5 U6 U7 P0 P1
+suv     $v27[3], ( 0)($11) // U3 U4 U5 U6 U7 P0 P1 P2
+suv     $v27[4], ( 0)($11) // U4 U5 U6 U7 P0 P1 P2 P3
+suv     $v27[5], ( 0)($11) // U5 U6 U7 P0 P1 P2 P3 P4
+suv     $v27[6], ( 0)($11) // U6 U7 P0 P1 P2 P3 P4 P5
+suv     $v27[7], ( 0)($11) // U7 P0 P1 P2 P3 P4 P5 P6