Non-working draft which matches/beats EX2

This commit is contained in:
Sauraen
2024-08-03 16:47:55 -07:00
parent f20a00f591
commit df2d86a789
3 changed files with 143 additions and 78 deletions

View File

@@ -4,15 +4,15 @@
Vertex pipeline cycles per **vertex pair** in steady state (lower is better).
Hand-counted timings taking into account all pipeline stalls and all dual-issue
conditions except for instruction alignment.
conditions. Instruction alignment is only taken into account for LVP_NOC.
| Microcode | No Lighting | First Dir Lt | Total for 1 Dir Lt | Extra Dir Lts |
|----------------|-------------|--------------|--------------------|---------------|
| F3DEX3 | 98 | 103 | 201 | 29 |
| F3DEX3_NOC | 79 | 103 | 182 | 29 |
| F3DEX3_LVP | 81 | 15 | 96 | 7 |
| F3DEX3_LVP_NOC | 62 | 15 | 77 | 7 |
| F3DEX2 | 54 | 19 | 73 | 3 then 12 |
| F3DEX3_LVP_NOC | 54 | 17 | 71 | 7, 7, 7, 7, ... |
| F3DEX2 | 54 | 19 | 73 | 3, 12, 3, 12, ... |
Vertex processing time as reported by the performance counter in the `PA`
configuration.
@@ -26,7 +26,7 @@ configuration.
| F3DEX3 | 7.64ms | 3.13ms | 2.37ms |
| F3DEX3_NOC | 7.07ms | 2.89ms | 2.14ms |
| F3DEX3_LVP | 4.57ms | 1.77ms | 1.67ms |
| F3DEX3_LVP_NOC | 3.96ms | 1.52ms | 1.41ms |
| F3DEX3_LVP_NOC | Outdated | | |
| F3DEX2 | No* | No* | No* |
| Vertex count | 3664 | 1608 | 1608 |

192
f3dex3.s
View File

@@ -1525,111 +1525,128 @@ vtx_after_calc_mit:
srl $7, $7, 5 // 8 if G_FOG is set, 0 otherwise
// vertices_store / vtx_loop_no_lighting: entry into the vertex-pair store loop
// when lighting is not being computed. Control falls through from here into
// vtx_return_from_lighting.
vertices_store:
vge XV3, XV25, $v31[2] // 0; clamp Z to >= 0
sKPI equ $v11 // vtx_store Keep Int (keep across pipelining)
sKPF equ $v12 // vtx_store Keep Frac
// NOTE(review): presumably this alignment fixes how the vector/scalar pairs
// below are issued together; confirm against the hand-counted cycle timings.
.align 8
vtx_loop_no_lighting:
// Last terms of the matrix * position accumulation. The M3 * 1, M0 * pos[0h]
// and M1 * pos[1h] terms are issued at the end of the loop body, so only the
// M2 * pos[2h] terms remain; these write the transformed position registers.
vmadn vPairTPosF, vM2F, vPairPosI[2h]
or $10, $10, $11 // Combine results for first vertex
vmadh vPairTPosI, vM2I, vPairPosI[2h]
sh $10, (VTX_CLIP )($19) // Store first vertex flags
sKPG equ vBBB // = $v21
// Both clamps below read sKPI, which is kept from the previous pass.
vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
sCLZ equ $v19 // Holds the Z value clamped to >= 0 by the vge below
vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0
addi $1, $1, -2*inputVtxSize // Decrement vertex count by 2
// vtx_return_from_lighting: main body of the vertex-pair store loop. Reached by
// fall-through from vtx_loop_no_lighting, and branched to from the lighting
// code (beqz $17, vtx_return_from_lighting in lt_post).
// In order, the visible code: applies the perspective normalization to the
// transformed position, takes the reciprocal of W, computes screen and scaled
// clip flags, stores the output vertex fields for both vertices, loads the
// next pair's inputs, and starts the matrix transform for the next pair.
// NOTE(review): this listing contains both XV*-named lines and s*-named lines
// that perform the same steps (e.g. "vmadm XV2, ..." and "vmadm s1WH, ...").
// The XV* lines look like the pre-rename versions of the s* lines; confirm
// which set is live before relying on the instruction order shown here.
vtx_return_from_lighting:
vmudl $v29, vPairTPosF, $v30[3] // Persp norm
sub $11, secondVtxPos, $7 // 8 before secondVtxPos if fog ($7 = 8), else secondVtxPos itself ($7 = 0)
vmadm XV2, vPairTPosI, $v30[3] // Persp norm
sbv XV27[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
vmadn XV21, $v31, $v31[2] // 0
sbv XV27[7], (VTX_COLOR_A + 8 - vtxSize)($11) // ...which gets overwritten below
vmov XV26[1], XV3[2]
ssv XV3[12], (VTX_SCR_Z )(secondVtxPos)
vmudn XV7, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping
slv XV25[8], (VTX_SCR_VEC )(secondVtxPos)
vmadh XV6, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping
sdv XV25[0], (VTX_SCR_VEC )($19)
vrcph $v29[0], XV2[3]
ssv XV26[12], (VTX_SCR_Z_FRAC )(secondVtxPos)
vrcpl XV5[3], XV21[3]
slv XV26[2], (VTX_SCR_Z )($19)
vrcph XV4[3], XV2[7]
// s1WH is $v16 // vtx_store 1/W High
vmadm s1WH, vPairTPosI, $v30[3] // Persp norm
addi outputVtxPos, outputVtxPos, 2*vtxSize // Points to SECOND output vtx
vrcpl XV5[7], XV21[7]
sra $11, $1, 31 // All 1s if on last iter
vrcph XV4[7], $v31[2] // 0
andi $11, $11, vtxSize // vtxSize if on last iter, else normally 0
// s1WL is $v17 // vtx_store 1/W Low
vmadn s1WL, $v31, $v31[2] // 0
sbv sKPG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
vmov sKPF[1], sCLZ[2]
sbv sKPG[7], (VTX_COLOR_A + 8 - vtxSize)($11) // ...which gets overwritten below
// sSCL is $v20 // vtx_store Scaled Clipping Low
vmudn sSCL, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping
ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos)
// sSCH is $v21 // vtx_store Scaled Clipping High
vmadh sSCH, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping
slv sKPI[8], (VTX_SCR_VEC )(secondVtxPos)
// Reciprocal sequence over elements 3 and 7 (one per vertex) of the
// perspective-normalized value in s1WH:s1WL; results go to sWRH:sWRL.
vrcph $v29[0], s1WH[3]
slv sKPI[0], (VTX_SCR_VEC )($19)
// sWRL is $v25 // vtx_store W Reciprocal Low
vrcpl sWRL[3], s1WL[3]
ssv sKPF[12], (VTX_SCR_Z_FRAC )(secondVtxPos)
// sWRH is $v26 // vtx_store W Reciprocal High
vrcph sWRH[3], s1WH[7]
slv sKPF[2], (VTX_SCR_Z )($19)
vrcpl sWRL[7], s1WL[7]
sra $24, $1, 31 // All 1s if on last iter
vrcph sWRH[7], $v31[2] // 0
andi $24, $24, vtxSize // vtxSize if on last iter, else normally 0
// Screen clipping: each component is compared against element 3 of its own
// vertex half ([3h]); the result is read out of $vcc by the cfc2 below.
vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high
sub secondVtxPos, outputVtxPos, $11 // First output vtx on last iter, else second
sub secondVtxPos, outputVtxPos, $24 // First output vtx on last iter, else second
vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low
addi $19, outputVtxPos, -vtxSize // First output vtx always
vmudl $v29, XV21, XV5
vmudl $v29, s1WL, sWRL
cfc2 $10, $vcc // Screen clip results
vmadm $v29, XV2, XV5
vmadm $v29, s1WH, sWRL
sdv vPairTPosF[8], (VTX_FRAC_VEC )(secondVtxPos)
vmadn XV21, XV21, XV4
ldv XTEMPST[0], (VTX_IN_TC + 2 * inputVtxSize)(inputVtxPos) // ST in 0:1, RGBA in 2:3
vmadh XV2, XV2, XV4
vmadn s1WL, s1WL, sWRH
sTCL equ $v19
ldv sTCL[0], (VTX_IN_TC + 2 * inputVtxSize)(inputVtxPos) // ST in 0:1, RGBA in 2:3
vmadh s1WH, s1WH, sWRH
sdv vPairTPosF[0], (VTX_FRAC_VEC )($19)
vge $v29, vPairTPosI, $v31[2] // Int position XYZW >= 0
lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below
vmudh $v29, vOne, $v31[4] // 4
sdv vPairTPosI[8], (VTX_INT_VEC )(secondVtxPos)
vmadn XV26, XV21, $v31[0] // -4
vmadn sKPF, s1WL, $v31[0] // -4
lsv vPairTPosF[6], (VTX_Z_FRAC )($19) // load Z into W slot, will be for fog below
vmadh XV25, XV2, $v31[0] // -4
vmadh sKPI, s1WH, $v31[0] // -4
sdv vPairTPosI[0], (VTX_INT_VEC )($19)
vmrg XV2, vZero, $v31[7] // Set to 0 where positive, 0x7FFF where negative
ldv XTEMPST[8], (VTX_IN_TC + 3 * inputVtxSize)(inputVtxPos) // ST in 4:5, RGBA in 6:7
vch $v29, vPairTPosI, XV6[3h] // Clip scaled high
slv vPairRGBA[0], (VTX_COLOR_VEC )(secondVtxPos) // Store RGBA for first vector
vmudl $v29, XV26, XV5
sWNG equ $v16 // vtx_store W NeGative
vmrg sWNG, vZero, $v31[7] // Set to 0 where positive, 0x7FFF where negative
ldv sTCL[8], (VTX_IN_TC + 3 * inputVtxSize)(inputVtxPos) // ST in 4:5, RGBA in 6:7
vch $v29, vPairTPosI, sSCH[3h] // Clip scaled high
suv vPairRGBA[4], (VTX_COLOR_VEC )(secondVtxPos) // Store RGBA for second vtx
vmudl $v29, sKPF, sWRL
lsv vPairTPosI[14], (VTX_Z_INT )(secondVtxPos) // load Z into W slot, will be for fog below
vmadm $v29, XV25, XV5
slv vPairRGBA[4], (VTX_COLOR_VEC )($19) // Store RGBA for second vector
vmadn XV5, XV26, XV4
vmadm $v29, sKPI, sWRL
suv vPairRGBA[0], (VTX_COLOR_VEC )($19) // Store RGBA for first vtx
vmadn sWRL, sKPF, sWRH
lsv vPairTPosI[6], (VTX_Z_INT )($19) // load Z into W slot, will be for fog below
vmadh XV4, XV25, XV4
vmadh sWRH, sKPI, sWRH
srl $24, $10, 4 // Shift second vertex screen clipping to first slots
vmadh XV2, XV2, $v31[7] // Makes screen coords a large number if W < 0
vmadh sWNG, sWNG, $v31[7] // Makes screen coords a large number if W < 0
andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
vcl $v29, vPairTPosF, XV7[3h] // Clip scaled low
vcl $v29, vPairTPosF, sSCL[3h] // Clip scaled low
cfc2 $20, $vcc // Scaled clip results
vmudl $v29, vPairTPosF, XV5[3h] // Pos times inv W
ssv XV5[14], (VTX_INV_W_FRAC)(secondVtxPos)
vmadm $v29, vPairTPosI, XV5[3h] // Pos times inv W
vmudl $v29, vPairTPosF, sWRL[3h] // Pos times inv W
ssv sWRL[14], (VTX_INV_W_FRAC)(secondVtxPos)
vmadm $v29, vPairTPosI, sWRL[3h] // Pos times inv W
// vPairPosI is $v20
ldv vPairPosI[0], (VTX_IN_OB + 2 * inputVtxSize)(inputVtxPos) // Pos of 1st vector for next iteration
vmadn XV26, vPairTPosF, XV2[3h] // Makes screen coords a large number if W < 0
vmadn sKPF, vPairTPosF, sWNG[3h] // Makes screen coords a large number if W < 0
ldv vPairPosI[8], (VTX_IN_OB + 3 * inputVtxSize)(inputVtxPos) // Pos of 2nd vector on next iteration
vmadh XV25, vPairTPosI, XV2[3h] // XV25:XV26 = pos times inv W
vmadh sKPI, vPairTPosI, sWNG[3h] // sKPI:sKPF = pos times inv W
addi inputVtxPos, inputVtxPos, (2 * inputVtxSize) // Advance two positions forward in the input vertices
vmudm XV3, vPairST, sSTS // Scale ST
sST2 equ $v21
vmudm sST2, vPairST, sSTS // Scale ST
andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
vcopy vPairST, XTEMPST
vcopy vPairST, sTCL
sll $11, $20, 4 // Shift first vertex scaled clipping to second slots
vmudl $v29, XV26, $v30[3] // Persp norm
ssv XV5[6], (VTX_INV_W_FRAC)($19)
vmadm XV25, XV25, $v30[3] // Persp norm
ssv XV4[14], (VTX_INV_W_INT )(secondVtxPos)
vmadn XV26, $v31, $v31[2] // 0; Now XV26:XV25 = projected position
ssv XV4[6], (VTX_INV_W_INT )($19)
vmov XTEMPST[4], vPairST[2]
slv XV3[4], (VTX_TC_VEC )(secondVtxPos) // Store scaled S, T vertex 1
vmov XTEMPST[5], vPairST[3]
slv XV3[12], (VTX_TC_VEC )($19) // Store scaled S, T vertex 2
vmudl $v29, sKPF, $v30[3] // Persp norm
ssv sWRL[6], (VTX_INV_W_FRAC)($19)
vmadm sKPI, sKPI, $v30[3] // Persp norm
ssv sWRH[14], (VTX_INV_W_INT )(secondVtxPos)
vmadn sKPF, $v31, $v31[2] // 0; Now sKPI:sKPF = projected position
ssv sWRH[6], (VTX_INV_W_INT )($19)
vmov sTCL[4], vPairST[2] // First vtx RG to elem 4
slv sST2[4], (VTX_TC_VEC )(secondVtxPos) // Store scaled S, T vertex 1
vmov sTCL[5], vPairST[3] // First vtx BA to elem 5
slv sST2[12], (VTX_TC_VEC )($19) // Store scaled S, T vertex 2
// Viewport transform: offset * 1 + projected position * scale.
vmudh $v29, sVPO, vOne // offset * 1
andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about
vmadn XV26, XV26, sVPS // + pos frac * scale
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
vmadh XV25, XV25, sVPS // int part, XV25:XV26 is now screen space pos
vmadn sKPF, sKPF, sVPS // + pos frac * scale
or $24, $24, $20 // Combine results for second vertex
vmadh sKPI, sKPI, sVPS // int part, sKPI:sKPF is now screen space pos
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
// Start the matrix * position accumulation for the next vertex pair, using the
// vPairPosI values loaded above: M3 * 1, then M0 * pos[0h], then M1 * pos[1h].
// The M2 * pos[2h] terms that write vPairTPosF/vPairTPosI also appear at
// vtx_loop_no_lighting.
vmudn $v29, vM3F, vOne
or $10, $10, $11 // Combine results for first vertex
vmadh $v29, vM3I, vOne
sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags
vmadn $v29, vM0F, vPairPosI[0h]
sh $10, (VTX_CLIP )($19) // Store first vertex results
vmadh $v29, vM0I, vPairPosI[0h]
vmadh $v29, vM3I, vOne
// Leave the loop once the remaining count in $1 is <= 0.
blez $1, vtx_epilogue
vmadn $v29, vM1F, vPairPosI[1h]
vmadh $v29, vM1I, vPairPosI[1h]
sdv XTEMPST[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA
vmadn vPairTPosF, vM2F, vPairPosI[2h]
luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
vmadh vPairTPosI, vM2I, vPairPosI[2h]
vmadn $v29, vM0F, vPairPosI[0h]
vmadh $v29, vM0I, vPairPosI[0h]
sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order
vmadn $v29, vM1F, vPairPosI[1h]
jr $ra
vge XV27, XV25, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
vmadh $v29, vM1I, vPairPosI[1h]
vtx_epilogue:
@@ -1806,7 +1823,7 @@ middle_of_vtx_store:
// vPairNrml is $v16
vmudn vPairNrml, vPairRGBA, $v31[3] // 2; left shift RGBA without clamp; vtx pair normals
ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos)
// sCLZ is $v21 // vtx_store CLipped Z
// sCLZ is $v21 // vtx_store CLamped Z
vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19)
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
@@ -1932,7 +1949,7 @@ skip_return_to_lt_or_loop:
ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos)
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
or $24, $24, $20 // Combine results for second vertex
// sCLZ is $v25 // vtx_store CLipped Z
// sCLZ is $v25 // vtx_store CLamped Z
vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19)
vmulf $v29, sOPM, vPairTPosI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2
@@ -3113,10 +3130,23 @@ xfrm_single_dir:
sw $11, (0)($20) // Store 3 (4) bytes to target address
// This clobbers the specular size
.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE
.align 8
.endif
lt_vtx_pair:
//
// LVP main lighting
//
.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
vmadn vPairTPosF, vM2F, vPairPosI[2h]
lpv vPairNrml[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair normals
vmadh vPairTPosI, vM2I, vPairPosI[2h]
lpv $v14[7], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4
// vnop
lpv $v15[6], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4
// vnop
or $10, $10, $11 // Combine results for first vertex
.endif
vmulf $v29, vPairNrml, $v13[4] // Normals X elems 0, 4 * first light dir
luv vPairLt, (ltBufOfs + 0)($3) // Total light level, init to ambient
vmacf $v29, $v14, $v13[5] // Normals Y elems 0, 4 * first light dir
@@ -3127,8 +3157,10 @@ lt_vtx_pair:
.endif
beq $3, altBaseReg, lt_post
lpv $v18[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut
// vnop
move curLight, $3 // Point to ambient light
// vnop
// nop
lt_loop:
vge vCCC, vAAA, $v31[2] // 0; clamp dot product to >= 0
vmulf $v29, vPairNrml, vDDD[4] // Normals X elems 0, 4
@@ -3141,12 +3173,24 @@ lt_loop:
bne curLight, altBaseReg, lt_loop
vmacf vPairLt, vBBB, vCCC[0h] // + light color * dot product
lt_post:
.if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC
vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
sh $10, (VTX_CLIP )($19) // Store first vertex flags
vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0
addi $1, $1, -2*inputVtxSize // Decrement vertex count by 2
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
beqz $17, vtx_return_from_lighting
vmrg vPairRGBA, vPairLt, vPairRGBA // RGB = light, A = vtx alpha
.else
beqz $17, vtx_early_return_from_lighting
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
.endif
.endif
// These definitions are shared by both versions
vLookat1 equ vAAA
vLookat0 equ vPairLt
// Texgen uses these, vCCC:vDDD, and of course vPairST.
.if CFG_LEGACY_VTX_PIPE
vmulf $v29, vPairNrml, $v18[0] // Normals X elems 0, 4 * lookat 0 X
vmacf $v29, $v14, $v18[1] // Normals Y elems 0, 4 * lookat 0 Y

21
notes.s
View File

@@ -36,3 +36,24 @@ lpv $v27[4], ( 0)($11) // 0C 0D 0E 0F 00 01 02 03
lpv $v27[5], ( 0)($11) // 0B 0C 0D 0E 0F 00 01 02
lpv $v27[6], ( 0)($11) // 0A 0B 0C 0D 0E 0F 00 01
lpv $v27[7], ( 0)($11) // 09 0A 0B 0C 0D 0E 0F 00
// spv and suv always store to the 8 bytes at/after the scalar reg + offset.
// The first byte stored comes from lane e, and each following byte comes from
// the next lane, going from lane 7 back around to lane 0. Once the running
// index reaches 8 (i.e. after going back around), the functionality of spv and
// suv swaps: spv stores the remaining lanes the way suv would, and suv stores
// them the way spv would.
// Each row below lists, for memory bytes 0-7, which lane is stored and how.
// Mem addr rel to $11 0 1 2 3 4 5 6 7
spv $v27[0], ( 0)($11) // P0 P1 P2 P3 P4 P5 P6 P7 Lane number
spv $v27[1], ( 0)($11) // P1 P2 P3 P4 P5 P6 P7 U0 P = packed (top 8 bits)
spv $v27[2], ( 0)($11) // P2 P3 P4 P5 P6 P7 U0 U1 U = unsigned (bits 14:7)
spv $v27[3], ( 0)($11) // P3 P4 P5 P6 P7 U0 U1 U2
spv $v27[4], ( 0)($11) // P4 P5 P6 P7 U0 U1 U2 U3
spv $v27[5], ( 0)($11) // P5 P6 P7 U0 U1 U2 U3 U4
spv $v27[6], ( 0)($11) // P6 P7 U0 U1 U2 U3 U4 U5
spv $v27[7], ( 0)($11) // P7 U0 U1 U2 U3 U4 U5 U6
suv $v27[0], ( 0)($11) // U0 U1 U2 U3 U4 U5 U6 U7 = spv $v27[8], (0)($11)
suv $v27[1], ( 0)($11) // U1 U2 U3 U4 U5 U6 U7 P0
suv $v27[2], ( 0)($11) // U2 U3 U4 U5 U6 U7 P0 P1
suv $v27[3], ( 0)($11) // U3 U4 U5 U6 U7 P0 P1 P2
suv $v27[4], ( 0)($11) // U4 U5 U6 U7 P0 P1 P2 P3
suv $v27[5], ( 0)($11) // U5 U6 U7 P0 P1 P2 P3 P4
suv $v27[6], ( 0)($11) // U6 U7 P0 P1 P2 P3 P4 P5
suv $v27[7], ( 0)($11) // U7 P0 P1 P2 P3 P4 P5 P6