New changes working

This commit is contained in:
Sauraen
2024-04-10 22:14:32 -07:00
parent e49161f5c4
commit 74d67ae914
2 changed files with 131 additions and 145 deletions

257
f3dex3.s
View File

@@ -1133,11 +1133,14 @@ sSTO equ $v11 // not supported on legacy vtx pipe, but register allocated for it
sSTS equ $v10
.else
sVPO equ $v17
.if CFG_NO_OCCLUSION_PLANE
sVPS equ $v26
.else
sVPS equ $v16
.endif
sSTO equ $v26
sSTS equ $v25
.endif
sCOL equ $v17
.if CFG_LEGACY_VTX_PIPE
sOUTF equ vPairTPosF
sOUTI equ vPairTPosI
@@ -1353,32 +1356,19 @@ vtx_after_calc_mit:
ldv vVP0F[8], (vpMatrix + 0x20)($zero)
ldv vVP2F[8], (vpMatrix + 0x30)($zero)
.endif
andi $16, $5, G_LIGHTING >> 8 // This is clipFlags, but not touched during vtx_store
andi $11, $5, G_LIGHTING >> 8
beqz $11, @@skip_lighting
li $16, vtx_return_from_lighting // This is clipFlags, but not modified
li $16, lt_vtx_pair // during vtx_store
@@skip_lighting:
andi $7, $5, G_FOG >> 8 // Nonzero if fog enabled
jal while_wait_dma_busy // Wait for vertex load to finish
li $19, clipTempVerts // Temp mem we can freely overwrite replaces outputVtxPos
jal middle_of_vtx_store // Sets $ra to vtx_load_loop
j middle_of_vtx_store
move secondVtxPos, $19 // for first pre-loop, same for secondVtxPos
vtx_load_loop:
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
.if CFG_LEGACY_VTX_PIPE
blez $1, vertex_end
.endif
nop // TODO
.if CFG_LEGACY_VTX_PIPE
vmrg vPairNrml, vPairNrml, vDDD // Merge normals
addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
.endif
vmrg vPairRGBA, sCOL, vPairRGBA // Merge colors
bnez $16, lt_vtx_pair // Lighting enabled in clipFlags
.if CFG_LEGACY_VTX_PIPE
addi outputVtxPos, outputVtxPos, 2*vtxSize
.else
// Elems 0-1 get bytes 6-7 of the following vertex (0)
lpv vAAA[2], (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2
.endif
vtx_return_from_lighting:
li $ra, vertex_end
.if CFG_LEGACY_VTX_PIPE
vmudm vPairST, vPairST, sSTS // Scale ST; must be after texgen
@@skipsecond:
@@ -1398,22 +1388,17 @@ vtx_return_from_lighting:
vmadn $v29, vVP0F, vPairPosI[0h]
vmadh $v29, vVP0I, vPairPosI[0h]
vmadl $v29, vVP1F, vPairPosF[1h]
addi outputVtxPos, outputVtxPos, 2*vtxSize
vmadm $v29, vVP1I, vPairPosF[1h]
addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
vmadn $v29, vVP1F, vPairPosI[1h]
vmadh $v29, vVP1I, vPairPosI[1h]
vmadl $v29, vVP2F, vPairPosF[2h]
vmadm $v29, vVP2I, vPairPosF[2h]
vmadn vPairTPosF, vVP2F, vPairPosI[2h]
li $ra, vertex_end // Done with vertex processing...
vmadh vPairTPosI, vVP2I, vPairPosI[2h]
blez $1, @@skiploop // ...if <= 0 verts remain, ...
vmudm $v29, vPairST, sSTS // Scale ST; must be after texgen
li $ra, vtx_load_loop // ...otherwise keep looping
@@skiploop:
vmudm $v29, vPairST, sSTS // Scale ST; must be after texgen
vmadh vPairST, sSTO, vOne // + 1 * (ST offset or zero)
.endif
addi outputVtxPos, outputVtxPos, 2*vtxSize
vtx_store:
// Inputs: vPairTPosI, vPairTPosF, vPairST, vPairRGBA
// Locals: $v20, $v21, $v25, $v26, $v16, $v17 ($v29 is temp). Also vPairST and
@@ -1425,14 +1410,13 @@ s1WH equ $v16 // vtx_store 1/W High
s1WL equ $v17 // vtx_store 1/W Low
sWRL equ $v25 // vtx_store W Reciprocal Low | IMPORTANT: Can be the same reg as sWRH, but
sWRH equ $v26 // vtx_store W Reciprocal High | using different ones saves one cycle delay
move secondVtxPos, outputVtxPos // Second and output vertices write to same mem...
vmudl $v29, vPairTPosF, $v30[3] // Persp norm
move secondVtxPos, outputVtxPos // Second and output vertices write to same mem...
vmadm s1WH, vPairTPosI, $v30[3] // Persp norm
bltz $1, @@skipsecond // ...if < 0 verts remain, ...
vmadm s1WH, vPairTPosI, $v30[3] // Persp norm
vmadn s1WL, $v31, $v31[2] // 0
addi secondVtxPos, outputVtxPos, vtxSize // ...otherwise, second vtx is next vtx
@@skipsecond:
vmadn s1WL, $v31, $v31[2] // 0
addi inputVtxPos, inputVtxPos, 2*inputVtxSize
vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high
suv vPairRGBA[4], (VTX_COLOR_VEC )(secondVtxPos)
vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low
@@ -1460,6 +1444,7 @@ sSCH equ $v21 // vtx_store Scaled Clipping High
.if CFG_NO_OCCLUSION_PLANE
vmadn s1WL, s1WL, sWRH[3h]
addi inputVtxPos, inputVtxPos, 2*inputVtxSize
vmadh s1WH, s1WH, sWRH[3h]
srl $24, $10, 4 // Shift second vertex screen clipping to first slots
vch $v29, vPairTPosI, sSCH[3h] // Clip scaled high
@@ -1480,10 +1465,10 @@ sSCH equ $v21 // vtx_store Scaled Clipping High
lsv vPairTPosF[6], (VTX_Z_FRAC )(outputVtxPos) // load Z into W slot, will be for fog below
vmadn s1WL, s1WL, sWRH[3h]
middle_of_vtx_store:
// vPairPosI is $v20
ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
// vPairST is $v22
ldv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3
vmadh s1WH, s1WH, sWRH[3h] // s1WH:s1WL is 1/W
ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos)
llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
// vnop
sll $11, $20, 4 // Shift first vertex scaled clipping to second slots
// vnop
@@ -1497,59 +1482,63 @@ middle_of_vtx_store:
vmadh vPairTPosI, vPairTPosI, s1WH[3h] // pos * 1/W
ssv s1WH[6], (VTX_INV_W_INT )($19)
// vnop
.if !CFG_LEGACY_VTX_PIPE
slv vPairST[4], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Replace vtx 1 ST with vtx 0 RGBA
// vnop
.if CFG_LEGACY_VTX_PIPE
lpv $v14[7], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Y to elem 0, 4
.else
// sVPO is $v17 // vtx_store ViewPort Offset
lqv sVPO, (0x10)(rdpCmdBufEndP1) // Load viewport offset from temp mem
.endif
// vnop
.if !CFG_LEGACY_VTX_PIPE
// sVPS is $v16 // vtx_store ViewPort Scale
vmudl $v29, vPairTPosF, $v30[3] // Persp norm
.if CFG_LEGACY_VTX_PIPE
lpv $v15[6], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Z to elem 0, 4
.else
// sVPS is $v26 // vtx_store ViewPort Scale
lqv sVPS, (0x00)(rdpCmdBufEndP1) // Load viewport scale from temp mem
.endif
vmudl $v29, vPairTPosF, $v30[3] // Persp norm
// vPairST is $v22
llv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1
vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm
llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
// vPairPosI is $v20
ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos)
vmadn vPairTPosF, $v31, $v31[2] // 0
// vDDD is $v26
lpv vDDD[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
// vnop
// vPairRGBA is $v27
luv vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
luv vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair RGBA
// vnop
// vPairNrml is $v16
lpv vPairNrml[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair normals
vmudh $v29, sVPO, vOne // offset * 1
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
vmadn vPairTPosF, vPairTPosF, sVPS // + XYZ * scale
// sCOL is $v17, defined at start of loop
luv sCOL[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Colors as unsigned, lower 4
vmadh vPairTPosI, vPairTPosI, sVPS
or $24, $24, $20 // Combine results for second vertex
vmadh vPairTPosI, vPairTPosI, sVPS
or $10, $10, $11 // Combine results for first vertex
// sFOG is $v25
vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
or $10, $10, $11 // Combine results for first vertex
// vnop
sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags
// vnop
sh $10, (VTX_CLIP )($19) // Store first vertex results
// vnop
ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos)
// sCLZ is $v21 // vtx_store CLipped Z
vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos)
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
slv vPairTPosI[0], (VTX_SCR_VEC )($19)
vmudn $v29, vM3F, vOne
ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos)
vmadh $v29, vM3I, vOne
ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19)
vmadn $v29, vM0F, vPairPosI[0h]
// nop TODO
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos)
vmudn $v29, vM3F, vOne
slv vPairTPosI[0], (VTX_SCR_VEC )($19)
vmadh $v29, vM3I, vOne
blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping
vmadn $v29, vM0F, vPairPosI[0h]
move $ra, $16 // Normally $ra = loop or lighting
skip_return_to_lt_or_loop:
vmadh $v29, vM0I, vPairPosI[0h]
ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos)
addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
vmadn $v29, vM1F, vPairPosI[1h]
ssv sCLZ[4], (VTX_SCR_Z )($19)
ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos)
vmadh $v29, vM1I, vPairPosI[1h]
// vPairNrml is $v16
lpv vPairNrml[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4
ssv sCLZ[4], (VTX_SCR_Z )($19)
// sOUTF = vPairPosF is $v21, or vPairTPosF is $v23
vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords
beqz $7, return_routine // fog disabled
@@ -1576,6 +1565,7 @@ middle_of_vtx_store:
vmadn s1WL, s1WL, $v31[0] // -4
ori $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts
vmadh s1WH, s1WH, $v31[0] // -4
addi inputVtxPos, inputVtxPos, 2*inputVtxSize
vmudn $v29, vPairTPosF, sOCM // X * kx, Y * ky, Z * kz
vmadh $v29, vPairTPosI, sOCM // Int * int
lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below
@@ -1589,40 +1579,43 @@ middle_of_vtx_store:
vmadn s1WL, s1WL, sWRH[3h]
sll $11, $20, 4 // Shift first vertex scaled clipping to second slots
vmadh s1WH, s1WH, sWRH[3h] // s1WH:s1WL is 1/W
andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about
veq $v29, $v31, $v31[3h] // Set VCC to 00010001
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
vmrg sOC1, sOCM, sOC1 // Put constant factor in elems 3, 7
or $24, $24, $20 // Combine results for second vertex
veq $v29, $v31, $v31[3h] // Set VCC to 00010001
blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping
vmrg sOC1, sOCM, sOC1 // Put constant factor in elems 3, 7
middle_of_vtx_store:
move $ra, $16 // Normally $ra = loop or lighting
skip_return_to_lt_or_loop:
vmudl $v29, vPairTPosF, s1WL[3h] // W must be overwritten with Z before here
ssv s1WL[14], (VTX_INV_W_FRAC)(secondVtxPos)
vmadm $v29, vPairTPosI, s1WL[3h]
ssv s1WL[6], (VTX_INV_W_FRAC)(outputVtxPos)
ssv s1WL[6], (VTX_INV_W_FRAC)($19)
vmadn vPairTPosF, vPairTPosF, s1WH[3h]
ssv s1WH[14], (VTX_INV_W_INT )(secondVtxPos)
vmadh vPairTPosI, vPairTPosI, s1WH[3h] // pos * 1/W
ssv s1WH[6], (VTX_INV_W_INT )(outputVtxPos)
ssv s1WH[6], (VTX_INV_W_INT )($19)
vadd sOC1, sOC1, sOC1[0q] // Add pairs upwards
.if !CFG_LEGACY_VTX_PIPE
// sVPO is $v17 // vtx_store ViewPort Offset
lqv sVPO, (0x10)(rdpCmdBufEndP1) // Load viewport offset from temp mem
.endif
// vnop
.if !CFG_LEGACY_VTX_PIPE
.if CFG_LEGACY_VTX_PIPE
addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
.else
// sVPS is $v16 // vtx_store ViewPort Scale
lqv sVPS, (0x00)(rdpCmdBufEndP1) // Load viewport scale from temp mem
.endif
vmudl $v29, vPairTPosF, $v30[3] // Persp norm
middle_of_vtx_store:
// vPairST is $v22
ldv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3
vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm
llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
vmadn vPairTPosF, $v31, $v31[2] // 0
// vPairPosI is $v20
ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos)
vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm
ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
vmadn vPairTPosF, $v31, $v31[2] // 0
// vPairST is $v22
llv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1
vadd sOC1, sOC1, sOC1[1h] // Add elems 1, 5 to 3, 7
llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
// vnop
// sO03 is $v26 // vtx_store Occlusion coeffs 0-3
ldv sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3
@@ -1634,17 +1627,19 @@ middle_of_vtx_store:
lqv sOPM, (0x20)(rdpCmdBufEndP1) // Load occlusion plane -/+4000 constants
.endif
vmadh vPairTPosI, vPairTPosI, sVPS
andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about
// sFOG is $v16
vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
or $10, $10, $11 // Combine results for first vertex
vlt $v29, sOC1, $v31[2] // Occlusion plane equation < 0 in elems 3, 7
slv vPairST[4], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Replace vtx 1 ST with vtx 0 RGBA
// vnop
cfc2 $11, $vcc // Load occlusion plane mid results to bits 3 and 7
// sOSC is $v21 // vtx_store Occlusion SCaled up
vmudh sOSC, vPairTPosI, $v31[4] // 4; scale up x and y
ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos)
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
// nop TODO
or $24, $24, $20 // Combine results for second vertex
// sCLZ is $v25 // vtx_store CLipped Z
vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0
ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19)
@@ -1665,23 +1660,27 @@ middle_of_vtx_store:
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
slv vPairTPosI[0], (VTX_SCR_VEC )($19)
vmrg sOC2, sOC2, sOC3 // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
// vPairNrml is $v16
lpv vPairNrml[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4
.if CFG_LEGACY_VTX_PIPE
lpv $v14[7], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Y to elem 0, 4
.else
addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
.endif
// vnop
ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos)
// vnop
// vDDD is $v26
lpv vDDD[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
.if CFG_LEGACY_VTX_PIPE
lpv $v15[6], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Z to elem 0, 4
.endif
// vnop
ssv sCLZ[4], (VTX_SCR_Z )($19)
vge $v29, sOC2, sO47 // Each compare to coeffs 4-7
// sCOL is $v17, defined at start of loop
luv sCOL[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Colors as unsigned, lower 4
// vPairNrml is $v16
lpv vPairNrml[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair normals
vmudn $v29, vM3F, vOne
cfc2 $20, $vcc
vmadh $v29, vM3I, vOne
// vPairRGBA is $v27
luv vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
luv vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair colors
vmadn $v29, vM0F, vPairPosI[0h]
andi $11, $11, CLIP_OCCLUDED | (CLIP_OCCLUDED >> 4) // Only bits 3, 7 from occlusion
vmadh $v29, vM0I, vPairPosI[0h]
@@ -1876,6 +1875,7 @@ clip_skipxy:
vsubc $v29, vClDiffF, vOne[0] // frac part - 1 for carry
vge vClDiffI, vClDiffI, $v31[2] // 0; If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001),
vmrg vClFade1, vClDiffF, vOne[0] // keep frac part of factor, else set to 1 (min val)
addi inputVtxPos, rdpCmdBufEndP1, 0x30 // Temp mem; vtx load writes a few bytes here
vmudn vClFade2, vClFade1, $v31[1] // signed x * -1 = 0xFFFF - unsigned x! v2[3] is fade factor for on screen vert
lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode, incl fog setting
// Fade between attributes for on screen and off screen vert
@@ -2779,6 +2779,7 @@ lt_continue_setup:
lb $11, dirLightsXfrmValid
li $10, -1 // To mark lights valid
addi $3, $3, altBase // Point to ambient light; stored through vtx proc
andi $17, $5, G_TEXTURE_GEN >> 8 // This is clipPolyRead, but not touched in vtx_store
and $11, $11, $7 // Zero if either matrix or lights invalid
bnez $11, lt_setup_skip_xfrm
sb $10, dirLightsXfrmValid
@@ -2840,11 +2841,8 @@ xfrm_light_post:
li $20, OSTask + OSTask_ucode_data_size
lt_setup_skip_xfrm:
// Load first light direction to $v13, which is not used throughout vtx processing.
lpv $v4[4], (ltBufOfs + 0 - lightSize)($3) // Xfrmed dir in elems 0-2
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
lpv $v5[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6
j vtx_after_lt_setup
vmrg $v13, $v4, $v5 // $v13 = first light direction
lpv $v13[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6
xfrm_single_dir:
vmudn $v29, $v16, $v3[0]
@@ -2877,77 +2875,44 @@ lt_vtx_pair:
//
// LVP main lighting
//
vmulf $v29, vPairNrml, $v13[4] // Normals X elems 0, 4 * first light dir
luv vPairLt, (ltBufOfs + 0)($3) // Total light level, init to ambient
vmulf vAAA, $v13, vPairNrml // First light dir * normals
vmacf $v29, $v14, $v13[5] // Normals Y elems 0, 4 * first light dir
lpv vDDD[0], (ltBufOfs + 8 - 2*lightSize)($3) // Xfrmed dir in elems 4-6
vmacf vAAA, $v15, $v13[6] // Normals Z elems 0, 4 * first light dir
.if CFG_PROFILING_B
addi perfCounterA, perfCounterA, 2 // Increment lit vertex count by 2
.endif
move curLight, $3 // Point to ambient light
lt_loop:
// vnop; vnop
vmudh $v29, vOne, vAAA[0h] // Sum components of dot product
lpv vCCC[4], (ltBufOfs + 0 - 2*lightSize)(curLight) // Xfrmed dir in elems 0-2
vmadh $v29, vOne, vAAA[1h]
lpv vDDD[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6
vmadh vAAA, vOne, vAAA[2h]
beq curLight, altBaseReg, lt_post
luv vBBB, (ltBufOfs + 0 - lightSize)(curLight) // Light color
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
addi curLight, curLight, -lightSize
vmrg vCCC, vCCC, vDDD // vCCC = light direction
vge vAAA, vAAA, $v31[2] // 0; clamp dot product to >= 0
// vnop; vnop
vmudh $v29, vOne, vPairLt // Load accum mid with current light level
vmacf vPairLt, vBBB, vAAA[0h] // + light color * dot product
bne curLight, altBaseReg, lt_loop
vmulf vAAA, vCCC, vPairNrml // Light dir * normals
/*
.if CFG_PROFILING_B
addi perfCounterA, perfCounterA, 2 // Increment lit vertex count by 2
.endif
vmulf $v29, $v14, $v13[4] // Normals X elems 0, 4 * first light dir
luv vPairLt, (ltBufOfs + 0)($3) // Total light level, init to ambient
vmacf $v29, $v15, $v13[5] // Normals Y elems 0, 4 * first light dir
lpv vDDD[0], (ltBufOfs + 8 - XXX)(curLight) // Xfrmed dir in elems 4-6
vmacf vAAA, vPairNrml, $v13[6] // Normals Z elems 0, 4 * first light dir
beq $3, altBaseReg, lt_post
move curLight, $3 // Point to ambient light
lpv $v18[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut
move curLight, $3 // Point to ambient light
// vnop
lt_loop:
vge vCCC, vAAA, $v31[2] // 0; clamp dot product to >= 0
vmulf $v29, $v14, vDDD[4] // Normals X elems 0, 4
vmulf $v29, vPairNrml, vDDD[4] // Normals X elems 0, 4
luv vBBB, (ltBufOfs + 0 - 1*lightSize)(curLight) // Light color
vmacf $v29, $v14, vDDD[5] // Normals Y elems 0, 4
addi curLight, curLight, -lightSize
vmacf $v29, $v15, vDDD[5] // Normals Y elems 0, 4
luv vBBB, (ltBufOfs + 0 - XXX)(curLight) // Light color
vmacf vAAA, vPairNrml, vDDD[6] // Normals Z elems 0, 4
lpv vDDD[0], (ltBufOfs + 8 - XXX)(curLight) // Xfrmed dir in elems 4-6
vmacf vAAA, $v15, vDDD[6] // Normals Z elems 0, 4
lpv vDDD[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6
vmudh $v29, vOne, vPairLt // Load accum mid with current light level
bne curLight, altBaseReg, lt_loop
vmacf vPairLt, vBBB, vCCC[0h] // + light color * dot product
*/
lt_post:
andi $11, $5, G_TEXTURE_GEN >> 8
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
beqz $11, vtx_return_from_lighting
beqz $17, vtx_return_from_lighting
vmrg vPairRGBA, vPairLt, vPairRGBA // RGB = light, A = vtx alpha
.endif
// These definitions are shared by both versions
vLookat1 equ vAAA
vLookat0 equ vPairLt
.if CFG_LEGACY_VTX_PIPE
lpv vCCC[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, lookat 1 in 4-6
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
lpv vLookat0[4], (xfrmLookatDirs + 0)($zero) // Garbage in 0-2, lookat 0 in 4-6
lpv vLookat1[4], (xfrmLookatDirs - 8)($zero) // Lookat 1 in 0-2, garbage in 4-6
vmrg vLookat0, vCCC, vLookat0 // Lookat 0 in 0-2, 4-6
vmrg vLookat1, vLookat1, vCCC // Lookat 1 in 0-2, 4-6
vmulf vLookat0, vPairNrml, vLookat0 // Normal * lookat 0 dir
vmulf vLookat1, vPairNrml, vLookat1 // Normal * lookat 1 dir
vmudh $v29, vOne, vLookat0[0h] // Sum components of dot product
vmadh $v29, vOne, vLookat0[1h]
vmadh vLookat0, vOne, vLookat0[2h] // vLookat0 = dot product 0
vmulf $v29, vPairNrml, $v18[0] // Normals X elems 0, 4 * lookat 0 X
vmacf $v29, $v14, $v18[1] // Normals Y elems 0, 4 * lookat 0 Y
vmacf vLookat0, $v15, $v18[2] // Normals Z elems 0, 4 * lookat 0 Z
vmulf $v29, vPairNrml, $v18[4] // Normals X elems 0, 4 * lookat 1 X
vmacf $v29, $v14, $v18[5] // Normals Y elems 0, 4 * lookat 1 Y
vmacf vLookat1, $v15, $v18[6] // Normals Z elems 0, 4 * lookat 1 Z
// Continue to rest of texgen shared by both versions.
.endif
@@ -2961,10 +2926,12 @@ vLookat0 equ vPairLt
// Outputs: leave alone vPairPosI/F; update vPairRGBA, vPairST
// Locals: vAAA and vBBB after merge and normals selection, vCCC, vDDD, vPairLt, vNrmOut
// New available locals: $6 (existing: $11, $10, $20, $24)
vmrg vPairNrml, vPairNrml, vDDD // Merge normals
beqz $11, lt_skip_packed_normals
// Elems 4-5 get bytes 6-7 of the following vertex (1)
lpv vBBB[6], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // Upper 2 in 4:5
// Elems 0-1 get bytes 6-7 of the following vertex (0)
lpv vAAA[2], (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
vmrg vAAA, vAAA, vBBB // Merge packed normals
// Packed normals algorithm. This produces a vector (one for each input vertex)
// in vPairNrml such that |X| + |Y| + |Z| = 0x7F00 (called L1 norm), in the
@@ -3149,11 +3116,11 @@ lt_skip_fresnel:
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
vmrg vLookat1, vLookat1, vDDD // vLookat1 = lookat 1 dir
vmulf vLookat1, vPairNrml, vLookat1 // Normal * lookat 1 dir
.endif
// Rest of texgen shared by F3DEX3 native and LVP
vmudh $v29, vOne, vLookat1[0h]
vmadh $v29, vOne, vLookat1[1h]
vmadh vLookat1, vOne, vLookat1[2h]
.endif
// Rest of texgen shared by F3DEX3 native and LVP
vne $v29, $v31, $v31[1h] // Set VCC to 10111011
andi $11, $5, G_TEXTURE_GEN_LINEAR >> 8
vmrg vLookat0, vLookat0, vLookat1[0h] // Dot products in elements 0, 1, 4, 5

19
notes.s
View File

@@ -17,3 +17,22 @@ vmadh vOut, vOne, vIn[2h]
vmudh $v29, $v31, $v31[2] // 0; clear whole accumulator
vadd $v29, vIn, vIn[1h] // accum lo 0 = 0 + 1, 4 = 4 + 5
vmadn vOut, vOne, vIn[2h] // + 2,6; built-in saturation (clamping) ends up not a problem
// lpv patterns. Same works for luv. Assuming $11 is 8 byte aligned (does not have
// to be 16 byte aligned).
// Elem 0 1 2 3 4 5 6 7
lpv $v27[1], (-8)($11) // 07 F8 F9 FA FB FC FD FE Byte
lpv $v27[2], (-8)($11) // 06 07 F8 F9 FA FB FC FD addr
lpv $v27[3], (-8)($11) // 05 06 07 F8 F9 FA FB FC relative
lpv $v27[4], (-8)($11) // 04 05 06 07 F8 F9 FA FB to
lpv $v27[5], (-8)($11) // 03 04 05 06 07 F8 F9 FA $11
lpv $v27[6], (-8)($11) // 02 03 04 05 06 07 F8 F9
lpv $v27[7], (-8)($11) // 01 02 03 04 05 06 07 F8
lpv $v27[0], ( 0)($11) // 00 01 02 03 04 05 06 07
lpv $v27[1], ( 0)($11) // 0F 00 01 02 03 04 05 06
lpv $v27[2], ( 0)($11) // 0E 0F 00 01 02 03 04 05
lpv $v27[3], ( 0)($11) // 0D 0E 0F 00 01 02 03 04
lpv $v27[4], ( 0)($11) // 0C 0D 0E 0F 00 01 02 03
lpv $v27[5], ( 0)($11) // 0B 0C 0D 0E 0F 00 01 02
lpv $v27[6], ( 0)($11) // 0A 0B 0C 0D 0E 0F 00 01
lpv $v27[7], ( 0)($11) // 09 0A 0B 0C 0D 0E 0F 00