// File: F3DEX3/rsp/vtx/loop_occ.s
// Snapshot: 2025-11-29 16:27:31 -08:00 (176 lines, 9.2 KiB; host tagged it "ArmAsm",
// but this is N64 RSP microcode assembly)
// -----------------------------------------------------------------------------
// F3DEX3 vertex main loop, occlusion-plane ("OCC") variant.
// Processes TWO input vertices per iteration:
//   - transforms model-space positions (vpMdl) by the matrix rows vMTX0..vMTX3,
//   - perspective-divides via the vrcph/vrcpl + Newton-Raphson refine sequence,
//   - applies viewport scale/offset (sVPS/sVPO) to produce screen coords,
//   - scales/offsets ST texture coords, computes fog alpha from clamped W,
//   - evaluates the occlusion plane (sOCM) and edge equations (sO47/sO03),
//     setting CLIP_OCCLUDED in a vertex's clip flags when all equations pass.
//
// Scheduling: lines alternate vector (VU) / scalar (SU) instructions so the
// RSP dual-issues one of each per cycle; the cycle budget is stated in the
// header comment below. Do NOT reorder instructions without re-verifying both
// the dual-issue pairing and the accumulator/VCC lifetimes between them.
//
// NOTE(review): comments of the form "xABC <- yDEF" appear to record register
// alias handoffs -- the same physical $v register reused under a new symbolic
// name from that point on. Confirm against the register name definitions.
// NOTE(review): $v29 is written by many multiply ops but never stored; it
// looks like a write-discard destination whose real result is taken from the
// accumulator (vreadacc, or continued via vmadn/vmadh) or from VCC -- confirm.
// -----------------------------------------------------------------------------
align_with_warning 8, "One instruction of padding before vertex loop"
// 70 cycles, 16 more than NOC
// 6 vu cycles for plane, 8 vu cycles for edges, 0 more vnops than NOC,
// 1 branch delay slot with SU instr, 1 land-after-branch.
vtx_loop_no_lighting:
// lDTC <- sVPS
// lVCI <- sRTI
// vpLtTot <- sTCL
// vpNrmlX <- s1WF
// lDIR <- sOTM
// --- Finish previous iteration: merge occlusion edge results, store fog ---
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
sub $11, outVtx1, fogFlag // Points 8 before outVtx1 if fog, else 0
vmrg sOCS, sOCS, sOTM // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
sbv sFOG[7], (VTX_COLOR_A + 8)($11) // ...which gets overwritten below
vmrg vpScrF, vpScrF, sCLZ[2h] // Z int elem 2, 6 to elem 1, 5; Z frac in elem 2, 6
// vpRGBA <- lDIR
luv vpRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
// lDOT <- sCLZ
// lCOL <- sFOG
vtx_return_from_texgen:
vmudm $v29, vpST, sSTS // Scale ST
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
vmadh vpST, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5
addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx
vtx_return_from_lighting:
// --- Occlusion edge compares for the previous pair; MVP transform of the
// --- next pair's model-space positions runs interleaved on the VU below.
vge $v29, sOCS, sO47 // Each compare to coeffs 4-7
slv vpScrI[0], (VTX_SCR_VEC )(outVtx1)
vmudn $v29, vMTX3F, vOne
cfc2 $11, $vcc
vmadh $v29, vMTX3I, vOne
slv vpScrF[10], (VTX_SCR_Z )(outVtx2)
vmadn $v29, vMTX0F, vpMdl[0h]
addi inVtx, inVtx, (2 * inputVtxSize) // Advance two positions forward in the input vertices
vmadh $v29, vMTX0I, vpMdl[0h]
slv vpScrF[2], (VTX_SCR_Z )(outVtx1)
vmadn $v29, vMTX1F, vpMdl[1h]
or $11, $11, $10 // Combine occlusion results. Any set in 0-3, 4-7 = not occluded
vmadh $v29, vMTX1I, vpMdl[1h]
andi $10, $11, 0x000F // Bits 0-3 for vtx 1
// vpClpF <- lDOT
vmadn vpClpF, vMTX2F, vpMdl[2h]
addi $11, $11, -(0x0010) // If not occluded, atl 1 of 4-7 set, so $11 >= 0x10. Else $11 < 0x10.
// vpClpI <- lCOL
vmadh vpClpI, vMTX2I, vpMdl[2h]
bnez $10, @@skipv1 // If nonzero, at least one equation false, don't set occluded flag
andi $11, $11, CLIP_OCCLUDED // This is bit 11, = sign bit b/c |$11| <= 0xFF
ori flagsV1, flagsV1, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag
@@skipv1:
// 16 cycles
// Shared store path; the name suggests it is also entered from the clipping
// code with vpClpI:vpClpF = clip-space position -- TODO confirm caller.
vtx_store_for_clip:
vmudl $v29, vpClpF, $v30[3] // Persp norm
or flagsV2, flagsV2, $11 // occluded = $11 negative = sign bit set = $11 is flag, else 0
// s1WI <- vpMdl
vmadm s1WI, vpClpI, $v30[3] // Persp norm
sh flagsV2, (VTX_CLIP )(outVtx2) // Store second vertex clip flags
// s1WF <- vpNrmlX
vmadn s1WF, $v31, $v31[2] // 0
blez vtxLeft, vtx_epilogue
vmudn $v29, vpClpF, sOCM // X * kx, Y * ky, Z * kz
vmadh $v29, vpClpI, sOCM // Int * int
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
// --- Reciprocal seed for 1/W of both vertices (W in elems 3 and 7) ---
vrcph $v29[0], s1WI[3]
addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2
// sRTF <- lDTC
vrcpl sRTF[2], s1WF[3]
sra $11, vtxLeft, 31 // All 1s if on single-vertex last iter
// sRTI <- lVCI
vrcph sRTI[3], s1WI[7]
andi $11, $11, vtxSize // vtxSize if on single-vertex last iter, else normally 0
vrcpl sRTF[6], s1WF[7]
sub outVtx2, outVtxBase, $11 // First output vtx on last iter, else second
vrcph sRTI[7], $v31[2] // 0
addi outVtx1, outVtxBase, -vtxSize // First output vtx always
vreadacc sOCS, ACC_UPPER // Load int * int portion
suv vpRGBA[4], (VTX_COLOR_VEC )(outVtx2) // Store RGBA for second vtx
vch $v29, vpClpI, vpClpI[3h] // Clip screen high
suv vpRGBA[0], (VTX_COLOR_VEC )(outVtx1) // Store RGBA for first vtx
// --- Newton-Raphson refinement pass 1 of the 1/W estimate ---
vmudl $v29, s1WF, sRTF[2h]
sdv vpClpI[8], (VTX_INT_VEC )(outVtx2)
vmadm $v29, s1WI, sRTF[2h]
sdv vpClpI[0], (VTX_INT_VEC )(outVtx1)
vmadn s1WF, s1WF, sRTI[3h]
sdv vpClpF[8], (VTX_FRAC_VEC )(outVtx2)
vmadh s1WI, s1WI, sRTI[3h]
sdv vpClpF[0], (VTX_FRAC_VEC )(outVtx1)
vcl $v29, vpClpF, vpClpF[3h] // Clip screen low
sqv vpClpI, (tempVpRGBA)(rdpCmdBufEndP1) // For Z to W manip. RGBA not currently stored here
vmudh $v29, vOne, $v31[4] // 4
cfc2 flagsV1, $vcc // Screen clip results
vmadn s1WF, s1WF, $v31[0] // -4
ssv vpClpI[4], (tempVpRGBA + 6)(rdpCmdBufEndP1) // First Z to W
vmadh s1WI, s1WI, $v31[0] // -4
// sTCL <- vpLtTot
ldv sTCL[0], (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1, RGBA in 2:3
// sSCF <- vpScrF
vmudn sSCF, vpClpF, $v31[3] // W * clip ratio for scaled clipping
ssv vpClpI[12], (tempVpRGBA + 14)(rdpCmdBufEndP1) // Second Z to W
// sSCI <- vpScrI
vmadh sSCI, vpClpI, $v31[3] // W * clip ratio for scaled clipping
lsv vpClpF[14], (VTX_Z_FRAC )(outVtx2) // load Z into W slot, will be for fog below
// --- Newton-Raphson refinement pass 2; final s1WI:s1WF = 1/W ---
vmudl $v29, s1WF, sRTF[2h]
lqv vpClpI, (tempVpRGBA)(rdpCmdBufEndP1) // Load int part with Z in W
vmadm $v29, s1WI, sRTF[2h]
lsv vpClpF[6], (VTX_Z_FRAC )(outVtx1) // load Z into W slot, will be for fog below
vmadn s1WF, s1WF, sRTI[3h]
ldv sTCL[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5, RGBA in 6:7
vmadh s1WI, s1WI, sRTI[3h]
srl flagsV2, flagsV1, 4 // Shift second vertex screen clipping to first slots
vch $v29, vpClpI, sSCI[3h] // Clip scaled high
andi flagsV2, flagsV2, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
vcl $v29, vpClpF, sSCF[3h] // Clip scaled low
slv vpST[8], (VTX_TC_VEC )(outVtx2) // Store scaled S, T vertex 2
vmudl $v29, vpClpF, s1WF[3h] // Pos times inv W
cfc2 $11, $vcc // Scaled clip results
vmadm $v29, vpClpI, s1WF[3h] // Pos times inv W
slv vpST[0], (VTX_TC_VEC )(outVtx1) // Store scaled S, T vertex 1
vmadn vpClpF, vpClpF, s1WI[3h]
// sVPO <- sSCF
ldv sVPO[0], (viewport + 8)($zero) // Load viewport offset incl. fog for first vertex
vmadh vpClpI, vpClpI, s1WI[3h] // vpClpI:vpClpF = pos times inv W
ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2)
// sOTM <- vpRGBA
vadd sOTM, sOCS, sOCS[1h] // Add Y to X
ldv sVPO[8], (viewport + 8)($zero) // Load viewport offset incl. fog for second vertex
vcopy vpST, sTCL
ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1)
vmudl $v29, vpClpF, $v30[3] // Persp norm
// sVPS <- sSCI
ldv sVPS[0], (viewport)($zero) // Load viewport scale incl. fog for first vertex
vmadm vpClpI, vpClpI, $v30[3] // Persp norm
ssv s1WI[14], (VTX_INV_W_INT )(outVtx2)
vmadn vpClpF, $v31, $v31[2] // 0; Now vpClpI:vpClpF = projected position
ldv sVPS[8], (viewport)($zero) // Load viewport scale incl. fog for second vertex
vadd sOCS, sOTM, sOCS[2h] // Add Z to X
ssv s1WI[6], (VTX_INV_W_INT )(outVtx1)
vmov sTCL[4], vpST[2] // First vtx RG to elem 4
andi flagsV1, flagsV1, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
// --- Viewport transform: screen = projected * scale + offset; the same
// --- madd chain also produces fog (elems 3, 7 carry clamped W-derived fog) ---
vmudh $v29, sVPO, vOne // offset * 1
sll $10, $11, 4 // Shift first vertex scaled clipping to second slots
// vpScrF <- sVPO
vmadn vpScrF, vpClpF, sVPS // + pos frac * scale
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
// vpScrI <- sVPS
vmadh vpScrI, vpClpI, sVPS // int part, vpScrI:vpScrF is now screen space pos
or flagsV2, flagsV2, $11 // Combine results for second vertex
// sFOG <- vpClpI
vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
vlt $v29, sOCS, sOCM[3h] // Occlusion plane X+Y+Z<C in elems 0, 4
or flagsV1, flagsV1, $10 // Combine results for first vertex
vmov sTCL[5], vpST[3] // First vtx BA to elem 5
cfc2 $10, $vcc // Load occlusion plane mid results to bits 3 and 7
vmudh sOTM, vpScrI, $v31[4] // 4; scale up x and y
// vpMdl <- s1WI
// Loop entry point: loads the NEXT pair's positions while this pair's
// occlusion edge equations (finished at the top of the loop) are evaluated.
vtx_store_loop_entry:
ldv vpMdl[0], (VTX_IN_OB + 0 * inputVtxSize)(inVtx) // Pos of 1st vector for next iteration
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
ldv vpMdl[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // Pos of 2nd vector on next iteration
// vnop
andi $10, $10, (1 << 0) | (1 << 4) // Only bits 0, 4 from occlusion
vmulf $v29, sOPM, vpScrI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2
sub $11, outVtx2, fogFlag // Points 8 before outVtx2 if fog, else 0
vmacf sOCS, sO03, sOTM[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2
sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order
vmulf $v29, sOPM, vpScrI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2
sbv sFOG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
vmacf sOTM, sO03, sOTM[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2
// NOTE(review): vLoopRet presumably selects the per-material loop variant
// (lighting / texgen return points above) -- confirm where it is set.
jr vLoopRet
// sCLZ <- vpClpF
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
// vnop in land slot
vtx_epilogue:
// Fallthrough (across the versions boundary)