mirror of
https://github.com/HackerN64/F3DEX3.git
synced 2026-01-21 10:37:45 -08:00
176 lines
9.2 KiB
ArmAsm
176 lines
9.2 KiB
ArmAsm
align_with_warning 8, "One instruction of padding before vertex loop"
|
|
|
|
// 70 cycles, 16 more than NOC
|
|
// 6 vu cycles for plane, 8 vu cycles for edges, 0 more vnops than NOC,
|
|
// 1 branch delay slot with SU instr, 1 land-after-branch.
|
|
vtx_loop_no_lighting:
// Loop entry for the no-lighting path (per the label name). There is no branch
// in this segment, so execution falls through into vtx_return_from_texgen.
// Scalar (SU) and vector (VU) instructions are interleaved line by line; the
// cycle counts quoted above depend on this exact order.
// NOTE(review): the "A <- B" comments appear to mark a vector register changing
// its alias from B to A at that point -- confirm against the register alias
// definitions, which are not part of this excerpt.
|
|
// lDTC <- sVPS
|
|
// lVCI <- sRTI
|
|
// vpLtTot <- sTCL
|
|
// vpNrmlX <- s1WF
|
|
// lDIR <- sOTM
|
|
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
|
|
sub $11, outVtx1, fogFlag // Points 8 before outVtx1 if fog, else 0
|
|
// Uses the VCC pattern set by the veq above to pick elements from sOCS or sOTM.
vmrg sOCS, sOCS, sOTM // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
|
|
// Per the comment on the sub above, $11 is outVtx1 - 8 with fog and outVtx1
// without, so the +8 here lands on VTX_COLOR_A only when fog is on.
sbv sFOG[7], (VTX_COLOR_A + 8)($11) // ...which gets overwritten below
|
|
vmrg vpScrF, vpScrF, sCLZ[2h] // Z int elem 2, 6 to elem 1, 5; Z frac in elem 2, 6
|
|
// vpRGBA <- lDIR
|
|
// Reads back from the same tempVpRGBA address that vtx_store_loop_entry writes
// with sdv.
luv vpRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
|
|
// lDOT <- sCLZ
|
|
// lCOL <- sFOG
|
|
vtx_return_from_texgen:
// Multiply-accumulates the pair's texture coordinates (vpST * sSTS, then
// + vOne * $v30, the ST offset per the comment below), stores the second
// vertex's screen vector, and advances outVtxBase by two output vertices.
// No branch here: execution falls through into vtx_return_from_lighting.
// NOTE(review): presumably the texgen code jumps back to this label -- that
// jump is not in this excerpt.
|
|
vmudm $v29, vpST, sSTS // Scale ST
|
|
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
|
|
vmadh vpST, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5
|
|
addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx
|
|
vtx_return_from_lighting:
// Three things are interleaved in this segment:
//  * VU: accumulate vMTX3 * vOne + vMTX0 * vpMdl[0h] + vMTX1 * vpMdl[1h]
//    + vMTX2 * vpMdl[2h]; only the last two ops write a result, giving
//    vpClpI:vpClpF (int:frac).
//    NOTE(review): presumably the position-by-matrix transform -- confirm.
//  * SU stores: screen vector and screen Z to outVtx1 / outVtx2.
//  * SU flags: combine the vge result (read via cfc2) with $10 and derive the
//    CLIP_OCCLUDED bit for each vertex. flagsV1 is updated here; $11 is left
//    holding either CLIP_OCCLUDED or 0 and is ORed into flagsV2 at the start of
//    vtx_store_for_clip.
// Falls through into vtx_store_for_clip.
|
|
vge $v29, sOCS, sO47 // Each compare to coeffs 4-7
|
|
slv vpScrI[0], (VTX_SCR_VEC )(outVtx1)
|
|
vmudn $v29, vMTX3F, vOne
|
|
cfc2 $11, $vcc
|
|
vmadh $v29, vMTX3I, vOne
|
|
slv vpScrF[10], (VTX_SCR_Z )(outVtx2)
|
|
vmadn $v29, vMTX0F, vpMdl[0h]
|
|
addi inVtx, inVtx, (2 * inputVtxSize) // Advance two positions forward in the input vertices
|
|
vmadh $v29, vMTX0I, vpMdl[0h]
|
|
slv vpScrF[2], (VTX_SCR_Z )(outVtx1)
|
|
vmadn $v29, vMTX1F, vpMdl[1h]
|
|
// $10 still holds the occlusion bits that vtx_store_loop_entry masked to bits 0 and 4.
or $11, $11, $10 // Combine occlusion results. Any set in 0-3, 4-7 = not occluded
|
|
vmadh $v29, vMTX1I, vpMdl[1h]
|
|
andi $10, $11, 0x000F // Bits 0-3 for vtx 1
|
|
// vpClpF <- lDOT
|
|
vmadn vpClpF, vMTX2F, vpMdl[2h]
|
|
addi $11, $11, -(0x0010) // If not occluded, at least 1 of 4-7 set, so $11 >= 0x10. Else $11 < 0x10.
|
|
// vpClpI <- lCOL
|
|
vmadh vpClpI, vMTX2I, vpMdl[2h]
|
|
// The andi after this branch sits in its delay slot, so it executes whether or
// not the branch is taken.
bnez $10, @@skipv1 // If nonzero, at least one equation false, don't set occluded flag
|
|
andi $11, $11, CLIP_OCCLUDED // This is bit 11, = sign bit b/c |$11| <= 0xFF
|
|
// Reached only when $10 == 0 (branch above not taken).
ori flagsV1, flagsV1, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag
|
|
@@skipv1:
|
|
// 16 cycles
|
|
vtx_store_for_clip:
// Per-pair clip / projection stage. Reading the code below, it:
//  * multiplies vpClpI:vpClpF by $v30[3] (persp norm, per the comments) into
//    s1WI:s1WF and stores both vertices' clip flags;
//  * branches to vtx_epilogue when vtxLeft <= 0;
//  * takes reciprocals (vrcph/vrcpl) of elements 3 and 7 of s1WI:s1WF, which
//    the later comments call the W slots;
//  * runs the screen (vch/vcl against W) and scaled (against W * clip ratio)
//    clip compares and builds flagsV1 / flagsV2 from the VCC results;
//  * stores clip-space int/frac position, RGBA, ST and inverse W to outVtx1 and
//    outVtx2;
//  * applies the viewport scale and offset to get vpScrI:vpScrF, and starts the
//    fog and occlusion-plane computations.
// No branch at the end: execution falls through into vtx_store_loop_entry.
|
|
vmudl $v29, vpClpF, $v30[3] // Persp norm
|
|
or flagsV2, flagsV2, $11 // occluded = $11 negative = sign bit set = $11 is flag, else 0
|
|
// s1WI <- vpMdl
|
|
vmadm s1WI, vpClpI, $v30[3] // Persp norm
|
|
sh flagsV2, (VTX_CLIP )(outVtx2) // Store second vertex clip flags
|
|
// s1WF <- vpNrmlX
|
|
vmadn s1WF, $v31, $v31[2] // 0
|
|
// The vmudn after this branch is in its delay slot, so it issues even when the
// branch to vtx_epilogue is taken.
blez vtxLeft, vtx_epilogue
|
|
vmudn $v29, vpClpF, sOCM // X * kx, Y * ky, Z * kz
|
|
vmadh $v29, vpClpI, sOCM // Int * int
|
|
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
|
|
vrcph $v29[0], s1WI[3]
|
|
addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2
|
|
// sRTF <- lDTC
|
|
vrcpl sRTF[2], s1WF[3]
|
|
// When vtxLeft is negative after the decrement (single-vertex last iteration,
// per the comments), $11 becomes vtxSize and outVtx2 below ends up equal to
// outVtx1: both are outVtxBase - vtxSize.
sra $11, vtxLeft, 31 // All 1s if on single-vertex last iter
|
|
// sRTI <- lVCI
|
|
vrcph sRTI[3], s1WI[7]
|
|
andi $11, $11, vtxSize // vtxSize if on single-vertex last iter, else normally 0
|
|
vrcpl sRTF[6], s1WF[7]
|
|
sub outVtx2, outVtxBase, $11 // First output vtx on last iter, else second
|
|
vrcph sRTI[7], $v31[2] // 0
|
|
addi outVtx1, outVtxBase, -vtxSize // First output vtx always
|
|
vreadacc sOCS, ACC_UPPER // Load int * int portion
|
|
suv vpRGBA[4], (VTX_COLOR_VEC )(outVtx2) // Store RGBA for second vtx
|
|
vch $v29, vpClpI, vpClpI[3h] // Clip screen high
|
|
suv vpRGBA[0], (VTX_COLOR_VEC )(outVtx1) // Store RGBA for first vtx
|
|
// NOTE(review): this s1W * sRT multiply-accumulate, the *4 / *-4 step that
// follows, and the second s1W * sRT pass look like refinement of the reciprocal
// estimate from vrcph/vrcpl -- confirm against the RSP documentation.
vmudl $v29, s1WF, sRTF[2h]
|
|
sdv vpClpI[8], (VTX_INT_VEC )(outVtx2)
|
|
vmadm $v29, s1WI, sRTF[2h]
|
|
sdv vpClpI[0], (VTX_INT_VEC )(outVtx1)
|
|
vmadn s1WF, s1WF, sRTI[3h]
|
|
sdv vpClpF[8], (VTX_FRAC_VEC )(outVtx2)
|
|
vmadh s1WI, s1WI, sRTI[3h]
|
|
sdv vpClpF[0], (VTX_FRAC_VEC )(outVtx1)
|
|
vcl $v29, vpClpF, vpClpF[3h] // Clip screen low
|
|
// Round trip through tempVpRGBA: store all of vpClpI here, patch the two W
// slots with Z using the ssv stores below, then reload the vector with lqv.
sqv vpClpI, (tempVpRGBA)(rdpCmdBufEndP1) // For Z to W manip. RGBA not currently stored here
|
|
vmudh $v29, vOne, $v31[4] // 4
|
|
cfc2 flagsV1, $vcc // Screen clip results
|
|
vmadn s1WF, s1WF, $v31[0] // -4
|
|
ssv vpClpI[4], (tempVpRGBA + 6)(rdpCmdBufEndP1) // First Z to W
|
|
vmadh s1WI, s1WI, $v31[0] // -4
|
|
// sTCL <- vpLtTot
|
|
ldv sTCL[0], (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1, RGBA in 2:3
|
|
// sSCF <- vpScrF
|
|
vmudn sSCF, vpClpF, $v31[3] // W * clip ratio for scaled clipping
|
|
ssv vpClpI[12], (tempVpRGBA + 14)(rdpCmdBufEndP1) // Second Z to W
|
|
// sSCI <- vpScrI
|
|
vmadh sSCI, vpClpI, $v31[3] // W * clip ratio for scaled clipping
|
|
lsv vpClpF[14], (VTX_Z_FRAC )(outVtx2) // load Z into W slot, will be for fog below
|
|
vmudl $v29, s1WF, sRTF[2h]
|
|
lqv vpClpI, (tempVpRGBA)(rdpCmdBufEndP1) // Load int part with Z in W
|
|
vmadm $v29, s1WI, sRTF[2h]
|
|
lsv vpClpF[6], (VTX_Z_FRAC )(outVtx1) // load Z into W slot, will be for fog below
|
|
vmadn s1WF, s1WF, sRTI[3h]
|
|
ldv sTCL[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5, RGBA in 6:7
|
|
vmadh s1WI, s1WI, sRTI[3h]
|
|
// flagsV1 still holds the raw screen-clip VCC word read by cfc2 above; it is
// masked separately further down.
srl flagsV2, flagsV1, 4 // Shift second vertex screen clipping to first slots
|
|
vch $v29, vpClpI, sSCI[3h] // Clip scaled high
|
|
andi flagsV2, flagsV2, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
|
|
vcl $v29, vpClpF, sSCF[3h] // Clip scaled low
|
|
slv vpST[8], (VTX_TC_VEC )(outVtx2) // Store scaled S, T vertex 2
|
|
vmudl $v29, vpClpF, s1WF[3h] // Pos times inv W
|
|
cfc2 $11, $vcc // Scaled clip results
|
|
vmadm $v29, vpClpI, s1WF[3h] // Pos times inv W
|
|
slv vpST[0], (VTX_TC_VEC )(outVtx1) // Store scaled S, T vertex 1
|
|
vmadn vpClpF, vpClpF, s1WI[3h]
|
|
// sVPO <- sSCF
|
|
ldv sVPO[0], (viewport + 8)($zero) // Load viewport offset incl. fog for first vertex
|
|
vmadh vpClpI, vpClpI, s1WI[3h] // vpClpI:vpClpF = pos times inv W
|
|
ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2)
|
|
// sOTM <- vpRGBA
|
|
vadd sOTM, sOCS, sOCS[1h] // Add Y to X
|
|
ldv sVPO[8], (viewport + 8)($zero) // Load viewport offset incl. fog for second vertex
|
|
vcopy vpST, sTCL
|
|
ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1)
|
|
vmudl $v29, vpClpF, $v30[3] // Persp norm
|
|
// sVPS <- sSCI
|
|
ldv sVPS[0], (viewport)($zero) // Load viewport scale incl. fog for first vertex
|
|
vmadm vpClpI, vpClpI, $v30[3] // Persp norm
|
|
ssv s1WI[14], (VTX_INV_W_INT )(outVtx2)
|
|
vmadn vpClpF, $v31, $v31[2] // 0; Now vpClpI:vpClpF = projected position
|
|
ldv sVPS[8], (viewport)($zero) // Load viewport scale incl. fog for second vertex
|
|
vadd sOCS, sOTM, sOCS[2h] // Add Z to X
|
|
ssv s1WI[6], (VTX_INV_W_INT )(outVtx1)
|
|
vmov sTCL[4], vpST[2] // First vtx RG to elem 4
|
|
andi flagsV1, flagsV1, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
|
|
vmudh $v29, sVPO, vOne // offset * 1
|
|
// $11 holds the scaled-clip VCC word; the shifted copy in $10 is for the first
// vertex and the unshifted $11 is for the second.
sll $10, $11, 4 // Shift first vertex scaled clipping to second slots
|
|
// vpScrF <- sVPO
|
|
vmadn vpScrF, vpClpF, sVPS // + pos frac * scale
|
|
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
|
|
// vpScrI <- sVPS
|
|
vmadh vpScrI, vpClpI, sVPS // int part, vpScrI:vpScrF is now screen space pos
|
|
or flagsV2, flagsV2, $11 // Combine results for second vertex
|
|
// sFOG <- vpClpI
|
|
vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
|
|
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
|
|
vlt $v29, sOCS, sOCM[3h] // Occlusion plane X+Y+Z<C in elems 0, 4
|
|
or flagsV1, flagsV1, $10 // Combine results for first vertex
|
|
vmov sTCL[5], vpST[3] // First vtx BA to elem 5
|
|
// $10 is reused from here on for the occlusion-plane VCC result.
cfc2 $10, $vcc // Load occlusion plane mid results to bits 3 and 7
|
|
vmudh sOTM, vpScrI, $v31[4] // 4; scale up x and y
|
|
// vpMdl <- s1WI
|
|
vtx_store_loop_entry:
// Tail of the per-pair work, and (per the label name) an entry point into the
// loop. Reading the code below, it:
//  * loads the next pair's positions into vpMdl from inVtx;
//  * clamps sFOG from below at 0x7F00 and stores one fog byte for outVtx2;
//  * masks $10 down to occlusion bits 0 and 4;
//  * forms the occlusion edge products in sOCS / sOTM from vpScrI and sOTM;
//  * writes the pair's RGBA (sTCL elems 4-7) to tempVpRGBA;
//  * jumps through the register vLoopRet.
// NOTE(review): vLoopRet presumably holds the address of vtx_loop_no_lighting
// or of a lighting routine -- where it is set is not in this excerpt.
|
|
ldv vpMdl[0], (VTX_IN_OB + 0 * inputVtxSize)(inVtx) // Pos of 1st vector for next iteration
|
|
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
|
|
ldv vpMdl[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // Pos of 2nd vector on next iteration
|
|
// vnop
|
|
andi $10, $10, (1 << 0) | (1 << 4) // Only bits 0, 4 from occlusion
|
|
vmulf $v29, sOPM, vpScrI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2
|
|
sub $11, outVtx2, fogFlag // Points 8 before outVtx2 if fog, else 0
|
|
vmacf sOCS, sO03, sOTM[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2
|
|
// The luv at vtx_loop_no_lighting reads this same tempVpRGBA address back.
sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order
|
|
vmulf $v29, sOPM, vpScrI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2
|
|
sbv sFOG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
|
|
vmacf sOTM, sO03, sOTM[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2
|
|
// The vge after this jump is in its delay slot and executes before control
// reaches the target.
jr vLoopRet
|
|
// sCLZ <- vpClpF
|
|
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
|
|
// vnop in land slot
|
|
|
|
vtx_epilogue:
|
|
// Fallthrough (across the versions boundary)
|