mirror of
https://github.com/HackerN64/F3DEX3.git
synced 2026-01-21 10:37:45 -08:00
256 lines
14 KiB
ArmAsm
256 lines
14 KiB
ArmAsm
.include "rsp/clipping/clipping_regs.inc"
|
|
|
|
// Each clip condition (clipping plane bit being checked) has three phases that
|
|
// occur in this order: find an onscreen vertex, then find the transition from an
|
|
// onscreen to offscreen vertex, then find the transition from an offscreen to an
|
|
// onscreen vertex. The latter two are the edges which must be subdivided.
|
|
CLIP_PHASE_FIND_ON equ 1 // These values are chosen so that we can use branch
|
|
CLIP_PHASE_FIND_ON_TO_OFF equ -1 // instructions to check them, and so that the sign
|
|
CLIP_PHASE_FIND_OFF_TO_ON equ 0 // bit represents which condition (on/off) to continue.
|
|
// The clipping walk finds the two edges to be subdivided and stores them in temp
|
|
// memory at these offsets. There are two of these sets of three pointers contiguously.
|
|
// The offsets are in bytes: each pointer is a 16-bit value written with sh and
// read back with lhu, relative to clipPtrs.
CLIP_PTR_ONSCR equ 0 // Onscreen vertex at the end of the edge
|
|
CLIP_PTR_OFFSCR equ 2 // Offscreen vertex at the other end of the edge
|
|
CLIP_PTR_GEN equ 4 // Generated vertex, where to write the results
|
|
CLIP_PTR_COUNT equ 6 // Size in bytes of one set; clipPtrs advances by this per set
|
|
|
|
// Start of the clipping routine in this section (entered from code that is not
// visible here). Clears the polygon, stores three vertex addresses in its last
// three slots, then runs clip_condlooptop once per clip condition, with
// clipMaskIdx going from 4 down to 0.
clip_after_constants:
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterB, perfCounterB, 1 // Increment clipped (input) tris count
|
|
.endif
|
|
sqv vZero[0], (clipPolySgn)($zero) // Clear whole polygon
|
|
sh origV1Addr, clipPoly + 0xA // Initial polygon is right-justified
|
|
sh $2, clipPoly + 0xC // NOTE(review): $2 presumably holds the second vertex address; confirm against caller
|
|
sh $3, clipPoly + 0xE // NOTE(review): $3 presumably holds the third vertex address; confirm against caller
|
|
sb $zero, materialCullMode // In case only/all tri(s) clip then offscreen
|
|
li clipMaskIdx, 5 // Will sub 1; 4=screen, 3=+x, 2=-x, 1=+y, 0=-y
|
|
li clipAlloc, 0 // Init to no temp verts allocated
|
|
// Per-condition setup: pick the next condition, look up its shift amount, and
// reset the pointer storage, walk counter and phase before walking the polygon.
clip_condlooptop:
|
|
addi clipMaskIdx, clipMaskIdx, -1
|
|
lbu clipMaskShift, (clipCondShifts)(clipMaskIdx)
|
|
addi clipPtrs, rdpCmdBufEndP1, tempClipPtrs // Temp mem in output buffer for vtx ptrs
|
|
// 0x18 = 24 steps = 3 passes over the 8 halfword slots that clipIdx cycles
// through (clipIdx is masked with 0xE in clip_walk_loop_skip_vtx).
li clipWalkCount, 0x18 // Give up if loop traversed the polygon 3x
|
|
li clipWalkPhase, CLIP_PHASE_FIND_ON
|
|
j clip_walk_loop
|
|
// The li below sits in the delay slot of the jump, so it runs before clip_walk_loop.
li clipIdx, 0xA // Start pos doesn't matter, cause first actual op at on-to-off, and there should be only one of these.
|
|
|
|
// Branched to from clip_walk_loop while in the find-on phase, once the current
// vertex is not offscreen for this condition. Switches to the on-to-off phase
// and falls through to keep walking.
clip_found_on:
|
|
li clipWalkPhase, CLIP_PHASE_FIND_ON_TO_OFF
|
|
// Reached when the current vertex does not end the current phase. In phases
// 1 and -1 the walk simply continues; in phase 0 the current (offscreen)
// vertex is first removed from the polygon and deallocated.
clip_walk_loop_top:
|
|
bnez clipWalkPhase, clip_walk_loop_continue // find on, or find on to off
|
|
// 0 = find off to on. This is an offscreen vertex, so remove it.
|
|
// The addi below is in the delay slot of the bnez, so it runs on both paths;
// $10 is only used by the removal code that follows.
addi $10, clipIdx, -2
|
|
// Close the gap: copy each entry to the left of clipIdx one slot (2 bytes) to
// the right, starting at clipIdx - 2 and working down to slot 0. The branch
// tests $10 before the delay-slot addi decrements it.
clip_remove_loop:
|
|
lhu $11, (clipPoly + 0)($10)
|
|
sh $11, (clipPoly + 2)($10)
|
|
bgtz $10, clip_remove_loop
|
|
addi $10, $10, -2
|
|
sh $zero, (clipPoly + 0) // Slot 0 was copied to slot 2 above; mark it empty
|
|
// Deallocate it. The first iteration is for if it's in the main vertex buffer,
|
|
// in which case the deallocation is a no-op.
|
|
li $11, 0xFFFFFEFF // 0b...111011111111 (8 set to the right of the 0)
|
|
addi $10, clipCurVtx, -(clipTempVerts - vtxSize)
|
|
// Each pass subtracts one vertex size from the offset and shifts the single
// zero bit of the mask one position right (sra is in the delay slot, so it
// runs on every pass, including the last).
clip_deallocate_loop:
|
|
addi $10, $10, -vtxSize
|
|
bgez $10, clip_deallocate_loop // First iter: loops if in clipTempVerts
|
|
sra $11, $11, 1 // First iter: else, clears ...11101111111 (7)
|
|
and clipAlloc, clipAlloc, $11 // Clear vertex allocation bit
|
|
// Record the current vertex as the previous one before advancing.
clip_walk_loop_continue:
|
|
move clipLastVtx, clipCurVtx
|
|
// Advance to the next polygon slot without updating clipLastVtx (also used
// for empty slots). Times out once clipWalkCount goes negative.
clip_walk_loop_skip_vtx:
|
|
addi clipWalkCount, clipWalkCount, -1
|
|
bltz clipWalkCount, clip_timeout
|
|
addi clipIdx, clipIdx, 2 // Delay slot: runs whether or not the timeout branch is taken
|
|
andi clipIdx, clipIdx, 0xE // Circular addressing
|
|
// Examine the polygon entry at clipIdx. Empty (zero) entries are skipped.
// Otherwise the vertex's clip flag for the current condition is moved into the
// sign bit and combined with the phase to decide whether to keep walking
// (clip_walk_loop_top) or to act on a transition.
clip_walk_loop:
|
|
lhu clipCurVtx, (clipPoly)(clipIdx)
|
|
beqz clipCurVtx, clip_walk_loop_skip_vtx // Vertex addr = 0 -> skip
|
|
lhu $10, VTX_CLIP(clipCurVtx) // Delay slot: also runs when the skip branch is taken; $10 is unused on that path
|
|
sllv $10, $10, clipMaskShift // Put clipping bit in sign bit
|
|
xor $10, $10, clipWalkPhase // Invert the sign bit for phase -1
|
|
bltz $10, clip_walk_loop_top // Nonzero clipping bit = offscreen. Phase 1 and 0, continue if offscreen.
|
|
sh clipLastVtx, (CLIP_PTR_ONSCR)(clipPtrs) // For on to off only
|
|
bgtz clipWalkPhase, clip_found_on // 1 = find on
|
|
sh clipCurVtx, (CLIP_PTR_OFFSCR)(clipPtrs) // For on to off only
|
|
// A transition was found (on to off in phase -1, off to on in phase 0).
// Make room in the polygon for the vertex that will be generated on this edge.
// Insert a space here. The zeros are always on the left.
|
|
blez clipIdx, clip_err_insert // Would crash if continued with clipIdx == 0
|
|
li $10, 2 // Delay slot: start copying from slot 2
|
|
// Shift entries 2..clipIdx one slot (2 bytes) to the left. The loop ends after
// the pass where $10 == clipIdx; the delay-slot addi still runs on that pass.
clip_insert_loop:
|
|
lhu $11, (clipPoly - 0)($10) // Last iter, unnecessarily copies clipIdx left one
|
|
sh $11, (clipPoly - 2)($10) // but this is harmless & handles case clipIdx == 2
|
|
bne $10, clipIdx, clip_insert_loop
|
|
addi $10, $10, 2
|
|
addi clipIdx, clipIdx, -2 // For temp vtx, then reprocess current vtx. Checked clipIdx > 0 above.
|
|
// Allocate a temp vertex
|
|
// clipAlloc is a bitmask of allocated temp vertices; the candidate bit starts
// at 0x40 for the first temp vertex and moves right for each later one.
li $11, 0x0080 // 7 zeros to the right
|
|
li clipTempVtx, clipTempVerts - vtxSize
|
|
clip_allocate_loop:
|
|
srl $11, $11, 1 // First iter: 6 zeros to the right, at first temp vtx
|
|
and $10, $11, clipAlloc
|
|
bnez $10, clip_allocate_loop // First iter: branch if first temp vtx already allocated
|
|
addi clipTempVtx, clipTempVtx, vtxSize // First iter: clipTempVtx = clipTempVerts
|
|
// If the candidate bit was shifted all the way out, every temp vertex is in use.
beqz $11, clip_err_alloc
|
|
or clipAlloc, clipAlloc, $11 // Mark the vertex allocated
|
|
sh clipTempVtx, (clipPoly)(clipIdx)
|
|
sh clipTempVtx, (CLIP_PTR_GEN)(clipPtrs)
|
|
addi clipPtrs, clipPtrs, CLIP_PTR_COUNT // This set is complete; move to the next set
|
|
// After the on-to-off edge, keep walking in the off-to-on phase (the li in the
// delay slot sets the new phase). After the off-to-on edge, fall through.
bltz clipWalkPhase, clip_walk_loop_continue // -1 = on to off
|
|
li clipWalkPhase, CLIP_PHASE_FIND_OFF_TO_ON // (nop if done)
|
|
// clipPtrs was already advanced, so the set just written is at -CLIP_PTR_COUNT.
// Its ONSCR/OFFSCR entries were stored in on-to-off order above; rewrite them.
sh clipLastVtx, (CLIP_PTR_OFFSCR - CLIP_PTR_COUNT)(clipPtrs) // Swapped for off to on
|
|
sh clipCurVtx, (CLIP_PTR_ONSCR - CLIP_PTR_COUNT)(clipPtrs) // Swapped for off to on
|
|
// Generates one new vertex on the clipping boundary for the edge stored in the
// pointer set located two sets behind clipPtrs. Entered by fallthrough after
// the second edge is found, and once more from clip_after_vtx_store, so each
// of the two recorded edges is processed once per clip condition.
// Scalar and vector instructions are interleaved throughout; the order matters
// for branch delay slots, so do not reorder without checking both.
clip_do_subdivision:
|
|
lhu clipVOnsc, (CLIP_PTR_ONSCR - 2 * CLIP_PTR_COUNT)(clipPtrs)
|
|
lhu clipVOffsc, (CLIP_PTR_OFFSCR - 2 * CLIP_PTR_COUNT)(clipPtrs)
|
|
// Interpolate between clipVOffsc and clipVOnsc; create a new vertex which is on the
|
|
// clipping boundary (e.g. at the screen edge)
|
|
/*
|
|
Five clip conditions (these are in a different order from vanilla):
|
|
cBaseI/cBaseF[3] cDiffI/cDiffF[3]
|
|
4 W=0: W1 W1 - W2
|
|
3 +X : X1 - 2*W1 (X1 - 2*W1) - (X2 - 2*W2) <- the 2 is clip ratio
|
|
2 -X : X1 + 2*W1 (X1 + 2*W1) - (X2 + 2*W2)
|
|
1 +Y : Y1 - 2*W1 (Y1 - 2*W1) - (Y2 - 2*W2)
|
|
0 -Y : Y1 + 2*W1 (Y1 + 2*W1) - (Y2 + 2*W2)
|
|
*/
|
|
xori $11, clipMaskIdx, 1 // Invert sign of condition
|
|
ldv cPosOnOfF[0], VTX_FRAC_VEC(clipVOnsc)
|
|
ctc2 $11, $vcc // Conditions 1 (+y) or 3 (+x) -> vcc[0] = 0
|
|
ldv cPosOnOfI[0], VTX_INT_VEC (clipVOnsc)
|
|
vmrg cTemp, vOne, $v31[1] // elem 0 is 1 if W or neg cond, -1 if pos cond
|
|
andi $11, clipMaskIdx, 4 // W condition and screen clipping
|
|
ldv cPosOnOfF[8], VTX_FRAC_VEC(clipVOffsc) // Off screen to elems 4-7
|
|
bnez $11, clip_w // If so, use 1 or -1
|
|
ldv cPosOnOfI[8], VTX_INT_VEC (clipVOffsc) // Delay slot: runs for the W condition too
|
|
vmudh cTemp, cTemp, $v31[3] // elem 0 is (1 or -1) * 2 (clip ratio)
|
|
andi $11, clipMaskIdx, 2 // Conditions 2 (-x) or 3 (+x)
|
|
vmudm cBaseF, vOne, cPosOnOfF[0h] // Set accumulator (care about 3, 7) to X
|
|
bnez $11, clip_skipy // X conditions keep the X value set up here
|
|
vmadh cBaseI, vOne, cPosOnOfI[0h] // Delay slot: completes the X setup on both paths
|
|
vmudm cBaseF, vOne, cPosOnOfF[1h] // Discard that and set accumulator 3, 7 to Y
|
|
vmadh cBaseI, vOne, cPosOnOfI[1h]
|
|
clip_skipy:
|
|
vmadn cBaseF, cPosOnOfF, cTemp[0] // + W * +/- 2
|
|
vmadh cBaseI, cPosOnOfI, cTemp[0]
|
|
// clip_w jumps back in here with cBaseI:F already set for the W condition.
clip_skipxy:
|
|
vsubc cDiffF, cBaseF, cBaseF[7] // Vtx on screen - vtx off screen
|
|
vsub cDiffI, cBaseI, cBaseI[7]
|
|
// This is computing cDiffI:F = cBaseI:F / cDiffI:F to high precision.
|
|
// The first step is a range reduction, where cRRF becomes a scale factor
|
|
// (roughly min(1.0f, abs(1.0f / cDiffI:F))) which scales down cDiffI:F (denominator)
|
|
// Then the reciprocal of cDiffI:F is computed with a Newton-Raphson iteration
|
|
// and multiplied by cBaseI:F. Finally scale down the result (numerator) by cRRF.
|
|
vor cTemp, cDiffI, vOne[0] // Round up int sum to odd; this ensures the value is not 0, otherwise vabs result will be 0 instead of +/- 2
|
|
vrcph cRRI[3], cDiffI[3]
|
|
vrcpl cRRF[3], cDiffF[3] // 1 / (x+y+z+w), vtx on screen - vtx off screen
|
|
vrcph cRRI[3], $v31[2] // 0; get int result of reciprocal
|
|
vabs cTemp, cTemp, $v31[3] // 2; cTemp = +/- 2 based on sum positive (incl. zero) or negative
|
|
vmudn cRRF, cRRF, cTemp[3] // multiply reciprocal by +/- 2
|
|
vmadh cRRI, cRRI, cTemp[3]
|
|
veq cRRI, cRRI, $v31[2] // 0; if RR int part is 0
|
|
vmrg cRRF, cRRF, $v31[1] // keep RR frac, otherwise set frac to 0xFFFF (max)
|
|
// Address of the temp vertex allocated for this edge during the walk.
lhu outVtxBase, (CLIP_PTR_GEN - 2 * CLIP_PTR_COUNT)(clipPtrs)
|
|
vmudl $v29, cDiffF, cRRF[3] // Multiply cDiffI:F by RR frac*frac
|
|
ldv cPosOfF[0], VTX_FRAC_VEC (clipVOffsc) // Off screen loaded above, but need
|
|
vmadm cDiffI, cDiffI, cRRF[3] // int*frac, int out
|
|
ldv cPosOfI[0], VTX_INT_VEC (clipVOffsc) // it in elems 0-3 for interp
|
|
vmadn cDiffF, $v31, $v31[2] // 0; get frac out
|
|
luv cRGBAOf[0], VTX_COLOR_VEC(clipVOffsc)
|
|
vrcph sRTI[3], cDiffI[3] // Reciprocal of new scaled cDiff (discard)
|
|
luv cRGBAOn[0], VTX_COLOR_VEC(clipVOnsc)
|
|
vrcpl sRTF[3], cDiffF[3] // frac part
|
|
llv cSTOf[0], VTX_TC_VEC (clipVOffsc)
|
|
vrcph sRTI[3], $v31[2] // 0; int part
|
|
llv cSTOn[0], VTX_TC_VEC (clipVOnsc) // Must be before vtx_final_setup_for_clip
|
|
vmudl $v29, sRTF, cDiffF // D*R (see Newton-Raphson explanation)
|
|
// The scalar setup below prepares vtxLeft, outVtxBase and vLoopRet for the
// vertex code jumped to at the end of this block (defined outside this
// section); the values differ between the two build configurations.
.if CFG_NO_OCCLUSION_PLANE
|
|
li vtxLeft, -1 // vtxLeft < 0 triggers vtx_epilogue
|
|
.else
|
|
li vtxLeft, inputVtxSize // but trigger this on the second loop in this version
|
|
.endif
|
|
vmadm $v29, sRTI, cDiffF
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
addi outVtxBase, outVtxBase, -vtxSize // Inc'd by 2, must point to second vtx
|
|
.else
|
|
addi outVtxBase, outVtxBase, vtxSize // Not inc'd, must point to second vtx
|
|
.endif
|
|
vmadn cDiffF, sRTF, cDiffI
|
|
li vLoopRet, vtx_loop_no_lighting
|
|
vmadh cDiffI, sRTI, cDiffI
|
|
// Advance clipPtrs by one set so that a second pass through this block reads
// the next edge; clip_after_vtx_store uses the new value to decide whether a
// second pass is needed.
addi clipPtrs, clipPtrs, CLIP_PTR_COUNT
|
|
vmudh $v29, vOne, $v31[4] // 4; 4 - 4 * (D*R)
|
|
vmadn cDiffF, cDiffF, $v31[0] // -4
|
|
vmadh cDiffI, cDiffI, $v31[0] // -4
|
|
vmudl $v29, sRTF, cDiffF // 1/cDiff result = R * that
|
|
vmadm $v29, sRTI, cDiffF
|
|
vmadn sRTF, sRTF, cDiffI
|
|
vmadh sRTI, sRTI, cDiffI
|
|
vmudl $v29, cBaseF, sRTF // cDiff regs = cBase / cDiff
|
|
vmadm $v29, cBaseI, sRTF
|
|
vmadn cDiffF, cBaseF, sRTI
|
|
vmadh cDiffI, cBaseI, sRTI
|
|
vmudl $v29, cDiffF, cRRF[3] // Scale by range reduction
|
|
vmadm cDiffI, cDiffI, cRRF[3]
|
|
vmadn cDiffF, $v31, $v31[2] // Done cDiffI:F = cBaseI:F / cDiffI:F
|
|
// Clamp to 0x0001 to 0xFFFF range and create inverse on-screen factor
|
|
vlt cDiffI, cDiffI, vOne[0] // If integer part of factor less than 1,
|
|
vmrg cDiffF, cDiffF, $v31[1] // keep frac part of factor, else set to 0xFFFF (max val)
|
|
vsubc $v29, cDiffF, vOne[0] // frac part - 1 for carry
|
|
vge cDiffI, cDiffI, $v31[2] // 0; If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001),
|
|
j vtx_final_setup_for_clip // TODO can merge this with vtx_store_for_clip Clobbers vcc and accum in !NOC config.
|
|
// The vmrg below is in the delay slot of the jump, so it executes before the
// jump target and uses the vcc result of the vge above.
vmrg cFadeOf, cDiffF, vOne[0] // keep frac part of factor, else set to 1 (min val)
|
|
// Continuation after the jump to vtx_final_setup_for_clip.
// NOTE(review): that routine is outside this section; presumably it jumps back
// to this label; confirm.
// Blends color, texture coordinates and clip-space position of the offscreen
// and onscreen edge vertices, weighting the offscreen values by cFadeOf[3] and
// the onscreen values by cFadeOn[3], then jumps to vtx_store_for_clip.
clip_after_final_setup: // This is here because otherwise 3 cycle stall here.
|
|
vmudn cFadeOn, cFadeOf, $v31[1] // signed x * -1 = 0xFFFF - unsigned x! Fade factor for on screen vert
|
|
// Fade between attributes for on screen and off screen vert
|
|
vmudm $v29, cRGBAOf, cFadeOf[3] // Color: offscreen part into the accumulator
|
|
vmadm vpRGBA, cRGBAOn, cFadeOn[3] // plus onscreen part, result to vpRGBA
|
|
vmudm $v29, cSTOf, cFadeOf[3] // Texture coords: offscreen part
|
|
vmadm sSTS, cSTOn, cFadeOn[3] // plus onscreen part, result to sSTS
|
|
vmudl $v29, cPosOfF, cFadeOf[3] // Position: offscreen frac part
|
|
vmadm $v29, cPosOfI, cFadeOf[3] // offscreen int part
|
|
vmadl $v29, cPosOnOfF, cFadeOn[3] // onscreen frac part
|
|
vmadm vpClpI, cPosOnOfI, cFadeOn[3] // onscreen int part; integer result to vpClpI
|
|
j vtx_store_for_clip
|
|
// Delay slot of the jump: reads the fractional half of the result from the accumulator.
vmadn vpClpF, $v31, $v31[2] // 0; load resulting frac pos
|
|
// Continuation after the jump to vtx_store_for_clip.
// NOTE(review): that routine is outside this section; presumably it jumps back
// to this label; confirm.
// clipPtrs is three sets past the start of the temp pointer area after the
// first subdivision and four sets past it after the second, so the compare
// below sends control back to clip_do_subdivision exactly once per condition.
clip_after_vtx_store:
|
|
addi $11, rdpCmdBufEndP1, tempClipPtrs + 3 * CLIP_PTR_COUNT
|
|
beq $11, clipPtrs, clip_do_subdivision // Do one more subdivision
|
|
// Delay slot: the store below runs whether or not the branch is taken.
slv sSTS[0], (VTX_TC_VEC )(outVtx1) // Store not-twice-scaled ST
|
|
// Move on to the next clip condition, or to drawing once condition 0 is done.
// Also reached from clip_timeout when nothing needs clipping for this condition.
clip_next_cond:
|
|
bgtz clipMaskIdx, clip_condlooptop // Currently 0 = continue to draw
|
|
// Delay slot: executed on every pass through clip_next_cond.
sh $zero, activeClipPlanes // Only matters if we need to draw
|
|
// clipDrawPtr <- clipMaskIdx; currently at 0
|
|
// Draws verts in pattern like 4-2-3, 4-1-2, 4-0-1
|
|
lh $11, (clipPolySgn + 0xE)($zero) // Last polygon entry becomes the shared first vertex
|
|
li $ra, clip_draw_tris_loop // NOTE(review): presumably tri_from_clip returns through $ra to the loop below; confirm
|
|
sub flatV1Offset, origV1Addr, $11 // Offset = real orig addr - cur V1
|
|
move origV1Addr, $11
|
|
// Each pass steps clipDrawPtr back one slot (2 bytes) and loads the polygon
// entries at offsets 0xC and 0xE relative to it. A zero entry at offset 0xC
// means no vertices are left, which ends the loop at clip_done.
clip_draw_tris_loop:
|
|
addi clipDrawPtr, clipDrawPtr, -2
|
|
lh $2, (clipPolySgn + 0xC)(clipDrawPtr)
|
|
llv $v4[4], (clipPolySgn + 0xC)(clipDrawPtr) // +A ($2), +C ($3) to elem 2, 3
|
|
beqz $2, clip_done
|
|
lh $3, (clipPolySgn + 0xE)(clipDrawPtr) // Delay slot: runs even when branching to clip_done
|
|
vmov $v8[2], $v4[3] // +C ($3) to elem 2
|
|
j tri_from_clip
|
|
// Delay slot of the jump: executes before tri_from_clip.
lsv $v6[4], (clipPolySgn + 0xE)($zero) // +E (origV1Addr) to elem 2
|
|
|
|
// Reached from clip_walk_loop_skip_vtx when clipWalkCount goes negative.
// A timeout in the on-to-off phase means this condition needs no clipping, so
// the next condition is tried. Timeouts in the other two phases fall through
// to clip_done (the explicit branches for them are left commented out below).
clip_timeout:
|
|
bltz clipWalkPhase, clip_next_cond // Timed out in find on to off: all onscreen, nothing to do for this cond
|
|
// bgtz clipWalkPhase, clip_done // Timed out in find on: all offscreen, discard tri.
|
|
// j clip_done // Timed out in find off to on: error, give up
|
|
// clip_err_alloc: reached from clip_allocate_loop when no free temp vertex bit is found.
clip_err_alloc:
|
|
// clip_err_insert: reached before clip_insert_loop when clipIdx <= 0.
clip_err_insert:
|
|
// Common exit: restores activeClipPlanes and origV1Addr, clears flatV1Offset,
// and returns to the address saved in tempTriRA. The li below doubles as the
// delay slot of the bltz in clip_timeout.
clip_done: // Delay slot is harmless if branched
|
|
li $11, CLIP_SCAL_NPXY | CLIP_CAMPLANE
|
|
sh $11, activeClipPlanes
|
|
snake_c_to_v30 // NOTE(review): not defined in this section; presumably a macro; confirm
|
|
tri_v1_move // NOTE(review): not defined in this section; presumably a macro; confirm
|
|
add origV1Addr, origV1Addr, flatV1Offset // Real orig addr = cur V1 + offset
|
|
li flatV1Offset, 0
|
|
lh $ra, tempTriRA
|
|
// The delay slot of this jr is the first instruction of clip_w, which follows.
jr $ra // Delay slot is harmless
|
|
// Branched to from clip_do_subdivision when bit 2 of clipMaskIdx is set
// (condition 4, W). Copies the loaded position vectors straight into
// cBaseI:F instead of combining X or Y with W, then rejoins at clip_skipxy.
clip_w:
|
|
vcopy cBaseF, cPosOnOfF // Result is just W
|
|
j clip_skipxy
|
|
// Delay slot of the jump: executes before control reaches clip_skipxy.
vcopy cBaseI, cPosOnOfI
|