From 74d67ae914bebd2f0213393dddf0e097f45df7b9 Mon Sep 17 00:00:00 2001 From: Sauraen Date: Wed, 10 Apr 2024 22:14:32 -0700 Subject: [PATCH] New changes working --- f3dex3.s | 257 ++++++++++++++++++++++++------------------------------- notes.s | 19 ++++ 2 files changed, 131 insertions(+), 145 deletions(-) diff --git a/f3dex3.s b/f3dex3.s index 6b22986..1df6470 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -1133,11 +1133,14 @@ sSTO equ $v11 // not supported on legacy vtx pipe, but register allocated for it sSTS equ $v10 .else sVPO equ $v17 +.if CFG_NO_OCCLUSION_PLANE +sVPS equ $v26 +.else sVPS equ $v16 +.endif sSTO equ $v26 sSTS equ $v25 .endif -sCOL equ $v17 .if CFG_LEGACY_VTX_PIPE sOUTF equ vPairTPosF sOUTI equ vPairTPosI @@ -1353,32 +1356,19 @@ vtx_after_calc_mit: ldv vVP0F[8], (vpMatrix + 0x20)($zero) ldv vVP2F[8], (vpMatrix + 0x30)($zero) .endif - andi $16, $5, G_LIGHTING >> 8 // This is clipFlags, but not touched during vtx_store + andi $11, $5, G_LIGHTING >> 8 + beqz $11, @@skip_lighting + li $16, vtx_return_from_lighting // This is clipFlags, but not modified + li $16, lt_vtx_pair // during vtx_store +@@skip_lighting: andi $7, $5, G_FOG >> 8 // Nonzero if fog enabled jal while_wait_dma_busy // Wait for vertex load to finish li $19, clipTempVerts // Temp mem we can freely overwrite replaces outputVtxPos - jal middle_of_vtx_store // Sets $ra to vtx_load_loop + j middle_of_vtx_store move secondVtxPos, $19 // for first pre-loop, same for secondVtxPos - -vtx_load_loop: - vlt $v29, $v31, $v31[4] // Set VCC to 11110000 -.if CFG_LEGACY_VTX_PIPE - blez $1, vertex_end -.endif - nop // TODO -.if CFG_LEGACY_VTX_PIPE - vmrg vPairNrml, vPairNrml, vDDD // Merge normals - addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize -.endif - vmrg vPairRGBA, sCOL, vPairRGBA // Merge colors - bnez $16, lt_vtx_pair // Lighting enabled in clipFlags -.if CFG_LEGACY_VTX_PIPE - addi outputVtxPos, outputVtxPos, 2*vtxSize -.else - // Elems 0-1 get bytes 6-7 of the following vertex (0) - lpv vAAA[2], (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2 -.endif + vtx_return_from_lighting: + li $ra, vertex_end .if CFG_LEGACY_VTX_PIPE vmudm vPairST, vPairST, sSTS // Scale ST; must be after texgen @@skipsecond: @@ -1398,22 +1388,17 @@ vtx_return_from_lighting: vmadn $v29, vVP0F, vPairPosI[0h] vmadh $v29, vVP0I, vPairPosI[0h] vmadl $v29, vVP1F, vPairPosF[1h] - addi outputVtxPos, outputVtxPos, 2*vtxSize vmadm $v29, vVP1I, vPairPosF[1h] - addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize vmadn $v29, vVP1F, vPairPosI[1h] vmadh $v29, vVP1I, vPairPosI[1h] vmadl $v29, vVP2F, vPairPosF[2h] vmadm $v29, vVP2I, vPairPosF[2h] vmadn vPairTPosF, vVP2F, vPairPosI[2h] - li $ra, vertex_end // Done with vertex processing... vmadh vPairTPosI, vVP2I, vPairPosI[2h] - blez $1, @@skiploop // ...if <= 0 verts remain, ... - vmudm $v29, vPairST, sSTS // Scale ST; must be after texgen - li $ra, vtx_load_loop // ...otherwise keep looping -@@skiploop: + vmudm $v29, vPairST, sSTS // Scale ST; must be after texgen vmadh vPairST, sSTO, vOne // + 1 * (ST offset or zero) .endif + addi outputVtxPos, outputVtxPos, 2*vtxSize vtx_store: // Inputs: vPairTPosI, vPairTPosF, vPairST, vPairRGBA // Locals: $v20, $v21, $v25, $v26, $v16, $v17 ($v29 is temp). Also vPairST and @@ -1425,14 +1410,13 @@ s1WH equ $v16 // vtx_store 1/W High s1WL equ $v17 // vtx_store 1/W Low sWRL equ $v25 // vtx_store W Reciprocal Low | IMPORTANT: Can be the same reg as sWRH, but sWRH equ $v26 // vtx_store W Reciprocal High | using different ones saves one cycle delay - move secondVtxPos, outputVtxPos // Second and output vertices write to same mem... vmudl $v29, vPairTPosF, $v30[3] // Persp norm + move secondVtxPos, outputVtxPos // Second and output vertices write to same mem... + vmadm s1WH, vPairTPosI, $v30[3] // Persp norm bltz $1, @@skipsecond // ...if < 0 verts remain, ... - vmadm s1WH, vPairTPosI, $v30[3] // Persp norm + vmadn s1WL, $v31, $v31[2] // 0 addi secondVtxPos, outputVtxPos, vtxSize // ...otherwise, second vtx is next vtx @@skipsecond: - vmadn s1WL, $v31, $v31[2] // 0 - addi inputVtxPos, inputVtxPos, 2*inputVtxSize vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high suv vPairRGBA[4], (VTX_COLOR_VEC )(secondVtxPos) vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low @@ -1460,6 +1444,7 @@ sSCH equ $v21 // vtx_store Scaled Clipping High .if CFG_NO_OCCLUSION_PLANE vmadn s1WL, s1WL, sWRH[3h] + addi inputVtxPos, inputVtxPos, 2*inputVtxSize vmadh s1WH, s1WH, sWRH[3h] srl $24, $10, 4 // Shift second vertex screen clipping to first slots vch $v29, vPairTPosI, sSCH[3h] // Clip scaled high @@ -1480,10 +1465,10 @@ sSCH equ $v21 // vtx_store Scaled Clipping High lsv vPairTPosF[6], (VTX_Z_FRAC )(outputVtxPos) // load Z into W slot, will be for fog below vmadn s1WL, s1WL, sWRH[3h] middle_of_vtx_store: -// vPairPosI is $v20 - ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) +// vPairST is $v22 + ldv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3 vmadh s1WH, s1WH, sWRH[3h] // s1WH:s1WL is 1/W - ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) + llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5 // vnop sll $11, $20, 4 // Shift first vertex scaled clipping to second slots // vnop @@ -1497,59 +1482,63 @@ middle_of_vtx_store: vmadh vPairTPosI, vPairTPosI, s1WH[3h] // pos * 1/W ssv s1WH[6], (VTX_INV_W_INT )($19) // vnop -.if !CFG_LEGACY_VTX_PIPE + slv vPairST[4], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Replace vtx 1 ST with vtx 0 RGBA + // vnop +.if CFG_LEGACY_VTX_PIPE + lpv $v14[7], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Y to elem 0, 4 +.else // sVPO is $v17 // vtx_store ViewPort Offset lqv sVPO, (0x10)(rdpCmdBufEndP1) // Load viewport offset from temp mem .endif - // vnop -.if !CFG_LEGACY_VTX_PIPE -// sVPS is $v16 // vtx_store ViewPort Scale + vmudl $v29, vPairTPosF, $v30[3] // Persp norm +.if CFG_LEGACY_VTX_PIPE + lpv $v15[6], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Z to elem 0, 4 +.else +// sVPS is $v26 // vtx_store ViewPort Scale lqv sVPS, (0x00)(rdpCmdBufEndP1) // Load viewport scale from temp mem .endif - vmudl $v29, vPairTPosF, $v30[3] // Persp norm -// vPairST is $v22 - llv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1 vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm - llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5 +// vPairPosI is $v20 + ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) vmadn vPairTPosF, $v31, $v31[2] // 0 -// vDDD is $v26 - lpv vDDD[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4 + ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // vnop // vPairRGBA is $v27 - luv vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4 + luv vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair RGBA // vnop +// vPairNrml is $v16 + lpv vPairNrml[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair normals vmudh $v29, sVPO, vOne // offset * 1 andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about vmadn vPairTPosF, vPairTPosF, sVPS // + XYZ * scale -// sCOL is $v17, defined at start of loop - luv sCOL[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Colors as unsigned, lower 4 - vmadh vPairTPosI, vPairTPosI, sVPS or $24, $24, $20 // Combine results for second vertex + vmadh vPairTPosI, vPairTPosI, sVPS + or $10, $10, $11 // Combine results for first vertex // sFOG is $v25 vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog - or $10, $10, $11 // Combine results for first vertex - // vnop sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags // vnop sh $10, (VTX_CLIP )($19) // Store first vertex results + // vnop + ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos) // sCLZ is $v21 // vtx_store CLipped Z vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0 - slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos) - vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only) - slv vPairTPosI[0], (VTX_SCR_VEC )($19) - vmudn $v29, vM3F, vOne - ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos) - vmadh $v29, vM3I, vOne ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19) - vmadn $v29, vM0F, vPairPosI[0h] - // nop TODO + vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only) + slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos) + vmudn $v29, vM3F, vOne + slv vPairTPosI[0], (VTX_SCR_VEC )($19) + vmadh $v29, vM3I, vOne + blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping + vmadn $v29, vM0F, vPairPosI[0h] + move $ra, $16 // Normally $ra = loop or lighting +skip_return_to_lt_or_loop: vmadh $v29, vM0I, vPairPosI[0h] - ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) + addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize vmadn $v29, vM1F, vPairPosI[1h] - ssv sCLZ[4], (VTX_SCR_Z )($19) + ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) vmadh $v29, vM1I, vPairPosI[1h] -// vPairNrml is $v16 - lpv vPairNrml[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4 + ssv sCLZ[4], (VTX_SCR_Z )($19) // sOUTF = vPairPosF is $v21, or vPairTPosF is $v23 vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords beqz $7, return_routine // fog disabled @@ -1576,6 +1565,7 @@ middle_of_vtx_store: vmadn s1WL, s1WL, $v31[0] // -4 ori $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts vmadh s1WH, s1WH, $v31[0] // -4 + addi inputVtxPos, inputVtxPos, 2*inputVtxSize vmudn $v29, vPairTPosF, sOCM // X * kx, Y * ky, Z * kz vmadh $v29, vPairTPosI, sOCM // Int * int lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below @@ -1589,40 +1579,43 @@ middle_of_vtx_store: vmadn s1WL, s1WL, sWRH[3h] sll $11, $20, 4 // Shift first vertex scaled clipping to second slots vmadh s1WH, s1WH, sWRH[3h] // s1WH:s1WL is 1/W - andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about - veq $v29, $v31, $v31[3h] // Set VCC to 00010001 andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about - vmrg sOC1, sOCM, sOC1 // Put constant factor in elems 3, 7 - or $24, $24, $20 // Combine results for second vertex + veq $v29, $v31, $v31[3h] // Set VCC to 00010001 + blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping + vmrg sOC1, sOCM, sOC1 // Put constant factor in elems 3, 7 +middle_of_vtx_store: + move $ra, $16 // Normally $ra = loop or lighting +skip_return_to_lt_or_loop: vmudl $v29, vPairTPosF, s1WL[3h] // W must be overwritten with Z before here ssv s1WL[14], (VTX_INV_W_FRAC)(secondVtxPos) vmadm $v29, vPairTPosI, s1WL[3h] - ssv s1WL[6], (VTX_INV_W_FRAC)(outputVtxPos) + ssv s1WL[6], (VTX_INV_W_FRAC)($19) vmadn vPairTPosF, vPairTPosF, s1WH[3h] ssv s1WH[14], (VTX_INV_W_INT )(secondVtxPos) vmadh vPairTPosI, vPairTPosI, s1WH[3h] // pos * 1/W - ssv s1WH[6], (VTX_INV_W_INT )(outputVtxPos) + ssv s1WH[6], (VTX_INV_W_INT )($19) vadd sOC1, sOC1, sOC1[0q] // Add pairs upwards .if !CFG_LEGACY_VTX_PIPE // sVPO is $v17 // vtx_store ViewPort Offset lqv sVPO, (0x10)(rdpCmdBufEndP1) // Load viewport offset from temp mem .endif // vnop -.if !CFG_LEGACY_VTX_PIPE +.if CFG_LEGACY_VTX_PIPE + addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize +.else // sVPS is $v16 // vtx_store ViewPort Scale lqv sVPS, (0x00)(rdpCmdBufEndP1) // Load viewport scale from temp mem .endif vmudl $v29, vPairTPosF, $v30[3] // Persp norm -middle_of_vtx_store: +// vPairST is $v22 + ldv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1, RGBA in 2:3 + vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm + llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5 + vmadn vPairTPosF, $v31, $v31[2] // 0 // vPairPosI is $v20 ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) - vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm - ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) - vmadn vPairTPosF, $v31, $v31[2] // 0 -// vPairST is $v22 - llv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1 vadd sOC1, sOC1, sOC1[1h] // Add elems 1, 5 to 3, 7 - llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5 + ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // vnop // sO03 is $v26 // vtx_store Occlusion coeffs 0-3 ldv sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3 @@ -1634,17 +1627,19 @@ middle_of_vtx_store: lqv sOPM, (0x20)(rdpCmdBufEndP1) // Load occlusion plane -/+4000 constants .endif vmadh vPairTPosI, vPairTPosI, sVPS + andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about // sFOG is $v16 vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog or $10, $10, $11 // Combine results for first vertex vlt $v29, sOC1, $v31[2] // Occlusion plane equation < 0 in elems 3, 7 + slv vPairST[4], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Replace vtx 1 ST with vtx 0 RGBA // vnop cfc2 $11, $vcc // Load occlusion plane mid results to bits 3 and 7 // sOSC is $v21 // vtx_store Occlusion SCaled up vmudh sOSC, vPairTPosI, $v31[4] // 4; scale up x and y ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos) vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only) - // nop TODO + or $24, $24, $20 // Combine results for second vertex // sCLZ is $v25 // vtx_store CLipped Z vge sCLZ, vPairTPosI, $v31[2] // 0; clamp Z to >= 0 ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)($19) @@ -1665,23 +1660,27 @@ middle_of_vtx_store: veq $v29, $v31, $v31[0q] // Set VCC to 10101010 slv vPairTPosI[0], (VTX_SCR_VEC )($19) vmrg sOC2, sOC2, sOC3 // Elems 0-3 are results for vtx 0, 4-7 for vtx 1 -// vPairNrml is $v16 - lpv vPairNrml[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4 +.if CFG_LEGACY_VTX_PIPE + lpv $v14[7], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Y to elem 0, 4 +.else + addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize +.endif // vnop ssv sCLZ[12], (VTX_SCR_Z )(secondVtxPos) // vnop -// vDDD is $v26 - lpv vDDD[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4 +.if CFG_LEGACY_VTX_PIPE + lpv $v15[6], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos) // Z to elem 0, 4 +.endif // vnop ssv sCLZ[4], (VTX_SCR_Z )($19) vge $v29, sOC2, sO47 // Each compare to coeffs 4-7 -// sCOL is $v17, defined at start of loop - luv sCOL[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Colors as unsigned, lower 4 +// vPairNrml is $v16 + lpv vPairNrml[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair normals vmudn $v29, vM3F, vOne cfc2 $20, $vcc vmadh $v29, vM3I, vOne // vPairRGBA is $v27 - luv vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4 + luv vPairRGBA[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Vtx pair colors vmadn $v29, vM0F, vPairPosI[0h] andi $11, $11, CLIP_OCCLUDED | (CLIP_OCCLUDED >> 4) // Only bits 3, 7 from occlusion vmadh $v29, vM0I, vPairPosI[0h] @@ -1876,6 +1875,7 @@ clip_skipxy: vsubc $v29, vClDiffF, vOne[0] // frac part - 1 for carry vge vClDiffI, vClDiffI, $v31[2] // 0; If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001), vmrg vClFade1, vClDiffF, vOne[0] // keep frac part of factor, else set to 1 (min val) + addi inputVtxPos, rdpCmdBufEndP1, 0x30 // Temp mem; vtx load writes a few bytes here vmudn vClFade2, vClFade1, $v31[1] // signed x * -1 = 0xFFFF - unsigned x! v2[3] is fade factor for on screen vert lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode, incl fog setting // Fade between attributes for on screen and off screen vert @@ -2779,6 +2779,7 @@ lt_continue_setup: lb $11, dirLightsXfrmValid li $10, -1 // To mark lights valid addi $3, $3, altBase // Point to ambient light; stored through vtx proc + andi $17, $5, G_TEXTURE_GEN >> 8 // This is clipPolyRead, but not touched in vtx_store and $11, $11, $7 // Zero if either matrix or lights invalid bnez $11, lt_setup_skip_xfrm sb $10, dirLightsXfrmValid @@ -2840,11 +2841,8 @@ xfrm_light_post: li $20, OSTask + OSTask_ucode_data_size lt_setup_skip_xfrm: // Load first light direction to $v13, which is not used throughout vtx processing. - lpv $v4[4], (ltBufOfs + 0 - lightSize)($3) // Xfrmed dir in elems 0-2 - vlt $v29, $v31, $v31[4] // Set VCC to 11110000 - lpv $v5[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6 j vtx_after_lt_setup - vmrg $v13, $v4, $v5 // $v13 = first light direction + lpv $v13[0], (ltBufOfs + 8 - lightSize)($3) // Xfrmed dir in elems 4-6 xfrm_single_dir: vmudn $v29, $v16, $v3[0] @@ -2877,77 +2875,44 @@ lt_vtx_pair: // // LVP main lighting // + vmulf $v29, vPairNrml, $v13[4] // Normals X elems 0, 4 * first light dir luv vPairLt, (ltBufOfs + 0)($3) // Total light level, init to ambient - vmulf vAAA, $v13, vPairNrml // First light dir * normals + vmacf $v29, $v14, $v13[5] // Normals Y elems 0, 4 * first light dir + lpv vDDD[0], (ltBufOfs + 8 - 2*lightSize)($3) // Xfrmed dir in elems 4-6 + vmacf vAAA, $v15, $v13[6] // Normals Z elems 0, 4 * first light dir .if CFG_PROFILING_B addi perfCounterA, perfCounterA, 2 // Increment lit vertex count by 2 .endif - move curLight, $3 // Point to ambient light -lt_loop: - // vnop; vnop - vmudh $v29, vOne, vAAA[0h] // Sum components of dot product - lpv vCCC[4], (ltBufOfs + 0 - 2*lightSize)(curLight) // Xfrmed dir in elems 0-2 - vmadh $v29, vOne, vAAA[1h] - lpv vDDD[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6 - vmadh vAAA, vOne, vAAA[2h] - beq curLight, altBaseReg, lt_post - luv vBBB, (ltBufOfs + 0 - lightSize)(curLight) // Light color - vlt $v29, $v31, $v31[4] // Set VCC to 11110000 - addi curLight, curLight, -lightSize - vmrg vCCC, vCCC, vDDD // vCCC = light direction - vge vAAA, vAAA, $v31[2] // 0; clamp dot product to >= 0 - // vnop; vnop - vmudh $v29, vOne, vPairLt // Load accum mid with current light level - vmacf vPairLt, vBBB, vAAA[0h] // + light color * dot product - bne curLight, altBaseReg, lt_loop - vmulf vAAA, vCCC, vPairNrml // Light dir * normals - - -/* -.if CFG_PROFILING_B - addi perfCounterA, perfCounterA, 2 // Increment lit vertex count by 2 -.endif - vmulf $v29, $v14, $v13[4] // Normals X elems 0, 4 * first light dir - luv vPairLt, (ltBufOfs + 0)($3) // Total light level, init to ambient - vmacf $v29, $v15, $v13[5] // Normals Y elems 0, 4 * first light dir - lpv vDDD[0], (ltBufOfs + 8 - XXX)(curLight) // Xfrmed dir in elems 4-6 - vmacf vAAA, vPairNrml, $v13[6] // Normals Z elems 0, 4 * first light dir beq $3, altBaseReg, lt_post - move curLight, $3 // Point to ambient light + lpv $v18[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut + move curLight, $3 // Point to ambient light + // vnop lt_loop: vge vCCC, vAAA, $v31[2] // 0; clamp dot product to >= 0 - vmulf $v29, $v14, vDDD[4] // Normals X elems 0, 4 + vmulf $v29, vPairNrml, vDDD[4] // Normals X elems 0, 4 + luv vBBB, (ltBufOfs + 0 - 1*lightSize)(curLight) // Light color + vmacf $v29, $v14, vDDD[5] // Normals Y elems 0, 4 addi curLight, curLight, -lightSize - vmacf $v29, $v15, vDDD[5] // Normals Y elems 0, 4 - luv vBBB, (ltBufOfs + 0 - XXX)(curLight) // Light color - vmacf vAAA, vPairNrml, vDDD[6] // Normals Z elems 0, 4 - lpv vDDD[0], (ltBufOfs + 8 - XXX)(curLight) // Xfrmed dir in elems 4-6 + vmacf vAAA, $v15, vDDD[6] // Normals Z elems 0, 4 + lpv vDDD[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6 vmudh $v29, vOne, vPairLt // Load accum mid with current light level bne curLight, altBaseReg, lt_loop vmacf vPairLt, vBBB, vCCC[0h] // + light color * dot product -*/ - lt_post: - andi $11, $5, G_TEXTURE_GEN >> 8 vne $v29, $v31, $v31[3h] // Set VCC to 11101110 - beqz $11, vtx_return_from_lighting + beqz $17, vtx_return_from_lighting vmrg vPairRGBA, vPairLt, vPairRGBA // RGB = light, A = vtx alpha .endif // These definitions are shared by both versions vLookat1 equ vAAA vLookat0 equ vPairLt .if CFG_LEGACY_VTX_PIPE - lpv vCCC[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, lookat 1 in 4-6 - vlt $v29, $v31, $v31[4] // Set VCC to 11110000 - lpv vLookat0[4], (xfrmLookatDirs + 0)($zero) // Garbage in 0-2, lookat 0 in 4-6 - lpv vLookat1[4], (xfrmLookatDirs - 8)($zero) // Lookat 1 in 0-2, garbage in 4-6 - vmrg vLookat0, vCCC, vLookat0 // Lookat 0 in 0-2, 4-6 - vmrg vLookat1, vLookat1, vCCC // Lookat 1 in 0-2, 4-6 - vmulf vLookat0, vPairNrml, vLookat0 // Normal * lookat 0 dir - vmulf vLookat1, vPairNrml, vLookat1 // Normal * lookat 1 dir - vmudh $v29, vOne, vLookat0[0h] // Sum components of dot product - vmadh $v29, vOne, vLookat0[1h] - vmadh vLookat0, vOne, vLookat0[2h] // vLookat0 = dot product 0 + vmulf $v29, vPairNrml, $v18[0] // Normals X elems 0, 4 * lookat 0 X + vmacf $v29, $v14, $v18[1] // Normals Y elems 0, 4 * lookat 0 Y + vmacf vLookat0, $v15, $v18[2] // Normals Z elems 0, 4 * lookat 0 Z + vmulf $v29, vPairNrml, $v18[4] // Normals X elems 0, 4 * lookat 1 X + vmacf $v29, $v14, $v18[5] // Normals Y elems 0, 4 * lookat 1 Y + vmacf vLookat1, $v15, $v18[6] // Normals Z elems 0, 4 * lookat 1 Z // Continue to rest of texgen shared by both versions. .endif @@ -2961,10 +2926,12 @@ vLookat0 equ vPairLt // Outputs: leave alone vPairPosI/F; update vPairRGBA, vPairST // Locals: vAAA and vBBB after merge and normals selection, vCCC, vDDD, vPairLt, vNrmOut // New available locals: $6 (existing: $11, $10, $20, $24) - vmrg vPairNrml, vPairNrml, vDDD // Merge normals beqz $11, lt_skip_packed_normals // Elems 4-5 get bytes 6-7 of the following vertex (1) lpv vBBB[6], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // Upper 2 in 4:5 + // Elems 0-1 get bytes 6-7 of the following vertex (0) + lpv vAAA[2], (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2 + vlt $v29, $v31, $v31[4] // Set VCC to 11110000 vmrg vAAA, vAAA, vBBB // Merge packed normals // Packed normals algorithm. This produces a vector (one for each input vertex) // in vPairNrml such that |X| + |Y| + |Z| = 0x7F00 (called L1 norm), in the @@ -3149,11 +3116,11 @@ lt_skip_fresnel: vlt $v29, $v31, $v31[4] // Set VCC to 11110000 vmrg vLookat1, vLookat1, vDDD // vLookat1 = lookat 1 dir vmulf vLookat1, vPairNrml, vLookat1 // Normal * lookat 1 dir -.endif - // Rest of texgen shared by F3DEX3 native and LVP vmudh $v29, vOne, vLookat1[0h] vmadh $v29, vOne, vLookat1[1h] vmadh vLookat1, vOne, vLookat1[2h] +.endif + // Rest of texgen shared by F3DEX3 native and LVP vne $v29, $v31, $v31[1h] // Set VCC to 10111011 andi $11, $5, G_TEXTURE_GEN_LINEAR >> 8 vmrg vLookat0, vLookat0, vLookat1[0h] // Dot products in elements 0, 1, 4, 5 diff --git a/notes.s b/notes.s index 26d39a2..0ac7017 100644 --- a/notes.s +++ b/notes.s @@ -17,3 +17,22 @@ vmadh vOut, vOne, vIn[2h] vmudh $v29, $v31, $v31[2] // 0; clear whole accumulator vadd $v29, vIn, vIn[1h] // accum lo 0 = 0 + 1, 4 = 4 + 5 vmadn vOut, vOne, vIn[2h] // + 2,6; built-in saturation (clamping) ends up not a problem + +// lpv patterns. Same works for luv. Assuming $11 is 8 byte aligned (does not have +// to be 16 byte aligned). +// Elem 0 1 2 3 4 5 6 7 +lpv $v27[1], (-8)($11) // 07 F8 F9 FA FB FC FD FE Byte +lpv $v27[2], (-8)($11) // 06 07 F8 F9 FA FB FC FD addr +lpv $v27[3], (-8)($11) // 05 06 07 F8 F9 FA FB FC relative +lpv $v27[4], (-8)($11) // 04 05 06 07 F8 F9 FA FB to +lpv $v27[5], (-8)($11) // 03 04 05 06 07 F8 F9 FA $11 +lpv $v27[6], (-8)($11) // 02 03 04 05 06 07 F8 F9 +lpv $v27[7], (-8)($11) // 01 02 03 04 05 06 07 F8 +lpv $v27[0], ( 0)($11) // 00 01 02 03 04 05 06 07 +lpv $v27[1], ( 0)($11) // 0F 00 01 02 03 04 05 06 +lpv $v27[2], ( 0)($11) // 0E 0F 00 01 02 03 04 05 +lpv $v27[3], ( 0)($11) // 0D 0E 0F 00 01 02 03 04 +lpv $v27[4], ( 0)($11) // 0C 0D 0E 0F 00 01 02 03 +lpv $v27[5], ( 0)($11) // 0B 0C 0D 0E 0F 00 01 02 +lpv $v27[6], ( 0)($11) // 0A 0B 0C 0D 0E 0F 00 01 +lpv $v27[7], ( 0)($11) // 09 0A 0B 0C 0D 0E 0F 00