From db28ec8fd5bebb02c23fa619a72e9c1b06c005b6 Mon Sep 17 00:00:00 2001 From: Sauraen Date: Sun, 29 Dec 2024 21:29:58 -0800 Subject: [PATCH] Scalar register cleanup --- docs/Code/Counters.md | 3 + f3dex3.s | 679 ++++++++++++++++++++++++------------------ notes.s | 29 ++ 3 files changed, 428 insertions(+), 283 deletions(-) diff --git a/docs/Code/Counters.md b/docs/Code/Counters.md index ba59d58..49f236d 100644 --- a/docs/Code/Counters.md +++ b/docs/Code/Counters.md @@ -126,6 +126,9 @@ In the true codepath of Sched_TaskComplete: OS_YIELD_DATA_SIZE - sizeof(F3DEX3YieldDataFooter)); osInvalDCache(footer, sizeof(F3DEX3YieldDataFooter)); bcopy(footer, &gRSPProfilingResults, sizeof(F3DEX3YieldDataFooter)); + /* The second invalidate is to get the footer out of the CPU cache, + because it could get written back and overwrite newer data in RAM */ + osInvalDCache(footer, sizeof(F3DEX3YieldDataFooter)); } #endif ``` diff --git a/f3dex3.s b/f3dex3.s index bb12787..fc4188c 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -769,20 +769,49 @@ vOne equ $v28 // Global, all elements = 1 // For tri write only: vZero equ $v0 // all elements = 0 -// Global and semi-global (i.e. one main function + occasional local) scalar regs: +/* +Scalar regs: + Tri write Clip VW Vtx write Lighting Cmd dispatch +$zero -------------------- Hardwired zero --------------------- +$1 v1 texptr <---------- vtxLeft ----------> temp, init 0 +$2 v2 shdptr next vtx -----> temp +$3 v3 shdflg last/offscr temp +$4 flat shading vtx or (perf) initial FIFO stall time ------ +$5 v1flag lmaj <--------- vGeomMid ----------> +$6 geom mode clipMaskIdx --> +$7 v2flag tile <---------- fogFlag ----------> cmd byte +$8 v3flag <---------- outVtx2 ----------> cmdBufOver +$9 xp texenab clipMask -----> <----- curLight ovlInitClock +$10 ------------------------ temp2 -------------------------- +$11 ------------------------- temp -------------------------- +$12 --------------------- perfCounterD ---------------------- +$13 ---------------------- altBaseReg ----------------------- +$14 <----------- inVtx -----------> +$15 <--------- outVtxBase --------> +$16 clipFlags ----> +$17 clipPolyRead -> +$18 <---------- clipPolySelect> +$19 temp onscrvtx outVtx1 ----------> dmaLen +$20 temp <---------- flagsV1 ----------> dmemAddr +$21 <---------- clipPolyWrite > <----- ambLight +$22 -------------------- rdpCmdBufEndP1 --------------------- +$23 --------------------- rdpCmdBufPtr ---------------------- +$24 temp <---------- flagsV2 ----------> cmd_w1_dram +$25 cmd_w0 -----------------> cmd_w0 +$26 ---------------------- taskDataPtr ---------------------- +$27 -------------------- inputBufferPos --------------------- +$28 --------------------- perfCounterA ---------------------- +$29 --------------------- perfCounterB ---------------------- +$30 --------------------- perfCounterC ---------------------- +$ra return address, sometimes sign bit is flag -------------- +*/ + +// Global scalar regs: // $zero // Hardwired zero scalar register perfCounterD equ $12 // Performance counter D (functions depend on config) altBaseReg equ $13 // Alternate base address register for vector loads -inVtx equ $14 // Pointer to loaded vertex to transform -outVtxBase equ $15 // Pointer to vertex buffer to store transformed verts -clipFlags equ $16 // Current clipping flags being checked -clipPolyRead equ $17 // Read pointer within current polygon being clipped -clipPolySelect equ $18 // Clip poly double buffer selection, or < 0 for normal tri write -clipPolyWrite equ $21 // Write pointer within current polygon being clipped rdpCmdBufEndP1 equ $22 // Pointer to one command word past "end" (middle) of RDP command buf rdpCmdBufPtr equ $23 // RDP command buffer current DMEM pointer -cmd_w1_dram equ $24 // DL command word 1, which is also DMA DRAM addr -cmd_w0 equ $25 // DL command word 0, also holds next tris info taskDataPtr equ $26 // Task data (display list) DRAM pointer inputBufferPos equ $27 // DMEM position within display list input buffer, relative to end perfCounterA equ $28 // Performance counter A (functions depend on config) @@ -790,57 +819,38 @@ perfCounterB equ $29 // Performance counter B (functions depend on config) perfCounterC equ $30 // Performance counter C (functions depend on config) // $ra // Return address -// Misc scalar regs: -clipMaskIdx equ $6 -outVtx2 equ $8 -curLight equ $9 -outVtx1 equ $19 +// Vertex write: +vtxLeft equ $1 // Number of vertices left to process * 0x10 +vGeomMid equ $5 // Middle two bytes of geometry mode +fogFlag equ $7 // 8 if fog enabled, else 0 +outVtx2 equ $8 // Pointer to second or dummy (= outVtx1) transformed vert +inVtx equ $14 // Pointer to loaded vertex to transform +outVtxBase equ $15 // Pointer to vertex buffer to store transformed verts +outVtx1 equ $19 // Pointer to first transformed vert +flagsV1 equ $20 // Clip flags for vertex 1 +flagsV2 equ $24 // Clip flags for vertex 2 -// Arguments to dma_read_write -dmaLen equ $19 // also used by itself -dmemAddr equ $20 -// cmd_w1_dram // used for all dma_read_write DRAM addresses +// Lighting +curLight equ $9 // Current light pointer with offset +ambLight equ $21 // Ambient (top) light pointer with offset -// Argument to load_overlay* -postOvlRA equ $10 // Commonly used locally +// Clipping +clipMaskIdx equ $6 // Clip mask index 4-0 +clipMask equ $9 // Current clip mask (one bit) +clipFlags equ $16 // Current clipping flags being checked +clipPolyRead equ $17 // Read pointer within current polygon being clipped +clipPolySelect equ $18 // Clip poly double buffer selection, or < 0 for normal tri write +clipPolyWrite equ $21 // Write pointer within current polygon being clipped + +// Misc +cmdBufOver equ $8 // = rdpCmdBufPtr - rdpCmdBufEndP1 +ovlInitClock equ $9 // Temp for profiling +postOvlRA equ $10 // Address to return to after overlay load +dmaLen equ $19 // DMA length in bytes minus 1 +dmemAddr equ $20 // DMA address in DMEM or IMEM +cmd_w1_dram equ $24 // DL command word 1, which is also DMA DRAM addr +cmd_w0 equ $25 // DL command word 0, also holds next tris info -// ==== Summary of uses of all registers -// $zero: Hardwired zero scalar register -// $1: vertex 1 addr, zero when command handler is called, count of -// remaining vertices * 0x10, pointer to store texture coefficients, local -// $2: vertex 2 addr, vertex at end of edge during clipping, pointer to store -// shade coefficients, local -// $3: vertex 3 addr, vertex at start of edge during clipping, local -// $4: pre-shuffle vertex 1 addr for flat shading during tri write (global) -// $5: geometry mode middle 2 bytes during vertex load / lighting, local -// $6: clipMaskIdx, geometry mode low byte during tri write, local -// $7: command byte when command handler is called, mIT recompute flag in -// Overlay 4, local -// $8: outVtx2, local -// $9: curLight, clip mask during clipping, local -// $10: postOvlRA, common local -// $11: very common local -// $12: perfCounterD (global). This must be $12 for S2DEX compat in while_wait_dma_busy. -// $13: altBaseReg (global) -// $14: inVtx, local -// $15: outVtxBase, local -// $16: clipFlags (global) -// $17: clipPolyRead (global) -// $18: clipPolySelect (global) -// $19: dmaLen, outVtx1, local -// $20: dmemAddr, local -// $21: clipPolyWrite (global) -// $22: rdpCmdBufEndP1 (global) -// $23: rdpCmdBufPtr (global) -// $24: cmd_w1_dram, local -// $25: cmd_w0 (global); holds next tris info during tri write -> clipping -> -// vtx write -// $26: taskDataPtr (global) -// $27: inputBufferPos (global) -// $28: perfCounterA (global) -// $29: perfCounterB (global) -// $30: perfCounterC (global) -// $ra: Return address for jal, b*al // vtx_store registers. They all start with s for store. @@ -1072,7 +1082,7 @@ tempPrevVtxGarbage equ 0x50 // Up to 2 * 0x26 = 0x4C used -> to 0x9C .create CODE_FILE, 0x00001080 // Initialization routines -// Everything up until ovl01_end will get overwritten by ovl0 and/or ovl1 +// Everything up until ovl01_end will get overwritten by ovl1 start: // This is at IMEM 0x1080, not the start of IMEM vnop // Return to here from S2DEX overlay 0 G_LOAD_UCODE jumps to start+4! lqv $v31[0], (v31Value)($zero) // Actual start is here @@ -1185,7 +1195,7 @@ G_MOVEMEM_end: G_DMA_IO_handler: G_BRANCH_WZ_handler: G_MEMSET_handler: - j ovl234_ovl4_entrypoint // Delay slot is harmless + j ovl234_ovl4_entrypoint // Delay slot is harmless .endif load_cmds_handler: lb $3, materialCullMode @@ -1199,8 +1209,8 @@ G_SYNC_handler: sw cmd_w0, 0(rdpCmdBufPtr) // Add the command word to the RDP command buffer addi rdpCmdBufPtr, rdpCmdBufPtr, 8 // Increment the next RDP command pointer by 2 words check_rdp_buffer_full_and_run_next_cmd: - sub $8, rdpCmdBufPtr, rdpCmdBufEndP1 - bgezal $8, flush_rdp_buffer + sub cmdBufOver, rdpCmdBufPtr, rdpCmdBufEndP1 + bgezal cmdBufOver, flush_rdp_buffer // $1 on next instr survives flush_rdp_buffer .if CFG_NO_OCCLUSION_PLANE && CFG_LEGACY_VTX_PIPE && !CFG_PROFILING_A vertex_end: @@ -1308,9 +1318,9 @@ G_MODIFYVTX_handler: G_VTX_handler: lhu dmemAddr, (vertexTable)(cmd_w0) // (v0 + n) end address; up to 56 inclusive jal segmented_to_physical // Convert address in cmd_w1_dram to physical - lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10 - sub dmemAddr, dmemAddr, $1 // Start addr = end addr - size. Rounded down to DMA word by H/W - addi dmaLen, $1, -1 // DMA length is always offset by -1 + lhu vtxLeft, (inputBufferEnd - 0x07)(inputBufferPos) // vtxLeft = size in bytes = vtx count * 0x10 + sub dmemAddr, dmemAddr, vtxLeft // Start addr = end addr - size. Rounded down to DMA word by H/W + addi dmaLen, vtxLeft, -1 // DMA length is always offset by -1 j dma_read_write li $ra, 0x8000 | vtx_after_dma // Negative = flag to not to return to clipping in vtx_setup_constants @@ -1322,9 +1332,9 @@ G_TRISTRIP_handler: tri_strip_fan_loop: lw cmd_w1_dram, 0(cmd_w0) // Load tri indices to lower 3 bytes of word addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command - beq $11, cmd_w0, tris_end // If off end of command, exit + beq $11, cmd_w0, tris_end // If off end of command, exit sll $10, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit - bltz $10, tris_end // If negative, exit + bltz $10, tris_end // If negative, exit sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices bltz $ra, tri_fan_store // Finish handling G_TRIFAN addi cmd_w0, cmd_w0, 1 // Increment @@ -1671,7 +1681,7 @@ tDaDyI equ $v7 vmudl $v29, tDaDyF, tXPRcpF[1] add rdpCmdBufPtr, rdpCmdBufPtr, $11 // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set vmadm $v29, tDaDyI, tXPRcpF[1] - sub $8, rdpCmdBufPtr, rdpCmdBufEndP1 // Check if we need to write out to RDP + sub cmdBufOver, rdpCmdBufPtr, rdpCmdBufEndP1 // Check if we need to write out to RDP vmadn tDaDyF, tDaDyF, tXPRcpI[1] sdv tDaDxF[0], 0x0018($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional) vmadh tDaDyI, tDaDyI, tXPRcpI[1] @@ -1718,13 +1728,13 @@ tri_return_from_decal_fix_z: slv tDaDyI[12], 0x0C($10) // DzDyI:F slv tDaDxI[12], 0x04($10) // DzDxI:F slv tDaDeI[12], 0x08($10) // DzDeI:F - bltz $8, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end + bltz cmdBufOver, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end slv $v10[12], 0x00($10) // ZI:F // 156 cycles -flush_rdp_buffer: // $8 = rdpCmdBufPtr - rdpCmdBufEndP1 +flush_rdp_buffer: // cmdBufOver = rdpCmdBufPtr - rdpCmdBufEndP1 mfc0 $10, SP_DMA_BUSY // Check if any DMA is in flight lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write - addi dmaLen, $8, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy + addi dmaLen, cmdBufOver, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy .if CFG_PROFILING_C // This is a wait for DMA busy loop, but written inline to avoid overwriting ra. addi perfCounterD, perfCounterD, 10 // 6 instr + 2 between end load and mfc + 0 taken branch overlaps with last + 2 between mfc and load @@ -1851,34 +1861,34 @@ clip_init_used_loop: sh $1, (clipPoly - 6 + 0)(clipPolySelect) sh $2, (clipPoly - 6 + 2)(clipPolySelect) sh $3, (clipPoly - 6 + 4)(clipPolySelect) - sh $zero, (clipPoly)(clipPolySelect) // Zero to mark end of polygon - li $9, CLIP_CAMPLANE // Initial clip mask for no nearclipping + sh $zero, (clipPoly)(clipPolySelect) // Zero to mark end of polygon + li clipMask, CLIP_CAMPLANE // Initial clip mask for no nearclipping // Available locals here: $11, $1, $7, $20, $24, $10 clip_condlooptop: // Loop over six clipping conditions: near, far, +y, +x, -y, -x - lhu clipFlags, VTX_CLIP($3) // Load flags for V3, which will be the final vertex of the last polygon - and clipFlags, clipFlags, $9 // Mask V3's flags to current clip condition - addi clipPolyRead, clipPolySelect, -6 // Start reading at the beginning of the old polygon + lhu clipFlags, VTX_CLIP($3) // Load flags for V3, which will be the final vertex of the last polygon + and clipFlags, clipFlags, clipMask // Mask V3's flags to current clip condition + addi clipPolyRead, clipPolySelect, -6 // Start reading at the beginning of the old polygon xori clipPolySelect, clipPolySelect, 6 ^ (clipPoly2 + 6 - clipPoly) // Swap to the other polygon memory - addi clipPolyWrite, clipPolySelect, -6 // Start writing at the beginning of the new polygon + addi clipPolyWrite, clipPolySelect, -6 // Start writing at the beginning of the new polygon clip_edgelooptop: // Loop over edges connecting verts, possibly subdivide the edge // Edge starts from V3, ends at V2 lhu $2, (clipPoly)(clipPolyRead) // Read next vertex of input polygon as V2 (end of edge) addi clipPolyRead, clipPolyRead, 0x0002 // Increment read pointer - beqz $2, clip_nextcond // If V2 is 0, done with input polygon + beqz $2, clip_nextcond // If V2 is 0, done with input polygon lhu $11, VTX_CLIP($2) // Load flags for V2 - and $11, $11, $9 // Mask V2's flags to current clip condition - beq $11, clipFlags, clip_nextedge // Both set or both clear = both off screen or both on screen, no subdivision + and $11, $11, clipMask // Mask V2's flags to current clip condition + beq $11, clipFlags, clip_nextedge // Both set or both clear = both off screen or both on screen, no subdivision move clipFlags, $11 // clipFlags = masked V2's flags // Going to subdivide this edge. Find available temp vertex slot. li outVtxBase, clipTempVertsEnd clip_find_unused_loop: lhu $11, (VTX_CLIP - vtxSize)(outVtxBase) - addi $10, outVtxBase, -clipTempVerts // This is within the loop rather than before b/c delay after lhu - blez $10, clip_done // If can't find one (should never happen), give up + addi $10, outVtxBase, -clipTempVerts // This is within the loop rather than before b/c delay after lhu + blez $10, clip_done // If can't find one (should never happen), give up andi $11, $11, CLIP_VTX_USED bnez $11, clip_find_unused_loop addi outVtxBase, outVtxBase, -vtxSize - beqz clipFlags, clip_skipswap23 // V2 flag is clear / on screen, therefore V3 is set / off screen + beqz clipFlags, clip_skipswap23 // V2 flag is clear / on screen, therefore V3 is set / off screen move $19, $2 // move $19, $3 // Otherwise swap V2 and V3; note we are overwriting $3 but not $2 move $3, $2 // @@ -1935,13 +1945,13 @@ clip_skipxy: bgez $11, clip_done // If so, give up vrcph $v3[3], $v31[2] // 0; get int result of reciprocal vabs $v29, $v29, $v31[3] // 2; v29 = +/- 2 based on sum positive (incl. zero) or negative - lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode, incl fog setting + lhu vGeomMid, geometryModeLabel + 1 // Load middle 2 bytes of geom mode, incl fog setting vmudn $v2, $v2, $v29[3] // multiply reciprocal by +/- 2 sh outVtxBase, (clipPoly)(clipPolyWrite) // Write pointer to generated vertex to polygon vmadh $v3, $v3, $v29[3] lhu $11, VTX_CLIP($3) // Load clip flags for off screen vert veq $v3, $v3, $v31[2] // 0; if reciprocal high is 0 - andi $7, $5, G_FOG >> 8 // Nonzero if fog enabled + andi fogFlag, vGeomMid, G_FOG >> 8 // Nonzero if fog enabled vmrg $v2, $v2, $v31[1] // keep reciprocal low, otherwise set to -1 addi clipPolyWrite, clipPolyWrite, 2 // Increment write ptr vmudl $v29, vClDiffF, $v2[3] // sum frac * reciprocal, discard @@ -1956,7 +1966,7 @@ clip_skipxy: .endif vrcpl $v23[3], vClDiffF[3] // frac part .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC - srl $7, $7, 5 // 8 if G_FOG is set, 0 otherwise + srl fogFlag, fogFlag, 5 // 8 if G_FOG is set, 0 otherwise .endif vrcph $v24[3], $v31[2] // 0; int part .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC @@ -1992,7 +2002,7 @@ clip_skipxy: .endif vmrg vClDiffF, vClDiffF, $v31[1] // keep frac part of factor, else set to 0xFFFF (max val) .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC - move outVtx2, $19 // Old last vtx regs = temp mem + move outVtx2, $19 // Old last vtx regs = temp mem .endif vsubc $v29, vClDiffF, vOne[0] // frac part - 1 for carry vge vClDiffI, vClDiffI, $v31[2] // 0; If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001), @@ -2046,8 +2056,8 @@ clip_nextcond: lhu $3, (clipPoly - 2)(clipPolyWrite) // Initialize the edge start (V3) to the last vert beqz clipMaskIdx, clip_draw_tris lbu $11, (clipCondShifts - 1)(clipMaskIdx) // Load next clip condition shift amount - li $9, 1 - sllv $9, $9, $11 // $9 is clip mask + li clipMask, 1 + sllv clipMask, clipMask, $11 j clip_condlooptop addi clipMaskIdx, clipMaskIdx, -1 @@ -2151,18 +2161,19 @@ Epilogue */ vtx_after_dma: - andi inVtx, dmemAddr, 0xFFF8 // Round down input start addr to DMA word - lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode + andi inVtx, dmemAddr, 0xFFF8 // Round down input start addr to DMA word + lhu vGeomMid, geometryModeLabel + 1 // Load middle 2 bytes of geom mode srl $2, cmd_w0, 11 // n << 1 sub $2, cmd_w0, $2 // = v0 << 1 - lhu outVtxBase, (vertexTable)($2) // Address of output start + lhu outVtxBase, (vertexTable)($2) // Address of output start .if COUNTER_A_UPPER_VERTEX_COUNT - sll $11, $1, 12 // Vtx count * 0x10000 + sll $11, vtxLeft, 12 // Vtx count * 0x10000 add perfCounterA, perfCounterA, $11 // Add to vertex count .endif vtx_setup_constants: .if CFG_LEGACY_VTX_PIPE + TODO check kept register assignments against xfrm_dir_lights // Computes modified viewport scale and offset including fog info, and stores // these to temp memory in the RDP buffer. This is only used during vertex write @@ -2192,7 +2203,7 @@ vtx_setup_constants: vmrg sVPO, sVPO, $v23[1] // Put fog offset in elements 3,7 of vtrans llv $v30[8], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 4, 5 vmov sSTS[4], sSTS[0] - andi $11, $5, G_ATTROFFSET_ST_ENABLE >> 8 + andi $11, vGeomMid, G_ATTROFFSET_ST_ENABLE >> 8 vmrg sVPS, sVPS, $v23[0] // Put fog multiplier in elements 3,7 of vscale bnez $11, @@skipoffset lbu $7, mITValid @@ -2246,7 +2257,7 @@ vtx_setup_constants: .endif vtx_after_setup_constants: - andi $8, $5, G_LIGHTING >> 8 // Temp to be reused below, is outVtx2 + andi $8, vGeomMid, G_LIGHTING >> 8 // Temp to be reused below, is outVtx2 beqz $8, @@skip_lighting li $16, vtx_loop_no_lighting // This is clipFlags, but not modified li $16, lt_vtx_pair // during vtx_store @@ -2303,7 +2314,7 @@ vtx_after_lt_setup: vcopy vM3F, vM2F ldv vM1F[0], (mMatrix + 0x28)($zero) sltiu $11, $11, 1 // 0 if matrix valid, 1 if invalid - srl $7, $5, 9 // G_LIGHTING in bit 1 + srl $7, vGeomMid, 9 // G_LIGHTING in bit 1 and $7, $7, $11 // If lighting enabled and need to update matrix, and $7, $7, $10 // and computing mIT, ldv vM3F[0], (mMatrix + 0x38)($zero) @@ -2331,9 +2342,9 @@ vtx_after_calc_mit: ldv vVP0F[8], (vpMatrix + 0x20)($zero) ldv vVP2F[8], (vpMatrix + 0x30)($zero) .endif - andi $7, $5, G_FOG >> 8 // Nonzero if fog enabled + andi fogFlag, vGeomMid, G_FOG >> 8 // Nonzero if fog enabled .if CFG_LEGACY_VTX_PIPE && CFG_NO_OCCLUSION_PLANE // New LVP_NOC - srl $7, $7, 5 // 8 if G_FOG is set, 0 otherwise + srl fogFlag, fogFlag, 5 // 8 if G_FOG is set, 0 otherwise addi outVtx1, rdpCmdBufEndP1, vtxSize // Temp mem; fog writes up to vtxSize before jal while_wait_dma_busy // Wait for vertex load to finish move outVtx2, outVtx1 // for first pre-loop, same for outVtx2 @@ -2357,45 +2368,42 @@ vtx_after_calc_mit: // $v13 = first light dir, $v14:$v16 = Y/Z/vPairNrml/temp, $v17 = vPairLt/temp, // $v18:$v19 = available, $v20:$v21 = vPairPosI/F/temp, // $v22 = vPairST, $v23:$v24 = vPairTPosF/I/temp, $v25:$v26 = temps, $v27 = vPairRGBA, -// $v28 = vOne, $v29 = garbage, $v30 = params, $v31 = constants -// $1: 0x10 vtx count, $2: need for clipping, $3: temp, $4: vtx1/perf, -// $5: geom mode mid, $6: need for clipping, $7: fog flag, $8: outVtx2, -// $9: clipping / curLight, $10:$11: temp, $12: perf, $13: altBaseReg, $14: inVtx, -// $15: outVtxBase, $16: clipping / lt jump addr, $17:$18: clipping, $19: outVtx1, -// $20: temp, $21: clipping / first light, $22:$23: cmd buf, $24: temp, $25: cmd_w0 global, -// $26: taskDataPtr, $27: inputBufferPos, $28:$30: perf, $ra return addr +// $v28 = vOne, $v29 = garbage, $v30 = params/sSTO, $v31 = constants .align 8 .if CFG_NO_OCCLUSION_PLANE + + + vtx_loop_no_lighting: vmadh $v29, vM1I, vPairPosI[1h] - andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about + andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about vmadn vPairTPosF, vM2F, vPairPosI[2h] - or $10, $10, $11 // Combine results for first vertex + or flagsV1, flagsV1, $10 // Combine results for first vertex vmadh vPairTPosI, vM2I, vPairPosI[2h] - sh $10, (VTX_CLIP )(outVtx1) // Store first vertex flags + sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags // sKPI is $v11 // vtx_store Keep Int (keep across pipelining) // sKPG is vBBB = $v21 // vtx_store Keep Fog vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used) luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA // sCLZ is $v19 vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0 - addi $1, $1, -2*inputVtxSize // Decrement vertex count by 2 + addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2 vtx_return_from_lighting: vtx_store_for_clip: vmudl $v29, vPairTPosF, $v30[3] // Persp norm - sub $20, outVtx2, $7 // Points 8 before outVtx2 if fog, else 0 + sub $11, outVtx2, fogFlag // Points 8 before outVtx2 if fog, else 0 // s1WI is $v16 // vtx_store 1/W Int vmadm s1WI, vPairTPosI, $v30[3] // Persp norm addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx // s1WF is $v17 // vtx_store 1/W Frac vmadn s1WF, $v31, $v31[2] // 0 - sbv sKPG[15], (VTX_COLOR_A + 8)($20) // In VTX_SCR_Y if fog disabled... + sbv sKPG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled... // sKPF is $v18 // vtx_store Keep Frac vmov sKPF[1], sCLZ[2] - sbv sKPG[7], (VTX_COLOR_A + 8 - vtxSize)($20) // ...which gets overwritten below + sbv sKPG[7], (VTX_COLOR_A + 8 - vtxSize)($11) // ...which gets overwritten below // sSCF is $v20 // vtx_store Scaled Clipping Frac vmudn sSCF, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping ssv sCLZ[12], (VTX_SCR_Z )(outVtx2) @@ -2411,15 +2419,15 @@ vtx_store_for_clip: vrcph sRTI[3], s1WI[7] slv sKPF[2], (VTX_SCR_Z )(outVtx1) vrcpl sRTF[6], s1WF[7] - sra $24, $1, 31 // All 1s if on last iter + sra $11, vtxLeft, 31 // All 1s if on single-vertex last iter vrcph sRTI[7], $v31[2] // 0 - andi $24, $24, vtxSize // vtxSize if on last iter, else normally 0 + andi $11, $11, vtxSize // vtxSize if on single-vertex last iter, else normally 0 vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high - sub outVtx2, outVtxBase, $24 // First output vtx on last iter, else second + sub outVtx2, outVtxBase, $11 // First output vtx on last iter, else second vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low addi outVtx1, outVtxBase, -vtxSize // First output vtx always vmudl $v29, s1WF, sRTF[2h] - cfc2 $10, $vcc // Screen clip results + cfc2 flagsV1, $vcc // Screen clip results vmadm $v29, s1WI, sRTF[2h] sdv vPairTPosF[8], (VTX_FRAC_VEC )(outVtx2) vmadn s1WF, s1WF, sRTI[3h] @@ -2447,11 +2455,11 @@ vtx_store_for_clip: vmadn s1WF, s1WF, sRTI[3h] lsv vPairTPosI[6], (VTX_Z_INT )(outVtx1) // load Z into W slot, will be for fog below vmadh s1WI, s1WI, sRTI[3h] - srl $24, $10, 4 // Shift second vertex screen clipping to first slots + srl flagsV2, flagsV1, 4 // Shift second vertex screen clipping to first slots vcl $v29, vPairTPosF, sSCF[3h] // Clip scaled low - andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + andi flagsV2, flagsV2, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about vcopy vPairST, sTCL - cfc2 $20, $vcc // Scaled clip results + cfc2 $11, $vcc // Scaled clip results vmudl $v29, vPairTPosF, s1WF[3h] // Pos times inv W ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2) vmadm $v29, vPairTPosI, s1WF[3h] // Pos times inv W @@ -2462,9 +2470,9 @@ vtx_store_for_clip: vmadh vPairTPosI, vPairTPosI, s1WI[3h] // vPairTPosI:vPairTPosF = pos times inv W addi inVtx, inVtx, (2 * inputVtxSize) // Advance two positions forward in the input vertices vmov sTCL[4], vPairST[2] // First vtx RG to elem 4 - andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + andi flagsV1, flagsV1, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about vmov sTCL[5], vPairST[3] // First vtx BA to elem 5 - sll $11, $20, 4 // Shift first vertex scaled clipping to second slots + sll $10, $11, 4 // Shift first vertex scaled clipping to second slots vmudl $v29, vPairTPosF, $v30[3] // Persp norm ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1) vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm @@ -2476,14 +2484,14 @@ vtx_store_for_clip: vmudh $v29, sVPO, vOne // offset * 1 slv sST2[0], (VTX_TC_VEC )(outVtx1) // Store scaled S, T vertex 1 vmadh $v29, sFGM, $v31[6] // + (0,0,0,1,0,0,0,1) * 0x7F00 - andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about + andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about vmadn sKPF, vPairTPosF, sVPS // + pos frac * scale - or $24, $24, $20 // Combine results for second vertex + or flagsV2, flagsV2, $11 // Combine results for second vertex vmadh sKPI, vPairTPosI, sVPS // int part, sKPI:sKPF is now screen space pos - sh $24, (VTX_CLIP )(outVtx2) // Store second vertex clip flags + sh flagsV2, (VTX_CLIP )(outVtx2) // Store second vertex clip flags vtx_store_loop_entry: vmudn $v29, vM3F, vOne - blez $1, vtx_epilogue + blez vtxLeft, vtx_epilogue vmadh $v29, vM3I, vOne vmadn $v29, vM0F, vPairPosI[0h] sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order @@ -2493,10 +2501,10 @@ vtx_store_loop_entry: vtx_epilogue: vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used) - andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about + andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0 - or $10, $10, $11 // Combine results for first vertex - beqz $7, @@skip_fog + or flagsV1, flagsV1, $10 // Combine results for first vertex + beqz fogFlag, @@skip_fog slv sKPI[8], (VTX_SCR_VEC )(outVtx2) sbv sKPG[15], (VTX_COLOR_A )(outVtx2) sbv sKPG[7], (VTX_COLOR_A )(outVtx1) @@ -2507,7 +2515,7 @@ vtx_epilogue: ssv sKPF[12], (VTX_SCR_Z_FRAC )(outVtx2) bltz $ra, clip_after_vtx_store // $ra - from clipping or + from while_wait_dma_busy slv sKPF[2], (VTX_SCR_Z )(outVtx1) - sh $10, (VTX_CLIP )(outVtx1) // Store first vertex flags + sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags j vertex_end lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store @@ -2530,7 +2538,6 @@ vtx_epilogue: -vtx_return_from_lighting: TODO vtx_store_for_clip: @@ -2538,30 +2545,33 @@ vtx_store_for_clip: bltz $ra, clip_after_vtx_store - // $3 available // Permanent: - // $v0:$v7 = MVP, $v10 = sSTS, $v13 = first light dir, + // $v0:$v7 = MVP, $v10 = sSTS, $v12 = avail for lighting, $v13 = first light dir, // $v28 = vOne, $v29 = garbage, $v30 = params/sSTO, $v31 = constants // Uses but otherwise temp: // $v20 = vPairPosI, $v22 = vPairST, $v23:$v24 = vPairTPosF/I = vAAA/vBBB, $v27 = vPairRGBA // Need during lighting, otherwise temp: - // $v14:$v16 = Y/Z/vPairNrml/temp, $v17 = vPairLt/temp, $v25:$v26 = vCCC/vDDD - // Always available: - // $v8:$v9, $v11:$v12, $v18:$v19, $v21 + // $v14:$v16 = Y/Z/vPairNrml, $v17 = vPairLt, $v25:$v26 = vCCC/vDDD +sO03 equ $v8 +sO47 equ $v9 +sOCM equ $v11 +sKPI equ $v18 +sKPF equ $v19 +sOC2 equ $v21 - // Kept across lighting: sKPI, sKPF, sOC2, sO47 - - // 71 cycles, 17 more than NOC - // 6 vu cycles for plane, 8 vu cycles for edges, 1 more vnop than NOC, + // 70 cycles, 16 more than NOC + // 6 vu cycles for plane, 8 vu cycles for edges, 0 more vnops than NOC, // 1 branch delay slot with SU instr, 1 land-after-branch. + + vtx_loop_no_lighting: veq $v29, $v31, $v31[0q] // Set VCC to 10101010 - sub $20, outVtx1, $7 // Points 8 before outVtx1 if fog, else 0 + sub $11, outVtx1, fogFlag // Points 8 before outVtx1 if fog, else 0 // sOC3 is $v27 = vPairRGBA vmrg sOC2, sOC2, sOC3 // Elems 0-3 are results for vtx 0, 4-7 for vtx 1 // sFOG is $v24 = vBBB - sbv sFOG[7], (VTX_COLOR_A + 8)($20) // ...which gets overwritten below + sbv sFOG[7], (VTX_COLOR_A + 8)($11) // ...which gets overwritten below // sCLZ is $v23 = vAAA vmrg sKPF, sKPF, sCLZ[2h] // Z int elem 2, 6 to elem 1, 5; Z frac in elem 2, 6 luv vPairRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA @@ -2569,10 +2579,11 @@ vtx_loop_no_lighting: slv sKPI[8], (VTX_SCR_VEC )(outVtx2) vmadh vPairST, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5 addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx +vtx_return_from_lighting: vge $v29, sOC2, sO47 // Each compare to coeffs 4-7 slv sKPI[0], (VTX_SCR_VEC )(outVtx1) vmudn $v29, vM3F, vOne - cfc2 $20, $vcc + cfc2 $11, $vcc vmadh $v29, vM3I, vOne slv sKPF[10], (VTX_SCR_Z )(outVtx2) vmadn $v29, vM0F, vPairPosI[0h] @@ -2580,127 +2591,133 @@ vtx_loop_no_lighting: vmadh $v29, vM0I, vPairPosI[0h] slv sKPF[2], (VTX_SCR_Z )(outVtx1) vmadn $v29, vM1F, vPairPosI[1h] - or $20, $20, $11 // Combine occlusion results. Any set in 0-3, 4-7 = not occluded + or $11, $11, $10 // Combine occlusion results. Any set in 0-3, 4-7 = not occluded vmadh $v29, vM1I, vPairPosI[1h] slv vPairST[8], (VTX_TC_VEC )(outVtx2) // Store scaled S, T vertex 2 vmadn vPairTPosF, vM2F, vPairPosI[2h] - andi $11, $20, 0x000F // Bits 0-3 for vtx 1 + andi $10, $11, 0x000F // Bits 0-3 for vtx 1 vmadh vPairTPosI, vM2I, vPairPosI[2h] - bnez $11, @@skipv1 // If nonzero, at least one equation false, don't set occluded flag + bnez $10, @@skipv1 // If nonzero, at least one equation false, don't set occluded flag slv vPairST[0], (VTX_TC_VEC )(outVtx1) // Store scaled S, T vertex 1 - ori $10, $10, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag + ori flagsV1, flagsV1, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag @@skipv1: // 16 cycles vmudl $v29, vPairTPosF, $v30[3] // Persp norm - addi $20, $20, -(0x0010) // If not occluded, atl 1 of 4-7 set, so $20 >= 0x10. Else $20 < 0x10. + addi $11, $11, -(0x0010) // If not occluded, atl 1 of 4-7 set, so $11 >= 0x10. Else $11 < 0x10. s1WI equ $v20 // = vPairPosI vmadm s1WI, vPairTPosI, $v30[3] // Persp norm - andi $20, $20, CLIP_OCCLUDED // This is bit 11, = sign bit b/c |$20| <= 0xFF + andi $11, $11, CLIP_OCCLUDED // This is bit 11, = sign bit b/c |$11| <= 0xFF s1WF equ $v22 // = vPairST vmadn s1WF, $v31, $v31[2] // 0 - or $24, $24, $20 // occluded = $20 negative = sign bit set = $20 is flag, else 0 + or flagsV2, flagsV2, $11 // occluded = $11 negative = sign bit set = $11 is flag, else 0 vmudn $v29, vPairTPosF, sOCM // X * kx, Y * ky, Z * kz - sh $24, (VTX_CLIP )(outVtx2) // Store second vertex clip flags + sh flagsV2, (VTX_CLIP )(outVtx2) // Store second vertex clip flags vmadh $v29, vPairTPosI, sOCM // Int * int - sh $10, (VTX_CLIP )(outVtx1) // Store first vertex flags + sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags vrcph $v29[0], s1WI[3] - blez $1, vertex_end TODO + blez vtxLeft, vertex_end TODO +sRTF equ $v25 // = vCCC vrcpl sRTF[2], s1WF[3] +sRTI equ $v26 // = vDDD vrcph sRTI[3], s1WI[7] - addi $1, $1, -2*inputVtxSize // Decrement vertex count by 2 + addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2 vrcpl sRTF[6], s1WF[7] - sra $24, $1, 31 // All 1s if on last iter + sra $11, vtxLeft, 31 // All 1s if on single-vertex last iter vrcph sRTI[7], $v31[2] // 0 - andi $24, $24, vtxSize // vtxSize if on last iter, else normally 0 + andi $11, $11, vtxSize // vtxSize if on single-vertex last iter, else normally 0 +sOC1 equ $v21 // = sOC2 vreadacc sOC1, ACC_UPPER // Load int * int portion - ldv sTCL[0], (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1, RGBA in 2:3 + sub outVtx2, outVtxBase, $11 // First output vtx on last iter, else second vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high - sub outVtx2, outVtxBase, $24 // First output vtx on last iter, else second - vmudl $v29, s1WF, sRTF[2h] addi outVtx1, outVtxBase, -vtxSize // First output vtx always - vmadm $v29, s1WI, sRTF[2h] - suv vPairRGBA[4], (VTX_COLOR_VEC )(outVtx2) // Store RGBA for second vtx - vmadn s1WF, s1WF, sRTI[3h] - suv vPairRGBA[0], (VTX_COLOR_VEC )(outVtx1) // Store RGBA for first vtx - vmadh s1WI, s1WI, sRTI[3h] + vmudl $v29, s1WF, sRTF[2h] sdv vPairTPosI[8], (VTX_INT_VEC )(outVtx2) - vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low + vmadm $v29, s1WI, sRTF[2h] sdv vPairTPosI[0], (VTX_INT_VEC )(outVtx1) - vmudh $v29, vOne, $v31[4] // 4 - cfc2 $10, $vcc // Screen clip results - vmadn s1WF, s1WF, $v31[0] // -4 + vmadn s1WF, s1WF, sRTI[3h] sdv vPairTPosF[8], (VTX_FRAC_VEC )(outVtx2) - vmadh s1WI, s1WI, $v31[0] // -4 - ldv sTCL[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5, RGBA in 6:7 -sSCF equ $v27 // = vPairRGBA - vmudn sSCF, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping + vmadh s1WI, s1WI, sRTI[3h] sdv vPairTPosF[0], (VTX_FRAC_VEC )(outVtx1) + vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low + sqv vPairTPosI, (tempVPairTPosI)(rdpCmdBufEndP1) // For Z to W manip + vmudh $v29, vOne, $v31[4] // 4 + cfc2 flagsV1, $vcc // Screen clip results + vmadn s1WF, s1WF, $v31[0] // -4 + ssv vPairTPosI[4], (tempVPairTPosI + 6)(rdpCmdBufEndP1) // First Z to W + vmadh s1WI, s1WI, $v31[0] // -4 +sTCL equ $v17 // = vPairLt + ldv sTCL[0], (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1, RGBA in 2:3 +sSCF equ $v19 // = sKPF + vmudn sSCF, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping + ssv vPairTPosI[12], (tempVPairTPosI + 14)(rdpCmdBufEndP1) // Second Z to W +sSCI equ $v18 // = sKPI vmadh sSCI, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping lsv vPairTPosF[14], (VTX_Z_FRAC )(outVtx2) // load Z into W slot, will be for fog below - // vnop - lsv vPairTPosI[14], (VTX_Z_INT )(outVtx2) // load Z into W slot, will be for fog below vmudl $v29, s1WF, sRTF[2h] - lsv vPairTPosI[6], (VTX_Z_INT )(outVtx1) // load Z into W slot, will be for fog below + lqv vPairTPosI, (tempVPairTPosI)(rdpCmdBufEndP1) // Load int part with Z in W vmadm $v29, s1WI, sRTF[2h] lsv vPairTPosF[6], (VTX_Z_FRAC )(outVtx1) // load Z into W slot, will be for fog below vmadn s1WF, s1WF, sRTI[3h] - srl $24, $10, 4 // Shift second vertex screen clipping to first slots + ldv sTCL[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5, RGBA in 6:7 vmadh s1WI, s1WI, sRTI[3h] - lqv sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Load viewport offset + srl flagsV2, flagsV1, 4 // Shift second vertex screen clipping to first slots vch $v29, vPairTPosI, sSCI[3h] // Clip scaled high - andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + andi flagsV2, flagsV2, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about vcl $v29, vPairTPosF, sSCF[3h] // Clip scaled low - andi $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about + suv vPairRGBA[4], (VTX_COLOR_VEC )(outVtx2) // Store RGBA for second vtx vmudl $v29, vPairTPosF, s1WF[3h] // Pos times inv W - ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2) + cfc2 $11, $vcc // Scaled clip results vmadm $v29, vPairTPosI, s1WF[3h] // Pos times inv W - cfc2 $20, $vcc // Scaled clip results + suv vPairRGBA[0], (VTX_COLOR_VEC )(outVtx1) // Store RGBA for first vtx vmadn vPairTPosF, vPairTPosF, s1WI[3h] - ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1) + andi flagsV1, flagsV1, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about vmadh vPairTPosI, vPairTPosI, s1WI[3h] // vPairTPosI:vPairTPosF = pos times inv W -sVPS equ $v27 // = sSCF, = vPairRGBA - lqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale + ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2) +sOC4 equ $v27 // = vPairRGBA + vadd sOC4, sOC1, sOC1[1h] // Add Y to X + ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1) +// vPairST is s1WF vcopy vPairST, sTCL ssv s1WI[14], (VTX_INV_W_INT )(outVtx2) - vadd sOC4, sOC1, sOC1[1h] // Add Y to X - sll $11, $20, 4 // Shift first vertex scaled clipping to second slots vmudl $v29, vPairTPosF, $v30[3] // Persp norm ssv s1WI[6], (VTX_INV_W_INT )(outVtx1) vmadm vPairTPosI, vPairTPosI, $v30[3] // Persp norm - andi $20, $20, CLIP_SCAL_NPXY // Mask to only bits we care about +sVPO equ $v20 // = vPairPosI = s1WI + lqv sVPO, (tempViewportOffset)(rdpCmdBufEndP1) // Load viewport offset vmadn vPairTPosF, $v31, $v31[2] // 0; Now vPairTPosI:vPairTPosF = projected position - or $24, $24, $20 // Combine results for second vertex +sVPS equ $v25 // = sRTF + lqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale vadd sOC1, sOC4, sOC1[2h] // Add Z to X - andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about + // nop vmov sTCL[4], vPairST[2] // First vtx RG to elem 4 - ldv vPairPosI[0], (VTX_IN_OB + 0 * inputVtxSize)(inVtx) // Pos of 1st vector for next iteration + sll $10, $11, 4 // Shift first vertex scaled clipping to second slots vmudh $v29, sVPO, vOne // offset * 1 - ldv vPairPosI[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // Pos of 2nd vector on next iteration + andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about vmadn sKPF, vPairTPosF, sVPS // + pos frac * scale - ldv sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3 + or flagsV2, flagsV2, $11 // Combine results for second vertex vmadh sKPI, vPairTPosI, sVPS // int part, sKPI:sKPF is now screen space pos - ldv sO03[8], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // and for vtx 2 -sFOG equ $v24 // = vPairTPosI = vBBB + andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about +sFOG equ $v24 // = vPairTPosI vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog - or $10, $10, $11 // Combine results for first vertex + or flagsV1, flagsV1, $10 // Combine results for first vertex vlt $v29, sOC1, sOCM[3h] // Occlusion plane X+Y+Z= 0 (want low byte only) - ldv sO47[8], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // and for vtx 2 + ldv vPairPosI[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // Pos of 2nd vector on next iteration // vnop - andi $11, $11, (1 << 0) | (1 << 4) // Only bits 0, 4 from occlusion + andi $10, $10, (1 << 0) | (1 << 4) // Only bits 0, 4 from occlusion vmulf $v29, sOPM, sKPI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2 - sub $20, outVtx2, $7 // Points 8 before outVtx2 if fog, else 0 + sub $11, outVtx2, fogFlag // Points 8 before outVtx2 if fog, else 0 vmacf sOC2, sO03, sOSC[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2 sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order vmulf $v29, sOPM, sKPI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2 - sbv sFOG[15], (VTX_COLOR_A + 8)($20) // In VTX_SCR_Y if fog disabled... + sbv sFOG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled... sOC3 equ $v27 // = vPairRGBA = sOSC vmacf sOC3, sO03, sOSC[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2 jr $ra // lt_vtx_pair or vtx_loop_no_lighting @@ -2733,7 +2750,7 @@ vtx_return_from_lighting: @@skipsecond: .else vclr sSTO - andi $11, $5, G_ATTROFFSET_ST_ENABLE >> 8 + andi $11, vGeomMid, G_ATTROFFSET_ST_ENABLE >> 8 vmudn $v29, vVP3F, vOne beqz $11, @@skipoffset vmadh $v29, vVP3I, vOne @@ -2767,7 +2784,7 @@ vtx_store_for_clip: vmudl $v29, vPairTPosF, $v30[3] // Persp norm move outVtx2, outVtxBase // Second and output vertices write to same mem... vmadm s1WI, vPairTPosI, $v30[3] // Persp norm - bltz $1, @@skipsecond // ...if < 0 verts remain, ... + bltz vtxLeft, @@skipsecond // ...if < 0 verts remain, ... vmadn s1WF, $v31, $v31[2] // 0 addi outVtx2, outVtxBase, vtxSize // ...otherwise, second vtx is next vtx @@skipsecond: @@ -2888,19 +2905,19 @@ vtx_store_loop_entry: vmudn $v29, vM3F, vOne slv vPairTPosI[0], (VTX_SCR_VEC )(outVtx1) vmadh $v29, vM3I, vOne - blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping + blez vtxLeft, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping vmadn $v29, vM0F, vPairPosI[0h] move $ra, $16 // Normally $ra = loop or lighting skip_return_to_lt_or_loop: vmadh $v29, vM0I, vPairPosI[0h] - addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize + addi vtxLeft, vtxLeft, -2*inputVtxSize // Counter of remaining verts * inputVtxSize vmadn $v29, vM1F, vPairPosI[1h] ssv sCLZ[12], (VTX_SCR_Z )(outVtx2) vmadh $v29, vM1I, vPairPosI[1h] ssv sCLZ[4], (VTX_SCR_Z )(outVtx1) // sOUTF = vPairPosF is $v21, or vPairTPosF is $v23 vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords - beqz $7, return_and_end_mat // fog disabled + beqz fogFlag, return_and_end_mat // fog disabled // sOUTI = vPairPosI is $v20, or vPairTPosI is $v24 vmadh sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords sbv sFOG[15], (VTX_COLOR_A )(outVtx2) @@ -2940,7 +2957,7 @@ skip_return_to_lt_or_loop: vmadh s1WI, s1WI, sRTI[3h] // s1WI:s1WF is 1/W andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about veq $v29, $v31, $v31[3h] // Set VCC to 00010001 - blez $1, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping + blez vtxLeft, skip_return_to_lt_or_loop // $ra left as vertex_end or clipping vmrg sOC1, sOCM, sOC1 // Put constant factor in elems 3, 7 vtx_store_loop_entry: move $ra, $16 // Normally $ra = loop or lighting @@ -2960,7 +2977,7 @@ skip_return_to_lt_or_loop: .endif // vnop .if CFG_LEGACY_VTX_PIPE - addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize + addi vtxLeft, vtxLeft, -2*inputVtxSize // Counter of remaining verts * inputVtxSize .else // sVPS is $v16 // vtx_store ViewPort Scale lqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Load viewport scale @@ -3016,7 +3033,7 @@ skip_return_to_lt_or_loop: vmacf sOC2, sO03, sOSC[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2 ldv sO47[8], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // and for vtx 2 vmulf $v29, sOPM, vPairTPosI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2 - beqz $7, @@skipfog // fog disabled + beqz fogFlag, @@skipfog // fog disabled // sOC3 is $v21 // vtx_store OCclusion temp 3 vmacf sOC3, sO03, sOSC[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2 sbv sFOG[15], (VTX_COLOR_A )(outVtx2) @@ -3037,7 +3054,7 @@ skip_return_to_lt_or_loop: .if CFG_LEGACY_VTX_PIPE lpv $v15[6], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4 .else - addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize + addi vtxLeft, vtxLeft, -2*inputVtxSize // Counter of remaining verts * inputVtxSize .endif // vnop ssv sCLZ[4], (VTX_SCR_Z )(outVtx1) @@ -3143,7 +3160,7 @@ load_overlay_inner: addi perfCounterC, perfCounterC, 0x4000 // Increment overlay (all 0-4) load count .endif .if CFG_PROFILING_C - mfc0 $9, DPC_CLOCK // see below + mfc0 ovlInitClock, DPC_CLOCK // see below .endif jal shared_dma_read_write // If CFG_PROFILING_C, use the one without perfCounterD add cmd_w1_dram, cmd_w1_dram, $11 @@ -3158,7 +3175,7 @@ load_overlay_inner: bnez $11, @@while_dma_busy mfc0 $11, SP_DMA_BUSY mfc0 $11, DPC_CLOCK - sub $11, $11, $9 + sub $11, $11, ovlInitClock jr $ra add perfCounterD, perfCounterD, $11 @@ -3222,8 +3239,8 @@ dma_write: // The action here is controlled by $1. If yielding, $1 > 0. If this was // G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0. ovl0_start: - sub $8, rdpCmdBufPtr, rdpCmdBufEndP1 - addi $10, $8, (RDP_CMD_BUFSIZE + 8) - 1 // Does the current buffer contain anything? + sub cmdBufOver, rdpCmdBufPtr, rdpCmdBufEndP1 + addi $10, cmdBufOver, (RDP_CMD_BUFSIZE + 8) - 1 // Does the current buffer contain anything? bgezal $10, flush_rdp_buffer // - 1 because there is no bgtzal instruction add taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr was where in the DL after the current chunk loaded jal while_wait_dma_busy // Wait for possible RDP flush to finish @@ -3312,11 +3329,11 @@ ovl1_start: G_POPMTX_handler: lw $11, matrixStackPtr // Get the current matrix stack pointer lw $2, OSTask + OSTask_dram_stack // Read the location of the dram stack - sub cmd_w1_dram, $11, cmd_w1_dram // Decrease the matrix stack pointer by the amount passed in the second command word - sub $1, cmd_w1_dram, $2 // Subtraction to check if the new pointer is greater than or equal to $2 + sub cmd_w1_dram, $11, cmd_w1_dram // Decrease the matrix stack pointer by the amount passed in the second command word + sub $1, cmd_w1_dram, $2 // Subtraction to check if the new pointer is greater than or equal to $2 bgez $1, do_popmtx // If the new matrix stack pointer is greater than or equal to $2, then use the new pointer as is nop - move cmd_w1_dram, $2 // If the new matrix stack pointer is less than $2, then use $2 as the pointer instead + move cmd_w1_dram, $2 // If the new matrix stack pointer is less than $2, then use $2 as the pointer instead do_popmtx: beq cmd_w1_dram, $11, run_next_DL_command // If no bytes were popped, then we don't need to make the mvp matrix as being out of date and can run the next command sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer with the new value @@ -3486,9 +3503,9 @@ lt_vtx_pair: .endif j lt_continue_setup .if CFG_LEGACY_VTX_PIPE - lbu $21, numLightsxSize + lbu ambLight, numLightsxSize .else - andi $11, $5, G_PACKED_NORMALS >> 8 + andi $11, vGeomMid, G_PACKED_NORMALS >> 8 .endif .if !CFG_LEGACY_VTX_PIPE @@ -3518,10 +3535,11 @@ lt_continue_setup: // LVP lighting setup // llv $v30[12], (aoAmbientFactor - altBase)(altBaseReg) // Ambient and dir to elems 6, 7 + TODO lb $11, dirLightsXfrmValid li $10, -1 // To mark lights valid - addi $21, $21, altBase // Point to ambient light; stored through vtx proc - andi $17, $5, G_TEXTURE_GEN >> 8 // This is clipPolyRead, but not touched in vtx_store + addi ambLight, ambLight, altBase // Point to ambient light; stored through vtx proc + andi $17, vGeomMid, G_TEXTURE_GEN >> 8 // This is clipPolyRead, but not touched in vtx_store and $11, $11, $7 // Zero if either matrix or lights invalid bnez $11, lt_setup_after_xfrm sb $10, dirLightsXfrmValid @@ -3574,6 +3592,8 @@ xfrm_dir_lights: swv $v18[4], (tempXfrmSingle)(rdpCmdBufEndP1) // Stores O R U - O R U - vmudh $v29, $v0, $v3[0h] lqv $v18, (tempXfrmSingle)(rdpCmdBufEndP1) + // This is slightly wrong, vmrg writes accum lo. But only affects lookat and + // we are only reading accum mid result. Basically rounding error. vmrg $v17, $v17, $v21 // N Q T - N Q T - swv $v2[4], (tempXfrmSingle)(rdpCmdBufEndP1) // Stores C F I - C F I - vmadh $v29, $v1, $v3[1h] @@ -3596,7 +3616,7 @@ xfrm_light_loop_2: vmudh $v29, $v4, $v4 // Squared sub $10, curLight, altBaseReg // Is curLight (write ptr) <= 0? vreadacc $v7, ACC_MIDDLE // Read not-clamped value - sub $11, curLight, $21 // Is curLight (write ptr) <, =, or > ambient light? + sub $11, curLight, ambLight // Is curLight (write ptr) <, =, or > ambient light? vreadacc $v6, ACC_UPPER sw $20, (tempXfrmSingle)(rdpCmdBufEndP1) // Store light 0 vmudm $v29, $v19, $v23[0h] // Vec int * frac scaling @@ -3625,7 +3645,7 @@ xfrm_light_loop_2: lt_setup_after_xfrm: // Load first light direction to $v13, which is not used throughout vtx processing. j vtx_after_lt_setup - lpv $v13[0], (ltBufOfs + 8 - lightSize)($21) // Xfrmed dir in elems 4-6 + lpv $v13[0], (ltBufOfs + 8 - lightSize)(ambLight) // Xfrmed dir in elems 4-6 xfrm_light_store_lookat: vmadh $v29, $v1, $v3[1h] @@ -3634,73 +3654,163 @@ xfrm_light_store_lookat: j xfrm_light_loop_2 vmadn $v29, $v18, $v3[2h] +// Light loop contents + +.if CFG_NO_OCCLUSION_PLANE + +.macro instan_lt_vec_1 + vmadh $v29, vM1I, vPairPosI[1h] +.endmacro +.macro instan_lt_vec_2 + vmadn vPairTPosF, vM2F, vPairPosI[2h] +.endmacro +.macro instan_lt_vec_3 + vmadh vPairTPosI, vM2I, vPairPosI[2h] +.endmacro +.macro instan_lt_scl_1 + andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about +.endmacro +.macro instan_lt_scl_2 + or flagsV1, flagsV1, $10 // Combine results for first vertex +.endmacro +.macro instan_lt_vs_45 + vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used) + addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2 + vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0 + sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags +.endmacro + +.else + +.macro instan_lt_vec_1 + veq $v29, $v31, $v31[0q] // Set VCC to 10101010 +.endmacro +.macro instan_lt_vec_2 + vmrg sOC2, sOC2, sOC3 // Elems 0-3 are results for vtx 0, 4-7 for vtx 1 +.endmacro +.macro instan_lt_vec_3 + vmrg sKPF, sKPF, sCLZ[2h] // Z int elem 2, 6 to elem 1, 5; Z frac in elem 2, 6 +.endmacro +.macro instan_lt_scl_1 + sub $11, outVtx1, fogFlag // Points 8 before outVtx1 if fog, else 0 +.endmacro +.macro instan_lt_scl_2 + sbv sFOG[7], (VTX_COLOR_A + 8)($11) +.endmacro +.macro instan_lt_vs_45 + vmudm $v29, vPairST, sSTS // Scale ST + slv sKPI[8], (VTX_SCR_VEC )(outVtx2) + vmadh vPairST, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5 + addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx +.endmacro -.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC -.align 8 .endif -lt_vtx_pair: -// -// LVP main lighting -// + +// $11 can be used as a temporary, except between instan_lt_scl_1 and instan_lt_scl_2 + +TODO $2 sign bit set if AO enabled +TODO LTTEMP is some reg maybe $11 + +.align 8 +lt_start_packed_ao: + instan_lt_vec_1 + beqz TODO_PACKED_NORMALS, lt_start_ao + instan_lt_vec_2 + instan_lt_vec_3 + luv vPairLt, (ltBufOfs + 0)(ambLight) // Total light level, init to ambient + vmudn $v14, vPairPosI, $v13[1] // (1 << 5); left shift normals Y + sra LTTEMP, $2, 31 // All 1s if AO enabled + vand vPairNrml, vPairPosI, $v13[2] // 0xF800; mask X to only top 5 bits + nor LTTEMP, LTTEMP, $zero // All 1s if AO disabled + vmudn $v15, vPairPosI, $v13[0] // (1 << 11); left shift normals Z + ctc2 LTTEMP, $vcc // Set VCC to all 1s if AO disabled, else all 0s + vmrg vPairLt, vPairLt, $v31[2] // 0; clear vPairLt if AO enabled + j lt_after_start + vand $v14, $v14, $v13[3] // 0xFC00; mask Y to only top 6 bits + +lt_start_ao: + lpv vPairNrml[3], (tempVpRGBA)(rdpCmdBufEndP1) // X to elem 3, 7 + instan_lt_vec_3 + lpv $v15[1], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 3, 7 + vclr vPairLt + j lt_after_start + lpv $v14[2], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 3, 7 + +.align 8 +lt_start_standard: + instan_lt_vec_1 + lpv vPairNrml[3], (tempVpRGBA)(rdpCmdBufEndP1) // X to elem 3, 7 + instan_lt_vec_2 + lpv $v15[1], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 3, 7 + instan_lt_vec_3 + lpv $v14[2], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 3, 7 + // vnop + luv vPairLt, (ltBufOfs + 0)(ambLight) // Total light level, init to ambient +lt_after_start: .if CFG_PROFILING_B addi perfCounterA, perfCounterA, 2 // Increment lit vertex count by 2 .endif -.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC - vmadh $v29, vM1I, vPairPosI[1h] - lpv vPairNrml[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair normals - vmadn vPairTPosF, vM2F, vPairPosI[2h] - lpv $v14[7], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Y to elem 0, 4 - vmadh vPairTPosI, vM2I, vPairPosI[2h] - lpv $v15[6], (tempVpRGBA - 8)(rdpCmdBufEndP1) // Z to elem 0, 4 + vmulf $v29, vPairNrml, $v13[4] // Normals X elems 3, 7 * first light dir + lpv vPairRGBA[0], (ltBufOfs + 8 - 2*lightSize)(ambLight) // Xfrmed dir in elems 4-6; temp reg + vmacf $v29, $v15, $v13[6] // Normals Z elems 3, 7 * first light dir + luv vDDD[0], (tempVpRGBA)(rdpCmdBufEndP1) // Load RGBA + vmacf vAAA, $v14, $v13[5] // Normals Y elems 3, 7 * first light dir + instan_lt_scl_1 // vnop - andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about -.endif - vmulf $v29, vPairNrml, $v13[4] // Normals X elems 0, 4 * first light dir - luv vPairLt, (ltBufOfs + 0)($21) // Total light level, init to ambient - vmacf $v29, $v14, $v13[5] // Normals Y elems 0, 4 * first light dir - lpv vDDD[0], (ltBufOfs + 8 - 2*lightSize)($21) // Xfrmed dir in elems 4-6 - vmacf vAAA, $v15, $v13[6] // Normals Z elems 0, 4 * first light dir -.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC - or $10, $10, $11 // Combine results for first vertex - vmulf vPairRGBA, vPairNrml, $v31[5] // 0x4000; right shift vtx alpha from lpv -.else - // nop + beq ambLight, altBaseReg, lt_post + instan_lt_scl_2 // vnop -.endif - beq $21, altBaseReg, lt_post -.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC - addi $1, $1, -2*inputVtxSize // Decrement vertex count by 2 -.else - lpv ltLookAt[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut -.endif - // vnop - move curLight, $21 // Point to ambient light + move curLight, ambLight // Point to ambient light lt_loop: vge vCCC, vAAA, $v31[2] // 0; clamp dot product to >= 0 - vmulf $v29, vPairNrml, vDDD[4] // Normals X elems 0, 4 + vmulf $v29, vPairNrml, vPairRGBA[4] // Normals X elems 3, 7 * next light dir luv vBBB, (ltBufOfs + 0 - 1*lightSize)(curLight) // Light color - vmacf $v29, $v14, vDDD[5] // Normals Y elems 0, 4 + vmacf $v29, $v14, vPairRGBA[5] // Normals Y elems 3, 7 * next light dir addi curLight, curLight, -lightSize - vmacf vAAA, $v15, vDDD[6] // Normals Z elems 0, 4 - lpv vDDD[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6 + vmacf vAAA, $v15, vPairRGBA[6] // Normals Z elems 3, 7 * next light dir + lpv vPairRGBA[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6; DOES dual-issue vmudh $v29, vOne, vPairLt // Load accum mid with current light level bne curLight, altBaseReg, lt_loop - vmacf vPairLt, vBBB, vCCC[0h] // + light color * dot product + vmacf vPairLt, vBBB, vCCC[3h] // + light color * dot product lt_post: -.if CFG_NO_OCCLUSION_PLANE // New LVP_NOC - vge sKPG, sKPI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used) - lpv ltLookAt[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut - vge sCLZ, sKPI, $v31[2] // 0; clamp Z to >= 0 - sh $10, (VTX_CLIP )(outVtx1) // Store first vertex flags + instan_lt_vs_45 // Starts using vBBB vne $v29, $v31, $v31[3h] // Set VCC to 11101110 - beqz $17, vtx_return_from_lighting - vmrg vPairRGBA, vPairLt, vPairRGBA // RGB = light, A = vtx alpha -.else - beqz $17, vtx_early_return_from_lighting - vne $v29, $v31, $v31[3h] // Set VCC to 11101110 - vmrg vPairRGBA, vPairLt, vPairRGBA // RGB = light, A = vtx alpha -.endif -// Texgen uses vLookat0:1 = vPairLt and VAAA, vCCC:vDDD, and of course vPairST. + beqz TODO_NO_EXTRAS, vtx_return_from_lighting + vmrg vPairRGBA, vPairLt, vDDD // RGB = light, A = vtx alpha + // AO + vsub vPairRGBA, vDDD, $v31[7] // 0x7FFF; offset alpha + bgez $2, lt_skip_ao + vmudh $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15) + vmadm vCCC, vPairRGBA, $v30[7] // + (alpha - 1) * aoDir factor; elems 3, 7 + luv vAAA, (ltBufOfs + 0)(ambLight) // Ambient light level + vmudh $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15) + vmadm vPairRGBA, vPairRGBA, $v30[6] // + (alpha - 1) * aoAmb factor; elems 3, 7 + vmulf $v29, vPairLt, vCCC[3h] // Sum of dir lights *= dir factor + vmacf vPairLt, vAAA, vPairRGBA[3h] // + ambient * amb factor +lt_skip_ao: +vLtRGBOut equ vPairRGBA +vLtAOut equ vAAA + vmulf vLtRGBOut, vDDD, vPairLt // RGB output is RGB * light + beqz TODO, lt_skip_cel + vcopy vLtAOut, vDDD // Alpha output = vertex alpha (only 3, 7 matter) + // Cel: alpha = max of light components, RGB = vertex color + vge vLtAOut, vPairLt, vPairLt[1h] // elem 0 = max(R0, G0); elem 4 = max(R1, G1) + vge vLtAOut, vLtAOut, vLtAOut[2h] // elem 0 = max(R0, G0, B0); equiv for elem 4 + vcopy vLtRGBOut, vDDD // RGB output is vertex color + vmudh vLtAOut, vOne, vLtAOut[0h] // move light level elem 0, 4 to 3, 7 +lt_skip_cel: + vne $v29, $v31, $v31[3h] // Set VCC to 11101110 + bnez TODO, lt_skip_novtxcolor + lpv ltLookAt[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6; = vNrmOut + vcopy vLtRGBOut, vPairLt // If no packed normals, base output is just light +lt_skip_novtxcolor: + // vnop + beqz TODO, vtx_return_from_lighting + vmrg vPairRGBA, vLtRGBOut, vLtAOut // Merge base output and alpha output + + + +// Texgen uses vLookat0:1 = vPairLt and vAAA, vCCC:vDDD, and of course vPairST. vmulf $v29, vPairNrml, ltLookAt[0] // Normals X elems 0, 4 * lookat 0 X vmacf $v29, $v14, ltLookAt[1] // Normals Y elems 0, 4 * lookat 0 Y vmacf vLookat0, $v15, ltLookAt[2] // Normals Z elems 0, 4 * lookat 0 Z @@ -3708,6 +3818,7 @@ lt_post: vmacf $v29, $v14, ltLookAt[5] // Normals Y elems 0, 4 * lookat 1 Y vmacf vLookat1, $v15, ltLookAt[6] // Normals Z elems 0, 4 * lookat 1 Z // Continue to rest of texgen shared by both versions. + TODO now the results are in 3h instead of 0h .endif // CFG_LEGACY_VTX_PIPE @@ -3753,11 +3864,11 @@ lt_skip_packed_normals: vmudn $v29, vM0F, vPairNrml[0h] lbu $11, (normalsMode)($zero) vmadh $v29, vM0I, vPairNrml[0h] - andi $6, $5, G_LIGHTING_SPECULAR >> 8 + andi $6, vGeomMid, G_LIGHTING_SPECULAR >> 8 vmadn $v29, vM1F, vPairNrml[1h] addi curLight, curLight, altBase // Point to ambient light vmadh $v29, vM1I, vPairNrml[1h] - andi $10, $5, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8 + andi $10, vGeomMid, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8 vmadn vBBB, vM2F, vPairNrml[2h] // vBBB = normals frac beqz $11, lt_after_xfrm_normals // Skip if G_NORMALSMODE_FAST vmadh vAAA, vM2I, vPairNrml[2h] // vAAA = normals int @@ -3872,11 +3983,11 @@ vLtRGBOut equ $v25 // = vCCC: light / effects RGB output vLtAOut equ $v26 // = vDDD: light / effects alpha output .if !CFG_LEGACY_VTX_PIPE vadd vPairRGBA, vPairRGBA, $v31[7] // 0x7FFF; undo change for ambient occlusion - andi $11, $5, G_LIGHTTOALPHA >> 8 + andi $11, vGeomMid, G_LIGHTTOALPHA >> 8 // vnop - andi $20, $5, G_PACKED_NORMALS >> 8 + andi $20, vGeomMid, G_PACKED_NORMALS >> 8 // vnop - andi $10, $5, G_TEXTURE_GEN >> 8 + andi $10, vGeomMid, G_TEXTURE_GEN >> 8 // vnop // nop vmulf vLtRGBOut, vPairRGBA, vPairLt // RGB output is RGB * light @@ -3890,7 +4001,7 @@ vLtAOut equ $v26 // = vDDD: light / effects alpha output lt_skip_cel: vne $v29, $v31, $v31[3h] // Set VCC to 11101110 bnez $20, lt_skip_novtxcolor - andi $24, $5, (G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8 + andi $24, vGeomMid, (G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8 vcopy vLtRGBOut, vPairLt // If no packed normals, base output is just light lt_skip_novtxcolor: vmulf vLookat0, vPairNrml, vAAA // Normal * lookat 0 dir; vLookat0 = vPairLt @@ -3900,7 +4011,7 @@ lt_skip_novtxcolor: // vLookat0, vPairRGBA. Available: vAAA, vBBB, vNrmOut. lqv vBBB, (v30Value)($zero) // Need 0x0100 constant, in elem 3 vabs vAAA, vPairNrml, vPairNrml // Absolute value of dot product for underwater - andi $11, $5, G_FRESNEL_COLOR >> 8 + andi $11, vGeomMid, G_FRESNEL_COLOR >> 8 vmudh $v29, vOne, $v30[7] // Fresnel offset vmacf vAAA, vAAA, $v30[6] // + factor * scale beqz $11, @@skip @@ -3926,7 +4037,7 @@ lt_skip_fresnel: .endif // Rest of texgen shared by F3DEX3 native and LVP vne $v29, $v31, $v31[1h] // Set VCC to 10111011 - andi $11, $5, G_TEXTURE_GEN_LINEAR >> 8 + andi $11, vGeomMid, G_TEXTURE_GEN_LINEAR >> 8 vmrg vLookat0, vLookat0, vLookat1[0h] // Dot products in elements 0, 1, 4, 5 vmudh $v29, vOne, $v31[5] // 1 * 0x4000 beqz $11, vtx_return_from_lighting @@ -3941,6 +4052,8 @@ lt_skip_fresnel: j vtx_return_from_lighting vmacf vPairST, vDDD, vCCC // + ST squared * (ST + ST * coeff) + TODO // with new occlusion plane, apply ST scale and offset here + .if !CFG_LEGACY_VTX_PIPE lt_point: /* diff --git a/notes.s b/notes.s index 62b9f8e..873e751 100644 --- a/notes.s +++ b/notes.s @@ -57,3 +57,32 @@ suv $v27[4], ( 0)($11) // U4 U5 U6 U7 P0 P1 P2 P3 suv $v27[5], ( 0)($11) // U5 U6 U7 P0 P1 P2 P3 P4 suv $v27[6], ( 0)($11) // U6 U7 P0 P1 P2 P3 P4 P5 suv $v27[7], ( 0)($11) // U7 P0 P1 P2 P3 P4 P5 P6 + +// ltv patterns: all 8 instr below produce (values are address loaded to each element) +ltv $v0[ 0], (0x00)($11) // $v0 = 00 10 20 30 40 50 60 70 // $v0 always gets bytes 0-1 +ltv $v0[14], (0x10)($11) // $v1 = 72 02 12 22 32 42 52 62 // $v1 always gets bytes 2-3 +ltv $v0[12], (0x20)($11) // $v2 = 64 74 04 14 24 34 44 54 +ltv $v0[10], (0x30)($11) // $v3 = 56 66 76 06 16 26 36 46 +ltv $v0[ 8], (0x40)($11) // $v4 = 48 58 68 78 08 18 28 38 +ltv $v0[ 6], (0x50)($11) // $v5 = 3A 4A 5A 6A 7A 0A 1A 2A +ltv $v0[ 4], (0x60)($11) // $v6 = 2C 3C 4C 5C 6C 7C 0C 1C +ltv $v0[ 2], (0x70)($11) // $v7 = 1E 2E 3E 4E 5E 6E 7E 0E +// Or this pattern +ltv $v0[ 0], (0x00)($11) // $v0 = 00 70 60 50 40 30 20 10 +ltv $v0[ 2], (0x10)($11) // $v1 = 12 02 72 62 52 42 32 22 +ltv $v0[ 4], (0x20)($11) // $v2 = 24 14 04 74 64 54 44 34 +ltv $v0[ 6], (0x30)($11) // $v3 = 36 26 16 06 76 66 56 46 +ltv $v0[ 8], (0x40)($11) // $v4 = 48 38 28 18 08 78 68 58 +ltv $v0[10], (0x50)($11) // $v5 = 5A 4A 3A 2A 1A 0A 7A 6A +ltv $v0[12], (0x60)($11) // $v6 = 6C 5C 4C 3C 2C 1C 0C 7C +ltv $v0[14], (0x70)($11) // $v7 = 7E 6E 5E 4E 3E 2E 1E 0E + +// stv patterns: values are 16 bit reg/elem stored, e.g. 45 = $v4[e5] +stv $v0[ 0], (0x00)($11) // mem[0x00] = 00 11 22 33 44 55 66 77 +stv $v0[ 2], (0x10)($11) // mem[0x10] = 10 21 32 43 54 65 76 07 +stv $v0[ 4], (0x20)($11) // mem[0x20] = 20 31 42 53 64 75 06 17 +stv $v0[ 6], (0x30)($11) // mem[0x30] = 30 41 52 63 74 05 16 27 +stv $v0[ 8], (0x40)($11) // mem[0x40] = 40 51 62 73 04 15 26 37 +stv $v0[10], (0x50)($11) // mem[0x50] = 50 61 72 03 14 25 36 47 +stv $v0[12], (0x60)($11) // mem[0x60] = 60 71 02 13 24 35 46 57 +stv $v0[14], (0x70)($11) // mem[0x70] = 70 01 12 23 34 45 56 67