From 02cb0386aeddd58ec8151f73ef60f8a85216cd2c Mon Sep 17 00:00:00 2001 From: Sauraen Date: Tue, 27 Aug 2024 21:39:12 -0700 Subject: [PATCH] Working --- docs/Documentation/Performance.md | 25 ++-- f3dex3.s | 215 ++++++++++++++---------------- 2 files changed, 115 insertions(+), 125 deletions(-) diff --git a/docs/Documentation/Performance.md b/docs/Documentation/Performance.md index 22d7ff5..b167187 100644 --- a/docs/Documentation/Performance.md +++ b/docs/Documentation/Performance.md @@ -33,6 +33,9 @@ measured yet". | | F3DEX2 | F3DEX3_LVP_NOC | F3DEX3_LVP | F3DEX3_NOC | F3DEX3 | |----------------------------|--------|----------------|------------|------------|--------| +| Command dispatch | 12 | 12 | 12 | 12 | 12 | +| Small RDP command | 14 | 5 | 5 | 5 | 5 | +| Vtx before DMA start | 16 | 17 | 17 | 17 | 17 | | Vtx pair, no lighting | 54 | 54 | 81 | 79 | 98 | | Vtx pair, 0 dir lts | Can't | 64 | | | | | Vtx pair, 1 dir lt | 73 | 70 | 96 | 182 | 201 | @@ -44,20 +47,18 @@ measured yet". | Vtx pair, 7 dir lts | 118 | 112 | 138 | 356 | 375 | | Vtx pair, 8 dir lts | Can't | 119 | 145 | 385 | 404 | | Vtx pair, 9 dir lts | Can't | 126 | 152 | 414 | 433 | -| Command dispatch | 12 | 12 | 12 | 12 | 12 | -| Small RDP command | 14 | 5 | 5 | 5 | 5 | -| Only/2nd tri to offscreen | 27 | 29 | 29 | 29 | 29 | -| 1st tri to offscreen | 28 | 29 | 29 | 29 | 29 | +| Only/2nd tri to offscreen | 27 | 26 | 26 | 26 | 26 | +| 1st tri to offscreen | 28 | 27 | 27 | 27 | 27 | | Only/2nd tri to clip | 32 | 31 | 31 | 31 | 31 | -| 1st tri to clip | 33 | 31 | 31 | 31 | 31 | -| Only/2nd tri to backface | 38 | 40 | 40 | 40 | 40 | -| 1st tri to backface | 39 | 40 | 40 | 40 | 40 | -| Only/2nd tri to degenerate | 42 | 42 | 42 | 42 | 42 | -| 1st tri to degenerate | 43 | 42 | 42 | 42 | 42 | +| 1st tri to clip | 33 | 32 | 32 | 32 | 32 | +| Only/2nd tri to backface | 38 | 38 | 38 | 38 | 38 | +| 1st tri to backface | 39 | 39 | 39 | 39 | 39 | +| Only/2nd tri to degenerate | 42 | 40 | 40 | 40 | 40 | +| 1st tri to degenerate | 43 | 41 | 41 | 41 | 41 | | Only/2nd tri to occluded | Can't | Can't | 49 | Can't | 49 | -| 1st tri to occluded | Can't | Can't | 49 | Can't | 49 | -| Only/2nd tri to draw | 172 | 166 | 167 | 166 | 167 | -| 1st tri to draw | 173 | 166 | 167 | 166 | 167 | +| 1st tri to occluded | Can't | Can't | 50 | Can't | 50 | +| Only/2nd tri to draw | 172 | 165 | 168 | 165 | 168 | +| 1st tri to draw | 173 | 165 | 168 | 165 | 168 | Tri numbers are measured from the first cycle of the command handler inclusive, diff --git a/f3dex3.s b/f3dex3.s index 12c199e..21cd18d 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -455,6 +455,9 @@ normalsMode: lastMatDLPhyAddr: .dw 0 +activeClipPlanes: + .dh CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri write, set to zero when clipping + // Constants for clipping algorithm clipCondShifts: .db CLIP_SCAL_NY_SHIFT @@ -1043,11 +1046,11 @@ tempPrevVtxGarbage equ 0x50 // Up to 2 * 0x26 = 0x4C used -> to 0x9C j wait_for_dma_and_run_next_command // Delay slot harmless @@clamp_to_memset_buffer: - addi $11, cmd_w0, -memsetBufferSize // Is more than a whole buffer left? - bltz $11, return_routine - move $2, cmd_w0 // No, use partial buffer + addi $11, cmd_w0, -memsetBufferSize // $2 = min(cmd_w0, memsetBufferSize) + sra $10, $11, 31 + and $11, $11, $10 jr $ra - li $2, memsetBufferSize + addi $2, $11, memsetBufferSize .endmacro @@ -1189,7 +1192,7 @@ check_rdp_buffer_full_and_run_next_cmd: vertex_end: .endif .if !CFG_PROFILING_A -tri_end: +tris_end: .endif .if ENABLE_PROFILING G_LIGHTTORDP_handler: @@ -1292,8 +1295,7 @@ G_VTX_handler: lhu dmemAddr, (vertexTable)(cmd_w0) // (v0 + n) end address; up to 56 inclusive jal segmented_to_physical // Convert address in cmd_w1_dram to physical lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10 - andi dmemAddr, dmemAddr, 0xFFF8 // Round down end addr to DMA word; one input vtx still fits in one internal vtx - sub dmemAddr, dmemAddr, $1 // Start addr = end addr - size + sub dmemAddr, dmemAddr, $1 // Start addr = end addr - size. Rounded down to DMA word by H/W addi dmaLen, $1, -1 // DMA length is always offset by -1 j dma_read_write li $ra, 0x8000 | vtx_after_dma // Negative = flag to not to return to clipping in vtx_setup_constants @@ -1302,30 +1304,22 @@ G_TRIFAN_handler: li $1, 0x8000 // $ra negative = flag for G_TRIFAN G_TRISTRIP_handler: addi $ra, $1, tri_strip_fan_loop // otherwise $1 == 0 - addi cmd_w0, inputBufferPos, inputBufferEnd - 12 // Start pointing so elems 5-7 are tris 1-3 + addi cmd_w0, inputBufferPos, inputBufferEnd - 8 // Start pointing to cmd byte tri_strip_fan_loop: - lb $3, (7)(cmd_w0) // Load signed index of last of 3 tris - bgez $ra, @@skip_copy_1 // Skip if G_TRISTRIP - lbu $1, (inputBufferEnd - 7)(inputBufferPos) // Load tri 1 index - sb $1, (5)(cmd_w0) // Store as first tri of the three current tris -@@skip_copy_1: - bltz $3, tri_end // If third tri index is negative, exit - addi $11, inputBufferPos, inputBufferEnd - 7 // Off end of command - beq $11, cmd_w0, tri_end // If off end of command, exit - lpv $v27[0], (0)(cmd_w0) // Load the three tris to elems 5-7 - bltz $ra, tri_main // Draw if G_TRIFAN - addi cmd_w0, cmd_w0, 1 // Increment - andi $11, cmd_w0, 1 // If odd after increment, this is the 1st/3rd/5th tri - bnez $11, tri_main // in that case draw directly - sll $3, $3, 8 // Move tri 3 index into bits 15:8 - vmov $v27[7], $v27[6] // Move tri 2 to tri 3 + lw cmd_w1_dram, 0(cmd_w0) // Load tri indices to lower 3 bytes of word + addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command + beq $11, cmd_w0, tris_end // If off end of command, exit + sll $10, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit + bltz $10, tris_end // If negative, exit + sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices + bltz $ra, tri_fan_store // Finish handling G_TRIFAN + addi cmd_w0, cmd_w0, 1 // Increment + andi $11, cmd_w0, 1 // If odd, this is the 1st/3rd/5th tri + bnez $11, tri_main // Draw as is + srl $10, cmd_w1_dram, 8 // Move vtx 2 to LSBs + sb cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2 j tri_main - mtc2 $3, $v27[12] // Move tri 3 to tri 2 - -.if (. & 4) - .warning "One instruction of padding before tri handler" -.endif -.align 8 + sb $10, 7(rdpCmdBufPtr) // Store vtx 2 to spot for 3 tV1AtF equ $v5 tV2AtF equ $v7 @@ -1337,79 +1331,77 @@ tV3AtI equ $v21 G_TRI2_handler: G_QUAD_handler: jal tri_main // Send second tri; return here for first tri - lpv $v27[0], (inputBufferEndSgn - 8)(inputBufferPos) // Second tri idxs elems 5, 6, 7 + sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store second tri indices G_TRI1_handler: - lpv $v27[4], (inputBufferEndSgn - 8)(inputBufferPos) // First tri idxs elems 5, 6, 7 - j tri_main - li $ra, tri_end // After done with this tri, exit tri processing - + li $ra, tris_end // After done with this tri, exit tri processing + sw cmd_w0, 4(rdpCmdBufPtr) // Store first tri indices tri_main: - vmudn $v29, vOne, $v30[0] // Address of vertex buffer - lw $6, geometryModeLabel // Load full geometry mode word - vmadl $v27, $v27, $v30[1] // Plus vtx indices times length - sb $zero, materialCullMode // This covers all tri cmds - vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6 - li $24, CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri draw, check clipping + lpv $v27[0], 0(rdpCmdBufPtr) // To vector unit + lbu $1, 5(rdpCmdBufPtr) + lbu $2, 6(rdpCmdBufPtr) + lbu $3, 7(rdpCmdBufPtr) vclr vZero - sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull - // vnop - sh $ra, tempTriRA // For tri cmds; where to go after clipping - mfc2 $1, $v27[10] - mfc2 $2, $v27[12] + lhu $1, (vertexTable)($1) + vmudn $v29, vOne, $v30[0] // Address of vertex buffer + lhu $2, (vertexTable)($2) + vmadl $v27, $v27, $v30[1] // Plus vtx indices times length + lhu $3, (vertexTable)($3) + vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6 .if !ENABLE_PROFILING addi perfCounterB, perfCounterB, 0x4000 // Increment number of tris requested move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading .endif - vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr - mfc2 $3, $v27[14] - vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping - llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y) vnxor tV1AtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding - llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4 + llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y) vnxor tV2AtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding + llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4 + vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8 vnxor tV3AtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding lhu $5, VTX_CLIP($1) - vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1 + vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr lhu $7, VTX_CLIP($2) - vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr) + // vnop lhu $8, VTX_CLIP($3) - vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr) + vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1 andi $11, $5, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane - vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr) + vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr) and $11, $11, $7 - vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y + vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr) and $11, $11, $8 - vmrg $v14, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2) - bnez $11, return_routine // Then the whole tri is offscreen, cull - // 24 cycles - vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ... + vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr) + vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y + bnez $11, return_and_end_mat // Then the whole tri is offscreen, cull + // 22 cycles + vmrg $v14, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2) + vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ... + lhu $24, activeClipPlanes vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing - or $10, $5, $7 + lw $6, geometryModeLabel // Load full geometry mode word vge $v2, $v2, $v4[1] // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y - or $10, $10, $8 // $10 = all clip bits which are true for any verts + or $10, $5, $7 vmrg $v10, $v6, $v4 // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2) - and $10, $10, $24 // If clipping is enabled, check clip flags + or $10, $10, $8 // $10 = all clip bits which are true for any verts vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y - bnez $10, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip - // 29 cycles - mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended + and $10, $10, $24 // If clipping is enabled, check clip flags vmrg $v4, $v14, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3) - and $5, $5, $7 + mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended vmrg $v14, $v8, $v14 // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2) - and $5, $5, $8 + bnez $10, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip + // 30 cycles + sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull vlt $v29, $v6, $v2 // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing) vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing vmrg $v2, $v4, $v10 // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2) - bltz $11, return_routine // Cull if bit is set (culled based on facing) - // 35 cycles + bltz $11, return_and_end_mat // Cull if bit is set (culled based on facing) + // 34 cycles vmrg $v10, $v10, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3) vmudn $v4, $v14, $v31[5] // 0x4000 - beqz $9, return_routine // If cross product is 0, tri is degenerate (zero area), cull. - // 37 cycles + beqz $9, return_and_end_mat // If cross product is 0, tri is degenerate (zero area), cull. + // 36 cycles mfc2 $1, $v14[12] // $v14 = lowest Y value = highest on screen (x, y, addr) vsub $v6, $v2, $v14 mfc2 $2, $v2[12] // $v2 = mid vertex (x, y, addr) @@ -1423,6 +1415,8 @@ tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping mfc2 $3, $v10[12] // $v10 = highest Y value = lowest on screen (x, y, addr) vsub $v15, $v10, $v2 .if !CFG_NO_OCCLUSION_PLANE + and $5, $5, $7 + and $5, $5, $8 andi $5, $5, CLIP_OCCLUDED .endif vmudh $v29, $v6, $v8[0] @@ -1456,7 +1450,7 @@ tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping vmrg tV3AtI, $v25, tV3AtI // RGB from $4, alpha from $3 tri_skip_flat_shading: .endif - // 53 cycles + // 52 cycles vrcp $v20[2], $v6[1] lb $20, (alphaCompareCullMode)($zero) vrcph $v22[2], $v6[1] @@ -1487,11 +1481,11 @@ tri_skip_flat_shading: mfc2 $24, $v26[6] sub $24, $24, $19 // sign bit set if (max/min) < thresh xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull - bltz $24, return_routine // if max < thresh or if min >= thresh. + bltz $24, return_and_end_mat // if max < thresh or if min >= thresh. tri_skip_alpha_compare_cull: - // 64 cycles + // 63 cycles vmadm $v22, $v22, $v30[7] // 0x0020 - sub $11, $5, $8 + sub $11, $5, $8 // Four instr: $5 = max($5, $8) vmadn $v20, $v31, $v31[2] // 0 sra $10, $11, 31 vmudm $v25, $v15, $v30[2] // 0x1000 @@ -1535,10 +1529,10 @@ tri_skip_alpha_compare_cull: vmadm $v29, $v15, $v4[1] sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings vmadn $v2, $v22, $v26[1] + sb $zero, materialCullMode // This covers tri write out + vmadh $v3, $v15, $v26[1] beqz $9, tri_skip_tex // If textures are not enabled, skip texture coefficient calculation - vmadh $v3, $v15, $v26[1] - // 88 cycles - vrcph $v29[0], $v27[0] + vrcph $v29[0], $v27[0] vrcpl $v10[0], $v27[1] vmudh $v14, vOne, $v13[1q] vrcph $v27[0], $v31[2] // 0 @@ -1567,10 +1561,10 @@ tri_skip_alpha_compare_cull: ldv tV1AtF[8], 0x0028(rdpCmdBufPtr) // 8 vmrg tV3AtF, tV3AtF, $v13 // Merge S, T, W into elems 4-6 tri_skip_tex: - // 109 cycles .if !ENABLE_PROFILING addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP .endif + // 108 cycles vmudl $v29, $v16, $v23 lsv tV1AtF[14], VTX_SCR_Z_FRAC($1) vmadm $v29, $v17, $v23 @@ -1642,7 +1636,7 @@ tDaDyI equ $v7 // DaDe = DaDx * factor tDaDeF equ $v8 tDaDeI equ $v9 - // 137 cycles + // 136 cycles vmadl $v29, tDaDxF, $v20[3] sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional) vmadm $v29, tDaDxI, $v20[3] @@ -1692,9 +1686,9 @@ tV1AtFF equ $v10 ssv tDaDyI[14], 0x0C($10) ssv tV1AtF[14], 0x02($10) tri_end_check_rdp_buffer_full: - bltz $8, return_routine // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end + bltz $8, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end ssv tV1AtI[14], 0x00($10) // If returning from no-Z, this is okay b/c $10 is at end - // 162 cycles + // 161 cycles flush_rdp_buffer: // $8 = rdpCmdBufPtr - rdpCmdBufEndP1 mfc0 $10, SP_DMA_BUSY // Check if any DMA is in flight lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write @@ -1762,20 +1756,19 @@ no_z_buffer: sdv tV1AtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer) .endif +tri_culled_by_occlusion_plane: .if CFG_PROFILING_B -tri_culled_by_occlusion_plane: - jr $ra - addi perfCounterB, perfCounterB, 0x4000 + addi perfCounterB, perfCounterB, 0x4000 .endif +return_and_end_mat: + jr $ra + sb $zero, materialCullMode // This covers all tri early exits except clipping + +tri_fan_store: + lb $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1 + j tri_main + sb $11, 5(rdpCmdBufPtr) // Store vtx 1 -// This routine is used to return via conditional branch -.if !CFG_PROFILING_B -tri_culled_by_occlusion_plane: -.endif -return_routine: - jr $ra - nop - .if (. & 4) .warning "One instruction of padding before ovl234" .endif @@ -1809,9 +1802,11 @@ ovl234_ovl4_entrypoint_ovl3ver: // same IMEM address as ovl234_ovl4_e // Jump here to do clipping. If overlay 3 is loaded (this code), directly starts // the clipping code. ovl234_clipping_entrypoint: + sh $ra, tempTriRA // Tri return after clipping .if CFG_PROFILING_B addi perfCounterB, perfCounterB, 1 // Increment clipped (input) tris count .endif + sb $zero, materialCullMode // In case only/all tri(s) clip then offscreen jal vtx_setup_constants li clipMaskIdx, 4 clip_after_constants: @@ -2029,6 +2024,7 @@ clip_nextcond: clip_draw_tris: vclr vZero // TODO may not need this + sh $zero, activeClipPlanes lqv $v30, (v30Value)($zero) // Current polygon starts 6 (3 verts) below clipPolySelect, ends 2 (1 vert) below clipPolyWrite // Draws verts in pattern like 0-1-4, 1-2-4, 2-3-4 @@ -2036,16 +2032,15 @@ clip_draw_tris_loop: lhu $1, (clipPoly - 6)(clipPolySelect) lhu $2, (clipPoly - 4)(clipPolySelect) lhu $3, (clipPoly - 2)(clipPolyWrite) - mtc2 $1, $v6[12] // Addresses go in vector regs too + mtc2 $1, $v27[10] // Addresses go in vector regs too mtc2 $2, $v4[12] - lw $6, geometryModeLabel // Load full geometry mode word - sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull - li $24, 0 // Init clipping flags for tri draw--no repeat clipping jal tri_noinit - mtc2 $3, $v8[12] + mtc2 $3, $v27[14] bne clipPolyWrite, clipPolySelect, clip_draw_tris_loop addi clipPolySelect, clipPolySelect, 2 clip_done: + li $11, CLIP_SCAL_NPXY | CLIP_CAMPLANE + sh $11, activeClipPlanes lqv $v30, (v30Value)($zero) // Need this repeated here in case we exited early lh $ra, tempTriRA @@ -2079,20 +2074,12 @@ ovl3_padded_end: .orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga()) ovl234_end: - - - - - - - - vtx_after_dma: + andi inputVtxPos, dmemAddr, 0xFFF8 // Round down input start addr to DMA word lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode srl $2, cmd_w0, 11 // n << 1 sub $2, cmd_w0, $2 // = v0 << 1 - lhu outputVtxPos, (vertexTable)($2) // Address of start - sb $zero, materialCullMode // This covers vtx + lhu outputVtxPos, (vertexTable)($2) // Address of output start .if COUNTER_A_UPPER_VERTEX_COUNT sll $11, $1, 12 // Vtx count * 0x10000 add perfCounterA, perfCounterA, $11 // Add to vertex count @@ -2167,7 +2154,7 @@ vtx_after_mtx_multiply: skip_vtx_mvp: andi $11, $5, G_LIGHTING >> 8 bnez $11, ovl234_lighting_entrypoint // Lighting setup, incl. transform - move inputVtxPos, dmemAddr // Must be before overlay load + sb $zero, materialCullMode // Vtx ends material vtx_after_lt_setup: lqv vM0I, (mITMatrix + 0x00)($zero) // Load MVP matrix lqv vM2I, (mITMatrix + 0x10)($zero) @@ -2197,6 +2184,7 @@ vtx_after_lt_setup: @@skipzeroao: bgtz $ra, clip_after_constants // Return to clipping if from there sqv sVPS, (tempViewportScale)(rdpCmdBufEndP1) // Store viewport scale + sb $zero, materialCullMode // Vtx ends material lqv vM0I, (mMatrix + 0x00)($zero) // Load M matrix lqv vM2I, (mMatrix + 0x10)($zero) lqv vM0F, (mMatrix + 0x20)($zero) @@ -2214,7 +2202,6 @@ vtx_after_lt_setup: srl $7, $5, 9 // G_LIGHTING in bit 1 and $7, $7, $11 // If lighting enabled and need to update matrix, and $7, $7, $10 // and computing mIT, - move inputVtxPos, dmemAddr // this must be before overlay load, can be clobbered ldv vM3F[0], (mMatrix + 0x38)($zero) ldv vM0I[8], (mMatrix + 0x00)($zero) ldv vM2I[8], (mMatrix + 0x10)($zero) @@ -2247,7 +2234,7 @@ vtx_after_matrix_load: addi $19, rdpCmdBufEndP1, vtxSize // Temp mem; fog writes up to vtxSize before jal while_wait_dma_busy // Wait for vertex load to finish move secondVtxPos, $19 // for first pre-loop, same for secondVtxPos - andi $11, $5, G_LIGHTING >> 8 + andi $11, $5, G_LIGHTING >> 8 // Must be after the DMA wait b/c modifies $ra beqz $11, @@skip_lighting li $ra, vtx_loop_no_lighting li $ra, lt_vtx_pair @@ -2609,7 +2596,7 @@ skip_return_to_lt_or_loop: ssv sCLZ[4], (VTX_SCR_Z )($19) // sOUTF = vPairPosF is $v21, or vPairTPosF is $v23 vmadn sOUTF, vM2F, vPairPosI[2h] // vPairPosI/F = vertices world coords - beqz $7, return_routine // fog disabled + beqz $7, return_and_end_mat // fog disabled // sOUTI = vPairPosI is $v20, or vPairTPosI is $v24 vmadh sOUTI, vM2I, vPairPosI[2h] // or vPairTPosI/F = vertices clip coords sbv sFOG[15], (VTX_COLOR_A )(secondVtxPos) @@ -2796,7 +2783,7 @@ vertex_end: .if !CFG_NO_OCCLUSION_PLANE || !CFG_LEGACY_VTX_PIPE lqv $v30, (v30Value)($zero) // Restore value overwritten in vtx_store .endif -tri_end: +tris_end: mfc0 $11, DPC_CLOCK lw $10, startCounterTime sub $11, $11, $10 @@ -3214,6 +3201,7 @@ ovl234_ovl4_entrypoint_ovl2ver: // same IMEM address as ovl234_ovl4_e // Jump here to do clipping. If overlay 2 is loaded (this code), loads overlay 3 // and jumps to right here, which is now in the new code. ovl234_clipping_entrypoint_ovl2ver: // same IMEM address as ovl234_clipping_entrypoint + sh $ra, tempTriRA // Tri return after clipping .if CFG_PROFILING_B addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load .endif @@ -3756,6 +3744,7 @@ G_MTX_end: // Jump here to do clipping. If overlay 4 is loaded (this code), loads overlay 3 // and jumps to right here, which is now in the new code. ovl234_clipping_entrypoint_ovl4ver: // same IMEM address as ovl234_clipping_entrypoint + sh $ra, tempTriRA // Tri return after clipping .if CFG_PROFILING_B addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load .endif