From b79361b3f04d175c63cdcf6c54b4c6c3a0d9a63b Mon Sep 17 00:00:00 2001 From: Sauraen Date: Sat, 27 Sep 2025 18:28:30 -0700 Subject: [PATCH] More cleanup --- f3dex3.s | 163 +++++++++++++++++++++++-------------------------------- 1 file changed, 69 insertions(+), 94 deletions(-) diff --git a/f3dex3.s b/f3dex3.s index ee91b2e..10b3e52 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -655,29 +655,22 @@ vertexBuffer: // Space for temporary verts for clipping code, and reused for other things clipTempVerts: -// Round up to 0x10 -.org ((clipTempVerts + 0xF) & 0xFF0) -// Vertex addresses, to avoid a multiply-add for each vertex index lookup -vertexTable: - .skip ((G_MAX_VERTS + 8) * 2) // halfword for each vertex; need 1 extra end addr, easier to write 8 extra - -.if (. & 15) != 0 - .error "tempMatrix not aligned" -.endif - -tempMatrix: +// Round up to 0x8 +.org ((clipTempVerts + 0x7) & 0xFF8) + texrectState: .skip 8 // Only needs to be saved over texrect, half1, half2; but yield can happen .if . > yieldDataFooter - // Need to fit everything through vertex buffer in yield buffer, and also texrectState. - // Would like to also fit vertexTable to avoid recompute after yield. - // Putting texrectState after vertexTable for alignment reasons. + // Need to fit everything through here in yield buffer .error "Too much being stored in yieldable DMEM" .endif - // Rest of tempMatrix - .skip 0x40 - 8 +// Round up to 0x10 +.org ((texrectState + 0xF) & 0xFF0) + +tempMatrix: + .skip 0x40 .if . > (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES) .error "Too much in clipTempVerts" @@ -1097,8 +1090,7 @@ continue_from_os_task: lw perfCounterB, mvpMatrix + YDF_OFFSET_PERFCOUNTERB lw perfCounterC, mvpMatrix + YDF_OFFSET_PERFCOUNTERC lw perfCounterD, mvpMatrix + YDF_OFFSET_PERFCOUNTERD - jal fill_vertex_table - lw taskDataPtr, OSTask + OSTask_data_ptr + lw taskDataPtr, OSTask + OSTask_data_ptr finish_setup: .if CFG_PROFILING_C mfc0 $11, DPC_CLOCK @@ -1172,10 +1164,17 @@ load_overlays_0_1: j load_overlay_inner li dmemAddr, 0x1000 -G_MODIFYVTX_handler: // 3 - mfc2 $10, $v7[6] // Byte 3 = vtx being modified - j do_moveword // Moveword adds cmd_w0 to $10 for final addr - lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx, bit 15 clear +G_RDPHALF_2_handler: // 8; should be after the handlers with alignment needs + li $11, texrectState + ldv $v29[0], (0)($11) + sb $zero, materialCullMode // This covers tex and fill rects + lw cmd_w0, rdpHalf1Val // load the RDPHALF1 value into w0 + addi rdpCmdBufPtr, rdpCmdBufPtr, 8 +.if !ENABLE_PROFILING + addi perfCounterB, perfCounterB, 1 // Increment number of tex/fill rects +.endif + j send_w0_w1_to_rdp // w1 is from the current command + sdv $v29[0], -8(rdpCmdBufPtr) G_SETxIMG_handler: // 12 lb $3, materialCullMode // Get current mode @@ -1436,8 +1435,6 @@ tXPF equ $v16 // Triangle cross product tXPI equ $v17 vreadacc tXPI, ACC_UPPER add $19, origV1Addr, flatV1Offset - // TODO move this to alpha compare cull no nop, move both branches out of line - sb $zero, materialCullMode // Covers tri write (non early exit) vreadacc tXPF, ACC_MIDDLE lpv tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1 vrcp $v20[0], tPosCatI[1] @@ -1452,20 +1449,13 @@ tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4) tXPRcpI equ $v24 vrcpl tXPRcpF[1], tXPF[1] .if !ENABLE_PROFILING - bnez $11, tri_skip_flat_shading // Branch if G_SHADING_SMOOTH is set + beqz $11, tri_flat_shading // Branch if G_SHADING_SMOOTH is clear .endif vrcph tXPRcpI[1], $v31[2] // 0 -.if !ENABLE_PROFILING - vlt $v29, $v31, $v31[3] // Set vcc to 11100000 - vmrg tHAtI, $v25, tHAtI // RGB from original vtx 1, alpha from $1 - vmrg tMAtI, $v25, tMAtI // RGB from original vtx 1, alpha from $2 - vmrg tLAtI, $v25, tLAtI // RGB from original vtx 1, alpha from $3 -align_with_warning 8, "One instruction of padding before tri_skip_flat_shading" -tri_skip_flat_shading: -.endif +tri_return_from_flat_shading: // 44 cycles vrcp $v20[2], tPosMmH[1] - mtc2 $20, tMPos[14] // 0xFFF8; only elem 0, 1, 2 of this reg used now + ssv tPosMmH[2], 0x0030(rdpCmdBufPtr) // MmHY -> first short (temp mem) vrcph $v22[2], tPosMmH[1] llv t1WI[0], VTX_INV_W_VEC($1) vrcp $v20[3], tPosLmH[1] @@ -1473,7 +1463,7 @@ tri_skip_flat_shading: vrcph $v22[3], tPosLmH[1] llv t1WI[12], VTX_INV_W_VEC($3) vmudl tHAtI, tHAtI, vTRC_0100 // vertex color 1 >>= 8 - lb $20, (alphaCompareCullMode)($zero) + lb $11, (alphaCompareCullMode)($zero) vmudl tMAtI, tMAtI, vTRC_0100 // vertex color 2 >>= 8 lw $6, VTX_INV_W_VEC($1) // $6, $7, $8 = 1/W for H, M, L vmudl tLAtI, tLAtI, vTRC_0100 // vertex color 3 >>= 8 @@ -1481,25 +1471,14 @@ tri_skip_flat_shading: vmudl $v29, $v20, vTRC_0020 lw $8, VTX_INV_W_VEC($3) vmadm $v22, $v22, vTRC_0020 - beqz $20, tri_skip_alpha_compare_cull + bnez $11, tri_alpha_compare_cull vmadn $v20, $v31, $v31[2] // 0 - // Alpha compare culling - vge $v26, tHAtI, tMAtI - lbu $19, alphaCompareCullThresh - vlt $v25, tHAtI, tMAtI - bgtz $20, @@skip1 - vge $v26, $v26, tLAtI // If alphaCompareCullMode > 0, $v26 = max of 3 verts - vlt $v26, $v25, tLAtI // else if < 0, $v26 = min of 3 verts -@@skip1: // $v26 elem 3 has max or min alpha value - mfc2 $24, $v26[6] - sub $24, $24, $19 // sign bit set if (max/min) < thresh - xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull - bltz $24, return_and_end_mat // if max < thresh or if min >= thresh. -tri_skip_alpha_compare_cull: +// $v6 <- tPosMmH; $v6 clobbered in alpha compare cull +tri_return_from_alpha_compare_cull: // Uses $v25, $v26 // 60 cycles tPosCatF equ $v25 vmudm tPosCatF, tPosCatI, vTRC_1000 - // no nop if tri_skip_alpha_compare_cull was unaligned + mtc2 $20, tMPos[14] // 0xFFF8; only elem 0, 1, 2 of this reg used now vmadn tPosCatI, $v31, $v31[2] // 0 sub $11, $6, $7 // Four instr: $6 = max($6, $7) vsubc tSubPxHF, vZero, tSubPxHF @@ -1558,7 +1537,7 @@ tMnWI equ $v25 // <- tMx1W lw $19, otherMode1 tSTWHMI equ $v22 // H = elems 0-2, M = elems 4-6; init W = 7FFF vmudh tSTWHMI, vOne, $v31[7] // 0x7FFF - ssv tPosMmH[2], 0x0030(rdpCmdBufPtr) // MmHY -> first short (temp mem) + sb $zero, materialCullMode // Covers tri write (non early exit) vmudm $v29, t1WI, tMnWF[0] // 1/W each vtx * min W = 1 for one of the verts, < 1 for others llv tSTWHMI[0], VTX_TC_VEC($1) vmadl $v29, t1WF, tMnWF[0] @@ -1770,14 +1749,28 @@ flush_rdp_buffer: // Prereq: dmemAddr = rdpCmdBufPtr - rdpCmdBufEndP1, or dmemAd j dma_read_write addi rdpCmdBufPtr, rdpCmdBufEndP1, -(RDP_CMD_BUFSIZE + 8) -align_with_warning 8, "One instruction of padding before return_and_end_mat" +align_with_warning 8, "One instruction of padding before tri_alpha_compare_cull" +tri_alpha_compare_cull: +// Alpha compare culling + vge $v26, tHAtI, tMAtI + lbu $19, alphaCompareCullThresh + vlt $v25, tHAtI, tMAtI + bgtz $11, @@skip1 + vge $v26, $v26, tLAtI // If alphaCompareCullMode > 0, $v26 = max of 3 verts + vlt $v26, $v25, tLAtI // else if < 0, $v26 = min of 3 verts +@@skip1: // $v26 elem 3 has max or min alpha value + mfc2 $24, $v26[6] + sub $24, $24, $19 // sign bit set if (max/min) < thresh + xor $24, $24, $11 // invert sign bit if other cond. Sign bit set -> cull, + bgez $24, tri_return_from_alpha_compare_cull // if max < thresh or if min >= thresh. tri_culled_by_occlusion_plane: .if CFG_PROFILING_B + nop addi perfCounterB, perfCounterB, 0x4000 .endif return_and_end_mat: - tri_v1_move + tri_v1_move // overwrites $v6[1] jr $ra sb $zero, materialCullMode // This covers all tri early exits except clipping @@ -1790,13 +1783,14 @@ tri_snake_end: j tris_end and inputBufferPos, inputBufferPos, $11 // inputBufferPos has to be negative -tri_decal_fix_z: - // Valid range of tHAtI = 0 to 7FFF, but most of the scene is large values - vmudh $v29, vOne, vTRC_DO // accum all elems = -DM/2 - vmadm $v25, tHAtI, vTRC_DM // elem 7 = (0 to DM/2-1) - DM/2 = -DM/2 to -1 - vcr tDaDyI, tDaDyI, $v25[7] // Clamp DzDyI (6) to <= -val or >= val; clobbers DzDyF (7) - j tri_return_from_decal_fix_z - set_vcc_11110001 // Clobbered by vcr +.if !ENABLE_PROFILING +tri_flat_shading: + vlt $v29, $v31, $v31[3] // Set vcc to 11100000 + vmrg tHAtI, $v25, tHAtI // RGB from original vtx 1, alpha from $1 + vmrg tMAtI, $v25, tMAtI // RGB from original vtx 1, alpha from $2 + j tri_return_from_flat_shading + vmrg tLAtI, $v25, tLAtI // RGB from original vtx 1, alpha from $3 +.endif align_with_warning 8, "One instruction of padding before ovl234" @@ -2158,26 +2152,6 @@ clip_done: // Delay slot is harmless if branched add origV1Addr, origV1Addr, flatV1Offset // Real orig addr = cur V1 + offset li flatV1Offset, 0 lh $ra, tempTriRA -fill_vertex_table: - // Create bytes 00-07 - li $1, 7 -@@loop1: - sb $1, (vertexTable)($1) - bgtz $1, @@loop1 - addi $1, $1, -1 - // Load to vu and multiply by 2 to get vertex indexes. It would be more cycles - // to change the loop above to count by 2s than the stalls here. - li $2, vertexTable - lpv $v3[0], (0)($2) - li $3, vertexTable + ((G_MAX_VERTS + 8) * 2) // Need 0-56 inclusive, so do 0-63 - vmudh $v3, $v3, $v31[3] // 2; now 0x0000, 0x0200, ..., 0x0E00 -@@loop2: - vmudn $v29, vOne, vTRC_VB // Address of vertex buffer - vmadl $v4, $v3, vTRC_VS // Plus vtx indices times length - vadd $v3, $v3, vTRC_1000 // increment by 8 verts = 16 - addi $2, $2, 0x10 - bne $2, $3, @@loop2 - sqv $v4[0], (-0x10)($2) jr $ra // Delay slot is harmless clip_w: vcopy cBaseF, cPosOnOfF // Result is just W @@ -2191,6 +2165,14 @@ ovl3_padded_end: .orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga()) ovl234_end: +tri_decal_fix_z: + // Valid range of tHAtI = 0 to 7FFF, but most of the scene is large values + vmudh $v29, vOne, vTRC_DO // accum all elems = -DM/2 + vmadm $v25, tHAtI, vTRC_DM // elem 7 = (0 to DM/2-1) - DM/2 = -DM/2 to -1 + vcr tDaDyI, tDaDyI, $v25[7] // Clamp DzDyI (6) to <= -val or >= val; clobbers DzDyF (7) + j tri_return_from_decal_fix_z + set_vcc_11110001 // Clobbered by vcr + // Converts the segmented address in cmd_w1_dram to the corresponding physical address segmented_to_physical: // 8 srl $11, cmd_w1_dram, 22 // Copy (segment index << 2) into $11 @@ -2912,7 +2894,7 @@ final address is completely wrong. However, DMEM wraps at 4 KiB--only the lowest I only noticed this when I tried to move G_RELSEGMENT to a different command byte and got crashes. */ .if (G_RELSEGMENT & 0xF) != (G_MOVEWORD & 0xF) -.error "Crazy relsegment optimization broken, don't change command byte assignments" + .error "Crazy relsegment optimization broken, don't change command byte assignments" .endif G_RELSEGMENT_handler: // 9 jal segmented_to_physical // Resolve new segment address relative to existing segment @@ -3015,7 +2997,12 @@ G_SETOTHERMODE_L_handler: j G_RDP_handler lpv $v4[0], (otherMode0)($zero) -displaylist_dma_from_yield: +G_MODIFYVTX_handler: // 3 + mfc2 $10, $v7[6] // Byte 3 = vtx being modified + j do_moveword // Moveword adds cmd_w0 to $10 for final addr + lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx, bit 15 clear + +displaylist_dma_from_yield: // 2 j displaylist_dma_goto_next_ra lh nextRA, tempTriRA @@ -3031,18 +3018,6 @@ G_RDPHALF_1_handler: // $ra = ., 0x10 ahead of geometry mode j run_next_DL_command sw cmd_w1_dram, (geometryModeLabel - G_GEOMETRYMODE_handler)($ra) -G_RDPHALF_2_handler: // 8; should be after the handlers with alignment needs - li $11, texrectState - ldv $v29[0], (0)($11) - sb $zero, materialCullMode // This covers tex and fill rects - lw cmd_w0, rdpHalf1Val // load the RDPHALF1 value into w0 - addi rdpCmdBufPtr, rdpCmdBufPtr, 8 -.if !ENABLE_PROFILING - addi perfCounterB, perfCounterB, 1 // Increment number of tex/fill rects -.endif - j send_w0_w1_to_rdp // w1 is from the current command - sdv $v29[0], -8(rdpCmdBufPtr) - ovl1_end: align_with_warning 8, "One instruction of padding at end of ovl1" ovl1_padded_end: