diff --git a/README.md b/README.md index 432c929..d0b4549 100644 --- a/README.md +++ b/README.md @@ -84,10 +84,14 @@ breaking changes.** buffers, **saving some DRAM traffic** (maybe around 100 us per frame). The bits used for this are ignored by HLE. - Segment addresses are now resolved relative to other segments (feature by - Tharo). This enables a strategy for skipping repeated material DLs: call + Tharo). This enables a strategy for **skipping repeated material DLs**: call a segment to run the material, remap the segment in the material to a display list that immediately returns, and so if the material is called again it won't run. +- New `SPMemset` command fills a specified RDRAM region with a repeated 16-bit + value. This can be used for clearing the Z buffer or filling the framebuffer + or the letterbox with a solid color **faster than the RDP can in fill mode**. + Practical performance may vary due to scheduling constraints. ### Miscellaneous diff --git a/f3dex3.s b/f3dex3.s index 1e83b55..dda4e15 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -185,6 +185,10 @@ Overlay 2 Overlay 4 */ +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////// DMEM ////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + // RSP DMEM .create DATA_FILE, 0x0000 @@ -620,6 +624,10 @@ clipTempVertsEnd: .error "Not enough space for temp matrix!" 
.endif +memsetBufferStart equ ((vertexBuffer + 0xF) & 0xFF0) +memsetBufferEnd equ (clipTempVertsEnd & 0xFF0) +memsetBufferSize equ (memsetBufferEnd - memsetBufferStart) + RDP_CMD_BUFSIZE equ 0xB0 RDP_CMD_BUFSIZE_EXCESS equ 0xB0 // Maximum size of an RDP triangle command RDP_CMD_BUFSIZE_TOTAL equ (RDP_CMD_BUFSIZE + RDP_CMD_BUFSIZE_EXCESS) @@ -682,11 +690,8 @@ xfrmLookatDirs equ -(0x1000 - (OSTask + OSTask_ucode_data)) // and OSTask_ucode_ .close // DATA_FILE -// RSP IMEM -.create CODE_FILE, 0x00001080 - //////////////////////////////////////////////////////////////////////////////// -/////////////////////////////// Register Use Map /////////////////////////////// +/////////////////////////////// Register Naming //////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // Vertex / lighting all regs: @@ -808,6 +813,197 @@ postOvlRA equ $10 // Commonly used locally // $30: perfCounterC (global) // $ra: Return address for jal, b*al +// Vertex configuration registers + +// armips only executes "equ" statements on the codepath where they are defined. +// However, it always parses all assembly instructions, even if the current codepath +// is not active. So, code like "A equ $20; add A, $11, $11" will cause an error +// on a disabled codepath, as the first statement is not executed but the second +// is parsed and A is not defined. +// For CFG_LEGACY_VTX_PIPE, use the registers which would normally be the VP matrix +// to store constants from setup, including through clipping. This does not save +// cycles during vertex processing because the loads are always hidden, but it saves +// two instructions each to save and restore them. 
(For ST it saves cycles too) +.if CFG_LEGACY_VTX_PIPE +sVPO equ $v9 +sVPS equ $v8 +sSTO equ $v11 // not supported on legacy vtx pipe, but register allocated for it +sSTS equ $v10 +.else +sVPO equ $v17 +.if CFG_NO_OCCLUSION_PLANE +sVPS equ $v26 +.else +sVPS equ $v16 +.endif +sSTO equ $v26 +sSTS equ $v25 +.endif +.if CFG_LEGACY_VTX_PIPE +sOUTF equ vPairTPosF +sOUTI equ vPairTPosI +.else +sOUTF equ vPairPosF +sOUTI equ vPairPosI +.endif +.if CFG_NO_OCCLUSION_PLANE +sFOG equ $v25 +sCLZ equ $v21 +sTCL equ $v21 +sTPN equ $v16 +// Occlusion plane; these don't exist on this codepath +sO03 equ $v29 +sO47 equ $v29 +sOCM equ $v29 +sOC1 equ $v29 +sOC2 equ $v29 +sOC3 equ $v29 +sOPM equ $v29 +sOPMs equ $v29 +sOSC equ $v29 +.else +sFOG equ $v16 +sCLZ equ $v25 +sTCL equ $v29 // does not exist on this codepath +sTPN equ $v18 +// Occlusion plane +sO03 equ $v26 +sO47 equ $v23 +sOCM equ $v22 +sOC1 equ $v21 +sOC2 equ $v27 +sOC3 equ $v21 +.if CFG_LEGACY_VTX_PIPE +sOPM equ $v12 // Kept here through whole processing +sOPMs equ $v12 // so these are the same +.else +sOPM equ $v17 // When used +sOPMs equ $v24 // Just another temp register +.endif +sOSC equ $v21 +.endif +// Temp storage after rdpCmdBufEndP1. There is 0xA8 of space here which will +// always be free during vtx load or clipping. +tempViewportScale equ 0x00 +tempViewportOffset equ 0x10 +tempOccPlusMinus equ 0x20 +tempXfrmSingle equ 0x30 +tempVpRGBA equ 0x40 +tempVpPkNorm equ 0x50 + + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////// IMEM ////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +// Macros for placing code in different places based on the microcode version + +.macro instantiate_mtx_end_begin +// Multiplies the temp loaded matrix into the M or VP matrix + lhu $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP. 
+ li $3, tempMemRounded // Input 1 = temp mem (loaded mtx) + jal while_wait_dma_busy + move $2, $6 // Input 0 = output + // Followed immediately by instantiate_mtx_multiply. These need to be broken + // up so we can insert the global mtx_multiply label between them. +.endmacro +.macro instantiate_mtx_multiply +// $3, $2 are input matrices; $6 is output matrix; $7 is 0 for return to vtx + addi $10, $3, 0x0018 +@@loop: + vmadn $v7, $v31, $v31[2] // 0 + addi $11, $3, 0x0008 + vmadh $v6, $v31, $v31[2] // 0 + addi $2, $2, -0x0020 + vmudh $v29, $v31, $v31[2] // 0 +@@innerloop: + ldv $v3[0], 0x0040($2) + ldv $v3[8], 0x0040($2) + lqv $v1[0], 0x0020($3) // Input 1 + ldv $v2[0], 0x0020($2) + ldv $v2[8], 0x0020($2) + lqv $v0[0], 0x0000($3) // Input 1 + vmadl $v29, $v3, $v1[0h] + addi $3, $3, 0x0002 + vmadm $v29, $v2, $v1[0h] + addi $2, $2, 0x0008 // Increment input 0 pointer + vmadn $v5, $v3, $v0[0h] + bne $3, $11, @@innerloop + vmadh $v4, $v2, $v0[0h] + bne $3, $10, @@loop + addi $3, $3, 0x0008 + sqv $v7[0], (0x0020)($6) + sqv $v6[0], (0x0000)($6) +.if CFG_LEGACY_VTX_PIPE + beqz $7, vtx_after_mtx_multiply +.endif + sqv $v4[0], (0x0010)($6) + j run_next_DL_command + sqv $v5[0], (0x0030)($6) +.endmacro + +.macro instantiate_branch_wz + j vtx_addrs_from_cmd // byte 3 = vtx being tested; addr -> $10 + li $11, @@return_from_addrs +@@return_from_addrs: +.if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. 
F3DEX2 + lh $10, VTX_W_INT($10) // read the w coordinate of the vertex (f3dzex) +.else + lw $10, VTX_SCR_Z($10) // read the screen z coordinate (int and frac) of the vertex (f3dex2) +.endif + sub $2, $10, cmd_w1_dram // subtract the w/z value being tested + bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL + lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to + j branch_dl // need $2 < 0 for nopush and cmd_w1_dram + li cmd_w0, 0 // No count of DL cmds to skip +.endmacro + +.macro instantiate_dma_io + jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one + lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command) + andi dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment + // At this point, dmemAddr's highest bit is the flag, its next 13 bits are the DMEM address, and then its last two bits are the upper 2 of size + // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit + sra dmemAddr, dmemAddr, 2 + j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr) + li $ra, wait_for_dma_and_run_next_command // Setup the return address for running the next DL command +.endmacro + +.macro instantiate_memset + llv $v2[0], (rdpHalf1Val)($zero) // Load the memset value + sll cmd_w0, cmd_w0, 8 // Clear upper byte + jal segmented_to_physical + srl cmd_w0, cmd_w0, 8 // Number of bytes to memset (must be mult of 16) + li $3, memsetBufferStart + 0x10 // Last qword set is memsetBufferStart + jal @@clamp_to_memset_buffer + vmudh $v2, vOne, $v2[1] // Move element 1 (lower bytes) to all + addi $2, $2, memsetBufferStart // First qword set is one below 
memsetBufferEnd +@@pre_loop: + sqv $v2, (-0x10)($2) + bne $2, $3, @@pre_loop + addi $2, -0x10 +@@transaction_loop: + jal @@clamp_to_memset_buffer + li dmemAddr, 0x8000 | memsetBufferStart // Always write from start of buffer + jal dma_read_write + addi dmaLen, $2, -1 + sub cmd_w0, cmd_w0, $2 + bgtz cmd_w0, @@transaction_loop + add cmd_w1_dram, cmd_w1_dram, $2 + j wait_for_dma_and_run_next_command + // Delay slot harmless +@@clamp_to_memset_buffer: + addi $11, cmd_w0, -memsetBufferSize // Is more than a whole buffer left? + bltz $11, return_routine + move $2, cmd_w0 // No, use partial buffer + jr $ra + li $2, memsetBufferSize +.endmacro + + +// RSP IMEM +.create CODE_FILE, 0x00001080 + // Initialization routines // Everything up until ovl01_end will get overwritten by ovl0 and/or ovl1 start: // This is at IMEM 0x1080, not the start of IMEM @@ -921,9 +1117,6 @@ G_LIGHTTORDP_handler: vertex_end: tri_end: .endif -.if CFG_LEGACY_VTX_PIPE -G_MEMSET_handler: -.endif G_SPNOOP_handler: run_next_DL_command: mfc0 $1, SP_STATUS // load the status word into register $1 @@ -972,6 +1165,75 @@ call_ret_common: j displaylist_dma_with_count sb $1, displayListStackLength +G_LOAD_UCODE_handler: + j load_overlay_0_and_enter // Delay slot is harmless +G_MODIFYVTX_handler: + // Command byte 3 = vtx being modified; its addr -> $10 + li $11, do_moveword // Moveword adds cmd_w0 to $10 for final addr + lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx +vtx_addrs_from_cmd: + // Treat eight bytes of last command each as vertex indices << 1 + // inputBufferEnd is close enough to the end of DMEM to fit in signed offset + lpv $v27[0], (-(0x1000 - (inputBufferEnd - 0x08)))(inputBufferPos) +vtx_indices_to_addr: + // Input and output in $v27 + // Also out elem 3 -> $10, elem 7 -> $3 because these are used more than once + lqv $v30, (v30Value)($zero) + vmudl $v29, $v27, $v30[1] // Multiply vtx indices times length + vmadn $v27, vOne, $v30[0] // Add address of vertex buffer + sb 
$zero, materialCullMode // This covers all tri cmds, vtx, modify vtx, branchZ, cull + mfc2 $10, $v27[6] + jr $11 + mfc2 $3, $v27[14] + +G_TRISTRIP_handler: + j tri_strip_fan_start + li $ra, tri_strip_fan_loop +G_TRIFAN_handler: + li $ra, tri_strip_fan_loop + 0x8000 // Negative = flag for G_TRIFAN +tri_strip_fan_start: + addi cmd_w0, inputBufferPos, inputBufferEnd - 8 // Start pointing to cmd byte +tri_strip_fan_loop: + lw cmd_w1_dram, 0(cmd_w0) // Load tri indices to lower 3 bytes of word + addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command + beq $11, cmd_w0, tri_end // If off end of command, exit + sll $10, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit + bltz $10, tri_end // If negative, exit + sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices + bltz $ra, tri_fan_store // Finish handling G_TRIFAN + addi cmd_w0, cmd_w0, 1 // Increment + andi $11, cmd_w0, 1 // If odd, this is the 1st/3rd/5th tri + bnez $11, tri_main // Draw as is + srl $10, cmd_w1_dram, 8 // Move vtx 2 to LSBs + sb cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2 + j tri_main + sb $10, 7(rdpCmdBufPtr) // Store vtx 2 to spot for 3 + +G_TRI2_handler: +G_QUAD_handler: + jal tri_main // Send second tri; return here for first tri + sw cmd_w1_dram, 4(rdpCmdBufPtr) // Put second tri indices in temp memory +G_TRI1_handler: + li $ra, tri_end // After done with this tri, exit tri processing + sw cmd_w0, 4(rdpCmdBufPtr) // Put first tri indices in temp memory +tri_main: + lpv $v27[0], 0(rdpCmdBufPtr) // Load tri indexes to elems 5, 6, 7 + j vtx_indices_to_addr // elem 7 -> $3; rest in $v27 + li $11, tri_return_from_addrs + +G_VTX_handler: + lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10 + lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode + srl $2, cmd_w0, 11 // n << 1 + sub $2, cmd_w0, $2 // v0 << 1 + sb $2, (inputBufferEnd - 0x06)(inputBufferPos) // Store v0 << 1 as byte 2 +.if COUNTER_A_UPPER_VERTEX_COUNT + 
sll $11, $1, 12 // Vtx count * 0x10000 + add perfCounterA, perfCounterA, $11 // Add to vertex count +.endif + j vtx_addrs_from_cmd // v0 << 1 is elem 2, (v0 + n) << 1 is elem 3 = $10 + li $11, vtx_return_from_addrs + .if !ENABLE_PROFILING G_LIGHTTORDP_handler: lbu $11, numLightsxSize // Ambient light @@ -1001,29 +1263,13 @@ G_SETxIMG_handler: .if CFG_LEGACY_VTX_PIPE G_DMA_IO_handler: - jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one - lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command) - andi dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment - // At this point, dmemAddr's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size - // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit - sra dmemAddr, dmemAddr, 2 - j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr) - li $ra, wait_for_dma_and_run_next_command // Setup the return address for running the next DL command - + instantiate_dma_io + G_BRANCH_WZ_handler: - j vtx_addrs_from_cmd // byte 3 = vtx being tested; addr -> $10 - li $11, branchwz_return_from_addrs -branchwz_return_from_addrs: -.if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. 
F3DEX2 - lh $10, VTX_W_INT($10) // read the w coordinate of the vertex (f3dzex) -.else - lw $10, VTX_SCR_Z($10) // read the screen z coordinate (int and frac) of the vertex (f3dex2) -.endif - sub $2, $10, cmd_w1_dram // subtract the w/z value being tested - bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL - lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to - j branch_dl // need $2 < 0 for nopush and cmd_w1_dram - li cmd_w0, 0 // No count of DL cmds to skip + instantiate_branch_wz + +G_MEMSET_handler: + instantiate_memset .else G_DMA_IO_handler: @@ -1103,151 +1349,7 @@ flush_rdp_buffer: j dma_read_write addi rdpCmdBufPtr, rdpCmdBufEndP1, -(RDP_CMD_BUFSIZE + 8) -G_LOAD_UCODE_handler: - j load_overlay_0_and_enter // Delay slot is harmless -G_MODIFYVTX_handler: - // Command byte 3 = vtx being modified; its addr -> $10 - li $11, do_moveword // Moveword adds cmd_w0 to $10 for final addr - lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx -vtx_addrs_from_cmd: - // Treat eight bytes of last command each as vertex indices << 1 - // inputBufferEnd is close enough to the end of DMEM to fit in signed offset - lpv $v27[0], (-(0x1000 - (inputBufferEnd - 0x08)))(inputBufferPos) -vtx_indices_to_addr: - // Input and output in $v27 - // Also out elem 3 -> $10, elem 7 -> $3 because these are used more than once - lqv $v30, (v30Value)($zero) - vmudl $v29, $v27, $v30[1] // Multiply vtx indices times length - vmadn $v27, vOne, $v30[0] // Add address of vertex buffer - sb $zero, materialCullMode // This covers all tri cmds, vtx, modify vtx, branchZ, cull - mfc2 $10, $v27[6] - jr $11 - mfc2 $3, $v27[14] -G_TRISTRIP_handler: - j tri_strip_fan_start - li $ra, tri_strip_fan_loop -G_TRIFAN_handler: - li $ra, tri_strip_fan_loop + 0x8000 // Negative = flag for G_TRIFAN -tri_strip_fan_start: - addi cmd_w0, inputBufferPos, inputBufferEnd - 8 // Start pointing to cmd byte -tri_strip_fan_loop: - lw cmd_w1_dram, 
0(cmd_w0) // Load tri indices to lower 3 bytes of word - addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command - beq $11, cmd_w0, tri_end // If off end of command, exit - sll $10, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit - bltz $10, tri_end // If negative, exit - sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices - bltz $ra, tri_fan_store // Finish handling G_TRIFAN - addi cmd_w0, cmd_w0, 1 // Increment - andi $11, cmd_w0, 1 // If odd, this is the 1st/3rd/5th tri - bnez $11, tri_main // Draw as is - srl $10, cmd_w1_dram, 8 // Move vtx 2 to LSBs - sb cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2 - j tri_main - sb $10, 7(rdpCmdBufPtr) // Store vtx 2 to spot for 3 - -G_TRI2_handler: -G_QUAD_handler: - jal tri_main // Send second tri; return here for first tri - sw cmd_w1_dram, 4(rdpCmdBufPtr) // Put second tri indices in temp memory -G_TRI1_handler: - li $ra, tri_end // After done with this tri, exit tri processing - sw cmd_w0, 4(rdpCmdBufPtr) // Put first tri indices in temp memory -tri_main: - lpv $v27[0], 0(rdpCmdBufPtr) // Load tri indexes to elems 5, 6, 7 - j vtx_indices_to_addr // elem 7 -> $3; rest in $v27 - li $11, tri_return_from_addrs - -G_VTX_handler: -// armips only executes "equ" statements on the codepath where they are defined. -// However, it always parses all assembly instructions, even if they current codepath -// is not active. So, code like "A equ $20; add A, $11, $11" will cause an error -// on a disabled codepath, as the first statement is not executed but the second -// is parsed and A is not defined. -// For CFG_LEGACY_VTX_PIPE, use the registers which would normally be the VP matrix -// to store constants from setup, including through clipping. This does not save -// cycles during vertex processing because the loads are always hidden, but it saves -// two instructions each to save and restore them. 
(For ST it saves cycles too) -.if CFG_LEGACY_VTX_PIPE -sVPO equ $v9 -sVPS equ $v8 -sSTO equ $v11 // not supported on legacy vtx pipe, but register allocated for it -sSTS equ $v10 -.else -sVPO equ $v17 -.if CFG_NO_OCCLUSION_PLANE -sVPS equ $v26 -.else -sVPS equ $v16 -.endif -sSTO equ $v26 -sSTS equ $v25 -.endif -.if CFG_LEGACY_VTX_PIPE -sOUTF equ vPairTPosF -sOUTI equ vPairTPosI -.else -sOUTF equ vPairPosF -sOUTI equ vPairPosI -.endif -.if CFG_NO_OCCLUSION_PLANE -sFOG equ $v25 -sCLZ equ $v21 -sTCL equ $v21 -sTPN equ $v16 -// Occlusion plane; these don't exist on this codepath -sO03 equ $v29 -sO47 equ $v29 -sOCM equ $v29 -sOC1 equ $v29 -sOC2 equ $v29 -sOC3 equ $v29 -sOPM equ $v29 -sOPMs equ $v29 -sOSC equ $v29 -.else -sFOG equ $v16 -sCLZ equ $v25 -sTCL equ $v29 // does not exist on this codepath -sTPN equ $v18 -// Occlusion plane -sO03 equ $v26 -sO47 equ $v23 -sOCM equ $v22 -sOC1 equ $v21 -sOC2 equ $v27 -sOC3 equ $v21 -.if CFG_LEGACY_VTX_PIPE -sOPM equ $v12 // Kept here through whole processing -sOPMs equ $v12 // so these are the same -.else -sOPM equ $v17 // When used -sOPMs equ $v24 // Just another temp register -.endif -sOSC equ $v21 -.endif -// Temp storage after rdpCmdBufEndP1. There is 0xA8 of space here which will -// always be free during vtx load or clipping. 
-tempViewportScale equ 0x00 -tempViewportOffset equ 0x10 -tempOccPlusMinus equ 0x20 -tempXfrmSingle equ 0x30 -tempVpRGBA equ 0x40 -tempVpPkNorm equ 0x50 - - - lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10 - lhu $5, geometryModeLabel + 1 // Load middle 2 bytes of geom mode - srl $2, cmd_w0, 11 // n << 1 - sub $2, cmd_w0, $2 // v0 << 1 - sb $2, (inputBufferEnd - 0x06)(inputBufferPos) // Store v0 << 1 as byte 2 -.if COUNTER_A_UPPER_VERTEX_COUNT - sll $11, $1, 12 // Vtx count * 0x10000 - add perfCounterA, perfCounterA, $11 // Add to vertex count -.endif - j vtx_addrs_from_cmd // v0 << 1 is elem 2, (v0 + n) << 1 is elem 3 = $10 - li $11, vtx_return_from_addrs vtx_return_from_addrs: andi $10, $10, 0xFFF8 // Round down end addr to DMA word; one input vtx still fits in one internal vtx mfc2 outputVtxPos, $v27[4] // Address of start in vtxSize units @@ -1759,43 +1861,10 @@ skip_return_to_lt_or_loop: .endif // CFG_NO_OCCLUSION_PLANE .if CFG_LEGACY_VTX_PIPE || CFG_NO_OCCLUSION_PLANE -G_MTX_end: // Multiplies the temp loaded matrix into the M or VP matrix - lhu $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP. 
- li $3, tempMemRounded // Input 1 = temp mem (loaded mtx) - jal while_wait_dma_busy - move $2, $6 // Input 0 = output -mtx_multiply: // $3, $2 are input matrices; $6 is output matrix; $7 is 0 for return to vtx - addi $10, $3, 0x0018 -@@loop: - vmadn $v7, $v31, $v31[2] // 0 - addi $11, $3, 0x0008 - vmadh $v6, $v31, $v31[2] // 0 - addi $2, $2, -0x0020 - vmudh $v29, $v31, $v31[2] // 0 -@@innerloop: - ldv $v3[0], 0x0040($2) - ldv $v3[8], 0x0040($2) - lqv $v1[0], 0x0020($3) // Input 1 - ldv $v2[0], 0x0020($2) - ldv $v2[8], 0x0020($2) - lqv $v0[0], 0x0000($3) // Input 1 - vmadl $v29, $v3, $v1[0h] - addi $3, $3, 0x0002 - vmadm $v29, $v2, $v1[0h] - addi $2, $2, 0x0008 // Increment input 0 pointer - vmadn $v5, $v3, $v0[0h] - bne $3, $11, @@innerloop - vmadh $v4, $v2, $v0[0h] - bne $3, $10, @@loop - addi $3, $3, 0x0008 - sqv $v7[0], (0x0020)($6) - sqv $v6[0], (0x0000)($6) -.if CFG_LEGACY_VTX_PIPE - beqz $7, vtx_after_mtx_multiply -.endif - sqv $v4[0], (0x0010)($6) - j run_next_DL_command - sqv $v5[0], (0x0030)($6) +G_MTX_end: + instantiate_mtx_end_begin +mtx_multiply: + instantiate_mtx_multiply .endif .if (. & 4) @@ -2455,7 +2524,7 @@ dma_read_write: bnez $11, dma_read_write addi perfCounterD, perfCounterD, 6 // 3 instr + 2 after mfc + 1 taken branch j dma_read_write_not_full - // Padding nops or $11 load in delay slot are not harmful. + // Padding nops or $11 load in delay slot are harmless. 
.endif totalImemUseUpTo1FC8: @@ -3314,111 +3383,17 @@ ovl234_clipping_entrypoint_ovl4ver: // same IMEM address as ovl234_clippi li cmd_w1_dram, orga(ovl3_start) // set up a load for overlay 3 ovl4_select_instr: - li $2, 1 // $7 = 1 (lighting && mIT invalid) if doing calc_mit - beq $2, $7, calc_mit // otherwise $7 = command byte - li $3, G_BRANCH_WZ - beq $3, $7, g_branch_wz_real - li $2, (0xFF00 | G_DMA_IO) - beq $2, $7, g_dma_io_real - li $3, (0xFF00 | G_MEMSET) - beq $3, $7, g_memset_real - nop - // Otherwise g_mtx_end_real - .if !CFG_NO_OCCLUSION_PLANE -g_mtx_end_real: -// Multiplies the temp loaded matrix into the M or VP matrix - lhu $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP. - li $3, tempMemRounded // Input 1 = temp mem (loaded mtx) - jal while_wait_dma_busy // If ovl4 already in memory, was not done - move $2, $6 // Input 0 = output - addi $10, $3, 0x0018 -@@loop: - vmadn $v7, $v31, $v31[2] // 0 - addi $11, $3, 0x0008 - vmadh $v6, $v31, $v31[2] // 0 - addi $2, $2, -0x0020 - vmudh $v29, $v31, $v31[2] // 0 -@@innerloop: - ldv $v3[0], 0x0040($2) - ldv $v3[8], 0x0040($2) - lqv $v1[0], 0x0020($3) // Input 1 - ldv $v2[0], 0x0020($2) - ldv $v2[8], 0x0020($2) - lqv $v0[0], 0x0000($3) // Input 1 - vmadl $v29, $v3, $v1[0h] - addi $3, $3, 0x0002 - vmadm $v29, $v2, $v1[0h] - addi $2, $2, 0x0008 // Increment input 0 pointer - vmadn $v5, $v3, $v0[0h] - bne $3, $11, @@innerloop - vmadh $v4, $v2, $v0[0h] - bne $3, $10, @@loop - addi $3, $3, 0x0008 - sqv $v7[0], (0x0020)($6) - sqv $v6[0], (0x0000)($6) - sqv $v4[0], (0x0010)($6) - j run_next_DL_command - sqv $v5[0], (0x0030)($6) + li $2, (0xFF00 | G_MTX) + beq $2, $7, g_mtx_end_ovl4 .endif - -g_dma_io_real: - jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one - lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command) - andi 
dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment - // At this point, dmemAddr's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size - // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit - sra dmemAddr, dmemAddr, 2 - j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr) - li $ra, wait_for_dma_and_run_next_command // Setup the return address for running the next DL command - -g_memset_real: -memsetBufferStart equ ((vertexBuffer + 0xF) & 0xFF0) -memsetBufferEnd equ (clipTempVertsEnd & 0xFF0) -memsetBufferSize equ (memsetBufferEnd - memsetBufferStart) - llv $v2[0], (rdpHalf1Val)($zero) // Load the memset value - sll cmd_w0, cmd_w0, 8 // Clear upper byte - jal segmented_to_physical - srl cmd_w0, cmd_w0, 8 // Number of bytes to memset (must be mult of 16) - li $3, memsetBufferStart + 0x10 // Last qword set is memsetBufferStart - jal clamp_to_memset_buffer - vmudh $v2, vOne, $v2[1] // Move element 1 (lower bytes) to all - addi $2, $2, memsetBufferStart // First qword set is one below memsetBufferEnd -@@pre_loop: - sqv $v2, (-0x10)($2) - bne $2, $3, @@pre_loop - addi $2, -0x10 -@@transaction_loop: - jal clamp_to_memset_buffer - li dmemAddr, 0x8000 | memsetBufferStart // Always write from start of buffer - jal dma_read_write - addi dmaLen, $2, -1 - sub cmd_w0, cmd_w0, $2 - bgtz cmd_w0, @@transaction_loop - add cmd_w1_dram, cmd_w1_dram, $2 - j wait_for_dma_and_run_next_command - // Delay slot harmless -clamp_to_memset_buffer: - addi $11, cmd_w0, -memsetBufferSize // Is more than a whole buffer left? 
- bltz $11, return_routine - move $2, cmd_w0 // No, use partial buffer - jr $ra - li $2, memsetBufferSize - -g_branch_wz_real: - j vtx_addrs_from_cmd // byte 3 = vtx being tested; addr -> $10 - li $11, branchwz_return_from_addrs -branchwz_return_from_addrs: -.if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2 - lh $10, VTX_W_INT($10) // read the w coordinate of the vertex (f3dzex) -.else - lw $10, VTX_SCR_Z($10) // read the screen z coordinate (int and frac) of the vertex (f3dex2) -.endif - sub $2, $10, cmd_w1_dram // subtract the w/z value being tested - bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL - lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to - j branch_dl // need $2 < 0 for nopush and cmd_w1_dram - li cmd_w0, 0 // No count of DL cmds to skip + li $3, G_BRANCH_WZ + beq $3, $7, g_branch_wz_ovl4 + li $2, (0xFF00 | G_DMA_IO) + beq $2, $7, g_dma_io_ovl4 + li $3, (0xFF00 | G_MEMSET) + beq $3, $7, g_memset_ovl4 + // Otherwise calc_mit. Delay slot is harmless. calc_mit: /* @@ -3558,7 +3533,22 @@ calc_mit: sdv $v21[0], (mITMatrix + 0x18)($zero) j vtx_after_calc_mit sdv $v20[0], (mITMatrix + 0x00)($zero) - + +.if !CFG_NO_OCCLUSION_PLANE +g_mtx_end_ovl4: + instantiate_mtx_end_begin + instantiate_mtx_multiply +.endif + +g_branch_wz_ovl4: + instantiate_branch_wz + +g_dma_io_ovl4: + instantiate_dma_io + +g_memset_ovl4: + instantiate_memset + .endif // !CFG_LEGACY_VTX_PIPE ovl4_end: diff --git a/gbi.h b/gbi.h index d6b493a..df4628e 100644 --- a/gbi.h +++ b/gbi.h @@ -2388,6 +2388,9 @@ _DW({ \ /** * Use RSP DMAs to set a region of memory to a repeated 16-bit value. This can * clear the color framebuffer or Z-buffer faster than the RDP can in fill mode. + * SPMemset overwrites the DMEM vertex buffer, so vertices loaded before this + * command cannot be used after it (though this would not normally be done). + * * dram: Segmented or physical start address. 
Must be aligned to 16 bytes. * value: 16-bit value to fill the memory with. e.g. 0 for color, 0xFFFC for Z. * size: Size in bytes to fill, must be nonzero and a multiple of 16 bytes. diff --git a/rsp/rsp_defs.inc b/rsp/rsp_defs.inc index cd8160e..ea162a5 100644 --- a/rsp/rsp_defs.inc +++ b/rsp/rsp_defs.inc @@ -14,18 +14,18 @@ YDF_OFFSET_TASKDATAPTR equ 0x10 YDF_OFFSET_UCODE equ 0x14 // OSTask data member offsets -OSTask_type equ 0x0000 -OSTask_flags equ 0x0004 // see note below -OSTask_ucode_boot equ 0x0008 -OSTask_ucode_boot_size equ 0x000C +OSTask_type equ 0x0000 // EX3: fourthQWMVP ^ +OSTask_flags equ 0x0004 // see note below | +OSTask_ucode_boot equ 0x0008 // | +OSTask_ucode_boot_size equ 0x000C // v OSTask_ucode equ 0x0010 // used in F3D, S2D, and boot -OSTask_ucode_size equ 0x0014 -OSTask_ucode_data equ 0x0018 // used in boot only -OSTask_ucode_data_size equ 0x001C // used in boot only +OSTask_ucode_size equ 0x0014 // EX3: startCounterTime +OSTask_ucode_data equ 0x0018 // used in boot only. EX3: xfrmLookatDirs ^ +OSTask_ucode_data_size equ 0x001C // used in boot only v OSTask_dram_stack equ 0x0020 // used in F3D and S2D at first startup OSTask_dram_stack_size equ 0x0024 OSTask_output_buff equ 0x0028 // used in F3D and S2D -OSTask_output_buff_size equ 0x002C // actually end pointer; used in F3D and S2D +OSTask_output_buff_size equ 0x002C // used in F3D and S2D; actually end pointer not size OSTask_data_ptr equ 0x0030 // used in F3D and S2D OSTask_data_size equ 0x0034 OSTask_yield_data_ptr equ 0x0038 // used in F3D and S2D