From d7f33fea9297a7a1180d29bcdc7aac01052d03b0 Mon Sep 17 00:00:00 2001 From: Sauraen Date: Sun, 28 Jan 2024 16:40:14 -0800 Subject: [PATCH] Integrated changes from Tharo --- README.md | 28 +++++++-- f3dex3.s | 162 ++++++++++++++++++++++++++-------------------------- gbi.h | 7 ++- rsp/gbi.inc | 1 + 4 files changed, 110 insertions(+), 88 deletions(-) diff --git a/README.md b/README.md index f384a34..8e06cbe 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,11 @@ you should expect crashes and graphical issues.** commands in the next DL to be fetched, rather than always fetching full buffers, **saving some DRAM traffic** (maybe around 100 us per frame). The bits used for this are ignored by HLE. +- Segment addresses are now resolved relative to other segments (feature by + Tharo). This enables a strategy for skipping repeated material DLs: call + a segment to run the material, remap the segment in the material to a + display list that immediately returns, and so if the material is called again + it won't run. - Clipped triangles are drawn by minimal overlapping scanlines algorithm; this **slightly improves RDP draw time** for large tris (max of about 500 us per frame, usually much less or zero). @@ -299,8 +304,7 @@ and commands except: MVP matrix in F3DEX3. - `G_MV_POINT` has been removed. This was not used in any command; it would have likely been used for debugging to copy vertices from DMEM to examine them. - This does not affect `SPModifyVertex`, which is still supported, though this - is moved to Overlay 4 (see below) so it will be slower than in F3DEX2. + This does not affect `SPModifyVertex`, which is still supported. - `G_MW_PERSPNORM` has been removed; `SPPerspNormalize` is still supported but is encoded differently, no longer using this define. - `G_MVO_LOOKATX` and `G_MVO_LOOKATY` have been removed, and `SPLookAtX` and @@ -336,6 +340,12 @@ them binary incompatible. The lighting data structures, e.g. `Light_t`, `PosLight_t`, `LookAt_t`, `Lightsn`, `Lights*`, `PosLights*`, etc., have also changed--generally only slightly, so most code is compatible with no changes. +`SPSegment` has been given a different command id (`G_RELSEGMENT` vs. +`G_MOVEWORD`) to facilitate relative segmented address translation. The +original binary encoding is still valid, but does not support relative +translation like the new encoding. However, recompiling with the C GBI will +always use the new encoding. + ## What are the tradeoffs for all these new features? @@ -368,7 +378,6 @@ This overlay contains handlers for: discussed below - The codepath for `SPMatrix` with `G_MTX_MUL` set - `SPBranchLessZ*` -- `SPModifyVertex` - `SPDma_io` Whenever any of these features is needed, the RSP has to swap to Overlay 4. The @@ -475,6 +484,17 @@ the FIFO is occupied by full-size tris, so the buffers are effectively only two tris in size because a third tri can't fit. So, their size has been reduced to two tris, saving a substantial amount of DMEM. +### Segment 0 + +Segment 0 is now reserved: ensure segment 0 is never set to anything but +0x00000000. In F3DEX2 and prior this was only a good idea (and SM64 and OoT +always follow this); in F3DEX3 segmented addresses are now resolved relative to +other segments. That is, `gsSPSegment(0x08, 0x07001000)` sets segment 8 to the +base address of segment 7 with an additional offset of 0x1000. So for correct +behavior when supplying a direct-mapped or physical address such as 0x80101000, +segment 0 must always be 0x00000000 so that this address resolves to e.g. +0x101000 as expected in this example. + ### Obscure semantic differences from F3DEX2 that should never matter in practice - `SPLoadUcode*` corrupts the current M inverse transpose matrix state. If using @@ -534,7 +554,7 @@ are credited. Other credits: - Wiseguy: large chunk of F3DEX2 disassembly documentation and first version of build system +- Tharo: relative segment resolution feature, other feature discussions - Kaze Emanuar: several feature suggestions, testing - thecozies: Fresnel feature suggestion -- Tharo: feature discussions - neoshaman: feature discussions diff --git a/f3dex3.s b/f3dex3.s index 325629e..182e835 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -415,7 +415,7 @@ jumpTableEntry G_TEXRECT_handler jumpTableEntry G_TEXRECTFLIP_handler cmdJumpTable: jumpTableEntry G_VTX_handler -jumpTableEntry ovl234_ovl4_entrypoint // G_MODIFYVTX +jumpTableEntry G_MODIFYVTX_handler jumpTableEntry G_CULLDL_handler jumpTableEntry ovl234_ovl4_entrypoint // G_BRANCH_WZ jumpTableEntry G_TRI1_handler @@ -424,6 +424,7 @@ jumpTableEntry G_QUAD_handler jumpTableEntry G_TRISTRIP_handler jumpTableEntry G_TRIFAN_handler jumpTableEntry G_LIGHTTORDP_handler +jumpTableEntry G_RELSEGMENT_handler // The maximum number of generated vertices in a clip polygon. In reality, this // is equal to MAX_CLIP_POLY_VERTS, but for testing we can change them separately. @@ -811,34 +812,32 @@ call_ret_common: j displaylist_dma_with_count sb $1, displayListStackLength -G_CULLDL_handler: - j vtx_addrs_from_cmd // Load start vtx addr in $12, end vtx in $3 - li $11, culldl_return_from_addrs -culldl_return_from_addrs: - /* - CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1 - verts which are behind the occlusion plane, and 1 vert which is behind the camera - plane and therefore randomly erroneously also set as behind the occlusion plane. - However, the convex hull of all the verts goes through visible area. This will be - incorrectly culled here. We can't afford the extra few instructions to disable - the occlusion plane if the vert is behind the camera, because this only matters for - G_CULLDL and not for tris. - */ - li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) - lhu $11, VTX_CLIP($12) -culldl_loop: - and $1, $1, $11 - beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render - lhu $11, (vtxSize + VTX_CLIP)($12) // next vertex clip flags - bne $12, $3, culldl_loop // loop until reaching the last vertex - addi $12, $12, vtxSize // advance to the next vertex - li cmd_w0, 0 // Clear count of DL cmds to skip loading -G_ENDDL_handler: - lbu $1, displayListStackLength // Load the DL stack index; if end stack, - beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end - addi $1, $1, -4 // Decrement the DL stack index - j call_ret_common // has a different version in ovl1 - lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to +.if !CFG_GCLK_SAMPLE +G_LIGHTTORDP_handler: + lbu $11, numLightsxSize // Ambient light + lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size + andi $2, cmd_w0, 0x00FF // Byte 3 = alpha + sub $1, $11, $1 // Light address; byte 2 counts from end + lw $3, (lightBufferMain-1)($1) // Load light RGB into lower 3 bytes + move cmd_w0, cmd_w1_dram // Move second word to first (cmd byte, prim level) + sll $3, $3, 8 // Shift light RGB to upper 3 bytes and clear alpha byte + j G_RDP_handler // Send to RDP + or cmd_w1_dram, $3, $2 // Combine RGB and alpha in second word +.endif + +G_SETxIMG_handler: + beqz $7, G_RDP_handler // Don't do any of this for G_NOOP + lb $3, materialCullMode // Get current mode + jal segmented_to_physical // Convert image to physical address + lw $2, lastMatDLPhyAddr // Get last material physical addr + bnez $3, G_RDP_handler // If not in normal mode (0), exit + add $12, taskDataPtr, inputBufferPos // Current material physical addr + beq $12, $2, @@skip // Branch if we are executing the same mat again + sw $12, lastMatDLPhyAddr // Store material physical addr + li $7, 1 // > 0: in material first time +@@skip: // Otherwise $7 was < 0: cull mode (in mat second time) + j G_RDP_handler + sb $7, materialCullMode refine_cmd_further: addi $12, $7, -(0xFF00 | G_SETSCISSOR) // Relative to G_SETSCISSOR = 0 @@ -1492,6 +1491,10 @@ vtx_skip_fog: jr $ra sh $12, (VTX_CLIP )(outputVtxPos) // Store first vertex results +G_MODIFYVTX_handler: + // Command byte 3 = vtx being modified; its addr -> $12 + li $11, do_moveword // Moveword adds cmd_w0 to $12 for final addr + lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx vtx_addrs_from_cmd: // Treat eight bytes of last command each as vertex indices << 1 // inputBufferEnd is close enough to the end of DMEM to fit in signed offset @@ -1841,30 +1844,31 @@ tDaDeI equ $v9 vmadh tV1AtI, tDaDeI, $v26[1] sdv tDaDeI[8], 0x0020($1) // Store DsDe, DtDe, DwDe texture coefficients (integer) tV1AtFF equ $v10 - vmudn tV1AtFF, tDaDeF, $v4[1] // Super-frac (frac * frac) part; assumes v4 factor >= 0 - vmudn tDaDeF, tDaDeF, $v30[7] // 0x0020 - vmadh tDaDeI, tDaDeI, $v30[7] // 0x0020 + // All values start in element 7. "a", attribute, is Z. Need + // tV1AtI, tV1AtF, tDaDxI, tDaDxF, tDaDeI, tDaDeF, tDaDyI, tDaDyF + vmov tDaDyF[5], tDaDeF[7] // DaDy already in elem 7; DaDe to elem 5 sdv tV1AtF[0], 0x0010($2) // Store RGBA shade color (fractional) - vmudn tDaDxF, tDaDxF, $v30[7] // 0x0020 + vmov tDaDyI[5], tDaDeI[7] sdv tV1AtI[0], 0x0000($2) // Store RGBA shade color (integer) - vmadh tDaDxI, tDaDxI, $v30[7] // 0x0020 + vmov tDaDyF[3], tDaDxF[7] // DaDx to elem 3 sdv tV1AtF[8], 0x0010($1) // Store S, T, W texture coefficients (fractional) - vmudn tDaDyF, tDaDyF, $v30[7] // 0x0020 + vmov tDaDyI[3], tDaDxI[7] + sdv tV1AtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer) + vmudn tV1AtFF, tDaDeF, $v4[1] // Super-frac (frac * frac) part; assumes v4 factor >= 0 beqz $6, check_rdp_buffer_full // see below - sdv tV1AtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer) + veq $v29, $v31, $v31[1q] // Set VCC to 01010101 + vmudn tDaDyF, tDaDyF, $v30[7] // 0x0020 vmadh tDaDyI, tDaDyI, $v30[7] // 0x0020 - ssv tDaDeF[14], -0x0006(rdpCmdBufPtr) vmudl $v29, tV1AtFF, $v30[7] // 0x0020 - ssv tDaDeI[14], -0x0008(rdpCmdBufPtr) vmadn tV1AtF, tV1AtF, $v30[7] // 0x0020 - ssv tDaDxF[14], -0x000A(rdpCmdBufPtr) vmadh tV1AtI, tV1AtI, $v30[7] // 0x0020 - ssv tDaDxI[14], -0x000C(rdpCmdBufPtr) - ssv tDaDyF[14], -0x0002(rdpCmdBufPtr) - ssv tDaDyI[14], -0x0004(rdpCmdBufPtr) - ssv tV1AtF[14], -0x000E(rdpCmdBufPtr) + vmrg tDaDyF, tDaDyF, tDaDyI[1q] // Move int elems 3, 5, 7 to result 2, 4, 6 + ssv tV1AtF[14], -0x0E(rdpCmdBufPtr) + ssv tV1AtI[14], -0x10(rdpCmdBufPtr) + slv tDaDyF[4], -0x0C(rdpCmdBufPtr) // DaDx i/f j check_rdp_buffer_full // eventually returns to $ra, which is next cmd, second tri in TRI2, or middle of clipping - ssv tV1AtI[14], -0x10(rdpCmdBufPtr) + sdv tDaDyF[8], -0x08(rdpCmdBufPtr) // DaDe i/f, DaDy i/f + load_overlay_0_and_enter: G_LOAD_UCODE_handler: @@ -2097,8 +2101,10 @@ G_RDPHALF_2_handler: j G_RDP_handler sdv $v29[0], -8(rdpCmdBufPtr) +G_RELSEGMENT_handler: + jal segmented_to_physical // Resolve new segment address relative to existing segment G_MOVEWORD_handler: - srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT) + srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT) lhu $12, (movewordTable - (G_MOVEWORD << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304) do_moveword: sll $11, cmd_w0, 16 // Sign bit = upper bit of offset @@ -2118,32 +2124,35 @@ segmented_to_physical: jr $ra add cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the virtual address -G_SETxIMG_handler: - beqz $7, G_RDP_handler // Don't do any of this for G_NOOP - lb $3, materialCullMode // Get current mode - jal segmented_to_physical // Convert image to physical address - lw $2, lastMatDLPhyAddr // Get last material physical addr - bnez $3, G_RDP_handler // If not in normal mode (0), exit - add $12, taskDataPtr, inputBufferPos // Current material physical addr - beq $12, $2, @@skip // Branch if we are executing the same mat again - sw $12, lastMatDLPhyAddr // Store material physical addr - li $7, 1 // > 0: in material first time -@@skip: // Otherwise $7 was < 0: cull mode (in mat second time) - j G_RDP_handler - sb $7, materialCullMode +G_CULLDL_handler: + j vtx_addrs_from_cmd // Load start vtx addr in $12, end vtx in $3 + li $11, culldl_return_from_addrs +culldl_return_from_addrs: + /* + CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1 + verts which are behind the occlusion plane, and 1 vert which is behind the camera + plane and therefore randomly erroneously also set as behind the occlusion plane. + However, the convex hull of all the verts goes through visible area. This will be + incorrectly culled here. We can't afford the extra few instructions to disable + the occlusion plane if the vert is behind the camera, because this only matters for + G_CULLDL and not for tris. + */ + li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) + lhu $11, VTX_CLIP($12) +culldl_loop: + and $1, $1, $11 + beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render + lhu $11, (vtxSize + VTX_CLIP)($12) // next vertex clip flags + bne $12, $3, culldl_loop // loop until reaching the last vertex + addi $12, $12, vtxSize // advance to the next vertex + li cmd_w0, 0 // Clear count of DL cmds to skip loading +G_ENDDL_handler: + lbu $1, displayListStackLength // Load the DL stack index; if end stack, + beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end + addi $1, $1, -4 // Decrement the DL stack index + j call_ret_common // has a different version in ovl1 + lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to -.if !CFG_GCLK_SAMPLE -G_LIGHTTORDP_handler: - lbu $11, numLightsxSize // Ambient light - lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size - andi $2, cmd_w0, 0x00FF // Byte 3 = alpha - sub $1, $11, $1 // Light address; byte 2 counts from end - lw $3, (lightBufferMain-1)($1) // Load light RGB into lower 3 bytes - move cmd_w0, cmd_w1_dram // Move second word to first (cmd byte, prim level) - sll $3, $3, 8 // Shift light RGB to upper 3 bytes and clear alpha byte - j G_RDP_handler // Send to RDP - or cmd_w1_dram, $3, $2 // Combine RGB and alpha in second word -.endif ovl1_end: .align 8 @@ -2496,10 +2505,8 @@ ovl4_select_instr: beq $2, $7, calc_mit // otherwise $7 = command byte li $3, G_BRANCH_WZ beq $3, $7, G_BRANCH_WZ_handler - li $2, G_MODIFYVTX - beq $2, $7, G_MODIFYVTX_handler - li $3, (0xFF00 | G_DMA_IO) - beq $3, $7, G_DMA_IO_handler + li $2, (0xFF00 | G_DMA_IO) + beq $2, $7, G_DMA_IO_handler // Otherwise G_MTX_end, which starts with a harmless instruction G_MTX_end: // Multiplies the temp loaded matrix into the M or VP matrix @@ -2547,13 +2554,6 @@ G_DMA_IO_handler: j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr) li $ra, wait_for_dma_and_run_next_command // Setup the return address for running the next DL command -G_MODIFYVTX_handler: - j vtx_addrs_from_cmd // byte 3 = vtx being modified; addr -> $12 - li $11, modifyvtx_return_from_addrs -modifyvtx_return_from_addrs: - j do_moveword // Moveword adds cmd_w0 to $12 for final addr - lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) - G_BRANCH_WZ_handler: j vtx_addrs_from_cmd // byte 3 = vtx being tested; addr -> $12 li $11, branchwz_return_from_addrs diff --git a/gbi.h b/gbi.h index bfd178b..3dbc364 100644 --- a/gbi.h +++ b/gbi.h @@ -97,6 +97,7 @@ of warnings if you use -Wpedantic. */ #define G_TRISTRIP 0x08 /* = G_LINE3D was a no-op in F3DEX2, has been removed */ #define G_TRIFAN 0x09 #define G_LIGHTTORDP 0x0A +#define G_RELSEGMENT 0x0B /* names differ between F3DEX2 and F3DZEX */ #define G_BRANCH_Z G_BRANCH_WZ @@ -2600,11 +2601,11 @@ _DW({ \ /* * Moveword commands */ - +/* not strictly a moveword command anymore */ #define gSPSegment(pkt, segment, base) \ - gMoveWd(pkt, G_MW_SEGMENT, (segment) * 4, (base)) + gDma1p((pkt), G_RELSEGMENT, (base), ((segment) * 4) & 0xFFF, G_MW_SEGMENT) #define gsSPSegment(segment, base) \ - gsMoveWd( G_MW_SEGMENT, (segment) * 4, (base)) + gsDma1p( G_RELSEGMENT, (base), ((segment) * 4) & 0xFFF, G_MW_SEGMENT) #define gSPPerspNormalize(pkt, s) gMoveHalfwd(pkt, G_MW_FX, G_MWO_PERSPNORM, (s)) #define gsSPPerspNormalize(s) gsMoveHalfwd( G_MW_FX, G_MWO_PERSPNORM, (s)) diff --git a/rsp/gbi.inc b/rsp/gbi.inc index e1009db..6a12471 100644 --- a/rsp/gbi.inc +++ b/rsp/gbi.inc @@ -88,6 +88,7 @@ G_QUAD equ 0x07 G_TRISTRIP equ 0x08 G_TRIFAN equ 0x09 G_LIGHTTORDP equ 0x0a +G_RELSEGMENT equ 0x0b G_BRANCH_Z equ G_BRANCH_WZ