Moved things around

This commit is contained in:
Sauraen
2025-09-21 15:06:43 -07:00
parent cd8fe1bc9b
commit ed62620414
2 changed files with 138 additions and 153 deletions

View File

@@ -19,9 +19,9 @@ occlusion plane for all vertices.
## Functionality in Overlay 3
The following commands are moved to Overlay 3 in F3DEX3 to save IMEM space. This
means that code will have to be loaded from DRAM to run them if Overlays 2 or 4
(for lighting) happen to be loaded already.
The following commands are moved to Overlay 2 or 3 in F3DEX3 to save IMEM space.
This means that code will have to be loaded from DRAM to run them if a different
overlay happens to be loaded already.
- Push and multiply codepaths for `SPMatrix`
- `SPPopMatrix*`
- `SPDma*`
@@ -32,7 +32,8 @@ However:
or accuracy, and these are not used for most 3D objects in SM64 or OoT.
- `SPDma*` is rarely used except at startup for HLE detection.
- `SPMemset` is a new F3DEX3 command which can improve performance. Plus, it is
typically run shortly after render start, when Overlay 3 is already in IMEM.
typically run shortly after render start, when Overlay 3 (which contains it)
is already in IMEM.
So there is not a significant practical performance impact from these changes.
@@ -117,10 +118,10 @@ segment 0 must always be 0x00000000 so that this address resolves to e.g.
In F3DEX2, the RSP time for drawing non-textured tris was significantly lower
than for textured tris, by skipping a chunk of computation for the texture
coefficients if they were disabled. In F3DEX3, no computation is skipped when
textures are disabled. However, almost all materials use textures, and F3DEX3 is
a little faster at drawing textured tris than F3DEX2. Plus, F3DEX3 still does
not send the texture coefficients if they are disabled, saving DRAM access time
for RSP -> FIFO and FIFO -> RDP. RDP time savings from avoiding loading a
textures are disabled. However, practically almost all materials use textures,
and F3DEX3 is faster at drawing textured tris than F3DEX2. Plus, F3DEX3 still
does not send the texture coefficients if they are disabled, saving DRAM access
time for RSP -> FIFO and FIFO -> RDP. RDP time savings from avoiding loading a
texture are unaffected of course.
## Obscure semantic differences from F3DEX2 that should never matter in practice
@@ -134,3 +135,6 @@ texture are unaffected of course.
to hold state during some display list macros which are actually two 8-byte
commands. This change is not noticeable when using standard GBI commands, only
if something highly custom has been set up.
- `SPTexture` and `SPFogFactor` state is corrupted when loading and returning
from another microcode (S2DEX). In F3DEX2, it would be reinitialized to
default values; in F3DEX3, it is left as garbage values.

271
f3dex3.s
View File

@@ -254,12 +254,16 @@ otherMode0: // command byte included, same as above
otherMode1:
.dw 0x00000000
unused4:
.fill 8
unused3:
.fill 4
// These two words are texrectState in S2DEX, so it can clobber them.
textureSettings1:
.dw 0x00000000 // first word, has command byte, level, tile, and on
textureSettings2:
.dw 0xFFFFFFFF // second word, has s and t scale
// This word is rdpHalf1Val in S2DEX, so it can clobber it.
fogFactor:
.dw 0x00000000
activeClipPlanes:
.dh CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri write, set to zero when clipping
@@ -477,25 +481,6 @@ materialCullMode:
geometryModeLabel:
.dw 0x00000000
.if (. & 7) != 0
.error "textureSettings align to 8 broken"
.endif
textureSettings1:
.dw 0x00000000 // first word, has command byte, level, tile, and on
textureSettings2:
.dw 0xFFFFFFFF // second word, has s and t scale
fogFactor:
.dw 0x00000000
// First half of RDP value for split commands. Also used as temp storage for
// tri vertices during tri commands.
rdpHalf1Val:
.fill 4
// moveword table
movewordTable:
.dh fxParams // G_MW_FX
.dh numLightsxSize - 3 // G_MW_NUMLIGHT; writes numLightsxSize and pointLightFlag, zeroes dirLightsXfrmValid
@@ -511,7 +496,11 @@ packedNormalsConstants:
.dh fogFactor // G_MW_FOG
.dh lightBufferMain // G_MW_LIGHTCOL
// Movemem table
// First half of RDP value for split commands. Also used as temp storage for
// tri vertices during tri commands.
rdpHalf1Val:
.fill 4
movememTable:
.dh mMatrix // G_MV_MMTX
.dh tempMatrix // G_MV_TEMPMTX0 multiply temp matrix (model)
@@ -519,7 +508,7 @@ movememTable:
.dh tempMatrix // G_MV_TEMPMTX1 multiply temp matrix (view*projection)
.dh viewport // G_MV_VIEWPORT
.dh cameraWorldPos // G_MV_LIGHT
afterMovememRaTable:
.dh run_next_DL_command
.dh G_MTX_multiply_end
@@ -1124,34 +1113,6 @@ start_padded_end:
.orga max(orga(), max(ovl0_padded_end - ovl0_start, ovl1_padded_end - ovl1_start) - 0x80)
ovl01_end:
// G_CULLDL: decide whether the rest of the current display list can be skipped.
// Walks the vertices from the start entry ($10) to the end entry ($3) and keeps
// in $1 the running AND of each vertex's clip flags, masked to
// CLIP_SCRN_NPXY | CLIP_CAMPLANE. If $1 ever becomes zero, no single flag in
// the mask is set for every vertex seen so far, so the display list continues
// normally. If the loop finishes with $1 still nonzero, execution falls
// through into G_ENDDL_handler to end the display list.
// The instruction following each branch/jump is its delay slot and always runs.
G_CULLDL_handler: // 15
lhu $10, (vertexTable)(cmd_w0) // Start vtx addr
lhu $3, (vertexTable)(cmd_w1_dram) // End vertex
/*
CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1
verts which are behind the occlusion plane, and 1 vert which is behind the camera
plane and therefore randomly erroneously also set as behind the occlusion plane.
However, the convex hull of all the verts goes through visible area. This will be
incorrectly culled here. We can't afford the extra few instructions to disable
the occlusion plane if the vert is behind the camera, because this only matters for
G_CULLDL and not for tris.
*/
li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) // Initial mask: flags that may cause culling
lhu $11, VTX_CLIP($10) // Clip flags of the first vertex
culldl_loop:
and $1, $1, $11 // Keep only flags set in every vertex so far
beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render
lhu $11, (vtxSize + VTX_CLIP)($10) // next vertex clip flags (delay slot; on the last pass the value is not used by the loop)
bne $10, $3, culldl_loop // loop until reaching the last vertex
addi $10, $10, vtxSize // advance to the next vertex
li cmd_w0, 0 // Clear count of DL cmds to skip loading
// G_ENDDL: return to the display list that called this one, using the entry at
// the top of displayListStack, or go to overlay 0 when the stack is empty.
G_ENDDL_handler:
lbu $1, displayListStackLength // Load the DL stack index; if end stack,
beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end
addi $1, $1, -4 // Decrement the DL stack index (delay slot: also runs when the branch is taken, leaving $1 = -4)
j call_ret_common // has a different version in ovl1
lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to
G_POPMTX_handler:
G_DMA_IO_handler:
j ovl234_ltbasic_entrypoint // Delay slot is harmless
@@ -1196,6 +1157,23 @@ dma_and_wait_goto_next_ra:
j dma_read_write
li $ra, wait_goto_next_ra
// G_SETxIMG: convert the image address in the command to a physical address,
// update the material-tracking state, then write both command words to the
// RDP command buffer.
// Material tracking only happens when materialCullMode is 0: the address of the
// current command (taskDataPtr + inputBufferPos) is compared with
// lastMatDLPhyAddr to detect the same material being executed again.
// The instruction following each branch/jump is its delay slot and always runs.
G_SETxIMG_handler: // 12
lb $3, materialCullMode // Get current mode
jal segmented_to_physical // Convert image to physical address
lw $2, lastMatDLPhyAddr // Get last material physical addr
bnez $3, send_w0_w1_to_rdp // If not in normal mode (0), exit
add $10, taskDataPtr, inputBufferPos // Current material physical addr
beq $10, $2, @@skip // Branch if we are executing the same mat again
sw $10, lastMatDLPhyAddr // Store material physical addr (delay slot: stored on both paths)
li $7, 1 // > 0: in material first time
@@skip: // Otherwise $7 was < 0: cull mode (in mat second time)
sb $7, materialCullMode // Record the new mode (1, or the low byte of the incoming $7)
// Shared tail: store w0 and w1 at the current RDP buffer position and commit.
send_w0_w1_to_rdp:
sw cmd_w0, 0(rdpCmdBufPtr)
// Entry point for callers that have already stored the first word themselves.
send_w1_to_rdp:
j commit_small_rdp_command
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Delay slot: second word of the RDP command
G_MEMSET_handler:
j ovl234_clipmisc_entrypoint // Delay slot is harmless
load_cmds_handler:
@@ -1252,31 +1230,6 @@ run_next_DL_command:
// $7 must retain the command byte for load_mtx and overlay 3 stuff
// $ra must contain the handler called for several handlers
/* This is a crazy optimization, and it was completely accidental!
When G_RELSEGMENT was implemented, we did not notice the G_MOVEWORD behavior of
subtracting ((G_MOVEWORD & 0xF) << 8) from the movewordTable address in order to
remove the command byte. Since the command byte is G_RELSEGMENT, not G_MOVEWORD,
the final address is completely wrong. However, DMEM wraps at 4 KiB--only the
lowest 12 bits of any address are significant, so only the lowest 4 bits of the
command byte survive in the address. And, G_RELSEGMENT **happened** to end in
0xB, the same as G_MOVEWORD! So the wrong address aliases to the correct one!
I only noticed this when I tried to move G_RELSEGMENT to a different command
byte and got crashes. */
// Build-time guard for the aliasing described above.
.if (G_RELSEGMENT & 0xF) != (G_MOVEWORD & 0xF)
.error "Crazy relsegment optimization broken, don't change command byte assignments"
.endif
// G_RELSEGMENT: same as G_MOVEWORD, except the value word is first passed
// through segmented_to_physical. The srl below sits in the jal delay slot, so it
// runs before the call; execution then resumes at the lhu (standard MIPS jal
// return address).
G_RELSEGMENT_handler: // 9
jal segmented_to_physical // Resolve new segment address relative to existing segment
// G_MOVEWORD: look up a DMEM base address in movewordTable using the index in
// the command, add the offset from the command, and store the value word there.
G_MOVEWORD_handler:
srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
lhu $10, (movewordTable - ((G_MOVEWORD & 0xF) << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
// Shared store tail; expects the DMEM base in $10 and the offset in the low
// bits of cmd_w0. Bit 15 of cmd_w0 selects a halfword-only store.
do_moveword:
sll $11, cmd_w0, 16 // Sign bit = upper bit of offset
add $10, $10, cmd_w0 // Offset + base; only lower 12 bits matter
bltz $11, run_next_DL_command // If upper bit of offset is set, exit after halfword
sh cmd_w1_dram, ($10) // Store value from cmd into halfword (delay slot: runs on both paths)
j run_next_DL_command
sw cmd_w1_dram, ($10) // Store value from cmd into word (offset + moveword_table[index])
G_LOAD_UCODE_handler: // 4
j load_overlay_0_and_enter // Delay slot is harmless
G_MODIFYVTX_handler:
@@ -1284,6 +1237,27 @@ G_MODIFYVTX_handler:
j do_moveword // Moveword adds cmd_w0 to $10 for final addr
lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx, bit 15 clear
// G_MTX: handle a matrix command. If neither the G_MTX_VP_M nor the
// G_MTX_NOPUSH_PUSH bit is set in w0 (model matrix with push), control goes to
// ovl234_ltbasic_entrypoint. Otherwise the matrix is loaded through the shared
// movemem path below, with $1 selecting the post-DMA continuation.
// The instruction following each branch/jump is its delay slot and always runs.
G_MTX_handler: // 12
.if CFG_PROFILING_C
addi perfCounterC, perfCounterC, 1 // Increment matrix count
.endif
andi $11, cmd_w0, G_MTX_VP_M | G_MTX_NOPUSH_PUSH
beqz $11, ovl234_ltbasic_entrypoint // Model and push: go to overlay for push
sh $zero, mvpValid // Also zeroes dirLightsXfrmValid
load_mtx:
andi $1, cmd_w0, G_MTX_MUL_LOAD // Read the matrix load type into $1 (2 is multiply, 0 is load)
// G_MOVEMEM: DMA data from the (segmented) DRAM address in w1 into the DMEM
// destination selected through movememTable.
G_MOVEMEM_handler: // Otherwise $1 is 0
jal segmented_to_physical // convert the segmented address cmd_w1_dram to a physical one
do_movemem:
// 0: load M, 2: mul M -> load temp, 4: load VP, 6: mul VP -> load temp
andi $3, cmd_w0, 0x00FE // Movemem table index into $3 (bits 1-7 of the word 0)
lbu dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Second byte of word 0
lhu dmemAddr, (movememTable)($3) // $3 reused in G_MTX_multiply_end
srl $2, cmd_w0, 5 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0
add dmemAddr, dmemAddr, $2 // Table base + offset from the command
j dma_and_wait_goto_next_ra
lh nextRA, (afterMovememRaTable)($1) // $1 is 2 if mtx multiply, else 0
.if !ENABLE_PROFILING
G_LIGHTTORDP_handler: // 9
sw cmd_w1_dram, 0(rdpCmdBufPtr) // Store second word as first (cmd byte, prim level)
@@ -2200,11 +2174,11 @@ vtx_constants_for_clip:
// Sets up constants needed for vertex loop, including during clipping.
// Results fill vPerm1:4. Uses misc temps.
.if CFG_NO_OCCLUSION_PLANE
llv sFOG[0], (fogFactor - altBase)(altBaseReg) // Load fog multiplier 0 and offset 1
llv sFOG[0], (fogFactor)($zero) // Load fog multiplier 0 and offset 1
ldv sVPO[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7
veq $v29, $v31, $v31[3h] // VCC = 00010001
ldv sVPO[8], (viewport + 8)($zero)
llv sSTS[0], (textureSettings2 - altBase)(altBaseReg) // Texture ST scale in 0, 1
llv sSTS[0], (textureSettings2)($zero) // Texture ST scale in 0, 1
vmrg sFGM, vOne, $v31[2] // sFGM is 0,0,0,1,0,0,0,1
ldv sVPS[0], (viewport)($zero) // Load vscale duplicated in 0-3 and 4-7
vne $v29, $v31, $v31[3h] // VCC = 11101110
@@ -2222,7 +2196,7 @@ vtx_constants_for_clip:
.else
lb flagsV1, geometryModeLabel + 3 // G_ATTROFFSET_ST_ENABLE in sign bit
lw $11, (fogFactor)($zero) // Load fog multiplier MSBs and offset LSBs
llv sSTS[0], (textureSettings2 - altBase)(altBaseReg) // Texture ST scale in 0, 1
llv sSTS[0], (textureSettings2)($zero) // Texture ST scale in 0, 1
llv $v30[0], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 0, 1
llv $v30[8], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 4, 5
bltz flagsV1, @@keepoffset
@@ -2857,26 +2831,76 @@ ovl0_padded_end:
ovl1_start:
G_MTX_handler: // 12
.if CFG_PROFILING_C
addi perfCounterC, perfCounterC, 1 // Increment matrix count
// G_CULLDL: decide whether the rest of the current display list can be skipped.
// Walks the vertices from the start entry ($10) to the end entry ($3) and keeps
// in $1 the running AND of each vertex's clip flags, masked to
// CLIP_SCRN_NPXY | CLIP_CAMPLANE. If $1 ever becomes zero, no single flag in
// the mask is set for every vertex seen so far, so the display list continues
// normally. If the loop finishes with $1 still nonzero, execution falls
// through into G_ENDDL_handler to end the display list.
// The instruction following each branch/jump is its delay slot and always runs.
G_CULLDL_handler: // 15
lhu $10, (vertexTable)(cmd_w0) // Start vtx addr
lhu $3, (vertexTable)(cmd_w1_dram) // End vertex
/*
CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1
verts which are behind the occlusion plane, and 1 vert which is behind the camera
plane and therefore randomly erroneously also set as behind the occlusion plane.
However, the convex hull of all the verts goes through visible area. This will be
incorrectly culled here. We can't afford the extra few instructions to disable
the occlusion plane if the vert is behind the camera, because this only matters for
G_CULLDL and not for tris.
*/
li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) // Initial mask: flags that may cause culling
lhu $11, VTX_CLIP($10) // Clip flags of the first vertex
culldl_loop:
and $1, $1, $11 // Keep only flags set in every vertex so far
beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render
lhu $11, (vtxSize + VTX_CLIP)($10) // next vertex clip flags (delay slot; on the last pass the value is not used by the loop)
bne $10, $3, culldl_loop // loop until reaching the last vertex
addi $10, $10, vtxSize // advance to the next vertex
li cmd_w0, 0 // Clear count of DL cmds to skip loading
// G_ENDDL: return to the display list that called this one, using the entry at
// the top of displayListStack, or go to overlay 0 when the stack is empty.
G_ENDDL_handler:
lbu $1, displayListStackLength // Load the DL stack index; if end stack,
beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end
addi $1, $1, -4 // Decrement the DL stack index (delay slot: also runs when the branch is taken, leaving $1 = -4)
j call_ret_common // has a different version in ovl1
lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to
G_SETSCISSOR_handler: // 3; should be towards the start of ovl1
li $ra, scissorUpLeft - (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF))
G_RDPSETOTHERMODE_handler: // $ra = .
.if (. & 7) != 0
.error "G_RDPSETOTHERMODE_handler alignment broken"
.endif
andi $11, cmd_w0, G_MTX_VP_M | G_MTX_NOPUSH_PUSH
beqz $11, ovl234_ltbasic_entrypoint // Model and push: go to overlay for push
sh $zero, mvpValid // Also zeroes dirLightsXfrmValid
// Shared matrix-load / movemem path. Entered at load_mtx for matrix commands
// ($1 = load type taken from w0) or at G_MOVEMEM_handler for movemem commands.
// DMAs data from the (segmented) DRAM address in w1 into the DMEM destination
// selected through movememTable, then continues at the address read from
// afterMovememRaTable.
load_mtx:
andi $1, cmd_w0, G_MTX_MUL_LOAD // Read the matrix load type into $1 (2 is multiply, 0 is load)
G_MOVEMEM_handler: // Otherwise $1 is 0
jal segmented_to_physical // convert the segmented address cmd_w1_dram to a physical one
do_movemem:
// 0: load M, 2: mul M -> load temp, 4: load VP, 6: mul VP -> load temp
andi $3, cmd_w0, 0x00FE // Movemem table index into $3 (bits 1-7 of the word 0)
lbu dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Second byte of word 0
lhu dmemAddr, (movememTable)($3) // $3 reused in G_MTX_multiply_end
srl $2, cmd_w0, 5 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0
add dmemAddr, dmemAddr, $2 // Table base + offset from the command
j dma_and_wait_goto_next_ra
lh nextRA, (afterMovememRaTable)($1) // $1 is 2 if mtx multiply, else 0
j G_RDP_handler // Send the command to the RDP
spv $v4[0], (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF))($ra)
/* This is a crazy optimization, and it was completely accidental!
When G_RELSEGMENT was implemented, we did not notice the G_MOVEWORD behavior of
subtracting ((G_MOVEWORD & 0xF) << 8) from the movewordTable address in order to
remove the command byte. Since the command byte is G_RELSEGMENT, not G_MOVEWORD,
the final address is completely wrong. However, DMEM wraps at 4 KiB--only the
lowest 12 bits of any address are significant, so only the lowest 4 bits of the
command byte survive in the address. And, G_RELSEGMENT **happened** to end in
0xB, the same as G_MOVEWORD! So the wrong address aliases to the correct one!
I only noticed this when I tried to move G_RELSEGMENT to a different command
byte and got crashes. */
// Build-time guard for the aliasing described above.
.if (G_RELSEGMENT & 0xF) != (G_MOVEWORD & 0xF)
.error "Crazy relsegment optimization broken, don't change command byte assignments"
.endif
// G_RELSEGMENT: same as G_MOVEWORD, except the value word is first passed
// through segmented_to_physical. The srl below sits in the jal delay slot, so it
// runs before the call; execution then resumes at the lhu (standard MIPS jal
// return address).
G_RELSEGMENT_handler: // 9
jal segmented_to_physical // Resolve new segment address relative to existing segment
// G_MOVEWORD: look up a DMEM base address in movewordTable using the index in
// the command, add the offset from the command, and store the value word there.
G_MOVEWORD_handler:
srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
lhu $10, (movewordTable - ((G_MOVEWORD & 0xF) << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
// Shared store tail; expects the DMEM base in $10 and the offset in the low
// bits of cmd_w0. Bit 15 of cmd_w0 selects a halfword-only store.
do_moveword:
sll $11, cmd_w0, 16 // Sign bit = upper bit of offset
add $10, $10, cmd_w0 // Offset + base; only lower 12 bits matter
bltz $11, run_next_DL_command // If upper bit of offset is set, exit after halfword
sh cmd_w1_dram, ($10) // Store value from cmd into halfword (delay slot: runs on both paths)
j run_next_DL_command
sw cmd_w1_dram, ($10) // Store value from cmd into word (offset + moveword_table[index])
// G_TEXRECT and G_TEXTURE share a single store instruction. The spv below
// writes to $ra + (textureSettings1 - (G_TEXTURE_handler & 0xFFF)):
//  - Entered as G_TEXTURE, $ra holds this handler's own address (see "$ra = ."),
//    so the store lands on textureSettings1 (modulo the 4 KiB DMEM wrap).
//  - Entered as G_TEXRECT, $ra is first set so that the same expression
//    evaluates to texrectState instead.
// NOTE(review): $v4 presumably holds the current command's data - confirm
// against the command-dispatch code.
G_TEXRECT_handler: // 3; should be towards the start of ovl1
li $ra, texrectState - (textureSettings1 - (G_TEXTURE_handler & 0xFFF))
G_TEXTURE_handler: // $ra = .
// Build-time check that this label is 8-byte aligned; presumably required so
// that the spv offset below is a multiple of 8 - TODO confirm.
.if (. & 7) != 0
.error "G_TEXTURE_handler alignment broken"
.endif
j run_next_DL_command
spv $v4[0], (textureSettings1 - (G_TEXTURE_handler & 0xFFF))($ra) // Delay slot: the actual state store
G_FLUSH_handler: // 32
jal flush_rdp_buffer // Flush once to push partial DMEM buf to FIFO
@@ -2957,20 +2981,7 @@ G_RDPHALF_1_handler: // $ra = ., 0x10 ahead of geometry mode
j run_next_DL_command
sw cmd_w1_dram, (geometryModeLabel - G_GEOMETRYMODE_handler)($ra)
.if !CFG_PROFILING_C
nop // TODO
.endif
// G_TEXRECT and G_TEXTURE share a single store instruction. The spv below
// writes to $ra + (textureSettings1 - (G_TEXTURE_handler & 0xFFF)):
//  - Entered as G_TEXTURE, $ra holds this handler's own address (see "$ra = ."),
//    so the store lands on textureSettings1 (modulo the 4 KiB DMEM wrap).
//  - Entered as G_TEXRECT, $ra is first set so that the same expression
//    evaluates to texrectState instead.
// NOTE(review): $v4 presumably holds the current command's data - confirm
// against the command-dispatch code.
G_TEXRECT_handler:
li $ra, texrectState - (textureSettings1 - (G_TEXTURE_handler & 0xFFF))
G_TEXTURE_handler: // $ra = .
// Build-time check that this label is 8-byte aligned; presumably required so
// that the spv offset below is a multiple of 8 - TODO confirm.
.if (. & 7) != 0
.error "G_TEXTURE_handler alignment broken"
.endif
j run_next_DL_command
spv $v4[0], (textureSettings1 - (G_TEXTURE_handler & 0xFFF))($ra) // Delay slot: the actual state store
G_RDPHALF_2_handler: // 8
G_RDPHALF_2_handler: // 8; should be after the handlers with alignment needs
li $11, texrectState
ldv $v29[0], (0)($11)
sb $zero, materialCullMode // This covers tex and fill rects
@@ -2978,40 +2989,10 @@ G_RDPHALF_2_handler: // 8
addi rdpCmdBufPtr, rdpCmdBufPtr, 8
.if !ENABLE_PROFILING
addi perfCounterB, perfCounterB, 1 // Increment number of tex/fill rects
.else
vnop // For G_RDPSETOTHERMODE_handler alignment below, without taking a cycle
.endif
j send_w0_w1_to_rdp // w1 is from the current command
sdv $v29[0], -8(rdpCmdBufPtr)
nop // TODO
// G_SETSCISSOR and G_RDPSETOTHERMODE share a single store instruction. The spv
// below writes to $ra + (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF)):
//  - Entered as G_RDPSETOTHERMODE, $ra holds this handler's own address (see
//    "$ra = ."), so the store lands on otherMode0 (modulo the 4 KiB DMEM wrap).
//  - Entered as G_SETSCISSOR, $ra is first set so that the same expression
//    evaluates to scissorUpLeft instead.
// Either way the command is then forwarded to the RDP via G_RDP_handler.
// NOTE(review): $v4 presumably holds the current command's data - confirm
// against the command-dispatch code.
G_SETSCISSOR_handler:
li $ra, scissorUpLeft - (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF))
G_RDPSETOTHERMODE_handler: // $ra = .
// Build-time check that this label is 8-byte aligned; presumably required so
// that the spv offset below is a multiple of 8 - TODO confirm.
.if (. & 7) != 0
.error "G_RDPSETOTHERMODE_handler alignment broken"
.endif
j G_RDP_handler // Send the command to the RDP
spv $v4[0], (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF))($ra) // Delay slot: the actual state store
// G_SETxIMG: convert the image address in the command to a physical address,
// update the material-tracking state, then write both command words to the
// RDP command buffer.
// Material tracking only happens when materialCullMode is 0: the address of the
// current command (taskDataPtr + inputBufferPos) is compared with
// lastMatDLPhyAddr to detect the same material being executed again.
// The instruction following each branch/jump is its delay slot and always runs.
G_SETxIMG_handler: // 12
lb $3, materialCullMode // Get current mode
jal segmented_to_physical // Convert image to physical address
lw $2, lastMatDLPhyAddr // Get last material physical addr
bnez $3, send_w0_w1_to_rdp // If not in normal mode (0), exit
add $10, taskDataPtr, inputBufferPos // Current material physical addr
beq $10, $2, @@skip // Branch if we are executing the same mat again
sw $10, lastMatDLPhyAddr // Store material physical addr (delay slot: stored on both paths)
li $7, 1 // > 0: in material first time
@@skip: // Otherwise $7 was < 0: cull mode (in mat second time)
sb $7, materialCullMode // Record the new mode (1, or the low byte of the incoming $7)
// Shared tail: store w0 and w1 at the current RDP buffer position and commit.
send_w0_w1_to_rdp:
sw cmd_w0, 0(rdpCmdBufPtr)
// Entry point for callers that have already stored the first word themselves.
send_w1_to_rdp:
j commit_small_rdp_command
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Delay slot: second word of the RDP command
ovl1_end:
align_with_warning 8, "One instruction of padding at end of ovl1"
ovl1_padded_end: