Moved things around

2026-01-21 10:37:45 -08:00 · 2025-09-21 15:06:43 -07:00
parent cd8fe1bc9b
commit ed62620414
2 changed files with 138 additions and 153 deletions
--- a/docs/Documentation/Design
+++ b/docs/Documentation/Design
@@ -19,9 +19,9 @@ occlusion plane for all vertices.

 ## Functionality in Overlay 3

-The following commands are moved to Overlay 3 in F3DEX3 to save IMEM space. This
-means that code will have to be loaded from DRAM to run them if Overlays 2 or 4
-(for lighting) happen to be loaded already.
+The following commands are moved to Overlay 2 or 3 in F3DEX3 to save IMEM space.
+This means that code will have to be loaded from DRAM to run them if a different
+overlay happens to be loaded already.
 - Push and multiply codepaths for `SPMatrix`
 - `SPPopMatrix*`
 - `SPDma*`
@@ -32,7 +32,8 @@ However:
  or accuracy, and these are not used for most 3D objects in SM64 or OoT.
 - `SPDma*` is rarely used except at startup for HLE detection.
 - `SPMemset` is a new F3DEX3 command which can improve performance. Plus, it is
-  typically run shortly after render start, when Overlay 3 is already in IMEM.
+  typically run shortly after render start, when Overlay 3 (which contains it)
+  is already in IMEM.

 So there is not a significant practical performance impact from these changes.

@@ -117,10 +118,10 @@ segment 0 must always be 0x00000000 so that this address resolves to e.g.
 In F3DEX2, the RSP time for drawing non-textured tris was significantly lower
 than for textured tris, by skipping a chunk of computation for the texture
 coefficients if they were disabled. In F3DEX3, no computation is skipped when
-textures are disabled. However, almost all materials use textures, and F3DEX3 is
-a little faster at drawing textured tris than F3DEX2. Plus, F3DEX3 still does
-not send the texture cofficients if they are disabled, saving DRAM access time
-for RSP -> FIFO and FIFO -> RDP. RDP time savings from avoiding loading a
+textures are disabled. However, in practice almost all materials use textures,
+and F3DEX3 is faster at drawing textured tris than F3DEX2. Plus, F3DEX3 still
+does not send the texture coefficients if they are disabled, saving DRAM access
+time for RSP -> FIFO and FIFO -> RDP. RDP time savings from avoiding loading a
 texture are unaffected of course. 

 ## Obscure semantic differences from F3DEX2 that should never matter in practice
@@ -134,3 +135,6 @@ texture are unaffected of course.
  to hold state during some display list macros which are actually two 8-byte
  commands. This change is not noticeable when using standard GBI commands, only
  if something highly custom has been set up.
+- `SPTexture` and `SPFogFactor` state is corrupted when loading and returning
+  from another microcode (S2DEX). In F3DEX2, it would be reinitialized to
+  default values; in F3DEX3, it is left as garbage values.
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -254,12 +254,16 @@ otherMode0: // command byte included, same as above
 otherMode1:
    .dw 0x00000000

-unused4:
-    .fill 8
-
-unused3:
-    .fill 4
+// These two words are texrectState in S2DEX, so it can clobber them.
+textureSettings1:
+    .dw 0x00000000 // first word, has command byte, level, tile, and on
+textureSettings2:
+    .dw 0xFFFFFFFF // second word, has s and t scale
    
+// This word is rdpHalf1Val in S2DEX, so it can clobber it.
+fogFactor:
+    .dw 0x00000000
+
 activeClipPlanes:
    .dh CLIP_SCAL_NPXY | CLIP_CAMPLANE  // Normal tri write, set to zero when clipping
    
@@ -477,25 +481,6 @@ materialCullMode:
 geometryModeLabel:
    .dw 0x00000000

-.if (. & 7) != 0
-    .error "textureSettings align to 8 broken"
-.endif
-
-textureSettings1:
-    .dw 0x00000000 // first word, has command byte, level, tile, and on
-    
-textureSettings2:
-    .dw 0xFFFFFFFF // second word, has s and t scale
-    
-fogFactor:
-    .dw 0x00000000
-
-// First half of RDP value for split commands. Also used as temp storage for
-// tri vertices during tri commands.
-rdpHalf1Val:
-    .fill 4
-
-// moveword table
 movewordTable:
    .dh fxParams           // G_MW_FX
    .dh numLightsxSize - 3 // G_MW_NUMLIGHT; writes numLightsxSize and pointLightFlag, zeroes dirLightsXfrmValid
@@ -511,7 +496,11 @@ packedNormalsConstants:
    .dh fogFactor          // G_MW_FOG
    .dh lightBufferMain    // G_MW_LIGHTCOL

-// Movemem table
+// First half of RDP value for split commands. Also used as temp storage for
+// tri vertices during tri commands.
+rdpHalf1Val:
+    .fill 4
+
 movememTable:
    .dh mMatrix         // G_MV_MMTX
    .dh tempMatrix      // G_MV_TEMPMTX0 multiply temp matrix (model)
@@ -519,7 +508,7 @@ movememTable:
    .dh tempMatrix      // G_MV_TEMPMTX1 multiply temp matrix (view*projection)
    .dh viewport        // G_MV_VIEWPORT
    .dh cameraWorldPos  // G_MV_LIGHT
-
+    
 afterMovememRaTable:
    .dh run_next_DL_command
    .dh G_MTX_multiply_end
@@ -1124,34 +1113,6 @@ start_padded_end:
 .orga max(orga(), max(ovl0_padded_end - ovl0_start, ovl1_padded_end - ovl1_start) - 0x80)
 ovl01_end:

-G_CULLDL_handler: // 15
-    lhu     $10, (vertexTable)(cmd_w0)      // Start vtx addr
-    lhu     $3, (vertexTable)(cmd_w1_dram)  // End vertex
-    /*
-    CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1
-    verts which are behind the occlusion plane, and 1 vert which is behind the camera
-    plane and therefore randomly erroneously also set as behind the occlusion plane.
-    However, the convex hull of all the verts goes through visible area. This will be
-    incorrectly culled here. We can't afford the extra few instructions to disable
-    the occlusion plane if the vert is behind the camera, because this only matters for
-    G_CULLDL and not for tris.
-    */
-    li      $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE)
-    lhu     $11, VTX_CLIP($10)
-culldl_loop:
-    and     $1, $1, $11
-    beqz    $1, run_next_DL_command         // Some vertex is on the screen-side of all clipping planes; have to render
-     lhu    $11, (vtxSize + VTX_CLIP)($10)  // next vertex clip flags
-    bne     $10, $3, culldl_loop            // loop until reaching the last vertex
-     addi   $10, $10, vtxSize               // advance to the next vertex
-    li      cmd_w0, 0                       // Clear count of DL cmds to skip loading
-G_ENDDL_handler:
-    lbu     $1, displayListStackLength      // Load the DL stack index; if end stack,
-    beqz    $1, load_overlay_0_and_enter    // load overlay 0; $1 < 0 signals end
-     addi   $1, $1, -4                      // Decrement the DL stack index
-    j       call_ret_common                 // has a different version in ovl1
-     lw     taskDataPtr, (displayListStack)($1) // Load addr of DL to return to
-
 G_POPMTX_handler:
 G_DMA_IO_handler:
    j       ovl234_ltbasic_entrypoint   // Delay slot is harmless
@@ -1196,6 +1157,23 @@ dma_and_wait_goto_next_ra:
    j       dma_read_write
     li     $ra, wait_goto_next_ra

+G_SETxIMG_handler: // 12
+    lb      $3, materialCullMode            // Get current mode
+    jal     segmented_to_physical           // Convert image to physical address
+     lw     $2, lastMatDLPhyAddr            // Get last material physical addr
+    bnez    $3, send_w0_w1_to_rdp           // If not in normal mode (0), exit
+     add    $10, taskDataPtr, inputBufferPos // Current material physical addr
+    beq     $10, $2, @@skip                 // Branch if we are executing the same mat again
+     sw     $10, lastMatDLPhyAddr           // Store material physical addr
+    li      $7, 1                           // > 0: in material first time
+@@skip:                                     // Otherwise $7 was < 0: cull mode (in mat second time)
+    sb      $7, materialCullMode
+send_w0_w1_to_rdp:
+    sw      cmd_w0, 0(rdpCmdBufPtr)
+send_w1_to_rdp:
+    j       commit_small_rdp_command
+     sw     cmd_w1_dram, 4(rdpCmdBufPtr)
+
 G_MEMSET_handler:
    j       ovl234_clipmisc_entrypoint       // Delay slot is harmless
 load_cmds_handler:
@@ -1252,31 +1230,6 @@ run_next_DL_command:
    // $7 must retain the command byte for load_mtx and overlay 3 stuff
    // $ra must contain the handler called for several handlers

-/* This is a crazy optimization, and it was completely accidental!
-When G_RELSEGMENT was implemented, we did not notice the G_MOVEWORD behavior of
-subtracting (G_MOVEWORD << 8) from the movewordTable address in order to remove
-the command byte. Since the command byte is G_RELSEGMENT, not G_MOVEWORD, the
-final address is completely wrong. However, DMEM wraps at 4 KiB--only the lowest
-4 bits of any address are significant. And, G_RELSEGMENT **happened** to end in
-0xB, the same as G_MOVEWORD! So the wrong address aliases to the correct one!
-I only noticed this when I tried to move G_RELSEGMENT to a different command
-byte and got crashes. */
-.if (G_RELSEGMENT & 0xF) != (G_MOVEWORD & 0xF)
-.error "Crazy relsegment optimization broken, don't change command byte assignments"
-.endif
-G_RELSEGMENT_handler: // 9
-    jal     segmented_to_physical    // Resolve new segment address relative to existing segment
-G_MOVEWORD_handler:
-     srl    $2, cmd_w0, 16           // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
-    lhu     $10, (movewordTable - ((G_MOVEWORD & 0xF) << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
-do_moveword:
-    sll     $11, cmd_w0, 16          // Sign bit = upper bit of offset
-    add     $10, $10, cmd_w0         // Offset + base; only lower 12 bits matter
-    bltz    $11, run_next_DL_command // If upper bit of offset is set, exit after halfword
-     sh     cmd_w1_dram, ($10)       // Store value from cmd into halfword
-    j       run_next_DL_command
-     sw     cmd_w1_dram, ($10)       // Store value from cmd into word (offset + moveword_table[index])
-
 G_LOAD_UCODE_handler: // 4
    j       load_overlay_0_and_enter     // Delay slot is harmless
 G_MODIFYVTX_handler:
@@ -1284,6 +1237,27 @@ G_MODIFYVTX_handler:
    j       do_moveword  // Moveword adds cmd_w0 to $10 for final addr
     lbu    cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos)  // offset in vtx, bit 15 clear

+G_MTX_handler: // 12
+.if CFG_PROFILING_C
+    addi    perfCounterC, perfCounterC, 1  // Increment matrix count
+.endif
+    andi    $11, cmd_w0, G_MTX_VP_M | G_MTX_NOPUSH_PUSH
+    beqz    $11, ovl234_ltbasic_entrypoint   // Model and push: go to overlay for push
+     sh     $zero, mvpValid                  // Also zeroes dirLightsXfrmValid
+load_mtx:
+    andi    $1, cmd_w0, G_MTX_MUL_LOAD       // Read the matrix load type into $1 (2 is multiply, 0 is load)
+G_MOVEMEM_handler:  // Otherwise $1 is 0
+    jal     segmented_to_physical   // Convert the segmented address in cmd_w1_dram to a physical one
+do_movemem:
+     // 0: load M, 2: mul M -> load temp, 4: load VP, 6: mul VP -> load temp
+     andi   $3, cmd_w0, 0x00FE            // Movemem table index into $3 (bits 1-7 of the word 0)
+    lbu     dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Second byte of word 0
+    lhu     dmemAddr, (movememTable)($3)  // $3 reused in G_MTX_multiply_end
+    srl     $2, cmd_w0, 5                 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0
+    add     dmemAddr, dmemAddr, $2        // Table base address + offset computed above
+    j       dma_and_wait_goto_next_ra
+     lh     nextRA, (afterMovememRaTable)($1) // $1 is 2 if mtx multiply, else 0
+
 .if !ENABLE_PROFILING
 G_LIGHTTORDP_handler: // 9
    sw      cmd_w1_dram, 0(rdpCmdBufPtr) // Store second word as first (cmd byte, prim level)
@@ -2200,11 +2174,11 @@ vtx_constants_for_clip:
    // Sets up constants needed for vertex loop, including during clipping.
    // Results fill vPerm1:4. Uses misc temps.
 .if CFG_NO_OCCLUSION_PLANE
-    llv     sFOG[0], (fogFactor - altBase)(altBaseReg) // Load fog multiplier 0 and offset 1
+    llv     sFOG[0], (fogFactor)($zero)           // Load fog multiplier 0 and offset 1
    ldv     sVPO[0], (viewport + 8)($zero)        // Load vtrans duplicated in 0-3 and 4-7
    veq     $v29, $v31, $v31[3h]                  // VCC = 00010001
    ldv     sVPO[8], (viewport + 8)($zero)
-    llv     sSTS[0], (textureSettings2 - altBase)(altBaseReg) // Texture ST scale in 0, 1
+    llv     sSTS[0], (textureSettings2)($zero)    // Texture ST scale in 0, 1
    vmrg    sFGM, vOne, $v31[2]                   // sFGM is 0,0,0,1,0,0,0,1
    ldv     sVPS[0], (viewport)($zero)            // Load vscale duplicated in 0-3 and 4-7
    vne     $v29, $v31, $v31[3h]                  // VCC = 11101110
@@ -2222,7 +2196,7 @@ vtx_constants_for_clip:
 .else
    lb      flagsV1, geometryModeLabel + 3    // G_ATTROFFSET_ST_ENABLE in sign bit
    lw      $11, (fogFactor)($zero)           // Load fog multiplier MSBs and offset LSBs
-    llv     sSTS[0], (textureSettings2 - altBase)(altBaseReg) // Texture ST scale in 0, 1
+    llv     sSTS[0], (textureSettings2)($zero) // Texture ST scale in 0, 1
    llv     $v30[0], (attrOffsetST - altBase)(altBaseReg)  // Texture ST offset in 0, 1
    llv     $v30[8], (attrOffsetST - altBase)(altBaseReg)  // Texture ST offset in 4, 5
    bltz    flagsV1, @@keepoffset
@@ -2857,26 +2831,76 @@ ovl0_padded_end:

 ovl1_start:

-G_MTX_handler: // 12
-.if CFG_PROFILING_C
-    addi    perfCounterC, perfCounterC, 1  // Increment matrix count
+G_CULLDL_handler: // 15
+    lhu     $10, (vertexTable)(cmd_w0)      // Start vtx addr
+    lhu     $3, (vertexTable)(cmd_w1_dram)  // End vertex
+    /*
+    CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1
+    verts which are behind the occlusion plane, and 1 vert which is behind the camera
+    plane and therefore randomly erroneously also set as behind the occlusion plane.
+    However, the convex hull of all the verts goes through visible area. This will be
+    incorrectly culled here. We can't afford the extra few instructions to disable
+    the occlusion plane if the vert is behind the camera, because this only matters for
+    G_CULLDL and not for tris.
+    */
+    li      $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE)
+    lhu     $11, VTX_CLIP($10)
+culldl_loop:
+    and     $1, $1, $11
+    beqz    $1, run_next_DL_command         // Some vertex is on the screen-side of all clipping planes; have to render
+     lhu    $11, (vtxSize + VTX_CLIP)($10)  // next vertex clip flags
+    bne     $10, $3, culldl_loop            // loop until reaching the last vertex
+     addi   $10, $10, vtxSize               // advance to the next vertex
+    li      cmd_w0, 0                       // Clear count of DL cmds to skip loading
+G_ENDDL_handler:
+    lbu     $1, displayListStackLength      // Load the DL stack index; if end stack,
+    beqz    $1, load_overlay_0_and_enter    // load overlay 0; $1 < 0 signals end
+     addi   $1, $1, -4                      // Decrement the DL stack index
+    j       call_ret_common                 // has a different version in ovl1
+     lw     taskDataPtr, (displayListStack)($1) // Load addr of DL to return to
+
+G_SETSCISSOR_handler: // 3; should be towards the start of ovl1
+    li      $ra, scissorUpLeft - (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF))
+G_RDPSETOTHERMODE_handler: // $ra = .
+.if (. & 7) != 0
+    .error "G_RDPSETOTHERMODE_handler alignment broken"
 .endif
-    andi    $11, cmd_w0, G_MTX_VP_M | G_MTX_NOPUSH_PUSH
-    beqz    $11, ovl234_ltbasic_entrypoint   // Model and push: go to overlay for push
-     sh     $zero, mvpValid                  // Also zeroes dirLightsXfrmValid
-load_mtx:
-    andi    $1, cmd_w0, G_MTX_MUL_LOAD       // Read the matrix load type into $1 (2 is multiply, 0 is load)
-G_MOVEMEM_handler:  // Otherwise $1 is 0
-    jal     segmented_to_physical   // convert the memory address cmd_w1_dram to a virtual one
-do_movemem:
-     // 0: load M, 2: mul M -> load temp, 4: load VP, 6: mul VP -> load temp
-     andi   $3, cmd_w0, 0x00FE            // Movemem table index into $1 (bits 1-7 of the word 0)
-    lbu     dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Second byte of word 0
-    lhu     dmemAddr, (movememTable)($3)  // $3 reused in G_MTX_multiply_end
-    srl     $2, cmd_w0, 5                 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0
-    add     dmemAddr, dmemAddr, $2
-    j       dma_and_wait_goto_next_ra
-     lh     nextRA, (afterMovememRaTable)($1) // $1 is 2 if mtx multiply, else 0
+    j       G_RDP_handler  // Send the command to the RDP
+     spv    $v4[0], (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF))($ra)
+
+/* This is a crazy optimization, and it was completely accidental!
+When G_RELSEGMENT was implemented, we did not notice the G_MOVEWORD behavior of
+subtracting (G_MOVEWORD << 8) from the movewordTable address in order to remove
+the command byte. Since the command byte is G_RELSEGMENT, not G_MOVEWORD, the
+final address is completely wrong. However, DMEM wraps at 4 KiB--only the lowest
+12 bits of any address are significant. And, G_RELSEGMENT **happened** to end in
+0xB, the same as G_MOVEWORD! So the wrong address aliases to the correct one!
+I only noticed this when I tried to move G_RELSEGMENT to a different command
+byte and got crashes. */
+.if (G_RELSEGMENT & 0xF) != (G_MOVEWORD & 0xF)
+.error "Crazy relsegment optimization broken, don't change command byte assignments"
+.endif
+G_RELSEGMENT_handler: // 9
+    jal     segmented_to_physical    // Resolve new segment address relative to existing segment
+G_MOVEWORD_handler:
+     srl    $2, cmd_w0, 16           // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
+    lhu     $10, (movewordTable - ((G_MOVEWORD & 0xF) << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
+do_moveword:
+    sll     $11, cmd_w0, 16          // Sign bit = upper bit of offset
+    add     $10, $10, cmd_w0         // Offset + base; only lower 12 bits matter
+    bltz    $11, run_next_DL_command // If upper bit of offset is set, exit after halfword
+     sh     cmd_w1_dram, ($10)       // Store value from cmd into halfword
+    j       run_next_DL_command
+     sw     cmd_w1_dram, ($10)       // Store value from cmd into word (offset + moveword_table[index])
+
+G_TEXRECT_handler: // 3; should be towards the start of ovl1
+    li      $ra, texrectState - (textureSettings1 - (G_TEXTURE_handler & 0xFFF))
+G_TEXTURE_handler: // $ra = .
+.if (. & 7) != 0
+    .error "G_TEXTURE_handler alignment broken"
+.endif
+    j       run_next_DL_command
+     spv    $v4[0], (textureSettings1 - (G_TEXTURE_handler & 0xFFF))($ra)

 G_FLUSH_handler: // 32
    jal     flush_rdp_buffer        // Flush once to push partial DMEM buf to FIFO
@@ -2957,20 +2981,7 @@ G_RDPHALF_1_handler: // $ra = ., 0x10 ahead of geometry mode
    j       run_next_DL_command
     sw     cmd_w1_dram, (geometryModeLabel - G_GEOMETRYMODE_handler)($ra)

-.if !CFG_PROFILING_C
-    nop // TODO
-.endif
-
-G_TEXRECT_handler:
-    li      $ra, texrectState - (textureSettings1 - (G_TEXTURE_handler & 0xFFF))
-G_TEXTURE_handler: // $ra = .
-.if (. & 7) != 0
-    .error "G_TEXTURE_handler alignment broken"
-.endif
-    j       run_next_DL_command
-     spv    $v4[0], (textureSettings1 - (G_TEXTURE_handler & 0xFFF))($ra)
-
-G_RDPHALF_2_handler: // 8
+G_RDPHALF_2_handler: // 8; should be after the handlers with alignment needs
    li      $11, texrectState
    ldv     $v29[0], (0)($11)
    sb      $zero, materialCullMode         // This covers tex and fill rects
@@ -2978,40 +2989,10 @@ G_RDPHALF_2_handler: // 8
    addi    rdpCmdBufPtr, rdpCmdBufPtr, 8
 .if !ENABLE_PROFILING
    addi    perfCounterB, perfCounterB, 1   // Increment number of tex/fill rects
-.else
-    vnop // For G_RDPSETOTHERMODE_handler alignment below, without taking a cycle
 .endif
    j       send_w0_w1_to_rdp               // w1 is from the current command
     sdv    $v29[0], -8(rdpCmdBufPtr)

-    nop // TODO
-
-G_SETSCISSOR_handler:
-    li      $ra, scissorUpLeft - (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF))
-G_RDPSETOTHERMODE_handler: // $ra = .
-.if (. & 7) != 0
-    .error "G_RDPSETOTHERMODE_handler alignment broken"
-.endif
-    j       G_RDP_handler  // Send the command to the RDP
-     spv    $v4[0], (otherMode0 - (G_RDPSETOTHERMODE_handler & 0xFFF))($ra)
-
-G_SETxIMG_handler: // 12
-    lb      $3, materialCullMode            // Get current mode
-    jal     segmented_to_physical           // Convert image to physical address
-     lw     $2, lastMatDLPhyAddr            // Get last material physical addr
-    bnez    $3, send_w0_w1_to_rdp           // If not in normal mode (0), exit
-     add    $10, taskDataPtr, inputBufferPos // Current material physical addr
-    beq     $10, $2, @@skip                 // Branch if we are executing the same mat again
-     sw     $10, lastMatDLPhyAddr           // Store material physical addr
-    li      $7, 1                           // > 0: in material first time
-@@skip:                                     // Otherwise $7 was < 0: cull mode (in mat second time)
-    sb      $7, materialCullMode
-send_w0_w1_to_rdp:
-    sw      cmd_w0, 0(rdpCmdBufPtr)
-send_w1_to_rdp:
-    j       commit_small_rdp_command
-     sw     cmd_w1_dram, 4(rdpCmdBufPtr)
-
 ovl1_end:
 align_with_warning 8, "One instruction of padding at end of ovl1"
 ovl1_padded_end: