Removed overlay 4 for LVP config

2026-01-21 10:37:45 -08:00 · 2024-04-13 18:06:25 -07:00
parent cc3822649a
commit 2cc4823780
2 changed files with 131 additions and 68 deletions
--- a/README.md
+++ b/README.md
@@ -254,9 +254,9 @@ Some ways to use this for debugging are:

 ## Performance Results

-Vertex pipeline cycles per **vertex pair** in steady state. Hand-counted timings
-taking into account all pipeline stalls and all dual-issue conditions except for
-instruction alignment.
+Vertex pipeline cycles per **vertex pair** in steady state (lower is better).
+Hand-counted timings taking into account all pipeline stalls and all dual-issue
+conditions except for instruction alignment.

 | Microcode      | No Lighting | First Dir Lt | Total for 1 Dir Lt | Extra Dir Lts |
 |----------------|-------------|--------------|--------------------|---------------|
@@ -510,6 +510,11 @@ See the Microcode Configuration and Performance Results sections above.

 ### Overlay 4

+(Note that in the LVP configuration, Overlay 4 is absent: the M inverse
+transpose matrix discussed below does not exist, and the commands listed below
+are handled directly in the main microcode rather than in an overlay, because
+there is enough IMEM space.)
+
 F3DEX2 contains Overlay 2, which does lighting, and Overlay 3, which does
 clipping (run on any large triangle which extends a large distance offscreen).
 These overlays are more RSP assembly code which are loaded into the same space
@@ -535,7 +540,9 @@ F3DEX3 introduces Overlay 4, which can occupy the same IMEM as Overlay 2 and 3.
 This overlay contains handlers for:
 - Computing the inverse transpose of the model matrix M (abbreviated as mIT),
  discussed below
- The codepath for `SPMatrix` with `G_MTX_MUL` set
+- The codepath for `SPMatrix` with `G_MTX_MUL` set (base version only; this is
+  moved out of the overlay to normal microcode in the NOC configuration due to
+  having extra IMEM space available)
 - `SPBranchLessZ*`
 - `SPDma_io`

@@ -577,10 +584,6 @@ It is recommended to use `G_NORMALS_MODE_FAST` (the default) for most things,
 and use `G_NORMALS_MODE_AUTO` only for objects while they currently have a
 nonuniform scale (e.g. Mario only while he is squashed).

-Note that in the LVP configuration, lighting is computed in model space by
-transforming light directions into model space with M transpose, like in F3DEX2.
-Thus there is no mIT matrix and the SPNormalsMode setting is ignored.
-
 ### Optimizing for RSP code size

 A number of optimizations in F3DEX2 which saved a few cycles but took several
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -486,9 +486,9 @@ movewordTable:

 // G_POPMTX, G_MTX, G_MOVEMEM Command Jump Table
 movememHandlerTable:
-jumpTableEntry G_POPMTX_end            // G_POPMTX
-jumpTableEntry ovl234_ovl4_entrypoint  // G_MTX (multiply)
-jumpTableEntry G_MOVEMEM_end           // G_MOVEMEM, G_MTX (load)
+jumpTableEntry G_POPMTX_end   // G_POPMTX
+jumpTableEntry G_MTX_end      // G_MTX (multiply); G_MTX_end is in the main microcode (LVP / NOC configs) or is the ovl4 entrypoint (otherwise)
+jumpTableEntry G_MOVEMEM_end  // G_MOVEMEM, G_MTX (load)

 .macro miniTableEntry, addr
    .if addr < 0x1000 || addr >= 0x1400
@@ -499,7 +499,7 @@ jumpTableEntry G_MOVEMEM_end           // G_MOVEMEM, G_MTX (load)

 // RDP/Immediate Command Mini Table
 // 1 byte per entry, after << 2 points to an addr in first 1/4 of IMEM
-miniTableEntry ovl4_cmd_handler // G_DMA_IO
+miniTableEntry G_DMA_IO_handler
 miniTableEntry G_TEXTURE_handler
 miniTableEntry G_POPMTX_handler
 miniTableEntry G_GEOMETRYMODE_handler
@@ -546,7 +546,7 @@ miniTableEntry G_SYNC_handler // G_NOOP
 miniTableEntry G_VTX_handler
 miniTableEntry G_MODIFYVTX_handler
 miniTableEntry G_CULLDL_handler
-miniTableEntry ovl4_cmd_handler // G_BRANCH_WZ
+miniTableEntry G_BRANCH_WZ_handler
 miniTableEntry G_TRI1_handler
 miniTableEntry G_TRI2_handler
 miniTableEntry G_QUAD_handler
@@ -987,8 +987,38 @@ G_SETxIMG_handler:
    j       G_RDP_handler
     sb     $7, materialCullMode

-ovl4_cmd_handler:
+.if CFG_LEGACY_VTX_PIPE
+
+G_DMA_IO_handler:
    jal     segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one
     lh     dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
    andi    dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
    // At this point, dmemAddr's highest bit is the flag, its next 13 bits are the DMEM address, and then its last two bits are the upper 2 of size
    // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit
    sra     dmemAddr, dmemAddr, 2
    j       dma_read_write  // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
     li     $ra, wait_for_dma_and_run_next_command  // Setup the return address for running the next DL command
+
+G_BRANCH_WZ_handler:
    j       vtx_addrs_from_cmd          // byte 3 = vtx being tested; addr -> $10
     li     $11, branchwz_return_from_addrs // presumably the address vtx_addrs_from_cmd jumps back to - confirm against that routine
+branchwz_return_from_addrs:
+.if CFG_G_BRANCH_W                      // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2
    lh      $10, VTX_W_INT($10)         // read the w coordinate of the vertex (f3dzex)
+.else
    lw      $10, VTX_SCR_Z($10)         // read the screen z coordinate (int and frac) of the vertex (f3dex2)
+.endif
    sub     $2, $10, cmd_w1_dram        // subtract the w/z value being tested
    bgez    $2, run_next_DL_command     // if vtx.w/z >= cmd w/z, continue running this DL
     lw     cmd_w1_dram, rdpHalf1Val    // delay slot (runs on both paths): load the RDPHALF1 value as the location to branch to
    j       branch_dl                   // need $2 < 0 for nopush and cmd_w1_dram
     li     cmd_w0, 0                   // No count of DL cmds to skip
+
+.else
+G_DMA_IO_handler:
+G_BRANCH_WZ_handler:
    j       ovl234_ovl4_entrypoint          // Both commands go through the overlay 4 entrypoint. Delay slot (the lb at load_cmds_handler just below) is harmless
+.endif
 load_cmds_handler:
     lb     $3, materialCullMode
    bltz    $3, run_next_DL_command  // If cull mode is < 0, in mat second time, skip the load
@@ -1256,40 +1286,16 @@ vtx_setup_constants:
    bnez    $7, skip_vtx_mvp
     li     $2, vpMatrix
    li      $3, mMatrix
-    addi    $10, $3, 0x0018
-@@loop:
-    vmadn   $v7, $v31, $v31[2]  // 0
-    addi    $11, $3, 0x0008
-    vmadh   $v6, $v31, $v31[2]  // 0
-    addi    $2, $2, -0x0020
-    vmudh   $v29, $v31, $v31[2] // 0
-@@innerloop:
-    ldv     $v3[0], 0x0040($2)
-    ldv     $v3[8], 0x0040($2)
-    lqv     $v1[0], 0x0020($3) // Input 1
-    ldv     $v2[0], 0x0020($2)
-    ldv     $v2[8], 0x0020($2)
-    lqv     $v0[0], 0x0000($3) // Input 1
-    vmadl   $v29, $v3, $v1[0h]
-    addi    $3, $3, 0x0002
-    vmadm   $v29, $v2, $v1[0h]
-    addi    $2, $2, 0x0008 // Increment input 0 pointer
-    vmadn   $v5, $v3, $v0[0h]
-    bne     $3, $11, @@innerloop
-     vmadh  $v4, $v2, $v0[0h]
-    bne     $3, $10, @@loop
-     addi   $3, $3, 0x0008
-    sqv     $v7[0], (mITMatrix + 0x0020)($zero)
-    sqv     $v6[0], (mITMatrix + 0x0000)($zero)
+    j       mtx_multiply
+     li     $6, mITMatrix
+vtx_after_mtx_multiply:
    sqv     $v5[0], (fourthQWMVP +    0)($zero)
-    sqv     $v4[0], (mITMatrix + 0x0010)($zero)
-    sb      $10, mITValid  // $10 is nonzero, in fact 0x18
+    sb      $10, mITValid  // $10 is nonzero from mtx_multiply, in fact 0x18
 skip_vtx_mvp:
    andi    $11, $5, G_LIGHTING >> 8
    bnez    $11, ovl234_lighting_entrypoint     // Lighting setup, incl. transform
     move   inputVtxPos, dmemAddr               // Must be before overlay load
 vtx_after_lt_setup:
-vtx_after_calc_mit: // Not actually used on this codepath
    lqv     vM0I,     (mITMatrix + 0x00)($zero)  // Load MVP matrix
    lqv     vM2I,     (mITMatrix + 0x10)($zero)
    lqv     vM0F,     (mITMatrix + 0x20)($zero)
@@ -1714,6 +1720,45 @@ skip_return_to_lt_or_loop:

 .endif // CFG_NO_OCCLUSION_PLANE

+.if CFG_LEGACY_VTX_PIPE || CFG_NO_OCCLUSION_PLANE
+G_MTX_end: // Multiplies the temp loaded matrix into the M or VP matrix
+    lhu     $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP.
+    li      $3, tempMemRounded // Input 1 = temp mem (loaded mtx)
+    jal     while_wait_dma_busy
+     move   $2, $6 // Input 0 = output
+mtx_multiply: // $3, $2 are input matrices; $6 is output matrix; $7 is 0 for return to vtx
+    addi    $10, $3, 0x0018 // $10 = value of $3 that ends the outer loop (initial input 1 pointer + 0x18)
+@@loop:
+    vmadn   $v7, $v31, $v31[2]  // 0
+    addi    $11, $3, 0x0008 // $11 = value of $3 that ends the inner loop (current input 1 pointer + 8, i.e. 4 passes of +2)
+    vmadh   $v6, $v31, $v31[2]  // 0
+    addi    $2, $2, -0x0020 // Input 0 pointer -= 0x20: offsets the 0x20/0x40 load displacements below, and undoes the inner loop's 4 x 8 advance on the second outer pass
+    vmudh   $v29, $v31, $v31[2] // 0
+@@innerloop:
+    ldv     $v3[0], 0x0040($2)
+    ldv     $v3[8], 0x0040($2)
+    lqv     $v1[0], 0x0020($3) // Input 1
+    ldv     $v2[0], 0x0020($2)
+    ldv     $v2[8], 0x0020($2)
+    lqv     $v0[0], 0x0000($3) // Input 1
+    vmadl   $v29, $v3, $v1[0h]
+    addi    $3, $3, 0x0002 // Advance input 1 pointer by 2 bytes
+    vmadm   $v29, $v2, $v1[0h]
+    addi    $2, $2, 0x0008 // Increment input 0 pointer
+    vmadn   $v5, $v3, $v0[0h]
+    bne     $3, $11, @@innerloop
+     vmadh  $v4, $v2, $v0[0h]
+    bne     $3, $10, @@loop
+     addi   $3, $3, 0x0008 // Delay slot (runs on both paths): move input 1 pointer ahead another 8 bytes
+    sqv     $v7[0], (0x0020)($6)
+    sqv     $v6[0], (0x0000)($6)
+.if CFG_LEGACY_VTX_PIPE
+    beqz    $7, vtx_after_mtx_multiply // $7 == 0: return to vertex setup instead of running the next DL command
+.endif
+     sqv    $v4[0], (0x0010)($6) // In LVP this is the delay slot of the beqz above, so it executes on both paths
+    j       run_next_DL_command
+     sqv    $v5[0], (0x0030)($6) // Delay slot; skipped on the return-to-vtx path, where the code at vtx_after_mtx_multiply stores $v5 itself
+.endif
    
 .if (. & 4)
    .warning "One instruction of padding before ovl234"
@@ -1734,6 +1779,7 @@ ovl234_lighting_entrypoint_ovl3ver:        // same IMEM address as ovl234_lighti
    jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
     li     cmd_w1_dram, orga(ovl2_start)  // set up a load for overlay 2

+.if !CFG_LEGACY_VTX_PIPE
 // Jump here for all overlay 4 features. If overlay 3 is loaded (this code), loads
 // overlay 4 and jumps to right here, which is now in the new code.
 ovl234_ovl4_entrypoint_ovl3ver:            // same IMEM address as ovl234_ovl4_entrypoint
@@ -1742,6 +1788,7 @@ ovl234_ovl4_entrypoint_ovl3ver:            // same IMEM address as ovl234_ovl4_e
 .endif
    jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
     li     cmd_w1_dram, orga(ovl4_start)  // set up a load for overlay 4
+.endif //!CFG_LEGACY_VTX_PIPE

 // Jump here to do clipping. If overlay 3 is loaded (this code), directly starts
 // the clipping code.
@@ -2762,6 +2809,7 @@ lt_vtx_pair:
     andi   $11, $5, G_PACKED_NORMALS >> 8
 .endif

+.if !CFG_LEGACY_VTX_PIPE
 // Jump here for all overlay 4 features. If overlay 2 is loaded (this code), loads
 // overlay 4 and jumps to right here, which is now in the new code.
 ovl234_ovl4_entrypoint_ovl2ver:            // same IMEM address as ovl234_ovl4_entrypoint
@@ -2770,6 +2818,7 @@ ovl234_ovl4_entrypoint_ovl2ver:            // same IMEM address as ovl234_ovl4_e
 .endif
    jal     load_overlays_2_3_4            // Not a call; returns to $ra-8 = here
     li     cmd_w1_dram, orga(ovl4_start)  // set up a load for overlay 4
+.endif //!CFG_LEGACY_VTX_PIPE

 // Jump here to do clipping. If overlay 2 is loaded (this code), loads overlay 3
 // and jumps to right here, which is now in the new code.
@@ -3247,6 +3296,9 @@ ovl2_padded_end:
 .headersize ovl234_start - orga()

 ovl4_start:
+
+.if !CFG_LEGACY_VTX_PIPE
+
 // Contains M inverse transpose (mIT) computation, and some rarely-used command handlers.

 // Jump here to do lighting. If overlay 4 is loaded (this code), loads overlay 2
@@ -3261,6 +3313,9 @@ ovl234_lighting_entrypoint_ovl4ver:        // same IMEM address as ovl234_lighti
 // Jump here for all overlay 4 features. If overlay 4 is loaded (this code), jumps
 // to the instruction selection below.
 ovl234_ovl4_entrypoint:
+.if !CFG_LEGACY_VTX_PIPE && !CFG_NO_OCCLUSION_PLANE
+G_MTX_end:
+.endif
 .if CFG_PROFILING_B
    nop                                    // Needs to take up the space for the other perf counter
 .endif
@@ -3279,47 +3334,50 @@ ovl234_clipping_entrypoint_ovl4ver:        // same IMEM address as ovl234_clippi
 ovl4_select_instr:
    beq     $2, $7, calc_mit // otherwise $7 = command byte
     li     $3, G_BRANCH_WZ
-    beq     $3, $7, G_BRANCH_WZ_handler
+    beq     $3, $7, g_branch_wz_real
     li     $2, (0xFF00 | G_DMA_IO)
-    beq     $2, $7, G_DMA_IO_handler
-     // Otherwise G_MTX_end, which starts with a harmless instruction
+    beq     $2, $7, g_dma_io_real
+     nop
+     // Otherwise fall through to g_mtx_end_real; if CFG_NO_OCCLUSION_PLANE is set that block is not assembled and G_MTX_end is in the main microcode instead

-G_MTX_end: // Multiplies the temp loaded matrix into the M or VP matrix
-    lhu     $5, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP.
-    move    $2, $5 // Input 0 = output
+.if !CFG_NO_OCCLUSION_PLANE
+g_mtx_end_real:
+// Multiplies the temp loaded matrix into the M or VP matrix
+    lhu     $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP.
+    li      $3, tempMemRounded // Input 1 = temp mem (loaded mtx)
    jal     while_wait_dma_busy // If ovl4 already in memory, was not done
-     li     $3, tempMemRounded // Input 1 = temp mem (loaded mtx)
+     move   $2, $6 // Input 0 = output
    addi    $10, $3, 0x0018 // $10 = value of $3 that ends the outer loop (initial input 1 pointer + 0x18)
@@loop:
-    vmadn   $v9, $v31, $v31[2]  // 0
+    vmadn   $v7, $v31, $v31[2]  // 0
    addi    $11, $3, 0x0008 // $11 = value of $3 that ends the inner loop (current input 1 pointer + 8, i.e. 4 passes of +2)
-    vmadh   $v8, $v31, $v31[2]  // 0
+    vmadh   $v6, $v31, $v31[2]  // 0
    addi    $2, $2, -0x0020 // Input 0 pointer -= 0x20: offsets the 0x20/0x40 load displacements below, and undoes the inner loop's 4 x 8 advance on the second outer pass
    vmudh   $v29, $v31, $v31[2] // 0
@@innerloop:
-    ldv     $v5[0], 0x0040($2)
-    ldv     $v5[8], 0x0040($2)
-    lqv     $v3[0], 0x0020($3) // Input 1
-    ldv     $v4[0], 0x0020($2)
-    ldv     $v4[8], 0x0020($2)
-    lqv     $v2[0], 0x0000($3) // Input 1
-    vmadl   $v29, $v5, $v3[0h]
+    ldv     $v3[0], 0x0040($2)
+    ldv     $v3[8], 0x0040($2)
+    lqv     $v1[0], 0x0020($3) // Input 1
+    ldv     $v2[0], 0x0020($2)
+    ldv     $v2[8], 0x0020($2)
+    lqv     $v0[0], 0x0000($3) // Input 1
+    vmadl   $v29, $v3, $v1[0h]
    addi    $3, $3, 0x0002 // Advance input 1 pointer by 2 bytes
-    vmadm   $v29, $v4, $v3[0h]
+    vmadm   $v29, $v2, $v1[0h]
    addi    $2, $2, 0x0008 // Increment input 0 pointer
-    vmadn   $v7, $v5, $v2[0h]
+    vmadn   $v5, $v3, $v0[0h]
    bne     $3, $11, @@innerloop
-     vmadh  $v6, $v4, $v2[0h]
+     vmadh  $v4, $v2, $v0[0h]
    bne     $3, $10, @@loop
     addi   $3, $3, 0x0008 // Delay slot (runs on both paths): move input 1 pointer ahead another 8 bytes
-    // Store the results in M or VP
-    sqv     $v9[0], 0x0020($5)
-    sqv     $v8[0], 0x0000($5)
-    sqv     $v7[0], 0x0030($5)
+    sqv     $v7[0], (0x0020)($6)
+    sqv     $v6[0], (0x0000)($6)
+    sqv     $v4[0], (0x0010)($6)
    j       run_next_DL_command
-     sqv    $v6[0], 0x0010($5)
+     sqv    $v5[0], (0x0030)($6)
+.endif

-G_DMA_IO_handler:
+g_dma_io_real:
    jal     segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one
     lh     dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
    andi    dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
@@ -3329,7 +3387,7 @@ G_DMA_IO_handler:
    j       dma_read_write  // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
     li     $ra, wait_for_dma_and_run_next_command  // Setup the return address for running the next DL command

-G_BRANCH_WZ_handler:
+g_branch_wz_real:
    j       vtx_addrs_from_cmd          // byte 3 = vtx being tested; addr -> $10
     li     $11, branchwz_return_from_addrs
 branchwz_return_from_addrs:
@@ -3482,6 +3540,8 @@ calc_mit:
    sdv     $v21[0], (mITMatrix + 0x18)($zero)
    j       vtx_after_calc_mit
     sdv    $v20[0], (mITMatrix + 0x00)($zero)
+     
+.endif // !CFG_LEGACY_VTX_PIPE

 ovl4_end:
 .align 8