From 99b69e3f2bd36d232583eacb3c3c241df7e46dc8 Mon Sep 17 00:00:00 2001
From: Sauraen <sauraen@gmail.com>
Date: Fri, 17 Feb 2023 18:18:33 -0800
Subject: [PATCH] Document registers (#14)

* More clipping and tri docs

* Documented clipping flags

* Clipping docs almost done

* Clipping documented

* More vertex related docs added

* Fixed some vtrans vscale explanations

* More register docs

* Finished basic registers documentation

* Tweaks for Tharo

* Addressed Wiseguy comments, added some documentation of ovl0

* Fixed an XBUS bug

* Tweaks to comments and moving an instruction before point lighting for clarity
---
 f3dex2.s         | 715 ++++++++++++++++++++++++++++-------------------
 rsp/gbi.inc      |   9 -
 rsp/rsp_defs.inc |   3 +
 3 files changed, 426 insertions(+), 301 deletions(-)

diff --git a/f3dex2.s b/f3dex2.s
index fa3e345..d116704 100644
--- a/f3dex2.s
+++ b/f3dex2.s
@@ -125,17 +125,6 @@ pMatrix:
 mvpMatrix:
     .fill 64
     
-// Not global names, but used multiple times with the same meaning.
-// Matrix row X integer/fractional
-mxr0i equ $v8
-mxr1i equ $v9
-mxr2i equ $v10
-mxr3i equ $v11
-mxr0f equ $v12
-mxr1f equ $v13
-mxr2f equ $v14
-mxr3f equ $v15
-
 // 0x00C0-0x00C8: scissor (four 12-bit values)
 scissorUpLeft: // the command byte is included since the command word is copied verbatim
     .dw (G_SETSCISSOR << 24) | ((  0 * 4) << 12) | ((  0 * 4) << 0)
@@ -200,7 +189,6 @@ displayListStack:
 // fixed offsets to things in this region. Perhaps had to do with DMEM overlays
 // at some point in development.
 spFxBase:
-spFxBaseReg equ $13
 
 // 0x0180-0x1B0: clipping values
 clipRatio: // This is an array of 6 doublewords
@@ -219,35 +207,41 @@ clipRatio: // This is an array of 6 doublewords
 
 // 0x1B0: constants for register $v31
 .align 0x10 // loaded with lqv
+// VCC patterns used:
+// vlt xxx, $v31, $v31[3]  = 11101110 in load_spfx_global_values
+// vne xxx, $v31, $v31[3h] = 11101110 in lighting
+// veq xxx, $v31, $v31[3h] = 00010001 in lighting
 v31Value:
-    .dh 0xFFFF // -1
-    .dh 0x0004 // 4
-    .dh 0x0008 // 8
-    .dh 0x7F00 // 32512
-    .dh 0xFFFC // -4
-    .dh 0x4000 // 16384
-    .dh vertexBuffer // 0x420
-    .dh 0x7FFF // 32767
+    .dh -1     // used in init, clipping
+    .dh 4      // used in clipping, vtx write
+    .dh 8      // old ucode only: used in tri write
+    .dh 0x7F00 // used in vtx write and pre-jump instrs to there, also 4 put here during point lighting
+    .dh -4     // used in clipping, vtx write
+    .dh 0x4000 // used in tri write, texgen
+    .dh vertexBuffer // 0x420; used in tri write
+    .dh 0x7FFF // used in vtx write, tri write, lighting, point lighting
 
 // 0x1C0: constants for register $v30
 .align 0x10 // loaded with lqv
+// VCC patterns used:
+// vge xxx, $v30, $v30[7] = 11110001 in tri write
 v30Value:
-    .dh 0x7FFC
-    .dh vtxSize << 7 // 0x1400
+    .dh 0x7FFC // not used!
+    .dh vtxSize << 7 // 0x1400; used in tri write for vtx index to addr
 .if (UCODE_IS_206_OR_OLDER)
-    .dh 0x01CC
-    .dh 0x0200
-    .dh 0xFFF0
-    .dh 0x0010
-    .dh 0x0020
-    .dh 0x0100
+    .dh 0x01CC // used in tri write, vcr?
+    .dh 0x0200 // not used!
+    .dh -16    // used in tri write, some signed multiplier
+    .dh 0x0010 // used in tri write, some accumulator init value
+    .dh 0x0020 // used in tri write, both signed and unsigned multipliers
+    .dh 0x0100 // used in tri write, vertex color >>= 8; also in lighting
 .else
-    .dh 0x1000
-    .dh 0x0100
-    .dh 0xFFF0
-    .dh 0xFFF8
-    .dh 0x0010
-    .dh 0x0020
+    .dh 0x1000 // used in tri write, some multiplier
+    .dh 0x0100 // used in tri write, vertex color >>= 8 and vcr?; also in lighting and point lighting
+    .dh -16    // used in tri write, some signed multiplier
+    .dh 0xFFF8 // used in tri write, mask away lower ST bits?
+    .dh 0x0010 // used in tri write, some accumulator init value; value moved to elem 7 for point lighting
+    .dh 0x0020 // used in tri write, both signed and unsigned multipliers; value moved from elem 6 for point lighting
 .endif
 
 .align 0x10 // loaded with lqv
@@ -255,7 +249,7 @@ linearGenerateCoefficients:
     .dh 0xC000
     .dh 0x44D3
     .dh 0x6CB3
-    .dh 0x0002
+    .dh 2
 
 // 0x01D8
     .db 0x00 // Padding to allow mvpValid to be written to as a 32-bit word
@@ -322,9 +316,6 @@ lightBufferMain:
     .fill (8 * lightSize)
 // Code uses pointers relative to spFxBase, with immediate offsets, so that
 // another register isn't needed to store the start or end address of the array.
-curLight    equ $9 // With ltBufOfs immediate added, points to current light
-                   // (current max in list, counting down).
-tmpCurLight equ $6 // Same meaning, another register.
 // Pointers are kept relative to spFxBase; this offset gets them to point to
 // lightBufferMain instead.
 ltBufOfs equ (lightBufferMain - spFxBase)
@@ -495,6 +486,13 @@ overlayInfo3:
 vertexBuffer:
     .skip (vtxSize * 32) // 32 vertices
 
+.if . > OS_YIELD_DATA_SIZE - 8
+    // OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved; the last two words are
+    // the ucode and the DL pointer. Make sure anything past there is temporary.
+    // (Input buffer will be reloaded from next instruction in the source DL.)
+    .error "Important things in DMEM will not be saved at yield!"
+.endif
+
 // 0x0920-0x09C8: Input buffer
 inputBuffer:
 inputBufferLength equ 0xA8
@@ -547,27 +545,178 @@ OSTask:
 // RSP IMEM
 .create CODE_FILE, 0x00001080
 
-// Global registers
-secondVtxPos equ $8
-outputVtxPos equ $15
-clipFlags equ $16
-clipPolyRead equ $17
-clipPolySelect equ $18
-rdpCmdBufEnd equ $22
-rdpCmdBufPtr equ $23
-cmd_w1 equ $24
-cmd_w0 equ $25
-taskDataPtr equ $26
-inputBufferPos equ $27
-savedActiveClipPlanes equ $29
-savedRA equ $30
+////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////// Register Use Map ///////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
+
+// Registers marked as "global" are only used for one purpose in the vanilla
+// microcode. However, this does not necessarily mean they can't be used for
+// other things in mods--this depends on which group they're listed in below.
+
+// Note that these lists do not cover registers which are just used locally in
+// a particular region of code--you're still responsible for not breaking the
+// code you modify. This is designed to help you avoid breaking one part of the
+// code by modifying a different part.
+
+// Local register definitions are included with their code, not here.
+
+// These registers are used globally, and their values can't be rebuilt, so
+// they should never be used for anything besides their original purpose.
+//                 $zero // global
+rdpCmdBufEnd   equ $22   // global
+rdpCmdBufPtr   equ $23   // global
+taskDataPtr    equ $26   // global
+inputBufferPos equ $27   // global
+//                 $ra   // global
+
+// These registers are used throughout the codebase and expected to have
+// certain values, but you're free to overwrite them as long as you
+// reconstruct the normal values after you're done (in fact point lighting does
+// this for $v30 and $v31).
+vZero equ $v0  // global
+vOne  equ $v1  // global
+//        $v30 // global
+//        $v31 // global
+
+// Must keep values during the full clipping process: clipping overlay, vertex
+// write, tri drawing.
+clipPolySelect        equ $18 // global
+clipPolyWrite         equ $21 // also input_mtx_0
+savedActiveClipPlanes equ $29 // global
+savedRA               equ $30 // global
+
+// Must keep values during the first part of the clipping process only: polygon
+// subdivision and vertex write.
+// $2: vertex at end of edge
+clipMaskIdx  equ $5
+secondVtxPos equ $8  // global
+outputVtxPos equ $15 // global
+clipFlags    equ $16 // global
+clipPolyRead equ $17 // global
+
+// Must keep values during tri drawing.
+// They are also used throughout the codebase, but can be overwritten once their
+// use has been fulfilled for the specific command.
+cmd_w1_dram equ $24 // Command word 1, which is also DMA DRAM addr; almost global, occasionally used locally
+cmd_w0      equ $25 // Command word 0; almost global, occasionally used locally
+
+// Must keep values during the full vertex process: load, lighting, and vertex write
+// $1: count of remaining vertices
+topLightPtr  equ $6   // Used locally elsewhere
+curLight     equ $9   // Used locally elsewhere
+inputVtxPos  equ $14  // global
+mxr0i        equ $v8  // "matrix row 0 int part"
+mxr1i        equ $v9  // All of these used locally elsewhere
+mxr2i        equ $v10
+mxr3i        equ $v11
+mxr0f        equ $v12
+mxr1f        equ $v13
+mxr2f        equ $v14
+mxr3f        equ $v15
+vPairST      equ $v22 // global
+vPairMVPPosF equ $v23 // global
+vPairMVPPosI equ $v24 // global
+// v25: prev vertex screen pos
+// v26: prev vertex screen Z
+// For point lighting
+mvTc0f equ $v3
+mvTc0i equ $v4
+mvTc1i equ $v21
+mvTc1f equ $v28 // same as vPairAlpha37
+mvTc2i equ $v30
+mvTc2f equ $v31
+
+// Values set up by load_spfx_global_values, which must be kept during the full
+// vertex process, and which are reloaded for each vert during clipping. See
+// that routine for the detailed contents of each of these registers.
+// secondVtxPos
+spFxBaseReg equ $13  // global
+vVpFgScale  equ $v16 // All of these used locally elsewhere
+vVpFgOffset equ $v17
+vVpMisc     equ $v18
+vFogMask    equ $v19
+vVpNegScale equ $v21
+
+// Arguments to mtx_multiply
+output_mtx  equ $19 // also dmaLen, also used by itself
+input_mtx_1 equ $20 // also dmemAddr and xfrmLtPtr
+input_mtx_0 equ $21 // also clipPolyWrite
+
+// Arguments to dma_read_write
+dmaLen   equ $19 // also output_mtx, also used by itself
+dmemAddr equ $20 // also input_mtx_1 and xfrmLtPtr
+// cmd_w1_dram   // used for all dma_read_write DRAM addresses, not just second word of command
+
+// Arguments to load_overlay_and_enter
+ovlTableEntry equ $11 // Commonly used locally
+postOvlRA     equ $12 // Commonly used locally
+
+// ==== Summary of uses of all registers
+// $zero: Hardwired zero scalar register
+// $1: vertex 1 addr, count of remaining vertices, pointer to store texture coefficients, local
+// $2: vertex 2 addr, vertex at end of edge in clipping, pointer to store shade coefficients, local
+// $3: vertex 3 addr, vertex at start of edge in clipping, local
+// $4: pre-shuffle vertex 1 addr for flat shading, local
+// $5: clipMaskIdx, geometry mode high short during vertex load / lighting, local
+// $6: topLightPtr, geometry mode low byte during tri write, local
+// $7: fog flag in vtx write, local
+// $8: secondVtxPos
+// $9: curLight, local
+// $10: briefly used local in vtx write
+// $11: ovlTableEntry, very common local
+// $12: postOvlRA, curMatrix, local
+// $13: spFxBaseReg
+// $14: inputVtxPos
+// $15: outputVtxPos
+// $16: clipFlags
+// $17: clipPolyRead
+// $18: clipPolySelect
+// $19: dmaLen, output_mtx, briefly used local
+// $20: dmemAddr, input_mtx_1, xfrmLtPtr
+// $21: clipPolyWrite, input_mtx_0
+// $22: rdpCmdBufEnd
+// $23: rdpCmdBufPtr
+// $24: cmd_w1_dram, local
+// $25: cmd_w0
+// $26: taskDataPtr
+// $27: inputBufferPos
+// $28: not used!
+// $29: savedActiveClipPlanes
+// $30: savedRA
+// $ra: Return address for jal, b*al
+// $v0: vZero (every element 0)
+// $v1: vOne (every element 1)
+// $v2: very common local
+// $v3: mvTc0f, local
+// $v4: mvTc0i, local
+// $v5: vPairNZ, local
+// $v6: vPairNY, local
+// $v7: vPairNX, vPairRGBATemp, local
+// $v8: mxr0i, local
+// $v9: mxr1i, local
+// $v10: mxr2i, local
+// $v11: mxr3i, local
+// $v12: mxr0f, local
+// $v13: mxr1f, local
+// $v14: mxr2f, local
+// $v15: mxr3f, local
+// $v16: vVpFgScale, local
+// $v17: vVpFgOffset, local
+// $v18: vVpMisc, local
+// $v19: vFogMask, local
+// $v20: local
+// $v21: mvTc1i, vVpNegScale, local
+// $v22: vPairST
+// $v23: vPairMVPPosF
+// $v24: vPairMVPPosI
+// $v25: prev vertex data, local
+// $v26: prev vertex data, local
+// $v27: vPairRGBA, local
+// $v28: mvTc1f, vPairAlpha37, local
+// $v29: very common local
+// $v30: mvTc2i, constant values for tri write
+// $v31: mvTc2f, general constant values
 
-// Global vector registers
-vZero equ $v0
-vOne equ $v1
-vPairST equ $v22
-vPairMVPPosF equ $v23
-vPairMVPPosI equ $v24
 
 // Initialization routines
 // Everything up until displaylist_dma will get overwritten by ovl0 and/or ovl1
@@ -595,7 +744,7 @@ start: // This is at IMEM 0x1080, not the start of IMEM
     beqz    $12, calculate_overlay_addrs    // skip overlay address calculations if resumed from yield?
      sw     $zero, OSTask + OSTask_flags
     j       load_overlay1_init              // Skip the initialization and go straight to loading overlay 1
-     lw     taskDataPtr, OS_YIELD_DATA_SIZE - 8
+     lw     taskDataPtr, OS_YIELD_DATA_SIZE - 8  // Was previously saved here at yield time
 task_init:
     mfc0    $11, DPC_STATUS
     andi    $11, $11, DPC_STATUS_XBUS_DMA
@@ -640,7 +789,7 @@ wait_dpc_start_valid:
     beqz $12, f3dzex_xbus_0000111C
      sw $zero, OSTask + OSTask_flags
     j load_overlay1_init
-     lw taskDataPtr, OS_YIELD_DATA_SIZE_TOTAL - 8
+     lw taskDataPtr, OS_YIELD_DATA_SIZE - 8 // Was previously saved here at yield time
 .fill 16 * 4 // Bunch of nops here to make it the same size as the fifo code.
 f3dzex_xbus_0000111C:
 .endif
@@ -664,7 +813,7 @@ calculate_overlay_addrs:
     sw      $5, overlayInfo3 + overlay_load
     lw      taskDataPtr, OSTask + OSTask_data_ptr
 load_overlay1_init:
-    li      $11, overlayInfo1   // set up loading of overlay 1
+    li      ovlTableEntry, overlayInfo1   // set up loading of overlay 1
 
 // Make room for overlays 0 and 1. Normally, overlay 1 ends exactly at ovl01_end,
 // and overlay 0 is much shorter, but if things are modded this constraint must be met.
@@ -680,18 +829,19 @@ load_overlay1_init:
 // which is displaylist_dma. So the padding has to be before these two instructions,
 // so that this is immediately before displaylist_dma; otherwise the return address
 // will be in the last few instructions of overlay 1. However, this was unnecessary--
-// it could have been a jump and then `addiu $12, $zero, displaylist_dma`.
+// it could have been a jump and then `addiu postOvlRA, $zero, displaylist_dma`,
+// and the padding put after this.
     jal     load_overlay_and_enter  // load overlay 1 and enter
-     move   $12, $ra                // set up the return address, since load_overlay_and_enter returns to $12
+     move   postOvlRA, $ra          // set up the return address, since load_overlay_and_enter returns to postOvlRA
 
 ovl01_end:
 // Overlays 0 and 1 overwrite everything up to this point (2.08 versions overwrite up to the previous .align 8)
 
 displaylist_dma: // loads inputBufferLength bytes worth of displaylist data via DMA into inputBuffer
-    li      $19, inputBufferLength - 1  // set the DMA length
-    move    $24, taskDataPtr            // set up the DRAM address to read from
-    jal     dma_read_write              // initiate the DMA read
-     la     $20, inputBuffer            // set the address to DMA read to
+    li      dmaLen, inputBufferLength - 1               // set the DMA length
+    move    cmd_w1_dram, taskDataPtr                    // set up the DRAM address to read from
+    jal     dma_read_write                              // initiate the DMA read
+     la     dmemAddr, inputBuffer                       // set the address to DMA read to
     addiu   taskDataPtr, taskDataPtr, inputBufferLength // increment the DRAM address to read from next time
     li      inputBufferPos, -inputBufferLength          // reset the DL word index
 wait_for_dma_and_run_next_command:
@@ -713,46 +863,44 @@ run_next_DL_command:
     sra     $12, cmd_w0, 24                             // extract DL command byte from command word
     sll     $11, $12, 1                                 // multiply command byte by 2 to get jump table offset
     lhu     $11, (commandJumpTable)($11)                // get command subroutine address from command jump table
-    bnez    $1, load_overlay_0_and_enter                // load and execute overlay 0 if yielding
-     lw     cmd_w1, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1
-    jr      $11                                         // jump to the loaded command handler
+    bnez    $1, load_overlay_0_and_enter                // load and execute overlay 0 if yielding; $1 > 0
+     lw     cmd_w1_dram, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1_dram
+    jr      $11                                         // jump to the loaded command handler; $1 == 0
      addiu  inputBufferPos, inputBufferPos, 0x0008      // increment the DL index by 2 words
 
 .if (UCODE_IS_F3DEX2_204H) // Microcodes besides F3DEX2 2.04H have this as a noop
 G_SPECIAL_1_handler:    // Seems to be a manual trigger for mvp recalculation
     li      $ra, run_next_DL_command
-    li      $21, pMatrix
-    li      $20, mvMatrix
-    li      $19, mvpMatrix
+    li      input_mtx_0, pMatrix
+    li      input_mtx_1, mvMatrix
+    li      output_mtx, mvpMatrix
     j       mtx_multiply
      sb     cmd_w0, mvpValid
 .endif
 
 G_DMA_IO_handler:
-    jal     segmented_to_physical // Convert the provided segmented address (in cmd_w1) to a virtual one
-     lh     $20, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
-    andi    $19, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
-    // At this point, $20's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size
+    jal     segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a physical one
+     lh     dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
+    andi    dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
+    // At this point, dmemAddr's highest bit is the flag, its next 13 bits are the DMEM address, and then its last two bits are the upper 2 of size
     // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit
-    sra     $20, $20, 2
-    j       dma_read_write  // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of $20)
+    sra     dmemAddr, dmemAddr, 2
+    j       dma_read_write  // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
      li     $ra, wait_for_dma_and_run_next_command  // Setup the return address for running the next DL command
 
-geometryMode equ $11
 G_GEOMETRYMODE_handler:
-    lw      geometryMode, geometryModeLabel     // load the geometry mode value
-    and     geometryMode, geometryMode, cmd_w0  // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
-    or      geometryMode, geometryMode, cmd_w1  // sets the flags in cmd_w1 (set in g*SPSetGeometryMode)
-    j       run_next_DL_command                 // run the next DL command
-     sw     geometryMode, geometryModeLabel     // update the geometry mode value
+    lw      $11, geometryModeLabel  // load the geometry mode value
+    and     $11, $11, cmd_w0        // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
+    or      $11, $11, cmd_w1_dram   // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
+    j       run_next_DL_command     // run the next DL command
+     sw     $11, geometryModeLabel  // update the geometry mode value
 
-dlStackIdx equ $1
 G_ENDDL_handler:
-    lbu     dlStackIdx, displayListStackLength      // Load the DL stack index
-    beqz    dlStackIdx, load_overlay_0_and_enter    // Load overlay 0 if there is no DL return address, to end the graphics task processing
-     addi   dlStackIdx, dlStackIdx, -4              // Decrement the DL stack index
-    j       f3dzex_ovl1_00001020                    // has a different version in ovl1
-     lw     taskDataPtr, (displayListStack)(dlStackIdx) // Load the address of the DL to return to into the taskDataPtr (the current DL address)
+    lbu     $1, displayListStackLength          // Load the DL stack index
+    beqz    $1, load_overlay_0_and_enter        // Load overlay 0 if there is no DL return address, to end the graphics task processing; $1 < 0 (-4) once the delay slot below has run
+     addi   $1, $1, -4                          // Decrement the DL stack index
+    j       f3dzex_ovl1_00001020                // has a different version in ovl1
+     lw     taskDataPtr, (displayListStack)($1) // Load the address of the DL to return to into the taskDataPtr (the current DL address)
 
 G_RDPHALF_2_handler:
     ldv     $v29[0], (texrectWord1)($zero)
@@ -760,7 +908,7 @@ G_RDPHALF_2_handler:
     addi    rdpCmdBufPtr, rdpCmdBufPtr, 8
     sdv     $v29[0], -8(rdpCmdBufPtr)
 G_RDP_handler:
-    sw      cmd_w1, 4(rdpCmdBufPtr)         // Add the second word of the command to the RDP command buffer
+    sw      cmd_w1_dram, 4(rdpCmdBufPtr)        // Add the second word of the command to the RDP command buffer
 G_SYNC_handler:
 G_NOOP_handler:
     sw      cmd_w0, 0(rdpCmdBufPtr)         // Add the command word to the RDP command buffer
@@ -769,25 +917,25 @@ G_NOOP_handler:
 
 G_SETxIMG_handler:
     li      $ra, G_RDP_handler          // Load the RDP command handler into the return address, then fall through to convert the address to virtual
-// Converts the segmented address in $24 (also cmd_w1) to the corresponding physical address
+// Converts the segmented address in cmd_w1_dram to the corresponding physical address
 segmented_to_physical:
-    srl     $11, $24, 22                // Copy (segment index << 2) into $11
-    andi    $11, $11, 0x3C              // Clear the bottom 2 bits that remained during the shift
-    lw      $11, (segmentTable)($11)    // Get the current address of the segment
-    sll     $24, $24, 8                 // Shift the address to the left so that the top 8 bits are shifted out
-    srl     $24, $24, 8                 // Shift the address back to the right, resulting in the original with the top 8 bits cleared
+    srl     $11, cmd_w1_dram, 22          // Copy (segment index << 2) into $11
+    andi    $11, $11, 0x3C                // Clear the bottom 2 bits that remained during the shift
+    lw      $11, (segmentTable)($11)      // Get the current address of the segment
+    sll     cmd_w1_dram, cmd_w1_dram, 8   // Shift the address to the left so that the top 8 bits are shifted out
+    srl     cmd_w1_dram, cmd_w1_dram, 8   // Shift the address back to the right, resulting in the original with the top 8 bits cleared
     jr      $ra
-     add    $24, $24, $11               // Add the segment's address to the masked input address, resulting in the virtual address
+     add    cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the physical address
 
 G_RDPSETOTHERMODE_handler:
-    sw      cmd_w0, otherMode0  // Record the local otherMode0 copy
-    j       G_RDP_handler       // Send the command to the RDP
-     sw     cmd_w1, otherMode1  // Record the local otherMode1 copy
+    sw      cmd_w0, otherMode0       // Record the local otherMode0 copy
+    j       G_RDP_handler            // Send the command to the RDP
+     sw     cmd_w1_dram, otherMode1  // Record the local otherMode1 copy
 
 G_SETSCISSOR_handler:
-    sw      cmd_w0, scissorUpLeft       // Record the local scissorUpleft copy
-    j       G_RDP_handler               // Send the command to the RDP
-     sw     cmd_w1, scissorBottomRight  // Record the local scissorBottomRight copy
+    sw      cmd_w0, scissorUpLeft            // Record the local scissorUpleft copy
+    j       G_RDP_handler                    // Send the command to the RDP
+     sw     cmd_w1_dram, scissorBottomRight  // Record the local scissorBottomRight copy
 
 check_rdp_buffer_full_and_run_next_cmd:
     li      $ra, run_next_DL_command    // Set up running the next DL command as the return address
@@ -798,42 +946,42 @@ check_rdp_buffer_full:
     blez    $11, return_routine         // Return if rdpCmdBufEnd >= rdpCmdBufPtr
 flush_rdp_buffer:
      mfc0   $12, SP_DMA_BUSY
-    lw      $24, rdpFifoPos
-    addiu   $19, $11, RDP_CMD_BUFSIZE
+    lw      cmd_w1_dram, rdpFifoPos
+    addiu   dmaLen, $11, RDP_CMD_BUFSIZE
     bnez    $12, flush_rdp_buffer
      lw     $12, OSTask + OSTask_output_buff_size
-    mtc0    $24, DPC_END
-    add     $11, $24, $19
+    mtc0    cmd_w1_dram, DPC_END
+    add     $11, cmd_w1_dram, dmaLen
     sub     $12, $12, $11
     bgez    $12, f3dzex_000012A8
 @@await_start_valid:
      mfc0   $11, DPC_STATUS
     andi    $11, $11, DPC_STATUS_START_VALID
     bnez    $11, @@await_start_valid
-     lw     $24, OSTask + OSTask_output_buff
+     lw     cmd_w1_dram, OSTask + OSTask_output_buff
 f3dzex_00001298:
     mfc0    $11, DPC_CURRENT
-    beq     $11, $24, f3dzex_00001298
+    beq     $11, cmd_w1_dram, f3dzex_00001298
      nop
-    mtc0    $24, DPC_START
+    mtc0    cmd_w1_dram, DPC_START
 f3dzex_000012A8:
     mfc0    $11, DPC_CURRENT
-    sub     $11, $11, $24
+    sub     $11, $11, cmd_w1_dram
     blez    $11, f3dzex_000012BC
-     sub    $11, $11, $19
+     sub    $11, $11, dmaLen
     blez    $11, f3dzex_000012A8
 f3dzex_000012BC:
-     add    $11, $24, $19
+     add    $11, cmd_w1_dram, dmaLen
     sw      $11, rdpFifoPos
     // Set up the DMA from DMEM to the RDP fifo in RDRAM
-    addi    $19, $19, -1                                    // subtract 1 from the length
-    addi    $20, rdpCmdBufEnd, -(0x2000 | RDP_CMD_BUFSIZE)  // The 0x2000 is meaningless, negative means write
+    addi    dmaLen, dmaLen, -1                                  // subtract 1 from the length
+    addi    dmemAddr, rdpCmdBufEnd, -(0x2000 | RDP_CMD_BUFSIZE) // The 0x2000 is meaningless, negative means write
     xori    rdpCmdBufEnd, rdpCmdBufEnd, rdpCmdBuffer1End ^ rdpCmdBuffer2End // Swap between the two RDP command buffers
     j       dma_read_write
      addi   rdpCmdBufPtr, rdpCmdBufEnd, -RDP_CMD_BUFSIZE
 .else // UCODE_METHOD == METHOD_XBUS
 check_rdp_buffer_full:
-    addi $11, rdpCmdBufPtr, -0xF10
+    addi $11, rdpCmdBufPtr, -(OSTask - RDP_CMD_BUFSIZE_EXCESS)
     blez $11, ovl0_04001284
      mtc0 rdpCmdBufPtr, DPC_END
 ovl0_04001260:
@@ -851,7 +999,7 @@ ovl0_04001284:
     mfc0 $11, DPC_CURRENT
     sub $11, $11, rdpCmdBufPtr
     blez $11, ovl0_0400129C
-     addi $11, $11, -0xB0
+     addi $11, $11, -RDP_CMD_BUFSIZE_EXCESS
     blez $11, ovl0_04001284
      nop
 ovl0_0400129C:
@@ -864,17 +1012,12 @@ ovl23_start:
 
 ovl3_start:
 
-clipPolyWrite equ $21
-clipMaskIdx equ $5
-
-vPairRGBATemp equ $v7
-
 // Jump here to do lighting. If overlay 3 is loaded (this code), loads and jumps
 // to overlay 2 (same address as right here).
-ovl23_lighting_entrypoint_copy:     // same IMEM address as ovl23_lighting_entrypoint
-    li      $11, overlayInfo2       // set up a load for overlay 2
-    j       load_overlay_and_enter  // load overlay 2
-     li     $12, ovl23_lighting_entrypoint  // set the return address
+ovl23_lighting_entrypoint_copy:  // same IMEM address as ovl23_lighting_entrypoint
+    li      ovlTableEntry, overlayInfo2          // set up a load for overlay 2
+    j       load_overlay_and_enter               // load overlay 2
+     li     postOvlRA, ovl23_lighting_entrypoint // set the return address
 
 // Jump here to do clipping. If overlay 3 is loaded (this code), directly starts
 // the clipping code.
@@ -925,7 +1068,7 @@ clipping_skipswap23: // After possible swap, $19 = vtx not meeting clip cond / o
     vmadn   $v10, $v6, $v3            // frac: - vtx off screen * clip ratio
     vmadh   $v11, $v7, $v3            // int:  - vtx off screen * clip ratio 11:10
     vaddc   $v8, $v8, $v8[0q]         // frac: y += x, w += z, vtx on screen only
-    lqv     $v25[0], (linearGenerateCoefficients)($zero) // Used just to load the value 0x0002
+    lqv     $v25[0], (linearGenerateCoefficients)($zero) // Used just to load the value 2
     vadd    $v9, $v9, $v9[0q]         // int:  y += x, w += z, vtx on screen only
     vaddc   $v10, $v10, $v10[0q]      // frac: y += x, w += z, vtx on screen - vtx off screen
     vadd    $v11, $v11, $v11[0q]      // int:  y += x, w += z, vtx on screen - vtx off screen
@@ -1066,39 +1209,33 @@ ovl3_end:
 
 ovl23_end:
 
-inputVtxPos equ $14
-// See load_spfx_global_values for detailed contents
-vVpFgScale  equ $v16
-vVpFgOffset equ $v17
-vVpMisc     equ $v18
-vFogMask    equ $v19
-vVpNegScale equ $v21
+vPairRGBATemp equ $v7
 
 G_VTX_handler:
-    lhu     $20, (vertexTable)(cmd_w0)      // Load the address of the provided vertex array
-    jal     segmented_to_physical           // Convert the vertex array's segmented address (in $24) to a virtual one
+    lhu     dmemAddr, (vertexTable)(cmd_w0) // Load the address of the provided vertex array
+    jal     segmented_to_physical           // Convert the vertex array's segmented address (in cmd_w1_dram) to a physical one
      lhu    $1, (inputBufferEnd - 0x07)(inputBufferPos) // Load the size of the vertex array to copy into reg $1
-    sub     $20, $20, $1                    // Calculate the address to DMA the provided vertices into
+    sub     dmemAddr, dmemAddr, $1          // Calculate the address to DMA the provided vertices into
     jal     dma_read_write                  // DMA read the vertices from DRAM
-     addi   $19, $1, -1                     // Set up the DMA length
+     addi   dmaLen, $1, -1                  // Set up the DMA length
     lhu     $5, geometryModeLabel           // Load the geometry mode into $5
     srl     $1, $1, 3
     sub     outputVtxPos, cmd_w0, $1
     lhu     outputVtxPos, (vertexTable)(outputVtxPos)
-    move    inputVtxPos, $20
+    move    inputVtxPos, dmemAddr
     lbu     secondVtxPos, mvpValid          // used as temp reg
-    andi    tmpCurLight, $5, G_LIGHTING_H   // If no lighting, tmpCurLight is 0, skips transforming light dirs and setting this up as a pointer
-    bnez    tmpCurLight, ovl23_lighting_entrypoint // Run overlay 2 for lighting, either directly or via overlay 3 loading overlay 2
+    andi    topLightPtr, $5, G_LIGHTING_H   // If no lighting, topLightPtr is 0, skips transforming light dirs and setting this up as a pointer
+    bnez    topLightPtr, ovl23_lighting_entrypoint // Run overlay 2 for lighting, either directly or via overlay 3 loading overlay 2
      andi   $7, $5, G_FOG_H
 after_light_dir_xfrm:
     bnez    secondVtxPos, vertex_skip_recalc_mvp  // Skip recalculating the mvp matrix if it's already up-to-date
      sll    $7, $7, 3                 // $7 is 8 if G_FOG is set, 0 otherwise
     sb      cmd_w0, mvpValid          // Set mvpValid
-    li      $21, pMatrix              // Arguments to mtx_multiply
-    li      $20, mvMatrix
+    li      input_mtx_0, pMatrix      // Arguments to mtx_multiply
+    li      input_mtx_1, mvMatrix
     // Calculate the MVP matrix
     jal     mtx_multiply
-     li     $19, mvpMatrix
+     li     output_mtx, mvpMatrix
 
 vertex_skip_recalc_mvp:
     /* Load MVP matrix as follows--note that translation is in the bottom row,
@@ -1143,7 +1280,7 @@ vertices_process_pair:
     vmadh   $v29, mxr3i, vOne[0]
     llv     vPairST[12], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // load the texture coords of the 1st vertex into second half of vPairST
     vmadn   $v29, mxr0f, $v20[0h]
-    move    curLight, tmpCurLight
+    move    curLight, topLightPtr
     vmadh   $v29, mxr0i, $v20[0h]
     lpv     $v2[0], (ltBufOfs + 0x10)(curLight)    // First instruction of lights_dircoloraccum2 loop; load light transformed dir
     vmadn   $v29, mxr1f, $v20[1h]
@@ -1151,7 +1288,7 @@ vertices_process_pair:
     vmadh   $v29, mxr1i, $v20[1h]
     lpv     vPairRGBATemp[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // Load both vertex's colors/normals into v7's elements RGBARGBA or XYZAXYZA
     vmadn   vPairMVPPosF, mxr2f, $v20[2h]          // vPairMVPPosF = MVP * vpos result frac
-    bnez    tmpCurLight, light_vtx                 // Zero if lighting disabled, pointer if enabled
+    bnez    topLightPtr, light_vtx                 // Zero if lighting disabled, pointer if enabled
      vmadh  vPairMVPPosI, mxr2i, $v20[2h]          // vPairMVPPosI = MVP * vpos result int
     // These two instructions are repeated at the end of all the lighting codepaths,
     // since they're skipped here if lighting is being performed
@@ -1309,12 +1446,12 @@ load_spfx_global_values:
     llv     $v29[0], (fogFactor - spFxBase)(spFxBaseReg) // Load fog multiplier and offset
     ldv     vVpFgOffset[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7
     ldv     vVpFgOffset[8], (viewport + 8)($zero)
-    vlt     vFogMask, $v31, $v31[3]               // VCC = [0, 0, 0, 1, 0, 0, 0, 1]
+    vlt     vFogMask, $v31, $v31[3]               // VCC = 11101110
     vsub    vVpNegScale, vZero, vVpFgScale        // -vscale
     llv     vVpMisc[4], (textureSettings2 - spFxBase)(spFxBaseReg) // Texture ST scale
     vmrg    vVpFgScale, vVpFgScale, $v29[0]       // Put fog multiplier in elements 3,7 of vscale
     llv     vVpMisc[12], (textureSettings2 - spFxBase)(spFxBaseReg) // Texture ST scale
-    vmrg    vFogMask, vZero, vOne[0]              // Put 0 in elems 3,7, 1 in others
+    vmrg    vFogMask, vZero, vOne[0]              // Put 0 in most elems, 1 in elems 3,7
     llv     vVpMisc[8], (perspNorm)($zero)        // Perspective normalization long (actually short)
     vmrg    vVpFgOffset, vVpFgOffset, $v29[1]     // Put fog offset in elements 3,7 of vtrans
     lsv     vVpMisc[10], (clipRatio + 6 - spFxBase)(spFxBaseReg) // Clip ratio (-x version, but normally +/- same in all dirs)
@@ -1324,13 +1461,13 @@ load_spfx_global_values:
 
 G_TRI2_handler:
 G_QUAD_handler:
-    jal     tri_to_rdp               // Send second tri; return here for first tri
-     sw     cmd_w1, 4(rdpCmdBufPtr)  // Put second tri indices in temp memory
+    jal     tri_to_rdp                   // Send second tri; return here for first tri
+     sw     cmd_w1_dram, 4(rdpCmdBufPtr) // Put second tri indices in temp memory
 G_TRI1_handler:
-    li      $ra, run_next_DL_command // After done with this tri, run next cmd
-    sw      cmd_w0, 4(rdpCmdBufPtr)  // Put first tri indices in temp memory
+    li      $ra, run_next_DL_command     // After done with this tri, run next cmd
+    sw      cmd_w0, 4(rdpCmdBufPtr)      // Put first tri indices in temp memory
 tri_to_rdp:
-    lpv     $v2[0], 0(rdpCmdBufPtr)  // Load tri indexes to vector unit for shuffling
+    lpv     $v2[0], 0(rdpCmdBufPtr)      // Load tri indexes to vector unit for shuffling
     // read the three vertex indices from the stored command word
     lbu     $1, 0x0005(rdpCmdBufPtr)     // $1 = vertex 1 index
     lbu     $2, 0x0006(rdpCmdBufPtr)     // $2 = vertex 2 index
@@ -1605,9 +1742,9 @@ no_textures:
     vmadh   $v9, $v3, $v15[3]
     sdv     $v7[0], 0x0028($2)      // Store DrDy, DgDy, DbDy, DaDy shade coefficients (integer)
     vmudn   $v29, $v5, vOne[0]
-    sdv     $v6[8], 0x0038($1)      // Store DsDy, DtDy, DwDy texture coefficeints (fractional)
+    sdv     $v6[8], 0x0038($1)      // Store DsDy, DtDy, DwDy texture coefficients (fractional)
     vmadh   $v29, $v18, vOne[0]
-    sdv     $v7[8], 0x0028($1)      // Store DsDy, DtDy, DwDy texture coefficeints (integer)
+    sdv     $v7[8], 0x0028($1)      // Store DsDy, DtDy, DwDy texture coefficients (integer)
     vmadl   $v29, $v8, $v4[1]
     sdv     $v8[0], 0x0030($2)      // Store DrDe, DgDe, DbDe, DaDe shade coefficients (fractional)
     vmadm   $v29, $v9, $v4[1]
@@ -1648,35 +1785,35 @@ no_z_buffer:
     j       check_rdp_buffer_full   // eventually returns to $ra, which is next cmd, second tri in TRI2, or middle of clipping
      sdv    $v18[8], 0x0000($1)     // Store S, T, W texture coefficients (integer)
 
-vtxPtr equ $25
-endVtxPtr equ $24
+vtxPtr    equ $25 // = cmd_w0
+endVtxPtr equ $24 // = cmd_w1_dram
 G_CULLDL_handler:
-    lhu     vtxPtr, (vertexTable)(cmd_w0)   // load start vertex address
-    lhu     endVtxPtr, (vertexTable)(cmd_w1) // load end vertex address
+    lhu     vtxPtr, (vertexTable)(cmd_w0)     // load start vertex address
+    lhu     endVtxPtr, (vertexTable)(cmd_w1_dram) // load end vertex address
     addiu   $1, $zero, (CLIP_NX | CLIP_NY | CLIP_PX | CLIP_PY | CLIP_FAR | CLIP_NEAR)
-    lw      $11, VTX_CLIP(vtxPtr)           // read clip flags from vertex
+    lw      $11, VTX_CLIP(vtxPtr)             // read clip flags from vertex
 culldl_loop:
     and     $1, $1, $11
-    beqz    $1, run_next_DL_command         // Some vertex is on the screen-side of all clipping planes; have to render
+    beqz    $1, run_next_DL_command           // Some vertex is on the screen-side of all clipping planes; have to render
      lw     $11, (vtxSize + VTX_CLIP)(vtxPtr) // next vertex clip flags
-    bne     vtxPtr, endVtxPtr, culldl_loop  // loop until reaching the last vertex
-     addiu  vtxPtr, vtxPtr, vtxSize         // advance to the next vertex
-    j       G_ENDDL_handler                 // If got here, there's some clipping plane where all verts are outside it; skip DL
+    bne     vtxPtr, endVtxPtr, culldl_loop    // loop until reaching the last vertex
+     addiu  vtxPtr, vtxPtr, vtxSize           // advance to the next vertex
+    j       G_ENDDL_handler                   // If got here, there's some clipping plane where all verts are outside it; skip DL
 G_BRANCH_WZ_handler:
-     lhu    vtxPtr, (vertexTable)(cmd_w0)   // get the address of the vertex being tested
-.if UCODE_TYPE == TYPE_F3DZEX               // BRANCH_W/BRANCH_Z difference
-    lh      vtxPtr, VTX_W_INT(vtxPtr)       // read the w coordinate of the vertex (f3dzex)
+     lhu    vtxPtr, (vertexTable)(cmd_w0)     // get the address of the vertex being tested
+.if UCODE_TYPE == TYPE_F3DZEX                 // BRANCH_W/BRANCH_Z difference
+    lh      vtxPtr, VTX_W_INT(vtxPtr)         // read the w coordinate of the vertex (f3dzex)
 .else
-    lw      vtxPtr, VTX_SCR_Z(vtxPtr)       // read the screen z coordinate (int and frac) of the vertex (f3dex2)
+    lw      vtxPtr, VTX_SCR_Z(vtxPtr)         // read the screen z coordinate (int and frac) of the vertex (f3dex2)
 .endif
-    sub     $2, vtxPtr, endVtxPtr       // subtract the w/z value being tested
-    bgez    $2, run_next_DL_command     // if vtx.w/z > w/z, continue running this DL
-     lw     $24, rdpHalf1Val            // load the RDPHALF1 value
-    j       f3dzex_ovl1_00001008
+    sub     $2, vtxPtr, cmd_w1_dram           // subtract the w/z value being tested
+    bgez    $2, run_next_DL_command           // if vtx.w/z >= cmd w/z, continue running this DL
+     lw     cmd_w1_dram, rdpHalf1Val          // load the RDPHALF1 value as the location to branch to
+    j       branch_dl
 G_MODIFYVTX_handler:
      lbu    $1, (inputBufferEnd - 0x07)(inputBufferPos)
     j       do_moveword
-     lhu    vtxPtr, (vertexTable)(cmd_w0)
+     lhu    cmd_w0, (vertexTable)(cmd_w0)
 
      
 .if . > 0x00001FAC
@@ -1688,46 +1825,45 @@ G_MODIFYVTX_handler:
 // to load_overlay_and_enter to execute the load.
 load_overlay_0_and_enter:
 G_LOAD_UCODE_handler:
-    li      $12, ovl0_start    // Sets up return address
-    li      $11, overlayInfo0       // Sets up ovl0 table address
+    li      postOvlRA, ovl0_start                    // Sets up return address
+    li      ovlTableEntry, overlayInfo0              // Sets up ovl0 table address
 // This subroutine accepts the address of an overlay table entry and loads that overlay.
 // It then jumps to that overlay's address after DMA of the overlay is complete.
-// $11 is used to provide the overlay table entry
-// $12 is used to pass in a value to return to
-ovlTableEntry equ $11
-returnAddr equ $12
+// ovlTableEntry is used to provide the overlay table entry
+// postOvlRA is used to pass in a value to return to
 load_overlay_and_enter:
-    lw      $24, overlay_load(ovlTableEntry)    // Set up overlay dram address
-    lhu     $19, overlay_len(ovlTableEntry)     // Set up overlay length
-    jal     dma_read_write                      // DMA the overlay
-     lhu    $20, overlay_imem(ovlTableEntry)    // Set up overlay load address
-    move    $ra, returnAddr     // Set the return address to the passed in value
+    lw      cmd_w1_dram, overlay_load(ovlTableEntry) // Set up overlay dram address
+    lhu     dmaLen, overlay_len(ovlTableEntry)       // Set up overlay length
+    jal     dma_read_write                           // DMA the overlay
+     lhu    dmemAddr, overlay_imem(ovlTableEntry)    // Set up overlay load address
+    move    $ra, postOvlRA                // Set the return address to the passed in value
+
+.if . > 0x1FC8
+    .error "Constraints violated on what can be overwritten at end of ucode (relevant for G_LOAD_UCODE)"
+.endif
+
 while_wait_dma_busy:
-    mfc0    $11, SP_DMA_BUSY    // Load the DMA_BUSY value into $11
+    mfc0    ovlTableEntry, SP_DMA_BUSY    // Load the DMA_BUSY value into ovlTableEntry
 while_dma_busy:
-    bnez    $11, while_dma_busy // Loop until DMA_BUSY is cleared
-     mfc0   $11, SP_DMA_BUSY    // Update $11's DMA_BUSY value
+    bnez    ovlTableEntry, while_dma_busy // Loop until DMA_BUSY is cleared
+     mfc0   ovlTableEntry, SP_DMA_BUSY    // Update ovlTableEntry's DMA_BUSY value
 // This routine is used to return via conditional branch
 return_routine:
     jr      $ra
 
-dmemAddr equ $20
-dramAddr equ $24
-dmaLen equ $19
-dmaFull equ $11
 dma_read_write:
-     mfc0   dmaFull, SP_DMA_FULL    // load the DMA_FULL value
+     mfc0   $11, SP_DMA_FULL          // load the DMA_FULL value
 while_dma_full:
-    bnez    dmaFull, while_dma_full // Loop until DMA_FULL is cleared
-     mfc0   dmaFull, SP_DMA_FULL    // Update DMA_FULL value
-    mtc0    dmemAddr, SP_MEM_ADDR   // Set the DMEM address to DMA from/to
-    bltz    dmemAddr, dma_write     // If the DMEM address is negative, this is a DMA write, if not read
-     mtc0   dramAddr, SP_DRAM_ADDR  // Set the DRAM address to DMA from/to
+    bnez    $11, while_dma_full       // Loop until DMA_FULL is cleared
+     mfc0   $11, SP_DMA_FULL          // Update DMA_FULL value
+    mtc0    dmemAddr, SP_MEM_ADDR     // Set the DMEM address to DMA from/to
+    bltz    dmemAddr, dma_write       // If the DMEM address is negative, this is a DMA write, if not read
+     mtc0   cmd_w1_dram, SP_DRAM_ADDR // Set the DRAM address to DMA from/to
     jr $ra
-     mtc0   dmaLen, SP_RD_LEN       // Initiate a DMA read with a length of dmaLen
+     mtc0   dmaLen, SP_RD_LEN         // Initiate a DMA read with a length of dmaLen
 dma_write:
     jr $ra
-     mtc0   dmaLen, SP_WR_LEN       // Initiate a DMA write with a length of dmaLen
+     mtc0   dmaLen, SP_WR_LEN         // Initiate a DMA write with a length of dmaLen
 
 .if . > 0x00002000
     .error "Not enough room in IMEM"
@@ -1739,6 +1875,8 @@ dma_write:
 .headersize 0x00001000 - orga()
 
 // Overlay 0 controls the RDP and also stops the RSP when work is done
+// The action here is controlled by $1. If yielding, $1 > 0. If this was
+// G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0.
 ovl0_start:
 .if UCODE_METHOD == METHOD_FIFO
     sub     $11, rdpCmdBufPtr, rdpCmdBufEnd
@@ -1747,37 +1885,42 @@ ovl0_start:
      nop
     jal     while_wait_dma_busy
      lw     $24, rdpFifoPos
-    bltz    $1, taskdone_and_break
+    bltz    $1, taskdone_and_break  // $1 < 0 = Got to the end of the parent DL
      mtc0   $24, DPC_END            // Set the end pointer of the RDP so that it starts the task
 .else // UCODE_METHOD == METHOD_XBUS
-    bltz    $1, taskdone_and_break
+    bltz    $1, taskdone_and_break  // $1 < 0 = Got to the end of the parent DL
      nop
 .endif
-    bnez    $1, task_yield
-     add    taskDataPtr, taskDataPtr, inputBufferPos
-    lw      $24, 0x09C4(inputBufferPos) // Should this be (inputBufferEnd - 0x04)?
-    sw      taskDataPtr, OSTask + OSTask_data_ptr
-    sw      $24, OSTask + OSTask_ucode
-    la      $20, start              // DMA address
-    jal     dma_read_write          // initiate DMA read
-     li     $19, 0x0F48 - 1
+    bnez    $1, task_yield          // $1 > 0 = CPU requested yield
+     add    taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr pointed just past the currently loaded chunk of the DL
+// If here, G_LOAD_UCODE was executed.
+    lw      cmd_w1_dram, (inputBufferEnd - 0x04)(inputBufferPos) // word 1 = ucode code DRAM addr
+    sw      taskDataPtr, OSTask + OSTask_data_ptr // Store where we are in the DL
+    sw      cmd_w1_dram, OSTask + OSTask_ucode // Store pointer to new ucode about to execute
+    la      dmemAddr, start         // Beginning of overwritable part of IMEM
+    jal     dma_read_write          // DMA DRAM read -> IMEM write
+     li     dmaLen, (while_wait_dma_busy - start) - 1 // End of overwritable part of IMEM
 .if UCODE_METHOD == METHOD_XBUS
 ovl0_xbus_wait_for_rdp:
     mfc0 $11, DPC_STATUS
     andi $11, $11, DPC_STATUS_DMA_BUSY
     bnez $11, ovl0_xbus_wait_for_rdp // Keep looping while RDP is busy.
 .endif
-    lw      $24, rdpHalf1Val
-    la      $20, 0x0180             // DMA address; equal to but probably not actually spFxBase or clipRatio
-    andi    $19, cmd_w0, 0x0FFF
-    add     $24, $24, $20
+    lw      cmd_w1_dram, rdpHalf1Val // Get DRAM address of ucode data from rdpHalf1Val
+    la      dmemAddr, spFxBase      // DMEM address is spFxBase
+    andi    dmaLen, cmd_w0, 0x0FFF  // Extract DMEM length from command word
+    add     cmd_w1_dram, cmd_w1_dram, dmemAddr // Start overwriting data from spFxBase
     jal     dma_read_write          // initate DMA read
-     sub    $19, $19, $20
+     sub    dmaLen, dmaLen, dmemAddr // End that much before the end of DMEM
     j       while_wait_dma_busy
-.if (UCODE_IS_F3DEX2_204H || UCODE_METHOD == METHOD_XBUS /* ??? */)
-     li     $ra, taskdone_and_break_204H
+.if (UCODE_IS_F3DEX2_204H)
+     li     $ra, start
 .else
-     li     $ra, taskdone_and_break
+     li     $ra, start + 4 // Not sure why we skip the first instruction of the new ucode
+.endif
+
+.if . > start
+    .error "ovl0_start does not fit within the space before the start of the ucode loaded with G_LOAD_UCODE"
 .endif
 
 ucode equ $11
@@ -1788,24 +1931,29 @@ task_yield:
     sw      taskDataPtr, OS_YIELD_DATA_SIZE - 8
     sw      ucode, OS_YIELD_DATA_SIZE - 4
     li      status, SP_SET_SIG1 | SP_SET_SIG2   // yielded and task done signals
-    lw      $24, OSTask + OSTask_yield_data_ptr
-    li      $20, 0x8000
-    li      $19, OS_YIELD_DATA_SIZE - 1
+    lw      cmd_w1_dram, OSTask + OSTask_yield_data_ptr
+    li      dmemAddr, 0x8000 // 0, but negative = write
+    li      dmaLen, OS_YIELD_DATA_SIZE - 1
 .else // UCODE_METHOD == METHOD_XBUS
-    sw      taskDataPtr, OS_YIELD_DATA_SIZE
-    sw      ucode, OS_YIELD_DATA_SIZE + 4
-    lw      $24, OSTask + OSTask_yield_data_ptr
-    li      $20, 0x8000
+    // Instead of saving the whole first OS_YIELD_DATA_SIZE bytes of DMEM,
+    // XBUS saves only up to inputBuffer, as everything after that can be erased,
+    // and because the RDP may still be using the output buffer, which is where
+    // we'd have to write taskDataPtr and ucode.
+    sw      taskDataPtr, inputBuffer // save these values for below, somewhere outside
+    sw      ucode, inputBuffer + 4   // the area being written
+    lw      cmd_w1_dram, OSTask + OSTask_yield_data_ptr
+    li      dmemAddr, 0x8000 // 0, but negative = write
     jal     dma_read_write
-     li     $19, OS_YIELD_DATA_SIZE - 1
+     li     dmaLen, inputBuffer - 1
+    // At the end of the OS's yield buffer, write the taskDataPtr and ucode words.
     li      status, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals
-    addiu   $24, $24, OS_YIELD_DATA_SIZE_TOTAL - 8
-    li      $20, -0x76E0 // ???
-    li      $19, 7
+    addiu   cmd_w1_dram, cmd_w1_dram, OS_YIELD_DATA_SIZE - 8
+    li      dmemAddr, 0x8000 | inputBuffer // where they were saved above
+    li      dmaLen, 8 - 1
 .endif
     j       dma_read_write
-taskdone_and_break_204H: // Only used in f3dex2 2.04H
      li     $ra, break
+
 taskdone_and_break:
     li      status, SP_SET_SIG2   // task done signal
 break:
@@ -1835,11 +1983,11 @@ ovl1_start:
 G_DL_handler:
     lbu     $1, displayListStackLength  // Get the DL stack length
     sll     $2, cmd_w0, 15              // Shifts the push/nopush value to the highest bit in $2
-f3dzex_ovl1_00001008:
+branch_dl:
     jal     segmented_to_physical
      add    $3, taskDataPtr, inputBufferPos
     bltz    $2, displaylist_dma         // If the operation is nopush (branch) then simply DMA the new displaylist
-     move   taskDataPtr, cmd_w1         // Set the task data pointer to the target display list
+     move   taskDataPtr, cmd_w1_dram    // Set the task data pointer to the target display list
     sw      $3, (displayListStack)($1)
     addi    $1, $1, 4                   // Increment the DL stack length
 f3dzex_ovl1_00001020:
@@ -1855,7 +2003,7 @@ G_TEXRECTFLIP_handler:
 G_RDPHALF_1_handler:
     j       run_next_DL_command
     // Stores second command word into textureSettings for gSPTexture, 0x00D4 for gSPTextureRectangle/Flip, 0x00D8 for G_RDPHALF_1
-     sw     cmd_w1, (texrectWord2 - G_TEXRECTFLIP_handler)($11)
+     sw     cmd_w1_dram, (texrectWord2 - G_TEXRECTFLIP_handler)($11)
 
 G_MOVEWORD_handler:
     srl     $2, cmd_w0, 16                              // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
@@ -1863,32 +2011,29 @@ G_MOVEWORD_handler:
 do_moveword:
     add     $1, $1, cmd_w0          // adds the offset in the command word to the address from the table (the upper 4 bytes are effectively ignored)
     j       run_next_DL_command     // process the next command
-     sw     cmd_w1, ($1)            // moves the specified value (in cmd_w1) into the word (offset + moveword_table[index])
+     sw     cmd_w1_dram, ($1)       // moves the specified value (in cmd_w1_dram) into the word (offset + moveword_table[index])
 
 G_POPMTX_handler:
     lw      $11, matrixStackPtr             // Get the current matrix stack pointer
     lw      $2, OSTask + OSTask_dram_stack  // Read the location of the dram stack
-    sub     $24, $11, cmd_w1                // Decrease the matrix stack pointer by the amount passed in the second command word
-    sub     $1, $24, $2                     // Subtraction to check if the new pointer is greater than or equal to $2
+    sub     cmd_w1_dram, $11, cmd_w1_dram           // Decrease the matrix stack pointer by the amount passed in the second command word
+    sub     $1, cmd_w1_dram, $2                     // Subtraction to check if the new pointer is greater than or equal to $2
     bgez    $1, do_popmtx                   // If the new matrix stack pointer is greater than or equal to $2, then use the new pointer as is
      nop
-    move    $24, $2                         // If the new matrix stack pointer is less than $2, then use $2 as the pointer instead
+    move    cmd_w1_dram, $2                         // If the new matrix stack pointer is less than $2, then use $2 as the pointer instead
 do_popmtx:
-    beq     $24, $11, run_next_DL_command   // If no bytes were popped, then we don't need to make the mvp matrix as being out of date and can run the next command
-     sw     $24, matrixStackPtr             // Update the matrix stack pointer with the new value
+    beq     cmd_w1_dram, $11, run_next_DL_command   // If no bytes were popped, then we don't need to mark the mvp matrix as being out of date and can run the next command
+     sw     cmd_w1_dram, matrixStackPtr             // Update the matrix stack pointer with the new value
     j       do_movemem
      sw     $zero, mvpValid                 // Mark the MVP matrix and light directions as being out of date (the word being written to contains both)
 
 G_MTX_end: // Multiplies the loaded model matrix into the model stack
-    lhu     $19, (movememTable + G_MV_MMTX)($1) // Set the output matrix to the model or projection matrix based on the command
+    lhu     output_mtx, (movememTable + G_MV_MMTX)($1) // Set the output matrix to the model or projection matrix based on the command
     jal     while_wait_dma_busy
-     lhu    $21, (movememTable + G_MV_MMTX)($1) // Set the first input matrix to the model or projection matrix based on the command
+     lhu    input_mtx_0, (movememTable + G_MV_MMTX)($1) // Set the first input matrix to the model or projection matrix based on the command
     li      $ra, run_next_DL_command
     // The second input matrix will correspond to the address that memory was moved into, which will be tempMtx for G_MTX
 
-input_mtx_0 equ $21
-input_mtx_1 equ $20
-output_mtx equ $19
 mtx_multiply:
     addi    $12, input_mtx_1, 0x0018
 @@loop:
@@ -1920,7 +2065,6 @@ mtx_multiply:
     jr      $ra
      sqv    $v6[0], 0x0010(output_mtx)
 
-matrixStackAddr equ $24
 G_MTX_handler:
     // The lower 3 bits of G_MTX are, from LSb to MSb (0 value/1 value),
     //  matrix type (modelview/projection)
@@ -1930,27 +2074,27 @@ G_MTX_handler:
     andi    $11, cmd_w0, G_MTX_P_MV | G_MTX_NOPUSH_PUSH // Read the matrix type and push type flags into $11
     bnez    $11, load_mtx                               // If the matrix type is projection or this is not a push, skip pushing the matrix
      andi   $2, cmd_w0, G_MTX_MUL_LOAD                  // Read the matrix load type into $2 (0 is multiply, 2 is load)
-    lw      matrixStackAddr, matrixStackPtr             // Set up the DMA from dmem to rdram at the matrix stack pointer
-    li      $20, -0x2000                                //
+    lw      cmd_w1_dram, matrixStackPtr                 // Set up the DMA from dmem to rdram at the matrix stack pointer
+    li      dmemAddr, -0x2000                           //
     jal     dma_read_write                              // DMA the current matrix from dmem to rdram
-     li     $19, 0x0040 - 1                             // Set the DMA length to the size of a matrix (minus 1 because DMA is inclusive)
-    addi    matrixStackAddr, matrixStackAddr, 0x40      // Increase the matrix stack pointer by the size of one matrix
-    sw      matrixStackAddr, matrixStackPtr             // Update the matrix stack pointer
-    lw      cmd_w1, (inputBufferEnd - 4)(inputBufferPos)
+     li     dmaLen, 0x0040 - 1                          // Set the DMA length to the size of a matrix (minus 1 because DMA is inclusive)
+    addi    cmd_w1_dram, cmd_w1_dram, 0x40              // Increase the matrix stack pointer by the size of one matrix
+    sw      cmd_w1_dram, matrixStackPtr                 // Update the matrix stack pointer
+    lw      cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Load command word 1 again
 load_mtx:
     add     $12, $12, $2        // Add the load type to the command byte, selects the return address based on whether the matrix needs multiplying or just loading
     sw      $zero, mvpValid     // Mark the MVP matrix and light directions as being out of date (the word being written to contains both)
 G_MOVEMEM_handler:
-    jal     segmented_to_physical   // convert the memory address cmd_w1 to a virtual one
+    jal     segmented_to_physical   // convert the segmented address in cmd_w1_dram to a physical one
 do_movemem:
-     andi   $1, cmd_w0, 0x00FE                           // Move the movemem table index into $1 (bits 1-7 of the first command word)
-    lbu     $19, (inputBufferEnd - 0x07)(inputBufferPos) // Move the second byte of the first command word into $19
-    lhu     $20, (movememTable)($1)                      // Load the address of the memory location for the given movemem index
-    srl     $2, cmd_w0, 5                                // Left shifts the index by 5 (which is then added to the value read from the movemem table)
+     andi   $1, cmd_w0, 0x00FE                              // Move the movemem table index into $1 (bits 1-7 of the first command word)
+    lbu     dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Move the second byte of the first command word into dmaLen
+    lhu     dmemAddr, (movememTable)($1)                    // Load the address of the memory location for the given movemem index
+    srl     $2, cmd_w0, 5                                   // Right shifts cmd_w0 by 5 (the result is then added to the value read from the movemem table)
     lhu     $ra, (movememHandlerTable - (G_POPMTX | 0xFF00))($12)  // Loads the return address from movememHandlerTable based on command byte
     j       dma_read_write
 G_SETOTHERMODE_H_handler: // These handler labels must be 4 bytes apart for the code below to work
-     add    $20, $20, $2
+     add    dmemAddr, dmemAddr, $2                          // This is for the code above, does nothing for G_SETOTHERMODE_H
 G_SETOTHERMODE_L_handler:
     lw      $3, (othermode0 - G_SETOTHERMODE_H_handler)($11) // resolves to othermode0 or othermode1 based on which handler was jumped to
     lui     $2, 0x8000
@@ -1959,11 +2103,11 @@ G_SETOTHERMODE_L_handler:
     srlv    $2, $2, $1
     nor     $2, $2, $zero
     and     $3, $3, $2
-    or      $3, $3, cmd_w1
+    or      $3, $3, cmd_w1_dram
     sw      $3, (othermode0 - G_SETOTHERMODE_H_handler)($11)
     lw      cmd_w0, otherMode0
     j       G_RDP_handler
-     lw     cmd_w1, otherMode1
+     lw     cmd_w1_dram, otherMode1
 
 .align 8
 ovl1_end:
@@ -1978,13 +2122,13 @@ ovl2_start:
 ovl23_lighting_entrypoint:
     lbu     $11, lightsValid
     j       continue_light_dir_xfrm
-     lbu    tmpCurLight, numLightsx18
+     lbu    topLightPtr, numLightsx18
 
-ovl23_clipping_entrypoint_copy:         // same IMEM address as ovl23_clipping_entrypoint
+ovl23_clipping_entrypoint_copy:  // same IMEM address as ovl23_clipping_entrypoint
     move    savedRA, $ra
-    li      $11, overlayInfo3           // set up a load of overlay 3
-    j       load_overlay_and_enter      // load overlay 3
-     li     $12, ovl3_clipping_nosavera // set up the return address in ovl3
+    li      ovlTableEntry, overlayInfo3       // set up a load of overlay 3
+    j       load_overlay_and_enter            // load overlay 3
+     li     postOvlRA, ovl3_clipping_nosavera // set up the return address in ovl3
      
 continue_light_dir_xfrm:
     // Transform light directions from camera space to model space, by
@@ -1993,7 +2137,7 @@ continue_light_dir_xfrm:
     // lookat lights and through all directional and point lights, but not
     // ambient. For point lights, the data is garbage but doesn't harm anything.
     bnez    $11, after_light_dir_xfrm // Skip calculating lights if they're not out of date
-     addi   tmpCurLight, tmpCurLight, spFxBase - lightSize // With ltBufOfs, points at top/max light.
+     addi   topLightPtr, topLightPtr, spFxBase - lightSize // With ltBufOfs, points at top/max light.
     sb      cmd_w0, lightsValid     // Set as valid, reusing state of w0
     /* Load MV matrix 3x3 transposed as:
     mxr0i 00 08 10 06 08 0A 0C 0E
@@ -2021,7 +2165,7 @@ continue_light_dir_xfrm:
     lsv     mxr2i[4], (mvMatrix + 0x14)($zero)
     vmov    mxr2f[0], mxr0f[2]
     // With ltBufOfs immediate add, points two lights behind lightBufferMain, i.e. lightBufferLookat.
-    xfrmLtPtr equ $20
+    xfrmLtPtr equ $20 // also input_mtx_1 and dmemAddr
     li      xfrmLtPtr, spFxBase - 2 * lightSize
     vmov    mxr2i[0], mxr0i[2]                   
     lpv     $v7[0], (ltBufOfs + 0x8)(xfrmLtPtr) // Load light direction
@@ -2046,7 +2190,7 @@ continue_light_dir_xfrm:
     vreadacc $v11, ACC_MIDDLE           // read the middle (bits 16..31) of the accumulator elements into v11
     sw      $12, (ltBufOfs + 0x14)(xfrmLtPtr) // Store duplicate of transformed light direction
     vreadacc $v15, ACC_UPPER            // read the upper (bits 32..47) of the accumulator elements into v15
-    beq     xfrmLtPtr, tmpCurLight, after_light_dir_xfrm    // exit if equal
+    beq     xfrmLtPtr, topLightPtr, after_light_dir_xfrm    // exit if equal
      vmudl  $v29, $v11, $v11            // calculate the low partial product of the accumulator squared (low * low)
     vmadm   $v29, $v15, $v11            // calculate the mid partial product of the accumulator squared (mid * low)
     vmadn   $v16, $v11, $v15            // calculate the mid partial product of the accumulator squared (low * mid)
@@ -2067,16 +2211,11 @@ continue_light_dir_xfrm:
     vmadm   $v29, $v15, $v16[0]
     vmadn   $v11, $v11, $v17[0]
     vmadh   $v15, $v15, $v17[0]
-.if (UCODE_IS_206_OR_OLDER)
-    i7 equ 7
-.else
-    i7 equ 3
-.endif
-    vmudn   $v11, $v11, $v30[i7]        // Scale results to become bytes
+    vmudn   $v11, $v11, $v30[i1]        // 0x0100; scale results to become bytes
     j       @@loop
-     vmadh  $v15, $v15, $v30[i7]        // Scale results to become bytes
+     vmadh  $v15, $v15, $v30[i1]        // 0x0100; scale results to become bytes
 
-curMatrix equ $12
+curMatrix equ $12 // Overwritten during texgen, but with a value which is 0 or positive, so means cur matrix is MV
 ltColor equ $v29
 vPairRGBA equ $v27
 vPairAlpha37 equ $v28 // Same as mvTc1f, but alpha values are left in elems 3, 7
@@ -2084,25 +2223,19 @@ vPairNX equ $v7 // also named vPairRGBATemp; with name vPairNX, uses X component
 vPairNY equ $v6
 vPairNZ equ $v5
 
-// For point lighting, but armips does not like these defined in an .if
-mvTc0i equ $v4
-mvTc1i equ $v21
-mvTc2i equ $v30
-mvTc0f equ $v3
-mvTc1f equ $v28
-mvTc2f equ $v31
-
 light_vtx:
     vadd    vPairNY, vZero, vPairRGBATemp[1h] // Move vertex normals Y to separate reg
 .if UCODE_HAS_POINT_LIGHTING
-    luv     ltColor[0], (ltBufOfs + lightSize + 0)(curLight) // Load next light color (ambient)
+    luv     ltColor[0], (ltBufOfs + lightSize + 0)(curLight) // Init to ambient light color
 .else
     lpv     $v20[0], (ltBufOfs - lightSize + 0x10)(curLight) // Load next below transformed light direction as XYZ_XYZ_ for lights_dircoloraccum2
 .endif
     vadd    vPairNZ, vZero, vPairRGBATemp[2h] // Move vertex normals Z to separate reg
     luv     vPairRGBA[0], 8(inputVtxPos)      // Load both verts' XYZAXYZA as unsigned
     vne     $v4, $v31, $v31[3h]               // Set VCC to 11101110
-.if UCODE_HAS_POINT_LIGHTING
+.if !UCODE_HAS_POINT_LIGHTING
+    luv     ltColor[0], (ltBufOfs + lightSize + 0)(curLight) // Init to ambient light color
+.else
     andi    $11, $5, G_LIGHTING_POSITIONAL_H  // check if point lighting is enabled in the geometry mode
     beqz    $11, directional_lighting         // If not enabled, use directional algorithm for everything
      li     curMatrix, mvpMatrix + 0x8000     // Set flag in negative to indicate cur mtx is MVP
@@ -2110,7 +2243,7 @@ light_vtx:
     suv     ltColor[0], 8(inputVtxPos)        // Store ambient light color to two verts' RGBARGBA
     ori     $11, $zero, 0x0004
     vmov    $v30[7], $v30[6]                  // v30[7] = 0x0010 because v30[0:2,4:6] will get clobbered
-    mtc2    $11, $v31[6]                      // v31[6] = 0x0004 (was previously 0x0420)
+    mtc2    $11, $v31[6]                      // v31 elem 3 (mtc2 index is a byte offset) = 0x0004 (was previously 0x7F00)
 next_light_dirorpoint:
     lbu     $11, (ltBufOfs + 0x3)(curLight)   // Load light type / constant attenuation value at light structure + 3
     bnez    $11, light_point                  // If not zero, this is a point light
@@ -2272,8 +2405,8 @@ light_point:
     vmadh   $v20, mvTc2i, $v20[2h]       // v20 = int result of vert-to-light in model space
     vmudm   $v2, $v20, $v29[3h]          // v2l_model * length normalization frac
     vmadh   $v20, $v20, $v29[2h]         // v2l_model * length normalization int
-    vmudn   $v2, $v2, $v31[3]            // this is 0x7F00; v31 is mvTc2f but elements 3 and 7 weren't overwritten
-    vmadh   $v20, $v20, $v31[3]          // scale to byte, only keep int part
+    vmudn   $v2, $v2, $v31[3]            // this is 0x0004; v31 is mvTc2f but elem 3 replaced, elem 7 left
+    vmadh   $v20, $v20, $v31[3]          // v20 = int part of (normalized v2l_model * v31[3])
     vmulu   $v2, vPairNX, $v20[0h]       // Normal X * normalized vert-to-light X
     mtc2    $11, $v27[8]                 // 0x3 << 4 -> v27 elems 4, 5
     vmacu   $v2, vPairNY, $v20[1h]       // Y * Y
@@ -2291,7 +2424,7 @@ light_point:
     vmudl   $v2, $v2, $v2[0h]            // squared
     vmulf   $v29, $v29, $v20[3]          // Length * byte 0x7
     vmadm   $v29, $v2, $v20[7]           // + (scaled length squared) * byte 0xE << 5
-    vmadn   $v29, $v27, $v30[3]          // + (byte 0x3 << 4) * 0xFFF0
+    vmadn   $v29, $v27, $v30[3]          // + (byte 0x3 << 4) * 0x0100
     vreadacc $v2, ACC_MIDDLE
     vrcph   $v2[0], $v2[0]               // v2 int, v29 frac: function of distance to light
     vrcpl   $v2[0], $v29[0]              // Reciprocal = inversely proportional
@@ -2312,8 +2445,6 @@ light_point:
     j       after_dirorpoint_loop
 directional_lighting:
      lpv     $v20[0], (ltBufOfs - lightSize + 0x10)(curLight) // Load next light transformed dir; this value is overwritten with the same thing
-.else // No point lighting
-    luv     ltColor[0], (ltBufOfs + lightSize + 0)(curLight) // Init to ambient light color
 .endif
 
 // Loop for dot product normals and multiply-add color for 2 lights
diff --git a/rsp/gbi.inc b/rsp/gbi.inc
index 113bd3d..c8d03af 100644
--- a/rsp/gbi.inc
+++ b/rsp/gbi.inc
@@ -1,14 +1,5 @@
 // GBI: Graphics Binary Interface
 
-// XBUS doesn't use the full yield data buffer, but does reference the full size,
-// so a definition for both sizes needs to exist.
-OS_YIELD_DATA_SIZE_TOTAL equ 0xC00
-.if UCODE_METHOD == METHOD_FIFO
-OS_YIELD_DATA_SIZE equ 0xC00
-.else // UCODE_METHOD == METHOD_XBUS
-OS_YIELD_DATA_SIZE equ 0x920
-.endif
-
 // Geometry mode flags
 // First byte gets OR'd with 0xC8 to form triangle opcode. So we can use 0x80,
 // 0x40, and 0x08 for other purposes while remaining forward compatible. (Of
diff --git a/rsp/rsp_defs.inc b/rsp/rsp_defs.inc
index ccb6b1a..384ffe8 100644
--- a/rsp/rsp_defs.inc
+++ b/rsp/rsp_defs.inc
@@ -2,6 +2,9 @@
 
 // OSTask placed at end of DMEM (IMEM_START - sizeof(OSTask))
 .definelabel OSTask_addr, 0xFC0
+
+OS_YIELD_DATA_SIZE equ 0xC00
+
 // OSTask data member offsets
 OSTask_type             equ 0x0000
 OSTask_flags            equ 0x0004