From 99b69e3f2bd36d232583eacb3c3c241df7e46dc8 Mon Sep 17 00:00:00 2001 From: Sauraen Date: Fri, 17 Feb 2023 18:18:33 -0800 Subject: [PATCH] Document registers (#14) * More clipping and tri docs * Documented clipping flags * Clipping docs almost done * Clipping documented * More vertex related docs added * Fixed some vtrans vscale explanations * More register docs * Finished basic registers documentation * Tweaks for Tharo * Addressed Wiseguy comments, added some documentation of ovl0 * Fixed an XBUS bug * Tweaks to comments and moving an instruction before point lighting for clarity --- f3dex2.s | 715 ++++++++++++++++++++++++++++------------------- rsp/gbi.inc | 9 - rsp/rsp_defs.inc | 3 + 3 files changed, 426 insertions(+), 301 deletions(-) diff --git a/f3dex2.s b/f3dex2.s index fa3e345..d116704 100644 --- a/f3dex2.s +++ b/f3dex2.s @@ -125,17 +125,6 @@ pMatrix: mvpMatrix: .fill 64 -// Not global names, but used multiple times with the same meaning. -// Matrix row X integer/fractional -mxr0i equ $v8 -mxr1i equ $v9 -mxr2i equ $v10 -mxr3i equ $v11 -mxr0f equ $v12 -mxr1f equ $v13 -mxr2f equ $v14 -mxr3f equ $v15 - // 0x00C0-0x00C8: scissor (four 12-bit values) scissorUpLeft: // the command byte is included since the command word is copied verbatim .dw (G_SETSCISSOR << 24) | (( 0 * 4) << 12) | (( 0 * 4) << 0) @@ -200,7 +189,6 @@ displayListStack: // fixed offsets to things in this region. Perhaps had to do with DMEM overlays // at some point in development. spFxBase: -spFxBaseReg equ $13 // 0x0180-0x1B0: clipping values clipRatio: // This is an array of 6 doublewords @@ -219,35 +207,41 @@ clipRatio: // This is an array of 6 doublewords // 0x1B0: constants for register $v31 .align 0x10 // loaded with lqv +// VCC patterns used: +// vlt xxx, $v31, $v31[3] = 11101110 in load_spfx_global_values +// vne xxx, $v31, $v31[3h] = 11101110 in lighting +// veq xxx, $v31, $v31[3h] = 00010001 in lighting v31Value: - .dh 0xFFFF // -1 - .dh 0x0004 // 4 - .dh 0x0008 // 8 - .dh 0x7F00 // 32512 - .dh 0xFFFC // -4 - .dh 0x4000 // 16384 - .dh vertexBuffer // 0x420 - .dh 0x7FFF // 32767 + .dh -1 // used in init, clipping + .dh 4 // used in clipping, vtx write + .dh 8 // old ucode only: used in tri write + .dh 0x7F00 // used in vtx write and pre-jump instrs to there, also 4 put here during point lighting + .dh -4 // used in clipping, vtx write + .dh 0x4000 // used in tri write, texgen + .dh vertexBuffer // 0x420; used in tri write + .dh 0x7FFF // used in vtx write, tri write, lighting, point lighting // 0x1C0: constants for register $v30 .align 0x10 // loaded with lqv +// VCC patterns used: +// vge xxx, $v30, $v30[7] = 11110001 in tri write v30Value: - .dh 0x7FFC - .dh vtxSize << 7 // 0x1400 + .dh 0x7FFC // not used! + .dh vtxSize << 7 // 0x1400; used in tri write for vtx index to addr .if (UCODE_IS_206_OR_OLDER) - .dh 0x01CC - .dh 0x0200 - .dh 0xFFF0 - .dh 0x0010 - .dh 0x0020 - .dh 0x0100 + .dh 0x01CC // used in tri write, vcr? + .dh 0x0200 // not used! + .dh -16 // used in tri write, some signed multiplier + .dh 0x0010 // used in tri write, some accumulator init value + .dh 0x0020 // used in tri write, both signed and unsigned multipliers + .dh 0x0100 // used in tri write, vertex color >>= 8; also in lighting .else - .dh 0x1000 - .dh 0x0100 - .dh 0xFFF0 - .dh 0xFFF8 - .dh 0x0010 - .dh 0x0020 + .dh 0x1000 // used in tri write, some multiplier + .dh 0x0100 // used in tri write, vertex color >>= 8 and vcr?; also in lighting and point lighting + .dh -16 // used in tri write, some signed multiplier + .dh 0xFFF8 // used in tri write, mask away lower ST bits? + .dh 0x0010 // used in tri write, some accumulator init value; value moved to elem 7 for point lighting + .dh 0x0020 // used in tri write, both signed and unsigned multipliers; value moved from elem 6 from point lighting .endif .align 0x10 // loaded with lqv @@ -255,7 +249,7 @@ linearGenerateCoefficients: .dh 0xC000 .dh 0x44D3 .dh 0x6CB3 - .dh 0x0002 + .dh 2 // 0x01D8 .db 0x00 // Padding to allow mvpValid to be written to as a 32-bit word @@ -322,9 +316,6 @@ lightBufferMain: .fill (8 * lightSize) // Code uses pointers relative to spFxBase, with immediate offsets, so that // another register isn't needed to store the start or end address of the array. -curLight equ $9 // With ltBufOfs immediate added, points to current light - // (current max in list, counting down). -tmpCurLight equ $6 // Same meaning, another register. // Pointers are kept relative to spFxBase; this offset gets them to point to // lightBufferMain instead. ltBufOfs equ (lightBufferMain - spFxBase) @@ -495,6 +486,13 @@ overlayInfo3: vertexBuffer: .skip (vtxSize * 32) // 32 vertices +.if . > OS_YIELD_DATA_SIZE - 8 + // OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved; the last two words are + // the ucode and the DL pointer. Make sure anything past there is temporary. + // (Input buffer will be reloaded from next instruction in the source DL.) + .error "Important things in DMEM will not be saved at yield!" +.endif + // 0x0920-0x09C8: Input buffer inputBuffer: inputBufferLength equ 0xA8 @@ -547,27 +545,178 @@ OSTask: // RSP IMEM .create CODE_FILE, 0x00001080 -// Global registers -secondVtxPos equ $8 -outputVtxPos equ $15 -clipFlags equ $16 -clipPolyRead equ $17 -clipPolySelect equ $18 -rdpCmdBufEnd equ $22 -rdpCmdBufPtr equ $23 -cmd_w1 equ $24 -cmd_w0 equ $25 -taskDataPtr equ $26 -inputBufferPos equ $27 -savedActiveClipPlanes equ $29 -savedRA equ $30 +//////////////////////////////////////////////////////////////////////////////// +/////////////////////////////// Register Use Map /////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +// Registers marked as "global" are only used for one purpose in the vanilla +// microcode. However, this does not necessarily mean they can't be used for +// other things in mods--this depends on which group they're listed in below. + +// Note that these lists do not cover registers which are just used locally in +// a particular region of code--you're still responsible for not breaking the +// code you modify. This is designed to help you avoid breaking one part of the +// code by modifying a different part. + +// Local register definitions are included with their code, not here. + +// These registers are used globally, and their values can't be rebuilt, so +// they should never be used for anything besides their original purpose. +// $zero // global +rdpCmdBufEnd equ $22 // global +rdpCmdBufPtr equ $23 // global +taskDataPtr equ $26 // global +inputBufferPos equ $27 // global +// $ra // global + +// These registers are used throughout the codebase and expected to have +// certain values, but you're free to overwrite them as long as you +// reconstruct the normal values after you're done (in fact point lighting does +// this for $v30 and $v31). +vZero equ $v0 // global +vOne equ $v1 // global +// $v30 // global +// $v31 // global + +// Must keep values during the full clipping process: clipping overlay, vertex +// write, tri drawing. +clipPolySelect equ $18 // global +clipPolyWrite equ $21 // also input_mtx_0 +savedActiveClipPlanes equ $29 // global +savedRA equ $30 // global + +// Must keep values during the first part of the clipping process only: polygon +// subdivision and vertex write. +// $2: vertex at end of edge +clipMaskIdx equ $5 +secondVtxPos equ $8 // global +outputVtxPos equ $15 // global +clipFlags equ $16 // global +clipPolyRead equ $17 // global + +// Must keep values during tri drawing. +// They are also used throughout the codebase, but can be overwritten once their +// use has been fulfilled for the specific command. +cmd_w1_dram equ $24 // Command word 1, which is also DMA DRAM addr; almost global, occasionally used locally +cmd_w0 equ $25 // Command word 0; almost global, occasionally used locally + +// Must keep values during the full vertex process: load, lighting, and vertex write +// $1: count of remaining vertices +topLightPtr equ $6 // Used locally elsewhere +curLight equ $9 // Used locally elsewhere +inputVtxPos equ $14 // global +mxr0i equ $v8 // "matrix row 0 int part" +mxr1i equ $v9 // All of these used locally elsewhere +mxr2i equ $v10 +mxr3i equ $v11 +mxr0f equ $v12 +mxr1f equ $v13 +mxr2f equ $v14 +mxr3f equ $v15 +vPairST equ $v22 // global +vPairMVPPosF equ $v23 // global +vPairMVPPosI equ $v24 // global +// v25: prev vertex screen pos +// v26: prev vertex screen Z +// For point lighting +mvTc0f equ $v3 +mvTc0i equ $v4 +mvTc1i equ $v21 +mvTc1f equ $v28 // same as vPairAlpha37 +mvTc2i equ $v30 +mvTc2f equ $v31 + +// Values set up by load_spfx_global_values, which must be kept during the full +// vertex process, and which are reloaded for each vert during clipping. See +// that routine for the detailed contents of each of these registers. +// secondVtxPos +spFxBaseReg equ $13 // global +vVpFgScale equ $v16 // All of these used locally elsewhere +vVpFgOffset equ $v17 +vVpMisc equ $v18 +vFogMask equ $v19 +vVpNegScale equ $v21 + +// Arguments to mtx_multiply +output_mtx equ $19 // also dmaLen, also used by itself +input_mtx_1 equ $20 // also dmemAddr and xfrmLtPtr +input_mtx_0 equ $21 // also clipPolyWrite + +// Arguments to dma_read_write +dmaLen equ $19 // also output_mtx, also used by itself +dmemAddr equ $20 // also input_mtx_1 and xfrmLtPtr +// cmd_w1_dram // used for all dma_read_write DRAM addresses, not just second word of command + +// Arguments to load_overlay_and_enter +ovlTableEntry equ $11 // Commonly used locally +postOvlRA equ $12 // Commonly used locally + +// ==== Summary of uses of all registers +// $zero: Hardwired zero scalar register +// $1: vertex 1 addr, count of remaining vertices, pointer to store texture coefficients, local +// $2: vertex 2 addr, vertex at end of edge in clipping, pointer to store shade coefficients, local +// $3: vertex 3 addr, vertex at start of edge in clipping, local +// $4: pre-shuffle vertex 1 addr for flat shading, local +// $5: clipMaskIdx, geometry mode high short during vertex load / lighting, local +// $6: topLightPtr, geometry mode low byte during tri write, local +// $7: fog flag in vtx write, local +// $8: secondVtxPos +// $9: curLight, local +// $10: briefly used local in vtx write +// $11: ovlTableEntry, very common local +// $12: postOvlRA, curMatrix, local +// $13: spFxBaseReg +// $14: inputVtxPos +// $15: outputVtxPos +// $16: clipFlags +// $17: clipPolyRead +// $18: clipPolySelect +// $19: dmaLen, output_mtx, briefly used local +// $20: dmemAddr, input_mtx_1, xfrmLtPtr +// $21: clipPolyWrite, input_mtx_0 +// $22: rdpCmdBufEnd +// $23: rdpCmdBufPtr +// $24: cmd_w1_dram, local +// $25: cmd_w0 +// $26: taskDataPtr +// $27: inputBufferPos +// $28: not used! +// $29: savedActiveClipPlanes +// $30: savedRA +// $ra: Return address for jal, b*al +// $v0: vZero (every element 0) +// $v1: vOne (every element 1) +// $v2: very common local +// $v3: mvTc0f, local +// $v4: mvTc0i, local +// $v5: vPairNZ, local +// $v6: vPairNY, local +// $v7: vPairNX, vPairRGBATemp, local +// $v8: mxr0i, local +// $v9: mxr1i, local +// $v10: mxr2i, local +// $v11: mxr3i, local +// $v12: mxr0f, local +// $v13: mxr1f, local +// $v14: mxr2f, local +// $v15: mxr3f, local +// $v16: vVpFgScale, local +// $v17: vVpFgOffset, local +// $v18: vVpMisc, local +// $v19: vFogMask, local +// $v20: local +// $v21: mvTc1i, vVpNegScale, local +// $v22: vPairST +// $v23: vPairMVPPosF +// $v24: vPairMVPPosI +// $v25: prev vertex data, local +// $v26: prev vertex data, local +// $v27: vPairRGBA, local +// $v28: mvTc1f, vPairAlpha37, local +// $v29: very common local +// $v30: mvTc2i, constant values for tri write +// $v31: mvTc2f, general constant values -// Global vector registers -vZero equ $v0 -vOne equ $v1 -vPairST equ $v22 -vPairMVPPosF equ $v23 -vPairMVPPosI equ $v24 // Initialization routines // Everything up until displaylist_dma will get overwritten by ovl0 and/or ovl1 @@ -595,7 +744,7 @@ start: // This is at IMEM 0x1080, not the start of IMEM beqz $12, calculate_overlay_addrs // skip overlay address calculations if resumed from yield? sw $zero, OSTask + OSTask_flags j load_overlay1_init // Skip the initialization and go straight to loading overlay 1 - lw taskDataPtr, OS_YIELD_DATA_SIZE - 8 + lw taskDataPtr, OS_YIELD_DATA_SIZE - 8 // Was previously saved here at yield time task_init: mfc0 $11, DPC_STATUS andi $11, $11, DPC_STATUS_XBUS_DMA @@ -640,7 +789,7 @@ wait_dpc_start_valid: beqz $12, f3dzex_xbus_0000111C sw $zero, OSTask + OSTask_flags j load_overlay1_init - lw taskDataPtr, OS_YIELD_DATA_SIZE_TOTAL - 8 + lw taskDataPtr, OS_YIELD_DATA_SIZE - 8 // Was previously saved here at yield time .fill 16 * 4 // Bunch of nops here to make it the same size as the fifo code. f3dzex_xbus_0000111C: .endif @@ -664,7 +813,7 @@ calculate_overlay_addrs: sw $5, overlayInfo3 + overlay_load lw taskDataPtr, OSTask + OSTask_data_ptr load_overlay1_init: - li $11, overlayInfo1 // set up loading of overlay 1 + li ovlTableEntry, overlayInfo1 // set up loading of overlay 1 // Make room for overlays 0 and 1. Normally, overlay 1 ends exactly at ovl01_end, // and overlay 0 is much shorter, but if things are modded this constraint must be met. @@ -680,18 +829,19 @@ load_overlay1_init: // which is displaylist_dma. So the padding has to be before these two instructions, // so that this is immediately before displaylist_dma; otherwise the return address // will be in the last few instructions of overlay 1. However, this was unnecessary-- -// it could have been a jump and then `addiu $12, $zero, displaylist_dma`. +// it could have been a jump and then `addiu postOvlRA, $zero, displaylist_dma`, +// and the padding put after this. jal load_overlay_and_enter // load overlay 1 and enter - move $12, $ra // set up the return address, since load_overlay_and_enter returns to $12 + move postOvlRA, $ra // set up the return address, since load_overlay_and_enter returns to postOvlRA ovl01_end: // Overlays 0 and 1 overwrite everything up to this point (2.08 versions overwrite up to the previous .align 8) displaylist_dma: // loads inputBufferLength bytes worth of displaylist data via DMA into inputBuffer - li $19, inputBufferLength - 1 // set the DMA length - move $24, taskDataPtr // set up the DRAM address to read from - jal dma_read_write // initiate the DMA read - la $20, inputBuffer // set the address to DMA read to + li dmaLen, inputBufferLength - 1 // set the DMA length + move cmd_w1_dram, taskDataPtr // set up the DRAM address to read from + jal dma_read_write // initiate the DMA read + la dmemAddr, inputBuffer // set the address to DMA read to addiu taskDataPtr, taskDataPtr, inputBufferLength // increment the DRAM address to read from next time li inputBufferPos, -inputBufferLength // reset the DL word index wait_for_dma_and_run_next_command: @@ -713,46 +863,44 @@ run_next_DL_command: sra $12, cmd_w0, 24 // extract DL command byte from command word sll $11, $12, 1 // multiply command byte by 2 to get jump table offset lhu $11, (commandJumpTable)($11) // get command subroutine address from command jump table - bnez $1, load_overlay_0_and_enter // load and execute overlay 0 if yielding - lw cmd_w1, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1 - jr $11 // jump to the loaded command handler + bnez $1, load_overlay_0_and_enter // load and execute overlay 0 if yielding; $1 > 0 + lw cmd_w1_dram, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1_dram + jr $11 // jump to the loaded command handler; $1 == 0 addiu inputBufferPos, inputBufferPos, 0x0008 // increment the DL index by 2 words .if (UCODE_IS_F3DEX2_204H) // Microcodes besides F3DEX2 2.04H have this as a noop G_SPECIAL_1_handler: // Seems to be a manual trigger for mvp recalculation li $ra, run_next_DL_command - li $21, pMatrix - li $20, mvMatrix - li $19, mvpMatrix + li input_mtx_0, pMatrix + li input_mtx_1, mvMatrix + li output_mtx, mvpMatrix j mtx_multiply sb cmd_w0, mvpValid .endif G_DMA_IO_handler: - jal segmented_to_physical // Convert the provided segmented address (in cmd_w1) to a virtual one - lh $20, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command) - andi $19, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment - // At this point, $20's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size + jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one + lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command) + andi dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment + // At this point, dmemAddr's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit - sra $20, $20, 2 - j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of $20) + sra dmemAddr, dmemAddr, 2 + j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr) li $ra, wait_for_dma_and_run_next_command // Setup the return address for running the next DL command -geometryMode equ $11 G_GEOMETRYMODE_handler: - lw geometryMode, geometryModeLabel // load the geometry mode value - and geometryMode, geometryMode, cmd_w0 // clears the flags in cmd_w0 (set in g*SPClearGeometryMode) - or geometryMode, geometryMode, cmd_w1 // sets the flags in cmd_w1 (set in g*SPSetGeometryMode) - j run_next_DL_command // run the next DL command - sw geometryMode, geometryModeLabel // update the geometry mode value + lw $11, geometryModeLabel // load the geometry mode value + and $11, $11, cmd_w0 // clears the flags in cmd_w0 (set in g*SPClearGeometryMode) + or $11, $11, cmd_w1_dram // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode) + j run_next_DL_command // run the next DL command + sw $11, geometryModeLabel // update the geometry mode value -dlStackIdx equ $1 G_ENDDL_handler: - lbu dlStackIdx, displayListStackLength // Load the DL stack index - beqz dlStackIdx, load_overlay_0_and_enter // Load overlay 0 if there is no DL return address, to end the graphics task processing - addi dlStackIdx, dlStackIdx, -4 // Decrement the DL stack index - j f3dzex_ovl1_00001020 // has a different version in ovl1 - lw taskDataPtr, (displayListStack)(dlStackIdx) // Load the address of the DL to return to into the taskDataPtr (the current DL address) + lbu $1, displayListStackLength // Load the DL stack index + beqz $1, load_overlay_0_and_enter // Load overlay 0 if there is no DL return address, to end the graphics task processing; $1 < 0 + addi $1, $1, -4 // Decrement the DL stack index + j f3dzex_ovl1_00001020 // has a different version in ovl1 + lw taskDataPtr, (displayListStack)($1) // Load the address of the DL to return to into the taskDataPtr (the current DL address) G_RDPHALF_2_handler: ldv $v29[0], (texrectWord1)($zero) @@ -760,7 +908,7 @@ G_RDPHALF_2_handler: addi rdpCmdBufPtr, rdpCmdBufPtr, 8 sdv $v29[0], -8(rdpCmdBufPtr) G_RDP_handler: - sw cmd_w1, 4(rdpCmdBufPtr) // Add the second word of the command to the RDP command buffer + sw cmd_w1_dram, 4(rdpCmdBufPtr) // Add the second word of the command to the RDP command buffer G_SYNC_handler: G_NOOP_handler: sw cmd_w0, 0(rdpCmdBufPtr) // Add the command word to the RDP command buffer @@ -769,25 +917,25 @@ G_NOOP_handler: G_SETxIMG_handler: li $ra, G_RDP_handler // Load the RDP command handler into the return address, then fall through to convert the address to virtual -// Converts the segmented address in $24 (also cmd_w1) to the corresponding physical address +// Converts the segmented address in cmd_w1_dram to the corresponding physical address segmented_to_physical: - srl $11, $24, 22 // Copy (segment index << 2) into $11 - andi $11, $11, 0x3C // Clear the bottom 2 bits that remained during the shift - lw $11, (segmentTable)($11) // Get the current address of the segment - sll $24, $24, 8 // Shift the address to the left so that the top 8 bits are shifted out - srl $24, $24, 8 // Shift the address back to the right, resulting in the original with the top 8 bits cleared + srl $11, cmd_w1_dram, 22 // Copy (segment index << 2) into $11 + andi $11, $11, 0x3C // Clear the bottom 2 bits that remained during the shift + lw $11, (segmentTable)($11) // Get the current address of the segment + sll cmd_w1_dram, cmd_w1_dram, 8 // Shift the address to the left so that the top 8 bits are shifted out + srl cmd_w1_dram, cmd_w1_dram, 8 // Shift the address back to the right, resulting in the original with the top 8 bits cleared jr $ra - add $24, $24, $11 // Add the segment's address to the masked input address, resulting in the virtual address + add cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the virtual address G_RDPSETOTHERMODE_handler: - sw cmd_w0, otherMode0 // Record the local otherMode0 copy - j G_RDP_handler // Send the command to the RDP - sw cmd_w1, otherMode1 // Record the local otherMode1 copy + sw cmd_w0, otherMode0 // Record the local otherMode0 copy + j G_RDP_handler // Send the command to the RDP + sw cmd_w1_dram, otherMode1 // Record the local otherMode1 copy G_SETSCISSOR_handler: - sw cmd_w0, scissorUpLeft // Record the local scissorUpleft copy - j G_RDP_handler // Send the command to the RDP - sw cmd_w1, scissorBottomRight // Record the local scissorBottomRight copy + sw cmd_w0, scissorUpLeft // Record the local scissorUpleft copy + j G_RDP_handler // Send the command to the RDP + sw cmd_w1_dram, scissorBottomRight // Record the local scissorBottomRight copy check_rdp_buffer_full_and_run_next_cmd: li $ra, run_next_DL_command // Set up running the next DL command as the return address @@ -798,42 +946,42 @@ check_rdp_buffer_full: blez $11, return_routine // Return if rdpCmdBufEnd >= rdpCmdBufPtr flush_rdp_buffer: mfc0 $12, SP_DMA_BUSY - lw $24, rdpFifoPos - addiu $19, $11, RDP_CMD_BUFSIZE + lw cmd_w1_dram, rdpFifoPos + addiu dmaLen, $11, RDP_CMD_BUFSIZE bnez $12, flush_rdp_buffer lw $12, OSTask + OSTask_output_buff_size - mtc0 $24, DPC_END - add $11, $24, $19 + mtc0 cmd_w1_dram, DPC_END + add $11, cmd_w1_dram, dmaLen sub $12, $12, $11 bgez $12, f3dzex_000012A8 @@await_start_valid: mfc0 $11, DPC_STATUS andi $11, $11, DPC_STATUS_START_VALID bnez $11, @@await_start_valid - lw $24, OSTask + OSTask_output_buff + lw cmd_w1_dram, OSTask + OSTask_output_buff f3dzex_00001298: mfc0 $11, DPC_CURRENT - beq $11, $24, f3dzex_00001298 + beq $11, cmd_w1_dram, f3dzex_00001298 nop - mtc0 $24, DPC_START + mtc0 cmd_w1_dram, DPC_START f3dzex_000012A8: mfc0 $11, DPC_CURRENT - sub $11, $11, $24 + sub $11, $11, cmd_w1_dram blez $11, f3dzex_000012BC - sub $11, $11, $19 + sub $11, $11, dmaLen blez $11, f3dzex_000012A8 f3dzex_000012BC: - add $11, $24, $19 + add $11, cmd_w1_dram, dmaLen sw $11, rdpFifoPos // Set up the DMA from DMEM to the RDP fifo in RDRAM - addi $19, $19, -1 // subtract 1 from the length - addi $20, rdpCmdBufEnd, -(0x2000 | RDP_CMD_BUFSIZE) // The 0x2000 is meaningless, negative means write + addi dmaLen, dmaLen, -1 // subtract 1 from the length + addi dmemAddr, rdpCmdBufEnd, -(0x2000 | RDP_CMD_BUFSIZE) // The 0x2000 is meaningless, negative means write xori rdpCmdBufEnd, rdpCmdBufEnd, rdpCmdBuffer1End ^ rdpCmdBuffer2End // Swap between the two RDP command buffers j dma_read_write addi rdpCmdBufPtr, rdpCmdBufEnd, -RDP_CMD_BUFSIZE .else // UCODE_METHOD == METHOD_XBUS check_rdp_buffer_full: - addi $11, rdpCmdBufPtr, -0xF10 + addi $11, rdpCmdBufPtr, -(OSTask - RDP_CMD_BUFSIZE_EXCESS) blez $11, ovl0_04001284 mtc0 rdpCmdBufPtr, DPC_END ovl0_04001260: @@ -851,7 +999,7 @@ ovl0_04001284: mfc0 $11, DPC_CURRENT sub $11, $11, rdpCmdBufPtr blez $11, ovl0_0400129C - addi $11, $11, -0xB0 + addi $11, $11, -RDP_CMD_BUFSIZE_EXCESS blez $11, ovl0_04001284 nop ovl0_0400129C: @@ -864,17 +1012,12 @@ ovl23_start: ovl3_start: -clipPolyWrite equ $21 -clipMaskIdx equ $5 - -vPairRGBATemp equ $v7 - // Jump here to do lighting. If overlay 3 is loaded (this code), loads and jumps // to overlay 2 (same address as right here). -ovl23_lighting_entrypoint_copy: // same IMEM address as ovl23_lighting_entrypoint - li $11, overlayInfo2 // set up a load for overlay 2 - j load_overlay_and_enter // load overlay 2 - li $12, ovl23_lighting_entrypoint // set the return address +ovl23_lighting_entrypoint_copy: // same IMEM address as ovl23_lighting_entrypoint + li ovlTableEntry, overlayInfo2 // set up a load for overlay 2 + j load_overlay_and_enter // load overlay 2 + li postOvlRA, ovl23_lighting_entrypoint // set the return address // Jump here to do clipping. If overlay 3 is loaded (this code), directly starts // the clipping code. @@ -925,7 +1068,7 @@ clipping_skipswap23: // After possible swap, $19 = vtx not meeting clip cond / o vmadn $v10, $v6, $v3 // frac: - vtx off screen * clip ratio vmadh $v11, $v7, $v3 // int: - vtx off screen * clip ratio 11:10 vaddc $v8, $v8, $v8[0q] // frac: y += x, w += z, vtx on screen only - lqv $v25[0], (linearGenerateCoefficients)($zero) // Used just to load the value 0x0002 + lqv $v25[0], (linearGenerateCoefficients)($zero) // Used just to load the value 2 vadd $v9, $v9, $v9[0q] // int: y += x, w += z, vtx on screen only vaddc $v10, $v10, $v10[0q] // frac: y += x, w += z, vtx on screen - vtx off screen vadd $v11, $v11, $v11[0q] // int: y += x, w += z, vtx on screen - vtx off screen @@ -1066,39 +1209,33 @@ ovl3_end: ovl23_end: -inputVtxPos equ $14 -// See load_spfx_global_values for detailed contents -vVpFgScale equ $v16 -vVpFgOffset equ $v17 -vVpMisc equ $v18 -vFogMask equ $v19 -vVpNegScale equ $v21 +vPairRGBATemp equ $v7 G_VTX_handler: - lhu $20, (vertexTable)(cmd_w0) // Load the address of the provided vertex array - jal segmented_to_physical // Convert the vertex array's segmented address (in $24) to a virtual one + lhu dmemAddr, (vertexTable)(cmd_w0) // Load the address of the provided vertex array + jal segmented_to_physical // Convert the vertex array's segmented address (in cmd_w1_dram) to a virtual one lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // Load the size of the vertex array to copy into reg $1 - sub $20, $20, $1 // Calculate the address to DMA the provided vertices into + sub dmemAddr, dmemAddr, $1 // Calculate the address to DMA the provided vertices into jal dma_read_write // DMA read the vertices from DRAM - addi $19, $1, -1 // Set up the DMA length + addi dmaLen, $1, -1 // Set up the DMA length lhu $5, geometryModeLabel // Load the geometry mode into $5 srl $1, $1, 3 sub outputVtxPos, cmd_w0, $1 lhu outputVtxPos, (vertexTable)(outputVtxPos) - move inputVtxPos, $20 + move inputVtxPos, dmemAddr lbu secondVtxPos, mvpValid // used as temp reg - andi tmpCurLight, $5, G_LIGHTING_H // If no lighting, tmpCurLight is 0, skips transforming light dirs and setting this up as a pointer - bnez tmpCurLight, ovl23_lighting_entrypoint // Run overlay 2 for lighting, either directly or via overlay 3 loading overlay 2 + andi topLightPtr, $5, G_LIGHTING_H // If no lighting, topLightPtr is 0, skips transforming light dirs and setting this up as a pointer + bnez topLightPtr, ovl23_lighting_entrypoint // Run overlay 2 for lighting, either directly or via overlay 3 loading overlay 2 andi $7, $5, G_FOG_H after_light_dir_xfrm: bnez secondVtxPos, vertex_skip_recalc_mvp // Skip recalculating the mvp matrix if it's already up-to-date sll $7, $7, 3 // $7 is 8 if G_FOG is set, 0 otherwise sb cmd_w0, mvpValid // Set mvpValid - li $21, pMatrix // Arguments to mtx_multiply - li $20, mvMatrix + li input_mtx_0, pMatrix // Arguments to mtx_multiply + li input_mtx_1, mvMatrix // Calculate the MVP matrix jal mtx_multiply - li $19, mvpMatrix + li output_mtx, mvpMatrix vertex_skip_recalc_mvp: /* Load MVP matrix as follows--note that translation is in the bottom row, @@ -1143,7 +1280,7 @@ vertices_process_pair: vmadh $v29, mxr3i, vOne[0] llv vPairST[12], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // load the texture coords of the 1st vertex into second half of vPairST vmadn $v29, mxr0f, $v20[0h] - move curLight, tmpCurLight + move curLight, topLightPtr vmadh $v29, mxr0i, $v20[0h] lpv $v2[0], (ltBufOfs + 0x10)(curLight) // First instruction of lights_dircoloraccum2 loop; load light transformed dir vmadn $v29, mxr1f, $v20[1h] @@ -1151,7 +1288,7 @@ vertices_process_pair: vmadh $v29, mxr1i, $v20[1h] lpv vPairRGBATemp[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // Load both vertex's colors/normals into v7's elements RGBARGBA or XYZAXYZA vmadn vPairMVPPosF, mxr2f, $v20[2h] // vPairMVPPosF = MVP * vpos result frac - bnez tmpCurLight, light_vtx // Zero if lighting disabled, pointer if enabled + bnez topLightPtr, light_vtx // Zero if lighting disabled, pointer if enabled vmadh vPairMVPPosI, mxr2i, $v20[2h] // vPairMVPPosI = MVP * vpos result int // These two instructions are repeated at the end of all the lighting codepaths, // since they're skipped here if lighting is being performed @@ -1309,12 +1446,12 @@ load_spfx_global_values: llv $v29[0], (fogFactor - spFxBase)(spFxBaseReg) // Load fog multiplier and offset ldv vVpFgOffset[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7 ldv vVpFgOffset[8], (viewport + 8)($zero) - vlt vFogMask, $v31, $v31[3] // VCC = [0, 0, 0, 1, 0, 0, 0, 1] + vlt vFogMask, $v31, $v31[3] // VCC = 11101110 vsub vVpNegScale, vZero, vVpFgScale // -vscale llv vVpMisc[4], (textureSettings2 - spFxBase)(spFxBaseReg) // Texture ST scale vmrg vVpFgScale, vVpFgScale, $v29[0] // Put fog multiplier in elements 3,7 of vscale llv vVpMisc[12], (textureSettings2 - spFxBase)(spFxBaseReg) // Texture ST scale - vmrg vFogMask, vZero, vOne[0] // Put 0 in elems 3,7, 1 in others + vmrg vFogMask, vZero, vOne[0] // Put 0 in most elems, 1 in elems 3,7 llv vVpMisc[8], (perspNorm)($zero) // Perspective normalization long (actually short) vmrg vVpFgOffset, vVpFgOffset, $v29[1] // Put fog offset in elements 3,7 of vtrans lsv vVpMisc[10], (clipRatio + 6 - spFxBase)(spFxBaseReg) // Clip ratio (-x version, but normally +/- same in all dirs) @@ -1324,13 +1461,13 @@ load_spfx_global_values: G_TRI2_handler: G_QUAD_handler: - jal tri_to_rdp // Send second tri; return here for first tri - sw cmd_w1, 4(rdpCmdBufPtr) // Put second tri indices in temp memory + jal tri_to_rdp // Send second tri; return here for first tri + sw cmd_w1_dram, 4(rdpCmdBufPtr) // Put second tri indices in temp memory G_TRI1_handler: - li $ra, run_next_DL_command // After done with this tri, run next cmd - sw cmd_w0, 4(rdpCmdBufPtr) // Put first tri indices in temp memory + li $ra, run_next_DL_command // After done with this tri, run next cmd + sw cmd_w0, 4(rdpCmdBufPtr) // Put first tri indices in temp memory tri_to_rdp: - lpv $v2[0], 0(rdpCmdBufPtr) // Load tri indexes to vector unit for shuffling + lpv $v2[0], 0(rdpCmdBufPtr) // Load tri indexes to vector unit for shuffling // read the three vertex indices from the stored command word lbu $1, 0x0005(rdpCmdBufPtr) // $1 = vertex 1 index lbu $2, 0x0006(rdpCmdBufPtr) // $2 = vertex 2 index @@ -1605,9 +1742,9 @@ no_textures: vmadh $v9, $v3, $v15[3] sdv $v7[0], 0x0028($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (integer) vmudn $v29, $v5, vOne[0] - sdv $v6[8], 0x0038($1) // Store DsDy, DtDy, DwDy texture coefficeints (fractional) + sdv $v6[8], 0x0038($1) // Store DsDy, DtDy, DwDy texture coefficients (fractional) vmadh $v29, $v18, vOne[0] - sdv $v7[8], 0x0028($1) // Store DsDy, DtDy, DwDy texture coefficeints (integer) + sdv $v7[8], 0x0028($1) // Store DsDy, DtDy, DwDy texture coefficients (integer) vmadl $v29, $v8, $v4[1] sdv $v8[0], 0x0030($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (fractional) vmadm $v29, $v9, $v4[1] @@ -1648,35 +1785,35 @@ no_z_buffer: j check_rdp_buffer_full // eventually returns to $ra, which is next cmd, second tri in TRI2, or middle of clipping sdv $v18[8], 0x0000($1) // Store S, T, W texture coefficients (integer) -vtxPtr equ $25 -endVtxPtr equ $24 +vtxPtr equ $25 // = cmd_w0 +endVtxPtr equ $24 // = cmd_w1_dram G_CULLDL_handler: - lhu vtxPtr, (vertexTable)(cmd_w0) // load start vertex address - lhu endVtxPtr, (vertexTable)(cmd_w1) // load end vertex address + lhu vtxPtr, (vertexTable)(cmd_w0) // load start vertex address + lhu endVtxPtr, (vertexTable)(cmd_w1_dram) // load end vertex address addiu $1, $zero, (CLIP_NX | CLIP_NY | CLIP_PX | CLIP_PY | CLIP_FAR | CLIP_NEAR) - lw $11, VTX_CLIP(vtxPtr) // read clip flags from vertex + lw $11, VTX_CLIP(vtxPtr) // read clip flags from vertex culldl_loop: and $1, $1, $11 - beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render + beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render lw $11, (vtxSize + VTX_CLIP)(vtxPtr) // next vertex clip flags - bne vtxPtr, endVtxPtr, culldl_loop // loop until reaching the last vertex - addiu vtxPtr, vtxPtr, vtxSize // advance to the next vertex - j G_ENDDL_handler // If got here, there's some clipping plane where all verts are outside it; skip DL + bne vtxPtr, endVtxPtr, culldl_loop // loop until reaching the last vertex + addiu vtxPtr, vtxPtr, vtxSize // advance to the next vertex + j G_ENDDL_handler // If got here, there's some clipping plane where all verts are outside it; skip DL G_BRANCH_WZ_handler: - lhu vtxPtr, (vertexTable)(cmd_w0) // get the address of the vertex being tested -.if UCODE_TYPE == TYPE_F3DZEX // BRANCH_W/BRANCH_Z difference - lh vtxPtr, VTX_W_INT(vtxPtr) // read the w coordinate of the vertex (f3dzex) + lhu vtxPtr, (vertexTable)(cmd_w0) // get the address of the vertex being tested +.if UCODE_TYPE == TYPE_F3DZEX // BRANCH_W/BRANCH_Z difference + lh vtxPtr, VTX_W_INT(vtxPtr) // read the w coordinate of the vertex (f3dzex) .else - lw vtxPtr, VTX_SCR_Z(vtxPtr) // read the screen z coordinate (int and frac) of the vertex (f3dex2) + lw vtxPtr, VTX_SCR_Z(vtxPtr) // read the screen z coordinate (int and frac) of the vertex (f3dex2) .endif - sub $2, vtxPtr, endVtxPtr // subtract the w/z value being tested - bgez $2, run_next_DL_command // if vtx.w/z > w/z, continue running this DL - lw $24, rdpHalf1Val // load the RDPHALF1 value - j f3dzex_ovl1_00001008 + sub $2, vtxPtr, cmd_w1_dram // subtract the w/z value being tested + bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL + lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to + j branch_dl G_MODIFYVTX_handler: lbu $1, (inputBufferEnd - 0x07)(inputBufferPos) j do_moveword - lhu vtxPtr, (vertexTable)(cmd_w0) + lhu cmd_w0, (vertexTable)(cmd_w0) .if . > 0x00001FAC @@ -1688,46 +1825,45 @@ G_MODIFYVTX_handler: // to load_overlay_and_enter to execute the load. load_overlay_0_and_enter: G_LOAD_UCODE_handler: - li $12, ovl0_start // Sets up return address - li $11, overlayInfo0 // Sets up ovl0 table address + li postOvlRA, ovl0_start // Sets up return address + li ovlTableEntry, overlayInfo0 // Sets up ovl0 table address // This subroutine accepts the address of an overlay table entry and loads that overlay. // It then jumps to that overlay's address after DMA of the overlay is complete. -// $11 is used to provide the overlay table entry -// $12 is used to pass in a value to return to -ovlTableEntry equ $11 -returnAddr equ $12 +// ovlTableEntry is used to provide the overlay table entry +// postOvlRA is used to pass in a value to return to load_overlay_and_enter: - lw $24, overlay_load(ovlTableEntry) // Set up overlay dram address - lhu $19, overlay_len(ovlTableEntry) // Set up overlay length - jal dma_read_write // DMA the overlay - lhu $20, overlay_imem(ovlTableEntry) // Set up overlay load address - move $ra, returnAddr // Set the return address to the passed in value + lw cmd_w1_dram, overlay_load(ovlTableEntry) // Set up overlay dram address + lhu dmaLen, overlay_len(ovlTableEntry) // Set up overlay length + jal dma_read_write // DMA the overlay + lhu dmemAddr, overlay_imem(ovlTableEntry) // Set up overlay load address + move $ra, postOvlRA // Set the return address to the passed in value + +.if . > 0x1FC8 + .error "Constraints violated on what can be overwritten at end of ucode (relevant for G_LOAD_UCODE)" +.endif + while_wait_dma_busy: - mfc0 $11, SP_DMA_BUSY // Load the DMA_BUSY value into $11 + mfc0 ovlTableEntry, SP_DMA_BUSY // Load the DMA_BUSY value into ovlTableEntry while_dma_busy: - bnez $11, while_dma_busy // Loop until DMA_BUSY is cleared - mfc0 $11, SP_DMA_BUSY // Update $11's DMA_BUSY value + bnez ovlTableEntry, while_dma_busy // Loop until DMA_BUSY is cleared + mfc0 ovlTableEntry, SP_DMA_BUSY // Update ovlTableEntry's DMA_BUSY value // This routine is used to return via conditional branch return_routine: jr $ra -dmemAddr equ $20 -dramAddr equ $24 -dmaLen equ $19 -dmaFull equ $11 dma_read_write: - mfc0 dmaFull, SP_DMA_FULL // load the DMA_FULL value + mfc0 $11, SP_DMA_FULL // load the DMA_FULL value while_dma_full: - bnez dmaFull, while_dma_full // Loop until DMA_FULL is cleared - mfc0 dmaFull, SP_DMA_FULL // Update DMA_FULL value - mtc0 dmemAddr, SP_MEM_ADDR // Set the DMEM address to DMA from/to - bltz dmemAddr, dma_write // If the DMEM address is negative, this is a DMA write, if not read - mtc0 dramAddr, SP_DRAM_ADDR // Set the DRAM address to DMA from/to + bnez $11, while_dma_full // Loop until DMA_FULL is cleared + mfc0 $11, SP_DMA_FULL // Update DMA_FULL value + mtc0 dmemAddr, SP_MEM_ADDR // Set the DMEM address to DMA from/to + bltz dmemAddr, dma_write // If the DMEM address is negative, this is a DMA write, if not read + mtc0 cmd_w1_dram, SP_DRAM_ADDR // Set the DRAM address to DMA from/to jr $ra - mtc0 dmaLen, SP_RD_LEN // Initiate a DMA read with a length of dmaLen + mtc0 dmaLen, SP_RD_LEN // Initiate a DMA read with a length of dmaLen dma_write: jr $ra - mtc0 dmaLen, SP_WR_LEN // Initiate a DMA write with a length of dmaLen + mtc0 dmaLen, SP_WR_LEN // Initiate a DMA write with a length of dmaLen .if . > 0x00002000 .error "Not enough room in IMEM" @@ -1739,6 +1875,8 @@ dma_write: .headersize 0x00001000 - orga() // Overlay 0 controls the RDP and also stops the RSP when work is done +// The action here is controlled by $1. If yielding, $1 > 0. If this was +// G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0. ovl0_start: .if UCODE_METHOD == METHOD_FIFO sub $11, rdpCmdBufPtr, rdpCmdBufEnd @@ -1747,37 +1885,42 @@ ovl0_start: nop jal while_wait_dma_busy lw $24, rdpFifoPos - bltz $1, taskdone_and_break + bltz $1, taskdone_and_break // $1 < 0 = Got to the end of the parent DL mtc0 $24, DPC_END // Set the end pointer of the RDP so that it starts the task .else // UCODE_METHOD == METHOD_XBUS - bltz $1, taskdone_and_break + bltz $1, taskdone_and_break // $1 < 0 = Got to the end of the parent DL nop .endif - bnez $1, task_yield - add taskDataPtr, taskDataPtr, inputBufferPos - lw $24, 0x09C4(inputBufferPos) // Should this be (inputBufferEnd - 0x04)? - sw taskDataPtr, OSTask + OSTask_data_ptr - sw $24, OSTask + OSTask_ucode - la $20, start // DMA address - jal dma_read_write // initiate DMA read - li $19, 0x0F48 - 1 + bnez $1, task_yield // $1 > 0 = CPU requested yield + add taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr was where in the DL after the current chunk loaded +// If here, G_LOAD_UCODE was executed. + lw cmd_w1_dram, (inputBufferEnd - 0x04)(inputBufferPos) // word 1 = ucode code DRAM addr + sw taskDataPtr, OSTask + OSTask_data_ptr // Store where we are in the DL + sw cmd_w1_dram, OSTask + OSTask_ucode // Store pointer to new ucode about to execute + la dmemAddr, start // Beginning of overwritable part of IMEM + jal dma_read_write // DMA DRAM read -> IMEM write + li dmaLen, (while_wait_dma_busy - start) - 1 // End of overwritable part of IMEM .if UCODE_METHOD == METHOD_XBUS ovl0_xbus_wait_for_rdp: mfc0 $11, DPC_STATUS andi $11, $11, DPC_STATUS_DMA_BUSY bnez $11, ovl0_xbus_wait_for_rdp // Keep looping while RDP is busy. .endif - lw $24, rdpHalf1Val - la $20, 0x0180 // DMA address; equal to but probably not actually spFxBase or clipRatio - andi $19, cmd_w0, 0x0FFF - add $24, $24, $20 + lw cmd_w1_dram, rdpHalf1Val // Get DRAM address of ucode data from rdpHalf1Val + la dmemAddr, spFxBase // DMEM address is spFxBase + andi dmaLen, cmd_w0, 0x0FFF // Extract DMEM length from command word + add cmd_w1_dram, cmd_w1_dram, dmemAddr // Start overwriting data from spFxBase jal dma_read_write // initate DMA read - sub $19, $19, $20 + sub dmaLen, dmaLen, dmemAddr // End that much before the end of DMEM j while_wait_dma_busy -.if (UCODE_IS_F3DEX2_204H || UCODE_METHOD == METHOD_XBUS /* ??? */) - li $ra, taskdone_and_break_204H +.if (UCODE_IS_F3DEX2_204H) + li $ra, start .else - li $ra, taskdone_and_break + li $ra, start + 4 // Not sure why we skip the first instruction of the new ucode +.endif + +.if . > start + .error "ovl0_start does not fit within the space before the start of the ucode loaded with G_LOAD_UCODE" .endif ucode equ $11 @@ -1788,24 +1931,29 @@ task_yield: sw taskDataPtr, OS_YIELD_DATA_SIZE - 8 sw ucode, OS_YIELD_DATA_SIZE - 4 li status, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals - lw $24, OSTask + OSTask_yield_data_ptr - li $20, 0x8000 - li $19, OS_YIELD_DATA_SIZE - 1 + lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr + li dmemAddr, 0x8000 // 0, but negative = write + li dmaLen, OS_YIELD_DATA_SIZE - 1 .else // UCODE_METHOD == METHOD_XBUS - sw taskDataPtr, OS_YIELD_DATA_SIZE - sw ucode, OS_YIELD_DATA_SIZE + 4 - lw $24, OSTask + OSTask_yield_data_ptr - li $20, 0x8000 + // Instead of saving the whole first OS_YIELD_DATA_SIZE bytes of DMEM, + // XBUS saves only up to inputBuffer, as everything after that can be erased, + // and because the RDP may still be using the output buffer, which is where + // we'd have to write taskDataPtr and ucode. + sw taskDataPtr, inputBuffer // save these values for below, somewhere outside + sw ucode, inputBuffer + 4 // the area being written + lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr + li dmemAddr, 0x8000 // 0, but negative = write jal dma_read_write - li $19, OS_YIELD_DATA_SIZE - 1 + li dmaLen, inputBuffer - 1 + // At the end of the OS's yield buffer, write the taskDataPtr and ucode words. li status, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals - addiu $24, $24, OS_YIELD_DATA_SIZE_TOTAL - 8 - li $20, -0x76E0 // ??? - li $19, 7 + addiu cmd_w1_dram, cmd_w1_dram, OS_YIELD_DATA_SIZE - 8 + li dmemAddr, 0x8000 | inputBuffer // where they were saved above + li dmaLen, 8 - 1 .endif j dma_read_write -taskdone_and_break_204H: // Only used in f3dex2 2.04H li $ra, break + taskdone_and_break: li status, SP_SET_SIG2 // task done signal break: @@ -1835,11 +1983,11 @@ ovl1_start: G_DL_handler: lbu $1, displayListStackLength // Get the DL stack length sll $2, cmd_w0, 15 // Shifts the push/nopush value to the highest bit in $2 -f3dzex_ovl1_00001008: +branch_dl: jal segmented_to_physical add $3, taskDataPtr, inputBufferPos bltz $2, displaylist_dma // If the operation is nopush (branch) then simply DMA the new displaylist - move taskDataPtr, cmd_w1 // Set the task data pointer to the target display list + move taskDataPtr, cmd_w1_dram // Set the task data pointer to the target display list sw $3, (displayListStack)($1) addi $1, $1, 4 // Increment the DL stack length f3dzex_ovl1_00001020: @@ -1855,7 +2003,7 @@ G_TEXRECTFLIP_handler: G_RDPHALF_1_handler: j run_next_DL_command // Stores second command word into textureSettings for gSPTexture, 0x00D4 for gSPTextureRectangle/Flip, 0x00D8 for G_RDPHALF_1 - sw cmd_w1, (texrectWord2 - G_TEXRECTFLIP_handler)($11) + sw cmd_w1_dram, (texrectWord2 - G_TEXRECTFLIP_handler)($11) G_MOVEWORD_handler: srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT) @@ -1863,32 +2011,29 @@ G_MOVEWORD_handler: do_moveword: add $1, $1, cmd_w0 // adds the offset in the command word to the address from the table (the upper 4 bytes are effectively ignored) j run_next_DL_command // process the next command - sw cmd_w1, ($1) // moves the specified value (in cmd_w1) into the word (offset + moveword_table[index]) + sw cmd_w1_dram, ($1) // moves the specified value (in cmd_w1_dram) into the word (offset + moveword_table[index]) G_POPMTX_handler: lw $11, matrixStackPtr // Get the current matrix stack pointer lw $2, OSTask + OSTask_dram_stack // Read the location of the dram stack - sub $24, $11, cmd_w1 // Decrease the matrix stack pointer by the amount passed in the second command word - sub $1, $24, $2 // Subtraction to check if the new pointer is greater than or equal to $2 + sub cmd_w1_dram, $11, cmd_w1_dram // Decrease the matrix stack pointer by the amount passed in the second command word + sub $1, cmd_w1_dram, $2 // Subtraction to check if the new pointer is greater than or equal to $2 bgez $1, do_popmtx // If the new matrix stack pointer is greater than or equal to $2, then use the new pointer as is nop - move $24, $2 // If the new matrix stack pointer is less than $2, then use $2 as the pointer instead + move cmd_w1_dram, $2 // If the new matrix stack pointer is less than $2, then use $2 as the pointer instead do_popmtx: - beq $24, $11, run_next_DL_command // If no bytes were popped, then we don't need to make the mvp matrix as being out of date and can run the next command - sw $24, matrixStackPtr // Update the matrix stack pointer with the new value + beq cmd_w1_dram, $11, run_next_DL_command // If no bytes were popped, then we don't need to make the mvp matrix as being out of date and can run the next command + sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer with the new value j do_movemem sw $zero, mvpValid // Mark the MVP matrix and light directions as being out of date (the word being written to contains both) G_MTX_end: // Multiplies the loaded model matrix into the model stack - lhu $19, (movememTable + G_MV_MMTX)($1) // Set the output matrix to the model or projection matrix based on the command + lhu output_mtx, (movememTable + G_MV_MMTX)($1) // Set the output matrix to the model or projection matrix based on the command jal while_wait_dma_busy - lhu $21, (movememTable + G_MV_MMTX)($1) // Set the first input matrix to the model or projection matrix based on the command + lhu input_mtx_0, (movememTable + G_MV_MMTX)($1) // Set the first input matrix to the model or projection matrix based on the command li $ra, run_next_DL_command // The second input matrix will correspond to the address that memory was moved into, which will be tempMtx for G_MTX -input_mtx_0 equ $21 -input_mtx_1 equ $20 -output_mtx equ $19 mtx_multiply: addi $12, input_mtx_1, 0x0018 @@loop: @@ -1920,7 +2065,6 @@ mtx_multiply: jr $ra sqv $v6[0], 0x0010(output_mtx) -matrixStackAddr equ $24 G_MTX_handler: // The lower 3 bits of G_MTX are, from LSb to MSb (0 value/1 value), // matrix type (modelview/projection) @@ -1930,27 +2074,27 @@ G_MTX_handler: andi $11, cmd_w0, G_MTX_P_MV | G_MTX_NOPUSH_PUSH // Read the matrix type and push type flags into $11 bnez $11, load_mtx // If the matrix type is projection or this is not a push, skip pushing the matrix andi $2, cmd_w0, G_MTX_MUL_LOAD // Read the matrix load type into $2 (0 is multiply, 2 is load) - lw matrixStackAddr, matrixStackPtr // Set up the DMA from dmem to rdram at the matrix stack pointer - li $20, -0x2000 // + lw cmd_w1_dram, matrixStackPtr // Set up the DMA from dmem to rdram at the matrix stack pointer + li dmemAddr, -0x2000 // jal dma_read_write // DMA the current matrix from dmem to rdram - li $19, 0x0040 - 1 // Set the DMA length to the size of a matrix (minus 1 because DMA is inclusive) - addi matrixStackAddr, matrixStackAddr, 0x40 // Increase the matrix stack pointer by the size of one matrix - sw matrixStackAddr, matrixStackPtr // Update the matrix stack pointer - lw cmd_w1, (inputBufferEnd - 4)(inputBufferPos) + li dmaLen, 0x0040 - 1 // Set the DMA length to the size of a matrix (minus 1 because DMA is inclusive) + addi cmd_w1_dram, cmd_w1_dram, 0x40 // Increase the matrix stack pointer by the size of one matrix + sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer + lw cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Load command word 1 again load_mtx: add $12, $12, $2 // Add the load type to the command byte, selects the return address based on whether the matrix needs multiplying or just loading sw $zero, mvpValid // Mark the MVP matrix and light directions as being out of date (the word being written to contains both) G_MOVEMEM_handler: - jal segmented_to_physical // convert the memory address cmd_w1 to a virtual one + jal segmented_to_physical // convert the memory address cmd_w1_dram to a virtual one do_movemem: - andi $1, cmd_w0, 0x00FE // Move the movemem table index into $1 (bits 1-7 of the first command word) - lbu $19, (inputBufferEnd - 0x07)(inputBufferPos) // Move the second byte of the first command word into $19 - lhu $20, (movememTable)($1) // Load the address of the memory location for the given movemem index - srl $2, cmd_w0, 5 // Left shifts the index by 5 (which is then added to the value read from the movemem table) + andi $1, cmd_w0, 0x00FE // Move the movemem table index into $1 (bits 1-7 of the first command word) + lbu dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Move the second byte of the first command word into dmaLen + lhu dmemAddr, (movememTable)($1) // Load the address of the memory location for the given movemem index + srl $2, cmd_w0, 5 // Left shifts the index by 5 (which is then added to the value read from the movemem table) lhu $ra, (movememHandlerTable - (G_POPMTX | 0xFF00))($12) // Loads the return address from movememHandlerTable based on command byte j dma_read_write G_SETOTHERMODE_H_handler: // These handler labels must be 4 bytes apart for the code below to work - add $20, $20, $2 + add dmemAddr, dmemAddr, $2 // This is for the code above, does nothing for G_SETOTHERMODE_H G_SETOTHERMODE_L_handler: lw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11) // resolves to othermode0 or othermode1 based on which handler was jumped to lui $2, 0x8000 @@ -1959,11 +2103,11 @@ G_SETOTHERMODE_L_handler: srlv $2, $2, $1 nor $2, $2, $zero and $3, $3, $2 - or $3, $3, cmd_w1 + or $3, $3, cmd_w1_dram sw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11) lw cmd_w0, otherMode0 j G_RDP_handler - lw cmd_w1, otherMode1 + lw cmd_w1_dram, otherMode1 .align 8 ovl1_end: @@ -1978,13 +2122,13 @@ ovl2_start: ovl23_lighting_entrypoint: lbu $11, lightsValid j continue_light_dir_xfrm - lbu tmpCurLight, numLightsx18 + lbu topLightPtr, numLightsx18 -ovl23_clipping_entrypoint_copy: // same IMEM address as ovl23_clipping_entrypoint +ovl23_clipping_entrypoint_copy: // same IMEM address as ovl23_clipping_entrypoint move savedRA, $ra - li $11, overlayInfo3 // set up a load of overlay 3 - j load_overlay_and_enter // load overlay 3 - li $12, ovl3_clipping_nosavera // set up the return address in ovl3 + li ovlTableEntry, overlayInfo3 // set up a load of overlay 3 + j load_overlay_and_enter // load overlay 3 + li postOvlRA, ovl3_clipping_nosavera // set up the return address in ovl3 continue_light_dir_xfrm: // Transform light directions from camera space to model space, by @@ -1993,7 +2137,7 @@ continue_light_dir_xfrm: // lookat lights and through all directional and point lights, but not // ambient. For point lights, the data is garbage but doesn't harm anything. bnez $11, after_light_dir_xfrm // Skip calculating lights if they're not out of date - addi tmpCurLight, tmpCurLight, spFxBase - lightSize // With ltBufOfs, points at top/max light. + addi topLightPtr, topLightPtr, spFxBase - lightSize // With ltBufOfs, points at top/max light. sb cmd_w0, lightsValid // Set as valid, reusing state of w0 /* Load MV matrix 3x3 transposed as: mxr0i 00 08 10 06 08 0A 0C 0E @@ -2021,7 +2165,7 @@ continue_light_dir_xfrm: lsv mxr2i[4], (mvMatrix + 0x14)($zero) vmov mxr2f[0], mxr0f[2] // With ltBufOfs immediate add, points two lights behind lightBufferMain, i.e. lightBufferLookat. - xfrmLtPtr equ $20 + xfrmLtPtr equ $20 // also input_mtx_1 and dmemAddr li xfrmLtPtr, spFxBase - 2 * lightSize vmov mxr2i[0], mxr0i[2] lpv $v7[0], (ltBufOfs + 0x8)(xfrmLtPtr) // Load light direction @@ -2046,7 +2190,7 @@ continue_light_dir_xfrm: vreadacc $v11, ACC_MIDDLE // read the middle (bits 16..31) of the accumulator elements into v11 sw $12, (ltBufOfs + 0x14)(xfrmLtPtr) // Store duplicate of transformed light direction vreadacc $v15, ACC_UPPER // read the upper (bits 32..47) of the accumulator elements into v15 - beq xfrmLtPtr, tmpCurLight, after_light_dir_xfrm // exit if equal + beq xfrmLtPtr, topLightPtr, after_light_dir_xfrm // exit if equal vmudl $v29, $v11, $v11 // calculate the low partial product of the accumulator squared (low * low) vmadm $v29, $v15, $v11 // calculate the mid partial product of the accumulator squared (mid * low) vmadn $v16, $v11, $v15 // calculate the mid partial product of the accumulator squared (low * mid) @@ -2067,16 +2211,11 @@ continue_light_dir_xfrm: vmadm $v29, $v15, $v16[0] vmadn $v11, $v11, $v17[0] vmadh $v15, $v15, $v17[0] -.if (UCODE_IS_206_OR_OLDER) - i7 equ 7 -.else - i7 equ 3 -.endif - vmudn $v11, $v11, $v30[i7] // Scale results to become bytes + vmudn $v11, $v11, $v30[i1] // 0x0100; scale results to become bytes j @@loop - vmadh $v15, $v15, $v30[i7] // Scale results to become bytes + vmadh $v15, $v15, $v30[i1] // 0x0100; scale results to become bytes -curMatrix equ $12 +curMatrix equ $12 // Overwritten during texgen, but with a value which is 0 or positive, so means cur matrix is MV ltColor equ $v29 vPairRGBA equ $v27 vPairAlpha37 equ $v28 // Same as mvTc1f, but alpha values are left in elems 3, 7 @@ -2084,25 +2223,19 @@ vPairNX equ $v7 // also named vPairRGBATemp; with name vPairNX, uses X component vPairNY equ $v6 vPairNZ equ $v5 -// For point lighting, but armips does not like these defined in an .if -mvTc0i equ $v4 -mvTc1i equ $v21 -mvTc2i equ $v30 -mvTc0f equ $v3 -mvTc1f equ $v28 -mvTc2f equ $v31 - light_vtx: vadd vPairNY, vZero, vPairRGBATemp[1h] // Move vertex normals Y to separate reg .if UCODE_HAS_POINT_LIGHTING - luv ltColor[0], (ltBufOfs + lightSize + 0)(curLight) // Load next light color (ambient) + luv ltColor[0], (ltBufOfs + lightSize + 0)(curLight) // Init to ambient light color .else lpv $v20[0], (ltBufOfs - lightSize + 0x10)(curLight) // Load next below transformed light direction as XYZ_XYZ_ for lights_dircoloraccum2 .endif vadd vPairNZ, vZero, vPairRGBATemp[2h] // Move vertex normals Z to separate reg luv vPairRGBA[0], 8(inputVtxPos) // Load both verts' XYZAXYZA as unsigned vne $v4, $v31, $v31[3h] // Set VCC to 11101110 -.if UCODE_HAS_POINT_LIGHTING +.if !UCODE_HAS_POINT_LIGHTING + luv ltColor[0], (ltBufOfs + lightSize + 0)(curLight) // Init to ambient light color +.else andi $11, $5, G_LIGHTING_POSITIONAL_H // check if point lighting is enabled in the geometry mode beqz $11, directional_lighting // If not enabled, use directional algorithm for everything li curMatrix, mvpMatrix + 0x8000 // Set flag in negative to indicate cur mtx is MVP @@ -2110,7 +2243,7 @@ light_vtx: suv ltColor[0], 8(inputVtxPos) // Store ambient light color to two verts' RGBARGBA ori $11, $zero, 0x0004 vmov $v30[7], $v30[6] // v30[7] = 0x0010 because v30[0:2,4:6] will get clobbered - mtc2 $11, $v31[6] // v31[6] = 0x0004 (was previously 0x0420) + mtc2 $11, $v31[6] // v31[3] = 0x0004 (was previously 0x7F00) next_light_dirorpoint: lbu $11, (ltBufOfs + 0x3)(curLight) // Load light type / constant attenuation value at light structure + 3 bnez $11, light_point // If not zero, this is a point light @@ -2272,8 +2405,8 @@ light_point: vmadh $v20, mvTc2i, $v20[2h] // v20 = int result of vert-to-light in model space vmudm $v2, $v20, $v29[3h] // v2l_model * length normalization frac vmadh $v20, $v20, $v29[2h] // v2l_model * length normalization int - vmudn $v2, $v2, $v31[3] // this is 0x7F00; v31 is mvTc2f but elements 3 and 7 weren't overwritten - vmadh $v20, $v20, $v31[3] // scale to byte, only keep int part + vmudn $v2, $v2, $v31[3] // this is 0x0004; v31 is mvTc2f but elem 3 replaced, elem 7 left + vmadh $v20, $v20, $v31[3] // vmulu $v2, vPairNX, $v20[0h] // Normal X * normalized vert-to-light X mtc2 $11, $v27[8] // 0x3 << 4 -> v27 elems 4, 5 vmacu $v2, vPairNY, $v20[1h] // Y * Y @@ -2291,7 +2424,7 @@ light_point: vmudl $v2, $v2, $v2[0h] // squared vmulf $v29, $v29, $v20[3] // Length * byte 0x7 vmadm $v29, $v2, $v20[7] // + (scaled length squared) * byte 0xE << 5 - vmadn $v29, $v27, $v30[3] // + (byte 0x3 << 4) * 0xFFF0 + vmadn $v29, $v27, $v30[3] // + (byte 0x3 << 4) * 0x0100 vreadacc $v2, ACC_MIDDLE vrcph $v2[0], $v2[0] // v2 int, v29 frac: function of distance to light vrcpl $v2[0], $v29[0] // Reciprocal = inversely proportional @@ -2312,8 +2445,6 @@ light_point: j after_dirorpoint_loop directional_lighting: lpv $v20[0], (ltBufOfs - lightSize + 0x10)(curLight) // Load next light transformed dir; this value is overwritten with the same thing -.else // No point lighting - luv ltColor[0], (ltBufOfs + lightSize + 0)(curLight) // Init to ambient light color .endif // Loop for dot product normals and multiply-add color for 2 lights diff --git a/rsp/gbi.inc b/rsp/gbi.inc index 113bd3d..c8d03af 100644 --- a/rsp/gbi.inc +++ b/rsp/gbi.inc @@ -1,14 +1,5 @@ // GBI: Graphics Binary Interface -// XBUS doesn't use the full yield data buffer, but does reference the full size, -// so a definition for both sizes needs to exist. -OS_YIELD_DATA_SIZE_TOTAL equ 0xC00 -.if UCODE_METHOD == METHOD_FIFO -OS_YIELD_DATA_SIZE equ 0xC00 -.else // UCODE_METHOD == METHOD_XBUS -OS_YIELD_DATA_SIZE equ 0x920 -.endif - // Geometry mode flags // First byte gets OR'd with 0xC8 to form triangle opcode. So we can use 0x80, // 0x40, and 0x08 for other purposes while remaining forward compatible. (Of diff --git a/rsp/rsp_defs.inc b/rsp/rsp_defs.inc index ccb6b1a..384ffe8 100644 --- a/rsp/rsp_defs.inc +++ b/rsp/rsp_defs.inc @@ -2,6 +2,9 @@ // OSTask placed at end of DMEM (IMEM_START - sizeof(OSTask)) .definelabel OSTask_addr, 0xFC0 + +OS_YIELD_DATA_SIZE equ 0xC00 + // OSTask data member offsets OSTask_type equ 0x0000 OSTask_flags equ 0x0004