diff --git a/f3dex3.s b/f3dex3.s index d48e162..559bbbf 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -143,6 +143,20 @@ COUNTER_C_FIFO_FULL equ 1 .endif +// Only raise a warning in base modes; in profiling modes, addresses will be off +.macro warn_if_base, warntext + .if !ENABLE_PROFILING + .warning warntext + .endif +.endmacro + +.macro align_with_warning, alignment, warntext + .if (. & (alignment - 1)) + warn_if_base warntext + .endif + .align alignment +.endmacro + /* There are two different memory spaces for the overlays: (a) IMEM and (b) the microcode file (which, plus an offset, is also the location in DRAM). @@ -489,25 +503,23 @@ packedNormalsConstants: // Movemem table movememTable: - .dh tempMatrix // G_MTX multiply temp matrix (model) .dh mMatrix // G_MV_MMTX - .dh tempMatrix // G_MTX multiply temp matrix (projection) - .dh vpMatrix // G_MV_PMTX + .dh tempMatrix // G_MV_TEMPMTX0 multiply temp matrix (model) + .dh vpMatrix // G_MV_VPMTX + .dh tempMatrix // G_MV_TEMPMTX1 multiply temp matrix (view*projection) .dh viewport // G_MV_VIEWPORT .dh cameraWorldPos // G_MV_LIGHT +afterMovememRaTable: + .dh run_next_DL_command + .dh G_MTX_multiply_end + clipCondShifts: .db CLIP_SCAL_NY_SHIFT // Constants for clipping algorithm .db CLIP_SCAL_PY_SHIFT .db CLIP_SCAL_NX_SHIFT .db CLIP_SCAL_PX_SHIFT -// G_POPMTX, G_MTX, G_MOVEMEM Command Jump Table -movememHandlerTable: - .dh G_POPMTX_end // G_POPMTX - .dh G_MTX_end // G_MTX (multiply) - .dh G_MOVEMEM_end // G_MOVEMEM, G_MTX (load) - mvpValid: .db 0 // Nonzero if the MVP matrix is valid, 0 if it needs to be recomputed. dirLightsXfrmValid: @@ -823,7 +835,7 @@ clipPolyWrite equ $21 // Write pointer within current polygon being clipped viLtFlag equ $9 // Holds pointLightFlag or dirLightsXfrmValid // Misc -postOvlRA equ $10 // Address to return to after overlay load +nextRA equ $10 // Address to return to after overlay load ovlInitClock equ $16 // Temp for profiling dmaLen equ $19 // DMA length in bytes minus 1 dmemAddr equ $20 // DMA address in DMEM or IMEM. Also = rdpCmdBufPtr - rdpCmdBufEndP1 for flush_rdp_buffer @@ -1080,7 +1092,7 @@ finish_setup: li inputBufferPos, 0 li cmd_w1_dram, orga(ovl1_start) j load_overlays_0_1 - li postOvlRA, displaylist_dma + li nextRA, displaylist_dma start_end: .align 8 @@ -1089,7 +1101,57 @@ start_padded_end: .orga max(orga(), max(ovl0_padded_end - ovl0_start, ovl1_padded_end - ovl1_start) - 0x80) ovl01_end: -displaylist_dma_with_count: +G_CULLDL_handler: // 15 + lhu $10, (vertexTable)(cmd_w0) // Start vtx addr + lhu $3, (vertexTable)(cmd_w1_dram) // End vertex + /* + CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1 + verts which are behind the occlusion plane, and 1 vert which is behind the camera + plane and therefore randomly erroneously also set as behind the occlusion plane. + However, the convex hull of all the verts goes through visible area. This will be + incorrectly culled here. We can't afford the extra few instructions to disable + the occlusion plane if the vert is behind the camera, because this only matters for + G_CULLDL and not for tris. + */ + li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) + lhu $11, VTX_CLIP($10) +culldl_loop: + and $1, $1, $11 + beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render + lhu $11, (vtxSize + VTX_CLIP)($10) // next vertex clip flags + bne $10, $3, culldl_loop // loop until reaching the last vertex + addi $10, $10, vtxSize // advance to the next vertex + li cmd_w0, 0 // Clear count of DL cmds to skip loading +G_ENDDL_handler: + lbu $1, displayListStackLength // Load the DL stack index; if end stack, + beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end + addi $1, $1, -4 // Decrement the DL stack index + j call_ret_common // has a different version in ovl1 + lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to + +G_BRANCH_WZ_handler: + lhu $10, (vertexTable)(cmd_w0) // Vertex addr from byte 3 +.if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2 + lh $10, VTX_W_INT($10) // read the w coordinate of the vertex (f3dzex) +.else + lw $10, VTX_SCR_Z($10) // read the screen z coordinate (int and frac) of the vertex (f3dex2) +.endif + sub $2, $10, cmd_w1_dram // subtract the w/z value being tested + bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL + lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to + li cmd_w0, 0x8000 // Bit 16 set (via negative) = nopush, bits 3-7 = 0 for hint +G_DL_handler: + sll $2, cmd_w0, 15 // Shifts the push/nopush value to the sign bit + lbu $1, displayListStackLength // Get the DL stack length + jal segmented_to_physical + add $3, taskDataPtr, inputBufferPos // Current DL pos to push on stack + bltz $2, call_ret_common // Nopush = branch = flag is set + move taskDataPtr, cmd_w1_dram // Set the new DL to the target display list + sw $3, (displayListStack)($1) + addi $1, $1, 4 // Increment the DL stack length +call_ret_common: + sb $zero, materialCullMode // This covers call, branch, return, and cull and branchZ successes + sb $1, displayListStackLength andi inputBufferPos, cmd_w0, 0x00F8 // Byte 3, how many cmds to drop from load (max 0xA0) displaylist_dma: // Load INPUT_BUFFER_SIZE_BYTES - inputBufferPos cmds (inputBufferPos >= 0, mult of 8) @@ -1100,14 +1162,13 @@ displaylist_dma: .endif nor dmaLen, inputBufferPos, $zero // DMA length = -inputBufferPos - 1 = ones compliment move cmd_w1_dram, taskDataPtr // set up the DRAM address to read from - jal dma_read_write // initiate the DMA read - addi dmemAddr, inputBufferPos, inputBufferEnd // set the address to DMA read to sub taskDataPtr, taskDataPtr, inputBufferPos // increment the DRAM address to read from next time -wait_for_dma_and_run_next_command: -G_POPMTX_end: -G_MOVEMEM_end: - j while_wait_dma_busy // wait for the DMA read to finish - li $ra, run_next_DL_command + addi dmemAddr, inputBufferPos, inputBufferEnd // set the address to DMA read to +dma_and_wait_goto_next_command: + li nextRA, run_next_DL_command +dma_and_wait_goto_next_ra: + j dma_read_write + li $ra, wait_goto_next_ra G_POPMTX_handler: G_DMA_IO_handler: @@ -1164,11 +1225,24 @@ run_next_DL_command: jr $11 // Jump to handler addi inputBufferPos, inputBufferPos, 0x0008 // increment the DL index by 2 words // $1 must remain zero - // $7 must retain the command byte for load_mtx and overlay 4 stuff + // $7 must retain the command byte for load_mtx and overlay 3 stuff // $11 must contain the handler called for several handlers +G_SETxIMG_handler: // 10 + lb $3, materialCullMode // Get current mode + jal segmented_to_physical // Convert image to physical address + lw $2, lastMatDLPhyAddr // Get last material physical addr + bnez $3, G_RDP_handler // If not in normal mode (0), exit + add $10, taskDataPtr, inputBufferPos // Current material physical addr + beq $10, $2, @@skip // Branch if we are executing the same mat again + sw $10, lastMatDLPhyAddr // Store material physical addr + li $7, 1 // > 0: in material first time +@@skip: // Otherwise $7 was < 0: cull mode (in mat second time) + j G_RDP_handler + sb $7, materialCullMode + .if !ENABLE_PROFILING -G_LIGHTTORDP_handler: +G_LIGHTTORDP_handler: // 9 lbu $11, numLightsxSize // Ambient light lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size andi $2, cmd_w0, 0x00FF // Byte 3 = alpha @@ -1180,49 +1254,14 @@ G_LIGHTTORDP_handler: or cmd_w1_dram, $3, $2 // Combine RGB and alpha in second word .endif -G_BRANCH_WZ_handler: - lhu $10, (vertexTable)(cmd_w0) // Vertex addr from byte 3 -.if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2 - lh $10, VTX_W_INT($10) // read the w coordinate of the vertex (f3dzex) -.else - lw $10, VTX_SCR_Z($10) // read the screen z coordinate (int and frac) of the vertex (f3dex2) -.endif - sub $2, $10, cmd_w1_dram // subtract the w/z value being tested - bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL - lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to - j branch_dl // need $2 < 0 for nopush and cmd_w1_dram - li cmd_w0, 0 // No count of DL cmds to skip - -G_RELSEGMENT_handler: - jal segmented_to_physical // Resolve new segment address relative to existing segment -G_MOVEWORD_handler: - srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT) - lhu $10, (movewordTable - (G_MOVEWORD << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304) -do_moveword: - sll $11, cmd_w0, 16 // Sign bit = upper bit of offset - add $10, $10, cmd_w0 // Offset + base; only lower 12 bits matter - bltz $11, run_next_DL_command // If upper bit of offset is set, exit after halfword - sh cmd_w1_dram, ($10) // Store value from cmd into halfword - j run_next_DL_command - sw cmd_w1_dram, ($10) // Store value from cmd into word (offset + moveword_table[index]) - -G_LOAD_UCODE_handler: +G_LOAD_UCODE_handler: // 4 j load_overlay_0_and_enter // Delay slot is harmless G_MODIFYVTX_handler: lhu $10, (vertexTable)(cmd_w0) // Byte 3 = vtx being modified j do_moveword // Moveword adds cmd_w0 to $10 for final addr lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx, bit 15 clear -G_VTX_handler: - lhu dmemAddr, (vertexTable)(cmd_w0) // (v0 + n) end address; up to 56 inclusive - jal segmented_to_physical // Convert address in cmd_w1_dram to physical - lhu vtxLeft, (inputBufferEnd - 0x07)(inputBufferPos) // vtxLeft = size in bytes = vtx count * 0x10 - sub dmemAddr, dmemAddr, vtxLeft // Start addr = end addr - size. Rounded down to DMA word by H/W - addi dmaLen, vtxLeft, -1 // DMA length is always offset by -1 - j dma_read_write - li $ra, vtx_after_dma - -G_TRIFAN_handler: +G_TRIFAN_handler: // 17 li $1, 0x8000 // $ra negative = flag for G_TRIFAN G_TRISTRIP_handler: addi $ra, $1, tri_strip_fan_loop // otherwise $1 == 0 @@ -1702,10 +1741,7 @@ return_and_end_mat: jr $ra sb $zero, materialCullMode // This covers all tri early exits except clipping -.if (. & 4) - .warning "One instruction of padding before ovl234" -.endif -.align 8 +align_with_warning 8, "One instruction of padding before ovl234" vtx_select_lighting: .if CFG_PROFILING_B @@ -1764,12 +1800,12 @@ g_popmtx_ovl3: // otherwise lw $11, matrixStackPtr // Current matrix stack pointer lw $2, OSTask + OSTask_dram_stack // Top of the stack sub cmd_w1_dram, $11, cmd_w1_dram // Decrease pointer by amount in command - sub $1, cmd_w1_dram, $2 // Is it still valid / within the stack? - bgez $1, @@skip // If so, skip the failsafe + sub $3, cmd_w1_dram, $2 // Is it still valid / within the stack? + bgez $3, @@skip // If so, skip the failsafe sh $zero, mvpValid // and dirLightsXfrmValid; mark both mtx and dir lts invalid move cmd_w1_dram, $2 // Use the top of the stack as the new pointer @@skip: - j do_movemem // Load the new matrix from the stack + j do_movemem // Must keep $1 = 0 sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer g_mtx_push_ovl3: @@ -1786,11 +1822,10 @@ g_dma_io_ovl3: jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command) andi dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment - // At this point, dmemAddr's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size - // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit - sra dmemAddr, dmemAddr, 2 - j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr) - li $ra, wait_for_dma_and_run_next_command // Setup the return address for running the next DL command + j dma_and_wait_goto_next_command // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr) + // At this point, dmemAddr's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size + // So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit + sra dmemAddr, dmemAddr, 2 clip_after_constants: .if CFG_PROFILING_B @@ -2066,8 +2101,8 @@ g_memset_ovl3: sub cmd_w0, cmd_w0, $2 bgtz cmd_w0, @@transaction_loop add cmd_w1_dram, cmd_w1_dram, $2 - j wait_for_dma_and_run_next_command - // Delay slot harmless + j while_wait_dma_busy + li $ra, run_next_DL_command @@clamp_to_memset_buffer: addi $11, cmd_w0, -memsetBufferSize // $2 = min(cmd_w0, memsetBufferSize) sra $10, $11, 31 @@ -2082,49 +2117,6 @@ ovl3_padded_end: .orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga()) ovl234_end: -tri_fan_store: - lb $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1 - j tri_main - sb $11, 5(rdpCmdBufPtr) // Store vtx 1 - -G_MTX_end: // TODO move to ovl3? -// Multiplies the temp loaded matrix into the M or VP matrix - lhu $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP. - li $3, tempMatrix // Input 1 = temp mem (loaded mtx) - jal while_wait_dma_busy - move $2, $6 // Input 0 = output - li $ra, run_next_DL_command -mtx_multiply: - // $3, $2 are input matrices; $6 is output matrix; $7 is 0 for return to vtx - addi $10, $3, 0x0018 -@@loop: - vmadn $v7, $v31, $v31[2] // 0 - addi $11, $3, 0x0008 - vmadh $v6, $v31, $v31[2] // 0 - addi $2, $2, -0x0020 - vmudh $v29, $v31, $v31[2] // 0 -@@innerloop: - ldv $v3[0], 0x0040($2) - ldv $v3[8], 0x0040($2) - lqv vTemp2[0], 0x0020($3) // Input 1 - ldv $v2[0], 0x0020($2) - ldv $v2[8], 0x0020($2) - lqv vTemp1[0], 0x0000($3) // Input 1 - vmadl $v29, $v3, vTemp2[0h] - addi $3, $3, 0x0002 - vmadm $v29, $v2, vTemp2[0h] - addi $2, $2, 0x0008 // Increment input 0 pointer - vmadn $v5, $v3, vTemp1[0h] - bne $3, $11, @@innerloop - vmadh $v4, $v2, vTemp1[0h] - bne $3, $10, @@loop - addi $3, $3, 0x0008 - sqv $v7[0], (0x0020)($6) - sqv $v6[0], (0x0000)($6) - sqv $v4[0], (0x0010)($6) - jr $ra - sqv $v5[0], (0x0030)($6) - vtx_after_dma: srl $2, cmd_w0, 11 // n << 1 sub $2, cmd_w0, $2 // = v0 << 1 @@ -2186,9 +2178,9 @@ vtx_after_setup_constants: bnez $7, @@skip_recalc_mvp lb viLtFlag, pointLightFlag li $2, vpMatrix - li $3, mMatrix + li dmemAddr, mMatrix jal mtx_multiply - li $6, mvpMatrix + li $3, mvpMatrix sb $10, mvpValid // $10 is nonzero from mtx_multiply, in fact 0x18 @@skip_recalc_mvp: andi $11, vGeomMid, G_LIGHTING >> 8 @@ -2248,10 +2240,7 @@ vtx_final_setup_for_clip: j vtx_store_loop_entry llv vpST[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5 -.if (. & 4) - .warning "One instruction of padding before vertex loop" -.endif -.align 8 +align_with_warning 8, "One instruction of padding before vertex loop" .if CFG_NO_OCCLUSION_PLANE @@ -2602,60 +2591,55 @@ tris_end: lqv vTRC, (vTRCValue)($zero) // Restore value overwritten by matrix .endif +tri_fan_store: + lb $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1 + j tri_main + sb $11, 5(rdpCmdBufPtr) // Store vtx 1 - -.if CFG_PROFILING_B -loadOverlayInstrs equ 13 -.elseif CFG_PROFILING_C -loadOverlayInstrs equ 24 -.else -loadOverlayInstrs equ 12 -.endif -endFreeImemAddr equ (0x1FC8 - (4 * loadOverlayInstrs)) -startFreeImem: -.if . > endFreeImemAddr - .error "Out of IMEM space" -.endif -.org endFreeImemAddr -endFreeImem: +// Converts the segmented address in cmd_w1_dram to the corresponding physical address +segmented_to_physical: // 7 + srl $11, cmd_w1_dram, 22 // Copy (segment index << 2) into $11 + andi $11, $11, 0x3C // Clear the bottom 2 bits that remained during the shift + lw $11, (segmentTable)($11) // Get the current address of the segment + sll cmd_w1_dram, cmd_w1_dram, 8 // Shift the address to the left so that the top 8 bits are shifted out + srl cmd_w1_dram, cmd_w1_dram, 8 // Shift the address back to the right, resulting in the original with the top 8 bits cleared + jr $ra + add cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the virtual address load_overlay_0_and_enter: - li postOvlRA, 0x1000 // Sets up return address - li cmd_w1_dram, orga(ovl0_start) // Sets up ovl0 table address -// To use these: set postOvlRA ($10) to the address to execute after the load is -// done, and set cmd_w1_dram to orga(your_overlay). + li nextRA, 0x1000 // Sets up return address + li cmd_w1_dram, orga(ovl0_start) // Sets up ovl0 table address load_overlays_0_1: li dmaLen, ovl01_end - 0x1000 - 1 j load_overlay_inner li dmemAddr, 0x1000 load_overlays_2_3_4: - addi postOvlRA, $ra, -8 // Got here with jal, but want to return to addr of jal itself + addi nextRA, $ra, -8 // Got here with jal, but want to return to addr of jal itself li dmaLen, ovl234_end - ovl234_start - 1 li dmemAddr, ovl234_start -load_overlay_inner: +load_overlay_inner: // dmaLen, dmemAddr, cmd_w1_dram, and nextRA must be set lw $11, OSTask + OSTask_ucode .if CFG_PROFILING_B addi perfCounterC, perfCounterC, 0x4000 // Increment overlay (all 0-4) load count .endif -.if CFG_PROFILING_C - mfc0 ovlInitClock, DPC_CLOCK // see below -.endif - jal shared_dma_read_write // If CFG_PROFILING_C, use the one without perfCounterD +.if !CFG_PROFILING_C + j dma_and_wait_goto_next_ra + add cmd_w1_dram, cmd_w1_dram, $11 +.else + // According to Tharo's testing, and in contradiction to the manual, almost no + // instructions are issued while an IMEM DMA is happening. So we have to time + // it using counters. + mfc0 ovlInitClock, DPC_CLOCK + jal shared_dma_read_write // The one without perfCounterD add cmd_w1_dram, cmd_w1_dram, $11 - move $ra, postOvlRA - // Fall through to while_wait_dma_busy -.if CFG_PROFILING_C -// ...except if profiling DMA time. According to Tharo's testing, and in contradiction -// to the manual, almost no instructions are issued while an IMEM DMA is happening. -// So we have to time it using counters. mfc0 $11, SP_DMA_BUSY @@while_dma_busy: bnez $11, @@while_dma_busy mfc0 $11, SP_DMA_BUSY mfc0 $11, DPC_CLOCK sub $11, $11, ovlInitClock - jr $ra + jr nextRA add perfCounterD, perfCounterD, $11 // Also, normal dma_read_write below can't be changed to insert perfCounterD due to @@ -2668,6 +2652,18 @@ dma_read_write: // $11 load in delay slot is harmless. .endif +endFreeImemAddr equ 0x1FC4 +startFreeImem: +.if . > endFreeImemAddr + .error "Out of IMEM space" +.endif +.org endFreeImemAddr +endFreeImem: + +wait_goto_next_ra: + move $ra, nextRA + // Fallthrough to while_wait_dma_busy + .if . != 0x1FC8 // This has to be at this address for boot and S2DEX compatibility .error "Error in organization of end of IMEM" @@ -2797,52 +2793,86 @@ ovl0_padded_end: .error "Automatic resizing for overlay 0 failed" .endif -// overlay 1 (0x178 bytes loaded into 0x1000) +// overlay 1 .headersize 0x00001000 - orga() ovl1_start: -G_DL_handler: - sll $2, cmd_w0, 15 // Shifts the push/nopush value to the sign bit -branch_dl: - lbu $1, displayListStackLength // Get the DL stack length - jal segmented_to_physical - add $3, taskDataPtr, inputBufferPos // Current DL pos to push on stack - bltz $2, call_ret_common // Nopush = branch = flag is set - move taskDataPtr, cmd_w1_dram // Set the new DL to the target display list - sw $3, (displayListStack)($1) - addi $1, $1, 4 // Increment the DL stack length -call_ret_common: - sb $zero, materialCullMode // This covers call, branch, return, and cull and branchZ successes - j displaylist_dma_with_count - sb $1, displayListStackLength - -G_MTX_handler: - // The lower 3 bits of G_MTX are, from LSb to MSb (0 value/1 value), - // matrix type (modelview/projection) - // load type (multiply/load) - // push type (nopush/push) - // In F3DEX2 (and by extension F3DZEX), G_MTX_PUSH is inverted, so 1 is nopush and 0 is push +G_MTX_handler: // 12 .if CFG_PROFILING_C addi perfCounterC, perfCounterC, 1 // Increment matrix count .endif - andi $11, cmd_w0, G_MTX_P_MV | G_MTX_NOPUSH_PUSH // Read the matrix type and push type flags into $11 - beqz $11, ovl234_clipmisc_entrypoint // Modelview and push: go to overlay for push - andi $2, cmd_w0, G_MTX_MUL_LOAD // Read the matrix load type into $2 (0 is multiply, 2 is load) + andi $11, cmd_w0, G_MTX_VP_M | G_MTX_NOPUSH_PUSH + beqz $11, ovl234_clipmisc_entrypoint // Model and push: go to overlay for push + sh $zero, mvpValid // and dirLightsXfrmValid load_mtx: - add $7, $7, $2 // Add the load type to the command byte in $7, selects the return address based on whether the matrix needs multiplying or just loading - sh $zero, mvpValid // and dirLightsXfrmValid -G_MOVEMEM_handler: + andi $1, cmd_w0, G_MTX_MUL_LOAD // Read the matrix load type into $1 (2 is multiply, 0 is load) +G_MOVEMEM_handler: // Otherwise $1 is 0 jal segmented_to_physical // convert the memory address cmd_w1_dram to a virtual one do_movemem: - andi $1, cmd_w0, 0x00FE // Move the movemem table index into $1 (bits 1-7 of the first command word) - lbu dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Move the second byte of the first command word into dmaLen - lhu dmemAddr, (movememTable)($1) // Load the address of the memory location for the given movemem index - srl $2, cmd_w0, 5 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0 - lh $ra, (movememHandlerTable - (G_POPMTX | 0xFF00))($7) // Loads the return address from movememHandlerTable based on command byte + // 0: load M, 2: mul M -> load temp, 4: load VP, 6: mul VP -> load temp + andi $3, cmd_w0, 0x00FE // Movemem table index into $1 (bits 1-7 of the word 0) + lbu dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Second byte of word 0 + lhu dmemAddr, (movememTable)($3) // $3 reused in G_MTX_multiply_end + srl $2, cmd_w0, 5 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0 + add dmemAddr, dmemAddr, $2 + j dma_and_wait_goto_next_ra + lh nextRA, (afterMovememRaTable)($1) // $1 is 2 if mtx multiply, else 0 + +G_FLUSH_handler: // 32 + jal flush_rdp_buffer // Flush once to push partial DMEM buf to FIFO + sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1 // Prereq; offset buffer fullness + // If the DMEM buffer was empty, dmemAddr will be unchanged and valid for this next + // jump. Otherwise, running the DMA write will cause dmemAddr to get set to a large + // negative number. Then for this second jump, the same codepath will be triggered as + // if the buffer was empty. The result is it will wait for the DMA to finish, set + // DPC_END, and return to $ra. This is why the dmemAddr register (as opposed to, + // for example, dmaLen) is used as the DMEM buf fullness. + j flush_rdp_buffer +G_MTX_multiply_end: + li $ra, run_next_DL_command // Dual use for above and below + lhu $3, (movememTable - G_MV_TEMPMTX0)($3) // $3=2->0=M; $3=6->4=VP + move $2, $3 // Input 0 = output +mtx_multiply: + // $2 and dmemAddr are input matrices; $3 is output matrix + addi $10, dmemAddr, 0x0018 +@@loop: + vmadn $v7, $v31, $v31[2] // 0 + addi $11, dmemAddr, 0x0008 + vmadh $v6, $v31, $v31[2] // 0 + addi $2, $2, -0x0020 + vmudh $v29, $v31, $v31[2] // 0 +@@innerloop: + ldv $v3[0], 0x0040($2) + ldv $v3[8], 0x0040($2) + lqv vTemp2[0], 0x0020(dmemAddr) // Input 1 + ldv $v2[0], 0x0020($2) + ldv $v2[8], 0x0020($2) + lqv vTemp1[0], 0x0000(dmemAddr) // Input 1 + vmadl $v29, $v3, vTemp2[0h] + addi dmemAddr, dmemAddr, 0x0002 + vmadm $v29, $v2, vTemp2[0h] + addi $2, $2, 0x0008 // Increment input 0 pointer + vmadn $v5, $v3, vTemp1[0h] + bne dmemAddr, $11, @@innerloop + vmadh $v4, $v2, vTemp1[0h] + bne dmemAddr, $10, @@loop + addi dmemAddr, dmemAddr, 0x0008 + sqv $v7[0], (0x0020)($3) + sqv $v6[0], (0x0000)($3) + sqv $v4[0], (0x0010)($3) + jr $ra + sqv $v5[0], (0x0030)($3) + +G_VTX_handler: // 19 + lhu dmemAddr, (vertexTable)(cmd_w0) // (v0 + n) end address; up to 56 inclusive + jal segmented_to_physical // Convert address in cmd_w1_dram to physical + lhu vtxLeft, (inputBufferEnd - 0x07)(inputBufferPos) // vtxLeft = size in bytes = vtx count * 0x10 + sub dmemAddr, dmemAddr, vtxLeft // Start addr = end addr - size. Rounded down to DMA word by H/W + addi dmaLen, vtxLeft, -1 // DMA length is always offset by -1 j dma_read_write G_SETOTHERMODE_H_handler: // These handler labels must be 4 bytes apart for the code below to work - add dmemAddr, dmemAddr, $2 // This is for the code above, does nothing for G_SETOTHERMODE_H + li $ra, vtx_after_dma // Only for above, nop for below G_SETOTHERMODE_L_handler: lw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11) // resolves to othermode0 or othermode1 based on which handler was jumped to lui $2, 0x8000 @@ -2857,21 +2887,21 @@ G_SETOTHERMODE_L_handler: j G_RDP_handler lw cmd_w1_dram, otherMode1 -G_RDPSETOTHERMODE_handler: +G_RDPSETOTHERMODE_handler: // 4 li $1, 8 // Offset from scissor DMEM to othermode DMEM G_SETSCISSOR_handler: // $1 is 0 if jumped here sw cmd_w0, (scissorUpLeft)($1) // otherMode0 = scissorUpLeft + 8 j G_RDP_handler // Send the command to the RDP sw cmd_w1_dram, (scissorBottomRight)($1) // otherMode1 = scissorBottomRight + 8 -G_GEOMETRYMODE_handler: // $7 = G_GEOMETRYMODE (as negative) if jumped here +G_GEOMETRYMODE_handler: // 5; $7 = G_GEOMETRYMODE (as negative) if jumped here lw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value and $11, $11, cmd_w0 // clears the flags in cmd_w0 (set in g*SPClearGeometryMode) or $11, $11, cmd_w1_dram // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode) j run_next_DL_command // run the next DL command sw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // update the geometry mode value -G_TEXTURE_handler: +G_TEXTURE_handler: // 4 li $11, textureSettings1 - (texrectWord1 - G_TEXRECTFLIP_handler) // Calculate the offset from texrectWord1 and $11 for saving to textureSettings G_TEXRECT_handler: // $11 contains address of handler G_TEXRECTFLIP_handler: @@ -2882,7 +2912,7 @@ G_RDPHALF_1_handler: // Stores second command word into textureSettings for gSPTexture, 0x00D4 for gSPTextureRectangle/Flip, 0x00D8 for G_RDPHALF_1 sw cmd_w1_dram, (texrectWord2 - G_TEXRECTFLIP_handler)($11) -G_RDPHALF_2_handler: +G_RDPHALF_2_handler: // 7 ldv $v29[0], (texrectWord1)($zero) lw cmd_w0, rdpHalf1Val // load the RDPHALF1 value into w0 addi rdpCmdBufPtr, rdpCmdBufPtr, 8 @@ -2893,77 +2923,33 @@ G_RDPHALF_2_handler: j G_RDP_handler sdv $v29[0], -8(rdpCmdBufPtr) -// Converts the segmented address in cmd_w1_dram to the corresponding physical address -segmented_to_physical: - srl $11, cmd_w1_dram, 22 // Copy (segment index << 2) into $11 - andi $11, $11, 0x3C // Clear the bottom 2 bits that remained during the shift - lw $11, (segmentTable)($11) // Get the current address of the segment - sll cmd_w1_dram, cmd_w1_dram, 8 // Shift the address to the left so that the top 8 bits are shifted out - srl cmd_w1_dram, cmd_w1_dram, 8 // Shift the address back to the right, resulting in the original with the top 8 bits cleared - jr $ra - add cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the virtual address - -G_SETxIMG_handler: - lb $3, materialCullMode // Get current mode - jal segmented_to_physical // Convert image to physical address - lw $2, lastMatDLPhyAddr // Get last material physical addr - bnez $3, G_RDP_handler // If not in normal mode (0), exit - add $10, taskDataPtr, inputBufferPos // Current material physical addr - beq $10, $2, @@skip // Branch if we are executing the same mat again - sw $10, lastMatDLPhyAddr // Store material physical addr - li $7, 1 // > 0: in material first time -@@skip: // Otherwise $7 was < 0: cull mode (in mat second time) - j G_RDP_handler - sb $7, materialCullMode - -G_FLUSH_handler: - jal flush_rdp_buffer // Flush once to push partial DMEM buf to FIFO - sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1 // Prereq; offset buffer fullness - // If the DMEM buffer was empty, dmemAddr will be unchanged and valid for this next - // jump. Otherwise, running the DMA write will cause dmemAddr to get set to a large - // negative number. Then for this second jump, the same codepath will be triggered as - // if the buffer was empty. The result is it will wait for the DMA to finish, set - // DPC_END, and return to $ra. This is why the dmemAddr register (as opposed to, - // for example, dmaLen) is used as the DMEM buf fullness. - j flush_rdp_buffer - li $ra, run_next_DL_command - -G_CULLDL_handler: - lhu $10, (vertexTable)(cmd_w0) // Start vtx addr - lhu $3, (vertexTable)(cmd_w1_dram) // End vertex - /* - CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1 - verts which are behind the occlusion plane, and 1 vert which is behind the camera - plane and therefore randomly erroneously also set as behind the occlusion plane. - However, the convex hull of all the verts goes through visible area. This will be - incorrectly culled here. We can't afford the extra few instructions to disable - the occlusion plane if the vert is behind the camera, because this only matters for - G_CULLDL and not for tris. - */ - li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) - lhu $11, VTX_CLIP($10) -culldl_loop: - and $1, $1, $11 - beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render - lhu $11, (vtxSize + VTX_CLIP)($10) // next vertex clip flags - bne $10, $3, culldl_loop // loop until reaching the last vertex - addi $10, $10, vtxSize // advance to the next vertex - li cmd_w0, 0 // Clear count of DL cmds to skip loading -G_ENDDL_handler: - lbu $1, displayListStackLength // Load the DL stack index; if end stack, - beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end - addi $1, $1, -4 // Decrement the DL stack index - j call_ret_common // has a different version in ovl1 - lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to - +G_RELSEGMENT_handler: // 9 + jal segmented_to_physical // Resolve new segment address relative to existing segment +G_MOVEWORD_handler: + srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT) + lhu $10, (movewordTable - (G_MOVEWORD << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304) +do_moveword: + sll $11, cmd_w0, 16 // Sign bit = upper bit of offset + add $10, $10, cmd_w0 // Offset + base; only lower 12 bits matter + bltz $11, run_next_DL_command // If upper bit of offset is set, exit after halfword + sh cmd_w1_dram, ($10) // Store value from cmd into halfword + j run_next_DL_command + sw cmd_w1_dram, ($10) // Store value from cmd into word (offset + moveword_table[index]) ovl1_end: -.align 8 +align_with_warning 8, "One instruction of padding at end of ovl1" ovl1_padded_end: .if ovl1_padded_end > ovl01_end .error "Automatic resizing for overlay 1 failed" .endif +// Currently want exactly 92 instructions (based on current size of start) +.if ovl1_padded_end > start_padded_end + warn_if_base "ovl1 is larger than start, try to move something out" +.endif +.if ovl1_padded_end < start_padded_end + warn_if_base "ovl1 is smaller than start, wasting space!" +.endif .headersize ovl234_start - orga() diff --git a/gbi.h b/gbi.h index e69f9fd..7fe0323 100644 --- a/gbi.h +++ b/gbi.h @@ -177,35 +177,51 @@ of warnings if you use -Wpedantic. */ /* See SPMatrix */ /** - * @brief specifies whether the matrix operation will be performed on the projection or the model view matrix. - * + * @brief Specifies whether the matrix operation will be performed on the + * model or the view*projection matrix. */ -#define G_MTX_MODELVIEW 0x00 /* matrix types */ +#define G_MTX_MODEL 0x00 /** - * @brief @copybrief G_MTX_MODELVIEW - * + * @brief Equivalent to G_MTX_MODEL, for backwards compatibility. The view + * matrix used to be put in the same stack as the model matrix, whereas now it + * should be multiplied with the projection matrix. In SM64, this is called + * "mat stack fix"; in OoT, the vanilla game already does this. */ -#define G_MTX_PROJECTION 0x04 +#define G_MTX_MODELVIEW G_MTX_MODEL /** - * @brief concatenates the matrix (m) with the top of the matrix stack. - * + * @brief @copybrief G_MTX_MODEL */ -#define G_MTX_MUL 0x00 /* concat or load */ +#define G_MTX_VIEWPROJECTION 0x04 /** - * @brief loads the matrix (m) onto the top of the matrix stack. - * + * @brief Equivalent to G_MTX_VIEWPROJECTION, @see G_MTX_MODELVIEW. */ -#define G_MTX_LOAD 0x02 +#define G_MTX_PROJECTION G_MTX_VIEWPROJECTION /** - * @brief specifies do not push the matrix stack prior to matrix operations - * + * @brief Multiplies the incoming matrix into the top of the matrix stack. + * @note The binary encoding of this bit is flipped in SPMatrix to save a RSP + * instruction. This is new in F3DEX3. */ -#define G_MTX_NOPUSH 0x00 /* push or not */ +#define G_MTX_MUL 0x00 /** - * @brief specifies push the matrix stack prior to matrix operations - * + * @brief Replaces the top of the matrix stack with the incoming matrix. + * @note The binary encoding of this bit is flipped in SPMatrix to save a RSP + * instruction. This is new in F3DEX3. */ -#define G_MTX_PUSH 0x01 +#define G_MTX_LOAD 0x02 +/** + * @brief Do not push the top of the matrix stack to DRAM prior to matrix + * operations. + * @note The binary encoding of this bit is flipped in SPMatrix to save a RSP + * instruction. This is true in both F3DEX2 and F3DEX3. + */ +#define G_MTX_NOPUSH 0x00 +/** + * @brief Push the top of the matrix stack to DRAM prior to matrix operations. + * This is not supported for G_MTX_VIEWPROJECTION, only G_MTX_MODEL. + * @note The binary encoding of this bit is flipped in SPMatrix to save a RSP + * instruction. This is true in both F3DEX2 and F3DEX3. + */ +#define G_MTX_PUSH 0x01 /* See SPAlphaCompareCull */ #define G_ALPHA_COMPARE_CULL_DISABLE 0 @@ -217,15 +233,15 @@ of warnings if you use -Wpedantic. */ * Each of these indexes an entry in a dmem table which points to an arbitrarily * sized block of dmem in which to store the result of a DMA. */ -#define G_MV_TEMPMTX0 0 /* for internal use by G_MTX multiply mode */ -#define G_MV_MMTX 2 -#define G_MV_TEMPMTX1 4 /* for internal use by G_MTX multiply mode */ -#define G_MV_VPMTX 6 +#define G_MV_MMTX 0 +#define G_MV_TEMPMTX0 2 /* for internal use by G_MTX multiply mode */ +#define G_MV_VPMTX 4 +#define G_MV_TEMPMTX1 6 /* for internal use by G_MTX multiply mode */ #define G_MV_VIEWPORT 8 #define G_MV_LIGHT 10 /* G_MV_POINT is no longer supported because the internal vertex format is no longer a multiple of 8 (DMA word). This was not used in any command anyway. */ -/* G_MV_MATRIX is no longer supported because there is no MVP matrix in F3DEX3. */ +/* G_MV_MATRIX is no longer supported. */ #define G_MV_PMTX G_MV_VPMTX /* backwards compatibility */ /* @@ -239,7 +255,7 @@ longer a multiple of 8 (DMA word). This was not used in any command anyway. */ #define G_MW_SEGMENT 0x06 #define G_MW_FOG 0x08 #define G_MW_LIGHTCOL 0x0A -/* G_MW_FORCEMTX is no longer supported because there is no MVP matrix in F3DEX3. */ +/* G_MW_FORCEMTX is no longer supported. */ /* G_MW_PERSPNORM is removed; perspective norm is now set via G_MW_FX. */ #define G_MW_HALFWORD_FLAG 0x8000 /* indicates store 2 bytes instead of 4 */ @@ -2123,16 +2139,36 @@ _DW({ \ /** * @brief macro which inserts a matrix operation at the end display list. * - * It inserts a matrix operation in the display list. The parameters allow you to select which matrix stack to use (projection or model view), where to load or concatenate, and whether or not to push the matrix stack. The following parameters are bit OR'ed together: - * - @ref G_MTX_PROJECTION @ref G_MTX_MODELVIEW - @copybrief G_MTX_MODELVIEW + * It inserts a matrix operation in the display list. The parameters allow you + * to select which matrix stack to use (projection or model view), whether to + * load or multiply, and whether or not to push the matrix stack. The following + * parameters are bitwise OR'ed together: + * - @ref G_MTX_MODEL - @copybrief G_MTX_MODEL + * - @ref G_MTX_VIEWPROJECTION - @copybrief G_MTX_VIEWPROJECTION * - @ref G_MTX_MUL - @copybrief G_MTX_MUL * - @ref G_MTX_LOAD - @copybrief G_MTX_LOAD * - @ref G_MTX_NOPUSH - @copybrief G_MTX_NOPUSH * - @ref G_MTX_PUSH - @copybrief G_MTX_PUSH - * # Matrix Format - * The format of the fixed-point matrices may seem a little awkward to the application programmer because it is optimized for the RSP geometry engine. This unusual format is hidden in the graphics utility libraries and not usually exposed to the application programmer, but in some cases (static matrix declarations or direct element manipulation) it is necessary to understand the format. * - * The integer and fractional components of the matrix elements are separated. The first 8 words (16 shorts) hold the 16-bit integer elements, the second 8 words (16 shorts) hold the 16-bit fractional elements. The fact that the Mtx type is declared as a long [4][4] array is slightly misleading. For example, to declare a static identity matrix, use code similar to this: + * The legacy parameters @ref G_MTX_MODELVIEW and @ref G_MTX_PROJECTION are also + * supported, but in F3DEX3 you should always multiply the view matrix with the + * projection matrix as G_MTX_VIEWPROJECTION, and only put model matrices in the + * G_MTX_MODEL stack. + * + * # Matrix Format + * + * The format of the fixed-point matrices may seem a little awkward to the + * application programmer because it is optimized for the RSP geometry engine. + * This unusual format is hidden in the graphics utility libraries and not + * usually exposed to the application programmer, but in some cases (static + * matrix declarations or direct element manipulation) it is necessary to + * understand the format. + * + * The integer and fractional components of the matrix elements are separated. + * The first 8 words (16 shorts) hold the 16-bit integer elements, the second + * 8 words (16 shorts) hold the 16-bit fractional elements. The fact that the + * Mtx type is declared as a long [4][4] array is slightly misleading. For + * example, to declare a static identity matrix, use code similar to this: * ```#include "gbi.h" * static Mtx ident = * { @@ -2149,7 +2185,8 @@ _DW({ \ * 0x00000000, 0x00000000, * }; * ``` - * To force the translation elements of a matrix to be (10.5, 20.5, 30.5), use code similar to this: + * To force the translation elements of a matrix to be (10.5, 20.5, 30.5), use + * code similar to this: * ``` * #include "gbi.h" * @@ -2163,70 +2200,113 @@ _DW({ \ * mat.m[3][3] = * (0x8000 << 16) | (0); * ``` - * @note - * Matrix concatenation in the RSP geometry engine is done using 32-bit integer arithmetic. A 32 x 32 bit multiply results in a 64-bit number. Only the middle 32 bits of this 64-bit result are kept for the new matrix. Therefore, when concatenating matrices, remember about the resulting fixed-point numerical error. + * + * # Accuracy * - * For example, to retain maximum precision, the number ranges must be similar. Large-scale and translate parameters can decrease the transformation precision. Because rotation and projection matrices require quite a bit of fractional accuracy, these fractions may get tossed out if multiplied against large integer numbers. + * Matrix multiplication in the RSP geometry engine is done using 32-bit integer + * arithmetic, in s15.16 format (16 integer, 16 fractional bits, in other words + * representing -32768.0 to 32767.999985 with a resolution of about 0.000015). + * A 32 x 32 bit multiply results in a 64-bit number. Only the middle 32 bits of + * this 64-bit result are kept for the new matrix, to preserve the s15.16 + * format. * - * Each concatenation results in the rounding of the LSB of each matrix term. This means that each concatenation injects 1/2 LSB of error into the matrix. To keep full precision, concatenate matrices in floating-point on the processor and just load the result into the RSP. + * A typical game object's transformation will have a scale around the range of + * 1/100, a rotation which is always values -1.0 to 1.0, and a translation + * around the range of 1000. When producing a final transformation matrix, you + * will typically compose (multiply) one scale, multiple rotations (for limbs), + * and finally one translation. Each matrix multiply on the RSP will lose + * precision, especially if the scale has been applied before the rotations. + * + * Therefore, your game should usually maintain a matrix stack on the CPU in + * floating point, and once you have a final model matrix for each limb / + * object, convert it to fixed point and load it to the RSP. Occasional uses of + * G_MTX_MUL, such as multiplying view * projection or for HUD elements, are + * okay. Both SM64 and OoT already operate this way. * * # Performance - * Each @ref G_MTX_MODELVIEW matrix operation has an implicit matrix multiplication even if you specify @ref G_MTX_LOAD. This is the combined model view (M) and projection (P) matrix that is necessary for the vertex transformation to use a single matrix during transformation. * - * You can optimize this by concatenating modeling matrices on the CPU and then putting the viewing (V) and projection matrices on the projection stack. By doing this, you only incur the single MxVP matrix concatenation each time you load a modeling matrix. Furthermore, the application has more information on how to do a cheap hack for modeling matrix concatenation. For example, if you want to combine a single axis rotation with a translation, just place the coefficients in the correct entries of the resulting matrix. + * Your game generally should not use G_MTX_PUSH or SPPopMatrix*, even in a + * scene graph style engine like SM64. * - * @param m is the pointer to the 4x4 fixed-point matrix (see note below about format) - * @param p are the bit OR'd parameters to the matrix macro (@ref G_MTX_PROJECTION, @ref G_MTX_MODELVIEW, @ref G_MTX_MUL, @ref G_MTX_LOAD, @ref G_MTX_NOPUSH) + * If you have taken the advice above to just compute and upload final model + * matrices, there is no need to use push or pop--you'll always just do a single + * load before rendering any model. If you need to return to a previous + * transformation matrix, just upload that already-computed matrix again. Again, + * both SM64 and OoT already do this. + * + * In F3DEX3, the code for G_MTX_PUSH and SPPopMatrix* is moved to overlay 3, + * meaning these operations will be slower on average than in F3DEX2. + * + * @param m is the pointer to the 4x4 fixed-point matrix (see note above about + * format) + * @param p are the bit OR'd parameters to the matrix macro + * (@ref G_MTX_MODEL, @ref G_MTX_VIEWPROJECTION, @ref G_MTX_MUL, + * @ref G_MTX_LOAD, @ref G_MTX_NOPUSH, @ref G_MTX_PUSH) + * + * @note The binary encoding for this command inverts both G_MTX_PUSH and + * G_MTX_LOAD. F3DEX2 already inverted G_MTX_PUSH, but the inversion of + * G_MTX_LOAD is new in F3DEX3. No C source level changes are needed due to + * these inversions, it's just a binary encoding change. + * + * @note G_MTX_PUSH | G_MTX_VIEWPROJECTION is not supported; the behavior will + * be that G_MTX_PUSH is ignored in this case. + * + * @note Unlike the display list stack, which is kept in DMEM and is 18 deep, + * the matrix stack is kept in RDRAM and is of no specified size. It is of + * whatever size the developer chooses to allocate; there is no bounds checking. */ #define gSPMatrix(pkt, m, p) \ - gDma2p((pkt),G_MTX, (m), sizeof(Mtx), (p) ^ G_MTX_PUSH, 0) + gDma2p((pkt),G_MTX, (m), sizeof(Mtx), (p) ^ G_MTX_PUSH ^ G_MTX_LOAD, 0) /** * @brief macro which inserts a matrix operation in a static display list. * * @copydetails gSPMatrix */ #define gsSPMatrix(m, p) \ - gsDma2p( G_MTX, (m), sizeof(Mtx), (p) ^ G_MTX_PUSH, 0) + gsDma2p( G_MTX, (m), sizeof(Mtx), (p) ^ G_MTX_PUSH ^ G_MTX_LOAD, 0) /** - * @brief macro which pops one of the matrix stacks at the end display list. + * @brief macro which pops multiple matrices from a matrix stack. * - * It pops `num` of the matrix stacks. The model view stack can be up to 10 matrices deep. The projection stack is 1 matrix deep, so it cannot be popped. + * It pops `num` matrices from the stack. * - * @note - * If the stack is empty, the macro is ignored. + * @note If the number of matrices to pop is greater than the number of matrices + * currently on the stack, the stack ends up validly holding 0 matrices. This is + * a rare case of "exception" handling in the microcode. Perhaps SGI's intention + * was to allow for resetting the matrix stack by popping >= 10 matrices at + * once. * - * @param n is the flag field that identifies which matrix stack to pop: - * - @ref G_MTX_MODELVIEW pops the modeling/viewing matrix stack - * - @ref G_MTX_PROJECTION pops the projection matrix stack (NOT IMPLEMENTED) + * @param mtx is the flag field that identifies which matrix stack to pop: + * - @ref G_MTX_MODEL pops from the model matrix stack + * - @ref G_MTX_VIEWPROJECTION pops from the view*projection matrix stack; this + * is not supposed to be supported but actually kind of is. The model matrix + * stack pointer is reduced by the number of matrices specified here, and then + * the resulting matrix is loaded into the view*projection matrix. * @param num is the number of matrices to pop */ -#define gSPPopMatrixN(pkt, n, num) gDma2p((pkt), G_POPMTX, (num) * 64, 64, 2, 0) +#define gSPPopMatrixN(pkt, mtx, num) \ + gDma2p((pkt), G_POPMTX, (num) * 64, 64, (mtx) + G_MV_MMTX, 0) /** - * @brief macro which pops one of the matrix stacks in a static display list. + * @brief macro which pops multiple matrices from a matrix stack. * * @copydetails gSPPopMatrixN */ -#define gsSPPopMatrixN(n, num) gsDma2p( G_POPMTX, (num) * 64, 64, 2, 0) +#define gsSPPopMatrixN(mtx, num) \ + gsDma2p( G_POPMTX, (num) * 64, 64, (mtx) + G_MV_MMTX, 0) /** - * @brief macro which pops one of the matrix stacks at the end display list. + * @brief macro which pops one matrix from a matrix stack in a static display list. * - * It pops one of the matrix stacks. The model view stack can be up to 10 matrices deep. The projection stack is 1 matrix deep, so it cannot be popped. + * This is just SPPopMatrixN with num=1: * - * @note - * If the stack is empty, the macro is ignored. - * - * @param n is the flag field that identifies which matrix stack to pop: - * - @ref G_MTX_MODELVIEW pops the modeling/viewing matrix stack - * - @ref G_MTX_PROJECTION pops the projection matrix stack (NOT IMPLEMENTED) + * @copydetails gSPPopMatrixN */ -#define gSPPopMatrix(pkt, n) gSPPopMatrixN((pkt), (n), 1) +#define gSPPopMatrix(pkt, mtx) gSPPopMatrixN((pkt), (mtx), 1) /** - * @brief macro which pops one of the matrix stacks in a static display list. + * @brief macro which pops one matrix from a matrix stack in a static display list. * * @copydetails gSPPopMatrix */ -#define gsSPPopMatrix(n) gsSPPopMatrixN( (n), 1) +#define gsSPPopMatrix(mtx) gsSPPopMatrixN( (mtx), 1) /** * @brief macro which loads an internal vertex buffer in the RSP with points that are used by @ref gSP1Triangle macros to generate polygons at the end display list. diff --git a/rsp/gbi.inc b/rsp/gbi.inc index 1853258..19a0cf8 100644 --- a/rsp/gbi.inc +++ b/rsp/gbi.inc @@ -92,12 +92,15 @@ G_RELSEGMENT equ 0x0b G_BRANCH_Z equ G_BRANCH_WZ G_BRANCH_W equ G_BRANCH_WZ -G_MTX_P_MV equ 0x04 -G_MTX_MUL_LOAD equ 0x02 -G_MTX_NOPUSH_PUSH equ 0x01 +// Note that load and push are inverted in the GBI encoding. +G_MTX_VP_M equ 0x04 // Binary: set for VP, clear for M +G_MTX_MUL_LOAD equ 0x02 // Binary: set for mul, clear for load +G_MTX_NOPUSH_PUSH equ 0x01 // Binary: set for nopush, clear for push -G_MV_MMTX equ 0x02 -G_MV_PMTX equ 0x06 +G_MV_MMTX equ 0x00 +G_MV_TEMPMTX0 equ 0x02 +G_MV_VPMTX equ 0x04 +G_MV_TEMPMTX1 equ 0x06 G_MV_VIEWPORT equ 0x08 G_MV_LIGHT equ 0x0A