diff --git a/Makefile b/Makefile index a9449e4..213595f 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,8 @@ default: F3DEX3_BrZ F3DEX3_BrW # List of all compile-time options supported by the microcode source. ALL_OPTIONS := \ CFG_G_BRANCH_W \ - CFG_DEBUG_NORMALS + CFG_DEBUG_NORMALS \ + CFG_GCLK_SAMPLE ARMIPS ?= armips PARENT_OUTPUT_DIR ?= ./build diff --git a/README.md b/README.md index d8918ef..f384a34 100644 --- a/README.md +++ b/README.md @@ -40,9 +40,12 @@ you should expect crashes and graphical issues.** diffuse to **specular**. If enabled, the vertex normal for lighting is replaced with the reflection of the vertex-to-camera vector over the vertex normal. Also, a new size value for each light controls how large the light - reflection appears to be. This technique is lower fidelity than the vanilla - `hilite` system, as it is per-vertex rather than per-pixel, but it allows the - material to be textured normally. + reflection appears to be. This technique is lower fidelity in some ways than + the vanilla `hilite` system, as it is per-vertex rather than per-pixel, but it + allows the material to be textured normally. Plus, it supports all scene + lights (including point) with different dynamic colors, whereas the vanilla + system supports up to two directional lights and more than one dynamic color + is difficult. - New geometry mode bits `G_ATTROFFSET_ST_ENABLE` and `G_ATTROFFSET_Z_ENABLE` apply settable offsets to vertex ST (`SPAttrOffsetST`) and/or Z (`SPAttrOffsetZ`) values. These offsets are applied after their respective @@ -207,11 +210,11 @@ SM64 only: computes a matrix stack on the CPU and sends the final matrix for each object / limb to the RSP, rather than multiplying matrices on the RSP. OoT already usually does the former for precision / accuracy reasons and only uses - `G_MTX_MUL` in a couple places; it is okay to leave those. 
This change is - recommended because the `G_MTX_MUL` mode of `SPMatrix` has been moved to - Overlay 4 in F3DEX3 (see below), making it substantially slower than it was in - F3DEX2. It still functions the same though so you can use it if it's really - needed. + `G_MTX_MUL` in a couple places (e.g. view * perspective matrix); it is okay to + leave those. This change is recommended because the `G_MTX_MUL` mode of + `SPMatrix` has been moved to Overlay 4 in F3DEX3 (see below), making it + substantially slower than it was in F3DEX2. It still functions the same though + so you can use it if it's really needed. - Re-export as many display lists (scenes, objects, skeletons, etc.) as possible with fast64 set to F3DEX3 mode, to take advantage of the substantially larger vertex buffer, triangle packing commands, "hints" system, etc. @@ -445,7 +448,9 @@ the same goal with some extra benefits: - The amount of ambient occlusion in F3DEX3 can be set at runtime based on scene lighting, whereas the scaled normals approach is baked into the mesh. - F3DEX3 can have the vertex alpha affect ambient, directional, and point lights - by different amounts, which is not possible with scaled normals. + by different amounts, which is not possible with scaled normals. In fact, + scaled normals never affect the ambient light, contrary to the concept of + ambient occlusion. Furthermore, for partial HLE compatibility, the same mesh can have the ambient occlusion information encoded in both scaled normals and vertex alpha at the @@ -472,12 +477,14 @@ two tris, saving a substantial amount of DMEM. ### Obscure semantic differences from F3DEX2 that should never matter in practice -- `SPLoadUcode*` will corrupt RSP texture state previously set with `SPTexture`. - (In F3DEX2, it would be set to all zeros--texture disabled--after returning - from the other microcode, but in F3DEX3 it is set to garbage data.) 
After - returning from the other microcode but before drawing anything else, you must - execute `SPTexture` again. Normally, `SPTexture` is executed as part of every - material, so its state would not be relied on across microcode changes. +- `SPLoadUcode*` corrupts the current M inverse transpose matrix state. If using + `G_NORMALS_MODE_FAST`, this doesn't matter. If using `G_NORMALS_MODE_AUTO`, + you must send the M matrix to the RSP again after returning to F3DEX3 from the + other microcode (which would normally be done anyway when starting to draw the + next object). If using `G_NORMALS_MODE_MANUAL`, you must send the updated + M inverse transpose matrix to the RSP after returning to F3DEX3 from the other + microcode (which would normally be done anyway when starting to draw the next + object). - Changing fog settings--i.e. enabling or disabling `G_FOG` in the geometry mode or executing `SPFogFactor` or `SPFogPosition`--between loading verts and drawing tris with those verts will lead to incorrect fog values for those diff --git a/cpu/camera.c b/cpu/camera.c index 7fc1698..4d6a836 100644 --- a/cpu/camera.c +++ b/cpu/camera.c @@ -33,3 +33,14 @@ gSPCameraWorld(GFX, cameraWorldPos); /* The important part: in View_UpdateViewingMatrix */ View_SetCameraWorld(view->cameraWorldPosPtr, view); + +/* The same issue happens (in the vanilla game) for lookat vectors--they are +computed during the frame based on the camera position and direction, but +these are updated at the end of the frame so the lookat vectors are one frame +behind. You can see this when going into C-up next to an object with hilite +or env map-- it's wrong for one frame and then is fixed. You could solve +this by tracking all lookat vectors created during the frame and +updating them at the end as we did here. Another option is to only send +lookat once at the start of each frame (and update it at the end of the +frame), rather than once per object using it. 
This is not exactly the same +for objects not in the middle of the screen, but the difference is minor. */ diff --git a/cpu/counters.c b/cpu/counters.c index b9a793f..a4e177e 100644 --- a/cpu/counters.c +++ b/cpu/counters.c @@ -2,6 +2,9 @@ method of reading it will be the same for any other game. */ /* In variables.h with the ENABLE_SPEEDMETER section */ +extern volatile u32 gRSPGfxRDPWaitCycles; +extern volatile u16 gRSPGfxCommandsSampledGclkActive; +extern volatile u16 gRSPGfxCommandCount; extern volatile u16 gRSPGfxVertexCount; extern volatile u16 gRSPGfxTriDrawCount; extern volatile u32 gRSPGfxTriRequestCount; @@ -9,27 +12,46 @@ extern volatile u16 gRSPGfxRectCount; /* In sched.c somewhere before Sched_TaskComplete, or in some header */ typedef struct { + u32 rdpWaitCycles; + u16 commandsSampledGclkActive; + u16 commandCount; u16 vertexCount; u16 triDrawCount; u32 triRequestCount:18; u32 rectCount:14; -} F3DEX3PerfCounters; + u32 taskdataptr; /* Not a perf counter */ + u32 ucode; /* Not a perf counter */ +} F3DEX3YieldDataFooter; /* In the true codepath of Sched_TaskComplete: */ #ifdef ENABLE_SPEEDMETER /* Fetch number of primitives drawn from yield data */ if(task->list.t.type == M_GFXTASK){ - F3DEX3PerfCounters* counters = (F3DEX3PerfCounters*)( - (u8*)gGfxSPTaskYieldBuffer + OS_YIELD_DATA_SIZE - 0x10); - osInvalDCache(counters, sizeof(F3DEX3PerfCounters)); - gRSPGfxVertexCount = counters->vertexCount; - gRSPGfxTriDrawCount = counters->triDrawCount; - gRSPGfxTriRequestCount = counters->triRequestCount; - gRSPGfxRectCount = counters->rectCount; + F3DEX3YieldDataFooter* footer = (F3DEX3YieldDataFooter*)( + (u8*)gGfxSPTaskYieldBuffer + + OS_YIELD_DATA_SIZE - sizeof(F3DEX3YieldDataFooter)); + osInvalDCache(footer, sizeof(F3DEX3YieldDataFooter)); + gRSPGfxRDPWaitCycles = footer->rdpWaitCycles; + gRSPGfxCommandsSampledGclkActive = footer->commandsSampledGclkActive; + gRSPGfxCommandCount = footer->commandCount; + gRSPGfxVertexCount = footer->vertexCount; + 
gRSPGfxTriDrawCount = footer->triDrawCount; + gRSPGfxTriRequestCount = footer->triRequestCount; + gRSPGfxRectCount = footer->rectCount; } #endif /* In speed_meter.c */ +/* Number of cycles the RSP is waiting for space in the RDP FIFO in DRAM */ +volatile u32 gRSPGfxRDPWaitCycles; +/* If CFG_GCLK_SAMPLE is enabled, the "GCLK is alive" bit of the RDP status is +sampled once every time a display list command is started. This counts the +number of times that bit was 1. */ +volatile u16 gRSPGfxCommandsSampledGclkActive; +/* Number of display list commands the microcode processed. If CFG_GCLK_SAMPLE +is disabled, this will be zero, so be careful about dividing the GCLK-active +sample count above by this. */ +volatile u16 gRSPGfxCommandCount; /* Number of vertices processed by the RSP */ volatile u16 gRSPGfxVertexCount; /* Number of tris actually drawn, after clipping and all types of culling */ diff --git a/f3dex3.s b/f3dex3.s index 99082d8..325629e 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -460,16 +460,15 @@ clipPoly2: // \ / \ / \ vertexBuffer: .skip (G_MAX_VERTS * vtxSize) -YIELD_DATA_FOOTER_SIZE equ 0x10 -yieldDataFooter equ OS_YIELD_DATA_SIZE - YIELD_DATA_FOOTER_SIZE - .if . 
> yieldDataFooter // OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved; the last data in that is // the footer, organized as: - // +0: perfCounter1: Upper 16 bits: num verts; lower 16 bits: num tris sent to RDP - // +4: perfCounter2: Upper 18 bits: num tris requested; lower 14 bits: num tex/fill rects - // +8: taskDataPtr - // +C: ucode + // YDF_OFFSET_RDPWAITCYC : Approx. RSP cycles spent waiting because the RDP FIFO was full + // YDF_OFFSET_GCLKSAMPLE : Upper 16 bits: num samples where "GCLK is alive" was 1; lower 16 bits: num DL cmds (= num samples taken) + // YDF_OFFSET_PERFCOUNTER1: Upper 16 bits: num verts; lower 16 bits: num tris sent to RDP + // YDF_OFFSET_PERFCOUNTER2: Upper 18 bits: num tris requested; lower 14 bits: num tex/fill rects + // YDF_OFFSET_TASKDATAPTR : taskDataPtr + // YDF_OFFSET_UCODE : ucode // So, any data starting from the address of this footer will be clobbered, // so the vertex buffer and other data which needs to be save across yield // can't extend here. (The input buffer will be reloaded from the next @@ -592,6 +591,7 @@ vZero equ $v0 // all elements = 0 // Global and semi-global (i.e. 
one main function + occasional local) scalar regs: // $zero // Hardwired zero scalar register +gclkSample equ $10 // RDP GCLK sampling counter altBaseReg equ $13 // Alternate base address register for vector loads inputVtxPos equ $14 // Pointer to loaded vertex to transform outputVtxPos equ $15 // Pointer to vertex buffer to store transformed verts @@ -607,6 +607,7 @@ taskDataPtr equ $26 // Task data (display list) DRAM pointer inputBufferPos equ $27 // DMEM position within display list input buffer, relative to end perfCounter1 equ $28 // Upper 16 bits: num verts; lower 16 bits: num tris sent to RDP perfCounter2 equ $29 // Upper 18 bits: num tris requested; lower 14 bits: num tex/fill rects +rdpWaitCyc equ $30 // Approx. RSP cycles spent waiting because the RDP FIFO was full // $ra // Return address // Misc scalar regs: @@ -636,7 +637,7 @@ postOvlRA equ $12 // Commonly used locally // Overlay 4, local // $8: secondVtxPos, local // $9: curLight, clip mask during clipping, local -// $10: unused +// $10: gclkSample (global) // $11: very common local // $12: postOvlRA, local // $13: altBaseReg (global) @@ -657,7 +658,7 @@ postOvlRA equ $12 // Commonly used locally // $27: inputBufferPos (global) // $28: perfCounter1 (global) // $29: perfCounter2 (global) -// $30: unused +// $30: rdpWaitCyc (global) // $ra: Return address for jal, b*al // Initialization routines @@ -679,44 +680,57 @@ start: // This is at IMEM 0x1080, not the start of IMEM beqz $12, continue_from_os_task // If latter, load DL (task data) pointer from OSTask sw $zero, OSTask + OSTask_flags // Clear all task flags, incl. 
yielded continue_from_yield: - lw perfCounter1, yieldDataFooter + 0x0 // Perf counters saved here at yield - lw perfCounter2, yieldDataFooter + 0x4 + // Perf counters saved here at yield + lw rdpWaitCyc, yieldDataFooter + YDF_OFFSET_RDPWAITCYC + lw gclkSample, yieldDataFooter + YDF_OFFSET_GCLKSAMPLE + lw perfCounter1, yieldDataFooter + YDF_OFFSET_PERFCOUNTER1 + lw perfCounter2, yieldDataFooter + YDF_OFFSET_PERFCOUNTER2 j finish_setup - lw taskDataPtr, yieldDataFooter + 0x8 // load DL pointer from yield data + lw taskDataPtr, yieldDataFooter + YDF_OFFSET_TASKDATAPTR initialize_rdp: - mfc0 $11, DPC_STATUS - andi $11, $11, DPC_STATUS_XBUS_DMA - bnez $11, wait_dpc_start_valid - mfc0 $2, DPC_END - lw $3, OSTask + OSTask_output_buff - sub $11, $3, $2 - bgtz $11, wait_dpc_start_valid - mfc0 $1, DPC_CURRENT - lw $3, OSTask + OSTask_output_buff_size - beqz $1, wait_dpc_start_valid - sub $11, $1, $3 - bgez $11, wait_dpc_start_valid + mfc0 $11, DPC_STATUS // Read RDP status + andi $11, $11, DPC_STATUS_XBUS_DMA // Look at XBUS enabled bit + bnez $11, @@start_new_buf // If XBUS is enabled, start new buffer + mfc0 $2, DPC_END // Load RDP end pointer + lw $3, OSTask + OSTask_output_buff // Load start of FIFO + sub $11, $3, $2 // If start of FIFO > RDP end, + bgtz $11, @@start_new_buf // start new buffer + mfc0 $1, DPC_CURRENT // Load RDP current pointer + lw $3, OSTask + OSTask_output_buff_size // Load end of FIFO + beqz $1, @@start_new_buf // If RDP current pointer is 0, start new buffer + sub $11, $1, $3 // If RDP current > end of fifo, + bgez $11, @@start_new_buf // start new buffer nop - bne $1, $2, f3dzex_0000111C -wait_dpc_start_valid: - mfc0 $11, DPC_STATUS - andi $11, $11, DPC_STATUS_START_VALID - bnez $11, wait_dpc_start_valid - li $11, DPC_STATUS_CLR_XBUS - mtc0 $11, DPC_STATUS - lw $2, OSTask + OSTask_output_buff_size - mtc0 $2, DPC_START - mtc0 $2, DPC_END -f3dzex_0000111C: - sw $2, rdpFifoPos + bne $1, $2, @@continue_buffer // If RDP current != RDP end, keep current 
buffer +@@start_new_buf: + // There may be one buffer executing in the RDP, and another queued in the + // double-buffered start/end regs. Wait for the latter to be available + // (i.e. possibly one buffer executing, none waiting). + mfc0 $11, DPC_STATUS // Read RDP status + andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf + bnez $11, @@start_new_buf // Wait until double buffered start/end available + li $11, DPC_STATUS_CLR_XBUS // Bit to disable XBUS mode + mtc0 $11, DPC_STATUS // Set bit, disable XBUS + lw $2, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr) + // Set up the next buffer for the RDP to be zero size and at the end of the FIFO. + mtc0 $2, DPC_START // Set RDP start addr to end of FIFO + mtc0 $2, DPC_END // Set RDP end addr to end of FIFO +@@continue_buffer: + // If we jumped here, the RDP is currently executing from the middle of the FIFO. + // So we can just append commands to there and move the end pointer. + sw $2, rdpFifoPos // Set FIFO position to end of FIFO or RDP end lw $11, matrixStackPtr // Initialize matrix stack pointer from OSTask bnez $11, continue_from_os_task // if not yet initialized lw $11, OSTask + OSTask_dram_stack sw $11, matrixStackPtr continue_from_os_task: - lw perfCounter1, textureSettings1 // Counters stored here if jumped to different ucode - lw perfCounter2, textureSettings2 // If starting from scratch, these are zero + // Counters stored here if jumped to different ucode + // If starting from scratch, these are zero + lw rdpWaitCyc, mITMatrix + YDF_OFFSET_RDPWAITCYC + lw gclkSample, mITMatrix + YDF_OFFSET_GCLKSAMPLE + lw perfCounter1, mITMatrix + YDF_OFFSET_PERFCOUNTER1 + lw perfCounter2, mITMatrix + YDF_OFFSET_PERFCOUNTER2 lw taskDataPtr, OSTask + OSTask_data_ptr finish_setup: li inputBufferPos, 0 @@ -745,6 +759,9 @@ wait_for_dma_and_run_next_command: G_POPMTX_end: G_MOVEMEM_end: jal while_wait_dma_busy // wait for the DMA read to finish +.if CFG_GCLK_SAMPLE 
+G_LIGHTTORDP_handler: +.endif G_SPNOOP_handler: run_next_DL_command: mfc0 $1, SP_STATUS // load the status word into register $1 @@ -755,6 +772,13 @@ run_next_DL_command: sra $7, cmd_w0, 24 // extract DL command byte from command word lw cmd_w1_dram, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1_dram addi inputBufferPos, inputBufferPos, 0x0008 // increment the DL index by 2 words +.if CFG_GCLK_SAMPLE + mfc0 $11, DPC_STATUS + andi $11, $11, DPC_STATUS_GCLK_ALIVE // Sample whether GCLK is active now + sll $11, $11, 16 - 3 // move from bit 3 to bit 16 + addi $11, $11, 1 // 1 counts that we sampled + add gclkSample, gclkSample, $11 // Add both to the perf counter +.endif // $1 must remain zero // $7 must retain the command byte for load_mtx and overlay 4 stuff // $11 must contain the handler called for several handlers @@ -816,20 +840,6 @@ G_ENDDL_handler: j call_ret_common // has a different version in ovl1 lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to -G_SETxIMG_handler: - beqz $7, G_RDP_handler // Don't do any of this for G_NOOP - lb $3, materialCullMode // Get current mode - jal segmented_to_physical // Convert image to physical address - lw $2, lastMatDLPhyAddr // Get last material physical addr - bnez $3, G_RDP_handler // If not in normal mode (0), exit - add $12, taskDataPtr, inputBufferPos // Current material physical addr - beq $12, $2, @@skip // Branch if we are executing the same mat again - sw $12, lastMatDLPhyAddr // Store material physical addr - li $7, 1 // > 0: in material first time -@@skip: // Otherwise $7 was < 0: cull mode (in mat second time) - j G_RDP_handler - sb $7, materialCullMode - refine_cmd_further: addi $12, $7, -(0xFF00 | G_SETSCISSOR) // Relative to G_SETSCISSOR = 0 bltz $12, G_RDP_handler // G_RDPLOADSYNC through G_SETCONVERT @@ -853,33 +863,41 @@ check_rdp_buffer_full: sub $11, rdpCmdBufPtr, rdpCmdBufEndP1 bltz $11, return_routine // Return if rdpCmdBufPtr < end+1 i.e. 
ptr <= end flush_rdp_buffer: - mfc0 $12, SP_DMA_BUSY - lw cmd_w1_dram, rdpFifoPos - addi dmaLen, $11, RDP_CMD_BUFSIZE + 8 - bnez $12, flush_rdp_buffer - lw $12, OSTask + OSTask_output_buff_size - mtc0 cmd_w1_dram, DPC_END - add $11, cmd_w1_dram, dmaLen - sub $12, $12, $11 - bgez $12, f3dzex_000012A8 -@@await_start_valid: - mfc0 $11, DPC_STATUS - andi $11, $11, DPC_STATUS_START_VALID - bnez $11, @@await_start_valid - lw cmd_w1_dram, OSTask + OSTask_output_buff -f3dzex_00001298: - mfc0 $11, DPC_CURRENT - beq $11, cmd_w1_dram, f3dzex_00001298 - nop - mtc0 cmd_w1_dram, DPC_START -f3dzex_000012A8: - mfc0 $11, DPC_CURRENT - sub $11, $11, cmd_w1_dram - blez $11, f3dzex_000012BC - sub $11, $11, dmaLen - blez $11, f3dzex_000012A8 -f3dzex_000012BC: - add $11, cmd_w1_dram, dmaLen + mfc0 $12, SP_DMA_BUSY // Check if any DMA is in flight + lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write + addi dmaLen, $11, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy + bnez $12, flush_rdp_buffer // Wait until no DMAs are active + lw $12, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr) + mtc0 cmd_w1_dram, DPC_END // Set RDP to execute until FIFO end (buf pushed last time) + add $11, cmd_w1_dram, dmaLen // $11 = future FIFO pointer if we append this new buffer + sub $12, $12, $11 // $12 = FIFO end addr - future pointer + bgez $12, @@has_room // Branch if we can fit this +@@await_rdp_dblbuf_avail: + mfc0 $11, DPC_STATUS // Read RDP status + andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf + bnez $11, @@await_rdp_dblbuf_avail // Wait until double buffered start/end available + addi rdpWaitCyc, rdpWaitCyc, 7 // 4 instr + 2 after mfc + 1 taken branch + lw cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO +@@await_past_first_instr: + mfc0 $11, DPC_CURRENT // Load RDP current pointer + beq $11, cmd_w1_dram, @@await_past_first_instr // Wait until RDP moved past start + addi rdpWaitCyc, 
rdpWaitCyc, 6 // 3 instr + 2 after mfc + 1 taken branch + // Start was previously the start of the FIFO, unless this is the first buffer, + // in which case it was the end of the FIFO. Normally, when the RDP gets to end, if we + // have a new end value waiting (END_VALID), it'll load end but leave current. By + // setting start here, it will also load current with start. + mtc0 cmd_w1_dram, DPC_START // Set RDP start to start of FIFO +@@keep_waiting: + // This is here so we only count it when stalling below or on FIFO end codepath + addi rdpWaitCyc, rdpWaitCyc, 10 // 7 instr + 2 after mfc + 1 taken branch +@@has_room: + mfc0 $11, DPC_CURRENT // Load RDP current pointer + sub $11, $11, cmd_w1_dram // Current - current end (rdpFifoPos or start) + blez $11, @@copy_buffer // Current is behind or at current end, can do copy + sub $11, $11, dmaLen // If amount current is ahead of current end + blez $11, @@keep_waiting // is <= size of buffer to copy, keep waiting +@@copy_buffer: + add $11, cmd_w1_dram, dmaLen // New end is current end + buffer size sw $11, rdpFifoPos // Set up the DMA from DMEM to the RDP fifo in RDRAM addi dmaLen, dmaLen, -1 // subtract 1 from the length @@ -1911,8 +1929,11 @@ ovl0_start: sub $11, rdpCmdBufPtr, rdpCmdBufEndP1 addi $12, $11, (RDP_CMD_BUFSIZE + 8) - 1 // Does the current buffer contain anything? 
bgezal $12, flush_rdp_buffer // - 1 because there is no bgtzal instruction - sw perfCounter1, yieldDataFooter + 0x0 // Stored here for yield and done - sw perfCounter2, yieldDataFooter + 0x4 // otherwise this is temp memory + // Stored here for yield and done, otherwise this is temp memory + sw perfCounter1, yieldDataFooter + YDF_OFFSET_PERFCOUNTER1 + sw perfCounter2, yieldDataFooter + YDF_OFFSET_PERFCOUNTER2 + sw rdpWaitCyc, yieldDataFooter + YDF_OFFSET_RDPWAITCYC + sw gclkSample, yieldDataFooter + YDF_OFFSET_GCLKSAMPLE jal while_wait_dma_busy lw $24, rdpFifoPos bltz $1, task_done // $1 < 0 = Got to the end of the parent DL @@ -1923,8 +1944,12 @@ load_ucode: lw cmd_w1_dram, (inputBufferEnd - 0x04)(inputBufferPos) // word 1 = ucode code DRAM addr sw taskDataPtr, OSTask + OSTask_data_ptr // Store where we are in the DL sw cmd_w1_dram, OSTask + OSTask_ucode // Store pointer to new ucode about to execute - sw perfCounter1, textureSettings1 // Store counters in texture settings; first 0x180 of DMEM - sw perfCounter2, textureSettings2 // will be preserved in ucode swap AND if other ucode yields + // Store counters in mITMatrix; first 0x180 of DMEM will be preserved in ucode swap AND + // if other ucode yields + sw rdpWaitCyc, mITMatrix + YDF_OFFSET_RDPWAITCYC + sw gclkSample, mITMatrix + YDF_OFFSET_GCLKSAMPLE + sw perfCounter1, mITMatrix + YDF_OFFSET_PERFCOUNTER1 + sw perfCounter2, mITMatrix + YDF_OFFSET_PERFCOUNTER2 li dmemAddr, start // Beginning of overwritable part of IMEM jal dma_read_write // DMA DRAM read -> IMEM write li dmaLen, (while_wait_dma_busy - start) - 1 // End of overwritable part of IMEM @@ -1949,13 +1974,13 @@ task_yield: li dmemAddr, 0x8000 // 0, but negative = write li dmaLen, OS_YIELD_DATA_SIZE - 1 li $12, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals - sw taskDataPtr, yieldDataFooter + 0x8 // Save pointer to where in DL - sw $11, yieldDataFooter + 0xC + sw taskDataPtr, yieldDataFooter + YDF_OFFSET_TASKDATAPTR // Save pointer to where 
in DL + sw $11, yieldDataFooter + YDF_OFFSET_UCODE j dma_read_write li $ra, set_status_and_break task_done: - // Copy just the part of the yield data that has the perf counters. + // Copy just the yield data footer, which has the perf counters. lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr addi cmd_w1_dram, cmd_w1_dram, yieldDataFooter li dmemAddr, 0x8000 | yieldDataFooter // negative = write @@ -2093,6 +2118,21 @@ segmented_to_physical: jr $ra add cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the virtual address +G_SETxIMG_handler: + beqz $7, G_RDP_handler // Don't do any of this for G_NOOP + lb $3, materialCullMode // Get current mode + jal segmented_to_physical // Convert image to physical address + lw $2, lastMatDLPhyAddr // Get last material physical addr + bnez $3, G_RDP_handler // If not in normal mode (0), exit + add $12, taskDataPtr, inputBufferPos // Current material physical addr + beq $12, $2, @@skip // Branch if we are executing the same mat again + sw $12, lastMatDLPhyAddr // Store material physical addr + li $7, 1 // > 0: in material first time +@@skip: // Otherwise $7 was < 0: cull mode (in mat second time) + j G_RDP_handler + sb $7, materialCullMode + +.if !CFG_GCLK_SAMPLE G_LIGHTTORDP_handler: lbu $11, numLightsxSize // Ambient light lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size @@ -2103,6 +2143,7 @@ G_LIGHTTORDP_handler: sll $3, $3, 8 // Shift light RGB to upper 3 bytes and clear alpha byte j G_RDP_handler // Send to RDP or cmd_w1_dram, $3, $2 // Combine RGB and alpha in second word +.endif ovl1_end: .align 8 diff --git a/rsp/rsp_defs.inc b/rsp/rsp_defs.inc index 792c3a9..c4db626 100644 --- a/rsp/rsp_defs.inc +++ b/rsp/rsp_defs.inc @@ -4,6 +4,14 @@ .definelabel OSTask_addr, 0xFC0 OS_YIELD_DATA_SIZE equ 0xC00 +YIELD_DATA_FOOTER_SIZE equ 0x18 +yieldDataFooter equ OS_YIELD_DATA_SIZE - YIELD_DATA_FOOTER_SIZE +YDF_OFFSET_RDPWAITCYC equ 0x00 
+YDF_OFFSET_GCLKSAMPLE equ 0x04 +YDF_OFFSET_PERFCOUNTER1 equ 0x08 +YDF_OFFSET_PERFCOUNTER2 equ 0x0C +YDF_OFFSET_TASKDATAPTR equ 0x10 +YDF_OFFSET_UCODE equ 0x14 // OSTask data member offsets OSTask_type equ 0x0000 @@ -73,6 +81,7 @@ SP_STATUS_SIG7 equ 0x4000 // RDP Status read flags DPC_STATUS_XBUS_DMA equ 0x0001 +DPC_STATUS_GCLK_ALIVE equ 0x0008 DPC_STATUS_DMA_BUSY equ 0x0100 DPC_STATUS_END_VALID equ 0x0200 DPC_STATUS_START_VALID equ 0x0400