diff --git a/Makefile b/Makefile
index 352a6ec..b5a0569 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ ALL_OPTIONS := \
   CFG_GCLK_SAMPLE
 
 ARMIPS ?= armips
-PARENT_OUTPUT_DIR ?= ./build
+PARENT_OUTPUT_DIR ?= ../test
 ifeq ($(PARENT_OUTPUT_DIR),.)
   $(error Cannot build directly in repo directory; see Makefile for details.)
   # The problem is that we want to be able to have targets like F3DEX2_2.08,
@@ -138,6 +138,7 @@ DESCRIPTION := Will make you want to finally ditch HLE (G_BRANCH_W version)
 ID_STR := F3DEX3 by Sauraen & Nintendo, G_BRANCH_W version______________________
 # Add options you want here, e.g. CFG_GCLK_SAMPLE
 OPTIONS := \
+  CFG_GCLK_SAMPLE \
   CFG_G_BRANCH_W
 $(eval $(call ucode_rule))
 
diff --git a/f3dex3.s b/f3dex3.s
index 08ab5fa..ec6ee72 100644
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -53,6 +53,94 @@ ACC_LOWER equ 2
     vsar    dst, dst, dst[N]
 .endmacro
 
+.macro jumpTableEntry, addr
+    .dh addr & 0xFFFF
+.endmacro
+
+//
+// Profiling configurations. To make space for the profiling features, if any of
+// the profiling configurations are enabled, G_LIGHTTORDP and !G_SHADING_SMOOTH
+// are removed, i.e. G_LIGHTTORDP behaves as a no-op and all tris are smooth
+// shaded.
+//
+ENABLE_PROFILING equ 0
+COUNTER_A_UPPER_VERTEX_COUNT equ 0
+COUNTER_B_LOWER_CMD_COUNT equ 0
+COUNTER_C_FIFO_FULL equ 1
+NEED_START_COUNTER_DMEM equ 0
+
+// Config A TODO
+// perfCounterA:
+//     cycles RSP spent processing vertex commands (incl. vertex DMAs)
+// perfCounterB:
+//     upper 16 bits: fetched DL command count
+//     lower 16 bits: DL command count
+// perfCounterC:
+//     cycles RSP was stalled because RDP FIFO was full
+// perfCounterD:
+//     cycles RSP spent processing triangle commands (incl. buffer flushes)
+.if CFG_PROFILING_A
+ENABLE_PROFILING equ 1
+COUNTER_B_LOWER_CMD_COUNT equ 1
+NEED_START_COUNTER_DMEM equ 1
+.endif
+
+// Config B TODO
+// perfCounterA:
+//     upper 16 bits: vertex count
+//     lower 16 bits: lit vertex count
+// perfCounterB:
+//     upper 18 bits: small RDP command count (all RDP cmds except tris)
+//     lower 14 bits: clipped (input) tris count
+// perfCounterC:
+//     upper 18 bits: overlay (all 0-4) load count
+//     lower 14 bits: overlay 2 (lighting) load count TODO
+// perfCounterD:
+//     upper 18 bits: overlay 3 (clipping) load count TODO
+//     lower 14 bits: overlay 4 (misc) load count TODO
+.if CFG_PROFILING_B
+.if ENABLE_PROFILING
+.error "At most one CFG_PROFILING_ option can be enabled at a time"
+.endif
+ENABLE_PROFILING equ 1
+COUNTER_C_FIFO_FULL equ 0
+COUNTER_A_UPPER_VERTEX_COUNT equ 1
+.endif
+
+// Config C TODO
+// perfCounterA:
+//     cycles RSP believes it was running
+// perfCounterB:
+//     upper 16 bits: samples GCLK was alive (sampled once per DL command count)
+//     lower 16 bits: DL command count
+// perfCounterC:
+//     cycles RSP was stalled because RDP FIFO was full
+// perfCounterD:
+//     cycles RSP was stalled waiting for miscellaneous DMAs to finish
+.if CFG_PROFILING_C
+.if ENABLE_PROFILING
+.error "At most one CFG_PROFILING_ option can be enabled at a time"
+.endif
+ENABLE_PROFILING equ 1
+COUNTER_B_LOWER_CMD_COUNT equ 1
+NEED_START_COUNTER_DMEM equ 1
+.endif
+
+// Default (extra profiling disabled)
+// perfCounterA:
+//     upper 16 bits: vertex count
+//     lower 16 bits: RDP/out tri count
+// perfCounterB:
+//     upper 18 bits: RSP/in tri count
+//     lower 14 bits: tex/fill rect count
+// perfCounterC:
+//     cycles RSP was stalled because RDP FIFO was full
+// perfCounterD:
+//     unused/zero
+.if !ENABLE_PROFILING
+COUNTER_A_UPPER_VERTEX_COUNT equ 1
+.endif
+
 /*
 There are two different memory spaces for the overlays: (a) IMEM and (b) the
 microcode file (which, plus an offset, is also the location in DRAM).
@@ -96,10 +184,6 @@ Overlay 2
 Overlay 4
 */
 
-.macro jumpTableEntry, addr
-    .dh addr & 0xFFFF
-.endmacro
-
 // RSP DMEM
 .create DATA_FILE, 0x0000
 
@@ -362,6 +446,11 @@ normalsMode:
 lastMatDLPhyAddr:
     .dw 0
     
+.if NEED_START_COUNTER_DMEM
+startCounterTime:
+    .dw 0
+.endif
+    
 // Constants for clipping algorithm
 clipCondShifts:
     .db CLIP_SCAL_NY_SHIFT
@@ -462,14 +551,8 @@ vertexBuffer:
     .skip (G_MAX_VERTS * vtxSize)
 
 .if . > yieldDataFooter
-    // OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved; the last data in that is
-    // the footer, organized as:
-    // YDF_OFFSET_RDPWAITCYC  : How many loops we had to wait because the RDP FIFO was full
-    // YDF_OFFSET_GCLKSAMPLE  : Upper 16 bits: num times "GCLK is alive" was 1, out of:; lower 16 bits: num DL cmds
-    // YDF_OFFSET_PERFCOUNTER1: Upper 16 bits: num verts; lower 16 bits: num tris sent to RDP
-    // YDF_OFFSET_PERFCOUNTER2: Upper 18 bits: num tris requested; lower 14 bits: num tex/fill rects
-    // YDF_OFFSET_TASKDATAPTR : taskDataPtr
-    // YDF_OFFSET_UCODE       : ucode
+    // OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved. The last data in that is
+    // the footer, which contains four perf counters, taskDataPtr, and ucode.
     // So, any data starting from the address of this footer will be clobbered,
     // so the vertex buffer and other data which needs to be save across yield
     // can't extend here. (The input buffer will be reloaded from the next
@@ -592,7 +675,7 @@ vZero equ $v0  // all elements = 0
 
 // Global and semi-global (i.e. one main function + occasional local) scalar regs:
 //                 $zero // Hardwired zero scalar register
-gclkSample     equ $10   // RDP GCLK sampling counter
+perfCounterD   equ $12   // Performance counter D (functions depend on config)
 altBaseReg     equ $13   // Alternate base address register for vector loads
 inputVtxPos    equ $14   // Pointer to loaded vertex to transform
 outputVtxPos   equ $15   // Pointer to vertex buffer to store transformed verts
@@ -606,9 +689,9 @@ cmd_w1_dram    equ $24   // DL command word 1, which is also DMA DRAM addr
 cmd_w0         equ $25   // DL command word 0, also holds next tris info
 taskDataPtr    equ $26   // Task data (display list) DRAM pointer
 inputBufferPos equ $27   // DMEM position within display list input buffer, relative to end
-perfCounter1   equ $28   // Upper 16 bits: num verts; lower 16 bits: num tris sent to RDP
-perfCounter2   equ $29   // Upper 18 bits: num tris requested; lower 14 bits: num tex/fill rects
-rdpWaitCyc     equ $30   // How many loops we had to wait because the RDP FIFO was full
+perfCounterA   equ $28   // Performance counter A (functions depend on config)
+perfCounterB   equ $29   // Performance counter B (functions depend on config)
+perfCounterC   equ $30   // Performance counter C (functions depend on config)
 //                 $ra   // Return address
 
 // Misc scalar regs:
@@ -622,7 +705,7 @@ dmemAddr equ $20
 // cmd_w1_dram   // used for all dma_read_write DRAM addresses
 
 // Argument to load_overlay*
-postOvlRA equ $12 // Commonly used locally
+postOvlRA equ $10 // Commonly used locally
 
 // ==== Summary of uses of all registers
 // $zero: Hardwired zero scalar register
@@ -638,9 +721,9 @@ postOvlRA equ $12 // Commonly used locally
 //     Overlay 4, local
 // $8: secondVtxPos, local
 // $9: curLight, clip mask during clipping, local
-// $10: gclkSample (global)
+// $10: postOvlRA, common local
 // $11: very common local
-// $12: postOvlRA, local
+// $12: perfCounterD (global). This must be $12 for S2DEX compat in while_wait_dma_busy.
 // $13: altBaseReg (global)
 // $14: inputVtxPos, local
 // $15: outputVtxPos, local
@@ -657,9 +740,9 @@ postOvlRA equ $12 // Commonly used locally
 //      vtx write
 // $26: taskDataPtr (global)
 // $27: inputBufferPos (global)
-// $28: perfCounter1 (global)
-// $29: perfCounter2 (global)
-// $30: rdpWaitCyc (global)
+// $28: perfCounterA (global)
+// $29: perfCounterB (global)
+// $30: perfCounterC (global)
 // $ra: Return address for jal, b*al
 
 // Initialization routines
@@ -672,20 +755,20 @@ start: // This is at IMEM 0x1080, not the start of IMEM
     li      rdpCmdBufPtr, rdpCmdBuffer1
     li      rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word
     lw      $11, rdpFifoPos
-    lw      $12, OSTask + OSTask_flags
+    lw      $10, OSTask + OSTask_flags
     vsub    vOne, vOne, $v31[1]             // 1 = 0 - -1
     li      $1, SP_CLR_SIG2 | SP_CLR_SIG1   // Clear task done and yielded signals
     beqz    $11, initialize_rdp             // If RDP FIFO not set up yet, starting ucode from scratch
      mtc0   $1, SP_STATUS
-    andi    $12, $12, OS_TASK_YIELDED       // Resumed from yield or came from called ucode?
-    beqz    $12, continue_from_os_task      // If latter, load DL (task data) pointer from OSTask
+    andi    $10, $10, OS_TASK_YIELDED       // Resumed from yield or came from called ucode?
+    beqz    $10, continue_from_os_task      // If latter, load DL (task data) pointer from OSTask
      sw     $zero, OSTask + OSTask_flags    // Clear all task flags, incl. yielded
 continue_from_yield:
     // Perf counters saved here at yield
-    lw      rdpWaitCyc, yieldDataFooter + YDF_OFFSET_RDPWAITCYC
-    lw      gclkSample, yieldDataFooter + YDF_OFFSET_GCLKSAMPLE
-    lw      perfCounter1, yieldDataFooter + YDF_OFFSET_PERFCOUNTER1
-    lw      perfCounter2, yieldDataFooter + YDF_OFFSET_PERFCOUNTER2 
+    lw      perfCounterA, yieldDataFooter + YDF_OFFSET_PERFCOUNTERA
+    lw      perfCounterB, yieldDataFooter + YDF_OFFSET_PERFCOUNTERB
+    lw      perfCounterC, yieldDataFooter + YDF_OFFSET_PERFCOUNTERC
+    lw      perfCounterD, yieldDataFooter + YDF_OFFSET_PERFCOUNTERD
     j       finish_setup
      lw     taskDataPtr, yieldDataFooter + YDF_OFFSET_TASKDATAPTR
 
@@ -728,12 +811,16 @@ initialize_rdp:
 continue_from_os_task:
     // Counters stored here if jumped to different ucode
     // If starting from scratch, these are zero
-    lw      rdpWaitCyc, mITMatrix + YDF_OFFSET_RDPWAITCYC
-    lw      gclkSample, mITMatrix + YDF_OFFSET_GCLKSAMPLE
-    lw      perfCounter1, mITMatrix + YDF_OFFSET_PERFCOUNTER1
-    lw      perfCounter2, mITMatrix + YDF_OFFSET_PERFCOUNTER2
+    lw      perfCounterA, mITMatrix + YDF_OFFSET_PERFCOUNTERA
+    lw      perfCounterB, mITMatrix + YDF_OFFSET_PERFCOUNTERB
+    lw      perfCounterC, mITMatrix + YDF_OFFSET_PERFCOUNTERC
+    lw      perfCounterD, mITMatrix + YDF_OFFSET_PERFCOUNTERD
     lw      taskDataPtr, OSTask + OSTask_data_ptr
 finish_setup:
+.if CFG_PROFILING_C
+    mfc0    $11, DPC_CLOCK
+    sw      $11, startCounterTime
+.endif
     li      inputBufferPos, 0
     li      cmd_w1_dram, orga(ovl1_start)
     j       load_overlays_0_1
@@ -751,6 +838,10 @@ displaylist_dma_with_count:
 displaylist_dma:
     // Load INPUT_BUFFER_LEN - inputBufferPos cmds (inputBufferPos >= 0, mult of 8)
     addi    inputBufferPos, inputBufferPos, -INPUT_BUFFER_LEN // inputBufferPos = - num cmds
+.if CFG_PROFILING_A
+    sll     $11, inputBufferPos, 16 - 3                // Divide by 8 for num cmds to load, then move to upper 16
+    sub     perfCounterB, perfCounterB, $11            // Negative so subtract
+.endif
     nor     dmaLen, inputBufferPos, $zero              // DMA length = -inputBufferPos - 1 = ones compliment
     move    cmd_w1_dram, taskDataPtr                   // set up the DRAM address to read from
     jal     dma_read_write                             // initiate the DMA read
@@ -760,9 +851,13 @@ wait_for_dma_and_run_next_command:
 G_POPMTX_end:
 G_MOVEMEM_end:
     jal     while_wait_dma_busy                         // wait for the DMA read to finish
-.if CFG_GCLK_SAMPLE
+.if ENABLE_PROFILING
 G_LIGHTTORDP_handler:
 .endif
+.if !CFG_PROFILING_A
+vertex_end:
+tri_end:
+.endif
 G_SPNOOP_handler:
 run_next_DL_command:
      mfc0   $1, SP_STATUS                               // load the status word into register $1
@@ -773,23 +868,31 @@ run_next_DL_command:
      sra    $7, cmd_w0, 24                              // extract DL command byte from command word
     lw      cmd_w1_dram, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1_dram
     addi    inputBufferPos, inputBufferPos, 0x0008      // increment the DL index by 2 words
-.if CFG_GCLK_SAMPLE
+.if CFG_PROFILING_C
     mfc0    $11, DPC_STATUS
     andi    $11, $11, DPC_STATUS_GCLK_ALIVE             // Sample whether GCLK is active now
     sll     $11, $11, 16 - 3                            // move from bit 3 to bit 16
-    addi    $11, $11, 1                                 // 1 counts that we sampled
-    add     gclkSample, gclkSample, $11                 // Add both to the perf counter
+    add     perfCounterB, perfCounterB, $11             // Add to the perf counter
+.endif
+.if CFG_PROFILING_A
+    mfc0    $11, DPC_CLOCK
+.endif
+.if COUNTER_B_LOWER_CMD_COUNT
+    addi    perfCounterB, perfCounterB, 1               // Count commands
 .endif
     // $1 must remain zero
     // $7 must retain the command byte for load_mtx and overlay 4 stuff
     // $11 must contain the handler called for several handlers
     addi    $2, $7, -G_VTX                              // If >= G_VTX, use jump table
+.if CFG_PROFILING_A
+    sw      $11, startCounterTime
+.endif
     bgez    $2, do_cmd_jump_table                       // $2 is the index
      addi   $3, $2, G_VTX - (0xFF00 | G_SETTIMG)        // If >= G_SETTIMG, use handler; for G_NOOP, this puts
     bgez    $3, G_SETxIMG_handler                       // garbage in second word, but normal handler does anyway
-     addi   $12, $3, G_SETTIMG - G_SETTILE              // If >= G_SETTILE, use RDP handler
-    bgez    $12, G_RDP_handler
-     addi   $2, $12, G_SETTILE - G_RDPLOADSYNC          // If >= G_RDPLOADSYNC, refine cmd further
+     addi   $10, $3, G_SETTIMG - G_SETTILE              // If >= G_SETTILE, use RDP handler
+    bgez    $10, G_RDP_handler
+     addi   $2, $10, G_SETTILE - G_RDPLOADSYNC          // If >= G_RDPLOADSYNC, refine cmd further
     bgez    $2, refine_cmd_further                      // Otherwise $2 (negative) is the index
 do_cmd_jump_table:
      sll    $11, $2, 1                                  // Multiply jump table index by 2 for addr offset
@@ -831,17 +934,17 @@ G_SETxIMG_handler:
     jal     segmented_to_physical           // Convert image to physical address
      lw     $2, lastMatDLPhyAddr            // Get last material physical addr
     bnez    $3, G_RDP_handler               // If not in normal mode (0), exit
-     add    $12, taskDataPtr, inputBufferPos // Current material physical addr
-    beq     $12, $2, @@skip                 // Branch if we are executing the same mat again
-     sw     $12, lastMatDLPhyAddr           // Store material physical addr
+     add    $10, taskDataPtr, inputBufferPos // Current material physical addr
+    beq     $10, $2, @@skip                 // Branch if we are executing the same mat again
+     sw     $10, lastMatDLPhyAddr           // Store material physical addr
     li      $7, 1                           // > 0: in material first time
 @@skip:                                     // Otherwise $7 was < 0: cull mode (in mat second time)
     j       G_RDP_handler
      sb     $7, materialCullMode
 
 refine_cmd_further:
-    addi    $12, $7, -(0xFF00 | G_SETSCISSOR) // Relative to G_SETSCISSOR = 0
-    bltz    $12, G_RDP_handler // G_RDPLOADSYNC through G_SETCONVERT
+    addi    $10, $7, -(0xFF00 | G_SETSCISSOR) // Relative to G_SETSCISSOR = 0
+    bltz    $10, G_RDP_handler // G_RDPLOADSYNC through G_SETCONVERT
      andi   $2, $2, 0x0003 // $2 is relative to G_RDPLOADSYNC;
     beqz    $2, G_RDP_handler // G_SETPRIMDEPTH and G_SETTILESIZE are multiples of 4 from here
      addi   $3, $7, -(0xFF00 | G_LOADTLUT) // G_SETSCISSOR and G_RDPSETOTHERMODE are < this
@@ -852,51 +955,66 @@ load_cmds_handler:
      lb     $3, materialCullMode
     bltz    $3, run_next_DL_command  // If cull mode is < 0, in mat second time, skip the load
 G_RDP_handler:
-     sw     cmd_w1_dram, 4(rdpCmdBufPtr)    // Add the second word of the command to the RDP command buffer
+     sw     cmd_w1_dram, 4(rdpCmdBufPtr)     // Add the second word of the command to the RDP command buffer
 G_SYNC_handler:
-    sw      cmd_w0, 0(rdpCmdBufPtr)         // Add the command word to the RDP command buffer
-    addi    rdpCmdBufPtr, rdpCmdBufPtr, 8   // Increment the next RDP command pointer by 2 words
+.if CFG_PROFILING_B
+    addi    perfCounterB, perfCounterB, 0x4000 // Increment small RDP command count
+.endif
+    sw      cmd_w0, 0(rdpCmdBufPtr)          // Add the command word to the RDP command buffer
+    addi    rdpCmdBufPtr, rdpCmdBufPtr, 8    // Increment the next RDP command pointer by 2 words
 check_rdp_buffer_full_and_run_next_cmd:
-    li      $ra, run_next_DL_command    // Set up running the next DL command as the return address
+    li      $ra, run_next_DL_command         // Set up running the next DL command as the return address
 check_rdp_buffer_full:
      sub    $11, rdpCmdBufPtr, rdpCmdBufEndP1
-    bltz    $11, return_routine         // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
+    bltz    $11, return_routine              // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
 flush_rdp_buffer:
-     mfc0   $12, SP_DMA_BUSY            // Check if any DMA is in flight
-    lw      cmd_w1_dram, rdpFifoPos     // FIFO pointer = end of RDP read, start of RSP write
+     mfc0   $10, SP_DMA_BUSY                 // Check if any DMA is in flight
+    lw      cmd_w1_dram, rdpFifoPos          // FIFO pointer = end of RDP read, start of RSP write
     addi    dmaLen, $11, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy
-    bnez    $12, flush_rdp_buffer       // Wait until no DMAs are active
-     lw     $12, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr)
-    mtc0    cmd_w1_dram, DPC_END        // Set RDP to execute until FIFO end (buf pushed last time)
-    add     $11, cmd_w1_dram, dmaLen    // $11 = future FIFO pointer if we append this new buffer
-    sub     $12, $12, $11               // $12 = FIFO end addr - future pointer
-    bgez    $12, @@has_room             // Branch if we can fit this
+.if CFG_PROFILING_C
+    // This is a wait for DMA busy loop, but written inline to avoid overwriting ra.
+    addi    perfCounterD, perfCounterD, 10   // 6 instr + 2 between end load and mfc + 0 taken branch overlaps with last + 2 between mfc and load
+.endif
+    bnez    $10, flush_rdp_buffer            // Wait until no DMAs are active
+     lw     $10, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr)
+    mtc0    cmd_w1_dram, DPC_END             // Set RDP to execute until FIFO end (buf pushed last time)
+    add     $11, cmd_w1_dram, dmaLen         // $11 = future FIFO pointer if we append this new buffer
+    sub     $10, $10, $11                    // $10 = FIFO end addr - future pointer
+    bgez    $10, @@has_room                  // Branch if we can fit this
 @@await_rdp_dblbuf_avail:
-     mfc0   $11, DPC_STATUS             // Read RDP status
+     mfc0   $11, DPC_STATUS                  // Read RDP status
     andi    $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf
-    bnez    $11, @@await_rdp_dblbuf_avail  // Wait until double buffered start/end available
-     addi   rdpWaitCyc, rdpWaitCyc, 7   // 4 instr + 2 after mfc + 1 taken branch
-    lw      cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO
+    bnez    $11, @@await_rdp_dblbuf_avail    // Wait until double buffered start/end available
+.if COUNTER_C_FIFO_FULL
+     addi   perfCounterC, perfCounterC, 7    // 4 instr + 2 after mfc + 1 taken branch
+.endif
+     lw     cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO
 @@await_past_first_instr:
-    mfc0    $11, DPC_CURRENT            // Load RDP current pointer
+    mfc0    $11, DPC_CURRENT                 // Load RDP current pointer
     beq     $11, cmd_w1_dram, @@await_past_first_instr // Wait until RDP moved past start
-     addi   rdpWaitCyc, rdpWaitCyc, 6   // 3 instr + 2 after mfc + 1 taken branch
+.if COUNTER_C_FIFO_FULL
+     addi   perfCounterC, perfCounterC, 6    // 3 instr + 2 after mfc + 1 taken branch
+.else
+     nop
+.endif
     // Start was previously the start of the FIFO, unless this is the first buffer,
     // in which case it was the end of the FIFO. Normally, when the RDP gets to end, if we
     // have a new end value waiting (END_VALID), it'll load end but leave current. By
     // setting start here, it will also load current with start.
-    mtc0    cmd_w1_dram, DPC_START      // Set RDP start to start of FIFO
+    mtc0    cmd_w1_dram, DPC_START           // Set RDP start to start of FIFO
 @@keep_waiting:
+.if COUNTER_C_FIFO_FULL
     // This is here so we only count it when stalling below or on FIFO end codepath
-    addi    rdpWaitCyc, rdpWaitCyc, 10  // 7 instr + 2 after mfc + 1 taken branch
+    addi    perfCounterC, perfCounterC, 10   // 7 instr + 2 after mfc + 1 taken branch
+.endif
 @@has_room:
-    mfc0    $11, DPC_CURRENT            // Load RDP current pointer
-    sub     $11, $11, cmd_w1_dram       // Current - current end (rdpFifoPos or start)
-    blez    $11, @@copy_buffer          // Current is behind or at current end, can do copy
-     sub    $11, $11, dmaLen            // If amount current is ahead of current end
-    blez    $11, @@keep_waiting         // is <= size of buffer to copy, keep waiting
+    mfc0    $11, DPC_CURRENT                 // Load RDP current pointer
+    sub     $11, $11, cmd_w1_dram            // Current - current end (rdpFifoPos or start)
+    blez    $11, @@copy_buffer               // Current is behind or at current end, can do copy
+     sub    $11, $11, dmaLen                 // If amount current is ahead of current end
+    blez    $11, @@keep_waiting              // is <= size of buffer to copy, keep waiting
 @@copy_buffer:
-     add    $11, cmd_w1_dram, dmaLen    // New end is current end + buffer size
+     add    $11, cmd_w1_dram, dmaLen         // New end is current end + buffer size
     sw      $11, rdpFifoPos
     // Set up the DMA from DMEM to the RDP fifo in RDRAM
     addi    dmaLen, dmaLen, -1                                  // subtract 1 from the length
@@ -933,6 +1051,9 @@ ovl234_ovl4_entrypoint_ovl3ver: // same IMEM address as ovl234_ovl4_entrypoint
 ovl234_clipping_entrypoint:
     sh      $ra, tempHalfword
 ovl3_clipping_nosavera:
+.if CFG_PROFILING_B
+    addi    perfCounterB, perfCounterB, 1        // Increment clipped (input) tris count
+.endif
     jal     vtx_setup_constants
      li     clipMaskIdx, 4
 clip_after_constants:
@@ -950,7 +1071,7 @@ clip_init_used_loop:
     sh      $3, (clipPoly - 6 + 4)(clipPolySelect)
     sh      $zero, (clipPoly)(clipPolySelect) // Zero to mark end of polygon
     li      $9, CLIP_CAMPLANE                        // Initial clip mask for no nearclipping
-// Available locals here: $11, $1, $7, $20, $24, $12
+// Available locals here: $11, $1, $7, $20, $24, $10
 clip_condlooptop: // Loop over six clipping conditions: near, far, +y, +x, -y, -x
     lhu     clipFlags, VTX_CLIP($3)                  // Load flags for V3, which will be the final vertex of the last polygon
     and     clipFlags, clipFlags, $9                 // Mask V3's flags to current clip condition
@@ -970,8 +1091,8 @@ clip_edgelooptop: // Loop over edges connecting verts, possibly subdivide the ed
     li      outputVtxPos, clipTempVerts + MAX_CLIP_GEN_VERTS * vtxSize
 clip_find_unused_loop:
     lhu     $11, (VTX_CLIP - vtxSize)(outputVtxPos)
-    addi    $12, outputVtxPos, -clipTempVerts  // This is within the loop rather than before b/c delay after lhu
-    blez    $12, clip_done                 // If can't find one (should never happen), give up
+    addi    $10, outputVtxPos, -clipTempVerts  // This is within the loop rather than before b/c delay after lhu
+    blez    $10, clip_done                 // If can't find one (should never happen), give up
      andi   $11, $11, CLIP_VTX_USED
     bnez    $11, clip_find_unused_loop
      addi   outputVtxPos, outputVtxPos, -vtxSize
@@ -1120,15 +1241,15 @@ clip_draw_tris:
     lqv     $v30, (v30Value)($zero)
 // Current polygon starts 6 (3 verts) below clipPolySelect, ends 2 (1 vert) below clipPolyWrite
     addi    clipPolySelect, clipPolySelect, -6 // = Pointer to first vertex
-    // Available locals: most registers ($5, $6, $7, $8, $9, $11, $12, etc.)
+    // Available locals: most registers ($5, $6, $7, $8, $9, $11, $10, etc.)
     // Available regs which won't get clobbered by tri write: 
     // clipPolySelect, clipPolyWrite, $14 (inputVtxPos), $15 (outputVtxPos), (more)
     // Find vertex highest on screen (lowest screen Y)
     li      $5, 0x7FFF                // current best value
     move    $7, clipPolySelect        // initial vertex pointer
-    lhu     $12, (clipPoly)($7)       // Load vertex address
+    lhu     $10, (clipPoly)($7)       // Load vertex address
 clip_search_highest_loop:
-    lh      $9, VTX_SCR_Y($12)        // Load screen Y
+    lh      $9, VTX_SCR_Y($10)        // Load screen Y
     sub     $11, $9, $5               // Branch if new vtx Y >= best vtx Y
     bgez    $11, clip_search_skip_better
      addi   $7, $7, 2                 // Next vertex
@@ -1136,7 +1257,7 @@ clip_search_highest_loop:
     move    $5, $9                    // Save best value
 clip_search_skip_better:
     bne     clipPolyWrite, $7, clip_search_highest_loop
-     lhu    $12, (clipPoly)($7)       // Next vertex address
+     lhu    $10, (clipPoly)($7)       // Next vertex address
     addi    clipPolyWrite, clipPolyWrite, -2   // = Pointer to last vertex
     // Find next closest vertex, from the two on either side
     bne     $14, clipPolySelect, @@skip1
@@ -1161,7 +1282,7 @@ clip_draw_loop:
     // (previous) - $14 - $15, or we can draw $14 - $15 - (next). We want the
     // one where the lower edge covers the fewest scanlines. This edge is
     // (previous) - $15 or $14 - (next).
-    // $1, $2, $3, $5 are vertices at $11=prev, $14, $15, $12=next
+    // $1, $2, $3, $5 are vertices at $11=prev, $14, $15, $10=next
     bne     $14, clipPolySelect, @@skip1
      addi   $11, $14, -2
     move    $11, clipPolyWrite
@@ -1169,12 +1290,12 @@ clip_draw_loop:
     beq     $11, $15, clip_done // If previous is $15, we only have two verts left, done
      lhu    $1, (clipPoly)($11)     // From the group below, need something in the delay slot
     bne     $15, clipPolyWrite, @@skip2
-     addi   $12, $15, 2
-    move    $12, clipPolySelect
+     addi   $10, $15, 2
+    move    $10, clipPolySelect
 @@skip2:
     lhu     $2, (clipPoly)($14)
     lhu     $3, (clipPoly)($15)
-    lhu     $5, (clipPoly)($12)
+    lhu     $5, (clipPoly)($10)
     lsv     $v5[0], (VTX_SCR_Y)($1)
     lsv     $v5[4], (VTX_SCR_Y)($2)
     lsv     $v5[2], (VTX_SCR_Y)($3)
@@ -1191,7 +1312,7 @@ clip_draw_loop:
     move    $2, $3
     move    $3, $5
     move    $14, $8            // Restore overwritten $14
-    move    $15, $12           // Update $15 to be next
+    move    $15, $10           // Update $15 to be next
 clip_final_draw:
     mtc2    $1, $v27[10]              // Addresses go in vector regs too
     mtc2    $2, $v4[12]
@@ -1217,15 +1338,17 @@ G_VTX_handler:
     srl     $2, cmd_w0, 11                     // n << 1
     sub     $2, cmd_w0, $2                     // v0 << 1
     sb      $2, (inputBufferEnd - 0x06)(inputBufferPos) // Store v0 << 1 as byte 2
+.if COUNTER_A_UPPER_VERTEX_COUNT
     sll     $11, $1, 12                        // Vtx count * 0x10000
-    add     perfCounter1, perfCounter1, $11    // Add to vertex count
-    j       vtx_addrs_from_cmd                 // v0 << 1 is elem 2, (v0 + n) << 1 is elem 3 = $12
+    add     perfCounterA, perfCounterA, $11    // Add to vertex count
+.endif
+    j       vtx_addrs_from_cmd                 // v0 << 1 is elem 2, (v0 + n) << 1 is elem 3 = $10
      li     $11, vtx_return_from_addrs
 vtx_return_from_addrs:
-    andi    $12, $12, 0xFFF8                   // Round down end addr to DMA word; one input vtx still fits in one internal vtx
+    andi    $10, $10, 0xFFF8                   // Round down end addr to DMA word; one input vtx still fits in one internal vtx
     mfc2    outputVtxPos, $v27[4]              // Address of start in vtxSize units
     jal     segmented_to_physical              // Convert address in cmd_w1_dram to physical
-     sub    dmemAddr, $12, $1                  // Start addr = end addr - size
+     sub    dmemAddr, $10, $1                  // Start addr = end addr - size
     jal     dma_read_write
      addi   dmaLen, $1, -1                     // DMA length is always offset by -1
     move    inputVtxPos, dmemAddr
@@ -1235,7 +1358,7 @@ vtx_return_from_addrs:
     lqv     vM2F,     (mMatrix + 0x30)($zero)
     lbu     $11, mITValid                      // 0 if matrix invalid, 1 if valid
     vcopy   vM1I,  vM0I
-    lbu     $12, normalsMode                   // bit 0 clear if don't compute mIT, set if do
+    lbu     $10, normalsMode                   // bit 0 clear if don't compute mIT, set if do
     vcopy   vM3I,  vM2I
     ldv     vM1I[0],  (mMatrix + 0x08)($zero)
     vcopy   vM1F,  vM0F
@@ -1245,7 +1368,7 @@ vtx_return_from_addrs:
     sltiu   $11, $11, 1                        // 0 if matrix valid, 1 if invalid
     srl     $7, $5, 9                          // G_LIGHTING in bit 1
     and     $7, $7, $11                        // If lighting enabled and need to update matrix,
-    and     $7, $7, $12                        // and computing mIT,
+    and     $7, $7, $10                        // and computing mIT,
     ldv     vM3F[0],  (mMatrix + 0x38)($zero)
     ldv     vM0I[8],  (mMatrix + 0x00)($zero)
     ldv     vM2I[8],  (mMatrix + 0x10)($zero)
@@ -1278,11 +1401,11 @@ vtx_setup_constants:
     lsv     $v21[0], (attrOffsetZ - altBase)(altBaseReg) // Z offset
     ldv     $v26[0], (viewport + 8)($zero)        // Load vtrans duplicated in 0-3 and 4-7
     ldv     $v26[8], (viewport + 8)($zero)
-    lw      $12, (geometryModeLabel)($zero)
+    lw      $10, (geometryModeLabel)($zero)
     ldv     $v25[0], (viewport)($zero)            // Load vscale duplicated in 0-3 and 4-7
     ldv     $v25[8], (viewport)($zero)
     vne     $v29, $v31, $v31[2h]                  // VCC = 11011101
-    andi    $11, $12, G_ATTROFFSET_Z_ENABLE
+    andi    $11, $10, G_ATTROFFSET_Z_ENABLE
     vadd    $v21, $v26, $v21[0]                   // Add Z offset to all terms (care about 2, 6)
     beqz    $11, @@skipz                          // Skip if Z offset disabled
      llv    $v23[0], (fogFactor)($zero)           // Load fog multiplier 0 and offset 1
@@ -1291,7 +1414,7 @@ vtx_setup_constants:
     vne     $v29, $v31, $v31[3h]                  // VCC = 11101110
     lqv     $v30, (fxParams - altBase)(altBaseReg) // Parameters for vtx and lighting
     vmudh   $v20, $v25, $v31[1]                   // -1; -vscale
-    andi    $11, $12, G_AMBOCCLUSION
+    andi    $11, $10, G_AMBOCCLUSION
     vmrg    $v25, $v25, $v23[0]                   // Put fog multiplier in elements 3,7 of vscale
     vmrg    $v26, $v26, $v23[1]                   // Put fog offset in elements 3,7 of vtrans
     vge     $v29, $v31, $v31[3]                   // VCC = 00011111
@@ -1363,7 +1486,7 @@ vtx_load_loop:
 @@skipsecond:
     vmadm   $v29, vVP2I, vPairPosF[2h]
     vmadn   vPairTPosF, vVP2F, vPairPosI[2h]
-    li      $ra, run_next_DL_command    // run next DL command...
+    li      $ra, vertex_end             // Done with vertex processing...
     vmadh   vPairTPosI, vVP2I, vPairPosI[2h]
     blez    $1, @@skiploop              // ...if <= 0 verts remain, ...
      vmudm  $v29, vPairST, $v25         // Scale ST; must be after texgen
@@ -1374,7 +1497,7 @@ vtx_store:
     // Inputs: vPairTPosI, vPairTPosF, vPairST, vPairRGBA
     // Locals: $v20, $v21, $v25, $v26, $v16, $v17 ($v29 is temp)
     // Scalar regs: secondVtxPos, outputVtxPos; set to the same thing if only write 1 vtx
-    // temps $11, $12, $20, $24
+    // temps $11, $10, $20, $24
     ldv     $v17[0], (occlusionPlaneMidCoeffs - altBase)(altBaseReg)
     vch     $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high
     ldv     $v17[8], (occlusionPlaneMidCoeffs - altBase)(altBaseReg)
@@ -1382,7 +1505,7 @@ vtx_store:
     vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
     vmadm   $v20, vPairTPosI, $v30[3] // Persp norm
     vmadn   $v21, $v31, $v31[2] // 0
-    cfc2    $12, $vcc // Load screen clipping results
+    cfc2    $10, $vcc // Load screen clipping results
     vmudn   $v29, vPairTPosF, $v17 // X * kx, Y * ky, Z * kz
     vmadh   $v29, vPairTPosI, $v17 // Int * int
     vreadacc $v16, ACC_UPPER // Load int * int portion
@@ -1409,7 +1532,7 @@ vtx_store:
     vadd    $v16, $v16, $v16[1h] // Add elems 1, 5 to 3, 7
     cfc2    $20, $vcc // Load scaled clipping results
     vmudl   $v29, $v21, $v17[2h]
-    srl     $24, $12, 4            // Shift second vertex screen clipping to first slots
+    srl     $24, $10, 4            // Shift second vertex screen clipping to first slots
     vmadm   $v29, $v20, $v17[2h]
     andi    $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
     vmadn   $v21, $v21, $v17[3h]
@@ -1420,9 +1543,9 @@ vtx_store:
     vmudh   $v29, vOne, $v31[4] // 4 * 1 in elems 3, 7
     cfc2    $11, $vcc // Load occlusion plane mid results to bits 3 and 7 (garbage in others)
     vmadn   $v21, $v21, $v31[0] // -4
-    andi    $12, $12, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
+    andi    $10, $10, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
     vmadh   $v20, $v20, $v31[0] // -4
-    ori     $12, $12, CLIP_VTX_USED // Write for all first verts, only matters for generated verts
+    ori     $10, $10, CLIP_VTX_USED // Write for all first verts, only matters for generated verts
     vmudl   $v29, $v21, $v17[2h]
     lsv     vPairTPosI[14], (VTX_Z_INT     )(secondVtxPos) // load Z into W slot, will be for fog below
     vmadm   $v29, $v20, $v17[2h]
@@ -1456,7 +1579,7 @@ vtx_store:
     vmadh   vPairTPosI, vPairTPosI, $v21
     or      $24, $24, $20          // Combine results for second vertex
     vmadh   $v21, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
-    or      $12, $12, $11          // Combine results for first vertex
+    or      $10, $10, $11          // Combine results for first vertex
     vmrg    $v26, vOne, $v31[1] // Signs of $v26 are --++--++
     andi    $11, $5, G_FOG >> 8    // Nonzero if fog enabled
     vmudh   $v16, vPairTPosI, $v31[4] // 4; scale up x and y
@@ -1490,14 +1613,28 @@ vtx_skip_fog:
 @@skipv2:
     beqz    $20, @@skipv1    // If 0, all equations true, don't clear occluded flag
      sh     $24,              (VTX_CLIP      )(secondVtxPos) // Store second vertex clip flags
-    andi    $12, $12, ~CLIP_OCCLUDED // At least one eqn false, clear vtx 1 occluded flag
+    andi    $10, $10, ~CLIP_OCCLUDED // At least one eqn false, clear vtx 1 occluded flag
 @@skipv1:    
     jr      $ra
-     sh     $12,              (VTX_CLIP      )(outputVtxPos) // Store first vertex results
+     sh     $10,              (VTX_CLIP      )(outputVtxPos) // Store first vertex results
+
+.if CFG_PROFILING_A
+vertex_end:
+    li      $ra, 0                           // Flag for coming from vtx
+tri_end:
+    mfc0    $11, DPC_CLOCK
+    lw      $10, startCounterTime
+    sub     $11, $11, $10
+    bnez    $ra, run_next_DL_command         // $ra != 0 if from tri cmds
+     add    perfCounterD, perfCounterD, $11  // Add to tri cycles perf counter
+    sub     perfCounterD, perfCounterD, $11  // From verts, undo add to tri perf counter
+    j       run_next_DL_command
+     add    perfCounterA, perfCounterA, $11  // Add to vert cycles perf counter
+.endif
 
 G_MODIFYVTX_handler:
-    // Command byte 3 = vtx being modified; its addr -> $12
-    li      $11, do_moveword  // Moveword adds cmd_w0 to $12 for final addr
+    // Command byte 3 = vtx being modified; its addr -> $10
+    li      $11, do_moveword  // Moveword adds cmd_w0 to $10 for final addr
     lbu     cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos)  // offset in vtx
 vtx_addrs_from_cmd:
     // Treat eight bytes of last command each as vertex indices << 1
@@ -1505,12 +1642,12 @@ vtx_addrs_from_cmd:
     lpv     $v27[0], (-(0x1000 - (inputBufferEnd - 0x08)))(inputBufferPos)
 vtx_indices_to_addr:
     // Input and output in $v27
-    // Also out elem 3 -> $12, elem 7 -> $3 because these are used more than once
+    // Also out elem 3 -> $10, elem 7 -> $3 because these are used more than once
     lqv     $v30, (v30Value)($zero)
     vmudl   $v29, $v27, $v30[1]   // Multiply vtx indices times length
     vmadn   $v27, vOne, $v30[0]   // Add address of vertex buffer
     sb      $zero, materialCullMode // This covers all tri cmds, vtx, modify vtx, branchZ, cull
-    mfc2    $12, $v27[6]
+    mfc2    $10, $v27[6]
     jr      $11
      mfc2   $3, $v27[14]
 
@@ -1524,18 +1661,18 @@ tri_strip_fan_start:
 tri_strip_fan_loop:
     lw      cmd_w1_dram, 0(cmd_w0)       // Load tri indices to lower 3 bytes of word
     addi    $11, inputBufferPos, inputBufferEnd - 3 // Off end of command
-    beq     $11, cmd_w0, run_next_DL_command // If off end of command, exit
-     sll    $12, cmd_w1_dram, 24         // Put sign bit of vtx 3 in sign bit
-    bltz    $12, run_next_DL_command     // If negative, exit
+    beq     $11, cmd_w0, tri_end         // If off end of command, exit
+     sll    $10, cmd_w1_dram, 24         // Put sign bit of vtx 3 in sign bit
+    bltz    $10, tri_end                 // If negative, exit
      sw     cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices
     bltz    $ra, tri_fan_store           // Finish handling G_TRIFAN
      addi   cmd_w0, cmd_w0, 1            // Increment
     andi    $11, cmd_w0, 1               // If odd, this is the 1st/3rd/5th tri
-    bnez    $11, tri_main              // Draw as is
-     srl    $12, cmd_w1_dram, 8          // Move vtx 2 to LSBs
+    bnez    $11, tri_main                // Draw as is
+     srl    $10, cmd_w1_dram, 8          // Move vtx 2 to LSBs
     sb      cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2
     j       tri_main
-     sb     $12, 7(rdpCmdBufPtr)         // Store vtx 2 to spot for 3
+     sb     $10, 7(rdpCmdBufPtr)         // Store vtx 2 to spot for 3
 
 tri_fan_store:
     lb      $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1
@@ -1547,7 +1684,7 @@ G_QUAD_handler:
     jal     tri_main                     // Send second tri; return here for first tri
      sw     cmd_w1_dram, 4(rdpCmdBufPtr) // Put second tri indices in temp memory
 G_TRI1_handler:
-    li      $ra, run_next_DL_command     // After done with this tri, run next cmd
+    li      $ra, tri_end                 // After done with this tri, exit tri processing
     sw      cmd_w0, 4(rdpCmdBufPtr)      // Put first tri indices in temp memory
 tri_main:
     lpv     $v27[0], 0(rdpCmdBufPtr)     // Load tri indexes to elems 5, 6, 7
@@ -1556,9 +1693,13 @@ tri_main:
 tri_return_from_addrs:
     mfc2    $1, $v27[10]
     vcopy   $v4, $v27                    // Need vtx 2 addr in $v4 elem 6
-    addi    perfCounter2, perfCounter2, 0x4000  // Increment number of tris requested
+.if !ENABLE_PROFILING
+    addi    perfCounterB, perfCounterB, 0x4000  // Increment number of tris requested
+.endif
     mfc2    $2, $v27[12]
+.if !ENABLE_PROFILING
     move    $4, $1                // Save original vertex 1 addr (pre-shuffle) for flat shading
+.endif
     li      clipPolySelect, -1    // Normal tri drawing mode (check clip masks)
 tri_noinit:
     // ra is next cmd, second tri in TRI2, or middle of clipping
@@ -1636,19 +1777,27 @@ tV3AtI equ $v21
     vreadacc $v16, ACC_MIDDLE
     lpv     tV3AtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
     vmov    $v15[2], $v6[0]
+.if !ENABLE_PROFILING
     lpv     $v25[0], VTX_COLOR_VEC($4)  // Load RGB from vertex 4 (flat shading vtx)
+.endif
     vrcp    $v20[0], $v15[1]
+.if !ENABLE_PROFILING
     sll     $11, $6, 10                 // Moves the value of G_SHADING_SMOOTH into the sign bit
+.endif
     vrcph   $v22[0], $v17[1]
     andi    $6, $6, (G_SHADE | G_ZBUFFER)
     vrcpl   $v23[1], $v16[1]
+.if !ENABLE_PROFILING
     bltz    $11, tri_skip_flat_shading  // Branch if G_SHADING_SMOOTH is set
+.endif
      vrcph  $v24[1], $v31[2]            // 0
+.if !ENABLE_PROFILING
     vlt     $v29, $v31, $v31[3]         // Set vcc to 11100000
     vmrg    tV1AtI, $v25, tV1AtI        // RGB from $4, alpha from $1
     vmrg    tV2AtI, $v25, tV2AtI        // RGB from $4, alpha from $2
     vmrg    tV3AtI, $v25, tV3AtI        // RGB from $4, alpha from $3
 tri_skip_flat_shading:
+.endif
     vrcp    $v20[2], $v6[1]
     lb      $20, (alphaCompareCullMode)($zero)
     vrcph   $v22[2], $v6[1]
@@ -1662,9 +1811,9 @@ tri_skip_flat_shading:
     vmudl   tV2AtI, tV2AtI, $v30[3] // 0x0100; vertex color 2 >>= 8
     sub     $11, $5, $7
     vmudl   tV3AtI, tV3AtI, $v30[3] // 0x0100; vertex color 3 >>= 8
-    sra     $12, $11, 31
+    sra     $10, $11, 31
     vmov    $v15[3], $v8[0]
-    and     $11, $11, $12
+    and     $11, $11, $10
     vmudl   $v29, $v20, $v30[7] // 0x0020
     beqz    $20, tri_skip_alpha_compare_cull
      sub    $5, $5, $11
@@ -1684,9 +1833,9 @@ tri_skip_alpha_compare_cull:
      vmadm  $v22, $v22, $v30[7] // 0x0020
     sub     $11, $5, $8
     vmadn   $v20, $v31, $v31[2] // 0
-    sra     $12, $11, 31
+    sra     $10, $11, 31
     vmudm   $v25, $v15, $v30[2] // 0x1000
-    and     $11, $11, $12
+    and     $11, $11, $10
     vmadn   $v15, $v31, $v31[2] // 0
     sub     $5, $5, $11
     vsubc   $v4, vZero, $v4
@@ -1719,11 +1868,11 @@ tri_skip_alpha_compare_cull:
     vmadh   $v17, $v17, $v30[4] // -16
     ssv     $v14[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient
     vmudn   $v29, $v3, $v14[0]
-    andi    $12, $5, 0x0080 // Extract the left major flag from $5
+    andi    $10, $5, 0x0080 // Extract the left major flag from $5
     vmadl   $v29, $v22, $v4[1]
-    or      $12, $12, $7 // Combine the left major flag with the level and tile from the texture settings
+    or      $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
     vmadm   $v29, $v15, $v4[1]
-    sb      $12, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
+    sb      $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
     vmadn   $v2, $v22, $v26[1]
     beqz    $9, tri_skip_tex // If textures are not enabled, skip texture coefficient calculation
      vmadh  $v3, $v15, $v26[1]
@@ -1817,7 +1966,9 @@ tDaDyI equ $v7
     vmadh   tDaDxI, tDaDxI, $v24[1]
     add     rdpCmdBufPtr, rdpCmdBufPtr, $11  // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set
     vmudl   $v29, tDaDyF, $v23[1]
-    addi    perfCounter1, perfCounter1, 1 // Increment number of tris sent to RDP
+.if !ENABLE_PROFILING
+    addi    perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
+.endif
     vmadm   $v29, tDaDyI, $v23[1]
     vmadn   tDaDyF, tDaDyF, $v24[1]
     sdv     tDaDxF[0], 0x0018($2)   // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional)
@@ -1878,7 +2029,7 @@ load_overlay_0_and_enter:
 G_LOAD_UCODE_handler:
     li      postOvlRA, 0x1000                        // Sets up return address
     li      cmd_w1_dram, orga(ovl0_start)            // Sets up ovl0 table address
-// To use these: set postOvlRA ($12) to the address to execute after the load is
+// To use these: set postOvlRA ($10) to the address to execute after the load is
 // done, and set cmd_w1_dram to orga(your_overlay).
 load_overlays_0_1:
     li      dmaLen, ovl01_end - 0x1000 - 1
@@ -1889,6 +2040,9 @@ load_overlays_2_3_4:
     li      dmemAddr, ovl234_start
 load_overlay_inner:
     lw      $11, OSTask + OSTask_ucode
+.if CFG_PROFILING_B
+    addi    perfCounterC, perfCounterC, 0x4000  // Increment overlay (all 0-4) load count
+.endif
     jal     dma_read_write
      add    cmd_w1_dram, cmd_w1_dram, $11
     move    $ra, postOvlRA
@@ -1901,11 +2055,19 @@ totalImemUseUpTo1FC8:
 .endif
 .org 0x1FC8
 
+// The code from here to the end is shared with S2DEX, so great care is needed for changes.
 while_wait_dma_busy:
     mfc0    $11, SP_DMA_BUSY    // Load the DMA_BUSY value
-while_dma_busy:
-    bnez    $11, while_dma_busy // Loop until DMA_BUSY is cleared
-     mfc0   $11, SP_DMA_BUSY    // Update DMA_BUSY value
+.if CFG_PROFILING_C
+    bnez    $11, while_wait_dma_busy
+     // perfCounterD is $12, which is a temp register in S2DEX, which happens to
+     // never have state carried over while_wait_dma_busy.
+     addi   perfCounterD, perfCounterD, 6 // 3 instr + 2 after mfc + 1 taken branch
+.else
+@@while_dma_busy:
+    bnez    $11, @@while_dma_busy // Loop until DMA_BUSY is cleared
+     mfc0   $11, SP_DMA_BUSY      // Update DMA_BUSY value
+.endif
 // This routine is used to return via conditional branch
 return_routine:
     jr      $ra
@@ -1935,29 +2097,29 @@ dma_write:
 // G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0.
 ovl0_start:
     sub     $11, rdpCmdBufPtr, rdpCmdBufEndP1
-    addi    $12, $11, (RDP_CMD_BUFSIZE + 8) - 1 // Does the current buffer contain anything?
-    bgezal  $12, flush_rdp_buffer   // - 1 because there is no bgtzal instruction
-     // Stored here for yield and done, otherwise this is temp memory
-     sw     perfCounter1, yieldDataFooter + YDF_OFFSET_PERFCOUNTER1
-    sw      perfCounter2, yieldDataFooter + YDF_OFFSET_PERFCOUNTER2
-    sw      rdpWaitCyc, yieldDataFooter + YDF_OFFSET_RDPWAITCYC
-    sw      gclkSample, yieldDataFooter + YDF_OFFSET_GCLKSAMPLE
-    jal     while_wait_dma_busy
-     lw     $24, rdpFifoPos
-    bltz    $1, task_done           // $1 < 0 = Got to the end of the parent DL
-     mtc0   $24, DPC_END            // Set the end pointer of the RDP so that it starts the task
-    bnez    $1, task_yield          // $1 > 0 = CPU requested yield
+    addi    $10, $11, (RDP_CMD_BUFSIZE + 8) - 1 // Does the current buffer contain anything?
+    bgezal  $10, flush_rdp_buffer   // - 1 because there is no bgtzal instruction
      add    taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr was where in the DL after the current chunk loaded
+    jal     while_wait_dma_busy     // Wait for possible RDP flush to finish
+     lw     $24, rdpFifoPos
+.if CFG_PROFILING_C
+    mfc0    $11, DPC_CLOCK
+    lw      $10, startCounterTime
+    sub     $11, $11, $10
+    add     perfCounterA, perfCounterA, $11
+.endif
+    bnez    $1, task_done_or_yield  // Continue to load ucode if 0
+     mtc0   $24, DPC_END            // Set the end pointer of the RDP so that it starts the task
 load_ucode:
     lw      cmd_w1_dram, (inputBufferEnd - 0x04)(inputBufferPos) // word 1 = ucode code DRAM addr
     sw      taskDataPtr, OSTask + OSTask_data_ptr // Store where we are in the DL
     sw      cmd_w1_dram, OSTask + OSTask_ucode // Store pointer to new ucode about to execute
     // Store counters in mITMatrix; first 0x180 of DMEM will be preserved in ucode swap AND
     // if other ucode yields
-    sw      rdpWaitCyc, mITMatrix + YDF_OFFSET_RDPWAITCYC
-    sw      gclkSample, mITMatrix + YDF_OFFSET_GCLKSAMPLE
-    sw      perfCounter1, mITMatrix + YDF_OFFSET_PERFCOUNTER1
-    sw      perfCounter2, mITMatrix + YDF_OFFSET_PERFCOUNTER2
+    sw      perfCounterA, mITMatrix + YDF_OFFSET_PERFCOUNTERA
+    sw      perfCounterB, mITMatrix + YDF_OFFSET_PERFCOUNTERB
+    sw      perfCounterC, mITMatrix + YDF_OFFSET_PERFCOUNTERC
+    sw      perfCounterD, mITMatrix + YDF_OFFSET_PERFCOUNTERD
     li      dmemAddr, start         // Beginning of overwritable part of IMEM
     jal     dma_read_write          // DMA DRAM read -> IMEM write
      li     dmaLen, (while_wait_dma_busy - start) - 1 // End of overwritable part of IMEM
@@ -1976,12 +2138,18 @@ load_ucode:
     .error "ovl0_start does not fit within the space before the start of the ucode loaded with G_LOAD_UCODE"
 .endif
 
-task_yield:
+task_done_or_yield:
+    sw      perfCounterA, yieldDataFooter + YDF_OFFSET_PERFCOUNTERA
+    sw      perfCounterB, yieldDataFooter + YDF_OFFSET_PERFCOUNTERB
+    sw      perfCounterC, yieldDataFooter + YDF_OFFSET_PERFCOUNTERC
+    bltz    $1, task_done           // $1 < 0 = Got to the end of the parent DL
+     sw     perfCounterD, yieldDataFooter + YDF_OFFSET_PERFCOUNTERD
+task_yield: // Otherwise $1 > 0 = CPU requested yield
     lw      $11, OSTask + OSTask_ucode         // Save pointer to current ucode
     lw      cmd_w1_dram, OSTask + OSTask_yield_data_ptr
     li      dmemAddr, 0x8000                   // 0, but negative = write
     li      dmaLen, OS_YIELD_DATA_SIZE - 1
-    li      $12, SP_SET_SIG1 | SP_SET_SIG2     // yielded and task done signals
+    li      $10, SP_SET_SIG1 | SP_SET_SIG2     // yielded and task done signals
     sw      taskDataPtr, yieldDataFooter + YDF_OFFSET_TASKDATAPTR // Save pointer to where in DL
     sw      $11, yieldDataFooter + YDF_OFFSET_UCODE
     j       dma_read_write
@@ -1995,9 +2163,9 @@ task_done:
     jal     dma_read_write
      li     dmaLen, YIELD_DATA_FOOTER_SIZE - 1
     jal     while_wait_dma_busy
-     li     $12, SP_SET_SIG2   // task done signal
-set_status_and_break: // $12 is the status to set
-    mtc0    $12, SP_STATUS
+     li     $10, SP_SET_SIG2   // task done signal
+set_status_and_break: // $10 is the status to set
+    mtc0    $10, SP_STATUS
     break   0
     nop
 
@@ -2072,11 +2240,11 @@ G_SETOTHERMODE_L_handler:
     j       G_RDP_handler
      lw     cmd_w1_dram, otherMode1
 
-scissor_other_handler:     // $12 is 0 for G_SETSCISSOR or 2 for G_RDPSETOTHERMODE
-    sll     $12, $12, 2    // Now 0 or 8
-    sw      cmd_w0, (scissorUpLeft)($12) // otherMode0 = scissorUpLeft + 8
+scissor_other_handler:     // $10 is 0 for G_SETSCISSOR or 2 for G_RDPSETOTHERMODE
+    sll     $10, $10, 2    // Now 0 or 8
+    sw      cmd_w0, (scissorUpLeft)($10) // otherMode0 = scissorUpLeft + 8
     j       G_RDP_handler                // Send the command to the RDP
-     sw     cmd_w1_dram, (scissorBottomRight)($12) // otherMode1 = scissorBottomRight + 8
+     sw     cmd_w1_dram, (scissorBottomRight)($10) // otherMode1 = scissorBottomRight + 8
 
 G_GEOMETRYMODE_handler: // $7 = G_GEOMETRYMODE (as negative) if jumped here
     lw      $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value
@@ -2100,7 +2268,9 @@ G_RDPHALF_2_handler:
     ldv     $v29[0], (texrectWord1)($zero)
     lw      cmd_w0, rdpHalf1Val             // load the RDPHALF1 value into w0
     addi    rdpCmdBufPtr, rdpCmdBufPtr, 8
-    addi    perfCounter2, perfCounter2, 1   // Increment number of tex/fill rects
+.if !ENABLE_PROFILING
+    addi    perfCounterB, perfCounterB, 1   // Increment number of tex/fill rects
+.endif
     sb      $zero, materialCullMode         // This covers tex and fill rects
     j       G_RDP_handler
      sdv    $v29[0], -8(rdpCmdBufPtr)
@@ -2109,14 +2279,14 @@ G_RELSEGMENT_handler:
     jal     segmented_to_physical    // Resolve new segment address relative to existing segment
 G_MOVEWORD_handler:
      srl    $2, cmd_w0, 16           // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
-    lhu     $12, (movewordTable - (G_MOVEWORD << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
+    lhu     $10, (movewordTable - (G_MOVEWORD << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
 do_moveword:
     sll     $11, cmd_w0, 16          // Sign bit = upper bit of offset
-    add     $12, $12, cmd_w0         // Offset + base; only lower 12 bits matter
+    add     $10, $10, cmd_w0         // Offset + base; only lower 12 bits matter
     bltz    $11, run_next_DL_command // If upper bit of offset is set, exit after halfword
-     sh     cmd_w1_dram, ($12)       // Store value from cmd into halfword
+     sh     cmd_w1_dram, ($10)       // Store value from cmd into halfword
     j       run_next_DL_command
-     sw     cmd_w1_dram, ($12)       // Store value from cmd into word (offset + moveword_table[index])
+     sw     cmd_w1_dram, ($10)       // Store value from cmd into word (offset + moveword_table[index])
 
 // Converts the segmented address in cmd_w1_dram to the corresponding physical address
 segmented_to_physical:
@@ -2129,7 +2299,7 @@ segmented_to_physical:
      add    cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the virtual address
 
 G_CULLDL_handler:
-    j       vtx_addrs_from_cmd              // Load start vtx addr in $12, end vtx in $3
+    j       vtx_addrs_from_cmd              // Load start vtx addr in $10, end vtx in $3
      li     $11, culldl_return_from_addrs
 culldl_return_from_addrs:
     /*
@@ -2142,13 +2312,13 @@ culldl_return_from_addrs:
     G_CULLDL and not for tris.
     */
     li      $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE)
-    lhu     $11, VTX_CLIP($12)
+    lhu     $11, VTX_CLIP($10)
 culldl_loop:
     and     $1, $1, $11
     beqz    $1, run_next_DL_command         // Some vertex is on the screen-side of all clipping planes; have to render
-     lhu    $11, (vtxSize + VTX_CLIP)($12)  // next vertex clip flags
-    bne     $12, $3, culldl_loop            // loop until reaching the last vertex
-     addi   $12, $12, vtxSize               // advance to the next vertex
+     lhu    $11, (vtxSize + VTX_CLIP)($10)  // next vertex clip flags
+    bne     $10, $3, culldl_loop            // loop until reaching the last vertex
+     addi   $10, $10, vtxSize               // advance to the next vertex
     li      cmd_w0, 0                       // Clear count of DL cmds to skip loading
 G_ENDDL_handler:
     lbu     $1, displayListStackLength      // Load the DL stack index; if end stack,
@@ -2190,7 +2360,10 @@ lt_continue_setup:
     // vPairNrml, vAAA:vBBB (to be merged) packed normals
     // Outputs: leave alone vPairPosI/F; update vPairRGBA, vPairST 
     // Locals: vAAA and vBBB after merge and normals selection, vCCC, vDDD, vPairLt, vNrmOut
-    // New available locals: $6, $7 (existing: $11, $12, $20, $24)
+    // New available locals: $6, $7 (existing: $11, $10, $20, $24)
+.if CFG_PROFILING_B
+    addi    perfCounterB, perfCounterB, 2    // Increment lit vertex count by 2
+.endif
     beqz    $11, lt_skip_packed_normals
      vmrg   vAAA, vAAA, vBBB          // Merge packed normals
     // Packed normals algorithm. This produces a vector (one for each input vertex)
@@ -2221,7 +2394,7 @@ lt_skip_packed_normals:
     vmadn   $v29, vM1F, vPairNrml[1h]
     addi    curLight, curLight, altBase // Point to ambient light
     vmadh   $v29, vM1I, vPairNrml[1h]
-    andi    $12, $5, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
+    andi    $10, $5, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
     vmadn   vBBB, vM2F, vPairNrml[2h] // vBBB = normals frac
     beqz    $11, lt_after_xfrm_normals // Skip if G_NORMALSMODE_FAST
      vmadh  vAAA, vM2I, vPairNrml[2h] // vAAA = normals int
@@ -2267,7 +2440,7 @@ lt_after_xfrm_normals:
     vmadm   vCCC, vPairRGBA, $v30[0] // + (alpha - 1) * aoAmb factor; elems 3, 7
     vcopy   vPairNrml, vNrmOut
 .endif
-    beqz    $12, lt_loop // Not specular or fresnel
+    beqz    $10, lt_loop // Not specular or fresnel
      vmulf  vPairLt, vPairLt, vCCC[3h] // light color *= ambient factor
     // Get vNrmOut = normalize(camera - vertex), vAAA = (vPairNrml dot vNrmOut)
     ldv     vAAA[0], (cameraWorldPos - altBase)(altBaseReg) // Camera world pos
@@ -2277,7 +2450,7 @@ lt_after_camera:
     // If specular, replace vPairNrml with reflected vector
     vne     $v29, $v31, $v31[3h]      // Set VCC to 11101110
     beqz    $6, @@skip
-     li     $12, 0                    // Clear flag for specular or fresnel
+     li     $10, 0                    // Clear flag for specular or fresnel
     vmulf   vBBB, vPairNrml, vAAA[0h] // Projection of camera vec onto normal
     vmudh   $v29, vNrmOut, $v31[1]    // -camera vec
     vmadh   vPairNrml, vBBB, $v31[3]  // + 2 * projection
@@ -2330,7 +2503,7 @@ vLookat0   equ $v17 // = vPairLt:   lookat direction 0 (not initially)
     vadd    vPairRGBA, vPairRGBA, $v31[7]  // 0x7FFF; undo change for ambient occlusion
     andi    $11, $5, G_LIGHTTOALPHA >> 8
     andi    $20, $5, G_PACKED_NORMALS >> 8
-    andi    $12, $5, G_TEXTURE_GEN >> 8
+    andi    $10, $5, G_TEXTURE_GEN >> 8
     vmulf   vLtRGBOut, vPairRGBA, vPairLt  // RGB output is RGB * light
     beqz    $11, lt_skip_cel
      vcopy  vLtAOut, vPairRGBA             // Alpha output = vertex alpha (only 3, 7 matter)
@@ -2362,7 +2535,7 @@ lt_skip_novtxcolor:
     vmrg    vPairRGBA, vPairRGBA, vAAA[3h] // Replace color or alpha with fresnel
     vge     vPairRGBA, vPairRGBA, $v31[2]  // Clamp to >= 0 for fresnel; doesn't affect others
 lt_skip_fresnel:
-    beqz    $12, vtx_return_from_lighting  // no texgen
+    beqz    $10, vtx_return_from_lighting  // no texgen
     // Texgen: vLookat0, vPairNrml, have to leave vPairPosI/F, vPairRGBA; output vPairST
      vmudh  $v29, vOne, vLookat0[0h]
     lpv     vLookat1[4], (ltBufOfs + 0 - lightSize)(curLight) // Lookat 1 dir in elems 0-2
@@ -2428,7 +2601,7 @@ lt_normal_to_vertex:
     vmadh   vCCC, vCCC, vCCC[7]        // PL: + len^2 int * quadratic factor int  = vCCC int
     vmudh   vBBB, vOne, vAAA[0h]       // Both: Sum components of dot product as signed
     vmadh   vBBB, vOne, vAAA[1h]       // Both:
-    bnez    $12, lt_after_camera       // $12 set if computing specular or fresnel
+    bnez    $10, lt_after_camera       // $10 set if computing specular or fresnel
      vmadh  vAAA, vOne, vAAA[2h]       // Both: vAAA dot product
     vrcph   vBBB[1], vCCC[0]     // 1/(2*light factor), input of 0000.8000 -> no change normals
     luv     vDDD,    (ltBufOfs + 0 - lightSize)(curLight) // vDDD = light color
@@ -2495,7 +2668,7 @@ ovl234_lighting_entrypoint_ovl4ver:  // same IMEM address as ovl234_lighting_ent
      li     postOvlRA, ovl234_lighting_entrypoint // set the return address
 
 ovl234_ovl4_entrypoint:
-    li      $12, mMatrix + 0xE   // For right rotates with lrv/ldv for calc_mit
+    li      $10, mMatrix + 0xE   // For right rotates with lrv/ldv for calc_mit
     j       ovl4_select_instr
      li     $2, 1                // $7 = 1 (lighting && mIT invalid) if doing calc_mit
 
@@ -2518,7 +2691,7 @@ G_MTX_end: // Multiplies the temp loaded matrix into the M or VP matrix
     move    $2, $5 // Input 0 = output
     jal     while_wait_dma_busy // If ovl4 already in memory, was not done
      li     $3, tempMemRounded // Input 1 = temp mem (loaded mtx)
-    addi    $12, $3, 0x0018
+    addi    $10, $3, 0x0018
 @@loop:
     vmadn   $v9, $v31, $v31[2]  // 0
     addi    $11, $3, 0x0008
@@ -2539,7 +2712,7 @@ G_MTX_end: // Multiplies the temp loaded matrix into the M or VP matrix
     vmadn   $v7, $v5, $v2[0h]
     bne     $3, $11, @@innerloop
      vmadh  $v6, $v4, $v2[0h]
-    bne     $3, $12, @@loop
+    bne     $3, $10, @@loop
      addi   $3, $3, 0x0008
     // Store the results in M or VP
     sqv     $v9[0], 0x0020($5)
@@ -2559,15 +2732,15 @@ G_DMA_IO_handler:
      li     $ra, wait_for_dma_and_run_next_command  // Setup the return address for running the next DL command
 
 G_BRANCH_WZ_handler:
-    j       vtx_addrs_from_cmd          // byte 3 = vtx being tested; addr -> $12
+    j       vtx_addrs_from_cmd          // byte 3 = vtx being tested; addr -> $10
      li     $11, branchwz_return_from_addrs
 branchwz_return_from_addrs:
 .if CFG_G_BRANCH_W                      // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2
-    lh      $12, VTX_W_INT($12)         // read the w coordinate of the vertex (f3dzex)
+    lh      $10, VTX_W_INT($10)         // read the w coordinate of the vertex (f3dzex)
 .else
-    lw      $12, VTX_SCR_Z($12)         // read the screen z coordinate (int and frac) of the vertex (f3dex2)
+    lw      $10, VTX_SCR_Z($10)         // read the screen z coordinate (int and frac) of the vertex (f3dex2)
 .endif
-    sub     $2, $12, cmd_w1_dram        // subtract the w/z value being tested
+    sub     $2, $10, cmd_w1_dram        // subtract the w/z value being tested
     bgez    $2, run_next_DL_command     // if vtx.w/z >= cmd w/z, continue running this DL
      lw     cmd_w1_dram, rdpHalf1Val    // load the RDPHALF1 value as the location to branch to
     j       branch_dl                   // need $2 < 0 for nopush and cmd_w1_dram
@@ -2587,15 +2760,15 @@ calc_mit:
     However, if input matrix has components on the order of 0000.0100, multiplying
     two terms will reduce that to the order of 0000.0001, which kills all the precision.
     */
-    // Get absolute value of all terms of M matrix. $12 already set in dispatch.
+    // Get absolute value of all terms of M matrix. $10 already set in dispatch.
     vxor    $v20, vM0I, $v31[1] // One's complement of X int part
     sb      $7, mITValid                                     // $7 is 1 if we got here, mark valid
     vlt     $v29, vM0I, $v31[2] // X int part < 0
     li      $11, mMatrix + 2                                 // For left rotates with lqv/ldv
     vabs    $v21, vM0I, vM0F     // Apply sign of X int part to X frac part
-    lrv     $v10[0], (0x00)($12)                              // X int right shifted
+    lrv     $v10[0], (0x00)($10)                              // X int right shifted
     vxor    $v22, vM1I, $v31[1] // One's complement of Y int part
-    lrv     $v11[0], (0x20)($12)                              // X frac right shifted
+    lrv     $v11[0], (0x20)($10)                              // X frac right shifted
     vmrg    $v20, $v20, vM0I    // $v20:$v21 = abs(X int:frac)
     lqv     $v16[0], (0x10)($11)                              // Z int left shifted
     vlt     $v29, vM1I, $v31[2] // Y int part < 0
@@ -2609,9 +2782,9 @@ calc_mit:
     vlt     $v29, vM2I, $v31[2] // Z int part < 0
     lsv     $v17[4],  (0x2E)($11)                             // Z frac left rot elem 0->2
     vabs    $v25, vM2I, vM2F     // Apply sign of Z int part to Z frac part
-    lrv     $v18[0], (0x10)($12)                              // Z int right shifted
+    lrv     $v18[0], (0x10)($10)                              // Z int right shifted
     vmrg    $v24, $v24, vM2I    // $v24:$v25 = abs(Z int:frac)
-    lrv     $v19[0], (0x30)($12)                              // Z frac right shifted
+    lrv     $v19[0], (0x30)($10)                              // Z frac right shifted
     // See if any of the int parts are nonzero. Also, get the maximum of the frac parts.
     vge     $v21, $v21, $v23
     lqv     $v8[0],  (0x00)($11)                              // X int left shifted
@@ -2630,9 +2803,9 @@ calc_mit:
     vmadh   $v16, $v16, $v31[1]
     ldv     $v13[0], (0x28)($11)                              // Y frac left shifted
     vge     $v21, $v21, $v21[1h]
-    ldv     $v14[0], (-0x08)($12)                             // Y int right shifted
+    ldv     $v14[0], (-0x08)($10)                             // Y int right shifted
     vor     $v20, $v20, $v20[1h]
-    ldv     $v15[0], (0x18)($12)                              // Y frac right shifted
+    ldv     $v15[0], (0x18)($10)                              // Y frac right shifted
     vmudn   $v27, $v19, $v31[1] // -1; $v26:$v27 is negated copy of Z right rot
     lsv     $v12[4], (0x06)($11)                              // Y int left rot elem 0->2
     vmadh   $v26, $v18, $v31[1]
diff --git a/rsp/rsp_defs.inc b/rsp/rsp_defs.inc
index c4db626..c3095ce 100644
--- a/rsp/rsp_defs.inc
+++ b/rsp/rsp_defs.inc
@@ -6,10 +6,10 @@
 OS_YIELD_DATA_SIZE equ 0xC00
 YIELD_DATA_FOOTER_SIZE equ 0x18
 yieldDataFooter equ OS_YIELD_DATA_SIZE - YIELD_DATA_FOOTER_SIZE
-YDF_OFFSET_RDPWAITCYC   equ 0x00
-YDF_OFFSET_GCLKSAMPLE   equ 0x04
-YDF_OFFSET_PERFCOUNTER1 equ 0x08
-YDF_OFFSET_PERFCOUNTER2 equ 0x0C
+YDF_OFFSET_PERFCOUNTERA equ 0x00
+YDF_OFFSET_PERFCOUNTERB equ 0x04
+YDF_OFFSET_PERFCOUNTERC equ 0x08
+YDF_OFFSET_PERFCOUNTERD equ 0x0C
 YDF_OFFSET_TASKDATAPTR  equ 0x10
 YDF_OFFSET_UCODE        equ 0x14