// Files
// F3DEX3/f3dex3.s
// 2025-03-19 21:58:09 -07:00
//
// 3645 lines
// 170 KiB
// ArmAsm

// Select the RSP instruction set for everything that follows.
.rsp
// Project include files; presumably these supply the RSP and GBI (G_*)
// symbols used throughout this file -- confirm against the .inc files.
.include "rsp/rsp_defs.inc"
.include "rsp/gbi.inc"
// This file assumes DATA_FILE and CODE_FILE are set on the command line
// Fail the build early if the assembler reports a version below 110; the
// error text names the minimum release.
.if version() < 110
.error "armips 0.11 or newer is required"
.endif
// li: load an immediate into a scalar register. It is built on addi so that
// the immediate is sign-extended; an ori-based version would zero-extend it.
.macro li, dstReg, immValue
    addi dstReg, $zero, immValue
.endmacro
// move: copy one scalar register to another, emitted as an ori with a zero
// immediate.
.macro move, dstReg, srcReg
    ori dstReg, srcReg, 0
.endmacro
// blt / bgt / ble / bge are macros built on slt, and using them silently
// clobbers $1. Each name is therefore redefined here to abort the build. If
// that behavior is really wanted, write the slt and the branch by hand.
.macro blt, lhs, rhs, target
    .error "blt is a macro using slt, and silently clobbers $1!"
.endmacro

.macro bgt, lhs, rhs, target
    .error "bgt is a macro using slt, and silently clobbers $1!"
.endmacro

.macro ble, lhs, rhs, target
    .error "ble is a macro using slt, and silently clobbers $1!"
.endmacro

.macro bge, lhs, rhs, target
    .error "bge is a macro using slt, and silently clobbers $1!"
.endmacro
// vcopy: copy a vector register by ORing the source with itself. This form
// does not need $v0 to hold vZero (in F3DEX3 it often does not). It is also
// not corrupted by a set $vco and does not consume $vco, which a following
// instruction may still need.
.macro vcopy, vecDst, vecSrc
    vor vecDst, vecSrc, vecSrc
.endmacro
// vclr: zero a vector register by XORing $v31 with itself. $v31 is the
// source rather than the destination because $v31 never changes, whereas the
// destination may have been written 2 or 3 cycles earlier, which would stall.
.macro vclr, vecDst
    vxor vecDst, $v31, $v31
.endmacro
// Selectors for the accumulator slice that vreadacc reads.
ACC_UPPER equ 0
ACC_MIDDLE equ 1
ACC_LOWER equ 2

// vreadacc: read the accumulator slice chosen by one of the ACC_* selectors
// into a vector register. The dummy source operands are $v31 to avoid
// stalls. Vanilla tri code once passed the destination register there just
// after writing it (it should have been $v29), which caused stalls.
.macro vreadacc, vecDst, slice
    vsar vecDst, $v31, $v31[slice]
.endmacro
//
// Profiling configurations. To make space for the profiling features, if any of
// the profiling configurations are enabled, G_LIGHTTORDP and !G_SHADING_SMOOTH
// are removed, i.e. G_LIGHTTORDP behaves as a no-op and all tris are smooth
// shaded.
//
// Exactly one branch of the chain below is assembled. Each branch defines the
// same four symbols:
//   ENABLE_PROFILING             1 in configurations A, B and C, 0 by default
//   COUNTER_A_UPPER_VERTEX_COUNT 1 when the upper 16 bits of perfCounterA are
//                                the vertex count (configuration B and default)
//   COUNTER_B_LOWER_CMD_COUNT    1 when the lower 16 bits of perfCounterB are
//                                the DL command count (configurations A and C)
//   COUNTER_C_FIFO_FULL          1 when perfCounterC counts RDP FIFO full stall
//                                cycles (configuration A and default)
// The comment describing each configuration sits directly above the
// .if / .elseif / .else line that selects it.
//
// Profiling Configuration A
// perfCounterA:
// cycles RSP spent processing vertex commands (incl. vertex DMAs)
// perfCounterB:
// upper 16 bits: fetched DL command count
// lower 16 bits: DL command count
// perfCounterC:
// cycles RSP was stalled because RDP FIFO was full
// perfCounterD:
// cycles RSP spent processing triangle commands, NOT including buffer flushes
.if CFG_PROFILING_A
// A was requested; reject the build if B or C was requested as well.
.if CFG_PROFILING_B || CFG_PROFILING_C
.error "At most one CFG_PROFILING_ option can be enabled at a time"
.endif
ENABLE_PROFILING equ 1
COUNTER_A_UPPER_VERTEX_COUNT equ 0
COUNTER_B_LOWER_CMD_COUNT equ 1
COUNTER_C_FIFO_FULL equ 1
// Profiling Configuration B
// perfCounterA:
// upper 16 bits: vertex count
// lower 16 bits: lit vertex count
// perfCounterB:
// upper 18 bits: tris culled by occlusion plane count
// lower 14 bits: clipped (input) tris count
// perfCounterC:
// upper 18 bits: overlay (all 0-4) load count
// lower 14 bits: overlay 2 (lighting) load count
// perfCounterD:
// upper 18 bits: overlay 3 (clipping) load count
// lower 14 bits: overlay 4 (misc) load count
.elseif CFG_PROFILING_B
// A is known to be off on this branch, so only C needs to be checked.
.if CFG_PROFILING_C
.error "At most one CFG_PROFILING_ option can be enabled at a time"
.endif
ENABLE_PROFILING equ 1
COUNTER_A_UPPER_VERTEX_COUNT equ 1
COUNTER_B_LOWER_CMD_COUNT equ 0
COUNTER_C_FIFO_FULL equ 0
// Profiling Configuration C
// perfCounterA:
// cycles RSP believes it was running (this ucode only)
// perfCounterB:
// upper 16 bits: samples GCLK was alive (sampled once per DL command count)
// lower 16 bits: DL command count
// perfCounterC:
// upper 18 bits: small RDP command count (all RDP cmds except tris)
// lower 14 bits: matrix loads count
// perfCounterD:
// cycles RSP was stalled waiting for miscellaneous DMAs to finish
.elseif CFG_PROFILING_C
// A and B are both known to be off here, so no exclusivity check is needed.
ENABLE_PROFILING equ 1
COUNTER_A_UPPER_VERTEX_COUNT equ 0
COUNTER_B_LOWER_CMD_COUNT equ 1
COUNTER_C_FIFO_FULL equ 0
// Default (extra profiling disabled)
// perfCounterA:
// upper 16 bits: vertex count
// lower 16 bits: RDP/out tri count
// perfCounterB:
// upper 18 bits: RSP/in tri count
// lower 14 bits: tex/fill rect count
// perfCounterC:
// cycles RSP was stalled because RDP FIFO was full
// perfCounterD:
// unused/zero
.else
ENABLE_PROFILING equ 0
COUNTER_A_UPPER_VERTEX_COUNT equ 1
COUNTER_B_LOWER_CMD_COUNT equ 0
COUNTER_C_FIFO_FULL equ 1
.endif
/*
There are two different memory spaces for the overlays: (a) IMEM and (b) the
microcode file (which, plus an offset, is also the location in DRAM).
A label marks both an IMEM address and a file address, but evaluating the
label in an integer context (e.g. in a branch) gives the IMEM address.
`orga(your_label)` gets the file address of the label, and `.orga` sets the
file address.
`.headersize`, as well as the value after `.create`, sets the difference
between IMEM addresses and file addresses, so you can set the IMEM address
with `.headersize desired_imem_addr - orga()`.
In IMEM, the whole microcode is organized as (each row is the same address):
0x80 space | |
for boot code Overlay 0 Overlay 1
(End (More cmd
start task) handlers)
(initialization) | |
Rest command handlers
Vertex start
All tri write cmds
Overlay 2 Overlay 3 Overlay 4
(Basic lighting) (Clipping, (Advanced
rare cmds) lighting)
Main vertex write
DMA code
In the file, the microcode is organized as:
start (file addr 0x0 = IMEM 0x1080)
Many command handlers
Overlay 3
Vertex and tri handlers
DMA code (end of this = IMEM 0x2000 = file 0xF80)
Overlay 0
Overlay 1
Overlay 2
Overlay 4
*/
////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// DMEM //////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// RSP DMEM
// Open the DMEM image (DATA_FILE) with addresses starting at 0x0000. Every
// label up to the matching .close is a DMEM address.
.create DATA_FILE, 0x0000
/*
Matrices are stored and used in a transposed format compared to how they are
normally written in mathematics. For the integer part:
00 02 04 06 typical Xscl Rot Rot 0
08 0A 0C 0E use: Rot Yscl Rot 0
10 12 14 16 Rot Rot Zscl 0
18 1A 1C 1E Xpos Ypos Zpos 1
The fractional part comes next and is in the same format.
Applying this transformation is done by multiplying a row vector times the
matrix, like:
X Y Z 1 * Xscl Rot Rot 0 = NewX NewY NewZ 1
Rot Yscl Rot 0
Rot Rot Zscl 0
Xpos Ypos Zpos 1
In C, the matrix is accessed as matrix[row][col], and the vector is vector[row].
*/
// Each matrix occupies 0x40 bytes: per the note above, the integer parts come
// first and the fractional parts follow in the same arrangement.
// 0x0000-0x0040: model matrix
mMatrix:
.fill 0x40
// 0x0040-0x0080: view * projection matrix
vpMatrix:
.fill 0x40
// 0x0080-0x00C0: model * (view * projection) matrix
mvpMatrix:
.fill 0x40
// The three matrices must end exactly at 0x00C0 so that the scissor and
// othermode words that follow sit at the address named in the error text.
.if . != 0x00C0
.error "Scissor and othermode must be at 0x00C0 for S2DEX"
.endif
// scissor (four 12-bit values)
// Initial value: upper-left (0, 0), bottom-right (320, 240), with every
// coordinate multiplied by 4 before being packed into its field.
scissorUpLeft: // the command byte is included since the command word is copied verbatim
.dw (G_SETSCISSOR << 24) | (( 0 * 4) << 12) | (( 0 * 4) << 0)
scissorBottomRight:
.dw ((320 * 4) << 12) | ((240 * 4) << 0)
// othermode
otherMode0: // command byte included, same as above
.dw (G_RDPSETOTHERMODE << 24) | (0x080CFF)
otherMode1:
.dw 0x00000000
// Saved texrect state for combining the multiple input commands into one RDP texrect command
texrectWord1:
.fill 4 // first word, has command byte, xh and yh
texrectWord2:
.fill 4 // second word, has tile, xl, yl
// First half of RDP value for split commands; overwritten by numLightsxSize
// (The G_MW_NUMLIGHT entry of movewordTable points at numLightsxSize - 3,
// which is the third byte of rdpHalf1Val.)
rdpHalf1Val:
.fill 4
pointLightFlagOrDirXfrmValid:
.db 0 // Sign bit set if there are point lights. [1, 7F] if dir lights and xfrm valid.
numLightsxSize:
.db 0 // Overwrites second half of rdpHalf1Val when written
// displaylist stack length
displayListStackLength:
.db 0x00 // starts at 0, increments by 4 for each "return address" pushed onto the stack
// Nonzero if the MVP matrix is valid, 0 if it needs to be recomputed.
mvpValid:
.db 0
// viewport (16 bytes; G_MV_VIEWPORT in movememTable points here)
viewport:
.fill 16
// Current RDP fifo output position
rdpFifoPos:
.fill 4
// Matrix stack pointer, initially zero.
matrixStackPtr:
.dw 0x00000000
// segment table
segmentTable:
.fill (4 * 16) // 16 DRAM pointers
// displaylist stack
displayListStack:
// ucode text (shared with DL stack)
// The ID string followed by a newline (0x0A) is the initial content of the
// stack area.
.ascii ID_STR, 0x0A
endIdStr:
// Pad the area up to 0x180. If the ID string already runs past 0x180, report
// it; the .align after the error only limits follow-on errors.
.if endIdStr < 0x180
.fill (0x180 - endIdStr)
.elseif endIdStr > 0x180
.error "ID_STR is too long"
.align 16 // to suppress subsequent errors
.endif
// End of the DMEM region whose layout must stay fixed (see the error text).
endSharedDMEM:
.if . != 0x180
.error "endSharedDMEM at incorrect address, matters for G_LOAD_UCODE / S2DEX"
.endif
// constants for register $v31
// The eight halfwords below must start on a 16-byte boundary.
.if (. & 15) != 0
.error "Wrong alignment for v31value"
.endif
v31Value:
// v31 must go from lowest to highest (signed) values for vcc patterns.
// Also relies on the fact that $v31[0h] is -4,-4,-4,-4, 4, 4, 4, 4.
// (Elements 0 and 4 below are -4 and 4 respectively.)
.dh -4 // used in clipping, vtx write for Newton-Raphson reciprocal
.dh -1 // used often
.dh 0 // used often
.dh 2 // used as clip ratio (vtx write, clipping) and in clipping
.dh 4 // used for same Newton-Raphsons, occlusion plane scaling
.dh 0x4000 // used in tri write, texgen
.dh 0x7F00 // used in fog, normals unpacking
.dh 0x7FFF // used often
/*
Quick note on Newton-Raphson:
https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
Given input D, we want to find the reciprocal R. The base formula for refining
the estimate of R is R_new = R*(2 - D*R). However, since the RSP reciprocal
instruction moves the radix point 1 to the left, the result has to be multiplied
by 2. So it's 2*R*(2 - D*2*R) = R*(4 - 4*D*R) = R*(1*4 + D*R*-4). This is where
the 4 and -4 come from. For tri write, the result needs to be multiplied by 4
for subpixels, so it's 16 and -16.
*/
// Camera world position and light data. The G_MV_LIGHT entry of movememTable
// points at cameraWorldPos, the start of this region.
cameraWorldPos:
.skip 6
tempTriRA:
.skip 2 // Overwritten as part of camera world position, used as temp
lightBufferLookat:
.skip 8 // s8 X0, Y0, Z0, dummy, X1, Y1, Z1, dummy
// Room for G_MAX_LIGHTS lights of lightSize bytes each.
lightBufferMain:
.skip (G_MAX_LIGHTS * lightSize)
lightBufferAmbient:
.skip 8 // just colors for ambient light
// Offset of lightBufferMain from altBase. altBase is defined later in DMEM,
// so this offset is negative.
ltBufOfs equ (lightBufferMain - altBase)
// Occlusion plane coefficients: eight edge coefficients (c0-c7) followed by
// four mid coefficients (kx, ky, kz, kc), one halfword each.
occlusionPlaneEdgeCoeffs:
/*
NOTE: This explanation is outdated; see cpu/occlusionplane.c
Vertex is in occlusion region if all five equations below are true:
4 * screenX[s13.2] * c0[s0.15] - 0.5 * screenY[s13.2] < c4[s14.1]
4 * screenY[s13.2] * c1[s0.15] - 0.5 * screenX[s13.2] < c5[s14.1]
4 * screenX[s13.2] * c2[s0.15] + 0.5 * screenY[s13.2] < c6[s14.1]
4 * screenY[s13.2] * c3[s0.15] + 0.5 * screenX[s13.2] < c7[s14.1]
clamp_to_0.s15(clipX[s15.16] * kx[0.s15])
+ clamp_to_0.s15(clipY[s15.16] * ky[0.s15])
+ clamp_to_0.s15(clipZ[s15.16] * kz[0.s15])
+ kc[0.s15]
>= 0
The first four can be rewritten as (again, vertex is occluded if all are true):
screenY > screenX * 8*c0 + -2*c4
screenX > screenY * 8*c1 + -2*c5
screenY < screenX * -8*c2 + 2*c6
screenX < screenY * -8*c3 + 2*c7
where screenX and screenY are in subpixels (e.g. screenX = 100 = 25.0 pixels),
c0-c3 are shorts representing -1:0.99997,
and c4-c7 are shorts representing "half pixels" (e.g. c4 = 50 = 25.0 pixels)
For the last equation, one option is to think of kx through kc as in s10.5 mode
instead, so a value of 0x0020 is 1.0 and they can range from -0x400.00 to
0x3FF.F8. This choice is because clipZ ranges from 0x0000.0000 at the camera
plane to 0x03FF.0000 at the maximum distance away. The normal distance Adult
Link is from the camera is about 0x00B0.0000.
A better option is to develop your plane equation in floating point, e.g.
clipX[f] * -0.2f + clipY[f] * 0.4f + clipZ[f] * 1.0f + -200.0f >= 0
then multiply everything by (32768.0f / max(abs(kx), abs(ky), abs(kz), abs(kc)))
(here 32768.0f / 200.0f = 163.84f)
clipX[f] * -32.77f + clipY[f] * 65.54f + clipZ[f] * 163.84f + -32768
*/
// Initial values. NOTE(review): 0x8000 is the most negative signed halfword;
// presumably these defaults mean no vertex is ever treated as occluded until
// real coefficients are loaded -- confirm against cpu/occlusionplane.c.
.dh 0x0000 // c0
.dh 0x0000 // c1
.dh 0x0000 // c2
.dh 0x0000 // c3
.dh 0x8000 // c4
.dh 0x8000 // c5
.dh 0x8000 // c6
.dh 0x8000 // c7
occlusionPlaneMidCoeffs:
.dh 0x0000 // kx
.dh 0x0000 // ky
.dh 0x0000 // kz
.dh 0x8000 // kc
// Alternate base address because vector load offsets can't reach all of DMEM.
// altBaseReg permanently points here.
// altBase must be 16-byte aligned. It marks the same address as
// textureSettings1, the first of the four words below.
.if (. & 15) != 0
.error "Wrong alignment for altBase"
.endif
altBase:
textureSettings1:
.dw 0x00000000 // first word, has command byte, level, tile, and on
textureSettings2:
.dw 0xFFFFFFFF // second word, has s and t scale
geometryModeLabel:
.dw 0x00000000 // originally initialized to G_CLIPPING, but that does nothing
// Fog factor word; the G_MW_FOG entry of movewordTable points here.
fogFactor:
.dw 0x00000000
// constants for register vTRC
// The eight halfwords below must start on a 16-byte boundary.
.if (. & 15) != 0
.error "Wrong alignment for vTRCValue"
.endif
vTRCValue:
// Decal fix multiplier, and its offset = minus half of the multiplier.
decalFixMult equ 0x0400
decalFixOff equ (-(decalFixMult / 2))
.dh vertexBuffer // currently 0x02DE; for converting vertex index to address
.dh vtxSize << 7 // 0x1300; it's not 0x2600 because vertex indices are *2
.dh 0x1000 // some multiplier in tri write, increment in vertex indices
.dh decalFixMult
.dh 0x0020 // some edge write thing in tri write; formerly Z scale factor
.dh 0xFFF8 // used once in tri write, mask away lower ST bits
.dh decalFixOff // negative
.dh 0x0100 // used several times in tri write
// Compares every element of vTRC with element 7 (0x0100) using vge; the
// vector result goes to $v29 and is discarded. With the constants above,
// elements 0-3 and 7 are >= 0x0100, while elements 4-6 are not (0x0020, and
// 0xFFF8 and decalFixOff are negative as signed halfwords). Reading elements
// 0 through 7, that is the 11110001 pattern in the macro name.
.macro set_vcc_11110001
vge $v29, vTRC, vTRC[7]
.endmacro
// Elements 0 and 3 are the two symbolic values the pattern above depends on;
// check them at build time.
.if (vertexBuffer < 0x0100 || decalFixMult < 0x100)
.error "VCC pattern for vTRC corrupted"
.endif
// Named element selectors for vTRC, in the same order as the halfwords above.
vTRC_VB equ vTRC[0] // Vertex Buffer
vTRC_VS equ vTRC[1] // Vertex Size
vTRC_1000 equ vTRC[2]
vTRC_DM equ vTRC[3] // Decal Multiplier
vTRC_0020 equ vTRC[4]
vTRC_FFF8 equ vTRC[5]
vTRC_DO equ vTRC[6] // Decal Offset
vTRC_0100 equ vTRC[7]
// DMEM address of element 7 (the 0x0100 halfword).
vTRC_0100_addr equ (vTRCValue + 2 * 7)
// Effects parameters. The G_MW_FX entry of movewordTable points at fxParams.
fxParams:
.if (. & 15) != 0
.error "Wrong alignment for fxParams"
.endif
// First 8 values here loaded with lqv.
// (aoAmbientFactor through fresnelOffset: eight halfwords, 16 bytes.)
aoAmbientFactor:
.dh 0xFFFF
aoDirectionalFactor:
.dh 0xA000
aoPointFactor:
.dh 0x0000
perspNorm:
.dh 0xFFFF
texgenLinearCoeffs:
.dh 0x44D3
.dh 0x6CB3
fresnelScale:
.dh 0x0000
fresnelOffset:
.dh 0x0000
// The values from here on are outside the 16 bytes covered by the lqv above.
attrOffsetST:
.dh 0x0100
.dh 0xFF00
alphaCompareCullMode:
.db 0x00 // 0 = disabled, 1 = cull if all < thresh, -1 = cull if all >= thresh
alphaCompareCullThresh:
.db 0x00 // Alpha threshold, 00 - FF
lastMatDLPhyAddr:
.dw 0
// These two bytes are adjacent so that they can be read together as one
// halfword, as described in the comment on the first byte.
packedNormalsMaskConstant:
.db 0xF8 // When read, materialCullMode has been zeroed, so read as 0xF800
materialCullMode:
.db 0
// moveword table
// One halfword DMEM address per G_MW_* index. The third slot is not a
// moveword target; it holds a constant for packed normals instead.
movewordTable:
.dh fxParams // G_MW_FX
.dh numLightsxSize - 3 // G_MW_NUMLIGHT
packedNormalsConstants:
// Bit 2 of this label's address must be clear.
.if (. & 4) != 0
.error "Alignment broken for packed normals constants in movewordTable"
.endif
.dh 0x2008 // For packed normals; unused in movewordTable
// The segmentTable entry that follows is also read as part of the packed
// normals constants, so the upper byte of its address must be zero.
.if (segmentTable & 0xFF00) != 0
.error "Packed normals constants relies on first byte of segmentTable addr being 0"
.endif
.dh segmentTable // G_MW_SEGMENT
.dh fogFactor // G_MW_FOG
.dh lightBufferMain // G_MW_LIGHTCOL
// Movemem table
// One halfword DMEM destination address per entry.
movememTable:
.dh tempMatrix // G_MTX multiply temp matrix (model)
.dh mMatrix // G_MV_MMTX
.dh tempMatrix // G_MTX multiply temp matrix (projection)
.dh vpMatrix // G_MV_PMTX
.dh viewport // G_MV_VIEWPORT
.dh cameraWorldPos // G_MV_LIGHT
// Mask of the clip planes currently being tested.
activeClipPlanes:
.dh CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri write, set to zero when clipping
// One shift amount per scaled clip plane, in the order -Y, +Y, -X, +X.
clipCondShifts:
.db CLIP_SCAL_NY_SHIFT // Constants for clipping algorithm
.db CLIP_SCAL_PY_SHIFT
.db CLIP_SCAL_NX_SHIFT
.db CLIP_SCAL_PX_SHIFT
// jumpTableEntry: emit the low 16 bits of a code address as one halfword.
.macro jumpTableEntry, target
    .dh target & 0xFFFF
.endmacro
// G_POPMTX, G_MTX, G_MOVEMEM Command Jump Table
// Three halfword entries, each the low 16 bits of the code address where the
// named command finishes.
movememHandlerTable:
jumpTableEntry G_POPMTX_end // G_POPMTX
jumpTableEntry G_MTX_end // G_MTX (multiply)
jumpTableEntry G_MOVEMEM_end // G_MOVEMEM, G_MTX (load)
// miniTableEntry: emit a one-byte table entry for a command handler. The
// handler address must lie in [0x1000, 0x1400), otherwise the build fails.
// The byte stored is the handler's offset from 0x1000 divided by 4.
.macro miniTableEntry, handlerAddr
    .if handlerAddr < 0x1000 || handlerAddr >= 0x1400
        .error "Handler address out of range!"
    .endif
    .db (handlerAddr - 0x1000) >> 2
.endmacro
// RDP/Immediate Command Mini Table
// 1 byte per entry, after << 2 points to an addr in first 1/4 of IMEM
// The cmdMiniTable label sits inside the table: the 44 entries that precede
// it are at negative offsets from the label, and the 12 entries that follow
// start at offset 0. NOTE(review): presumably the table is indexed by the
// command byte treated as signed -- confirm in the dispatch code.
miniTableEntry G_FLUSH_handler
miniTableEntry G_MEMSET_handler
miniTableEntry G_DMA_IO_handler
miniTableEntry G_TEXTURE_handler
miniTableEntry G_POPMTX_handler
miniTableEntry G_GEOMETRYMODE_handler
miniTableEntry G_MTX_handler
miniTableEntry G_MOVEWORD_handler
miniTableEntry G_MOVEMEM_handler
miniTableEntry G_LOAD_UCODE_handler
miniTableEntry G_DL_handler
miniTableEntry G_ENDDL_handler
miniTableEntry G_SPNOOP_handler
miniTableEntry G_RDPHALF_1_handler
miniTableEntry G_SETOTHERMODE_L_handler
miniTableEntry G_SETOTHERMODE_H_handler
miniTableEntry G_TEXRECT_handler
miniTableEntry G_TEXRECTFLIP_handler
miniTableEntry G_SYNC_handler // G_RDPLOADSYNC
miniTableEntry G_SYNC_handler // G_RDPPIPESYNC
miniTableEntry G_SYNC_handler // G_RDPTILESYNC
miniTableEntry G_SYNC_handler // G_RDPFULLSYNC
miniTableEntry G_RDP_handler // G_SETKEYGB
miniTableEntry G_RDP_handler // G_SETKEYR
miniTableEntry G_RDP_handler // G_SETCONVERT
miniTableEntry G_SETSCISSOR_handler
miniTableEntry G_RDP_handler // G_SETPRIMDEPTH
miniTableEntry G_RDPSETOTHERMODE_handler
miniTableEntry load_cmds_handler // G_LOADTLUT
miniTableEntry G_RDPHALF_2_handler
miniTableEntry G_RDP_handler // G_SETTILESIZE
miniTableEntry load_cmds_handler // G_LOADBLOCK
miniTableEntry load_cmds_handler // G_LOADTILE
miniTableEntry G_RDP_handler // G_SETTILE
miniTableEntry G_RDP_handler // G_FILLRECT
miniTableEntry G_RDP_handler // G_SETFILLCOLOR
miniTableEntry G_RDP_handler // G_SETFOGCOLOR
miniTableEntry G_RDP_handler // G_SETBLENDCOLOR
miniTableEntry G_RDP_handler // G_SETPRIMCOLOR
miniTableEntry G_RDP_handler // G_SETENVCOLOR
miniTableEntry G_RDP_handler // G_SETCOMBINE
miniTableEntry G_SETxIMG_handler // G_SETTIMG
miniTableEntry G_SETxIMG_handler // G_SETZIMG
miniTableEntry G_SETxIMG_handler // G_SETCIMG
// Offset 0 of the table: G_NOOP, followed by the vertex / triangle commands.
cmdMiniTable:
miniTableEntry G_SYNC_handler // G_NOOP
miniTableEntry G_VTX_handler
miniTableEntry G_MODIFYVTX_handler
miniTableEntry G_CULLDL_handler
miniTableEntry G_BRANCH_WZ_handler
miniTableEntry G_TRI1_handler
miniTableEntry G_TRI2_handler
miniTableEntry G_QUAD_handler
miniTableEntry G_TRISTRIP_handler
miniTableEntry G_TRIFAN_handler
miniTableEntry G_LIGHTTORDP_handler
miniTableEntry G_RELSEGMENT_handler
// The maximum number of generated vertices in a clip polygon. In reality, this
// is equal to MAX_CLIP_POLY_VERTS, but for testing we can change them separately.
// In case you're wondering if it's possible to have a 7-vertex polygon where all
// 7 verts are generated, it looks like this (X = generated vertex):
// ___----=>
// +---------------__X----X _-^
// | __--^^ X^
// | __--^^ _-^|
// _X^^^ _-^ |
// C | _-^ |
// ^X _-^ |
// |\ _-^ |
// +-X--_X^---------------+
// V^
// Maximum number of vertices the clipper may generate for one polygon.
MAX_CLIP_GEN_VERTS equ 7
// Normally, each clip plane can cut off a "tip" of a polygon, turning one vert
// into two. (It can also cut off more of the polygon and remove additional verts,
// but the maximum is one more vert per clip plane.) So with 5 clip planes, we
// could have a maximum of 8 verts in the final polygon. However, the verts
// generated by the no-nearclipping plane will always be at infinity, so they
// will always get replaced by generated verts from one of the other clip planes.
// Put another way, if there are 8 verts in the final polygon, there are 8 edges,
// which are portions of the 3 original edges plus portions of 5 edges along the
// 5 clip planes. But the edge portion along the no-nearclipping plane is at
// infinity, so that edge can't be on screen.
// It is rare but possible for these assumptions to be violated and a polygon
// with more than 7 verts to be generated. For example, numerical precision
// issues could cause the polygon to be slightly non-convex at one of the clip
// planes, causing the plane to cut off more than one tip. However, this
// implementation checks for an imminent overflow and aborts clipping (draws no
// tris) if this occurs. Because this is caused by extreme/degenerate cases like
// the camera exactly on a tri, not drawing anything is an okay result.
MAX_CLIP_POLY_VERTS equ 7
// One halfword per polygon vertex plus one extra halfword.
CLIP_POLY_SIZE_BYTES equ (MAX_CLIP_POLY_VERTS+1) * 2
// Room for every generated vertex in the same format as the vertex buffer.
CLIP_TEMP_VERTS_SIZE_BYTES equ (MAX_CLIP_GEN_VERTS * vtxSize)
VERTEX_BUFFER_SIZE_BYTES equ (G_MAX_VERTS * vtxSize)
// Each RDP command buffer is RDP_CMD_BUFSIZE bytes plus RDP_CMD_BUFSIZE_EXCESS
// bytes of excess, sized for one maximum-size triangle command.
RDP_CMD_BUFSIZE equ 0xB0
RDP_CMD_BUFSIZE_EXCESS equ 0xB0 // Maximum size of an RDP triangle command
RDP_CMD_BUFSIZE_TOTAL equ (RDP_CMD_BUFSIZE + RDP_CMD_BUFSIZE_EXCESS)
// The input buffer holds INPUT_BUFFER_CMDS display list commands of 8 bytes.
INPUT_BUFFER_CMDS equ 21
INPUT_BUFFER_SIZE_BYTES equ (INPUT_BUFFER_CMDS * 8)
INPUT_BUFFER_CLOBBER_OSTASK_AMT equ 0x10 // Input buffer overwrites beginning of OSTask, see rsp_defs.inc
OSTASK_ORIG_SIZE equ 0x40
// The part of OSTask that the input buffer does not overwrite.
OSTASK_CLOBBERED_SIZE equ (OSTASK_ORIG_SIZE - INPUT_BUFFER_CLOBBER_OSTASK_AMT)
// Address where the fixed-size buffers at the top of DMEM begin: 0x1000 minus
// the sizes of everything placed from vertexBuffer up to the end of OSTask.
END_VARIABLE_LEN_DMEM equ (0x1000 - OSTASK_CLOBBERED_SIZE - INPUT_BUFFER_SIZE_BYTES - (2 * RDP_CMD_BUFSIZE_TOTAL) - (2 * CLIP_POLY_SIZE_BYTES) - CLIP_TEMP_VERTS_SIZE_BYTES - VERTEX_BUFFER_SIZE_BYTES)
// Everything between startFreeDmem and endFreeDmem is unallocated DMEM: the
// .org jumps from the end of the variables above to the start of the
// fixed-size buffers.
startFreeDmem:
.org END_VARIABLE_LEN_DMEM
endFreeDmem:
// Main vertex buffer in RSP internal format
vertexBuffer:
.skip VERTEX_BUFFER_SIZE_BYTES
// Space for temporary verts for clipping code, and reused for other things
// vertexTable and tempMatrix below are placed inside this region; the region
// itself ends at clipTempVertsEnd.
clipTempVerts:
// Round up to 0x10
.org ((clipTempVerts + 0xF) & 0xFF0)
// Vertex addresses, to avoid a multiply-add for each vertex index lookup
vertexTable:
.skip ((G_MAX_VERTS + 8) * 2) // halfword for each vertex; need 1 extra end addr, easier to write 8 extra
.if . > yieldDataFooter
// Need to fit everything through vertex buffer in yield buffer, would like
// to also fit vertexTable to avoid recompute after yield
.error "Too much being stored in yieldable DMEM"
.endif
// Scratch matrix; the G_MTX multiply entries of movememTable point here.
tempMatrix:
.skip 0x40
// vertexTable and tempMatrix together must fit inside the clipTempVerts region.
.if . > (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES)
.error "Too much in clipTempVerts"
.endif
.org (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES)
clipTempVertsEnd:
// Two polygon buffers of CLIP_POLY_SIZE_BYTES each.
clipPoly:
.skip CLIP_POLY_SIZE_BYTES // 3 5 7 + term 0
clipPoly2: // \ / \ / \
.skip CLIP_POLY_SIZE_BYTES // 4 6 7 + term 0
// First RDP Command Buffer
rdpCmdBuffer1:
.skip RDP_CMD_BUFSIZE
// The End label must fall on an address with bit 3 set, so that the
// EndPlus1Word label 8 bytes later has bit 3 clear.
.if (. & 8) != 8
.error "RDP command buffer alignment to 8 assumption broken"
.endif
rdpCmdBuffer1End:
.skip 8
rdpCmdBuffer1EndPlus1Word:
// This is so that we can temporarily store vector regs here with lqv/sqv
.skip RDP_CMD_BUFSIZE_EXCESS - 8
// Second RDP Command Buffer
// Same layout and alignment requirement as the first buffer.
rdpCmdBuffer2:
.skip RDP_CMD_BUFSIZE
.if (. & 8) != 8
.error "RDP command buffer alignment to 8 assumption broken"
.endif
rdpCmdBuffer2End:
.skip 8
rdpCmdBuffer2EndPlus1Word:
.skip RDP_CMD_BUFSIZE_EXCESS - 8
// Input buffer. After RDP cmd buffers so it can be vector addressed from end.
// Its last INPUT_BUFFER_CLOBBER_OSTASK_AMT bytes overlap the start of OSTask,
// so only the non-overlapping part is skipped here.
inputBuffer:
.skip INPUT_BUFFER_SIZE_BYTES - INPUT_BUFFER_CLOBBER_OSTASK_AMT
// 0x0FC0-0x1000: OSTask
OSTask:
.skip INPUT_BUFFER_CLOBBER_OSTASK_AMT
inputBufferEnd:
// inputBufferEnd expressed as a negative address, i.e. inputBufferEnd - 0x1000.
inputBufferEndSgn equ -(0x1000 - inputBufferEnd) // Underflow DMEM address
// rest of OSTask
.skip OSTASK_CLOBBERED_SIZE
// The layout must end exactly at 0x1000.
.if . != 0x1000
.error "DMEM organization incorrect"
.endif
.close // DATA_FILE
// See rsp_defs.inc about why these are not used and we can reuse them.
// startCounterTime lives in the OSTask ucode_size field.
startCounterTime equ (OSTask + OSTask_ucode_size)
// xfrmLookatDirs lives in the OSTask ucode_data field, written as a negative
// address (field address - 0x1000).
xfrmLookatDirs equ -(0x1000 - (OSTask + OSTask_ucode_data)) // and OSTask_ucode_data_size
// Memset buffer bounds: it starts at vertexBuffer rounded up to a multiple of
// 0x10 and may extend at most to rdpCmdBuffer1 rounded down to a multiple of
// 0x10. Its size is that span, capped at 0x800 bytes.
memsetBufferStart equ ((vertexBuffer + 0xF) & 0xFF0)
memsetBufferMaxEnd equ (rdpCmdBuffer1 & 0xFF0)
memsetBufferMaxSize equ (memsetBufferMaxEnd - memsetBufferStart)
memsetBufferSize equ (memsetBufferMaxSize > 0x800 ? 0x800 : memsetBufferMaxSize)
////////////////////////////////////////////////////////////////////////////////
/////////////////////////////// Register Naming ////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/*
Scalar regs:
Tri write Clip VW Vtx write Lighting V/L init Cmd dispatch
$zero ---------------------- Hardwired zero --------------------------------
$1 v1 texptr <------------- vtxLeft --------------------> temp, init 0
$2 v2 shdptr clipVNext -------> <----- lbPostAo temp
$3 v3 shdflg clipVLastOfsc vLoopRet ---------> temp
$4 flat shading vtx or (perf) initial FIFO stall time -------------------
$5 <------------------------ vGeomMid -------------------->
$6 geom mode clipMaskIdx -----> <--- lbTexgenOrRet
$7 v2flag tile <------------- fogFlag ----------> mtx valid cmd byte
$8 v3flag <------------- outVtx2 -------------------->
$9 xp texenab clipMask --------> <----- curLight viLtFlag ovlInitClock
$10 -------------------------- temp2 -------------------------------------
$11 --------------------------- temp -------------------------------------
$12 ----------------------- perfCounterD ---------------------------------
$13 ------------------------ altBaseReg ----------------------------------
$14 <-------------- inVtx --------------------->
$15 <------------ outVtxBase ------------------>
$16 v1flag lmaj clipFlags -------> <----- lbFakeAmb
$17 clipPolyRead ---->
$18 <---------- clipPolySelect -->
$19 temp clipVOnsc outVtx1 ----------> <--------- dmaLen
$20 temp <------------- flagsV1 ----------> <--------- dmemAddr
$21 <---------- clipPolyWrite ---> <----- ambLight ---------->
$22 ---------------------- rdpCmdBufEndP1 --------------------------------
$23 ----------------------- rdpCmdBufPtr ---------------------------------
$24 temp <------------- flagsV2 ----------> <--------- cmd_w1_dram
$25 cmd_w0 --------------------> <----- lbAfter <--------- cmd_w0
$26 ------------------------ taskDataPtr ---------------------------------
$27 ---------------------- inputBufferPos --------------------------------
$28 ----------------------- perfCounterA ---------------------------------
$29 ----------------------- perfCounterB ---------------------------------
$30 ----------------------- perfCounterC ---------------------------------
$ra return address, sometimes sign bit is flag ---------------------------
*/
// Symbolic names for scalar registers. Several names in different groups
// alias the same physical register (for example $9 is curLight, clipMask,
// viLtFlag and ovlInitClock), so a name is only meaningful within the phase
// its group belongs to. The table in the comment block above shows which
// phase owns each register.
// Global scalar regs:
perfCounterD equ $12 // Performance counter D (functions depend on config)
altBaseReg equ $13 // Alternate base address register for vector loads
rdpCmdBufEndP1 equ $22 // Pointer to one command word past "end" (middle) of RDP command buf
rdpCmdBufPtr equ $23 // RDP command buffer current DMEM pointer
taskDataPtr equ $26 // Task data (display list) DRAM pointer
inputBufferPos equ $27 // DMEM position within display list input buffer, relative to end
perfCounterA equ $28 // Performance counter A (functions depend on config)
perfCounterB equ $29 // Performance counter B (functions depend on config)
perfCounterC equ $30 // Performance counter C (functions depend on config)
// Vertex write:
vtxLeft equ $1 // Number of vertices left to process * 0x10
vLoopRet equ $3 // Return address at end of vtx loop = top of loop or misc lighting
vGeomMid equ $5 // Middle two bytes of geometry mode
fogFlag equ $7 // 8 if fog enabled, else 0
outVtx2 equ $8 // Pointer to second or dummy (= outVtx1) transformed vert
inVtx equ $14 // Pointer to loaded vertex to transform; < 0 means from clipping.
outVtxBase equ $15 // Pointer to vertex buffer to store transformed verts
outVtx1 equ $19 // Pointer to first transformed vert
flagsV1 equ $20 // Clip flags for vertex 1
flagsV2 equ $24 // Clip flags for vertex 2
// Lighting basic:
lbPostAo equ $2 // Address to return to after AO
lbTexgenOrRet equ $6 // ltbasic_texgen as negative if texgen, else vtx_return_from_lighting
curLight equ $9 // Current light pointer with offset
lbFakeAmb equ $16 // Pointer to ambient light or to 8 bytes of zeros if AO enabled
ambLight equ $21 // Ambient (top) light pointer with offset
lbAfter equ $25 // Address to return to after main lighting loop (vertex or extras)
// Lighting advanced:
// These names have no register assigned yet: each is defined as the
// placeholder token TODO.
laPtr equ TODO // Pointer to current vertex pair being lit
laSTKept equ TODO // Texture coords of vertex 1 kept through processing
laVtxLeft equ TODO // Count of vertices left * 0x10
laPacked equ TODO // Nonzero if packed normals enabled
laSpecular equ TODO // Sign bit set if specular enabled
laSpecFres equ TODO // Nonzero if doing ltadv_normal_to_vertex for specular or Fresnel
laL2A equ TODO // Nonzero if light-to-alpha (cel shading) enabled
laTexgen equ TODO // Nonzero if texgen enabled
// Clipping
clipVNext equ $2 // Next vertex (vertex at forward end of current edge)
clipVLastOfsc equ $3 // Last vertex / offscreen vertex
clipVOnsc equ $19 // Onscreen vertex
clipMaskIdx equ $6 // Clip mask index 4-0
clipMask equ $9 // Current clip mask (one bit)
clipFlags equ $16 // Current clipping flags being checked
clipPolyRead equ $17 // Read pointer within current polygon being clipped
clipPolySelect equ $18 // Clip poly double buffer selection
clipPolyWrite equ $21 // Write pointer within current polygon being clipped
// Vertex init
viLtFlag equ $9 // Holds pointLightFlagOrDirXfrmValid
// Misc
ovlInitClock equ $9 // Temp for profiling
postOvlRA equ $10 // Address to return to after overlay load
dmaLen equ $19 // DMA length in bytes minus 1
dmemAddr equ $20 // DMA address in DMEM or IMEM. Also = rdpCmdBufPtr - rdpCmdBufEndP1 for flush_rdp_buffer
cmd_w1_dram equ $24 // DL command word 1, which is also DMA DRAM addr
cmd_w0 equ $25 // DL command word 0, also holds next tris info
// Symbolic names for vector registers. $v0 and $v1 have two roles: they are
// vZero / vTRC in tri write and clipping, and the first two matrix rows
// (vMTX0I / vMTX1I) in vertex processing.
// Global vector regs:
vZero equ $v0 // All elements = 0; NOT global, only in tri write and clip. Mtx in vtx.
vTRC equ $v1 // Triangle Constants; NOT global, only in tri write and clip. Mtx in vtx.
vOne equ $v28 // All elements = 1; global
// $v29: permanent temp register, also write results here to discard
// $v30: vtx / lt = sSTO + persp norm + more lighting params
// $v31: Global constant vector register
// Vertex / lighting vector regs:
// Matrix rows: $v0-$v3 hold the integer parts, $v4-$v7 the fractional parts.
vMTX0I equ $v0 // Matrix rows int/frac; MVP normally, or M in ltadv
vMTX1I equ $v1
vMTX2I equ $v2
vMTX3I equ $v3
vMTX0F equ $v4
vMTX1F equ $v5
vMTX2F equ $v6
vMTX3F equ $v7
vTemp1 equ $v8 // Temporaries, used by lighting (along with some vp regs)
vTemp2 equ $v9
vKept1 equ $v10 // Kept across lighting
vKept2 equ $v11
vpMdl equ $v12 // Vertex pair model space position
vpClpF equ $v13 // Vertex pair clip space position frac
vpClpI equ $v14 // Vertex pair clip space position int
vpScrF equ $v15 // Vertex pair screen space position frac
vpScrI equ $v16 // Vertex pair screen space position int
vpST equ $v17 // Vertex pair ST texture coordinates
vpRGBA equ $v18 // Vertex pair color
vpLtTot equ $v19 // Vertex pair total light
vpNrmlX equ $v20 // Vertex pair normal X (elems 3, 7)
vpNrmlY equ $v21 // Vertex pair normal Y (elems 3, 7)
vpNrmlZ equ $v22 // Vertex pair normal Z (elems 3, 7)
vLTC equ $v23 // Lighting constants - first light dir, constants for packed normals
vPerm1 equ $v24 // Regs loaded in vtx_constants_for_clip and permanently kept through vtx/lt
vPerm2 equ $v25
vPerm3 equ $v26
vPerm4 equ $v27
// Role-based aliases for the vertex / lighting vector registers. The mapping
// depends on CFG_NO_OCCLUSION_PLANE. A role that does not exist in one
// configuration is mapped to $v29 and marked "Does not exist".
// Lighting temporaries. Lighting also modifies vpNrmlX:Y:Z, vpLtTot, vpRGBA, and
// in texgen vpST. Only the two regs in the comments below and vKept1 are kept.
.if CFG_NO_OCCLUSION_PLANE
// vpClpI:F are kept, vpMdl is free to use as temp
lDOT equ vpMdl // lighting DOT product
lCOL equ vKept2 // lighting total light COLor
.else
// vpMdl is kept, these are free to use as temps
lDOT equ vpClpF
lCOL equ vpClpI
.endif
lDTC equ vTemp1 // lighting DoT Clamped
lVCI equ vTemp2 // lighting Vertex Color In
lDIR equ vpRGBA // lighting transformed light DIRection
// Kept
.if CFG_NO_OCCLUSION_PLANE
sCLZ equ vKept1 // vtx_store Clamped Z. Does have to be kept even though in instan_lt_vs_45 b/c need rest of lt temps at start of texgen (and advanced lighting).
sOCS equ $v29 // Does not exist
.else
sOCS equ vKept1 // vtx_store Occlusion State
sCLZ equ vpClpF // Not a kept in this config
.endif
// Common vertex temporaries
sRTF equ vTemp1 // vtx_store Reciprocal Temp Frac
sRTI equ vTemp2 // vtx_store Reciprocal Temp Int
sFOG equ lCOL // lCOL -> sFOG in lt epilogue with NOC, else sFOG -> lCOL in lt prologue
// Misc temps used by both
.if CFG_NO_OCCLUSION_PLANE
s1WI equ vpNrmlX // vtx_store 1/W Int
s1WF equ vpLtTot // vtx_store 1/W Frac
sSCI equ sFOG // vtx_store Scaled Clipping Int
sSCF equ vpMdl // vtx_store Scaled Clipping Frac
sTCL equ sCLZ // vtx_store Temp CoLor
.else
s1WI equ vpMdl
s1WF equ vpNrmlX
sSCI equ vpScrI
sSCF equ vpScrF
sTCL equ vpLtTot
.endif
// Misc temps used by only one
.if CFG_NO_OCCLUSION_PLANE
sST2 equ vpScrI // vtx_store ST coordinates copy 2
sOTM equ $v29 // Does not exist
.else
sST2 equ $v29 // Does not exist
sOTM equ vpRGBA // vtx_store Occlusion Temporary
.endif
// Permanently kept through vertex/lighting
.if CFG_NO_OCCLUSION_PLANE
sVPS equ vPerm1 // vtx_store ViewPort Scale
sVPO equ vPerm2 // vtx_store ViewPort Offset
sFGM equ vPerm3 // vtx_store FoG Mask
sO03 equ $v29 // Does not exist
sO47 equ $v29
sOCM equ $v29
sOPM equ $v29
.else
// These are temps, not permanents, on this codepath
sVPS equ vpScrI // Temp, not permanent, on this codepath
sVPO equ vpScrF // Temp, not permanent, on this codepath
sFGM equ $v29 // Does not exist
sO03 equ vPerm1 // vtx_store Occlusion plane edge coefficients 0-3
sO47 equ vPerm2 // vtx_store Occlusion plane edge coefficients 4-7
sOCM equ vPerm3 // vtx_store Occlusion plane Mid coefficients
sOPM equ vKept2 // vtx_store Occlusion Plus Minus. Loaded in vtx_after_lt_setup not vtx_constants_for_clip b/c clobbered by lighting.
.endif
// vPerm4 has the same role in both configurations.
sSTS equ vPerm4
// ltadv (advanced lighting) vector regs.
// vMTX* ($v0:7) and vPerm1:4 ($v24:27) have to be kept.
vM0F equ $v8
vM0I equ $v9
vM1F equ $v10
vM1I equ $v11
vM2F equ $v12
vM2I equ $v13
vM3F equ $v14
vM3I equ $v15
aDWI equ $v16 // ltadv Delta World position Int part
aDWF equ $v17 // ltadv Delta World position Frac part
// Also uses vpRGBA equ $v18
// Also uses vpLtTot equ $v19
aL2I equ $v20 // ltadv Length 2quared Int part
aL2F equ $v21 // ltadv Length 2quared Frac part
aDIR equ $v22 // ltadv light DIRection
vpWNrm equ $v23 // Vertex Pair World space Normal
vpWrlI equ vMTX0F // sort of vpMdlI- Vertex Pair World position Int part
vpWrlF equ vMTX2F // sort of vpMdlF- Vertex Pair World position Frac part
aDOT equ aDWI // ltadv unclamped DOT product
aMDL equ aDWI // ltadv MoDeL position
aSCL equ aDWF // ltadv dot product SCaLe
aAOF equ aL2I // ltadv Ambient Occlusion Factor
aCLO equ aL2I // ltadv CoLor Out
aLTC equ aL2F // ltadv LighT Color
aALO equ aL2F // ltadv ALpha Out
vpMNrm equ vpWNrm // Vertex Pair Model space Normal
aLkDt0 equ vpLtTot // ltadv Lookat Dot product 0
aLkDt1 equ aDWI // ltadv Lookat Dot product 1; reg assignment TODO
// Temp storage after rdpCmdBufEndP1. There is 0xA8 of space here which will
// always be free during vtx load or clipping.
tempVpRGBA equ 0x00 // Only used during loop
tempXfrmLt equ tempVpRGBA // ltbasic only used during init
tempUnpackings equ tempVpRGBA // ltadv only during init
tempAmbient equ 0x10 // ltbasic set during init, used during loop
tempNormalScales equ tempAmbient // ltadv only during init
tempPrevInvalVtxStart equ 0x20
tempPrevInvalVtx equ (tempPrevInvalVtxStart + vtxSize) // 0x46; fog writes here
tempPrevInvalVtxEnd equ (tempPrevInvalVtx + vtxSize) // 0x6C; rest of vtx writes here
.if tempPrevInvalVtxEnd > (RDP_CMD_BUFSIZE_EXCESS - 8)
.error "Too much temp storage used!"
.endif
////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// IMEM //////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// RSP IMEM
.create CODE_FILE, 0x00001080
// Initialization routines
// Everything up until ovl01_end will get overwritten by ovl1
// Microcode entry point. Loads the constant vectors, sets up the RDP command
// buffer pointers, clears the yield/done signals, and then dispatches to one of
// three paths: initialize_rdp (rdpFifoPos == 0), continue_from_os_task (not
// resumed from a yield), or the resume-from-yield path that falls through here.
start: // This is at IMEM 0x1080, not the start of IMEM
vnop // Skipped when entered from S2DEX: its overlay 0 G_LOAD_UCODE jumps to start+4!
lqv $v31[0], (v31Value)($zero) // Actual start is here
vadd $v29, $v29, $v29 // Consume VCO (carry) value possibly set by the previous ucode
lqv vTRC, (vTRCValue)($zero) // vTRC always holds this value, except in vtx_store
li altBaseReg, altBase
li rdpCmdBufPtr, rdpCmdBuffer1
vclr vOne // vOne = 0 for now; made 1 by the vsub below
li rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word
lw $11, rdpFifoPos
lw $10, OSTask + OSTask_flags
li $1, SP_CLR_SIG2 | SP_CLR_SIG1 // Clear task done and yielded signals
vsub vOne, vOne, $v31[1] // 1 = 0 - -1
beqz $11, initialize_rdp // If RDP FIFO not set up yet, starting ucode from scratch
mtc0 $1, SP_STATUS // Delay slot: executed on both paths
andi $10, $10, OS_TASK_YIELDED // Resumed from yield or came from called ucode?
beqz $10, continue_from_os_task // If latter, load DL (task data) pointer from OSTask
// Otherwise continuing from yield; perf counters saved here at yield
// The first load below sits in the delay slot of the beqz above, so it also
// runs when the branch is taken; continue_from_os_task then reloads
// perfCounterA from its own location.
lw perfCounterA, yieldDataFooter + YDF_OFFSET_PERFCOUNTERA
lw perfCounterB, yieldDataFooter + YDF_OFFSET_PERFCOUNTERB
lw perfCounterC, yieldDataFooter + YDF_OFFSET_PERFCOUNTERC
lw perfCounterD, yieldDataFooter + YDF_OFFSET_PERFCOUNTERD
j finish_setup
lw taskDataPtr, yieldDataFooter + YDF_OFFSET_TASKDATAPTR // Delay slot: restore DL pointer saved at yield
// Reached from start when rdpFifoPos is zero. Decides whether the RDP can keep
// consuming the existing FIFO contents (@@continue_buffer) or whether a new,
// zero-size buffer must be set up at the end of the FIFO (@@start_new_buf).
// Then records the FIFO position and, if needed, the matrix stack pointer, and
// falls through to continue_from_os_task.
initialize_rdp:
mfc0 $11, DPC_STATUS // Read RDP status
andi $11, $11, DPC_STATUS_XBUS_DMA // Look at XBUS enabled bit
bnez $11, @@start_new_buf // If XBUS is enabled, start new buffer
mfc0 $2, DPC_END // Load RDP end pointer
lw $3, OSTask + OSTask_output_buff // Load start of FIFO
sub $11, $3, $2 // If start of FIFO > RDP end,
bgtz $11, @@start_new_buf // start new buffer
mfc0 $1, DPC_CURRENT // Load RDP current pointer
lw $3, OSTask + OSTask_output_buff_size // Load end of FIFO
beqz $1, @@start_new_buf // If RDP current pointer is 0, start new buffer
sub $11, $1, $3 // If RDP current > end of fifo,
bgez $11, @@start_new_buf // start new buffer
nop
// The delay slot of this bne is the mfc0 at @@start_new_buf; it only
// overwrites $11, which @@continue_buffer reloads before using.
bne $1, $2, @@continue_buffer // If RDP current != RDP end, keep current buffer
@@start_new_buf:
// There may be one buffer executing in the RDP, and another queued in the
// double-buffered start/end regs. Wait for the latter to be available
// (i.e. possibly one buffer executing, none waiting).
mfc0 $11, DPC_STATUS // Read RDP status
andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf
bnez $11, @@start_new_buf // Wait until double buffered start/end available
li $11, DPC_STATUS_CLR_XBUS // Bit to disable XBUS mode (delay slot; $11 is re-read at loop top if looping)
mtc0 $11, DPC_STATUS // Set bit, disable XBUS
lw $2, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr)
// Set up the next buffer for the RDP to be zero size and at the end of the FIFO.
mtc0 $2, DPC_START // Set RDP start addr to end of FIFO
mtc0 $2, DPC_END // Set RDP end addr to end of FIFO
@@continue_buffer:
// If we jumped here, the RDP is currently executing from the middle of the FIFO.
// So we can just append commands to there and move the end pointer.
// $2 is DPC_END if we branched here, or the FIFO end address if we fell through.
sw $2, rdpFifoPos // Set FIFO position to end of FIFO or RDP end
lw $11, matrixStackPtr // Current matrix stack pointer; zero means not yet initialized
bnez $11, continue_from_os_task // Already initialized: keep it
lw $11, OSTask + OSTask_dram_stack // Delay slot: load the initial stack pointer from OSTask
sw $11, matrixStackPtr // Only reached when it was not yet initialized
// Common tail of startup. continue_from_os_task reloads the perf counters and
// the display list pointer from OSTask, then calls fill_vertex_table;
// finish_setup (also the target of the resume-from-yield path) invalidates the
// MVP matrix, empties the input buffer and loads overlays 0/1 with
// displaylist_dma as the post-load return address.
continue_from_os_task:
// Counters stored here if jumped to different ucode
// If starting from scratch, these are zero
lw perfCounterA, mvpMatrix + YDF_OFFSET_PERFCOUNTERA
lw perfCounterB, mvpMatrix + YDF_OFFSET_PERFCOUNTERB
lw perfCounterC, mvpMatrix + YDF_OFFSET_PERFCOUNTERC
lw perfCounterD, mvpMatrix + YDF_OFFSET_PERFCOUNTERD
jal fill_vertex_table // $ra = finish_setup (the instruction after the delay slot)
lw taskDataPtr, OSTask + OSTask_data_ptr // Delay slot: DL pointer from OSTask
finish_setup:
.if CFG_PROFILING_C
mfc0 $11, DPC_CLOCK
sw $11, startCounterTime // Record the RDP clock at startup
.endif
sb $zero, mvpValid // Mark the MVP matrix as not valid
li inputBufferPos, 0 // No commands buffered yet
li cmd_w1_dram, orga(ovl1_start) // File offset of overlay 1, passed in cmd_w1_dram
j load_overlays_0_1
li postOvlRA, displaylist_dma // Delay slot: where to go after the overlay load
start_end:
.align 8
start_padded_end:
.orga max(orga(), max(ovl0_padded_end - ovl0_start, ovl1_padded_end - ovl1_start) - 0x80)
ovl01_end:
// Refill the display list input buffer from DRAM at taskDataPtr.
// displaylist_dma_with_count: entry which first takes the number of bytes of
//   commands to drop from the low byte of cmd_w0.
// displaylist_dma: entry with inputBufferPos already set (>= 0, multiple of 8).
// After the DMA is started, inputBufferPos is the negative offset from
// inputBufferEnd of the first loaded command, and taskDataPtr has been
// advanced by the number of bytes requested.
displaylist_dma_with_count:
andi inputBufferPos, cmd_w0, 0x00F8 // Byte 3, how many cmds to drop from load (max 0xA0)
displaylist_dma:
// Load INPUT_BUFFER_SIZE_BYTES - inputBufferPos cmds (inputBufferPos >= 0, mult of 8)
addi inputBufferPos, inputBufferPos, -INPUT_BUFFER_SIZE_BYTES // inputBufferPos = -(bytes of cmds to load)
.if CFG_PROFILING_A
sll $11, inputBufferPos, 16 - 3 // Divide by 8 for num cmds to load, then move to upper 16
sub perfCounterB, perfCounterB, $11 // Negative so subtract
.endif
nor dmaLen, inputBufferPos, $zero // DMA length = -inputBufferPos - 1 = ones' complement
move cmd_w1_dram, taskDataPtr // set up the DRAM address to read from
jal dma_read_write // initiate the DMA read
addi dmemAddr, inputBufferPos, inputBufferEnd // set the address to DMA read to
sub taskDataPtr, taskDataPtr, inputBufferPos // increment the DRAM address to read from next time (inputBufferPos is negative)
// Shared tail: wait for the outstanding DMA, then dispatch the next command.
wait_for_dma_and_run_next_command:
G_POPMTX_end:
G_MOVEMEM_end:
j while_wait_dma_busy // wait for the DMA read to finish
li $ra, run_next_DL_command // Delay slot: return address for while_wait_dma_busy
// Several small handlers laid out so that each one's branch delay slot is the
// first instruction of the next handler.
G_DMA_IO_handler:
G_MEMSET_handler:
j ovl234_clipmisc_entrypoint // Delay slot is harmless (it is the lb below)
load_cmds_handler:
lb $3, materialCullMode
// The delay slot of this bltz is the sw at G_RDP_handler: on the taken path the
// word is written to the buffer but rdpCmdBufPtr is not advanced.
bltz $3, run_next_DL_command // If cull mode is < 0, in mat second time, skip the load
// Append the 8-byte command in cmd_w0:cmd_w1_dram to the RDP command buffer.
G_RDP_handler:
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Add the second word of the command to the RDP command buffer
// G_SYNC enters here, so only the first word (cmd_w0) is stored for it.
G_SYNC_handler:
.if CFG_PROFILING_C
addi perfCounterC, perfCounterC, 0x4000 // Increment small RDP command count
.endif
sw cmd_w0, 0(rdpCmdBufPtr) // Add the command word to the RDP command buffer
addi rdpCmdBufPtr, rdpCmdBufPtr, 8 // Increment the next RDP command pointer by 2 words
// Main command dispatch loop.
// check_rdp_buffer_full_and_run_next_cmd: flush the DMEM RDP buffer if
//   rdpCmdBufPtr has reached rdpCmdBufEndP1, then dispatch.
// run_next_DL_command: fetch the next 8-byte command, refill the input buffer
//   if it is empty, yield if SP_STATUS_SIG0 is set, otherwise jump to the
//   handler found through cmdMiniTable.
// On entry to a handler: cmd_w0 / cmd_w1_dram hold the two command words,
// inputBufferPos already points past this command, $1 == 0, $7 = command byte
// (sign-extended by sra), $11 = handler address.
check_rdp_buffer_full_and_run_next_cmd:
sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1
bgezal dmemAddr, flush_rdp_buffer
// $1 on next instr survives flush_rdp_buffer
// (The mfc0 below is the delay slot of the bgezal; when the call is taken,
// $ra points at the lw after it, so the mfc0 is not executed again.)
.if !CFG_PROFILING_A
tris_end:
.endif
.if ENABLE_PROFILING
G_LIGHTTORDP_handler:
.endif
G_SPNOOP_handler:
run_next_DL_command:
mfc0 $1, SP_STATUS // load the status word into register $1
lw cmd_w0, (inputBufferEnd)(inputBufferPos) // load the command word into cmd_w0
beqz inputBufferPos, displaylist_dma // load more DL commands if none are left
andi $1, $1, SP_STATUS_SIG0 // check if the task should yield
sra $7, cmd_w0, 24 // extract DL command byte from command word (sign-extended)
lbu $11, (cmdMiniTable)($7) // Load mini table entry
bnez $1, load_overlay_0_and_enter // load and execute overlay 0 if yielding; $1 > 0
lw cmd_w1_dram, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1_dram
sll $11, $11, 2 // Table entry counts instructions; x4 gives the byte address used by jr below
.if CFG_PROFILING_C
mfc0 $10, DPC_STATUS
andi $10, $10, DPC_STATUS_GCLK_ALIVE // Sample whether GCLK is active now
sll $10, $10, 16 - 3 // move from bit 3 to bit 16
add perfCounterB, perfCounterB, $10 // Add to the perf counter
.endif
.if CFG_PROFILING_A
mfc0 $10, DPC_CLOCK
.endif
.if COUNTER_B_LOWER_CMD_COUNT
addi perfCounterB, perfCounterB, 1 // Count commands
.endif
.if CFG_PROFILING_A
move $4, perfCounterC // Save initial FIFO stall time
sw $10, startCounterTime
.endif
jr $11 // Jump to handler
addi inputBufferPos, inputBufferPos, 0x0008 // increment the DL index by 2 words
// $1 must remain zero
// $7 must retain the command byte for load_mtx and overlay 4 stuff
// $11 must contain the handler called for several handlers
// G_DL: call or branch to another display list.
// branch_dl is also entered from G_BRANCH_WZ with $2 < 0 (no push), the target
// in cmd_w1_dram and cmd_w0 = 0.
// call_ret_common: stores the DL stack length from $1, resets
// materialCullMode, and reloads the input buffer from the new taskDataPtr
// (the low byte of cmd_w0 gives the bytes to drop from the load).
G_DL_handler:
sll $2, cmd_w0, 15 // Shifts the push/nopush value to the sign bit
branch_dl:
lbu $1, displayListStackLength // Get the DL stack length
jal segmented_to_physical // Converts the address in cmd_w1_dram
add $3, taskDataPtr, inputBufferPos // Current DL pos to push on stack
bltz $2, call_ret_common // Nopush = branch = flag is set
move taskDataPtr, cmd_w1_dram // Set the new DL to the target display list (delay slot, both paths)
sw $3, (displayListStack)($1) // Push: save the return position
addi $1, $1, 4 // Increment the DL stack length
call_ret_common:
sb $zero, materialCullMode // This covers call, branch, return, and cull and branchZ successes
j displaylist_dma_with_count
sb $1, displayListStackLength // Delay slot: store the (possibly updated) stack length
// G_LIGHTTORDP: build an RDP command from a light's color. Only assembled when
// profiling is disabled; with ENABLE_PROFILING the label is placed at
// run_next_DL_command instead, making the command a no-op.
// Output command: word 0 = original second word, word 1 = light RGB in the
// upper three bytes and the command's byte 3 (alpha) in the low byte.
.if !ENABLE_PROFILING
G_LIGHTTORDP_handler:
lbu $11, numLightsxSize // Ambient light
lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size
andi $2, cmd_w0, 0x00FF // Byte 3 = alpha
sub $1, $11, $1 // Light address; byte 2 counts from end
lw $3, (lightBufferMain-1)($1) // Load light RGB into lower 3 bytes (word load starting one byte before the light)
move cmd_w0, cmd_w1_dram // Move second word to first (cmd byte, prim level)
sll $3, $3, 8 // Shift light RGB to upper 3 bytes and clear alpha byte
j G_RDP_handler // Send to RDP
or cmd_w1_dram, $3, $2 // Combine RGB and alpha in second word
.endif
// G_SETxIMG: convert the image address to physical and send the command to
// the RDP. While materialCullMode is 0 (normal), also compare the DRAM address
// of the current DL position with lastMatDLPhyAddr to detect the same
// material being run twice in a row, and update materialCullMode accordingly.
G_SETxIMG_handler:
lb $3, materialCullMode // Get current mode
jal segmented_to_physical // Convert image to physical address
lw $2, lastMatDLPhyAddr // Get last material physical addr
bnez $3, G_RDP_handler // If not in normal mode (0), exit
add $10, taskDataPtr, inputBufferPos // Current material physical addr
beq $10, $2, @@skip // Branch if we are executing the same mat again
sw $10, lastMatDLPhyAddr // Store material physical addr (delay slot, both paths)
li $7, 1 // > 0: in material first time
// On the taken path $7 is still the sign-extended command byte from
// run_next_DL_command (presumably negative for these commands).
@@skip: // Otherwise $7 was < 0: cull mode (in mat second time)
j G_RDP_handler
sb $7, materialCullMode
// G_BRANCH_W / G_BRANCH_Z: compare a vertex's W (or screen Z) against the
// value in the command's second word. If vertex value >= command value,
// continue with the current DL; otherwise branch (no push) to the DL whose
// address was previously stored in rdpHalf1Val.
G_BRANCH_WZ_handler:
lhu $10, (vertexTable)(cmd_w0) // Vertex addr from byte 3
.if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2
lh $10, VTX_W_INT($10) // read the w coordinate of the vertex (f3dzex)
.else
lw $10, VTX_SCR_Z($10) // read the screen z coordinate (int and frac) of the vertex (f3dex2)
.endif
sub $2, $10, cmd_w1_dram // subtract the w/z value being tested
bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL
lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to (delay slot, both paths)
j branch_dl // need $2 < 0 for nopush and cmd_w1_dram
li cmd_w0, 0 // No count of DL cmds to skip
// G_FLUSH: push whatever is in the DMEM RDP command buffer to the FIFO and
// wait for that transfer to complete before running the next command.
// Implemented as two passes through flush_rdp_buffer.
G_FLUSH_handler:
jal flush_rdp_buffer // Flush once to push partial DMEM buf to FIFO
sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1 // Prereq; offset buffer fullness
// If the DMEM buffer was empty, dmemAddr will be unchanged and valid for this next
// jump. Otherwise, running the DMA write will cause dmemAddr to get set to a large
// negative number. Then for this second jump, the same codepath will be triggered as
// if the buffer was empty. The result is it will wait for the DMA to finish, set
// DPC_END, and return to $ra. This is why the dmemAddr register (as opposed to,
// for example, dmaLen) is used as the DMEM buf fullness.
j flush_rdp_buffer
li $ra, run_next_DL_command // Delay slot: second pass returns to the dispatch loop
// G_LOAD_UCODE: handled in overlay 0. The delay slot of the jump is the first
// instruction of G_MODIFYVTX_handler, which only writes $10.
G_LOAD_UCODE_handler:
j load_overlay_0_and_enter // Delay slot is harmless
// G_MODIFYVTX: look up the vertex's DMEM address, then reuse the moveword
// path with the in-vertex offset as cmd_w0.
G_MODIFYVTX_handler:
lhu $10, (vertexTable)(cmd_w0) // Byte 3 = vtx being modified
j do_moveword // Moveword adds cmd_w0 to $10 for final addr
lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx, bit 15 clear
// G_VTX: DMA the input vertices from DRAM into DMEM so that they end at the
// address given by the vertex table entry, then continue at vtx_after_dma.
// inputBufferPos has already been advanced past this command, hence the
// negative offset from inputBufferEnd when re-reading command bytes.
G_VTX_handler:
lhu dmemAddr, (vertexTable)(cmd_w0) // (v0 + n) end address; up to 56 inclusive
jal segmented_to_physical // Convert address in cmd_w1_dram to physical
lhu vtxLeft, (inputBufferEnd - 0x07)(inputBufferPos) // vtxLeft = size in bytes = vtx count * 0x10
sub dmemAddr, dmemAddr, vtxLeft // Start addr = end addr - size. Rounded down to DMA word by H/W
addi dmaLen, vtxLeft, -1 // DMA length is always offset by -1
j dma_read_write
li $ra, vtx_after_dma // Delay slot: dma_read_write returns to vtx_after_dma
// G_TRISTRIP / G_TRIFAN: draw a run of triangles from the index bytes packed
// in one command. cmd_w0 is reused as a byte pointer that walks through the
// command in the input buffer; each pass loads a word at that pointer, whose
// low three bytes are the next triangle's indices, and hands it to tri_main
// with $ra set so that tri_main returns to tri_strip_fan_loop. The sign of $ra
// tells strip from fan.
G_TRIFAN_handler:
li $1, 0x8000 // $ra negative = flag for G_TRIFAN
G_TRISTRIP_handler:
addi $ra, $1, tri_strip_fan_loop // otherwise $1 == 0
addi cmd_w0, inputBufferPos, inputBufferEnd - 8 // Start pointing to cmd byte
tri_strip_fan_loop:
lw cmd_w1_dram, 0(cmd_w0) // Load tri indices to lower 3 bytes of word
addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command
beq $11, cmd_w0, tris_end // If off end of command, exit
sll $10, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit
bltz $10, tris_end // If negative, exit
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices
bltz $ra, tri_fan_store // Finish handling G_TRIFAN
addi cmd_w0, cmd_w0, 1 // Increment
// Strip only: swap the last two vertices on every other triangle.
andi $11, cmd_w0, 1 // If odd, this is the 1st/3rd/5th tri
bnez $11, tri_main // Draw as is
srl $10, cmd_w1_dram, 8 // Move vtx 2 to LSBs
sb cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2
j tri_main
sb $10, 7(rdpCmdBufPtr) // Store vtx 2 to spot for 3
// H = highest on screen = lowest Y value; then M = mid, L = low
tHAtF equ $v5
tMAtF equ $v7
tLAtF equ $v9
tHAtI equ $v18
tMAtI equ $v19
tLAtI equ $v21
tHPos equ $v14
tMPos equ $v2
tLPos equ $v10
tPosMmH equ $v6
tPosLmH equ $v8
tPosHmM equ $v11
// G_TRI2 / G_QUAD: draw the triangle in the second command word first via jal;
// tri_main's return address is then G_TRI1_handler, which draws the triangle in
// the first word and arranges to exit through tris_end.
// In both cases the index word is staged at offset 4 of the RDP command being
// built, where tri_main reads it back.
G_TRI2_handler:
G_QUAD_handler:
jal tri_main // Send second tri; return here for first tri
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store second tri indices
G_TRI1_handler:
li $ra, tris_end // After done with this tri, exit tri processing
sw cmd_w0, 4(rdpCmdBufPtr) // Store first tri indices
// tri_main: start of triangle processing for indices staged at
// 4(rdpCmdBufPtr). Bytes 5, 6, 7 there are the three vertex indices. Converts
// them to vertex DMEM addresses in $1, $2, $3 (scalar, via vertexTable) and in
// $v27 (vector, vertex buffer base + index * scale), then falls through to
// tri_noinit. $ra is the continuation after this triangle.
tri_main:
lpv $v27[0], 0(rdpCmdBufPtr) // To vector unit
lbu $1, 5(rdpCmdBufPtr) // Vertex 1 index
lbu $2, 6(rdpCmdBufPtr) // Vertex 2 index
lbu $3, 7(rdpCmdBufPtr) // Vertex 3 index
vclr vZero
lhu $1, (vertexTable)($1) // Index -> vertex address
vmudn $v29, vOne, vTRC_VB // Address of vertex buffer
lhu $2, (vertexTable)($2)
vmadl $v27, $v27, vTRC_VS // Plus vtx indices times length
lhu $3, (vertexTable)($3)
vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6
.if !ENABLE_PROFILING
addi perfCounterB, perfCounterB, 0x4000 // Increment number of tris requested
move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading
.endif
// tri_noinit: triangle setup proper. Expects $1, $2, $3 = vertex addresses and
// $v27 elems 5, 7 / $v4 elem 6 = the same addresses in vector form.
// Steps in this section:
//  1. Trivial reject if all three verts are outside the same screen/camera plane.
//  2. Sort the vertices by screen Y into tHPos (smallest Y, highest on screen),
//     tMPos and tLPos (largest Y), while computing the facing cross product.
//  3. Send to clipping if any active clip flag is set; cull by facing, zero
//     area, and (if configured) occlusion plane.
//  4. Load colors and 1/W, apply flat shading, start the reciprocals.
//  5. Optional alpha compare cull.
// In the compare comments below, VCC is the flag vmrg selects on. The >= / <
// readings assume VCO/VCE are clear when vlt/vge execute.
tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
vnxor tHAtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding
llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
vnxor tMAtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding
llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr
llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8
vnxor tLAtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding
lhu $16, VTX_CLIP($1) // $16, $7, $8 = clip flags of vertices 1, 2, 3
vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr
lhu $7, VTX_CLIP($2)
// vnop
lhu $8, VTX_CLIP($3)
vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1
andi $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr)
and $11, $11, $7
vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr)
and $11, $11, $8
vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr)
vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y); VCC = v1.y < v2.y
bnez $11, return_and_end_mat // Then the whole tri is offscreen, cull
// 22 cycles
vmrg tHPos, $v6, $v4 // tHPos = v1.y < v2.y ? v1 : v2 (the one of v1, v2 with the smaller Y)
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
lhu $24, activeClipPlanes
vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
lw $6, geometryModeLabel // Load full geometry mode word
vge $v2, $v2, $v4[1] // v2 = max(v1.y, v2.y); VCC = v1.y >= v2.y
or $10, $16, $7
vmrg tLPos, $v6, $v4 // tLPos = v1.y >= v2.y ? v1 : v2 (the one of v1, v2 with the larger Y)
or $10, $10, $8 // $10 = all clip bits which are true for any verts
vge $v6, $v13, $v8[1] // v6 = max(min(v1.y, v2.y), v3.y); VCC = min(v1.y, v2.y) >= v3.y
and $10, $10, $24 // If clipping is enabled, check clip flags
vmrg $v4, tHPos, $v8 // v4 = VCC ? tHPos : v3 = larger-Y one of (smaller of v1/v2, v3)
mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended
vmrg tHPos, $v8, tHPos // tHPos = VCC ? v3 : tHPos = smallest-Y vertex of all three
bnez $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
// 30 cycles
sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull
vlt $v29, $v6, $v2 // VCC = max(min(v1.y, v2.y), v3.y) < max(v1.y, v2.y)
srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor
sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
vmrg tMPos, $v4, tLPos // tMPos = VCC ? v4 : tLPos = middle-Y vertex
bltz $11, return_and_end_mat // Cull if bit is set (culled based on facing)
// 34 cycles
vmrg tLPos, tLPos, $v4 // tLPos = VCC ? tLPos : v4 = largest-Y vertex
tSubPxHF equ $v4
tSubPxHI equ $v26
vmudn tSubPxHF, tHPos, $v31[5] // 0x4000
beqz $9, return_and_end_mat // If cross product is 0, tri is degenerate (zero area), cull.
// 36 cycles
// From here on $1, $2, $3 are the sorted H, M, L vertex addresses (taken from
// elem 6 of the sorted position vectors), not the original vertices 1, 2, 3.
mfc2 $1, tHPos[12] // tHPos = lowest Y value = highest on screen (x, y, addr)
tPosCatI equ $v15 // 0 X L-M; 1 Y L-M; 2 X M-H; 3 X L-H; 4-7 garbage
tPosCatF equ $v25
vsub tPosMmH, tMPos, tHPos
mfc2 $2, tMPos[12] // tMPos = mid vertex (x, y, addr)
vsub tPosLmH, tLPos, tHPos
.if !ENABLE_PROFILING
sll $11, $6, 10 // Moves the value of G_SHADING_SMOOTH into the sign bit
.endif
vsub tPosHmM, tHPos, tMPos
andi $6, $6, (G_SHADE | G_ZBUFFER) // Keep only the geometry mode bits used below
vsub tPosCatI, tLPos, tMPos
mfc2 $3, tLPos[12] // tLPos = highest Y value = lowest on screen (x, y, addr)
vmov tPosCatI[2], tPosMmH[0]
.if !CFG_NO_OCCLUSION_PLANE
and $16, $16, $7
and $16, $16, $8
andi $16, $16, CLIP_OCCLUDED // Nonzero only if all three verts have CLIP_OCCLUDED set
.endif
tXPF equ $v16 // Triangle cross product
tXPI equ $v17
tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
tXPRcpI equ $v24
t1WI equ $v13 // elems 0, 4, 6
t1WF equ $v14
vmudh $v29, tPosMmH, tPosLmH[0]
.if !CFG_NO_OCCLUSION_PLANE
bnez $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
.endif
llv t1WI[0], VTX_INV_W_VEC($1)
vmadh $v29, tPosLmH, tPosHmM[0]
lpv tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of H vertex
vreadacc tXPI, ACC_UPPER
lpv tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of M vertex
vreadacc tXPF, ACC_MIDDLE
lpv tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of L vertex
vrcp $v20[0], tPosCatI[1]
.if !ENABLE_PROFILING
lpv $v25[0], VTX_COLOR_VEC($4) // Load RGB from vertex 4 (flat shading vtx)
.endif
vmov tPosCatI[3], tPosLmH[0]
llv t1WI[8], VTX_INV_W_VEC($2)
vrcph $v22[0], tXPI[1]
llv t1WI[12], VTX_INV_W_VEC($3)
vrcpl tXPRcpF[1], tXPF[1]
.if !ENABLE_PROFILING
bltz $11, tri_skip_flat_shading // Branch if G_SHADING_SMOOTH is set
.endif
vrcph tXPRcpI[1], $v31[2] // 0
.if !ENABLE_PROFILING
// Flat shading: replace RGB of all three verts with that of the original
// first vertex ($4), keeping each vertex's own alpha.
vlt $v29, $v31, $v31[3] // Set vcc to 11100000
vmrg tHAtI, $v25, tHAtI // RGB from $4, alpha from $1
vmrg tMAtI, $v25, tMAtI // RGB from $4, alpha from $2
vmrg tLAtI, $v25, tLAtI // RGB from $4, alpha from $3
tri_skip_flat_shading:
.endif
// 52 cycles
vrcp $v20[2], tPosMmH[1]
lb $20, (alphaCompareCullMode)($zero)
vrcph $v22[2], tPosMmH[1]
lw $16, VTX_INV_W_VEC($1) // $16, $7, $8 = 1/W for H, M, L
vrcp $v20[3], tPosLmH[1]
lw $7, VTX_INV_W_VEC($2)
vrcph $v22[3], tPosLmH[1]
lw $8, VTX_INV_W_VEC($3)
vmudl tHAtI, tHAtI, vTRC_0100 // vertex color 1 >>= 8
lbu $9, textureSettings1 + 3
vmudl tMAtI, tMAtI, vTRC_0100 // vertex color 2 >>= 8
// Branchless max: $11 = $16 - $7; $10 = sign mask of $11; the two remaining
// instructions (and, sub) are after tri_skip_alpha_compare_cull.
sub $11, $16, $7 // Four instr: $16 = max($16, $7)
vmudl tLAtI, tLAtI, vTRC_0100 // vertex color 3 >>= 8
sra $10, $11, 31
vmudl $v29, $v20, vTRC_0020
// no nop if tri_skip_flip_facing was unaligned
vmadm $v22, $v22, vTRC_0020
beqz $20, tri_skip_alpha_compare_cull // alphaCompareCullMode == 0: feature disabled
vmadn $v20, $v31, $v31[2] // 0
// Alpha compare culling
vge $v26, tHAtI, tMAtI // $v26 = max(H, M) per element
lbu $19, alphaCompareCullThresh
vlt $v27, tHAtI, tMAtI // $v27 = min(H, M) per element
bgtz $20, @@skip1
vge $v26, $v26, tLAtI // If alphaCompareCullMode > 0, $v26 = max of 3 verts
vlt $v26, $v27, tLAtI // else if < 0, $v26 = min of 3 verts
@@skip1: // $v26 elem 3 has max or min alpha value
mfc2 $24, $v26[6] // Index 6 here selects elem 3 (alpha), per the label comment above
sub $24, $24, $19 // sign bit set if (max/min) < thresh
xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull
bltz $24, return_and_end_mat // if max < thresh or if min >= thresh.
// Main body of the triangle write. Computes and stores, into the RDP command
// being built at rdpCmdBufPtr:
//  - the command id / flags bytes and the YL, YM, YH, XL, XH, XM and DxDy edge
//    coefficients (first 0x20 bytes);
//  - the perspective-normalized S, T, W and the RGBA / STW / Z attribute
//    deltas DaDx, DaDy, DaDe and base values.
// $2 = start of shade coefficients, $1 = start of texture coefficients,
// $10 = start of Z coefficients; rdpCmdBufPtr is advanced past whichever of
// these are enabled. Parts of the command buffer past the edge coefficients
// (offsets 0x10 and 0x20-0x37) are used as scratch memory along the way.
tri_skip_alpha_compare_cull:
// 63 cycles
vmudm tPosCatF, tPosCatI, vTRC_1000
// no nop if tri_skip_alpha_compare_cull was unaligned
vmadn tPosCatI, $v31, $v31[2] // 0
and $11, $11, $10 // Finish $16 = max($16, $7) started before the alpha cull
vsubc tSubPxHF, vZero, tSubPxHF // Negate frac part, leaving the borrow in VCO...
sub $16, $16, $11
vsub tSubPxHI, vZero, vZero // ...which this vsub consumes: tSubPxHI = 0 - 0 - borrow
sub $11, $16, $8 // Four instr: $16 = max($16, $8)
vmudm $v29, tPosCatF, $v20
sra $10, $11, 31
vmadl $v29, tPosCatI, $v20
and $11, $11, $10
vmadn $v20, tPosCatI, $v22
sub $16, $16, $11
vmadh tPosCatI, tPosCatF, $v22
sw $16, 0x0010(rdpCmdBufPtr) // Store max of three verts' 1/W to temp mem
vmudl $v29, tXPRcpF, tXPF
tMx1W equ $v27
llv tMx1W[0], 0x0010(rdpCmdBufPtr) // Load max of three verts' 1/W
vmadm $v29, tXPRcpI, tXPF
// NOTE(review): the mfc2 index appears to be a byte offset (cf. tHPos[12] used
// for elem 6), so bit 7 of $16, tested below as the left major flag, would be
// the sign bit of tXPI elem 1 -- confirm.
mfc2 $16, tXPI[1]
vmadn tXPF, tXPRcpF, tXPI
lbu $7, textureSettings1 + 2
vmadh tXPI, tXPRcpI, tXPI
lsv tMAtI[14], VTX_SCR_Z($2) // Z goes in elem 7 of the attribute vectors
vand $v22, $v20, vTRC_FFF8
lsv tLAtI[14], VTX_SCR_Z($3)
vcr tPosCatI, tPosCatI, vTRC_0100
lsv tMAtF[14], VTX_SCR_Z_FRAC($2)
vmudh $v29, vOne, $v31[4] // 4
lsv tLAtF[14], VTX_SCR_Z_FRAC($3)
vmadn tXPF, tXPF, $v31[0] // -4
ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
vmadh tXPI, tXPI, $v31[0] // -4
or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
vmudn $v29, $v3, tHPos[0]
sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
vmadl $v29, $v22, tSubPxHF[1]
ssv tLPos[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient
vmadm $v29, tPosCatI, tSubPxHF[1]
ssv tMPos[2], 0x0004(rdpCmdBufPtr) // Store YM edge coefficient
vmadn $v2, $v22, tSubPxHI[1]
ssv tHPos[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient
vmadh $v3, tPosCatI, tSubPxHI[1]
lw $19, otherMode1
tMnWI equ $v27
tMnWF equ $v10
vrcph $v29[0], tMx1W[0] // Reciprocal of max 1/W = min W
andi $10, $16, 0x0080 // Extract the left major flag from $16
vrcpl tMnWF[0], tMx1W[1]
or $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
vmudh t1WF, vOne, t1WI[1q]
sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
vrcph tMnWI[0], $v31[2] // 0
sb $zero, materialCullMode // This covers tri write out
tSTWHMI equ $v22 // H = elems 0-2, M = elems 4-6; init W = 7FFF
tSTWHMF equ $v25
vmudh tSTWHMI, vOne, $v31[7] // 0x7FFF
ssv tPosMmH[2], 0x0030(rdpCmdBufPtr) // MmHY -> first short (temp mem)
vmudm $v29, t1WI, tMnWF[0] // 1/W each vtx * min W = 1 for one of the verts, < 1 for others
llv tSTWHMI[0], VTX_TC_VEC($1)
vmadl $v29, t1WF, tMnWF[0]
ssv tPosLmH[0], 0x0032(rdpCmdBufPtr) // LmHX -> second short (temp mem)
vmadn t1WF, t1WF, tMnWI[0]
llv tSTWHMI[8], VTX_TC_VEC($2)
vmadh t1WI, t1WI, tMnWI[0]
ssv tPosHmM[0], 0x0034(rdpCmdBufPtr) // HmMX -> third short (temp mem)
tSTWLI equ $v10 // L = elems 4-6; init W = 7FFF
tSTWLF equ $v13
vmudh tSTWLI, vOne, $v31[7] // 0x7FFF
andi $19, $19, ZMODE_DEC // Mask to two Z mode bits
set_vcc_11110001 // select RGBA___Z or ____STW_
llv tSTWLI[8], VTX_TC_VEC($3)
vmudm $v29, tSTWHMI, t1WF[0h] // (S, T, 7FFF) * (1 or <1) for H and M
addi $19, $19, -ZMODE_DEC // Check if equal to decal mode ($19 == 0 if so)
vmadh tSTWHMI, tSTWHMI, t1WI[0h]
ldv tPosLmH[8], 0x0030(rdpCmdBufPtr) // MmHY -> e4, LmHX -> e5, HmMX -> e6
vmadn tSTWHMF, $v31, $v31[2] // 0
vmudm $v29, tSTWLI, t1WF[6] // (S, T, 7FFF) * (1 or <1) for L
vmadh tSTWLI, tSTWLI, t1WI[6]
vmadn tSTWLF, $v31, $v31[2] // 0
// The H vertex's S, T, W are in elems 0-2 of tSTWHM* but are needed in elems
// 4-6 of tHAt*; they are moved by a store/load round trip through temp mem.
sdv tSTWHMI[0], 0x0020(rdpCmdBufPtr) // Move S, T, W Hi Int to temp mem
vmrg tMAtI, tMAtI, tSTWHMI // Merge S, T, W Mid into elems 4-6
sdv tSTWHMF[0], 0x0028(rdpCmdBufPtr) // Move S, T, W Hi Frac to temp mem
vmrg tMAtF, tMAtF, tSTWHMF // Merge S, T, W Mid into elems 4-6
ldv tHAtI[8], 0x0020(rdpCmdBufPtr) // Move S, T, W Hi Int from temp mem
vmrg tLAtI, tLAtI, tSTWLI // Merge S, T, W Low into elems 4-6
ldv tHAtF[8], 0x0028(rdpCmdBufPtr) // Move S, T, W Hi Frac from temp mem
vmrg tLAtF, tLAtF, tSTWLF // Merge S, T, W Low into elems 4-6
.if !ENABLE_PROFILING
addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
.endif
// 106 cycles
vmudl $v29, tXPF, tXPRcpF
lsv tHAtF[14], VTX_SCR_Z_FRAC($1)
vmadm $v29, tXPI, tXPRcpF
lsv tHAtI[14], VTX_SCR_Z($1) // contains R, G, B, A, S, T, W, Z
vmadn tXPRcpF, tXPF, tXPRcpI
lh $1, VTX_SCR_VEC($2) // First halfword of the M vertex's screen vector (X, per tri_noinit's elems 0, 1 = x, y)
vmadh tXPRcpI, tXPI, tXPRcpI
addi $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients)
vmudh tPosLmH, tPosLmH, $v31[0h] // e1 LmHY * -4 = 4*HmLY; e456 MmHY,LmHX,HmMX *= 4
tAtLmHF equ $v10
tAtLmHI equ $v9
tAtMmHF equ $v13
tAtMmHI equ $v7
vsubc tAtLmHF, tLAtF, tHAtF
andi $3, $6, G_SHADE
vsub tAtLmHI, tLAtI, tHAtI
sll $1, $1, 14
vsubc tAtMmHF, tMAtF, tHAtF
sw $1, 0x0008(rdpCmdBufPtr) // Store XL edge coefficient
vsub tAtMmHI, tMAtI, tHAtI
ssv $v3[6], 0x0010(rdpCmdBufPtr) // Store XH edge coefficient (integer part); overwrites the 1/W temp
// DaDx = (v3 - v1) * factor + (v2 - v1) * factor
tDaDxF equ $v2
tDaDxI equ $v3
vmudn $v29, tAtLmHF, tPosLmH[4] // MmHY * 4
ssv $v2[6], 0x0012(rdpCmdBufPtr) // Store XH edge coefficient (fractional part)
vmadh $v29, tAtLmHI, tPosLmH[4] // MmHY * 4
ssv $v3[4], 0x0018(rdpCmdBufPtr) // Store XM edge coefficient (integer part)
vmadn $v29, tAtMmHF, tPosLmH[1] // LmHY * -4 = HmLY * 4
ssv $v2[4], 0x001A(rdpCmdBufPtr) // Store XM edge coefficient (fractional part)
vmadh $v29, tAtMmHI, tPosLmH[1] // LmHY * -4 = HmLY * 4
ssv tPosCatI[0], 0x000C(rdpCmdBufPtr) // Store DxLDy edge coefficient (integer part)
vreadacc tDaDxF, ACC_MIDDLE
ssv $v20[0], 0x000E(rdpCmdBufPtr) // Store DxLDy edge coefficient (fractional part)
vreadacc tDaDxI, ACC_UPPER
ssv tPosCatI[6], 0x0014(rdpCmdBufPtr) // Store DxHDy edge coefficient (integer part)
// DaDy = (v2 - v1) * factor + (v3 - v1) * factor
tDaDyF equ $v6
tDaDyI equ $v7
vmudn $v29, tAtMmHF, tPosLmH[5] // LmHX * 4
ssv $v20[6], 0x0016(rdpCmdBufPtr) // Store DxHDy edge coefficient (fractional part)
vmadh $v29, tAtMmHI, tPosLmH[5] // LmHX * 4
ssv tPosCatI[4], 0x001C(rdpCmdBufPtr) // Store DxMDy edge coefficient (integer part)
vmadn $v29, tAtLmHF, tPosLmH[6] // HmMX * 4
ssv $v20[4], 0x001E(rdpCmdBufPtr) // Store DxMDy edge coefficient (fractional part)
vmadh $v29, tAtLmHI, tPosLmH[6] // HmMX * 4
sll $11, $3, 4 // Shift (geometry mode & G_SHADE) by 4 to get 0x40 if G_SHADE is set
vreadacc tDaDyF, ACC_MIDDLE
add $1, $2, $11 // Increment the triangle pointer by 0x40 bytes (shade coefficients) if G_SHADE is set
vreadacc tDaDyI, ACC_UPPER
sll $11, $9, 5 // Shift texture enabled (which is 2 when on) by 5 to get 0x40 if textures are on
// DaDx, DaDy *= more factors
vmudl $v29, tDaDxF, tXPRcpF[1]
add rdpCmdBufPtr, $1, $11 // Increment the triangle pointer by 0x40 bytes (texture coefficients) if textures are on
vmadm $v29, tDaDxI, tXPRcpF[1]
andi $6, $6, G_ZBUFFER // Get the value of G_ZBUFFER from the current geometry mode
vmadn tDaDxF, tDaDxF, tXPRcpI[1]
sll $11, $6, 4 // Shift (geometry mode & G_ZBUFFER) by 4 to get 0x10 if G_ZBUFFER is set
vmadh tDaDxI, tDaDxI, tXPRcpI[1]
move $10, rdpCmdBufPtr // Write Z here
vmudl $v29, tDaDyF, tXPRcpF[1]
add rdpCmdBufPtr, rdpCmdBufPtr, $11 // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set
vmadm $v29, tDaDyI, tXPRcpF[1]
sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1 // Check if we need to write out to RDP
vmadn tDaDyF, tDaDyF, tXPRcpI[1]
sdv tDaDxF[0], 0x0018($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional)
vmadh tDaDyI, tDaDyI, tXPRcpI[1]
sdv tDaDxI[0], 0x0008($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (integer)
// DaDe = DaDy + DaDx * factor
// The sequence below starts with vmadl, not vmudl: it keeps accumulating on
// top of the DaDy product left in the accumulator by the vmadh above.
tDaDeF equ $v8
tDaDeI equ $v9
// 135 cycles
vmadl $v29, tDaDxF, $v20[3]
sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional)
vmadm $v29, tDaDxI, $v20[3]
sdv tDaDxI[8], 0x0008($1) // Store DsDx, DtDx, DwDx texture coefficients (integer)
vmadn tDaDeF, tDaDxF, tPosCatI[3]
sdv tDaDyF[0], 0x0038($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (fractional)
vmadh tDaDeI, tDaDxI, tPosCatI[3]
sdv tDaDyI[0], 0x0028($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (integer)
// Base value += DaDe * factor
// (the accumulator is first loaded with the H vertex attributes times one)
vmudn $v29, tHAtF, vOne[0]
sdv tDaDyF[8], 0x0038($1) // Store DsDy, DtDy, DwDy texture coefficients (fractional)
vmadh $v29, tHAtI, vOne[0]
sdv tDaDyI[8], 0x0028($1) // Store DsDy, DtDy, DwDy texture coefficients (integer)
vmadl $v29, tDaDeF, tSubPxHF[1]
sdv tDaDeF[0], 0x0030($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (fractional)
vmadm $v29, tDaDeI, tSubPxHF[1]
sdv tDaDeI[0], 0x0020($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (integer)
vmadn tHAtF, tDaDeF, tSubPxHI[1]
sdv tDaDeF[8], 0x0030($1) // Store DsDe, DtDe, DwDe texture coefficients (fractional)
vmadh tHAtI, tDaDeI, tSubPxHI[1]
sdv tDaDeI[8], 0x0020($1) // Store DsDe, DtDe, DwDe texture coefficients (integer)
// All values start in element 7. "a", attribute, is Z. Need
// tHAtI, tHAtF, tDaDxI, tDaDxF, tDaDeI, tDaDeF, tDaDyI, tDaDyF
// VCC is still 11110001
// 145 cycles
vmrg tDaDyI, tDaDyF, tDaDyI[7] // Elems 6-7: DzDyI:F
beqz $19, tri_decal_fix_z // $19 == 0 means Z mode is decal (see the ZMODE_DEC check above)
vmrg tDaDxI, tDaDxF, tDaDxI[7] // Elems 6-7: DzDxI:F
// Final stores of the triangle command: base RGBA at $2, base S, T, W at $1,
// and the four Z words (Z, DzDx, DzDe, DzDy) at $10. Each Z word is one
// integer:fraction pair packed into elems 6-7 by the vmrg instructions.
// If the DMEM command buffer is not yet full (dmemAddr < 0) this returns via
// return_and_end_mat; otherwise execution continues into the code that
// follows this block.
tri_return_from_decal_fix_z:
vmrg tDaDeI, tDaDeF, tDaDeI[7] // Elems 6-7: DzDeI:F
sdv tHAtF[0], 0x0010($2) // Store RGBA shade color (fractional)
vmrg $v10, tHAtF, tHAtI[7] // Elems 6-7: ZI:F
sdv tHAtI[0], 0x0000($2) // Store RGBA shade color (integer)
sdv tHAtF[8], 0x0010($1) // Store S, T, W texture coefficients (fractional)
sdv tHAtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer)
slv tDaDyI[12], 0x0C($10) // DzDyI:F
slv tDaDxI[12], 0x04($10) // DzDxI:F
slv tDaDeI[12], 0x08($10) // DzDeI:F
bltz dmemAddr, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
slv $v10[12], 0x00($10) // ZI:F (delay slot, executed on both paths)
// 156 cycles
// flush_rdp_buffer: push the current DMEM RDP command buffer out to the RDP
// FIFO in RDRAM and switch to the other DMEM command buffer.
//   1. Spin until no SP DMA is in flight.
//   2. Write the old rdpFifoPos to DPC_END so the RDP runs what was pushed last time.
//   3. If nothing is pending (dmaLen <= 0), leave through old_return_routine.
//   4. If the new data does not fit before the FIFO end, wait for the RDP's
//      double-buffered start slot, wrap to the start of the FIFO and set DPC_START.
//   5. Wait until DPC_CURRENT is not inside the region about to be overwritten.
//   6. Store the new rdpFifoPos, swap rdpCmdBufEndP1 / rdpCmdBufPtr to the other
//      buffer, and tail-jump to dma_read_write for the DMEM -> RDRAM copy.
// Writes $10, $11, cmd_w1_dram, dmaLen, dmemAddr, rdpCmdBufEndP1, rdpCmdBufPtr.
// $ra is not written here; the DMA-busy wait is inlined for that reason.
flush_rdp_buffer: // Prereq: dmemAddr = rdpCmdBufPtr - rdpCmdBufEndP1, or dmemAddr = large neg num -> only wait and set DPC_END
mfc0 $11, SP_DMA_BUSY // Check if any DMA is in flight
lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write
lw $10, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr)
.if CFG_PROFILING_C
// This is a wait for DMA busy loop, but written inline to avoid overwriting ra.
addi perfCounterD, perfCounterD, 7 // 6 instr + 1 taken branch
.endif
bnez $11, flush_rdp_buffer // Wait until no DMAs are active
addi dmaLen, dmemAddr, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy
blez dmaLen, old_return_routine // Exit if nothing to copy, or if dmemAddr is large negative num from last flush DMA write
mtc0 cmd_w1_dram, DPC_END // Set RDP to execute until FIFO end (buf pushed last time)
add $11, cmd_w1_dram, dmaLen // $11 = future FIFO pointer if we append this new buffer
sub $10, $10, $11 // $10 = FIFO end addr - future pointer
bgez $10, @@has_room // Branch if we can fit this
// The mfc0 below is also the delay slot of the bgez above; its result in $11
// is overwritten at @@has_room when the branch is taken.
@@await_rdp_dblbuf_avail:
mfc0 $11, DPC_STATUS // Read RDP status
andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf
bnez $11, @@await_rdp_dblbuf_avail // Wait until double buffered start/end available
.if COUNTER_C_FIFO_FULL
addi perfCounterC, perfCounterC, 7 // 4 instr + 2 after mfc + 1 taken branch
.endif
lw cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO
@@await_past_first_instr:
mfc0 $11, DPC_CURRENT // Load RDP current pointer
beq $11, cmd_w1_dram, @@await_past_first_instr // Wait until RDP moved past start
.if COUNTER_C_FIFO_FULL
addi perfCounterC, perfCounterC, 6 // 3 instr + 2 after mfc + 1 taken branch
.else
nop
.endif
// Start was previously the start of the FIFO, unless this is the first buffer,
// in which case it was the end of the FIFO. Normally, when the RDP gets to end, if we
// have a new end value waiting (END_VALID), it'll load end but leave current. By
// setting start here, it will also load current with start.
mtc0 cmd_w1_dram, DPC_START // Set RDP start to start of FIFO
@@keep_waiting:
.if COUNTER_C_FIFO_FULL
// This is here so we only count it when stalling below or on FIFO end codepath
addi perfCounterC, perfCounterC, 10 // 7 instr + 2 after mfc + 1 taken branch
.endif
@@has_room:
mfc0 $11, DPC_CURRENT // Load RDP current pointer
sub $11, $11, cmd_w1_dram // Current - current end (rdpFifoPos or start)
blez $11, @@copy_buffer // Current is behind or at current end, can do copy
sub $11, $11, dmaLen // If amount current is ahead of current end
blez $11, @@keep_waiting // is <= size of buffer to copy, keep waiting
// The add below is also the delay slot of the blez above; $11 is reloaded
// from DPC_CURRENT at @@has_room when that branch is taken.
@@copy_buffer:
add $11, cmd_w1_dram, dmaLen // New end is current end + buffer size
sw $11, rdpFifoPos
// Set up the DMA from DMEM to the RDP fifo in RDRAM
addi dmaLen, dmaLen, -1 // subtract 1 from the length
addi dmemAddr, rdpCmdBufEndP1, -(0x2000 | (RDP_CMD_BUFSIZE + 8)) // The 0x2000 is meaningless, negative means write
xori rdpCmdBufEndP1, rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word ^ rdpCmdBuffer2EndPlus1Word // Swap between the two RDP command buffers
j dma_read_write
addi rdpCmdBufPtr, rdpCmdBufEndP1, -(RDP_CMD_BUFSIZE + 8) // Delay slot: write pointer = start of the newly selected buffer
// tri_decal_fix_z: out-of-line piece of the triangle write, reached from the
// "beqz $19, tri_decal_fix_z" just before tri_return_from_decal_fix_z.
// Builds a negative bound in $v25 elem 7 from tHAtI, vTRC_DO and vTRC_DM, uses
// vcr to push DzDyI away from zero by that bound, then restores VCC (which vcr
// clobbers) and jumps back into the main triangle path.
// Writes $v25, $v29, tDaDyI and VCC.
tri_decal_fix_z:
// Valid range of tHAtI = 0 to 7FFF, but most of the scene is large values
vmudh $v29, vOne, vTRC_DO // accum all elems = -DM/2
vmadm $v25, tHAtI, vTRC_DM // elem 7 = (0 to DM/2-1) - DM/2 = -DM/2 to -1
vcr tDaDyI, tDaDyI, $v25[7] // Clamp DzDyI (6) to <= -val or >= val; clobbers DzDyF (7)
j tri_return_from_decal_fix_z
set_vcc_11110001 // Clobbered by vcr
// Common early-exit for triangle commands.
// tri_culled_by_occlusion_plane: in profiling configuration B, first adds 0x4000
// to perfCounterB, then falls through.
// return_and_end_mat: returns to $ra and, in the delay slot, clears
// materialCullMode.
tri_culled_by_occlusion_plane:
.if CFG_PROFILING_B
addi perfCounterB, perfCounterB, 0x4000
.endif
return_and_end_mat:
jr $ra
sb $zero, materialCullMode // This covers all tri early exits except clipping
// tri_fan_store: reads the byte 7 below inputBufferEnd (indexed by
// inputBufferPos, which the original comment identifies as vtx 1), writes it to
// byte 5 of the entry at rdpCmdBufPtr, and continues at tri_main.
// The store executes in the jump's delay slot. Writes $11.
tri_fan_store:
lb $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1
j tri_main
sb $11, 5(rdpCmdBufPtr) // Store vtx 1
.if (. & 4)
.warning "One instruction of padding before ovl234"
.endif
.align 8
// vtx_select_lighting: chooses between the basic and advanced lighting entry
// points of the overlay region that starts at ovl234_start.
//   - viLtFlag < 0 (point lights present)            -> ovl234_ltadv_entrypoint
//   - specular or Fresnel bits set in vGeomMid       -> ovl234_ltadv_entrypoint
//   - otherwise falls through into ovl234_start, i.e. the ltbasic entrypoint of
//     whichever overlay is currently loaded.
// ambLight is loaded from numLightsxSize in the delay slot of the second branch,
// so it is set on both the taken and fall-through paths of that branch.
// Writes $10, ambLight (and $11 / perfCounterA in profiling configuration B).
vtx_select_lighting:
.if CFG_PROFILING_B
srl $11, vtxLeft, 4 // Vertex count
add perfCounterA, perfCounterA, $11 // Add to number of lit vertices
.endif
bltz viLtFlag, ovl234_ltadv_entrypoint // Advanced lighting if have point lights
andi $10, vGeomMid, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
bnez $10, ovl234_ltadv_entrypoint // Advanced lighting if specular or Fresnel
lbu ambLight, numLightsxSize
// Fallthrough to ltbasic on whichever overlay is loaded
// Build-time check that the fall-through target lands on an 8-byte boundary.
.if (. & 4)
.error "vtx_select_lighting must be an even number of instructions"
.endif
// Start of overlay 3 (clipping and rare commands). Overlays 2, 3 and 4 share
// this address range, so the three entrypoints below sit at fixed offsets.
// In this (overlay 3) version the two lighting entrypoints only request a load
// of the right overlay: load_overlays_2_3_4 is entered with jal so that $ra
// identifies the entrypoint, and execution resumes at $ra-8 in the new code.
ovl234_start:
ovl3_start:
// Clipping overlay.
// Jump here for basic lighting setup. If overlay 3 is loaded (this code), loads overlay 2
// and jumps to right here, which is now in the new code.
ovl234_ltbasic_entrypoint_ovl3ver: // same IMEM address as ovl234_ltbasic_entrypoint
.if CFG_PROFILING_B
addi perfCounterC, perfCounterC, 1 // Count lighting overlay load
.endif
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
li cmd_w1_dram, orga(ovl2_start) // set up a load for overlay 2
// Jump here for advanced lighting. If overlay 3 is loaded (this code), loads
// overlay 4 and jumps to right here, which is now in the new code.
ovl234_ltadv_entrypoint_ovl3ver: // same IMEM address as ovl234_ltadv_entrypoint
.if CFG_PROFILING_B
addi perfCounterD, perfCounterD, 1 // Count overlay 4 load
.endif
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
li cmd_w1_dram, orga(ovl4_start) // set up a load for overlay 4
// Jump here for clipping and rare commands. If overlay 3 is loaded (this code), directly starts
// the clipping code.
// Dispatch: $1 != 0 -> clipping (via vtx_constants_for_clip, with inVtx negative);
// $1 == 0 and $7 == (0xFF00 | G_MEMSET) -> g_memset_ovl3; otherwise falls
// through to g_dma_io_ovl3.
ovl234_clipmisc_entrypoint:
sh $ra, tempTriRA // Tri return after clipping
.if CFG_PROFILING_B
nop // Needs to take up the space for the other perf counter
.endif
bnez $1, vtx_constants_for_clip // In clipping, $1 is vtx 1 addr, never 0. Cmd dispatch, $1 = 0.
li inVtx, 0x8000 // inVtx < 0 means from clipping. Inc'd each vtx write by 2 * inputVtxSize, but this is large enough it should stay negative.
li $3, (0xFF00 | G_MEMSET)
beq $3, $7, g_memset_ovl3
lw cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Overwritten by overlay load
// g_dma_io_ovl3: handler reached from ovl234_clipmisc_entrypoint when the
// command is neither clipping nor G_MEMSET. Translates the address in
// cmd_w1_dram with segmented_to_physical, derives the DMEM address / direction
// flag and the length from the command words, and tail-jumps to dma_read_write
// with $ra preset so that it returns to wait_for_dma_and_run_next_command.
g_dma_io_ovl3: // otherwise
jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one
lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
andi dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
// At this point, dmemAddr's highest bit is the flag, its next 13 bits are the DMEM address, and then its last two bits are the upper 2 of size
// So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit
sra dmemAddr, dmemAddr, 2
j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
li $ra, wait_for_dma_and_run_next_command // Setup the return address for running the next DL command
// clip_after_constants: clipping setup, reached from vtx_constants_for_clip
// when inVtx < 0.
//   - Zeroes the VTX_CLIP halfword of every temp vertex slot in clipTempVerts
//     (MAX_CLIP_GEN_VERTS slots, walked from last to first).
//   - Starts with clip condition index 4 and mask CLIP_CAMPLANE.
//   - Writes the three input vertex addresses ($1, $2, $3) as the initial
//     polygon in clipPoly, followed by a 0 terminator.
//   - Clears materialCullMode.
// Falls through into clip_condlooptop.
clip_after_constants:
.if CFG_PROFILING_B
addi perfCounterB, perfCounterB, 1 // Increment clipped (input) tris count
.endif
// Clear all temp vertex slots used.
li $11, (MAX_CLIP_GEN_VERTS - 1) * vtxSize
clip_init_used_loop:
sh $zero, (VTX_CLIP + clipTempVerts)($11)
bgtz $11, clip_init_used_loop
addi $11, $11, -vtxSize // Delay slot: step to the previous slot
li clipMaskIdx, 4 // 4=screen, 3=+x, 2=-x, 1=+y, 0=-y
li clipMask, CLIP_CAMPLANE // Initial clip mask for screen clipping
li clipPolySelect, 6 // Everything being indexed from 6 saves one instruction at the end of the loop
sh $1, (clipPoly - 6 + 0)(clipPolySelect) // Write the current three verts
sh $2, (clipPoly - 6 + 2)(clipPolySelect) // as the initial polygon
sh $3, (clipPoly - 6 + 4)(clipPolySelect) // Initial state $3 = clipVLastOfsc
sh $zero, (clipPoly)(clipPolySelect) // nullptr to mark end of polygon
sb $zero, materialCullMode // In case only/all tri(s) clip then offscreen
// Available locals here: $11, $1, $7, $20, $24, $10
// clip_condlooptop: top of the per-clip-condition loop. Swaps the read and
// write polygon buffers (clipPoly / clipPoly2) and masks the flags of the last
// vertex of the previous polygon to the current condition.
// clip_edgelooptop: walks the input polygon one edge at a time. An edge runs
// from clipVLastOfsc to clipVNext; if both ends have the same masked flag the
// edge is not subdivided (clip_nextedge), otherwise a free temp vertex slot is
// searched for and the code falls through to clip_skipswap23 with
// clipVOnsc = on-screen end and clipVLastOfsc = off-screen end.
clip_condlooptop:
lhu clipFlags, VTX_CLIP(clipVLastOfsc) // Load flags for final vertex of the last polygon
addi clipPolyRead, clipPolySelect, -6 // Start reading at the beginning of the old polygon
xori clipPolySelect, clipPolySelect, 6 ^ ((clipPoly2 - clipPoly) + 6) // Swap to the other polygon memory
addi clipPolyWrite, clipPolySelect, -6 // Start writing at the beginning of the new polygon
and clipFlags, clipFlags, clipMask // Mask last flags to current clip condition
clip_edgelooptop: // Loop over edges connecting verts, possibly subdivide the edge
lhu clipVNext, (clipPoly)(clipPolyRead) // Read next vertex (farther end of edge)
addi clipPolyRead, clipPolyRead, 0x0002 // Increment read pointer
beqz clipVNext, clip_nextcond // If next vtx is nullptr, done with input polygon
lhu $11, VTX_CLIP(clipVNext) // Load flags for next vtx
and $11, $11, clipMask // Mask next flags to current clip condition
beq $11, clipFlags, clip_nextedge // Both set or both clear = both off screen or both on screen, no subdivision
move clipFlags, $11 // clipFlags = masked next vtx's flags
// Going to subdivide this edge. Find available temp vertex slot.
// The search starts at clipTempVertsEnd and walks downward one vtxSize at a time.
li outVtxBase, clipTempVertsEnd
clip_find_unused_loop:
lhu $11, (VTX_CLIP - vtxSize)(outVtxBase)
addi $10, outVtxBase, -clipTempVerts // This is within the loop rather than before b/c delay after lhu
blez $10, clip_done // If can't find one (should never happen), give up
andi $11, $11, CLIP_VTX_USED
bnez $11, clip_find_unused_loop
addi outVtxBase, outVtxBase, -vtxSize // Delay slot: runs on exit too, so outVtxBase ends up at the free slot just tested
beqz clipFlags, clip_skipswap23 // Next vtx flag is clear / on screen,
move clipVOnsc, clipVNext // therefore last vtx is set / off screen
move clipVOnsc, clipVLastOfsc // Otherwise swap; note we are overwriting
move clipVLastOfsc, clipVNext // clipVLastOfsc but not clipVNext
// clip_skipswap23: computes the interpolation factor for the new vertex on the
// edge between clipVOnsc (on screen) and clipVLastOfsc (off screen).
//   1. cBaseI:F = signed distance of each endpoint to the current clip boundary
//      (see the table below); on-screen value in elem 3, off-screen in elem 7.
//   2. cDiffI:F = on-screen base - off-screen base.
//   3. cDiffI:F = cBase / cDiff via range reduction, reciprocal, and one
//      Newton-Raphson refinement.
//   4. The factor is clamped and left in cFadeOf.
// Along the way the new vertex pointer is appended to the output polygon, the
// CLIP_VTX_USED flag is cleared on the off-screen vertex, the attributes of
// both endpoints are loaded, and vtxLeft / outVtxBase / vLoopRet are set up for
// the shared vertex-store code. Exits by jumping to vtx_final_setup_for_clip,
// which comes back at clip_after_final_setup because inVtx < 0.
// Gives up (clip_done) if the output polygon would exceed MAX_CLIP_POLY_VERTS.
clip_skipswap23:
// Interpolate between clipVLastOfsc and clipVOnsc; create a new vertex which is on the
// clipping boundary (e.g. at the screen edge)
cPosOnOfF equ vpClpF
cPosOnOfI equ vpClpI
cPosOfF equ vpScrF
cPosOfI equ vpScrI
cRGBAOf equ vpLtTot
cRGBAOn equ vpRGBA
cSTOf equ vpST
cSTOn equ sSTS // Intentionally overwriting this kept reg. Vtx scales ST again, need to re-store unscaled value.
// Also uses sRTF, sRTI = vTemp1, vTemp2, and vtx_final_setup_for_clip sets sOPM = vKept2
cTemp equ vpMdl
cBaseF equ vpNrmlX
cBaseI equ vpNrmlY
cDiffF equ $v2
cDiffI equ $v3
cRRF equ $v4 // Range Reduction frac
cRRI equ $v5 // Range Reduction int
cFadeOf equ $v6
cFadeOn equ $v7
/*
Five clip conditions (these are in a different order from vanilla):
cBaseI/cBaseF[3] cDiffI/cDiffF[3]
4 W=0: W1 W1 - W2
3 +X : X1 - 2*W1 (X1 - 2*W1) - (X2 - 2*W2) <- the 2 is clip ratio
2 -X : X1 + 2*W1 (X1 + 2*W1) - (X2 + 2*W2)
1 +Y : Y1 - 2*W1 (Y1 - 2*W1) - (Y2 - 2*W2)
0 -Y : Y1 + 2*W1 (Y1 + 2*W1) - (Y2 + 2*W2)
*/
xori $11, clipMaskIdx, 1 // Invert sign of condition
ldv cPosOnOfF[0], VTX_FRAC_VEC(clipVOnsc)
ctc2 $11, $vcc // Conditions 1 (+y) or 3 (+x) -> vcc[0] = 0
ldv cPosOnOfI[0], VTX_INT_VEC (clipVOnsc)
vmrg cTemp, vOne, $v31[1] // elem 0 is 1 if W or neg cond, -1 if pos cond
andi $11, clipMaskIdx, 4 // W condition and screen clipping
ldv cPosOnOfF[8], VTX_FRAC_VEC(clipVLastOfsc) // Off screen to elems 4-7
bnez $11, clip_w // If so, use 1 or -1
ldv cPosOnOfI[8], VTX_INT_VEC (clipVLastOfsc)
vmudh cTemp, cTemp, $v31[3] // elem 0 is (1 or -1) * 2 (clip ratio)
andi $11, clipMaskIdx, 2 // Conditions 2 (-x) or 3 (+x)
vmudm cBaseF, vOne, cPosOnOfF[0h] // Set accumulator (care about 3, 7) to X
bnez $11, clip_skipy
vmadh cBaseI, vOne, cPosOnOfI[0h]
vmudm cBaseF, vOne, cPosOnOfF[1h] // Discard that and set accumulator 3, 7 to Y
vmadh cBaseI, vOne, cPosOnOfI[1h]
clip_skipy:
vmadn cBaseF, cPosOnOfF, cTemp[0] // + W * +/- 2
vmadh cBaseI, cPosOnOfI, cTemp[0]
clip_skipxy:
vsubc cDiffF, cBaseF, cBaseF[7] // Vtx on screen - vtx off screen
vsub cDiffI, cBaseI, cBaseI[7]
// This is computing cDiffI:F = cBaseI:F / cDiffI:F to high precision.
// The first step is a range reduction, where cRRF becomes a scale factor
// (roughly min(1.0f, abs(1.0f / cDiffI:F))) which scales down cDiffI:F (denominator)
// Then the reciprocal of cDiffI:F is computed with a Newton-Raphson iteration
// and multiplied by cBaseI:F. Finally scale down the result (numerator) by cRRF.
vor cTemp, cDiffI, vOne[0] // Round up int sum to odd; this ensures the value is not 0, otherwise vabs result will be 0 instead of +/- 2
sub $11, clipPolyWrite, clipPolySelect // Make sure we are not overflowing
vrcph cRRI[3], cDiffI[3]
addi $11, $11, 6 - ((MAX_CLIP_POLY_VERTS) * 2) // Write ptr to last zero slot
vrcpl cRRF[3], cDiffF[3] // 1 / (x+y+z+w), vtx on screen - vtx off screen
bgez $11, clip_done // If so, give up
vrcph cRRI[3], $v31[2] // 0; get int result of reciprocal
vabs cTemp, cTemp, $v31[3] // 2; cTemp = +/- 2 based on sum positive (incl. zero) or negative
lhu $11, VTX_CLIP(clipVLastOfsc) // Load clip flags for off screen vert
vmudn cRRF, cRRF, cTemp[3] // multiply reciprocal by +/- 2
sh outVtxBase, (clipPoly)(clipPolyWrite) // Write pointer to generated vertex to polygon
vmadh cRRI, cRRI, cTemp[3]
addi clipPolyWrite, clipPolyWrite, 2 // Increment write ptr
veq cRRI, cRRI, $v31[2] // 0; if RR int part is 0
andi $11, $11, ~CLIP_VTX_USED // Clear used flag from off screen vert
vmrg cRRF, cRRF, $v31[1] // keep RR frac, otherwise set frac to 0xFFFF (max)
sh $11, VTX_CLIP(clipVLastOfsc) // Store modified clip flags for off screen vert
vmudl $v29, cDiffF, cRRF[3] // Multiply clDiffI:F by RR frac*frac
ldv cPosOfF[0], VTX_FRAC_VEC (clipVLastOfsc) // Off screen loaded above, but need
vmadm cDiffI, cDiffI, cRRF[3] // int*frac, int out
ldv cPosOfI[0], VTX_INT_VEC (clipVLastOfsc) // it in elems 0-3 for interp
vmadn cDiffF, $v31, $v31[2] // 0; get frac out
luv cRGBAOf[0], VTX_COLOR_VEC(clipVLastOfsc)
vrcph sRTI[3], cDiffI[3] // Reciprocal of new scaled cDiff (discard)
luv cRGBAOn[0], VTX_COLOR_VEC(clipVOnsc)
vrcpl sRTF[3], cDiffF[3] // frac part
llv cSTOf[0], VTX_TC_VEC (clipVLastOfsc)
vrcph sRTI[3], $v31[2] // 0; int part
llv cSTOn[0], VTX_TC_VEC (clipVOnsc) // Must be before vtx_final_setup_for_clip
vmudl $v29, sRTF, cDiffF // D*R (see Newton-Raphson explanation)
.if CFG_NO_OCCLUSION_PLANE
li vtxLeft, -1 // vtxLeft < 0 triggers vtx_epilogue
.else
li vtxLeft, inputVtxSize // but trigger this on the second loop in this version
.endif
vmadm $v29, sRTI, cDiffF
.if CFG_NO_OCCLUSION_PLANE
addi outVtxBase, outVtxBase, -vtxSize // Inc'd by 2, must point to second vtx
.else
addi outVtxBase, outVtxBase, vtxSize // Not inc'd, must point to second vtx
.endif
vmadn cDiffF, sRTF, cDiffI
li vLoopRet, vtx_loop_no_lighting
vmadh cDiffI, sRTI, cDiffI
vmudh $v29, vOne, $v31[4] // 4; 4 - 4 * (D*R)
vmadn cDiffF, cDiffF, $v31[0] // -4
vmadh cDiffI, cDiffI, $v31[0] // -4
vmudl $v29, sRTF, cDiffF // 1/cDiff result = R * that
vmadm $v29, sRTI, cDiffF
vmadn sRTF, sRTF, cDiffI
vmadh sRTI, sRTI, cDiffI
vmudl $v29, cBaseF, sRTF // cDiff regs = cBase / cDiff
vmadm $v29, cBaseI, sRTF
vmadn cDiffF, cBaseF, sRTI
vmadh cDiffI, cBaseI, sRTI
vmudl $v29, cDiffF, cRRF[3] // Scale by range reduction
vmadm cDiffI, cDiffI, cRRF[3]
vmadn cDiffF, $v31, $v31[2] // Done cDiffI:F = cBaseI:F / cDiffI:F
// Clamp to 0x0001 to 0xFFFF range and create inverse on-screen factor
vlt cDiffI, cDiffI, vOne[0] // If integer part of factor less than 1,
vmrg cDiffF, cDiffF, $v31[1] // keep frac part of factor, else set to 0xFFFF (max val)
vsubc $v29, cDiffF, vOne[0] // frac part - 1 for carry
vge cDiffI, cDiffI, $v31[2] // 0; If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001),
j vtx_final_setup_for_clip // Clobbers vcc and accum in !NOC config.
vmrg cFadeOf, cDiffF, vOne[0] // keep frac part of factor, else set to 1 (min val)
// clip_after_final_setup: return point from vtx_final_setup_for_clip during
// clipping. Blends the on-screen and off-screen vertex attributes with the
// fade factors (cFadeOf for the off-screen vertex, cFadeOn = its complement for
// the on-screen vertex):
//   vpRGBA        <- colour blend
//   sSTS          <- texture coordinate blend
//   vpClpI:vpClpF <- position blend
// then jumps to vtx_store_for_clip to project and store the new vertex.
clip_after_final_setup: // This is here because otherwise 3 cycle stall here.
vmudn cFadeOn, cFadeOf, $v31[1] // signed x * -1 = 0xFFFF - unsigned x! Fade factor for on screen vert
// Fade between attributes for on screen and off screen vert
vmudm $v29, cRGBAOf, cFadeOf[3]
vmadm vpRGBA, cRGBAOn, cFadeOn[3]
vmudm $v29, cSTOf, cFadeOf[3]
vmadm sSTS, cSTOn, cFadeOn[3]
vmudl $v29, cPosOfF, cFadeOf[3]
vmadm $v29, cPosOfI, cFadeOf[3]
vmadl $v29, cPosOnOfF, cFadeOn[3]
vmadm vpClpI, cPosOnOfI, cFadeOn[3]
j vtx_store_for_clip
vmadn vpClpF, $v31, $v31[2] // 0; load resulting frac pos
// clip_after_vtx_store: return point from vtx_epilogue during clipping. Marks
// the generated vertex (outVtx1) as used and stores its interpolated ST and
// its clip flags.
// clip_nextedge: advances to the next edge. The end of this edge (clipVNext)
// becomes the start of the next one. If it was on screen (clipFlags == 0) it is
// also appended to the output polygon, unless that would overflow
// MAX_CLIP_POLY_VERTS, in which case clipping is abandoned via clip_done.
clip_after_vtx_store:
ori flagsV1, flagsV1, CLIP_VTX_USED // Mark generated vtx as used
slv sSTS[0], (VTX_TC_VEC )(outVtx1) // Store not-twice-scaled ST
sh flagsV1, (VTX_CLIP )(outVtx1) // Store generated vertex flags
clip_nextedge:
bnez clipFlags, clip_edgelooptop // Discard V2 if it was off screen (whether inserted vtx or not)
move clipVLastOfsc, clipVNext // Move what was the end of the edge to be the new start of the edge
sub $11, clipPolyWrite, clipPolySelect // Make sure we are not overflowing
addi $11, $11, 6 - ((MAX_CLIP_POLY_VERTS) * 2) // Write ptr to last zero slot
bgez $11, clip_done // If so, give up
sh clipVLastOfsc, (clipPoly)(clipPolyWrite) // Former V2 was on screen,
j clip_edgelooptop // so add it to the output polygon
addi clipPolyWrite, clipPolyWrite, 2 // Delay slot: advance write pointer past the vertex just stored
// clip_w: base-value computation for clip condition 4 (the W / camera plane
// case, selected when clipMaskIdx has bit 2 set). The base is the position
// vector itself, so cBaseI:F is a plain copy; rejoins the common path at
// clip_skipxy.
clip_w:
vcopy cBaseF, cPosOnOfF // Result is just W
j clip_skipxy
vcopy cBaseI, cPosOnOfI
// clip_nextcond: finished one clip condition for the whole polygon.
//   - If the output polygon has fewer than 3 vertices, quit (clip_done).
//   - Zero-terminate the output polygon and make its last vertex the edge start.
//   - If clipMaskIdx is 0 all conditions are done: go draw (clip_draw_tris).
//   - Otherwise build the next clipMask as 1 << clipCondShifts[clipMaskIdx - 1],
//     decrement clipMaskIdx and loop to clip_condlooptop.
clip_nextcond:
sub $11, clipPolyWrite, clipPolySelect // Are there fewer than 3 verts in the output polygon?
bltz $11, clip_done // If so, degenerate result, quit
sh $zero, (clipPoly)(clipPolyWrite) // Terminate the output polygon with a 0
lhu clipVLastOfsc, (clipPoly - 2)(clipPolyWrite) // Initialize edge start to the last vert
beqz clipMaskIdx, clip_draw_tris
lbu $11, (clipCondShifts - 1)(clipMaskIdx) // Load next clip condition shift amount
li clipMask, 1
sllv clipMask, clipMask, $11
j clip_condlooptop
addi clipMaskIdx, clipMaskIdx, -1 // Delay slot: move on to the next condition index
// clip_draw_tris: draws the clipped polygon as a series of triangles.
// activeClipPlanes is zeroed first (clip_done, which follows, restores it).
// Each iteration loads two consecutive vertices from the front of the polygon
// plus the final vertex, puts the three addresses in $1/$2/$3 and in vector
// register elements, and calls tri_noinit. clipPolySelect advances by one
// vertex per iteration until it reaches clipPolyWrite.
// Falls through into clip_done when finished.
clip_draw_tris:
sh $zero, activeClipPlanes
// Current polygon starts 6 (3 verts) below clipPolySelect, ends 2 (1 vert) below clipPolyWrite
// Draws verts in pattern like 0-1-4, 1-2-4, 2-3-4
clip_draw_tris_loop:
lhu $1, (clipPoly - 6)(clipPolySelect)
lhu $2, (clipPoly - 4)(clipPolySelect)
lhu $3, (clipPoly - 2)(clipPolyWrite)
mtc2 $1, $v27[10] // Addresses go in vector regs too
mtc2 $2, $v4[12]
jal tri_noinit
mtc2 $3, $v27[14]
bne clipPolyWrite, clipPolySelect, clip_draw_tris_loop
addi clipPolySelect, clipPolySelect, 2 // Delay slot: advance to the next fan position
// clip_done: common exit of the clipper (normal completion and all give-up
// paths). Sets activeClipPlanes to CLIP_SCAL_NPXY | CLIP_CAMPLANE, reloads the
// return address saved in tempTriRA at ovl234_clipmisc_entrypoint, and falls
// through into fill_vertex_table, which returns through that $ra.
clip_done:
li $11, CLIP_SCAL_NPXY | CLIP_CAMPLANE
sh $11, activeClipPlanes
lh $ra, tempTriRA
// fill_vertex_table: (re)builds vertexTable. Entry points: by label, or by
// fall-through from clip_done. Returns with jr $ra.
//   @@loop1 writes the bytes 7..0 into vertexTable[7..0] as seed indices.
//   @@loop2 turns those into halfword entries, eight per iteration, computed
//   from vTRC_VB plus index times vTRC_VS (see per-line comments), and stores
//   them with sqv until $2 reaches vertexTable + (G_MAX_VERTS + 8) * 2.
// Writes $1, $2, $3, $v3, $v4, $v29.
fill_vertex_table:
// Create bytes 00-07
li $1, 7
@@loop1:
sb $1, (vertexTable)($1)
bgtz $1, @@loop1
addi $1, $1, -1
// Load to vu and multiply by 2 to get vertex indexes. It would be more cycles
// to change the loop above to count by 2s than the stalls here.
li $2, vertexTable
lpv $v3[0], (0)($2)
li $3, vertexTable + ((G_MAX_VERTS + 8) * 2) // Need 0-56 inclusive, so do 0-63
vmudh $v3, $v3, $v31[3] // 2; now 0x0000, 0x0200, ..., 0x0E00
@@loop2:
vmudn $v29, vOne, vTRC_VB // Address of vertex buffer
vmadl $v4, $v3, vTRC_VS // Plus vtx indices times length
vadd $v3, $v3, vTRC_1000 // increment by 8 verts = 16
addi $2, $2, 0x10
bne $2, $3, @@loop2
sqv $v4[0], (-0x10)($2) // Delay slot: store the 8 entries just computed ($2 already advanced)
jr $ra
// Delay slot harmless
// g_memset_ovl3: G_MEMSET handler. Fills a region of RDRAM with a 16-bit value.
//   Inputs: rdpHalf1Val = fill value (lower halfword used), cmd_w0 low 24 bits
//   = byte count, cmd_w1_dram = segmented destination address.
//   1. Fill the DMEM memset buffer with the value, for min(count,
//      memsetBufferSize) bytes, working downward to memsetBufferStart.
//   2. Repeatedly DMA min(remaining, memsetBufferSize) bytes from the start of
//      that buffer to RDRAM, advancing cmd_w1_dram, until nothing remains.
//   3. Continue at wait_for_dma_and_run_next_command.
// Precondition (from the original comment): the count is a multiple of 16.
// NOTE(review): a count of 0 looks unhandled. @@pre_loop would start below
// memsetBufferStart and step away from its exit address. Confirm callers never
// pass 0.
g_memset_ovl3:
llv $v2[0], (rdpHalf1Val)($zero) // Load the memset value
sll cmd_w0, cmd_w0, 8 // Clear upper byte
jal segmented_to_physical
srl cmd_w0, cmd_w0, 8 // Number of bytes to memset (must be mult of 16)
li $3, memsetBufferStart + 0x10 // Last qword set is memsetBufferStart
jal @@clamp_to_memset_buffer
vmudh $v2, vOne, $v2[1] // Move element 1 (lower bytes) to all
addi $2, $2, memsetBufferStart // First qword set is one below end
@@pre_loop:
sqv $v2, (-0x10)($2)
bne $2, $3, @@pre_loop
addi $2, -0x10
@@transaction_loop:
jal @@clamp_to_memset_buffer
li dmemAddr, 0x8000 | memsetBufferStart // Always write from start of buffer
jal dma_read_write
addi dmaLen, $2, -1
sub cmd_w0, cmd_w0, $2
bgtz cmd_w0, @@transaction_loop
add cmd_w1_dram, cmd_w1_dram, $2
j wait_for_dma_and_run_next_command
// Delay slot harmless
// Local helper: branch-free minimum. Writes $2, $10, $11; returns via $ra.
@@clamp_to_memset_buffer:
addi $11, cmd_w0, -memsetBufferSize // $2 = min(cmd_w0, memsetBufferSize)
sra $10, $11, 31 // $10 = all 1s if cmd_w0 < memsetBufferSize, else 0
and $11, $11, $10 // Keep the (negative) difference only in that case
jr $ra
addi $2, $11, memsetBufferSize
ovl3_end:
.align 8
ovl3_padded_end:
.orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga())
ovl234_end:
// G_MTX_end: final step of a matrix command in multiply mode. Looks up the
// destination matrix address in movememTable (indexed by $1), waits for the
// DMA that loaded tempMatrix, then falls through into mtx_multiply with
//   $2 = $6 = destination matrix (also used as input 0), $3 = tempMatrix,
// and $ra = run_next_DL_command so that mtx_multiply returns to the command
// loop.
G_MTX_end: // TODO move to ovl3?
// Multiplies the temp loaded matrix into the M or VP matrix
lhu $6, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP.
li $3, tempMatrix // Input 1 = temp mem (loaded mtx)
jal while_wait_dma_busy
move $2, $6 // Input 0 = output
li $ra, run_next_DL_command
// mtx_multiply: multiplies two matrices held in DMEM and stores the result.
//   In:  $2 = input 0, $3 = input 1, $6 = output, $ra = return address.
//   Each matrix is read/written as four 16-byte quadwords: offsets 0x00 and
//   0x10 are paired with offsets 0x20 and 0x30 in the high and low multiply
//   steps respectively (i.e. integer parts first, then fractional parts).
//   The outer loop runs twice; each pass accumulates four inner-loop terms.
//   The result of the first pass is read out of the accumulator into $v6/$v7
//   at the top of the second pass; the second pass's result is in $v4/$v5.
//   Out: $6+0x00 = $v6, $6+0x10 = $v4, $6+0x20 = $v7, $6+0x30 = $v5.
//   On return $10 = (entry value of $3) + 0x18; a caller in this file relies
//   on that being nonzero.
// Writes $2, $3, $10, $11, $v2-$v7, $v29, vTemp1, vTemp2.
mtx_multiply:
// $3, $2 are input matrices; $6 is output matrix; $7 is 0 for return to vtx
// NOTE(review): $7 is not read anywhere in this routine and the return is a
// plain jr $ra. The "$7" remark above looks stale; confirm.
addi $10, $3, 0x0018
@@loop:
vmadn $v7, $v31, $v31[2] // 0
addi $11, $3, 0x0008
vmadh $v6, $v31, $v31[2] // 0
addi $2, $2, -0x0020
vmudh $v29, $v31, $v31[2] // 0
@@innerloop:
ldv $v3[0], 0x0040($2)
ldv $v3[8], 0x0040($2)
lqv vTemp2[0], 0x0020($3) // Input 1
ldv $v2[0], 0x0020($2)
ldv $v2[8], 0x0020($2)
lqv vTemp1[0], 0x0000($3) // Input 1
vmadl $v29, $v3, vTemp2[0h]
addi $3, $3, 0x0002
vmadm $v29, $v2, vTemp2[0h]
addi $2, $2, 0x0008 // Increment input 0 pointer
vmadn $v5, $v3, vTemp1[0h]
bne $3, $11, @@innerloop
vmadh $v4, $v2, vTemp1[0h]
bne $3, $10, @@loop
addi $3, $3, 0x0008
sqv $v7[0], (0x0020)($6)
sqv $v6[0], (0x0000)($6)
sqv $v4[0], (0x0010)($6)
jr $ra
sqv $v5[0], (0x0030)($6)
// vtx_after_dma: continues a vertex command once its DMA has been started.
// Derives the first output vertex index from cmd_w0, looks up its DMEM address
// in vertexTable (-> outVtxBase), and rounds the input address down to an
// 8-byte boundary (-> inVtx). Falls through into vtx_constants_for_clip.
vtx_after_dma:
srl $2, cmd_w0, 11 // n << 1
sub $2, cmd_w0, $2 // = v0 << 1
lhu outVtxBase, (vertexTable)($2) // Address of output start
andi inVtx, dmemAddr, 0xFFF8 // Round down input start addr to DMA word
.if COUNTER_A_UPPER_VERTEX_COUNT
sll $11, vtxLeft, 12 // Vtx count * 0x10000
add perfCounterA, perfCounterA, $11 // Add to vertex count
.endif
// vtx_constants_for_clip: loads the per-command constants used by the vertex
// loop. Entered by fall-through from vtx_after_dma (normal vertex load) or
// from ovl234_clipmisc_entrypoint (clipping, with inVtx < 0).
// Both build variants set up: vGeomMid, sSTS (ST scale), $v30 (ST offset, or 0
// when G_ATTROFFSET_ST_ENABLE is clear, plus perspNorm in elem 3), and
// $7 = mvpValid.
//   CFG_NO_OCCLUSION_PLANE: also builds sVPO / sVPS (viewport offset / scale
//     with fog offset / multiplier merged into elems 3 and 7) and sFGM.
//   otherwise: writes the fog offset / multiplier into the viewport data in
//     DMEM instead, and loads the occlusion plane coefficients sO03/sO47/sOCM.
// Exit: clip_after_constants if inVtx < 0, else falls through to
// vtx_after_setup_constants.
vtx_constants_for_clip:
// Sets up constants needed for vertex loop, including during clipping.
// Results fill vPerm1:4. Uses misc temps.
lhu vGeomMid, geometryModeLabel + 1 // Load middle 2 bytes of geom mode
.if CFG_NO_OCCLUSION_PLANE
llv sFOG[0], (fogFactor - altBase)(altBaseReg) // Load fog multiplier 0 and offset 1
ldv sVPO[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7
veq $v29, $v31, $v31[3h] // VCC = 00010001
ldv sVPO[8], (viewport + 8)($zero)
llv sSTS[0], (textureSettings2 - altBase)(altBaseReg) // Texture ST scale in 0, 1
vmrg sFGM, vOne, $v31[2] // sFGM is 0,0,0,1,0,0,0,1
ldv sVPS[0], (viewport)($zero) // Load vscale duplicated in 0-3 and 4-7
vne $v29, $v31, $v31[3h] // VCC = 11101110
ldv sVPS[8], (viewport)($zero)
lb $11, geometryModeLabel + 3 // G_ATTROFFSET_ST_ENABLE in sign bit
vmrg sVPO, sVPO, sFOG[1] // Put fog offset in elements 3,7 of vtrans
llv $v30[0], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 0, 1
vmov sSTS[4], sSTS[0]
llv $v30[8], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 4, 5
vmrg sVPS, sVPS, sFOG[0] // Put fog multiplier in elements 3,7 of vscale
bltz $11, @@keepoffset
lbu $7, mvpValid
vclr $v30
@@keepoffset:
.else
lb flagsV1, geometryModeLabel + 3 // G_ATTROFFSET_ST_ENABLE in sign bit
lw $11, (fogFactor)($zero) // Load fog multiplier MSBs and offset LSBs
llv sSTS[0], (textureSettings2)($zero) // Texture ST scale in 0, 1
llv $v30[0], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 0, 1
llv $v30[8], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 4, 5
bltz flagsV1, @@keepoffset
srl $10, $11, 16 // Fog multiplier to lower bits
vclr $v30
@@keepoffset:
sh $11, (viewport + 0xE)($zero) // Store fog offset over vtrans W
vmov sSTS[4], sSTS[0]
sh $10, (viewport + 0x6)($zero) // Store fog multiplier over vscale W
lbu $7, mvpValid
ldv sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3
ldv sO03[8], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // and for vtx 2
ldv sO47[0], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // Load coeffs 4-7
ldv sO47[8], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // and for vtx 2
ldv sOCM[0], (occlusionPlaneMidCoeffs - altBase)(altBaseReg) // Load mid coeffs
ldv sOCM[8], (occlusionPlaneMidCoeffs - altBase)(altBaseReg) // and for vtx 2
.endif
vmov sSTS[5], sSTS[1]
bltz inVtx, clip_after_constants // inVtx < 0 means from clipping
lsv $v30[6], (perspNorm - altBase)(altBaseReg) // Perspective norm elem 3
// vtx_after_setup_constants: makes sure the combined MVP matrix is current and
// picks the lighting path.
//   - If $7 (mvpValid, loaded in vtx_constants_for_clip) is zero, recomputes
//     mvpMatrix from vpMatrix and mMatrix with mtx_multiply and marks it valid.
//   - viLtFlag is loaded from pointLightFlagOrDirXfrmValid in a delay slot, so
//     it is set on both paths.
//   - If G_LIGHTING is set in vGeomMid, goes to vtx_select_lighting; otherwise
//     falls through to vtx_setup_no_lighting.
//   - materialCullMode is cleared either way (delay slot of the last branch).
vtx_after_setup_constants:
bnez $7, @@skip_recalc_mvp
lb viLtFlag, pointLightFlagOrDirXfrmValid
li $2, vpMatrix
li $3, mMatrix
jal mtx_multiply
li $6, mvpMatrix
sb $10, mvpValid // $10 is nonzero from mtx_multiply, in fact 0x18
@@skip_recalc_mvp:
andi $11, vGeomMid, G_LIGHTING >> 8
bnez $11, vtx_select_lighting
sb $zero, materialCullMode // Vtx ends material. Must be before lighting for clever packedNormalsMaskConstant reuse
// vtx_setup_no_lighting: selects vtx_loop_no_lighting as the loop body.
// vtx_after_lt_setup:    selects mvpMatrix as the matrix to load.
// vtx_load_mtx:          loads the 4x4 matrix at $11 into vMTX0..3 (I and F).
//   Each of the four quadwords is loaded with lqv, copied, and then patched
//   with ldv so that every vMTXn register holds one 8-byte half of a quadword
//   duplicated in elements 0-3 and 4-7 (vMTX0/2 = first half, vMTX1/3 = second
//   half), one copy for each of the two vertices processed per loop iteration.
//   If $11 == 0 the call came from advanced lighting and control goes to
//   ltadv_after_mtx; otherwise falls through to vtx_final_setup_for_clip.
vtx_setup_no_lighting:
li vLoopRet, vtx_loop_no_lighting
vtx_after_lt_setup:
li $11, mvpMatrix
vtx_load_mtx:
lqv vMTX0I, (0x00)($11) // Load MVP matrix
lqv vMTX2I, (0x10)($11)
lqv vMTX0F, (0x20)($11)
lqv vMTX2F, (0x30)($11)
// nop TODO
vcopy vMTX1I, vMTX0I
vcopy vMTX3I, vMTX2I
ldv vMTX1I[0], (0x08)($11)
vcopy vMTX1F, vMTX0F
ldv vMTX3I[0], (0x18)($11)
vcopy vMTX3F, vMTX2F
ldv vMTX1F[0], (0x28)($11)
ldv vMTX3F[0], (0x38)($11)
ldv vMTX0I[8], (0x00)($11)
ldv vMTX2I[8], (0x10)($11)
ldv vMTX0F[8], (0x20)($11)
beqz $11, ltadv_after_mtx // $11 = 0 = mMatrix if from ltadv
ldv vMTX2F[8], (0x30)($11)
// vtx_final_setup_for_clip: last setup before entering the vertex loop; also
// used by the clipper (jumped to from clip_skipswap23).
//   - fogFlag = 8 if G_FOG is set in vGeomMid, else 0.
//   - outVtx1 / outVtx2 initially point at tempPrevInvalVtx so that the
//     loop's first "previous vertex" stores land in scratch space.
//   - With the occlusion plane enabled, builds sOPM (this clobbers VCC and the
//     accumulator).
//   - If inVtx < 0 (clipping) returns to clip_after_final_setup.
//   - Otherwise waits for the vertex DMA, preloads the first vertex pair's
//     inputs and jumps to vtx_store_loop_entry.
vtx_final_setup_for_clip:
.if !CFG_NO_OCCLUSION_PLANE
vge $v29, $v31, $v31[2h] // VCC = 00110011
.endif
andi fogFlag, vGeomMid, G_FOG >> 8 // Can't put before lt b/c fogFlag = mtx valid flag.
.if !CFG_NO_OCCLUSION_PLANE
vmrg sOPM, vOne, $v31[1] // Signs of sOPM are --++--++
.endif
srl fogFlag, fogFlag, 5 // 8 if G_FOG is set, 0 otherwise
addi outVtx1, rdpCmdBufEndP1, tempPrevInvalVtx // Write prev loop vtx garbage here
.if !CFG_NO_OCCLUSION_PLANE
addi outVtx2, rdpCmdBufEndP1, tempPrevInvalVtx // Write prev loop vtx garbage here
.endif
bltz inVtx, clip_after_final_setup // inVtx < 0 means from clipping
// The next instruction is the delay slot of the bltz above in both variants.
.if CFG_NO_OCCLUSION_PLANE
addi outVtx2, rdpCmdBufEndP1, tempPrevInvalVtx // Write prev loop vtx garbage here
.else
vmudh sOPM, sOPM, $v31[5] // sOPM is 0xC000, 0xC000, 0x4000, 0x4000, repeat
.endif
jal while_wait_dma_busy // Wait for vertex load to finish
addi outVtxBase, outVtxBase, -vtxSize // Will inc by 2, but need point to 2nd
.if CFG_NO_OCCLUSION_PLANE // With occlusion plane, vpMdl loaded at vtx_store_loop_entry
ldv vpMdl[0], (VTX_IN_OB + 0 * inputVtxSize)(inVtx) // 1st vec pos
ldv vpMdl[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // 2nd vec pos
.endif
llv sTCL[8], (VTX_IN_CN + 0 * inputVtxSize)(inVtx) // RGBA in 4:5
llv sTCL[12], (VTX_IN_CN + 1 * inputVtxSize)(inVtx) // RGBA in 6:7
llv vpST[0], (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1
j vtx_store_loop_entry
llv vpST[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5
.if (. & 4)
.warning "One instruction of padding before vertex loop"
.endif
.align 8
.if CFG_NO_OCCLUSION_PLANE
// vtx_loop_no_lighting (CFG_NO_OCCLUSION_PLANE version): loop body used when
// lighting is off. Reached through "jr vLoopRet" in vtx_store_loop_entry, which
// has already accumulated the vMTX3, vMTX0 and vMTX1F terms; this finishes the
// position transform into vpClpI:vpClpF. In parallel it completes the PREVIOUS
// pair's first-vertex clip flags (using $10 from the previous pass), reloads
// the colours saved in tempVpRGBA, and clamps the previous pair's screen Z and
// fog values. Falls through into vtx_store_for_clip.
// Comments of the form "a <- b" appear to mark where a vector register's
// alias changes from b to a. TODO confirm against the register allocation table.
vtx_loop_no_lighting:
// lCOL <- sSCI
// lDTC <- sRTF
// lVCI <- sRTI
// vpLtTot <- s1WF
// vpNrmlX <- s1WI
vmadh $v29, vMTX1I, vpMdl[1h]
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
vmadn vpClpF, vMTX2F, vpMdl[2h]
or flagsV1, flagsV1, $10 // Combine results for first vertex
vmadh vpClpI, vMTX2I, vpMdl[2h]
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
// lDOT <- vpMdl
// sFOG <- lCOL
vge sFOG, vpScrI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
luv vpRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
// sCLZ <- sTCL
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2
// vtx_store_for_clip (CFG_NO_OCCLUSION_PLANE version), also the return point
// from lighting and texgen. Main body of the software-pipelined vertex loop;
// processes two vertices per pass (elements 0-3 = vertex 1, 4-7 = vertex 2).
// In: vpClpI:vpClpF = clip-space positions, vpRGBA = colours, vpST = texcoords.
// Work done for the CURRENT pair:
//   - store clip-space position (int and frac), colour, scaled ST
//   - compute 1/W with a reciprocal plus Newton-Raphson style refinement and
//     store it (VTX_INV_W_INT / VTX_INV_W_FRAC)
//   - screen and scaled clip tests (vch/vcl) -> flagsV1, flagsV2, $10, $11
//   - perspective divide and viewport transform -> vpScrI:vpScrF
// Work done for the PREVIOUS pair (through the old outVtx1 / outVtx2):
//   - store screen X/Y, Z and fog
// Work done for the NEXT pair:
//   - load position (vpMdl) and ST/RGBA (sTCL), advance inVtx
// Falls through into vtx_store_loop_entry.
vtx_return_from_lighting:
vtx_return_from_texgen:
vtx_store_for_clip:
vmudl $v29, vpClpF, $v30[3] // Persp norm
sub $11, outVtx2, fogFlag // Points 8 before outVtx2 if fog, else 0
// s1WI <- vpNrmlX
vmadm s1WI, vpClpI, $v30[3] // Persp norm
addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx
// s1WF <- vpLtTot
vmadn s1WF, $v31, $v31[2] // 0
sbv sFOG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
vmov vpScrF[1], sCLZ[2]
sbv sFOG[7], (VTX_COLOR_A + 8 - vtxSize)($11) // ...which gets overwritten below
// sSCF <- lDOT
vmudn sSCF, vpClpF, $v31[3] // W * clip ratio for scaled clipping
ssv sCLZ[12], (VTX_SCR_Z )(outVtx2)
// sSCI <- sFOG
vmadh sSCI, vpClpI, $v31[3] // W * clip ratio for scaled clipping
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
vrcph $v29[0], s1WI[3]
slv vpScrI[0], (VTX_SCR_VEC )(outVtx1)
// sRTF <- lDTC
vrcpl sRTF[2], s1WF[3]
ssv vpScrF[12], (VTX_SCR_Z_FRAC )(outVtx2)
// sRTI <- lVCI
vrcph sRTI[3], s1WI[7]
slv vpScrF[2], (VTX_SCR_Z )(outVtx1)
vrcpl sRTF[6], s1WF[7]
sra $11, vtxLeft, 31 // All 1s if on single-vertex last iter
vrcph sRTI[7], $v31[2] // 0
andi $11, $11, vtxSize // vtxSize if on single-vertex last iter, else normally 0
vch $v29, vpClpI, vpClpI[3h] // Clip screen high
sub outVtx2, outVtxBase, $11 // First output vtx on last iter, else second
vcl $v29, vpClpF, vpClpF[3h] // Clip screen low
addi outVtx1, outVtxBase, -vtxSize // First output vtx always
vmudl $v29, s1WF, sRTF[2h]
cfc2 flagsV1, $vcc // Screen clip results
vmadm $v29, s1WI, sRTF[2h]
sdv vpClpF[8], (VTX_FRAC_VEC )(outVtx2)
vmadn s1WF, s1WF, sRTI[3h]
// sTCL <- sCLZ
ldv sTCL[0], (VTX_IN_TC + 2 * inputVtxSize)(inVtx) // ST in 0:1, RGBA in 2:3
vmadh s1WI, s1WI, sRTI[3h]
sdv vpClpF[0], (VTX_FRAC_VEC )(outVtx1)
vch $v29, vpClpI, sSCI[3h] // Clip scaled high
lsv vpClpF[14], (VTX_Z_FRAC )(outVtx2) // load Z into W slot, will be for fog below
vmudh $v29, vOne, $v31[4] // 4
sdv vpClpI[8], (VTX_INT_VEC )(outVtx2)
vmadn s1WF, s1WF, $v31[0] // -4
lsv vpClpF[6], (VTX_Z_FRAC )(outVtx1) // load Z into W slot, will be for fog below
vmadh s1WI, s1WI, $v31[0] // -4
sdv vpClpI[0], (VTX_INT_VEC )(outVtx1)
vmudm $v29, vpST, sSTS // Scale ST
ldv sTCL[8], (VTX_IN_TC + 3 * inputVtxSize)(inVtx) // ST in 4:5, RGBA in 6:7
// sST2 <- vpScrI
vmadh sST2, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5
suv vpRGBA[4], (VTX_COLOR_VEC )(outVtx2) // Store RGBA for second vtx
vmudl $v29, s1WF, sRTF[2h]
lsv vpClpI[14], (VTX_Z_INT )(outVtx2) // load Z into W slot, will be for fog below
vmadm $v29, s1WI, sRTF[2h]
suv vpRGBA[0], (VTX_COLOR_VEC )(outVtx1) // Store RGBA for first vtx
vmadn s1WF, s1WF, sRTI[3h]
lsv vpClpI[6], (VTX_Z_INT )(outVtx1) // load Z into W slot, will be for fog below
vmadh s1WI, s1WI, sRTI[3h]
srl flagsV2, flagsV1, 4 // Shift second vertex screen clipping to first slots
vcl $v29, vpClpF, sSCF[3h] // Clip scaled low
andi flagsV2, flagsV2, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
vcopy vpST, sTCL
cfc2 $11, $vcc // Scaled clip results
vmudl $v29, vpClpF, s1WF[3h] // Pos times inv W
ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2)
vmadm $v29, vpClpI, s1WF[3h] // Pos times inv W
// vpMdl <- sSCF
ldv vpMdl[0], (VTX_IN_OB + 2 * inputVtxSize)(inVtx) // Pos of 1st vector for next iteration
vmadn vpClpF, vpClpF, s1WI[3h]
ldv vpMdl[8], (VTX_IN_OB + 3 * inputVtxSize)(inVtx) // Pos of 2nd vector on next iteration
vmadh vpClpI, vpClpI, s1WI[3h] // vpClpI:vpClpF = pos times inv W
addi inVtx, inVtx, (2 * inputVtxSize) // Advance two positions forward in the input vertices
vmov sTCL[4], vpST[2] // First vtx RG to elem 4
andi flagsV1, flagsV1, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
vmov sTCL[5], vpST[3] // First vtx BA to elem 5
sll $10, $11, 4 // Shift first vertex scaled clipping to second slots
vmudl $v29, vpClpF, $v30[3] // Persp norm
ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1)
vmadm vpClpI, vpClpI, $v30[3] // Persp norm
ssv s1WI[14], (VTX_INV_W_INT )(outVtx2)
vmadn vpClpF, $v31, $v31[2] // 0; Now vpClpI:vpClpF = projected position
ssv s1WI[6], (VTX_INV_W_INT )(outVtx1)
// vnop // TODO maybe can rotate the loop so this is the jr land slot?
slv sST2[8], (VTX_TC_VEC )(outVtx2) // Store scaled S, T vertex 2
vmudh $v29, sVPO, vOne // offset * 1
slv sST2[0], (VTX_TC_VEC )(outVtx1) // Store scaled S, T vertex 1
vmadh $v29, sFGM, $v31[6] // + (0,0,0,1,0,0,0,1) * 0x7F00
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
vmadn vpScrF, vpClpF, sVPS // + pos frac * scale
or flagsV2, flagsV2, $11 // Combine results for second vertex
// vpScrI <- sST2
vmadh vpScrI, vpClpI, sVPS // int part, vpScrI:vpScrF is now screen space pos
sh flagsV2, (VTX_CLIP )(outVtx2) // Store second vertex clip flags
// Loop entry label for the CFG_NO_OCCLUSION_PLANE build of the vertex pair
// loop. Starts the multiply-accumulate of the next pair's position (vpMdl)
// with the vMTX* registers, saves the pair's RGBA to tempVpRGBA, and then
// either leaves via vtx_epilogue (vtxLeft <= 0) or jumps to the address in
// vLoopRet.
vtx_store_loop_entry:
vmudn $v29, vMTX3F, vOne
blez vtxLeft, vtx_epilogue
vmadh $v29, vMTX3I, vOne // Branch delay slot of the blez above
vmadn $v29, vMTX0F, vpMdl[0h]
sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order
vmadh $v29, vMTX0I, vpMdl[0h]
jr vLoopRet
vmadn $v29, vMTX1F, vpMdl[1h] // Delay slot of the jr above
// Loop exit for the CFG_NO_OCCLUSION_PLANE build. Finishes the stores for the
// last vertex pair: clamps fog and Z, merges the scaled clip bits into
// flagsV1, optionally writes fog into the vertices' alpha byte, and stores the
// screen-space position. If inVtx < 0 control goes to clip_after_vtx_store,
// otherwise the first vertex's clip flags are stored and execution falls
// through to vtx_end.
vtx_epilogue:
vge sFOG, vpScrI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
or flagsV1, flagsV1, $10 // Combine results for first vertex
beqz fogFlag, @@skip_fog
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2) // Delay slot: stored regardless of fogFlag
sbv sFOG[15], (VTX_COLOR_A )(outVtx2) // Fog enabled: replace vertex 2 alpha with fog
sbv sFOG[7], (VTX_COLOR_A )(outVtx1) // Fog enabled: replace vertex 1 alpha with fog
@@skip_fog:
vmov vpScrF[1], sCLZ[2]
ssv sCLZ[12], (VTX_SCR_Z )(outVtx2)
slv vpScrI[0], (VTX_SCR_VEC )(outVtx1)
ssv vpScrF[12], (VTX_SCR_Z_FRAC )(outVtx2)
bltz inVtx, clip_after_vtx_store // inVtx < 0 means from clipping
slv vpScrF[2], (VTX_SCR_Z )(outVtx1) // Delay slot: stored on both paths
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags (only reached when not from clipping)
// Fallthrough (across the versions boundary) to vtx_end
.else // not CFG_NO_OCCLUSION_PLANE
// 70 cycles, 16 more than NOC
// 6 vu cycles for plane, 8 vu cycles for edges, 0 more vnops than NOC,
// 1 branch delay slot with SU instr, 1 land-after-branch.
// Occlusion-plane build: start of the per-pair work on the path without
// lighting (going by the label name). Merges the two halves of the occlusion
// edge results into sOCS, writes vertex 1's fog byte, packs the clamped Z into
// vpScrF, and reloads the pair's RGBA that was saved to tempVpRGBA. Falls
// through into vtx_return_from_texgen.
// The "a <- b" lines below record vector register aliases: name a now refers
// to the register previously used under name b.
vtx_loop_no_lighting:
// lDTC <- sVPS
// lVCI <- sRTI
// vpLtTot <- sTCL
// vpNrmlX <- s1WF
// lDIR <- sOTM
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
sub $11, outVtx1, fogFlag // Points 8 before outVtx1 if fog, else 0
vmrg sOCS, sOCS, sOTM // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
sbv sFOG[7], (VTX_COLOR_A + 8)($11) // ...which gets overwritten below
vmrg vpScrF, vpScrF, sCLZ[2h] // Z int elem 2, 6 to elem 1, 5; Z frac in elem 2, 6
// vpRGBA <- lDIR
luv vpRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
// lDOT <- sCLZ
// lCOL <- sFOG
// Re-entry point after texture coordinate generation (per the label name; the
// jump to here is outside this view). Scales and offsets the pair's ST
// coordinates, stores vertex 2's screen XY, and advances outVtxBase by two
// vertices. Falls through into vtx_return_from_lighting.
vtx_return_from_texgen:
vmudm $v29, vpST, sSTS // Scale ST
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
vmadh vpST, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5
addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx
// Re-entry point after lighting (lbAfter is set to this label in
// ltbasic_setup_after_xfrm when no extra lighting features are on). Finishes
// the occlusion test for the previous pair and, interleaved with it, runs the
// matrix * position multiply-accumulate for the next pair, leaving clip-space
// position in vpClpI:vpClpF. Sets CLIP_OCCLUDED in flagsV1 when all of vertex
// 1's tests passed, and leaves $11 holding CLIP_OCCLUDED or 0 for vertex 2.
vtx_return_from_lighting:
vge $v29, sOCS, sO47 // Each compare to coeffs 4-7
slv vpScrI[0], (VTX_SCR_VEC )(outVtx1)
vmudn $v29, vMTX3F, vOne
cfc2 $11, $vcc
vmadh $v29, vMTX3I, vOne
slv vpScrF[10], (VTX_SCR_Z )(outVtx2)
vmadn $v29, vMTX0F, vpMdl[0h]
addi inVtx, inVtx, (2 * inputVtxSize) // Advance two positions forward in the input vertices
vmadh $v29, vMTX0I, vpMdl[0h]
slv vpScrF[2], (VTX_SCR_Z )(outVtx1)
vmadn $v29, vMTX1F, vpMdl[1h]
or $11, $11, $10 // Combine occlusion results. Any set in 0-3, 4-7 = not occluded
vmadh $v29, vMTX1I, vpMdl[1h]
andi $10, $11, 0x000F // Bits 0-3 for vtx 1
// vpClpF <- lDOT
vmadn vpClpF, vMTX2F, vpMdl[2h]
addi $11, $11, -(0x0010) // If not occluded, atl 1 of 4-7 set, so $11 >= 0x10. Else $11 < 0x10.
// vpClpI <- lCOL
vmadh vpClpI, vMTX2I, vpMdl[2h]
bnez $10, @@skipv1 // If nonzero, at least one equation false, don't set occluded flag
andi $11, $11, CLIP_OCCLUDED // This is bit 11, = sign bit b/c |$11| <= 0xFF (delay slot, always runs)
ori flagsV1, flagsV1, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag
@@skipv1:
// 16 cycles
// Main body of the occlusion-plane vertex pair loop. The label name suggests
// clipping code also enters here directly; that caller is outside this view.
// In order, and heavily interleaved between the scalar and vector units:
//  - applies persp norm to the clip-space position to get W in s1WI:s1WF and
//    stores the previous pair's clip flags;
//  - exits to vtx_epilogue if vtxLeft <= 0;
//  - computes 1/W for both vertices (vrcph/vrcpl followed by a multiply /
//    "4 - x" / multiply sequence -- presumably a Newton-style refinement,
//    TODO confirm);
//  - derives screen and scaled clip flags from vch/vcl via $vcc;
//  - stores clip-space position, RGBA, ST and 1/W to the output vertices;
//  - projects to screen space with the viewport scale/offset;
//  - starts the occlusion plane tests for this pair.
// The "a <- b" lines record vector register aliases: name a now refers to the
// register previously used under name b.
vtx_store_for_clip:
vmudl $v29, vpClpF, $v30[3] // Persp norm
or flagsV2, flagsV2, $11 // occluded = $11 negative = sign bit set = $11 is flag, else 0
// s1WI <- vpMdl
vmadm s1WI, vpClpI, $v30[3] // Persp norm
sh flagsV2, (VTX_CLIP )(outVtx2) // Store second vertex clip flags
// s1WF <- vpNrmlX
vmadn s1WF, $v31, $v31[2] // 0
blez vtxLeft, vtx_epilogue
vmudn $v29, vpClpF, sOCM // X * kx, Y * ky, Z * kz; sits in the blez delay slot
vmadh $v29, vpClpI, sOCM // Int * int
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
vrcph $v29[0], s1WI[3]
addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2
// sRTF <- lDTC
vrcpl sRTF[2], s1WF[3]
sra $11, vtxLeft, 31 // All 1s if on single-vertex last iter
// sRTI <- lVCI
vrcph sRTI[3], s1WI[7]
andi $11, $11, vtxSize // vtxSize if on single-vertex last iter, else normally 0
vrcpl sRTF[6], s1WF[7]
sub outVtx2, outVtxBase, $11 // First output vtx on last iter, else second
vrcph sRTI[7], $v31[2] // 0
addi outVtx1, outVtxBase, -vtxSize // First output vtx always
vreadacc sOCS, ACC_UPPER // Load int * int portion
suv vpRGBA[4], (VTX_COLOR_VEC )(outVtx2) // Store RGBA for second vtx
vch $v29, vpClpI, vpClpI[3h] // Clip screen high
suv vpRGBA[0], (VTX_COLOR_VEC )(outVtx1) // Store RGBA for first vtx
vmudl $v29, s1WF, sRTF[2h]
sdv vpClpI[8], (VTX_INT_VEC )(outVtx2)
vmadm $v29, s1WI, sRTF[2h]
sdv vpClpI[0], (VTX_INT_VEC )(outVtx1)
vmadn s1WF, s1WF, sRTI[3h]
sdv vpClpF[8], (VTX_FRAC_VEC )(outVtx2)
vmadh s1WI, s1WI, sRTI[3h]
sdv vpClpF[0], (VTX_FRAC_VEC )(outVtx1)
vcl $v29, vpClpF, vpClpF[3h] // Clip screen low
sqv vpClpI, (tempVpRGBA)(rdpCmdBufEndP1) // For Z to W manip. RGBA not currently stored here
vmudh $v29, vOne, $v31[4] // 4
cfc2 flagsV1, $vcc // Screen clip results
vmadn s1WF, s1WF, $v31[0] // -4
ssv vpClpI[4], (tempVpRGBA + 6)(rdpCmdBufEndP1) // First Z to W
vmadh s1WI, s1WI, $v31[0] // -4
// sTCL <- vpLtTot
ldv sTCL[0], (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1, RGBA in 2:3
// sSCF <- vpScrF
vmudn sSCF, vpClpF, $v31[3] // W * clip ratio for scaled clipping
ssv vpClpI[12], (tempVpRGBA + 14)(rdpCmdBufEndP1) // Second Z to W
// sSCI <- vpScrI
vmadh sSCI, vpClpI, $v31[3] // W * clip ratio for scaled clipping
lsv vpClpF[14], (VTX_Z_FRAC )(outVtx2) // load Z into W slot, will be for fog below
vmudl $v29, s1WF, sRTF[2h]
lqv vpClpI, (tempVpRGBA)(rdpCmdBufEndP1) // Load int part with Z in W
vmadm $v29, s1WI, sRTF[2h]
lsv vpClpF[6], (VTX_Z_FRAC )(outVtx1) // load Z into W slot, will be for fog below
vmadn s1WF, s1WF, sRTI[3h]
ldv sTCL[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5, RGBA in 6:7
vmadh s1WI, s1WI, sRTI[3h]
srl flagsV2, flagsV1, 4 // Shift second vertex screen clipping to first slots
vch $v29, vpClpI, sSCI[3h] // Clip scaled high
andi flagsV2, flagsV2, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
vcl $v29, vpClpF, sSCF[3h] // Clip scaled low
slv vpST[8], (VTX_TC_VEC )(outVtx2) // Store scaled S, T vertex 2
vmudl $v29, vpClpF, s1WF[3h] // Pos times inv W
cfc2 $11, $vcc // Scaled clip results
vmadm $v29, vpClpI, s1WF[3h] // Pos times inv W
slv vpST[0], (VTX_TC_VEC )(outVtx1) // Store scaled S, T vertex 1
vmadn vpClpF, vpClpF, s1WI[3h]
// sVPO <- sSCF
ldv sVPO[0], (viewport + 8)($zero) // Load viewport offset incl. fog for first vertex
vmadh vpClpI, vpClpI, s1WI[3h] // vpClpI:vpClpF = pos times inv W
ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2)
// sOTM <- vpRGBA
vadd sOTM, sOCS, sOCS[1h] // Add Y to X
ldv sVPO[8], (viewport + 8)($zero) // Load viewport offset incl. fog for second vertex
vcopy vpST, sTCL
ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1)
vmudl $v29, vpClpF, $v30[3] // Persp norm
// sVPS <- sSCI
ldv sVPS[0], (viewport)($zero) // Load viewport scale incl. fog for first vertex
vmadm vpClpI, vpClpI, $v30[3] // Persp norm
ssv s1WI[14], (VTX_INV_W_INT )(outVtx2)
vmadn vpClpF, $v31, $v31[2] // 0; Now vpClpI:vpClpF = projected position
ldv sVPS[8], (viewport)($zero) // Load viewport scale incl. fog for second vertex
vadd sOCS, sOTM, sOCS[2h] // Add Z to X
ssv s1WI[6], (VTX_INV_W_INT )(outVtx1)
vmov sTCL[4], vpST[2] // First vtx RG to elem 4
andi flagsV1, flagsV1, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
vmudh $v29, sVPO, vOne // offset * 1
sll $10, $11, 4 // Shift first vertex scaled clipping to second slots
// vpScrF <- sVPO
vmadn vpScrF, vpClpF, sVPS // + pos frac * scale
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
// vpScrI <- sVPS
vmadh vpScrI, vpClpI, sVPS // int part, vpScrI:vpScrF is now screen space pos
or flagsV2, flagsV2, $11 // Combine results for second vertex
// sFOG <- vpClpI
vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
vlt $v29, sOCS, sOCM[3h] // Occlusion plane X+Y+Z<C in elems 0, 4
or flagsV1, flagsV1, $10 // Combine results for first vertex
vmov sTCL[5], vpST[3] // First vtx BA to elem 5
cfc2 $10, $vcc // Load occlusion plane mid results to bits 3 and 7
vmudh sOTM, vpScrI, $v31[4] // 4; scale up x and y
// vpMdl <- s1WI
// Loop entry label for the occlusion-plane build. Loads the next pair's model
// space positions into vpMdl, clamps fog, stores vertex 2's fog byte and the
// pair's RGBA (to tempVpRGBA), computes the occlusion edge terms, and then
// jumps to the address held in vLoopRet.
vtx_store_loop_entry:
ldv vpMdl[0], (VTX_IN_OB + 0 * inputVtxSize)(inVtx) // Pos of 1st vector for next iteration
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
ldv vpMdl[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // Pos of 2nd vector on next iteration
// vnop
andi $10, $10, (1 << 0) | (1 << 4) // Only bits 0, 4 from occlusion
vmulf $v29, sOPM, vpScrI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2
sub $11, outVtx2, fogFlag // Points 8 before outVtx2 if fog, else 0
vmacf sOCS, sO03, sOTM[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2
sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order
vmulf $v29, sOPM, vpScrI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2
sbv sFOG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
vmacf sOTM, sO03, sOTM[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2
jr vLoopRet
// sCLZ <- vpClpF
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0 (delay slot of the jr above)
// vnop in land slot
// Loop exit for the occlusion-plane build. Stores the first vertex's clip
// flags (in the branch delay slot, so on both paths) and either goes to
// clip_after_vtx_store when inVtx < 0 or falls through to vtx_end.
vtx_epilogue:
bltz inVtx, clip_after_vtx_store // inVtx < 0 means from clipping
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
// Fallthrough to vtx_end
.endif
// End of vertex processing: restores vTRC from vTRCValue and continues with
// the next display list command.
// With CFG_PROFILING_A this also hosts tris_end, which shares the timing code:
// the elapsed DPC_CLOCK time since startCounterTime is added to perfCounterA
// when arriving from vertex processing ($ra == 0), or to perfCounterD, minus
// the FIFO stall time, when arriving from triangle commands ($ra != 0).
vtx_end:
.if CFG_PROFILING_A
li $ra, 0 // Flag for coming from vtx
lqv vTRC, (vTRCValue)($zero) // Restore value overwritten by matrix
tris_end:
mfc0 $11, DPC_CLOCK
lw $10, startCounterTime
sub $11, $11, $10 // $11 = clock ticks elapsed since startCounterTime
beqz $ra, run_next_DL_command // $ra != 0 if from tri cmds
add perfCounterA, perfCounterA, $11 // Add to vert cycles perf counter (delay slot, runs on both paths)
sub perfCounterA, perfCounterA, $11 // From tris, undo add to vert perf counter
sub $10, perfCounterC, $4 // How long we stalled for RDP FIFO during this cmd
sub $11, $11, $10 // Subtract that from the tri cycles
j run_next_DL_command
add perfCounterD, perfCounterD, $11 // Add to tri cycles perf counter
.else
j run_next_DL_command
lqv vTRC, (vTRCValue)($zero) // Restore value overwritten by matrix
.endif
// loadOverlayInstrs is the number of instructions assembled between
// load_overlay_0_and_enter and the fixed address 0x1FC8 (while_wait_dma_busy).
// It is maintained by hand and must match the overlay loading code below in
// each configuration; the ".if . != 0x1FC8" check after that code fails the
// build if it does not.
.if CFG_PROFILING_B
loadOverlayInstrs equ 13 // 12 base instructions + 1 perfCounterC increment
.elseif CFG_PROFILING_C
loadOverlayInstrs equ 24 // 12 base + 1 clock read + 7 busy-wait timing + 4 duplicated dma_read_write
.else
loadOverlayInstrs equ 12
.endif
// The overlay loading code is placed so that it ends exactly at 0x1FC8.
endFreeImemAddr equ (0x1FC8 - (4 * loadOverlayInstrs))
// Everything from startFreeImem up to endFreeImem is unused IMEM.
startFreeImem:
.if . > endFreeImemAddr
.error "Out of IMEM space"
.endif
.org endFreeImemAddr
endFreeImem:
// Overlay loading entry points. All of them end up in load_overlay_inner,
// which adds the microcode's DRAM base (OSTask ucode pointer) to cmd_w1_dram,
// starts the DMA, waits for it to finish, and then jumps to postOvlRA.
//
// load_overlay_0_and_enter: load overlay 0 at 0x1000 and start executing it
// from its first instruction. Falls through into load_overlays_0_1.
load_overlay_0_and_enter:
li postOvlRA, 0x1000 // Sets up return address
li cmd_w1_dram, orga(ovl0_start) // Sets up ovl0 table address
// To use these: set postOvlRA ($10) to the address to execute after the load is
// done, and set cmd_w1_dram to orga(your_overlay).
load_overlays_0_1:
li dmaLen, ovl01_end - 0x1000 - 1 // DMA lengths are size minus 1
j load_overlay_inner
li dmemAddr, 0x1000 // Delay slot: destination address of overlays 0 and 1
// load_overlays_2_3_4: reached with jal from the overlay 2/3/4 entrypoints.
// The caller puts orga(overlay) in cmd_w1_dram in the jal's delay slot.
// Execution resumes at the jal's own address, which by then holds the newly
// loaded overlay's code. Falls through into load_overlay_inner.
load_overlays_2_3_4:
addi postOvlRA, $ra, -8 // Got here with jal, but want to return to addr of jal itself
li dmaLen, ovl234_end - ovl234_start - 1
li dmemAddr, ovl234_start
load_overlay_inner:
lw $11, OSTask + OSTask_ucode
.if CFG_PROFILING_B
addi perfCounterC, perfCounterC, 0x4000 // Increment overlay (all 0-4) load count
.endif
.if CFG_PROFILING_C
mfc0 ovlInitClock, DPC_CLOCK // see below
.endif
jal shared_dma_read_write // If CFG_PROFILING_C, use the one without perfCounterD
add cmd_w1_dram, cmd_w1_dram, $11 // Delay slot: overlay offset + ucode DRAM base
move $ra, postOvlRA // The busy-wait's jr $ra then jumps into the loaded overlay
// Fall through to while_wait_dma_busy
.if CFG_PROFILING_C
// ...except if profiling DMA time. According to Tharo's testing, and in contradiction
// to the manual, almost no instructions are issued while an IMEM DMA is happening.
// So we have to time it using counters.
mfc0 $11, SP_DMA_BUSY
@@while_dma_busy:
bnez $11, @@while_dma_busy
mfc0 $11, SP_DMA_BUSY
mfc0 $11, DPC_CLOCK
sub $11, $11, ovlInitClock // Clock ticks spent loading the overlay
jr $ra
add perfCounterD, perfCounterD, $11
// Also, normal dma_read_write below can't be changed to insert perfCounterD due to
// S2DEX constraints. So we have to duplicate that part of it.
dma_read_write:
mfc0 $11, SP_DMA_FULL
bnez $11, dma_read_write
addi perfCounterD, perfCounterD, 6 // 3 instr + 2 after mfc + 1 taken branch
j dma_read_write_not_full
// $11 load in delay slot is harmless.
.endif
.if . != 0x1FC8
// This has to be at this address for boot and S2DEX compatibility
.error "Error in organization of end of IMEM"
.endif
// The code from here to the end is shared with S2DEX, so great care is needed for changes.
// while_wait_dma_busy: spins until SP_DMA_BUSY reads zero, then returns to $ra.
// Clobbers $11.
while_wait_dma_busy:
mfc0 $11, SP_DMA_BUSY // Load the DMA_BUSY value
.if CFG_PROFILING_C
bnez $11, while_wait_dma_busy
// perfCounterD is $12, which is a temp register in S2DEX, which happens to
// never have state carried over while_wait_dma_busy.
addi perfCounterD, perfCounterD, 6 // 3 instr + 2 after mfc + 1 taken branch
.else
@@while_dma_busy:
bnez $11, @@while_dma_busy // Loop until DMA_BUSY is cleared
mfc0 $11, SP_DMA_BUSY // Update DMA_BUSY value
.endif
old_return_routine:
jr $ra
// Has mfc0 in branch delay slot, causes a stall if first instr after ret is load
// (the delay slot is the first instruction of dma_read_write below)
.if !CFG_PROFILING_C
dma_read_write:
.endif
// dma_read_write / shared_dma_read_write: starts a DMA and returns to $ra
// without waiting for it to complete.
//   dmemAddr    - DMEM/IMEM address; negative selects a write to DRAM
//   cmd_w1_dram - DRAM address
//   dmaLen      - transfer length register value (callers pass size - 1)
// Waits first until SP_DMA_FULL reads zero. Clobbers $11.
shared_dma_read_write:
mfc0 $11, SP_DMA_FULL // load the DMA_FULL value
@@while_dma_full:
bnez $11, @@while_dma_full // Loop until DMA_FULL is cleared
mfc0 $11, SP_DMA_FULL // Update DMA_FULL value
dma_read_write_not_full:
mtc0 dmemAddr, SP_MEM_ADDR // Set the DMEM address to DMA from/to
bltz dmemAddr, dma_write // If the DMEM address is negative, this is a DMA write, if not read
mtc0 cmd_w1_dram, SP_DRAM_ADDR // Set the DRAM address to DMA from/to
jr $ra
mtc0 dmaLen, SP_RD_LEN // Initiate a DMA read with a length of dmaLen
dma_write:
jr $ra
mtc0 dmaLen, SP_WR_LEN // Initiate a DMA write with a length of dmaLen
.if . != 0x00002000
.error "Code at end of IMEM shared with other ucodes has been corrupted"
.endif
.headersize 0x00001000 - orga()
// Overlay 0 handles three cases of stopping the current microcode.
// The action here is controlled by $1. If yielding, $1 > 0. If this was
// G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0.
ovl0_start:
jal flush_rdp_buffer // See G_FLUSH_handler for docs on these 3 instructions.
sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1
jal flush_rdp_buffer
add taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr was where in the DL after the current chunk loaded
.if CFG_PROFILING_C
mfc0 $11, DPC_CLOCK
lw $10, startCounterTime
sub $11, $11, $10
add perfCounterA, perfCounterA, $11
.endif
bnez $1, task_done_or_yield // Continue to load ucode if 0
// The first instruction of load_ucode is the delay slot of the bnez above, so
// it also runs when the branch is taken; both task_yield and task_done reload
// cmd_w1_dram before using it.
load_ucode:
lw cmd_w1_dram, (inputBufferEnd - 0x04)(inputBufferPos) // word 1 = ucode code DRAM addr
sw $zero, OSTask + OSTask_flags // So next ucode knows it didn't come from yield
li dmemAddr, start // Beginning of overwritable part of IMEM
sw taskDataPtr, OSTask + OSTask_data_ptr // Store where we are in the DL
sw cmd_w1_dram, OSTask + OSTask_ucode // Store pointer to new ucode about to execute
// Store counters in mvpMatrix; first 0x180 of DMEM will be preserved in ucode swap AND
// if other ucode yields
sw perfCounterA, mvpMatrix + YDF_OFFSET_PERFCOUNTERA
sw perfCounterB, mvpMatrix + YDF_OFFSET_PERFCOUNTERB
sw perfCounterC, mvpMatrix + YDF_OFFSET_PERFCOUNTERC
sw perfCounterD, mvpMatrix + YDF_OFFSET_PERFCOUNTERD
jal dma_read_write // DMA DRAM read -> IMEM write
li dmaLen, (while_wait_dma_busy - start) - 1 // End of overwritable part of IMEM
lw cmd_w1_dram, rdpHalf1Val // Get DRAM address of ucode data from rdpHalf1Val
li dmemAddr, endSharedDMEM // DMEM address is endSharedDMEM
andi dmaLen, cmd_w0, 0x0FFF // Extract DMEM length from command word
add cmd_w1_dram, cmd_w1_dram, dmemAddr // Start overwriting data from endSharedDMEM
jal dma_read_write // initiate DMA read
sub dmaLen, dmaLen, dmemAddr // End that much before the end of DMEM
j while_wait_dma_busy
// Jumping to actual start of new ucode, which normally zeros vZero. Not sure why later ucodes
// jumped one instruction in.
li $ra, start // Delay slot: the busy-wait returns to start once the DMAs are done
.if . > start
.error "ovl0_start does not fit within the space before the start of the ucode loaded with G_LOAD_UCODE"
.endif
// Reached when $1 != 0. Saves the four perf counters into the yield data
// footer, then either goes to task_done ($1 < 0) or falls into task_yield.
task_done_or_yield:
sw perfCounterA, yieldDataFooter + YDF_OFFSET_PERFCOUNTERA
sw perfCounterB, yieldDataFooter + YDF_OFFSET_PERFCOUNTERB
sw perfCounterC, yieldDataFooter + YDF_OFFSET_PERFCOUNTERC
bltz $1, task_done // $1 < 0 = Got to the end of the parent DL
sw perfCounterD, yieldDataFooter + YDF_OFFSET_PERFCOUNTERD // Delay slot, runs on both paths
// Yield: records the DL position and ucode pointer in the footer, then DMAs
// OS_YIELD_DATA_SIZE bytes of DMEM starting at 0 out to the OSTask yield data
// pointer. dma_read_write returns to set_status_and_break with $10 holding
// the status bits.
task_yield: // Otherwise $1 > 0 = CPU requested yield
lw $11, OSTask + OSTask_ucode // Save pointer to current ucode
lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr
li dmemAddr, 0x8000 // 0, but negative = write
li dmaLen, OS_YIELD_DATA_SIZE - 1
li $10, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals
sw taskDataPtr, yieldDataFooter + YDF_OFFSET_TASKDATAPTR // Save pointer to where in DL
sw $11, yieldDataFooter + YDF_OFFSET_UCODE
j dma_read_write
li $ra, set_status_and_break // Delay slot: where dma_read_write returns to
// Task finished: writes only the yield data footer back to DRAM, at the same
// offset within the yield data buffer as it has in DMEM, waits for the DMA to
// complete, then signals task done.
task_done:
// Copy just the yield data footer, which has the perf counters.
lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr
addi cmd_w1_dram, cmd_w1_dram, yieldDataFooter
li dmemAddr, 0x8000 | yieldDataFooter // negative = write
jal dma_read_write
li dmaLen, YIELD_DATA_FOOTER_SIZE - 1
jal while_wait_dma_busy // Returns to set_status_and_break just below
li $10, SP_SET_SIG2 // task done signal
// Writes $10 to SP_STATUS and halts the RSP with break.
set_status_and_break: // $10 is the status to set
mtc0 $10, SP_STATUS
break 0
nop
ovl0_end:
.align 8
ovl0_padded_end:
.if ovl0_padded_end > ovl01_end
.error "Automatic resizing for overlay 0 failed"
.endif
// overlay 1 (0x178 bytes loaded into 0x1000)
// NOTE(review): the 0x178 size above is not checked by anything visible here;
// confirm it is still accurate.
.headersize 0x00001000 - orga()
ovl1_start:
// G_POPMTX: moves the matrix stack pointer down by the byte amount in
// cmd_w1_dram, clamped so it never goes below the OSTask dram_stack address,
// then reloads the matrix from the new stack position via do_movemem.
// mvpValid is cleared in all cases.
G_POPMTX_handler:
lw $11, matrixStackPtr // Current matrix stack pointer
lw $2, OSTask + OSTask_dram_stack // Top of the stack
sub cmd_w1_dram, $11, cmd_w1_dram // Decrease pointer by amount in command
sub $1, cmd_w1_dram, $2 // Is it still valid / within the stack?
bgez $1, @@skip // If so, skip the failsafe
sb $zero, mvpValid // Mark matrix as needing recompute (delay slot, runs on both paths)
move cmd_w1_dram, $2 // Use the top of the stack as the new pointer
@@skip:
j do_movemem // Load the new matrix from the stack
sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer
// G_MTX: optionally pushes the current matrix to the DRAM matrix stack
// (modelview + push only), then falls into the movemem path to DMA the new
// matrix in. Adding the load type to $7 selects, through movememHandlerTable,
// whether the code run after the DMA multiplies or just loads.
G_MTX_handler:
// The lower 3 bits of G_MTX are, from LSb to MSb (0 value/1 value),
// matrix type (modelview/projection)
// load type (multiply/load)
// push type (nopush/push)
// In F3DEX2 (and by extension F3DZEX), G_MTX_PUSH is inverted, so 1 is nopush and 0 is push
.if CFG_PROFILING_C
addi perfCounterC, perfCounterC, 1 // Increment matrix count
.endif
andi $11, cmd_w0, G_MTX_P_MV | G_MTX_NOPUSH_PUSH // Read the matrix type and push type flags into $11
bnez $11, load_mtx // If the matrix type is projection or this is not a push, skip pushing the matrix
andi $2, cmd_w0, G_MTX_MUL_LOAD // Read the matrix load type into $2 (0 is multiply, 2 is load)
// TODO move this codepath to ovl3
lw cmd_w1_dram, matrixStackPtr // Set up the DMA from dmem to rdram at the matrix stack pointer
li dmemAddr, -0x2000 // Negative = DMA write (see dma_read_write); presumably only the low bits form the DMEM address, i.e. 0 -- TODO confirm
jal dma_read_write // DMA the current matrix from dmem to rdram
li dmaLen, 0x0040 - 1 // Set the DMA length to the size of a matrix (minus 1 because DMA is inclusive)
addi cmd_w1_dram, cmd_w1_dram, 0x40 // Increase the matrix stack pointer by the size of one matrix
sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer
lw cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Load command word 1 again
load_mtx:
add $7, $7, $2 // Add the load type to the command byte in $7, selects the return address based on whether the matrix needs multiplying or just loading
sb $zero, mvpValid
// G_MOVEMEM: converts the segmented address and DMAs the data into the DMEM
// location looked up in movememTable, then continues at the address read from
// movememHandlerTable.
G_MOVEMEM_handler:
jal segmented_to_physical // Convert the segmented address in cmd_w1_dram to a physical one
// The first instruction of do_movemem is the jal's delay slot; the call
// returns to the lbu after it. G_POPMTX jumps here directly with an address
// that needs no conversion.
do_movemem:
andi $1, cmd_w0, 0x00FE // Move the movemem table index into $1 (bits 1-7 of the first command word)
lbu dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Move the second byte of the first command word into dmaLen
lhu dmemAddr, (movememTable)($1) // Load the address of the memory location for the given movemem index
srl $2, cmd_w0, 5 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0
lh $ra, (movememHandlerTable - (G_POPMTX | 0xFF00))($7) // Loads the return address from movememHandlerTable based on command byte
j dma_read_write // Delay slot is the add at G_SETOTHERMODE_H_handler, which applies the offset in $2 to dmemAddr
// G_SETOTHERMODE_H / G_SETOTHERMODE_L: replace a bit field in one of the two
// othermode words, then send both words to the RDP through G_RDP_handler.
// $11 holds the address of the handler that was jumped to; because the two
// labels are 4 bytes apart, the same $11-relative load/store addresses
// othermode0 for _H and the following word for _L.
// The mask is built from cmd_w0: 0x80000000 is arithmetic-shifted right by the
// low 5 bits of cmd_w0, logical-shifted right by the low 5 bits of
// (cmd_w0 >> 8), and then inverted, so the field's bits are 0 in the final
// mask. The old word is ANDed with the mask and ORed with cmd_w1_dram.
// NOTE(review): this block spells the symbol both othermode0 and otherMode0;
// armips symbol names are believed to be case-insensitive -- confirm.
G_SETOTHERMODE_H_handler: // These handler labels must be 4 bytes apart for the code below to work
add dmemAddr, dmemAddr, $2 // This is for the code above, does nothing for G_SETOTHERMODE_H
G_SETOTHERMODE_L_handler:
lw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11) // resolves to othermode0 or othermode1 based on which handler was jumped to
lui $2, 0x8000
srav $2, $2, cmd_w0 // Run of 1s from the top bit
srl $1, cmd_w0, 8
srlv $2, $2, $1 // Move the run of 1s down to the field position
nor $2, $2, $zero // Invert: 0s over the field, 1s elsewhere
and $3, $3, $2 // Clear the field
or $3, $3, cmd_w1_dram // Insert the new field value
sw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11)
lw cmd_w0, otherMode0
j G_RDP_handler
lw cmd_w1_dram, otherMode1
// G_RDPSETOTHERMODE / G_SETSCISSOR: remember both command words in DMEM and
// forward the command to the RDP. The othermode words sit 8 bytes after the
// scissor words, so $1 (0 or 8) selects which pair is written.
G_RDPSETOTHERMODE_handler:
li $1, 8 // Offset from scissor DMEM to othermode DMEM
G_SETSCISSOR_handler: // $1 is 0 if jumped here
sw cmd_w0, (scissorUpLeft)($1) // otherMode0 = scissorUpLeft + 8
j G_RDP_handler // Send the command to the RDP
sw cmd_w1_dram, (scissorBottomRight)($1) // otherMode1 = scissorBottomRight + 8
// G_GEOMETRYMODE: new mode = (old mode & cmd_w0) | cmd_w1_dram. The address is
// formed relative to $7 (the negative command byte), so the constant offset
// cancels it out and the access lands on geometryModeLabel.
G_GEOMETRYMODE_handler: // $7 = G_GEOMETRYMODE (as negative) if jumped here
lw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value
and $11, $11, cmd_w0 // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
or $11, $11, cmd_w1_dram // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
j run_next_DL_command // run the next DL command
sw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // update the geometry mode value
// G_TEXTURE / G_TEXRECT / G_TEXRECTFLIP / G_RDPHALF_1: store command words to
// DMEM and continue with the next command. All stores are relative to $11,
// which holds the address of the handler that was jumped to, so the
// destination depends on the entry label; G_TEXTURE overrides $11 so the same
// stores land in textureSettings1 and the word after it.
G_TEXTURE_handler:
li $11, textureSettings1 - (texrectWord1 - G_TEXRECTFLIP_handler) // Calculate the offset from texrectWord1 and $11 for saving to textureSettings
G_TEXRECT_handler: // $11 contains address of handler
G_TEXRECTFLIP_handler:
// Stores first command word into textureSettings for gSPTexture, 0x00D0 for gSPTextureRectangle/Flip
sw cmd_w0, (texrectWord1 - G_TEXRECTFLIP_handler)($11)
// G_RDPHALF_1 enters here and so performs only the second-word store below.
G_RDPHALF_1_handler:
j run_next_DL_command
// Stores second command word into textureSettings for gSPTexture, 0x00D4 for gSPTextureRectangle/Flip, 0x00D8 for G_RDPHALF_1
sw cmd_w1_dram, (texrectWord2 - G_TEXRECTFLIP_handler)($11)
// G_RDPHALF_2: completes a rectangle command. Copies the 8 bytes saved at
// texrectWord1 into the RDP command buffer, loads the value saved by
// G_RDPHALF_1 into cmd_w0, and hands over to G_RDP_handler. Also clears
// materialCullMode.
G_RDPHALF_2_handler:
ldv $v29[0], (texrectWord1)($zero)
lw cmd_w0, rdpHalf1Val // load the RDPHALF1 value into w0
addi rdpCmdBufPtr, rdpCmdBufPtr, 8
.if !ENABLE_PROFILING
addi perfCounterB, perfCounterB, 1 // Increment number of tex/fill rects
.endif
sb $zero, materialCullMode // This covers tex and fill rects
j G_RDP_handler
sdv $v29[0], -8(rdpCmdBufPtr) // Delay slot: write the saved words into the 8 bytes just reserved
// G_RELSEGMENT: same as G_MOVEWORD, but the value in cmd_w1_dram is first
// converted with segmented_to_physical. The first instruction of
// G_MOVEWORD_handler is the jal's delay slot; the call returns to the lhu.
G_RELSEGMENT_handler:
jal segmented_to_physical // Resolve new segment address relative to existing segment
// G_MOVEWORD: store cmd_w1_dram into DMEM at movewordTable[index] plus the
// offset in the low half of cmd_w0. A halfword is always stored; unless the
// top bit of that offset is set, a full word is then stored at the same
// address, overwriting it.
G_MOVEWORD_handler:
srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
lhu $10, (movewordTable - (G_MOVEWORD << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
do_moveword:
sll $11, cmd_w0, 16 // Sign bit = upper bit of offset
add $10, $10, cmd_w0 // Offset + base; only lower 12 bits matter
bltz $11, run_next_DL_command // If upper bit of offset is set, exit after halfword
sh cmd_w1_dram, ($10) // Store value from cmd into halfword (delay slot, runs on both paths)
j run_next_DL_command
sw cmd_w1_dram, ($10) // Store value from cmd into word (offset + moveword_table[index])
// Converts the segmented address in cmd_w1_dram to the corresponding physical address
// In/out: cmd_w1_dram. Clobbers $11. Returns with jr $ra.
// The segment number is taken from bits 24-27 of the input; the result is the
// input's low 24 bits plus that segment's entry in segmentTable.
segmented_to_physical:
srl $11, cmd_w1_dram, 22 // Copy (segment index << 2) into $11
andi $11, $11, 0x3C // Clear the bottom 2 bits that remained during the shift
lw $11, (segmentTable)($11) // Get the current address of the segment
sll cmd_w1_dram, cmd_w1_dram, 8 // Shift the address to the left so that the top 8 bits are shifted out
srl cmd_w1_dram, cmd_w1_dram, 8 // Shift the address back to the right, resulting in the original with the top 8 bits cleared
jr $ra
add cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the physical address
// G_CULLDL: ANDs together the screen/camera-plane clip flags of every vertex
// from the start vertex to the end vertex inclusive. If the running AND ever
// reaches zero, the display list continues normally. If some flag bit remains
// set on every vertex, falls through into G_ENDDL_handler to end the current
// display list.
G_CULLDL_handler:
lhu $10, (vertexTable)(cmd_w0) // Start vtx addr
lhu $3, (vertexTable)(cmd_w1_dram) // End vertex
/*
CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1
verts which are behind the occlusion plane, and 1 vert which is behind the camera
plane and therefore randomly erroneously also set as behind the occlusion plane.
However, the convex hull of all the verts goes through visible area. This will be
incorrectly culled here. We can't afford the extra few instructions to disable
the occlusion plane if the vert is behind the camera, because this only matters for
G_CULLDL and not for tris.
*/
li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE)
lhu $11, VTX_CLIP($10)
culldl_loop:
and $1, $1, $11 // $1 = flag bits set on every vertex seen so far
beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render
lhu $11, (vtxSize + VTX_CLIP)($10) // next vertex clip flags
bne $10, $3, culldl_loop // loop until reaching the last vertex
addi $10, $10, vtxSize // advance to the next vertex
li cmd_w0, 0 // Clear count of DL cmds to skip loading
// G_ENDDL (also reached by fallthrough from G_CULLDL): pops one entry off the
// display list stack and resumes the parent list through call_ret_common. If
// the stack is empty, loads overlay 0 instead; the decrement in the delay slot
// leaves $1 = -4, which overlay 0 reads as "end of the parent DL".
G_ENDDL_handler:
lbu $1, displayListStackLength // Load the DL stack index; if end stack,
beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end
addi $1, $1, -4 // Decrement the DL stack index
j call_ret_common // has a different version in ovl1
lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to
ovl1_end:
.align 8
ovl1_padded_end:
.if ovl1_padded_end > ovl01_end
.error "Automatic resizing for overlay 1 failed"
.endif
.headersize ovl234_start - orga()
ovl2_start:
// Basic lighting overlay.
// The three entrypoints below sit at fixed offsets from ovl234_start that are
// shared with the other overlays occupying this IMEM region, so the
// instruction count before each label must stay the same in every overlay
// (hence the nop under CFG_PROFILING_B).
// Jump here for basic lighting setup. If overlay 2 is loaded (this code), jumps into the
// rest of the lighting code below.
ovl234_ltbasic_entrypoint:
.if CFG_PROFILING_B
nop // Needs to take up the space for the other perf counter
.endif
j ltbasic_continue_setup
and $11, viLtFlag, $7 // viLtFlag=7F lts valid, $7=18 mtx valid
// Jump here for advanced lighting. If overlay 2 is loaded (this code), loads
// overlay 4 and jumps to right here, which is now in the new code.
ovl234_ltadv_entrypoint_ovl2ver: // same IMEM address as ovl234_ltadv_entrypoint
.if CFG_PROFILING_B
addi perfCounterD, perfCounterD, 1 // Count overlay 4 load
.endif
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
li cmd_w1_dram, orga(ovl4_start) // set up a load for overlay 4
// Jump here for clipping and rare commands. If overlay 2 is loaded (this code), loads overlay 3
// and jumps to right here, which is now in the new code.
ovl234_clipmisc_entrypoint_ovl2ver: // same IMEM address as ovl234_clipmisc_entrypoint
sh $ra, tempTriRA // Tri return after clipping
.if CFG_PROFILING_B
addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load
.endif
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
li cmd_w1_dram, orga(ovl3_start) // set up a load for overlay 3
// Continuation of ovl234_ltbasic_entrypoint. $11 = viLtFlag & $7 was computed
// in that jump's delay slot; nonzero means the transformed light directions
// are still valid and xfrm_dir_lights can be skipped.
ltbasic_continue_setup:
bnez $11, ltbasic_setup_after_xfrm // Skip if lights and matrix were valid
addi ambLight, ambLight, altBase // Point to ambient light; stored through vtx proc (delay slot, runs on both paths)
// Transforms the directional lights (starting with the two lookat directions)
// from world space to model space. This part builds M transpose in
// $v8-$v10 (int) and $v16-$v18 (frac), duplicated into both halves of each
// register, starts transforming the lookat pair, and marks the transformed
// directions valid by storing 0x7F to pointLightFlagOrDirXfrmValid. It falls
// through into xfrm_light_loop_1.
xfrm_dir_lights:
lpWrld equ $v11 // light pair world direction
lpMdl equ $v12 // light pair model space direction (not yet normalized)
lpFinal equ $v13 // light pair normalized model space direction
lpSqrI equ $v14 // Light pair direction squared int part
lpSqrF equ $v15 // Light pair direction squared frac part
lpMdl2 equ $v19 // Copy of lpMdl for pipelining
lpSumI equ $v20 // Light pair direction sum of squares int part
lpSumF equ $v21 // Light pair direction sum of squares frac part
lpRsqI equ $v22 // Light pair reciprocal square root int part
lpRsqF equ $v23 // Light pair reciprocal square root frac part
// Note that these aliases overlap the registers used for the transpose below
// ($v11-$v15, $v19-$v23); the transpose scratch values are consumed by the
// vmrg instructions before the aliases are written.
// Transform directional lights' direction by M transpose.
// First, load M transpose. $v0-$v7 is the MVP matrix and $v24-$v31 is
// permanent values, leaving $v8-$v15 and $v16-$v23 for the transposes.
// This is mainly just an excuse to use the rare ltv and swv instructions.
// The F3DEX2 implementation takes 18 instructions and 11 cycles.
// This implementation is 23 instructions and 17 cycles, but this version
// loads M transpose to both halves of each vector so we can process two
// lights at a time, which matters because there's always at least 3 lights
// (technically 2 for EX3)--the lookat directions. Plus, those 17 cycles
// also include a few instructions starting the loop.
// Memory at mMatrix contains, in shorts within qwords, for the elements we care about:
// A B C - D E F - (X int, Y int)
// G H I - - - - - (Z int, W int)
// M N O - P Q R - (X frac, Y frac)
// S T U - - - - - (Z frac, W frac)
// First, load this pattern in $v8-$v15 (int) and $v16-$v23 (frac).
// $v8 A - G - A - G - $v16 M - S - M - S -
// $v9 - B - H - B - H $v17 - N - T - N - T
// $v10 I - C - I - C - $v18 U - O - U - O -
// $v11 - - - - - - - - $v19 - - - - - - - -
// $v12 D - - - D - - - $v20 P - - - P - - -
// $v13 - E - - - E - - $v21 - Q - - - Q - -
// $v14 - - F - - - F - $v22 - - R - - - R -
// $v15 - - - - - - - - $v23 - - - - - - - -
ltv $v8[0], (mMatrix + 0x00)($zero) // A to $v8[0] etc.
ltv $v8[12], (mMatrix + 0x10)($zero) // G to $v8[2] etc.
ltv $v8[8], (mMatrix + 0x00)($zero) // A to $v8[4] etc.
ltv $v8[4], (mMatrix + 0x10)($zero) // G to $v8[6] etc.
ltv $v16[0], (mMatrix + 0x20)($zero)
ltv $v16[12], (mMatrix + 0x30)($zero)
ltv $v16[8], (mMatrix + 0x20)($zero)
ltv $v16[4], (mMatrix + 0x30)($zero)
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
vmudh $v9, vOne, $v9[1q] // B - H - B - H -
lsv $v18[6], (mMatrix + 0x2C)($zero) // U - O(R)U - O -
vmrg $v8, $v8, $v12[0q] // A D G - A D G -
lsv $v18[14], (mMatrix + 0x2C)($zero) // U - O R U - O(R)
vmrg $v10, $v10, $v14[0q] // I - C F I - C F
lpv lpWrld[0], (lightBufferLookat - altBase)(altBaseReg) // Lookat 0 and 1
vmudh $v17, vOne, $v17[1q] // N - T - N - T -
li curLight, altBase - 4 * lightSize // + ltBufOfs = light -4; write pointer
vmrg $v9, $v9, $v13 // B E H - B E H -
li $11, 0x7F // Mark lights valid
vmrg $v16, $v16, $v20[0q] // M P S - M P S -
swv $v18[4], (tempXfrmLt)(rdpCmdBufEndP1) // Stores O R U - O R U -
vmudh $v29, $v8, lpWrld[0h] // Start transforming lookat
lqv $v18, (tempXfrmLt)(rdpCmdBufEndP1)
// This is slightly wrong, vmrg writes accum lo. But only affects lookat and
// we are only reading accum mid result. Basically rounding error.
vmrg $v17, $v17, $v21 // N Q T - N Q T -
swv $v10[4], (tempXfrmLt)(rdpCmdBufEndP1) // Stores C F I - C F I -
vmadh $v29, $v9, lpWrld[1h]
lqv $v10, (tempXfrmLt)(rdpCmdBufEndP1)
vmadn $v29, $v16, lpWrld[0h]
sb $11, pointLightFlagOrDirXfrmValid
// 18 cycles
// Software-pipelined loop transforming two light directions per iteration.
// Each pass overlaps three stages that belong to different light pairs:
//  - finish the M-transpose multiply for the pair in lpWrld -> lpMdl;
//  - reciprocal square root of the sum of squares (lpSumI:lpSumF) computed on
//    the previous pass, then scale lpMdl2 by it -> lpFinal;
//  - square and sum the new lpMdl for use on the next pass, and fetch the
//    next pair's world directions into lpWrld.
// The normalized directions are written back two lights behind the read
// position. The loop exits to ltbasic_setup_after_xfrm by fallthrough when
// curLight equals ambLight, or by the bgtz when it is past it. While
// curLight - altBaseReg <= 0 it goes to xfrm_light_store_lookat instead
// (outside this view; presumably that code re-enters at xfrm_light_loop_2).
xfrm_light_loop_1:
vmadn $v29, $v18, lpWrld[2h]
xfrm_light_loop_2:
vmadn $v29, $v17, lpWrld[1h]
vmadh lpMdl, $v10, lpWrld[2h] // lpMdl[0:2] and [4:6] = two lights dir in model space
vrsqh $v29[0], lpSumI[0]
vrsql lpRsqF[0], lpSumF[0]
vrsqh lpRsqI[0], lpSumI[4]
addi curLight, curLight, 2 * lightSize // Iters: -2, 0, 2, ...
vrsql lpRsqF[4], lpSumF[4]
lw $20, (ltBufOfs + 8 + 2 * lightSize)(curLight) // First iter = light 0
vrsqh lpRsqI[4], $v31[2] // 0
lw $24, (ltBufOfs + 8 + 3 * lightSize)(curLight) // First iter = light 1
vmudh $v29, lpMdl, lpMdl // Squared
sub $10, curLight, altBaseReg // Is curLight (write ptr) <= 0?
vreadacc lpSqrF, ACC_MIDDLE // Read not-clamped value
sub $11, curLight, ambLight // Is curLight (write ptr) <, =, or > ambient light?
vreadacc lpSqrI, ACC_UPPER
sw $20, (tempXfrmLt)(rdpCmdBufEndP1) // Store light 0
vmudm $v29, lpMdl2, lpRsqF[0h] // Vec int * frac scaling
sw $24, (tempXfrmLt + 4)(rdpCmdBufEndP1) // Store light 1
vmadh lpFinal, lpMdl2, lpRsqI[0h] // Vec int * int scaling
lpv lpWrld[0], (tempXfrmLt)(rdpCmdBufEndP1) // Load dirs 0-2, 4-6
vmudm $v29, vOne, lpSqrF[2h] // Sum of squared components
vmadh $v29, vOne, lpSqrI[2h]
vmadm $v29, vOne, lpSqrF[1h]
vmadh $v29, vOne, lpSqrI[1h]
spv lpFinal[0], (tempXfrmLt)(rdpCmdBufEndP1) // Store elem 0-2, 4-6 as bytes to temp memory
vmadn lpSumF, lpSqrF, vOne // elem 0, 4; swapped so we can do vmadn and get result
lw $20, (tempXfrmLt)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit
vmadh lpSumI, lpSqrI, vOne
lw $24, (tempXfrmLt + 4)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit
vcopy lpMdl2, lpMdl
blez $10, xfrm_light_store_lookat // curLight = -2 or 0
vmudh $v29, $v8, lpWrld[0h] // Delay slot: begins the next pair's transform on both paths
// 20 cycles from xfrm_light_loop_2 not counting land
vmadh $v29, $v9, lpWrld[1h]
bgtz $11, ltbasic_setup_after_xfrm // curLight > ambient; only one light valid
sw $20, (ltBufOfs + 0xC - 2 * lightSize)(curLight) // Write light relative -2
vmadn $v29, $v16, lpWrld[0h]
bltz $11, xfrm_light_loop_1 // curLight < ambient; more lights to compute
sw $24, (ltBufOfs + 0xC - 1 * lightSize)(curLight) // Write light relative -1
// Basic lighting setup, reached once the light directions have been transformed.
// Loads the constant registers vLTC / $v30 (layout in the table below) and, from
// the geometry mode bits in vGeomMid, selects:
//   vLoopRet      - per-vertex lighting entry (ltbasic_start_standard or _packed)
//   lbAfter       - first handler jumped to from ltbasic_post
//   lbPostAo      - handler jumped to at the end of ltbasic_ao
//   lbTexgenOrRet - handler jumped to from ltbasic_l2a / tested in ltbasic_packed
// Every exit goes to vtx_after_lt_setup. Note that most branches below have a
// useful instruction in their delay slot which executes on both paths.
ltbasic_setup_after_xfrm:
// Constants registers:
// e0 e1 e2 e3 e4 e5 e6 e7
// vLTC 0xF800 Lt1 Z AOAmb AODir Lt1 X Lt1 Y AOAmb AODir
// $v30 SOffs TOffs 0/AOa Persp SOffs TOffs 0x0020 0x0800
lpv vLTC[0], (ltBufOfs + 8 - lightSize)(ambLight) // First lt xfrmed dir in elems 4-6
addi lbFakeAmb, ambLight, ltBufOfs // Ptr to load amb light from; normally actual ambient light
li vLoopRet, ltbasic_start_standard // Default entry; replaced below if packed normals
andi $11, vGeomMid, (G_AMBOCCLUSION | G_PACKED_NORMALS | G_LIGHTTOALPHA | G_TEXTURE_GEN) >> 8
vmov $v30[2], $v31[2] // 0 as AO alpha offset
vmov vLTC[1], vLTC[6] // Move first lt Z to elem 1; watch stall on vLTC load
beqz $11, vtx_after_lt_setup // None of the above features enabled
li lbAfter, vtx_return_from_lighting // (delay slot) default: no post-lighting handler
andi $11, vGeomMid, G_TEXTURE_GEN >> 8
beqz $11, @@skip_texgen
andi $10, vGeomMid, G_PACKED_NORMALS >> 8 // (delay slot) tested at @@skip_texgen
li lbAfter, 0x8000 | ltbasic_texgen // Negative is used as flag
@@skip_texgen:
beqz $10, @@skip_packed
move lbTexgenOrRet, lbAfter // (delay slot) texgen handler (negative) or plain return
// Packed normals setup
sbv $v31[15], (3)(lbFakeAmb) // 0xFF; Set ambient "alpha" to FF / 7F80
vmov $v30[6], $v31[2] // 0; clear element 6, will overwrite second byte of it below
sbv $v31[15], (7)(lbFakeAmb) // 0xFF; so vpLtTot alpha ~= 7FFF, so * vtx alpha
li lbAfter, ltbasic_packed
li vLoopRet, ltbasic_start_packed
lsv vLTC[0], (packedNormalsMaskConstant - altBase)(altBaseReg) // 0xF800; cull mode already zeroed
llv $v30[13], (packedNormalsConstants - altBase)(altBaseReg) // 00[20 0800 OB]; out of bounds truncates
@@skip_packed:
andi $11, vGeomMid, G_LIGHTTOALPHA >> 8
beqz $11, @@skip_l2a
andi $10, vGeomMid, G_AMBOCCLUSION >> 8 // (delay slot) tested at @@skip_l2a
li lbAfter, ltbasic_l2a // Takes priority over ltbasic_packed chosen above
@@skip_l2a:
beqz $10, vtx_after_lt_setup
// AO setup
// (the move below is the delay slot of the beqz above, so it runs either way)
move lbPostAo, lbAfter // Harmless to be done even if not AO
addi lbFakeAmb, rdpCmdBufEndP1, tempAmbient // Temp mem as ambient light
vmov $v30[2], $v31[7] // 7FFF as AO alpha offset
spv vOne[0], (0)(lbFakeAmb) // Store all zeros here (upper bytes of vOne are 0)
llv vLTC[4], (aoAmbientFactor - altBase)(altBaseReg) // Ambient and dir to elems 2, 3
llv vLTC[12], (aoAmbientFactor - altBase)(altBaseReg) // Ambient and dir to elems 6, 7
j vtx_after_lt_setup
li lbAfter, ltbasic_ao // (delay slot) AO runs first, then chains to lbPostAo
// Side path of the light transform loop, taken while $10 <= 0 (see the blez in
// the loop). Stores lpFinal as the transformed lookat directions instead of
// writing it back to the light buffer, issues the same multiply-accumulate
// steps the loop would have issued after the branch, and re-enters the loop at
// xfrm_light_loop_2 (the vmadn that xfrm_light_loop_1 would perform is done in
// the delay slot of the jump here).
xfrm_light_store_lookat:
vmadh $v29, $v9, lpWrld[1h]
spv lpFinal[0], (xfrmLookatDirs)($zero) // Store lookat. 1st time garbage, 2nd real
vmadn $v29, $v16, lpWrld[0h]
j xfrm_light_loop_2
vmadn $v29, $v18, lpWrld[2h] // (delay slot) same instruction as at xfrm_light_loop_1
// Lighting within vertex loop
// The instan_lt_* macros below hold instructions that are not part of the
// lighting math itself but are interleaved into the lighting code at the points
// where the macros are invoked (ltbasic_start_*, ltbasic_after_start and
// ltbasic_post). Two alternative sets are defined, selected at assembly time by
// CFG_NO_OCCLUSION_PLANE. The "X <- Y" comments record register alias handoffs.
.if CFG_NO_OCCLUSION_PLANE
// Vector instructions 1-3: one is issued at each of the first three vector
// slots of ltbasic_start_packed / ltbasic_start_standard.
.macro instan_lt_vec_1
vmadh $v29, vMTX1I, vpMdl[1h]
.endmacro
.macro instan_lt_vec_2
vmadn vpClpF, vMTX2F, vpMdl[2h]
.endmacro
.macro instan_lt_vec_3
vmadh vpClpI, vMTX2I, vpMdl[2h]
.endmacro
// lDOT <- vpMdl
// Scalar instructions 1-2: issued in ltbasic_after_start around the vsub.
.macro instan_lt_scl_1
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
.endmacro
.macro instan_lt_scl_2
or flagsV1, flagsV1, $10 // Combine results for first vertex
.endmacro
// sFOG <- lCOL
// Two vector + two scalar instructions issued at the top of ltbasic_post.
.macro instan_lt_vs_45
vge sFOG, vpScrI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
.endmacro
.else
// Occlusion plane enabled: a different set of instructions fills the same slots.
.macro instan_lt_vec_1
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
.endmacro
.macro instan_lt_vec_2
vmrg sOCS, sOCS, sOTM // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
.endmacro
.macro instan_lt_vec_3
vmrg vpScrF, vpScrF, sCLZ[2h] // Z int elem 2, 6 to elem 1, 5; Z frac in elem 2, 6
.endmacro
// lDOT <- sCLZ
// vpRGBA <- sOTM
// $11 is live between instan_lt_scl_1 and instan_lt_scl_2 in this configuration.
.macro instan_lt_scl_1
sub $11, outVtx1, fogFlag // Points 8 before outVtx1 if fog, else 0
.endmacro
.macro instan_lt_scl_2
sbv sFOG[7], (VTX_COLOR_A + 8)($11)
.endmacro
// lCOL <- sFOG
.macro instan_lt_vs_45
vmudm $v29, vpST, sSTS // Scale ST
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
vmadh vpST, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5
addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx
.endmacro
.endif
.align 8
// If lighting, vLoopRet = ltbasic_start_packed if packed, else ltbasic_start_standard
// Packed-normals entry to per-vertex basic lighting. Unpacks the three normal
// components from vpMdl into vpNrmlX/Y/Z using the mask in vLTC[0] and the shift
// multipliers in $v30[6] / $v30[7], loads the vertex color into lVCI, and joins
// the common path at ltbasic_after_start.
ltbasic_start_packed:
instan_lt_vec_1
instan_lt_vec_2
instan_lt_vec_3
vand vpNrmlX, vpMdl, vLTC[0] // 0xF800; mask X to only top 5 bits
luv lVCI[0], (tempVpRGBA)(rdpCmdBufEndP1) // Load RGBA
vmudn vpNrmlY, vpMdl, $v30[6] // (1 << 5) = 0x0020; left shift normals Y
j ltbasic_after_start
vmudn vpNrmlZ, vpMdl, $v30[7] // (1 << 11) = 0x0800; left shift normals Z (delay slot)
.align 8
// Standard (non-packed) entry to per-vertex basic lighting. Loads the normal
// components from the temp RGBA slot with three lpv's at different element
// offsets so that X, Y and Z each end up in elems 3 and 7 of their own register,
// loads the vertex color into lVCI, and falls through to ltbasic_after_start.
ltbasic_start_standard:
// Using elem 3, 7 for regular normals because packed normal results are there.
instan_lt_vec_1
lpv vpNrmlX[3], (tempVpRGBA)(rdpCmdBufEndP1) // X to elem 3, 7
instan_lt_vec_2
lpv vpNrmlY[2], (tempVpRGBA)(rdpCmdBufEndP1) // Y to elem 3, 7
instan_lt_vec_3
lpv vpNrmlZ[1], (tempVpRGBA)(rdpCmdBufEndP1) // Z to elem 3, 7
vnop
luv lVCI[0], (tempVpRGBA)(rdpCmdBufEndP1) // Load vertex color input
// Common body of per-vertex basic lighting for two vertices at once.
// ltbasic_after_start: computes the first light's dot product from the direction
//   held in vLTC, preloads the next direction into lDIR, and initializes
//   vpLtTot from the memory at lbFakeAmb.
// ltbasic_loop: walks curLight from ambLight down to altBaseReg; each pass clamps
//   the pending dot product, starts the dot product for the next light, and adds
//   (light color * clamped dot product) to vpLtTot.
// ltbasic_post: merges lit RGB with vertex alpha into vpRGBA and jumps to lbAfter.
ltbasic_after_start:
vmulf $v29, vpNrmlX, vLTC[4] // Normals X elems 3, 7 * first light dir X
// lDIR <- (NOC: -, Occ: sOTM)
lpv lDIR[0], (ltBufOfs + 8 - 2*lightSize)(ambLight) // Xfrmed dir in elems 4-6; temp reg
vmacf $v29, vpNrmlY, vLTC[5] // Normals Y elems 3, 7 * first light dir Y
luv vpLtTot, (0)(lbFakeAmb) // Total light level, init to ambient or zeros if AO
// lDOT <- (NOC: vpMdl, Occ: sCLZ)
vmacf lDOT, vpNrmlZ, vLTC[1] // Normals Z elems 3, 7 * first light dir Z
instan_lt_scl_1 // $11 can be used as a temporary, except b/w instan_lt_scl_1...
vsub lVCI, lVCI, $v30[2] // Offset alpha for AO, or 0 normally
instan_lt_scl_2 // ...and instan_lt_scl_2
// lCOL <- (Occ: sFOG here / NOC: sSCI earlier)
// vnop
beq ambLight, altBaseReg, ltbasic_post // Loop would run zero times; skip it
move curLight, ambLight // Point to ambient light
ltbasic_loop:
vge lDTC, lDOT, $v31[2] // 0; clamp dot product to >= 0
vmulf $v29, vpNrmlX, lDIR[4] // Normals X elems 3, 7 * next light dir
luv lCOL, (ltBufOfs + 0 - 1*lightSize)(curLight) // Light color
vmacf $v29, vpNrmlY, lDIR[5] // Normals Y elems 3, 7 * next light dir
addi curLight, curLight, -lightSize
vmacf lDOT, vpNrmlZ, lDIR[6] // Normals Z elems 3, 7 * next light dir
lpv lDIR[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6; DOES dual-issue
vmudh $v29, vOne, vpLtTot // Load accum mid with current light level
bne curLight, altBaseReg, ltbasic_loop
vmacf vpLtTot, lCOL, lDTC[3h] // + light color * dot product (delay slot; runs every pass)
ltbasic_post:
// (NOC: sFOG here / Occ: vpClpI later) <- lCOL
instan_lt_vs_45
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
jr lbAfter
// vpRGBA <- lDIR
vmrg vpRGBA, vpLtTot, lVCI // RGB = light, A = vtx alpha (delay slot of jr)
// Post-lighting handler chain. Each register holds the first applicable target:
// lbAfter = ltbasic_ao if AO else
// lbPostAo = ltbasic_l2a if L2A else
// ltbasic_packed if packed else
// lbTexgenOrRet = ltbasic_texgen if texgen else
// vtx_return_from_lighting
//
// Ambient occlusion: scales the summed directional light and the real ambient
// light by factors derived from the (offset) vertex alpha in lVCI and the AO
// constants in vLTC, undoes the alpha offset, and continues at lbPostAo with
// the result available in both vpLtTot and vpRGBA.
ltbasic_ao:
vmudn $v29, vLTC, lVCI[3h] // (aoAmb 2 6, aoDir 3 7) * (alpha - 1)
luv vpRGBA, (ltBufOfs + 0)(ambLight) // Ambient light level
vmadh lDTC, vOne, $v31[7] // + 0x7FFF (1 in s.15)
vadd lVCI, lVCI, $v31[7] // 0x7FFF; undo offset alpha
vmulf $v29, vpLtTot, lDTC[3h] // Sum of dir lights *= dir factor
vmacf vpLtTot, vpRGBA, lDTC[2h] // + ambient * amb factor
jr lbPostAo // Return, texgen, l2a, or packed
vmacf vpRGBA, $v31, $v31[2] // 0; need it in vpRGBA if returning, else in vpLtTot
// Light-to-alpha handler. Reduces the lit RGB in vpLtTot to its maximum
// component (left in elems 0 and 4), restores the VCC pattern that the vge's
// overwrote, and merges vertex color RGB with that maximum as alpha into
// vpRGBA. Continues at lbTexgenOrRet.
ltbasic_l2a:
// Light-to-alpha (cel shading): alpha = max of light components, RGB = vertex color
vge vpLtTot, vpLtTot, vpLtTot[1h] // elem 0 = max(R0, G0); elem 4 = max(R1, G1)
vge vpLtTot, vpLtTot, vpLtTot[2h] // elem 0 = max(R0, G0, B0); equiv for elem 4
vne $v29, $v31, $v31[3h] // Reset VCC to 11101110 (clobbered by vge)
jr lbTexgenOrRet
vmrg vpRGBA, lVCI, vpLtTot[0h] // RGB is vcol (garbage if not packed); A is light
// Packed-normals handler: multiplies the light total by the vertex color (the
// multiply sits in the branch delay slot, so it always executes). Returns to
// vtx_return_from_lighting unless lbTexgenOrRet is negative, in which case
// execution falls through into ltbasic_texgen.
ltbasic_packed:
bgez lbTexgenOrRet, vtx_return_from_lighting // < 0 for texgen
vmulf vpRGBA, vpLtTot, lVCI // (Light color, 7FFF alpha) * vertex RGBA.
// Texture coordinate generation from the normals and the two lookat directions.
// The normals are in elems 3, 7 of vpNrmlX/Y/Z (as in the lighting code above),
// so both dot products are produced in elems 3, 7 and then moved with [3h] into
// elems 0, 1, 4, 5. Standard texgen yields 0x4000 + dot * 0x4000 in vpST; if
// G_TEXTURE_GEN_LINEAR is set, a polynomial using the two texgenLinearCoeffs
// constants is evaluated instead. Both paths exit to vtx_return_from_texgen.
ltbasic_texgen:
// Texgen: in vpNrmlX:Y:Z; temps vpLtTot, lDOT, lDTC; out vpST.
lLkDrs equ lDTC // lighting Lookat Directions
lLkDt0 equ vpLtTot // lighting Lookat Dot product 0
lLkDt1 equ lDOT // lighting Lookat Dot product 1
lpv lLkDrs[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6
vmulf $v29, vpNrmlX, lLkDrs[0] // Normals X elems 3, 7 * lookat 0 X
vmacf $v29, vpNrmlY, lLkDrs[1] // Normals Y elems 3, 7 * lookat 0 Y
.if !CFG_NO_OCCLUSION_PLANE
addi outVtxBase, outVtxBase, -2*vtxSize // Undo doing this twice due to repeating ST scale
.endif
vmacf lLkDt0, vpNrmlZ, lLkDrs[2] // Normals Z elems 3, 7 * lookat 0 Z
vmulf $v29, vpNrmlX, lLkDrs[4] // Normals X elems 3, 7 * lookat 1 X
vmacf $v29, vpNrmlY, lLkDrs[5] // Normals Y elems 3, 7 * lookat 1 Y
vmacf lLkDt1, vpNrmlZ, lLkDrs[6] // Normals Z elems 3, 7 * lookat 1 Z
vmudh lLkDt0, vOne, lLkDt0[3h] // Move lookat 0 dot product to elem 0
lLkCns equ lLkDrs // lighting Lookat Constants
llv lLkCns[0], (texgenLinearCoeffs - altBase)(altBaseReg) // Two 16-bit coefficients to elems 0, 1
vne $v29, $v31, $v31[1h] // Set VCC to 10111011
andi $11, vGeomMid, G_TEXTURE_GEN_LINEAR >> 8
vmrg lLkDt0, lLkDt0, lLkDt1[3h] // Dot products in elements 0, 1, 4, 5
vmudh $v29, vOne, $v31[5] // 1 * 0x4000
beqz $11, vtx_return_from_texgen
vmacf vpST, lLkDt0, $v31[5] // + dot products * 0x4000 ( / 2)
// Texgen_Linear:
vmulf vpST, lLkDt0, $v31[5] // dot products * 0x4000 ( / 2)
lLkST2 equ lLkDt0 // lighting Lookat ST squared
vmulf lLkST2, vpST, vpST // ST squared
vmulf $v29, vpST, $v31[7] // Move ST to accumulator (0x7FFF = 1)
lLkTmp equ lLkDt1 // lighting Lookat Temp
vmacf lLkTmp, vpST, lLkCns[1] // + ST * 0x6CB3
vmudh $v29, vOne, $v31[5] // 1 * 0x4000
vmacf vpST, vpST, lLkCns[0] // + ST * 0x44D3
j vtx_return_from_texgen
vmacf vpST, lLkST2, lLkTmp // + ST squared * (ST + ST * coeff)
// End of overlay 2; ovl2_padded_end is the end address rounded up by .align 8.
ovl2_end:
.align 8
ovl2_padded_end:
// Rebase addressing so that the following overlay is assembled starting at
// ovl234_start (overlay 4 occupies the same IMEM addresses as overlays 2 and 3).
.headersize ovl234_start - orga()
ovl4_start:
// Advanced lighting overlay.
// Jump here for basic lighting setup. If overlay 4 is loaded (this code), loads overlay 2
// and jumps to right here, which is now in the new code.
ovl234_ltbasic_entrypoint_ovl4ver: // same IMEM address as ovl234_ltbasic_entrypoint
.if CFG_PROFILING_B
addi perfCounterC, perfCounterC, 1 // Count lighting overlay load
.endif
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
li cmd_w1_dram, orga(ovl2_start) // set up a load for overlay 2 (delay slot)
// Jump here for advanced lighting. If overlay 4 is loaded (this code), jumps
// to the instruction selection below.
// In this overlay the entry simply continues to vtx_load_mtx with $11 = mMatrix.
ovl234_ltadv_entrypoint:
.if CFG_PROFILING_B
nop // Needs to take up the space for the other perf counter
.endif
j vtx_load_mtx
li $11, mMatrix // (delay slot)
// Jump here for clipping and rare commands. If overlay 4 is loaded (this code), loads overlay 3
// and jumps to right here, which is now in the new code.
// The caller's $ra is saved to tempTriRA first because the jal below overwrites it.
ovl234_clipmisc_entrypoint_ovl4ver: // same IMEM address as ovl234_clipmisc_entrypoint
sh $ra, tempTriRA // Tri return after clipping
.if CFG_PROFILING_B
addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load
.endif
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
li cmd_w1_dram, orga(ovl3_start) // set up a load for overlay 3 (delay slot)
// Advanced lighting setup, part 1. Copies the vertex input pointer and remaining
// count into the advanced-lighting registers, builds the packed-normals scale
// constants, and transforms the vector (0, 0x7FFF, 0) by vMTX1 into aDWF:aDWI
// before jumping to ltadv_normalize.
// NOTE(review): registers named TODO* look like placeholders awaiting register
// allocation; presumably control comes back at ltadv_continue_setup via the
// placeholder branch inside ltadv_normalize - confirm.
ltadv_after_mtx:
move laPtr, inVtx
vcopy TODOpackedscales, vOne
move laVtxLeft, vtxLeft
vmudn aDWF, vMTX1F, $v31[7] // 0x7FFF; transform a normal (0, 7FFF, 0)
// 0001 00[20 0800 XX]00 = (1<<0),(1<<5),(1<<11),XX
llv TODOpackedscales[3], (packedNormalsConstants - altBase)(altBaseReg)
vmadh aDWI, vMTX1I, $v31[7]
j ltadv_normalize
llv TODOpackedscales[11], (packedNormalsConstants - altBase)(altBaseReg) // (delay slot) same constants for the upper half
// Advanced lighting setup, part 2. Extracts the light-to-alpha and texgen flags
// from vGeomMid, saves the 1/length scale left in $v29 by ltadv_normalize, and
// loads the effect parameters while waiting on DMA. The jal's return address is
// the first instruction of ltadv_vtx_loop (assuming while_wait_dma_busy returns
// with jr $ra - its body is not visible here).
ltadv_continue_setup:
andi laL2A, vGeomMid, G_LIGHTTOALPHA >> 8
andi laTexgen, vGeomMid, G_TEXTURE_GEN >> 8
vcopy TODOnormalscale, $v29 // $v29[0:1] is int:frac scale (1 / length)
jal while_wait_dma_busy
lqv TODOparams, (fxParams - altBase)(altBaseReg) // AO, texgen, and Fresnel params
// Advanced lighting, per pair of input vertices.
// 1. Loads both model positions into aMDL (vtx 1 in elems 0-3, vtx 2 in 4-7) and
//    temporarily places vtx 2's RGBA word in vtx 1's ST slot (vtx 1's ST is kept
//    in laSTKept and restored in ltadv_post), so a single 8-byte vector access
//    covers both vertices' color / normal bytes.
// 2. Calls ltadv_xfrm twice: first on the positions, then on the normals.
// 3. Negates the world position and, if specular or fresnel is enabled, goes to
//    ltadv_normal_to_vertex with the camera position; otherwise to ltadv_pre_loop.
ltadv_vtx_loop:
ldv aMDL[0], (VTX_IN_OB + 0 * inputVtxSize)(laPtr) // Model pos
ldv aMDL[8], (VTX_IN_OB + 1 * inputVtxSize)(laPtr)
lw $11, (VTX_IN_CN + 1 * inputVtxSize)(laPtr) // Vtx 2 RGBA
lw laSTKept,(VTX_IN_TC + 0 * inputVtxSize)(laPtr) // Vtx 1 ST
jal ltadv_xfrm
sw $11, (VTX_IN_TC + 0 * inputVtxSize)(laPtr) // Vtx 2 RGBA -> Vtx 1 ST
vmadn vpWrlF, vM3F, vOne // Finish vertex pos transform
addi laPtr, laPtr, 2 * inputVtxSize
vmadh vpWrlI, vM3I, vOne
addi laVtxLeft, laVtxLeft, -2 * inputVtxSize
vsub TODOoffsetA, vpRGBA, $v31[7] // 0x7FFF; offset alpha
andi laPacked, vGeomMid, G_PACKED_NORMALS >> 8
vmudh aMDL, TODOpackedscales, aMDL[3h]
beqz laPacked, @@skip_packed
luv vpLtTot, (ltBufOfs + 0)(curLight) // Total light level, init to ambient
// NOTE(review): this load runs only when laPacked is nonzero although its
// comment says "regular normals", and laPtr has already been advanced by
// 2 * inputVtxSize above, so the offset is relative to the next vertex pair
// (ltadv_post uses "- 2 * inputVtxSize" for the current pair) - confirm both.
lpv aMDL, (VTX_IN_TC + 0 * inputVtxSize)(laPtr) // Vtx 2:1 regular normals
@@skip_packed:
vmudh $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
jal ltadv_xfrm
vmadm aAOF, TODOoffsetA, $v30[0] // + (alpha - 1) * aoAmb factor; elems 3, 7
vmulf vpLtTot, vpLtTot, aAOF[3h] // light color *= ambient factor
sll laSpecular, vGeomMid, (31 - 5) // G_LIGHTING_SPECULAR to sign bit
vmudn $v29, aDWF, TODOnormalscale[0h] // Vec frac * int scaling, discard result
andi laSpecFres, vGeomMid, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
vmadm $v29, aDWI, TODOnormalscale[1h] // Vec int * frac scaling, discard result
vmadh vpWNrm, aDWI, TODOnormalscale[0h] // Vec int * int scaling
vmudn vpWrlF, vpWrlF, $v31[1] // -1; negate world pos so add light/cam pos to it
beqz laSpecFres, ltadv_pre_loop // Not specular or fresnel
vmadh vpWrlI, vpWrlI, $v31[1] // -1
// Get aDIR = normalize(camera - vertex), aDOT = (vpWNrm dot aDIR)
ldv aDWI[0], (cameraWorldPos - altBase)(altBaseReg) // Camera world pos
j ltadv_normal_to_vertex
ldv aDWI[8], (cameraWorldPos - altBase)(altBaseReg) // (delay slot) same position for the second vertex
// Reached from ltadv_normalize when laSpecFres is set, i.e. after the
// camera-to-vertex direction (aDIR) and its dot product with the normal were
// computed. Saves the dot product for fresnel, clears laSpecFres so later
// passes through ltadv_normalize take the point-light path, and for specular
// lighting replaces vpWNrm with 2 * (projection onto normal) - aDIR.
ltadv_after_camera:
// If specular, replace vpWNrm with reflected vector
vcopy TODOfresneldot, aDOT // Dot product for fresnel
bgez laSpecular, ltadv_pre_loop // Sign bit clear = not specular
li laSpecFres, 0 // Clear flag for specular or fresnel
vmulf aSCL, vpWNrm, aDOT[0h] // Projection of camera vec onto normal
vmudh $v29, aDIR, $v31[1] // -camera vec
vmadh vpWNrm, aSCL, $v31[3] // + 2 * projection
// Advanced lighting light loop.
// ltadv_pre_loop: broadcasts each component of vpWNrm into its own register.
// ltadv_loop: for the light at curLight, computes the AO factor, then either
//   leaves for ltadv_post when curLight == altBaseReg, goes to ltadv_point when
//   the light's type byte is nonzero, or computes a directional dot product
//   (optionally reshaped by ltadv_specular when laSpecular's sign bit is set).
// ltadv_finish_light: scales the light color by the AO factor, clamps the dot
//   product to >= 0, accumulates into vpLtTot and steps to the next light.
ltadv_pre_loop:
vmudh vpNrmlX, vOne, vpWNrm[0h] // Move normals to separate registers
vmudh vpNrmlY, vOne, vpWNrm[1h] // per component, in elems 0-3, 4-7
vmudh vpNrmlZ, vOne, vpWNrm[2h]
ltadv_loop:
vmudh $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
lbu $11, (ltBufOfs + 3 - lightSize)(curLight) // Light type / constant attenuation
// NOTE(review): the ambient factor in ltadv_vtx_loop uses TODOoffsetA (alpha
// minus 0x7FFF) while this line uses vpRGBA directly - confirm intended source.
vmadm aAOF, vpRGBA, $v30[1] // + (alpha - 1) * aoDir factor; elems 3, 7
beq curLight, altBaseReg, ltadv_post
lpv aDOT[0], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 0-2
bnez $11, ltadv_point
luv aLTC, (ltBufOfs + 0 - lightSize)(curLight) // Light color
vmulf $v29, vpNrmlX, aDOT[0]
vmacf $v29, vpNrmlY, aDOT[1]
bltzal laSpecular, ltadv_specular // Call only when the specular bit (sign) is set
vmacf aDOT, vpNrmlZ, aDOT[2] // (delay slot) finishes the dot product either way
ltadv_finish_light:
vmulf aLTC, aLTC, aAOF[3h] // light color *= dir or point light factor
vge aDOT, aDOT, $v31[2] // 0; clamp dot product to >= 0
addi curLight, curLight, -lightSize
vmudh $v29, vOne, vpLtTot // Load accum mid with current light level
j ltadv_loop
vmacf vpLtTot, aLTC, aDOT[0h] // + light color * dot product
// Subroutine: multiplies the vector(s) in aMDL by matrix rows vM0..vM2
// (frac/int pairs), leaving the result in aDWF:aDWI and in the accumulator so
// the caller can continue the chain (ltadv_vtx_loop adds vM3 after the first
// call). Interleaved scalar/load work, repeated on every call: curLight is set
// to altBase + the byte at numLightsxSize, and vpRGBA is loaded from the
// current laPtr-relative TC slot. Returns with jr $ra.
ltadv_xfrm:
vmudn $v29, vM0F, aMDL[0h]
lbu curLight, numLightsxSize // Scalar instructions here must be OK to do twice
vmadh $v29, vM0I, aMDL[0h]
luv vpRGBA, (VTX_IN_TC + 0 * inputVtxSize)(laPtr) // Vtx 2:1 RGBA
vmadn $v29, vM1F, aMDL[1h]
vmadh $v29, vM1I, aMDL[1h]
addi curLight, curLight, altBase // Point to ambient light
vmadn aDWF, vM2F, aMDL[2h]
jr $ra
vmadh aDWI, vM2I, aMDL[2h] // (delay slot) final term; result int part in aDWI
// Subroutine (called with bltzal when specular is enabled): reshapes the dot
// product in aDOT as 0x7FFF - ((0x7FFF - dot) * size factor), where the size
// factor is the signed byte at offset 0xF of the current light. Uses $11 and
// aAOF[0] as temporaries. Returns with jr $ra.
ltadv_specular:
lb $11, (ltBufOfs + 0xF - lightSize)(curLight) // Light size factor
mtc2 $11, aAOF[0] // Light size factor
vxor aDOT, aDOT, $v31[7] // = 0x7FFF - dot product
vmudh aDOT, aDOT, aAOF[0] // * size factor
jr $ra
vxor aDOT, aDOT, $v31[7] // = 0x7FFF - result
// End of the light loop for a vertex pair: combines the accumulated light in
// vpLtTot with the vertex color / alpha in vpRGBA, applies the optional cel
// (light-to-alpha) and fresnel effects, stores the resulting RGBA for both
// vertices through the vtx 1 ST slot, and then restores the input vertex data:
// vtx 1's real ST from laSTKept and the lit vtx 2 RGBA into vtx 2's color slot.
// Loops back to ltadv_vtx_loop while laVtxLeft > 0, otherwise continues at
// vtx_setup_no_lighting. laPtr has already been advanced past this pair, hence
// the "- 2 * inputVtxSize" / "- 1 * inputVtxSize" offsets.
ltadv_post:
vmulf aCLO, vpRGBA, vpLtTot // RGB output is RGB * light
beqz laL2A, @@skip_cel
vcopy aALO, vpRGBA // Alpha output = vertex alpha (only 3, 7 matter)
// Cel: alpha = max of light components, RGB = vertex color
vge aALO, vpLtTot, vpLtTot[1h] // elem 0 = max(R0, G0); elem 4 = max(R1, G1)
vge aALO, aALO, aALO[2h] // elem 0 = max(R0, G0, B0); equiv for elem 4
vcopy aCLO, vpRGBA // RGB output is vertex color
vmudh aALO, vOne, aALO[0h] // move light level elem 0, 4 to 3, 7
@@skip_cel:
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
bnez laPacked, @@skip_novtxcolor
andi $11, vGeomMid, (G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8 // (delay slot) tested at @@skip_novtxcolor
vcopy aCLO, vpLtTot // If no packed normals, base output is just light
@@skip_novtxcolor:
beqz $11, @@skip_fresnel
vmrg vpRGBA, aCLO, aALO // Merge base output and alpha output
// Fresnel: dot product in vpWNrm[3h]. Also valid rest of vpWNrm for texgen,
// aLkDt0, vpRGBA. Available: aDOT, aSCL, aDIR.
lsv aSCL[0], (vTRC_0100_addr - altBase)(altBaseReg) // 0x0100 to aSCL[0]
vabs aDOT, vpWNrm, vpWNrm // Absolute value of dot product for underwater
andi $11, vGeomMid, G_FRESNEL_COLOR >> 8
vmudh $v29, vOne, $v30[7] // Fresnel offset
vmacf aDOT, aDOT, $v30[6] // + factor * scale
beqz $11, @@skip
vmudh aDOT, aDOT, aSCL[0] // Result * 0x0100, clamped to 0x7FFF
veq $v29, $v31, $v31[3h] // Set VCC to 00010001 if G_FRESNEL_COLOR
@@skip:
vmrg vpRGBA, vpRGBA, aDOT[3h] // Replace color or alpha with fresnel
vge vpRGBA, vpRGBA, $v31[2] // Clamp to >= 0 for fresnel; doesn't affect others
@@skip_fresnel:
beqz laTexgen, @@skip_texgen // no texgen
suv vpRGBA, (VTX_IN_TC - 2 * inputVtxSize)(laPtr) // Vtx 2:1 RGBA
// Texgen: aLkDt0, vpWNrm, vpRGBA; output vpST
/*
// aLkDt0 <- vpLtTot
vmudh $v29, vOne, aLkDt0[0h]
vmulf aLkDt0, vpWNrm, aDOT // Normal * lookat 0 dir
// aLkDt1 <- TODO
lpv aLkDt1[4], (ltBufOfs + 0 - lightSize)(curLight) // Lookat 1 dir in elems 0-2
vmadh $v29, vOne, aLkDt0[1h]
// aL2F <- aALO
lpv aL2F[0], (ltBufOfs + 8 - lightSize)(curLight) // Lookat 1 dir in elems 4-6
vmadh aLkDt0, vOne, aLkDt0[2h] // aLkDt0 = dot product 0
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
vmrg aLkDt1, aLkDt1, aL2F // aLkDt1 = lookat 1 dir
vmulf aLkDt1, vpWNrm, aLkDt1 // Normal * lookat 1 dir
vmudh $v29, vOne, aLkDt1[0h]
vmadh $v29, vOne, aLkDt1[1h]
vmadh aLkDt1, vOne, aLkDt1[2h]
// TODO Rest of texgen
*/
@@skip_texgen:
lw $11, (VTX_IN_TC - 2 * inputVtxSize)(laPtr) // Vtx 2 RGBA from vtx 1 ST slot
bltz laVtxLeft, vtx_setup_no_lighting // Odd count: vtx 2 was not real, skip its write-back
sw laSTKept, (VTX_IN_TC - 2 * inputVtxSize)(laPtr) // Real vtx 1 ST
bgtz laVtxLeft, ltadv_vtx_loop
sw $11, (VTX_IN_CN - 1 * inputVtxSize)(laPtr) // Real vtx 2 RGBA
j vtx_setup_no_lighting
// Explicit delay slot so the first instruction of ltadv_point is not executed
// on the way out.
nop // TODO make sure delay slot is OK
// Point light: entered from ltadv_loop when the light's type byte is nonzero.
// Loads the 8 bytes at the light's direction/position slot into both halves of
// aDWI (one copy per vertex) and falls through to ltadv_normal_to_vertex.
// The block comment below records worked examples of the fixed-point ranges
// produced by the normalization.
ltadv_point:
/*
Input vector 1 elem size 7FFF.0000 -> len^2 3FFF0001 -> 1/len 0001.0040 -> vec +801E.FFC0 -> clamped 7FFF
len^2 * 1/len = 400E.FFC1 so about half actual length
Input vector 1 elem size 0100.0000 -> len^2 00010000 -> 1/len 007F.FFC0 -> vec 7FFF.C000 -> clamped 7FFF
len^2 * 1/len = 007F.FFC0 so about half actual length
Input vector 1 elem size 0010.0000 -> len^2 00000100 -> 1/len 07FF.FC00 -> vec 7FFF.C000
Input vector 1 elem size 0001.0000 -> len^2 00000001 -> 1/len 7FFF.C000 -> vec 7FFF.C000
*/
ldv aDWI[0], (ltBufOfs + 8 - lightSize)(curLight) // Light position int part 0-3
ldv aDWI[8], (ltBufOfs + 8 - lightSize)(curLight) // 4-7
// Shared entry for point lights (fall-through from ltadv_point) and for the
// camera vector (jumped to from ltadv_vtx_loop). Adds the negated world
// position (vpWrlI) to the input point in aDWI, loads the point light's linear
// and quadratic attenuation bytes into $20 / $24, and falls through to
// ltadv_normalize.
ltadv_normal_to_vertex:
// This reused for fresnel; scalar unit stuff all garbage in that case
// Input point (light / camera) in aDWI; computes aDIR = normalize(input - vertex),
// aDWI = (vpWNrm dot aDIR), and aSCL = scale on dot product
// Uses temps aDWF, aL2I, aL2F, $v29
lbu $20, (ltBufOfs + 7 - lightSize)(curLight) // PL: Linear factor
vadd aDWI, aDWI, vpWrlI // Both: Int
lbu $24, (ltBufOfs + 0xE - lightSize)(curLight) // PL: Quadratic factor
// Normalizes a vector and, for point lights, evaluates the attenuation.
// Vector side: squares and sums the components, takes the reciprocal square
// root into $v29[0h:1h] and scales the vector into aDIR.
// Scalar side (interleaved): turns the constant ($11), linear ($20) and
// quadratic ($24) attenuation bytes into fixed-point factors and moves them
// into spare vector elements with mtc2; the quadratic byte is decoded as a
// 3-bit shift count plus a 5-bit mantissa with an implied leading 1.
// Exits: ltadv_continue_setup (setup call, via the placeholder branch),
// ltadv_after_camera (when laSpecFres is set), or ltadv_finish_light (point light).
ltadv_normalize:
// Normalize vector in aDWI:vpWrlF i/f, output in aDIR. Secondary outputs for
// point lighting in $v29[0h:1h] and aL2I[0h]. Also uses temps aL2F, $11, $20, $24
// Doing point light scalar stuff too.
// Also overwrites vpLtTot elems 3, 7
// NOTE(review): the squaring below takes the fraction from vpWrlF while the
// scaling further down multiplies aDWF, and ltadv_after_mtx supplies its input
// in aDWF:aDWI - confirm which register is meant to hold the fractional part.
vmudm $v29, aDWI, vpWrlF // Squared. Don't care about frac*frac term
sll $11, $11, 8 // Constant factor, 00000100 - 0000FF00
vmadn $v29, vpWrlF, aDWI
sll $20, $20, 6 // Linear factor, 00000040 - 00003FC0
vmadh $v29, aDWI, aDWI
vreadacc aL2F, ACC_MIDDLE
vreadacc aL2I, ACC_UPPER
mtc2 $11, vpLtTot[6] // Constant frac part in elem 3
// vnop; vnop
vmudm $v29, vOne, aL2F[2h] // Sum of squared components
vmadh $v29, vOne, aL2I[2h]
srl $11, $24, 5 // Top 3 bits
vmadm $v29, vOne, aL2F[1h]
mtc2 $20, vpLtTot[14] // Linear frac part in elem 7
vmadh $v29, vOne, aL2I[1h]
andi $20, $24, 0x1F // Bottom 5 bits
vmadn aL2F, aL2F, vOne // elem 0; swapped so we can do vmadn and get result
ori $20, $20, 0x20 // Append leading 1 to mantissa
vmadh aL2I, aL2I, vOne
sllv $20, $20, $11 // Left shift to create floating point
// vnop; vnop; vnop
vrsqh $v29[2], aL2I[0] // High input, garbage output
sll $20, $20, 8 // Min range 00002000, 00002100... 00003F00, max 00100000...001F8000
vrsql $v29[1], aL2F[0] // Low input, low output
bnez $24, @@skip // If original value is zero, set to zero
vrsqh $v29[0], aL2I[4] // High input, high output
li $20, 0
@@skip:
vrsql $v29[5], aL2F[4] // Low input, low output
vrsqh $v29[4], $v31[2] // 0 input, high output
mtc2 $20, aL2I[6] // Quadratic frac part in elem 3
// vnop; vnop; vnop
vmudn aDWF, aDWF, $v29[0h] // Vec frac * int scaling, discard result
srl $20, $20, 16
vmadm aDWF, aDWI, $v29[1h] // Vec int * frac scaling, discard result
// NOTE(review): "TODO" is a placeholder operand; the register that marks the
// setup call from ltadv_after_mtx has not been chosen yet.
bgez TODO, ltadv_continue_setup
vmadh aDIR, aDWI, $v29[0h] // Vec int * int scaling
// aDIR = normalized vector from vertex to light, $v29[0h:1h] = 1/len, aL2I:F[0h] = len^2
// aSCL <- aDWF
vmudm aSCL, aL2I, $v29[1h] // PL: len^2 int * 1/len frac
vmadn aSCL, aL2F, $v29[0h] // PL: len^2 frac * 1/len int = len frac
vmadh $v29, aL2I, $v29[0h] // PL: len^2 int * 1/len int = len int
vmulf aDWI, vpNrmlX, aDIR[0h] // Both: Normalized light dir * normalized normals
mtc2 $20, aL2I[14] // PL: Quadratic int part in elem 7
vmacf aDWI, vpNrmlY, aDIR[1h]
bnez laSpecFres, ltadv_after_camera // Set if computing specular or fresnel
vmacf aDWI, vpNrmlZ, aDIR[2h]
vmudl aSCL, aSCL, vpLtTot[7] // len frac * linear factor frac
vmadm aSCL, $v29, vpLtTot[7] // + len int * linear factor frac
vmadm aSCL, vOne, vpLtTot[3] // + 1 * constant factor frac
vmadl aSCL, aL2F, aL2I[3] // + len^2 frac * quadratic factor frac
vmadm aSCL, aL2I, aL2I[3] // + len^2 int * quadratic factor frac
vmadn $v29, aL2F, aL2I[7] // + len^2 frac * quadratic factor int = $v29 frac
bltzal laSpecular, ltadv_specular
vmadh aL2I, aL2I, aL2I[7] // + len^2 int * quadratic factor int = aL2I int
vmudh $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
vmadm aAOF, vpRGBA, $v30[2] // + (alpha - 1) * aoPoint factor; elems 3, 7
vrcph aSCL[1], aL2I[0] // 1/(2*light factor), input of 0000.8000 -> no change normals
// aLTC <- aL2F
luv aLTC, (ltBufOfs + 0 - lightSize)(curLight) // aLTC = light color
vrcpl aSCL[2], $v29[0] // Light factor 0001.0000 -> normals /= 2
vrcph aSCL[3], aL2I[4] // Light factor 0000.1000 -> normals *= 8 (with clamping)
vrcpl aSCL[6], $v29[4] // Light factor 0010.0000 -> normals /= 32
vrcph aSCL[7], $v31[2] // 0
// This is a scale on the dot product, not the light, because the scale can
// increase a small dot product (close to perpendicular), while it can't
// increase a light beyond white.
vmudm $v29, aDOT, aSCL[2h] // Dot product int * scale frac
j ltadv_finish_light
vmadh aDOT, aDOT, aSCL[3h] // Dot product int * scale int, clamp to 0x7FFF
// aAOF <- aL2I
// Optional debug visualization: outputs 0x4000 + 0.5 * normal as the vertex
// color. Disabled by default (the equ below is 0), so the block is not
// assembled; its jump target is still the placeholder "TODO" and must be filled
// in before enabling it.
CFG_DEBUG_NORMALS equ 0 // Can manually enable here
.if CFG_DEBUG_NORMALS
.warning "Debug normals visualization is enabled"
vmudh $v29, vOne, $v31[5] // 0x4000; middle gray
j TODO
vmacf vpRGBA, vpWNrm, $v31[5] // 0x4000; + 0.5 * normal
.endif
// End of overlay 4; ovl4_padded_end is the end address rounded up by .align 8.
ovl4_end:
.align 8
ovl4_padded_end:
.close // CODE_FILE