mirror of
https://github.com/HackerN64/F3DEX3.git
synced 2026-01-21 10:37:45 -08:00
3699 lines
173 KiB
ArmAsm
3699 lines
173 KiB
ArmAsm
.rsp
|
|
|
|
.include "rsp/rsp_defs.inc"
|
|
.include "rsp/gbi.inc"
|
|
|
|
// This file assumes DATA_FILE and CODE_FILE are set on the command line
|
|
|
|
.if version() < 110
|
|
.error "armips 0.11 or newer is required"
|
|
.endif
|
|
|
|
// Sign-extends the immediate using addi. ori would zero-extend.
|
|
.macro li, reg, imm
|
|
addi reg, $zero, imm
|
|
.endmacro
|
|
|
|
.macro move, dst, src
|
|
ori dst, src, 0
|
|
.endmacro
|
|
|
|
// Prohibit macros involving slt; this silently clobbers $1. You can of course
|
|
// manually write the slt and branch instructions if you want this behavior.
|
|
.macro blt, ra, rb, lbl
|
|
.error "blt is a macro using slt, and silently clobbers $1!"
|
|
.endmacro
|
|
|
|
.macro bgt, ra, rb, lbl
|
|
.error "bgt is a macro using slt, and silently clobbers $1!"
|
|
.endmacro
|
|
|
|
.macro ble, ra, rb, lbl
|
|
.error "ble is a macro using slt, and silently clobbers $1!"
|
|
.endmacro
|
|
|
|
.macro bge, ra, rb, lbl
|
|
.error "bge is a macro using slt, and silently clobbers $1!"
|
|
.endmacro
|
|
|
|
// This version doesn't depend on $v0 to be vZero, which it often is not in
|
|
// F3DEX3, and also doesn't get corrupted if $vco is set / consume $vco which
|
|
// may be needed for a subsequent instruction.
|
|
.macro vcopy, dst, src
|
|
vor dst, src, src
|
|
.endmacro
|
|
|
|
// Using $v31 instead of dst as the source because $v31 doesn't change, whereas
|
|
// dst might have been modified 2 or 3 cycles ago, causing a stall.
|
|
.macro vclr, dst
|
|
vxor dst, $v31, $v31
|
|
.endmacro
|
|
|
|
// Also using $v31 for the dummy args here to avoid stalls. dst was once written
|
|
// in vanilla tri code just before reading (should have been $v29), leading to
|
|
// stalls!
|
|
ACC_UPPER equ 0
|
|
ACC_MIDDLE equ 1
|
|
ACC_LOWER equ 2
|
|
.macro vreadacc, dst, N
|
|
vsar dst, $v31, $v31[N]
|
|
.endmacro
|
|
|
|
//
|
|
// Profiling configurations. To make space for the profiling features, if any of
|
|
// the profiling configurations are enabled, G_LIGHTTORDP and !G_SHADING_SMOOTH
|
|
// are removed, i.e. G_LIGHTTORDP behaves as a no-op and all tris are smooth
|
|
// shaded.
|
|
//
|
|
|
|
// Profiling Configuration A
|
|
// perfCounterA:
|
|
// cycles RSP spent processing vertex commands (incl. vertex DMAs)
|
|
// perfCounterB:
|
|
// upper 16 bits: fetched DL command count
|
|
// lower 16 bits: DL command count
|
|
// perfCounterC:
|
|
// cycles RSP was stalled because RDP FIFO was full
|
|
// perfCounterD:
|
|
// cycles RSP spent processing triangle commands, NOT including buffer flushes
|
|
.if CFG_PROFILING_A
|
|
.if CFG_PROFILING_B || CFG_PROFILING_C
|
|
.error "At most one CFG_PROFILING_ option can be enabled at a time"
|
|
.endif
|
|
ENABLE_PROFILING equ 1
|
|
COUNTER_A_UPPER_VERTEX_COUNT equ 0
|
|
COUNTER_B_LOWER_CMD_COUNT equ 1
|
|
COUNTER_C_FIFO_FULL equ 1
|
|
|
|
// Profiling Configuration B
|
|
// perfCounterA:
|
|
// upper 16 bits: vertex count
|
|
// lower 16 bits: lit vertex count
|
|
// perfCounterB:
|
|
// upper 18 bits: tris culled by occlusion plane count
|
|
// lower 14 bits: clipped (input) tris count
|
|
// perfCounterC:
|
|
// upper 18 bits: overlay (all 0-4) load count
|
|
// lower 14 bits: overlay 2 (lighting) load count
|
|
// perfCounterD:
|
|
// upper 18 bits: overlay 3 (clipping) load count
|
|
// lower 14 bits: overlay 4 (misc) load count
|
|
.elseif CFG_PROFILING_B
|
|
.if CFG_PROFILING_C
|
|
.error "At most one CFG_PROFILING_ option can be enabled at a time"
|
|
.endif
|
|
ENABLE_PROFILING equ 1
|
|
COUNTER_A_UPPER_VERTEX_COUNT equ 1
|
|
COUNTER_B_LOWER_CMD_COUNT equ 0
|
|
COUNTER_C_FIFO_FULL equ 0
|
|
|
|
// Profiling Configuration C
|
|
// perfCounterA:
|
|
// cycles RSP believes it was running (this ucode only)
|
|
// perfCounterB:
|
|
// upper 16 bits: samples GCLK was alive (sampled once per DL command count)
|
|
// lower 16 bits: DL command count
|
|
// perfCounterC:
|
|
// upper 18 bits: small RDP command count (all RDP cmds except tris)
|
|
// lower 14 bits: matrix loads count
|
|
// perfCounterD:
|
|
// cycles RSP was stalled waiting for miscellaneous DMAs to finish
|
|
.elseif CFG_PROFILING_C
|
|
ENABLE_PROFILING equ 1
|
|
COUNTER_A_UPPER_VERTEX_COUNT equ 0
|
|
COUNTER_B_LOWER_CMD_COUNT equ 1
|
|
COUNTER_C_FIFO_FULL equ 0
|
|
|
|
// Default (extra profiling disabled)
|
|
// perfCounterA:
|
|
// upper 16 bits: vertex count
|
|
// lower 16 bits: RDP/out tri count
|
|
// perfCounterB:
|
|
// upper 18 bits: RSP/in tri count
|
|
// lower 14 bits: tex/fill rect count
|
|
// perfCounterC:
|
|
// cycles RSP was stalled because RDP FIFO was full
|
|
// perfCounterD:
|
|
// unused/zero
|
|
.else
|
|
ENABLE_PROFILING equ 0
|
|
COUNTER_A_UPPER_VERTEX_COUNT equ 1
|
|
COUNTER_B_LOWER_CMD_COUNT equ 0
|
|
COUNTER_C_FIFO_FULL equ 1
|
|
|
|
.endif
|
|
|
|
CFG_DEBUG_NORMALS equ 0 // Can manually enable here
|
|
|
|
// Only raise a warning in base modes; in profiling modes, addresses will be off
|
|
.macro warn_if_base, warntext
|
|
.if !ENABLE_PROFILING
|
|
.warning warntext
|
|
.endif
|
|
.endmacro
|
|
|
|
.macro align_with_warning, alignment, warntext
|
|
.if (. & (alignment - 1))
|
|
warn_if_base warntext
|
|
.endif
|
|
.align alignment
|
|
.endmacro
|
|
|
|
/*
|
|
There are two different memory spaces for the overlays: (a) IMEM and (b) the
|
|
microcode file (which, plus an offset, is also the location in DRAM).
|
|
|
|
A label marks both an IMEM addresses and a file address, but evaluating the
|
|
label in an integer context (e.g. in a branch) gives the IMEM address.
|
|
`orga(your_label)` gets the file address of the label, and `.orga` sets the
|
|
file address.
|
|
`.headersize`, as well as the value after `.create`, sets the difference
|
|
between IMEM addresses and file addresses, so you can set the IMEM address
|
|
with `.headersize desired_imem_addr - orga()`.
|
|
|
|
In IMEM, the whole microcode is organized as (each row is the same address):
|
|
|
|
0x80 space | |
|
|
for boot code Overlay 0 Overlay 1
|
|
(End (More cmd
|
|
start task) handlers)
|
|
(initialization) | |
|
|
|
|
Rest command handlers
|
|
Vertex start
|
|
All tri write cmds
|
|
|
|
Overlay 2 Overlay 3 Overlay 4
|
|
(Basic lighting) (Clipping, (Advanced
|
|
rare cmds) lighting)
|
|
|
|
Main vertex write
|
|
|
|
DMA code
|
|
|
|
In the file, the microcode is organized as:
|
|
start (file addr 0x0 = IMEM 0x1080)
|
|
Many command handlers
|
|
Overlay 3
|
|
Vertex and tri handlers
|
|
DMA code (end of this = IMEM 0x2000 = file 0xF80)
|
|
Overlay 0
|
|
Overlay 1
|
|
Overlay 2
|
|
Overlay 4
|
|
*/
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//////////////////////////////////// DMEM //////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// RSP DMEM
|
|
.create DATA_FILE, 0x0000
|
|
|
|
/*
|
|
Matrices are stored and used in a transposed format compared to how they are
|
|
normally written in mathematics. For the integer part:
|
|
00 02 04 06 typical Xscl Rot Rot 0
|
|
08 0A 0C 0E use: Rot Yscl Rot 0
|
|
10 12 14 16 Rot Rot Zscl 0
|
|
18 1A 1C 1E Xpos Ypos Zpos 1
|
|
The fractional part comes next and is in the same format.
|
|
Applying this transformation is done by multiplying a row vector times the
|
|
matrix, like:
|
|
X Y Z 1 * Xscl Rot Rot 0 = NewX NewY NewZ 1
|
|
Rot Yscl Rot 0
|
|
Rot Rot Zscl 0
|
|
Xpos Ypos Zpos 1
|
|
In C, the matrix is accessed as matrix[row][col], and the vector is vector[row].
|
|
*/
|
|
// 0x0000-0x0040: model matrix
|
|
mMatrix:
|
|
.fill 0x40
|
|
|
|
// 0x0040-0x0080: view * projection matrix
|
|
vpMatrix:
|
|
.fill 0x40
|
|
|
|
// model * (view * projection) matrix
|
|
mvpMatrix:
|
|
.fill 0x40
|
|
|
|
.if . != 0x00C0
|
|
.error "Scissor and othermode must be at 0x00C0 for S2DEX"
|
|
.endif
|
|
|
|
// scissor (four 12-bit values)
|
|
scissorUpLeft: // the command byte is included since the command word is copied verbatim
|
|
.dw (G_SETSCISSOR << 24) | (( 0 * 4) << 12) | (( 0 * 4) << 0)
|
|
scissorBottomRight:
|
|
.dw ((320 * 4) << 12) | ((240 * 4) << 0)
|
|
|
|
// othermode
|
|
otherMode0: // command byte included, same as above
|
|
.dw (G_RDPSETOTHERMODE << 24) | (0x080CFF)
|
|
otherMode1:
|
|
.dw 0x00000000
|
|
|
|
// Saved texrect state for combining the multiple input commands into one RDP texrect command
|
|
texrectWord1:
|
|
.fill 4 // first word, has command byte, xh and yh
|
|
texrectWord2:
|
|
.fill 4 // second word, has tile, xl, yl
|
|
|
|
// First half of RDP value for split commands
|
|
rdpHalf1Val:
|
|
.fill 4
|
|
|
|
activeClipPlanes:
|
|
.dh CLIP_SCAL_NPXY | CLIP_CAMPLANE // Normal tri write, set to zero when clipping
|
|
|
|
// displaylist stack length
|
|
displayListStackLength:
|
|
.db 0x00 // starts at 0, increments by 4 for each "return address" pushed onto the stack
|
|
|
|
unused1:
|
|
.db 0
|
|
|
|
// viewport
|
|
viewport:
|
|
.fill 16
|
|
|
|
// Current RDP fifo output position
|
|
rdpFifoPos:
|
|
.fill 4
|
|
|
|
matrixStackPtr:
|
|
.dw 0x00000000
|
|
|
|
// segment table
|
|
segmentTable:
|
|
.fill (4 * 16) // 16 DRAM pointers
|
|
|
|
// displaylist stack
|
|
displayListStack:
|
|
|
|
// ucode text (shared with DL stack)
|
|
.ascii ID_STR, 0x0A
|
|
endIdStr:
|
|
.if endIdStr < 0x180
|
|
.fill (0x180 - endIdStr)
|
|
.elseif endIdStr > 0x180
|
|
.error "ID_STR is too long"
|
|
.align 16 // to suppress subsequent errors
|
|
.endif
|
|
|
|
endSharedDMEM:
|
|
.if . != 0x180
|
|
.error "endSharedDMEM at incorrect address, matters for G_LOAD_UCODE / S2DEX"
|
|
.endif
|
|
|
|
// constants for register $v31
|
|
.if (. & 15) != 0
|
|
.error "Wrong alignment for v31value"
|
|
.endif
|
|
v31Value:
|
|
// v31 must go from lowest to highest (signed) values for vcc patterns.
|
|
// Also relies on the fact that $v31[0h] is -4,-4,-4,-4, 4, 4, 4, 4.
|
|
.dh -4 // used in clipping, vtx write for Newton-Raphson reciprocal
|
|
.dh -1 // used often
|
|
.dh 0 // used often
|
|
.dh 2 // used as clip ratio (vtx write, clipping) and in clipping
|
|
.dh 4 // used for same Newton-Raphsons, occlusion plane scaling
|
|
.dh 0x4000 // used in tri write, texgen
|
|
.dh 0x7F00 // used in fog, normals unpacking
|
|
.dh 0x7FFF // used often
|
|
|
|
/*
|
|
Quick note on Newton-Raphson:
|
|
https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
|
|
Given input D, we want to find the reciprocal R. The base formula for refining
|
|
the estimate of R is R_new = R*(2 - D*R). However, since the RSP reciprocal
|
|
instruction moves the radix point 1 to the left, the result has to be multiplied
|
|
by 2. So it's 2*R*(2 - D*2*R) = R*(4 - 4*D*R) = R*(1*4 + D*R*-4). This is where
|
|
the 4 and -4 come from. For tri write, the result needs to be multiplied by 4
|
|
for subpixels, so it's 16 and -16.
|
|
*/
|
|
|
|
cameraWorldPos:
|
|
.skip 6
|
|
tempTriRA:
|
|
.skip 2 // Overwritten as part of camera world position, used as temp
|
|
lightBufferLookat:
|
|
.skip 8 // s8 X0, Y0, Z0, dummy, X1, Y1, Z1, dummy
|
|
lightBufferMain:
|
|
.skip (G_MAX_LIGHTS * lightSize)
|
|
lightBufferAmbient:
|
|
.skip 8 // just colors for ambient light
|
|
ltBufOfs equ (lightBufferMain - altBase)
|
|
|
|
occlusionPlaneEdgeCoeffs:
|
|
/*
|
|
NOTE: This explanation is outdated; see cpu/occlusionplane.c
|
|
Vertex is in occlusion region if all five equations below are true:
|
|
4 * screenX[s13.2] * c0[s0.15] - 0.5 * screenY[s13.2] < c4[s14.1]
|
|
4 * screenY[s13.2] * c1[s0.15] - 0.5 * screenX[s13.2] < c5[s14.1]
|
|
4 * screenX[s13.2] * c2[s0.15] + 0.5 * screenY[s13.2] < c6[s14.1]
|
|
4 * screenY[s13.2] * c3[s0.15] + 0.5 * screenX[s13.2] < c7[s14.1]
|
|
clamp_to_0.s15(clipX[s15.16] * kx[0.s15])
|
|
+ clamp_to_0.s15(clipY[s15.16] * ky[0.s15])
|
|
+ clamp_to_0.s15(clipZ[s15.16] * kz[0.s15])
|
|
+ kc[0.s15]
|
|
>= 0
|
|
The first four can be rewritten as (again, vertex is occluded if all are true):
|
|
screenY > screenX * 8*c0 + -2*c4
|
|
screenX > screenY * 8*c1 + -2*c5
|
|
screenY < screenX * -8*c2 + 2*c6
|
|
screenX < screenY * -8*c3 + 2*c7
|
|
where screenX and screenY are in subpixels (e.g. screenX = 100 = 25.0 pixels),
|
|
c0-c3 are shorts representing -1:0.99997,
|
|
and c4-c7 are shorts representing "half pixels" (e.g. c4 = 50 = 25.0 pixels)
|
|
|
|
For the last equation, one option is to think of kx through kc as in s10.5 mode
|
|
instead, so a value of 0x0020 is 1.0 and they can range from -0x400.00 to
|
|
0x3FF.F8. This choice is because clipZ ranges from 0x0000.0000 at the camera
|
|
plane to 0x03FF.0000 at the maximum distance away. The normal distance Adult
|
|
Link is from the camera is about 0x00B0.0000.
|
|
|
|
A better option is to develop your plane equation in floating point, e.g.
|
|
clipX[f] * -0.2f + clipY[f] * 0.4f + clipZ[f] * 1.0f + -200.0f >= 0
|
|
then multiply everything by (32768.0f / max(abs(kx), abs(ky), abs(kz), abs(kc)))
|
|
(here 32768.0f / 200.0f = 163.84f)
|
|
clipX[f] * -32.77f + clipY[f] * 65.54f + clipZ[f] * 163.84f + -32768
|
|
*/
|
|
.dh 0x0000 // c0
|
|
.dh 0x0000 // c1
|
|
.dh 0x0000 // c2
|
|
.dh 0x0000 // c3
|
|
.dh 0x8000 // c4
|
|
.dh 0x8000 // c5
|
|
.dh 0x8000 // c6
|
|
.dh 0x8000 // c7
|
|
occlusionPlaneMidCoeffs:
|
|
.dh 0x0000 // kx
|
|
.dh 0x0000 // ky
|
|
.dh 0x0000 // kz
|
|
.dh 0x8000 // kc
|
|
|
|
// Alternate base address because vector load offsets can't reach all of DMEM.
|
|
// altBaseReg permanently points here.
|
|
.if (. & 15) != 0
|
|
.error "Wrong alignment for altBase"
|
|
.endif
|
|
altBase:
|
|
|
|
textureSettings1:
|
|
.dw 0x00000000 // first word, has command byte, level, tile, and on
|
|
|
|
textureSettings2:
|
|
.dw 0xFFFFFFFF // second word, has s and t scale
|
|
|
|
geometryModeLabel:
|
|
.dw 0x00000000 // originally initialized to G_CLIPPING, but that does nothing
|
|
|
|
fogFactor:
|
|
.dw 0x00000000
|
|
|
|
// constants for register vTRC
|
|
.if (. & 15) != 0
|
|
.error "Wrong alignment for vTRCValue"
|
|
.endif
|
|
vTRCValue:
|
|
decalFixMult equ 0x0400
|
|
decalFixOff equ (-(decalFixMult / 2))
|
|
.dh vertexBuffer // currently 0x02DE; for converting vertex index to address
|
|
.dh vtxSize << 7 // 0x1300; it's not 0x2600 because vertex indices are *2
|
|
.dh 0x1000 // some multiplier in tri write, increment in vertex indices
|
|
.dh decalFixMult
|
|
.dh 0x0020 // some edge write thing in tri write; formerly Z scale factor
|
|
.dh 0xFFF8 // used once in tri write, mask away lower ST bits
|
|
.dh decalFixOff // negative
|
|
.dh 0x0100 // used several times in tri write
|
|
.macro set_vcc_11110001
|
|
vge $v29, vTRC, vTRC[7]
|
|
.endmacro
|
|
.if (vertexBuffer < 0x0100 || decalFixMult < 0x100)
|
|
.error "VCC pattern for vTRC corrupted"
|
|
.endif
|
|
vTRC_VB equ vTRC[0] // Vertex Buffer
|
|
vTRC_VS equ vTRC[1] // Vertex Size
|
|
vTRC_1000 equ vTRC[2]
|
|
vTRC_DM equ vTRC[3] // Decal Multiplier
|
|
vTRC_0020 equ vTRC[4]
|
|
vTRC_FFF8 equ vTRC[5]
|
|
vTRC_DO equ vTRC[6] // Decal Offset
|
|
vTRC_0100 equ vTRC[7]
|
|
vTRC_0100_addr equ (vTRCValue + 2 * 7)
|
|
|
|
fxParams:
|
|
|
|
.if (. & 15) != 0
|
|
.error "Wrong alignment for fxParams"
|
|
.endif
|
|
// First 8 values here loaded with lqv.
|
|
|
|
aoAmbientFactor:
|
|
.dh 0xFFFF
|
|
aoDirectionalFactor:
|
|
.dh 0xA000
|
|
aoPointFactor:
|
|
.dh 0x0000
|
|
|
|
perspNorm:
|
|
.dh 0xFFFF
|
|
|
|
texgenLinearCoeffs:
|
|
.dh 0x44D3
|
|
.dh 0x6CB3
|
|
|
|
fresnelScale:
|
|
.dh 0x0000
|
|
fresnelOffset:
|
|
.dh 0x0000
|
|
|
|
attrOffsetST:
|
|
.dh 0x0100
|
|
.dh 0xFF00
|
|
|
|
alphaCompareCullMode:
|
|
.db 0x00 // 0 = disabled, 1 = cull if all < thresh, -1 = cull if all >= thresh
|
|
alphaCompareCullThresh:
|
|
.db 0x00 // Alpha threshold, 00 - FF
|
|
|
|
lastMatDLPhyAddr:
|
|
.dw 0
|
|
|
|
packedNormalsMaskConstant:
|
|
.db 0xF8 // When read, materialCullMode has been zeroed, so read as 0xF800
|
|
materialCullMode:
|
|
.db 0
|
|
|
|
// moveword table
|
|
movewordTable:
|
|
.dh fxParams // G_MW_FX
|
|
.dh numLightsxSize - 3 // G_MW_NUMLIGHT; writes numLightsxSize and pointLightFlag, zeroes dirLightsXfrmValid
|
|
packedNormalsConstants:
|
|
.if (. & 4) != 0
|
|
.error "Alignment broken for packed normals constants in movewordTable"
|
|
.endif
|
|
.dh 0x2008 // For packed normals; unused in movewordTable
|
|
.if (segmentTable & 0xFF00) != 0
|
|
.error "Packed normals constants relies on first byte of segmentTable addr being 0"
|
|
.endif
|
|
.dh segmentTable // G_MW_SEGMENT
|
|
.dh fogFactor // G_MW_FOG
|
|
.dh lightBufferMain // G_MW_LIGHTCOL
|
|
|
|
// Movemem table
|
|
movememTable:
|
|
.dh mMatrix // G_MV_MMTX
|
|
.dh tempMatrix // G_MV_TEMPMTX0 multiply temp matrix (model)
|
|
.dh vpMatrix // G_MV_VPMTX
|
|
.dh tempMatrix // G_MV_TEMPMTX1 multiply temp matrix (view*projection)
|
|
.dh viewport // G_MV_VIEWPORT
|
|
.dh cameraWorldPos // G_MV_LIGHT
|
|
|
|
afterMovememRaTable:
|
|
.dh run_next_DL_command
|
|
.dh G_MTX_multiply_end
|
|
|
|
clipCondShifts:
|
|
.db CLIP_SCAL_NY_SHIFT // Constants for clipping algorithm
|
|
.db CLIP_SCAL_PY_SHIFT
|
|
.db CLIP_SCAL_NX_SHIFT
|
|
.db CLIP_SCAL_PX_SHIFT
|
|
|
|
mvpValid:
|
|
.db 0 // Nonzero if the MVP matrix is valid, 0 if it needs to be recomputed.
|
|
dirLightsXfrmValid:
|
|
.db 0 // Nonzero if transformed directional lights are valid.
|
|
unused2:
|
|
.db 0
|
|
pointLightFlag:
|
|
.db 0 // Sign bit set if there are point lights.
|
|
numLightsxSize:
|
|
.db 0 // lightSize * number of lights
|
|
|
|
.macro miniTableEntry, addr
|
|
.if addr < 0x1000 || addr >= 0x1400
|
|
.error "Handler address out of range!"
|
|
.endif
|
|
.db (addr - 0x1000) >> 2
|
|
.endmacro
|
|
|
|
// RDP/Immediate Command Mini Table
|
|
// 1 byte per entry, after << 2 points to an addr in first 1/4 of IMEM
|
|
miniTableEntry G_FLUSH_handler
|
|
miniTableEntry G_MEMSET_handler
|
|
miniTableEntry G_DMA_IO_handler
|
|
miniTableEntry G_TEXTURE_handler
|
|
miniTableEntry G_POPMTX_handler
|
|
miniTableEntry G_GEOMETRYMODE_handler
|
|
miniTableEntry G_MTX_handler
|
|
miniTableEntry G_MOVEWORD_handler
|
|
miniTableEntry G_MOVEMEM_handler
|
|
miniTableEntry G_LOAD_UCODE_handler
|
|
miniTableEntry G_DL_handler
|
|
miniTableEntry G_ENDDL_handler
|
|
miniTableEntry G_SPNOOP_handler
|
|
miniTableEntry G_RDPHALF_1_handler
|
|
miniTableEntry G_SETOTHERMODE_L_handler
|
|
miniTableEntry G_SETOTHERMODE_H_handler
|
|
miniTableEntry G_TEXRECT_handler
|
|
miniTableEntry G_TEXRECTFLIP_handler
|
|
miniTableEntry G_SYNC_handler // G_RDPLOADSYNC
|
|
miniTableEntry G_SYNC_handler // G_RDPPIPESYNC
|
|
miniTableEntry G_SYNC_handler // G_RDPTILESYNC
|
|
miniTableEntry G_SYNC_handler // G_RDPFULLSYNC
|
|
miniTableEntry G_RDP_handler // G_SETKEYGB
|
|
miniTableEntry G_RDP_handler // G_SETKEYR
|
|
miniTableEntry G_RDP_handler // G_SETCONVERT
|
|
miniTableEntry G_SETSCISSOR_handler
|
|
miniTableEntry G_RDP_handler // G_SETPRIMDEPTH
|
|
miniTableEntry G_RDPSETOTHERMODE_handler
|
|
miniTableEntry load_cmds_handler // G_LOADTLUT
|
|
miniTableEntry G_RDPHALF_2_handler
|
|
miniTableEntry G_RDP_handler // G_SETTILESIZE
|
|
miniTableEntry load_cmds_handler // G_LOADBLOCK
|
|
miniTableEntry load_cmds_handler // G_LOADTILE
|
|
miniTableEntry G_RDP_handler // G_SETTILE
|
|
miniTableEntry G_RDP_handler // G_FILLRECT
|
|
miniTableEntry G_RDP_handler // G_SETFILLCOLOR
|
|
miniTableEntry G_RDP_handler // G_SETFOGCOLOR
|
|
miniTableEntry G_RDP_handler // G_SETBLENDCOLOR
|
|
miniTableEntry G_RDP_handler // G_SETPRIMCOLOR
|
|
miniTableEntry G_RDP_handler // G_SETENVCOLOR
|
|
miniTableEntry G_RDP_handler // G_SETCOMBINE
|
|
miniTableEntry G_SETxIMG_handler // G_SETTIMG
|
|
miniTableEntry G_SETxIMG_handler // G_SETZIMG
|
|
miniTableEntry G_SETxIMG_handler // G_SETCIMG
|
|
cmdMiniTable:
|
|
miniTableEntry G_SYNC_handler // G_NOOP
|
|
miniTableEntry G_VTX_handler
|
|
miniTableEntry G_MODIFYVTX_handler
|
|
miniTableEntry G_CULLDL_handler
|
|
miniTableEntry G_BRANCH_WZ_handler
|
|
miniTableEntry G_TRI1_handler
|
|
miniTableEntry G_TRI2_handler
|
|
miniTableEntry G_QUAD_handler
|
|
miniTableEntry G_TRISTRIP_handler
|
|
miniTableEntry G_TRIFAN_handler
|
|
miniTableEntry G_LIGHTTORDP_handler
|
|
miniTableEntry G_RELSEGMENT_handler
|
|
|
|
|
|
// The maximum number of generated vertices in a clip polygon. In reality, this
|
|
// is equal to MAX_CLIP_POLY_VERTS, but for testing we can change them separately.
|
|
// In case you're wondering if it's possible to have a 7-vertex polygon where all
|
|
// 7 verts are generated, it looks like this (X = generated vertex):
|
|
// ___----=>
|
|
// +---------------__X----X _-^
|
|
// | __--^^ X^
|
|
// | __--^^ _-^|
|
|
// _X^^^ _-^ |
|
|
// C | _-^ |
|
|
// ^X _-^ |
|
|
// |\ _-^ |
|
|
// +-X--_X^---------------+
|
|
// V^
|
|
MAX_CLIP_GEN_VERTS equ 7
|
|
// Normally, each clip plane can cut off a "tip" of a polygon, turning one vert
|
|
// into two. (It can also cut off more of the polygon and remove additional verts,
|
|
// but the maximum is one more vert per clip plane.) So with 5 clip planes, we
|
|
// could have a maximum of 8 verts in the final polygon. However, the verts
|
|
// generated by the no-nearclipping plane will always be at infinity, so they
|
|
// will always get replaced by generated verts from one of the other clip planes.
|
|
// Put another way, if there are 8 verts in the final polygon, there are 8 edges,
|
|
// which are portions of the 3 original edges plus portions of 5 edges along the
|
|
// 5 clip planes. But the edge portion along the no-nearclipping plane is at
|
|
// infinity, so that edge can't be on screen.
|
|
// It is rare but possible for these assumptions to be violated and a polygon
|
|
// with more than 7 verts to be generated. For example, numerical precision
|
|
// issues could cause the polygon to be slightly non-convex at one of the clip
|
|
// planes, causing the plane to cut off more than one tip. However, this
|
|
// implementation checks for an imminent overflow and aborts clipping (draws no
|
|
// tris) if this occurs. Because this is caused by extreme/degenerate cases like
|
|
// the camera exactly on a tri, not drawing anything is an okay result.
|
|
MAX_CLIP_POLY_VERTS equ 7
|
|
CLIP_POLY_SIZE_BYTES equ (MAX_CLIP_POLY_VERTS+1) * 2
|
|
CLIP_TEMP_VERTS_SIZE_BYTES equ (MAX_CLIP_GEN_VERTS * vtxSize)
|
|
|
|
VERTEX_BUFFER_SIZE_BYTES equ (G_MAX_VERTS * vtxSize)
|
|
|
|
RDP_CMD_BUFSIZE equ 0xB0
|
|
RDP_CMD_BUFSIZE_EXCESS equ 0xB0 // Maximum size of an RDP triangle command
|
|
RDP_CMD_BUFSIZE_TOTAL equ (RDP_CMD_BUFSIZE + RDP_CMD_BUFSIZE_EXCESS)
|
|
|
|
INPUT_BUFFER_CMDS equ 21
|
|
INPUT_BUFFER_SIZE_BYTES equ (INPUT_BUFFER_CMDS * 8)
|
|
INPUT_BUFFER_CLOBBER_OSTASK_AMT equ 0x10 // Input buffer overwrites beginning of OSTask, see rsp_defs.inc
|
|
OSTASK_ORIG_SIZE equ 0x40
|
|
OSTASK_CLOBBERED_SIZE equ (OSTASK_ORIG_SIZE - INPUT_BUFFER_CLOBBER_OSTASK_AMT)
|
|
|
|
END_VARIABLE_LEN_DMEM equ (0x1000 - OSTASK_CLOBBERED_SIZE - INPUT_BUFFER_SIZE_BYTES - (2 * RDP_CMD_BUFSIZE_TOTAL) - (2 * CLIP_POLY_SIZE_BYTES) - CLIP_TEMP_VERTS_SIZE_BYTES - VERTEX_BUFFER_SIZE_BYTES)
|
|
|
|
startFreeDmem:
|
|
.org END_VARIABLE_LEN_DMEM
|
|
endFreeDmem:
|
|
|
|
// Main vertex buffer in RSP internal format
|
|
vertexBuffer:
|
|
.skip VERTEX_BUFFER_SIZE_BYTES
|
|
|
|
// Space for temporary verts for clipping code, and reused for other things
|
|
clipTempVerts:
|
|
|
|
// Round up to 0x10
|
|
.org ((clipTempVerts + 0xF) & 0xFF0)
|
|
// Vertex addresses, to avoid a multiply-add for each vertex index lookup
|
|
vertexTable:
|
|
.skip ((G_MAX_VERTS + 8) * 2) // halfword for each vertex; need 1 extra end addr, easier to write 8 extra
|
|
|
|
.if . > yieldDataFooter
|
|
// Need to fit everything through vertex buffer in yield buffer, would like
|
|
// to also fit vertexTable to avoid recompute after yield
|
|
.error "Too much being stored in yieldable DMEM"
|
|
.endif
|
|
.if (. & 15) != 0
|
|
.error "tempMatrix not aligned"
|
|
.endif
|
|
|
|
tempMatrix:
|
|
.skip 0x40
|
|
|
|
.if . > (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES)
|
|
.error "Too much in clipTempVerts"
|
|
.endif
|
|
.org (clipTempVerts + CLIP_TEMP_VERTS_SIZE_BYTES)
|
|
clipTempVertsEnd:
|
|
|
|
clipPoly:
|
|
.skip CLIP_POLY_SIZE_BYTES // 3 5 7 + term 0
|
|
clipPoly2: // \ / \ / \
|
|
.skip CLIP_POLY_SIZE_BYTES // 4 6 7 + term 0
|
|
|
|
|
|
// First RDP Command Buffer
|
|
rdpCmdBuffer1:
|
|
.skip RDP_CMD_BUFSIZE
|
|
.if (. & 8) != 8
|
|
.error "RDP command buffer alignment to 8 assumption broken"
|
|
.endif
|
|
rdpCmdBuffer1End:
|
|
.skip 8
|
|
rdpCmdBuffer1EndPlus1Word:
|
|
// This is so that we can temporarily store vector regs here with lqv/sqv
|
|
.skip RDP_CMD_BUFSIZE_EXCESS - 8
|
|
// Second RDP Command Buffer
|
|
rdpCmdBuffer2:
|
|
.skip RDP_CMD_BUFSIZE
|
|
.if (. & 8) != 8
|
|
.error "RDP command buffer alignment to 8 assumption broken"
|
|
.endif
|
|
rdpCmdBuffer2End:
|
|
.skip 8
|
|
rdpCmdBuffer2EndPlus1Word:
|
|
.skip RDP_CMD_BUFSIZE_EXCESS - 8
|
|
|
|
// Input buffer. After RDP cmd buffers so it can be vector addressed from end.
|
|
inputBuffer:
|
|
.skip INPUT_BUFFER_SIZE_BYTES - INPUT_BUFFER_CLOBBER_OSTASK_AMT
|
|
// 0x0FC0-0x1000: OSTask
|
|
OSTask:
|
|
.skip INPUT_BUFFER_CLOBBER_OSTASK_AMT
|
|
inputBufferEnd:
|
|
inputBufferEndSgn equ -(0x1000 - inputBufferEnd) // Underflow DMEM address
|
|
// rest of OSTask
|
|
.skip OSTASK_CLOBBERED_SIZE
|
|
|
|
.if . != 0x1000
|
|
.error "DMEM organization incorrect"
|
|
.endif
|
|
|
|
.close // DATA_FILE
|
|
|
|
// See rsp_defs.inc about why these are not used and we can reuse them.
|
|
startCounterTime equ (OSTask + OSTask_ucode_size)
|
|
xfrmLookatDirs equ -(0x1000 - (OSTask + OSTask_ucode_data)) // and OSTask_ucode_data_size
|
|
|
|
memsetBufferStart equ ((vertexBuffer + 0xF) & 0xFF0)
|
|
memsetBufferMaxEnd equ (rdpCmdBuffer1 & 0xFF0)
|
|
memsetBufferMaxSize equ (memsetBufferMaxEnd - memsetBufferStart)
|
|
memsetBufferSize equ (memsetBufferMaxSize > 0x800 ? 0x800 : memsetBufferMaxSize)
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////////////// Register Naming ////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
/*
|
|
Scalar regs:
|
|
Tri write Clip VW Vtx write ltbasic ltadv V/L init Cmd dispatch
|
|
$zero ---------------------- Hardwired zero ------------------------------------------
|
|
$1 v1 texptr <------------- vtxLeft ------------------------------> temp, init 0
|
|
$2 v2 shdptr clipVNext -------> <----- lbPostAo laPtr temp
|
|
$3 v3 shdflg clipVLastOfsc vLoopRet ---------> laVtxLeft temp
|
|
$4 flat shading vtx or (perf) initial FIFO stall time -----------------------------
|
|
$5 <------------------------ vGeomMid ------------------------------>
|
|
$6 geom mode clipMaskIdx -----> <-- lbTexgenOrRet laSTKept
|
|
$7 v2flag tile <------------- fogFlag ----------> laPacked mtx valid cmd byte
|
|
$8 v3flag <------------- outVtx2 ----------> laSpecular outVtx2
|
|
$9 xp texenab clipMask --------> <----- curLight ---------> viLtFlag
|
|
$10 -------------------------- temp2 -----------------------------------------------
|
|
$11 --------------------------- temp -----------------------------------------------
|
|
$12 ----------------------- perfCounterD -------------------------------------------
|
|
$13 ------------------------ altBaseReg --------------------------------------------
|
|
$14 <-------------- inVtx ------------------------------->
|
|
$15 <------------ outVtxBase ---------------------------->
|
|
$16 v1flag lmaj clipFlags -------> <----- lbFakeAmb laSpecFres ovlInitClock
|
|
$17 clipPolyRead ---->
|
|
$18 <---------- clipPolySelect -->
|
|
$19 temp clipVOnsc outVtx1 ----------> laL2A <--------- dmaLen
|
|
$20 temp <------------- flagsV1 ----------> laTexgen <--------- dmemAddr
|
|
$21 <---------- clipPolyWrite ---> <----- ambLight ambLight
|
|
$22 ---------------------- rdpCmdBufEndP1 ------------------------------------------
|
|
$23 ----------------------- rdpCmdBufPtr -------------------------------------------
|
|
$24 temp <------------- flagsV2 ----------> fp temp <--------- cmd_w1_dram
|
|
$25 cmd_w0 --------------------> <----- lbAfter <--------- cmd_w0
|
|
$26 ------------------------ taskDataPtr -------------------------------------------
|
|
$27 ---------------------- inputBufferPos ------------------------------------------
|
|
$28 ----------------------- perfCounterA -------------------------------------------
|
|
$29 ----------------------- perfCounterB -------------------------------------------
|
|
$30 ----------------------- perfCounterC -------------------------------------------
|
|
$ra return address, sometimes sign bit is flag -------------------------------------
|
|
*/
|
|
|
|
// Global scalar regs:
|
|
perfCounterD equ $12 // Performance counter D (functions depend on config)
|
|
altBaseReg equ $13 // Alternate base address register for vector loads
|
|
rdpCmdBufEndP1 equ $22 // Pointer to one command word past "end" (middle) of RDP command buf
|
|
rdpCmdBufPtr equ $23 // RDP command buffer current DMEM pointer
|
|
taskDataPtr equ $26 // Task data (display list) DRAM pointer
|
|
inputBufferPos equ $27 // DMEM position within display list input buffer, relative to end
|
|
perfCounterA equ $28 // Performance counter A (functions depend on config)
|
|
perfCounterB equ $29 // Performance counter B (functions depend on config)
|
|
perfCounterC equ $30 // Performance counter C (functions depend on config)
|
|
|
|
// Vertex write:
|
|
vtxLeft equ $1 // Number of vertices left to process * 0x10
|
|
vLoopRet equ $3 // Return address at end of vtx loop = top of loop or misc lighting
|
|
vGeomMid equ $5 // Middle two bytes of geometry mode
|
|
fogFlag equ $7 // 8 if fog enabled, else 0
|
|
outVtx2 equ $8 // Pointer to second or dummy (= outVtx1) transformed vert
|
|
inVtx equ $14 // Pointer to loaded vertex to transform; < 0 means from clipping.
|
|
outVtxBase equ $15 // Pointer to vertex buffer to store transformed verts
|
|
outVtx1 equ $19 // Pointer to first transformed vert
|
|
flagsV1 equ $20 // Clip flags for vertex 1
|
|
flagsV2 equ $24 // Clip flags for vertex 2
|
|
|
|
// Lighting basic:
|
|
lbPostAo equ $2 // Address to return to after AO
|
|
lbTexgenOrRet equ $6 // ltbasic_texgen as negative if texgen, else vtx_return_from_lighting
|
|
curLight equ $9 // Current light pointer with offset
|
|
lbFakeAmb equ $16 // Pointer to ambient light or to 8 bytes of zeros if AO enabled
|
|
ambLight equ $21 // Ambient (top) light pointer with offset
|
|
lbAfter equ $25 // Address to return to after main lighting loop (vertex or extras)
|
|
|
|
// Lighting advanced:
|
|
laPtr equ $2 // Pointer to current vertex pair being lit
|
|
laVtxLeft equ $3 // Count of vertices left * 0x10
|
|
laSTKept equ $6 // Texture coords of vertex 1 kept through processing
|
|
laPacked equ $7 // Nonzero if packed normals enabled
|
|
laSpecular equ $8 // Sign bit set if specular enabled
|
|
laSpecFres equ $16 // Nonzero if doing ltadv_normal_to_vertex for specular or Fresnel
|
|
laL2A equ $19 // Nonzero if light-to-alpha (cel shading) enabled
|
|
laTexgen equ $20 // Nonzero if texgen enabled
|
|
|
|
// Clipping
|
|
clipVNext equ $2 // Next vertex (vertex at forward end of current edge)
|
|
clipVLastOfsc equ $3 // Last vertex / offscreen vertex
|
|
clipVOnsc equ $19 // Onscreen vertex
|
|
clipMaskIdx equ $6 // Clip mask index 4-0
|
|
clipMask equ $9 // Current clip mask (one bit)
|
|
clipFlags equ $16 // Current clipping flags being checked
|
|
clipPolyRead equ $17 // Read pointer within current polygon being clipped
|
|
clipPolySelect equ $18 // Clip poly double buffer selection
|
|
clipPolyWrite equ $21 // Write pointer within current polygon being clipped
|
|
|
|
// Vertex init
|
|
viLtFlag equ $9 // Holds pointLightFlag or dirLightsXfrmValid
|
|
|
|
// Misc
|
|
nextRA equ $10 // Address to return to after overlay load
|
|
ovlInitClock equ $16 // Temp for profiling
|
|
dmaLen equ $19 // DMA length in bytes minus 1
|
|
dmemAddr equ $20 // DMA address in DMEM or IMEM. Also = rdpCmdBufPtr - rdpCmdBufEndP1 for flush_rdp_buffer
|
|
cmd_w1_dram equ $24 // DL command word 1, which is also DMA DRAM addr
|
|
cmd_w0 equ $25 // DL command word 0, also holds next tris info
|
|
|
|
// Global vector regs:
|
|
vZero equ $v0 // All elements = 0; NOT global, only in tri write and clip. Mtx in vtx.
|
|
vTRC equ $v1 // Triangle Constants; NOT global, only in tri write and clip. Mtx in vtx.
|
|
vOne equ $v28 // All elements = 1; global
|
|
// $v29: permanent temp register, also write results here to discard
|
|
// $v30: vtx / lt = sSTO + persp norm + more lighting params
|
|
// $v31: Global constant vector register
|
|
|
|
// Vertex / lighting vector regs:
|
|
// Prefixes: v = vector register, vp = vertex pair, s = vertex store,
|
|
// l = basic lighting, a = advanced lighting
|
|
// Sadly, "vp" stands for vertex pair, view*projection matrix, and viewport
|
|
|
|
vMTX0I equ $v0 // Matrix rows int/frac; MVP normally, or M in ltadv
|
|
vMTX1I equ $v1
|
|
vMTX2I equ $v2
|
|
vMTX3I equ $v3
|
|
vMTX0F equ $v4
|
|
vMTX1F equ $v5
|
|
vMTX2F equ $v6
|
|
vMTX3F equ $v7
|
|
vTemp1 equ $v8 // Temporaries, used by lighting (along with some vp regs)
|
|
vTemp2 equ $v9
|
|
vKept1 equ $v10 // Kept across lighting
|
|
vKept2 equ $v11
|
|
vpMdl equ $v12 // Vertex pair model space position
|
|
vpClpF equ $v13 // Vertex pair clip space position frac
|
|
vpClpI equ $v14 // Vertex pair clip space position int
|
|
vpScrF equ $v15 // Vertex pair screen space position frac
|
|
vpScrI equ $v16 // Vertex pair screen space position int
|
|
vpST equ $v17 // Vertex pair ST texture coordinates
|
|
vpRGBA equ $v18 // Vertex pair color
|
|
vpLtTot equ $v19 // Vertex pair total light
|
|
vpNrmlX equ $v20 // Vertex pair normal X (elems 3, 7)
|
|
vpNrmlY equ $v21 // Vertex pair normal Y (elems 3, 7)
|
|
vpNrmlZ equ $v22 // Vertex pair normal Z (elems 3, 7)
|
|
vLTC equ $v23 // Lighting constants - first light dir, constants for packed normals
|
|
vPerm1 equ $v24 // Regs loaded in vtx_constants_for_clip and permanently kept through vtx/lt
|
|
vPerm2 equ $v25
|
|
vPerm3 equ $v26
|
|
vPerm4 equ $v27
|
|
|
|
// Lighting temporaries. Lighting also modifies vpNrmlX:Y:Z, vpLtTot, vpRGBA, and
|
|
// in texgen vpST. Only the two regs in the comments below and vKept1 are kept.
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
// vpClpI:F are kept, vpMdl is free to use as temp
|
|
lDOT equ vpMdl // lighting DOT product
|
|
lCOL equ vKept2 // lighting total light COLor
|
|
.else
|
|
// vpMdl is kept, these are free to use as temps
|
|
lDOT equ vpClpF
|
|
lCOL equ vpClpI
|
|
.endif
|
|
lDTC equ vTemp1 // lighting DoT Clamped
|
|
lVCI equ vTemp2 // lighting Vertex Color In
|
|
lDIR equ vpRGBA // lighting transformed light DIRection
|
|
|
|
// Kept
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
sCLZ equ vKept1 // vtx_store Clamped Z. Does have to be kept even though in instan_lt_vs_45 b/c need rest of lt temps at start of texgen (and advanced lighting).
|
|
sOCS equ $v29 // Does not exist
|
|
.else
|
|
sOCS equ vKept1 // vtx_store Occlusion State
|
|
sCLZ equ vpClpF // Not a kept in this config
|
|
.endif
|
|
|
|
// Common vertex temporaries
|
|
sRTF equ vTemp1 // vtx_store Reciprocal Temp Frac
|
|
sRTI equ vTemp2 // vtx_store Reciprocal Temp Int
|
|
sFOG equ lCOL // lCOL -> sFOG in lt epilogue with NOC, else sFOG -> lCOL in lt prologue
|
|
|
|
// Misc temps used by both
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
s1WI equ vpNrmlX // vtx_store 1/W Int
|
|
s1WF equ vpLtTot // vtx_store 1/W Frac
|
|
sSCI equ sFOG // vtx_store Scaled Clipping Int
|
|
sSCF equ vpMdl // vtx_store Scaled Clipping Frac
|
|
sTCL equ sCLZ // vtx_store Temp CoLor
|
|
.else
|
|
s1WI equ vpMdl
|
|
s1WF equ vpNrmlX
|
|
sSCI equ vpScrI
|
|
sSCF equ vpScrF
|
|
sTCL equ vpLtTot
|
|
.endif
|
|
|
|
// Misc temps used by only one
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
sST2 equ vpScrI // vtx_store ST coordinates copy 2
|
|
sOTM equ $v29 // Does not exist
|
|
.else
|
|
sST2 equ $v29 // Does not exist
|
|
sOTM equ vpRGBA // vtx_store Occlusion Temporary
|
|
.endif
|
|
|
|
// Permanently kept through vertex/lighting
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
sVPS equ vPerm1 // vtx_store ViewPort Scale
|
|
sVPO equ vPerm2 // vtx_store ViewPort Offset
|
|
sFGM equ vPerm3 // vtx_store FoG Mask
|
|
sO03 equ $v29 // Does not exist
|
|
sO47 equ $v29
|
|
sOCM equ $v29
|
|
sOPM equ $v29
|
|
.else
|
|
// These are temps, not permanents, on this codepath
|
|
sVPS equ vpScrI // Temp, not permament, on this codepath
|
|
sVPO equ vpScrF // Temp, not permament, on this codepath
|
|
sFGM equ $v29 // Does not exist
|
|
sO03 equ vPerm1 // vtx_store Occlusion plane edge coefficients 0-3
|
|
sO47 equ vPerm2 // vtx_store Occlusion plane edge coefficients 4-7
|
|
sOCM equ vPerm3 // vtx_store Occlusion plane Mid coefficients
|
|
sOPM equ vKept2 // vtx_store Occlusion Plus Minus. Loaded in vtx_after_lt_setup not vtx_constants_for_clip b/c clobbered by lighting.
|
|
.endif
|
|
sSTS equ vPerm4
|
|
|
|
// ltadv:
|
|
aPNScl equ $v8 // ltadv Packed Normals Scales = (1<<0),(1<<5),(1<<11),XX, repeat
|
|
aNrmSc equ $v9 // ltadv Normals Scale = [0h:1h] scale to normalize all normals; elems 2,3,6,7 used for point light factors
|
|
aDOT equ $v10 // ltadv Dot product = normals dot direction; also briefly light dir
|
|
aLen2I equ $v11 // ltadv Length 2quared Int part
|
|
// Uses vpMdl = $v12
|
|
vpWrlF equ $v13 // vertex pair World position Frac part
|
|
vpWrlI equ $v14 // vertex pair World position Int part
|
|
aDPosF equ $v15 // ltadv Delta Position Frac part
|
|
aDPosI equ $v16 // ltadv Delta Position Int part
|
|
aOAFrs equ $v17 // ltadv Offset Alpha (elem 3,7) and Fresnel (elem 0,4)
|
|
// Uses vpRGBA, vpLtTot, vpNrmlX, vpNrmlY, vpNrmlZ = $v18, $v19, $v20, $v21, $v22
|
|
aParam equ $v23 // ltadv Parameters = AO, texgen, and Fresnel params
|
|
|
|
aAOF2 equ aDOT // Version of aAOF in init, can't be aDPosI/F or vpMdl there
|
|
aPLFcI equ aLen2I // ltadv Point Light Factor Int part
|
|
aLen2F equ vpMdl // ltadv Length 2quared Frac part
|
|
aPLFcF equ vpMdl // ltadv Point Light Factor Frac part
|
|
aLTC equ vpMdl // ltadv Light Color
|
|
aClOut equ vpWrlF // ltadv Color Out
|
|
aAlOut equ vpWrlI // ltadv Alpha Out
|
|
aDIR equ aDPosF // ltadv Direction = normalize(light or cam - vertex)
|
|
aDotSc equ aDPosF // ltadv Dot product Scale factor
|
|
aLkDt0 equ aDPosF // ltadv Lookat Dot product 0 for texgen
|
|
aLenF equ aDPosI // ltadv Length Frac part
|
|
aAOF equ aDPosI // ltadv Ambient Occlusion Factor
|
|
aProj equ aDPosI // ltadv Projection
|
|
aLkDt1 equ aDPosI // ltadv Lookat Dot product 1 for texgen
|
|
// vpST equ aOAFrs // ST used in texgen
|
|
vpWNrm equ vpNrmlX // vertex pair World space Normals
|
|
aRcpLn equ $v29 // ltadv Reciprocal of Length
|
|
aLenI equ $v29 // ltadv Length Int part
|
|
|
|
|
|
|
|
// Temp storage after rdpCmdBufEndP1. There is 0xA8 of space here which will
|
|
// always be free during vtx load or clipping.
|
|
tempVpRGBA equ 0x00 // Only used during loop
|
|
tempXfrmLt equ tempVpRGBA // ltbasic only used during init
|
|
tempVtx1ST equ tempVpRGBA // ltadv only during init
|
|
tempAmbient equ 0x10 // ltbasic set during init, used during loop
|
|
tempPrevInvalVtxStart equ 0x20
|
|
tempPrevInvalVtx equ (tempPrevInvalVtxStart + vtxSize) // 0x46; fog writes here
|
|
tempPrevInvalVtxEnd equ (tempPrevInvalVtx + vtxSize) // 0x6C; rest of vtx writes here
|
|
.if tempPrevInvalVtxEnd > (RDP_CMD_BUFSIZE_EXCESS - 8)
|
|
.error "Too much temp storage used!"
|
|
.endif
|
|
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
//////////////////////////////////// IMEM //////////////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// RSP IMEM
|
|
.create CODE_FILE, 0x00001080
|
|
|
|
// Initialization routines
|
|
// Everything up until ovl01_end will get overwritten by ovl1
|
|
start: // This is at IMEM 0x1080, not the start of IMEM
|
|
vnop // Return to here from S2DEX overlay 0 G_LOAD_UCODE jumps to start+4!
|
|
lqv $v31[0], (v31Value)($zero) // Actual start is here
|
|
vadd $v29, $v29, $v29 // Consume VCO (carry) value possibly set by the previous ucode
|
|
lqv vTRC, (vTRCValue)($zero) // Always as this value except vtx_store
|
|
li altBaseReg, altBase
|
|
li rdpCmdBufPtr, rdpCmdBuffer1
|
|
vclr vOne
|
|
li rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word
|
|
lw $11, rdpFifoPos
|
|
lw $10, OSTask + OSTask_flags
|
|
li $1, SP_CLR_SIG2 | SP_CLR_SIG1 // Clear task done and yielded signals
|
|
vsub vOne, vOne, $v31[1] // 1 = 0 - -1
|
|
beqz $11, initialize_rdp // If RDP FIFO not set up yet, starting ucode from scratch
|
|
mtc0 $1, SP_STATUS
|
|
andi $10, $10, OS_TASK_YIELDED // Resumed from yield or came from called ucode?
|
|
beqz $10, continue_from_os_task // If latter, load DL (task data) pointer from OSTask
|
|
// Otherwise continuing from yield; perf counters saved here at yield
|
|
lw perfCounterA, yieldDataFooter + YDF_OFFSET_PERFCOUNTERA
|
|
lw perfCounterB, yieldDataFooter + YDF_OFFSET_PERFCOUNTERB
|
|
lw perfCounterC, yieldDataFooter + YDF_OFFSET_PERFCOUNTERC
|
|
lw perfCounterD, yieldDataFooter + YDF_OFFSET_PERFCOUNTERD
|
|
j finish_setup
|
|
lw taskDataPtr, yieldDataFooter + YDF_OFFSET_TASKDATAPTR
|
|
|
|
initialize_rdp:
|
|
mfc0 $11, DPC_STATUS // Read RDP status
|
|
andi $11, $11, DPC_STATUS_XBUS_DMA // Look at XBUS enabled bit
|
|
bnez $11, @@start_new_buf // If XBUS is enabled, start new buffer
|
|
mfc0 $2, DPC_END // Load RDP end pointer
|
|
lw $3, OSTask + OSTask_output_buff // Load start of FIFO
|
|
sub $11, $3, $2 // If start of FIFO > RDP end,
|
|
bgtz $11, @@start_new_buf // start new buffer
|
|
mfc0 $1, DPC_CURRENT // Load RDP current pointer
|
|
lw $3, OSTask + OSTask_output_buff_size // Load end of FIFO
|
|
beqz $1, @@start_new_buf // If RDP current pointer is 0, start new buffer
|
|
sub $11, $1, $3 // If RDP current > end of fifo,
|
|
bgez $11, @@start_new_buf // start new buffer
|
|
nop
|
|
bne $1, $2, @@continue_buffer // If RDP current != RDP end, keep current buffer
|
|
@@start_new_buf:
|
|
// There may be one buffer executing in the RDP, and another queued in the
|
|
// double-buffered start/end regs. Wait for the latter to be available
|
|
// (i.e. possibly one buffer executing, none waiting).
|
|
mfc0 $11, DPC_STATUS // Read RDP status
|
|
andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf
|
|
bnez $11, @@start_new_buf // Wait until double buffered start/end available
|
|
li $11, DPC_STATUS_CLR_XBUS // Bit to disable XBUS mode
|
|
mtc0 $11, DPC_STATUS // Set bit, disable XBUS
|
|
lw $2, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr)
|
|
// Set up the next buffer for the RDP to be zero size and at the end of the FIFO.
|
|
mtc0 $2, DPC_START // Set RDP start addr to end of FIFO
|
|
mtc0 $2, DPC_END // Set RDP end addr to end of FIFO
|
|
@@continue_buffer:
|
|
// If we jumped here, the RDP is currently executing from the middle of the FIFO.
|
|
// So we can just append commands to there and move the end pointer.
|
|
sw $2, rdpFifoPos // Set FIFO position to end of FIFO or RDP end
|
|
lw $11, matrixStackPtr // Initialize matrix stack pointer from OSTask
|
|
bnez $11, continue_from_os_task // if not yet initialized
|
|
lw $11, OSTask + OSTask_dram_stack
|
|
sw $11, matrixStackPtr
|
|
continue_from_os_task:
|
|
// Counters stored here if jumped to different ucode
|
|
// If starting from scratch, these are zero
|
|
lw perfCounterA, mvpMatrix + YDF_OFFSET_PERFCOUNTERA
|
|
lw perfCounterB, mvpMatrix + YDF_OFFSET_PERFCOUNTERB
|
|
lw perfCounterC, mvpMatrix + YDF_OFFSET_PERFCOUNTERC
|
|
lw perfCounterD, mvpMatrix + YDF_OFFSET_PERFCOUNTERD
|
|
jal fill_vertex_table
|
|
lw taskDataPtr, OSTask + OSTask_data_ptr
|
|
finish_setup:
|
|
.if CFG_PROFILING_C
|
|
mfc0 $11, DPC_CLOCK
|
|
sw $11, startCounterTime
|
|
.endif
|
|
sh $zero, mvpValid // and dirLightsXfrmValid
|
|
li inputBufferPos, 0
|
|
li cmd_w1_dram, orga(ovl1_start)
|
|
j load_overlays_0_1
|
|
li nextRA, displaylist_dma
|
|
|
|
start_end:
|
|
.align 8
|
|
start_padded_end:
|
|
|
|
.orga max(orga(), max(ovl0_padded_end - ovl0_start, ovl1_padded_end - ovl1_start) - 0x80)
|
|
ovl01_end:
|
|
|
|
G_CULLDL_handler: // 15
|
|
lhu $10, (vertexTable)(cmd_w0) // Start vtx addr
|
|
lhu $3, (vertexTable)(cmd_w1_dram) // End vertex
|
|
/*
|
|
CLIP_OCCLUDED can't be included here because: Suppose the list consists of N-1
|
|
verts which are behind the occlusion plane, and 1 vert which is behind the camera
|
|
plane and therefore randomly erroneously also set as behind the occlusion plane.
|
|
However, the convex hull of all the verts goes through visible area. This will be
|
|
incorrectly culled here. We can't afford the extra few instructions to disable
|
|
the occlusion plane if the vert is behind the camera, because this only matters for
|
|
G_CULLDL and not for tris.
|
|
*/
|
|
li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE)
|
|
lhu $11, VTX_CLIP($10)
|
|
culldl_loop:
|
|
and $1, $1, $11
|
|
beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render
|
|
lhu $11, (vtxSize + VTX_CLIP)($10) // next vertex clip flags
|
|
bne $10, $3, culldl_loop // loop until reaching the last vertex
|
|
addi $10, $10, vtxSize // advance to the next vertex
|
|
li cmd_w0, 0 // Clear count of DL cmds to skip loading
|
|
G_ENDDL_handler:
|
|
lbu $1, displayListStackLength // Load the DL stack index; if end stack,
|
|
beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end
|
|
addi $1, $1, -4 // Decrement the DL stack index
|
|
j call_ret_common // has a different version in ovl1
|
|
lw taskDataPtr, (displayListStack)($1) // Load addr of DL to return to
|
|
|
|
G_BRANCH_WZ_handler:
|
|
lhu $10, (vertexTable)(cmd_w0) // Vertex addr from byte 3
|
|
.if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2
|
|
lh $10, VTX_W_INT($10) // read the w coordinate of the vertex (f3dzex)
|
|
.else
|
|
lw $10, VTX_SCR_Z($10) // read the screen z coordinate (int and frac) of the vertex (f3dex2)
|
|
.endif
|
|
sub $2, $10, cmd_w1_dram // subtract the w/z value being tested
|
|
bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL
|
|
lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to
|
|
li cmd_w0, 0x8000 // Bit 16 set (via negative) = nopush, bits 3-7 = 0 for hint
|
|
G_DL_handler:
|
|
sll $2, cmd_w0, 15 // Shifts the push/nopush value to the sign bit
|
|
lbu $1, displayListStackLength // Get the DL stack length
|
|
jal segmented_to_physical
|
|
add $3, taskDataPtr, inputBufferPos // Current DL pos to push on stack
|
|
bltz $2, call_ret_common // Nopush = branch = flag is set
|
|
move taskDataPtr, cmd_w1_dram // Set the new DL to the target display list
|
|
sw $3, (displayListStack)($1)
|
|
addi $1, $1, 4 // Increment the DL stack length
|
|
call_ret_common:
|
|
sb $zero, materialCullMode // This covers call, branch, return, and cull and branchZ successes
|
|
sb $1, displayListStackLength
|
|
andi inputBufferPos, cmd_w0, 0x00F8 // Byte 3, how many cmds to drop from load (max 0xA0)
|
|
displaylist_dma:
|
|
// Load INPUT_BUFFER_SIZE_BYTES - inputBufferPos cmds (inputBufferPos >= 0, mult of 8)
|
|
addi inputBufferPos, inputBufferPos, -INPUT_BUFFER_SIZE_BYTES // inputBufferPos = - num cmds
|
|
.if CFG_PROFILING_A
|
|
sll $11, inputBufferPos, 16 - 3 // Divide by 8 for num cmds to load, then move to upper 16
|
|
sub perfCounterB, perfCounterB, $11 // Negative so subtract
|
|
.endif
|
|
nor dmaLen, inputBufferPos, $zero // DMA length = -inputBufferPos - 1 = ones compliment
|
|
move cmd_w1_dram, taskDataPtr // set up the DRAM address to read from
|
|
sub taskDataPtr, taskDataPtr, inputBufferPos // increment the DRAM address to read from next time
|
|
addi dmemAddr, inputBufferPos, inputBufferEnd // set the address to DMA read to
|
|
dma_and_wait_goto_next_command:
|
|
li nextRA, run_next_DL_command
|
|
dma_and_wait_goto_next_ra:
|
|
j dma_read_write
|
|
li $ra, wait_goto_next_ra
|
|
|
|
G_POPMTX_handler:
|
|
G_DMA_IO_handler:
|
|
G_MEMSET_handler:
|
|
j ovl234_clipmisc_entrypoint // Delay slot is harmless
|
|
load_cmds_handler:
|
|
lb $3, materialCullMode
|
|
bltz $3, run_next_DL_command // If cull mode is < 0, in mat second time, skip the load
|
|
G_RDP_handler:
|
|
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Add the second word of the command to the RDP command buffer
|
|
G_SYNC_handler:
|
|
.if CFG_PROFILING_C
|
|
addi perfCounterC, perfCounterC, 0x4000 // Increment small RDP command count
|
|
.endif
|
|
sw cmd_w0, 0(rdpCmdBufPtr) // Add the command word to the RDP command buffer
|
|
addi rdpCmdBufPtr, rdpCmdBufPtr, 8 // Increment the next RDP command pointer by 2 words
|
|
check_rdp_buffer_full_and_run_next_cmd:
|
|
sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1
|
|
bgezal dmemAddr, flush_rdp_buffer
|
|
// $1 on next instr survives flush_rdp_buffer
|
|
.if !CFG_PROFILING_A
|
|
tris_end:
|
|
.endif
|
|
.if ENABLE_PROFILING
|
|
G_LIGHTTORDP_handler:
|
|
.endif
|
|
G_SPNOOP_handler:
|
|
run_next_DL_command:
|
|
mfc0 $1, SP_STATUS // load the status word into register $1
|
|
lw cmd_w0, (inputBufferEnd)(inputBufferPos) // load the command word into cmd_w0
|
|
beqz inputBufferPos, displaylist_dma // load more DL commands if none are left
|
|
andi $1, $1, SP_STATUS_SIG0 // check if the task should yield
|
|
sra $7, cmd_w0, 24 // extract DL command byte from command word
|
|
lbu $11, (cmdMiniTable)($7) // Load mini table entry
|
|
bnez $1, load_overlay_0_and_enter // load and execute overlay 0 if yielding; $1 > 0
|
|
lw cmd_w1_dram, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1_dram
|
|
sll $11, $11, 2 // Convert to a number of instructions
|
|
.if CFG_PROFILING_C
|
|
mfc0 $10, DPC_STATUS
|
|
andi $10, $10, DPC_STATUS_GCLK_ALIVE // Sample whether GCLK is active now
|
|
sll $10, $10, 16 - 3 // move from bit 3 to bit 16
|
|
add perfCounterB, perfCounterB, $10 // Add to the perf counter
|
|
.endif
|
|
.if CFG_PROFILING_A
|
|
mfc0 $10, DPC_CLOCK
|
|
.endif
|
|
.if COUNTER_B_LOWER_CMD_COUNT
|
|
addi perfCounterB, perfCounterB, 1 // Count commands
|
|
.endif
|
|
.if CFG_PROFILING_A
|
|
move $4, perfCounterC // Save initial FIFO stall time
|
|
sw $10, startCounterTime
|
|
.endif
|
|
jr $11 // Jump to handler
|
|
addi inputBufferPos, inputBufferPos, 0x0008 // increment the DL index by 2 words
|
|
// $1 must remain zero
|
|
// $7 must retain the command byte for load_mtx and overlay 3 stuff
|
|
// $11 must contain the handler called for several handlers
|
|
|
|
G_SETxIMG_handler: // 10
|
|
lb $3, materialCullMode // Get current mode
|
|
jal segmented_to_physical // Convert image to physical address
|
|
lw $2, lastMatDLPhyAddr // Get last material physical addr
|
|
bnez $3, G_RDP_handler // If not in normal mode (0), exit
|
|
add $10, taskDataPtr, inputBufferPos // Current material physical addr
|
|
beq $10, $2, @@skip // Branch if we are executing the same mat again
|
|
sw $10, lastMatDLPhyAddr // Store material physical addr
|
|
li $7, 1 // > 0: in material first time
|
|
@@skip: // Otherwise $7 was < 0: cull mode (in mat second time)
|
|
j G_RDP_handler
|
|
sb $7, materialCullMode
|
|
|
|
.if !ENABLE_PROFILING
|
|
G_LIGHTTORDP_handler: // 9
|
|
lbu $11, numLightsxSize // Ambient light
|
|
lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size
|
|
andi $2, cmd_w0, 0x00FF // Byte 3 = alpha
|
|
sub $1, $11, $1 // Light address; byte 2 counts from end
|
|
lw $3, (lightBufferMain-1)($1) // Load light RGB into lower 3 bytes
|
|
move cmd_w0, cmd_w1_dram // Move second word to first (cmd byte, prim level)
|
|
sll $3, $3, 8 // Shift light RGB to upper 3 bytes and clear alpha byte
|
|
j G_RDP_handler // Send to RDP
|
|
or cmd_w1_dram, $3, $2 // Combine RGB and alpha in second word
|
|
.endif
|
|
|
|
G_LOAD_UCODE_handler: // 4
|
|
j load_overlay_0_and_enter // Delay slot is harmless
|
|
G_MODIFYVTX_handler:
|
|
lhu $10, (vertexTable)(cmd_w0) // Byte 3 = vtx being modified
|
|
j do_moveword // Moveword adds cmd_w0 to $10 for final addr
|
|
lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // offset in vtx, bit 15 clear
|
|
|
|
G_TRIFAN_handler: // 17
|
|
li $1, 0x8000 // $ra negative = flag for G_TRIFAN
|
|
G_TRISTRIP_handler:
|
|
addi $ra, $1, tri_strip_fan_loop // otherwise $1 == 0
|
|
addi cmd_w0, inputBufferPos, inputBufferEnd - 8 // Start pointing to cmd byte
|
|
tri_strip_fan_loop:
|
|
lw cmd_w1_dram, 0(cmd_w0) // Load tri indices to lower 3 bytes of word
|
|
addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command
|
|
beq $11, cmd_w0, tris_end // If off end of command, exit
|
|
sll $10, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit
|
|
bltz $10, tris_end // If negative, exit
|
|
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices
|
|
bltz $ra, tri_fan_store // Finish handling G_TRIFAN
|
|
addi cmd_w0, cmd_w0, 1 // Increment
|
|
andi $11, cmd_w0, 1 // If odd, this is the 1st/3rd/5th tri
|
|
bnez $11, tri_main // Draw as is
|
|
srl $10, cmd_w1_dram, 8 // Move vtx 2 to LSBs
|
|
sb cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2
|
|
j tri_main
|
|
sb $10, 7(rdpCmdBufPtr) // Store vtx 2 to spot for 3
|
|
|
|
// H = highest on screen = lowest Y value; then M = mid, L = low
|
|
tHAtF equ $v5
|
|
tMAtF equ $v7
|
|
tLAtF equ $v9
|
|
tHAtI equ $v18
|
|
tMAtI equ $v19
|
|
tLAtI equ $v21
|
|
tHPos equ $v14
|
|
tMPos equ $v2
|
|
tLPos equ $v10
|
|
tPosMmH equ $v6
|
|
tPosLmH equ $v8
|
|
tPosHmM equ $v11
|
|
|
|
G_TRI2_handler:
|
|
G_QUAD_handler:
|
|
jal tri_main // Send second tri; return here for first tri
|
|
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store second tri indices
|
|
G_TRI1_handler:
|
|
li $ra, tris_end // After done with this tri, exit tri processing
|
|
sw cmd_w0, 4(rdpCmdBufPtr) // Store first tri indices
|
|
tri_main:
|
|
lpv $v27[0], 0(rdpCmdBufPtr) // To vector unit
|
|
lbu $1, 5(rdpCmdBufPtr)
|
|
lbu $2, 6(rdpCmdBufPtr)
|
|
lbu $3, 7(rdpCmdBufPtr)
|
|
vclr vZero
|
|
lhu $1, (vertexTable)($1)
|
|
vmudn $v29, vOne, vTRC_VB // Address of vertex buffer
|
|
lhu $2, (vertexTable)($2)
|
|
vmadl $v27, $v27, vTRC_VS // Plus vtx indices times length
|
|
lhu $3, (vertexTable)($3)
|
|
vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6
|
|
.if !ENABLE_PROFILING
|
|
addi perfCounterB, perfCounterB, 0x4000 // Increment number of tris requested
|
|
move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading
|
|
.endif
|
|
tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
|
|
vnxor tHAtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding
|
|
llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
|
|
vnxor tMAtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding
|
|
llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
|
|
vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr
|
|
llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8
|
|
vnxor tLAtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding
|
|
lhu $16, VTX_CLIP($1)
|
|
vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr
|
|
lhu $7, VTX_CLIP($2)
|
|
// vnop
|
|
lhu $8, VTX_CLIP($3)
|
|
vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1
|
|
andi $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
|
|
vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr)
|
|
and $11, $11, $7
|
|
vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr)
|
|
and $11, $11, $8
|
|
vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr)
|
|
vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y
|
|
bnez $11, return_and_end_mat // Then the whole tri is offscreen, cull
|
|
// 22 cycles
|
|
vmrg tHPos, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
|
|
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
|
|
lhu $24, activeClipPlanes
|
|
vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
|
|
lw $6, geometryModeLabel // Load full geometry mode word
|
|
vge $v2, $v2, $v4[1] // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y
|
|
or $10, $16, $7
|
|
vmrg tLPos, $v6, $v4 // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2)
|
|
or $10, $10, $8 // $10 = all clip bits which are true for any verts
|
|
vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y
|
|
and $10, $10, $24 // If clipping is enabled, check clip flags
|
|
vmrg $v4, tHPos, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3)
|
|
mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended
|
|
vmrg tHPos, $v8, tHPos // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2)
|
|
bnez $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
|
|
// 30 cycles
|
|
sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull
|
|
vlt $v29, $v6, $v2 // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
|
|
srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
|
|
vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor
|
|
sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
|
|
vmrg tMPos, $v4, tLPos // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2)
|
|
bltz $11, return_and_end_mat // Cull if bit is set (culled based on facing)
|
|
// 34 cycles
|
|
vmrg tLPos, tLPos, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3)
|
|
tSubPxHF equ $v4
|
|
tSubPxHI equ $v26
|
|
vmudn tSubPxHF, tHPos, $v31[5] // 0x4000
|
|
beqz $9, return_and_end_mat // If cross product is 0, tri is degenerate (zero area), cull.
|
|
// 36 cycles
|
|
mfc2 $1, tHPos[12] // tHPos = lowest Y value = highest on screen (x, y, addr)
|
|
tPosCatI equ $v15 // 0 X L-M; 1 Y L-M; 2 X M-H; 3 X L-H; 4-7 garbage
|
|
tPosCatF equ $v25
|
|
vsub tPosMmH, tMPos, tHPos
|
|
mfc2 $2, tMPos[12] // tMPos = mid vertex (x, y, addr)
|
|
vsub tPosLmH, tLPos, tHPos
|
|
.if !ENABLE_PROFILING
|
|
sll $11, $6, 10 // Moves the value of G_SHADING_SMOOTH into the sign bit
|
|
.endif
|
|
vsub tPosHmM, tHPos, tMPos
|
|
andi $6, $6, (G_SHADE | G_ZBUFFER)
|
|
vsub tPosCatI, tLPos, tMPos
|
|
mfc2 $3, tLPos[12] // tLPos = highest Y value = lowest on screen (x, y, addr)
|
|
vmov tPosCatI[2], tPosMmH[0]
|
|
.if !CFG_NO_OCCLUSION_PLANE
|
|
and $16, $16, $7
|
|
and $16, $16, $8
|
|
andi $16, $16, CLIP_OCCLUDED
|
|
.endif
|
|
tXPF equ $v16 // Triangle cross product
|
|
tXPI equ $v17
|
|
tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
|
|
tXPRcpI equ $v24
|
|
t1WI equ $v13 // elems 0, 4, 6
|
|
t1WF equ $v14
|
|
vmudh $v29, tPosMmH, tPosLmH[0]
|
|
.if !CFG_NO_OCCLUSION_PLANE
|
|
bnez $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
|
|
.endif
|
|
llv t1WI[0], VTX_INV_W_VEC($1)
|
|
vmadh $v29, tPosLmH, tPosHmM[0]
|
|
lpv tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
|
|
vreadacc tXPI, ACC_UPPER
|
|
lpv tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
|
|
vreadacc tXPF, ACC_MIDDLE
|
|
lpv tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
|
|
vrcp $v20[0], tPosCatI[1]
|
|
.if !ENABLE_PROFILING
|
|
lpv $v25[0], VTX_COLOR_VEC($4) // Load RGB from vertex 4 (flat shading vtx)
|
|
.endif
|
|
vmov tPosCatI[3], tPosLmH[0]
|
|
llv t1WI[8], VTX_INV_W_VEC($2)
|
|
vrcph $v22[0], tXPI[1]
|
|
llv t1WI[12], VTX_INV_W_VEC($3)
|
|
vrcpl tXPRcpF[1], tXPF[1]
|
|
.if !ENABLE_PROFILING
|
|
bltz $11, tri_skip_flat_shading // Branch if G_SHADING_SMOOTH is set
|
|
.endif
|
|
vrcph tXPRcpI[1], $v31[2] // 0
|
|
.if !ENABLE_PROFILING
|
|
vlt $v29, $v31, $v31[3] // Set vcc to 11100000
|
|
vmrg tHAtI, $v25, tHAtI // RGB from $4, alpha from $1
|
|
vmrg tMAtI, $v25, tMAtI // RGB from $4, alpha from $2
|
|
vmrg tLAtI, $v25, tLAtI // RGB from $4, alpha from $3
|
|
tri_skip_flat_shading:
|
|
.endif
|
|
// 52 cycles
|
|
vrcp $v20[2], tPosMmH[1]
|
|
lb $20, (alphaCompareCullMode)($zero)
|
|
vrcph $v22[2], tPosMmH[1]
|
|
lw $16, VTX_INV_W_VEC($1) // $16, $7, $8 = 1/W for H, M, L
|
|
vrcp $v20[3], tPosLmH[1]
|
|
lw $7, VTX_INV_W_VEC($2)
|
|
vrcph $v22[3], tPosLmH[1]
|
|
lw $8, VTX_INV_W_VEC($3)
|
|
vmudl tHAtI, tHAtI, vTRC_0100 // vertex color 1 >>= 8
|
|
lbu $9, textureSettings1 + 3
|
|
vmudl tMAtI, tMAtI, vTRC_0100 // vertex color 2 >>= 8
|
|
sub $11, $16, $7 // Four instr: $16 = max($16, $7)
|
|
vmudl tLAtI, tLAtI, vTRC_0100 // vertex color 3 >>= 8
|
|
sra $10, $11, 31
|
|
vmudl $v29, $v20, vTRC_0020
|
|
// no nop if tri_skip_flip_facing was unaligned
|
|
vmadm $v22, $v22, vTRC_0020
|
|
beqz $20, tri_skip_alpha_compare_cull
|
|
vmadn $v20, $v31, $v31[2] // 0
|
|
// Alpha compare culling
|
|
vge $v26, tHAtI, tMAtI
|
|
lbu $19, alphaCompareCullThresh
|
|
vlt $v27, tHAtI, tMAtI
|
|
bgtz $20, @@skip1
|
|
vge $v26, $v26, tLAtI // If alphaCompareCullMode > 0, $v26 = max of 3 verts
|
|
vlt $v26, $v27, tLAtI // else if < 0, $v26 = min of 3 verts
|
|
@@skip1: // $v26 elem 3 has max or min alpha value
|
|
mfc2 $24, $v26[6]
|
|
sub $24, $24, $19 // sign bit set if (max/min) < thresh
|
|
xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull
|
|
bltz $24, return_and_end_mat // if max < thresh or if min >= thresh.
|
|
tri_skip_alpha_compare_cull:
|
|
// 63 cycles
|
|
vmudm tPosCatF, tPosCatI, vTRC_1000
|
|
// no nop if tri_skip_alpha_compare_cull was unaligned
|
|
vmadn tPosCatI, $v31, $v31[2] // 0
|
|
and $11, $11, $10
|
|
vsubc tSubPxHF, vZero, tSubPxHF
|
|
sub $16, $16, $11
|
|
vsub tSubPxHI, vZero, vZero
|
|
sub $11, $16, $8 // Four instr: $16 = max($16, $8)
|
|
vmudm $v29, tPosCatF, $v20
|
|
sra $10, $11, 31
|
|
vmadl $v29, tPosCatI, $v20
|
|
and $11, $11, $10
|
|
vmadn $v20, tPosCatI, $v22
|
|
sub $16, $16, $11
|
|
vmadh tPosCatI, tPosCatF, $v22
|
|
sw $16, 0x0010(rdpCmdBufPtr) // Store max of three verts' 1/W to temp mem
|
|
vmudl $v29, tXPRcpF, tXPF
|
|
tMx1W equ $v27
|
|
llv tMx1W[0], 0x0010(rdpCmdBufPtr) // Load max of three verts' 1/W
|
|
vmadm $v29, tXPRcpI, tXPF
|
|
mfc2 $16, tXPI[1]
|
|
vmadn tXPF, tXPRcpF, tXPI
|
|
lbu $7, textureSettings1 + 2
|
|
vmadh tXPI, tXPRcpI, tXPI
|
|
lsv tMAtI[14], VTX_SCR_Z($2)
|
|
vand $v22, $v20, vTRC_FFF8
|
|
lsv tLAtI[14], VTX_SCR_Z($3)
|
|
vcr tPosCatI, tPosCatI, vTRC_0100
|
|
lsv tMAtF[14], VTX_SCR_Z_FRAC($2)
|
|
vmudh $v29, vOne, $v31[4] // 4
|
|
lsv tLAtF[14], VTX_SCR_Z_FRAC($3)
|
|
vmadn tXPF, tXPF, $v31[0] // -4
|
|
ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
|
|
vmadh tXPI, tXPI, $v31[0] // -4
|
|
or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
|
|
vmudn $v29, $v3, tHPos[0]
|
|
sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
|
|
vmadl $v29, $v22, tSubPxHF[1]
|
|
ssv tLPos[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient
|
|
vmadm $v29, tPosCatI, tSubPxHF[1]
|
|
ssv tMPos[2], 0x0004(rdpCmdBufPtr) // Store YM edge coefficient
|
|
vmadn $v2, $v22, tSubPxHI[1]
|
|
ssv tHPos[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient
|
|
vmadh $v3, tPosCatI, tSubPxHI[1]
|
|
lw $19, otherMode1
|
|
tMnWI equ $v27
|
|
tMnWF equ $v10
|
|
vrcph $v29[0], tMx1W[0] // Reciprocal of max 1/W = min W
|
|
andi $10, $16, 0x0080 // Extract the left major flag from $16
|
|
vrcpl tMnWF[0], tMx1W[1]
|
|
or $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
|
|
vmudh t1WF, vOne, t1WI[1q]
|
|
sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
|
|
vrcph tMnWI[0], $v31[2] // 0
|
|
sb $zero, materialCullMode // This covers tri write out
|
|
tSTWHMI equ $v22 // H = elems 0-2, M = elems 4-6; init W = 7FFF
|
|
tSTWHMF equ $v25
|
|
vmudh tSTWHMI, vOne, $v31[7] // 0x7FFF
|
|
ssv tPosMmH[2], 0x0030(rdpCmdBufPtr) // MmHY -> first short (temp mem)
|
|
vmudm $v29, t1WI, tMnWF[0] // 1/W each vtx * min W = 1 for one of the verts, < 1 for others
|
|
llv tSTWHMI[0], VTX_TC_VEC($1)
|
|
vmadl $v29, t1WF, tMnWF[0]
|
|
ssv tPosLmH[0], 0x0032(rdpCmdBufPtr) // LmHX -> second short (temp mem)
|
|
vmadn t1WF, t1WF, tMnWI[0]
|
|
llv tSTWHMI[8], VTX_TC_VEC($2)
|
|
vmadh t1WI, t1WI, tMnWI[0]
|
|
ssv tPosHmM[0], 0x0034(rdpCmdBufPtr) // HmMX -> third short (temp mem)
|
|
tSTWLI equ $v10 // L = elems 4-6; init W = 7FFF
|
|
tSTWLF equ $v13
|
|
vmudh tSTWLI, vOne, $v31[7] // 0x7FFF
|
|
andi $19, $19, ZMODE_DEC // Mask to two Z mode bits
|
|
set_vcc_11110001 // select RGBA___Z or ____STW_
|
|
llv tSTWLI[8], VTX_TC_VEC($3)
|
|
vmudm $v29, tSTWHMI, t1WF[0h] // (S, T, 7FFF) * (1 or <1) for H and M
|
|
addi $19, $19, -ZMODE_DEC // Check if equal to decal mode
|
|
vmadh tSTWHMI, tSTWHMI, t1WI[0h]
|
|
ldv tPosLmH[8], 0x0030(rdpCmdBufPtr) // MmHY -> e4, LmHX -> e5, HmMX -> e6
|
|
vmadn tSTWHMF, $v31, $v31[2] // 0
|
|
vmudm $v29, tSTWLI, t1WF[6] // (S, T, 7FFF) * (1 or <1) for L
|
|
vmadh tSTWLI, tSTWLI, t1WI[6]
|
|
vmadn tSTWLF, $v31, $v31[2] // 0
|
|
sdv tSTWHMI[0], 0x0020(rdpCmdBufPtr) // Move S, T, W Hi Int to temp mem
|
|
vmrg tMAtI, tMAtI, tSTWHMI // Merge S, T, W Mid into elems 4-6
|
|
sdv tSTWHMF[0], 0x0028(rdpCmdBufPtr) // Move S, T, W Hi Frac to temp mem
|
|
vmrg tMAtF, tMAtF, tSTWHMF // Merge S, T, W Mid into elems 4-6
|
|
ldv tHAtI[8], 0x0020(rdpCmdBufPtr) // Move S, T, W Hi Int from temp mem
|
|
vmrg tLAtI, tLAtI, tSTWLI // Merge S, T, W Low into elems 4-6
|
|
ldv tHAtF[8], 0x0028(rdpCmdBufPtr) // Move S, T, W Hi Frac from temp mem
|
|
vmrg tLAtF, tLAtF, tSTWLF // Merge S, T, W Low into elems 4-6
|
|
.if !ENABLE_PROFILING
|
|
addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
|
|
.endif
|
|
// 106 cycles
|
|
vmudl $v29, tXPF, tXPRcpF
|
|
lsv tHAtF[14], VTX_SCR_Z_FRAC($1)
|
|
vmadm $v29, tXPI, tXPRcpF
|
|
lsv tHAtI[14], VTX_SCR_Z($1) // contains R, G, B, A, S, T, W, Z
|
|
vmadn tXPRcpF, tXPF, tXPRcpI
|
|
lh $1, VTX_SCR_VEC($2)
|
|
vmadh tXPRcpI, tXPI, tXPRcpI
|
|
addi $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients)
|
|
vmudh tPosLmH, tPosLmH, $v31[0h] // e1 LmHY * -4 = 4*HmLY; e456 MmHY,LmHX,HmMX *= 4
|
|
tAtLmHF equ $v10
|
|
tAtLmHI equ $v9
|
|
tAtMmHF equ $v13
|
|
tAtMmHI equ $v7
|
|
vsubc tAtLmHF, tLAtF, tHAtF
|
|
andi $3, $6, G_SHADE
|
|
vsub tAtLmHI, tLAtI, tHAtI
|
|
sll $1, $1, 14
|
|
vsubc tAtMmHF, tMAtF, tHAtF
|
|
sw $1, 0x0008(rdpCmdBufPtr) // Store XL edge coefficient
|
|
vsub tAtMmHI, tMAtI, tHAtI
|
|
ssv $v3[6], 0x0010(rdpCmdBufPtr) // Store XH edge coefficient (integer part)
|
|
// DaDx = (v3 - v1) * factor + (v2 - v1) * factor
|
|
tDaDxF equ $v2
|
|
tDaDxI equ $v3
|
|
vmudn $v29, tAtLmHF, tPosLmH[4] // MmHY * 4
|
|
ssv $v2[6], 0x0012(rdpCmdBufPtr) // Store XH edge coefficient (fractional part)
|
|
vmadh $v29, tAtLmHI, tPosLmH[4] // MmHY * 4
|
|
ssv $v3[4], 0x0018(rdpCmdBufPtr) // Store XM edge coefficient (integer part)
|
|
vmadn $v29, tAtMmHF, tPosLmH[1] // LmHY * -4 = HmLY * 4
|
|
ssv $v2[4], 0x001A(rdpCmdBufPtr) // Store XM edge coefficient (fractional part)
|
|
vmadh $v29, tAtMmHI, tPosLmH[1] // LmHY * -4 = HmLY * 4
|
|
ssv tPosCatI[0], 0x000C(rdpCmdBufPtr) // Store DxLDy edge coefficient (integer part)
|
|
vreadacc tDaDxF, ACC_MIDDLE
|
|
ssv $v20[0], 0x000E(rdpCmdBufPtr) // Store DxLDy edge coefficient (fractional part)
|
|
vreadacc tDaDxI, ACC_UPPER
|
|
ssv tPosCatI[6], 0x0014(rdpCmdBufPtr) // Store DxHDy edge coefficient (integer part)
|
|
// DaDy = (v2 - v1) * factor + (v3 - v1) * factor
|
|
tDaDyF equ $v6
|
|
tDaDyI equ $v7
|
|
vmudn $v29, tAtMmHF, tPosLmH[5] // LmHX * 4
|
|
ssv $v20[6], 0x0016(rdpCmdBufPtr) // Store DxHDy edge coefficient (fractional part)
|
|
vmadh $v29, tAtMmHI, tPosLmH[5] // LmHX * 4
|
|
ssv tPosCatI[4], 0x001C(rdpCmdBufPtr) // Store DxMDy edge coefficient (integer part)
|
|
vmadn $v29, tAtLmHF, tPosLmH[6] // HmMX * 4
|
|
ssv $v20[4], 0x001E(rdpCmdBufPtr) // Store DxMDy edge coefficient (fractional part)
|
|
vmadh $v29, tAtLmHI, tPosLmH[6] // HmMX * 4
|
|
sll $11, $3, 4 // Shift (geometry mode & G_SHADE) by 4 to get 0x40 if G_SHADE is set
|
|
vreadacc tDaDyF, ACC_MIDDLE
|
|
add $1, $2, $11 // Increment the triangle pointer by 0x40 bytes (shade coefficients) if G_SHADE is set
|
|
vreadacc tDaDyI, ACC_UPPER
|
|
sll $11, $9, 5 // Shift texture enabled (which is 2 when on) by 5 to get 0x40 if textures are on
|
|
// DaDx, DaDy *= more factors
|
|
vmudl $v29, tDaDxF, tXPRcpF[1]
|
|
add rdpCmdBufPtr, $1, $11 // Increment the triangle pointer by 0x40 bytes (texture coefficients) if textures are on
|
|
vmadm $v29, tDaDxI, tXPRcpF[1]
|
|
andi $6, $6, G_ZBUFFER // Get the value of G_ZBUFFER from the current geometry mode
|
|
vmadn tDaDxF, tDaDxF, tXPRcpI[1]
|
|
sll $11, $6, 4 // Shift (geometry mode & G_ZBUFFER) by 4 to get 0x10 if G_ZBUFFER is set
|
|
vmadh tDaDxI, tDaDxI, tXPRcpI[1]
|
|
move $10, rdpCmdBufPtr // Write Z here
|
|
vmudl $v29, tDaDyF, tXPRcpF[1]
|
|
add rdpCmdBufPtr, rdpCmdBufPtr, $11 // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set
|
|
vmadm $v29, tDaDyI, tXPRcpF[1]
|
|
sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1 // Check if we need to write out to RDP
|
|
vmadn tDaDyF, tDaDyF, tXPRcpI[1]
|
|
sdv tDaDxF[0], 0x0018($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional)
|
|
vmadh tDaDyI, tDaDyI, tXPRcpI[1]
|
|
sdv tDaDxI[0], 0x0008($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (integer)
|
|
// DaDe = DaDx * factor
|
|
tDaDeF equ $v8
|
|
tDaDeI equ $v9
|
|
// 135 cycles
|
|
vmadl $v29, tDaDxF, $v20[3]
|
|
sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional)
|
|
vmadm $v29, tDaDxI, $v20[3]
|
|
sdv tDaDxI[8], 0x0008($1) // Store DsDx, DtDx, DwDx texture coefficients (integer)
|
|
vmadn tDaDeF, tDaDxF, tPosCatI[3]
|
|
sdv tDaDyF[0], 0x0038($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (fractional)
|
|
vmadh tDaDeI, tDaDxI, tPosCatI[3]
|
|
sdv tDaDyI[0], 0x0028($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (integer)
|
|
// Base value += DaDe * factor
|
|
vmudn $v29, tHAtF, vOne[0]
|
|
sdv tDaDyF[8], 0x0038($1) // Store DsDy, DtDy, DwDy texture coefficients (fractional)
|
|
vmadh $v29, tHAtI, vOne[0]
|
|
sdv tDaDyI[8], 0x0028($1) // Store DsDy, DtDy, DwDy texture coefficients (integer)
|
|
vmadl $v29, tDaDeF, tSubPxHF[1]
|
|
sdv tDaDeF[0], 0x0030($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (fractional)
|
|
vmadm $v29, tDaDeI, tSubPxHF[1]
|
|
sdv tDaDeI[0], 0x0020($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (integer)
|
|
vmadn tHAtF, tDaDeF, tSubPxHI[1]
|
|
sdv tDaDeF[8], 0x0030($1) // Store DsDe, DtDe, DwDe texture coefficients (fractional)
|
|
vmadh tHAtI, tDaDeI, tSubPxHI[1]
|
|
sdv tDaDeI[8], 0x0020($1) // Store DsDe, DtDe, DwDe texture coefficients (integer)
|
|
// All values start in element 7. "a", attribute, is Z. Need
|
|
// tHAtI, tHAtF, tDaDxI, tDaDxF, tDaDeI, tDaDeF, tDaDyI, tDaDyF
|
|
// VCC is still 11110001
|
|
// 145 cycles
|
|
vmrg tDaDyI, tDaDyF, tDaDyI[7] // Elems 6-7: DzDyI:F
|
|
beqz $19, tri_decal_fix_z
|
|
vmrg tDaDxI, tDaDxF, tDaDxI[7] // Elems 6-7: DzDxI:F
|
|
tri_return_from_decal_fix_z:
|
|
vmrg tDaDeI, tDaDeF, tDaDeI[7] // Elems 6-7: DzDeI:F
|
|
sdv tHAtF[0], 0x0010($2) // Store RGBA shade color (fractional)
|
|
vmrg $v10, tHAtF, tHAtI[7] // Elems 6-7: ZI:F
|
|
sdv tHAtI[0], 0x0000($2) // Store RGBA shade color (integer)
|
|
sdv tHAtF[8], 0x0010($1) // Store S, T, W texture coefficients (fractional)
|
|
sdv tHAtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer)
|
|
slv tDaDyI[12], 0x0C($10) // DzDyI:F
|
|
slv tDaDxI[12], 0x04($10) // DzDxI:F
|
|
slv tDaDeI[12], 0x08($10) // DzDeI:F
|
|
bltz dmemAddr, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
|
|
slv $v10[12], 0x00($10) // ZI:F
|
|
// 156 cycles
|
|
flush_rdp_buffer: // Prereq: dmemAddr = rdpCmdBufPtr - rdpCmdBufEndP1, or dmemAddr = large neg num -> only wait and set DPC_END
|
|
mfc0 $11, SP_DMA_BUSY // Check if any DMA is in flight
|
|
lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write
|
|
lw $10, OSTask + OSTask_output_buff_size // Load FIFO "size" (actually end addr)
|
|
.if CFG_PROFILING_C
|
|
// This is a wait for DMA busy loop, but written inline to avoid overwriting ra.
|
|
addi perfCounterD, perfCounterD, 7 // 6 instr + 1 taken branch
|
|
.endif
|
|
bnez $11, flush_rdp_buffer // Wait until no DMAs are active
|
|
addi dmaLen, dmemAddr, RDP_CMD_BUFSIZE + 8 // dmaLen = size of DMEM buffer to copy
|
|
blez dmaLen, old_return_routine // Exit if nothing to copy, or if dmemAddr is large negative num from last flush DMA write
|
|
mtc0 cmd_w1_dram, DPC_END // Set RDP to execute until FIFO end (buf pushed last time)
|
|
add $11, cmd_w1_dram, dmaLen // $11 = future FIFO pointer if we append this new buffer
|
|
sub $10, $10, $11 // $10 = FIFO end addr - future pointer
|
|
bgez $10, @@has_room // Branch if we can fit this
|
|
@@await_rdp_dblbuf_avail:
|
|
mfc0 $11, DPC_STATUS // Read RDP status
|
|
andi $11, $11, DPC_STATUS_START_VALID // Start valid = second start addr in dbl buf
|
|
bnez $11, @@await_rdp_dblbuf_avail // Wait until double buffered start/end available
|
|
.if COUNTER_C_FIFO_FULL
|
|
addi perfCounterC, perfCounterC, 7 // 4 instr + 2 after mfc + 1 taken branch
|
|
.endif
|
|
lw cmd_w1_dram, OSTask + OSTask_output_buff // Start of FIFO
|
|
@@await_past_first_instr:
|
|
mfc0 $11, DPC_CURRENT // Load RDP current pointer
|
|
beq $11, cmd_w1_dram, @@await_past_first_instr // Wait until RDP moved past start
|
|
.if COUNTER_C_FIFO_FULL
|
|
addi perfCounterC, perfCounterC, 6 // 3 instr + 2 after mfc + 1 taken branch
|
|
.else
|
|
nop
|
|
.endif
|
|
// Start was previously the start of the FIFO, unless this is the first buffer,
|
|
// in which case it was the end of the FIFO. Normally, when the RDP gets to end, if we
|
|
// have a new end value waiting (END_VALID), it'll load end but leave current. By
|
|
// setting start here, it will also load current with start.
|
|
mtc0 cmd_w1_dram, DPC_START // Set RDP start to start of FIFO
|
|
@@keep_waiting:
|
|
.if COUNTER_C_FIFO_FULL
|
|
// This is here so we only count it when stalling below or on FIFO end codepath
|
|
addi perfCounterC, perfCounterC, 10 // 7 instr + 2 after mfc + 1 taken branch
|
|
.endif
|
|
@@has_room:
|
|
mfc0 $11, DPC_CURRENT // Load RDP current pointer
|
|
sub $11, $11, cmd_w1_dram // Current - current end (rdpFifoPos or start)
|
|
blez $11, @@copy_buffer // Current is behind or at current end, can do copy
|
|
sub $11, $11, dmaLen // If amount current is ahead of current end
|
|
blez $11, @@keep_waiting // is <= size of buffer to copy, keep waiting
|
|
@@copy_buffer:
|
|
add $11, cmd_w1_dram, dmaLen // New end is current end + buffer size
|
|
sw $11, rdpFifoPos
|
|
// Set up the DMA from DMEM to the RDP fifo in RDRAM
|
|
addi dmaLen, dmaLen, -1 // subtract 1 from the length
|
|
addi dmemAddr, rdpCmdBufEndP1, -(0x2000 | (RDP_CMD_BUFSIZE + 8)) // The 0x2000 is meaningless, negative means write
|
|
xori rdpCmdBufEndP1, rdpCmdBufEndP1, rdpCmdBuffer1EndPlus1Word ^ rdpCmdBuffer2EndPlus1Word // Swap between the two RDP command buffers
|
|
j dma_read_write
|
|
addi rdpCmdBufPtr, rdpCmdBufEndP1, -(RDP_CMD_BUFSIZE + 8)
|
|
|
|
tri_decal_fix_z:
|
|
// Valid range of tHAtI = 0 to 7FFF, but most of the scene is large values
|
|
vmudh $v29, vOne, vTRC_DO // accum all elems = -DM/2
|
|
vmadm $v25, tHAtI, vTRC_DM // elem 7 = (0 to DM/2-1) - DM/2 = -DM/2 to -1
|
|
vcr tDaDyI, tDaDyI, $v25[7] // Clamp DzDyI (6) to <= -val or >= val; clobbers DzDyF (7)
|
|
j tri_return_from_decal_fix_z
|
|
set_vcc_11110001 // Clobbered by vcr
|
|
|
|
tri_culled_by_occlusion_plane:
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterB, perfCounterB, 0x4000
|
|
.endif
|
|
return_and_end_mat:
|
|
jr $ra
|
|
sb $zero, materialCullMode // This covers all tri early exits except clipping
|
|
|
|
align_with_warning 8, "One instruction of padding before ovl234"
|
|
|
|
vtx_select_lighting:
|
|
.if CFG_PROFILING_B
|
|
srl $11, vtxLeft, 4 // Vertex count
|
|
add perfCounterA, perfCounterA, $11 // Add to number of lit vertices
|
|
.endif
|
|
bltz viLtFlag, ovl234_ltadv_entrypoint // Advanced lighting if have point lights
|
|
andi $10, vGeomMid, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
|
|
bnez $10, ovl234_ltadv_entrypoint // Advanced lighting if specular or Fresnel
|
|
lb viLtFlag, dirLightsXfrmValid
|
|
// Fallthrough to ltbasic on whichever overlay is loaded
|
|
|
|
.if (. & 4)
|
|
.error "vtx_select_lighting must be an even number of instructions"
|
|
.endif
|
|
ovl234_start:
|
|
|
|
ovl3_start:
|
|
// Clipping overlay.
|
|
|
|
// Jump here for basic lighting setup. If overlay 3 is loaded (this code), loads overlay 2
|
|
// and jumps to right here, which is now in the new code.
|
|
ovl234_ltbasic_entrypoint_ovl3ver: // same IMEM address as ovl234_ltbasic_entrypoint
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterC, perfCounterC, 1 // Count lighting overlay load
|
|
.endif
|
|
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
|
|
li cmd_w1_dram, orga(ovl2_start) // set up a load for overlay 2
|
|
|
|
// Jump here for advanced lighting. If overlay 3 is loaded (this code), loads
|
|
// overlay 4 and jumps to right here, which is now in the new code.
|
|
ovl234_ltadv_entrypoint_ovl3ver: // same IMEM address as ovl234_ltadv_entrypoint
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterD, perfCounterD, 1 // Count overlay 4 load
|
|
.endif
|
|
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
|
|
li cmd_w1_dram, orga(ovl4_start) // set up a load for overlay 4
|
|
|
|
// Jump here for clipping and rare commands. If overlay 3 is loaded (this code), directly starts
|
|
// the clipping code.
|
|
ovl234_clipmisc_entrypoint:
|
|
sh $ra, tempTriRA // Tri return after clipping
|
|
.if CFG_PROFILING_B
|
|
nop // Needs to take up the space for the other perf counter
|
|
.endif
|
|
bnez $1, vtx_constants_for_clip // In clipping, $1 is vtx 1 addr, never 0. Cmd dispatch, $1 = 0.
|
|
li inVtx, 0x8000 // inVtx < 0 means from clipping. Inc'd each vtx write by 2 * inputVtxSize, but this is large enough it should stay negative.
|
|
lw cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Overwritten by overlay load
|
|
li $3, (0xFF00 | G_MTX)
|
|
beq $3, $7, g_mtx_push_ovl3
|
|
li $11, (0xFF00 | G_MEMSET)
|
|
beq $11, $7, g_memset_ovl3
|
|
li $3, (0xFF00 | G_DMA_IO)
|
|
beq $3, $7, g_dma_io_ovl3
|
|
g_popmtx_ovl3: // otherwise
|
|
lw $11, matrixStackPtr // Current matrix stack pointer
|
|
lw $2, OSTask + OSTask_dram_stack // Top of the stack
|
|
sub cmd_w1_dram, $11, cmd_w1_dram // Decrease pointer by amount in command
|
|
sub $3, cmd_w1_dram, $2 // Is it still valid / within the stack?
|
|
bgez $3, @@skip // If so, skip the failsafe
|
|
sh $zero, mvpValid // and dirLightsXfrmValid; mark both mtx and dir lts invalid
|
|
move cmd_w1_dram, $2 // Use the top of the stack as the new pointer
|
|
@@skip:
|
|
j do_movemem // Must keep $1 = 0
|
|
sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer
|
|
|
|
g_mtx_push_ovl3:
|
|
lw cmd_w1_dram, matrixStackPtr // Set up the DMA from dmem to rdram at the matrix stack pointer
|
|
li dmemAddr, (mMatrix | 0x8000) // mMatrix, negative = write
|
|
jal dma_read_write // DMA the current matrix from dmem to rdram
|
|
li dmaLen, 0x0040 - 1 // Set the DMA length to the size of a matrix (minus 1 because DMA is inclusive)
|
|
addi cmd_w1_dram, cmd_w1_dram, 0x40 // Increase the matrix stack pointer by the size of one matrix
|
|
sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer
|
|
j load_mtx
|
|
lw cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Load command word 1 again
|
|
|
|
g_dma_io_ovl3:
|
|
jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one
|
|
lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
|
|
andi dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
|
|
j dma_and_wait_goto_next_command // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
|
|
// At this point, dmemAddr's highest bit is the flag, it's next 13 bits are the DMEM address, and then it's last two bits are the upper 2 of size
|
|
// So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit
|
|
sra dmemAddr, dmemAddr, 2
|
|
|
|
clip_after_constants:
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterB, perfCounterB, 1 // Increment clipped (input) tris count
|
|
.endif
|
|
// Clear all temp vertex slots used.
|
|
li $11, (MAX_CLIP_GEN_VERTS - 1) * vtxSize
|
|
clip_init_used_loop:
|
|
sh $zero, (VTX_CLIP + clipTempVerts)($11)
|
|
bgtz $11, clip_init_used_loop
|
|
addi $11, $11, -vtxSize
|
|
li clipMaskIdx, 4 // 4=screen, 3=+x, 2=-x, 1=+y, 0=-y
|
|
li clipMask, CLIP_CAMPLANE // Initial clip mask for screen clipping
|
|
li clipPolySelect, 6 // Everything being indexed from 6 saves one instruction at the end of the loop
|
|
sh $1, (clipPoly - 6 + 0)(clipPolySelect) // Write the current three verts
|
|
sh $2, (clipPoly - 6 + 2)(clipPolySelect) // as the initial polygon
|
|
sh $3, (clipPoly - 6 + 4)(clipPolySelect) // Initial state $3 = clipVLastOfsc
|
|
sh $zero, (clipPoly)(clipPolySelect) // nullptr to mark end of polygon
|
|
sb $zero, materialCullMode // In case only/all tri(s) clip then offscreen
|
|
// Available locals here: $11, $1, $7, $20, $24, $10
|
|
clip_condlooptop:
|
|
lhu clipFlags, VTX_CLIP(clipVLastOfsc) // Load flags for final vertex of the last polygon
|
|
addi clipPolyRead, clipPolySelect, -6 // Start reading at the beginning of the old polygon
|
|
xori clipPolySelect, clipPolySelect, 6 ^ ((clipPoly2 - clipPoly) + 6) // Swap to the other polygon memory
|
|
addi clipPolyWrite, clipPolySelect, -6 // Start writing at the beginning of the new polygon
|
|
and clipFlags, clipFlags, clipMask // Mask last flags to current clip condition
|
|
clip_edgelooptop: // Loop over edges connecting verts, possibly subdivide the edge
|
|
lhu clipVNext, (clipPoly)(clipPolyRead) // Read next vertex (farther end of edge)
|
|
addi clipPolyRead, clipPolyRead, 0x0002 // Increment read pointer
|
|
beqz clipVNext, clip_nextcond // If next vtx is nullptr, done with input polygon
|
|
lhu $11, VTX_CLIP(clipVNext) // Load flags for next vtx
|
|
and $11, $11, clipMask // Mask next flags to current clip condition
|
|
beq $11, clipFlags, clip_nextedge // Both set or both clear = both off screen or both on screen, no subdivision
|
|
move clipFlags, $11 // clipFlags = masked next vtx's flags
|
|
// Going to subdivide this edge. Find available temp vertex slot.
|
|
li outVtxBase, clipTempVertsEnd
|
|
clip_find_unused_loop:
|
|
lhu $11, (VTX_CLIP - vtxSize)(outVtxBase)
|
|
addi $10, outVtxBase, -clipTempVerts // This is within the loop rather than before b/c delay after lhu
|
|
blez $10, clip_done // If can't find one (should never happen), give up
|
|
andi $11, $11, CLIP_VTX_USED
|
|
bnez $11, clip_find_unused_loop
|
|
addi outVtxBase, outVtxBase, -vtxSize
|
|
beqz clipFlags, clip_skipswap23 // Next vtx flag is clear / on screen,
|
|
move clipVOnsc, clipVNext // therefore last vtx is set / off screen
|
|
move clipVOnsc, clipVLastOfsc // Otherwise swap; note we are overwriting
|
|
move clipVLastOfsc, clipVNext // clipVLastOfsc but not clipVNext
|
|
clip_skipswap23:
|
|
// Interpolate between clipVLastOfsc and clipVOns; create a new vertex which is on the
|
|
// clipping boundary (e.g. at the screen edge)
|
|
cPosOnOfF equ vpClpF
|
|
cPosOnOfI equ vpClpI
|
|
cPosOfF equ vpScrF
|
|
cPosOfI equ vpScrI
|
|
cRGBAOf equ vpLtTot
|
|
cRGBAOn equ vpRGBA
|
|
cSTOf equ vpST
|
|
cSTOn equ sSTS // Intentionally overwriting this kept reg. Vtx scales ST again, need to re-store unscaled value.
|
|
// Also uses sRTF, sRTI = vTemp1, vTemp2, and vtx_final_setup_for_clip sets sOPM = vKept2
|
|
cTemp equ vpMdl
|
|
cBaseF equ vpNrmlX
|
|
cBaseI equ vpNrmlY
|
|
cDiffF equ $v2
|
|
cDiffI equ $v3
|
|
cRRF equ $v4 // Range Reduction frac
|
|
cRRI equ $v5 // Range Reduction int
|
|
cFadeOf equ $v6
|
|
cFadeOn equ $v7
|
|
/*
|
|
Five clip conditions (these are in a different order from vanilla):
|
|
cBaseI/cBaseF[3] cDiffI/cDiffF[3]
|
|
4 W=0: W1 W1 - W2
|
|
3 +X : X1 - 2*W1 (X1 - 2*W1) - (X2 - 2*W2) <- the 2 is clip ratio
|
|
2 -X : X1 + 2*W1 (X1 + 2*W1) - (X2 + 2*W2)
|
|
1 +Y : Y1 - 2*W1 (Y1 - 2*W1) - (Y2 - 2*W2)
|
|
0 -Y : Y1 + 2*W1 (Y1 + 2*W1) - (Y2 + 2*W2)
|
|
*/
|
|
xori $11, clipMaskIdx, 1 // Invert sign of condition
|
|
ldv cPosOnOfF[0], VTX_FRAC_VEC(clipVOnsc)
|
|
ctc2 $11, $vcc // Conditions 1 (+y) or 3 (+x) -> vcc[0] = 0
|
|
ldv cPosOnOfI[0], VTX_INT_VEC (clipVOnsc)
|
|
vmrg cTemp, vOne, $v31[1] // elem 0 is 1 if W or neg cond, -1 if pos cond
|
|
andi $11, clipMaskIdx, 4 // W condition and screen clipping
|
|
ldv cPosOnOfF[8], VTX_FRAC_VEC(clipVLastOfsc) // Off screen to elems 4-7
|
|
bnez $11, clip_w // If so, use 1 or -1
|
|
ldv cPosOnOfI[8], VTX_INT_VEC (clipVLastOfsc)
|
|
vmudh cTemp, cTemp, $v31[3] // elem 0 is (1 or -1) * 2 (clip ratio)
|
|
andi $11, clipMaskIdx, 2 // Conditions 2 (-x) or 3 (+x)
|
|
vmudm cBaseF, vOne, cPosOnOfF[0h] // Set accumulator (care about 3, 7) to X
|
|
bnez $11, clip_skipy
|
|
vmadh cBaseI, vOne, cPosOnOfI[0h]
|
|
vmudm cBaseF, vOne, cPosOnOfF[1h] // Discard that and set accumulator 3, 7 to Y
|
|
vmadh cBaseI, vOne, cPosOnOfI[1h]
|
|
clip_skipy:
|
|
vmadn cBaseF, cPosOnOfF, cTemp[0] // + W * +/- 2
|
|
vmadh cBaseI, cPosOnOfI, cTemp[0]
|
|
clip_skipxy:
|
|
vsubc cDiffF, cBaseF, cBaseF[7] // Vtx on screen - vtx off screen
|
|
vsub cDiffI, cBaseI, cBaseI[7]
|
|
// This is computing cDiffI:F = cBaseI:F / cDiffI:F to high precision.
|
|
// The first step is a range reduction, where cRRF becomes a scale factor
|
|
// (roughly min(1.0f, abs(1.0f / cDiffI:F))) which scales down cDiffI:F (denominator)
|
|
// Then the reciprocal of cDiffI:F is computed with a Newton-Raphson iteration
|
|
// and multiplied by cBaseI:F. Finally scale down the result (numerator) by cRRF.
|
|
vor cTemp, cDiffI, vOne[0] // Round up int sum to odd; this ensures the value is not 0, otherwise vabs result will be 0 instead of +/- 2
|
|
sub $11, clipPolyWrite, clipPolySelect // Make sure we are not overflowing
|
|
vrcph cRRI[3], cDiffI[3]
|
|
addi $11, $11, 6 - ((MAX_CLIP_POLY_VERTS) * 2) // Write ptr to last zero slot
|
|
vrcpl cRRF[3], cDiffF[3] // 1 / (x+y+z+w), vtx on screen - vtx off screen
|
|
bgez $11, clip_done // If so, give up
|
|
vrcph cRRI[3], $v31[2] // 0; get int result of reciprocal
|
|
vabs cTemp, cTemp, $v31[3] // 2; cTemp = +/- 2 based on sum positive (incl. zero) or negative
|
|
lhu $11, VTX_CLIP(clipVLastOfsc) // Load clip flags for off screen vert
|
|
vmudn cRRF, cRRF, cTemp[3] // multiply reciprocal by +/- 2
|
|
sh outVtxBase, (clipPoly)(clipPolyWrite) // Write pointer to generated vertex to polygon
|
|
vmadh cRRI, cRRI, cTemp[3]
|
|
addi clipPolyWrite, clipPolyWrite, 2 // Increment write ptr
|
|
veq cRRI, cRRI, $v31[2] // 0; if RR int part is 0
|
|
andi $11, $11, ~CLIP_VTX_USED // Clear used flag from off screen vert
|
|
vmrg cRRF, cRRF, $v31[1] // keep RR frac, otherwise set frac to 0xFFFF (max)
|
|
sh $11, VTX_CLIP(clipVLastOfsc) // Store modified clip flags for off screen vert
|
|
vmudl $v29, cDiffF, cRRF[3] // Multiply clDiffI:F by RR frac*frac
|
|
ldv cPosOfF[0], VTX_FRAC_VEC (clipVLastOfsc) // Off screen loaded above, but need
|
|
vmadm cDiffI, cDiffI, cRRF[3] // int*frac, int out
|
|
ldv cPosOfI[0], VTX_INT_VEC (clipVLastOfsc) // it in elems 0-3 for interp
|
|
vmadn cDiffF, $v31, $v31[2] // 0; get frac out
|
|
luv cRGBAOf[0], VTX_COLOR_VEC(clipVLastOfsc)
|
|
vrcph sRTI[3], cDiffI[3] // Reciprocal of new scaled cDiff (discard)
|
|
luv cRGBAOn[0], VTX_COLOR_VEC(clipVOnsc)
|
|
vrcpl sRTF[3], cDiffF[3] // frac part
|
|
llv cSTOf[0], VTX_TC_VEC (clipVLastOfsc)
|
|
vrcph sRTI[3], $v31[2] // 0; int part
|
|
llv cSTOn[0], VTX_TC_VEC (clipVOnsc) // Must be before vtx_final_setup_for_clip
|
|
vmudl $v29, sRTF, cDiffF // D*R (see Newton-Raphson explanation)
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
li vtxLeft, -1 // vtxLeft < 0 triggers vtx_epilogue
|
|
.else
|
|
li vtxLeft, inputVtxSize // but trigger this on the second loop in this version
|
|
.endif
|
|
vmadm $v29, sRTI, cDiffF
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
addi outVtxBase, outVtxBase, -vtxSize // Inc'd by 2, must point to second vtx
|
|
.else
|
|
addi outVtxBase, outVtxBase, vtxSize // Not inc'd, must point to second vtx
|
|
.endif
|
|
vmadn cDiffF, sRTF, cDiffI
|
|
li vLoopRet, vtx_loop_no_lighting
|
|
vmadh cDiffI, sRTI, cDiffI
|
|
vmudh $v29, vOne, $v31[4] // 4; 4 - 4 * (D*R)
|
|
vmadn cDiffF, cDiffF, $v31[0] // -4
|
|
vmadh cDiffI, cDiffI, $v31[0] // -4
|
|
vmudl $v29, sRTF, cDiffF // 1/cDiff result = R * that
|
|
vmadm $v29, sRTI, cDiffF
|
|
vmadn sRTF, sRTF, cDiffI
|
|
vmadh sRTI, sRTI, cDiffI
|
|
vmudl $v29, cBaseF, sRTF // cDiff regs = cBase / cDiff
|
|
vmadm $v29, cBaseI, sRTF
|
|
vmadn cDiffF, cBaseF, sRTI
|
|
vmadh cDiffI, cBaseI, sRTI
|
|
vmudl $v29, cDiffF, cRRF[3] // Scale by range reduction
|
|
vmadm cDiffI, cDiffI, cRRF[3]
|
|
vmadn cDiffF, $v31, $v31[2] // Done cDiffI:F = cBaseI:F / cDiffI:F
|
|
// Clamp to 0x0001 to 0xFFFF range and create inverse on-screen factor
|
|
vlt cDiffI, cDiffI, vOne[0] // If integer part of factor less than 1,
|
|
vmrg cDiffF, cDiffF, $v31[1] // keep frac part of factor, else set to 0xFFFF (max val)
|
|
vsubc $v29, cDiffF, vOne[0] // frac part - 1 for carry
|
|
vge cDiffI, cDiffI, $v31[2] // 0; If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001),
|
|
j vtx_final_setup_for_clip // Clobbers vcc and accum in !NOC config.
|
|
vmrg cFadeOf, cDiffF, vOne[0] // keep frac part of factor, else set to 1 (min val)
|
|
clip_after_final_setup: // This is here because otherwise 3 cycle stall here.
|
|
vmudn cFadeOn, cFadeOf, $v31[1] // signed x * -1 = 0xFFFF - unsigned x! Fade factor for on screen vert
|
|
// Fade between attributes for on screen and off screen vert
|
|
vmudm $v29, cRGBAOf, cFadeOf[3]
|
|
vmadm vpRGBA, cRGBAOn, cFadeOn[3]
|
|
vmudm $v29, cSTOf, cFadeOf[3]
|
|
vmadm sSTS, cSTOn, cFadeOn[3]
|
|
vmudl $v29, cPosOfF, cFadeOf[3]
|
|
vmadm $v29, cPosOfI, cFadeOf[3]
|
|
vmadl $v29, cPosOnOfF, cFadeOn[3]
|
|
vmadm vpClpI, cPosOnOfI, cFadeOn[3]
|
|
j vtx_store_for_clip
|
|
vmadn vpClpF, $v31, $v31[2] // 0; load resulting frac pos
|
|
clip_after_vtx_store:
|
|
ori flagsV1, flagsV1, CLIP_VTX_USED // Mark generated vtx as used
|
|
slv sSTS[0], (VTX_TC_VEC )(outVtx1) // Store not-twice-scaled ST
|
|
sh flagsV1, (VTX_CLIP )(outVtx1) // Store generated vertex flags
|
|
clip_nextedge:
|
|
bnez clipFlags, clip_edgelooptop // Discard V2 if it was off screen (whether inserted vtx or not)
|
|
move clipVLastOfsc, clipVNext // Move what was the end of the edge to be the new start of the edge
|
|
sub $11, clipPolyWrite, clipPolySelect // Make sure we are not overflowing
|
|
addi $11, $11, 6 - ((MAX_CLIP_POLY_VERTS) * 2) // Write ptr to last zero slot
|
|
bgez $11, clip_done // If so, give up
|
|
sh clipVLastOfsc, (clipPoly)(clipPolyWrite) // Former V2 was on screen,
|
|
j clip_edgelooptop // so add it to the output polygon
|
|
addi clipPolyWrite, clipPolyWrite, 2
|
|
|
|
clip_w:
|
|
vcopy cBaseF, cPosOnOfF // Result is just W
|
|
j clip_skipxy
|
|
vcopy cBaseI, cPosOnOfI
|
|
|
|
clip_nextcond:
|
|
sub $11, clipPolyWrite, clipPolySelect // Are there less than 3 verts in the output polygon?
|
|
bltz $11, clip_done // If so, degenerate result, quit
|
|
sh $zero, (clipPoly)(clipPolyWrite) // Terminate the output polygon with a 0
|
|
lhu clipVLastOfsc, (clipPoly - 2)(clipPolyWrite) // Initialize edge start to the last vert
|
|
beqz clipMaskIdx, clip_draw_tris
|
|
lbu $11, (clipCondShifts - 1)(clipMaskIdx) // Load next clip condition shift amount
|
|
li clipMask, 1
|
|
sllv clipMask, clipMask, $11
|
|
j clip_condlooptop
|
|
addi clipMaskIdx, clipMaskIdx, -1
|
|
|
|
clip_draw_tris:
|
|
sh $zero, activeClipPlanes
|
|
// Current polygon starts 6 (3 verts) below clipPolySelect, ends 2 (1 vert) below clipPolyWrite
|
|
// Draws verts in pattern like 0-1-4, 1-2-4, 2-3-4
|
|
clip_draw_tris_loop:
|
|
lhu $1, (clipPoly - 6)(clipPolySelect)
|
|
lhu $2, (clipPoly - 4)(clipPolySelect)
|
|
lhu $3, (clipPoly - 2)(clipPolyWrite)
|
|
mtc2 $1, $v27[10] // Addresses go in vector regs too
|
|
mtc2 $2, $v4[12]
|
|
jal tri_noinit
|
|
mtc2 $3, $v27[14]
|
|
bne clipPolyWrite, clipPolySelect, clip_draw_tris_loop
|
|
addi clipPolySelect, clipPolySelect, 2
|
|
clip_done:
|
|
li $11, CLIP_SCAL_NPXY | CLIP_CAMPLANE
|
|
sh $11, activeClipPlanes
|
|
lh $ra, tempTriRA
|
|
fill_vertex_table:
|
|
// Create bytes 00-07
|
|
li $1, 7
|
|
@@loop1:
|
|
sb $1, (vertexTable)($1)
|
|
bgtz $1, @@loop1
|
|
addi $1, $1, -1
|
|
// Load to vu and multiply by 2 to get vertex indexes. It would be more cycles
|
|
// to change the loop above to count by 2s than the stalls here.
|
|
li $2, vertexTable
|
|
lpv $v3[0], (0)($2)
|
|
li $3, vertexTable + ((G_MAX_VERTS + 8) * 2) // Need 0-56 inclusive, so do 0-63
|
|
vmudh $v3, $v3, $v31[3] // 2; now 0x0000, 0x0200, ..., 0x0E00
|
|
@@loop2:
|
|
vmudn $v29, vOne, vTRC_VB // Address of vertex buffer
|
|
vmadl $v4, $v3, vTRC_VS // Plus vtx indices times length
|
|
vadd $v3, $v3, vTRC_1000 // increment by 8 verts = 16
|
|
addi $2, $2, 0x10
|
|
bne $2, $3, @@loop2
|
|
sqv $v4[0], (-0x10)($2)
|
|
jr $ra
|
|
// Delay slot harmless
|
|
|
|
g_memset_ovl3:
|
|
llv $v2[0], (rdpHalf1Val)($zero) // Load the memset value
|
|
sll cmd_w0, cmd_w0, 8 // Clear upper byte
|
|
jal segmented_to_physical
|
|
srl cmd_w0, cmd_w0, 8 // Number of bytes to memset (must be mult of 16)
|
|
li $3, memsetBufferStart + 0x10 // Last qword set is memsetBufferStart
|
|
jal @@clamp_to_memset_buffer
|
|
vmudh $v2, vOne, $v2[1] // Move element 1 (lower bytes) to all
|
|
addi $2, $2, memsetBufferStart // First qword set is one below end
|
|
@@pre_loop:
|
|
sqv $v2, (-0x10)($2)
|
|
bne $2, $3, @@pre_loop
|
|
addi $2, -0x10
|
|
@@transaction_loop:
|
|
jal @@clamp_to_memset_buffer
|
|
li dmemAddr, 0x8000 | memsetBufferStart // Always write from start of buffer
|
|
jal dma_read_write
|
|
addi dmaLen, $2, -1
|
|
sub cmd_w0, cmd_w0, $2
|
|
bgtz cmd_w0, @@transaction_loop
|
|
add cmd_w1_dram, cmd_w1_dram, $2
|
|
j while_wait_dma_busy
|
|
li $ra, run_next_DL_command
|
|
@@clamp_to_memset_buffer:
|
|
addi $11, cmd_w0, -memsetBufferSize // $2 = min(cmd_w0, memsetBufferSize)
|
|
sra $10, $11, 31
|
|
and $11, $11, $10
|
|
jr $ra
|
|
addi $2, $11, memsetBufferSize
|
|
|
|
ovl3_end:
|
|
.align 8
|
|
ovl3_padded_end:
|
|
|
|
.orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga())
|
|
ovl234_end:
|
|
|
|
vtx_after_dma:
|
|
srl $2, cmd_w0, 11 // n << 1
|
|
sub $2, cmd_w0, $2 // = v0 << 1
|
|
lhu outVtxBase, (vertexTable)($2) // Address of output start
|
|
andi inVtx, dmemAddr, 0xFFF8 // Round down input start addr to DMA word
|
|
.if COUNTER_A_UPPER_VERTEX_COUNT
|
|
sll $11, vtxLeft, 12 // Vtx count * 0x10000
|
|
add perfCounterA, perfCounterA, $11 // Add to vertex count
|
|
.endif
|
|
vtx_constants_for_clip:
|
|
// Sets up constants needed for vertex loop, including during clipping.
|
|
// Results fill vPerm1:4. Uses misc temps.
|
|
lhu vGeomMid, geometryModeLabel + 1 // Load middle 2 bytes of geom mode
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
llv sFOG[0], (fogFactor - altBase)(altBaseReg) // Load fog multiplier 0 and offset 1
|
|
ldv sVPO[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7
|
|
veq $v29, $v31, $v31[3h] // VCC = 00010001
|
|
ldv sVPO[8], (viewport + 8)($zero)
|
|
llv sSTS[0], (textureSettings2 - altBase)(altBaseReg) // Texture ST scale in 0, 1
|
|
vmrg sFGM, vOne, $v31[2] // sFGM is 0,0,0,1,0,0,0,1
|
|
ldv sVPS[0], (viewport)($zero) // Load vscale duplicated in 0-3 and 4-7
|
|
vne $v29, $v31, $v31[3h] // VCC = 11101110
|
|
ldv sVPS[8], (viewport)($zero)
|
|
lb $11, geometryModeLabel + 3 // G_ATTROFFSET_ST_ENABLE in sign bit
|
|
vmrg sVPO, sVPO, sFOG[1] // Put fog offset in elements 3,7 of vtrans
|
|
llv $v30[0], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 0, 1
|
|
vmov sSTS[4], sSTS[0]
|
|
llv $v30[8], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 4, 5
|
|
vmrg sVPS, sVPS, sFOG[0] // Put fog multiplier in elements 3,7 of vscale
|
|
bltz $11, @@keepoffset
|
|
lbu $7, mvpValid
|
|
vclr $v30
|
|
@@keepoffset:
|
|
.else
|
|
lb flagsV1, geometryModeLabel + 3 // G_ATTROFFSET_ST_ENABLE in sign bit
|
|
lw $11, (fogFactor)($zero) // Load fog multiplier MSBs and offset LSBs
|
|
llv sSTS[0], (textureSettings2 - altBase)(altBaseReg) // Texture ST scale in 0, 1
|
|
llv $v30[0], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 0, 1
|
|
llv $v30[8], (attrOffsetST - altBase)(altBaseReg) // Texture ST offset in 4, 5
|
|
bltz flagsV1, @@keepoffset
|
|
srl $10, $11, 16 // Fog multiplier to lower bits
|
|
vclr $v30
|
|
@@keepoffset:
|
|
sh $11, (viewport + 0xE)($zero) // Store fog offset over vtrans W
|
|
vmov sSTS[4], sSTS[0]
|
|
sh $10, (viewport + 0x6)($zero) // Store fog multiplier over vscale W
|
|
lbu $7, mvpValid
|
|
ldv sO03[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3
|
|
ldv sO03[8], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // and for vtx 2
|
|
ldv sO47[0], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // Load coeffs 4-7
|
|
ldv sO47[8], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // and for vtx 2
|
|
ldv sOCM[0], (occlusionPlaneMidCoeffs - altBase)(altBaseReg) // Load mid coeffs
|
|
ldv sOCM[8], (occlusionPlaneMidCoeffs - altBase)(altBaseReg) // and for vtx 2
|
|
.endif
|
|
vmov sSTS[5], sSTS[1]
|
|
bltz inVtx, clip_after_constants // inVtx < 0 means from clipping
|
|
lsv $v30[6], (perspNorm - altBase)(altBaseReg) // Perspective norm elem 3
|
|
vtx_after_setup_constants:
|
|
bnez $7, @@skip_recalc_mvp
|
|
lb viLtFlag, pointLightFlag
|
|
li $2, vpMatrix
|
|
li dmemAddr, mMatrix
|
|
jal mtx_multiply
|
|
li $3, mvpMatrix
|
|
sb $10, mvpValid // $10 is nonzero from mtx_multiply, in fact 0x18
|
|
@@skip_recalc_mvp:
|
|
andi $11, vGeomMid, G_LIGHTING >> 8
|
|
bnez $11, vtx_select_lighting
|
|
sb $zero, materialCullMode // Vtx ends material. Must be before lighting for clever packedNormalsMaskConstant reuse
|
|
vtx_setup_no_lighting:
|
|
li vLoopRet, vtx_loop_no_lighting
|
|
vtx_after_lt_setup:
|
|
li $11, mvpMatrix
|
|
vtx_load_mtx:
|
|
lqv vMTX0I, (0x00)($11) // Load MVP matrix
|
|
lqv vMTX2I, (0x10)($11)
|
|
lqv vMTX0F, (0x20)($11)
|
|
lqv vMTX2F, (0x30)($11)
|
|
// nop TODO
|
|
vcopy vMTX1I, vMTX0I
|
|
vcopy vMTX3I, vMTX2I
|
|
ldv vMTX1I[0], (0x08)($11)
|
|
vcopy vMTX1F, vMTX0F
|
|
ldv vMTX3I[0], (0x18)($11)
|
|
vcopy vMTX3F, vMTX2F
|
|
ldv vMTX1F[0], (0x28)($11)
|
|
ldv vMTX3F[0], (0x38)($11)
|
|
ldv vMTX0I[8], (0x00)($11)
|
|
ldv vMTX2I[8], (0x10)($11)
|
|
ldv vMTX0F[8], (0x20)($11)
|
|
beqz $11, ltadv_after_mtx // $11 = 0 = mMatrix if from ltadv
|
|
ldv vMTX2F[8], (0x30)($11)
|
|
vtx_final_setup_for_clip:
|
|
.if !CFG_NO_OCCLUSION_PLANE
|
|
vge $v29, $v31, $v31[2h] // VCC = 00110011
|
|
.endif
|
|
andi fogFlag, vGeomMid, G_FOG >> 8 // Can't put before lt b/c fogFlag = mtx valid flag.
|
|
.if !CFG_NO_OCCLUSION_PLANE
|
|
vmrg sOPM, vOne, $v31[1] // Signs of sOPM are --++--++
|
|
.endif
|
|
srl fogFlag, fogFlag, 5 // 8 if G_FOG is set, 0 otherwise
|
|
addi outVtx1, rdpCmdBufEndP1, tempPrevInvalVtx // Write prev loop vtx garbage here
|
|
.if !CFG_NO_OCCLUSION_PLANE
|
|
addi outVtx2, rdpCmdBufEndP1, tempPrevInvalVtx // Write prev loop vtx garbage here
|
|
.endif
|
|
bltz inVtx, clip_after_final_setup // inVtx < 0 means from clipping
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
addi outVtx2, rdpCmdBufEndP1, tempPrevInvalVtx // Write prev loop vtx garbage here
|
|
.else
|
|
vmudh sOPM, sOPM, $v31[5] // sOPM is 0xC000, 0xC000, 0x4000, 0x4000, repeat
|
|
.endif
|
|
jal while_wait_dma_busy // Wait for vertex load to finish
|
|
addi outVtxBase, outVtxBase, -vtxSize // Will inc by 2, but need point to 2nd
|
|
.if CFG_NO_OCCLUSION_PLANE // With occlusion plane, vpMdl loaded at vtx_store_loop_entry
|
|
ldv vpMdl[0], (VTX_IN_OB + 0 * inputVtxSize)(inVtx) // 1st vec pos
|
|
ldv vpMdl[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // 2nd vec pos
|
|
.endif
|
|
llv sTCL[8], (VTX_IN_CN + 0 * inputVtxSize)(inVtx) // RGBA in 4:5
|
|
llv sTCL[12], (VTX_IN_CN + 1 * inputVtxSize)(inVtx) // RGBA in 6:7
|
|
llv vpST[0], (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1
|
|
j vtx_store_loop_entry
|
|
llv vpST[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5
|
|
|
|
align_with_warning 8, "One instruction of padding before vertex loop"
|
|
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
|
|
vtx_loop_no_lighting:
|
|
// lCOL <- sSCI
|
|
// lDTC <- sRTF
|
|
// lVCI <- sRTI
|
|
// vpLtTot <- s1WF
|
|
// vpNrmlX <- s1WI
|
|
vmadh $v29, vMTX1I, vpMdl[1h]
|
|
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
|
|
vmadn vpClpF, vMTX2F, vpMdl[2h]
|
|
or flagsV1, flagsV1, $10 // Combine results for first vertex
|
|
vmadh vpClpI, vMTX2I, vpMdl[2h]
|
|
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
|
|
// lDOT <- vpMdl
|
|
// sFOG <- lCOL
|
|
vge sFOG, vpScrI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
|
|
luv vpRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
|
|
// sCLZ <- sTCL
|
|
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
|
|
addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2
|
|
vtx_return_from_lighting:
|
|
vtx_return_from_texgen:
|
|
vtx_store_for_clip:
|
|
vmudl $v29, vpClpF, $v30[3] // Persp norm
|
|
sub $11, outVtx2, fogFlag // Points 8 before outVtx2 if fog, else 0
|
|
// s1WI <- vpNrmlX
|
|
vmadm s1WI, vpClpI, $v30[3] // Persp norm
|
|
addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx
|
|
// s1WF <- vpLtTot
|
|
vmadn s1WF, $v31, $v31[2] // 0
|
|
sbv sFOG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
|
|
vmov vpScrF[1], sCLZ[2]
|
|
sbv sFOG[7], (VTX_COLOR_A + 8 - vtxSize)($11) // ...which gets overwritten below
|
|
// sSCF <- lDOT
|
|
vmudn sSCF, vpClpF, $v31[3] // W * clip ratio for scaled clipping
|
|
ssv sCLZ[12], (VTX_SCR_Z )(outVtx2)
|
|
// sSCI <- sFOG
|
|
vmadh sSCI, vpClpI, $v31[3] // W * clip ratio for scaled clipping
|
|
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
|
|
vrcph $v29[0], s1WI[3]
|
|
slv vpScrI[0], (VTX_SCR_VEC )(outVtx1)
|
|
// sRTF <- lDTC
|
|
vrcpl sRTF[2], s1WF[3]
|
|
ssv vpScrF[12], (VTX_SCR_Z_FRAC )(outVtx2)
|
|
// sRTI <- lVCI
|
|
vrcph sRTI[3], s1WI[7]
|
|
slv vpScrF[2], (VTX_SCR_Z )(outVtx1)
|
|
vrcpl sRTF[6], s1WF[7]
|
|
sra $11, vtxLeft, 31 // All 1s if on single-vertex last iter
|
|
vrcph sRTI[7], $v31[2] // 0
|
|
andi $11, $11, vtxSize // vtxSize if on single-vertex last iter, else normally 0
|
|
vch $v29, vpClpI, vpClpI[3h] // Clip screen high
|
|
sub outVtx2, outVtxBase, $11 // First output vtx on last iter, else second
|
|
vcl $v29, vpClpF, vpClpF[3h] // Clip screen low
|
|
addi outVtx1, outVtxBase, -vtxSize // First output vtx always
|
|
vmudl $v29, s1WF, sRTF[2h]
|
|
cfc2 flagsV1, $vcc // Screen clip results
|
|
vmadm $v29, s1WI, sRTF[2h]
|
|
sdv vpClpF[8], (VTX_FRAC_VEC )(outVtx2)
|
|
vmadn s1WF, s1WF, sRTI[3h]
|
|
// sTCL <- sCLZ
|
|
ldv sTCL[0], (VTX_IN_TC + 2 * inputVtxSize)(inVtx) // ST in 0:1, RGBA in 2:3
|
|
vmadh s1WI, s1WI, sRTI[3h]
|
|
sdv vpClpF[0], (VTX_FRAC_VEC )(outVtx1)
|
|
vch $v29, vpClpI, sSCI[3h] // Clip scaled high
|
|
lsv vpClpF[14], (VTX_Z_FRAC )(outVtx2) // load Z into W slot, will be for fog below
|
|
vmudh $v29, vOne, $v31[4] // 4
|
|
sdv vpClpI[8], (VTX_INT_VEC )(outVtx2)
|
|
vmadn s1WF, s1WF, $v31[0] // -4
|
|
lsv vpClpF[6], (VTX_Z_FRAC )(outVtx1) // load Z into W slot, will be for fog below
|
|
vmadh s1WI, s1WI, $v31[0] // -4
|
|
sdv vpClpI[0], (VTX_INT_VEC )(outVtx1)
|
|
vmudm $v29, vpST, sSTS // Scale ST
|
|
ldv sTCL[8], (VTX_IN_TC + 3 * inputVtxSize)(inVtx) // ST in 4:5, RGBA in 6:7
|
|
// sST2 <- vpScrI
|
|
vmadh sST2, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5
|
|
suv vpRGBA[4], (VTX_COLOR_VEC )(outVtx2) // Store RGBA for second vtx
|
|
vmudl $v29, s1WF, sRTF[2h]
|
|
lsv vpClpI[14], (VTX_Z_INT )(outVtx2) // load Z into W slot, will be for fog below
|
|
vmadm $v29, s1WI, sRTF[2h]
|
|
suv vpRGBA[0], (VTX_COLOR_VEC )(outVtx1) // Store RGBA for first vtx
|
|
vmadn s1WF, s1WF, sRTI[3h]
|
|
lsv vpClpI[6], (VTX_Z_INT )(outVtx1) // load Z into W slot, will be for fog below
|
|
vmadh s1WI, s1WI, sRTI[3h]
|
|
srl flagsV2, flagsV1, 4 // Shift second vertex screen clipping to first slots
|
|
vcl $v29, vpClpF, sSCF[3h] // Clip scaled low
|
|
andi flagsV2, flagsV2, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
|
|
vcopy vpST, sTCL
|
|
cfc2 $11, $vcc // Scaled clip results
|
|
vmudl $v29, vpClpF, s1WF[3h] // Pos times inv W
|
|
ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2)
|
|
vmadm $v29, vpClpI, s1WF[3h] // Pos times inv W
|
|
// vpMdl <- sSCF
|
|
ldv vpMdl[0], (VTX_IN_OB + 2 * inputVtxSize)(inVtx) // Pos of 1st vector for next iteration
|
|
vmadn vpClpF, vpClpF, s1WI[3h]
|
|
ldv vpMdl[8], (VTX_IN_OB + 3 * inputVtxSize)(inVtx) // Pos of 2nd vector on next iteration
|
|
vmadh vpClpI, vpClpI, s1WI[3h] // vpClpI:vpClpF = pos times inv W
|
|
addi inVtx, inVtx, (2 * inputVtxSize) // Advance two positions forward in the input vertices
|
|
vmov sTCL[4], vpST[2] // First vtx RG to elem 4
|
|
andi flagsV1, flagsV1, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
|
|
vmov sTCL[5], vpST[3] // First vtx BA to elem 5
|
|
sll $10, $11, 4 // Shift first vertex scaled clipping to second slots
|
|
vmudl $v29, vpClpF, $v30[3] // Persp norm
|
|
ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1)
|
|
vmadm vpClpI, vpClpI, $v30[3] // Persp norm
|
|
ssv s1WI[14], (VTX_INV_W_INT )(outVtx2)
|
|
vmadn vpClpF, $v31, $v31[2] // 0; Now vpClpI:vpClpF = projected position
|
|
ssv s1WI[6], (VTX_INV_W_INT )(outVtx1)
|
|
// vnop // TODO maybe can rotate the loop so this is the jr land slot?
|
|
slv sST2[8], (VTX_TC_VEC )(outVtx2) // Store scaled S, T vertex 2
|
|
vmudh $v29, sVPO, vOne // offset * 1
|
|
slv sST2[0], (VTX_TC_VEC )(outVtx1) // Store scaled S, T vertex 1
|
|
vmadh $v29, sFGM, $v31[6] // + (0,0,0,1,0,0,0,1) * 0x7F00
|
|
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
|
|
vmadn vpScrF, vpClpF, sVPS // + pos frac * scale
|
|
or flagsV2, flagsV2, $11 // Combine results for second vertex
|
|
// vpScrI <- sST2
|
|
vmadh vpScrI, vpClpI, sVPS // int part, vpScrI:vpScrF is now screen space pos
|
|
sh flagsV2, (VTX_CLIP )(outVtx2) // Store second vertex clip flags
|
|
vtx_store_loop_entry:
|
|
vmudn $v29, vMTX3F, vOne
|
|
blez vtxLeft, vtx_epilogue
|
|
vmadh $v29, vMTX3I, vOne
|
|
vmadn $v29, vMTX0F, vpMdl[0h]
|
|
sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order
|
|
vmadh $v29, vMTX0I, vpMdl[0h]
|
|
jr vLoopRet
|
|
vmadn $v29, vMTX1F, vpMdl[1h]
|
|
|
|
vtx_epilogue:
|
|
vge sFOG, vpScrI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
|
|
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
|
|
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
|
|
or flagsV1, flagsV1, $10 // Combine results for first vertex
|
|
beqz fogFlag, @@skip_fog
|
|
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
|
|
sbv sFOG[15], (VTX_COLOR_A )(outVtx2)
|
|
sbv sFOG[7], (VTX_COLOR_A )(outVtx1)
|
|
@@skip_fog:
|
|
vmov vpScrF[1], sCLZ[2]
|
|
ssv sCLZ[12], (VTX_SCR_Z )(outVtx2)
|
|
slv vpScrI[0], (VTX_SCR_VEC )(outVtx1)
|
|
ssv vpScrF[12], (VTX_SCR_Z_FRAC )(outVtx2)
|
|
bltz inVtx, clip_after_vtx_store // inVtx < 0 means from clipping
|
|
slv vpScrF[2], (VTX_SCR_Z )(outVtx1)
|
|
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
|
|
// Fallthrough (across the versions boundary) to vtx_end
|
|
|
|
.else // not CFG_NO_OCCLUSION_PLANE
|
|
|
|
// 70 cycles, 16 more than NOC
|
|
// 6 vu cycles for plane, 8 vu cycles for edges, 0 more vnops than NOC,
|
|
// 1 branch delay slot with SU instr, 1 land-after-branch.
|
|
vtx_loop_no_lighting:
|
|
// lDTC <- sVPS
|
|
// lVCI <- sRTI
|
|
// vpLtTot <- sTCL
|
|
// vpNrmlX <- s1WF
|
|
// lDIR <- sOTM
|
|
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
|
|
sub $11, outVtx1, fogFlag // Points 8 before outVtx1 if fog, else 0
|
|
vmrg sOCS, sOCS, sOTM // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
|
|
sbv sFOG[7], (VTX_COLOR_A + 8)($11) // ...which gets overwritten below
|
|
vmrg vpScrF, vpScrF, sCLZ[2h] // Z int elem 2, 6 to elem 1, 5; Z frac in elem 2, 6
|
|
// vpRGBA <- lDIR
|
|
luv vpRGBA[0], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx pair RGBA
|
|
// lDOT <- sCLZ
|
|
// lCOL <- sFOG
|
|
vtx_return_from_texgen:
|
|
vmudm $v29, vpST, sSTS // Scale ST
|
|
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
|
|
vmadh vpST, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5
|
|
addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx
|
|
vtx_return_from_lighting:
|
|
vge $v29, sOCS, sO47 // Each compare to coeffs 4-7
|
|
slv vpScrI[0], (VTX_SCR_VEC )(outVtx1)
|
|
vmudn $v29, vMTX3F, vOne
|
|
cfc2 $11, $vcc
|
|
vmadh $v29, vMTX3I, vOne
|
|
slv vpScrF[10], (VTX_SCR_Z )(outVtx2)
|
|
vmadn $v29, vMTX0F, vpMdl[0h]
|
|
addi inVtx, inVtx, (2 * inputVtxSize) // Advance two positions forward in the input vertices
|
|
vmadh $v29, vMTX0I, vpMdl[0h]
|
|
slv vpScrF[2], (VTX_SCR_Z )(outVtx1)
|
|
vmadn $v29, vMTX1F, vpMdl[1h]
|
|
or $11, $11, $10 // Combine occlusion results. Any set in 0-3, 4-7 = not occluded
|
|
vmadh $v29, vMTX1I, vpMdl[1h]
|
|
andi $10, $11, 0x000F // Bits 0-3 for vtx 1
|
|
// vpClpF <- lDOT
|
|
vmadn vpClpF, vMTX2F, vpMdl[2h]
|
|
addi $11, $11, -(0x0010) // If not occluded, atl 1 of 4-7 set, so $11 >= 0x10. Else $11 < 0x10.
|
|
// vpClpI <- lCOL
|
|
vmadh vpClpI, vMTX2I, vpMdl[2h]
|
|
bnez $10, @@skipv1 // If nonzero, at least one equation false, don't set occluded flag
|
|
andi $11, $11, CLIP_OCCLUDED // This is bit 11, = sign bit b/c |$11| <= 0xFF
|
|
ori flagsV1, flagsV1, CLIP_OCCLUDED // All equations true, set vtx 1 occluded flag
|
|
@@skipv1:
|
|
// 16 cycles
|
|
vtx_store_for_clip:
|
|
vmudl $v29, vpClpF, $v30[3] // Persp norm
|
|
or flagsV2, flagsV2, $11 // occluded = $11 negative = sign bit set = $11 is flag, else 0
|
|
// s1WI <- vpMdl
|
|
vmadm s1WI, vpClpI, $v30[3] // Persp norm
|
|
sh flagsV2, (VTX_CLIP )(outVtx2) // Store second vertex clip flags
|
|
// s1WF <- vpNrmlX
|
|
vmadn s1WF, $v31, $v31[2] // 0
|
|
blez vtxLeft, vtx_epilogue
|
|
vmudn $v29, vpClpF, sOCM // X * kx, Y * ky, Z * kz
|
|
vmadh $v29, vpClpI, sOCM // Int * int
|
|
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
|
|
vrcph $v29[0], s1WI[3]
|
|
addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2
|
|
// sRTF <- lDTC
|
|
vrcpl sRTF[2], s1WF[3]
|
|
sra $11, vtxLeft, 31 // All 1s if on single-vertex last iter
|
|
// sRTI <- lVCI
|
|
vrcph sRTI[3], s1WI[7]
|
|
andi $11, $11, vtxSize // vtxSize if on single-vertex last iter, else normally 0
|
|
vrcpl sRTF[6], s1WF[7]
|
|
sub outVtx2, outVtxBase, $11 // First output vtx on last iter, else second
|
|
vrcph sRTI[7], $v31[2] // 0
|
|
addi outVtx1, outVtxBase, -vtxSize // First output vtx always
|
|
vreadacc sOCS, ACC_UPPER // Load int * int portion
|
|
suv vpRGBA[4], (VTX_COLOR_VEC )(outVtx2) // Store RGBA for second vtx
|
|
vch $v29, vpClpI, vpClpI[3h] // Clip screen high
|
|
suv vpRGBA[0], (VTX_COLOR_VEC )(outVtx1) // Store RGBA for first vtx
|
|
vmudl $v29, s1WF, sRTF[2h]
|
|
sdv vpClpI[8], (VTX_INT_VEC )(outVtx2)
|
|
vmadm $v29, s1WI, sRTF[2h]
|
|
sdv vpClpI[0], (VTX_INT_VEC )(outVtx1)
|
|
vmadn s1WF, s1WF, sRTI[3h]
|
|
sdv vpClpF[8], (VTX_FRAC_VEC )(outVtx2)
|
|
vmadh s1WI, s1WI, sRTI[3h]
|
|
sdv vpClpF[0], (VTX_FRAC_VEC )(outVtx1)
|
|
vcl $v29, vpClpF, vpClpF[3h] // Clip screen low
|
|
sqv vpClpI, (tempVpRGBA)(rdpCmdBufEndP1) // For Z to W manip. RGBA not currently stored here
|
|
vmudh $v29, vOne, $v31[4] // 4
|
|
cfc2 flagsV1, $vcc // Screen clip results
|
|
vmadn s1WF, s1WF, $v31[0] // -4
|
|
ssv vpClpI[4], (tempVpRGBA + 6)(rdpCmdBufEndP1) // First Z to W
|
|
vmadh s1WI, s1WI, $v31[0] // -4
|
|
// sTCL <- vpLtTot
|
|
ldv sTCL[0], (VTX_IN_TC + 0 * inputVtxSize)(inVtx) // ST in 0:1, RGBA in 2:3
|
|
// sSCF <- vpScrF
|
|
vmudn sSCF, vpClpF, $v31[3] // W * clip ratio for scaled clipping
|
|
ssv vpClpI[12], (tempVpRGBA + 14)(rdpCmdBufEndP1) // Second Z to W
|
|
// sSCI <- vpScrI
|
|
vmadh sSCI, vpClpI, $v31[3] // W * clip ratio for scaled clipping
|
|
lsv vpClpF[14], (VTX_Z_FRAC )(outVtx2) // load Z into W slot, will be for fog below
|
|
vmudl $v29, s1WF, sRTF[2h]
|
|
lqv vpClpI, (tempVpRGBA)(rdpCmdBufEndP1) // Load int part with Z in W
|
|
vmadm $v29, s1WI, sRTF[2h]
|
|
lsv vpClpF[6], (VTX_Z_FRAC )(outVtx1) // load Z into W slot, will be for fog below
|
|
vmadn s1WF, s1WF, sRTI[3h]
|
|
ldv sTCL[8], (VTX_IN_TC + 1 * inputVtxSize)(inVtx) // ST in 4:5, RGBA in 6:7
|
|
vmadh s1WI, s1WI, sRTI[3h]
|
|
srl flagsV2, flagsV1, 4 // Shift second vertex screen clipping to first slots
|
|
vch $v29, vpClpI, sSCI[3h] // Clip scaled high
|
|
andi flagsV2, flagsV2, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
|
|
vcl $v29, vpClpF, sSCF[3h] // Clip scaled low
|
|
slv vpST[8], (VTX_TC_VEC )(outVtx2) // Store scaled S, T vertex 2
|
|
vmudl $v29, vpClpF, s1WF[3h] // Pos times inv W
|
|
cfc2 $11, $vcc // Scaled clip results
|
|
vmadm $v29, vpClpI, s1WF[3h] // Pos times inv W
|
|
slv vpST[0], (VTX_TC_VEC )(outVtx1) // Store scaled S, T vertex 1
|
|
vmadn vpClpF, vpClpF, s1WI[3h]
|
|
// sVPO <- sSCF
|
|
ldv sVPO[0], (viewport + 8)($zero) // Load viewport offset incl. fog for first vertex
|
|
vmadh vpClpI, vpClpI, s1WI[3h] // vpClpI:vpClpF = pos times inv W
|
|
ssv s1WF[14], (VTX_INV_W_FRAC)(outVtx2)
|
|
// sOTM <- vpRGBA
|
|
vadd sOTM, sOCS, sOCS[1h] // Add Y to X
|
|
ldv sVPO[8], (viewport + 8)($zero) // Load viewport offset incl. fog for second vertex
|
|
vcopy vpST, sTCL
|
|
ssv s1WF[6], (VTX_INV_W_FRAC)(outVtx1)
|
|
vmudl $v29, vpClpF, $v30[3] // Persp norm
|
|
// sVPS <- sSCI
|
|
ldv sVPS[0], (viewport)($zero) // Load viewport scale incl. fog for first vertex
|
|
vmadm vpClpI, vpClpI, $v30[3] // Persp norm
|
|
ssv s1WI[14], (VTX_INV_W_INT )(outVtx2)
|
|
vmadn vpClpF, $v31, $v31[2] // 0; Now vpClpI:vpClpF = projected position
|
|
ldv sVPS[8], (viewport)($zero) // Load viewport scale incl. fog for second vertex
|
|
vadd sOCS, sOTM, sOCS[2h] // Add Z to X
|
|
ssv s1WI[6], (VTX_INV_W_INT )(outVtx1)
|
|
vmov sTCL[4], vpST[2] // First vtx RG to elem 4
|
|
andi flagsV1, flagsV1, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
|
|
vmudh $v29, sVPO, vOne // offset * 1
|
|
sll $10, $11, 4 // Shift first vertex scaled clipping to second slots
|
|
// vpScrF <- sVPO
|
|
vmadn vpScrF, vpClpF, sVPS // + pos frac * scale
|
|
andi $11, $11, CLIP_SCAL_NPXY // Mask to only bits we care about
|
|
// vpScrI <- sVPS
|
|
vmadh vpScrI, vpClpI, sVPS // int part, vpScrI:vpScrF is now screen space pos
|
|
or flagsV2, flagsV2, $11 // Combine results for second vertex
|
|
// sFOG <- vpClpI
|
|
vmadh sFOG, vOne, $v31[6] // + 0x7F00 in all elements, clamp to 0x7FFF for fog
|
|
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
|
|
vlt $v29, sOCS, sOCM[3h] // Occlusion plane X+Y+Z<C in elems 0, 4
|
|
or flagsV1, flagsV1, $10 // Combine results for first vertex
|
|
vmov sTCL[5], vpST[3] // First vtx BA to elem 5
|
|
cfc2 $10, $vcc // Load occlusion plane mid results to bits 3 and 7
|
|
vmudh sOTM, vpScrI, $v31[4] // 4; scale up x and y
|
|
// vpMdl <- s1WI
|
|
vtx_store_loop_entry:
|
|
ldv vpMdl[0], (VTX_IN_OB + 0 * inputVtxSize)(inVtx) // Pos of 1st vector for next iteration
|
|
vge sFOG, sFOG, $v31[6] // 0x7F00; clamp fog to >= 0 (want low byte only)
|
|
ldv vpMdl[8], (VTX_IN_OB + 1 * inputVtxSize)(inVtx) // Pos of 2nd vector on next iteration
|
|
// vnop
|
|
andi $10, $10, (1 << 0) | (1 << 4) // Only bits 0, 4 from occlusion
|
|
vmulf $v29, sOPM, vpScrI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2
|
|
sub $11, outVtx2, fogFlag // Points 8 before outVtx2 if fog, else 0
|
|
vmacf sOCS, sO03, sOTM[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2
|
|
sdv sTCL[8], (tempVpRGBA)(rdpCmdBufEndP1) // Vtx 0 and 1 RGBA in order
|
|
vmulf $v29, sOPM, vpScrI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2
|
|
sbv sFOG[15], (VTX_COLOR_A + 8)($11) // In VTX_SCR_Y if fog disabled...
|
|
vmacf sOTM, sO03, sOTM[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2
|
|
jr vLoopRet
|
|
// sCLZ <- vpClpF
|
|
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
|
|
// vnop in land slot
|
|
|
|
vtx_epilogue:
|
|
bltz inVtx, clip_after_vtx_store // inVtx < 0 means from clipping
|
|
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
|
|
// Fallthrough to vtx_end
|
|
|
|
.endif
|
|
|
|
vtx_end:
|
|
.if CFG_PROFILING_A
|
|
li $ra, 0 // Flag for coming from vtx
|
|
lqv vTRC, (vTRCValue)($zero) // Restore value overwritten by matrix
|
|
tris_end:
|
|
mfc0 $11, DPC_CLOCK
|
|
lw $10, startCounterTime
|
|
sub $11, $11, $10
|
|
beqz $ra, run_next_DL_command // $ra != 0 if from tri cmds
|
|
add perfCounterA, perfCounterA, $11 // Add to vert cycles perf counter
|
|
sub perfCounterA, perfCounterA, $11 // From tris, undo add to vert perf counter
|
|
sub $10, perfCounterC, $4 // How long we stalled for RDP FIFO during this cmd
|
|
sub $11, $11, $10 // Subtract that from the tri cycles
|
|
j run_next_DL_command
|
|
add perfCounterD, perfCounterD, $11 // Add to tri cycles perf counter
|
|
.else
|
|
j run_next_DL_command
|
|
lqv vTRC, (vTRCValue)($zero) // Restore value overwritten by matrix
|
|
.endif
|
|
|
|
tri_fan_store:
|
|
lb $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1
|
|
sh cmd_w1_dram, 5(rdpCmdBufPtr) // Store vtx N+2 and N+3 as 1 and 2
|
|
j tri_main
|
|
sb $11, 7(rdpCmdBufPtr) // Store vtx 1 as 3
|
|
|
|
// Converts the segmented address in cmd_w1_dram to the corresponding physical address
|
|
segmented_to_physical: // 7
|
|
srl $11, cmd_w1_dram, 22 // Copy (segment index << 2) into $11
|
|
andi $11, $11, 0x3C // Clear the bottom 2 bits that remained during the shift
|
|
lw $11, (segmentTable)($11) // Get the current address of the segment
|
|
sll cmd_w1_dram, cmd_w1_dram, 8 // Shift the address to the left so that the top 8 bits are shifted out
|
|
srl cmd_w1_dram, cmd_w1_dram, 8 // Shift the address back to the right, resulting in the original with the top 8 bits cleared
|
|
jr $ra
|
|
add cmd_w1_dram, cmd_w1_dram, $11 // Add the segment's address to the masked input address, resulting in the virtual address
|
|
|
|
load_overlay_0_and_enter:
|
|
li nextRA, 0x1000 // Sets up return address
|
|
li cmd_w1_dram, orga(ovl0_start) // Sets up ovl0 table address
|
|
load_overlays_0_1:
|
|
li dmaLen, ovl01_end - 0x1000 - 1
|
|
j load_overlay_inner
|
|
li dmemAddr, 0x1000
|
|
|
|
load_overlays_2_3_4:
|
|
addi nextRA, $ra, -8 // Got here with jal, but want to return to addr of jal itself
|
|
li dmaLen, ovl234_end - ovl234_start - 1
|
|
li dmemAddr, ovl234_start
|
|
load_overlay_inner: // dmaLen, dmemAddr, cmd_w1_dram, and nextRA must be set
|
|
lw $11, OSTask + OSTask_ucode
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterC, perfCounterC, 0x4000 // Increment overlay (all 0-4) load count
|
|
.endif
|
|
.if !CFG_PROFILING_C
|
|
j dma_and_wait_goto_next_ra
|
|
add cmd_w1_dram, cmd_w1_dram, $11
|
|
.else
|
|
// According to Tharo's testing, and in contradiction to the manual, almost no
|
|
// instructions are issued while an IMEM DMA is happening. So we have to time
|
|
// it using counters.
|
|
mfc0 ovlInitClock, DPC_CLOCK
|
|
jal shared_dma_read_write // The one without perfCounterD
|
|
add cmd_w1_dram, cmd_w1_dram, $11
|
|
mfc0 $11, SP_DMA_BUSY
|
|
@@while_dma_busy:
|
|
bnez $11, @@while_dma_busy
|
|
mfc0 $11, SP_DMA_BUSY
|
|
mfc0 $11, DPC_CLOCK
|
|
sub $11, $11, ovlInitClock
|
|
jr nextRA
|
|
add perfCounterD, perfCounterD, $11
|
|
|
|
// Also, normal dma_read_write below can't be changed to insert perfCounterD due to
|
|
// S2DEX constraints. So we have to duplicate that part of it.
|
|
dma_read_write:
|
|
mfc0 $11, SP_DMA_FULL
|
|
bnez $11, dma_read_write
|
|
addi perfCounterD, perfCounterD, 6 // 3 instr + 2 after mfc + 1 taken branch
|
|
j dma_read_write_not_full
|
|
// $11 load in delay slot is harmless.
|
|
.endif
|
|
|
|
endFreeImemAddr equ 0x1FC4
|
|
startFreeImem:
|
|
.if . > endFreeImemAddr
|
|
.error "Out of IMEM space"
|
|
.endif
|
|
.org endFreeImemAddr
|
|
endFreeImem:
|
|
|
|
wait_goto_next_ra:
|
|
move $ra, nextRA
|
|
// Fallthrough to while_wait_dma_busy
|
|
|
|
.if . != 0x1FC8
|
|
// This has to be at this address for boot and S2DEX compatibility
|
|
.error "Error in organization of end of IMEM"
|
|
.endif
|
|
|
|
// The code from here to the end is shared with S2DEX, so great care is needed for changes.
|
|
while_wait_dma_busy:
|
|
mfc0 $11, SP_DMA_BUSY // Load the DMA_BUSY value
|
|
.if CFG_PROFILING_C
|
|
bnez $11, while_wait_dma_busy
|
|
// perfCounterD is $12, which is a temp register in S2DEX, which happens to
|
|
// never have state carried over while_wait_dma_busy.
|
|
addi perfCounterD, perfCounterD, 6 // 3 instr + 2 after mfc + 1 taken branch
|
|
.else
|
|
@@while_dma_busy:
|
|
bnez $11, @@while_dma_busy // Loop until DMA_BUSY is cleared
|
|
mfc0 $11, SP_DMA_BUSY // Update DMA_BUSY value
|
|
.endif
|
|
old_return_routine:
|
|
jr $ra
|
|
// Has mfc0 in branch delay slot, causes a stall if first instr after ret is load
|
|
|
|
.if !CFG_PROFILING_C
|
|
dma_read_write:
|
|
.endif
|
|
shared_dma_read_write:
|
|
mfc0 $11, SP_DMA_FULL // load the DMA_FULL value
|
|
@@while_dma_full:
|
|
bnez $11, @@while_dma_full // Loop until DMA_FULL is cleared
|
|
mfc0 $11, SP_DMA_FULL // Update DMA_FULL value
|
|
dma_read_write_not_full:
|
|
mtc0 dmemAddr, SP_MEM_ADDR // Set the DMEM address to DMA from/to
|
|
bltz dmemAddr, dma_write // If the DMEM address is negative, this is a DMA write, if not read
|
|
mtc0 cmd_w1_dram, SP_DRAM_ADDR // Set the DRAM address to DMA from/to
|
|
jr $ra
|
|
mtc0 dmaLen, SP_RD_LEN // Initiate a DMA read with a length of dmaLen
|
|
dma_write:
|
|
jr $ra
|
|
mtc0 dmaLen, SP_WR_LEN // Initiate a DMA write with a length of dmaLen
|
|
|
|
.if . != 0x00002000
|
|
.error "Code at end of IMEM shared with other ucodes has been corrupted"
|
|
.endif
|
|
|
|
.headersize 0x00001000 - orga()
|
|
|
|
// Overlay 0 handles three cases of stopping the current microcode.
|
|
// The action here is controlled by $1. If yielding, $1 > 0. If this was
|
|
// G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0.
|
|
ovl0_start:
|
|
jal flush_rdp_buffer // See G_FLUSH_handler for docs on these 3 instructions.
|
|
sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1
|
|
jal flush_rdp_buffer
|
|
add taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr was where in the DL after the current chunk loaded
|
|
.if CFG_PROFILING_C
|
|
mfc0 $11, DPC_CLOCK
|
|
lw $10, startCounterTime
|
|
sub $11, $11, $10
|
|
add perfCounterA, perfCounterA, $11
|
|
.endif
|
|
bnez $1, task_done_or_yield // Continue to load ucode if 0
|
|
load_ucode:
|
|
lw cmd_w1_dram, (inputBufferEnd - 0x04)(inputBufferPos) // word 1 = ucode code DRAM addr
|
|
sw $zero, OSTask + OSTask_flags // So next ucode knows it didn't come from yield
|
|
li dmemAddr, start // Beginning of overwritable part of IMEM
|
|
sw taskDataPtr, OSTask + OSTask_data_ptr // Store where we are in the DL
|
|
sw cmd_w1_dram, OSTask + OSTask_ucode // Store pointer to new ucode about to execute
|
|
// Store counters in mvpMatrix; first 0x180 of DMEM will be preserved in ucode swap AND
|
|
// if other ucode yields
|
|
sw perfCounterA, mvpMatrix + YDF_OFFSET_PERFCOUNTERA
|
|
sw perfCounterB, mvpMatrix + YDF_OFFSET_PERFCOUNTERB
|
|
sw perfCounterC, mvpMatrix + YDF_OFFSET_PERFCOUNTERC
|
|
sw perfCounterD, mvpMatrix + YDF_OFFSET_PERFCOUNTERD
|
|
jal dma_read_write // DMA DRAM read -> IMEM write
|
|
li dmaLen, (while_wait_dma_busy - start) - 1 // End of overwritable part of IMEM
|
|
lw cmd_w1_dram, rdpHalf1Val // Get DRAM address of ucode data from rdpHalf1Val
|
|
li dmemAddr, endSharedDMEM // DMEM address is endSharedDMEM
|
|
andi dmaLen, cmd_w0, 0x0FFF // Extract DMEM length from command word
|
|
add cmd_w1_dram, cmd_w1_dram, dmemAddr // Start overwriting data from endSharedDMEM
|
|
jal dma_read_write // initate DMA read
|
|
sub dmaLen, dmaLen, dmemAddr // End that much before the end of DMEM
|
|
j while_wait_dma_busy
|
|
// Jumping to actual start of new ucode, which normally zeros vZero. Not sure why later ucodes
|
|
// jumped one instruction in.
|
|
li $ra, start
|
|
|
|
.if . > start
|
|
.error "ovl0_start does not fit within the space before the start of the ucode loaded with G_LOAD_UCODE"
|
|
.endif
|
|
|
|
task_done_or_yield:
|
|
sw perfCounterA, yieldDataFooter + YDF_OFFSET_PERFCOUNTERA
|
|
sw perfCounterB, yieldDataFooter + YDF_OFFSET_PERFCOUNTERB
|
|
sw perfCounterC, yieldDataFooter + YDF_OFFSET_PERFCOUNTERC
|
|
bltz $1, task_done // $1 < 0 = Got to the end of the parent DL
|
|
sw perfCounterD, yieldDataFooter + YDF_OFFSET_PERFCOUNTERD
|
|
task_yield: // Otherwise $1 > 0 = CPU requested yield
|
|
lw $11, OSTask + OSTask_ucode // Save pointer to current ucode
|
|
lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr
|
|
li dmemAddr, 0x8000 // 0, but negative = write
|
|
li dmaLen, OS_YIELD_DATA_SIZE - 1
|
|
li $10, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals
|
|
sw taskDataPtr, yieldDataFooter + YDF_OFFSET_TASKDATAPTR // Save pointer to where in DL
|
|
sw $11, yieldDataFooter + YDF_OFFSET_UCODE
|
|
j dma_read_write
|
|
li $ra, set_status_and_break
|
|
|
|
task_done:
|
|
// Copy just the yield data footer, which has the perf counters.
|
|
lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr
|
|
addi cmd_w1_dram, cmd_w1_dram, yieldDataFooter
|
|
li dmemAddr, 0x8000 | yieldDataFooter // negative = write
|
|
jal dma_read_write
|
|
li dmaLen, YIELD_DATA_FOOTER_SIZE - 1
|
|
jal while_wait_dma_busy
|
|
li $10, SP_SET_SIG2 // task done signal
|
|
set_status_and_break: // $10 is the status to set
|
|
mtc0 $10, SP_STATUS
|
|
break 0
|
|
nop
|
|
|
|
ovl0_end:
|
|
.align 8
|
|
ovl0_padded_end:
|
|
|
|
.if ovl0_padded_end > ovl01_end
|
|
.error "Automatic resizing for overlay 0 failed"
|
|
.endif
|
|
|
|
// overlay 1
|
|
.headersize 0x00001000 - orga()
|
|
|
|
ovl1_start:
|
|
|
|
G_MTX_handler: // 12
|
|
.if CFG_PROFILING_C
|
|
addi perfCounterC, perfCounterC, 1 // Increment matrix count
|
|
.endif
|
|
andi $11, cmd_w0, G_MTX_VP_M | G_MTX_NOPUSH_PUSH
|
|
beqz $11, ovl234_clipmisc_entrypoint // Model and push: go to overlay for push
|
|
sh $zero, mvpValid // and dirLightsXfrmValid
|
|
load_mtx:
|
|
andi $1, cmd_w0, G_MTX_MUL_LOAD // Read the matrix load type into $1 (2 is multiply, 0 is load)
|
|
G_MOVEMEM_handler: // Otherwise $1 is 0
|
|
jal segmented_to_physical // convert the memory address cmd_w1_dram to a virtual one
|
|
do_movemem:
|
|
// 0: load M, 2: mul M -> load temp, 4: load VP, 6: mul VP -> load temp
|
|
andi $3, cmd_w0, 0x00FE // Movemem table index into $1 (bits 1-7 of the word 0)
|
|
lbu dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Second byte of word 0
|
|
lhu dmemAddr, (movememTable)($3) // $3 reused in G_MTX_multiply_end
|
|
srl $2, cmd_w0, 5 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0
|
|
add dmemAddr, dmemAddr, $2
|
|
j dma_and_wait_goto_next_ra
|
|
lh nextRA, (afterMovememRaTable)($1) // $1 is 2 if mtx multiply, else 0
|
|
|
|
G_FLUSH_handler: // 32
|
|
jal flush_rdp_buffer // Flush once to push partial DMEM buf to FIFO
|
|
sub dmemAddr, rdpCmdBufPtr, rdpCmdBufEndP1 // Prereq; offset buffer fullness
|
|
// If the DMEM buffer was empty, dmemAddr will be unchanged and valid for this next
|
|
// jump. Otherwise, running the DMA write will cause dmemAddr to get set to a large
|
|
// negative number. Then for this second jump, the same codepath will be triggered as
|
|
// if the buffer was empty. The result is it will wait for the DMA to finish, set
|
|
// DPC_END, and return to $ra. This is why the dmemAddr register (as opposed to,
|
|
// for example, dmaLen) is used as the DMEM buf fullness.
|
|
j flush_rdp_buffer
|
|
G_MTX_multiply_end:
|
|
li $ra, run_next_DL_command // Dual use for above and below
|
|
lhu $3, (movememTable - G_MV_TEMPMTX0)($3) // $3=2->0=M; $3=6->4=VP
|
|
move $2, $3 // Input 0 = output
|
|
mtx_multiply:
|
|
// $2 and dmemAddr are input matrices; $3 is output matrix
|
|
addi $10, dmemAddr, 0x0018
|
|
@@loop:
|
|
vmadn $v7, $v31, $v31[2] // 0
|
|
addi $11, dmemAddr, 0x0008
|
|
vmadh $v6, $v31, $v31[2] // 0
|
|
addi $2, $2, -0x0020
|
|
vmudh $v29, $v31, $v31[2] // 0
|
|
@@innerloop:
|
|
ldv $v3[0], 0x0040($2)
|
|
ldv $v3[8], 0x0040($2)
|
|
lqv vTemp2[0], 0x0020(dmemAddr) // Input 1
|
|
ldv $v2[0], 0x0020($2)
|
|
ldv $v2[8], 0x0020($2)
|
|
lqv vTemp1[0], 0x0000(dmemAddr) // Input 1
|
|
vmadl $v29, $v3, vTemp2[0h]
|
|
addi dmemAddr, dmemAddr, 0x0002
|
|
vmadm $v29, $v2, vTemp2[0h]
|
|
addi $2, $2, 0x0008 // Increment input 0 pointer
|
|
vmadn $v5, $v3, vTemp1[0h]
|
|
bne dmemAddr, $11, @@innerloop
|
|
vmadh $v4, $v2, vTemp1[0h]
|
|
bne dmemAddr, $10, @@loop
|
|
addi dmemAddr, dmemAddr, 0x0008
|
|
sqv $v7[0], (0x0020)($3)
|
|
sqv $v6[0], (0x0000)($3)
|
|
sqv $v4[0], (0x0010)($3)
|
|
jr $ra
|
|
sqv $v5[0], (0x0030)($3)
|
|
|
|
G_VTX_handler: // 19
|
|
lhu dmemAddr, (vertexTable)(cmd_w0) // (v0 + n) end address; up to 56 inclusive
|
|
jal segmented_to_physical // Convert address in cmd_w1_dram to physical
|
|
lhu vtxLeft, (inputBufferEnd - 0x07)(inputBufferPos) // vtxLeft = size in bytes = vtx count * 0x10
|
|
sub dmemAddr, dmemAddr, vtxLeft // Start addr = end addr - size. Rounded down to DMA word by H/W
|
|
addi dmaLen, vtxLeft, -1 // DMA length is always offset by -1
|
|
j dma_read_write
|
|
G_SETOTHERMODE_H_handler: // These handler labels must be 4 bytes apart for the code below to work
|
|
li $ra, vtx_after_dma // Only for above, nop for below
|
|
G_SETOTHERMODE_L_handler:
|
|
lw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11) // resolves to othermode0 or othermode1 based on which handler was jumped to
|
|
lui $2, 0x8000
|
|
srav $2, $2, cmd_w0
|
|
srl $1, cmd_w0, 8
|
|
srlv $2, $2, $1
|
|
nor $2, $2, $zero
|
|
and $3, $3, $2
|
|
or $3, $3, cmd_w1_dram
|
|
sw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11)
|
|
lw cmd_w0, otherMode0
|
|
j G_RDP_handler
|
|
lw cmd_w1_dram, otherMode1
|
|
|
|
G_RDPSETOTHERMODE_handler: // 4
|
|
li $1, 8 // Offset from scissor DMEM to othermode DMEM
|
|
G_SETSCISSOR_handler: // $1 is 0 if jumped here
|
|
sw cmd_w0, (scissorUpLeft)($1) // otherMode0 = scissorUpLeft + 8
|
|
j G_RDP_handler // Send the command to the RDP
|
|
sw cmd_w1_dram, (scissorBottomRight)($1) // otherMode1 = scissorBottomRight + 8
|
|
|
|
G_GEOMETRYMODE_handler: // 5; $7 = G_GEOMETRYMODE (as negative) if jumped here
|
|
lw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value
|
|
and $11, $11, cmd_w0 // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
|
|
or $11, $11, cmd_w1_dram // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
|
|
j run_next_DL_command // run the next DL command
|
|
sw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // update the geometry mode value
|
|
|
|
G_TEXTURE_handler: // 4
|
|
li $11, textureSettings1 - (texrectWord1 - G_TEXRECTFLIP_handler) // Calculate the offset from texrectWord1 and $11 for saving to textureSettings
|
|
G_TEXRECT_handler: // $11 contains address of handler
|
|
G_TEXRECTFLIP_handler:
|
|
// Stores first command word into textureSettings for gSPTexture, 0x00D0 for gSPTextureRectangle/Flip
|
|
sw cmd_w0, (texrectWord1 - G_TEXRECTFLIP_handler)($11)
|
|
G_RDPHALF_1_handler:
|
|
j run_next_DL_command
|
|
// Stores second command word into textureSettings for gSPTexture, 0x00D4 for gSPTextureRectangle/Flip, 0x00D8 for G_RDPHALF_1
|
|
sw cmd_w1_dram, (texrectWord2 - G_TEXRECTFLIP_handler)($11)
|
|
|
|
G_RDPHALF_2_handler: // 7
|
|
ldv $v29[0], (texrectWord1)($zero)
|
|
lw cmd_w0, rdpHalf1Val // load the RDPHALF1 value into w0
|
|
addi rdpCmdBufPtr, rdpCmdBufPtr, 8
|
|
.if !ENABLE_PROFILING
|
|
addi perfCounterB, perfCounterB, 1 // Increment number of tex/fill rects
|
|
.endif
|
|
sb $zero, materialCullMode // This covers tex and fill rects
|
|
j G_RDP_handler
|
|
sdv $v29[0], -8(rdpCmdBufPtr)
|
|
|
|
G_RELSEGMENT_handler: // 9
|
|
jal segmented_to_physical // Resolve new segment address relative to existing segment
|
|
G_MOVEWORD_handler:
|
|
srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
|
|
lhu $10, (movewordTable - (G_MOVEWORD << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
|
|
do_moveword:
|
|
sll $11, cmd_w0, 16 // Sign bit = upper bit of offset
|
|
add $10, $10, cmd_w0 // Offset + base; only lower 12 bits matter
|
|
bltz $11, run_next_DL_command // If upper bit of offset is set, exit after halfword
|
|
sh cmd_w1_dram, ($10) // Store value from cmd into halfword
|
|
j run_next_DL_command
|
|
sw cmd_w1_dram, ($10) // Store value from cmd into word (offset + moveword_table[index])
|
|
|
|
ovl1_end:
|
|
align_with_warning 8, "One instruction of padding at end of ovl1"
|
|
ovl1_padded_end:
|
|
|
|
.if ovl1_padded_end > ovl01_end
|
|
.error "Automatic resizing for overlay 1 failed"
|
|
.endif
|
|
// Currently want exactly 92 instructions (based on current size of start)
|
|
.if ovl1_padded_end > start_padded_end
|
|
warn_if_base "ovl1 is larger than start, try to move something out"
|
|
.endif
|
|
.if ovl1_padded_end < start_padded_end
|
|
warn_if_base "ovl1 is smaller than start, wasting space!"
|
|
.endif
|
|
|
|
.headersize ovl234_start - orga()
|
|
|
|
ovl2_start:
|
|
// Basic lighting overlay.
|
|
|
|
// Jump here for basic lighting setup. If overlay 2 is loaded (this code), jumps into the
|
|
// rest of the lighting code below.
|
|
ovl234_ltbasic_entrypoint:
|
|
.if CFG_PROFILING_B
|
|
nop // Needs to take up the space for the other perf counter
|
|
.endif
|
|
j ltbasic_continue_setup
|
|
lbu ambLight, numLightsxSize
|
|
|
|
// Jump here for advanced lighting. If overlay 2 is loaded (this code), loads
|
|
// overlay 4 and jumps to right here, which is now in the new code.
|
|
ovl234_ltadv_entrypoint_ovl2ver: // same IMEM address as ovl234_ltadv_entrypoint
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterD, perfCounterD, 1 // Count overlay 4 load
|
|
.endif
|
|
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
|
|
li cmd_w1_dram, orga(ovl4_start) // set up a load for overlay 4
|
|
|
|
// Jump here for clipping and rare commands. If overlay 2 is loaded (this code), loads overlay 3
|
|
// and jumps to right here, which is now in the new code.
|
|
ovl234_clipmisc_entrypoint_ovl2ver: // same IMEM address as ovl234_clipmisc_entrypoint
|
|
sh $ra, tempTriRA // Tri return after clipping
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load
|
|
.endif
|
|
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
|
|
li cmd_w1_dram, orga(ovl3_start) // set up a load for overlay 3
|
|
|
|
ltbasic_continue_setup:
|
|
bnez viLtFlag, ltbasic_setup_after_xfrm // Skip if lights were valid
|
|
addi ambLight, ambLight, altBase // Point to ambient light; stored through vtx proc
|
|
xfrm_dir_lights:
|
|
lpWrld equ $v11 // light pair world direction
|
|
lpMdl equ $v12 // light pair model space direction (not yet normalized)
|
|
lpFinal equ $v13 // light pair normalized model space direction
|
|
lpSqrI equ $v14 // Light pair direction squared int part
|
|
lpSqrF equ $v15 // Light pair direction squared frac part
|
|
lpMdl2 equ $v19 // Copy of lpMdl for pipelining
|
|
lpSumI equ $v20 // Light pair direction sum of squares int part
|
|
lpSumF equ $v21 // Light pair direction sum of squares frac part
|
|
lpRsqI equ $v22 // Light pair reciprocal square root int part
|
|
lpRsqF equ $v23 // Light pair reciprocal square root frac part
|
|
// Transform directional lights' direction by M transpose.
|
|
// First, load M transpose. $v0-$v7 is the MVP matrix and $v24-$v31 is
|
|
// permanent values, leaving $v8-$v15 and $v16-$v23 for the transposes.
|
|
// This is mainly just an excuse to use the rare ltv and swv instructions.
|
|
// The F3DEX2 implementation takes 18 instructions and 11 cycles.
|
|
// This implementation is 23 instructions and 17 cycles, but this version
|
|
// loads M transpose to both halves of each vector so we can process two
|
|
// lights at a time, which matters because there's always at least 3 lights
|
|
// (technically 2 for EX3)--the lookat directions. Plus, those 17 cycles
|
|
// also include a few instructions starting the loop.
|
|
// Memory at mMatrix contains, in shorts within qwords, for the elements we care about:
|
|
// A B C - D E F - (X int, Y int)
|
|
// G H I - - - - - (Z int, W int)
|
|
// M N O - P Q R - (X frac, Y frac)
|
|
// S T U - - - - - (Z frac, W frac)
|
|
// First, load this pattern in $v8-$v15 (int) and $v16-$v23 (frac).
|
|
// $v8 A - G - A - G - $v16 M - S - M - S -
|
|
// $v9 - B - H - B - H $v17 - N - T - N - T
|
|
// $v10 I - C - I - C - $v18 U - O - U - O -
|
|
// $v11 - - - - - - - - $v19 - - - - - - - -
|
|
// $v12 D - - - D - - - $v20 P - - - P - - -
|
|
// $v13 - E - - - E - - $v21 - Q - - - Q - -
|
|
// $v14 - - F - - - F - $v22 - - R - - - R -
|
|
// $v15 - - - - - - - - $v23 - - - - - - - -
|
|
ltv $v8[0], (mMatrix + 0x00)($zero) // A to $v8[0] etc.
|
|
ltv $v8[12], (mMatrix + 0x10)($zero) // G to $v8[2] etc.
|
|
ltv $v8[8], (mMatrix + 0x00)($zero) // A to $v8[4] etc.
|
|
ltv $v8[4], (mMatrix + 0x10)($zero) // G to $v8[6] etc.
|
|
ltv $v16[0], (mMatrix + 0x20)($zero)
|
|
ltv $v16[12], (mMatrix + 0x30)($zero)
|
|
ltv $v16[8], (mMatrix + 0x20)($zero)
|
|
ltv $v16[4], (mMatrix + 0x30)($zero)
|
|
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
|
|
vmudh $v9, vOne, $v9[1q] // B - H - B - H -
|
|
lsv $v18[6], (mMatrix + 0x2C)($zero) // U - O(R)U - O -
|
|
vmrg $v8, $v8, $v12[0q] // A D G - A D G -
|
|
lsv $v18[14], (mMatrix + 0x2C)($zero) // U - O R U - O(R)
|
|
vmrg $v10, $v10, $v14[0q] // I - C F I - C F
|
|
lpv lpWrld[0], (lightBufferLookat - altBase)(altBaseReg) // Lookat 0 and 1
|
|
vmudh $v17, vOne, $v17[1q] // N - T - N - T -
|
|
li curLight, altBase - 4 * lightSize // + ltBufOfs = light -4; write pointer
|
|
vmrg $v9, $v9, $v13 // B E H - B E H -
|
|
li $11, 0x7F // Mark lights valid. Could use some other reg known to be zero, but need a nop here.
|
|
vmrg $v16, $v16, $v20[0q] // M P S - M P S -
|
|
swv $v18[4], (tempXfrmLt)(rdpCmdBufEndP1) // Stores O R U - O R U -
|
|
vmudh $v29, $v8, lpWrld[0h] // Start transforming lookat
|
|
lqv $v18, (tempXfrmLt)(rdpCmdBufEndP1)
|
|
// This is slightly wrong, vmrg writes accum lo. But only affects lookat and
|
|
// we are only reading accum mid result. Basically rounding error.
|
|
vmrg $v17, $v17, $v21 // N Q T - N Q T -
|
|
swv $v10[4], (tempXfrmLt)(rdpCmdBufEndP1) // Stores C F I - C F I -
|
|
vmadh $v29, $v9, lpWrld[1h]
|
|
lqv $v10, (tempXfrmLt)(rdpCmdBufEndP1)
|
|
vmadn $v29, $v16, lpWrld[0h]
|
|
sb $11, dirLightsXfrmValid
|
|
// 18 cycles
|
|
xfrm_light_loop_1:
|
|
vmadn $v29, $v18, lpWrld[2h]
|
|
xfrm_light_loop_2:
|
|
vmadn $v29, $v17, lpWrld[1h]
|
|
vmadh lpMdl, $v10, lpWrld[2h] // lpMdl[0:2] and [4:6] = two lights dir in model space
|
|
vrsqh $v29[0], lpSumI[0]
|
|
vrsql lpRsqF[0], lpSumF[0]
|
|
vrsqh lpRsqI[0], lpSumI[4]
|
|
addi curLight, curLight, 2 * lightSize // Iters: -2, 0, 2, ...
|
|
vrsql lpRsqF[4], lpSumF[4]
|
|
lw $20, (ltBufOfs + 8 + 2 * lightSize)(curLight) // First iter = light 0
|
|
vrsqh lpRsqI[4], $v31[2] // 0
|
|
lw $24, (ltBufOfs + 8 + 3 * lightSize)(curLight) // First iter = light 1
|
|
vmudh $v29, lpMdl, lpMdl // Squared
|
|
sub $10, curLight, altBaseReg // Is curLight (write ptr) <= 0?
|
|
vreadacc lpSqrF, ACC_MIDDLE // Read not-clamped value
|
|
sub $11, curLight, ambLight // Is curLight (write ptr) <, =, or > ambient light?
|
|
vreadacc lpSqrI, ACC_UPPER
|
|
sw $20, (tempXfrmLt)(rdpCmdBufEndP1) // Store light 0
|
|
vmudm $v29, lpMdl2, lpRsqF[0h] // Vec int * frac scaling
|
|
sw $24, (tempXfrmLt + 4)(rdpCmdBufEndP1) // Store light 1
|
|
vmadh lpFinal, lpMdl2, lpRsqI[0h] // Vec int * int scaling
|
|
lpv lpWrld[0], (tempXfrmLt)(rdpCmdBufEndP1) // Load dirs 0-2, 4-6
|
|
vmudm $v29, vOne, lpSqrF[2h] // Sum of squared components
|
|
vmadh $v29, vOne, lpSqrI[2h]
|
|
vmadm $v29, vOne, lpSqrF[1h]
|
|
vmadh $v29, vOne, lpSqrI[1h]
|
|
spv lpFinal[0], (tempXfrmLt)(rdpCmdBufEndP1) // Store elem 0-2, 4-6 as bytes to temp memory
|
|
vmadn lpSumF, lpSqrF, vOne // elem 0, 4; swapped so we can do vmadn and get result
|
|
lw $20, (tempXfrmLt)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit
|
|
vmadh lpSumI, lpSqrI, vOne
|
|
lw $24, (tempXfrmLt + 4)(rdpCmdBufEndP1) // Load 3 (4) bytes to scalar unit
|
|
vcopy lpMdl2, lpMdl
|
|
blez $10, xfrm_light_store_lookat // curLight = -2 or 0
|
|
vmudh $v29, $v8, lpWrld[0h]
|
|
// 20 cycles from xfrm_light_loop_2 not counting land
|
|
vmadh $v29, $v9, lpWrld[1h]
|
|
bgtz $11, ltbasic_setup_after_xfrm // curLight > ambient; only one light valid
|
|
sw $20, (ltBufOfs + 0xC - 2 * lightSize)(curLight) // Write light relative -2
|
|
vmadn $v29, $v16, lpWrld[0h]
|
|
bltz $11, xfrm_light_loop_1 // curLight < ambient; more lights to compute
|
|
sw $24, (ltBufOfs + 0xC - 1 * lightSize)(curLight) // Write light relative -1
|
|
ltbasic_setup_after_xfrm:
|
|
// Constants registers:
|
|
// e0 e1 e2 e3 e4 e5 e6 e7
|
|
// vLTC 0xF800 Lt1 Z AOAmb AODir Lt1 X Lt1 Y AOAmb AODir
|
|
// $v30 SOffs TOffs 0/AOa Persp SOffs TOffs 0x0020 0x0800
|
|
lpv vLTC[0], (ltBufOfs + 8 - lightSize)(ambLight) // First lt xfrmed dir in elems 4-6
|
|
addi lbFakeAmb, ambLight, ltBufOfs // Ptr to load amb light from; normally actual ambient light
|
|
li vLoopRet, ltbasic_start_standard
|
|
andi $11, vGeomMid, (G_AMBOCCLUSION | G_PACKED_NORMALS | G_LIGHTTOALPHA | G_TEXTURE_GEN) >> 8
|
|
vmov $v30[2], $v31[2] // 0 as AO alpha offset
|
|
vmov vLTC[1], vLTC[6] // Move first lt Z to elem 1; watch stall on vLTC load
|
|
beqz $11, vtx_after_lt_setup // None of the above features enabled
|
|
li lbAfter, vtx_return_from_lighting
|
|
andi $11, vGeomMid, G_TEXTURE_GEN >> 8
|
|
beqz $11, @@skip_texgen
|
|
andi $10, vGeomMid, G_PACKED_NORMALS >> 8
|
|
li lbAfter, 0x8000 | ltbasic_texgen // Negative is used as flag
|
|
@@skip_texgen:
|
|
beqz $10, @@skip_packed
|
|
move lbTexgenOrRet, lbAfter
|
|
// Packed normals setup
|
|
sbv $v31[15], (3)(lbFakeAmb) // 0xFF; Set ambient "alpha" to FF / 7F80
|
|
vmov $v30[6], $v31[2] // 0; clear element 6, will overwrite second byte of it below
|
|
sbv $v31[15], (7)(lbFakeAmb) // 0xFF; so vpLtTot alpha ~= 7FFF, so * vtx alpha
|
|
li lbAfter, ltbasic_packed
|
|
li vLoopRet, ltbasic_start_packed
|
|
lsv vLTC[0], (packedNormalsMaskConstant - altBase)(altBaseReg) // 0xF800; cull mode already zeroed
|
|
llv $v30[13], (packedNormalsConstants - altBase)(altBaseReg) // 00[20 0800 OB]; out of bounds truncates
|
|
@@skip_packed:
|
|
andi $11, vGeomMid, G_LIGHTTOALPHA >> 8
|
|
beqz $11, @@skip_l2a
|
|
andi $10, vGeomMid, G_AMBOCCLUSION >> 8
|
|
li lbAfter, ltbasic_l2a
|
|
@@skip_l2a:
|
|
beqz $10, vtx_after_lt_setup
|
|
// AO setup
|
|
move lbPostAo, lbAfter // Harmless to be done even if not AO
|
|
addi lbFakeAmb, rdpCmdBufEndP1, tempAmbient // Temp mem as ambient light
|
|
vmov $v30[2], $v31[7] // 7FFF as AO alpha offset
|
|
spv vOne[0], (0)(lbFakeAmb) // Store all zeros here (upper bytes of vOne are 0)
|
|
llv vLTC[4], (aoAmbientFactor - altBase)(altBaseReg) // Ambient and dir to elems 2, 3
|
|
llv vLTC[12], (aoAmbientFactor - altBase)(altBaseReg) // Ambient and dir to elems 6, 7
|
|
j vtx_after_lt_setup
|
|
li lbAfter, ltbasic_ao
|
|
|
|
xfrm_light_store_lookat:
|
|
vmadh $v29, $v9, lpWrld[1h]
|
|
spv lpFinal[0], (xfrmLookatDirs)($zero) // Store lookat. 1st time garbage, 2nd real
|
|
vmadn $v29, $v16, lpWrld[0h]
|
|
j xfrm_light_loop_2
|
|
vmadn $v29, $v18, lpWrld[2h]
|
|
|
|
// Lighting within vertex loop
|
|
|
|
.if CFG_NO_OCCLUSION_PLANE
|
|
|
|
.macro instan_lt_vec_1
|
|
vmadh $v29, vMTX1I, vpMdl[1h]
|
|
.endmacro
|
|
.macro instan_lt_vec_2
|
|
vmadn vpClpF, vMTX2F, vpMdl[2h]
|
|
.endmacro
|
|
.macro instan_lt_vec_3
|
|
vmadh vpClpI, vMTX2I, vpMdl[2h]
|
|
.endmacro
|
|
// lDOT <- vpMdl
|
|
.macro instan_lt_scl_1
|
|
andi $10, $10, CLIP_SCAL_NPXY // Mask to only bits we care about
|
|
.endmacro
|
|
.macro instan_lt_scl_2
|
|
or flagsV1, flagsV1, $10 // Combine results for first vertex
|
|
.endmacro
|
|
// sFOG <- lCOL
|
|
.macro instan_lt_vs_45
|
|
vge sFOG, vpScrI, $v31[6] // Clamp W/fog to >= 0x7F00 (low byte is used)
|
|
addi vtxLeft, vtxLeft, -2*inputVtxSize // Decrement vertex count by 2
|
|
vge sCLZ, vpScrI, $v31[2] // 0; clamp Z to >= 0
|
|
sh flagsV1, (VTX_CLIP )(outVtx1) // Store first vertex flags
|
|
.endmacro
|
|
|
|
.else
|
|
|
|
.macro instan_lt_vec_1
|
|
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
|
|
.endmacro
|
|
.macro instan_lt_vec_2
|
|
vmrg sOCS, sOCS, sOTM // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
|
|
.endmacro
|
|
.macro instan_lt_vec_3
|
|
vmrg vpScrF, vpScrF, sCLZ[2h] // Z int elem 2, 6 to elem 1, 5; Z frac in elem 2, 6
|
|
.endmacro
|
|
// lDOT <- sCLZ
|
|
// vpRGBA <- sOTM
|
|
.macro instan_lt_scl_1
|
|
sub $11, outVtx1, fogFlag // Points 8 before outVtx1 if fog, else 0
|
|
.endmacro
|
|
.macro instan_lt_scl_2
|
|
sbv sFOG[7], (VTX_COLOR_A + 8)($11)
|
|
.endmacro
|
|
// lCOL <- sFOG
|
|
.macro instan_lt_vs_45
|
|
vmudm $v29, vpST, sSTS // Scale ST
|
|
slv vpScrI[8], (VTX_SCR_VEC )(outVtx2)
|
|
vmadh vpST, vOne, $v30 // + 1 * ST offset; elems 0, 1, 4, 5
|
|
addi outVtxBase, outVtxBase, 2*vtxSize // Points to SECOND output vtx
|
|
.endmacro
|
|
|
|
.endif
|
|
|
|
.align 8
|
|
|
|
// If lighting, vLoopRet = ltbasic_start_packed if packed, else ltbasic_start_standard
|
|
|
|
ltbasic_start_packed:
|
|
instan_lt_vec_1
|
|
instan_lt_vec_2
|
|
instan_lt_vec_3
|
|
vand vpNrmlX, vpMdl, vLTC[0] // 0xF800; mask X to only top 5 bits
|
|
luv lVCI[0], (tempVpRGBA)(rdpCmdBufEndP1) // Load RGBA
|
|
vmudn vpNrmlY, vpMdl, $v30[6] // (1 << 5) = 0x0020; left shift normals Y
|
|
j ltbasic_after_start
|
|
vmudn vpNrmlZ, vpMdl, $v30[7] // (1 << 11) = 0x0800; left shift normals Z
|
|
|
|
.align 8
|
|
ltbasic_start_standard:
|
|
// Using elem 3, 7 for regular normals because packed normal results are there.
|
|
instan_lt_vec_1
|
|
lpv vpNrmlX[3], (tempVpRGBA)(rdpCmdBufEndP1) // X to elem 3, 7
|
|
instan_lt_vec_2
|
|
lpv vpNrmlY[2], (tempVpRGBA)(rdpCmdBufEndP1) // Y to elem 3, 7
|
|
instan_lt_vec_3
|
|
lpv vpNrmlZ[1], (tempVpRGBA)(rdpCmdBufEndP1) // Z to elem 3, 7
|
|
vnop
|
|
luv lVCI[0], (tempVpRGBA)(rdpCmdBufEndP1) // Load vertex color input
|
|
ltbasic_after_start:
|
|
|
|
.if CFG_DEBUG_NORMALS
|
|
.warning "Debug normals visualization is enabled"
|
|
vmudh vpNrmlX, vOne, vpNrmlX[3h] // Move X to all elements
|
|
vne $v29, $v31, $v31[1h] // Set VCC to 10111011
|
|
vmrg vpNrmlX, vpNrmlX, vpNrmlY[3h] // X in 0, 4; Y to 1, 5
|
|
vne $v29, $v31, $v31[2h] // Set VCC to 11011101
|
|
vmrg vpNrmlX, vpNrmlX, vpNrmlZ[3h] // Z to 2, 6
|
|
vmudh $v29, vOne, $v31[5] // 0x4000; middle gray
|
|
j vtx_return_from_lighting
|
|
vmacf vpRGBA, vpNrmlX, $v31[5] // 0x4000; + 0.5 * normal
|
|
.else // CFG_DEBUG_NORMALS
|
|
|
|
vmulf $v29, vpNrmlX, vLTC[4] // Normals X elems 3, 7 * first light dir X
|
|
// lDIR <- (NOC: -, Occ: sOTM)
|
|
lpv lDIR[0], (ltBufOfs + 8 - 2*lightSize)(ambLight) // Xfrmed dir in elems 4-6; temp reg
|
|
vmacf $v29, vpNrmlY, vLTC[5] // Normals Y elems 3, 7 * first light dir Y
|
|
luv vpLtTot, (0)(lbFakeAmb) // Total light level, init to ambient or zeros if AO
|
|
// lDOT <- (NOC: vpMdl, Occ: sCLZ)
|
|
vmacf lDOT, vpNrmlZ, vLTC[1] // Normals Z elems 3, 7 * first light dir Z
|
|
instan_lt_scl_1 // $11 can be used as a temporary, except b/w instan_lt_scl_1...
|
|
vsub lVCI, lVCI, $v30[2] // Offset alpha for AO, or 0 normally
|
|
instan_lt_scl_2 // ...and instan_lt_scl_2
|
|
// lCOL <- (Occ: sFOG here / NOC: sSCI earlier)
|
|
// vnop
|
|
beq ambLight, altBaseReg, ltbasic_post
|
|
move curLight, ambLight // Point to ambient light
|
|
ltbasic_loop:
|
|
vge lDTC, lDOT, $v31[2] // 0; clamp dot product to >= 0
|
|
vmulf $v29, vpNrmlX, lDIR[4] // Normals X elems 3, 7 * next light dir
|
|
luv lCOL, (ltBufOfs + 0 - 1*lightSize)(curLight) // Light color
|
|
vmacf $v29, vpNrmlY, lDIR[5] // Normals Y elems 3, 7 * next light dir
|
|
addi curLight, curLight, -lightSize
|
|
vmacf lDOT, vpNrmlZ, lDIR[6] // Normals Z elems 3, 7 * next light dir
|
|
lpv lDIR[0], (ltBufOfs + 8 - 2*lightSize)(curLight) // Xfrmed dir in elems 4-6; DOES dual-issue
|
|
vmudh $v29, vOne, vpLtTot // Load accum mid with current light level
|
|
bne curLight, altBaseReg, ltbasic_loop
|
|
vmacf vpLtTot, lCOL, lDTC[3h] // + light color * dot product
|
|
ltbasic_post:
|
|
// (NOC: sFOG here / Occ: vpClpI later) <- lCOL
|
|
instan_lt_vs_45
|
|
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
|
|
jr lbAfter
|
|
// vpRGBA <- lDIR
|
|
vmrg vpRGBA, vpLtTot, lVCI // RGB = light, A = vtx alpha
|
|
|
|
.endif // CFG_DEBUG_NORMALS
|
|
|
|
// lbAfter = ltbasic_ao if AO else
|
|
// lbPostAo = ltbasic_l2a if L2A else
|
|
// ltbasic_packed if packed else
|
|
// lbTexgenOrRet = ltbasic_texgen if texgen else
|
|
// vtx_return_from_lighting
|
|
|
|
ltbasic_ao:
|
|
vmudn $v29, vLTC, lVCI[3h] // (aoAmb 2 6, aoDir 3 7) * (alpha - 1)
|
|
luv vpRGBA, (ltBufOfs + 0)(ambLight) // Ambient light level
|
|
vmadh lDTC, vOne, $v31[7] // + 0x7FFF (1 in s.15)
|
|
vadd lVCI, lVCI, $v31[7] // 0x7FFF; undo offset alpha
|
|
vmulf $v29, vpLtTot, lDTC[3h] // Sum of dir lights *= dir factor
|
|
vmacf vpLtTot, vpRGBA, lDTC[2h] // + ambient * amb factor
|
|
jr lbPostAo // Return, texgen, l2a, or packed
|
|
vmacf vpRGBA, $v31, $v31[2] // 0; need it in vpRGBA if returning, else in vpLtTot
|
|
|
|
ltbasic_l2a:
|
|
// Light-to-alpha (cel shading): alpha = max of light components, RGB = vertex color
|
|
vge vpLtTot, vpLtTot, vpLtTot[1h] // elem 0 = max(R0, G0); elem 4 = max(R1, G1)
|
|
vge vpLtTot, vpLtTot, vpLtTot[2h] // elem 0 = max(R0, G0, B0); equiv for elem 4
|
|
vne $v29, $v31, $v31[3h] // Reset VCC to 11101110 (clobbered by vge)
|
|
jr lbTexgenOrRet
|
|
vmrg vpRGBA, lVCI, vpLtTot[0h] // RGB is vcol (garbage if not packed); A is light
|
|
|
|
ltbasic_packed:
|
|
bgez lbTexgenOrRet, vtx_return_from_lighting // < 0 for texgen
|
|
vmulf vpRGBA, vpLtTot, lVCI // (Light color, 7FFF alpha) * vertex RGBA.
|
|
ltbasic_texgen:
|
|
// Texgen: in vpNrmlX:Y:Z; temps vpLtTot, lDOT, lDTC; out vpST.
|
|
lLkDrs equ lDTC // lighting Lookat Directions
|
|
lLkDt0 equ vpLtTot // lighting Lookat Dot product 0
|
|
lLkDt1 equ lDOT // lighting Lookat Dot product 1
|
|
lpv lLkDrs[0], (xfrmLookatDirs + 0)($zero) // Lookat 0 in 0-2, 1 in 4-6
|
|
.macro texgen_dots, lookats, dot0, dot1
|
|
vmulf $v29, vpNrmlX, lookats[0] // Normals X * lookat 0 X
|
|
vmacf $v29, vpNrmlY, lookats[1] // Normals Y * lookat 0 Y
|
|
vmacf dot0, vpNrmlZ, lookats[2] // Normals Z * lookat 0 Z
|
|
vmulf $v29, vpNrmlX, lookats[4] // Normals X * lookat 1 X
|
|
vmacf $v29, vpNrmlY, lookats[5] // Normals Y * lookat 1 Y
|
|
vmacf dot1, vpNrmlZ, lookats[6] // Normals Z * lookat 1 Z
|
|
.endmacro
|
|
texgen_dots lLkDrs, lLkDt0, lLkDt1
|
|
.if !CFG_NO_OCCLUSION_PLANE
|
|
addi outVtxBase, outVtxBase, -2*vtxSize // Undo doing this twice due to repeating ST scale
|
|
.endif
|
|
// In ltbasic, normals are in elems 3, 7; in ltadv, elems 0, 4
|
|
vmudh lLkDt0, vOne, lLkDt0[3h] // Move dot 0 from elems 3, 7 to 0, 4
|
|
.macro texgen_body, lookats, dot0, dot1, normalselem, branch_no_texgen_linear
|
|
// lookats now holds texgen linear coefficients elems 0, 1
|
|
llv lookats[0], (texgenLinearCoeffs - altBase)(altBaseReg)
|
|
vne $v29, $v31, $v31[1h] // Set VCC to 10111011
|
|
andi $11, vGeomMid, G_TEXTURE_GEN_LINEAR >> 8
|
|
vmrg dot0, dot0, dot1[normalselem] // Dot products in elements 0, 1, 4, 5
|
|
vmudh $v29, vOne, $v31[5] // 1 * 0x4000
|
|
beqz $11, branch_no_texgen_linear
|
|
vmacf vpST, dot0, $v31[5] // + dot products * 0x4000 ( / 2)
|
|
// Texgen_Linear:
|
|
vmulf vpST, dot0, $v31[5] // dot products * 0x4000 ( / 2)
|
|
// dot0 now holds lighting Lookat ST squared
|
|
vmulf dot0, vpST, vpST // ST squared
|
|
vmulf $v29, vpST, $v31[7] // Move ST to accumulator (0x7FFF = 1)
|
|
// dot1 now holds lighting Lookat Temp
|
|
vmacf dot1, vpST, lookats[1] // + ST * 0x6CB3
|
|
vmudh $v29, vOne, $v31[5] // 1 * 0x4000
|
|
vmacf vpST, vpST, lookats[0] // + ST * 0x44D3
|
|
.endmacro
|
|
texgen_body lLkDrs, lLkDt0, lLkDt1, 3h, vtx_return_from_texgen
|
|
j vtx_return_from_texgen
|
|
.macro texgen_lastinstr, dot0, dot1
|
|
vmacf vpST, dot0, dot1 // + ST squared * (ST + ST * coeff)
|
|
.endmacro
|
|
texgen_lastinstr lLkDt0, lLkDt1
|
|
|
|
ovl2_end:
|
|
.align 8
|
|
ovl2_padded_end:
|
|
|
|
.headersize ovl234_start - orga()
|
|
|
|
ovl4_start:
|
|
// Advanced lighting overlay.
|
|
|
|
// Jump here for basic lighting setup. If overlay 4 is loaded (this code), loads overlay 2
|
|
// and jumps to right here, which is now in the new code.
|
|
ovl234_ltbasic_entrypoint_ovl4ver: // same IMEM address as ovl234_ltbasic_entrypoint
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterC, perfCounterC, 1 // Count lighting overlay load
|
|
.endif
|
|
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
|
|
li cmd_w1_dram, orga(ovl2_start) // set up a load for overlay 2
|
|
|
|
// Jump here for advanced lighting. If overlay 4 is loaded (this code), jumps
|
|
// to the instruction selection below.
|
|
ovl234_ltadv_entrypoint:
|
|
.if CFG_PROFILING_B
|
|
nop // Needs to take up the space for the other perf counter
|
|
.endif
|
|
j vtx_load_mtx
|
|
li $11, mMatrix
|
|
|
|
// Jump here for clipping and rare commands. If overlay 4 is loaded (this code), loads overlay 3
|
|
// and jumps to right here, which is now in the new code.
|
|
ovl234_clipmisc_entrypoint_ovl4ver: // same IMEM address as ovl234_clipmisc_entrypoint
|
|
sh $ra, tempTriRA // Tri return after clipping
|
|
.if CFG_PROFILING_B
|
|
addi perfCounterD, perfCounterD, 0x4000 // Count clipping overlay load
|
|
.endif
|
|
jal load_overlays_2_3_4 // Not a call; returns to $ra-8 = here
|
|
li cmd_w1_dram, orga(ovl3_start) // set up a load for overlay 3
|
|
|
|
ltadv_spec_fres_setup: // Odd instruction
|
|
// Get aDIR = normalize(camera - vertex), aDOT = (vpWNrm dot aDIR)
|
|
ldv aDPosI[0], (cameraWorldPos - altBase)(altBaseReg) // Camera world pos
|
|
j ltadv_normal_to_vertex
|
|
ldv aDPosI[8], (cameraWorldPos - altBase)(altBaseReg)
|
|
ltadv_after_camera:
|
|
vmov aOAFrs[0], aDOT[0] // Save Fresnel dot product in aOAFrs[0h]
|
|
vmov aOAFrs[4], aDOT[4] // elems 0, 4
|
|
bgez laSpecular, ltadv_loop // Sign bit clear = not specular
|
|
li laSpecFres, 0 // Clear flag for specular or fresnel
|
|
// aProj <- aLenF
|
|
vmulf aProj, vpWNrm, aDOT[0h] // Projection of camera vec onto normal
|
|
vmudh $v29, aDIR, $v31[1] // -camera vec
|
|
j ltadv_normals_to_regs // For specular, replace vpWNrm with reflected vector
|
|
vmadh vpWNrm, aProj, $v31[3] // + 2 * projection
|
|
// aDPosI <- aProj
|
|
|
|
ltadv_xfrm: // Even instruction
|
|
vmudn $v29, vMTX0F, vpMdl[0h]
|
|
lbu curLight, numLightsxSize // Scalar instructions here must be OK to do twice
|
|
vmadh $v29, vMTX0I, vpMdl[0h]
|
|
luv vpRGBA, (VTX_IN_TC + 0 * inputVtxSize)(laPtr) // Vtx 2:1 RGBA
|
|
vmadn $v29, vMTX1F, vpMdl[1h]
|
|
vmadh $v29, vMTX1I, vpMdl[1h]
|
|
addi curLight, curLight, altBase // Point to ambient light
|
|
vmadn aDPosF, vMTX2F, vpMdl[2h]
|
|
jr $ra
|
|
vmadh aDPosI, vMTX2I, vpMdl[2h]
|
|
|
|
ltadv_after_mtx: // Even instruction
|
|
move laPtr, inVtx
|
|
vcopy aPNScl, vOne
|
|
move laVtxLeft, vtxLeft
|
|
vmudn aDPosF, vMTX1F, $v31[7] // 0x7FFF; transform a normal (0, 7FFF, 0)
|
|
// 0001 00[20 0800 XX]01 = (1<<0),(1<<5),(1<<11),XX, repeat
|
|
llv aPNScl[3], (packedNormalsConstants - altBase)(altBaseReg)
|
|
vmadh aDPosI, vMTX1I, $v31[7]
|
|
j ltadv_normalize
|
|
llv aPNScl[11], (packedNormalsConstants - altBase)(altBaseReg)
|
|
ltadv_continue_setup:
|
|
lqv aParam, (fxParams - altBase)(altBaseReg)
|
|
vcopy aNrmSc, aRcpLn // aRcpLn[0:1] is int:frac scale (1 / length)
|
|
lsv aPNScl[6], (packedNormalsMaskConstant - altBase)(altBaseReg) // F800
|
|
vge $v29, $v31, $v31[3] // Set VCC to 00011111
|
|
andi $11, vGeomMid, G_AMBOCCLUSION >> 8
|
|
bnez $11, @@skip_zero_ao
|
|
andi laL2A, vGeomMid, G_LIGHTTOALPHA >> 8
|
|
vmrg aParam, aParam, $v31[2] // 0
|
|
@@skip_zero_ao:
|
|
jal while_wait_dma_busy
|
|
andi laTexgen, vGeomMid, G_TEXTURE_GEN >> 8
|
|
ldv vpMdl[0], (VTX_IN_OB + 1 * inputVtxSize)(laPtr) // Vtx 2 Model pos + PN
|
|
ldv vpMdl[8], (VTX_IN_OB + 0 * inputVtxSize)(laPtr) // Vtx 1 Model pos + PN
|
|
align_with_warning 8, "One instruction of padding before ltadv_vtx_loop"
|
|
ltadv_vtx_loop: // Even instruction
|
|
vmudm $v29, aPNScl, vpMdl[3h] // Packed normals from elem 3,7 of model pos
|
|
lw $11, (VTX_IN_CN + 1 * inputVtxSize)(laPtr) // Vtx 2 RGBA
|
|
vmadn vpNrmlY, $v31, $v31[2] // 0; load lower (vpMdl unsigned but must be T operand)
|
|
lw laSTKept,(VTX_IN_TC + 0 * inputVtxSize)(laPtr) // Vtx 1 ST
|
|
vand vpNrmlX, vpMdl, aPNScl[3] // 0xF800; X component masked in elem 3, 7
|
|
jal ltadv_xfrm
|
|
sw $11, (VTX_IN_TC + 0 * inputVtxSize)(laPtr) // Vtx 2 RGBA -> Vtx 1 ST
|
|
vmadn vpWrlF, vMTX3F, vOne // Finish vertex pos transform
|
|
vmadh vpWrlI, vMTX3I, vOne
|
|
andi laPacked, vGeomMid, G_PACKED_NORMALS >> 8
|
|
// aOAFrs <- vpST
|
|
vsub aOAFrs, vpRGBA, $v31[7] // 0x7FFF; offset alpha elems 3, 7
|
|
luv vpLtTot, (ltBufOfs + 0)(curLight) // Total light level, init to ambient
|
|
vne $v29, $v31, $v31[0h] // Set VCC to 01110111
|
|
beqz laPacked, @@skip_packed_normals
|
|
lpv vpMdl, (VTX_IN_TC + 0 * inputVtxSize)(laPtr) // Vtx 2:1 regular normals
|
|
vmrg vpMdl, vpNrmlY, vpNrmlX[3h] // Masked X to 0, 4; multiplied Y, Z in 1, 2, 5, 6
|
|
@@skip_packed_normals:
|
|
vmudh $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
|
|
jal ltadv_xfrm
|
|
// aAOF2 <- aDOT
|
|
vmadm aAOF2, aOAFrs, aParam[0] // + (alpha - 1) * aoAmb factor; elems 3, 7
|
|
// aLTC <- vpMdl
|
|
vmulf vpLtTot, vpLtTot, aAOF2[3h] // light color *= ambient factor
|
|
// aDOT <- aAOF2
|
|
vmudn $v29, aDPosF, aNrmSc[0h] // Vec frac * int scaling, discard result
|
|
// aDIR <- aDPosF
|
|
addi laPtr, laPtr, 2 * inputVtxSize
|
|
vmadm $v29, aDPosI, aNrmSc[1h] // Vec int * frac scaling, discard result
|
|
addi laVtxLeft, laVtxLeft, -2 * inputVtxSize
|
|
// vpWNrm <- vpNrmlX
|
|
vmadh vpWNrm, aDPosI, aNrmSc[0h] // Vec int * int scaling
|
|
sll laSpecular, vGeomMid, (31 - 5) // G_LIGHTING_SPECULAR to sign bit
|
|
vmudn vpWrlF, vpWrlF, $v31[1] // -1; negate world pos so add light/cam pos to it
|
|
andi laSpecFres, vGeomMid, (G_LIGHTING_SPECULAR | G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
|
|
vmadh vpWrlI, vpWrlI, $v31[1] // -1
|
|
|
|
.if CFG_DEBUG_NORMALS
|
|
vmudh $v29, vOne, $v31[5] // 0x4000; middle gray
|
|
li laTexgen, 0
|
|
vmacf vpRGBA, vpWNrm, $v31[5] // 0x4000; + 0.5 * normal
|
|
ltadv_finish_light:
|
|
ltadv_loop:
|
|
ltadv_normals_to_regs:
|
|
ltadv_specular:
|
|
.else
|
|
|
|
ltadv_normals_to_regs:
|
|
vmudh vpNrmlY, vOne, vpWNrm[1h] // Move normals to separate registers
|
|
bnez laSpecFres, ltadv_spec_fres_setup
|
|
vmudh vpNrmlZ, vOne, vpWNrm[2h] // per component, in elems 0-3, 4-7
|
|
// vpNrmlX <- vpWNrm
|
|
// aAOF <- aDPosI
|
|
align_with_warning 8, "One instruction of padding before ltadv_loop"
|
|
ltadv_loop: // Even instruction
|
|
vmudh $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
|
|
lbu $11, (ltBufOfs + 3 - lightSize)(curLight) // Light type / constant attenuation
|
|
vmadm aAOF, aOAFrs, aParam[1] // + (alpha - 1) * aoDir factor; elems 3, 7
|
|
beq curLight, altBaseReg, ltadv_post
|
|
lpv aDOT[0], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 0-2
|
|
bnez $11, ltadv_point
|
|
luv aLTC, (ltBufOfs + 0 - lightSize)(curLight) // Light color
|
|
vmulf $v29, vpNrmlX, aDOT[0]
|
|
vmacf $v29, vpNrmlY, aDOT[1]
|
|
bltzal laSpecular, ltadv_specular
|
|
vmacf aDOT, vpNrmlZ, aDOT[2]
|
|
ltadv_finish_light:
|
|
vmulf aLTC, aLTC, aAOF[3h] // light color *= dir or point light factor
|
|
vge aDOT, aDOT, $v31[2] // 0; clamp dot product to >= 0
|
|
addi curLight, curLight, -lightSize
|
|
vmudh $v29, vOne, vpLtTot // Load accum mid with current light level
|
|
j ltadv_loop
|
|
vmacf vpLtTot, aLTC, aDOT[0h] // + light color * dot product
|
|
|
|
ltadv_specular: // aDOT in/out, uses vpLtTot[3] and $11 as temps
|
|
lb $11, (ltBufOfs + 0xF - lightSize)(curLight) // Light size factor
|
|
mtc2 $11, vpLtTot[6] // Light size factor in elem 3 as temp
|
|
vxor aDOT, aDOT, $v31[7] // = 0x7FFF - dot product
|
|
vmudh aDOT, aDOT, vpLtTot[3] // * size factor
|
|
jr $ra
|
|
vxor aDOT, aDOT, $v31[7] // = 0x7FFF - result
|
|
|
|
.align 8
|
|
ltadv_post:
|
|
// aClOut <- vpWrlF
|
|
// aAlOut <- vpWrlI
|
|
// vpMdl <- aLTC
|
|
vge aAOF, vpLtTot, vpLtTot[1h] // elem 0 = max(R0, G0); elem 4 = max(R1, G1)
|
|
ldv vpMdl[0], (VTX_IN_OB + 1 * inputVtxSize)(laPtr) // Vtx 2 Model pos + PN
|
|
vmulf aClOut, vpRGBA, vpLtTot // RGB output is RGB * light
|
|
beqz laL2A, @@skip_cel
|
|
vcopy aAlOut, vpRGBA // Alpha output = vertex alpha (only 3, 7 matter)
|
|
// Cel: alpha = max of light components, RGB = vertex color
|
|
vge aAOF, aAOF, aAOF[2h] // elem 0 = max(R0, G0, B0); equiv for elem 4
|
|
vcopy aClOut, vpRGBA // RGB output is vertex color
|
|
vmudh aAlOut, vOne, aAOF[0h] // move light level elem 0, 4 to 3, 7
|
|
@@skip_cel:
|
|
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
|
|
bnez laPacked, @@skip_novtxcolor
|
|
andi $11, vGeomMid, (G_FRESNEL_COLOR | G_FRESNEL_ALPHA) >> 8
|
|
vcopy aClOut, vpLtTot // If no packed normals, base output is just light
|
|
@@skip_novtxcolor:
|
|
vmrg vpRGBA, aClOut, aAlOut // Merge base output and alpha output
|
|
beqz $11, ltadv_skip_fresnel
|
|
ldv vpMdl[8], (VTX_IN_OB + 0 * inputVtxSize)(laPtr) // Vtx 1 Model pos + PN
|
|
lsv aAOF[0], (vTRC_0100_addr - altBase)(altBaseReg) // Load constant 0x0100 to temp
|
|
vabs aOAFrs, aOAFrs, aOAFrs // Fresnel dot in aOAFrs[0h]; absolute value for underwater
|
|
andi $11, vGeomMid, G_FRESNEL_COLOR >> 8
|
|
vmudh $v29, vOne, aParam[7] // Fresnel offset
|
|
vmacf aOAFrs, aOAFrs, aParam[6] // + factor * scale
|
|
beqz $11, @@skip
|
|
vmudh aOAFrs, aOAFrs, aAOF[0] // Result * 0x0100, clamped to 0x7FFF
|
|
veq $v29, $v31, $v31[3h] // Set VCC to 00010001 if G_FRESNEL_COLOR
|
|
@@skip:
|
|
vmrg vpRGBA, vpRGBA, aOAFrs[0h] // Replace color or alpha with fresnel
|
|
vge vpRGBA, vpRGBA, $v31[2] // Clamp to >= 0 for fresnel; doesn't affect others
|
|
|
|
.endif // CFG_DEBUG_NORMALS
|
|
|
|
ltadv_skip_fresnel:
|
|
beqz laTexgen, ltadv_after_texgen
|
|
suv vpRGBA, (VTX_IN_TC - 2 * inputVtxSize)(laPtr) // Vtx 2:1 RGBA
|
|
// Texgen: aDOT still contains lookat 0 in elems 0-2, lookat 1 in elems 4-6
|
|
// vpST <- aOAFrs
|
|
texgen_dots aDOT, aLkDt0, aLkDt1
|
|
texgen_body aDOT, aLkDt0, aLkDt1, 0h, ltadv_texgen_end
|
|
texgen_lastinstr aLkDt0, aLkDt1
|
|
ltadv_texgen_end: // Vtx 2 ST in vpST elem 0, 1; vtx 1 ST in vpST elem 4, 5
|
|
slv vpST[8], (tempVtx1ST)(rdpCmdBufEndP1) // Vtx 1 ST
|
|
bltz laVtxLeft, ltadv_after_texgen // Only vtx 1 is valid, don't write vtx 2
|
|
lw laSTKept, (tempVtx1ST)(rdpCmdBufEndP1) // Overwrite stored Vtx 1 ST
|
|
slv vpST[0], (VTX_IN_TC - 1 * inputVtxSize)(laPtr) // Vtx 2 ST
|
|
ltadv_after_texgen:
|
|
lw $11, (VTX_IN_TC - 2 * inputVtxSize)(laPtr) // Vtx 2 RGBA from vtx 1 ST slot
|
|
bltz laVtxLeft, vtx_setup_no_lighting
|
|
sw laSTKept, (VTX_IN_TC - 2 * inputVtxSize)(laPtr) // Restore vtx 1 ST
|
|
ltadv_vtx_loop_end:
|
|
bgtz laVtxLeft, ltadv_vtx_loop
|
|
sw $11, (VTX_IN_CN - 1 * inputVtxSize)(laPtr) // Real vtx 2 RGBA
|
|
j vtx_setup_no_lighting
|
|
// Delay slot is OK
|
|
|
|
ltadv_point:
|
|
/*
|
|
Input vector 1 elem size 7FFF.0000 -> len^2 3FFF0001 -> 1/len 0001.0040 -> vec +801E.FFC0 -> clamped 7FFF
|
|
len^2 * 1/len = 400E.FFC1 so about half actual length
|
|
Input vector 1 elem size 0100.0000 -> len^2 00010000 -> 1/len 007F.FFC0 -> vec 7FFF.C000 -> clamped 7FFF
|
|
len^2 * 1/len = 007F.FFC0 so about half actual length
|
|
Input vector 1 elem size 0010.0000 -> len^2 00000100 -> 1/len 07FF.FC00 -> vec 7FFF.C000
|
|
Input vector 1 elem size 0001.0000 -> len^2 00000001 -> 1/len 7FFF.C000 -> vec 7FFF.C000
|
|
*/
|
|
// aDPosI <- aAOF
|
|
ldv aDPosI[0], (ltBufOfs + 8 - lightSize)(curLight) // Light position int part 0-3
|
|
ldv aDPosI[8], (ltBufOfs + 8 - lightSize)(curLight) // 4-7
|
|
lbu $10, (ltBufOfs + 7 - lightSize)(curLight) // PL: Linear factor
|
|
ltadv_normal_to_vertex:
|
|
vadd aDPosI, aDPosI, vpWrlI // Not using aDPosF; frac part is just vpWrlF
|
|
lbu $24, (ltBufOfs + 0xE - lightSize)(curLight) // PL: Quadratic factor
|
|
ltadv_normalize: // Normalize vector in aDPosI:vpWrlF i/f
|
|
vmudm $v29, aDPosI, vpWrlF // Squared. Don't care about frac*frac term
|
|
sll $11, $11, 8 // Constant factor, 00000100 - 0000FF00
|
|
vmadn $v29, vpWrlF, aDPosI
|
|
sll $10, $10, 6 // Linear factor, 00000040 - 00003FC0
|
|
vmadh $v29, aDPosI, aDPosI
|
|
mtc2 $11, aNrmSc[4] // Constant frac part in elem 2
|
|
// aLen2F <- aLTC
|
|
vreadacc aLen2F, ACC_MIDDLE
|
|
mtc2 $10, aNrmSc[6] // Linear frac part in elem 3
|
|
vreadacc aLen2I, ACC_UPPER
|
|
srl $11, $24, 5 // Top 3 bits
|
|
// vnop; vnop
|
|
vmudm $v29, vOne, aLen2F[2h] // Sum of squared components
|
|
andi $10, $24, 0x1F // Bottom 5 bits
|
|
vmadh $v29, vOne, aLen2I[2h]
|
|
ori $10, $10, 0x20 // Append leading 1 to mantissa
|
|
vmadm $v29, vOne, aLen2F[1h]
|
|
sllv $10, $10, $11 // Left shift to create floating point
|
|
vmadh $v29, vOne, aLen2I[1h]
|
|
sll $10, $10, 8 // Min range 00002000, 00002100... 00003F00, max 00100000...001F8000
|
|
vmadn aLen2F, aLen2F, vOne // elem 0; swapped so we can do vmadn and get result
|
|
bnez $24, @@skip // If original value is zero, set to zero
|
|
vmadh aLen2I, aLen2I, vOne
|
|
li $10, 0
|
|
@@skip:
|
|
// vnop; vnop
|
|
// aRcpLn <- $v29
|
|
vrsqh aRcpLn[2], aLen2I[0] // High input, garbage output
|
|
vrsql aRcpLn[1], aLen2F[0] // Low input, low output
|
|
mtc2 $10, aNrmSc[12] // Quadratic frac part in elem 6
|
|
vrsqh aRcpLn[0], aLen2I[4] // High input, high output
|
|
srl $10, $10, 16
|
|
vrsql aRcpLn[5], aLen2F[4] // Low input, low output
|
|
beq laPtr, inVtx, ltadv_continue_setup // Return aRcpLn; cond works only iter 0
|
|
vrsqh aRcpLn[4], $v31[2] // 0 input, high output
|
|
// vnop; vnop; vnop
|
|
vmudn aDIR, vpWrlF, aRcpLn[0h] // Vec frac * int scaling, discard result
|
|
mtc2 $10, aNrmSc[14] // Quadratic int part in elem 7
|
|
vmadm aDIR, aDPosI, aRcpLn[1h] // Vec int * frac scaling, discard result
|
|
vmadh aDIR, aDPosI, aRcpLn[0h] // Vec int * int scaling
|
|
// aLenF <- aDPosI
|
|
vmudm aLenF, aLen2I, aRcpLn[1h] // len^2 int * 1/len frac; ignoring frac*frac
|
|
vmadn aLenF, aLen2F, aRcpLn[0h] // len^2 frac * 1/len int = len frac
|
|
// aLenI <- aRcpLn
|
|
vmadh aLenI, aLen2I, aRcpLn[0h] // len^2 int * 1/len int = len int
|
|
vmulf aDOT, vpNrmlX, aDIR[0h] // Normalized light dir * normalized normals
|
|
vmacf aDOT, vpNrmlY, aDIR[1h]
|
|
bnez laSpecFres, ltadv_after_camera // Return if initial spec/fres; returns aDOT, aDIR
|
|
vmacf aDOT, vpNrmlZ, aDIR[2h]
|
|
// $v29 <- aLenI
|
|
vmudm $v29, aLenI, aNrmSc[3] // len int * linear factor frac
|
|
vmadl $v29, aLenF, aNrmSc[3] // + len frac * linear factor frac
|
|
vmadm $v29, vOne, aNrmSc[2] // + 1 * constant factor frac
|
|
vmadl $v29, aLen2F, aNrmSc[6] // + len^2 frac * quadratic factor frac
|
|
vmadm $v29, aLen2I, aNrmSc[6] // + len^2 int * quadratic factor frac
|
|
// aPLFcF <- aLen2F
|
|
vmadn aPLFcF, aLen2F, aNrmSc[7] // + len^2 frac * quadratic factor int = aPLFcF frac
|
|
bltzal laSpecular, ltadv_specular
|
|
// aPLFcI <- aLen2I
|
|
vmadh aPLFcI, aLen2I, aNrmSc[7] // + len^2 int * quadratic factor int = aLen2I int
|
|
// aAOF <- aLenF
|
|
vmudh aAOF, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
|
|
vmadm aAOF, aOAFrs, aParam[2] // + (alpha - 1) * aoPoint factor; elems 3, 7
|
|
// aDotSc <- aDIR
|
|
vrcph aDotSc[1], aPLFcI[0] // 1/(2*light factor), input of 0000.8000 -> no change normals
|
|
vrcpl aDotSc[2], aPLFcF[0] // Light factor 0001.0000 -> normals /= 2
|
|
vrcph aDotSc[3], aPLFcI[4] // Light factor 0000.1000 -> normals *= 8 (with clamping)
|
|
// aLen2I <- aPLFcI
|
|
vrcpl aDotSc[6], aPLFcF[4] // Light factor 0010.0000 -> normals /= 32
|
|
vrcph aDotSc[7], $v31[2] // 0
|
|
// aLTC <- aPLFcF
|
|
luv aLTC, (ltBufOfs + 0 - lightSize)(curLight) // aLTC = light color
|
|
// This is a scale on the dot product, not the light, because the scale can
|
|
// increase a small dot product (close to perpendicular), while it can't
|
|
// increase a light beyond white.
|
|
vmudm $v29, aDOT, aDotSc[2h] // Dot product int * scale frac
|
|
j ltadv_finish_light // Returns aLTC, aAOF, aDOT
|
|
vmadh aDOT, aDOT, aDotSc[3h] // Dot product int * scale int, clamp to 0x7FFF
|
|
// aDIR <- aDotSc
|
|
|
|
|
|
|
|
ovl4_end:
|
|
.align 8
|
|
ovl4_padded_end:
|
|
|
|
.close // CODE_FILE
|