mirror of
https://github.com/HackerN64/F3DEX3.git
synced 2026-01-21 10:37:45 -08:00
2693 lines
131 KiB
ArmAsm
2693 lines
131 KiB
ArmAsm
.rsp
|
|
|
|
.include "rsp/rsp_defs.inc"
|
|
.include "rsp/gbi.inc"
|
|
|
|
// This file assumes DATA_FILE and CODE_FILE are set on the command line
|
|
|
|
.if version() < 110
|
|
.error "armips 0.11 or newer is required"
|
|
.endif
|
|
|
|
// li reg, imm: load an immediate into scalar register reg.
// Always expands to exactly one instruction (addi from $zero), so imm has to be
// encodable as an addi immediate.
.macro li, reg, imm
|
|
addi reg, $zero, imm
|
|
.endmacro
|
|
|
|
// move dst, src: copy scalar register src into dst.
// Implemented as a single `ori` with a zero immediate.
.macro move, dst, src
|
|
ori dst, src, 0
|
|
.endmacro
|
|
|
|
// Prohibit macros involving slt; this silently clobbers $1. You can of course
|
|
// manually write the slt and branch instructions if you want this behavior.
|
|
// blt is deliberately poisoned: any use aborts assembly with the error below.
.macro blt, ra, rb, lbl
|
|
.error "blt is a macro using slt, and silently clobbers $1!"
|
|
.endmacro
|
|
|
|
// bgt is deliberately poisoned: any use aborts assembly with the error below.
.macro bgt, ra, rb, lbl
|
|
.error "bgt is a macro using slt, and silently clobbers $1!"
|
|
.endmacro
|
|
|
|
// ble is deliberately poisoned: any use aborts assembly with the error below.
.macro ble, ra, rb, lbl
|
|
.error "ble is a macro using slt, and silently clobbers $1!"
|
|
.endmacro
|
|
|
|
// bge is deliberately poisoned: any use aborts assembly with the error below.
.macro bge, ra, rb, lbl
|
|
.error "bge is a macro using slt, and silently clobbers $1!"
|
|
.endmacro
|
|
|
|
// This version doesn't depend on $v0 to be vZero, which it often is not in
|
|
// F3DEX3, and also doesn't get corrupted if $vco is set, nor does it consume $vco, which
|
|
// may be needed for a subsequent instruction.
|
|
// vcopy dst, src: copy vector register src into dst.
// Implemented as a bitwise OR of src with itself, so no other register is read.
.macro vcopy, dst, src
|
|
vor dst, src, src
|
|
.endmacro
|
|
|
|
// vclr dst: set every element of vector register dst to zero by XORing it with itself.
.macro vclr, dst
|
|
vxor dst, dst, dst
|
|
.endmacro
|
|
|
|
// Selectors for the N argument of vreadacc; each is used as the element index
// of the vsar instruction emitted by the macro.
ACC_UPPER equ 0
|
|
ACC_MIDDLE equ 1
|
|
ACC_LOWER equ 2
|
|
// vreadacc dst, N: emit `vsar` with element index N, writing the result to dst.
// N is expected to be one of ACC_UPPER / ACC_MIDDLE / ACC_LOWER.
// NOTE(review): judging by the names, this reads the upper / middle / lower
// portion of the vector accumulator -- confirm against the RSP vsar definition.
.macro vreadacc, dst, N
|
|
vsar dst, dst, dst[N]
|
|
.endmacro
|
|
|
|
/*
|
|
There are two different memory spaces for the overlays: (a) IMEM and (b) the
|
|
microcode file (which, plus an offset, is also the location in DRAM).
|
|
|
|
A label marks both an IMEM address and a file address, but evaluating the
|
|
label in an integer context (e.g. in a branch) gives the IMEM address.
|
|
`orga(your_label)` gets the file address of the label, and `.orga` sets the
|
|
file address.
|
|
`.headersize`, as well as the value after `.create`, sets the difference
|
|
between IMEM addresses and file addresses, so you can set the IMEM address
|
|
with `.headersize desired_imem_addr - orga()`.
|
|
|
|
In IMEM, the whole microcode is organized as (each row is the same address):
|
|
|
|
0x80 space | |
|
|
for boot code Overlay 0 Overlay 1
|
|
(End (More cmd
|
|
start task) handlers)
|
|
(initialization) | |
|
|
|
|
Many command
|
|
handlers
|
|
|
|
Overlay 2 Overlay 3 Overlay 4
|
|
(Lighting) (Clipping) (mIT, rare cmds)
|
|
|
|
Vertex and
|
|
tri handlers
|
|
|
|
DMA code
|
|
|
|
In the file, the microcode is organized as:
|
|
start (file addr 0x0 = IMEM 0x1080)
|
|
Many command handlers
|
|
Overlay 3
|
|
Vertex and tri handlers
|
|
DMA code (end of this = IMEM 0x2000 = file 0xF80)
|
|
Overlay 0
|
|
Overlay 1
|
|
Overlay 2
|
|
Overlay 4
|
|
*/
|
|
|
|
// jumpTableEntry addr: emit one 16-bit jump table entry holding the low 16 bits
// of addr. The tables built with this macro are read back with `lhu` and jumped
// to with `jr` (see do_cmd_jump_table).
.macro jumpTableEntry, addr
|
|
.dh addr & 0xFFFF
|
|
.endmacro
|
|
|
|
// RSP DMEM
|
|
.create DATA_FILE, 0x0000
|
|
|
|
/*
|
|
Matrices are stored and used in a transposed format compared to how they are
|
|
normally written in mathematics. For the integer part:
|
|
00 02 04 06 typical Xscl Rot Rot 0
|
|
08 0A 0C 0E use: Rot Yscl Rot 0
|
|
10 12 14 16 Rot Rot Zscl 0
|
|
18 1A 1C 1E Xpos Ypos Zpos 1
|
|
The fractional part comes next and is in the same format.
|
|
Applying this transformation is done by multiplying a row vector times the
|
|
matrix, like:
|
|
X Y Z 1 * Xscl Rot Rot 0 = NewX NewY NewZ 1
|
|
Rot Yscl Rot 0
|
|
Rot Rot Zscl 0
|
|
Xpos Ypos Zpos 1
|
|
In C, the matrix is accessed as matrix[row][col], and the vector is vector[row].
|
|
*/
|
|
// 0x0000-0x0040: model matrix
|
|
mMatrix:
|
|
.fill 64
|
|
|
|
// 0x0040-0x0080: view * projection matrix
|
|
vpMatrix:
|
|
.fill 64
|
|
|
|
// model inverse transpose matrix; first three rows only
|
|
mITMatrix:
|
|
.fill 0x30
|
|
|
|
fogFactor:
|
|
.dw 0x00000000
|
|
|
|
textureSettings1:
|
|
.dw 0x00000000 // first word, has command byte, bowtie val, level, tile, and on
|
|
|
|
textureSettings2:
|
|
.dw 0x00000000 // second word, has s and t scale
|
|
|
|
geometryModeLabel:
|
|
.dw 0x00000000 // originally initialized to G_CLIPPING, but that does nothing
|
|
|
|
.if . != 0x00C0
|
|
.error "Scissor and othermode must be at 0x00C0 for S2DEX"
|
|
.endif
|
|
|
|
// scissor (four 12-bit values)
|
|
scissorUpLeft: // the command byte is included since the command word is copied verbatim
|
|
.dw (G_SETSCISSOR << 24) | (( 0 * 4) << 12) | (( 0 * 4) << 0)
|
|
scissorBottomRight:
|
|
.dw ((320 * 4) << 12) | ((240 * 4) << 0)
|
|
|
|
// othermode
|
|
otherMode0: // command byte included, same as above
|
|
.dw (G_RDPSETOTHERMODE << 24) | (0x080CFF)
|
|
otherMode1:
|
|
.dw 0x00000000
|
|
|
|
// Saved texrect state for combining the multiple input commands into one RDP texrect command
|
|
texrectWord1:
|
|
.fill 4 // first word, has command byte, xh and yh
|
|
texrectWord2:
|
|
.fill 4 // second word, has tile, xl, yl
|
|
|
|
// First half of RDP value for split commands (shared by perspNorm moveword to be able to write a 32-bit value)
|
|
rdpHalf1Val:
|
|
.fill 4
|
|
|
|
// perspective norm
|
|
perspNorm:
|
|
.dh 0xFFFF
|
|
|
|
// displaylist stack length
|
|
displayListStackLength:
|
|
.db 0x00 // starts at 0, increments by 4 for each "return address" pushed onto the stack
|
|
|
|
// Is M inverse transpose valid or does it need to be recomputed. Zeroed when modifying M.
|
|
mITValid:
|
|
.db 0
|
|
|
|
// viewport
|
|
viewport:
|
|
.fill 16
|
|
|
|
// Current RDP fifo output position
|
|
rdpFifoPos:
|
|
.fill 4
|
|
|
|
matrixStackPtr:
|
|
.dw 0x00000000
|
|
|
|
// segment table
|
|
segmentTable:
|
|
.fill (4 * 16) // 16 DRAM pointers
|
|
|
|
// displaylist stack
|
|
displayListStack:
|
|
|
|
// ucode text (shared with DL stack)
|
|
.ascii ID_STR, 0x0A
|
|
|
|
.align 16
|
|
.if . - displayListStack != 0x48
|
|
.warning "ID_STR incorrect length, affects displayListStack"
|
|
.endif
|
|
|
|
endSharedDMEM:
|
|
.if . != 0x180
|
|
.error "endSharedDMEM at incorrect address, matters for G_LOAD_UCODE / S2DEX"
|
|
.endif
|
|
|
|
// constants for register $v31
|
|
.if (. & 15) != 0
|
|
.error "Wrong alignment for v31value"
|
|
.endif
|
|
v31Value:
|
|
// v31 must go from lowest to highest (signed) values for vcc patterns.
|
|
.dh -4 // used in clipping, vtx write for Newton-Raphson reciprocal
|
|
.dh -1 // used often
|
|
.dh 1 // used to load accumulator when vOne or vLtOne not available
|
|
.dh 2 // used as clip ratio (vtx write, clipping) and in clipping
|
|
.dh 4 // used to initialize 4s in vSTScl in vtx setup
|
|
.dh 0x4000 // used in tri write, texgen
|
|
.dh 0x7F00 // used in fog, normals unpacking
|
|
.dh 0x7FFF // used often
|
|
|
|
// constants for register $v30; only used in tri write and vtx_indices_to_addr
|
|
.if (. & 15) != 0
|
|
.error "Wrong alignment for v30value"
|
|
.endif
|
|
// Only one VCC pattern used:
|
|
// vge xxx, $v30, $v30[7] = 11110001 in tri write
|
|
v30Value:
|
|
.dh vertexBuffer // this and next used in vtx_indices_to_addr
|
|
.dh vtxSize << 7 // 0x1300; it's not 0x2600 because vertex indices are *2
|
|
.dh 0x1000 // used once in tri write, some multiplier
|
|
.dh 0x0100 // used several times in tri write
|
|
.dh -16 // used in tri write for Newton-Raphson reciprocal
|
|
.dh 0xFFF8 // used once in tri write, mask away lower ST bits
|
|
.dh 0x0010 // used once in tri write for Newton-Raphson reciprocal
|
|
.dh 0x0020 // used in tri write, both signed and unsigned multipliers
|
|
|
|
/*
|
|
Quick note on Newton-Raphson:
|
|
https://en.wikipedia.org/wiki/Division_algorithm#Newton%E2%80%93Raphson_division
|
|
Given input D, we want to find the reciprocal R. The base formula for refining
|
|
the estimate of R is R_new = R*(2 - D*R). However, since the RSP reciprocal
|
|
instruction moves the radix point 1 to the left, the result has to be multiplied
|
|
by 2. So it's 2*R*(2 - D*2*R) = R*(4 - 4*D*R) = R*(1*4 + D*R*-4). This is where
|
|
the 4 and -4 come from. For tri write, the result needs to be multiplied by 4
|
|
for subpixels, so it's 16 and -16.
|
|
*/
|
|
|
|
cameraWorldPos:
|
|
.skip 6
|
|
tempHalfword2:
|
|
.skip 2 // Overwritten as part of camera world position, but can be used as temp
|
|
lightBufferLookat:
|
|
.skip 8 // s8 X0, Y0, Z0, dummy, X1, Y1, Z1, dummy
|
|
lightBufferMain:
|
|
.skip (G_MAX_LIGHTS * lightSize)
|
|
lightBufferAmbient:
|
|
.skip 8 // just colors for ambient light
|
|
ltBufOfs equ (lightBufferMain - altBase)
|
|
|
|
occlusionPlaneEdgeCoeffs:
|
|
/*
|
|
Vertex is in occlusion region if all five equations below are true:
|
|
4 * screenX[s13.2] * c0[s0.15] - 0.5 * screenY[s13.2] < c4[s14.1]
|
|
4 * screenY[s13.2] * c1[s0.15] - 0.5 * screenX[s13.2] < c5[s14.1]
|
|
4 * screenX[s13.2] * c2[s0.15] + 0.5 * screenY[s13.2] < c6[s14.1]
|
|
4 * screenY[s13.2] * c3[s0.15] + 0.5 * screenX[s13.2] < c7[s14.1]
|
|
clamp_to_0.s15(clipX[s15.16] * kx[0.s15])
|
|
+ clamp_to_0.s15(clipY[s15.16] * ky[0.s15])
|
|
+ clamp_to_0.s15(clipZ[s15.16] * kz[0.s15])
|
|
+ kc[0.s15]
|
|
>= 0
|
|
The first four can be rewritten as (again, vertex is occluded if all are true):
|
|
screenY > screenX * 8*c0 + -2*c4
|
|
screenX > screenY * 8*c1 + -2*c5
|
|
screenY < screenX * -8*c2 + 2*c6
|
|
screenX < screenY * -8*c3 + 2*c7
|
|
where screenX and screenY are in subpixels (e.g. screenX = 100 = 25.0 pixels),
|
|
c0-c3 are shorts representing -1:0.99997,
|
|
and c4-c7 are shorts representing "half pixels" (e.g. c4 = 50 = 25.0 pixels)
|
|
|
|
For the last equation, one option is to think of kx through kc as in s10.5 mode
|
|
instead, so a value of 0x0020 is 1.0 and they can range from -0x400.00 to
|
|
0x3FF.F8. This choice is because clipZ ranges from 0x0000.0000 at the camera
|
|
plane to 0x03FF.0000 at the maximum distance away. The normal distance Adult
|
|
Link is from the camera is about 0x00B0.0000.
|
|
|
|
A better option is to develop your plane equation in floating point, e.g.
|
|
clipX[f] * -0.2f + clipY[f] * 0.4f + clipZ[f] * 1.0f + -200.0f >= 0
|
|
then multiply everything by (32768.0f / max(abs(kx), abs(ky), abs(kz), abs(kc)))
|
|
(here 32768.0f / 200.0f = 163.84f)
|
|
clipX[f] * -32.77f + clipY[f] * 65.54f + clipZ[f] * 163.84f + -32768
|
|
*/
|
|
.dh 0x0000 // c0
|
|
.dh 0x0000 // c1
|
|
.dh 0x0000 // c2
|
|
.dh 0x0000 // c3
|
|
.dh 0x8000 // c4
|
|
.dh 0x8000 // c5
|
|
.dh 0x8000 // c6
|
|
.dh 0x8000 // c7
|
|
occlusionPlaneMidCoeffs:
|
|
.dh 0x0000 // kx
|
|
.dh 0x0000 // ky
|
|
.dh 0x0000 // kz
|
|
.dh 0x8000 // kc
|
|
|
|
// Alternate base address because vector load offsets can't reach all of DMEM.
|
|
// altBaseReg permanently points here.
|
|
altBase:
|
|
|
|
fxParams:
|
|
|
|
aoAmbientFactor:
|
|
.dh 0xFFFF
|
|
aoDirectionalFactor:
|
|
.dh 0xA000
|
|
|
|
/*
|
|
fresnelOffset = Dot product value, in 0000 - 7FFF, which gives shade alpha = 0
|
|
Let k = dot product value, in 0000 - 7FFF, which gives shade alpha = FF.
|
|
Then fresnelScale = 0.7FFF / (k - fresnelOffset) as s7.8 fixed point.
|
|
Alternatively, shade alpha [0000 - 7FFF] =
|
|
fresnelScale [-80.00 - 7F.FF] * (dot product [0000 - 7FFF] - fresnelOffset)
|
|
Examples:
|
|
1. Grazing -> 00; normal -> FF
|
|
Then set fresnelOffset = 0000, fresnelScale = 01.00
|
|
2. Grazing -> FF; normal -> 00
|
|
Then set fresnelOffset = 7FFF, fresnelScale = FF.00 (-01.00)
|
|
3. 30 degrees (0.5f or 4000) -> FF; 60 degrees (0.86f or 6ED9) -> 00
|
|
Then set fresnelOffset = 6ED9, fresnelScale = FD.45 (-02.BB = 1 / (0.5f - 0.86f))
|
|
*/
|
|
fresnelOffset:
|
|
.dh 0x0000 // See above
|
|
fresnelScale:
|
|
.dh 0x0000 // See above
|
|
|
|
.if (. & 7) != 0
|
|
.error "Wrong alignment before attrOffsetST"
|
|
.endif
|
|
attrOffsetST:
|
|
.dh 0x0100
|
|
.dh 0xFF00
|
|
|
|
attrOffsetZ:
|
|
.dh 0xFFFE
|
|
tempHalfword1:
|
|
.dh 0x0000 // Overwritten by movewords to above and below, can be used as temp
|
|
|
|
materialCullMode: // Overwritten to 0 by SPNormalsMode, but that should not
|
|
.db 0 // happen in the middle of tex setup
|
|
normalsMode:
|
|
.db 0 // Overwrites tempHalfword1 and materialCullMode
|
|
|
|
alphaCompareCullMode:
|
|
.db 0x00 // 0 = disabled, 1 = cull if all < thresh, -1 = cull if all >= thresh
|
|
alphaCompareCullThresh:
|
|
.db 0x00 // Alpha threshold, 00 - FF
|
|
tempHalfword3:
|
|
.dh 0x0000 // Overwritten by movewords to above and below, can be used as temp
|
|
|
|
.db 0
|
|
numLightsxSize:
|
|
.db 0 // Overwrites above
|
|
|
|
lastMatDLPhyAddr:
|
|
.dw 0
|
|
|
|
texgenLinearCoeffs:
|
|
.dh 0x44D3
|
|
.dh 0x6CB3
|
|
|
|
// Constants for clipping algorithm
|
|
clipCondShifts:
|
|
.db CLIP_SCAL_NY_SHIFT
|
|
.db CLIP_SCAL_PY_SHIFT
|
|
.db CLIP_SCAL_NX_SHIFT
|
|
.db CLIP_SCAL_PX_SHIFT
|
|
|
|
// "Forward declaration" of temporary matrix in clipTempVerts scratch space, aligned to 16 bytes
|
|
tempMemRounded equ ((clipTempVerts + 15) & ~15)
|
|
|
|
// Movemem table
|
|
movememTable:
|
|
.dh tempMemRounded // G_MTX multiply temp matrix (model)
|
|
.dh mMatrix // G_MV_MMTX
|
|
.dh tempMemRounded // G_MTX multiply temp matrix (projection)
|
|
.dh vpMatrix // G_MV_PMTX
|
|
.dh viewport // G_MV_VIEWPORT
|
|
.dh cameraWorldPos // G_MV_LIGHT
|
|
|
|
// moveword table
|
|
movewordTable:
|
|
.dh fxParams // G_MW_FX
|
|
.dh numLightsxSize - 3 // G_MW_NUMLIGHT
|
|
.dh perspNorm - 2 // G_MW_PERSPNORM
|
|
.dh segmentTable // G_MW_SEGMENT
|
|
.dh fogFactor // G_MW_FOG
|
|
.dh lightBufferMain // G_MW_LIGHTCOL
|
|
|
|
// G_POPMTX, G_MTX, G_MOVEMEM Command Jump Table
|
|
movememHandlerTable:
|
|
jumpTableEntry G_POPMTX_end // G_POPMTX
|
|
jumpTableEntry ovl234_ovl4_entrypoint // G_MTX (multiply)
|
|
jumpTableEntry G_MOVEMEM_end // G_MOVEMEM, G_MTX (load)
|
|
|
|
// RDP/Immediate Command Jump Table
|
|
jumpTableEntry ovl234_ovl4_entrypoint // G_DMA_IO
|
|
jumpTableEntry G_TEXTURE_handler
|
|
jumpTableEntry G_POPMTX_handler
|
|
jumpTableEntry G_GEOMETRYMODE_handler
|
|
jumpTableEntry G_MTX_handler
|
|
jumpTableEntry G_MOVEWORD_handler
|
|
jumpTableEntry G_MOVEMEM_handler
|
|
jumpTableEntry G_LOAD_UCODE_handler
|
|
jumpTableEntry G_DL_handler
|
|
jumpTableEntry G_ENDDL_handler
|
|
jumpTableEntry G_SPNOOP_handler
|
|
jumpTableEntry G_RDPHALF_1_handler
|
|
jumpTableEntry G_SETOTHERMODE_L_handler
|
|
jumpTableEntry G_SETOTHERMODE_H_handler
|
|
jumpTableEntry G_TEXRECT_handler
|
|
jumpTableEntry G_TEXRECTFLIP_handler
|
|
cmdJumpTable:
|
|
jumpTableEntry G_VTX_handler
|
|
jumpTableEntry ovl234_ovl4_entrypoint // G_MODIFYVTX
|
|
jumpTableEntry G_CULLDL_handler
|
|
jumpTableEntry ovl234_ovl4_entrypoint // G_BRANCH_WZ
|
|
jumpTableEntry G_TRI1_handler
|
|
jumpTableEntry G_TRI2_handler
|
|
jumpTableEntry G_QUAD_handler
|
|
jumpTableEntry G_TRISTRIP_handler
|
|
jumpTableEntry G_TRIFAN_handler
|
|
jumpTableEntry G_LIGHTTORDP_handler
|
|
|
|
// The maximum number of generated vertices in a clip polygon. In reality, this
|
|
// is equal to MAX_CLIP_POLY_VERTS, but for testing we can change them separately.
|
|
// In case you're wondering if it's possible to have a 7-vertex polygon where all
|
|
// 7 verts are generated, it looks like this (X = generated vertex):
|
|
// ___----=>
|
|
// +---------------__X----X _-^
|
|
// | __--^^ X^
|
|
// | __--^^ _-^|
|
|
// _X^^^ _-^ |
|
|
// C | _-^ |
|
|
// ^X _-^ |
|
|
// |\ _-^ |
|
|
// +-X--_X^---------------+
|
|
// V^
|
|
MAX_CLIP_GEN_VERTS equ 7
|
|
// Normally, each clip plane can cut off a "tip" of a polygon, turning one vert
|
|
// into two. (It can also cut off more of the polygon and remove additional verts,
|
|
// but the maximum is one more vert per clip plane.) So with 5 clip planes, we
|
|
// could have a maximum of 8 verts in the final polygon. However, the verts
|
|
// generated by the no-nearclipping plane will always be at infinity, so they
|
|
// will always get replaced by generated verts from one of the other clip planes.
|
|
// Put another way, if there are 8 verts in the final polygon, there are 8 edges,
|
|
// which are portions of the 3 original edges plus portions of 5 edges along the
|
|
// 5 clip planes. But the edge portion along the no-nearclipping plane is at
|
|
// infinity, so that edge can't be on screen.
|
|
MAX_CLIP_POLY_VERTS equ 7
|
|
clipPoly:
|
|
.skip (MAX_CLIP_POLY_VERTS+1) * 2 // 3 5 7 + term 0
|
|
clipPoly2: // \ / \ / \
|
|
.skip (MAX_CLIP_POLY_VERTS+1) * 2 // 4 6 7 + term 0
|
|
|
|
// Vertex buffer in RSP internal format
|
|
vertexBuffer:
|
|
.skip (G_MAX_VERTS * vtxSize)
|
|
|
|
YIELD_DATA_FOOTER_SIZE equ 0x10
|
|
yieldDataFooter equ OS_YIELD_DATA_SIZE - YIELD_DATA_FOOTER_SIZE
|
|
|
|
.if . > yieldDataFooter
|
|
// OS_YIELD_DATA_SIZE (0xC00) bytes of DMEM are saved; the last data in that is
|
|
// the footer, organized as:
|
|
// +0: perfCounter1: Upper 16 bits: num verts; lower 16 bits: num tris sent to RDP
|
|
// +4: perfCounter2: Upper 18 bits: num tris requested; lower 14 bits: num tex/fill rects
|
|
// +8: taskDataPtr
|
|
// +C: ucode
|
|
// So, any data starting from the address of this footer will be clobbered,
|
|
// so the vertex buffer and other data which needs to be saved across yield
|
|
// can't extend here. (The input buffer will be reloaded from the next
|
|
// command in the source DL.)
|
|
.error "Important things in DMEM will not be saved at yield!"
|
|
.endif
|
|
|
|
// Space for temporary verts for clipping code
|
|
// tempMemRounded defined above = this rounded up to 16 bytes, for temp mtx etc.
|
|
clipTempVerts:
|
|
.skip MAX_CLIP_GEN_VERTS * vtxSize
|
|
clipTempVertsEnd:
|
|
|
|
.if (. - tempMemRounded) < 0x40
|
|
.error "Not enough space for temp matrix!"
|
|
.endif
|
|
|
|
// Sizes of the fixed buffers at the top of DMEM. The layout below END_VARIABLE_LEN_DMEM is:
//   rdpCmdBuffer1 (+excess), rdpCmdBuffer2 (+excess), inputBuffer, then OSTask at 0xFC0.
// Nominal size of one RDP command buffer; rdpCmdBufEnd points this far into the buffer.
RDP_CMD_BUFSIZE equ 0xB0
|
|
RDP_CMD_BUFSIZE_EXCESS equ 0xB0 // Maximum size of an RDP triangle command
|
|
// Nominal size plus excess = 0x160 bytes per RDP command buffer.
RDP_CMD_BUFSIZE_TOTAL equ (RDP_CMD_BUFSIZE + RDP_CMD_BUFSIZE_EXCESS)
|
|
// Number of 8-byte display list commands held in the input buffer.
INPUT_BUFFER_CMDS equ 21
|
|
// 21 * 8 = 0xA8 bytes.
INPUT_BUFFER_LEN equ (INPUT_BUFFER_CMDS * 8)
|
|
// 0xFC0 - 0xA8 - 2 * 0x160 = 0xC58: first DMEM address used by the fixed buffers.
END_VARIABLE_LEN_DMEM equ (0xFC0 - INPUT_BUFFER_LEN - (2 * RDP_CMD_BUFSIZE_TOTAL))
|
|
|
|
endVariableDmemUse:
|
|
|
|
.if . > END_VARIABLE_LEN_DMEM
|
|
.error "Out of DMEM space"
|
|
.endif
|
|
|
|
.org END_VARIABLE_LEN_DMEM
|
|
|
|
// First RDP Command Buffer
|
|
rdpCmdBuffer1:
|
|
.skip RDP_CMD_BUFSIZE
|
|
rdpCmdBuffer1End:
|
|
.skip RDP_CMD_BUFSIZE_EXCESS
|
|
// Second RDP Command Buffer
|
|
rdpCmdBuffer2:
|
|
.skip RDP_CMD_BUFSIZE
|
|
rdpCmdBuffer2End:
|
|
.skip RDP_CMD_BUFSIZE_EXCESS
|
|
|
|
// Input buffer. After RDP cmd buffers so it can be vector addressed from end.
|
|
inputBuffer:
|
|
.skip INPUT_BUFFER_LEN
|
|
inputBufferEnd:
|
|
|
|
.if . != 0xFC0
|
|
.error "DMEM organization incorrect"
|
|
.endif
|
|
|
|
.org 0xFC0
|
|
|
|
// 0x0FC0-0x1000: OSTask
|
|
OSTask:
|
|
.skip 0x40
|
|
|
|
.close // DATA_FILE
|
|
|
|
// RSP IMEM
|
|
.create CODE_FILE, 0x00001080
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
/////////////////////////////// Register Use Map ///////////////////////////////
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
|
|
// Vertex / lighting all regs:
|
|
vM0I equ $v0 // mMatrix rows int/frac
|
|
vM1I equ $v1 // Valid in vertex, lighting, and M inverse transpose
|
|
vM2I equ $v2
|
|
vM3I equ $v3
|
|
vM0F equ $v4
|
|
vM1F equ $v5
|
|
vM2F equ $v6
|
|
vM3F equ $v7
|
|
vVP0I equ $v8 // vpMatrix rows int/frac
|
|
vVP1I equ $v9 // Valid in vertex and lighting only
|
|
vVP2I equ $v10 // $v10 also used as temp in lighting, then reloaded with vVP2I
|
|
vVP3I equ $v11
|
|
vVP0F equ $v12
|
|
vVP1F equ $v13
|
|
vVP2F equ $v14
|
|
vVP3F equ $v15
|
|
vVpScl equ $v16 // Vertex constants (viewport scale, offset, ST scale, offset)
|
|
vVpOfs equ $v17 // Also contain other constants, see comment in vtx_setup_constants
|
|
vSTScl equ $v18 // Valid in vertex, lighting, and first half of clipping
|
|
vSTOfs equ $v19
|
|
// Remaining regs sometimes valid in vertex and lighting, also used as temps
|
|
vPairPosI equ $v20 // Vertex pair model / world space position int/frac
|
|
vPairPosF equ $v21
|
|
vPairST equ $v22 // Vertex pair ST texture coordinates
|
|
vPairTPosF equ $v23 // Vertex pair transformed (clip / screen) space position frac/int
|
|
vPairTPosI equ $v24
|
|
// $v25: temp
|
|
// $v26: temp
|
|
vPairRGBA equ $v27 // Vertex pair color
|
|
vPairNrml equ $v28 // Vertex pair normals (model then world space)
|
|
// $v29: permanent temp register, also write results here to discard
|
|
vPairLt equ $v30 // Vertex pair total light color/intensity (RGB-RGB-)
|
|
// $v31: Only global constant vector register
|
|
|
|
// Some extra defines for lighting:
|
|
vPackPXY equ $v23 // Positive X and Y in packed normals
|
|
vPackZ equ $v24 // Z in packed normals
|
|
vLtOne equ $v24 // 1 in each vector lane, for adding into accumulator for fast dot products
|
|
vLtRGBOut equ $v25 // Light / effects RGB output
|
|
vLtAOut equ $v26 // Light / effects alpha output
|
|
vLtColor equ $v26 // Light color
|
|
vLookat1 equ $v28 // Lookat direction 1
|
|
vLookat0 equ $v30 // Lookat direction 0
|
|
// M inverse transpose matrix in regs briefly:
|
|
vLtMIT0I equ $v26
|
|
vLtMIT1I equ $v25
|
|
vLtMIT2I equ $v23
|
|
vLtMIT0F equ $v29
|
|
vLtMIT1F equ $v30
|
|
vLtMIT2F equ $v10
|
|
|
|
// Other vector regs defines:
|
|
vZero equ $v0 // all elements = 0 (has other value in vtx / lighting)
|
|
vOne equ $v1 // all elements = 1 (has other value in vtx / lighting)
|
|
|
|
// Global and semi-global (i.e. one main function + occasional local) scalar regs:
|
|
// $zero // Hardwired zero scalar register
|
|
altBaseReg equ $13 // Alternate base address register for vector loads
|
|
inputVtxPos equ $14 // Pointer to loaded vertex to transform
|
|
outputVtxPos equ $15 // Pointer to vertex buffer to store transformed verts
|
|
clipFlags equ $16 // Current clipping flags being checked
|
|
clipPolyRead equ $17 // Read pointer within current polygon being clipped
|
|
clipPolySelect equ $18 // Clip poly double buffer selection, or < 0 for normal tri write
|
|
clipPolyWrite equ $21 // Write pointer within current polygon being clipped
|
|
rdpCmdBufEnd equ $22 // RDP command buffer end DRAM pointer
|
|
rdpCmdBufPtr equ $23 // RDP command buffer current DRAM pointer
|
|
cmd_w1_dram equ $24 // DL command word 1, which is also DMA DRAM addr
|
|
cmd_w0 equ $25 // DL command word 0, also holds next tris info
|
|
taskDataPtr equ $26 // Task data (display list) DRAM pointer
|
|
inputBufferPos equ $27 // DMEM position within display list input buffer, relative to end
|
|
perfCounter1 equ $28 // Upper 16 bits: num verts; lower 16 bits: num tris sent to RDP
|
|
perfCounter2 equ $29 // Upper 18 bits: num tris requested; lower 14 bits: num tex/fill rects
|
|
// $ra // Return address
|
|
|
|
// Misc scalar regs:
|
|
clipMaskIdx equ $5
|
|
secondVtxPos equ $8
|
|
curLight equ $9
|
|
|
|
// Arguments to dma_read_write
|
|
dmaLen equ $19 // also used by itself
|
|
dmemAddr equ $20
|
|
// cmd_w1_dram // used for all dma_read_write DRAM addresses
|
|
|
|
// Argument to load_overlay*
|
|
postOvlRA equ $12 // Commonly used locally
|
|
|
|
// ==== Summary of uses of all registers
|
|
// $zero: Hardwired zero scalar register
|
|
// $1: vertex 1 addr, zero when command handler is called, count of
|
|
// remaining vertices * 0x10, pointer to store texture coefficients, local
|
|
// $2: vertex 2 addr, vertex at end of edge during clipping, pointer to store
|
|
// shade coefficients, local
|
|
// $3: vertex 3 addr, vertex at start of edge during clipping, local
|
|
// $4: pre-shuffle vertex 1 addr for flat shading during tri write, otherwise unused
|
|
// $5: clipMaskIdx, geometry mode middle 2 bytes during vertex load / lighting,
|
|
// local
|
|
// $6: geometry mode low byte during tri write, local
|
|
// $7: command byte when command handler is called, fog flag in vtx write,
|
|
// mIT recompute flag in Overlay 4, local
|
|
// $8: secondVtxPos, local
|
|
// $9: curLight, clip mask during clipping, local
|
|
// $10: unused
|
|
// $11: very common local
|
|
// $12: postOvlRA, local
|
|
// $13: altBaseReg (global)
|
|
// $14: inputVtxPos, local
|
|
// $15: outputVtxPos, local
|
|
// $16: clipFlags (global)
|
|
// $17: clipPolyRead (global)
|
|
// $18: clipPolySelect (global)
|
|
// $19: dmaLen, onscreen vertex during clipping, local
|
|
// $20: dmemAddr, local
|
|
// $21: clipPolyWrite (global)
|
|
// $22: rdpCmdBufEnd (global)
|
|
// $23: rdpCmdBufPtr (global)
|
|
// $24: cmd_w1_dram, local
|
|
// $25: cmd_w0 (global); holds next tris info during tri write -> clipping ->
|
|
// vtx write
|
|
// $26: taskDataPtr (global)
|
|
// $27: inputBufferPos (global)
|
|
// $28: perfCounter1 (global)
|
|
// $29: perfCounter2 (global)
|
|
// $30: unused
|
|
// $ra: Return address for jal, b*al
|
|
// $v0: vZero (every element 0)
|
|
// $v1: vOne (every element 1)
|
|
// $v2: very common local
|
|
// $v3: local
|
|
// $v4: local
|
|
// $v5: local
|
|
// $v6: local
|
|
// $v7: local
|
|
// $v8: local
|
|
// $v9: local
|
|
// $v10: local
|
|
// $v11: local
|
|
// $v12: local
|
|
// $v13: local
|
|
// $v14: local
|
|
// $v15: local
|
|
// $v16: vVpScl, local
|
|
// $v17: vVpOfs, local
|
|
// $v18: vSTScl, local
|
|
// $v19: vSTOfs, local
|
|
// $v20: local
|
|
// $v21: local
|
|
// $v22: vPairST, local
|
|
// $v23: vPairTPosF, local
|
|
// $v24: vPairTPosI, local
|
|
// $v25: prev vertex data, local
|
|
// $v26: prev vertex data, local
|
|
// $v27: vPairRGBA, local
|
|
// $v28: local
|
|
// $v29: register to write to discard results, local
|
|
// $v30: constant values for tri write
|
|
// $v31: general constant values
|
|
|
|
// Initialization routines
|
|
// Everything up until ovl01_end will get overwritten by ovl0 and/or ovl1
|
|
// Microcode entry point.
// Sets up the global registers ($v31 constants, altBaseReg, RDP command buffer
// pointers), clears the task done / yielded signals, and then picks one of three
// start paths:
//   - rdpFifoPos == 0                 -> initialize_rdp (starting from scratch)
//   - OS_TASK_YIELDED flag clear      -> continue_from_os_task
//   - OS_TASK_YIELDED flag set        -> continue_from_yield (falls through below)
// Note: the instruction after every branch / jump is a delay slot and executes
// whether or not the branch is taken.
start: // This is at IMEM 0x1080, not the start of IMEM
|
|
vadd $v29, $v29, $v29 // Consume VCO (carry) value possibly set by the previous ucode
|
|
lqv $v31[0], (v31Value)($zero) // Load the global constants vector
|
|
li altBaseReg, altBase
|
|
li rdpCmdBufPtr, rdpCmdBuffer1 // Start writing RDP commands into the first buffer
|
|
li rdpCmdBufEnd, rdpCmdBuffer1End
|
|
lw $11, rdpFifoPos
|
|
lw $12, OSTask + OSTask_flags
|
|
li $1, SP_CLR_SIG2 | SP_CLR_SIG1 // Clear task done and yielded signals
|
|
beqz $11, initialize_rdp // If RDP FIFO not set up yet, starting ucode from scratch
|
|
mtc0 $1, SP_STATUS // Delay slot: signals are cleared on every start path
|
|
andi $12, $12, OS_TASK_YIELDED // Resumed from yield or came from called ucode?
|
|
beqz $12, continue_from_os_task // If latter, load DL (task data) pointer from OSTask
|
|
sw $zero, OSTask + OSTask_flags // Delay slot: clear all task flags, incl. yielded
|
|
// Resuming after a yield: restore the state saved in the yield data footer.
continue_from_yield:
|
|
lw perfCounter1, yieldDataFooter + 0x0 // Perf counters saved here at yield
|
|
lw perfCounter2, yieldDataFooter + 0x4
|
|
j finish_setup
|
|
lw taskDataPtr, yieldDataFooter + 0x8 // Delay slot: load DL pointer from yield data
|
|
|
|
// Set up the RDP command FIFO when rdpFifoPos is still zero.
// If the RDP already appears to be running from this task's output buffer
// (not in XBUS mode, DPC_END not below output_buff, DPC_CURRENT nonzero and
// below output_buff_size, and DPC_CURRENT != DPC_END), the current DPC_END is
// adopted as the FIFO position. Otherwise the code waits for START_VALID to
// clear, switches off XBUS mode, and points DPC_START / DPC_END at the
// output_buff_size value.
// NOTE(review): OSTask_output_buff_size is used here as an address (compared
// against DPC_CURRENT and written to DPC_START / DPC_END) -- confirm against the
// OSTask definition.
// Afterwards matrixStackPtr is initialized from OSTask if it is still zero, and
// execution falls through into continue_from_os_task.
initialize_rdp:
|
|
mfc0 $11, DPC_STATUS
|
|
andi $11, $11, DPC_STATUS_XBUS_DMA
|
|
bnez $11, wait_dpc_start_valid // RDP is in XBUS mode: must reconfigure
|
|
mfc0 $2, DPC_END // Delay slot: $2 = current DPC_END
|
|
lw $3, OSTask + OSTask_output_buff
|
|
sub $11, $3, $2 // $11 = output_buff - DPC_END
|
|
bgtz $11, wait_dpc_start_valid // DPC_END is below output_buff: must reconfigure
|
|
mfc0 $1, DPC_CURRENT // Delay slot: $1 = current DPC_CURRENT
|
|
lw $3, OSTask + OSTask_output_buff_size
|
|
beqz $1, wait_dpc_start_valid // DPC_CURRENT is zero: must reconfigure
|
|
sub $11, $1, $3 // Delay slot: $11 = DPC_CURRENT - output_buff_size
|
|
bgez $11, wait_dpc_start_valid // DPC_CURRENT is at or past output_buff_size: must reconfigure
|
|
nop
|
|
bne $1, $2, f3dzex_0000111C // CURRENT != END: keep the existing FIFO, $2 = DPC_END
|
|
// The first mfc0 below doubles as the delay slot of the bne above.
wait_dpc_start_valid:
|
|
mfc0 $11, DPC_STATUS
|
|
andi $11, $11, DPC_STATUS_START_VALID
|
|
bnez $11, wait_dpc_start_valid // Spin until START_VALID is clear
|
|
li $11, DPC_STATUS_CLR_XBUS // Delay slot (overwritten again if the loop repeats)
|
|
mtc0 $11, DPC_STATUS // Take the RDP out of XBUS mode
|
|
lw $2, OSTask + OSTask_output_buff_size
|
|
mtc0 $2, DPC_START
|
|
mtc0 $2, DPC_END
|
|
f3dzex_0000111C:
|
|
sw $2, rdpFifoPos // Record the FIFO position chosen above
|
|
lw $11, matrixStackPtr // Check whether the matrix stack pointer is already set
|
|
bnez $11, continue_from_os_task // Already initialized (nonzero): leave it alone
|
|
lw $11, OSTask + OSTask_dram_stack // Delay slot: $11 = stack address from OSTask
|
|
sw $11, matrixStackPtr // Only reached if it was zero: initialize from OSTask
|
|
// Start path when not resuming from a yield: take the perf counters from the
// textureSettings words and the display list pointer from OSTask.
continue_from_os_task:
|
|
lw perfCounter1, textureSettings1 // Counters stored here if jumped to different ucode
|
|
lw perfCounter2, textureSettings2 // If starting from scratch, these are zero
|
|
lw taskDataPtr, OSTask + OSTask_data_ptr
|
|
// Common tail of all start paths: mark the input buffer as empty, then load the
// overlays via load_overlays_0_1 with displaylist_dma as the address to continue at.
finish_setup:
|
|
li inputBufferPos, 0 // 0 = no commands buffered; forces a DL load
|
|
li cmd_w1_dram, orga(ovl1_start) // File address of overlay 1, argument to load_overlays_0_1
|
|
j load_overlays_0_1
|
|
li postOvlRA, displaylist_dma // Delay slot: where to go after the overlay load
|
|
|
|
start_end:
|
|
.align 8
|
|
start_padded_end:
|
|
|
|
.orga max(orga(), max(ovl0_padded_end - ovl0_start, ovl1_padded_end - ovl1_start) - 0x80)
|
|
ovl01_end:
|
|
|
|
// Refill the display list input buffer from DRAM at taskDataPtr.
// displaylist_dma_with_count: entry that first takes inputBufferPos from the low
//   byte of cmd_w0 (masked to a multiple of 8), i.e. how many bytes of commands
//   to leave out of the load.
// displaylist_dma: entry that uses inputBufferPos as already set (0 = full load).
// On exit inputBufferPos is the negative byte offset, relative to inputBufferEnd,
// of the first loaded command, and taskDataPtr has been advanced past the loaded
// data. Execution falls through to wait for the DMA and run the next command.
displaylist_dma_with_count:
|
|
andi inputBufferPos, cmd_w0, 0x00F8 // Byte 3, how many cmds to drop from load (max 0xA0)
|
|
displaylist_dma:
|
|
// Load INPUT_BUFFER_LEN - inputBufferPos bytes of cmds (inputBufferPos >= 0, mult of 8)
|
|
addi inputBufferPos, inputBufferPos, -INPUT_BUFFER_LEN // inputBufferPos = -(bytes of cmds to load)
|
|
nor dmaLen, inputBufferPos, $zero // DMA length = -inputBufferPos - 1 = ones' complement
|
|
move cmd_w1_dram, taskDataPtr // set up the DRAM address to read from
|
|
jal dma_read_write // initiate the DMA read
|
|
addi dmemAddr, inputBufferPos, inputBufferEnd // Delay slot: set the address to DMA read to
|
|
sub taskDataPtr, taskDataPtr, inputBufferPos // increment the DRAM address to read from next time
|
|
// Main display list command loop.
// wait_for_dma_and_run_next_command (also G_POPMTX_end / G_MOVEMEM_end): wait for
//   any DMA in flight, then run the next command.
// run_next_DL_command (also G_SPNOOP_handler): fetch the next 8-byte command and
//   dispatch it.
// State set up for every handler:
//   vZero = 0, vOne = 1 in every element
//   cmd_w0 / cmd_w1_dram = the two command words
//   $7  = command byte, sign-extended (sra of cmd_w0 by 24)
//   $1  = 0
//   inputBufferPos already advanced past this command
// Dispatch: command bytes >= G_VTX index forwards into cmdJumpTable; the remaining
// (negative after sign extension) commands are range-checked against G_SETTIMG,
// G_SETTILE and G_RDPLOADSYNC, and anything below G_RDPLOADSYNC indexes backwards
// into the entries placed before cmdJumpTable.
wait_for_dma_and_run_next_command:
|
|
G_POPMTX_end:
|
|
G_MOVEMEM_end:
|
|
jal while_wait_dma_busy // wait for the DMA read to finish
|
|
// The mfc0 below is also the delay slot of the jal above; the jal returns to the vclr.
G_SPNOOP_handler:
|
|
run_next_DL_command:
|
|
mfc0 $1, SP_STATUS // load the status word into register $1
|
|
vclr vZero // Zero vZero for each command
|
|
beqz inputBufferPos, displaylist_dma // load more DL commands if none are left
|
|
andi $1, $1, SP_STATUS_SIG0 // Delay slot: check if the task should yield
|
|
lw cmd_w0, (inputBufferEnd)(inputBufferPos) // load the command word into cmd_w0
|
|
bnez $1, load_overlay_0_and_enter // load and execute overlay 0 if yielding; $1 > 0
|
|
sra $7, cmd_w0, 24 // Delay slot: extract DL command byte from command word (sign-extended)
|
|
lw cmd_w1_dram, (inputBufferEnd + 4)(inputBufferPos) // load the next DL word into cmd_w1_dram
|
|
vadd vOne, vZero, $v31[2] // 1; set up vOne for each command
|
|
addi inputBufferPos, inputBufferPos, 0x0008 // increment the DL index by 2 words
|
|
// $7 must retain the command byte for load_mtx and overlay 4 stuff
|
|
// $11 must contain the handler called for several handlers
|
|
// $1 must remain zero
|
|
addi $2, $7, -G_VTX // If >= G_VTX, use jump table
|
|
bgez $2, do_cmd_jump_table // $2 is the index
|
|
addi $3, $2, G_VTX - (0xFF00 | G_SETTIMG) // If >= G_SETTIMG, use handler; for G_NOOP, this puts
|
|
bgez $3, G_SETxIMG_handler // garbage in second word, but normal handler does anyway
|
|
addi $12, $3, G_SETTIMG - G_SETTILE // If >= G_SETTILE, use RDP handler
|
|
bgez $12, G_RDP_handler
|
|
addi $2, $12, G_SETTILE - G_RDPLOADSYNC // If >= G_RDPLOADSYNC, refine cmd further
|
|
bgez $2, refine_cmd_further // Otherwise $2 (negative) is the index
|
|
// The sll below is also the delay slot of the bgez above.
do_cmd_jump_table:
|
|
sll $11, $2, 1 // Multiply jump table index by 2 for addr offset
|
|
lhu $11, cmdJumpTable($11) // Load address of handler from jump table
|
|
jr $11 // Jump to handler
|
|
nop // TODO; delay slot must not affect $1, $7, $11
|
|
|
|
// G_DL: call or branch to another display list.
// In:  cmd_w0 (push/nopush flag, and low byte = bytes of commands to skip loading),
//      cmd_w1_dram (target address, passed through segmented_to_physical).
// $1 = current DL stack length, $2 = flag shifted into the sign bit,
// $3 = DRAM address of the command following this one (the return address).
// For a call, $3 is pushed onto displayListStack and the length grows by 4; for a
// branch nothing is pushed. Both end in call_ret_common.
// branch_dl is a secondary entry point; it expects $1 and $2 to be set as above.
G_DL_handler:
|
|
lbu $1, displayListStackLength // Get the DL stack length
|
|
sll $2, cmd_w0, 15 // Shifts the push/nopush value to the sign bit
|
|
branch_dl:
|
|
jal segmented_to_physical
|
|
add $3, taskDataPtr, inputBufferPos // Delay slot: current DL pos to push on stack
|
|
sub $11, cmd_w1_dram, taskDataPtr // Negative how far new target is behind current end
|
|
bltz $2, call_ret_common // Nopush = branch = flag is set
|
|
move taskDataPtr, cmd_w1_dram // Delay slot: set the new DL to the target display list
|
|
sw $3, (displayListStack)($1) // Push the return address
|
|
addi $1, $1, 4 // Increment the DL stack length
|
|
// Shared tail for call / branch / return: reset materialCullMode, store the new
// stack length from $1, and reload the input buffer from the new taskDataPtr.
call_ret_common:
|
|
sb $zero, materialCullMode // This covers call, branch, return, and cull and branchZ successes
|
|
j displaylist_dma_with_count
|
|
sb $1, displayListStackLength // Delay slot: write back the DL stack length
|
|
|
|
// G_CULLDL: end the current display list if a range of vertices is entirely
// offscreen.
// vtx_addrs_from_cmd (defined elsewhere) is entered with $11 = address to continue
// at; it leaves the first vertex address in $12 and the last in $3.
// The loop keeps a running AND in $1 of (CLIP_SCRN_NPXY | CLIP_CAMPLANE) with each
// vertex's VTX_CLIP halfword. As soon as the AND is zero the range cannot be
// culled and the next command is run. If the AND is still nonzero after the last
// vertex, cmd_w0 is cleared and execution falls through into G_ENDDL_handler.
G_CULLDL_handler:
|
|
j vtx_addrs_from_cmd // Load start vtx addr in $12, end vtx in $3
|
|
li $11, culldl_return_from_addrs // Delay slot: where vtx_addrs_from_cmd continues
|
|
culldl_return_from_addrs:
|
|
li $1, (CLIP_SCRN_NPXY | CLIP_CAMPLANE) // TODO | CLIP_OCCLUDED)
|
|
lhu $11, VTX_CLIP($12) // First vertex clip flags
|
|
culldl_loop:
|
|
and $1, $1, $11 // Keep only the flags shared by every vertex so far
|
|
beqz $1, run_next_DL_command // Some vertex is on the screen-side of all clipping planes; have to render
|
|
lhu $11, (vtxSize + VTX_CLIP)($12) // Delay slot: next vertex clip flags
|
|
bne $12, $3, culldl_loop // loop until reaching the last vertex
|
|
addi $12, $12, vtxSize // Delay slot: advance to the next vertex
|
|
li cmd_w0, 0 // Clear count of DL cmds to skip loading
|
|
// G_ENDDL: return from the current display list (also reached by falling through
// from a successful G_CULLDL).
// If the DL stack is empty, overlay 0 is entered with $1 = -4 (set by the delay
// slot) to signal the end of the task. Otherwise the stack length is reduced by 4,
// the saved address is popped into taskDataPtr, and call_ret_common finishes the
// return.
G_ENDDL_handler:
|
|
lbu $1, displayListStackLength // Load the DL stack index; if end stack,
|
|
beqz $1, load_overlay_0_and_enter // load overlay 0; $1 < 0 signals end
|
|
addi $1, $1, -4 // Delay slot: decrement the DL stack index
|
|
j call_ret_common // has a different version in ovl1
|
|
lw taskDataPtr, (displayListStack)($1) // Delay slot: load addr of DL to return to
|
|
|
|
// Handler for command bytes >= G_SETTIMG, plus G_NOOP (see the dispatch code).
// G_NOOP ($7 == 0) goes straight to G_RDP_handler. For the others the image
// address in cmd_w1_dram is passed through segmented_to_physical, and, only when
// materialCullMode is currently 0, the material tracking state is updated:
//   - $12 = DRAM address just past this command
//   - if $12 differs from lastMatDLPhyAddr: materialCullMode = 1
//   - if it is the same address again:      materialCullMode = $7 (the sign-extended
//     command byte, which is negative)
// lastMatDLPhyAddr is updated to $12 in both cases. The command is then forwarded
// to the RDP via G_RDP_handler.
G_SETxIMG_handler:
|
|
beqz $7, G_RDP_handler // Don't do any of this for G_NOOP
|
|
lb $3, materialCullMode // Delay slot: get current mode
|
|
jal segmented_to_physical // Convert image to physical address
|
|
lw $2, lastMatDLPhyAddr // Delay slot: get last material physical addr
|
|
bnez $3, G_RDP_handler // If not in normal mode (0), exit
|
|
add $12, taskDataPtr, inputBufferPos // Delay slot: current material physical addr
|
|
beq $12, $2, @@skip // Branch if we are executing the same mat again
|
|
sw $12, lastMatDLPhyAddr // Delay slot: store material physical addr
|
|
li $7, 1 // > 0: in material first time
|
|
@@skip: // Otherwise $7 was < 0: cull mode (in mat second time)
|
|
j G_RDP_handler
|
|
sb $7, materialCullMode // Delay slot: store the new mode
|
|
|
|
// Second-stage dispatch for RDP-range commands, based on the command ID in $7
// and the relative ID in $2. Most commands go to G_RDP_handler; the rest go to
// scissor_other_handler, G_RDPHALF_2_handler, or fall into load_cmds_handler.
// Every branch here has its delay slot filled with the first instruction of
// the following step, so the instruction order must not be changed.
refine_cmd_further:
|
|
addi $12, $7, -(0xFF00 | G_SETSCISSOR) // Relative to G_SETSCISSOR = 0
|
|
bltz $12, G_RDP_handler // G_RDPLOADSYNC through G_SETCONVERT
|
|
andi $2, $2, 0x0003 // $2 is relative to G_RDPLOADSYNC;
|
|
beqz $2, G_RDP_handler // G_SETPRIMDEPTH and G_SETTILESIZE are multiples of 4 from here
|
|
addi $3, $7, -(0xFF00 | G_LOADTLUT) // G_SETSCISSOR and G_RDPSETOTHERMODE are < this
|
|
bltz $3, scissor_other_handler
|
|
li $2, (0xFF00 | G_RDPHALF_2)
|
|
beq $2, $7, G_RDPHALF_2_handler // Otherwise G_LOADTLUT, G_LOADBLOCK, or G_LOADTILE
|
|
// Texture load commands: dropped when materialCullMode is negative.
load_cmds_handler:
|
|
lb $3, materialCullMode // Also the delay slot of the beq above
|
|
bltz $3, run_next_DL_command // If cull mode is < 0, in mat second time, skip the load
|
|
// Appends the current command (cmd_w0, cmd_w1_dram) to the RDP command buffer.
G_RDP_handler:
|
|
// This store is also the delay slot of the bltz above; on that path the word is
// written but rdpCmdBufPtr is not advanced.
sw cmd_w1_dram, 4(rdpCmdBufPtr) // Add the second word of the command to the RDP command buffer
|
|
G_SYNC_handler:
|
|
sw cmd_w0, 0(rdpCmdBufPtr) // Add the command word to the RDP command buffer
|
|
addi rdpCmdBufPtr, rdpCmdBufPtr, 8 // Increment the next RDP command pointer by 2 words
|
|
// Entry that returns to run_next_DL_command after the buffer check.
check_rdp_buffer_full_and_run_next_cmd:
|
|
li $ra, run_next_DL_command // Set up running the next DL command as the return address
|
|
// Returns (via return_routine) if rdpCmdBufPtr has not passed rdpCmdBufEnd;
// otherwise falls into flush_rdp_buffer. Clobbers $11.
check_rdp_buffer_full:
|
|
sub $11, rdpCmdBufPtr, rdpCmdBufEnd
|
|
blez $11, return_routine // Return if rdpCmdBufEnd >= rdpCmdBufPtr
|
|
// Sends the filled RDP command buffer in DMEM to the RDP FIFO in RDRAM and
// switches to the other DMEM command buffer. On entry $11 must be
// rdpCmdBufPtr - rdpCmdBufEnd. Clobbers $11, $12, cmd_w1_dram, dmaLen, dmemAddr.
flush_rdp_buffer:
|
|
mfc0 $12, SP_DMA_BUSY // Also the delay slot of the blez above
|
|
lw cmd_w1_dram, rdpFifoPos // Current write position in the FIFO
|
|
addi dmaLen, $11, RDP_CMD_BUFSIZE // Number of bytes of commands in the buffer
|
|
bnez $12, flush_rdp_buffer // Spin until no DMA is in progress
|
|
lw $12, OSTask + OSTask_output_buff_size
|
|
mtc0 cmd_w1_dram, DPC_END
|
|
add $11, cmd_w1_dram, dmaLen // FIFO position after this write
|
|
sub $12, $12, $11
|
|
bgez $12, f3dzex_000012A8 // Taken if the write does not pass the value loaded from output_buff_size
|
|
// Otherwise restart the FIFO at OSTask_output_buff.
@@await_start_valid:
|
|
mfc0 $11, DPC_STATUS // Also the delay slot of the bgez above
|
|
andi $11, $11, DPC_STATUS_START_VALID
|
|
bnez $11, @@await_start_valid // Spin while START_VALID is set
|
|
lw cmd_w1_dram, OSTask + OSTask_output_buff
|
|
f3dzex_00001298:
|
|
mfc0 $11, DPC_CURRENT
|
|
beq $11, cmd_w1_dram, f3dzex_00001298 // Spin while DPC_CURRENT equals the buffer start
|
|
nop
|
|
mtc0 cmd_w1_dram, DPC_START
|
|
// Loop while DPC_CURRENT lies in (cmd_w1_dram, cmd_w1_dram + dmaLen], i.e. inside
// the region about to be written.
f3dzex_000012A8:
|
|
mfc0 $11, DPC_CURRENT
|
|
sub $11, $11, cmd_w1_dram
|
|
blez $11, f3dzex_000012BC // DPC_CURRENT is at or before the write position
|
|
sub $11, $11, dmaLen
|
|
blez $11, f3dzex_000012A8 // DPC_CURRENT is inside the region to write; keep waiting
|
|
f3dzex_000012BC:
|
|
add $11, cmd_w1_dram, dmaLen // Also the delay slot of the blez above
|
|
sw $11, rdpFifoPos // New FIFO write position
|
|
// Set up the DMA from DMEM to the RDP fifo in RDRAM
|
|
addi dmaLen, dmaLen, -1 // subtract 1 from the length
|
|
addi dmemAddr, rdpCmdBufEnd, -(0x2000 | RDP_CMD_BUFSIZE) // The 0x2000 is meaningless, negative means write
|
|
xori rdpCmdBufEnd, rdpCmdBufEnd, rdpCmdBuffer1End ^ rdpCmdBuffer2End // Swap between the two RDP command buffers
|
|
j dma_read_write
|
|
addi rdpCmdBufPtr, rdpCmdBufEnd, -RDP_CMD_BUFSIZE // Delay slot: point at the start of the newly selected buffer
|
|
|
|
.if (. & 4)
|
|
.warning "One instruction of padding before ovl234"
|
|
.endif
|
|
|
|
.align 8
|
|
ovl234_start:
|
|
|
|
ovl3_start:
|
|
|
|
// Jump here to do lighting. If overlay 3 is loaded (this code), loads and jumps
|
|
// to overlay 2 (same address as right here).
|
|
ovl234_lighting_entrypoint_ovl3ver: // same IMEM address as ovl234_lighting_entrypoint
|
|
li cmd_w1_dram, orga(ovl2_start) // set up a load for overlay 2 (file address of its code)
|
|
j load_overlays_2_3_4 // load overlay 2
|
|
li postOvlRA, ovl234_lighting_entrypoint // set the return address (delay slot)
|
|
|
|
// Jump here for all overlay 4 features. If overlay 3 is loaded (this code),
|
|
// loads and jumps to overlay 4 (ovl234_start).
|
|
ovl234_ovl4_entrypoint_ovl3ver: // same IMEM address as ovl234_ovl4_entrypoint
|
|
li cmd_w1_dram, orga(ovl4_start) // set up a load for overlay 4 (file address of its code)
|
|
j load_overlays_2_3_4 // load overlay 4
|
|
li postOvlRA, ovl234_ovl4_entrypoint // set the return address (delay slot)
|
|
|
|
// Jump here to do clipping. If overlay 3 is loaded (this code), directly starts
|
|
// the clipping code.
|
|
// Inputs: $1, $2, $3 = the triangle's three vertex addresses, $4 = value saved
// for flat shading, $ra = return address (saved to tempHalfword1 and restored
// in clip_done).
ovl234_clipping_entrypoint:
|
|
sh $ra, tempHalfword1
|
|
ovl3_clipping_nosavera:
|
|
sh $4, tempHalfword2
|
|
jal vtx_setup_constants // $ra is nonzero after jal, so it comes back at clip_after_constants
|
|
li clipMaskIdx, 4 // Delay slot: start with clip condition index 4
|
|
clip_after_constants:
|
|
// Clear all temp vertex slots used.
|
|
li $11, (MAX_CLIP_GEN_VERTS - 1) * vtxSize
|
|
clip_init_used_loop:
|
|
sh $zero, (VTX_CLIP + clipTempVerts)($11)
|
|
bgtz $11, clip_init_used_loop
|
|
addi $11, $11, -vtxSize // Delay slot: step down to the previous temp vertex
|
|
// This being >= 0 also indicates that tri writes are in clipping mode.
|
|
li clipPolySelect, 6 // Everything being indexed from 6 saves one instruction at the end of the loop
|
|
// Write the current three verts as the initial polygon
|
|
sh $1, (clipPoly - 6 + 0)(clipPolySelect)
|
|
sh $2, (clipPoly - 6 + 2)(clipPolySelect)
|
|
sh $3, (clipPoly - 6 + 4)(clipPolySelect)
|
|
sh $zero, (clipPoly)(clipPolySelect) // Zero to mark end of polygon
|
|
li $9, CLIP_CAMPLANE // Initial clip mask for no nearclipping
|
|
// Available locals here: $11, $1, $7, $20, $24, $12
|
|
// Outer loop: one pass per clip condition, with $9 = the clip flag mask for the
// current condition. Each pass reads the polygon in one clipPoly buffer and
// writes the clipped polygon to the other.
clip_condlooptop: // Loop over the five clipping conditions (clipMaskIdx 4 down to 0; see the table in clip_skipswap23)
|
|
lhu clipFlags, VTX_CLIP($3) // Load flags for V3, which will be the final vertex of the last polygon
|
|
and clipFlags, clipFlags, $9 // Mask V3's flags to current clip condition
|
|
addi clipPolyRead, clipPolySelect, -6 // Start reading at the beginning of the old polygon
|
|
xori clipPolySelect, clipPolySelect, 6 ^ (clipPoly2 + 6 - clipPoly) // Swap to the other polygon memory
|
|
addi clipPolyWrite, clipPolySelect, -6 // Start writing at the beginning of the new polygon
|
|
clip_edgelooptop: // Loop over edges connecting verts, possibly subdivide the edge
|
|
// Edge starts from V3, ends at V2
|
|
lhu $2, (clipPoly)(clipPolyRead) // Read next vertex of input polygon as V2 (end of edge)
|
|
addi clipPolyRead, clipPolyRead, 0x0002 // Increment read pointer
|
|
beqz $2, clip_nextcond // If V2 is 0, done with input polygon
|
|
lhu $11, VTX_CLIP($2) // Load flags for V2
|
|
and $11, $11, $9 // Mask V2's flags to current clip condition
|
|
beq $11, clipFlags, clip_nextedge // Both set or both clear = both off screen or both on screen, no subdivision
|
|
move clipFlags, $11 // clipFlags = masked V2's flags
|
|
// Going to subdivide this edge. Find available temp vertex slot.
|
|
li outputVtxPos, clipTempVerts + MAX_CLIP_GEN_VERTS * vtxSize
|
|
// Scans the temp vertex slots from the last one downwards for one without
// CLIP_VTX_USED set.
clip_find_unused_loop:
|
|
lhu $11, (VTX_CLIP - vtxSize)(outputVtxPos)
|
|
addi $12, outputVtxPos, -clipTempVerts // This is within the loop rather than before b/c delay after lhu
|
|
blez $12, clip_done // If can't find one (should never happen), give up
|
|
andi $11, $11, CLIP_VTX_USED
|
|
bnez $11, clip_find_unused_loop
|
|
addi outputVtxPos, outputVtxPos, -vtxSize // Delay slot: runs on exit too, leaving outputVtxPos at the free slot
|
|
beqz clipFlags, clip_skipswap23 // V2 flag is clear / on screen, therefore V3 is set / off screen
|
|
move $19, $2 // Delay slot: $19 = V2 (kept if the branch is taken)
|
|
move $19, $3 // Otherwise swap V2 and V3; note we are overwriting $3 but not $2
|
|
move $3, $2 //
|
|
clip_skipswap23: // After possible swap, $19 = vtx not meeting clip cond / on screen, $3 = vtx meeting clip cond / off screen
|
|
// Interpolate between these two vertices; create a new vertex which is on the
|
|
// clipping boundary (e.g. at the screen edge)
|
|
// The generated vertex is written to outputVtxPos via vtx_store, and its
// address is appended to the output polygon.
vClBaseF equ $v8
|
|
vClBaseI equ $v9
|
|
vClDiffF equ $v10
|
|
vClDiffI equ $v11
|
|
vClFade1 equ $v10 // = vClDiffF
|
|
vClFade2 equ $v2
|
|
/*
|
|
Five clip conditions (these are in a different order from vanilla):
|
|
vClBaseI/vClBaseF[3] vClDiffI/vClDiffF[3]
|
|
4 W=0: W1 W1 - W2
|
|
3 +X : X1 - 2*W1 (X1 - 2*W1) - (X2 - 2*W2) <- the 2 is clip ratio
|
|
2 -X : X1 + 2*W1 (X1 + 2*W1) - (X2 + 2*W2)
|
|
1 +Y : Y1 - 2*W1 (Y1 - 2*W1) - (Y2 - 2*W2)
|
|
0 -Y : Y1 + 2*W1 (Y1 + 2*W1) - (Y2 + 2*W2)
|
|
*/
|
|
xori $11, clipMaskIdx, 1 // Invert sign of condition
|
|
ldv $v4[0], VTX_FRAC_VEC($19) // Vtx on screen, frac pos
|
|
ctc2 $11, $vcc // Conditions 1 (+y) or 3 (+x) -> vcc[0] = 0
|
|
ldv $v5[0], VTX_INT_VEC ($19) // Vtx on screen, int pos
|
|
vmrg $v29, vOne, $v31[1] // elem 0 is 1 if W or neg cond, -1 if pos cond
|
|
andi $11, clipMaskIdx, 4 // W condition and screen clipping
|
|
ldv $v4[8], VTX_FRAC_VEC($3) // Vtx off screen, frac pos
|
|
bnez $11, clip_w // W condition (index 4): base is W alone, skip the X/Y +/- 2*W setup
|
|
ldv $v5[8], VTX_INT_VEC ($3) // Vtx off screen, int pos
|
|
vmudh $v29, $v29, $v31[3] // elem 0 is (1 or -1) * 2 (clip ratio)
|
|
andi $11, clipMaskIdx, 2 // Conditions 2 (-x) or 3 (+x)
|
|
vmudm vClBaseF, vOne, $v4[0h] // Set accumulator (care about 3, 7) to X
|
|
bnez $11, clip_skipy
|
|
vmadh vClBaseI, vOne, $v5[0h]
|
|
vmudm vClBaseF, vOne, $v4[1h] // Discard that and set accumulator 3, 7 to Y
|
|
vmadh vClBaseI, vOne, $v5[1h]
|
|
clip_skipy:
|
|
vmadn vClBaseF, $v4, $v29[0] // + W * +/- 2
|
|
vmadh vClBaseI, $v5, $v29[0]
|
|
clip_skipxy:
|
|
vsubc vClDiffF, vClBaseF, vClBaseF[7] // Vtx on screen - vtx off screen
|
|
vsub vClDiffI, vClBaseI, vClBaseI[7]
|
|
// Not sure what the first reciprocal is for.
|
|
vor $v29, vClDiffI, vOne[0] // round up int sum to odd; this ensures the value is not 0, otherwise v29 will be 0 instead of +/- 2
|
|
vrcph $v3[3], vClDiffI[3]
|
|
vrcpl $v2[3], vClDiffF[3] // frac: 1 / (x+y+z+w), vtx on screen - vtx off screen
|
|
vrcph $v3[3], vZero[0] // get int result of reciprocal
|
|
vabs $v29, $v29, $v31[3] // 2; v29 = +/- 2 based on sum positive (incl. zero) or negative
|
|
vmudn $v2, $v2, $v29[3] // multiply reciprocal by +/- 2
|
|
vmadh $v3, $v3, $v29[3]
|
|
veq $v3, $v3, vZero[0] // if reciprocal high is 0
|
|
vmrg $v2, $v2, $v31[1] // keep reciprocal low, otherwise set to -1
|
|
vmudl $v29, vClDiffF, $v2[3] // sum frac * reciprocal, discard
|
|
vmadm vClDiffI, vClDiffI, $v2[3] // sum int * reciprocal, frac out
|
|
vmadn vClDiffF, vZero, vZero[0] // get int out
|
|
vrcph $v13[3], vClDiffI[3] // reciprocal again (discard result)
|
|
vrcpl $v12[3], vClDiffF[3] // frac part
|
|
vrcph $v13[3], vZero[0] // int part
|
|
vmudl $v29, $v12, vClDiffF // self * own reciprocal? frac*frac discard
|
|
vmadm $v29, $v13, vClDiffF // self * own reciprocal? int*frac discard
|
|
vmadn vClDiffF, $v12, vClDiffI // self * own reciprocal? frac out
|
|
vmadh vClDiffI, $v13, vClDiffI // self * own reciprocal? int out
|
|
vmudh $v29, vOne, vSTScl[3] // 4 (int part), Newton-Raphson algorithm
|
|
vmadn vClDiffF, vClDiffF, $v31[0] // - 4 * prev result frac part
|
|
vmadh vClDiffI, vClDiffI, $v31[0] // - 4 * prev result int part
|
|
vmudl $v29, $v12, vClDiffF // * own reciprocal again? frac*frac discard
|
|
vmadm $v29, $v13, vClDiffF // * own reciprocal again? int*frac discard
|
|
vmadn $v12, $v12, vClDiffI // * own reciprocal again? frac out
|
|
vmadh $v13, $v13, vClDiffI // * own reciprocal again? int out
|
|
vmudl $v29, vClBaseF, $v12
|
|
// Have to load $v6 and $v7 because they were not loaded above.
|
|
// Also, put color/TC in $v12 and $v13 instead of $v26 and $v25 as the former
|
|
// will survive vertices_store.
|
|
ldv $v6[0], VTX_FRAC_VEC($3) // Vtx off screen, frac pos
|
|
vmadm $v29, vClBaseI, $v12
|
|
ldv $v7[0], VTX_INT_VEC ($3) // Vtx off screen, int pos
|
|
vmadn vClDiffF, vClBaseF, $v13
|
|
luv $v12[0], VTX_COLOR_VEC($3) // Vtx off screen, RGBA
|
|
vmadh vClDiffI, vClBaseI, $v13 // 11:10 = vtx on screen sum * prev calculated value
|
|
llv $v14[0], VTX_TC_VEC ($3) // Vtx off screen, ST
|
|
vmudl $v29, vClDiffF, $v2[3]
|
|
luv $v13[0], VTX_COLOR_VEC($19) // Vtx on screen, RGBA
|
|
vmadm vClDiffI, vClDiffI, $v2[3]
|
|
llv vPairST[0], VTX_TC_VEC($19) // Vtx on screen, ST
|
|
vmadn vClDiffF, vClDiffF, vZero[0] // * one of the reciprocals above
|
|
vlt vClDiffI, vClDiffI, vOne[0] // If integer part of factor less than 1,
|
|
vmrg vClDiffF, vClDiffF, $v31[1] // keep frac part of factor, else set to 0xFFFF (max val)
|
|
vsubc $v29, vClDiffF, vOne[0] // frac part - 1 for carry
|
|
vge vClDiffI, vClDiffI, vZero[0] // If integer part of factor >= 0 (after carry, so overall value >= 0x0000.0001),
|
|
vmrg vClFade1, vClDiffF, vOne[0] // keep frac part of factor, else set to 1 (min val)
|
|
vmudn vClFade2, vClFade1, $v31[1] // signed x * -1 = 0xFFFF - unsigned x! v2[3] is fade factor for on screen vert
|
|
// Fade between attributes for on screen and off screen vert
|
|
// Also, colors are now in $v12 and $v13.
|
|
// Also, texture coords are now in $v14 and vPairST.
|
|
vmudm $v29, $v12, vClFade1[3] // Fade factor for off screen vert * off screen vert color and TC
|
|
lhu $11, VTX_CLIP($3) // Load clip flags for off screen vert
|
|
vmadm vPairRGBA, $v13, vClFade2[3] // + Fade factor for on screen vert * on screen vert color
|
|
li $7, 0x0000 // Set no fog
|
|
vmudm $v29, $v14, vClFade1[3] // Fade factor for off screen vert * off screen vert TC
|
|
move secondVtxPos, outputVtxPos // Writes garbage second vertex and then output vertex to same place
|
|
vmadm vPairST, vPairST, vClFade2[3] // + Fade factor for on screen vert * on screen vert TC
|
|
andi $11, $11, ~CLIP_VTX_USED // Clear used flag from off screen vert
|
|
vmudl $v29, $v6, vClFade1[3] // Fade factor for off screen vert * off screen vert pos frac
|
|
sh outputVtxPos, (clipPoly)(clipPolyWrite) // Write pointer to generated vertex to polygon
|
|
vmadm $v29, $v7, vClFade1[3] // + Fade factor for off screen vert * off screen vert pos int
|
|
addi clipPolyWrite, clipPolyWrite, 2 // Increment write ptr
|
|
vmadl $v29, $v4, vClFade2[3] // + Fade factor for on screen vert * on screen vert pos frac
|
|
sh $11, VTX_CLIP($3) // Store modified clip flags for off screen vert
|
|
vmadm vPairTPosI, $v5, vClFade2[3] // + Fade factor for on screen vert * on screen vert pos int
|
|
jal vtx_store // Write new vertex
|
|
vmadn vPairTPosF, vZero, vZero[0] // Load resulting frac pos
|
|
// End of one edge: V2 becomes the start (V3) of the next edge, and is appended
// to the output polygon only if its masked clip flag is clear (on screen).
clip_nextedge:
|
|
bnez clipFlags, clip_edgelooptop // Discard V2 if it was off screen (whether inserted vtx or not)
|
|
move $3, $2 // Move what was the end of the edge to be the new start of the edge
|
|
sh $3, (clipPoly)(clipPolyWrite) // Former V2 was on screen, so add it to the output polygon
|
|
j clip_edgelooptop
|
|
addi clipPolyWrite, clipPolyWrite, 2 // Delay slot: advance the write pointer past the stored vertex
|
|
|
|
// W clip condition: the base value is the position vector itself ($v4 frac,
// $v5 int), with no +/- 2*W term. Rejoins the common path at clip_skipxy.
clip_w:
|
|
vcopy vClBaseF, $v4 // Result is just W
|
|
j clip_skipxy
|
|
vcopy vClBaseI, $v5 // Delay slot: int part
|
|
|
|
// End of one clip condition: quit if the output polygon is degenerate, otherwise
// terminate it and either start the next condition or, once clipMaskIdx has
// reached 0, go draw the result.
clip_nextcond:
|
|
sub $11, clipPolyWrite, clipPolySelect // Are there fewer than 3 verts in the output polygon?
|
|
bltz $11, clip_done // If so, degenerate result, quit
|
|
sh $zero, (clipPoly)(clipPolyWrite) // Terminate the output polygon with a 0
|
|
lhu $3, (clipPoly - 2)(clipPolyWrite) // Initialize the edge start (V3) to the last vert
|
|
clip_nextcond_skip:
|
|
beqz clipMaskIdx, clip_draw_tris
|
|
lbu $11, (clipCondShifts - 1)(clipMaskIdx) // Load next clip condition shift amount
|
|
li $9, 1
|
|
sllv $9, $9, $11 // $9 is clip mask
|
|
j clip_condlooptop
|
|
addi clipMaskIdx, clipMaskIdx, -1 // Delay slot: move on to the next condition index
|
|
|
|
// Clipping is complete; prepare to draw the final polygon as triangles. Finds
// the vertex with the lowest screen Y, then chooses which of its two neighbours
// forms the starting edge $14 - $15 for clip_draw_loop.
clip_draw_tris:
|
|
lhu $4, tempHalfword2 // Pointer to original first vertex for flat shading
|
|
lqv $v30, v30Value($zero)
|
|
// Current polygon starts 6 (3 verts) below clipPolySelect, ends 2 (1 vert) below clipPolyWrite
|
|
addi clipPolySelect, clipPolySelect, -6 // = Pointer to first vertex
|
|
// Available locals: most registers ($5, $6, $7, $8, $9, $11, $12, etc.)
|
|
// Available regs which won't get clobbered by tri write:
|
|
// clipPolySelect, clipPolyWrite, $14 (inputVtxPos), $15 (outputVtxPos), (more)
|
|
// Find vertex highest on screen (lowest screen Y)
|
|
li $5, 0x7FFF // current best value
|
|
move $7, clipPolySelect // initial vertex pointer
|
|
lhu $12, (clipPoly)($7) // Load vertex address
|
|
clip_search_highest_loop:
|
|
lh $9, VTX_SCR_Y($12) // Load screen Y
|
|
sub $11, $9, $5 // Branch if new vtx Y >= best vtx Y
|
|
bgez $11, clip_search_skip_better
|
|
addi $7, $7, 2 // Next vertex
|
|
addi $14, $7, -2 // Save pointer to best/current vertex
|
|
move $5, $9 // Save best value
|
|
clip_search_skip_better:
|
|
bne clipPolyWrite, $7, clip_search_highest_loop
|
|
lhu $12, (clipPoly)($7) // Next vertex address
|
|
addi clipPolyWrite, clipPolyWrite, -2 // = Pointer to last vertex
|
|
// Find next closest vertex, from the two on either side
|
|
bne $14, clipPolySelect, @@skip1
|
|
addi $6, $14, -2 // $6 = previous vertex
|
|
move $6, clipPolyWrite // Best is the first vertex, so previous wraps around to the last
|
|
@@skip1:
|
|
lhu $7, (clipPoly)($6)
|
|
bne $14, clipPolyWrite, @@skip2
|
|
addi $8, $14, 2 // $8 = next vertex
|
|
move $8, clipPolySelect // Best is the last vertex, so next wraps around to the first
|
|
@@skip2:
|
|
lhu $9, (clipPoly)($8)
|
|
lh $7, VTX_SCR_Y($7)
|
|
lh $9, VTX_SCR_Y($9)
|
|
sub $11, $7, $9 // If value from prev vtx >= value from next, use next
|
|
bgez $11, clip_draw_loop
|
|
move $15, $8 // $14 is first, $8 -> $15 is next
|
|
move $15, $14 // $14 -> $15 is next
|
|
move $14, $6 // $6 -> $14 is first
|
|
// Draws one triangle of the clipped polygon per iteration and shrinks the
// remaining range; tri_noinit returns here through $ra. Exits to clip_done
// when only two vertices are left.
clip_draw_loop:
|
|
// Current edge is $14 - $15 (pointers to clipPoly). We can either draw
|
|
// (previous) - $14 - $15, or we can draw $14 - $15 - (next). We want the
|
|
// one where the lower edge covers the fewest scanlines. This edge is
|
|
// (previous) - $15 or $14 - (next).
|
|
// $1, $2, $3, $5 are vertices at $11=prev, $14, $15, $12=next
|
|
bne $14, clipPolySelect, @@skip1
|
|
addi $11, $14, -2
|
|
move $11, clipPolyWrite // $14 is the first vertex, so previous wraps around to the last
|
|
@@skip1:
|
|
beq $11, $15, clip_done // If previous is $15, we only have two verts left, done
|
|
lhu $1, (clipPoly)($11) // From the group below, need something in the delay slot
|
|
bne $15, clipPolyWrite, @@skip2
|
|
addi $12, $15, 2
|
|
move $12, clipPolySelect // $15 is the last vertex, so next wraps around to the first
|
|
@@skip2:
|
|
lhu $2, (clipPoly)($14)
|
|
lhu $3, (clipPoly)($15)
|
|
lhu $5, (clipPoly)($12)
|
|
lsv $v5[0], (VTX_SCR_Y)($1)
|
|
lsv $v5[4], (VTX_SCR_Y)($2)
|
|
lsv $v5[2], (VTX_SCR_Y)($3)
|
|
lsv $v5[6], (VTX_SCR_Y)($5)
|
|
vsub $v5, $v5, $v5[1q] // Y(prev) - Y($15) in elem 0, Y($14) - Y(next) in elem 2
|
|
move $8, $14 // Temp copy of $14, will be overwritten
|
|
vabs $v5, $v5, $v5 // abs of each
|
|
vlt $v29, $v5, $v5[0h] // Elem 2: second difference less than first difference
|
|
cfc2 $9, $vcc // Get comparison results
|
|
andi $9, $9, 4 // Look at only vector element 2
|
|
beqz $9, clip_final_draw // Skip the change if second diff greater than or equal to first diff
|
|
move $14, $11 // If skipping, drawing prev-$14-$15, so update $14 to be prev
|
|
move $1, $2 // Drawing $14, $15, next
|
|
move $2, $3
|
|
move $3, $5
|
|
move $14, $8 // Restore overwritten $14
|
|
move $15, $12 // Update $15 to be next
|
|
clip_final_draw:
|
|
mtc2 $1, $v27[10] // Addresses go in vector regs too
|
|
mtc2 $2, $v4[12]
|
|
mtc2 $3, $v27[14]
|
|
j tri_noinit // Draw tri
|
|
li $ra, clip_draw_loop // When done, return to top of loop
|
|
|
|
// Exit from clipping: restores the return address saved in tempHalfword1 by
// ovl234_clipping_entrypoint and returns to it.
clip_done:
|
|
lh $ra, tempHalfword1
|
|
jr $ra
|
|
li clipPolySelect, -1 // Back to normal tri drawing mode (check clip masks)
|
|
|
|
ovl3_end:
|
|
.align 8
|
|
ovl3_padded_end:
|
|
|
|
.orga max(max(ovl2_padded_end - ovl2_start, ovl4_padded_end - ovl4_start) + orga(ovl3_start), orga())
|
|
ovl234_end:
|
|
|
|
// G_VTX: reads the vertex data size from the command, stores v0 << 1 back into
// the command bytes, adds the vertex count to the upper half of perfCounter1,
// and has vtx_addrs_from_cmd compute the destination addresses. Continues at
// vtx_return_from_addrs.
G_VTX_handler:
|
|
lhu $1, (inputBufferEnd - 0x07)(inputBufferPos) // $1 = size in bytes = vtx count * 0x10
|
|
srl $2, cmd_w0, 11 // n << 1
|
|
sub $2, cmd_w0, $2 // v0 << 1
|
|
sb $2, (inputBufferEnd - 0x06)(inputBufferPos) // Store v0 << 1 as byte 2
|
|
sll $11, $1, 12 // Vtx count * 0x10000
|
|
add perfCounter1, perfCounter1, $11 // Add to vertex count
|
|
j vtx_addrs_from_cmd // v0 << 1 is elem 2, (v0 + n) << 1 is elem 3 = $12
|
|
li $11, vtx_return_from_addrs // Delay slot; presumably the label vtx_addrs_from_cmd jumps back to
|
|
// Starts the DMA of the input vertices into DMEM (ending at the address in $12),
// loads the M matrix into vM0I..vM3F with each row duplicated in both vector
// halves, and runs overlay 4 first if the M inverse transpose must be computed.
vtx_return_from_addrs:
|
|
lhu $5, geometryModeLabel + 1 // Middle 2 bytes of geom mode
|
|
andi $12, $12, 0xFFF8 // Round down end addr to DMA word; one input vtx still fits in one internal vtx
|
|
mfc2 outputVtxPos, $v27[4] // Address of start in vtxSize units
|
|
jal segmented_to_physical // Convert address in cmd_w1_dram to physical
|
|
sub dmemAddr, $12, $1 // Start addr = end addr - size
|
|
jal dma_read_write
|
|
addi dmaLen, $1, -1 // DMA length is always offset by -1
|
|
move inputVtxPos, dmemAddr
|
|
lqv vM0I, (mMatrix + 0x00)($zero) // Load M matrix
|
|
lqv vM2I, (mMatrix + 0x10)($zero)
|
|
lqv vM0F, (mMatrix + 0x20)($zero)
|
|
lqv vM2F, (mMatrix + 0x30)($zero)
|
|
lbu $11, mITValid // 0 if matrix invalid, 1 if valid
|
|
vcopy vM1I, vM0I
|
|
lbu $12, normalsMode // bit 0 clear if don't compute mIT, set if do
|
|
vcopy vM3I, vM2I
|
|
ldv vM1I[0], (mMatrix + 0x08)($zero)
|
|
vcopy vM1F, vM0F
|
|
ldv vM3I[0], (mMatrix + 0x18)($zero)
|
|
vcopy vM3F, vM2F
|
|
ldv vM1F[0], (mMatrix + 0x28)($zero)
|
|
sltiu $11, $11, 1 // 0 if matrix valid, 1 if invalid
|
|
srl $7, $5, 9 // G_LIGHTING flag; only bit 0 survives the ANDs below (original said "bit 1") - NOTE(review): confirm G_LIGHTING >> 8 == 0x200
|
|
and $7, $7, $11 // If lighting enabled and need to update matrix,
|
|
and $7, $7, $12 // and computing mIT,
|
|
ldv vM3F[0], (mMatrix + 0x38)($zero)
|
|
ldv vM0I[8], (mMatrix + 0x00)($zero)
|
|
ldv vM2I[8], (mMatrix + 0x10)($zero)
|
|
ldv vM0F[8], (mMatrix + 0x20)($zero)
|
|
bnez $7, ovl234_ovl4_entrypoint // run overlay 4 to compute M inverse transpose
|
|
ldv vM2F[8], (mMatrix + 0x30)($zero)
|
|
// Loads the VP matrix into vVP0I..vVP3F (same layout as the M matrix load),
// pre-decrements outputVtxPos by two vertices, and clears $ra so that
// vtx_setup_constants, which follows directly, does not return to clipping.
vtx_after_calc_mit:
|
|
lqv vVP0I, (vpMatrix + 0x00)($zero)
|
|
lqv vVP2I, (vpMatrix + 0x10)($zero)
|
|
lqv vVP0F, (vpMatrix + 0x20)($zero)
|
|
lqv vVP2F, (vpMatrix + 0x30)($zero)
|
|
addi outputVtxPos, outputVtxPos, -2*vtxSize // Going to increment this by 2 verts in loop
|
|
vcopy vVP1I, vVP0I
|
|
li $ra, 0 // Flag to not return to clipping
|
|
vcopy vVP3I, vVP2I
|
|
ldv vVP1I[0], (vpMatrix + 0x08)($zero)
|
|
vcopy vVP1F, vVP0F
|
|
ldv vVP3I[0], (vpMatrix + 0x18)($zero)
|
|
vcopy vVP3F, vVP2F
|
|
ldv vVP1F[0], (vpMatrix + 0x28)($zero)
|
|
ldv vVP3F[0], (vpMatrix + 0x38)($zero)
|
|
ldv vVP0I[8], (vpMatrix + 0x00)($zero)
|
|
ldv vVP2I[8], (vpMatrix + 0x10)($zero)
|
|
ldv vVP0F[8], (vpMatrix + 0x20)($zero)
|
|
ldv vVP2F[8], (vpMatrix + 0x30)($zero)
|
|
// Builds the per-vertex constant vectors described below. Reached by
// fall-through from vtx_after_calc_mit with $ra == 0, or by jal from clipping
// with $ra != 0; in the latter case it exits to clip_after_constants.
// Otherwise it waits for the vertex DMA and falls into vtx_load_loop.
// Uses $11, $12, $v20, $v21, $v23 and $v29 as temporaries.
vtx_setup_constants:
|
|
/*
|
|
vVpScl = [vscale[0], -vscale[1], vscale[2], fogMult, (repeat)]
|
|
vVpOfs = [vtrans[0], vtrans[1], vtrans[2], fogOffset, (repeat)]
|
|
vSTScl = [TexSScl, TexTScl, perspNorm, 4, TexSScl, TexTScl, ---, 4 ]
|
|
vSTOfs = [TexSOfs, TexTOfs, aoAmb, 0, TexSOfs, TexTOfs, aoDir, 0 ]
|
|
$v31 = [-4, -1, 1, 2, 4, 0x4000, 0x7F00, 0x7FFF]
|
|
aoAmb, aoDir set to 0 if ambient occlusion disabled
|
|
TexSOfs, TexTOfs set to 0 if ST attr offset disabled;
|
|
vtrans[2] not incremented by Z attr offset if disabled
|
|
*/
|
|
vne $v29, $v31, $v31[2h] // VCC = 11011101
|
|
ldv vSTOfs[0], (attrOffsetST - altBase)(altBaseReg) // elems 0, 1, 2 = S, T, Z offset
|
|
vclr $v21 // Zero
|
|
ldv vVpOfs[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7
|
|
ldv vVpOfs[8], (viewport + 8)($zero)
|
|
lhu $12, (geometryModeLabel+2)($zero)
|
|
vmrg $v29, $v21, vSTOfs[2] // all zeros except elems 2, 6 are Z offset
|
|
ldv vSTOfs[8], (attrOffsetST - altBase)(altBaseReg) // Duplicated in 4-6
|
|
andi $11, $12, G_ATTROFFSET_Z_ENABLE
|
|
beqz $11, @@skipz // Skip if Z offset disabled
|
|
llv $v20[4], (aoAmbientFactor - altBase)(altBaseReg) // Load aoAmb 2 and aoDir 3
|
|
vadd vVpOfs, vVpOfs, $v29 // add Z offset if enabled
|
|
@@skipz:
|
|
andi $11, $12, G_ATTROFFSET_ST_ENABLE
|
|
bnez $11, @@skipst // Skip if ST offset enabled
|
|
llv vSTScl[0], (textureSettings2)($zero) // Texture ST scale in 0, 1
|
|
vclr vSTOfs // If disabled, clear ST offset
|
|
@@skipst:
|
|
andi $11, $12, G_AMBOCCLUSION
|
|
vmov $v20[6], $v20[3] // move aoDir to 6
|
|
bnez $11, @@skipao // Skip if ambient occlusion enabled
|
|
llv vSTScl[8], (textureSettings2)($zero) // Texture ST scale in 4, 5
|
|
vcopy $v20, $v21 // Set aoAmb and aoDir to 0
|
|
@@skipao:
|
|
ldv vVpScl[0], (viewport)($zero) // Load vscale duplicated in 0-3 and 4-7
|
|
ldv vVpScl[8], (viewport)($zero)
|
|
llv vSTScl[4], (perspNorm)($zero) // perspNorm in elem 2, garbage in 3
|
|
llv $v23[0], (fogFactor)($zero) // Load fog multiplier 0 and offset 1
|
|
vmrg vSTOfs, vSTOfs, $v20 // move aoAmb and aoDir into vSTOfs
|
|
vne $v29, $v31, $v31[3h] // VCC = 11101110
|
|
vsub $v20, $v21, vVpScl // -vscale
|
|
vmrg vSTScl, vSTScl, $v31[4] // Put 4s in elements 3,7
|
|
vmrg vVpScl, vVpScl, $v23[0] // Put fog multiplier in elements 3,7 of vscale
|
|
vadd $v23, $v23, $v31[6] // Add 0x7F00 to fog offset
|
|
vmrg vSTOfs, vSTOfs, $v21 // Put 0s in elements 3,7
|
|
vmov vVpScl[1], $v20[1] // Negate vscale[1] because RDP top = y=0
|
|
vmov vVpScl[5], $v20[1] // Same for second half
|
|
bnez $ra, clip_after_constants // Return to clipping if from there
|
|
vmrg vVpOfs, vVpOfs, $v23[1] // Put fog offset in elements 3,7 of vtrans
|
|
jal while_wait_dma_busy // Wait for vertex load to finish
|
|
andi $7, $5, G_FOG >> 8 // Nonzero if fog enabled
|
|
// Top of the vertex loop; each iteration handles two input vertices, one per
// vector half. Loads position, color / normal bytes and ST, transforms the
// positions by the M matrix, then branches to the lighting overlay entry point
// if G_LIGHTING is set or falls into vtx_return_from_lighting.
vtx_load_loop:
|
|
ldv vPairPosI[8], (VTX_IN_OB + inputVtxSize * 1)(inputVtxPos)
|
|
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
|
|
ldv vPairPosI[0], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos)
|
|
vmudn $v29, vM3F, $v31[2] // 1
|
|
// Element access wraps in lpv/luv, but not intuitively. Basically the named
|
|
// element and above do get the values at the specified address, but the earlier
|
|
// elements get the values before that, except masked to 0xF. So for example here,
|
|
// elems 4-7 get bytes 0-3 of the vertex as it looks like they should, but elems
|
|
// 0-3 get bytes C-F of the vertex (which is what we want).
|
|
luv vPairRGBA[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Colors as unsigned, lower 4
|
|
vmadh $v29, vM3I, $v31[2]
|
|
luv $v25[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
|
|
vmadn $v29, vM0F, vPairPosI[0h]
|
|
lpv $v28[4], (VTX_IN_OB + inputVtxSize * 0)(inputVtxPos) // Normals as signed, lower 4
|
|
vmadh $v29, vM0I, vPairPosI[0h]
|
|
lpv $v26[0], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // Upper 4
|
|
vmadn $v29, vM1F, vPairPosI[1h]
|
|
llv vPairST[0], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // ST in 0:1
|
|
vmadh $v29, vM1I, vPairPosI[1h]
|
|
llv vPairST[8], (VTX_IN_TC + inputVtxSize * 1)(inputVtxPos) // ST in 4:5
|
|
vmadn vPairPosF, vM2F, vPairPosI[2h]
|
|
andi $11, $5, G_LIGHTING >> 8
|
|
vmadh vPairPosI, vM2I, vPairPosI[2h] // vPairPosI/F = vertices world coords
|
|
// Elems 0-1 get bytes 6-7 of the following vertex (0)
|
|
lpv $v30[2], (VTX_IN_TC - inputVtxSize * 1)(inputVtxPos) // Packed normals as signed, lower 2
|
|
vmrg vPairRGBA, vPairRGBA, $v25 // Merge colors
|
|
bnez $11, ovl234_lighting_entrypoint
|
|
// Elems 4-5 get bytes 6-7 of the following vertex (1)
|
|
lpv $v25[6], (VTX_IN_TC + inputVtxSize * 0)(inputVtxPos) // Upper 2 in 4:5
|
|
// Transforms the vertex pair by the VP matrix into vPairTPosI/F, advances the
// input and output pointers, picks the address vtx_store returns to
// (vtx_load_loop while vertices remain, else run_next_DL_command), and applies
// the ST scale and offset. Falls into vtx_store.
vtx_return_from_lighting:
|
|
vmudn $v29, vVP3F, $v31[2] // 1
|
|
addi inputVtxPos, inputVtxPos, 2*inputVtxSize
|
|
vmadh $v29, vVP3I, $v31[2] // 1
|
|
addi outputVtxPos, outputVtxPos, 2*vtxSize
|
|
vmadl $v29, vVP0F, vPairPosF[0h]
|
|
addi $1, $1, -2*inputVtxSize // Counter of remaining verts * inputVtxSize
|
|
vmadm $v29, vVP0I, vPairPosF[0h]
|
|
addi secondVtxPos, outputVtxPos, vtxSize
|
|
vmadn $v29, vVP0F, vPairPosI[0h]
|
|
bgez $1, @@skip1 // If < 0 verts remain, second and output vertices write to same mem
|
|
vmadh $v29, vVP0I, vPairPosI[0h]
|
|
move secondVtxPos, outputVtxPos
|
|
@@skip1:
|
|
vmadl $v29, vVP1F, vPairPosF[1h]
|
|
li $ra, vtx_load_loop // Default: vtx_store returns to the top of the loop
|
|
vmadm $v29, vVP1I, vPairPosF[1h]
|
|
bgtz $1, @@skip2 // If <= 0 verts remain, run next DL command
|
|
vmadn $v29, vVP1F, vPairPosI[1h]
|
|
li $ra, run_next_DL_command
|
|
@@skip2:
|
|
vmadh $v29, vVP1I, vPairPosI[1h]
|
|
vmadl $v29, vVP2F, vPairPosF[2h]
|
|
vmadm $v29, vVP2I, vPairPosF[2h]
|
|
vmadn vPairTPosF, vVP2F, vPairPosI[2h]
|
|
vmadh vPairTPosI, vVP2I, vPairPosI[2h]
|
|
vmudm $v29, vPairST, vSTScl // Scale ST; must be after texgen
|
|
vmadh vPairST, vSTOfs, $v31[2] // + 1 * ST offset
|
|
// Writes a pair of transformed vertices (or one, when secondVtxPos ==
// outputVtxPos) to the vertex buffer: clip-space position, color, ST, 1/W,
// screen coordinates, optional fog alpha, and the clip / occlusion flags.
// Reached by fall-through from the vertex loop or by jal from clipping;
// the routine ends with jr $ra in the vtx_skip_fog part.
vtx_store:
|
|
// Inputs: vPairTPosI, vPairTPosF, vPairST, vPairRGBA
|
|
// Locals: $v20, $v21, $v25, $v26, $v28, $v30 ($v29 is temp)
|
|
// Scalar regs: secondVtxPos, outputVtxPos; set to the same thing if only write 1 vtx
|
|
// $7 != 0 if fog; temps $6, $11, $12, $20, $24
|
|
ldv $v30[0], (occlusionPlaneMidCoeffs - altBase)(altBaseReg)
|
|
vch $v29, vPairTPosI, vPairTPosI[3h] // Clip screen high
|
|
ldv $v30[8], (occlusionPlaneMidCoeffs - altBase)(altBaseReg)
|
|
vcl $v29, vPairTPosF, vPairTPosF[3h] // Clip screen low
|
|
vmudl $v29, vPairTPosF, vSTScl[2] // Persp norm
|
|
vmadm $v20, vPairTPosI, vSTScl[2] // Persp norm
|
|
vmadn $v21, vSTOfs, vSTOfs[3] // Zero
|
|
cfc2 $12, $vcc // Load screen clipping results
|
|
vmudn $v29, vPairTPosF, $v30 // X * kx, Y * ky, Z * kz
|
|
vmadh $v29, vPairTPosI, $v30 // Int * int
|
|
vreadacc $v28, ACC_UPPER // Load int * int portion
|
|
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
|
|
srl $24, $12, 4 // Shift second vertex screen clipping to first slots
|
|
vmudn $v26, vPairTPosF, $v31[3] // W * clip ratio for scaled clipping
|
|
andi $24, $24, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
|
|
vmadh $v25, vPairTPosI, $v31[3] // W * clip ratio for scaled clipping
|
|
andi $12, $12, CLIP_SCRN_NPXY | CLIP_CAMPLANE // Mask to only screen bits we care about
|
|
vmrg $v28, $v28, $v30 // Put constant factor in elems 3, 7
|
|
sdv vPairTPosF[8], (VTX_FRAC_VEC )(secondVtxPos)
|
|
vrcph $v29[0], $v20[3]
|
|
sdv vPairTPosF[0], (VTX_FRAC_VEC )(outputVtxPos)
|
|
vrcpl $v30[2], $v21[3]
|
|
sdv vPairTPosI[8], (VTX_INT_VEC )(secondVtxPos)
|
|
vrcph $v30[3], $v20[7]
|
|
sdv vPairTPosI[0], (VTX_INT_VEC )(outputVtxPos)
|
|
vrcpl $v30[6], $v21[7]
|
|
suv vPairRGBA[4], (VTX_COLOR_VEC )(secondVtxPos)
|
|
vadd $v28, $v28, $v28[0q] // Add pairs upwards
|
|
suv vPairRGBA[0], (VTX_COLOR_VEC )(outputVtxPos)
|
|
vrcph $v30[7], vSTOfs[3] // Zero
|
|
slv vPairST[8], (VTX_TC_VEC )(secondVtxPos)
|
|
vch $v29, vPairTPosI, $v25[3h] // Clip scaled high
|
|
slv vPairST[0], (VTX_TC_VEC )(outputVtxPos)
|
|
vcl $v29, vPairTPosF, $v26[3h] // Clip scaled low
|
|
vadd $v28, $v28, $v28[1h] // Add elems 1, 5 to 3, 7
|
|
cfc2 $20, $vcc // Load scaled clipping results
|
|
vmudl $v29, $v21, $v30[2h]
|
|
vmadm $v29, $v20, $v30[2h]
|
|
vmadn $v21, $v21, $v30[3h]
|
|
lsv vPairTPosF[14], (VTX_Z_FRAC )(secondVtxPos) // load Z into W slot, will be for fog below
|
|
vmadh $v20, $v20, $v30[3h]
|
|
lsv vPairTPosF[6], (VTX_Z_FRAC )(outputVtxPos) // load Z into W slot, will be for fog below
|
|
vge $v29, $v28, vSTOfs[3] // >= 0 in elems 3, 7
|
|
vmudh $v29, vSTScl, $v31[2] // 4 * 1 in elems 3, 7
|
|
cfc2 $11, $vcc // Load occlusion plane mid results to bits 3 and 7 (garbage in others)
|
|
vmadn $v21, $v21, $v31[0] // -4
|
|
ori $12, $12, CLIP_VTX_USED // Write for all first verts, only matters for generated verts
|
|
vmadh $v20, $v20, $v31[0] // -4
|
|
andi $20, $20, ~(CLIP_OCCLUDED | (CLIP_OCCLUDED >> 4)) // Mask out bits we will or in
|
|
vge $v29, vPairTPosI, vSTOfs[3] // Zero; vcc set if w >= 0
|
|
andi $11, $11, CLIP_OCCLUDED | (CLIP_OCCLUDED >> 4) // Only meaningful bits from occlusion
|
|
vmrg $v25, vSTOfs, $v31[7] // 0 or 0x7FFF in elems 3, 7, latter if w < 0
|
|
lsv vPairTPosI[14], (VTX_Z_INT )(secondVtxPos) // load Z into W slot, will be for fog below
|
|
vmudl $v29, $v21, $v30[2h]
|
|
lsv vPairTPosI[6], (VTX_Z_INT )(outputVtxPos) // load Z into W slot, will be for fog below
|
|
vmadm $v29, $v20, $v30[2h]
|
|
or $20, $20, $11 // Combine occlusion results with scaled results
|
|
vmadn $v28, $v21, $v30[3h]
|
|
sll $11, $20, 4 // Shift first vertex scaled clipping to second slots
|
|
vmadh $v30, $v20, $v30[3h] // $v30:$v28 is 1/W
|
|
andi $20, $20, CLIP_SCAL_NPXY | CLIP_OCCLUDED // Mask to only bits we care about
|
|
vmadh $v25, $v25, $v31[7] // 0x7FFF; $v25:$v28 is 1/W but large number if W negative
|
|
andi $11, $11, CLIP_SCAL_NPXY | CLIP_OCCLUDED // Mask to only bits we care about
|
|
vge $v29, $v31, $v31[2h] // Set VCC to 00110011
|
|
or $24, $24, $20 // Combine results for second vertex
|
|
vmudl $v29, vPairTPosF, $v28[3h]
|
|
ssv $v28[14], (VTX_INV_W_FRAC)(secondVtxPos)
|
|
vmadm $v29, vPairTPosI, $v28[3h]
|
|
ssv $v28[6], (VTX_INV_W_FRAC)(outputVtxPos)
|
|
vmadn vPairTPosF, vPairTPosF, $v25[3h]
|
|
ssv $v30[14], (VTX_INV_W_INT )(secondVtxPos)
|
|
vmadh vPairTPosI, vPairTPosI, $v25[3h] // pos * 1/W
|
|
ssv $v30[6], (VTX_INV_W_INT )(outputVtxPos)
|
|
vmudl $v29, vPairTPosF, vSTScl[2] // Persp norm
|
|
ldv $v30[0], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // Load coeffs 0-3
|
|
vmadm vPairTPosI, vPairTPosI, vSTScl[2] // Persp norm
|
|
ldv $v30[8], (occlusionPlaneEdgeCoeffs - altBase)(altBaseReg) // and for vtx 2
|
|
vmadn vPairTPosF, vSTOfs, vSTOfs[3] // Zero
|
|
or $12, $12, $11 // Combine results for first vertex
|
|
vmudh $v29, vVpOfs, $v31[2] // offset * 1
|
|
vmadn vPairTPosF, vPairTPosF, vVpScl // + XYZ * scale
|
|
vmadh vPairTPosI, vPairTPosI, vVpScl
|
|
vmrg $v26, $v31, $v31[0] // Signs of $v26 are --++--++
|
|
// 2 cycles to wait for vPairTPosI
|
|
vmudh $v28, vPairTPosI, $v31[4] // 4; scale up x and y
|
|
slv vPairTPosI[8], (VTX_SCR_VEC )(secondVtxPos)
|
|
vabs $v26, $v26, $v31[5] // $v26 is 0xC000, 0xC000, 0x4000, 0x4000, repeat
|
|
slv vPairTPosI[0], (VTX_SCR_VEC )(outputVtxPos)
|
|
vge $v21, vPairTPosI, $v31[6] // 0x7F00; clamp fog to >= 0 (low byte only)
|
|
ssv vPairTPosF[12], (VTX_SCR_Z_FRAC)(secondVtxPos)
|
|
vge $v20, vPairTPosI, vSTOfs[3] // Zero; clamp Z to >= 0
|
|
ssv vPairTPosF[4], (VTX_SCR_Z_FRAC)(outputVtxPos)
|
|
vmulf $v29, $v30, $v28[0h] // 4*X1*c0, --, 4*X1*c2, --, repeat vtx 2
|
|
beqz $7, vtx_skip_fog // $7 == 0: fog disabled, leave the stored alpha alone
|
|
vmacf $v25, $v26, vPairTPosI[1h] // -0x4000*Y1, --, +0x4000*Y1, --, repeat vtx 2
|
|
sbv $v21[15], (VTX_COLOR_A )(secondVtxPos) // Fog enabled: overwrite alpha with the clamped fog value
|
|
sbv $v21[7], (VTX_COLOR_A )(outputVtxPos)
|
|
vtx_skip_fog:
|
|
vmulf $v29, $v30, $v28[1h] // --, 4*Y1*c1, --, 4*Y1*c3, repeat vtx 2
|
|
ssv $v20[12], (VTX_SCR_Z )(secondVtxPos)
|
|
vmacf $v26, $v26, vPairTPosI[0h] // --, -0x4000*X1, --, +0x4000*X1, repeat vtx 2
|
|
ssv $v20[4], (VTX_SCR_Z )(outputVtxPos)
|
|
veq $v29, $v31, $v31[0q] // Set VCC to 10101010
|
|
ldv $v30[0], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // Load coeffs 4-7
|
|
vmrg $v25, $v25, $v26 // Elems 0-3 are results for vtx 0, 4-7 for vtx 1
|
|
ldv $v30[8], (occlusionPlaneEdgeCoeffs + 8 - altBase)(altBaseReg) // and for vtx 2
|
|
vge $v29, $v25, $v30 // Each compare to coeffs 4-7
|
|
cfc2 $20, $vcc
|
|
andi $11, $20, 0x00F0 // Bits 4-7 for vtx 2
|
|
beqz $11, @@skipv2 // If 0, all equations true, don't clear occluded flag
|
|
andi $20, $20, 0x000F // Bits 0-3 for vtx 1
|
|
andi $24, $24, ~CLIP_OCCLUDED // At least one eqn false, clear vtx 2 occluded flag
|
|
@@skipv2:
|
|
beqz $20, @@skipv1 // If 0, all equations true, don't clear occluded flag
|
|
sh $24, (VTX_CLIP )(secondVtxPos) // Store second vertex clip flags
|
|
andi $12, $12, ~CLIP_OCCLUDED // At least one eqn false, clear vtx 1 occluded flag
|
|
@@skipv1:
|
|
jr $ra
|
|
sh $12, (VTX_CLIP )(outputVtxPos) // Store first vertex results
|
|
|
|
// Convert packed vertex indices into vertex addresses.
//   vtx_addrs_from_cmd:  loads the eight bytes of the current command into
//                        $v27 as indices, then falls through.
//   vtx_indices_to_addr: expects the indices already in $v27.
// Return is via `jr $11`, NOT $ra: the caller must place the return address
// in $11. Side effects visible here: $v30 is reloaded from v30Value, $v29 is
// used as scratch, materialCullMode is zeroed, and $12 / $3 receive elements
// 3 and 7 of the result.
vtx_addrs_from_cmd:
|

// Treat eight bytes of last command each as vertex indices << 1
|

// inputBufferEnd is close enough to the end of DMEM to fit in signed offset
|

lpv $v27[0], (-(0x1000 - (inputBufferEnd - 0x08)))(inputBufferPos)
|

vtx_indices_to_addr:
|

// Input and output in $v27
|

// Also out elem 3 -> $12, elem 7 -> $3 because these are used more than once
|

lqv $v30, v30Value($zero)
|

vmudl $v29, $v27, $v30[1] // Multiply vtx indices times length
|

vmadn $v27, vOne, $v30[0] // Add address of vertex buffer
|

sb $zero, materialCullMode // This covers all tri cmds, vtx, modify vtx, branchZ, cull
|

mfc2 $12, $v27[6] // Elem 3 (byte offset 6) -> $12
|

jr $11 // Return address is passed in $11, not $ra
|

mfc2 $3, $v27[14] // Delay slot: elem 7 (byte offset 14) -> $3
|
|
|
|
// G_TRISTRIP / G_TRIFAN: draw a run of triangles from the index bytes of the
// current command. cmd_w0 is reused as a byte pointer that advances by one
// per triangle. $ra is both the loop return address handed to tri_main and a
// mode flag: for G_TRIFAN, 0x8000 is added so the value is negative.
// NOTE(review): jumping to an address with 0x8000 set presumably works
// because only the low bits of the PC are used - confirm.
// The loop exits to run_next_DL_command when the pointer runs off the end of
// the command or when the third index byte has its top bit set.
G_TRISTRIP_handler:
|

j tri_strip_fan_start
|

li $ra, tri_strip_fan_loop // Delay slot: positive $ra = strip mode
|

G_TRIFAN_handler:
|

li $ra, tri_strip_fan_loop + 0x8000 // Negative = flag for G_TRIFAN
|

tri_strip_fan_start:
|

addi cmd_w0, inputBufferPos, inputBufferEnd - 8 // Start pointing to cmd byte
|

tri_strip_fan_loop:
|

lw cmd_w1_dram, 0(cmd_w0) // Load tri indices to lower 3 bytes of word
|

addi $11, inputBufferPos, inputBufferEnd - 3 // Off end of command
|

beq $11, cmd_w0, run_next_DL_command // If off end of command, exit
|

sll $12, cmd_w1_dram, 24 // Put sign bit of vtx 3 in sign bit
|

bltz $12, run_next_DL_command // If negative, exit
|

sw cmd_w1_dram, 4(rdpCmdBufPtr) // Store non-shuffled indices
|

bltz $ra, tri_fan_store // Finish handling G_TRIFAN
|

addi cmd_w0, cmd_w0, 1 // Increment
|

andi $11, cmd_w0, 1 // If odd, this is the 1st/3rd/5th tri
|

bnez $11, tri_main // Draw as is
|

srl $12, cmd_w1_dram, 8 // Move vtx 2 to LSBs
|

sb cmd_w1_dram, 6(rdpCmdBufPtr) // Store vtx 3 to spot for 2
|

j tri_main
|

sb $12, 7(rdpCmdBufPtr) // Store vtx 2 to spot for 3
|

|

// Fan: vertex 1 of every triangle is the byte at offset 1 of the command
tri_fan_store:
|

lb $11, (inputBufferEnd - 7)(inputBufferPos) // Load vtx 1
|

j tri_main
|

sb $11, 5(rdpCmdBufPtr) // Store vtx 1
|
|
|
|
// Triangle command entry points.
//   G_TRI2 / G_QUAD: `jal tri_main` draws the triangle described by the second
//                    command word first; $ra then points at G_TRI1_handler, so
//                    returning falls into the first triangle.
//   G_TRI1:          draws the triangle in the first command word and returns
//                    to run_next_DL_command.
//   tri_main:        expects the three index bytes at bytes 5..7 of the RDP
//                    command buffer slot (stored there as a word at offset 4)
//                    and $ra = where to go when the triangle is finished.
// On reaching tri_noinit: $1, $2, $3 = addresses of vertices 1, 2, 3,
// $4 = copy of vertex 1 address, $v4 = copy of $v27, clipPolySelect = -1.
G_TRI2_handler:
|

G_QUAD_handler:
|

jal tri_main // Send second tri; return here for first tri
|

sw cmd_w1_dram, 4(rdpCmdBufPtr) // Put second tri indices in temp memory
|

G_TRI1_handler:
|

li $ra, run_next_DL_command // After done with this tri, run next cmd
|

sw cmd_w0, 4(rdpCmdBufPtr) // Put first tri indices in temp memory
|

tri_main:
|

lpv $v27[0], 0(rdpCmdBufPtr) // Load tri indexes to elems 5, 6, 7
|

j vtx_indices_to_addr // elem 7 -> $3; rest in $v27
|

li $11, tri_return_from_addrs // Delay slot: vtx_indices_to_addr returns via $11
|

tri_return_from_addrs:
|

mfc2 $1, $v27[10] // Elem 5 (byte offset 10) = vertex 1 address
|

vcopy $v4, $v27 // Need vtx 2 addr in $v4 elem 6
|

addi perfCounter2, perfCounter2, 0x4000 // Increment number of tris requested
|

mfc2 $2, $v27[12] // Elem 6 (byte offset 12) = vertex 2 address
|

move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading
|

li clipPolySelect, -1 // Normal tri drawing mode (check clip masks)
|
|
// Triangle setup, part 1: rejection tests and vertex sort.
// Inputs as used below: $1, $2, $3 = vertex addresses, $4 = vertex whose color
// is used for flat shading, $v27 elems 5 and 7 = vertex 1 and 3 addresses,
// $v4 elem 6 = vertex 2 address (per the vcopy comment at the call site),
// clipPolySelect negative = clipping allowed, $ra = continuation.
// Early exits through return_routine (jr $ra): all three vertices outside the
// same screen/camera plane, all three occluded, zero cross product, or culled
// by facing. Branches to ovl234_clipping_entrypoint when the triangle crosses
// the scaled bounds or camera plane and clipping is allowed.
// The scalar and vector instructions are deliberately interleaved; do not
// reorder them without re-checking every branch delay slot.
tri_noinit:
|

// ra is next cmd, second tri in TRI2, or middle of clipping
|

tV1AtF equ $v5
|

tV2AtF equ $v7
|

tV3AtF equ $v9
|

tV1AtI equ $v18
|

tV2AtI equ $v19
|

tV3AtI equ $v21
|

vnxor tV1AtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding
|

llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
|

vnxor tV2AtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding
|

llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
|

vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr
|

llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8
|

vnxor tV3AtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding
|

lhu $5, VTX_CLIP($1)
|

vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr
|

lhu $7, VTX_CLIP($2)
|

vadd $v2, vZero, $v6[1] // v2 all elems = y-coord of vertex 1
|

lhu $8, VTX_CLIP($3)
|

vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr)
|

lw $6, geometryModeLabel // Load full geometry mode word
|

vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr)
|

and $9, $5, $7
|

vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr)
|

and $9, $9, $8 // $9 = all clip bits which are true for all three verts
|

vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y
|

andi $11, $9, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
|

vmrg $v14, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
|

bnez $11, return_routine // Then the whole tri is offscreen, cull
|

or $5, $5, $7
|

vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
|

or $5, $5, $8 // $5 = all clip bits which are true for any verts
|

vmadh $v29, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
|

andi $5, $5, CLIP_SCAL_NPXY | CLIP_CAMPLANE // Does tri cross scaled bounds or cam plane?
|

vge $v2, $v2, $v4[1] // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y
|

sra $11, clipPolySelect, 31 // All 1s if negative, meaning clipping allowed
|

vmrg $v10, $v6, $v4 // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2)
|

and $5, $5, $11 // Clear this if clipping not allowed
|

vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y
|

bnez $5, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip
|

mfc2 $8, $v29[0] // elem 0 = x = cross product => lower 16 bits, sign extended
|

vmrg $v4, $v14, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3)
|

andi $9, $9, CLIP_OCCLUDED
|

vmrg $v14, $v8, $v14 // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2)
|

bnez $9, return_routine // Cull if all verts occluded
|

srl $11, $8, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
|

vlt $v6, $v6, $v2 // v6 (thrown out), VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
|

beqz $8, return_routine // If cross product is 0, tri is degenerate (zero area), cull.
|

addi $11, $11, 21 // = 21 if back facing, 22 if front facing
|

vor $v3, vZero, $v31[5] // 0x4000; some rounding factor
|

sllv $11, $6, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
|

vmrg $v2, $v4, $v10 // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2)
|

bltz $11, return_routine // Cull if bit is set (culled based on facing)
|

vmrg $v10, $v10, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3)
|

vmudn $v4, $v14, $v31[5] // 0x4000
|

mfc2 $1, $v14[12] // $v14 = lowest Y value = highest on screen (x, y, addr)
|

vsub $v6, $v2, $v14
|

mfc2 $2, $v2[12] // $v2 = mid vertex (x, y, addr)
|

vsub $v8, $v10, $v14
|

mfc2 $3, $v10[12] // $v10 = highest Y value = lowest on screen (x, y, addr)
|

vsub $v11, $v14, $v2
|

vsub $v12, $v14, $v10 // VH - VL (negative)
|

llv $v13[0], VTX_INV_W_VEC($1)
|

vsub $v15, $v10, $v2
|

llv $v13[8], VTX_INV_W_VEC($2)
|

vmudh $v16, $v6, $v8[0]
|

llv $v13[12], VTX_INV_W_VEC($3)
|

vmadh $v16, $v8, $v11[0]
|

lpv tV1AtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
|

vreadacc $v17, ACC_UPPER
|

lpv tV2AtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
|

vreadacc $v16, ACC_MIDDLE
|

lpv tV3AtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
|

vmov $v15[2], $v6[0]
|

lpv $v25[0], VTX_COLOR_VEC($4) // Load RGB from vertex 4 (flat shading vtx)
|

vrcp $v20[0], $v15[1]
|

sll $11, $6, 10 // Moves the value of G_SHADING_SMOOTH into the sign bit
|

vrcph $v22[0], $v17[1]
|

// TODO If everything else is done and we still have an instruction to spare,
|

// this will prevent a hang if G_TEXTURE_ENABLE is set in the geometry mode
|

//andi $6, $6, (G_SHADE | G_ZBUFFER)
|

vrcpl $v23[1], $v16[1]
|

bltz $11, tri_skip_flat_shading // Branch if G_SHADING_SMOOTH is set
|

vrcph $v24[1], vZero[0]
|

// Flat shading: replace RGB of all three vertices with the RGB loaded from $4
vlt $v29, $v31, $v31[3] // Set vcc to 11100000
|

vmrg tV1AtI, $v25, tV1AtI // RGB from $4, alpha from $1
|

vmrg tV2AtI, $v25, tV2AtI // RGB from $4, alpha from $2
|

vmrg tV3AtI, $v25, tV3AtI // RGB from $4, alpha from $3
|
|
// Triangle setup, part 2: more reciprocals, vertex colors shifted right by 8,
// and the optional alpha-compare cull.
// The scalar sub/sra/and/sub sequence computes a branchless signed maximum:
// after it $5 = max($5, $7), where $5, $7, $8 are the words loaded from
// VTX_INV_W_VEC of the three vertices (ignoring overflow of the subtraction).
// alphaCompareCullMode == 0 skips the cull; > 0 compares the maximum alpha of
// the three vertices with alphaCompareCullThresh, < 0 compares the minimum.
// The delay slot of the final bltz is the first instruction of
// tri_skip_alpha_compare_cull.
tri_skip_flat_shading:
|

vrcp $v20[2], $v6[1]
|

lb $20, alphaCompareCullMode($zero)
|

vrcph $v22[2], $v6[1]
|

lw $5, VTX_INV_W_VEC($1)
|

vrcp $v20[3], $v8[1]
|

lw $7, VTX_INV_W_VEC($2)
|

vrcph $v22[3], $v8[1]
|

lw $8, VTX_INV_W_VEC($3)
|

vmudl tV1AtI, tV1AtI, $v30[3] // 0x0100; vertex color 1 >>= 8
|

lbu $9, textureSettings1 + 3
|

vmudl tV2AtI, tV2AtI, $v30[3] // 0x0100; vertex color 2 >>= 8
|

sub $11, $5, $7 // Difference of the first two loaded 1/W words
|

vmudl tV3AtI, tV3AtI, $v30[3] // 0x0100; vertex color 3 >>= 8
|

sra $12, $11, 31 // All 1s if the difference is negative, else 0
|

vmov $v15[3], $v8[0]
|

and $11, $11, $12 // Keep the difference only if it was negative
|

vmudl $v29, $v20, $v30[7] // 0x0020
|

beqz $20, tri_skip_alpha_compare_cull
|

sub $5, $5, $11 // Delay slot: $5 = max($5, $7)
|

// Alpha compare culling
|

vge $v26, tV1AtI, tV2AtI
|

lbu $19, alphaCompareCullThresh
|

vlt $v27, tV1AtI, tV2AtI
|

bgtz $20, @@skip1
|

vge $v26, $v26, tV3AtI // If alphaCompareCullMode > 0, $v26 = max of 3 verts
|

vlt $v26, $v27, tV3AtI // else if < 0, $v26 = min of 3 verts
|

@@skip1: // $v26 elem 3 has max or min alpha value
|

mfc2 $24, $v26[6]
|

sub $24, $24, $19 // sign bit set if (max/min) < thresh
|

xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull
|

bltz $24, return_routine // if max < thresh or if min >= thresh.
|
|
// Triangle setup, part 3: edge coefficients, command header and texture
// coordinate preparation.
// Scalar side, as visible below:
//   - finishes the branchless maximum so $5 = max($5, $8), stores it to
//     offset 0x10 of the RDP command slot and reloads it into $v27,
//   - builds byte 0 of the RDP command from the geometry mode in $6,
//     G_TRI_FILL and the byte at textureSettings1 + 3 held in $9,
//   - builds byte 1 from bit 7 of $v17 elem 1 and textureSettings1 + 2,
//   - stores the three Y coordinates at offsets 2, 4, 6.
// If $9 is zero the texture block is skipped by branching to tri_skip_tex.
// The first instruction also executes as the delay slot of the `bltz` that
// ends the alpha-compare code directly above.
tri_skip_alpha_compare_cull:
|

vmadm $v22, $v22, $v30[7] // 0x0020
|

sub $11, $5, $8 // Difference against the third loaded 1/W word
|

vmadn $v20, vZero, vZero[0]
|

sra $12, $11, 31 // All 1s if the difference is negative, else 0
|

vmudm $v25, $v15, $v30[2] // 0x1000
|

and $11, $11, $12 // Keep the difference only if it was negative
|

vmadn $v15, vZero, vZero[0]
|

sub $5, $5, $11 // $5 = max($5, $8)
|

vsubc $v4, vZero, $v4
|

sw $5, 0x0010(rdpCmdBufPtr) // Temporarily park the maximum in the command slot
|

vsub $v26, vZero, vZero
|

llv $v27[0], 0x0010(rdpCmdBufPtr) // ...and reload it into $v27 elems 0, 1
|

vmudm $v29, $v25, $v20
|

mfc2 $5, $v17[1]
|

vmadl $v29, $v15, $v20
|

lbu $7, textureSettings1 + 2
|

vmadn $v20, $v15, $v22
|

lsv tV2AtI[14], VTX_SCR_Z($2)
|

vmadh $v15, $v25, $v22
|

lsv tV3AtI[14], VTX_SCR_Z($3)
|

vmudl $v29, $v23, $v16
|

lsv tV2AtF[14], VTX_SCR_Z_FRAC($2)
|

vmadm $v29, $v24, $v16
|

lsv tV3AtF[14], VTX_SCR_Z_FRAC($3)
|

vmadn $v16, $v23, $v17
|

ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
|

vmadh $v17, $v24, $v17
|

or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
|

vand $v22, $v20, $v30[5] // 0xFFF8
|

vcr $v15, $v15, $v30[3] // 0x0100
|

sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
|

vmudh $v29, vOne, $v30[6] // 0x0010
|

ssv $v10[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient
|

vmadn $v16, $v16, $v30[4] // -16
|

ssv $v2[2], 0x0004(rdpCmdBufPtr) // Store YM edge coefficient
|

vmadh $v17, $v17, $v30[4] // -16
|

ssv $v14[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient
|

vmudn $v29, $v3, $v14[0]
|

andi $12, $5, 0x0080 // Extract the left major flag from $5
|

vmadl $v29, $v22, $v4[1]
|

or $12, $12, $7 // Combine the left major flag with the level and tile from the texture settings
|

vmadm $v29, $v15, $v4[1]
|

sb $12, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
|

vmadn $v2, $v22, $v26[1]
|

beqz $9, tri_skip_tex // If textures are not enabled, skip texture coefficient calculation
|

vmadh $v3, $v15, $v26[1]
|

vrcph $v29[0], $v27[0]
|

vrcpl $v10[0], $v27[1]
|

vadd $v14, vZero, $v13[1q]
|

vrcph $v27[0], vZero[0]
|

vor $v22, vZero, $v31[7] // 0x7FFF
|

vmudm $v29, $v13, $v10[0]
|

vmadl $v29, $v14, $v10[0]
|

llv $v22[0], VTX_TC_VEC($1)
|

vmadn $v14, $v14, $v27[0]
|

llv $v22[8], VTX_TC_VEC($2)
|

vmadh $v13, $v13, $v27[0]
|

vor $v10, vZero, $v31[7] // 0x7FFF
|

vge $v29, $v30, $v30[7] // Set VCC to 11110001; select RGBA___Z or ____STW_
|

llv $v10[8], VTX_TC_VEC($3)
|

vmudm $v29, $v22, $v14[0h]
|

vmadh $v22, $v22, $v13[0h]
|

vmadn $v25, vZero, vZero[0]
|

vmudm $v29, $v10, $v14[6] // acc = (v10 * v14[6]); v29 = mid(clamp(acc))
|

vmadh $v10, $v10, $v13[6] // acc += (v10 * v13[6]) << 16; v10 = mid(clamp(acc))
|

vmadn $v13, vZero, vZero[0] // v13 = lo(clamp(acc))
|

sdv $v22[0], 0x0020(rdpCmdBufPtr)
|

vmrg tV2AtI, tV2AtI, $v22 // Merge S, T, W into elems 4-6
|

sdv $v25[0], 0x0028(rdpCmdBufPtr) // 8
|

vmrg tV2AtF, tV2AtF, $v25 // Merge S, T, W into elems 4-6
|

ldv tV1AtI[8], 0x0020(rdpCmdBufPtr) // 8
|

vmrg tV3AtI, tV3AtI, $v10 // Merge S, T, W into elems 4-6
|

ldv tV1AtF[8], 0x0028(rdpCmdBufPtr) // 8
|

vmrg tV3AtF, tV3AtF, $v13 // Merge S, T, W into elems 4-6
|
|
// Triangle setup, part 4: attribute gradients and the RDP command stores.
// Pointer bookkeeping, as computed below:
//   $2 = rdpCmdBufPtr + 0x20            (after the edge coefficients)
//   $1 = $2 + ((geometry mode & G_SHADE) << 4)
//   rdpCmdBufPtr = $1 + ($9 << 5) + ((geometry mode & G_ZBUFFER) << 4)
// Shade data is stored relative to $2 and texture data relative to $1, with no
// test on whether each section is enabled; the Z data is written at negative
// offsets from the final rdpCmdBufPtr, and only when G_ZBUFFER is set.
// NOTE(review): a disabled section has zero size, so its stores presumably
// land where the next section or next command is written afterwards - confirm.
// Also increments perfCounter1 (tris sent to RDP). Leaves by jumping to
// check_rdp_buffer_full with $ra unchanged.
tri_skip_tex:
|

vmudl $v29, $v16, $v23
|

lsv tV1AtF[14], VTX_SCR_Z_FRAC($1)
|

vmadm $v29, $v17, $v23
|

lsv tV1AtI[14], VTX_SCR_Z($1)
|

vmadn $v23, $v16, $v24
|

lh $1, VTX_SCR_VEC($2)
|

vmadh $v24, $v17, $v24
|

addi $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients)
|

// tV*At* contains R, G, B, A, S, T, W, Z. tD31* = vtx 3 - vtx 1, tD21* = vtx 2 - vtx 1
|

tD31F equ $v10
|

tD31I equ $v9
|

tD21F equ $v13
|

tD21I equ $v7
|

vsubc tD31F, tV3AtF, tV1AtF
|

andi $3, $6, G_SHADE
|

vsub tD31I, tV3AtI, tV1AtI
|

sll $1, $1, 14
|

vsubc tD21F, tV2AtF, tV1AtF
|

sw $1, 0x0008(rdpCmdBufPtr) // Store XL edge coefficient
|

vsub tD21I, tV2AtI, tV1AtI
|

ssv $v3[6], 0x0010(rdpCmdBufPtr) // Store XH edge coefficient (integer part)
|

// DaDx = (v3 - v1) * factor + (v2 - v1) * factor
|

tDaDxF equ $v2
|

tDaDxI equ $v3
|

vmudn $v29, tD31F, $v6[1]
|

ssv $v2[6], 0x0012(rdpCmdBufPtr) // Store XH edge coefficient (fractional part)
|

vmadh $v29, tD31I, $v6[1]
|

ssv $v3[4], 0x0018(rdpCmdBufPtr) // Store XM edge coefficient (integer part)
|

vmadn $v29, tD21F, $v12[1]
|

ssv $v2[4], 0x001A(rdpCmdBufPtr) // Store XM edge coefficient (fractional part)
|

vmadh $v29, tD21I, $v12[1]
|

ssv $v15[0], 0x000C(rdpCmdBufPtr) // Store DxLDy edge coefficient (integer part)
|

vreadacc tDaDxF, ACC_MIDDLE
|

ssv $v20[0], 0x000E(rdpCmdBufPtr) // Store DxLDy edge coefficient (fractional part)
|

vreadacc tDaDxI, ACC_UPPER
|

ssv $v15[6], 0x0014(rdpCmdBufPtr) // Store DxHDy edge coefficient (integer part)
|

// DaDy = (v2 - v1) * factor + (v3 - v1) * factor
|

tDaDyF equ $v6
|

tDaDyI equ $v7
|

vmudn $v29, tD21F, $v8[0]
|

ssv $v20[6], 0x0016(rdpCmdBufPtr) // Store DxHDy edge coefficient (fractional part)
|

vmadh $v29, tD21I, $v8[0]
|

ssv $v15[4], 0x001C(rdpCmdBufPtr) // Store DxMDy edge coefficient (integer part)
|

vmadn $v29, tD31F, $v11[0]
|

ssv $v20[4], 0x001E(rdpCmdBufPtr) // Store DxMDy edge coefficient (fractional part)
|

vmadh $v29, tD31I, $v11[0]
|

sll $11, $3, 4 // Shift (geometry mode & G_SHADE) by 4 to get 0x40 if G_SHADE is set
|

vreadacc tDaDyF, ACC_MIDDLE
|

add $1, $2, $11 // Increment the triangle pointer by 0x40 bytes (shade coefficients) if G_SHADE is set
|

vreadacc tDaDyI, ACC_UPPER
|

sll $11, $9, 5 // Shift texture enabled (which is 2 when on) by 5 to get 0x40 if textures are on
|

// DaDx, DaDy *= more factors
|

vmudl $v29, tDaDxF, $v23[1]
|

add rdpCmdBufPtr, $1, $11 // Increment the triangle pointer by 0x40 bytes (texture coefficients) if textures are on
|

vmadm $v29, tDaDxI, $v23[1]
|

andi $6, $6, G_ZBUFFER // Get the value of G_ZBUFFER from the current geometry mode
|

vmadn tDaDxF, tDaDxF, $v24[1]
|

sll $11, $6, 4 // Shift (geometry mode & G_ZBUFFER) by 4 to get 0x10 if G_ZBUFFER is set
|

vmadh tDaDxI, tDaDxI, $v24[1]
|

add rdpCmdBufPtr, rdpCmdBufPtr, $11 // Increment the triangle pointer by 0x10 bytes (depth coefficients) if G_ZBUFFER is set
|

vmudl $v29, tDaDyF, $v23[1]
|

addi perfCounter1, perfCounter1, 1 // Increment number of tris sent to RDP
|

vmadm $v29, tDaDyI, $v23[1]
|

vmadn tDaDyF, tDaDyF, $v24[1]
|

sdv tDaDxF[0], 0x0018($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (fractional)
|

vmadh tDaDyI, tDaDyI, $v24[1]
|

sdv tDaDxI[0], 0x0008($2) // Store DrDx, DgDx, DbDx, DaDx shade coefficients (integer)
|

// DaDe = DaDx * factor
|

tDaDeF equ $v8
|

tDaDeI equ $v9
|

vmadl $v29, tDaDxF, $v20[3]
|

sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional)
|

vmadm $v29, tDaDxI, $v20[3]
|

sdv tDaDxI[8], 0x0008($1) // Store DsDx, DtDx, DwDx texture coefficients (integer)
|

vmadn tDaDeF, tDaDxF, $v15[3]
|

sdv tDaDyF[0], 0x0038($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (fractional)
|

vmadh tDaDeI, tDaDxI, $v15[3]
|

sdv tDaDyI[0], 0x0028($2) // Store DrDy, DgDy, DbDy, DaDy shade coefficients (integer)
|

// Base value += DaDe * factor
|

vmudn $v29, tV1AtF, vOne[0]
|

sdv tDaDyF[8], 0x0038($1) // Store DsDy, DtDy, DwDy texture coefficients (fractional)
|

vmadh $v29, tV1AtI, vOne[0]
|

sdv tDaDyI[8], 0x0028($1) // Store DsDy, DtDy, DwDy texture coefficients (integer)
|

vmadl $v29, tDaDeF, $v4[1]
|

sdv tDaDeF[0], 0x0030($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (fractional)
|

vmadm $v29, tDaDeI, $v4[1]
|

sdv tDaDeI[0], 0x0020($2) // Store DrDe, DgDe, DbDe, DaDe shade coefficients (integer)
|

vmadn tV1AtF, tDaDeF, $v26[1]
|

sdv tDaDeF[8], 0x0030($1) // Store DsDe, DtDe, DwDe texture coefficients (fractional)
|

vmadh tV1AtI, tDaDeI, $v26[1]
|

sdv tDaDeI[8], 0x0020($1) // Store DsDe, DtDe, DwDe texture coefficients (integer)
|

tV1AtFF equ $v10
|

vmudn tV1AtFF, tDaDeF, $v4[1] // Super-frac (frac * frac) part; assumes v4 factor >= 0
|

vmudn tDaDeF, tDaDeF, $v30[7] // 0x0020
|

vmadh tDaDeI, tDaDeI, $v30[7] // 0x0020
|

sdv tV1AtF[0], 0x0010($2) // Store RGBA shade color (fractional)
|

vmudn tDaDxF, tDaDxF, $v30[7] // 0x0020
|

sdv tV1AtI[0], 0x0000($2) // Store RGBA shade color (integer)
|

vmadh tDaDxI, tDaDxI, $v30[7] // 0x0020
|

sdv tV1AtF[8], 0x0010($1) // Store S, T, W texture coefficients (fractional)
|

vmudn tDaDyF, tDaDyF, $v30[7] // 0x0020
|

beqz $6, check_rdp_buffer_full // No G_ZBUFFER: done, skip the Z coefficient stores below
|

sdv tV1AtI[8], 0x0000($1) // Store S, T, W texture coefficients (integer)
|

// Z coefficients: element 7 of each vector, written just below the new rdpCmdBufPtr
vmadh tDaDyI, tDaDyI, $v30[7] // 0x0020
|

ssv tDaDeF[14], -0x0006(rdpCmdBufPtr)
|

vmudl $v29, tV1AtFF, $v30[7] // 0x0020
|

ssv tDaDeI[14], -0x0008(rdpCmdBufPtr)
|

vmadn tV1AtF, tV1AtF, $v30[7] // 0x0020
|

ssv tDaDxF[14], -0x000A(rdpCmdBufPtr)
|

vmadh tV1AtI, tV1AtI, $v30[7] // 0x0020
|

ssv tDaDxI[14], -0x000C(rdpCmdBufPtr)
|

ssv tDaDyF[14], -0x0002(rdpCmdBufPtr)
|

ssv tDaDyI[14], -0x0004(rdpCmdBufPtr)
|

ssv tV1AtF[14], -0x000E(rdpCmdBufPtr)
|

j check_rdp_buffer_full // eventually returns to $ra, which is next cmd, second tri in TRI2, or middle of clipping
|

ssv tV1AtI[14], -0x10(rdpCmdBufPtr)
|
|
|
|
// Overlay loaders. All paths end in load_overlay_inner, which adds the ucode
// base from the OSTask to cmd_w1_dram, starts the DMA through dma_read_write,
// sets $ra = postOvlRA and falls through into while_wait_dma_busy, so control
// reaches postOvlRA once the DMA has finished.
//   load_overlay_0_and_enter / G_LOAD_UCODE_handler:
//       loads overlay 0 and continues at 0x1000.
//   load_overlays_0_1:   caller sets cmd_w1_dram and postOvlRA; target address
//                        0x1000, length ovl01_end - 0x1000.
//   load_overlays_2_3_4: caller sets cmd_w1_dram and postOvlRA; target address
//                        ovl234_start, length ovl234_end - ovl234_start.
// dmaLen is always (length - 1). $11 is clobbered.
load_overlay_0_and_enter:
|

G_LOAD_UCODE_handler:
|

li postOvlRA, 0x1000 // Sets up return address
|

li cmd_w1_dram, orga(ovl0_start) // Sets up ovl0 table address
|

// To use these: set postOvlRA ($12) to the address to execute after the load is
|

// done, and set cmd_w1_dram to orga(your_overlay).
|

load_overlays_0_1:
|

li dmaLen, ovl01_end - 0x1000 - 1
|

j load_overlay_inner
|

li dmemAddr, 0x1000 // Delay slot: destination address of overlays 0/1
|

load_overlays_2_3_4:
|

li dmaLen, ovl234_end - ovl234_start - 1
|

li dmemAddr, ovl234_start
|

load_overlay_inner:
|

lw $11, OSTask + OSTask_ucode // DRAM base address of this microcode
|

jal dma_read_write
|

add cmd_w1_dram, cmd_w1_dram, $11 // Delay slot: file offset -> DRAM address
|

move $ra, postOvlRA // while_wait_dma_busy below returns to the requested address
|

// Fall through to while_wait_dma_busy
|
|
|
|
totalImemUseUpTo1FC8:
|
|
|
|
.if . > 0x1FC8
|
|
.error "Constraints violated on what can be overwritten at end of ucode (relevant for G_LOAD_UCODE)"
|
|
.endif
|
|
.org 0x1FC8
|
|
|
|
// Spin until SP_DMA_BUSY reads zero, then return to $ra. Clobbers $11.
// return_routine is a bare `jr $ra` that other code reaches through
// conditional branches. No instruction follows the `jr` here, so its delay
// slot is the first instruction of dma_read_write (an mfc0 into $11).
while_wait_dma_busy:
|

mfc0 $11, SP_DMA_BUSY // Load the DMA_BUSY value
|

while_dma_busy:
|

bnez $11, while_dma_busy // Loop until DMA_BUSY is cleared
|

mfc0 $11, SP_DMA_BUSY // Update DMA_BUSY value
|

// This routine is used to return via conditional branch
|

return_routine:
|

jr $ra
|
|
|
|
// Start one DMA transfer and return immediately (it does not wait for
// completion; use while_wait_dma_busy for that).
// Inputs:  dmemAddr    - value written to SP_MEM_ADDR; negative selects a write
//          cmd_w1_dram - value written to SP_DRAM_ADDR
//          dmaLen      - value written to SP_RD_LEN or SP_WR_LEN (callers pass
//                        length - 1)
// Waits first until SP_DMA_FULL reads zero. Clobbers $11. Returns via $ra.
dma_read_write:
|

mfc0 $11, SP_DMA_FULL // load the DMA_FULL value
|

while_dma_full:
|

bnez $11, while_dma_full // Loop until DMA_FULL is cleared
|

mfc0 $11, SP_DMA_FULL // Update DMA_FULL value
|

mtc0 dmemAddr, SP_MEM_ADDR // Set the DMEM address to DMA from/to
|

bltz dmemAddr, dma_write // If the DMEM address is negative, this is a DMA write, if not read
|

mtc0 cmd_w1_dram, SP_DRAM_ADDR // Set the DRAM address to DMA from/to
|

jr $ra
|

mtc0 dmaLen, SP_RD_LEN // Initiate a DMA read with a length of dmaLen
|

dma_write:
|

jr $ra
|

mtc0 dmaLen, SP_WR_LEN // Initiate a DMA write with a length of dmaLen
|
|
|
|
.if . > 0x00002000
|
|
.error "Not enough room in IMEM"
|
|
.endif
|
|
|
|
.headersize 0x00001000 - orga()
|
|
|
|
// Overlay 0 handles three cases of stopping the current microcode.
|
|
// The action here is controlled by $1. If yielding, $1 > 0. If this was
|
|
// G_LOAD_UCODE, $1 == 0. If we got to the end of the parent DL, $1 < 0.
|
|
// Overlay 0 entry. $1 selects the action: < 0 task done, > 0 yield,
// == 0 G_LOAD_UCODE (falls into load_ucode).
// Common prologue: conditionally calls flush_rdp_buffer, stores both perf
// counters into yieldDataFooter, waits for DMA to finish and writes
// rdpFifoPos to DPC_END.
// NOTE(review): the bgezal condition looks like "RDP command buffer is not
// empty" - confirm against the definitions of rdpCmdBufEnd / RDP_CMD_BUFSIZE.
ovl0_start:
|

sub $11, rdpCmdBufPtr, rdpCmdBufEnd
|

addi $12, $11, RDP_CMD_BUFSIZE - 1
|

bgezal $12, flush_rdp_buffer
|

sw perfCounter1, yieldDataFooter + 0x0 // Stored here for yield and done
|

sw perfCounter2, yieldDataFooter + 0x4 // otherwise this is temp memory
|

jal while_wait_dma_busy
|

lw $24, rdpFifoPos
|

bltz $1, task_done // $1 < 0 = Got to the end of the parent DL
|

mtc0 $24, DPC_END // Set the end pointer of the RDP so that it starts the task
|

bnez $1, task_yield // $1 > 0 = CPU requested yield
|

add taskDataPtr, taskDataPtr, inputBufferPos // inputBufferPos <= 0; taskDataPtr was where in the DL after the current chunk loaded
|

// G_LOAD_UCODE: DMA the new microcode's code over IMEM from `start` up to
// (not including) while_wait_dma_busy, then its data over DMEM from
// endSharedDMEM, then jump to `start` once the DMA has finished.
load_ucode:
|

lw cmd_w1_dram, (inputBufferEnd - 0x04)(inputBufferPos) // word 1 = ucode code DRAM addr
|

sw taskDataPtr, OSTask + OSTask_data_ptr // Store where we are in the DL
|

sw cmd_w1_dram, OSTask + OSTask_ucode // Store pointer to new ucode about to execute
|

sw perfCounter1, textureSettings1 // Store counters in texture settings; first 0x180 of DMEM
|

sw perfCounter2, textureSettings2 // will be preserved in ucode swap AND if other ucode yields
|

li dmemAddr, start // Beginning of overwritable part of IMEM
|

jal dma_read_write // DMA DRAM read -> IMEM write
|

li dmaLen, (while_wait_dma_busy - start) - 1 // End of overwritable part of IMEM
|

lw cmd_w1_dram, rdpHalf1Val // Get DRAM address of ucode data from rdpHalf1Val
|

li dmemAddr, endSharedDMEM // DMEM address is endSharedDMEM
|

andi dmaLen, cmd_w0, 0x0FFF // Extract DMEM length from command word
|

add cmd_w1_dram, cmd_w1_dram, dmemAddr // Start overwriting data from endSharedDMEM
|

jal dma_read_write // initiate DMA read
|

sub dmaLen, dmaLen, dmemAddr // End that much before the end of DMEM
|

j while_wait_dma_busy
|

// Jumping to actual start of new ucode, which normally zeros vZero. Not sure why later ucodes
|

// jumped one instruction in.
|

li $ra, start
|
|
|
|
.if . > start
|
|
.error "ovl0_start does not fit within the space before the start of the ucode loaded with G_LOAD_UCODE"
|
|
.endif
|
|
|
|
// Yield: write OS_YIELD_DATA_SIZE bytes of DMEM, starting at address 0, to the
// yield data pointer from the OSTask, after saving the display list position
// and the current ucode pointer in yieldDataFooter + 0x8 / + 0xC.
// dma_read_write is entered with `j` and $ra = set_status_and_break, so it
// returns there with $12 = SP_SET_SIG1 | SP_SET_SIG2.
task_yield:
|

lw $11, OSTask + OSTask_ucode // Save pointer to current ucode
|

lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr
|

li dmemAddr, 0x8000 // 0, but negative = write
|

li dmaLen, OS_YIELD_DATA_SIZE - 1
|

li $12, SP_SET_SIG1 | SP_SET_SIG2 // yielded and task done signals
|

sw taskDataPtr, yieldDataFooter + 0x8 // Save pointer to where in DL
|

sw $11, yieldDataFooter + 0xC
|

j dma_read_write
|

li $ra, set_status_and_break // Delay slot: dma_read_write returns to set_status_and_break
|
|
|
|
// Task finished: write only the yieldDataFooter region (YIELD_DATA_FOOTER_SIZE
// bytes) to the same offset inside the yield data buffer, then fall into
// set_status_and_break with $12 = SP_SET_SIG2.
// set_status_and_break writes $12 to SP_STATUS and executes `break`.
task_done:
|

// Copy just the part of the yield data that has the perf counters.
|

lw cmd_w1_dram, OSTask + OSTask_yield_data_ptr
|

addi cmd_w1_dram, cmd_w1_dram, yieldDataFooter
|

li dmemAddr, 0x8000 | yieldDataFooter // negative = write
|

li dmaLen, YIELD_DATA_FOOTER_SIZE - 1
|

jal dma_read_write
|

li $12, SP_SET_SIG2 // task done signal
|

set_status_and_break: // $12 is the status to set
|

mtc0 $12, SP_STATUS
|

break 0
|

nop
|
|
|
|
ovl0_end:
|
|
.align 8
|
|
ovl0_padded_end:
|
|
|
|
.if ovl0_padded_end > ovl01_end
|
|
.error "Automatic resizing for overlay 0 failed"
|
|
.endif
|
|
|
|
// overlay 1 (0x170 bytes loaded into 0x1000)
|
|
.headersize 0x00001000 - orga()
|
|
|
|
ovl1_start:
|
|
|
|
// G_POPMTX: move the matrix stack pointer down by the byte count in the second
// command word, clamped so it never goes below the DRAM stack base from the
// OSTask. If the pointer did not move, continue with the next command.
// Otherwise invalidate mITValid and jump to do_movemem with cmd_w1_dram = the
// new stack pointer.
G_POPMTX_handler:
|

lw $11, matrixStackPtr // Get the current matrix stack pointer
|

lw $2, OSTask + OSTask_dram_stack // Read the location of the dram stack
|

sub cmd_w1_dram, $11, cmd_w1_dram // Decrease the matrix stack pointer by the amount passed in the second command word
|

sub $1, cmd_w1_dram, $2 // Subtraction to check if the new pointer is greater than or equal to $2
|

bgez $1, do_popmtx // If the new matrix stack pointer is greater than or equal to $2, then use the new pointer as is
|

nop
|

move cmd_w1_dram, $2 // If the new matrix stack pointer is less than $2, then use $2 as the pointer instead
|

do_popmtx:
|

beq cmd_w1_dram, $11, run_next_DL_command // If no bytes were popped, then we don't need to mark the mvp matrix as being out of date and can run the next command
|

sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer with the new value
|

j do_movemem
|

sb $zero, mITValid // Delay slot: clear the mITValid flag
|
|
|
|
// G_MTX, G_MOVEMEM and G_SETOTHERMODE_H/L handlers. These share instructions
// through fall-through and delay slots, so the instruction order and the
// 4-byte spacing of the two G_SETOTHERMODE labels must not change:
//   - G_MTX optionally pushes the current matrix to the DRAM matrix stack,
//     then falls into G_MOVEMEM_handler.
//   - G_MOVEMEM_handler converts the address with segmented_to_physical
//     (do_movemem is the jal's delay slot), looks up the DMEM destination in
//     movememTable and the continuation in movememHandlerTable, and starts
//     the DMA with `j dma_read_write`.
//   - The delay slot of that `j` is the instruction at
//     G_SETOTHERMODE_H_handler, which is a no-op for the othermode path.
//   - G_SETOTHERMODE_*: $11 (the handler address) indexes othermode0 or
//     othermode1; a bit mask built from cmd_w0 is cleared and cmd_w1_dram is
//     ORed in, then both words are sent through G_RDP_handler.
G_MTX_handler:
|

// The lower 3 bits of G_MTX are, from LSb to MSb (0 value/1 value),
|

// matrix type (modelview/projection)
|

// load type (multiply/load)
|

// push type (nopush/push)
|

// In F3DEX2 (and by extension F3DZEX), G_MTX_PUSH is inverted, so 1 is nopush and 0 is push
|

andi $11, cmd_w0, G_MTX_P_MV | G_MTX_NOPUSH_PUSH // Read the matrix type and push type flags into $11
|

bnez $11, load_mtx // If the matrix type is projection or this is not a push, skip pushing the matrix
|

andi $2, cmd_w0, G_MTX_MUL_LOAD // Read the matrix load type into $2 (0 is multiply, 2 is load)
|

lw cmd_w1_dram, matrixStackPtr // Set up the DMA from dmem to rdram at the matrix stack pointer
|

li dmemAddr, -0x2000 // Negative DMEM address = DMA write (see dma_read_write)
|

jal dma_read_write // DMA the current matrix from dmem to rdram
|

li dmaLen, 0x0040 - 1 // Set the DMA length to the size of a matrix (minus 1 because DMA is inclusive)
|

addi cmd_w1_dram, cmd_w1_dram, 0x40 // Increase the matrix stack pointer by the size of one matrix
|

sw cmd_w1_dram, matrixStackPtr // Update the matrix stack pointer
|

lw cmd_w1_dram, (inputBufferEnd - 4)(inputBufferPos) // Load command word 1 again
|

load_mtx:
|

add $7, $7, $2 // Add the load type to the command byte in $7, selects the return address based on whether the matrix needs multiplying or just loading
|

sb $zero, mITValid
|

G_MOVEMEM_handler:
|

jal segmented_to_physical // convert the segmented address in cmd_w1_dram to a DRAM address
|

do_movemem:
|

andi $1, cmd_w0, 0x00FE // Move the movemem table index into $1 (bits 1-7 of the first command word)
|

lbu dmaLen, (inputBufferEnd - 0x07)(inputBufferPos) // Move the second byte of the first command word into dmaLen
|

lhu dmemAddr, (movememTable)($1) // Load the address of the memory location for the given movemem index
|

srl $2, cmd_w0, 5 // ((w0) >> 8) << 3; top 3 bits of idx must be 0; lower 1 bit of len byte must be 0
|

lh $ra, (movememHandlerTable - (G_POPMTX | 0xFF00))($7) // Loads the return address from movememHandlerTable based on command byte
|

j dma_read_write
|

G_SETOTHERMODE_H_handler: // These handler labels must be 4 bytes apart for the code below to work
|

add dmemAddr, dmemAddr, $2 // This is for the code above, does nothing for G_SETOTHERMODE_H
|

G_SETOTHERMODE_L_handler:
|

lw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11) // resolves to othermode0 or othermode1 based on which handler was jumped to
|

lui $2, 0x8000 // Start with only the top bit set
|

srav $2, $2, cmd_w0 // Arithmetic shift by the low 5 bits of cmd_w0 smears the top bit downwards
|

srl $1, cmd_w0, 8
|

srlv $2, $2, $1 // Logical shift by the next byte of cmd_w0 positions the run of ones
|

nor $2, $2, $zero // Invert: zeros mark the field being replaced
|

and $3, $3, $2 // Clear the field in the stored othermode word
|

or $3, $3, cmd_w1_dram // Insert the new bits from the second command word
|

sw $3, (othermode0 - G_SETOTHERMODE_H_handler)($11)
|

lw cmd_w0, otherMode0
|

j G_RDP_handler
|

lw cmd_w1_dram, otherMode1
|
|
|
|
// Save both words of a G_SETSCISSOR or G_RDPSETOTHERMODE command in DMEM and
// forward the command to the RDP. $12 (0 or 2 on entry) is scaled to a byte
// offset of 0 or 8 from scissorUpLeft / scissorBottomRight. Clobbers $12.
scissor_other_handler: // $12 is 0 for G_SETSCISSOR or 2 for G_RDPSETOTHERMODE
|

sll $12, $12, 2 // Now 0 or 8
|

sw cmd_w0, (scissorUpLeft)($12) // otherMode0 = scissorUpLeft + 8
|

j G_RDP_handler // Send the command to the RDP
|

sw cmd_w1_dram, (scissorBottomRight)($12) // otherMode1 = scissorBottomRight + 8
|
|
|
|
// G_GEOMETRYMODE: geometry mode = (geometry mode AND cmd_w0) OR cmd_w1_dram.
// The load/store address is written relative to $7 so the same instructions
// resolve to geometryModeLabel when $7 holds the command byte as a negative
// value. Clobbers $11.
G_GEOMETRYMODE_handler: // $7 = G_GEOMETRYMODE (as negative) if jumped here
|

lw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value
|

and $11, $11, cmd_w0 // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
|

or $11, $11, cmd_w1_dram // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
|

j run_next_DL_command // run the next DL command
|

sw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // update the geometry mode value
|
|
|
|
// G_TEXTURE, G_TEXRECT, G_TEXRECTFLIP and G_RDPHALF_1 share two stores that
// are addressed relative to $11, which holds the handler's own address (see
// the label comment below). G_TEXTURE overrides $11 so the stores land in
// textureSettings instead. G_RDPHALF_1 enters after the first store and so
// only saves its second word. All paths continue with run_next_DL_command.
G_TEXTURE_handler:
|

li $11, textureSettings1 - (texrectWord1 - G_TEXRECTFLIP_handler) // Calculate the offset from texrectWord1 and $11 for saving to textureSettings
|

G_TEXRECT_handler: // $11 contains address of handler
|

G_TEXRECTFLIP_handler:
|

// Stores first command word into textureSettings for gSPTexture, 0x00D0 for gSPTextureRectangle/Flip
|

sw cmd_w0, (texrectWord1 - G_TEXRECTFLIP_handler)($11)
|

G_RDPHALF_1_handler:
|

j run_next_DL_command
|

// Stores second command word into textureSettings for gSPTexture, 0x00D4 for gSPTextureRectangle/Flip, 0x00D8 for G_RDPHALF_1
|

sw cmd_w1_dram, (texrectWord2 - G_TEXRECTFLIP_handler)($11)
|
|
|
|
// G_RDPHALF_2: emit the saved rectangle command. Copies the eight bytes at
// texrectWord1 into the RDP command buffer, advances rdpCmdBufPtr by 8, loads
// rdpHalf1Val into cmd_w0 and hands over to G_RDP_handler. Also increments
// perfCounter2 and zeroes materialCullMode. Uses $v29 as scratch.
G_RDPHALF_2_handler:
|

ldv $v29[0], (texrectWord1)($zero)
|

lw cmd_w0, rdpHalf1Val // load the RDPHALF1 value into w0
|

addi rdpCmdBufPtr, rdpCmdBufPtr, 8
|

addi perfCounter2, perfCounter2, 1 // Increment number of tex/fill rects
|

sb $zero, materialCullMode // This covers tex and fill rects
|

j G_RDP_handler
|

sdv $v29[0], -8(rdpCmdBufPtr) // Delay slot: store the saved words just below the advanced pointer
|
|
|
|
// G_MOVEWORD: store cmd_w1_dram to (movewordTable[index] + cmd_w0).
// The top halfword of cmd_w0 (command byte and index) selects the halfword
// entry in movewordTable. do_moveword is an entry point that expects the base
// address already in $12. Clobbers $2 and $12.
G_MOVEWORD_handler:
|

srl $2, cmd_w0, 16 // load the moveword command and word index into $2 (e.g. 0xDB06 for G_MW_SEGMENT)
|

lhu $12, (movewordTable - (G_MOVEWORD << 8))($2) // subtract the moveword label and offset the word table by the word index (e.g. 0xDB06 becomes 0x0304)
|

do_moveword:
|

add $12, $12, cmd_w0 // adds the offset in the command word to the address from the table (the upper bits are effectively ignored)
|

j run_next_DL_command // process the next command
|

sw cmd_w1_dram, ($12) // moves the specified value (in cmd_w1_dram) into the word (offset + moveword_table[index])
|
|
|
|
// Converts the segmented address in cmd_w1_dram to the corresponding physical address
// In:  cmd_w1_dram = segmented address (segment index in the top byte, offset in the low 24 bits)
// Out: cmd_w1_dram = segmentTable[segment] + offset
// Clobbers: $11. Returns with jr $ra.
|
|
segmented_to_physical:
|
|
srl $11, cmd_w1_dram, 22 // Copy (segment index << 2) into $11
|
|
andi $11, $11, 0x3C // Clear the bottom 2 bits that remained during the shift; also limits the index to 16 entries
|
|
lw $11, (segmentTable)($11) // Get the current address of the segment
|
|
sll cmd_w1_dram, cmd_w1_dram, 8 // Shift the address to the left so that the top 8 bits are shifted out
|
|
srl cmd_w1_dram, cmd_w1_dram, 8 // Shift the address back to the right, resulting in the original with the top 8 bits cleared
|
|
jr $ra
|
|
add cmd_w1_dram, cmd_w1_dram, $11 // Delay slot: add the segment's address to the masked input address, giving the converted address
|
|
|
|
// G_LIGHTTORDP: builds an RDP command from a light's color.
// The new first word is the original second command word; the new second word is
// (light RGB << 8) | alpha, with alpha taken from the low byte of the original first word.
// The result is handed to G_RDP_handler.
G_LIGHTTORDP_handler:
|
|
lbu $11, numLightsxSize // Ambient light
|
|
lbu $1, (inputBufferEnd - 0x6)(inputBufferPos) // Byte 2 = light count from end * size
|
|
andi $2, cmd_w0, 0x00FF // Byte 3 = alpha
|
|
sub $1, $11, $1 // Light address; byte 2 counts from end
|
|
lw $3, (lightBufferMain-1)($1) // Load light RGB into lower 3 bytes
|
|
move cmd_w0, cmd_w1_dram // Move second word to first (cmd byte, prim level)
|
|
sll $3, $3, 8 // Shift light RGB to upper 3 bytes and clear alpha byte
|
|
j G_RDP_handler // Send to RDP
|
|
or cmd_w1_dram, $3, $2 // Delay slot: combine RGB and alpha in second word
|
|
|
|
// End of overlay 1: mark the true end, pad, and check the padded end still fits
// before ovl01_end.
ovl1_end:
|
|
.align 8
|
|
ovl1_padded_end:
|
|
|
|
.if ovl1_padded_end > ovl01_end
|
|
.error "Automatic resizing for overlay 1 failed"
|
|
.endif
|
|
|
|
// Re-base label addresses so the code that follows is assembled as if located at
// ovl234_start, while continuing at the current file position (orga()).
.headersize ovl234_start - orga()
|
|
|
|
// Overlay 2 (lighting). The three entry points below sit at the start of the
// overlay; the "_ovl2ver" ones are the versions present while overlay 2 is loaded,
// and each requests a load of the overlay that actually holds the wanted code.
ovl2_start:
|
|
ovl234_lighting_entrypoint:
|
|
vmrg vPairNrml, $v28, $v26 // Merge normals
|
|
j lt_continue_setup
|
|
andi $11, $5, G_PACKED_NORMALS >> 8 // Delay slot: $11 != 0 if packed normals enabled (tested at lt_continue_setup)
|
|
|
|
ovl234_ovl4_entrypoint_ovl2ver: // same IMEM address as ovl234_ovl4_entrypoint
|
|
li cmd_w1_dram, orga(ovl4_start) // set up a load for overlay 4
|
|
j load_overlays_2_3_4 // load overlay 4
|
|
li postOvlRA, ovl234_ovl4_entrypoint // set the return address
|
|
|
|
ovl234_clipping_entrypoint_ovl2ver: // same IMEM address as ovl234_clipping_entrypoint
|
|
sh $ra, tempHalfword1 // Save the caller's return address before it is lost to the overlay load
|
|
li cmd_w1_dram, orga(ovl3_start) // set up a load of overlay 3
|
|
j load_overlays_2_3_4 // load overlay 3
|
|
li postOvlRA, ovl3_clipping_nosavera // set up the return address in ovl3
|
|
|
|
// Lighting setup, continued from ovl234_lighting_entrypoint.
// $11 is nonzero if packed normals are enabled; if so, decode them into vPairNrml,
// otherwise skip straight to lt_skip_packed_normals.
lt_continue_setup:
|
|
// Inputs: vPairPosI/F vertices pos world int:frac, vPairRGBA, vPairST,
|
|
// $v28 vPairNrml, $v30:$v25 (to be merged) packed normals
|
|
// Outputs: vPairRGBA, vPairST, must leave alone vPairPosI/F
|
|
// Locals: $v29 temp, $v23 (will be vPairTPosF), $v24 (will be vPairTPosI),
|
|
// $v25 (after merge), $v26, whichever of $v28 or $v30 is unused
|
|
// Use $v10 (vVP2I) as an extra local, restore before return
|
|
beqz $11, lt_skip_packed_normals
|
|
vmrg $v30, $v30, $v25 // Merge packed normals
|
|
// Packed normals algorithm. This produces a vector (one for each input vertex)
|
|
// in vPairNrml such that |X| + |Y| + |Z| = 0x7F00 (called L1 norm), in the
|
|
// same direction as the standard normal vector. The length is not "correct"
|
|
// compared to the standard normal, but it is normalized anyway after the M
|
|
// matrix transform.
|
|
vand vPackPXY, $v30, $v31[6] // 0x7F00; positive X, Y
|
|
vclr $v29 // Zero
|
|
vaddc vPackZ, vPackPXY, vPackPXY[1q] // elem 0, 4: pos X + pos Y, no clamping
|
|
vadd $v26, $v29, $v29 // Save carry bit, indicates use 0x7F00 - x and y
|
|
vxor vPairNrml, vPackPXY, $v31[6] // 0x7F00 - x, 0x7F00 - y
|
|
vxor vPackZ, vPackZ, $v31[6] // 0x7F00 - +X - +Y in elems 0, 4
|
|
vne $v29, $v29, $v26[0h] // set 0-3, 4-7 vcc if (+X + +Y) overflowed, discard result
|
|
vmrg vPairNrml, vPairNrml, vPackPXY // If so, use 0x7F00 - +X, else +X (same for Y)
|
|
vne $v29, $v31, $v31[2h] // set VCC to 11011101
|
|
vabs vPairNrml, $v30, vPairNrml // Apply sign of original X and Y to new X and Y
|
|
vmrg vPairNrml, vPairNrml, vPackZ[0h] // Move Z to elements 2, 6
|
|
// Transform the pair of normals in vPairNrml into $v23:$v10 (int:frac).
// The transform by M is always issued. If normalsMode is nonzero, the M inverse
// transpose is loaded from mITMatrix and the transform is redone with it, which
// overwrites the first result. Also sets vLtOne = 1, offsets vPairRGBA by 0x7FFF,
// and points curLight at the ambient light.
lt_skip_packed_normals:
|
|
// Transform normals by M, in case normalsMode = G_NORMALSMODE_FAST.
|
|
vclr vLtOne
|
|
vsub vPairRGBA, vPairRGBA, $v31[7] // 0x7FFF; offset alpha, will be fixed later
|
|
lbu curLight, numLightsxSize
|
|
vmudn $v29, vM0F, vPairNrml[0h]
|
|
lbu $11, normalsMode($zero)
|
|
vmadh $v29, vM0I, vPairNrml[0h]
|
|
vmadn $v29, vM1F, vPairNrml[1h]
|
|
addi curLight, curLight, altBase // Point to ambient light
|
|
vmadh $v29, vM1I, vPairNrml[1h]
|
|
vmadn $v10, vM2F, vPairNrml[2h] // $v10 = normals frac
|
|
vmadh $v23, vM2I, vPairNrml[2h] // $v23 = normals int
|
|
beqz $11, lt_after_xfrm_normals // Skip if G_NORMALSMODE_FAST
|
|
vadd vLtOne, vLtOne, $v31[2] // 1; vLtOne = 1
|
|
// Transform normals by M inverse transpose, for G_NORMALSMODE_AUTO or G_NORMALSMODE_MANUAL
|
|
lqv vLtMIT0I, (mITMatrix + 0x00)($zero) // x int, y int
|
|
lqv vLtMIT2I, (mITMatrix + 0x10)($zero) // z int, x frac
|
|
lqv vLtMIT1F, (mITMatrix + 0x20)($zero) // y frac, z frac
|
|
vcopy vLtMIT1I, vLtMIT0I
|
|
vcopy vLtMIT0F, vLtMIT2I
|
|
ldv vLtMIT1I[0], (mITMatrix + 0x08)($zero)
|
|
vcopy vLtMIT2F, vLtMIT1F
|
|
ldv vLtMIT0F[0], (mITMatrix + 0x18)($zero)
|
|
ldv vLtMIT0I[8], (mITMatrix + 0x00)($zero)
|
|
ldv vLtMIT1F[8], (mITMatrix + 0x20)($zero)
|
|
ldv vLtMIT2F[0], (mITMatrix + 0x28)($zero)
|
|
ldv vLtMIT2I[8], (mITMatrix + 0x10)($zero)
|
|
// Transform normals by M inverse transpose matrix.
|
|
// At this point we have stuffed two and three quarters matrices into registers at once.
|
|
// Nintendo was only able to fit one and three quarters matrices into registers at once.
|
|
// ($v10 is stolen from VP but $v24=vLtOne could be available here, so if we
|
|
// swapped the use below of those two regs and moved down the init of vLtOne, it
|
|
// really would be the full 2 & 3/4 matrices.) This is 22/32 registers full of matrices.
|
|
// The remaining 10 registers are: vVp/ST Scl/Ofs and $v31 constants, vPairPosI/F,
|
|
// vPairST, vPairRGBA, vPairNrml.
|
|
vmudn $v29, vLtMIT0F, vPairNrml[0h] // vLtMIT0F = $v29
|
|
vmadh $v29, vLtMIT0I, vPairNrml[0h]
|
|
vmadn $v29, vLtMIT1F, vPairNrml[1h]
|
|
vmadh $v29, vLtMIT1I, vPairNrml[1h]
|
|
vmadn $v10, vLtMIT2F, vPairNrml[2h] // vLtMIT2F = $v10 = normals frac
|
|
vmadh $v23, vLtMIT2I, vPairNrml[2h] // vLtMIT2I = $v23 = normals int
|
|
// Normalize the transformed normals, start the light accumulator vPairLt from the
// ambient light color, and scale that ambient level by the ambient occlusion factor.
lt_after_xfrm_normals:
|
|
// Normalize normals; in $v23:$v10 i/f, out $v23
|
|
jal lt_normalize
|
|
luv vPairLt, (ltBufOfs + 0)(curLight) // Total light level, init to ambient (delay slot, runs before lt_normalize)
|
|
// Set up ambient occlusion: light *= (factor * (alpha - 1) + 1)
|
|
vmudm $v25, vPairRGBA, vSTOfs[2] // (alpha - 1) * aoAmb factor; elems 3, 7
|
|
vcopy vPairNrml, $v23 // Keep the normalized normals in vPairNrml for the light loop
|
|
vadd $v25, $v25, $v31[7] // 0x7FFF = 1 in s.15; elems 3, 7
|
|
vmulf vPairLt, vPairLt, $v25[3h] // light color *= ambient factor
|
|
// Main light loop. curLight steps down by lightSize per light; the loop exits to
// lt_post once curLight equals altBaseReg. A nonzero type byte sends the light to
// lt_point; otherwise it is handled here as a directional light. Both paths rejoin
// at lt_finish_light, which adds (light color * dot product) into vPairLt.
lt_loop:
|
|
// vPairPosI/F, vPairST, $v23 light pos/dir (then local), $v10 $v25 locals,
|
|
// vLtColor, vPairRGBA, vPairNrml, $v29 temp, vPairLt, vLtOne
|
|
lpv $v23[0], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 0-2
|
|
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
|
|
lpv $v25[4], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 4-6
|
|
lbu $11, (ltBufOfs + 3 - lightSize)(curLight) // Light type / constant attenuation
|
|
beq curLight, altBaseReg, lt_post
|
|
vmrg $v23, $v23, $v25 // $v23 = light direction
|
|
bnez $11, lt_point
|
|
luv vLtColor, (ltBufOfs + 0 - lightSize)(curLight) // Light color
|
|
vmulf $v23, $v23, vPairNrml // Light dir * normalized normals
|
|
vmudh $v29, vLtOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
|
|
vmadm $v10, vPairRGBA, vSTOfs[6] // + (alpha - 1) * aoDir factor
|
|
vmudh $v29, vLtOne, $v23[0h] // Sum components of dot product as signed
|
|
vmadh $v29, vLtOne, $v23[1h]
|
|
vmadh $v23, vLtOne, $v23[2h]
|
|
vmulf vLtColor, vLtColor, $v10[3h] // light color *= ambient or point light factor
|
|
vge $v23, $v23, vSTOfs[3] // Clamp dot product to >= 0
|
|
lt_finish_light:
|
|
addi curLight, curLight, -lightSize // Step to the next light
|
|
vmudh $v29, vLtOne, vPairLt // Load accum mid with current light level
|
|
j lt_loop
|
|
vmacf vPairLt, vLtColor, $v23[0h] // + light color * dot product
|
|
|
|
// After the light loop: combine the accumulated light with the vertex color, then
// apply the optional features selected by flag bits in $5 (light-to-alpha / cel,
// packed normals, fresnel, texgen and linear texgen).
// Exits through vtx_return_from_lighting. Restores $v10 (vVP2I) before leaving.
lt_post:
|
|
vadd vPairRGBA, vPairRGBA, $v31[7] // 0x7FFF; undo change for ambient occlusion
|
|
andi $11, $5, G_LIGHTTOALPHA >> 8
|
|
andi $20, $5, G_PACKED_NORMALS >> 8
|
|
andi $12, $5, G_TEXTURE_GEN >> 8
|
|
vmulf vLtRGBOut, vPairRGBA, vPairLt // RGB output is RGB * light
|
|
beqz $11, lt_skip_cel
|
|
vcopy vLtAOut, vPairRGBA // Alpha output = vertex alpha (only 3, 7 matter)
|
|
// Cel: alpha = max of light components, RGB = vertex color
|
|
vge vLtAOut, vPairLt, vPairLt[1h] // elem 0 = max(R0, G0); elem 4 = max(R1, G1)
|
|
vge vLtAOut, vLtAOut, vLtAOut[2h] // elem 0 = max(R0, G0, B0); equiv for elem 4
|
|
vcopy vLtRGBOut, vPairRGBA // RGB output is vertex color
|
|
vmudh vLtAOut, vLtOne, vLtAOut[0h] // move elem 0, 4 to 3, 7
|
|
lt_skip_cel:
|
|
vne $v29, $v31, $v31[3h] // Set VCC to 11101110
|
|
bnez $20, lt_skip_novtxcolor
|
|
andi $24, $5, G_FRESNEL >> 8
|
|
vcopy vLtRGBOut, vPairLt // If no packed normals, base output is just light
|
|
lt_skip_novtxcolor:
|
|
vmulf vLookat0, vPairNrml, $v23 // Normal * lookat 0 dir; vLookat0 = $v30 = vPairLt
|
|
beqz $24, lt_skip_fresnel
|
|
vmrg vPairRGBA, vLtRGBOut, vLtAOut // Merge base output and alpha output
|
|
// Fresnel: call point lighting; camera pos in $v23
|
|
ldv $v23[0], (cameraWorldPos - altBase)(altBaseReg) // Camera world pos
|
|
j lt_normal_to_vertex // Comes back at lt_finish_fresnel, since curLight == altBaseReg here
|
|
ldv $v23[8], (cameraWorldPos - altBase)(altBaseReg)
|
|
lt_finish_fresnel: // output in $v23
|
|
llv $v10[0], (fresnelOffset - altBase)(altBaseReg) // Load fresnel offset and scale
|
|
vabs $v23, $v23, $v23 // Absolute value
|
|
vmudn $v26, $v31, $v10[1] // Elem 4 = low part of 0x0100 * scale
|
|
vmadh $v25, $v31, vSTOfs[3] // + 0; elem 4 = high part of 0x0100 * scale
|
|
vsub $v23, $v23, $v10[0] // Subtract offset
|
|
vmudl $v29, $v23, $v26[4] // Unsigned Fresnel value * low part shifted scale
|
|
vmadn $v23, $v23, $v25[4] // Alpha = unsigned Fresnel value * high part
|
|
vmrg vPairRGBA, vPairRGBA, $v23 // Merge base output and alpha output
|
|
lt_skip_fresnel:
|
|
ldv vVP2I[0], (vpMatrix + 0x10)($zero) // Restore $v10 = vVP2I before returning
|
|
beqz $12, vtx_return_from_lighting // no texgen
|
|
ldv vVP2I[8], (vpMatrix + 0x10)($zero)
|
|
// Texgen: vLookat0, vLookat1, locals $v25, $v26, $v23, have vLtOne = $v24
|
|
// Output: vPairST; have to leave vPairPosI/F, vPairRGBA
|
|
vmudh $v29, vLtOne, vLookat0[0h]
|
|
lpv vLookat1[4], (ltBufOfs + 0 - lightSize)(curLight) // Lookat 1 dir in elems 0-2
|
|
vmadh $v29, vLtOne, vLookat0[1h]
|
|
lpv $v26[0], (ltBufOfs + 8 - lightSize)(curLight) // Lookat 1 dir in elems 4-6
|
|
vmadh vLookat0, vLtOne, vLookat0[2h] // vLookat0 = dot product 0
|
|
vlt $v29, $v31, $v31[4] // Set VCC to 11110000
|
|
vmrg vLookat1, vLookat1, $v26 // vLookat1 = lookat 1 dir
|
|
vmulf vLookat1, vPairNrml, vLookat1 // Normal * lookat 1 dir
|
|
vmudh $v29, vLtOne, vLookat1[0h]
|
|
vmadh $v29, vLtOne, vLookat1[1h]
|
|
vmadh vLookat1, vLtOne, vLookat1[2h] // vLookat1 = dot product 1
|
|
vne $v29, $v31, $v31[1h] // Set VCC to 10111011
|
|
llv $v23[0], (texgenLinearCoeffs - altBase)(altBaseReg)
|
|
vmrg vLookat0, vLookat0, vLookat1[0h] // Dot products in elements 0, 1, 4, 5
|
|
andi $11, $5, G_TEXTURE_GEN_LINEAR >> 8
|
|
vmudh $v29, vLtOne, $v31[5] // 1 * 0x4000
|
|
beqz $11, vtx_return_from_lighting
|
|
vmacf vPairST, vLookat0, $v31[5] // + dot products * 0x4000 ( / 2)
|
|
// Texgen_Linear:
|
|
vmulf vPairST, vLookat0, $v31[5] // dot products * 0x4000 ( / 2)
|
|
vmulf $v26, vPairST, vPairST // ST squared
|
|
vmulf $v25, vPairST, $v31[7] // Move ST to accumulator (0x7FFF = 1)
|
|
vmacf $v25, vPairST, $v23[1] // + ST * 0x6CB3
|
|
vmudh $v29, vLtOne, $v31[5] // 1 * 0x4000
|
|
vmacf vPairST, vPairST, $v23[0] // + ST * 0x44D3
|
|
j vtx_return_from_lighting
|
|
vmacf vPairST, $v26, $v25 // + ST squared * (ST + ST * coeff)
|
|
|
|
// Point light path of the light loop. lt_normal_to_vertex is also entered directly
// for fresnel with the camera position in $v23. The two uses are told apart by the
// curLight == altBaseReg test near the end: fresnel leaves through lt_finish_fresnel,
// point lights through lt_finish_light.
lt_point:
|
|
/*
|
|
Input vector 1 elem size 7FFF.0000 -> len^2 3FFF0001 -> 1/len 0001.0040 -> vec +801E.FFC0 -> clamped 7FFF
|
|
len^2 * 1/len = 400E.FFC1 so about half actual length
|
|
Input vector 1 elem size 0100.0000 -> len^2 00010000 -> 1/len 007F.FFC0 -> vec 7FFF.C000 -> clamped 7FFF
|
|
len^2 * 1/len = 007F.FFC0 so about half actual length
|
|
Input vector 1 elem size 0010.0000 -> len^2 00000100 -> 1/len 07FF.FC00 -> vec 7FFF.C000
|
|
Input vector 1 elem size 0001.0000 -> len^2 00000001 -> 1/len 7FFF.C000 -> vec 7FFF.C000
|
|
*/
|
|
ldv $v23[0], (ltBufOfs + 8 - lightSize)(curLight) // Light position int part 0-3
|
|
ldv $v23[8], (ltBufOfs + 8 - lightSize)(curLight) // 4-7
|
|
lt_normal_to_vertex:
|
|
// This reused for fresnel; scalar unit stuff all garbage in that case
|
|
// Input point (light / camera) in $v23; computes $v23 = (vPairNrml dot (input - vertex))
|
|
// Uses temps $v10, $v25, $v26, $v29
|
|
vclr $v10 // Zero light pos frac part
|
|
vsubc $v10, $v10, vPairPosF // Vector from vertex to light, frac
|
|
lbu $20, (ltBufOfs + 7 - lightSize)(curLight) // Linear factor
|
|
vsub $v23, $v23, vPairPosI // Int
|
|
jal lt_normalize
|
|
lbu $24, (ltBufOfs + 0xE - lightSize)(curLight) // Quadratic factor
|
|
// $v23 = normalized vector from vertex to light, $v29[0h:1h] = 1/len, $v25 = len^2
|
|
vmudm $v10, $v25, $v29[1h] // len^2 int * 1/len frac
|
|
vmadn $v10, $v26, $v29[0h] // len^2 frac * 1/len int = len frac
|
|
mtc2 $20, vPairLt[14] // Quadratic int part in elem 7
|
|
vmadh $v29, $v25, $v29[0h] // len^2 int * 1/len int = len int
|
|
vmulf $v23, $v23, vPairNrml // Normalized light dir * normalized normals
|
|
vmudl $v10, $v10, vPairNrml[7] // len frac * linear factor frac
|
|
vmadm $v10, $v29, vPairNrml[7] // + len int * linear factor frac
|
|
vmadm $v10, vLtOne, vPairNrml[3] // + 1 * constant factor frac
|
|
vmadl $v10, $v26, vPairLt[3] // + len^2 frac * quadratic factor frac
|
|
vmadm $v10, $v25, vPairLt[3] // + len^2 int * quadratic factor frac
|
|
vmadn $v29, $v26, vPairLt[7] // + len^2 frac * quadratic factor int
|
|
vmadh $v25, $v25, vPairLt[7] // + len^2 int * quadratic factor int
|
|
luv vLtColor, (ltBufOfs + 0 - lightSize)(curLight) // vLtColor = $v26
|
|
vmudh $v10, vLtOne, $v23[0h] // Sum components of dot product as signed
|
|
vmadh $v10, vLtOne, $v23[1h]
|
|
beq curLight, altBaseReg, lt_finish_fresnel // If finished light loop, is fresnel
|
|
vmadh $v23, vLtOne, $v23[2h]
|
|
vrcph $v10[1], $v25[0] // 1/(2*light factor), input of 0000.8000 -> no change normals
|
|
vrcpl $v10[2], $v29[0] // Light factor 0001.0000 -> normals /= 2
|
|
vrcph $v10[3], $v25[4] // Light factor 0000.1000 -> normals *= 8 (with clamping)
|
|
vrcpl $v10[6], $v29[4] // Light factor 0010.0000 -> normals /= 32
|
|
vrcph $v10[7], vSTOfs[3] // 0
|
|
vge $v23, $v23, vSTOfs[3] // Clamp dot product to >= 0
|
|
vmudm $v29, $v23, $v10[2h] // Dot product int * rcp frac
|
|
j lt_finish_light
|
|
vmadh $v23, $v23, $v10[3h] // Dot product int * rcp int, clamp to 0x7FFF
|
|
|
|
// Subroutine (returns with jr $ra). Called from lt_after_xfrm_normals for the
// vertex normals and from lt_normal_to_vertex for the vertex-to-light vector.
// The scalar instructions interleaved with the vector ones prepare the point light
// attenuation factors from $11 (constant), $20 (linear) and $24 (quadratic).
lt_normalize:
|
|
// Normalize vector in $v23:$v10 i/f, output in $v23. Also continue point
|
|
// light scalar unit stuff. Uses temps $v25, $v26, $v29, also $11, $20, $24
|
|
// Also overwrites vPairNrml and vPairLt elems 3, 7
|
|
vmudm $v29, $v23, $v10 // Squared. Don't care about frac*frac term
|
|
sll $11, $11, 8 // Constant factor, 00000100 - 0000FF00
|
|
vmadn $v29, $v10, $v23
|
|
sll $20, $20, 6 // Linear factor, 00000040 - 00003FC0
|
|
vmadh $v29, $v23, $v23
|
|
vreadacc $v26, ACC_MIDDLE
|
|
vreadacc $v25, ACC_UPPER
|
|
mtc2 $11, vPairNrml[6] // Constant frac part in elem 3
|
|
vmudm $v29, vLtOne, $v26[2h] // Sum of squared components
|
|
vmadh $v29, vLtOne, $v25[2h]
|
|
srl $11, $24, 5 // Top 3 bits
|
|
vmadm $v29, vLtOne, $v26[1h]
|
|
mtc2 $20, vPairNrml[14] // Linear frac part in elem 7
|
|
vmadh $v29, vLtOne, $v25[1h]
|
|
andi $20, $24, 0x1F // Bottom 5 bits
|
|
vmadn $v26, $v26, vLtOne // elem 0; swapped so we can do vmadn and get result
|
|
ori $20, $20, 0x20 // Append leading 1 to mantissa
|
|
vmadh $v25, $v25, vLtOne
|
|
sllv $20, $20, $11 // Left shift to create floating point
|
|
vrsqh $v29[2], $v25[0] // High input, garbage output
|
|
sll $20, $20, 8 // Min range 00002000, 00002100... 00003F00, max 00100000...001F8000
|
|
vrsql $v29[1], $v26[0] // Low input, low output
|
|
bnez $24, @@skip // If original value is zero, set to zero
|
|
vrsqh $v29[0], $v25[4] // High input, high output
|
|
li $20, 0 // Only reached when $24 == 0: force the quadratic factor to zero
|
|
@@skip:
|
|
vrsql $v29[5], $v26[4] // Low input, low output
|
|
vrsqh $v29[4], vSTOfs[3] // 0 input, high output
|
|
mtc2 $20, vPairLt[6] // Quadratic frac part in elem 3
|
|
vmudn $v10, $v10, $v29[0h] // Vec frac * int scaling, discard result
|
|
srl $20, $20, 16 // Leave the quadratic int part in $20 for the caller
|
|
vmadm $v10, $v23, $v29[1h] // Vec int * frac scaling, discard result
|
|
jr $ra
|
|
vmadh $v23, $v23, $v29[0h] // Vec int * int scaling
|
|
|
|
// End of overlay 2: mark the true end and the padded end after alignment.
ovl2_end:
|
|
.align 8
|
|
ovl2_padded_end:
|
|
|
|
// Re-base label addresses so the next overlay is also assembled as if located at
// ovl234_start, while continuing at the current file position (orga()).
.headersize ovl234_start - orga()
|
|
|
|
// Overlay 4. Its three entry points mirror those at the start of overlay 2: the
// "_ovl4ver" ones are the versions present while overlay 4 is loaded, and each
// requests a load of the overlay that actually holds the wanted code.
ovl4_start:
|
|
// Contains M inverse transpose (mIT) computation, and some rarely-used command handlers.
|
|
|
|
ovl234_lighting_entrypoint_ovl4ver: // same IMEM address as ovl234_lighting_entrypoint
|
|
li cmd_w1_dram, orga(ovl2_start) // set up a load for overlay 2
|
|
j load_overlays_2_3_4 // load overlay 2
|
|
li postOvlRA, ovl234_lighting_entrypoint // set the return address
|
|
|
|
ovl234_ovl4_entrypoint:
|
|
vclr $v30 // $v30 = 0 for calc_mit
|
|
j ovl4_select_instr
|
|
li $11, 1 // $11 = 1, compared against $7 in ovl4_select_instr; $7 = 1 (lighting & mIT invalid) if doing calc_mit
|
|
|
|
ovl234_clipping_entrypoint_ovl4ver: // same IMEM address as ovl234_clipping_entrypoint
|
|
sh $ra, tempHalfword1 // Save the caller's return address before it is lost to the overlay load
|
|
li cmd_w1_dram, orga(ovl3_start) // set up a load of overlay 3
|
|
j load_overlays_2_3_4 // load overlay 3
|
|
li postOvlRA, ovl3_clipping_nosavera // set up the return address in ovl3
|
|
|
|
// Dispatch on $7: 1 means calc_mit; otherwise $7 is the command byte and selects
// one of the handlers in this overlay. Each comparison constant is loaded in the
// delay slot of the previous branch, alternating between $11 and $12.
// If nothing matches, execution falls through into G_MTX_end.
ovl4_select_instr:
|
|
beq $11, $7, calc_mit // otherwise $7 = command byte
|
|
li $12, G_BRANCH_WZ
|
|
beq $12, $7, G_BRANCH_WZ_handler
|
|
li $11, G_MODIFYVTX
|
|
beq $11, $7, G_MODIFYVTX_handler
|
|
li $12, (0xFF00 | G_DMA_IO)
|
|
beq $12, $7, G_DMA_IO_handler
|
|
// Otherwise G_MTX_end, which starts with a harmless instruction
// (that instruction is the delay slot of the branch above, so it runs either way)
|
|
|
|
// Matrix multiply for G_MTX: combines the matrix loaded at tempMemRounded with the
// current M or VP matrix and stores the result back over that matrix.
//   $5 = output matrix address (from movememTable), $2 = input 0 pointer (starts at $5),
//   $3 = input 1 pointer (tempMemRounded), $11 / $12 = inner / outer loop end pointers.
G_MTX_end: // Multiplies the temp loaded matrix into the M or VP matrix
|
|
lhu $5, (movememTable + G_MV_MMTX)($1) // Output; $1 holds 0 for M or 4 for VP.
|
|
move $2, $5 // Input 0 = output
|
|
jal while_wait_dma_busy // If ovl4 already in memory, was not done
|
|
li $3, tempMemRounded // Input 1 = temp mem (loaded mtx)
|
|
addi $12, $3, 0x0018 // Outer loop ends when $3 reaches this value
|
|
@@loop:
|
|
vmadn $v9, vZero, vZero[0] // NOTE(review): multiply-accumulate with zero; presumably reads back the accumulator from the previous pass - confirm
|
|
addi $11, $3, 0x0008 // Inner loop ends when $3 reaches this value
|
|
vmadh $v8, vZero, vZero[0]
|
|
addi $2, $2, -0x0020 // Rewind input 0 pointer by the 0x20 the inner loop advances it
|
|
vmudh $v29, vZero, vZero[0] // Start a fresh accumulation (0 * 0)
|
|
@@innerloop:
|
|
ldv $v5[0], 0x0040($2)
|
|
ldv $v5[8], 0x0040($2)
|
|
lqv $v3[0], 0x0020($3) // Input 1
|
|
ldv $v4[0], 0x0020($2)
|
|
ldv $v4[8], 0x0020($2)
|
|
lqv $v2[0], 0x0000($3) // Input 1
|
|
vmadl $v29, $v5, $v3[0h]
|
|
addi $3, $3, 0x0002
|
|
vmadm $v29, $v4, $v3[0h]
|
|
addi $2, $2, 0x0008 // Increment input 0 pointer
|
|
vmadn $v7, $v5, $v2[0h]
|
|
bne $3, $11, @@innerloop
|
|
vmadh $v6, $v4, $v2[0h]
|
|
bne $3, $12, @@loop
|
|
addi $3, $3, 0x0008 // Delay slot: runs whether or not the branch is taken
|
|
// Store the results in M or VP
|
|
sqv $v9[0], 0x0020($5)
|
|
sqv $v8[0], 0x0000($5)
|
|
sqv $v7[0], 0x0030($5)
|
|
j run_next_DL_command
|
|
sqv $v6[0], 0x0010($5)
|
|
|
|
// G_DMA_IO: DMA between DRAM and DMEM. The DRAM address comes from the segmented
// address in cmd_w1_dram; the DMEM address, direction flag and length are decoded
// from the first command word. Continues at wait_for_dma_and_run_next_command.
G_DMA_IO_handler:
|
|
jal segmented_to_physical // Convert the provided segmented address (in cmd_w1_dram) to a virtual one
|
|
lh dmemAddr, (inputBufferEnd - 0x07)(inputBufferPos) // Get the 16 bits in the middle of the command word (since inputBufferPos was already incremented for the next command)
|
|
andi dmaLen, cmd_w0, 0x0FF8 // Mask out any bits in the length to ensure 8-byte alignment
|
|
// At this point, dmemAddr's highest bit is the flag, its next 13 bits are the DMEM address, and then its last two bits are the upper 2 of size
|
|
// So an arithmetic shift right 2 will preserve the flag as being the sign bit and get rid of the 2 size bits, shifting the DMEM address to start at the LSbit
|
|
sra dmemAddr, dmemAddr, 2
|
|
j dma_read_write // Trigger a DMA read or write, depending on the G_DMA_IO flag (which will occupy the sign bit of dmemAddr)
|
|
li $ra, wait_for_dma_and_run_next_command // Setup the return address for running the next DL command
|
|
|
|
// G_MODIFYVTX: looks up the vertex address with vtx_addrs_from_cmd (which comes back
// to the address passed in $11), then reuses do_moveword to store cmd_w1_dram at
// (vertex address + byte offset taken from the command).
G_MODIFYVTX_handler:
|
|
j vtx_addrs_from_cmd // byte 3 = vtx being modified; addr -> $12
|
|
li $11, modifyvtx_return_from_addrs // Delay slot: where vtx_addrs_from_cmd should continue
|
|
modifyvtx_return_from_addrs:
|
|
j do_moveword // Moveword adds cmd_w0 to $12 for final addr
|
|
lbu cmd_w0, (inputBufferEnd - 0x07)(inputBufferPos) // Delay slot: replace cmd_w0 with the offset byte from the command
|
|
|
|
// G_BRANCH_W / G_BRANCH_Z: compares a vertex's W (or screen Z) with the value in
// cmd_w1_dram. If the vertex value is >= the command value, the current display list
// simply continues; otherwise it branches to the display list at rdpHalf1Val.
G_BRANCH_WZ_handler:
|
|
j vtx_addrs_from_cmd // byte 3 = vtx being tested; addr -> $12
|
|
li $11, branchwz_return_from_addrs // Delay slot: where vtx_addrs_from_cmd should continue
|
|
branchwz_return_from_addrs:
|
|
.if CFG_G_BRANCH_W // G_BRANCH_W/G_BRANCH_Z difference; this defines F3DZEX vs. F3DEX2
|
|
lh $12, VTX_W_INT($12) // read the w coordinate of the vertex (f3dzex)
|
|
.else
|
|
lw $12, VTX_SCR_Z($12) // read the screen z coordinate (int and frac) of the vertex (f3dex2)
|
|
.endif
|
|
sub $2, $12, cmd_w1_dram // subtract the w/z value being tested
|
|
bgez $2, run_next_DL_command // if vtx.w/z >= cmd w/z, continue running this DL
|
|
lw cmd_w1_dram, rdpHalf1Val // load the RDPHALF1 value as the location to branch to
|
|
j branch_dl // need $2 < 0 for nopush and cmd_w1_dram
|
|
move cmd_w0, $zero // No count of DL cmds to skip
|
|
|
|
// calc_mit, part 1: mark mIT valid, compute absolute values of the M matrix terms
// (used later to pick a scale factor), and load left/right rotated copies of the
// X, Y, Z rows with the scalar load instructions interleaved between vector ops.
calc_mit:
|
|
/*
|
|
Compute M inverse transpose. All regs available except vM0I::vM3F and $v31.
|
|
$v31 constants present, but no other constants.
|
|
Register use (all only elems 0-2):
|
|
$v8:$v9 X left rotated int:frac, $v10:$v11 X right rotated int:frac
|
|
$v12:$v13 Y left rotated int:frac, $v14:$v15 Y right rotated int:frac
|
|
$v16:$v17 Z left rotated int:frac, $v18:$v19 Z right rotated int:frac
|
|
Rest temps.
|
|
Scale factor can be arbitrary, but final matrix must only reduce a vector's
|
|
magnitude (rotation * scale < 1). So want components of matrix to be < 0001.0000.
|
|
However, if input matrix has components on the order of 0000.0100, multiplying
|
|
two terms will reduce that to the order of 0000.0001, which kills all the precision.
|
|
*/
|
|
// Get absolute value of all terms of M matrix.
|
|
li $12, mMatrix + 0xE // For right rotates with lrv/ldv
|
|
vxor $v20, vM0I, $v31[1] // One's complement of X int part
|
|
sb $7, mITValid // $7 is 1 if we got here, mark valid
|
|
vlt $v29, vM0I, $v30[0] // X int part < 0
|
|
li $11, mMatrix + 2 // For left rotates with lqv/ldv
|
|
vabs $v21, vM0I, vM0F // Apply sign of X int part to X frac part
|
|
lrv $v10[0], (0x00)($12) // X int right shifted
|
|
vxor $v22, vM1I, $v31[1] // One's complement of Y int part
|
|
lrv $v11[0], (0x20)($12) // X frac right shifted
|
|
vmrg $v20, $v20, vM0I // $v20:$v21 = abs(X int:frac)
|
|
lqv $v16[0], (0x10)($11) // Z int left shifted
|
|
vlt $v29, vM1I, $v30[0] // Y int part < 0
|
|
lqv $v17[0], (0x30)($11) // Z frac left shifted
|
|
vabs $v23, vM1I, vM1F // Apply sign of Y int part to Y frac part
|
|
lsv $v10[0], (0x02)($11) // X int right rot elem 2->0
|
|
vxor $v24, vM2I, $v31[1] // One's complement of Z int part
|
|
lsv $v11[0], (0x22)($11) // X frac right rot elem 2->0
|
|
vmrg $v22, $v22, vM1I // $v22:$v23 = abs(Y int:frac)
|
|
lsv $v16[4], (0x0E)($11) // Z int left rot elem 0->2
|
|
vlt $v29, vM2I, $v30[0] // Z int part < 0
|
|
lsv $v17[4], (0x2E)($11) // Z frac left rot elem 0->2
|
|
vabs $v25, vM2I, vM2F // Apply sign of Z int part to Z frac part
|
|
lrv $v18[0], (0x10)($12) // Z int right shifted
|
|
vmrg $v24, $v24, vM2I // $v24:$v25 = abs(Z int:frac)
|
|
lrv $v19[0], (0x30)($12) // Z frac right shifted
|
|
// See if any of the int parts are nonzero. Also, get the maximum of the frac parts.
|
|
vge $v21, $v21, $v23
|
|
lqv $v8[0], (0x00)($11) // X int left shifted
|
|
vor $v20, $v20, $v22
|
|
lqv $v9[0], (0x20)($11) // X frac left shifted
|
|
vmudn $v11, $v11, $v31[1] // -1; negate X right rot
|
|
lsv $v18[0], (0x12)($11) // Z int right rot elem 2->0
|
|
vmadh $v10, $v10, $v31[1]
|
|
lsv $v19[0], (0x32)($11) // Z frac right rot elem 2->0
|
|
vge $v21, $v21, $v25
|
|
lsv $v8[4], (-0x02)($11) // X int left rot elem 0->2
|
|
vor $v20, $v20, $v24
|
|
lsv $v9[4], (0x1E)($11) // X frac left rot elem 0->2
|
|
vmudn $v17, $v17, $v31[1] // -1; negate Z left rot
|
|
ldv $v12[0], (0x08)($11) // Y int left shifted
|
|
vmadh $v16, $v16, $v31[1]
|
|
ldv $v13[0], (0x28)($11) // Y frac left shifted
|
|
vge $v21, $v21, $v21[1h]
|
|
ldv $v14[0], (-0x08)($12) // Y int right shifted
|
|
vor $v20, $v20, $v20[1h]
|
|
ldv $v15[0], (0x18)($12) // Y frac right shifted
|
|
vmudn $v27, $v19, $v31[1] // -1; $v26:$v27 is negated copy of Z right rot
|
|
lsv $v12[4], (0x06)($11) // Y int left rot elem 0->2
|
|
vmadh $v26, $v18, $v31[1]
|
|
lsv $v13[4], (0x26)($11) // Y frac left rot elem 0->2
|
|
vge $v21, $v21, $v21[2h]
|
|
lsv $v14[0], (0x0A)($11) // Y int right rot elem 2->0
|
|
vor $v20, $v20, $v20[2h]
|
|
lsv $v15[0], (0x2A)($11) // Y frac right rot elem 2->0
|
|
// calc_mit, part 2: choose the normalization factor, left in $v28 elem 0.
// Uses $v21 elem 0 (max of the abs frac parts) and $v20 elem 0 (OR of the abs int
// parts), both computed in part 1.
// Scale factor is 1/(2*(max^2)) (clamped if overflows).
|
|
// 1/(2*max) is what vrcp provides, so we multiply that by 2 and then by the rcp
|
|
// output. If we used the scale factor of 1/(max^2), the output matrix would have
|
|
// components on the order of 0001.0000, but we want the components to be smaller than this.
|
|
vrcp $v28[1], $v21[0] // low in, low out (discarded)
|
|
vrcph $v28[0], $v30[0] // zero in, high out (only care about elem 0)
|
|
vadd $v22, $v28, $v28 // *2
|
|
vmudh $v28, $v22, $v28 // (1/max) * (1/(2*max)), clamp to 0x7FFF
|
|
veq $v29, $v20, $v30[0] // elem 0 (all int parts) == 0
|
|
vmrg $v28, $v28, $v31[2] // If so, use computed normalization, else use 1 (elem 0)
|
|
// calc_mit, part 3: form the three output rows as scaled cross products of the
// rotated rows, store them to mITMatrix (stores are interleaved with the next
// row's multiplies), then continue at vtx_after_calc_mit.
/*
|
|
The original equations for the matrix rows are (XL = X rotated left, etc., n = normalization):
|
|
n*(YL*ZR - YR*ZL)
|
|
n*(ZL*XR - ZR*XL)
|
|
n*(XL*YR - XR*YL)
|
|
We need to apply the normalization to one of each of the terms before the multiply,
|
|
and also there's no multiply-subtract instruction, only multiply-add. Converted to:
|
|
(n*YL)* ZR + (n* YR )*(-ZL)
|
|
(n*XL)*(-ZR) + (n*(-XR))*(-ZL)
|
|
(n*XL)* YR + (n*(-XR))* YL
|
|
So the steps are:
|
|
Negate XR, negate ZL, negated copy of ZR (all done above)
|
|
Scale XL, scale negated XR
|
|
Do multiply-adds for Y and Z output vectors
|
|
Scale YL, scale YR
|
|
Do multiply-adds for X output vector
|
|
*/
|
|
vmudn $v9, $v9, $v28[0] // Scale XL
|
|
vmadh $v8, $v8, $v28[0]
|
|
vmudn $v11, $v11, $v28[0] // Scale XR
|
|
vmadh $v10, $v10, $v28[0]
|
|
// Z output vector: XL*YR + XR*YL, with each term having had scale and/or negative applied
|
|
vmudl $v29, $v9, $v15
|
|
vmadm $v29, $v8, $v15
|
|
vmadn $v29, $v9, $v14
|
|
vmadh $v29, $v8, $v14
|
|
vmadl $v29, $v11, $v13
|
|
vmadm $v29, $v10, $v13
|
|
vmadn $v25, $v11, $v12
|
|
vmadh $v24, $v10, $v12 // $v24:$v25 = Z output
|
|
vmudn $v13, $v13, $v28[0] // Scale YL
|
|
vmadh $v12, $v12, $v28[0]
|
|
vmudn $v15, $v15, $v28[0] // Scale YR
|
|
vmadh $v14, $v14, $v28[0]
|
|
// Y output vector: XL*ZR + XR*ZL, with each term having had scale and/or negative applied
|
|
vmudl $v29, $v9, $v27 // Negated copy of ZR
|
|
vmadm $v29, $v8, $v27
|
|
vmadn $v29, $v9, $v26
|
|
vmadh $v29, $v8, $v26
|
|
sdv $v25[0], (mITMatrix + 0x28)($zero) // Store Z output frac
|
|
vmadl $v29, $v11, $v17
|
|
sdv $v24[0], (mITMatrix + 0x10)($zero) // Store Z output int
|
|
vmadm $v29, $v10, $v17
|
|
vmadn $v23, $v11, $v16
|
|
vmadh $v22, $v10, $v16 // $v22:$v23 = Y output
|
|
// X output vector: YL*ZR + YR*ZL, with each term having had scale and/or negative applied
|
|
vmudl $v29, $v13, $v19
|
|
vmadm $v29, $v12, $v19
|
|
vmadn $v29, $v13, $v18
|
|
vmadh $v29, $v12, $v18
|
|
sdv $v23[0], (mITMatrix + 0x20)($zero) // Store Y output frac
|
|
vmadl $v29, $v15, $v17
|
|
sdv $v22[0], (mITMatrix + 0x08)($zero) // Store Y output int
|
|
vmadm $v29, $v14, $v17
|
|
vmadn $v21, $v15, $v16
|
|
vmadh $v20, $v14, $v16 // $v20:$v21 = X output
|
|
sdv $v21[0], (mITMatrix + 0x18)($zero) // Store X output frac
|
|
j vtx_after_calc_mit
|
|
sdv $v20[0], (mITMatrix + 0x00)($zero) // Delay slot: store X output int
|
|
|
|
// End of overlay 4: mark the true end and the padded end after alignment,
// then close the output file.
ovl4_end:
|
|
.align 8
|
|
ovl4_padded_end:
|
|
|
|
.close // CODE_FILE
|