mirror of
https://github.com/HackerN64/F3DEX3.git
synced 2026-01-21 10:37:45 -08:00
Working on optimizations
This commit is contained in:
@@ -18,8 +18,8 @@ into account, but in some cases it is assumed to be optimal.
|
||||
All numbers assume default profiling configuration. Tri numbers assume texture,
|
||||
shade, and Z, and not flushing the buffer. Tri numbers are measured from the
|
||||
first cycle of the command handler inclusive, to the first cycle of whatever is
|
||||
after $ra exclusive; this is in order to capture the extra latency and stalls in
|
||||
F3DEX2.
|
||||
after $ra exclusive; this is in order to capture an extra stall cycle in F3DEX2
|
||||
when finishing a triangle and going to the next command.
|
||||
|
||||
Vertex numbers assume no extra F3DEX3 features (packed normals, ambient
|
||||
occlusion, etc.). These features are listed below as the number of extra cycles
|
||||
@@ -33,19 +33,19 @@ even to an odd number of lights adds a different time than vice versa.
|
||||
|----------------------------|--------|------------|--------|
|
||||
| Command dispatch | 12 | 12 | 12 |
|
||||
| Small RDP command | 14 | 5 | 5 |
|
||||
| Only/2nd tri to offscreen | 27 | 26 | 26 |
|
||||
| 1st tri to offscreen | 28 | 27 | 27 |
|
||||
| Only/2nd tri to clip | 32 | 31 | 31 |
|
||||
| 1st tri to clip | 33 | 32 | 32 |
|
||||
| Only/2nd tri to backface | 38 | 38 | 38 |
|
||||
| 1st tri to backface | 39 | 39 | 39 |
|
||||
| Only/2nd tri to degenerate | 42 | 40 | 40 |
|
||||
| 1st tri to degenerate | 43 | 41 | 41 |
|
||||
| Only/2nd tri to occluded | Can't | Can't | 49 |
|
||||
| 1st tri to occluded | Can't | Can't | 50 |
|
||||
| Only/2nd tri to draw | 172 | 159 | 162 |
|
||||
| 1st tri to draw | 173 | 160 | 163 |
|
||||
| Extra per tri from snake | Can't | 10 | 10 |
|
||||
| Only/2nd tri to offscreen | 27 | 25 | 25 |
|
||||
| 1st tri to offscreen | 28 | 26 | 26 |
|
||||
| Only/2nd tri to clip | 32 | 30 | 30 |
|
||||
| 1st tri to clip | 33 | 31 | 31 |
|
||||
| Only/2nd tri to backface | 38 | 36 | 36 |
|
||||
| 1st tri to backface | 39 | 37 | 37 |
|
||||
| Only/2nd tri to degenerate | 42 | 38 | 38 |
|
||||
| 1st tri to degenerate | 43 | 39 | 39 |
|
||||
| Only/2nd tri to occluded | Can't | Can't | 42 |
|
||||
| 1st tri to occluded | Can't | Can't | 43 |
|
||||
| Only/2nd tri to draw | 172 | 156 | 159 |
|
||||
| 1st tri to draw | 173 | 157 | 160 |
|
||||
| Extra per tri from snake | Can't | 9 | 9 |
|
||||
| Vtx before DMA start | 16 | 17 | 17 |
|
||||
| Vtx pair, no lighting | 54 | 54 | 70 |
|
||||
| Vtx pair, 0 dir lts | Can't | 65 | 81 |
|
||||
|
||||
179
f3dex3.s
179
f3dex3.s
@@ -786,6 +786,7 @@ $ra return address, sometimes sign bit is flag -------------------------------
|
||||
*/
|
||||
|
||||
// Global scalar regs:
|
||||
vGeomMid equ $5 // Middle two bytes of geometry mode in lower 16 bits
|
||||
perfCounterD equ $12 // Performance counter D (functions depend on config)
|
||||
altBaseReg equ $13 // Alternate base address register for vector loads
|
||||
rdpCmdBufEndP1 equ $22 // Pointer to one command word past "end" (middle) of RDP command buf
|
||||
@@ -799,7 +800,6 @@ perfCounterC equ $30 // Performance counter C (functions depend on config)
|
||||
// Vertex write:
|
||||
vtxLeft equ $1 // Number of vertices left to process * 0x10
|
||||
vLoopRet equ $3 // Return address at end of vtx loop = top of loop or misc lighting
|
||||
vGeomMid equ $5 // Middle two bytes of geometry mode
|
||||
fogFlag equ $7 // 8 if fog enabled, else 0
|
||||
outVtx2 equ $8 // Pointer to second or dummy (= outVtx1) transformed vert
|
||||
inVtx equ $14 // Pointer to loaded vertex to transform; < 0 means from clipping.
|
||||
@@ -1098,6 +1098,7 @@ finish_setup:
|
||||
sw $11, startCounterTime
|
||||
.endif
|
||||
sh $zero, mvpValid // and dirLightsXfrmValid
|
||||
lhu vGeomMid, geometryModeLabel + 1
|
||||
li inputBufferPos, 0
|
||||
li cmd_w1_dram, orga(ovl1_start)
|
||||
j load_overlays_0_1
|
||||
@@ -1277,20 +1278,20 @@ G_MODIFYVTX_handler:
|
||||
// cmd_w0 + inputBufferEnd
|
||||
G_TRISNAKE_handler:
|
||||
sw cmd_w0, rdpHalf1Val // Store indices a, b, c
|
||||
addi inputBufferPos, inputBufferPos, -5 // Point to byte 3, index c of 1st tri
|
||||
addi inputBufferPos, inputBufferPos, -6 // Point to byte 2, index b of 1st tri
|
||||
li $ra, tri_snake_loop // For tri_main
|
||||
tri_snake_loop:
|
||||
lh $3, (inputBufferEnd - 1)(inputBufferPos) // Load indices b and c
|
||||
lh $3, (inputBufferEnd)(inputBufferPos) // Load indices b and c
|
||||
addi inputBufferPos, inputBufferPos, 1 // Increment indices being read
|
||||
tri_snake_loop_from_input_buffer:
|
||||
lb $2, rdpHalf1Val + 1 // Old v1; == index b, except when bridging between old and new load
|
||||
li $ra, tri_snake_loop // For tri_main
|
||||
bltz $3, tri_snake_end // Upper bit of real index b set = done
|
||||
andi $11, $3, 1 // Get direction flag from index c
|
||||
beqz inputBufferPos, tri_snake_over_input_buffer // == 0 at end of input buffer
|
||||
andi $3, $3, 0x7E // Mask out flags from index c
|
||||
sb $3, rdpHalf1Val + 1 // Store index c as vertex 1
|
||||
sb $2, (rdpHalf1Val + 2)($11) // Store old v1 as 2 if dir clear or 3 if set
|
||||
j tri_main
|
||||
addi inputBufferPos, inputBufferPos, 1 // Increment indices being read
|
||||
sb $2, (rdpHalf1Val + 2)($11) // Store old v1 as 2 if dir clear or 3 if set
|
||||
|
||||
// H = highest on screen = lowest Y value; then M = mid, L = low
|
||||
tHAtF equ $v5
|
||||
@@ -1317,128 +1318,130 @@ tri_main:
|
||||
lpv $v27[4], (rdpHalf1Val)($zero) // To vector unit in elems 5-7
|
||||
lbu $1, rdpHalf1Val + 1
|
||||
lbu $2, rdpHalf1Val + 2
|
||||
lbu $3, rdpHalf1Val + 3
|
||||
vclr vZero
|
||||
lhu $1, (vertexTable)($1)
|
||||
lbu $3, rdpHalf1Val + 3
|
||||
vmudn $v29, vOne, vTRC_VB // Address of vertex buffer
|
||||
lhu $2, (vertexTable)($2)
|
||||
lhu $1, (vertexTable)($1)
|
||||
vmadl $v27, $v27, vTRC_VS // Plus vtx indices times length
|
||||
lhu $2, (vertexTable)($2)
|
||||
vmadl $v6, $v31, $v31[2] // 0; vtx 1 addr in $v6 elem 5
|
||||
lhu $3, (vertexTable)($3)
|
||||
vmadl $v4, $v31, $v31[2] // 0; vtx 2 addr in $v4 elem 6
|
||||
.if !ENABLE_PROFILING
|
||||
// vnop
|
||||
addi perfCounterB, perfCounterB, 0x4000 // Increment number of tris requested
|
||||
move $4, $1 // Save original vertex 1 addr (pre-shuffle) for flat shading
|
||||
.endif
|
||||
tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping
|
||||
vnxor tHAtF, vZero, $v31[7] // v5 = 0x8000; init frac value for attrs for rounding
|
||||
vmov $v4[5], $v27[6] // elem 5 of v4 = vertex 2 addr
|
||||
llv $v6[0], VTX_SCR_VEC($1) // Load pixel coords of vertex 1 into v6 (elems 0, 1 = x, y)
|
||||
vnxor tMAtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding
|
||||
vmov $v8[5], $v27[7] // elem 5 of v8 = vertex 3 addr
|
||||
llv $v4[0], VTX_SCR_VEC($2) // Load pixel coords of vertex 2 into v4
|
||||
vmov $v6[6], $v27[5] // elem 6 of v6 = vertex 1 addr
|
||||
vnxor tMAtF, vZero, $v31[7] // v7 = 0x8000; init frac value for attrs for rounding
|
||||
llv $v8[0], VTX_SCR_VEC($3) // Load pixel coords of vertex 3 into v8
|
||||
vnxor tLAtF, vZero, $v31[7] // v9 = 0x8000; init frac value for attrs for rounding
|
||||
lhu $16, VTX_CLIP($1)
|
||||
vmov $v8[6], $v27[7] // elem 6 of v8 = vertex 3 addr
|
||||
lhu $7, VTX_CLIP($2)
|
||||
// vnop
|
||||
lhu $8, VTX_CLIP($3)
|
||||
vmudh $v2, vOne, $v6[1] // v2 all elems = y-coord of vertex 1
|
||||
andi $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
|
||||
lhu $7, VTX_CLIP($2)
|
||||
vsub $v10, $v6, $v4 // v10 = vertex 1 - vertex 2 (x, y, addr)
|
||||
and $11, $11, $7
|
||||
lhu $8, VTX_CLIP($3)
|
||||
vsub $v12, $v6, $v8 // v12 = vertex 1 - vertex 3 (x, y, addr)
|
||||
and $11, $11, $8
|
||||
andi $11, $16, CLIP_SCRN_NPXY | CLIP_CAMPLANE // All three verts on wrong side of same plane
|
||||
vsub $v11, $v4, $v6 // v11 = vertex 2 - vertex 1 (x, y, addr)
|
||||
and $11, $11, $7
|
||||
vlt $v13, $v2, $v4[1] // v13 = min(v1.y, v2.y), VCO = v1.y < v2.y
|
||||
and $11, $11, $8
|
||||
vmrg tHPos, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
|
||||
bnez $11, return_and_end_mat // Then the whole tri is offscreen, cull
|
||||
// 22 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
|
||||
vmrg tHPos, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
|
||||
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
|
||||
lhu $24, activeClipPlanes
|
||||
// 21 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
|
||||
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
|
||||
vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
|
||||
lw $6, geometryModeLabel // Load full geometry mode word
|
||||
lhu $24, activeClipPlanes
|
||||
vge $v2, $v2, $v4[1] // v2 = max(vert1.y, vert2.y), VCO = vert1.y > vert2.y
|
||||
or $10, $16, $7
|
||||
sll $20, vGeomMid, 29 // Original bit 10 (now bit 2) in the sign bit, for facing cull
|
||||
vmrg tLPos, $v6, $v4 // v10 = vert1.y > vert2.y ? vert1 : vert2 (higher vertex of vert1, vert2)
|
||||
or $10, $10, $8 // $10 = all clip bits which are true for any verts
|
||||
or $10, $16, $7
|
||||
vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y
|
||||
and $10, $10, $24 // If clipping is enabled, check clip flags
|
||||
or $10, $10, $8 // $10 = all clip bits which are true for any verts
|
||||
vmrg $v4, tHPos, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3)
|
||||
mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended
|
||||
vmrg tHPos, $v8, tHPos // v14 = max(vert1.y, vert2.y) > vert3.y : vert3 ? higher(vert1, vert2)
|
||||
bnez $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
|
||||
// 30 cycles
|
||||
sll $20, $6, 21 // Bit 10 in the sign bit, for facing cull
|
||||
and $10, $10, $24 // If clipping is enabled, check clip flags
|
||||
vlt $v29, $v6, $v2 // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
|
||||
srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
|
||||
bnez $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
|
||||
// 29 cycles
|
||||
srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
|
||||
vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor
|
||||
sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
|
||||
vmrg tMPos, $v4, tLPos // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2)
|
||||
bltz $11, return_and_end_mat // Cull if bit is set (culled based on facing)
|
||||
// 34 cycles
|
||||
// 32 cycles
|
||||
vmrg tLPos, tLPos, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3)
|
||||
tSubPxHF equ $v4
|
||||
tSubPxHI equ $v26
|
||||
vmudn tSubPxHF, tHPos, $v31[5] // 0x4000
|
||||
beqz $9, return_and_end_mat // If cross product is 0, tri is degenerate (zero area), cull.
|
||||
// 36 cycles
|
||||
mfc2 $1, tHPos[12] // tHPos = lowest Y value = highest on screen (x, y, addr)
|
||||
// 34 cycles
|
||||
.if !CFG_NO_OCCLUSION_PLANE
|
||||
and $16, $16, $7
|
||||
.endif
|
||||
vsub tPosMmH, tMPos, tHPos
|
||||
.if !CFG_NO_OCCLUSION_PLANE
|
||||
and $16, $16, $8
|
||||
.endif
|
||||
vsub tPosLmH, tLPos, tHPos
|
||||
.if !CFG_NO_OCCLUSION_PLANE
|
||||
andi $16, $16, CLIP_OCCLUDED
|
||||
bnez $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
|
||||
// 38 cycles
|
||||
.endif
|
||||
mfc2 $1, tHPos[10] // tHPos = lowest Y value = highest on screen (x, y, addr)
|
||||
// 36 cycles if NOC (39 if occlusion plane)
|
||||
vsub tPosHmM, tHPos, tMPos
|
||||
mfc2 $2, tMPos[10] // tMPos = mid vertex (x, y, addr)
|
||||
tPosCatI equ $v15 // 0 X L-M; 1 Y L-M; 2 X M-H; 3 X L-H; 4-7 garbage
|
||||
tPosCatF equ $v25
|
||||
vsub tPosMmH, tMPos, tHPos
|
||||
mfc2 $2, tMPos[12] // tMPos = mid vertex (x, y, addr)
|
||||
vsub tPosLmH, tLPos, tHPos
|
||||
.if !ENABLE_PROFILING
|
||||
sll $11, $6, 10 // Moves the value of G_SHADING_SMOOTH into the sign bit
|
||||
.endif
|
||||
vsub tPosHmM, tHPos, tMPos
|
||||
andi $6, $6, (G_SHADE | G_ZBUFFER)
|
||||
vsub tPosCatI, tLPos, tMPos
|
||||
mfc2 $3, tLPos[12] // tLPos = highest Y value = lowest on screen (x, y, addr)
|
||||
vmov tPosCatI[2], tPosMmH[0]
|
||||
.if !CFG_NO_OCCLUSION_PLANE
|
||||
and $16, $16, $7
|
||||
and $16, $16, $8
|
||||
andi $16, $16, CLIP_OCCLUDED
|
||||
.if !ENABLE_PROFILING
|
||||
andi $11, vGeomMid, G_SHADING_SMOOTH >> 8
|
||||
.endif
|
||||
tXPF equ $v16 // Triangle cross product
|
||||
tXPI equ $v17
|
||||
tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
|
||||
tXPRcpI equ $v24
|
||||
vmov tPosCatI[2], tPosMmH[0]
|
||||
lbu $6, geometryModeLabel + 3 // Load lowest byte for G_SHADE, G_ZBUFFER. Also has G_ATTROFFSET_ST_ENABLE, but G_TRI_FILL will get OR'd into it and force that set.
|
||||
vmudh $v29, tPosMmH, tPosLmH[0]
|
||||
mfc2 $3, tLPos[10] // tLPos = highest Y value = lowest on screen (x, y, addr)
|
||||
t1WI equ $v13 // elems 0, 4, 6
|
||||
t1WF equ $v14
|
||||
vmudh $v29, tPosMmH, tPosLmH[0]
|
||||
.if !CFG_NO_OCCLUSION_PLANE
|
||||
bnez $16, tri_culled_by_occlusion_plane // Cull if all verts occluded
|
||||
.endif
|
||||
llv t1WI[0], VTX_INV_W_VEC($1)
|
||||
vmadh $v29, tPosLmH, tPosHmM[0]
|
||||
lpv tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
|
||||
llv t1WI[0], VTX_INV_W_VEC($1)
|
||||
tXPF equ $v16 // Triangle cross product
|
||||
tXPI equ $v17
|
||||
vreadacc tXPI, ACC_UPPER
|
||||
lpv tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
|
||||
lpv tHAtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1
|
||||
vreadacc tXPF, ACC_MIDDLE
|
||||
lpv tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
|
||||
vrcp $v20[0], tPosCatI[1]
|
||||
.if !ENABLE_PROFILING
|
||||
lpv $v25[0], VTX_COLOR_VEC($4) // Load RGB from vertex 4 (flat shading vtx)
|
||||
.endif
|
||||
vmov tPosCatI[3], tPosLmH[0]
|
||||
llv t1WI[8], VTX_INV_W_VEC($2)
|
||||
vrcph $v22[0], tXPI[1]
|
||||
vrcp $v20[0], tPosCatI[1]
|
||||
lpv tMAtI[0], VTX_COLOR_VEC($2) // Load vert color of vertex 2
|
||||
vmov tPosCatI[3], tPosLmH[0]
|
||||
llv t1WI[12], VTX_INV_W_VEC($3)
|
||||
vrcph $v22[0], tXPI[1]
|
||||
lpv tLAtI[0], VTX_COLOR_VEC($3) // Load vert color of vertex 3
|
||||
tXPRcpF equ $v23 // Reciprocal of cross product (becomes that * 4)
|
||||
tXPRcpI equ $v24
|
||||
vrcpl tXPRcpF[1], tXPF[1]
|
||||
.if !ENABLE_PROFILING
|
||||
bltz $11, tri_skip_flat_shading // Branch if G_SHADING_SMOOTH is set
|
||||
.endif
|
||||
vrcph tXPRcpI[1], $v31[2] // 0
|
||||
.if !ENABLE_PROFILING
|
||||
lbu $10, rdpHalf1Val + 1 // Original vertex 1
|
||||
lhu $10, (vertexTable)($10)
|
||||
lpv $v25[0], VTX_COLOR_VEC($10) // Load RGB from vertex 4 (flat shading vtx)
|
||||
vlt $v29, $v31, $v31[3] // Set vcc to 11100000
|
||||
vmrg tHAtI, $v25, tHAtI // RGB from $4, alpha from $1
|
||||
vmrg tMAtI, $v25, tMAtI // RGB from $4, alpha from $2
|
||||
vmrg tLAtI, $v25, tLAtI // RGB from $4, alpha from $3
|
||||
tri_skip_flat_shading:
|
||||
.endif
|
||||
// 52 cycles
|
||||
// 49 cycles
|
||||
vrcp $v20[2], tPosMmH[1]
|
||||
lb $20, (alphaCompareCullMode)($zero)
|
||||
vrcph $v22[2], tPosMmH[1]
|
||||
@@ -1471,7 +1474,7 @@ tri_skip_flat_shading:
|
||||
xor $24, $24, $20 // invert sign bit if other cond. Sign bit set -> cull
|
||||
bltz $24, return_and_end_mat // if max < thresh or if min >= thresh.
|
||||
tri_skip_alpha_compare_cull:
|
||||
// 63 cycles
|
||||
// 60 cycles
|
||||
vmudm tPosCatF, tPosCatI, vTRC_1000
|
||||
// no nop if tri_skip_alpha_compare_cull was unaligned
|
||||
vmadn tPosCatI, $v31, $v31[2] // 0
|
||||
@@ -1494,7 +1497,7 @@ tMx1W equ $v27
|
||||
vmadm $v29, tXPRcpI, tXPF
|
||||
mfc2 $16, tXPI[1]
|
||||
vmadn tXPF, tXPRcpF, tXPI
|
||||
lbu $7, textureSettings1 + 2
|
||||
|
||||
vmadh tXPI, tXPRcpI, tXPI
|
||||
lsv tMAtI[14], VTX_SCR_Z($2)
|
||||
vand $v22, $v20, vTRC_FFF8
|
||||
@@ -1504,11 +1507,8 @@ tMx1W equ $v27
|
||||
vmudh $v29, vOne, $v31[4] // 4
|
||||
lsv tLAtF[14], VTX_SCR_Z_FRAC($3)
|
||||
vmadn tXPF, tXPF, $v31[0] // -4
|
||||
ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
|
||||
vmadh tXPI, tXPI, $v31[0] // -4
|
||||
or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
|
||||
vmudn $v29, $v3, tHPos[0]
|
||||
sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
|
||||
vmadl $v29, $v22, tSubPxHF[1]
|
||||
ssv tLPos[2], 0x0002(rdpCmdBufPtr) // Store YL edge coefficient
|
||||
vmadm $v29, tPosCatI, tSubPxHF[1]
|
||||
@@ -1516,17 +1516,17 @@ tMx1W equ $v27
|
||||
vmadn $v2, $v22, tSubPxHI[1]
|
||||
ssv tHPos[2], 0x0006(rdpCmdBufPtr) // Store YH edge coefficient
|
||||
vmadh $v3, tPosCatI, tSubPxHI[1]
|
||||
lw $19, otherMode1
|
||||
ori $11, $6, G_TRI_FILL // Combine geometry mode (only the low byte will matter) with the base triangle type to make the triangle command id
|
||||
tMnWI equ $v27
|
||||
tMnWF equ $v10
|
||||
vrcph $v29[0], tMx1W[0] // Reciprocal of max 1/W = min W
|
||||
andi $10, $16, 0x0080 // Extract the left major flag from $16
|
||||
or $11, $11, $9 // Incorporate whether textures are enabled into the triangle command id
|
||||
vrcpl tMnWF[0], tMx1W[1]
|
||||
or $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
|
||||
lbu $7, textureSettings1 + 2
|
||||
vmudh t1WF, vOne, t1WI[1q]
|
||||
sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
|
||||
sb $11, 0x0000(rdpCmdBufPtr) // Store the triangle command id
|
||||
vrcph tMnWI[0], $v31[2] // 0
|
||||
sb $zero, materialCullMode // This covers tri write out
|
||||
lw $19, otherMode1
|
||||
tSTWHMI equ $v22 // H = elems 0-2, M = elems 4-6; init W = 7FFF
|
||||
tSTWHMF equ $v25
|
||||
vmudh tSTWHMI, vOne, $v31[7] // 0x7FFF
|
||||
@@ -1550,8 +1550,11 @@ tSTWLF equ $v13
|
||||
vmadh tSTWHMI, tSTWHMI, t1WI[0h]
|
||||
ldv tPosLmH[8], 0x0030(rdpCmdBufPtr) // MmHY -> e4, LmHX -> e5, HmMX -> e6
|
||||
vmadn tSTWHMF, $v31, $v31[2] // 0
|
||||
andi $10, $16, 0x0080 // Extract the left major flag from $16
|
||||
vmudm $v29, tSTWLI, t1WF[6] // (S, T, 7FFF) * (1 or <1) for L
|
||||
or $10, $10, $7 // Combine the left major flag with the level and tile from the texture settings
|
||||
vmadh tSTWLI, tSTWLI, t1WI[6]
|
||||
sb $10, 0x0001(rdpCmdBufPtr) // Store the left major flag, level, and tile settings
|
||||
vmadn tSTWLF, $v31, $v31[2] // 0
|
||||
sdv tSTWHMI[0], 0x0020(rdpCmdBufPtr) // Move S, T, W Hi Int to temp mem
|
||||
vmrg tMAtI, tMAtI, tSTWHMI // Merge S, T, W Mid into elems 4-6
|
||||
@@ -1564,7 +1567,7 @@ tSTWLF equ $v13
|
||||
.if !ENABLE_PROFILING
|
||||
addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
|
||||
.endif
|
||||
// 106 cycles
|
||||
// 103 cycles
|
||||
vmudl $v29, tXPF, tXPRcpF
|
||||
lsv tHAtF[14], VTX_SCR_Z_FRAC($1)
|
||||
vmadm $v29, tXPI, tXPRcpF
|
||||
@@ -1574,14 +1577,15 @@ tSTWLF equ $v13
|
||||
vmadh tXPRcpI, tXPI, tXPRcpI
|
||||
addi $2, rdpCmdBufPtr, 0x20 // Increment the triangle pointer by 0x20 bytes (edge coefficients)
|
||||
vmudh tPosLmH, tPosLmH, $v31[0h] // e1 LmHY * -4 = 4*HmLY; e456 MmHY,LmHX,HmMX *= 4
|
||||
andi $3, $6, G_SHADE
|
||||
tAtLmHF equ $v10
|
||||
tAtLmHI equ $v9
|
||||
tAtMmHF equ $v13
|
||||
tAtMmHI equ $v7
|
||||
vsubc tAtLmHF, tLAtF, tHAtF
|
||||
andi $3, $6, G_SHADE
|
||||
vsub tAtLmHI, tLAtI, tHAtI
|
||||
sll $1, $1, 14
|
||||
vsub tAtLmHI, tLAtI, tHAtI
|
||||
sb $zero, materialCullMode // This covers tri write out
|
||||
vsubc tAtMmHF, tMAtF, tHAtF
|
||||
sw $1, 0x0008(rdpCmdBufPtr) // Store XL edge coefficient
|
||||
vsub tAtMmHI, tMAtI, tHAtI
|
||||
@@ -1636,7 +1640,7 @@ tDaDyI equ $v7
|
||||
// DaDe = DaDx * factor
|
||||
tDaDeF equ $v8
|
||||
tDaDeI equ $v9
|
||||
// 135 cycles
|
||||
// 132 cycles
|
||||
vmadl $v29, tDaDxF, $v20[3]
|
||||
sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional)
|
||||
vmadm $v29, tDaDxI, $v20[3]
|
||||
@@ -1677,7 +1681,7 @@ tri_return_from_decal_fix_z:
|
||||
slv tDaDeI[12], 0x08($10) // DzDeI:F
|
||||
bltz dmemAddr, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
|
||||
slv $v10[12], 0x00($10) // ZI:F
|
||||
// 156 cycles
|
||||
// 153 cycles
|
||||
flush_rdp_buffer: // Prereq: dmemAddr = rdpCmdBufPtr - rdpCmdBufEndP1, or dmemAddr = large neg num -> only wait and set DPC_END
|
||||
mfc0 $11, SP_DMA_BUSY // Check if any DMA is in flight
|
||||
lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write
|
||||
@@ -2140,7 +2144,6 @@ vtx_after_dma:
|
||||
vtx_constants_for_clip:
|
||||
// Sets up constants needed for vertex loop, including during clipping.
|
||||
// Results fill vPerm1:4. Uses misc temps.
|
||||
lhu vGeomMid, geometryModeLabel + 1 // Load middle 2 bytes of geom mode
|
||||
.if CFG_NO_OCCLUSION_PLANE
|
||||
llv sFOG[0], (fogFactor - altBase)(altBaseReg) // Load fog multiplier 0 and offset 1
|
||||
ldv sVPO[0], (viewport + 8)($zero) // Load vtrans duplicated in 0-3 and 4-7
|
||||
@@ -2612,6 +2615,7 @@ tri_snake_over_input_buffer:
|
||||
j displaylist_dma_tri_snake // inputBufferPos is now 0; load whole buffer
|
||||
li nextRA, tri_snake_ret_from_input_buffer
|
||||
tri_snake_ret_from_input_buffer:
|
||||
li $ra, tri_snake_loop // Clobbered by DMA. Putting this in the loop saves an instruction but loop takes 1 more cycle per tri.
|
||||
j tri_snake_loop_from_input_buffer // inputBufferPos pointing to first byte loaded
|
||||
lbu $3, (inputBufferEnd)(inputBufferPos) // Load c; clear real index b sign bit -> don't exit
|
||||
|
||||
@@ -2913,12 +2917,13 @@ G_SETSCISSOR_handler: // $1 is 0 if jumped here
|
||||
j G_RDP_handler // Send the command to the RDP
|
||||
sw cmd_w1_dram, (scissorBottomRight)($1) // otherMode1 = scissorBottomRight + 8
|
||||
|
||||
G_GEOMETRYMODE_handler: // 5; $7 = G_GEOMETRYMODE (as negative) if jumped here
|
||||
lw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // load the geometry mode value
|
||||
G_GEOMETRYMODE_handler:
|
||||
lw $11, geometryModeLabel // load the geometry mode value
|
||||
and $11, $11, cmd_w0 // clears the flags in cmd_w0 (set in g*SPClearGeometryMode)
|
||||
or $11, $11, cmd_w1_dram // sets the flags in cmd_w1_dram (set in g*SPSetGeometryMode)
|
||||
sw $11, geometryModeLabel // update the geometry mode value
|
||||
j run_next_DL_command // run the next DL command
|
||||
sw $11, (geometryModeLabel + (0x100 - G_GEOMETRYMODE))($7) // update the geometry mode value
|
||||
lsr vGeomMid, $11, 8 // Middle 2 bytes of geom mode to lower 16 bits
|
||||
|
||||
G_TEXTURE_handler: // 4
|
||||
li $11, textureSettings1 - (texrectWord1 - G_TEXRECTFLIP_handler) // Calculate the offset from texrectWord1 and $11 for saving to textureSettings
|
||||
|
||||
Reference in New Issue
Block a user