mirror of
https://github.com/HackerN64/F3DEX3.git
synced 2026-01-21 10:37:45 -08:00
Updated documentation
This commit is contained in:
@@ -17,7 +17,7 @@ faster overall. Plus, RSP time is also saved for the tris which are not drawn,
|
||||
which can approximately cancel out the extra RSP time for computing the
|
||||
occlusion plane for all vertices.
|
||||
|
||||
## Functionality in Overlay 3
|
||||
## Functionality in overlays
|
||||
|
||||
The following commands are moved to Overlay 2 or 3 in F3DEX3 to save IMEM space.
|
||||
This means that code will have to be loaded from DRAM to run them if a different
|
||||
@@ -124,6 +124,20 @@ does not send the texture coefficients if they are disabled, saving DRAM access
|
||||
time for RSP -> FIFO and FIFO -> RDP. RDP time savings from avoiding loading a
|
||||
texture are unaffected of course.
|
||||
|
||||
## Yield check timing
|
||||
|
||||
In F3DEX2, the microcode checks whether the CPU has requested that it yield (to
|
||||
run the audio microcode) before running every display list command. F3DEX3 now
|
||||
performs this check every time the input buffer is refilled, which is typically
|
||||
once every 21 commands. The amount by which this delays the start of the audio
|
||||
microcode is typically very small, and worst case during normal conditions would
|
||||
be a few hundred microseconds. However, if the RDP FIFO is full during this
|
||||
time, the microcode will have to wait for the RDP to make progress through its
|
||||
workload to free up space for the outputs of the RSP commands. This will slow
|
||||
down the RSP to the RDP's speed, and since triangles can be arbitrarily large
|
||||
on screen, this can theoretically cause huge stalls. If you ever encounter this
|
||||
in practice, please contact Sauraen.
|
||||
|
||||
## Obscure semantic differences from F3DEX2 that should never matter in practice
|
||||
|
||||
- Changing fog settings--i.e. enabling or disabling `G_FOG` in the geometry mode
|
||||
|
||||
@@ -13,7 +13,8 @@ improve overall game performance from there.
|
||||
These are cycle counts for many key paths in the microcode. Lower numbers are
|
||||
better. The timings are hand-counted taking into account all pipeline stalls and
|
||||
all dual-issue conditions. Instruction alignment after branches is usually taken
|
||||
into account, but in some cases it is assumed to be optimal.
|
||||
into account (especially in F3DEX3), but in some cases it is assumed to be
|
||||
optimal.
|
||||
|
||||
All numbers assume default profiling configuration. Tri numbers assume texture,
|
||||
shade, and Z, and not flushing the buffer. Tri numbers are measured from the
|
||||
@@ -31,21 +32,21 @@ even to an odd number of lights adds a different time than vice versa.
|
||||
|
||||
| | F3DEX2 | F3DEX3_NOC | F3DEX3 |
|
||||
|----------------------------|--------|------------|--------|
|
||||
| Command dispatch | 12 | 12 | 12 |
|
||||
| Small RDP command | 14 | 5 | 5 |
|
||||
| Only/2nd tri to offscreen | 27 | 25 | 25 |
|
||||
| 1st tri to offscreen | 28 | 26 | 26 |
|
||||
| Only/2nd tri to clip | 32 | 30 | 30 |
|
||||
| 1st tri to clip | 33 | 31 | 31 |
|
||||
| Only/2nd tri to backface | 38 | 36 | 36 |
|
||||
| 1st tri to backface | 39 | 37 | 37 |
|
||||
| Only/2nd tri to degenerate | 42 | 38 | 38 |
|
||||
| 1st tri to degenerate | 43 | 39 | 39 |
|
||||
| Only/2nd tri to occluded | Can't | Can't | 42 |
|
||||
| 1st tri to occluded | Can't | Can't | 43 |
|
||||
| Only/2nd tri to draw | 172 | 156 | 158 |
|
||||
| 1st tri to draw | 173 | 157 | 159 |
|
||||
| Tri snake | Can't | * | * |
|
||||
| Command dispatch | 12 | 10 | 10 |
|
||||
| Small RDP command | 14 | 4 | 4 |
|
||||
| Only/2nd tri to offscreen | 27 | 20 | 20 |
|
||||
| 1st tri to offscreen | 28 | 21 | 21 |
|
||||
| Only/2nd tri to clip | 32 | 25 | 25 |
|
||||
| 1st tri to clip | 33 | 26 | 26 |
|
||||
| Only/2nd tri to backface | 38 | 31 | 31 |
|
||||
| 1st tri to backface | 39 | 32 | 32 |
|
||||
| Only/2nd tri to degenerate | 42 | 33 | 33 |
|
||||
| 1st tri to degenerate | 43 | 34 | 34 |
|
||||
| Only/2nd tri to occluded | Can't | Can't | 37 |
|
||||
| 1st tri to occluded | Can't | Can't | 38 |
|
||||
| Only/2nd tri to draw | 172 | 149 | 151 |
|
||||
| 1st tri to draw | 173 | 150 | 152 |
|
||||
| Tri snake | Can't | 10/11* | 10/11* |
|
||||
| Vtx before DMA start | 16 | 17 | 17 |
|
||||
| Vtx pair, no lighting | 54 | 54 | 70 |
|
||||
| Vtx pair, 0 dir lts | Can't | 65 | 81 |
|
||||
@@ -91,110 +92,25 @@ even to an odd number of lights adds a different time than vice versa.
|
||||
|
||||
## Triangle Snake Cycle Counts
|
||||
|
||||
### Very Long Snakes
|
||||
With the recent F3DEX3 updates bringing significant RSP time savings in command
|
||||
dispatch and triangle draw, triangle snakes are unfortunately no longer
|
||||
competitive in RSP time.
|
||||
|
||||
For this section, we assume almost all tris are contained in very long snakes,
|
||||
so the overhead of starting and ending snakes is negligible. This overhead is
|
||||
discussed in the next section.
|
||||
Suppose we have two tris which are offscreen. If drawn with `SP2Triangles`, this
|
||||
is 10 cycles for command dispatch, 21 cycles to cull the first tri, and 20
|
||||
cycles to cull the second, for a total of 51 cycles. If drawn as part of a long
|
||||
triangle snake, the triangle snake processing adds 10 or 11 cycles relative to
|
||||
the `SP2Triangles` first or second triangle respectively. So this is 31 cycles
|
||||
to cull each triangle, for a total of 61 cycles.
|
||||
|
||||
We are assuming that the same set of tris is being drawn with or without snakes.
|
||||
Thus, cycles from `tri_main_from_snake` through the instruction after the return
|
||||
exclusive are not counted here, as they are the same regardless of which method
|
||||
is being used.
|
||||
It gets worse for snakes when counting the overhead of starting and ending a
|
||||
snake, which have also gotten worse with the recent changes bringing triangle
|
||||
performance improvements. I used to have a long discussion here computing
|
||||
estimated performance for switching to snakes, but the numbers have all changed
|
||||
and they were imprecise to begin with. The upshot is that for a typical scene,
|
||||
switching everything from `SP2Triangles` to snakes might save about 70 us of
|
||||
RDRAM/RDP time but cost about 400 us of RSP time.
|
||||
|
||||
For a pair of tris drawn without snakes, i.e. with a single `SP2Triangles`
|
||||
command, the cycles are:
|
||||
- Command dispatch: 12
|
||||
- First tri up to `tri_main_from_snake`: 5
|
||||
- Second tri up to `tri_main_from_snake`: 4
|
||||
- Total: 21
|
||||
|
||||
For a pair of tris which are part of a long snake, the cycles are:
|
||||
- Each tri up to `tri_main_from_snake`: 11
|
||||
- Total: 22
|
||||
|
||||
However, there's also the memory bandwidth savings. The `SP2Triangles` command
|
||||
is 8 bytes and the two tris in a long snake are 2 bytes, so switching to snake
|
||||
saves 6 bytes of bandwidth. Testing has shown that RSP DMAs on average transfer
|
||||
about 2.2 bytes per cycle, though it depends on the length. So this is a savings
|
||||
of about 2.7 cycles of RDRAM / RDP time. Since the DMAs loading this data are
|
||||
input buffer loads, and the RSP stalls waiting for input buffer loads (it does
|
||||
not do useful work during this time), this is also 2.7 cycles of RSP time. This
|
||||
offsets the 1 extra cycle of processing the tri pair above.
|
||||
|
||||
Therefore, switching to snake (assuming very long snakes) saves about 2.7
|
||||
cycles of RDRAM / RDP time and 1.7 cycles of RSP time per two tris, or about
|
||||
0.9 RSP cycles and 1.4 RDRAM cycles per tri.
|
||||
|
||||
### Starting a Snake
|
||||
|
||||
Since a `SPTriSnake` command encodes 5 triangles, for comparison to
|
||||
`SP2Triangles` we will consider the overhead for 10 triangles total / two snake
|
||||
starts.
|
||||
|
||||
For `SPTriSnake`, this is 2 x (12 cycles command dispatch + 4 cycles snake
|
||||
initialization + 5 tris x 11 cycles per tri as discussed above) = 142 RSP
|
||||
cycles. And it is 16 bytes of loads = 7.3 cycles of RDRAM / RDP time and stall
|
||||
RSP time. So the total cost is 149.3 RSP and 7.3 RDRAM cycles.
|
||||
|
||||
For `SP2Triangles`, this is 5 x (21 cycles as discussed above) = 105 RSP cycles.
|
||||
And it is 40 bytes of loads = 18.2 cycles of RDRAM / RDP time and stall RSP
|
||||
time. So the total cost is 123.2 RSP and 18.2 RDRAM cycles.
|
||||
|
||||
But drawing those 10 tris as part of very long snakes would have saved 13.5
|
||||
RDRAM cycles and 8.5 RSP cycles. So the relative cost of drawing these tris as
|
||||
two start-of-snakes instead of in very long snakes is 34.6 RSP cycles and 2.6
|
||||
RDRAM cycles. Thus the cost of each start-of-snake relative to long snakes is
|
||||
17.3 RSP cycles and 1.3 RDRAM cycles.
|
||||
|
||||
### Ending a Snake
|
||||
|
||||
Ending a snake costs 12 cycles of RSP time and has no direct impact on memory
|
||||
traffic. However, calculating the overall performance is more complicated: the
|
||||
snake can end after 1-8 bytes of the `SPContinueSnake` command, and the
|
||||
remaining bytes are "wasted" in that they do not contribute to drawing tris
|
||||
with memory bandwidth savings.
|
||||
|
||||
From a mesh optimization standpoint, this is not an issue. If you have a snake
|
||||
which has filled 8 bytes of the previous `SPContinueSnake` command, and you have
|
||||
another triangle to draw, there are only two cases. If that tri can't be
|
||||
appended to the snake, you have to draw it with a `SP1Triangle` command either
|
||||
way, so there is no performance difference. If it can be appended to the snake,
|
||||
doing so will take 8 bytes of memory traffic--the same as the `SP1Triangle`
|
||||
command. The snake end penalty will have to be paid whether before or after this
|
||||
tri. And it's 11 RSP cycles to draw one more tri in an existing snake, whereas
|
||||
the command dispatch plus second tri code for `SP1Triangle` is 16 cycles. So
|
||||
it's better to continue a snake than to stop it early and use non-snake
|
||||
commands, even if this leads to a mostly empty `SPContinueSnake` command. Of
|
||||
course, if you can fill up even more tris in the command, the performance
|
||||
benefit increases.
|
||||
|
||||
Assuming snake lengths are uniformly distributed, on average a snake will end
|
||||
after 4.5 bytes (the same number of triangles) of a `SPContinueSnake` command.
|
||||
In this case, the command will take 4.5 tris x 11 cycles per tri + 12 cycle end
|
||||
snake penalty = 61.5 RSP cycles, and 8 bytes of memory traffic = 3.6 RDRAM
|
||||
cycles. If these 4.5 tris were instead drawn with `SP2Triangles` commands, that
|
||||
would be 2.25 commands = 47.3 RSP cycles and 18 bytes = 8.2 RDRAM cycles. Thus
|
||||
on average, the snake end costs 14.2 RSP cycles and saves 4.6 RDRAM cycles
|
||||
compared to `SP2Triangles` commands. But drawing those 4.5 tris as part of very
|
||||
long snakes would have saved 3.9 RSP cycles and 6.1 RDRAM cycles. So the average
|
||||
cost of ending a snake relative to very long snakes is 18.1 RSP and 1.5 RDRAM
|
||||
cycles.
|
||||
|
||||
### Example
|
||||
|
||||
Suppose there are 4000 tris on screen. Suppose that 90% of them have been
|
||||
encoded with snakes--the rest are disconnected single tris or tri pairs (quads).
|
||||
That 10% are then encoded with `SP2Triangles` commands, which is the same
|
||||
performance with or without snakes, so we ignore those tris, and there are
|
||||
3600 "snakeable" tris in the scene.
|
||||
|
||||
Suppose that the average snake length is 16, to account for some objects with
|
||||
more contiguous tris with the same material, and others with smaller disjoint
|
||||
parts. Thus, for 3600 tris, there are 225 snakes.
|
||||
|
||||
Switching the 3600 tris from `SP2Triangles` commands to long snakes saves
|
||||
4860 RDRAM cycles and 3060 RSP cycles. However, the 225 snake starts and ends
|
||||
cost 630 RDRAM and 7965 RSP cycles relative to this. So the total performance
|
||||
change of switching to snakes in this case is that the RDRAM / RDP goes faster
|
||||
by 4230 cycles = 68 us, but the RSP goes slower by 4905 cycles = 78 us.
|
||||
However, note that in F3DEX2, `SP2Triangles` to two offscreen triangles is
|
||||
12+28+27 = 67 cycles. F3DEX3 is so much faster than F3DEX2 that even the
|
||||
performance penalty of snakes doesn't outweigh this.
|
||||
|
||||
@@ -147,19 +147,20 @@ This is still an unlikely case though:
|
||||
this is probably poorly optimized to begin with.
|
||||
- When the FIFO fills up, this means wasted RSP time, even if this happens not
|
||||
to conflict with a yield. If the FIFO fills up often and RSP performance is
|
||||
imporant in your game (either for audio or because the graphics are RSP bound),
|
||||
important in your game (either for audio or because the graphics are RSP bound),
|
||||
you should expand the FIFO.
|
||||
- A snake this long is rare in typical low-poly N64 meshes. And, the export tool
|
||||
could limit the maximum snake length generated.
|
||||
|
||||
A future version of F3DEX3 could allow the snake command to yield in the middle.
|
||||
This has not been implemented yet because it is very difficult to validate.
|
||||
Yields are rare relative to display list commands (typically 1-2 of the former
|
||||
and many thousands of the latter per frame). And, until we have a robust F3DEX3
|
||||
mesh optimizer and a game where most things are drawn with snakes (i.e. few
|
||||
vanilla assets left), snakes will also be rare in the display list. So it will
|
||||
be hard to know whether the yield-during-snake codepath is even being run, let
|
||||
alone whether it is correct in all cases.
|
||||
F3DEX3 now checks for yields whenever the input buffer is refilled, not before
|
||||
every command as in F3DEX2. When a triangle snake extends across the boundary of
|
||||
the input buffer, a yield can occur, and F3DEX3 correctly suspends and resumes
|
||||
the triangle snake in this case. So, while triangle snakes can be unlimited in
|
||||
length, because the input buffer is 21 commands = 168 bytes, there is guaranteed
|
||||
to be a yield check at least once every 168 triangles. (Any snake of 8 tris or
|
||||
more can potentially cross the input buffer end and therefore be interrupted.)
|
||||
This guarantee is of little practical use though, as in practice snakes will not
|
||||
be more than about 110 tris due to the vertex buffer size.
|
||||
|
||||
## Comparison with Tiny3D
|
||||
|
||||
|
||||
24
f3dex3.s
24
f3dex3.s
@@ -1412,7 +1412,7 @@ tri_from_clip:
|
||||
and $11, $11, $8
|
||||
vmrg tHPos, $v6, $v4 // v14 = v1.y < v2.y ? v1 : v2 (lower vertex of v1, v2)
|
||||
bnez $11, return_and_end_mat // Then the whole tri is offscreen, cull
|
||||
// 21 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
|
||||
// 16 cycles (for tri2 first tri; tri1/only subtract 1 from counts)
|
||||
vmudh $v29, $v10, $v12[1] // x = (v1 - v2).x * (v1 - v3).y ...
|
||||
vmadh $v26, $v12, $v11[1] // ... + (v1 - v3).x * (v2 - v1).y = cross product = dir tri is facing
|
||||
lhu $24, activeClipPlanes
|
||||
@@ -1428,18 +1428,18 @@ tri_from_clip:
|
||||
and $10, $10, $24 // If clipping is enabled, check clip flags
|
||||
vlt $v29, $v6, $v2 // VCO = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y)
|
||||
bnez $10, ovl234_clipmisc_entrypoint // Facing info and occlusion may be garbage if need to clip
|
||||
// 29 cycles
|
||||
// 24 cycles
|
||||
srl $11, $9, 31 // = 0 if x prod positive (back facing), 1 if x prod negative (front facing)
|
||||
vmudh $v3, vOne, $v31[5] // 0x4000; some rounding factor
|
||||
sllv $11, $20, $11 // Sign bit = bit 10 of geom mode if back facing, bit 9 if front facing
|
||||
vmrg tMPos, $v4, tLPos // v2 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2, vert3) ? highest(vert1, vert2)
|
||||
bltz $11, return_and_end_mat // Cull if bit is set (culled based on facing)
|
||||
// 32 cycles
|
||||
// 27 cycles
|
||||
vmrg tLPos, tLPos, $v4 // v10 = max(vert1.y, vert2.y, vert3.y) < max(vert1.y, vert2.y) : highest(vert1, vert2) ? highest(vert1, vert2, vert3)
|
||||
tSubPxHF equ $v4
|
||||
vmudn tSubPxHF, tHPos, $v31[5] // 0x4000
|
||||
beqz $9, return_and_end_mat // If cross product is 0, tri is degenerate (zero area), cull.
|
||||
// 34 cycles
|
||||
// 29 cycles
|
||||
.if !CFG_NO_OCCLUSION_PLANE
|
||||
and $6, $6, $7
|
||||
.endif
|
||||
@@ -1454,10 +1454,10 @@ tSubPxHF equ $v4
|
||||
vsub tPosHmM, tHPos, tMPos
|
||||
.if !CFG_NO_OCCLUSION_PLANE
|
||||
bnez $6, tri_culled_by_occlusion_plane // Cull if all verts occluded
|
||||
// 38 cycles
|
||||
// 33 cycles
|
||||
.endif
|
||||
mfc2 $1, tHPos[4] // tHPos = lowest Y value = highest on screen (x, y, addr)
|
||||
// 37 cycles if NOC (39 if occlusion plane)
|
||||
// 32 cycles if NOC (34 if occlusion plane)
|
||||
tPosCatI equ $v15 // 0 X L-M; 1 Y L-M; 2 X M-H; 3 X L-H; 4-7 garbage
|
||||
vsub tPosCatI, tLPos, tMPos
|
||||
mfc2 $2, tMPos[4] // tMPos = mid vertex (x, y, addr)
|
||||
@@ -1492,7 +1492,7 @@ tXPRcpI equ $v24
|
||||
.endif
|
||||
vrcph tXPRcpI[1], $v31[2] // 0
|
||||
tri_return_from_flat_shading:
|
||||
// 44 cycles
|
||||
// 43 cycles
|
||||
vrcp $v20[2], tPosMmH[1]
|
||||
ssv tPosMmH[2], 0x0030(rdpCmdBufPtr) // MmHY -> first short (temp mem)
|
||||
vrcph $v22[2], tPosMmH[1]
|
||||
@@ -1514,7 +1514,7 @@ tri_return_from_flat_shading:
|
||||
vmadn $v20, $v31, $v31[2] // 0
|
||||
// $v6 <- tPosMmH; $v6 clobbered in alpha compare cull
|
||||
tri_return_from_alpha_compare_cull: // Uses $v25, $v26
|
||||
// 60 cycles
|
||||
// 53 cycles
|
||||
tPosCatF equ $v25
|
||||
vmudm tPosCatF, tPosCatI, vTRC_1000
|
||||
mtc2 $20, tMPos[14] // 0xFFF8; only elem 0, 1, 2 of this reg used now
|
||||
@@ -1615,7 +1615,7 @@ tSTWHMF equ $v25 // <- tMnWI
|
||||
.if !ENABLE_PROFILING
|
||||
addi perfCounterA, perfCounterA, 1 // Increment number of tris sent to RDP
|
||||
.endif
|
||||
// 103 cycles
|
||||
// 96 cycles
|
||||
vmudl $v29, tXPF, tXPRcpF
|
||||
lsv tHAtF[14], VTX_SCR_Z_FRAC($1)
|
||||
vmadm $v29, tXPI, tXPRcpF
|
||||
@@ -1688,7 +1688,7 @@ tDaDyF equ $v6
|
||||
// DaDe = DaDx * factor
|
||||
tDaDeF equ $v8
|
||||
tDaDeI equ $v9
|
||||
// 132 cycles
|
||||
// 125 cycles
|
||||
vmadl $v29, tDaDxF, $v20[3]
|
||||
sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional)
|
||||
vmadm $v29, tDaDxI, $v20[3]
|
||||
@@ -1713,7 +1713,7 @@ tDaDeI equ $v9
|
||||
// All values start in element 7. "a", attribute, is Z. Need
|
||||
// tHAtI, tHAtF, tDaDxI, tDaDxF, tDaDeI, tDaDeF, tDaDyI, tDaDyF
|
||||
// VCC is still 11110001
|
||||
// 145 cycles
|
||||
// 135 cycles
|
||||
vmrg tDaDyI, tDaDyF, tDaDyI[7] // Elems 6-7: DzDyI:F
|
||||
beqz $19, tri_decal_fix_z
|
||||
vmrg tDaDxI, tDaDxF, tDaDxI[7] // Elems 6-7: DzDxI:F
|
||||
@@ -1730,7 +1730,7 @@ tri_return_from_decal_fix_z:
|
||||
slv tDaDeI[12], 0x08($10) // DzDeI:F
|
||||
bltz dmemAddr, return_and_end_mat // Return if rdpCmdBufPtr < end+1 i.e. ptr <= end
|
||||
slv $v10[12], 0x00($10) // ZI:F
|
||||
// 153 cycles
|
||||
// 146 cycles
|
||||
flush_rdp_buffer: // Prereq: dmemAddr = rdpCmdBufPtr - rdpCmdBufEndP1, or dmemAddr = large neg num -> only wait and set DPC_END
|
||||
mfc0 $11, SP_DMA_BUSY // Check if any DMA is in flight
|
||||
lw cmd_w1_dram, rdpFifoPos // FIFO pointer = end of RDP read, start of RSP write
|
||||
|
||||
Reference in New Issue
Block a user