diff --git a/docs/Documentation/Configuration.md b/docs/Documentation/Configuration.md index 471edd5..cc49bad 100644 --- a/docs/Documentation/Configuration.md +++ b/docs/Documentation/Configuration.md @@ -74,8 +74,8 @@ However, it retains all other F3DEX3 features: - All features not related to vertex/lighting: auto-batched rendering, packed 5 triangles commands, hints system, etc. -The performance of F3DEX3 vertex processing with both LVP and NOC is almost the -same as that of F3DEX2; see the Performance Results section below. +The performance of F3DEX3 vertex processing with both LVP and NOC is nearly +identical to that of F3DEX2; see the Performance page. ## Profiling diff --git a/docs/Documentation/Performance.md b/docs/Documentation/Performance.md index 04fecbd..4fbdd12 100644 --- a/docs/Documentation/Performance.md +++ b/docs/Documentation/Performance.md @@ -1,30 +1,53 @@ @page performance Performance Results +# Philosophy + +The base version of F3DEX3 was created for RDP bound games like OoT, where new +visual effects are desired and increasing the RSP time a bit does not affect the +overall performance. F3DEX3_LVP_NOC was created for RSP bound games. + # Performance Results -Cycle counts; lower is better. These are hand-counted timings taking into -account all pipeline stalls and all dual-issue conditions. Instruction alignment -is sometimes taken into account, otherwise assumed to be optimal. +These are cycle counts for all the critical paths in the microcode. Lower is +better. The timings are hand-counted taking into account all pipeline stalls and +all dual-issue conditions. Instruction alignment is sometimes taken into +account, otherwise assumed to be optimal. Vertex / lighting numbers assume no special features (texgen, packed normals, -etc.) Tri numbers assume texture, shade, and Z. Empty cells are "not measured -yet". +etc.) Tri numbers assume texture, shade, and Z. All numbers assume default +profiling configuration. Empty cells are "not measured yet". 
-| | F3DEX2 | F3DEX3_LVP_NOC | F3DEX3_LVP | F3DEX3_NOC | F3DEX3 | -|-----------------------|--------|----------------|------------|------------|--------| -| Vtx pair, no lighting | 54 | 54 | 81 | 79 | 98 | -| Vtx pair, 0 dir lts | Can't | | | | | -| Vtx pair, 1 dir lt | 73 | 70 | 96 | 182 | 201 | -| Vtx pair, 2 dir lts | 76 | 77 | 103 | 211 | 230 | -| Vtx pair, 3 dir lts | 88 | 84 | 110 | 240 | 259 | -| Vtx pair, 4 dir lts | 91 | 91 | 117 | 269 | 288 | -| Vtx pair, 5 dir lts | 103 | 98 | 124 | 298 | 317 | -| Vtx pair, 6 dir lts | 106 | 105 | 131 | 327 | 346 | -| Vtx pair, 7 dir lts | 118 | 112 | 138 | 356 | 375 | -| Vtx pair, 8 dir lts | Can't | 119 | 145 | 385 | 404 | -| Vtx pair, 9 dir lts | Can't | 126 | 152 | 414 | 433 | +| | F3DEX2 | F3DEX3_LVP_NOC | F3DEX3_LVP | F3DEX3_NOC | F3DEX3 | +|----------------------------|--------|----------------|------------|------------|--------| +| Vtx pair, no lighting | 54 | 54 | 81 | 79 | 98 | +| Vtx pair, 0 dir lts | Can't | 64 | | | | +| Vtx pair, 1 dir lt | 73 | 70 | 96 | 182 | 201 | +| Vtx pair, 2 dir lts | 76 | 77 | 103 | 211 | 230 | +| Vtx pair, 3 dir lts | 88 | 84 | 110 | 240 | 259 | +| Vtx pair, 4 dir lts | 91 | 91 | 117 | 269 | 288 | +| Vtx pair, 5 dir lts | 103 | 98 | 124 | 298 | 317 | +| Vtx pair, 6 dir lts | 106 | 105 | 131 | 327 | 346 | +| Vtx pair, 7 dir lts | 118 | 112 | 138 | 356 | 375 | +| Vtx pair, 8 dir lts | Can't | 119 | 145 | 385 | 404 | +| Vtx pair, 9 dir lts | Can't | 126 | 152 | 414 | 433 | +| Command dispatch | 12 | 12 | +| Only/2nd tri to offscreen | 27 | 29 | +| 1st tri to offscreen | 28 | 29 | +| Only/2nd tri to clip | 32 | 31 | +| 1st tri to clip | 33 | 31 | +| Only/2nd tri to backface | 38 | 40 | +| 1st tri to backface | 39 | 40 | +| Only/2nd tri to degenerate | 42 | 42 | +| 1st tri to degenerate | 43 | 42 | +| Only/2nd tri to occluded | Can't | Can't | 49 | Can't | 49 | +| 1st tri to occluded | Can't | Can't | 49 | Can't | 49 | +| Only/2nd tri to draw | 172 | 170 | 171 | 170 | 171 | +| 1st tri 
to draw | 173 | 170 | 171 | 170 | 171 | +Tri numbers are measured from the first cycle of the command handler inclusive, +to the first cycle of whatever is after the return exclusive. This is in order +to capture the extra mfc0 to mfc0 stall due to return_routine in F3DEX2. Vertex processing time as reported by the performance counter in the `PA` diff --git a/f3dex3.s b/f3dex3.s index d0c755d..a70858b 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -2454,6 +2454,7 @@ tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping and $10, $10, $24 // If clipping is enabled, check clip flags vge $v6, $v13, $v8[1] // v6 = max(max(vert1.y, vert2.y), vert3.y), VCO = max(vert1.y, vert2.y) > vert3.y bnez $10, ovl234_clipping_entrypoint // Facing info and occlusion may be garbage if need to clip + // 29 cycles mfc2 $9, $v26[0] // elem 0 = x = cross product => lower 16 bits, sign extended vmrg $v4, $v14, $v8 // v4 = max(vert1.y, vert2.y) > vert3.y : higher(vert1, vert2) ? vert3 (highest vertex of vert1, vert2, vert3) and $5, $5, $7 @@ -2484,9 +2485,11 @@ tri_noinit: // ra is next cmd, second tri in TRI2, or middle of clipping vsub $v15, $v10, $v2 .if !CFG_NO_OCCLUSION_PLANE andi $5, $5, CLIP_OCCLUDED - bnez $5, tri_culled_by_occlusion_plane // Cull if all verts occluded .endif vmudh $v29, $v6, $v8[0] +.if !CFG_NO_OCCLUSION_PLANE + bnez $5, tri_culled_by_occlusion_plane // Cull if all verts occluded +.endif llv $v13[0], VTX_INV_W_VEC($1) vmadh $v29, $v8, $v11[0] lpv tV1AtI[0], VTX_COLOR_VEC($1) // Load vert color of vertex 1 @@ -2699,6 +2702,7 @@ tDaDyI equ $v7 // DaDe = DaDx * factor tDaDeF equ $v8 tDaDeI equ $v9 + // 137 cycles vmadl $v29, tDaDxF, $v20[3] sdv tDaDxF[8], 0x0018($1) // Store DsDx, DtDx, DwDx texture coefficients (fractional) vmadm $v29, tDaDxI, $v20[3] @@ -2752,7 +2756,8 @@ tV1AtFF equ $v10 // 162 cycles .if CFG_NO_OCCLUSION_PLANE || CFG_LEGACY_VTX_PIPE - // If we have room for the extra instructions. Z disabled is rare. 
+ // If we have room for the extra instructions. Z disabled is rare, so the + // extra 8 cycles of finishing the dummy Z write above aren't too much of a problem. no_z_buffer: sdv tV1AtF[0], 0x0010($2) // Store RGBA shade color (fractional) sdv tV1AtI[0], 0x0000($2) // Store RGBA shade color (integer)