From eaaa2fbcf55ee0a2a3d836993a1a24da65e96814 Mon Sep 17 00:00:00 2001 From: Sauraen Date: Fri, 12 Apr 2024 22:56:32 -0700 Subject: [PATCH] Added performance results --- README.md | 85 ++++++++++++++++++++++++++++++++++++------------------- f3dex3.s | 10 ++++++- 2 files changed, 65 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 67e5f00..698b940 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,8 @@ framerate: - This only applies to vertex processing, not triangle processing or other miscellaneous microcode tasks. So the total RSP cycles spent doing useful work during the frame is only modestly increased. +- The increase in time is only RSP cycles; there is no additional memory + traffic, so the RDP time is not directly affected. - In scenes which are complex enough to fill the RSP->RDP FIFO in DRAM, the RSP usually spends a significant fraction of time waiting for the FIFO to not be full (as revealed by the F3DEX3 performance counters, see below). In these @@ -184,13 +186,13 @@ faster version based on the same algorithms as F3DEX2. This removes: However, it retains all other F3DEX3 features: - 56 verts, 9 directional lights -- Occlusion plane (optional, see below) +- Occlusion plane (optional with NOC configuration) - Z attribute offsets - All features not related to vertex/lighting: auto-batched rendering, packed 5 triangles commands, hints system, etc. -The performance of F3DEX3 vertex processing with LVP and NOC is almost the same -as that of F3DEX2; see the Performance Results section below. +The performance of F3DEX3 vertex processing with both LVP and NOC is almost the +same as that of F3DEX2; see the Performance Results section below. ### Profiling @@ -252,16 +254,36 @@ Some ways to use this for debugging are: ## Performance Results -Vertex pipeline cycles per vertex pair in steady state. Hand-counted timings -taking into account all pipeline stalls, but not instruction alignment. +Vertex pipeline cycles per **vertex pair** in steady state. Hand-counted timings +taking into account all pipeline stalls and all dual-issue conditions except for +instruction alignment. -| Microcode | No Lighting | First Dir Lt | Second Dir Lt | -|----------------|-------------|--------------|---------------| -| F3DEX3 | -| F3DEX3_NOC | -| F3DEX3_LVP | -| F3DEX3_LVP_NOC | -| F3DEX2 | 54 | 19 | 3 | +| Microcode | No Lighting | First Dir Lt | Total for 1 Dir Lt | Extra Dir Lts | +|----------------|-------------|--------------|--------------------|---------------| +| F3DEX3 | 97 | 103 | 200 | 29 | +| F3DEX3_NOC | 79 | 103 | 182 | 29 | +| F3DEX3_LVP | 80 | 15 | 95 | 7 | +| F3DEX3_LVP_NOC | 62 | 15 | 77 | 7 | +| F3DEX2 | 54 | 19 | 73 | 3 then 12 | + +Vertex processing time as reported by the performance counter in the `PA` +configuration. +- Scene 1: Kakariko, adult day, from DMT entrance +- Scene 2: Custom empty scene with Suzanne monkey head with 1 dir light +- Scene 3: Same but Suzanne has vertex colors instead of lighting (Link is still + on screen and has lighting) + +| Microcode | Scene 1 | Scene 2 | Scene 3 | +|----------------|---------|---------|---------| +| F3DEX3 | 7.64ms | 3.13ms | 2.37ms | +| F3DEX3_NOC | 7.07ms | 2.89ms | 2.14ms | +| F3DEX3_LVP | 4.57ms | 1.77ms | 1.67ms | +| F3DEX3_LVP_NOC | 3.96ms | 1.52ms | 1.41ms | +| F3DEX2 | No* | No* | No* | +| Vertex count | 3664 | 1608 | 1608 | + +*F3DEX2 does not contain performance counters, so the portion of the RSP time +taken for vertex processing cannot be measured. ## Porting Your Romhack Codebase to F3DEX3 @@ -484,12 +506,7 @@ always use the new encoding. ### Vertex Processing RSP Time -The vertex processing algorithm in F3DEX3 is redesigned compared to F3DEX2, -which enables several of the new graphical features in F3DEX3 as well as the -56 vertex buffer. With the new algorithm, the RSP takes significantly longer to -process vertices in F3DEX3, especially vertices without lighting or with a very -small number of directional lights. Note that this is RSP cycles only, not RDP -cycles or DRAM traffic. +See the Microcode Configuration and Performance Results sections above. ### Overlay 4 @@ -560,11 +577,15 @@ It is recommended to use `G_NORMALS_MODE_FAST` (the default) for most things, and use `G_NORMALS_MODE_AUTO` only for objects while they currently have a nonuniform scale (e.g. Mario only while he is squashed). +Note that in the LVP configuration, lighting is computed in model space by +transforming light directions into model space with M transpose, like in F3DEX2. +Thus there is no mIT matrix and the SPNormalsMode setting is ignored. + ### Optimizing for RSP code size -A number of over-zealous optimizations in F3DEX2 which saved a few cycles but -took several more instructions have been removed. This has a very small impact -on overall RSP time and no impact on RDP time. +A number of optimizations in F3DEX2 which saved a few cycles but took several +more instructions have been removed. Outside of vertex processing, these have a +very small impact on overall RSP time and no impact on RDP time. ### Far clipping removal @@ -578,22 +599,25 @@ The removal of far clipping saved a bunch of DMEM space, and enabled other changes to the clipping implementation which saved even more DMEM space. NoN (No Nearclipping) is also mandatory in F3DEX3, though this was already the -microcode option used in OoT. +microcode option used in OoT. Note that tris are still clipped at the camera +plane; nearclipping means they are clipped at the nearplane, which is a short +distance in front of the camera plane. ### Removal of scaled vertex normals A few clever romhackers figured out that you could shrink the normals on verts in your mesh (so their length is less than "1") to make the lighting on those -verts dimmer and create a version of ambient occlusion. F3DEX3 normalizes vertex -normals after transforming them, which is required for most features of the -lighting system including packed normals, so this no longer works. However, -F3DEX3 has support for ambient occlusion via vertex alpha, which accomplishes -the same goal with some extra benefits: +verts dimmer and create a version of ambient occlusion. In the base vertex +pipeline, F3DEX3 normalizes vertex normals after transforming them, which is +required for most features of the lighting system including packed normals, so +this no longer works. However, F3DEX3 has support for ambient occlusion via +vertex alpha, which accomplishes the same goal with some extra benefits: - Much easier to create: just paint the vertex alpha in Blender / fast64. The scaled normals approach was not supported in fast64 and had to be done with scripts or by hand. -- The amount of ambient occlusion in F3DEX3 can be set at runtime based on scene - lighting, whereas the scaled normals approach is baked into the mesh. +- The amount of ambient occlusion in F3DEX3 can be set at runtime based on + variable scene lighting, whereas the scaled normals approach is baked into the + mesh. - F3DEX3 can have the vertex alpha affect ambient, directional, and point lights by different amounts, which is not possible with scaled normals. In fact, scaled normals never affect the ambient light, contrary to the concept of @@ -607,6 +631,9 @@ F3DEX3 will fix the normals' scale but then apply the AO. The only case where scaled normals work but F3DEX3 AO doesn't work is for meshes with vertex alpha actually used for transparency (therefore also no fog). +Note that in LVP mode, scaled normals are supported and work the same way as in +F3DEX2, while ambient occlusion is not supported. + ### RDP temporary buffers shrinking In FIFO versions of F3DEX2, there are two DMEM buffers to hold RDP commands diff --git a/f3dex3.s b/f3dex3.s index 431d79b..936d398 100644 --- a/f3dex3.s +++ b/f3dex3.s @@ -1416,7 +1416,7 @@ sWRL equ $v25 // vtx_store W Reciprocal Low | IMPORTANT: Can be the same reg as sWRH equ $v26 // vtx_store W Reciprocal High | using different ones saves one cycle delay vmudl $v29, vPairTPosF, $v30[3] // Persp norm move secondVtxPos, outputVtxPos // Second and output vertices write to same mem... - vmadm s1WH, vPairTPosI, $v30[3] // Persp norm + vmadm s1WH, vPairTPosI, $v30[3] // Persp norm bltz $1, @@skipsecond // ...if < 0 verts remain, ... vmadn s1WL, $v31, $v31[2] // 0 addi secondVtxPos, outputVtxPos, vtxSize // ...otherwise, second vtx is next vtx @@ -3038,6 +3038,7 @@ lt_loop: lpv vCCC[4], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 4-6 lbu $11, (ltBufOfs + 3 - lightSize)(curLight) // Light type / constant attenuation beq curLight, altBaseReg, lt_post + // nop vmrg vAAA, vAAA, vCCC // vAAA = light direction bnez $11, lt_point luv vDDD, (ltBufOfs + 0 - lightSize)(curLight) // Light color @@ -3045,6 +3046,7 @@ lt_loop: vmulf vAAA, vAAA, vPairNrml // Light dir * normalized normals vmudh $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15) vmadm vCCC, vPairRGBA, $v30[1] // + (alpha - 1) * aoDir factor; elems 3, 7 + // vnop vmudh $v29, vOne, vAAA[0h] vmadh $v29, vOne, vAAA[1h] vmadh vAAA, vOne, vAAA[2h] @@ -3060,9 +3062,11 @@ lt_finish_light: vxor vAAA, vAAA, $v31[7] // = 0x7FFF - result lt_skip_specular: vge vAAA, vAAA, $v31[2] // 0; clamp dot product to >= 0 + // vnop; vnop; vnop vmudm $v29, vAAA, vBBB[2h] // Dot product int * scale frac vmadh vAAA, vAAA, vBBB[3h] // Dot product int * scale int, clamp to 0x7FFF addi curLight, curLight, -lightSize + // vnop; vnop vmudh $v29, vOne, vPairLt // Load accum mid with current light level j lt_loop vmacf vPairLt, vDDD, vAAA[0h] // + light color * dot product @@ -3078,6 +3082,7 @@ vLtAOut equ $v26 // = vDDD: light / effects alpha output andi $11, $5, G_LIGHTTOALPHA >> 8 andi $20, $5, G_PACKED_NORMALS >> 8 andi $10, $5, G_TEXTURE_GEN >> 8 + // nop vmulf vLtRGBOut, vPairRGBA, vPairLt // RGB output is RGB * light beqz $11, lt_skip_cel vcopy vLtAOut, vPairRGBA // Alpha output = vertex alpha (only 3, 7 matter) @@ -3203,6 +3208,7 @@ lt_normalize: vreadacc vDDD, ACC_MIDDLE vreadacc vCCC, ACC_UPPER mtc2 $11, vPairLt[6] // Constant frac part in elem 3 + // vnop; vnop vmudm $v29, vOne, vDDD[2h] // Sum of squared components vmadh $v29, vOne, vCCC[2h] srl $11, $24, 5 // Top 3 bits @@ -3214,6 +3220,7 @@ lt_normalize: ori $20, $20, 0x20 // Append leading 1 to mantissa vmadh vCCC, vCCC, vOne sllv $20, $20, $11 // Left shift to create floating point + // vnop; vnop; vnop vrsqh $v29[2], vCCC[0] // High input, garbage output sll $20, $20, 8 // Min range 00002000, 00002100... 00003F00, max 00100000...001F8000 vrsql $v29[1], vDDD[0] // Low input, low output @@ -3224,6 +3231,7 @@ lt_normalize: vrsql $v29[5], vDDD[4] // Low input, low output vrsqh $v29[4], $v31[2] // 0 input, high output mtc2 $20, vCCC[6] // Quadratic frac part in elem 3 + // vnop; vnop; vnop vmudn vBBB, vBBB, $v29[0h] // Vec frac * int scaling, discard result srl $20, $20, 16 vmadm vBBB, vAAA, $v29[1h] // Vec int * frac scaling, discard result