From eaaa2fbcf55ee0a2a3d836993a1a24da65e96814 Mon Sep 17 00:00:00 2001
From: Sauraen <sauraen@gmail.com>
Date: Fri, 12 Apr 2024 22:56:32 -0700
Subject: [PATCH] Added performance results

---
 README.md | 85 ++++++++++++++++++++++++++++++++++++-------------------
 f3dex3.s  | 10 ++++++-
 2 files changed, 65 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 67e5f00..698b940 100644
--- a/README.md
+++ b/README.md
@@ -158,6 +158,8 @@ framerate:
 - This only applies to vertex processing, not triangle processing or other
   miscellaneous microcode tasks. So the total RSP cycles spent doing useful work
   during the frame is only modestly increased.
+- The increase in time is only RSP cycles; there is no additional memory
+  traffic, so the RDP time is not directly affected.
 - In scenes which are complex enough to fill the RSP->RDP FIFO in DRAM, the RSP
   usually spends a significant fraction of time waiting for the FIFO to not be
   full (as revealed by the F3DEX3 performance counters, see below). In these
@@ -184,13 +186,13 @@ faster version based on the same algorithms as F3DEX2. This removes:
 
 However, it retains all other F3DEX3 features:
 - 56 verts, 9 directional lights
-- Occlusion plane (optional, see below)
+- Occlusion plane (optional; removed by the NOC configuration)
 - Z attribute offsets
 - All features not related to vertex/lighting: auto-batched rendering, packed 5
   triangles commands, hints system, etc.
 
-The performance of F3DEX3 vertex processing with LVP and NOC is almost the same
-as that of F3DEX2; see the Performance Results section below.
+The performance of F3DEX3 vertex processing with both LVP and NOC is almost the
+same as that of F3DEX2; see the Performance Results section below.
 
 ### Profiling
 
@@ -252,16 +254,36 @@ Some ways to use this for debugging are:
 
 ## Performance Results
 
-Vertex pipeline cycles per vertex pair in steady state. Hand-counted timings
-taking into account all pipeline stalls, but not instruction alignment.
+Vertex pipeline cycles per **vertex pair** in steady state. Hand-counted timings
+taking into account all pipeline stalls and all dual-issue conditions except for
+instruction alignment.
 
-| Microcode      | No Lighting | First Dir Lt | Second Dir Lt |
-|----------------|-------------|--------------|---------------|
-| F3DEX3         |
-| F3DEX3_NOC     |
-| F3DEX3_LVP     |
-| F3DEX3_LVP_NOC |
-| F3DEX2         | 54          | 19           | 3             |
+| Microcode      | No Lighting | First Dir Lt | Total for 1 Dir Lt | Extra Dir Lts |
+|----------------|-------------|--------------|--------------------|---------------|
+| F3DEX3         | 97          | 103          | 200                | 29            |
+| F3DEX3_NOC     | 79          | 103          | 182                | 29            |
+| F3DEX3_LVP     | 80          | 15           | 95                 | 7             |
+| F3DEX3_LVP_NOC | 62          | 15           | 77                 | 7             |
+| F3DEX2         | 54          | 19           | 73                 | 3 then 12     |
+
+Vertex processing time as reported by the performance counter in the `PA`
+configuration.
+- Scene 1: Kakariko, adult day, from DMT entrance
+- Scene 2: Custom empty scene with Suzanne monkey head with 1 dir light
+- Scene 3: Same as Scene 2, but Suzanne has vertex colors instead of lighting
+  (Link is still on screen and has lighting)
+
+| Microcode      | Scene 1 | Scene 2 | Scene 3 |
+|----------------|---------|---------|---------|
+| F3DEX3         | 7.64ms  | 3.13ms  | 2.37ms  |
+| F3DEX3_NOC     | 7.07ms  | 2.89ms  | 2.14ms  |
+| F3DEX3_LVP     | 4.57ms  | 1.77ms  | 1.67ms  |
+| F3DEX3_LVP_NOC | 3.96ms  | 1.52ms  | 1.41ms  |
+| F3DEX2         | N/A*    | N/A*    | N/A*    |
+| Vertex count   | 3664    | 1608    | 1608    |
+
+*F3DEX2 does not contain performance counters, so the portion of the RSP time
+taken for vertex processing cannot be measured.
 
 
 ## Porting Your Romhack Codebase to F3DEX3
@@ -484,12 +506,7 @@ always use the new encoding.
 
 ### Vertex Processing RSP Time
 
-The vertex processing algorithm in F3DEX3 is redesigned compared to F3DEX2,
-which enables several of the new graphical features in F3DEX3 as well as the
-56 vertex buffer. With the new algorithm, the RSP takes significantly longer to
-process vertices in F3DEX3, especially vertices without lighting or with a very
-small number of directional lights. Note that this is RSP cycles only, not RDP
-cycles or DRAM traffic.
+See the Microcode Configuration and Performance Results sections above.
 
 ### Overlay 4
 
@@ -560,11 +577,15 @@ It is recommended to use `G_NORMALS_MODE_FAST` (the default) for most things,
 and use `G_NORMALS_MODE_AUTO` only for objects while they currently have a
 nonuniform scale (e.g. Mario only while he is squashed).
 
+Note that in the LVP configuration, lighting is computed in model space by
+transforming light directions into model space with M transpose, like in F3DEX2.
+Thus there is no mIT matrix and the SPNormalsMode setting is ignored.
+
 ### Optimizing for RSP code size
 
-A number of over-zealous optimizations in F3DEX2 which saved a few cycles but
-took several more instructions have been removed. This has a very small impact
-on overall RSP time and no impact on RDP time.
+A number of optimizations in F3DEX2 which saved a few cycles but took several
+more instructions have been removed. Outside of vertex processing, these
+removals have a very small impact on overall RSP time and no impact on RDP time.
 
 ### Far clipping removal
 
@@ -578,22 +599,25 @@ The removal of far clipping saved a bunch of DMEM space, and enabled other
 changes to the clipping implementation which saved even more DMEM space.
 
 NoN (No Nearclipping) is also mandatory in F3DEX3, though this was already the
-microcode option used in OoT.
+microcode option used in OoT. Note that tris are still clipped at the camera
+plane; nearclipping means they are clipped at the nearplane, which is a short
+distance in front of the camera plane.
 
 ### Removal of scaled vertex normals
 
 A few clever romhackers figured out that you could shrink the normals on verts
 in your mesh (so their length is less than "1") to make the lighting on those
-verts dimmer and create a version of ambient occlusion. F3DEX3 normalizes vertex
-normals after transforming them, which is required for most features of the
-lighting system including packed normals, so this no longer works. However,
-F3DEX3 has support for ambient occlusion via vertex alpha, which accomplishes
-the same goal with some extra benefits:
+verts dimmer and create a version of ambient occlusion. In the base vertex
+pipeline, F3DEX3 normalizes vertex normals after transforming them, which is
+required for most features of the lighting system including packed normals, so
+this no longer works. However, F3DEX3 has support for ambient occlusion via
+vertex alpha, which accomplishes the same goal with some extra benefits:
 - Much easier to create: just paint the vertex alpha in Blender / fast64. The
   scaled normals approach was not supported in fast64 and had to be done with
   scripts or by hand.
-- The amount of ambient occlusion in F3DEX3 can be set at runtime based on scene
-  lighting, whereas the scaled normals approach is baked into the mesh.
+- The amount of ambient occlusion in F3DEX3 can be set at runtime based on
+  variable scene lighting, whereas the scaled normals approach is baked into the
+  mesh.
 - F3DEX3 can have the vertex alpha affect ambient, directional, and point lights
   by different amounts, which is not possible with scaled normals. In fact,
   scaled normals never affect the ambient light, contrary to the concept of
@@ -607,6 +631,9 @@ F3DEX3 will fix the normals' scale but then apply the AO.
 The only case where scaled normals work but F3DEX3 AO doesn't work is for meshes
 with vertex alpha actually used for transparency (therefore also no fog).
 
+Note that in the LVP configuration, scaled normals are supported and work the
+same way as in F3DEX2, while ambient occlusion is not supported.
+
 ### RDP temporary buffers shrinking
 
 In FIFO versions of F3DEX2, there are two DMEM buffers to hold RDP commands
diff --git a/f3dex3.s b/f3dex3.s
index 431d79b..936d398 100644
--- a/f3dex3.s
+++ b/f3dex3.s
@@ -1416,7 +1416,7 @@ sWRL equ $v25 // vtx_store W Reciprocal Low  | IMPORTANT: Can be the same reg as
 sWRH equ $v26 // vtx_store W Reciprocal High | using different ones saves one cycle delay
     vmudl   $v29, vPairTPosF, $v30[3] // Persp norm
     move    secondVtxPos, outputVtxPos          // Second and output vertices write to same mem...
-    vmadm  s1WH, vPairTPosI, $v30[3] // Persp norm
+    vmadm   s1WH, vPairTPosI, $v30[3] // Persp norm
     bltz    $1, @@skipsecond                    // ...if < 0 verts remain, ...
      vmadn  s1WL, $v31, $v31[2] // 0
     addi    secondVtxPos, outputVtxPos, vtxSize // ...otherwise, second vtx is next vtx
@@ -3038,6 +3038,7 @@ lt_loop:
     lpv     vCCC[4], (ltBufOfs + 8 - lightSize)(curLight) // Light or lookat 0 dir in elems 4-6
     lbu     $11,     (ltBufOfs + 3 - lightSize)(curLight) // Light type / constant attenuation
     beq     curLight, altBaseReg, lt_post
+     // nop
      vmrg   vAAA, vAAA, vCCC                            // vAAA = light direction
     bnez    $11, lt_point
      luv    vDDD,    (ltBufOfs + 0 - lightSize)(curLight) // Light color
@@ -3045,6 +3046,7 @@ lt_loop:
     vmulf   vAAA, vAAA, vPairNrml // Light dir * normalized normals
     vmudh   $v29, vOne, $v31[7] // Load accum mid with 0x7FFF (1 in s.15)
     vmadm   vCCC, vPairRGBA, $v30[1] // + (alpha - 1) * aoDir factor; elems 3, 7
+    // vnop
     vmudh   $v29, vOne, vAAA[0h]
     vmadh   $v29, vOne, vAAA[1h]
     vmadh   vAAA, vOne, vAAA[2h]
@@ -3060,9 +3062,11 @@ lt_finish_light:
     vxor    vAAA, vAAA, $v31[7] // = 0x7FFF - result
 lt_skip_specular:
     vge     vAAA, vAAA, $v31[2] // 0; clamp dot product to >= 0
+    // vnop; vnop; vnop
     vmudm   $v29, vAAA, vBBB[2h] // Dot product int * scale frac
     vmadh   vAAA, vAAA, vBBB[3h] // Dot product int * scale int, clamp to 0x7FFF
     addi    curLight, curLight, -lightSize
+    // vnop; vnop
     vmudh   $v29, vOne, vPairLt // Load accum mid with current light level
     j       lt_loop
      vmacf  vPairLt, vDDD, vAAA[0h] // + light color * dot product
@@ -3078,6 +3082,7 @@ vLtAOut    equ $v26 // = vDDD: light / effects alpha output
     andi    $11, $5, G_LIGHTTOALPHA >> 8
     andi    $20, $5, G_PACKED_NORMALS >> 8
     andi    $10, $5, G_TEXTURE_GEN >> 8
+    // nop
     vmulf   vLtRGBOut, vPairRGBA, vPairLt  // RGB output is RGB * light
     beqz    $11, lt_skip_cel
      vcopy  vLtAOut, vPairRGBA             // Alpha output = vertex alpha (only 3, 7 matter)
@@ -3203,6 +3208,7 @@ lt_normalize:
     vreadacc vDDD, ACC_MIDDLE
     vreadacc vCCC, ACC_UPPER
     mtc2    $11, vPairLt[6] // Constant frac part in elem 3
+    // vnop; vnop
     vmudm   $v29, vOne, vDDD[2h] // Sum of squared components
     vmadh   $v29, vOne, vCCC[2h]
     srl     $11, $24, 5 // Top 3 bits
@@ -3214,6 +3220,7 @@ lt_normalize:
     ori     $20, $20, 0x20 // Append leading 1 to mantissa
     vmadh   vCCC, vCCC, vOne
     sllv    $20, $20, $11 // Left shift to create floating point
+    // vnop; vnop; vnop
     vrsqh   $v29[2], vCCC[0] // High input, garbage output
     sll     $20, $20, 8 // Min range 00002000, 00002100... 00003F00, max 00100000...001F8000
     vrsql   $v29[1], vDDD[0] // Low input, low output
@@ -3224,6 +3231,7 @@ lt_normalize:
     vrsql   $v29[5], vDDD[4] // Low input, low output
     vrsqh   $v29[4], $v31[2] // 0 input, high output
     mtc2    $20, vCCC[6] // Quadratic frac part in elem 3
+    // vnop; vnop; vnop
     vmudn   vBBB, vBBB, $v29[0h] // Vec frac * int scaling, discard result
     srl     $20, $20, 16
     vmadm   vBBB, vAAA, $v29[1h] // Vec int * frac scaling, discard result