From a10d3dbe06b2ff97fbc4df8c342ac5e5435fe410 Mon Sep 17 00:00:00 2001 From: Sauraen Date: Sun, 17 Nov 2024 22:30:52 -0800 Subject: [PATCH] Updated documentation --- Makefile | 23 +++------- README.md | 20 +++++---- docs/Documentation/Backwards Compatibility.md | 5 +++ docs/Documentation/Configuration.md | 44 ++++++++----------- docs/Documentation/Design Tradeoffs.md | 17 +++---- docs/Documentation/Performance.md | 17 ++++--- docs/Documentation/Porting your Romhack.md | 6 ++- .../{Minimal Scanlines.md => Removed.md} | 16 ++++++- docs/documentation.md | 4 +- 9 files changed, 76 insertions(+), 76 deletions(-) rename docs/Documentation/{Minimal Scanlines.md => Removed.md} (85%) diff --git a/Makefile b/Makefile index 683f0c9..feb762a 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,6 @@ ALL_OPTIONS := \ CFG_G_BRANCH_W \ CFG_NO_OCCLUSION_PLANE \ CFG_LEGACY_VTX_PIPE \ - CFG_EXTRA_PRECISION \ CFG_PROFILING_A \ CFG_PROFILING_B \ CFG_PROFILING_C @@ -157,24 +156,14 @@ define rule_builder_prof $$(eval $$(call rule_builder_final)) endef -define rule_builder_xp - NAME_PROF := $(NAME_XP) - OPTIONS_PROF := $(OPTIONS_XP) - $$(eval $$(call rule_builder_prof)) - - NAME_PROF := $(NAME_XP)_XP - OPTIONS_PROF := $(OPTIONS_XP) CFG_EXTRA_PRECISION - $$(eval $$(call rule_builder_prof)) -endef - define rule_builder_noc - NAME_XP := $(NAME_NOC) - OPTIONS_XP := $(OPTIONS_NOC) - $$(eval $$(call rule_builder_xp)) + NAME_PROF := $(NAME_NOC) + OPTIONS_PROF := $(OPTIONS_NOC) + $$(eval $$(call rule_builder_prof)) - NAME_XP := $(NAME_NOC)_NOC - OPTIONS_XP := $(OPTIONS_NOC) CFG_NO_OCCLUSION_PLANE - $$(eval $$(call rule_builder_xp)) + NAME_PROF := $(NAME_NOC)_NOC + OPTIONS_PROF := $(OPTIONS_NOC) CFG_NO_OCCLUSION_PLANE + $$(eval $$(call rule_builder_prof)) endef define rule_builder_lvp diff --git a/README.md b/README.md index 58d9042..d3b9fc1 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,10 @@ through the docs folder). Compared to F3DEX2 or any other F3D family microcode, F3DEX3 is... 
- faster on the RDP -- in `LVP_NOC` configuration (see docs), also faster on the RSP +- in `LVP_NOC` configuration ([see docs](https://hackern64.github.io/F3DEX3/configuration.html)), [also faster on the RSP](https://hackern64.github.io/F3DEX3/performance.html) - more accurate - full of new visual features -- measurable in performance +- [measurable in performance](https://hackern64.github.io/F3DEX3/counters.html) all at the same time! @@ -60,12 +60,9 @@ all at the same time! lights (including point) with different dynamic colors, whereas the vanilla system supports up to two directional lights and more than one dynamic color is difficult. -- New geometry mode bits `G_ATTROFFSET_ST_ENABLE` and `G_ATTROFFSET_Z_ENABLE` - apply settable offsets to vertex ST (`SPAttrOffsetST`) and/or Z - (`SPAttrOffsetZ`) values. These offsets are applied after their respective - scales. For Z, this enables a method of drawing coplanar surfaces like decals - but **without the Z fighting** which can happen with the RDP's native decal - mode. For ST, this enables **UV scrolling** without CPU intervention. +- New geometry mode bit `G_ATTROFFSET_ST_ENABLE` applies a settable offset to + vertex ST (`SPAttrOffsetST`) after the texture scale. This enables **UV + scrolling** without CPU intervention. ### Performance improvements @@ -111,6 +108,13 @@ all at the same time! ### Miscellaneous +- **Z-fighting of decals has been nearly eliminated**, with only a modest + increase in overdraw of very close occluding geometry. This is based on a + technique developed by SGI, neglected and removed by Nintendo, and re-added + by Rare; the F3DEX3 version improves upon it by choosing optimal parameters + and automatically enabling it for all decals with no code or DL changes. In + addition, the reduction in Z buffer precision from F3DEX(1) to F3DEX2 has been + reversed, and additional Z buffer precision beyond F3DEX(1) has been added. - **Point lighting** has been redesigned. 
The appearance when a light is close to an object has been improved. Fixed a bug in F3DEX2/ZEX point lighting where a Z component was accidentally doubled in the point lighting calculations. The diff --git a/docs/Documentation/Backwards Compatibility.md b/docs/Documentation/Backwards Compatibility.md index 6e75320..1e89bc0 100644 --- a/docs/Documentation/Backwards Compatibility.md +++ b/docs/Documentation/Backwards Compatibility.md @@ -7,6 +7,11 @@ F3DEX3 is backwards compatible with F3DEX2 at the C GBI level for all features and commands except: +- The viewport Y scale has been negated, and `G_MAXZ` has been renamed as its + value has changed. See the comment near `G_MAXZ` in the GBI. +- For the same reason, in `BrZ` configuration, any Z threshold values in + `SPBranchLessZ*` which are hard-coded into display lists (not based on + `G_MAXZ`) must be multiplied by 0x20. - The `G_SPECIAL_*` command IDs have been removed. `G_SPECIAL_2` and `G_SPECIAL_3` were no-ops in F3DEX2, and `G_SPECIAL_1` was a trigger to recalculate the MVP matrix. There is no MVP matrix in F3DEX3 so this is diff --git a/docs/Documentation/Configuration.md b/docs/Documentation/Configuration.md index 80d7b0a..712731f 100644 --- a/docs/Documentation/Configuration.md +++ b/docs/Documentation/Configuration.md @@ -1,4 +1,4 @@ -@page microcode Microcode Configuration +@page configuration Microcode Configuration # Microcode Configuration @@ -35,30 +35,29 @@ and otherwise use the base version. The primary tradeoff for all the new lighting features in F3DEX3 is increased RSP time for vertex processing. The base version of F3DEX3 takes about **2-2.5x** more RSP time for vertex processing than F3DEX2 (see Performance -Results section below), assuming no lighting or directional lights only. -However, under most circumstances, this does not affect the game's overall -framerate: -- This only applies to vertex processing, not triangle processing or other - miscellaneous microcode tasks. 
So the total RSP cycles spent doing useful work - during the frame is only modestly increased. +Results section below), assuming no lighting or directional lights only. You +should use the F3DEX3 performance counters (see below) to determine whether your +game is usually RSP or RDP bound. + +If your game is usually RDP bound--like OoT--this generally will not affect the +game's overall framerate, so you should stick with base F3DEX3: +- The increased time only applies to vertex processing, not triangle processing + or other miscellaneous microcode tasks. So the total RSP cycles spent doing + useful work during the frame is only modestly increased. - The increase in time is only RSP cycles; there is no additional memory traffic, so the RDP time is not directly affected. - In scenes which are complex enough to fill the RSP->RDP FIFO in DRAM, the RSP usually spends a significant fraction of time waiting for the FIFO to not be - full (as revealed by the F3DEX3 performance counters, see below). In these - cases, slower vertex processing simply means less time spent waiting, and - little to no change in total RSP time. + full, as revealed by the performance counters. In these cases, slower vertex + processing simply means less time spent waiting, and little to no change in + total RSP time. - When the FIFO does not fill up, usually the RSP takes significantly less time during the frame compared to the RDP, so increased RSP time usually does not affect the overall framerate. -As a result, you should always start with the base version of F3DEX3 in your -romhack, and if the RSP never becomes the bottleneck, you can stick with that. - -However, if you have done extreme optimizations in your game to reduce RDP time -(i.e. if you are Kaze Emanuar), it's possible for the RSP to sometimes become -the bottleneck with F3DEX3's advanced vertex processing. As a result, the Legacy -Vertex Pipeline (LVP) configuration has been introduced. 
+However, for RSP bound or extremely optimized (Kaze Emanuar) games, base F3DEX3 +can become a bottleneck, so the Legacy Vertex Pipeline (LVP) configuration has +been introduced. This configuration replaces F3DEX3's native vertex and lighting code with a faster version based on the same algorithms as F3DEX2. This removes: @@ -70,12 +69,11 @@ faster version based on the same algorithms as F3DEX2. This removes: However, it retains all other F3DEX3 features: - 56 verts, 9 directional lights - Occlusion plane (optional with NOC configuration) -- Z attribute offsets - All features not related to vertex/lighting: auto-batched rendering, packed 5 triangles commands, hints system, etc. -The performance of F3DEX3 vertex processing with both LVP and NOC is nearly -identical that of F3DEX2; see the Performance page. +With both LVP and NOC enabled, F3DEX3 is faster on the RSP than F3DEX2 (see +@ref performance). ## Profiling @@ -107,12 +105,6 @@ SM64), or `BrW` if the microcode is replacing F3DZEX (i.e. OoT or MM). This controls whether `SPBranchLessZ*` uses the vertex's W coordinate or screen Z coordinate. -## Extra Precision (`XP`) - -This configuration attempts to reproduce F3DEX(1) numerical behavior for Z -buffer coefficients, potentially improving Z fighting in some cases of decals or -opaque surfaces intended to behave like decals. - ## Debug Normals (`dbgN`) Debug Normals has been moved out of the Makefile as it is not a microcode diff --git a/docs/Documentation/Design Tradeoffs.md b/docs/Documentation/Design Tradeoffs.md index 8523999..d873abf 100644 --- a/docs/Documentation/Design Tradeoffs.md +++ b/docs/Documentation/Design Tradeoffs.md @@ -82,12 +82,6 @@ It is recommended to use `G_NORMALS_MODE_FAST` (the default) for most things, and use `G_NORMALS_MODE_AUTO` only for objects while they currently have a nonuniform scale (e.g. Mario only while he is squashed). 
-## Optimizing for RSP code size - -A number of optimizations in F3DEX2 which saved a few cycles but took several -more instructions have been removed. Outside of vertex processing, these have a -very small impact on overall RSP time and no impact on RDP time. - ## Far clipping removal Far clipping is completely removed in F3DEX3. Far clipping is not intentionally @@ -165,12 +159,11 @@ segment 0 must always be 0x00000000 so that this address resolves to e.g. In F3DEX2, the RSP time for drawing non-textured tris was significantly lower than for textured tris, by skipping a chunk of computation for the texture -coefficients if they were disabled. In F3DEX3, little to no computation is -skipped when textures are disabled, which means that the performance gain from -disabling textures in F3DEX2 has been mostly eliminated. (RDP time savings from -avoiding loading a texture are unaffected of course.) However, almost all -materials use textures, and F3DEX3 is a little faster at drawing textured tris -than F3DEX2, so this is still a benefit overall. +coefficients if they were disabled. In F3DEX3, no computation is skipped when +textures are disabled. However, almost all materials use textures, and F3DEX3 is +a little faster at drawing textured tris than F3DEX2. Plus, DRAM access time RSP +-> FIFO and FIFO -> RDP is still saved from not sending the coefficients, and +RDP time savings from avoiding loading a texture are unaffected of course. ## Obscure semantic differences from F3DEX2 that should never matter in practice diff --git a/docs/Documentation/Performance.md b/docs/Documentation/Performance.md index 9cb8e98..9006b93 100644 --- a/docs/Documentation/Performance.md +++ b/docs/Documentation/Performance.md @@ -7,17 +7,19 @@ visual effects are desired and increasing the RSP time a bit does not affect the overall performance. If your game is RSP bound, using the base version of F3DEX3 will make it slower. 
-Conversely, F3DEX3_LVP_NOC matches or beats the RSP performance of F3DEX2 on all -critical paths in the microcode, including command dispatch, vertex processing, -and triangle processing. Then, the RDP and memory traffic performance -improvements of F3DEX3--56 vertex buffer, auto-batched rendering, etc.--should -further improve performance from there. This means that switching from F3DEX2 to -F3DEX3_LVP_NOC should always improve performance regardless of whether your game -is RSP bound or RDP bound. +Conversely, F3DEX3_LVP_NOC matches or beats the RSP performance of F3DEX2 on +**all** critical paths in the microcode, including command dispatch, vertex +processing, and triangle processing. Then, the RDP and memory traffic +performance improvements of F3DEX3--56 vertex buffer, auto-batched rendering, +etc.--should further improve performance from there. This means that switching +from F3DEX2 to F3DEX3_LVP_NOC should always improve performance regardless of +whether your game is RSP bound or RDP bound. # Performance Results +## Cycle Counts + These are cycle counts for many key paths in the microcode. Lower numbers are better. The timings are hand-counted taking into account all pipeline stalls and all dual-issue conditions. Instruction alignment after branches is sometimes @@ -72,6 +74,7 @@ Tri numbers are measured from the first cycle of the command handler inclusive, to the first cycle of whatever is after $ra exclusive. This is in order to capture the extra latency and stalls in F3DEX2. +## Measurements Vertex processing time as reported by the performance counter in the `PA` configuration. 
diff --git a/docs/Documentation/Porting your Romhack.md b/docs/Documentation/Porting your Romhack.md index 5e6a217..f42e36b 100644 --- a/docs/Documentation/Porting your Romhack.md +++ b/docs/Documentation/Porting your Romhack.md @@ -37,13 +37,15 @@ similar for other games): Both OoT and SM64: -- Remove uses of internal GBI features which have been removed in F3DEX3 (see @ref compatibility for full list). In OoT, the only changes - needed are: +- Remove uses of internal GBI features which have been removed in F3DEX3 (see + @ref compatibility for full list). In OoT, the only changes needed are: - In `src/code/ucode_disas.c`, remove the switch statement cases for `G_LINE3D`, `G_MW_CLIP`, `G_MV_MATRIX`, `G_MVO_LOOKATX`, `G_MVO_LOOKATY`, and `G_MW_PERSPNORM`. - In `src/libultra/gu/lookathil.c`, remove the lines which set the `col`, `colc`, and `pad` fields. + - In each place `G_MAXZ` is used, a compiler error will be generated; + negate the Y scale in each related viewport and change `G_MAXZ` to `G_NEW_MAXZ`. - Change your game engine lighting code to set the `type` (formerly `pad1`) field to 0 in the initialization of any directional light (`Light_t` and derived structs like `Light` or `Lightsn`). F3DEX3 ignores the state of the diff --git a/docs/Documentation/Minimal Scanlines.md b/docs/Documentation/Removed.md similarity index 85% rename from docs/Documentation/Minimal Scanlines.md rename to docs/Documentation/Removed.md index 220050b..039172a 100644 --- a/docs/Documentation/Minimal Scanlines.md +++ b/docs/Documentation/Removed.md @@ -1,6 +1,10 @@ -@page minimal-scanlines What happened to the clipping minimal scanlines algorithm? +@page removed Removed Features -# What happened to the clipping minimal scanlines algorithm? +# Removed Features + +These features were present in earlier F3DEX3 versions, but have been removed.
+ +## Clipping minimal scanlines algorithm Earlier F3DEX3 versions included a modified algorithm for triangulating the polygon which was formed as the result of clipping. This algorithm broke up the @@ -57,3 +61,11 @@ The best we can do, which is what all previous F3D family microcodes did and F3DEX3 does now, is to triangulate in a consistent way, based on the winding of the input triangles. The results are still wrong, but they're wrong the same way every frame, so there are no abrupt changes visible. + +## Z attribute offsets + +Earlier F3DEX3 versions included attribute offsets for vertex Z as well as ST. +By setting the Z offset to -2 and drawing an opaque tri, the tri would appear like a +decal, but with no Z-fighting. This feature has been removed and replaced with the decal +fix, which is automatic and does not require any special setup in the display +list. diff --git a/docs/documentation.md b/docs/documentation.md index 605dfd3..f96dda2 100644 --- a/docs/documentation.md +++ b/docs/documentation.md @@ -1,7 +1,7 @@ # Documentation - @subpage compatibility -- @subpage microcode +- @subpage configuration - @subpage design-tradeoffs -- @subpage minimal-scanlines +- @subpage removed - @subpage performance - @subpage porting