Replaced mtxf_to_mtx_asm with an equivalent implementation that supports WORLD_SCALE

2026-01-21 10:35:32 -08:00 · 2021-12-21 08:40:20 -05:00
parent eb2211948d
commit d4d8a0540f
7 changed files with 98 additions and 55 deletions
--- a/README.md
+++ b/README.md
@@ -23,11 +23,11 @@ This is a fork of the ultrasm64 repo by CrashOveride which includes the followin
 - **JoshDuMan**: decomp guy, general assistance
 - **Arceveti**: silhouette, shadow optimisation, better hanging, breath meter
 - **axollyon**: Console testing, bugfixes, idea-guying, and had a hand in silhouettes
- **Wiseguy**:  silhouette, graph node optimisations, instant input patch, cake screen fix, segmented code support, and various optimizations/fixes
+- **Wiseguy**:  World scale reimplementation, silhouette, graph node optimisations, instant input patch, cake screen fix, segmented code support, and various optimizations/fixes
 - **Kaze**: Graph node optimisations, automatic optimal collision distance
 - **Pyro Jay**: Texture improvements, repo banner art, some QoL stuff
 - **CrashOveride**: creating the `ultrasm64` repo
- **falcobuster**: coordinate overflow fix (world scale), ASM version of extended bounds
+- **falcobuster**: Original coordinate overflow fix (world scale), ASM version of extended bounds
 - **anonymous_moose**: porting falco's extended bounds to decomp

 Thanks to Frame#5375 and AloXado320 for also helping with silhouette stuff
@@ -99,7 +99,7 @@ Thanks to Frame#5375 and AloXado320 for also helping with silhouette stuff
 - Instant warp offset fix (makes the instant warp offset work even when warping to a different area) *
 - haveyourcake, also known as cake screen fix. Made by Wiseguy and ported/PR'd by Cheezepin
 - Tree particle fix (Whether a tree uses snow particles or not is decided via the model IDs instead of the course number) *
- Coordinate overflow fix by falcobuster. Your levels will render correctly on console and LLE emulators even when using 2x or 4x bounds, while not hurting anything on HLE plugins. *This is automatic now, you don't have to set WORLD_SCALE manually.*
+- Adjustable world scale. You can change the geometry scaling of your level (set `WORLD_SCALE` in `include/world_scale.h`), which allows large levels to render correctly on console and LLE emulators while not hurting anything on HLE plugins.
 - A couple vanilla texture fixes
 - Smoke fix (the smoke texture uses the correct texture format)

--- a/asm/math.s
+++ b/asm/math.s
@@ -1,48 +0,0 @@
-# assembler directives
-.set gp=64
-
-.include "macros.inc"
-
-
-.section .text, "ax"
-
-.balign 32
-glabel mtxf_to_mtx_asm
-    li $v0, 1
-    li.s $f4, 65536.0
-1:
-    lwc1 $f0, ($a1)
-    lwc1 $f2, 0x04($a1)
-
-    andi $t0, $v0, (1 << 1)
-    mul.s $f0, $f4
-    trunc.w.s $f0, $f0
-    mfc1 $t3, $f0
-    addiu $a1, 8
-    sra $t4, $t3, 16
-    sh $t4, 0x00($a0)
-    sh $t3, 0x20($a0)
-
-    addiu $v0, 2
-    bnez $t0, storezero
-
-    mul.s $f2, $f4
-    trunc.w.s $f2, $f2
-    mfc1 $t3, $f2
-    sra $t4, $t3, 16
-    sh $t4, 0x02($a0)
-    sh $t3, 0x22($a0)
-loopend:
-.set noreorder
-    bnel $v0, 0x11, 1b
-     addiu $a0, 4
-.set reorder
-
-    li $t1, 1
-    sh $t1, 0x02($a0)
-    jr $ra
-storezero:
-    sh $zero, 0x02($a0)
-    sh $zero, 0x22($a0)
-    j loopend
-
--- a/include/world_scale.h
+++ b/include/world_scale.h
@@ -0,0 +1,9 @@
+#ifndef __WORLD_SCALE_H__
+#define __WORLD_SCALE_H__
+
+// World scale value. Geometry is scaled down by this amount when matrices are converted to fixed point
+// (the perspective near and far planes are divided by it as well), which allows for larger levels without
+// the distortion you would otherwise get. Larger values cost precision, which can increase Z-fighting; values above 4 should not be necessary.
+#define WORLD_SCALE 1
+
+#endif
--- a/sm64.ld
+++ b/sm64.ld
@@ -251,7 +251,6 @@ SECTIONS
      BUILD_DIR/src/game/obj_behaviors.o(.text*);
      BUILD_DIR/src/game/object_helpers.o(.text*);
      BUILD_DIR/src/game/rendering_graph_node.o(.text*);
-      BUILD_DIR/asm/math.o(.text*);
      BUILD_DIR/src/engine/math_util.o(.text*);
      BUILD_DIR/src/engine/surface_collision.o(.text*);
      BUILD_DIR/src/engine/surface_load.o(.text*);
--- a/src/engine/math_util.c
+++ b/src/engine/math_util.c
@@ -1490,3 +1490,85 @@ void find_surface_on_ray(Vec3f orig, Vec3f dir, struct Surface **hit_surface, Ve
        }
    }
 }
+
+#include <world_scale.h>
+
+// Builds the float constant `f` directly in registers, which can be faster than gcc's default of loading it
+// from rodata. Constants whose low halfword is zero are the cheapest: a single `lui` followed by `mtc1`.
+static ALWAYS_INLINE float construct_float(const float f)
+{
+    u32 r;
+    float f_out;
+    // Reinterpret the bits through a union; dereferencing a casted pointer violates strict aliasing.
+    const union { float f; u32 i; } bits = { .f = f };
+    // The "K" operands below must be compile-time immediates, so a non-constant input is returned unchanged.
+    if (!__builtin_constant_p(bits.i)) {
+        return f;
+    }
+    const u32 upper = (bits.i >> 16);
+    const u32 lower = (bits.i & 0xFFFF);
+
+    if (lower == 0) {
+        __asm__ ("lui %0, %1"
+                                : "=r"(r)
+                                : "K"(upper));
+    } else if (upper == 0) {
+        // Output-only ("=r"): r holds nothing yet. ori zero-extends like "K"; addiu would sign-extend.
+        __asm__ ("ori %0, $0, %1"
+                                : "=r"(r)
+                                : "K"(lower));
+    } else {
+        __asm__ ("lui %0, %1"
+                                : "=r"(r)
+                                : "K"(upper));
+        __asm__ ("ori %0, %0, %1"
+                                : "+r"(r)
+                                : "K"(lower));
+    }
+
+    __asm__ ("mtc1 %1, %0"
+                         : "=f"(f_out)
+                         : "r"(r));
+    return f_out;
+}
+
+// Converts the 4x4 float matrix `src` to 32-bit fixed point with 16 fractional bits, divided by WORLD_SCALE: high halfwords go to dst[0..15], low halfwords to dst[16..31].
+// Assumes the fourth column is (0, 0, 0, 1), which will always be true for valid matrices: src[3], src[7], src[11] and src[15] are never read.
+__attribute__((optimize("Os")))
+void mtxf_to_mtx_fast(s16* dst, float* src)
+{
+    float scale = construct_float(65536.0f / WORLD_SCALE);
+    // Each iteration handles one adjacent pair of source entries: src[2 * i] and src[2 * i + 1]
+    for (int i = 0; i < 8; i++)
+    {
+        // First entry of the pair, which is always converted
+        float a = src[2 * i + 0];
+
+        // Scale and truncate to a 32-bit integer, then store its high and low halfwords 16 slots apart
+        s32 a_int = (s32)(a * scale);
+        dst[2 * i +  0] = (s16)(a_int >> 16);
+        dst[2 * i + 16] = (s16)(a_int >>  0);
+
+        // Even i: the pair lies in columns 1-2, so the second entry is converted the same way
+        if ((i & 1) == 0)
+        {
+            // Second entry of the pair
+            float b = src[2 * i + 1];
+            s32 b_int = (s32)(b * scale);
+            dst[2 * i +  1] = (s16)(b_int >> 16);
+            dst[2 * i + 17] = (s16)(b_int >>  0);
+        }
+        // Odd i: the second entry is in column 4, so both halves are written as zero without reading src.
+        // This includes row 4 column 4, whose high half is overwritten after the loop.
+        else
+        {
+            dst[2 * i +  1] = 0;
+            dst[2 * i + 17] = 0;
+        }
+
+    }
+    // Write 1.0 to the bottom right entry in the output matrix (it is not divided by WORLD_SCALE).
+    // Its low half, dst[31], was already set to zero by the last loop iteration, so only
+    //  the high half needs to be set.
+    dst[15] = 1;
+}
--- a/src/engine/math_util.h
+++ b/src/engine/math_util.h
@@ -551,9 +551,9 @@ void mtxf_mul(Mat4 dest, Mat4 a, Mat4 b);
 void mtxf_scale_vec3f(Mat4 dest, Mat4 mtx, Vec3f s);
 void mtxf_mul_vec3s(Mat4 mtx, Vec3s b);

-extern void mtxf_to_mtx_asm(register void *dest, register void *src);
+extern void mtxf_to_mtx_fast(register s16 *dest, register float *src);
 ALWAYS_INLINE void mtxf_to_mtx(register void *dest, register void *src) {
-    mtxf_to_mtx_asm(dest, src);
+    mtxf_to_mtx_fast((s16*)dest, (float*)src);
    // guMtxF2L(src, dest);
 }

--- a/src/game/rendering_graph_node.c
+++ b/src/game/rendering_graph_node.c
@@ -18,6 +18,7 @@
 #include "behavior_data.h"
 #include "string.h"
 #include "color_presets.h"
+#include "world_scale.h"

 #include "config.h"

@@ -501,7 +502,7 @@ void geo_process_perspective(struct GraphNodePerspective *node) {
        sAspectRatio = 4.0f / 3.0f; // 1.33333f
 #endif

-        guPerspective(mtx, &perspNorm, node->fov, sAspectRatio, node->near, node->far, 1.0f);
+        guPerspective(mtx, &perspNorm, node->fov, sAspectRatio, node->near / (f32)WORLD_SCALE, node->far / (f32)WORLD_SCALE, 1.0f);
        gSPPerspNormalize(gDisplayListHead++, perspNorm);

        gSPMatrix(gDisplayListHead++, VIRTUAL_TO_PHYSICAL(mtx), G_MTX_PROJECTION | G_MTX_LOAD | G_MTX_NOPUSH);