Fixed typo in config_graphics.h

Implemented shear shadows and slightly faster mtxf_to_mtx function
2026-01-21 10:35:32 -08:00 · 2022-01-22 22:19:40 -05:00 · 2022-01-22 22:13:12 -05:00
16 changed files with 301 additions and 5827 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -116,7 +116,3 @@ lib/libs2d_engine.a

 # .orig files
 *.orig
-
-# Python
-__pycache__
-*.pyc
--- a/README.md
+++ b/README.md
@@ -29,7 +29,6 @@ This is a fork of the ultrasm64 repo by CrashOveride which includes the followin
 - **CrashOveride**: creating the [ultrasm64](https://github.com/CrashOveride95/ultrasm64) repo
 - **falcobuster**: Original coordinate overflow fix (world scale), ASM version of extended bounds
 - **anonymous_moose**: porting falco's extended bounds to decomp
- **tuxlovesyou**: `LOAD_MIO0_TEXTURE` macro and moral support

 Thanks to Frame#5375 and AloXado320 for also helping with silhouette stuff

--- a/include/config/config_graphics.h
+++ b/include/config/config_graphics.h
@@ -57,6 +57,11 @@
 // Use 64x64 quarter shadow textures (Vanilla are 16x16).
 #define HD_SHADOWS

+// Stretches shadows to fit the terrain instead of rotating them to align with it.
+// This makes them maintain a constant horizontal size.
+// Performs better than regular shadows.
+// #define SHEAR_SHADOWS
+
 // Makes certain objects (mainly trees) transparent when the camera gets close.
 // #define OBJ_OPACITY_BY_CAM_DIST

--- a/include/level_commands.h
+++ b/include/level_commands.h
@@ -310,9 +310,6 @@ enum GoddardScene {
    CMD_PTR(romEnd)
 #endif

-#undef LOAD_MIO0_TEXTURE
-#define LOAD_MIO0_TEXTURE(a,b,c) LOAD_YAY0_TEXTURE(a,b,c)
-
 #define CHANGE_AREA_SKYBOX(area, segStart, segEnd) \
    CMD_BBH(LEVEL_CMD_CHANGE_AREA_SKYBOX, 0x0C, area), \
    CMD_PTR(segStart), \
--- a/libmario/Makefile
+++ b/libmario/Makefile
@@ -1,11 +0,0 @@
-DECOMP_INPUT_FILES := ../src/game/mario_actions_airborne.c ../src/game/mario_actions_automatic.c ../src/game/mario_actions_cutscene.c \
-    ../src/game/mario_actions_moving.c ../src/game/mario_actions_object.c ../src/game/mario_actions_stationary.c ../src/game/mario_actions_submerged.c \
-	../src/game/mario_misc.c ../src/game/mario_step.c ../src/game/mario.c ../src/engine/math_util.c ../src/game/object_helpers.c
-
-all: libmario.dll libmario.so
-
-libmario.dll: $(wildcard mariolib/*.c) $(DECOMP_INPUT_FILES)
-	x86_64-w64-mingw32-gcc $^ -o $@ -shared -DNON_MATCHING -DAVOID_UB -D_LANGUAGE_C -ggdb -I.. -I../include -I../include/n64 -I../src/engine -I../src/game -I../src -Ism64 -lm -Wl,--subsystem,windows
-
-libmario.so: $(wildcard mariolib/*.c) $(DECOMP_INPUT_FILES)
-	gcc $^ -o $@ -shared -DNON_MATCHING -DAVOID_UB -D_LANGUAGE_C -ggdb -I.. -I../include -I../include/n64 -I../src/engine -I../src/game -I../src -Ism64 -lm -fPIC
--- a/libmario/inputs.py
+++ b/libmario/inputs.py
--- a/libmario/mariolib/anims.c
+++ b/libmario/mariolib/anims.c
--- a/libmario/mariolib/funcs.c
+++ b/libmario/mariolib/funcs.c
--- a/libmario/test.py
+++ b/libmario/test.py
@@ -1,205 +0,0 @@
-#!/usr/bin/env python3
-import inputs
-import time
-import threading
-import sys
-import fpstimer
-import os
-from ctypes import *
-
-pluginPath = os.path.dirname(os.path.realpath(__file__))
-
-library = 'libmario.so'
-if sys.platform == 'win32':
-    library = 'libmario.dll'
-if sys.platform == 'darwin':
-    library = 'libmario.dynlib'
-
-libmario = CDLL(os.path.join(pluginPath, library))
-
-    # /*0x00*/ s16 type;
-    # /*0x02*/ s16 force;
-    # /*0x04*/ s8 flags;
-    # /*0x05*/ s8 room;
-    # /*0x06*/ s16 lowerY;
-    # /*0x08*/ s16 upperY;
-    # /*0x0A*/ Vec3s vertex1;
-    # /*0x10*/ Vec3s vertex2;
-    # /*0x16*/ Vec3s vertex3;
-    # /*0x1C*/ struct {
-    #     f32 x;
-    #     f32 y;
-    #     f32 z;
-    # } normal;
-    # /*0x28*/ f32 originOffset;
-    # /*0x2C*/ struct Object *object;
-
-Vec3f = (c_float * 3)
-Vec3s = (c_int16 * 3)
-class Surface(Structure):
-    _fields_ = [
-        ('type', c_int16),
-        ('force', c_int16),
-        ('flags', c_int8),
-        ('room', c_int8),
-        ('lowerY', c_int16),
-        ('upperY', c_int16),
-        ('vertex1', Vec3s),
-        ('vertex2', Vec3s),
-        ('vertex3', Vec3s),
-        ('normal', Vec3f),
-        ('origin_offset', c_float),
-        ('object', c_void_p)
-    ]
-
-FindFloorHandlerType = CFUNCTYPE(c_float, c_float, c_float, c_float, POINTER(Surface), POINTER(c_int32))
-FindCeilHandlerType = CFUNCTYPE(c_float, c_float, c_float, c_float, POINTER(Surface), POINTER(c_int32))
-FindWallsHandlerType = CFUNCTYPE(c_int32, c_float, c_float, c_float, c_float, c_float, POINTER(Surface), POINTER(c_float))
-FindWaterLevelHandlerType = CFUNCTYPE(c_float, c_float, c_float)
-
-libmario.init.restype = None
-libmario.init.artypes = [FindFloorHandlerType, FindCeilHandlerType, FindWallsHandlerType, FindWaterLevelHandlerType]
-
-libmario.step.restype = None
-libmario.step.artypes = [c_int32, c_float, c_float]
-
-libmario.getMarioPosition.restype = None
-libmario.getMarioPosition.artypes = [Vec3f]
-
-libmario.getMarioVelocity.restype = None
-libmario.getMarioVelocity.artypes = [Vec3f]
-
-libmario.getMarioAnimFrame.restype = c_int32
-libmario.getMarioAnimFrame.artypes = []
-
-libmario.getMarioAnimIndex.restype = c_int32
-libmario.getMarioAnimIndex.artypes = []
-
-CONT_A      = 0x8000
-CONT_B      = 0x4000
-CONT_G      = 0x2000
-CONT_START  = 0x1000
-CONT_UP     = 0x0800
-CONT_DOWN   = 0x0400
-CONT_LEFT   = 0x0200
-CONT_RIGHT  = 0x0100
-CONT_L      = 0x0020
-CONT_R      = 0x0010
-CONT_E      = 0x0008
-CONT_D      = 0x0004
-CONT_C      = 0x0002
-CONT_F      = 0x0001
-
-A_BUTTON     = CONT_A
-B_BUTTON     = CONT_B
-L_TRIG       = CONT_L
-R_TRIG       = CONT_R
-Z_TRIG       = CONT_G
-START_BUTTON = CONT_START
-U_JPAD       = CONT_UP
-L_JPAD       = CONT_LEFT
-R_JPAD       = CONT_RIGHT
-D_JPAD       = CONT_DOWN
-U_CBUTTONS   = CONT_E
-L_CBUTTONS   = CONT_C
-R_CBUTTONS   = CONT_F
-D_CBUTTONS   = CONT_D
-
-events = []
-_t = None
-_handler = None
-
-def find_floor(x, y, z, surface_out, found_out):
-    found_out[0] = 1
-    surface_out[0].vertex1[0] = -100
-    surface_out[0].vertex1[1] = 0
-    surface_out[0].vertex1[2] = -100
-
-    surface_out[0].vertex2[0] = 100
-    surface_out[0].vertex2[1] = 0
-    surface_out[0].vertex2[2] = -100
-
-    surface_out[0].vertex3[0] = 100
-    surface_out[0].vertex3[1] = 0
-    surface_out[0].vertex3[2] = 100
-
-    surface_out[0].normal[0] = 0.0
-    surface_out[0].normal[1] = 1.0
-    surface_out[0].normal[2] = 0.0
-
-    surface_out[0].origin_offset = 0.0
-
-    return -1000.0
-
-def find_water_level(x, z):
-    return -100.0
-
-# Needs to be global to avoid getting garbage collected during execution
-find_floor_handler = FindFloorHandlerType(find_floor)
-find_water_level_handler = FindWaterLevelHandlerType(find_water_level)
-
-def worker():
-    global events
-    while True:
-        events.append(inputs.get_gamepad())
-        
-
-def main():
-    global _t
-    global _handler
-    global events
-    if not _t :
-        _t = threading.Thread(target=worker)
-        _t.daemon = True
-        _t.start()
-    stick_x = 0.0
-    stick_y = 0.0
-    buttons = 0
-    libmario.init(find_floor_handler, None, None, find_water_level_handler)
-    timer = fpstimer.FPSTimer(30)
-    try:
-        while True:
-            while len(events) > 0 :
-                for event in events[0]:
-                    if event.code == "ABS_X":
-                        stick_x = float(event.state) / 32768.0
-                    elif event.code == "ABS_Y":
-                        stick_y = float(event.state) / 32768.0
-                    elif event.code == "ABS_RX":
-                        gpd_input = "Right Stick X"
-                    elif event.code == "ABS_RY":
-                        gpd_input = "Right Stick Y"
-                    elif event.code == "BTN_SOUTH":
-                        if event.state == 1:
-                            buttons |= A_BUTTON
-                        else:
-                            buttons &= ~A_BUTTON
-                    elif event.code == "BTN_WEST":
-                        if event.state == 1:
-                            buttons |= B_BUTTON
-                        else:
-                            buttons &= ~B_BUTTON
-                    elif event.code == "ABS_Z":
-                        if event.state == 255:
-                            buttons |= Z_TRIG
-                        else:
-                            buttons &= ~Z_TRIG
-                    elif event.code != "SYN_REPORT":
-                        print(event.code + ':' + str(event.state))
-                events.pop(0)
-            
-            libmario.step(buttons, c_float(stick_x), c_float(stick_y))
-            pos = Vec3f()
-            vel = Vec3f()
-            libmario.getMarioPosition(pos)
-            libmario.getMarioVelocity(vel)
-            
-            print('Position: %8.2f %8.2f %8.2f  Velocity: %8.2f %8.2f %8.2f Buttons: 0x%08X Anim: 0x%02X AnimFrame: %d' % (pos[0], pos[1], pos[2], vel[0], vel[1], vel[2], buttons, libmario.getMarioAnimIndex(), libmario.getMarioAnimFrame()))
-
-            timer.sleep()
-    except KeyboardInterrupt:
-        print("Ctrl+C pressed...")
-        sys.exit(0)
-
-if __name__ == '__main__':
-    main()
--- a/src/engine/math_util.c
+++ b/src/engine/math_util.c
@@ -128,9 +128,9 @@ void surface_normal_to_vec3f(Vec3f dest, struct Surface *surf) {

 /// Convert float vector a to a short vector 'dest' by rounding the components to the nearest integer.
 #define vec3_copy_bits_roundf(fmt, dest, src) { \
-    register fmt x = lroundf(src[0]);            \
-    register fmt y = lroundf(src[1]);            \
-    register fmt z = lroundf(src[2]);            \
+    register fmt x = roundf(src[0]);            \
+    register fmt y = roundf(src[1]);            \
+    register fmt z = roundf(src[2]);            \
    ((fmt *) dest)[0] = x;                      \
    ((fmt *) dest)[1] = y;                      \
    ((fmt *) dest)[2] = z;                      \
@@ -592,6 +592,31 @@ void mtxf_billboard(Mat4 dest, Mat4 mtx, Vec3f position, Vec3f scale, s32 angle)
 * 'scale' is the scale of the shadow
 * 'yaw' is the angle which it should face
 */
+#ifdef SHEAR_SHADOWS
+void mtxf_shadow(Mat4 dest, Mat4 src, Vec3f upDir, Vec3f pos, Vec3f scale, s32 yaw) { // SHEAR_SHADOWS variant: shears Y to follow the terrain instead of rotating.
+    float hxy = -upDir[0]/upDir[1]; // Y change per unit of X on the plane whose normal is upDir (upDir[1] is not checked for zero here).
+    float hzy = -upDir[2]/upDir[1]; // Y change per unit of Z on that same plane.
+    float cosyaw = coss(yaw);
+    float sinyaw = sins(yaw);
+    // Each row of 'dest' is built in 'entry' and then passed, together with 'src', to linear_mtxf_mul_vec3f.
+    Vec3f entry;
+    entry[0] = scale[0] * cosyaw; // Row 0: the yaw-rotated X axis, scaled by scale[0].
+    entry[1] = scale[0] * cosyaw * hxy  - scale[0] * sinyaw * hzy; // Y picked up by that axis through the shear.
+    entry[2] = -scale[0] * sinyaw;
+    linear_mtxf_mul_vec3f(src, dest[0], entry);
+    entry[0] = 0; // Row 1: the Y axis stays vertical and is only scaled by scale[1].
+    entry[1] = scale[1];
+    entry[2] = 0;
+    linear_mtxf_mul_vec3f(src, dest[1], entry);
+    entry[0] = scale[2] * sinyaw; // Row 2: the yaw-rotated Z axis, scaled by scale[2].
+    entry[1] = scale[2] * sinyaw * hxy  + scale[2] * cosyaw * hzy; // Y picked up by that axis through the shear.
+    entry[2] = scale[2] * cosyaw;
+    linear_mtxf_mul_vec3f(src, dest[2], entry);
+    linear_mtxf_mul_vec3f(src, dest[3], pos); // Row 3: 'pos' passed through the same helper...
+    vec3f_add(dest[3], src[3]); // ...then offset by row 3 of 'src'.
+    MTXF_END(dest); // NOTE(review): presumably fills in the remaining fourth-column entries of 'dest' - confirm against the macro.
+}
+#else
 void mtxf_shadow(Mat4 dest, Mat4 src, Vec3f upDir, Vec3f pos, Vec3f scale, s32 yaw) {
    Vec3f lateralDir;
    Vec3f leftDir;
@@ -613,6 +638,7 @@ void mtxf_shadow(Mat4 dest, Mat4 src, Vec3f upDir, Vec3f pos, Vec3f scale, s32 y
    vec3f_add(dest[3], src[3]);
    MTXF_END(dest);
 }
+#endif

 /**
 * Set 'dest' to a transformation matrix that aligns an object with the terrain
@@ -802,7 +828,7 @@ void get_pos_from_transform_mtx(Vec3f dest, Mat4 objMtx, register Mat4 camMtx) {
        *x = (temp3[12] - temp2[12]);
        temp2++;
        temp3++;
-        x = (f32 *)(((uintptr_t)x) + 4);
+        x = (f32 *)(((u32)x) + 4);
    }
    temp2 -= 3;
    for (i = 0; i < 3; i++) {
@@ -1294,6 +1320,203 @@ s32 anim_spline_poll(Vec3f result) {
    return hasEnded;
 }

+/**************************************************
+ *                    RAYCASTING                  *
+ **************************************************/
+
+#define RAY_OFFSET 30.0f /* How many units to extrapolate surfaces when testing for a raycast */
+#define RAY_STEPS      4 /* How many steps to do when casting rays, default to quartersteps.  */
+
+/**
+ * @brief Checks if a ray intersects a surface using the Möller–Trumbore intersection algorithm.
+ *
+ * @param orig is the starting point of the ray.
+ * @param dir is the normalized ray direction.
+ * @param dir_length is the length of the ray.
+ * @param surface is the surface to check (its vertices are moved RAY_OFFSET units along its normal before the test).
+ * @param hit_pos returns the position on the surface where the ray intersects it (only written when TRUE is returned).
+ * @param length returns the distance from the starting point to the hit position (may also be written when FALSE is returned).
+ * @return s32 TRUE if the ray hits the surface within dir_length, FALSE otherwise.
+ */
+s32 ray_surface_intersect(Vec3f orig, Vec3f dir, f32 dir_length, struct Surface *surface, Vec3f hit_pos, f32 *length) {
+    // Skip intangible surfaces and surfaces flagged as having no camera collision.
+    if ((surface->type == SURFACE_INTANGIBLE) || (surface->flags & SURFACE_FLAG_NO_CAM_COLLISION)) return FALSE;
+    // Convert the vertices to Vec3f.
+    Vec3f v0, v1, v2;
+    vec3s_to_vec3f(v0, surface->vertex1);
+    vec3s_to_vec3f(v1, surface->vertex2);
+    vec3s_to_vec3f(v2, surface->vertex3);
+    // Get surface normal and extend it by RAY_OFFSET.
+    Vec3f norm;
+    surface_normal_to_vec3f(norm, surface);
+    vec3_mul_val(norm, RAY_OFFSET);
+    // Move the face forward by RAY_OFFSET.
+    vec3f_add(v0, norm);
+    vec3f_add(v1, norm);
+    vec3f_add(v2, norm);
+    // Make 'e1' (edge 1) the vector from vertex 0 to vertex 1.
+    Vec3f e1;
+    vec3f_diff(e1, v1, v0);
+    // Make 'e2' (edge 2) the vector from vertex 0 to vertex 2.
+    Vec3f e2;
+    vec3f_diff(e2, v2, v0);
+    // Make 'h' the cross product of 'dir' and edge 2.
+    Vec3f h;
+    vec3f_cross(h, dir, e2);
+    // Make 'det' the dot product of edge 1 and 'h' (the determinant used by the algorithm).
+    f32 det = vec3f_dot(e1, h);
+    // A near-zero 'det' means the ray is (almost) parallel to the surface, so there is no hit.
+    if ((det > -NEAR_ZERO) && (det < NEAR_ZERO)) return FALSE;
+    // Check if we're making contact with the surface.
+    // Make 'f' the reciprocal of 'det'.
+    f32 f = 1.0f / det; // invDet
+    // Make 's' the vector from vertex 0 to 'orig'.
+    Vec3f s;
+    vec3f_diff(s, orig, v0);
+    // Make 'u' the dot product of 's' and 'h', scaled by 'f' (first barycentric coordinate).
+    f32 u = f * vec3f_dot(s, h);
+    // Check if 'u' is within bounds.
+    if ((u < 0.0f) || (u > 1.0f)) return FALSE;
+    // Make 'q' the cross product of 's' and edge 1.
+    Vec3f q;
+    vec3f_cross(q, s, e1);
+    // Make 'v' the dot product of 'dir' and 'q', scaled by 'f' (second barycentric coordinate).
+    f32 v = f * vec3f_dot(dir, q);
+    // Check if 'v' is within bounds.
+    if ((v < 0.0f) || ((u + v) > 1.0f)) return FALSE;
+    // Get the length between our origin and the surface contact point.
+    // Make '*length' the dot product of edge 2 and 'q', scaled by 'f'.
+    *length = f * vec3f_dot(e2, q);
+    // Reject hits at or behind the origin (<= NEAR_ZERO) and hits beyond the ray length.
+    if ((*length <= NEAR_ZERO) || (*length > dir_length)) return FALSE;
+    // Successful contact.
+    // Make 'add_dir' into 'dir' scaled by 'length'.
+    Vec3f add_dir;
+    vec3_prod_val(add_dir, dir, *length);
+    // Make 'hit_pos' into the sum of 'orig' and 'add_dir'.
+    vec3f_sum(hit_pos, orig, add_dir);
+    return TRUE;
+}
+
+void find_surface_on_ray_list(struct SurfaceNode *list, Vec3f orig, Vec3f dir, f32 dir_length, struct Surface **hit_surface, Vec3f hit_pos, f32 *max_length) { // Tests every surface in 'list' against the ray and keeps the nearest hit.
+    s32 hit;           // Result of the intersection test for the current surface.
+    f32 length;        // Distance to the current surface's hit point.
+    Vec3f chk_hit_pos; // Candidate hit position, copied to 'hit_pos' only when accepted.
+    f32 top, bottom;   // Vertical extent covered by the ray.
+#if PUPPYPRINT_DEBUG
+    OSTime first = osGetTime(); // Start time, used below to add this call's duration to collisionTime.
+#endif
+    // Get the upper and lower Y bounds of the ray from its origin and its end point.
+    if (dir[1] >= 0.0f) {
+        // Ray is upwards.
+        top    = orig[1] + (dir[1] * dir_length);
+        bottom = orig[1];
+    } else {
+        // Ray is downwards.
+        top    = orig[1];
+        bottom = orig[1] + (dir[1] * dir_length);
+    }
+
+    // Iterate through every surface of the list
+    for (; list != NULL; list = list->next) {
+        // Reject the surface if its [lowerY, upperY] range lies entirely above or below the ray.
+        if ((list->surface->lowerY > top) || (list->surface->upperY < bottom)) continue;
+        // Check intersection between the ray and this surface
+        hit = ray_surface_intersect(orig, dir, dir_length, list->surface, chk_hit_pos, &length);
+        if (hit && (length <= *max_length)) { // Accept only hits no farther than the best one so far.
+            *hit_surface = list->surface;
+            vec3f_copy(hit_pos, chk_hit_pos);
+            *max_length = length; // Later surfaces must be at least this close to replace the hit.
+        }
+    }
+#if PUPPYPRINT_DEBUG
+    collisionTime[perfIteration] += osGetTime() - first;
+#endif
+}
+
+void find_surface_on_ray_cell(s32 cellX, s32 cellZ, Vec3f orig, Vec3f normalized_dir, f32 dir_length, struct Surface **hit_surface, Vec3f hit_pos, f32 *max_length, s32 flags) { // Ray-tests the static and dynamic surface lists of one cell, filtered by 'flags'.
+    // Only search cells inside the [0, NUM_CELLS - 1] grid; anything else is skipped.
+    if ((cellX >= 0) && (cellX <= (NUM_CELLS - 1)) && (cellZ >= 0) && (cellZ <= (NUM_CELLS - 1))) {
+        // Ceilings are skipped when the ray points (almost) straight down.
+        if ((normalized_dir[1] > -NEAR_ONE) && (flags & RAYCAST_FIND_CEIL)) {
+            find_surface_on_ray_list( gStaticSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_CEILS ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
+            find_surface_on_ray_list(gDynamicSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_CEILS ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
+        }
+        if ((normalized_dir[1] <  NEAR_ONE) && (flags & RAYCAST_FIND_FLOOR)) { // Floors are skipped when the ray points (almost) straight up.
+            find_surface_on_ray_list( gStaticSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_FLOORS].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
+            find_surface_on_ray_list(gDynamicSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_FLOORS].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
+        }
+        if (flags & RAYCAST_FIND_WALL) { // Walls are tested for any ray direction.
+            find_surface_on_ray_list( gStaticSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_WALLS ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
+            find_surface_on_ray_list(gDynamicSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_WALLS ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
+        }
+        if (flags & RAYCAST_FIND_WATER) { // Water surfaces are tested for any ray direction.
+            find_surface_on_ray_list( gStaticSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_WATER ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
+            find_surface_on_ray_list(gDynamicSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_WATER ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
+        }
+    }
+}
+
+void find_surface_on_ray(Vec3f orig, Vec3f dir, struct Surface **hit_surface, Vec3f hit_pos, s32 flags) { // Casts the ray from 'orig' to 'orig + dir' through the collision cells it crosses.
+    Vec3f normalized_dir;
+    f32 step; // Number of DDA steps, RAY_STEPS per cell along the dominant horizontal axis.
+    s32 i;
+    const f32 invcell = 1.0f / CELL_SIZE; // Converts world units to cell units.
+
+    // Default result: no surface hit, and 'hit_pos' is the end of the ray (orig + dir).
+    *hit_surface = NULL;
+    vec3f_sum(hit_pos, orig, dir);
+
+    // Get the ray length and a normalized copy of the direction ('dir' itself is left untouched).
+    f32 dir_length = vec3_mag(dir);
+    f32 max_length = dir_length;
+    vec3f_copy(normalized_dir, dir);
+    vec3f_normalize(normalized_dir);
+
+    // Get the cell coordinate of the ray origin, both as floats and truncated to integers.
+    f32 fCellX    = (orig[0] + LEVEL_BOUNDARY_MAX) * invcell;
+    f32 fCellZ    = (orig[2] + LEVEL_BOUNDARY_MAX) * invcell;
+    s32 cellX     = fCellX;
+    s32 cellZ     = fCellZ;
+    s32 cellPrevX = cellX;
+    s32 cellPrevZ = cellZ;
+
+    // Don't do DDA if the ray is (almost) straight up or straight down: only the origin cell is searched.
+    if ((normalized_dir[1] >= NEAR_ONE) || (normalized_dir[1] <= -NEAR_ONE)) {
+        find_surface_on_ray_cell(cellX, cellZ, orig, normalized_dir, dir_length, hit_surface, hit_pos, &max_length, flags);
+        return;
+    }
+
+    // Get cells we cross using DDA; the step count follows the larger horizontal component of 'dir'.
+    f32 absDir0 = absf(dir[0]);
+    f32 absDir2 = absf(dir[2]);
+    if (absDir0 >= absDir2) {
+        step = (RAY_STEPS * absDir0) * invcell;
+    } else {
+        step = (RAY_STEPS * absDir2) * invcell;
+    }
+
+    f32 dx = (dir[0] / step) * invcell; // Cell-space X advance per step.
+    f32 dz = (dir[2] / step) * invcell; // Cell-space Z advance per step.
+
+    for (i = 0; i < step && *hit_surface == NULL; i++) { // Stops as soon as a searched cell produced a hit.
+        find_surface_on_ray_cell(cellX, cellZ, orig, normalized_dir, dir_length, hit_surface, hit_pos, &max_length, flags);
+
+        // Move the cell coordinate one step along the ray, remembering the previous cell.
+        fCellX   += dx;
+        fCellZ   += dz;
+        cellPrevX = cellX;
+        cellPrevZ = cellZ;
+        cellX     = fCellX;
+        cellZ     = fCellZ;
+
+        if ((cellPrevX != cellX) && (cellPrevZ != cellZ)) { // Diagonal move: also search the two cells adjacent to both.
+            find_surface_on_ray_cell(cellX, cellPrevZ, orig, normalized_dir, dir_length, hit_surface, hit_pos, &max_length, flags);
+            find_surface_on_ray_cell(cellPrevX, cellZ, orig, normalized_dir, dir_length, hit_surface, hit_pos, &max_length, flags);
+        }
+    }
+}
+
 // Constructs a float in registers, which can be faster than gcc's default of loading a float from rodata.
 // Especially fast for halfword floats, which get loaded with a `lui` + `mtc1`.
 static ALWAYS_INLINE float construct_float(const float f)
@@ -1333,40 +1556,63 @@ static ALWAYS_INLINE float construct_float(const float f)
    return f_out;
 }

+static ALWAYS_INLINE float mul_without_nop(float a, float b)
+{ // Returns a * b using a single hand-written `mul.s` instruction.
+    float ret;
+    __asm__ ("mul.s %0, %1, %2" // NOTE(review): presumably avoids a nop the toolchain would emit after a C multiply - confirm.
+                         : "=f"(ret) // Output: product, in a floating-point register.
+                         : "f"(a), "f"(b)); // Inputs: both factors, in floating-point registers.
+    return ret;
+}
+
+// Emits a MIPS `swl` (store word left) of 'val' at 'offset'('addr'); 'offset' must be a compile-time 16-bit constant ("I").
+static ALWAYS_INLINE void swl(void* addr, s32 val, const int offset) {
+    // 'addr' and 'val' must be in general registers, hence "r" ("g" would also permit memory or immediate operands).
+    // The "memory" clobber declares that this asm writes memory, so it stays ordered with the C stores around it.
+    __asm__ ("swl %1, %2(%0)" : : "r"(addr), "r"(val), "I"(offset) : "memory");
+}
+
 // Converts a floating point matrix to a fixed point matrix
 // Makes some assumptions about certain fields in the matrix, which will always be true for valid matrices.
-__attribute__((optimize("Os")))
+__attribute__((optimize("Os"))) __attribute__((aligned(32)))
 void mtxf_to_mtx_fast(s16* dst, float* src)
 {
+    int i;
    float scale = construct_float(65536.0f / WORLD_SCALE);
-    // Iterate over pairs of values in the input matrix
-    for (int i = 0; i < 8; i++)
+    // Iterate over rows of values in the input matrix
+    for (i = 0; i < 4; i++)
    {
-        // Read the first input in the current pair
-        float a = src[2 * i + 0];
+        // Read the three inputs in the current row (the fourth is assumed to be zero)
+        float a = src[4 * i + 0];
+        float b = src[4 * i + 1];
+        float c = src[4 * i + 2];
+        float a_scaled = mul_without_nop(a,scale);
+        float b_scaled = mul_without_nop(b,scale);
+        float c_scaled = mul_without_nop(c,scale);

-        // Convert the first input to fixed
-        s32 a_int = (s32)(a * scale);
-        dst[2 * i +  0] = (s16)(a_int >> 16);
-        dst[2 * i + 16] = (s16)(a_int >>  0);
+        // Convert the three inputs to fixed
+        s32 a_int = (s32)a_scaled;
+        s32 b_int = (s32)b_scaled;
+        s32 c_int = (s32)c_scaled;
+        s32 c_high = c_int & 0xFFFF0000;
+        s32 c_low = c_int << 16;
+        
+        // Write the integer part of a, as well as garbage into the next two bytes.
+        // Those two bytes will get overwritten by the integer part of b.
+        // This prevents needing to shift or mask the integer value of a.
+        *(s32*)(&dst[4 * i +  0]) = a_int;
+        // Write the fractional part of a
+        dst[4 * i + 16] = (s16)a_int;

-        // If this is the left half of the matrix, convert the second input to fixed
-        if ((i & 1) == 0)
-        {
-            // Read the second input in the current pair
-            float b = src[2 * i + 1];
-            s32 b_int = (s32)(b * scale);
-            dst[2 * i +  1] = (s16)(b_int >> 16);
-            dst[2 * i + 17] = (s16)(b_int >>  0);
-        }
-        // Otherwise, skip the second input because column 4 will always be zero
-        // Row 4 column 4 is handled after the loop.
-        else
-        {
-            dst[2 * i +  1] = 0;
-            dst[2 * i + 17] = 0;
-        }
+        // Write the integer part of b using swl to avoid needing to shift.
+        swl(dst + 4 * i, b_int, 2);
+        // Write the fractional part of b.
+        dst[4 * i + 17] = (s16)b_int;

+        // Write the integer part of c and two zeroes for the 4th column.
+        *(s32*)(&dst[4 * i + 2]) = c_high;
+        // Write the fractional part of c and two zeroes for the 4th column
+        *(s32*)(&dst[4 * i + 18]) = c_low;
    }
    // Write 1.0 to the bottom right entry in the output matrix
    // The low half was already set to zero in the loop, so we only need
--- a/src/engine/math_util.h
+++ b/src/engine/math_util.h
@@ -212,16 +212,16 @@ extern f32 gSineTable[];
 }

 #define vec2_copy_roundf(dst, src) {    \
-    (dst)[0] = lroundf((src)[0]);        \
-    (dst)[1] = lroundf((src)[1]);        \
+    (dst)[0] = roundf((src)[0]);        \
+    (dst)[1] = roundf((src)[1]);        \
 }
 #define vec3_copy_roundf(dst, src) {    \
    vec2_copy_roundf((dst), (src));     \
-    (dst)[2] = lroundf((src)[2]);        \
+    (dst)[2] = roundf((src)[2]);        \
 }
 #define vec4_copy_roundf(dst, src) {    \
    vec3_copy_roundf((dst), (src));     \
-    (dst)[3] = lroundf((src)[3]);        \
+    (dst)[3] = roundf((src)[3]);        \
 }

 #define vec2_copy_inverse(dst, src) {   \
@@ -451,32 +451,23 @@ extern f32 gSineTable[];

 #define ABS(x)  (((x) > 0) ? (x) : -(x))

-#ifdef TARGET_N64
-ALWAYS_INLINE s32 lroundf(f32 in) {
+/// From Wiseguy
+ALWAYS_INLINE s32 roundf(f32 in) {
    f32 tmp;
    s32 out;
    __asm__("round.w.s %0,%1" : "=f" (tmp) : "f" (in ));
    __asm__("mfc1      %0,%1" : "=r" (out) : "f" (tmp));
    return out;
 }
-#else
-long lroundf(f32 in);
-#endif
 // backwards compatibility
-#define round_float(in) lroundf(in)
+#define round_float(in) roundf(in)

-#ifdef TARGET_N64
 /// Absolute value
 ALWAYS_INLINE f32 absf(f32 in) {
    f32 out;
    __asm__("abs.s %0,%1" : "=f" (out) : "f" (in));
    return out;
 }
-#else
-ALWAYS_INLINE f32 absf(f32 in) {
-    return in > 0 ? in : -in;
-}
-#endif
 ALWAYS_INLINE s32 absi(s32 in) {
    return ABS(in);
 }
--- a/src/engine/surface_collision.c
+++ b/src/engine/surface_collision.c
@@ -914,200 +914,3 @@ s32 unused_resolve_floor_or_ceil_collisions(s32 checkCeil, f32 *px, f32 *py, f32

    return 0;
 }
-
-/**************************************************
- *                    RAYCASTING                  *
- **************************************************/
-
-#define RAY_OFFSET 30.0f /* How many units to extrapolate surfaces when testing for a raycast */
-#define RAY_STEPS      4 /* How many steps to do when casting rays, default to quartersteps.  */
-
-/**
- * @brief Checks if a ray intersects a surface using Möller–Trumbore intersection algorithm.
- * 
- * @param orig is the starting point of the ray.
- * @param dir is the normalized ray direction.
- * @param dir_length is the length of the ray.
- * @param surface is the surface to check.
- * @param hit_pos returns the position on the surface where the ray intersects it.
- * @param length returns the distance from the starting point to the hit position.
- * @return s32 TRUE if the ray intersects a surface.
- */
-s32 ray_surface_intersect(Vec3f orig, Vec3f dir, f32 dir_length, struct Surface *surface, Vec3f hit_pos, f32 *length) {
-    // Ignore certain surface types.
-    if ((surface->type == SURFACE_INTANGIBLE) || (surface->flags & SURFACE_FLAG_NO_CAM_COLLISION)) return FALSE;
-    // Convert the vertices to Vec3f.
-    Vec3f v0, v1, v2;
-    vec3s_to_vec3f(v0, surface->vertex1);
-    vec3s_to_vec3f(v1, surface->vertex2);
-    vec3s_to_vec3f(v2, surface->vertex3);
-    // Get surface normal and extend it by RAY_OFFSET.
-    Vec3f norm;
-    surface_normal_to_vec3f(norm, surface);
-    vec3_mul_val(norm, RAY_OFFSET);
-    // Move the face forward by RAY_OFFSET.
-    vec3f_add(v0, norm);
-    vec3f_add(v1, norm);
-    vec3f_add(v2, norm);
-    // Make 'e1' (edge 1) the vector from vertex 0 to vertex 1.
-    Vec3f e1;
-    vec3f_diff(e1, v1, v0);
-    // Make 'e2' (edge 2) the vector from vertex 0 to vertex 2.
-    Vec3f e2;
-    vec3f_diff(e2, v2, v0);
-    // Make 'h' the cross product of 'dir' and edge 2.
-    Vec3f h;
-    vec3f_cross(h, dir, e2);
-    // Determine the cos(angle) difference between ray and surface normals.
-    f32 det = vec3f_dot(e1, h);
-    // Check if we're perpendicular from the surface.
-    if ((det > -NEAR_ZERO) && (det < NEAR_ZERO)) return FALSE;
-    // Check if we're making contact with the surface.
-    // Make f the inverse of the cos(angle) between ray and surface normals.
-    f32 f = 1.0f / det; // invDet
-    // Make 's' the vector from vertex 0 to 'orig'.
-    Vec3f s;
-    vec3f_diff(s, orig, v0);
-    // Make 'u' the cos(angle) between vectors 's' and normals, divided by 'det'.
-    f32 u = f * vec3f_dot(s, h);
-    // Check if 'u' is within bounds.
-    if ((u < 0.0f) || (u > 1.0f)) return FALSE;
-    // Make 'q' the cross product of 's' and edge 1. 
-    Vec3f q;
-    vec3f_cross(q, s, e1);
-    // Make 'v' the cos(angle) between the ray and 'q', divided by 'det'.
-    f32 v = f * vec3f_dot(dir, q);
-    // Check if 'v' is within bounds.
-    if ((v < 0.0f) || ((u + v) > 1.0f)) return FALSE;
-    // Get the length between our origin and the surface contact point.
-    // Make '*length' the cos(angle) betqwwn edge 2 and 'q', divided by 'det'.
-    *length = f * vec3f_dot(e2, q);
-    // Check if the length to the hit point is shorter than the ray length.
-    if ((*length <= NEAR_ZERO) || (*length > dir_length)) return FALSE;
-    // Successful contact.
-    // Make 'add_dir' into 'dir' scaled by 'length'.
-    Vec3f add_dir;
-    vec3_prod_val(add_dir, dir, *length);
-    // Make 'hit_pos' into the sum of 'orig' and 'add_dir'.
-    vec3f_sum(hit_pos, orig, add_dir);
-    return TRUE;
-}
-
-void find_surface_on_ray_list(struct SurfaceNode *list, Vec3f orig, Vec3f dir, f32 dir_length, struct Surface **hit_surface, Vec3f hit_pos, f32 *max_length) {
-    s32 hit;
-    f32 length;
-    Vec3f chk_hit_pos;
-    f32 top, bottom;
-#if PUPPYPRINT_DEBUG
-    OSTime first = osGetTime();
-#endif
-    // Get upper and lower bounds of ray
-    if (dir[1] >= 0.0f) {
-        // Ray is upwards.
-        top    = orig[1] + (dir[1] * dir_length);
-        bottom = orig[1];
-    } else {
-        // Ray is downwards.
-        top    = orig[1];
-        bottom = orig[1] + (dir[1] * dir_length);
-    }
-
-    // Iterate through every surface of the list
-    for (; list != NULL; list = list->next) {
-        // Reject surface if out of vertical bounds
-        if ((list->surface->lowerY > top) || (list->surface->upperY < bottom)) continue;
-        // Check intersection between the ray and this surface
-        hit = ray_surface_intersect(orig, dir, dir_length, list->surface, chk_hit_pos, &length);
-        if (hit && (length <= *max_length)) {
-            *hit_surface = list->surface;
-            vec3f_copy(hit_pos, chk_hit_pos);
-            *max_length = length;
-        }
-    }
-#if PUPPYPRINT_DEBUG
-    collisionTime[perfIteration] += osGetTime() - first;
-#endif
-}
-
-void find_surface_on_ray_cell(s32 cellX, s32 cellZ, Vec3f orig, Vec3f normalized_dir, f32 dir_length, struct Surface **hit_surface, Vec3f hit_pos, f32 *max_length, s32 flags) {
-    // Skip if OOB
-    if ((cellX >= 0) && (cellX <= (NUM_CELLS - 1)) && (cellZ >= 0) && (cellZ <= (NUM_CELLS - 1))) {
-        // Iterate through each surface in this partition
-        if ((normalized_dir[1] > -NEAR_ONE) && (flags & RAYCAST_FIND_CEIL)) {
-            find_surface_on_ray_list( gStaticSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_CEILS ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
-            find_surface_on_ray_list(gDynamicSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_CEILS ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
-        }
-        if ((normalized_dir[1] <  NEAR_ONE) && (flags & RAYCAST_FIND_FLOOR)) {
-            find_surface_on_ray_list( gStaticSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_FLOORS].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
-            find_surface_on_ray_list(gDynamicSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_FLOORS].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
-        }
-        if (flags & RAYCAST_FIND_WALL) {
-            find_surface_on_ray_list( gStaticSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_WALLS ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
-            find_surface_on_ray_list(gDynamicSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_WALLS ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
-        }
-        if (flags & RAYCAST_FIND_WATER) {
-            find_surface_on_ray_list( gStaticSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_WATER ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
-            find_surface_on_ray_list(gDynamicSurfacePartition[cellZ][cellX][SPATIAL_PARTITION_WATER ].next, orig, normalized_dir, dir_length, hit_surface, hit_pos, max_length);
-        }
-    }
-}
-
-void find_surface_on_ray(Vec3f orig, Vec3f dir, struct Surface **hit_surface, Vec3f hit_pos, s32 flags) {
-    Vec3f normalized_dir;
-    f32 step;
-    s32 i;
-    const f32 invcell = 1.0f / CELL_SIZE;
-
-    // Set that no surface has been hit
-    *hit_surface = NULL;
-    vec3f_sum(hit_pos, orig, dir);
-
-    // Get normalized direction
-    f32 dir_length = vec3_mag(dir);
-    f32 max_length = dir_length;
-    vec3f_copy(normalized_dir, dir);
-    vec3f_normalize(normalized_dir);
-
-    // Get our cell coordinate
-    f32 fCellX    = (orig[0] + LEVEL_BOUNDARY_MAX) * invcell;
-    f32 fCellZ    = (orig[2] + LEVEL_BOUNDARY_MAX) * invcell;
-    s32 cellX     = fCellX;
-    s32 cellZ     = fCellZ;
-    s32 cellPrevX = cellX;
-    s32 cellPrevZ = cellZ;
-
-    // Don't do DDA if straight down
-    if ((normalized_dir[1] >= NEAR_ONE) || (normalized_dir[1] <= -NEAR_ONE)) {
-        find_surface_on_ray_cell(cellX, cellZ, orig, normalized_dir, dir_length, hit_surface, hit_pos, &max_length, flags);
-        return;
-    }
-
-    // Get cells we cross using DDA
-    f32 absDir0 = absf(dir[0]);
-    f32 absDir2 = absf(dir[2]);
-    if (absDir0 >= absDir2) {
-        step = (RAY_STEPS * absDir0) * invcell;
-    } else {
-        step = (RAY_STEPS * absDir2) * invcell;
-    }
-
-    f32 dx = (dir[0] / step) * invcell;
-    f32 dz = (dir[2] / step) * invcell;
-
-    for (i = 0; i < step && *hit_surface == NULL; i++) {
-        find_surface_on_ray_cell(cellX, cellZ, orig, normalized_dir, dir_length, hit_surface, hit_pos, &max_length, flags);
-
-        // Move cell coordinate
-        fCellX   += dx;
-        fCellZ   += dz;
-        cellPrevX = cellX;
-        cellPrevZ = cellZ;
-        cellX     = fCellX;
-        cellZ     = fCellZ;
-
-        if ((cellPrevX != cellX) && (cellPrevZ != cellZ)) {
-            find_surface_on_ray_cell(cellX, cellPrevZ, orig, normalized_dir, dir_length, hit_surface, hit_pos, &max_length, flags);
-            find_surface_on_ray_cell(cellPrevX, cellZ, orig, normalized_dir, dir_length, hit_surface, hit_pos, &max_length, flags);
-        }
-    }
-}
--- a/src/game/ingame_menu.c
+++ b/src/game/ingame_menu.c
@@ -1448,7 +1448,7 @@ void render_hud_cannon_reticle(void) {
    gSPDisplayList(gDisplayListHead++, dl_draw_triangle);
    gSPPopMatrix(gDisplayListHead++, G_MTX_MODELVIEW);

-    gSPPopMatrix(gDisplayListHead++, G_MTX_MODELVIEW);
+    // gSPPopMatrix(gDisplayListHead++, G_MTX_MODELVIEW);
 }

 void reset_red_coins_collected(void) {
--- a/src/game/save_file.c
+++ b/src/game/save_file.c
@@ -361,6 +361,16 @@ void save_file_load_all(void) {
 }

 #ifdef PUPPYCAM
+void puppycam_check_save(void) { // Checks the saved menu data and resets it to defaults when the checks below fail.
+    if (gSaveBuffer.menuData.firstBoot != 4 // firstBoot must equal 4; any other value triggers the reset
+        || gSaveBuffer.menuData.saveOptions.sensitivityX < 5 // a stored X sensitivity below 5 also triggers the reset
+        || gSaveBuffer.menuData.saveOptions.sensitivityY < 5) { // same threshold for the Y sensitivity
+        wipe_main_menu_data();
+        gSaveBuffer.menuData.firstBoot = 4; // set the marker this function tests for
+        puppycam_default_config(); // NOTE(review): presumably restores default camera options; nothing is saved here - confirm callers persist it
+    }
+}
+
 void puppycam_get_save(void) {
    gPuppyCam.options = gSaveBuffer.menuData.saveOptions;

@@ -384,15 +394,6 @@ void puppycam_set_save(void) {
    gMainMenuDataModified = TRUE;
    save_main_menu_data();
 }
-
-void puppycam_check_save(void) {
-    if (gSaveBuffer.menuData.firstBoot != 4) {
-        wipe_main_menu_data();
-        gSaveBuffer.menuData.firstBoot = 4;
-        puppycam_default_config();
-        puppycam_set_save();
-    }
-}
 #endif

 /**
--- a/src/game/screen_transition.c
+++ b/src/game/screen_transition.c
@@ -130,8 +130,8 @@ void make_tex_transition_vertex(Vtx *verts, s32 n, s8 fadeTimer, struct WarpTran
    u16 zeroTimer = sTransitionTextureFadeCount[fadeTimer];
    f32 centerX = texRadius1 * coss(zeroTimer) - texRadius2 * sins(zeroTimer) + centerTransX;
    f32 centerY = texRadius1 * sins(zeroTimer) + texRadius2 * coss(zeroTimer) + centerTransY;
-    s16 x = lroundf(centerX);
-    s16 y = lroundf(centerY);
+    s16 x = roundf(centerX);
+    s16 y = roundf(centerY);

    make_vertex(verts, n, x, y, -1, tx * 32, ty * 32, r, g, b, 255);
 }
--- a/src/game/skybox.c
+++ b/src/game/skybox.c
@@ -161,7 +161,7 @@ s32 calculate_skybox_scaled_y(s8 player, UNUSED f32 fov) {

    // Scale by 360 / fov
    f32 degreesToScale = 360.0f * pitchInDegrees / 90.0f;
-    s32 roundedY = lroundf(degreesToScale);
+    s32 roundedY = roundf(degreesToScale);

    // Since pitch can be negative, and the tile grid starts 1 octant above the camera's focus, add
    // 5 octants to the y position
Author	SHA1	Message	Date
Mr-Wiseguy	e113978834	Fixed typo in config_graphics.h	2022-01-22 22:19:40 -05:00
Mr-Wiseguy	37632f3954	Implemented shear shadows and slightly faster mtxf_to_mtx function	2022-01-22 22:13:12 -05:00