From 9f515a9daa4fd7d429ef8586d2d7dae4e3d3ad61 Mon Sep 17 00:00:00 2001 From: Francisco Casas Date: Mon, 8 Jul 2024 15:13:07 -0400 Subject: [PATCH] vkd3d-shader/hlsl: Lower non-constant array loads for SM1. This is achieved by means of creating a variable storing zero, loading every array element, comparing if the non-constant index matches the index of that element at runtime, and in that case store the corresponding element in the variable. This seems to be the same strategy that the native compiler uses. --- libs/vkd3d-shader/hlsl_codegen.c | 119 ++++++++++++++++++++ tests/hlsl/non-const-indexing.shader_test | 52 ++++----- tests/hlsl/sm1-const-allocation.shader_test | 6 +- 3 files changed, 148 insertions(+), 29 deletions(-) diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index f081fe94..3527656a 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -2633,6 +2633,123 @@ static bool validate_nonconstant_vector_store_derefs(struct hlsl_ctx *ctx, struc return false; } +/* This pass flattens array loads that include the indexing of a non-constant index into multiple + * constant loads, where the value of only one of them ends up in the resulting node. + * This is achieved through a synthetic variable. The non-constant index is compared for equality + * with every possible value it can have within the array bounds, and the ternary operator is used + * to update the value of the synthetic var when the equality check passes. */ +static bool lower_nonconstant_array_loads(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, + struct hlsl_block *block) +{ + struct hlsl_constant_value zero_value = {0}; + struct hlsl_ir_node *cut_index, *zero, *store; + const struct hlsl_deref *deref; + struct hlsl_type *cut_type; + struct hlsl_ir_load *load; + struct hlsl_ir_var *var; + unsigned int i, i_cut; + + if (instr->type != HLSL_IR_LOAD) + return false; + load = hlsl_ir_load(instr); + deref = &load->src; + + if (deref->path_len == 0) + return false; + + for (i = deref->path_len - 1; ; --i) + { + if (deref->path[i].node->type != HLSL_IR_CONSTANT) + { + i_cut = i; + break; + } + + if (i == 0) + return false; + } + + cut_index = deref->path[i_cut].node; + cut_type = deref->var->data_type; + for (i = 0; i < i_cut; ++i) + cut_type = hlsl_get_element_type_from_path_index(ctx, cut_type, deref->path[i].node); + + if (cut_type->class != HLSL_CLASS_ARRAY) + { + VKD3D_ASSERT(hlsl_type_is_row_major(cut_type)); + return false; + } + + if (!(var = hlsl_new_synthetic_var(ctx, "array_load", instr->data_type, &instr->loc))) + return false; + + if (!(zero = hlsl_new_constant(ctx, instr->data_type, &zero_value, &instr->loc))) + return false; + hlsl_block_add_instr(block, zero); + + if (!(store = hlsl_new_simple_store(ctx, var, zero))) + return false; + hlsl_block_add_instr(block, store); + + TRACE("Lowering non-constant array load on variable '%s'.\n", deref->var->name); + for (i = 0; i < cut_type->e.array.elements_count; ++i) + { + struct hlsl_type *btype = hlsl_get_scalar_type(ctx, HLSL_TYPE_BOOL); + struct hlsl_ir_node *operands[HLSL_MAX_OPERANDS] = {0}; + struct hlsl_ir_node *const_i, *equals, *ternary, *var_store; + struct hlsl_ir_load *var_load, *specific_load; + struct hlsl_deref deref_copy = {0}; + + if (!(const_i = hlsl_new_uint_constant(ctx, i, &cut_index->loc))) + return false; + hlsl_block_add_instr(block, const_i); + + operands[0] = cut_index; + operands[1] = const_i; + if (!(equals = hlsl_new_expr(ctx, HLSL_OP2_EQUAL, operands, btype, 
&cut_index->loc))) + return false; + hlsl_block_add_instr(block, equals); + + if (!(equals = hlsl_new_swizzle(ctx, HLSL_SWIZZLE(X, X, X, X), var->data_type->dimx, equals, &cut_index->loc))) + return false; + hlsl_block_add_instr(block, equals); + + if (!(var_load = hlsl_new_var_load(ctx, var, &cut_index->loc))) + return false; + hlsl_block_add_instr(block, &var_load->node); + + if (!hlsl_copy_deref(ctx, &deref_copy, deref)) + return false; + hlsl_src_remove(&deref_copy.path[i_cut]); + hlsl_src_from_node(&deref_copy.path[i_cut], const_i); + + if (!(specific_load = hlsl_new_load_index(ctx, &deref_copy, NULL, &cut_index->loc))) + { + hlsl_cleanup_deref(&deref_copy); + return false; + } + hlsl_block_add_instr(block, &specific_load->node); + + hlsl_cleanup_deref(&deref_copy); + + operands[0] = equals; + operands[1] = &specific_load->node; + operands[2] = &var_load->node; + if (!(ternary = hlsl_new_expr(ctx, HLSL_OP3_TERNARY, operands, instr->data_type, &cut_index->loc))) + return false; + hlsl_block_add_instr(block, ternary); + + if (!(var_store = hlsl_new_simple_store(ctx, var, ternary))) + return false; + hlsl_block_add_instr(block, var_store); + } + + if (!(load = hlsl_new_var_load(ctx, var, &instr->loc))) + return false; + hlsl_block_add_instr(block, &load->node); + + return true; +} /* Lower combined samples and sampler variables to synthesized separated textures and samplers. * That is, translate SM1-style samples in the source to SM4-style samples in the bytecode. */ static bool lower_combined_samples(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context) @@ -6241,6 +6358,8 @@ int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry if (profile->major_version < 4) { + while (lower_ir(ctx, lower_nonconstant_array_loads, body)); + lower_ir(ctx, lower_ternary, body); lower_ir(ctx, lower_nonfloat_exprs, body); diff --git a/tests/hlsl/non-const-indexing.shader_test b/tests/hlsl/non-const-indexing.shader_test index f8901ffe..55ca15b6 100644 --- a/tests/hlsl/non-const-indexing.shader_test +++ b/tests/hlsl/non-const-indexing.shader_test @@ -1,4 +1,4 @@ -[pixel shader todo(sm<4)] +[pixel shader] uniform float4 f[3]; uniform float2 i; @@ -12,16 +12,16 @@ uniform 0 float4 1.0 2.0 3.0 4.0 uniform 4 float4 5.0 6.0 7.0 8.0 uniform 8 float4 9.0 10.0 11.0 12.0 uniform 12 float4 0 0 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (1.0, 2.0, 3.0, 4.0) uniform 12 float4 1 0 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (5.0, 6.0, 7.0, 8.0) uniform 12 float4 0 1 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (5.0, 6.0, 7.0, 8.0) uniform 12 float4 1 1 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (9.0, 10.0, 11.0, 12.0) @@ -49,7 +49,7 @@ todo(glsl) draw quad probe (0, 0) rgba (14.0, 14.0, 14.0, 14.0) -[pixel shader todo(sm<4)] +[pixel shader] float i; float4 main() : sv_target @@ -61,7 +61,7 @@ float4 main() : sv_target [test] uniform 0 float 2.3 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (3, 3, 3, 3) @@ -90,7 +90,7 @@ todo(sm<4 | glsl) draw quad probe (0, 0) rgba (24.0, 0.0, 21.0, 1.0) -[pixel shader todo(sm<4)] +[pixel shader] uniform float2 i; float4 main() : sv_target @@ -102,20 +102,20 @@ float4 main() : sv_target [test] uniform 0 float4 0 0 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (1.0, 2.0, 3.0, 4.0) uniform 0 float4 1 0 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (5.0, 6.0, 7.0, 
8.0) uniform 0 float4 0 1 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (5.0, 6.0, 7.0, 8.0) uniform 0 float4 1 1 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (9.0, 10.0, 11.0, 12.0) -[pixel shader todo(sm<4)] +[pixel shader] float4 a; float4 main() : sv_target @@ -130,11 +130,11 @@ float4 main() : sv_target [test] uniform 0 float4 0 0 2.4 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (1.0, 120.0, 90.0, 4.0) -[pixel shader todo(sm<4)] +[pixel shader] float i, j; float4 main() : sv_target @@ -148,16 +148,16 @@ float4 main() : sv_target if(sm<4) uniform 0 float 3 if(sm<4) uniform 4 float 1 if(sm>=4) uniform 0 float4 3 1 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (7, 7, 7, 7) if(sm<4) uniform 0 float 5 if(sm<4) uniform 4 float 0 if(sm>=4) uniform 0 float4 5 0 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (10, 10, 10, 10) -[pixel shader todo(sm<4)] +[pixel shader] float i, j; float k; @@ -186,17 +186,17 @@ if(sm<4) uniform 0 float 2 if(sm<4) uniform 4 float 1 if(sm<4) uniform 8 float -1 if(sm>=4) uniform 0 float4 2 1 -1 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (30, 31, 32, 33) if(sm<4) uniform 0 float 1 if(sm<4) uniform 4 float 0 if(sm<4) uniform 8 float 1 if(sm>=4) uniform 0 float4 1 0 1 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (70, 71, 72, 73) -[pixel shader todo(sm<4)] +[pixel shader] float i, j; float4 main() : sv_target @@ -214,12 +214,12 @@ float4 main() : sv_target if(sm<4) uniform 0 float 11 if(sm<4) uniform 4 float 12 if(sm>=4) uniform 0 float4 11 12 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (5, 5, 5, 5) if(sm<4) uniform 0 float 13 if(sm<4) uniform 4 float 10 if(sm>=4) uniform 0 float4 13 10 0 0 -todo(sm<4 | glsl) draw quad +todo(glsl) draw quad probe (0, 0) rgba (9, 9, 9, 9) @@ -309,7 +309,7 @@ probe (0, 0) rgba (1, 5, 3, 4) % reset requirements -[pixel shader todo(sm<4)] +[pixel shader] uniform float4 f[4]; uniform uint4 u; uniform uint4 v; @@ -332,7 +332,7 @@ uniform 8 float 3.0 uniform 12 float 4.0 uniform 16 uint4 3 1 0 2 uniform 20 uint4 0 3 1 2 -todo(sm<4 | glsl) draw quad -if(sm<4) todo probe (0,0) rgba (1.0, 1.0, 1.0, 1.0) +todo(glsl) draw quad +if(sm<4) probe (0,0) rgba (1.0, 1.0, 1.0, 1.0) if(sm>=4 & sm<6) todo probe (0,0) rgba (4.0, 4.0, 4.0, 4.0) if(sm>=6) probe (0,0) rgba (4.0, 3.0, 2.0, 1.0) diff --git a/tests/hlsl/sm1-const-allocation.shader_test b/tests/hlsl/sm1-const-allocation.shader_test index 86f9e5f9..4cc3eae8 100644 --- a/tests/hlsl/sm1-const-allocation.shader_test +++ b/tests/hlsl/sm1-const-allocation.shader_test @@ -375,7 +375,7 @@ draw quad probe (0, 0) rgba (6, 1, 0, 0) -[pixel shader todo] +[pixel shader] // Relative addressing extends the allocation size only up to the array's size. float idx; @@ -405,8 +405,8 @@ uniform 8 float 2 uniform 12 float 3 uniform 16 float 4 uniform 20 float 3 -todo draw quad +draw quad probe (0, 0) rgba (3, 3, 3, 3) uniform 20 float 1 -todo draw quad +draw quad probe (0, 0) rgba (1, 1, 1, 1)
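
For reference, the lowering implemented by lower_nonconstant_array_loads() is conceptually
equivalent to the following HLSL rewrite (an illustrative sketch only, not actual compiler
output; the names "f", "idx", "value" and "load_f" are hypothetical):

    float4 f[3];
    float idx;

    float4 load_f()
    {
        // Start from zero, then for each possible index compare it against
        // "idx" at runtime and select the matching element with a ternary.
        float4 value = 0;
        value = (idx == 0) ? f[0] : value;
        value = (idx == 1) ? f[1] : value;
        value = (idx == 2) ? f[2] : value;
        return value;    // Holds f[idx] for idx in [0, 2], zero otherwise.
    }

The pass is registered as "while (lower_ir(ctx, lower_nonconstant_array_loads, body));",
presumably so that derefs containing more than one non-constant index are lowered one
index at a time until no non-constant indices remain.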