vkd3d-shader/hlsl: Lower non-constant array loads for SM1.

This is achieved by creating a variable initialized to zero, loading every array element, comparing at runtime whether the non-constant index matches the index of that element, and, if it does, storing the corresponding element in the variable. This seems to be the same strategy that the native compiler uses.
Approved-by: Elizabeth Figura (@zfigura) Approved-by: Henri Verbeet (@hverbeet) Merge-Request: https://gitlab.winehq.org/wine/vkd3d/-/merge_requests/936
2025-04-13 05:43:18 -07:00 · 2024-07-08 15:13:07 -04:00 · 2024-08-08 23:47:10 +02:00
parent e0cfd8f86a
commit 9f515a9daa
3 changed files with 148 additions and 29 deletions
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2633,6 +2633,123 @@ static bool validate_nonconstant_vector_store_derefs(struct hlsl_ctx *ctx, struc
    return false;
 }

+/* Lowering pass: replaces a load whose deref path contains a non-constant index into an array by
+ * one constant-index load per array element, of which only the matching one reaches the result.
+ * A synthetic variable is first set to a zero constant; then, for each element index, the index
+ * node is compared for equality with it and a ternary picks that element or the variable's current
+ * value to store back. Returns true once "block" holds the full sequence, false otherwise. */
+static bool lower_nonconstant_array_loads(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr,
+        struct hlsl_block *block)
+{
+    struct hlsl_constant_value zero_value = {0};
+    struct hlsl_ir_node *cut_index, *zero, *store;
+    const struct hlsl_deref *deref;
+    struct hlsl_type *cut_type;
+    struct hlsl_ir_load *load;
+    struct hlsl_ir_var *var;
+    unsigned int i, i_cut;
+
+    if (instr->type != HLSL_IR_LOAD) /* Only load instructions are candidates. */
+        return false;
+    load = hlsl_ir_load(instr);
+    deref = &load->src;
+
+    if (deref->path_len == 0) /* Empty path: there is no index to lower. */
+        return false;
+
+    for (i = deref->path_len - 1; ; --i) /* Walk backwards to the last non-constant path node. */
+    {
+        if (deref->path[i].node->type != HLSL_IR_CONSTANT)
+        {
+            i_cut = i;
+            break;
+        }
+
+        if (i == 0) /* Every path node is constant; also stops the unsigned "i" from wrapping. */
+            return false;
+    }
+
+    cut_index = deref->path[i_cut].node;
+    cut_type = deref->var->data_type;
+    for (i = 0; i < i_cut; ++i) /* Resolve the type that the non-constant index is applied to. */
+        cut_type = hlsl_get_element_type_from_path_index(ctx, cut_type, deref->path[i].node);
+
+    if (cut_type->class != HLSL_CLASS_ARRAY) /* Only indexing into arrays is lowered here. */
+    {
+        VKD3D_ASSERT(hlsl_type_is_row_major(cut_type));
+        return false;
+    }
+
+    if (!(var = hlsl_new_synthetic_var(ctx, "array_load", instr->data_type, &instr->loc)))
+        return false;
+
+    if (!(zero = hlsl_new_constant(ctx, instr->data_type, &zero_value, &instr->loc)))
+        return false;
+    hlsl_block_add_instr(block, zero);
+
+    if (!(store = hlsl_new_simple_store(ctx, var, zero))) /* Initial value, kept if no index matches. */
+        return false;
+    hlsl_block_add_instr(block, store);
+
+    TRACE("Lowering non-constant array load on variable '%s'.\n", deref->var->name);
+    for (i = 0; i < cut_type->e.array.elements_count; ++i) /* One compare/select/store per element. */
+    {
+        struct hlsl_type *btype = hlsl_get_scalar_type(ctx, HLSL_TYPE_BOOL);
+        struct hlsl_ir_node *operands[HLSL_MAX_OPERANDS] = {0};
+        struct hlsl_ir_node *const_i, *equals, *ternary, *var_store;
+        struct hlsl_ir_load *var_load, *specific_load;
+        struct hlsl_deref deref_copy = {0};
+
+        if (!(const_i = hlsl_new_uint_constant(ctx, i, &cut_index->loc)))
+            return false;
+        hlsl_block_add_instr(block, const_i);
+
+        operands[0] = cut_index;
+        operands[1] = const_i;
+        if (!(equals = hlsl_new_expr(ctx, HLSL_OP2_EQUAL, operands, btype, &cut_index->loc)))
+            return false;
+        hlsl_block_add_instr(block, equals);
+
+        if (!(equals = hlsl_new_swizzle(ctx, HLSL_SWIZZLE(X, X, X, X), var->data_type->dimx, equals, &cut_index->loc)))
+            return false;
+        hlsl_block_add_instr(block, equals); /* "equals" is now the X-replicated swizzle of the comparison. */
+
+        if (!(var_load = hlsl_new_var_load(ctx, var, &cut_index->loc))) /* Current accumulated value. */
+            return false;
+        hlsl_block_add_instr(block, &var_load->node);
+
+        if (!hlsl_copy_deref(ctx, &deref_copy, deref))
+            return false;
+        hlsl_src_remove(&deref_copy.path[i_cut]);
+        hlsl_src_from_node(&deref_copy.path[i_cut], const_i); /* The copy now indexes element i by constant. */
+
+        if (!(specific_load = hlsl_new_load_index(ctx, &deref_copy, NULL, &cut_index->loc)))
+        {
+            hlsl_cleanup_deref(&deref_copy);
+            return false;
+        }
+        hlsl_block_add_instr(block, &specific_load->node);
+
+        hlsl_cleanup_deref(&deref_copy); /* NOTE(review): presumably the load keeps its own deref copy - confirm. */
+
+        operands[0] = equals; /* Condition. */
+        operands[1] = &specific_load->node; /* Element i, taken when the equality check passes. */
+        operands[2] = &var_load->node; /* Otherwise the variable's current value is kept. */
+        if (!(ternary = hlsl_new_expr(ctx, HLSL_OP3_TERNARY, operands, instr->data_type, &cut_index->loc)))
+            return false;
+        hlsl_block_add_instr(block, ternary);
+
+        if (!(var_store = hlsl_new_simple_store(ctx, var, ternary)))
+            return false;
+        hlsl_block_add_instr(block, var_store);
+    }
+
+    if (!(load = hlsl_new_var_load(ctx, var, &instr->loc))) /* Final load of the accumulated value. */
+        return false;
+    hlsl_block_add_instr(block, &load->node);
+
+    return true;
+}
 /* Lower combined samples and sampler variables to synthesized separated textures and samplers.
 * That is, translate SM1-style samples in the source to SM4-style samples in the bytecode. */
 static bool lower_combined_samples(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context)
@@ -6241,6 +6358,8 @@ int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry

    if (profile->major_version < 4)
    {
+        while (lower_ir(ctx, lower_nonconstant_array_loads, body));
+
        lower_ir(ctx, lower_ternary, body);

        lower_ir(ctx, lower_nonfloat_exprs, body);
--- a/tests/hlsl/non-const-indexing.shader_test
+++ b/tests/hlsl/non-const-indexing.shader_test
@@ -1,4 +1,4 @@
-[pixel shader todo(sm<4)]
+[pixel shader]
 uniform float4 f[3];
 uniform float2 i;

@@ -12,16 +12,16 @@ uniform 0 float4 1.0 2.0 3.0 4.0
 uniform 4 float4 5.0 6.0 7.0 8.0
 uniform 8 float4 9.0 10.0 11.0 12.0
 uniform 12 float4 0 0 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (1.0, 2.0, 3.0, 4.0)
 uniform 12 float4 1 0 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (5.0, 6.0, 7.0, 8.0)
 uniform 12 float4 0 1 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (5.0, 6.0, 7.0, 8.0)
 uniform 12 float4 1 1 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (9.0, 10.0, 11.0, 12.0)


@@ -49,7 +49,7 @@ todo(glsl) draw quad
 probe (0, 0) rgba (14.0, 14.0, 14.0, 14.0)


-[pixel shader todo(sm<4)]
+[pixel shader]
 float i;

 float4 main() : sv_target
@@ -61,7 +61,7 @@ float4 main() : sv_target

 [test]
 uniform 0 float 2.3
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (3, 3, 3, 3)


@@ -90,7 +90,7 @@ todo(sm<4 | glsl) draw quad
 probe (0, 0) rgba (24.0, 0.0, 21.0, 1.0)


-[pixel shader todo(sm<4)]
+[pixel shader]
 uniform float2 i;

 float4 main() : sv_target
@@ -102,20 +102,20 @@ float4 main() : sv_target

 [test]
 uniform 0 float4 0 0 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (1.0, 2.0, 3.0, 4.0)
 uniform 0 float4 1 0 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (5.0, 6.0, 7.0, 8.0)
 uniform 0 float4 0 1 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (5.0, 6.0, 7.0, 8.0)
 uniform 0 float4 1 1 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (9.0, 10.0, 11.0, 12.0)


-[pixel shader todo(sm<4)]
+[pixel shader]
 float4 a;

 float4 main() : sv_target
@@ -130,11 +130,11 @@ float4 main() : sv_target

 [test]
 uniform 0 float4 0 0 2.4 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (1.0, 120.0, 90.0, 4.0)


-[pixel shader todo(sm<4)]
+[pixel shader]
 float i, j;

 float4 main() : sv_target
@@ -148,16 +148,16 @@ float4 main() : sv_target
 if(sm<4) uniform 0 float 3
 if(sm<4) uniform 4 float 1
 if(sm>=4) uniform 0 float4 3 1 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (7, 7, 7, 7)
 if(sm<4) uniform 0 float 5
 if(sm<4) uniform 4 float 0
 if(sm>=4) uniform 0 float4 5 0 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (10, 10, 10, 10)


-[pixel shader todo(sm<4)]
+[pixel shader]
 float i, j;
 float k;

@@ -186,17 +186,17 @@ if(sm<4) uniform 0 float 2
 if(sm<4) uniform 4 float 1
 if(sm<4) uniform 8 float -1
 if(sm>=4) uniform 0 float4 2 1 -1 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (30, 31, 32, 33)
 if(sm<4) uniform 0 float 1
 if(sm<4) uniform 4 float 0
 if(sm<4) uniform 8 float 1
 if(sm>=4) uniform 0 float4 1 0 1 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (70, 71, 72, 73)


-[pixel shader todo(sm<4)]
+[pixel shader]
 float i, j;

 float4 main() : sv_target
@@ -214,12 +214,12 @@ float4 main() : sv_target
 if(sm<4) uniform 0 float 11
 if(sm<4) uniform 4 float 12
 if(sm>=4) uniform 0 float4 11 12 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (5, 5, 5, 5)
 if(sm<4) uniform 0 float 13
 if(sm<4) uniform 4 float 10
 if(sm>=4) uniform 0 float4 13 10 0 0
-todo(sm<4 | glsl) draw quad
+todo(glsl) draw quad
 probe (0, 0) rgba (9, 9, 9, 9)


@@ -309,7 +309,7 @@ probe (0, 0) rgba (1, 5, 3, 4)
 % reset requirements


-[pixel shader todo(sm<4)]
+[pixel shader]
 uniform float4 f[4];
 uniform uint4 u;
 uniform uint4 v;
@@ -332,7 +332,7 @@ uniform 8 float 3.0
 uniform 12 float 4.0
 uniform 16 uint4 3 1 0 2
 uniform 20 uint4 0 3 1 2
-todo(sm<4 | glsl) draw quad
-if(sm<4)         todo probe (0,0) rgba (1.0, 1.0, 1.0, 1.0)
+todo(glsl) draw quad
+if(sm<4)         probe (0,0) rgba (1.0, 1.0, 1.0, 1.0)
 if(sm>=4 & sm<6) todo probe (0,0) rgba (4.0, 4.0, 4.0, 4.0)
 if(sm>=6)        probe (0,0) rgba (4.0, 3.0, 2.0, 1.0)
--- a/tests/hlsl/sm1-const-allocation.shader_test
+++ b/tests/hlsl/sm1-const-allocation.shader_test
@@ -375,7 +375,7 @@ draw quad
 probe (0, 0) rgba (6, 1, 0, 0)


-[pixel shader todo]
+[pixel shader]
 // Relative addressing extends the allocation size only up to the array's size.
 float idx;

@@ -405,8 +405,8 @@ uniform 8 float 2
 uniform 12 float 3
 uniform 16 float 4
 uniform 20 float 3
-todo draw quad
+draw quad
 probe (0, 0) rgba (3, 3, 3, 3)
 uniform 20 float 1
-todo draw quad
+draw quad
 probe (0, 0) rgba (1, 1, 1, 1)