vkd3d-shader/hlsl: Implement loop unrolling.

Based on a patch by Nikolay Sivov.

Co-authored-by: Nikolay Sivov <nsivov@codeweavers.com>
This commit is contained in:
Victor Chiletto 2024-04-11 11:11:14 -03:00 committed by Henri Verbeet
parent 7edd7dcf79
commit 2034a8bab9
Notes: Henri Verbeet 2024-07-11 00:40:50 +02:00
Approved-by: Elizabeth Figura (@zfigura)
Approved-by: Henri Verbeet (@hverbeet)
Merge-Request: https://gitlab.winehq.org/wine/vkd3d/-/merge_requests/786
7 changed files with 236 additions and 18 deletions

View File

@ -5710,6 +5710,222 @@ static void sm1_generate_vsir(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl
sm1_generate_vsir_signature(ctx, program); sm1_generate_vsir_signature(ctx, program);
} }
/* Search "block", in instruction order, for the first break or continue jump
 * that appears before "stop_point". The then/else blocks of "if" instructions
 * are searched recursively; the bodies of other instruction types are not
 * entered.
 *
 * On success the jump is returned and *found_block is set to the block that
 * directly contains it. NULL is returned, and *found_block is left untouched,
 * when "stop_point" is reached or the block is exhausted first. */
static struct hlsl_ir_jump *loop_unrolling_find_jump(struct hlsl_block *block, struct hlsl_ir_node *stop_point,
        struct hlsl_block **found_block)
{
    struct hlsl_ir_node *instr;

    LIST_FOR_EACH_ENTRY(instr, &block->instrs, struct hlsl_ir_node, entry)
    {
        struct hlsl_ir_jump *candidate;
        struct hlsl_ir_if *branch;

        if (instr == stop_point)
            return NULL;

        if (instr->type == HLSL_IR_JUMP)
        {
            candidate = hlsl_ir_jump(instr);

            /* Only break and continue are of interest; any other jump type is skipped. */
            if (candidate->type != HLSL_IR_JUMP_BREAK && candidate->type != HLSL_IR_JUMP_CONTINUE)
                continue;

            *found_block = block;
            return candidate;
        }

        if (instr->type != HLSL_IR_IF)
            continue;

        /* Conditional: look in the "then" side first, then in the "else" side. */
        branch = hlsl_ir_if(instr);
        candidate = loop_unrolling_find_jump(&branch->then_block, stop_point, found_block);
        if (!candidate)
            candidate = loop_unrolling_find_jump(&branch->else_block, stop_point, found_block);
        if (candidate)
            return candidate;
    }

    return NULL;
}
/* Return the maximum number of iterations to attempt when unrolling "loop".
 *
 * Precedence:
 *   1. a non-zero explicit unroll limit on the loop always wins;
 *   2. a forced unroll without an explicit limit uses 1024, for every
 *      shader model;
 *   3. otherwise (implicit unrolling) shader model 4.0 and above is capped
 *      at 254 iterations, and older shader models at 1024. */
static unsigned int loop_unrolling_get_max_iterations(struct hlsl_ctx *ctx, struct hlsl_ir_loop *loop)
{
    if (loop->unroll_limit)
        return loop->unroll_limit;

    /* Only implicit unrolling on SM4+ uses the lower cap. */
    if (loop->unroll_type != HLSL_IR_LOOP_FORCE_UNROLL && hlsl_version_ge(ctx, 4, 0))
        return 254;

    return 1024;
}
/* Try to fully unroll "loop" by repeatedly placing a copy of its body in
 * front of it.
 *
 * ctx:         compilation context, passed through to the helpers.
 * block:       the block handed to hlsl_run_const_passes() after each copy.
 * loop_parent: the block that directly contains "loop"; this is where the
 *              copied instructions land and where jumps are searched for.
 * loop:        the loop instruction to unroll.
 *
 * Returns true when the loop was unrolled; the loop instruction is then
 * removed from its list and freed. Returns false when cloning the body fails,
 * when a break/continue is found nested inside a conditional, or when the
 * iteration cap is reached without an explicit unroll limit. An error is
 * reported for the latter two cases only if the unroll was forced. On failure
 * the copies already inserted are left in place. */
static bool loop_unrolling_unroll_loop(struct hlsl_ctx *ctx, struct hlsl_block *block,
struct hlsl_block *loop_parent, struct hlsl_ir_loop *loop)
{
unsigned int max_iterations, i;
max_iterations = loop_unrolling_get_max_iterations(ctx, loop);
for (i = 0; i < max_iterations; ++i)
{
struct hlsl_block tmp_dst, *jump_block;
struct hlsl_ir_jump *jump = NULL;
/* Clone one iteration of the body and splice it in right before the loop. */
if (!hlsl_clone_block(ctx, &tmp_dst, &loop->body))
return false;
list_move_before(&loop->node.entry, &tmp_dst.instrs);
hlsl_block_cleanup(&tmp_dst);
/* Run the constant passes over the whole "block", not just the new copy.
 * NOTE(review): presumably this is what resolves the conditions guarding
 * break/continue in the copy just inserted; confirm. */
hlsl_run_const_passes(ctx, block);
/* Look for a break/continue located before the loop instruction. */
if ((jump = loop_unrolling_find_jump(loop_parent, &loop->node, &jump_block)))
{
/* Read the jump type now: the jump itself is moved out and cleaned up below. */
enum hlsl_ir_jump_type type = jump->type;
/* The jump sits in a nested (conditional) block rather than directly in
 * loop_parent, which is not supported. */
if (jump_block != loop_parent)
{
if (loop->unroll_type == HLSL_IR_LOOP_FORCE_UNROLL)
hlsl_error(ctx, &jump->node.loc, VKD3D_SHADER_ERROR_HLSL_FAILED_FORCED_UNROLL,
"Unable to unroll loop, unrolling loops with conditional jumps is currently not supported.");
return false;
}
/* Move the jump, and everything after it up to the instruction preceding
 * the loop, into tmp_dst and dispose of it.
 * NOTE(review): relies on tmp_dst.instrs being a valid empty list after the
 * earlier list_move_before()/hlsl_block_cleanup(); confirm against list.h. */
list_move_slice_tail(&tmp_dst.instrs, &jump->node.entry, list_prev(&loop_parent->instrs, &loop->node.entry));
hlsl_block_cleanup(&tmp_dst);
/* A break ends the unrolling; a continue just proceeds to the next iteration. */
if (type == HLSL_IR_JUMP_BREAK)
break;
}
}
/* Native will not emit an error if max_iterations has been reached with an
 * explicit limit. It also will not insert a loop if there are iterations left,
 * i.e. [unroll(4)] for (i = 0; i < 8; ++i). */
if (!loop->unroll_limit && i == max_iterations)
{
if (loop->unroll_type == HLSL_IR_LOOP_FORCE_UNROLL)
hlsl_error(ctx, &loop->node.loc, VKD3D_SHADER_ERROR_HLSL_FAILED_FORCED_UNROLL,
"Unable to unroll loop, maximum iterations reached (%u).", max_iterations);
return false;
}
/* Unrolling succeeded: the original loop instruction is no longer needed. */
list_remove(&loop->node.entry);
hlsl_free_instr(&loop->node);
return true;
}
/*
 * loop_unrolling_find_unrollable_loop() is not the normal way to do things;
 * normal passes simply iterate over the whole block and apply a transformation
 * to every relevant instruction. However, loop unrolling can fail, and we want
 * to leave the loop in its previous state in that case. That isn't a problem by
 * itself, except that loop unrolling needs copy-prop in order to work properly,
 * and copy-prop state at the time of the loop depends on the rest of the program
 * up to that point. This means we need to clone the whole program, and at that
 * point we have to search it again anyway to find the clone of the loop we were
 * going to unroll.
 *
 * FIXME: Ideally we wouldn't clone the whole program; instead we would run copyprop
 * up until the loop instruction, clone just that loop, then use copyprop again
 * with the saved state after unrolling. However, copyprop currently isn't built
 * for that yet [notably, it still relies on indices]. Note also this still doesn't
 * really let us use transform_ir() anyway [since we don't have a good way to say
 * "copyprop from the beginning of the program up to the instruction we're
 * currently processing" from the callback]; we'd have to use a dedicated
 * recursive function instead.
 *
 * The search walks "block" in instruction order and descends into loop bodies,
 * both sides of "if" instructions and every switch case body. A loop's body is
 * searched before the loop itself is considered, so nested candidates are
 * returned before the loop enclosing them. A loop is a candidate when its
 * unroll type is HLSL_IR_LOOP_UNROLL or HLSL_IR_LOOP_FORCE_UNROLL.
 *
 * Returns the first candidate found, with *containing_block set to the block
 * that directly holds it, or NULL if there is none.
 */
static struct hlsl_ir_loop *loop_unrolling_find_unrollable_loop(struct hlsl_ctx *ctx, struct hlsl_block *block,
        struct hlsl_block **containing_block)
{
    struct hlsl_ir_node *node;

    LIST_FOR_EACH_ENTRY(node, &block->instrs, struct hlsl_ir_node, entry)
    {
        struct hlsl_ir_loop *found = NULL;

        if (node->type == HLSL_IR_LOOP)
        {
            struct hlsl_ir_loop *candidate = hlsl_ir_loop(node);

            /* Anything nested in the body takes priority over the loop itself. */
            found = loop_unrolling_find_unrollable_loop(ctx, &candidate->body, containing_block);
            if (!found && (candidate->unroll_type == HLSL_IR_LOOP_UNROLL
                    || candidate->unroll_type == HLSL_IR_LOOP_FORCE_UNROLL))
            {
                *containing_block = block;
                found = candidate;
            }
        }
        else if (node->type == HLSL_IR_IF)
        {
            struct hlsl_ir_if *branch = hlsl_ir_if(node);

            found = loop_unrolling_find_unrollable_loop(ctx, &branch->then_block, containing_block);
            if (!found)
                found = loop_unrolling_find_unrollable_loop(ctx, &branch->else_block, containing_block);
        }
        else if (node->type == HLSL_IR_SWITCH)
        {
            struct hlsl_ir_switch *sw = hlsl_ir_switch(node);
            struct hlsl_ir_switch_case *arm;

            /* Stop at the first case body that yields a candidate. */
            LIST_FOR_EACH_ENTRY(arm, &sw->cases, struct hlsl_ir_switch_case, entry)
            {
                if ((found = loop_unrolling_find_unrollable_loop(ctx, &arm->body, containing_block)))
                    break;
            }
        }

        if (found)
            return found;
    }

    return NULL;
}
static void transform_unroll_loops(struct hlsl_ctx *ctx, struct hlsl_block *block)
{
while (true)
{
struct hlsl_block clone, *containing_block;
struct hlsl_ir_loop *loop, *cloned_loop;
if (!(loop = loop_unrolling_find_unrollable_loop(ctx, block, &containing_block)))
return;
if (!hlsl_clone_block(ctx, &clone, block))
return;
cloned_loop = loop_unrolling_find_unrollable_loop(ctx, &clone, &containing_block);
assert(cloned_loop);
if (!loop_unrolling_unroll_loop(ctx, &clone, containing_block, cloned_loop))
{
hlsl_block_cleanup(&clone);
loop->unroll_type = HLSL_IR_LOOP_FORCE_LOOP;
continue;
}
hlsl_block_cleanup(block);
hlsl_block_init(block);
hlsl_block_add_block(block, &clone);
}
}
int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func, int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func,
enum vkd3d_shader_target_type target_type, struct vkd3d_shader_code *out) enum vkd3d_shader_target_type target_type, struct vkd3d_shader_code *out)
{ {
@ -5796,6 +6012,7 @@ int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry
hlsl_transform_ir(ctx, lower_discard_neg, body, NULL); hlsl_transform_ir(ctx, lower_discard_neg, body, NULL);
} }
transform_unroll_loops(ctx, body);
hlsl_run_const_passes(ctx, body); hlsl_run_const_passes(ctx, body);
remove_unreachable_code(ctx, body); remove_unreachable_code(ctx, body);

View File

@ -152,6 +152,7 @@ enum vkd3d_shader_error
VKD3D_SHADER_ERROR_HLSL_MISSING_TECHNIQUE = 5029, VKD3D_SHADER_ERROR_HLSL_MISSING_TECHNIQUE = 5029,
VKD3D_SHADER_ERROR_HLSL_UNKNOWN_MODIFIER = 5030, VKD3D_SHADER_ERROR_HLSL_UNKNOWN_MODIFIER = 5030,
VKD3D_SHADER_ERROR_HLSL_INVALID_STATE_BLOCK_ENTRY = 5031, VKD3D_SHADER_ERROR_HLSL_INVALID_STATE_BLOCK_ENTRY = 5031,
VKD3D_SHADER_ERROR_HLSL_FAILED_FORCED_UNROLL = 5032,
VKD3D_SHADER_WARNING_HLSL_IMPLICIT_TRUNCATION = 5300, VKD3D_SHADER_WARNING_HLSL_IMPLICIT_TRUNCATION = 5300,
VKD3D_SHADER_WARNING_HLSL_DIVISION_BY_ZERO = 5301, VKD3D_SHADER_WARNING_HLSL_DIVISION_BY_ZERO = 5301,

View File

@ -5,7 +5,7 @@ void main(float4 pos : position, out float tex : texcoord, out float4 out_pos :
out_pos = pos; out_pos = pos;
} }
[pixel shader todo(sm<4)] [pixel shader todo(sm<6)]
float4 main(float tex : texcoord) : sv_target float4 main(float tex : texcoord) : sv_target
{ {
int i; int i;
@ -23,7 +23,7 @@ float4 main(float tex : texcoord) : sv_target
} }
[test] [test]
todo(sm<4 | glsl) draw quad todo(sm<6 | glsl) draw quad
probe ( 0, 0, 159, 480) rgba (10.0, 35.0, 0.0, 0.0) probe ( 0, 0, 159, 480) rgba (10.0, 35.0, 0.0, 0.0)
probe (161, 0, 479, 480) rgba (10.0, 38.0, 0.0, 0.0) probe (161, 0, 479, 480) rgba (10.0, 38.0, 0.0, 0.0)
probe (481, 0, 640, 480) rgba ( 5.0, 10.0, 0.0, 0.0) probe (481, 0, 640, 480) rgba ( 5.0, 10.0, 0.0, 0.0)
@ -63,7 +63,7 @@ probe (481, 0, 640, 480) rgba ( 5.0, 10.0, 0.0, 0.0)
[require] [require]
% Reset requirements % Reset requirements
[pixel shader todo(sm<4)] [pixel shader]
float4 main(float tex : texcoord) : sv_target float4 main(float tex : texcoord) : sv_target
{ {
int i; int i;
@ -76,7 +76,7 @@ float4 main(float tex : texcoord) : sv_target
} }
[test] [test]
todo(sm<4 | glsl) draw quad todo(glsl) draw quad
probe (0, 0) rgba (10.0, 45.0, 0.0, 0.0) probe (0, 0) rgba (10.0, 45.0, 0.0, 0.0)
[pixel shader fail(sm<6)] [pixel shader fail(sm<6)]

View File

@ -143,7 +143,7 @@ uniform 0 float 0.9
todo(sm<4 | glsl) draw quad todo(sm<4 | glsl) draw quad
probe (0, 0) rgba (1.0, 0.9, 1.0, 0.6) 1 probe (0, 0) rgba (1.0, 0.9, 1.0, 0.6) 1
[pixel shader todo(sm<4)] [pixel shader]
float func(out float o) float func(out float o)
{ {
o = 0.1; o = 0.1;
@ -181,7 +181,7 @@ float4 main() : sv_target
} }
[test] [test]
todo(sm<4 | glsl) draw quad todo(glsl) draw quad
probe (0, 0) rgba (0.4, 0.3, 0.3, 0.9) 1 probe (0, 0) rgba (0.4, 0.3, 0.3, 0.9) 1
[pixel shader todo(sm<4)] [pixel shader todo(sm<4)]

View File

@ -1,6 +1,6 @@
% TODO: dxcompiler emits no loops for any of these test shaders. % TODO: dxcompiler emits no loops for any of these test shaders.
[pixel shader todo(sm<4)] [pixel shader]
float a; float a;
float4 main() : sv_target float4 main() : sv_target
@ -18,11 +18,11 @@ float4 main() : sv_target
[test] [test]
uniform 0 float 5.0 uniform 0 float 5.0
todo(sm<4 | glsl) draw quad todo(glsl) draw quad
probe (0, 0) rgba (50.0, 50.0, 50.0, 50.0) probe (0, 0) rgba (50.0, 50.0, 50.0, 50.0)
[pixel shader todo(sm<4)] [pixel shader]
float a; float a;
float4 main() : sv_target float4 main() : sv_target
@ -41,10 +41,10 @@ float4 main() : sv_target
[test] [test]
uniform 0 float 4.0 uniform 0 float 4.0
todo(sm<4 | glsl) draw quad todo(glsl) draw quad
probe (0, 0) rgba (20.0, 20.0, 20.0, 20.0) probe (0, 0) rgba (20.0, 20.0, 20.0, 20.0)
[pixel shader todo(sm<4)] [pixel shader]
float a; float a;
float4 main() : sv_target float4 main() : sv_target
@ -70,10 +70,10 @@ float4 main() : sv_target
[test] [test]
uniform 0 float 4.0 uniform 0 float 4.0
todo(sm<4 | glsl) draw quad todo(glsl) draw quad
probe (0, 0) rgba (409.1, 409.1, 409.1, 409.1) probe (0, 0) rgba (409.1, 409.1, 409.1, 409.1)
[pixel shader todo(sm<4)] [pixel shader]
float a; float a;
float4 main() : sv_target float4 main() : sv_target
@ -100,7 +100,7 @@ float4 main() : sv_target
[test] [test]
uniform 0 float 4.0 uniform 0 float 4.0
todo(sm<4 | glsl) draw quad todo(glsl) draw quad
probe (0, 0) rgba (410.1, 410.1, 410.1, 410.1) probe (0, 0) rgba (410.1, 410.1, 410.1, 410.1)
% loop attribute by itself % loop attribute by itself

View File

@ -124,7 +124,7 @@ uniform 0 float 0.9
todo(sm<4 | glsl) draw quad todo(sm<4 | glsl) draw quad
probe (0, 0) rgba (0.4, 0.5, 0.6, 0.7) 1 probe (0, 0) rgba (0.4, 0.5, 0.6, 0.7) 1
[pixel shader todo(sm<4)] [pixel shader]
void main(out float4 ret : sv_target) void main(out float4 ret : sv_target)
{ {
ret = float4(0.1, 0.2, 0.3, 0.4); ret = float4(0.1, 0.2, 0.3, 0.4);
@ -138,7 +138,7 @@ void main(out float4 ret : sv_target)
} }
[test] [test]
todo(sm<4 | glsl) draw quad todo(glsl) draw quad
probe (0, 0) rgba (0.2, 0.4, 0.6, 0.8) probe (0, 0) rgba (0.2, 0.4, 0.6, 0.8)
[pixel shader todo(sm<4)] [pixel shader todo(sm<4)]

View File

@ -124,7 +124,7 @@ float4 main(float4 pos : sv_position) : sv_target
shader model >= 4.0 shader model >= 4.0
shader model < 4.1 shader model < 4.1
[pixel shader todo] [pixel shader]
Texture2DMS<float4, 1> t; Texture2DMS<float4, 1> t;
float4 main(float4 pos : sv_position) : sv_target float4 main(float4 pos : sv_position) : sv_target
@ -139,7 +139,7 @@ float4 main(float4 pos : sv_position) : sv_target
} }
[test] [test]
todo draw quad todo(glsl) draw quad
probe (0, 0) rgba (0.1, 0.2, 0.3, 0.4) probe (0, 0) rgba (0.1, 0.2, 0.3, 0.4)
probe (1, 0) rgba (0.5, 0.7, 0.6, 0.8) probe (1, 0) rgba (0.5, 0.7, 0.6, 0.8)
probe (0, 1) rgba (0.6, 0.5, 0.2, 0.1) probe (0, 1) rgba (0.6, 0.5, 0.2, 0.1)