From e49beca0d5132ad871f6cdbaf2aad455b453f01e Mon Sep 17 00:00:00 2001 From: Petrichor Park Date: Mon, 19 Aug 2024 12:19:36 -0500 Subject: [PATCH] vkd3d-shader/hlsl: Implement the countbits() intrinsic. --- libs/vkd3d-shader/hlsl.c | 1 + libs/vkd3d-shader/hlsl.h | 1 + libs/vkd3d-shader/hlsl.y | 15 ++++++++ libs/vkd3d-shader/hlsl_codegen.c | 52 +++++++++++++++++++++++++++ libs/vkd3d-shader/hlsl_constant_ops.c | 29 +++++++++++++++ libs/vkd3d-shader/tpf.c | 1 + tests/hlsl/bitwise.shader_test | 16 ++++----- 7 files changed, 107 insertions(+), 8 deletions(-) diff --git a/libs/vkd3d-shader/hlsl.c b/libs/vkd3d-shader/hlsl.c index 5a25efdee..738a55f15 100644 --- a/libs/vkd3d-shader/hlsl.c +++ b/libs/vkd3d-shader/hlsl.c @@ -3726,6 +3726,7 @@ const char *debug_hlsl_expr_op(enum hlsl_ir_expr_op op) [HLSL_OP1_CEIL] = "ceil", [HLSL_OP1_COS] = "cos", [HLSL_OP1_COS_REDUCED] = "cos_reduced", + [HLSL_OP1_COUNTBITS] = "countbits", [HLSL_OP1_DSX] = "dsx", [HLSL_OP1_DSX_COARSE] = "dsx_coarse", [HLSL_OP1_DSX_FINE] = "dsx_fine", diff --git a/libs/vkd3d-shader/hlsl.h b/libs/vkd3d-shader/hlsl.h index 591535a0d..77eea87f0 100644 --- a/libs/vkd3d-shader/hlsl.h +++ b/libs/vkd3d-shader/hlsl.h @@ -720,6 +720,7 @@ enum hlsl_ir_expr_op HLSL_OP1_CEIL, HLSL_OP1_COS, HLSL_OP1_COS_REDUCED, /* Reduced range [-pi, pi], writes to .x */ + HLSL_OP1_COUNTBITS, HLSL_OP1_DSX, HLSL_OP1_DSX_COARSE, HLSL_OP1_DSX_FINE, diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y index d83ad9fe7..71632363c 100644 --- a/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d-shader/hlsl.y @@ -3579,6 +3579,20 @@ static bool intrinsic_cosh(struct hlsl_ctx *ctx, return write_cosh_or_sinh(ctx, params, loc, false); } +static bool intrinsic_countbits(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + /* Handler for the countbits() intrinsic: converts the argument with elementwise_intrinsic_uint_convert_args() (returning false if that fails), then emits an HLSL_OP1_COUNTBITS expression on args[0] whose type is the argument type passed through convert_numeric_type() with HLSL_TYPE_UINT. */ struct hlsl_ir_node *operands[HLSL_MAX_OPERANDS] = {0}; + struct hlsl_type *type; + + if (!elementwise_intrinsic_uint_convert_args(ctx, params, loc)) + return false; + type
= convert_numeric_type(ctx, params->args[0]->data_type, HLSL_TYPE_UINT); + + operands[0] = params->args[0]; + return add_expr(ctx, params->instrs, HLSL_OP1_COUNTBITS, operands, type, loc); +} + static bool intrinsic_cross(struct hlsl_ctx *ctx, const struct parse_initializer *params, const struct vkd3d_shader_location *loc) { @@ -5317,6 +5331,7 @@ intrinsic_functions[] = {"clip", 1, true, intrinsic_clip}, {"cos", 1, true, intrinsic_cos}, {"cosh", 1, true, intrinsic_cosh}, + {"countbits", 1, true, intrinsic_countbits}, {"cross", 2, true, intrinsic_cross}, {"ddx", 1, true, intrinsic_ddx}, {"ddx_coarse", 1, true, intrinsic_ddx_coarse}, diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c index d4ea08138..130280d3b 100644 --- a/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d-shader/hlsl_codegen.c @@ -11062,6 +11062,12 @@ static bool sm4_generate_vsir_instr_expr(struct hlsl_ctx *ctx, sm4_generate_vsir_expr_with_two_destinations(ctx, program, VSIR_OP_SINCOS, expr, 1); return true; + case HLSL_OP1_COUNTBITS: /* Expected only for version >= 5.0 (asserted below); process_entry_function() applies lower_countbits when 4.0 <= version < 5.0. */ + VKD3D_ASSERT(hlsl_type_is_integer(dst_type)); + VKD3D_ASSERT(hlsl_version_ge(ctx, 5, 0)); + generate_vsir_instr_expr_single_instr_op(ctx, program, expr, VSIR_OP_COUNTBITS, 0, 0, true); + return true; + case HLSL_OP1_DSX: VKD3D_ASSERT(type_is_float(dst_type)); generate_vsir_instr_expr_single_instr_op(ctx, program, expr, VSIR_OP_DSX, 0, 0, true); @@ -14097,6 +14103,51 @@ static void loop_unrolling_execute(struct hlsl_ctx *ctx, struct hlsl_block *bloc hlsl_transform_ir(ctx, resolve_loops, block, NULL); } +static bool lower_countbits(struct hlsl_ctx *ctx, struct hlsl_ir_node *node, struct hlsl_block *block) +{ + /* Lowering pass for HLSL_OP1_COUNTBITS: builds, in block, a call to an internal HLSL function compiled from the template below, with the uintX typedef sized by the operand's component count. Returns false for any other node, or if formatting the source, compiling it, or creating the call fails. */ struct hlsl_ir_function_decl *func; + struct hlsl_ir_node *call, *rhs; + struct hlsl_ir_expr *expr; + struct hlsl_ir_var *lhs; + char *body; + + /* Like vkd3d_popcount(). 
*/ + static const char template[] = + "typedef uint%u uintX;\n" + "uintX countbits(uintX v)\n" + "{\n" + " v -= (v >> 1) & 0x55555555;\n" + " v = (v & 0x33333333) + ((v >> 2) & 0x33333333);\n" + " return (((v + (v >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24;\n" + "}\n"; + + if (node->type != HLSL_IR_EXPR) + return false; + + expr = hlsl_ir_expr(node); + if (expr->op != HLSL_OP1_COUNTBITS) + return false; + + rhs = expr->operands[0].node; + if (!(body = hlsl_sprintf_alloc(ctx, template, hlsl_type_component_count(rhs->data_type)))) + return false; + func = hlsl_compile_internal_function(ctx, "countbits", body); + vkd3d_free(body); /* Freed before func is checked, so on both the success and failure paths. */ + if (!func) + return false; + /* Store the operand into the internal function's first parameter variable. */ + lhs = func->parameters.vars[0]; + hlsl_block_add_simple_store(ctx, block, lhs, rhs); + + if (!(call = hlsl_new_call(ctx, func, &node->loc))) + return false; + hlsl_block_add_instr(block, call); + /* Load the call's result from the function's return variable. */ + hlsl_block_add_simple_load(ctx, block, func->return_var, &node->loc); + + return true; +} + static bool lower_f16tof32(struct hlsl_ctx *ctx, struct hlsl_ir_node *node, struct hlsl_block *block) { struct hlsl_ir_function_decl *func; @@ -14355,6 +14406,7 @@ static void process_entry_function(struct hlsl_ctx *ctx, struct list *semantic_v if (hlsl_version_ge(ctx, 4, 0) && hlsl_version_lt(ctx, 5, 0)) { + lower_ir(ctx, lower_countbits, body); lower_ir(ctx, lower_f16tof32, body); lower_ir(ctx, lower_f32tof16, body); } diff --git a/libs/vkd3d-shader/hlsl_constant_ops.c b/libs/vkd3d-shader/hlsl_constant_ops.c index 252ed51a4..71e7c7c88 100644 --- a/libs/vkd3d-shader/hlsl_constant_ops.c +++ b/libs/vkd3d-shader/hlsl_constant_ops.c @@ -280,6 +280,31 @@ static bool fold_cos(struct hlsl_ctx *ctx, struct hlsl_constant_value *dst, return true; } +static bool fold_countbits(struct hlsl_ctx *ctx, struct hlsl_constant_value *dst, + const struct hlsl_type *dst_type, const struct hlsl_ir_constant *src) +{ + /* Constant folding for HLSL_OP1_COUNTBITS: writes vkd3d_popcount() of each of the first dimx components of src into dst. Only HLSL_TYPE_UINT is folded; any other type logs a FIXME and returns false. */ enum hlsl_base_type type = dst_type->e.numeric.type; + unsigned int k; + + VKD3D_ASSERT(type == 
src->node.data_type->e.numeric.type); + + for (k = 0; k < dst_type->e.numeric.dimx; ++k) + { + switch (type) + { + case HLSL_TYPE_UINT: + dst->u[k].u = vkd3d_popcount(src->value.u[k].u); + break; + + default: + FIXME("Fold 'countbits' for type %s.\n", debug_hlsl_type(ctx, dst_type)); + return false; + } + } + + return true; +} + static bool fold_exp2(struct hlsl_ctx *ctx, struct hlsl_constant_value *dst, const struct hlsl_type *dst_type, const struct hlsl_ir_constant *src) { @@ -1407,6 +1432,10 @@ bool hlsl_fold_constant_exprs(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, success = fold_cos(ctx, &res, instr->data_type, arg1); break; + case HLSL_OP1_COUNTBITS: + success = fold_countbits(ctx, &res, instr->data_type, arg1); + break; + case HLSL_OP1_EXP2: success = fold_exp2(ctx, &res, instr->data_type, arg1); break; diff --git a/libs/vkd3d-shader/tpf.c b/libs/vkd3d-shader/tpf.c index 7ff2a305c..6e523250f 100644 --- a/libs/vkd3d-shader/tpf.c +++ b/libs/vkd3d-shader/tpf.c @@ -4248,6 +4248,7 @@ static void tpf_handle_instruction(struct tpf_compiler *tpf, const struct vkd3d_ case VSIR_OP_BREAK: case VSIR_OP_CASE: case VSIR_OP_CONTINUE: + case VSIR_OP_COUNTBITS: case VSIR_OP_CUT: case VSIR_OP_CUT_STREAM: case VSIR_OP_DCL_STREAM: diff --git a/tests/hlsl/bitwise.shader_test b/tests/hlsl/bitwise.shader_test index 3b249efd4..ab1231e22 100644 --- a/tests/hlsl/bitwise.shader_test +++ b/tests/hlsl/bitwise.shader_test @@ -1,4 +1,4 @@ -[pixel shader todo] +[pixel shader todo(sm<4)] uint4 u; float4 main() : sv_target @@ -8,11 +8,11 @@ float4 main() : sv_target [test] uniform 0 uint4 0 0xffffffff 0xcccccccc 0x31415926 -todo(sm<6 | msl & sm>=6) draw quad +todo(sm<4 | msl & sm>=6) draw quad if(sm>=4) probe (0, 0) f32(0, 32, 16, 12) if(sm<4) probe (0, 0) f32(0, 0, 0, 0) -[pixel shader todo] +[pixel shader] float4 main(float4 pos : sv_position) : sv_target { uint4x4 umat = @@ -27,15 +27,15 @@ float4 main(float4 pos : sv_position) : sv_target } [test] -todo(sm<6 | msl & sm>=6) draw 
quad +todo(msl & sm>=6) draw quad if(sm>=4) probe(0, 0) f32(1, 1, 2, 1) if(sm>=4) probe(1, 0) f32(2, 2, 3, 1) if(sm>=4) probe(2, 0) f32(2, 2, 3, 2) if(sm>=4) probe(3, 0) f32(3, 3, 4, 1) -if(sm<4) probe(0, 0) f32(0, 0, 0, 0) -if(sm<4) probe(1, 0) f32(0, 0, 0, 0) -if(sm<4) probe(2, 0) f32(0, 0, 0, 0) -if(sm<4) probe(3, 0) f32(0, 0, 0, 0) +if(sm<4) todo probe(0, 0) f32(0, 0, 0, 0) +if(sm<4) todo probe(1, 0) f32(0, 0, 0, 0) +if(sm<4) todo probe(2, 0) f32(0, 0, 0, 0) +if(sm<4) todo probe(3, 0) f32(0, 0, 0, 0) [require] shader model >= 4.0