From e35604dbf0976c4aa7d19eadca5dcdc98438ee8d Mon Sep 17 00:00:00 2001
From: Petrichor Park
Date: Tue, 13 Aug 2024 11:34:23 -0500
Subject: [PATCH] vkd3d-shader/hlsl: Implement the firstbitlow() intrinsic.

---
 configure.ac                          |  1 +
 include/private/vkd3d_common.h        | 18 +++++++++
 libs/vkd3d-shader/hlsl.c              |  1 +
 libs/vkd3d-shader/hlsl.h              |  1 +
 libs/vkd3d-shader/hlsl.y              | 19 +++++++++
 libs/vkd3d-shader/hlsl_codegen.c      | 58 +++++++++++++++++++++++++++
 libs/vkd3d-shader/hlsl_constant_ops.c | 32 +++++++++++++++
 libs/vkd3d-shader/tpf.c               |  1 +
 tests/hlsl/bitwise.shader_test        |  8 ++--
 9 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index 884d9238c..c552ee524 100644
--- a/configure.ac
+++ b/configure.ac
@@ -173,6 +173,7 @@ AS_IF([test "x$with_xcb" != "xno"],
 
 dnl Check for functions
 VKD3D_CHECK_FUNC([HAVE_BUILTIN_CLZ], [__builtin_clz], [__builtin_clz(0)])
+VKD3D_CHECK_FUNC([HAVE_BUILTIN_CTZ], [__builtin_ctz], [__builtin_ctz(0)])
 VKD3D_CHECK_FUNC([HAVE_BUILTIN_POPCOUNT], [__builtin_popcount], [__builtin_popcount(0)])
 VKD3D_CHECK_FUNC([HAVE_BUILTIN_ADD_OVERFLOW], [__builtin_add_overflow], [__builtin_add_overflow(0, 0, (int *)0)])
 VKD3D_CHECK_FUNC([HAVE_SYNC_ADD_AND_FETCH], [__sync_add_and_fetch], [__sync_add_and_fetch((int *)0, 0)])
diff --git a/include/private/vkd3d_common.h b/include/private/vkd3d_common.h
index e7b65dec7..1e8bd36a1 100644
--- a/include/private/vkd3d_common.h
+++ b/include/private/vkd3d_common.h
@@ -343,6 +343,24 @@ static inline unsigned int vkd3d_log2i(unsigned int x)
 #endif
 }
 
+static inline unsigned int vkd3d_ctz(uint32_t v) /* Index of the least significant set bit of v. */
+{
+#ifdef HAVE_BUILTIN_CTZ
+    return __builtin_ctz(v); /* GCC leaves the result undefined for v == 0. */
+#else
+    unsigned int c = 31; /* Stays 31 when no mask test below matches, i.e. for bit 31 or v == 0. */
+
+    v &= -v; /* Keep only the lowest set bit. */
+    c = (v & 0x0000ffff) ? c - 16 : c; /* Each mask test clears one bit of the index, */
+    c = (v & 0x00ff00ff) ? c - 8 : c; /* from bit 4 of the index down to bit 0. */
+    c = (v & 0x0f0f0f0f) ? c - 4 : c;
+    c = (v & 0x33333333) ? c - 2 : c;
+    c = (v & 0x55555555) ? c - 1 : c;
+
+    return c;
+#endif
+}
+
 static inline void *vkd3d_memmem( const void *haystack, size_t haystack_len, const void *needle, size_t needle_len)
 {
     const char *str = haystack;
diff --git a/libs/vkd3d-shader/hlsl.c b/libs/vkd3d-shader/hlsl.c
index 6c6245b97..ec1e27d94 100644
--- a/libs/vkd3d-shader/hlsl.c
+++ b/libs/vkd3d-shader/hlsl.c
@@ -3733,6 +3733,7 @@ const char *debug_hlsl_expr_op(enum hlsl_ir_expr_op op)
         [HLSL_OP1_COS] = "cos",
         [HLSL_OP1_COS_REDUCED] = "cos_reduced",
         [HLSL_OP1_COUNTBITS] = "countbits",
+        [HLSL_OP1_CTZ] = "ctz",
         [HLSL_OP1_DSX] = "dsx",
         [HLSL_OP1_DSX_COARSE] = "dsx_coarse",
         [HLSL_OP1_DSX_FINE] = "dsx_fine",
diff --git a/libs/vkd3d-shader/hlsl.h b/libs/vkd3d-shader/hlsl.h
index ed109816b..82d77ca23 100644
--- a/libs/vkd3d-shader/hlsl.h
+++ b/libs/vkd3d-shader/hlsl.h
@@ -722,6 +722,7 @@ enum hlsl_ir_expr_op
     HLSL_OP1_COS,
     HLSL_OP1_COS_REDUCED, /* Reduced range [-pi, pi], writes to .x */
     HLSL_OP1_COUNTBITS,
+    HLSL_OP1_CTZ, /* Count trailing zeros; emitted for firstbitlow(). */
     HLSL_OP1_DSX,
     HLSL_OP1_DSX_COARSE,
     HLSL_OP1_DSX_FINE,
diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y
index d22b89915..d3004d7cc 100644
--- a/libs/vkd3d-shader/hlsl.y
+++ b/libs/vkd3d-shader/hlsl.y
@@ -4005,6 +4005,24 @@ static bool intrinsic_firstbithigh(struct hlsl_ctx *ctx,
     return add_expr(ctx, params->instrs, HLSL_OP3_TERNARY, operands, type, loc);
 }
 
+static bool intrinsic_firstbitlow(struct hlsl_ctx *ctx,
+        const struct parse_initializer *params, const struct vkd3d_shader_location *loc)
+{
+    struct hlsl_ir_node *operands[HLSL_MAX_OPERANDS] = {0};
+    struct hlsl_type *type;
+
+    if (hlsl_version_lt(ctx, 4, 0)) /* Reports the error, then still builds the expression. */
+        hlsl_error(ctx, loc, VKD3D_SHADER_ERROR_HLSL_INCOMPATIBLE_PROFILE,
+                "The 'firstbitlow' intrinsic requires shader model 4.0 or higher.");
+
+    if (!elementwise_intrinsic_uint_convert_args(ctx, params, loc))
+        return false;
+    type = convert_numeric_type(ctx, params->args[0]->data_type, HLSL_TYPE_UINT);
+
+    operands[0] = params->args[0];
+    return add_expr(ctx, params->instrs, HLSL_OP1_CTZ, operands, type, loc);
+}
+
 static bool intrinsic_floor(struct hlsl_ctx *ctx,
         const struct parse_initializer *params, const struct vkd3d_shader_location *loc)
 {
@@ -5416,6 +5434,7 @@ intrinsic_functions[] =
     {"f32tof16", 1, true, intrinsic_f32tof16},
     {"faceforward", 3, true, intrinsic_faceforward},
     {"firstbithigh", 1, true, intrinsic_firstbithigh},
+    {"firstbitlow", 1, true, intrinsic_firstbitlow},
     {"floor", 1, true, intrinsic_floor},
     {"fmod", 2, true, intrinsic_fmod},
     {"frac", 1, true, intrinsic_frac},
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 61dcf5543..81161a397 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -11077,6 +11077,12 @@ static bool sm4_generate_vsir_instr_expr(struct hlsl_ctx *ctx,
             generate_vsir_instr_expr_single_instr_op(ctx, program, expr, VSIR_OP_COUNTBITS, 0, 0, true);
             return true;
 
+        case HLSL_OP1_CTZ:
+            VKD3D_ASSERT(hlsl_type_is_integer(dst_type));
+            VKD3D_ASSERT(hlsl_version_ge(ctx, 5, 0)); /* For 4.0 <= SM < 5.0, process_entry_function runs lower_ctz. */
+            generate_vsir_instr_expr_single_instr_op(ctx, program, expr, VSIR_OP_FIRSTBIT_LO, 0, 0, true);
+            return true;
+
         case HLSL_OP1_DSX:
             VKD3D_ASSERT(type_is_float(dst_type));
             generate_vsir_instr_expr_single_instr_op(ctx, program, expr, VSIR_OP_DSX, 0, 0, true);
@@ -14157,6 +14163,57 @@ static bool lower_countbits(struct hlsl_ctx *ctx, struct hlsl_ir_node *node, str
     return true;
 }
 
+static bool lower_ctz(struct hlsl_ctx *ctx, struct hlsl_ir_node *node, struct hlsl_block *block)
+{
+    struct hlsl_ir_function_decl *func;
+    struct hlsl_ir_node *call, *rhs;
+    struct hlsl_ir_expr *expr;
+    struct hlsl_ir_var *lhs;
+    char *body;
+
+    /* ctz() returns the bit number of the least significant 1-bit.
+     * Bit numbers count from the least significant bit; a zero input yields -1. */
+    static const char template[] =
+            "typedef uint%u uintX;\n"
+            "uintX ctz(uintX v)\n"
+            "{\n"
+            " uintX c = 31;\n"
+            " v &= -v;\n"
+            " c = (v & 0x0000ffff) ? c - 16 : c;\n"
+            " c = (v & 0x00ff00ff) ? c - 8 : c;\n"
+            " c = (v & 0x0f0f0f0f) ? c - 4 : c;\n"
+            " c = (v & 0x33333333) ? c - 2 : c;\n"
+            " c = (v & 0x55555555) ? c - 1 : c;\n"
+            " return v ? c : -1;\n"
+            "}\n";
+
+    if (node->type != HLSL_IR_EXPR)
+        return false;
+
+    expr = hlsl_ir_expr(node);
+    if (expr->op != HLSL_OP1_CTZ) /* Only CTZ expressions are rewritten here. */
+        return false;
+
+    rhs = expr->operands[0].node; /* Its component count fills the %u in the template. */
+    if (!(body = hlsl_sprintf_alloc(ctx, template, hlsl_type_component_count(rhs->data_type))))
+        return false;
+    func = hlsl_compile_internal_function(ctx, "ctz", body);
+    vkd3d_free(body); /* Released whether or not compilation succeeded. */
+    if (!func)
+        return false;
+
+    lhs = func->parameters.vars[0];
+    hlsl_block_add_simple_store(ctx, block, lhs, rhs); /* Pass the operand as the function's argument. */
+
+    if (!(call = hlsl_new_call(ctx, func, &node->loc)))
+        return false;
+    hlsl_block_add_instr(block, call);
+
+    hlsl_block_add_simple_load(ctx, block, func->return_var, &node->loc); /* Read back the call's result. */
+
+    return true;
+}
+
 static bool lower_f16tof32(struct hlsl_ctx *ctx, struct hlsl_ir_node *node, struct hlsl_block *block)
 {
     struct hlsl_ir_function_decl *func;
@@ -14479,6 +14536,7 @@ static void process_entry_function(struct hlsl_ctx *ctx, struct list *semantic_v
     if (hlsl_version_ge(ctx, 4, 0) && hlsl_version_lt(ctx, 5, 0))
     {
         lower_ir(ctx, lower_countbits, body);
+        lower_ir(ctx, lower_ctz, body);
         lower_ir(ctx, lower_f16tof32, body);
         lower_ir(ctx, lower_f32tof16, body);
         lower_ir(ctx, lower_find_msb, body);
diff --git a/libs/vkd3d-shader/hlsl_constant_ops.c b/libs/vkd3d-shader/hlsl_constant_ops.c
index 2fd2987f7..5252c10f9 100644
--- a/libs/vkd3d-shader/hlsl_constant_ops.c
+++ b/libs/vkd3d-shader/hlsl_constant_ops.c
@@ -334,6 +334,34 @@ static bool fold_countbits(struct hlsl_ctx *ctx, struct hlsl_constant_value *dst
     return true;
 }
 
+static bool fold_ctz(struct hlsl_ctx *ctx, struct hlsl_constant_value *dst,
+        const struct hlsl_type *dst_type, const struct hlsl_ir_constant *src)
+{
+    enum hlsl_base_type type = dst_type->e.numeric.type;
+    unsigned int k;
+
+    VKD3D_ASSERT(type == src->node.data_type->e.numeric.type);
+
+    for (k = 0; k < dst_type->e.numeric.dimx; ++k) /* Fold each component independently. */
+    {
+        switch (type)
+        {
+            case HLSL_TYPE_UINT:
+                if (!src->value.u[k].u) /* Zero has no set bit; fold to 0xffffffff instead of calling vkd3d_ctz(0). */
+                    dst->u[k].u = ~0u;
+                else
+                    dst->u[k].u = vkd3d_ctz(src->value.u[k].u);
+                break;
+
+            default: /* Only uint operands are folded. */
+                FIXME("Fold 'ctz' for type %s.\n", debug_hlsl_type(ctx, dst_type));
+                return false;
+        }
+    }
+
+    return true;
+}
+
 static bool fold_exp2(struct hlsl_ctx *ctx, struct hlsl_constant_value *dst,
         const struct hlsl_type *dst_type, const struct hlsl_ir_constant *src)
 {
@@ -1469,6 +1497,10 @@ bool hlsl_fold_constant_exprs(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr,
             success = fold_countbits(ctx, &res, instr->data_type, arg1);
             break;
 
+        case HLSL_OP1_CTZ:
+            success = fold_ctz(ctx, &res, instr->data_type, arg1);
+            break;
+
         case HLSL_OP1_EXP2:
             success = fold_exp2(ctx, &res, instr->data_type, arg1);
             break;
diff --git a/libs/vkd3d-shader/tpf.c b/libs/vkd3d-shader/tpf.c
index bf8c324e2..8f2d3dd48 100644
--- a/libs/vkd3d-shader/tpf.c
+++ b/libs/vkd3d-shader/tpf.c
@@ -4275,6 +4275,7 @@ static void tpf_handle_instruction(struct tpf_compiler *tpf, const struct vkd3d_
         case VSIR_OP_F16TOF32:
         case VSIR_OP_F32TOF16:
         case VSIR_OP_FIRSTBIT_HI:
+        case VSIR_OP_FIRSTBIT_LO:
         case VSIR_OP_FIRSTBIT_SHI:
         case VSIR_OP_FRC:
         case VSIR_OP_FTOI:
diff --git a/tests/hlsl/bitwise.shader_test b/tests/hlsl/bitwise.shader_test
index 4e4c39c27..ba95c5e92 100644
--- a/tests/hlsl/bitwise.shader_test
+++ b/tests/hlsl/bitwise.shader_test
@@ -254,7 +254,7 @@ probe (0, 0) rgba (0.0, 1.0, 1.0, 0.0)
 format r32g32b32a32-uint
 size (2d, 640, 480)
 
-[pixel shader todo]
+[pixel shader]
 uint4 u;
 
 uint4 main() : sv_target
@@ -264,10 +264,10 @@ uint4 main() : sv_target
 
 [test]
 uniform 0 uint4 0 0xffffffff 0x00001000 0x00760400
-todo(sm<6 | msl & sm>=6) draw quad
+todo(msl & sm>=6) draw quad
 probe (0, 0) u32(0xffffffff, 0, 12, 10)
 
-[pixel shader todo]
+[pixel shader]
 uint4 main(float4 pos : sv_position) : sv_target
 {
     uint4x4 umat =
@@ -282,7 +282,7 @@ uint4 main(float4 pos : sv_position) : sv_target
 }
 
 [test]
-todo(sm<6 | msl & sm>=6) draw quad
+todo(msl & sm>=6) draw quad
 probe(0, 0) u32(0, 1, 0, 2)
 probe(1, 0) u32(0, 1, 0, 3)
 probe(2, 0) u32(0, 1, 0, 2)