From cf3e251a9f6ecd1c8654dd29a9111244953988e0 Mon Sep 17 00:00:00 2001
From: Nikolay Sivov <nsivov@codeweavers.com>
Date: Thu, 17 Oct 2024 23:21:18 +0200
Subject: [PATCH] vkd3d-shader/hlsl: Implement the f32tof16() intrinsic.

Signed-off-by: Nikolay Sivov <nsivov@codeweavers.com>
---
 Makefile.am                      |  1 +
 libs/vkd3d-shader/hlsl.c         |  1 +
 libs/vkd3d-shader/hlsl.h         |  1 +
 libs/vkd3d-shader/hlsl.y         | 16 ++++++
 libs/vkd3d-shader/hlsl_codegen.c | 75 +++++++++++++++++++++++++++
 libs/vkd3d-shader/tpf.c          |  6 +++
 tests/hlsl/f32tof16.shader_test  | 87 ++++++++++++++++++++++++++++++++
 7 files changed, 187 insertions(+)
 create mode 100644 tests/hlsl/f32tof16.shader_test

diff --git a/Makefile.am b/Makefile.am
index dbfb1f60d..988c31c43 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -121,6 +121,7 @@ vkd3d_shader_tests = \
 	tests/hlsl/exp.shader_test \
 	tests/hlsl/expr-indexing.shader_test \
 	tests/hlsl/f16tof32.shader_test \
+	tests/hlsl/f32tof16.shader_test \
 	tests/hlsl/faceforward.shader_test \
 	tests/hlsl/ffp-point-size.shader_test \
 	tests/hlsl/float-comparison.shader_test \
diff --git a/libs/vkd3d-shader/hlsl.c b/libs/vkd3d-shader/hlsl.c
index 6ad0117fd..c7aa148ea 100644
--- a/libs/vkd3d-shader/hlsl.c
+++ b/libs/vkd3d-shader/hlsl.c
@@ -3164,6 +3164,7 @@ const char *debug_hlsl_expr_op(enum hlsl_ir_expr_op op)
         [HLSL_OP1_DSY_FINE]     = "dsy_fine",
         [HLSL_OP1_EXP2]         = "exp2",
         [HLSL_OP1_F16TOF32]     = "f16tof32",
+        [HLSL_OP1_F32TOF16]     = "f32tof16",
         [HLSL_OP1_FLOOR]        = "floor",
         [HLSL_OP1_FRACT]        = "fract",
         [HLSL_OP1_LOG2]         = "log2",
diff --git a/libs/vkd3d-shader/hlsl.h b/libs/vkd3d-shader/hlsl.h
index b20f96464..2d5c2e8de 100644
--- a/libs/vkd3d-shader/hlsl.h
+++ b/libs/vkd3d-shader/hlsl.h
@@ -693,6 +693,7 @@ enum hlsl_ir_expr_op
     HLSL_OP1_DSY_FINE,
     HLSL_OP1_EXP2,
     HLSL_OP1_F16TOF32,
+    HLSL_OP1_F32TOF16,
     HLSL_OP1_FLOOR,
     HLSL_OP1_FRACT,
     HLSL_OP1_LOG2,
diff --git a/libs/vkd3d-shader/hlsl.y b/libs/vkd3d-shader/hlsl.y
index 58c6071cf..49cff4c81 100644
--- a/libs/vkd3d-shader/hlsl.y
+++ b/libs/vkd3d-shader/hlsl.y
@@ -4024,6 +4024,21 @@ static bool intrinsic_f16tof32(struct hlsl_ctx *ctx,
     return add_expr(ctx, params->instrs, HLSL_OP1_F16TOF32, operands, type, loc);
 }
 
+static bool intrinsic_f32tof16(struct hlsl_ctx *ctx,
+        const struct parse_initializer *params, const struct vkd3d_shader_location *loc)
+{
+    struct hlsl_ir_node *operands[HLSL_MAX_OPERANDS] = {0};
+    struct hlsl_type *type;
+
+    if (!elementwise_intrinsic_float_convert_args(ctx, params, loc))
+        return false;
+    /* f32tof16() yields bits rather than a float: the type comes from the argument type and HLSL_TYPE_UINT. */
+    type = convert_numeric_type(ctx, params->args[0]->data_type, HLSL_TYPE_UINT);
+    /* Unary HLSL_OP1_F32TOF16 expression: only operands[0] is set, the other slots stay zero-initialised. */
+    operands[0] = params->args[0];
+    return add_expr(ctx, params->instrs, HLSL_OP1_F32TOF16, operands, type, loc);
+}
+
 static bool intrinsic_floor(struct hlsl_ctx *ctx,
         const struct parse_initializer *params, const struct vkd3d_shader_location *loc)
 {
@@ -5199,6 +5214,7 @@ intrinsic_functions[] =
     {"exp",                                 1, true,  intrinsic_exp},
     {"exp2",                                1, true,  intrinsic_exp2},
     {"f16tof32",                            1, true,  intrinsic_f16tof32},
+    {"f32tof16",                            1, true,  intrinsic_f32tof16},
     {"faceforward",                         3, true,  intrinsic_faceforward},
     {"floor",                               1, true,  intrinsic_floor},
     {"fmod",                                2, true,  intrinsic_fmod},
diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 3da967451..6e1b2b437 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -7859,6 +7859,78 @@ static bool lower_f16tof32(struct hlsl_ctx *ctx, struct hlsl_ir_node *node, stru
     return true;
 }
 
+static bool lower_f32tof16(struct hlsl_ctx *ctx, struct hlsl_ir_node *node, struct hlsl_block *block)
+{
+    struct hlsl_ir_node *call, *rhs, *store;
+    struct hlsl_ir_function_decl *func;
+    unsigned int component_count;
+    struct hlsl_ir_load *load;
+    struct hlsl_ir_expr *expr;
+    struct hlsl_ir_var *lhs;
+    char *body;
+    /* For an HLSL_OP1_F32TOF16 node, emits into "block" a call to the integer-only HLSL helper below. */
+    static const char template[] =
+    "typedef uint%u uintX;\n"
+    "uintX soft_f32tof16(float%u x)\n"
+    "{\n"
+    "    uintX v = asuint(x);\n"
+    "    uintX v_abs = v & 0x7fffffff;\n" /* magnitude bits, sign cleared */
+    "    uintX sign_bit = (v >> 16) & 0x8000;\n" /* f32 bit 31 moved down to bit 15 */
+    "    uintX exp = (v >> 23) & 0xff;\n" /* 8-bit biased exponent field */
+    "    uintX mantissa = v & 0x7fffff;\n" /* 23-bit fraction field */
+    "    uintX nan16;\n"
+    "    uintX nan = (v & 0x7f800000) == 0x7f800000;\n" /* exponent all ones: infinities as well as NaNs */
+    "    uintX val;\n"
+    "\n"
+    "    val = 113 - exp;\n" /* small-magnitude path: extra right shift for exponent fields below 113 */
+    "    val = (mantissa + 0x800000) >> val;\n" /* 0x800000 adds the implicit leading one */
+    "    val >>= 13;\n" /* drop the 13 low bits (truncating) */
+    "\n"
+    "    val = (exp - 127) < -38 ? 0 : val;\n" /* unbiased exponent below -38: result is 0 */
+    "\n"
+    "    val = v_abs < 0x38800000 ? val : (v_abs + 0xc8000000) >> 13;\n" /* else: exponent -= 112 (mod 2^32) */
+    "    val = v_abs > 0x47ffe000 ? 0x7bff : val;\n" /* 0x7bff is the largest finite f16 bit pattern */
+    "\n"
+    "    nan16 = (((v >> 13) | (v >> 3) | v) & 0x3ff) + 0x7c00;\n" /* 0x7c00: f16 exponent all ones */
+    "    val = nan ? nan16 : val;\n"
+    "\n"
+    "    return (val & 0x7fff) + sign_bit;\n"
+    "}\n";
+    /* Nodes other than HLSL_OP1_F32TOF16 expressions are not handled here (return false). */
+    if (node->type != HLSL_IR_EXPR)
+        return false;
+
+    expr = hlsl_ir_expr(node);
+
+    if (expr->op != HLSL_OP1_F32TOF16)
+        return false;
+
+    rhs = expr->operands[0].node;
+    component_count = hlsl_type_component_count(rhs->data_type);
+    /* Both %u in the template (uint%u and float%u) receive the operand's component count. */
+    if (!(body = hlsl_sprintf_alloc(ctx, template, component_count, component_count)))
+        return false;
+    /* NOTE(review): "body" is never freed in this function; confirm who owns it after the call below. */
+    if (!(func = hlsl_compile_internal_function(ctx, "soft_f32tof16", body)))
+        return false;
+    /* Emit: store the operand into the first parameter, call, then load the function's return variable. */
+    lhs = func->parameters.vars[0];
+
+    if (!(store = hlsl_new_simple_store(ctx, lhs, rhs)))
+        return false;
+    hlsl_block_add_instr(block, store);
+
+    if (!(call = hlsl_new_call(ctx, func, &node->loc)))
+        return false;
+    hlsl_block_add_instr(block, call);
+
+    if (!(load = hlsl_new_var_load(ctx, func->return_var, &node->loc)))
+        return false;
+    hlsl_block_add_instr(block, &load->node);
+
+    return true;
+}
+
 static void process_entry_function(struct hlsl_ctx *ctx,
         const struct hlsl_block *global_uniform_block, struct hlsl_ir_function_decl *entry_func)
 {
@@ -7887,7 +7959,10 @@ static void process_entry_function(struct hlsl_ctx *ctx,
         return;
 
     if (hlsl_version_ge(ctx, 4, 0) && hlsl_version_lt(ctx, 5, 0))
+    {
         lower_ir(ctx, lower_f16tof32, body);
+        lower_ir(ctx, lower_f32tof16, body);
+    }
 
     lower_return(ctx, entry_func, body, false);
 
diff --git a/libs/vkd3d-shader/tpf.c b/libs/vkd3d-shader/tpf.c
index 835a3846c..b4bf75ccd 100644
--- a/libs/vkd3d-shader/tpf.c
+++ b/libs/vkd3d-shader/tpf.c
@@ -5693,6 +5693,12 @@ static void write_sm4_expr(const struct tpf_compiler *tpf, const struct hlsl_ir_
             write_sm4_unary_op(tpf, VKD3D_SM5_OP_F16TOF32, &expr->node, arg1, 0);
             break;
 
+        case HLSL_OP1_F32TOF16:
+            VKD3D_ASSERT(dst_type->e.numeric.type == HLSL_TYPE_UINT);
+            VKD3D_ASSERT(hlsl_version_ge(tpf->ctx, 5, 0));
+            write_sm4_unary_op(tpf, VKD3D_SM5_OP_F32TOF16, &expr->node, arg1, 0);
+            break;
+
         case HLSL_OP1_FLOOR:
             VKD3D_ASSERT(type_is_float(dst_type));
             write_sm4_unary_op(tpf, VKD3D_SM4_OP_ROUND_NI, &expr->node, arg1, 0);
diff --git a/tests/hlsl/f32tof16.shader_test b/tests/hlsl/f32tof16.shader_test
new file mode 100644
index 000000000..ef8efe9bc
--- /dev/null
+++ b/tests/hlsl/f32tof16.shader_test
@@ -0,0 +1,87 @@
+[require]
+shader model >= 4.0
+
+[pixel shader]
+uniform float a;
+
+uint4 main() : sv_target
+{
+    return f32tof16(a); // scalar result fills all four components; every probe below checks each of them
+}
+
+[test]
+uniform 0 float 42.0
+draw quad
+probe (0, 0) rgbaui (0x5140, 0x5140, 0x5140, 0x5140)
+
+% zero
+uniform 0 float 0.0
+draw quad
+probe (0, 0) rgbaui (0x0, 0x0, 0x0, 0x0)
+
+% negative zero: only the f32 sign bit (0x80000000) is set; the f16 result must keep it as 0x8000
+uniform 0 uint 0x80000000
+draw quad
+probe (0, 0) rgbaui (0x8000, 0x8000, 0x8000, 0x8000)
+
+% subnormal number
+uniform 0 float 5.9604645e-08
+draw quad
+probe (0, 0) rgbaui (0x1, 0x1, 0x1, 0x1)
+
+% subnormal number
+uniform 0 float 1.1920929e-07
+draw quad
+probe (0, 0) rgbaui (0x2, 0x2, 0x2, 0x2)
+
+% subnormal number
+uniform 0 float 2.3841858e-07
+draw quad
+probe (0, 0) rgbaui (0x4, 0x4, 0x4, 0x4)
+
+% subnormal number
+uniform 0 float 4.7683716e-07
+draw quad
+probe (0, 0) rgbaui (0x8, 0x8, 0x8, 0x8)
+
+% subnormal number
+uniform 0 float 9.536743e-07
+draw quad
+probe (0, 0) rgbaui (0x10, 0x10, 0x10, 0x10)
+
+% subnormal number
+uniform 0 float 1.9073486e-06
+draw quad
+probe (0, 0) rgbaui (0x20, 0x20, 0x20, 0x20)
+
+% subnormal number
+uniform 0 float 3.8146973e-06
+draw quad
+probe (0, 0) rgbaui (0x40, 0x40, 0x40, 0x40)
+
+% subnormal number
+uniform 0 float 7.6293945e-06
+draw quad
+probe (0, 0) rgbaui (0x80, 0x80, 0x80, 0x80)
+
+% subnormal number
+uniform 0 float 1.5258789e-05
+draw quad
+probe (0, 0) rgbaui (0x100, 0x100, 0x100, 0x100)
+
+% subnormal number
+uniform 0 float 3.0517578e-05
+draw quad
+probe (0, 0) rgbaui (0x200, 0x200, 0x200, 0x200)
+
+% I'd love to use float uniforms here but msvc doesn't scanf infinity :(
+
+% positive infinity
+uniform 0 uint 0x7f800000
+draw quad
+probe (0, 0) rgbaui (0x7c00, 0x7c00, 0x7c00, 0x7c00)
+
+% negative infinity
+uniform 0 uint 0xff800000
+draw quad
+probe (0, 0) rgbaui (0xfc00, 0xfc00, 0xfc00, 0xfc00)