From df5bd15017140c87a2b2a3f5749cc5151afcd546 Mon Sep 17 00:00:00 2001 From: Alistair Leslie-Hughes Date: Tue, 6 May 2025 06:42:46 +1000 Subject: [PATCH] Updated vkd3d to 4289ec60a1f79f68ea9bd3624141b5657b82d6c8. --- libs/vkd3d/include/private/vkd3d_version.h | 2 +- libs/vkd3d/libs/vkd3d-shader/dxil.c | 330 ++++++++++++-------- libs/vkd3d/libs/vkd3d-shader/hlsl.c | 63 ++++ libs/vkd3d/libs/vkd3d-shader/hlsl.h | 19 ++ libs/vkd3d/libs/vkd3d-shader/hlsl.y | 70 ++++- libs/vkd3d/libs/vkd3d-shader/hlsl_codegen.c | 231 +++++++++++--- libs/vkd3d/libs/vkd3d-shader/tpf.c | 3 + 7 files changed, 544 insertions(+), 174 deletions(-) diff --git a/libs/vkd3d/include/private/vkd3d_version.h b/libs/vkd3d/include/private/vkd3d_version.h index 795bc2dc490..fb2e2f11f8b 100644 --- a/libs/vkd3d/include/private/vkd3d_version.h +++ b/libs/vkd3d/include/private/vkd3d_version.h @@ -1 +1 @@ -#define VKD3D_VCS_ID " (git a4f58be0)" +#define VKD3D_VCS_ID " (git 4289ec60)" diff --git a/libs/vkd3d/libs/vkd3d-shader/dxil.c b/libs/vkd3d/libs/vkd3d-shader/dxil.c index 775be85334e..52bab40b553 100644 --- a/libs/vkd3d/libs/vkd3d-shader/dxil.c +++ b/libs/vkd3d/libs/vkd3d-shader/dxil.c @@ -647,6 +647,7 @@ enum sm6_value_type VALUE_TYPE_REG, VALUE_TYPE_ICB, VALUE_TYPE_HANDLE, + VALUE_TYPE_SSA, }; struct sm6_function_data @@ -663,6 +664,11 @@ struct sm6_handle_data bool non_uniform; }; +struct sm6_ssa_data +{ + unsigned int id; +}; + struct sm6_value { const struct sm6_type *type; @@ -673,10 +679,11 @@ struct sm6_value union { struct sm6_function_data function; - struct vkd3d_shader_register reg; const struct vkd3d_shader_immediate_constant_buffer *icb; struct sm6_handle_data handle; + struct sm6_ssa_data ssa; } u; + struct vkd3d_shader_register reg; }; struct dxil_record @@ -2232,7 +2239,15 @@ static inline struct sm6_value *sm6_parser_get_current_value(const struct sm6_pa static inline bool sm6_value_is_register(const struct sm6_value *value) { - return value->value_type == VALUE_TYPE_REG; + switch (value->value_type) + { + case VALUE_TYPE_REG: + case VALUE_TYPE_SSA: + return true; + + default: + return false; + } } static bool sm6_value_is_handle(const struct sm6_value *value) @@ -2242,18 +2257,18 @@ static bool sm6_value_is_handle(const struct sm6_value *value) static inline bool sm6_value_is_constant(const struct sm6_value *value) { - return sm6_value_is_register(value) && register_is_constant(&value->u.reg); + return sm6_value_is_register(value) && register_is_constant(&value->reg); } static bool sm6_value_is_constant_zero(const struct sm6_value *value) { /* Constant vectors do not occur. 
*/ - return sm6_value_is_register(value) && register_is_scalar_constant_zero(&value->u.reg); + return sm6_value_is_register(value) && register_is_scalar_constant_zero(&value->reg); } static inline bool sm6_value_is_undef(const struct sm6_value *value) { - return sm6_value_is_register(value) && value->u.reg.type == VKD3DSPR_UNDEF; + return sm6_value_is_register(value) && value->reg.type == VKD3DSPR_UNDEF; } static bool sm6_value_vector_is_constant_or_undef(const struct sm6_value **values, unsigned int count) @@ -2272,26 +2287,26 @@ static bool sm6_value_is_icb(const struct sm6_value *value) static bool sm6_value_is_ssa(const struct sm6_value *value) { - return sm6_value_is_register(value) && register_is_ssa(&value->u.reg); + return sm6_value_is_register(value) && register_is_ssa(&value->reg); } static bool sm6_value_is_numeric_array(const struct sm6_value *value) { - return sm6_value_is_register(value) && register_is_numeric_array(&value->u.reg); + return sm6_value_is_register(value) && register_is_numeric_array(&value->reg); } static inline unsigned int sm6_value_get_constant_uint(const struct sm6_value *value) { if (!sm6_value_is_constant(value)) return UINT_MAX; - return register_get_uint_value(&value->u.reg); + return register_get_uint_value(&value->reg); } static uint64_t sm6_value_get_constant_uint64(const struct sm6_value *value) { if (!sm6_value_is_constant(value)) return UINT64_MAX; - return register_get_uint64_value(&value->u.reg); + return register_get_uint64_value(&value->reg); } static unsigned int sm6_parser_alloc_ssa_id(struct sm6_parser *sm6) @@ -2390,15 +2405,55 @@ static enum vkd3d_data_type vkd3d_data_type_from_sm6_type(const struct sm6_type return VKD3D_DATA_UINT; } +static void sm6_register_from_value(struct vkd3d_shader_register *reg, const struct sm6_value *value) +{ + switch (value->value_type) + { + case VALUE_TYPE_REG: + *reg = value->reg; + break; + + case VALUE_TYPE_SSA: + register_init_with_id(reg, VKD3DSPR_SSA, vkd3d_data_type_from_sm6_type( + sm6_type_get_scalar_type(value->type, 0)), value->u.ssa.id); + reg->dimension = sm6_type_is_scalar(value->type) ? 
VSIR_DIMENSION_SCALAR : VSIR_DIMENSION_VEC4; + break; + + case VALUE_TYPE_FUNCTION: + case VALUE_TYPE_HANDLE: + case VALUE_TYPE_ICB: + vkd3d_unreachable(); + } +} + +static void sm6_parser_init_ssa_value(struct sm6_parser *sm6, struct sm6_value *value) +{ + unsigned int id; + + if (register_is_ssa(&value->reg) && value->reg.idx[0].offset) + { + id = value->reg.idx[0].offset; + TRACE("Using forward-allocated id %u.\n", id); + } + else + { + id = sm6_parser_alloc_ssa_id(sm6); + } + + value->value_type = VALUE_TYPE_SSA; + value->u.ssa.id = id; + sm6_register_from_value(&value->reg, value); +} + static void register_init_ssa_vector(struct vkd3d_shader_register *reg, const struct sm6_type *type, unsigned int component_count, struct sm6_value *value, struct sm6_parser *sm6) { enum vkd3d_data_type data_type; unsigned int id; - if (value && register_is_ssa(&value->u.reg) && value->u.reg.idx[0].offset) + if (value && register_is_ssa(&value->reg) && value->reg.idx[0].offset) { - id = value->u.reg.idx[0].offset; + id = value->reg.idx[0].offset; TRACE("Using forward-allocated id %u.\n", id); } else @@ -2450,13 +2505,6 @@ static void dst_param_init_vector(struct vkd3d_shader_dst_param *param, unsigned param->shift = 0; } -static void dst_param_init_ssa_scalar(struct vkd3d_shader_dst_param *param, const struct sm6_type *type, - struct sm6_value *value, struct sm6_parser *sm6) -{ - dst_param_init(param); - register_init_ssa_scalar(¶m->reg, type, value, sm6); -} - static inline void src_param_init(struct vkd3d_shader_src_param *param) { param->swizzle = VKD3D_SHADER_SWIZZLE(X, X, X, X); @@ -2480,7 +2528,7 @@ static void src_param_init_vector(struct vkd3d_shader_src_param *param, unsigned static void src_param_init_from_value(struct vkd3d_shader_src_param *param, const struct sm6_value *src) { src_param_init(param); - param->reg = src->u.reg; + sm6_register_from_value(¶m->reg, src); } static void src_param_init_vector_from_reg(struct vkd3d_shader_src_param *param, @@ -2547,9 +2595,9 @@ static bool instruction_dst_param_init_ssa_scalar(struct vkd3d_shader_instructio if (!(param = instruction_dst_params_alloc(ins, 1, sm6))) return false; - dst_param_init_ssa_scalar(param, dst->type, dst, sm6); - param->write_mask = VKD3DSP_WRITEMASK_0; - dst->u.reg = param->reg; + dst_param_init(param); + sm6_parser_init_ssa_value(sm6, dst); + sm6_register_from_value(¶m->reg, dst); return true; } @@ -2560,8 +2608,8 @@ static void instruction_dst_param_init_ssa_vector(struct vkd3d_shader_instructio struct sm6_value *dst = sm6_parser_get_current_value(sm6); dst_param_init_vector(param, component_count); - register_init_ssa_vector(¶m->reg, sm6_type_get_scalar_type(dst->type, 0), component_count, dst, sm6); - dst->u.reg = param->reg; + sm6_parser_init_ssa_value(sm6, dst); + sm6_register_from_value(¶m->reg, dst); } static bool instruction_dst_param_init_temp_vector(struct vkd3d_shader_instruction *ins, struct sm6_parser *sm6) @@ -2576,7 +2624,7 @@ static bool instruction_dst_param_init_temp_vector(struct vkd3d_shader_instructi param->write_mask = VKD3DSP_WRITEMASK_ALL; param->reg.idx[0].offset = 0; param->reg.dimension = VSIR_DIMENSION_VEC4; - dst->u.reg = param->reg; + dst->reg = param->reg; return true; } @@ -2856,11 +2904,9 @@ static size_t sm6_parser_get_value_idx_by_ref(struct sm6_parser *sm6, const stru else { value->type = fwd_type; - value->value_type = VALUE_TYPE_REG; - register_init_with_id(&value->u.reg, VKD3DSPR_SSA, vkd3d_data_type_from_sm6_type( - sm6_type_get_scalar_type(fwd_type, 0)), 
sm6_parser_alloc_ssa_id(sm6)); - value->u.reg.dimension = sm6_type_is_scalar(fwd_type) ? VSIR_DIMENSION_SCALAR - : VSIR_DIMENSION_VEC4; + value->value_type = VALUE_TYPE_SSA; + value->u.ssa.id = sm6_parser_alloc_ssa_id(sm6); + sm6_register_from_value(&value->reg, value); } } @@ -2990,6 +3036,13 @@ static float register_get_float_value(const struct vkd3d_shader_register *reg) return bitcast_uint_to_float(reg->u.immconst_u32[0]); } +static inline float sm6_value_get_constant_float(const struct sm6_value *value) +{ + if (!sm6_value_is_constant(value)) + return UINT_MAX; + return register_get_float_value(&value->reg); +} + static enum vkd3d_result value_allocate_constant_array(struct sm6_value *dst, const struct sm6_type *type, const uint64_t *operands, struct sm6_parser *sm6) { @@ -3066,6 +3119,7 @@ static enum vkd3d_result sm6_parser_init_constexpr_gep(struct sm6_parser *sm6, c struct sm6_value *dst) { const struct sm6_type *elem_type, *pointee_type, *gep_type, *ptr_type; + struct vkd3d_shader_register reg; struct sm6_value *operands[3]; unsigned int i, j, offset; uint64_t value; @@ -3109,7 +3163,9 @@ static enum vkd3d_result sm6_parser_init_constexpr_gep(struct sm6_parser *sm6, c } } - if (operands[0]->u.reg.idx_count > 1) + sm6_register_from_value(®, operands[0]); + + if (reg.idx_count > 1) { WARN("Unsupported stacked GEP.\n"); vkd3d_shader_parser_error(&sm6->p, VKD3D_SHADER_ERROR_DXIL_INVALID_OPERAND, @@ -3170,10 +3226,10 @@ static enum vkd3d_result sm6_parser_init_constexpr_gep(struct sm6_parser *sm6, c "Module does not define a pointer type for a constexpr GEP result."); return VKD3D_ERROR_INVALID_SHADER; } - dst->u.reg = operands[0]->u.reg; - dst->u.reg.idx[1].offset = offset; - dst->u.reg.idx[1].is_in_bounds = record->code == CST_CODE_CE_INBOUNDS_GEP; - dst->u.reg.idx_count = 2; + dst->reg = reg; + dst->reg.idx[1].offset = offset; + dst->reg.idx[1].is_in_bounds = record->code == CST_CODE_CE_INBOUNDS_GEP; + dst->reg.idx_count = 2; return VKD3D_OK; } @@ -3232,7 +3288,7 @@ static enum vkd3d_result sm6_parser_constants_init(struct sm6_parser *sm6, const dst->type = type; dst->value_type = VALUE_TYPE_REG; dst->is_back_ref = true; - vsir_register_init(&dst->u.reg, reg_type, reg_data_type, 0); + vsir_register_init(&dst->reg, reg_type, reg_data_type, 0); switch (record->code) { @@ -3257,9 +3313,9 @@ static enum vkd3d_result sm6_parser_constants_init(struct sm6_parser *sm6, const value = decode_rotated_signed_value(record->operands[0]); if (type->u.width <= 32) - dst->u.reg.u.immconst_u32[0] = value & ((1ull << type->u.width) - 1); + dst->reg.u.immconst_u32[0] = value & ((1ull << type->u.width) - 1); else - dst->u.reg.u.immconst_u64[0] = value; + dst->reg.u.immconst_u64[0] = value; break; @@ -3274,11 +3330,11 @@ static enum vkd3d_result sm6_parser_constants_init(struct sm6_parser *sm6, const } if (type->u.width == 16) - dst->u.reg.u.immconst_u32[0] = record->operands[0]; + dst->reg.u.immconst_u32[0] = record->operands[0]; else if (type->u.width == 32) - dst->u.reg.u.immconst_f32[0] = bitcast_uint_to_float(record->operands[0]); + dst->reg.u.immconst_f32[0] = bitcast_uint_to_float(record->operands[0]); else if (type->u.width == 64) - dst->u.reg.u.immconst_f64[0] = bitcast_uint64_to_double(record->operands[0]); + dst->reg.u.immconst_f64[0] = bitcast_uint64_to_double(record->operands[0]); else vkd3d_unreachable(); @@ -3344,13 +3400,13 @@ static enum vkd3d_result sm6_parser_constants_init(struct sm6_parser *sm6, const /* Resolve later in case forward refs exist. 
*/ dst->type = type; - dst->u.reg.type = VKD3DSPR_COUNT; - dst->u.reg.idx[0].offset = value; + dst->reg.type = VKD3DSPR_COUNT; + dst->reg.idx[0].offset = value; break; case CST_CODE_UNDEF: dxil_record_validate_operand_max_count(record, 0, sm6); - dst->u.reg.type = VKD3DSPR_UNDEF; + dst->reg.type = VKD3DSPR_UNDEF; /* Mark as explicitly undefined, not the result of a missing constant code or instruction. */ dst->is_undefined = true; break; @@ -3359,7 +3415,7 @@ static enum vkd3d_result sm6_parser_constants_init(struct sm6_parser *sm6, const FIXME("Unhandled constant code %u.\n", record->code); vkd3d_shader_parser_error(&sm6->p, VKD3D_SHADER_ERROR_DXIL_INVALID_OPERAND, "Constant code %u is unhandled.", record->code); - dst->u.reg.type = VKD3DSPR_UNDEF; + dst->reg.type = VKD3DSPR_UNDEF; break; } @@ -3377,12 +3433,12 @@ static enum vkd3d_result sm6_parser_constants_init(struct sm6_parser *sm6, const for (i = base_value_idx; i < sm6->value_count; ++i) { dst = &sm6->values[i]; - if (dst->u.reg.type != VKD3DSPR_COUNT) + if (dst->reg.type != VKD3DSPR_COUNT) continue; type = dst->type; - src = &sm6->values[dst->u.reg.idx[0].offset]; + src = &sm6->values[dst->reg.idx[0].offset]; if (!sm6_value_is_numeric_array(src)) { WARN("Value is not an array.\n"); @@ -3393,7 +3449,7 @@ static enum vkd3d_result sm6_parser_constants_init(struct sm6_parser *sm6, const *dst = *src; dst->type = type; - dst->u.reg.data_type = vkd3d_data_type_from_sm6_type(type->u.pointer.type); + dst->reg.data_type = vkd3d_data_type_from_sm6_type(type->u.pointer.type); } return VKD3D_OK; @@ -3442,7 +3498,7 @@ static void sm6_parser_declare_icb(struct sm6_parser *sm6, const struct sm6_type ins = sm6_parser_add_instruction(sm6, VKD3DSIH_DCL_IMMEDIATE_CONSTANT_BUFFER); /* The icb value index will be resolved later so forward references can be handled. */ ins->declaration.icb = (void *)(intptr_t)init; - register_init_with_id(&dst->u.reg, VKD3DSPR_IMMCONSTBUFFER, data_type, init); + register_init_with_id(&dst->reg, VKD3DSPR_IMMCONSTBUFFER, data_type, init); } static void sm6_parser_declare_indexable_temp(struct sm6_parser *sm6, const struct sm6_type *elem_type, @@ -3464,7 +3520,7 @@ static void sm6_parser_declare_indexable_temp(struct sm6_parser *sm6, const stru /* The initialiser value index will be resolved later so forward references can be handled. 
*/ ins->declaration.indexable_temp.initialiser = (void *)(uintptr_t)init; - register_init_with_id(&dst->u.reg, VKD3DSPR_IDXTEMP, data_type, ins->declaration.indexable_temp.register_idx); + register_init_with_id(&dst->reg, VKD3DSPR_IDXTEMP, data_type, ins->declaration.indexable_temp.register_idx); } static void sm6_parser_declare_tgsm_raw(struct sm6_parser *sm6, const struct sm6_type *elem_type, @@ -3477,7 +3533,7 @@ static void sm6_parser_declare_tgsm_raw(struct sm6_parser *sm6, const struct sm6 ins = sm6_parser_add_instruction(sm6, VKD3DSIH_DCL_TGSM_RAW); dst_param_init(&ins->declaration.tgsm_raw.reg); register_init_with_id(&ins->declaration.tgsm_raw.reg.reg, VKD3DSPR_GROUPSHAREDMEM, data_type, sm6->tgsm_count++); - dst->u.reg = ins->declaration.tgsm_raw.reg.reg; + dst->reg = ins->declaration.tgsm_raw.reg.reg; dst->structure_stride = 0; ins->declaration.tgsm_raw.alignment = alignment; byte_count = elem_type->u.width / 8u; @@ -3503,7 +3559,7 @@ static void sm6_parser_declare_tgsm_structured(struct sm6_parser *sm6, const str dst_param_init(&ins->declaration.tgsm_structured.reg); register_init_with_id(&ins->declaration.tgsm_structured.reg.reg, VKD3DSPR_GROUPSHAREDMEM, data_type, sm6->tgsm_count++); - dst->u.reg = ins->declaration.tgsm_structured.reg.reg; + dst->reg = ins->declaration.tgsm_structured.reg.reg; structure_stride = elem_type->u.width / 8u; if (structure_stride != 4) { @@ -3812,11 +3868,11 @@ static enum vkd3d_result sm6_parser_globals_init(struct sm6_parser *sm6) const struct vkd3d_shader_immediate_constant_buffer *icb; struct sm6_value *value = &sm6->values[i]; - if (!sm6_value_is_register(value) || value->u.reg.type != VKD3DSPR_IMMCONSTBUFFER) + if (!sm6_value_is_register(value) || value->reg.type != VKD3DSPR_IMMCONSTBUFFER) continue; - if ((icb = resolve_forward_initialiser(value->u.reg.idx[0].offset, sm6))) - value->u.reg.idx[0].offset = icb->register_idx; + if ((icb = resolve_forward_initialiser(value->reg.idx[0].offset, sm6))) + value->reg.idx[0].offset = icb->register_idx; } return VKD3D_OK; @@ -4027,8 +4083,9 @@ struct function_emission_state unsigned int temp_idx; }; -static bool sm6_parser_emit_reg_composite_construct(struct sm6_parser *sm6, const struct vkd3d_shader_register **operand_regs, - unsigned int component_count, struct function_emission_state *state, struct vkd3d_shader_register *reg); +static bool sm6_parser_emit_reg_composite_construct(struct sm6_parser *sm6, + const struct vkd3d_shader_register *operand_regs, unsigned int component_count, + struct function_emission_state *state, struct vkd3d_shader_register *reg); static void sm6_parser_emit_alloca(struct sm6_parser *sm6, const struct dxil_record *record, struct vkd3d_shader_instruction *ins, struct sm6_value *dst) @@ -4136,11 +4193,11 @@ static enum vkd3d_shader_opcode map_dx_atomicrmw_op(uint64_t code) static void sm6_parser_emit_atomicrmw(struct sm6_parser *sm6, const struct dxil_record *record, struct function_emission_state *state, struct sm6_value *dst) { - struct vkd3d_shader_register coord, const_offset, const_zero; - const struct vkd3d_shader_register *regs[2]; struct vkd3d_shader_dst_param *dst_params; struct vkd3d_shader_src_param *src_params; + struct vkd3d_shader_register regs[2], reg; struct vkd3d_shader_instruction *ins; + struct vkd3d_shader_register coord; const struct sm6_value *ptr, *src; enum vkd3d_shader_opcode op; unsigned int i = 0; @@ -4152,7 +4209,9 @@ static void sm6_parser_emit_atomicrmw(struct sm6_parser *sm6, const struct dxil_ || !sm6_value_validate_is_backward_ref(ptr, sm6)) 
return; - if (ptr->u.reg.type != VKD3DSPR_GROUPSHAREDMEM) + sm6_register_from_value(®, ptr); + + if (reg.type != VKD3DSPR_GROUPSHAREDMEM) { WARN("Register is not groupshared.\n"); vkd3d_shader_parser_error(&sm6->p, VKD3D_SHADER_ERROR_DXIL_INVALID_OPERAND, @@ -4187,17 +4246,11 @@ static void sm6_parser_emit_atomicrmw(struct sm6_parser *sm6, const struct dxil_ if (ptr->structure_stride) { - if (ptr->u.reg.idx[1].rel_addr) - { - regs[0] = &ptr->u.reg.idx[1].rel_addr->reg; - } + if (reg.idx[1].rel_addr) + regs[0] = reg.idx[1].rel_addr->reg; else - { - register_make_constant_uint(&const_offset, ptr->u.reg.idx[1].offset); - regs[0] = &const_offset; - } - register_make_constant_uint(&const_zero, 0); - regs[1] = &const_zero; + register_make_constant_uint(®s[0], reg.idx[1].offset); + register_make_constant_uint(®s[1], 0); if (!sm6_parser_emit_reg_composite_construct(sm6, regs, 2, state, &coord)) return; } @@ -4214,18 +4267,18 @@ static void sm6_parser_emit_atomicrmw(struct sm6_parser *sm6, const struct dxil_ src_param_make_constant_uint(&src_params[0], 0); src_param_init_from_value(&src_params[1], src); + sm6_parser_init_ssa_value(sm6, dst); + dst_params = instruction_dst_params_alloc(ins, 2, sm6); - register_init_ssa_scalar(&dst_params[0].reg, dst->type, dst, sm6); + sm6_register_from_value(&dst_params[0].reg, dst); dst_param_init(&dst_params[0]); - dst_params[1].reg = ptr->u.reg; + dst_params[1].reg = reg; dst_params[1].reg.data_type = VKD3D_DATA_UNUSED; dst_params[1].reg.idx[1].rel_addr = NULL; dst_params[1].reg.idx[1].offset = ~0u; dst_params[1].reg.idx_count = 1; dst_param_init(&dst_params[1]); - - dst->u.reg = dst_params[0].reg; } static enum vkd3d_shader_opcode map_binary_op(uint64_t code, const struct sm6_type *type_a, @@ -4401,9 +4454,9 @@ static void sm6_parser_emit_binop(struct sm6_parser *sm6, const struct dxil_reco dst_param_init(&dst_params[0]); dst_param_init(&dst_params[1]); - register_init_ssa_scalar(&dst_params[index].reg, a->type, dst, sm6); + sm6_parser_init_ssa_value(sm6, dst); + sm6_register_from_value(&dst_params[index].reg, dst); vsir_dst_param_init_null(&dst_params[index ^ 1]); - dst->u.reg = dst_params[index].reg; } else { @@ -4464,7 +4517,7 @@ static void sm6_parser_emit_br(struct sm6_parser *sm6, const struct dxil_record dxil_record_validate_operand_max_count(record, i, sm6); code_block->terminator.type = TERMINATOR_COND_BR; - code_block->terminator.conditional_reg = value->u.reg; + sm6_register_from_value(&code_block->terminator.conditional_reg, value); code_block->terminator.true_block = sm6_function_get_block(function, record->operands[0], sm6); code_block->terminator.false_block = sm6_function_get_block(function, record->operands[1], sm6); } @@ -4472,8 +4525,9 @@ static void sm6_parser_emit_br(struct sm6_parser *sm6, const struct dxil_record ins->opcode = VKD3DSIH_NOP; } -static bool sm6_parser_emit_reg_composite_construct(struct sm6_parser *sm6, const struct vkd3d_shader_register **operand_regs, - unsigned int component_count, struct function_emission_state *state, struct vkd3d_shader_register *reg) +static bool sm6_parser_emit_reg_composite_construct(struct sm6_parser *sm6, + const struct vkd3d_shader_register *operand_regs, unsigned int component_count, + struct function_emission_state *state, struct vkd3d_shader_register *reg) { struct vkd3d_shader_instruction *ins = state->ins; struct vkd3d_shader_src_param *src_params; @@ -4483,25 +4537,25 @@ static bool sm6_parser_emit_reg_composite_construct(struct sm6_parser *sm6, cons if (component_count == 1) { - *reg = 
*operand_regs[0]; + *reg = operand_regs[0]; return true; } for (i = 0; i < component_count; ++i) - all_constant &= register_is_constant(operand_regs[i]); + all_constant &= register_is_constant(&operand_regs[i]); if (all_constant) { - vsir_register_init(reg, VKD3DSPR_IMMCONST, operand_regs[0]->data_type, 0); + vsir_register_init(reg, VKD3DSPR_IMMCONST, operand_regs[0].data_type, 0); reg->dimension = VSIR_DIMENSION_VEC4; for (i = 0; i < component_count; ++i) - reg->u.immconst_u32[i] = operand_regs[i]->u.immconst_u32[0]; + reg->u.immconst_u32[i] = operand_regs[i].u.immconst_u32[0]; for (; i < VKD3D_VEC4_SIZE; ++i) reg->u.immconst_u32[i] = 0; return true; } - register_init_with_id(reg, VKD3DSPR_TEMP, operand_regs[0]->data_type, state->temp_idx++); + register_init_with_id(reg, VKD3DSPR_TEMP, operand_regs[0].data_type, state->temp_idx++); reg->dimension = VSIR_DIMENSION_VEC4; for (i = 0; i < component_count; ++i, ++ins) @@ -4512,7 +4566,7 @@ static bool sm6_parser_emit_reg_composite_construct(struct sm6_parser *sm6, cons return false; src_param_init(&src_params[0]); - src_params[0].reg = *operand_regs[i]; + src_params[0].reg = operand_regs[i]; if (!(dst_param = instruction_dst_params_alloc(ins, 1, sm6))) return false; @@ -4530,11 +4584,11 @@ static bool sm6_parser_emit_reg_composite_construct(struct sm6_parser *sm6, cons static bool sm6_parser_emit_composite_construct(struct sm6_parser *sm6, const struct sm6_value **operands, unsigned int component_count, struct function_emission_state *state, struct vkd3d_shader_register *reg) { - const struct vkd3d_shader_register *operand_regs[VKD3D_VEC4_SIZE]; + struct vkd3d_shader_register operand_regs[VKD3D_VEC4_SIZE]; unsigned int i; for (i = 0; i < component_count; ++i) - operand_regs[i] = &operands[i]->u.reg; + sm6_register_from_value(&operand_regs[i], operands[i]); return sm6_parser_emit_reg_composite_construct(sm6, operand_regs, component_count, state, reg); } @@ -4543,19 +4597,18 @@ static bool sm6_parser_emit_coordinate_construct(struct sm6_parser *sm6, const s unsigned int max_operands, const struct sm6_value *z_operand, struct function_emission_state *state, struct vkd3d_shader_register *reg) { - const struct vkd3d_shader_register *operand_regs[VKD3D_VEC4_SIZE]; + struct vkd3d_shader_register operand_regs[VKD3D_VEC4_SIZE]; unsigned int component_count; for (component_count = 0; component_count < max_operands; ++component_count) { if (!z_operand && operands[component_count]->is_undefined) break; - operand_regs[component_count] = &operands[component_count]->u.reg; + sm6_register_from_value(&operand_regs[component_count], operands[component_count]); } + if (z_operand) - { - operand_regs[component_count++] = &z_operand->u.reg; - } + sm6_register_from_value(&operand_regs[component_count++], z_operand); return sm6_parser_emit_reg_composite_construct(sm6, operand_regs, component_count, state, reg); } @@ -4780,7 +4833,7 @@ static void sm6_parser_emit_dx_atomic_binop(struct sm6_parser *sm6, enum dx_intr } else { - reg = operands[coord_idx]->u.reg; + sm6_register_from_value(®, operands[coord_idx]); } for (i = coord_idx + coord_count; i < coord_idx + 3; ++i) @@ -4810,7 +4863,7 @@ static void sm6_parser_emit_dx_atomic_binop(struct sm6_parser *sm6, enum dx_intr dst_param_init(&dst_params[1]); sm6_register_from_handle(sm6, &resource->u.handle, &dst_params[1].reg); - dst->u.reg = dst_params[0].reg; + dst->reg = dst_params[0].reg; } static void sm6_parser_emit_dx_barrier(struct sm6_parser *sm6, enum dx_intrinsic_opcode op, @@ -5957,7 +6010,7 @@ static void 
sm6_parser_emit_dx_sincos(struct sm6_parser *sm6, enum dx_intrinsic_ dst_param_init(&dst_params[1]); register_init_ssa_scalar(&dst_params[index].reg, dst->type, dst, sm6); vsir_dst_param_init_null(&dst_params[index ^ 1]); - dst->u.reg = dst_params[index].reg; + dst->reg = dst_params[index].reg; } static void sm6_parser_emit_dx_split_double(struct sm6_parser *sm6, enum dx_intrinsic_opcode op, @@ -6574,7 +6627,7 @@ static void sm6_parser_emit_unhandled(struct sm6_parser *sm6, struct vkd3d_shade return; type = sm6_type_get_scalar_type(dst->type, 0); - vsir_register_init(&dst->u.reg, VKD3DSPR_UNDEF, vkd3d_data_type_from_sm6_type(type), 0); + vsir_register_init(&dst->reg, VKD3DSPR_UNDEF, vkd3d_data_type_from_sm6_type(type), 0); /* dst->is_undefined is not set here because it flags only explicitly undefined values. */ } @@ -6680,7 +6733,7 @@ static void sm6_parser_emit_call(struct sm6_parser *sm6, const struct dxil_recor "Expected a constant integer dx intrinsic function id."); return; } - sm6_parser_decode_dx_op(sm6, register_get_uint_value(&op_value->u.reg), + sm6_parser_decode_dx_op(sm6, sm6_value_get_constant_uint(op_value), fn_value->u.function.name, &operands[1], operand_count - 1, state, dst); } @@ -6826,10 +6879,10 @@ static void sm6_parser_emit_cast(struct sm6_parser *sm6, const struct dxil_recor if (handler_idx == VKD3DSIH_NOP) { - dst->u.reg = value->u.reg; + sm6_register_from_value(&dst->reg, value); /* Set the result type for casts from 16-bit min precision. */ if (type->u.width != 16) - dst->u.reg.data_type = vkd3d_data_type_from_sm6_type(type); + dst->reg.data_type = vkd3d_data_type_from_sm6_type(type); return; } @@ -6841,7 +6894,7 @@ static void sm6_parser_emit_cast(struct sm6_parser *sm6, const struct dxil_recor /* bitcast */ if (handler_idx == VKD3DSIH_MOV) - src_param->reg.data_type = dst->u.reg.data_type; + src_param->reg.data_type = dst->reg.data_type; } struct sm6_cmp_info @@ -6994,6 +7047,7 @@ static void sm6_parser_emit_cmpxchg(struct sm6_parser *sm6, const struct dxil_re struct vkd3d_shader_dst_param *dst_params; struct vkd3d_shader_src_param *src_params; const struct sm6_value *ptr, *cmp, *new; + struct vkd3d_shader_register reg; unsigned int i = 0; bool is_volatile; uint64_t code; @@ -7003,7 +7057,9 @@ static void sm6_parser_emit_cmpxchg(struct sm6_parser *sm6, const struct dxil_re || !sm6_value_validate_is_backward_ref(ptr, sm6)) return; - if (ptr->u.reg.type != VKD3DSPR_GROUPSHAREDMEM) + sm6_register_from_value(®, ptr); + + if (reg.type != VKD3DSPR_GROUPSHAREDMEM) { WARN("Register is not groupshared.\n"); vkd3d_shader_parser_error(&sm6->p, VKD3D_SHADER_ERROR_DXIL_INVALID_OPERAND, @@ -7063,10 +7119,10 @@ static void sm6_parser_emit_cmpxchg(struct sm6_parser *sm6, const struct dxil_re return; register_init_ssa_scalar(&dst_params[0].reg, dst->type, dst, sm6); dst_param_init(&dst_params[0]); - dst_params[1].reg = ptr->u.reg; + dst_params[1].reg = reg; dst_param_init(&dst_params[1]); - dst->u.reg = dst_params[0].reg; + dst->reg = dst_params[0].reg; } static void sm6_parser_emit_extractval(struct sm6_parser *sm6, const struct dxil_record *record, @@ -7122,7 +7178,7 @@ static void sm6_parser_emit_extractval(struct sm6_parser *sm6, const struct dxil if (!(src_param = instruction_src_params_alloc(ins, 1, sm6))) return; - src_param->reg = src->u.reg; + sm6_register_from_value(&src_param->reg, src); src_param_init_scalar(src_param, elem_idx); instruction_dst_param_init_ssa_scalar(ins, sm6); @@ -7135,7 +7191,7 @@ static void sm6_parser_emit_gep(struct sm6_parser *sm6, const 
struct dxil_record unsigned int elem_idx, operand_idx = 2; enum bitcode_address_space addr_space; const struct sm6_value *elem_value; - struct vkd3d_shader_register *reg; + struct vkd3d_shader_register reg; const struct sm6_value *src; bool is_in_bounds; @@ -7149,7 +7205,9 @@ static void sm6_parser_emit_gep(struct sm6_parser *sm6, const struct dxil_record return; } - if (src->u.reg.idx_count > 1) + sm6_register_from_value(®, src); + + if (reg.idx_count > 1) { WARN("Unsupported stacked GEP.\n"); vkd3d_shader_parser_error(&sm6->p, VKD3D_SHADER_ERROR_DXIL_INVALID_OPERAND, @@ -7218,12 +7276,12 @@ static void sm6_parser_emit_gep(struct sm6_parser *sm6, const struct dxil_record return; } - reg = &dst->u.reg; - *reg = src->u.reg; - reg->idx[1].offset = 0; - register_index_address_init(®->idx[1], elem_value, sm6); - reg->idx[1].is_in_bounds = is_in_bounds; - reg->idx_count = 2; + reg.idx[1].offset = 0; + register_index_address_init(®.idx[1], elem_value, sm6); + reg.idx[1].is_in_bounds = is_in_bounds; + reg.idx_count = 2; + + dst->reg = reg; dst->structure_stride = src->structure_stride; ins->opcode = VKD3DSIH_NOP; @@ -7235,6 +7293,7 @@ static void sm6_parser_emit_load(struct sm6_parser *sm6, const struct dxil_recor const struct sm6_type *elem_type = NULL, *pointee_type; unsigned int alignment, operand_count, i = 0; struct vkd3d_shader_src_param *src_params; + struct vkd3d_shader_register reg; const struct sm6_value *ptr; uint64_t alignment_code; @@ -7272,25 +7331,29 @@ static void sm6_parser_emit_load(struct sm6_parser *sm6, const struct dxil_recor if (record->operands[i]) WARN("Ignoring volatile modifier.\n"); + sm6_register_from_value(®, ptr); + if (ptr->structure_stride) { - VKD3D_ASSERT(ptr->u.reg.type == VKD3DSPR_GROUPSHAREDMEM); + VKD3D_ASSERT(reg.type == VKD3DSPR_GROUPSHAREDMEM); vsir_instruction_init(ins, &sm6->p.location, VKD3DSIH_LD_STRUCTURED); if (!(src_params = instruction_src_params_alloc(ins, 3, sm6))) return; - if (ptr->u.reg.idx[1].rel_addr) - src_params[0] = *ptr->u.reg.idx[1].rel_addr; + if (reg.idx[1].rel_addr) + src_params[0] = *reg.idx[1].rel_addr; else - src_param_make_constant_uint(&src_params[0], ptr->u.reg.idx[1].offset); + src_param_make_constant_uint(&src_params[0], reg.idx[1].offset); /* Struct offset is always zero as there is no struct, just an array. */ src_param_make_constant_uint(&src_params[1], 0); src_param_init_from_value(&src_params[2], ptr); src_params[2].reg.alignment = alignment; + /* The offset is already in src_params[0]. */ + src_params[2].reg.idx_count = 1; } else { - operand_count = 1 + (ptr->u.reg.type == VKD3DSPR_GROUPSHAREDMEM); + operand_count = 1 + (reg.type == VKD3DSPR_GROUPSHAREDMEM); vsir_instruction_init(ins, &sm6->p.location, (operand_count > 1) ? 
VKD3DSIH_LD_RAW : VKD3DSIH_MOV); if (!(src_params = instruction_src_params_alloc(ins, operand_count, sm6))) @@ -7341,11 +7404,11 @@ static void sm6_parser_emit_phi(struct sm6_parser *sm6, const struct dxil_record } dst->type = type; - register_init_ssa_scalar(&dst->u.reg, type, dst, sm6); + register_init_ssa_scalar(&dst->reg, type, dst, sm6); if (!(phi = sm6_block_phi_require_space(code_block, sm6))) return; - phi->reg = dst->u.reg; + sm6_register_from_value(&phi->reg, dst); phi->incoming_count = record->operand_count / 2u; if (!vkd3d_array_reserve((void **)&phi->incoming, &phi->incoming_capacity, phi->incoming_count, @@ -7421,6 +7484,7 @@ static void sm6_parser_emit_store(struct sm6_parser *sm6, const struct dxil_reco struct vkd3d_shader_src_param *src_params; struct vkd3d_shader_dst_param *dst_param; const struct sm6_value *ptr, *src; + struct vkd3d_shader_register reg; uint64_t alignment_code; if (!(ptr = sm6_parser_get_value_by_ref(sm6, record, NULL, &i)) @@ -7455,24 +7519,26 @@ static void sm6_parser_emit_store(struct sm6_parser *sm6, const struct dxil_reco if (record->operands[i]) WARN("Ignoring volatile modifier.\n"); + sm6_register_from_value(®, ptr); + if (ptr->structure_stride) { - VKD3D_ASSERT(ptr->u.reg.type == VKD3DSPR_GROUPSHAREDMEM); + VKD3D_ASSERT(reg.type == VKD3DSPR_GROUPSHAREDMEM); vsir_instruction_init(ins, &sm6->p.location, VKD3DSIH_STORE_STRUCTURED); if (!(src_params = instruction_src_params_alloc(ins, 3, sm6))) return; - if (ptr->u.reg.idx[1].rel_addr) - src_params[0] = *ptr->u.reg.idx[1].rel_addr; + if (reg.idx[1].rel_addr) + src_params[0] = *reg.idx[1].rel_addr; else - src_param_make_constant_uint(&src_params[0], ptr->u.reg.idx[1].offset); + src_param_make_constant_uint(&src_params[0], reg.idx[1].offset); /* Struct offset is always zero as there is no struct, just an array. */ src_param_make_constant_uint(&src_params[1], 0); src_param_init_from_value(&src_params[2], src); } else { - operand_count = 1 + (ptr->u.reg.type == VKD3DSPR_GROUPSHAREDMEM); + operand_count = 1 + (reg.type == VKD3DSPR_GROUPSHAREDMEM); vsir_instruction_init(ins, &sm6->p.location, (operand_count > 1) ? VKD3DSIH_STORE_RAW : VKD3DSIH_MOV); if (!(src_params = instruction_src_params_alloc(ins, operand_count, sm6))) @@ -7484,7 +7550,7 @@ static void sm6_parser_emit_store(struct sm6_parser *sm6, const struct dxil_reco dst_param = instruction_dst_params_alloc(ins, 1, sm6); dst_param_init(dst_param); - dst_param->reg = ptr->u.reg; + dst_param->reg = reg; dst_param->reg.alignment = alignment; /* Groupshared stores contain the address in the src params. 
*/ if (dst_param->reg.type != VKD3DSPR_IDXTEMP) @@ -7529,7 +7595,7 @@ static void sm6_parser_emit_switch(struct sm6_parser *sm6, const struct dxil_rec return; } - terminator->conditional_reg = src->u.reg; + sm6_register_from_value(&terminator->conditional_reg, src); terminator->type = TERMINATOR_SWITCH; terminator->case_count = record->operand_count / 2u; @@ -7645,7 +7711,7 @@ static bool sm6_metadata_get_uint_value(const struct sm6_parser *sm6, if (!sm6_type_is_integer(value->type)) return false; - *u = register_get_uint_value(&value->u.reg); + *u = sm6_value_get_constant_uint(value); return true; } @@ -7664,7 +7730,7 @@ static bool sm6_metadata_get_uint64_value(const struct sm6_parser *sm6, if (!sm6_type_is_integer(value->type)) return false; - *u = register_get_uint64_value(&value->u.reg); + *u = sm6_value_get_constant_uint(value); return true; } @@ -7683,7 +7749,7 @@ static bool sm6_metadata_get_float_value(const struct sm6_parser *sm6, if (!sm6_type_is_floating_point(value->type)) return false; - *f = register_get_float_value(&value->u.reg); + *f = sm6_value_get_constant_float(value); return true; } @@ -7868,7 +7934,7 @@ static void metadata_attachment_record_apply(const struct dxil_record *record, e } else if (metadata_node_get_unary_uint(node, &operand, sm6)) { - dst->u.reg.non_uniform = !!operand; + dst->reg.non_uniform = !!operand; } } else @@ -7940,13 +8006,13 @@ static enum vkd3d_result sm6_function_resolve_phi_incomings(const struct sm6_fun "A PHI incoming value is not a constant or SSA register."); return VKD3D_ERROR_INVALID_SHADER; } - if (src->u.reg.data_type != phi->reg.data_type) + if (src->reg.data_type != phi->reg.data_type) { WARN("Type mismatch.\n"); vkd3d_shader_parser_warning(&sm6->p, VKD3D_SHADER_WARNING_DXIL_TYPE_MISMATCH, "The type of a phi incoming value does not match the result type."); } - phi->incoming[j].reg = src->u.reg; + sm6_register_from_value(&phi->incoming[j].reg, src); } } } diff --git a/libs/vkd3d/libs/vkd3d-shader/hlsl.c b/libs/vkd3d/libs/vkd3d-shader/hlsl.c index a6b46474812..0f9aafbe13e 100644 --- a/libs/vkd3d/libs/vkd3d-shader/hlsl.c +++ b/libs/vkd3d/libs/vkd3d-shader/hlsl.c @@ -2291,6 +2291,26 @@ struct hlsl_ir_node *hlsl_new_interlocked(struct hlsl_ctx *ctx, enum hlsl_interl return &interlocked->node; } +static struct hlsl_ir_node *hlsl_new_sync(struct hlsl_ctx *ctx, + uint32_t sync_flags, const struct vkd3d_shader_location *loc) +{ + struct hlsl_ir_sync *sync; + + if (!(sync = hlsl_alloc(ctx, sizeof(*sync)))) + return NULL; + + init_node(&sync->node, HLSL_IR_SYNC, NULL, loc); + sync->sync_flags = sync_flags; + + return &sync->node; +} + +struct hlsl_ir_node *hlsl_block_add_sync(struct hlsl_ctx *ctx, struct hlsl_block *block, + uint32_t sync_flags, const struct vkd3d_shader_location *loc) +{ + return append_new_instr(ctx, block, hlsl_new_sync(ctx, sync_flags, loc)); +} + bool hlsl_index_is_noncontiguous(struct hlsl_ir_index *index) { struct hlsl_type *type = index->val.node->data_type; @@ -2681,6 +2701,18 @@ static struct hlsl_ir_node *clone_interlocked(struct hlsl_ctx *ctx, return &dst->node; } +static struct hlsl_ir_node *clone_sync(struct hlsl_ctx *ctx, struct hlsl_ir_sync *src) +{ + struct hlsl_ir_sync *dst; + + if (!(dst = hlsl_alloc(ctx, sizeof(*dst)))) + return NULL; + init_node(&dst->node, HLSL_IR_SYNC, NULL, &src->node.loc); + dst->sync_flags = src->sync_flags; + + return &dst->node; +} + static struct hlsl_ir_node *clone_compile(struct hlsl_ctx *ctx, struct clone_instr_map *map, struct hlsl_ir_compile *compile) { @@ -2884,6 +2916,9 @@ 
static struct hlsl_ir_node *clone_instr(struct hlsl_ctx *ctx, case HLSL_IR_INTERLOCKED: return clone_interlocked(ctx, map, hlsl_ir_interlocked(instr)); + case HLSL_IR_SYNC: + return clone_sync(ctx, hlsl_ir_sync(instr)); + case HLSL_IR_COMPILE: return clone_compile(ctx, map, hlsl_ir_compile(instr)); @@ -3341,7 +3376,9 @@ const char *hlsl_node_type_to_string(enum hlsl_ir_node_type type) [HLSL_IR_STORE ] = "HLSL_IR_STORE", [HLSL_IR_SWITCH ] = "HLSL_IR_SWITCH", [HLSL_IR_SWIZZLE ] = "HLSL_IR_SWIZZLE", + [HLSL_IR_INTERLOCKED ] = "HLSL_IR_INTERLOCKED", + [HLSL_IR_SYNC ] = "HLSL_IR_SYNC", [HLSL_IR_COMPILE] = "HLSL_IR_COMPILE", [HLSL_IR_SAMPLER_STATE] = "HLSL_IR_SAMPLER_STATE", @@ -3831,6 +3868,19 @@ static void dump_ir_interlocked(struct vkd3d_string_buffer *buffer, const struct vkd3d_string_buffer_printf(buffer, ")"); } +static void dump_ir_sync(struct vkd3d_string_buffer *buffer, const struct hlsl_ir_sync *sync) +{ + vkd3d_string_buffer_printf(buffer, "sync"); + if (sync->sync_flags & VKD3DSSF_GLOBAL_UAV) + vkd3d_string_buffer_printf(buffer, "_uglobal"); + if (sync->sync_flags & VKD3DSSF_THREAD_GROUP_UAV) + vkd3d_string_buffer_printf(buffer, "_ugroup"); + if (sync->sync_flags & VKD3DSSF_GROUP_SHARED_MEMORY) + vkd3d_string_buffer_printf(buffer, "_g"); + if (sync->sync_flags & VKD3DSSF_THREAD_GROUP) + vkd3d_string_buffer_printf(buffer, "_t"); +} + static void dump_ir_compile(struct hlsl_ctx *ctx, struct vkd3d_string_buffer *buffer, const struct hlsl_ir_compile *compile) { @@ -3968,6 +4018,10 @@ static void dump_instr(struct hlsl_ctx *ctx, struct vkd3d_string_buffer *buffer, dump_ir_interlocked(buffer, hlsl_ir_interlocked(instr)); break; + case HLSL_IR_SYNC: + dump_ir_sync(buffer, hlsl_ir_sync(instr)); + break; + case HLSL_IR_COMPILE: dump_ir_compile(ctx, buffer, hlsl_ir_compile(instr)); break; @@ -4205,6 +4259,11 @@ static void free_ir_interlocked(struct hlsl_ir_interlocked *interlocked) vkd3d_free(interlocked); } +static void free_ir_sync(struct hlsl_ir_sync *sync) +{ + vkd3d_free(sync); +} + static void free_ir_compile(struct hlsl_ir_compile *compile) { unsigned int i; @@ -4295,6 +4354,10 @@ void hlsl_free_instr(struct hlsl_ir_node *node) free_ir_interlocked(hlsl_ir_interlocked(node)); break; + case HLSL_IR_SYNC: + free_ir_sync(hlsl_ir_sync(node)); + break; + case HLSL_IR_COMPILE: free_ir_compile(hlsl_ir_compile(node)); break; diff --git a/libs/vkd3d/libs/vkd3d-shader/hlsl.h b/libs/vkd3d/libs/vkd3d-shader/hlsl.h index 8cb805a2e66..9eb86534f81 100644 --- a/libs/vkd3d/libs/vkd3d-shader/hlsl.h +++ b/libs/vkd3d/libs/vkd3d-shader/hlsl.h @@ -329,7 +329,9 @@ enum hlsl_ir_node_type HLSL_IR_STORE, HLSL_IR_SWIZZLE, HLSL_IR_SWITCH, + HLSL_IR_INTERLOCKED, + HLSL_IR_SYNC, HLSL_IR_COMPILE, HLSL_IR_SAMPLER_STATE, @@ -1006,6 +1008,15 @@ struct hlsl_ir_interlocked struct hlsl_src coords, cmp_value, value; }; +/* Represents a thread synchronization instruction such as GroupMemoryBarrier().*/ +struct hlsl_ir_sync +{ + struct hlsl_ir_node node; + + /* Flags from enum vkd3d_shader_sync_flags. */ + uint32_t sync_flags; +}; + struct hlsl_scope { /* Item entry for hlsl_ctx.scopes. 
*/ @@ -1343,6 +1354,12 @@ static inline struct hlsl_ir_interlocked *hlsl_ir_interlocked(const struct hlsl_ return CONTAINING_RECORD(node, struct hlsl_ir_interlocked, node); } +static inline struct hlsl_ir_sync *hlsl_ir_sync(const struct hlsl_ir_node *node) +{ + VKD3D_ASSERT(node->type == HLSL_IR_SYNC); + return CONTAINING_RECORD(node, struct hlsl_ir_sync, node); +} + static inline struct hlsl_ir_compile *hlsl_ir_compile(const struct hlsl_ir_node *node) { VKD3D_ASSERT(node->type == HLSL_IR_COMPILE); @@ -1582,6 +1599,8 @@ void hlsl_block_add_store_parent(struct hlsl_ctx *ctx, struct hlsl_block *block, unsigned int writemask, const struct vkd3d_shader_location *loc); struct hlsl_ir_node *hlsl_block_add_swizzle(struct hlsl_ctx *ctx, struct hlsl_block *block, uint32_t s, unsigned int width, struct hlsl_ir_node *val, const struct vkd3d_shader_location *loc); +struct hlsl_ir_node *hlsl_block_add_sync(struct hlsl_ctx *ctx, struct hlsl_block *block, + uint32_t sync_flags, const struct vkd3d_shader_location *loc); struct hlsl_ir_node *hlsl_block_add_uint_constant(struct hlsl_ctx *ctx, struct hlsl_block *block, unsigned int n, const struct vkd3d_shader_location *loc); struct hlsl_ir_node *hlsl_block_add_unary_expr(struct hlsl_ctx *ctx, struct hlsl_block *block, diff --git a/libs/vkd3d/libs/vkd3d-shader/hlsl.y b/libs/vkd3d/libs/vkd3d-shader/hlsl.y index 702fd30bda3..05657d27b38 100644 --- a/libs/vkd3d/libs/vkd3d-shader/hlsl.y +++ b/libs/vkd3d/libs/vkd3d-shader/hlsl.y @@ -574,13 +574,14 @@ static struct hlsl_default_value evaluate_static_expression(struct hlsl_ctx *ctx /* fall-through */ case HLSL_IR_CALL: case HLSL_IR_IF: + case HLSL_IR_INTERLOCKED: case HLSL_IR_LOOP: case HLSL_IR_JUMP: case HLSL_IR_RESOURCE_LOAD: case HLSL_IR_RESOURCE_STORE: case HLSL_IR_SWITCH: - case HLSL_IR_INTERLOCKED: case HLSL_IR_STATEBLOCK_CONSTANT: + case HLSL_IR_SYNC: hlsl_error(ctx, &node->loc, VKD3D_SHADER_ERROR_HLSL_INVALID_SYNTAX, "Expected literal expression."); break; @@ -5110,6 +5111,67 @@ static bool intrinsic_InterlockedXor(struct hlsl_ctx *ctx, return intrinsic_interlocked(ctx, HLSL_INTERLOCKED_XOR, params, loc, "InterlockedXor"); } +static void validate_group_barrier_profile(struct hlsl_ctx *ctx, const struct vkd3d_shader_location *loc) +{ + if (ctx->profile->type != VKD3D_SHADER_TYPE_COMPUTE || hlsl_version_lt(ctx, 5, 0)) + { + hlsl_error(ctx, loc, VKD3D_SHADER_ERROR_HLSL_INCOMPATIBLE_PROFILE, + "Group barriers can only be used in compute shaders 5.0 or higher."); + } +} + +static bool intrinsic_AllMemoryBarrier(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + validate_group_barrier_profile(ctx, loc); + return !!hlsl_block_add_sync(ctx, params->instrs, VKD3DSSF_GLOBAL_UAV + | VKD3DSSF_GROUP_SHARED_MEMORY, loc); +} + +static bool intrinsic_AllMemoryBarrierWithGroupSync(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + validate_group_barrier_profile(ctx, loc); + return !!hlsl_block_add_sync(ctx, params->instrs, VKD3DSSF_GLOBAL_UAV + | VKD3DSSF_GROUP_SHARED_MEMORY | VKD3DSSF_THREAD_GROUP, loc); +} + +static bool intrinsic_DeviceMemoryBarrier(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + if ((ctx->profile->type != VKD3D_SHADER_TYPE_COMPUTE && ctx->profile->type != VKD3D_SHADER_TYPE_PIXEL) + || hlsl_version_lt(ctx, 5, 0)) + { + hlsl_error(ctx, loc, VKD3D_SHADER_ERROR_HLSL_INCOMPATIBLE_PROFILE, + "DeviceMemoryBarrier() can only be used 
in pixel and compute shaders 5.0 or higher."); + } + return !!hlsl_block_add_sync(ctx, params->instrs, VKD3DSSF_GLOBAL_UAV, loc); +} + +static bool intrinsic_DeviceMemoryBarrierWithGroupSync(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + validate_group_barrier_profile(ctx, loc); + return !!hlsl_block_add_sync(ctx, params->instrs, VKD3DSSF_GLOBAL_UAV + | VKD3DSSF_THREAD_GROUP, loc); +} + +static bool intrinsic_GroupMemoryBarrier(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + validate_group_barrier_profile(ctx, loc); + return !!hlsl_block_add_sync(ctx, params->instrs, + VKD3DSSF_GROUP_SHARED_MEMORY, loc); +} + +static bool intrinsic_GroupMemoryBarrierWithGroupSync(struct hlsl_ctx *ctx, + const struct parse_initializer *params, const struct vkd3d_shader_location *loc) +{ + validate_group_barrier_profile(ctx, loc); + return !!hlsl_block_add_sync(ctx, params->instrs, + VKD3DSSF_GROUP_SHARED_MEMORY | VKD3DSSF_THREAD_GROUP, loc); +} + static const struct intrinsic_function { const char *name; @@ -5121,8 +5183,14 @@ static const struct intrinsic_function intrinsic_functions[] = { /* Note: these entries should be kept in alphabetical order. */ + {"AllMemoryBarrier", 0, true, intrinsic_AllMemoryBarrier}, + {"AllMemoryBarrierWithGroupSync", 0, true, intrinsic_AllMemoryBarrierWithGroupSync}, {"D3DCOLORtoUBYTE4", 1, true, intrinsic_d3dcolor_to_ubyte4}, + {"DeviceMemoryBarrier", 0, true, intrinsic_DeviceMemoryBarrier}, + {"DeviceMemoryBarrierWithGroupSync", 0, true, intrinsic_DeviceMemoryBarrierWithGroupSync}, {"GetRenderTargetSampleCount", 0, true, intrinsic_GetRenderTargetSampleCount}, + {"GroupMemoryBarrier", 0, true, intrinsic_GroupMemoryBarrier}, + {"GroupMemoryBarrierWithGroupSync", 0, true, intrinsic_GroupMemoryBarrierWithGroupSync}, {"InterlockedAdd", -1, true, intrinsic_InterlockedAdd}, {"InterlockedAnd", -1, true, intrinsic_InterlockedAnd}, {"InterlockedCompareExchange", 4, true, intrinsic_InterlockedCompareExchange}, diff --git a/libs/vkd3d/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d/libs/vkd3d-shader/hlsl_codegen.c index bc14885af2b..9c3affda534 100644 --- a/libs/vkd3d/libs/vkd3d-shader/hlsl_codegen.c +++ b/libs/vkd3d/libs/vkd3d-shader/hlsl_codegen.c @@ -288,7 +288,7 @@ static bool types_are_semantic_equivalent(struct hlsl_ctx *ctx, const struct hls static struct hlsl_ir_var *add_semantic_var(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *func, struct hlsl_ir_var *var, struct hlsl_type *type, uint32_t modifiers, struct hlsl_semantic *semantic, - uint32_t index, bool output, bool force_align, const struct vkd3d_shader_location *loc) + uint32_t index, bool output, bool force_align, bool create, const struct vkd3d_shader_location *loc) { struct hlsl_semantic new_semantic; struct hlsl_ir_var *ext_var; @@ -311,6 +311,11 @@ static struct hlsl_ir_var *add_semantic_var(struct hlsl_ctx *ctx, struct hlsl_ir || ext_var->data_type->class <= HLSL_CLASS_VECTOR); VKD3D_ASSERT(hlsl_type_is_primitive_array(type) || type->class <= HLSL_CLASS_VECTOR); + vkd3d_free(new_name); + + if (!create) + return ext_var; + if (output) { if (index >= semantic->reported_duplicated_output_next_index) @@ -336,11 +341,12 @@ static struct hlsl_ir_var *add_semantic_var(struct hlsl_ctx *ctx, struct hlsl_ir } } - vkd3d_free(new_name); return ext_var; } } + VKD3D_ASSERT(create); + if (!(hlsl_clone_semantic(ctx, &new_semantic, semantic))) { vkd3d_free(new_name); @@ -429,7 +435,7 @@ static void 
prepend_input_copy(struct hlsl_ctx *ctx, struct hlsl_ir_function_dec prim_type_src->modifiers = var->data_type->modifiers & HLSL_PRIMITIVE_MODIFIERS_MASK; if (!(input = add_semantic_var(ctx, func, var, prim_type_src, - modifiers, semantic, semantic_index + i, false, force_align, loc))) + modifiers, semantic, semantic_index + i, false, force_align, true, loc))) return; hlsl_init_simple_deref_from_var(&prim_deref, input); @@ -442,7 +448,7 @@ static void prepend_input_copy(struct hlsl_ctx *ctx, struct hlsl_ir_function_dec else { if (!(input = add_semantic_var(ctx, func, var, vector_type_src, - modifiers, semantic, semantic_index + i, false, force_align, loc))) + modifiers, semantic, semantic_index + i, false, force_align, true, loc))) return; if (!(load = hlsl_new_var_load(ctx, input, &var->loc))) @@ -550,9 +556,9 @@ static void prepend_input_var_copy(struct hlsl_ctx *ctx, struct hlsl_ir_function list_move_head(&func->body.instrs, &block.instrs); } -static void append_output_copy(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *func, - struct hlsl_ir_load *rhs, uint32_t modifiers, - struct hlsl_semantic *semantic, uint32_t semantic_index, bool force_align) +static void append_output_copy(struct hlsl_ctx *ctx, struct hlsl_block *block, + struct hlsl_ir_function_decl *func, struct hlsl_ir_load *rhs, uint32_t modifiers, + struct hlsl_semantic *semantic, uint32_t semantic_index, bool force_align, bool create) { struct hlsl_type *type = rhs->node.data_type, *vector_type; struct vkd3d_shader_location *loc = &rhs->node.loc; @@ -582,49 +588,48 @@ static void append_output_copy(struct hlsl_ctx *ctx, struct hlsl_ir_function_dec struct hlsl_ir_node *load; if (!(output = add_semantic_var(ctx, func, var, vector_type, - modifiers, semantic, semantic_index + i, true, force_align, loc))) + modifiers, semantic, semantic_index + i, true, force_align, create, loc))) return; if (type->class == HLSL_CLASS_MATRIX) { - c = hlsl_block_add_uint_constant(ctx, &func->body, i, &var->loc); - load = hlsl_block_add_load_index(ctx, &func->body, &rhs->src, c, &var->loc); + c = hlsl_block_add_uint_constant(ctx, block, i, &var->loc); + load = hlsl_block_add_load_index(ctx, block, &rhs->src, c, &var->loc); } else { VKD3D_ASSERT(i == 0); - load = hlsl_block_add_load_index(ctx, &func->body, &rhs->src, NULL, &var->loc); + load = hlsl_block_add_load_index(ctx, block, &rhs->src, NULL, &var->loc); } - hlsl_block_add_simple_store(ctx, &func->body, output, load); + hlsl_block_add_simple_store(ctx, block, output, load); } } -static void append_output_copy_recurse(struct hlsl_ctx *ctx, - struct hlsl_ir_function_decl *func, struct hlsl_ir_load *rhs, uint32_t modifiers, - struct hlsl_semantic *semantic, uint32_t semantic_index, bool force_align) +static void append_output_copy_recurse(struct hlsl_ctx *ctx, struct hlsl_block *block, + struct hlsl_ir_function_decl *func, const struct hlsl_type *type, struct hlsl_ir_load *rhs, uint32_t modifiers, + struct hlsl_semantic *semantic, uint32_t semantic_index, bool force_align, bool create) { struct vkd3d_shader_location *loc = &rhs->node.loc; - struct hlsl_type *type = rhs->node.data_type; struct hlsl_ir_var *var = rhs->src.var; struct hlsl_ir_node *c; unsigned int i; if (type->class == HLSL_CLASS_ARRAY || type->class == HLSL_CLASS_STRUCT) { - struct hlsl_ir_load *element_load; - struct hlsl_struct_field *field; - uint32_t elem_semantic_index; - for (i = 0; i < hlsl_type_element_count(type); ++i) { - uint32_t element_modifiers; + uint32_t element_modifiers, elem_semantic_index; + const struct 
hlsl_type *element_type; + struct hlsl_ir_load *element_load; + struct hlsl_struct_field *field; if (type->class == HLSL_CLASS_ARRAY) { elem_semantic_index = semantic_index + i * hlsl_type_get_array_element_reg_size(type->e.array.type, HLSL_REGSET_NUMERIC) / 4; + element_type = type->e.array.type; element_modifiers = modifiers; force_align = true; } @@ -637,23 +642,24 @@ static void append_output_copy_recurse(struct hlsl_ctx *ctx, semantic = &field->semantic; elem_semantic_index = semantic->index; loc = &field->loc; + element_type = field->type; element_modifiers = combine_field_storage_modifiers(modifiers, field->storage_modifiers); force_align = (i == 0); } - c = hlsl_block_add_uint_constant(ctx, &func->body, i, &var->loc); + c = hlsl_block_add_uint_constant(ctx, block, i, &var->loc); if (!(element_load = hlsl_new_load_index(ctx, &rhs->src, c, loc))) return; - hlsl_block_add_instr(&func->body, &element_load->node); + hlsl_block_add_instr(block, &element_load->node); - append_output_copy_recurse(ctx, func, element_load, element_modifiers, - semantic, elem_semantic_index, force_align); + append_output_copy_recurse(ctx, block, func, element_type, element_load, element_modifiers, semantic, + elem_semantic_index, force_align, create); } } else { - append_output_copy(ctx, func, rhs, modifiers, semantic, semantic_index, force_align); + append_output_copy(ctx, block, func, rhs, modifiers, semantic, semantic_index, force_align, create); } } @@ -669,7 +675,8 @@ static void append_output_var_copy(struct hlsl_ctx *ctx, struct hlsl_ir_function return; hlsl_block_add_instr(&func->body, &load->node); - append_output_copy_recurse(ctx, func, load, var->storage_modifiers, &var->semantic, var->semantic.index, false); + append_output_copy_recurse(ctx, &func->body, func, var->data_type, load, var->storage_modifiers, + &var->semantic, var->semantic.index, false, true); } bool hlsl_transform_ir(struct hlsl_ctx *ctx, bool (*func)(struct hlsl_ctx *ctx, struct hlsl_ir_node *, void *), @@ -2453,6 +2460,7 @@ static bool copy_propagation_transform_block(struct hlsl_ctx *ctx, struct hlsl_b case HLSL_IR_INTERLOCKED: progress |= copy_propagation_transform_interlocked(ctx, hlsl_ir_interlocked(instr), state); + break; default: break; @@ -2894,6 +2902,16 @@ static void record_vectorizable_store(struct hlsl_ctx *ctx, struct hlsl_block *b ++state->count; } +static void mark_store_groups_dirty(struct hlsl_ctx *ctx, + struct vectorize_stores_state *state, struct hlsl_ir_var *var) +{ + for (unsigned int i = 0; i < state->count; ++i) + { + if (state->groups[i].stores[0]->lhs.var == var) + state->groups[i].dirty = true; + } +} + static void find_vectorizable_store_groups(struct hlsl_ctx *ctx, struct hlsl_block *block, struct vectorize_stores_state *state) { @@ -2907,20 +2925,21 @@ static void find_vectorizable_store_groups(struct hlsl_ctx *ctx, struct hlsl_blo } else if (instr->type == HLSL_IR_LOAD) { - struct hlsl_ir_var *var = hlsl_ir_load(instr)->src.var; - /* By vectorizing store A with store B, we are effectively moving * store A down to happen at the same time as store B. * If there was a load of the same variable between the two, this * would be incorrect. * Therefore invalidate all stores to this variable. As above, we * could be more granular if necessary. 
*/ - - for (unsigned int i = 0; i < state->count; ++i) - { - if (state->groups[i].stores[0]->lhs.var == var) - state->groups[i].dirty = true; - } + mark_store_groups_dirty(ctx, state, hlsl_ir_load(instr)->src.var); + } + else if (instr->type == HLSL_IR_INTERLOCKED) + { + /* An interlocked operation can be used on shared memory variables, + * and it is at the same time both a store and a load, thus, we + * should also mark all stores to this variable as dirty once we + * find one.*/ + mark_store_groups_dirty(ctx, state, hlsl_ir_interlocked(instr)->dst.var); } else if (instr->type == HLSL_IR_IF) { @@ -3338,6 +3357,59 @@ static bool split_struct_copies(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr return true; } +struct stream_append_ctx +{ + struct hlsl_ir_function_decl *func; + bool created; +}; + +static bool lower_stream_appends(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context) +{ + struct stream_append_ctx *append_ctx = context; + struct hlsl_ir_resource_store *store; + const struct hlsl_ir_node *rhs; + const struct hlsl_type *type; + struct hlsl_ir_var *var; + struct hlsl_block block; + + if (instr->type != HLSL_IR_RESOURCE_STORE) + return false; + + store = hlsl_ir_resource_store(instr); + if (store->store_type != HLSL_RESOURCE_STREAM_APPEND) + return false; + + rhs = store->value.node; + var = store->resource.var; + type = hlsl_get_stream_output_type(var->data_type); + + if (rhs->type != HLSL_IR_LOAD) + { + hlsl_fixme(ctx, &instr->loc, "Stream append rhs is not HLSL_IR_LOAD. Broadcast may be missing."); + return false; + } + + VKD3D_ASSERT(var->regs[HLSL_REGSET_STREAM_OUTPUTS].allocated); + + if (var->regs[HLSL_REGSET_STREAM_OUTPUTS].index) + { + hlsl_fixme(ctx, &instr->loc, "Append to an output stream with a nonzero stream index."); + return false; + } + + hlsl_block_init(&block); + + append_output_copy_recurse(ctx, &block, append_ctx->func, type->e.so.type, hlsl_ir_load(rhs), var->storage_modifiers, + &var->semantic, var->semantic.index, false, !append_ctx->created); + append_ctx->created = true; + + list_move_before(&instr->entry, &block.instrs); + hlsl_src_remove(&store->value); + + return true; + +} + static bool split_matrix_copies(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context) { const struct hlsl_ir_node *rhs; @@ -5127,11 +5199,12 @@ static bool dce(struct hlsl_ctx *ctx, struct hlsl_ir_node *instr, void *context) case HLSL_IR_CALL: case HLSL_IR_IF: + case HLSL_IR_INTERLOCKED: case HLSL_IR_JUMP: case HLSL_IR_LOOP: case HLSL_IR_RESOURCE_STORE: case HLSL_IR_SWITCH: - case HLSL_IR_INTERLOCKED: + case HLSL_IR_SYNC: break; case HLSL_IR_STATEBLOCK_CONSTANT: /* Stateblock constants should not appear in the shader program. 
*/ @@ -5415,6 +5488,7 @@ static void compute_liveness_recurse(struct hlsl_block *block, unsigned int loop } case HLSL_IR_CONSTANT: case HLSL_IR_STRING_CONSTANT: + case HLSL_IR_SYNC: break; case HLSL_IR_COMPILE: case HLSL_IR_SAMPLER_STATE: @@ -6441,7 +6515,8 @@ static void allocate_semantic_register(struct hlsl_ctx *ctx, struct hlsl_ir_var } } -static void allocate_semantic_registers(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func) +static void allocate_semantic_registers(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func, + uint32_t *output_reg_count) { struct register_allocator in_prim_allocator = {0}, patch_constant_out_patch_allocator = {0}; struct register_allocator input_allocator = {0}, output_allocator = {0}; @@ -6478,6 +6553,8 @@ static void allocate_semantic_registers(struct hlsl_ctx *ctx, struct hlsl_ir_fun allocate_semantic_register(ctx, var, &output_allocator, true, !is_pixel_shader); } + *output_reg_count = output_allocator.reg_count; + vkd3d_free(in_prim_allocator.allocations); vkd3d_free(patch_constant_out_patch_allocator.allocations); vkd3d_free(input_allocator.allocations); @@ -7641,6 +7718,42 @@ static void validate_and_record_stream_outputs(struct hlsl_ctx *ctx) /* TODO: check that maxvertexcount * outputdatasize <= 1024. */ } +static void validate_max_output_size(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func, + uint32_t output_reg_count) +{ + unsigned int max_output_size, comp_count = 0; + unsigned int *reg_comp_count; + struct hlsl_ir_var *var; + uint32_t id; + + if (ctx->result) + return; + + if (!(reg_comp_count = hlsl_calloc(ctx, output_reg_count, sizeof(*reg_comp_count)))) + return; + + LIST_FOR_EACH_ENTRY(var, &entry_func->extern_vars, struct hlsl_ir_var, extern_entry) + { + if (!var->is_output_semantic) + continue; + + VKD3D_ASSERT(var->regs[HLSL_REGSET_NUMERIC].allocated); + id = var->regs[HLSL_REGSET_NUMERIC].id; + reg_comp_count[id] = max(reg_comp_count[id], vkd3d_log2i(var->regs[HLSL_REGSET_NUMERIC].writemask) + 1); + } + + for (id = 0; id < output_reg_count; ++id) + comp_count += reg_comp_count[id]; + + max_output_size = ctx->max_vertex_count * comp_count; + if (max_output_size > 1024) + hlsl_error(ctx, &entry_func->loc, VKD3D_SHADER_ERROR_HLSL_INVALID_MAX_VERTEX_COUNT, + "Max vertex count (%u) * output data component count (%u) = %u, which is greater than 1024.", + ctx->max_vertex_count, comp_count, max_output_size); + + vkd3d_free(reg_comp_count); +} + static void remove_unreachable_code(struct hlsl_ctx *ctx, struct hlsl_block *body) { struct hlsl_ir_node *instr, *next; @@ -10718,8 +10831,20 @@ static bool sm4_generate_vsir_instr_resource_store(struct hlsl_ctx *ctx, if (store->store_type != HLSL_RESOURCE_STORE) { - hlsl_fixme(ctx, &instr->loc, "Stream output operations."); - return false; + enum vkd3d_shader_opcode opcode = store->store_type == HLSL_RESOURCE_STREAM_APPEND + ? 
VKD3DSIH_EMIT : VKD3DSIH_CUT; + + VKD3D_ASSERT(!store->value.node && !store->coords.node); + VKD3D_ASSERT(store->resource.var->regs[HLSL_REGSET_STREAM_OUTPUTS].allocated); + + if (store->resource.var->regs[HLSL_REGSET_STREAM_OUTPUTS].index) + { + hlsl_fixme(ctx, &instr->loc, "Stream output operation with a nonzero stream index."); + return false; + } + + ins = generate_vsir_add_program_instruction(ctx, program, &store->node.loc, opcode, 0, 0); + return !!ins; } if (!store->resource.var->is_uniform) @@ -11264,6 +11389,19 @@ static bool sm4_generate_vsir_instr_jump(struct hlsl_ctx *ctx, } } +static bool sm4_generate_vsir_instr_sync(struct hlsl_ctx *ctx, + struct vsir_program *program, const struct hlsl_ir_sync *sync) +{ + const struct hlsl_ir_node *instr = &sync->node; + struct vkd3d_shader_instruction *ins; + + if (!(ins = generate_vsir_add_program_instruction(ctx, program, &instr->loc, VKD3DSIH_SYNC, 0, 0))) + return false; + ins->flags = sync->sync_flags; + + return true; +} + static void sm4_generate_vsir_block(struct hlsl_ctx *ctx, struct hlsl_block *block, struct vsir_program *program); static void sm4_generate_vsir_instr_if(struct hlsl_ctx *ctx, struct vsir_program *program, struct hlsl_ir_if *iff) @@ -11414,6 +11552,10 @@ static void sm4_generate_vsir_block(struct hlsl_ctx *ctx, struct hlsl_block *blo sm4_generate_vsir_instr_interlocked(ctx, program, hlsl_ir_interlocked(instr)); break; + case HLSL_IR_SYNC: + sm4_generate_vsir_instr_sync(ctx, program, hlsl_ir_sync(instr)); + break; + default: break; } @@ -13236,6 +13378,8 @@ static void process_entry_function(struct hlsl_ctx *ctx, struct hlsl_block static_initializers, global_uniforms; struct hlsl_block *const body = &entry_func->body; struct recursive_call_ctx recursive_call_ctx; + struct stream_append_ctx stream_append_ctx; + uint32_t output_reg_count; struct hlsl_ir_var *var; unsigned int i; bool progress; @@ -13461,6 +13605,10 @@ static void process_entry_function(struct hlsl_ctx *ctx, { allocate_stream_outputs(ctx); validate_and_record_stream_outputs(ctx); + + memset(&stream_append_ctx, 0, sizeof(stream_append_ctx)); + stream_append_ctx.func = entry_func; + hlsl_transform_ir(ctx, lower_stream_appends, body, &stream_append_ctx); } if (profile->major_version < 4) @@ -13519,7 +13667,10 @@ static void process_entry_function(struct hlsl_ctx *ctx, allocate_register_reservations(ctx, &ctx->extern_vars); allocate_register_reservations(ctx, &entry_func->extern_vars); - allocate_semantic_registers(ctx, entry_func); + allocate_semantic_registers(ctx, entry_func, &output_reg_count); + + if (profile->type == VKD3D_SHADER_TYPE_GEOMETRY) + validate_max_output_size(ctx, entry_func, output_reg_count); } int hlsl_emit_bytecode(struct hlsl_ctx *ctx, struct hlsl_ir_function_decl *entry_func, diff --git a/libs/vkd3d/libs/vkd3d-shader/tpf.c b/libs/vkd3d/libs/vkd3d-shader/tpf.c index c29bedfaaa9..29b03871e05 100644 --- a/libs/vkd3d/libs/vkd3d-shader/tpf.c +++ b/libs/vkd3d/libs/vkd3d-shader/tpf.c @@ -4165,6 +4165,7 @@ static void tpf_handle_instruction(struct tpf_compiler *tpf, const struct vkd3d_ case VKD3DSIH_BREAK: case VKD3DSIH_CASE: case VKD3DSIH_CONTINUE: + case VKD3DSIH_CUT: case VKD3DSIH_DEFAULT: case VKD3DSIH_DISCARD: case VKD3DSIH_DIV: @@ -4178,6 +4179,7 @@ static void tpf_handle_instruction(struct tpf_compiler *tpf, const struct vkd3d_ case VKD3DSIH_DSY_COARSE: case VKD3DSIH_DSY_FINE: case VKD3DSIH_ELSE: + case VKD3DSIH_EMIT: case VKD3DSIH_ENDIF: case VKD3DSIH_ENDLOOP: case VKD3DSIH_ENDSWITCH: @@ -4213,6 +4215,7 @@ static void 
tpf_handle_instruction(struct tpf_compiler *tpf, const struct vkd3d_ case VKD3DSIH_IMM_ATOMIC_UMIN: case VKD3DSIH_IMM_ATOMIC_OR: case VKD3DSIH_IMM_ATOMIC_XOR: + case VKD3DSIH_SYNC: case VKD3DSIH_IMUL: case VKD3DSIH_INE: case VKD3DSIH_INEG: -- 2.47.2
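
Note on the dxil.c hunks above: most of them implement a single refactoring. struct sm6_value previously kept its vkd3d_shader_register inside the union "u", so a value could not carry SSA bookkeeping and a materialized register at the same time; the patch hoists "reg" out of the union, adds VALUE_TYPE_SSA with an sm6_ssa_data holding only the id, and derives registers on demand through sm6_register_from_value(). What follows is a minimal, compilable editorial sketch of that pattern under invented, simplified types (the *_demo names are not vkd3d identifiers, and the real sm6_register_from_value() also derives the register's data type and dimension):

/* Editorial sketch only, not vkd3d code: simplified stand-ins for
 * struct sm6_value and struct vkd3d_shader_register, showing the
 * tagged-union-with-derived-register pattern used by the patch. */
#include <stdio.h>

enum value_type_demo { VALUE_TYPE_REG, VALUE_TYPE_SSA };

struct reg_demo
{
    enum { REG_IMMCONST, REG_SSA } kind;
    unsigned int id;
};

struct value_demo
{
    enum value_type_demo value_type;
    union
    {
        unsigned int ssa_id; /* Payload for VALUE_TYPE_SSA. */
    } u;
    /* As in the patch, the materialized register lives outside the
     * union, so it can be cached for any value_type without
     * clobbering the union payload. */
    struct reg_demo reg;
};

/* Mirrors the shape of sm6_register_from_value(): derive a register
 * from the value's type tag instead of reading a union member. */
static void register_from_value_demo(struct reg_demo *reg, const struct value_demo *v)
{
    switch (v->value_type)
    {
        case VALUE_TYPE_REG:
            *reg = v->reg;
            break;

        case VALUE_TYPE_SSA:
            reg->kind = REG_SSA;
            reg->id = v->u.ssa_id;
            break;
    }
}

int main(void)
{
    struct value_demo v = {.value_type = VALUE_TYPE_SSA, .u.ssa_id = 7};
    struct reg_demo r;

    register_from_value_demo(&r, &v);
    printf("ssa register id: %u\n", r.id);
    return 0;
}

Keeping the cached register outside the union is what lets hunks such as the atomicrmw and cmpxchg emitters assign dst->reg = dst_params[0].reg unconditionally: the write no longer aliases union payloads like sm6_handle_data, which a VALUE_TYPE_HANDLE value still stores in u.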