/*
 * Copyright 2023 Conor McCarthy for CodeWeavers
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
 */

#include "vkd3d_shader_private.h"

bool vsir_program_init(struct vsir_program *program, const struct vkd3d_shader_version *version, unsigned int reserve)
{
    program->shader_version = *version;
    return shader_instruction_array_init(&program->instructions, reserve);
}

void vsir_program_cleanup(struct vsir_program *program)
{
    size_t i;

    for (i = 0; i < program->block_name_count; ++i)
        vkd3d_free((void *)program->block_names[i]);
    vkd3d_free(program->block_names);
    shader_instruction_array_destroy(&program->instructions);
    shader_signature_cleanup(&program->input_signature);
    shader_signature_cleanup(&program->output_signature);
    shader_signature_cleanup(&program->patch_constant_signature);
}

static inline bool shader_register_is_phase_instance_id(const struct vkd3d_shader_register *reg)
{
    return reg->type == VKD3DSPR_FORKINSTID || reg->type == VKD3DSPR_JOININSTID;
}

static bool vsir_instruction_is_dcl(const struct vkd3d_shader_instruction *instruction)
{
    enum vkd3d_shader_opcode handler_idx = instruction->handler_idx;
    return (VKD3DSIH_DCL <= handler_idx && handler_idx <= VKD3DSIH_DCL_VERTICES_OUT)
            || handler_idx == VKD3DSIH_HS_DECLS;
}

static void vkd3d_shader_instruction_make_nop(struct vkd3d_shader_instruction *ins)
{
    struct vkd3d_shader_location location = ins->location;

    vsir_instruction_init(ins, &location, VKD3DSIH_NOP);
}

static void remove_dcl_temps(struct vsir_program *program)
{
    unsigned int i;

    for (i = 0; i < program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &program->instructions.elements[i];

        if (ins->handler_idx == VKD3DSIH_DCL_TEMPS)
            vkd3d_shader_instruction_make_nop(ins);
    }
}

static bool vsir_instruction_init_with_params(struct vsir_program *program,
        struct vkd3d_shader_instruction *ins, const struct vkd3d_shader_location *location,
        enum vkd3d_shader_opcode handler_idx, unsigned int dst_count, unsigned int src_count)
{
    vsir_instruction_init(ins, location, handler_idx);
    ins->dst_count = dst_count;
    ins->src_count = src_count;

    if (!(ins->dst = vsir_program_get_dst_params(program, ins->dst_count)))
    {
        ERR("Failed to allocate %u destination parameters.\n", dst_count);
        return false;
    }

    if (!(ins->src = vsir_program_get_src_params(program, ins->src_count)))
    {
        ERR("Failed to allocate %u source parameters.\n", src_count);
        return false;
    }

    memset(ins->dst, 0, sizeof(*ins->dst) * ins->dst_count);
    memset(ins->src, 0, sizeof(*ins->src) * ins->src_count);
    return true;
}

static enum vkd3d_result vsir_program_lower_texkills(struct vsir_program *program)
{
    struct vkd3d_shader_instruction_array *instructions = &program->instructions;
    struct vkd3d_shader_instruction *texkill_ins, *ins;
    unsigned int components_read = 3 + (program->shader_version.major >= 2);
    unsigned int tmp_idx = ~0u;
    unsigned int i, k;

    for (i = 0; i < instructions->count; ++i)
    {
        texkill_ins = &instructions->elements[i];

        if (texkill_ins->handler_idx != VKD3DSIH_TEXKILL)
            continue;

        if (!shader_instruction_array_insert_at(instructions, i + 1, components_read + 1))
            return VKD3D_ERROR_OUT_OF_MEMORY;

        if (tmp_idx == ~0u)
            tmp_idx = program->temp_count++;

        /* tmp = ins->dst[0] < 0  */

        ins = &instructions->elements[i + 1];
        if (!vsir_instruction_init_with_params(program, ins, &texkill_ins->location, VKD3DSIH_LTO, 1, 2))
            return VKD3D_ERROR_OUT_OF_MEMORY;

        vsir_register_init(&ins->dst[0].reg, VKD3DSPR_TEMP, VKD3D_DATA_UINT, 1);
        ins->dst[0].reg.dimension = VSIR_DIMENSION_VEC4;
        ins->dst[0].reg.idx[0].offset = tmp_idx;
        ins->dst[0].write_mask = VKD3DSP_WRITEMASK_ALL;

        ins->src[0].reg = texkill_ins->dst[0].reg;
        vsir_register_init(&ins->src[1].reg, VKD3DSPR_IMMCONST, VKD3D_DATA_FLOAT, 0);
        ins->src[1].reg.dimension = VSIR_DIMENSION_VEC4;
        ins->src[1].reg.u.immconst_f32[0] = 0.0f;
        ins->src[1].reg.u.immconst_f32[1] = 0.0f;
        ins->src[1].reg.u.immconst_f32[2] = 0.0f;
        ins->src[1].reg.u.immconst_f32[3] = 0.0f;

        /* tmp.x = tmp.x || tmp.y */
        /* tmp.x = tmp.x || tmp.z */
        /* tmp.x = tmp.x || tmp.w, if sm >= 2.0 */

        for (k = 1; k < components_read; ++k)
        {
            ins = &instructions->elements[i + 1 + k];
            if (!(vsir_instruction_init_with_params(program, ins, &texkill_ins->location, VKD3DSIH_OR, 1, 2)))
                return VKD3D_ERROR_OUT_OF_MEMORY;

            vsir_register_init(&ins->dst[0].reg, VKD3DSPR_TEMP, VKD3D_DATA_UINT, 1);
            ins->dst[0].reg.dimension = VSIR_DIMENSION_VEC4;
            ins->dst[0].reg.idx[0].offset = tmp_idx;
            ins->dst[0].write_mask = VKD3DSP_WRITEMASK_0;

            vsir_register_init(&ins->src[0].reg, VKD3DSPR_TEMP, VKD3D_DATA_UINT, 1);
            ins->src[0].reg.dimension = VSIR_DIMENSION_VEC4;
            ins->src[0].reg.idx[0].offset = tmp_idx;
            ins->src[0].swizzle = VKD3D_SHADER_SWIZZLE(X, X, X, X);
            vsir_register_init(&ins->src[1].reg, VKD3DSPR_TEMP, VKD3D_DATA_UINT, 1);
            ins->src[1].reg.dimension = VSIR_DIMENSION_VEC4;
            ins->src[1].reg.idx[0].offset = tmp_idx;
            ins->src[1].swizzle = vkd3d_shader_create_swizzle(k, k, k, k);
        }

        /* discard_nz tmp.x */

        ins = &instructions->elements[i + 1 + components_read];
        if (!(vsir_instruction_init_with_params(program, ins, &texkill_ins->location, VKD3DSIH_DISCARD, 0, 1)))
            return VKD3D_ERROR_OUT_OF_MEMORY;
        ins->flags = VKD3D_SHADER_CONDITIONAL_OP_NZ;

        vsir_register_init(&ins->src[0].reg, VKD3DSPR_TEMP, VKD3D_DATA_UINT, 1);
        ins->src[0].reg.dimension = VSIR_DIMENSION_VEC4;
        ins->src[0].reg.idx[0].offset = tmp_idx;
        ins->src[0].swizzle = VKD3D_SHADER_SWIZZLE(X, X, X, X);

        /* Make the original instruction no-op */
        vkd3d_shader_instruction_make_nop(texkill_ins);
    }

    return VKD3D_OK;
}

static void shader_register_eliminate_phase_addressing(struct vkd3d_shader_register *reg,
        unsigned int instance_id)
{
    unsigned int i;

    for (i = 0; i < reg->idx_count; ++i)
    {
        if (reg->idx[i].rel_addr && shader_register_is_phase_instance_id(&reg->idx[i].rel_addr->reg))
        {
            reg->idx[i].rel_addr = NULL;
            reg->idx[i].offset += instance_id;
        }
    }
}

static void shader_instruction_eliminate_phase_instance_id(struct vkd3d_shader_instruction *ins,
        unsigned int instance_id)
{
    struct vkd3d_shader_register *reg;
    unsigned int i;

    for (i = 0; i < ins->src_count; ++i)
    {
        reg = (struct vkd3d_shader_register *)&ins->src[i].reg;
        if (shader_register_is_phase_instance_id(reg))
        {
            vsir_register_init(reg, VKD3DSPR_IMMCONST, reg->data_type, 0);
            reg->u.immconst_u32[0] = instance_id;
            continue;
        }
        shader_register_eliminate_phase_addressing(reg, instance_id);
    }

    for (i = 0; i < ins->dst_count; ++i)
        shader_register_eliminate_phase_addressing((struct vkd3d_shader_register *)&ins->dst[i].reg, instance_id);
}

static const struct vkd3d_shader_varying_map *find_varying_map(
        const struct vkd3d_shader_varying_map_info *varying_map, unsigned int signature_idx)
{
    unsigned int i;

    for (i = 0; i < varying_map->varying_count; ++i)
    {
        if (varying_map->varying_map[i].output_signature_index == signature_idx)
            return &varying_map->varying_map[i];
    }

    return NULL;
}

static enum vkd3d_result vsir_program_remap_output_signature(struct vsir_program *program,
        const struct vkd3d_shader_compile_info *compile_info, struct vkd3d_shader_message_context *message_context)
{
    const struct vkd3d_shader_location location = {.source_name = compile_info->source_name};
    struct shader_signature *signature = &program->output_signature;
    const struct vkd3d_shader_varying_map_info *varying_map;
    unsigned int i;

    if (!(varying_map = vkd3d_find_struct(compile_info->next, VARYING_MAP_INFO)))
        return VKD3D_OK;

    for (i = 0; i < signature->element_count; ++i)
    {
        const struct vkd3d_shader_varying_map *map = find_varying_map(varying_map, i);
        struct signature_element *e = &signature->elements[i];

        if (map)
        {
            unsigned int input_mask = map->input_mask;

            e->target_location = map->input_register_index;

            /* It is illegal in Vulkan if the next shader uses the same varying
             * location with a different mask. */
            if (input_mask && input_mask != e->mask)
            {
                vkd3d_shader_error(message_context, &location, VKD3D_SHADER_ERROR_VSIR_NOT_IMPLEMENTED,
                        "Aborting due to not yet implemented feature: "
                        "Output mask %#x does not match input mask %#x.",
                        e->mask, input_mask);
                return VKD3D_ERROR_NOT_IMPLEMENTED;
            }
        }
        else
        {
            e->target_location = SIGNATURE_TARGET_LOCATION_UNUSED;
        }
    }

    for (i = 0; i < varying_map->varying_count; ++i)
    {
        if (varying_map->varying_map[i].output_signature_index >= signature->element_count)
        {
            vkd3d_shader_error(message_context, &location, VKD3D_SHADER_ERROR_VSIR_NOT_IMPLEMENTED,
                    "Aborting due to not yet implemented feature: "
                    "The next stage consumes varyings not written by this stage.");
            return VKD3D_ERROR_NOT_IMPLEMENTED;
        }
    }

    return VKD3D_OK;
}

struct hull_flattener
{
    struct vkd3d_shader_instruction_array instructions;

    unsigned int instance_count;
    unsigned int phase_body_idx;
    enum vkd3d_shader_opcode phase;
    struct vkd3d_shader_location last_ret_location;
};

static bool flattener_is_in_fork_or_join_phase(const struct hull_flattener *flattener)
{
    return flattener->phase == VKD3DSIH_HS_FORK_PHASE || flattener->phase == VKD3DSIH_HS_JOIN_PHASE;
}

struct shader_phase_location
{
    unsigned int index;
    unsigned int instance_count;
    unsigned int instruction_count;
};

struct shader_phase_location_array
{
    /* Unlikely worst case: one phase for each component of each output register. */
    struct shader_phase_location locations[MAX_REG_OUTPUT * VKD3D_VEC4_SIZE];
    unsigned int count;
};

static void flattener_eliminate_phase_related_dcls(struct hull_flattener *normaliser,
        unsigned int index, struct shader_phase_location_array *locations)
{
    struct vkd3d_shader_instruction *ins = &normaliser->instructions.elements[index];
    struct shader_phase_location *loc;
    bool b;

    if (ins->handler_idx == VKD3DSIH_HS_FORK_PHASE || ins->handler_idx == VKD3DSIH_HS_JOIN_PHASE)
    {
        b = flattener_is_in_fork_or_join_phase(normaliser);
        /* Reset the phase info. */
        normaliser->phase_body_idx = ~0u;
        normaliser->phase = ins->handler_idx;
        normaliser->instance_count = 1;
        /* Leave the first occurrence and delete the rest. */
        if (b)
            vkd3d_shader_instruction_make_nop(ins);
        return;
    }
    else if (ins->handler_idx == VKD3DSIH_DCL_HS_FORK_PHASE_INSTANCE_COUNT
            || ins->handler_idx == VKD3DSIH_DCL_HS_JOIN_PHASE_INSTANCE_COUNT)
    {
        normaliser->instance_count = ins->declaration.count + !ins->declaration.count;
        vkd3d_shader_instruction_make_nop(ins);
        return;
    }
    else if (ins->handler_idx == VKD3DSIH_DCL_INPUT && shader_register_is_phase_instance_id(
            &ins->declaration.dst.reg))
    {
        vkd3d_shader_instruction_make_nop(ins);
        return;
    }

    if (normaliser->phase == VKD3DSIH_INVALID || vsir_instruction_is_dcl(ins))
        return;

    if (normaliser->phase_body_idx == ~0u)
        normaliser->phase_body_idx = index;

    if (ins->handler_idx == VKD3DSIH_RET)
    {
        normaliser->last_ret_location = ins->location;
        vkd3d_shader_instruction_make_nop(ins);
        if (locations->count >= ARRAY_SIZE(locations->locations))
        {
            FIXME("Insufficient space for phase location.\n");
            return;
        }
        loc = &locations->locations[locations->count++];
        loc->index = normaliser->phase_body_idx;
        loc->instance_count = normaliser->instance_count;
        loc->instruction_count = index - normaliser->phase_body_idx;
    }
}

static enum vkd3d_result flattener_flatten_phases(struct hull_flattener *normaliser,
        struct shader_phase_location_array *locations)
{
    struct shader_phase_location *loc;
    unsigned int i, j, k, end, count;

    for (i = 0, count = 0; i < locations->count; ++i)
        count += (locations->locations[i].instance_count - 1) * locations->locations[i].instruction_count;

    if (!shader_instruction_array_reserve(&normaliser->instructions, normaliser->instructions.count + count))
        return VKD3D_ERROR_OUT_OF_MEMORY;
    end = normaliser->instructions.count;
    normaliser->instructions.count += count;

    for (i = locations->count; i > 0; --i)
    {
        loc = &locations->locations[i - 1];
        j = loc->index + loc->instruction_count;
        memmove(&normaliser->instructions.elements[j + count], &normaliser->instructions.elements[j],
                (end - j) * sizeof(*normaliser->instructions.elements));
        end = j;
        count -= (loc->instance_count - 1) * loc->instruction_count;
        loc->index += count;
    }

    for (i = 0, count = 0; i < locations->count; ++i)
    {
        loc = &locations->locations[i];
        /* Make a copy of the non-dcl instructions for each instance. */
        for (j = 1; j < loc->instance_count; ++j)
        {
            for (k = 0; k < loc->instruction_count; ++k)
            {
                if (!shader_instruction_array_clone_instruction(&normaliser->instructions,
                        loc->index + loc->instruction_count * j + k, loc->index + k))
                    return VKD3D_ERROR_OUT_OF_MEMORY;
            }
        }
        /* Replace each reference to the instance id with a constant instance id. */
        for (j = 0; j < loc->instance_count; ++j)
        {
            for (k = 0; k < loc->instruction_count; ++k)
                shader_instruction_eliminate_phase_instance_id(
                        &normaliser->instructions.elements[loc->index + loc->instruction_count * j + k], j);
        }
    }

    return VKD3D_OK;
}

void vsir_register_init(struct vkd3d_shader_register *reg, enum vkd3d_shader_register_type reg_type,
        enum vkd3d_data_type data_type, unsigned int idx_count)
{
    reg->type = reg_type;
    reg->precision = VKD3D_SHADER_REGISTER_PRECISION_DEFAULT;
    reg->non_uniform = false;
    reg->data_type = data_type;
    reg->idx[0].offset = ~0u;
    reg->idx[0].rel_addr = NULL;
    reg->idx[0].is_in_bounds = false;
    reg->idx[1].offset = ~0u;
    reg->idx[1].rel_addr = NULL;
    reg->idx[1].is_in_bounds = false;
    reg->idx[2].offset = ~0u;
    reg->idx[2].rel_addr = NULL;
    reg->idx[2].is_in_bounds = false;
    reg->idx_count = idx_count;
    reg->dimension = VSIR_DIMENSION_SCALAR;
    reg->alignment = 0;
}

void vsir_src_param_init(struct vkd3d_shader_src_param *param, enum vkd3d_shader_register_type reg_type,
        enum vkd3d_data_type data_type, unsigned int idx_count)
{
    vsir_register_init(&param->reg, reg_type, data_type, idx_count);
    param->swizzle = 0;
    param->modifiers = VKD3DSPSM_NONE;
}

void vsir_dst_param_init(struct vkd3d_shader_dst_param *param, enum vkd3d_shader_register_type reg_type,
        enum vkd3d_data_type data_type, unsigned int idx_count)
{
    vsir_register_init(&param->reg, reg_type, data_type, idx_count);
    param->write_mask = VKD3DSP_WRITEMASK_0;
    param->modifiers = VKD3DSPDM_NONE;
    param->shift = 0;
}

void vsir_src_param_init_label(struct vkd3d_shader_src_param *param, unsigned int label_id)
{
    vsir_src_param_init(param, VKD3DSPR_LABEL, VKD3D_DATA_UNUSED, 1);
    param->reg.dimension = VSIR_DIMENSION_NONE;
    param->reg.idx[0].offset = label_id;
}

static void src_param_init_ssa_bool(struct vkd3d_shader_src_param *src, unsigned int idx)
{
    vsir_src_param_init(src, VKD3DSPR_SSA, VKD3D_DATA_BOOL, 1);
    src->reg.idx[0].offset = idx;
}

static void src_param_init_temp_bool(struct vkd3d_shader_src_param *src, unsigned int idx)
{
    vsir_src_param_init(src, VKD3DSPR_TEMP, VKD3D_DATA_BOOL, 1);
    src->reg.idx[0].offset = idx;
}

static void dst_param_init_ssa_bool(struct vkd3d_shader_dst_param *dst, unsigned int idx)
{
    vsir_dst_param_init(dst, VKD3DSPR_SSA, VKD3D_DATA_BOOL, 1);
    dst->reg.idx[0].offset = idx;
}

static void dst_param_init_temp_bool(struct vkd3d_shader_dst_param *dst, unsigned int idx)
{
    vsir_dst_param_init(dst, VKD3DSPR_TEMP, VKD3D_DATA_BOOL, 1);
    dst->reg.idx[0].offset = idx;
}

static void dst_param_init_temp_uint(struct vkd3d_shader_dst_param *dst, unsigned int idx)
{
    vsir_dst_param_init(dst, VKD3DSPR_TEMP, VKD3D_DATA_UINT, 1);
    dst->reg.idx[0].offset = idx;
    dst->write_mask = VKD3DSP_WRITEMASK_0;
}

static void src_param_init_temp_uint(struct vkd3d_shader_src_param *src, unsigned int idx)
{
    vsir_src_param_init(src, VKD3DSPR_TEMP, VKD3D_DATA_UINT, 1);
    src->reg.idx[0].offset = idx;
}

static void src_param_init_const_uint(struct vkd3d_shader_src_param *src, uint32_t value)
{
    vsir_src_param_init(src, VKD3DSPR_IMMCONST, VKD3D_DATA_UINT, 0);
    src->reg.u.immconst_u32[0] = value;
}

void vsir_instruction_init(struct vkd3d_shader_instruction *ins, const struct vkd3d_shader_location *location,
        enum vkd3d_shader_opcode handler_idx)
{
    memset(ins, 0, sizeof(*ins));
    ins->location = *location;
    ins->handler_idx = handler_idx;
}

static bool vsir_instruction_init_label(struct vkd3d_shader_instruction *ins,
        const struct vkd3d_shader_location *location, unsigned int label_id, struct vsir_program *program)
{
    struct vkd3d_shader_src_param *src_param;

    if (!(src_param = vsir_program_get_src_params(program, 1)))
        return false;

    vsir_src_param_init_label(src_param, label_id);

    vsir_instruction_init(ins, location, VKD3DSIH_LABEL);
    ins->src = src_param;
    ins->src_count = 1;

    return true;
}

static enum vkd3d_result instruction_array_flatten_hull_shader_phases(struct vkd3d_shader_instruction_array *src_instructions)
{
    struct hull_flattener flattener = {*src_instructions};
    struct vkd3d_shader_instruction_array *instructions;
    struct shader_phase_location_array locations;
    enum vkd3d_result result = VKD3D_OK;
    unsigned int i;

    instructions = &flattener.instructions;

    flattener.phase = VKD3DSIH_INVALID;
    for (i = 0, locations.count = 0; i < instructions->count; ++i)
        flattener_eliminate_phase_related_dcls(&flattener, i, &locations);

    if ((result = flattener_flatten_phases(&flattener, &locations)) < 0)
        return result;

    if (flattener.phase != VKD3DSIH_INVALID)
    {
        if (!shader_instruction_array_reserve(&flattener.instructions, flattener.instructions.count + 1))
            return VKD3D_ERROR_OUT_OF_MEMORY;
        vsir_instruction_init(&instructions->elements[instructions->count++], &flattener.last_ret_location, VKD3DSIH_RET);
    }

    *src_instructions = flattener.instructions;
    return result;
}

struct control_point_normaliser
{
    struct vkd3d_shader_instruction_array instructions;
    enum vkd3d_shader_opcode phase;
    struct vkd3d_shader_src_param *outpointid_param;
};

static bool control_point_normaliser_is_in_control_point_phase(const struct control_point_normaliser *normaliser)
{
    return normaliser->phase == VKD3DSIH_HS_CONTROL_POINT_PHASE;
}

static struct vkd3d_shader_src_param *instruction_array_create_outpointid_param(
        struct vkd3d_shader_instruction_array *instructions)
{
    struct vkd3d_shader_src_param *rel_addr;

    if (!(rel_addr = shader_src_param_allocator_get(&instructions->src_params, 1)))
        return NULL;

    vsir_register_init(&rel_addr->reg, VKD3DSPR_OUTPOINTID, VKD3D_DATA_UINT, 0);
    rel_addr->swizzle = 0;
    rel_addr->modifiers = 0;

    return rel_addr;
}

static void shader_dst_param_normalise_outpointid(struct vkd3d_shader_dst_param *dst_param,
        struct control_point_normaliser *normaliser)
{
    struct vkd3d_shader_register *reg = &dst_param->reg;

    if (control_point_normaliser_is_in_control_point_phase(normaliser) && reg->type == VKD3DSPR_OUTPUT)
    {
        /* The TPF reader validates idx_count. */
        assert(reg->idx_count == 1);
        reg->idx[1] = reg->idx[0];
        /* The control point id param is implicit here. Avoid later complications by inserting it. */
        reg->idx[0].offset = 0;
        reg->idx[0].rel_addr = normaliser->outpointid_param;
        ++reg->idx_count;
    }
}

static void shader_dst_param_io_init(struct vkd3d_shader_dst_param *param, const struct signature_element *e,
        enum vkd3d_shader_register_type reg_type, unsigned int idx_count)
{
    param->write_mask = e->mask;
    param->modifiers = 0;
    param->shift = 0;
    vsir_register_init(&param->reg, reg_type, vkd3d_data_type_from_component_type(e->component_type), idx_count);
}

static enum vkd3d_result control_point_normaliser_emit_hs_input(struct control_point_normaliser *normaliser,
        const struct shader_signature *s, unsigned int input_control_point_count, unsigned int dst,
        const struct vkd3d_shader_location *location)
{
    struct vkd3d_shader_instruction *ins;
    struct vkd3d_shader_dst_param *param;
    const struct signature_element *e;
    unsigned int i, count;

    for (i = 0, count = 1; i < s->element_count; ++i)
        count += !!s->elements[i].used_mask;

    if (!shader_instruction_array_reserve(&normaliser->instructions, normaliser->instructions.count + count))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    memmove(&normaliser->instructions.elements[dst + count], &normaliser->instructions.elements[dst],
            (normaliser->instructions.count - dst) * sizeof(*normaliser->instructions.elements));
    normaliser->instructions.count += count;

    ins = &normaliser->instructions.elements[dst];
    vsir_instruction_init(ins, location, VKD3DSIH_HS_CONTROL_POINT_PHASE);
    ins->flags = 1;
    ++ins;

    for (i = 0; i < s->element_count; ++i)
    {
        e = &s->elements[i];
        if (!e->used_mask)
            continue;

        if (e->sysval_semantic != VKD3D_SHADER_SV_NONE)
        {
            vsir_instruction_init(ins, location, VKD3DSIH_DCL_INPUT_SIV);
            param = &ins->declaration.register_semantic.reg;
            ins->declaration.register_semantic.sysval_semantic = vkd3d_siv_from_sysval(e->sysval_semantic);
        }
        else
        {
            vsir_instruction_init(ins, location, VKD3DSIH_DCL_INPUT);
            param = &ins->declaration.dst;
        }

        shader_dst_param_io_init(param, e, VKD3DSPR_INPUT, 2);
        param->reg.idx[0].offset = input_control_point_count;
        param->reg.idx[1].offset = e->register_index;
        param->write_mask = e->mask;

        ++ins;
    }

    return VKD3D_OK;
}

static enum vkd3d_result instruction_array_normalise_hull_shader_control_point_io(
        struct vkd3d_shader_instruction_array *src_instructions, const struct shader_signature *input_signature)
{
    struct vkd3d_shader_instruction_array *instructions;
    struct control_point_normaliser normaliser;
    unsigned int input_control_point_count;
    struct vkd3d_shader_location location;
    struct vkd3d_shader_instruction *ins;
    enum vkd3d_result ret;
    unsigned int i, j;

    if (!(normaliser.outpointid_param = instruction_array_create_outpointid_param(src_instructions)))
    {
        ERR("Failed to allocate src param.\n");
        return VKD3D_ERROR_OUT_OF_MEMORY;
    }
    normaliser.instructions = *src_instructions;
    instructions = &normaliser.instructions;
    normaliser.phase = VKD3DSIH_INVALID;

    for (i = 0; i < normaliser.instructions.count; ++i)
    {
        ins = &instructions->elements[i];

        switch (ins->handler_idx)
        {
            case VKD3DSIH_HS_CONTROL_POINT_PHASE:
            case VKD3DSIH_HS_FORK_PHASE:
            case VKD3DSIH_HS_JOIN_PHASE:
                normaliser.phase = ins->handler_idx;
                break;
            default:
                if (vsir_instruction_is_dcl(ins))
                    break;
                for (j = 0; j < ins->dst_count; ++j)
                    shader_dst_param_normalise_outpointid(&ins->dst[j], &normaliser);
                break;
        }
    }

    normaliser.phase = VKD3DSIH_INVALID;
    input_control_point_count = 1;

    for (i = 0; i < instructions->count; ++i)
    {
        ins = &instructions->elements[i];

        switch (ins->handler_idx)
        {
            case VKD3DSIH_DCL_INPUT_CONTROL_POINT_COUNT:
                input_control_point_count = ins->declaration.count;
                break;
            case VKD3DSIH_HS_CONTROL_POINT_PHASE:
                *src_instructions = normaliser.instructions;
                return VKD3D_OK;
            case VKD3DSIH_HS_FORK_PHASE:
            case VKD3DSIH_HS_JOIN_PHASE:
                /* ins may be relocated if the instruction array expands. */
                location = ins->location;
                ret = control_point_normaliser_emit_hs_input(&normaliser, input_signature,
                        input_control_point_count, i, &location);
                *src_instructions = normaliser.instructions;
                return ret;
            default:
                break;
        }
    }

    *src_instructions = normaliser.instructions;
    return VKD3D_OK;
}

struct io_normaliser
{
    struct vkd3d_shader_instruction_array instructions;
    enum vkd3d_shader_type shader_type;
    uint8_t major;
    struct shader_signature *input_signature;
    struct shader_signature *output_signature;
    struct shader_signature *patch_constant_signature;

    unsigned int instance_count;
    unsigned int phase_body_idx;
    enum vkd3d_shader_opcode phase;
    unsigned int output_control_point_count;

    struct vkd3d_shader_src_param *outpointid_param;

    struct vkd3d_shader_dst_param *input_dcl_params[MAX_REG_OUTPUT];
    struct vkd3d_shader_dst_param *output_dcl_params[MAX_REG_OUTPUT];
    struct vkd3d_shader_dst_param *pc_dcl_params[MAX_REG_OUTPUT];
    uint8_t input_range_map[MAX_REG_OUTPUT][VKD3D_VEC4_SIZE];
    uint8_t output_range_map[MAX_REG_OUTPUT][VKD3D_VEC4_SIZE];
    uint8_t pc_range_map[MAX_REG_OUTPUT][VKD3D_VEC4_SIZE];

    bool use_vocp;
};

static bool io_normaliser_is_in_fork_or_join_phase(const struct io_normaliser *normaliser)
{
    return normaliser->phase == VKD3DSIH_HS_FORK_PHASE || normaliser->phase == VKD3DSIH_HS_JOIN_PHASE;
}

static bool io_normaliser_is_in_control_point_phase(const struct io_normaliser *normaliser)
{
    return normaliser->phase == VKD3DSIH_HS_CONTROL_POINT_PHASE;
}

static unsigned int shader_signature_find_element_for_reg(const struct shader_signature *signature,
        unsigned int reg_idx, unsigned int write_mask)
{
    unsigned int i, base_write_mask;

    for (i = 0; i < signature->element_count; ++i)
    {
        struct signature_element *e = &signature->elements[i];
        if (e->register_index <= reg_idx && e->register_index + e->register_count > reg_idx
                && (e->mask & write_mask) == write_mask)
        {
            return i;
        }
    }

    /* Validated in the TPF reader, but failure in signature_element_range_expand_mask()
     * can land us here on an unmatched vector mask. */
    FIXME("Failed to find signature element for register index %u, mask %#x; using scalar mask.\n",
            reg_idx, write_mask);
    base_write_mask = 1u << vsir_write_mask_get_component_idx(write_mask);
    if (base_write_mask != write_mask)
        return shader_signature_find_element_for_reg(signature, reg_idx, base_write_mask);

    vkd3d_unreachable();
}

struct signature_element *vsir_signature_find_element_for_reg(const struct shader_signature *signature,
        unsigned int reg_idx, unsigned int write_mask)
{
    return &signature->elements[shader_signature_find_element_for_reg(signature, reg_idx, write_mask)];
}

static unsigned int range_map_get_register_count(uint8_t range_map[][VKD3D_VEC4_SIZE],
        unsigned int register_idx, uint32_t write_mask)
{
    return range_map[register_idx][vsir_write_mask_get_component_idx(write_mask)];
}

static void range_map_set_register_range(uint8_t range_map[][VKD3D_VEC4_SIZE], unsigned int register_idx,
        unsigned int register_count, uint32_t write_mask, bool is_dcl_indexrange)
{
    unsigned int i, j, r, c, component_idx, component_count;

    assert(write_mask <= VKD3DSP_WRITEMASK_ALL);
    component_idx = vsir_write_mask_get_component_idx(write_mask);
    component_count = vsir_write_mask_component_count(write_mask);

    assert(register_idx < MAX_REG_OUTPUT && MAX_REG_OUTPUT - register_idx >= register_count);

    if (range_map[register_idx][component_idx] > register_count && is_dcl_indexrange)
    {
        /* Validated in the TPF reader. */
        assert(range_map[register_idx][component_idx] != UINT8_MAX);
        return;
    }
    if (range_map[register_idx][component_idx] == register_count)
    {
        /* Already done. This happens when fxc splits a register declaration by
         * component(s). The dcl_indexrange instructions are split too. */
        return;
    }
    range_map[register_idx][component_idx] = register_count;

    for (i = 0; i < register_count; ++i)
    {
        r = register_idx + i;
        for (j = !i; j < component_count; ++j)
        {
            c = component_idx + j;
            /* A synthetic patch constant range which overlaps an existing range can start upstream of it
             * for fork/join phase instancing, but ranges declared by dcl_indexrange should not overlap.
             * The latter is validated in the TPF reader. */
            assert(!range_map[r][c] || !is_dcl_indexrange);
            range_map[r][c] = UINT8_MAX;
        }
    }
}

static void io_normaliser_add_index_range(struct io_normaliser *normaliser,
        const struct vkd3d_shader_instruction *ins)
{
    const struct vkd3d_shader_index_range *range = &ins->declaration.index_range;
    const struct vkd3d_shader_register *reg = &range->dst.reg;
    unsigned int reg_idx, write_mask, element_idx;
    const struct shader_signature *signature;
    uint8_t (*range_map)[VKD3D_VEC4_SIZE];

    switch (reg->type)
    {
        case VKD3DSPR_INPUT:
        case VKD3DSPR_INCONTROLPOINT:
            range_map = normaliser->input_range_map;
            signature = normaliser->input_signature;
            break;
        case VKD3DSPR_OUTCONTROLPOINT:
            range_map = normaliser->output_range_map;
            signature = normaliser->output_signature;
            break;
        case VKD3DSPR_OUTPUT:
            if (!io_normaliser_is_in_fork_or_join_phase(normaliser))
            {
                range_map = normaliser->output_range_map;
                signature = normaliser->output_signature;
                break;
            }
            /* fall through */
        case VKD3DSPR_PATCHCONST:
            range_map = normaliser->pc_range_map;
            signature = normaliser->patch_constant_signature;
            break;
        default:
            /* Validated in the TPF reader. */
            vkd3d_unreachable();
    }

    reg_idx = reg->idx[reg->idx_count - 1].offset;
    write_mask = range->dst.write_mask;
    element_idx = shader_signature_find_element_for_reg(signature, reg_idx, write_mask);
    range_map_set_register_range(range_map, reg_idx, range->register_count,
            signature->elements[element_idx].mask, true);
}

static int signature_element_mask_compare(const void *a, const void *b)
{
    const struct signature_element *e = a, *f = b;
    int ret;

    return (ret = vkd3d_u32_compare(e->mask, f->mask)) ? ret : vkd3d_u32_compare(e->register_index, f->register_index);
}

static bool sysval_semantics_should_merge(const struct signature_element *e, const struct signature_element *f)
{
    if (e->sysval_semantic < VKD3D_SHADER_SV_TESS_FACTOR_QUADEDGE
            || e->sysval_semantic > VKD3D_SHADER_SV_TESS_FACTOR_LINEDEN)
        return false;

    return e->sysval_semantic == f->sysval_semantic
            /* Line detail and density must be merged together to match the SPIR-V array.
             * This deletes one of the two sysvals, but these are not used. */
            || (e->sysval_semantic == VKD3D_SHADER_SV_TESS_FACTOR_LINEDET
            && f->sysval_semantic == VKD3D_SHADER_SV_TESS_FACTOR_LINEDEN)
            || (e->sysval_semantic == VKD3D_SHADER_SV_TESS_FACTOR_LINEDEN
            && f->sysval_semantic == VKD3D_SHADER_SV_TESS_FACTOR_LINEDET);
}

/* Merge tess factor sysvals because they are an array in SPIR-V. */
static void shader_signature_map_patch_constant_index_ranges(struct shader_signature *s,
        uint8_t range_map[][VKD3D_VEC4_SIZE])
{
    struct signature_element *e, *f;
    unsigned int i, j, register_count;

    qsort(s->elements, s->element_count, sizeof(s->elements[0]), signature_element_mask_compare);

    for (i = 0; i < s->element_count; i += register_count)
    {
        e = &s->elements[i];
        register_count = 1;

        if (!e->sysval_semantic)
            continue;

        for (j = i + 1; j < s->element_count; ++j, ++register_count)
        {
            f = &s->elements[j];
            if (f->register_index != e->register_index + register_count || !sysval_semantics_should_merge(e, f))
                break;
        }
        if (register_count < 2)
            continue;

        range_map_set_register_range(range_map, e->register_index, register_count, e->mask, false);
    }
}

static int signature_element_register_compare(const void *a, const void *b)
{
    const struct signature_element *e = a, *f = b;

    return vkd3d_u32_compare(e->register_index, f->register_index);
}

static int signature_element_index_compare(const void *a, const void *b)
{
    const struct signature_element *e = a, *f = b;

    return vkd3d_u32_compare(e->sort_index, f->sort_index);
}

static unsigned int signature_element_range_expand_mask(struct signature_element *e, unsigned int register_count,
        uint8_t range_map[][VKD3D_VEC4_SIZE])
{
    unsigned int i, j, component_idx, component_count, merged_write_mask = e->mask;

    /* dcl_indexrange instructions can declare a subset of the full mask, and the masks of
     * the elements within the range may differ. TPF's handling of arrayed inputs with
     * dcl_indexrange is really just a hack. Here we create a mask which covers all element
     * masks, and check for collisions with other ranges. */

    for (i = 1; i < register_count; ++i)
        merged_write_mask |= e[i].mask;

    if (merged_write_mask == e->mask)
        return merged_write_mask;

    /* Reaching this point is very rare to begin with, and collisions are even rarer or
     * impossible. If the latter shows up, the fallback in shader_signature_find_element_for_reg()
     * may be sufficient. */

    component_idx = vsir_write_mask_get_component_idx(e->mask);
    component_count = vsir_write_mask_component_count(e->mask);

    for (i = e->register_index; i < e->register_index + register_count; ++i)
    {
        for (j = 0; j < component_idx; ++j)
            if (range_map[i][j])
                break;
        for (j = component_idx + component_count; j < VKD3D_VEC4_SIZE; ++j)
            if (range_map[i][j])
                break;
    }

    if (i == register_count)
    {
        WARN("Expanding mask %#x to %#x for %s, base reg %u, count %u.\n", e->mask, merged_write_mask,
                e->semantic_name, e->register_index, register_count);
        return merged_write_mask;
    }

    WARN("Cannot expand mask %#x to %#x for %s, base reg %u, count %u.\n", e->mask, merged_write_mask,
            e->semantic_name, e->register_index, register_count);
    return e->mask;
}

static bool shader_signature_merge(struct shader_signature *s, uint8_t range_map[][VKD3D_VEC4_SIZE],
        bool is_patch_constant)
{
    unsigned int i, j, element_count, new_count, register_count;
    struct signature_element *elements;
    struct signature_element *e, *f;
    bool used;

    element_count = s->element_count;
    if (!(elements = vkd3d_malloc(element_count * sizeof(*elements))))
        return false;
    memcpy(elements, s->elements, element_count * sizeof(*elements));

    qsort(elements, element_count, sizeof(elements[0]), signature_element_register_compare);

    for (i = 0, new_count = 0; i < element_count; i = j, elements[new_count++] = *e)
    {
        e = &elements[i];
        j = i + 1;

        if (e->register_index == ~0u)
            continue;

        /* Do not merge if the register index will be relative-addressed. */
        if (range_map_get_register_count(range_map, e->register_index, e->mask) > 1)
            continue;

        used = e->used_mask;

        for (; j < element_count; ++j)
        {
            f = &elements[j];

            /* Merge different components of the same register unless sysvals are different,
             * or it will be relative-addressed. */
            if (f->register_index != e->register_index || f->sysval_semantic != e->sysval_semantic
                    || range_map_get_register_count(range_map, f->register_index, f->mask) > 1)
                break;

            TRACE("Merging %s, reg %u, mask %#x, sysval %#x with %s, mask %#x, sysval %#x.\n", e->semantic_name,
                    e->register_index, e->mask, e->sysval_semantic, f->semantic_name, f->mask, f->sysval_semantic);
            assert(!(e->mask & f->mask));

            e->mask |= f->mask;
            e->used_mask |= f->used_mask;
            e->semantic_index = min(e->semantic_index, f->semantic_index);

            /* The first element may have no interpolation mode if it is unused. Elements which
             * actually have different interpolation modes are assigned different registers. */
            if (f->used_mask && !used)
            {
                if (e->interpolation_mode && e->interpolation_mode != f->interpolation_mode)
                    FIXME("Mismatching interpolation modes %u and %u.\n", e->interpolation_mode, f->interpolation_mode);
                else
                    e->interpolation_mode = f->interpolation_mode;
            }
        }
    }
    element_count = new_count;
    vkd3d_free(s->elements);
    s->elements = elements;
    s->element_count = element_count;

    if (is_patch_constant)
        shader_signature_map_patch_constant_index_ranges(s, range_map);

    for (i = 0, new_count = 0; i < element_count; i += register_count, elements[new_count++] = *e)
    {
        e = &elements[i];
        register_count = 1;

        if (e->register_index >= MAX_REG_OUTPUT)
            continue;

        register_count = range_map_get_register_count(range_map, e->register_index, e->mask);
        assert(register_count != UINT8_MAX);
        register_count += !register_count;

        if (register_count > 1)
        {
            TRACE("Merging %s, base reg %u, count %u.\n", e->semantic_name, e->register_index, register_count);
            e->register_count = register_count;
            e->mask = signature_element_range_expand_mask(e, register_count, range_map);
        }
    }
    element_count = new_count;

    /* Restoring the original order is required for sensible trace output. */
    qsort(elements, element_count, sizeof(elements[0]), signature_element_index_compare);

    s->element_count = element_count;

    return true;
}

static unsigned int shader_register_normalise_arrayed_addressing(struct vkd3d_shader_register *reg,
        unsigned int id_idx, unsigned int register_index)
{
    assert(id_idx < ARRAY_SIZE(reg->idx) - 1);

    /* For a relative-addressed register index, move the id up a slot to separate it from the address,
     * because rel_addr can be replaced with a constant offset in some cases. */
    if (reg->idx[id_idx].rel_addr)
    {
        reg->idx[id_idx + 1].rel_addr = NULL;
        reg->idx[id_idx + 1].offset = reg->idx[id_idx].offset;
        reg->idx[id_idx].offset -= register_index;
        if (id_idx)
        {
            /* idx[id_idx] now contains the array index, which must be moved below the control point id. */
            struct vkd3d_shader_register_index tmp = reg->idx[id_idx];
            reg->idx[id_idx] = reg->idx[id_idx - 1];
            reg->idx[id_idx - 1] = tmp;
        }
        ++id_idx;
    }
    /* Otherwise we have no address for the arrayed register, so insert one. This happens e.g. where
     * tessellation level registers are merged into an array because they're an array in SPIR-V. */
    else
    {
        ++id_idx;
        memmove(&reg->idx[1], &reg->idx[0], id_idx * sizeof(reg->idx[0]));
        reg->idx[0].rel_addr = NULL;
        reg->idx[0].offset = reg->idx[id_idx].offset - register_index;
    }

    return id_idx;
}

static bool shader_dst_param_io_normalise(struct vkd3d_shader_dst_param *dst_param, bool is_io_dcl,
         struct io_normaliser *normaliser)
 {
    unsigned int id_idx, reg_idx, write_mask, element_idx;
    struct vkd3d_shader_register *reg = &dst_param->reg;
    struct vkd3d_shader_dst_param **dcl_params;
    const struct shader_signature *signature;
    const struct signature_element *e;

    switch (reg->type)
    {
        case VKD3DSPR_OUTPUT:
            reg_idx = reg->idx[reg->idx_count - 1].offset;
            if (io_normaliser_is_in_fork_or_join_phase(normaliser))
            {
                signature = normaliser->patch_constant_signature;
                /* Convert patch constant outputs to the patch constant register type to avoid the need
                 * to convert compiler symbols when accessed as inputs in a later stage. */
                reg->type = VKD3DSPR_PATCHCONST;
                dcl_params = normaliser->pc_dcl_params;
            }
            else
            {
                signature = normaliser->output_signature;
                dcl_params = normaliser->output_dcl_params;
            }
            break;

        case VKD3DSPR_PATCHCONST:
            reg_idx = reg->idx[reg->idx_count - 1].offset;
            signature = normaliser->patch_constant_signature;
            dcl_params = normaliser->pc_dcl_params;
            break;

        case VKD3DSPR_COLOROUT:
            reg_idx = reg->idx[0].offset;
            signature = normaliser->output_signature;
            reg->type = VKD3DSPR_OUTPUT;
            dcl_params = normaliser->output_dcl_params;
            break;

        case VKD3DSPR_INCONTROLPOINT:
        case VKD3DSPR_INPUT:
            reg_idx = reg->idx[reg->idx_count - 1].offset;
            signature = normaliser->input_signature;
            reg->type = VKD3DSPR_INPUT;
            dcl_params = normaliser->input_dcl_params;
            break;

        case VKD3DSPR_ATTROUT:
            reg_idx = SM1_COLOR_REGISTER_OFFSET + reg->idx[0].offset;
            signature = normaliser->output_signature;
            reg->type = VKD3DSPR_OUTPUT;
            dcl_params = normaliser->output_dcl_params;
            break;

        case VKD3DSPR_RASTOUT:
            reg_idx = SM1_RASTOUT_REGISTER_OFFSET + reg->idx[0].offset;
            signature = normaliser->output_signature;
            reg->type = VKD3DSPR_OUTPUT;
            dcl_params = normaliser->output_dcl_params;
            break;

        default:
            return true;
    }

    id_idx = reg->idx_count - 1;
    write_mask = dst_param->write_mask;
    element_idx = shader_signature_find_element_for_reg(signature, reg_idx, write_mask);
    e = &signature->elements[element_idx];

    dst_param->write_mask >>= vsir_write_mask_get_component_idx(e->mask);
    if (is_io_dcl)
    {
        /* Validated in the TPF reader. */
        assert(element_idx < ARRAY_SIZE(normaliser->input_dcl_params));

        if (dcl_params[element_idx])
        {
            /* Merge split declarations into a single one. */
            dcl_params[element_idx]->write_mask |= dst_param->write_mask;
            /* Turn this into a nop. */
            return false;
        }
        else
        {
            dcl_params[element_idx] = dst_param;
        }
    }

    if (io_normaliser_is_in_control_point_phase(normaliser) && reg->type == VKD3DSPR_OUTPUT)
    {
        if (is_io_dcl)
        {
            /* Emit an array size for the control points for consistency with inputs. */
            reg->idx[0].offset = normaliser->output_control_point_count;
        }
        else
        {
            /* The control point id param. */
            assert(reg->idx[0].rel_addr);
        }
        id_idx = 1;
    }

    if ((e->register_count > 1 || vsir_sysval_semantic_is_tess_factor(e->sysval_semantic)))
    {
        if (is_io_dcl)
        {
            /* For control point I/O, idx 0 contains the control point count.
             * Ensure it is moved up to the next slot. */
            reg->idx[id_idx].offset = reg->idx[0].offset;
            reg->idx[0].offset = e->register_count;
            ++id_idx;
        }
        else
        {
            id_idx = shader_register_normalise_arrayed_addressing(reg, id_idx, e->register_index);
        }
    }

    /* Replace the register index with the signature element index */
    reg->idx[id_idx].offset = element_idx;
    reg->idx_count = id_idx + 1;

    return true;
}

static void shader_src_param_io_normalise(struct vkd3d_shader_src_param *src_param,
        struct io_normaliser *normaliser)
{
    unsigned int i, id_idx, reg_idx, write_mask, element_idx, component_idx;
    struct vkd3d_shader_register *reg = &src_param->reg;
    const struct shader_signature *signature;
    const struct signature_element *e;

    /* Input/output registers from one phase can be used as inputs in
     * subsequent phases. Specifically:
     *
     *   - Control phase inputs are available as "vicp" in fork and join
     *     phases.
     *   - Control phase outputs are available as "vocp" in fork and join
     *     phases.
     *   - Fork phase patch constants are available as "vpc" in join
     *     phases.
     *
     * We handle "vicp" here by converting INCONTROLPOINT src registers to
     * type INPUT so they match the control phase declarations. We handle
     * "vocp" by converting OUTCONTROLPOINT registers to type OUTPUT.
     * Merging fork and join phases handles "vpc". */

    switch (reg->type)
    {
        case VKD3DSPR_PATCHCONST:
            reg_idx = reg->idx[reg->idx_count - 1].offset;
            signature = normaliser->patch_constant_signature;
            break;

        case VKD3DSPR_INCONTROLPOINT:
            reg->type = VKD3DSPR_INPUT;
            /* fall through */
        case VKD3DSPR_INPUT:
            if (normaliser->major < 3 && normaliser->shader_type == VKD3D_SHADER_TYPE_PIXEL)
                reg_idx = SM1_COLOR_REGISTER_OFFSET + reg->idx[0].offset;
            else
                reg_idx = reg->idx[reg->idx_count - 1].offset;
            signature = normaliser->input_signature;
            break;

        case VKD3DSPR_OUTCONTROLPOINT:
            reg->type = VKD3DSPR_OUTPUT;
            /* fall through */
        case VKD3DSPR_OUTPUT:
            reg_idx = reg->idx[reg->idx_count - 1].offset;
            signature = normaliser->output_signature;
            break;

        case VKD3DSPR_TEXTURE:
            if (normaliser->shader_type != VKD3D_SHADER_TYPE_PIXEL)
                return;
            reg->type = VKD3DSPR_INPUT;
            reg_idx = reg->idx[0].offset;
            signature = normaliser->input_signature;
            break;

        default:
            return;
    }

    id_idx = reg->idx_count - 1;
    write_mask = VKD3DSP_WRITEMASK_0 << vsir_swizzle_get_component(src_param->swizzle, 0);
    element_idx = shader_signature_find_element_for_reg(signature, reg_idx, write_mask);

    e = &signature->elements[element_idx];
    if ((e->register_count > 1 || vsir_sysval_semantic_is_tess_factor(e->sysval_semantic)))
        id_idx = shader_register_normalise_arrayed_addressing(reg, id_idx, e->register_index);
    reg->idx[id_idx].offset = element_idx;
    reg->idx_count = id_idx + 1;

    if ((component_idx = vsir_write_mask_get_component_idx(e->mask)))
    {
        for (i = 0; i < VKD3D_VEC4_SIZE; ++i)
            if (vsir_swizzle_get_component(src_param->swizzle, i))
                src_param->swizzle -= component_idx << VKD3D_SHADER_SWIZZLE_SHIFT(i);
    }
}

static void shader_instruction_normalise_io_params(struct vkd3d_shader_instruction *ins,
        struct io_normaliser *normaliser)
{
    struct vkd3d_shader_register *reg;
    unsigned int i;

    switch (ins->handler_idx)
    {
        case VKD3DSIH_DCL_INPUT:
            if (normaliser->shader_type == VKD3D_SHADER_TYPE_HULL)
            {
                reg = &ins->declaration.dst.reg;

                if (reg->type == VKD3DSPR_OUTCONTROLPOINT)
                    normaliser->use_vocp = true;

                /* We don't need to keep OUTCONTROLPOINT or PATCHCONST input declarations since their
                * equivalents were declared earlier, but INCONTROLPOINT may be the first occurrence. */
                if (reg->type == VKD3DSPR_OUTCONTROLPOINT || reg->type == VKD3DSPR_PATCHCONST)
                    vkd3d_shader_instruction_make_nop(ins);
                else if (reg->type == VKD3DSPR_INCONTROLPOINT)
                    reg->type = VKD3DSPR_INPUT;
            }
            /* fall through */
        case VKD3DSIH_DCL_INPUT_PS:
        case VKD3DSIH_DCL_OUTPUT:
            if (!shader_dst_param_io_normalise(&ins->declaration.dst, true, normaliser))
                vkd3d_shader_instruction_make_nop(ins);
            break;
        case VKD3DSIH_DCL_INPUT_SGV:
        case VKD3DSIH_DCL_INPUT_SIV:
        case VKD3DSIH_DCL_INPUT_PS_SGV:
        case VKD3DSIH_DCL_INPUT_PS_SIV:
        case VKD3DSIH_DCL_OUTPUT_SIV:
            if (!shader_dst_param_io_normalise(&ins->declaration.register_semantic.reg, true, normaliser))
                vkd3d_shader_instruction_make_nop(ins);
            break;
        case VKD3DSIH_HS_CONTROL_POINT_PHASE:
        case VKD3DSIH_HS_FORK_PHASE:
        case VKD3DSIH_HS_JOIN_PHASE:
            normaliser->phase = ins->handler_idx;
            memset(normaliser->input_dcl_params, 0, sizeof(normaliser->input_dcl_params));
            memset(normaliser->output_dcl_params, 0, sizeof(normaliser->output_dcl_params));
            memset(normaliser->pc_dcl_params, 0, sizeof(normaliser->pc_dcl_params));
            break;
        default:
            if (vsir_instruction_is_dcl(ins))
                break;
            for (i = 0; i < ins->dst_count; ++i)
                shader_dst_param_io_normalise(&ins->dst[i], false, normaliser);
            for (i = 0; i < ins->src_count; ++i)
                shader_src_param_io_normalise(&ins->src[i], normaliser);
            break;
    }
}

static enum vkd3d_result vsir_program_normalise_io_registers(struct vsir_program *program)
{
    struct io_normaliser normaliser = {program->instructions};
    struct vkd3d_shader_instruction *ins;
    bool has_control_point_phase;
    unsigned int i, j;

    normaliser.phase = VKD3DSIH_INVALID;
    normaliser.shader_type = program->shader_version.type;
    normaliser.major = program->shader_version.major;
    normaliser.input_signature = &program->input_signature;
    normaliser.output_signature = &program->output_signature;
    normaliser.patch_constant_signature = &program->patch_constant_signature;

    for (i = 0, has_control_point_phase = false; i < program->instructions.count; ++i)
    {
        ins = &program->instructions.elements[i];

        switch (ins->handler_idx)
        {
            case VKD3DSIH_DCL_OUTPUT_CONTROL_POINT_COUNT:
                normaliser.output_control_point_count = ins->declaration.count;
                break;
            case VKD3DSIH_DCL_INDEX_RANGE:
                io_normaliser_add_index_range(&normaliser, ins);
                vkd3d_shader_instruction_make_nop(ins);
                break;
            case VKD3DSIH_HS_CONTROL_POINT_PHASE:
                has_control_point_phase = true;
                /* fall through */
            case VKD3DSIH_HS_FORK_PHASE:
            case VKD3DSIH_HS_JOIN_PHASE:
                normaliser.phase = ins->handler_idx;
                break;
            default:
                break;
        }
    }

    if (normaliser.shader_type == VKD3D_SHADER_TYPE_HULL && !has_control_point_phase)
    {
        /* Inputs and outputs must match for the default phase, so merge ranges must match too. */
        for (i = 0; i < MAX_REG_OUTPUT; ++i)
        {
            for (j = 0; j < VKD3D_VEC4_SIZE; ++j)
            {
                if (!normaliser.input_range_map[i][j] && normaliser.output_range_map[i][j])
                    normaliser.input_range_map[i][j] = normaliser.output_range_map[i][j];
                else if (normaliser.input_range_map[i][j] && !normaliser.output_range_map[i][j])
                    normaliser.output_range_map[i][j] = normaliser.input_range_map[i][j];
                else assert(normaliser.input_range_map[i][j] == normaliser.output_range_map[i][j]);
            }
        }
    }

    if (!shader_signature_merge(&program->input_signature, normaliser.input_range_map, false)
            || !shader_signature_merge(&program->output_signature, normaliser.output_range_map, false)
            || !shader_signature_merge(&program->patch_constant_signature, normaliser.pc_range_map, true))
    {
        program->instructions = normaliser.instructions;
        return VKD3D_ERROR_OUT_OF_MEMORY;
    }

    normaliser.phase = VKD3DSIH_INVALID;
    for (i = 0; i < normaliser.instructions.count; ++i)
        shader_instruction_normalise_io_params(&normaliser.instructions.elements[i], &normaliser);

    program->instructions = normaliser.instructions;
    program->use_vocp = normaliser.use_vocp;
    return VKD3D_OK;
}

struct flat_constant_def
{
    enum vkd3d_shader_d3dbc_constant_register set;
    uint32_t index;
    uint32_t value[4];
};

struct flat_constants_normaliser
{
    struct flat_constant_def *defs;
    size_t def_count, defs_capacity;
};

static bool get_flat_constant_register_type(const struct vkd3d_shader_register *reg,
        enum vkd3d_shader_d3dbc_constant_register *set, uint32_t *index)
{
    static const struct
    {
        enum vkd3d_shader_register_type type;
        enum vkd3d_shader_d3dbc_constant_register set;
        uint32_t offset;
    }
    regs[] =
    {
        {VKD3DSPR_CONST, VKD3D_SHADER_D3DBC_FLOAT_CONSTANT_REGISTER, 0},
        {VKD3DSPR_CONST2, VKD3D_SHADER_D3DBC_FLOAT_CONSTANT_REGISTER, 2048},
        {VKD3DSPR_CONST3, VKD3D_SHADER_D3DBC_FLOAT_CONSTANT_REGISTER, 4096},
        {VKD3DSPR_CONST4, VKD3D_SHADER_D3DBC_FLOAT_CONSTANT_REGISTER, 6144},
        {VKD3DSPR_CONSTINT, VKD3D_SHADER_D3DBC_INT_CONSTANT_REGISTER, 0},
        {VKD3DSPR_CONSTBOOL, VKD3D_SHADER_D3DBC_BOOL_CONSTANT_REGISTER, 0},
    };

    unsigned int i;

    for (i = 0; i < ARRAY_SIZE(regs); ++i)
    {
        if (reg->type == regs[i].type)
        {
            if (reg->idx[0].rel_addr)
            {
                FIXME("Unhandled relative address.\n");
                return false;
            }

            *set = regs[i].set;
            *index = regs[i].offset + reg->idx[0].offset;
            return true;
        }
    }

    return false;
}

static void shader_register_normalise_flat_constants(struct vkd3d_shader_src_param *param,
        const struct flat_constants_normaliser *normaliser)
{
    enum vkd3d_shader_d3dbc_constant_register set;
    uint32_t index;
    size_t i, j;

    if (!get_flat_constant_register_type(&param->reg, &set, &index))
        return;

    for (i = 0; i < normaliser->def_count; ++i)
    {
        if (normaliser->defs[i].set == set && normaliser->defs[i].index == index)
        {
            param->reg.type = VKD3DSPR_IMMCONST;
            param->reg.idx_count = 0;
            param->reg.dimension = VSIR_DIMENSION_VEC4;
            for (j = 0; j < 4; ++j)
                param->reg.u.immconst_u32[j] = normaliser->defs[i].value[j];
            return;
        }
    }

    param->reg.type = VKD3DSPR_CONSTBUFFER;
    param->reg.idx[0].offset = set; /* register ID */
    param->reg.idx[1].offset = set; /* register index */
    param->reg.idx[2].offset = index; /* buffer index */
    param->reg.idx_count = 3;
}

static enum vkd3d_result instruction_array_normalise_flat_constants(struct vsir_program *program)
{
    struct flat_constants_normaliser normaliser = {0};
    unsigned int i, j;

    for (i = 0; i < program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &program->instructions.elements[i];

        if (ins->handler_idx == VKD3DSIH_DEF || ins->handler_idx == VKD3DSIH_DEFI || ins->handler_idx == VKD3DSIH_DEFB)
        {
            struct flat_constant_def *def;

            if (!vkd3d_array_reserve((void **)&normaliser.defs, &normaliser.defs_capacity,
                    normaliser.def_count + 1, sizeof(*normaliser.defs)))
            {
                vkd3d_free(normaliser.defs);
                return VKD3D_ERROR_OUT_OF_MEMORY;
            }

            def = &normaliser.defs[normaliser.def_count++];

            get_flat_constant_register_type((struct vkd3d_shader_register *)&ins->dst[0].reg, &def->set, &def->index);
            for (j = 0; j < 4; ++j)
                def->value[j] = ins->src[0].reg.u.immconst_u32[j];

            vkd3d_shader_instruction_make_nop(ins);
        }
        else
        {
            for (j = 0; j < ins->src_count; ++j)
                shader_register_normalise_flat_constants(&ins->src[j], &normaliser);
        }
    }

    vkd3d_free(normaliser.defs);
    return VKD3D_OK;
}

static void remove_dead_code(struct vsir_program *program)
{
    size_t i, depth = 0;
    bool dead = false;

    for (i = 0; i < program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &program->instructions.elements[i];

        switch (ins->handler_idx)
        {
            case VKD3DSIH_IF:
            case VKD3DSIH_LOOP:
            case VKD3DSIH_SWITCH:
                if (dead)
                {
                    vkd3d_shader_instruction_make_nop(ins);
                    ++depth;
                }
                break;

            case VKD3DSIH_ENDIF:
            case VKD3DSIH_ENDLOOP:
            case VKD3DSIH_ENDSWITCH:
            case VKD3DSIH_ELSE:
                if (dead)
                {
                    if (depth > 0)
                    {
                        if (ins->handler_idx != VKD3DSIH_ELSE)
                            --depth;
                        vkd3d_shader_instruction_make_nop(ins);
                    }
                    else
                    {
                        dead = false;
                    }
                }
                break;

            /* `depth' is counted with respect to where the dead code
             * segment began. So it starts at zero and it signals the
             * termination of the dead code segment when it would
             * become negative. */
            case VKD3DSIH_BREAK:
            case VKD3DSIH_RET:
            case VKD3DSIH_CONTINUE:
                if (dead)
                {
                    vkd3d_shader_instruction_make_nop(ins);
                }
                else
                {
                    dead = true;
                    depth = 0;
                }
                break;

            /* If `case' or `default' appears at zero depth, it means
             * that they are a possible target for the corresponding
             * switch, so the code is live again. */
            case VKD3DSIH_CASE:
            case VKD3DSIH_DEFAULT:
                if (dead)
                {
                    if (depth == 0)
                        dead = false;
                    else
                        vkd3d_shader_instruction_make_nop(ins);
                }
                break;

            /* Phase instructions can only appear in hull shaders and
             * outside of any block. When a phase returns, control is
             * moved to the following phase, so they make code live
             * again. */
            case VKD3DSIH_HS_CONTROL_POINT_PHASE:
            case VKD3DSIH_HS_FORK_PHASE:
            case VKD3DSIH_HS_JOIN_PHASE:
                dead = false;
                break;

            default:
                if (dead)
                    vkd3d_shader_instruction_make_nop(ins);
                break;
        }
    }
}

static enum vkd3d_result vsir_program_normalise_combined_samplers(struct vsir_program *program,
        struct vkd3d_shader_message_context *message_context)
{
    unsigned int i;

    for (i = 0; i < program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &program->instructions.elements[i];
        struct vkd3d_shader_src_param *srcs;

        switch (ins->handler_idx)
        {
            case VKD3DSIH_TEX:
                if (!(srcs = shader_src_param_allocator_get(&program->instructions.src_params, 3)))
                    return VKD3D_ERROR_OUT_OF_MEMORY;
                memset(srcs, 0, sizeof(*srcs) * 3);

                ins->handler_idx = VKD3DSIH_SAMPLE;

                srcs[0] = ins->src[0];

                srcs[1].reg.type = VKD3DSPR_RESOURCE;
                srcs[1].reg.idx[0] = ins->src[1].reg.idx[0];
                srcs[1].reg.idx[1] = ins->src[1].reg.idx[0];
                srcs[1].reg.idx_count = 2;
                srcs[1].reg.data_type = VKD3D_DATA_RESOURCE;
                srcs[1].swizzle = VKD3D_SHADER_NO_SWIZZLE;

                srcs[2].reg.type = VKD3DSPR_SAMPLER;
                srcs[2].reg.idx[0] = ins->src[1].reg.idx[0];
                srcs[2].reg.idx[1] = ins->src[1].reg.idx[0];
                srcs[2].reg.idx_count = 2;
                srcs[2].reg.data_type = VKD3D_DATA_SAMPLER;

                ins->src = srcs;
                ins->src_count = 3;
                break;

            case VKD3DSIH_TEXBEM:
            case VKD3DSIH_TEXBEML:
            case VKD3DSIH_TEXCOORD:
            case VKD3DSIH_TEXDEPTH:
            case VKD3DSIH_TEXDP3:
            case VKD3DSIH_TEXDP3TEX:
            case VKD3DSIH_TEXLDD:
            case VKD3DSIH_TEXLDL:
            case VKD3DSIH_TEXM3x2PAD:
            case VKD3DSIH_TEXM3x2TEX:
            case VKD3DSIH_TEXM3x3DIFF:
            case VKD3DSIH_TEXM3x3PAD:
            case VKD3DSIH_TEXM3x3SPEC:
            case VKD3DSIH_TEXM3x3TEX:
            case VKD3DSIH_TEXM3x3VSPEC:
            case VKD3DSIH_TEXREG2AR:
            case VKD3DSIH_TEXREG2GB:
            case VKD3DSIH_TEXREG2RGB:
                vkd3d_shader_error(message_context, &ins->location, VKD3D_SHADER_ERROR_VSIR_NOT_IMPLEMENTED,
                        "Aborting due to not yet implemented feature: "
                        "Combined sampler instruction %#x.", ins->handler_idx);
                return VKD3D_ERROR_NOT_IMPLEMENTED;

            default:
                break;
        }
    }

    return VKD3D_OK;
}

struct cf_flattener_if_info
{
    struct vkd3d_shader_src_param *false_param;
    unsigned int id;
    uint32_t merge_block_id;
    unsigned int else_block_id;
};

struct cf_flattener_loop_info
{
    unsigned int header_block_id;
    unsigned int continue_block_id;
    uint32_t merge_block_id;
};

struct cf_flattener_switch_case
{
    unsigned int value;
    unsigned int block_id;
};

struct cf_flattener_switch_info
{
    size_t ins_location;
    const struct vkd3d_shader_src_param *condition;
    unsigned int id;
    unsigned int merge_block_id;
    unsigned int default_block_id;
    struct cf_flattener_switch_case *cases;
    size_t cases_size;
    unsigned int cases_count;
};

struct cf_flattener_info
{
    union
    {
        struct cf_flattener_if_info if_;
        struct cf_flattener_loop_info loop;
        struct cf_flattener_switch_info switch_;
    } u;

    enum
    {
        VKD3D_BLOCK_IF,
        VKD3D_BLOCK_LOOP,
        VKD3D_BLOCK_SWITCH,
    } current_block;
    bool inside_block;
};

struct cf_flattener
{
    struct vsir_program *program;

    struct vkd3d_shader_location location;
    enum vkd3d_result status;

    struct vkd3d_shader_instruction *instructions;
    size_t instruction_capacity;
    size_t instruction_count;

    unsigned int block_id;
    const char **block_names;
    size_t block_name_capacity;
    size_t block_name_count;

    unsigned int branch_id;
    unsigned int loop_id;
    unsigned int switch_id;

    unsigned int control_flow_depth;
    struct cf_flattener_info *control_flow_info;
    size_t control_flow_info_size;
};

static void cf_flattener_set_error(struct cf_flattener *flattener, enum vkd3d_result error)
{
    if (flattener->status != VKD3D_OK)
        return;
    flattener->status = error;
}

static struct vkd3d_shader_instruction *cf_flattener_require_space(struct cf_flattener *flattener, size_t count)
{
    if (!vkd3d_array_reserve((void **)&flattener->instructions, &flattener->instruction_capacity,
            flattener->instruction_count + count, sizeof(*flattener->instructions)))
    {
        ERR("Failed to allocate instructions.\n");
        cf_flattener_set_error(flattener, VKD3D_ERROR_OUT_OF_MEMORY);
        return NULL;
    }
    return &flattener->instructions[flattener->instruction_count];
}

static bool cf_flattener_copy_instruction(struct cf_flattener *flattener,
        const struct vkd3d_shader_instruction *instruction)
{
    struct vkd3d_shader_instruction *dst_ins;

    if (instruction->handler_idx == VKD3DSIH_NOP)
        return true;

    if (!(dst_ins = cf_flattener_require_space(flattener, 1)))
        return false;

    *dst_ins = *instruction;
    ++flattener->instruction_count;
    return true;
}

static unsigned int cf_flattener_alloc_block_id(struct cf_flattener *flattener)
{
    return ++flattener->block_id;
}

static struct vkd3d_shader_src_param *instruction_src_params_alloc(struct vkd3d_shader_instruction *ins,
        unsigned int count, struct cf_flattener *flattener)
{
    struct vkd3d_shader_src_param *params;

    if (!(params = vsir_program_get_src_params(flattener->program, count)))
    {
        cf_flattener_set_error(flattener, VKD3D_ERROR_OUT_OF_MEMORY);
        return NULL;
    }
    ins->src = params;
    ins->src_count = count;
    return params;
}

static void cf_flattener_emit_label(struct cf_flattener *flattener, unsigned int label_id)
{
    struct vkd3d_shader_instruction *ins;

    if (!(ins = cf_flattener_require_space(flattener, 1)))
        return;
    if (vsir_instruction_init_label(ins, &flattener->location, label_id, flattener->program))
        ++flattener->instruction_count;
    else
        cf_flattener_set_error(flattener, VKD3D_ERROR_OUT_OF_MEMORY);
}

/* For conditional branches, this returns the false target branch parameter. */
static struct vkd3d_shader_src_param *cf_flattener_emit_branch(struct cf_flattener *flattener,
        unsigned int merge_block_id, unsigned int continue_block_id,
        const struct vkd3d_shader_src_param *condition, unsigned int true_id, unsigned int false_id,
        unsigned int flags)
{
    struct vkd3d_shader_src_param *src_params, *false_branch_param;
    struct vkd3d_shader_instruction *ins;

    if (!(ins = cf_flattener_require_space(flattener, 1)))
        return NULL;
    vsir_instruction_init(ins, &flattener->location, VKD3DSIH_BRANCH);

    if (condition)
    {
        if (!(src_params = instruction_src_params_alloc(ins, 4 + !!continue_block_id, flattener)))
            return NULL;
        src_params[0] = *condition;
        if (flags == VKD3D_SHADER_CONDITIONAL_OP_Z)
        {
            vsir_src_param_init_label(&src_params[1], false_id);
            vsir_src_param_init_label(&src_params[2], true_id);
            false_branch_param = &src_params[1];
        }
        else
        {
            vsir_src_param_init_label(&src_params[1], true_id);
            vsir_src_param_init_label(&src_params[2], false_id);
            false_branch_param = &src_params[2];
        }
        vsir_src_param_init_label(&src_params[3], merge_block_id);
        if (continue_block_id)
            vsir_src_param_init_label(&src_params[4], continue_block_id);
    }
    else
    {
        if (!(src_params = instruction_src_params_alloc(ins, merge_block_id ? 3 : 1, flattener)))
            return NULL;
        vsir_src_param_init_label(&src_params[0], true_id);
        if (merge_block_id)
        {
            /* An unconditional branch may only have merge information for a loop, which
             * must have both a merge block and continue block. */
            vsir_src_param_init_label(&src_params[1], merge_block_id);
            vsir_src_param_init_label(&src_params[2], continue_block_id);
        }
        false_branch_param = NULL;
    }

    ++flattener->instruction_count;

    return false_branch_param;
}

static void cf_flattener_emit_conditional_branch_and_merge(struct cf_flattener *flattener,
        const struct vkd3d_shader_src_param *condition, unsigned int true_id, unsigned int flags)
{
    unsigned int merge_block_id;

    merge_block_id = cf_flattener_alloc_block_id(flattener);
    cf_flattener_emit_branch(flattener, merge_block_id, 0, condition, true_id, merge_block_id, flags);
    cf_flattener_emit_label(flattener, merge_block_id);
}

static void cf_flattener_emit_unconditional_branch(struct cf_flattener *flattener, unsigned int target_block_id)
{
    cf_flattener_emit_branch(flattener, 0, 0, NULL, target_block_id, 0, 0);
}

static struct cf_flattener_info *cf_flattener_push_control_flow_level(struct cf_flattener *flattener)
{
    if (!vkd3d_array_reserve((void **)&flattener->control_flow_info, &flattener->control_flow_info_size,
            flattener->control_flow_depth + 1, sizeof(*flattener->control_flow_info)))
    {
        ERR("Failed to allocate control flow info structure.\n");
        cf_flattener_set_error(flattener, VKD3D_ERROR_OUT_OF_MEMORY);
        return NULL;
    }

    return &flattener->control_flow_info[flattener->control_flow_depth++];
}

static void cf_flattener_pop_control_flow_level(struct cf_flattener *flattener)
{
    struct cf_flattener_info *cf_info;

    cf_info = &flattener->control_flow_info[--flattener->control_flow_depth];
    memset(cf_info, 0, sizeof(*cf_info));
}

static struct cf_flattener_info *cf_flattener_find_innermost_loop(struct cf_flattener *flattener)
{
    int depth;

    for (depth = flattener->control_flow_depth - 1; depth >= 0; --depth)
    {
        if (flattener->control_flow_info[depth].current_block == VKD3D_BLOCK_LOOP)
            return &flattener->control_flow_info[depth];
    }

    return NULL;
}

static struct cf_flattener_info *cf_flattener_find_innermost_breakable_cf_construct(struct cf_flattener *flattener)
{
    int depth;

    for (depth = flattener->control_flow_depth - 1; depth >= 0; --depth)
    {
        if (flattener->control_flow_info[depth].current_block == VKD3D_BLOCK_LOOP
                || flattener->control_flow_info[depth].current_block == VKD3D_BLOCK_SWITCH)
            return &flattener->control_flow_info[depth];
    }

    return NULL;
}

static void VKD3D_PRINTF_FUNC(3, 4) cf_flattener_create_block_name(struct cf_flattener *flattener,
        unsigned int block_id, const char *fmt, ...)
{
    struct vkd3d_string_buffer buffer;
    size_t block_name_count;
    va_list args;

    --block_id;

    block_name_count = max(flattener->block_name_count, block_id + 1);
    if (!vkd3d_array_reserve((void **)&flattener->block_names, &flattener->block_name_capacity,
            block_name_count, sizeof(*flattener->block_names)))
        return;
    memset(&flattener->block_names[flattener->block_name_count], 0,
            (block_name_count - flattener->block_name_count) * sizeof(*flattener->block_names));
    flattener->block_name_count = block_name_count;

    vkd3d_string_buffer_init(&buffer);
    va_start(args, fmt);
    vkd3d_string_buffer_vprintf(&buffer, fmt, args);
    va_end(args);

    flattener->block_names[block_id] = buffer.buffer;
}

static enum vkd3d_result cf_flattener_iterate_instruction_array(struct cf_flattener *flattener,
        struct vkd3d_shader_message_context *message_context)
{
    bool main_block_open, is_hull_shader, after_declarations_section;
    struct vkd3d_shader_instruction_array *instructions;
    struct vsir_program *program = flattener->program;
    struct vkd3d_shader_instruction *dst_ins;
    size_t i;

    instructions = &program->instructions;
    is_hull_shader = program->shader_version.type == VKD3D_SHADER_TYPE_HULL;
    main_block_open = !is_hull_shader;
    after_declarations_section = is_hull_shader;

    if (!cf_flattener_require_space(flattener, instructions->count + 1))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    for (i = 0; i < instructions->count; ++i)
    {
        unsigned int loop_header_block_id, loop_body_block_id, continue_block_id, merge_block_id, true_block_id;
        const struct vkd3d_shader_instruction *instruction = &instructions->elements[i];
        const struct vkd3d_shader_src_param *src = instruction->src;
        struct cf_flattener_info *cf_info;

        flattener->location = instruction->location;

        /* Declarations should occur before the first code block, which in hull shaders is marked by the first
         * phase instruction, and in all other shader types begins with the first label instruction.
         * Declaring an indexable temp with function scope is not considered a declaration,
         * because it needs to live inside a function. */
        if (!after_declarations_section && instruction->handler_idx != VKD3DSIH_NOP)
        {
            bool is_function_indexable = instruction->handler_idx == VKD3DSIH_DCL_INDEXABLE_TEMP
                    && instruction->declaration.indexable_temp.has_function_scope;

            if (!vsir_instruction_is_dcl(instruction) || is_function_indexable)
            {
                after_declarations_section = true;
                cf_flattener_emit_label(flattener, cf_flattener_alloc_block_id(flattener));
            }
        }

        cf_info = flattener->control_flow_depth
                ? &flattener->control_flow_info[flattener->control_flow_depth - 1] : NULL;

        switch (instruction->handler_idx)
        {
            case VKD3DSIH_HS_CONTROL_POINT_PHASE:
            case VKD3DSIH_HS_FORK_PHASE:
            case VKD3DSIH_HS_JOIN_PHASE:
                if (!cf_flattener_copy_instruction(flattener, instruction))
                    return VKD3D_ERROR_OUT_OF_MEMORY;
                if (instruction->handler_idx != VKD3DSIH_HS_CONTROL_POINT_PHASE || !instruction->flags)
                    after_declarations_section = false;
                break;

            case VKD3DSIH_LABEL:
                vkd3d_shader_error(message_context, &instruction->location,
                        VKD3D_SHADER_ERROR_VSIR_NOT_IMPLEMENTED,
                        "Aborting due to not yet implemented feature: Label instruction.");
                return VKD3D_ERROR_NOT_IMPLEMENTED;

            case VKD3DSIH_IF:
                if (!(cf_info = cf_flattener_push_control_flow_level(flattener)))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                true_block_id = cf_flattener_alloc_block_id(flattener);
                merge_block_id = cf_flattener_alloc_block_id(flattener);
                cf_info->u.if_.false_param = cf_flattener_emit_branch(flattener, merge_block_id, 0,
                        src, true_block_id, merge_block_id, instruction->flags);
                if (!cf_info->u.if_.false_param)
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                cf_flattener_emit_label(flattener, true_block_id);

                cf_info->u.if_.id = flattener->branch_id;
                cf_info->u.if_.merge_block_id = merge_block_id;
                cf_info->u.if_.else_block_id = 0;
                cf_info->inside_block = true;
                cf_info->current_block = VKD3D_BLOCK_IF;

                cf_flattener_create_block_name(flattener, merge_block_id, "branch%u_merge", flattener->branch_id);
                cf_flattener_create_block_name(flattener, true_block_id, "branch%u_true", flattener->branch_id);
                ++flattener->branch_id;
                break;

            case VKD3DSIH_ELSE:
                if (cf_info->inside_block)
                    cf_flattener_emit_unconditional_branch(flattener, cf_info->u.if_.merge_block_id);

                cf_info->u.if_.else_block_id = cf_flattener_alloc_block_id(flattener);
                cf_info->u.if_.false_param->reg.idx[0].offset = cf_info->u.if_.else_block_id;

                cf_flattener_create_block_name(flattener,
                        cf_info->u.if_.else_block_id, "branch%u_false", cf_info->u.if_.id);
                cf_flattener_emit_label(flattener, cf_info->u.if_.else_block_id);

                cf_info->inside_block = true;
                break;

            case VKD3DSIH_ENDIF:
                if (cf_info->inside_block)
                    cf_flattener_emit_unconditional_branch(flattener, cf_info->u.if_.merge_block_id);

                cf_flattener_emit_label(flattener, cf_info->u.if_.merge_block_id);

                cf_flattener_pop_control_flow_level(flattener);
                break;

            case VKD3DSIH_LOOP:
                if (!(cf_info = cf_flattener_push_control_flow_level(flattener)))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                loop_header_block_id = cf_flattener_alloc_block_id(flattener);
                loop_body_block_id = cf_flattener_alloc_block_id(flattener);
                continue_block_id = cf_flattener_alloc_block_id(flattener);
                merge_block_id = cf_flattener_alloc_block_id(flattener);

                cf_flattener_emit_unconditional_branch(flattener, loop_header_block_id);
                cf_flattener_emit_label(flattener, loop_header_block_id);
                cf_flattener_emit_branch(flattener, merge_block_id, continue_block_id,
                        NULL, loop_body_block_id, 0, 0);

                cf_flattener_emit_label(flattener, loop_body_block_id);

                cf_info->u.loop.header_block_id = loop_header_block_id;
                cf_info->u.loop.continue_block_id = continue_block_id;
                cf_info->u.loop.merge_block_id = merge_block_id;
                cf_info->current_block = VKD3D_BLOCK_LOOP;
                cf_info->inside_block = true;

                cf_flattener_create_block_name(flattener, loop_header_block_id, "loop%u_header", flattener->loop_id);
                cf_flattener_create_block_name(flattener, loop_body_block_id, "loop%u_body", flattener->loop_id);
                cf_flattener_create_block_name(flattener, continue_block_id, "loop%u_continue", flattener->loop_id);
                cf_flattener_create_block_name(flattener, merge_block_id, "loop%u_merge", flattener->loop_id);
                ++flattener->loop_id;
                break;

            case VKD3DSIH_ENDLOOP:
                if (cf_info->inside_block)
                    cf_flattener_emit_unconditional_branch(flattener, cf_info->u.loop.continue_block_id);

                cf_flattener_emit_label(flattener, cf_info->u.loop.continue_block_id);
                cf_flattener_emit_unconditional_branch(flattener, cf_info->u.loop.header_block_id);
                cf_flattener_emit_label(flattener, cf_info->u.loop.merge_block_id);

                cf_flattener_pop_control_flow_level(flattener);
                break;

            case VKD3DSIH_SWITCH:
                if (!(cf_info = cf_flattener_push_control_flow_level(flattener)))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                merge_block_id = cf_flattener_alloc_block_id(flattener);

                cf_info->u.switch_.ins_location = flattener->instruction_count;
                cf_info->u.switch_.condition = src;

                if (!(dst_ins = cf_flattener_require_space(flattener, 1)))
                    return VKD3D_ERROR_OUT_OF_MEMORY;
                vsir_instruction_init(dst_ins, &instruction->location, VKD3DSIH_SWITCH_MONOLITHIC);
                ++flattener->instruction_count;

                cf_info->u.switch_.id = flattener->switch_id;
                cf_info->u.switch_.merge_block_id = merge_block_id;
                cf_info->u.switch_.cases = NULL;
                cf_info->u.switch_.cases_size = 0;
                cf_info->u.switch_.cases_count = 0;
                cf_info->u.switch_.default_block_id = 0;
                cf_info->inside_block = false;
                cf_info->current_block = VKD3D_BLOCK_SWITCH;

                cf_flattener_create_block_name(flattener, merge_block_id, "switch%u_merge", flattener->switch_id);
                ++flattener->switch_id;

                if (!vkd3d_array_reserve((void **)&cf_info->u.switch_.cases, &cf_info->u.switch_.cases_size,
                        10, sizeof(*cf_info->u.switch_.cases)))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                break;

            case VKD3DSIH_ENDSWITCH:
            {
                struct vkd3d_shader_src_param *src_params;
                unsigned int j;

                if (!cf_info->u.switch_.default_block_id)
                    cf_info->u.switch_.default_block_id = cf_info->u.switch_.merge_block_id;

                cf_flattener_emit_label(flattener, cf_info->u.switch_.merge_block_id);

                /* The SWITCH instruction is completed when the endswitch
                 * instruction is processed because we do not know the number
                 * of case statements or the default block id in advance.*/
                dst_ins = &flattener->instructions[cf_info->u.switch_.ins_location];
                if (!(src_params = instruction_src_params_alloc(dst_ins, cf_info->u.switch_.cases_count * 2 + 3, flattener)))
                {
                    vkd3d_free(cf_info->u.switch_.cases);
                    return VKD3D_ERROR_OUT_OF_MEMORY;
                }
                src_params[0] = *cf_info->u.switch_.condition;
                vsir_src_param_init_label(&src_params[1], cf_info->u.switch_.default_block_id);
                vsir_src_param_init_label(&src_params[2], cf_info->u.switch_.merge_block_id);
                for (j = 0; j < cf_info->u.switch_.cases_count; ++j)
                {
                    unsigned int index = j * 2 + 3;
                    vsir_src_param_init(&src_params[index], VKD3DSPR_IMMCONST, VKD3D_DATA_UINT, 0);
                    src_params[index].reg.u.immconst_u32[0] = cf_info->u.switch_.cases[j].value;
                    vsir_src_param_init_label(&src_params[index + 1], cf_info->u.switch_.cases[j].block_id);
                }
                vkd3d_free(cf_info->u.switch_.cases);

                cf_flattener_pop_control_flow_level(flattener);
                break;
            }

            case VKD3DSIH_CASE:
            {
                unsigned int label_id, value;

                if (src->swizzle != VKD3D_SHADER_SWIZZLE(X, X, X, X))
                {
                    WARN("Unexpected src swizzle %#x.\n", src->swizzle);
                    vkd3d_shader_error(message_context, &instruction->location,
                            VKD3D_SHADER_ERROR_VSIR_INVALID_SWIZZLE,
                            "The swizzle for a switch case value is not scalar X.");
                    cf_flattener_set_error(flattener, VKD3D_ERROR_INVALID_SHADER);
                }
                value = *src->reg.u.immconst_u32;

                if (!vkd3d_array_reserve((void **)&cf_info->u.switch_.cases, &cf_info->u.switch_.cases_size,
                        cf_info->u.switch_.cases_count + 1, sizeof(*cf_info->u.switch_.cases)))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                label_id = cf_flattener_alloc_block_id(flattener);
                if (cf_info->inside_block) /* fall-through */
                    cf_flattener_emit_unconditional_branch(flattener, label_id);

                cf_info->u.switch_.cases[cf_info->u.switch_.cases_count].value = value;
                cf_info->u.switch_.cases[cf_info->u.switch_.cases_count].block_id = label_id;
                ++cf_info->u.switch_.cases_count;

                cf_flattener_emit_label(flattener, label_id);
                cf_flattener_create_block_name(flattener, label_id, "switch%u_case%u", cf_info->u.switch_.id, value);
                cf_info->inside_block = true;
                break;
            }

            case VKD3DSIH_DEFAULT:
                cf_info->u.switch_.default_block_id = cf_flattener_alloc_block_id(flattener);
                if (cf_info->inside_block) /* fall-through */
                    cf_flattener_emit_unconditional_branch(flattener, cf_info->u.switch_.default_block_id);

                cf_flattener_emit_label(flattener, cf_info->u.switch_.default_block_id);

                cf_flattener_create_block_name(flattener, cf_info->u.switch_.default_block_id,
                        "switch%u_default", cf_info->u.switch_.id);
                cf_info->inside_block = true;
                break;

            case VKD3DSIH_BREAK:
            {
                struct cf_flattener_info *breakable_cf_info;

                if (!(breakable_cf_info = cf_flattener_find_innermost_breakable_cf_construct(flattener)))
                {
                    FIXME("Unhandled break instruction.\n");
                    return VKD3D_ERROR_INVALID_SHADER;
                }

                if (breakable_cf_info->current_block == VKD3D_BLOCK_LOOP)
                {
                    cf_flattener_emit_unconditional_branch(flattener, breakable_cf_info->u.loop.merge_block_id);
                }
                else if (breakable_cf_info->current_block == VKD3D_BLOCK_SWITCH)
                {
                    cf_flattener_emit_unconditional_branch(flattener, breakable_cf_info->u.switch_.merge_block_id);
                }

                cf_info->inside_block = false;
                break;
            }

            case VKD3DSIH_BREAKP:
            {
                struct cf_flattener_info *loop_cf_info;

                if (!(loop_cf_info = cf_flattener_find_innermost_loop(flattener)))
                {
                    ERR("Invalid 'breakc' instruction outside loop.\n");
                    return VKD3D_ERROR_INVALID_SHADER;
                }

                cf_flattener_emit_conditional_branch_and_merge(flattener,
                        src, loop_cf_info->u.loop.merge_block_id, instruction->flags);
                break;
            }

            case VKD3DSIH_CONTINUE:
            {
                struct cf_flattener_info *loop_cf_info;

                if (!(loop_cf_info = cf_flattener_find_innermost_loop(flattener)))
                {
                    ERR("Invalid 'continue' instruction outside loop.\n");
                    return VKD3D_ERROR_INVALID_SHADER;
                }

                cf_flattener_emit_unconditional_branch(flattener, loop_cf_info->u.loop.continue_block_id);

                cf_info->inside_block = false;
                break;
            }

            case VKD3DSIH_CONTINUEP:
            {
                struct cf_flattener_info *loop_cf_info;

                if (!(loop_cf_info = cf_flattener_find_innermost_loop(flattener)))
                {
                    ERR("Invalid 'continuec' instruction outside loop.\n");
                    return VKD3D_ERROR_INVALID_SHADER;
                }

                cf_flattener_emit_conditional_branch_and_merge(flattener,
                        src, loop_cf_info->u.loop.continue_block_id, instruction->flags);
                break;
            }

            case VKD3DSIH_RET:
                if (!cf_flattener_copy_instruction(flattener, instruction))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                if (cf_info)
                    cf_info->inside_block = false;
                else
                    main_block_open = false;
                break;

            default:
                if (!cf_flattener_copy_instruction(flattener, instruction))
                    return VKD3D_ERROR_OUT_OF_MEMORY;
                break;
        }
    }

    if (main_block_open)
    {
        if (!(dst_ins = cf_flattener_require_space(flattener, 1)))
            return VKD3D_ERROR_OUT_OF_MEMORY;
        vsir_instruction_init(dst_ins, &flattener->location, VKD3DSIH_RET);
        ++flattener->instruction_count;
    }

    return flattener->status;
}

static enum vkd3d_result vsir_program_flatten_control_flow_constructs(struct vsir_program *program,
        struct vkd3d_shader_message_context *message_context)
{
    struct cf_flattener flattener = {.program = program};
    enum vkd3d_result result;

    if ((result = cf_flattener_iterate_instruction_array(&flattener, message_context)) >= 0)
    {
        vkd3d_free(program->instructions.elements);
        program->instructions.elements = flattener.instructions;
        program->instructions.capacity = flattener.instruction_capacity;
        program->instructions.count = flattener.instruction_count;
        program->block_count = flattener.block_id;
    }
    else
    {
        vkd3d_free(flattener.instructions);
    }

    vkd3d_free(flattener.control_flow_info);
    /* Simpler to always free these in vsir_program_cleanup(). */
    program->block_names = flattener.block_names;
    program->block_name_count = flattener.block_name_count;

    return result;
}

static unsigned int label_from_src_param(const struct vkd3d_shader_src_param *param)
{
    assert(param->reg.type == VKD3DSPR_LABEL);
    return param->reg.idx[0].offset;
}

static bool reserve_instructions(struct vkd3d_shader_instruction **instructions, size_t *capacity, size_t count)
{
    if (!vkd3d_array_reserve((void **)instructions, capacity, count, sizeof(**instructions)))
    {
        ERR("Failed to allocate instructions.\n");
        return false;
    }

    return true;
}

/* A record represents replacing a jump from block `switch_label' to
 * block `target_label' with a jump from block `if_label' to block
 * `target_label'. */
struct lower_switch_to_if_ladder_block_mapping
{
    unsigned int switch_label;
    unsigned int if_label;
    unsigned int target_label;
};

static bool lower_switch_to_if_ladder_add_block_mapping(struct lower_switch_to_if_ladder_block_mapping **block_map,
        size_t *map_capacity, size_t *map_count, unsigned int switch_label, unsigned int if_label, unsigned int target_label)
{
    if (!vkd3d_array_reserve((void **)block_map, map_capacity, *map_count + 1, sizeof(**block_map)))
    {
        ERR("Failed to allocate block mapping.\n");
        return false;
    }

    (*block_map)[*map_count].switch_label = switch_label;
    (*block_map)[*map_count].if_label = if_label;
    (*block_map)[*map_count].target_label = target_label;

    *map_count += 1;

    return true;
}

static enum vkd3d_result lower_switch_to_if_ladder(struct vsir_program *program)
{
    unsigned int block_count = program->block_count, ssa_count = program->ssa_count, current_label = 0, if_label;
    size_t ins_capacity = 0, ins_count = 0, i, map_capacity = 0, map_count = 0;
    struct vkd3d_shader_instruction *instructions = NULL;
    struct lower_switch_to_if_ladder_block_mapping *block_map = NULL;

    if (!reserve_instructions(&instructions, &ins_capacity, program->instructions.count))
        goto fail;

    /* First subpass: convert SWITCH_MONOLITHIC instructions to
     * selection ladders, keeping a map between blocks before and
     * after the subpass. */
    for (i = 0; i < program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &program->instructions.elements[i];
        unsigned int case_count, j, default_label;

        switch (ins->handler_idx)
        {
            case VKD3DSIH_LABEL:
                current_label = label_from_src_param(&ins->src[0]);
                if (!reserve_instructions(&instructions, &ins_capacity, ins_count + 1))
                    goto fail;
                instructions[ins_count++] = *ins;
                continue;

            case VKD3DSIH_SWITCH_MONOLITHIC:
                break;

            default:
                if (!reserve_instructions(&instructions, &ins_capacity, ins_count + 1))
                    goto fail;
                instructions[ins_count++] = *ins;
                continue;
        }

        case_count = (ins->src_count - 3) / 2;
        default_label = label_from_src_param(&ins->src[1]);

        /* In principle we can have a switch with no cases, and we
         * just have to jump to the default label. */
        if (case_count == 0)
        {
            if (!reserve_instructions(&instructions, &ins_capacity, ins_count + 1))
                goto fail;

            if (!vsir_instruction_init_with_params(program, &instructions[ins_count],
                    &ins->location, VKD3DSIH_BRANCH, 0, 1))
                goto fail;
            vsir_src_param_init_label(&instructions[ins_count].src[0], default_label);
            ++ins_count;
        }

        if (!reserve_instructions(&instructions, &ins_capacity, ins_count + 3 * case_count - 1))
            goto fail;

        if_label = current_label;

        for (j = 0; j < case_count; ++j)
        {
            unsigned int fallthrough_label, case_label = label_from_src_param(&ins->src[3 + 2 * j + 1]);

            if (!vsir_instruction_init_with_params(program,
                    &instructions[ins_count], &ins->location, VKD3DSIH_IEQ, 1, 2))
                goto fail;
            dst_param_init_ssa_bool(&instructions[ins_count].dst[0], ssa_count);
            instructions[ins_count].src[0] = ins->src[0];
            instructions[ins_count].src[1] = ins->src[3 + 2 * j];
            ++ins_count;

            /* For all cases except the last one we fall through to
             * the following case; the last one has to jump to the
             * default label. */
            if (j == case_count - 1)
                fallthrough_label = default_label;
            else
                fallthrough_label = block_count + 1;

            if (!vsir_instruction_init_with_params(program, &instructions[ins_count],
                    &ins->location, VKD3DSIH_BRANCH, 0, 3))
                goto fail;
            src_param_init_ssa_bool(&instructions[ins_count].src[0], ssa_count);
            vsir_src_param_init_label(&instructions[ins_count].src[1], case_label);
            vsir_src_param_init_label(&instructions[ins_count].src[2], fallthrough_label);
            ++ins_count;

            ++ssa_count;

            if (!lower_switch_to_if_ladder_add_block_mapping(&block_map, &map_capacity, &map_count,
                    current_label, if_label, case_label))
                goto fail;

            if (j == case_count - 1)
            {
                if (!lower_switch_to_if_ladder_add_block_mapping(&block_map, &map_capacity, &map_count,
                        current_label, if_label, default_label))
                    goto fail;
            }
            else
            {
                if (!vsir_instruction_init_with_params(program,
                        &instructions[ins_count], &ins->location, VKD3DSIH_LABEL, 0, 1))
                    goto fail;
                vsir_src_param_init_label(&instructions[ins_count].src[0], ++block_count);
                ++ins_count;

                if_label = block_count;
            }
        }
    }

    /* Second subpass: creating new blocks might have broken
     * references in PHI instructions, so we use the block map to fix
     * them. */
    current_label = 0;
    for (i = 0; i < ins_count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &instructions[i];
        struct vkd3d_shader_src_param *new_src;
        unsigned int j, l, new_src_count = 0;

        switch (ins->handler_idx)
        {
            case VKD3DSIH_LABEL:
                current_label = label_from_src_param(&ins->src[0]);
                continue;

            case VKD3DSIH_PHI:
                break;

            default:
                continue;
        }

        /* First count how many source parameters we need. */
        for (j = 0; j < ins->src_count; j += 2)
        {
            unsigned int source_label = label_from_src_param(&ins->src[j + 1]);
            size_t k, match_count = 0;

            for (k = 0; k < map_count; ++k)
            {
                struct lower_switch_to_if_ladder_block_mapping *mapping = &block_map[k];

                if (mapping->switch_label == source_label && mapping->target_label == current_label)
                    match_count += 1;
            }

            new_src_count += (match_count != 0) ? 2 * match_count : 2;
        }

        assert(new_src_count >= ins->src_count);

        /* Allocate more source parameters if needed. */
        if (new_src_count == ins->src_count)
        {
            new_src = ins->src;
        }
        else
        {
            if (!(new_src = vsir_program_get_src_params(program, new_src_count)))
            {
                ERR("Failed to allocate %u source parameters.\n", new_src_count);
                goto fail;
            }
        }

        /* Then do the copy. */
        for (j = 0, l = 0; j < ins->src_count; j += 2)
        {
            unsigned int source_label = label_from_src_param(&ins->src[j + 1]);
            size_t k, match_count = 0;

            for (k = 0; k < map_count; ++k)
            {
                struct lower_switch_to_if_ladder_block_mapping *mapping = &block_map[k];

                if (mapping->switch_label == source_label && mapping->target_label == current_label)
                {
                    match_count += 1;

                    new_src[l] = ins->src[j];
                    new_src[l + 1] = ins->src[j + 1];
                    new_src[l + 1].reg.idx[0].offset = mapping->if_label;
                    l += 2;
                }
            }

            if (match_count == 0)
            {
                new_src[l] = ins->src[j];
                new_src[l + 1] = ins->src[j + 1];
                l += 2;
            }
        }

        assert(l == new_src_count);

        ins->src_count = new_src_count;
        ins->src = new_src;
    }

    vkd3d_free(program->instructions.elements);
    vkd3d_free(block_map);
    program->instructions.elements = instructions;
    program->instructions.capacity = ins_capacity;
    program->instructions.count = ins_count;
    program->block_count = block_count;
    program->ssa_count = ssa_count;

    return VKD3D_OK;

fail:
    vkd3d_free(instructions);
    vkd3d_free(block_map);

    return VKD3D_ERROR_OUT_OF_MEMORY;
}

static void materialize_ssas_to_temps_process_src_param(struct vsir_program *program,
        struct vkd3d_shader_src_param *src);

/* This is idempotent: it can be safely applied more than once on the
 * same register. */
static void materialize_ssas_to_temps_process_reg(struct vsir_program *program, struct vkd3d_shader_register *reg)
{
    unsigned int i;

    if (reg->type == VKD3DSPR_SSA)
    {
        reg->type = VKD3DSPR_TEMP;
        reg->idx[0].offset += program->temp_count;
    }

    for (i = 0; i < reg->idx_count; ++i)
        if (reg->idx[i].rel_addr)
            materialize_ssas_to_temps_process_src_param(program, reg->idx[i].rel_addr);
}

static void materialize_ssas_to_temps_process_dst_param(struct vsir_program *program,
        struct vkd3d_shader_dst_param *dst)
{
    materialize_ssas_to_temps_process_reg(program, &dst->reg);
}

static void materialize_ssas_to_temps_process_src_param(struct vsir_program *program,
        struct vkd3d_shader_src_param *src)
{
    materialize_ssas_to_temps_process_reg(program, &src->reg);
}

static const struct vkd3d_shader_src_param *materialize_ssas_to_temps_compute_source(struct vkd3d_shader_instruction *ins,
        unsigned int label)
{
    unsigned int i;

    assert(ins->handler_idx == VKD3DSIH_PHI);

    for (i = 0; i < ins->src_count; i += 2)
    {
        if (label_from_src_param(&ins->src[i + 1]) == label)
            return &ins->src[i];
    }

    vkd3d_unreachable();
}

static bool materialize_ssas_to_temps_synthesize_mov(struct vsir_program *program,
        struct vkd3d_shader_instruction *instruction, const struct vkd3d_shader_location *loc,
        const struct vkd3d_shader_dst_param *dest, const struct vkd3d_shader_src_param *cond,
        const struct vkd3d_shader_src_param *source, bool invert)
{
    struct vkd3d_shader_src_param *src;
    struct vkd3d_shader_dst_param *dst;

    if (!vsir_instruction_init_with_params(program, instruction, loc,
            cond ? VKD3DSIH_MOVC : VKD3DSIH_MOV, 1, cond ? 3 : 1))
        return false;

    dst = instruction->dst;
    src = instruction->src;

    dst[0] = *dest;
    materialize_ssas_to_temps_process_dst_param(program, &dst[0]);

    assert(dst[0].write_mask == VKD3DSP_WRITEMASK_0);
    assert(dst[0].modifiers == 0);
    assert(dst[0].shift == 0);

    if (cond)
    {
        src[0] = *cond;
        src[1 + invert] = *source;
        memset(&src[2 - invert], 0, sizeof(src[2 - invert]));
        src[2 - invert].reg = dst[0].reg;
        materialize_ssas_to_temps_process_src_param(program, &src[1]);
        materialize_ssas_to_temps_process_src_param(program, &src[2]);
    }
    else
    {
        src[0] = *source;
        materialize_ssas_to_temps_process_src_param(program, &src[0]);
    }

    return true;
}

static enum vkd3d_result vsir_program_materialise_ssas_to_temps(struct vsir_program *program)
{
    struct vkd3d_shader_instruction *instructions = NULL;
    struct materialize_ssas_to_temps_block_data
    {
        size_t phi_begin;
        size_t phi_count;
    } *block_index = NULL;
    size_t ins_capacity = 0, ins_count = 0, i;
    unsigned int current_label = 0;

    if (!reserve_instructions(&instructions, &ins_capacity, program->instructions.count))
        goto fail;

    if (!(block_index = vkd3d_calloc(program->block_count, sizeof(*block_index))))
    {
        ERR("Failed to allocate block index.\n");
        goto fail;
    }

    for (i = 0; i < program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &program->instructions.elements[i];

        switch (ins->handler_idx)
        {
            case VKD3DSIH_LABEL:
                current_label = label_from_src_param(&ins->src[0]);
                break;

            case VKD3DSIH_PHI:
                assert(current_label != 0);
                assert(i != 0);
                if (block_index[current_label - 1].phi_begin == 0)
                    block_index[current_label - 1].phi_begin = i;
                block_index[current_label - 1].phi_count += 1;
                break;

            default:
                current_label = 0;
                break;
        }
    }

    for (i = 0; i < program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &program->instructions.elements[i];
        size_t j;

        for (j = 0; j < ins->dst_count; ++j)
            materialize_ssas_to_temps_process_dst_param(program, &ins->dst[j]);

        for (j = 0; j < ins->src_count; ++j)
            materialize_ssas_to_temps_process_src_param(program, &ins->src[j]);

        switch (ins->handler_idx)
        {
            case VKD3DSIH_LABEL:
                current_label = label_from_src_param(&ins->src[0]);
                break;

            case VKD3DSIH_BRANCH:
            {
                if (vsir_register_is_label(&ins->src[0].reg))
                {
                    const struct materialize_ssas_to_temps_block_data *data = &block_index[label_from_src_param(&ins->src[0]) - 1];

                    if (!reserve_instructions(&instructions, &ins_capacity, ins_count + data->phi_count))
                        goto fail;

                    for (j = data->phi_begin; j < data->phi_begin + data->phi_count; ++j)
                    {
                        const struct vkd3d_shader_src_param *source;

                        source = materialize_ssas_to_temps_compute_source(&program->instructions.elements[j],
                                current_label);
                        if (!materialize_ssas_to_temps_synthesize_mov(program, &instructions[ins_count],
                                &ins->location, &program->instructions.elements[j].dst[0], NULL, source, false))
                            goto fail;

                        ++ins_count;
                    }
                }
                else
                {
                    struct materialize_ssas_to_temps_block_data *data_true = &block_index[label_from_src_param(&ins->src[1]) - 1],
                            *data_false = &block_index[label_from_src_param(&ins->src[2]) - 1];
                    const struct vkd3d_shader_src_param *cond = &ins->src[0];

                    if (!reserve_instructions(&instructions, &ins_capacity,
                            ins_count + data_true->phi_count + data_false->phi_count))
                        goto fail;

                    for (j = data_true->phi_begin; j < data_true->phi_begin + data_true->phi_count; ++j)
                    {
                        const struct vkd3d_shader_src_param *source;

                        source = materialize_ssas_to_temps_compute_source(&program->instructions.elements[j],
                                current_label);
                        if (!materialize_ssas_to_temps_synthesize_mov(program, &instructions[ins_count],
                                &ins->location, &program->instructions.elements[j].dst[0], cond, source, false))
                            goto fail;

                        ++ins_count;
                    }

                    for (j = data_false->phi_begin; j < data_false->phi_begin + data_false->phi_count; ++j)
                    {
                        const struct vkd3d_shader_src_param *source;

                        source = materialize_ssas_to_temps_compute_source(&program->instructions.elements[j],
                                current_label);
                        if (!materialize_ssas_to_temps_synthesize_mov(program, &instructions[ins_count],
                                &ins->location, &program->instructions.elements[j].dst[0], cond, source, true))
                            goto fail;

                        ++ins_count;
                    }
                }
                break;
            }

            case VKD3DSIH_PHI:
                continue;

            default:
                break;
        }

        if (!reserve_instructions(&instructions, &ins_capacity, ins_count + 1))
            goto fail;

        instructions[ins_count++] = *ins;
    }

    vkd3d_free(program->instructions.elements);
    vkd3d_free(block_index);
    program->instructions.elements = instructions;
    program->instructions.capacity = ins_capacity;
    program->instructions.count = ins_count;
    program->temp_count += program->ssa_count;
    program->ssa_count = 0;

    return VKD3D_OK;

fail:
    vkd3d_free(instructions);
    vkd3d_free(block_index);

    return VKD3D_ERROR_OUT_OF_MEMORY;
}

struct vsir_block_list
{
    struct vsir_block **blocks;
    size_t count, capacity;
};

static void vsir_block_list_init(struct vsir_block_list *list)
{
    memset(list, 0, sizeof(*list));
}

static void vsir_block_list_cleanup(struct vsir_block_list *list)
{
    vkd3d_free(list->blocks);
}

static enum vkd3d_result vsir_block_list_add_checked(struct vsir_block_list *list, struct vsir_block *block)
{
    if (!vkd3d_array_reserve((void **)&list->blocks, &list->capacity, list->count + 1, sizeof(*list->blocks)))
    {
        ERR("Cannot extend block list.\n");
        return VKD3D_ERROR_OUT_OF_MEMORY;
    }

    list->blocks[list->count++] = block;

    return VKD3D_OK;
}

static enum vkd3d_result vsir_block_list_add(struct vsir_block_list *list, struct vsir_block *block)
{
    size_t i;

    for (i = 0; i < list->count; ++i)
        if (block == list->blocks[i])
            return VKD3D_FALSE;

    return vsir_block_list_add_checked(list, block);
}

/* It is guaranteed that the relative order is kept. */
static void vsir_block_list_remove_index(struct vsir_block_list *list, size_t idx)
{
    --list->count;
    memmove(&list->blocks[idx], &list->blocks[idx + 1], (list->count - idx) * sizeof(*list->blocks));
}

struct vsir_block
{
    unsigned int label, order_pos;
    /* `begin' points to the instruction immediately following the
     * LABEL that introduces the block. `end' points to the terminator
     * instruction (either BRANCH or RET). They can coincide, meaning
     * that the block is empty. */
    struct vkd3d_shader_instruction *begin, *end;
    struct vsir_block_list predecessors, successors;
    uint32_t *dominates;
};

static enum vkd3d_result vsir_block_init(struct vsir_block *block, unsigned int label, size_t block_count)
{
    size_t byte_count;

    if (block_count > SIZE_MAX - (sizeof(*block->dominates) * CHAR_BIT - 1))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    block_count = align(block_count, sizeof(*block->dominates) * CHAR_BIT);
    byte_count = block_count / CHAR_BIT;

    assert(label);
    memset(block, 0, sizeof(*block));
    block->label = label;
    vsir_block_list_init(&block->predecessors);
    vsir_block_list_init(&block->successors);

    if (!(block->dominates = vkd3d_malloc(byte_count)))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    memset(block->dominates, 0xff, byte_count);

    return VKD3D_OK;
}

static void vsir_block_cleanup(struct vsir_block *block)
{
    if (block->label == 0)
        return;
    vsir_block_list_cleanup(&block->predecessors);
    vsir_block_list_cleanup(&block->successors);
    vkd3d_free(block->dominates);
}

static int block_compare(const void *ptr1, const void *ptr2)
{
    const struct vsir_block *block1 = *(const struct vsir_block **)ptr1;
    const struct vsir_block *block2 = *(const struct vsir_block **)ptr2;

    return vkd3d_u32_compare(block1->label, block2->label);
}

static void vsir_block_list_sort(struct vsir_block_list *list)
{
    qsort(list->blocks, list->count, sizeof(*list->blocks), block_compare);
}

static bool vsir_block_list_search(struct vsir_block_list *list, struct vsir_block *block)
{
    return !!bsearch(&block, list->blocks, list->count, sizeof(*list->blocks), block_compare);
}

struct vsir_cfg_structure_list
{
    struct vsir_cfg_structure *structures;
    size_t count, capacity;
    unsigned int end;
};

struct vsir_cfg_structure
{
    enum vsir_cfg_structure_type
    {
        /* Execute a block of the original VSIR program. */
        STRUCTURE_TYPE_BLOCK,
        /* Execute a loop, which is identified by an index. */
        STRUCTURE_TYPE_LOOP,
        /* Execute a `return' or a (possibly) multilevel `break' or
         * `continue', targeting a loop by its index. If `condition'
         * is non-NULL, then the jump is conditional (this is
         * currently not allowed for `return'). */
        STRUCTURE_TYPE_JUMP,
    } type;
    union
    {
        struct vsir_block *block;
        struct
        {
            struct vsir_cfg_structure_list body;
            unsigned idx;
        } loop;
        struct
        {
            enum vsir_cfg_jump_type
            {
                /* NONE is available as an intermediate value, but it
                 * is not allowed in valid structured programs. */
                JUMP_NONE,
                JUMP_BREAK,
                JUMP_CONTINUE,
                JUMP_RET,
            } type;
            unsigned int target;
            struct vkd3d_shader_src_param *condition;
            bool invert_condition;
        } jump;
    } u;
};

static void vsir_cfg_structure_init(struct vsir_cfg_structure *structure, enum vsir_cfg_structure_type type);
static void vsir_cfg_structure_cleanup(struct vsir_cfg_structure *structure);

static void vsir_cfg_structure_list_cleanup(struct vsir_cfg_structure_list *list)
{
    unsigned int i;

    for (i = 0; i < list->count; ++i)
        vsir_cfg_structure_cleanup(&list->structures[i]);
    vkd3d_free(list->structures);
}

static struct vsir_cfg_structure *vsir_cfg_structure_list_append(struct vsir_cfg_structure_list *list,
        enum vsir_cfg_structure_type type)
{
    struct vsir_cfg_structure *ret;

    if (!vkd3d_array_reserve((void **)&list->structures, &list->capacity, list->count + 1,
            sizeof(*list->structures)))
        return NULL;

    ret = &list->structures[list->count++];

    vsir_cfg_structure_init(ret, type);

    return ret;
}

static void vsir_cfg_structure_init(struct vsir_cfg_structure *structure, enum vsir_cfg_structure_type type)
{
    memset(structure, 0, sizeof(*structure));
    structure->type = type;
}

static void vsir_cfg_structure_cleanup(struct vsir_cfg_structure *structure)
{
    if (structure->type == STRUCTURE_TYPE_LOOP)
        vsir_cfg_structure_list_cleanup(&structure->u.loop.body);
}

struct vsir_cfg
{
    struct vkd3d_shader_message_context *message_context;
    struct vsir_program *program;
    struct vsir_block *blocks;
    struct vsir_block *entry;
    size_t block_count;
    struct vkd3d_string_buffer debug_buffer;

    struct vsir_block_list *loops;
    size_t loops_count, loops_capacity;
    size_t *loops_by_header;

    struct vsir_block_list order;
    struct cfg_loop_interval
    {
        /* `begin' is the position of the first block of the loop in
         * the topological sort; `end' is the position of the first
         * block after the loop. In other words, `begin' is where a
         * `continue' instruction would jump and `end' is where a
         * `break' instruction would jump. */
        unsigned int begin, end;
        /* Each loop interval can be natural or synthetic. Natural
         * intervals are added to represent loops given by CFG back
         * edges. Synthetic intervals do not correspond to loops in
         * the input CFG, but are added to leverage their `break'
         * instruction in order to execute forward edges.
         *
         * For a synthetic loop interval it's not really important
         * which one is the `begin' block, since we don't need to
         * execute `continue' for them. So we have some leeway for
         * moving it provided that these conditions are met: 1. the
         * interval must contain all `break' instructions that target
         * it, which in practice means that `begin' can be moved
         * backward and not forward; 2. intervals must remain properly
         * nested (for each pair of intervals, either one contains the
         * other or they are disjoint).
         *
         * Subject to these conditions, we try to reuse the same loop
         * as much as possible (if many forward edges target the same
         * block), but we still try to keep `begin' as forward as
         * possible, to keep the loop scope as small as possible. */
        bool synthetic;
    } *loop_intervals;
    size_t loop_interval_count, loop_interval_capacity;

    struct vsir_cfg_structure_list structured_program;

    struct vkd3d_shader_instruction *instructions;
    size_t ins_capacity, ins_count;
    unsigned int jump_target_temp_idx;
    unsigned int temp_count;
};

static void vsir_cfg_cleanup(struct vsir_cfg *cfg)
{
    size_t i;

    for (i = 0; i < cfg->block_count; ++i)
        vsir_block_cleanup(&cfg->blocks[i]);

    for (i = 0; i < cfg->loops_count; ++i)
        vsir_block_list_cleanup(&cfg->loops[i]);

    vsir_block_list_cleanup(&cfg->order);

    vsir_cfg_structure_list_cleanup(&cfg->structured_program);

    vkd3d_free(cfg->blocks);
    vkd3d_free(cfg->loops);
    vkd3d_free(cfg->loops_by_header);
    vkd3d_free(cfg->loop_intervals);

    if (TRACE_ON())
        vkd3d_string_buffer_cleanup(&cfg->debug_buffer);
}

static enum vkd3d_result vsir_cfg_add_loop_interval(struct vsir_cfg *cfg, unsigned int begin,
        unsigned int end, bool synthetic)
{
    struct cfg_loop_interval *interval;

    if (!vkd3d_array_reserve((void **)&cfg->loop_intervals, &cfg->loop_interval_capacity,
            cfg->loop_interval_count + 1, sizeof(*cfg->loop_intervals)))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    interval = &cfg->loop_intervals[cfg->loop_interval_count++];

    interval->begin = begin;
    interval->end = end;
    interval->synthetic = synthetic;

    return VKD3D_OK;
}

static bool vsir_block_dominates(struct vsir_block *b1, struct vsir_block *b2)
{
    return bitmap_is_set(b1->dominates, b2->label - 1);
}

static enum vkd3d_result vsir_cfg_add_edge(struct vsir_cfg *cfg, struct vsir_block *block,
        struct vkd3d_shader_src_param *successor_param)
{
    unsigned int target = label_from_src_param(successor_param);
    struct vsir_block *successor = &cfg->blocks[target - 1];
    enum vkd3d_result ret;

    assert(successor->label != 0);

    if ((ret = vsir_block_list_add(&block->successors, successor)) < 0)
        return ret;

    if ((ret = vsir_block_list_add(&successor->predecessors, block)) < 0)
        return ret;

    return VKD3D_OK;
}

static void vsir_cfg_dump_dot(struct vsir_cfg *cfg)
{
    size_t i, j;

    TRACE("digraph cfg {\n");

    for (i = 0; i < cfg->block_count; ++i)
    {
        struct vsir_block *block = &cfg->blocks[i];
        const char *shape;

        if (block->label == 0)
            continue;

        switch (block->end->handler_idx)
        {
            case VKD3DSIH_RET:
                shape = "trapezium";
                break;

            case VKD3DSIH_BRANCH:
                shape = vsir_register_is_label(&block->end->src[0].reg) ? "ellipse" : "box";
                break;

            default:
                vkd3d_unreachable();
        }

        TRACE("  n%u [label=\"%u\", shape=\"%s\"];\n", block->label, block->label, shape);

        for (j = 0; j < block->successors.count; ++j)
            TRACE("  n%u -> n%u;\n", block->label, block->successors.blocks[j]->label);
    }

    TRACE("}\n");
}

static void vsir_cfg_structure_list_dump(struct vsir_cfg *cfg, struct vsir_cfg_structure_list *list);

static void vsir_cfg_structure_dump(struct vsir_cfg *cfg, struct vsir_cfg_structure *structure)
{
    switch (structure->type)
    {
        case STRUCTURE_TYPE_BLOCK:
            TRACE("%sblock %u\n", cfg->debug_buffer.buffer, structure->u.block->label);
            break;

        case STRUCTURE_TYPE_LOOP:
            TRACE("%s%u : loop {\n", cfg->debug_buffer.buffer, structure->u.loop.idx);

            vsir_cfg_structure_list_dump(cfg, &structure->u.loop.body);

            TRACE("%s}  # %u\n", cfg->debug_buffer.buffer, structure->u.loop.idx);
            break;

        case STRUCTURE_TYPE_JUMP:
        {
            const char *type_str;

            switch (structure->u.jump.type)
            {
                case JUMP_RET:
                    TRACE("%sret\n", cfg->debug_buffer.buffer);
                    return;

                case JUMP_BREAK:
                    type_str = "break";
                    break;

                case JUMP_CONTINUE:
                    type_str = "continue";
                    break;

                default:
                    vkd3d_unreachable();
            }

            TRACE("%s%s%s %u\n", cfg->debug_buffer.buffer, type_str,
                    structure->u.jump.condition ? "c" : "", structure->u.jump.target);
            break;
        }

        default:
            vkd3d_unreachable();
    }
}

static void vsir_cfg_structure_list_dump(struct vsir_cfg *cfg, struct vsir_cfg_structure_list *list)
{
    unsigned int i;

    vkd3d_string_buffer_printf(&cfg->debug_buffer, "  ");

    for (i = 0; i < list->count; ++i)
        vsir_cfg_structure_dump(cfg, &list->structures[i]);

    vkd3d_string_buffer_truncate(&cfg->debug_buffer, cfg->debug_buffer.content_size - 2);
}

static void vsir_cfg_dump_structured_program(struct vsir_cfg *cfg)
{
    unsigned int i;

    for (i = 0; i < cfg->structured_program.count; ++i)
        vsir_cfg_structure_dump(cfg, &cfg->structured_program.structures[i]);
}

static enum vkd3d_result vsir_cfg_init(struct vsir_cfg *cfg, struct vsir_program *program,
        struct vkd3d_shader_message_context *message_context)
{
    struct vsir_block *current_block = NULL;
    enum vkd3d_result ret;
    size_t i;

    memset(cfg, 0, sizeof(*cfg));
    cfg->message_context = message_context;
    cfg->program = program;
    cfg->block_count = program->block_count;

    vsir_block_list_init(&cfg->order);

    if (!(cfg->blocks = vkd3d_calloc(cfg->block_count, sizeof(*cfg->blocks))))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    if (TRACE_ON())
        vkd3d_string_buffer_init(&cfg->debug_buffer);

    for (i = 0; i < program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *instruction = &program->instructions.elements[i];

        switch (instruction->handler_idx)
        {
            case VKD3DSIH_PHI:
            case VKD3DSIH_SWITCH_MONOLITHIC:
                vkd3d_unreachable();

            case VKD3DSIH_LABEL:
            {
                unsigned int label = label_from_src_param(&instruction->src[0]);

                assert(!current_block);
                assert(label > 0);
                assert(label <= cfg->block_count);
                current_block = &cfg->blocks[label - 1];
                assert(current_block->label == 0);
                if ((ret = vsir_block_init(current_block, label, program->block_count)) < 0)
                    goto fail;
                current_block->begin = &program->instructions.elements[i + 1];
                if (!cfg->entry)
                    cfg->entry = current_block;
                break;
            }

            case VKD3DSIH_BRANCH:
            case VKD3DSIH_RET:
                assert(current_block);
                current_block->end = instruction;
                current_block = NULL;
                break;

            default:
                break;
        }
    }

    for (i = 0; i < cfg->block_count; ++i)
    {
        struct vsir_block *block = &cfg->blocks[i];

        if (block->label == 0)
            continue;

        switch (block->end->handler_idx)
        {
            case VKD3DSIH_RET:
                break;

            case VKD3DSIH_BRANCH:
                if (vsir_register_is_label(&block->end->src[0].reg))
                {
                    if ((ret = vsir_cfg_add_edge(cfg, block, &block->end->src[0])) < 0)
                        goto fail;
                }
                else
                {
                    if ((ret = vsir_cfg_add_edge(cfg, block, &block->end->src[1])) < 0)
                        goto fail;

                    if ((ret = vsir_cfg_add_edge(cfg, block, &block->end->src[2])) < 0)
                        goto fail;
                }
                break;

            default:
                vkd3d_unreachable();
        }
    }

    if (TRACE_ON())
        vsir_cfg_dump_dot(cfg);

    return VKD3D_OK;

fail:
    vsir_cfg_cleanup(cfg);

    return ret;
}

/* Block A dominates block B if every path from the entry point to B
 * must pass through A. Naively compute the set of blocks that are
 * dominated by `reference' by running a graph visit starting from the
 * entry point (which must be the initial value of `current') and
 * avoiding `reference'. Running this for all the blocks takes
 * quadratic time: if in the future something better is sought after,
 * the standard tool seems to be the Lengauer-Tarjan algorithm. */
static void vsir_cfg_compute_dominators_recurse(struct vsir_block *current, struct vsir_block *reference)
{
    size_t i;

    assert(current->label != 0);

    if (current == reference)
        return;

    if (!bitmap_is_set(reference->dominates, current->label - 1))
        return;

    bitmap_clear(reference->dominates, current->label - 1);

    for (i = 0; i < current->successors.count; ++i)
        vsir_cfg_compute_dominators_recurse(current->successors.blocks[i], reference);
}

static void vsir_cfg_compute_dominators(struct vsir_cfg *cfg)
{
    size_t i, j;

    for (i = 0; i < cfg->block_count; ++i)
    {
        struct vsir_block *block = &cfg->blocks[i];

        if (block->label == 0)
            continue;

        vsir_cfg_compute_dominators_recurse(cfg->entry, block);

        if (TRACE_ON())
        {
            vkd3d_string_buffer_printf(&cfg->debug_buffer, "Block %u dominates:", block->label);
            for (j = 0; j < cfg->block_count; j++)
            {
                struct vsir_block *block2 = &cfg->blocks[j];

                if (block2->label == 0)
                    continue;

                if (vsir_block_dominates(block, block2))
                    vkd3d_string_buffer_printf(&cfg->debug_buffer, " %u", block2->label);
            }
            TRACE("%s\n", cfg->debug_buffer.buffer);
            vkd3d_string_buffer_clear(&cfg->debug_buffer);
        }
    }
}

/* A back edge is an edge X -> Y for which block Y dominates block
 * X. All the other edges are forward edges, and it is required that
 * the input CFG is reducible, i.e., it is acyclic once you strip away
 * the back edges.
 *
 * Each back edge X -> Y defines a loop: block X is the header block,
 * block Y is the back edge block, and the loop consists of all the
 * blocks which are dominated by the header block and have a path to
 * the back edge block that doesn't pass through the header block
 * (including the header block itself). It can be proved that all the
 * blocks in such a path (connecting a loop block to the back edge
 * block without passing through the header block) belong to the same
 * loop.
 *
 * If the input CFG is reducible its loops are properly nested (i.e.,
 * each two loops are either disjoint or one is contained in the
 * other), provided that each block has at most one incoming back
 * edge. If this condition does not hold, a synthetic block can be
 * introduced as the only back edge block for the given header block,
 * with all the previous back edge now being forward edges to the
 * synthetic block. This is not currently implemented (but it is
 * rarely found in practice anyway). */
static enum vkd3d_result vsir_cfg_scan_loop(struct vsir_block_list *loop, struct vsir_block *block,
        struct vsir_block *header)
{
    enum vkd3d_result ret;
    size_t i;

    if ((ret = vsir_block_list_add(loop, block)) < 0)
        return ret;

    if (ret == VKD3D_FALSE || block == header)
        return VKD3D_OK;

    for (i = 0; i < block->predecessors.count; ++i)
    {
        if ((ret = vsir_cfg_scan_loop(loop, block->predecessors.blocks[i], header)) < 0)
            return ret;
    }

    return VKD3D_OK;
}

static enum vkd3d_result vsir_cfg_compute_loops(struct vsir_cfg *cfg)
{
    size_t i, j, k;

    if (!(cfg->loops_by_header = vkd3d_calloc(cfg->block_count, sizeof(*cfg->loops_by_header))))
        return VKD3D_ERROR_OUT_OF_MEMORY;
    memset(cfg->loops_by_header, 0xff, cfg->block_count * sizeof(*cfg->loops_by_header));

    for (i = 0; i < cfg->block_count; ++i)
    {
        struct vsir_block *block = &cfg->blocks[i];

        if (block->label == 0)
            continue;

        for (j = 0; j < block->successors.count; ++j)
        {
            struct vsir_block *header = block->successors.blocks[j];
            struct vsir_block_list *loop;
            enum vkd3d_result ret;

            /* Is this a back edge? */
            if (!vsir_block_dominates(header, block))
                continue;

            if (!vkd3d_array_reserve((void **)&cfg->loops, &cfg->loops_capacity, cfg->loops_count + 1, sizeof(*cfg->loops)))
                return VKD3D_ERROR_OUT_OF_MEMORY;

            loop = &cfg->loops[cfg->loops_count];
            vsir_block_list_init(loop);

            if ((ret = vsir_cfg_scan_loop(loop, block, header)) < 0)
                return ret;

            vsir_block_list_sort(loop);

            if (TRACE_ON())
            {
                vkd3d_string_buffer_printf(&cfg->debug_buffer, "Back edge %u -> %u with loop:", block->label, header->label);

                for (k = 0; k < loop->count; ++k)
                    vkd3d_string_buffer_printf(&cfg->debug_buffer, " %u", loop->blocks[k]->label);

                TRACE("%s\n", cfg->debug_buffer.buffer);
                vkd3d_string_buffer_clear(&cfg->debug_buffer);
            }

            if (cfg->loops_by_header[header->label - 1] != SIZE_MAX)
            {
                FIXME("Block %u is header to more than one loop, this is not implemented.\n", header->label);
                vkd3d_shader_error(cfg->message_context, &header->begin->location, VKD3D_SHADER_ERROR_VSIR_NOT_IMPLEMENTED,
                        "Block %u is header to more than one loop, this is not implemented.", header->label);
                return VKD3D_ERROR_NOT_IMPLEMENTED;
            }

            cfg->loops_by_header[header->label - 1] = cfg->loops_count;

            ++cfg->loops_count;
        }
    }

    return VKD3D_OK;
}

struct vsir_cfg_node_sorter
{
    struct vsir_cfg *cfg;
    struct vsir_cfg_node_sorter_stack_item
    {
        struct vsir_block_list *loop;
        unsigned int seen_count;
        unsigned int begin;
    } *stack;
    size_t stack_count, stack_capacity;
    struct vsir_block_list available_blocks;
};

static enum vkd3d_result vsir_cfg_node_sorter_make_node_available(struct vsir_cfg_node_sorter *sorter, struct vsir_block *block)
{
    struct vsir_block_list *loop = NULL;
    struct vsir_cfg_node_sorter_stack_item *item;
    enum vkd3d_result ret;

    if (sorter->cfg->loops_by_header[block->label - 1] != SIZE_MAX)
        loop = &sorter->cfg->loops[sorter->cfg->loops_by_header[block->label - 1]];

    if ((ret = vsir_block_list_add_checked(&sorter->available_blocks, block)) < 0)
        return ret;

    if (!loop)
        return VKD3D_OK;

    if (!vkd3d_array_reserve((void **)&sorter->stack, &sorter->stack_capacity, sorter->stack_count + 1, sizeof(*sorter->stack)))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    item = &sorter->stack[sorter->stack_count++];
    item->loop = loop;
    item->seen_count = 0;
    item->begin = sorter->cfg->order.count;

    return VKD3D_OK;
}

/* Topologically sort the blocks according to the forward edges. By
 * definition if the input CFG is reducible then its forward edges
 * form a DAG, so a topological sorting exists. In order to compute it
 * we keep an array with the incoming degree for each block and an
 * available list of all the blocks whose incoming degree has reached
 * zero. At each step we pick a block from the available list and
 * strip it away from the graph, updating the incoming degrees and
 * available list.
 *
 * In principle at each step we can pick whatever node we want from
 * the available list, and will get a topological sort
 * anyway. However, we use these two criteria to give to the computed
 * order additional properties:
 *
 *  1. we keep track of which loops we're into, and pick blocks
 *     belonging to the current innermost loop, so that loops are kept
 *     contiguous in the order; this can always be done when the input
 *     CFG is reducible;
 *
 *  2. subject to the requirement above, we always pick the most
 *     recently added block to the available list, because this tends
 *     to keep related blocks and require fewer control flow
 *     primitives.
 */
static enum vkd3d_result vsir_cfg_sort_nodes(struct vsir_cfg *cfg)
{
    struct vsir_cfg_node_sorter sorter = { .cfg = cfg };
    unsigned int *in_degrees = NULL;
    enum vkd3d_result ret;
    size_t i;

    if (!(in_degrees = vkd3d_calloc(cfg->block_count, sizeof(*in_degrees))))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    for (i = 0; i < cfg->block_count; ++i)
    {
        struct vsir_block *block = &cfg->blocks[i];

        if (block->label == 0)
        {
            in_degrees[i] = UINT_MAX;
            continue;
        }

        in_degrees[i] = block->predecessors.count;

        /* Do not count back edges. */
        if (cfg->loops_by_header[i] != SIZE_MAX)
        {
            assert(in_degrees[i] > 0);
            in_degrees[i] -= 1;
        }

        if (in_degrees[i] == 0 && block != cfg->entry)
        {
            WARN("Unexpected entry point %u.\n", block->label);
            vkd3d_shader_error(cfg->message_context, &block->begin->location, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW,
                    "Block %u is unreachable from the entry point.", block->label);
            ret = VKD3D_ERROR_INVALID_SHADER;
            goto fail;
        }
    }

    if (in_degrees[cfg->entry->label - 1] != 0)
    {
        WARN("Entry point has %u incoming forward edges.\n", in_degrees[cfg->entry->label - 1]);
        vkd3d_shader_error(cfg->message_context, &cfg->entry->begin->location, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW,
                "The entry point block has %u incoming forward edges.", in_degrees[cfg->entry->label - 1]);
        ret = VKD3D_ERROR_INVALID_SHADER;
        goto fail;
    }

    vsir_block_list_init(&sorter.available_blocks);

    if ((ret = vsir_cfg_node_sorter_make_node_available(&sorter, cfg->entry)) < 0)
        goto fail;

    while (sorter.available_blocks.count != 0)
    {
        struct vsir_cfg_node_sorter_stack_item *inner_stack_item = NULL;
        struct vsir_block *block;
        size_t new_seen_count;

        if (sorter.stack_count != 0)
            inner_stack_item = &sorter.stack[sorter.stack_count - 1];

        for (i = sorter.available_blocks.count - 1; ; --i)
        {
            if (i == SIZE_MAX)
            {
                ERR("Couldn't find any viable next block, is the input CFG reducible?\n");
                ret = VKD3D_ERROR_INVALID_SHADER;
                goto fail;
            }

            block = sorter.available_blocks.blocks[i];

            if (!inner_stack_item || vsir_block_list_search(inner_stack_item->loop, block))
                break;
        }

        vsir_block_list_remove_index(&sorter.available_blocks, i);
        block->order_pos = cfg->order.count;
        if ((ret = vsir_block_list_add_checked(&cfg->order, block)) < 0)
            goto fail;

        /* Close loops: since each loop is a strict subset of any
         * outer loop, we just need to track how many blocks we've
         * seen; when I close a loop I mark the same number of seen
         * blocks for the next outer loop. */
        new_seen_count = 1;
        while (sorter.stack_count != 0)
        {
            inner_stack_item = &sorter.stack[sorter.stack_count - 1];

            inner_stack_item->seen_count += new_seen_count;

            assert(inner_stack_item->seen_count <= inner_stack_item->loop->count);
            if (inner_stack_item->seen_count != inner_stack_item->loop->count)
                break;

            if ((ret = vsir_cfg_add_loop_interval(cfg, inner_stack_item->begin,
                    cfg->order.count, false)) < 0)
                goto fail;

            new_seen_count = inner_stack_item->loop->count;
            --sorter.stack_count;
        }

        /* Remove (forward) edges and make new nodes available. */
        for (i = 0; i < block->successors.count; ++i)
        {
            struct vsir_block *successor = block->successors.blocks[i];

            if (vsir_block_dominates(successor, block))
                continue;

            assert(in_degrees[successor->label - 1] > 0);
            --in_degrees[successor->label - 1];

            if (in_degrees[successor->label - 1] == 0)
            {
                if ((ret = vsir_cfg_node_sorter_make_node_available(&sorter, successor)) < 0)
                    goto fail;
            }
        }
    }

    if (cfg->order.count != cfg->block_count)
    {
        /* There is a cycle of forward edges. */
        WARN("The control flow graph is not reducible.\n");
        vkd3d_shader_error(cfg->message_context, &cfg->entry->begin->location, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW,
                "The control flow graph is not reducible.");
        ret = VKD3D_ERROR_INVALID_SHADER;
        goto fail;
    }

    assert(sorter.stack_count == 0);

    vkd3d_free(in_degrees);
    vkd3d_free(sorter.stack);
    vsir_block_list_cleanup(&sorter.available_blocks);

    if (TRACE_ON())
    {
        vkd3d_string_buffer_printf(&cfg->debug_buffer, "Block order:");

        for (i = 0; i < cfg->order.count; ++i)
            vkd3d_string_buffer_printf(&cfg->debug_buffer, " %u", cfg->order.blocks[i]->label);

        TRACE("%s\n", cfg->debug_buffer.buffer);
        vkd3d_string_buffer_clear(&cfg->debug_buffer);
    }

    return VKD3D_OK;

fail:
    vkd3d_free(in_degrees);
    vkd3d_free(sorter.stack);
    vsir_block_list_cleanup(&sorter.available_blocks);

    return ret;
}

/* Sort loop intervals first by ascending begin time and then by
 * descending end time, so that inner intervals appear after outer
 * ones and disjoint intervals appear in their proper order. */
static int compare_loop_intervals(const void *ptr1, const void *ptr2)
{
    const struct cfg_loop_interval *interval1 = ptr1;
    const struct cfg_loop_interval *interval2 = ptr2;

    if (interval1->begin != interval2->begin)
        return vkd3d_u32_compare(interval1->begin, interval2->begin);

    return -vkd3d_u32_compare(interval1->end, interval2->end);
}

static enum vkd3d_result vsir_cfg_generate_synthetic_loop_intervals(struct vsir_cfg *cfg)
{
    enum vkd3d_result ret;
    size_t i, j, k;

    for (i = 0; i < cfg->block_count; ++i)
    {
        struct vsir_block *block = &cfg->blocks[i];

        if (block->label == 0)
            continue;

        for (j = 0; j < block->successors.count; ++j)
        {
            struct vsir_block *successor = block->successors.blocks[j];
            struct cfg_loop_interval *extend = NULL;
            unsigned int begin;
            enum
            {
                ACTION_DO_NOTHING,
                ACTION_CREATE_NEW,
                ACTION_EXTEND,
            } action = ACTION_CREATE_NEW;

            /* We've already contructed loop intervals for the back
             * edges, there's nothing more to do. */
            if (vsir_block_dominates(successor, block))
                continue;

            assert(block->order_pos < successor->order_pos);

            /* Jumping from a block to the following one is always
             * possible, so nothing to do. */
            if (block->order_pos + 1 == successor->order_pos)
                continue;

            /* Let's look for a loop interval that already breaks at
             * `successor' and either contains or can be extended to
             * contain `block'. */
            for (k = 0; k < cfg->loop_interval_count; ++k)
            {
                struct cfg_loop_interval *interval = &cfg->loop_intervals[k];

                if (interval->end != successor->order_pos)
                    continue;

                if (interval->begin <= block->order_pos)
                {
                    action = ACTION_DO_NOTHING;
                    break;
                }

                if (interval->synthetic)
                {
                    action = ACTION_EXTEND;
                    extend = interval;
                    break;
                }
            }

            if (action == ACTION_DO_NOTHING)
                continue;

            /* Ok, we have to decide where the new or replacing
             * interval has to begin. These are the rules: 1. it must
             * begin before `block'; 2. intervals must be properly
             * nested; 3. the new interval should begin as late as
             * possible, to limit control flow depth and extension. */
            begin = block->order_pos;

            /* Our candidate interval is always [begin,
             * successor->order_pos), and we move `begin' backward
             * until the candidate interval contains all the intervals
             * whose endpoint lies in the candidate interval
             * itself. */
            for (k = 0; k < cfg->loop_interval_count; ++k)
            {
                struct cfg_loop_interval *interval = &cfg->loop_intervals[k];

                if (begin < interval->end && interval->end < successor->order_pos)
                    begin = min(begin, interval->begin);
            }

            /* New we have to care about the intervals whose begin
             * point lies in the candidate interval. We cannot move
             * the candidate interval endpoint, because it is
             * important that the loop break target matches
             * `successor'. So we have to move that interval's begin
             * point to the begin point of the candidate interval,
             * i.e. `begin'. But what if the interval we should extend
             * backward is not synthetic? This cannot happen,
             * fortunately, because it would mean that there is a jump
             * entering a loop via a block which is not the loop
             * header, so the CFG would not be reducible. */
            for (k = 0; k < cfg->loop_interval_count; ++k)
            {
                struct cfg_loop_interval *interval = &cfg->loop_intervals[k];

                if (interval->begin < successor->order_pos && successor->order_pos < interval->end)
                {
                    if (interval->synthetic)
                        interval->begin = min(begin, interval->begin);
                    assert(begin >= interval->begin);
                }
            }

            if (action == ACTION_EXTEND)
                extend->begin = begin;
            else if ((ret = vsir_cfg_add_loop_interval(cfg, begin, successor->order_pos, true)) < 0)
                return ret;
        }
    }

    qsort(cfg->loop_intervals, cfg->loop_interval_count, sizeof(*cfg->loop_intervals), compare_loop_intervals);

    if (TRACE_ON())
        for (i = 0; i < cfg->loop_interval_count; ++i)
            TRACE("%s loop interval %u - %u\n", cfg->loop_intervals[i].synthetic ? "Synthetic" : "Natural",
                    cfg->loop_intervals[i].begin, cfg->loop_intervals[i].end);

    return VKD3D_OK;
}

struct vsir_cfg_edge_action
{
    enum vsir_cfg_jump_type jump_type;
    unsigned int target;
    struct vsir_block *successor;
};

static void vsir_cfg_compute_edge_action(struct vsir_cfg *cfg, struct vsir_block *block,
        struct vsir_block *successor, struct vsir_cfg_edge_action *action)
{
    unsigned int i;

    action->target = UINT_MAX;
    action->successor = successor;

    if (successor->order_pos <= block->order_pos)
    {
        /* The successor is before the current block, so we have to
         * use `continue'. The target loop is the innermost that
         * contains the current block and has the successor as
         * `continue' target. */
        for (i = 0; i < cfg->loop_interval_count; ++i)
        {
            struct cfg_loop_interval *interval = &cfg->loop_intervals[i];

            if (interval->begin == successor->order_pos && block->order_pos < interval->end)
                action->target = i;

            if (interval->begin > successor->order_pos)
                break;
        }

        assert(action->target != UINT_MAX);
        action->jump_type = JUMP_CONTINUE;
    }
    else
    {
        /* The successor is after the current block, so we have to use
         * `break', or possibly just jump to the following block. The
         * target loop is the outermost that contains the current
         * block and has the successor as `break' target. */
        for (i = 0; i < cfg->loop_interval_count; ++i)
        {
            struct cfg_loop_interval *interval = &cfg->loop_intervals[i];

            if (interval->begin <= block->order_pos && interval->end == successor->order_pos)
            {
                action->target = i;
                break;
            }
        }

        if (action->target == UINT_MAX)
        {
            assert(successor->order_pos == block->order_pos + 1);
            action->jump_type = JUMP_NONE;
        }
        else
        {
            action->jump_type = JUMP_BREAK;
        }
    }
}

static enum vkd3d_result vsir_cfg_build_structured_program(struct vsir_cfg *cfg)
{
    unsigned int i, stack_depth = 1, open_interval_idx = 0;
    struct vsir_cfg_structure_list **stack = NULL;

    /* It's enough to allocate up to the maximum interval stacking
     * depth (plus one for the full program), but this is simpler. */
    if (!(stack = vkd3d_calloc(cfg->loop_interval_count + 1, sizeof(*stack))))
        goto fail;
    cfg->structured_program.end = cfg->order.count;
    stack[0] = &cfg->structured_program;

    for (i = 0; i < cfg->order.count; ++i)
    {
        struct vsir_block *block = cfg->order.blocks[i];
        struct vsir_cfg_structure *structure;

        assert(stack_depth > 0);

        /* Open loop intervals. */
        while (open_interval_idx < cfg->loop_interval_count)
        {
            struct cfg_loop_interval *interval = &cfg->loop_intervals[open_interval_idx];

            if (interval->begin != i)
                break;

            if (!(structure = vsir_cfg_structure_list_append(stack[stack_depth - 1], STRUCTURE_TYPE_LOOP)))
                goto fail;
            structure->u.loop.idx = open_interval_idx++;

            structure->u.loop.body.end = interval->end;
            stack[stack_depth++] = &structure->u.loop.body;
        }

        /* Execute the block. */
        if (!(structure = vsir_cfg_structure_list_append(stack[stack_depth - 1], STRUCTURE_TYPE_BLOCK)))
            goto fail;
        structure->u.block = block;

        /* Generate between zero and two jump instructions. */
        switch (block->end->handler_idx)
        {
            case VKD3DSIH_BRANCH:
            {
                struct vsir_cfg_edge_action action_true, action_false;
                bool invert_condition = false;

                if (vsir_register_is_label(&block->end->src[0].reg))
                {
                    unsigned int target = label_from_src_param(&block->end->src[0]);
                    struct vsir_block *successor = &cfg->blocks[target - 1];

                    vsir_cfg_compute_edge_action(cfg, block, successor, &action_true);
                    action_false = action_true;
                }
                else
                {
                    unsigned int target = label_from_src_param(&block->end->src[1]);
                    struct vsir_block *successor = &cfg->blocks[target - 1];

                    vsir_cfg_compute_edge_action(cfg, block, successor, &action_true);

                    target = label_from_src_param(&block->end->src[2]);
                    successor = &cfg->blocks[target - 1];

                    vsir_cfg_compute_edge_action(cfg, block, successor, &action_false);
                }

                /* This will happen if the branch is unconditional,
                 * but also if it's conditional with the same target
                 * in both branches, which can happen in some corner
                 * cases, e.g. when converting switch instructions to
                 * selection ladders. */
                if (action_true.successor == action_false.successor)
                {
                    assert(action_true.jump_type == action_false.jump_type);
                }
                else
                {
                    /* At most one branch can just fall through to the
                     * next block, in which case we make sure it's the
                     * false branch. */
                    if (action_true.jump_type == JUMP_NONE)
                    {
                        struct vsir_cfg_edge_action tmp = action_true;
                        action_true = action_false;
                        action_false = tmp;
                        invert_condition = true;
                    }

                    assert(action_true.jump_type != JUMP_NONE);

                    if (!(structure = vsir_cfg_structure_list_append(stack[stack_depth - 1], STRUCTURE_TYPE_JUMP)))
                        goto fail;
                    structure->u.jump.type = action_true.jump_type;
                    structure->u.jump.target = action_true.target;
                    structure->u.jump.condition = &block->end->src[0];
                    structure->u.jump.invert_condition = invert_condition;
                }

                if (action_false.jump_type != JUMP_NONE)
                {
                    if (!(structure = vsir_cfg_structure_list_append(stack[stack_depth - 1], STRUCTURE_TYPE_JUMP)))
                        goto fail;
                    structure->u.jump.type = action_false.jump_type;
                    structure->u.jump.target = action_false.target;
                }
                break;
            }

            case VKD3DSIH_RET:
                if (!(structure = vsir_cfg_structure_list_append(stack[stack_depth - 1], STRUCTURE_TYPE_JUMP)))
                    goto fail;
                structure->u.jump.type = JUMP_RET;
                break;

            default:
                vkd3d_unreachable();
        }

        /* Close loop intervals. */
        while (stack_depth > 0)
        {
            if (stack[stack_depth - 1]->end != i + 1)
                break;

            --stack_depth;
        }
    }

    assert(stack_depth == 0);
    assert(open_interval_idx == cfg->loop_interval_count);

    if (TRACE_ON())
        vsir_cfg_dump_structured_program(cfg);

    vkd3d_free(stack);

    return VKD3D_OK;

fail:
    vkd3d_free(stack);

    return VKD3D_ERROR_OUT_OF_MEMORY;
}

static enum vkd3d_result vsir_cfg_structure_list_emit(struct vsir_cfg *cfg,
        struct vsir_cfg_structure_list *list, unsigned int loop_idx)
{
    const struct vkd3d_shader_location no_loc = {0};
    enum vkd3d_result ret;
    size_t i;

    for (i = 0; i < list->count; ++i)
    {
        struct vsir_cfg_structure *structure = &list->structures[i];

        switch (structure->type)
        {
            case STRUCTURE_TYPE_BLOCK:
            {
                struct vsir_block *block = structure->u.block;

                if (!reserve_instructions(&cfg->instructions, &cfg->ins_capacity, cfg->ins_count + (block->end - block->begin)))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                memcpy(&cfg->instructions[cfg->ins_count], block->begin, (char *)block->end - (char *)block->begin);

                cfg->ins_count += block->end - block->begin;
                break;
            }

            case STRUCTURE_TYPE_LOOP:
            {
                if (!reserve_instructions(&cfg->instructions, &cfg->ins_capacity, cfg->ins_count + 1))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                vsir_instruction_init(&cfg->instructions[cfg->ins_count++], &no_loc, VKD3DSIH_LOOP);

                if ((ret = vsir_cfg_structure_list_emit(cfg, &structure->u.loop.body, structure->u.loop.idx)) < 0)
                    return ret;

                if (!reserve_instructions(&cfg->instructions, &cfg->ins_capacity, cfg->ins_count + 5))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                vsir_instruction_init(&cfg->instructions[cfg->ins_count++], &no_loc, VKD3DSIH_ENDLOOP);

                /* Add a trampoline to implement multilevel jumping depending on the stored
                 * jump_target value. */
                if (loop_idx != UINT_MAX)
                {
                    /* If the multilevel jump is a `continue' and the target is the loop we're inside
                     * right now, then we can finally do the `continue'. */
                    const unsigned int outer_continue_target = loop_idx << 1 | 1;
                    /* If the multilevel jump is a `continue' to any other target, or if it is a `break'
                     * and the target is not the loop we just finished emitting, then it means that
                     * we have to reach an outer loop, so we keep breaking. */
                    const unsigned int inner_break_target = structure->u.loop.idx << 1;

                    if (!vsir_instruction_init_with_params(cfg->program, &cfg->instructions[cfg->ins_count],
                            &no_loc, VKD3DSIH_IEQ, 1, 2))
                        return VKD3D_ERROR_OUT_OF_MEMORY;

                    dst_param_init_temp_bool(&cfg->instructions[cfg->ins_count].dst[0], cfg->temp_count);
                    src_param_init_temp_uint(&cfg->instructions[cfg->ins_count].src[0], cfg->jump_target_temp_idx);
                    src_param_init_const_uint(&cfg->instructions[cfg->ins_count].src[1], outer_continue_target);

                    ++cfg->ins_count;

                    if (!vsir_instruction_init_with_params(cfg->program, &cfg->instructions[cfg->ins_count],
                            &no_loc, VKD3DSIH_CONTINUEP, 0, 1))
                        return VKD3D_ERROR_OUT_OF_MEMORY;

                    src_param_init_temp_bool(&cfg->instructions[cfg->ins_count].src[0], cfg->temp_count);

                    ++cfg->ins_count;
                    ++cfg->temp_count;

                    if (!vsir_instruction_init_with_params(cfg->program, &cfg->instructions[cfg->ins_count],
                            &no_loc, VKD3DSIH_IEQ, 1, 2))
                        return VKD3D_ERROR_OUT_OF_MEMORY;

                    dst_param_init_temp_bool(&cfg->instructions[cfg->ins_count].dst[0], cfg->temp_count);
                    src_param_init_temp_uint(&cfg->instructions[cfg->ins_count].src[0], cfg->jump_target_temp_idx);
                    src_param_init_const_uint(&cfg->instructions[cfg->ins_count].src[1], inner_break_target);

                    ++cfg->ins_count;

                    if (!vsir_instruction_init_with_params(cfg->program, &cfg->instructions[cfg->ins_count],
                            &no_loc, VKD3DSIH_BREAKP, 0, 1))
                        return VKD3D_ERROR_OUT_OF_MEMORY;
                    cfg->instructions[cfg->ins_count].flags |= VKD3D_SHADER_CONDITIONAL_OP_Z;

                    src_param_init_temp_bool(&cfg->instructions[cfg->ins_count].src[0], cfg->temp_count);

                    ++cfg->ins_count;
                    ++cfg->temp_count;
                }

                break;
            }

            case STRUCTURE_TYPE_JUMP:
            {
                /* Encode the jump target as the loop index plus a bit to remember whether
                 * we're breaking or continueing. */
                unsigned int jump_target = structure->u.jump.target << 1;
                enum vkd3d_shader_opcode opcode;

                switch (structure->u.jump.type)
                {
                    case JUMP_CONTINUE:
                        /* If we're continueing the loop we're directly inside, then we can emit a
                         * `continue'. Otherwise we first have to break all the loops between here
                         * and the loop to continue, recording our intention to continue
                         * in the lowest bit of jump_target. */
                        if (structure->u.jump.target == loop_idx)
                        {
                            opcode = structure->u.jump.condition ? VKD3DSIH_CONTINUEP : VKD3DSIH_CONTINUE;
                            break;
                        }
                        jump_target |= 1;
                        /* fall through */

                    case JUMP_BREAK:
                        opcode = structure->u.jump.condition ? VKD3DSIH_BREAKP : VKD3DSIH_BREAK;
                        break;

                    case JUMP_RET:
                        assert(!structure->u.jump.condition);
                        opcode = VKD3DSIH_RET;
                        break;

                    default:
                        vkd3d_unreachable();
                }

                if (!reserve_instructions(&cfg->instructions, &cfg->ins_capacity, cfg->ins_count + 2))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                if (opcode == VKD3DSIH_BREAK || opcode == VKD3DSIH_BREAKP)
                {
                    if (!vsir_instruction_init_with_params(cfg->program, &cfg->instructions[cfg->ins_count],
                            &no_loc, VKD3DSIH_MOV, 1, 1))
                        return VKD3D_ERROR_OUT_OF_MEMORY;

                    dst_param_init_temp_uint(&cfg->instructions[cfg->ins_count].dst[0], cfg->jump_target_temp_idx);
                    src_param_init_const_uint(&cfg->instructions[cfg->ins_count].src[0], jump_target);

                    ++cfg->ins_count;
                }

                if (!vsir_instruction_init_with_params(cfg->program, &cfg->instructions[cfg->ins_count],
                        &no_loc, opcode, 0, !!structure->u.jump.condition))
                    return VKD3D_ERROR_OUT_OF_MEMORY;

                if (structure->u.jump.invert_condition)
                    cfg->instructions[cfg->ins_count].flags |= VKD3D_SHADER_CONDITIONAL_OP_Z;

                if (structure->u.jump.condition)
                    cfg->instructions[cfg->ins_count].src[0] = *structure->u.jump.condition;

                ++cfg->ins_count;
                break;
            }

            default:
                vkd3d_unreachable();
        }
    }

    return VKD3D_OK;
}

static enum vkd3d_result vsir_cfg_emit_structured_program(struct vsir_cfg *cfg)
{
    enum vkd3d_result ret;
    size_t i;

    cfg->jump_target_temp_idx = cfg->program->temp_count;
    cfg->temp_count = cfg->program->temp_count + 1;

    if (!reserve_instructions(&cfg->instructions, &cfg->ins_capacity, cfg->program->instructions.count))
        return VKD3D_ERROR_OUT_OF_MEMORY;

    /* Copy declarations until the first block. */
    for (i = 0; i < cfg->program->instructions.count; ++i)
    {
        struct vkd3d_shader_instruction *ins = &cfg->program->instructions.elements[i];

        if (ins->handler_idx == VKD3DSIH_LABEL)
            break;

        cfg->instructions[cfg->ins_count++] = *ins;
    }

    if ((ret = vsir_cfg_structure_list_emit(cfg, &cfg->structured_program, UINT_MAX)) < 0)
        goto fail;

    vkd3d_free(cfg->program->instructions.elements);
    cfg->program->instructions.elements = cfg->instructions;
    cfg->program->instructions.capacity = cfg->ins_capacity;
    cfg->program->instructions.count = cfg->ins_count;
    cfg->program->temp_count = cfg->temp_count;

    return VKD3D_OK;

fail:
    vkd3d_free(cfg->instructions);

    return ret;
}

enum vkd3d_result vsir_program_normalise(struct vsir_program *program, uint64_t config_flags,
        const struct vkd3d_shader_compile_info *compile_info, struct vkd3d_shader_message_context *message_context)
{
    enum vkd3d_result result = VKD3D_OK;

    remove_dcl_temps(program);

    if ((result = vsir_program_lower_texkills(program)) < 0)
        return result;

    if (program->shader_version.major >= 6)
    {
        struct vsir_cfg cfg;

        if ((result = lower_switch_to_if_ladder(program)) < 0)
            return result;

        if ((result = vsir_program_materialise_ssas_to_temps(program)) < 0)
            return result;

        if ((result = vsir_cfg_init(&cfg, program, message_context)) < 0)
            return result;

        vsir_cfg_compute_dominators(&cfg);

        if ((result = vsir_cfg_compute_loops(&cfg)) < 0)
        {
            vsir_cfg_cleanup(&cfg);
            return result;
        }

        if ((result = vsir_cfg_sort_nodes(&cfg)) < 0)
        {
            vsir_cfg_cleanup(&cfg);
            return result;
        }

        if ((result = vsir_cfg_generate_synthetic_loop_intervals(&cfg)) < 0)
        {
            vsir_cfg_cleanup(&cfg);
            return result;
        }

        if ((result = vsir_cfg_build_structured_program(&cfg)) < 0)
        {
            vsir_cfg_cleanup(&cfg);
            return result;
        }

        if ((result = vsir_cfg_emit_structured_program(&cfg)) < 0)
        {
            vsir_cfg_cleanup(&cfg);
            return result;
        }

        vsir_cfg_cleanup(&cfg);
    }
    else
    {
        if (program->shader_version.type != VKD3D_SHADER_TYPE_PIXEL)
        {
            if ((result = vsir_program_remap_output_signature(program, compile_info, message_context)) < 0)
                return result;
        }

        if (program->shader_version.type == VKD3D_SHADER_TYPE_HULL)
        {
            if ((result = instruction_array_flatten_hull_shader_phases(&program->instructions)) < 0)
                return result;

            if ((result = instruction_array_normalise_hull_shader_control_point_io(&program->instructions,
                    &program->input_signature)) < 0)
                return result;
        }

        if ((result = vsir_program_normalise_io_registers(program)) < 0)
            return result;

        if ((result = instruction_array_normalise_flat_constants(program)) < 0)
            return result;

        remove_dead_code(program);

        if ((result = vsir_program_normalise_combined_samplers(program, message_context)) < 0)
            return result;
    }

    if ((result = vsir_program_flatten_control_flow_constructs(program, message_context)) < 0)
        return result;

    if (TRACE_ON())
        vkd3d_shader_trace(program);

    if ((result = vsir_program_validate(program, config_flags,
            compile_info->source_name, message_context)) < 0)
        return result;

    return result;
}

struct validation_context
{
    struct vkd3d_shader_message_context *message_context;
    const struct vsir_program *program;
    size_t instruction_idx;
    struct vkd3d_shader_location null_location;
    bool invalid_instruction_idx;
    enum vkd3d_result status;
    bool dcl_temps_found;
    enum vkd3d_shader_opcode phase;
    enum cf_type
    {
        CF_TYPE_UNKNOWN = 0,
        CF_TYPE_STRUCTURED,
        CF_TYPE_BLOCKS,
    } cf_type;
    bool inside_block;

    struct validation_context_temp_data
    {
        enum vsir_dimension dimension;
        size_t first_seen;
    } *temps;

    struct validation_context_ssa_data
    {
        enum vsir_dimension dimension;
        enum vkd3d_data_type data_type;
        size_t first_seen;
        uint32_t write_mask;
        uint32_t read_mask;
        size_t first_assigned;
    } *ssas;

    enum vkd3d_shader_opcode *blocks;
    size_t depth;
    size_t blocks_capacity;
};

static void VKD3D_PRINTF_FUNC(3, 4) validator_error(struct validation_context *ctx,
        enum vkd3d_shader_error error, const char *format, ...)
{
    struct vkd3d_string_buffer buf;
    va_list args;

    vkd3d_string_buffer_init(&buf);

    va_start(args, format);
    vkd3d_string_buffer_vprintf(&buf, format, args);
    va_end(args);

    if (ctx->invalid_instruction_idx)
    {
        vkd3d_shader_error(ctx->message_context, &ctx->null_location, error, "%s", buf.buffer);
        ERR("VSIR validation error: %s\n", buf.buffer);
    }
    else
    {
        const struct vkd3d_shader_instruction *ins = &ctx->program->instructions.elements[ctx->instruction_idx];
        vkd3d_shader_error(ctx->message_context, &ins->location, error,
                "instruction %zu: %s", ctx->instruction_idx + 1, buf.buffer);
        ERR("VSIR validation error: instruction %zu: %s\n", ctx->instruction_idx + 1, buf.buffer);
    }

    vkd3d_string_buffer_cleanup(&buf);

    if (!ctx->status)
        ctx->status = VKD3D_ERROR_INVALID_SHADER;
}

static void vsir_validate_src_param(struct validation_context *ctx,
        const struct vkd3d_shader_src_param *src);

static void vsir_validate_register(struct validation_context *ctx,
        const struct vkd3d_shader_register *reg)
{
    unsigned int i;

    if (reg->type >= VKD3DSPR_COUNT)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE, "Invalid register type %#x.",
                reg->type);

    if (reg->precision >= VKD3D_SHADER_REGISTER_PRECISION_COUNT)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_PRECISION, "Invalid register precision %#x.",
                reg->precision);

    if (reg->data_type >= VKD3D_DATA_COUNT)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DATA_TYPE, "Invalid register data type %#x.",
                reg->data_type);

    if (reg->dimension >= VSIR_DIMENSION_COUNT)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DIMENSION, "Invalid register dimension %#x.",
                reg->dimension);

    if (reg->idx_count > ARRAY_SIZE(reg->idx))
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX_COUNT, "Invalid register index count %u.",
                reg->idx_count);

    for (i = 0; i < min(reg->idx_count, ARRAY_SIZE(reg->idx)); ++i)
    {
        const struct vkd3d_shader_src_param *param = reg->idx[i].rel_addr;
        if (reg->idx[i].rel_addr)
            vsir_validate_src_param(ctx, param);
    }

    switch (reg->type)
    {
        case VKD3DSPR_TEMP:
        {
            struct validation_context_temp_data *data;

            if (reg->idx_count != 1)
            {
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX_COUNT, "Invalid index count %u for a TEMP register.",
                        reg->idx_count);
                break;
            }

            if (reg->idx[0].rel_addr)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX, "Non-NULL relative address for a TEMP register.");

            if (reg->idx[0].offset >= ctx->program->temp_count)
            {
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX, "TEMP register index %u exceeds the maximum count %u.",
                        reg->idx[0].offset, ctx->program->temp_count);
                break;
            }

            data = &ctx->temps[reg->idx[0].offset];

            if (reg->dimension == VSIR_DIMENSION_NONE)
            {
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DIMENSION, "Invalid dimension NONE for a TEMP register.");
                break;
            }

            /* TEMP registers can be scalar or vec4, provided that
             * each individual register always appears with the same
             * dimension. */
            if (data->dimension == VSIR_DIMENSION_NONE)
            {
                data->dimension = reg->dimension;
                data->first_seen = ctx->instruction_idx;
            }
            else if (data->dimension != reg->dimension)
            {
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DIMENSION, "Invalid dimension %#x for a TEMP register: "
                        "it has already been seen with dimension %#x at instruction %zu.",
                        reg->dimension, data->dimension, data->first_seen);
            }
            break;
        }

        case VKD3DSPR_SSA:
        {
            struct validation_context_ssa_data *data;

            if (reg->idx_count != 1)
            {
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX_COUNT, "Invalid index count %u for a SSA register.",
                        reg->idx_count);
                break;
            }

            if (reg->idx[0].rel_addr)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX, "Non-NULL relative address for a SSA register.");

            if (reg->idx[0].offset >= ctx->program->ssa_count)
            {
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX,
                        "SSA register index %u exceeds the maximum count %u.",
                        reg->idx[0].offset, ctx->program->ssa_count);
                break;
            }

            data = &ctx->ssas[reg->idx[0].offset];

            if (reg->dimension == VSIR_DIMENSION_NONE)
            {
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DIMENSION, "Invalid dimension NONE for a SSA register.");
                break;
            }

            /* SSA registers can be scalar or vec4, provided that each
             * individual register always appears with the same
             * dimension. */
            if (data->dimension == VSIR_DIMENSION_NONE)
            {
                data->dimension = reg->dimension;
                data->data_type = reg->data_type;
                data->first_seen = ctx->instruction_idx;
            }
            else
            {
                if (data->dimension != reg->dimension)
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DIMENSION, "Invalid dimension %#x for a SSA register: "
                            "it has already been seen with dimension %#x at instruction %zu.",
                            reg->dimension, data->dimension, data->first_seen);

                if (data_type_is_64_bit(data->data_type) != data_type_is_64_bit(reg->data_type))
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DATA_TYPE, "Invalid data type %#x for a SSA register: "
                            "it has already been seen with data type %#x at instruction %zu.",
                            reg->data_type, data->data_type, data->first_seen);
            }
            break;
        }

        case VKD3DSPR_LABEL:
            if (reg->precision != VKD3D_SHADER_REGISTER_PRECISION_DEFAULT)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_PRECISION, "Invalid precision %#x for a LABEL register.",
                        reg->precision);

            if (reg->data_type != VKD3D_DATA_UNUSED)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DATA_TYPE, "Invalid data type %#x for a LABEL register.",
                        reg->data_type);

            if (reg->dimension != VSIR_DIMENSION_NONE)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DIMENSION, "Invalid dimension %#x for a LABEL register.",
                        reg->dimension);

            if (reg->idx_count != 1)
            {
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX_COUNT, "Invalid index count %u for a LABEL register.",
                        reg->idx_count);
                break;
            }

            if (reg->idx[0].rel_addr)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX, "Non-NULL relative address for a LABEL register.");

            /* Index == 0 is invalid, but it is temporarily allowed
             * for intermediate stages. Once we support validation
             * dialects we can selectively check for that. */
            if (reg->idx[0].offset > ctx->program->block_count)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX,
                        "LABEL register index %u exceeds the maximum count %u.",
                        reg->idx[0].offset, ctx->program->block_count);
            break;

        case VKD3DSPR_NULL:
            if (reg->idx_count != 0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX_COUNT, "Invalid index count %u for a NULL register.",
                        reg->idx_count);
            break;

        case VKD3DSPR_IMMCONST:
            if (reg->idx_count != 0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX_COUNT, "Invalid index count %u for a IMMCONST register.",
                        reg->idx_count);
            break;

        case VKD3DSPR_IMMCONST64:
            if (reg->idx_count != 0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_INDEX_COUNT, "Invalid index count %u for a IMMCONST64 register.",
                        reg->idx_count);
            break;

        default:
            break;
    }
}

static void vsir_validate_dst_param(struct validation_context *ctx,
        const struct vkd3d_shader_dst_param *dst)
{
    vsir_validate_register(ctx, &dst->reg);

    if (dst->write_mask & ~VKD3DSP_WRITEMASK_ALL)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_WRITE_MASK, "Destination has invalid write mask %#x.",
                dst->write_mask);

    switch (dst->reg.dimension)
    {
        case VSIR_DIMENSION_SCALAR:
            if (dst->write_mask != VKD3DSP_WRITEMASK_0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_WRITE_MASK, "Scalar destination has invalid write mask %#x.",
                    dst->write_mask);
            break;

        case VSIR_DIMENSION_VEC4:
            if (dst->write_mask == 0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_WRITE_MASK, "Vec4 destination has empty write mask.");
            break;

        default:
            if (dst->write_mask != 0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_WRITE_MASK, "Destination of dimension %u has invalid write mask %#x.",
                    dst->reg.dimension, dst->write_mask);
            break;
    }

    if (dst->modifiers & ~VKD3DSPDM_MASK)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_MODIFIERS, "Destination has invalid modifiers %#x.",
                dst->modifiers);

    switch (dst->shift)
    {
        case 0:
        case 1:
        case 2:
        case 3:
        case 13:
        case 14:
        case 15:
            break;

        default:
            validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SHIFT, "Destination has invalid shift %#x.",
                    dst->shift);
    }

    switch (dst->reg.type)
    {
        case VKD3DSPR_SSA:
            if (dst->reg.idx[0].offset < ctx->program->ssa_count)
            {
                struct validation_context_ssa_data *data = &ctx->ssas[dst->reg.idx[0].offset];

                if (data->write_mask == 0)
                {
                    data->write_mask = dst->write_mask;
                    data->first_assigned = ctx->instruction_idx;
                }
                else
                {
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SSA_USAGE,
                            "SSA register is already assigned at instruction %zu.",
                            data->first_assigned);
                }
            }
            break;

        case VKD3DSPR_IMMCONST:
            validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                    "Invalid IMMCONST register used as destination parameter.");
            break;

        case VKD3DSPR_IMMCONST64:
            validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                    "Invalid IMMCONST64 register used as destination parameter.");
            break;

        default:
            break;
    }
}

static void vsir_validate_src_param(struct validation_context *ctx,
        const struct vkd3d_shader_src_param *src)
{
    vsir_validate_register(ctx, &src->reg);

    if (src->swizzle & ~0x03030303u)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SWIZZLE, "Source has invalid swizzle %#x.",
                src->swizzle);

    if (src->reg.dimension != VSIR_DIMENSION_VEC4 && src->swizzle != 0)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SWIZZLE, "Source of dimension %u has invalid swizzle %#x.",
                src->reg.dimension, src->swizzle);

    if (src->modifiers >= VKD3DSPSM_COUNT)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_MODIFIERS, "Source has invalid modifiers %#x.",
                src->modifiers);

    switch (src->reg.type)
    {
        case VKD3DSPR_SSA:
            if (src->reg.idx[0].offset < ctx->program->ssa_count)
            {
                struct validation_context_ssa_data *data = &ctx->ssas[src->reg.idx[0].offset];
                unsigned int i;

                for (i = 0; i < VKD3D_VEC4_SIZE; ++i)
                    data->read_mask |= (1u << vsir_swizzle_get_component(src->swizzle, i));
            }
            break;

        default:
            break;
    }
}

static void vsir_validate_dst_count(struct validation_context *ctx,
        const struct vkd3d_shader_instruction *instruction, unsigned int count)
{
    if (instruction->dst_count != count)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DEST_COUNT,
                "Invalid destination count %u for an instruction of type %#x, expected %u.",
                        instruction->dst_count, instruction->handler_idx, count);
}

static void vsir_validate_src_count(struct validation_context *ctx,
        const struct vkd3d_shader_instruction *instruction, unsigned int count)
{
    if (instruction->src_count != count)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SOURCE_COUNT,
                "Invalid source count %u for an instruction of type %#x, expected %u.",
                instruction->src_count, instruction->handler_idx, count);
}

static bool vsir_validate_src_min_count(struct validation_context *ctx,
        const struct vkd3d_shader_instruction *instruction, unsigned int count)
{
    if (instruction->src_count < count)
    {
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SOURCE_COUNT,
                "Invalid source count %u for an instruction of type %#x, expected at least %u.",
                instruction->src_count, instruction->handler_idx, count);
        return false;
    }

    return true;
}

static bool vsir_validate_src_max_count(struct validation_context *ctx,
        const struct vkd3d_shader_instruction *instruction, unsigned int count)
{
    if (instruction->src_count > count)
    {
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SOURCE_COUNT,
                "Invalid source count %u for an instruction of type %#x, expected at most %u.",
                instruction->src_count, instruction->handler_idx, count);
        return false;
    }

    return true;
}

static const char *name_from_cf_type(enum cf_type type)
{
    switch (type)
    {
        case CF_TYPE_STRUCTURED:
            return "structured";
        case CF_TYPE_BLOCKS:
            return "block-based";
        default:
            vkd3d_unreachable();
    }
}

static void vsir_validate_cf_type(struct validation_context *ctx,
        const struct vkd3d_shader_instruction *instruction, enum cf_type expected_type)
{
    assert(ctx->cf_type != CF_TYPE_UNKNOWN);
    assert(expected_type != CF_TYPE_UNKNOWN);
    if (ctx->cf_type != expected_type)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "Invalid instruction %#x in %s shader.",
                instruction->handler_idx, name_from_cf_type(ctx->cf_type));
}

static void vsir_validate_instruction(struct validation_context *ctx)
{
    const struct vkd3d_shader_version *version = &ctx->program->shader_version;
    const struct vkd3d_shader_instruction *instruction;
    size_t i;

    instruction = &ctx->program->instructions.elements[ctx->instruction_idx];

    for (i = 0; i < instruction->dst_count; ++i)
        vsir_validate_dst_param(ctx, &instruction->dst[i]);

    for (i = 0; i < instruction->src_count; ++i)
        vsir_validate_src_param(ctx, &instruction->src[i]);

    if (instruction->handler_idx >= VKD3DSIH_INVALID)
    {
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_HANDLER, "Invalid instruction handler %#x.",
                instruction->handler_idx);
    }

    switch (instruction->handler_idx)
    {
        case VKD3DSIH_HS_DECLS:
        case VKD3DSIH_HS_CONTROL_POINT_PHASE:
        case VKD3DSIH_HS_FORK_PHASE:
        case VKD3DSIH_HS_JOIN_PHASE:
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 0);
            if (version->type != VKD3D_SHADER_TYPE_HULL)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_HANDLER, "Phase instruction %#x is only valid in a hull shader.",
                        instruction->handler_idx);
            if (ctx->depth != 0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "Phase instruction %#x must appear to top level.",
                        instruction->handler_idx);
            ctx->phase = instruction->handler_idx;
            ctx->dcl_temps_found = false;
            return;

        default:
            break;
    }

    if (version->type == VKD3D_SHADER_TYPE_HULL && ctx->phase == VKD3DSIH_INVALID)
        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_HANDLER,
                "Instruction %#x appear before any phase instruction in a hull shader.",
                instruction->handler_idx);

    /* We support two different control flow types in shaders:
     * block-based, like DXIL and SPIR-V, and structured, like D3DBC
     * and TPF. The shader is detected as block-based when its first
     * instruction, except for DCL_* and phases, is a LABEL. Currently
     * we mandate that each shader is either purely block-based or
     * purely structured. In principle we could allow structured
     * constructs in a block, provided they are confined in a single
     * block, but need for that hasn't arisen yet, so we don't. */
    if (ctx->cf_type == CF_TYPE_UNKNOWN && !vsir_instruction_is_dcl(instruction))
    {
        if (instruction->handler_idx == VKD3DSIH_LABEL)
            ctx->cf_type = CF_TYPE_BLOCKS;
        else
            ctx->cf_type = CF_TYPE_STRUCTURED;
    }

    if (ctx->cf_type == CF_TYPE_BLOCKS && !vsir_instruction_is_dcl(instruction))
    {
        switch (instruction->handler_idx)
        {
            case VKD3DSIH_LABEL:
                if (ctx->inside_block)
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "Invalid LABEL instruction inside a block.");
                ctx->inside_block = true;
                break;

            case VKD3DSIH_RET:
            case VKD3DSIH_BRANCH:
            case VKD3DSIH_SWITCH_MONOLITHIC:
                if (!ctx->inside_block)
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "Invalid instruction %#x outside any block.",
                            instruction->handler_idx);
                ctx->inside_block = false;
                break;

            default:
                if (!ctx->inside_block)
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "Invalid instruction %#x outside any block.",
                            instruction->handler_idx);
                break;
        }
    }

    switch (instruction->handler_idx)
    {
        case VKD3DSIH_DCL_TEMPS:
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 0);
            if (ctx->dcl_temps_found)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_DUPLICATE_DCL_TEMPS, "Duplicate DCL_TEMPS instruction.");
            if (instruction->declaration.count > ctx->program->temp_count)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DCL_TEMPS,
                        "Invalid DCL_TEMPS count %u, expected at most %u.",
                        instruction->declaration.count, ctx->program->temp_count);
            ctx->dcl_temps_found = true;
            break;

        case VKD3DSIH_IF:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 1);
            if (!vkd3d_array_reserve((void **)&ctx->blocks, &ctx->blocks_capacity, ctx->depth + 1, sizeof(*ctx->blocks)))
                return;
            ctx->blocks[ctx->depth++] = instruction->handler_idx;
            break;

        case VKD3DSIH_IFC:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 2);
            if (!vkd3d_array_reserve((void **)&ctx->blocks, &ctx->blocks_capacity, ctx->depth + 1, sizeof(*ctx->blocks)))
                return;
            ctx->blocks[ctx->depth++] = VKD3DSIH_IF;
            break;

        case VKD3DSIH_ELSE:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 0);
            if (ctx->depth == 0 || ctx->blocks[ctx->depth - 1] != VKD3DSIH_IF)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "ELSE instruction doesn't terminate IF block.");
            else
                ctx->blocks[ctx->depth - 1] = instruction->handler_idx;
            break;

        case VKD3DSIH_ENDIF:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 0);
            if (ctx->depth == 0 || (ctx->blocks[ctx->depth - 1] != VKD3DSIH_IF && ctx->blocks[ctx->depth - 1] != VKD3DSIH_ELSE))
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "ENDIF instruction doesn't terminate IF/ELSE block.");
            else
                --ctx->depth;
            break;

        case VKD3DSIH_LOOP:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, version->major <= 3 ? 2 : 0);
            if (!vkd3d_array_reserve((void **)&ctx->blocks, &ctx->blocks_capacity, ctx->depth + 1, sizeof(*ctx->blocks)))
                return;
            ctx->blocks[ctx->depth++] = instruction->handler_idx;
            break;

        case VKD3DSIH_ENDLOOP:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 0);
            if (ctx->depth == 0 || ctx->blocks[ctx->depth - 1] != VKD3DSIH_LOOP)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "ENDLOOP instruction doesn't terminate LOOP block.");
            else
                --ctx->depth;
            break;

        case VKD3DSIH_REP:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 1);
            if (!vkd3d_array_reserve((void **)&ctx->blocks, &ctx->blocks_capacity, ctx->depth + 1, sizeof(*ctx->blocks)))
                return;
            ctx->blocks[ctx->depth++] = instruction->handler_idx;
            break;

        case VKD3DSIH_ENDREP:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 0);
            if (ctx->depth == 0 || ctx->blocks[ctx->depth - 1] != VKD3DSIH_REP)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "ENDREP instruction doesn't terminate REP block.");
            else
                --ctx->depth;
            break;

        case VKD3DSIH_SWITCH:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 1);
            if (!vkd3d_array_reserve((void **)&ctx->blocks, &ctx->blocks_capacity, ctx->depth + 1, sizeof(*ctx->blocks)))
                return;
            ctx->blocks[ctx->depth++] = instruction->handler_idx;
            break;

        case VKD3DSIH_ENDSWITCH:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_STRUCTURED);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 0);
            if (ctx->depth == 0 || ctx->blocks[ctx->depth - 1] != VKD3DSIH_SWITCH)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "ENDSWITCH instruction doesn't terminate SWITCH block.");
            else
                --ctx->depth;
            break;

        case VKD3DSIH_RET:
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 0);
            break;

        case VKD3DSIH_LABEL:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_BLOCKS);
            vsir_validate_dst_count(ctx, instruction, 0);
            vsir_validate_src_count(ctx, instruction, 1);
            if (instruction->src_count >= 1 && !vsir_register_is_label(&instruction->src[0].reg))
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                        "Invalid register of type %#x in a LABEL instruction, expected LABEL.",
                        instruction->src[0].reg.type);
            break;

        case VKD3DSIH_BRANCH:
            vsir_validate_cf_type(ctx, instruction, CF_TYPE_BLOCKS);
            vsir_validate_dst_count(ctx, instruction, 0);
            if (!vsir_validate_src_min_count(ctx, instruction, 1))
                break;
            if (vsir_register_is_label(&instruction->src[0].reg))
            {
                /* Unconditional branch: parameters are jump label,
                 * optional merge label, optional continue label. */
                vsir_validate_src_max_count(ctx, instruction, 3);

                for (i = 0; i < instruction->src_count; ++i)
                {
                    if (!vsir_register_is_label(&instruction->src[i].reg))
                        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                                "Invalid register of type %#x in unconditional BRANCH instruction, expected LABEL.",
                                instruction->src[i].reg.type);
                }
            }
            else
            {
                /* Conditional branch: parameters are condition, true
                 * jump label, false jump label, optional merge label,
                 * optional continue label. */
                vsir_validate_src_min_count(ctx, instruction, 3);
                vsir_validate_src_max_count(ctx, instruction, 5);

                for (i = 1; i < instruction->src_count; ++i)
                {
                    if (!vsir_register_is_label(&instruction->src[i].reg))
                        validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                                "Invalid register of type %#x in conditional BRANCH instruction, expected LABEL.",
                                instruction->src[i].reg.type);
                }
            }
            break;

        case VKD3DSIH_SWITCH_MONOLITHIC:
        {
            unsigned int case_count;

            vsir_validate_cf_type(ctx, instruction, CF_TYPE_BLOCKS);
            vsir_validate_dst_count(ctx, instruction, 0);
            /* Parameters are source, default label, merge label and
             * then pairs of constant value and case label. */
            if (!vsir_validate_src_min_count(ctx, instruction, 3))
                break;
            if (instruction->src_count % 2 != 1)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SOURCE_COUNT,
                        "Invalid source count %u for a monolithic SWITCH instruction, it must be an odd number.",
                        instruction->src_count);

            if (!vsir_register_is_label(&instruction->src[1].reg))
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                        "Invalid default label register of type %#x in monolithic SWITCH instruction, expected LABEL.",
                        instruction->src[1].reg.type);

            if (!vsir_register_is_label(&instruction->src[2].reg))
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                        "Invalid merge label register of type %#x in monolithic SWITCH instruction, expected LABEL.",
                        instruction->src[2].reg.type);

            case_count = (instruction->src_count - 3) / 2;

            for (i = 0; i < case_count; ++i)
            {
                unsigned int value_idx = 3 + 2 * i;
                unsigned int label_idx = 3 + 2 * i + 1;

                if (!register_is_constant(&instruction->src[value_idx].reg))
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                            "Invalid value register for case %zu of type %#x in monolithic SWITCH instruction, "
                            "expected IMMCONST or IMMCONST64.", i, instruction->src[value_idx].reg.type);

                if (!vsir_register_is_label(&instruction->src[label_idx].reg))
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                            "Invalid label register for case %zu of type %#x in monolithic SWITCH instruction, "
                            "expected LABEL.", i, instruction->src[value_idx].reg.type);
            }
            break;
        }

        case VKD3DSIH_PHI:
        {
            unsigned int incoming_count;

            vsir_validate_cf_type(ctx, instruction, CF_TYPE_BLOCKS);
            vsir_validate_dst_count(ctx, instruction, 1);
            vsir_validate_src_min_count(ctx, instruction, 2);
            if (instruction->src_count % 2 != 0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SOURCE_COUNT,
                        "Invalid source count %u for a PHI instruction, it must be an even number.",
                        instruction->src_count);
            incoming_count = instruction->src_count / 2;

            if (!register_is_ssa(&instruction->dst[0].reg))
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                        "Invalid destination of type %#x in PHI instruction, expected SSA.",
                        instruction->dst[0].reg.type);

            if (instruction->dst[0].reg.dimension != VSIR_DIMENSION_SCALAR)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DIMENSION,
                        "Invalid destination dimension %#x in PHI instruction, expected scalar.",
                        instruction->dst[0].reg.dimension);

            if (instruction->dst[0].modifiers != VKD3DSPDM_NONE)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_MODIFIERS,
                        "Invalid modifiers %#x for the destination of a PHI instruction, expected none.",
                        instruction->dst[0].modifiers);

            if (instruction->dst[0].shift != 0)
                validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SHIFT,
                        "Invalid shift %#x for the destination of a PHI instruction, expected none.",
                        instruction->dst[0].shift);

            for (i = 0; i < incoming_count; ++i)
            {
                unsigned int value_idx = 2 * i;
                unsigned int label_idx = 2 * i + 1;

                if (!register_is_constant(&instruction->src[value_idx].reg) && !register_is_ssa(&instruction->src[value_idx].reg))
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                            "Invalid value register for incoming %zu of type %#x in PHI instruction, "
                            "expected SSA, IMMCONST or IMMCONST64.", i, instruction->src[value_idx].reg.type);

                if (instruction->src[value_idx].reg.dimension != VSIR_DIMENSION_SCALAR)
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_DIMENSION,
                            "Invalid value dimension %#x for incoming %zu in PHI instruction, expected scalar.",
                            instruction->src[value_idx].reg.dimension, i);

                if (!vsir_register_is_label(&instruction->src[label_idx].reg))
                    validator_error(ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_REGISTER_TYPE,
                            "Invalid label register for case %zu of type %#x in PHI instruction, "
                            "expected LABEL.", i, instruction->src[value_idx].reg.type);
            }
            break;
        }

        default:
            break;
    }
}

enum vkd3d_result vsir_program_validate(struct vsir_program *program, uint64_t config_flags,
        const char *source_name, struct vkd3d_shader_message_context *message_context)
{
    struct validation_context ctx =
    {
        .message_context = message_context,
        .program = program,
        .null_location = {.source_name = source_name},
        .status = VKD3D_OK,
        .phase = VKD3DSIH_INVALID,
    };
    unsigned int i;

    if (!(config_flags & VKD3D_SHADER_CONFIG_FLAG_FORCE_VALIDATION))
        return VKD3D_OK;

    if (!(ctx.temps = vkd3d_calloc(ctx.program->temp_count, sizeof(*ctx.temps))))
        goto fail;

    if (!(ctx.ssas = vkd3d_calloc(ctx.program->ssa_count, sizeof(*ctx.ssas))))
        goto fail;

    for (ctx.instruction_idx = 0; ctx.instruction_idx < program->instructions.count; ++ctx.instruction_idx)
        vsir_validate_instruction(&ctx);

    ctx.invalid_instruction_idx = true;

    if (ctx.depth != 0)
        validator_error(&ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "%zu nested blocks were not closed.", ctx.depth);

    if (ctx.inside_block)
        validator_error(&ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_CONTROL_FLOW, "Last block was not closed.");

    for (i = 0; i < ctx.program->ssa_count; ++i)
    {
        struct validation_context_ssa_data *data = &ctx.ssas[i];

        if ((data->write_mask | data->read_mask) != data->write_mask)
            validator_error(&ctx, VKD3D_SHADER_ERROR_VSIR_INVALID_SSA_USAGE,
                    "SSA register %u has invalid read mask %#x, which is not a subset of the write mask %#x "
                    "at the point of definition.", i, data->read_mask, data->write_mask);
    }

    vkd3d_free(ctx.blocks);
    vkd3d_free(ctx.temps);
    vkd3d_free(ctx.ssas);

    return ctx.status;

fail:
    vkd3d_free(ctx.blocks);
    vkd3d_free(ctx.temps);
    vkd3d_free(ctx.ssas);

    return VKD3D_ERROR_OUT_OF_MEMORY;
}