diff --git a/libs/vkd3d-shader/hlsl_codegen.c b/libs/vkd3d-shader/hlsl_codegen.c
index 0ad75b03..9974dcd4 100644
--- a/libs/vkd3d-shader/hlsl_codegen.c
+++ b/libs/vkd3d-shader/hlsl_codegen.c
@@ -2491,6 +2491,258 @@ enum validation_result
     DEREF_VALIDATION_NOT_CONSTANT,
 };
 
+/* Candidate groups collected while scanning a tree of blocks. Each group
+ * holds up to four expressions taken from a single block, producing at most
+ * four components in total. "groups" is a dynamic array grown with
+ * hlsl_array_reserve(); "count" is the number of groups in use. */
+struct vectorize_exprs_state
+{
+    struct vectorizable_exprs_group
+    {
+        struct hlsl_block *block;
+        struct hlsl_ir_expr *exprs[4];
+        uint8_t expr_count, component_count;
+    } *groups;
+    size_t count, capacity;
+};
+
+/* Returns true if "a" and "b" are the same node, after looking through one
+ * level of swizzle on either side. */
+static bool is_same_vectorizable_source(struct hlsl_ir_node *a, struct hlsl_ir_node *b)
+{
+    /* TODO: We can also vectorize different constants. */
+
+    if (a->type == HLSL_IR_SWIZZLE)
+        a = hlsl_ir_swizzle(a)->val.node;
+    if (b->type == HLSL_IR_SWIZZLE)
+        b = hlsl_ir_swizzle(b)->val.node;
+
+    return a == b;
+}
+
+/* Returns true if "a" and "b" use the same op, produce the same base type, and
+ * each operand of "a" has the same ultimate source as the corresponding
+ * operand of "b". */
+static bool is_same_vectorizable_expr(struct hlsl_ir_expr *a, struct hlsl_ir_expr *b)
+{
+    if (a->op != b->op)
+        return false;
+
+    /* The combined instruction takes its base type from the first expression
+     * in a group, so expressions producing different base types (e.g. casts of
+     * one source to two different types) must not be merged. */
+    if (a->node.data_type->e.numeric.type != b->node.data_type->e.numeric.type)
+        return false;
+
+    for (size_t j = 0; j < HLSL_MAX_OPERANDS; ++j)
+    {
+        if (!a->operands[j].node)
+            break;
+        if (!is_same_vectorizable_source(a->operands[j].node, b->operands[j].node))
+            return false;
+    }
+
+    return true;
+}
+
+/* Adds "expr" to the first recorded group that comes from the same block,
+ * matches it according to is_same_vectorizable_expr(), and has room left for
+ * its components; otherwise starts a new group. Expressions whose class is
+ * above HLSL_CLASS_VECTOR or whose op is not per-component are skipped, as is
+ * any expression for which the group array cannot be grown. */
+static void record_vectorizable_expr(struct hlsl_ctx *ctx, struct hlsl_block *block,
+        struct hlsl_ir_expr *expr, struct vectorize_exprs_state *state)
+{
+    if (expr->node.data_type->class > HLSL_CLASS_VECTOR)
+        return;
+
+    /* These are the only current ops that are not per-component. */
+    if (expr->op == HLSL_OP1_COS_REDUCED || expr->op == HLSL_OP1_SIN_REDUCED
+            || expr->op == HLSL_OP2_DOT || expr->op == HLSL_OP3_DP2ADD)
+        return;
+
+    for (size_t i = 0; i < state->count; ++i)
+    {
+        struct vectorizable_exprs_group *group = &state->groups[i];
+        struct hlsl_ir_expr *other = group->exprs[0];
+
+        /* These are SSA instructions, which means they have the same value
+         * regardless of what block they're in. However, being in different
+         * blocks may mean that one expression or the other is not always
+         * executed. */
+
+        if (expr->node.data_type->e.numeric.dimx + group->component_count <= 4
+                && group->block == block
+                && is_same_vectorizable_expr(expr, other))
+        {
+            group->exprs[group->expr_count++] = expr;
+            group->component_count += expr->node.data_type->e.numeric.dimx;
+            return;
+        }
+    }
+
+    if (!hlsl_array_reserve(ctx, (void **)&state->groups,
+            &state->capacity, state->count + 1, sizeof(*state->groups)))
+        return;
+    state->groups[state->count].block = block;
+    state->groups[state->count].exprs[0] = expr;
+    state->groups[state->count].expr_count = 1;
+    state->groups[state->count].component_count = expr->node.data_type->e.numeric.dimx;
+    ++state->count;
+}
+
+/* Records the expressions in "block", and in the blocks nested inside its if,
+ * loop and switch instructions, into "state". */
+static void find_vectorizable_expr_groups(struct hlsl_ctx *ctx, struct hlsl_block *block,
+        struct vectorize_exprs_state *state)
+{
+    struct hlsl_ir_node *instr;
+
+    LIST_FOR_EACH_ENTRY(instr, &block->instrs, struct hlsl_ir_node, entry)
+    {
+        if (instr->type == HLSL_IR_EXPR)
+        {
+            record_vectorizable_expr(ctx, block, hlsl_ir_expr(instr), state);
+        }
+        else if (instr->type == HLSL_IR_IF)
+        {
+            struct hlsl_ir_if *iff = hlsl_ir_if(instr);
+
+            find_vectorizable_expr_groups(ctx, &iff->then_block, state);
+            find_vectorizable_expr_groups(ctx, &iff->else_block, state);
+        }
+        else if (instr->type == HLSL_IR_LOOP)
+        {
+            find_vectorizable_expr_groups(ctx, &hlsl_ir_loop(instr)->body, state);
+        }
+        else if (instr->type == HLSL_IR_SWITCH)
+        {
+            struct hlsl_ir_switch *s = hlsl_ir_switch(instr);
+            struct hlsl_ir_switch_case *c;
+
+            LIST_FOR_EACH_ENTRY(c, &s->cases, struct hlsl_ir_switch_case, entry)
+                find_vectorizable_expr_groups(ctx, &c->body, state);
+        }
+    }
+}
+
+/* Combine sequences like
+ *
+ * 3: @1.x
+ * 4: @2.x
+ * 5: @3 * @4
+ * 6: @1.y
+ * 7: @2.x
+ * 8: @6 * @7
+ *
+ * into
+ *
+ * 5_1: @1.xy
+ * 5_2: @2.xx
+ * 5_3: @5_1 * @5_2
+ * 5: @5_3.x
+ * 8: @5_3.y
+ *
+ * Each operand to an expression needs to refer to the same ultimate source
+ * (in this case @1 and @2 respectively), but can be a swizzle thereof.
+ *
+ * In practice the swizzles @5 and @8 can generally then be vectorized again,
+ * either as part of another expression, or as part of a store.
+ *
+ * Returns true if at least one group of expressions was combined.
+ */
+static bool vectorize_exprs(struct hlsl_ctx *ctx, struct hlsl_block *block)
+{
+    struct vectorize_exprs_state state = {0};
+    bool progress = false;
+
+    find_vectorizable_expr_groups(ctx, block, &state);
+
+    for (unsigned int i = 0; i < state.count; ++i)
+    {
+        struct vectorizable_exprs_group *group = &state.groups[i];
+        struct hlsl_ir_node *args[HLSL_MAX_OPERANDS] = {0};
+        uint32_t swizzles[HLSL_MAX_OPERANDS] = {0};
+        struct hlsl_ir_node *arg, *combined;
+        unsigned int component_count = 0;
+        struct hlsl_type *combined_type;
+        struct hlsl_block new_block;
+        struct hlsl_ir_expr *expr;
+
+        if (group->expr_count == 1)
+            continue;
+
+        hlsl_block_init(&new_block);
+
+        for (unsigned int j = 0; j < group->expr_count; ++j)
+        {
+            expr = group->exprs[j];
+
+            for (unsigned int a = 0; a < HLSL_MAX_OPERANDS; ++a)
+            {
+                uint32_t arg_swizzle;
+
+                if (!(arg = expr->operands[a].node))
+                    break;
+
+                if (arg->type == HLSL_IR_SWIZZLE)
+                    arg_swizzle = hlsl_ir_swizzle(arg)->u.vector;
+                else
+                    arg_swizzle = HLSL_SWIZZLE(X, Y, Z, W);
+
+                /* Mask out the invalid components. */
+                arg_swizzle &= (1u << VKD3D_SHADER_SWIZZLE_SHIFT(arg->data_type->e.numeric.dimx)) - 1;
+                swizzles[a] |= arg_swizzle << VKD3D_SHADER_SWIZZLE_SHIFT(component_count);
+            }
+
+            component_count += expr->node.data_type->e.numeric.dimx;
+        }
+
+        expr = group->exprs[0];
+        for (unsigned int a = 0; a < HLSL_MAX_OPERANDS; ++a)
+        {
+            if (!(arg = expr->operands[a].node))
+                break;
+            if (arg->type == HLSL_IR_SWIZZLE)
+                arg = hlsl_ir_swizzle(arg)->val.node;
+            args[a] = hlsl_block_add_swizzle(ctx, &new_block, swizzles[a], component_count, arg, &arg->loc);
+        }
+
+        combined_type = hlsl_get_vector_type(ctx, expr->node.data_type->e.numeric.type, component_count);
+        combined = hlsl_block_add_expr(ctx, &new_block, expr->op, args, combined_type, &expr->node.loc);
+
+        /* Place the new instructions before the first expression of the group. */
+        list_move_before(&expr->node.entry, &new_block.instrs);
+
+        TRACE("Combining %u %s instructions into %p.\n", group->expr_count,
+                debug_hlsl_expr_op(group->exprs[0]->op), combined);
+
+        component_count = 0;
+        for (unsigned int j = 0; j < group->expr_count; ++j)
+        {
+            struct hlsl_ir_node *replacement;
+
+            expr = group->exprs[j];
+
+            /* Replace each original expression with a swizzle that selects its
+             * components from the combined result. */
+            if (!(replacement = hlsl_new_swizzle(ctx,
+                    HLSL_SWIZZLE(X, Y, Z, W) >> VKD3D_SHADER_SWIZZLE_SHIFT(component_count),
+                    expr->node.data_type->e.numeric.dimx, combined, &expr->node.loc)))
+                goto out;
+            component_count += expr->node.data_type->e.numeric.dimx;
+            list_add_before(&expr->node.entry, &replacement->entry);
+            hlsl_replace_node(&expr->node, replacement);
+        }
+
+        progress = true;
+    }
+
+out:
+    vkd3d_free(state.groups);
+    return progress;
+}
+
 static enum validation_result validate_component_index_range_from_deref(struct hlsl_ctx *ctx,
         const struct hlsl_deref *deref)
 {
@@ -12589,6 +12841,7 @@ static void process_entry_function(struct hlsl_ctx *ctx,
     struct recursive_call_ctx recursive_call_ctx;
     struct hlsl_ir_var *var;
     unsigned int i;
+    bool progress;
 
     ctx->is_patch_constant_func = entry_func == ctx->patch_constant_func;
 
@@ -12783,6 +13036,17 @@ static void process_entry_function(struct hlsl_ctx *ctx,
     hlsl_transform_ir(ctx, lower_separate_samples, body, NULL);
 
     hlsl_transform_ir(ctx, validate_dereferences, body, NULL);
+
+    /* Repeat until neither vectorization nor the cleanup passes make progress. */
+    do
+    {
+        progress = vectorize_exprs(ctx, body);
+        compute_liveness(ctx, entry_func);
+        progress |= hlsl_transform_ir(ctx, dce, body, NULL);
+        progress |= hlsl_transform_ir(ctx, fold_swizzle_chains, body, NULL);
+        progress |= hlsl_transform_ir(ctx, remove_trivial_swizzles, body, NULL);
+    } while (progress);
+
     hlsl_transform_ir(ctx, track_object_components_sampler_dim, body, NULL);
 
     if (hlsl_version_ge(ctx, 4, 0))