mirror of
https://github.com/Dasharo/linux.git
synced 2026-03-06 15:25:10 -08:00
sched_ext: Add cgroup support
Add sched_ext_ops operations to init/exit cgroups, and track task migrations
and config changes. A BPF scheduler may implement no cgroup features or only
a subset of them. The implemented features can be indicated using
%SCX_OPS_HAS_CGROUP_* flags. If cgroup configuration makes use of features
that are not implemented, a warning is triggered.
While a BPF scheduler is being enabled and disabled, relevant cgroup
operations are locked out using scx_cgroup_rwsem. This avoids situations
like task prep taking place while the task is being moved across cgroups,
making things easier for BPF schedulers.
v7: - cgroup interface file visibility toggling is dropped in favor of just
warning messages. Dynamically changing interface visibility caused more
confusion than it helped.
v6: - Updated to reflect the removal of SCX_KF_SLEEPABLE.
- Updated to use CONFIG_GROUP_SCHED_WEIGHT and fixes for
!CONFIG_FAIR_GROUP_SCHED && CONFIG_EXT_GROUP_SCHED.
v5: - Flipped the locking order between scx_cgroup_rwsem and
cpus_read_lock() to avoid locking order conflict w/ cpuset. Better
documentation around locking.
- sched_move_task() takes an early exit if the source and destination
are identical. This triggered the warning in scx_cgroup_can_attach()
as it left p->scx.cgrp_moving_from uncleared. Updated the cgroup
migration path so that ops.cgroup_prep_move() is skipped for identity
migrations so that its invocations always match ops.cgroup_move()
one-to-one.
v4: - Example schedulers moved into their own patches.
- Fix build failure when !CONFIG_CGROUP_SCHED, reported by Andrea Righi.
v3: - Make scx_example_pair switch all tasks by default.
- Convert to BPF inline iterators.
- scx_bpf_task_cgroup() is added to determine the current cgroup from
CPU controller's POV. This allows BPF schedulers to accurately track
CPU cgroup membership.
- scx_example_flatcg added. This demonstrates flattened hierarchy
implementation of CPU cgroup control and shows significant performance
improvement when cgroups which are nested multiple levels are under
competition.
v2: - Build fixes for different CONFIG combinations.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Andrea Righi <andrea.righi@canonical.com>
This commit is contained in:
@@ -188,6 +188,9 @@ struct sched_ext_entity {
|
||||
bool disallow; /* reject switching into SCX */
|
||||
|
||||
/* cold fields */
|
||||
#ifdef CONFIG_EXT_GROUP_SCHED
|
||||
struct cgroup *cgrp_moving_from;
|
||||
#endif
|
||||
/* must be the last field, see init_scx_entity() */
|
||||
struct list_head tasks_node;
|
||||
};
|
||||
|
||||
@@ -1055,6 +1055,12 @@ config RT_GROUP_SCHED
|
||||
realtime bandwidth for them.
|
||||
See Documentation/scheduler/sched-rt-group.rst for more information.
|
||||
|
||||
config EXT_GROUP_SCHED
|
||||
bool
|
||||
depends on SCHED_CLASS_EXT && CGROUP_SCHED
|
||||
select GROUP_SCHED_WEIGHT
|
||||
default y
|
||||
|
||||
endif #CGROUP_SCHED
|
||||
|
||||
config SCHED_MM_CID
|
||||
|
||||
@@ -8364,6 +8364,9 @@ void __init sched_init(void)
|
||||
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
|
||||
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
|
||||
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
||||
#ifdef CONFIG_EXT_GROUP_SCHED
|
||||
root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
|
||||
#endif /* CONFIG_EXT_GROUP_SCHED */
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
|
||||
ptr += nr_cpu_ids * sizeof(void **);
|
||||
@@ -8801,6 +8804,7 @@ struct task_group *sched_create_group(struct task_group *parent)
|
||||
if (!alloc_rt_sched_group(tg, parent))
|
||||
goto err;
|
||||
|
||||
scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
|
||||
alloc_uclamp_sched_group(tg, parent);
|
||||
|
||||
return tg;
|
||||
@@ -8928,6 +8932,7 @@ void sched_move_task(struct task_struct *tsk)
|
||||
put_prev_task(rq, tsk);
|
||||
|
||||
sched_change_group(tsk, group);
|
||||
scx_move_task(tsk);
|
||||
|
||||
if (queued)
|
||||
enqueue_task(rq, tsk, queue_flags);
|
||||
@@ -8965,6 +8970,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct task_group *tg = css_tg(css);
|
||||
struct task_group *parent = css_tg(css->parent);
|
||||
int ret;
|
||||
|
||||
ret = scx_tg_online(tg);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (parent)
|
||||
sched_online_group(tg, parent);
|
||||
@@ -8979,6 +8989,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct task_group *tg = css_tg(css);
|
||||
|
||||
scx_tg_offline(tg);
|
||||
}
|
||||
|
||||
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct task_group *tg = css_tg(css);
|
||||
@@ -8996,9 +9013,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
|
||||
sched_unregister_group(tg);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
struct task_struct *task;
|
||||
struct cgroup_subsys_state *css;
|
||||
|
||||
@@ -9006,9 +9023,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
|
||||
if (!sched_rt_can_attach(css_tg(css), task))
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
return scx_cgroup_can_attach(tset);
|
||||
}
|
||||
|
||||
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
@@ -9017,6 +9034,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
|
||||
|
||||
cgroup_taskset_for_each(task, css, tset)
|
||||
sched_move_task(task);
|
||||
|
||||
scx_cgroup_finish_attach();
|
||||
}
|
||||
|
||||
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
|
||||
{
|
||||
scx_cgroup_cancel_attach(tset);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_UCLAMP_TASK_GROUP
|
||||
@@ -9196,15 +9220,25 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
|
||||
#ifdef CONFIG_GROUP_SCHED_WEIGHT
|
||||
static unsigned long tg_weight(struct task_group *tg)
|
||||
{
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
return scale_load_down(tg->shares);
|
||||
#else
|
||||
return sched_weight_from_cgroup(tg->scx_weight);
|
||||
#endif
|
||||
}
|
||||
|
||||
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
|
||||
struct cftype *cftype, u64 shareval)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (shareval > scale_load_down(ULONG_MAX))
|
||||
shareval = MAX_SHARES;
|
||||
return sched_group_set_shares(css_tg(css), scale_load(shareval));
|
||||
ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
|
||||
if (!ret)
|
||||
scx_group_set_weight(css_tg(css),
|
||||
sched_weight_to_cgroup(shareval));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
|
||||
@@ -9595,7 +9629,12 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
|
||||
static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft, s64 idle)
|
||||
{
|
||||
return sched_group_set_idle(css_tg(css), idle);
|
||||
int ret;
|
||||
|
||||
ret = sched_group_set_idle(css_tg(css), idle);
|
||||
if (!ret)
|
||||
scx_group_set_idle(css_tg(css), idle);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -9722,13 +9761,17 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft, u64 cgrp_weight)
|
||||
{
|
||||
unsigned long weight;
|
||||
int ret;
|
||||
|
||||
if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
|
||||
return -ERANGE;
|
||||
|
||||
weight = sched_weight_from_cgroup(cgrp_weight);
|
||||
|
||||
return sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
ret = sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
if (!ret)
|
||||
scx_group_set_weight(css_tg(css), cgrp_weight);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
|
||||
@@ -9753,7 +9796,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft, s64 nice)
|
||||
{
|
||||
unsigned long weight;
|
||||
int idx;
|
||||
int idx, ret;
|
||||
|
||||
if (nice < MIN_NICE || nice > MAX_NICE)
|
||||
return -ERANGE;
|
||||
@@ -9762,7 +9805,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
|
||||
idx = array_index_nospec(idx, 40);
|
||||
weight = sched_prio_to_weight[idx];
|
||||
|
||||
return sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
ret = sched_group_set_shares(css_tg(css), scale_load(weight));
|
||||
if (!ret)
|
||||
scx_group_set_weight(css_tg(css),
|
||||
sched_weight_to_cgroup(weight));
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_GROUP_SCHED_WEIGHT */
|
||||
|
||||
@@ -9878,14 +9925,14 @@ static struct cftype cpu_files[] = {
|
||||
struct cgroup_subsys cpu_cgrp_subsys = {
|
||||
.css_alloc = cpu_cgroup_css_alloc,
|
||||
.css_online = cpu_cgroup_css_online,
|
||||
.css_offline = cpu_cgroup_css_offline,
|
||||
.css_released = cpu_cgroup_css_released,
|
||||
.css_free = cpu_cgroup_css_free,
|
||||
.css_extra_stat_show = cpu_extra_stat_show,
|
||||
.css_local_stat_show = cpu_local_stat_show,
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
.can_attach = cpu_cgroup_can_attach,
|
||||
#endif
|
||||
.attach = cpu_cgroup_attach,
|
||||
.cancel_attach = cpu_cgroup_cancel_attach,
|
||||
.legacy_cftypes = cpu_legacy_files,
|
||||
.dfl_cftypes = cpu_files,
|
||||
.early_init = true,
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -67,3 +67,25 @@ static inline void scx_update_idle(struct rq *rq, bool idle)
|
||||
#else
|
||||
static inline void scx_update_idle(struct rq *rq, bool idle) {}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
#ifdef CONFIG_EXT_GROUP_SCHED
|
||||
int scx_tg_online(struct task_group *tg);
|
||||
void scx_tg_offline(struct task_group *tg);
|
||||
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
|
||||
void scx_move_task(struct task_struct *p);
|
||||
void scx_cgroup_finish_attach(void);
|
||||
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
|
||||
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
|
||||
void scx_group_set_idle(struct task_group *tg, bool idle);
|
||||
#else /* CONFIG_EXT_GROUP_SCHED */
|
||||
static inline int scx_tg_online(struct task_group *tg) { return 0; }
|
||||
static inline void scx_tg_offline(struct task_group *tg) {}
|
||||
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
|
||||
static inline void scx_move_task(struct task_struct *p) {}
|
||||
static inline void scx_cgroup_finish_attach(void) {}
|
||||
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
|
||||
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
|
||||
static inline void scx_group_set_idle(struct task_group *tg, bool idle) {}
|
||||
#endif /* CONFIG_EXT_GROUP_SCHED */
|
||||
#endif /* CONFIG_CGROUP_SCHED */
|
||||
|
||||
@@ -459,6 +459,11 @@ struct task_group {
|
||||
struct rt_bandwidth rt_bandwidth;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_EXT_GROUP_SCHED
|
||||
u32 scx_flags; /* SCX_TG_* */
|
||||
u32 scx_weight;
|
||||
#endif
|
||||
|
||||
struct rcu_head rcu;
|
||||
struct list_head list;
|
||||
|
||||
|
||||
@@ -61,6 +61,7 @@ s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
|
||||
bool scx_bpf_task_running(const struct task_struct *p) __ksym;
|
||||
s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
|
||||
struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
|
||||
struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
|
||||
|
||||
static inline __attribute__((format(printf, 1, 2)))
|
||||
void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
|
||||
|
||||
@@ -95,6 +95,32 @@ void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p,
|
||||
void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p)
|
||||
{}
|
||||
|
||||
s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp,
|
||||
struct scx_cgroup_init_args *args)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp)
|
||||
{}
|
||||
|
||||
s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p,
|
||||
struct cgroup *from, struct cgroup *to)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p,
|
||||
struct cgroup *from, struct cgroup *to)
|
||||
{}
|
||||
|
||||
void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p,
|
||||
struct cgroup *from, struct cgroup *to)
|
||||
{}
|
||||
|
||||
void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
|
||||
{}
|
||||
|
||||
s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init)
|
||||
{
|
||||
return 0;
|
||||
@@ -126,6 +152,12 @@ struct sched_ext_ops maximal_ops = {
|
||||
.enable = maximal_enable,
|
||||
.exit_task = maximal_exit_task,
|
||||
.disable = maximal_disable,
|
||||
.cgroup_init = maximal_cgroup_init,
|
||||
.cgroup_exit = maximal_cgroup_exit,
|
||||
.cgroup_prep_move = maximal_cgroup_prep_move,
|
||||
.cgroup_move = maximal_cgroup_move,
|
||||
.cgroup_cancel_move = maximal_cgroup_cancel_move,
|
||||
.cgroup_set_weight = maximal_cgroup_set_weight,
|
||||
.init = maximal_init,
|
||||
.exit = maximal_exit,
|
||||
.name = "maximal",
|
||||
|
||||
Reference in New Issue
Block a user