mirror of
https://github.com/armbian/linux-cix.git
synced 2026-01-06 12:30:45 -08:00
Merge tag 'cgroup-for-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - cpuset now supports an isolated cpus.partition type, which enables
   dynamic CPU isolation

 - pids.peak added to remember the max number of pids used

 - holes in cgroup namespace plugged

 - internal cleanups

* tag 'cgroup-for-6.1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (25 commits)
  cgroup: use strscpy() is more robust and safer
  iocost_monitor: reorder BlkgIterator
  cgroup: simplify code in cgroup_apply_control
  cgroup: Make cgroup_get_from_id() prettier
  cgroup/cpuset: remove unreachable code
  cgroup: Remove CFTYPE_PRESSURE
  cgroup: Improve cftype add/rm error handling
  kselftest/cgroup: Add cpuset v2 partition root state test
  cgroup/cpuset: Update description of cpuset.cpus.partition in cgroup-v2.rst
  cgroup/cpuset: Make partition invalid if cpumask change violates exclusivity rule
  cgroup/cpuset: Relocate a code block in validate_change()
  cgroup/cpuset: Show invalid partition reason string
  cgroup/cpuset: Add a new isolated cpus.partition type
  cgroup/cpuset: Relax constraints to partition & cpus changes
  cgroup/cpuset: Allow no-task partition to have empty cpuset.cpus.effective
  cgroup/cpuset: Miscellaneous cleanups & add helper functions
  cgroup/cpuset: Enable update_tasks_cpumask() on top_cpuset
  cgroup: add pids.peak interface for pids controller
  cgroup: Remove data-race around cgrp_dfl_visible
  cgroup: Fix build failure when CONFIG_SHRINKER_DEBUG
  ...
@@ -2190,75 +2190,93 @@ Cpuset Interface Files

        It accepts only the following input values when written to.

-         ========      ================================
-         "root"        a partition root
-         "member"      a non-root member of a partition
-         ========      ================================
+         ==========    =====================================
+         "member"      Non-root member of a partition
+         "root"        Partition root
+         "isolated"    Partition root without load balancing
+         ==========    =====================================

-       When set to be a partition root, the current cgroup is the
-       root of a new partition or scheduling domain that comprises
-       itself and all its descendants except those that are separate
-       partition roots themselves and their descendants.  The root
-       cgroup is always a partition root.
+       The root cgroup is always a partition root and its state
+       cannot be changed.  All other non-root cgroups start out as
+       "member".

-       There are constraints on where a partition root can be set.
-       It can only be set in a cgroup if all the following conditions
-       are true.
+       When set to "root", the current cgroup is the root of a new
+       partition or scheduling domain that comprises itself and all
+       its descendants except those that are separate partition roots
+       themselves and their descendants.

-       1) The "cpuset.cpus" is not empty and the list of CPUs are
-          exclusive, i.e. they are not shared by any of its siblings.
-       2) The parent cgroup is a partition root.
-       3) The "cpuset.cpus" is also a proper subset of the parent's
-          "cpuset.cpus.effective".
-       4) There is no child cgroups with cpuset enabled.  This is for
-          eliminating corner cases that have to be handled if such a
-          condition is allowed.
+       When set to "isolated", the CPUs in that partition root will
+       be in an isolated state without any load balancing from the
+       scheduler.  Tasks placed in such a partition with multiple
+       CPUs should be carefully distributed and bound to each of the
+       individual CPUs for optimal performance.

-       Setting it to partition root will take the CPUs away from the
-       effective CPUs of the parent cgroup.  Once it is set, this
-       file cannot be reverted back to "member" if there are any child
-       cgroups with cpuset enabled.
+       The value shown in "cpuset.cpus.effective" of a partition root
+       is the CPUs that the partition root can dedicate to a potential
+       new child partition root.  The new child subtracts available
+       CPUs from its parent "cpuset.cpus.effective".

-       A parent partition cannot distribute all its CPUs to its
-       child partitions.  There must be at least one cpu left in the
-       parent partition.
+       A partition root ("root" or "isolated") can be in one of the
+       two possible states - valid or invalid.  An invalid partition
+       root is in a degraded state where some state information may
+       be retained, but behaves more like a "member".

-       Once becoming a partition root, changes to "cpuset.cpus" is
-       generally allowed as long as the first condition above is true,
-       the change will not take away all the CPUs from the parent
-       partition and the new "cpuset.cpus" value is a superset of its
-       children's "cpuset.cpus" values.
+       All possible state transitions among "member", "root" and
+       "isolated" are allowed.

-       Sometimes, external factors like changes to ancestors'
-       "cpuset.cpus" or cpu hotplug can cause the state of the partition
-       root to change.  On read, the "cpuset.sched.partition" file
-       can show the following values.
+       On read, the "cpuset.cpus.partition" file can show the following
+       values.

-         ==============        ==============================
-         "member"              Non-root member of a partition
-         "root"                Partition root
-         "root invalid"        Invalid partition root
-         ==============        ==============================
+         =============================  =====================================
+         "member"                       Non-root member of a partition
+         "root"                         Partition root
+         "isolated"                     Partition root without load balancing
+         "root invalid (<reason>)"      Invalid partition root
+         "isolated invalid (<reason>)"  Invalid isolated partition root
+         =============================  =====================================

-       It is a partition root if the first 2 partition root conditions
-       above are true and at least one CPU from "cpuset.cpus" is
-       granted by the parent cgroup.
+       In the case of an invalid partition root, a descriptive string on
+       why the partition is invalid is included within parentheses.

-       A partition root can become invalid if none of CPUs requested
-       in "cpuset.cpus" can be granted by the parent cgroup or the
-       parent cgroup is no longer a partition root itself.  In this
-       case, it is not a real partition even though the restriction
-       of the first partition root condition above will still apply.
-       The cpu affinity of all the tasks in the cgroup will then be
-       associated with CPUs in the nearest ancestor partition.
+       For a partition root to become valid, the following conditions
+       must be met.

-       An invalid partition root can be transitioned back to a
-       real partition root if at least one of the requested CPUs
-       can now be granted by its parent.  In this case, the cpu
-       affinity of all the tasks in the formerly invalid partition
-       will be associated to the CPUs of the newly formed partition.
-       Changing the partition state of an invalid partition root to
-       "member" is always allowed even if child cpusets are present.
+       1) The "cpuset.cpus" is exclusive with its siblings, i.e. they
+          are not shared by any of its siblings (exclusivity rule).
+       2) The parent cgroup is a valid partition root.
+       3) The "cpuset.cpus" is not empty and must contain at least
+          one of the CPUs from parent's "cpuset.cpus", i.e. they overlap.
+       4) The "cpuset.cpus.effective" cannot be empty unless there is
+          no task associated with this partition.
+
+       External events like hotplug or changes to "cpuset.cpus" can
+       cause a valid partition root to become invalid and vice versa.
+       Note that a task cannot be moved to a cgroup with empty
+       "cpuset.cpus.effective".
+
+       For a valid partition root with the sibling cpu exclusivity
+       rule enabled, changes made to "cpuset.cpus" that violate the
+       exclusivity rule will invalidate the partition as well as its
+       sibling partitions with conflicting cpuset.cpus values.  So
+       care must be taken in changing "cpuset.cpus".
+
+       A valid non-root parent partition may distribute out all its CPUs
+       to its child partitions when there is no task associated with it.
+
+       Care must be taken to change a valid partition root to
+       "member" as all its child partitions, if present, will become
+       invalid causing disruption to tasks running in those child
+       partitions.  These inactivated partitions could be recovered if
+       their parent is switched back to a partition root with a proper
+       set of "cpuset.cpus".
+
+       Poll and inotify events are triggered whenever the state of
+       "cpuset.cpus.partition" changes.  That includes changes caused
+       by write to "cpuset.cpus.partition", cpu hotplug or other
+       changes that modify the validity status of the partition.
+       This will allow user space agents to monitor unexpected changes
+       to "cpuset.cpus.partition" without the need to do continuous
+       polling.

 Device controller
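The cpuset.cpus.partition interface documented above is driven entirely through plain cgroupfs writes and reads. A minimal userspace sketch of the new flow, assuming cgroup2 is mounted at /sys/fs/cgroup and a child cgroup named "test" already exists (both are example paths, not part of this commit):

// SPDX-License-Identifier: GPL-2.0
/* Turn /sys/fs/cgroup/test into an isolated partition and report the
 * resulting state, e.g. "isolated" or "isolated invalid (<reason>)". */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *part = "/sys/fs/cgroup/test/cpuset.cpus.partition";
        char state[128];
        ssize_t n;
        int fd;

        fd = open(part, O_WRONLY);
        if (fd < 0 || write(fd, "isolated", 8) < 0)
                perror("write isolated");       /* e.g. empty cpuset.cpus */
        if (fd >= 0)
                close(fd);

        fd = open(part, O_RDONLY);
        if (fd < 0)
                return 1;
        n = read(fd, state, sizeof(state) - 1);
        if (n > 0) {
                state[n] = '\0';
                printf("partition state: %s", state);
        }
        close(fd);
        return 0;
}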
@@ -19,8 +19,8 @@ int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len)
                return -EINVAL;

        cgrp = cgroup_get_from_id(cgrp_id);
-       if (!cgrp)
-               return -ENOENT;
+       if (IS_ERR(cgrp))
+               return PTR_ERR(cgrp);
        css = cgroup_get_e_css(cgrp, &io_cgrp_subsys);
        if (!css) {
                ret = -ENOENT;
@@ -126,11 +126,11 @@ enum {
        CFTYPE_NO_PREFIX        = (1 << 3),     /* (DON'T USE FOR NEW FILES) no subsys prefix */
        CFTYPE_WORLD_WRITABLE   = (1 << 4),     /* (DON'T USE FOR NEW FILES) S_IWUGO */
        CFTYPE_DEBUG            = (1 << 5),     /* create when cgroup_debug */
-       CFTYPE_PRESSURE         = (1 << 6),     /* only if pressure feature is enabled */

        /* internal flags, do not use outside cgroup core proper */
        __CFTYPE_ONLY_ON_DFL    = (1 << 16),    /* only on default hierarchy */
        __CFTYPE_NOT_ON_DFL     = (1 << 17),    /* not on default hierarchy */
+       __CFTYPE_ADDED          = (1 << 18),
 };

 /*
@@ -384,7 +384,7 @@ struct cgroup {
        /*
         * The depth this cgroup is at.  The root is at depth zero and each
         * step down the hierarchy increments the level.  This along with
-        * ancestor_ids[] can determine whether a given cgroup is a
+        * ancestors[] can determine whether a given cgroup is a
         * descendant of another without traversing the hierarchy.
         */
        int level;
@@ -504,8 +504,8 @@ struct cgroup {
        /* Used to store internal freezer state */
        struct cgroup_freezer_state freezer;

-       /* ids of the ancestors at each level including self */
-       u64 ancestor_ids[];
+       /* All ancestors including self */
+       struct cgroup *ancestors[];
 };

 /*
@@ -522,11 +522,15 @@ struct cgroup_root {
        /* Unique id for this hierarchy. */
        int hierarchy_id;

-       /* The root cgroup.  Root is destroyed on its release. */
+       /*
+        * The root cgroup.  The containing cgroup_root will be destroyed on its
+        * release.  cgrp->ancestors[0] will be used overflowing into the
+        * following field.  cgrp_ancestor_storage must immediately follow.
+        */
        struct cgroup cgrp;

-       /* for cgrp->ancestor_ids[0] */
-       u64 cgrp_ancestor_id_storage;
+       /* must follow cgrp for cgrp->ancestors[0], see above */
+       struct cgroup *cgrp_ancestor_storage;

        /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
        atomic_t nr_cgrps;
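The trick documented in this hunk, letting the embedded root cgroup's flexible ancestors[] array spill into a dedicated field placed right behind it, can be illustrated standalone. A hedged sketch with made-up names; like the kernel, it relies on the GNU C layout behavior the comment spells out, so the storage field really must immediately follow the embedded struct:

#include <stdio.h>

struct node {
        int level;
        struct node *ancestors[];       /* flexible array member */
};

/* Mirrors cgroup_root: node.ancestors[0] has no storage of its own,
 * so it lands in ancestor_storage, which must immediately follow. */
struct root {
        struct node node;
        struct node *ancestor_storage;  /* backs node.ancestors[0] */
};

int main(void)
{
        struct root r = { .node.level = 0 };

        r.node.ancestors[0] = &r.node;  /* actually writes ancestor_storage */
        printf("%d\n", r.ancestor_storage == &r.node);  /* prints 1 */
        return 0;
}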
@@ -575,7 +575,7 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
 {
        if (cgrp->root != ancestor->root || cgrp->level < ancestor->level)
                return false;
-       return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor);
+       return cgrp->ancestors[ancestor->level] == ancestor;
 }

 /**
@@ -592,11 +592,9 @@ static inline bool cgroup_is_descendant(struct cgroup *cgrp,
 static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp,
                                             int ancestor_level)
 {
-       if (cgrp->level < ancestor_level)
+       if (ancestor_level < 0 || ancestor_level > cgrp->level)
                return NULL;
-       while (cgrp && cgrp->level > ancestor_level)
-               cgrp = cgroup_parent(cgrp);
-       return cgrp;
+       return cgrp->ancestors[ancestor_level];
 }

 /**
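The payoff of replacing ancestor_ids[] with ancestors[] shows in these two helpers: both the descendant test and the ancestor lookup become single array accesses instead of a comparison against a recomputed id or a parent-pointer walk. A toy userspace model of the same idea (all names here are illustrative, not kernel APIs):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct tnode {
        int level;
        struct tnode **ancestors;       /* [0..level], self at [level] */
};

static struct tnode *tnode_new(struct tnode *parent)
{
        int level = parent ? parent->level + 1 : 0;
        struct tnode *n = malloc(sizeof(*n));

        n->level = level;
        n->ancestors = calloc(level + 1, sizeof(*n->ancestors));
        for (int i = 0; i < level; i++)
                n->ancestors[i] = parent->ancestors[i];
        n->ancestors[level] = n;        /* self, as cgroup_create() does */
        return n;
}

/* O(1), like the new cgroup_is_descendant()/cgroup_ancestor() */
static bool is_descendant(struct tnode *n, struct tnode *anc)
{
        return n->level >= anc->level && n->ancestors[anc->level] == anc;
}

int main(void)
{
        struct tnode *root = tnode_new(NULL);
        struct tnode *a = tnode_new(root);
        struct tnode *b = tnode_new(a);

        printf("%d %d\n", is_descendant(b, root), is_descendant(a, b));
        return 0;       /* prints "1 0" */
}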
@@ -748,11 +746,6 @@ static inline bool task_under_cgroup_hierarchy(struct task_struct *task,

 static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
 {}
-
-static inline struct cgroup *cgroup_get_from_id(u64 id)
-{
-       return NULL;
-}
 #endif /* !CONFIG_CGROUPS */

 #ifdef CONFIG_CGROUPS
@@ -250,6 +250,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,

 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
                       bool threadgroup);
+void cgroup_attach_lock(bool lock_threadgroup);
+void cgroup_attach_unlock(bool lock_threadgroup);
 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                                             bool *locked)
        __acquires(&cgroup_threadgroup_rwsem);
@@ -59,8 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
        int retval = 0;

        mutex_lock(&cgroup_mutex);
-       cpus_read_lock();
-       percpu_down_write(&cgroup_threadgroup_rwsem);
+       cgroup_attach_lock(true);
        for_each_root(root) {
                struct cgroup *from_cgrp;
@@ -72,8 +71,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
                if (retval)
                        break;
        }
-       percpu_up_write(&cgroup_threadgroup_rwsem);
-       cpus_read_unlock();
+       cgroup_attach_unlock(true);
        mutex_unlock(&cgroup_mutex);

        return retval;
@@ -217,6 +217,7 @@ struct cgroup_namespace init_cgroup_ns = {

 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];

 /* cgroup optional features */
 enum cgroup_opt_features {
@@ -1689,12 +1690,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
        css->flags &= ~CSS_VISIBLE;

        if (!css->ss) {
-               if (cgroup_on_dfl(cgrp))
-                       cfts = cgroup_base_files;
-               else
-                       cfts = cgroup1_base_files;
-
-               cgroup_addrm_files(css, cgrp, cfts, false);
+               if (cgroup_on_dfl(cgrp)) {
+                       cgroup_addrm_files(css, cgrp,
+                                          cgroup_base_files, false);
+                       if (cgroup_psi_enabled())
+                               cgroup_addrm_files(css, cgrp,
+                                                  cgroup_psi_files, false);
+               } else {
+                       cgroup_addrm_files(css, cgrp,
+                                          cgroup1_base_files, false);
+               }
        } else {
                list_for_each_entry(cfts, &css->ss->cfts, node)
                        cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1717,14 +1722,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
                return 0;

        if (!css->ss) {
-               if (cgroup_on_dfl(cgrp))
-                       cfts = cgroup_base_files;
-               else
-                       cfts = cgroup1_base_files;
-
-               ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
-               if (ret < 0)
-                       return ret;
+               if (cgroup_on_dfl(cgrp)) {
+                       ret = cgroup_addrm_files(&cgrp->self, cgrp,
+                                                cgroup_base_files, true);
+                       if (ret < 0)
+                               return ret;
+
+                       if (cgroup_psi_enabled()) {
+                               ret = cgroup_addrm_files(&cgrp->self, cgrp,
+                                                        cgroup_psi_files, true);
+                               if (ret < 0)
+                                       return ret;
+                       }
+               } else {
+                       cgroup_addrm_files(css, cgrp,
+                                          cgroup1_base_files, true);
+               }
        } else {
                list_for_each_entry(cfts, &css->ss->cfts, node) {
                        ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -2050,7 +2063,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
        }
        root_cgrp->kn = kernfs_root_to_node(root->kf_root);
        WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
-       root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+       root_cgrp->ancestors[0] = root_cgrp;

        ret = css_populate_dir(&root_cgrp->self);
        if (ret)
@@ -2173,7 +2186,7 @@ static int cgroup_get_tree(struct fs_context *fc)
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int ret;

-       cgrp_dfl_visible = true;
+       WRITE_ONCE(cgrp_dfl_visible, true);
        cgroup_get_live(&cgrp_dfl_root.cgrp);
        ctx->root = &cgrp_dfl_root;
@@ -2361,7 +2374,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
                ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
        } else {
                /* if no hierarchy exists, everyone is in "/" */
-               ret = strlcpy(buf, "/", buflen);
+               ret = strscpy(buf, "/", buflen);
        }

        spin_unlock_irq(&css_set_lock);
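The strlcpy() to strscpy() conversion is about return semantics: strlcpy() returns strlen(src) (and keeps reading src past the copy bound), while strscpy() returns the number of bytes copied or -E2BIG on truncation. A userspace model of strscpy()'s contract, as an assumption-level sketch rather than the kernel implementation:

#include <stdio.h>
#include <string.h>

#define SSCPY_E2BIG (-7)        /* stand-in for the kernel's -E2BIG */

/* Models strscpy(): always NUL-terminates, never reads src beyond
 * what it copies, and signals truncation instead of returning
 * strlen(src) like strlcpy() does. */
static long strscpy_model(char *dst, const char *src, size_t size)
{
        size_t i;

        if (!size)
                return SSCPY_E2BIG;
        for (i = 0; i < size - 1 && src[i]; i++)
                dst[i] = src[i];
        dst[i] = '\0';
        return src[i] ? SSCPY_E2BIG : (long)i;
}

int main(void)
{
        char buf[4];

        printf("%ld\n", strscpy_model(buf, "/", sizeof(buf)));      /* 1 */
        printf("%ld\n", strscpy_model(buf, "abcdef", sizeof(buf))); /* -7 */
        return 0;
}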
@@ -2393,7 +2406,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path);
  * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
  * CPU hotplug is disabled on entry.
  */
-static void cgroup_attach_lock(bool lock_threadgroup)
+void cgroup_attach_lock(bool lock_threadgroup)
 {
        cpus_read_lock();
        if (lock_threadgroup)
@@ -2404,7 +2417,7 @@ static void cgroup_attach_lock(bool lock_threadgroup)
  * cgroup_attach_unlock - Undo cgroup_attach_lock()
  * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
  */
-static void cgroup_attach_unlock(bool lock_threadgroup)
+void cgroup_attach_unlock(bool lock_threadgroup)
 {
        if (lock_threadgroup)
                percpu_up_write(&cgroup_threadgroup_rwsem);
@@ -3292,11 +3305,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
         * making the following cgroup_update_dfl_csses() properly update
         * css associations of all tasks in the subtree.
         */
-       ret = cgroup_update_dfl_csses(cgrp);
-       if (ret)
-               return ret;
-
-       return 0;
+       return cgroup_update_dfl_csses(cgrp);
 }

 /**
@@ -4132,8 +4141,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 restart:
        for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
                /* does cft->flags tell us to skip this file on @cgrp? */
-               if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
-                       continue;
                if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
                        continue;
                if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -4198,21 +4205,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
                cft->ss = NULL;

                /* revert flags set by cgroup core while adding @cfts */
-               cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+               cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+                               __CFTYPE_ADDED);
        }
 }

 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
        struct cftype *cft;
+       int ret = 0;

        for (cft = cfts; cft->name[0] != '\0'; cft++) {
                struct kernfs_ops *kf_ops;

                WARN_ON(cft->ss || cft->kf_ops);

-               if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
-                       continue;
+               if (cft->flags & __CFTYPE_ADDED) {
+                       ret = -EBUSY;
+                       break;
+               }

                if (cft->seq_start)
                        kf_ops = &cgroup_kf_ops;
@@ -4226,26 +4237,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
                if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
                        kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
                        if (!kf_ops) {
-                               cgroup_exit_cftypes(cfts);
-                               return -ENOMEM;
+                               ret = -ENOMEM;
+                               break;
                        }
                        kf_ops->atomic_write_len = cft->max_write_len;
                }

                cft->kf_ops = kf_ops;
                cft->ss = ss;
+               cft->flags |= __CFTYPE_ADDED;
        }

-       return 0;
+       if (ret)
+               cgroup_exit_cftypes(cfts);
+       return ret;
 }

 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
 {
        lockdep_assert_held(&cgroup_mutex);

-       if (!cfts || !cfts[0].ss)
-               return -ENOENT;
-
        list_del(&cfts->node);
        cgroup_apply_cftypes(cfts, false);
        cgroup_exit_cftypes(cfts);
@@ -4267,6 +4278,12 @@ int cgroup_rm_cftypes(struct cftype *cfts)
 {
        int ret;

+       if (!cfts || cfts[0].name[0] == '\0')
+               return 0;
+
+       if (!(cfts[0].flags & __CFTYPE_ADDED))
+               return -ENOENT;
+
        mutex_lock(&cgroup_mutex);
        ret = cgroup_rm_cftypes_locked(cfts);
        mutex_unlock(&cgroup_mutex);
@@ -5151,10 +5168,13 @@ static struct cftype cgroup_base_files[] = {
                .name = "cpu.stat",
                .seq_show = cpu_stat_show,
        },
+       { }     /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
 #ifdef CONFIG_PSI
        {
                .name = "io.pressure",
-               .flags = CFTYPE_PRESSURE,
                .seq_show = cgroup_io_pressure_show,
                .write = cgroup_io_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -5162,7 +5182,6 @@ static struct cftype cgroup_base_files[] = {
        },
        {
                .name = "memory.pressure",
-               .flags = CFTYPE_PRESSURE,
                .seq_show = cgroup_memory_pressure_show,
                .write = cgroup_memory_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -5170,7 +5189,6 @@ static struct cftype cgroup_base_files[] = {
        },
        {
                .name = "cpu.pressure",
-               .flags = CFTYPE_PRESSURE,
                .seq_show = cgroup_cpu_pressure_show,
                .write = cgroup_cpu_pressure_write,
                .poll = cgroup_pressure_poll,
@@ -5452,8 +5470,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
        int ret;

        /* allocate the cgroup and its ID, 0 is reserved for the root */
-       cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
-                      GFP_KERNEL);
+       cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
        if (!cgrp)
                return ERR_PTR(-ENOMEM);
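struct_size(p, member, n) from <linux/overflow.h> computes sizeof(*p) plus n elements of the flexible array member, which is why the member name in the allocation has to change along with the struct. A simplified userspace equivalent; the real kernel macro additionally saturates on arithmetic overflow, which this sketch omits:

#include <stdlib.h>

struct cgrp_like {
        int level;
        struct cgrp_like *ancestors[];  /* flexible array member */
};

/* Simplified struct_size(); no overflow saturation here. */
#define STRUCT_SIZE(p, member, n) \
        (sizeof(*(p)) + (n) * sizeof(*(p)->member))

int main(void)
{
        int level = 3;
        struct cgrp_like *c;

        /* one ancestors[] slot per ancestor level, plus self */
        c = calloc(1, STRUCT_SIZE(c, ancestors, level + 1));
        if (!c)
                return 1;
        c->level = level;
        free(c);
        return 0;
}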
@@ -5505,7 +5522,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,

        spin_lock_irq(&css_set_lock);
        for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
-               cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+               cgrp->ancestors[tcgrp->level] = tcgrp;

                if (tcgrp != cgrp) {
                        tcgrp->nr_descendants++;
@@ -5938,6 +5955,7 @@ int __init cgroup_init(void)

        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+       BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

        cgroup_rstat_boot();
@@ -6058,19 +6076,22 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
 /*
  * cgroup_get_from_id : get the cgroup associated with cgroup id
  * @id: cgroup id
- * On success return the cgrp, on failure return NULL
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
  */
 struct cgroup *cgroup_get_from_id(u64 id)
 {
        struct kernfs_node *kn;
-       struct cgroup *cgrp = NULL;
+       struct cgroup *cgrp, *root_cgrp;

        kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
        if (!kn)
-               goto out;
+               return ERR_PTR(-ENOENT);

-       if (kernfs_type(kn) != KERNFS_DIR)
-               goto put;
+       if (kernfs_type(kn) != KERNFS_DIR) {
+               kernfs_put(kn);
+               return ERR_PTR(-ENOENT);
+       }

        rcu_read_lock();
@@ -6079,9 +6100,19 @@ struct cgroup *cgroup_get_from_id(u64 id)
                cgrp = NULL;

        rcu_read_unlock();
-put:
        kernfs_put(kn);
-out:
+
+       if (!cgrp)
+               return ERR_PTR(-ENOENT);
+
+       spin_lock_irq(&css_set_lock);
+       root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+       spin_unlock_irq(&css_set_lock);
+       if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+               cgroup_put(cgrp);
+               return ERR_PTR(-ENOENT);
+       }
+
        return cgrp;
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_id);
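Switching cgroup_get_from_id() from a NULL return to the ERR_PTR() convention is what forces the caller-side changes in blk-cgroup and memcontrol elsewhere in this merge. The encoding itself is compact; a sketch with minimal userspace stand-ins for the <linux/err.h> helpers (the kernel versions are essentially these, plus annotations):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* Errors live in the top 4095 values of the pointer space, so one
 * return slot can carry either a valid pointer or a -errno code. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

static void *lookup(int ok)     /* toy stand-in for cgroup_get_from_id() */
{
        static int obj;

        return ok ? (void *)&obj : ERR_PTR(-ENOENT);
}

int main(void)
{
        void *p = lookup(0);

        if (IS_ERR(p))
                printf("error %ld\n", PTR_ERR(p));      /* error -2 */
        return 0;
}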
@@ -6111,7 +6142,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
        struct cgroup *cgrp;
        int ssid, count = 0;

-       if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+       if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
                continue;

        seq_printf(m, "%d:", root->hierarchy_id);
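This hunk and the cgroup_get_tree() one above are two halves of the same data-race fix: the flag is stored on the mount path and loaded locklessly here, so both accesses get READ_ONCE()/WRITE_ONCE() annotations that forbid tearing and re-fetching and mark the race as intentional for KCSAN. A compilable sketch with minimal stand-ins for the kernel macros (the real ones in <linux/compiler.h> do additional checking):

#include <stdbool.h>

#define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))
#define READ_ONCE(x)       (*(volatile __typeof__(x) *)&(x))

static bool dfl_visible;        /* example flag, like cgrp_dfl_visible */

void mount_side(void)
{
        WRITE_ONCE(dfl_visible, true);  /* intentional lockless store */
}

bool show_side(void)
{
        return READ_ONCE(dfl_visible);  /* intentional lockless load */
}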
@@ -6653,8 +6684,12 @@ struct cgroup *cgroup_get_from_path(const char *path)
 {
        struct kernfs_node *kn;
        struct cgroup *cgrp = ERR_PTR(-ENOENT);
+       struct cgroup *root_cgrp;

-       kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
+       spin_lock_irq(&css_set_lock);
+       root_cgrp = current_cgns_cgroup_from_root(&cgrp_dfl_root);
+       kn = kernfs_walk_and_get(root_cgrp->kn, path);
+       spin_unlock_irq(&css_set_lock);
        if (!kn)
                goto out;
@@ -6812,9 +6847,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
                if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
                        continue;

-               if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
-                       continue;
-
                if (prefix)
                        ret += snprintf(buf + ret, size - ret, "%s.", prefix);
@@ -6834,8 +6866,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
        int ssid;
        ssize_t ret = 0;

-       ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
-                                    NULL);
+       ret = show_delegatable_files(cgroup_base_files, buf + ret,
+                                    PAGE_SIZE - ret, NULL);
+       if (cgroup_psi_enabled())
+               ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+                                             PAGE_SIZE - ret, NULL);

        for_each_subsys(ss, ssid)
                ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
File diff suppressed because it is too large.
@@ -47,6 +47,7 @@ struct pids_cgroup {
         */
        atomic64_t counter;
        atomic64_t limit;
+       int64_t watermark;

        /* Handle for "pids.events" */
        struct cgroup_file events_file;
@@ -85,6 +86,16 @@ static void pids_css_free(struct cgroup_subsys_state *css)
        kfree(css_pids(css));
 }

+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+       /*
+        * This is racy, but we don't need perfectly accurate tallying of
+        * the watermark, and this lets us avoid extra atomic overhead.
+        */
+       if (nr_pids > READ_ONCE(p->watermark))
+               WRITE_ONCE(p->watermark, nr_pids);
+}
+
 /**
  * pids_cancel - uncharge the local pid count
  * @pids: the pid cgroup state
@@ -128,8 +139,11 @@ static void pids_charge(struct pids_cgroup *pids, int num)
 {
        struct pids_cgroup *p;

-       for (p = pids; parent_pids(p); p = parent_pids(p))
-               atomic64_add(num, &p->counter);
+       for (p = pids; parent_pids(p); p = parent_pids(p)) {
+               int64_t new = atomic64_add_return(num, &p->counter);
+
+               pids_update_watermark(p, new);
+       }
 }

 /**
@@ -156,6 +170,12 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
                 */
                if (new > limit)
                        goto revert;
+
+               /*
+                * Not technically accurate if we go over limit somewhere up
+                * the hierarchy, but that's tolerable for the watermark.
+                */
+               pids_update_watermark(p, new);
        }

        return 0;
@@ -311,6 +331,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
        return atomic64_read(&pids->counter);
 }

+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+                         struct cftype *cft)
+{
+       struct pids_cgroup *pids = css_pids(css);
+
+       return READ_ONCE(pids->watermark);
+}
+
 static int pids_events_show(struct seq_file *sf, void *v)
 {
        struct pids_cgroup *pids = css_pids(seq_css(sf));
@@ -331,6 +359,11 @@ static struct cftype pids_files[] = {
                .read_s64 = pids_current_read,
                .flags = CFTYPE_NOT_ON_ROOT,
        },
+       {
+               .name = "peak",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .read_s64 = pids_peak_read,
+       },
        {
                .name = "events",
                .seq_show = pids_events_show,
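With the cftype in place, the watermark appears as a plain read-only file next to pids.current. A small sketch of consuming it from userspace; the cgroup path is only an example:

#include <stdio.h>

int main(void)
{
        /* example path; any cgroup with the pids controller enabled */
        FILE *f = fopen("/sys/fs/cgroup/mycgroup/pids.peak", "r");
        long long peak;

        if (!f)
                return 1;
        if (fscanf(f, "%lld", &peak) == 1)
                printf("max pids ever used: %lld\n", peak);
        fclose(f);
        return 0;
}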
@@ -5104,8 +5104,8 @@ struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
        struct mem_cgroup *memcg;

        cgrp = cgroup_get_from_id(ino);
-       if (!cgrp)
-               return ERR_PTR(-ENOENT);
+       if (IS_ERR(cgrp))
+               return ERR_CAST(cgrp);

        css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
        if (css)
@@ -40,16 +40,17 @@ static noinline bool
 nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level)
 {
        struct cgroup *cgrp;
+       u64 cgid;

        if (!sk_fullsock(sk))
                return false;

-       cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-       if (level > cgrp->level)
+       cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level);
+       if (!cgrp)
                return false;

-       memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64));
-
+       cgid = cgroup_id(cgrp);
+       memcpy(dest, &cgid, sizeof(u64));
        return true;
 }
 #endif
@@ -61,6 +61,11 @@ autop_names = {
 }

 class BlkgIterator:
+    def __init__(self, root_blkcg, q_id, include_dying=False):
+        self.include_dying = include_dying
+        self.blkgs = []
+        self.walk(root_blkcg, q_id, '')
+
     def blkcg_name(blkcg):
         return blkcg.css.cgroup.kn.name.string_().decode('utf-8')
@@ -82,11 +87,6 @@ class BlkgIterator:
                           blkcg.css.children.address_of_(), 'css.sibling'):
             self.walk(c, q_id, path)

-    def __init__(self, root_blkcg, q_id, include_dying=False):
-        self.include_dying = include_dying
-        self.blkgs = []
-        self.walk(root_blkcg, q_id, '')
-
     def __iter__(self):
         return iter(self.blkgs)
@@ -77,7 +77,7 @@ static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
                        break;

                // convert cgroup-id to a map index
-               cgrp_id = BPF_CORE_READ(cgrp, ancestor_ids[i]);
+               cgrp_id = BPF_CORE_READ(cgrp, ancestors[i], kn, id);
                elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
                if (!elem)
                        continue;
tools/testing/selftests/cgroup/.gitignore (vendored; 1 line changed):
@@ -5,3 +5,4 @@ test_freezer
 test_kmem
 test_kill
 test_cpu
+wait_inotify
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0
 CFLAGS += -Wall -pthread

-all:
+all: ${HELPER_PROGS}

 TEST_FILES := with_stress.sh
-TEST_PROGS := test_stress.sh
+TEST_PROGS := test_stress.sh test_cpuset_prs.sh
+TEST_GEN_FILES := wait_inotify
 TEST_GEN_PROGS = test_memcontrol
 TEST_GEN_PROGS += test_kmem
 TEST_GEN_PROGS += test_core
tools/testing/selftests/cgroup/test_cpuset_prs.sh (new executable file, 674 lines; diff suppressed because it is too large)

tools/testing/selftests/cgroup/wait_inotify.c (new file, 87 lines):
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Wait until an inotify event on the given cgroup file.
+ */
+#include <linux/limits.h>
+#include <sys/inotify.h>
+#include <sys/mman.h>
+#include <sys/ptrace.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static const char usage[] = "Usage: %s [-v] <cgroup_file>\n";
+static char *file;
+static int verbose;
+
+static inline void fail_message(char *msg)
+{
+       fprintf(stderr, msg, file);
+       exit(1);
+}
+
+int main(int argc, char *argv[])
+{
+       char *cmd = argv[0];
+       int c, fd;
+       struct pollfd fds = { .events = POLLIN, };
+
+       while ((c = getopt(argc, argv, "v")) != -1) {
+               switch (c) {
+               case 'v':
+                       verbose++;
+                       break;
+               }
+               argv++, argc--;
+       }
+
+       if (argc != 2) {
+               fprintf(stderr, usage, cmd);
+               return -1;
+       }
+       file = argv[1];
+       fd = open(file, O_RDONLY);
+       if (fd < 0)
+               fail_message("Cgroup file %s not found!\n");
+       close(fd);
+
+       fd = inotify_init();
+       if (fd < 0)
+               fail_message("inotify_init() fails on %s!\n");
+       if (inotify_add_watch(fd, file, IN_MODIFY) < 0)
+               fail_message("inotify_add_watch() fails on %s!\n");
+       fds.fd = fd;
+
+       /*
+        * poll waiting loop
+        */
+       for (;;) {
+               int ret = poll(&fds, 1, 10000);
+
+               if (ret < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       perror("poll");
+                       exit(1);
+               }
+               if ((ret > 0) && (fds.revents & POLLIN))
+                       break;
+       }
+       if (verbose) {
+               struct inotify_event events[10];
+               long len;
+
+               usleep(1000);
+               len = read(fd, events, sizeof(events));
+               printf("Number of events read = %ld\n",
+                       len/sizeof(struct inotify_event));
+       }
+       close(fd);
+       return 0;
+}