/*
 *  Generic process-grouping system.
 *
 *  Based originally on the cpuset system, extracted by Paul Menage
 *  Copyright (C) 2006 Google, Inc
 *
 *  Notifications support
 *  Copyright (C) 2009 Nokia Corporation
 *  Author: Kirill A. Shutemov
 *
 *  Copyright notices from the original cpuset code:
 *  --------------------------------------------------
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  ---------------------------------------------------
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */
#include <linux/cgroup.h>
#include <linux/cred.h>
#include <linux/ctype.h>
#include <linux/errno.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hashtable.h>
#include <linux/namei.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_task */
#include <linux/kthread.h>

#include <linux/atomic.h>
/*
 * cgroup_mutex is the master lock.  Any modification to cgroup or its
 * hierarchy must be performed while holding it.
 *
 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify
 * cgroupfs_root of any cgroup hierarchy - subsys list, flags,
 * release_agent_path and so on.  Modifying requires both cgroup_mutex and
 * cgroup_root_mutex.  Readers can acquire either of the two.  This is to
 * break the following locking order cycle.
 *
 *  A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem
 *  B. namespace_sem -> cgroup_mutex
 *
 * B happens only through cgroup_show_options() and using cgroup_root_mutex
 * breaks it.
 */
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
EXPORT_SYMBOL_GPL(cgroup_mutex);	/* only for lockdep */
#else
static DEFINE_MUTEX(cgroup_mutex);
#endif

static DEFINE_MUTEX(cgroup_root_mutex);
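/*
 * Editor's sketch (not from the original file): under the ordering
 * documented above, a writer that needs to modify root state takes both
 * mutexes in nesting order, while a reader such as cgroup_show_options()
 * may take cgroup_root_mutex alone:
 *
 *	mutex_lock(&cgroup_mutex);
 *	mutex_lock(&cgroup_root_mutex);
 *	...modify root->flags / root->release_agent_path...
 *	mutex_unlock(&cgroup_root_mutex);
 *	mutex_unlock(&cgroup_mutex);
 */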
/*
 * Generate an array of cgroup subsystem pointers. At boot time, this is
 * populated with the built-in subsystems, and modular subsystems are
 * registered after that. The mutable section of this array is protected by
 * cgroup_mutex.
 */
#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys,
#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option)
static struct cgroup_subsys *cgroup_subsys[CGROUP_SUBSYS_COUNT] = {
#include <linux/cgroup_subsys.h>
};

/*
 * The dummy hierarchy, reserved for the subsystems that are otherwise
 * unattached - it never has more than a single cgroup, and all tasks are
 * part of that cgroup.
 */
static struct cgroupfs_root cgroup_dummy_root;

/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
/*
 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
 */
struct cfent {
	struct list_head	node;
	struct dentry		*dentry;
	struct cftype		*type;

	/* file xattrs */
	struct simple_xattrs	xattrs;
};

/*
 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
 * cgroup_subsys->use_id != 0.
 */
#define CSS_ID_MAX	(65535)
struct css_id {
	/*
	 * The css to which this ID points. This pointer is set to a valid
	 * value after cgroup is populated. If cgroup is removed, this will
	 * be NULL. This pointer is expected to be RCU-safe because
	 * destroy() is called after synchronize_rcu(). But for safe use,
	 * css_tryget() should be used for avoiding race.
	 */
	struct cgroup_subsys_state __rcu *css;
	/*
	 * ID of this css.
	 */
	unsigned short id;
	/*
	 * Depth in hierarchy which this ID belongs to.
	 */
	unsigned short depth;
	/*
	 * ID is freed by RCU. (and lookup routine is RCU safe.)
	 */
	struct rcu_head rcu_head;
	/*
	 * Hierarchy of CSS ID belongs to.
	 */
	unsigned short stack[0]; /* Array of Length (depth+1) */
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct cgroup_event {
	/*
	 * Cgroup which the event belongs to.
	 */
	struct cgroup *cgrp;
	/*
	 * Control file with which the event is associated.
	 */
	struct cftype *cft;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these is stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct remove;
};
/* The list of hierarchy roots */

static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;

/*
 * Hierarchy ID allocation and mapping.  It follows the same exclusion
 * rules as other root ops - both cgroup_mutex and cgroup_root_mutex for
 * writes, either for reads.
 */
static DEFINE_IDR(cgroup_hierarchy_idr);

static struct cgroup_name root_cgroup_name = { .name = "/" };

/*
 * Assign a monotonically increasing serial number to cgroups.  It
 * guarantees cgroups with bigger numbers are newer than those with smaller
 * numbers.  Also, as cgroups are always appended to the parent's
 * ->children list, it guarantees that sibling cgroups are always sorted in
 * the ascending serial number order on the list.  Protected by
 * cgroup_mutex.
 */
static u64 cgroup_serial_nr_next = 1;
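/*
 * Editor's sketch (illustrative, not in the original): because ->children
 * stays sorted by serial number, an iterator that lost its position could
 * resume by scanning for the first sibling with a larger serial number,
 * under cgroup_mutex:
 *
 *	list_for_each_entry(next, &parent->children, sibling)
 *		if (next->serial_nr > pos_serial_nr)
 *			break;
 */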
/* This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call. This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
static int need_forkexit_callback __read_mostly;

static struct cftype cgroup_base_files[];

static void cgroup_offline_fn(struct work_struct *work);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add);
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
{
	return test_bit(CGRP_DEAD, &cgrp->flags);
}

/**
 * cgroup_is_descendant - test ancestry
 * @cgrp: the cgroup to be tested
 * @ancestor: possible ancestor of @cgrp
 *
 * Test whether @cgrp is a descendant of @ancestor.  It also returns %true
 * if @cgrp == @ancestor.  This function is safe to call as long as @cgrp
 * and @ancestor are accessible.
 */
bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
{
	while (cgrp) {
		if (cgrp == ancestor)
			return true;
		cgrp = cgrp->parent;
	}
	return false;
}
EXPORT_SYMBOL_GPL(cgroup_is_descendant);
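/*
 * Editor's usage sketch (hypothetical caller, not in the original):
 *
 *	if (cgroup_is_descendant(task_cgrp, allowed_root))
 *		return 0;	/-* task_cgrp is inside, or is, allowed_root *-/
 *	return -EPERM;
 */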
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
	const int bits =
		(1 << CGRP_RELEASABLE) |
		(1 << CGRP_NOTIFY_ON_RELEASE);
	return (cgrp->flags & bits) == bits;
}

static int notify_on_release(const struct cgroup *cgrp)
{
	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
/**
 * for_each_subsys - iterate all loaded cgroup subsystems
 * @ss: the iteration cursor
 * @i: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * Should be called under cgroup_mutex.
 */
#define for_each_subsys(ss, i)						\
	for ((i) = 0; (i) < CGROUP_SUBSYS_COUNT; (i)++)			\
		if (({ lockdep_assert_held(&cgroup_mutex);		\
		       !((ss) = cgroup_subsys[i]); })) { }		\
		else

/**
 * for_each_builtin_subsys - iterate all built-in cgroup subsystems
 * @ss: the iteration cursor
 * @i: the index of @ss, CGROUP_BUILTIN_SUBSYS_COUNT after reaching the end
 *
 * Built-in subsystems are always present and iteration itself doesn't
 * require any synchronization.
 */
#define for_each_builtin_subsys(ss, i)					\
	for ((i) = 0; (i) < CGROUP_BUILTIN_SUBSYS_COUNT &&		\
	     (((ss) = cgroup_subsys[i]) || true); (i)++)

/* iterate each subsystem attached to a hierarchy */
#define for_each_root_subsys(root, ss)					\
	list_for_each_entry((ss), &(root)->subsys_list, sibling)

/* iterate across the active hierarchies */
#define for_each_active_root(root)					\
	list_for_each_entry((root), &cgroup_roots, root_list)
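/*
 * Editor's usage sketch (not in the original): a walk over all loaded
 * subsystems under cgroup_mutex looks like
 *
 *	struct cgroup_subsys *ss;
 *	int i;
 *
 *	for_each_subsys(ss, i)
 *		pr_info("subsys %d: %s\n", i, ss->name);
 *
 * NULL slots left by unloaded modular subsystems are skipped by the
 * !((ss) = cgroup_subsys[i]) test in the macro body.
 */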
static inline struct cgroup *__d_cgrp(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

static inline struct cfent *__d_cfe(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

static inline struct cftype *__d_cft(struct dentry *dentry)
{
	return __d_cfe(dentry)->type;
}
/**
 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
 * @cgrp: the cgroup to be checked for liveness
 *
 * On success, returns true; the mutex should be later unlocked.  On
 * failure returns false with no lock held.
 */
static bool cgroup_lock_live_group(struct cgroup *cgrp)
{
	mutex_lock(&cgroup_mutex);
	if (cgroup_is_dead(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		return false;
	}
	return true;
}
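/*
 * Editor's usage sketch (not in the original): callers unlock only on
 * success, typically
 *
 *	if (!cgroup_lock_live_group(cgrp))
 *		return -ENODEV;		/-* raced with removal *-/
 *	...operate on the still-live cgroup...
 *	mutex_unlock(&cgroup_mutex);
 */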
/* the list of cgroups eligible for automatic release. Protected by
 * release_list_lock */
static LIST_HEAD(release_list);
static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);

/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
};
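/*
 * Editor's sketch of the two traversals this link structure enables
 * (assumes css_set_lock is held; not part of the original file):
 *
 *	/-* every css_set with tasks in @cgrp *-/
 *	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 *		...use link->cset...
 *
 *	/-* every cgroup @cset belongs to, one per hierarchy *-/
 *	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 *		...use link->cgrp...
 */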
/* The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
static struct css_set init_css_set;
static struct cgrp_cset_link init_cgrp_cset_link;
static int cgroup_init_idr(struct cgroup_subsys *ss,
			   struct cgroup_subsys_state *css);

/* css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set.  Nests outside task->alloc_lock
 * due to cgroup_iter_start() */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

/*
 * hash table for cgroup groups. This improves the performance of finding
 * an existing css_set. This hash doesn't (currently) take into
 * account cgroups in empty hierarchies.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);

static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long key = 0UL;
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i)
		key += (unsigned long)css[i];
	key = (key >> 16) ^ key;

	return key;
}
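/*
 * Editor's note (sketch, not in the original): the key is the sum of the
 * css pointers folded by (key >> 16) ^ key, so css_sets sharing every css
 * hash to the same bucket; find_existing_css_set() below pairs it with
 * compare_css_sets() to resolve collisions:
 *
 *	key = css_set_hash(template);
 *	hash_for_each_possible(css_set_table, cset, hlist, key)
 *		...compare_css_sets(cset, ...)...
 */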
/* We don't maintain the lists running through each css_set to its
 * task until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
 * compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;
static void __put_css_set(struct css_set *cset, int taskexit)
{
	struct cgrp_cset_link *link, *tmp_link;

	/*
	 * Ensure that the refcount doesn't hit zero while any readers
	 * can see it. Similar to atomic_dec_and_lock(), but for an
	 * rwlock
	 */
	if (atomic_add_unless(&cset->refcount, -1, 1))
		return;
	write_lock(&css_set_lock);
	if (!atomic_dec_and_test(&cset->refcount)) {
		write_unlock(&css_set_lock);
		return;
	}

	/* This css_set is dead. unlink it and release cgroup refcounts */
	hash_del(&cset->hlist);
	css_set_count--;

	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
		struct cgroup *cgrp = link->cgrp;

		list_del(&link->cset_link);
		list_del(&link->cgrp_link);

		/* @cgrp can't go away while we're holding css_set_lock */
		if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
			if (taskexit)
				set_bit(CGRP_RELEASABLE, &cgrp->flags);
			check_for_release(cgrp);
		}

		kfree(link);
	}

	write_unlock(&css_set_lock);
	kfree_rcu(cset, rcu_head);
}

/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cset)
{
	atomic_inc(&cset->refcount);
}

static inline void put_css_set(struct css_set *cset)
{
	__put_css_set(cset, 0);
}

static inline void put_css_set_taskexit(struct css_set *cset)
{
	__put_css_set(cset, 1);
}
/**
 * compare_css_sets - helper function for find_existing_css_set().
 * @cset: candidate css_set being tested
 * @old_cset: existing css_set for a task
 * @new_cgrp: cgroup that's being entered by the task
 * @template: desired set of css pointers in css_set (pre-calculated)
 *
 * Returns true if "cset" matches "old_cset" except for the hierarchy
 * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 */
static bool compare_css_sets(struct css_set *cset,
			     struct css_set *old_cset,
			     struct cgroup *new_cgrp,
			     struct cgroup_subsys_state *template[])
{
	struct list_head *l1, *l2;

	if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
		/* Not all subsystems matched */
		return false;
	}

	/*
	 * Compare cgroup pointers in order to distinguish between
	 * different cgroups in hierarchies with no subsystems. We
	 * could get by with just this check alone (and skip the
	 * memcmp above) but on most setups the memcmp check will
	 * avoid the need for this more expensive check on almost all
	 * candidates.
	 */
	l1 = &cset->cgrp_links;
	l2 = &old_cset->cgrp_links;
	while (1) {
		struct cgrp_cset_link *link1, *link2;
		struct cgroup *cgrp1, *cgrp2;

		l1 = l1->next;
		l2 = l2->next;
		/* See if we reached the end - both lists are equal length. */
		if (l1 == &cset->cgrp_links) {
			BUG_ON(l2 != &old_cset->cgrp_links);
			break;
		} else {
			BUG_ON(l2 == &old_cset->cgrp_links);
		}
		/* Locate the cgroups associated with these links. */
		link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
		link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
		cgrp1 = link1->cgrp;
		cgrp2 = link2->cgrp;
		/* Hierarchies should be linked in the same order. */
		BUG_ON(cgrp1->root != cgrp2->root);

		/*
		 * If this hierarchy is the hierarchy of the cgroup
		 * that's changing, then we need to check that this
		 * css_set points to the new cgroup; if it's any other
		 * hierarchy, then this css_set should point to the
		 * same cgroup as the old css_set.
		 */
		if (cgrp1->root == new_cgrp->root) {
			if (cgrp1 != new_cgrp)
				return false;
		} else {
			if (cgrp1 != cgrp2)
				return false;
		}
	}
	return true;
}
/**
 * find_existing_css_set - init css array and find the matching css_set
 * @old_cset: the css_set that we're using before the cgroup transition
 * @cgrp: the cgroup that we're moving into
 * @template: out param for the new set of csses, should be clear on entry
 */
static struct css_set *find_existing_css_set(struct css_set *old_cset,
					     struct cgroup *cgrp,
					     struct cgroup_subsys_state *template[])
{
	struct cgroupfs_root *root = cgrp->root;
	struct cgroup_subsys *ss;
	struct css_set *cset;
	unsigned long key;
	int i;

	/*
	 * Build the set of subsystem state objects that we want to see in the
	 * new css_set. while subsystems can change globally, the entries here
	 * won't change, so no need for locking.
	 */
	for_each_subsys(ss, i) {
		if (root->subsys_mask & (1UL << i)) {
			/* Subsystem is in this hierarchy. So we want
			 * the subsystem state from the new
			 * cgroup */
			template[i] = cgrp->subsys[i];
		} else {
			/* Subsystem is not in this hierarchy, so we
			 * don't want to change the subsystem state */
			template[i] = old_cset->subsys[i];
		}
	}

	key = css_set_hash(template);
	hash_for_each_possible(css_set_table, cset, hlist, key) {
		if (!compare_css_sets(cset, old_cset, cgrp, template))
			continue;

		/* This css_set matches what we need */
		return cset;
	}

	/* No existing cgroup group matched */
	return NULL;
}
static void free_cgrp_cset_links(struct list_head *links_to_free)
{
	struct cgrp_cset_link *link, *tmp_link;

	list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
		list_del(&link->cset_link);
		kfree(link);
	}
}

/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}

/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));
	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;
	list_move(&link->cset_link, &cgrp->cset_links);
	/*
	 * Always add links to the tail of the list so that the list
	 * is sorted by order of hierarchy creation
	 */
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
}
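/*
 * Editor's sketch of the allocate-then-commit pattern these helpers
 * support (mirrors find_css_set() below; not in the original):
 *
 *	LIST_HEAD(tmp_links);
 *
 *	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0)
 *		return NULL;			/-* nothing to unwind *-/
 *	write_lock(&css_set_lock);
 *	link_css_set(&tmp_links, cset, cgrp);	/-* consumes one link *-/
 *	write_unlock(&css_set_lock);
 *
 * All allocation happens outside css_set_lock, so the locked section
 * cannot fail.
 */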
/**
 * find_css_set - return a new css_set with one cgroup updated
 * @old_cset: the baseline css_set
 * @cgrp: the cgroup to be updated
 *
 * Return a new css_set that's equivalent to @old_cset, but with @cgrp
 * substituted into the appropriate hierarchy.
 */
static struct css_set *find_css_set(struct css_set *old_cset,
				    struct cgroup *cgrp)
{
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
	struct css_set *cset;
	struct list_head tmp_links;
	struct cgrp_cset_link *link;
	unsigned long key;

	lockdep_assert_held(&cgroup_mutex);

	/* First see if we already have a cgroup group that matches
	 * the desired set */
	read_lock(&css_set_lock);
	cset = find_existing_css_set(old_cset, cgrp, template);
	if (cset)
		get_css_set(cset);
	read_unlock(&css_set_lock);

	if (cset)
		return cset;

	cset = kzalloc(sizeof(*cset), GFP_KERNEL);
	if (!cset)
		return NULL;

	/* Allocate all the cgrp_cset_link objects that we'll need */
	if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
		kfree(cset);
		return NULL;
	}

	atomic_set(&cset->refcount, 1);
	INIT_LIST_HEAD(&cset->cgrp_links);
	INIT_LIST_HEAD(&cset->tasks);
	INIT_HLIST_NODE(&cset->hlist);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(cset->subsys, template, sizeof(cset->subsys));

	write_lock(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;

		if (c->root == cgrp->root)
			c = cgrp;
		link_css_set(&tmp_links, cset, c);
	}

	BUG_ON(!list_empty(&tmp_links));

	css_set_count++;

	/* Add this cgroup group to the hash table */
	key = css_set_hash(cset->subsys);
	hash_add(css_set_table, &cset->hlist, key);

	write_unlock(&css_set_lock);

	return cset;
}
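/*
 * Editor's sketch (hypothetical caller, not in the original): the task
 * migration path uses this roughly as
 *
 *	old_cset = task_css_set(tsk);
 *	new_cset = find_css_set(old_cset, dst_cgrp);
 *	if (!new_cset)
 *		return -ENOMEM;
 *	/-* point tsk at new_cset, move it onto new_cset->tasks,
 *	   then drop the old reference with put_css_set(old_cset) *-/
 */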
/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex held.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
					    struct cgroupfs_root *root)
{
	struct css_set *cset;
	struct cgroup *res = NULL;

	BUG_ON(!mutex_is_locked(&cgroup_mutex));
	read_lock(&css_set_lock);
	/*
	 * No need to lock the task - since we hold cgroup_mutex the
	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css is set back to init_css_set.
	 */
	cset = task_css_set(task);
	if (cset == &init_css_set) {
		res = &root->top_cgroup;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}
	read_unlock(&css_set_lock);
	BUG_ON(!res);
	return res;
}
/*
 * There is one global cgroup mutex. We also require taking
 * task_lock() when dereferencing a task's cgroup subsys pointers.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing. However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again. Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
 * increment the count). So code holding cgroup_mutex can safely
 * assume that if the count is zero, it will stay zero. Similarly, if
 * a task holds cgroup_mutex on a cgroup with zero count, it
 * knows that the cgroup won't be removed, as cgroup_rmdir()
 * needs that mutex.
 *
 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
 * (usually) take cgroup_mutex. These are the two most performance
 * critical pieces of code here. The exception occurs on cgroup_exit(),
 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
 * is taken, and if the cgroup count is zero, a usermode call is made
 * to the release agent with the name of the cgroup (path relative to
 * the root of cgroup file system) as the argument.
 *
 * A cgroup can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cgroups is empty. Since all
 * tasks in the system use _some_ cgroup, and since there is always at
 * least one task in the system (init, pid == 1), therefore, top_cgroup
 * always has either children cgroups and/or using tasks. So we don't
 * need a special hack to ensure that top_cgroup cannot be deleted.
 *
 * The task_lock() exception
 *
 * The need for this exception arises from the action of
 * cgroup_attach_task(), which overwrites one task's cgroup pointer with
 * another. It does so using cgroup_mutex, however there are
 * several performance critical places that need to reference
 * task->cgroup without the expense of grabbing a system global
 * mutex. Therefore except as noted below, when dereferencing or, as
 * in cgroup_attach_task(), modifying a task's cgroup pointer we use
 * task_lock(), which acts on a spinlock (task->alloc_lock) already in
 * the task_struct routinely used for such matters.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a tasks cgroup pointer by cgroup_attach_task()
 */
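/*
 * Editor's sketch of the task_lock() exception described above (not part
 * of the original file):
 *
 *	task_lock(tsk);			/-* takes tsk->alloc_lock *-/
 *	cset = task_css_set(tsk);	/-* now safe to dereference *-/
 *	...
 *	task_unlock(tsk);
 */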
/*
 * A couple of forward declarations required, due to cyclic reference loop:
 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
 * -> cgroup_mkdir.
 */
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
static const struct inode_operations cgroup_dir_inode_operations;
static const struct file_operations proc_cgroupstats_operations;

static struct backing_dev_info cgroup_backing_dev_info = {
	.name		= "cgroup",
	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static int alloc_css_id(struct cgroup_subsys *ss,
			struct cgroup *parent, struct cgroup *child);

static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	if (inode) {
		inode->i_ino = get_next_ino();
		inode->i_mode = mode;
		inode->i_uid = current_fsuid();
		inode->i_gid = current_fsgid();
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
	}
	return inode;
}

static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
{
	struct cgroup_name *name;

	name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL);
	if (!name)
		return NULL;
	strcpy(name->name, dentry->d_name.name);
	return name;
}
static void cgroup_free_fn(struct work_struct *work)
{
	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
	struct cgroup_subsys *ss;

	mutex_lock(&cgroup_mutex);
	/*
	 * Release the subsystem state objects.
	 */
	for_each_root_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

		ss->css_free(css);
	}

	cgrp->root->number_of_cgroups--;
	mutex_unlock(&cgroup_mutex);

	/*
	 * We get a ref to the parent's dentry, and put the ref when
	 * this cgroup is being freed, so it's guaranteed that the
	 * parent won't be destroyed before its children.
	 */
	dput(cgrp->parent->dentry);

	/*
	 * Drop the active superblock reference that we took when we
	 * created the cgroup. This will free cgrp->root, if we are
	 * holding the last reference to @sb.
	 */
	deactivate_super(cgrp->root->sb);

	/*
	 * if we're getting rid of the cgroup, refcount should ensure
	 * that there are no pidlists left.
	 */
	BUG_ON(!list_empty(&cgrp->pidlists));

	simple_xattrs_free(&cgrp->xattrs);

	kfree(rcu_dereference_raw(cgrp->name));
	kfree(cgrp);
}

static void cgroup_free_rcu(struct rcu_head *head)
{
	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);

	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
	schedule_work(&cgrp->destroy_work);
}
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
	/* is dentry a directory ? if so, kfree() associated cgroup */
	if (S_ISDIR(inode->i_mode)) {
		struct cgroup *cgrp = dentry->d_fsdata;

		BUG_ON(!(cgroup_is_dead(cgrp)));
		call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
	} else {
		struct cfent *cfe = __d_cfe(dentry);
		struct cgroup *cgrp = dentry->d_parent->d_fsdata;

		WARN_ONCE(!list_empty(&cfe->node) &&
			  cgrp != &cgrp->root->top_cgroup,
			  "cfe still linked for %s\n", cfe->type->name);
		simple_xattrs_free(&cfe->xattrs);
		kfree(cfe);
	}
	iput(inode);
}

static int cgroup_delete(const struct dentry *d)
{
	return 1;
}

static void remove_dir(struct dentry *d)
{
	struct dentry *parent = dget(d->d_parent);

	d_delete(d);
	simple_rmdir(parent->d_inode, d);
	dput(parent);
}
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
{
	struct cfent *cfe;

	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
	lockdep_assert_held(&cgroup_mutex);

	/*
	 * If we're doing cleanup due to failure of cgroup_create(),
	 * the corresponding @cfe may not exist.
	 */
	list_for_each_entry(cfe, &cgrp->files, node) {
		struct dentry *d = cfe->dentry;

		if (cft && cfe->type != cft)
			continue;

		dget(d);
		d_delete(d);
		simple_unlink(cgrp->dentry->d_inode, d);
		list_del_init(&cfe->node);
		dput(d);

		break;
	}
}

/**
 * cgroup_clear_dir - remove subsys files in a cgroup directory
 * @cgrp: target cgroup
 * @subsys_mask: mask of the subsystem ids whose files should be removed
 */
static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
	struct cgroup_subsys *ss;
	int i;

	for_each_subsys(ss, i) {
		struct cftype_set *set;

		if (!test_bit(i, &subsys_mask))
			continue;
		list_for_each_entry(set, &ss->cftsets, node)
			cgroup_addrm_files(cgrp, set->cfts, false);
	}
}
/*
 * NOTE : the dentry must have been dget()'ed
 */
static void cgroup_d_remove_dir(struct dentry *dentry)
{
	struct dentry *parent;

	parent = dentry->d_parent;
	spin_lock(&parent->d_lock);
	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
	list_del_init(&dentry->d_u.d_child);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&parent->d_lock);
	remove_dir(dentry);
}
/*
 * Call with cgroup_mutex held. Drops reference counts on modules, including
 * any duplicate ones that parse_cgroupfs_options took. If this function
 * returns an error, no reference counts are touched.
 */
static int rebind_subsystems(struct cgroupfs_root *root,
			     unsigned long added_mask, unsigned removed_mask)
{
	struct cgroup *cgrp = &root->top_cgroup;
	struct cgroup_subsys *ss;
	unsigned long pinned = 0;
	int i, ret;

	BUG_ON(!mutex_is_locked(&cgroup_mutex));
	BUG_ON(!mutex_is_locked(&cgroup_root_mutex));

	/* Check that any added subsystems are currently free */
	for_each_subsys(ss, i) {
		if (!(added_mask & (1 << i)))
			continue;

		/* is the subsystem mounted elsewhere? */
		if (ss->root != &cgroup_dummy_root) {
			ret = -EBUSY;
			goto out_put;
		}

		/* pin the module */
		if (!try_module_get(ss->module)) {
			ret = -ENOENT;
			goto out_put;
		}
		pinned |= 1 << i;
	}

	/* subsys could be missing if unloaded between parsing and here */
	if (added_mask != pinned) {
		ret = -ENOENT;
		goto out_put;
	}

	ret = cgroup_populate_dir(cgrp, added_mask);
	if (ret)
		goto out_put;

	/*
	 * Nothing can fail from this point on.  Remove files for the
	 * removed subsystems and rebind each subsystem.
	 */
	cgroup_clear_dir(cgrp, removed_mask);

	for_each_subsys(ss, i) {
		unsigned long bit = 1UL << i;

		if (bit & added_mask) {
			/* We're binding this subsystem to this hierarchy */
			BUG_ON(cgrp->subsys[i]);
			BUG_ON(!cgroup_dummy_top->subsys[i]);
			BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top);

			cgrp->subsys[i] = cgroup_dummy_top->subsys[i];
			cgrp->subsys[i]->cgroup = cgrp;
			list_move(&ss->sibling, &root->subsys_list);
			ss->root = root;
			if (ss->bind)
				ss->bind(cgrp->subsys[i]);

			/* refcount was already taken, and we're keeping it */
			root->subsys_mask |= bit;
		} else if (bit & removed_mask) {
			/* We're removing this subsystem */
			BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]);
			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);

			if (ss->bind)
				ss->bind(cgroup_dummy_top->subsys[i]);
			cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top;
			cgrp->subsys[i] = NULL;
			cgroup_subsys[i]->root = &cgroup_dummy_root;
			list_move(&ss->sibling, &cgroup_dummy_root.subsys_list);

			/* subsystem is now free - drop reference on module */
			module_put(ss->module);
			root->subsys_mask &= ~bit;
		}
	}

	/*
	 * Mark @root has finished binding subsystems.  @root->subsys_mask
	 * now matches the bound subsystems.
	 */
	root->flags |= CGRP_ROOT_SUBSYS_BOUND;

	return 0;

out_put:
	for_each_subsys(ss, i)
		if (pinned & (1 << i))
			module_put(ss->module);
	return ret;
}
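/*
 * Editor's note (not in the original): rebind_subsystems() is ordered so
 * that every failure point - busy subsystem, module pinning, and
 * cgroup_populate_dir() - comes before any state changes; once the files
 * for @added_mask exist, the bind/unbind loop cannot fail, so callers
 * need no rollback:
 *
 *	ret = rebind_subsystems(root, added_mask, removed_mask);
 *	if (ret)
 *		goto out_unlock;	/-* nothing to roll back *-/
 */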
static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
{
	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
	struct cgroup_subsys *ss;

	mutex_lock(&cgroup_root_mutex);
	for_each_root_subsys(root, ss)
		seq_printf(seq, ",%s", ss->name);
	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
		seq_puts(seq, ",sane_behavior");
	if (root->flags & CGRP_ROOT_NOPREFIX)
		seq_puts(seq, ",noprefix");
	if (root->flags & CGRP_ROOT_XATTR)
		seq_puts(seq, ",xattr");
	if (strlen(root->release_agent_path))
		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
		seq_puts(seq, ",clone_children");
	if (strlen(root->name))
		seq_printf(seq, ",name=%s", root->name);
	mutex_unlock(&cgroup_root_mutex);
	return 0;
}
struct cgroup_sb_opts {
	unsigned long subsys_mask;
	unsigned long flags;
	char *release_agent;
	bool cpuset_clone_children;
	char *name;
	/* User explicitly requested empty subsystem */
	bool none;

	struct cgroupfs_root *new_root;
};
/*
 * Convert a hierarchy specifier into a bitmask of subsystems and
 * flags. Call with cgroup_mutex held to protect the cgroup_subsys[]
 * array. This function takes refcounts on subsystems to be used, unless it
 * returns error, in which case no refcounts are taken.
 */
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
	char *token, *o = data;
	bool all_ss = false, one_ss = false;
	unsigned long mask = (unsigned long)-1;
	struct cgroup_subsys *ss;
	int i;

	BUG_ON(!mutex_is_locked(&cgroup_mutex));

#ifdef CONFIG_CPUSETS
	mask = ~(1UL << cpuset_subsys_id);
#endif

	memset(opts, 0, sizeof(*opts));

	while ((token = strsep(&o, ",")) != NULL) {
		if (!*token)
			return -EINVAL;
		if (!strcmp(token, "none")) {
			/* Explicitly have no subsystems */
			opts->none = true;
			continue;
		}
		if (!strcmp(token, "all")) {
			/* Mutually exclusive option 'all' + subsystem name */
			if (one_ss)
				return -EINVAL;
			all_ss = true;
			continue;
		}
		if (!strcmp(token, "__DEVEL__sane_behavior")) {
			opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
			continue;
		}
		if (!strcmp(token, "noprefix")) {
			opts->flags |= CGRP_ROOT_NOPREFIX;
			continue;
		}
		if (!strcmp(token, "clone_children")) {
			opts->cpuset_clone_children = true;
			continue;
		}
		if (!strcmp(token, "xattr")) {
			opts->flags |= CGRP_ROOT_XATTR;
			continue;
		}
		if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			opts->release_agent =
				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
			continue;
		}
		if (!strncmp(token, "name=", 5)) {
			const char *name = token + 5;
			/* Can't specify an empty name */
			if (!strlen(name))
				return -EINVAL;
			/* Must match [\w.-]+ */
			for (i = 0; i < strlen(name); i++) {
				char c = name[i];
				if (isalnum(c))
					continue;
				if ((c == '.') || (c == '-') || (c == '_'))
					continue;
				return -EINVAL;
			}
			/* Specifying two names is forbidden */
			if (opts->name)
				return -EINVAL;
			opts->name = kstrndup(name,
					      MAX_CGROUP_ROOT_NAMELEN - 1,
					      GFP_KERNEL);
			if (!opts->name)
				return -ENOMEM;

			continue;
		}

		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name))
				continue;
			if (ss->disabled)
				continue;

			/* Mutually exclusive option 'all' + subsystem name */
			if (all_ss)
				return -EINVAL;
			set_bit(i, &opts->subsys_mask);
			one_ss = true;

			break;
		}
		if (i == CGROUP_SUBSYS_COUNT)
			return -ENOENT;
	}

	/*
	 * If the 'all' option was specified select all the subsystems,
	 * otherwise if 'none', 'name=' and subsystem name options were
	 * not specified, let's default to 'all'
	 */
	if (all_ss || (!one_ss && !opts->none && !opts->name))
		for_each_subsys(ss, i)
			if (!ss->disabled)
				set_bit(i, &opts->subsys_mask);

	/* Consistency checks */

	if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
		pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");

		if (opts->flags & CGRP_ROOT_NOPREFIX) {
			pr_err("cgroup: sane_behavior: noprefix is not allowed\n");
			return -EINVAL;
		}

		if (opts->cpuset_clone_children) {
			pr_err("cgroup: sane_behavior: clone_children is not allowed\n");
			return -EINVAL;
		}
	}

	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
	if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
		return -EINVAL;

	/* Can't specify "none" and some subsystems */
	if (opts->subsys_mask && opts->none)
		return -EINVAL;

	/*
	 * We either have to specify by name or by subsystems. (So all
	 * empty hierarchies must have a name).
	 */
	if (!opts->subsys_mask && !opts->name)
		return -EINVAL;

	return 0;
}
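/*
 * Editor's note (illustrative, not in the original): this parses the
 * comma-separated data of mounts such as
 *
 *	mount -t cgroup -o cpuset,noprefix none /dev/cpuset
 *	mount -t cgroup -o none,name=systemd none /sys/fs/cgroup/systemd
 *
 * where the second form is a named, subsystem-less hierarchy (opts->none
 * plus opts->name), which the consistency checks above explicitly permit.
 */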
static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
	int ret = 0;
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	struct cgroup_sb_opts opts;
	unsigned long added_mask, removed_mask;

	if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
		pr_err("cgroup: sane_behavior: remount is not allowed\n");
		return -EINVAL;
	}

	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
	mutex_lock(&cgroup_mutex);
	mutex_lock(&cgroup_root_mutex);

	/* See what subsystems are wanted */
	ret = parse_cgroupfs_options(data, &opts);
	if (ret)
		goto out_unlock;

	if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
			   task_tgid_nr(current), current->comm);

	added_mask = opts.subsys_mask & ~root->subsys_mask;
	removed_mask = root->subsys_mask & ~opts.subsys_mask;

	/* Don't allow flags or name to change at remount */
	if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
	    (opts.name && strcmp(opts.name, root->name))) {
		pr_err("cgroup: option or name mismatch, new: 0x%lx \"%s\", old: 0x%lx \"%s\"\n",
		       opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
		       root->flags & CGRP_ROOT_OPTION_MASK, root->name);
		ret = -EINVAL;
		goto out_unlock;
	}

	/* remounting is not allowed for populated hierarchies */
	if (root->number_of_cgroups > 1) {
		ret = -EBUSY;
		goto out_unlock;
	}

	ret = rebind_subsystems(root, added_mask, removed_mask);
	if (ret)
		goto out_unlock;

	if (opts.release_agent)
		strcpy(root->release_agent_path, opts.release_agent);
 out_unlock:
	kfree(opts.release_agent);
	kfree(opts.name);
	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
	return ret;
}
static const struct super_operations cgroup_ops = {
	.statfs = simple_statfs,
	.drop_inode = generic_delete_inode,
	.show_options = cgroup_show_options,
	.remount_fs = cgroup_remount,
};

static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
	INIT_LIST_HEAD(&cgrp->sibling);
	INIT_LIST_HEAD(&cgrp->children);
	INIT_LIST_HEAD(&cgrp->files);
	INIT_LIST_HEAD(&cgrp->cset_links);
	INIT_LIST_HEAD(&cgrp->release_list);
	INIT_LIST_HEAD(&cgrp->pidlists);
	mutex_init(&cgrp->pidlist_mutex);
	INIT_LIST_HEAD(&cgrp->event_list);
	spin_lock_init(&cgrp->event_list_lock);
	simple_xattrs_init(&cgrp->xattrs);
}
static void init_cgroup_root(struct cgroupfs_root *root)
{
	struct cgroup *cgrp = &root->top_cgroup;

	INIT_LIST_HEAD(&root->subsys_list);
	INIT_LIST_HEAD(&root->root_list);
	root->number_of_cgroups = 1;
	cgrp->root = root;
	RCU_INIT_POINTER(cgrp->name, &root_cgroup_name);
	init_cgroup_housekeeping(cgrp);
	idr_init(&root->cgroup_idr);
}

static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
{
	int id;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&cgroup_root_mutex);

	id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
			      GFP_KERNEL);
	if (id < 0)
		return id;

	root->hierarchy_id = id;
	return 0;
}

static void cgroup_exit_root_id(struct cgroupfs_root *root)
{
	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&cgroup_root_mutex);

	if (root->hierarchy_id) {
		idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
		root->hierarchy_id = 0;
	}
}
static int cgroup_test_super(struct super_block *sb, void *data)
{
	struct cgroup_sb_opts *opts = data;
	struct cgroupfs_root *root = sb->s_fs_info;

	/* If we asked for a name then it must match */
	if (opts->name && strcmp(opts->name, root->name))
		return 0;

	/*
	 * If we asked for subsystems (or explicitly for no
	 * subsystems) then they must match
	 */
	if ((opts->subsys_mask || opts->none)
	    && (opts->subsys_mask != root->subsys_mask))
		return 0;

	return 1;
}
static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
{
	struct cgroupfs_root *root;

	if (!opts->subsys_mask && !opts->none)
		return NULL;

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	init_cgroup_root(root);

	/*
	 * We need to set @root->subsys_mask now so that @root can be
	 * matched by cgroup_test_super() before it finishes
	 * initialization; otherwise, competing mounts with the same
	 * options may try to bind the same subsystems instead of waiting
	 * for the first one leading to unexpected mount errors.
	 * SUBSYS_BOUND will be set once actual binding is complete.
	 */
	root->subsys_mask = opts->subsys_mask;
	root->flags = opts->flags;
	if (opts->release_agent)
		strcpy(root->release_agent_path, opts->release_agent);
	if (opts->name)
		strcpy(root->name, opts->name);
	if (opts->cpuset_clone_children)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags);
	return root;
}
static void cgroup_free_root(struct cgroupfs_root *root)
{
	if (root) {
		/* hierarchy ID should already have been released */
		WARN_ON_ONCE(root->hierarchy_id);

		idr_destroy(&root->cgroup_idr);
		kfree(root);
	}
}
static int cgroup_set_super(struct super_block *sb, void *data)
{
	int ret;
	struct cgroup_sb_opts *opts = data;

	/* If we don't have a new root, we can't set up a new sb */
	if (!opts->new_root)
		return -EINVAL;

	BUG_ON(!opts->subsys_mask && !opts->none);

	ret = set_anon_super(sb, NULL);
	if (ret)
		return ret;

	sb->s_fs_info = opts->new_root;
	opts->new_root->sb = sb;

	sb->s_blocksize = PAGE_CACHE_SIZE;
	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
	sb->s_magic = CGROUP_SUPER_MAGIC;
	sb->s_op = &cgroup_ops;

	return 0;
}
static int cgroup_get_rootdir(struct super_block *sb)
{
	static const struct dentry_operations cgroup_dops = {
		.d_iput = cgroup_diput,
		.d_delete = cgroup_delete,
	};

	struct inode *inode =
		cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);

	if (!inode)
		return -ENOMEM;

	inode->i_fop = &simple_dir_operations;
	inode->i_op = &cgroup_dir_inode_operations;
	/* directories start off with i_nlink == 2 (for "." entry) */
	inc_nlink(inode);
	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		return -ENOMEM;
	/* for everything else we want ->d_op set */
	sb->s_d_op = &cgroup_dops;
	return 0;
}
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data)
{
	struct cgroup_sb_opts opts;
	struct cgroupfs_root *root;
	int ret = 0;
	struct super_block *sb;
	struct cgroupfs_root *new_root;
	struct list_head tmp_links;
	struct inode *inode;
	const struct cred *cred;

	/* First find the desired set of subsystems */
	mutex_lock(&cgroup_mutex);
	ret = parse_cgroupfs_options(data, &opts);
	mutex_unlock(&cgroup_mutex);
	if (ret)
		goto out_err;

	/*
	 * Allocate a new cgroup root. We may not need it if we're
	 * reusing an existing hierarchy.
	 */
	new_root = cgroup_root_from_opts(&opts);
	if (IS_ERR(new_root)) {
		ret = PTR_ERR(new_root);
		goto out_err;
	}
	opts.new_root = new_root;

	/* Locate an existing or new sb for this hierarchy */
	sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts);
	if (IS_ERR(sb)) {
		ret = PTR_ERR(sb);
		cgroup_free_root(opts.new_root);
		goto out_err;
	}

	root = sb->s_fs_info;
	BUG_ON(!root);
	if (root == opts.new_root) {
		/* We used the new root structure, so this is a new hierarchy */
		struct cgroup *root_cgrp = &root->top_cgroup;
		struct cgroupfs_root *existing_root;
		int i;
		struct css_set *cset;

		BUG_ON(sb->s_root != NULL);

		ret = cgroup_get_rootdir(sb);
		if (ret)
			goto drop_new_super;
		inode = sb->s_root->d_inode;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		mutex_lock(&cgroup_root_mutex);

		root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
					  0, 1, GFP_KERNEL);
		if (root_cgrp->id < 0) {
			ret = root_cgrp->id;
			goto unlock_drop;
		}

		/* Check for name clashes with existing mounts */
		ret = -EBUSY;
		if (strlen(root->name))
			for_each_active_root(existing_root)
				if (!strcmp(existing_root->name, root->name))
					goto unlock_drop;

		/*
		 * We're accessing css_set_count without locking
		 * css_set_lock here, but that's OK - it can only be
		 * increased by someone holding cgroup_lock, and
		 * that's us. The worst that can happen is that we
		 * have some link structures left over
		 */
		ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
		if (ret)
			goto unlock_drop;

		/* ID 0 is reserved for dummy root, 1 for unified hierarchy */
		ret = cgroup_init_root_id(root, 2, 0);
		if (ret)
			goto unlock_drop;

		sb->s_root->d_fsdata = root_cgrp;
		root_cgrp->dentry = sb->s_root;

		/*
		 * We're inside get_sb() and will call lookup_one_len() to
		 * create the root files, which doesn't work if SELinux is
		 * in use. The following cred dancing somehow works around
		 * it. See 2ce9738ba ("cgroupfs: use init_cred when
		 * populating new cgroupfs mount") for more details.
		 */
		cred = override_creds(&init_cred);

		ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
		if (ret)
			goto rm_base_files;

		ret = rebind_subsystems(root, root->subsys_mask, 0);
		if (ret)
			goto rm_base_files;

		revert_creds(cred);

		/*
		 * There must be no failure case after here, since rebinding
		 * takes care of subsystems' refcounts, which are explicitly
		 * dropped in the failure exit path.
		 */

		list_add(&root->root_list, &cgroup_roots);
		cgroup_root_count++;

		/* Link the top cgroup in this hierarchy into all
		 * the css_set objects */
		write_lock(&css_set_lock);
		hash_for_each(css_set_table, i, cset, hlist)
			link_css_set(&tmp_links, cset, root_cgrp);
		write_unlock(&css_set_lock);

		free_cgrp_cset_links(&tmp_links);

		BUG_ON(!list_empty(&root_cgrp->children));
		BUG_ON(root->number_of_cgroups != 1);

		mutex_unlock(&cgroup_root_mutex);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);
	} else {
		/*
		 * We re-used an existing hierarchy - the new root (if
		 * any) is not needed
		 */
		cgroup_free_root(opts.new_root);

		if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
			if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
				pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
				ret = -EINVAL;
				goto drop_new_super;
			} else {
				pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
			}
		}
	}

	kfree(opts.release_agent);
	kfree(opts.name);
	return dget(sb->s_root);

 rm_base_files:
	free_cgrp_cset_links(&tmp_links);
	cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false);
	revert_creds(cred);
 unlock_drop:
	cgroup_exit_root_id(root);
	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&inode->i_mutex);
 drop_new_super:
	deactivate_locked_super(sb);
 out_err:
	kfree(opts.release_agent);
	kfree(opts.name);
	return ERR_PTR(ret);
}
static void cgroup_kill_sb(struct super_block *sb) {
	struct cgroupfs_root *root = sb->s_fs_info;
	struct cgroup *cgrp = &root->top_cgroup;
	struct cgrp_cset_link *link, *tmp_link;
	int ret;

	BUG_ON(!root);

	BUG_ON(root->number_of_cgroups != 1);
	BUG_ON(!list_empty(&cgrp->children));

	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
	mutex_lock(&cgroup_mutex);
	mutex_lock(&cgroup_root_mutex);

	/* Rebind all subsystems back to the default hierarchy */
	if (root->flags & CGRP_ROOT_SUBSYS_BOUND) {
		ret = rebind_subsystems(root, 0, root->subsys_mask);
		/* Shouldn't be able to fail ... */
		BUG_ON(ret);
	}

	/*
	 * Release all the links from cset_links to this hierarchy's
	 * root cgroup
	 */
	write_lock(&css_set_lock);

	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
		list_del(&link->cset_link);
		list_del(&link->cgrp_link);
		kfree(link);
	}
	write_unlock(&css_set_lock);

	if (!list_empty(&root->root_list)) {
		list_del(&root->root_list);
		cgroup_root_count--;
	}

	cgroup_exit_root_id(root);

	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

	simple_xattrs_free(&cgrp->xattrs);

	kill_litter_super(sb);
	cgroup_free_root(root);
}
static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
};

static struct kobject *cgroup_kobj;
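
/*
 * Illustrative note (not part of the original file): registering
 * cgroup_fs_type makes hierarchies mountable as, e.g.,
 *
 *	mount -t cgroup -o cpuset none /sys/fs/cgroup/cpuset
 *
 * which reaches cgroup_mount() with "cpuset" in the option string; an
 * existing superblock with matching options is reused via
 * cgroup_test_super(), otherwise a new hierarchy is set up.
 */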
/**
 * cgroup_path - generate the path of a cgroup
 * @cgrp: the cgroup in question
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Writes path of cgroup into buf.  Returns 0 on success, -errno on error.
 *
 * We can't generate cgroup path using dentry->d_name, as accessing
 * dentry->name must be protected by irq-unsafe dentry->d_lock or parent
 * inode's i_mutex, while on the other hand cgroup_path() can be called
 * with some irq-safe spinlocks held.
 */
int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
{
	int ret = -ENAMETOOLONG;
	char *start;

	if (!cgrp->parent) {
		if (strlcpy(buf, "/", buflen) >= buflen)
			return -ENAMETOOLONG;
		return 0;
	}

	start = buf + buflen - 1;
	*start = '\0';

	rcu_read_lock();
	do {
		const char *name = cgroup_name(cgrp);
		int len;

		len = strlen(name);
		if ((start -= len) < buf)
			goto out;
		memcpy(start, name, len);

		if (--start < buf)
			goto out;
		*start = '/';

		cgrp = cgrp->parent;
	} while (cgrp->parent);
	ret = 0;
	memmove(buf, start, buf + buflen - start);
out:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_path);
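
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * supplies its own buffer and checks the return value, e.g.
 *
 *	char buf[256];	(any reasonable size; a path isn't guaranteed to fit)
 *
 *	if (!cgroup_path(cgrp, buf, sizeof(buf)))
 *		pr_info("cgroup: %s\n", buf);
 *
 * -ENAMETOOLONG simply means the path didn't fit in the buffer.
 */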
/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
 * Returns 0 on success, fails with -%ENAMETOOLONG if @buflen is too short.
 */
int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
	struct cgroupfs_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1, ret = 0;

	if (buflen < 2)
		return -ENAMETOOLONG;

	mutex_lock(&cgroup_mutex);

	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

	if (root) {
		cgrp = task_cgroup_from_root(task, root);
		ret = cgroup_path(cgrp, buf, buflen);
	} else {
		/* if no hierarchy exists, everyone is in "/" */
		memcpy(buf, "/", 2);
	}

	mutex_unlock(&cgroup_mutex);
	return ret;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
/*
 * Control Group taskset
 */
struct task_and_cgroup {
	struct task_struct	*task;
	struct cgroup		*cgrp;
	struct css_set		*cset;
};

struct cgroup_taskset {
	struct task_and_cgroup	single;
	struct flex_array	*tc_array;
	int			tc_array_len;
	int			idx;
	struct cgroup		*cur_cgrp;
};

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
	if (tset->tc_array) {
		tset->idx = 0;
		return cgroup_taskset_next(tset);
	} else {
		tset->cur_cgrp = tset->single.cgrp;
		return tset->single.task;
	}
}
EXPORT_SYMBOL_GPL(cgroup_taskset_first);

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
	struct task_and_cgroup *tc;

	if (!tset->tc_array || tset->idx >= tset->tc_array_len)
		return NULL;

	tc = flex_array_get(tset->tc_array, tset->idx++);
	tset->cur_cgrp = tc->cgrp;
	return tc->task;
}
EXPORT_SYMBOL_GPL(cgroup_taskset_next);

/**
 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task
 * @tset: taskset of interest
 *
 * Return the cgroup for the current (last returned) task of @tset.  This
 * function must be preceded by either cgroup_taskset_first() or
 * cgroup_taskset_next().
 */
struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset)
{
	return tset->cur_cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup);

/**
 * cgroup_taskset_size - return the number of tasks in taskset
 * @tset: taskset of interest
 */
int cgroup_taskset_size(struct cgroup_taskset *tset)
{
	return tset->tc_array ? tset->tc_array_len : 1;
}
EXPORT_SYMBOL_GPL(cgroup_taskset_size);
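
/*
 * Illustrative sketch (not part of the original file): a controller's
 * ->attach() callback walks the taskset with the iterators above, e.g.
 *
 *	static void example_attach(struct cgroup_subsys_state *css,
 *				   struct cgroup_taskset *tset)
 *	{
 *		struct task_struct *task;
 *
 *		for (task = cgroup_taskset_first(tset); task;
 *		     task = cgroup_taskset_next(tset))
 *			example_move_task(task,
 *					  cgroup_taskset_cur_cgroup(tset));
 *	}
 *
 * example_attach() and example_move_task() are hypothetical names used
 * only to show the iteration pattern.
 */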
/*
 * cgroup_task_migrate - move a task from one cgroup to another.
 *
 * Must be called with cgroup_mutex and threadgroup locked.
 */
static void cgroup_task_migrate(struct cgroup *old_cgrp,
				struct task_struct *tsk,
				struct css_set *new_cset)
{
	struct css_set *old_cset;

	/*
	 * We are synchronized through threadgroup_lock() against PF_EXITING
	 * setting such that we can't race against cgroup_exit() changing the
	 * css_set to init_css_set and dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);
	old_cset = task_css_set(tsk);

	task_lock(tsk);
	rcu_assign_pointer(tsk->cgroups, new_cset);
	task_unlock(tsk);

	/* Update the css_set linked lists if we're using them */
	write_lock(&css_set_lock);
	if (!list_empty(&tsk->cg_list))
		list_move(&tsk->cg_list, &new_cset->tasks);
	write_unlock(&css_set_lock);

	/*
	 * We just gained a reference on old_cset by taking it from the
	 * task. As trading it for new_cset is protected by cgroup_mutex,
	 * we're safe to drop it here; it will be freed under RCU.
	 */
	set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
	put_css_set(old_cset);
}
/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @cgrp: the cgroup to attach to
 * @tsk: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take
 * task_lock of @tsk or each thread in the threadgroup individually in turn.
 */
static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
			      bool threadgroup)
{
	int retval, i, group_size;
	struct cgroup_subsys *ss, *failed_ss = NULL;
	struct cgroupfs_root *root = cgrp->root;
	/* threadgroup list cursor and array */
	struct task_struct *leader = tsk;
	struct task_and_cgroup *tc;
	struct flex_array *group;
	struct cgroup_taskset tset = { };

	/*
	 * step 0: in order to do expensive, possibly blocking operations for
	 * every thread, we cannot iterate the thread group list, since it
	 * needs rcu or tasklist locked.  Instead, build an array of all
	 * threads in the group - group_rwsem prevents new threads from
	 * appearing, and if threads exit, this will just be an over-estimate.
	 */
	if (threadgroup)
		group_size = get_nr_threads(tsk);
	else
		group_size = 1;
	/* flex_array supports very large thread-groups better than kmalloc. */
	group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL);
	if (!group)
		return -ENOMEM;
	/* pre-allocate to guarantee space while iterating in rcu read-side. */
	retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
	if (retval)
		goto out_free_group_list;

	i = 0;
	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	rcu_read_lock();
	do {
		struct task_and_cgroup ent;

		/* @tsk either already exited or can't exit until the end */
		if (tsk->flags & PF_EXITING)
			continue;

		/* as per above, nr_threads may decrease, but not increase. */
		BUG_ON(i >= group_size);
		ent.task = tsk;
		ent.cgrp = task_cgroup_from_root(tsk, root);
		/* nothing to do if this task is already in the cgroup */
		if (ent.cgrp == cgrp)
			continue;
		/*
		 * saying GFP_ATOMIC has no effect here because we did prealloc
		 * earlier, but it's good form to communicate our expectations.
		 */
		retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
		BUG_ON(retval != 0);
		i++;

		if (!threadgroup)
			break;
	} while_each_thread(leader, tsk);
	rcu_read_unlock();
	/* remember the number of threads in the array for later. */
	group_size = i;
	tset.tc_array = group;
	tset.tc_array_len = group_size;

	/* methods shouldn't be called if no task is actually migrating */
	retval = 0;
	if (!group_size)
		goto out_free_group_list;

	/*
	 * step 1: check that we can legitimately attach to the cgroup.
	 */
	for_each_root_subsys(root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

		if (ss->can_attach) {
			retval = ss->can_attach(css, &tset);
			if (retval) {
				failed_ss = ss;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * step 2: make sure css_sets exist for all threads to be migrated.
	 * we use find_css_set, which allocates a new one if necessary.
	 */
	for (i = 0; i < group_size; i++) {
		struct css_set *old_cset;

		tc = flex_array_get(group, i);
		old_cset = task_css_set(tc->task);
		tc->cset = find_css_set(old_cset, cgrp);
		if (!tc->cset) {
			retval = -ENOMEM;
			goto out_put_css_set_refs;
		}
	}

	/*
	 * step 3: now that we're guaranteed success wrt the css_sets,
	 * proceed to move all tasks to the new cgroup.  There are no
	 * failure cases after here, so this is the commit point.
	 */
	for (i = 0; i < group_size; i++) {
		tc = flex_array_get(group, i);
		cgroup_task_migrate(tc->cgrp, tc->task, tc->cset);
	}
	/* nothing is sensitive to fork() after this point. */

	/*
	 * step 4: do subsystem attach callbacks.
	 */
	for_each_root_subsys(root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

		if (ss->attach)
			ss->attach(css, &tset);
	}

	/*
	 * step 5: success! and cleanup
	 */
	retval = 0;
out_put_css_set_refs:
	if (retval) {
		for (i = 0; i < group_size; i++) {
			tc = flex_array_get(group, i);
			if (!tc->cset)
				break;
			put_css_set(tc->cset);
		}
	}
out_cancel_attach:
	if (retval) {
		for_each_root_subsys(root, ss) {
			struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

			if (ss == failed_ss)
				break;
			if (ss->cancel_attach)
				ss->cancel_attach(css, &tset);
		}
	}
out_free_group_list:
	flex_array_free(group);
	return retval;
}
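
/*
 * Illustrative sketch (not part of the original file): step 1 above gives
 * each controller a chance to veto the migration before the commit point,
 * e.g.
 *
 *	static int example_can_attach(struct cgroup_subsys_state *css,
 *				      struct cgroup_taskset *tset)
 *	{
 *		struct task_struct *task;
 *
 *		for (task = cgroup_taskset_first(tset); task;
 *		     task = cgroup_taskset_next(tset))
 *			if (task->flags & PF_KTHREAD)	(hypothetical policy)
 *				return -EINVAL;
 *		return 0;
 *	}
 *
 * Any non-zero return makes cgroup_attach_task() unwind through
 * ->cancel_attach() of the already-checked controllers without moving
 * any task.
 */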
/*
 * Find the task_struct of the task to attach by vpid and pass it along to the
 * function to attach either it or all tasks in its threadgroup. Will lock
 * cgroup_mutex and threadgroup; may take task_lock of task.
 */
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
	struct task_struct *tsk;
	const struct cred *cred = current_cred(), *tcred;
	int ret;

	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;

retry_find_task:
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			rcu_read_unlock();
			ret = -ESRCH;
			goto out_unlock_cgroup;
		}
		/*
		 * even if we're attaching all tasks in the thread group, we
		 * only need to check permissions on one of them.
		 */
		tcred = __task_cred(tsk);
		if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
		    !uid_eq(cred->euid, tcred->uid) &&
		    !uid_eq(cred->euid, tcred->suid)) {
			rcu_read_unlock();
			ret = -EACCES;
			goto out_unlock_cgroup;
		}
	} else
		tsk = current;

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
	 * trapped in a cpuset, or RT worker may be born in a cgroup
	 * with no rt_runtime allocated.  Just say no.
	 */
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		rcu_read_unlock();
		goto out_unlock_cgroup;
	}

	get_task_struct(tsk);
	rcu_read_unlock();

	threadgroup_lock(tsk);
	if (threadgroup) {
		if (!thread_group_leader(tsk)) {
			/*
			 * a race with de_thread from another thread's exec()
			 * may strip us of our leadership, if this happens,
			 * there is no choice but to throw this task away and
			 * try again; this is
			 * "double-double-toil-and-trouble-check locking".
			 */
			threadgroup_unlock(tsk);
			put_task_struct(tsk);
			goto retry_find_task;
		}
	}

	ret = cgroup_attach_task(cgrp, tsk, threadgroup);

	threadgroup_unlock(tsk);

	put_task_struct(tsk);
out_unlock_cgroup:
	mutex_unlock(&cgroup_mutex);
	return ret;
}
/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroupfs_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
	for_each_active_root(root) {
		struct cgroup *from_cgrp = task_cgroup_from_root(from, root);

		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
	return attach_task_by_pid(cgrp, pid, false);
}

static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
{
	return attach_task_by_pid(cgrp, tgid, true);
}
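
/*
 * Illustrative note (not part of the original file): these two handlers
 * back the "tasks" and "cgroup.procs" control files, so from userspace
 *
 *	echo $TID > tasks		moves a single thread
 *	echo $PID > cgroup.procs	moves the whole threadgroup
 *
 * relative to a cgroup directory, with the permission and
 * PF_NO_SETAFFINITY checks applied in attach_task_by_pid() above.
 */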
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
	if (strlen(buffer) >= PATH_MAX)
		return -EINVAL;
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	mutex_lock(&cgroup_root_mutex);
	strcpy(cgrp->root->release_agent_path, buffer);
	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
				     struct seq_file *seq)
{
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	seq_puts(seq, cgrp->root->release_agent_path);
	seq_putc(seq, '\n');
	mutex_unlock(&cgroup_mutex);
	return 0;
}

static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft,
				     struct seq_file *seq)
{
	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
	return 0;
}
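
/*
 * Illustrative note (not part of the original file): release_agent is a
 * root-only control file, so an administrator can do, e.g.,
 *
 *	echo /usr/local/sbin/cgroup-cleanup > release_agent
 *
 * at the hierarchy root; the path shown is an assumed example and must be
 * shorter than PATH_MAX or the write above fails with -EINVAL.
 */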
/* return the css for the given cgroup file */
static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe)
{
	struct cftype *cft = cfe->type;
	struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);

	if (cft->ss)
		return cgrp->subsys[cft->ss->subsys_id];
	return NULL;
}

/* A buffer size big enough for numbers or short strings */
#define CGROUP_LOCAL_BUFFER_SIZE 64
static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
				struct file *file,
				const char __user *userbuf,
				size_t nbytes, loff_t *unused_ppos)
{
	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	char *end;

	if (!nbytes)
		return -EINVAL;
	if (nbytes >= sizeof(buffer))
		return -E2BIG;
	if (copy_from_user(buffer, userbuf, nbytes))
		return -EFAULT;

	buffer[nbytes] = 0;	/* nul-terminate */
	if (cft->write_u64) {
		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
		if (*end)
			return -EINVAL;
		retval = cft->write_u64(cgrp, cft, val);
	} else {
		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
		if (*end)
			return -EINVAL;
		retval = cft->write_s64(cgrp, cft, val);
	}
	if (!retval)
		retval = nbytes;
	return retval;
}

static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
				   struct file *file,
				   const char __user *userbuf,
				   size_t nbytes, loff_t *unused_ppos)
{
	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	size_t max_bytes = cft->max_write_len;
	char *buffer = local_buffer;

	if (!max_bytes)
		max_bytes = sizeof(local_buffer) - 1;
	if (nbytes >= max_bytes)
		return -E2BIG;
	/* Allocate a dynamic buffer if we need one */
	if (nbytes >= sizeof(local_buffer)) {
		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
		if (buffer == NULL)
			return -ENOMEM;
	}
	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
		retval = -EFAULT;
		goto out;
	}

	buffer[nbytes] = 0;	/* nul-terminate */
	retval = cft->write_string(cgrp, cft, strstrip(buffer));
	if (!retval)
		retval = nbytes;
out:
	if (buffer != local_buffer)
		kfree(buffer);
	return retval;
}
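
/*
 * Illustrative sketch (not part of the original file): thanks to the
 * parsing above, a controller's u64 handler only ever sees a fully
 * parsed value, e.g.
 *
 *	static int example_write_u64(struct cgroup *cgrp, struct cftype *cft,
 *				     u64 val)
 *	{
 *		if (val > 1000)		(hypothetical limit)
 *			return -EINVAL;
 *		example_store(cgrp, val);
 *		return 0;
 *	}
 *
 * so "echo 42 > example.file" arrives here as val == 42, while a write
 * with trailing garbage is rejected with -EINVAL before the handler runs.
 * example_write_u64() and example_store() are hypothetical names.
 */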
static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
				 size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	if (cft->write)
		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_u64 || cft->write_s64)
		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_string)
		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->trigger) {
		int ret = cft->trigger(cgrp, (unsigned int)cft->private);
		return ret ? ret : nbytes;
	}
	return -EINVAL;
}

static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes,
			       loff_t *ppos)
{
	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
	u64 val = cft->read_u64(cgrp, cft);
	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}

static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes,
			       loff_t *ppos)
{
	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
	s64 val = cft->read_s64(cgrp, cft);
	int len = sprintf(tmp, "%lld\n", (long long) val);

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}

static ssize_t cgroup_file_read(struct file *file, char __user *buf,
				size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	if (cft->read)
		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->read_u64)
		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->read_s64)
		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
	return -EINVAL;
}
/*
 * seqfile ops/methods for returning structured data. Currently just
 * supports string->u64 maps, but can be extended in future.
 */

static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
{
	struct seq_file *sf = cb->state;
	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
}

static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
	struct cfent *cfe = m->private;
	struct cftype *cft = cfe->type;
	struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent);

	if (cft->read_map) {
		struct cgroup_map_cb cb = {
			.fill = cgroup_map_add,
			.state = m,
		};
		return cft->read_map(cgrp, cft, &cb);
	}
	return cft->read_seq_string(cgrp, cft, m);
}

static const struct file_operations cgroup_seqfile_operations = {
	.read = seq_read,
	.write = cgroup_file_write,
	.llseek = seq_lseek,
	.release = single_release,
};
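
/*
 * Illustrative sketch (not part of the original file): a read_map handler
 * only feeds key/value pairs to the callback, e.g.
 *
 *	static int example_read_map(struct cgroup *cgrp, struct cftype *cft,
 *				    struct cgroup_map_cb *cb)
 *	{
 *		cb->fill(cb, "hits", example_hits);
 *		cb->fill(cb, "misses", example_misses);
 *		return 0;
 *	}
 *
 * which the seq_file glue above renders as one "key value" line per
 * fill() call; example_hits and example_misses are hypothetical counters.
 */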
static int cgroup_file_open(struct inode *inode, struct file *file)
{
	struct cfent *cfe = __d_cfe(file->f_dentry);
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup_subsys_state *css = cgroup_file_css(cfe);
	int err;

	err = generic_file_open(inode, file);
	if (err)
		return err;

	/*
	 * If the file belongs to a subsystem, pin the css.  Will be
	 * unpinned either on open failure or release.  This ensures that
	 * @css stays alive for all file operations.
	 */
	if (css && !css_tryget(css))
		return -ENODEV;

	if (cft->read_map || cft->read_seq_string) {
		file->f_op = &cgroup_seqfile_operations;
		err = single_open(file, cgroup_seqfile_show, cfe);
	} else if (cft->open) {
		err = cft->open(inode, file);
	}

	if (css && err)
		css_put(css);
	return err;
}

static int cgroup_file_release(struct inode *inode, struct file *file)
{
	struct cfent *cfe = __d_cfe(file->f_dentry);
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup_subsys_state *css = cgroup_file_css(cfe);
	int ret = 0;

	if (cft->release)
		ret = cft->release(inode, file);
	if (css)
		css_put(css);
	return ret;
}
/*
 * cgroup_rename - Only allow simple rename of directories in place.
 */
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry)
{
	int ret;
	struct cgroup_name *name, *old_name;
	struct cgroup *cgrp;

	/*
	 * It's convenient to use the parent dir's i_mutex to protect
	 * cgrp->name.
	 */
	lockdep_assert_held(&old_dir->i_mutex);

	if (!S_ISDIR(old_dentry->d_inode->i_mode))
		return -ENOTDIR;
	if (new_dentry->d_inode)
		return -EEXIST;
	if (old_dir != new_dir)
		return -EIO;

	cgrp = __d_cgrp(old_dentry);

	/*
	 * This isn't a proper migration and its usefulness is very
	 * limited.  Disallow if sane_behavior.
	 */
	if (cgroup_sane_behavior(cgrp))
		return -EPERM;

	name = cgroup_alloc_name(new_dentry);
	if (!name)
		return -ENOMEM;

	ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry);
	if (ret) {
		kfree(name);
		return ret;
	}

	old_name = rcu_dereference_protected(cgrp->name, true);
	rcu_assign_pointer(cgrp->name, name);

	kfree_rcu(old_name, rcu_head);
	return 0;
}
static struct simple_xattrs *__d_xattrs(struct dentry *dentry)
{
	if (S_ISDIR(dentry->d_inode->i_mode))
		return &__d_cgrp(dentry)->xattrs;
	else
		return &__d_cfe(dentry)->xattrs;
}

static inline int xattr_enabled(struct dentry *dentry)
{
	struct cgroupfs_root *root = dentry->d_sb->s_fs_info;
	return root->flags & CGRP_ROOT_XATTR;
}

static bool is_valid_xattr(const char *name)
{
	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
	    !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
		return true;
	return false;
}

static int cgroup_setxattr(struct dentry *dentry, const char *name,
			   const void *val, size_t size, int flags)
{
	if (!xattr_enabled(dentry))
		return -EOPNOTSUPP;
	if (!is_valid_xattr(name))
		return -EINVAL;
	return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags);
}

static int cgroup_removexattr(struct dentry *dentry, const char *name)
{
	if (!xattr_enabled(dentry))
		return -EOPNOTSUPP;
	if (!is_valid_xattr(name))
		return -EINVAL;
	return simple_xattr_remove(__d_xattrs(dentry), name);
}

static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name,
			       void *buf, size_t size)
{
	if (!xattr_enabled(dentry))
		return -EOPNOTSUPP;
	if (!is_valid_xattr(name))
		return -EINVAL;
	return simple_xattr_get(__d_xattrs(dentry), name, buf, size);
}

static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size)
{
	if (!xattr_enabled(dentry))
		return -EOPNOTSUPP;
	return simple_xattr_list(__d_xattrs(dentry), buf, size);
}
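
/*
 * Illustrative note (not part of the original file): with the "xattr"
 * mount option set, only trusted.* and security.* names are accepted,
 * so, e.g.,
 *
 *	setfattr -n trusted.tag -v web /sys/fs/cgroup/<hier>/<group>
 *
 * succeeds (given the privilege trusted.* requires) while user.* names
 * fail with -EINVAL; without the mount option every call above returns
 * -EOPNOTSUPP.
 */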
static const struct file_operations cgroup_file_operations = {
	.read = cgroup_file_read,
	.write = cgroup_file_write,
	.llseek = generic_file_llseek,
	.open = cgroup_file_open,
	.release = cgroup_file_release,
};

static const struct inode_operations cgroup_file_inode_operations = {
	.setxattr = cgroup_setxattr,
	.getxattr = cgroup_getxattr,
	.listxattr = cgroup_listxattr,
	.removexattr = cgroup_removexattr,
};

static const struct inode_operations cgroup_dir_inode_operations = {
	.lookup = cgroup_lookup,
	.mkdir = cgroup_mkdir,
	.rmdir = cgroup_rmdir,
	.rename = cgroup_rename,
	.setxattr = cgroup_setxattr,
	.getxattr = cgroup_getxattr,
	.listxattr = cgroup_listxattr,
	.removexattr = cgroup_removexattr,
};

static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);
	d_add(dentry, NULL);
	return NULL;
}
/*
 * Check if a file is a control file
 */
static inline struct cftype *__file_cft(struct file *file)
{
	if (file_inode(file)->i_fop != &cgroup_file_operations)
		return ERR_PTR(-EINVAL);
	return __d_cft(file->f_dentry);
}
static int cgroup_create_file(struct dentry *dentry, umode_t mode,
			      struct super_block *sb)
{
	struct inode *inode;

	if (!dentry)
		return -ENOENT;
	if (dentry->d_inode)
		return -EEXIST;

	inode = cgroup_new_inode(mode, sb);
	if (!inode)
		return -ENOMEM;

	if (S_ISDIR(mode)) {
		inode->i_op = &cgroup_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;

		/* start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
		inc_nlink(dentry->d_parent->d_inode);

		/*
		 * Control reaches here with cgroup_mutex held.
		 * @inode->i_mutex should nest outside cgroup_mutex but we
		 * want to populate it immediately without releasing
		 * cgroup_mutex.  As @inode isn't visible to anyone else
		 * yet, trylock will always succeed without affecting
		 * lockdep checks.
		 */
		WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex));
	} else if (S_ISREG(mode)) {
		inode->i_size = 0;
		inode->i_fop = &cgroup_file_operations;
		inode->i_op = &cgroup_file_inode_operations;
	}
	d_instantiate(dentry, inode);
	dget(dentry);	/* Extra count - pin the dentry in core */
	return 0;
}
/**
 * cgroup_file_mode - deduce file mode of a control file
 * @cft: the control file in question
 *
 * returns cft->mode if ->mode is not 0
 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
 * returns S_IRUGO if it has only a read handler
 * returns S_IWUSR if it has only a write handler
 */
static umode_t cgroup_file_mode(const struct cftype *cft)
{
	umode_t mode = 0;

	if (cft->mode)
		return cft->mode;

	if (cft->read || cft->read_u64 || cft->read_s64 ||
	    cft->read_map || cft->read_seq_string)
		mode |= S_IRUGO;

	if (cft->write || cft->write_u64 || cft->write_s64 ||
	    cft->write_string || cft->trigger)
		mode |= S_IWUSR;

	return mode;
}
static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
{
	struct dentry *dir = cgrp->dentry;
	struct cgroup *parent = __d_cgrp(dir);
	struct dentry *dentry;
	struct cfent *cfe;
	int error;
	umode_t mode;
	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };

	if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
		strcpy(name, cft->ss->name);
		strcat(name, ".");
	}
	strcat(name, cft->name);

	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));

	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
	if (!cfe)
		return -ENOMEM;

	dentry = lookup_one_len(name, dir, strlen(name));
	if (IS_ERR(dentry)) {
		error = PTR_ERR(dentry);
		goto out;
	}

	cfe->type = (void *)cft;
	cfe->dentry = dentry;
	dentry->d_fsdata = cfe;
	simple_xattrs_init(&cfe->xattrs);

	mode = cgroup_file_mode(cft);
	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
	if (!error) {
		list_add_tail(&cfe->node, &parent->files);
		cfe = NULL;
	}
	dput(dentry);
out:
	kfree(cfe);
	return error;
}
/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @cgrp: the target cgroup
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.  If addition fails, this
 * function doesn't remove files already added.  The caller is responsible
 * for cleaning up.
 */
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add)
{
	struct cftype *cft;
	int ret;

	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
	lockdep_assert_held(&cgroup_mutex);

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
			continue;
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
			continue;

		if (is_add) {
			ret = cgroup_add_file(cgrp, cft);
			if (ret) {
				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
					cft->name, ret);
				return ret;
			}
		} else {
			cgroup_rm_file(cgrp, cft);
		}
	}
	return 0;
}
static void cgroup_cfts_prepare(void)
	__acquires(&cgroup_mutex)
{
	/*
	 * Thanks to the entanglement with vfs inode locking, we can't walk
	 * the existing cgroups under cgroup_mutex and create files.
	 * Instead, we use cgroup_for_each_descendant_pre() and drop RCU
	 * read lock before calling cgroup_addrm_files().
	 */
	mutex_lock(&cgroup_mutex);
}
static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
	__releases(&cgroup_mutex)
{
	LIST_HEAD(pending);
	struct cgroup_subsys *ss = cfts[0].ss;
	struct cgroup *cgrp, *root = &ss->root->top_cgroup;
	struct super_block *sb = ss->root->sb;
	struct dentry *prev = NULL;
	struct inode *inode;
	u64 update_before;
	int ret = 0;

	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
	if (!cfts || ss->root == &cgroup_dummy_root ||
	    !atomic_inc_not_zero(&sb->s_active)) {
		mutex_unlock(&cgroup_mutex);
		return 0;
	}

	/*
	 * All cgroups which are created after we drop cgroup_mutex will
	 * have the updated set of files, so we only need to update the
	 * cgroups created before the current @cgroup_serial_nr_next.
	 */
	update_before = cgroup_serial_nr_next;

	mutex_unlock(&cgroup_mutex);

	/* @root always needs to be updated */
	inode = root->dentry->d_inode;
	mutex_lock(&inode->i_mutex);
	mutex_lock(&cgroup_mutex);
	ret = cgroup_addrm_files(root, cfts, is_add);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&inode->i_mutex);

	if (ret)
		goto out_deact;

	/* add/rm files for all cgroups created before */
	rcu_read_lock();
	cgroup_for_each_descendant_pre(cgrp, root) {
		if (cgroup_is_dead(cgrp))
			continue;

		inode = cgrp->dentry->d_inode;
		dget(cgrp->dentry);
		rcu_read_unlock();

		dput(prev);
		prev = cgrp->dentry;

		mutex_lock(&inode->i_mutex);
		mutex_lock(&cgroup_mutex);
		if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
			ret = cgroup_addrm_files(cgrp, cfts, is_add);
		mutex_unlock(&cgroup_mutex);
		mutex_unlock(&inode->i_mutex);

		rcu_read_lock();
		if (ret)
			break;
	}
	rcu_read_unlock();
	dput(prev);
out_deact:
	deactivate_super(sb);
	return ret;
}
/**
* cgroup_add_cftypes - add an array of cftypes to a subsystem
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Register @cfts to @ss. Files described by @cfts are created for all
* existing cgroups to which @ss is attached and all future cgroups will
* have them too. This function can be called anytime whether @ss is
* attached or not.
*
* Returns 0 on successful registration, -errno on failure. Note that this
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
2012-08-23 16:53:30 -04:00
int cgroup_add_cftypes ( struct cgroup_subsys * ss , struct cftype * cfts )
2012-04-01 12:09:55 -07:00
{
struct cftype_set * set ;
2013-08-08 20:11:23 -04:00
struct cftype * cft ;
2013-06-28 16:24:11 -07:00
int ret ;
2012-04-01 12:09:55 -07:00
set = kzalloc ( sizeof ( * set ) , GFP_KERNEL ) ;
if ( ! set )
return - ENOMEM ;
2013-08-08 20:11:23 -04:00
for ( cft = cfts ; cft - > name [ 0 ] ! = ' \0 ' ; cft + + )
cft - > ss = ss ;
2012-04-01 12:09:55 -07:00
cgroup_cfts_prepare ( ) ;
set - > cfts = cfts ;
list_add_tail ( & set - > node , & ss - > cftsets ) ;
2013-08-08 20:11:23 -04:00
ret = cgroup_cfts_commit ( cfts , true ) ;
2013-06-28 16:24:11 -07:00
if ( ret )
2013-08-08 20:11:23 -04:00
cgroup_rm_cftypes ( cfts ) ;
2013-06-28 16:24:11 -07:00
return ret ;
2012-04-01 12:09:55 -07:00
}
EXPORT_SYMBOL_GPL ( cgroup_add_cftypes ) ;
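/*
 * Illustrative sketch (not part of the original file): how a
 * controller might register its control files with
 * cgroup_add_cftypes().  The "demo" names and the callback are
 * hypothetical; the array must end with an empty entry, since
 * registration walks it until it finds a zero-length name.  A
 * matching cgroup_rm_cftypes(demo_files) would unregister them.
 */
static u64 demo_weight_read(struct cgroup *cgrp, struct cftype *cft)
{
	return 100;	/* placeholder value, for illustration only */
}

static struct cftype demo_files[] = {
	{
		.name = "demo.weight",
		.read_u64 = demo_weight_read,
	},
	{ }	/* terminate */
};

/* typically called from the controller's init path: */
/*	ret = cgroup_add_cftypes(&demo_subsys, demo_files); */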
2012-04-01 12:09:56 -07:00
/**
* cgroup_rm_cftypes - remove an array of cftypes from a subsystem
* @cfts: zero-length name terminated array of cftypes
*
2013-08-08 20:11:23 -04:00
* Unregister @cfts. Files described by @cfts are removed from all
* existing cgroups and all future cgroups won't have them either. This
* function can be called anytime whether @cfts' subsys is attached or not.
2012-04-01 12:09:56 -07:00
*
* Returns 0 on successful unregistration, -ENOENT if @cfts is not
2013-08-08 20:11:23 -04:00
* registered.
2012-04-01 12:09:56 -07:00
*/
2013-08-08 20:11:23 -04:00
int cgroup_rm_cftypes ( struct cftype * cfts )
2012-04-01 12:09:56 -07:00
{
struct cftype_set * set ;
2013-08-08 20:11:23 -04:00
if ( ! cfts | | ! cfts [ 0 ] . ss )
return - ENOENT ;
2012-04-01 12:09:56 -07:00
cgroup_cfts_prepare ( ) ;
2013-08-08 20:11:23 -04:00
list_for_each_entry ( set , & cfts [ 0 ] . ss - > cftsets , node ) {
2012-04-01 12:09:56 -07:00
if ( set - > cfts = = cfts ) {
2013-06-18 18:41:53 +08:00
list_del ( & set - > node ) ;
kfree ( set ) ;
2013-08-08 20:11:23 -04:00
cgroup_cfts_commit ( cfts , false ) ;
2012-04-01 12:09:56 -07:00
return 0 ;
}
}
2013-08-08 20:11:23 -04:00
cgroup_cfts_commit ( NULL , false ) ;
2012-04-01 12:09:56 -07:00
return - ENOENT ;
}
2008-02-23 15:24:09 -08:00
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
* Return the number of tasks in the cgroup.
*/
2007-10-18 23:40:44 -07:00
int cgroup_task_count ( const struct cgroup * cgrp )
2007-10-18 23:39:32 -07:00
{
int count = 0 ;
2013-06-12 21:04:50 -07:00
struct cgrp_cset_link * link ;
2007-10-18 23:39:32 -07:00
2007-10-18 23:39:36 -07:00
read_lock ( & css_set_lock ) ;
2013-06-12 21:04:50 -07:00
list_for_each_entry ( link , & cgrp - > cset_links , cset_link )
count + = atomic_read ( & link - > cset - > refcount ) ;
2007-10-18 23:39:36 -07:00
read_unlock ( & css_set_lock ) ;
2007-10-18 23:39:32 -07:00
return count ;
}
2007-10-18 23:39:36 -07:00
/*
* Advance a list_head iterator. The iterator should be positioned at
* the start of a css_set
*/
2013-06-12 21:04:50 -07:00
static void cgroup_advance_iter ( struct cgroup * cgrp , struct cgroup_iter * it )
2007-10-18 23:39:36 -07:00
{
2013-06-12 21:04:50 -07:00
struct list_head * l = it - > cset_link ;
struct cgrp_cset_link * link ;
2013-06-12 21:04:49 -07:00
struct css_set * cset ;
2007-10-18 23:39:36 -07:00
/* Advance to the next non-empty css_set */
do {
l = l - > next ;
2013-06-12 21:04:50 -07:00
if ( l = = & cgrp - > cset_links ) {
it - > cset_link = NULL ;
2007-10-18 23:39:36 -07:00
return ;
}
2013-06-12 21:04:50 -07:00
link = list_entry ( l , struct cgrp_cset_link , cset_link ) ;
cset = link - > cset ;
2013-06-12 21:04:49 -07:00
} while ( list_empty ( & cset - > tasks ) ) ;
2013-06-12 21:04:50 -07:00
it - > cset_link = l ;
2013-06-12 21:04:49 -07:00
it - > task = cset - > tasks . next ;
2007-10-18 23:39:36 -07:00
}
2008-02-07 00:14:42 -08:00
/*
* To reduce the fork() overhead for systems that are not actually
* using their cgroups capability, we don't maintain the lists running
* through each css_set to its tasks until we see the list actually
* used - in other words after the first call to cgroup_iter_start().
*/
2008-04-29 00:59:54 -07:00
static void cgroup_enable_task_cg_lists ( void )
2008-02-07 00:14:42 -08:00
{
struct task_struct * p , * g ;
write_lock ( & css_set_lock ) ;
use_task_css_set_links = 1 ;
2012-02-08 03:37:27 +01:00
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
* cgroup_post_fork() without seeing use_task_css_set_links = 1
* is not guaranteed to have its child immediately visible in the
* tasklist if we walk through it with RCU.
*/
read_lock ( & tasklist_lock ) ;
2008-02-07 00:14:42 -08:00
do_each_thread ( g , p ) {
task_lock ( p ) ;
2008-04-17 11:37:15 +08:00
/*
* We should check whether the process is exiting; otherwise
* it can race with cgroup_exit(), leaving a list entry that
* is never deleted even though the process has exited.
*/
if ( ! ( p - > flags & PF_EXITING ) & & list_empty ( & p - > cg_list ) )
2013-06-21 15:52:04 -07:00
list_add ( & p - > cg_list , & task_css_set ( p ) - > tasks ) ;
2008-02-07 00:14:42 -08:00
task_unlock ( p ) ;
} while_each_thread ( g , p ) ;
2012-02-08 03:37:27 +01:00
read_unlock ( & tasklist_lock ) ;
2008-02-07 00:14:42 -08:00
write_unlock ( & css_set_lock ) ;
}
2013-05-24 10:55:38 +09:00
/**
* cgroup_next_sibling - find the next sibling of a given cgroup
* @pos: the current cgroup
*
* This function returns the next sibling of @pos and should be called
* under RCU read lock. The only requirement is that @pos is accessible.
* The next sibling is guaranteed to be returned regardless of @pos's
* state.
*/
struct cgroup * cgroup_next_sibling ( struct cgroup * pos )
{
struct cgroup * next ;
WARN_ON_ONCE ( ! rcu_read_lock_held ( ) ) ;
/*
* @pos could already have been removed. Once a cgroup is removed,
* its ->sibling.next is no longer updated when its next sibling
2013-06-13 19:27:42 -07:00
* changes. As CGRP_DEAD assertion is serialized and happens
* before the cgroup is taken off the ->sibling list, if we see it
* unasserted, it's guaranteed that the next sibling hasn't
* finished its grace period even if it's already removed, and thus
* safe to dereference from this RCU critical section. If
* ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
* to be visible as %true here.
2013-05-24 10:55:38 +09:00
*/
2013-06-12 21:04:53 -07:00
if ( likely ( ! cgroup_is_dead ( pos ) ) ) {
2013-05-24 10:55:38 +09:00
next = list_entry_rcu ( pos - > sibling . next , struct cgroup , sibling ) ;
if ( & next - > sibling ! = & pos - > parent - > children )
return next ;
return NULL ;
}
/*
* Can't dereference the next pointer. Each cgroup is given a
* monotonically increasing unique serial number and always
* appended to the sibling list, so the next one can be found by
* walking the parent's children until we see a cgroup with higher
* serial number than @pos's.
*
* While this path can be slow, it's taken only when either the
* current cgroup is removed or iteration and removal race.
*/
list_for_each_entry_rcu ( next , & pos - > parent - > children , sibling )
if ( next - > serial_nr > pos - > serial_nr )
return next ;
return NULL ;
}
EXPORT_SYMBOL_GPL ( cgroup_next_sibling ) ;
2012-11-09 09:12:29 -08:00
/**
* cgroup_next_descendant_pre - find the next descendant for pre-order walk
* @pos: the current position (%NULL to initiate traversal)
* @cgroup: cgroup whose descendants to walk
*
* To be used by cgroup_for_each_descendant_pre(). Find the next
* descendant to visit for pre-order traversal of @cgroup's descendants.
2013-05-24 10:55:38 +09:00
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @cgroup are accessible and @pos is a descendant of @cgroup.
2012-11-09 09:12:29 -08:00
*/
struct cgroup * cgroup_next_descendant_pre ( struct cgroup * pos ,
struct cgroup * cgroup )
{
struct cgroup * next ;
WARN_ON_ONCE ( ! rcu_read_lock_held ( ) ) ;
/* if first iteration, pretend we just visited @cgroup */
2013-05-24 10:50:24 +09:00
if ( ! pos )
2012-11-09 09:12:29 -08:00
pos = cgroup ;
/* visit the first child if exists */
next = list_first_or_null_rcu ( & pos - > children , struct cgroup , sibling ) ;
if ( next )
return next ;
/* no child, visit my or the closest ancestor's next sibling */
2013-05-24 10:50:24 +09:00
while ( pos ! = cgroup ) {
2013-05-24 10:55:38 +09:00
next = cgroup_next_sibling ( pos ) ;
if ( next )
2012-11-09 09:12:29 -08:00
return next ;
pos = pos - > parent ;
2013-05-24 10:50:24 +09:00
}
2012-11-09 09:12:29 -08:00
return NULL ;
}
EXPORT_SYMBOL_GPL ( cgroup_next_descendant_pre ) ;
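/*
 * Illustrative sketch: a typical pre-order walk built on the above,
 * via cgroup_for_each_descendant_pre().  demo_walk_descendants() is
 * hypothetical.  The whole walk sits in one RCU read-side section
 * here, although, as noted above, that isn't strictly required.
 */
static void demo_walk_descendants(struct cgroup *root)
{
	struct cgroup *pos;

	rcu_read_lock();
	cgroup_for_each_descendant_pre(pos, root) {
		/* @root itself is not visited, only its descendants */
		pr_info("visiting cgroup id %d\n", pos->id);
	}
	rcu_read_unlock();
}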
2013-01-07 08:49:33 -08:00
/**
* cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
* @pos: cgroup of interest
*
* Return the rightmost descendant of @pos. If there's no descendant,
* @pos is returned. This can be used during pre-order traversal to skip
* subtree of @pos.
2013-05-24 10:55:38 +09:00
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct rightmost descendant as long as @pos is
* accessible.
2013-01-07 08:49:33 -08:00
*/
struct cgroup * cgroup_rightmost_descendant ( struct cgroup * pos )
{
struct cgroup * last , * tmp ;
WARN_ON_ONCE ( ! rcu_read_lock_held ( ) ) ;
do {
last = pos ;
/* ->prev isn't RCU safe, walk ->next till the end */
pos = NULL ;
list_for_each_entry_rcu ( tmp , & last - > children , sibling )
pos = tmp ;
} while ( pos ) ;
return last ;
}
EXPORT_SYMBOL_GPL ( cgroup_rightmost_descendant ) ;
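/*
 * Illustrative sketch: pruning a subtree during a pre-order walk, as
 * described above.  Jumping @pos to its rightmost descendant makes
 * the next iteration resume at the subtree's sibling.
 * demo_should_skip() and demo_walk_pruned() are hypothetical.
 */
static void demo_walk_pruned(struct cgroup *root)
{
	struct cgroup *pos;

	rcu_read_lock();
	cgroup_for_each_descendant_pre(pos, root) {
		if (demo_should_skip(pos)) {
			pos = cgroup_rightmost_descendant(pos);
			continue;
		}
		/* otherwise process @pos normally */
	}
	rcu_read_unlock();
}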
2012-11-09 09:12:29 -08:00
static struct cgroup * cgroup_leftmost_descendant ( struct cgroup * pos )
{
struct cgroup * last ;
do {
last = pos ;
pos = list_first_or_null_rcu ( & pos - > children , struct cgroup ,
sibling ) ;
} while ( pos ) ;
return last ;
}
/**
* cgroup_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
* @cgroup: cgroup whose descendants to walk
*
* To be used by cgroup_for_each_descendant_post(). Find the next
* descendant to visit for post-order traversal of @cgroup's descendants.
2013-05-24 10:55:38 +09:00
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @cgroup are accessible and @pos is a descendant of @cgroup.
2012-11-09 09:12:29 -08:00
*/
struct cgroup * cgroup_next_descendant_post ( struct cgroup * pos ,
struct cgroup * cgroup )
{
struct cgroup * next ;
WARN_ON_ONCE ( ! rcu_read_lock_held ( ) ) ;
/* if first iteration, visit the leftmost descendant */
if ( ! pos ) {
next = cgroup_leftmost_descendant ( cgroup ) ;
return next ! = cgroup ? next : NULL ;
}
/* if there's an unvisited sibling, visit its leftmost descendant */
2013-05-24 10:55:38 +09:00
next = cgroup_next_sibling ( pos ) ;
if ( next )
2012-11-09 09:12:29 -08:00
return cgroup_leftmost_descendant ( next ) ;
/* no sibling left, visit parent */
next = pos - > parent ;
return next ! = cgroup ? next : NULL ;
}
EXPORT_SYMBOL_GPL ( cgroup_next_descendant_post ) ;
2007-10-18 23:40:44 -07:00
void cgroup_iter_start ( struct cgroup * cgrp , struct cgroup_iter * it )
2011-12-27 07:46:26 +02:00
__acquires ( css_set_lock )
2007-10-18 23:39:36 -07:00
{
/*
* The first time anyone tries to iterate across a cgroup,
* we need to enable the list linking each css_set to its
* tasks, and fix up all existing tasks.
*/
2008-02-07 00:14:42 -08:00
if ( ! use_task_css_set_links )
cgroup_enable_task_cg_lists ( ) ;
2007-10-18 23:39:36 -07:00
read_lock ( & css_set_lock ) ;
2013-06-12 21:04:50 -07:00
it - > cset_link = & cgrp - > cset_links ;
2007-10-18 23:40:44 -07:00
cgroup_advance_iter ( cgrp , it ) ;
2007-10-18 23:39:36 -07:00
}
2007-10-18 23:40:44 -07:00
struct task_struct * cgroup_iter_next ( struct cgroup * cgrp ,
2007-10-18 23:39:36 -07:00
struct cgroup_iter * it )
{
struct task_struct * res ;
struct list_head * l = it - > task ;
2013-06-12 21:04:50 -07:00
struct cgrp_cset_link * link ;
2007-10-18 23:39:36 -07:00
/* If the iterator cset_link is NULL, we have no tasks */
2013-06-12 21:04:50 -07:00
if ( ! it - > cset_link )
2007-10-18 23:39:36 -07:00
return NULL ;
res = list_entry ( l , struct task_struct , cg_list ) ;
/* Advance iterator to find next entry */
l = l - > next ;
2013-06-12 21:04:50 -07:00
link = list_entry ( it - > cset_link , struct cgrp_cset_link , cset_link ) ;
if ( l = = & link - > cset - > tasks ) {
2007-10-18 23:39:36 -07:00
/* We reached the end of this task list - move on to
* the next cgrp_cset_link */
2007-10-18 23:40:44 -07:00
cgroup_advance_iter ( cgrp , it ) ;
2007-10-18 23:39:36 -07:00
} else {
it - > task = l ;
}
return res ;
}
2007-10-18 23:40:44 -07:00
void cgroup_iter_end ( struct cgroup * cgrp , struct cgroup_iter * it )
2011-12-27 07:46:26 +02:00
__releases ( css_set_lock )
2007-10-18 23:39:36 -07:00
{
read_unlock ( & css_set_lock ) ;
}
2008-02-07 00:14:42 -08:00
static inline int started_after_time ( struct task_struct * t1 ,
struct timespec * time ,
struct task_struct * t2 )
{
int start_diff = timespec_compare ( & t1 - > start_time , time ) ;
if ( start_diff > 0 ) {
return 1 ;
} else if ( start_diff < 0 ) {
return 0 ;
} else {
/*
* Arbitrarily, if two processes started at the same
* time, we'll say that the lower pointer value
* started first. Note that t2 may have exited by now
* so this may not be a valid pointer any longer, but
* that's fine - it still serves to distinguish
* between two tasks started (effectively) simultaneously.
*/
return t1 > t2 ;
}
}
/*
* This function is a callback from heap_insert() and is used to order
* the heap.
* In this case we order the heap in descending task start time.
*/
static inline int started_after ( void * p1 , void * p2 )
{
struct task_struct * t1 = p1 ;
struct task_struct * t2 = p2 ;
return started_after_time ( t1 , & t2 - > start_time , t2 ) ;
}
/**
* cgroup_scan_tasks - iterate through all the tasks in a cgroup
* @scan: struct cgroup_scanner containing arguments for the scan
*
* Arguments include pointers to callback functions test_task() and
* process_task().
* Iterate through all the tasks in a cgroup, calling test_task() for each,
* and if it returns true, call process_task() for it also.
* The test_task pointer may be NULL, meaning always true (select all tasks).
* Effectively duplicates cgroup_iter_{start,next,end}()
* but does not lock css_set_lock for the call to process_task().
* The struct cgroup_scanner may be embedded in any structure of the caller's
* creation.
* It is guaranteed that process_task() will act on every task that
* is a member of the cgroup for the duration of this call. This
* function may or may not call process_task() for tasks that exit
* or move to a different cgroup during the call, or are forked or
* move into the cgroup during the call.
*
* Note that test_task() may be called with locks held, and may in some
* situations be called multiple times for the same task, so it should
* be cheap.
* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
* pre-allocated and will be used for heap operations (and its "gt" member will
* be overwritten), else a temporary heap will be used (allocation of which
* may cause this function to fail).
*/
int cgroup_scan_tasks ( struct cgroup_scanner * scan )
{
int retval , i ;
struct cgroup_iter it ;
struct task_struct * p , * dropped ;
/* Never dereference latest_task, since it's not refcounted */
struct task_struct * latest_task = NULL ;
struct ptr_heap tmp_heap ;
struct ptr_heap * heap ;
struct timespec latest_time = { 0 , 0 } ;
if ( scan - > heap ) {
/* The caller supplied our heap and pre-allocated its memory */
heap = scan - > heap ;
heap - > gt = & started_after ;
} else {
/* We need to allocate our own heap memory */
heap = & tmp_heap ;
retval = heap_init ( heap , PAGE_SIZE , GFP_KERNEL , & started_after ) ;
if ( retval )
/* cannot allocate the heap */
return retval ;
}
again :
/*
* Scan tasks in the cgroup, using the scanner's "test_task" callback
* to determine which are of interest, and using the scanner's
* "process_task" callback to process any of them that need an update.
* Since we don't want to hold any locks during the task updates,
* gather tasks to be processed in a heap structure.
* The heap is sorted by descending task start time.
* If the statically-sized heap fills up, we overflow tasks that
* started later, and in future iterations only consider tasks that
* started after the latest task in the previous pass. This
* guarantees forward progress and that we don't miss any tasks.
*/
heap - > size = 0 ;
2013-07-31 16:18:36 +08:00
cgroup_iter_start ( scan - > cgrp , & it ) ;
while ( ( p = cgroup_iter_next ( scan - > cgrp , & it ) ) ) {
2008-02-07 00:14:42 -08:00
/*
* Only affect tasks that qualify per the caller's callback,
* if one was provided
*/
if ( scan - > test_task & & ! scan - > test_task ( p , scan ) )
continue ;
/*
* Only process tasks that started after the last task
* we processed
*/
if ( ! started_after_time ( p , & latest_time , latest_task ) )
continue ;
dropped = heap_insert ( heap , p ) ;
if ( dropped = = NULL ) {
/*
* The new task was inserted; the heap wasn't
* previously full
*/
get_task_struct ( p ) ;
} else if ( dropped ! = p ) {
/*
* The new task was inserted, and pushed out a
* different task
*/
get_task_struct ( p ) ;
put_task_struct ( dropped ) ;
}
/*
* Else the new task was newer than anything already in
* the heap and wasn't inserted
*/
}
2013-07-31 16:18:36 +08:00
cgroup_iter_end ( scan - > cgrp , & it ) ;
2008-02-07 00:14:42 -08:00
if ( heap - > size ) {
for ( i = 0 ; i < heap - > size ; i + + ) {
2008-04-29 00:59:55 -07:00
struct task_struct * q = heap - > ptrs [ i ] ;
2008-02-07 00:14:42 -08:00
if ( i = = 0 ) {
2008-04-29 00:59:55 -07:00
latest_time = q - > start_time ;
latest_task = q ;
2008-02-07 00:14:42 -08:00
}
/* Process the task per the caller's callback */
2008-04-29 00:59:55 -07:00
scan - > process_task ( q , scan ) ;
put_task_struct ( q ) ;
2008-02-07 00:14:42 -08:00
}
/*
* If we had to process any tasks at all, scan again
* in case some of them were in the middle of forking
* children that didn't get processed.
* Not the most efficient way to do it, but it avoids
* having to take callback_mutex in the fork path
*/
goto again ;
}
if ( heap = = & tmp_heap )
heap_free ( & tmp_heap ) ;
return 0 ;
}
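/*
 * Illustrative sketch: driving cgroup_scan_tasks() with a filtering
 * test_task() callback.  All three demo_* names are hypothetical;
 * cgroup_transfer_tasks() below is an in-tree caller that selects
 * every task by leaving test_task NULL.
 */
static int demo_test_task(struct task_struct *p, struct cgroup_scanner *scan)
{
	return !(p->flags & PF_KTHREAD);	/* skip kernel threads */
}

static void demo_process_task(struct task_struct *p,
			      struct cgroup_scanner *scan)
{
	pr_info("scanned pid %d\n", task_pid_nr(p));
}

static int demo_scan(struct cgroup *cgrp)
{
	struct cgroup_scanner scan = {
		.cgrp		= cgrp,
		.test_task	= demo_test_task,
		.process_task	= demo_process_task,
		.heap		= NULL,	/* let cgroup_scan_tasks() allocate */
	};

	return cgroup_scan_tasks(&scan);
}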
2013-04-07 09:29:50 -07:00
static void cgroup_transfer_one_task ( struct task_struct * task ,
struct cgroup_scanner * scan )
{
struct cgroup * new_cgroup = scan - > data ;
2013-04-07 09:29:51 -07:00
mutex_lock ( & cgroup_mutex ) ;
2013-04-07 09:29:50 -07:00
cgroup_attach_task ( new_cgroup , task , false ) ;
2013-04-07 09:29:51 -07:00
mutex_unlock ( & cgroup_mutex ) ;
2013-04-07 09:29:50 -07:00
}
/**
* cgroup_transfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*/
int cgroup_transfer_tasks ( struct cgroup * to , struct cgroup * from )
{
struct cgroup_scanner scan ;
2013-07-31 16:18:36 +08:00
scan . cgrp = from ;
2013-04-07 09:29:50 -07:00
scan . test_task = NULL ; /* select all tasks in cgroup */
scan . process_task = cgroup_transfer_one_task ;
scan . heap = NULL ;
scan . data = to ;
return cgroup_scan_tasks ( & scan ) ;
}
2007-10-18 23:39:32 -07:00
/*
2009-09-23 15:56:26 -07:00
* Stuff for reading the 'tasks'/'procs' files.
2007-10-18 23:39:32 -07:00
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
2012-01-20 11:58:43 +08:00
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS ,
CGROUP_FILE_TASKS ,
} ;
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type ; struct pid_namespace * ns ; } key ;
/* array of xids */
pid_t * list ;
/* how many elements the above list has */
int length ;
/* how many files are using the current array */
int use_count ;
/* each of these stored in a list by its cgroup */
struct list_head links ;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup * owner ;
/* protects the other fields */
2013-08-01 09:52:15 +08:00
struct rw_semaphore rwsem ;
2012-01-20 11:58:43 +08:00
} ;
2009-09-23 15:56:28 -07:00
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
# define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void * pidlist_allocate ( int count )
{
if ( PIDLIST_TOO_LARGE ( count ) )
return vmalloc ( count * sizeof ( pid_t ) ) ;
else
return kmalloc ( count * sizeof ( pid_t ) , GFP_KERNEL ) ;
}
static void pidlist_free ( void * p )
{
if ( is_vmalloc_addr ( p ) )
vfree ( p ) ;
else
kfree ( p ) ;
}
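/*
 * For example, with a typical 4 KiB PAGE_SIZE and a 4-byte pid_t,
 * PIDLIST_TOO_LARGE() flips from kmalloc() to vmalloc() once a list
 * exceeds 2048 pids (2 pages = 8192 bytes).
 */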
2007-10-18 23:39:32 -07:00
/*
2009-09-23 15:56:26 -07:00
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
2013-03-12 15:36:00 -07:00
* Returns the number of unique elements.
2007-10-18 23:39:32 -07:00
*/
2013-03-12 15:36:00 -07:00
static int pidlist_uniq ( pid_t * list , int length )
2007-10-18 23:39:32 -07:00
{
2009-09-23 15:56:26 -07:00
int src , dest = 1 ;
/*
* we presume the 0th element is unique, so src starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if ( length = = 0 | | length = = 1 )
return length ;
/* src and dest walk down the list; dest counts unique elements */
for ( src = 1 ; src < length ; src + + ) {
/* find next unique element */
while ( list [ src ] = = list [ src - 1 ] ) {
src + + ;
if ( src = = length )
goto after ;
}
/* dest always points to where the next unique element goes */
list [ dest ] = list [ src ] ;
dest + + ;
}
after :
return dest ;
}
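/*
 * For example, given the sorted array {1, 1, 3, 3, 5} with length 5,
 * pidlist_uniq() compacts it in place to {1, 3, 5} and returns 3.
 */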
static int cmppid ( const void * a , const void * b )
{
return * ( pid_t * ) a - * ( pid_t * ) b ;
}
2009-09-23 15:56:27 -07:00
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist * cgroup_pidlist_find ( struct cgroup * cgrp ,
enum cgroup_filetype type )
{
struct cgroup_pidlist * l ;
/* don't need task_nsproxy() if we're looking at ourself */
2010-03-02 14:51:53 -08:00
struct pid_namespace * ns = task_active_pid_ns ( current ) ;
2010-03-10 15:22:12 -08:00
2009-09-23 15:56:27 -07:00
/*
2013-08-01 09:52:15 +08:00
* We can't drop the pidlist_mutex before taking the l->rwsem in case
2009-09-23 15:56:27 -07:00
* the last ref-holder is trying to remove l from the list at the same
* time. Holding the pidlist_mutex precludes somebody taking whichever
* list we find out from under us - compare cgroup_release_pid_array().
*/
mutex_lock ( & cgrp - > pidlist_mutex ) ;
list_for_each_entry ( l , & cgrp - > pidlists , links ) {
if ( l - > key . type = = type & & l - > key . ns = = ns ) {
/* make sure l doesn't vanish out from under us */
2013-08-01 09:52:15 +08:00
down_write ( & l - > rwsem ) ;
2009-09-23 15:56:27 -07:00
mutex_unlock ( & cgrp - > pidlist_mutex ) ;
return l ;
}
}
/* entry not found; create a new one */
2013-06-12 21:04:51 -07:00
l = kzalloc ( sizeof ( struct cgroup_pidlist ) , GFP_KERNEL ) ;
2009-09-23 15:56:27 -07:00
if ( ! l ) {
mutex_unlock ( & cgrp - > pidlist_mutex ) ;
return l ;
}
2013-08-01 09:52:15 +08:00
init_rwsem ( & l - > rwsem ) ;
down_write ( & l - > rwsem ) ;
2009-09-23 15:56:27 -07:00
l - > key . type = type ;
2010-03-10 15:22:12 -08:00
l - > key . ns = get_pid_ns ( ns ) ;
2009-09-23 15:56:27 -07:00
l - > owner = cgrp ;
list_add ( & l - > links , & cgrp - > pidlists ) ;
mutex_unlock ( & cgrp - > pidlist_mutex ) ;
return l ;
}
2009-09-23 15:56:26 -07:00
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
2009-09-23 15:56:27 -07:00
static int pidlist_array_load ( struct cgroup * cgrp , enum cgroup_filetype type ,
struct cgroup_pidlist * * lp )
2009-09-23 15:56:26 -07:00
{
pid_t * array ;
int length ;
int pid , n = 0 ; /* used for populating the array */
2007-10-18 23:39:36 -07:00
struct cgroup_iter it ;
struct task_struct * tsk ;
2009-09-23 15:56:26 -07:00
struct cgroup_pidlist * l ;
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count ( cgrp ) ;
2009-09-23 15:56:28 -07:00
array = pidlist_allocate ( length ) ;
2009-09-23 15:56:26 -07:00
if ( ! array )
return - ENOMEM ;
/* now, populate the array */
2007-10-18 23:40:44 -07:00
cgroup_iter_start ( cgrp , & it ) ;
while ( ( tsk = cgroup_iter_next ( cgrp , & it ) ) ) {
2009-09-23 15:56:26 -07:00
if ( unlikely ( n = = length ) )
2007-10-18 23:39:36 -07:00
break ;
2009-09-23 15:56:26 -07:00
/* get tgid or pid for procs or tasks file respectively */
2009-09-23 15:56:27 -07:00
if ( type = = CGROUP_FILE_PROCS )
pid = task_tgid_vnr ( tsk ) ;
else
pid = task_pid_vnr ( tsk ) ;
2009-09-23 15:56:26 -07:00
if ( pid > 0 ) /* make sure to only use valid results */
array [ n + + ] = pid ;
2007-10-18 23:39:36 -07:00
}
2007-10-18 23:40:44 -07:00
cgroup_iter_end ( cgrp , & it ) ;
2009-09-23 15:56:26 -07:00
length = n ;
/* now sort & (if procs) strip out duplicates */
sort ( array , length , sizeof ( pid_t ) , cmppid , NULL ) ;
2009-09-23 15:56:27 -07:00
if ( type = = CGROUP_FILE_PROCS )
2013-03-12 15:36:00 -07:00
length = pidlist_uniq ( array , length ) ;
2009-09-23 15:56:27 -07:00
l = cgroup_pidlist_find ( cgrp , type ) ;
if ( ! l ) {
2009-09-23 15:56:28 -07:00
pidlist_free ( array ) ;
2009-09-23 15:56:27 -07:00
return - ENOMEM ;
2009-09-23 15:56:26 -07:00
}
2009-09-23 15:56:27 -07:00
/* store array, freeing old if necessary - lock already held */
2009-09-23 15:56:28 -07:00
pidlist_free ( l - > list ) ;
2009-09-23 15:56:26 -07:00
l - > list = array ;
l - > length = length ;
l - > use_count + + ;
2013-08-01 09:52:15 +08:00
up_write ( & l - > rwsem ) ;
2009-09-23 15:56:27 -07:00
* lp = l ;
2009-09-23 15:56:26 -07:00
return 0 ;
2007-10-18 23:39:32 -07:00
}
2007-10-18 23:39:44 -07:00
/**
2008-02-23 15:24:09 -08:00
* cgroupstats_build - build and fill cgroupstats
2007-10-18 23:39:44 -07:00
* @stats: cgroupstats to fill information into
* @dentry: A dentry belonging to the cgroup for which stats have
* been requested.
2008-02-23 15:24:09 -08:00
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
2007-10-18 23:39:44 -07:00
*/
int cgroupstats_build ( struct cgroupstats * stats , struct dentry * dentry )
{
int ret = - EINVAL ;
2007-10-18 23:40:44 -07:00
struct cgroup * cgrp ;
2007-10-18 23:39:44 -07:00
struct cgroup_iter it ;
struct task_struct * tsk ;
2008-11-19 15:36:48 -08:00
2007-10-18 23:39:44 -07:00
/*
2008-11-19 15:36:48 -08:00
* Validate dentry by checking the superblock operations,
* and make sure it's a directory.
2007-10-18 23:39:44 -07:00
*/
2008-11-19 15:36:48 -08:00
if ( dentry - > d_sb - > s_op ! = & cgroup_ops | |
! S_ISDIR ( dentry - > d_inode - > i_mode ) )
2007-10-18 23:39:44 -07:00
goto err ;
ret = 0 ;
2007-10-18 23:40:44 -07:00
cgrp = dentry - > d_fsdata ;
2007-10-18 23:39:44 -07:00
2007-10-18 23:40:44 -07:00
cgroup_iter_start ( cgrp , & it ) ;
while ( ( tsk = cgroup_iter_next ( cgrp , & it ) ) ) {
2007-10-18 23:39:44 -07:00
switch ( tsk - > state ) {
case TASK_RUNNING :
stats - > nr_running + + ;
break ;
case TASK_INTERRUPTIBLE :
stats - > nr_sleeping + + ;
break ;
case TASK_UNINTERRUPTIBLE :
stats - > nr_uninterruptible + + ;
break ;
case TASK_STOPPED :
stats - > nr_stopped + + ;
break ;
default :
if ( delayacct_is_task_waiting_on_io ( tsk ) )
stats - > nr_io_wait + + ;
break ;
}
}
2007-10-18 23:40:44 -07:00
cgroup_iter_end ( cgrp , & it ) ;
2007-10-18 23:39:44 -07:00
err :
return ret ;
}
2009-09-23 15:56:25 -07:00
2008-10-18 20:28:04 -07:00
/*
2009-09-23 15:56:26 -07:00
* seq_file methods for the tasks/procs files. The seq_file position is the
2008-10-18 20:28:04 -07:00
* next pid to display; the seq_file iterator is a pointer to the pid
2009-09-23 15:56:26 -07:00
* in the pidlist's ->list array.
2008-10-18 20:28:04 -07:00
*/
2009-09-23 15:56:26 -07:00
static void * cgroup_pidlist_start ( struct seq_file * s , loff_t * pos )
2008-10-18 20:28:04 -07:00
{
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
2009-09-23 15:56:26 -07:00
struct cgroup_pidlist * l = s - > private ;
2008-10-18 20:28:04 -07:00
int index = 0 , pid = * pos ;
int * iter ;
2013-08-01 09:52:15 +08:00
down_read ( & l - > rwsem ) ;
2008-10-18 20:28:04 -07:00
if ( pid ) {
2009-09-23 15:56:26 -07:00
int end = l - > length ;
2008-10-21 16:11:20 +11:00
2008-10-18 20:28:04 -07:00
while ( index < end ) {
int mid = ( index + end ) / 2 ;
2009-09-23 15:56:26 -07:00
if ( l - > list [ mid ] = = pid ) {
2008-10-18 20:28:04 -07:00
index = mid ;
break ;
2009-09-23 15:56:26 -07:00
} else if ( l - > list [ mid ] < = pid )
2008-10-18 20:28:04 -07:00
index = mid + 1 ;
else
end = mid ;
}
}
/* If we're off the end of the array, we're done */
2009-09-23 15:56:26 -07:00
if ( index > = l - > length )
2008-10-18 20:28:04 -07:00
return NULL ;
/* Update the abstract position to be the actual pid that we found */
2009-09-23 15:56:26 -07:00
iter = l - > list + index ;
2008-10-18 20:28:04 -07:00
* pos = * iter ;
return iter ;
2007-10-18 23:39:32 -07:00
}
2009-09-23 15:56:26 -07:00
static void cgroup_pidlist_stop ( struct seq_file * s , void * v )
2008-10-18 20:28:04 -07:00
{
2009-09-23 15:56:26 -07:00
struct cgroup_pidlist * l = s - > private ;
2013-08-01 09:52:15 +08:00
up_read ( & l - > rwsem ) ;
2008-10-18 20:28:04 -07:00
}
2009-09-23 15:56:26 -07:00
static void * cgroup_pidlist_next ( struct seq_file * s , void * v , loff_t * pos )
2008-10-18 20:28:04 -07:00
{
2009-09-23 15:56:26 -07:00
struct cgroup_pidlist * l = s - > private ;
pid_t * p = v ;
pid_t * end = l - > list + l - > length ;
2008-10-18 20:28:04 -07:00
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p + + ;
if ( p > = end ) {
return NULL ;
} else {
* pos = * p ;
return p ;
}
}
2009-09-23 15:56:26 -07:00
static int cgroup_pidlist_show ( struct seq_file * s , void * v )
2008-10-18 20:28:04 -07:00
{
return seq_printf ( s , " %d \n " , * ( int * ) v ) ;
}
2009-09-23 15:56:26 -07:00
/*
* seq_operations functions for iterating on pidlists through seq_file -
* independent of whether it's tasks or procs
*/
static const struct seq_operations cgroup_pidlist_seq_operations = {
. start = cgroup_pidlist_start ,
. stop = cgroup_pidlist_stop ,
. next = cgroup_pidlist_next ,
. show = cgroup_pidlist_show ,
2008-10-18 20:28:04 -07:00
} ;
2009-09-23 15:56:26 -07:00
static void cgroup_release_pid_array ( struct cgroup_pidlist * l )
2008-10-18 20:28:04 -07:00
{
2009-09-23 15:56:27 -07:00
/*
* the case where we're the last user of this particular pidlist will
* have us remove it from the cgroup's list, which entails taking the
* mutex. Since in cgroup_pidlist_find() the pidlist's rwsem depends on
* cgroup->pidlist_mutex, we have to take pidlist_mutex first.
*/
mutex_lock ( & l - > owner - > pidlist_mutex ) ;
2013-08-01 09:52:15 +08:00
down_write ( & l - > rwsem ) ;
2009-09-23 15:56:26 -07:00
BUG_ON ( ! l - > use_count ) ;
if ( ! - - l - > use_count ) {
2009-09-23 15:56:27 -07:00
/* we're the last user if refcount is 0; remove and free */
list_del ( & l - > links ) ;
mutex_unlock ( & l - > owner - > pidlist_mutex ) ;
2009-09-23 15:56:28 -07:00
pidlist_free ( l - > list ) ;
2009-09-23 15:56:27 -07:00
put_pid_ns ( l - > key . ns ) ;
2013-08-01 09:52:15 +08:00
up_write ( & l - > rwsem ) ;
2009-09-23 15:56:27 -07:00
kfree ( l ) ;
return ;
2008-10-18 20:28:04 -07:00
}
2009-09-23 15:56:27 -07:00
mutex_unlock ( & l - > owner - > pidlist_mutex ) ;
2013-08-01 09:52:15 +08:00
up_write ( & l - > rwsem ) ;
2008-10-18 20:28:04 -07:00
}
2009-09-23 15:56:26 -07:00
static int cgroup_pidlist_release ( struct inode * inode , struct file * file )
2007-10-18 23:39:32 -07:00
{
2009-09-23 15:56:26 -07:00
struct cgroup_pidlist * l ;
2007-10-18 23:39:32 -07:00
if ( ! ( file - > f_mode & FMODE_READ ) )
return 0 ;
2009-09-23 15:56:26 -07:00
/*
* the seq_file will only be initialized if the file was opened for
* reading; hence we check if it's not null only in that case.
*/
l = ( ( struct seq_file * ) file - > private_data ) - > private ;
cgroup_release_pid_array ( l ) ;
2008-10-18 20:28:04 -07:00
return seq_release ( inode , file ) ;
}
2009-09-23 15:56:26 -07:00
static const struct file_operations cgroup_pidlist_operations = {
2008-10-18 20:28:04 -07:00
. read = seq_read ,
. llseek = seq_lseek ,
. write = cgroup_file_write ,
2009-09-23 15:56:26 -07:00
. release = cgroup_pidlist_release ,
2008-10-18 20:28:04 -07:00
} ;
/*
2009-09-23 15:56:26 -07:00
* The following functions handle opens on a file that displays a pidlist
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
* in the cgroup.
2008-10-18 20:28:04 -07:00
*/
2009-09-23 15:56:26 -07:00
/* helper function for the two below it */
2009-09-23 15:56:27 -07:00
static int cgroup_pidlist_open ( struct file * file , enum cgroup_filetype type )
2008-10-18 20:28:04 -07:00
{
struct cgroup * cgrp = __d_cgrp ( file - > f_dentry - > d_parent ) ;
2009-09-23 15:56:27 -07:00
struct cgroup_pidlist * l ;
2008-10-18 20:28:04 -07:00
int retval ;
/* Nothing to do for write-only files */
if ( ! ( file - > f_mode & FMODE_READ ) )
return 0 ;
2007-10-18 23:39:32 -07:00
2009-09-23 15:56:26 -07:00
/* have the array populated */
2009-09-23 15:56:27 -07:00
retval = pidlist_array_load ( cgrp , type , & l ) ;
2009-09-23 15:56:26 -07:00
if ( retval )
return retval ;
/* configure file information */
file - > f_op = & cgroup_pidlist_operations ;
2007-10-18 23:39:32 -07:00
2009-09-23 15:56:26 -07:00
retval = seq_open ( file , & cgroup_pidlist_seq_operations ) ;
2008-10-18 20:28:04 -07:00
if ( retval ) {
2009-09-23 15:56:26 -07:00
cgroup_release_pid_array ( l ) ;
2008-10-18 20:28:04 -07:00
return retval ;
2007-10-18 23:39:32 -07:00
}
2009-09-23 15:56:26 -07:00
( ( struct seq_file * ) file - > private_data ) - > private = l ;
2007-10-18 23:39:32 -07:00
return 0 ;
}
2009-09-23 15:56:26 -07:00
static int cgroup_tasks_open ( struct inode * unused , struct file * file )
{
2009-09-23 15:56:27 -07:00
return cgroup_pidlist_open ( file , CGROUP_FILE_TASKS ) ;
2009-09-23 15:56:26 -07:00
}
static int cgroup_procs_open ( struct inode * unused , struct file * file )
{
2009-09-23 15:56:27 -07:00
return cgroup_pidlist_open ( file , CGROUP_FILE_PROCS ) ;
2009-09-23 15:56:26 -07:00
}
2007-10-18 23:39:32 -07:00
2007-10-18 23:40:44 -07:00
static u64 cgroup_read_notify_on_release ( struct cgroup * cgrp ,
2007-10-18 23:39:38 -07:00
struct cftype * cft )
{
2007-10-18 23:40:44 -07:00
return notify_on_release ( cgrp ) ;
2007-10-18 23:39:38 -07:00
}
2008-07-25 01:47:01 -07:00
static int cgroup_write_notify_on_release ( struct cgroup * cgrp ,
struct cftype * cft ,
u64 val )
{
clear_bit ( CGRP_RELEASABLE , & cgrp - > flags ) ;
if ( val )
set_bit ( CGRP_NOTIFY_ON_RELEASE , & cgrp - > flags ) ;
else
clear_bit ( CGRP_NOTIFY_ON_RELEASE , & cgrp - > flags ) ;
return 0 ;
}
2013-06-18 18:41:10 +08:00
/*
* When dput() is called asynchronously, if umount has been done and
* then deactivate_super() in cgroup_free_fn() kills the superblock,
* there's a small window in which the VFS will see the root dentry with
* a non-zero refcnt and trigger a BUG().
*
* That's why we hold a reference before dput() and drop it right after.
*/
static void cgroup_dput ( struct cgroup * cgrp )
{
struct super_block * sb = cgrp - > root - > sb ;
atomic_inc ( & sb - > s_active ) ;
dput ( cgrp - > dentry ) ;
deactivate_super ( sb ) ;
}
2010-03-10 15:22:20 -08:00
/*
* Unregister event and free resources.
*
* Gets called from workqueue.
*/
static void cgroup_event_remove ( struct work_struct * work )
{
struct cgroup_event * event = container_of ( work , struct cgroup_event ,
remove ) ;
struct cgroup * cgrp = event - > cgrp ;
2013-02-18 18:56:14 +08:00
remove_wait_queue ( event - > wqh , & event - > wait ) ;
2010-03-10 15:22:20 -08:00
event - > cft - > unregister_event ( cgrp , event - > cft , event - > eventfd ) ;
2013-02-18 18:56:14 +08:00
/* Notify userspace the event is going away. */
eventfd_signal ( event - > eventfd , 1 ) ;
2010-03-10 15:22:20 -08:00
eventfd_ctx_put ( event - > eventfd ) ;
kfree ( event ) ;
2013-06-18 18:41:10 +08:00
cgroup_dput ( cgrp ) ;
2010-03-10 15:22:20 -08:00
}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int cgroup_event_wake ( wait_queue_t * wait , unsigned mode ,
int sync , void * key )
{
struct cgroup_event * event = container_of ( wait ,
struct cgroup_event , wait ) ;
struct cgroup * cgrp = event - > cgrp ;
unsigned long flags = ( unsigned long ) key ;
if ( flags & POLLHUP ) {
/*
2013-02-18 18:56:14 +08:00
* If the event has been detached at cgroup removal, we
* can simply return knowing the other side will cleanup
* for us.
*
* We can't race against event freeing since the other
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
2010-03-10 15:22:20 -08:00
*/
2013-02-18 18:56:14 +08:00
spin_lock ( & cgrp - > event_list_lock ) ;
if ( ! list_empty ( & event - > list ) ) {
list_del_init ( & event - > list ) ;
/*
* We are in atomic context, but cgroup_event_remove()
* may sleep, so we have to call it in workqueue.
*/
schedule_work ( & event - > remove ) ;
}
spin_unlock ( & cgrp - > event_list_lock ) ;
2010-03-10 15:22:20 -08:00
}
return 0 ;
}
static void cgroup_event_ptable_queue_proc ( struct file * file ,
wait_queue_head_t * wqh , poll_table * pt )
{
struct cgroup_event * event = container_of ( pt ,
struct cgroup_event , pt ) ;
event - > wqh = wqh ;
add_wait_queue ( wqh , & event - > wait ) ;
}
/*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int cgroup_write_event_control ( struct cgroup * cgrp , struct cftype * cft ,
const char * buffer )
{
2013-08-01 09:51:47 +08:00
struct cgroup_event * event ;
2013-02-18 14:13:35 +08:00
struct cgroup * cgrp_cfile ;
2010-03-10 15:22:20 -08:00
unsigned int efd , cfd ;
2013-08-01 09:51:47 +08:00
struct file * efile ;
struct file * cfile ;
2010-03-10 15:22:20 -08:00
char * endp ;
int ret ;
efd = simple_strtoul ( buffer , & endp , 10 ) ;
if ( * endp ! = ' ' )
return - EINVAL ;
buffer = endp + 1 ;
cfd = simple_strtoul ( buffer , & endp , 10 ) ;
if ( ( * endp ! = ' ' ) & & ( * endp ! = ' \0 ' ) )
return - EINVAL ;
buffer = endp + 1 ;
event = kzalloc ( sizeof ( * event ) , GFP_KERNEL ) ;
if ( ! event )
return - ENOMEM ;
event - > cgrp = cgrp ;
INIT_LIST_HEAD ( & event - > list ) ;
init_poll_funcptr ( & event - > pt , cgroup_event_ptable_queue_proc ) ;
init_waitqueue_func_entry ( & event - > wait , cgroup_event_wake ) ;
INIT_WORK ( & event - > remove , cgroup_event_remove ) ;
efile = eventfd_fget ( efd ) ;
if ( IS_ERR ( efile ) ) {
ret = PTR_ERR ( efile ) ;
2013-08-01 09:51:47 +08:00
goto out_kfree ;
2010-03-10 15:22:20 -08:00
}
event - > eventfd = eventfd_ctx_fileget ( efile ) ;
if ( IS_ERR ( event - > eventfd ) ) {
ret = PTR_ERR ( event - > eventfd ) ;
2013-08-01 09:51:47 +08:00
goto out_put_efile ;
2010-03-10 15:22:20 -08:00
}
cfile = fget ( cfd ) ;
if ( ! cfile ) {
ret = - EBADF ;
2013-08-01 09:51:47 +08:00
goto out_put_eventfd ;
2010-03-10 15:22:20 -08:00
}
/* the process needs read permission on the control file */
2011-06-19 12:55:10 -04:00
/* AV: shouldn't we check that it's been opened for read instead? */
2013-01-23 17:07:38 -05:00
ret = inode_permission ( file_inode ( cfile ) , MAY_READ ) ;
2010-03-10 15:22:20 -08:00
if ( ret < 0 )
2013-08-01 09:51:47 +08:00
goto out_put_cfile ;
2010-03-10 15:22:20 -08:00
event - > cft = __file_cft ( cfile ) ;
if ( IS_ERR ( event - > cft ) ) {
ret = PTR_ERR ( event - > cft ) ;
2013-08-01 09:51:47 +08:00
goto out_put_cfile ;
2010-03-10 15:22:20 -08:00
}
2013-02-18 14:13:35 +08:00
/*
* The file to be monitored must be in the same cgroup as
* cgroup.event_control is.
*/
cgrp_cfile = __d_cgrp ( cfile - > f_dentry - > d_parent ) ;
if ( cgrp_cfile ! = cgrp ) {
ret = - EINVAL ;
2013-08-01 09:51:47 +08:00
goto out_put_cfile ;
2013-02-18 14:13:35 +08:00
}
2010-03-10 15:22:20 -08:00
if ( ! event - > cft - > register_event | | ! event - > cft - > unregister_event ) {
ret = - EINVAL ;
2013-08-01 09:51:47 +08:00
goto out_put_cfile ;
2010-03-10 15:22:20 -08:00
}
ret = event - > cft - > register_event ( cgrp , event - > cft ,
event - > eventfd , buffer ) ;
if ( ret )
2013-08-01 09:51:47 +08:00
goto out_put_cfile ;
2010-03-10 15:22:20 -08:00
2013-04-26 11:58:03 -07:00
efile - > f_op - > poll ( efile , & event - > pt ) ;
2010-03-10 15:22:20 -08:00
2010-03-10 15:22:34 -08:00
/*
* Events should be removed after rmdir of cgroup directory, but before
* destroying subsystem state objects. Let's take a reference to the
* cgroup directory dentry to do that.
*/
dget ( cgrp - > dentry ) ;
2010-03-10 15:22:20 -08:00
spin_lock ( & cgrp - > event_list_lock ) ;
list_add ( & event - > list , & cgrp - > event_list ) ;
spin_unlock ( & cgrp - > event_list_lock ) ;
fput ( cfile ) ;
fput ( efile ) ;
return 0 ;
2013-08-01 09:51:47 +08:00
out_put_cfile :
fput ( cfile ) ;
out_put_eventfd :
eventfd_ctx_put ( event - > eventfd ) ;
out_put_efile :
fput ( efile ) ;
out_kfree :
2010-03-10 15:22:20 -08:00
kfree ( event ) ;
return ret ;
}
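/*
 * Illustrative userspace sketch (not part of the kernel file) of the
 * '<event_fd> <control_fd> <args>' protocol parsed above.  The cgroup
 * path is an assumption; memory.usage_in_bytes thresholds are one
 * in-tree user of register_event().  Error handling is omitted.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

static int demo_wait_threshold(void)
{
	int efd = eventfd(0, 0);
	int cfd = open("/sys/fs/cgroup/memory/demo/memory.usage_in_bytes",
		       O_RDONLY);
	int ecfd = open("/sys/fs/cgroup/memory/demo/cgroup.event_control",
			O_WRONLY);
	uint64_t cnt;
	char buf[64];

	/* "<event_fd> <control_fd> <args>": arm at a 100 MiB threshold */
	snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 100ULL << 20);
	write(ecfd, buf, strlen(buf));

	read(efd, &cnt, sizeof(cnt));	/* blocks until the event fires */
	return 0;
}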
2010-10-27 15:33:35 -07:00
static u64 cgroup_clone_children_read ( struct cgroup * cgrp ,
struct cftype * cft )
{
2012-11-19 08:13:38 -08:00
return test_bit ( CGRP_CPUSET_CLONE_CHILDREN , & cgrp - > flags ) ;
2010-10-27 15:33:35 -07:00
}
static int cgroup_clone_children_write ( struct cgroup * cgrp ,
struct cftype * cft ,
u64 val )
{
if ( val )
2012-11-19 08:13:38 -08:00
set_bit ( CGRP_CPUSET_CLONE_CHILDREN , & cgrp - > flags ) ;
2010-10-27 15:33:35 -07:00
else
2012-11-19 08:13:38 -08:00
clear_bit ( CGRP_CPUSET_CLONE_CHILDREN , & cgrp - > flags ) ;
2010-10-27 15:33:35 -07:00
return 0 ;
}
2013-06-03 19:14:34 -07:00
static struct cftype cgroup_base_files [ ] = {
2007-10-18 23:39:38 -07:00
{
2013-06-03 19:14:34 -07:00
. name = " cgroup.procs " ,
2009-09-23 15:56:26 -07:00
. open = cgroup_procs_open ,
2011-05-26 16:25:20 -07:00
. write_u64 = cgroup_procs_write ,
2009-09-23 15:56:26 -07:00
. release = cgroup_pidlist_release ,
2011-05-26 16:25:20 -07:00
. mode = S_IRUGO | S_IWUSR ,
2009-09-23 15:56:26 -07:00
} ,
2007-10-18 23:39:38 -07:00
{
2013-06-03 19:14:34 -07:00
. name = " cgroup.event_control " ,
2010-03-10 15:22:20 -08:00
. write_string = cgroup_write_event_control ,
. mode = S_IWUGO ,
} ,
2010-10-27 15:33:35 -07:00
{
. name = " cgroup.clone_children " ,
2013-04-14 20:15:26 -07:00
. flags = CFTYPE_INSANE ,
2010-10-27 15:33:35 -07:00
. read_u64 = cgroup_clone_children_read ,
. write_u64 = cgroup_clone_children_write ,
} ,
2013-04-14 20:15:26 -07:00
{
. name = " cgroup.sane_behavior " ,
. flags = CFTYPE_ONLY_ON_ROOT ,
. read_seq_string = cgroup_sane_behavior_show ,
} ,
2013-06-03 19:14:34 -07:00
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
* don't exist if sane_behavior. If you're depending on these, be
* prepared to be burned.
*/
{
. name = " tasks " ,
. flags = CFTYPE_INSANE , /* use "procs" instead */
. open = cgroup_tasks_open ,
. write_u64 = cgroup_tasks_write ,
. release = cgroup_pidlist_release ,
. mode = S_IRUGO | S_IWUSR ,
} ,
{
. name = " notify_on_release " ,
. flags = CFTYPE_INSANE ,
. read_u64 = cgroup_read_notify_on_release ,
. write_u64 = cgroup_write_notify_on_release ,
} ,
2012-04-01 12:09:55 -07:00
{
. name = " release_agent " ,
2013-06-03 19:13:55 -07:00
. flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT ,
2012-04-01 12:09:55 -07:00
. read_seq_string = cgroup_release_agent_show ,
. write_string = cgroup_release_agent_write ,
. max_write_len = PATH_MAX ,
} ,
2012-04-01 12:09:55 -07:00
{ } /* terminate */
2007-10-18 23:39:32 -07:00
} ;
2012-08-23 16:53:29 -04:00
/**
2013-06-28 16:24:11 -07:00
* cgroup_populate_dir - create subsys files in a cgroup directory
2012-08-23 16:53:29 -04:00
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be added
2013-06-28 16:24:11 -07:00
*
* On failure, no file is added.
2012-08-23 16:53:29 -04:00
*/
2013-06-28 16:24:11 -07:00
static int cgroup_populate_dir ( struct cgroup * cgrp , unsigned long subsys_mask )
2007-10-18 23:39:30 -07:00
{
struct cgroup_subsys * ss ;
2013-07-12 12:34:02 -07:00
int i , ret = 0 ;
2007-10-18 23:39:30 -07:00
2012-04-01 12:09:55 -07:00
/* process cftsets of each subsystem */
2013-07-12 12:34:02 -07:00
for_each_subsys ( ss , i ) {
2012-04-01 12:09:55 -07:00
struct cftype_set * set ;
2013-07-12 12:34:02 -07:00
if ( ! test_bit ( i , & subsys_mask ) )
2012-08-23 16:53:29 -04:00
continue ;
2012-04-01 12:09:55 -07:00
2013-06-28 16:24:11 -07:00
list_for_each_entry ( set , & ss - > cftsets , node ) {
2013-08-08 20:11:23 -04:00
ret = cgroup_addrm_files ( cgrp , set - > cfts , true ) ;
2013-06-28 16:24:11 -07:00
if ( ret < 0 )
goto err ;
}
2007-10-18 23:39:30 -07:00
}
2012-04-01 12:09:55 -07:00
2009-04-02 16:57:25 -07:00
/* This cgroup is ready now */
2013-06-24 15:21:48 -07:00
for_each_root_subsys ( cgrp - > root , ss ) {
2009-04-02 16:57:25 -07:00
struct cgroup_subsys_state * css = cgrp - > subsys [ ss - > subsys_id ] ;
2013-06-21 15:52:33 -07:00
struct css_id * id = rcu_dereference_protected ( css - > id , true ) ;
2009-04-02 16:57:25 -07:00
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferenced
* from RCU-read-side without locks.
*/
2013-06-21 15:52:33 -07:00
if ( id )
rcu_assign_pointer ( id - > css , css ) ;
2009-04-02 16:57:25 -07:00
}
2007-10-18 23:39:30 -07:00
return 0 ;
2013-06-28 16:24:11 -07:00
err :
cgroup_clear_dir ( cgrp , subsys_mask ) ;
return ret ;
2007-10-18 23:39:30 -07:00
}
2012-04-01 12:09:56 -07:00
static void css_dput_fn ( struct work_struct * work )
{
struct cgroup_subsys_state * css =
container_of ( work , struct cgroup_subsys_state , dput_work ) ;
2013-06-18 18:41:10 +08:00
cgroup_dput ( css - > cgroup ) ;
2012-04-01 12:09:56 -07:00
}
2013-06-13 19:39:16 -07:00
static void css_release ( struct percpu_ref * ref )
{
struct cgroup_subsys_state * css =
container_of ( ref , struct cgroup_subsys_state , refcnt ) ;
schedule_work ( & css - > dput_work ) ;
}
2007-10-18 23:39:30 -07:00
static void init_cgroup_css ( struct cgroup_subsys_state * css ,
struct cgroup_subsys * ss ,
2007-10-18 23:40:44 -07:00
struct cgroup * cgrp )
2007-10-18 23:39:30 -07:00
{
2007-10-18 23:40:44 -07:00
css - > cgroup = cgrp ;
2013-08-08 20:11:22 -04:00
css - > ss = ss ;
2007-10-18 23:39:30 -07:00
css - > flags = 0 ;
2009-04-02 16:57:25 -07:00
css - > id = NULL ;
2013-06-24 15:21:47 -07:00
if ( cgrp = = cgroup_dummy_top )
2012-11-19 08:13:36 -08:00
css - > flags | = CSS_ROOT ;
2007-10-18 23:40:44 -07:00
BUG_ON ( cgrp - > subsys [ ss - > subsys_id ] ) ;
cgrp - > subsys [ ss - > subsys_id ] = css ;
2012-04-01 12:09:56 -07:00
/*
2012-11-05 09:16:58 -08:00
* css holds an extra ref to @cgrp->dentry which is put on the last
* css_put(). dput() requires process context, which css_put() may
* be called without. @css->dput_work will be used to invoke
* dput() asynchronously from css_put().
2012-04-01 12:09:56 -07:00
*/
INIT_WORK ( & css - > dput_work , css_dput_fn ) ;
2007-10-18 23:39:30 -07:00
}
2013-07-31 16:16:40 +08:00
/* invoke ->css_online() on a new CSS and mark it online if successful */
2012-11-19 08:13:38 -08:00
static int online_css ( struct cgroup_subsys * ss , struct cgroup * cgrp )
2012-11-19 08:13:37 -08:00
{
2013-08-08 20:11:23 -04:00
struct cgroup_subsys_state * css = cgrp - > subsys [ ss - > subsys_id ] ;
2012-11-19 08:13:38 -08:00
int ret = 0 ;
2012-11-19 08:13:37 -08:00
lockdep_assert_held ( & cgroup_mutex ) ;
2012-11-19 08:13:38 -08:00
if ( ss - > css_online )
2013-08-08 20:11:23 -04:00
ret = ss - > css_online ( css ) ;
2012-11-19 08:13:38 -08:00
if ( ! ret )
2013-08-08 20:11:23 -04:00
css - > flags | = CSS_ONLINE ;
2012-11-19 08:13:38 -08:00
return ret ;
2012-11-19 08:13:37 -08:00
}
2013-07-31 16:16:40 +08:00
/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
2012-11-19 08:13:37 -08:00
static void offline_css ( struct cgroup_subsys * ss , struct cgroup * cgrp )
{
struct cgroup_subsys_state * css = cgrp - > subsys [ ss - > subsys_id ] ;
lockdep_assert_held ( & cgroup_mutex ) ;
if ( ! ( css - > flags & CSS_ONLINE ) )
return ;
2013-03-12 15:35:59 -07:00
if ( ss - > css_offline )
2013-08-08 20:11:23 -04:00
ss - > css_offline ( css ) ;
2012-11-19 08:13:37 -08:00
2013-08-08 20:11:23 -04:00
css - > flags & = ~ CSS_ONLINE ;
2012-11-19 08:13:37 -08:00
}
2007-10-18 23:39:30 -07:00
/*
2008-02-23 15:24:09 -08:00
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
* @dentry: dentry of the new cgroup
* @mode: mode to set on new inode
2007-10-18 23:39:30 -07:00
*
2008-02-23 15:24:09 -08:00
* Must be called with the mutex on the parent inode held
2007-10-18 23:39:30 -07:00
*/
static long cgroup_create ( struct cgroup * parent , struct dentry * dentry ,
2011-07-26 01:55:55 -04:00
umode_t mode )
2007-10-18 23:39:30 -07:00
{
2007-10-18 23:40:44 -07:00
struct cgroup * cgrp ;
2013-03-01 15:01:56 +08:00
struct cgroup_name * name ;
2007-10-18 23:39:30 -07:00
struct cgroupfs_root * root = parent - > root ;
int err = 0 ;
struct cgroup_subsys * ss ;
struct super_block * sb = root - > sb ;
2012-11-19 09:02:12 -08:00
/* allocate the cgroup and its ID, 0 is reserved for the root */
2007-10-18 23:40:44 -07:00
cgrp = kzalloc ( sizeof ( * cgrp ) , GFP_KERNEL ) ;
if ( ! cgrp )
2007-10-18 23:39:30 -07:00
return - ENOMEM ;
2013-03-01 15:01:56 +08:00
name = cgroup_alloc_name ( dentry ) ;
if ( ! name )
goto err_free_cgrp ;
rcu_assign_pointer ( cgrp - > name , name ) ;
2013-07-31 09:50:50 +08:00
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
cgrp - > id = idr_alloc ( & root - > cgroup_idr , NULL , 1 , 0 , GFP_KERNEL ) ;
2012-11-19 09:02:12 -08:00
if ( cgrp - > id < 0 )
2013-03-01 15:01:56 +08:00
goto err_free_name ;
2012-11-19 09:02:12 -08:00
2012-11-05 09:16:59 -08:00
/*
* Only live parents can have children. Note that the liveness
* check isn't strictly necessary because cgroup_mkdir() and
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
* anyway so that locking is contained inside cgroup proper and we
* don't get nasty surprises if we ever grow another caller.
*/
if ( ! cgroup_lock_live_group ( parent ) ) {
err = - ENODEV ;
2012-11-19 09:02:12 -08:00
goto err_free_id ;
2012-11-05 09:16:59 -08:00
}
2007-10-18 23:39:30 -07:00
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* disappear while someone has an open control file on the
* fs */
atomic_inc ( & sb - > s_active ) ;
2008-10-18 20:28:04 -07:00
init_cgroup_housekeeping ( cgrp ) ;
2007-10-18 23:39:30 -07:00
2013-01-24 14:30:22 +08:00
dentry - > d_fsdata = cgrp ;
cgrp - > dentry = dentry ;
2007-10-18 23:40:44 -07:00
cgrp - > parent = parent ;
cgrp - > root = parent - > root ;
2007-10-18 23:39:30 -07:00
2008-03-04 14:28:19 -08:00
if ( notify_on_release ( parent ) )
set_bit ( CGRP_NOTIFY_ON_RELEASE , & cgrp - > flags ) ;
2012-11-19 08:13:38 -08:00
if ( test_bit ( CGRP_CPUSET_CLONE_CHILDREN , & parent - > flags ) )
set_bit ( CGRP_CPUSET_CLONE_CHILDREN , & cgrp - > flags ) ;
2010-10-27 15:33:35 -07:00
2013-06-24 15:21:48 -07:00
for_each_root_subsys ( root , ss ) {
2012-09-13 12:20:58 -07:00
struct cgroup_subsys_state * css ;
2010-02-02 13:44:10 -08:00
2013-08-08 20:11:23 -04:00
css = ss - > css_alloc ( parent - > subsys [ ss - > subsys_id ] ) ;
2007-10-18 23:39:30 -07:00
if ( IS_ERR ( css ) ) {
err = PTR_ERR ( css ) ;
2012-11-19 08:13:38 -08:00
goto err_free_all ;
2007-10-18 23:39:30 -07:00
}
2013-06-13 19:39:16 -07:00
err = percpu_ref_init ( & css - > refcnt , css_release ) ;
2013-07-31 16:16:28 +08:00
if ( err ) {
2013-08-08 20:11:23 -04:00
ss - > css_free ( css ) ;
2013-06-13 19:39:16 -07:00
goto err_free_all ;
2013-07-31 16:16:28 +08:00
}
2013-06-13 19:39:16 -07:00
2007-10-18 23:40:44 -07:00
init_cgroup_css ( css , ss , cgrp ) ;
2013-06-13 19:39:16 -07:00
2010-02-02 13:44:10 -08:00
if ( ss - > use_id ) {
err = alloc_css_id ( ss , parent , cgrp ) ;
if ( err )
2012-11-19 08:13:38 -08:00
goto err_free_all ;
2010-02-02 13:44:10 -08:00
}
2007-10-18 23:39:30 -07:00
}
2012-11-19 08:13:36 -08:00
/*
* Create directory. cgroup_create_file() returns with the new
* directory locked on success so that it can be populated without
* dropping cgroup_mutex.
*/
2012-11-19 08:13:36 -08:00
err = cgroup_create_file ( dentry , S_IFDIR | mode , sb ) ;
2007-10-18 23:39:30 -07:00
if ( err < 0 )
2012-11-19 08:13:38 -08:00
goto err_free_all ;
2012-11-19 08:13:36 -08:00
lockdep_assert_held ( & dentry - > d_inode - > i_mutex ) ;
2007-10-18 23:39:30 -07:00
2013-06-18 11:14:22 -07:00
cgrp - > serial_nr = cgroup_serial_nr_next + + ;
2013-05-24 10:55:38 +09:00
2012-11-19 08:13:36 -08:00
/* allocation complete, commit to creation */
list_add_tail_rcu ( & cgrp - > sibling , & cgrp - > parent - > children ) ;
root - > number_of_cgroups + + ;
2012-11-19 08:13:36 -08:00
2012-11-19 08:13:38 -08:00
/* each css holds a ref to the cgroup's dentry */
2013-06-24 15:21:48 -07:00
for_each_root_subsys ( root , ss )
2012-11-05 09:16:58 -08:00
dget ( dentry ) ;
2012-04-01 12:09:56 -07:00
2013-04-08 14:35:02 +08:00
/* hold a ref to the parent's dentry */
dget ( parent - > dentry ) ;
2012-11-19 08:13:38 -08:00
/* creation succeeded, notify subsystems */
2013-06-24 15:21:48 -07:00
for_each_root_subsys ( root , ss ) {
2012-11-19 08:13:38 -08:00
err = online_css ( ss , cgrp ) ;
if ( err )
goto err_destroy ;
2012-11-30 17:31:23 +04:00
if ( ss - > broken_hierarchy & & ! ss - > warned_broken_hierarchy & &
parent - > parent ) {
pr_warning ( " cgroup: %s (%d) created nested cgroup for controller \" %s \" which has incomplete hierarchy support. Nested cgroups may change behavior in the future. \n " ,
current - > comm , current - > pid , ss - > name ) ;
if ( ! strcmp ( ss - > name , " memory " ) )
pr_warning ( " cgroup: \" memory \" requires setting use_hierarchy to 1 on the root. \n " ) ;
ss - > warned_broken_hierarchy = true ;
}
2012-11-09 09:12:29 -08:00
}
2013-07-31 09:50:50 +08:00
idr_replace ( & root - > cgroup_idr , cgrp , cgrp - > id ) ;
2013-08-08 20:11:23 -04:00
err = cgroup_addrm_files ( cgrp , cgroup_base_files , true ) ;
2013-06-28 16:24:11 -07:00
if ( err )
goto err_destroy ;
err = cgroup_populate_dir ( cgrp , root - > subsys_mask ) ;
2012-11-19 08:13:38 -08:00
if ( err )
goto err_destroy ;
2007-10-18 23:39:30 -07:00
mutex_unlock ( & cgroup_mutex ) ;
2007-10-18 23:40:44 -07:00
mutex_unlock ( & cgrp - > dentry - > d_inode - > i_mutex ) ;
2007-10-18 23:39:30 -07:00
return 0 ;
err_free_all:
	for_each_root_subsys(root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

		if (css) {
			percpu_ref_cancel_init(&css->refcnt);
			ss->css_free(css);
		}
	}
	mutex_unlock(&cgroup_mutex);
	/* Release the reference count that we took on the superblock */
	deactivate_super(sb);
err_free_id:
	idr_remove(&root->cgroup_idr, cgrp->id);
err_free_name:
	kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp:
	kfree(cgrp);
	return err;

err_destroy:
	cgroup_destroy_locked(cgrp);
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&dentry->d_inode->i_mutex);
	return err;
}

static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct cgroup *c_parent = dentry->d_parent->d_fsdata;

	/* the vfs holds inode->i_mutex already */
	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
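
/*
 * Illustrative only: cgroup_mkdir() is reached through the VFS, so from
 * userspace a new cgroup is created simply by making a directory under a
 * mounted hierarchy (the mount point below is an assumption, not something
 * defined in this file):
 *
 *	mkdir /sys/fs/cgroup/cpu/mygroup
 *
 * The VFS takes i_mutex on the parent directory and then calls ->mkdir(),
 * which lands here and in turn calls cgroup_create().
 */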

static void cgroup_css_killed(struct cgroup *cgrp)
{
	if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
		return;

	/* percpu ref's of all css's are killed, kick off the next step */
	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
	schedule_work(&cgrp->destroy_work);
}

static void css_ref_killed_fn(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	cgroup_css_killed(css->cgroup);
}

/**
 * cgroup_destroy_locked - the first stage of cgroup destruction
 * @cgrp: cgroup to be destroyed
 *
 * css's make use of percpu refcnts whose killing latency shouldn't be
 * exposed to userland and are RCU protected.  Also, cgroup core needs to
 * guarantee that css_tryget() won't succeed by the time ->css_offline() is
 * invoked.  To satisfy all the requirements, destruction is implemented in
 * the following two steps.
 *
 * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
 *     userland visible parts and start killing the percpu refcnts of
 *     css's.  Set up so that the next stage will be kicked off once all
 *     the percpu refcnts are confirmed to be killed.
 *
 * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
 *     rest of destruction.  Once all cgroup references are gone, the
 *     cgroup is RCU-freed.
 *
 * This function implements s1.  After this step, @cgrp is gone as far as
 * the userland is concerned and a new cgroup with the same name may be
 * created.  As cgroup doesn't care about the names internally, this
 * doesn't cause any problem.
 */
static int cgroup_destroy_locked(struct cgroup *cgrp)
	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
	struct dentry *d = cgrp->dentry;
	struct cgroup_event *event, *tmp;
	struct cgroup_subsys *ss;
	bool empty;

	lockdep_assert_held(&d->d_inode->i_mutex);
	lockdep_assert_held(&cgroup_mutex);

	/*
	 * css_set_lock synchronizes access to ->cset_links and prevents
	 * @cgrp from being removed while __put_css_set() is in progress.
	 */
	read_lock(&css_set_lock);
	empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
	read_unlock(&css_set_lock);
	if (!empty)
		return -EBUSY;

	/*
	 * Block new css_tryget() by killing css refcnts.  cgroup core
	 * guarantees that, by the time ->css_offline() is invoked, no new
	 * css reference will be given out via css_tryget().  We can't
	 * simply call percpu_ref_kill() and proceed to offlining css's
	 * because percpu_ref_kill() doesn't guarantee that the ref is seen
	 * as killed on all CPUs on return.
	 *
	 * Use percpu_ref_kill_and_confirm() to get notifications as each
	 * css is confirmed to be seen as killed on all CPUs.  The
	 * notification callback keeps track of the number of css's to be
	 * killed and schedules cgroup_offline_fn() to perform the rest of
	 * destruction once the percpu refs of all css's are confirmed to
	 * be killed.
	 */
	atomic_set(&cgrp->css_kill_cnt, 1);
	for_each_root_subsys(cgrp->root, ss) {
		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];

		/*
		 * Killing would put the base ref, but we need to keep it
		 * alive until after ->css_offline.
		 */
		percpu_ref_get(&css->refcnt);

		atomic_inc(&cgrp->css_kill_cnt);
		percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
	}
	cgroup_css_killed(cgrp);

	/*
	 * Mark @cgrp dead.  This prevents further task migration and child
	 * creation by disabling cgroup_lock_live_group().  Note that
	 * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to
	 * resume iteration after dropping RCU read lock.  See
	 * cgroup_next_sibling() for details.
	 */
	set_bit(CGRP_DEAD, &cgrp->flags);

	/* CGRP_DEAD is set, remove from ->release_list for the last time */
	raw_spin_lock(&release_list_lock);
	if (!list_empty(&cgrp->release_list))
		list_del_init(&cgrp->release_list);
	raw_spin_unlock(&release_list_lock);

	/*
	 * Clear and remove @cgrp directory.  The removal puts the base ref
	 * but we aren't quite done with @cgrp yet, so hold onto it.
	 */
	cgroup_clear_dir(cgrp, cgrp->root->subsys_mask);
	cgroup_addrm_files(cgrp, cgroup_base_files, false);
	dget(d);
	cgroup_d_remove_dir(d);

	/*
	 * Unregister events and notify userspace.
	 * Notify userspace about cgroup removal only after rmdir of the
	 * cgroup directory to avoid a race between userspace and kernelspace.
	 */
	spin_lock(&cgrp->event_list_lock);
	list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
		list_del_init(&event->list);
		schedule_work(&event->remove);
	}
	spin_unlock(&cgrp->event_list_lock);

	return 0;
}

/**
 * cgroup_offline_fn - the second step of cgroup destruction
 * @work: cgroup->destroy_work
 *
 * This function is invoked from a work item for a cgroup which is being
 * destroyed after the percpu refcnts of all css's are guaranteed to be
 * seen as killed on all CPUs, and performs the rest of destruction.  This
 * is the second step of destruction described in the comment above
 * cgroup_destroy_locked().
 */
static void cgroup_offline_fn(struct work_struct *work)
{
	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
	struct cgroup *parent = cgrp->parent;
	struct dentry *d = cgrp->dentry;
	struct cgroup_subsys *ss;

	mutex_lock(&cgroup_mutex);

	/*
	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
	 * initiate destruction.
	 */
	for_each_root_subsys(cgrp->root, ss)
		offline_css(ss, cgrp);

	/*
	 * Put the css refs from cgroup_destroy_locked().  Each css holds
	 * an extra reference to the cgroup's dentry and cgroup removal
	 * proceeds regardless of css refs.  On the last put of each css,
	 * whenever that may be, the extra dentry ref is put so that dentry
	 * destruction happens only after all css's are released.
	 */
	for_each_root_subsys(cgrp->root, ss)
		css_put(cgrp->subsys[ss->subsys_id]);

	/* delete this cgroup from parent->children */
	list_del_rcu(&cgrp->sibling);

	/*
	 * We should remove the cgroup object from idr before its grace
	 * period starts, so we won't be looking up a cgroup while the
	 * cgroup is being freed.
	 */
	idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
	cgrp->id = -1;

	dput(d);

	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);

	mutex_unlock(&cgroup_mutex);
}

static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	int ret;

	mutex_lock(&cgroup_mutex);
	ret = cgroup_destroy_locked(dentry->d_fsdata);
	mutex_unlock(&cgroup_mutex);

	return ret;
}

static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
	INIT_LIST_HEAD(&ss->cftsets);

	/*
	 * base_cftset is embedded in subsys itself, no need to worry about
	 * deregistration.
	 */
	if (ss->base_cftypes) {
		struct cftype *cft;

		for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
			cft->ss = ss;

		ss->base_cftset.cfts = ss->base_cftypes;
		list_add_tail(&ss->base_cftset.node, &ss->cftsets);
	}
}

static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	mutex_lock(&cgroup_mutex);

	/* init base cftset */
	cgroup_init_cftsets(ss);

	/* Create the top cgroup state for this subsystem */
	list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
	ss->root = &cgroup_dummy_root;
	css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_cgroup_css(css, ss, cgroup_dummy_top);

	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's top cgroup. */
	init_css_set.subsys[ss->subsys_id] = css;

	need_forkexit_callback |= ss->fork || ss->exit;

	/* At system boot, before all subsystems have been
	 * registered, no tasks have been forked, so we don't
	 * need to invoke fork callbacks here. */
	BUG_ON(!list_empty(&init_task.tasks));

	BUG_ON(online_css(ss, cgroup_dummy_top));

	mutex_unlock(&cgroup_mutex);

	/* this function shouldn't be used with modular subsystems, since they
	 * need to register a subsys_id, among other things */
	BUG_ON(ss->module);
}

/**
 * cgroup_load_subsys: load and register a modular subsystem at runtime
 * @ss: the subsystem to load
 *
 * This function should be called in a modular subsystem's initcall. If the
 * subsystem is built as a module, it will be assigned a new subsys_id and set
 * up for use. If the subsystem is built-in anyway, work is delegated to the
 * simpler cgroup_init_subsys.
 */
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;
	int i, ret;
	struct hlist_node *tmp;
	struct css_set *cset;
	unsigned long key;

	/* check name and function validity */
	if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
	    ss->css_alloc == NULL || ss->css_free == NULL)
		return -EINVAL;

	/*
	 * we don't support callbacks in modular subsystems. this check is
	 * before the ss->module check for consistency; a subsystem that could
	 * be a module should still have no callbacks even if the user isn't
	 * compiling it as one.
	 */
	if (ss->fork || ss->exit)
		return -EINVAL;

	/*
	 * an optionally modular subsystem is built-in: we want to do nothing,
	 * since cgroup_init_subsys will have already taken care of it.
	 */
	if (ss->module == NULL) {
		/* a sanity check */
		BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
		return 0;
	}

	/* init base cftset */
	cgroup_init_cftsets(ss);

	mutex_lock(&cgroup_mutex);
	cgroup_subsys[ss->subsys_id] = ss;

	/*
	 * no ss->css_alloc seems to need anything important in the ss
	 * struct, so this can happen first (i.e. before the dummy root
	 * attachment).
	 */
	css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]);
	if (IS_ERR(css)) {
		/* failure case - need to deassign the cgroup_subsys[] slot. */
		cgroup_subsys[ss->subsys_id] = NULL;
		mutex_unlock(&cgroup_mutex);
		return PTR_ERR(css);
	}

	list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
	ss->root = &cgroup_dummy_root;

	/* our new subsystem will be attached to the dummy hierarchy. */
	init_cgroup_css(css, ss, cgroup_dummy_top);

	/* init_idr must be after init_cgroup_css because it sets css->id. */
	if (ss->use_id) {
		ret = cgroup_init_idr(ss, css);
		if (ret)
			goto err_unload;
	}

	/*
	 * Now we need to entangle the css into the existing css_sets. unlike
	 * in cgroup_init_subsys, there are now multiple css_sets, so each one
	 * will need a new pointer to it; done by iterating the css_set_table.
	 * furthermore, modifying the existing css_sets will corrupt the hash
	 * table state, so each changed css_set will need its hash recomputed.
	 * this is all done under the css_set_lock.
	 */
	write_lock(&css_set_lock);
	hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
		/* skip entries that we already rehashed */
		if (cset->subsys[ss->subsys_id])
			continue;
		/* remove existing entry */
		hash_del(&cset->hlist);
		/* set new value */
		cset->subsys[ss->subsys_id] = css;
		/* recompute hash and restore entry */
		key = css_set_hash(cset->subsys);
		hash_add(css_set_table, &cset->hlist, key);
	}
	write_unlock(&css_set_lock);

	ret = online_css(ss, cgroup_dummy_top);
	if (ret)
		goto err_unload;

	/* success! */
	mutex_unlock(&cgroup_mutex);
	return 0;

err_unload:
	mutex_unlock(&cgroup_mutex);
	/* @ss can't be mounted here as try_module_get() would fail */
	cgroup_unload_subsys(ss);
	return ret;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
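
/*
 * A minimal sketch of the registration described above, as it might look
 * in a modular subsystem's initcall ("foo_subsys" is a hypothetical
 * example, not a subsystem defined in this file):
 *
 *	static int __init foo_cgroup_init(void)
 *	{
 *		return cgroup_load_subsys(&foo_subsys);
 *	}
 *	module_init(foo_cgroup_init);
 */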

/**
 * cgroup_unload_subsys: unload a modular subsystem
 * @ss: the subsystem to unload
 *
 * This function should be called in a modular subsystem's exitcall. When this
 * function is invoked, the refcount on the subsystem's module will be 0, so
 * the subsystem will not be attached to any hierarchy.
 */
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
	struct cgrp_cset_link *link;

	BUG_ON(ss->module == NULL);

	/*
	 * we shouldn't be called if the subsystem is in use, and the use of
	 * try_module_get() in rebind_subsystems() should ensure that it
	 * doesn't start being used while we're killing it off.
	 */
	BUG_ON(ss->root != &cgroup_dummy_root);

	mutex_lock(&cgroup_mutex);

	offline_css(ss, cgroup_dummy_top);

	if (ss->use_id)
		idr_destroy(&ss->idr);

	/* deassign the subsys_id */
	cgroup_subsys[ss->subsys_id] = NULL;

	/* remove subsystem from the dummy root's list of subsystems */
	list_del_init(&ss->sibling);

	/*
	 * disentangle the css from all css_sets attached to the dummy
	 * top. as in loading, we need to pay our respects to the hashtable
	 * gods.
	 */
	write_lock(&css_set_lock);
	list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
		struct css_set *cset = link->cset;
		unsigned long key;

		hash_del(&cset->hlist);
		cset->subsys[ss->subsys_id] = NULL;
		key = css_set_hash(cset->subsys);
		hash_add(css_set_table, &cset->hlist, key);
	}
	write_unlock(&css_set_lock);

	/*
	 * remove subsystem's css from the cgroup_dummy_top and free it -
	 * need to free before marking as null because ss->css_free needs
	 * the cgrp->subsys pointer to find its state. note that this
	 * also takes care of freeing the css_id.
	 */
	ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]);
	cgroup_dummy_top->subsys[ss->subsys_id] = NULL;

	mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
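
/*
 * Sketch of the matching exitcall for the hypothetical foo_subsys above;
 * by the time this runs the module refcount is 0, so the subsystem cannot
 * be attached to any hierarchy:
 *
 *	static void __exit foo_cgroup_exit(void)
 *	{
 *		cgroup_unload_subsys(&foo_subsys);
 *	}
 *	module_exit(foo_cgroup_exit);
 */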

/**
 * cgroup_init_early - cgroup initialization at system boot
 *
 * Initialize cgroups at system boot, and initialize any
 * subsystems that request early init.
 */
int __init cgroup_init_early(void)
{
	struct cgroup_subsys *ss;
	int i;

	atomic_set(&init_css_set.refcount, 1);
	INIT_LIST_HEAD(&init_css_set.cgrp_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	INIT_HLIST_NODE(&init_css_set.hlist);
	css_set_count = 1;
	init_cgroup_root(&cgroup_dummy_root);
	cgroup_root_count = 1;
	RCU_INIT_POINTER(init_task.cgroups, &init_css_set);

	init_cgrp_cset_link.cset = &init_css_set;
	init_cgrp_cset_link.cgrp = cgroup_dummy_top;
	list_add(&init_cgrp_cset_link.cset_link, &cgroup_dummy_top->cset_links);
	list_add(&init_cgrp_cset_link.cgrp_link, &init_css_set.cgrp_links);

	/* at bootup time, we don't worry about modular subsystems */
	for_each_builtin_subsys(ss, i) {
		BUG_ON(!ss->name);
		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
		BUG_ON(!ss->css_alloc);
		BUG_ON(!ss->css_free);
		if (ss->subsys_id != i) {
			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
			       ss->name, ss->subsys_id);
			BUG();
		}

		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}

/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 */
int __init cgroup_init(void)
{
	struct cgroup_subsys *ss;
	unsigned long key;
	int i, err;

	err = bdi_init(&cgroup_backing_dev_info);
	if (err)
		return err;

	for_each_builtin_subsys(ss, i) {
		if (!ss->early_init)
			cgroup_init_subsys(ss);
		if (ss->use_id)
			cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
	}

	/* allocate id for the dummy hierarchy */
	mutex_lock(&cgroup_mutex);
	mutex_lock(&cgroup_root_mutex);

	/* Add init_css_set to the hash table */
	key = css_set_hash(init_css_set.subsys);
	hash_add(css_set_table, &init_css_set.hlist, key);

	BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1));

	err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top,
			0, 1, GFP_KERNEL);
	BUG_ON(err < 0);

	mutex_unlock(&cgroup_root_mutex);
	mutex_unlock(&cgroup_mutex);

	cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
	if (!cgroup_kobj) {
		err = -ENOMEM;
		goto out;
	}

	err = register_filesystem(&cgroup_fs_type);
	if (err < 0) {
		kobject_put(cgroup_kobj);
		goto out;
	}

	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);

out:
	if (err)
		bdi_destroy(&cgroup_backing_dev_info);

	return err;
}

/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
 *    doesn't really matter if tsk->cgroup changes after we read it,
 *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
 *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
 *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting task's
 *    cgroup to top_cgroup.
 */

/* TODO: Use a proper seq_file iterator */
int proc_cgroup_show(struct seq_file *m, void *v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	int retval;
	struct cgroupfs_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = 0;

	mutex_lock(&cgroup_mutex);

	for_each_active_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int count = 0;

		seq_printf(m, "%d:", root->hierarchy_id);
		for_each_root_subsys(root, ss)
			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');
		cgrp = task_cgroup_from_root(tsk, root);
		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
		if (retval < 0)
			goto out_unlock;
		seq_puts(m, buf);
		seq_putc(m, '\n');
	}

out_unlock:
	mutex_unlock(&cgroup_mutex);
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}
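
/*
 * For reference, each line emitted above has the form
 * hierarchy-ID:comma-separated-subsystems:cgroup-path, e.g. (values
 * illustrative):
 *
 *	3:cpuacct,cpu:/user/1000
 */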

/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
	struct cgroup_subsys *ss;
	int i;

	seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
	/*
	 * ideally we don't want subsystems moving around while we do this.
	 * cgroup_mutex is also necessary to guarantee an atomic snapshot of
	 * subsys/hierarchy state.
	 */
	mutex_lock(&cgroup_mutex);

	for_each_subsys(ss, i)
		seq_printf(m, "%s\t%d\t%d\t%d\n",
			   ss->name, ss->root->hierarchy_id,
			   ss->root->number_of_cgroups, !ss->disabled);

	mutex_unlock(&cgroup_mutex);
	return 0;
}
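
/*
 * For reference, a /proc/cgroups line produced above looks like
 * (values illustrative):
 *
 *	#subsys_name	hierarchy	num_cgroups	enabled
 *	cpuset		2		4		1
 */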

static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}

static const struct file_operations proc_cgroupstats_operations = {
	.open = cgroupstats_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/**
 * cgroup_fork - attach newly forked task to its parent's cgroup.
 * @child: pointer to task_struct of the newly forked child process.
 *
 * Description: A task inherits its parent's cgroup at fork().
 *
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct().  However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
 * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced css_set to be removed and freed.
 *
 * At the point that cgroup_fork() is called, 'current' is the parent
 * task, and the passed argument 'child' points to the child task.
 */
void cgroup_fork(struct task_struct *child)
{
	task_lock(current);
	get_css_set(task_css_set(current));
	child->cgroups = current->cgroups;
	task_unlock(current);
	INIT_LIST_HEAD(&child->cg_list);
}

/**
 * cgroup_post_fork - called on a new task after adding it to the task list
 * @child: the task in question
 *
 * Adds the task to the list running through its css_set if necessary and
 * calls the subsystem fork() callbacks.  Has to be after the task is
 * visible on the task list in case we race with the first call to
 * cgroup_iter_start() - to guarantee that the new task ends up on its
 * list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;

	/*
	 * use_task_css_set_links is set to 1 before we walk the tasklist
	 * under the tasklist_lock and we read it here after we added the child
	 * to the tasklist under the tasklist_lock as well. If the child wasn't
	 * yet in the tasklist when we walked through it from
	 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value
	 * should be visible now due to the paired locking and barriers implied
	 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock
	 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
	 * lock on fork.
	 */
	if (use_task_css_set_links) {
		write_lock(&css_set_lock);
		task_lock(child);
		if (list_empty(&child->cg_list))
			list_add(&child->cg_list, &task_css_set(child)->tasks);
		task_unlock(child);
		write_unlock(&css_set_lock);
	}

	/*
	 * Call ss->fork().  This must happen after @child is linked on
	 * css_set; otherwise, @child might change state between ->fork()
	 * and addition to css_set.
	 */
	if (need_forkexit_callback) {
		/*
		 * fork/exit callbacks are supported only for builtin
		 * subsystems, and the builtin section of the subsys
		 * array is immutable, so we don't need to lock the
		 * subsys array here. On the other hand, modular section
		 * of the array can be freed at module unload, so we
		 * can't touch that.
		 */
		for_each_builtin_subsys(ss, i)
			if (ss->fork)
				ss->fork(child);
	}
}

/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 * @run_callbacks: run exit callbacks?
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
 * the_top_cgroup_hack:
 *
 *    Set the exiting task's cgroup to the root cgroup (top_cgroup).
 *
 *    We call cgroup_exit() while the task is still competent to
 *    handle notify_on_release(), then leave the task attached to the
 *    root cgroup in each hierarchy for the remainder of its exit.
 *
 *    To do this properly, we would increment the reference count on
 *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
 *    code we would add a second cgroup function call, to drop that
 *    reference.  This would just create an unnecessary hot spot on
 *    the top_cgroup reference count, to no avail.
 *
 *    Normally, holding a reference to a cgroup without bumping its
 *    count is unsafe.  The cgroup could go away, or someone could
 *    attach us to a different cgroup, decrementing the count on
 *    the first cgroup that we never incremented.  But in this case,
 *    top_cgroup isn't going away, and either task has PF_EXITING set,
 *    which wards off any cgroup_attach_task() attempts, or task is a
 *    failed fork, never visible to cgroup_attach_task.
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	/*
	 * Unlink from the css_set task list if necessary.
	 * Optimistically check cg_list before taking
	 * css_set_lock
	 */
	if (!list_empty(&tsk->cg_list)) {
		write_lock(&css_set_lock);
		if (!list_empty(&tsk->cg_list))
			list_del_init(&tsk->cg_list);
		write_unlock(&css_set_lock);
	}

	/* Reassign the task to the init_css_set. */
	task_lock(tsk);
	cset = task_css_set(tsk);
	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);

	if (run_callbacks && need_forkexit_callback) {
		/*
		 * fork/exit callbacks are supported only for builtin
		 * subsystems, see cgroup_post_fork() for details.
		 */
		for_each_builtin_subsys(ss, i) {
			if (ss->exit) {
				struct cgroup_subsys_state *old_css = cset->subsys[i];
				struct cgroup_subsys_state *css = task_css(tsk, i);

				ss->exit(css, old_css, tsk);
			}
		}
	}
	task_unlock(tsk);

	put_css_set_taskexit(cset);
}

static void check_for_release(struct cgroup *cgrp)
{
	if (cgroup_is_releasable(cgrp) &&
	    list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
		/*
		 * Control Group is currently removable. If it's not
		 * already queued for a userspace notification, queue
		 * it now
		 */
		int need_schedule_work = 0;

		raw_spin_lock(&release_list_lock);
		if (!cgroup_is_dead(cgrp) &&
		    list_empty(&cgrp->release_list)) {
			list_add(&cgrp->release_list, &release_list);
			need_schedule_work = 1;
		}
		raw_spin_unlock(&release_list_lock);
		if (need_schedule_work)
			schedule_work(&release_agent_work);
	}
}

/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	raw_spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						 struct cgroup,
						 release_list);
		list_del_init(&cgrp->release_list);
		raw_spin_unlock(&release_list_lock);
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		raw_spin_lock(&release_list_lock);
	}
	raw_spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}
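
/*
 * Illustrative invocation: if a hierarchy's release_agent path were set
 * to, say, /sbin/cgroup-release (an assumed path, not defined here), the
 * helper above would run it as
 *
 *	/sbin/cgroup-release /mygroup
 *
 * i.e. with the released cgroup's hierarchy-relative path as the single
 * argument and the minimal HOME/PATH environment built above.
 */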

static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		/*
		 * cgroup_disable, being at boot time, can't know about
		 * module subsystems, so we don't worry about them.
		 */
		for_each_builtin_subsys(ss, i) {
			if (!strcmp(token, ss->name)) {
				ss->disabled = 1;
				printk(KERN_INFO "Disabling %s control group"
					" subsystem\n", ss->name);
				break;
			}
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);
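
/*
 * Example (boot command line): "cgroup_disable=memory" disables the
 * memory controller; several subsystem names may be given separated by
 * commas, e.g. "cgroup_disable=memory,cpuset".
 */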

/*
 * Functions for CSS ID.
 */

/* to get ID other than 0, this should be called when !cgroup_is_dead() */
unsigned short css_id(struct cgroup_subsys_state *css)
{
	struct css_id *cssid;

	/*
	 * This css_id() can return correct value when someone has refcnt
	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
	 * it's unchanged until freed.
	 */
	cssid = rcu_dereference_raw(css->id);

	if (cssid)
		return cssid->id;
	return 0;
}
EXPORT_SYMBOL_GPL(css_id);

/**
 * css_is_ancestor - test "root" css is an ancestor of "child"
 * @child: the css to be tested.
 * @root: the css supposed to be an ancestor of the child.
 *
 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
 * this function reads css->id, the caller must hold rcu_read_lock().
 * But, considering usual usage, the csses should be valid objects after test.
 * Assuming that the caller will do some action to the child if this returns
 * true, the caller must take "child"'s reference count.
 * If "child" is a valid object and this returns true, "root" is valid, too.
 */
bool css_is_ancestor(struct cgroup_subsys_state *child,
		     const struct cgroup_subsys_state *root)
{
	struct css_id *child_id;
	struct css_id *root_id;

	child_id = rcu_dereference(child->id);
	if (!child_id)
		return false;
	root_id = rcu_dereference(root->id);
	if (!root_id)
		return false;
	if (child_id->depth < root_id->depth)
		return false;
	if (child_id->stack[root_id->depth] != root_id->id)
		return false;
	return true;
}
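
/*
 * A minimal usage sketch, assuming the caller already holds valid css
 * pointers (child_css, root_css and do_something() are hypothetical);
 * rcu_read_lock() is required because css->id is read:
 *
 *	rcu_read_lock();
 *	if (css_is_ancestor(child_css, root_css))
 *		do_something(child_css);
 *	rcu_read_unlock();
 */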

void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
	struct css_id *id = rcu_dereference_protected(css->id, true);

	/* When this is called before css_id initialization, id can be NULL */
	if (!id)
		return;

	BUG_ON(!ss->use_id);
	rcu_assign_pointer(id->css, NULL);
	rcu_assign_pointer(css->id, NULL);
	spin_lock(&ss->id_lock);
	idr_remove(&ss->idr, id->id);
	spin_unlock(&ss->id_lock);
	kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);

/*
 * This is called by init or create(). Then, calls to this function are
 * always serialized (by cgroup_mutex at create()).
 */
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
	struct css_id *newid;
	int ret, size;

	BUG_ON(!ss->use_id);

	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
	newid = kzalloc(size, GFP_KERNEL);
	if (!newid)
		return ERR_PTR(-ENOMEM);

	idr_preload(GFP_KERNEL);
	spin_lock(&ss->id_lock);
	/* Don't use 0. allocates an ID of 1-65535 */
	ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
	spin_unlock(&ss->id_lock);
	idr_preload_end();

	/* Returns error when there are no free spaces for new ID.*/
	if (ret < 0)
		goto err_out;

	newid->id = ret;
	newid->depth = depth;
	return newid;
err_out:
	kfree(newid);
	return ERR_PTR(ret);
}

static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
					    struct cgroup_subsys_state *rootcss)
{
	struct css_id *newid;

	spin_lock_init(&ss->id_lock);
	idr_init(&ss->idr);

	newid = get_new_cssid(ss, 0);
	if (IS_ERR(newid))
		return PTR_ERR(newid);

	newid->stack[0] = newid->id;
	RCU_INIT_POINTER(newid->css, rootcss);
	RCU_INIT_POINTER(rootcss->id, newid);
	return 0;
}

static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
			struct cgroup *child)
{
	int subsys_id, i, depth = 0;
	struct cgroup_subsys_state *parent_css, *child_css;
	struct css_id *child_id, *parent_id;

	subsys_id = ss->subsys_id;
	parent_css = parent->subsys[subsys_id];
	child_css = child->subsys[subsys_id];
	parent_id = rcu_dereference_protected(parent_css->id, true);
	depth = parent_id->depth + 1;

	child_id = get_new_cssid(ss, depth);
	if (IS_ERR(child_id))
		return PTR_ERR(child_id);

	for (i = 0; i < depth; i++)
		child_id->stack[i] = parent_id->stack[i];
	child_id->stack[depth] = child_id->id;
	/*
	 * child_id->css pointer will be set after this cgroup is available
	 * see cgroup_populate_dir()
	 */
	rcu_assign_pointer(child_css->id, child_id);

	return 0;
}
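
/*
 * Illustration of the id stack built above (values are examples only):
 * for a css at depth 2 whose ancestors were assigned ids 1 and 5 and
 * which itself was assigned id 9, child_id->stack is { 1, 5, 9 }.
 * css_is_ancestor() then only has to check
 * stack[root->depth] == root->id, an O(1) test.
 */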

/**
 * css_lookup - lookup css by id
 * @ss: cgroup subsys to be looked into.
 * @id: the id
 *
 * Returns pointer to cgroup_subsys_state if there is a valid one with id.
 * NULL if not. Should be called under rcu_read_lock()
 */
struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
{
	struct css_id *cssid = NULL;

	BUG_ON(!ss->use_id);
	cssid = idr_find(&ss->idr, id);

	if (unlikely(!cssid))
		return NULL;

	return rcu_dereference(cssid->css);
}
EXPORT_SYMBOL_GPL(css_lookup);
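
/*
 * Usage sketch for css_lookup(), assuming a valid subsystem and an id
 * previously returned by css_id(); use_css() is a hypothetical consumer.
 * Must run under rcu_read_lock(), and css_tryget() is needed before the
 * css may be used outside the RCU read section:
 *
 *	rcu_read_lock();
 *	css = css_lookup(ss, id);
 *	if (css && css_tryget(css))
 *		use_css(css);
 *	rcu_read_unlock();
 */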

/*
 * get corresponding css from file open on cgroupfs directory
 */
struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
{
	struct cgroup *cgrp;
	struct inode *inode;
	struct cgroup_subsys_state *css;

	inode = file_inode(f);
	/* check in cgroup filesystem dir */
	if (inode->i_op != &cgroup_dir_inode_operations)
		return ERR_PTR(-EBADF);

	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
		return ERR_PTR(-EINVAL);

	/* get cgroup */
	cgrp = __d_cgrp(f->f_dentry);
	css = cgrp->subsys[id];
	return css ? css : ERR_PTR(-ENOENT);
}

#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);

	if (!css)
		return ERR_PTR(-ENOMEM);

	return css;
}

static void debug_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
{
	return cgroup_task_count(cgrp);
}

static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
{
	return (u64)(unsigned long)current->cgroups;
}

static u64 current_css_set_refcount_read(struct cgroup *cgrp,
					 struct cftype *cft)
{
	u64 count;

	rcu_read_lock();
	count = atomic_read(&task_css_set(current)->refcount);
	rcu_read_unlock();
	return count;
}

static int current_css_set_cg_links_read(struct cgroup *cgrp,
					 struct cftype *cft,
					 struct seq_file *seq)
{
	struct cgrp_cset_link *link;
	struct css_set *cset;

	read_lock(&css_set_lock);
	rcu_read_lock();
	cset = rcu_dereference(current->cgroups);
	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
		struct cgroup *c = link->cgrp;
		const char *name;

		if (c->dentry)
			name = c->dentry->d_name.name;
		else
			name = "?";
		seq_printf(seq, "Root %d group %s\n",
			   c->root->hierarchy_id, name);
	}
	rcu_read_unlock();
	read_unlock(&css_set_lock);
	return 0;
}

#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct cgroup *cgrp,
				 struct cftype *cft,
				 struct seq_file *seq)
{
	struct cgrp_cset_link *link;

	read_lock(&css_set_lock);
	list_for_each_entry(link, &cgrp->cset_links, cset_link) {
		struct css_set *cset = link->cset;
		struct task_struct *task;
		int count = 0;

		seq_printf(seq, "css_set %p\n", cset);
		list_for_each_entry(task, &cset->tasks, cg_list) {
			if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
				seq_puts(seq, "  ...\n");
				break;
			} else {
				seq_printf(seq, "  task %d\n",
					   task_pid_vnr(task));
			}
		}
	}
	read_unlock(&css_set_lock);
	return 0;
}

static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
	return test_bit(CGRP_RELEASABLE, &cgrp->flags);
}

static struct cftype debug_files[] = {
	{
		.name = "taskcount",
		.read_u64 = debug_taskcount_read,
	},

	{
		.name = "current_css_set",
		.read_u64 = current_css_set_read,
	},

	{
		.name = "current_css_set_refcount",
		.read_u64 = current_css_set_refcount_read,
	},

	{
		.name = "current_css_set_cg_links",
		.read_seq_string = current_css_set_cg_links_read,
	},

	{
		.name = "cgroup_css_links",
		.read_seq_string = cgroup_css_links_read,
	},

	{
		.name = "releasable",
		.read_u64 = releasable_read,
	},

	{ }	/* terminate */
};

struct cgroup_subsys debug_subsys = {
	.name = "debug",
	.css_alloc = debug_css_alloc,
	.css_free = debug_css_free,
	.subsys_id = debug_subsys_id,
	.base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */