Newer
Older
/*
* Generic process-grouping system.
*
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
* Notifications support
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
*
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* ---------------------------------------------------
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
#include <linux/cgroup.h>
#include <linux/cred.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
* Expiring in the middle is a performance problem not a correctness one.
* 1 sec should be enough.
*/
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
/*
* cgroup_tree_mutex nests above cgroup_mutex and protects cftypes, file
* creation/removal and hierarchy changing operations including cgroup
* creation, removal, css association and controller rebinding. This outer
* lock is needed mainly to resolve the circular dependency between kernfs
* active ref and cgroup_mutex. cgroup_tree_mutex nests above both.
*/
static DEFINE_MUTEX(cgroup_tree_mutex);
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*/
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */
static DEFINE_MUTEX(cgroup_mutex);
/*
* Protects cgroup_subsys->release_agent_path. Modifying it also requires
* cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
*/
static DEFINE_SPINLOCK(release_agent_path_lock);
#define cgroup_assert_mutexes_or_rcu_locked() \
rcu_lockdep_assert(rcu_read_lock_held() || \
lockdep_is_held(&cgroup_mutex), \
"cgroup_[tree_]mutex or RCU read lock required");
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_destroy_wq;
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
* The dummy hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
static struct cgroupfs_root cgroup_dummy_root;
/* dummy_top is a shorthand for the dummy hierarchy's top cgroup */
static struct cgroup * const cgroup_dummy_top = &cgroup_dummy_root.top_cgroup;
/* The list of hierarchy roots */
static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
/*
* Assign a monotonically increasing serial number to cgroups. It
* guarantees cgroups with bigger numbers are newer than those with smaller
* numbers. Also, as cgroups are always appended to the parent's
* ->children list, it guarantees that sibling cgroups are always sorted in
* the ascending serial number order on the list. Protected by
* cgroup_mutex.
static u64 cgroup_serial_nr_next = 1;
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
* extra work in the fork/exit path if none of the subsystems need to
* be called.
static int need_forkexit_callback __read_mostly;
Tejun Heo
committed
static struct cftype cgroup_base_files[];
static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long added_mask, unsigned removed_mask);
static void cgroup_destroy_css_killed(struct cgroup *cgrp);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns the dummy_css)
* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
* function must be called either under cgroup_mutex or rcu_read_lock() and
* the caller is responsible for pinning the returned css if it wants to
* keep accessing it outside the said locks. This function may return
* %NULL if @cgrp doesn't have @subsys_id enabled.
*/
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
if (ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_tree_mutex) ||
lockdep_is_held(&cgroup_mutex));
else
return &cgrp->dummy_css;
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
return test_bit(CGRP_DEAD, &cgrp->flags);
struct cgroup_subsys_state *seq_css(struct seq_file *seq)
{
struct kernfs_open_file *of = seq->private;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = seq_cft(seq);
/*
* This is open and unprotected implementation of cgroup_css().
* seq_css() is only called from a kernfs file operation which has
* an active reference on the file. Because all the subsystem
* files are drained before a css is disassociated with a cgroup,
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
if (cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->dummy_css;
}
EXPORT_SYMBOL_GPL(seq_css);
/**
* cgroup_is_descendant - test ancestry
* @cgrp: the cgroup to be tested
* @ancestor: possible ancestor of @cgrp
*
* Test whether @cgrp is a descendant of @ancestor. It also returns %true
* if @cgrp == @ancestor. This function is safe to call as long as @cgrp
* and @ancestor are accessible.
*/
bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
{
while (cgrp) {
if (cgrp == ancestor)
return true;
cgrp = cgrp->parent;
}
return false;
}
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
const int bits =
(1 << CGRP_RELEASABLE) |
(1 << CGRP_NOTIFY_ON_RELEASE);
return (cgrp->flags & bits) == bits;
}
static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \
lockdep_is_held(&cgroup_tree_mutex) || \
lockdep_is_held(&cgroup_mutex)))) { } \
else
* for_each_subsys - iterate all enabled cgroup subsystems
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
#define for_each_subsys(ss, ssid) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
/* iterate across the active hierarchies */
#define for_each_active_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
*
* On success, returns true; the mutex should be later unlocked. On
* failure returns false with no lock held.
static bool cgroup_lock_live_group(struct cgroup *cgrp)
{
mutex_lock(&cgroup_mutex);
mutex_unlock(&cgroup_mutex);
return false;
}
return true;
}
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
* direction, a css_set is naturally associated with multiple cgroups.
* This M:N relationship is represented by the following link structure
* which exists for each association and allows traversing the associations
* from both sides.
*/
struct cgrp_cset_link {
/* the cgroup and css_set this link associates */
struct cgroup *cgrp;
struct css_set *cset;
/* list of cgrp_cset_links anchored at cgrp->cset_links */
struct list_head cset_link;
/* list of cgrp_cset_links anchored at css_set->cgrp_links */
struct list_head cgrp_link;
};
/* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
static struct css_set init_css_set;
static struct cgrp_cset_link init_cgrp_cset_link;
* css_set_rwsem protects the list of css_set objects, and the chain of
* tasks off each css_set.
static DECLARE_RWSEM(css_set_rwsem);
static int css_set_count;
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
* account cgroups in empty hierarchies.
*/
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
struct cgroup_subsys *ss;
int i;
key += (unsigned long)css[i];
key = (key >> 16) ^ key;
static void put_css_set_locked(struct css_set *cset, bool taskexit)
struct cgrp_cset_link *link, *tmp_link;
lockdep_assert_held(&css_set_rwsem);
if (!atomic_dec_and_test(&cset->refcount))
return;
/* This css_set is dead. unlink it and release cgroup refcounts */
hash_del(&cset->hlist);
css_set_count--;
list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
struct cgroup *cgrp = link->cgrp;
list_del(&link->cset_link);
list_del(&link->cgrp_link);
/* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links) && notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
kfree(link);
kfree_rcu(cset, rcu_head);
static void put_css_set(struct css_set *cset, bool taskexit)
{
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
* rwlock
*/
if (atomic_add_unless(&cset->refcount, -1, 1))
return;
down_write(&css_set_rwsem);
put_css_set_locked(cset, taskexit);
up_write(&css_set_rwsem);
}
/*
* refcounted get/put for css_set objects
*/
static inline void get_css_set(struct css_set *cset)
atomic_inc(&cset->refcount);
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
* @old_cset: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
* Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
static bool compare_css_sets(struct css_set *cset,
struct css_set *old_cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct list_head *l1, *l2;
if (memcmp(template, cset->subsys, sizeof(cset->subsys))) {
/* Not all subsystems matched */
return false;
}
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in heirarchies with no subsystems. We
* could get by with just this check alone (and skip the
* memcmp above) but on most setups the memcmp check will
* avoid the need for this more expensive check on almost all
* candidates.
*/
l1 = &cset->cgrp_links;
l2 = &old_cset->cgrp_links;
while (1) {
struct cgrp_cset_link *link1, *link2;
struct cgroup *cgrp1, *cgrp2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
if (l1 == &cset->cgrp_links) {
BUG_ON(l2 != &old_cset->cgrp_links);
break;
} else {
BUG_ON(l2 == &old_cset->cgrp_links);
}
/* Locate the cgroups associated with these links. */
link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
cgrp1 = link1->cgrp;
cgrp2 = link2->cgrp;
/* Hierarchies should be linked in the same order. */
BUG_ON(cgrp1->root != cgrp2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
* that's changing, then we need to check that this
* css_set points to the new cgroup; if it's any other
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
if (cgrp1->root == new_cgrp->root) {
if (cgrp1 != new_cgrp)
return false;
} else {
if (cgrp1 != cgrp2)
return false;
}
}
return true;
}
/**
* find_existing_css_set - init css array and find the matching css_set
* @old_cset: the css_set that we're using before the cgroup transition
* @cgrp: the cgroup that we're moving into
* @template: out param for the new set of csses, should be clear on entry
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
struct cgroupfs_root *root = cgrp->root;
struct css_set *cset;
/*
* Build the set of subsystem state objects that we want to see in the
* new css_set. while subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
if (root->subsys_mask & (1UL << i)) {
/* Subsystem is in this hierarchy. So we want
* the subsystem state from the new
* cgroup */
template[i] = cgroup_css(cgrp, ss);
} else {
/* Subsystem is not in this hierarchy, so we
* don't want to change the subsystem state */
template[i] = old_cset->subsys[i];
hash_for_each_possible(css_set_table, cset, hlist, key) {
if (!compare_css_sets(cset, old_cset, cgrp, template))
continue;
/* This css_set matches what we need */
/* No existing cgroup group matched */
return NULL;
}
static void free_cgrp_cset_links(struct list_head *links_to_free)
struct cgrp_cset_link *link, *tmp_link;
list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
list_del(&link->cset_link);
kfree(link);
}
}
/**
* allocate_cgrp_cset_links - allocate cgrp_cset_links
* @count: the number of links to allocate
* @tmp_links: list_head the allocated links are put on
*
* Allocate @count cgrp_cset_link structures and chain them on @tmp_links
* through ->cset_link. Returns 0 on success or -errno.
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
struct cgrp_cset_link *link;
INIT_LIST_HEAD(tmp_links);
for (i = 0; i < count; i++) {
link = kzalloc(sizeof(*link), GFP_KERNEL);
free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
* @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgroup *cgrp)
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
list_move(&link->cset_link, &cgrp->cset_links);
/*
* Always add links to the tail of the list so that the list
* is sorted by order of hierarchy creation
*/
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
/**
* find_css_set - return a new css_set with one cgroup updated
* @old_cset: the baseline css_set
* @cgrp: the cgroup to be updated
*
* Return a new css_set that's equivalent to @old_cset, but with @cgrp
* substituted into the appropriate hierarchy.
static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup *cgrp)
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
down_read(&css_set_rwsem);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
up_read(&css_set_rwsem);
if (cset)
return cset;
cset = kzalloc(sizeof(*cset), GFP_KERNEL);
return NULL;
/* Allocate all the cgrp_cset_link objects that we'll need */
if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
return NULL;
}
atomic_set(&cset->refcount, 1);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == cgrp->root)
c = cgrp;
link_css_set(&tmp_links, cset, c);
BUG_ON(!list_empty(&tmp_links));
css_set_count++;
/* Add this cgroup group to the hash table */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
up_write(&css_set_rwsem);
static struct cgroupfs_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
struct cgroup *top_cgrp = kf_root->kn->priv;
return top_cgrp->root;
}
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end)
{
int id;
lockdep_assert_held(&cgroup_mutex);
id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, start, end,
GFP_KERNEL);
if (id < 0)
return id;
root->hierarchy_id = id;
return 0;
}
static void cgroup_exit_root_id(struct cgroupfs_root *root)
{
lockdep_assert_held(&cgroup_mutex);
if (root->hierarchy_id) {
idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
root->hierarchy_id = 0;
}
}
static void cgroup_free_root(struct cgroupfs_root *root)
{
if (root) {
/* hierarhcy ID shoulid already have been released */
WARN_ON_ONCE(root->hierarchy_id);
idr_destroy(&root->cgroup_idr);
kfree(root);
}
}
static void cgroup_destroy_root(struct cgroupfs_root *root)
struct cgroup *cgrp = &root->top_cgroup;
struct cgrp_cset_link *link, *tmp_link;
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
BUG_ON(!list_empty(&cgrp->children));
/* Rebind all subsystems back to the default hierarchy */
WARN_ON(rebind_subsystems(root, 0, root->subsys_mask));
/*
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
down_write(&css_set_rwsem);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
kfree(link);
}
up_write(&css_set_rwsem);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
cgroup_root_count--;
}
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
cgroup_free_root(root);
}
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroupfs_root *root)
{
struct cgroup *res = NULL;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_rwsem);
if (cset == &init_css_set) {
res = &root->top_cgroup;
} else {
struct cgrp_cset_link *link;
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
BUG_ON(!res);
return res;
}
/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_rwsem held.
*/
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroupfs_root *root)
{
/*
* No need to lock the task - since we hold cgroup_mutex the
* task can't change groups, so the only thing that can happen
* is that it exits and its css is set back to init_css_set.
*/
return cset_cgroup_from_root(task_css_set(task), root);
}
/*
* There is one global cgroup mutex. We also require taking
* task_lock() when dereferencing a task's cgroup subsys pointers.
* See "The task_lock() exception", at the end of this comment.
*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
* So in general, code holding cgroup_mutex can't rely on the count
* field not changing. However, if the count goes to zero, then only
* cgroup_attach_task() can increment it again. Because a count of zero
* means that no tasks are currently attached, therefore there is no
* way a task attached to that cgroup can fork (the other way to
* increment the count). So code holding cgroup_mutex can safely
* assume that if the count is zero, it will stay zero. Similarly, if
* a task holds cgroup_mutex on a cgroup with zero count, it
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
* (usually) take cgroup_mutex. These are the two most performance
* critical pieces of code here. The exception occurs on cgroup_exit(),
* when a task in a notify_on_release cgroup exits. Then cgroup_mutex
* is taken, and if the cgroup count is zero, a usermode call made
* to the release agent with the name of the cgroup (path relative to
* the root of cgroup file system) as the argument.
*
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
* least one task in the system (init, pid == 1), therefore, top_cgroup
* always has either children cgroups and/or using tasks. So we don't
* need a special hack to ensure that top_cgroup cannot be deleted.
*
* The task_lock() exception
*
* The need for this exception arises from the action of
* cgroup_attach_task(), which overwrites one task's cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
* in cgroup_attach_task(), modifying a task's cgroup pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
Tejun Heo
committed
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
{
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
cft->ss->name, cft->name);
else
strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
}
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
* returns cft->mode if ->mode is not 0
* returns S_IRUGO|S_IWUSR if it has both a read and a write handler
* returns S_IRUGO if it has only a read handler
* returns S_IWUSR if it has only a write hander
*/
static umode_t cgroup_file_mode(const struct cftype *cft)
{
umode_t mode = 0;
if (cft->mode)
return cft->mode;
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
if (cft->write_u64 || cft->write_s64 || cft->write_string ||
cft->trigger)
mode |= S_IWUSR;
return mode;
}
static void cgroup_free_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
Tejun Heo
committed
atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
if (cgrp->parent) {
/*
* We get a ref to the parent, and put the ref when this
* cgroup is being freed, so it's guaranteed that the
* parent won't be destroyed before its children.
*/
cgroup_put(cgrp->parent);
kernfs_put(cgrp->kn);
kfree(cgrp);
} else {
/*
* This is top cgroup's refcnt reaching zero, which
* indicates that the root should be released.
*/
cgroup_destroy_root(cgrp->root);
}
}
static void cgroup_free_rcu(struct rcu_head *head)
{
struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
static void cgroup_get(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
WARN_ON_ONCE(atomic_read(&cgrp->refcnt) <= 0);
atomic_inc(&cgrp->refcnt);
static void cgroup_put(struct cgroup *cgrp)
{
if (!atomic_dec_and_test(&cgrp->refcnt))
return;
if (WARN_ON_ONCE(cgrp->parent && !cgroup_is_dead(cgrp)))
/*
* XXX: cgrp->id is only used to look up css's. As cgroup and
* css's lifetimes will be decoupled, it should be made
* per-subsystem and moved to css->id so that lookups are
* successful until the target css is released.
*/
mutex_lock(&cgroup_mutex);
idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
mutex_unlock(&cgroup_mutex);
cgrp->id = -1;
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
lockdep_assert_held(&cgroup_tree_mutex);
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
Tejun Heo
committed
* cgroup_clear_dir - remove subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be removed
*/
Tejun Heo
committed
static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
struct cgroup_subsys *ss;
Tejun Heo
committed
int i;
Tejun Heo
committed
for_each_subsys(ss, i) {
Tejun Heo
committed
if (!test_bit(i, &subsys_mask))
list_for_each_entry(cfts, &ss->cfts, node)
cgroup_addrm_files(cgrp, cfts, false);
}
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long added_mask, unsigned removed_mask)
struct cgroup *cgrp = &root->top_cgroup;
Tejun Heo
committed
int i, ret;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
/* Check that any added subsystems are currently free */
for_each_subsys(ss, i)
if ((added_mask & (1 << i)) && ss->root != &cgroup_dummy_root)
return -EBUSY;