/*
* Generic process-grouping system.
*
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
*
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* ---------------------------------------------------
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
#include <linux/cgroup.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <asm/atomic.h>
/*
 * The single global cgroup mutex; cgroup_lock() and cgroup_unlock() in this
 * file are thin wrappers that take and release it.
 */
static DEFINE_MUTEX(cgroup_mutex);
/*
 * Generate an array of cgroup subsystem pointers: with SUBSYS() defined as
 * below, every SUBSYS(x) line pulled in from <linux/cgroup_subsys.h>
 * expands to "&x_subsys,".
 */
#define SUBSYS(_x) &_x ## _subsys,
static struct cgroup_subsys *subsys[] = {
#include <linux/cgroup_subsys.h>
};
/* Size in bytes of the cgroupfs_root.name buffer */
#define MAX_CGROUP_ROOT_NAMELEN 64
/*
 * A cgroupfs_root represents the root of a cgroup hierarchy,
 * and may be associated with a superblock to form an active
 * hierarchy.
 */
struct cgroupfs_root {
/* Superblock associated with this hierarchy */
struct super_block *sb;
/*
 * The bitmask of subsystems intended to be attached to this
 * hierarchy (bit i corresponds to subsys[i])
 */
unsigned long subsys_bits;
/* The bitmask of subsystems currently attached to this hierarchy */
unsigned long actual_subsys_bits;
/*
 * A list running through the attached subsystems, linked via
 * cgroup_subsys.sibling (see for_each_subsys())
 */
struct list_head subsys_list;
/* The root cgroup for this hierarchy */
struct cgroup top_cgroup;
/* Tracks how many cgroups are currently defined in hierarchy. */
int number_of_cgroups;
/* A list running through the active hierarchies, anchored on "roots" */
struct list_head root_list;
/* Hierarchy-specific flags (bit numbers such as ROOT_NOPREFIX) */
unsigned long flags;
/* The path to use for release notifications. */
char release_agent_path[PATH_MAX];
/* The name for this hierarchy - may be empty */
char name[MAX_CGROUP_ROOT_NAMELEN];
};
/*
 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
 * subsystems that are otherwise unattached - it never has more than a
 * single cgroup, and all tasks are part of that cgroup. Its top cgroup
 * is reachable through the "dummytop" shorthand macro.
 */
static struct cgroupfs_root rootnode;
/*
 * CSS ID -- ID per subsystem's Cgroup Subsys State (CSS). Used only when
 * cgroup_subsys->use_id != 0.
 */
/* Largest CSS ID value; matches the range of the unsigned short id field */
#define CSS_ID_MAX (65535)
struct css_id {
/*
 * The css to which this ID points. This pointer is set to a valid value
 * after the cgroup is populated. If the cgroup is removed, this will be
 * NULL. This pointer is expected to be RCU-safe because destroy()
 * is called after synchronize_rcu(). But for safe use, css_is_removed()
 * or css_tryget() should be used to avoid races.
 */
struct cgroup_subsys_state *css;
/*
 * ID of this css.
 */
unsigned short id;
/*
 * Depth in the hierarchy at which this ID sits.
 */
unsigned short depth;
/*
 * ID is freed by RCU. (and lookup routine is RCU safe.)
 */
struct rcu_head rcu_head;
/*
 * The hierarchy this CSS ID belongs to; trailing variable-length
 * array, allocated together with the structure.
 */
unsigned short stack[0]; /* Array of Length (depth+1) */
};
/* The list of hierarchy roots, linked through cgroupfs_root.root_list */
static LIST_HEAD(roots);
/*
 * Count of hierarchy roots; find_css_set() allocates this many
 * cg_cgroup_link objects for each new css_set.
 */
static int root_count;
/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
/* This flag indicates whether tasks in the fork and exit paths should
* check for fork/exit handlers to call. This avoids us having to do
* extra work in the fork/exit path if none of the subsystems need to
* be called.
static int need_forkexit_callback __read_mostly;
/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
return test_bit(CGRP_REMOVED, &cgrp->flags);
}
/* Bit numbers used in the flags field of struct cgroupfs_root */
enum {
ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};
/*
 * cgroup_is_releasable - test whether a cgroup may be handed to the
 * release machinery
 * @cgrp: the cgroup to test
 *
 * Returns non-zero only when both CGRP_RELEASABLE and
 * CGRP_NOTIFY_ON_RELEASE are set in @cgrp->flags.
 */
static int cgroup_is_releasable(const struct cgroup *cgrp)
{
	const int releasable = 1 << CGRP_RELEASABLE;
	const int notify = 1 << CGRP_NOTIFY_ON_RELEASE;
	const int wanted = releasable | notify;

	return (cgrp->flags & wanted) == wanted;
}
static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
/*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy. _root is a struct cgroupfs_root pointer and _ss the
 * struct cgroup_subsys cursor; the walk follows _root->subsys_list through
 * each subsystem's "sibling" link.
 *
 * NOTE(review): _root is expanded without parentheses, so callers should
 * pass a simple pointer expression.
 */
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)
/*
 * for_each_active_root() allows you to iterate across the active
 * hierarchies, i.e. every cgroupfs_root linked on "roots" via root_list.
 */
#define for_each_active_root(_root) \
list_for_each_entry(_root, &roots, root_list)
/*
 * The list of cgroups eligible for automatic release. Protected by
 * release_list_lock.
 */
static LIST_HEAD(release_list);
static DEFINE_SPINLOCK(release_list_lock);
/* Work item whose handler is cgroup_release_agent() */
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
/* Forward declaration; called from __put_css_set() */
static void check_for_release(struct cgroup *cgrp);
/*
 * Link structure for associating css_set objects with cgroups. Each link
 * sits on two lists at once: one anchored on the cgroup, one on the css_set.
 */
struct cg_cgroup_link {
/*
 * List running through cg_cgroup_links associated with a
 * cgroup, anchored on cgroup->css_sets
 */
struct list_head cgrp_link_list;
/*
 * List running through cg_cgroup_links pointing at a
 * single css_set object, anchored on css_set->cg_links
 */
struct list_head cg_link_list;
/* The css_set this link belongs to (set by link_css_set()) */
struct css_set *cg;
};
/*
 * The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;
/* Forward declaration; the definition is not in this part of the file */
static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
/*
 * css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set. Nests outside task->alloc_lock
 * due to cgroup_iter_start()
 */
static DEFINE_RWLOCK(css_set_lock);
/*
 * Number of css_set objects; incremented in find_css_set() and
 * decremented in unlink_css_set(), both under css_set_lock.
 */
static int css_set_count;
/*
 * Hash table of css_sets. This improves the performance of finding an
 * existing css_set; buckets are selected by css_set_hash().
 */
#define CSS_SET_HASH_BITS 7
#define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
/*
 * css_set_hash - pick the hash bucket for a set of subsystem states
 * @css: array of CGROUP_SUBSYS_COUNT subsystem state pointers
 *
 * Sums the pointer values, folds the upper bits down by 16, and hashes the
 * result to CSS_SET_HASH_BITS bits. Returns the matching bucket head in
 * css_set_table.
 */
static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
{
	unsigned long sum = 0UL;
	int ssid = 0;

	while (ssid < CGROUP_SUBSYS_COUNT) {
		sum += (unsigned long)css[ssid];
		ssid++;
	}

	sum ^= sum >> 16;

	return &css_set_table[hash_long(sum, CSS_SET_HASH_BITS)];
}
/*
 * We don't maintain the lists running through each css_set to its
 * tasks until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
 * compiled into their kernel but not actually in use.
 */
static int use_task_css_set_links __read_mostly;
/* When we create or destroy a css_set, the operation simply
* takes/releases a reference count on all the cgroups referenced
* by subsystems in this css_set. This can end up multiple-counting
* some cgroups, but that's OK - the ref-count is just a
* busy/not-busy indicator; ensuring that we only count each cgroup
* once would require taking a global lock to ensure that no
* subsystems moved between hierarchies while we were doing so.
*
* Possible TODO: decide at boot time based on the number of
* registered subsystems and the number of CPUs or NUMA nodes whether
* it's better for performance to ref-count every subsystem, or to
* take a global lock and only add one ref count to each hierarchy.
*/
/*
* unlink a css_set from the list and free it
*/
static void unlink_css_set(struct css_set *cg)
struct cg_cgroup_link *link;
struct cg_cgroup_link *saved_link;
css_set_count--;
list_for_each_entry_safe(link, saved_link, &cg->cg_links,
cg_link_list) {
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
kfree(link);
}
}
static void __put_css_set(struct css_set *cg, int taskexit)
{
int i;
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
* rwlock
*/
if (atomic_add_unless(&cg->refcount, -1, 1))
return;
write_lock(&css_set_lock);
if (!atomic_dec_and_test(&cg->refcount)) {
write_unlock(&css_set_lock);
return;
}
unlink_css_set(cg);
write_unlock(&css_set_lock);
rcu_read_lock();
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
if (atomic_dec_and_test(&cgrp->count) &&
notify_on_release(cgrp)) {
if (taskexit)
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
}
rcu_read_unlock();
/*
 * refcounted get/put for css_set objects
 */

/* Take an additional reference on @cg by incrementing its refcount. */
static inline void get_css_set(struct css_set *cg)
{
atomic_inc(&cg->refcount);
}
/*
 * put_css_set - drop a reference on a css_set outside the task-exit path
 * @cg: the css_set to release
 *
 * Equivalent to __put_css_set(cg, 0): cgroups left unused by this put are
 * not marked CGRP_RELEASABLE.
 */
static inline void put_css_set(struct css_set *cg)
{
	__put_css_set(cg, 0);
}
/*
 * Drop a reference on @cg on behalf of an exiting task: calls
 * __put_css_set() with taskexit set, so cgroups whose count drops to zero
 * and that have notify_on_release enabled are marked CGRP_RELEASABLE.
 */
static inline void put_css_set_taskexit(struct css_set *cg)
{
__put_css_set(cg, 1);
}
/*
* find_existing_css_set() is a helper for
* find_css_set(), and checks to see whether an existing
*
* oldcg: the cgroup group that we're using before the cgroup
* transition
*
* cgrp: the cgroup that we're moving into
*
* template: location in which to build the desired set of subsystem
* state objects for the new cgroup group
*/
static struct css_set *find_existing_css_set(
struct css_set *oldcg,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
struct cgroupfs_root *root = cgrp->root;
struct hlist_head *hhead;
struct hlist_node *node;
struct css_set *cg;
/* Built the set of subsystem state objects that we want to
* see in the new css_set */
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
/* Subsystem is in this hierarchy. So we want
* the subsystem state from the new
* cgroup */
template[i] = cgrp->subsys[i];
} else {
/* Subsystem is not in this hierarchy, so we
* don't want to change the subsystem state */
template[i] = oldcg->subsys[i];
}
}
hhead = css_set_hash(template);
hlist_for_each_entry(cg, node, hhead, hlist) {
if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
/* All subsystems matched */
return cg;
}
/* No existing cgroup group matched */
return NULL;
}
static void free_cg_links(struct list_head *tmp)
{
struct cg_cgroup_link *link;
struct cg_cgroup_link *saved_link;
list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
list_del(&link->cgrp_link_list);
kfree(link);
}
}
/*
 * allocate_cg_links() allocates "count" cg_cgroup_link structures
 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
 * success or a negative error (-ENOMEM). On failure every link allocated
 * so far is freed again, so tmp is left empty and the caller has nothing
 * to clean up.
 */
static int allocate_cg_links(int count, struct list_head *tmp)
{
	struct cg_cgroup_link *link;
	int i;

	INIT_LIST_HEAD(tmp);
	for (i = 0; i < count; i++) {
		link = kmalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			/* Don't leak the links already chained on tmp. */
			free_cg_links(tmp);
			return -ENOMEM;
		}
		list_add(&link->cgrp_link_list, tmp);
	}
	return 0;
}
/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
 * @cg: the css_set to be linked
 * @cgrp: the destination cgroup
 *
 * Consumes the first preallocated link on @tmp_cg_links: it is pointed at
 * @cg, moved onto @cgrp->css_sets and added to @cg->cg_links. It is a bug
 * to call this with an empty @tmp_cg_links.
 */
static void link_css_set(struct list_head *tmp_cg_links,
			 struct css_set *cg, struct cgroup *cgrp)
{
	struct cg_cgroup_link *spare;

	BUG_ON(list_empty(tmp_cg_links));

	spare = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
				 cgrp_link_list);
	spare->cg = cg;

	/* cgroup side first, then css_set side */
	list_move(&spare->cgrp_link_list, &cgrp->css_sets);
	list_add(&spare->cg_link_list, &cg->cg_links);
}
/*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
 * equivalent to the old group, but with the given cgroup
 * substituted into the appropriate hierarchy. Must be called with
 * cgroup_mutex held
 *
 * The returned css_set carries a reference for the caller: either an
 * extra reference on an existing set, or the initial reference of a
 * newly created one. Returns NULL if memory allocation fails.
 */
static struct css_set *find_css_set(
	struct css_set *oldcg, struct cgroup *cgrp)
{
	struct css_set *res;
	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
	int i;
	struct list_head tmp_cg_links;
	struct hlist_head *hhead;

	/* First see if we already have a cgroup group that matches
	 * the desired set */
	read_lock(&css_set_lock);
	res = find_existing_css_set(oldcg, cgrp, template);
	if (res)
		get_css_set(res);
	read_unlock(&css_set_lock);

	if (res)
		return res;

	res = kmalloc(sizeof(*res), GFP_KERNEL);
	if (!res)
		return NULL;

	/* Allocate all the cg_cgroup_link objects that we'll need */
	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
		kfree(res);
		return NULL;
	}

	atomic_set(&res->refcount, 1);
	INIT_LIST_HEAD(&res->cg_links);
	INIT_LIST_HEAD(&res->tasks);

	/* Copy the set of subsystem state objects generated in
	 * find_existing_css_set() */
	memcpy(res->subsys, template, sizeof(res->subsys));

	write_lock(&css_set_lock);
	/* Add reference counts and links from the new css_set. */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup *sub_cgrp = res->subsys[i]->cgroup;
		struct cgroup_subsys *ss = subsys[i];

		atomic_inc(&sub_cgrp->count);
		/*
		 * We want to add a link once per cgroup, so we
		 * only do it for the first subsystem in each
		 * hierarchy
		 */
		if (ss->root->subsys_list.next == &ss->sibling)
			link_css_set(&tmp_cg_links, res, sub_cgrp);
	}
	/*
	 * With no subsystem left on the dummy hierarchy the loop above
	 * never linked it, so link its top cgroup explicitly.
	 */
	if (list_empty(&rootnode.subsys_list))
		link_css_set(&tmp_cg_links, res, dummytop);

	/* Exactly root_count links were preallocated; all must be used. */
	BUG_ON(!list_empty(&tmp_cg_links));

	css_set_count++;

	/* Add this cgroup group to the hash table */
	hhead = css_set_hash(res->subsys);
	hlist_add_head(&res->hlist, hhead);

	write_unlock(&css_set_lock);

	return res;
}
/*
* There is one global cgroup mutex. We also require taking
* task_lock() when dereferencing a task's cgroup subsys pointers.
* See "The task_lock() exception", at the end of this comment.
*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
* So in general, code holding cgroup_mutex can't rely on the count
* field not changing. However, if the count goes to zero, then only
* cgroup_attach_task() can increment it again. Because a count of zero
* means that no tasks are currently attached, therefore there is no
* way a task attached to that cgroup can fork (the other way to
* increment the count). So code holding cgroup_mutex can safely
* assume that if the count is zero, it will stay zero. Similarly, if
* a task holds cgroup_mutex on a cgroup with zero count, it
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
* The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
* (usually) take cgroup_mutex. These are the two most performance
* critical pieces of code here. The exception occurs on cgroup_exit(),
* when a task in a notify_on_release cgroup exits. Then cgroup_mutex
* is taken, and if the cgroup count is zero, a usermode call made
* to the release agent with the name of the cgroup (path relative to
* the root of cgroup file system) as the argument.
*
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
* least one task in the system (init, pid == 1), therefore, top_cgroup
* always has either children cgroups and/or using tasks. So we don't
* need a special hack to ensure that top_cgroup cannot be deleted.
*
* The task_lock() exception
*
* The need for this exception arises from the action of
* cgroup_attach_task(), which overwrites one tasks cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
* task->cgroup without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
* in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task()
 */
/**
 * cgroup_lock - lock out any changes to cgroup structures
 *
 * Acquires the global cgroup_mutex; may sleep. Pair with cgroup_unlock().
 */
void cgroup_lock(void)
{
mutex_lock(&cgroup_mutex);
}
/**
 * cgroup_unlock - release lock on cgroup changes
 *
 * Undo the lock taken in a previous cgroup_lock() call by releasing
 * the global cgroup_mutex.
 */
void cgroup_unlock(void)
{
mutex_unlock(&cgroup_mutex);
}
/*
 * A couple of forward declarations required, due to cyclic reference loop:
 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
 * -> cgroup_mkdir.
 */
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
static int cgroup_populate_dir(struct cgroup *cgrp);
static const struct inode_operations cgroup_dir_inode_operations;
static struct file_operations proc_cgroupstats_operations;

/* Backing-dev info installed on every inode made by cgroup_new_inode() */
static struct backing_dev_info cgroup_backing_dev_info = {
	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};

static int alloc_css_id(struct cgroup_subsys *ss,
			struct cgroup *parent, struct cgroup *child);
/*
 * cgroup_new_inode - allocate and initialise an inode for cgroupfs
 * @mode: value stored in i_mode
 * @sb: superblock the inode is allocated on
 *
 * The new inode is owned by the caller's fsuid/fsgid, has all three
 * timestamps set to the current time and uses cgroup_backing_dev_info.
 * Returns NULL if new_inode() fails.
 */
static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
	struct inode *ino = new_inode(sb);

	if (!ino)
		return NULL;

	ino->i_mode = mode;
	ino->i_uid = current_fsuid();
	ino->i_gid = current_fsgid();
	ino->i_atime = ino->i_mtime = ino->i_ctime = CURRENT_TIME;
	ino->i_mapping->backing_dev_info = &cgroup_backing_dev_info;

	return ino;
}
/*
 * Call subsys's pre_destroy handler.
 * This is called before css refcnt check.
 *
 * Walks every subsystem attached to @cgrp's hierarchy and invokes its
 * pre_destroy() callback if it has one, stopping at the first failure.
 * Returns 0 if all callbacks succeed (or none exist), otherwise the
 * first non-zero callback return value.
 */
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int ret = 0;

	for_each_subsys(cgrp->root, ss)
		if (ss->pre_destroy) {
			ret = ss->pre_destroy(ss, cgrp);
			if (ret)
				break;
		}
	return ret;
}
/*
 * RCU callback queued by cgroup_diput(): frees the struct cgroup that
 * embeds @obj as its rcu_head member.
 */
static void free_cgroup_rcu(struct rcu_head *obj)
{
	kfree(container_of(obj, struct cgroup, rcu_head));
}
/*
 * cgroup_diput - release the inode of a cgroupfs dentry
 * @dentry: the dentry being released
 * @inode: its inode
 *
 * For a directory, this is also where the cgroup attached to
 * @dentry->d_fsdata is torn down: the subsystem state objects are
 * destroyed, the hierarchy's cgroup count is decremented, the superblock
 * reference is dropped and the cgroup is freed through RCU. The cgroup
 * must already be marked as removed. The inode reference is dropped in
 * every case.
 */
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
	/* is dentry a directory ? if so, kfree() associated cgroup */
	if (S_ISDIR(inode->i_mode)) {
		struct cgroup *cgrp = dentry->d_fsdata;
		struct cgroup_subsys *ss;

		BUG_ON(!(cgroup_is_removed(cgrp)));
		/* It's possible for external users to be holding css
		 * reference counts on a cgroup; css_put() needs to
		 * be able to access the cgroup after decrementing
		 * the reference count in order to know if it needs to
		 * queue the cgroup to be handled by the release
		 * agent */
		synchronize_rcu();

		mutex_lock(&cgroup_mutex);
		/*
		 * Release the subsystem state objects.
		 */
		for_each_subsys(cgrp->root, ss)
			ss->destroy(ss, cgrp);

		cgrp->root->number_of_cgroups--;
		mutex_unlock(&cgroup_mutex);

		/*
		 * Drop the active superblock reference that we took when we
		 * created the cgroup
		 */
		deactivate_super(cgrp->root->sb);

		call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
	}
	iput(inode);
}
/*
 * remove_dir - delete a directory dentry and rmdir it from its parent
 * @d: the directory dentry to remove
 *
 * A reference is held on the parent dentry for the duration of the call.
 */
static void remove_dir(struct dentry *d)
{
	struct dentry *dir = dget(d->d_parent);

	d_delete(d);
	simple_rmdir(dir->d_inode, d);

	dput(dir);
}
/*
 * cgroup_clear_directory - remove every file in a cgroup directory
 * @dentry: the directory; the caller must hold its inode's i_mutex
 *
 * Repeatedly takes the first child off @dentry's d_subdirs list and, if it
 * has an inode, deletes and unlinks it. Children that are themselves
 * directories trigger a BUG.
 */
static void cgroup_clear_directory(struct dentry *dentry)
{
struct list_head *node;
BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
spin_lock(&dcache_lock);
node = dentry->d_subdirs.next;
while (node != &dentry->d_subdirs) {
struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
/* Detach the child first so the loop always makes progress */
list_del_init(node);
if (d->d_inode) {
/* This should never be called on a cgroup
 * directory with child cgroups */
BUG_ON(d->d_inode->i_mode & S_IFDIR);
/*
 * Pin the child, then drop dcache_lock around the
 * delete/unlink calls; the lock is retaken before the
 * list is looked at again.
 */
d = dget_locked(d);
spin_unlock(&dcache_lock);
d_delete(d);
simple_unlink(dentry->d_inode, d);
dput(d);
spin_lock(&dcache_lock);
}
/* Restart from the head: the list may have changed while unlocked */
node = dentry->d_subdirs.next;
}
spin_unlock(&dcache_lock);
}
/*
 * cgroup_d_remove_dir - remove a cgroup directory and the files in it
 * @dentry: the directory to remove
 *
 * Clears the directory's files, unhooks @dentry from its parent's child
 * list under dcache_lock, then removes the directory itself.
 *
 * NOTE : the dentry must have been dget()'ed
 */
static void cgroup_d_remove_dir(struct dentry *dentry)
{
cgroup_clear_directory(dentry);
spin_lock(&dcache_lock);
list_del_init(&dentry->d_u.d_child);
spin_unlock(&dcache_lock);
remove_dir(dentry);
}
/*
 * A queue for waiters doing rmdir() on a cgroup. A task will sleep when
 * cgroup->count == 0 && list_empty(&cgroup->children) && a subsystem still
 * holds a reference on css->refcnt. In general, this refcnt is expected to
 * go down to zero soon.
 *
 * The CGRP_WAIT_ON_RMDIR flag is set under the cgroup's inode->i_mutex.
 */
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}
/*
 * cgroup_exclude_rmdir - take a reference on @css via css_get().
 * Pair with cgroup_release_and_wakeup_rmdir(), which drops it again.
 */
void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
{
css_get(css);
}
/*
 * cgroup_release_and_wakeup_rmdir - wake any rmdir waiter on @css's cgroup,
 * then drop the reference on @css with css_put().
 */
void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
{
cgroup_wakeup_rmdir_waiter(css->cgroup);
css_put(css);
}
/*
 * rebind_subsystems - change the set of subsystems bound to a hierarchy
 * @root: the hierarchy to update
 * @final_bits: bitmask of the subsystems that should be attached afterwards
 *	(bit i corresponds to subsys[i])
 *
 * Subsystems being added are moved from the dummy hierarchy to @root;
 * subsystems being removed are handed back to the dummy hierarchy. Each
 * move happens under the subsystem's hierarchy_mutex and calls the
 * subsystem's optional bind() callback.
 *
 * Returns 0 on success, or -EBUSY if a subsystem to be added is already
 * bound to another hierarchy or if @root has child cgroups. Nothing is
 * changed on failure.
 */
static int rebind_subsystems(struct cgroupfs_root *root,
			      unsigned long final_bits)
{
	unsigned long added_bits, removed_bits;
	struct cgroup *cgrp = &root->top_cgroup;
	int i;

	removed_bits = root->actual_subsys_bits & ~final_bits;
	added_bits = final_bits & ~root->actual_subsys_bits;
	/* Check that any added subsystems are currently free */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		unsigned long bit = 1UL << i;
		struct cgroup_subsys *ss = subsys[i];

		if (!(bit & added_bits))
			continue;
		if (ss->root != &rootnode) {
			/* Subsystem isn't free */
			return -EBUSY;
		}
	}

	/* Currently we don't handle adding/removing subsystems when
	 * any child cgroups exist. This is theoretically supportable
	 * but involves complex error handling, so it's being left until
	 * later */
	if (root->number_of_cgroups > 1)
		return -EBUSY;

	/* Process each subsystem */
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		unsigned long bit = 1UL << i;

		if (bit & added_bits) {
			/* We're binding this subsystem to this hierarchy */
			BUG_ON(cgrp->subsys[i]);
			BUG_ON(!dummytop->subsys[i]);
			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
			mutex_lock(&ss->hierarchy_mutex);
			cgrp->subsys[i] = dummytop->subsys[i];
			cgrp->subsys[i]->cgroup = cgrp;
			list_move(&ss->sibling, &root->subsys_list);
			ss->root = root;
			/* bind() is optional, as on the removal path below */
			if (ss->bind)
				ss->bind(ss, cgrp);
			mutex_unlock(&ss->hierarchy_mutex);
		} else if (bit & removed_bits) {
			/* We're removing this subsystem */
			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
			mutex_lock(&ss->hierarchy_mutex);
			if (ss->bind)
				ss->bind(ss, dummytop);
			dummytop->subsys[i]->cgroup = dummytop;
			cgrp->subsys[i] = NULL;
			subsys[i]->root = &rootnode;
			list_move(&ss->sibling, &rootnode.subsys_list);
			mutex_unlock(&ss->hierarchy_mutex);
		} else if (bit & final_bits) {
			/* Subsystem state should already exist */
			BUG_ON(!cgrp->subsys[i]);
		} else {
			/* Subsystem state shouldn't exist */
			BUG_ON(cgrp->subsys[i]);
		}
	}
	root->subsys_bits = root->actual_subsys_bits = final_bits;
	synchronize_rcu();

	return 0;
}
/*
 * cgroup_show_options - print the mount options of a cgroup hierarchy
 * @seq: seq_file to print into
 * @vfs: the mount whose superblock carries the hierarchy
 *
 * Under cgroup_mutex, emits ",<subsys>" for each attached subsystem,
 * then ",noprefix", ",release_agent=<path>" and ",name=<name>" where
 * they apply. Always returns 0.
 */
static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
{
	struct cgroupfs_root *hier = vfs->mnt_sb->s_fs_info;
	struct cgroup_subsys *cur;

	mutex_lock(&cgroup_mutex);

	for_each_subsys(hier, cur)
		seq_printf(seq, ",%s", cur->name);

	if (test_bit(ROOT_NOPREFIX, &hier->flags))
		seq_puts(seq, ",noprefix");

	/* A non-empty string is one whose first byte is not NUL */
	if (hier->release_agent_path[0] != '\0')
		seq_printf(seq, ",release_agent=%s", hier->release_agent_path);

	if (hier->name[0] != '\0')
		seq_printf(seq, ",name=%s", hier->name);

	mutex_unlock(&cgroup_mutex);

	return 0;
}
struct cgroup_sb_opts {
unsigned long subsys_bits;
unsigned long flags;
char *release_agent;
char *name;
struct cgroupfs_root *new_root;
};
/*
 * parse_cgroupfs_options - convert a hierarchy specifier into a bitmask
 * of subsystems and flags
 * @data: comma-separated option string, or NULL, which is treated as "all"
 * @opts: result; zeroed first, then filled in
 *
 * Recognised tokens are "all", "noprefix", "release_agent=<path>",
 * "name=<name>" and the name of any subsystem. A disabled subsystem named
 * explicitly is accepted but not added to the mask.
 *
 * Returns 0 on success, or:
 *   -EINVAL  empty token, repeated release_agent=/name=, empty or
 *            malformed name, "noprefix" combined with a non-cpuset
 *            subsystem, or neither subsystems nor a name given
 *   -ENOMEM  string duplication failed
 *   -ENOENT  token matches no subsystem
 *
 * On error opts->release_agent and opts->name may already hold allocated
 * strings; they are not freed here.
 */
static int parse_cgroupfs_options(char *data,
				  struct cgroup_sb_opts *opts)
{
	char *token, *o = data ?: "all";
	unsigned long mask = (unsigned long)-1;

#ifdef CONFIG_CPUSETS
	mask = ~(1UL << cpuset_subsys_id);
#endif

	memset(opts, 0, sizeof(*opts));

	while ((token = strsep(&o, ",")) != NULL) {
		if (!*token)
			return -EINVAL;
		if (!strcmp(token, "all")) {
			/* Add all non-disabled subsystems */
			int i;

			opts->subsys_bits = 0;
			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				struct cgroup_subsys *ss = subsys[i];

				if (!ss->disabled)
					opts->subsys_bits |= 1ul << i;
			}
		} else if (!strcmp(token, "noprefix")) {
			set_bit(ROOT_NOPREFIX, &opts->flags);
		} else if (!strncmp(token, "release_agent=", 14)) {
			/* Specifying two release agents is forbidden */
			if (opts->release_agent)
				return -EINVAL;
			/*
			 * Cap at PATH_MAX - 1 characters so the string,
			 * including its NUL, fits the PATH_MAX-byte
			 * release_agent_path buffer it is copied into.
			 */
			opts->release_agent =
				kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
			if (!opts->release_agent)
				return -ENOMEM;
		} else if (!strncmp(token, "name=", 5)) {
			int i;
			const char *name = token + 5;

			/* Can't specify an empty name */
			if (!*name)
				return -EINVAL;
			/* Must match [\w.-]+ */
			for (i = 0; name[i] != '\0'; i++) {
				char c = name[i];

				if (isalnum(c))
					continue;
				if ((c == '.') || (c == '-') || (c == '_'))
					continue;
				return -EINVAL;
			}
			/* Specifying two names is forbidden */
			if (opts->name)
				return -EINVAL;
			/*
			 * Leave room for the NUL within the
			 * MAX_CGROUP_ROOT_NAMELEN-byte name buffer.
			 */
			opts->name = kstrndup(name,
					      MAX_CGROUP_ROOT_NAMELEN - 1,
					      GFP_KERNEL);
			if (!opts->name)
				return -ENOMEM;
		} else {
			struct cgroup_subsys *ss;
			int i;

			for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
				ss = subsys[i];
				if (!strcmp(token, ss->name)) {
					if (!ss->disabled)
						set_bit(i, &opts->subsys_bits);
					break;
				}
			}
			if (i == CGROUP_SUBSYS_COUNT)
				return -ENOENT;
		}
	}

	/*
	 * Option noprefix was introduced just for backward compatibility
	 * with the old cpuset, so we allow noprefix only if mounting just
	 * the cpuset subsystem.
	 */
	if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
	    (opts->subsys_bits & mask))
		return -EINVAL;

	/* We can't have an empty hierarchy */
	if (!opts->subsys_bits && !opts->name)
		return -EINVAL;

	return 0;
}
static int cgroup_remount(struct super_block *sb, int *flags, char *data)
{
int ret = 0;
struct cgroupfs_root *root = sb->s_fs_info;
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_sb_opts opts;
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
/* Don't allow flags to change at remount */
if (opts.flags != root->flags) {
ret = -EINVAL;
goto out_unlock;
}
/* Don't allow name to change at remount */
if (opts.name && strcmp(opts.name, root->name)) {
ret = -EINVAL;
goto out_unlock;
}
ret = rebind_subsystems(root, opts.subsys_bits);
if (ret)
goto out_unlock;
/* (re)populate subsystem files */
cgroup_populate_dir(cgrp);
if (opts.release_agent)
strcpy(root->release_agent_path, opts.release_agent);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);