Newer
Older
}
/*
 * Gets called on POLLHUP on eventfd when user closes it.
 *
 * Called with wqh->lock held and interrupts disabled.
 *
 * @wait: wait queue entry embedded in the cgroup_event being woken
 * @mode: unused
 * @sync: unused
 * @key:  poll event mask, passed by value inside the pointer
 *
 * Always returns 0.
 */
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
			     int sync, void *key)
{
	struct cgroup_event *event = container_of(wait,
			struct cgroup_event, wait);
	struct cgroup *cgrp = event->css->cgroup;
	unsigned long flags = (unsigned long)key;

	if (flags & POLLHUP) {
		/*
		 * If the event has been detached at cgroup removal, we
		 * can simply return knowing the other side will cleanup
		 * for us.
		 *
		 * We can't race against event freeing since the other
		 * side will require wqh->lock via remove_wait_queue(),
		 * which we hold.
		 */
		spin_lock(&cgrp->event_list_lock);
		if (!list_empty(&event->list)) {
			list_del_init(&event->list);
			/*
			 * We are in atomic context, but cgroup_event_remove()
			 * may sleep, so we have to call it in workqueue.
			 */
			schedule_work(&event->remove);
		}
		spin_unlock(&cgrp->event_list_lock);
	}

	return 0;
}
/*
 * poll_table queueing callback: remember which wait queue head the
 * event was attached to and hook the event's wait entry onto it.
 */
static void cgroup_event_ptable_queue_proc(struct file *file,
		wait_queue_head_t *wqh, poll_table *pt)
{
	struct cgroup_event *ev;

	ev = container_of(pt, struct cgroup_event, pt);

	ev->wqh = wqh;
	add_wait_queue(wqh, &ev->wait);
}
/*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int cgroup_write_event_control(struct cgroup_subsys_state *css,
struct cftype *cft, const char *buffer)
struct cgroup *cgrp = css->cgroup;
struct cgroup_event *event;
struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile;
struct file *cfile;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
Tejun Heo
committed
event->css = css;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);
efile = eventfd_fget(efd);
if (IS_ERR(efile)) {
ret = PTR_ERR(efile);
goto out_kfree;
}
event->eventfd = eventfd_ctx_fileget(efile);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
cfile = fget(cfd);
if (!cfile) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto out_put_cfile;
event->cft = __file_cft(cfile);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto out_put_cfile;
}
/*
* The file to be monitored must be in the same cgroup as
* cgroup.event_control is.
*/
cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
if (cgrp_cfile != cgrp) {
ret = -EINVAL;
goto out_put_cfile;
}
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto out_put_cfile;
}
Tejun Heo
committed
ret = event->cft->register_event(css, event->cft,
event->eventfd, buffer);
if (ret)
goto out_put_cfile;
efile->f_op->poll(efile, &event->pt);
/*
* Events should be removed after rmdir of cgroup directory, but before
* destroying subsystem state objects. Let's take reference to cgroup
* directory dentry to do that.
*/
dget(cgrp->dentry);
spin_lock(&cgrp->event_list_lock);
list_add(&event->list, &cgrp->event_list);
spin_unlock(&cgrp->event_list_lock);
fput(cfile);
fput(efile);
return 0;
out_put_cfile:
fput(cfile);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
out_put_efile:
fput(efile);
out_kfree:
kfree(event);
return ret;
}
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
/*
 * Write handler for "cgroup.clone_children": a nonzero @val sets the
 * CGRP_CPUSET_CLONE_CHILDREN flag on @css's cgroup, zero clears it.
 * @cft is unused.  Always returns 0.
 */
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
				       struct cftype *cft, u64 val)
{
	if (val)
		set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	else
		clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
	return 0;
}
static struct cftype cgroup_base_files[] = {
.name = "cgroup.procs",
Ben Blum
committed
.open = cgroup_procs_open,
Ben Blum
committed
.release = cgroup_pidlist_release,
Ben Blum
committed
},
.name = "cgroup.event_control",
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_sane_behavior_show,
},
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
* don't exist if sane_behavior. If you're depending on these, be
* prepared to be burned.
*/
{
.name = "tasks",
.flags = CFTYPE_INSANE, /* use "procs" instead */
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
.release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
.flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
},
Tejun Heo
committed
* cgroup_populate_dir - create subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be added
*
* On failure, no file is added.
Tejun Heo
committed
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
Tejun Heo
committed
int i, ret = 0;
/* process cftsets of each subsystem */
Tejun Heo
committed
for_each_subsys(ss, i) {
Tejun Heo
committed
if (!test_bit(i, &subsys_mask))
list_for_each_entry(set, &ss->cftsets, node) {
ret = cgroup_addrm_files(cgrp, set->cfts, true);
if (ret < 0)
goto err;
}
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
struct css_id *id = rcu_dereference_protected(css->id, true);
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferenced
* from RCU-read-side without locks.
*/
if (id)
rcu_assign_pointer(id->css, css);
err:
cgroup_clear_dir(cgrp, subsys_mask);
return ret;
static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
if (css->parent)
css_put(css->parent);
/*
 * percpu_ref release callback for a css: defers the final teardown to
 * @css->destroy_work so that it runs from a work item.
 */
static void css_release(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *css =
		container_of(ref, struct cgroup_subsys_state, refcnt);

	/*
	 * css holds an extra ref to @cgrp->dentry which is put on the last
	 * css_put().  dput() requires process context, which css_put() may
	 * be called without.  @css->destroy_work will be used to invoke
	 * dput() asynchronously from css_put().
	 */
	INIT_WORK(&css->destroy_work, css_free_work_fn);
	schedule_work(&css->destroy_work);
}
static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss,
struct cgroup *cgrp)
css->cgroup = cgrp;
if (cgrp->parent)
css->parent = cgroup_css(cgrp->parent, ss->subsys_id);
else
css->flags |= CSS_ROOT;
rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css);
/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
lockdep_assert_held(&cgroup_mutex);
Tejun Heo
committed
if (ss->css_online)
ret = ss->css_online(css);
css->flags |= CSS_ONLINE;
/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
lockdep_assert_held(&cgroup_mutex);
if (!(css->flags & CSS_ONLINE))
return;
ss->css_offline(css);
css->flags &= ~CSS_ONLINE;
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
* @dentry: dentry of the new cgroup
* @mode: mode to set on new inode
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
int err = 0;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
name = cgroup_alloc_name(dentry);
if (!name)
goto err_free_cgrp;
rcu_assign_pointer(cgrp->name, name);
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
/*
* Only live parents can have children. Note that the liveliness
* check isn't strictly necessary because cgroup_mkdir() and
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
* anyway so that locking is contained inside cgroup proper and we
* don't get nasty surprises if we ever grow another caller.
*/
if (!cgroup_lock_live_group(parent)) {
err = -ENODEV;
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* disappear while someone has an open control file on the
* fs */
atomic_inc(&sb->s_active);
init_cgroup_housekeeping(cgrp);
dentry->d_fsdata = cgrp;
cgrp->dentry = dentry;
cgrp->parent = parent;
cgrp->dummy_css.parent = &parent->dummy_css;
cgrp->root = parent->root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
for_each_root_subsys(root, ss) {
Tejun Heo
committed
struct cgroup_subsys_state *css;
css = ss->css_alloc(cgroup_css(parent, ss->subsys_id));
if (IS_ERR(css)) {
err = PTR_ERR(css);
err = percpu_ref_init(&css->refcnt, css_release);
ss->css_free(css);
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
/*
* Create directory. cgroup_create_file() returns with the new
* directory locked on success so that it can be populated without
* dropping cgroup_mutex.
*/
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
lockdep_assert_held(&dentry->d_inode->i_mutex);
cgrp->serial_nr = cgroup_serial_nr_next++;
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
/* each css holds a ref to the cgroup's dentry and the parent css */
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
percpu_ref_get(&css->parent->refcnt);
}
/* hold a ref to the parent's dentry */
dget(parent->dentry);
/* creation succeeded, notify subsystems */
for_each_root_subsys(root, ss) {
err = online_css(ss, cgrp);
if (err)
goto err_destroy;
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
ss->warned_broken_hierarchy = true;
}
idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
Tejun Heo
committed
if (err)
goto err_destroy;
err = cgroup_populate_dir(cgrp, root->subsys_mask);
if (err)
goto err_destroy;
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
if (css) {
percpu_ref_cancel_init(&css->refcnt);
ss->css_free(css);
}
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
idr_remove(&root->cgroup_idr, cgrp->id);
err_free_name:
kfree(rcu_dereference_raw(cgrp->name));
kfree(cgrp);
err_destroy:
cgroup_destroy_locked(cgrp);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&dentry->d_inode->i_mutex);
return err;
/*
 * mkdir handler: create a new cgroup for @dentry underneath the cgroup
 * that owns the parent directory.  @dir is unused.
 */
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	struct dentry *parent_dentry = dentry->d_parent;
	struct cgroup *parent_cgrp = parent_dentry->d_fsdata;

	/* the vfs holds inode->i_mutex already */
	return cgroup_create(parent_cgrp, dentry, mode | S_IFDIR);
}
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
/*
 * Drop one count from @cgrp->css_kill_cnt.  Whoever brings it to zero
 * queues cgroup_offline_fn() on @cgrp->destroy_work.
 */
static void cgroup_css_killed(struct cgroup *cgrp)
{
	if (atomic_dec_and_test(&cgrp->css_kill_cnt)) {
		/* percpu ref's of all css's are killed, kick off the next step */
		INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
		schedule_work(&cgrp->destroy_work);
	}
}
/*
 * Kill-confirmation callback for a css's percpu ref: forwards the
 * notification to the cgroup that owns the css.
 */
static void css_ref_killed_fn(struct percpu_ref *ref)
{
	struct cgroup_subsys_state *killed;

	killed = container_of(ref, struct cgroup_subsys_state, refcnt);
	cgroup_css_killed(killed->cgroup);
}
/**
* cgroup_destroy_locked - the first stage of cgroup destruction
* @cgrp: cgroup to be destroyed
*
* css's make use of percpu refcnts whose killing latency shouldn't be
* exposed to userland and are RCU protected. Also, cgroup core needs to
* guarantee that css_tryget() won't succeed by the time ->css_offline() is
* invoked. To satisfy all the requirements, destruction is implemented in
* the following two steps.
*
* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
* userland visible parts and start killing the percpu refcnts of
* css's. Set up so that the next stage will be kicked off once all
* the percpu refcnts are confirmed to be killed.
*
* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
* rest of destruction. Once all cgroup references are gone, the
* cgroup is RCU-freed.
*
* This function implements s1. After this step, @cgrp is gone as far as
* the userland is concerned and a new cgroup with the same name may be
* created. As cgroup doesn't care about the names internally, this
* doesn't cause any problem.
*/
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
struct dentry *d = cgrp->dentry;
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
* css_set_lock synchronizes access to ->cset_links and prevents
* @cgrp from being removed while __put_css_set() is in progress.
*/
read_lock(&css_set_lock);
empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
read_unlock(&css_set_lock);
if (!empty)
* Block new css_tryget() by killing css refcnts. cgroup core
* guarantees that, by the time ->css_offline() is invoked, no new
* css reference will be given out via css_tryget(). We can't
* simply call percpu_ref_kill() and proceed to offlining css's
* because percpu_ref_kill() doesn't guarantee that the ref is seen
* as killed on all CPUs on return.
*
* Use percpu_ref_kill_and_confirm() to get notifications as each
* css is confirmed to be seen as killed on all CPUs. The
* notification callback keeps track of the number of css's to be
* killed and schedules cgroup_offline_fn() to perform the rest of
* destruction once the percpu refs of all css's are confirmed to
* be killed.
atomic_set(&cgrp->css_kill_cnt, 1);
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
/*
* Killing would put the base ref, but we need to keep it
* alive until after ->css_offline.
*/
percpu_ref_get(&css->refcnt);
atomic_inc(&cgrp->css_kill_cnt);
percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
cgroup_css_killed(cgrp);
/*
* Mark @cgrp dead. This prevents further task migration and child
* creation by disabling cgroup_lock_live_group(). Note that
Tejun Heo
committed
* CGRP_DEAD assertion is depended upon by css_next_child() to
* resume iteration after dropping RCU read lock. See
Tejun Heo
committed
* css_next_child() for details.
/* CGRP_DEAD is set, remove from ->release_list for the last time */
raw_spin_lock(&release_list_lock);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/*
* Clear and remove @cgrp directory. The removal puts the base ref
* but we aren't quite done with @cgrp yet, so hold onto it.
Tejun Heo
committed
cgroup_clear_dir(cgrp, cgrp->root->subsys_mask);
cgroup_addrm_files(cgrp, cgroup_base_files, false);
dget(d);
cgroup_d_remove_dir(d);
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
spin_lock(&cgrp->event_list_lock);
list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
/**
* cgroup_offline_fn - the second step of cgroup destruction
* @work: cgroup->destroy_work
*
* This function is invoked from a work item for a cgroup which is being
* destroyed after the percpu refcnts of all css's are guaranteed to be
* seen as killed on all CPUs, and performs the rest of destruction. This
* is the second step of destruction described in the comment above
* cgroup_destroy_locked().
*/
static void cgroup_offline_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
struct cgroup *parent = cgrp->parent;
struct dentry *d = cgrp->dentry;
struct cgroup_subsys *ss;
mutex_lock(&cgroup_mutex);
/*
* css_tryget() is guaranteed to fail now. Tell subsystems to
* initiate destruction.
*/
for_each_root_subsys(cgrp->root, ss)
offline_css(ss, cgrp);
* Put the css refs from cgroup_destroy_locked(). Each css holds
* an extra reference to the cgroup's dentry and cgroup removal
* proceeds regardless of css refs. On the last put of each css,
* whenever that may be, the extra dentry ref is put so that dentry
* destruction happens only after all css's are released.
for_each_root_subsys(cgrp->root, ss)
/* delete this cgroup from parent->children */
/*
* We should remove the cgroup object from idr before its grace
* period starts, so we won't be looking up a cgroup while the
* cgroup is being freed.
*/
idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
mutex_unlock(&cgroup_mutex);
/*
 * rmdir handler: run cgroup_destroy_locked() on the cgroup attached to
 * @dentry while holding cgroup_mutex, and hand back its result.
 * @unused_dir is ignored.
 */
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	int rc;

	mutex_lock(&cgroup_mutex);
	rc = cgroup_destroy_locked(dentry->d_fsdata);
	mutex_unlock(&cgroup_mutex);

	return rc;
}
/*
 * Initialise @ss->cftsets and, when the subsystem supplies base cftypes,
 * tag each of them with @ss and link the embedded base cftset into the
 * list.
 */
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
	struct cftype *entry;

	INIT_LIST_HEAD(&ss->cftsets);

	if (!ss->base_cftypes)
		return;

	/*
	 * base_cftset is embedded in subsys itself, no need to worry about
	 * deregistration.
	 */
	entry = ss->base_cftypes;
	while (entry->name[0] != '\0') {
		entry->ss = ss;
		entry++;
	}

	ss->base_cftset.cfts = ss->base_cftypes;
	list_add_tail(&ss->base_cftset.node, &ss->cftsets);
}
/*
 * Register subsystem @ss on the dummy root: set up its cftsets, allocate
 * and initialise its css for cgroup_dummy_top, publish that css in
 * init_css_set and bring it online.  All of this runs under cgroup_mutex;
 * any failure is fatal (BUG_ON).
 */
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
/* init base cftset */
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_cgroup_css(css, ss, cgroup_dummy_top);
/* Update the init_css_set to contain a subsys
 * pointer to this state - since the subsystem is
 * newly registered, all tasks and hence the
 * init_css_set is in the subsystem's top cgroup. */
init_css_set.subsys[ss->subsys_id] = css;
/* remember whether any subsystem has a fork or exit callback */
need_forkexit_callback |= ss->fork || ss->exit;
/* At system boot, before all subsystems have been
 * registered, no tasks have been forked, so we don't
 * need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
/* a non-zero return from online_css() is fatal here */
BUG_ON(online_css(ss, cgroup_dummy_top));
mutex_unlock(&cgroup_mutex);
/* this function shouldn't be used with modular subsystems, since they
 * need to register a subsys_id, among other things */
BUG_ON(ss->module);
}
/**
* cgroup_load_subsys: load and register a modular subsystem at runtime
* @ss: the subsystem to load
*
* This function should be called in a modular subsystem's initcall. If the
* subsystem is built as a module, it will be assigned a new subsys_id and set
* up for use. If the subsystem is built-in anyway, work is delegated to the
* simpler cgroup_init_subsys.
*/
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
struct css_set *cset;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
Tejun Heo
committed
ss->css_alloc == NULL || ss->css_free == NULL)
return -EINVAL;
/*
* we don't support callbacks in modular subsystems. this check is
* before the ss->module check for consistency; a subsystem that could
* be a module should still have no callbacks even if the user isn't
* compiling it as one.
*/
if (ss->fork || ss->exit)
return -EINVAL;
/*
* an optionally modular subsystem is built-in: we want to do nothing,
* since cgroup_init_subsys will have already taken care of it.
*/
if (ss->module == NULL) {
BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
/* init base cftset */
cgroup_init_cftsets(ss);
cgroup_subsys[ss->subsys_id] = ss;
Tejun Heo
committed
* no ss->css_alloc seems to need anything important in the ss
* struct, so this can happen first (i.e. before the dummy root
Tejun Heo
committed
* attachment).
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id));
/* failure case - need to deassign the cgroup_subsys[] slot. */
cgroup_subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
return PTR_ERR(css);
}
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
/* our new subsystem will be attached to the dummy hierarchy. */
init_cgroup_css(css, ss, cgroup_dummy_top);
/* init_idr must be after init_cgroup_css because it sets css->id. */
if (ss->use_id) {
ret = cgroup_init_idr(ss, css);
if (ret)
goto err_unload;
}
/*
* Now we need to entangle the css into the existing css_sets. unlike
* in cgroup_init_subsys, there are now multiple css_sets, so each one
* will need a new pointer to it; done by iterating the css_set_table.
* furthermore, modifying the existing css_sets will corrupt the hash
* table state, so each changed css_set will need its hash recomputed.
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
/* skip entries that we already rehashed */
if (cset->subsys[ss->subsys_id])
continue;
/* remove existing entry */
hash_del(&cset->hlist);
cset->subsys[ss->subsys_id] = css;
/* recompute hash and restore entry */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
}
write_unlock(&css_set_lock);
ret = online_css(ss, cgroup_dummy_top);
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
err_unload:
mutex_unlock(&cgroup_mutex);
/* @ss can't be mounted here as try_module_get() would fail */
cgroup_unload_subsys(ss);
return ret;
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
/**
* cgroup_unload_subsys: unload a modular subsystem
* @ss: the subsystem to unload
*
* This function should be called in a modular subsystem's exitcall. When this
* function is invoked, the refcount on the subsystem's module will be 0, so
* the subsystem will not be attached to any hierarchy.
*/
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cgrp_cset_link *link;
BUG_ON(ss->module == NULL);
/*
* we shouldn't be called if the subsystem is in use, and the use of
* try_module_get() in rebind_subsystems() should ensure that it
* doesn't start being used while we're killing it off.
*/
BUG_ON(ss->root != &cgroup_dummy_root);
Tejun Heo
committed
offline_css(ss, cgroup_dummy_top);
Tejun Heo
committed
Tejun Heo
committed
idr_destroy(&ss->idr);
cgroup_subsys[ss->subsys_id] = NULL;
/* remove subsystem from the dummy root's list of subsystems */
list_del_init(&ss->sibling);
* disentangle the css from all css_sets attached to the dummy
* top. as in loading, we need to pay our respects to the hashtable
* gods.
list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
struct css_set *cset = link->cset;
hash_del(&cset->hlist);
cset->subsys[ss->subsys_id] = NULL;
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
}
write_unlock(&css_set_lock);
/*
* remove subsystem's css from the cgroup_dummy_top and free it -
* need to free before marking as null because ss->css_free needs
* the cgrp->subsys pointer to find their state. note that this
* also takes care of freeing the css_id.
ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id));
RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
* cgroup_init_early - cgroup initialization at system boot
*
* Initialize cgroups at system boot, and initialize any
* subsystems that request early init.
*/
int __init cgroup_init_early(void)
{
atomic_set(&init_css_set.refcount, 1);
INIT_LIST_HEAD(&init_css_set.cgrp_links);
INIT_LIST_HEAD(&init_css_set.tasks);
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&cgroup_dummy_root);
cgroup_root_count = 1;
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
init_cgrp_cset_link.cset = &init_css_set;