}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct cgroup_event *event = container_of(wait,
struct cgroup_event, wait);
struct cgroup *cgrp = event->css->cgroup;
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
/*
* If the event has been detached at cgroup removal, we
* can simply return knowing the other side will cleanup
* for us.
*
* We can't race against event freeing since the other
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
*/
spin_lock(&cgrp->event_list_lock);
if (!list_empty(&event->list)) {
list_del_init(&event->list);
/*
* We are in atomic context, but cgroup_event_remove()
* may sleep, so we have to call it in workqueue.
*/
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
}
return 0;
}
static void cgroup_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct cgroup_event *event = container_of(pt,
struct cgroup_event, pt);
event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
}
/*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int cgroup_write_event_control(struct cgroup_subsys_state *css,
struct cftype *cft, const char *buffer)
{
struct cgroup *cgrp = css->cgroup;
struct cgroup_event *event;
struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile;
struct file *cfile;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
event->css = css;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);
efile = eventfd_fget(efd);
if (IS_ERR(efile)) {
ret = PTR_ERR(efile);
goto out_kfree;
}
event->eventfd = eventfd_ctx_fileget(efile);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
cfile = fget(cfd);
if (!cfile) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process needs read permission on the control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto out_put_cfile;
event->cft = __file_cft(cfile);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto out_put_cfile;
}
/*
* The file to be monitored must be in the same cgroup as
* cgroup.event_control is.
*/
cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
if (cgrp_cfile != cgrp) {
ret = -EINVAL;
goto out_put_cfile;
}
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto out_put_cfile;
}
ret = event->cft->register_event(css, event->cft,
event->eventfd, buffer);
if (ret)
goto out_put_cfile;
efile->f_op->poll(efile, &event->pt);
/*
* Events should be removed after rmdir of cgroup directory, but before
* destroying subsystem state objects. Let's take reference to cgroup
* directory dentry to do that.
*/
dget(cgrp->dentry);
spin_lock(&cgrp->event_list_lock);
list_add(&event->list, &cgrp->event_list);
spin_unlock(&cgrp->event_list_lock);
fput(cfile);
fput(efile);
return 0;
out_put_cfile:
fput(cfile);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
out_put_efile:
fput(efile);
out_kfree:
kfree(event);
return ret;
}
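/*
 * Illustration only (not part of this file): a minimal userspace sketch of
 * the "<event_fd> <control_fd> <args>" protocol parsed above. The cgroup
 * path, the monitored control file and the <args> value are assumptions for
 * the example; how <args> is interpreted is up to the control file that is
 * being monitored.
 */
#if 0	/* example, excluded from the build */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

static void wait_for_cgroup_event(void)
{
	const char *dir = "/sys/fs/cgroup/memory/mygrp";	/* assumed path */
	char path[256], buf[64];
	uint64_t ticks;
	int efd, cfd, ecfd;

	efd = eventfd(0, 0);					/* <event_fd> */

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", dir);
	cfd = open(path, O_RDONLY);				/* <control_fd> */

	snprintf(path, sizeof(path), "%s/cgroup.event_control", dir);
	ecfd = open(path, O_WRONLY);

	/* register: "<event_fd> <control_fd> <args>" */
	snprintf(buf, sizeof(buf), "%d %d %s", efd, cfd, "4194304");
	write(ecfd, buf, strlen(buf));

	/* blocks until the control file's handler signals the eventfd */
	read(efd, &ticks, sizeof(ticks));
}
#endif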
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
if (val)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
else
clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
return 0;
}
static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
.open = cgroup_procs_open,
.release = cgroup_pidlist_release,
},
{
.name = "cgroup.event_control",
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_sane_behavior_show,
},
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
* don't exist if sane_behavior. If you're depending on these, be
* prepared to be burned.
*/
{
.name = "tasks",
.flags = CFTYPE_INSANE, /* use "procs" instead */
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
.release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
.flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
},
{ }	/* terminate */
};
/**
* cgroup_populate_dir - create subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be added
*
* On failure, no file is added.
*/
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
struct cftype_set *set;
int i, ret = 0;
/* process cftsets of each subsystem */
for_each_subsys(ss, i) {
if (!test_bit(i, &subsys_mask))
continue;
list_for_each_entry(set, &ss->cftsets, node) {
ret = cgroup_addrm_files(cgrp, set->cfts, true);
if (ret < 0)
goto err;
}
}
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
struct css_id *id = rcu_dereference_protected(css->id, true);
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferenced
* from RCU-read-side without locks.
*/
if (id)
rcu_assign_pointer(id->css, css);
}
return 0;
err:
cgroup_clear_dir(cgrp, subsys_mask);
return ret;
}
static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
if (css->parent)
css_put(css->parent);
}
static void css_release(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
/*
* css holds an extra ref to @cgrp->dentry which is put on the last
* css_put(). dput() requires process context, which css_put() may
* be called without. @css->destroy_work will be used to invoke
* dput() asynchronously from css_put().
*/
INIT_WORK(&css->destroy_work, css_free_work_fn);
schedule_work(&css->destroy_work);
}
static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
struct cgroup *cgrp)
{
css->cgroup = cgrp;
if (cgrp->parent)
css->parent = cgroup_css(cgrp->parent, ss->subsys_id);
else
css->flags |= CSS_ROOT;
}
/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
int ret = 0;
lockdep_assert_held(&cgroup_mutex);
if (ss->css_online)
ret = ss->css_online(css);
if (!ret) {
css->flags |= CSS_ONLINE;
rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
}
return ret;
}
/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
lockdep_assert_held(&cgroup_mutex);
if (!(css->flags & CSS_ONLINE))
return;
if (ss->css_offline)
ss->css_offline(css);
css->flags &= ~CSS_ONLINE;
}
/**
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
* @dentry: dentry of the new cgroup
* @mode: mode to set on new inode
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
umode_t mode)
{
struct cgroup_name *name;
struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
int err = 0;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
return -ENOMEM;
name = cgroup_alloc_name(dentry);
if (!name)
goto err_free_cgrp;
rcu_assign_pointer(cgrp->name, name);
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
if (cgrp->id < 0)
goto err_free_name;
/*
* Only live parents can have children. Note that the liveliness
* check isn't strictly necessary because cgroup_mkdir() and
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
* anyway so that locking is contained inside cgroup proper and we
* don't get nasty surprises if we ever grow another caller.
*/
if (!cgroup_lock_live_group(parent)) {
err = -ENODEV;
goto err_free_id;
}
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* disappear while someone has an open control file on the
* fs */
atomic_inc(&sb->s_active);
init_cgroup_housekeeping(cgrp);
dentry->d_fsdata = cgrp;
cgrp->dentry = dentry;
cgrp->parent = parent;
cgrp->dummy_css.parent = &parent->dummy_css;
cgrp->root = parent->root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css;
css = ss->css_alloc(cgroup_css(parent, ss->subsys_id));
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_free_all;
}
css_ar[ss->subsys_id] = css;
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
goto err_free_all;
}
/*
* Create directory. cgroup_create_file() returns with the new
* directory locked on success so that it can be populated without
* dropping cgroup_mutex.
*/
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
if (err < 0)
goto err_free_all;
lockdep_assert_held(&dentry->d_inode->i_mutex);
cgrp->serial_nr = cgroup_serial_nr_next++;
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
/* each css holds a ref to the cgroup's dentry and the parent css */
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
percpu_ref_get(&css->parent->refcnt);
}
/* hold a ref to the parent's dentry */
dget(parent->dentry);
/* creation succeeded, notify subsystems */
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
ss->warned_broken_hierarchy = true;
}
}
idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
if (err)
goto err_destroy;
err = cgroup_populate_dir(cgrp, root->subsys_mask);
if (err)
goto err_destroy;
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
err_free_all:
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
if (css) {
percpu_ref_cancel_init(&css->refcnt);
ss->css_free(css);
}
}
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
err_free_id:
idr_remove(&root->cgroup_idr, cgrp->id);
err_free_name:
kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp:
kfree(cgrp);
return err;
err_destroy:
cgroup_destroy_locked(cgrp);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&dentry->d_inode->i_mutex);
return err;
}
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct cgroup *c_parent = dentry->d_parent->d_fsdata;
/* the vfs holds inode->i_mutex already */
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
static void cgroup_css_killed(struct cgroup *cgrp)
{
if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
return;
/* percpu ref's of all css's are killed, kick off the next step */
INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
schedule_work(&cgrp->destroy_work);
}
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget() is now guaranteed to fail.
*/
static void css_killed_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
cgroup_css_killed(cgrp);
}
/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_killed_work_fn);
schedule_work(&css->destroy_work);
}
/**
* cgroup_destroy_locked - the first stage of cgroup destruction
* @cgrp: cgroup to be destroyed
*
* css's make use of percpu refcnts whose killing latency shouldn't be
* exposed to userland and are RCU protected. Also, cgroup core needs to
* guarantee that css_tryget() won't succeed by the time ->css_offline() is
* invoked. To satisfy all the requirements, destruction is implemented in
* the following two steps.
*
* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
* userland visible parts and start killing the percpu refcnts of
* css's. Set up so that the next stage will be kicked off once all
* the percpu refcnts are confirmed to be killed.
*
* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
* rest of destruction. Once all cgroup references are gone, the
* cgroup is RCU-freed.
*
* This function implements s1. After this step, @cgrp is gone as far as
* the userland is concerned and a new cgroup with the same name may be
* created. As cgroup doesn't care about the names internally, this
* doesn't cause any problem.
*/
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct dentry *d = cgrp->dentry;
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
bool empty;
lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
/*
* css_set_lock synchronizes access to ->cset_links and prevents
* @cgrp from being removed while __put_css_set() is in progress.
*/
read_lock(&css_set_lock);
empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
read_unlock(&css_set_lock);
if (!empty)
return -EBUSY;
/*
* Block new css_tryget() by killing css refcnts. cgroup core
* guarantees that, by the time ->css_offline() is invoked, no new
* css reference will be given out via css_tryget(). We can't
* simply call percpu_ref_kill() and proceed to offlining css's
* because percpu_ref_kill() doesn't guarantee that the ref is seen
* as killed on all CPUs on return.
*
* Use percpu_ref_kill_and_confirm() to get notifications as each
* css is confirmed to be seen as killed on all CPUs. The
* notification callback keeps track of the number of css's to be
* killed and schedules cgroup_offline_fn() to perform the rest of
* destruction once the percpu refs of all css's are confirmed to
* be killed.
*/
atomic_set(&cgrp->css_kill_cnt, 1);
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
/*
* Killing would put the base ref, but we need to keep it
* alive until after ->css_offline.
*/
percpu_ref_get(&css->refcnt);
atomic_inc(&cgrp->css_kill_cnt);
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
cgroup_css_killed(cgrp);
/*
* Mark @cgrp dead. This prevents further task migration and child
* creation by disabling cgroup_lock_live_group(). Note that
* CGRP_DEAD assertion is depended upon by css_next_child() to
* resume iteration after dropping RCU read lock. See
* css_next_child() for details.
*/
set_bit(CGRP_DEAD, &cgrp->flags);
/* CGRP_DEAD is set, remove from ->release_list for the last time */
raw_spin_lock(&release_list_lock);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/*
* Clear and remove @cgrp directory. The removal puts the base ref
* but we aren't quite done with @cgrp yet, so hold onto it.
*/
cgroup_clear_dir(cgrp, cgrp->root->subsys_mask);
cgroup_addrm_files(cgrp, cgroup_base_files, false);
dget(d);
cgroup_d_remove_dir(d);
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
spin_lock(&cgrp->event_list_lock);
list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
return 0;
}
/**
* cgroup_offline_fn - the second step of cgroup destruction
* @work: cgroup->destroy_work
*
* This function is invoked from a work item for a cgroup which is being
* destroyed after the percpu refcnts of all css's are guaranteed to be
* seen as killed on all CPUs, and performs the rest of destruction. This
* is the second step of destruction described in the comment above
* cgroup_destroy_locked().
*/
static void cgroup_offline_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
struct cgroup *parent = cgrp->parent;
struct dentry *d = cgrp->dentry;
struct cgroup_subsys *ss;
mutex_lock(&cgroup_mutex);
/*
* css_tryget() is guaranteed to fail now. Tell subsystems to
* initiate destruction.
*/
for_each_root_subsys(cgrp->root, ss)
offline_css(cgroup_css(cgrp, ss->subsys_id));
/*
* Put the css refs from cgroup_destroy_locked(). Each css holds
* an extra reference to the cgroup's dentry and cgroup removal
* proceeds regardless of css refs. On the last put of each css,
* whenever that may be, the extra dentry ref is put so that dentry
* destruction happens only after all css's are released.
*/
for_each_root_subsys(cgrp->root, ss)
css_put(cgroup_css(cgrp, ss->subsys_id));
/* delete this cgroup from parent->children */
list_del_rcu(&cgrp->sibling);
/*
* We should remove the cgroup object from idr before its grace
* period starts, so we won't be looking up a cgroup while the
* cgroup is being freed.
*/
idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
mutex_unlock(&cgroup_mutex);
}
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = cgroup_destroy_locked(dentry->d_fsdata);
mutex_unlock(&cgroup_mutex);
return ret;
}
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
INIT_LIST_HEAD(&ss->cftsets);
/*
* base_cftset is embedded in subsys itself, no need to worry about
* deregistration.
*/
if (ss->base_cftypes) {
struct cftype *cft;
for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
cft->ss = ss;
ss->base_cftset.cfts = ss->base_cftypes;
list_add_tail(&ss->base_cftset.node, &ss->cftsets);
}
}
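/*
 * Illustration only: a sketch of what a subsystem's ->base_cftypes array
 * looks like to the loop above - an empty sentinel entry (name[0] == '\0')
 * terminates the iteration. The subsystem file name and handler below are
 * hypothetical.
 */
#if 0	/* example, excluded from the build */
static u64 example_weight_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return 10;	/* would normally return per-css state */
}

static struct cftype example_base_files[] = {
	{
		.name = "example.weight",
		.read_u64 = example_weight_read,
	},
	{ }	/* terminate - stops the cft->name[0] != '\0' loop */
};
#endif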
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
/* init base cftset */
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_css(css, ss, cgroup_dummy_top);
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's top cgroup. */
init_css_set.subsys[ss->subsys_id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
BUG_ON(online_css(css));
mutex_unlock(&cgroup_mutex);
/* this function shouldn't be used with modular subsystems, since they
* need to register a subsys_id, among other things */
BUG_ON(ss->module);
}
/**
* cgroup_load_subsys: load and register a modular subsystem at runtime
* @ss: the subsystem to load
*
* This function should be called in a modular subsystem's initcall. If the
* subsystem is built as a module, it will be assigned a new subsys_id and set
* up for use. If the subsystem is built-in anyway, work is delegated to the
* simpler cgroup_init_subsys.
*/
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
struct css_set *cset;
struct hlist_node *tmp;
unsigned long key;
int i, ret;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
ss->css_alloc == NULL || ss->css_free == NULL)
return -EINVAL;
/*
* we don't support callbacks in modular subsystems. this check is
* before the ss->module check for consistency; a subsystem that could
* be a module should still have no callbacks even if the user isn't
* compiling it as one.
*/
if (ss->fork || ss->exit)
return -EINVAL;
/*
* an optionally modular subsystem is built-in: we want to do nothing,
* since cgroup_init_subsys will have already taken care of it.
*/
if (ss->module == NULL) {
BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
return 0;
}
/* init base cftset */
cgroup_init_cftsets(ss);
mutex_lock(&cgroup_mutex);
cgroup_subsys[ss->subsys_id] = ss;
/*
* no ss->css_alloc seems to need anything important in the ss
* struct, so this can happen first (i.e. before the dummy root
* attachment).
*/
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id));
if (IS_ERR(css)) {
/* failure case - need to deassign the cgroup_subsys[] slot. */
cgroup_subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
return PTR_ERR(css);
}
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
/* our new subsystem will be attached to the dummy hierarchy. */
init_css(css, ss, cgroup_dummy_top);
/* init_idr must be after init_css() because it sets css->id. */
if (ss->use_id) {
ret = cgroup_init_idr(ss, css);
if (ret)
goto err_unload;
}
/*
* Now we need to entangle the css into the existing css_sets. unlike
* in cgroup_init_subsys, there are now multiple css_sets, so each one
* will need a new pointer to it; done by iterating the css_set_table.
* furthermore, modifying the existing css_sets will corrupt the hash
* table state, so each changed css_set will need its hash recomputed.
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
/* skip entries that we already rehashed */
if (cset->subsys[ss->subsys_id])
continue;
/* remove existing entry */
hash_del(&cset->hlist);
cset->subsys[ss->subsys_id] = css;
/* recompute hash and restore entry */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
}
write_unlock(&css_set_lock);
ret = online_css(css);
if (ret)
goto err_unload;
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
err_unload:
mutex_unlock(&cgroup_mutex);
/* @ss can't be mounted here as try_module_get() would fail */
cgroup_unload_subsys(ss);
return ret;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
/**
* cgroup_unload_subsys: unload a modular subsystem
* @ss: the subsystem to unload
*
* This function should be called in a modular subsystem's exitcall. When this
* function is invoked, the refcount on the subsystem's module will be 0, so
* the subsystem will not be attached to any hierarchy.
*/
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cgrp_cset_link *link;
unsigned long key;
BUG_ON(ss->module == NULL);
/*
* we shouldn't be called if the subsystem is in use, and the use of
* try_module_get() in rebind_subsystems() should ensure that it
* doesn't start being used while we're killing it off.
*/
BUG_ON(ss->root != &cgroup_dummy_root);
mutex_lock(&cgroup_mutex);
offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id));
idr_destroy(&ss->idr);
cgroup_subsys[ss->subsys_id] = NULL;
/* remove subsystem from the dummy root's list of subsystems */
list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummy
* top. as in loading, we need to pay our respects to the hashtable
* gods.
*/
write_lock(&css_set_lock);
list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
struct css_set *cset = link->cset;
hash_del(&cset->hlist);
cset->subsys[ss->subsys_id] = NULL;
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
}
write_unlock(&css_set_lock);
/*
* remove subsystem's css from the cgroup_dummy_top and free it -
* need to free before marking as null because ss->css_free needs
* the cgrp->subsys pointer to find their state. note that this
* also takes care of freeing the css_id.
*/
ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id));
RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
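/*
 * Illustration only: a sketch of how a modular controller is expected to use
 * cgroup_load_subsys()/cgroup_unload_subsys() from its init/exit calls, per
 * the comments above. The subsystem name, handlers and subsys_id below are
 * hypothetical and assume an id has already been reserved for the subsystem.
 */
#if 0	/* example, excluded from the build */
static struct cgroup_subsys_state *
example_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css;

	css = kzalloc(sizeof(*css), GFP_KERNEL);
	return css ?: ERR_PTR(-ENOMEM);
}

static void example_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.css_alloc	= example_css_alloc,
	.css_free	= example_css_free,
	.subsys_id	= example_subsys_id,	/* hypothetical reserved id */
	.module		= THIS_MODULE,
};

static int __init example_cgroup_init(void)
{
	return cgroup_load_subsys(&example_subsys);
}
module_init(example_cgroup_init);

static void __exit example_cgroup_exit(void)
{
	cgroup_unload_subsys(&example_subsys);
}
module_exit(example_cgroup_exit);
#endif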