Newer
Older
* don't get nasty surprises if we ever grow another caller.
*/
if (!cgroup_lock_live_group(parent)) {
err = -ENODEV;
goto err_free;
}
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* disappear while someone has an open control file on the
* fs */
atomic_inc(&sb->s_active);
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
cgrp->root = parent->root;
cgrp->top_cgroup = parent->top_cgroup;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (clone_children(parent))
set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
for_each_subsys(root, ss) {
Tejun Heo
committed
struct cgroup_subsys_state *css;
Tejun Heo
committed
css = ss->create(cgrp);
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_destroy;
}
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
/* At error, ->destroy() callback has to free assigned ID. */
if (clone_children(parent) && ss->post_clone)
Tejun Heo
committed
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
ss->warned_broken_hierarchy = true;
}
list_add(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
err = cgroup_create_dir(cgrp, dentry, mode);
if (err < 0)
goto err_remove;
/* each css holds a ref to the cgroup's dentry */
for_each_subsys(root, ss)
/* The cgroup directory was pre-locked for us */
BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
/* If err < 0, we have a half-filled directory - oh well ;) */
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
err_remove:
list_del(&cgrp->sibling);
root->number_of_cgroups--;
err_destroy:
for_each_subsys(root, ss) {
if (cgrp->subsys[ss->subsys_id])
}
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
kfree(cgrp);
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct cgroup *c_parent = dentry->d_parent->d_fsdata;
/* the vfs holds inode->i_mutex already */
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
/*
* Check the reference count on each subsystem. Since we already
* established that there are no tasks in the cgroup, if the css refcount
* is also 1, then there should be no outstanding references, so the
* subsystem is safe to destroy. We scan across all subsystems rather than
* using the per-hierarchy linked list of mounted subsystems since we can
* be called via check_for_release() with no synchronization other than
* RCU, and the subsystem linked list isn't RCU-safe.
*/
static int cgroup_has_css_refs(struct cgroup *cgrp)
{
int i;
/*
* We won't need to lock the subsys array, because the subsystems
* we're concerned about aren't going anywhere since our cgroup root
* has a reference on them.
*/
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
struct cgroup_subsys_state *css;
/* Skip subsystems not present or not in this hierarchy */
if (ss == NULL || ss->root != cgrp->root)
continue;
css = cgrp->subsys[ss->subsys_id];
/*
* When called from check_for_release() it's possible
* that by this point the cgroup has been removed
* and the css deleted. But a false-positive doesn't
* matter, since it can only happen if the cgroup
* has been deleted and hence no longer needs the
* release agent to be called anyway.
*/
if (css && css_refcnt(css) > 1)
return 1;
}
return 0;
}
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
/* the vfs holds both inode->i_mutex already */
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
* Block new css_tryget() by deactivating refcnt and mark @cgrp
* removed. This makes future css_tryget() and child creation
* attempts fail thus maintaining the removal conditions verified
* above.
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
WARN_ON(atomic_read(&css->refcnt) < 0);
atomic_add(CSS_DEACT_BIAS, &css->refcnt);
set_bit(CGRP_REMOVED, &cgrp->flags);
/*
* Tell subsystems to initate destruction. pre_destroy() should be
* called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix
* potential deadlock in pre_destroy") for details.
*/
mutex_unlock(&cgroup_mutex);
for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy)
mutex_lock(&cgroup_mutex);
/*
* Put all the base refs. Each css holds an extra reference to the
* cgroup's dentry and cgroup removal proceeds regardless of css
* refs. On the last put of each css, whenever that may be, the
* extra dentry ref is put so that dentry destruction happens only
* after all css's are released.
*/
for_each_subsys(cgrp->root, ss)
css_put(cgrp->subsys[ss->subsys_id]);
raw_spin_lock(&release_list_lock);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/* delete this cgroup from parent->children */
list_del_init(&cgrp->sibling);
list_del_init(&cgrp->allcg_node);
d = dget(cgrp->dentry);
cgroup_d_remove_dir(d);
dput(d);
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace
*/
spin_lock(&cgrp->event_list_lock);
list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del(&event->list);
remove_wait_queue(event->wqh, &event->wait);
eventfd_signal(event->eventfd, 1);
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
mutex_unlock(&cgroup_mutex);
return 0;
}
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
INIT_LIST_HEAD(&ss->cftsets);
/*
* base_cftset is embedded in subsys itself, no need to worry about
* deregistration.
*/
if (ss->base_cftypes) {
ss->base_cftset.cfts = ss->base_cftypes;
list_add_tail(&ss->base_cftset.node, &ss->cftsets);
}
}
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
/* init base cftset */
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &rootnode.subsys_list);
css = ss->create(dummytop);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_cgroup_css(css, ss, dummytop);
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's top cgroup. */
init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
need_forkexit_callback |= ss->fork || ss->exit;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
/* this function shouldn't be used with modular subsystems, since they
* need to register a subsys_id, among other things */
BUG_ON(ss->module);
}
/**
* cgroup_load_subsys: load and register a modular subsystem at runtime
* @ss: the subsystem to load
*
* This function should be called in a modular subsystem's initcall. If the
* subsystem is built as a module, it will be assigned a new subsys_id and set
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
* up for use. If the subsystem is built-in anyway, work is delegated to the
* simpler cgroup_init_subsys.
*/
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
int i;
struct cgroup_subsys_state *css;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
ss->create == NULL || ss->destroy == NULL)
return -EINVAL;
/*
* we don't support callbacks in modular subsystems. this check is
* before the ss->module check for consistency; a subsystem that could
* be a module should still have no callbacks even if the user isn't
* compiling it as one.
*/
if (ss->fork || ss->exit)
return -EINVAL;
/*
* an optionally modular subsystem is built-in: we want to do nothing,
* since cgroup_init_subsys will have already taken care of it.
*/
if (ss->module == NULL) {
BUG_ON(subsys[ss->subsys_id] != ss);
return 0;
}
/* init base cftset */
cgroup_init_cftsets(ss);
subsys[ss->subsys_id] = ss;
/*
* no ss->create seems to need anything important in the ss struct, so
* this can happen first (i.e. before the rootnode attachment).
*/
css = ss->create(dummytop);
if (IS_ERR(css)) {
/* failure case - need to deassign the subsys[] slot. */
subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
return PTR_ERR(css);
}
list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
/* our new subsystem will be attached to the dummy hierarchy. */
init_cgroup_css(css, ss, dummytop);
/* init_idr must be after init_cgroup_css because it sets css->id. */
if (ss->use_id) {
int ret = cgroup_init_idr(ss, css);
if (ret) {
dummytop->subsys[ss->subsys_id] = NULL;
subsys[ss->subsys_id] = NULL;
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
4367
4368
4369
4370
4371
4372
4373
4374
4375
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
mutex_unlock(&cgroup_mutex);
return ret;
}
}
/*
* Now we need to entangle the css into the existing css_sets. unlike
* in cgroup_init_subsys, there are now multiple css_sets, so each one
* will need a new pointer to it; done by iterating the css_set_table.
* furthermore, modifying the existing css_sets will corrupt the hash
* table state, so each changed css_set will need its hash recomputed.
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
struct css_set *cg;
struct hlist_node *node, *tmp;
struct hlist_head *bucket = &css_set_table[i], *new_bucket;
hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
/* skip entries that we already rehashed */
if (cg->subsys[ss->subsys_id])
continue;
/* remove existing entry */
hlist_del(&cg->hlist);
/* set new value */
cg->subsys[ss->subsys_id] = css;
/* recompute hash and restore entry */
new_bucket = css_set_hash(cg->subsys);
hlist_add_head(&cg->hlist, new_bucket);
}
}
write_unlock(&css_set_lock);
ss->active = 1;
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4398
4399
4400
4401
4402
4403
4404
4405
4406
4407
4408
4409
4410
4411
4412
4413
4414
4415
4416
4417
4418
4419
4420
4421
4422
4423
4424
/**
* cgroup_unload_subsys: unload a modular subsystem
* @ss: the subsystem to unload
*
* This function should be called in a modular subsystem's exitcall. When this
* function is invoked, the refcount on the subsystem's module will be 0, so
* the subsystem will not be attached to any hierarchy.
*/
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cg_cgroup_link *link;
struct hlist_head *hhead;
BUG_ON(ss->module == NULL);
/*
* we shouldn't be called if the subsystem is in use, and the use of
* try_module_get in parse_cgroupfs_options should ensure that it
* doesn't start being used while we're killing it off.
*/
BUG_ON(ss->root != &rootnode);
mutex_lock(&cgroup_mutex);
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
list_del_init(&ss->sibling);
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
/*
* disentangle the css from all css_sets attached to the dummytop. as
* in loading, we need to pay our respects to the hashtable gods.
*/
write_lock(&css_set_lock);
list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
struct css_set *cg = link->cg;
hlist_del(&cg->hlist);
BUG_ON(!cg->subsys[ss->subsys_id]);
cg->subsys[ss->subsys_id] = NULL;
hhead = css_set_hash(cg->subsys);
hlist_add_head(&cg->hlist, hhead);
}
write_unlock(&css_set_lock);
/*
* remove subsystem's css from the dummytop and free it - need to free
* before marking as null because ss->destroy needs the cgrp->subsys
* pointer to find their state. note that this also takes care of
* freeing the css_id.
*/
dummytop->subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
* cgroup_init_early - cgroup initialization at system boot
*
* Initialize cgroups at system boot, and initialize any
* subsystems that request early init.
*/
int __init cgroup_init_early(void)
{
int i;
atomic_set(&init_css_set.refcount, 1);
INIT_LIST_HEAD(&init_css_set.cg_links);
INIT_LIST_HEAD(&init_css_set.tasks);
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&rootnode);
root_count = 1;
init_task.cgroups = &init_css_set;
init_css_set_link.cg = &init_css_set;
init_css_set_link.cgrp = dummytop;
list_add(&init_css_set_link.cgrp_link_list,
&rootnode.top_cgroup.css_sets);
list_add(&init_css_set_link.cg_link_list,
&init_css_set.cg_links);
for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&css_set_table[i]);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
/* at bootup time, we don't worry about modular subsystems */
if (!ss || ss->module)
continue;
BUG_ON(!ss->name);
BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
BUG_ON(!ss->create);
BUG_ON(!ss->destroy);
if (ss->subsys_id != i) {
ss->name, ss->subsys_id);
BUG();
}
if (ss->early_init)
cgroup_init_subsys(ss);
}
return 0;
}
/**
* cgroup_init - cgroup initialization
*
* Register cgroup filesystem and /proc file, and initialize
* any subsystems that didn't request early init.
*/
int __init cgroup_init(void)
{
int err;
int i;
err = bdi_init(&cgroup_backing_dev_info);
if (err)
return err;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
/* at bootup time, we don't worry about modular subsystems */
if (!ss || ss->module)
continue;
if (!ss->early_init)
cgroup_init_subsys(ss);
cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
/* Add init_css_set to the hash table */
hhead = css_set_hash(init_css_set.subsys);
hlist_add_head(&init_css_set.hlist, hhead);
BUG_ON(!init_root_id(&rootnode));
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
if (!cgroup_kobj) {
err = -ENOMEM;
goto out;
}
err = register_filesystem(&cgroup_fs_type);
if (err < 0) {
kobject_put(cgroup_kobj);
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
if (err)
bdi_destroy(&cgroup_backing_dev_info);
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
* - No need to task_lock(tsk) on this tsk->cgroup reference, as it
* doesn't really matter if tsk->cgroup changes after we read it,
* and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
* anyway. No need to check that tsk->cgroup != NULL, thanks to
* the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
* cgroup to top_cgroup.
*/
/* TODO: Use a proper seq_file iterator */
static int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
char *buf;
int retval;
struct cgroupfs_root *root;
retval = -ENOMEM;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
goto out;
retval = -ESRCH;
pid = m->private;
tsk = get_pid_task(pid, PIDTYPE_PID);
if (!tsk)
goto out_free;
retval = 0;
mutex_lock(&cgroup_mutex);
for_each_active_root(root) {
struct cgroup *cgrp;
seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
cgrp = task_cgroup_from_root(tsk, root);
retval = cgroup_path(cgrp, buf, PAGE_SIZE);
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
if (retval < 0)
goto out_unlock;
seq_puts(m, buf);
seq_putc(m, '\n');
}
out_unlock:
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
static int cgroup_open(struct inode *inode, struct file *file)
{
struct pid *pid = PROC_I(inode)->pid;
return single_open(file, proc_cgroup_show, pid);
}
const struct file_operations proc_cgroup_operations = {
.open = cgroup_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
* ideally we don't want subsystems moving around while we do this.
* cgroup_mutex is also necessary to guarantee an atomic snapshot of
* subsys/hierarchy state.
*/
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
ss->root->number_of_cgroups, !ss->disabled);
}
mutex_unlock(&cgroup_mutex);
return 0;
}
static int cgroupstats_open(struct inode *inode, struct file *file)
{
return single_open(file, proc_cgroupstats_show, NULL);
static const struct file_operations proc_cgroupstats_operations = {
.open = cgroupstats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/**
* cgroup_fork - attach newly forked task to its parents cgroup.
* @child: pointer to task_struct of forking parent process.
*
* Description: A task inherits its parent's cgroup at fork().
*
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
* it was not made under the protection of RCU or cgroup_mutex, so
* might no longer be a valid cgroup pointer. cgroup_attach_task() might
* have already changed current->cgroups, allowing the previously
* referenced cgroup group to be removed and freed.
*
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
void cgroup_fork(struct task_struct *child)
{
task_lock(current);
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
task_unlock(current);
INIT_LIST_HEAD(&child->cg_list);
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
Tejun Heo
committed
* Adds the task to the list running through its css_set if necessary and
* call the subsystem fork() callbacks. Has to be after the task is
* visible on the task list in case we race with the first call to
* cgroup_iter_start() - to guarantee that the new task ends up on its
* list.
void cgroup_post_fork(struct task_struct *child)
{
Tejun Heo
committed
int i;
Frederic Weisbecker
committed
/*
* use_task_css_set_links is set to 1 before we walk the tasklist
* under the tasklist_lock and we read it here after we added the child
* to the tasklist under the tasklist_lock as well. If the child wasn't
* yet in the tasklist when we walked through it from
* cgroup_enable_task_cg_lists(), then use_task_css_set_links value
* should be visible now due to the paired locking and barriers implied
* by LOCK/UNLOCK: it is written before the tasklist_lock unlock
* in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
* lock on fork.
*/
if (use_task_css_set_links) {
write_lock(&css_set_lock);
task_lock(child);
if (list_empty(&child->cg_list))
list_add(&child->cg_list, &child->cgroups->tasks);
task_unlock(child);
write_unlock(&css_set_lock);
}
Tejun Heo
committed
4733
4734
4735
4736
4737
4738
4739
4740
4741
4742
4743
4744
4745
4746
4747
4748
4749
4750
4751
4752
4753
4754
/*
* Call ss->fork(). This must happen after @child is linked on
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
if (need_forkexit_callback) {
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
/*
* fork/exit callbacks are supported only for
* builtin subsystems and we don't need further
* synchronization as they never go away.
*/
if (!ss || ss->module)
continue;
if (ss->fork)
ss->fork(child);
}
}
Tejun Heo
committed
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
4761
4762
4763
4764
4765
4766
4767
4768
4769
4770
4771
4772
4773
4774
4775
4776
4777
4778
4779
4780
4781
4782
4783
4784
4785
4786
4787
4788
*
* Description: Detach cgroup from @tsk and release it.
*
* Note that cgroups marked notify_on_release force every task in
* them to take the global cgroup_mutex mutex when exiting.
* This could impact scaling on very large systems. Be reluctant to
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
* the_top_cgroup_hack:
*
* Set the exiting tasks cgroup to the root cgroup (top_cgroup).
*
* We call cgroup_exit() while the task is still competent to
* handle notify_on_release(), then leave the task attached to the
* root cgroup in each hierarchy for the remainder of its exit.
*
* To do this properly, we would increment the reference count on
* top_cgroup, and near the very end of the kernel/exit.c do_exit()
* code we would add a second cgroup function call, to drop that
* reference. This would just create an unnecessary hot spot on
* the top_cgroup reference count, to no avail.
*
* Normally, holding a reference to a cgroup without bumping its
* count is unsafe. The cgroup could go away, or someone could
* attach us to a different cgroup, decrementing the count on
* the first cgroup that we never incremented. But in this case,
* top_cgroup isn't going away, and either task has PF_EXITING set,
* which wards off any cgroup_attach_task() attempts, or task is a failed
* fork, never visible to cgroup_attach_task.
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
struct css_set *cg;
/*
* Unlink from the css_set task list if necessary.
* Optimistically check cg_list before taking
* css_set_lock
*/
if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock);
}
/* Reassign the task to the init_css_set. */
task_lock(tsk);
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;
if (run_callbacks && need_forkexit_callback) {
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
/* modular subsystems can't use callbacks */
if (!ss || ss->module)
continue;
if (ss->exit) {
struct cgroup *old_cgrp =
rcu_dereference_raw(cg->subsys[i])->cgroup;
struct cgroup *cgrp = task_cgroup(tsk, i);
ss->exit(cgrp, old_cgrp, tsk);
put_css_set_taskexit(cg);
Grzegorz Nosek
committed
* cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
Grzegorz Nosek
committed
* @task: the task in question
Grzegorz Nosek
committed
* See if @cgrp is a descendant of @task's cgroup in the appropriate
* hierarchy.
*
* If we are sending in dummytop, then presumably we are creating
* the top cgroup in the subsystem.
*
* Called only by the ns (nsproxy) cgroup.
*/
Grzegorz Nosek
committed
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
int ret;
struct cgroup *target;
if (cgrp == dummytop)
target = task_cgroup_from_root(task, cgrp->root);
while (cgrp != target && cgrp!= cgrp->top_cgroup)
cgrp = cgrp->parent;
ret = (cgrp == target);
static void check_for_release(struct cgroup *cgrp)
{
/* All of these checks rely on RCU to keep the cgroup
* structure alive */
if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count)
&& list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) {
/* Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
* it now */
int need_schedule_work = 0;
raw_spin_lock(&release_list_lock);
if (!cgroup_is_removed(cgrp) &&
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
}
raw_spin_unlock(&release_list_lock);
if (need_schedule_work)
schedule_work(&release_agent_work);
}
}
/* Caller must verify that the css is not for root cgroup */
bool __css_tryget(struct cgroup_subsys_state *css)
{
v = css_refcnt(css);
t = atomic_cmpxchg(&css->refcnt, v, v + 1);
if (likely(t == v))
return true;
cpu_relax();
}
EXPORT_SYMBOL_GPL(__css_tryget);
/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css)
struct cgroup *cgrp = css->cgroup;
rcu_read_lock();
v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
switch (v) {
if (notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
break;
case 0:
schedule_work(&css->dput_work);
}
rcu_read_unlock();
}
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
* relative to the root of cgroup file system) as the argument.
*
* Most likely, this user command will try to rmdir this cgroup.
*
* This races with the possibility that some other task will be
* attached to this cgroup before it is removed, or that some other
* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
* The presumed 'rmdir' will fail quietly if this cgroup is no longer
* unused, and this cgroup will be reprieved from its death sentence,
* to continue to serve a useful existence. Next time it's released,
* we will get notified again, if it still has 'notify_on_release' set.
*
* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
* means only wait until the task is successfully execve()'d. The
* separate release agent task is forked by call_usermodehelper(),
* then control in this thread returns here, without waiting for the
* release agent task. We don't bother to wait because the caller of
* this routine has no use for the exit status of the release agent
* task, so no sense holding our caller up for that.
*/
static void cgroup_release_agent(struct work_struct *work)
{
BUG_ON(work != &release_agent_work);
mutex_lock(&cgroup_mutex);
raw_spin_lock(&release_list_lock);
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
char *pathbuf = NULL, *agentbuf = NULL;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
goto continue_free;
i = 0;
argv[i++] = agentbuf;
argv[i++] = pathbuf;
argv[i] = NULL;
i = 0;
/* minimal command environment */
envp[i++] = "HOME=/";
envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[i] = NULL;
/* Drop the lock while we invoke the usermode helper,
* since the exec could involve hitting disk and hence
* be a slow process */
mutex_unlock(&cgroup_mutex);
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
mutex_lock(&cgroup_mutex);
continue_free:
kfree(pathbuf);
kfree(agentbuf);
raw_spin_lock(&release_list_lock);
raw_spin_unlock(&release_list_lock);
mutex_unlock(&cgroup_mutex);
}
static int __init cgroup_disable(char *str)
{