Newer
Older
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, dput_work);
struct dentry *dentry = css->cgroup->dentry;
struct super_block *sb = dentry->d_sb;
atomic_inc(&sb->s_active);
dput(dentry);
deactivate_super(sb);
static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss,
struct cgroup *cgrp)
css->cgroup = cgrp;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;
/*
* If !clear_css_refs, css holds an extra ref to @cgrp->dentry
* which is put on the last css_put(). dput() requires process
* context, which css_put() may be called without. @css->dput_work
* will be used to invoke dput() asynchronously from css_put().
*/
INIT_WORK(&css->dput_work, css_dput_fn);
if (ss->__DEPRECATED_clear_css_refs)
set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
* @dentry: dentry of the new cgroup
* @mode: mode to set on new inode
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
int err = 0;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
return -ENOMEM;
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* disappear while someone has an open control file on the
* fs */
atomic_inc(&sb->s_active);
mutex_lock(&cgroup_mutex);
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
cgrp->root = parent->root;
cgrp->top_cgroup = parent->top_cgroup;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (clone_children(parent))
set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
for_each_subsys(root, ss) {
Tejun Heo
committed
struct cgroup_subsys_state *css;
Tejun Heo
committed
css = ss->create(cgrp);
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_destroy;
}
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
/* At error, ->destroy() callback has to free assigned ID. */
if (clone_children(parent) && ss->post_clone)
Tejun Heo
committed
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
ss->warned_broken_hierarchy = true;
}
list_add(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
err = cgroup_create_dir(cgrp, dentry, mode);
if (err < 0)
goto err_remove;
/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
for_each_subsys(root, ss)
if (!ss->__DEPRECATED_clear_css_refs)
dget(dentry);
/* The cgroup directory was pre-locked for us */
BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
list_add_tail(&cgrp->allcg_node, &root->allcg_list);
err = cgroup_populate_dir(cgrp, true, root->subsys_mask);
/* If err < 0, we have a half-filled directory - oh well ;) */
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
err_remove:
list_del(&cgrp->sibling);
root->number_of_cgroups--;
err_destroy:
for_each_subsys(root, ss) {
if (cgrp->subsys[ss->subsys_id])
}
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
kfree(cgrp);
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct cgroup *c_parent = dentry->d_parent->d_fsdata;
/* the vfs holds inode->i_mutex already */
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
/*
* Check the reference count on each subsystem. Since we already
* established that there are no tasks in the cgroup, if the css refcount
* is also 1, then there should be no outstanding references, so the
* subsystem is safe to destroy. We scan across all subsystems rather than
* using the per-hierarchy linked list of mounted subsystems since we can
* be called via check_for_release() with no synchronization other than
* RCU, and the subsystem linked list isn't RCU-safe.
*/
static int cgroup_has_css_refs(struct cgroup *cgrp)
{
int i;
/*
* We won't need to lock the subsys array, because the subsystems
* we're concerned about aren't going anywhere since our cgroup root
* has a reference on them.
*/
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
struct cgroup_subsys_state *css;
/* Skip subsystems not present or not in this hierarchy */
if (ss == NULL || ss->root != cgrp->root)
continue;
css = cgrp->subsys[ss->subsys_id];
/*
* When called from check_for_release() it's possible
* that by this point the cgroup has been removed
* and the css deleted. But a false-positive doesn't
* matter, since it can only happen if the cgroup
* has been deleted and hence no longer needs the
* release agent to be called anyway.
*/
if (css && css_refcnt(css) > 1)
return 1;
}
return 0;
}
/*
* Atomically mark all (or else none) of the cgroup's CSS objects as
* CSS_REMOVED. Return true on success, or false if the cgroup has
* busy subsystems. Call with cgroup_mutex held
*
* Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
* not, cgroup removal behaves differently.
*
* If clear is set, css refcnt for the subsystem should be zero before
* cgroup removal can be committed. This is implemented by
* CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
* called multiple times until all css refcnts reach zero and is allowed to
* veto removal on any invocation. This behavior is deprecated and will be
* removed as soon as the existing user (memcg) is updated.
*
* If clear is not set, each css holds an extra reference to the cgroup's
* dentry and cgroup removal proceeds regardless of css refs.
* ->pre_destroy() will be called at least once and is not allowed to fail.
* On the last put of each css, whenever that may be, the extra dentry ref
* is put so that dentry destruction happens only after all css's are
* released.
*/
static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
unsigned long flags;
bool failed = false;
/*
* Block new css_tryget() by deactivating refcnt. If all refcnts
* for subsystems w/ clear_css_refs set were 1 at the moment of
* deactivation, we succeeded.
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
WARN_ON(atomic_read(&css->refcnt) < 0);
atomic_add(CSS_DEACT_BIAS, &css->refcnt);
if (ss->__DEPRECATED_clear_css_refs)
failed |= css_refcnt(css) != 1;
/*
* If succeeded, set REMOVED and put all the base refs; otherwise,
* restore refcnts to positive values. Either way, all in-progress
* css_tryget() will be released.
*/
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
if (!failed) {
css_put(css);
} else {
atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
local_irq_restore(flags);
return !failed;
}
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
struct cgroup_event *event, *tmp;
/* the vfs holds both inode->i_mutex already */
mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
if (!list_empty(&cgrp->children)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
mutex_unlock(&cgroup_mutex);
/*
* In general, subsystem has no css->refcnt after pre_destroy(). But
* in racy cases, subsystem may have to get css->refcnt after
* pre_destroy() and it makes rmdir return with -EBUSY. This sometimes
* make rmdir return -EBUSY too often. To avoid that, we use waitqueue
* for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir
* and subsystem's reference count handling. Please see css_get/put
* and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation.
*/
set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
ret = cgroup_call_pre_destroy(cgrp);
if (ret) {
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
/*
* Because someone may call cgroup_wakeup_rmdir_waiter() before
* prepare_to_wait(), we need to check this flag.
*/
if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))
schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
return -EINTR;
goto again;
}
/* NO css_tryget() can success after here. */
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
raw_spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/* delete this cgroup from parent->children */
list_del_init(&cgrp->sibling);
list_del_init(&cgrp->allcg_node);
d = dget(cgrp->dentry);
cgroup_d_remove_dir(d);
dput(d);
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace
*/
spin_lock(&cgrp->event_list_lock);
list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del(&event->list);
remove_wait_queue(event->wqh, &event->wait);
eventfd_signal(event->eventfd, 1);
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
mutex_unlock(&cgroup_mutex);
return 0;
}
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
INIT_LIST_HEAD(&ss->cftsets);
/*
* base_cftset is embedded in subsys itself, no need to worry about
* deregistration.
*/
if (ss->base_cftypes) {
ss->base_cftset.cfts = ss->base_cftypes;
list_add_tail(&ss->base_cftset.node, &ss->cftsets);
}
}
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
/* init base cftset */
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &rootnode.subsys_list);
css = ss->create(dummytop);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_cgroup_css(css, ss, dummytop);
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's top cgroup. */
init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
need_forkexit_callback |= ss->fork || ss->exit;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
/* this function shouldn't be used with modular subsystems, since they
* need to register a subsys_id, among other things */
BUG_ON(ss->module);
}
/**
* cgroup_load_subsys: load and register a modular subsystem at runtime
* @ss: the subsystem to load
*
* This function should be called in a modular subsystem's initcall. If the
* subsystem is built as a module, it will be assigned a new subsys_id and set
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
* up for use. If the subsystem is built-in anyway, work is delegated to the
* simpler cgroup_init_subsys.
*/
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
int i;
struct cgroup_subsys_state *css;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
ss->create == NULL || ss->destroy == NULL)
return -EINVAL;
/*
* we don't support callbacks in modular subsystems. this check is
* before the ss->module check for consistency; a subsystem that could
* be a module should still have no callbacks even if the user isn't
* compiling it as one.
*/
if (ss->fork || ss->exit)
return -EINVAL;
/*
* an optionally modular subsystem is built-in: we want to do nothing,
* since cgroup_init_subsys will have already taken care of it.
*/
if (ss->module == NULL) {
BUG_ON(subsys[ss->subsys_id] != ss);
return 0;
}
/* init base cftset */
cgroup_init_cftsets(ss);
subsys[ss->subsys_id] = ss;
/*
* no ss->create seems to need anything important in the ss struct, so
* this can happen first (i.e. before the rootnode attachment).
*/
css = ss->create(dummytop);
if (IS_ERR(css)) {
/* failure case - need to deassign the subsys[] slot. */
subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
return PTR_ERR(css);
}
list_add(&ss->sibling, &rootnode.subsys_list);
ss->root = &rootnode;
/* our new subsystem will be attached to the dummy hierarchy. */
init_cgroup_css(css, ss, dummytop);
/* init_idr must be after init_cgroup_css because it sets css->id. */
if (ss->use_id) {
int ret = cgroup_init_idr(ss, css);
if (ret) {
dummytop->subsys[ss->subsys_id] = NULL;
subsys[ss->subsys_id] = NULL;
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
mutex_unlock(&cgroup_mutex);
return ret;
}
}
/*
* Now we need to entangle the css into the existing css_sets. unlike
* in cgroup_init_subsys, there are now multiple css_sets, so each one
* will need a new pointer to it; done by iterating the css_set_table.
* furthermore, modifying the existing css_sets will corrupt the hash
* table state, so each changed css_set will need its hash recomputed.
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
for (i = 0; i < CSS_SET_TABLE_SIZE; i++) {
struct css_set *cg;
struct hlist_node *node, *tmp;
struct hlist_head *bucket = &css_set_table[i], *new_bucket;
hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) {
/* skip entries that we already rehashed */
if (cg->subsys[ss->subsys_id])
continue;
/* remove existing entry */
hlist_del(&cg->hlist);
/* set new value */
cg->subsys[ss->subsys_id] = css;
/* recompute hash and restore entry */
new_bucket = css_set_hash(cg->subsys);
hlist_add_head(&cg->hlist, new_bucket);
}
}
write_unlock(&css_set_lock);
ss->active = 1;
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
/**
* cgroup_unload_subsys: unload a modular subsystem
* @ss: the subsystem to unload
*
* This function should be called in a modular subsystem's exitcall. When this
* function is invoked, the refcount on the subsystem's module will be 0, so
* the subsystem will not be attached to any hierarchy.
*/
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cg_cgroup_link *link;
struct hlist_head *hhead;
BUG_ON(ss->module == NULL);
/*
* we shouldn't be called if the subsystem is in use, and the use of
* try_module_get in parse_cgroupfs_options should ensure that it
* doesn't start being used while we're killing it off.
*/
BUG_ON(ss->root != &rootnode);
mutex_lock(&cgroup_mutex);
/* deassign the subsys_id */
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
list_del_init(&ss->sibling);
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571
4572
4573
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583
/*
* disentangle the css from all css_sets attached to the dummytop. as
* in loading, we need to pay our respects to the hashtable gods.
*/
write_lock(&css_set_lock);
list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) {
struct css_set *cg = link->cg;
hlist_del(&cg->hlist);
BUG_ON(!cg->subsys[ss->subsys_id]);
cg->subsys[ss->subsys_id] = NULL;
hhead = css_set_hash(cg->subsys);
hlist_add_head(&cg->hlist, hhead);
}
write_unlock(&css_set_lock);
/*
* remove subsystem's css from the dummytop and free it - need to free
* before marking as null because ss->destroy needs the cgrp->subsys
* pointer to find their state. note that this also takes care of
* freeing the css_id.
*/
dummytop->subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
* cgroup_init_early - cgroup initialization at system boot
*
* Initialize cgroups at system boot, and initialize any
* subsystems that request early init.
*/
int __init cgroup_init_early(void)
{
int i;
atomic_set(&init_css_set.refcount, 1);
INIT_LIST_HEAD(&init_css_set.cg_links);
INIT_LIST_HEAD(&init_css_set.tasks);
INIT_HLIST_NODE(&init_css_set.hlist);
css_set_count = 1;
init_cgroup_root(&rootnode);
root_count = 1;
init_task.cgroups = &init_css_set;
init_css_set_link.cg = &init_css_set;
init_css_set_link.cgrp = dummytop;
list_add(&init_css_set_link.cgrp_link_list,
&rootnode.top_cgroup.css_sets);
list_add(&init_css_set_link.cg_link_list,
&init_css_set.cg_links);
for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
INIT_HLIST_HEAD(&css_set_table[i]);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
/* at bootup time, we don't worry about modular subsystems */
if (!ss || ss->module)
continue;
BUG_ON(!ss->name);
BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
BUG_ON(!ss->create);
BUG_ON(!ss->destroy);
if (ss->subsys_id != i) {
ss->name, ss->subsys_id);
BUG();
}
if (ss->early_init)
cgroup_init_subsys(ss);
}
return 0;
}
/**
* cgroup_init - cgroup initialization
*
* Register cgroup filesystem and /proc file, and initialize
* any subsystems that didn't request early init.
*/
int __init cgroup_init(void)
{
int err;
int i;
err = bdi_init(&cgroup_backing_dev_info);
if (err)
return err;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
/* at bootup time, we don't worry about modular subsystems */
if (!ss || ss->module)
continue;
if (!ss->early_init)
cgroup_init_subsys(ss);
cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
/* Add init_css_set to the hash table */
hhead = css_set_hash(init_css_set.subsys);
hlist_add_head(&init_css_set.hlist, hhead);
BUG_ON(!init_root_id(&rootnode));
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
if (!cgroup_kobj) {
err = -ENOMEM;
goto out;
}
err = register_filesystem(&cgroup_fs_type);
if (err < 0) {
kobject_put(cgroup_kobj);
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
if (err)
bdi_destroy(&cgroup_backing_dev_info);
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
* - No need to task_lock(tsk) on this tsk->cgroup reference, as it
* doesn't really matter if tsk->cgroup changes after we read it,
* and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
4703
4704
4705
4706
4707
4708
4709
4710
4711
4712
4713
4714
4715
4716
4717
4718
4719
4720
4721
4722
4723
4724
4725
4726
4727
4728
4729
4730
4731
* anyway. No need to check that tsk->cgroup != NULL, thanks to
* the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
* cgroup to top_cgroup.
*/
/* TODO: Use a proper seq_file iterator */
static int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
char *buf;
int retval;
struct cgroupfs_root *root;
retval = -ENOMEM;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!buf)
goto out;
retval = -ESRCH;
pid = m->private;
tsk = get_pid_task(pid, PIDTYPE_PID);
if (!tsk)
goto out_free;
retval = 0;
mutex_lock(&cgroup_mutex);
for_each_active_root(root) {
struct cgroup *cgrp;
seq_printf(m, "%d:", root->hierarchy_id);
for_each_subsys(root, ss)
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
cgrp = task_cgroup_from_root(tsk, root);
retval = cgroup_path(cgrp, buf, PAGE_SIZE);
4746
4747
4748
4749
4750
4751
4752
4753
4754
4755
4756
4757
4758
4759
4760
4761
4762
4763
4764
4765
4766
if (retval < 0)
goto out_unlock;
seq_puts(m, buf);
seq_putc(m, '\n');
}
out_unlock:
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
static int cgroup_open(struct inode *inode, struct file *file)
{
struct pid *pid = PROC_I(inode)->pid;
return single_open(file, proc_cgroup_show, pid);
}
const struct file_operations proc_cgroup_operations = {
.open = cgroup_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
* ideally we don't want subsystems moving around while we do this.
* cgroup_mutex is also necessary to guarantee an atomic snapshot of
* subsys/hierarchy state.
*/
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
ss->root->number_of_cgroups, !ss->disabled);
}
mutex_unlock(&cgroup_mutex);
return 0;
}
static int cgroupstats_open(struct inode *inode, struct file *file)
{
return single_open(file, proc_cgroupstats_show, NULL);
static const struct file_operations proc_cgroupstats_operations = {
.open = cgroupstats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/**
* cgroup_fork - attach newly forked task to its parents cgroup.
* @child: pointer to task_struct of forking parent process.
*
* Description: A task inherits its parent's cgroup at fork().
*
* A pointer to the shared css_set was automatically copied in
* fork.c by dup_task_struct(). However, we ignore that copy, since
* it was not made under the protection of RCU, cgroup_mutex or
* threadgroup_change_begin(), so it might no longer be a valid
* cgroup pointer. cgroup_attach_task() might have already changed
* current->cgroups, allowing the previously referenced cgroup
* group to be removed and freed.
*
* Outside the pointer validity we also need to process the css_set
* inheritance between threadgoup_change_begin() and
* threadgoup_change_end(), this way there is no leak in any process
* wide migration performed by cgroup_attach_proc() that could otherwise
* miss a thread because it is too early or too late in the fork stage.
*
* At the point that cgroup_fork() is called, 'current' is the parent
* task, and the passed argument 'child' points to the child task.
*/
void cgroup_fork(struct task_struct *child)
{
/*
* We don't need to task_lock() current because current->cgroups
* can't be changed concurrently here. The parent obviously hasn't
* exited and called cgroup_exit(), and we are synchronized against
* cgroup migration through threadgroup_change_begin().
*/
child->cgroups = current->cgroups;
get_css_set(child->cgroups);
INIT_LIST_HEAD(&child->cg_list);
* cgroup_fork_callbacks - run fork callbacks
* @child: the new task
*
* Called on a new task very soon before adding it to the
* tasklist. No need to take any locks since no-one can
* be operating on this task.
*/
void cgroup_fork_callbacks(struct task_struct *child)
{
if (need_forkexit_callback) {
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
/*
* forkexit callbacks are only supported for
* builtin subsystems.
*/
if (!ss || ss->module)
continue;
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
* Adds the task to the list running through its css_set if necessary.
* Has to be after the task is visible on the task list in case we race
* with the first call to cgroup_iter_start() - to guarantee that the
* new task ends up on its list.
*/
void cgroup_post_fork(struct task_struct *child)
{
Frederic Weisbecker
committed
/*
* use_task_css_set_links is set to 1 before we walk the tasklist
* under the tasklist_lock and we read it here after we added the child
* to the tasklist under the tasklist_lock as well. If the child wasn't
* yet in the tasklist when we walked through it from
* cgroup_enable_task_cg_lists(), then use_task_css_set_links value
* should be visible now due to the paired locking and barriers implied
* by LOCK/UNLOCK: it is written before the tasklist_lock unlock
* in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
* lock on fork.
*/
if (use_task_css_set_links) {
write_lock(&css_set_lock);
if (list_empty(&child->cg_list)) {
/*
* It's safe to use child->cgroups without task_lock()
* here because we are protected through
* threadgroup_change_begin() against concurrent
* css_set change in cgroup_task_migrate(). Also
* the task can't exit at that point until
* wake_up_new_task() is called, so we are protected
* against cgroup_exit() setting child->cgroup to
* init_css_set.
*/
list_add(&child->cg_list, &child->cgroups->tasks);
write_unlock(&css_set_lock);
}
}
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
*
* Description: Detach cgroup from @tsk and release it.
*
* Note that cgroups marked notify_on_release force every task in
* them to take the global cgroup_mutex mutex when exiting.
* This could impact scaling on very large systems. Be reluctant to
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
* the_top_cgroup_hack:
*
* Set the exiting tasks cgroup to the root cgroup (top_cgroup).
*
* We call cgroup_exit() while the task is still competent to
* handle notify_on_release(), then leave the task attached to the
* root cgroup in each hierarchy for the remainder of its exit.
*
* To do this properly, we would increment the reference count on
* top_cgroup, and near the very end of the kernel/exit.c do_exit()
* code we would add a second cgroup function call, to drop that
* reference. This would just create an unnecessary hot spot on
* the top_cgroup reference count, to no avail.
*
* Normally, holding a reference to a cgroup without bumping its
* count is unsafe. The cgroup could go away, or someone could
* attach us to a different cgroup, decrementing the count on
* the first cgroup that we never incremented. But in this case,
* top_cgroup isn't going away, and either task has PF_EXITING set,
* which wards off any cgroup_attach_task() attempts, or task is a failed
* fork, never visible to cgroup_attach_task.
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
struct css_set *cg;
/*
* Unlink from the css_set task list if necessary.
* Optimistically check cg_list before taking
* css_set_lock
*/
if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock);
}
/* Reassign the task to the init_css_set. */
task_lock(tsk);
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;
if (run_callbacks && need_forkexit_callback) {
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
/* modular subsystems can't use callbacks */
if (!ss || ss->module)
continue;
if (ss->exit) {
struct cgroup *old_cgrp =
rcu_dereference_raw(cg->subsys[i])->cgroup;
struct cgroup *cgrp = task_cgroup(tsk, i);
ss->exit(cgrp, old_cgrp, tsk);
put_css_set_taskexit(cg);
Grzegorz Nosek
committed
* cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
Grzegorz Nosek
committed
* @task: the task in question
Grzegorz Nosek
committed
* See if @cgrp is a descendant of @task's cgroup in the appropriate
* hierarchy.