Newer
Older
out_free:
kfree(buf);
out:
return retval;
}
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
* ideally we don't want subsystems moving around while we do this.
* cgroup_mutex is also necessary to guarantee an atomic snapshot of
* subsys/hierarchy state.
*/
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
ss->root->number_of_cgroups, !ss->disabled);
mutex_unlock(&cgroup_mutex);
return 0;
}
/*
 * open() handler for the cgroup statistics file: binds the file to
 * proc_cgroupstats_show() as a single-record seq_file with no private data.
 * Returns whatever single_open() returns.
 */
static int cgroupstats_open(struct inode *inode, struct file *file)
{
	return single_open(file, proc_cgroupstats_show, NULL);
}
/*
 * File operations for the cgroup statistics file: opened through
 * cgroupstats_open(), then served by the generic seq_file helpers.
 */
static const struct file_operations proc_cgroupstats_operations = {
	.open		= cgroupstats_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
/**
 * cgroup_fork - attach newly forked task to its parent's cgroup.
 * @child: pointer to task_struct of the newly forked child process.
 *
 * Description: A task inherits its parent's cgroup at fork().
 *
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct().  However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
 * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced cgroup group to be removed and freed.
 *
 * At the point that cgroup_fork() is called, 'current' is the parent
 * task, and the passed argument 'child' points to the child task.
 */
void cgroup_fork(struct task_struct *child)
{
	/*
	 * Take the reference and copy the pointer under the parent's task
	 * lock so both refer to the same css_set.
	 */
	task_lock(current);
	get_css_set(task_css_set(current));
	child->cgroups = current->cgroups;
	task_unlock(current);

	/* The child is linked onto a css_set task list later, if at all. */
	INIT_LIST_HEAD(&child->cg_list);
}
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
Tejun Heo
committed
* Adds the task to the list running through its css_set if necessary and
* call the subsystem fork() callbacks. Has to be after the task is
* visible on the task list in case we race with the first call to
* cgroup_iter_start() - to guarantee that the new task ends up on its
* list.
void cgroup_post_fork(struct task_struct *child)
{
Tejun Heo
committed
int i;
Frederic Weisbecker
committed
/*
* use_task_css_set_links is set to 1 before we walk the tasklist
* under the tasklist_lock and we read it here after we added the child
* to the tasklist under the tasklist_lock as well. If the child wasn't
* yet in the tasklist when we walked through it from
* cgroup_enable_task_cg_lists(), then use_task_css_set_links value
* should be visible now due to the paired locking and barriers implied
* by LOCK/UNLOCK: it is written before the tasklist_lock unlock
* in cgroup_enable_task_cg_lists() and read here after the tasklist_lock
* lock on fork.
*/
if (use_task_css_set_links) {
write_lock(&css_set_lock);
task_lock(child);
if (list_empty(&child->cg_list))
list_add(&child->cg_list, &task_css_set(child)->tasks);
task_unlock(child);
write_unlock(&css_set_lock);
}
Tejun Heo
committed
/*
* Call ss->fork(). This must happen after @child is linked on
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
if (need_forkexit_callback) {
/*
* fork/exit callbacks are supported only for builtin
* subsystems, and the builtin section of the subsys
* array is immutable, so we don't need to lock the
* subsys array here. On the other hand, modular section
* of the array can be freed at module unload, so we
* can't touch that.
*/
for_each_builtin_subsys(ss, i)
Tejun Heo
committed
if (ss->fork)
ss->fork(child);
}
Tejun Heo
committed
/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 * @run_callbacks: when nonzero, invoke the subsystems' ->exit() callbacks
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
 * the_top_cgroup_hack:
 *
 *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
 *
 *    We call cgroup_exit() while the task is still competent to
 *    handle notify_on_release(), then leave the task attached to the
 *    root cgroup in each hierarchy for the remainder of its exit.
 *
 *    To do this properly, we would increment the reference count on
 *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
 *    code we would add a second cgroup function call, to drop that
 *    reference.  This would just create an unnecessary hot spot on
 *    the top_cgroup reference count, to no avail.
 *
 *    Normally, holding a reference to a cgroup without bumping its
 *    count is unsafe.   The cgroup could go away, or someone could
 *    attach us to a different cgroup, decrementing the count on
 *    the first cgroup that we never incremented.  But in this case,
 *    top_cgroup isn't going away, and either task has PF_EXITING set,
 *    which wards off any cgroup_attach_task() attempts, or task is a failed
 *    fork, never visible to cgroup_attach_task.
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	/*
	 * Unlink from the css_set task list if necessary.
	 * Optimistically check cg_list before taking
	 * css_set_lock
	 */
	if (!list_empty(&tsk->cg_list)) {
		write_lock(&css_set_lock);
		if (!list_empty(&tsk->cg_list))
			list_del_init(&tsk->cg_list);
		write_unlock(&css_set_lock);
	}

	/* Reassign the task to the init_css_set. */
	task_lock(tsk);
	cset = task_css_set(tsk);
	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);

	if (run_callbacks && need_forkexit_callback) {
		/*
		 * fork/exit callbacks are supported only for builtin
		 * subsystems, see cgroup_post_fork() for details.
		 */
		for_each_builtin_subsys(ss, i) {
			/* ->exit is optional, just like ->fork */
			if (ss->exit) {
				struct cgroup *old_cgrp = cset->subsys[i]->cgroup;
				struct cgroup *cgrp = task_cgroup(tsk, i);

				ss->exit(cgrp, old_cgrp, tsk);
			}
		}
	}
	task_unlock(tsk);

	/* Drop the reference the task held on its previous css_set. */
	put_css_set_taskexit(cset);
}
static void check_for_release(struct cgroup *cgrp)
if (cgroup_is_releasable(cgrp) &&
list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
/*
* Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
int need_schedule_work = 0;
raw_spin_lock(&release_list_lock);
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
}
raw_spin_unlock(&release_list_lock);
5217
5218
5219
5220
5221
5222
5223
5224
5225
5226
5227
5228
5229
5230
5231
5232
5233
5234
5235
5236
5237
5238
5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
if (need_schedule_work)
schedule_work(&release_agent_work);
}
}
/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	raw_spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf = NULL, *agentbuf = NULL;
		struct cgroup *cgrp = list_entry(release_list.next,
						    struct cgroup,
						    release_list);

		list_del_init(&cgrp->release_list);
		/* The allocations below may sleep; drop the spinlock first. */
		raw_spin_unlock(&release_list_lock);

		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf)
			goto continue_free;
		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
			goto continue_free;
		agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
		if (!agentbuf)
			goto continue_free;

		i = 0;
		argv[i++] = agentbuf;
		argv[i++] = pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		mutex_lock(&cgroup_mutex);
 continue_free:
		kfree(pathbuf);
		kfree(agentbuf);
		/* Retake the list lock before re-testing the loop condition. */
		raw_spin_lock(&release_list_lock);
	}
	raw_spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}
/*
 * Handler for the "cgroup_disable=" boot parameter.
 *
 * @str is a comma-separated list of subsystem names; every builtin
 * subsystem whose name matches a token is marked disabled.  Empty tokens
 * and unknown names are ignored.  Always returns 1.
 */
static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

	while ((token = strsep(&str, ",")) != NULL) {
		if (!*token)
			continue;

		/*
		 * cgroup_disable, being at boot time, can't know about
		 * module subsystems, so we don't worry about them.
		 */
		for_each_builtin_subsys(ss, i) {
			if (!strcmp(token, ss->name)) {
				ss->disabled = 1;
				printk(KERN_INFO "Disabling %s control group"
					" subsystem\n", ss->name);
				break;
			}
		}
	}
	return 1;
}
__setup("cgroup_disable=", cgroup_disable);
/*
 * Functions for CSS ID.
 */
/*
 * Return the numeric ID attached to @css, or 0 when no css_id is attached.
 * To get an ID other than 0, call this while !cgroup_is_dead().
 *
 * The value is reliable when the caller holds a reference on @css or is
 * inside rcu_read_lock().  Once css->id has been allocated it does not
 * change until it is freed.
 */
unsigned short css_id(struct cgroup_subsys_state *css)
{
	struct css_id *entry = rcu_dereference_raw(css->id);

	return entry ? entry->id : 0;
}
/**
* css_is_ancestor - test "root" css is an ancestor of "child"
* @child: the css to be tested.
* @root: the css supporsed to be an ancestor of the child.
*
* Returns true if "root" is an ancestor of "child" in its hierarchy. Because
* this function reads css->id, the caller must hold rcu_read_lock().
* But, considering usual usage, the csses should be valid objects after test.
* Assuming that the caller will do some action to the child if this returns
* returns true, the caller must take "child";s reference count.
* If "child" is valid object and this returns true, "root" is valid, too.
*/
bool css_is_ancestor(struct cgroup_subsys_state *child,
const struct cgroup_subsys_state *root)
struct css_id *child_id;
struct css_id *root_id;
child_id = rcu_dereference(child->id);
if (!child_id)
return false;
root_id = rcu_dereference(root->id);
if (!root_id)
return false;
if (child_id->depth < root_id->depth)
return false;
if (child_id->stack[root_id->depth] != root_id->id)
return false;
return true;
}
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
struct css_id *id = rcu_dereference_protected(css->id, true);
/* When this is called before css_id initialization, id can be NULL */
if (!id)
return;
BUG_ON(!ss->use_id);
rcu_assign_pointer(id->css, NULL);
rcu_assign_pointer(css->id, NULL);
kfree_rcu(id, rcu_head);
/*
* This is called by init or create(). Then, calls to this function are
* always serialized (By cgroup_mutex() at create()).
*/
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
struct css_id *newid;
BUG_ON(!ss->use_id);
size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
newid = kzalloc(size, GFP_KERNEL);
if (!newid)
return ERR_PTR(-ENOMEM);
/* Don't use 0. allocates an ID of 1-65535 */
ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
/* Returns error when there are no free spaces for new ID.*/
newid->depth = depth;
return newid;
err_out:
kfree(newid);
static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
struct cgroup_subsys_state *rootcss)
idr_init(&ss->idr);
newid = get_new_cssid(ss, 0);
if (IS_ERR(newid))
return PTR_ERR(newid);
newid->stack[0] = newid->id;
RCU_INIT_POINTER(newid->css, rootcss);
RCU_INIT_POINTER(rootcss->id, newid);
return 0;
}
/*
 * Allocate a css_id for @child's css in subsystem @ss, one level below
 * @parent's css_id.  The new ID's stack[] is the parent's path with the
 * child's own ID appended.
 *
 * Returns 0 on success or the negative errno from get_new_cssid().
 */
static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
			struct cgroup *child)
{
	int subsys_id, i, depth;
	struct cgroup_subsys_state *parent_css, *child_css;
	struct css_id *child_id, *parent_id;

	subsys_id = ss->subsys_id;
	parent_css = parent->subsys[subsys_id];
	child_css = child->subsys[subsys_id];
	parent_id = rcu_dereference_protected(parent_css->id, true);
	depth = parent_id->depth + 1;

	child_id = get_new_cssid(ss, depth);
	if (IS_ERR(child_id))
		return PTR_ERR(child_id);

	for (i = 0; i < depth; i++)
		child_id->stack[i] = parent_id->stack[i];
	child_id->stack[depth] = child_id->id;
	/*
	 * child_id->css pointer will be set after this cgroup is available
	 * see cgroup_populate_dir()
	 */
	rcu_assign_pointer(child_css->id, child_id);

	return 0;
}
/**
 * css_lookup - find the css registered under a numeric ID
 * @ss: cgroup subsys whose ID space is searched; must have ->use_id set.
 * @id: the ID to look up
 *
 * Returns the cgroup_subsys_state currently attached to @id, or NULL when
 * the ID is not allocated.  Should be called under rcu_read_lock().
 */
struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
{
	struct css_id *entry;

	BUG_ON(!ss->use_id);

	entry = idr_find(&ss->idr, id);
	if (unlikely(!entry))
		return NULL;

	return rcu_dereference(entry->css);
}
/*
 * get corresponding css from file open on cgroupfs directory
 *
 * @f must be open on a cgroup directory and @id must be a valid subsystem
 * index.  Returns the css of subsystem @id for that cgroup, or
 * ERR_PTR(-EBADF) when @f is not a cgroup directory, ERR_PTR(-EINVAL) when
 * @id is out of range, ERR_PTR(-ENOENT) when the cgroup has no css for @id.
 */
struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
{
	struct cgroup *cgrp;
	struct inode *inode;
	struct cgroup_subsys_state *css;

	inode = f->f_dentry->d_inode;
	/* check in cgroup filesystem dir */
	if (inode->i_op != &cgroup_dir_inode_operations)
		return ERR_PTR(-EBADF);

	if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
		return ERR_PTR(-EINVAL);

	/* get cgroup */
	cgrp = __d_cgrp(f->f_dentry);
	css = cgrp->subsys[id];
	return css ? css : ERR_PTR(-ENOENT);
}
#ifdef CONFIG_CGROUP_DEBUG
/*
 * Allocate a zeroed css for the debug subsystem.
 * Returns the new css, or ERR_PTR(-ENOMEM) if the allocation fails.
 */
static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *new_css;

	new_css = kzalloc(sizeof(*new_css), GFP_KERNEL);
	if (new_css)
		return new_css;

	return ERR_PTR(-ENOMEM);
}
{
}
/* "taskcount" debug file: report the number of tasks in @cgrp. */
static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft)
{
	return cgroup_task_count(cgrp);
}
static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft)
{
return (u64)(unsigned long)current->cgroups;
}
/*
 * "current_css_set_refcount" debug file: report the reference count of the
 * reading task's css_set, sampled under rcu_read_lock().
 */
static u64 current_css_set_refcount_read(struct cgroup *cgrp,
					 struct cftype *cft)
{
	u64 refs;

	rcu_read_lock();
	refs = atomic_read(&task_css_set(current)->refcount);
	rcu_read_unlock();

	return refs;
}
static int current_css_set_cg_links_read(struct cgroup *cgrp,
struct cftype *cft,
struct seq_file *seq)
{
struct cgrp_cset_link *link;
struct css_set *cset;
read_lock(&css_set_lock);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
const char *name;
if (c->dentry)
name = c->dentry->d_name.name;
else
name = "?";
seq_printf(seq, "Root %d group %s\n",
c->root->hierarchy_id, name);
}
rcu_read_unlock();
read_unlock(&css_set_lock);
return 0;
}
#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct cgroup *cgrp,
struct cftype *cft,
struct seq_file *seq)
{
struct cgrp_cset_link *link;
read_lock(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
seq_printf(seq, "css_set %p\n", cset);
list_for_each_entry(task, &cset->tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS) {
seq_puts(seq, " ...\n");
break;
} else {
seq_printf(seq, " task %d\n",
task_pid_vnr(task));
}
}
}
read_unlock(&css_set_lock);
return 0;
}
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
{
return test_bit(CGRP_RELEASABLE, &cgrp->flags);
}
/* Control files exposed by the debug subsystem; the empty entry ends the table. */
static struct cftype debug_files[] =  {
	{ .name = "taskcount",
	  .read_u64 = debug_taskcount_read },

	{ .name = "current_css_set",
	  .read_u64 = current_css_set_read },

	{ .name = "current_css_set_refcount",
	  .read_u64 = current_css_set_refcount_read },

	{ .name = "current_css_set_cg_links",
	  .read_seq_string = current_css_set_cg_links_read },

	{ .name = "cgroup_css_links",
	  .read_seq_string = cgroup_css_links_read },

	{ .name = "releasable",
	  .read_u64 = releasable_read },

	{ }	/* terminate */
};
struct cgroup_subsys debug_subsys = {
.name = "debug",
Tejun Heo
committed
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
.subsys_id = debug_subsys_id,
.base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */