Newer
Older
* refs of all attached css's are killed. If @cgrp doesn't have
* any css, we kick it off here.
*/
if (!cgrp->nr_css)
cgroup_destroy_css_killed(cgrp);
/* remove @cgrp directory along with the base files */
mutex_unlock(&cgroup_mutex);
* There are two control paths which try to determine cgroup from
* dentry without going through kernfs - cgroupstats_build() and
* css_tryget_from_dir(). Those are supported by RCU protecting
* clearing of cgrp->kn->priv backpointer, which should happen
* after all files under it have been removed.
kernfs_remove(cgrp->kn); /* @cgrp has an extra ref on its kn */
RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
* cgroup_destroy_css_killed - the second step of cgroup destruction
* @work: cgroup->destroy_free_work
*
* This function is invoked from a work item for a cgroup which is being
* destroyed after all css's are offlined and performs the rest of
* destruction. This is the second step of destruction described in the
* comment above cgroup_destroy_locked().
static void cgroup_destroy_css_killed(struct cgroup *cgrp)
{
struct cgroup *parent = cgrp->parent;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
/* delete this cgroup from parent->children */
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
struct cgroup *cgrp = kn->priv;
int ret = 0;
/*
* This is self-destruction but @kn can't be removed while this
* callback is in progress. Let's break active protection. Once
* the protection is broken, @cgrp can be destroyed at any point.
* Pin it so that it stays accessible.
*/
cgroup_get(cgrp);
kernfs_break_active_protection(kn);
/*
* @cgrp might already have been destroyed while we're trying to
* grab the mutexes.
*/
if (!cgroup_is_dead(cgrp))
ret = cgroup_destroy_locked(cgrp);
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
.remount_fs = cgroup_remount,
.show_options = cgroup_show_options,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.rename = cgroup_rename,
};
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_css(css, ss, &cgrp_dfl_root.cgrp);
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* init_css_set is in the subsystem's root cgroup. */
init_css_set.subsys[ss->id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
BUG_ON(online_css(css));
cgrp_dfl_root.cgrp.subsys_mask |= 1 << ss->id;
mutex_unlock(&cgroup_mutex);
* cgroup_init_early - cgroup initialization at system boot
*
* Initialize cgroups at system boot, and initialize any
* subsystems that request early init.
*/
int __init cgroup_init_early(void)
{
static struct cgroup_sb_opts __initdata opts =
{ .flags = CGRP_ROOT_SANE_BEHAVIOR };
init_cgroup_root(&cgrp_dfl_root, &opts);
RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
"invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n",
i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
"cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
ss->name = cgroup_subsys_name[i];
if (ss->early_init)
cgroup_init_subsys(ss);
}
return 0;
}
/**
* cgroup_init - cgroup initialization
*
* Register cgroup filesystem and /proc file, and initialize
* any subsystems that didn't request early init.
*/
int __init cgroup_init(void)
{
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
mutex_lock(&cgroup_tree_mutex);
/* Add init_css_set to the hash table */
key = css_set_hash(init_css_set.subsys);
hash_add(css_set_table, &init_css_set.hlist, key);
BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));
mutex_unlock(&cgroup_tree_mutex);
for_each_subsys(ss, ssid) {
if (!ss->early_init)
cgroup_init_subsys(ss);
/*
* cftype registration needs kmalloc and can't be done
* during early_init. Register base cftypes separately.
*/
if (ss->base_cftypes)
WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
}
cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj);
err = register_filesystem(&cgroup_fs_type);
if (err < 0) {
kobject_put(cgroup_kobj);
proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
static int __init cgroup_wq_init(void)
{
/*
* There isn't much point in executing destruction path in
* parallel. Good chunk is serialized with cgroup_mutex anyway.
* Use 1 for @max_active.
*
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
/*
* Used to destroy pidlists and separate to serve as flush domain.
* Cap @max_active to 1 too.
*/
cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy",
0, 1);
BUG_ON(!cgroup_pidlist_destroy_wq);
return 0;
}
core_initcall(cgroup_wq_init);
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
*/
/* TODO: Use a proper seq_file iterator */
int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
struct cgroup_root *root;
if (!buf)
goto out;
retval = -ESRCH;
pid = m->private;
tsk = get_pid_task(pid, PIDTYPE_PID);
if (!tsk)
goto out_free;
retval = 0;
mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
for_each_root(root) {
struct cgroup *cgrp;
if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible)
continue;
seq_printf(m, "%d:", root->hierarchy_id);
if (root->cgrp.subsys_mask & (1 << ssid))
seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
if (strlen(root->name))
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
cgrp = task_cgroup_from_root(tsk, root);
path = cgroup_path(cgrp, buf, PATH_MAX);
if (!path) {
retval = -ENAMETOOLONG;
seq_putc(m, '\n');
}
out_unlock:
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
put_task_struct(tsk);
out_free:
kfree(buf);
out:
return retval;
}
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
* ideally we don't want subsystems moving around while we do this.
* cgroup_mutex is also necessary to guarantee an atomic snapshot of
* subsys/hierarchy state.
*/
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->name, ss->root->hierarchy_id,
Tejun Heo
committed
atomic_read(&ss->root->nr_cgrps), !ss->disabled);
mutex_unlock(&cgroup_mutex);
return 0;
}
static int cgroupstats_open(struct inode *inode, struct file *file)
{
return single_open(file, proc_cgroupstats_show, NULL);
static const struct file_operations proc_cgroupstats_operations = {
.open = cgroupstats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
* cgroup_fork - initialize cgroup related fields during copy_process()
* @child: pointer to task_struct of forking parent process.
* A task is associated with the init_css_set until cgroup_post_fork()
* attaches it to the parent's css_set. Empty cg_list indicates that
* @child isn't holding reference to its css_set.
*/
void cgroup_fork(struct task_struct *child)
{
RCU_INIT_POINTER(child->cgroups, &init_css_set);
INIT_LIST_HEAD(&child->cg_list);
* cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
Tejun Heo
committed
* Adds the task to the list running through its css_set if necessary and
* call the subsystem fork() callbacks. Has to be after the task is
* visible on the task list in case we race with the first call to
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
Tejun Heo
committed
* list.
void cgroup_post_fork(struct task_struct *child)
{
Tejun Heo
committed
int i;
Frederic Weisbecker
committed
/*
* This may race against cgroup_enable_task_cg_links(). As that
* function sets use_task_css_set_links before grabbing
* tasklist_lock and we just went through tasklist_lock to add
* @child, it's guaranteed that either we see the set
* use_task_css_set_links or cgroup_enable_task_cg_lists() sees
* @child during its iteration.
*
* If we won the race, @child is associated with %current's
* css_set. Grabbing css_set_rwsem guarantees both that the
* association is stable, and, on completion of the parent's
* migration, @child is visible in the source of migration or
* already in the destination cgroup. This guarantee is necessary
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
* Note that if we lose to cgroup_enable_task_cg_links(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
Frederic Weisbecker
committed
*/
if (use_task_css_set_links) {
struct css_set *cset;
down_write(&css_set_rwsem);
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
rcu_assign_pointer(child->cgroups, cset);
list_add(&child->cg_list, &cset->tasks);
get_css_set(cset);
}
up_write(&css_set_rwsem);
Tejun Heo
committed
/*
* Call ss->fork(). This must happen after @child is linked on
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
if (need_forkexit_callback) {
Tejun Heo
committed
if (ss->fork)
ss->fork(child);
}
Tejun Heo
committed
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk and release it.
*
* Note that cgroups marked notify_on_release force every task in
* them to take the global cgroup_mutex mutex when exiting.
* This could impact scaling on very large systems. Be reluctant to
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
* We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
* call cgroup_exit() while the task is still competent to handle
* notify_on_release(), then leave the task attached to the root cgroup in
* each hierarchy for the remainder of its exit. No need to bother with
* init_css_set refcnting. init_css_set never goes away and we can't race
* with migration path - PF_EXITING is visible to migration path.
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
struct css_set *cset;
bool put_cset = false;
* Unlink from @tsk from its css_set. As migration path can't race
* with us, we can check cg_list without grabbing css_set_rwsem.
*/
if (!list_empty(&tsk->cg_list)) {
down_write(&css_set_rwsem);
list_del_init(&tsk->cg_list);
up_write(&css_set_rwsem);
/* Reassign the task to the init_css_set. */
cset = task_css_set(tsk);
RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
if (run_callbacks && need_forkexit_callback) {
/* see cgroup_post_fork() for details */
for_each_subsys(ss, i) {
struct cgroup_subsys_state *old_css = cset->subsys[i];
struct cgroup_subsys_state *css = task_css(tsk, i);
ss->exit(css, old_css, tsk);
if (put_cset)
put_css_set(cset, true);
static void check_for_release(struct cgroup *cgrp)
if (cgroup_is_releasable(cgrp) &&
list_empty(&cgrp->cset_links) && list_empty(&cgrp->children)) {
/*
* Control Group is currently removeable. If it's not
* already queued for a userspace notification, queue
int need_schedule_work = 0;
raw_spin_lock(&release_list_lock);
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
}
raw_spin_unlock(&release_list_lock);
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
if (need_schedule_work)
schedule_work(&release_agent_work);
}
}
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
* relative to the root of cgroup file system) as the argument.
*
* Most likely, this user command will try to rmdir this cgroup.
*
* This races with the possibility that some other task will be
* attached to this cgroup before it is removed, or that some other
* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
* The presumed 'rmdir' will fail quietly if this cgroup is no longer
* unused, and this cgroup will be reprieved from its death sentence,
* to continue to serve a useful existence. Next time it's released,
* we will get notified again, if it still has 'notify_on_release' set.
*
* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
* means only wait until the task is successfully execve()'d. The
* separate release agent task is forked by call_usermodehelper(),
* then control in this thread returns here, without waiting for the
* release agent task. We don't bother to wait because the caller of
* this routine has no use for the exit status of the release agent
* task, so no sense holding our caller up for that.
*/
static void cgroup_release_agent(struct work_struct *work)
{
BUG_ON(work != &release_agent_work);
mutex_lock(&cgroup_mutex);
raw_spin_lock(&release_list_lock);
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
struct cgroup *cgrp = list_entry(release_list.next,
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
if (!pathbuf)
goto continue_free;
path = cgroup_path(cgrp, pathbuf, PATH_MAX);
if (!path)
goto continue_free;
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!agentbuf)
goto continue_free;
i = 0;
argv[i++] = agentbuf;
argv[i] = NULL;
i = 0;
/* minimal command environment */
envp[i++] = "HOME=/";
envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[i] = NULL;
/* Drop the lock while we invoke the usermode helper,
* since the exec could involve hitting disk and hence
* be a slow process */
mutex_unlock(&cgroup_mutex);
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
mutex_lock(&cgroup_mutex);
continue_free:
kfree(pathbuf);
kfree(agentbuf);
raw_spin_lock(&release_list_lock);
raw_spin_unlock(&release_list_lock);
mutex_unlock(&cgroup_mutex);
}
static int __init cgroup_disable(char *str)
{
char *token;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
if (!strcmp(token, ss->name)) {
ss->disabled = 1;
printk(KERN_INFO "Disabling %s control group"
" subsystem\n", ss->name);
break;
}
}
}
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
/**
* css_tryget_from_dir - get corresponding css from the dentry of a cgroup dir
* @dentry: directory dentry of interest
* @ss: subsystem of interest
* If @dentry is a directory for a cgroup which has @ss enabled on it, try
* to get the corresponding css and return it. If such css doesn't exist
* or can't be pinned, an ERR_PTR value is returned.
struct cgroup_subsys_state *css_tryget_from_dir(struct dentry *dentry,
struct cgroup_subsys *ss)
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup_subsys_state *css = NULL;
/* is @dentry a cgroup dir? */
if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
kernfs_type(kn) != KERNFS_DIR)
rcu_read_lock();
/*
* This path doesn't originate from kernfs and @kn could already
* have been or be removed at any point. @kn->priv is RCU
* protected for this access. See destroy_locked() for details.
*/
cgrp = rcu_dereference(kn->priv);
if (cgrp)
css = cgroup_css(cgrp, ss);
if (!css || !css_tryget(css))
css = ERR_PTR(-ENOENT);
rcu_read_unlock();
return css;
/**
* css_from_id - lookup css by id
* @id: the cgroup id
* @ss: cgroup subsys to be looked into
*
* Returns the css if there's valid one with @id, otherwise returns NULL.
* Should be called under rcu_read_lock().
*/
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
struct cgroup *cgrp;
cgrp = idr_find(&ss->root->cgroup_idr, id);
if (cgrp)
return cgroup_css(cgrp, ss);
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
if (!css)
return ERR_PTR(-ENOMEM);
return css;
}
static void debug_css_free(struct cgroup_subsys_state *css)
{
kfree(css);
}
static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return cgroup_task_count(css->cgroup);
}
static u64 current_css_set_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return (u64)(unsigned long)current->cgroups;
}
static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
{
u64 count;
rcu_read_lock();
count = atomic_read(&task_css_set(current)->refcount);
rcu_read_unlock();
return count;
}
static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
struct cgrp_cset_link *link;
struct css_set *cset;
char *name_buf;
name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
if (!name_buf)
return -ENOMEM;
down_read(&css_set_rwsem);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
cgroup_name(c, name_buf, NAME_MAX + 1);
seq_printf(seq, "Root %d group %s\n",
}
rcu_read_unlock();
up_read(&css_set_rwsem);
return 0;
}
#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
down_read(&css_set_rwsem);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
seq_printf(seq, "css_set %p\n", cset);
list_for_each_entry(task, &cset->tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
goto overflow;
seq_printf(seq, " task %d\n", task_pid_vnr(task));
}
list_for_each_entry(task, &cset->mg_tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
goto overflow;
seq_printf(seq, " task %d\n", task_pid_vnr(task));
}
continue;
overflow:
seq_puts(seq, " ...\n");
}
up_read(&css_set_rwsem);
return 0;
}
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
}
static struct cftype debug_files[] = {
{
.name = "taskcount",
.read_u64 = debug_taskcount_read,
},
{
.name = "current_css_set",
.read_u64 = current_css_set_read,
},
{
.name = "current_css_set_refcount",
.read_u64 = current_css_set_refcount_read,
},
{
.name = "current_css_set_cg_links",
.seq_show = current_css_set_cg_links_read,
},
{
.name = "cgroup_css_links",
.seq_show = cgroup_css_links_read,
},
{
.name = "releasable",
.read_u64 = releasable_read,
},
{ } /* terminate */
};
struct cgroup_subsys debug_cgrp_subsys = {
Tejun Heo
committed
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
.base_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */