Newer
Older
{
if (root) {
/* hierarchy ID should already have been released */
WARN_ON_ONCE(root->hierarchy_id);
idr_destroy(&root->cgroup_idr);
kfree(root);
}
}
static void cgroup_destroy_root(struct cgroup_root *root)
struct cgroup *cgrp = &root->cgrp;
struct cgrp_cset_link *link, *tmp_link;
BUG_ON(!list_empty(&cgrp->self.children));
/* Rebind all subsystems back to the default hierarchy */
Tejun Heo
committed
rebind_subsystems(&cgrp_dfl_root, root->subsys_mask);
/*
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
down_write(&css_set_rwsem);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
kfree(link);
}
up_write(&css_set_rwsem);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
cgroup_root_count--;
}
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
cgroup_free_root(root);
}
/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup_root *root)
{
struct cgroup *res = NULL;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_rwsem);
if (cset == &init_css_set) {
} else {
struct cgrp_cset_link *link;
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
res = c;
break;
}
}
}
BUG_ON(!res);
return res;
}
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_rwsem held.
*/
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
{
/*
* No need to lock the task - since we hold cgroup_mutex the
* task can't change groups, so the only thing that can happen
* is that it exits and its css is set back to init_css_set.
*/
return cset_cgroup_from_root(task_css_set(task), root);
}
/*
* A task must hold cgroup_mutex to modify cgroups.
*
* Any task can increment and decrement the count field without lock.
* So in general, code holding cgroup_mutex can't rely on the count
* field not changing. However, if the count goes to zero, then only
* cgroup_attach_task() can increment it again. Because a count of zero
* means that no tasks are currently attached, therefore there is no
* way a task attached to that cgroup can fork (the other way to
* increment the count). So code holding cgroup_mutex can safely
* assume that if the count is zero, it will stay zero. Similarly, if
* a task holds cgroup_mutex on a cgroup with zero count, it
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
* least one task in the system (init, pid == 1), therefore, root cgroup
* always has either children cgroups and/or using tasks. So we don't
* need a special hack to ensure that root cgroup cannot be deleted.
*
* P.S. One more locking exception. RCU is used to guard the
* update of a tasks cgroup pointer by cgroup_attach_task()
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
char *buf)
struct cgroup_subsys *ss = cft->ss;
if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
!(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
cft->name);
else
strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
* S_IRUGO for read, S_IWUSR for write.
*/
static umode_t cgroup_file_mode(const struct cftype *cft)
umode_t mode = 0;
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
if (cft->write_u64 || cft->write_s64 || cft->write) {
if (cft->flags & CFTYPE_WORLD_WRITABLE)
mode |= S_IWUGO;
else
mode |= S_IWUSR;
}
return mode;
static void cgroup_get(struct cgroup *cgrp)
static bool cgroup_tryget(struct cgroup *cgrp)
{
return css_tryget(&cgrp->self);
}
static void cgroup_put(struct cgroup *cgrp)
Tejun Heo
committed
* cgroup_calc_child_subsys_mask - calculate child_subsys_mask
Tejun Heo
committed
* @subtree_control: the new subtree_control mask to consider
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
Tejun Heo
committed
* This function calculates which subsystems need to be enabled if
* @subtree_control is to be applied to @cgrp. The returned mask is always
* a superset of @subtree_control and follows the usual hierarchy rules.
static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
unsigned long subtree_control)
struct cgroup *parent = cgroup_parent(cgrp);
unsigned long cur_ss_mask = subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
Tejun Heo
committed
if (!cgroup_on_dfl(cgrp))
return cur_ss_mask;
unsigned long new_ss_mask = cur_ss_mask;
for_each_subsys_which(ss, ssid, &cur_ss_mask)
new_ss_mask |= ss->depends_on;
/*
* Mask out subsystems which aren't available. This can
* happen only if some depended-upon subsystems were bound
* to non-default hierarchies.
*/
if (parent)
new_ss_mask &= parent->child_subsys_mask;
else
new_ss_mask &= cgrp->root->subsys_mask;
if (new_ss_mask == cur_ss_mask)
break;
cur_ss_mask = new_ss_mask;
}
Tejun Heo
committed
return cur_ss_mask;
}
/**
* cgroup_refresh_child_subsys_mask - update child_subsys_mask
* @cgrp: the target cgroup
*
* Update @cgrp->child_subsys_mask according to the current
* @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
*/
static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
{
cgrp->child_subsys_mask =
cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
* This helper undoes cgroup_kn_lock_live() and should be invoked before
* the method finishes if locking succeeded. Note that once this function
* returns the cgroup returned by cgroup_kn_lock_live() may become
* inaccessible any time. If the caller intends to continue to access the
* cgroup, it should pin it before invoking this function.
*/
static void cgroup_kn_unlock(struct kernfs_node *kn)
struct cgroup *cgrp;
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
cgrp = kn->parent->priv;
mutex_unlock(&cgroup_mutex);
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
/**
* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
* This helper is to be used by a cgroup kernfs method currently servicing
* @kn. It breaks the active protection, performs cgroup locking and
* verifies that the associated cgroup is alive. Returns the cgroup if
* alive; otherwise, %NULL. A successful return should be undone by a
* matching cgroup_kn_unlock() invocation.
*
* Any cgroup kernfs method implementation which requires locking the
* associated cgroup should use this helper. It avoids nesting cgroup
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
struct cgroup *cgrp;
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
cgrp = kn->parent->priv;
* We're gonna grab cgroup_mutex which nests outside kernfs
* active_ref. cgroup liveliness check alone provides enough
* protection against removal. Ensure @cgrp stays accessible and
* break the active_ref protection.
if (!cgroup_tryget(cgrp))
return NULL;
kernfs_break_active_protection(kn);
if (!cgroup_is_dead(cgrp))
return cgrp;
cgroup_kn_unlock(kn);
return NULL;
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
lockdep_assert_held(&cgroup_mutex);
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
Tejun Heo
committed
* cgroup_clear_dir - remove subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be removed
*/
static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
struct cgroup_subsys *ss;
Tejun Heo
committed
int i;
Tejun Heo
committed
for_each_subsys(ss, i) {
Tejun Heo
committed
if (!(subsys_mask & (1 << i)))
list_for_each_entry(cfts, &ss->cfts, node)
cgroup_addrm_files(cgrp, cfts, false);
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
/**
* cgroup_populate_dir - create subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be added
*
* On failure, no file is added.
*/
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
int i, ret = 0;
/* process cftsets of each subsystem */
for_each_subsys(ss, i) {
struct cftype *cfts;
if (!(subsys_mask & (1 << i)))
continue;
list_for_each_entry(cfts, &ss->cfts, node) {
ret = cgroup_addrm_files(cgrp, cfts, true);
if (ret < 0)
goto err;
}
}
return 0;
err:
cgroup_clear_dir(cgrp, subsys_mask);
return ret;
}
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned long ss_mask)
struct cgroup *dcgrp = &dst_root->cgrp;
for_each_subsys_which(ss, ssid, &ss_mask) {
/* if @ss has non-root csses attached to it, can't move */
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
/* skip creating root files on dfl_root for inhibited subsystems */
tmp_ss_mask = ss_mask;
if (dst_root == &cgrp_dfl_root)
tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
ret = cgroup_populate_dir(dcgrp, tmp_ss_mask);
if (ret) {
if (dst_root != &cgrp_dfl_root)
return ret;
/*
* Rebinding back to the default root is not allowed to
* fail. Using both default and non-default roots should
* be rare. Moving subsystems back and forth even more so.
* Just warn about it and continue.
*/
if (cgrp_dfl_root_visible) {
pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
Tejun Heo
committed
/*
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
for_each_subsys_which(ss, ssid, &ss_mask)
cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
for_each_subsys_which(ss, ssid, &ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
WARN_ON(!css || cgroup_css(dcgrp, ss));
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
down_write(&css_set_rwsem);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
Tejun Heo
committed
src_root->subsys_mask &= ~(1 << ssid);
scgrp->subtree_control &= ~(1 << ssid);
cgroup_refresh_child_subsys_mask(scgrp);
Tejun Heo
committed
Tejun Heo
committed
/* default hierarchy doesn't enable controllers by default */
Tejun Heo
committed
dst_root->subsys_mask |= 1 << ssid;
Tejun Heo
committed
if (dst_root == &cgrp_dfl_root) {
static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
} else {
dcgrp->subtree_control |= 1 << ssid;
cgroup_refresh_child_subsys_mask(dcgrp);
Tejun Heo
committed
static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
if (ss->bind)
ss->bind(css);
kernfs_activate(dcgrp->kn);
static int cgroup_show_options(struct seq_file *seq,
struct kernfs_root *kf_root)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_show_option(seq, ss->legacy_name, NULL);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_show_option(seq, "release_agent",
root->release_agent_path);
spin_unlock(&release_agent_path_lock);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
seq_show_option(seq, "name", root->name);
return 0;
}
struct cgroup_sb_opts {
char *release_agent;
bool cpuset_clone_children;
/* User explicitly requested empty subsystem */
bool none;
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
char *token, *o = data;
bool all_ss = false, one_ss = false;
#ifdef CONFIG_CPUSETS
mask = ~(1U << cpuset_cgrp_id);
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
/* Explicitly have no subsystems */
opts->none = true;
continue;
}
if (!strcmp(token, "all")) {
/* Mutually exclusive option 'all' + subsystem name */
if (one_ss)
return -EINVAL;
all_ss = true;
continue;
}
if (!strcmp(token, "__DEVEL__sane_behavior")) {
opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
continue;
}
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
}
if (!strcmp(token, "clone_children")) {
opts->cpuset_clone_children = true;
opts->flags |= CGRP_ROOT_XATTR;
if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
continue;
}
if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
/* Can't specify an empty name */
if (!strlen(name))
return -EINVAL;
/* Must match [\w.-]+ */
for (i = 0; i < strlen(name); i++) {
char c = name[i];
if (isalnum(c))
continue;
if ((c == '.') || (c == '-') || (c == '_'))
continue;
return -EINVAL;
}
/* Specifying two names is forbidden */
if (opts->name)
return -EINVAL;
opts->name = kstrndup(name,
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
continue;
}
if (strcmp(token, ss->legacy_name))
if (!cgroup_ssid_enabled(i))
continue;
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
return -EINVAL;
opts->subsys_mask |= (1 << i);
one_ss = true;
break;
}
if (i == CGROUP_SUBSYS_COUNT)
return -ENOENT;
}
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
if (nr_opts != 1) {
pr_err("sane_behavior: no other mount options allowed\n");
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
* not specified, let's default to 'all'
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
if (cgroup_ssid_enabled(i))
opts->subsys_mask |= (1 << i);
/*
* We either have to specify by name or by subsystems. (So all
* empty hierarchies must have a name).
*/
if (!opts->subsys_mask && !opts->name)
return -EINVAL;
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
unsigned long added_mask, removed_mask;
if (root == &cgrp_dfl_root) {
pr_err("remount is not allowed\n");
mutex_lock(&cgroup_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
Tejun Heo
committed
if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
Tejun Heo
committed
added_mask = opts.subsys_mask & ~root->subsys_mask;
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
(opts.name && strcmp(opts.name, root->name))) {
pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
opts.flags, opts.name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
Tejun Heo
committed
/* remounting is not allowed for populated hierarchies */
if (!list_empty(&root->cgrp.self.children)) {
Tejun Heo
committed
ret = -EBUSY;
ret = rebind_subsystems(root, added_mask);
Tejun Heo
committed
if (ret)
rebind_subsystems(&cgrp_dfl_root, removed_mask);
if (opts.release_agent) {
spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
spin_unlock(&release_agent_path_lock);
}
mutex_unlock(&cgroup_mutex);
return ret;
}
/*
* To reduce the fork() overhead for systems that are not actually using
* their cgroups capability, we don't maintain the lists running through
* each css_set to its tasks until we see the list actually used - in other
* words after the first mount.
*/
static bool use_task_css_set_links __read_mostly;
static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
down_write(&css_set_rwsem);
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
if (use_task_css_set_links)
goto out_unlock;
use_task_css_set_links = true;
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
* cgroup_post_fork() without seeing use_task_css_set_links = 1
* is not guaranteed to have its child immediately visible in the
* tasklist if we walk through it with RCU.
*/
read_lock(&tasklist_lock);
do_each_thread(g, p) {
WARN_ON_ONCE(!list_empty(&p->cg_list) ||
task_css_set(p) != &init_css_set);
/*
* We should check if the process is exiting, otherwise
* it will race with cgroup_exit() in that the list
* entry won't be deleted though the process has exited.
* Do it while holding siglock so that we don't end up
* racing against cgroup_exit().
spin_lock_irq(&p->sighand->siglock);
if (!(p->flags & PF_EXITING)) {
struct css_set *cset = task_css_set(p);
list_add(&p->cg_list, &cset->tasks);
get_css_set(cset);
}
spin_unlock_irq(&p->sighand->siglock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
out_unlock:
up_write(&css_set_rwsem);
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
int ssid;
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
Ben Blum
committed
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
Tejun Heo
committed
init_waitqueue_head(&cgrp->offline_waitq);
INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
static void init_cgroup_root(struct cgroup_root *root,
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
Tejun Heo
committed
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
if (opts->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
struct cftype *base_files;
struct css_set *cset;
int i, ret;
lockdep_assert_held(&cgroup_mutex);
ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
root_cgrp->id = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
GFP_KERNEL);
if (ret)
goto out;
* We're accessing css_set_count without locking css_set_rwsem here,
* but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. The worst that can happen is that we
* have some link structures left over
*/
ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
if (ret)
ret = cgroup_init_root_id(root);
root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
KERNFS_ROOT_CREATE_DEACTIVATED,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
root_cgrp->kn = root->kf_root->kn;
if (root == &cgrp_dfl_root)
base_files = cgroup_dfl_base_files;
else
base_files = cgroup_legacy_base_files;
ret = cgroup_addrm_files(root_cgrp, base_files, true);
ret = rebind_subsystems(root, ss_mask);
/*
* There must be no failure case after here, since rebinding takes
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
list_add(&root->root_list, &cgroup_roots);
cgroup_root_count++;
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
down_write(&css_set_rwsem);
hash_for_each(css_set_table, i, cset, hlist)
link_css_set(&tmp_links, cset, root_cgrp);
up_write(&css_set_rwsem);
BUG_ON(!list_empty(&root_cgrp->self.children));
Tejun Heo
committed
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
exit_root_id:
cgroup_exit_root_id(root);
percpu_ref_exit(&root_cgrp->self.refcnt);
free_cgrp_cset_links(&tmp_links);
return ret;
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
struct super_block *pinned_sb = NULL;
struct cgroup_root *root;
struct cgroup_sb_opts opts;
/*
* The first time anyone tries to mount a cgroup, enable the list
* linking each css_set to its tasks and fix up all existing tasks.
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
goto out_unlock;
if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
cgrp_dfl_root_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
ret = 0;
goto out_unlock;
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
/*
* Destruction of cgroup root is asynchronous, so subsystems may
* still be dying after the previous unmount. Let's drain the
* dying subsystems. We just need to ensure that the ones
* unmounted previously finish dying and don't care about new ones
* starting. Testing ref liveliness is good enough.
*/
for_each_subsys(ss, i) {
if (!(opts.subsys_mask & (1 << i)) ||
ss->root == &cgrp_dfl_root)
continue;
if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
msleep(10);
ret = restart_syscall();
goto out_free;
}
cgroup_put(&ss->root->cgrp);
}
for_each_root(root) {
Tejun Heo
committed
if (root == &cgrp_dfl_root)
Tejun Heo
committed
* If we asked for a name then it must match. Also, if
* name matches but sybsys_mask doesn't, we should fail.
* Remember whether name matched.
if (opts.name) {
if (strcmp(opts.name, root->name))
continue;
name_match = true;
}
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match.
Tejun Heo
committed
(opts.subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
ret = -EBUSY;
goto out_unlock;
}
if (root->flags ^ opts.flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
* We want to reuse @root whose lifetime is governed by its
* ->cgrp. Let's check whether @root is alive and keep it
* that way. As cgroup_kill_sb() can happen anytime, we
* want to block it by pinning the sb so that @root doesn't
* get killed before mount is complete.
*
* With the sb pinned, tryget_live can reliably indicate
* whether @root can be reused. If it's being killed,
* drain it. We can use wait_queue for the wait but this
* path is super cold. Let's just sleep a bit and retry.
pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
if (IS_ERR(pinned_sb) ||
!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
if (!IS_ERR_OR_NULL(pinned_sb))
deactivate_super(pinned_sb);
ret = restart_syscall();
goto out_free;
* No such thing, create a new one. name= matching without subsys
* specification is allowed for already existing hierarchies but we