Newer
Older
strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
return buf;
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
* returns cft->mode if ->mode is not 0
* returns S_IRUGO|S_IWUSR if it has both a read and a write handler
* returns S_IRUGO if it has only a read handler
* returns S_IWUSR if it has only a write hander
*/
static umode_t cgroup_file_mode(const struct cftype *cft)
umode_t mode = 0;
if (cft->mode)
return cft->mode;
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
if (cft->write_u64 || cft->write_s64 || cft->write)
mode |= S_IWUSR;
return mode;
static void cgroup_get(struct cgroup *cgrp)
static void cgroup_put(struct cgroup *cgrp)
/**
* cgroup_refresh_child_subsys_mask - update child_subsys_mask
* @cgrp: the target cgroup
*
* On the default hierarchy, a subsystem may request other subsystems to be
* enabled together through its ->depends_on mask. In such cases, more
* subsystems than specified in "cgroup.subtree_control" may be enabled.
*
* This function determines which subsystems need to be enabled given the
* current @cgrp->subtree_control and records it in
* @cgrp->child_subsys_mask. The resulting mask is always a superset of
* @cgrp->subtree_control and follows the usual hierarchy rules.
*/
static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
{
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
struct cgroup *parent = cgroup_parent(cgrp);
unsigned int cur_ss_mask = cgrp->subtree_control;
struct cgroup_subsys *ss;
int ssid;
lockdep_assert_held(&cgroup_mutex);
if (!cgroup_on_dfl(cgrp)) {
cgrp->child_subsys_mask = cur_ss_mask;
return;
}
while (true) {
unsigned int new_ss_mask = cur_ss_mask;
for_each_subsys(ss, ssid)
if (cur_ss_mask & (1 << ssid))
new_ss_mask |= ss->depends_on;
/*
* Mask out subsystems which aren't available. This can
* happen only if some depended-upon subsystems were bound
* to non-default hierarchies.
*/
if (parent)
new_ss_mask &= parent->child_subsys_mask;
else
new_ss_mask &= cgrp->root->subsys_mask;
if (new_ss_mask == cur_ss_mask)
break;
cur_ss_mask = new_ss_mask;
}
cgrp->child_subsys_mask = cur_ss_mask;
/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
* This helper undoes cgroup_kn_lock_live() and should be invoked before
* the method finishes if locking succeeded. Note that once this function
* returns the cgroup returned by cgroup_kn_lock_live() may become
* inaccessible any time. If the caller intends to continue to access the
* cgroup, it should pin it before invoking this function.
*/
static void cgroup_kn_unlock(struct kernfs_node *kn)
struct cgroup *cgrp;
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
cgrp = kn->parent->priv;
mutex_unlock(&cgroup_mutex);
kernfs_unbreak_active_protection(kn);
cgroup_put(cgrp);
/**
* cgroup_kn_lock_live - locking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
* This helper is to be used by a cgroup kernfs method currently servicing
* @kn. It breaks the active protection, performs cgroup locking and
* verifies that the associated cgroup is alive. Returns the cgroup if
* alive; otherwise, %NULL. A successful return should be undone by a
* matching cgroup_kn_unlock() invocation.
*
* Any cgroup kernfs method implementation which requires locking the
* associated cgroup should use this helper. It avoids nesting cgroup
* locking under kernfs active protection and allows all kernfs operations
* including self-removal.
*/
static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
struct cgroup *cgrp;
if (kernfs_type(kn) == KERNFS_DIR)
cgrp = kn->priv;
else
cgrp = kn->parent->priv;
* We're gonna grab cgroup_mutex which nests outside kernfs
* active_ref. cgroup liveliness check alone provides enough
* protection against removal. Ensure @cgrp stays accessible and
* break the active_ref protection.
cgroup_get(cgrp);
kernfs_break_active_protection(kn);
if (!cgroup_is_dead(cgrp))
return cgrp;
cgroup_kn_unlock(kn);
return NULL;
static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
lockdep_assert_held(&cgroup_mutex);
kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
Tejun Heo
committed
* cgroup_clear_dir - remove subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be removed
*/
static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
struct cgroup_subsys *ss;
Tejun Heo
committed
int i;
Tejun Heo
committed
for_each_subsys(ss, i) {
Tejun Heo
committed
if (!(subsys_mask & (1 << i)))
list_for_each_entry(cfts, &ss->cfts, node)
cgroup_addrm_files(cgrp, cfts, false);
static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
unsigned int tmp_ss_mask;
for_each_subsys(ss, ssid) {
if (!(ss_mask & (1 << ssid)))
continue;
/* if @ss has non-root csses attached to it, can't move */
if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
/* skip creating root files on dfl_root for inhibited subsystems */
tmp_ss_mask = ss_mask;
if (dst_root == &cgrp_dfl_root)
tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
if (ret) {
if (dst_root != &cgrp_dfl_root)
return ret;
/*
* Rebinding back to the default root is not allowed to
* fail. Using both default and non-default roots should
* be rare. Moving subsystems back and forth even more so.
* Just warn about it and continue.
*/
if (cgrp_dfl_root_visible) {
pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
}
Tejun Heo
committed
/*
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
for_each_subsys(ss, ssid)
cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
for_each_subsys(ss, ssid) {
struct cgroup_root *src_root;
struct cgroup_subsys_state *css;
if (!(ss_mask & (1 << ssid)))
continue;
src_root = ss->root;
css = cgroup_css(&src_root->cgrp, ss);
WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
ss->root = dst_root;
css->cgroup = &dst_root->cgrp;
down_write(&css_set_rwsem);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
&dst_root->cgrp.e_csets[ss->id]);
up_write(&css_set_rwsem);
Tejun Heo
committed
src_root->subsys_mask &= ~(1 << ssid);
src_root->cgrp.subtree_control &= ~(1 << ssid);
cgroup_refresh_child_subsys_mask(&src_root->cgrp);
Tejun Heo
committed
Tejun Heo
committed
/* default hierarchy doesn't enable controllers by default */
Tejun Heo
committed
dst_root->subsys_mask |= 1 << ssid;
if (dst_root != &cgrp_dfl_root) {
dst_root->cgrp.subtree_control |= 1 << ssid;
cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
}
if (ss->bind)
ss->bind(css);
static int cgroup_show_options(struct seq_file *seq,
struct kernfs_root *kf_root)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_subsys *ss;
Tejun Heo
committed
if (root->subsys_mask & (1 << ssid))
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
spin_unlock(&release_agent_path_lock);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
return 0;
}
struct cgroup_sb_opts {
unsigned int subsys_mask;
unsigned int flags;
char *release_agent;
bool cpuset_clone_children;
/* User explicitly requested empty subsystem */
bool none;
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
char *token, *o = data;
bool all_ss = false, one_ss = false;
#ifdef CONFIG_CPUSETS
mask = ~(1U << cpuset_cgrp_id);
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
/* Explicitly have no subsystems */
opts->none = true;
continue;
}
if (!strcmp(token, "all")) {
/* Mutually exclusive option 'all' + subsystem name */
if (one_ss)
return -EINVAL;
all_ss = true;
continue;
}
if (!strcmp(token, "__DEVEL__sane_behavior")) {
opts->flags |= CGRP_ROOT_SANE_BEHAVIOR;
continue;
}
if (!strcmp(token, "noprefix")) {
opts->flags |= CGRP_ROOT_NOPREFIX;
continue;
}
if (!strcmp(token, "clone_children")) {
opts->cpuset_clone_children = true;
opts->flags |= CGRP_ROOT_XATTR;
if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
continue;
}
if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
/* Can't specify an empty name */
if (!strlen(name))
return -EINVAL;
/* Must match [\w.-]+ */
for (i = 0; i < strlen(name); i++) {
char c = name[i];
if (isalnum(c))
continue;
if ((c == '.') || (c == '-') || (c == '_'))
continue;
return -EINVAL;
}
/* Specifying two names is forbidden */
if (opts->name)
return -EINVAL;
opts->name = kstrndup(name,
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
continue;
}
if (strcmp(token, ss->name))
continue;
if (ss->disabled)
continue;
/* Mutually exclusive option 'all' + subsystem name */
if (all_ss)
return -EINVAL;
opts->subsys_mask |= (1 << i);
one_ss = true;
break;
}
if (i == CGROUP_SUBSYS_COUNT)
return -ENOENT;
}
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
if (nr_opts != 1) {
pr_err("sane_behavior: no other mount options allowed\n");
/*
* If the 'all' option was specified select all the subsystems,
* otherwise if 'none', 'name=' and a subsystem name options were
* not specified, let's default to 'all'
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
if (!ss->disabled)
opts->subsys_mask |= (1 << i);
/*
* We either have to specify by name or by subsystems. (So all
* empty hierarchies must have a name).
*/
if (!opts->subsys_mask && !opts->name)
return -EINVAL;
/*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
*/
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
struct cgroup_sb_opts opts;
unsigned int added_mask, removed_mask;
if (root == &cgrp_dfl_root) {
pr_err("remount is not allowed\n");
mutex_lock(&cgroup_mutex);
/* See what subsystems are wanted */
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
Tejun Heo
committed
if (opts.subsys_mask != root->subsys_mask || opts.release_agent)
pr_warn("option changes via remount are deprecated (pid=%d comm=%s)\n",
task_tgid_nr(current), current->comm);
Tejun Heo
committed
added_mask = opts.subsys_mask & ~root->subsys_mask;
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
(opts.name && strcmp(opts.name, root->name))) {
pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
opts.flags, opts.name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
Tejun Heo
committed
/* remounting is not allowed for populated hierarchies */
if (!list_empty(&root->cgrp.self.children)) {
Tejun Heo
committed
ret = -EBUSY;
ret = rebind_subsystems(root, added_mask);
Tejun Heo
committed
if (ret)
rebind_subsystems(&cgrp_dfl_root, removed_mask);
if (opts.release_agent) {
spin_lock(&release_agent_path_lock);
strcpy(root->release_agent_path, opts.release_agent);
spin_unlock(&release_agent_path_lock);
}
mutex_unlock(&cgroup_mutex);
return ret;
}
/*
* To reduce the fork() overhead for systems that are not actually using
* their cgroups capability, we don't maintain the lists running through
* each css_set to its tasks until we see the list actually used - in other
* words after the first mount.
*/
static bool use_task_css_set_links __read_mostly;
static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
down_write(&css_set_rwsem);
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
if (use_task_css_set_links)
goto out_unlock;
use_task_css_set_links = true;
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
* cgroup_post_fork() without seeing use_task_css_set_links = 1
* is not guaranteed to have its child immediately visible in the
* tasklist if we walk through it with RCU.
*/
read_lock(&tasklist_lock);
do_each_thread(g, p) {
WARN_ON_ONCE(!list_empty(&p->cg_list) ||
task_css_set(p) != &init_css_set);
/*
* We should check if the process is exiting, otherwise
* it will race with cgroup_exit() in that the list
* entry won't be deleted though the process has exited.
* Do it while holding siglock so that we don't end up
* racing against cgroup_exit().
spin_lock_irq(&p->sighand->siglock);
if (!(p->flags & PF_EXITING)) {
struct css_set *cset = task_css_set(p);
list_add(&p->cg_list, &cset->tasks);
get_css_set(cset);
}
spin_unlock_irq(&p->sighand->siglock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
out_unlock:
up_write(&css_set_rwsem);
static void init_cgroup_housekeeping(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
int ssid;
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->release_list);
Ben Blum
committed
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
cgrp->self.flags |= CSS_ONLINE;
for_each_subsys(ss, ssid)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
Tejun Heo
committed
init_waitqueue_head(&cgrp->offline_waitq);
static void init_cgroup_root(struct cgroup_root *root,
struct cgroup *cgrp = &root->cgrp;
INIT_LIST_HEAD(&root->root_list);
Tejun Heo
committed
atomic_set(&root->nr_cgrps, 1);
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
root->flags = opts->flags;
if (opts->release_agent)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
if (opts->cpuset_clone_children)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
struct cftype *base_files;
struct css_set *cset;
int i, ret;
lockdep_assert_held(&cgroup_mutex);
ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
root_cgrp->id = ret;
ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
if (ret)
goto out;
* We're accessing css_set_count without locking css_set_rwsem here,
* but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. The worst that can happen is that we
* have some link structures left over
*/
ret = allocate_cgrp_cset_links(css_set_count, &tmp_links);
if (ret)
ret = cgroup_init_root_id(root);
root->kf_root = kernfs_create_root(&cgroup_kf_syscall_ops,
KERNFS_ROOT_CREATE_DEACTIVATED,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
root_cgrp->kn = root->kf_root->kn;
if (root == &cgrp_dfl_root)
base_files = cgroup_dfl_base_files;
else
base_files = cgroup_legacy_base_files;
ret = cgroup_addrm_files(root_cgrp, base_files, true);
ret = rebind_subsystems(root, ss_mask);
/*
* There must be no failure case after here, since rebinding takes
* care of subsystems' refcounts, which are explicitly dropped in
* the failure exit path.
*/
list_add(&root->root_list, &cgroup_roots);
cgroup_root_count++;
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
down_write(&css_set_rwsem);
hash_for_each(css_set_table, i, cset, hlist)
link_css_set(&tmp_links, cset, root_cgrp);
up_write(&css_set_rwsem);
BUG_ON(!list_empty(&root_cgrp->self.children));
Tejun Heo
committed
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
percpu_ref_cancel_init(&root_cgrp->self.refcnt);
free_cgrp_cset_links(&tmp_links);
return ret;
static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
struct cgroup_root *root;
struct cgroup_sb_opts opts;
/*
* The first time anyone tries to mount a cgroup, enable the list
* linking each css_set to its tasks and fix up all existing tasks.
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
goto out_unlock;
if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
cgrp_dfl_root_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
ret = 0;
goto out_unlock;
for_each_root(root) {
Tejun Heo
committed
if (root == &cgrp_dfl_root)
Tejun Heo
committed
* If we asked for a name then it must match. Also, if
* name matches but sybsys_mask doesn't, we should fail.
* Remember whether name matched.
if (opts.name) {
if (strcmp(opts.name, root->name))
continue;
name_match = true;
}
* If we asked for subsystems (or explicitly for no
* subsystems) then they must match.
Tejun Heo
committed
(opts.subsys_mask != root->subsys_mask)) {
if (!name_match)
continue;
ret = -EBUSY;
goto out_unlock;
}
if (root->flags ^ opts.flags)
pr_warn("new mount options do not match the existing superblock, will be ignored\n");
* A root's lifetime is governed by its root cgroup.
* tryget_live failure indicate that the root is being
* destroyed. Wait for destruction to complete so that the
* subsystems are free. We can use wait_queue for the wait
* but this path is super cold. Let's just sleep for a bit
* and retry.
if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
mutex_unlock(&cgroup_mutex);
msleep(10);
ret = restart_syscall();
goto out_free;
* No such thing, create a new one. name= matching without subsys
* specification is allowed for already existing hierarchies but we
* can't create new one without subsys specification.
if (!opts.subsys_mask && !opts.none) {
ret = -EINVAL;
goto out_unlock;
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root) {
ret = -ENOMEM;
init_cgroup_root(root, &opts);
ret = cgroup_setup_root(root, opts.subsys_mask);
out_unlock:
mutex_unlock(&cgroup_mutex);
kfree(opts.release_agent);
kfree(opts.name);
return ERR_PTR(ret);
dentry = kernfs_mount(fs_type, flags, root->kf_root,
CGROUP_SUPER_MAGIC, &new_sb);
cgroup_put(&root->cgrp);
return dentry;
}
static void cgroup_kill_sb(struct super_block *sb)
{
struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
struct cgroup_root *root = cgroup_root_from_kf(kf_root);
/*
* If @root doesn't have any mounts or children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
* cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
if (css_has_online_children(&root->cgrp.self) ||
root == &cgrp_dfl_root)
cgroup_put(&root->cgrp);
else
percpu_ref_kill(&root->cgrp.self.refcnt);
}
static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
.kill_sb = cgroup_kill_sb,
};
static struct kobject *cgroup_kobj;
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
* @task: target task
* @buf: the buffer to write the path into
* @buflen: the length of the buffer
*
* Determine @task's cgroup on the first (the one with the lowest non-zero
* hierarchy_id) cgroup hierarchy and copy its path into @buf. This
* function grabs cgroup_mutex and shouldn't be used inside locks used by
* cgroup controller callbacks.
*
char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
struct cgroup_root *root;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
if (root) {
cgrp = task_cgroup_from_root(task, root);
} else {
/* if no hierarchy exists, everyone is in "/" */
if (strlcpy(buf, "/", buflen) < buflen)
path = buf;
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
EXPORT_SYMBOL_GPL(task_cgroup_path);
/* used to track tasks and other necessary states during migration */
Tejun Heo
committed
struct cgroup_taskset {
/* the src and dst cset list running through cset->mg_node */
struct list_head src_csets;
struct list_head dst_csets;
/*
* Fields for cgroup_taskset_*() iteration.
*
* Before migration is committed, the target migration tasks are on
* ->mg_tasks of the csets on ->src_csets. After, on ->mg_tasks of
* the csets on ->dst_csets. ->csets point to either ->src_csets
* or ->dst_csets depending on whether migration is committed.
*
* ->cur_csets and ->cur_task point to the current task position
* during iteration.
*/
struct list_head *csets;
struct css_set *cur_cset;
struct task_struct *cur_task;
Tejun Heo
committed
};
/**
* cgroup_taskset_first - reset taskset and return the first task
* @tset: taskset of interest
*
* @tset iteration is initialized and the first task is returned.
*/
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
tset->cur_task = NULL;
return cgroup_taskset_next(tset);
Tejun Heo
committed
}
/**
* cgroup_taskset_next - iterate to the next task in taskset
* @tset: taskset of interest
*
* Return the next task in @tset. Iteration must have been initialized
* with cgroup_taskset_first().
*/
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
Tejun Heo
committed
while (&cset->mg_node != tset->csets) {
if (!task)
task = list_first_entry(&cset->mg_tasks,
struct task_struct, cg_list);
else
task = list_next_entry(task, cg_list);
Tejun Heo
committed
if (&task->cg_list != &cset->mg_tasks) {
tset->cur_cset = cset;
tset->cur_task = task;
return task;
}
Tejun Heo
committed
cset = list_next_entry(cset, mg_node);
task = NULL;
}
Tejun Heo
committed
return NULL;
Tejun Heo
committed
}
* cgroup_task_migrate - move a task from one cgroup to another.
* @old_cgrp: the cgroup @tsk is being migrated from
* @tsk: the task being migrated
* @new_cset: the new css_set @tsk is being attached to
* Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
static void cgroup_task_migrate(struct cgroup *old_cgrp,
struct task_struct *tsk,
struct css_set *new_cset)
struct css_set *old_cset;
lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_rwsem);
* We are synchronized through threadgroup_lock() against PF_EXITING
* setting such that we can't race against cgroup_exit() changing the
* css_set to init_css_set and dropping the old one.
Frederic Weisbecker
committed
WARN_ON_ONCE(tsk->flags & PF_EXITING);
old_cset = task_css_set(tsk);
get_css_set(new_cset);
rcu_assign_pointer(tsk->cgroups, new_cset);
/*
* Use move_tail so that cgroup_taskset_first() still returns the
* leader after migration. This works because cgroup_migrate()
* ensures that the dst_cset of the leader is the first on the
* tset's dst_csets list.
*/
list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);