			msleep(10);
			ret = restart_syscall();
			goto out_free;
		}

		ret = 0;
		goto out_unlock;
	}

	/*
	 * No such thing, create a new one.  name= matching without subsys
	 * specification is allowed for already existing hierarchies but we
	 * can't create a new one without subsys specification.
	 */
	if (!opts.subsys_mask && !opts.none) {
		ret = -EINVAL;
		goto out_unlock;
	}

	root = kzalloc(sizeof(*root), GFP_KERNEL);
	if (!root) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	init_cgroup_root(root, &opts);

	ret = cgroup_setup_root(root, opts.subsys_mask);
	if (ret)
		cgroup_free_root(root);

out_unlock:
	mutex_unlock(&cgroup_mutex);
out_free:
	kfree(opts.release_agent);
	kfree(opts.name);

	if (ret)
		return ERR_PTR(ret);

	dentry = kernfs_mount(fs_type, flags, root->kf_root,
				CGROUP_SUPER_MAGIC, &new_sb);
	if (IS_ERR(dentry) || !new_sb)
		cgroup_put(&root->cgrp);

	/*
	 * If @pinned_sb, we're reusing an existing root and holding an
	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
	 */
	if (pinned_sb) {
		WARN_ON(new_sb);
		deactivate_super(pinned_sb);
	}

	return dentry;
}

static void cgroup_kill_sb(struct super_block *sb)
{
	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
	struct cgroup_root *root = cgroup_root_from_kf(kf_root);
	/*
	 * If @root doesn't have any mounts or children, start killing it.
	 * This prevents new mounts by disabling percpu_ref_tryget_live().
	 * cgroup_mount() may wait for @root's release.
	 *
	 * And don't kill the default root.
	 */
	if (!list_empty(&root->cgrp.self.children) ||
	    root == &cgrp_dfl_root)
		cgroup_put(&root->cgrp);
	else
		percpu_ref_kill(&root->cgrp.self.refcnt);

	kernfs_kill_sb(sb);
}

static struct file_system_type cgroup_fs_type = {
	.name = "cgroup",
	.mount = cgroup_mount,
	.kill_sb = cgroup_kill_sb,
};
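
/*
 * Illustrative note (not from the original file): cgroup_fs_type is what
 * userspace hits when mounting a v1 hierarchy, e.g.:
 *
 *	mount -t cgroup -o cpu,cpuacct cgroup /sys/fs/cgroup/cpu,cpuacct
 *
 * cgroup_mount() above resolves the option string to an existing or new
 * cgroup_root, and cgroup_kill_sb() runs when the last mount goes away.
 */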

/**
 * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
 * @task: target task
 * @buf: the buffer to write the path into
 * @buflen: the length of the buffer
 *
 * Determine @task's cgroup on the first (the one with the lowest non-zero
 * hierarchy_id) cgroup hierarchy and copy its path into @buf.  This
 * function grabs cgroup_mutex and shouldn't be used inside locks used by
 * cgroup controller callbacks.
 *
 * Return value is the same as kernfs_path().
 */
char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
{
	struct cgroup_root *root;
	struct cgroup *cgrp;
	int hierarchy_id = 1;
	char *path = NULL;

	mutex_lock(&cgroup_mutex);
	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);

	if (root) {
		cgrp = task_cgroup_from_root(task, root);
		path = cgroup_path(cgrp, buf, buflen);
	} else {
		/* if no hierarchy exists, everyone is in "/" */
		if (strlcpy(buf, "/", buflen) < buflen)
			path = buf;
	}

	mutex_unlock(&cgroup_mutex);
	return path;
}
EXPORT_SYMBOL_GPL(task_cgroup_path);
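
/*
 * Usage sketch (illustrative, not part of the original file): callers
 * supply their own buffer, typically PATH_MAX bytes:
 *
 *	char buf[PATH_MAX];
 *
 *	if (task_cgroup_path(current, buf, sizeof(buf)))
 *		pr_info("first-hierarchy cgroup: %s\n", buf);
 *
 * A NULL return means the path didn't fit in @buf.
 */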

/* used to track tasks and other necessary states during migration */
struct cgroup_taskset {
	/* the src and dst cset list running through cset->mg_node */
	struct list_head	src_csets;
	struct list_head	dst_csets;

	/*
	 * Fields for cgroup_taskset_*() iteration.
	 *
	 * Before migration is committed, the target migration tasks are on
	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
	 * or ->dst_csets depending on whether migration is committed.
	 *
	 * ->cur_cset and ->cur_task point to the current task position
	 * during iteration.
	 */
	struct list_head	*csets;
	struct css_set		*cur_cset;
	struct task_struct	*cur_task;
};

#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
	.csets			= &tset.src_csets,			\
}

/**
 * cgroup_taskset_add - try to add a migration target task to a taskset
 * @task: target task
 * @tset: target taskset
 *
 * Add @task, which is a migration target, to @tset.  This function becomes
 * a noop if @task doesn't need to be migrated.  @task's css_set should have
 * been added as a migration source and @task->cg_list will be moved from
 * the css_set's tasks list to the mg_tasks one.
 */
static void cgroup_taskset_add(struct task_struct *task,
			       struct cgroup_taskset *tset)
{
	struct css_set *cset;

	lockdep_assert_held(&css_set_rwsem);

	/* @task either already exited or can't exit until the end */
	if (task->flags & PF_EXITING)
		return;

	/* leave @task alone if post_fork() hasn't linked it yet */
	if (list_empty(&task->cg_list))
		return;

	cset = task_css_set(task);
	if (!cset->mg_src_cgrp)
		return;

	list_move_tail(&task->cg_list, &cset->mg_tasks);
	if (list_empty(&cset->mg_node))
		list_add_tail(&cset->mg_node, &tset->src_csets);
	if (list_empty(&cset->mg_dst_cset->mg_node))
		list_move_tail(&cset->mg_dst_cset->mg_node,
			       &tset->dst_csets);
}

/**
 * cgroup_taskset_first - reset taskset and return the first task
 * @tset: taskset of interest
 *
 * @tset iteration is initialized and the first task is returned.
 */
struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset)
{
	tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
	tset->cur_task = NULL;

	return cgroup_taskset_next(tset);
}

/**
 * cgroup_taskset_next - iterate to the next task in taskset
 * @tset: taskset of interest
 *
 * Return the next task in @tset.  Iteration must have been initialized
 * with cgroup_taskset_first().
 */
struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
{
	struct css_set *cset = tset->cur_cset;
	struct task_struct *task = tset->cur_task;

	while (&cset->mg_node != tset->csets) {
		if (!task)
			task = list_first_entry(&cset->mg_tasks,
						struct task_struct, cg_list);
		else
			task = list_next_entry(task, cg_list);

		if (&task->cg_list != &cset->mg_tasks) {
			tset->cur_cset = cset;
			tset->cur_task = task;
			return task;
		}
		cset = list_next_entry(cset, mg_node);
		task = NULL;
	}

	return NULL;
}
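
/*
 * Usage sketch (illustrative, not part of the original file): migration
 * callbacks walk a taskset by pairing the two iterators above:
 *
 *	struct task_struct *task;
 *
 *	for (task = cgroup_taskset_first(tset); task;
 *	     task = cgroup_taskset_next(tset))
 *		inspect(task);
 *
 * cgroup.h wraps this exact pattern in the cgroup_taskset_for_each()
 * macro; inspect() is a stand-in for controller-specific work.
 */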

/**
 * cgroup_task_migrate - move a task from one cgroup to another.
 * @old_cgrp: the cgroup @tsk is being migrated from
 * @tsk: the task being migrated
 * @new_cset: the new css_set @tsk is being attached to
 *
 * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
 */
static void cgroup_task_migrate(struct cgroup *old_cgrp,
				struct task_struct *tsk,
				struct css_set *new_cset)
{
	struct css_set *old_cset;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_rwsem);

	/*
	 * We are synchronized through cgroup_threadgroup_rwsem against
	 * PF_EXITING setting such that we can't race against cgroup_exit()
	 * changing the css_set to init_css_set and dropping the old one.
	 */
	WARN_ON_ONCE(tsk->flags & PF_EXITING);

	old_cset = task_css_set(tsk);

	get_css_set(new_cset);
	rcu_assign_pointer(tsk->cgroups, new_cset);
	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);

	/*
	 * We just gained a reference on old_cset by taking it from the
	 * task. As trading it for new_cset is protected by cgroup_mutex,
	 * we're safe to drop it here; it will be freed under RCU.
	 */
	put_css_set_locked(old_cset);
}

/**
 * cgroup_taskset_migrate - migrate a taskset to a cgroup
 * @tset: target taskset
 * @dst_cgrp: destination cgroup
 *
 * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
 * ->can_attach callbacks fails and guarantees that either all or none of
 * the tasks in @tset are migrated.  @tset is consumed regardless of
 * success.
 */
static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
				  struct cgroup *dst_cgrp)
{
	struct cgroup_subsys_state *css, *failed_css = NULL;
	struct task_struct *task, *tmp_task;
	struct css_set *cset, *tmp_cset;
	int i, ret;

	/* methods shouldn't be called if no task is actually migrating */
	if (list_empty(&tset->src_csets))
		return 0;

	/* check that we can legitimately attach to the cgroup */
	for_each_e_css(css, i, dst_cgrp) {
		if (css->ss->can_attach) {
			ret = css->ss->can_attach(css, tset);
			if (ret) {
				failed_css = css;
				goto out_cancel_attach;
			}
		}
	}

	/*
	 * Now that we're guaranteed success, proceed to move all tasks to
	 * the new cgroup.  There are no failure cases after here, so this
	 * is the commit point.
	 */
	down_write(&css_set_rwsem);
	list_for_each_entry(cset, &tset->src_csets, mg_node) {
		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
			cgroup_task_migrate(cset->mg_src_cgrp, task,
					    cset->mg_dst_cset);
	}
	up_write(&css_set_rwsem);

	/*
	 * Migration is committed, all target tasks are now on dst_csets.
	 * Nothing is sensitive to fork() after this point.  Notify
	 * controllers that migration is complete.
	 */
	tset->csets = &tset->dst_csets;

	for_each_e_css(css, i, dst_cgrp)
		if (css->ss->attach)
			css->ss->attach(css, tset);

	ret = 0;
	goto out_release_tset;

out_cancel_attach:
	for_each_e_css(css, i, dst_cgrp) {
		if (css == failed_css)
			break;
		if (css->ss->cancel_attach)
			css->ss->cancel_attach(css, tset);
	}
out_release_tset:
	down_write(&css_set_rwsem);
	list_splice_init(&tset->dst_csets, &tset->src_csets);
	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
		list_del_init(&cset->mg_node);
	}
	up_write(&css_set_rwsem);
	return ret;
}
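
/*
 * Illustrative sketch (not part of the original file): a controller opts
 * into the all-or-none protocol above by implementing ->can_attach() to
 * veto the whole migration.  Hypothetical controller "foo" with a
 * hypothetical predicate foo_task_movable():
 *
 *	static int foo_can_attach(struct cgroup_subsys_state *css,
 *				  struct cgroup_taskset *tset)
 *	{
 *		struct task_struct *task;
 *
 *		cgroup_taskset_for_each(task, tset)
 *			if (!foo_task_movable(task))
 *				return -EINVAL;
 *		return 0;
 *	}
 *
 * If any controller's ->can_attach() fails, cgroup_taskset_migrate()
 * calls ->cancel_attach() on the controllers already checked.
 */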

/**
 * cgroup_migrate_finish - cleanup after attach
 * @preloaded_csets: list of preloaded css_sets
 *
 * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
 * those functions for details.
 */
static void cgroup_migrate_finish(struct list_head *preloaded_csets)
{
	struct css_set *cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	down_write(&css_set_rwsem);
	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
		cset->mg_src_cgrp = NULL;
		cset->mg_dst_cset = NULL;
		list_del_init(&cset->mg_preload_node);
		put_css_set_locked(cset);
	}
	up_write(&css_set_rwsem);
}

/**
 * cgroup_migrate_add_src - add a migration source css_set
 * @src_cset: the source css_set to add
 * @dst_cgrp: the destination cgroup
 * @preloaded_csets: list of preloaded css_sets
 *
 * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
 * @src_cset and add it to @preloaded_csets, which should later be cleaned
 * up by cgroup_migrate_finish().
 *
 * This function may be called without holding cgroup_threadgroup_rwsem
 * even if the target is a process.  Threads may be created and destroyed
 * but as long as cgroup_mutex is not dropped, no new css_set can be put
 * into play and the preloaded css_sets are guaranteed to cover all
 * migrations.
 */
static void cgroup_migrate_add_src(struct css_set *src_cset,
				   struct cgroup *dst_cgrp,
				   struct list_head *preloaded_csets)
{
	struct cgroup *src_cgrp;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_rwsem);

	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);

	if (!list_empty(&src_cset->mg_preload_node))
		return;

	WARN_ON(src_cset->mg_src_cgrp);
	WARN_ON(!list_empty(&src_cset->mg_tasks));
	WARN_ON(!list_empty(&src_cset->mg_node));

	src_cset->mg_src_cgrp = src_cgrp;
	get_css_set(src_cset);
	list_add(&src_cset->mg_preload_node, preloaded_csets);
}

/**
 * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
 * @dst_cgrp: the destination cgroup (may be %NULL)
 * @preloaded_csets: list of preloaded source css_sets
 *
 * Tasks are about to be moved to @dst_cgrp and all the source css_sets
 * have been preloaded to @preloaded_csets.  This function looks up and
 * pins all destination css_sets, links each to its source, and appends them
 * to @preloaded_csets.  If @dst_cgrp is %NULL, the destination of each
 * source css_set is assumed to be its cgroup on the default hierarchy.
 *
 * This function must be called after cgroup_migrate_add_src() has been
 * called on each migration source css_set.  After migration is performed
 * using cgroup_migrate(), cgroup_migrate_finish() must be called on
 * @preloaded_csets.
 */
static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
				      struct list_head *preloaded_csets)
{
	LIST_HEAD(csets);
	struct css_set *src_cset, *tmp_cset;

	lockdep_assert_held(&cgroup_mutex);

	/*
	 * Except for the root, child_subsys_mask must be zero for a cgroup
	 * with tasks so that child cgroups don't compete against tasks.
	 */
	if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) &&
	    dst_cgrp->child_subsys_mask)
		return -EBUSY;

	/* look up the dst cset for each src cset and link it to src */
	list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) {
		struct css_set *dst_cset;

		dst_cset = find_css_set(src_cset,
					dst_cgrp ?: src_cset->dfl_cgrp);
		if (!dst_cset)
			goto err;

		WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);

		/*
		 * If src cset equals dst, it's noop.  Drop the src.
		 * cgroup_migrate() will skip the cset too.  Note that we
		 * can't handle src == dst as some nodes are used by both.
		 */
		if (src_cset == dst_cset) {
			src_cset->mg_src_cgrp = NULL;
			list_del_init(&src_cset->mg_preload_node);
			put_css_set(src_cset);
			put_css_set(dst_cset);
			continue;
		}

		src_cset->mg_dst_cset = dst_cset;

		if (list_empty(&dst_cset->mg_preload_node))
			list_add(&dst_cset->mg_preload_node, &csets);
		else
			put_css_set(dst_cset);
	}

	list_splice_tail(&csets, preloaded_csets);
	return 0;
err:
	cgroup_migrate_finish(&csets);
	return -ENOMEM;
}
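
/*
 * Illustrative call sequence (not part of the original file): migrating
 * tasks with the helpers above goes roughly as follows (error handling
 * elided):
 *
 *	LIST_HEAD(preloaded_csets);
 *	int ret;
 *
 *	down_read(&css_set_rwsem);
 *	cgroup_migrate_add_src(src_cset, dst_cgrp, &preloaded_csets);
 *	up_read(&css_set_rwsem);
 *
 *	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
 *	if (!ret)
 *		ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
 *	cgroup_migrate_finish(&preloaded_csets);
 *
 * cgroup_attach_task() below is the canonical user of this sequence.
 */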

/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @cgrp: the destination cgroup
 *
 * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
 * process, the caller must be holding cgroup_threadgroup_rwsem.  The
 * caller is also responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting to migrate.
 */
static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
			  struct cgroup *cgrp)
{
	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
	struct task_struct *task;

	/*
	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
	 * already PF_EXITING could be freed from underneath us unless we
	 * take an rcu_read_lock.
	 */
	down_write(&css_set_rwsem);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_taskset_add(task, &tset);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	up_write(&css_set_rwsem);

	return cgroup_taskset_migrate(&tset, cgrp);
}

/**
 * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
 * @dst_cgrp: the cgroup to attach to
 * @leader: the task or the leader of the threadgroup to be attached
 * @threadgroup: attach the whole threadgroup?
 *
 * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
 */
static int cgroup_attach_task(struct cgroup *dst_cgrp,
			      struct task_struct *leader, bool threadgroup)
{
	LIST_HEAD(preloaded_csets);
	struct task_struct *task;
	int ret;

	/* look up all src csets */
	down_read(&css_set_rwsem);
	rcu_read_lock();
	task = leader;
	do {
		cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
				       &preloaded_csets);
		if (!threadgroup)
			break;
	} while_each_thread(leader, task);
	rcu_read_unlock();
	up_read(&css_set_rwsem);

	/* prepare dst csets and commit */
	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
	if (!ret)
		ret = cgroup_migrate(leader, threadgroup, dst_cgrp);

	cgroup_migrate_finish(&preloaded_csets);
	return ret;
}

static int cgroup_procs_write_permission(struct task_struct *task,
					 struct cgroup *dst_cgrp,
					 struct kernfs_open_file *of)
{
	const struct cred *cred = current_cred();
	const struct cred *tcred = get_task_cred(task);
	int ret = 0;

	/*
	 * even if we're attaching all tasks in the thread group, we only
	 * need to check permissions on one of them.
	 */
	if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
	    !uid_eq(cred->euid, tcred->uid) &&
	    !uid_eq(cred->euid, tcred->suid))
		ret = -EACCES;

	if (!ret && cgroup_on_dfl(dst_cgrp)) {
		struct super_block *sb = of->file->f_path.dentry->d_sb;
		struct cgroup *cgrp;
		struct inode *inode;

		down_read(&css_set_rwsem);
		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
		up_read(&css_set_rwsem);

		while (!cgroup_is_descendant(dst_cgrp, cgrp))
			cgrp = cgroup_parent(cgrp);

		ret = -ENOMEM;
		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
		if (inode) {
			ret = inode_permission(inode, MAY_WRITE);
			iput(inode);
		}
	}

	put_cred(tcred);
	return ret;
}

/*
 * Find the task_struct of the task to attach by vpid and pass it along to the
 * function to attach either it or all tasks in its threadgroup. Will lock
 * cgroup_mutex and threadgroup.
 */
static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
				    size_t nbytes, loff_t off, bool threadgroup)
{
	struct task_struct *tsk;
	struct cgroup *cgrp;
	pid_t pid;
	int ret;

	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
		return -EINVAL;

	cgrp = cgroup_kn_lock_live(of->kn);
	if (!cgrp)
		return -ENODEV;

	percpu_down_write(&cgroup_threadgroup_rwsem);
	rcu_read_lock();
	if (pid) {
		tsk = find_task_by_vpid(pid);
		if (!tsk) {
			ret = -ESRCH;
			goto out_unlock_rcu;
		}
	} else {
		tsk = current;
	}

	if (threadgroup)
		tsk = tsk->group_leader;

	/*
	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
	 * trapped in a cpuset, or RT worker may be born in a cgroup
	 * with no rt_runtime allocated.  Just say no.
	 */
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out_unlock_rcu;
	}

	get_task_struct(tsk);
	rcu_read_unlock();

	ret = cgroup_procs_write_permission(tsk, cgrp, of);
	if (!ret)
		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
	put_task_struct(tsk);
	goto out_unlock_threadgroup;

out_unlock_rcu:
	rcu_read_unlock();
out_unlock_threadgroup:
	percpu_up_write(&cgroup_threadgroup_rwsem);
	cgroup_kn_unlock(of->kn);
	return ret ?: nbytes;
}

/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroup_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
	for_each_root(root) {
		struct cgroup *from_cgrp;

		if (root == &cgrp_dfl_root)
			continue;

		down_read(&css_set_rwsem);
		from_cgrp = task_cgroup_from_root(from, root);
		up_read(&css_set_rwsem);

		retval = cgroup_attach_task(from_cgrp, tsk, false);
		if (retval)
			break;
	}
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
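
/*
 * Usage sketch (illustrative, not part of the original file): a kernel
 * thread doing work on behalf of a userspace process can be charged to
 * that process's cgroups, e.g. when spawning a per-device worker:
 *
 *	err = cgroup_attach_task_all(owner_task, worker_task);
 *
 * owner_task and worker_task are hypothetical names for the caller's
 * tasks; the vhost driver uses this helper in a similar way.
 */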

static ssize_t cgroup_tasks_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, false);
}

static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off)
{
	return __cgroup_procs_write(of, buf, nbytes, off, true);
}

static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
					  char *buf, size_t nbytes, loff_t off)
{
	struct cgroup *cgrp;

	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);

	cgrp = cgroup_kn_lock_live(of->kn);
	if (!cgrp)
		return -ENODEV;
	spin_lock(&release_agent_path_lock);
	strlcpy(cgrp->root->release_agent_path, strstrip(buf),
		sizeof(cgrp->root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	cgroup_kn_unlock(of->kn);
	return nbytes;
}

static int cgroup_release_agent_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	spin_lock(&release_agent_path_lock);
	seq_puts(seq, cgrp->root->release_agent_path);
	spin_unlock(&release_agent_path_lock);
	seq_putc(seq, '\n');
	return 0;
}

static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "0\n");
	return 0;
}

static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
{
	struct cgroup_subsys *ss;
	bool printed = false;
	int ssid;

	for_each_subsys_which(ss, ssid, &ss_mask) {
		if (printed)
			seq_putc(seq, ' ');
		seq_printf(seq, "%s", ss->name);
		printed = true;
	}
	if (printed)
		seq_putc(seq, '\n');
}

/* show controllers which are currently attached to the default hierarchy */
static int cgroup_root_controllers_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgrp->root->subsys_mask &
			     ~cgrp_dfl_root_inhibit_ss_mask);
	return 0;
}

/* show controllers which are enabled from the parent */
static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
	return 0;
}

/* show controllers which are enabled for a given cgroup's children */
static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	cgroup_print_ss_mask(seq, cgrp->subtree_control);
	return 0;
}

/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's child_subsys_mask has changed and its subtree's (self excluded)
 * css associations need to be updated accordingly.  This function looks up
 * all css_sets which are attached to the subtree, creates the matching
 * updated css_sets and migrates the tasks to the new ones.
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
	LIST_HEAD(preloaded_csets);
	struct cgroup_subsys_state *css;
	struct css_set *src_cset;
	int ret;

	lockdep_assert_held(&cgroup_mutex);

	percpu_down_write(&cgroup_threadgroup_rwsem);

	/* look up all csses currently attached to @cgrp's subtree */
	down_read(&css_set_rwsem);
	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
		struct cgrp_cset_link *link;

		/* self is not affected by child_subsys_mask change */
		if (css->cgroup == cgrp)
			continue;

		list_for_each_entry(link, &css->cgroup->cset_links, cset_link)
			cgroup_migrate_add_src(link->cset, cgrp,
					       &preloaded_csets);
	}
	up_read(&css_set_rwsem);

	/* NULL dst indicates self on default hierarchy */
	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
	if (ret)
		goto out_finish;

	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
		struct task_struct *last_task = NULL, *task;

		/* src_csets precede dst_csets, break on the first dst_cset */
		if (!src_cset->mg_src_cgrp)
			break;

		/*
		 * All tasks in src_cset need to be migrated to the
		 * matching dst_cset.  Empty it process by process.  We
		 * walk tasks but migrate processes.  The leader might even
		 * belong to a different cset but such src_cset would also
		 * be among the target src_csets because the default
		 * hierarchy enforces per-process membership.
		 */
		while (true) {
			down_read(&css_set_rwsem);
			task = list_first_entry_or_null(&src_cset->tasks,
						struct task_struct, cg_list);
			if (task) {
				task = task->group_leader;
				WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
				get_task_struct(task);
			}
			up_read(&css_set_rwsem);

			if (!task)
				break;

			/* guard against possible infinite loop */
			if (WARN(last_task == task,
				 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
				goto out_finish;
			last_task = task;

			ret = cgroup_migrate(task, true, src_cset->dfl_cgrp);

			put_task_struct(task);

			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
				goto out_finish;
		}
	}

out_finish:
	cgroup_migrate_finish(&preloaded_csets);
	percpu_up_write(&cgroup_threadgroup_rwsem);
	return ret;
}

/* change the enabled child controllers for a cgroup in the default hierarchy */
static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes,
					    loff_t off)
{
	unsigned long enable = 0, disable = 0;
	unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
	struct cgroup *cgrp, *child;
	struct cgroup_subsys *ss;
	char *tok;
	int ssid, ret;

	/*
	 * Parse input - space separated list of subsystem names prefixed
	 * with either + or -.
	 */
	buf = strstrip(buf);
	while ((tok = strsep(&buf, " "))) {
		unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;

		if (tok[0] == '\0')
			continue;
		for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
			if (!cgroup_ssid_enabled(ssid) ||
			    strcmp(tok + 1, ss->name))
				continue;

			if (*tok == '+') {
				enable |= 1 << ssid;
				disable &= ~(1 << ssid);
			} else if (*tok == '-') {
				disable |= 1 << ssid;
				enable &= ~(1 << ssid);
			} else {
				return -EINVAL;
			}
			break;
		}
		if (ssid == CGROUP_SUBSYS_COUNT)
			return -EINVAL;
	}

	cgrp = cgroup_kn_lock_live(of->kn);
	if (!cgrp)
		return -ENODEV;

	for_each_subsys(ss, ssid) {
		if (enable & (1 << ssid)) {
			if (cgrp->subtree_control & (1 << ssid)) {
				enable &= ~(1 << ssid);
				continue;
			}

			/* unavailable or not enabled on the parent? */
			if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
			    (cgroup_parent(cgrp) &&
			     !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
				ret = -ENOENT;
				goto out_unlock;
			}
		} else if (disable & (1 << ssid)) {
			if (!(cgrp->subtree_control & (1 << ssid))) {
				disable &= ~(1 << ssid);
				continue;
			}

			/* a child has it enabled? */
			cgroup_for_each_live_child(child, cgrp) {
				if (child->subtree_control & (1 << ssid)) {
					ret = -EBUSY;
					goto out_unlock;
				}
			}
		}
	}

	/*
	 * Except for the root, subtree_control must be zero for a cgroup
	 * with tasks so that child cgroups don't compete against tasks.
	 */
	if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
		ret = -EBUSY;
		goto out_unlock;
	}

	/*
	 * Update subsys masks and calculate what needs to be done.  More
	 * subsystems than specified may need to be enabled or disabled
	 * depending on subsystem dependencies.
	 */
	old_sc = cgrp->subtree_control;
	old_ss = cgrp->child_subsys_mask;
	new_sc = (old_sc | enable) & ~disable;
	new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
	css_enable = ~old_ss & new_ss;
	css_disable = old_ss & ~new_ss;
	enable |= css_enable;
	disable |= css_disable;
	/*
	 * Because css offlining is asynchronous, userland might try to
	 * re-enable the same controller while the previous instance is
	 * still around.  In such cases, wait till it's gone using
	 * offline_waitq.
	 */
	for_each_subsys_which(ss, ssid, &css_enable) {
		cgroup_for_each_live_child(child, cgrp) {
			DEFINE_WAIT(wait);

			if (!cgroup_css(child, ss))
				continue;

			cgroup_get(child);
			prepare_to_wait(&child->offline_waitq, &wait,
					TASK_UNINTERRUPTIBLE);