	 * Workqueue threads may acquire PF_NO_SETAFFINITY and become
	 * trapped in a cpuset, or an RT worker may be born in a cgroup
	 * with no rt_runtime allocated.  Just say no.
	 */
	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		rcu_read_unlock();
		goto out_unlock_cgroup;
	}

	get_task_struct(tsk);
	rcu_read_unlock();

	threadgroup_lock(tsk);
	if (threadgroup) {
		if (!thread_group_leader(tsk)) {
			/*
			 * a race with de_thread from another thread's exec()
			 * may strip us of our leadership; if this happens,
			 * there is no choice but to throw this task away and
			 * try again; this is
			 * "double-double-toil-and-trouble-check locking".
			 */
			threadgroup_unlock(tsk);
			put_task_struct(tsk);
			goto retry_find_task;
		}
	}

	ret = cgroup_attach_task(cgrp, tsk, threadgroup);

	threadgroup_unlock(tsk);

	put_task_struct(tsk);
	mutex_unlock(&cgroup_mutex);
/**
 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
 * @from: attach to all cgroups of a given task
 * @tsk: the task to be attached
 */
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
	struct cgroupfs_root *root;
	int retval = 0;

	mutex_lock(&cgroup_mutex);
		struct cgroup *from_cgrp = task_cgroup_from_root(from, root);
		retval = cgroup_attach_task(from_cgrp, tsk, false);
	mutex_unlock(&cgroup_mutex);

	return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
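
/*
 * Usage sketch (hypothetical caller, not part of this file): a driver
 * that spawns a helper kthread on behalf of the current task and wants
 * the worker charged to the same cgroups as the requester.  The
 * example_* names are illustrative only.
 */
static int example_start_worker(int (*example_worker_fn)(void *), void *data)
{
	struct task_struct *worker;
	int err;

	worker = kthread_create(example_worker_fn, data, "example-worker");
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	/* mirror the requester's cgroup memberships onto the new kthread */
	err = cgroup_attach_task_all(current, worker);
	if (err) {
		kthread_stop(worker);
		return err;
	}

	wake_up_process(worker);
	return 0;
}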

static int cgroup_tasks_write(struct cgroup_subsys_state *css,
			      struct cftype *cft, u64 pid)
	return attach_task_by_pid(css->cgroup, pid, false);
static int cgroup_procs_write(struct cgroup_subsys_state *css,
			      struct cftype *cft, u64 tgid)
	return attach_task_by_pid(css->cgroup, tgid, true);
static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
				      struct cftype *cft, const char *buffer)
	struct cgroupfs_root *root = css->cgroup->root;

	BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
	if (!cgroup_lock_live_group(css->cgroup))
	spin_lock(&release_agent_path_lock);
	strlcpy(root->release_agent_path, buffer,
		sizeof(root->release_agent_path));
	spin_unlock(&release_agent_path_lock);
	mutex_unlock(&cgroup_mutex);
static int cgroup_release_agent_show(struct seq_file *seq, void *v)
	struct cgroup *cgrp = seq_css(seq)->cgroup;
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	seq_puts(seq, cgrp->root->release_agent_path);
	seq_putc(seq, '\n');
	mutex_unlock(&cgroup_mutex);
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
	struct cgroup *cgrp = seq_css(seq)->cgroup;

	seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
				 size_t nbytes, loff_t off)
	struct cgroup *cgrp = of->kn->parent->priv;
	struct cftype *cft = of->kn->priv;
	struct cgroup_subsys_state *css;
	/*
	 * kernfs guarantees that a file isn't deleted with operations in
	 * flight, which means that the matching css is and stays alive and
	 * doesn't need to be pinned.  The RCU locking is not necessary
	 * either.  It's just for the convenience of using cgroup_css().
	 */
	rcu_read_lock();
	css = cgroup_css(cgrp, cft->ss);
	rcu_read_unlock();

	if (cft->write_string) {
		ret = cft->write_string(css, cft, strstrip(buf));
	} else if (cft->write_u64) {
		unsigned long long v;
		ret = kstrtoull(buf, 0, &v);
		if (!ret)
			ret = cft->write_u64(css, cft, v);
	} else if (cft->write_s64) {
		long long v;
		ret = kstrtoll(buf, 0, &v);
		if (!ret)
			ret = cft->write_s64(css, cft, v);
	} else if (cft->trigger) {
		ret = cft->trigger(css, (unsigned int)cft->private);
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
	return seq_cft(seq)->seq_start(seq, ppos);
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
	return seq_cft(seq)->seq_next(seq, v, ppos);
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
	seq_cft(seq)->seq_stop(seq, v);
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
	struct cftype *cft = seq_cft(m);
	struct cgroup_subsys_state *css = seq_css(m);
	if (cft->seq_show)
		return cft->seq_show(m, arg);
		seq_printf(m, "%llu\n", cft->read_u64(css, cft));
	else if (cft->read_s64)
		seq_printf(m, "%lld\n", cft->read_s64(css, cft));
	else
		return -EINVAL;
	return 0;
static struct kernfs_ops cgroup_kf_single_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= cgroup_file_write,
	.seq_show		= cgroup_seqfile_show,
static struct kernfs_ops cgroup_kf_ops = {
	.atomic_write_len	= PAGE_SIZE,
	.write			= cgroup_file_write,
	.seq_start		= cgroup_seqfile_start,
	.seq_next		= cgroup_seqfile_next,
	.seq_stop		= cgroup_seqfile_stop,
	.seq_show		= cgroup_seqfile_show,
};

/*
 * cgroup_rename - Only allow simple rename of directories in place.
 */
static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
			 const char *new_name_str)
	struct cgroup *cgrp = kn->priv;
	struct cgroup_name *name, *old_name;
	int ret;
	if (kernfs_type(kn) != KERNFS_DIR)
	if (kn->parent != new_parent)
	/*
	 * This isn't a proper migration and its usefulness is very
	 * limited.  Disallow if sane_behavior.
	 */
	if (cgroup_sane_behavior(cgrp))
		return -EPERM;

	name = cgroup_alloc_name(new_name_str);
	if (!name)
		return -ENOMEM;

	mutex_lock(&cgroup_tree_mutex);
	mutex_lock(&cgroup_mutex);

	ret = kernfs_rename(kn, new_parent, new_name_str);
	if (!ret) {
		old_name = rcu_dereference_protected(cgrp->name, true);
		rcu_assign_pointer(cgrp->name, name);
	} else {
		old_name = name;
	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgroup_tree_mutex);

	kfree_rcu(old_name, rcu_head);
	return ret;
static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
	char name[CGROUP_FILE_NAME_MAX];
	struct kernfs_node *kn;
	struct lock_class_key *key = NULL;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	key = &cft->lockdep_key;
#endif
	kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
				  cgroup_file_mode(cft), 0, cft->kf_ops, cft,
				  NULL, false, key);
	if (IS_ERR(kn))
		return PTR_ERR(kn);
	return 0;
/**
 * cgroup_addrm_files - add or remove files to a cgroup directory
 * @cgrp: the target cgroup
 * @cfts: array of cftypes to be added
 * @is_add: whether to add or remove
 *
 * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
 * For removals, this function never fails.  If addition fails, this
 * function doesn't remove files already added.  The caller is responsible
 * for cleaning up.
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
			      bool is_add)
	struct cftype *cft;
	lockdep_assert_held(&cgroup_tree_mutex);

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* does cft->flags tell us to skip this file on @cgrp? */
		if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
			continue;
		if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
			continue;
		if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
			continue;

			ret = cgroup_add_file(cgrp, cft);
				pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
					cft->name, ret);
				return ret;
			}
		} else {
			cgroup_rm_file(cgrp, cft);
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
	struct cgroup_subsys *ss = cfts[0].ss;
	struct cgroup *root = &ss->root->top_cgroup;
	struct cgroup_subsys_state *css;
	lockdep_assert_held(&cgroup_tree_mutex);
	/* don't bother if @ss isn't attached */
	if (ss->root == &cgroup_dummy_root)

	/* add/rm files for all cgroups created before */
	css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
		ret = cgroup_addrm_files(cgrp, cfts, is_add);

	if (is_add && !ret)
		kernfs_activate(root->kn);
static void cgroup_exit_cftypes(struct cftype *cfts)
{
	struct cftype *cft;

	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		/* free copy for custom atomic_write_len, see init_cftypes() */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
			kfree(cft->kf_ops);
		cft->kf_ops = NULL;
		cft->ss = NULL;
	}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
	for (cft = cfts; cft->name[0] != '\0'; cft++) {
		struct kernfs_ops *kf_ops;

		WARN_ON(cft->ss || cft->kf_ops);

		if (cft->seq_start)
			kf_ops = &cgroup_kf_ops;
		else
			kf_ops = &cgroup_kf_single_ops;

		/*
		 * Ugh... if @cft wants a custom max_write_len, we need to
		 * make a copy of kf_ops to set its atomic_write_len.
		 */
		if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
			kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
			if (!kf_ops) {
				cgroup_exit_cftypes(cfts);
				return -ENOMEM;
			}
			kf_ops->atomic_write_len = cft->max_write_len;
		}

		cft->kf_ops = kf_ops;
	}

	return 0;
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
	lockdep_assert_held(&cgroup_tree_mutex);

	if (!cfts || !cfts[0].ss)
		return -ENOENT;

	list_del(&cfts->node);
	cgroup_apply_cftypes(cfts, false);
	cgroup_exit_cftypes(cfts);
	return 0;
}

/**
 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Unregister @cfts.  Files described by @cfts are removed from all
 * existing cgroups and all future cgroups won't have them either.  This
 * function can be called anytime whether @cfts' subsys is attached or not.
 *
 * Returns 0 on successful unregistration, -ENOENT if @cfts is not
 * registered.
 */
int cgroup_rm_cftypes(struct cftype *cfts)
{
	mutex_lock(&cgroup_tree_mutex);
	ret = cgroup_rm_cftypes_locked(cfts);
	mutex_unlock(&cgroup_tree_mutex);
	return ret;
/**
 * cgroup_add_cftypes - add an array of cftypes to a subsystem
 * @ss: target cgroup subsystem
 * @cfts: zero-length name terminated array of cftypes
 *
 * Register @cfts to @ss.  Files described by @cfts are created for all
 * existing cgroups to which @ss is attached and all future cgroups will
 * have them too.  This function can be called anytime whether @ss is
 * attached or not.
 *
 * Returns 0 on successful registration, -errno on failure.  Note that this
 * function currently returns 0 as long as @cfts registration is successful
 * even if some file creation attempts on existing cgroups fail.
 */
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
	ret = cgroup_init_cftypes(ss, cfts);
	if (ret)
		return ret;
	mutex_lock(&cgroup_tree_mutex);

	list_add_tail(&cfts->node, &ss->cfts);
	ret = cgroup_apply_cftypes(cfts, true);
		cgroup_rm_cftypes_locked(cfts);

	mutex_unlock(&cgroup_tree_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
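
/*
 * Registration sketch (hypothetical controller, not part of this file):
 * a cftype array is terminated by an entry with an empty name and is
 * registered with cgroup_add_cftypes(), typically from the controller's
 * init path.  The example_* names are illustrative only.
 */
static u64 example_weight_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return 100;			/* placeholder value */
}

static int example_weight_write(struct cgroup_subsys_state *css,
				struct cftype *cft, u64 val)
{
	return val ? 0 : -EINVAL;	/* placeholder validation */
}

static struct cftype example_files[] = {
	{
		.name		= "weight",
		.read_u64	= example_weight_read,
		.write_u64	= example_weight_write,
	},
	{ }	/* terminate */
};

/* from the hypothetical controller's init code:
 *	ret = cgroup_add_cftypes(&example_subsys, example_files);
 */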

/**
 * cgroup_task_count - count the number of tasks in a cgroup.
 * @cgrp: the cgroup in question
 *
 * Return the number of tasks in the cgroup.
 */
int cgroup_task_count(const struct cgroup *cgrp)
	struct cgrp_cset_link *link;
	list_for_each_entry(link, &cgrp->cset_links, cset_link)
		count += atomic_read(&link->cset->refcount);
	read_unlock(&css_set_lock);
 * To reduce the fork() overhead for systems that are not actually using
 * their cgroups capability, we don't maintain the lists running through
 * each css_set to its tasks until we see the list actually used - in other
 * words after the first call to css_task_iter_start().
static void cgroup_enable_task_cg_lists(void)
{
	struct task_struct *p, *g;
	write_lock(&css_set_lock);
	use_task_css_set_links = 1;
	/*
	 * We need tasklist_lock because RCU is not safe against
	 * while_each_thread(). Besides, a forking task that has passed
	 * cgroup_post_fork() without seeing use_task_css_set_links = 1
	 * is not guaranteed to have its child immediately visible in the
	 * tasklist if we walk through it with RCU.
	 */
	read_lock(&tasklist_lock);
	do_each_thread(g, p) {
		task_lock(p);
		/*
		 * We should check if the process is exiting, otherwise
		 * it will race with cgroup_exit() in that the list
		 * entry won't be deleted though the process has exited.
		 */
		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
			list_add(&p->cg_list, &task_css_set(p)->tasks);
		task_unlock(p);
	} while_each_thread(g, p);
 * css_next_child - find the next child of a given css
 * @pos_css: the current position (%NULL to initiate traversal)
 * @parent_css: css whose children to walk
 * This function returns the next child of @parent_css and should be called
 * under either cgroup_mutex or RCU read lock.  The only requirement is
 * that @parent_css and @pos_css are accessible.  The next sibling is
 * guaranteed to be returned regardless of their states.
struct cgroup_subsys_state *
css_next_child(struct cgroup_subsys_state *pos_css,
	       struct cgroup_subsys_state *parent_css)
	struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
	struct cgroup *cgrp = parent_css->cgroup;
	cgroup_assert_mutexes_or_rcu_locked();

	/*
	 * @pos could already have been removed.  Once a cgroup is removed,
	 * its ->sibling.next is no longer updated when its next sibling
	 * changes.  As CGRP_DEAD assertion is serialized and happens
	 * before the cgroup is taken off the ->sibling list, if we see it
	 * unasserted, it's guaranteed that the next sibling hasn't
	 * finished its grace period even if it's already removed, and thus
	 * safe to dereference from this RCU critical section.  If
	 * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
	 * to be visible as %true here.
	 *
	 * If @pos is dead, its next pointer can't be dereferenced;
	 * however, as each cgroup is given a monotonically increasing
	 * unique serial number and always appended to the sibling list,
	 * the next one can be found by walking the parent's children until
	 * we see a cgroup with higher serial number than @pos's.  While
	 * this path can be slower, it's taken only when either the current
	 * cgroup is removed or iteration and removal race.
	if (!pos) {
		next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
	} else if (likely(!cgroup_is_dead(pos))) {
		next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
	} else {
		list_for_each_entry_rcu(next, &cgrp->children, sibling)
			if (next->serial_nr > pos->serial_nr)
				break;
	if (&next->sibling == &cgrp->children)
		return NULL;

	return cgroup_css(next, parent_css->ss);
 * css_next_descendant_pre - find the next descendant for pre-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 * To be used by css_for_each_descendant_pre().  Find the next descendant
 * to visit for pre-order traversal of @root's descendants.  @root is
 * included in the iteration and the first node to be visited.
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @root are accessible and @pos is a descendant of @root.
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
			struct cgroup_subsys_state *root)
	cgroup_assert_mutexes_or_rcu_locked();

	/* visit the first child if exists */
	if (next)
		return next;

	/* no child, visit my or the closest ancestor's next sibling */
	while (pos != root) {
		next = css_next_child(pos, css_parent(pos));
EXPORT_SYMBOL_GPL(css_next_descendant_pre);
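
/*
 * Traversal sketch (hypothetical helper, not part of this file): a
 * pre-order walk over @root and all of its descendants under the RCU
 * read lock, as described above.  example_visit() is illustrative only.
 */
static void example_walk_pre(struct cgroup_subsys_state *root,
			     void (*example_visit)(struct cgroup_subsys_state *css))
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_pre(pos, root)
		example_visit(pos);
	rcu_read_unlock();
}
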
 * css_rightmost_descendant - return the rightmost descendant of a css
 * @pos: css of interest
 * Return the rightmost descendant of @pos.  If there's no descendant, @pos
 * is returned.  This can be used during pre-order traversal to skip
 * subtree of @pos.
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct rightmost descendant as
 * long as @pos is accessible.
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
	struct cgroup_subsys_state *last, *tmp;
	cgroup_assert_mutexes_or_rcu_locked();

	do {
		last = pos;
		/* ->prev isn't RCU safe, walk ->next till the end */
		pos = NULL;
			pos = tmp;
	} while (pos);

	return last;
}
EXPORT_SYMBOL_GPL(css_rightmost_descendant);
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
 * css_next_descendant_post - find the next descendant for post-order walk
 * @pos: the current position (%NULL to initiate traversal)
 * @root: css whose descendants to walk
 * To be used by css_for_each_descendant_post().  Find the next descendant
 * to visit for post-order traversal of @root's descendants.  @root is
 * included in the iteration and the last node to be visited.
 * While this function requires cgroup_mutex or RCU read locking, it
 * doesn't require the whole traversal to be contained in a single critical
 * section.  This function will return the correct next descendant as long
 * as both @pos and @cgroup are accessible and @pos is a descendant of
 * @cgroup.
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
			 struct cgroup_subsys_state *root)
	cgroup_assert_mutexes_or_rcu_locked();
	/* if first iteration, visit leftmost descendant which may be @root */
	if (!pos)
		return css_leftmost_descendant(root);
	/* if we visited @root, we're done */
	if (pos == root)
		return NULL;

	/* if there's an unvisited sibling, visit its leftmost descendant */
	next = css_next_child(pos, css_parent(pos));

	/* no sibling left, visit parent */
EXPORT_SYMBOL_GPL(css_next_descendant_post);
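
/*
 * Post-order sketch (hypothetical helper, not part of this file):
 * children are visited before their parent, which suits bottom-up
 * aggregation or teardown.  example_fold() is illustrative only.
 */
static void example_walk_post(struct cgroup_subsys_state *root,
			      void (*example_fold)(struct cgroup_subsys_state *css))
{
	struct cgroup_subsys_state *pos;

	rcu_read_lock();
	css_for_each_descendant_post(pos, root)
		example_fold(pos);
	rcu_read_unlock();
}
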
 * css_advance_task_iter - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.
static void css_advance_task_iter(struct css_task_iter *it)
{
	struct list_head *l = it->cset_link;
	struct cgrp_cset_link *link;
	struct css_set *cset;

	/* Advance to the next non-empty css_set */
	do {
		l = l->next;
		if (l == &it->origin_css->cgroup->cset_links) {
			it->cset_link = NULL;
			return;
		}
		link = list_entry(l, struct cgrp_cset_link, cset_link);
		cset = link->cset;
	} while (list_empty(&cset->tasks));
	it->cset_link = l;
	it->task = cset->tasks.next;
}

 * css_task_iter_start - initiate task iteration
 * @css: the css to walk tasks of
 * @it: the task iterator to use
 *
 * Initiate iteration through the tasks of @css.  The caller can call
 * css_task_iter_next() to walk through the tasks until the function
 * returns NULL.  On completion of iteration, css_task_iter_end() must be
 * called.
 *
 * Note that this function acquires a lock which is released when the
 * iteration finishes.  The caller can't sleep while iteration is in
 * progress.
 */
void css_task_iter_start(struct cgroup_subsys_state *css,
			 struct css_task_iter *it)
	 * The first time anyone tries to iterate across a css, we need to
	 * enable the list linking each css_set to its tasks, and fix up
	 * all existing tasks.
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	read_lock(&css_set_lock);
	it->origin_css = css;
	it->cset_link = &css->cgroup->cset_links;
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
	struct task_struct *res;
	struct list_head *l = it->task;
	struct cgrp_cset_link *link;

	/* If the iterator cg is NULL, we have no tasks */
		return NULL;
	res = list_entry(l, struct task_struct, cg_list);
	/* Advance iterator to find next entry */
	l = l->next;
	link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
	if (l == &link->cset->tasks) {
		/*
		 * We reached the end of this task list - move on to the
		 * next cgrp_cset_link.
		 */
 * css_task_iter_end - finish task iteration
 * @it: the task iterator to finish
 *
 * Finish task iteration started by css_task_iter_start().
void css_task_iter_end(struct css_task_iter *it)
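
/*
 * Iteration sketch (hypothetical helper, not part of this file):
 * walking the tasks attached to a css with the iterator API described
 * above.  The iterator holds a lock between _start() and _end(), so the
 * loop body must not sleep.
 */
static int example_count_tasks(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	struct task_struct *task;
	int count = 0;

	css_task_iter_start(css, &it);
	while ((task = css_task_iter_next(&it)))
		count++;
	css_task_iter_end(&it);

	return count;
}
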
static inline int started_after_time(struct task_struct *t1,
				     struct timespec *time,
				     struct task_struct *t2)
{
	int start_diff = timespec_compare(&t1->start_time, time);
	if (start_diff > 0) {
		return 1;
	} else if (start_diff < 0) {
		return 0;
	} else {
		/*
		 * Arbitrarily, if two processes started at the same
		 * time, we'll say that the lower pointer value
		 * started first. Note that t2 may have exited by now
		 * so this may not be a valid pointer any longer, but
		 * that's fine - it still serves to distinguish
		 * between two tasks started (effectively) simultaneously.
		 */
		return t1 > t2;
	}
}

/*
 * This function is a callback from heap_insert() and is used to order
 * the heap.
 * In this case we order the heap in descending task start time.
 */
static inline int started_after(void *p1, void *p2)
{
	struct task_struct *t1 = p1;
	struct task_struct *t2 = p2;
	return started_after_time(t1, &t2->start_time, t2);
}

/**
 * css_scan_tasks - iterate though all the tasks in a css
 * @css: the css to iterate tasks of
 * @test: optional test callback
 * @process: process callback
 * @data: data passed to @test and @process
 * @heap: optional pre-allocated heap used for task iteration
 * Iterate through all the tasks in @css, calling @test for each, and if it
 * returns %true, call @process for it also.
 * @test may be NULL, meaning always true (select all tasks), which
 * effectively duplicates css_task_iter_{start,next,end}() but does not
 * lock css_set_lock for the call to @process.
 *
 * It is guaranteed that @process will act on every task that is a member
 * of @css for the duration of this call.  This function may or may not
 * call @process for tasks that exit or move to a different css during the
 * call, or are forked or move into the css during the call.
 * Note that @test may be called with locks held, and may in some
 * situations be called multiple times for the same task, so it should be
 * cheap.
 * If @heap is non-NULL, a heap has been pre-allocated and will be used for
 * heap operations (and its "gt" member will be overwritten), else a
 * temporary heap will be used (allocation of which may cause this function
 * to fail).
int css_scan_tasks(struct cgroup_subsys_state *css,
		   bool (*test)(struct task_struct *, void *),
		   void (*process)(struct task_struct *, void *),
		   void *data, struct ptr_heap *heap)
	struct task_struct *p, *dropped;
	/* Never dereference latest_task, since it's not refcounted */
	struct task_struct *latest_task = NULL;
	struct ptr_heap tmp_heap;
	struct timespec latest_time = { 0, 0 };

	if (heap) {
		/* The caller supplied our heap and pre-allocated its memory */
		heap->gt = &started_after;
	} else {
		/* We need to allocate our own heap memory */
		heap = &tmp_heap;
		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		if (retval)
			/* cannot allocate the heap */
			return retval;
	}

 again:
	/*
	 * Scan tasks in the css, using the @test callback to determine
	 * which are of interest, and invoking @process callback on the
	 * ones which need an update.  Since we don't want to hold any
	 * locks during the task updates, gather tasks to be processed in a
	 * heap structure.  The heap is sorted by descending task start
	 * time.  If the statically-sized heap fills up, we overflow tasks
	 * that started later, and in future iterations only consider tasks
	 * that started after the latest task in the previous pass. This
	 * guarantees forward progress and that we don't miss any tasks.
	 */
	heap->size = 0;
	css_task_iter_start(css, &it);
	while ((p = css_task_iter_next(&it))) {
		/*
		 * Only affect tasks that qualify per the caller's callback,
		 * if one was provided
		 */
		if (test && !test(p, data))
			continue;
		/*
		 * Only process tasks that started after the last task
		 * we processed
		 */
		if (!started_after_time(p, &latest_time, latest_task))
			continue;
		dropped = heap_insert(heap, p);
		if (dropped == NULL) {
			/*
			 * The new task was inserted; the heap wasn't
			 * previously full
			 */
			get_task_struct(p);
		} else if (dropped != p) {
			/*
			 * The new task was inserted, and pushed out a
			 * different task
			 */
			get_task_struct(p);
			put_task_struct(dropped);
		}
		/*
		 * Else the new task was newer than anything already in
		 * the heap and wasn't inserted
		 */
	}

	if (heap->size) {
		for (i = 0; i < heap->size; i++) {
			struct task_struct *q = heap->ptrs[i];
				latest_time = q->start_time;
				latest_task = q;
			}
			/* Process the task per the caller's callback */
			process(q, data);
		}
		/*
		 * If we had to process any tasks at all, scan again
		 * in case some of them were in the middle of forking
		 * children that didn't get processed.
		 * Not the most efficient way to do it, but it avoids
		 * having to take callback_mutex in the fork path
		 */
		goto again;
	}
	if (heap == &tmp_heap)
		heap_free(&tmp_heap);
	return 0;
}
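
/*
 * Scan sketch (hypothetical caller, not part of this file): using
 * css_scan_tasks() with a @test callback that filters for kernel
 * threads; @process is then invoked without css_set_lock held.  The
 * example_* names are illustrative only.
 */
static bool example_is_kthread(struct task_struct *task, void *data)
{
	return task->flags & PF_KTHREAD;
}

static void example_count_one(struct task_struct *task, void *data)
{
	atomic_inc((atomic_t *)data);	/* caller passes an atomic_t via @data */
}

static int example_count_kthreads(struct cgroup_subsys_state *css,
				  atomic_t *counter)
{
	return css_scan_tasks(css, example_is_kthread, example_count_one,
			      counter, NULL);
}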

static void cgroup_transfer_one_task(struct task_struct *task, void *data)
	struct cgroup *new_cgroup = data;
	mutex_lock(&cgroup_mutex);
	cgroup_attach_task(new_cgroup, task, false);
	mutex_unlock(&cgroup_mutex);
}

/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
			      to, NULL);
 * Stuff for reading the 'tasks'/'procs' files.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 *
 */

/* which pidlist file are we talking about? */
enum cgroup_filetype {
	CGROUP_FILE_PROCS,
	CGROUP_FILE_TASKS,
};

/*
 * A pidlist is a list of pids that virtually represents the contents of one
 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
 * a pair (one each for procs, tasks) for each pid namespace that's relevant
 * to the cgroup.