Newer
Older
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
LIST_HEAD(preloaded_csets);
struct task_struct *task;
int ret;
/* look up all src csets */
down_read(&css_set_rwsem);
rcu_read_lock();
task = leader;
do {
cgroup_migrate_add_src(task_css_set(task), dst_cgrp,
&preloaded_csets);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
up_read(&css_set_rwsem);
/* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
if (!ret)
ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
cgroup_migrate_finish(&preloaded_csets);
return ret;
}
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
* function to attach either it or all tasks in its threadgroup. Will lock
* cgroup_mutex and threadgroup.
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
David Howells
committed
const struct cred *cred = current_cred(), *tcred;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
retry_find_task:
rcu_read_lock();
goto out_unlock_cgroup;
/*
* even if we're attaching all tasks in the thread group, we
* only need to check permissions on one of them.
*/
David Howells
committed
tcred = __task_cred(tsk);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
!uid_eq(cred->euid, tcred->suid)) {
David Howells
committed
rcu_read_unlock();
ret = -EACCES;
goto out_unlock_cgroup;
} else
tsk = current;
tsk = tsk->group_leader;
/*
* Workqueue threads may acquire PF_NO_SETAFFINITY and become
* trapped in a cpuset, or RT worker may be born in a cgroup
* with no rt_runtime allocated. Just say no.
*/
if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
rcu_read_unlock();
goto out_unlock_cgroup;
}
get_task_struct(tsk);
rcu_read_unlock();
threadgroup_lock(tsk);
if (threadgroup) {
if (!thread_group_leader(tsk)) {
/*
* a race with de_thread from another thread's exec()
* may strip us of our leadership, if this happens,
* there is no choice but to throw this task away and
* try again; this is
* "double-double-toil-and-trouble-check locking".
*/
threadgroup_unlock(tsk);
put_task_struct(tsk);
goto retry_find_task;
}
}
ret = cgroup_attach_task(cgrp, tsk, threadgroup);
threadgroup_unlock(tsk);
out_unlock_cgroup:
/**
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
* @tsk: the task to be attached
*/
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
struct cgroupfs_root *root;
int retval = 0;
for_each_root(root) {
struct cgroup *from_cgrp;
if (root == &cgroup_dummy_root)
continue;
down_read(&css_set_rwsem);
from_cgrp = task_cgroup_from_root(from, root);
up_read(&css_set_rwsem);
retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
break;
}
return retval;
}
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
static int cgroup_tasks_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 pid)
return attach_task_by_pid(css->cgroup, pid, false);
static int cgroup_procs_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 tgid)
{
return attach_task_by_pid(css->cgroup, tgid, true);
}
static int cgroup_release_agent_write(struct cgroup_subsys_state *css,
struct cftype *cft, const char *buffer)
struct cgroupfs_root *root = css->cgroup->root;
BUILD_BUG_ON(sizeof(root->release_agent_path) < PATH_MAX);
if (!cgroup_lock_live_group(css->cgroup))
return -ENODEV;
spin_lock(&release_agent_path_lock);
strlcpy(root->release_agent_path, buffer,
sizeof(root->release_agent_path));
spin_unlock(&release_agent_path_lock);
return 0;
}
static int cgroup_release_agent_show(struct seq_file *seq, void *v)
struct cgroup *cgrp = seq_css(seq)->cgroup;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
seq_puts(seq, cgrp->root->release_agent_path);
seq_putc(seq, '\n');
return 0;
}
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
struct cgroup *cgrp = seq_css(seq)->cgroup;
seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
return 0;
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of->kn->priv;
struct cgroup_subsys_state *css;
/*
* kernfs guarantees that a file isn't deleted with operations in
* flight, which means that the matching css is and stays alive and
* doesn't need to be pinned. The RCU locking is not necessary
* either. It's just for the convenience of using cgroup_css().
*/
rcu_read_lock();
css = cgroup_css(cgrp, cft->ss);
rcu_read_unlock();
if (cft->write_string) {
ret = cft->write_string(css, cft, strstrip(buf));
} else if (cft->write_u64) {
unsigned long long v;
ret = kstrtoull(buf, 0, &v);
if (!ret)
ret = cft->write_u64(css, cft, v);
} else if (cft->write_s64) {
long long v;
ret = kstrtoll(buf, 0, &v);
if (!ret)
ret = cft->write_s64(css, cft, v);
} else if (cft->trigger) {
ret = cft->trigger(css, (unsigned int)cft->private);
return ret ?: nbytes;
static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
struct cftype *cft = seq_cft(m);
struct cgroup_subsys_state *css = seq_css(m);
if (cft->seq_show)
return cft->seq_show(m, arg);
if (cft->read_u64)
seq_printf(m, "%llu\n", cft->read_u64(css, cft));
else if (cft->read_s64)
seq_printf(m, "%lld\n", cft->read_s64(css, cft));
else
return -EINVAL;
return 0;
static struct kernfs_ops cgroup_kf_single_ops = {
.atomic_write_len = PAGE_SIZE,
.write = cgroup_file_write,
.seq_show = cgroup_seqfile_show,
static struct kernfs_ops cgroup_kf_ops = {
.atomic_write_len = PAGE_SIZE,
.write = cgroup_file_write,
.seq_start = cgroup_seqfile_start,
.seq_next = cgroup_seqfile_next,
.seq_stop = cgroup_seqfile_stop,
.seq_show = cgroup_seqfile_show,
};
/*
* cgroup_rename - Only allow simple rename of directories in place.
*/
static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
const char *new_name_str)
/*
* This isn't a proper migration and its usefulness is very
* limited. Disallow if sane_behavior.
*/
if (cgroup_sane_behavior(cgrp))
return -EPERM;
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
ret = kernfs_rename(kn, new_parent, new_name_str);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
return ret;
static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
struct kernfs_node *kn;
struct lock_class_key *key = NULL;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
key = &cft->lockdep_key;
#endif
kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
cgroup_file_mode(cft), 0, cft->kf_ops, cft,
NULL, false, key);
/**
* cgroup_addrm_files - add or remove files to a cgroup directory
* @cgrp: the target cgroup
* @cfts: array of cftypes to be added
* @is_add: whether to add or remove
*
* Depending on @is_add, add or remove files defined by @cfts on @cgrp.
* For removals, this function never fails. If addition fails, this
* function doesn't remove files already added. The caller is responsible
* for cleaning up.
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add)
lockdep_assert_held(&cgroup_tree_mutex);
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
continue;
if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
continue;
ret = cgroup_add_file(cgrp, cft);
pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n",
cft->name, ret);
return ret;
}
} else {
cgroup_rm_file(cgrp, cft);
static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
{
LIST_HEAD(pending);
struct cgroup_subsys *ss = cfts[0].ss;
Tejun Heo
committed
struct cgroup *root = &ss->root->top_cgroup;
struct cgroup_subsys_state *css;
lockdep_assert_held(&cgroup_tree_mutex);
/* don't bother if @ss isn't attached */
if (ss->root == &cgroup_dummy_root)
/* add/rm files for all cgroups created before */
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
Tejun Heo
committed
struct cgroup *cgrp = css->cgroup;
if (cgroup_is_dead(cgrp))
continue;
ret = cgroup_addrm_files(cgrp, cfts, is_add);
if (is_add && !ret)
kernfs_activate(root->kn);
static void cgroup_exit_cftypes(struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* free copy for custom atomic_write_len, see init_cftypes() */
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
kfree(cft->kf_ops);
cft->kf_ops = NULL;
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
kf_ops = &cgroup_kf_single_ops;
/*
* Ugh... if @cft wants a custom max_write_len, we need to
* make a copy of kf_ops to set its atomic_write_len.
*/
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
cgroup_exit_cftypes(cfts);
return -ENOMEM;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_tree_mutex);
if (!cfts || !cfts[0].ss)
return -ENOENT;
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
return 0;
}
/**
* cgroup_rm_cftypes - remove an array of cftypes from a subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Unregister @cfts. Files described by @cfts are removed from all
* existing cgroups and all future cgroups won't have them either. This
* function can be called anytime whether @cfts' subsys is attached or not.
*
* Returns 0 on successful unregistration, -ENOENT if @cfts is not
* registered.
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
mutex_lock(&cgroup_tree_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_tree_mutex);
return ret;
/**
* cgroup_add_cftypes - add an array of cftypes to a subsystem
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Register @cfts to @ss. Files described by @cfts are created for all
* existing cgroups to which @ss is attached and all future cgroups will
* have them too. This function can be called anytime whether @ss is
* attached or not.
*
* Returns 0 on successful registration, -errno on failure. Note that this
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (!cfts || cfts[0].name[0] == '\0')
return 0;
ret = cgroup_init_cftypes(ss, cfts);
if (ret)
return ret;
mutex_lock(&cgroup_tree_mutex);
ret = cgroup_apply_cftypes(cfts, true);
cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_tree_mutex);
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
* Return the number of tasks in the cgroup.
*/
static int cgroup_task_count(const struct cgroup *cgrp)
struct cgrp_cset_link *link;
down_read(&css_set_rwsem);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
up_read(&css_set_rwsem);
Tejun Heo
committed
* css_next_child - find the next child of a given css
* @pos_css: the current position (%NULL to initiate traversal)
* @parent_css: css whose children to walk
Tejun Heo
committed
* This function returns the next child of @parent_css and should be called
* under either cgroup_mutex or RCU read lock. The only requirement is
* that @parent_css and @pos_css are accessible. The next sibling is
* guaranteed to be returned regardless of their states.
Tejun Heo
committed
struct cgroup_subsys_state *
css_next_child(struct cgroup_subsys_state *pos_css,
struct cgroup_subsys_state *parent_css)
Tejun Heo
committed
struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
struct cgroup *cgrp = parent_css->cgroup;
struct cgroup *next;
/*
* @pos could already have been removed. Once a cgroup is removed,
* its ->sibling.next is no longer updated when its next sibling
* changes. As CGRP_DEAD assertion is serialized and happens
* before the cgroup is taken off the ->sibling list, if we see it
* unasserted, it's guaranteed that the next sibling hasn't
* finished its grace period even if it's already removed, and thus
* safe to dereference from this RCU critical section. If
* ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
* to be visible as %true here.
*
* If @pos is dead, its next pointer can't be dereferenced;
* however, as each cgroup is given a monotonically increasing
* unique serial number and always appended to the sibling list,
* the next one can be found by walking the parent's children until
* we see a cgroup with higher serial number than @pos's. While
* this path can be slower, it's taken only when either the current
* cgroup is removed or iteration and removal race.
if (!pos) {
next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
} else if (likely(!cgroup_is_dead(pos))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
} else {
list_for_each_entry_rcu(next, &cgrp->children, sibling)
if (next->serial_nr > pos->serial_nr)
break;
Tejun Heo
committed
if (&next->sibling == &cgrp->children)
return NULL;
return cgroup_css(next, parent_css->ss);
Tejun Heo
committed
* css_next_descendant_pre - find the next descendant for pre-order walk
* @pos: the current position (%NULL to initiate traversal)
Tejun Heo
committed
* @root: css whose descendants to walk
Tejun Heo
committed
* To be used by css_for_each_descendant_pre(). Find the next descendant
Tejun Heo
committed
* to visit for pre-order traversal of @root's descendants. @root is
* included in the iteration and the first node to be visited.
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @root are accessible and @pos is a descendant of @root.
Tejun Heo
committed
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
Tejun Heo
committed
struct cgroup_subsys_state *next;
Tejun Heo
committed
/* if first iteration, visit @root */
Tejun Heo
committed
return root;
/* visit the first child if exists */
Tejun Heo
committed
next = css_next_child(NULL, pos);
if (next)
return next;
/* no child, visit my or the closest ancestor's next sibling */
Tejun Heo
committed
while (pos != root) {
next = css_next_child(pos, css_parent(pos));
Tejun Heo
committed
pos = css_parent(pos);
return NULL;
}
Tejun Heo
committed
* css_rightmost_descendant - return the rightmost descendant of a css
* @pos: css of interest
Tejun Heo
committed
* Return the rightmost descendant of @pos. If there's no descendant, @pos
* is returned. This can be used during pre-order traversal to skip
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct rightmost descendant as
* long as @pos is accessible.
Tejun Heo
committed
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
Tejun Heo
committed
struct cgroup_subsys_state *last, *tmp;
do {
last = pos;
/* ->prev isn't RCU safe, walk ->next till the end */
pos = NULL;
Tejun Heo
committed
css_for_each_child(tmp, last)
pos = tmp;
} while (pos);
return last;
}
Tejun Heo
committed
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
Tejun Heo
committed
struct cgroup_subsys_state *last;
do {
last = pos;
Tejun Heo
committed
pos = css_next_child(NULL, pos);
} while (pos);
return last;
}
/**
Tejun Heo
committed
* css_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
Tejun Heo
committed
* @root: css whose descendants to walk
Tejun Heo
committed
* To be used by css_for_each_descendant_post(). Find the next descendant
Tejun Heo
committed
* to visit for post-order traversal of @root's descendants. @root is
* included in the iteration and the last node to be visited.
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @cgroup are accessible and @pos is a descendant of
* @cgroup.
Tejun Heo
committed
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
Tejun Heo
committed
struct cgroup_subsys_state *next;
/* if first iteration, visit leftmost descendant which may be @root */
if (!pos)
return css_leftmost_descendant(root);
Tejun Heo
committed
/* if we visited @root, we're done */
if (pos == root)
return NULL;
/* if there's an unvisited sibling, visit its leftmost descendant */
Tejun Heo
committed
next = css_next_child(pos, css_parent(pos));
Tejun Heo
committed
return css_leftmost_descendant(next);
/* no sibling left, visit parent */
Tejun Heo
committed
return css_parent(pos);
* css_advance_task_iter - advance a task itererator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
static void css_advance_task_iter(struct css_task_iter *it)
{
struct list_head *l = it->cset_link;
struct cgrp_cset_link *link;
struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
if (l == &it->origin_css->cgroup->cset_links) {
it->cset_link = NULL;
return;
}
link = list_entry(l, struct cgrp_cset_link, cset_link);
cset = link->cset;
} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
if (!list_empty(&cset->tasks))
it->task = cset->tasks.next;
else
it->task = cset->mg_tasks.next;
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*
* Note that this function acquires a lock which is released when the
* iteration finishes. The caller can't sleep while iteration is in
* progress.
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
__acquires(css_set_rwsem)
/* no one should try to iterate before mounting cgroups */
WARN_ON_ONCE(!use_task_css_set_links);
down_read(&css_set_rwsem);
it->origin_css = css;
it->cset_link = &css->cgroup->cset_links;
css_advance_task_iter(it);
* css_task_iter_next - return the next task for the iterator
* @it: the task iterator being iterated
*
* The "next" function for task iteration. @it should have been
* initialized via css_task_iter_start(). Returns NULL when the iteration
* reaches the end.
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
struct list_head *l = it->task;
struct cgrp_cset_link *link = list_entry(it->cset_link,
struct cgrp_cset_link, cset_link);
/* If the iterator cg is NULL, we have no tasks */
if (!it->cset_link)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
* next cset.
*/
if (l == &link->cset->tasks)
l = link->cset->mg_tasks.next;
if (l == &link->cset->mg_tasks)
css_advance_task_iter(it);
return res;
}
* css_task_iter_end - finish task iteration
* @it: the task iterator to finish
*
* Finish task iteration started by css_task_iter_start().
void css_task_iter_end(struct css_task_iter *it)
__releases(css_set_rwsem)
up_read(&css_set_rwsem);
Tejun Heo
committed
/**
* cgroup_trasnsfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*
* Locking rules between cgroup_post_fork() and the migration path
* guarantee that, if a task is forking while being migrated, the new child
* is guaranteed to be either visible in the source cgroup after the
* parent's migration is complete or put into the target cgroup. No task
* can slip out of migration through forking.
Tejun Heo
committed
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
LIST_HEAD(preloaded_csets);
struct cgrp_cset_link *link;
struct css_task_iter it;
struct task_struct *task;
int ret;
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
down_read(&css_set_rwsem);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
up_read(&css_set_rwsem);
ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
if (ret)
goto out_err;
/*
* Migrate tasks one-by-one until @form is empty. This fails iff
* ->can_attach() fails.
*/
do {
css_task_iter_start(&from->dummy_css, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
css_task_iter_end(&it);
if (task) {
ret = cgroup_migrate(to, task, false);
put_task_struct(task);
}
} while (task && !ret);
out_err:
cgroup_migrate_finish(&preloaded_csets);
mutex_unlock(&cgroup_mutex);
return ret;
Tejun Heo
committed
}
Ben Blum
committed
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS,
CGROUP_FILE_TASKS,
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* for delayed destruction */
struct delayed_work destroy_dwork;
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
return vmalloc(count * sizeof(pid_t));
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
vfree(p);
else
kfree(p);
}
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
struct cgroup_pidlist *l, *tmp_l;
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
mutex_unlock(&cgrp->pidlist_mutex);
flush_workqueue(cgroup_pidlist_destroy_wq);
BUG_ON(!list_empty(&cgrp->pidlists));
}
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
destroy_dwork);
struct cgroup_pidlist *tofree = NULL;
mutex_lock(&l->owner->pidlist_mutex);
/*
* Destroy iff we didn't get queued again. The state won't change
* as destroy_dwork can only be queued while locked.
if (!delayed_work_pending(dwork)) {
list_del(&l->links);
pidlist_free(l->list);
put_pid_ns(l->key.ns);
tofree = l;
}
mutex_unlock(&l->owner->pidlist_mutex);
kfree(tofree);
}
Ben Blum
committed
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
static int pidlist_uniq(pid_t *list, int length)
Ben Blum
committed
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
int src, dest = 1;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after: