/*
* Ugh... if @cft wants a custom max_write_len, we need to
* make a copy of kf_ops to set its atomic_write_len.
*/
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
cgroup_exit_cftypes(cfts);
return -ENOMEM;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
if (!cfts || !cfts[0].ss)
return -ENOENT;
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
return 0;
}
/**
* cgroup_rm_cftypes - remove an array of cftypes from a subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Unregister @cfts. Files described by @cfts are removed from all
* existing cgroups and all future cgroups won't have them either. This
* function can be called anytime whether @cfts' subsys is attached or not.
*
* Returns 0 on successful unregistration, -ENOENT if @cfts is not
* registered.
*/
int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
return ret;
}
/**
* cgroup_add_cftypes - add an array of cftypes to a subsystem
* @ss: target cgroup subsystem
* @cfts: zero-length name terminated array of cftypes
*
* Register @cfts to @ss. Files described by @cfts are created for all
* existing cgroups to which @ss is attached and all future cgroups will
* have them too. This function can be called anytime whether @ss is
* attached or not.
*
* Returns 0 on successful registration, -errno on failure. Note that this
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
if (!cfts || cfts[0].name[0] == '\0')
return 0;
ret = cgroup_init_cftypes(ss, cfts);
if (ret)
return ret;
mutex_lock(&cgroup_mutex);
list_add_tail(&cfts->node, &ss->cfts);
ret = cgroup_apply_cftypes(cfts, true);
if (ret)
cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
return ret;
}
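/*
* Illustrative usage sketch (not from this file): a controller registers a
* zero-length-name terminated cftype array once, typically from its init
* code. The controller ("my_subsys") and its file ("my.value",
* my_value_show()) are hypothetical, shown only to make the calling
* convention concrete:
*
*	static struct cftype my_files[] = {
*		{ .name = "my.value", .seq_show = my_value_show },
*		{ }	// zero-length name terminates the array
*	};
*
*	WARN_ON(cgroup_add_cftypes(&my_subsys, my_files));
*/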
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
* Return the number of tasks in the cgroup.
*/
static int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
down_read(&css_set_rwsem);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
up_read(&css_set_rwsem);
return count;
}
/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
*
* This function returns the next child of @parent and should be called
* under either cgroup_mutex or RCU read lock. The only requirement is
* that @parent and @pos are accessible. The next sibling is guaranteed to
* be returned regardless of their states.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *parent)
{
struct cgroup_subsys_state *next;
/*
* @pos could already have been unlinked from the sibling list.
* Once a cgroup is removed, its ->sibling.next is no longer
* updated when its next sibling changes. CSS_RELEASED is set when
* @pos is taken off list, at which time its next pointer is valid,
* and, as releases are serialized, the one pointed to by the next
* pointer is guaranteed to not have started release yet. This
* implies that if we observe !CSS_RELEASED on @pos in this RCU
* critical section, the one pointed to by its next pointer is
* guaranteed to not have finished its RCU grace period even if we
* have dropped rcu_read_lock() inbetween iterations.
* If @pos has CSS_RELEASED set, its next pointer can't be
* dereferenced; however, as each css is given a monotonically
* increasing unique serial number and always appended to the
* sibling list, the next one can be found by walking the parent's
* children until the first css with higher serial number than
* @pos's. While this path can be slower, it happens iff iteration
* races against release and the race window is very small.
*/
if (!pos) {
next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
} else if (likely(!(pos->flags & CSS_RELEASED))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
list_for_each_entry_rcu(next, &parent->children, sibling)
if (next->serial_nr > pos->serial_nr)
break;
}
/*
* @next, if not pointing to the head, can be dereferenced and is
* the next sibling.
*/
if (&next->sibling != &parent->children)
return next;
return NULL;
}
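/*
* Illustrative sketch of the usual calling pattern layered on top of
* css_next_child() via the css_for_each_child() helper; the pr_debug()
* body stands in for subsystem-specific work:
*
*	struct cgroup_subsys_state *child;
*
*	rcu_read_lock();
*	css_for_each_child(child, parent_css)
*		pr_debug("child css %p\n", child);
*	rcu_read_unlock();
*/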
/**
* css_next_descendant_pre - find the next descendant for pre-order walk
* @pos: the current position (%NULL to initiate traversal)
* @root: css whose descendants to walk
*
* To be used by css_for_each_descendant_pre(). Find the next descendant
* to visit for pre-order traversal of @root's descendants. @root is
* included in the iteration and the first node to be visited.
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @root are accessible and @pos is a descendant of @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
{
struct cgroup_subsys_state *next;
/* if first iteration, visit @root */
if (!pos)
return root;
/* visit the first child if exists */
next = css_next_child(NULL, pos);
if (next)
return next;
/* no child, visit my or the closest ancestor's next sibling */
while (pos != root) {
next = css_next_child(pos, pos->parent);
if (next)
return next;
pos = pos->parent;
}
return NULL;
}
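/*
* Illustrative sketch: pre-order descendant walks normally go through the
* css_for_each_descendant_pre() helper, which visits @root first and then
* each child subtree top-down:
*
*	struct cgroup_subsys_state *pos;
*
*	rcu_read_lock();
*	css_for_each_descendant_pre(pos, root_css)
*		pr_debug("visiting css %p\n", pos);
*	rcu_read_unlock();
*/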
/**
* css_rightmost_descendant - return the rightmost descendant of a css
* @pos: css of interest
*
* Return the rightmost descendant of @pos. If there's no descendant, @pos
* is returned. This can be used during pre-order traversal to skip
* subtree of @pos.
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct rightmost descendant as
* long as @pos is accessible.
*/
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last, *tmp;
do {
last = pos;
/* ->prev isn't RCU safe, walk ->next till the end */
pos = NULL;
css_for_each_child(tmp, last)
pos = tmp;
} while (pos);
return last;
}
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
{
struct cgroup_subsys_state *last;
do {
last = pos;
pos = css_next_child(NULL, pos);
} while (pos);
return last;
}
/**
* css_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
* @root: css whose descendants to walk
*
* To be used by css_for_each_descendant_post(). Find the next descendant
* to visit for post-order traversal of @root's descendants. @root is
* included in the iteration and the last node to be visited.
*
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @root are accessible and @pos is a descendant of
* @root.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
*/
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
{
struct cgroup_subsys_state *next;
/* if first iteration, visit leftmost descendant which may be @root */
if (!pos)
return css_leftmost_descendant(root);
/* if we visited @root, we're done */
if (pos == root)
return NULL;
/* if there's an unvisited sibling, visit its leftmost descendant */
next = css_next_child(pos, pos->parent);
if (next)
return css_leftmost_descendant(next);
/* no sibling left, visit parent */
return pos->parent;
}
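/*
* Illustrative sketch: the css_for_each_descendant_post() helper built on
* the function above visits every child before its parent (with @root
* last), the natural order for bottom-up teardown-style operations:
*
*	struct cgroup_subsys_state *pos;
*
*	rcu_read_lock();
*	css_for_each_descendant_post(pos, root_css)
*		pr_debug("visiting css %p\n", pos);
*	rcu_read_unlock();
*/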
/**
* css_has_online_children - does a css have online children
* @css: the target css
*
* Returns %true if @css has any online children; otherwise, %false. This
* function can be called from any context but the caller is responsible
* for synchronizing against on/offlining as necessary.
*/
bool css_has_online_children(struct cgroup_subsys_state *css)
{
struct cgroup_subsys_state *child;
bool ret = false;
rcu_read_lock();
css_for_each_child(child, css) {
if (child->flags & CSS_ONLINE) {
ret = true;
break;
}
}
rcu_read_unlock();
return ret;
}
/**
* css_advance_task_iter - advance a task iterator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
*/
static void css_advance_task_iter(struct css_task_iter *it)
{
struct list_head *l = it->cset_pos;
struct cgrp_cset_link *link;
struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
if (l == it->cset_head) {
it->cset_pos = NULL;
return;
}
if (it->ss) {
cset = container_of(l, struct css_set,
e_cset_node[it->ss->id]);
} else {
link = list_entry(l, struct cgrp_cset_link, cset_link);
cset = link->cset;
}
} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
it->cset_pos = l;
it->task_pos = cset->tasks.next;
if (list_empty(&cset->tasks))
it->task_pos = cset->mg_tasks.next;
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
}
/**
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*
* Note that this function acquires a lock which is released when the
* iteration finishes. The caller can't sleep while iteration is in
* progress.
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
__acquires(css_set_rwsem)
{
/* no one should try to iterate before mounting cgroups */
WARN_ON_ONCE(!use_task_css_set_links);
down_read(&css_set_rwsem);
it->ss = css->ss;
if (it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
else
it->cset_pos = &css->cgroup->cset_links;
css_advance_task_iter(it);
}
/**
* css_task_iter_next - return the next task for the iterator
* @it: the task iterator being iterated
*
* The "next" function for task iteration. @it should have been
* initialized via css_task_iter_start(). Returns NULL when the iteration
* reaches the end.
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
struct list_head *l = it->task_pos;
/* If the iterator cg is NULL, we have no tasks */
if (!it->cset_pos)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
* next cset.
*/
l = l->next;
if (l == it->tasks_head)
l = it->mg_tasks_head->next;
if (l == it->mg_tasks_head)
css_advance_task_iter(it);
else
it->task_pos = l;
return res;
}
/**
* css_task_iter_end - finish task iteration
* @it: the task iterator to finish
*
* Finish task iteration started by css_task_iter_start().
*/
void css_task_iter_end(struct css_task_iter *it)
__releases(css_set_rwsem)
{
up_read(&css_set_rwsem);
}
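/*
* Illustrative sketch of the iterator triplet above. css_set_rwsem stays
* read-held for the whole walk, so the loop body must not sleep:
*
*	struct css_task_iter it;
*	struct task_struct *task;
*
*	css_task_iter_start(css, &it);
*	while ((task = css_task_iter_next(&it)))
*		pr_debug("pid %d\n", task_pid_nr(task));
*	css_task_iter_end(&it);
*/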
/**
* cgroup_transfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*
* Locking rules between cgroup_post_fork() and the migration path
* guarantee that, if a task is forking while being migrated, the new child
* is guaranteed to be either visible in the source cgroup after the
* parent's migration is complete or put into the target cgroup. No task
* can slip out of migration through forking.
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
LIST_HEAD(preloaded_csets);
struct cgrp_cset_link *link;
struct css_task_iter it;
struct task_struct *task;
int ret;
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
down_read(&css_set_rwsem);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
up_read(&css_set_rwsem);
ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
if (ret)
goto out_err;
/*
* Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
css_task_iter_start(&from->self, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
css_task_iter_end(&it);
if (task) {
ret = cgroup_migrate(to, task, false);
put_task_struct(task);
}
} while (task && !ret);
out_err:
cgroup_migrate_finish(&preloaded_csets);
mutex_unlock(&cgroup_mutex);
return ret;
}
/*
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS,
CGROUP_FILE_TASKS,
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* for delayed destruction */
struct delayed_work destroy_dwork;
};
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
return vmalloc(count * sizeof(pid_t));
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
vfree(p);
else
kfree(p);
}
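/*
* For scale (assuming the common 4K PAGE_SIZE and a 4-byte pid_t):
* PIDLIST_TOO_LARGE() trips once a list exceeds 2048 entries, since
* 2049 * 4 = 8196 > 2 * 4096, so short lists stay on kmalloc() and only
* very large cgroups fall back to vmalloc().
*/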
/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
struct cgroup_pidlist *l, *tmp_l;
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
mutex_unlock(&cgrp->pidlist_mutex);
flush_workqueue(cgroup_pidlist_destroy_wq);
BUG_ON(!list_empty(&cgrp->pidlists));
}
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
destroy_dwork);
struct cgroup_pidlist *tofree = NULL;
mutex_lock(&l->owner->pidlist_mutex);
/*
* Destroy iff we didn't get queued again. The state won't change
* as destroy_dwork can only be queued while locked.
*/
if (!delayed_work_pending(dwork)) {
list_del(&l->links);
pidlist_free(l->list);
put_pid_ns(l->key.ns);
tofree = l;
}
mutex_unlock(&l->owner->pidlist_mutex);
kfree(tofree);
}
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
*/
static int pidlist_uniq(pid_t *list, int length)
{
int src, dest = 1;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after:
return dest;
}
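/*
* Worked example: for the sorted input {3, 3, 5, 7, 7, 7} with length 6,
* the loop compacts the array in place to {3, 5, 7, ...} and returns 3;
* entries beyond the returned length are simply left stale.
*/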
/*
* The two pid files - tasks and cgroup.procs - guaranteed that the result
* is sorted, which forced this whole pidlist fiasco. As pid order is
* different per namespace, each namespace needs differently sorted list,
* making it impossible to use, for example, single rbtree of member tasks
* sorted by task pointer. As pidlists can be fairly large, allocating one
* per open file is dangerous, so cgroup had to implement shared pool of
* pidlists keyed by cgroup and namespace.
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
* want to do away with it. Explicitly scramble sort order if
* sane_behavior so that no such expectation exists in the new interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
*/
static pid_t pid_fry(pid_t pid)
{
unsigned a = pid & 0x55555555;
unsigned b = pid & 0xAAAAAAAA;
return (a << 1) | (b >> 1);
}
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
if (cgroup_sane_behavior(cgrp))
return pid_fry(pid);
else
return pid;
}
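/*
* Worked example of the scramble: pid 6 is 0b0110; its even-indexed bits
* (mask 0x55555555) are 0b0100 and its odd-indexed bits (mask 0xAAAAAAAA)
* are 0b0010, so pid_fry(6) = (0b0100 << 1) | (0b0010 >> 1) = 0b1001 = 9.
* Applying pid_fry() twice restores the original pid, so the mapping is
* its own inverse.
*/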
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
static int fried_cmppid(const void *a, const void *b)
{
return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
}
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
lockdep_assert_held(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links)
if (l->key.type == type && l->key.ns == ns)
return l;
return NULL;
}
/*
* find the appropriate pidlist for our purpose (given procs vs tasks),
* creating a new one if necessary. The caller must hold
* cgrp->pidlist_mutex; returns NULL if we're out of memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
l = cgroup_pidlist_find(cgrp, type);
if (l)
return l;
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
if (!l)
return l;
INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
l->key.type = type;
/* don't need task_nsproxy() if we're looking at ourself */
l->key.ns = get_pid_ns(task_active_pid_ns(current));
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
return l;
}
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct css_task_iter it;
struct task_struct *tsk;
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = pidlist_allocate(length);
if (!array)
return -ENOMEM;
/* now, populate the array */
css_task_iter_start(&cgrp->self, &it);
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
if (cgroup_sane_behavior(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
pidlist_free(array);
return -ENOMEM;
}
/* store array, freeing old if necessary */
pidlist_free(l->list);
l->list = array;
l->length = length;
*lp = l;
return 0;
}
/**
* cgroupstats_build - build and fill cgroupstats
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
struct css_task_iter it;
struct task_struct *tsk;
/* it should be kernfs_node belonging to cgroupfs and is a directory */
if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
/*
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
* @kn->priv is RCU safe. Let's do the RCU dancing.
*/
rcu_read_lock();
cgrp = rcu_dereference(kn->priv);
if (!cgrp || cgroup_is_dead(cgrp)) {
rcu_read_unlock();
return -ENOENT;
}
rcu_read_unlock();
css_task_iter_start(&cgrp->self, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
css_task_iter_end(&it);
return 0;
}
/*
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
* in the cgroup->l->list array.
*/
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
int *iter, ret;
mutex_lock(&cgrp->pidlist_mutex);
/*
* !NULL @of->priv indicates that this isn't the first start()
* after open. If the matching pidlist is around, we can use that.
* Look for it. Note that @of->priv can't be used directly. It
* could already have been destroyed.
*/
if (of->priv)
of->priv = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
if (!of->priv) {
ret = pidlist_array_load(cgrp, type,
(struct cgroup_pidlist **)&of->priv);
if (ret)
return ERR_PTR(ret);
}
l = of->priv;
if (pid) {
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
Ben Blum
committed
iter = l->list + index;
*pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
CGROUP_PIDLIST_DESTROY_DELAY);
mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
return NULL;
} else {
*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return notify_on_release(css->cgroup);
}
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
else
clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
if (val)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
else
clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
return 0;
}
static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,