Newer
Older
int cgroup_task_count(const struct cgroup *cgrp)
struct cgrp_cset_link *link;
read_lock(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
read_unlock(&css_set_lock);
* To reduce the fork() overhead for systems that are not actually using
* their cgroups capability, we don't maintain the lists running through
* each css_set to its tasks until we see the list actually used - in other
* words after the first call to css_task_iter_start().
static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
write_lock(&css_set_lock);
use_task_css_set_links = 1;
Frederic Weisbecker
committed
/*
* We need tasklist_lock because RCU is not safe against
* while_each_thread(). Besides, a forking task that has passed
* cgroup_post_fork() without seeing use_task_css_set_links = 1
* is not guaranteed to have its child immediately visible in the
* tasklist if we walk through it with RCU.
*/
read_lock(&tasklist_lock);
do_each_thread(g, p) {
task_lock(p);
/*
* We should check if the process is exiting, otherwise
* it will race with cgroup_exit() in that the list
* entry won't be deleted though the process has exited.
*/
if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
list_add(&p->cg_list, &task_css_set(p)->tasks);
task_unlock(p);
} while_each_thread(g, p);
Frederic Weisbecker
committed
read_unlock(&tasklist_lock);
write_unlock(&css_set_lock);
}
Tejun Heo
committed
* css_next_child - find the next child of a given css
* @pos_css: the current position (%NULL to initiate traversal)
* @parent_css: css whose children to walk
Tejun Heo
committed
* This function returns the next child of @parent_css and should be called
* under RCU read lock. The only requirement is that @parent_css and
* @pos_css are accessible. The next sibling is guaranteed to be returned
* regardless of their states.
Tejun Heo
committed
struct cgroup_subsys_state *
css_next_child(struct cgroup_subsys_state *pos_css,
struct cgroup_subsys_state *parent_css)
Tejun Heo
committed
struct cgroup *pos = pos_css ? pos_css->cgroup : NULL;
struct cgroup *cgrp = parent_css->cgroup;
struct cgroup *next;
WARN_ON_ONCE(!rcu_read_lock_held());
/*
* @pos could already have been removed. Once a cgroup is removed,
* its ->sibling.next is no longer updated when its next sibling
* changes. As CGRP_DEAD assertion is serialized and happens
* before the cgroup is taken off the ->sibling list, if we see it
* unasserted, it's guaranteed that the next sibling hasn't
* finished its grace period even if it's already removed, and thus
* safe to dereference from this RCU critical section. If
* ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
* to be visible as %true here.
*
* If @pos is dead, its next pointer can't be dereferenced;
* however, as each cgroup is given a monotonically increasing
* unique serial number and always appended to the sibling list,
* the next one can be found by walking the parent's children until
* we see a cgroup with higher serial number than @pos's. While
* this path can be slower, it's taken only when either the current
* cgroup is removed or iteration and removal race.
if (!pos) {
next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling);
} else if (likely(!cgroup_is_dead(pos))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
} else {
list_for_each_entry_rcu(next, &cgrp->children, sibling)
if (next->serial_nr > pos->serial_nr)
break;
Tejun Heo
committed
if (&next->sibling == &cgrp->children)
return NULL;
if (parent_css->ss)
return cgroup_css(next, parent_css->ss->subsys_id);
else
return &next->dummy_css;
Tejun Heo
committed
EXPORT_SYMBOL_GPL(css_next_child);
Tejun Heo
committed
* css_next_descendant_pre - find the next descendant for pre-order walk
* @pos: the current position (%NULL to initiate traversal)
Tejun Heo
committed
* @root: css whose descendants to walk
Tejun Heo
committed
* To be used by css_for_each_descendant_pre(). Find the next descendant
Tejun Heo
committed
* to visit for pre-order traversal of @root's descendants. @root is
* included in the iteration and the first node to be visited.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
Tejun Heo
committed
* and @root are accessible and @pos is a descendant of @root.
Tejun Heo
committed
struct cgroup_subsys_state *
css_next_descendant_pre(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
Tejun Heo
committed
struct cgroup_subsys_state *next;
WARN_ON_ONCE(!rcu_read_lock_held());
Tejun Heo
committed
/* if first iteration, visit @root */
Tejun Heo
committed
return root;
/* visit the first child if exists */
Tejun Heo
committed
next = css_next_child(NULL, pos);
if (next)
return next;
/* no child, visit my or the closest ancestor's next sibling */
Tejun Heo
committed
while (pos != root) {
next = css_next_child(pos, css_parent(pos));
Tejun Heo
committed
pos = css_parent(pos);
return NULL;
}
Tejun Heo
committed
EXPORT_SYMBOL_GPL(css_next_descendant_pre);
Tejun Heo
committed
* css_rightmost_descendant - return the rightmost descendant of a css
* @pos: css of interest
Tejun Heo
committed
* Return the rightmost descendant of @pos. If there's no descendant, @pos
* is returned. This can be used during pre-order traversal to skip
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct rightmost descendant as long as @pos is
* accessible.
Tejun Heo
committed
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
Tejun Heo
committed
struct cgroup_subsys_state *last, *tmp;
WARN_ON_ONCE(!rcu_read_lock_held());
do {
last = pos;
/* ->prev isn't RCU safe, walk ->next till the end */
pos = NULL;
Tejun Heo
committed
css_for_each_child(tmp, last)
pos = tmp;
} while (pos);
return last;
}
Tejun Heo
committed
EXPORT_SYMBOL_GPL(css_rightmost_descendant);
Tejun Heo
committed
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
Tejun Heo
committed
struct cgroup_subsys_state *last;
do {
last = pos;
Tejun Heo
committed
pos = css_next_child(NULL, pos);
} while (pos);
return last;
}
/**
Tejun Heo
committed
* css_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
Tejun Heo
committed
* @root: css whose descendants to walk
Tejun Heo
committed
* To be used by css_for_each_descendant_post(). Find the next descendant
Tejun Heo
committed
* to visit for post-order traversal of @root's descendants. @root is
* included in the iteration and the last node to be visited.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @cgroup are accessible and @pos is a descendant of @cgroup.
Tejun Heo
committed
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
Tejun Heo
committed
struct cgroup_subsys_state *next;
WARN_ON_ONCE(!rcu_read_lock_held());
/* if first iteration, visit the leftmost descendant */
if (!pos) {
Tejun Heo
committed
next = css_leftmost_descendant(root);
return next != root ? next : NULL;
Tejun Heo
committed
/* if we visited @root, we're done */
if (pos == root)
return NULL;
/* if there's an unvisited sibling, visit its leftmost descendant */
Tejun Heo
committed
next = css_next_child(pos, css_parent(pos));
Tejun Heo
committed
return css_leftmost_descendant(next);
/* no sibling left, visit parent */
Tejun Heo
committed
return css_parent(pos);
Tejun Heo
committed
EXPORT_SYMBOL_GPL(css_next_descendant_post);
* css_advance_task_iter - advance a task itererator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
static void css_advance_task_iter(struct css_task_iter *it)
{
struct list_head *l = it->cset_link;
struct cgrp_cset_link *link;
struct css_set *cset;
/* Advance to the next non-empty css_set */
do {
l = l->next;
if (l == &it->origin_css->cgroup->cset_links) {
it->cset_link = NULL;
return;
}
link = list_entry(l, struct cgrp_cset_link, cset_link);
cset = link->cset;
} while (list_empty(&cset->tasks));
it->cset_link = l;
it->task = cset->tasks.next;
}
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*
* Note that this function acquires a lock which is released when the
* iteration finishes. The caller can't sleep while iteration is in
* progress.
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
Kirill A. Shutemov
committed
__acquires(css_set_lock)
* The first time anyone tries to iterate across a css, we need to
* enable the list linking each css_set to its tasks, and fix up
* all existing tasks.
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
read_lock(&css_set_lock);
it->origin_css = css;
it->cset_link = &css->cgroup->cset_links;
css_advance_task_iter(it);
* css_task_iter_next - return the next task for the iterator
* @it: the task iterator being iterated
*
* The "next" function for task iteration. @it should have been
* initialized via css_task_iter_start(). Returns NULL when the iteration
* reaches the end.
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
struct task_struct *res;
struct list_head *l = it->task;
struct cgrp_cset_link *link;
/* If the iterator cg is NULL, we have no tasks */
if (!it->cset_link)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
/* Advance iterator to find next entry */
l = l->next;
link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
if (l == &link->cset->tasks) {
/*
* We reached the end of this task list - move on to the
* next cgrp_cset_link.
*/
css_advance_task_iter(it);
} else {
it->task = l;
}
return res;
}
* css_task_iter_end - finish task iteration
* @it: the task iterator to finish
*
* Finish task iteration started by css_task_iter_start().
void css_task_iter_end(struct css_task_iter *it)
Kirill A. Shutemov
committed
__releases(css_set_lock)
{
read_unlock(&css_set_lock);
}
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
static inline int started_after_time(struct task_struct *t1,
struct timespec *time,
struct task_struct *t2)
{
int start_diff = timespec_compare(&t1->start_time, time);
if (start_diff > 0) {
return 1;
} else if (start_diff < 0) {
return 0;
} else {
/*
* Arbitrarily, if two processes started at the same
* time, we'll say that the lower pointer value
* started first. Note that t2 may have exited by now
* so this may not be a valid pointer any longer, but
* that's fine - it still serves to distinguish
* between two tasks started (effectively) simultaneously.
*/
return t1 > t2;
}
}
/*
* This function is a callback from heap_insert() and is used to order
* the heap.
* In this case we order the heap in descending task start time.
*/
static inline int started_after(void *p1, void *p2)
{
struct task_struct *t1 = p1;
struct task_struct *t2 = p2;
return started_after_time(t1, &t2->start_time, t2);
}
/**
* css_scan_tasks - iterate though all the tasks in a css
* @css: the css to iterate tasks of
* @test: optional test callback
* @process: process callback
* @data: data passed to @test and @process
* @heap: optional pre-allocated heap used for task iteration
* Iterate through all the tasks in @css, calling @test for each, and if it
* returns %true, call @process for it also.
* @test may be NULL, meaning always true (select all tasks), which
* effectively duplicates css_task_iter_{start,next,end}() but does not
* lock css_set_lock for the call to @process.
*
* It is guaranteed that @process will act on every task that is a member
* of @css for the duration of this call. This function may or may not
* call @process for tasks that exit or move to a different css during the
* call, or are forked or move into the css during the call.
*
* Note that @test may be called with locks held, and may in some
* situations be called multiple times for the same task, so it should be
* cheap.
*
* If @heap is non-NULL, a heap has been pre-allocated and will be used for
* heap operations (and its "gt" member will be overwritten), else a
* temporary heap will be used (allocation of which may cause this function
* to fail).
int css_scan_tasks(struct cgroup_subsys_state *css,
bool (*test)(struct task_struct *, void *),
void (*process)(struct task_struct *, void *),
void *data, struct ptr_heap *heap)
{
int retval, i;
struct css_task_iter it;
struct task_struct *p, *dropped;
/* Never dereference latest_task, since it's not refcounted */
struct task_struct *latest_task = NULL;
struct ptr_heap tmp_heap;
struct timespec latest_time = { 0, 0 };
/* The caller supplied our heap and pre-allocated its memory */
heap->gt = &started_after;
} else {
/* We need to allocate our own heap memory */
heap = &tmp_heap;
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
if (retval)
/* cannot allocate the heap */
return retval;
}
again:
/*
* Scan tasks in the css, using the @test callback to determine
* which are of interest, and invoking @process callback on the
* ones which need an update. Since we don't want to hold any
* locks during the task updates, gather tasks to be processed in a
* heap structure. The heap is sorted by descending task start
* time. If the statically-sized heap fills up, we overflow tasks
* that started later, and in future iterations only consider tasks
* that started after the latest task in the previous pass. This
* guarantees forward progress and that we don't miss any tasks.
*/
heap->size = 0;
css_task_iter_start(css, &it);
while ((p = css_task_iter_next(&it))) {
/*
* Only affect tasks that qualify per the caller's callback,
* if he provided one
*/
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
continue;
/*
* Only process tasks that started after the last task
* we processed
*/
if (!started_after_time(p, &latest_time, latest_task))
continue;
dropped = heap_insert(heap, p);
if (dropped == NULL) {
/*
* The new task was inserted; the heap wasn't
* previously full
*/
get_task_struct(p);
} else if (dropped != p) {
/*
* The new task was inserted, and pushed out a
* different task
*/
get_task_struct(p);
put_task_struct(dropped);
}
/*
* Else the new task was newer than anything already in
* the heap and wasn't inserted
*/
}
css_task_iter_end(&it);
if (heap->size) {
for (i = 0; i < heap->size; i++) {
struct task_struct *q = heap->ptrs[i];
latest_time = q->start_time;
latest_task = q;
}
/* Process the task per the caller's callback */
put_task_struct(q);
}
/*
* If we had to process any tasks at all, scan again
* in case some of them were in the middle of forking
* children that didn't get processed.
* Not the most efficient way to do it, but it avoids
* having to take callback_mutex in the fork path
*/
goto again;
}
if (heap == &tmp_heap)
heap_free(&tmp_heap);
return 0;
}
static void cgroup_transfer_one_task(struct task_struct *task, void *data)
Tejun Heo
committed
{
Tejun Heo
committed
Tejun Heo
committed
cgroup_attach_task(new_cgroup, task, false);
Tejun Heo
committed
}
/**
* cgroup_trasnsfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
to, NULL);
Tejun Heo
committed
}
Ben Blum
committed
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS,
CGROUP_FILE_TASKS,
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* how many files are using the current array */
int use_count;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* protects the other fields */
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
return vmalloc(count * sizeof(pid_t));
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
vfree(p);
else
kfree(p);
}
Ben Blum
committed
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
static int pidlist_uniq(pid_t *list, int length)
Ben Blum
committed
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
int src, dest = 1;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after:
return dest;
}
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
Ben Blum
committed
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
Ben Blum
committed
/*
* We can't drop the pidlist_mutex before taking the l->rwsem in case
Ben Blum
committed
* the last ref-holder is trying to remove l from the list at the same
* time. Holding the pidlist_mutex precludes somebody taking whichever
* list we find out from under us - compare release_pid_array().
*/
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links) {
if (l->key.type == type && l->key.ns == ns) {
/* make sure l doesn't vanish out from under us */
Ben Blum
committed
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
}
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
Ben Blum
committed
if (!l) {
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
init_rwsem(&l->rwsem);
down_write(&l->rwsem);
Ben Blum
committed
l->key.type = type;
Ben Blum
committed
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
Ben Blum
committed
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
Ben Blum
committed
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
Ben Blum
committed
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct css_task_iter it;
struct task_struct *tsk;
Ben Blum
committed
struct cgroup_pidlist *l;
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = pidlist_allocate(length);
Ben Blum
committed
if (!array)
return -ENOMEM;
/* now, populate the array */
css_task_iter_start(&cgrp->dummy_css, &it);
while ((tsk = css_task_iter_next(&it))) {
Ben Blum
committed
if (unlikely(n == length))
Ben Blum
committed
/* get tgid or pid for procs or tasks file respectively */
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
Ben Blum
committed
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
css_task_iter_end(&it);
Ben Blum
committed
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
Ben Blum
committed
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
pidlist_free(array);
Ben Blum
committed
return -ENOMEM;
Ben Blum
committed
}
Ben Blum
committed
/* store array, freeing old if necessary - lock already held */
pidlist_free(l->list);
Ben Blum
committed
l->list = array;
l->length = length;
l->use_count++;
Ben Blum
committed
*lp = l;
Ben Blum
committed
return 0;
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
int ret = -EINVAL;
struct cgroup *cgrp;
struct css_task_iter it;
* Validate dentry by checking the superblock operations,
* and make sure it's a directory.
if (dentry->d_sb->s_op != &cgroup_ops ||
!S_ISDIR(dentry->d_inode->i_mode))
cgrp = dentry->d_fsdata;
css_task_iter_start(&cgrp->dummy_css, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
css_task_iter_end(&it);
Ben Blum
committed
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
Ben Blum
committed
* in the cgroup->l->list array.
Ben Blum
committed
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
int index = 0, pid = *pos;
int *iter;
if (pid) {
Ben Blum
committed
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
Ben Blum
committed
if (l->list[mid] == pid) {
index = mid;
break;
Ben Blum
committed
} else if (l->list[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
Ben Blum
committed
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
Ben Blum
committed
iter = l->list + index;
*pos = *iter;
return iter;
}
Ben Blum
committed
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
}
Ben Blum
committed
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
return NULL;
} else {
*pos = *p;
return p;
}
}
Ben Blum
committed
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}
Ben Blum
committed
/*
* seq_operations functions for iterating on pidlists through seq_file -
* independent of whether it's tasks or procs
*/
static const struct seq_operations cgroup_pidlist_seq_operations = {
.start = cgroup_pidlist_start,
.stop = cgroup_pidlist_stop,
.next = cgroup_pidlist_next,
.show = cgroup_pidlist_show,
};
Ben Blum
committed
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
Ben Blum
committed
/*
* the case where we're the last user of this particular pidlist will
* have us remove it from the cgroup's list, which entails taking the
* mutex. since in pidlist_find the pidlist->lock depends on cgroup->
* pidlist_mutex, we have to take pidlist_mutex first.
*/
mutex_lock(&l->owner->pidlist_mutex);
Ben Blum
committed
BUG_ON(!l->use_count);
if (!--l->use_count) {
Ben Blum
committed
/* we're the last user if refcount is 0; remove and free */
list_del(&l->links);
mutex_unlock(&l->owner->pidlist_mutex);
pidlist_free(l->list);
Ben Blum
committed
put_pid_ns(l->key.ns);
Ben Blum
committed
kfree(l);
return;
Ben Blum
committed
mutex_unlock(&l->owner->pidlist_mutex);
Ben Blum
committed
static int cgroup_pidlist_release(struct inode *inode, struct file *file)
Ben Blum
committed
struct cgroup_pidlist *l;
if (!(file->f_mode & FMODE_READ))
return 0;
Ben Blum
committed
/*
* the seq_file will only be initialized if the file was opened for
* reading; hence we check if it's not null only in that case.
*/
l = ((struct seq_file *)file->private_data)->private;
cgroup_release_pid_array(l);
return seq_release(inode, file);
}
Ben Blum
committed
static const struct file_operations cgroup_pidlist_operations = {
.read = seq_read,
.llseek = seq_lseek,
.write = cgroup_file_write,
Ben Blum
committed
.release = cgroup_pidlist_release,
};
Ben Blum
committed
* The following functions handle opens on a file that displays a pidlist
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
* in the cgroup.
Ben Blum
committed
/* helper function for the two below it */
Ben Blum
committed
static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
Ben Blum
committed
struct cgroup_pidlist *l;
int retval;
/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ))
return 0;
Ben Blum
committed
/* have the array populated */
Ben Blum
committed
retval = pidlist_array_load(cgrp, type, &l);
Ben Blum
committed
if (retval)
return retval;
/* configure file information */
file->f_op = &cgroup_pidlist_operations;
Ben Blum
committed
retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
Ben Blum
committed
cgroup_release_pid_array(l);
return retval;
Ben Blum
committed
((struct seq_file *)file->private_data)->private = l;
Ben Blum
committed
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
Ben Blum
committed
return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
Ben Blum
committed
}
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
Ben Blum
committed
return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
Ben Blum
committed
}
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
return notify_on_release(css->cgroup);
}
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
/*
* When dput() is called asynchronously, if umount has been done and
* then deactivate_super() in cgroup_free_fn() kills the superblock,
* there's a small window that vfs will see the root dentry with non-zero
* refcnt and trigger BUG().
*
* That's why we hold a reference before dput() and drop it right after.
*/
static void cgroup_dput(struct cgroup *cgrp)
{
struct super_block *sb = cgrp->root->sb;
atomic_inc(&sb->s_active);
dput(cgrp->dentry);
deactivate_super(sb);
}
/*
* Unregister event and free resources.
*
* Gets called from workqueue.
*/
static void cgroup_event_remove(struct work_struct *work)
{
struct cgroup_event *event = container_of(work, struct cgroup_event,
remove);
Tejun Heo
committed
struct cgroup_subsys_state *css = event->css;
struct cgroup *cgrp = css->cgroup;
remove_wait_queue(event->wqh, &event->wait);
Tejun Heo
committed
event->cft->unregister_event(css, event->cft, event->eventfd);
/* Notify userspace the event is going away. */
eventfd_signal(event->eventfd, 1);
eventfd_ctx_put(event->eventfd);
kfree(event);