WARN_ON_ONCE(!rcu_read_lock_held());
/*
* @pos could already have been removed. Once a cgroup is removed,
* its ->sibling.next is no longer updated when its next sibling
* changes. As CGRP_DEAD assertion is serialized and happens
* before the cgroup is taken off the ->sibling list, if we see it
* unasserted, it's guaranteed that the next sibling hasn't
* finished its grace period even if it's already removed, and thus
* safe to dereference from this RCU critical section. If
* ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed
* to be visible as %true here.
*/
if (likely(!cgroup_is_dead(pos))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling);
if (&next->sibling != &pos->parent->children)
return next;
return NULL;
}
/*
* Can't dereference the next pointer. Each cgroup is given a
* monotonically increasing unique serial number and always
* appended to the sibling list, so the next one can be found by
* walking the parent's children until we see a cgroup with higher
* serial number than @pos's.
*
* While this path can be slow, it's taken only when either the
* current cgroup is removed or iteration and removal race.
*/
list_for_each_entry_rcu(next, &pos->parent->children, sibling)
if (next->serial_nr > pos->serial_nr)
return next;
return NULL;
}
EXPORT_SYMBOL_GPL(cgroup_next_sibling);
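For illustration, a minimal sketch of a caller that walks the remaining siblings of a cgroup; count_following_siblings() is hypothetical and not part of this file, and per the comment above only the RCU read lock is needed around each step:

/* Illustrative sketch only -- count the live siblings after @cgrp. */
static int count_following_siblings(struct cgroup *cgrp)
{
	struct cgroup *pos = cgrp;
	int n = 0;

	rcu_read_lock();
	while ((pos = cgroup_next_sibling(pos)))
		n++;
	rcu_read_unlock();
	return n;
}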
/**
* cgroup_next_descendant_pre - find the next descendant for pre-order walk
* @pos: the current position (%NULL to initiate traversal)
* @cgroup: cgroup whose descendants to walk
*
* To be used by cgroup_for_each_descendant_pre(). Find the next
* descendant to visit for pre-order traversal of @cgroup's descendants.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @cgroup are accessible and @pos is a descendant of @cgroup.
*/
struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
struct cgroup *cgroup)
{
struct cgroup *next;
WARN_ON_ONCE(!rcu_read_lock_held());
/* if first iteration, pretend we just visited @cgroup */
if (!pos)
pos = cgroup;
/* visit the first child if exists */
next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
if (next)
return next;
/* no child, visit my or the closest ancestor's next sibling */
while (pos != cgroup) {
next = cgroup_next_sibling(pos);
if (next)
return next;
pos = pos->parent;
}
return NULL;
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre);
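A sketch of the open-coded loop that cgroup_for_each_descendant_pre() wraps; walk_descendants_pre() is a hypothetical example, not part of this file:

/* Illustrative sketch only -- pre-order walk of @root's descendants. */
static void walk_descendants_pre(struct cgroup *root)
{
	struct cgroup *pos;

	rcu_read_lock();
	for (pos = cgroup_next_descendant_pre(NULL, root); pos;
	     pos = cgroup_next_descendant_pre(pos, root)) {
		/* visit @pos; note @root itself is never returned */
	}
	rcu_read_unlock();
}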
/**
* cgroup_rightmost_descendant - return the rightmost descendant of a cgroup
* @pos: cgroup of interest
*
* Return the rightmost descendant of @pos. If there's no descendant,
* @pos is returned. This can be used during pre-order traversal to skip
* subtree of @pos.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct rightmost descendant as long as @pos is
* accessible.
*/
struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos)
{
struct cgroup *last, *tmp;
WARN_ON_ONCE(!rcu_read_lock_held());
do {
last = pos;
/* ->prev isn't RCU safe, walk ->next till the end */
pos = NULL;
list_for_each_entry_rcu(tmp, &last->children, sibling)
pos = tmp;
} while (pos);
return last;
}
EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant);
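As a sketch of the subtree-skip use mentioned above, a hypothetical helper (assuming @pos is a descendant of @root) that jumps past @pos's subtree during a pre-order walk:

/* Illustrative sketch only -- next pre-order node outside @pos's subtree. */
static struct cgroup *skip_subtree(struct cgroup *pos, struct cgroup *root)
{
	return cgroup_next_descendant_pre(cgroup_rightmost_descendant(pos),
					  root);
}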
static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos)
{
struct cgroup *last;
do {
last = pos;
pos = list_first_or_null_rcu(&pos->children, struct cgroup,
sibling);
} while (pos);
return last;
}
/**
* cgroup_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
* @cgroup: cgroup whose descendants to walk
*
* To be used by cgroup_for_each_descendant_post(). Find the next
* descendant to visit for post-order traversal of @cgroup's descendants.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @cgroup are accessible and @pos is a descendant of @cgroup.
*/
struct cgroup *cgroup_next_descendant_post(struct cgroup *pos,
struct cgroup *cgroup)
{
struct cgroup *next;
WARN_ON_ONCE(!rcu_read_lock_held());
/* if first iteration, visit the leftmost descendant */
if (!pos) {
next = cgroup_leftmost_descendant(cgroup);
return next != cgroup ? next : NULL;
}
/* if there's an unvisited sibling, visit its leftmost descendant */
next = cgroup_next_sibling(pos);
if (next)
return cgroup_leftmost_descendant(next);
/* no sibling left, visit parent */
next = pos->parent;
return next != cgroup ? next : NULL;
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
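And the post-order counterpart as a sketch (walk_descendants_post() is hypothetical); the property worth noting is that every child is visited before its parent, and @root itself is never returned:

/* Illustrative sketch only -- post-order walk of @root's descendants. */
static void walk_descendants_post(struct cgroup *root)
{
	struct cgroup *pos;

	rcu_read_lock();
	for (pos = cgroup_next_descendant_post(NULL, root); pos;
	     pos = cgroup_next_descendant_post(pos, root)) {
		/* all of @pos's descendants have already been visited */
	}
	rcu_read_unlock();
}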
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
__acquires(css_set_lock)
{
/*
* The first time anyone tries to iterate across a cgroup,
* we need to enable the list linking each css_set to its
* tasks, and fix up all existing tasks.
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
read_lock(&css_set_lock);
it->cset_link = &cgrp->cset_links;
cgroup_advance_iter(cgrp, it);
}
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
struct cgroup_iter *it)
{
struct task_struct *res;
struct list_head *l = it->task;
struct cgrp_cset_link *link;
/* If the iterator cg is NULL, we have no tasks */
if (!it->cset_link)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
/* Advance iterator to find next entry */
l = l->next;
link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);
if (l == &link->cset->tasks) {
/* We reached the end of this task list - move on to
* the next cg_cgroup_link */
cgroup_advance_iter(cgrp, it);
} else {
it->task = l;
}
return res;
}
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
__releases(css_set_lock)
{
read_unlock(&css_set_lock);
}
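The start/next/end trio above is the whole iteration protocol; a minimal sketch (count_tasks() is hypothetical, not from this file) shows its shape. css_set_lock is read-held for the whole walk, so the loop body must not sleep:

/* Illustrative sketch only -- count every task attached to @cgrp. */
static int count_tasks(struct cgroup *cgrp)
{
	struct cgroup_iter it;
	struct task_struct *tsk;
	int n = 0;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it)))
		n++;
	cgroup_iter_end(cgrp, &it);
	return n;
}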
static inline int started_after_time(struct task_struct *t1,
struct timespec *time,
struct task_struct *t2)
{
int start_diff = timespec_compare(&t1->start_time, time);
if (start_diff > 0) {
return 1;
} else if (start_diff < 0) {
return 0;
} else {
/*
* Arbitrarily, if two processes started at the same
* time, we'll say that the lower pointer value
* started first. Note that t2 may have exited by now
* so this may not be a valid pointer any longer, but
* that's fine - it still serves to distinguish
* between two tasks started (effectively) simultaneously.
*/
return t1 > t2;
}
}
/*
* This function is a callback from heap_insert() and is used to order
* the heap.
* In this case we order the heap in descending task start time.
*/
static inline int started_after(void *p1, void *p2)
{
struct task_struct *t1 = p1;
struct task_struct *t2 = p2;
return started_after_time(t1, &t2->start_time, t2);
}
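How the comparator is wired into a ptr_heap, shown as a sketch grounded in the heap_init() call in cgroup_scan_tasks() below; with a PAGE_SIZE buffer the heap holds PAGE_SIZE / sizeof(void *) task pointers per pass, and because "gt" is started_after(), a full heap drops the latest-started tasks first (matching the overflow comment in cgroup_scan_tasks()):

/* Illustrative sketch only -- set up a one-page heap ordered by start time. */
static int demo_heap_setup(struct ptr_heap *heap)
{
	return heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
}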
/**
* cgroup_scan_tasks - iterate though all the tasks in a cgroup
* @scan: struct cgroup_scanner containing arguments for the scan
*
* Arguments include pointers to callback functions test_task() and
* process_task().
* Iterate through all the tasks in a cgroup, calling test_task() for each,
* and if it returns true, call process_task() for it also.
* The test_task pointer may be NULL, meaning always true (select all tasks).
* Effectively duplicates cgroup_iter_{start,next,end}()
* but does not lock css_set_lock for the call to process_task().
* The struct cgroup_scanner may be embedded in any structure of the caller's
* creation.
* It is guaranteed that process_task() will act on every task that
* is a member of the cgroup for the duration of this call. This
* function may or may not call process_task() for tasks that exit
* or move to a different cgroup during the call, or are forked or
* move into the cgroup during the call.
*
* Note that test_task() may be called with locks held, and may in some
* situations be called multiple times for the same task, so it should
* be cheap.
* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
* pre-allocated and will be used for heap operations (and its "gt" member will
* be overwritten), else a temporary heap will be used (allocation of which
* may cause this function to fail).
*/
int cgroup_scan_tasks(struct cgroup_scanner *scan)
{
int retval, i;
struct cgroup_iter it;
struct task_struct *p, *dropped;
/* Never dereference latest_task, since it's not refcounted */
struct task_struct *latest_task = NULL;
struct ptr_heap tmp_heap;
struct ptr_heap *heap;
struct timespec latest_time = { 0, 0 };
if (scan->heap) {
/* The caller supplied our heap and pre-allocated its memory */
heap = scan->heap;
heap->gt = &started_after;
} else {
/* We need to allocate our own heap memory */
heap = &tmp_heap;
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
if (retval)
/* cannot allocate the heap */
return retval;
}
again:
/*
* Scan tasks in the cgroup, using the scanner's "test_task" callback
* to determine which are of interest, and using the scanner's
* "process_task" callback to process any of them that need an update.
* Since we don't want to hold any locks during the task updates,
* gather tasks to be processed in a heap structure.
* The heap is sorted by descending task start time.
* If the statically-sized heap fills up, we overflow tasks that
* started later, and in future iterations only consider tasks that
* started after the latest task in the previous pass. This
* guarantees forward progress and that we don't miss any tasks.
*/
heap->size = 0;
cgroup_iter_start(scan->cg, &it);
while ((p = cgroup_iter_next(scan->cg, &it))) {
/*
* Only affect tasks that qualify per the caller's callback,
* if he provided one
*/
if (scan->test_task && !scan->test_task(p, scan))
continue;
/*
* Only process tasks that started after the last task
* we processed
*/
if (!started_after_time(p, &latest_time, latest_task))
continue;
dropped = heap_insert(heap, p);
if (dropped == NULL) {
/*
* The new task was inserted; the heap wasn't
* previously full
*/
get_task_struct(p);
} else if (dropped != p) {
/*
* The new task was inserted, and pushed out a
* different task
*/
get_task_struct(p);
put_task_struct(dropped);
}
/*
* Else the new task was newer than anything already in
* the heap and wasn't inserted
*/
}
cgroup_iter_end(scan->cg, &it);
if (heap->size) {
for (i = 0; i < heap->size; i++) {
struct task_struct *q = heap->ptrs[i];
if (i == 0) {
latest_time = q->start_time;
latest_task = q;
}
/* Process the task per the caller's callback */
scan->process_task(q, scan);
put_task_struct(q);
}
/*
* If we had to process any tasks at all, scan again
* in case some of them were in the middle of forking
* children that didn't get processed.
* Not the most efficient way to do it, but it avoids
* having to take callback_mutex in the fork path
*/
goto again;
}
if (heap == &tmp_heap)
heap_free(&tmp_heap);
return 0;
}
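A hedged usage sketch of the scanner API (the demo_* names are hypothetical; cgroup_transfer_tasks() below is the real in-tree user of the same pattern):

/* Illustrative sketch only -- renice every task currently in a cgroup. */
static void demo_renice_one(struct task_struct *task,
			    struct cgroup_scanner *scan)
{
	set_user_nice(task, *(int *)scan->data);
}

static int demo_renice_cgroup(struct cgroup *cgrp, int nice)
{
	struct cgroup_scanner scan = {
		.cg		= cgrp,
		.test_task	= NULL,		/* select all tasks */
		.process_task	= demo_renice_one,
		.heap		= NULL,		/* use a temporary heap */
		.data		= &nice,
	};

	return cgroup_scan_tasks(&scan);
}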
static void cgroup_transfer_one_task(struct task_struct *task,
struct cgroup_scanner *scan)
{
struct cgroup *new_cgroup = scan->data;
cgroup_attach_task(new_cgroup, task, false);
}
/**
* cgroup_transfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
struct cgroup_scanner scan;
scan.cg = from;
scan.test_task = NULL; /* select all tasks in cgroup */
scan.process_task = cgroup_transfer_one_task;
scan.heap = NULL;
scan.data = to;
return cgroup_scan_tasks(&scan);
}
/*
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS,
CGROUP_FILE_TASKS,
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* how many files are using the current array */
int use_count;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* protects the other fields */
struct rw_semaphore mutex;
};
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
return vmalloc(count * sizeof(pid_t));
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
vfree(p);
else
kfree(p);
}
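For a sense of where the switch happens (assuming 4 KiB pages and a 32-bit pid_t; demo_pidlist_uses_vmalloc() is hypothetical), the cutoff works out to 2048 pids:

/* Illustrative sketch only -- true once the array would exceed two pages,
 * i.e. 2 * 4096 / sizeof(pid_t) = 2048 entries with the assumptions above. */
static bool demo_pidlist_uses_vmalloc(int count)
{
	return PIDLIST_TOO_LARGE(count);	/* 2048 -> false, 2049 -> true */
}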
/*
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
*/
static int pidlist_uniq(pid_t *list, int length)
{
int src, dest = 1;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after:
return dest;
}
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
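pidlist_uniq() assumes its input is already sorted and compacts it in place, which is exactly how pidlist_array_load() below pairs it with sort()/cmppid(). A small sketch with hypothetical data:

/* Illustrative sketch only -- sort then strip duplicates. */
static int demo_uniq(void)
{
	pid_t pids[] = { 7, 3, 7, 5, 3, 7 };

	sort(pids, ARRAY_SIZE(pids), sizeof(pid_t), cmppid, NULL);
	/* sorted: {3, 3, 5, 7, 7, 7}; pidlist_uniq() compacts the front to
	 * {3, 5, 7} and returns 3 -- entries past the new length are stale */
	return pidlist_uniq(pids, ARRAY_SIZE(pids));
}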
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
* the last ref-holder is trying to remove l from the list at the same
* time. Holding the pidlist_mutex precludes somebody taking whichever
* list we find out from under us - compare release_pid_array().
*/
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links) {
if (l->key.type == type && l->key.ns == ns) {
/* make sure l doesn't vanish out from under us */
down_write(&l->mutex);
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
}
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
if (!l) {
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
init_rwsem(&l->mutex);
down_write(&l->mutex);
l->key.type = type;
l->key.ns = get_pid_ns(ns);
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct cgroup_iter it;
struct task_struct *tsk;
struct cgroup_pidlist *l;
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = pidlist_allocate(length);
if (!array)
return -ENOMEM;
/* now, populate the array */
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
if (unlikely(n == length))
break;
/* get tgid or pid for procs or tasks file respectively */
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
}
cgroup_iter_end(cgrp, &it);
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
pidlist_free(array);
return -ENOMEM;
}
/* store array, freeing old if necessary - lock already held */
pidlist_free(l->list);
l->list = array;
l->length = length;
l->use_count++;
up_write(&l->mutex);
*lp = l;
return 0;
}

/**
* cgroupstats_build - build and fill cgroupstats
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
int ret = -EINVAL;
struct cgroup *cgrp;
struct cgroup_iter it;
struct task_struct *tsk;
/*
* Validate dentry by checking the superblock operations,
* and make sure it's a directory.
*/
if (dentry->d_sb->s_op != &cgroup_ops ||
!S_ISDIR(dentry->d_inode->i_mode))
goto err;
ret = 0;
cgrp = dentry->d_fsdata;
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
cgroup_iter_end(cgrp, &it);
err:
return ret;
}

/*
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
* in the cgroup->l->list array.
*/
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
{
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
struct cgroup_pidlist *l = s->private;
int index = 0, pid = *pos;
int *iter;
down_read(&l->mutex);
if (pid) {
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
if (l->list[mid] == pid) {
index = mid;
break;
} else if (l->list[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
iter = l->list + index;
*pos = *iter;
return iter;
}
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct cgroup_pidlist *l = s->private;
up_read(&l->mutex);
}
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct cgroup_pidlist *l = s->private;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
return NULL;
} else {
*pos = *p;
return p;
}
}
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}
/*
* seq_operations functions for iterating on pidlists through seq_file -
* independent of whether it's tasks or procs
*/
static const struct seq_operations cgroup_pidlist_seq_operations = {
.start = cgroup_pidlist_start,
.stop = cgroup_pidlist_stop,
.next = cgroup_pidlist_next,
.show = cgroup_pidlist_show,
};
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
{
/*
* the case where we're the last user of this particular pidlist will
* have us remove it from the cgroup's list, which entails taking the
* mutex. since in pidlist_find the pidlist->lock depends on cgroup->
* pidlist_mutex, we have to take pidlist_mutex first.
*/
mutex_lock(&l->owner->pidlist_mutex);
down_write(&l->mutex);
BUG_ON(!l->use_count);
if (!--l->use_count) {
/* we're the last user if refcount is 0; remove and free */
list_del(&l->links);
mutex_unlock(&l->owner->pidlist_mutex);
pidlist_free(l->list);
put_pid_ns(l->key.ns);
up_write(&l->mutex);
kfree(l);
return;
}
mutex_unlock(&l->owner->pidlist_mutex);
up_write(&l->mutex);
}
static int cgroup_pidlist_release(struct inode *inode, struct file *file)
{
struct cgroup_pidlist *l;
if (!(file->f_mode & FMODE_READ))
return 0;
/*
* the seq_file will only be initialized if the file was opened for
* reading; hence we check if it's not null only in that case.
*/
l = ((struct seq_file *)file->private_data)->private;
cgroup_release_pid_array(l);
return seq_release(inode, file);
}
static const struct file_operations cgroup_pidlist_operations = {
.read = seq_read,
.llseek = seq_lseek,
.write = cgroup_file_write,
.release = cgroup_pidlist_release,
};
/*
* The following functions handle opens on a file that displays a pidlist
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
* in the cgroup.
*/
/* helper function for the two below it */
static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
struct cgroup_pidlist *l;
int retval;
/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ))
return 0;
/* have the array populated */
retval = pidlist_array_load(cgrp, type, &l);
if (retval)
return retval;
/* configure file information */
file->f_op = &cgroup_pidlist_operations;
retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
cgroup_release_pid_array(l);
return retval;
}
((struct seq_file *)file->private_data)->private = l;
return 0;
}
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
}
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
}
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
struct cftype *cft)
{
return notify_on_release(cgrp);
}
static int cgroup_write_notify_on_release(struct cgroup *cgrp,
struct cftype *cft,
u64 val)
{
clear_bit(CGRP_RELEASABLE, &cgrp->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
else
clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
return 0;
}
/*
* When dput() is called asynchronously, if umount has been done and
* then deactivate_super() in cgroup_free_fn() kills the superblock,
* there's a small window that vfs will see the root dentry with non-zero
* refcnt and trigger BUG().
*
* That's why we hold a reference before dput() and drop it right after.
*/
static void cgroup_dput(struct cgroup *cgrp)
{
struct super_block *sb = cgrp->root->sb;
atomic_inc(&sb->s_active);
dput(cgrp->dentry);
deactivate_super(sb);
}
/*
* Unregister event and free resources.
*
* Gets called from workqueue.
*/
static void cgroup_event_remove(struct work_struct *work)
{
struct cgroup_event *event = container_of(work, struct cgroup_event,
remove);
struct cgroup *cgrp = event->cgrp;
remove_wait_queue(event->wqh, &event->wait);
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
/* Notify userspace the event is going away. */
eventfd_signal(event->eventfd, 1);
eventfd_ctx_put(event->eventfd);
kfree(event);
}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct cgroup_event *event = container_of(wait,
struct cgroup_event, wait);
struct cgroup *cgrp = event->cgrp;
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
/*
* If the event has been detached at cgroup removal, we
* can simply return knowing the other side will cleanup
* for us.
*
* We can't race against event freeing since the other
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
*/
spin_lock(&cgrp->event_list_lock);
if (!list_empty(&event->list)) {
list_del_init(&event->list);
/*
* We are in atomic context, but cgroup_event_remove()
* may sleep, so we have to call it in workqueue.
*/
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
}
return 0;
}
static void cgroup_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct cgroup_event *event = container_of(pt,
struct cgroup_event, pt);
event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
}
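From userspace, the registration described below is a single write of "<event_fd> <control_fd> <args>" to cgroup.event_control. A hedged userspace-side sketch (not kernel code; the memory.usage_in_bytes control file and the 1 MiB threshold are examples only):

/* Illustrative userspace sketch -- register an eventfd-based notification. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

static int register_usage_event(const char *cgrp_dir)
{
	char path[256], buf[64];
	int efd = eventfd(0, 0);
	int cfd, ecfd, n;

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cgrp_dir);
	cfd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", cgrp_dir);
	ecfd = open(path, O_WRONLY);

	n = snprintf(buf, sizeof(buf), "%d %d %llu", efd, cfd, 1ULL << 20);
	if (efd < 0 || cfd < 0 || ecfd < 0 || write(ecfd, buf, n) != n)
		return -1;
	return efd;	/* read()s on efd now report threshold crossings */
}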
/*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
struct cgroup_event *event = NULL;
struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile = NULL;
struct file *cfile = NULL;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
event->cgrp = cgrp;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);
efile = eventfd_fget(efd);
if (IS_ERR(efile)) {
ret = PTR_ERR(efile);
goto fail;
}
event->eventfd = eventfd_ctx_fileget(efile);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto fail;
}
cfile = fget(cfd);
if (!cfile) {
ret = -EBADF;
goto fail;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto fail;
event->cft = __file_cft(cfile);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto fail;
}
/*
* The file to be monitored must be in the same cgroup as
* cgroup.event_control is.
*/
cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
if (cgrp_cfile != cgrp) {
ret = -EINVAL;
goto fail;
}
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto fail;