Newer
Older
Tejun Heo
committed
* css_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
Tejun Heo
committed
* @root: css whose descendants to walk
Tejun Heo
committed
* To be used by css_for_each_descendant_post(). Find the next descendant
Tejun Heo
committed
* to visit for post-order traversal of @root's descendants. @root is
* included in the iteration and the last node to be visited.
*
* While this function requires RCU read locking, it doesn't require the
* whole traversal to be contained in a single RCU critical section. This
* function will return the correct next descendant as long as both @pos
* and @root are accessible and @pos is a descendant of @root.
Tejun Heo
committed
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
Tejun Heo
committed
struct cgroup_subsys_state *next;
WARN_ON_ONCE(!rcu_read_lock_held());
/* if first iteration, visit leftmost descendant which may be @root */
if (!pos)
return css_leftmost_descendant(root);
Tejun Heo
committed
/* if we visited @root, we're done */
if (pos == root)
return NULL;
/* if there's an unvisited sibling, visit its leftmost descendant */
Tejun Heo
committed
next = css_next_child(pos, css_parent(pos));
Tejun Heo
committed
return css_leftmost_descendant(next);
/* no sibling left, visit parent */
Tejun Heo
committed
return css_parent(pos);
Tejun Heo
committed
EXPORT_SYMBOL_GPL(css_next_descendant_post);
/**
 * css_advance_task_iter - advance a task iterator to the next css_set
 * @it: the iterator to advance
 *
 * Advance @it to the next css_set to walk.
 */
static void css_advance_task_iter(struct css_task_iter *it)
{
	struct list_head *pos = it->cset_link;
	struct css_set *cset;

	/* step along the cgroup's cset_links until a css_set with tasks shows up */
	for (;;) {
		pos = pos->next;

		/* wrapped back to the list head: nothing left to walk */
		if (pos == &it->origin_css->cgroup->cset_links) {
			it->cset_link = NULL;
			return;
		}

		cset = list_entry(pos, struct cgrp_cset_link, cset_link)->cset;
		if (!list_empty(&cset->tasks))
			break;
	}

	/* remember the link we stopped on and point at its first task */
	it->cset_link = pos;
	it->task = cset->tasks.next;
}
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
*
* Note that this function acquires a lock which is released when the
* iteration finishes. The caller can't sleep while iteration is in
* progress.
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
Kirill A. Shutemov
committed
__acquires(css_set_lock)
* The first time anyone tries to iterate across a css, we need to
* enable the list linking each css_set to its tasks, and fix up
* all existing tasks.
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
read_lock(&css_set_lock);
it->origin_css = css;
it->cset_link = &css->cgroup->cset_links;
css_advance_task_iter(it);
/**
 * css_task_iter_next - return the next task for the iterator
 * @it: the task iterator being iterated
 *
 * The "next" function for task iteration.  @it should have been
 * initialized via css_task_iter_start().  Returns NULL when the iteration
 * reaches the end.
 */
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
	struct list_head *cur, *following;
	struct cgrp_cset_link *cur_link;

	/* a NULL cset_link means there is no css_set left to take tasks from */
	if (!it->cset_link)
		return NULL;

	cur = it->task;
	following = cur->next;
	cur_link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link);

	if (following != &cur_link->cset->tasks) {
		/* more tasks remain on the current css_set */
		it->task = following;
	} else {
		/* end of this css_set's task list - move to the next link */
		css_advance_task_iter(it);
	}

	/* hand back the task the iterator pointed at on entry */
	return list_entry(cur, struct task_struct, cg_list);
}
* css_task_iter_end - finish task iteration
* @it: the task iterator to finish
*
* Finish task iteration started by css_task_iter_start().
void css_task_iter_end(struct css_task_iter *it)
Kirill A. Shutemov
committed
__releases(css_set_lock)
{
read_unlock(&css_set_lock);
}
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
static inline int started_after_time(struct task_struct *t1,
				     struct timespec *time,
				     struct task_struct *t2)
{
	int cmp = timespec_compare(&t1->start_time, time);

	/* distinct start times decide the ordering on their own */
	if (cmp)
		return cmp > 0;

	/*
	 * Identical start times: break the tie on the pointer value, with
	 * the lower pointer counting as the earlier task.  @t2 is only
	 * compared, never dereferenced, so it does not matter if that task
	 * has exited and the pointer is stale.
	 */
	return t1 > t2;
}
/*
 * Heap ordering callback handed to the ptr_heap ("gt" member).  Both
 * arguments are task_struct pointers; the result is non-zero when @p1 is
 * considered to have started after @p2, which keeps the heap ordered by
 * descending task start time.
 */
static inline int started_after(void *p1, void *p2)
{
	struct task_struct *lhs = p1, *rhs = p2;

	return started_after_time(lhs, &rhs->start_time, rhs);
}
/**
* css_scan_tasks - iterate through all the tasks in a css
* @css: the css to iterate tasks of
* @test: optional test callback
* @process: process callback
* @data: data passed to @test and @process
* @heap: optional pre-allocated heap used for task iteration
* Iterate through all the tasks in @css, calling @test for each, and if it
* returns %true, call @process for it also.
* @test may be NULL, meaning always true (select all tasks), which
* effectively duplicates css_task_iter_{start,next,end}() but does not
* lock css_set_lock for the call to @process.
*
* It is guaranteed that @process will act on every task that is a member
* of @css for the duration of this call. This function may or may not
* call @process for tasks that exit or move to a different css during the
* call, or are forked or move into the css during the call.
* Note that @test may be called with locks held, and may in some
* situations be called multiple times for the same task, so it should be
* cheap.
* If @heap is non-NULL, a heap has been pre-allocated and will be used for
* heap operations (and its "gt" member will be overwritten), else a
* temporary heap will be used (allocation of which may cause this function
* to fail).
int css_scan_tasks(struct cgroup_subsys_state *css,
bool (*test)(struct task_struct *, void *),
void (*process)(struct task_struct *, void *),
void *data, struct ptr_heap *heap)
{
int retval, i;
struct css_task_iter it;
struct task_struct *p, *dropped;
/* Never dereference latest_task, since it's not refcounted */
struct task_struct *latest_task = NULL;
struct ptr_heap tmp_heap;
struct timespec latest_time = { 0, 0 };
/* The caller supplied our heap and pre-allocated its memory */
heap->gt = &started_after;
} else {
/* We need to allocate our own heap memory */
heap = &tmp_heap;
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
if (retval)
/* cannot allocate the heap */
return retval;
}
again:
/*
* Scan tasks in the css, using the @test callback to determine
* which are of interest, and invoking @process callback on the
* ones which need an update. Since we don't want to hold any
* locks during the task updates, gather tasks to be processed in a
* heap structure. The heap is sorted by descending task start
* time. If the statically-sized heap fills up, we overflow tasks
* that started later, and in future iterations only consider tasks
* that started after the latest task in the previous pass. This
* guarantees forward progress and that we don't miss any tasks.
*/
heap->size = 0;
css_task_iter_start(css, &it);
while ((p = css_task_iter_next(&it))) {
/*
* Only affect tasks that qualify per the caller's callback,
* if he provided one
*/
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
continue;
/*
* Only process tasks that started after the last task
* we processed
*/
if (!started_after_time(p, &latest_time, latest_task))
continue;
dropped = heap_insert(heap, p);
if (dropped == NULL) {
/*
* The new task was inserted; the heap wasn't
* previously full
*/
get_task_struct(p);
} else if (dropped != p) {
/*
* The new task was inserted, and pushed out a
* different task
*/
get_task_struct(p);
put_task_struct(dropped);
}
/*
* Else the new task was newer than anything already in
* the heap and wasn't inserted
*/
}
css_task_iter_end(&it);
if (heap->size) {
for (i = 0; i < heap->size; i++) {
struct task_struct *q = heap->ptrs[i];
latest_time = q->start_time;
latest_task = q;
}
/* Process the task per the caller's callback */
put_task_struct(q);
}
/*
* If we had to process any tasks at all, scan again
* in case some of them were in the middle of forking
* children that didn't get processed.
* Not the most efficient way to do it, but it avoids
* having to take callback_mutex in the fork path
*/
goto again;
}
if (heap == &tmp_heap)
heap_free(&tmp_heap);
return 0;
}
static void cgroup_transfer_one_task(struct task_struct *task, void *data)
Tejun Heo
committed
{
Tejun Heo
committed
Tejun Heo
committed
cgroup_attach_task(new_cgroup, task, false);
Tejun Heo
committed
}
/**
 * cgroup_transfer_tasks - move tasks from one cgroup to another
 * @to: cgroup to which the tasks will be moved
 * @from: cgroup in which the tasks currently reside
 *
 * Scans the tasks of @from's dummy_css with css_scan_tasks(), using no
 * @test filter and no pre-allocated heap, and invokes
 * cgroup_transfer_one_task() on each with @to as the callback data.
 *
 * Returns whatever css_scan_tasks() returns.
 */
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
	return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task,
			      to, NULL);
}
Ben Blum
committed
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
/* Identifies which of the two pidlist-backed files is being accessed. */
enum cgroup_filetype {
	CGROUP_FILE_PROCS = 0,	/* the "procs" file */
	CGROUP_FILE_TASKS = 1,	/* the "tasks" file */
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* for delayed destruction */
struct delayed_work destroy_dwork;
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
/* true when an array of @c pid_t entries would exceed two pages */
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))

/* Allocate room for @count pids: kmalloc() up to two pages, vmalloc() beyond. */
static void *pidlist_allocate(int count)
{
	if (!PIDLIST_TOO_LARGE(count))
		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);

	return vmalloc(count * sizeof(pid_t));
}
/* Release a pid array, picking the free routine from the address it lives at. */
static void pidlist_free(void *p)
{
	if (!is_vmalloc_addr(p)) {
		kfree(p);
		return;
	}

	vfree(p);
}
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
struct cgroup_pidlist *l, *tmp_l;
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
mutex_unlock(&cgrp->pidlist_mutex);
flush_workqueue(cgroup_pidlist_destroy_wq);
BUG_ON(!list_empty(&cgrp->pidlists));
}
/*
 * Work function behind cgroup_pidlist->destroy_dwork.  Under the owning
 * cgroup's pidlist_mutex it unlinks the pidlist, frees its pid array and
 * drops its pid namespace reference - unless the work was queued again in
 * the meantime, in which case the pidlist is left alone.
 */
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
						destroy_dwork);
	struct cgroup_pidlist *tofree = NULL;

	mutex_lock(&l->owner->pidlist_mutex);

	/*
	 * Destroy iff we didn't get queued again.  The state won't change
	 * as destroy_dwork can only be queued while locked.
	 */
	if (!delayed_work_pending(dwork)) {
		list_del(&l->links);
		pidlist_free(l->list);
		put_pid_ns(l->key.ns);
		tofree = l;
	}

	/* the mutex lives in the owner cgroup, so unlock before freeing @l */
	mutex_unlock(&l->owner->pidlist_mutex);

	/* tofree is still NULL when the pidlist was re-queued and kept */
	kfree(tofree);
}
Ben Blum
committed
/**
 * pidlist_uniq - strip duplicate entries out of a pid array, in place
 * @list: array of pids in which equal values are adjacent (e.g. sorted)
 * @length: number of elements in @list
 *
 * Compacts @list so that its leading elements hold one copy of each run of
 * equal values, in their original order.  Elements beyond the returned
 * count are left with stale contents.
 *
 * Returns the number of unique elements, or 0 if @length is not positive.
 */
static int pidlist_uniq(pid_t *list, int length)
{
	int src, dest = 1;

	/* nothing to compact; also keeps a bogus negative length from yielding 1 */
	if (length <= 0)
		return 0;

	/*
	 * Element 0 is always kept, so both cursors start at 1.  src scans
	 * the array; dest is where the next unique value goes and therefore
	 * also counts the unique values found so far.  dest never runs ahead
	 * of src, so list[src - 1] still holds its original value when read.
	 */
	for (src = 1; src < length; src++) {
		if (list[src] != list[src - 1])
			list[dest++] = list[src];
	}

	return dest;
}
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
/*
* The two pid files - tasks and cgroup.procs - guaranteed that the result
* is sorted, which forced this whole pidlist fiasco. As pid order is
* different per namespace, each namespace needs differently sorted list,
* making it impossible to use, for example, single rbtree of member tasks
* sorted by task pointer. As pidlists can be fairly large, allocating one
* per open file is dangerous, so cgroup had to implement shared pool of
* pidlists keyed by cgroup and namespace.
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
* want to do away with it. Explicitly scramble sort order if
* sane_behavior so that no such expectation exists in the new interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
*/
/* Swap each even-numbered bit of @pid with its odd-numbered neighbour. */
static pid_t pid_fry(pid_t pid)
{
	unsigned bits = pid;
	unsigned moved_up = (bits & 0x55555555) << 1;	/* even bits -> odd slots */
	unsigned moved_down = (bits & 0xAAAAAAAA) >> 1;	/* odd bits -> even slots */

	return moved_up | moved_down;
}
/* Scramble @pid with pid_fry() only when @cgrp has sane_behavior set. */
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
	return cgroup_sane_behavior(cgrp) ? pid_fry(pid) : pid;
}
Ben Blum
committed
/*
 * sort() comparator ordering pid_t values ascending.
 *
 * Returns a negative, zero or positive value when *@a is respectively
 * less than, equal to or greater than *@b.  The result comes from two
 * comparisons rather than a subtraction so that it cannot overflow for
 * any pair of inputs, and the const qualifier of the arguments is kept.
 */
static int cmppid(const void *a, const void *b)
{
	pid_t lhs = *(const pid_t *)a;
	pid_t rhs = *(const pid_t *)b;

	return (lhs > rhs) - (lhs < rhs);
}
/*
 * sort() comparator ordering pid_t values by their pid_fry()ed form.
 *
 * Returns a negative, zero or positive value when the scrambled *@a is
 * respectively less than, equal to or greater than the scrambled *@b.
 * Uses comparisons rather than a subtraction so the result cannot
 * overflow, and keeps the const qualifier of the arguments.
 */
static int fried_cmppid(const void *a, const void *b)
{
	pid_t lhs = pid_fry(*(const pid_t *)a);
	pid_t rhs = pid_fry(*(const pid_t *)b);

	return (lhs > rhs) - (lhs < rhs);
}
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
lockdep_assert_held(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links)
if (l->key.type == type && l->key.ns == ns)
return l;
return NULL;
}
Ben Blum
committed
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
enum cgroup_filetype type)
Ben Blum
committed
{
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
l = cgroup_pidlist_find(cgrp, type);
if (l)
return l;
Ben Blum
committed
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
Ben Blum
committed
return l;
INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
Ben Blum
committed
l->key.type = type;
/* don't need task_nsproxy() if we're looking at ourself */
l->key.ns = get_pid_ns(task_active_pid_ns(current));
Ben Blum
committed
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
return l;
}
Ben Blum
committed
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
Ben Blum
committed
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
Ben Blum
committed
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct css_task_iter it;
struct task_struct *tsk;
Ben Blum
committed
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
Ben Blum
committed
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = pidlist_allocate(length);
Ben Blum
committed
if (!array)
return -ENOMEM;
/* now, populate the array */
css_task_iter_start(&cgrp->dummy_css, &it);
while ((tsk = css_task_iter_next(&it))) {
Ben Blum
committed
if (unlikely(n == length))
Ben Blum
committed
/* get tgid or pid for procs or tasks file respectively */
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
Ben Blum
committed
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
css_task_iter_end(&it);
Ben Blum
committed
length = n;
/* now sort & (if procs) strip out duplicates */
if (cgroup_sane_behavior(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
Ben Blum
committed
if (!l) {
pidlist_free(array);
Ben Blum
committed
return -ENOMEM;
Ben Blum
committed
}
/* store array, freeing old if necessary */
pidlist_free(l->list);
Ben Blum
committed
l->list = array;
l->length = length;
Ben Blum
committed
*lp = l;
Ben Blum
committed
return 0;
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
int ret = -EINVAL;
struct cgroup *cgrp;
struct css_task_iter it;
* Validate dentry by checking the superblock operations,
* and make sure it's a directory.
if (dentry->d_sb->s_op != &cgroup_ops ||
!S_ISDIR(dentry->d_inode->i_mode))
cgrp = dentry->d_fsdata;
css_task_iter_start(&cgrp->dummy_css, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
css_task_iter_end(&it);
Ben Blum
committed
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
Ben Blum
committed
* in the cgroup->l->list array.
Ben Blum
committed
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
struct cgroup_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
int *iter, ret;
mutex_lock(&cgrp->pidlist_mutex);
/*
* !NULL @of->priv indicates that this isn't the first start()
* after open. If the matching pidlist is around, we can use that.
* Look for it. Note that @of->priv can't be used directly. It
* could already have been destroyed.
*/
if (of->priv)
of->priv = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
if (!of->priv) {
ret = pidlist_array_load(cgrp, type,
(struct cgroup_pidlist **)&of->priv);
if (ret)
return ERR_PTR(ret);
}
if (pid) {
Ben Blum
committed
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
Ben Blum
committed
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
Ben Blum
committed
iter = l->list + index;
*pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
Ben Blum
committed
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
struct cgroup_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
Ben Blum
committed
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
struct cgroup_open_file *of = s->private;
struct cgroup_pidlist *l = of->priv;
Ben Blum
committed
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
return NULL;
} else {
*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
Ben Blum
committed
/* seq_file ->show(): print the pid that iterator @v points at, one per line. */
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
	const int *pid = v;

	return seq_printf(s, "%d\n", *pid);
}
Ben Blum
committed
/*
 * seq_file iteration callbacks for pidlists.  The same table serves both
 * the tasks and the procs files.
 */
static const struct seq_operations cgroup_pidlist_seq_operations = {
	.start	= cgroup_pidlist_start,
	.next	= cgroup_pidlist_next,
	.show	= cgroup_pidlist_show,
	.stop	= cgroup_pidlist_stop,
};
Ben Blum
committed
/*
 * File operations for a pidlist file: reads and seeks go through
 * seq_file, writes go to cgroup_file_write().
 */
static const struct file_operations cgroup_pidlist_operations = {
	.llseek		= seq_lseek,
	.read		= seq_read,
	.write		= cgroup_file_write,
	.release	= seq_release_private,
};
Ben Blum
committed
* The following functions handle opens on a file that displays a pidlist
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
* in the cgroup.
Ben Blum
committed
/* helper function for the two below it */
static int cgroup_pidlist_open(struct inode *unused, struct file *file)
struct cfent *cfe = __d_cfe(file->f_dentry);
struct cgroup_open_file *of;
Ben Blum
committed
/* configure file information */
file->f_op = &cgroup_pidlist_operations;
of = __seq_open_private(file, &cgroup_pidlist_seq_operations,
sizeof(*of));
if (!of)
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
return notify_on_release(css->cgroup);
}
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
/*
* When dput() is called asynchronously, if umount has been done and
* then deactivate_super() in cgroup_free_fn() kills the superblock,
* there's a small window that vfs will see the root dentry with non-zero
* refcnt and trigger BUG().
*
* That's why we hold a reference before dput() and drop it right after.
*/
/* Put @cgrp's dentry while holding an extra s_active count on its superblock. */
static void cgroup_dput(struct cgroup *cgrp)
{
struct super_block *sb = cgrp->root->sb;
/* take the extra superblock reference before the dput() */
atomic_inc(&sb->s_active);
dput(cgrp->dentry);
/* drop the reference taken above, right after the dput() */
deactivate_super(sb);
}
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
static struct cftype cgroup_base_files[] = {
.name = "cgroup.procs",
.open = cgroup_pidlist_open,
.private = CGROUP_FILE_PROCS,
.write_u64 = cgroup_procs_write,
.mode = S_IRUGO | S_IWUSR,
Ben Blum
committed
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_sane_behavior_show,
},
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
* don't exist if sane_behavior. If you're depending on these, be
* prepared to be burned.
*/
{
.name = "tasks",
.flags = CFTYPE_INSANE, /* use "procs" instead */
.open = cgroup_pidlist_open,
.private = CGROUP_FILE_TASKS,
.write_u64 = cgroup_tasks_write,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
.flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
},
Tejun Heo
committed
* cgroup_populate_dir - create subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be added
*
* On failure, no file is added.
Tejun Heo
committed
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
Tejun Heo
committed
int i, ret = 0;
/* process cftsets of each subsystem */
Tejun Heo
committed
for_each_subsys(ss, i) {
Tejun Heo
committed
if (!test_bit(i, &subsys_mask))
list_for_each_entry(set, &ss->cftsets, node) {
ret = cgroup_addrm_files(cgrp, set->cfts, true);
if (ret < 0)
goto err;
}
err:
cgroup_clear_dir(cgrp, subsys_mask);
return ret;
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
/*
* css destruction is four-stage process.
*
* 1. Destruction starts. Killing of the percpu_ref is initiated.
* Implemented in kill_css().
*
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
* and thus css_tryget() is guaranteed to fail, the css can be offlined
* by invoking offline_css(). After offlining, the base ref is put.
* Implemented in css_killed_work_fn().
*
* 3. When the percpu_ref reaches zero, the only possible remaining
* accessors are inside RCU read sections. css_release() schedules the
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
* css_free_work_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
* steps to the already complex sequence.
*/
static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
if (css->parent)
css_put(css->parent);
css->ss->css_free(css);
cgroup_dput(cgrp);
static void css_free_rcu_fn(struct rcu_head *rcu_head)
{
struct cgroup_subsys_state *css =
container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
/*
* css holds an extra ref to @cgrp->dentry which is put on the last
* css_put(). dput() requires process context which we don't have.
*/
INIT_WORK(&css->destroy_work, css_free_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
static void css_release(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
call_rcu(&css->rcu_head, css_free_rcu_fn);
static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
struct cgroup *cgrp)
css->cgroup = cgrp;