Newer
Older
/* no sibling left, visit parent */
next = pos->parent;
return next != cgroup ? next : NULL;
}
EXPORT_SYMBOL_GPL(cgroup_next_descendant_post);
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
Kirill A. Shutemov
committed
__acquires(css_set_lock)
{
/*
* The first time anyone tries to iterate across a cgroup,
* we need to enable the list linking each css_set to its
* tasks, and fix up all existing tasks.
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
read_lock(&css_set_lock);
it->cg_link = &cgrp->css_sets;
cgroup_advance_iter(cgrp, it);
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
struct cgroup_iter *it)
{
struct task_struct *res;
struct list_head *l = it->task;
/* If the iterator cg is NULL, we have no tasks */
if (!it->cg_link)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
/* Advance iterator to find next entry */
l = l->next;
link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
if (l == &link->cg->tasks) {
/* We reached the end of this task list - move on to
* the next cg_cgroup_link */
cgroup_advance_iter(cgrp, it);
} else {
it->task = l;
}
return res;
}
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
Kirill A. Shutemov
committed
__releases(css_set_lock)
{
read_unlock(&css_set_lock);
}
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
static inline int started_after_time(struct task_struct *t1,
struct timespec *time,
struct task_struct *t2)
{
int start_diff = timespec_compare(&t1->start_time, time);
if (start_diff > 0) {
return 1;
} else if (start_diff < 0) {
return 0;
} else {
/*
* Arbitrarily, if two processes started at the same
* time, we'll say that the lower pointer value
* started first. Note that t2 may have exited by now
* so this may not be a valid pointer any longer, but
* that's fine - it still serves to distinguish
* between two tasks started (effectively) simultaneously.
*/
return t1 > t2;
}
}
/*
* This function is a callback from heap_insert() and is used to order
* the heap.
* In this case we order the heap in descending task start time.
*/
static inline int started_after(void *p1, void *p2)
{
struct task_struct *t1 = p1;
struct task_struct *t2 = p2;
return started_after_time(t1, &t2->start_time, t2);
}
/**
* cgroup_scan_tasks - iterate though all the tasks in a cgroup
* @scan: struct cgroup_scanner containing arguments for the scan
*
* Arguments include pointers to callback functions test_task() and
* process_task().
* Iterate through all the tasks in a cgroup, calling test_task() for each,
* and if it returns true, call process_task() for it also.
* The test_task pointer may be NULL, meaning always true (select all tasks).
* Effectively duplicates cgroup_iter_{start,next,end}()
* but does not lock css_set_lock for the call to process_task().
* The struct cgroup_scanner may be embedded in any structure of the caller's
* creation.
* It is guaranteed that process_task() will act on every task that
* is a member of the cgroup for the duration of this call. This
* function may or may not call process_task() for tasks that exit
* or move to a different cgroup during the call, or are forked or
* move into the cgroup during the call.
*
* Note that test_task() may be called with locks held, and may in some
* situations be called multiple times for the same task, so it should
* be cheap.
* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
* pre-allocated and will be used for heap operations (and its "gt" member will
* be overwritten), else a temporary heap will be used (allocation of which
* may cause this function to fail).
*/
int cgroup_scan_tasks(struct cgroup_scanner *scan)
{
int retval, i;
struct cgroup_iter it;
struct task_struct *p, *dropped;
/* Never dereference latest_task, since it's not refcounted */
struct task_struct *latest_task = NULL;
struct ptr_heap tmp_heap;
struct ptr_heap *heap;
struct timespec latest_time = { 0, 0 };
if (scan->heap) {
/* The caller supplied our heap and pre-allocated its memory */
heap = scan->heap;
heap->gt = &started_after;
} else {
/* We need to allocate our own heap memory */
heap = &tmp_heap;
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
if (retval)
/* cannot allocate the heap */
return retval;
}
again:
/*
* Scan tasks in the cgroup, using the scanner's "test_task" callback
* to determine which are of interest, and using the scanner's
* "process_task" callback to process any of them that need an update.
* Since we don't want to hold any locks during the task updates,
* gather tasks to be processed in a heap structure.
* The heap is sorted by descending task start time.
* If the statically-sized heap fills up, we overflow tasks that
* started later, and in future iterations only consider tasks that
* started after the latest task in the previous pass. This
* guarantees forward progress and that we don't miss any tasks.
*/
heap->size = 0;
cgroup_iter_start(scan->cg, &it);
while ((p = cgroup_iter_next(scan->cg, &it))) {
/*
* Only affect tasks that qualify per the caller's callback,
* if he provided one
*/
if (scan->test_task && !scan->test_task(p, scan))
continue;
/*
* Only process tasks that started after the last task
* we processed
*/
if (!started_after_time(p, &latest_time, latest_task))
continue;
dropped = heap_insert(heap, p);
if (dropped == NULL) {
/*
* The new task was inserted; the heap wasn't
* previously full
*/
get_task_struct(p);
} else if (dropped != p) {
/*
* The new task was inserted, and pushed out a
* different task
*/
get_task_struct(p);
put_task_struct(dropped);
}
/*
* Else the new task was newer than anything already in
* the heap and wasn't inserted
*/
}
cgroup_iter_end(scan->cg, &it);
if (heap->size) {
for (i = 0; i < heap->size; i++) {
struct task_struct *q = heap->ptrs[i];
latest_time = q->start_time;
latest_task = q;
}
/* Process the task per the caller's callback */
scan->process_task(q, scan);
put_task_struct(q);
}
/*
* If we had to process any tasks at all, scan again
* in case some of them were in the middle of forking
* children that didn't get processed.
* Not the most efficient way to do it, but it avoids
* having to take callback_mutex in the fork path
*/
goto again;
}
if (heap == &tmp_heap)
heap_free(&tmp_heap);
return 0;
}
Tejun Heo
committed
static void cgroup_transfer_one_task(struct task_struct *task,
struct cgroup_scanner *scan)
{
struct cgroup *new_cgroup = scan->data;
Tejun Heo
committed
cgroup_attach_task(new_cgroup, task, false);
Tejun Heo
committed
}
/**
* cgroup_trasnsfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
struct cgroup_scanner scan;
scan.cg = from;
scan.test_task = NULL; /* select all tasks in cgroup */
scan.process_task = cgroup_transfer_one_task;
scan.heap = NULL;
scan.data = to;
return cgroup_scan_tasks(&scan);
}
Ben Blum
committed
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS,
CGROUP_FILE_TASKS,
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* how many files are using the current array */
int use_count;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* protects the other fields */
struct rw_semaphore mutex;
};
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
return vmalloc(count * sizeof(pid_t));
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
vfree(p);
else
kfree(p);
}
Ben Blum
committed
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
static int pidlist_uniq(pid_t *list, int length)
Ben Blum
committed
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
int src, dest = 1;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after:
return dest;
}
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
Ben Blum
committed
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
Ben Blum
committed
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
* the last ref-holder is trying to remove l from the list at the same
* time. Holding the pidlist_mutex precludes somebody taking whichever
* list we find out from under us - compare release_pid_array().
*/
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links) {
if (l->key.type == type && l->key.ns == ns) {
/* make sure l doesn't vanish out from under us */
down_write(&l->mutex);
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
}
/* entry not found; create a new one */
l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
if (!l) {
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
init_rwsem(&l->mutex);
down_write(&l->mutex);
l->key.type = type;
Ben Blum
committed
l->use_count = 0; /* don't increment here */
l->list = NULL;
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
Ben Blum
committed
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
Ben Blum
committed
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
Ben Blum
committed
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct cgroup_iter it;
struct task_struct *tsk;
Ben Blum
committed
struct cgroup_pidlist *l;
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = pidlist_allocate(length);
Ben Blum
committed
if (!array)
return -ENOMEM;
/* now, populate the array */
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
Ben Blum
committed
if (unlikely(n == length))
Ben Blum
committed
/* get tgid or pid for procs or tasks file respectively */
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
Ben Blum
committed
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
cgroup_iter_end(cgrp, &it);
Ben Blum
committed
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
Ben Blum
committed
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
pidlist_free(array);
Ben Blum
committed
return -ENOMEM;
Ben Blum
committed
}
Ben Blum
committed
/* store array, freeing old if necessary - lock already held */
pidlist_free(l->list);
Ben Blum
committed
l->list = array;
l->length = length;
l->use_count++;
up_write(&l->mutex);
Ben Blum
committed
*lp = l;
Ben Blum
committed
return 0;
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
int ret = -EINVAL;
struct cgroup *cgrp;
* Validate dentry by checking the superblock operations,
* and make sure it's a directory.
if (dentry->d_sb->s_op != &cgroup_ops ||
!S_ISDIR(dentry->d_inode->i_mode))
cgrp = dentry->d_fsdata;
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
cgroup_iter_end(cgrp, &it);
Ben Blum
committed
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
Ben Blum
committed
* in the cgroup->l->list array.
Ben Blum
committed
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
int index = 0, pid = *pos;
int *iter;
Ben Blum
committed
down_read(&l->mutex);
if (pid) {
Ben Blum
committed
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
Ben Blum
committed
if (l->list[mid] == pid) {
index = mid;
break;
Ben Blum
committed
} else if (l->list[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
Ben Blum
committed
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
Ben Blum
committed
iter = l->list + index;
*pos = *iter;
return iter;
}
Ben Blum
committed
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
up_read(&l->mutex);
}
Ben Blum
committed
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
return NULL;
} else {
*pos = *p;
return p;
}
}
Ben Blum
committed
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}
Ben Blum
committed
/*
* seq_operations functions for iterating on pidlists through seq_file -
* independent of whether it's tasks or procs
*/
static const struct seq_operations cgroup_pidlist_seq_operations = {
.start = cgroup_pidlist_start,
.stop = cgroup_pidlist_stop,
.next = cgroup_pidlist_next,
.show = cgroup_pidlist_show,
};
Ben Blum
committed
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
Ben Blum
committed
/*
* the case where we're the last user of this particular pidlist will
* have us remove it from the cgroup's list, which entails taking the
* mutex. since in pidlist_find the pidlist->lock depends on cgroup->
* pidlist_mutex, we have to take pidlist_mutex first.
*/
mutex_lock(&l->owner->pidlist_mutex);
Ben Blum
committed
down_write(&l->mutex);
BUG_ON(!l->use_count);
if (!--l->use_count) {
Ben Blum
committed
/* we're the last user if refcount is 0; remove and free */
list_del(&l->links);
mutex_unlock(&l->owner->pidlist_mutex);
pidlist_free(l->list);
Ben Blum
committed
put_pid_ns(l->key.ns);
up_write(&l->mutex);
kfree(l);
return;
Ben Blum
committed
mutex_unlock(&l->owner->pidlist_mutex);
Ben Blum
committed
up_write(&l->mutex);
Ben Blum
committed
static int cgroup_pidlist_release(struct inode *inode, struct file *file)
Ben Blum
committed
struct cgroup_pidlist *l;
if (!(file->f_mode & FMODE_READ))
return 0;
Ben Blum
committed
/*
* the seq_file will only be initialized if the file was opened for
* reading; hence we check if it's not null only in that case.
*/
l = ((struct seq_file *)file->private_data)->private;
cgroup_release_pid_array(l);
return seq_release(inode, file);
}
Ben Blum
committed
static const struct file_operations cgroup_pidlist_operations = {
.read = seq_read,
.llseek = seq_lseek,
.write = cgroup_file_write,
Ben Blum
committed
.release = cgroup_pidlist_release,
};
Ben Blum
committed
* The following functions handle opens on a file that displays a pidlist
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
* in the cgroup.
Ben Blum
committed
/* helper function for the two below it */
Ben Blum
committed
static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
Ben Blum
committed
struct cgroup_pidlist *l;
int retval;
/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ))
return 0;
Ben Blum
committed
/* have the array populated */
Ben Blum
committed
retval = pidlist_array_load(cgrp, type, &l);
Ben Blum
committed
if (retval)
return retval;
/* configure file information */
file->f_op = &cgroup_pidlist_operations;
Ben Blum
committed
retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
Ben Blum
committed
cgroup_release_pid_array(l);
return retval;
Ben Blum
committed
((struct seq_file *)file->private_data)->private = l;
Ben Blum
committed
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
Ben Blum
committed
return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
Ben Blum
committed
}
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
Ben Blum
committed
return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
Ben Blum
committed
}
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
struct cftype *cft)
{
return notify_on_release(cgrp);
}
static int cgroup_write_notify_on_release(struct cgroup *cgrp,
struct cftype *cft,
u64 val)
{
clear_bit(CGRP_RELEASABLE, &cgrp->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
else
clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
return 0;
}
/*
* Unregister event and free resources.
*
* Gets called from workqueue.
*/
static void cgroup_event_remove(struct work_struct *work)
{
struct cgroup_event *event = container_of(work, struct cgroup_event,
remove);
struct cgroup *cgrp = event->cgrp;
remove_wait_queue(event->wqh, &event->wait);
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
/* Notify userspace the event is going away. */
eventfd_signal(event->eventfd, 1);
eventfd_ctx_put(event->eventfd);
kfree(event);
dput(cgrp->dentry);
}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct cgroup_event *event = container_of(wait,
struct cgroup_event, wait);
struct cgroup *cgrp = event->cgrp;
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
/*
* If the event has been detached at cgroup removal, we
* can simply return knowing the other side will cleanup
* for us.
*
* We can't race against event freeing since the other
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
spin_lock(&cgrp->event_list_lock);
if (!list_empty(&event->list)) {
list_del_init(&event->list);
/*
* We are in atomic context, but cgroup_event_remove()
* may sleep, so we have to call it in workqueue.
*/
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
}
return 0;
}
static void cgroup_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct cgroup_event *event = container_of(pt,
struct cgroup_event, pt);
event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
}
/*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
struct cgroup_event *event = NULL;
struct cgroup *cgrp_cfile;
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
unsigned int efd, cfd;
struct file *efile = NULL;
struct file *cfile = NULL;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
event->cgrp = cgrp;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);
efile = eventfd_fget(efd);
if (IS_ERR(efile)) {
ret = PTR_ERR(efile);
goto fail;
}
event->eventfd = eventfd_ctx_fileget(efile);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto fail;
}
cfile = fget(cfd);
if (!cfile) {
ret = -EBADF;
goto fail;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto fail;
event->cft = __file_cft(cfile);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto fail;
}
/*
* The file to be monitored must be in the same cgroup as
* cgroup.event_control is.
*/
cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
if (cgrp_cfile != cgrp) {
ret = -EINVAL;
goto fail;
}
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto fail;
}
ret = event->cft->register_event(cgrp, event->cft,
event->eventfd, buffer);
if (ret)
goto fail;
/*
* Events should be removed after rmdir of cgroup directory, but before
* destroying subsystem state objects. Let's take reference to cgroup
* directory dentry to do that.
*/
dget(cgrp->dentry);
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
spin_lock(&cgrp->event_list_lock);
list_add(&event->list, &cgrp->event_list);
spin_unlock(&cgrp->event_list_lock);
fput(cfile);
fput(efile);
return 0;
fail:
if (cfile)
fput(cfile);
if (event && event->eventfd && !IS_ERR(event->eventfd))
eventfd_ctx_put(event->eventfd);
if (!IS_ERR_OR_NULL(efile))
fput(efile);
kfree(event);
return ret;
}
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
struct cftype *cft)
{
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
}
static int cgroup_clone_children_write(struct cgroup *cgrp,
struct cftype *cft,
u64 val)
{
if (val)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
/*
* for the common functions, 'private' gives the type of file
*/
Ben Blum
committed
/* for hysterical raisins, we can't put this on the older files */
#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
static struct cftype files[] = {
{
.name = "tasks",
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
Ben Blum
committed
.release = cgroup_pidlist_release,
Ben Blum
committed
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
Ben Blum
committed
.release = cgroup_pidlist_release,
Ben Blum
committed
},
{
.name = "notify_on_release",
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
{
.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
.name = "release_agent",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
},
/**
* cgroup_populate_dir - selectively creation of files in a directory
* @cgrp: target cgroup
* @base_files: true if the base files should be added
* @subsys_mask: mask of the subsystem ids whose files should be added
*/
static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files,
unsigned long subsys_mask)
{
int err;
struct cgroup_subsys *ss;
if (base_files) {
err = cgroup_addrm_files(cgrp, NULL, files, true);
if (err < 0)
return err;
}
/* process cftsets of each subsystem */
for_each_subsys(cgrp->root, ss) {
if (!test_bit(ss->subsys_id, &subsys_mask))
continue;
list_for_each_entry(set, &ss->cftsets, node)
cgroup_addrm_files(cgrp, ss, set->cfts, true);
/* This cgroup is ready now */
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferened
* from RCU-read-side without locks.
*/
if (css->id)
rcu_assign_pointer(css->id->css, css);
}
return 0;
}
static void css_dput_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, dput_work);
struct dentry *dentry = css->cgroup->dentry;
struct super_block *sb = dentry->d_sb;
atomic_inc(&sb->s_active);
dput(dentry);
deactivate_super(sb);
static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss,
struct cgroup *cgrp)
css->cgroup = cgrp;
if (cgrp == dummytop)
css->flags |= CSS_ROOT;
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;