Newer
Older
get_task_struct(p);
put_task_struct(dropped);
}
/*
* Else the new task was newer than anything already in
* the heap and wasn't inserted
*/
}
cgroup_iter_end(scan->cg, &it);
if (heap->size) {
for (i = 0; i < heap->size; i++) {
struct task_struct *q = heap->ptrs[i];
latest_time = q->start_time;
latest_task = q;
}
/* Process the task per the caller's callback */
scan->process_task(q, scan);
put_task_struct(q);
}
/*
* If we had to process any tasks at all, scan again
* in case some of them were in the middle of forking
* children that didn't get processed.
* Not the most efficient way to do it, but it avoids
* having to take callback_mutex in the fork path
*/
goto again;
}
if (heap == &tmp_heap)
heap_free(&tmp_heap);
return 0;
}
Ben Blum
committed
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
3078
3079
3080
3081
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
return vmalloc(count * sizeof(pid_t));
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
vfree(p);
else
kfree(p);
}
static void *pidlist_resize(void *p, int newcount)
{
void *newlist;
/* note: if new alloc fails, old p will still be valid either way */
if (is_vmalloc_addr(p)) {
newlist = vmalloc(newcount * sizeof(pid_t));
if (!newlist)
return NULL;
memcpy(newlist, p, newcount * sizeof(pid_t));
vfree(p);
} else {
newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
}
return newlist;
}
Ben Blum
committed
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* If the new stripped list is sufficiently smaller and there's enough memory
* to allocate a new buffer, will let go of the unneeded memory. Returns the
* number of unique elements.
Ben Blum
committed
/* is the size difference enough that we should re-allocate the array? */
#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
static int pidlist_uniq(pid_t **p, int length)
Ben Blum
committed
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
int src, dest = 1;
pid_t *list = *p;
pid_t *newlist;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after:
/*
* if the length difference is large enough, we want to allocate a
* smaller buffer to save memory. if this fails due to out of memory,
* we'll just stay with what we've got.
*/
if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
newlist = pidlist_resize(list, dest);
Ben Blum
committed
if (newlist)
*p = newlist;
}
return dest;
}
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
Ben Blum
committed
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = current->nsproxy->pid_ns;
Ben Blum
committed
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
* the last ref-holder is trying to remove l from the list at the same
* time. Holding the pidlist_mutex precludes somebody taking whichever
* list we find out from under us - compare release_pid_array().
*/
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links) {
if (l->key.type == type && l->key.ns == ns) {
/* make sure l doesn't vanish out from under us */
down_write(&l->mutex);
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
}
/* entry not found; create a new one */
l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
if (!l) {
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
init_rwsem(&l->mutex);
down_write(&l->mutex);
l->key.type = type;
Ben Blum
committed
l->use_count = 0; /* don't increment here */
l->list = NULL;
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
mutex_unlock(&cgrp->pidlist_mutex);
return l;
}
Ben Blum
committed
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
Ben Blum
committed
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
Ben Blum
committed
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct cgroup_iter it;
struct task_struct *tsk;
Ben Blum
committed
struct cgroup_pidlist *l;
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = pidlist_allocate(length);
Ben Blum
committed
if (!array)
return -ENOMEM;
/* now, populate the array */
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
Ben Blum
committed
if (unlikely(n == length))
Ben Blum
committed
/* get tgid or pid for procs or tasks file respectively */
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
Ben Blum
committed
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
cgroup_iter_end(cgrp, &it);
Ben Blum
committed
length = n;
/* now sort & (if procs) strip out duplicates */
sort(array, length, sizeof(pid_t), cmppid, NULL);
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
Ben Blum
committed
length = pidlist_uniq(&array, length);
Ben Blum
committed
l = cgroup_pidlist_find(cgrp, type);
if (!l) {
pidlist_free(array);
Ben Blum
committed
return -ENOMEM;
Ben Blum
committed
}
Ben Blum
committed
/* store array, freeing old if necessary - lock already held */
pidlist_free(l->list);
Ben Blum
committed
l->list = array;
l->length = length;
l->use_count++;
up_write(&l->mutex);
Ben Blum
committed
*lp = l;
Ben Blum
committed
return 0;
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
int ret = -EINVAL;
struct cgroup *cgrp;
* Validate dentry by checking the superblock operations,
* and make sure it's a directory.
if (dentry->d_sb->s_op != &cgroup_ops ||
!S_ISDIR(dentry->d_inode->i_mode))
cgrp = dentry->d_fsdata;
cgroup_iter_start(cgrp, &it);
while ((tsk = cgroup_iter_next(cgrp, &it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
cgroup_iter_end(cgrp, &it);
Ben Blum
committed
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
Ben Blum
committed
* in the cgroup->l->list array.
Ben Blum
committed
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
int index = 0, pid = *pos;
int *iter;
Ben Blum
committed
down_read(&l->mutex);
if (pid) {
Ben Blum
committed
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
Ben Blum
committed
if (l->list[mid] == pid) {
index = mid;
break;
Ben Blum
committed
} else if (l->list[mid] <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
Ben Blum
committed
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
Ben Blum
committed
iter = l->list + index;
*pos = *iter;
return iter;
}
Ben Blum
committed
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
up_read(&l->mutex);
}
Ben Blum
committed
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
Ben Blum
committed
struct cgroup_pidlist *l = s->private;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
return NULL;
} else {
*pos = *p;
return p;
}
}
Ben Blum
committed
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}
Ben Blum
committed
/*
* seq_operations functions for iterating on pidlists through seq_file -
* independent of whether it's tasks or procs
*/
static const struct seq_operations cgroup_pidlist_seq_operations = {
.start = cgroup_pidlist_start,
.stop = cgroup_pidlist_stop,
.next = cgroup_pidlist_next,
.show = cgroup_pidlist_show,
};
Ben Blum
committed
static void cgroup_release_pid_array(struct cgroup_pidlist *l)
Ben Blum
committed
/*
* the case where we're the last user of this particular pidlist will
* have us remove it from the cgroup's list, which entails taking the
* mutex. since in pidlist_find the pidlist->lock depends on cgroup->
* pidlist_mutex, we have to take pidlist_mutex first.
*/
mutex_lock(&l->owner->pidlist_mutex);
Ben Blum
committed
down_write(&l->mutex);
BUG_ON(!l->use_count);
if (!--l->use_count) {
Ben Blum
committed
/* we're the last user if refcount is 0; remove and free */
list_del(&l->links);
mutex_unlock(&l->owner->pidlist_mutex);
pidlist_free(l->list);
Ben Blum
committed
put_pid_ns(l->key.ns);
up_write(&l->mutex);
kfree(l);
return;
Ben Blum
committed
mutex_unlock(&l->owner->pidlist_mutex);
Ben Blum
committed
up_write(&l->mutex);
Ben Blum
committed
static int cgroup_pidlist_release(struct inode *inode, struct file *file)
Ben Blum
committed
struct cgroup_pidlist *l;
if (!(file->f_mode & FMODE_READ))
return 0;
Ben Blum
committed
/*
* the seq_file will only be initialized if the file was opened for
* reading; hence we check if it's not null only in that case.
*/
l = ((struct seq_file *)file->private_data)->private;
cgroup_release_pid_array(l);
return seq_release(inode, file);
}
Ben Blum
committed
static const struct file_operations cgroup_pidlist_operations = {
.read = seq_read,
.llseek = seq_lseek,
.write = cgroup_file_write,
Ben Blum
committed
.release = cgroup_pidlist_release,
};
Ben Blum
committed
* The following functions handle opens on a file that displays a pidlist
* (tasks or procs). Prepare an array of the process/thread IDs of whoever's
* in the cgroup.
Ben Blum
committed
/* helper function for the two below it */
Ben Blum
committed
static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
Ben Blum
committed
struct cgroup_pidlist *l;
int retval;
/* Nothing to do for write-only files */
if (!(file->f_mode & FMODE_READ))
return 0;
Ben Blum
committed
/* have the array populated */
Ben Blum
committed
retval = pidlist_array_load(cgrp, type, &l);
Ben Blum
committed
if (retval)
return retval;
/* configure file information */
file->f_op = &cgroup_pidlist_operations;
Ben Blum
committed
retval = seq_open(file, &cgroup_pidlist_seq_operations);
if (retval) {
Ben Blum
committed
cgroup_release_pid_array(l);
return retval;
Ben Blum
committed
((struct seq_file *)file->private_data)->private = l;
Ben Blum
committed
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
Ben Blum
committed
return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
Ben Blum
committed
}
static int cgroup_procs_open(struct inode *unused, struct file *file)
{
Ben Blum
committed
return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
Ben Blum
committed
}
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
struct cftype *cft)
{
return notify_on_release(cgrp);
}
static int cgroup_write_notify_on_release(struct cgroup *cgrp,
struct cftype *cft,
u64 val)
{
clear_bit(CGRP_RELEASABLE, &cgrp->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
else
clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
return 0;
}
/*
* Unregister event and free resources.
*
* Gets called from workqueue.
*/
static void cgroup_event_remove(struct work_struct *work)
{
struct cgroup_event *event = container_of(work, struct cgroup_event,
remove);
struct cgroup *cgrp = event->cgrp;
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
eventfd_ctx_put(event->eventfd);
kfree(event);
dput(cgrp->dentry);
}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct cgroup_event *event = container_of(wait,
struct cgroup_event, wait);
struct cgroup *cgrp = event->cgrp;
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
__remove_wait_queue(event->wqh, &event->wait);
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
spin_lock(&cgrp->event_list_lock);
list_del(&event->list);
spin_unlock(&cgrp->event_list_lock);
/*
* We are in atomic context, but cgroup_event_remove() may
* sleep, so we have to call it in workqueue.
*/
schedule_work(&event->remove);
}
return 0;
}
static void cgroup_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct cgroup_event *event = container_of(pt,
struct cgroup_event, pt);
event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
}
/*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
struct cgroup_event *event = NULL;
unsigned int efd, cfd;
struct file *efile = NULL;
struct file *cfile = NULL;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
event->cgrp = cgrp;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);
efile = eventfd_fget(efd);
if (IS_ERR(efile)) {
ret = PTR_ERR(efile);
goto fail;
}
event->eventfd = eventfd_ctx_fileget(efile);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto fail;
}
cfile = fget(cfd);
if (!cfile) {
ret = -EBADF;
goto fail;
}
/* the process need read permission on control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(cfile->f_path.dentry->d_inode, MAY_READ);
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
if (ret < 0)
goto fail;
event->cft = __file_cft(cfile);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto fail;
}
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto fail;
}
ret = event->cft->register_event(cgrp, event->cft,
event->eventfd, buffer);
if (ret)
goto fail;
if (efile->f_op->poll(efile, &event->pt) & POLLHUP) {
event->cft->unregister_event(cgrp, event->cft, event->eventfd);
ret = 0;
goto fail;
}
/*
* Events should be removed after rmdir of cgroup directory, but before
* destroying subsystem state objects. Let's take reference to cgroup
* directory dentry to do that.
*/
dget(cgrp->dentry);
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
spin_lock(&cgrp->event_list_lock);
list_add(&event->list, &cgrp->event_list);
spin_unlock(&cgrp->event_list_lock);
fput(cfile);
fput(efile);
return 0;
fail:
if (cfile)
fput(cfile);
if (event && event->eventfd && !IS_ERR(event->eventfd))
eventfd_ctx_put(event->eventfd);
if (!IS_ERR_OR_NULL(efile))
fput(efile);
kfree(event);
return ret;
}
static u64 cgroup_clone_children_read(struct cgroup *cgrp,
struct cftype *cft)
{
return clone_children(cgrp);
}
static int cgroup_clone_children_write(struct cgroup *cgrp,
struct cftype *cft,
u64 val)
{
if (val)
set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
else
clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
return 0;
}
/*
* for the common functions, 'private' gives the type of file
*/
Ben Blum
committed
/* for hysterical raisins, we can't put this on the older files */
#define CGROUP_FILE_GENERIC_PREFIX "cgroup."
static struct cftype files[] = {
{
.name = "tasks",
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
Ben Blum
committed
.release = cgroup_pidlist_release,
Ben Blum
committed
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
Ben Blum
committed
.release = cgroup_pidlist_release,
Ben Blum
committed
},
{
.name = "notify_on_release",
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
{
.name = CGROUP_FILE_GENERIC_PREFIX "event_control",
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
};
static struct cftype cft_release_agent = {
.name = "release_agent",
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
static int cgroup_populate_dir(struct cgroup *cgrp)
{
int err;
struct cgroup_subsys *ss;
/* First clear out any existing files */
cgroup_clear_directory(cgrp->dentry);
err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
if (err < 0)
return err;
if (cgrp == cgrp->top_cgroup) {
if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
return err;
}
for_each_subsys(cgrp->root, ss) {
if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
/* This cgroup is ready now */
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferened
* from RCU-read-side without locks.
*/
if (css->id)
rcu_assign_pointer(css->id->css, css);
}
return 0;
}
static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss,
struct cgroup *cgrp)
css->cgroup = cgrp;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;
static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
{
/* We need to take each hierarchy_mutex in a consistent order */
int i;
/*
* No worry about a race with rebind_subsystems that might mess up the
* locking order, since both parties are under cgroup_mutex.
*/
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
}
}
static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
{
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
if (ss->root == root)
mutex_unlock(&ss->hierarchy_mutex);
}
}
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
* @dentry: dentry of the new cgroup
* @mode: mode to set on new inode
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
int err = 0;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
return -ENOMEM;
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* disappear while someone has an open control file on the
* fs */
atomic_inc(&sb->s_active);
mutex_lock(&cgroup_mutex);
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
cgrp->root = parent->root;
cgrp->top_cgroup = parent->top_cgroup;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (clone_children(parent))
set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css = ss->create(ss, cgrp);
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_destroy;
}
init_cgroup_css(css, ss, cgrp);
if (ss->use_id) {
err = alloc_css_id(ss, parent, cgrp);
if (err)
/* At error, ->destroy() callback has to free assigned ID. */
if (clone_children(parent) && ss->post_clone)
ss->post_clone(ss, cgrp);
cgroup_lock_hierarchy(root);
list_add(&cgrp->sibling, &cgrp->parent->children);
cgroup_unlock_hierarchy(root);
root->number_of_cgroups++;
err = cgroup_create_dir(cgrp, dentry, mode);
if (err < 0)
goto err_remove;
/* The cgroup directory was pre-locked for us */
BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
err = cgroup_populate_dir(cgrp);
/* If err < 0, we have a half-filled directory - oh well ;) */
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
err_remove:
cgroup_lock_hierarchy(root);
list_del(&cgrp->sibling);
cgroup_unlock_hierarchy(root);
root->number_of_cgroups--;
err_destroy:
for_each_subsys(root, ss) {
if (cgrp->subsys[ss->subsys_id])
ss->destroy(ss, cgrp);
}
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
kfree(cgrp);
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct cgroup *c_parent = dentry->d_parent->d_fsdata;
/* the vfs holds inode->i_mutex already */
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
static int cgroup_has_css_refs(struct cgroup *cgrp)
{
/* Check the reference count on each subsystem. Since we
* already established that there are no tasks in the
* cgroup, if the css refcount is also 1, then there should
* be no outstanding references, so the subsystem is safe to
* destroy. We scan across all subsystems rather than using
* the per-hierarchy linked list of mounted subsystems since
* we can be called via check_for_release() with no
* synchronization other than RCU, and the subsystem linked
* list isn't RCU-safe */
int i;
/*
* We won't need to lock the subsys array, because the subsystems
* we're concerned about aren't going anywhere since our cgroup root
* has a reference on them.
*/
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
struct cgroup_subsys_state *css;
/* Skip subsystems not present or not in this hierarchy */
if (ss == NULL || ss->root != cgrp->root)
continue;
css = cgrp->subsys[ss->subsys_id];
/* When called from check_for_release() it's possible
* that by this point the cgroup has been removed
* and the css deleted. But a false-positive doesn't
* matter, since it can only happen if the cgroup
* has been deleted and hence no longer needs the
* release agent to be called anyway. */
return 1;
}
return 0;
}
/*
* Atomically mark all (or else none) of the cgroup's CSS objects as
* CSS_REMOVED. Return true on success, or false if the cgroup has
* busy subsystems. Call with cgroup_mutex held
*/
static int cgroup_clear_css_refs(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
unsigned long flags;
bool failed = false;
local_irq_save(flags);
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
int refcnt;
while (1) {
/* We can only remove a CSS with a refcnt==1 */
refcnt = atomic_read(&css->refcnt);
if (refcnt > 1) {
failed = true;
goto done;
}
BUG_ON(!refcnt);
/*
* Drop the refcnt to 0 while we check other
* subsystems. This will cause any racing
* css_tryget() to spin until we set the
* CSS_REMOVED bits or abort
*/
if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
break;
cpu_relax();
}
}
done:
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
if (failed) {
/*
* Restore old refcnt if we previously managed
* to clear it from 1 to 0
*/
if (!atomic_read(&css->refcnt))
atomic_set(&css->refcnt, 1);
} else {
/* Commit the fact that the CSS is removed */
set_bit(CSS_REMOVED, &css->flags);
}
}
local_irq_restore(flags);
return !failed;
}
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
struct cgroup_event *event, *tmp;
/* the vfs holds both inode->i_mutex already */
mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex);