Newer
Older
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
cgroup_iter_end(cgrp, &it);
rcu_read_unlock();
err:
return ret;
}
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
/*
 * sort() comparison callback ordering two pid_t values ascending.
 *
 * Returns a negative, zero or positive value when *a is respectively
 * less than, equal to or greater than *b.  Uses comparisons rather
 * than subtraction so the result cannot overflow, and reads through
 * const-qualified pointers so constness is not cast away.
 */
static int cmppid(const void *a, const void *b)
{
	const pid_t lhs = *(const pid_t *)a;
	const pid_t rhs = *(const pid_t *)b;

	return (lhs > rhs) - (lhs < rhs);
}
/*
 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
 * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return
 * count 'cnt' of how many chars would be written if buf were large enough.
 *
 * Once the buffer is exhausted the remaining pids are only measured
 * (NULL buffer, zero size) instead of forming a pointer beyond the end
 * of 'buf'; callers use this to size a buffer from a one-byte probe.
 */
static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
{
	int cnt = 0;
	int i;

	for (i = 0; i < npids; i++) {
		int room = sz - cnt;

		if (room > 0)
			cnt += snprintf(buf + cnt, room, "%d\n", a[i]);
		else
			cnt += snprintf(NULL, 0, "%d\n", a[i]);
	}
	return cnt;
}
/*
* Handle an open on 'tasks' file. Prepare a buffer listing the
* process id's of tasks currently attached to the cgroup being opened.
*
* Does not require any specific cgroup mutexes, and does not take any.
*/
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
struct ctr_struct *ctr;
pid_t *pidarray;
int npids;
char c;
if (!(file->f_mode & FMODE_READ))
return 0;
ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
if (!ctr)
goto err0;
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
npids = cgroup_task_count(cgrp);
if (npids) {
pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
if (!pidarray)
goto err1;
npids = pid_array_load(pidarray, npids, cgrp);
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
/* Call pid_array_to_buf() twice, first just to get bufsz */
ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
if (!ctr->buf)
goto err2;
ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
kfree(pidarray);
} else {
ctr->buf = 0;
ctr->bufsz = 0;
}
file->private_data = ctr;
return 0;
err2:
kfree(pidarray);
err1:
kfree(ctr);
err0:
return -ENOMEM;
}
/*
 * Read handler for the 'tasks' file: copies from the text buffer stored
 * in file->private_data to the user buffer via simple_read_from_buffer(),
 * whose result is returned unchanged.  'cgrp' and 'cft' are not used.
 */
static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
				 struct cftype *cft,
				 struct file *file, char __user *buf,
				 size_t nbytes, loff_t *ppos)
{
	struct ctr_struct *ctr = file->private_data;

	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
}
/*
 * Release handler for the 'tasks' file.  For files opened with
 * FMODE_READ, frees the text buffer and the ctr_struct found in
 * file->private_data.  Always returns 0.
 */
static int cgroup_tasks_release(struct inode *unused_inode,
				struct file *file)
{
	struct ctr_struct *listing;

	/* Nothing to free unless the file was opened for reading */
	if (!(file->f_mode & FMODE_READ))
		return 0;

	listing = file->private_data;
	kfree(listing->buf);
	kfree(listing);
	return 0;
}
/*
 * read_uint handler for 'notify_on_release': reports the value of
 * notify_on_release() for this cgroup.  'cft' is not used.
 */
static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
					 struct cftype *cft)
{
	u64 enabled = notify_on_release(cgrp);

	return enabled;
}
static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
return test_bit(CGRP_RELEASABLE, &cgrp->flags);
}
/*
* for the common functions, 'private' gives the type of file
*/
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
/*
 * Control files added to a cgroup directory by cgroup_populate_dir().
 * Entry order is kept as-is; only the field order inside each entry
 * differs, which has no effect with designated initializers.
 */
static struct cftype files[] = {
	{
		/* pid listing: dedicated open/read/release handlers */
		.private = FILE_TASKLIST,
		.name = "tasks",
		.open = cgroup_tasks_open,
		.read = cgroup_tasks_read,
		.release = cgroup_tasks_release,
		.write = cgroup_common_file_write,
	},

	{
		.private = FILE_NOTIFY_ON_RELEASE,
		.name = "notify_on_release",
		.read_uint = cgroup_read_notify_on_release,
		.write = cgroup_common_file_write,
	},

	{
		/* no write handler is installed for this file */
		.private = FILE_RELEASABLE,
		.name = "releasable",
		.read_uint = cgroup_read_releasable,
	},
};
/*
 * The 'release_agent' control file, served by the common read/write
 * handlers.  cgroup_populate_dir() adds it only to a top cgroup.
 */
static struct cftype cft_release_agent = {
	.name = "release_agent",
	.read = cgroup_common_file_read,
	.write = cgroup_common_file_write,
	.private = FILE_RELEASE_AGENT,
};
/*
 * Rebuild the control files of a cgroup directory: clear what is there,
 * add the common files, add 'release_agent' when 'cgrp' is its own
 * top_cgroup, then let each subsystem of the hierarchy that provides a
 * populate() hook add its files.
 *
 * Returns 0 on success or the first negative error encountered.
 */
static int cgroup_populate_dir(struct cgroup *cgrp)
{
	struct cgroup_subsys *ss;
	int err;

	/* First clear out any existing files */
	cgroup_clear_directory(cgrp->dentry);

	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
	if (err < 0)
		return err;

	if (cgrp == cgrp->top_cgroup) {
		err = cgroup_add_file(cgrp, NULL, &cft_release_agent);
		if (err < 0)
			return err;
	}

	for_each_subsys(cgrp->root, ss) {
		if (!ss->populate)
			continue;
		err = ss->populate(ss, cgrp);
		if (err < 0)
			return err;
	}

	return 0;
}
static void init_cgroup_css(struct cgroup_subsys_state *css,
struct cgroup_subsys *ss,
struct cgroup *cgrp)
css->cgroup = cgrp;
atomic_set(&css->refcnt, 0);
css->flags = 0;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
BUG_ON(cgrp->subsys[ss->subsys_id]);
cgrp->subsys[ss->subsys_id] = css;
}
/*
 * cgroup_create - create a cgroup
 * parent:	cgroup that will be parent of the new cgroup.
 * dentry:	dentry of the directory that will represent the new cgroup.
 * mode:	mode to set on new inode
 *
 * Must be called with the mutex on the parent inode held
 *
 * Returns 0 on success, -ENOMEM if the cgroup cannot be allocated, or
 * the error from a subsystem's create() hook or from cgroup_create_dir().
 * A failure of cgroup_populate_dir() is deliberately not reported.
 */
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
			     int mode)
{
	struct cgroup *cgrp;
	struct cgroupfs_root *root = parent->root;
	int err = 0;
	struct cgroup_subsys *ss;
	struct super_block *sb = root->sb;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;

	/* Grab a reference on the superblock so the hierarchy doesn't
	 * get deleted on unmount if there are child cgroups.  This
	 * can be done outside cgroup_mutex, since the sb can't
	 * disappear while someone has an open control file on the
	 * fs */
	atomic_inc(&sb->s_active);

	mutex_lock(&cgroup_mutex);

	cgrp->flags = 0;
	INIT_LIST_HEAD(&cgrp->sibling);
	INIT_LIST_HEAD(&cgrp->children);
	INIT_LIST_HEAD(&cgrp->css_sets);
	INIT_LIST_HEAD(&cgrp->release_list);

	cgrp->parent = parent;
	cgrp->root = parent->root;
	cgrp->top_cgroup = parent->top_cgroup;

	/* Create and attach one state object per subsystem in the hierarchy */
	for_each_subsys(root, ss) {
		struct cgroup_subsys_state *css = ss->create(ss, cgrp);

		if (IS_ERR(css)) {
			err = PTR_ERR(css);
			goto err_destroy;
		}
		init_cgroup_css(css, ss, cgrp);
	}

	list_add(&cgrp->sibling, &cgrp->parent->children);
	root->number_of_cgroups++;

	err = cgroup_create_dir(cgrp, dentry, mode);
	if (err < 0)
		goto err_remove;

	/* The cgroup directory was pre-locked for us */
	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));

	err = cgroup_populate_dir(cgrp);
	/* If err < 0, we have a half-filled directory - oh well ;) */

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

	return 0;

err_remove:
	list_del(&cgrp->sibling);
	root->number_of_cgroups--;

err_destroy:
	/* Only subsystems whose state was attached get destroy() called */
	for_each_subsys(root, ss) {
		if (cgrp->subsys[ss->subsys_id])
			ss->destroy(ss, cgrp);
	}

	mutex_unlock(&cgroup_mutex);

	/* Release the reference count that we took on the superblock */
	deactivate_super(sb);

	kfree(cgrp);
	return err;
}
/*
 * mkdir handler: creates a cgroup under the cgroup stored in the parent
 * dentry's d_fsdata, forcing S_IFDIR into the mode.  Returns whatever
 * cgroup_create() returns.  'dir' is not used.
 */
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	/* the vfs holds inode->i_mutex already */
	struct cgroup *parent_cgrp = dentry->d_parent->d_fsdata;
	int dir_mode = mode | S_IFDIR;

	return cgroup_create(parent_cgrp, dentry, dir_mode);
}
static inline int cgroup_has_css_refs(struct cgroup *cgrp)
{
/* Check the reference count on each subsystem. Since we
* already established that there are no tasks in the
* cgroup, if the css refcount is also 0, then there should
* be no outstanding references, so the subsystem is safe to
* destroy. We scan across all subsystems rather than using
* the per-hierarchy linked list of mounted subsystems since
* we can be called via check_for_release() with no
* synchronization other than RCU, and the subsystem linked
* list isn't RCU-safe */
int i;
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
struct cgroup_subsys_state *css;
/* Skip subsystems not in this hierarchy */
if (ss->root != cgrp->root)
continue;
css = cgrp->subsys[ss->subsys_id];
/* When called from check_for_release() it's possible
* that by this point the cgroup has been removed
* and the css deleted. But a false-positive doesn't
* matter, since it can only happen if the cgroup
* has been deleted and hence no longer needs the
* release agent to be called anyway. */
return 1;
}
return 0;
}
/*
 * rmdir handler for a cgroup directory.
 *
 * Returns 0 on success, or -EBUSY when the cgroup's count is non-zero,
 * it still has children, or cgroup_has_css_refs() reports a reference
 * after the pre_destroy handlers have run.  'unused_dir' is not used.
 *
 * NOTE(review): this function does not touch root->number_of_cgroups or
 * the superblock reference taken in cgroup_create(); presumably that is
 * handled when the dentry is finally released - confirm.
 */
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	struct cgroup *cgrp = dentry->d_fsdata;
	struct cgroup *parent;
	struct dentry *d;
	int ret = -EBUSY;

	/* the vfs holds both inode->i_mutex already */

	mutex_lock(&cgroup_mutex);
	if (atomic_read(&cgrp->count) != 0)
		goto out_unlock;
	if (!list_empty(&cgrp->children))
		goto out_unlock;

	parent = cgrp->parent;

	/*
	 * Call pre_destroy handlers of subsys.  Notify subsystems
	 * that an rmdir() request has come in.
	 */
	cgroup_call_pre_destroy(cgrp);

	if (cgroup_has_css_refs(cgrp))
		goto out_unlock;

	/* Mark removed and drop any pending release notification */
	spin_lock(&release_list_lock);
	set_bit(CGRP_REMOVED, &cgrp->flags);
	if (!list_empty(&cgrp->release_list))
		list_del(&cgrp->release_list);
	spin_unlock(&release_list_lock);

	/* delete my sibling from parent->children */
	list_del(&cgrp->sibling);

	/* Detach the dentry from the cgroup, keeping a reference for teardown */
	spin_lock(&cgrp->dentry->d_lock);
	d = dget(cgrp->dentry);
	cgrp->dentry = NULL;
	spin_unlock(&d->d_lock);

	cgroup_d_remove_dir(d);
	dput(d);

	/* The parent lost a child, so it may now be releasable itself */
	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);

	ret = 0;
out_unlock:
	mutex_unlock(&cgroup_mutex);
	return ret;
}
/*
 * Register one subsystem against the default hierarchy (rootnode):
 * create and attach its top-level state, point every existing css_set
 * at that state, replay fork notifications for all current threads if
 * the subsystem has a fork hook, and mark the subsystem active.
 */
static void cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *top_css;
	struct list_head *pos;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	/* Create the top cgroup state for this subsystem */
	ss->root = &rootnode;
	top_css = ss->create(ss, dummytop);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(top_css));
	init_cgroup_css(top_css, ss, dummytop);

	/* Update all cgroup groups to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence all cgroup
	 * groups are in the subsystem's top cgroup. */
	write_lock(&css_set_lock);
	pos = &init_css_set.list;
	do {
		struct css_set *set = list_entry(pos, struct css_set, list);

		set->subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
		pos = pos->next;
	} while (pos != &init_css_set.list);
	write_unlock(&css_set_lock);

	/* If this subsystem requested that it be notified with fork
	 * events, we should send it one now for every process in the
	 * system */
	if (ss->fork) {
		struct task_struct *outer, *inner;

		read_lock(&tasklist_lock);
		do_each_thread(outer, inner) {
			ss->fork(ss, inner);
		} while_each_thread(outer, inner);
		read_unlock(&tasklist_lock);
	}

	need_forkexit_callback |= ss->fork || ss->exit;

	ss->active = 1;
}
/**
 * cgroup_init_early - initialize cgroups at system boot, and
 * initialize any subsystems that request early init.
 *
 * Sets up init_css_set and the default hierarchy (rootnode), links the
 * two together, sanity-checks every compiled-in subsystem and
 * initializes those with early_init set.  Always returns 0; an
 * inconsistent subsystem definition triggers BUG().
 */
int __init cgroup_init_early(void)
{
	int i;

	/* kref_init() leaves one reference; take a second one on top */
	kref_init(&init_css_set.ref);
	kref_get(&init_css_set.ref);
	INIT_LIST_HEAD(&init_css_set.list);
	INIT_LIST_HEAD(&init_css_set.cg_links);
	INIT_LIST_HEAD(&init_css_set.tasks);
	css_set_count = 1;
	init_cgroup_root(&rootnode);
	list_add(&rootnode.root_list, &roots);
	root_count = 1;
	init_task.cgroups = &init_css_set;

	/* Link init_css_set with the top cgroup of the default hierarchy */
	init_css_set_link.cg = &init_css_set;
	list_add(&init_css_set_link.cgrp_link_list,
		 &rootnode.top_cgroup.css_sets);
	list_add(&init_css_set_link.cg_link_list,
		 &init_css_set.cg_links);

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		BUG_ON(!ss->name);
		BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN);
		BUG_ON(!ss->create);
		BUG_ON(!ss->destroy);
		/* A subsystem's id must equal its index in subsys[] */
		if (ss->subsys_id != i) {
			printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
			       ss->name, ss->subsys_id);
			BUG();
		}

		if (ss->early_init)
			cgroup_init_subsys(ss);
	}
	return 0;
}
/**
 * cgroup_init - register cgroup filesystem and /proc file, and
 * initialize any subsystems that didn't request early init.
 *
 * Returns 0 on success, or the error from bdi_init() or
 * register_filesystem().  Failing to create the /proc entry is not
 * treated as an error.
 */
int __init cgroup_init(void)
{
	int err;
	int i;
	struct proc_dir_entry *entry;

	err = bdi_init(&cgroup_backing_dev_info);
	if (err)
		return err;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		if (!ss->early_init)
			cgroup_init_subsys(ss);
	}

	err = register_filesystem(&cgroup_fs_type);
	if (err < 0)
		goto out;

	entry = create_proc_entry("cgroups", 0, NULL);
	if (entry)
		entry->proc_fops = &proc_cgroupstats_operations;

out:
	/* Undo bdi_init() if anything after it failed */
	if (err)
		bdi_destroy(&cgroup_backing_dev_info);

	return err;
}
/*
 * proc_cgroup_show()
 *  - Print task's cgroup paths into seq_file, one line for each hierarchy
 *  - Used for /proc/<pid>/cgroup.
 *  - No need to task_lock(tsk) on this tsk->cgroup reference, as it
 *    doesn't really matter if tsk->cgroup changes after we read it,
 *    and we take cgroup_mutex, keeping cgroup_attach_task() from changing it
 *    anyway.  No need to check that tsk->cgroup != NULL, thanks to
 *    the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks
 *    cgroup to top_cgroup.
 *
 * Returns 0 on success, -ENOMEM if the path buffer cannot be allocated,
 * -ESRCH if the pid no longer maps to a task, or a negative error from
 * cgroup_path().
 */
/* TODO: Use a proper seq_file iterator */
static int proc_cgroup_show(struct seq_file *m, void *v)
{
	struct pid *pid;
	struct task_struct *tsk;
	char *buf;
	int retval;
	struct cgroupfs_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!buf)
		goto out;

	retval = -ESRCH;
	pid = m->private;
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out_free;

	retval = 0;

	mutex_lock(&cgroup_mutex);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int subsys_id;
		int count = 0;

		/* Skip this hierarchy if it has no active subsystems */
		if (!root->actual_subsys_bits)
			continue;
		/* Comma-separated subsystem names, then ':' and the path */
		for_each_subsys(root, ss)
			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
		seq_putc(m, ':');
		get_first_subsys(&root->top_cgroup, NULL, &subsys_id);
		cgrp = task_cgroup(tsk, subsys_id);
		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
		if (retval < 0)
			goto out_unlock;
		seq_puts(m, buf);
		seq_putc(m, '\n');
	}

out_unlock:
	mutex_unlock(&cgroup_mutex);
	put_task_struct(tsk);
out_free:
	kfree(buf);
out:
	return retval;
}
/*
 * Open handler: binds proc_cgroup_show() to the file through
 * single_open(), passing the struct pid taken from the proc inode as
 * the private data.  Returns single_open()'s result.
 */
static int cgroup_open(struct inode *inode, struct file *file)
{
	struct pid *pid;

	pid = PROC_I(inode)->pid;
	return single_open(file, proc_cgroup_show, pid);
}
/* File operations built on seq_file's single_open() helpers; see cgroup_open() */
struct file_operations proc_cgroup_operations = {
	.release = single_release,
	.llseek = seq_lseek,
	.read = seq_read,
	.open = cgroup_open,
};
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
int i;
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\n");
mutex_lock(&cgroup_mutex);
for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
struct cgroup_subsys *ss = subsys[i];
seq_printf(m, "%s\t%lu\t%d\n",
ss->name, ss->root->subsys_bits,
ss->root->number_of_cgroups);
}
mutex_unlock(&cgroup_mutex);
return 0;
}
static int cgroupstats_open(struct inode *inode, struct file *file)
{
return single_open(file, proc_cgroupstats_show, 0);
}
/* File operations installed on the "cgroups" proc entry by cgroup_init() */
static struct file_operations proc_cgroupstats_operations = {
	.release = single_release,
	.llseek = seq_lseek,
	.read = seq_read,
	.open = cgroupstats_open,
};
/**
 * cgroup_fork - attach newly forked task to its parents cgroup.
 * @child: pointer to task_struct of the newly forked child.
 *
 * Description: A task inherits its parent's cgroup at fork().
 *
 * A pointer to the shared css_set was automatically copied in
 * fork.c by dup_task_struct().  However, we ignore that copy, since
 * it was not made under the protection of RCU or cgroup_mutex, so
 * might no longer be a valid cgroup pointer.  cgroup_attach_task() might
 * have already changed current->cgroups, allowing the previously
 * referenced cgroup group to be removed and freed.
 *
 * At the point that cgroup_fork() is called, 'current' is the parent
 * task, and the passed argument 'child' points to the child task.
 */
void cgroup_fork(struct task_struct *child)
{
	struct css_set *parent_set;

	/* Re-read the parent's css_set under task_lock and take a reference */
	task_lock(current);
	parent_set = current->cgroups;
	child->cgroups = parent_set;
	get_css_set(parent_set);
	task_unlock(current);

	INIT_LIST_HEAD(&child->cg_list);
}
/**
 * cgroup_fork_callbacks - called on a new task very soon before
 * adding it to the tasklist. No need to take any locks since no-one
 * can be operating on this task
 *
 * Invokes the fork hook of every subsystem that has one, but only when
 * need_forkexit_callback is set.
 */
void cgroup_fork_callbacks(struct task_struct *child)
{
	int idx;

	if (!need_forkexit_callback)
		return;

	for (idx = 0; idx < CGROUP_SUBSYS_COUNT; idx++) {
		struct cgroup_subsys *ss = subsys[idx];

		if (ss->fork)
			ss->fork(ss, child);
	}
}
/**
 * cgroup_post_fork - called on a new task after adding it to the
 * task list. Adds the task to the list running through its css_set
 * if necessary. Has to be after the task is visible on the task list
 * in case we race with the first call to cgroup_iter_start() - to
 * guarantee that the new task ends up on its list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	/* Task lists are only maintained once use_task_css_set_links is set */
	if (!use_task_css_set_links)
		return;

	write_lock(&css_set_lock);
	if (list_empty(&child->cg_list))
		list_add(&child->cg_list, &child->cgroups->tasks);
	write_unlock(&css_set_lock);
}
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
/**
 * cgroup_exit - detach cgroup from exiting task
 * @tsk: pointer to task_struct of exiting process
 * @run_callbacks: when non-zero (and need_forkexit_callback is set),
 *	call each subsystem's exit hook first
 *
 * Description: Detach cgroup from @tsk and release it.
 *
 * Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cgroups where very high task exit scaling
 * is required on large systems.
 *
 * the_top_cgroup_hack:
 *
 *    Set the exiting tasks cgroup to the root cgroup (top_cgroup).
 *
 *    We call cgroup_exit() while the task is still competent to
 *    handle notify_on_release(), then leave the task attached to the
 *    root cgroup in each hierarchy for the remainder of its exit.
 *
 *    To do this properly, we would increment the reference count on
 *    top_cgroup, and near the very end of the kernel/exit.c do_exit()
 *    code we would add a second cgroup function call, to drop that
 *    reference.  This would just create an unnecessary hot spot on
 *    the top_cgroup reference count, to no avail.
 *
 *    Normally, holding a reference to a cgroup without bumping its
 *    count is unsafe.   The cgroup could go away, or someone could
 *    attach us to a different cgroup, decrementing the count on
 *    the first cgroup that we never incremented.  But in this case,
 *    top_cgroup isn't going away, and either task has PF_EXITING set,
 *    which wards off any cgroup_attach_task() attempts, or task is a failed
 *    fork, never visible to cgroup_attach_task.
 *
 */
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
	int i;
	struct css_set *cg;

	if (run_callbacks && need_forkexit_callback) {
		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
			struct cgroup_subsys *ss = subsys[i];

			if (ss->exit)
				ss->exit(ss, tsk);
		}
	}

	/*
	 * Unlink from the css_set task list if necessary.
	 * Optimistically check cg_list before taking
	 * css_set_lock
	 */
	if (!list_empty(&tsk->cg_list)) {
		write_lock(&css_set_lock);
		if (!list_empty(&tsk->cg_list))
			list_del(&tsk->cg_list);
		write_unlock(&css_set_lock);
	}

	/* Reassign the task to the init_css_set. */
	task_lock(tsk);
	cg = tsk->cgroups;
	tsk->cgroups = &init_css_set;
	task_unlock(tsk);

	/* Drop the old css_set only after the task lock has been released */
	if (cg)
		put_css_set_taskexit(cg);
}
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770
2771
2772
2773
2774
2775
/**
 * cgroup_clone - duplicate the current cgroup in the hierarchy
 * that the given subsystem is attached to, and move this task into
 * the new child
 * @tsk: task to move into the newly created child cgroup
 * @subsys: subsystem identifying the hierarchy to clone in
 *
 * The child directory is named "node_<pid of tsk>".
 *
 * Returns 0 on success or when @subsys is still attached to the default
 * hierarchy (nothing is cloned then), the error from the dentry lookup
 * or vfs_mkdir(), -ENOMEM if the new cgroup cannot be found afterwards,
 * or the result of cgroup_attach_task().
 */
int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
{
	struct dentry *dentry;
	int ret = 0;
	char nodename[MAX_CGROUP_TYPE_NAMELEN];
	struct cgroup *parent, *child;
	struct inode *inode;
	struct css_set *cg;
	struct cgroupfs_root *root;
	struct cgroup_subsys *ss;

	/* We shouldn't be called by an unregistered subsystem */
	BUG_ON(!subsys->active);

	/* First figure out what hierarchy and cgroup we're dealing
	 * with, and pin them so we can drop cgroup_mutex */
	mutex_lock(&cgroup_mutex);
again:
	root = subsys->root;
	if (root == &rootnode) {
		printk(KERN_INFO
		       "Not cloning cgroup for unused subsystem %s\n",
		       subsys->name);
		mutex_unlock(&cgroup_mutex);
		return 0;
	}
	cg = tsk->cgroups;
	parent = task_cgroup(tsk, subsys->subsys_id);

	snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);

	/* Pin the hierarchy */
	atomic_inc(&parent->root->sb->s_active);

	/* Keep the cgroup alive */
	get_css_set(cg);
	mutex_unlock(&cgroup_mutex);

	/* Now do the VFS work to create a cgroup */
	inode = parent->dentry->d_inode;

	/* Hold the parent directory mutex across this operation to
	 * stop anyone else deleting the new cgroup */
	mutex_lock(&inode->i_mutex);
	dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
	if (IS_ERR(dentry)) {
		printk(KERN_INFO
		       "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
		       PTR_ERR(dentry));
		ret = PTR_ERR(dentry);
		goto out_release;
	}

	/* Create the cgroup directory, which also creates the cgroup */
	ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
	child = __d_cgrp(dentry);
	dput(dentry);
	if (ret) {
		printk(KERN_INFO
		       "Failed to create cgroup %s: %d\n", nodename,
		       ret);
		goto out_release;
	}

	if (!child) {
		printk(KERN_INFO
		       "Couldn't find new cgroup %s\n", nodename);
		ret = -ENOMEM;
		goto out_release;
	}

	/* The cgroup now exists. Retake cgroup_mutex and check
	 * that we're still in the same state that we thought we
	 * were. */
	mutex_lock(&cgroup_mutex);
	if ((root != subsys->root) ||
	    (parent != task_cgroup(tsk, subsys->subsys_id))) {
		/* Aargh, we raced ... */
		mutex_unlock(&inode->i_mutex);
		put_css_set(cg);

		deactivate_super(parent->root->sb);
		/* The cgroup is still accessible in the VFS, but
		 * we're not going to try to rmdir() it at this
		 * point. */
		printk(KERN_INFO
		       "Race in cgroup_clone() - leaking cgroup %s\n",
		       nodename);
		/* cgroup_mutex is still held here, as 'again' expects */
		goto again;
	}

	/* do any required auto-setup */
	for_each_subsys(root, ss) {
		if (ss->post_clone)
			ss->post_clone(ss, child);
	}

	/* All seems fine. Finish by moving the task into the new cgroup */
	ret = cgroup_attach_task(child, tsk);
	mutex_unlock(&cgroup_mutex);

out_release:
	mutex_unlock(&inode->i_mutex);

	/* Drop the pins taken above: the css_set and the superblock */
	mutex_lock(&cgroup_mutex);
	put_css_set(cg);
	mutex_unlock(&cgroup_mutex);
	deactivate_super(parent->root->sb);
	return ret;
}
/*
 * See if "cgrp" is a descendant of the current task's cgroup in
 * the appropriate hierarchy
 *
 * If we are sending in dummytop, then presumably we are creating
 * the top cgroup in the subsystem.
 *
 * Called only by the ns (nsproxy) cgroup.
 *
 * Returns 1 if "cgrp" is dummytop, or if the current task's cgroup is
 * reached while walking up the parent chain from "cgrp" (including
 * "cgrp" itself); returns 0 if the walk stops at the hierarchy's top
 * cgroup without finding it.
 */
int cgroup_is_descendant(const struct cgroup *cgrp)
{
	int ret;
	struct cgroup *target;
	int subsys_id;

	if (cgrp == dummytop)
		return 1;

	/* Any subsystem of the hierarchy will do to locate current's cgroup */
	get_first_subsys(cgrp, NULL, &subsys_id);
	target = task_cgroup(current, subsys_id);
	while (cgrp != target && cgrp != cgrp->top_cgroup)
		cgrp = cgrp->parent;
	ret = (cgrp == target);
	return ret;
}
/*
 * Queue 'cgrp' for a userspace release notification when it is
 * releasable, has a zero count, has no children and no subsystem
 * references.  A cgroup that is already removed or already queued is
 * left alone; otherwise release_agent_work is scheduled.
 */
static void check_for_release(struct cgroup *cgrp)
{
	int queued = 0;

	/* All of these checks rely on RCU to keep the cgroup
	 * structure alive */
	if (!cgroup_is_releasable(cgrp))
		return;
	if (atomic_read(&cgrp->count))
		return;
	if (!list_empty(&cgrp->children))
		return;
	if (cgroup_has_css_refs(cgrp))
		return;

	/* Control Group is currently removeable. If it's not
	 * already queued for a userspace notification, queue
	 * it now */
	spin_lock(&release_list_lock);
	if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) {
		list_add(&cgrp->release_list, &release_list);
		queued = 1;
	}
	spin_unlock(&release_list_lock);

	if (queued)
		schedule_work(&release_agent_work);
}
/*
 * Drop one reference on a subsystem state.  When the count reaches zero
 * and the owning cgroup has notify_on_release set, the cgroup is marked
 * CGRP_RELEASABLE and checked for release.  The whole sequence runs
 * inside an RCU read-side critical section.
 */
void __css_put(struct cgroup_subsys_state *css)
{
	struct cgroup *cgrp = css->cgroup;

	rcu_read_lock();
	if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
		set_bit(CGRP_RELEASABLE, &cgrp->flags);
		check_for_release(cgrp);
	}
	rcu_read_unlock();
}
/*
 * Notify userspace when a cgroup is released, by running the
 * configured release agent with the name of the cgroup (path
 * relative to the root of cgroup file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cgroup.
 *
 * This races with the possibility that some other task will be
 * attached to this cgroup before it is removed, or that some other
 * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cgroup is no longer
 * unused, and this cgroup will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
 * means only wait until the task is successfully execve()'d.  The
 * separate release agent task is forked by call_usermodehelper(),
 * then control in this thread returns here, without waiting for the
 * release agent task.  We don't bother to wait because the caller of
 * this routine has no use for the exit status of the release agent
 * task, so no sense holding our caller up for that.
 *
 * Locking: release_list_lock is held whenever the loop condition is
 * evaluated and dropped while an entry is processed; cgroup_mutex is
 * held except around the usermode helper call.
 */
static void cgroup_release_agent(struct work_struct *work)
{
	BUG_ON(work != &release_agent_work);
	mutex_lock(&cgroup_mutex);
	spin_lock(&release_list_lock);
	while (!list_empty(&release_list)) {
		char *argv[3], *envp[3];
		int i;
		char *pathbuf;
		struct cgroup *cgrp = list_entry(release_list.next,
						    struct cgroup,
						    release_list);

		list_del_init(&cgrp->release_list);
		spin_unlock(&release_list_lock);

		/* On any failure the entry is simply dropped from the list */
		pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!pathbuf) {
			spin_lock(&release_list_lock);
			continue;
		}

		if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) {
			kfree(pathbuf);
			spin_lock(&release_list_lock);
			continue;
		}

		i = 0;
		argv[i++] = cgrp->root->release_agent_path;
		argv[i++] = (char *)pathbuf;
		argv[i] = NULL;

		i = 0;
		/* minimal command environment */
		envp[i++] = "HOME=/";
		envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
		envp[i] = NULL;

		/* Drop the lock while we invoke the usermode helper,
		 * since the exec could involve hitting disk and hence
		 * be a slow process */
		mutex_unlock(&cgroup_mutex);
		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
		kfree(pathbuf);
		mutex_lock(&cgroup_mutex);
		spin_lock(&release_list_lock);
	}
	spin_unlock(&release_list_lock);
	mutex_unlock(&cgroup_mutex);
}