Newer
Older
LIST_HEAD(preloaded_csets);
struct cgrp_cset_link *link;
struct css_task_iter it;
struct task_struct *task;
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
down_read(&css_set_rwsem);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
up_read(&css_set_rwsem);
ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
if (ret)
goto out_err;
Tejun Heo
committed
/*
* Migrate tasks one-by-one until @form is empty. This fails iff
* ->can_attach() fails.
*/
do {
css_task_iter_start(&from->dummy_css, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
css_task_iter_end(&it);
if (task) {
ret = cgroup_migrate(to, task, false);
put_task_struct(task);
}
} while (task && !ret);
out_err:
cgroup_migrate_finish(&preloaded_csets);
return ret;
Tejun Heo
committed
}
Ben Blum
committed
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS,
CGROUP_FILE_TASKS,
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* for delayed destruction */
struct delayed_work destroy_dwork;
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
return vmalloc(count * sizeof(pid_t));
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
if (is_vmalloc_addr(p))
vfree(p);
else
kfree(p);
}
3104
3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
struct cgroup_pidlist *l, *tmp_l;
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
mutex_unlock(&cgrp->pidlist_mutex);
flush_workqueue(cgroup_pidlist_destroy_wq);
BUG_ON(!list_empty(&cgrp->pidlists));
}
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
destroy_dwork);
struct cgroup_pidlist *tofree = NULL;
mutex_lock(&l->owner->pidlist_mutex);
/*
* Destroy iff we didn't get queued again. The state won't change
* as destroy_dwork can only be queued while locked.
if (!delayed_work_pending(dwork)) {
list_del(&l->links);
pidlist_free(l->list);
put_pid_ns(l->key.ns);
tofree = l;
}
mutex_unlock(&l->owner->pidlist_mutex);
kfree(tofree);
}
Ben Blum
committed
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
static int pidlist_uniq(pid_t *list, int length)
Ben Blum
committed
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
int src, dest = 1;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after:
return dest;
}
3175
3176
3177
3178
3179
3180
3181
3182
3183
3184
3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
/*
* The two pid files - task and cgroup.procs - guaranteed that the result
* is sorted, which forced this whole pidlist fiasco. As pid order is
* different per namespace, each namespace needs differently sorted list,
* making it impossible to use, for example, single rbtree of member tasks
* sorted by task pointer. As pidlists can be fairly large, allocating one
* per open file is dangerous, so cgroup had to implement shared pool of
* pidlists keyed by cgroup and namespace.
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
* want to do away with it. Explicitly scramble sort order if
* sane_behavior so that no such expectation exists in the new interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
*/
static pid_t pid_fry(pid_t pid)
{
unsigned a = pid & 0x55555555;
unsigned b = pid & 0xAAAAAAAA;
return (a << 1) | (b >> 1);
}
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
if (cgroup_sane_behavior(cgrp))
return pid_fry(pid);
else
return pid;
}
Ben Blum
committed
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
static int fried_cmppid(const void *a, const void *b)
{
return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
}
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
lockdep_assert_held(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links)
if (l->key.type == type && l->key.ns == ns)
return l;
return NULL;
}
Ben Blum
committed
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
enum cgroup_filetype type)
Ben Blum
committed
{
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
l = cgroup_pidlist_find(cgrp, type);
if (l)
return l;
Ben Blum
committed
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
Ben Blum
committed
return l;
INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
Ben Blum
committed
l->key.type = type;
/* don't need task_nsproxy() if we're looking at ourself */
l->key.ns = get_pid_ns(task_active_pid_ns(current));
Ben Blum
committed
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
return l;
}
Ben Blum
committed
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
Ben Blum
committed
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
Ben Blum
committed
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct css_task_iter it;
struct task_struct *tsk;
Ben Blum
committed
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
Ben Blum
committed
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = pidlist_allocate(length);
Ben Blum
committed
if (!array)
return -ENOMEM;
/* now, populate the array */
css_task_iter_start(&cgrp->dummy_css, &it);
while ((tsk = css_task_iter_next(&it))) {
Ben Blum
committed
if (unlikely(n == length))
Ben Blum
committed
/* get tgid or pid for procs or tasks file respectively */
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
Ben Blum
committed
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
css_task_iter_end(&it);
Ben Blum
committed
length = n;
/* now sort & (if procs) strip out duplicates */
if (cgroup_sane_behavior(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
Ben Blum
committed
if (!l) {
pidlist_free(array);
Ben Blum
committed
return -ENOMEM;
Ben Blum
committed
}
/* store array, freeing old if necessary */
pidlist_free(l->list);
Ben Blum
committed
l->list = array;
l->length = length;
Ben Blum
committed
*lp = l;
Ben Blum
committed
return 0;
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
struct css_task_iter it;
/* it should be kernfs_node belonging to cgroupfs and is a directory */
if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_from_dir(),
* @kn->priv is RCU safe. Let's do the RCU dancing.
rcu_read_lock();
cgrp = rcu_dereference(kn->priv);
if (!cgrp || cgroup_is_dead(cgrp)) {
css_task_iter_start(&cgrp->dummy_css, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
css_task_iter_end(&it);
Ben Blum
committed
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
Ben Blum
committed
* in the cgroup->l->list array.
Ben Blum
committed
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
int *iter, ret;
mutex_lock(&cgrp->pidlist_mutex);
/*
* !NULL @of->priv indicates that this isn't the first start()
* after open. If the matching pidlist is around, we can use that.
* Look for it. Note that @of->priv can't be used directly. It
* could already have been destroyed.
*/
if (of->priv)
of->priv = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
if (!of->priv) {
ret = pidlist_array_load(cgrp, type,
(struct cgroup_pidlist **)&of->priv);
if (ret)
return ERR_PTR(ret);
}
if (pid) {
Ben Blum
committed
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
Ben Blum
committed
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
Ben Blum
committed
iter = l->list + index;
*pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
Ben Blum
committed
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
struct cgroup_pidlist *l = of->priv;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
Ben Blum
committed
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
struct cgroup_pidlist *l = of->priv;
Ben Blum
committed
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
return NULL;
} else {
*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
Ben Blum
committed
static int cgroup_pidlist_show(struct seq_file *s, void *v)
{
return seq_printf(s, "%d\n", *(int *)v);
}
Ben Blum
committed
/*
* seq_operations functions for iterating on pidlists through seq_file -
* independent of whether it's tasks or procs
*/
static const struct seq_operations cgroup_pidlist_seq_operations = {
.start = cgroup_pidlist_start,
.stop = cgroup_pidlist_stop,
.next = cgroup_pidlist_next,
.show = cgroup_pidlist_show,
};
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
return notify_on_release(css->cgroup);
}
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
static struct cftype cgroup_base_files[] = {
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.write_u64 = cgroup_procs_write,
.mode = S_IRUGO | S_IWUSR,
Ben Blum
committed
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_sane_behavior_show,
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
* don't exist if sane_behavior. If you're depending on these, be
* prepared to be burned.
*/
{
.name = "tasks",
.flags = CFTYPE_INSANE, /* use "procs" instead */
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.write_u64 = cgroup_tasks_write,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
.flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX - 1,
Tejun Heo
committed
* cgroup_populate_dir - create subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be added
*
* On failure, no file is added.
Tejun Heo
committed
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
Tejun Heo
committed
int i, ret = 0;
/* process cftsets of each subsystem */
Tejun Heo
committed
for_each_subsys(ss, i) {
Tejun Heo
committed
if (!test_bit(i, &subsys_mask))
list_for_each_entry(cfts, &ss->cfts, node) {
ret = cgroup_addrm_files(cgrp, cfts, true);
if (ret < 0)
goto err;
}
err:
cgroup_clear_dir(cgrp, subsys_mask);
return ret;
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
/*
* css destruction is four-stage process.
*
* 1. Destruction starts. Killing of the percpu_ref is initiated.
* Implemented in kill_css().
*
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
* and thus css_tryget() is guaranteed to fail, the css can be offlined
* by invoking offline_css(). After offlining, the base ref is put.
* Implemented in css_killed_work_fn().
*
* 3. When the percpu_ref reaches zero, the only possible remaining
* accessors are inside RCU read sections. css_release() schedules the
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
* css_free_work_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
* steps to the already complex sequence.
*/
static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
if (css->parent)
css_put(css->parent);
css->ss->css_free(css);
static void css_free_rcu_fn(struct rcu_head *rcu_head)
{
struct cgroup_subsys_state *css =
container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
INIT_WORK(&css->destroy_work, css_free_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
static void css_release(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
RCU_INIT_POINTER(css->cgroup->subsys[css->ss->id], NULL);
call_rcu(&css->rcu_head, css_free_rcu_fn);
static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
struct cgroup *cgrp)
css->cgroup = cgrp;
css->parent = cgroup_css(cgrp->parent, ss);
css->flags |= CSS_ROOT;
BUG_ON(cgroup_css(cgrp, ss));
/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
struct cgroup_subsys *ss = css->ss;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
Tejun Heo
committed
if (ss->css_online)
ret = ss->css_online(css);
css->flags |= CSS_ONLINE;
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
struct cgroup_subsys *ss = css->ss;
lockdep_assert_held(&cgroup_tree_mutex);
lockdep_assert_held(&cgroup_mutex);
if (!(css->flags & CSS_ONLINE))
return;
ss->css_offline(css);
css->flags &= ~CSS_ONLINE;
css->cgroup->nr_css--;
RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
/**
* create_css - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
* css is online and installed in @cgrp with all interface files created.
* Returns 0 on success, -errno on failure.
*/
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
struct cgroup *parent = cgrp->parent;
struct cgroup_subsys_state *css;
int err;
lockdep_assert_held(&cgroup_mutex);
css = ss->css_alloc(cgroup_css(parent, ss));
if (IS_ERR(css))
return PTR_ERR(css);
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
init_css(css, ss, cgrp);
err = cgroup_populate_dir(cgrp, 1 << ss->id);
err = online_css(css);
if (err)
css_get(css->parent);
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
ss->warned_broken_hierarchy = true;
}
return 0;
Linus Torvalds
committed
cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
percpu_ref_cancel_init(&css->refcnt);
ss->css_free(css);
return err;
}
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
static long cgroup_create(struct cgroup *parent, const char *name,
struct cgroup *cgrp;
struct cgroup_root *root = parent->root;
struct cgroup_subsys *ss;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
/*
* Only live parents can have children. Note that the liveliness
* check isn't strictly necessary because cgroup_mkdir() and
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
* anyway so that locking is contained inside cgroup proper and we
* don't get nasty surprises if we ever grow another caller.
*/
if (!cgroup_lock_live_group(parent)) {
err = -ENODEV;
}
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
if (cgrp->id < 0) {
err = -ENOMEM;
goto err_unlock;
init_cgroup_housekeeping(cgrp);
cgrp->parent = parent;
cgrp->dummy_css.parent = &parent->dummy_css;
cgrp->root = parent->root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
goto err_free_id;
* This extra ref will be put in cgroup_free_fn() and guarantees
* that @cgrp->kn is always accessible.
cgrp->serial_nr = cgroup_serial_nr_next++;
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
Tejun Heo
committed
atomic_inc(&root->nr_cgrps);
/*
* @cgrp is now fully operational. If something fails after this
* point, it'll be released via the normal destruction path.
*/
idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
err = cgroup_kn_set_ugid(kn);
if (err)
goto err_destroy;
err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
Tejun Heo
committed
if (err)
goto err_destroy;
/* let's create and online css's */
Tejun Heo
committed
if (parent->child_subsys_mask & (1 << ssid)) {
err = create_css(cgrp, ss);
if (err)
goto err_destroy;
}
Tejun Heo
committed
/*
* On the default hierarchy, a child doesn't automatically inherit
* child_subsys_mask from the parent. Each is configured manually.
*/
if (!cgroup_on_dfl(cgrp))
cgrp->child_subsys_mask = parent->child_subsys_mask;
Tejun Heo
committed
mutex_unlock(&cgroup_mutex);
idr_remove(&root->cgroup_idr, cgrp->id);
err_unlock:
mutex_unlock(&cgroup_mutex);
err_unlock_tree:
mutex_unlock(&cgroup_tree_mutex);
kfree(cgrp);
err_destroy:
cgroup_destroy_locked(cgrp);
mutex_unlock(&cgroup_mutex);
static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
umode_t mode)
int ret;
/*
* cgroup_create() grabs cgroup_tree_mutex which nests outside
* kernfs active_ref and cgroup_create() already synchronizes
* properly against removal through cgroup_lock_live_group().
* Break it before calling cgroup_create().
*/
cgroup_get(parent);
kernfs_break_active_protection(parent_kn);
ret = cgroup_create(parent, name, mode);
kernfs_unbreak_active_protection(parent_kn);
cgroup_put(parent);
return ret;
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget() is now guaranteed to fail.
*/
static void css_killed_work_fn(struct work_struct *work)
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
mutex_lock(&cgroup_mutex);
/*
* css_tryget() is guaranteed to fail now. Tell subsystems to
* initate destruction.
*/
offline_css(css);
/*
* If @cgrp is marked dead, it's waiting for refs of all css's to
* be disabled before proceeding to the second phase of cgroup
* destruction. If we are the last one, kick it off.
*/
if (!cgrp->nr_css && cgroup_is_dead(cgrp))
cgroup_destroy_css_killed(cgrp);
mutex_unlock(&cgroup_mutex);
/*
* Put the css refs from kill_css(). Each css holds an extra
* reference to the cgroup's dentry and cgroup removal proceeds
* regardless of css refs. On the last put of each css, whenever
* that may be, the extra dentry ref is put so that dentry
* destruction happens only after all css's are released.
*/
css_put(css);
/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_killed_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);
Tejun Heo
committed
/**
* kill_css - destroy a css
* @css: css to destroy
*
* This function initiates destruction of @css by removing cgroup interface
* files and putting its base reference. ->css_offline() will be invoked
* asynchronously once css_tryget() is guaranteed to fail and when the
* reference count reaches zero, @css will be released.
*/
static void kill_css(struct cgroup_subsys_state *css)