Newer
Older
Tejun Heo
committed
/* if first iteration, visit @root */
Tejun Heo
committed
return root;
/* visit the first child if exists */
Tejun Heo
committed
next = css_next_child(NULL, pos);
if (next)
return next;
/* no child, visit my or the closest ancestor's next sibling */
Tejun Heo
committed
while (pos != root) {
return NULL;
}
Tejun Heo
committed
* css_rightmost_descendant - return the rightmost descendant of a css
* @pos: css of interest
Tejun Heo
committed
* Return the rightmost descendant of @pos. If there's no descendant, @pos
* is returned. This can be used during pre-order traversal to skip
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct rightmost descendant as
* long as @pos is accessible.
Tejun Heo
committed
struct cgroup_subsys_state *
css_rightmost_descendant(struct cgroup_subsys_state *pos)
Tejun Heo
committed
struct cgroup_subsys_state *last, *tmp;
do {
last = pos;
/* ->prev isn't RCU safe, walk ->next till the end */
pos = NULL;
Tejun Heo
committed
css_for_each_child(tmp, last)
pos = tmp;
} while (pos);
return last;
}
Tejun Heo
committed
static struct cgroup_subsys_state *
css_leftmost_descendant(struct cgroup_subsys_state *pos)
Tejun Heo
committed
struct cgroup_subsys_state *last;
do {
last = pos;
Tejun Heo
committed
pos = css_next_child(NULL, pos);
} while (pos);
return last;
}
/**
Tejun Heo
committed
* css_next_descendant_post - find the next descendant for post-order walk
* @pos: the current position (%NULL to initiate traversal)
Tejun Heo
committed
* @root: css whose descendants to walk
Tejun Heo
committed
* To be used by css_for_each_descendant_post(). Find the next descendant
Tejun Heo
committed
* to visit for post-order traversal of @root's descendants. @root is
* included in the iteration and the last node to be visited.
* While this function requires cgroup_mutex or RCU read locking, it
* doesn't require the whole traversal to be contained in a single critical
* section. This function will return the correct next descendant as long
* as both @pos and @cgroup are accessible and @pos is a descendant of
* @cgroup.
*
* If a subsystem synchronizes ->css_online() and the start of iteration, a
* css which finished ->css_online() is guaranteed to be visible in the
* future iterations and will stay visible until the last reference is put.
* A css which hasn't finished ->css_online() or already finished
* ->css_offline() may show up during traversal. It's each subsystem's
* responsibility to synchronize against on/offlining.
Tejun Heo
committed
struct cgroup_subsys_state *
css_next_descendant_post(struct cgroup_subsys_state *pos,
struct cgroup_subsys_state *root)
Tejun Heo
committed
struct cgroup_subsys_state *next;
/* if first iteration, visit leftmost descendant which may be @root */
if (!pos)
return css_leftmost_descendant(root);
Tejun Heo
committed
/* if we visited @root, we're done */
if (pos == root)
return NULL;
/* if there's an unvisited sibling, visit its leftmost descendant */
Tejun Heo
committed
return css_leftmost_descendant(next);
/* no sibling left, visit parent */
/**
* css_has_online_children - does a css have online children
* @css: the target css
*
* Returns %true if @css has any online children; otherwise, %false. This
* function can be called from any context but the caller is responsible
* for synchronizing against on/offlining as necessary.
*/
bool css_has_online_children(struct cgroup_subsys_state *css)
{
struct cgroup_subsys_state *child;
bool ret = false;
rcu_read_lock();
css_for_each_child(child, css) {
if (child->flags & CSS_ONLINE) {
ret = true;
break;
}
}
rcu_read_unlock();
return ret;
* css_task_iter_advance_css_set - advance a task itererator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
static void css_task_iter_advance_css_set(struct css_task_iter *it)
struct cgrp_cset_link *link;
struct css_set *cset;
lockdep_assert_held(&css_set_lock);
/* Advance to the next non-empty css_set */
do {
l = l->next;
if (l == it->cset_head) {
it->cset_pos = NULL;
if (it->ss) {
cset = container_of(l, struct css_set,
e_cset_node[it->ss->id]);
} else {
link = list_entry(l, struct cgrp_cset_link, cset_link);
cset = link->cset;
}
} while (!css_set_populated(cset));
it->task_pos = cset->mg_tasks.next;
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
/*
* We don't keep css_sets locked across iteration steps and thus
* need to take steps to ensure that iteration can be resumed after
* the lock is re-acquired. Iteration is performed at two levels -
* css_sets and tasks in them.
*
* Once created, a css_set never leaves its cgroup lists, so a
* pinned css_set is guaranteed to stay put and we can resume
* iteration afterwards.
*
* Tasks may leave @cset across iteration steps. This is resolved
* by registering each iterator with the css_set currently being
* walked and making css_set_move_task() advance iterators whose
* next task is leaving.
*/
if (it->cur_cset) {
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
}
get_css_set(cset);
it->cur_cset = cset;
list_add(&it->iters_node, &cset->task_iters);
static void css_task_iter_advance(struct css_task_iter *it)
{
struct list_head *l = it->task_pos;
lockdep_assert_held(&css_set_lock);
WARN_ON_ONCE(!l);
/*
* Advance iterator to find next entry. cset->tasks is consumed
* first and then ->mg_tasks. After ->mg_tasks, we move onto the
* next cset.
*/
l = l->next;
if (l == it->tasks_head)
l = it->mg_tasks_head->next;
if (l == it->mg_tasks_head)
css_task_iter_advance_css_set(it);
else
it->task_pos = l;
}
* css_task_iter_start - initiate task iteration
* @css: the css to walk tasks of
* @it: the task iterator to use
*
* Initiate iteration through the tasks of @css. The caller can call
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
/* no one should try to iterate before mounting cgroups */
WARN_ON_ONCE(!use_task_css_set_links);
memset(it, 0, sizeof(*it));
spin_lock_bh(&css_set_lock);
it->ss = css->ss;
if (it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
else
it->cset_pos = &css->cgroup->cset_links;
css_task_iter_advance_css_set(it);
spin_unlock_bh(&css_set_lock);
* css_task_iter_next - return the next task for the iterator
* @it: the task iterator being iterated
*
* The "next" function for task iteration. @it should have been
* initialized via css_task_iter_start(). Returns NULL when the iteration
* reaches the end.
struct task_struct *css_task_iter_next(struct css_task_iter *it)
if (it->cur_task) {
put_task_struct(it->cur_task);
it->cur_task = NULL;
}
spin_lock_bh(&css_set_lock);
if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
cg_list);
get_task_struct(it->cur_task);
css_task_iter_advance(it);
}
spin_unlock_bh(&css_set_lock);
return it->cur_task;
* css_task_iter_end - finish task iteration
* @it: the task iterator to finish
*
* Finish task iteration started by css_task_iter_start().
void css_task_iter_end(struct css_task_iter *it)
if (it->cur_cset) {
spin_lock_bh(&css_set_lock);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
spin_unlock_bh(&css_set_lock);
}
if (it->cur_task)
put_task_struct(it->cur_task);
Tejun Heo
committed
* cgroup_trasnsfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
* Locking rules between cgroup_post_fork() and the migration path
* guarantee that, if a task is forking while being migrated, the new child
* is guaranteed to be either visible in the source cgroup after the
* parent's migration is complete or put into the target cgroup. No task
* can slip out of migration through forking.
Tejun Heo
committed
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
LIST_HEAD(preloaded_csets);
struct cgrp_cset_link *link;
struct css_task_iter it;
struct task_struct *task;
Tejun Heo
committed
if (!cgroup_may_migrate_to(to))
return -EBUSY;
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
spin_lock_bh(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
spin_unlock_bh(&css_set_lock);
ret = cgroup_migrate_prepare_dst(&preloaded_csets);
if (ret)
goto out_err;
Tejun Heo
committed
* Migrate tasks one-by-one until @from is empty. This fails iff
* ->can_attach() fails.
*/
do {
css_task_iter_start(&from->self, &it);
task = css_task_iter_next(&it);
if (task)
get_task_struct(task);
css_task_iter_end(&it);
if (task) {
ret = cgroup_migrate(task, false, to->root);
put_task_struct(task);
}
} while (task && !ret);
out_err:
cgroup_migrate_finish(&preloaded_csets);
return ret;
Tejun Heo
committed
}
Ben Blum
committed
* Stuff for reading the 'tasks'/'procs' files.
*
* Reading this file can return large amounts of data if a cgroup has
* *lots* of attached tasks. So it may need several calls to read(),
* but we cannot guarantee that the information we produce is correct
* unless we produce it entirely atomically.
*
*/
4376
4377
4378
4379
4380
4381
4382
4383
4384
4385
4386
4387
4388
4389
4390
4391
4392
4393
4394
4395
4396
4397
4398
4399
4400
4401
/* which pidlist file are we talking about? */
enum cgroup_filetype {
CGROUP_FILE_PROCS,
CGROUP_FILE_TASKS,
};
/*
* A pidlist is a list of pids that virtually represents the contents of one
* of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists,
* a pair (one each for procs, tasks) for each pid namespace that's relevant
* to the cgroup.
*/
struct cgroup_pidlist {
/*
* used to find which pidlist is wanted. doesn't change as long as
* this particular list stays in the list.
*/
struct { enum cgroup_filetype type; struct pid_namespace *ns; } key;
/* array of xids */
pid_t *list;
/* how many elements the above list has */
int length;
/* each of these stored in a list by its cgroup */
struct list_head links;
/* pointer to the cgroup we belong to, for list removal purposes */
struct cgroup *owner;
/* for delayed destruction */
struct delayed_work destroy_dwork;
/*
* The following two functions "fix" the issue where there are more pids
* than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
* TODO: replace with a kernel-wide solution to this problem
*/
#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
static void *pidlist_allocate(int count)
{
if (PIDLIST_TOO_LARGE(count))
return vmalloc(count * sizeof(pid_t));
else
return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
}
static void pidlist_free(void *p)
{
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
/*
* Used to destroy all pidlists lingering waiting for destroy timer. None
* should be left afterwards.
*/
static void cgroup_pidlist_destroy_all(struct cgroup *cgrp)
{
struct cgroup_pidlist *l, *tmp_l;
mutex_lock(&cgrp->pidlist_mutex);
list_for_each_entry_safe(l, tmp_l, &cgrp->pidlists, links)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, 0);
mutex_unlock(&cgrp->pidlist_mutex);
flush_workqueue(cgroup_pidlist_destroy_wq);
BUG_ON(!list_empty(&cgrp->pidlists));
}
static void cgroup_pidlist_destroy_work_fn(struct work_struct *work)
{
struct delayed_work *dwork = to_delayed_work(work);
struct cgroup_pidlist *l = container_of(dwork, struct cgroup_pidlist,
destroy_dwork);
struct cgroup_pidlist *tofree = NULL;
mutex_lock(&l->owner->pidlist_mutex);
/*
* Destroy iff we didn't get queued again. The state won't change
* as destroy_dwork can only be queued while locked.
if (!delayed_work_pending(dwork)) {
list_del(&l->links);
pidlist_free(l->list);
put_pid_ns(l->key.ns);
tofree = l;
}
mutex_unlock(&l->owner->pidlist_mutex);
kfree(tofree);
}
Ben Blum
committed
* pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
* Returns the number of unique elements.
static int pidlist_uniq(pid_t *list, int length)
Ben Blum
committed
4472
4473
4474
4475
4476
4477
4478
4479
4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
int src, dest = 1;
/*
* we presume the 0th element is unique, so i starts at 1. trivial
* edge cases first; no work needs to be done for either
*/
if (length == 0 || length == 1)
return length;
/* src and dest walk down the list; dest counts unique elements */
for (src = 1; src < length; src++) {
/* find next unique element */
while (list[src] == list[src-1]) {
src++;
if (src == length)
goto after;
}
/* dest always points to where the next unique element goes */
list[dest] = list[src];
dest++;
}
after:
return dest;
}
/*
* The two pid files - task and cgroup.procs - guaranteed that the result
* is sorted, which forced this whole pidlist fiasco. As pid order is
* different per namespace, each namespace needs differently sorted list,
* making it impossible to use, for example, single rbtree of member tasks
* sorted by task pointer. As pidlists can be fairly large, allocating one
* per open file is dangerous, so cgroup had to implement shared pool of
* pidlists keyed by cgroup and namespace.
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
* want to do away with it. Explicitly scramble sort order if on the
* default hierarchy so that no such expectation exists in the new
* interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
*/
static pid_t pid_fry(pid_t pid)
{
unsigned a = pid & 0x55555555;
unsigned b = pid & 0xAAAAAAAA;
return (a << 1) | (b >> 1);
}
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
if (cgroup_on_dfl(cgrp))
return pid_fry(pid);
else
return pid;
}
Ben Blum
committed
static int cmppid(const void *a, const void *b)
{
return *(pid_t *)a - *(pid_t *)b;
}
static int fried_cmppid(const void *a, const void *b)
{
return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b);
}
static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
enum cgroup_filetype type)
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
struct pid_namespace *ns = task_active_pid_ns(current);
lockdep_assert_held(&cgrp->pidlist_mutex);
list_for_each_entry(l, &cgrp->pidlists, links)
if (l->key.type == type && l->key.ns == ns)
return l;
return NULL;
}
Ben Blum
committed
/*
* find the appropriate pidlist for our purpose (given procs vs tasks)
* returns with the lock on that pidlist already held, and takes care
* of the use count, or returns NULL with no locks held if we're out of
* memory.
*/
static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
enum cgroup_filetype type)
Ben Blum
committed
{
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
l = cgroup_pidlist_find(cgrp, type);
if (l)
return l;
Ben Blum
committed
/* entry not found; create a new one */
l = kzalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
Ben Blum
committed
return l;
INIT_DELAYED_WORK(&l->destroy_dwork, cgroup_pidlist_destroy_work_fn);
Ben Blum
committed
l->key.type = type;
/* don't need task_nsproxy() if we're looking at ourself */
l->key.ns = get_pid_ns(task_active_pid_ns(current));
Ben Blum
committed
l->owner = cgrp;
list_add(&l->links, &cgrp->pidlists);
return l;
}
Ben Blum
committed
/*
* Load a cgroup's pidarray with either procs' tgids or tasks' pids
*/
Ben Blum
committed
static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
struct cgroup_pidlist **lp)
Ben Blum
committed
{
pid_t *array;
int length;
int pid, n = 0; /* used for populating the array */
struct css_task_iter it;
struct task_struct *tsk;
Ben Blum
committed
struct cgroup_pidlist *l;
lockdep_assert_held(&cgrp->pidlist_mutex);
Ben Blum
committed
/*
* If cgroup gets more users after we read count, we won't have
* enough space - tough. This race is indistinguishable to the
* caller from the case that the additional cgroup users didn't
* show up until sometime later on.
*/
length = cgroup_task_count(cgrp);
array = pidlist_allocate(length);
Ben Blum
committed
if (!array)
return -ENOMEM;
/* now, populate the array */
css_task_iter_start(&cgrp->self, &it);
while ((tsk = css_task_iter_next(&it))) {
Ben Blum
committed
if (unlikely(n == length))
Ben Blum
committed
/* get tgid or pid for procs or tasks file respectively */
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
else
pid = task_pid_vnr(tsk);
Ben Blum
committed
if (pid > 0) /* make sure to only use valid results */
array[n++] = pid;
css_task_iter_end(&it);
Ben Blum
committed
length = n;
/* now sort & (if procs) strip out duplicates */
if (cgroup_on_dfl(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
Ben Blum
committed
if (type == CGROUP_FILE_PROCS)
length = pidlist_uniq(array, length);
l = cgroup_pidlist_find_create(cgrp, type);
Ben Blum
committed
if (!l) {
pidlist_free(array);
Ben Blum
committed
return -ENOMEM;
Ben Blum
committed
}
/* store array, freeing old if necessary */
pidlist_free(l->list);
Ben Blum
committed
l->list = array;
l->length = length;
Ben Blum
committed
*lp = l;
Ben Blum
committed
return 0;
* @stats: cgroupstats to fill information into
* @dentry: A dentry entry belonging to the cgroup for which stats have
* been requested.
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct cgroup *cgrp;
struct css_task_iter it;
/* it should be kernfs_node belonging to cgroupfs and is a directory */
if (dentry->d_sb->s_type != &cgroup_fs_type || !kn ||
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
* @kn->priv is RCU safe. Let's do the RCU dancing.
rcu_read_lock();
cgrp = rcu_dereference(kn->priv);
if (!cgrp || cgroup_is_dead(cgrp)) {
css_task_iter_start(&cgrp->self, &it);
while ((tsk = css_task_iter_next(&it))) {
switch (tsk->state) {
case TASK_RUNNING:
stats->nr_running++;
break;
case TASK_INTERRUPTIBLE:
stats->nr_sleeping++;
break;
case TASK_UNINTERRUPTIBLE:
stats->nr_uninterruptible++;
break;
case TASK_STOPPED:
stats->nr_stopped++;
break;
default:
if (delayacct_is_task_waiting_on_io(tsk))
stats->nr_io_wait++;
break;
}
}
css_task_iter_end(&it);
Ben Blum
committed
* seq_file methods for the tasks/procs files. The seq_file position is the
* next pid to display; the seq_file iterator is a pointer to the pid
Ben Blum
committed
* in the cgroup->l->list array.
Ben Blum
committed
static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
/*
* Initially we receive a position value that corresponds to
* one more than the last pid shown (or 0 on the first call or
* after a seek to the start). Use a binary-search to find the
* next pid to display, if any
*/
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
int index = 0, pid = *pos;
int *iter, ret;
mutex_lock(&cgrp->pidlist_mutex);
/*
* !NULL @of->priv indicates that this isn't the first start()
* after open. If the matching pidlist is around, we can use that.
* Look for it. Note that @of->priv can't be used directly. It
* could already have been destroyed.
*/
if (of->priv)
of->priv = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
if (!of->priv) {
ret = pidlist_array_load(cgrp, type,
(struct cgroup_pidlist **)&of->priv);
if (ret)
return ERR_PTR(ret);
}
if (pid) {
Ben Blum
committed
int end = l->length;
while (index < end) {
int mid = (index + end) / 2;
if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) {
index = mid;
break;
} else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid)
index = mid + 1;
else
end = mid;
}
}
/* If we're off the end of the array, we're done */
Ben Blum
committed
if (index >= l->length)
return NULL;
/* Update the abstract position to be the actual pid that we found */
Ben Blum
committed
iter = l->list + index;
*pos = cgroup_pid_fry(cgrp, *iter);
return iter;
}
Ben Blum
committed
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
struct cgroup_pidlist *l = of->priv;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
mutex_unlock(&seq_css(s)->cgroup->pidlist_mutex);
}
Ben Blum
committed
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
struct cgroup_pidlist *l = of->priv;
Ben Blum
committed
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
* Advance to the next pid in the array. If this goes off the
* end, we're done
*/
p++;
if (p >= end) {
return NULL;
} else {
*pos = cgroup_pid_fry(seq_css(s)->cgroup, *p);
return p;
}
}
Ben Blum
committed
static int cgroup_pidlist_show(struct seq_file *s, void *v)
seq_printf(s, "%d\n", *(int *)v);
return 0;
static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft)
return notify_on_release(css->cgroup);
}
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
return 0;
}
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
/* cgroup core interface files for the default hierarchy */
static struct cftype cgroup_dfl_base_files[] = {
.name = "cgroup.procs",
.file_offset = offsetof(struct cgroup, procs_file),
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.write = cgroup_procs_write,
Ben Blum
committed
},
Tejun Heo
committed
{
.name = "cgroup.controllers",
.seq_show = cgroup_controllers_show,
},
{
.name = "cgroup.subtree_control",
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
Tejun Heo
committed
},
.name = "cgroup.events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct cgroup, events_file),
.seq_show = cgroup_events_show,
{ } /* terminate */
};
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
/* cgroup core interface files for the legacy hierarchies */
static struct cftype cgroup_legacy_base_files[] = {
{
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
.write = cgroup_procs_write,
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_sane_behavior_show,
},
{
.name = "tasks",
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.write = cgroup_tasks_write,
},
{
.name = "notify_on_release",
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
.flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
.write = cgroup_release_agent_write,
.max_write_len = PATH_MAX - 1,
/*
* css destruction is four-stage process.
*
* 1. Destruction starts. Killing of the percpu_ref is initiated.
* Implemented in kill_css().
*
* 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
* and thus css_tryget_online() is guaranteed to fail, the css can be
* offlined by invoking offline_css(). After offlining, the base ref is
* put. Implemented in css_killed_work_fn().
*
* 3. When the percpu_ref reaches zero, the only possible remaining
* accessors are inside RCU read sections. css_release() schedules the
* RCU callback.
*
* 4. After the grace period, the css can be freed. Implemented in
* css_free_work_fn().
*
* It is actually hairier because both step 2 and 4 require process context
* and thus involve punting to css->destroy_work adding two additional
* steps to the already complex sequence.
*/
static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
percpu_ref_exit(&css->refcnt);
struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
if (parent)
css_put(parent);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
cancel_work_sync(&cgrp->release_agent_work);
/*
* We get a ref to the parent, and put the ref when
* this cgroup is being freed, so it's guaranteed
* that the parent won't be destroyed before its
* children.
*/
kernfs_put(cgrp->kn);
kfree(cgrp);
} else {
/*
* This is root cgroup's refcnt reaching zero,
* which indicates that the root should be
* released.
*/
cgroup_destroy_root(cgrp->root);
}
}
static void css_free_rcu_fn(struct rcu_head *rcu_head)
{
struct cgroup_subsys_state *css =
container_of(rcu_head, struct cgroup_subsys_state, rcu_head);
INIT_WORK(&css->destroy_work, css_free_work_fn);
queue_work(cgroup_destroy_wq, &css->destroy_work);