Newer
Older
/*
* Generic process-grouping system.
*
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
* Notifications support
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
*
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* ---------------------------------------------------
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include "cgroup-internal.h"
#include <linux/cred.h>
#include <linux/init_task.h>
#include <linux/magic.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
Ingo Molnar
committed
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/percpu-rwsem.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/proc_ns.h>
#include <linux/nsproxy.h>
#define CREATE_TRACE_POINTS
#include <trace/events/cgroup.h>
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
* css_set_lock protects task->cgroups pointer, the list of css_set
* objects, and the chain of tasks off each css_set.
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
* cgroup.h can use them for lockdep annotations.
DEFINE_SPINLOCK(css_set_lock);
#ifdef CONFIG_PROVE_RCU
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
* Protects cgroup_idr and css_idr so that IDs can be released without
* grabbing cgroup_mutex.
*/
static DEFINE_SPINLOCK(cgroup_idr_lock);
/*
* Protects cgroup_file->kn for !self csses. It synchronizes notifications
* against file removal/re-creation across css hiding.
*/
static DEFINE_SPINLOCK(cgroup_file_kn_lock);
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&cgroup_mutex), \
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_destroy_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
Tejun Heo
committed
/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x) \
DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
/*
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
/* some controllers are not supported in the default hierarchy */
/* some controllers are implicitly enabled on the default hierarchy */
/* The list of hierarchy roots */
LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
* Assign a monotonically increasing serial number to csses. It guarantees
* cgroups with bigger numbers are newer than those with smaller numbers.
* Also, as csses are always appended to the parent's ->children list, it
* guarantees that sibling csses are always sorted in the ascending serial
* number order on the list. Protected by cgroup_mutex.
static u64 css_serial_nr_next = 1;
* These bitmasks identify subsystems with specific features to avoid
* having to do iterative checks repeatedly.
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
static u16 have_free_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
.count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
.root_cset = &init_css_set,
};
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
Tejun Heo
committed
Tejun Heo
committed
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
struct cgroup_subsys *ss);
static void css_release(struct percpu_ref *ref);
Tejun Heo
committed
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
*
* cgroup_subsys_enabled() can only be used with literal subsys names which
* is fine for individual subsystems but unsuitable for cgroup core. This
* is slower static_key_enabled() based test indexed by @ssid.
*/
bool cgroup_ssid_enabled(int ssid)
{
if (CGROUP_SUBSYS_COUNT == 0)
return false;
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
Tejun Heo
committed
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
*
* The default hierarchy is the v2 interface of cgroup and this function
* can be used to test whether a cgroup is on the default hierarchy for
* cases where a subsystem should behave differnetly depending on the
* interface version.
*
* The set of behaviors which change on the default hierarchy are still
* being determined and the mount option is prefixed with __DEVEL__.
*
* List of changed behaviors:
*
* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
* and "name" are disallowed.
*
* - When mounting an existing superblock, mount options should match.
*
* - Remount is disallowed.
*
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
* "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they got
* recycled inbetween reads.
*
* - "release_agent" and "notify_on_release" are removed. Replacement
* notification mechanism will be implemented.
*
* - "cgroup.clone_children" is removed.
*
* - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
* and its descendants contain no task; otherwise, 1. The file also
* generates kernfs notification which can be monitored through poll and
* [di]notify when the value of the file changes.
*
* - cpuset: tasks will be kept in empty cpusets when hotplug happens and
* take masks of ancestors with non-empty cpus/mems, instead of being
* moved to an ancestor.
*
* - cpuset: a task can be moved into an empty cpuset, and again it takes
* masks of ancestors.
*
* - memcg: use_hierarchy is on by default and the cgroup file for the flag
* is not created.
*
* - blkcg: blk-throttle becomes properly hierarchical.
*
* - debug: disallowed on the default hierarchy.
*/
bool cgroup_on_dfl(const struct cgroup *cgrp)
Tejun Heo
committed
{
return cgrp->root == &cgrp_dfl_root;
}
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
{
int ret;
idr_preload(gfp_mask);
Mel Gorman
committed
ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
idr_preload_end();
return ret;
}
static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
void *ret;
ret = idr_replace(idr, ptr, id);
return ret;
}
static void cgroup_idr_remove(struct idr *idr, int id)
{
idr_remove(idr, id);
static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
struct cgroup_subsys_state *parent_css = cgrp->self.parent;
if (parent_css)
return container_of(parent_css, struct cgroup, self);
return NULL;
}
/* subsystems visibly enabled on a cgroup */
static u16 cgroup_control(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
u16 root_ss_mask = cgrp->root->subsys_mask;
if (parent)
return parent->subtree_control;
if (cgroup_on_dfl(cgrp))
root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
cgrp_dfl_implicit_ss_mask);
return root_ss_mask;
}
/* subsystems enabled on a cgroup */
static u16 cgroup_ss_mask(struct cgroup *cgrp)
{
struct cgroup *parent = cgroup_parent(cgrp);
if (parent)
return parent->subtree_ss_mask;
return cgrp->root->subsys_mask;
}
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
* function must be called either under cgroup_mutex or rcu_read_lock() and
* the caller is responsible for pinning the returned css if it wants to
* keep accessing it outside the said locks. This function may return
* %NULL if @cgrp doesn't have @subsys_id enabled.
*/
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
if (ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
else
return &cgrp->self;
/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
* Similar to cgroup_css() but returns the effective css, which is defined
* as the matching css of the nearest ancestor including self which has @ss
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
*/
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
if (!ss)
return &cgrp->self;
/*
* This function is used while updating css associations and thus
* can't test the csses directly. Test ss_mask.
while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
if (!cgrp)
return NULL;
}
return cgroup_css(cgrp, ss);
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
/**
* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get the effective css of @cgrp for @ss. The effective css is
* defined as the matching css of the nearest ancestor including self which
* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
* the root css is returned, so this function always returns a valid css.
* The returned css must be put using css_put().
*/
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
do {
css = cgroup_css(cgrp, ss);
if (css && css_tryget_online(css))
goto out_unlock;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
css = init_css_set.subsys[ss->id];
css_get(css);
out_unlock:
rcu_read_unlock();
return css;
}
static void __maybe_unused cgroup_get(struct cgroup *cgrp)
{
css_get(&cgrp->self);
}
static void cgroup_get_live(struct cgroup *cgrp)
{
WARN_ON_ONCE(cgroup_is_dead(cgrp));
css_get(&cgrp->self);
}
static bool cgroup_tryget(struct cgroup *cgrp)
{
return css_tryget(&cgrp->self);
}
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
/*
* This is open and unprotected implementation of cgroup_css().
* seq_css() is only called from a kernfs file operation which has
* an active reference on the file. Because all the subsystem
* files are drained before a css is disassociated with a cgroup,
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
if (cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \
lockdep_is_held(&cgroup_mutex)))) { } \
else
/**
* for_each_e_css - iterate all effective css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
*/
#define for_each_e_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
; \
else
* do_each_subsys_mask - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
* @ss_mask: the bitmask
*
* The block will only run for cases where the ssid-th bit (1 << ssid) of
* @ss_mask is set.
#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
unsigned long __ss_mask = (ss_mask); \
if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
(ssid) = 0; \
break; \
} \
for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \
(ss) = cgroup_subsys[ssid]; \
{
#define while_each_subsys_mask() \
} \
} \
} while (false)
Tejun Heo
committed
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp) \
list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
if (({ lockdep_assert_held(&cgroup_mutex); \
Tejun Heo
committed
cgroup_is_dead(child); })) \
; \
else
Tejun Heo
committed
/* walk live descendants in preorder */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
(dsct) = (d_css)->cgroup; \
cgroup_is_dead(dsct); })) \
; \
else
/* walk live descendants in postorder */
#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \
css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
(dsct) = (d_css)->cgroup; \
cgroup_is_dead(dsct); })) \
; \
else
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
.refcount = REFCOUNT_INIT(1),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
};
static int css_set_count = 1; /* 1 for init_css_set */
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
*
* css_set_populated() should be the same as !!cset->nr_tasks at steady
* state. However, css_set_populated() can be called while a task is being
* added to or removed from the linked list before the nr_tasks is
* properly updated. Hence, we can't just look at ->nr_tasks here.
*/
static bool css_set_populated(struct css_set *cset)
{
lockdep_assert_held(&css_set_lock);
return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
}
/**
* cgroup_update_populated - updated populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* One of the css_sets associated with @cgrp is either getting its first
* task or losing the last. Update @cgrp->populated_cnt accordingly. The
* count is propagated towards root so that a given cgroup's populated_cnt
* is zero iff the cgroup and all its descendants don't contain any tasks.
*
* @cgrp's interface file "cgroup.populated" is zero if
* @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
* changes from or to zero, userland is notified that the content of the
* interface file has changed. This can be used to detect when @cgrp and
* its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
lockdep_assert_held(&css_set_lock);
do {
bool trigger;
if (populated)
trigger = !cgrp->populated_cnt++;
else
trigger = !--cgrp->populated_cnt;
if (!trigger)
break;
cgroup_file_notify(&cgrp->events_file);
} while (cgrp);
}
/**
* css_set_update_populated - update populated state of a css_set
* @cset: target css_set
* @populated: whether @cset is populated or depopulated
*
* @cset is either getting the first task or losing the last. Update the
* ->populated_cnt of all associated cgroups accordingly.
*/
static void css_set_update_populated(struct css_set *cset, bool populated)
{
struct cgrp_cset_link *link;
lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
cgroup_update_populated(link->cgrp, populated);
}
/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
* @from_cset: css_set @task currently belongs to (may be NULL)
* @to_cset: new css_set @task is being moved to (may be NULL)
* @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
*
* Move @task from @from_cset to @to_cset. If @task didn't belong to any
* css_set, @from_cset can be NULL. If @task is being disassociated
* instead of moved, @to_cset can be NULL.
*
* This function automatically handles populated_cnt updates and
* css_task_iter adjustments but the caller is responsible for managing
* @from_cset and @to_cset's reference counts.
*/
static void css_set_move_task(struct task_struct *task,
struct css_set *from_cset, struct css_set *to_cset,
bool use_mg_tasks)
{
lockdep_assert_held(&css_set_lock);
if (to_cset && !css_set_populated(to_cset))
css_set_update_populated(to_cset, true);
struct css_task_iter *it, *pos;
WARN_ON_ONCE(list_empty(&task->cg_list));
/*
* @task is leaving, advance task iterators which are
* pointing to it so that they can resume at the next
* position. Advancing an iterator might remove it from
* the list, use safe walk. See css_task_iter_advance*()
* for details.
*/
list_for_each_entry_safe(it, pos, &from_cset->task_iters,
iters_node)
if (it->task_pos == &task->cg_list)
css_task_iter_advance(it);
list_del_init(&task->cg_list);
if (!css_set_populated(from_cset))
css_set_update_populated(from_cset, false);
} else {
WARN_ON_ONCE(!list_empty(&task->cg_list));
}
if (to_cset) {
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
* against cgroup_exit() changing the css_set to
* init_css_set and dropping the old one.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
rcu_assign_pointer(task->cgroups, to_cset);
list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
&to_cset->tasks);
}
}
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
* account cgroups in empty hierarchies.
*/
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
struct cgroup_subsys *ss;
int i;
key += (unsigned long)css[i];
key = (key >> 16) ^ key;
void put_css_set_locked(struct css_set *cset)
struct cgrp_cset_link *link, *tmp_link;
lockdep_assert_held(&css_set_lock);
if (!refcount_dec_and_test(&cset->refcount))
return;
/* This css_set is dead. unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
css_put(cset->subsys[ssid]);
}
hash_del(&cset->hlist);
css_set_count--;
list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
if (cgroup_parent(link->cgrp))
cgroup_put(link->cgrp);
kfree(link);
kfree_rcu(cset, rcu_head);
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
* @old_cset: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
* Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
static bool compare_css_sets(struct css_set *cset,
struct css_set *old_cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct list_head *l1, *l2;
/*
* On the default hierarchy, there can be csets which are
* associated with the same set of cgroups but different csses.
* Let's first ensure that csses match.
*/
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
* share the same effective css, this comparison is always
* necessary.
*/
l1 = &cset->cgrp_links;
l2 = &old_cset->cgrp_links;
while (1) {
struct cgrp_cset_link *link1, *link2;
struct cgroup *cgrp1, *cgrp2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
if (l1 == &cset->cgrp_links) {
BUG_ON(l2 != &old_cset->cgrp_links);
break;
} else {
BUG_ON(l2 == &old_cset->cgrp_links);
}
/* Locate the cgroups associated with these links. */
link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
cgrp1 = link1->cgrp;
cgrp2 = link2->cgrp;
/* Hierarchies should be linked in the same order. */
BUG_ON(cgrp1->root != cgrp2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
* that's changing, then we need to check that this
* css_set points to the new cgroup; if it's any other
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
if (cgrp1->root == new_cgrp->root) {
if (cgrp1 != new_cgrp)
return false;
} else {
if (cgrp1 != cgrp2)
return false;
}
}
return true;
}
/**
* find_existing_css_set - init css array and find the matching css_set
* @old_cset: the css_set that we're using before the cgroup transition
* @cgrp: the cgroup that we're moving into
* @template: out param for the new set of csses, should be clear on entry
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
struct cgroup_root *root = cgrp->root;
struct css_set *cset;
/*
* Build the set of subsystem state objects that we want to see in the
* new css_set. while subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
Tejun Heo
committed
if (root->subsys_mask & (1UL << i)) {
/*
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
*/
template[i] = cgroup_e_css(cgrp, ss);
/*
* @ss is not in this hierarchy, so we don't want
* to change the css.
*/
template[i] = old_cset->subsys[i];
hash_for_each_possible(css_set_table, cset, hlist, key) {
if (!compare_css_sets(cset, old_cset, cgrp, template))
continue;
/* This css_set matches what we need */
/* No existing cgroup group matched */
return NULL;
}
static void free_cgrp_cset_links(struct list_head *links_to_free)
struct cgrp_cset_link *link, *tmp_link;
list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
list_del(&link->cset_link);
kfree(link);
}
}
/**
* allocate_cgrp_cset_links - allocate cgrp_cset_links
* @count: the number of links to allocate
* @tmp_links: list_head the allocated links are put on
*
* Allocate @count cgrp_cset_link structures and chain them on @tmp_links
* through ->cset_link. Returns 0 on success or -errno.
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
struct cgrp_cset_link *link;
INIT_LIST_HEAD(tmp_links);
for (i = 0; i < count; i++) {
link = kzalloc(sizeof(*link), GFP_KERNEL);
free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
* @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgroup *cgrp)
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
if (cgroup_on_dfl(cgrp))
cset->dfl_cgrp = cgrp;
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
/*
* Always add links to the tail of the lists so that the lists are
* in choronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
if (cgroup_parent(cgrp))
cgroup_get_live(cgrp);
/**
* find_css_set - return a new css_set with one cgroup updated
* @old_cset: the baseline css_set
* @cgrp: the cgroup to be updated
*
* Return a new css_set that's equivalent to @old_cset, but with @cgrp
* substituted into the appropriate hierarchy.
static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup *cgrp)
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
spin_lock_irq(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (cset)
return cset;
cset = kzalloc(sizeof(*cset), GFP_KERNEL);
return NULL;
/* Allocate all the cgrp_cset_link objects that we'll need */
if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
return NULL;
}
refcount_set(&cset->refcount, 1);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->task_iters);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->mg_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */