Newer
Older
/*
* Generic process-grouping system.
*
* Based originally on the cpuset system, extracted by Paul Menage
* Copyright (C) 2006 Google, Inc
*
* Notifications support
* Copyright (C) 2009 Nokia Corporation
* Author: Kirill A. Shutemov
*
* Copyright notices from the original cpuset code:
* --------------------------------------------------
* Copyright (C) 2003 BULL SA.
* Copyright (C) 2004-2006 Silicon Graphics, Inc.
*
* Portions derived from Patrick Mochel's sysfs code.
* sysfs is Copyright (c) 2001-3 Patrick Mochel
*
* 2003-10-10 Written by Simon Derr.
* 2003-10-22 Updates by Stephen Hemminger.
* 2004 May-July Rework by Paul Jackson.
* ---------------------------------------------------
*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cred.h>
#include <linux/init_task.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/magic.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/rwsem.h>
#include <linux/percpu-rwsem.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/kthread.h>
/*
* pidlists linger the following amount before being destroyed. The goal
* is avoiding frequent destruction in the middle of consecutive read calls
* Expiring in the middle is a performance problem not a correctness one.
* 1 sec should be enough.
*/
#define CGROUP_PIDLIST_DESTROY_DELAY HZ
#define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \
MAX_CFTYPE_NAME + 2)
/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
* css_set_rwsem protects task->cgroups pointer, the list of css_set
* objects, and the chain of tasks off each css_set.
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
* cgroup.h can use them for lockdep annotations.
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
DECLARE_RWSEM(css_set_rwsem);
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_rwsem);
static DEFINE_MUTEX(cgroup_mutex);
static DECLARE_RWSEM(css_set_rwsem);
* Protects cgroup_idr and css_idr so that IDs can be released without
* grabbing cgroup_mutex.
*/
static DEFINE_SPINLOCK(cgroup_idr_lock);
/*
* Protects cgroup_subsys->release_agent_path. Modifying it also requires
* cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
*/
static DEFINE_SPINLOCK(release_agent_path_lock);
struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&cgroup_mutex), \
/*
* cgroup destruction makes heavy use of work items and there can be a lot
* of concurrent destructions. Use a separate workqueue so that cgroup
* destruction work items don't end up filling up max_active of system_wq
* which may lead to deadlock.
*/
static struct workqueue_struct *cgroup_destroy_wq;
/*
* pidlist destructions need to be flushed on cgroup destruction. Use a
* separate workqueue as flush domain.
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
/* generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
static struct cgroup_subsys *cgroup_subsys[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
/* array of cgroup subsystem names */
#define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
static const char *cgroup_subsys_name[] = {
#include <linux/cgroup_subsys.h>
};
Tejun Heo
committed
/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
#define SUBSYS(_x) \
DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
#include <linux/cgroup_subsys.h>
#undef SUBSYS
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
static struct static_key_true *cgroup_subsys_enabled_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
#include <linux/cgroup_subsys.h>
};
#undef SUBSYS
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
* part of that cgroup.
/*
* The default hierarchy always exists but is hidden until mounted for the
* first time. This is for backward compatibility.
*/
static bool cgrp_dfl_root_visible;
/*
* Set by the boot param of the same name and makes subsystems with NULL
* ->dfl_files to use ->legacy_files on the default hierarchy.
*/
static bool cgroup_legacy_files_on_dfl;
/* some controllers are not supported in the default hierarchy */
static unsigned long cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
static LIST_HEAD(cgroup_roots);
static int cgroup_root_count;
/* hierarchy ID allocation and mapping, protected by cgroup_mutex */
static DEFINE_IDR(cgroup_hierarchy_idr);
* Assign a monotonically increasing serial number to csses. It guarantees
* cgroups with bigger numbers are newer than those with smaller numbers.
* Also, as csses are always appended to the parent's ->children list, it
* guarantees that sibling csses are always sorted in the ascending serial
* number order on the list. Protected by cgroup_mutex.
static u64 css_serial_nr_next = 1;
/*
* These bitmask flags indicate whether tasks in the fork and exit paths have
* fork/exit handlers to call. This avoids us having to do extra work in the
* fork/exit path to check which subsystems have fork/exit callbacks.
static unsigned long have_fork_callback __read_mostly;
static unsigned long have_exit_callback __read_mostly;
/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
Tejun Heo
committed
static int rebind_subsystems(struct cgroup_root *dst_root,
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
bool visible);
static void css_release(struct percpu_ref *ref);
Tejun Heo
committed
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
/**
* cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
* @ssid: subsys ID of interest
*
* cgroup_subsys_enabled() can only be used with literal subsys names which
* is fine for individual subsystems but unsuitable for cgroup core. This
* is slower static_key_enabled() based test indexed by @ssid.
*/
static bool cgroup_ssid_enabled(int ssid)
{
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
}
Tejun Heo
committed
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
/**
* cgroup_on_dfl - test whether a cgroup is on the default hierarchy
* @cgrp: the cgroup of interest
*
* The default hierarchy is the v2 interface of cgroup and this function
* can be used to test whether a cgroup is on the default hierarchy for
* cases where a subsystem should behave differnetly depending on the
* interface version.
*
* The set of behaviors which change on the default hierarchy are still
* being determined and the mount option is prefixed with __DEVEL__.
*
* List of changed behaviors:
*
* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
* and "name" are disallowed.
*
* - When mounting an existing superblock, mount options should match.
*
* - Remount is disallowed.
*
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
* "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they got
* recycled inbetween reads.
*
* - "release_agent" and "notify_on_release" are removed. Replacement
* notification mechanism will be implemented.
*
* - "cgroup.clone_children" is removed.
*
* - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
* and its descendants contain no task; otherwise, 1. The file also
* generates kernfs notification which can be monitored through poll and
* [di]notify when the value of the file changes.
*
* - cpuset: tasks will be kept in empty cpusets when hotplug happens and
* take masks of ancestors with non-empty cpus/mems, instead of being
* moved to an ancestor.
*
* - cpuset: a task can be moved into an empty cpuset, and again it takes
* masks of ancestors.
*
* - memcg: use_hierarchy is on by default and the cgroup file for the flag
* is not created.
*
* - blkcg: blk-throttle becomes properly hierarchical.
*
* - debug: disallowed on the default hierarchy.
*/
static bool cgroup_on_dfl(const struct cgroup *cgrp)
{
return cgrp->root == &cgrp_dfl_root;
}
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
{
int ret;
idr_preload(gfp_mask);
ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
idr_preload_end();
return ret;
}
static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
{
void *ret;
ret = idr_replace(idr, ptr, id);
return ret;
}
static void cgroup_idr_remove(struct idr *idr, int id)
{
idr_remove(idr, id);
static struct cgroup *cgroup_parent(struct cgroup *cgrp)
{
struct cgroup_subsys_state *parent_css = cgrp->self.parent;
if (parent_css)
return container_of(parent_css, struct cgroup, self);
return NULL;
}
/**
* cgroup_css - obtain a cgroup's css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
* Return @cgrp's css (cgroup_subsys_state) associated with @ss. This
* function must be called either under cgroup_mutex or rcu_read_lock() and
* the caller is responsible for pinning the returned css if it wants to
* keep accessing it outside the said locks. This function may return
* %NULL if @cgrp doesn't have @subsys_id enabled.
*/
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
if (ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
else
return &cgrp->self;
/**
* cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest (%NULL returns @cgrp->self)
* Similar to cgroup_css() but returns the effective css, which is defined
* as the matching css of the nearest ancestor including self which has @ss
* enabled. If @ss is associated with the hierarchy @cgrp is on, this
* function is guaranteed to return non-NULL css.
*/
static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
lockdep_assert_held(&cgroup_mutex);
if (!ss)
return &cgrp->self;
if (!(cgrp->root->subsys_mask & (1 << ss->id)))
return NULL;
/*
* This function is used while updating css associations and thus
* can't test the csses directly. Use ->child_subsys_mask.
*/
while (cgroup_parent(cgrp) &&
!(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
cgrp = cgroup_parent(cgrp);
return cgroup_css(cgrp, ss);
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
/**
* cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
* Find and get the effective css of @cgrp for @ss. The effective css is
* defined as the matching css of the nearest ancestor including self which
* has @ss enabled. If @ss is not mounted on the hierarchy @cgrp is on,
* the root css is returned, so this function always returns a valid css.
* The returned css must be put using css_put().
*/
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
do {
css = cgroup_css(cgrp, ss);
if (css && css_tryget_online(css))
goto out_unlock;
cgrp = cgroup_parent(cgrp);
} while (cgrp);
css = init_css_set.subsys[ss->id];
css_get(css);
out_unlock:
rcu_read_unlock();
return css;
}
/* convenient tests for these bits */
static inline bool cgroup_is_dead(const struct cgroup *cgrp)
return !(cgrp->self.flags & CSS_ONLINE);
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
/*
* This is open and unprotected implementation of cgroup_css().
* seq_css() is only called from a kernfs file operation which has
* an active reference on the file. Because all the subsystem
* files are drained before a css is disassociated with a cgroup,
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
if (cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
/**
* cgroup_is_descendant - test ancestry
* @cgrp: the cgroup to be tested
* @ancestor: possible ancestor of @cgrp
*
* Test whether @cgrp is a descendant of @ancestor. It also returns %true
* if @cgrp == @ancestor. This function is safe to call as long as @cgrp
* and @ancestor are accessible.
*/
bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
{
while (cgrp) {
if (cgrp == ancestor)
return true;
static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
/**
* for_each_css - iterate all css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
*/
#define for_each_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = rcu_dereference_check( \
(cgrp)->subsys[(ssid)], \
lockdep_is_held(&cgroup_mutex)))) { } \
else
/**
* for_each_e_css - iterate all effective css's of a cgroup
* @css: the iteration cursor
* @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
* @cgrp: the target cgroup to iterate css's of
*
* Should be called under cgroup_[tree_]mutex.
*/
#define for_each_e_css(css, ssid, cgrp) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \
if (!((css) = cgroup_e_css(cgrp, cgroup_subsys[(ssid)]))) \
; \
else
* for_each_subsys - iterate all enabled cgroup subsystems
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
#define for_each_subsys(ss, ssid) \
for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT && \
(((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
/**
* for_each_subsys_which - filter for_each_subsys with a bitmask
* @ss: the iteration cursor
* @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
* @ss_maskp: a pointer to the bitmask
*
* The block will only run for cases where the ssid-th bit (1 << ssid) of
* mask is set to 1.
*/
#define for_each_subsys_which(ss, ssid, ss_maskp) \
if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \
(ssid) = 0; \
else \
for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \
if (((ss) = cgroup_subsys[ssid]) && false) \
break; \
else
/* iterate across the hierarchies */
#define for_each_root(root) \
list_for_each_entry((root), &cgroup_roots, root_list)
Tejun Heo
committed
/* iterate over child cgrps, lock should be held throughout iteration */
#define cgroup_for_each_live_child(child, cgrp) \
list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
if (({ lockdep_assert_held(&cgroup_mutex); \
Tejun Heo
committed
cgroup_is_dead(child); })) \
; \
else
static void cgroup_release_agent(struct work_struct *work);
static void check_for_release(struct cgroup *cgrp);
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
* direction, a css_set is naturally associated with multiple cgroups.
* This M:N relationship is represented by the following link structure
* which exists for each association and allows traversing the associations
* from both sides.
*/
struct cgrp_cset_link {
/* the cgroup and css_set this link associates */
struct cgroup *cgrp;
struct css_set *cset;
/* list of cgrp_cset_links anchored at cgrp->cset_links */
struct list_head cset_link;
/* list of cgrp_cset_links anchored at css_set->cgrp_links */
struct list_head cgrp_link;
/*
* The default css_set - used by init and its children prior to any
* hierarchies being mounted. It contains a pointer to the root state
* for each subsystem. Also used to anchor the list of css_sets. Not
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
.refcount = ATOMIC_INIT(1),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
};
static int css_set_count = 1; /* 1 for init_css_set */
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
/**
* cgroup_update_populated - updated populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
* @cgrp is either getting the first task (css_set) or losing the last.
* Update @cgrp->populated_cnt accordingly. The count is propagated
* towards root so that a given cgroup's populated_cnt is zero iff the
* cgroup and all its descendants are empty.
*
* @cgrp's interface file "cgroup.populated" is zero if
* @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
* changes from or to zero, userland is notified that the content of the
* interface file has changed. This can be used to detect when @cgrp and
* its descendants become populated or empty.
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
lockdep_assert_held(&css_set_rwsem);
do {
bool trigger;
if (populated)
trigger = !cgrp->populated_cnt++;
else
trigger = !--cgrp->populated_cnt;
if (!trigger)
break;
cgroup_file_notify(&cgrp->events_file);
} while (cgrp);
}
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
* account cgroups in empty hierarchies.
*/
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
struct cgroup_subsys *ss;
int i;
key += (unsigned long)css[i];
key = (key >> 16) ^ key;
static void put_css_set_locked(struct css_set *cset)
struct cgrp_cset_link *link, *tmp_link;
lockdep_assert_held(&css_set_rwsem);
if (!atomic_dec_and_test(&cset->refcount))
return;
/* This css_set is dead. unlink it and release cgroup refcounts */
for_each_subsys(ss, ssid)
list_del(&cset->e_cset_node[ssid]);
hash_del(&cset->hlist);
css_set_count--;
list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
struct cgroup *cgrp = link->cgrp;
list_del(&link->cset_link);
list_del(&link->cgrp_link);
/* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links)) {
cgroup_update_populated(cgrp, false);
kfree(link);
kfree_rcu(cset, rcu_head);
static void put_css_set(struct css_set *cset)
{
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
* rwlock
*/
if (atomic_add_unless(&cset->refcount, -1, 1))
return;
down_write(&css_set_rwsem);
up_write(&css_set_rwsem);
}
/*
* refcounted get/put for css_set objects
*/
static inline void get_css_set(struct css_set *cset)
atomic_inc(&cset->refcount);
* compare_css_sets - helper function for find_existing_css_set().
* @cset: candidate css_set being tested
* @old_cset: existing css_set for a task
* @new_cgrp: cgroup that's being entered by the task
* @template: desired set of css pointers in css_set (pre-calculated)
*
* Returns true if "cset" matches "old_cset" except for the hierarchy
* which "new_cgrp" belongs to, for which it should match "new_cgrp".
*/
static bool compare_css_sets(struct css_set *cset,
struct css_set *old_cset,
struct cgroup *new_cgrp,
struct cgroup_subsys_state *template[])
{
struct list_head *l1, *l2;
/*
* On the default hierarchy, there can be csets which are
* associated with the same set of cgroups but different csses.
* Let's first ensure that csses match.
*/
if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
return false;
/*
* Compare cgroup pointers in order to distinguish between
* different cgroups in hierarchies. As different cgroups may
* share the same effective css, this comparison is always
* necessary.
*/
l1 = &cset->cgrp_links;
l2 = &old_cset->cgrp_links;
while (1) {
struct cgrp_cset_link *link1, *link2;
struct cgroup *cgrp1, *cgrp2;
l1 = l1->next;
l2 = l2->next;
/* See if we reached the end - both lists are equal length. */
if (l1 == &cset->cgrp_links) {
BUG_ON(l2 != &old_cset->cgrp_links);
break;
} else {
BUG_ON(l2 == &old_cset->cgrp_links);
}
/* Locate the cgroups associated with these links. */
link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
cgrp1 = link1->cgrp;
cgrp2 = link2->cgrp;
/* Hierarchies should be linked in the same order. */
BUG_ON(cgrp1->root != cgrp2->root);
/*
* If this hierarchy is the hierarchy of the cgroup
* that's changing, then we need to check that this
* css_set points to the new cgroup; if it's any other
* hierarchy, then this css_set should point to the
* same cgroup as the old css_set.
*/
if (cgrp1->root == new_cgrp->root) {
if (cgrp1 != new_cgrp)
return false;
} else {
if (cgrp1 != cgrp2)
return false;
}
}
return true;
}
/**
* find_existing_css_set - init css array and find the matching css_set
* @old_cset: the css_set that we're using before the cgroup transition
* @cgrp: the cgroup that we're moving into
* @template: out param for the new set of csses, should be clear on entry
static struct css_set *find_existing_css_set(struct css_set *old_cset,
struct cgroup *cgrp,
struct cgroup_subsys_state *template[])
struct cgroup_root *root = cgrp->root;
struct css_set *cset;
/*
* Build the set of subsystem state objects that we want to see in the
* new css_set. while subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
Tejun Heo
committed
if (root->subsys_mask & (1UL << i)) {
/*
* @ss is in this hierarchy, so we want the
* effective css from @cgrp.
*/
template[i] = cgroup_e_css(cgrp, ss);
/*
* @ss is not in this hierarchy, so we don't want
* to change the css.
*/
template[i] = old_cset->subsys[i];
hash_for_each_possible(css_set_table, cset, hlist, key) {
if (!compare_css_sets(cset, old_cset, cgrp, template))
continue;
/* This css_set matches what we need */
/* No existing cgroup group matched */
return NULL;
}
static void free_cgrp_cset_links(struct list_head *links_to_free)
struct cgrp_cset_link *link, *tmp_link;
list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
list_del(&link->cset_link);
kfree(link);
}
}
/**
* allocate_cgrp_cset_links - allocate cgrp_cset_links
* @count: the number of links to allocate
* @tmp_links: list_head the allocated links are put on
*
* Allocate @count cgrp_cset_link structures and chain them on @tmp_links
* through ->cset_link. Returns 0 on success or -errno.
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
struct cgrp_cset_link *link;
INIT_LIST_HEAD(tmp_links);
for (i = 0; i < count; i++) {
link = kzalloc(sizeof(*link), GFP_KERNEL);
free_cgrp_cset_links(tmp_links);
return -ENOMEM;
}
list_add(&link->cset_link, tmp_links);
}
return 0;
}
/**
* link_css_set - a helper function to link a css_set to a cgroup
* @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
* @cset: the css_set to be linked
* @cgrp: the destination cgroup
*/
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
struct cgroup *cgrp)
struct cgrp_cset_link *link;
BUG_ON(list_empty(tmp_links));
if (cgroup_on_dfl(cgrp))
cset->dfl_cgrp = cgrp;
link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
link->cset = cset;
link->cgrp = cgrp;
if (list_empty(&cgrp->cset_links))
cgroup_update_populated(cgrp, true);
list_move(&link->cset_link, &cgrp->cset_links);
/*
* Always add links to the tail of the list so that the list
* is sorted by order of hierarchy creation
*/
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
/**
* find_css_set - return a new css_set with one cgroup updated
* @old_cset: the baseline css_set
* @cgrp: the cgroup to be updated
*
* Return a new css_set that's equivalent to @old_cset, but with @cgrp
* substituted into the appropriate hierarchy.
static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup *cgrp)
struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
struct css_set *cset;
struct list_head tmp_links;
struct cgrp_cset_link *link;
lockdep_assert_held(&cgroup_mutex);
/* First see if we already have a cgroup group that matches
* the desired set */
down_read(&css_set_rwsem);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
up_read(&css_set_rwsem);
if (cset)
return cset;
cset = kzalloc(sizeof(*cset), GFP_KERNEL);
return NULL;
/* Allocate all the cgrp_cset_link objects that we'll need */
if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
return NULL;
}
atomic_set(&cset->refcount, 1);
INIT_LIST_HEAD(&cset->cgrp_links);
INIT_LIST_HEAD(&cset->tasks);
INIT_LIST_HEAD(&cset->mg_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
down_write(&css_set_rwsem);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == cgrp->root)
c = cgrp;
link_css_set(&tmp_links, cset, c);
BUG_ON(!list_empty(&tmp_links));
css_set_count++;
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
for_each_subsys(ss, ssid)
list_add_tail(&cset->e_cset_node[ssid],
&cset->subsys[ssid]->cgroup->e_csets[ssid]);
up_write(&css_set_rwsem);
static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
struct cgroup *root_cgrp = kf_root->kn->priv;
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
lockdep_assert_held(&cgroup_mutex);
id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
if (id < 0)
return id;
root->hierarchy_id = id;
return 0;
}
static void cgroup_exit_root_id(struct cgroup_root *root)