/* init_css_set.subsys[] has been updated, re-hash */
hash_del(&init_css_set.hlist);
hash_add(css_set_table, &init_css_set.hlist,
css_set_hash(init_css_set.subsys));
WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
WARN_ON(register_filesystem(&cgroup_fs_type));
WARN_ON(register_filesystem(&cgroup2_fs_type));
	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));

	return 0;
}
static int __init cgroup_wq_init(void)
{
/*
* There isn't much point in executing destruction path in
* parallel. Good chunk is serialized with cgroup_mutex anyway.
* Use 1 for @max_active.
*
* We would prefer to do this in cgroup_init() above, but that
* is called before init_workqueues(): so leave this until after.
*/
cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
BUG_ON(!cgroup_destroy_wq);
return 0;
}
core_initcall(cgroup_wq_init);
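
/*
 * Illustrative sketch (not part of the original file): elsewhere in this
 * file, destruction steps are packaged as work items and queued on
 * cgroup_destroy_wq so they run in process context, serialized by the
 * @max_active == 1 setting above.  The "example_" names are hypothetical.
 */
static void example_destroy_work_fn(struct work_struct *work)
{
	/* tear-down that may sleep and take cgroup_mutex would go here */
}

static void __maybe_unused example_queue_destroy(struct work_struct *work)
{
	INIT_WORK(work, example_destroy_work_fn);
	queue_work(cgroup_destroy_wq, work);
}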
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
*/
int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	char *buf;
	int retval;
	struct cgroup_root *root;

	retval = -ENOMEM;
	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf)
		goto out;

	mutex_lock(&cgroup_mutex);
	spin_lock_irq(&css_set_lock);

	for_each_root(root) {
		struct cgroup_subsys *ss;
		struct cgroup *cgrp;
		int ssid, count = 0;

		if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
			continue;

		seq_printf(m, "%d:", root->hierarchy_id);
		if (root != &cgrp_dfl_root)
			for_each_subsys(ss, ssid)
				if (root->subsys_mask & (1 << ssid))
					seq_printf(m, "%s%s", count++ ? "," : "",
						   ss->legacy_name);
		if (strlen(root->name))
			seq_printf(m, "%sname=%s", count ? "," : "",
				   root->name);
		seq_putc(m, ':');

		cgrp = task_cgroup_from_root(tsk, root);

		/*
		 * On traditional hierarchies, all zombie tasks show up as
		 * belonging to the root cgroup.  On the default hierarchy,
		 * while a zombie doesn't show up in "cgroup.procs" and
		 * thus can't be migrated, its /proc/PID/cgroup keeps
		 * reporting the cgroup it belonged to before exiting.  If
		 * the cgroup is removed before the zombie is reaped,
		 * " (deleted)" is appended to the cgroup path.
		 */
		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
			retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
						current->nsproxy->cgroup_ns);
			if (retval >= PATH_MAX)
				retval = -ENAMETOOLONG;
			if (retval < 0)
				goto out_unlock;

			seq_puts(m, buf);
		} else {
			seq_puts(m, "/");
		}

		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
			seq_puts(m, " (deleted)\n");
		else
			seq_putc(m, '\n');
	}

	retval = 0;
out_unlock:
	spin_unlock_irq(&css_set_lock);
	mutex_unlock(&cgroup_mutex);
	kfree(buf);
out:
	return retval;
}
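
/*
 * For reference (illustrative, the values below are hypothetical), the
 * resulting /proc/<pid>/cgroup output is one
 * "hierarchy-ID:controller-list:cgroup-path" line per hierarchy, e.g.:
 *
 *	11:cpu,cpuacct:/user.slice
 *	1:name=systemd:/user.slice/user-1000.slice
 *	0::/user.slice/user-1000.slice
 *
 * where the "0::" line is the cgroup v2 (default) hierarchy entry.
 */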
/**
 * cgroup_fork - initialize cgroup related fields during copy_process()
 * @child: pointer to task_struct of the forking process's new child.
 *
* A task is associated with the init_css_set until cgroup_post_fork()
* attaches it to the parent's css_set. Empty cg_list indicates that
* @child isn't holding reference to its css_set.
*/
void cgroup_fork(struct task_struct *child)
{
RCU_INIT_POINTER(child->cgroups, &init_css_set);
	INIT_LIST_HEAD(&child->cg_list);
}

/**
* cgroup_can_fork - called on a new task before the process is exposed
* @child: the task in question.
*
* This calls the subsystem can_fork() callbacks. If the can_fork() callback
* returns an error, the fork aborts with that error code. This allows for
* a cgroup subsystem to conditionally allow or deny new forks.
*/
int cgroup_can_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i, j, ret;
do_each_subsys_mask(ss, i, have_canfork_callback) {
ret = ss->can_fork(child);
if (ret)
goto out_revert;
} while_each_subsys_mask();
return 0;
out_revert:
for_each_subsys(ss, j) {
if (j >= i)
break;
if (ss->cancel_fork)
ss->cancel_fork(child);
}
return ret;
}
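
/*
 * Illustrative sketch (not part of the original file): how a controller
 * such as the pids controller hooks into the fork path.  The "example_"
 * identifiers are hypothetical; a real controller also provides
 * css_alloc/css_free and an entry in cgroup_subsys.h.
 */
static int example_can_fork(struct task_struct *task)
{
	/* return 0 to allow the fork, or an errno such as -EAGAIN to veto it */
	return 0;
}

static void example_cancel_fork(struct task_struct *task)
{
	/* release whatever example_can_fork() charged */
}

static struct cgroup_subsys example_cgrp_subsys __maybe_unused = {
	.can_fork	= example_can_fork,
	.cancel_fork	= example_cancel_fork,
};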
/**
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
* @child: the task in question
*
* This calls the cancel_fork() callbacks if a fork failed *after*
 * cgroup_can_fork() succeeded.
*/
void cgroup_cancel_fork(struct task_struct *child)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
			ss->cancel_fork(child);
}

/**
 * cgroup_post_fork - called on a new task after adding it to the task list
* @child: the task in question
*
* Adds the task to the list running through its css_set if necessary and
 * calls the subsystem fork() callbacks.  Has to be after the task is
* visible on the task list in case we race with the first call to
* cgroup_task_iter_start() - to guarantee that the new task ends up on its
 * list.
 */
void cgroup_post_fork(struct task_struct *child)
{
	struct cgroup_subsys *ss;
	int i;

/*
* This may race against cgroup_enable_task_cg_lists(). As that
* function sets use_task_css_set_links before grabbing
* tasklist_lock and we just went through tasklist_lock to add
* @child, it's guaranteed that either we see the set
* use_task_css_set_links or cgroup_enable_task_cg_lists() sees
* @child during its iteration.
*
* If we won the race, @child is associated with %current's
* css_set. Grabbing css_set_lock guarantees both that the
* association is stable, and, on completion of the parent's
* migration, @child is visible in the source of migration or
* already in the destination cgroup. This guarantee is necessary
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
* Note that if we lose to cgroup_enable_task_cg_lists(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
*/
if (use_task_css_set_links) {
struct css_set *cset;
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
get_css_set(cset);
			css_set_move_task(child, NULL, cset, false);
		}
		spin_unlock_irq(&css_set_lock);
	}

/*
* Call ss->fork(). This must happen after @child is linked on
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
} while_each_subsys_mask();
}

/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk and release it.
*
* Note that cgroups marked notify_on_release force every task in
 * them to take the global cgroup_mutex when exiting.
* This could impact scaling on very large systems. Be reluctant to
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
 * We set the exiting task's cgroup to the root cgroup (top_cgroup).  We
* call cgroup_exit() while the task is still competent to handle
* notify_on_release(), then leave the task attached to the root cgroup in
* each hierarchy for the remainder of its exit. No need to bother with
* init_css_set refcnting. init_css_set never goes away and we can't race
 * with migration path - PF_EXITING is visible to migration path.
 */
void cgroup_exit(struct task_struct *tsk)
{
	struct cgroup_subsys *ss;
	struct css_set *cset;
	int i;

	/*
	 * Unlink @tsk from its css_set.  As migration path can't race
	 * with us, we can check css_set and cg_list without synchronization.
	 */
	cset = task_css_set(tsk);

	if (!list_empty(&tsk->cg_list)) {
		spin_lock_irq(&css_set_lock);
		css_set_move_task(tsk, cset, NULL, false);
		spin_unlock_irq(&css_set_lock);
	} else {
		get_css_set(cset);
	}

	/* see cgroup_post_fork() for details */
	do_each_subsys_mask(ss, i, have_exit_callback) {
		ss->exit(tsk);
	} while_each_subsys_mask();
}

void cgroup_free(struct task_struct *task)
{
struct css_set *cset = task_css_set(task);
struct cgroup_subsys *ss;
int ssid;
do_each_subsys_mask(ss, ssid, have_free_callback) {
ss->free(task);
} while_each_subsys_mask();
	put_css_set(cset);
}

static int __init cgroup_disable(char *str)
{
	struct cgroup_subsys *ss;
	char *token;
	int i;

while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
		for_each_subsys(ss, i) {
			if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
cgroup_disable_mask |= 1 << i;
}
}
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
 * @ss: subsystem of interest
 *
 * If @dentry is a directory for a cgroup which has @ss enabled on it, try
 * to get the corresponding css and return it.  If such css doesn't exist
 * or can't be pinned, an ERR_PTR value is returned.
 */
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
						       struct cgroup_subsys *ss)
{
	struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
	struct file_system_type *s_type = dentry->d_sb->s_type;
	struct cgroup_subsys_state *css = NULL;
	struct cgroup *cgrp;

/* is @dentry a cgroup dir? */
if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
	    !kn || kernfs_type(kn) != KERNFS_DIR)
		return ERR_PTR(-EBADF);

	rcu_read_lock();

/*
* This path doesn't originate from kernfs and @kn could already
* have been or be removed at any point. @kn->priv is RCU
* protected for this access. See css_release_work_fn() for details.
*/
cgrp = rcu_dereference(kn->priv);
if (cgrp)
css = cgroup_css(cgrp, ss);
if (!css || !css_tryget_online(css))
css = ERR_PTR(-ENOENT);
rcu_read_unlock();
	return css;
}

/**
* css_from_id - lookup css by id
* @id: the cgroup id
* @ss: cgroup subsys to be looked into
*
* Returns the css if there's valid one with @id, otherwise returns NULL.
* Should be called under rcu_read_lock().
*/
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
	return idr_find(&ss->css_idr, id);
}

/**
* cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
* @path: path on the default hierarchy
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
 * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
* if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
if (kn) {
if (kernfs_type(kn) == KERNFS_DIR) {
cgrp = kn->priv;
cgroup_get(cgrp);
} else {
cgrp = ERR_PTR(-ENOTDIR);
}
kernfs_put(kn);
} else {
cgrp = ERR_PTR(-ENOENT);
}
mutex_unlock(&cgroup_mutex);
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
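
/*
 * Illustrative sketch (not part of the original file): a typical in-kernel
 * caller resolves a v2 path, uses the cgroup, then drops the reference.
 * The path and the "example_" name are hypothetical.
 */
static void __maybe_unused example_use_cgroup_path(void)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_path("/my.slice/my.service");
	if (IS_ERR(cgrp))
		return;

	/* ... use cgrp, e.g. compare against a task's css_set->dfl_cgrp ... */

	cgroup_put(cgrp);
}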
/**
* cgroup_get_from_fd - get a cgroup pointer from a fd
* @fd: fd obtained by open(cgroup2_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
struct cgroup *cgroup_get_from_fd(int fd)
{
struct cgroup_subsys_state *css;
struct cgroup *cgrp;
struct file *f;
f = fget_raw(fd);
if (!f)
return ERR_PTR(-EBADF);
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
fput(f);
if (IS_ERR(css))
return ERR_CAST(css);
cgrp = css->cgroup;
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
}
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
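
/*
 * Illustrative sketch (not part of the original file): a caller that is
 * handed a cgroup2 directory fd (for example as a syscall argument) can
 * resolve and release the cgroup like this.  "example_" is hypothetical.
 */
static int __maybe_unused example_use_cgroup_fd(int fd)
{
	struct cgroup *cgrp;

	cgrp = cgroup_get_from_fd(fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	/* ... associate state with cgrp ... */

	cgroup_put(cgrp);
	return 0;
}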
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
DEFINE_SPINLOCK(cgroup_sk_update_lock);
static bool cgroup_sk_alloc_disabled __read_mostly;
void cgroup_sk_alloc_disable(void)
{
if (cgroup_sk_alloc_disabled)
return;
pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
cgroup_sk_alloc_disabled = true;
}
#else
#define cgroup_sk_alloc_disabled false
#endif
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
if (cgroup_sk_alloc_disabled)
return;
/* Socket clone path */
if (skcd->val) {
cgroup_get(sock_cgroup_ptr(skcd));
return;
}
rcu_read_lock();
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
skcd->val = (unsigned long)cset->dfl_cgrp;
break;
}
cpu_relax();
}
rcu_read_unlock();
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
cgroup_put(sock_cgroup_ptr(skcd));
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
}
static void dec_cgroup_namespaces(struct ucounts *ucounts)
{
dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
}
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
struct cgroup_namespace *new_ns;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_alloc_inum(&new_ns->ns);
if (ret) {
kfree(new_ns);
return ERR_PTR(ret);
}
atomic_set(&new_ns->count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
}
EXPORT_SYMBOL(free_cgroup_ns);
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
struct cgroup_namespace *new_ns;
struct ucounts *ucounts;
struct css_set *cset;
BUG_ON(!old_ns);
if (!(flags & CLONE_NEWCGROUP)) {
get_cgroup_ns(old_ns);
return old_ns;
}
/* Allow only sysadmin to create cgroup namespace. */
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
ucounts = inc_cgroup_namespaces(user_ns);
if (!ucounts)
return ERR_PTR(-ENOSPC);
/* It is not safe to take cgroup_mutex here */
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
	spin_unlock_irq(&css_set_lock);

	new_ns = alloc_cgroup_ns();
	if (IS_ERR(new_ns)) {
		put_css_set(cset);
		dec_cgroup_namespaces(ucounts);
		return new_ns;
	}

new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
return new_ns;
}
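
#if 0	/* userspace usage sketch, for illustration only */
/*
 * The path above is exercised by unshare(2) with CLONE_NEWCGROUP: the
 * caller's current css_set becomes the root of the new namespace, so its
 * own cgroup shows up as "/" afterwards.  Error handling omitted; needs
 * CAP_SYS_ADMIN in the owning user namespace.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	if (unshare(CLONE_NEWCGROUP))
		perror("unshare");
	system("cat /proc/self/cgroup");	/* paths now relative to the new root */
	return 0;
}
#endif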
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
return container_of(ns, struct cgroup_namespace, ns);
}
static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
	struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);

if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/* Don't need to do anything if we are attaching to our own cgroupns. */
if (cgroup_ns == nsproxy->cgroup_ns)
return 0;
get_cgroup_ns(cgroup_ns);
put_cgroup_ns(nsproxy->cgroup_ns);
nsproxy->cgroup_ns = cgroup_ns;
return 0;
}
static struct ns_common *cgroupns_get(struct task_struct *task)
{
struct cgroup_namespace *ns = NULL;
struct nsproxy *nsproxy;
task_lock(task);
nsproxy = task->nsproxy;
if (nsproxy) {
ns = nsproxy->cgroup_ns;
get_cgroup_ns(ns);
}
task_unlock(task);
return ns ? &ns->ns : NULL;
}
static void cgroupns_put(struct ns_common *ns)
{
put_cgroup_ns(to_cg_ns(ns));
}
static struct user_namespace *cgroupns_owner(struct ns_common *ns)
{
return to_cg_ns(ns)->user_ns;
}
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
.type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
.owner = cgroupns_owner,
};
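
#if 0	/* userspace usage sketch, for illustration only */
/*
 * cgroupns_install() above runs when a process calls setns(2) on an fd
 * opened from /proc/<pid>/ns/cgroup.  Error handling omitted, the pid is
 * hypothetical, and CAP_SYS_ADMIN is needed in both user namespaces.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <fcntl.h>

int main(void)
{
	int fd = open("/proc/1234/ns/cgroup", O_RDONLY);

	setns(fd, CLONE_NEWCGROUP);
	return 0;
}
#endif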
static __init int cgroup_namespaces_init(void)
{
return 0;
}
subsys_initcall(cgroup_namespaces_init);
#ifdef CONFIG_CGROUP_BPF
void cgroup_bpf_update(struct cgroup *cgrp,
struct bpf_prog *prog,
enum bpf_attach_type type)
{
struct cgroup *parent = cgroup_parent(cgrp);
mutex_lock(&cgroup_mutex);
__cgroup_bpf_update(cgrp, parent, prog, type);
mutex_unlock(&cgroup_mutex);
}
#endif /* CONFIG_CGROUP_BPF */
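
#if 0	/* userspace usage sketch, for illustration only */
/*
 * cgroup_bpf_update() is reached via the bpf(2) BPF_PROG_ATTACH command.
 * Minimal sketch: cgroup_fd/prog_fd are hypothetical fds (a cgroup2
 * directory and a loaded BPF program), error handling omitted.
 */
#include <linux/bpf.h>
#include <sys/syscall.h>
#include <unistd.h>

static int attach_prog_to_cgroup(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr = {};

	attr.target_fd		= cgroup_fd;
	attr.attach_bpf_fd	= prog_fd;
	attr.attach_type	= BPF_CGROUP_INET_INGRESS;

	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}
#endif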