Newer
Older
* This may race against cgroup_enable_task_cg_lists(). As that
* function sets use_task_css_set_links before grabbing
* tasklist_lock and we just went through tasklist_lock to add
* @child, it's guaranteed that either we see the set
* use_task_css_set_links or cgroup_enable_task_cg_lists() sees
* @child during its iteration.
*
* If we won the race, @child is associated with %current's
* css_set. Grabbing css_set_lock guarantees both that the
* association is stable, and, on completion of the parent's
* migration, @child is visible in the source of migration or
* already in the destination cgroup. This guarantee is necessary
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
* Note that if we lose to cgroup_enable_task_cg_lists(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
Frederic Weisbecker
committed
*/
if (use_task_css_set_links) {
struct css_set *cset;
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
get_css_set(cset);
css_set_move_task(child, NULL, cset, false);
spin_unlock_irq(&css_set_lock);
Tejun Heo
committed
/*
* Call ss->fork(). This must happen after @child is linked on
* css_set; otherwise, @child might change state between ->fork()
* and addition to css_set.
*/
do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
} while_each_subsys_mask();
Tejun Heo
committed
/**
* cgroup_exit - detach cgroup from exiting task
* @tsk: pointer to task_struct of exiting process
*
* Description: Detach cgroup from @tsk and release it.
*
* Note that cgroups marked notify_on_release force every task in
* them to take the global cgroup_mutex mutex when exiting.
* This could impact scaling on very large systems. Be reluctant to
* use notify_on_release cgroups where very high task exit scaling
* is required on large systems.
*
* We set the exiting tasks cgroup to the root cgroup (top_cgroup). We
* call cgroup_exit() while the task is still competent to handle
* notify_on_release(), then leave the task attached to the root cgroup in
* each hierarchy for the remainder of its exit. No need to bother with
* init_css_set refcnting. init_css_set never goes away and we can't race
* with migration path - PF_EXITING is visible to migration path.
void cgroup_exit(struct task_struct *tsk)
struct css_set *cset;
* Unlink from @tsk from its css_set. As migration path can't race
* with us, we can check css_set and cg_list without synchronization.
cset = task_css_set(tsk);
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
/* see cgroup_post_fork() for details */
do_each_subsys_mask(ss, i, have_exit_callback) {
} while_each_subsys_mask();
void cgroup_free(struct task_struct *task)
{
struct css_set *cset = task_css_set(task);
struct cgroup_subsys *ss;
int ssid;
do_each_subsys_mask(ss, ssid, have_free_callback) {
ss->free(task);
} while_each_subsys_mask();
put_css_set(cset);
static void check_for_release(struct cgroup *cgrp)
if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
schedule_work(&cgrp->release_agent_work);
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
}
/*
* Notify userspace when a cgroup is released, by running the
* configured release agent with the name of the cgroup (path
* relative to the root of cgroup file system) as the argument.
*
* Most likely, this user command will try to rmdir this cgroup.
*
* This races with the possibility that some other task will be
* attached to this cgroup before it is removed, or that some other
* user task will 'mkdir' a child cgroup of this cgroup. That's ok.
* The presumed 'rmdir' will fail quietly if this cgroup is no longer
* unused, and this cgroup will be reprieved from its death sentence,
* to continue to serve a useful existence. Next time it's released,
* we will get notified again, if it still has 'notify_on_release' set.
*
* The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
* means only wait until the task is successfully execve()'d. The
* separate release agent task is forked by call_usermodehelper(),
* then control in this thread returns here, without waiting for the
* release agent task. We don't bother to wait because the caller of
* this routine has no use for the exit status of the release agent
* task, so no sense holding our caller up for that.
*/
static void cgroup_release_agent(struct work_struct *work)
{
struct cgroup *cgrp =
container_of(work, struct cgroup, release_agent_work);
char *pathbuf = NULL, *agentbuf = NULL;
int ret;
mutex_lock(&cgroup_mutex);
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
if (!pathbuf || !agentbuf)
goto out;
spin_lock_irq(&css_set_lock);
ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
spin_unlock_irq(&css_set_lock);
Tejun Heo
committed
if (ret < 0 || ret >= PATH_MAX)
goto out;
argv[0] = agentbuf;
argv[1] = pathbuf;
argv[2] = NULL;
/* minimal command environment */
envp[0] = "HOME=/";
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[2] = NULL;
mutex_unlock(&cgroup_mutex);
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
mutex_unlock(&cgroup_mutex);
kfree(agentbuf);
kfree(pathbuf);
static int __init cgroup_disable(char *str)
{
char *token;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
cgroup_disable_mask |= 1 << i;
}
}
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
static int __init cgroup_no_v1(char *str)
{
struct cgroup_subsys *ss;
char *token;
int i;
while ((token = strsep(&str, ",")) != NULL) {
if (!*token)
continue;
if (!strcmp(token, "all")) {
break;
}
for_each_subsys(ss, i) {
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
cgroup_no_v1_mask |= 1 << i;
}
}
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
* @ss: subsystem of interest
* If @dentry is a directory for a cgroup which has @ss enabled on it, try
* to get the corresponding css and return it. If such css doesn't exist
* or can't be pinned, an ERR_PTR value is returned.
struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
struct cgroup_subsys *ss)
struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
struct file_system_type *s_type = dentry->d_sb->s_type;
/* is @dentry a cgroup dir? */
if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
!kn || kernfs_type(kn) != KERNFS_DIR)
rcu_read_lock();
/*
* This path doesn't originate from kernfs and @kn could already
* have been or be removed at any point. @kn->priv is RCU
* protected for this access. See css_release_work_fn() for details.
*/
cgrp = rcu_dereference(kn->priv);
if (cgrp)
css = cgroup_css(cgrp, ss);
if (!css || !css_tryget_online(css))
css = ERR_PTR(-ENOENT);
rcu_read_unlock();
return css;
/**
* css_from_id - lookup css by id
* @id: the cgroup id
* @ss: cgroup subsys to be looked into
*
* Returns the css if there's valid one with @id, otherwise returns NULL.
* Should be called under rcu_read_lock().
*/
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return idr_find(&ss->css_idr, id);
6273
6274
6275
6276
6277
6278
6279
6280
6281
6282
6283
6284
6285
6286
6287
6288
6289
6290
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
/**
* cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
* @path: path on the default hierarchy
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
* success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
* if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
if (kn) {
if (kernfs_type(kn) == KERNFS_DIR) {
cgrp = kn->priv;
cgroup_get(cgrp);
} else {
cgrp = ERR_PTR(-ENOTDIR);
}
kernfs_put(kn);
} else {
cgrp = ERR_PTR(-ENOENT);
}
mutex_unlock(&cgroup_mutex);
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6307
6308
6309
6310
6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
/**
* cgroup_get_from_fd - get a cgroup pointer from a fd
* @fd: fd obtained by open(cgroup2_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
struct cgroup *cgroup_get_from_fd(int fd)
{
struct cgroup_subsys_state *css;
struct cgroup *cgrp;
struct file *f;
f = fget_raw(fd);
if (!f)
return ERR_PTR(-EBADF);
css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
fput(f);
if (IS_ERR(css))
return ERR_CAST(css);
cgrp = css->cgroup;
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
}
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
/*
* sock->sk_cgrp_data handling. For more info, see sock_cgroup_data
* definition in cgroup-defs.h.
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
DEFINE_SPINLOCK(cgroup_sk_update_lock);
6350
6351
6352
6353
6354
6355
6356
6357
6358
6359
6360
6361
6362
6363
6364
6365
6366
6367
6368
6369
6370
static bool cgroup_sk_alloc_disabled __read_mostly;
void cgroup_sk_alloc_disable(void)
{
if (cgroup_sk_alloc_disabled)
return;
pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
cgroup_sk_alloc_disabled = true;
}
#else
#define cgroup_sk_alloc_disabled false
#endif
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
if (cgroup_sk_alloc_disabled)
return;
/* Socket clone path */
if (skcd->val) {
cgroup_get(sock_cgroup_ptr(skcd));
return;
}
6377
6378
6379
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
rcu_read_lock();
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
skcd->val = (unsigned long)cset->dfl_cgrp;
break;
}
cpu_relax();
}
rcu_read_unlock();
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
cgroup_put(sock_cgroup_ptr(skcd));
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
static struct ucounts *inc_cgroup_namespaces(struct user_namespace *ns)
{
return inc_ucount(ns, current_euid(), UCOUNT_CGROUP_NAMESPACES);
}
static void dec_cgroup_namespaces(struct ucounts *ucounts)
{
dec_ucount(ucounts, UCOUNT_CGROUP_NAMESPACES);
}
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
static struct cgroup_namespace *alloc_cgroup_ns(void)
{
struct cgroup_namespace *new_ns;
int ret;
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_alloc_inum(&new_ns->ns);
if (ret) {
kfree(new_ns);
return ERR_PTR(ret);
}
atomic_set(&new_ns->count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
void free_cgroup_ns(struct cgroup_namespace *ns)
{
put_css_set(ns->root_cset);
dec_cgroup_namespaces(ns->ucounts);
put_user_ns(ns->user_ns);
ns_free_inum(&ns->ns);
kfree(ns);
}
EXPORT_SYMBOL(free_cgroup_ns);
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
struct user_namespace *user_ns,
struct cgroup_namespace *old_ns)
{
struct cgroup_namespace *new_ns;
struct ucounts *ucounts;
struct css_set *cset;
BUG_ON(!old_ns);
if (!(flags & CLONE_NEWCGROUP)) {
get_cgroup_ns(old_ns);
return old_ns;
}
/* Allow only sysadmin to create cgroup namespace. */
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
ucounts = inc_cgroup_namespaces(user_ns);
if (!ucounts)
Eric W. Biederman
committed
return ERR_PTR(-ENOSPC);
/* It is not safe to take cgroup_mutex here */
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
spin_unlock_irq(&css_set_lock);
if (IS_ERR(new_ns)) {
put_css_set(cset);
dec_cgroup_namespaces(ucounts);
return new_ns;
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
new_ns->root_cset = cset;
return new_ns;
}
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
{
return container_of(ns, struct cgroup_namespace, ns);
}
static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
/* Don't need to do anything if we are attaching to our own cgroupns. */
if (cgroup_ns == nsproxy->cgroup_ns)
return 0;
get_cgroup_ns(cgroup_ns);
put_cgroup_ns(nsproxy->cgroup_ns);
nsproxy->cgroup_ns = cgroup_ns;
return 0;
6505
6506
6507
6508
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
}
static struct ns_common *cgroupns_get(struct task_struct *task)
{
struct cgroup_namespace *ns = NULL;
struct nsproxy *nsproxy;
task_lock(task);
nsproxy = task->nsproxy;
if (nsproxy) {
ns = nsproxy->cgroup_ns;
get_cgroup_ns(ns);
}
task_unlock(task);
return ns ? &ns->ns : NULL;
}
static void cgroupns_put(struct ns_common *ns)
{
put_cgroup_ns(to_cg_ns(ns));
}
static struct user_namespace *cgroupns_owner(struct ns_common *ns)
{
return to_cg_ns(ns)->user_ns;
}
const struct proc_ns_operations cgroupns_operations = {
.name = "cgroup",
.type = CLONE_NEWCGROUP,
.get = cgroupns_get,
.put = cgroupns_put,
.install = cgroupns_install,
.owner = cgroupns_owner,
};
static __init int cgroup_namespaces_init(void)
{
return 0;
}
subsys_initcall(cgroup_namespaces_init);
#ifdef CONFIG_CGROUP_BPF
void cgroup_bpf_update(struct cgroup *cgrp,
struct bpf_prog *prog,
enum bpf_attach_type type)
{
struct cgroup *parent = cgroup_parent(cgrp);
mutex_lock(&cgroup_mutex);
__cgroup_bpf_update(cgrp, parent, prog, type);
mutex_unlock(&cgroup_mutex);
}
#endif /* CONFIG_CGROUP_BPF */
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *
debug_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
if (!css)
return ERR_PTR(-ENOMEM);
return css;
}
static void debug_css_free(struct cgroup_subsys_state *css)
{
kfree(css);
}
static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return cgroup_task_count(css->cgroup);
}
static u64 current_css_set_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return (u64)(unsigned long)current->cgroups;
}
static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
{
u64 count;
rcu_read_lock();
count = atomic_read(&task_css_set(current)->refcount);
rcu_read_unlock();
return count;
}
static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
{
struct cgrp_cset_link *link;
struct css_set *cset;
char *name_buf;
name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
if (!name_buf)
return -ENOMEM;
spin_lock_irq(&css_set_lock);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
cgroup_name(c, name_buf, NAME_MAX + 1);
seq_printf(seq, "Root %d group %s\n",
}
rcu_read_unlock();
spin_unlock_irq(&css_set_lock);
return 0;
}
#define MAX_TASKS_SHOWN_PER_CSS 25
static int cgroup_css_links_read(struct seq_file *seq, void *v)
{
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
seq_printf(seq, "css_set %p\n", cset);
list_for_each_entry(task, &cset->tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
goto overflow;
seq_printf(seq, " task %d\n", task_pid_vnr(task));
}
list_for_each_entry(task, &cset->mg_tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
goto overflow;
seq_printf(seq, " task %d\n", task_pid_vnr(task));
}
continue;
overflow:
seq_puts(seq, " ...\n");
}
spin_unlock_irq(&css_set_lock);
return 0;
}
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
return (!cgroup_is_populated(css->cgroup) &&
!css_has_online_children(&css->cgroup->self));
}
static struct cftype debug_files[] = {
{
.name = "taskcount",
.read_u64 = debug_taskcount_read,
},
{
.name = "current_css_set",
.read_u64 = current_css_set_read,
},
{
.name = "current_css_set_refcount",
.read_u64 = current_css_set_refcount_read,
},
{
.name = "current_css_set_cg_links",
.seq_show = current_css_set_cg_links_read,
},
{
.name = "cgroup_css_links",
.seq_show = cgroup_css_links_read,
},
{
.name = "releasable",
.read_u64 = releasable_read,
},
{ } /* terminate */
};
struct cgroup_subsys debug_cgrp_subsys = {
Tejun Heo
committed
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
.legacy_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */