Newer
Older
/**
* cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
* @cgrp: the cgroup to attach to
* @leader: the threadgroup leader task_struct of the group to be attached
*
* Call holding cgroup_mutex and the group_rwsem of the leader. Will take
* task_lock of each thread in leader's threadgroup individually in turn.
*/
int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
{
int retval, i, group_size;
struct cgroup_subsys *ss, *failed_ss = NULL;
bool cancel_failed_ss = false;
/* guaranteed to be initialized later, but the compiler needs this */
struct cgroup *oldcgrp = NULL;
struct css_set *oldcg;
struct cgroupfs_root *root = cgrp->root;
/* threadgroup list cursor and array */
struct task_struct *tsk;
/*
* we need to make sure we have css_sets for all the tasks we're
* going to move -before- we actually start moving them, so that in
* case we get an ENOMEM we can bail out before making any changes.
*/
struct list_head newcg_list;
struct cg_list_entry *cg_entry, *temp_nobe;
/*
* step 0: in order to do expensive, possibly blocking operations for
* every thread, we cannot iterate the thread group list, since it needs
* rcu or tasklist locked. instead, build an array of all threads in the
* group - group_rwsem prevents new threads from appearing, and if
* threads exit, this will just be an over-estimate.
*/
group_size = get_nr_threads(leader);
/* flex_array supports very large thread-groups better than kmalloc. */
group = flex_array_alloc(sizeof(struct task_struct *), group_size,
GFP_KERNEL);
/* pre-allocate to guarantee space while iterating in rcu read-side. */
retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
if (retval)
goto out_free_group_list;
/* prevent changes to the threadgroup list while we take a snapshot. */
read_lock(&tasklist_lock);
if (!thread_group_leader(leader)) {
/*
* a race with de_thread from another thread's exec() may strip
* us of our leadership, making while_each_thread unsafe to use
* on this task. if this happens, there is no choice but to
* throw this task away and try again (from cgroup_procs_write);
* this is "double-double-toil-and-trouble-check locking".
*/
read_unlock(&tasklist_lock);
retval = -EAGAIN;
goto out_free_group_list;
}
/* take a reference on each task in the group to go in the array. */
tsk = leader;
i = 0;
do {
/* @tsk either already exited or can't exit until the end */
if (tsk->flags & PF_EXITING)
continue;
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
get_task_struct(tsk);
/*
* saying GFP_ATOMIC has no effect here because we did prealloc
* earlier, but it's good form to communicate our expectations.
*/
retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
BUG_ON(retval != 0);
i++;
} while_each_thread(leader, tsk);
/* remember the number of threads in the array for later. */
group_size = i;
read_unlock(&tasklist_lock);
/*
* step 1: check that we can legitimately attach to the cgroup.
*/
for_each_subsys(root, ss) {
if (ss->can_attach) {
retval = ss->can_attach(ss, cgrp, leader);
if (retval) {
failed_ss = ss;
goto out_cancel_attach;
}
}
/* a callback to be run on every thread in the threadgroup. */
if (ss->can_attach_task) {
/* run on each task in the threadgroup. */
for (i = 0; i < group_size; i++) {
tsk = flex_array_get_ptr(group, i);
retval = ss->can_attach_task(cgrp, tsk);
if (retval) {
failed_ss = ss;
cancel_failed_ss = true;
goto out_cancel_attach;
}
}
}
}
/*
* step 2: make sure css_sets exist for all threads to be migrated.
* we use find_css_set, which allocates a new one if necessary.
*/
INIT_LIST_HEAD(&newcg_list);
for (i = 0; i < group_size; i++) {
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
/* nothing to do if this task is already in the cgroup */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
continue;
/* get old css_set pointer */
task_lock(tsk);
oldcg = tsk->cgroups;
get_css_set(oldcg);
task_unlock(tsk);
/* see if the new one for us is already in the list? */
if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
/* was already there, nothing to do. */
put_css_set(oldcg);
} else {
/* we don't already have it. get new one. */
retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
put_css_set(oldcg);
if (retval)
goto out_list_teardown;
}
}
/*
* step 3: now that we're guaranteed success wrt the css_sets, proceed
* to move all tasks to the new cgroup, calling ss->attach_task for each
* one along the way. there are no failure cases after here, so this is
* the commit point.
*/
for_each_subsys(root, ss) {
if (ss->pre_attach)
ss->pre_attach(cgrp);
}
for (i = 0; i < group_size; i++) {
/* leave current thread as it is if it's already there */
oldcgrp = task_cgroup_from_root(tsk, root);
if (cgrp == oldcgrp)
continue;
retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
BUG_ON(retval);
/* attach each task to each subsystem */
for_each_subsys(root, ss) {
if (ss->attach_task)
ss->attach_task(cgrp, tsk);
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
}
/* nothing is sensitive to fork() after this point. */
/*
* step 4: do expensive, non-thread-specific subsystem callbacks.
* TODO: if ever a subsystem needs to know the oldcgrp for each task
* being moved, this call will need to be reworked to communicate that.
*/
for_each_subsys(root, ss) {
if (ss->attach)
ss->attach(ss, cgrp, oldcgrp, leader);
}
/*
* step 5: success! and cleanup
*/
synchronize_rcu();
cgroup_wakeup_rmdir_waiter(cgrp);
retval = 0;
out_list_teardown:
/* clean up the list of prefetched css_sets. */
list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
list_del(&cg_entry->links);
put_css_set(cg_entry->cg);
kfree(cg_entry);
}
out_cancel_attach:
/* same deal as in cgroup_attach_task */
if (retval) {
for_each_subsys(root, ss) {
if (ss == failed_ss) {
if (cancel_failed_ss && ss->cancel_attach)
ss->cancel_attach(ss, cgrp, leader);
break;
}
if (ss->cancel_attach)
ss->cancel_attach(ss, cgrp, leader);
}
}
/* clean up the array of referenced threads in the group. */
for (i = 0; i < group_size; i++) {
tsk = flex_array_get_ptr(group, i);
put_task_struct(tsk);
}
return retval;
}
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
* function to attach either it or all tasks in its threadgroup. Will lock
* cgroup_mutex and threadgroup; may take task_lock of task.
static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
David Howells
committed
const struct cred *cred = current_cred(), *tcred;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
if (pid) {
rcu_read_lock();
if (!tsk) {
rcu_read_unlock();
cgroup_unlock();
return -ESRCH;
}
if (threadgroup) {
/*
* RCU protects this access, since tsk was found in the
* tid map. a race with de_thread may cause group_leader
* to stop being the leader, but cgroup_attach_proc will
* detect it later.
*/
tsk = tsk->group_leader;
/*
* even if we're attaching all tasks in the thread group, we
* only need to check permissions on one of them.
*/
David Howells
committed
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
David Howells
committed
get_task_struct(tsk);
rcu_read_unlock();
if (threadgroup)
tsk = current->group_leader;
else
tsk = current;
get_task_struct(tsk);
}
threadgroup_lock(tsk);
if (threadgroup)
threadgroup_unlock(tsk);
/*
 * Write handler that attaches only the task identified by @pid to @cgrp,
 * i.e. attach_task_by_pid() is invoked with threadgroup == false.
 * @cft is unused. Returns whatever attach_task_by_pid() returns.
 */
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
	const bool whole_threadgroup = false;

	return attach_task_by_pid(cgrp, pid, whole_threadgroup);
}
/*
 * Write handler that attaches the whole thread group identified by @tgid
 * to @cgrp. @cft is unused.
 *
 * The attach fails with -EAGAIN if threadgroup leadership changes in the
 * middle of the operation; in that case the task_struct of the new leader
 * has to be looked up again, so the attempt is simply repeated until it
 * ends with any other result, which is then returned.
 */
static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
{
	int err;

	for (;;) {
		err = attach_task_by_pid(cgrp, tgid, true);
		if (err != -EAGAIN)
			break;
	}
	return err;
}
/**
* cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
* @cgrp: the cgroup to be checked for liveness
*
* On success, returns true; the lock should be later released with
* cgroup_unlock(). On failure returns false with no lock held.
bool cgroup_lock_live_group(struct cgroup *cgrp)
{
mutex_lock(&cgroup_mutex);
if (cgroup_is_removed(cgrp)) {
mutex_unlock(&cgroup_mutex);
return false;
}
return true;
}
EXPORT_SYMBOL_GPL(cgroup_lock_live_group);
/*
 * Store @buffer as the release agent path of the hierarchy @cgrp belongs to.
 *
 * Returns 0 on success, -EINVAL if @buffer (including its terminator) does
 * not fit in PATH_MAX bytes, or -ENODEV if @cgrp has been removed.
 * @cft is unused.
 */
static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
				      const char *buffer)
{
	/* the strcpy() below is only safe if the destination holds PATH_MAX */
	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
	if (strlen(buffer) >= PATH_MAX)
		return -EINVAL;
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	strcpy(cgrp->root->release_agent_path, buffer);
	/* pair with cgroup_lock_live_group(); otherwise the mutex leaks */
	cgroup_unlock();
	return 0;
}
/*
 * Emit the release agent path of @cgrp's hierarchy, followed by a newline,
 * into @seq.
 *
 * Returns 0 on success or -ENODEV if @cgrp has been removed.
 * @cft is unused.
 */
static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
				     struct seq_file *seq)
{
	if (!cgroup_lock_live_group(cgrp))
		return -ENODEV;
	seq_puts(seq, cgrp->root->release_agent_path);
	seq_putc(seq, '\n');
	/* pair with cgroup_lock_live_group(); otherwise the mutex leaks */
	cgroup_unlock();
	return 0;
}
/* A buffer size big enough for numbers or short strings */
#define CGROUP_LOCAL_BUFFER_SIZE 64
/*
 * Parse a 64-bit integer written by userspace and hand it to the control
 * file's write_u64 handler if one is set, otherwise to write_s64.
 *
 * The text is copied into a CGROUP_LOCAL_BUFFER_SIZE stack buffer, stripped
 * of surrounding whitespace and parsed with base auto-detection (base 0).
 *
 * Returns @nbytes on success, -EINVAL for an empty write or trailing
 * garbage after the number, -E2BIG if the input does not fit the local
 * buffer, -EFAULT if the user copy fails, or the handler's non-zero result.
 * @file and @unused_ppos are unused.
 */
static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
				struct file *file,
				const char __user *userbuf,
				size_t nbytes, loff_t *unused_ppos)
{
	char buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	char *end;

	if (!nbytes)
		return -EINVAL;
	/* ">=" leaves room for the terminating nul written below */
	if (nbytes >= sizeof(buffer))
		return -E2BIG;
	if (copy_from_user(buffer, userbuf, nbytes))
		return -EFAULT;

	buffer[nbytes] = 0;     /* nul-terminate */
	if (cft->write_u64) {
		u64 val = simple_strtoull(strstrip(buffer), &end, 0);

		if (*end)
			return -EINVAL;
		retval = cft->write_u64(cgrp, cft, val);
	} else {
		s64 val = simple_strtoll(strstrip(buffer), &end, 0);

		if (*end)
			return -EINVAL;
		retval = cft->write_s64(cgrp, cft, val);
	}
	if (!retval)
		retval = nbytes;
	return retval;
}
/*
 * Copy a string written by userspace, strip surrounding whitespace and pass
 * it to the control file's write_string handler.
 *
 * The accepted length is limited by cft->max_write_len, or by the size of
 * the on-stack buffer minus one when max_write_len is 0. Inputs that do not
 * fit the stack buffer are staged in a kmalloc()ed buffer instead.
 *
 * Returns @nbytes on success, -E2BIG if the input is too long, -ENOMEM if
 * the dynamic buffer cannot be allocated, -EFAULT if the user copy fails,
 * or the handler's non-zero result. @file and @unused_ppos are unused.
 */
static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
				   struct file *file,
				   const char __user *userbuf,
				   size_t nbytes, loff_t *unused_ppos)
{
	char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
	int retval = 0;
	size_t max_bytes = cft->max_write_len;
	char *buffer = local_buffer;

	if (!max_bytes)
		max_bytes = sizeof(local_buffer) - 1;
	if (nbytes >= max_bytes)
		return -E2BIG;
	/* Allocate a dynamic buffer if we need one */
	if (nbytes >= sizeof(local_buffer)) {
		buffer = kmalloc(nbytes + 1, GFP_KERNEL);
		if (buffer == NULL)
			return -ENOMEM;
	}
	if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
		retval = -EFAULT;
		goto out;
	}

	buffer[nbytes] = 0;     /* nul-terminate */
	retval = cft->write_string(cgrp, cft, strstrip(buffer));
	if (!retval)
		retval = nbytes;
out:
	/* single exit point so the dynamic buffer is freed on every path */
	if (buffer != local_buffer)
		kfree(buffer);
	return retval;
}
/*
 * write() entry point for cgroup control files.
 *
 * Looks up the control file type and owning cgroup from the dentry and
 * dispatches to the first handler the cftype provides, in this order:
 * a raw ->write, the 64-bit integer parsers (->write_u64 / ->write_s64),
 * ->write_string, and finally ->trigger, which takes no payload.
 *
 * Returns the handler's result (for ->trigger: @nbytes on success),
 * -ENODEV if the cgroup has already been removed, or -EINVAL if the control
 * file has no write handler at all.
 */
static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
				 size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	if (cgroup_is_removed(cgrp))
		return -ENODEV;
	/* ->write is optional; calling it unchecked would oops on NULL */
	if (cft->write)
		return cft->write(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_u64 || cft->write_s64)
		return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->write_string)
		return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->trigger) {
		int ret = cft->trigger(cgrp, (unsigned int)cft->private);

		return ret ? ret : nbytes;
	}
	return -EINVAL;
}
/*
 * Read handler for control files exposing an unsigned 64-bit value.
 *
 * Fetches the value through cft->read_u64, formats it as decimal text
 * followed by a newline and copies the requested window of that text to
 * userspace. Returns the result of simple_read_from_buffer().
 * @file is unused.
 */
static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes,
			       loff_t *ppos)
{
	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
	u64 val = cft->read_u64(cgrp, cft);
	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);

	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
}
/*
 * Read handler for control files exposing a signed 64-bit value.
 *
 * Obtains the value via cft->read_s64, renders it as decimal text plus a
 * trailing newline, and lets simple_read_from_buffer() copy the requested
 * part to userspace; its result is returned. @file is unused.
 */
static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
			       struct file *file,
			       char __user *buf, size_t nbytes,
			       loff_t *ppos)
{
	char text[CGROUP_LOCAL_BUFFER_SIZE];
	s64 value;
	int text_len;

	value = cft->read_s64(cgrp, cft);
	text_len = sprintf(text, "%lld\n", (long long) value);

	return simple_read_from_buffer(buf, nbytes, ppos, text, text_len);
}
/*
 * read() entry point for cgroup control files.
 *
 * Looks up the control file type and owning cgroup from the dentry and
 * dispatches to the first read handler the cftype provides: a raw ->read,
 * then ->read_u64, then ->read_s64.
 *
 * Returns the handler's result, -ENODEV if the cgroup has already been
 * removed, or -EINVAL if the control file has none of these handlers.
 */
static ssize_t cgroup_file_read(struct file *file, char __user *buf,
				size_t nbytes, loff_t *ppos)
{
	struct cftype *cft = __d_cft(file->f_dentry);
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);

	/* only refuse reads on dead cgroups, not unconditionally */
	if (cgroup_is_removed(cgrp))
		return -ENODEV;

	if (cft->read)
		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->read_u64)
		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
	if (cft->read_s64)
		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
	return -EINVAL;
}
/*
* seqfile ops/methods for returning structured data. Currently just
* supports string->u64 maps, but can be extended in future.
*
* One cgroup_seqfile_state is allocated per open of a control file that has
* a read_map or read_seq_string handler; it is stored as the seq_file's
* private data and freed again when the file is released.
*/
struct cgroup_seqfile_state {
/* control file type whose read_map/read_seq_string handler is invoked */
struct cftype *cft;
/* cgroup passed as the first argument to that handler */
struct cgroup *cgroup;
};
/*
 * Map-fill callback: append one "key value" line to the seq_file stored in
 * cb->state. Returns the result of seq_printf().
 */
static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
{
	struct seq_file *seq = cb->state;
	unsigned long long printable = (unsigned long long)value;

	return seq_printf(seq, "%s %llu\n", key, printable);
}
/*
 * seq_file show method for control files with structured output.
 *
 * m->private carries the cgroup_seqfile_state of this open file. If the
 * control file has a read_map handler it is called with a callback that
 * prints each key/value pair into @m; otherwise read_seq_string is called
 * to write into @m directly. Returns the handler's result. @arg is unused.
 */
static int cgroup_seqfile_show(struct seq_file *m, void *arg)
{
	struct cgroup_seqfile_state *state = m->private;
	struct cftype *cft = state->cft;

	if (cft->read_map) {
		struct cgroup_map_cb cb = {
			.fill = cgroup_map_add,
			.state = m,
		};
		return cft->read_map(state->cgroup, cft, &cb);
	}
	return cft->read_seq_string(state->cgroup, cft, m);
}
/*
 * Release method for seq_file backed control files: free the per-open
 * state hanging off the seq_file, then let single_release() tear down the
 * seq_file itself and return its result.
 */
static int cgroup_seqfile_release(struct inode *inode, struct file *file)
{
	struct seq_file *sf = file->private_data;
	void *open_state = sf->private;

	kfree(open_state);
	return single_release(inode, file);
}
static const struct file_operations cgroup_seqfile_operations = {
.write = cgroup_file_write,
.llseek = seq_lseek,
.release = cgroup_seqfile_release,
};
/*
 * open() entry point for cgroup control files.
 *
 * After the generic open checks, control files that provide read_map or
 * read_seq_string are switched over to the seq_file based operations: a
 * cgroup_seqfile_state recording the cftype and the owning cgroup is
 * allocated and handed to single_open(). Other control files get their
 * optional ->open callback invoked.
 *
 * Returns 0 on success, the error from generic_file_open(), single_open()
 * or ->open, or -ENOMEM if the seq_file state cannot be allocated.
 */
static int cgroup_file_open(struct inode *inode, struct file *file)
{
	int err;
	struct cftype *cft;

	err = generic_file_open(inode, file);
	if (err)
		return err;
	cft = __d_cft(file->f_dentry);

	if (cft->read_map || cft->read_seq_string) {
		struct cgroup_seqfile_state *state =
			kzalloc(sizeof(*state), GFP_USER);

		if (!state)
			return -ENOMEM;
		state->cft = cft;
		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
		file->f_op = &cgroup_seqfile_operations;
		err = single_open(file, cgroup_seqfile_show, state);
		/* on failure nothing else owns the state, so free it here */
		if (err < 0)
			kfree(state);
	} else if (cft->open)
		err = cft->open(inode, file);
	else
		err = 0;

	return err;
}
/*
 * release() entry point for ordinary control files: forward to the
 * cftype's optional ->release callback, or report success (0) when the
 * control file does not define one.
 */
static int cgroup_file_release(struct inode *inode, struct file *file)
{
	struct cftype *cft = __d_cft(file->f_dentry);

	if (!cft->release)
		return 0;
	return cft->release(inode, file);
}
/*
* cgroup_rename - Only allow simple rename of directories in place.
*/
static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
if (!S_ISDIR(old_dentry->d_inode->i_mode))
return -ENOTDIR;
if (new_dentry->d_inode)
return -EEXIST;
if (old_dir != new_dir)
return -EIO;
return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
}
/*
* File operations for regular cgroup control files. read/write/open/release
* go through the cgroup dispatchers, which select the handler provided by
* the file's cftype; seeking uses the generic implementation.
* __file_cft() compares a file's i_fop against this table to recognise
* control files.
*/
static const struct file_operations cgroup_file_operations = {
.read = cgroup_file_read,
.write = cgroup_file_write,
.llseek = generic_file_llseek,
.open = cgroup_file_open,
.release = cgroup_file_release,
};
/*
* Inode operations installed on cgroup directories by cgroup_create_file():
* creating, removing and renaming child cgroups.
*
* NOTE(review): no .lookup handler is set here even though cgroup_lookup()
* is defined right below with a matching signature - confirm whether a
* ".lookup = cgroup_lookup" entry was lost.
*/
static const struct inode_operations cgroup_dir_inode_operations = {
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
.rename = cgroup_rename,
};
/*
 * Directory lookup: names longer than NAME_MAX are rejected with
 * -ENAMETOOLONG; any other name is added as a negative dentry (no inode)
 * and NULL is returned. @dir and @nd are unused.
 */
static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
	if (dentry->d_name.len <= NAME_MAX) {
		d_add(dentry, NULL);
		return NULL;
	}
	return ERR_PTR(-ENAMETOOLONG);
}
/*
* Check if a file is a control file
*/
static inline struct cftype *__file_cft(struct file *file)
{
if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
return ERR_PTR(-EINVAL);
return __d_cft(file->f_dentry);
}
/*
 * Create an inode of type @mode on superblock @sb and bind it to @dentry.
 *
 * Regular files get cgroup_file_operations and size 0. Directories get the
 * cgroup directory inode operations, an extra link for their "." entry,
 * and are returned with i_mutex held (I_MUTEX_CHILD class) so the caller
 * can populate them without racing against another mkdir.
 *
 * An extra reference is taken on @dentry to pin it in core.
 *
 * Returns 0 on success, -ENOENT for a NULL @dentry, -EEXIST if @dentry
 * already has an inode, or -ENOMEM if no inode could be allocated.
 */
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
			      struct super_block *sb)
{
	struct inode *new_inode;

	if (dentry == NULL)
		return -ENOENT;
	if (dentry->d_inode != NULL)
		return -EEXIST;

	new_inode = cgroup_new_inode(mode, sb);
	if (new_inode == NULL)
		return -ENOMEM;

	if (S_ISREG(mode)) {
		new_inode->i_size = 0;
		new_inode->i_fop = &cgroup_file_operations;
	} else if (S_ISDIR(mode)) {
		new_inode->i_op = &cgroup_dir_inode_operations;
		new_inode->i_fop = &simple_dir_operations;

		/* a directory starts with i_nlink == 2 because of "." */
		inc_nlink(new_inode);

		/*
		 * hand the directory back locked so it can be populated
		 * without racing with another mkdir
		 */
		mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
	}

	d_instantiate(dentry, new_inode);
	dget(dentry);	/* extra reference keeps the dentry pinned in core */
	return 0;
}
/*
* cgroup_create_dir - create a directory for an object.
* @cgrp: the cgroup we create the directory for. It must have a valid
* ->parent field. And we are going to fill its ->dentry field.
* @dentry: dentry of the new cgroup
* @mode: mode to set on new directory.
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
{
struct dentry *parent;
int error = 0;
parent = cgrp->parent->dentry;
error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
dentry->d_fsdata = cgrp;
inc_nlink(parent->d_inode);
rcu_assign_pointer(cgrp->dentry, dentry);
dget(dentry);
}
dput(dentry);
return error;
}
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
* returns cft->mode if ->mode is not 0
* returns S_IRUGO|S_IWUSR if it has both a read and a write handler
* returns S_IRUGO if it has only a read handler
* returns S_IWUSR if it has only a write hander
*/
static mode_t cgroup_file_mode(const struct cftype *cft)
{
mode_t mode = 0;
if (cft->mode)
return cft->mode;
if (cft->read || cft->read_u64 || cft->read_s64 ||
cft->read_map || cft->read_seq_string)
mode |= S_IRUGO;
if (cft->write || cft->write_u64 || cft->write_s64 ||
cft->write_string || cft->trigger)
mode |= S_IWUSR;
return mode;
}
int cgroup_add_file(struct cgroup *cgrp,
struct cgroup_subsys *subsys,
const struct cftype *cft)
{
struct dentry *dir = cgrp->dentry;
struct dentry *dentry;
int error;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
strcpy(name, subsys->name);
strcat(name, ".");
}
strcat(name, cft->name);
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
dentry = lookup_one_len(name, dir, strlen(name));
if (!IS_ERR(dentry)) {
mode = cgroup_file_mode(cft);
error = cgroup_create_file(dentry, mode | S_IFREG,
cgrp->root->sb);
if (!error)
dentry->d_fsdata = (void *)cft;
dput(dentry);
} else
error = PTR_ERR(dentry);
return error;
}
EXPORT_SYMBOL_GPL(cgroup_add_file);
/*
 * cgroup_add_files - create several control files in a cgroup directory.
 *
 * Calls cgroup_add_file() for each of the @count entries of @cft in order
 * and stops at the first failure, returning its error code. Files created
 * before the failure are left in place. Returns 0 if every file was added.
 */
int cgroup_add_files(struct cgroup *cgrp,
			struct cgroup_subsys *subsys,
			const struct cftype cft[],
			int count)
{
	int idx;

	for (idx = 0; idx < count; idx++) {
		int rc = cgroup_add_file(cgrp, subsys, &cft[idx]);

		if (rc)
			return rc;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(cgroup_add_files);
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
* Return the number of tasks in the cgroup.
*/
int cgroup_task_count(const struct cgroup *cgrp)
read_lock(&css_set_lock);
list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
count += atomic_read(&link->cg->refcount);
}
read_unlock(&css_set_lock);
/*
 * Move a cgroup iterator on to the next css_set of @cgrp that has at least
 * one task on its list. The iterator is expected to sit at the start of a
 * css_set (or at the list head) on entry.
 *
 * When the walk wraps around to the list head, it->cg_link is set to NULL
 * to mark the end; otherwise it->cg_link and it->task are pointed at the
 * css_set link found and at the first task on that css_set's list.
 */
static void cgroup_advance_iter(struct cgroup *cgrp,
				struct cgroup_iter *it)
{
	struct list_head *pos = it->cg_link;
	struct css_set *cset;

	for (;;) {
		pos = pos->next;
		if (pos == &cgrp->css_sets) {
			/* back at the head: no more css_sets */
			it->cg_link = NULL;
			return;
		}
		cset = list_entry(pos, struct cg_cgroup_link,
				  cgrp_link_list)->cg;
		/* skip css_sets that currently have no tasks */
		if (!list_empty(&cset->tasks))
			break;
	}

	it->cg_link = pos;
	it->task = cset->tasks.next;
}
/*
* To reduce the fork() overhead for systems that are not actually
* using their cgroups capability, we don't maintain the lists running
* through each css_set to its tasks until we see the list actually
* used - in other words after the first call to cgroup_iter_start().
*
* The tasklist_lock is not held here, as do_each_thread() and
* while_each_thread() are protected by RCU.
*
* This function sets use_task_css_set_links and then walks every thread in
* the system, linking each non-exiting task that is not yet on a list onto
* the tasks list of its css_set. The whole walk runs with css_set_lock held
* for writing.
*/
static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
write_lock(&css_set_lock);
/* set the flag under the lock, before fixing up existing tasks */
use_task_css_set_links = 1;
do_each_thread(g, p) {
task_lock(p);
/*
* We should check if the process is exiting, otherwise
* it will race with cgroup_exit() in that the list
* entry won't be deleted though the process has exited.
*/
/* list_empty() skips tasks that are already linked */
if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
list_add(&p->cg_list, &p->cgroups->tasks);
task_unlock(p);
} while_each_thread(g, p);
write_unlock(&css_set_lock);
}
/*
 * cgroup_iter_start - begin iterating over the tasks of @cgrp.
 *
 * Takes css_set_lock for reading and returns with it still held, then
 * positions @it on the first css_set of @cgrp that has tasks.
 */
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
{
	/*
	 * The first time anyone tries to iterate across a cgroup,
	 * we need to enable the list linking each css_set to its
	 * tasks, and fix up all existing tasks.
	 */
	if (!use_task_css_set_links)
		cgroup_enable_task_cg_lists();

	read_lock(&css_set_lock);
	it->cg_link = &cgrp->css_sets;
	cgroup_advance_iter(cgrp, it);
}
struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
struct cgroup_iter *it)
{
struct task_struct *res;
struct list_head *l = it->task;
/* If the iterator cg is NULL, we have no tasks */
if (!it->cg_link)
return NULL;
res = list_entry(l, struct task_struct, cg_list);
/* Advance iterator to find next entry */
l = l->next;
link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
if (l == &link->cg->tasks) {
/* We reached the end of this task list - move on to
* the next cg_cgroup_link */
cgroup_advance_iter(cgrp, it);
} else {
it->task = l;
}
return res;
}
/*
 * cgroup_iter_end - finish a task iteration by releasing the read side of
 * css_set_lock. Neither @cgrp nor @it is touched.
 */
void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
{
	read_unlock(&css_set_lock);
}
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
/*
 * Return 1 if @t1 started after the point in time @time, 0 if it started
 * before it.
 *
 * When both times are equal the tie is broken arbitrarily by pointer
 * value: the lower pointer counts as having started first, so the result
 * is (t1 > t2). @t2 is only compared, never dereferenced, so it may refer
 * to a task that has already exited - it still serves to tell apart two
 * tasks that started (effectively) simultaneously.
 */
static inline int started_after_time(struct task_struct *t1,
				     struct timespec *time,
				     struct task_struct *t2)
{
	int cmp = timespec_compare(&t1->start_time, time);

	if (cmp != 0)
		return cmp > 0;

	return t1 > t2;
}
/*
 * Heap ordering callback (used via heap_insert()): treats both arguments
 * as task_struct pointers and reports whether @p1 started after @p2, which
 * orders the heap in descending task start time.
 */
static inline int started_after(void *p1, void *p2)
{
	struct task_struct *other = p2;

	return started_after_time(p1, &other->start_time, other);
}
/**
* cgroup_scan_tasks - iterate though all the tasks in a cgroup
* @scan: struct cgroup_scanner containing arguments for the scan
*
* Arguments include pointers to callback functions test_task() and
* process_task().
* Iterate through all the tasks in a cgroup, calling test_task() for each,
* and if it returns true, call process_task() for it also.
* The test_task pointer may be NULL, meaning always true (select all tasks).
* Effectively duplicates cgroup_iter_{start,next,end}()
* but does not lock css_set_lock for the call to process_task().
* The struct cgroup_scanner may be embedded in any structure of the caller's
* creation.
* It is guaranteed that process_task() will act on every task that
* is a member of the cgroup for the duration of this call. This
* function may or may not call process_task() for tasks that exit
* or move to a different cgroup during the call, or are forked or
* move into the cgroup during the call.
*
* Note that test_task() may be called with locks held, and may in some
* situations be called multiple times for the same task, so it should
* be cheap.
* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
* pre-allocated and will be used for heap operations (and its "gt" member will
* be overwritten), else a temporary heap will be used (allocation of which
* may cause this function to fail).
*/
int cgroup_scan_tasks(struct cgroup_scanner *scan)
{
int retval, i;
struct cgroup_iter it;
struct task_struct *p, *dropped;
/* Never dereference latest_task, since it's not refcounted */
struct task_struct *latest_task = NULL;
struct ptr_heap tmp_heap;
struct ptr_heap *heap;
struct timespec latest_time = { 0, 0 };
if (scan->heap) {
/* The caller supplied our heap and pre-allocated its memory */
heap = scan->heap;
heap->gt = &started_after;
} else {
/* We need to allocate our own heap memory */
heap = &tmp_heap;
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
if (retval)
/* cannot allocate the heap */
return retval;
}
again:
/*
* Scan tasks in the cgroup, using the scanner's "test_task" callback
* to determine which are of interest, and using the scanner's
* "process_task" callback to process any of them that need an update.
* Since we don't want to hold any locks during the task updates,
* gather tasks to be processed in a heap structure.
* The heap is sorted by descending task start time.
* If the statically-sized heap fills up, we overflow tasks that
* started later, and in future iterations only consider tasks that
* started after the latest task in the previous pass. This
* guarantees forward progress and that we don't miss any tasks.
*/
heap->size = 0;
cgroup_iter_start(scan->cg, &it);
while ((p = cgroup_iter_next(scan->cg, &it))) {
/*
* Only affect tasks that qualify per the caller's callback,
* if he provided one
*/
if (scan->test_task && !scan->test_task(p, scan))
continue;
/*
* Only process tasks that started after the last task
* we processed
*/
if (!started_after_time(p, &latest_time, latest_task))
continue;
dropped = heap_insert(heap, p);
if (dropped == NULL) {
/*
* The new task was inserted; the heap wasn't
* previously full
*/
get_task_struct(p);
} else if (dropped != p) {
/*
* The new task was inserted, and pushed out a
* different task
*/
get_task_struct(p);
put_task_struct(dropped);
}
/*
* Else the new task was newer than anything already in
* the heap and wasn't inserted
*/
}
cgroup_iter_end(scan->cg, &it);
if (heap->size) {
for (i = 0; i < heap->size; i++) {
struct task_struct *q = heap->ptrs[i];
latest_time = q->start_time;
latest_task = q;
}
/* Process the task per the caller's callback */
scan->process_task(q, scan);
put_task_struct(q);
}
/*
* If we had to process any tasks at all, scan again
* in case some of them were in the middle of forking
* children that didn't get processed.