}
/*
* Gets called on POLLHUP on eventfd when user closes it.
*
* Called with wqh->lock held and interrupts disabled.
*/
static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
int sync, void *key)
{
struct cgroup_event *event = container_of(wait,
struct cgroup_event, wait);
struct cgroup *cgrp = event->css->cgroup;
unsigned long flags = (unsigned long)key;
if (flags & POLLHUP) {
/*
* If the event has been detached at cgroup removal, we
* can simply return knowing the other side will cleanup
* for us.
*
* We can't race against event freeing since the other
* side will require wqh->lock via remove_wait_queue(),
* which we hold.
*/
spin_lock(&cgrp->event_list_lock);
if (!list_empty(&event->list)) {
list_del_init(&event->list);
/*
* We are in atomic context, but cgroup_event_remove()
* may sleep, so we have to call it in workqueue.
*/
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
}
return 0;
}
static void cgroup_event_ptable_queue_proc(struct file *file,
wait_queue_head_t *wqh, poll_table *pt)
{
struct cgroup_event *event = container_of(pt,
struct cgroup_event, pt);
event->wqh = wqh;
add_wait_queue(wqh, &event->wait);
}
/*
* Parse input and register new cgroup event handler.
*
* Input must be in format '<event_fd> <control_fd> <args>'.
* Interpretation of args is defined by control file implementation.
*/
static int cgroup_write_event_control(struct cgroup_subsys_state *css,
struct cftype *cft, const char *buffer)
{
struct cgroup *cgrp = css->cgroup;
struct cgroup_event *event;
struct cgroup *cgrp_cfile;
unsigned int efd, cfd;
struct file *efile;
struct file *cfile;
char *endp;
int ret;
efd = simple_strtoul(buffer, &endp, 10);
if (*endp != ' ')
return -EINVAL;
buffer = endp + 1;
cfd = simple_strtoul(buffer, &endp, 10);
if ((*endp != ' ') && (*endp != '\0'))
return -EINVAL;
buffer = endp + 1;
event = kzalloc(sizeof(*event), GFP_KERNEL);
if (!event)
return -ENOMEM;
event->css = css;
INIT_LIST_HEAD(&event->list);
init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
INIT_WORK(&event->remove, cgroup_event_remove);
efile = eventfd_fget(efd);
if (IS_ERR(efile)) {
ret = PTR_ERR(efile);
goto out_kfree;
}
event->eventfd = eventfd_ctx_fileget(efile);
if (IS_ERR(event->eventfd)) {
ret = PTR_ERR(event->eventfd);
goto out_put_efile;
}
cfile = fget(cfd);
if (!cfile) {
ret = -EBADF;
goto out_put_eventfd;
}
/* the process needs read permission on the control file */
/* AV: shouldn't we check that it's been opened for read instead? */
ret = inode_permission(file_inode(cfile), MAY_READ);
if (ret < 0)
goto out_put_cfile;
event->cft = __file_cft(cfile);
if (IS_ERR(event->cft)) {
ret = PTR_ERR(event->cft);
goto out_put_cfile;
}
/*
* The file to be monitored must be in the same cgroup as
* cgroup.event_control is.
*/
cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent);
if (cgrp_cfile != cgrp) {
ret = -EINVAL;
goto out_put_cfile;
}
if (!event->cft->register_event || !event->cft->unregister_event) {
ret = -EINVAL;
goto out_put_cfile;
}
ret = event->cft->register_event(css, event->cft,
event->eventfd, buffer);
if (ret)
goto out_put_cfile;
efile->f_op->poll(efile, &event->pt);
/*
* Events should be removed after rmdir of cgroup directory, but before
* destroying subsystem state objects. Let's take reference to cgroup
* directory dentry to do that.
*/
dget(cgrp->dentry);
spin_lock(&cgrp->event_list_lock);
list_add(&event->list, &cgrp->event_list);
spin_unlock(&cgrp->event_list_lock);
fput(cfile);
fput(efile);
return 0;
out_put_cfile:
fput(cfile);
out_put_eventfd:
eventfd_ctx_put(event->eventfd);
out_put_efile:
fput(efile);
out_kfree:
kfree(event);
return ret;
}
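/*
 * Illustration only (not part of this file): a minimal userspace sketch of
 * the "<event_fd> <control_fd> <args>" protocol parsed above. The cgroup
 * path, the monitored control file and the <args> value are assumptions for
 * the example; how <args> is interpreted is up to the control file that is
 * being monitored.
 */
#if 0	/* example, excluded from the build */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

static void wait_for_cgroup_event(void)
{
	const char *dir = "/sys/fs/cgroup/memory/mygrp";	/* assumed path */
	char path[256], buf[64];
	uint64_t ticks;
	int efd, cfd, ecfd;

	efd = eventfd(0, 0);					/* <event_fd> */

	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", dir);
	cfd = open(path, O_RDONLY);				/* <control_fd> */

	snprintf(path, sizeof(path), "%s/cgroup.event_control", dir);
	ecfd = open(path, O_WRONLY);

	/* register: "<event_fd> <control_fd> <args>" */
	snprintf(buf, sizeof(buf), "%d %d %s", efd, cfd, "4194304");
	write(ecfd, buf, strlen(buf));

	/* blocks until the control file's handler signals the eventfd */
	read(efd, &ticks, sizeof(ticks));
}
#endif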
static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
}
static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
if (val)
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
else
clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags);
return 0;
}
static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
.open = cgroup_procs_open,
.release = cgroup_pidlist_release,
},
{
.name = "cgroup.event_control",
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
{
.name = "cgroup.clone_children",
.read_u64 = cgroup_clone_children_read,
.write_u64 = cgroup_clone_children_write,
},
{
.name = "cgroup.sane_behavior",
.flags = CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_sane_behavior_show,
},
/*
* Historical crazy stuff. These don't have "cgroup." prefix and
* don't exist if sane_behavior. If you're depending on these, be
* prepared to be burned.
*/
{
.name = "tasks",
.flags = CFTYPE_INSANE, /* use "procs" instead */
.open = cgroup_tasks_open,
.write_u64 = cgroup_tasks_write,
.release = cgroup_pidlist_release,
.mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
.flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
.flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
.read_seq_string = cgroup_release_agent_show,
.write_string = cgroup_release_agent_write,
.max_write_len = PATH_MAX,
},
{ }	/* terminate */
};
/**
* cgroup_populate_dir - create subsys files in a cgroup directory
* @cgrp: target cgroup
* @subsys_mask: mask of the subsystem ids whose files should be added
*
* On failure, no file is added.
*/
static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
{
struct cgroup_subsys *ss;
struct cftype_set *set;
int i, ret = 0;
/* process cftsets of each subsystem */
for_each_subsys(ss, i) {
if (!test_bit(i, &subsys_mask))
continue;
list_for_each_entry(set, &ss->cftsets, node) {
ret = cgroup_addrm_files(cgrp, set->cfts, true);
if (ret < 0)
goto err;
}
}
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
struct css_id *id = rcu_dereference_protected(css->id, true);
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferenced
* from RCU-read-side without locks.
*/
if (id)
rcu_assign_pointer(id->css, css);
}
return 0;
err:
cgroup_clear_dir(cgrp, subsys_mask);
return ret;
}
static void css_free_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
if (css->parent)
css_put(css->parent);
}
static void css_release(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
/*
* css holds an extra ref to @cgrp->dentry which is put on the last
* css_put(). dput() requires process context, which css_put() may
* be called without. @css->destroy_work will be used to invoke
* dput() asynchronously from css_put().
*/
INIT_WORK(&css->destroy_work, css_free_work_fn);
schedule_work(&css->destroy_work);
}
static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss,
struct cgroup *cgrp)
{
css->cgroup = cgrp;
if (cgrp->parent)
css->parent = cgroup_css(cgrp->parent, ss->subsys_id);
else
css->flags |= CSS_ROOT;
}
/* invoke ->css_online() on a new CSS and mark it online if successful */
static int online_css(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
int ret = 0;
lockdep_assert_held(&cgroup_mutex);
if (ss->css_online)
ret = ss->css_online(css);
if (!ret) {
css->flags |= CSS_ONLINE;
rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css);
}
return ret;
}
/* if the CSS is online, invoke ->css_offline() on it and mark it offline */
static void offline_css(struct cgroup_subsys_state *css)
{
struct cgroup_subsys *ss = css->ss;
lockdep_assert_held(&cgroup_mutex);
if (!(css->flags & CSS_ONLINE))
return;
if (ss->css_offline)
ss->css_offline(css);
css->flags &= ~CSS_ONLINE;
}
/**
* cgroup_create - create a cgroup
* @parent: cgroup that will be parent of the new cgroup
* @dentry: dentry of the new cgroup
* @mode: mode to set on new inode
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
umode_t mode)
{
struct cgroup_name *name;
struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { };
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
int err = 0;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
/* allocate the cgroup and its ID, 0 is reserved for the root */
cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
if (!cgrp)
return -ENOMEM;
name = cgroup_alloc_name(dentry);
if (!name)
goto err_free_cgrp;
rcu_assign_pointer(cgrp->name, name);
/*
* Temporarily set the pointer to NULL, so idr_find() won't return
* a half-baked cgroup.
*/
cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
if (cgrp->id < 0)
goto err_free_name;
/*
* Only live parents can have children. Note that the liveliness
* check isn't strictly necessary because cgroup_mkdir() and
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
* anyway so that locking is contained inside cgroup proper and we
* don't get nasty surprises if we ever grow another caller.
*/
if (!cgroup_lock_live_group(parent)) {
err = -ENODEV;
goto err_free_id;
}
/* Grab a reference on the superblock so the hierarchy doesn't
* get deleted on unmount if there are child cgroups. This
* can be done outside cgroup_mutex, since the sb can't
* disappear while someone has an open control file on the
* fs */
atomic_inc(&sb->s_active);
init_cgroup_housekeeping(cgrp);
dentry->d_fsdata = cgrp;
cgrp->dentry = dentry;
cgrp->parent = parent;
cgrp->dummy_css.parent = &parent->dummy_css;
cgrp->root = parent->root;
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css;
css = ss->css_alloc(cgroup_css(parent, ss->subsys_id));
if (IS_ERR(css)) {
err = PTR_ERR(css);
goto err_free_all;
}
css_ar[ss->subsys_id] = css;
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
goto err_free_all;
}
/*
* Create directory. cgroup_create_file() returns with the new
* directory locked on success so that it can be populated without
* dropping cgroup_mutex.
*/
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
if (err < 0)
goto err_free_all;
lockdep_assert_held(&dentry->d_inode->i_mutex);
cgrp->serial_nr = cgroup_serial_nr_next++;
/* allocation complete, commit to creation */
list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children);
root->number_of_cgroups++;
/* each css holds a ref to the cgroup's dentry and the parent css */
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
percpu_ref_get(&css->parent->refcnt);
}
/* hold a ref to the parent's dentry */
dget(parent->dentry);
/* creation succeeded, notify subsystems */
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
parent->parent) {
pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
current->comm, current->pid, ss->name);
if (!strcmp(ss->name, "memory"))
pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n");
ss->warned_broken_hierarchy = true;
}
}
idr_replace(&root->cgroup_idr, cgrp, cgrp->id);
err = cgroup_addrm_files(cgrp, cgroup_base_files, true);
if (err)
goto err_destroy;
err = cgroup_populate_dir(cgrp, root->subsys_mask);
if (err)
goto err_destroy;
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return 0;
err_free_all:
for_each_root_subsys(root, ss) {
struct cgroup_subsys_state *css = css_ar[ss->subsys_id];
if (css) {
percpu_ref_cancel_init(&css->refcnt);
ss->css_free(css);
}
}
mutex_unlock(&cgroup_mutex);
/* Release the reference count that we took on the superblock */
deactivate_super(sb);
err_free_id:
idr_remove(&root->cgroup_idr, cgrp->id);
err_free_name:
kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp:
kfree(cgrp);
return err;
err_destroy:
cgroup_destroy_locked(cgrp);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&dentry->d_inode->i_mutex);
return err;
}
static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
struct cgroup *c_parent = dentry->d_parent->d_fsdata;
/* the vfs holds inode->i_mutex already */
return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}
static void cgroup_css_killed(struct cgroup *cgrp)
{
if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
return;
/* percpu ref's of all css's are killed, kick off the next step */
INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
schedule_work(&cgrp->destroy_work);
}
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget() is now guaranteed to fail.
*/
static void css_killed_work_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
cgroup_css_killed(cgrp);
}
/* css kill confirmation processing requires process context, bounce */
static void css_killed_ref_fn(struct percpu_ref *ref)
{
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
INIT_WORK(&css->destroy_work, css_killed_work_fn);
schedule_work(&css->destroy_work);
}
/**
* cgroup_destroy_locked - the first stage of cgroup destruction
* @cgrp: cgroup to be destroyed
*
* css's make use of percpu refcnts whose killing latency shouldn't be
* exposed to userland and are RCU protected. Also, cgroup core needs to
* guarantee that css_tryget() won't succeed by the time ->css_offline() is
* invoked. To satisfy all the requirements, destruction is implemented in
* the following two steps.
*
* s1. Verify @cgrp can be destroyed and mark it dying. Remove all
* userland visible parts and start killing the percpu refcnts of
* css's. Set up so that the next stage will be kicked off once all
* the percpu refcnts are confirmed to be killed.
*
* s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
* rest of destruction. Once all cgroup references are gone, the
* cgroup is RCU-freed.
*
* This function implements s1. After this step, @cgrp is gone as far as
* the userland is concerned and a new cgroup with the same name may be
* created. As cgroup doesn't care about the names internally, this
* doesn't cause any problem.
*/
static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct dentry *d = cgrp->dentry;
struct cgroup_event *event, *tmp;
struct cgroup_subsys *ss;
bool empty;
lockdep_assert_held(&d->d_inode->i_mutex);
lockdep_assert_held(&cgroup_mutex);
/*
* css_set_lock synchronizes access to ->cset_links and prevents
* @cgrp from being removed while __put_css_set() is in progress.
*/
read_lock(&css_set_lock);
empty = list_empty(&cgrp->cset_links) && list_empty(&cgrp->children);
read_unlock(&css_set_lock);
if (!empty)
return -EBUSY;
/*
* Block new css_tryget() by killing css refcnts. cgroup core
* guarantees that, by the time ->css_offline() is invoked, no new
* css reference will be given out via css_tryget(). We can't
* simply call percpu_ref_kill() and proceed to offlining css's
* because percpu_ref_kill() doesn't guarantee that the ref is seen
* as killed on all CPUs on return.
*
* Use percpu_ref_kill_and_confirm() to get notifications as each
* css is confirmed to be seen as killed on all CPUs. The
* notification callback keeps track of the number of css's to be
* killed and schedules cgroup_offline_fn() to perform the rest of
* destruction once the percpu refs of all css's are confirmed to
* be killed.
*/
atomic_set(&cgrp->css_kill_cnt, 1);
for_each_root_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id);
/*
* Killing would put the base ref, but we need to keep it
* alive until after ->css_offline.
*/
percpu_ref_get(&css->refcnt);
atomic_inc(&cgrp->css_kill_cnt);
percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
}
cgroup_css_killed(cgrp);
/*
* Mark @cgrp dead. This prevents further task migration and child
* creation by disabling cgroup_lock_live_group(). Note that
* CGRP_DEAD assertion is depended upon by css_next_child() to
* resume iteration after dropping RCU read lock. See
* css_next_child() for details.
*/
set_bit(CGRP_DEAD, &cgrp->flags);
/* CGRP_DEAD is set, remove from ->release_list for the last time */
raw_spin_lock(&release_list_lock);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
raw_spin_unlock(&release_list_lock);
/*
* Clear and remove @cgrp directory. The removal puts the base ref
* but we aren't quite done with @cgrp yet, so hold onto it.
*/
cgroup_clear_dir(cgrp, cgrp->root->subsys_mask);
cgroup_addrm_files(cgrp, cgroup_base_files, false);
dget(d);
cgroup_d_remove_dir(d);
/*
* Unregister events and notify userspace.
* Notify userspace about cgroup removing only after rmdir of cgroup
* directory to avoid race between userspace and kernelspace.
*/
spin_lock(&cgrp->event_list_lock);
list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
list_del_init(&event->list);
schedule_work(&event->remove);
}
spin_unlock(&cgrp->event_list_lock);
return 0;
}
/**
* cgroup_offline_fn - the second step of cgroup destruction
* @work: cgroup->destroy_work
*
* This function is invoked from a work item for a cgroup which is being
* destroyed after the percpu refcnts of all css's are guaranteed to be
* seen as killed on all CPUs, and performs the rest of destruction. This
* is the second step of destruction described in the comment above
* cgroup_destroy_locked().
*/
static void cgroup_offline_fn(struct work_struct *work)
{
struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
struct cgroup *parent = cgrp->parent;
struct dentry *d = cgrp->dentry;
struct cgroup_subsys *ss;
mutex_lock(&cgroup_mutex);
/*
* css_tryget() is guaranteed to fail now. Tell subsystems to
* initiate destruction.
*/
for_each_root_subsys(cgrp->root, ss)
offline_css(cgroup_css(cgrp, ss->subsys_id));
/*
* Put the css refs from cgroup_destroy_locked(). Each css holds
* an extra reference to the cgroup's dentry and cgroup removal
* proceeds regardless of css refs. On the last put of each css,
* whenever that may be, the extra dentry ref is put so that dentry
* destruction happens only after all css's are released.
*/
for_each_root_subsys(cgrp->root, ss)
css_put(cgroup_css(cgrp, ss->subsys_id));
/* delete this cgroup from parent->children */
list_del_rcu(&cgrp->sibling);
/*
* We should remove the cgroup object from idr before its grace
* period starts, so we won't be looking up a cgroup while the
* cgroup is being freed.
*/
idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
cgrp->id = -1;
set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
mutex_unlock(&cgroup_mutex);
}
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = cgroup_destroy_locked(dentry->d_fsdata);
mutex_unlock(&cgroup_mutex);
return ret;
}
static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
{
INIT_LIST_HEAD(&ss->cftsets);
/*
* base_cftset is embedded in subsys itself, no need to worry about
* deregistration.
*/
if (ss->base_cftypes) {
struct cftype *cft;
for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++)
cft->ss = ss;
ss->base_cftset.cfts = ss->base_cftypes;
list_add_tail(&ss->base_cftset.node, &ss->cftsets);
}
}
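/*
 * Illustration only: a sketch of what a subsystem's ->base_cftypes array
 * looks like to the loop above - an empty sentinel entry (name[0] == '\0')
 * terminates the iteration. The subsystem file name and handler below are
 * hypothetical.
 */
#if 0	/* example, excluded from the build */
static u64 example_weight_read(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	return 10;	/* would normally return per-css state */
}

static struct cftype example_base_files[] = {
	{
		.name = "example.weight",
		.read_u64 = example_weight_read,
	},
	{ }	/* terminate - stops the cft->name[0] != '\0' loop */
};
#endif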
static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
mutex_lock(&cgroup_mutex);
/* init base cftset */
cgroup_init_cftsets(ss);
/* Create the top cgroup state for this subsystem */
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id));
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_css(css, ss, cgroup_dummy_top);
/* Update the init_css_set to contain a subsys
* pointer to this state - since the subsystem is
* newly registered, all tasks and hence the
* init_css_set is in the subsystem's top cgroup. */
init_css_set.subsys[ss->subsys_id] = css;
need_forkexit_callback |= ss->fork || ss->exit;
/* At system boot, before all subsystems have been
* registered, no tasks have been forked, so we don't
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
BUG_ON(online_css(css));
mutex_unlock(&cgroup_mutex);
/* this function shouldn't be used with modular subsystems, since they
* need to register a subsys_id, among other things */
BUG_ON(ss->module);
}
/**
* cgroup_load_subsys: load and register a modular subsystem at runtime
* @ss: the subsystem to load
*
* This function should be called in a modular subsystem's initcall. If the
* subsystem is built as a module, it will be assigned a new subsys_id and set
* up for use. If the subsystem is built-in anyway, work is delegated to the
* simpler cgroup_init_subsys.
*/
int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
struct css_set *cset;
struct hlist_node *tmp;
unsigned long key;
int i, ret;
/* check name and function validity */
if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN ||
ss->css_alloc == NULL || ss->css_free == NULL)
return -EINVAL;
/*
* we don't support callbacks in modular subsystems. this check is
* before the ss->module check for consistency; a subsystem that could
* be a module should still have no callbacks even if the user isn't
* compiling it as one.
*/
if (ss->fork || ss->exit)
return -EINVAL;
/*
* an optionally modular subsystem is built-in: we want to do nothing,
* since cgroup_init_subsys will have already taken care of it.
*/
if (ss->module == NULL) {
BUG_ON(cgroup_subsys[ss->subsys_id] != ss);
return 0;
}
/* init base cftset */
cgroup_init_cftsets(ss);
mutex_lock(&cgroup_mutex);
cgroup_subsys[ss->subsys_id] = ss;
/*
* no ss->css_alloc seems to need anything important in the ss
* struct, so this can happen first (i.e. before the dummy root
* attachment).
*/
css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id));
if (IS_ERR(css)) {
/* failure case - need to deassign the cgroup_subsys[] slot. */
cgroup_subsys[ss->subsys_id] = NULL;
mutex_unlock(&cgroup_mutex);
return PTR_ERR(css);
}
list_add(&ss->sibling, &cgroup_dummy_root.subsys_list);
ss->root = &cgroup_dummy_root;
/* our new subsystem will be attached to the dummy hierarchy. */
init_css(css, ss, cgroup_dummy_top);
/* init_idr must be after init_css() because it sets css->id. */
if (ss->use_id) {
ret = cgroup_init_idr(ss, css);
if (ret)
goto err_unload;
}
/*
* Now we need to entangle the css into the existing css_sets. unlike
* in cgroup_init_subsys, there are now multiple css_sets, so each one
* will need a new pointer to it; done by iterating the css_set_table.
* furthermore, modifying the existing css_sets will corrupt the hash
* table state, so each changed css_set will need its hash recomputed.
* this is all done under the css_set_lock.
*/
write_lock(&css_set_lock);
hash_for_each_safe(css_set_table, i, tmp, cset, hlist) {
/* skip entries that we already rehashed */
if (cset->subsys[ss->subsys_id])
continue;
/* remove existing entry */
hash_del(&cset->hlist);
cset->subsys[ss->subsys_id] = css;
/* recompute hash and restore entry */
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
}
write_unlock(&css_set_lock);
ret = online_css(css);
if (ret)
goto err_unload;
/* success! */
mutex_unlock(&cgroup_mutex);
return 0;
err_unload:
mutex_unlock(&cgroup_mutex);
/* @ss can't be mounted here as try_module_get() would fail */
cgroup_unload_subsys(ss);
return ret;
}
EXPORT_SYMBOL_GPL(cgroup_load_subsys);
/**
* cgroup_unload_subsys: unload a modular subsystem
* @ss: the subsystem to unload
*
* This function should be called in a modular subsystem's exitcall. When this
* function is invoked, the refcount on the subsystem's module will be 0, so
* the subsystem will not be attached to any hierarchy.
*/
void cgroup_unload_subsys(struct cgroup_subsys *ss)
{
struct cgrp_cset_link *link;
unsigned long key;
BUG_ON(ss->module == NULL);
/*
* we shouldn't be called if the subsystem is in use, and the use of
* try_module_get() in rebind_subsystems() should ensure that it
* doesn't start being used while we're killing it off.
*/
BUG_ON(ss->root != &cgroup_dummy_root);
mutex_lock(&cgroup_mutex);
offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id));
idr_destroy(&ss->idr);
cgroup_subsys[ss->subsys_id] = NULL;
/* remove subsystem from the dummy root's list of subsystems */
list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummy
* top. as in loading, we need to pay our respects to the hashtable
* gods.
*/
write_lock(&css_set_lock);
list_for_each_entry(link, &cgroup_dummy_top->cset_links, cset_link) {
struct css_set *cset = link->cset;
hash_del(&cset->hlist);
cset->subsys[ss->subsys_id] = NULL;
key = css_set_hash(cset->subsys);
hash_add(css_set_table, &cset->hlist, key);
}
write_unlock(&css_set_lock);
/*
* remove subsystem's css from the cgroup_dummy_top and free it -
* need to free before marking as null because ss->css_free needs
* the cgrp->subsys pointer to find their state. note that this
* also takes care of freeing the css_id.
*/
ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id));
RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
mutex_unlock(&cgroup_mutex);
}
EXPORT_SYMBOL_GPL(cgroup_unload_subsys);
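/*
 * Illustration only: a sketch of how a modular controller is expected to use
 * cgroup_load_subsys()/cgroup_unload_subsys() from its init/exit calls, per
 * the comments above. The subsystem name, handlers and subsys_id below are
 * hypothetical and assume an id has already been reserved for the subsystem.
 */
#if 0	/* example, excluded from the build */
static struct cgroup_subsys_state *
example_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct cgroup_subsys_state *css;

	css = kzalloc(sizeof(*css), GFP_KERNEL);
	return css ?: ERR_PTR(-ENOMEM);
}

static void example_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}

struct cgroup_subsys example_subsys = {
	.name		= "example",
	.css_alloc	= example_css_alloc,
	.css_free	= example_css_free,
	.subsys_id	= example_subsys_id,	/* hypothetical reserved id */
	.module		= THIS_MODULE,
};

static int __init example_cgroup_init(void)
{
	return cgroup_load_subsys(&example_subsys);
}
module_init(example_cgroup_init);

static void __exit example_cgroup_exit(void)
{
	cgroup_unload_subsys(&example_subsys);
}
module_exit(example_cgroup_exit);
#endif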