diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1ca0b1d54e701644f0c714bc01fa999aab304a97..586ad91300b0f3924b41bd417cda9cbb076f1a9d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -332,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from); +static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #define CREATE_TRACE_POINTS #include @@ -3001,792 +3002,475 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -#ifdef CONFIG_SYSFS -/* - * Workqueues with WQ_SYSFS flag set is visible to userland via - * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the - * following attributes. - * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items - * - * Unbound workqueues have the following extra attributes. +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free * - * id RO int : the associated pool ID - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers + * Undo alloc_workqueue_attrs(). */ -struct wq_device { - struct workqueue_struct *wq; - struct device dev; -}; - -static struct workqueue_struct *dev_to_wq(struct device *dev) +void free_workqueue_attrs(struct workqueue_attrs *attrs) { - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - - return wq_dev->wq; + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } } -static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, - char *buf) +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. + * + * Return: The allocated new workqueue_attr on success. %NULL on failure. + */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) { - struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; - return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_copy(attrs->cpumask, cpu_possible_mask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; } -static DEVICE_ATTR_RO(per_cpu); -static ssize_t max_active_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from) { - struct workqueue_struct *wq = dev_to_wq(dev); - - return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); + to->nice = from->nice; + cpumask_copy(to->cpumask, from->cpumask); + /* + * Unlike hash and equality test, this function doesn't ignore + * ->no_numa as it is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears ->no_numa after copying. 
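+	 * For example, two unbound wq attrs that differ only in ->no_numa
+	 * hash and compare equal, so they end up sharing the same unbound
+	 * pool, and that pool's own attrs always have ->no_numa cleared.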
+ */ + to->no_numa = from->no_numa; } -static ssize_t max_active_store(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) +/* hash value of the content of @attr */ +static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { - struct workqueue_struct *wq = dev_to_wq(dev); - int val; - - if (sscanf(buf, "%d", &val) != 1 || val <= 0) - return -EINVAL; + u32 hash = 0; - workqueue_set_max_active(wq, val); - return count; + hash = jhash_1word(attrs->nice, hash); + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + return hash; } -static DEVICE_ATTR_RW(max_active); - -static struct attribute *wq_sysfs_attrs[] = { - &dev_attr_per_cpu.attr, - &dev_attr_max_active.attr, - NULL, -}; -ATTRIBUTE_GROUPS(wq_sysfs); -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) +/* content equality test */ +static bool wqattrs_equal(const struct workqueue_attrs *a, + const struct workqueue_attrs *b) { - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - rcu_read_lock_sched(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); - - return written; + if (a->nice != b->nice) + return false; + if (!cpumask_equal(a->cpumask, b->cpumask)) + return false; + return true; } -static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, - char *buf) +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * + * Return: 0 on success, -errno on failure. Even on failure, all fields + * inside @pool proper are initialized and put_unbound_pool() can be called + * on @pool safely to release it. 
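+ *
+ * Typical usage mirrors get_unbound_pool() (shown purely as an
+ * illustration):
+ *
+ *	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ *	if (!pool || init_worker_pool(pool) < 0)
+ *		goto fail;
+ *
+ * where the fail path ends up calling put_unbound_pool(pool).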
+ */ +static int init_worker_pool(struct worker_pool *pool) { - struct workqueue_struct *wq = dev_to_wq(dev); - int written; + spin_lock_init(&pool->lock); + pool->id = -1; + pool->cpu = -1; + pool->node = NUMA_NO_NODE; + pool->flags |= POOL_DISASSOCIATED; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); - mutex_unlock(&wq->mutex); + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; - return written; -} + setup_timer(&pool->mayday_timer, pool_mayday_timeout, + (unsigned long)pool); -/* prepare workqueue_attrs for sysfs store operations */ -static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) -{ - struct workqueue_attrs *attrs; + mutex_init(&pool->manager_arb); + mutex_init(&pool->attach_mutex); + INIT_LIST_HEAD(&pool->workers); - attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!attrs) - return NULL; + ida_init(&pool->worker_ida); + INIT_HLIST_NODE(&pool->hash_node); + pool->refcnt = 1; - mutex_lock(&wq->mutex); - copy_workqueue_attrs(attrs, wq->unbound_attrs); - mutex_unlock(&wq->mutex); - return attrs; + /* shouldn't fail above this point */ + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; } -static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static void rcu_free_wq(struct rcu_head *rcu) { - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + struct workqueue_struct *wq = + container_of(rcu, struct workqueue_struct, rcu); - if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) - ret = apply_workqueue_attrs(wq, attrs); + if (!(wq->flags & WQ_UNBOUND)) + free_percpu(wq->cpu_pwqs); else - ret = -EINVAL; + free_workqueue_attrs(wq->unbound_attrs); - free_workqueue_attrs(attrs); - return ret ?: count; + kfree(wq->rescuer); + kfree(wq); } -static ssize_t wq_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void rcu_free_pool(struct rcu_head *rcu) { - struct workqueue_struct *wq = dev_to_wq(dev); - int written; + struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq->unbound_attrs->cpumask)); - mutex_unlock(&wq->mutex); - return written; + ida_destroy(&pool->worker_ida); + free_workqueue_attrs(pool->attrs); + kfree(pool); } -static ssize_t wq_cpumask_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +/** + * put_unbound_pool - put a worker_pool + * @pool: worker_pool to put + * + * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * safe manner. get_unbound_pool() calls this function on its failure path + * and this function should be able to release pools which went through, + * successfully or not, init_worker_pool(). + * + * Should be called with wq_pool_mutex held. 
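+ *
+ * For example (illustration only), pwq_unbound_release_workfn() drops
+ * its pool reference as:
+ *
+ *	mutex_lock(&wq_pool_mutex);
+ *	put_unbound_pool(pool);
+ *	mutex_unlock(&wq_pool_mutex);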
+ */ +static void put_unbound_pool(struct worker_pool *pool) { - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; + DECLARE_COMPLETION_ONSTACK(detach_completion); + struct worker *worker; - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + lockdep_assert_held(&wq_pool_mutex); - ret = cpumask_parse(buf, attrs->cpumask); - if (!ret) - ret = apply_workqueue_attrs(wq, attrs); + if (--pool->refcnt) + return; - free_workqueue_attrs(attrs); - return ret ?: count; -} + /* sanity checks */ + if (WARN_ON(!(pool->cpu < 0)) || + WARN_ON(!list_empty(&pool->worklist))) + return; -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); - mutex_unlock(&wq->mutex); - - return written; -} - -static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int v, ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + /* release id and unhash */ + if (pool->id >= 0) + idr_remove(&worker_pool_idr, pool->id); + hash_del(&pool->hash_node); - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; - ret = apply_workqueue_attrs(wq, attrs); - } + /* + * Become the manager and destroy all workers. Grabbing + * manager_arb prevents @pool's workers from blocking on + * attach_mutex. + */ + mutex_lock(&pool->manager_arb); - free_workqueue_attrs(attrs); - return ret ?: count; -} + spin_lock_irq(&pool->lock); + while ((worker = first_idle_worker(pool))) + destroy_worker(worker); + WARN_ON(pool->nr_workers || pool->nr_idle); + spin_unlock_irq(&pool->lock); -static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), - __ATTR(nice, 0644, wq_nice_show, wq_nice_store), - __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), - __ATTR_NULL, -}; + mutex_lock(&pool->attach_mutex); + if (!list_empty(&pool->workers)) + pool->detach_completion = &detach_completion; + mutex_unlock(&pool->attach_mutex); -static struct bus_type wq_subsys = { - .name = "workqueue", - .dev_groups = wq_sysfs_groups, -}; + if (pool->detach_completion) + wait_for_completion(pool->detach_completion); -static int __init wq_sysfs_init(void) -{ - return subsys_virtual_register(&wq_subsys, NULL); -} -core_initcall(wq_sysfs_init); + mutex_unlock(&pool->manager_arb); -static void wq_device_release(struct device *dev) -{ - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + /* shut down the timers */ + del_timer_sync(&pool->idle_timer); + del_timer_sync(&pool->mayday_timer); - kfree(wq_dev); + /* sched-RCU protected to allow dereferences from get_work_pool() */ + call_rcu_sched(&pool->rcu, rcu_free_pool); } /** - * workqueue_sysfs_register - make a workqueue visible in sysfs - * @wq: the workqueue to register + * get_unbound_pool - get a worker_pool with the specified attributes + * @attrs: the attributes of the worker_pool to get * - * Expose @wq in sysfs under /sys/bus/workqueue/devices. - * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set - * which is the preferred method. + * Obtain a worker_pool which has the same attributes as @attrs, bump the + * reference count and return it. 
If there already is a matching + * worker_pool, it will be used; otherwise, this function attempts to + * create a new one. * - * Workqueue user should use this function directly iff it wants to apply - * workqueue_attrs before making the workqueue visible in sysfs; otherwise, - * apply_workqueue_attrs() may race against userland updating the - * attributes. + * Should be called with wq_pool_mutex held. * - * Return: 0 on success, -errno on failure. + * Return: On success, a worker_pool with the same attributes as @attrs. + * On failure, %NULL. */ -int workqueue_sysfs_register(struct workqueue_struct *wq) +static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { - struct wq_device *wq_dev; - int ret; + u32 hash = wqattrs_hash(attrs); + struct worker_pool *pool; + int node; - /* - * Adjusting max_active or creating new pwqs by applyting - * attributes breaks ordering guarantee. Disallow exposing ordered - * workqueues. - */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) - return -EINVAL; + lockdep_assert_held(&wq_pool_mutex); - wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); - if (!wq_dev) - return -ENOMEM; + /* do we already have a matching pool? */ + hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { + if (wqattrs_equal(pool->attrs, attrs)) { + pool->refcnt++; + return pool; + } + } - wq_dev->wq = wq; - wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; - wq_dev->dev.release = wq_device_release; + /* nope, create a new one */ + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool || init_worker_pool(pool) < 0) + goto fail; + + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ + copy_workqueue_attrs(pool->attrs, attrs); /* - * unbound_attrs are created separately. Suppress uevent until - * everything is ready. + * no_numa isn't a worker_pool attribute, always clear it. See + * 'struct workqueue_attrs' comments for detail. */ - dev_set_uevent_suppress(&wq_dev->dev, true); - - ret = device_register(&wq_dev->dev); - if (ret) { - kfree(wq_dev); - wq->wq_dev = NULL; - return ret; - } - - if (wq->flags & WQ_UNBOUND) { - struct device_attribute *attr; + pool->attrs->no_numa = false; - for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { - ret = device_create_file(&wq_dev->dev, attr); - if (ret) { - device_unregister(&wq_dev->dev); - wq->wq_dev = NULL; - return ret; + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(pool->attrs->cpumask, + wq_numa_possible_cpumask[node])) { + pool->node = node; + break; } } } - dev_set_uevent_suppress(&wq_dev->dev, false); - kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); - return 0; -} + if (worker_pool_assign_id(pool) < 0) + goto fail; -/** - * workqueue_sysfs_unregister - undo workqueue_sysfs_register() - * @wq: the workqueue to unregister - * - * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 
- */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) -{ - struct wq_device *wq_dev = wq->wq_dev; + /* create and start the initial worker */ + if (!create_worker(pool)) + goto fail; - if (!wq->wq_dev) - return; + /* install */ + hash_add(unbound_pool_hash, &pool->hash_node, hash); - wq->wq_dev = NULL; - device_unregister(&wq_dev->dev); + return pool; +fail: + if (pool) + put_unbound_pool(pool); + return NULL; } -#else /* CONFIG_SYSFS */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } -#endif /* CONFIG_SYSFS */ -/** - * free_workqueue_attrs - free a workqueue_attrs - * @attrs: workqueue_attrs to free - * - * Undo alloc_workqueue_attrs(). - */ -void free_workqueue_attrs(struct workqueue_attrs *attrs) +static void rcu_free_pwq(struct rcu_head *rcu) { - if (attrs) { - free_cpumask_var(attrs->cpumask); - kfree(attrs); - } + kmem_cache_free(pwq_cache, + container_of(rcu, struct pool_workqueue, rcu)); } -/** - * alloc_workqueue_attrs - allocate a workqueue_attrs - * @gfp_mask: allocation mask to use - * - * Allocate a new workqueue_attrs, initialize with default settings and - * return it. - * - * Return: The allocated new workqueue_attr on success. %NULL on failure. +/* + * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt + * and needs to be destroyed. */ -struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +static void pwq_unbound_release_workfn(struct work_struct *work) { - struct workqueue_attrs *attrs; + struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, + unbound_release_work); + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + bool is_last; - attrs = kzalloc(sizeof(*attrs), gfp_mask); - if (!attrs) - goto fail; - if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) - goto fail; + if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) + return; - cpumask_copy(attrs->cpumask, cpu_possible_mask); - return attrs; -fail: - free_workqueue_attrs(attrs); - return NULL; -} + mutex_lock(&wq->mutex); + list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); + mutex_unlock(&wq->mutex); + + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + + call_rcu_sched(&pwq->rcu, rcu_free_pwq); -static void copy_workqueue_attrs(struct workqueue_attrs *to, - const struct workqueue_attrs *from) -{ - to->nice = from->nice; - cpumask_copy(to->cpumask, from->cpumask); /* - * Unlike hash and equality test, this function doesn't ignore - * ->no_numa as it is used for both pool and wq attrs. Instead, - * get_unbound_pool() explicitly clears ->no_numa after copying. + * If we're the last pwq going away, @wq is already dead and no one + * is gonna access it anymore. Schedule RCU free. */ - to->no_numa = from->no_numa; -} - -/* hash value of the content of @attr */ -static u32 wqattrs_hash(const struct workqueue_attrs *attrs) -{ - u32 hash = 0; - - hash = jhash_1word(attrs->nice, hash); - hash = jhash(cpumask_bits(attrs->cpumask), - BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); - return hash; -} - -/* content equality test */ -static bool wqattrs_equal(const struct workqueue_attrs *a, - const struct workqueue_attrs *b) -{ - if (a->nice != b->nice) - return false; - if (!cpumask_equal(a->cpumask, b->cpumask)) - return false; - return true; + if (is_last) + call_rcu_sched(&wq->rcu, rcu_free_wq); } /** - * init_worker_pool - initialize a newly zalloc'd worker_pool - * @pool: worker_pool to initialize - * - * Initiailize a newly zalloc'd @pool. 
It also allocates @pool->attrs. + * pwq_adjust_max_active - update a pwq's max_active to the current setting + * @pwq: target pool_workqueue * - * Return: 0 on success, -errno on failure. Even on failure, all fields - * inside @pool proper are initialized and put_unbound_pool() can be called - * on @pool safely to release it. + * If @pwq isn't freezing, set @pwq->max_active to the associated + * workqueue's saved_max_active and activate delayed work items + * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ -static int init_worker_pool(struct worker_pool *pool) +static void pwq_adjust_max_active(struct pool_workqueue *pwq) { - spin_lock_init(&pool->lock); - pool->id = -1; - pool->cpu = -1; - pool->node = NUMA_NO_NODE; - pool->flags |= POOL_DISASSOCIATED; - INIT_LIST_HEAD(&pool->worklist); - INIT_LIST_HEAD(&pool->idle_list); - hash_init(pool->busy_hash); - - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; + struct workqueue_struct *wq = pwq->wq; + bool freezable = wq->flags & WQ_FREEZABLE; - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); + /* for @wq->saved_max_active */ + lockdep_assert_held(&wq->mutex); - mutex_init(&pool->manager_arb); - mutex_init(&pool->attach_mutex); - INIT_LIST_HEAD(&pool->workers); + /* fast exit for non-freezable wqs */ + if (!freezable && pwq->max_active == wq->saved_max_active) + return; - ida_init(&pool->worker_ida); - INIT_HLIST_NODE(&pool->hash_node); - pool->refcnt = 1; + spin_lock_irq(&pwq->pool->lock); - /* shouldn't fail above this point */ - pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!pool->attrs) - return -ENOMEM; - return 0; -} + /* + * During [un]freezing, the caller is responsible for ensuring that + * this function is called at least once after @workqueue_freezing + * is updated and visible. + */ + if (!freezable || !workqueue_freezing) { + pwq->max_active = wq->saved_max_active; -static void rcu_free_wq(struct rcu_head *rcu) -{ - struct workqueue_struct *wq = - container_of(rcu, struct workqueue_struct, rcu); + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); - else - free_workqueue_attrs(wq->unbound_attrs); + /* + * Need to kick a worker after thawed or an unbound wq's + * max_active is bumped. It's a slow path. Do it always. + */ + wake_up_worker(pwq->pool); + } else { + pwq->max_active = 0; + } - kfree(wq->rescuer); - kfree(wq); + spin_unlock_irq(&pwq->pool->lock); } -static void rcu_free_pool(struct rcu_head *rcu) +/* initialize newly alloced @pwq which is associated with @wq and @pool */ +static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, + struct worker_pool *pool) { - struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - ida_destroy(&pool->worker_ida); - free_workqueue_attrs(pool->attrs); - kfree(pool); + memset(pwq, 0, sizeof(*pwq)); + + pwq->pool = pool; + pwq->wq = wq; + pwq->flush_color = -1; + pwq->refcnt = 1; + INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->pwqs_node); + INIT_LIST_HEAD(&pwq->mayday_node); + INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); } -/** - * put_unbound_pool - put a worker_pool - * @pool: worker_pool to put - * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU - * safe manner. 
get_unbound_pool() calls this function on its failure path - * and this function should be able to release pools which went through, - * successfully or not, init_worker_pool(). - * - * Should be called with wq_pool_mutex held. - */ -static void put_unbound_pool(struct worker_pool *pool) +/* sync @pwq with the current state of its associated wq and link it */ +static void link_pwq(struct pool_workqueue *pwq) { - DECLARE_COMPLETION_ONSTACK(detach_completion); - struct worker *worker; + struct workqueue_struct *wq = pwq->wq; - lockdep_assert_held(&wq_pool_mutex); + lockdep_assert_held(&wq->mutex); - if (--pool->refcnt) + /* may be called multiple times, ignore if already linked */ + if (!list_empty(&pwq->pwqs_node)) return; - /* sanity checks */ - if (WARN_ON(!(pool->cpu < 0)) || - WARN_ON(!list_empty(&pool->worklist))) - return; + /* set the matching work_color */ + pwq->work_color = wq->work_color; - /* release id and unhash */ - if (pool->id >= 0) - idr_remove(&worker_pool_idr, pool->id); - hash_del(&pool->hash_node); + /* sync max_active to the current setting */ + pwq_adjust_max_active(pwq); - /* - * Become the manager and destroy all workers. Grabbing - * manager_arb prevents @pool's workers from blocking on - * attach_mutex. - */ - mutex_lock(&pool->manager_arb); + /* link in @pwq */ + list_add_rcu(&pwq->pwqs_node, &wq->pwqs); +} - spin_lock_irq(&pool->lock); - while ((worker = first_idle_worker(pool))) - destroy_worker(worker); - WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); +/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ +static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct worker_pool *pool; + struct pool_workqueue *pwq; - mutex_lock(&pool->attach_mutex); - if (!list_empty(&pool->workers)) - pool->detach_completion = &detach_completion; - mutex_unlock(&pool->attach_mutex); + lockdep_assert_held(&wq_pool_mutex); - if (pool->detach_completion) - wait_for_completion(pool->detach_completion); + pool = get_unbound_pool(attrs); + if (!pool) + return NULL; - mutex_unlock(&pool->manager_arb); + pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); + if (!pwq) { + put_unbound_pool(pool); + return NULL; + } - /* shut down the timers */ - del_timer_sync(&pool->idle_timer); - del_timer_sync(&pool->mayday_timer); + init_pwq(pwq, wq, pool); + return pwq; +} - /* sched-RCU protected to allow dereferences from get_work_pool() */ - call_rcu_sched(&pool->rcu, rcu_free_pool); +/* undo alloc_unbound_pwq(), used only in the error path */ +static void free_unbound_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&wq_pool_mutex); + + if (pwq) { + put_unbound_pool(pwq->pool); + kmem_cache_free(pwq_cache, pwq); + } } /** - * get_unbound_pool - get a worker_pool with the specified attributes - * @attrs: the attributes of the worker_pool to get + * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of interest + * @node: the target NUMA node + * @cpu_going_down: if >= 0, the CPU to consider as offline + * @cpumask: outarg, the resulting cpumask * - * Obtain a worker_pool which has the same attributes as @attrs, bump the - * reference count and return it. If there already is a matching - * worker_pool, it will be used; otherwise, this function attempts to - * create a new one. + * Calculate the cpumask a workqueue with @attrs should use on @node. 
If + * @cpu_going_down is >= 0, that cpu is considered offline during + * calculation. The result is stored in @cpumask. * - * Should be called with wq_pool_mutex held. + * If NUMA affinity is not enabled, @attrs->cpumask is always used. If + * enabled and @node has online CPUs requested by @attrs, the returned + * cpumask is the intersection of the possible CPUs of @node and + * @attrs->cpumask. * - * Return: On success, a worker_pool with the same attributes as @attrs. - * On failure, %NULL. + * The caller is responsible for ensuring that the cpumask of @node stays + * stable. + * + * Return: %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. */ -static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) +static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, + int cpu_going_down, cpumask_t *cpumask) { - u32 hash = wqattrs_hash(attrs); - struct worker_pool *pool; - int node; + if (!wq_numa_enabled || attrs->no_numa) + goto use_dfl; - lockdep_assert_held(&wq_pool_mutex); + /* does @node have any online CPUs @attrs wants? */ + cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + if (cpu_going_down >= 0) + cpumask_clear_cpu(cpu_going_down, cpumask); - /* do we already have a matching pool? */ - hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { - if (wqattrs_equal(pool->attrs, attrs)) { - pool->refcnt++; - return pool; - } - } - - /* nope, create a new one */ - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool || init_worker_pool(pool) < 0) - goto fail; - - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ - copy_workqueue_attrs(pool->attrs, attrs); - - /* - * no_numa isn't a worker_pool attribute, always clear it. See - * 'struct workqueue_attrs' comments for detail. - */ - pool->attrs->no_numa = false; - - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(pool->attrs->cpumask, - wq_numa_possible_cpumask[node])) { - pool->node = node; - break; - } - } - } - - if (worker_pool_assign_id(pool) < 0) - goto fail; - - /* create and start the initial worker */ - if (!create_worker(pool)) - goto fail; - - /* install */ - hash_add(unbound_pool_hash, &pool->hash_node, hash); - - return pool; -fail: - if (pool) - put_unbound_pool(pool); - return NULL; -} - -static void rcu_free_pwq(struct rcu_head *rcu) -{ - kmem_cache_free(pwq_cache, - container_of(rcu, struct pool_workqueue, rcu)); -} - -/* - * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt - * and needs to be destroyed. - */ -static void pwq_unbound_release_workfn(struct work_struct *work) -{ - struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, - unbound_release_work); - struct workqueue_struct *wq = pwq->wq; - struct worker_pool *pool = pwq->pool; - bool is_last; - - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; - - mutex_lock(&wq->mutex); - list_del_rcu(&pwq->pwqs_node); - is_last = list_empty(&wq->pwqs); - mutex_unlock(&wq->mutex); - - mutex_lock(&wq_pool_mutex); - put_unbound_pool(pool); - mutex_unlock(&wq_pool_mutex); - - call_rcu_sched(&pwq->rcu, rcu_free_pwq); - - /* - * If we're the last pwq going away, @wq is already dead and no one - * is gonna access it anymore. Schedule RCU free. 
- */ - if (is_last) - call_rcu_sched(&wq->rcu, rcu_free_wq); -} - -/** - * pwq_adjust_max_active - update a pwq's max_active to the current setting - * @pwq: target pool_workqueue - * - * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate delayed work items - * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. - */ -static void pwq_adjust_max_active(struct pool_workqueue *pwq) -{ - struct workqueue_struct *wq = pwq->wq; - bool freezable = wq->flags & WQ_FREEZABLE; - - /* for @wq->saved_max_active */ - lockdep_assert_held(&wq->mutex); - - /* fast exit for non-freezable wqs */ - if (!freezable && pwq->max_active == wq->saved_max_active) - return; - - spin_lock_irq(&pwq->pool->lock); - - /* - * During [un]freezing, the caller is responsible for ensuring that - * this function is called at least once after @workqueue_freezing - * is updated and visible. - */ - if (!freezable || !workqueue_freezing) { - pwq->max_active = wq->saved_max_active; - - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); - - /* - * Need to kick a worker after thawed or an unbound wq's - * max_active is bumped. It's a slow path. Do it always. - */ - wake_up_worker(pwq->pool); - } else { - pwq->max_active = 0; - } - - spin_unlock_irq(&pwq->pool->lock); -} - -/* initialize newly alloced @pwq which is associated with @wq and @pool */ -static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, - struct worker_pool *pool) -{ - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - - memset(pwq, 0, sizeof(*pwq)); - - pwq->pool = pool; - pwq->wq = wq; - pwq->flush_color = -1; - pwq->refcnt = 1; - INIT_LIST_HEAD(&pwq->delayed_works); - INIT_LIST_HEAD(&pwq->pwqs_node); - INIT_LIST_HEAD(&pwq->mayday_node); - INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); -} - -/* sync @pwq with the current state of its associated wq and link it */ -static void link_pwq(struct pool_workqueue *pwq) -{ - struct workqueue_struct *wq = pwq->wq; - - lockdep_assert_held(&wq->mutex); - - /* may be called multiple times, ignore if already linked */ - if (!list_empty(&pwq->pwqs_node)) - return; - - /* set the matching work_color */ - pwq->work_color = wq->work_color; - - /* sync max_active to the current setting */ - pwq_adjust_max_active(pwq); - - /* link in @pwq */ - list_add_rcu(&pwq->pwqs_node, &wq->pwqs); -} - -/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ -static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, - const struct workqueue_attrs *attrs) -{ - struct worker_pool *pool; - struct pool_workqueue *pwq; - - lockdep_assert_held(&wq_pool_mutex); - - pool = get_unbound_pool(attrs); - if (!pool) - return NULL; - - pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); - if (!pwq) { - put_unbound_pool(pool); - return NULL; - } - - init_pwq(pwq, wq, pool); - return pwq; -} - -/* undo alloc_unbound_pwq(), used only in the error path */ -static void free_unbound_pwq(struct pool_workqueue *pwq) -{ - lockdep_assert_held(&wq_pool_mutex); - - if (pwq) { - put_unbound_pool(pwq->pool); - kmem_cache_free(pwq_cache, pwq); - } -} - -/** - * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node - * @attrs: the wq_attrs of interest - * @node: the target NUMA node - * @cpu_going_down: if >= 0, the CPU to consider as offline - * @cpumask: outarg, the resulting cpumask - * - * Calculate the cpumask a workqueue 
with @attrs should use on @node. If - * @cpu_going_down is >= 0, that cpu is considered offline during - * calculation. The result is stored in @cpumask. - * - * If NUMA affinity is not enabled, @attrs->cpumask is always used. If - * enabled and @node has online CPUs requested by @attrs, the returned - * cpumask is the intersection of the possible CPUs of @node and - * @attrs->cpumask. - * - * The caller is responsible for ensuring that the cpumask of @node stays - * stable. - * - * Return: %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. - */ -static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, - int cpu_going_down, cpumask_t *cpumask) -{ - if (!wq_numa_enabled || attrs->no_numa) - goto use_dfl; - - /* does @node have any online CPUs @attrs wants? */ - cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); - if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, cpumask); - - if (cpumask_empty(cpumask)) - goto use_dfl; + if (cpumask_empty(cpumask)) + goto use_dfl; /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); @@ -4817,202 +4501,519 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); - } + mutex_unlock(&pool->attach_mutex); + } + + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); + + mutex_unlock(&wq_pool_mutex); + break; + } + return NOTIFY_OK; +} + +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; + struct workqueue_struct *wq; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* unbinding per-cpu workers should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); + queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ + flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); + break; + } + return NOTIFY_OK; +} + +#ifdef CONFIG_SMP + +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void work_for_cpu_fn(struct work_struct *work) +{ + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * It is up to the caller to ensure that the cpu doesn't go offline. + * The caller must not hold any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. 
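+ *
+ * Example (illustrative only; my_init_fn and my_arg are hypothetical):
+ *
+ *	get_online_cpus();
+ *	ret = work_on_cpu(cpu, my_init_fn, my_arg);
+ *	put_online_cpus();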
+ */ +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; + + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); + destroy_work_on_stack(&wfc.work); + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all freezable + * workqueues will queue new works to their delayed_works list instead of + * pool->worklist. + * + * CONTEXT: + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + */ +void freeze_workqueues_begin(void) +{ + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + WARN_ON_ONCE(workqueue_freezing); + workqueue_freezing = true; + + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); + } + + mutex_unlock(&wq_pool_mutex); +} + +/** + * freeze_workqueues_busy - are freezable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases wq_pool_mutex. + * + * Return: + * %true if some freezable workqueues are still busy. %false if freezing + * is complete. + */ +bool freeze_workqueues_busy(void) +{ + bool busy = false; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + WARN_ON_ONCE(!workqueue_freezing); + + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. + */ + rcu_read_lock_sched(); + for_each_pwq(pwq, wq) { + WARN_ON_ONCE(pwq->nr_active < 0); + if (pwq->nr_active) { + busy = true; + rcu_read_unlock_sched(); + goto out_unlock; + } + } + rcu_read_unlock_sched(); + } +out_unlock: + mutex_unlock(&wq_pool_mutex); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective pool worklists. + * + * CONTEXT: + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + */ +void thaw_workqueues(void) +{ + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + if (!workqueue_freezing) + goto out_unlock; + + workqueue_freezing = false; + + /* restore max_active and repopulate worklist */ + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); + } + +out_unlock: + mutex_unlock(&wq_pool_mutex); +} +#endif /* CONFIG_FREEZER */ + +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. 
+ * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} +static DEVICE_ATTR_RO(per_cpu); + +static ssize_t max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t max_active_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} +static DEVICE_ATTR_RW(max_active); + +static struct attribute *wq_sysfs_attrs[] = { + &dev_attr_per_cpu.attr, + &dev_attr_max_active.attr, + NULL, +}; +ATTRIBUTE_GROUPS(wq_sysfs); + +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + const char *delim = ""; + int node, written = 0; + + rcu_read_lock_sched(); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", + cpumask_pr_args(wq->unbound_attrs->cpumask)); + mutex_unlock(&wq->mutex); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct 
workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); - break; - } - return NOTIFY_OK; + return written; } -/* - * Workqueues should be brought down after normal priority CPU notifiers. - * This will be registered as low priority CPU notifier. - */ -static int workqueue_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { - int cpu = (unsigned long)hcpu; - struct work_struct unbind_work; - struct workqueue_struct *wq; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; - /* update NUMA affinity of unbound workqueues */ - mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); - mutex_unlock(&wq_pool_mutex); + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); - break; + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); } - return NOTIFY_OK; + + free_workqueue_attrs(attrs); + return ret ?: count; } -#ifdef CONFIG_SMP +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR_NULL, +}; -struct work_for_cpu { - struct work_struct work; - long (*fn)(void *); - void *arg; - long ret; +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_groups = wq_sysfs_groups, }; -static void work_for_cpu_fn(struct work_struct *work) +static int __init wq_sysfs_init(void) { - struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); - - wfc->ret = wfc->fn(wfc->arg); + return subsys_virtual_register(&wq_subsys, NULL); } +core_initcall(wq_sysfs_init); -/** - * work_on_cpu - run a function in user context on a particular cpu - * @cpu: the cpu to run on - * @fn: the function to run - * @arg: the function arg - * - * It is up to the caller to ensure that the cpu doesn't go offline. - * The caller must not hold any locks which would prevent @fn from completing. - * - * Return: The value @fn returns. 
- */ -long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +static void wq_device_release(struct device *dev) { - struct work_for_cpu wfc = { .fn = fn, .arg = arg }; + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); - schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); - destroy_work_on_stack(&wfc.work); - return wfc.ret; + kfree(wq_dev); } -EXPORT_SYMBOL_GPL(work_on_cpu); -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_FREEZER /** - * freeze_workqueues_begin - begin freezing workqueues + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register * - * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their delayed_works list instead of - * pool->worklist. + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. * - * CONTEXT: - * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Return: 0 on success, -errno on failure. */ -void freeze_workqueues_begin(void) +int workqueue_sysfs_register(struct workqueue_struct *wq) { - struct workqueue_struct *wq; - struct pool_workqueue *pwq; - - mutex_lock(&wq_pool_mutex); + struct wq_device *wq_dev; + int ret; - WARN_ON_ONCE(workqueue_freezing); - workqueue_freezing = true; + /* + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; - list_for_each_entry(wq, &workqueues, list) { - mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); - mutex_unlock(&wq->mutex); - } + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; - mutex_unlock(&wq_pool_mutex); -} + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; -/** - * freeze_workqueues_busy - are freezable workqueues still busy? - * - * Check whether freezing is complete. This function must be called - * between freeze_workqueues_begin() and thaw_workqueues(). - * - * CONTEXT: - * Grabs and releases wq_pool_mutex. - * - * Return: - * %true if some freezable workqueues are still busy. %false if freezing - * is complete. - */ -bool freeze_workqueues_busy(void) -{ - bool busy = false; - struct workqueue_struct *wq; - struct pool_workqueue *pwq; + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); - mutex_lock(&wq_pool_mutex); + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } - WARN_ON_ONCE(!workqueue_freezing); + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; - list_for_each_entry(wq, &workqueues, list) { - if (!(wq->flags & WQ_FREEZABLE)) - continue; - /* - * nr_active is monotonically decreasing. It's safe - * to peek without lock. 
- */ - rcu_read_lock_sched(); - for_each_pwq(pwq, wq) { - WARN_ON_ONCE(pwq->nr_active < 0); - if (pwq->nr_active) { - busy = true; - rcu_read_unlock_sched(); - goto out_unlock; + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; } } - rcu_read_unlock_sched(); } -out_unlock: - mutex_unlock(&wq_pool_mutex); - return busy; + + dev_set_uevent_suppress(&wq_dev->dev, false); + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; } /** - * thaw_workqueues - thaw workqueues - * - * Thaw workqueues. Normal queueing is restored and all collected - * frozen works are transferred to their respective pool worklists. + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister * - * CONTEXT: - * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. */ -void thaw_workqueues(void) +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { - struct workqueue_struct *wq; - struct pool_workqueue *pwq; - - mutex_lock(&wq_pool_mutex); - - if (!workqueue_freezing) - goto out_unlock; - - workqueue_freezing = false; + struct wq_device *wq_dev = wq->wq_dev; - /* restore max_active and repopulate worklist */ - list_for_each_entry(wq, &workqueues, list) { - mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); - mutex_unlock(&wq->mutex); - } + if (!wq->wq_dev) + return; -out_unlock: - mutex_unlock(&wq_pool_mutex); + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); } -#endif /* CONFIG_FREEZER */ +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ static void __init wq_numa_init(void) {