Merge tag 'wq-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq (7ab044a4) · Commits · jan.koester / Linux

Documentation/admin-guide/kernel-parameters.txt

+12 −0

Original line number	Diff line number	Diff line
		@@ -6972,6 +6972,18 @@
		it can be updated at runtime by writing to the
		corresponding sysfs file.

		workqueue.cpu_intensive_thresh_us=
		Per-cpu work items which run for longer than this
		threshold are automatically considered CPU intensive
		and excluded from concurrency management to prevent
		them from noticeably delaying other per-cpu work
		items. Default is 10000 (10ms).

		If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel
		will report the work functions which violate this
		threshold repeatedly. They are likely good
		candidates for using WQ_UNBOUND workqueues instead.

		workqueue.disable_numa
		By default, all work items queued to unbound
		workqueues are affine to the NUMA nodes they're

Documentation/core-api/workqueue.rst

+32 −0

Original line number	Diff line number	Diff line
		@@ -348,6 +348,37 @@ Guidelines
		level of locality in wq operations and work item execution.


		Monitoring
		==========

		Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::

		$ tools/workqueue/wq_monitor.py events
		total infl CPUtime CPUhog CMwake mayday rescued
		events 18545 0 6.1 0 5 - -
		events_highpri 8 0 0.0 0 0 - -
		events_long 3 0 0.0 0 0 - -
		events_unbound 38306 0 0.1 - - - -
		events_freezable 0 0 0.0 0 0 - -
		events_power_efficient 29598 0 0.2 0 0 - -
		events_freezable_power_ 10 0 0.0 0 0 - -
		sock_diag_events 0 0 0.0 0 0 - -

		total infl CPUtime CPUhog CMwake mayday rescued
		events 18548 0 6.1 0 5 - -
		events_highpri 8 0 0.0 0 0 - -
		events_long 3 0 0.0 0 0 - -
		events_unbound 38322 0 0.1 - - - -
		events_freezable 0 0 0.0 0 0 - -
		events_power_efficient 29603 0 0.2 0 0 - -
		events_freezable_power_ 10 0 0.0 0 0 - -
		sock_diag_events 0 0 0.0 0 0 - -

		...

		See the command's help message for more info.


		Debugging
		=========

		@@ -387,6 +418,7 @@ the stack trace of the offending worker thread. ::
		The work item's function should be trivially visible in the stack
		trace.


		Non-reentrance Conditions
		=========================

kernel/sched/core.c

+3 −0

Original line number	Diff line number	Diff line
		@@ -5670,6 +5670,9 @@ void scheduler_tick(void)

		perf_event_task_tick();

		if (curr->flags & PF_WQ_WORKER)
		wq_worker_tick(curr);

		#ifdef CONFIG_SMP
		rq->idle_balance = idle_cpu(cpu);
		trigger_load_balance(rq);

kernel/workqueue.c

+258 −64

Original line number	Diff line number	Diff line
		@@ -126,6 +126,12 @@ enum {
		* cpu or grabbing pool->lock is enough for read access. If
		* POOL_DISASSOCIATED is set, it's identical to L.
		*
		* K: Only modified by worker while holding pool->lock. Can be safely read by
		* self, while holding pool->lock or from IRQ context if %current is the
		* kworker.
		*
		* S: Only modified by worker self.
		*
		* A: wq_pool_attach_mutex protected.
		*
		* PL: wq_pool_mutex protected.
		@@ -199,6 +205,22 @@ struct worker_pool {
		struct rcu_head rcu;
		};

		/*
		* Per-pool_workqueue statistics. These can be monitored using
		* tools/workqueue/wq_monitor.py.
		*/
		enum pool_workqueue_stats {
		PWQ_STAT_STARTED, /* work items started execution */
		PWQ_STAT_COMPLETED, /* work items completed execution */
		PWQ_STAT_CPU_TIME, /* total CPU time consumed */
		PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
		PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
		PWQ_STAT_MAYDAY, /* maydays to rescuer */
		PWQ_STAT_RESCUED, /* linked work items executed by rescuer */

		PWQ_NR_STATS,
		};

		/*
		* The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS
		* of work_struct->data are used for flags and the remaining high bits
		@@ -236,6 +258,8 @@ struct pool_workqueue {
		struct list_head pwqs_node; /* WR: node on wq->pwqs */
		struct list_head mayday_node; /* MD: node on wq->maydays */

		u64 stats[PWQ_NR_STATS];

		/*
		* Release of unbound pwq is punted to system_wq. See put_pwq()
		* and pwq_unbound_release_workfn() for details. pool_workqueue
		@@ -310,6 +334,14 @@ static struct kmem_cache *pwq_cache;
		static cpumask_var_t *wq_numa_possible_cpumask;
		/* possible CPUs of each node */

		/*
		* Per-cpu work items which run for longer than the following threshold are
		* automatically considered CPU intensive and excluded from concurrency
		* management to prevent them from noticeably delaying other per-cpu work items.
		*/
		static unsigned long wq_cpu_intensive_thresh_us = 10000;
		module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);

		static bool wq_disable_numa;
		module_param_named(disable_numa, wq_disable_numa, bool, 0444);

		@@ -866,6 +898,152 @@ static void wake_up_worker(struct worker_pool *pool)
		wake_up_process(worker->task);
		}

		/**
		* worker_set_flags - set worker flags and adjust nr_running accordingly
		* @worker: self
		* @flags: flags to set
		*
		* Set @flags in @worker->flags and adjust nr_running accordingly.
		*
		* CONTEXT:
		* raw_spin_lock_irq(pool->lock)
		*/
		static inline void worker_set_flags(struct worker *worker, unsigned int flags)
		{
		struct worker_pool *pool = worker->pool;

		WARN_ON_ONCE(worker->task != current);

		/* If transitioning into NOT_RUNNING, adjust nr_running. */
		if ((flags & WORKER_NOT_RUNNING) &&
		!(worker->flags & WORKER_NOT_RUNNING)) {
		pool->nr_running--;
		}

		worker->flags \|= flags;
		}

		/**
		* worker_clr_flags - clear worker flags and adjust nr_running accordingly
		* @worker: self
		* @flags: flags to clear
		*
		* Clear @flags in @worker->flags and adjust nr_running accordingly.
		*
		* CONTEXT:
		* raw_spin_lock_irq(pool->lock)
		*/
		static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
		{
		struct worker_pool *pool = worker->pool;
		unsigned int oflags = worker->flags;

		WARN_ON_ONCE(worker->task != current);

		worker->flags &= ~flags;

		/*
		* If transitioning out of NOT_RUNNING, increment nr_running. Note
		* that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
		* of multiple flags, not a single flag.
		*/
		if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
		pool->nr_running++;
		}

		#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

		/*
		* Concurrency-managed per-cpu work items that hog CPU for longer than
		* wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
		* which prevents them from stalling other concurrency-managed work items. If a
		* work function keeps triggering this mechanism, it's likely that the work item
		* should be using an unbound workqueue instead.
		*
		* wq_cpu_intensive_report() tracks work functions which trigger such conditions
		* and report them so that they can be examined and converted to use unbound
		* workqueues as appropriate. To avoid flooding the console, each violating work
		* function is tracked and reported with exponential backoff.
		*/
		#define WCI_MAX_ENTS 128

		struct wci_ent {
		work_func_t func;
		atomic64_t cnt;
		struct hlist_node hash_node;
		};

		static struct wci_ent wci_ents[WCI_MAX_ENTS];
		static int wci_nr_ents;
		static DEFINE_RAW_SPINLOCK(wci_lock);
		static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

		static struct wci_ent *wci_find_ent(work_func_t func)
		{
		struct wci_ent *ent;

		hash_for_each_possible_rcu(wci_hash, ent, hash_node,
		(unsigned long)func) {
		if (ent->func == func)
		return ent;
		}
		return NULL;
		}

		static void wq_cpu_intensive_report(work_func_t func)
		{
		struct wci_ent *ent;

		restart:
		ent = wci_find_ent(func);
		if (ent) {
		u64 cnt;

		/*
		* Start reporting from the fourth time and back off
		* exponentially.
		*/
		cnt = atomic64_inc_return_relaxed(&ent->cnt);
		if (cnt >= 4 && is_power_of_2(cnt))
		printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
		ent->func, wq_cpu_intensive_thresh_us,
		atomic64_read(&ent->cnt));
		return;
		}

		/*
		* @func is a new violation. Allocate a new entry for it. If wcn_ents[]
		* is exhausted, something went really wrong and we probably made enough
		* noise already.
		*/
		if (wci_nr_ents >= WCI_MAX_ENTS)
		return;

		raw_spin_lock(&wci_lock);

		if (wci_nr_ents >= WCI_MAX_ENTS) {
		raw_spin_unlock(&wci_lock);
		return;
		}

		if (wci_find_ent(func)) {
		raw_spin_unlock(&wci_lock);
		goto restart;
		}

		ent = &wci_ents[wci_nr_ents++];
		ent->func = func;
		atomic64_set(&ent->cnt, 1);
		hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);

		raw_spin_unlock(&wci_lock);
		}

		#else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */
		static void wq_cpu_intensive_report(work_func_t func) {}
		#endif /* CONFIG_WQ_CPU_INTENSIVE_REPORT */

		/**
		* wq_worker_running - a worker is running again
		* @task: task waking up
		@@ -876,7 +1054,7 @@ void wq_worker_running(struct task_struct *task)
		{
		struct worker *worker = kthread_data(task);

		if (!worker->sleeping)
		if (!READ_ONCE(worker->sleeping))
		return;

		/*
		@@ -889,7 +1067,14 @@ void wq_worker_running(struct task_struct *task)
		if (!(worker->flags & WORKER_NOT_RUNNING))
		worker->pool->nr_running++;
		preempt_enable();
		worker->sleeping = 0;

		/*
		* CPU intensive auto-detection cares about how long a work item hogged
		* CPU without sleeping. Reset the starting timestamp on wakeup.
		*/
		worker->current_at = worker->task->se.sum_exec_runtime;

		WRITE_ONCE(worker->sleeping, 0);
		}

		/**
		@@ -915,10 +1100,10 @@ void wq_worker_sleeping(struct task_struct *task)
		pool = worker->pool;

		/* Return if preempted before wq_worker_running() was reached */
		if (worker->sleeping)
		if (READ_ONCE(worker->sleeping))
		return;

		worker->sleeping = 1;
		WRITE_ONCE(worker->sleeping, 1);
		raw_spin_lock_irq(&pool->lock);

		/*
		@@ -932,11 +1117,65 @@ void wq_worker_sleeping(struct task_struct *task)
		}

		pool->nr_running--;
		if (need_more_worker(pool))
		if (need_more_worker(pool)) {
		worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
		wake_up_worker(pool);
		}
		raw_spin_unlock_irq(&pool->lock);
		}

		/**
		* wq_worker_tick - a scheduler tick occurred while a kworker is running
		* @task: task currently running
		*
		* Called from scheduler_tick(). We're in the IRQ context and the current
		* worker's fields which follow the 'K' locking rule can be accessed safely.
		*/
		void wq_worker_tick(struct task_struct *task)
		{
		struct worker *worker = kthread_data(task);
		struct pool_workqueue *pwq = worker->current_pwq;
		struct worker_pool *pool = worker->pool;

		if (!pwq)
		return;

		pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

		if (!wq_cpu_intensive_thresh_us)
		return;

		/*
		* If the current worker is concurrency managed and hogged the CPU for
		* longer than wq_cpu_intensive_thresh_us, it's automatically marked
		* CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
		*
		* Set @worker->sleeping means that @worker is in the process of
		* switching out voluntarily and won't be contributing to
		* @pool->nr_running until it wakes up. As wq_worker_sleeping() also
		* decrements ->nr_running, setting CPU_INTENSIVE here can lead to
		* double decrements. The task is releasing the CPU anyway. Let's skip.
		* We probably want to make this prettier in the future.
		*/
		if ((worker->flags & WORKER_NOT_RUNNING) \|\| READ_ONCE(worker->sleeping) \|\|
		worker->task->se.sum_exec_runtime - worker->current_at <
		wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
		return;

		raw_spin_lock(&pool->lock);

		worker_set_flags(worker, WORKER_CPU_INTENSIVE);
		wq_cpu_intensive_report(worker->current_func);
		pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

		if (need_more_worker(pool)) {
		pwq->stats[PWQ_STAT_CM_WAKEUP]++;
		wake_up_worker(pool);
		}

		raw_spin_unlock(&pool->lock);
		}

		/**
		* wq_worker_last_func - retrieve worker's last work function
		* @task: Task to retrieve last work function of.
		@@ -968,60 +1207,6 @@ work_func_t wq_worker_last_func(struct task_struct *task)
		return worker->last_func;
		}

		/**
		* worker_set_flags - set worker flags and adjust nr_running accordingly
		* @worker: self
		* @flags: flags to set
		*
		* Set @flags in @worker->flags and adjust nr_running accordingly.
		*
		* CONTEXT:
		* raw_spin_lock_irq(pool->lock)
		*/
		static inline void worker_set_flags(struct worker *worker, unsigned int flags)
		{
		struct worker_pool *pool = worker->pool;

		WARN_ON_ONCE(worker->task != current);

		/* If transitioning into NOT_RUNNING, adjust nr_running. */
		if ((flags & WORKER_NOT_RUNNING) &&
		!(worker->flags & WORKER_NOT_RUNNING)) {
		pool->nr_running--;
		}

		worker->flags \|= flags;
		}

		/**
		* worker_clr_flags - clear worker flags and adjust nr_running accordingly
		* @worker: self
		* @flags: flags to clear
		*
		* Clear @flags in @worker->flags and adjust nr_running accordingly.
		*
		* CONTEXT:
		* raw_spin_lock_irq(pool->lock)
		*/
		static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
		{
		struct worker_pool *pool = worker->pool;
		unsigned int oflags = worker->flags;

		WARN_ON_ONCE(worker->task != current);

		worker->flags &= ~flags;

		/*
		* If transitioning out of NOT_RUNNING, increment nr_running. Note
		* that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask
		* of multiple flags, not a single flag.
		*/
		if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
		pool->nr_running++;
		}

		/**
		* find_worker_executing_work - find worker which is executing a work
		* @pool: pool of interest
		@@ -1542,6 +1727,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
		* We queue the work to a specific CPU, the caller must ensure it
		* can't go away. Callers that fail to ensure that the specified
		* CPU cannot go away will execute on a randomly chosen CPU.
		* But note well that callers specifying a CPU that never has been
		* online will get a splat.
		*
		* Return: %false if @work was already on a queue, %true otherwise.
		*/
		@@ -2166,6 +2353,7 @@ static void send_mayday(struct work_struct *work)
		get_pwq(pwq);
		list_add_tail(&pwq->mayday_node, &wq->maydays);
		wake_up_process(wq->rescuer->task);
		pwq->stats[PWQ_STAT_MAYDAY]++;
		}
		}

		@@ -2303,7 +2491,6 @@ __acquires(&pool->lock)
		{
		struct pool_workqueue *pwq = get_work_pwq(work);
		struct worker_pool *pool = worker->pool;
		bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
		unsigned long work_data;
		struct worker *collision;
		#ifdef CONFIG_LOCKDEP
		@@ -2340,6 +2527,7 @@ __acquires(&pool->lock)
		worker->current_work = work;
		worker->current_func = work->func;
		worker->current_pwq = pwq;
		worker->current_at = worker->task->se.sum_exec_runtime;
		work_data = *work_data_bits(work);
		worker->current_color = get_work_color(work_data);

		@@ -2357,7 +2545,7 @@ __acquires(&pool->lock)
		* of concurrency management and the next code block will chain
		* execution of the pending work items.
		*/
		if (unlikely(cpu_intensive))
		if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
		worker_set_flags(worker, WORKER_CPU_INTENSIVE);

		/*
		@@ -2404,6 +2592,7 @@ __acquires(&pool->lock)
		* workqueues), so hiding them isn't a problem.
		*/
		lockdep_invariant_state(true);
		pwq->stats[PWQ_STAT_STARTED]++;
		trace_workqueue_execute_start(work);
		worker->current_func(work);
		/*
		@@ -2411,6 +2600,7 @@ __acquires(&pool->lock)
		* point will only record its address.
		*/
		trace_workqueue_execute_end(work, worker->current_func);
		pwq->stats[PWQ_STAT_COMPLETED]++;
		lock_map_release(&lockdep_map);
		lock_map_release(&pwq->wq->lockdep_map);

		@@ -2435,8 +2625,11 @@ __acquires(&pool->lock)

		raw_spin_lock_irq(&pool->lock);

		/* clear cpu intensive status */
		if (unlikely(cpu_intensive))
		/*
		* In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
		* CPU intensive by wq_worker_tick() if @work hogged CPU longer than
		* wq_cpu_intensive_thresh_us. Clear it.
		*/
		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

		/* tag the worker for identification in schedule() */
		@@ -2654,6 +2847,7 @@ static int rescuer_thread(void *__rescuer)
		if (first)
		pool->watchdog_ts = jiffies;
		move_linked_works(work, scheduled, &n);
		pwq->stats[PWQ_STAT_RESCUED]++;
		}
		first = false;
		}

kernel/workqueue_internal.h

+13 −11

Original line number	Diff line number	Diff line
		@@ -28,13 +28,18 @@ struct worker {
		struct hlist_node hentry; /* L: while busy */
		};

		struct work_struct current_work; / L: work being processed */
		work_func_t current_func; /* L: current_work's fn */
		struct pool_workqueue current_pwq; / L: current_work's pwq */
		unsigned int current_color; /* L: current_work's color */
		struct list_head scheduled; /* L: scheduled works */
		struct work_struct current_work; / K: work being processed and its */
		work_func_t current_func; /* K: function */
		struct pool_workqueue current_pwq; / K: pwq */
		u64 current_at; /* K: runtime at start or last wakeup */
		unsigned int current_color; /* K: color */

		int sleeping; /* S: is worker sleeping? */

		/* 64 bytes boundary on 64bit, 32 on 32bit */
		/* used by the scheduler to determine a worker's last known identity */
		work_func_t last_func; /* K: last work's fn */

		struct list_head scheduled; /* L: scheduled works */

		struct task_struct task; / I: worker task */
		struct worker_pool pool; / A: the associated pool */
		@@ -42,10 +47,9 @@ struct worker {
		struct list_head node; /* A: anchored at pool->workers */
		/* A: runs through worker->node */

		unsigned long last_active; /* L: last active timestamp */
		unsigned long last_active; /* K: last active timestamp */
		unsigned int flags; /* X: flags */
		int id; /* I: worker id */
		int sleeping; /* None */

		/*
		* Opaque string set with work_set_desc(). Printed out with task
		@@ -55,9 +59,6 @@ struct worker {

		/* used only by rescuers to point to the target workqueue */
		struct workqueue_struct rescue_wq; / I: the workqueue to rescue */

		/* used by the scheduler to determine a worker's last known identity */
		work_func_t last_func;
		};

		/**
		@@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
		*/
		void wq_worker_running(struct task_struct *task);
		void wq_worker_sleeping(struct task_struct *task);
		void wq_worker_tick(struct task_struct *task);
		work_func_t wq_worker_last_func(struct task_struct *task);

		#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */