Commit 616db877 authored by Tejun Heo's avatar Tejun Heo
Browse files

workqueue: Automatically mark CPU-hogging work items CPU_INTENSIVE



If a per-cpu work item hogs the CPU, it can prevent other work items from
starting through concurrency management. A per-cpu workqueue which intends
to host such CPU-hogging work items can choose to not participate in
concurrency management by setting %WQ_CPU_INTENSIVE; however, this can be
error-prone and difficult to debug when missed.

This patch adds an automatic CPU usage based detection. If a
concurrency-managed work item consumes more CPU time than the threshold
(10ms by default) continuously without intervening sleeps, wq_worker_tick()
which is called from scheduler_tick() will detect the condition and
automatically mark it CPU_INTENSIVE.

The mechanism isn't foolproof:

* Detection depends on tick hitting the work item. Getting preempted at the
  right timings may allow a violating work item to evade detection at least
  temporarily.

* nohz_full CPUs may not be running ticks and thus can fail detection.

* Even when detection is working, the 10ms detection delays can add up if
  many CPU-hogging work items are queued at the same time.

However, in vast majority of cases, this should be able to detect violations
reliably and provide reasonable protection with a small increase in code
complexity.

If some work items trigger this condition repeatedly, the bigger problem
likely is the CPU being saturated with such per-cpu work items and the
solution would be making them UNBOUND. The next patch will add a debug
mechanism to help spot such cases.

v4: Documentation for workqueue.cpu_intensive_thresh_us added to
    kernel-parameters.txt.

v3: Switch to use wq_worker_tick() instead of hooking into preemptions as
    suggested by Peter.

v2: Lai pointed out that wq_worker_stopping() also needs to be called from
    preemption and rtlock paths and an earlier patch was updated
    accordingly. This patch adds a comment describing the risk of infinte
    recursions and how they're avoided.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Acked-by: default avatarPeter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
parent bdf8b9bf
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -6931,6 +6931,13 @@
			it can be updated at runtime by writing to the
			corresponding sysfs file.

	workqueue.cpu_intensive_thresh_us=
			Per-cpu work items which run for longer than this
			threshold are automatically considered CPU intensive
			and excluded from concurrency management to prevent
			them from noticeably delaying other per-cpu work
			items. Default is 10000 (10ms).

	workqueue.disable_numa
			By default, all work items queued to unbound
			workqueues are affine to the NUMA nodes they're
+19 −19
Original line number Diff line number Diff line
@@ -354,25 +354,25 @@ Monitoring
Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::

  $ tools/workqueue/wq_monitor.py events
                              total  infl  CMwake  mayday rescued
  events                      18545     0       5       -       -
  events_highpri                  8     0       0       -       -
  events_long                     3     0       0       -       -
  events_unbound              38306     0       -       -       -
  events_freezable                0     0       0       -       -
  events_power_efficient      29598     0       0       -       -
  events_freezable_power_        10     0       0       -       -
  sock_diag_events                0     0       0       -       -

                              total  infl  CMwake  mayday rescued
  events                      18548     0       5       -       -
  events_highpri                  8     0       0       -       -
  events_long                     3     0       0       -       -
  events_unbound              38322     0       -       -       -
  events_freezable                0     0       0       -       -
  events_power_efficient      29603     0       0       -       -
  events_freezable_power_        10     0       0       -       -
  sock_diag_events                0     0       0       -       -
                              total  infl CPUitsv  CMwake  mayday rescued
  events                      18545     0       0       5       -       -
  events_highpri                  8     0       0       0       -       -
  events_long                     3     0       0       0       -       -
  events_unbound              38306     0       -       -       -       -
  events_freezable                0     0       0       0       -       -
  events_power_efficient      29598     0       0       0       -       -
  events_freezable_power_        10     0       0       0       -       -
  sock_diag_events                0     0       0       0       -       -

                              total  infl CPUitsv  CMwake  mayday rescued
  events                      18548     0       0       5       -       -
  events_highpri                  8     0       0       0       -       -
  events_long                     3     0       0       0       -       -
  events_unbound              38322     0       -       -       -       -
  events_freezable                0     0       0       0       -       -
  events_power_efficient      29603     0       0       0       -       -
  events_freezable_power_        10     0       0       0       -       -
  sock_diag_events                0     0       0       0       -       -

  ...

+3 −0
Original line number Diff line number Diff line
@@ -5632,6 +5632,9 @@ void scheduler_tick(void)

	perf_event_task_tick();

	if (curr->flags & PF_WQ_WORKER)
		wq_worker_tick(curr);

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
+63 −5
Original line number Diff line number Diff line
@@ -212,6 +212,7 @@ struct worker_pool {
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */
@@ -332,6 +333,14 @@ static struct kmem_cache *pwq_cache;
static cpumask_var_t *wq_numa_possible_cpumask;
					/* possible CPUs of each node */

/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 */
static unsigned long wq_cpu_intensive_thresh_us = 10000;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);

static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

@@ -962,6 +971,13 @@ void wq_worker_running(struct task_struct *task)
	if (!(worker->flags & WORKER_NOT_RUNNING))
		worker->pool->nr_running++;
	preempt_enable();

	/*
	 * CPU intensive auto-detection cares about how long a work item hogged
	 * CPU without sleeping. Reset the starting timestamp on wakeup.
	 */
	worker->current_at = worker->task->se.sum_exec_runtime;

	worker->sleeping = 0;
}

@@ -1012,6 +1028,45 @@ void wq_worker_sleeping(struct task_struct *task)
	raw_spin_unlock_irq(&pool->lock);
}

/**
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from scheduler_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 */
void wq_worker_tick(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);
	struct pool_workqueue *pwq = worker->current_pwq;
	struct worker_pool *pool = worker->pool;

	if (!pwq)
		return;

	/*
	 * If the current worker is concurrency managed and hogged the CPU for
	 * longer than wq_cpu_intensive_thresh_us, it's automatically marked
	 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
	 */
	if ((worker->flags & WORKER_NOT_RUNNING) ||
	    worker->task->se.sum_exec_runtime - worker->current_at <
	    wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
		return;

	raw_spin_lock(&pool->lock);

	worker_set_flags(worker, WORKER_CPU_INTENSIVE);
	pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

	if (need_more_worker(pool)) {
		pwq->stats[PWQ_STAT_CM_WAKEUP]++;
		wake_up_worker(pool);
	}

	raw_spin_unlock(&pool->lock);
}

/**
 * wq_worker_last_func - retrieve worker's last work function
 * @task: Task to retrieve last work function of.
@@ -2327,7 +2382,6 @@ __acquires(&pool->lock)
{
	struct pool_workqueue *pwq = get_work_pwq(work);
	struct worker_pool *pool = worker->pool;
	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
	unsigned long work_data;
	struct worker *collision;
#ifdef CONFIG_LOCKDEP
@@ -2364,6 +2418,7 @@ __acquires(&pool->lock)
	worker->current_work = work;
	worker->current_func = work->func;
	worker->current_pwq = pwq;
	worker->current_at = worker->task->se.sum_exec_runtime;
	work_data = *work_data_bits(work);
	worker->current_color = get_work_color(work_data);

@@ -2381,7 +2436,7 @@ __acquires(&pool->lock)
	 * of concurrency management and the next code block will chain
	 * execution of the pending work items.
	 */
	if (unlikely(cpu_intensive))
	if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
		worker_set_flags(worker, WORKER_CPU_INTENSIVE);

	/*
@@ -2461,8 +2516,11 @@ __acquires(&pool->lock)

	raw_spin_lock_irq(&pool->lock);

	/* clear cpu intensive status */
	if (unlikely(cpu_intensive))
	/*
	 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
	 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
	 * wq_cpu_intensive_thresh_us. Clear it.
	 */
	worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

	/* tag the worker for identification in schedule() */
+2 −0
Original line number Diff line number Diff line
@@ -31,6 +31,7 @@ struct worker {
	struct work_struct	*current_work;	/* K: work being processed and its */
	work_func_t		current_func;	/* K: function */
	struct pool_workqueue	*current_pwq;	/* K: pwq */
	u64			current_at;	/* K: runtime at start or last wakeup */
	unsigned int		current_color;	/* K: color */

	int			sleeping;	/* S: is worker sleeping? */
@@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
Loading