Commit 7ab044a4 authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull workqueue updates from Tejun Heo:

 - Concurrency-managed per-cpu work items that hog CPUs and delay the
   execution of other work items are now automatically detected and
   excluded from concurrency management. Reporting on such work items
   can also be enabled through a config option.

 - Added tools/workqueue/wq_monitor.py which improves visibility into
   workqueue usages and behaviors.

 - Arnd's minimal fix for gcc-13 enum warning on 32bit compiles,
   superseded by commit afa4bb77 in mainline.

* tag 'wq-for-6.5' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: Disable per-cpu CPU hog detection when wq_cpu_intensive_thresh_us is 0
  workqueue: Fix WARN_ON_ONCE() triggers in worker_enter_idle()
  workqueue: fix enum type for gcc-13
  workqueue: Track and monitor per-workqueue CPU time usage
  workqueue: Report work funcs that trigger automatic CPU_INTENSIVE mechanism
  workqueue: Automatically mark CPU-hogging work items CPU_INTENSIVE
  workqueue: Improve locking rule description for worker fields
  workqueue: Move worker_set/clr_flags() upwards
  workqueue: Re-order struct worker fields
  workqueue: Add pwq->stats[] and a monitoring script
  Further upgrade queue_work_on() comment
parents 18eb3b6d 18c8ae81
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -6972,6 +6972,18 @@
			it can be updated at runtime by writing to the
			corresponding sysfs file.

	workqueue.cpu_intensive_thresh_us=
			Per-cpu work items which run for longer than this
			threshold are automatically considered CPU intensive
			and excluded from concurrency management to prevent
			them from noticeably delaying other per-cpu work
			items. Default is 10000 (10ms).

			If CONFIG_WQ_CPU_INTENSIVE_REPORT is set, the kernel
			will report the work functions which violate this
			threshold repeatedly. They are likely good
			candidates for using WQ_UNBOUND workqueues instead.

	workqueue.disable_numa
			By default, all work items queued to unbound
			workqueues are affine to the NUMA nodes they're
+32 −0
Original line number Diff line number Diff line
@@ -348,6 +348,37 @@ Guidelines
  level of locality in wq operations and work item execution.


Monitoring
==========

Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::

  $ tools/workqueue/wq_monitor.py events
                              total  infl  CPUtime  CPUhog  CMwake  mayday rescued
  events                      18545     0      6.1       0       5       -       -
  events_highpri                  8     0      0.0       0       0       -       -
  events_long                     3     0      0.0       0       0       -       -
  events_unbound              38306     0      0.1       -       -       -       -
  events_freezable                0     0      0.0       0       0       -       -
  events_power_efficient      29598     0      0.2       0       0       -       -
  events_freezable_power_        10     0      0.0       0       0       -       -
  sock_diag_events                0     0      0.0       0       0       -       -

                              total  infl  CPUtime  CPUhog  CMwake  mayday rescued
  events                      18548     0      6.1       0       5       -       -
  events_highpri                  8     0      0.0       0       0       -       -
  events_long                     3     0      0.0       0       0       -       -
  events_unbound              38322     0      0.1       -       -       -       -
  events_freezable                0     0      0.0       0       0       -       -
  events_power_efficient      29603     0      0.2       0       0       -       -
  events_freezable_power_        10     0      0.0       0       0       -       -
  sock_diag_events                0     0      0.0       0       0       -       -

  ...

See the command's help message for more info.


Debugging
=========

@@ -387,6 +418,7 @@ the stack trace of the offending worker thread. ::
The work item's function should be trivially visible in the stack
trace.


Non-reentrance Conditions
=========================

+3 −0
Original line number Diff line number Diff line
@@ -5670,6 +5670,9 @@ void scheduler_tick(void)

	perf_event_task_tick();

	if (curr->flags & PF_WQ_WORKER)
		wq_worker_tick(curr);

#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
+258 −64
Original line number Diff line number Diff line
@@ -126,6 +126,12 @@ enum {
 *    cpu or grabbing pool->lock is enough for read access.  If
 *    POOL_DISASSOCIATED is set, it's identical to L.
 *
 * K: Only modified by worker while holding pool->lock. Can be safely read by
 *    self, while holding pool->lock or from IRQ context if %current is the
 *    kworker.
 *
 * S: Only modified by worker self.
 *
 * A: wq_pool_attach_mutex protected.
 *
 * PL: wq_pool_mutex protected.
@@ -199,6 +205,22 @@ struct worker_pool {
	struct rcu_head		rcu;
};

/*
 * Per-pool_workqueue statistics. These can be monitored using
 * tools/workqueue/wq_monitor.py.
 */
enum pool_workqueue_stats {
	PWQ_STAT_STARTED,	/* work items started execution */
	PWQ_STAT_COMPLETED,	/* work items completed execution */
	PWQ_STAT_CPU_TIME,	/* total CPU time consumed */
	PWQ_STAT_CPU_INTENSIVE,	/* wq_cpu_intensive_thresh_us violations */
	PWQ_STAT_CM_WAKEUP,	/* concurrency-management worker wakeups */
	PWQ_STAT_MAYDAY,	/* maydays to rescuer */
	PWQ_STAT_RESCUED,	/* linked work items executed by rescuer */

	PWQ_NR_STATS,
};

/*
 * The per-pool workqueue.  While queued, the lower WORK_STRUCT_FLAG_BITS
 * of work_struct->data are used for flags and the remaining high bits
@@ -236,6 +258,8 @@ struct pool_workqueue {
	struct list_head	pwqs_node;	/* WR: node on wq->pwqs */
	struct list_head	mayday_node;	/* MD: node on wq->maydays */

	u64			stats[PWQ_NR_STATS];

	/*
	 * Release of unbound pwq is punted to system_wq.  See put_pwq()
	 * and pwq_unbound_release_workfn() for details.  pool_workqueue
@@ -310,6 +334,14 @@ static struct kmem_cache *pwq_cache;
static cpumask_var_t *wq_numa_possible_cpumask;
					/* possible CPUs of each node */

/*
 * Per-cpu work items which run for longer than the following threshold are
 * automatically considered CPU intensive and excluded from concurrency
 * management to prevent them from noticeably delaying other per-cpu work items.
 */
static unsigned long wq_cpu_intensive_thresh_us = 10000;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);

static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);

@@ -866,6 +898,152 @@ static void wake_up_worker(struct worker_pool *pool)
		wake_up_process(worker->task);
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;

	WARN_ON_ONCE(worker->task != current);

	/* If transitioning into NOT_RUNNING, adjust nr_running. */
	if ((flags & WORKER_NOT_RUNNING) &&
	    !(worker->flags & WORKER_NOT_RUNNING)) {
		pool->nr_running--;
	}

	worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;
	unsigned int oflags = worker->flags;

	WARN_ON_ONCE(worker->task != current);

	worker->flags &= ~flags;

	/*
	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
	 * of multiple flags, not a single flag.
	 */
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
			pool->nr_running++;
}

#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT

/*
 * Concurrency-managed per-cpu work items that hog CPU for longer than
 * wq_cpu_intensive_thresh_us trigger the automatic CPU_INTENSIVE mechanism,
 * which prevents them from stalling other concurrency-managed work items. If a
 * work function keeps triggering this mechanism, it's likely that the work item
 * should be using an unbound workqueue instead.
 *
 * wq_cpu_intensive_report() tracks work functions which trigger such conditions
 * and report them so that they can be examined and converted to use unbound
 * workqueues as appropriate. To avoid flooding the console, each violating work
 * function is tracked and reported with exponential backoff.
 */
#define WCI_MAX_ENTS 128

struct wci_ent {
	work_func_t		func;
	atomic64_t		cnt;
	struct hlist_node	hash_node;
};

static struct wci_ent wci_ents[WCI_MAX_ENTS];
static int wci_nr_ents;
static DEFINE_RAW_SPINLOCK(wci_lock);
static DEFINE_HASHTABLE(wci_hash, ilog2(WCI_MAX_ENTS));

static struct wci_ent *wci_find_ent(work_func_t func)
{
	struct wci_ent *ent;

	hash_for_each_possible_rcu(wci_hash, ent, hash_node,
				   (unsigned long)func) {
		if (ent->func == func)
			return ent;
	}
	return NULL;
}

static void wq_cpu_intensive_report(work_func_t func)
{
	struct wci_ent *ent;

restart:
	ent = wci_find_ent(func);
	if (ent) {
		u64 cnt;

		/*
		 * Start reporting from the fourth time and back off
		 * exponentially.
		 */
		cnt = atomic64_inc_return_relaxed(&ent->cnt);
		if (cnt >= 4 && is_power_of_2(cnt))
			printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n",
					ent->func, wq_cpu_intensive_thresh_us,
					atomic64_read(&ent->cnt));
		return;
	}

	/*
	 * @func is a new violation. Allocate a new entry for it. If wcn_ents[]
	 * is exhausted, something went really wrong and we probably made enough
	 * noise already.
	 */
	if (wci_nr_ents >= WCI_MAX_ENTS)
		return;

	raw_spin_lock(&wci_lock);

	if (wci_nr_ents >= WCI_MAX_ENTS) {
		raw_spin_unlock(&wci_lock);
		return;
	}

	if (wci_find_ent(func)) {
		raw_spin_unlock(&wci_lock);
		goto restart;
	}

	ent = &wci_ents[wci_nr_ents++];
	ent->func = func;
	atomic64_set(&ent->cnt, 1);
	hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func);

	raw_spin_unlock(&wci_lock);
}

#else	/* CONFIG_WQ_CPU_INTENSIVE_REPORT */
static void wq_cpu_intensive_report(work_func_t func) {}
#endif	/* CONFIG_WQ_CPU_INTENSIVE_REPORT */

/**
 * wq_worker_running - a worker is running again
 * @task: task waking up
@@ -876,7 +1054,7 @@ void wq_worker_running(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);

	if (!worker->sleeping)
	if (!READ_ONCE(worker->sleeping))
		return;

	/*
@@ -889,7 +1067,14 @@ void wq_worker_running(struct task_struct *task)
	if (!(worker->flags & WORKER_NOT_RUNNING))
		worker->pool->nr_running++;
	preempt_enable();
	worker->sleeping = 0;

	/*
	 * CPU intensive auto-detection cares about how long a work item hogged
	 * CPU without sleeping. Reset the starting timestamp on wakeup.
	 */
	worker->current_at = worker->task->se.sum_exec_runtime;

	WRITE_ONCE(worker->sleeping, 0);
}

/**
@@ -915,10 +1100,10 @@ void wq_worker_sleeping(struct task_struct *task)
	pool = worker->pool;

	/* Return if preempted before wq_worker_running() was reached */
	if (worker->sleeping)
	if (READ_ONCE(worker->sleeping))
		return;

	worker->sleeping = 1;
	WRITE_ONCE(worker->sleeping, 1);
	raw_spin_lock_irq(&pool->lock);

	/*
@@ -932,11 +1117,65 @@ void wq_worker_sleeping(struct task_struct *task)
	}

	pool->nr_running--;
	if (need_more_worker(pool))
	if (need_more_worker(pool)) {
		worker->current_pwq->stats[PWQ_STAT_CM_WAKEUP]++;
		wake_up_worker(pool);
	}
	raw_spin_unlock_irq(&pool->lock);
}

/**
 * wq_worker_tick - a scheduler tick occurred while a kworker is running
 * @task: task currently running
 *
 * Called from scheduler_tick(). We're in the IRQ context and the current
 * worker's fields which follow the 'K' locking rule can be accessed safely.
 */
void wq_worker_tick(struct task_struct *task)
{
	struct worker *worker = kthread_data(task);
	struct pool_workqueue *pwq = worker->current_pwq;
	struct worker_pool *pool = worker->pool;

	if (!pwq)
		return;

	pwq->stats[PWQ_STAT_CPU_TIME] += TICK_USEC;

	if (!wq_cpu_intensive_thresh_us)
		return;

	/*
	 * If the current worker is concurrency managed and hogged the CPU for
	 * longer than wq_cpu_intensive_thresh_us, it's automatically marked
	 * CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
	 *
	 * Set @worker->sleeping means that @worker is in the process of
	 * switching out voluntarily and won't be contributing to
	 * @pool->nr_running until it wakes up. As wq_worker_sleeping() also
	 * decrements ->nr_running, setting CPU_INTENSIVE here can lead to
	 * double decrements. The task is releasing the CPU anyway. Let's skip.
	 * We probably want to make this prettier in the future.
	 */
	if ((worker->flags & WORKER_NOT_RUNNING) || READ_ONCE(worker->sleeping) ||
	    worker->task->se.sum_exec_runtime - worker->current_at <
	    wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
		return;

	raw_spin_lock(&pool->lock);

	worker_set_flags(worker, WORKER_CPU_INTENSIVE);
	wq_cpu_intensive_report(worker->current_func);
	pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;

	if (need_more_worker(pool)) {
		pwq->stats[PWQ_STAT_CM_WAKEUP]++;
		wake_up_worker(pool);
	}

	raw_spin_unlock(&pool->lock);
}

/**
 * wq_worker_last_func - retrieve worker's last work function
 * @task: Task to retrieve last work function of.
@@ -968,60 +1207,6 @@ work_func_t wq_worker_last_func(struct task_struct *task)
	return worker->last_func;
}

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;

	WARN_ON_ONCE(worker->task != current);

	/* If transitioning into NOT_RUNNING, adjust nr_running. */
	if ((flags & WORKER_NOT_RUNNING) &&
	    !(worker->flags & WORKER_NOT_RUNNING)) {
		pool->nr_running--;
	}

	worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
	struct worker_pool *pool = worker->pool;
	unsigned int oflags = worker->flags;

	WARN_ON_ONCE(worker->task != current);

	worker->flags &= ~flags;

	/*
	 * If transitioning out of NOT_RUNNING, increment nr_running.  Note
	 * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
	 * of multiple flags, not a single flag.
	 */
	if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
		if (!(worker->flags & WORKER_NOT_RUNNING))
			pool->nr_running++;
}

/**
 * find_worker_executing_work - find worker which is executing a work
 * @pool: pool of interest
@@ -1542,6 +1727,8 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.  Callers that fail to ensure that the specified
 * CPU cannot go away will execute on a randomly chosen CPU.
 * But note well that callers specifying a CPU that never has been
 * online will get a splat.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
@@ -2166,6 +2353,7 @@ static void send_mayday(struct work_struct *work)
		get_pwq(pwq);
		list_add_tail(&pwq->mayday_node, &wq->maydays);
		wake_up_process(wq->rescuer->task);
		pwq->stats[PWQ_STAT_MAYDAY]++;
	}
}

@@ -2303,7 +2491,6 @@ __acquires(&pool->lock)
{
	struct pool_workqueue *pwq = get_work_pwq(work);
	struct worker_pool *pool = worker->pool;
	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
	unsigned long work_data;
	struct worker *collision;
#ifdef CONFIG_LOCKDEP
@@ -2340,6 +2527,7 @@ __acquires(&pool->lock)
	worker->current_work = work;
	worker->current_func = work->func;
	worker->current_pwq = pwq;
	worker->current_at = worker->task->se.sum_exec_runtime;
	work_data = *work_data_bits(work);
	worker->current_color = get_work_color(work_data);

@@ -2357,7 +2545,7 @@ __acquires(&pool->lock)
	 * of concurrency management and the next code block will chain
	 * execution of the pending work items.
	 */
	if (unlikely(cpu_intensive))
	if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
		worker_set_flags(worker, WORKER_CPU_INTENSIVE);

	/*
@@ -2404,6 +2592,7 @@ __acquires(&pool->lock)
	 * workqueues), so hiding them isn't a problem.
	 */
	lockdep_invariant_state(true);
	pwq->stats[PWQ_STAT_STARTED]++;
	trace_workqueue_execute_start(work);
	worker->current_func(work);
	/*
@@ -2411,6 +2600,7 @@ __acquires(&pool->lock)
	 * point will only record its address.
	 */
	trace_workqueue_execute_end(work, worker->current_func);
	pwq->stats[PWQ_STAT_COMPLETED]++;
	lock_map_release(&lockdep_map);
	lock_map_release(&pwq->wq->lockdep_map);

@@ -2435,8 +2625,11 @@ __acquires(&pool->lock)

	raw_spin_lock_irq(&pool->lock);

	/* clear cpu intensive status */
	if (unlikely(cpu_intensive))
	/*
	 * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
	 * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
	 * wq_cpu_intensive_thresh_us. Clear it.
	 */
	worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

	/* tag the worker for identification in schedule() */
@@ -2654,6 +2847,7 @@ static int rescuer_thread(void *__rescuer)
				if (first)
					pool->watchdog_ts = jiffies;
				move_linked_works(work, scheduled, &n);
				pwq->stats[PWQ_STAT_RESCUED]++;
			}
			first = false;
		}
+13 −11
Original line number Diff line number Diff line
@@ -28,13 +28,18 @@ struct worker {
		struct hlist_node	hentry;	/* L: while busy */
	};

	struct work_struct	*current_work;	/* L: work being processed */
	work_func_t		current_func;	/* L: current_work's fn */
	struct pool_workqueue	*current_pwq;	/* L: current_work's pwq */
	unsigned int		current_color;	/* L: current_work's color */
	struct list_head	scheduled;	/* L: scheduled works */
	struct work_struct	*current_work;	/* K: work being processed and its */
	work_func_t		current_func;	/* K: function */
	struct pool_workqueue	*current_pwq;	/* K: pwq */
	u64			current_at;	/* K: runtime at start or last wakeup */
	unsigned int		current_color;	/* K: color */

	int			sleeping;	/* S: is worker sleeping? */

	/* 64 bytes boundary on 64bit, 32 on 32bit */
	/* used by the scheduler to determine a worker's last known identity */
	work_func_t		last_func;	/* K: last work's fn */

	struct list_head	scheduled;	/* L: scheduled works */

	struct task_struct	*task;		/* I: worker task */
	struct worker_pool	*pool;		/* A: the associated pool */
@@ -42,10 +47,9 @@ struct worker {
	struct list_head	node;		/* A: anchored at pool->workers */
						/* A: runs through worker->node */

	unsigned long		last_active;	/* L: last active timestamp */
	unsigned long		last_active;	/* K: last active timestamp */
	unsigned int		flags;		/* X: flags */
	int			id;		/* I: worker id */
	int			sleeping;	/* None */

	/*
	 * Opaque string set with work_set_desc().  Printed out with task
@@ -55,9 +59,6 @@ struct worker {

	/* used only by rescuers to point to the target workqueue */
	struct workqueue_struct	*rescue_wq;	/* I: the workqueue to rescue */

	/* used by the scheduler to determine a worker's last known identity */
	work_func_t		last_func;
};

/**
@@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
 */
void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task);

#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
Loading