Commit 9abff574 authored by Linus Torvalds
Browse files

Merge tag 'wq-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq

Pull workqueue fixes from Tejun Heo:

 - Improve workqueue stall diagnostics: dump all busy workers (not just
   running ones), show wall-clock duration of in-flight work items, and
   add a sample module for reproducing stalls

 - Fix POOL_BH vs WQ_BH flag namespace mismatch in pr_cont_worker_id()

 - Rename pool->watchdog_ts to pool->last_progress_ts and related
   functions for clarity

* tag 'wq-for-7.0-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq:
  workqueue: Rename show_cpu_pool{s,}_hog{s,}() to reflect broadened scope
  workqueue: Add stall detector sample module
  workqueue: Show all busy workers in stall diagnostics
  workqueue: Show in-flight work item duration in stall diagnostics
  workqueue: Rename pool->watchdog_ts to pool->last_progress_ts
  workqueue: Use POOL_BH instead of WQ_BH when checking pool flags
parents b073bcb8 98c790b1
Loading
Loading
Loading
Loading
+28 −27
Original line number Diff line number Diff line
@@ -190,7 +190,7 @@ struct worker_pool {
	int			id;		/* I: pool ID */
	unsigned int		flags;		/* L: flags */

	unsigned long		watchdog_ts;	/* L: watchdog timestamp */
	unsigned long		last_progress_ts;	/* L: last forward progress timestamp */
	bool			cpu_stall;	/* WD: stalled cpu bound pool */

	/*
@@ -1697,7 +1697,7 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
	WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE));
	trace_workqueue_activate_work(work);
	if (list_empty(&pwq->pool->worklist))
		pwq->pool->watchdog_ts = jiffies;
		pwq->pool->last_progress_ts = jiffies;
	move_linked_works(work, &pwq->pool->worklist, NULL);
	__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}
@@ -2348,7 +2348,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
	 */
	if (list_empty(&pwq->inactive_works) && pwq_tryinc_nr_active(pwq, false)) {
		if (list_empty(&pool->worklist))
			pool->watchdog_ts = jiffies;
			pool->last_progress_ts = jiffies;

		trace_workqueue_activate_work(work);
		insert_work(pwq, work, &pool->worklist, work_flags);
@@ -3204,6 +3204,7 @@ __acquires(&pool->lock)
	worker->current_pwq = pwq;
	if (worker->task)
		worker->current_at = worker->task->se.sum_exec_runtime;
	worker->current_start = jiffies;
	work_data = *work_data_bits(work);
	worker->current_color = get_work_color(work_data);

@@ -3352,7 +3353,7 @@ static void process_scheduled_works(struct worker *worker)
	while ((work = list_first_entry_or_null(&worker->scheduled,
						struct work_struct, entry))) {
		if (first) {
			worker->pool->watchdog_ts = jiffies;
			worker->pool->last_progress_ts = jiffies;
			first = false;
		}
		process_one_work(worker, work);
@@ -4850,7 +4851,7 @@ static int init_worker_pool(struct worker_pool *pool)
	pool->cpu = -1;
	pool->node = NUMA_NO_NODE;
	pool->flags |= POOL_DISASSOCIATED;
	pool->watchdog_ts = jiffies;
	pool->last_progress_ts = jiffies;
	INIT_LIST_HEAD(&pool->worklist);
	INIT_LIST_HEAD(&pool->idle_list);
	hash_init(pool->busy_hash);
@@ -6274,7 +6275,7 @@ static void pr_cont_worker_id(struct worker *worker)
{
	struct worker_pool *pool = worker->pool;

	if (pool->flags & WQ_BH)
	if (pool->flags & POOL_BH)
		pr_cont("bh%s",
			pool->attrs->nice == HIGHPRI_NICE_LEVEL ? "-hi" : "");
	else
@@ -6359,6 +6360,8 @@ static void show_pwq(struct pool_workqueue *pwq)
			pr_cont(" %s", comma ? "," : "");
			pr_cont_worker_id(worker);
			pr_cont(":%ps", worker->current_func);
			pr_cont(" for %us",
				jiffies_to_msecs(jiffies - worker->current_start) / 1000);
			list_for_each_entry(work, &worker->scheduled, entry)
				pr_cont_work(false, work, &pcws);
			pr_cont_work_flush(comma, (work_func_t)-1L, &pcws);
@@ -6462,7 +6465,7 @@ static void show_one_worker_pool(struct worker_pool *pool)

	/* How long the first pending work is waiting for a worker. */
	if (!list_empty(&pool->worklist))
		hung = jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000;
		hung = jiffies_to_msecs(jiffies - pool->last_progress_ts) / 1000;

	/*
	 * Defer printing to avoid deadlocks in console drivers that
@@ -7580,11 +7583,11 @@ MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds

/*
 * Show workers that might prevent the processing of pending work items.
 * The only candidates are CPU-bound workers in the running state.
 * Pending work items should be handled by another idle worker
 * in all other situations.
 * A busy worker that is not running on the CPU (e.g. sleeping in
 * wait_event_idle() with PF_WQ_WORKER cleared) can stall the pool just as
 * effectively as a CPU-bound one, so dump every in-flight worker.
 */
static void show_cpu_pool_hog(struct worker_pool *pool)
static void show_cpu_pool_busy_workers(struct worker_pool *pool)
{
	struct worker *worker;
	unsigned long irq_flags;
@@ -7593,7 +7596,6 @@ static void show_cpu_pool_hog(struct worker_pool *pool)
	raw_spin_lock_irqsave(&pool->lock, irq_flags);

	hash_for_each(pool->busy_hash, bkt, worker, hentry) {
		if (task_is_running(worker->task)) {
		/*
		 * Defer printing to avoid deadlocks in console
		 * drivers that queue work while holding locks
@@ -7606,23 +7608,22 @@ static void show_cpu_pool_hog(struct worker_pool *pool)

		printk_deferred_exit();
	}
	}

	raw_spin_unlock_irqrestore(&pool->lock, irq_flags);
}

static void show_cpu_pools_hogs(void)
static void show_cpu_pools_busy_workers(void)
{
	struct worker_pool *pool;
	int pi;

	pr_info("Showing backtraces of running workers in stalled CPU-bound worker pools:\n");
	pr_info("Showing backtraces of busy workers in stalled worker pools:\n");

	rcu_read_lock();

	for_each_pool(pool, pi) {
		if (pool->cpu_stall)
			show_cpu_pool_hog(pool);
			show_cpu_pool_busy_workers(pool);

	}

@@ -7691,7 +7692,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
			touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu));
		else
			touched = READ_ONCE(wq_watchdog_touched);
		pool_ts = READ_ONCE(pool->watchdog_ts);
		pool_ts = READ_ONCE(pool->last_progress_ts);

		if (time_after(pool_ts, touched))
			ts = pool_ts;
@@ -7719,7 +7720,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
		show_all_workqueues();

	if (cpu_pool_stall)
		show_cpu_pools_hogs();
		show_cpu_pools_busy_workers();

	if (lockup_detected)
		panic_on_wq_watchdog(max_stall_time);
+1 −0
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@ struct worker {
	work_func_t		current_func;	/* K: function */
	struct pool_workqueue	*current_pwq;	/* K: pwq */
	u64			current_at;	/* K: runtime at start or last wakeup */
	unsigned long		current_start;	/* K: start time of current work item */
	unsigned int		current_color;	/* K: color */

	int			sleeping;	/* S: is worker sleeping? */
+1 −0
Original line number Diff line number Diff line
obj-m += wq_stall.o
+98 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0
/*
 * wq_stall - Test module for the workqueue stall detector.
 *
 * Deliberately creates a workqueue stall so the watchdog fires and
 * prints diagnostic output.  Useful for verifying that the stall
 * detector correctly identifies stuck workers and produces useful
 * backtraces.
 *
 * The stall is triggered by clearing PF_WQ_WORKER before sleeping,
 * which hides the worker from the concurrency manager.  A second
 * work item queued on the same pool then sits in the worklist with
 * no worker available to process it.
 *
 * After ~30-60s the workqueue watchdog fires:
 *   BUG: workqueue lockup - pool cpus=N ...
 *
 * Build:
 *	make -C <kernel tree> M=samples/workqueue/stall_detector modules
 *
 * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
 * Copyright (c) 2026 Breno Leitao <leitao@debian.org>
 */

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/wait.h>
#include <linux/atomic.h>
#include <linux/sched.h>

/* Wait queue the stalled worker sleeps on; woken from wq_stall_exit(). */
static DECLARE_WAIT_QUEUE_HEAD(stall_wq_head);
/* Wake-up predicate for the sleeper: 0 until wq_stall_exit() sets it to 1. */
static atomic_t wake_condition = ATOMIC_INIT(0);
/* Work item that deliberately blocks its worker (handler: stall_work1_fn()). */
static struct work_struct stall_work1;
/* Work item queued behind stall_work1; its handler only logs a message. */
static struct work_struct stall_work2;

/*
 * Handler for stall_work2, the work item that is expected to get stuck
 * behind stall_work1.  It does nothing but log that it was eventually
 * executed; @work is unused.
 */
static void stall_work2_fn(struct work_struct *work)
{
	pr_info("wq_stall: second work item finally ran\n");
}

/*
 * Handler for stall_work1: provoke the workqueue stall.
 *
 * The sequence is order-sensitive:
 *   1. queue stall_work2 while this worker still has PF_WQ_WORKER set,
 *   2. clear PF_WQ_WORKER on the current task,
 *   3. sleep on stall_wq_head until wake_condition becomes non-zero
 *      (set by wq_stall_exit()),
 *   4. set PF_WQ_WORKER again before returning to the workqueue code.
 *
 * @work is unused.
 */
static void stall_work1_fn(struct work_struct *work)
{
	pr_info("wq_stall: first work item running on cpu %d\n",
		raw_smp_processor_id());

	/*
	 * Queue second item while we're still counted as running
	 * (pool->nr_running > 0).  Since schedule_work() on a per-CPU
	 * workqueue targets raw_smp_processor_id(), item 2 lands on the
	 * same pool.  __queue_work -> kick_pool -> need_more_worker()
	 * sees nr_running > 0 and does NOT wake a new worker.
	 */
	schedule_work(&stall_work2);

	/*
	 * Hide from the workqueue concurrency manager.  Without
	 * PF_WQ_WORKER, schedule() won't call wq_worker_sleeping(),
	 * so nr_running is never decremented and no replacement
	 * worker is created.  Item 2 stays stuck in pool->worklist.
	 */
	current->flags &= ~PF_WQ_WORKER;

	pr_info("wq_stall: entering wait_event_idle (PF_WQ_WORKER cleared)\n");
	pr_info("wq_stall: expect 'BUG: workqueue lockup' in ~30-60s\n");
	/* Blocks here until module unload sets wake_condition and wakes us. */
	wait_event_idle(stall_wq_head, atomic_read(&wake_condition) != 0);

	/* Restore so process_one_work() cleanup works correctly */
	current->flags |= PF_WQ_WORKER;
	pr_info("wq_stall: woke up, PF_WQ_WORKER restored\n");
}

/*
 * Module entry point.
 *
 * Binds each work item to its handler and queues the first one; the
 * second item is queued later, from inside stall_work1_fn().  Always
 * returns 0.
 */
static int __init wq_stall_init(void)
{
	pr_info("wq_stall: loading\n");

	/* Both items must be initialised before anything is queued. */
	INIT_WORK(&stall_work2, stall_work2_fn);
	INIT_WORK(&stall_work1, stall_work1_fn);

	/* Kick off the stall: only item 1 is queued from here. */
	schedule_work(&stall_work1);
	return 0;
}

/*
 * Module exit: release the sleeping worker and wait for both work items
 * to finish before the module text goes away.
 */
static void __exit wq_stall_exit(void)
{
	pr_info("wq_stall: unloading\n");
	/*
	 * Set the condition before waking so the sleeper in stall_work1_fn()
	 * sees a non-zero value when it re-checks its wait condition.
	 */
	atomic_set(&wake_condition, 1);
	wake_up(&stall_wq_head);
	/*
	 * stall_work1 is flushed first because its handler is what queues
	 * stall_work2; once it has finished, stall_work2 can be flushed too.
	 */
	flush_work(&stall_work1);
	flush_work(&stall_work2);
	pr_info("wq_stall: all work flushed, module unloaded\n");
}

/* Register the load and unload handlers for this module. */
module_init(wq_stall_init);
module_exit(wq_stall_exit);

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Reproduce workqueue stall caused by PF_WQ_WORKER misuse");
MODULE_AUTHOR("Breno Leitao <leitao@debian.org>");