mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/
synced 2026-04-03 23:37:40 -04:00
workqueue: Fix false positive stall reports
On weakly ordered architectures (e.g., arm64), the lockless check in
wq_watchdog_timer_fn() can observe a reordering between the worklist
insertion and the last_progress_ts update. Specifically, the watchdog
can see a non-empty worklist (from a list_add) while reading a stale
last_progress_ts value, causing a false positive stall report.
This was confirmed by reading pool->last_progress_ts again after acquiring
pool->lock in wq_watchdog_timer_fn():
workqueue watchdog: pool 7 false positive detected!
lockless_ts=4784580465 locked_ts=4785033728
diff=453263ms worklist_empty=0
To avoid slowing down the hot path (queue_work, etc.), recheck
last_progress_ts with pool->lock held, and only when the lockless check
has already flagged a stall. This eliminates the false positive with
minimal overhead.
While we are at it, remove two extra empty lines in wq_watchdog_timer_fn().
Fixes: 82607adcf9 ("workqueue: implement lockup detector")
Cc: stable@vger.kernel.org # v4.5+
Assisted-by: claude-code:claude-opus-4-6
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
This commit is contained in:
@@ -7699,8 +7699,28 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
|
||||
else
|
||||
ts = touched;
|
||||
|
||||
/* did we stall? */
|
||||
/*
|
||||
* Did we stall?
|
||||
*
|
||||
* Do a lockless check first. On weakly ordered
|
||||
* architectures, the lockless check can observe a
|
||||
* reordering between worklist insert_work() and
|
||||
* last_progress_ts update from __queue_work(). Since
|
||||
* __queue_work() is a much hotter path than the timer
|
||||
* function, we handle false positive here by reading
|
||||
* last_progress_ts again with pool->lock held.
|
||||
*/
|
||||
if (time_after(now, ts + thresh)) {
|
||||
scoped_guard(raw_spinlock_irqsave, &pool->lock) {
|
||||
pool_ts = pool->last_progress_ts;
|
||||
if (time_after(pool_ts, touched))
|
||||
ts = pool_ts;
|
||||
else
|
||||
ts = touched;
|
||||
}
|
||||
if (!time_after(now, ts + thresh))
|
||||
continue;
|
||||
|
||||
lockup_detected = true;
|
||||
stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
|
||||
max_stall_time = max(max_stall_time, stall_time);
|
||||
@@ -7712,8 +7732,6 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
|
||||
pr_cont_pool_info(pool);
|
||||
pr_cont(" stuck for %us!\n", stall_time);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
if (lockup_detected)
|
||||
|
||||
Reference in New Issue
Block a user