Merge tag 'wq-for-6.20' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq (9bdc6489) · Commits · git / linux-net

Documentation/admin-guide/kernel-parameters.txt

+10 −1

Original line number	Diff line number	Diff line
		@@ -8376,7 +8376,16 @@ Kernel parameters
		CONFIG_WQ_WATCHDOG. It sets the number times of the
		stall to trigger panic.

		The default is 0, which disables the panic on stall.
		The default is set by CONFIG_BOOTPARAM_WQ_STALL_PANIC,
		which is 0 (disabled) if not configured.

		workqueue.panic_on_stall_time=<uint>
		Panic when a workqueue stall has been continuous for
		the specified number of seconds. Unlike panic_on_stall
		which counts accumulated stall events, this triggers
		based on the duration of a single continuous stall.

		The default is 0, which disables the time-based panic.

		workqueue.cpu_intensive_thresh_us=
		Per-cpu work items which run for longer than this

kernel/workqueue.c

+115 −34

Original line number	Diff line number	Diff line
		@@ -117,6 +117,8 @@ enum wq_internal_consts {
		MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */
		CREATE_COOLDOWN = HZ, /* time to breath after fail */

		RESCUER_BATCH = 16, /* process items per turn */

		/*
		* Rescue workers are used only on emergencies and shared by
		* all cpus. Give MIN_NICE.
		@@ -286,6 +288,7 @@ struct pool_workqueue {
		struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */
		struct list_head pwqs_node; /* WR: node on wq->pwqs */
		struct list_head mayday_node; /* MD: node on wq->maydays */
		struct work_struct mayday_cursor; /* L: cursor on pool->worklist */

		u64 stats[PWQ_NR_STATS];

		@@ -1120,6 +1123,12 @@ static struct worker find_worker_executing_work(struct worker_pool pool,
		return NULL;
		}

		static void mayday_cursor_func(struct work_struct *work)
		{
		/* should not be processed, only for marking position */
		BUG();
		}

		/**
		* move_linked_works - move linked works to a list
		* @work: start of series of works to be scheduled
		@@ -1182,6 +1191,16 @@ static bool assign_work(struct work_struct work, struct worker worker,

		lockdep_assert_held(&pool->lock);

		/* The cursor work should not be processed */
		if (unlikely(work->func == mayday_cursor_func)) {
		/* only worker_thread() can possibly take this branch */
		WARN_ON_ONCE(worker->rescue_wq);
		if (nextp)
		*nextp = list_next_entry(work, entry);
		list_del_init(&work->entry);
		return false;
		}

		/*
		* A single work shouldn't be executed concurrently by multiple workers.
		* __queue_work() ensures that @work doesn't jump to a different pool
		@@ -2976,9 +2995,8 @@ static void idle_cull_fn(struct work_struct *work)
		reap_dying_workers(&cull_list);
		}

		static void send_mayday(struct work_struct *work)
		static void send_mayday(struct pool_workqueue *pwq)
		{
		struct pool_workqueue *pwq = get_work_pwq(work);
		struct workqueue_struct *wq = pwq->wq;

		lockdep_assert_held(&wq_mayday_lock);
		@@ -3016,7 +3034,7 @@ static void pool_mayday_timeout(struct timer_list *t)
		* rescuers.
		*/
		list_for_each_entry(work, &pool->worklist, entry)
		send_mayday(work);
		send_mayday(get_work_pwq(work));
		}

		raw_spin_unlock(&wq_mayday_lock);
		@@ -3440,22 +3458,57 @@ static int worker_thread(void *__worker)
		static bool assign_rescuer_work(struct pool_workqueue pwq, struct worker rescuer)
		{
		struct worker_pool *pool = pwq->pool;
		struct work_struct *cursor = &pwq->mayday_cursor;
		struct work_struct work, n;

		/* need rescue? */
		if (!pwq->nr_active \|\| !need_to_create_worker(pool))
		/* have work items to rescue? */
		if (!pwq->nr_active)
		return false;

		/* need rescue? */
		if (!need_to_create_worker(pool)) {
		/*
		* Slurp in all works issued via this workqueue and
		* process'em.
		* The pool has idle workers and doesn't need the rescuer, so it
		* could simply return false here.
		*
		* However, the memory pressure might not be fully relieved.
		* In PERCPU pool with concurrency enabled, having idle workers
		* does not necessarily mean memory pressure is gone; it may
		* simply mean regular workers have woken up, completed their
		* work, and gone idle again due to concurrency limits.
		*
		* In this case, those working workers may later sleep again,
		* the pool may run out of idle workers, and it will have to
		* allocate new ones and wait for the timer to send mayday,
		* causing unnecessary delay - especially if memory pressure
		* was never resolved throughout.
		*
		* Do more work if memory pressure is still on to reduce
		* relapse, using (pool->flags & POOL_MANAGER_ACTIVE), though
		* not precisely, unless there are other PWQs needing help.
		*/
		list_for_each_entry_safe(work, n, &pool->worklist, entry) {
		if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n))
		if (!(pool->flags & POOL_MANAGER_ACTIVE) \|\|
		!list_empty(&pwq->wq->maydays))
		return false;
		}

		/* search from the start or cursor if available */
		if (list_empty(&cursor->entry))
		work = list_first_entry(&pool->worklist, struct work_struct, entry);
		else
		work = list_next_entry(cursor, entry);

		/* find the next work item to rescue */
		list_for_each_entry_safe_from(work, n, &pool->worklist, entry) {
		if (get_work_pwq(work) == pwq && assign_work(work, rescuer, &n)) {
		pwq->stats[PWQ_STAT_RESCUED]++;
		/* put the cursor for next search */
		list_move_tail(&cursor->entry, &n->entry);
		return true;
		}
		}

		return !list_empty(&rescuer->scheduled);
		return false;
		}

		/**
		@@ -3512,6 +3565,7 @@ static int rescuer_thread(void *__rescuer)
		struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
		struct pool_workqueue, mayday_node);
		struct worker_pool *pool = pwq->pool;
		unsigned int count = 0;

		__set_current_state(TASK_RUNNING);
		list_del_init(&pwq->mayday_node);
		@@ -3524,31 +3578,27 @@ static int rescuer_thread(void *__rescuer)

		WARN_ON_ONCE(!list_empty(&rescuer->scheduled));

		if (assign_rescuer_work(pwq, rescuer)) {
		while (assign_rescuer_work(pwq, rescuer)) {
		process_scheduled_works(rescuer);

		/*
		* The above execution of rescued work items could
		* have created more to rescue through
		* pwq_activate_first_inactive() or chained
		* queueing. Let's put @pwq back on mayday list so
		* that such back-to-back work items, which may be
		* being used to relieve memory pressure, don't
		* incur MAYDAY_INTERVAL delay inbetween.
		* If the per-turn work item limit is reached and other
		* PWQs are in mayday, requeue mayday for this PWQ and
		* let the rescuer handle the other PWQs first.
		*/
		if (pwq->nr_active && need_to_create_worker(pool)) {
		if (++count > RESCUER_BATCH && !list_empty(&pwq->wq->maydays) &&
		pwq->nr_active && need_to_create_worker(pool)) {
		raw_spin_lock(&wq_mayday_lock);
		/*
		* Queue iff somebody else hasn't queued it already.
		*/
		if (list_empty(&pwq->mayday_node)) {
		get_pwq(pwq);
		list_add_tail(&pwq->mayday_node, &wq->maydays);
		}
		send_mayday(pwq);
		raw_spin_unlock(&wq_mayday_lock);
		break;
		}
		}

		/* The cursor can not be left behind without the rescuer watching it. */
		if (!list_empty(&pwq->mayday_cursor.entry) && list_empty(&pwq->mayday_node))
		list_del_init(&pwq->mayday_cursor.entry);

		/*
		* Leave this pool. Notify regular workers; otherwise, we end up
		* with 0 concurrency and stalling the execution.
		@@ -5167,6 +5217,19 @@ static void init_pwq(struct pool_workqueue pwq, struct workqueue_struct wq,
		INIT_LIST_HEAD(&pwq->pwqs_node);
		INIT_LIST_HEAD(&pwq->mayday_node);
		kthread_init_work(&pwq->release_work, pwq_release_workfn);

		/*
		* Set the dummy cursor work with valid function and get_work_pwq().
		*
		* The cursor work should only be in the pwq->pool->worklist, and
		* should not be treated as a processable work item.
		*
		* WORK_STRUCT_PENDING and WORK_STRUCT_INACTIVE just make it less
		* surprise for kernel debugging tools and reviewers.
		*/
		INIT_WORK(&pwq->mayday_cursor, mayday_cursor_func);
		atomic_long_set(&pwq->mayday_cursor.data, (unsigned long)pwq \|
		WORK_STRUCT_PENDING \| WORK_STRUCT_PWQ \| WORK_STRUCT_INACTIVE);
		}

		/* sync @pwq with the current state of its associated wq and link it */
		@@ -7508,9 +7571,13 @@ static struct timer_list wq_watchdog_timer;
		static unsigned long wq_watchdog_touched = INITIAL_JIFFIES;
		static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES;

		static unsigned int wq_panic_on_stall;
		static unsigned int wq_panic_on_stall = CONFIG_BOOTPARAM_WQ_STALL_PANIC;
		module_param_named(panic_on_stall, wq_panic_on_stall, uint, 0644);

		static unsigned int wq_panic_on_stall_time;
		module_param_named(panic_on_stall_time, wq_panic_on_stall_time, uint, 0644);
		MODULE_PARM_DESC(panic_on_stall_time, "Panic if stall exceeds this many seconds (0=disabled)");

		/*
		* Show workers that might prevent the processing of pending work items.
		* The only candidates are CPU-bound workers in the running state.
		@@ -7562,14 +7629,25 @@ static void show_cpu_pools_hogs(void)
		rcu_read_unlock();
		}

		static void panic_on_wq_watchdog(void)
		/*
		* It triggers a panic in two scenarios: when the total number of stalls
		* exceeds a threshold, and when a stall lasts longer than
		* wq_panic_on_stall_time
		*/
		static void panic_on_wq_watchdog(unsigned int stall_time_sec)
		{
		static unsigned int wq_stall;

		if (wq_panic_on_stall) {
		wq_stall++;
		BUG_ON(wq_stall >= wq_panic_on_stall);
		if (wq_stall >= wq_panic_on_stall)
		panic("workqueue: %u stall(s) exceeded threshold %u\n",
		wq_stall, wq_panic_on_stall);
		}

		if (wq_panic_on_stall_time && stall_time_sec >= wq_panic_on_stall_time)
		panic("workqueue: stall lasted %us, exceeding threshold %us\n",
		stall_time_sec, wq_panic_on_stall_time);
		}

		static void wq_watchdog_reset_touched(void)
		@@ -7584,10 +7662,12 @@ static void wq_watchdog_reset_touched(void)
		static void wq_watchdog_timer_fn(struct timer_list *unused)
		{
		unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
		unsigned int max_stall_time = 0;
		bool lockup_detected = false;
		bool cpu_pool_stall = false;
		unsigned long now = jiffies;
		struct worker_pool *pool;
		unsigned int stall_time;
		int pi;

		if (!thresh)
		@@ -7621,14 +7701,15 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
		/* did we stall? */
		if (time_after(now, ts + thresh)) {
		lockup_detected = true;
		stall_time = jiffies_to_msecs(now - pool_ts) / 1000;
		max_stall_time = max(max_stall_time, stall_time);
		if (pool->cpu >= 0 && !(pool->flags & POOL_BH)) {
		pool->cpu_stall = true;
		cpu_pool_stall = true;
		}
		pr_emerg("BUG: workqueue lockup - pool");
		pr_cont_pool_info(pool);
		pr_cont(" stuck for %us!\n",
		jiffies_to_msecs(now - pool_ts) / 1000);
		pr_cont(" stuck for %us!\n", stall_time);
		}


		@@ -7641,7 +7722,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
		show_cpu_pools_hogs();

		if (lockup_detected)
		panic_on_wq_watchdog();
		panic_on_wq_watchdog(max_stall_time);

		wq_watchdog_reset_touched();
		mod_timer(&wq_watchdog_timer, jiffies + thresh);

lib/Kconfig.debug

+23 −0

Original line number	Diff line number	Diff line
		@@ -1322,6 +1322,29 @@ config WQ_WATCHDOG
		state. This can be configured through kernel parameter
		"workqueue.watchdog_thresh" and its sysfs counterpart.

		config BOOTPARAM_WQ_STALL_PANIC
		int "Panic on Nth workqueue stall"
		default 0
		range 0 100
		depends on WQ_WATCHDOG
		help
		Set the number of workqueue stalls to trigger a kernel panic.
		A workqueue stall occurs when a worker pool doesn't make forward
		progress on a pending work item for over 30 seconds (configurable
		using the workqueue.watchdog_thresh parameter).

		If n = 0, the kernel will not panic on stall. If n > 0, the kernel
		will panic after n stall warnings.

		The panic can be used in combination with panic_timeout,
		to cause the system to reboot automatically after a
		stall has been detected. This feature is useful for
		high-availability systems that have uptime guarantees and
		where a stall must be resolved ASAP.

		This setting can be overridden at runtime via the
		workqueue.panic_on_stall kernel parameter.

		config WQ_CPU_INTENSIVE_REPORT
		bool "Report per-cpu work items which hog CPU for too long"
		depends on DEBUG_KERNEL