Commit b96e7a5f authored by Joel Fernandes (Google)'s avatar Joel Fernandes (Google) Committed by Frederic Weisbecker
Browse files

rcu/tree: Defer setting of jiffies during stall reset

There are instances where rcu_cpu_stall_reset() is called when jiffies
did not get a chance to update for a long time. Before jiffies is
updated, the CPU stall detector can go off triggering false-positives
where a just-started grace period appears to be ages old. In the past,
we disabled stall detection in rcu_cpu_stall_reset() however this got
changed [1]. This is resulting in false-positives in KGDB usecase [2].

Fix this by deferring the update of jiffies to the third run of the FQS
loop. This is more robust, as, even if rcu_cpu_stall_reset() is called
just before jiffies is read, we would end up pushing out the jiffies
read by 3 more FQS loops. Meanwhile the CPU stall detection will be
delayed and we will not get any false positives.

[1] https://lore.kernel.org/all/20210521155624.174524-2-senozhatsky@chromium.org/
[2] https://lore.kernel.org/all/20230814020045.51950-2-chenhuacai@loongson.cn/



Tested with rcutorture.cpu_stall option as well to verify stall behavior
with/without patch.

Tested-by: default avatarHuacai Chen <chenhuacai@loongson.cn>
Reported-by: default avatarBinbin Zhou <zhoubinbin@loongson.cn>
Closes: https://lore.kernel.org/all/20230814020045.51950-2-chenhuacai@loongson.cn/


Suggested-by: default avatarPaul McKenney <paulmck@kernel.org>
Cc: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: a80be428 ("rcu: Do not disable GP stall detection in rcu_cpu_stall_reset()")
Signed-off-by: default avatarJoel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: default avatarPaul E. McKenney <paulmck@kernel.org>
Signed-off-by: default avatarFrederic Weisbecker <frederic@kernel.org>
parent 7c1b3e0c
Loading
Loading
Loading
Loading
+12 −0
Original line number Diff line number Diff line
@@ -1556,10 +1556,22 @@ static bool rcu_gp_fqs_check_wake(int *gfp)
 */
static void rcu_gp_fqs(bool first_time)
{
	int nr_fqs = READ_ONCE(rcu_state.nr_fqs_jiffies_stall);
	struct rcu_node *rnp = rcu_get_root();

	WRITE_ONCE(rcu_state.gp_activity, jiffies);
	WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);

	WARN_ON_ONCE(nr_fqs > 3);
	/* Only countdown nr_fqs for stall purposes if jiffies moves. */
	if (nr_fqs) {
		if (nr_fqs == 1) {
			WRITE_ONCE(rcu_state.jiffies_stall,
				   jiffies + rcu_jiffies_till_stall_check());
		}
		WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, --nr_fqs);
	}

	if (first_time) {
		/* Collect dyntick-idle snapshots. */
		force_qs_rnp(dyntick_save_progress_counter);
+4 −0
Original line number Diff line number Diff line
@@ -386,6 +386,10 @@ struct rcu_state {
						/*  in jiffies. */
	unsigned long jiffies_stall;		/* Time at which to check */
						/*  for CPU stalls. */
	int nr_fqs_jiffies_stall;		/* Number of fqs loops after
						 * which read jiffies and set
						 * jiffies_stall. Stall
						 * warnings disabled if !0. */
	unsigned long jiffies_resched;		/* Time at which to resched */
						/*  a reluctant CPU. */
	unsigned long n_force_qs_gpstart;	/* Snapshot of n_force_qs at */
+18 −2
Original line number Diff line number Diff line
@@ -150,12 +150,17 @@ static void panic_on_rcu_stall(void)
/**
 * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
 *
 * To perform the reset request from the caller, disable stall detection until
 * 3 fqs loops have passed. This is required to ensure a fresh jiffies is
 * loaded.  It should be safe to do from the fqs loop as enough timer
 * interrupts and context switches should have passed.
 *
 * The caller must disable hard irqs.
 */
void rcu_cpu_stall_reset(void)
{
	WRITE_ONCE(rcu_state.jiffies_stall,
		   jiffies + rcu_jiffies_till_stall_check());
	WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 3);
	WRITE_ONCE(rcu_state.jiffies_stall, ULONG_MAX);
}

//////////////////////////////////////////////////////////////////////////////
@@ -171,6 +176,7 @@ static void record_gp_stall_check_time(void)
	WRITE_ONCE(rcu_state.gp_start, j);
	j1 = rcu_jiffies_till_stall_check();
	smp_mb(); // ->gp_start before ->jiffies_stall and caller's ->gp_seq.
	WRITE_ONCE(rcu_state.nr_fqs_jiffies_stall, 0);
	WRITE_ONCE(rcu_state.jiffies_stall, j + j1);
	rcu_state.jiffies_resched = j + j1 / 2;
	rcu_state.n_force_qs_gpstart = READ_ONCE(rcu_state.n_force_qs);
@@ -726,6 +732,16 @@ static void check_cpu_stall(struct rcu_data *rdp)
	    !rcu_gp_in_progress())
		return;
	rcu_stall_kick_kthreads();

	/*
	 * Check if it was requested (via rcu_cpu_stall_reset()) that the FQS
	 * loop has to set jiffies to ensure a non-stale jiffies value. This
	 * is required to have good jiffies value after coming out of long
	 * breaks of jiffies updates. Not doing so can cause false positives.
	 */
	if (READ_ONCE(rcu_state.nr_fqs_jiffies_stall) > 0)
		return;

	j = jiffies;

	/*