Commit f24dc33f authored by Linus Torvalds
Browse files

Merge tag 'timers-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull timer subsystem updates from Ingo Molnar:

 - Various preparatory cleanups & enhancements of the timer-wheel code,
   in preparation for the WIP 'pull timers at expiry' timer migration
   model series (which will replace the current 'push timers at enqueue'
   migration model), by Anna-Maria Behnsen:

      - Update comments and clean up confusing variable names

      - Add debug check to warn about time travel

      - Improve/expand timer-wheel tracepoints

      - Optimize away unnecessary IPIs for deferrable timers

      - Restructure & clean up next_expiry_recalc()

      - Clean up forward_timer_base()

      - Introduce __forward_timer_base() and use it to simplify and
        micro-optimize get_next_timer_interrupt()

 - Restructure get_next_timer_interrupt()'s idle logic for better
   readability and to enable a minor optimization.

 - Fix the nextevt calculation when no timers are pending

 - Fix the sysfs_get_uname() prototype declaration

* tag 'timers-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  timers: Fix nextevt calculation when no timers are pending
  timers: Rework idle logic
  timers: Use already existing function for forwarding timer base
  timers: Split out forward timer base functionality
  timers: Clarify check in forward_timer_base()
  timers: Move store of next event into __next_timer_interrupt()
  timers: Do not IPI for deferrable timers
  tracing/timers: Add tracepoint for tracking timer base is_idle flag
  tracing/timers: Enhance timer_start tracepoint
  tick-sched: Warn when next tick seems to be in the past
  tick/sched: Cleanup confusing variables
  tick-sched: Fix function names in comments
  time: Make sysfs_get_uname() function visible in header
parents 46a08b4d da65f29d
Loading
Loading
Loading
Loading
+30 −10
Original line number Diff line number Diff line
@@ -47,21 +47,20 @@ DEFINE_EVENT(timer_class, timer_init,
/**
 * timer_start - called when the timer is started
 * @timer:		pointer to struct timer_list
 * @expires:	the timer's expiry time
 * @flags:	the timer's flags
 * @bucket_expiry:	the bucket expiry time
 */
TRACE_EVENT(timer_start,

	TP_PROTO(struct timer_list *timer,
		unsigned long expires,
		unsigned int flags),
		unsigned long bucket_expiry),

	TP_ARGS(timer, expires, flags),
	TP_ARGS(timer, bucket_expiry),

	TP_STRUCT__entry(
		__field( void *,	timer		)
		__field( void *,	function	)
		__field( unsigned long,	expires		)
		__field( unsigned long,	bucket_expiry	)
		__field( unsigned long,	now		)
		__field( unsigned int,	flags		)
	),
@@ -69,15 +68,16 @@ TRACE_EVENT(timer_start,
	TP_fast_assign(
		__entry->timer		= timer;
		__entry->function	= timer->function;
		__entry->expires	= expires;
		__entry->expires	= timer->expires;
		__entry->bucket_expiry	= bucket_expiry;
		__entry->now		= jiffies;
		__entry->flags		= flags;
		__entry->flags		= timer->flags;
	),

	TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s",
	TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s",
		  __entry->timer, __entry->function, __entry->expires,
		  (long)__entry->expires - __entry->now,
		  __entry->flags & TIMER_CPUMASK,
		  __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK,
		  __entry->flags >> TIMER_ARRAYSHIFT,
		  decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK))
);
@@ -142,6 +142,26 @@ DEFINE_EVENT(timer_class, timer_cancel,
	TP_ARGS(timer)
);

/**
 * timer_base_idle - called when the is_idle flag of a timer base changes
 * @is_idle:	the new value of the is_idle flag
 * @cpu:	the CPU number supplied by the caller for the affected base
 */
TRACE_EVENT(timer_base_idle,

	TP_PROTO(bool is_idle, unsigned int cpu),

	TP_ARGS(is_idle, cpu),

	TP_STRUCT__entry(
		__field( bool,		is_idle	)
		__field( unsigned int,	cpu	)
	),

	TP_fast_assign(
		__entry->is_idle	= is_idle;
		__entry->cpu		= cpu;
	),

	/* cpu is stored as unsigned int, so print it with %u like timer_start does */
	TP_printk("is_idle=%d cpu=%u",
		  __entry->is_idle, __entry->cpu)
);

#define decode_clockid(type)						\
	__print_symbolic(type,						\
		{ CLOCK_REALTIME,	"CLOCK_REALTIME"	},	\
+2 −1
Original line number Diff line number Diff line
@@ -56,7 +56,6 @@ extern int clockevents_program_event(struct clock_event_device *dev,
				     ktime_t expires, bool force);
extern void clockevents_handle_noop(struct clock_event_device *dev);
extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);

/* Broadcasting support */
# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
@@ -197,3 +196,5 @@ void hrtimers_resume_local(void);
#else
#define JIFFIES_SHIFT	8
#endif

extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+14 −11
Original line number Diff line number Diff line
@@ -839,6 +839,10 @@ static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
		ts->next_timer = next_tick;
	}

	/* Make sure next_tick is never before basemono! */
	if (WARN_ON_ONCE(basemono > next_tick))
		next_tick = basemono;

	/*
	 * If the tick is due in the next period, keep it ticking or
	 * force prod the timer.
@@ -887,7 +891,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
	u64 basemono = ts->timer_expires_base;
	u64 expires = ts->timer_expires;
	ktime_t tick = expires;

	/* Make sure we won't be trying to stop it twice in a row. */
	ts->timer_expires_base = 0;
@@ -910,7 +913,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
	/* Skip reprogram of event if it's not changed */
	if (ts->tick_stopped && (expires == ts->next_tick)) {
		/* Sanity check: make sure clockevent is actually programmed */
		if (tick == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
		if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
			return;

		WARN_ON_ONCE(1);
@@ -920,11 +923,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
	}

	/*
	 * nohz_stop_sched_tick() can be called several times before
	 * nohz_restart_sched_tick() is called. This happens when
	 * interrupts arrive which do not cause a reschedule. In the
	 * first call we save the current tick time, so we can restart
	 * the scheduler tick in nohz_restart_sched_tick().
	 * tick_nohz_stop_tick() can be called several times before
	 * tick_nohz_restart_sched_tick() is called. This happens when
	 * interrupts arrive which do not cause a reschedule. In the first
	 * call we save the current tick time, so we can restart the
	 * scheduler tick in tick_nohz_restart_sched_tick().
	 */
	if (!ts->tick_stopped) {
		calc_load_nohz_start();
@@ -935,7 +938,7 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
		trace_tick_stop(1, TICK_DEP_MASK_NONE);
	}

	ts->next_tick = tick;
	ts->next_tick = expires;

	/*
	 * If the expiration time == KTIME_MAX, then we simply stop
@@ -950,11 +953,11 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
	}

	if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
		hrtimer_start(&ts->sched_timer, tick,
		hrtimer_start(&ts->sched_timer, expires,
			      HRTIMER_MODE_ABS_PINNED_HARD);
	} else {
		hrtimer_set_expires(&ts->sched_timer, tick);
		tick_program_event(tick, 1);
		hrtimer_set_expires(&ts->sched_timer, expires);
		tick_program_event(expires, 1);
	}
}

+63 −47
Original line number Diff line number Diff line
@@ -571,18 +571,15 @@ static int calc_wheel_index(unsigned long expires, unsigned long clk,
static void
trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
{
	if (!is_timers_nohz_active())
		return;

	/*
	 * TODO: This wants some optimizing similar to the code below, but we
	 * will do that when we switch from push to pull for deferrable timers.
	 * Deferrable timers do not prevent the CPU from entering dynticks and
	 * are not taken into account on the idle/nohz_full path. An IPI when a
	 * new deferrable timer is enqueued will wake up the remote CPU but
	 * nothing will be done with the deferrable timer base. Therefore skip
	 * the remote IPI for deferrable timers completely.
	 */
	if (timer->flags & TIMER_DEFERRABLE) {
		if (tick_nohz_full_cpu(base->cpu))
			wake_up_nohz_cpu(base->cpu);
	if (!is_timers_nohz_active() || timer->flags & TIMER_DEFERRABLE)
		return;
	}

	/*
	 * We might have to IPI the remote CPU if the base is idle and the
@@ -606,7 +603,7 @@ static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
	__set_bit(idx, base->pending_map);
	timer_set_idx(timer, idx);

	trace_timer_start(timer, timer->expires, timer->flags);
	trace_timer_start(timer, bucket_expiry);

	/*
	 * Check whether this is the new first expiring timer. The
@@ -942,31 +939,34 @@ get_target_base(struct timer_base *base, unsigned tflags)
	return get_timer_this_cpu_base(tflags);
}

static inline void forward_timer_base(struct timer_base *base)
static inline void __forward_timer_base(struct timer_base *base,
					unsigned long basej)
{
	unsigned long jnow = READ_ONCE(jiffies);

	/*
	 * No need to forward if we are close enough below jiffies.
	 * Also while executing timers, base->clk is 1 offset ahead
	 * of jiffies to avoid endless requeuing to current jiffies.
	 * Check whether we can forward the base. We can only do that when
	 * @basej is past base->clk otherwise we might rewind base->clk.
	 */
	if ((long)(jnow - base->clk) < 1)
	if (time_before_eq(basej, base->clk))
		return;

	/*
	 * If the next expiry value is > jiffies, then we fast forward to
	 * jiffies otherwise we forward to the next expiry value.
	 */
	if (time_after(base->next_expiry, jnow)) {
		base->clk = jnow;
	if (time_after(base->next_expiry, basej)) {
		base->clk = basej;
	} else {
		if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
			return;
		base->clk = base->next_expiry;
	}

}

/*
 * Convenience wrapper: take one READ_ONCE() snapshot of jiffies and hand it
 * to __forward_timer_base() as the reference time for @base.
 */
static inline void forward_timer_base(struct timer_base *base)
{
	unsigned long jnow = READ_ONCE(jiffies);

	__forward_timer_base(base, jnow);
}

/*
 * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
@@ -1803,8 +1803,10 @@ static int next_pending_bucket(struct timer_base *base, unsigned offset,
/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 *
 * Store next expiry time in base->next_expiry.
 */
static unsigned long __next_timer_interrupt(struct timer_base *base)
static void next_expiry_recalc(struct timer_base *base)
{
	unsigned long clk, next, adj;
	unsigned lvl, offset = 0;
@@ -1870,10 +1872,9 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
		clk += adj;
	}

	base->next_expiry = next;
	base->next_expiry_recalc = false;
	base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);

	return next;
}

#ifdef CONFIG_NO_HZ_COMMON
@@ -1921,8 +1922,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
	unsigned long nextevt = basej + NEXT_TIMER_MAX_DELTA;
	u64 expires = KTIME_MAX;
	unsigned long nextevt;
	bool was_idle;

	/*
	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1933,37 +1935,44 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)

	raw_spin_lock(&base->lock);
	if (base->next_expiry_recalc)
		base->next_expiry = __next_timer_interrupt(base);
	nextevt = base->next_expiry;
		next_expiry_recalc(base);

	/*
	 * We have a fresh next event. Check whether we can forward the
	 * base. We can only do that when @basej is past base->clk
	 * otherwise we might rewind base->clk.
	 * base.
	 */
	if (time_after(basej, base->clk)) {
		if (time_after(nextevt, basej))
			base->clk = basej;
		else if (time_after(nextevt, base->clk))
			base->clk = nextevt;
	}
	__forward_timer_base(base, basej);

	if (time_before_eq(nextevt, basej)) {
		expires = basem;
		base->is_idle = false;
	} else {
		if (base->timers_pending)
	if (base->timers_pending) {
		nextevt = base->next_expiry;

		/* If we missed a tick already, force 0 delta */
		if (time_before(nextevt, basej))
			nextevt = basej;
		expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
	} else {
		/*
		 * If we expect to sleep more than a tick, mark the base idle.
		 * Also the tick is stopped so any added timer must forward
		 * the base clk itself to keep granularity small. This idle
		 * logic is only maintained for the BASE_STD base, deferrable
		 * timers may still see large granularity skew (by design).
		 * Move next_expiry for the empty base into the future to
		 * prevent an unnecessary raise of the timer softirq when the
		 * next_expiry value will be reached even if there is no timer
		 * pending.
		 */
		if ((expires - basem) > TICK_NSEC)
			base->is_idle = true;
		base->next_expiry = nextevt;
	}

	/*
	 * Base is idle if the next event is more than a tick away.
	 *
	 * If the base is marked idle then any timer add operation must forward
	 * the base clk itself to keep granularity small. This idle logic is
	 * only maintained for the BASE_STD base, deferrable timers may still
	 * see large granularity skew (by design).
	 */
	was_idle = base->is_idle;
	base->is_idle = time_after(nextevt, basej + 1);
	if (was_idle != base->is_idle)
		trace_timer_base_idle(base->is_idle, base->cpu);

	raw_spin_unlock(&base->lock);

	return cmp_next_hrtimer_event(basem, expires);
@@ -1984,7 +1993,10 @@ void timer_clear_idle(void)
	 * sending the IPI a few instructions smaller for the cost of taking
	 * the lock in the exit from idle path.
	 */
	if (base->is_idle) {
		base->is_idle = false;
		trace_timer_base_idle(false, smp_processor_id());
	}
}
#endif

@@ -2015,8 +2027,12 @@ static inline void __run_timers(struct timer_base *base)
		 */
		WARN_ON_ONCE(!levels && !base->next_expiry_recalc
			     && base->timers_pending);
		/*
		 * While executing timers, base->clk is set 1 offset ahead of
		 * jiffies to avoid endless requeuing to current jiffies.
		 */
		base->clk++;
		base->next_expiry = __next_timer_interrupt(base);
		next_expiry_recalc(base);

		while (levels--)
			expire_timers(base, heads + levels);