Commit 54129104 authored by Thomas Gleixner, committed by Ingo Molnar
Browse files

rseq: Expose lightweight statistics in debugfs



Analyzing the call frequency without actually using tracing is helpful for
analysis of this infrastructure. The overhead is minimal as it just
increments a per CPU counter associated to each operation.

The debugfs readout provides a racy sum of all counters.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.027916598@linutronix.de
parent dab34475
Loading
Loading
Loading
Loading
+0 −16
Original line number Diff line number Diff line
@@ -29,21 +29,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t)
	}
}

/*
 * Final rseq step on the way back to user mode: sanity check and wipe the
 * pending event state of the current task.
 */
static __always_inline void rseq_exit_to_user_mode(void)
{
	/* A still pending sched_switch event at this point is a bug */
	if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
		WARN_ON_ONCE(current->rseq.event.sched_switch);

	/*
	 * An interrupt which did not end up in a schedule leaves event
	 * bits (notably user_irq) set, because the rseq processing which
	 * would have cleared them never ran. Wipe them unconditionally.
	 */
	current->rseq.event.events = 0;
}

/*
 * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode,
 * which clears TIF_NOTIFY_RESUME. To avoid updating user space RSEQ in
@@ -92,7 +77,6 @@ static inline void rseq_sched_switch_event(struct task_struct *t) { }
/* Empty stubs for !CONFIG_RSEQ builds so that callers need no #ifdefs */
static inline void rseq_virt_userspace_exit(void) { }
static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { }
static inline void rseq_execve(struct task_struct *t) { }
static inline void rseq_exit_to_user_mode(void) { }
#endif  /* !CONFIG_RSEQ */

#ifdef CONFIG_DEBUG_RSEQ
+49 −0
Original line number Diff line number Diff line
@@ -2,6 +2,37 @@
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

/**
 * struct rseq_stats - Per CPU event counters for RSEQ operations
 * @exit:	Invocations of rseq_exit_to_user_mode()
 * @signal:	Invocations of __rseq_handle_notify_resume() with a non-NULL
 *		ksig argument (counted after the PF_EXITING check)
 * @slowpath:	Invocations of __rseq_handle_notify_resume() with a NULL
 *		ksig argument (counted after the PF_EXITING check)
 * @ids:	Invocations of rseq_update_cpu_node_id()
 * @cs:		Invocations of rseq_ip_fixup()
 * @clear:	rseq_ip_fixup() found the IP outside of the critical section
 *		and went on to clear the user space rseq_cs pointer
 * @fixup:	rseq_ip_fixup() cleared the rseq_cs pointer successfully and
 *		redirects the IP to the abort IP
 *
 * The counters are incremented via rseq_stat_inc() and summed over all
 * possible CPUs by the debugfs readout.
 */
struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
};

/* One instance per CPU; the definition lives next to the debugfs readout */
DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * rseq_stat_inc - Increment one member of the per CPU rseq_stats
 * @which:	The counter to increment, e.g. rseq_stats.exit
 *
 * Slow path has interrupts and preemption enabled, so it uses
 * this_cpu_inc(). The fast path runs with interrupts disabled, so there
 * is no point in having the preemption checks implied in __this_cpu_inc()
 * for every operation; raw_cpu_inc() is used there instead.
 *
 * Code which builds the slow path has to define RSEQ_BUILD_SLOW_PATH
 * before including this header to select the slow path variant.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
/* Statistics are disabled: the counter increment expands to an empty statement */
#define rseq_stat_inc(x)	do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

#ifdef CONFIG_RSEQ
#include <linux/rseq.h>

@@ -39,8 +70,26 @@ static __always_inline void rseq_note_user_irq_entry(void)
		current->rseq.event.user_irq = true;
}

/*
 * Final rseq step on the way back to user mode: account the exit, sanity
 * check and wipe the pending event state of the current task.
 */
static __always_inline void rseq_exit_to_user_mode(void)
{
	rseq_stat_inc(rseq_stats.exit);

	/* A still pending sched_switch event at this point is a bug */
	if (IS_ENABLED(CONFIG_DEBUG_RSEQ))
		WARN_ON_ONCE(current->rseq.event.sched_switch);

	/*
	 * An interrupt which did not end up in a schedule leaves event
	 * bits (notably user_irq) set, because the rseq processing which
	 * would have cleared them never ran. Wipe them unconditionally.
	 */
	current->rseq.event.events = 0;
}

#else /* CONFIG_RSEQ */
/* Empty stubs for !CONFIG_RSEQ builds so that the entry code needs no #ifdefs */
static inline void rseq_note_user_irq_entry(void) { }
static inline void rseq_exit_to_user_mode(void) { }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */
+12 −0
Original line number Diff line number Diff line
@@ -1913,6 +1913,18 @@ config RSEQ

	  If unsure, say Y.

config RSEQ_STATS
	default n
	bool "Enable lightweight statistics of restartable sequences" if EXPERT
	depends on RSEQ && DEBUG_FS
	help
	  Enable lightweight counters which expose information about the
	  frequency of RSEQ operations via debugfs. Mostly interesting for
	  kernel debugging or performance analysis. While lightweight, it
	  still adds code to the user/kernel mode transitions.

	  If unsure, say N.

config DEBUG_RSEQ
	default n
	bool "Enable debugging of rseq() system call" if EXPERT
+72 −7
Original line number Diff line number Diff line
@@ -67,12 +67,16 @@
 *   F1. <failure>
 */

/* Required to select the proper per_cpu ops for rseq_stat_inc() */
#define RSEQ_BUILD_SLOW_PATH

#include <linux/debugfs.h>
#include <linux/ratelimit.h>
#include <linux/rseq_entry.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/rseq_entry.h>
#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/ratelimit.h>
#include <asm/ptrace.h>

#define CREATE_TRACE_POINTS
@@ -108,6 +112,56 @@ void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
}
#endif /* CONFIG_TRACEPOINTS */

#ifdef CONFIG_RSEQ_STATS
DEFINE_PER_CPU(struct rseq_stats, rseq_stats);

static int rseq_debug_show(struct seq_file *m, void *p)
{
	struct rseq_stats stats = { };
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		stats.exit	+= data_race(per_cpu(rseq_stats.exit, cpu));
		stats.signal	+= data_race(per_cpu(rseq_stats.signal, cpu));
		stats.slowpath	+= data_race(per_cpu(rseq_stats.slowpath, cpu));
		stats.ids	+= data_race(per_cpu(rseq_stats.ids, cpu));
		stats.cs	+= data_race(per_cpu(rseq_stats.cs, cpu));
		stats.clear	+= data_race(per_cpu(rseq_stats.clear, cpu));
		stats.fixup	+= data_race(per_cpu(rseq_stats.fixup, cpu));
	}

	seq_printf(m, "exit:   %16lu\n", stats.exit);
	seq_printf(m, "signal: %16lu\n", stats.signal);
	seq_printf(m, "slowp:  %16lu\n", stats.slowpath);
	seq_printf(m, "ids:    %16lu\n", stats.ids);
	seq_printf(m, "cs:     %16lu\n", stats.cs);
	seq_printf(m, "clear:  %16lu\n", stats.clear);
	seq_printf(m, "fixup:  %16lu\n", stats.fixup);
	return 0;
}

/*
 * Open callback: bind the file to rseq_debug_show() through single_open()
 * and hand the inode's private pointer through to it. Returns whatever
 * single_open() returns.
 */
static int rseq_debug_open(struct inode *inode, struct file *file)
{
	void *priv = inode->i_private;

	return single_open(file, rseq_debug_show, priv);
}

/*
 * File operations of the debugfs "stats" file. Opening goes through
 * single_open() in rseq_debug_open(); read, seek and release use the
 * matching seq_file helpers. No write operation is provided.
 */
static const struct file_operations dfs_ops = {
	.open		= rseq_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

/*
 * Boot time setup of the statistics readout: create the "rseq" debugfs
 * directory (NULL parent) and the read-only "stats" file inside it.
 *
 * The results of the debugfs calls are deliberately not inspected and the
 * initcall always reports success.
 */
static int __init rseq_debugfs_init(void)
{
	struct dentry *dir;

	dir = debugfs_create_dir("rseq", NULL);
	debugfs_create_file("stats", 0444, dir, NULL, &dfs_ops);

	return 0;
}
__initcall(rseq_debugfs_init);
#endif /* CONFIG_RSEQ_STATS */

#ifdef CONFIG_DEBUG_RSEQ
static struct rseq *rseq_kernel_fields(struct task_struct *t)
{
@@ -187,12 +241,13 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
	u32 node_id = cpu_to_node(cpu_id);
	u32 mm_cid = task_mm_cid(t);

	/*
	 * Validate read-only rseq fields.
	 */
	rseq_stat_inc(rseq_stats.ids);

	/* Validate read-only rseq fields on debug kernels */
	if (rseq_validate_ro_fields(t))
		goto efault;
	WARN_ON_ONCE((int) mm_cid < 0);

	if (!user_write_access_begin(rseq, t->rseq.len))
		goto efault;

@@ -403,6 +458,8 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
	struct rseq_cs rseq_cs;
	int ret;

	rseq_stat_inc(rseq_stats.cs);

	ret = rseq_get_rseq_cs(t, &rseq_cs);
	if (ret)
		return ret;
@@ -412,8 +469,10 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
	 * If not nested over a rseq critical section, restart is useless.
	 * Clear the rseq_cs pointer and return.
	 */
	if (!in_rseq_cs(ip, &rseq_cs))
	if (!in_rseq_cs(ip, &rseq_cs)) {
		rseq_stat_inc(rseq_stats.clear);
		return clear_rseq_cs(t->rseq.usrptr);
	}
	ret = rseq_check_flags(t, rseq_cs.flags);
	if (ret < 0)
		return ret;
@@ -422,6 +481,7 @@ static int rseq_ip_fixup(struct pt_regs *regs, bool abort)
	ret = clear_rseq_cs(t->rseq.usrptr);
	if (ret)
		return ret;
	rseq_stat_inc(rseq_stats.fixup);
	trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
			    rseq_cs.abort_ip);
	instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
@@ -462,6 +522,11 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
	if (unlikely(t->flags & PF_EXITING))
		return;

	if (ksig)
		rseq_stat_inc(rseq_stats.signal);
	else
		rseq_stat_inc(rseq_stats.slowpath);

	/*
	 * Read and clear the event pending bit first. If the task
	 * was not preempted or migrated or a signal is on the way,