Performance events updates for v6.18

Merge tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull performance events updates from Ingo Molnar:

 "Core perf code updates:

   - Convert mmap() related reference counts to refcount_t. This is in
     reaction to the recently fixed refcount bugs, which could have been
     detected earlier and could have mitigated the bug somewhat
     (Thomas Gleixner, Peter Zijlstra)

   - Clean up and simplify the callchain code, in preparation for
     sframes (Steven Rostedt, Josh Poimboeuf)

  Uprobes updates:

   - Add support to optimize usdt probes on x86-64, which gives a
     substantial speedup (Jiri Olsa)

   - Cleanups and fixes on x86 (Peter Zijlstra)

  PMU driver updates:

   - Various optimizations and fixes to the Intel PMU driver (Dapeng Mi)

  Misc cleanups and fixes:

   - Remove redundant __GFP_NOWARN (Qianfeng Rong)"

Signed-off-by: Ingo Molnar <mingo@kernel.org>

* tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
  selftests/bpf: Fix uprobe_sigill test for uprobe syscall error value
  uprobes/x86: Return error from uprobe syscall when not called from trampoline
  perf: Skip user unwind if the task is a kernel thread
  perf: Simplify get_perf_callchain() user logic
  perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of current->mm == NULL
  perf: Have get_perf_callchain() return NULL if crosstask and user are set
  perf: Remove get_perf_callchain() init_nr argument
  perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap()
  perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK
  perf/x86/intel: Change macro GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48)
  perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag
  perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error
  perf/x86/intel: Use early_initcall() to hook bts_init()
  uprobes: Remove redundant __GFP_NOWARN
  selftests/seccomp: validate uprobe syscall passes through seccomp
  seccomp: passthrough uprobe systemcall without filtering
  selftests/bpf: Fix uprobe syscall shadow stack test
  selftests/bpf: Change test_uretprobe_regs_change for uprobe and uretprobe
  selftests/bpf: Add uprobe_regs_equal test
  selftests/bpf: Add optimized usdt variant for basic usdt test
  ...
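For context on the refcount_t conversion called out above: unlike atomic_t, refcount_t saturates instead of wrapping and warns on increment-from-zero or underflow, which is what lets reference-counting bugs like the recently fixed mmap_count ones be caught earlier. A minimal sketch of the usage pattern (not the perf code itself; the struct and helper names are illustrative, only the refcount_t API is real):

    #include <linux/refcount.h>
    #include <linux/slab.h>

    struct mmap_state {
            refcount_t mmap_count;  /* number of live mappings of this buffer */
    };

    static struct mmap_state *mmap_state_create(void)
    {
            struct mmap_state *s = kzalloc(sizeof(*s), GFP_KERNEL);

            if (s)
                    refcount_set(&s->mmap_count, 1);        /* first mapping */
            return s;
    }

    /* Extra mapping of an existing buffer: only succeeds while it is live. */
    static bool mmap_state_get(struct mmap_state *s)
    {
            return refcount_inc_not_zero(&s->mmap_count);
    }

    /* Unmap path: free on the last reference; misuse WARNs instead of wrapping. */
    static void mmap_state_put(struct mmap_state *s)
    {
            if (refcount_dec_and_test(&s->mmap_count))
                    kfree(s);
    }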
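The usdt optimization mentioned above works by patching the 5-byte NOP at an optimizable probe site into a call to a per-process trampoline page, which enters the kernel through the new uprobe syscall (wired up as syscall 336 on x86-64 in the syscall-table hunk below) instead of taking an INT3 trap on every hit. A simplified user-space sketch of the resulting control flow, condensed from the x86-64 trampoline added in the diff below; the labels and the main() harness are illustrative, and in reality the kernel performs the patching and maps the trampoline page:

    /* Build as a normal x86-64 C program; it only mimics the patched control flow. */
    extern void probe_site(void);

    asm (
            ".pushsection .text\n"
            ".global probe_site\n"
            "probe_site:\n"
            /* originally a 5-byte NOP from the USDT macro; optimization
             * rewrites it into a near call such as this one: */
            "       call    uprobe_trampoline\n"
            "       ret\n"

            "uprobe_trampoline:\n"
            "       push    %rcx\n"         /* rcx/r11/rax are clobbered by syscall */
            "       push    %r11\n"
            "       push    %rax\n"
            "       mov     $336, %rax\n"   /* __NR_uprobe, per the table below */
            "       syscall\n"              /* kernel runs the uprobe consumers */
            "       pop     %rax\n"
            "       pop     %r11\n"
            "       pop     %rcx\n"
            "       ret\n"                  /* resume right after the patched call */
            ".popsection\n"
    );

    int main(void)
    {
            probe_site();   /* with no uprobe attached the syscall simply errors out */
            return 0;
    }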
This commit is contained in: commit e4dcbdff11
@@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
 		     unsigned long vaddr)
 {
 	return uprobe_write_opcode(auprobe, vma, vaddr,
-			__opcode_to_mem_arm(auprobe->bpinsn));
+			__opcode_to_mem_arm(auprobe->bpinsn), true);
 }
 
 bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs)
@@ -345,6 +345,7 @@
 333	common	io_pgetevents		sys_io_pgetevents
 334	common	rseq			sys_rseq
 335	common	uretprobe		sys_uretprobe
+336	common	uprobe			sys_uprobe
 # don't use numbers 387 through 423, add new calls after the last
 # 'common' entry
 424	common	pidfd_send_signal	sys_pidfd_send_signal
@@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event)
 
 void x86_pmu_show_pmu_cap(struct pmu *pmu)
 {
-	pr_info("... version:                %d\n", x86_pmu.version);
-	pr_info("... bit width:              %d\n", x86_pmu.cntval_bits);
-	pr_info("... generic registers:      %d\n", x86_pmu_num_counters(pmu));
-	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
-	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
-	pr_info("... fixed-purpose events:   %d\n", x86_pmu_num_counters_fixed(pmu));
-	pr_info("... event mask:             %016Lx\n", hybrid(pmu, intel_ctrl));
+	pr_info("... version:                %d\n", x86_pmu.version);
+	pr_info("... bit width:              %d\n", x86_pmu.cntval_bits);
+	pr_info("... generic counters:       %d\n", x86_pmu_num_counters(pmu));
+	pr_info("... generic bitmap:         %016llx\n", hybrid(pmu, cntr_mask64));
+	pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu));
+	pr_info("... fixed-purpose bitmap:   %016llx\n", hybrid(pmu, fixed_cntr_mask64));
+	pr_info("... value mask:             %016llx\n", x86_pmu.cntval_mask);
+	pr_info("... max period:             %016llx\n", x86_pmu.max_period);
+	pr_info("... global_ctrl mask:       %016llx\n", hybrid(pmu, intel_ctrl));
 }
 
 static int __init init_hw_perf_events(void)
@@ -643,4 +643,4 @@ static __init int bts_init(void)
 
 	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
 }
-arch_initcall(bts_init);
+early_initcall(bts_init);
@@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
 {
-	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	u64 mask, bits = 0;
 	int idx = hwc->idx;
+	u64 bits = 0;
 
 	if (is_topdown_idx(idx)) {
+		struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
 
 	idx -= INTEL_PMC_IDX_FIXED;
 	bits = intel_fixed_bits_by_idx(idx, bits);
-	mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
 
-	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
+	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip)
 		bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
-		mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
-	}
 
-	cpuc->fixed_ctrl_val &= ~mask;
+	cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
 	cpuc->fixed_ctrl_val |= bits;
 }
 
@@ -2997,7 +2993,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc)
 		if (event->group_leader != leader->group_leader)
 			break;
 		for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) {
-			if (WARN_ON_ONCE(i + idx > cpuc->n_events))
+			if (i + idx >= cpuc->n_events ||
+			    !is_acr_event_group(cpuc->event_list[i + idx]))
 				return;
 			__set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1);
 		}
@@ -5318,9 +5315,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
 			  0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
 
 	if (pmu->intel_cap.perf_metrics)
-		pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+		pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
 	else
-		pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+		pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
 
 	intel_pmu_check_event_constraints(pmu->event_constraints,
 					  pmu->cntr_mask64,
@@ -5455,7 +5452,7 @@ static void intel_pmu_cpu_starting(int cpu)
 		rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
 		if (!perf_cap.perf_metrics) {
 			x86_pmu.intel_cap.perf_metrics = 0;
-			x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
+			x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
 		}
 	}
 
@@ -7789,7 +7786,7 @@ __init int intel_pmu_init(void)
 	}
 
 	if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
-		x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
+		x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
 
 	if (x86_pmu.intel_cap.pebs_timing_info)
 		x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;
@@ -315,12 +315,14 @@
 #define PERF_CAP_PT_IDX			16
 
 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
-#define PERF_CAP_PEBS_TRAP		BIT_ULL(6)
-#define PERF_CAP_ARCH_REG		BIT_ULL(7)
-#define PERF_CAP_PEBS_FORMAT		0xf00
-#define PERF_CAP_PEBS_BASELINE		BIT_ULL(14)
-#define PERF_CAP_PEBS_MASK		(PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
-					 PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
+#define PERF_CAP_PEBS_TRAP		BIT_ULL(6)
+#define PERF_CAP_ARCH_REG		BIT_ULL(7)
+#define PERF_CAP_PEBS_FORMAT		0xf00
+#define PERF_CAP_PEBS_BASELINE		BIT_ULL(14)
+#define PERF_CAP_PEBS_TIMING_INFO	BIT_ULL(17)
+#define PERF_CAP_PEBS_MASK		(PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
+					 PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
+					 PERF_CAP_PEBS_TIMING_INFO)
 
 #define MSR_IA32_RTIT_CTL		0x00000570
 #define RTIT_CTL_TRACEEN		BIT(0)
@@ -35,7 +35,6 @@
 #define ARCH_PERFMON_EVENTSEL_EQ		(1ULL << 36)
 #define ARCH_PERFMON_EVENTSEL_UMASK2		(0xFFULL << 40)
 
-#define INTEL_FIXED_BITS_MASK			0xFULL
 #define INTEL_FIXED_BITS_STRIDE			4
 #define INTEL_FIXED_0_KERNEL			(1ULL << 0)
 #define INTEL_FIXED_0_USER			(1ULL << 1)
@@ -48,6 +47,11 @@
 #define ICL_EVENTSEL_ADAPTIVE			(1ULL << 34)
 #define ICL_FIXED_0_ADAPTIVE			(1ULL << 32)
 
+#define INTEL_FIXED_BITS_MASK					\
+	(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER |		\
+	 INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI |	\
+	 ICL_FIXED_0_ADAPTIVE)
+
 #define intel_fixed_bits_by_idx(_idx, _bits)			\
 	((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
 
@@ -430,7 +434,7 @@ static inline bool is_topdown_idx(int idx)
 #define GLOBAL_STATUS_TRACE_TOPAPMI		BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
 #define GLOBAL_STATUS_PERF_METRICS_OVF_BIT	48
 
-#define GLOBAL_CTRL_EN_PERF_METRICS		48
+#define GLOBAL_CTRL_EN_PERF_METRICS		BIT_ULL(48)
 /*
  * We model guest LBR event tracing as another fixed-mode PMC like BTS.
  *
@@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksignal *ksig);
 int restore_signal_shadow_stack(void);
 int shstk_update_last_frame(unsigned long val);
 bool shstk_is_enabled(void);
+int shstk_pop(u64 *val);
+int shstk_push(u64 val);
 #else
 static inline long shstk_prctl(struct task_struct *task, int option,
 			       unsigned long arg2) { return -EINVAL; }
@@ -35,6 +37,8 @@ static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
 static inline int restore_signal_shadow_stack(void) { return 0; }
 static inline int shstk_update_last_frame(unsigned long val) { return 0; }
 static inline bool shstk_is_enabled(void) { return false; }
+static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
+static inline int shstk_push(u64 val) { return -ENOTSUPP; }
 #endif /* CONFIG_X86_USER_SHADOW_STACK */
 
 #endif /* __ASSEMBLER__ */
@@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
 #define UPROBE_SWBP_INSN		0xcc
 #define UPROBE_SWBP_INSN_SIZE		1
 
+enum {
+	ARCH_UPROBE_FLAG_CAN_OPTIMIZE	= 0,
+	ARCH_UPROBE_FLAG_OPTIMIZE_FAIL	= 1,
+};
+
 struct uprobe_xol_ops;
 
 struct arch_uprobe {
@@ -45,6 +50,8 @@ struct arch_uprobe {
 			u8	ilen;
 		} push;
 	};
+
+	unsigned long flags;
 };
 
 struct arch_uprobe_task {
@@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void)
 	return ssp;
 }
 
+int shstk_pop(u64 *val)
+{
+	int ret = 0;
+	u64 ssp;
+
+	if (!features_enabled(ARCH_SHSTK_SHSTK))
+		return -ENOTSUPP;
+
+	fpregs_lock_and_load();
+
+	rdmsrq(MSR_IA32_PL3_SSP, ssp);
+	if (val && get_user(*val, (__user u64 *)ssp))
+		ret = -EFAULT;
+	else
+		wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
+	fpregs_unlock();
+
+	return ret;
+}
+
+int shstk_push(u64 val)
+{
+	u64 ssp;
+	int ret;
+
+	if (!features_enabled(ARCH_SHSTK_SHSTK))
+		return -ENOTSUPP;
+
+	fpregs_lock_and_load();
+
+	rdmsrq(MSR_IA32_PL3_SSP, ssp);
+	ssp -= SS_FRAME_SIZE;
+	ret = write_user_shstk_64((__user void *)ssp, val);
+	if (!ret)
+		wrmsrq(MSR_IA32_PL3_SSP, ssp);
+	fpregs_unlock();
+
+	return ret;
+}
+
 #define SHSTK_DATA_BIT BIT(63)
 
 static int put_shstk_data(u64 __user *addr, u64 data)
@@ -18,6 +18,7 @@
 #include <asm/processor.h>
 #include <asm/insn.h>
 #include <asm/mmu_context.h>
+#include <asm/nops.h>
 
 /* Post-execution fixups. */
 
@@ -310,25 +311,32 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
 
 #ifdef CONFIG_X86_64
 
+struct uretprobe_syscall_args {
+	unsigned long r11;
+	unsigned long cx;
+	unsigned long ax;
+};
+
 asm (
 	".pushsection .rodata\n"
 	".global uretprobe_trampoline_entry\n"
 	"uretprobe_trampoline_entry:\n"
-	"pushq %rax\n"
-	"pushq %rcx\n"
-	"pushq %r11\n"
-	"movq $" __stringify(__NR_uretprobe) ", %rax\n"
+	"push %rax\n"
+	"push %rcx\n"
+	"push %r11\n"
+	"mov $" __stringify(__NR_uretprobe) ", %rax\n"
 	"syscall\n"
 	".global uretprobe_syscall_check\n"
 	"uretprobe_syscall_check:\n"
-	"popq %r11\n"
-	"popq %rcx\n"
-	/* The uretprobe syscall replaces stored %rax value with final
+	"pop %r11\n"
+	"pop %rcx\n"
+	/*
+	 * The uretprobe syscall replaces stored %rax value with final
 	 * return address, so we don't restore %rax in here and just
 	 * call ret.
 	 */
-	"retq\n"
+	"ret\n"
 	"int3\n"
 	".global uretprobe_trampoline_end\n"
 	"uretprobe_trampoline_end:\n"
 	".popsection\n"
@@ -338,7 +346,7 @@ extern u8 uretprobe_trampoline_entry[];
 extern u8 uretprobe_trampoline_end[];
 extern u8 uretprobe_syscall_check[];
 
-void *arch_uprobe_trampoline(unsigned long *psize)
+void *arch_uretprobe_trampoline(unsigned long *psize)
 {
 	static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
 	struct pt_regs *regs = task_pt_regs(current);
@@ -365,7 +373,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp)
 SYSCALL_DEFINE0(uretprobe)
 {
 	struct pt_regs *regs = task_pt_regs(current);
-	unsigned long err, ip, sp, r11_cx_ax[3], tramp;
+	struct uretprobe_syscall_args args;
+	unsigned long err, ip, sp, tramp;
 
 	/* If there's no trampoline, we are called from wrong place. */
 	tramp = uprobe_get_trampoline_vaddr();
@@ -376,15 +385,15 @@ SYSCALL_DEFINE0(uretprobe)
 	if (unlikely(regs->ip != trampoline_check_ip(tramp)))
 		goto sigill;
 
-	err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
+	err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
 	if (err)
 		goto sigill;
 
 	/* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
-	regs->r11 = r11_cx_ax[0];
-	regs->cx = r11_cx_ax[1];
-	regs->ax = r11_cx_ax[2];
-	regs->sp += sizeof(r11_cx_ax);
+	regs->r11 = args.r11;
+	regs->cx = args.cx;
+	regs->ax = args.ax;
+	regs->sp += sizeof(args);
 	regs->orig_ax = -1;
 
 	ip = regs->ip;
@@ -400,21 +409,21 @@ SYSCALL_DEFINE0(uretprobe)
 	 */
 	if (regs->sp != sp || shstk_is_enabled())
 		return regs->ax;
-	regs->sp -= sizeof(r11_cx_ax);
+	regs->sp -= sizeof(args);
 
 	/* for the case uprobe_consumer has changed r11/cx */
-	r11_cx_ax[0] = regs->r11;
-	r11_cx_ax[1] = regs->cx;
+	args.r11 = regs->r11;
+	args.cx = regs->cx;
 
 	/*
 	 * ax register is passed through as return value, so we can use
 	 * its space on stack for ip value and jump to it through the
 	 * trampoline's ret instruction
 	 */
-	r11_cx_ax[2] = regs->ip;
+	args.ax = regs->ip;
 	regs->ip = ip;
 
-	err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
+	err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
 	if (err)
 		goto sigill;
 
@ -608,6 +617,581 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
|
|||
*sr = utask->autask.saved_scratch_register;
|
||||
}
|
||||
}
|
||||
|
||||
static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
|
||||
{
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
static struct page *tramp_mapping_pages[2] __ro_after_init;
|
||||
|
||||
static struct vm_special_mapping tramp_mapping = {
|
||||
.name = "[uprobes-trampoline]",
|
||||
.mremap = tramp_mremap,
|
||||
.pages = tramp_mapping_pages,
|
||||
};
|
||||
|
||||
struct uprobe_trampoline {
|
||||
struct hlist_node node;
|
||||
unsigned long vaddr;
|
||||
};
|
||||
|
||||
static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
|
||||
{
|
||||
long delta = (long)(vaddr + 5 - vtramp);
|
||||
|
||||
return delta >= INT_MIN && delta <= INT_MAX;
|
||||
}
|
||||
|
||||
static unsigned long find_nearest_trampoline(unsigned long vaddr)
|
||||
{
|
||||
struct vm_unmapped_area_info info = {
|
||||
.length = PAGE_SIZE,
|
||||
.align_mask = ~PAGE_MASK,
|
||||
};
|
||||
unsigned long low_limit, high_limit;
|
||||
unsigned long low_tramp, high_tramp;
|
||||
unsigned long call_end = vaddr + 5;
|
||||
|
||||
if (check_add_overflow(call_end, INT_MIN, &low_limit))
|
||||
low_limit = PAGE_SIZE;
|
||||
|
||||
high_limit = call_end + INT_MAX;
|
||||
|
||||
/* Search up from the caller address. */
|
||||
info.low_limit = call_end;
|
||||
info.high_limit = min(high_limit, TASK_SIZE);
|
||||
high_tramp = vm_unmapped_area(&info);
|
||||
|
||||
/* Search down from the caller address. */
|
||||
info.low_limit = max(low_limit, PAGE_SIZE);
|
||||
info.high_limit = call_end;
|
||||
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
|
||||
low_tramp = vm_unmapped_area(&info);
|
||||
|
||||
if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
|
||||
return -ENOMEM;
|
||||
if (IS_ERR_VALUE(high_tramp))
|
||||
return low_tramp;
|
||||
if (IS_ERR_VALUE(low_tramp))
|
||||
return high_tramp;
|
||||
|
||||
/* Return address that's closest to the caller address. */
|
||||
if (call_end - low_tramp < high_tramp - call_end)
|
||||
return low_tramp;
|
||||
return high_tramp;
|
||||
}
|
||||
|
||||
static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
|
||||
{
|
||||
struct pt_regs *regs = task_pt_regs(current);
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct uprobe_trampoline *tramp;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
if (!user_64bit_mode(regs))
|
||||
return NULL;
|
||||
|
||||
vaddr = find_nearest_trampoline(vaddr);
|
||||
if (IS_ERR_VALUE(vaddr))
|
||||
return NULL;
|
||||
|
||||
tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
|
||||
if (unlikely(!tramp))
|
||||
return NULL;
|
||||
|
||||
tramp->vaddr = vaddr;
|
||||
vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
|
||||
VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
|
||||
&tramp_mapping);
|
||||
if (IS_ERR(vma)) {
|
||||
kfree(tramp);
|
||||
return NULL;
|
||||
}
|
||||
return tramp;
|
||||
}
|
||||
|
||||
static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
|
||||
{
|
||||
struct uprobes_state *state = ¤t->mm->uprobes_state;
|
||||
struct uprobe_trampoline *tramp = NULL;
|
||||
|
||||
if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
|
||||
return NULL;
|
||||
|
||||
hlist_for_each_entry(tramp, &state->head_tramps, node) {
|
||||
if (is_reachable_by_call(tramp->vaddr, vaddr)) {
|
||||
*new = false;
|
||||
return tramp;
|
||||
}
|
||||
}
|
||||
|
||||
tramp = create_uprobe_trampoline(vaddr);
|
||||
if (!tramp)
|
||||
return NULL;
|
||||
|
||||
*new = true;
|
||||
hlist_add_head(&tramp->node, &state->head_tramps);
|
||||
return tramp;
|
||||
}
|
||||
|
||||
static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
|
||||
{
|
||||
/*
|
||||
* We do not unmap and release uprobe trampoline page itself,
|
||||
* because there's no easy way to make sure none of the threads
|
||||
* is still inside the trampoline.
|
||||
*/
|
||||
hlist_del(&tramp->node);
|
||||
kfree(tramp);
|
||||
}
|
||||
|
||||
void arch_uprobe_init_state(struct mm_struct *mm)
|
||||
{
|
||||
INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
|
||||
}
|
||||
|
||||
void arch_uprobe_clear_state(struct mm_struct *mm)
|
||||
{
|
||||
struct uprobes_state *state = &mm->uprobes_state;
|
||||
struct uprobe_trampoline *tramp;
|
||||
struct hlist_node *n;
|
||||
|
||||
hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
|
||||
destroy_uprobe_trampoline(tramp);
|
||||
}
|
||||
|
||||
static bool __in_uprobe_trampoline(unsigned long ip)
|
||||
{
|
||||
struct vm_area_struct *vma = vma_lookup(current->mm, ip);
|
||||
|
||||
return vma && vma_is_special_mapping(vma, &tramp_mapping);
|
||||
}
|
||||
|
||||
static bool in_uprobe_trampoline(unsigned long ip)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
bool found, retry = true;
|
||||
unsigned int seq;
|
||||
|
||||
rcu_read_lock();
|
||||
if (mmap_lock_speculate_try_begin(mm, &seq)) {
|
||||
found = __in_uprobe_trampoline(ip);
|
||||
retry = mmap_lock_speculate_retry(mm, seq);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (retry) {
|
||||
mmap_read_lock(mm);
|
||||
found = __in_uprobe_trampoline(ip);
|
||||
mmap_read_unlock(mm);
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
||||
/*
|
||||
* See uprobe syscall trampoline; the call to the trampoline will push
|
||||
* the return address on the stack, the trampoline itself then pushes
|
||||
* cx, r11 and ax.
|
||||
*/
|
||||
struct uprobe_syscall_args {
|
||||
unsigned long ax;
|
||||
unsigned long r11;
|
||||
unsigned long cx;
|
||||
unsigned long retaddr;
|
||||
};
|
||||
|
||||
SYSCALL_DEFINE0(uprobe)
|
||||
{
|
||||
struct pt_regs *regs = task_pt_regs(current);
|
||||
struct uprobe_syscall_args args;
|
||||
unsigned long ip, sp, sret;
|
||||
int err;
|
||||
|
||||
/* Allow execution only from uprobe trampolines. */
|
||||
if (!in_uprobe_trampoline(regs->ip))
|
||||
return -ENXIO;
|
||||
|
||||
err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
|
||||
if (err)
|
||||
goto sigill;
|
||||
|
||||
ip = regs->ip;
|
||||
|
||||
/*
|
||||
* expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
|
||||
* - adjust ip to the probe address, call saved next instruction address
|
||||
* - adjust sp to the probe's stack frame (check trampoline code)
|
||||
*/
|
||||
regs->ax = args.ax;
|
||||
regs->r11 = args.r11;
|
||||
regs->cx = args.cx;
|
||||
regs->ip = args.retaddr - 5;
|
||||
regs->sp += sizeof(args);
|
||||
regs->orig_ax = -1;
|
||||
|
||||
sp = regs->sp;
|
||||
|
||||
err = shstk_pop((u64 *)&sret);
|
||||
if (err == -EFAULT || (!err && sret != args.retaddr))
|
||||
goto sigill;
|
||||
|
||||
handle_syscall_uprobe(regs, regs->ip);
|
||||
|
||||
/*
|
||||
* Some of the uprobe consumers has changed sp, we can do nothing,
|
||||
* just return via iret.
|
||||
*/
|
||||
if (regs->sp != sp) {
|
||||
/* skip the trampoline call */
|
||||
if (args.retaddr - 5 == regs->ip)
|
||||
regs->ip += 5;
|
||||
return regs->ax;
|
||||
}
|
||||
|
||||
regs->sp -= sizeof(args);
|
||||
|
||||
/* for the case uprobe_consumer has changed ax/r11/cx */
|
||||
args.ax = regs->ax;
|
||||
args.r11 = regs->r11;
|
||||
args.cx = regs->cx;
|
||||
|
||||
/* keep return address unless we are instructed otherwise */
|
||||
if (args.retaddr - 5 != regs->ip)
|
||||
args.retaddr = regs->ip;
|
||||
|
||||
if (shstk_push(args.retaddr) == -EFAULT)
|
||||
goto sigill;
|
||||
|
||||
regs->ip = ip;
|
||||
|
||||
err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
|
||||
if (err)
|
||||
goto sigill;
|
||||
|
||||
/* ensure sysret, see do_syscall_64() */
|
||||
regs->r11 = regs->flags;
|
||||
regs->cx = regs->ip;
|
||||
return 0;
|
||||
|
||||
sigill:
|
||||
force_sig(SIGILL);
|
||||
return -1;
|
||||
}
|
||||
|
||||
asm (
|
||||
".pushsection .rodata\n"
|
||||
".balign " __stringify(PAGE_SIZE) "\n"
|
||||
"uprobe_trampoline_entry:\n"
|
||||
"push %rcx\n"
|
||||
"push %r11\n"
|
||||
"push %rax\n"
|
||||
"mov $" __stringify(__NR_uprobe) ", %rax\n"
|
||||
"syscall\n"
|
||||
"pop %rax\n"
|
||||
"pop %r11\n"
|
||||
"pop %rcx\n"
|
||||
"ret\n"
|
||||
"int3\n"
|
||||
".balign " __stringify(PAGE_SIZE) "\n"
|
||||
".popsection\n"
|
||||
);
|
||||
|
||||
extern u8 uprobe_trampoline_entry[];
|
||||
|
||||
static int __init arch_uprobes_init(void)
|
||||
{
|
||||
tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
|
||||
return 0;
|
||||
}
|
||||
|
||||
late_initcall(arch_uprobes_init);
|
||||
|
||||
enum {
|
||||
EXPECT_SWBP,
|
||||
EXPECT_CALL,
|
||||
};
|
||||
|
||||
struct write_opcode_ctx {
|
||||
unsigned long base;
|
||||
int expect;
|
||||
};
|
||||
|
||||
static int is_call_insn(uprobe_opcode_t *insn)
|
||||
{
|
||||
return *insn == CALL_INSN_OPCODE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Verification callback used by int3_update uprobe_write calls to make sure
|
||||
* the underlying instruction is as expected - either int3 or call.
|
||||
*/
|
||||
static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
|
||||
int nbytes, void *data)
|
||||
{
|
||||
struct write_opcode_ctx *ctx = data;
|
||||
uprobe_opcode_t old_opcode[5];
|
||||
|
||||
uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
|
||||
|
||||
switch (ctx->expect) {
|
||||
case EXPECT_SWBP:
|
||||
if (is_swbp_insn(&old_opcode[0]))
|
||||
return 1;
|
||||
break;
|
||||
case EXPECT_CALL:
|
||||
if (is_call_insn(&old_opcode[0]))
|
||||
return 1;
|
||||
break;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Modify multi-byte instructions by using INT3 breakpoints on SMP.
|
||||
* We completely avoid using stop_machine() here, and achieve the
|
||||
* synchronization using INT3 breakpoints and SMP cross-calls.
|
||||
* (borrowed comment from smp_text_poke_batch_finish)
|
||||
*
|
||||
* The way it is done:
|
||||
* - Add an INT3 trap to the address that will be patched
|
||||
* - SMP sync all CPUs
|
||||
* - Update all but the first byte of the patched range
|
||||
* - SMP sync all CPUs
|
||||
* - Replace the first byte (INT3) by the first byte of the replacing opcode
|
||||
* - SMP sync all CPUs
|
||||
*/
|
||||
static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
|
||||
unsigned long vaddr, char *insn, bool optimize)
|
||||
{
|
||||
uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
|
||||
struct write_opcode_ctx ctx = {
|
||||
.base = vaddr,
|
||||
};
|
||||
int err;
|
||||
|
||||
/*
|
||||
* Write int3 trap.
|
||||
*
|
||||
* The swbp_optimize path comes with breakpoint already installed,
|
||||
* so we can skip this step for optimize == true.
|
||||
*/
|
||||
if (!optimize) {
|
||||
ctx.expect = EXPECT_CALL;
|
||||
err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
|
||||
true /* is_register */, false /* do_update_ref_ctr */,
|
||||
&ctx);
|
||||
if (err)
|
||||
return err;
|
||||
}
|
||||
|
||||
smp_text_poke_sync_each_cpu();
|
||||
|
||||
/* Write all but the first byte of the patched range. */
|
||||
ctx.expect = EXPECT_SWBP;
|
||||
err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
|
||||
true /* is_register */, false /* do_update_ref_ctr */,
|
||||
&ctx);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
smp_text_poke_sync_each_cpu();
|
||||
|
||||
/*
|
||||
* Write first byte.
|
||||
*
|
||||
* The swbp_unoptimize needs to finish uprobe removal together
|
||||
* with ref_ctr update, using uprobe_write with proper flags.
|
||||
*/
|
||||
err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
|
||||
optimize /* is_register */, !optimize /* do_update_ref_ctr */,
|
||||
&ctx);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
smp_text_poke_sync_each_cpu();
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
|
||||
unsigned long vaddr, unsigned long tramp)
|
||||
{
|
||||
u8 call[5];
|
||||
|
||||
__text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
|
||||
(const void *) tramp, CALL_INSN_SIZE);
|
||||
return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
|
||||
}
|
||||
|
||||
static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
|
||||
unsigned long vaddr)
|
||||
{
|
||||
return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
|
||||
}
|
||||
|
||||
static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
|
||||
{
|
||||
unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
|
||||
struct vm_area_struct *vma;
|
||||
struct page *page;
|
||||
|
||||
page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
|
||||
if (IS_ERR(page))
|
||||
return PTR_ERR(page);
|
||||
uprobe_copy_from_page(page, vaddr, dst, len);
|
||||
put_page(page);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
|
||||
{
|
||||
struct __packed __arch_relative_insn {
|
||||
u8 op;
|
||||
s32 raddr;
|
||||
} *call = (struct __arch_relative_insn *) insn;
|
||||
|
||||
if (!is_call_insn(insn))
|
||||
return false;
|
||||
return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
|
||||
}
|
||||
|
||||
static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
|
||||
{
|
||||
uprobe_opcode_t insn[5];
|
||||
int err;
|
||||
|
||||
err = copy_from_vaddr(mm, vaddr, &insn, 5);
|
||||
if (err)
|
||||
return err;
|
||||
return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
|
||||
}
|
||||
|
||||
static bool should_optimize(struct arch_uprobe *auprobe)
|
||||
{
|
||||
return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
|
||||
test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
|
||||
}
|
||||
|
||||
int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
|
||||
unsigned long vaddr)
|
||||
{
|
||||
if (should_optimize(auprobe)) {
|
||||
/*
|
||||
* We could race with another thread that already optimized the probe,
|
||||
* so let's not overwrite it with int3 again in this case.
|
||||
*/
|
||||
int ret = is_optimized(vma->vm_mm, vaddr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret)
|
||||
return 0;
|
||||
}
|
||||
return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
|
||||
true /* is_register */);
|
||||
}
|
||||
|
||||
int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
|
||||
unsigned long vaddr)
|
||||
{
|
||||
if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
|
||||
int ret = is_optimized(vma->vm_mm, vaddr);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
if (ret) {
|
||||
ret = swbp_unoptimize(auprobe, vma, vaddr);
|
||||
WARN_ON_ONCE(ret);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
|
||||
false /* is_register */);
|
||||
}
|
||||
|
||||
static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
|
||||
unsigned long vaddr)
|
||||
{
|
||||
struct uprobe_trampoline *tramp;
|
||||
struct vm_area_struct *vma;
|
||||
bool new = false;
|
||||
int err = 0;
|
||||
|
||||
vma = find_vma(mm, vaddr);
|
||||
if (!vma)
|
||||
return -EINVAL;
|
||||
tramp = get_uprobe_trampoline(vaddr, &new);
|
||||
if (!tramp)
|
||||
return -EINVAL;
|
||||
err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
|
||||
if (WARN_ON_ONCE(err) && new)
|
||||
destroy_uprobe_trampoline(tramp);
|
||||
return err;
|
||||
}
|
||||
|
||||
void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
uprobe_opcode_t insn[5];
|
||||
|
||||
if (!should_optimize(auprobe))
|
||||
return;
|
||||
|
||||
mmap_write_lock(mm);
|
||||
|
||||
/*
|
||||
* Check if some other thread already optimized the uprobe for us,
|
||||
* if it's the case just go away silently.
|
||||
*/
|
||||
if (copy_from_vaddr(mm, vaddr, &insn, 5))
|
||||
goto unlock;
|
||||
if (!is_swbp_insn((uprobe_opcode_t*) &insn))
|
||||
goto unlock;
|
||||
|
||||
/*
|
||||
* If we fail to optimize the uprobe we set the fail bit so the
|
||||
* above should_optimize will fail from now on.
|
||||
*/
|
||||
if (__arch_uprobe_optimize(auprobe, mm, vaddr))
|
||||
set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
|
||||
|
||||
unlock:
|
||||
mmap_write_unlock(mm);
|
||||
}
|
||||
|
||||
static bool insn_is_nop(struct insn *insn)
|
||||
{
|
||||
return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
|
||||
}
|
||||
|
||||
static bool insn_is_nopl(struct insn *insn)
|
||||
{
|
||||
if (insn->opcode.nbytes != 2)
|
||||
return false;
|
||||
|
||||
if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
|
||||
return false;
|
||||
|
||||
if (!insn->modrm.nbytes)
|
||||
return false;
|
||||
|
||||
if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
|
||||
return false;
|
||||
|
||||
/* 0f 1f /0 - NOPL */
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool can_optimize(struct insn *insn, unsigned long vaddr)
|
||||
{
|
||||
if (!insn->x86_64 || insn->length != 5)
|
||||
return false;
|
||||
|
||||
if (!insn_is_nop(insn) && !insn_is_nopl(insn))
|
||||
return false;
|
||||
|
||||
/* We can't do cross page atomic writes yet. */
|
||||
return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
|
||||
}
|
||||
#else /* 32-bit: */
|
||||
/*
|
||||
* No RIP-relative addressing on 32-bit
|
||||
|
@@ -621,6 +1205,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
 {
 }
+static bool can_optimize(struct insn *insn, unsigned long vaddr)
+{
+	return false;
+}
 #endif /* CONFIG_X86_64 */
 
 struct uprobe_xol_ops {
@@ -979,14 +1567,17 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
  */
 int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
 {
-	struct insn insn;
 	u8 fix_ip_or_call = UPROBE_FIX_IP;
+	struct insn insn;
 	int ret;
 
 	ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
 	if (ret)
 		return ret;
 
+	if (can_optimize(&insn, addr))
+		set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
+
 	ret = branch_setup_xol_ops(auprobe, &insn);
 	if (ret != -ENOSYS)
 		return ret;
 
@@ -13,7 +13,7 @@
 #define MSR_IA32_MISC_ENABLE_PMU_RO_MASK	(MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |	\
 						 MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
 
-/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
+/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */
 #define fixed_ctrl_field(ctrl_reg, idx) \
 	(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)
 
@@ -859,7 +859,7 @@ struct perf_event {
 
 	/* mmap bits */
 	struct mutex			mmap_mutex;
-	atomic_t			mmap_count;
+	refcount_t			mmap_count;
 
 	struct perf_buffer		*rb;
 	struct list_head		rb_entry;
@@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
 extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
 extern struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 		   u32 max_stack, bool crosstask, bool add_mark);
 extern int get_callchain_buffers(int max_stack);
 extern void put_callchain_buffers(void);
@@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
 
 asmlinkage long sys_uretprobe(void);
 
+asmlinkage long sys_uprobe(void);
+
 /* pciconfig: alpha, arm, arm64, ia64, sparc */
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				   unsigned long off, unsigned long len,
@@ -17,6 +17,7 @@
 #include <linux/wait.h>
 #include <linux/timer.h>
 #include <linux/seqlock.h>
+#include <linux/mutex.h>
 
 struct uprobe;
 struct vm_area_struct;
@@ -185,8 +186,14 @@ struct xol_area;
 
 struct uprobes_state {
 	struct xol_area		*xol_area;
+#ifdef CONFIG_X86_64
+	struct hlist_head	head_tramps;
+#endif
 };
 
+typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
+				     uprobe_opcode_t *insn, int nbytes, void *data);
+
 extern void __init uprobes_init(void);
 extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
 extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@@ -194,7 +201,11 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn);
 extern bool is_trap_insn(uprobe_opcode_t *insn);
 extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
 extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
-extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t);
+extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
+			       bool is_register);
+extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
+			uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
+			void *data);
 extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
@@ -224,8 +235,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 					 void *src, unsigned long len);
 extern void uprobe_handle_trampoline(struct pt_regs *regs);
-extern void *arch_uprobe_trampoline(unsigned long *psize);
+extern void *arch_uretprobe_trampoline(unsigned long *psize);
 extern unsigned long uprobe_get_trampoline_vaddr(void);
+extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
+extern void arch_uprobe_clear_state(struct mm_struct *mm);
+extern void arch_uprobe_init_state(struct mm_struct *mm);
+extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
+extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
@@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	if (max_depth > sysctl_perf_event_max_stack)
 		max_depth = sysctl_perf_event_max_stack;
 
-	trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+	trace = get_perf_callchain(regs, kernel, user, max_depth,
 				   false, false);
 
 	if (unlikely(!trace))
@@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 	else if (kernel && task)
 		trace = get_callchain_entry_for_task(task, max_depth);
 	else
-		trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
+		trace = get_perf_callchain(regs, kernel, user, max_depth,
 					   crosstask, false);
 
 	if (unlikely(!trace) || trace->nr < skip) {
@@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
 }
 
 struct perf_callchain_entry *
-get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
+get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
 		   u32 max_stack, bool crosstask, bool add_mark)
 {
 	struct perf_callchain_entry *entry;
 	struct perf_callchain_entry_ctx ctx;
 	int rctx, start_entry_idx;
 
+	/* crosstask is not supported for user stacks */
+	if (crosstask && user && !kernel)
+		return NULL;
+
 	entry = get_callchain_entry(&rctx);
 	if (!entry)
 		return NULL;
 
 	ctx.entry = entry;
 	ctx.max_stack = max_stack;
-	ctx.nr = entry->nr = init_nr;
+	ctx.nr = entry->nr = 0;
 	ctx.contexts = 0;
 	ctx.contexts_maxed = false;
 
 	if (kernel && !user_mode(regs)) {
 		if (add_mark)
@@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
 			perf_callchain_kernel(&ctx, regs);
 	}
 
-	if (user) {
+	if (user && !crosstask) {
 		if (!user_mode(regs)) {
-			if (current->mm)
-				regs = task_pt_regs(current);
-			else
-				regs = NULL;
-		}
-
-		if (regs) {
-			if (crosstask)
+			if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
 				goto exit_put;
 
-			if (add_mark)
-				perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
-
-			start_entry_idx = entry->nr;
-			perf_callchain_user(&ctx, regs);
-			fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
+			regs = task_pt_regs(current);
 		}
+
+		if (add_mark)
+			perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
+
+		start_entry_idx = entry->nr;
+		perf_callchain_user(&ctx, regs);
+		fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
 	}
 
 exit_put:
@@ -3974,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
  */
 static inline bool event_update_userpage(struct perf_event *event)
 {
-	if (likely(!atomic_read(&event->mmap_count)))
+	if (likely(!refcount_read(&event->mmap_count)))
 		return false;
 
 	perf_event_update_time(event);
@@ -6710,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 	mapped_f mapped = get_mapped(event, event_mapped);
 
-	atomic_inc(&event->mmap_count);
-	atomic_inc(&event->rb->mmap_count);
+	refcount_inc(&event->mmap_count);
+	refcount_inc(&event->rb->mmap_count);
 
 	if (vma->vm_pgoff)
-		atomic_inc(&event->rb->aux_mmap_count);
+		refcount_inc(&event->rb->aux_mmap_count);
 
 	if (mapped)
 		mapped(event, vma->vm_mm);
@@ -6749,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	 * to avoid complications.
 	 */
 	if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
-	    atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
+	    refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
 		/*
 		 * Stop all AUX events that are writing to this buffer,
 		 * so that we can free its AUX pages and corresponding PMU
@@ -6769,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 		mutex_unlock(&rb->aux_mutex);
 	}
 
-	if (atomic_dec_and_test(&rb->mmap_count))
+	if (refcount_dec_and_test(&rb->mmap_count))
 		detach_rest = true;
 
-	if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+	if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
 		goto out_put;
 
 	ring_buffer_attach(event, NULL);
@ -6933,19 +6933,200 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
|
|||
return err;
|
||||
}
|
||||
|
||||
static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
|
||||
{
|
||||
unsigned long user_locked, user_lock_limit, locked, lock_limit;
|
||||
struct user_struct *user = current_user();
|
||||
|
||||
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
|
||||
/* Increase the limit linearly with more CPUs */
|
||||
user_lock_limit *= num_online_cpus();
|
||||
|
||||
user_locked = atomic_long_read(&user->locked_vm);
|
||||
|
||||
/*
|
||||
* sysctl_perf_event_mlock may have changed, so that
|
||||
* user->locked_vm > user_lock_limit
|
||||
*/
|
||||
if (user_locked > user_lock_limit)
|
||||
user_locked = user_lock_limit;
|
||||
user_locked += *user_extra;
|
||||
|
||||
if (user_locked > user_lock_limit) {
|
||||
/*
|
||||
* charge locked_vm until it hits user_lock_limit;
|
||||
* charge the rest from pinned_vm
|
||||
*/
|
||||
*extra = user_locked - user_lock_limit;
|
||||
*user_extra -= *extra;
|
||||
}
|
||||
|
||||
lock_limit = rlimit(RLIMIT_MEMLOCK);
|
||||
lock_limit >>= PAGE_SHIFT;
|
||||
locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
|
||||
|
||||
return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
|
||||
}
|
||||
|
||||
static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
|
||||
{
|
||||
struct user_struct *user = current_user();
|
||||
|
||||
atomic_long_add(user_extra, &user->locked_vm);
|
||||
atomic64_add(extra, &vma->vm_mm->pinned_vm);
|
||||
}
|
||||
|
||||
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
long extra = 0, user_extra = nr_pages;
|
||||
struct perf_buffer *rb;
|
||||
int rb_flags = 0;
|
||||
|
||||
nr_pages -= 1;
|
||||
|
||||
/*
|
||||
* If we have rb pages ensure they're a power-of-two number, so we
|
||||
* can do bitmasks instead of modulo.
|
||||
*/
|
||||
if (nr_pages != 0 && !is_power_of_2(nr_pages))
|
||||
return -EINVAL;
|
||||
|
||||
WARN_ON_ONCE(event->ctx->parent_ctx);
|
||||
|
||||
if (event->rb) {
|
||||
if (data_page_nr(event->rb) != nr_pages)
|
||||
return -EINVAL;
|
||||
|
||||
if (refcount_inc_not_zero(&event->rb->mmap_count)) {
|
||||
/*
|
||||
* Success -- managed to mmap() the same buffer
|
||||
* multiple times.
|
||||
*/
|
||||
perf_mmap_account(vma, user_extra, extra);
|
||||
refcount_inc(&event->mmap_count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Raced against perf_mmap_close()'s
|
||||
* refcount_dec_and_mutex_lock() remove the
|
||||
* event and continue as if !event->rb
|
||||
*/
|
||||
ring_buffer_attach(event, NULL);
|
||||
}
|
||||
|
||||
if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
|
||||
return -EPERM;
|
||||
|
||||
if (vma->vm_flags & VM_WRITE)
|
||||
rb_flags |= RING_BUFFER_WRITABLE;
|
||||
|
||||
rb = rb_alloc(nr_pages,
|
||||
event->attr.watermark ? event->attr.wakeup_watermark : 0,
|
||||
event->cpu, rb_flags);
|
||||
|
||||
if (!rb)
|
||||
return -ENOMEM;
|
||||
|
||||
refcount_set(&rb->mmap_count, 1);
|
||||
rb->mmap_user = get_current_user();
|
||||
rb->mmap_locked = extra;
|
||||
|
||||
ring_buffer_attach(event, rb);
|
||||
|
||||
perf_event_update_time(event);
|
||||
perf_event_init_userpage(event);
|
||||
perf_event_update_userpage(event);
|
||||
|
||||
perf_mmap_account(vma, user_extra, extra);
|
||||
refcount_set(&event->mmap_count, 1);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
long extra = 0, user_extra = nr_pages;
|
||||
u64 aux_offset, aux_size;
|
||||
struct perf_buffer *rb;
|
||||
int ret, rb_flags = 0;
|
||||
|
||||
rb = event->rb;
|
||||
if (!rb)
|
||||
return -EINVAL;
|
||||
|
||||
guard(mutex)(&rb->aux_mutex);
|
||||
|
||||
/*
|
||||
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
|
||||
* mapped, all subsequent mappings should have the same size
|
||||
* and offset. Must be above the normal perf buffer.
|
||||
*/
|
||||
aux_offset = READ_ONCE(rb->user_page->aux_offset);
|
||||
aux_size = READ_ONCE(rb->user_page->aux_size);
|
||||
|
||||
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
|
||||
return -EINVAL;
|
||||
|
||||
/* already mapped with a different offset */
|
||||
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
|
||||
return -EINVAL;
|
||||
|
||||
if (aux_size != nr_pages * PAGE_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
/* already mapped with a different size */
|
||||
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
|
||||
return -EINVAL;
|
||||
|
||||
if (!is_power_of_2(nr_pages))
|
||||
return -EINVAL;
|
||||
|
||||
if (!refcount_inc_not_zero(&rb->mmap_count))
|
||||
return -EINVAL;
|
||||
|
||||
if (rb_has_aux(rb)) {
|
||||
refcount_inc(&rb->aux_mmap_count);
|
||||
|
||||
} else {
|
||||
if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
|
||||
refcount_dec(&rb->mmap_count);
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
WARN_ON(!rb && event->rb);
|
||||
|
||||
if (vma->vm_flags & VM_WRITE)
|
||||
rb_flags |= RING_BUFFER_WRITABLE;
|
||||
|
||||
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
|
||||
event->attr.aux_watermark, rb_flags);
|
||||
if (ret) {
|
||||
refcount_dec(&rb->mmap_count);
|
||||
return ret;
|
||||
}
|
||||
|
||||
refcount_set(&rb->aux_mmap_count, 1);
|
||||
rb->aux_mmap_locked = extra;
|
||||
}
|
||||
|
||||
perf_mmap_account(vma, user_extra, extra);
|
||||
refcount_inc(&event->mmap_count);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
struct perf_event *event = file->private_data;
|
||||
unsigned long user_locked, user_lock_limit;
|
||||
struct user_struct *user = current_user();
|
||||
struct mutex *aux_mutex = NULL;
|
||||
struct perf_buffer *rb = NULL;
|
||||
unsigned long locked, lock_limit;
|
||||
unsigned long vma_size;
|
||||
unsigned long nr_pages;
|
||||
long user_extra = 0, extra = 0;
|
||||
int ret, flags = 0;
|
||||
unsigned long vma_size, nr_pages;
|
||||
mapped_f mapped;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Don't allow mmap() of inherited per-task counters. This would
|
||||
|
@ -6971,192 +7152,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
|
|||
if (vma_size != PAGE_SIZE * nr_pages)
|
||||
return -EINVAL;
|
||||
|
||||
user_extra = nr_pages;
|
||||
|
||||
mutex_lock(&event->mmap_mutex);
|
||||
ret = -EINVAL;
|
||||
|
||||
/*
|
||||
* This relies on __pmu_detach_event() taking mmap_mutex after marking
|
||||
* the event REVOKED. Either we observe the state, or __pmu_detach_event()
|
||||
* will detach the rb created here.
|
||||
*/
|
||||
if (event->state <= PERF_EVENT_STATE_REVOKED) {
|
||||
ret = -ENODEV;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if (vma->vm_pgoff == 0) {
|
||||
nr_pages -= 1;
|
||||
|
||||
scoped_guard (mutex, &event->mmap_mutex) {
|
||||
/*
|
||||
* If we have rb pages ensure they're a power-of-two number, so we
|
||||
* can do bitmasks instead of modulo.
|
||||
* This relies on __pmu_detach_event() taking mmap_mutex after marking
|
||||
* the event REVOKED. Either we observe the state, or __pmu_detach_event()
|
||||
* will detach the rb created here.
|
||||
*/
|
||||
if (nr_pages != 0 && !is_power_of_2(nr_pages))
|
||||
goto unlock;
|
||||
if (event->state <= PERF_EVENT_STATE_REVOKED)
|
||||
return -ENODEV;
|
||||
|
||||
WARN_ON_ONCE(event->ctx->parent_ctx);
|
||||
|
||||
if (event->rb) {
|
||||
if (data_page_nr(event->rb) != nr_pages)
goto unlock;

if (atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
* Success -- managed to mmap() the same buffer
* multiple times.
*/
ret = 0;
/* We need the rb to map pages. */
rb = event->rb;
goto unlock;
}

/*
* Raced against perf_mmap_close()'s
* atomic_dec_and_mutex_lock() remove the
* event and continue as if !event->rb
*/
ring_buffer_attach(event, NULL);
}

} else {
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
* mapped, all subsequent mappings should have the same size
* and offset. Must be above the normal perf buffer.
*/
u64 aux_offset, aux_size;

rb = event->rb;
if (!rb)
goto aux_unlock;

aux_mutex = &rb->aux_mutex;
mutex_lock(aux_mutex);

aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);

if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
goto aux_unlock;

if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
goto aux_unlock;

/* already mapped with a different offset */
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
goto aux_unlock;

if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
goto aux_unlock;

/* already mapped with a different size */
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
goto aux_unlock;

if (!is_power_of_2(nr_pages))
goto aux_unlock;

if (!atomic_inc_not_zero(&rb->mmap_count))
goto aux_unlock;

if (rb_has_aux(rb)) {
atomic_inc(&rb->aux_mmap_count);
ret = 0;
goto unlock;
}
if (vma->vm_pgoff == 0)
ret = perf_mmap_rb(vma, event, nr_pages);
else
ret = perf_mmap_aux(vma, event, nr_pages);
if (ret)
return ret;
}

user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);

/*
* Increase the limit linearly with more CPUs:
*/
user_lock_limit *= num_online_cpus();

user_locked = atomic_long_read(&user->locked_vm);

/*
* sysctl_perf_event_mlock may have changed, so that
* user->locked_vm > user_lock_limit
*/
if (user_locked > user_lock_limit)
user_locked = user_lock_limit;
user_locked += user_extra;

if (user_locked > user_lock_limit) {
/*
* charge locked_vm until it hits user_lock_limit;
* charge the rest from pinned_vm
*/
extra = user_locked - user_lock_limit;
user_extra -= extra;
}

lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;

if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
}

WARN_ON(!rb && event->rb);

if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;

if (!rb) {
rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);

if (!rb) {
ret = -ENOMEM;
goto unlock;
}

atomic_set(&rb->mmap_count, 1);
rb->mmap_user = get_current_user();
rb->mmap_locked = extra;

ring_buffer_attach(event, rb);

perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
ret = 0;
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
if (!ret) {
atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
}
}

unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
atomic64_add(extra, &vma->vm_mm->pinned_vm);

atomic_inc(&event->mmap_count);
} else if (rb) {
/* AUX allocation failed */
atomic_dec(&rb->mmap_count);
}
aux_unlock:
if (aux_mutex)
mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);

if (ret)
return ret;

/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@@ -7174,7 +7186,7 @@ aux_unlock:
* full cleanup in this case and therefore does not invoke
* vmops::close().
*/
ret = map_range(rb, vma);
ret = map_range(event->rb, vma);
if (ret)
perf_mmap_close(vma);
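For context, the hunks above service the normal user-space mapping of the perf ring buffer (pgoff 0 maps the user page plus data pages, a higher pgoff maps the AUX area). A minimal sketch of the caller side, assuming a file descriptor returned by perf_event_open() and omitting error handling (the helper name is illustrative, not part of this patch):

	#include <sys/mman.h>
	#include <unistd.h>

	/* Map the main perf buffer: one perf_event_mmap_page plus 2^n data pages. */
	static void *map_perf_buffer(int fd, unsigned int n)
	{
		size_t page = (size_t)sysconf(_SC_PAGESIZE);
		size_t len = (1 + (1UL << n)) * page;

		return mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	}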
@@ -7440,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
} else if (!(current->flags & PF_KTHREAD)) {
} else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -8080,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
struct page *p;

pagefault_disable();
@@ -8192,7 +8204,8 @@ struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
bool user = !event->attr.exclude_callchain_user;
bool user = !event->attr.exclude_callchain_user &&
!(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
@@ -8204,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;

callchain = get_perf_callchain(regs, 0, kernel, user,
callchain = get_perf_callchain(regs, kernel, user,
max_stack, crosstask, true);
return callchain ?: &__empty_callchain;
}
@@ -13249,7 +13262,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
if (refcount_read(&event->mmap_count))
goto unlock;

if (output_event) {
@@ -13262,7 +13275,7 @@ set:
goto unlock;

/* did we race against perf_mmap_close() */
if (!atomic_read(&rb->mmap_count)) {
if (!refcount_read(&rb->mmap_count)) {
ring_buffer_put(rb);
goto unlock;
}

@@ -35,7 +35,7 @@ struct perf_buffer {
spinlock_t event_lock;
struct list_head event_list;

atomic_t mmap_count;
refcount_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
@@ -47,7 +47,7 @@ struct perf_buffer {
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
atomic_t aux_mmap_count;
refcount_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;

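The atomic_t to refcount_t switch in struct perf_buffer above follows the standard conversion idiom; a minimal sketch of that idiom (illustrative structure and helpers only, not the actual patch):

	#include <linux/refcount.h>

	struct example_buf {
		refcount_t mmap_count;
	};

	static void example_init(struct example_buf *b)
	{
		refcount_set(&b->mmap_count, 1);		/* was atomic_set() */
	}

	static bool example_get(struct example_buf *b)
	{
		return refcount_inc_not_zero(&b->mmap_count);	/* was atomic_inc_not_zero() */
	}

	static bool example_put(struct example_buf *b)
	{
		/* refcount_t saturates and warns on over/underflow, unlike a bare atomic_t */
		return refcount_dec_and_test(&b->mmap_count);	/* was atomic_dec_and_test() */
	}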
@@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* the same order, see perf_mmap_close. Otherwise we end up freeing
* aux pages in this path, which is a bug, because in_atomic().
*/
if (!atomic_read(&rb->aux_mmap_count))
if (!refcount_read(&rb->aux_mmap_count))
goto err;

if (!refcount_inc_not_zero(&rb->aux_refcount))

@@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}

static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}

static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);

if (is_swbp_insn(new_opcode)) {
if (is_swbp_insn(insn)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
@@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
return identical;
}

static int __uprobe_write_opcode(struct vm_area_struct *vma,
static int __uprobe_write(struct vm_area_struct *vma,
struct folio_walk *fw, struct folio *folio,
unsigned long opcode_vaddr, uprobe_opcode_t opcode)
unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
bool is_register)
{
const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
const bool is_register = !!is_swbp_insn(&opcode);
const unsigned long vaddr = insn_vaddr & PAGE_MASK;
bool pmd_mappable;

/* For now, we'll only handle PTE-mapped folios. */
@@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
*/
flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
copy_to_page(fw->page, insn_vaddr, insn, nbytes);

/*
* When unregistering, we may only zap a PTE if uffd is disabled and
@@ -482,23 +483,32 @@ remap:
* @opcode_vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @opcode_vaddr.
*
* Called with mm->mmap_lock held for read or write.
* Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
bool is_register)
{
const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
}

int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
void *data)
{
const unsigned long vaddr = insn_vaddr & PAGE_MASK;
struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
int ret, is_register, ref_ctr_updated = 0;
int ret, ref_ctr_updated = 0;
unsigned int gup_flags = FOLL_FORCE;
struct mmu_notifier_range range;
struct folio_walk fw;
struct folio *folio;
struct page *page;

is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);

if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
@@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* page that we can safely modify. Use FOLL_WRITE to trigger a write
* fault if required. When unregistering, we might be lucky and the
* anon page is already gone. So defer write faults until really
* required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
* required. Use FOLL_SPLIT_PMD, because __uprobe_write()
* cannot deal with PMDs yet.
*/
if (is_register)
@@ -521,14 +531,14 @@ retry:
goto out;
folio = page_folio(page);

ret = verify_opcode(page, opcode_vaddr, &opcode);
ret = verify(page, insn_vaddr, insn, nbytes, data);
if (ret <= 0) {
folio_put(folio);
goto out;
}

/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
if (ret) {
folio_put(folio);
@@ -560,7 +570,7 @@ retry:
/* Walk the page tables again, to perform the actual update. */
if (folio_walk_start(&fw, vma, vaddr, 0)) {
if (fw.page == page)
ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
folio_walk_end(&fw, vma);
}

@@ -580,7 +590,7 @@ retry:

out:
/* Revert back reference counter if instruction update failed. */
if (ret < 0 && ref_ctr_updated)
if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
update_ref_ctr(uprobe, mm, is_register ? -1 : 1);

/* try collapse pmd for compound page */
@@ -602,7 +612,7 @@ out:
int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
}

/**
@@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe,
struct vm_area_struct *vma, unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
*(uprobe_opcode_t *)&auprobe->insn);
*(uprobe_opcode_t *)&auprobe->insn, false);
}

/* uprobe should have guaranteed positive refcount */
@@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
if (IS_ERR(page))
return PTR_ERR(page);

copy_from_page(page, offset, insn, nbytes);
uprobe_copy_from_page(page, offset, insn, nbytes);
put_page(page);

return 0;
@@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
GFP_NOWAIT | __GFP_NOMEMALLOC);
if (prev)
prev->next = NULL;
}
@@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode,
return ERR_PTR(-EINVAL);

/*
* This ensures that copy_from_page(), copy_to_page() and
* This ensures that uprobe_copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
@@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
struct vm_area_struct *vma;
int err = 0;

mmap_read_lock(mm);
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, vma, vaddr);
}
mmap_read_unlock(mm);
mmap_write_unlock(mm);

return err;
}
@@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}

void * __weak arch_uprobe_trampoline(unsigned long *psize)
void * __weak arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;

@@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
insns = arch_uprobe_trampoline(&insns_size);
insns = arch_uretprobe_trampoline(&insns_size);
arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);

if (!xol_add_vma(mm, area))
@@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void)
return area;
}

void __weak arch_uprobe_clear_state(struct mm_struct *mm)
{
}

void __weak arch_uprobe_init_state(struct mm_struct *mm)
{
}

/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm)
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);

arch_uprobe_clear_state(mm);

if (!area)
return;

@@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;

copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
@@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}

void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
{
}

/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -2741,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs)

handler_chain(uprobe, regs);

/* Try to optimize after first hit. */
arch_uprobe_optimize(&uprobe->arch, bp_vaddr);

if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;

@@ -2752,6 +2779,23 @@ out:
rcu_read_unlock_trace();
}

void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
{
struct uprobe *uprobe;
int is_swbp;

guard(rcu_tasks_trace)();

uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe)
return;
if (!get_utask())
return;
if (arch_uprobe_ignore(&uprobe->arch, regs))
return;
handler_chain(uprobe, regs);
}

/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.

@@ -1014,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
arch_uprobe_init_state(mm);
#endif
}

@@ -741,6 +741,26 @@ out:
}

#ifdef SECCOMP_ARCH_NATIVE
static bool seccomp_uprobe_exception(struct seccomp_data *sd)
{
#if defined __NR_uretprobe || defined __NR_uprobe
#ifdef SECCOMP_ARCH_COMPAT
if (sd->arch == SECCOMP_ARCH_NATIVE)
#endif
{
#ifdef __NR_uretprobe
if (sd->nr == __NR_uretprobe)
return true;
#endif
#ifdef __NR_uprobe
if (sd->nr == __NR_uprobe)
return true;
#endif
}
#endif
return false;
}

/**
* seccomp_is_const_allow - check if filter is constant allow with given data
* @fprog: The BPF programs
@@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
return false;

/* Our single exception to filtering. */
#ifdef __NR_uretprobe
#ifdef SECCOMP_ARCH_COMPAT
if (sd->arch == SECCOMP_ARCH_NATIVE)
#endif
if (sd->nr == __NR_uretprobe)
return true;
#endif
if (seccomp_uprobe_exception(sd))
return true;

for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
@@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
#ifdef __NR_uretprobe
__NR_uretprobe,
#endif
#ifdef __NR_uprobe
__NR_uprobe,
#endif
-1, /* negative terminated */
};

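A user-space sketch of the pass-through behaviour added above (a hypothetical test program: it assumes the x86-64 __NR_uprobe number defined later in this series and the documented ENXIO failure when the syscall is not entered from the uprobe trampoline):

	#include <errno.h>
	#include <sys/prctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/seccomp.h>

	#ifndef __NR_uprobe
	#define __NR_uprobe 336	/* x86-64 */
	#endif

	int main(void)
	{
		long ret;

		/* strict mode normally allows only read/write/exit/sigreturn */
		if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT))
			return 1;

		/* not filtered away any more; fails with ENXIO instead of SIGKILL */
		ret = syscall(__NR_uprobe);

		syscall(SYS_exit, (ret == -1 && errno == ENXIO) ? 0 : 1);
		return 0;
	}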
@@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);

COND_SYSCALL(uretprobe);
COND_SYSCALL(uprobe);

@@ -315,12 +315,14 @@
#define PERF_CAP_PT_IDX 16

#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
#define PERF_CAP_ARCH_REG BIT_ULL(7)
#define PERF_CAP_PEBS_FORMAT 0xf00
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
#define PERF_CAP_ARCH_REG BIT_ULL(7)
#define PERF_CAP_PEBS_FORMAT 0xf00
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17)
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
PERF_CAP_PEBS_TIMING_INFO)

#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)

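A sketch of how a PMU driver might consume the new capability bit (illustrative helper, assuming the usual x86 MSR accessors; not taken from this patch):

	#include <asm/msr.h>

	static bool pebs_has_timing_info(void)
	{
		u64 caps;

		/* IA32_PERF_CAPABILITIES enumerates PEBS features, including bit 17 above */
		rdmsrl(MSR_IA32_PERF_CAPABILITIES, caps);
		return !!(caps & PERF_CAP_PEBS_TIMING_INFO);
	}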
@ -8,22 +8,31 @@
|
|||
#include <asm/ptrace.h>
|
||||
#include <linux/compiler.h>
|
||||
#include <linux/stringify.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <sys/wait.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/prctl.h>
|
||||
#include <asm/prctl.h>
|
||||
#include "uprobe_syscall.skel.h"
|
||||
#include "uprobe_syscall_executed.skel.h"
|
||||
#include "bpf/libbpf_internal.h"
|
||||
|
||||
__naked unsigned long uretprobe_regs_trigger(void)
|
||||
#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00
|
||||
#include "usdt.h"
|
||||
|
||||
#pragma GCC diagnostic ignored "-Wattributes"
|
||||
|
||||
__attribute__((aligned(16)))
|
||||
__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void)
|
||||
{
|
||||
asm volatile (
|
||||
".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */
|
||||
"movq $0xdeadbeef, %rax\n"
|
||||
"ret\n"
|
||||
);
|
||||
}
|
||||
|
||||
__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
|
||||
__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after)
|
||||
{
|
||||
asm volatile (
|
||||
"movq %r15, 0(%rdi)\n"
|
||||
|
@ -44,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
|
|||
"movq $0, 120(%rdi)\n" /* orig_rax */
|
||||
"movq $0, 128(%rdi)\n" /* rip */
|
||||
"movq $0, 136(%rdi)\n" /* cs */
|
||||
"pushq %rax\n"
|
||||
"pushf\n"
|
||||
"pop %rax\n"
|
||||
"movq %rax, 144(%rdi)\n" /* eflags */
|
||||
"pop %rax\n"
|
||||
"movq %rsp, 152(%rdi)\n" /* rsp */
|
||||
"movq $0, 160(%rdi)\n" /* ss */
|
||||
|
||||
/* save 2nd argument */
|
||||
"pushq %rsi\n"
|
||||
"call uretprobe_regs_trigger\n"
|
||||
"call uprobe_regs_trigger\n"
|
||||
|
||||
/* save return value and load 2nd argument pointer to rax */
|
||||
"pushq %rax\n"
|
||||
|
@ -92,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
|
|||
);
|
||||
}
|
||||
|
||||
static void test_uretprobe_regs_equal(void)
|
||||
static void test_uprobe_regs_equal(bool retprobe)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_uprobe_opts, opts,
|
||||
.retprobe = retprobe,
|
||||
);
|
||||
struct uprobe_syscall *skel = NULL;
|
||||
struct pt_regs before = {}, after = {};
|
||||
unsigned long *pb = (unsigned long *) &before;
|
||||
unsigned long *pa = (unsigned long *) &after;
|
||||
unsigned long *pp;
|
||||
unsigned long offset;
|
||||
unsigned int i, cnt;
|
||||
int err;
|
||||
|
||||
offset = get_uprobe_offset(&uprobe_regs_trigger);
|
||||
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
|
||||
return;
|
||||
|
||||
skel = uprobe_syscall__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load"))
|
||||
goto cleanup;
|
||||
|
||||
err = uprobe_syscall__attach(skel);
|
||||
if (!ASSERT_OK(err, "uprobe_syscall__attach"))
|
||||
skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe,
|
||||
0, "/proc/self/exe", offset, &opts);
|
||||
if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts"))
|
||||
goto cleanup;
|
||||
|
||||
uretprobe_regs(&before, &after);
|
||||
/* make sure uprobe gets optimized */
|
||||
if (!retprobe)
|
||||
uprobe_regs_trigger();
|
||||
|
||||
uprobe_regs(&before, &after);
|
||||
|
||||
pp = (unsigned long *) &skel->bss->regs;
|
||||
cnt = sizeof(before)/sizeof(*pb);
|
||||
|
@ -119,7 +142,7 @@ static void test_uretprobe_regs_equal(void)
|
|||
unsigned int offset = i * sizeof(unsigned long);
|
||||
|
||||
/*
|
||||
* Check register before and after uretprobe_regs_trigger call
|
||||
* Check register before and after uprobe_regs_trigger call
|
||||
* that triggers the uretprobe.
|
||||
*/
|
||||
switch (offset) {
|
||||
|
@ -133,7 +156,7 @@ static void test_uretprobe_regs_equal(void)
|
|||
|
||||
/*
|
||||
* Check register seen from bpf program and register after
|
||||
* uretprobe_regs_trigger call
|
||||
* uprobe_regs_trigger call (with rax exception, check below).
|
||||
*/
|
||||
switch (offset) {
|
||||
/*
|
||||
|
@ -146,6 +169,15 @@ static void test_uretprobe_regs_equal(void)
|
|||
case offsetof(struct pt_regs, rsp):
|
||||
case offsetof(struct pt_regs, ss):
|
||||
break;
|
||||
/*
|
||||
* uprobe does not see return value in rax, it needs to see the
|
||||
* original (before) rax value
|
||||
*/
|
||||
case offsetof(struct pt_regs, rax):
|
||||
if (!retprobe) {
|
||||
ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check");
|
||||
break;
|
||||
}
|
||||
default:
|
||||
if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check"))
|
||||
fprintf(stdout, "failed register offset %u\n", offset);
|
||||
|
@ -175,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset)
|
|||
return ret != n ? (int) ret : 0;
|
||||
}
|
||||
|
||||
static void test_uretprobe_regs_change(void)
|
||||
static void test_regs_change(void)
|
||||
{
|
||||
struct pt_regs before = {}, after = {};
|
||||
unsigned long *pb = (unsigned long *) &before;
|
||||
|
@ -183,13 +215,16 @@ static void test_uretprobe_regs_change(void)
|
|||
unsigned long cnt = sizeof(before)/sizeof(*pb);
|
||||
unsigned int i, err, offset;
|
||||
|
||||
offset = get_uprobe_offset(uretprobe_regs_trigger);
|
||||
offset = get_uprobe_offset(uprobe_regs_trigger);
|
||||
|
||||
err = write_bpf_testmod_uprobe(offset);
|
||||
if (!ASSERT_OK(err, "register_uprobe"))
|
||||
return;
|
||||
|
||||
uretprobe_regs(&before, &after);
|
||||
/* make sure uprobe gets optimized */
|
||||
uprobe_regs_trigger();
|
||||
|
||||
uprobe_regs(&before, &after);
|
||||
|
||||
err = write_bpf_testmod_uprobe(0);
|
||||
if (!ASSERT_OK(err, "unregister_uprobe"))
|
||||
|
@ -252,6 +287,7 @@ static void test_uretprobe_syscall_call(void)
|
|||
);
|
||||
struct uprobe_syscall_executed *skel;
|
||||
int pid, status, err, go[2], c = 0;
|
||||
struct bpf_link *link;
|
||||
|
||||
if (!ASSERT_OK(pipe(go), "pipe"))
|
||||
return;
|
||||
|
@ -277,11 +313,14 @@ static void test_uretprobe_syscall_call(void)
|
|||
_exit(0);
|
||||
}
|
||||
|
||||
skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid,
|
||||
"/proc/self/exe",
|
||||
"uretprobe_syscall_call", &opts);
|
||||
if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi"))
|
||||
skel->bss->pid = pid;
|
||||
|
||||
link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi,
|
||||
pid, "/proc/self/exe",
|
||||
"uretprobe_syscall_call", &opts);
|
||||
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
|
||||
goto cleanup;
|
||||
skel->links.test_uretprobe_multi = link;
|
||||
|
||||
/* kick the child */
|
||||
write(go[1], &c, 1);
|
||||
|
@ -301,6 +340,256 @@ cleanup:
|
|||
close(go[0]);
|
||||
}
|
||||
|
||||
#define TRAMP "[uprobes-trampoline]"
|
||||
|
||||
__attribute__((aligned(16)))
|
||||
__nocf_check __weak __naked void uprobe_test(void)
|
||||
{
|
||||
asm volatile (" \n"
|
||||
".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n"
|
||||
"ret \n"
|
||||
);
|
||||
}
|
||||
|
||||
__attribute__((aligned(16)))
|
||||
__nocf_check __weak void usdt_test(void)
|
||||
{
|
||||
USDT(optimized_uprobe, usdt);
|
||||
}
|
||||
|
||||
static int find_uprobes_trampoline(void *tramp_addr)
|
||||
{
|
||||
void *start, *end;
|
||||
char line[128];
|
||||
int ret = -1;
|
||||
FILE *maps;
|
||||
|
||||
maps = fopen("/proc/self/maps", "r");
|
||||
if (!maps) {
|
||||
fprintf(stderr, "cannot open maps\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
while (fgets(line, sizeof(line), maps)) {
|
||||
int m = -1;
|
||||
|
||||
/* We care only about private r-x mappings. */
|
||||
if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2)
|
||||
continue;
|
||||
if (m < 0)
|
||||
continue;
|
||||
if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) {
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(maps);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
|
||||
|
||||
static void *find_nop5(void *fn)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 10; i++) {
|
||||
if (!memcmp(nop5, fn + i, 5))
|
||||
return fn + i;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
typedef void (__attribute__((nocf_check)) *trigger_t)(void);
|
||||
|
||||
static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger,
|
||||
void *addr, int executed)
|
||||
{
|
||||
struct __arch_relative_insn {
|
||||
__u8 op;
|
||||
__s32 raddr;
|
||||
} __packed *call;
|
||||
void *tramp = NULL;
|
||||
|
||||
/* Uprobe gets optimized after first trigger, so let's press twice. */
|
||||
trigger();
|
||||
trigger();
|
||||
|
||||
/* Make sure bpf program got executed.. */
|
||||
ASSERT_EQ(skel->bss->executed, executed, "executed");
|
||||
|
||||
/* .. and check the trampoline is as expected. */
|
||||
call = (struct __arch_relative_insn *) addr;
|
||||
tramp = (void *) (call + 1) + call->raddr;
|
||||
ASSERT_EQ(call->op, 0xe8, "call");
|
||||
ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
|
||||
|
||||
return tramp;
|
||||
}
|
||||
|
||||
static void check_detach(void *addr, void *tramp)
|
||||
{
|
||||
/* [uprobes_trampoline] stays after detach */
|
||||
ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
|
||||
ASSERT_OK(memcmp(addr, nop5, 5), "nop5");
|
||||
}
|
||||
|
||||
static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link,
|
||||
trigger_t trigger, void *addr, int executed)
|
||||
{
|
||||
void *tramp;
|
||||
|
||||
tramp = check_attach(skel, trigger, addr, executed);
|
||||
bpf_link__destroy(link);
|
||||
check_detach(addr, tramp);
|
||||
}
|
||||
|
||||
static void test_uprobe_legacy(void)
|
||||
{
|
||||
struct uprobe_syscall_executed *skel = NULL;
|
||||
LIBBPF_OPTS(bpf_uprobe_opts, opts,
|
||||
.retprobe = true,
|
||||
);
|
||||
struct bpf_link *link;
|
||||
unsigned long offset;
|
||||
|
||||
offset = get_uprobe_offset(&uprobe_test);
|
||||
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
|
||||
goto cleanup;
|
||||
|
||||
/* uprobe */
|
||||
skel = uprobe_syscall_executed__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
|
||||
return;
|
||||
|
||||
skel->bss->pid = getpid();
|
||||
|
||||
link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
|
||||
0, "/proc/self/exe", offset, NULL);
|
||||
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
|
||||
goto cleanup;
|
||||
|
||||
check(skel, link, uprobe_test, uprobe_test, 2);
|
||||
|
||||
/* uretprobe */
|
||||
skel->bss->executed = 0;
|
||||
|
||||
link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe,
|
||||
0, "/proc/self/exe", offset, &opts);
|
||||
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
|
||||
goto cleanup;
|
||||
|
||||
check(skel, link, uprobe_test, uprobe_test, 2);
|
||||
|
||||
cleanup:
|
||||
uprobe_syscall_executed__destroy(skel);
|
||||
}
|
||||
|
||||
static void test_uprobe_multi(void)
|
||||
{
|
||||
struct uprobe_syscall_executed *skel = NULL;
|
||||
LIBBPF_OPTS(bpf_uprobe_multi_opts, opts);
|
||||
struct bpf_link *link;
|
||||
unsigned long offset;
|
||||
|
||||
offset = get_uprobe_offset(&uprobe_test);
|
||||
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
|
||||
goto cleanup;
|
||||
|
||||
opts.offsets = &offset;
|
||||
opts.cnt = 1;
|
||||
|
||||
skel = uprobe_syscall_executed__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
|
||||
return;
|
||||
|
||||
skel->bss->pid = getpid();
|
||||
|
||||
/* uprobe.multi */
|
||||
link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi,
|
||||
0, "/proc/self/exe", NULL, &opts);
|
||||
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
|
||||
goto cleanup;
|
||||
|
||||
check(skel, link, uprobe_test, uprobe_test, 2);
|
||||
|
||||
/* uretprobe.multi */
|
||||
skel->bss->executed = 0;
|
||||
opts.retprobe = true;
|
||||
link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi,
|
||||
0, "/proc/self/exe", NULL, &opts);
|
||||
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
|
||||
goto cleanup;
|
||||
|
||||
check(skel, link, uprobe_test, uprobe_test, 2);
|
||||
|
||||
cleanup:
|
||||
uprobe_syscall_executed__destroy(skel);
|
||||
}
|
||||
|
||||
static void test_uprobe_session(void)
|
||||
{
|
||||
struct uprobe_syscall_executed *skel = NULL;
|
||||
LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
|
||||
.session = true,
|
||||
);
|
||||
struct bpf_link *link;
|
||||
unsigned long offset;
|
||||
|
||||
offset = get_uprobe_offset(&uprobe_test);
|
||||
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
|
||||
goto cleanup;
|
||||
|
||||
opts.offsets = &offset;
|
||||
opts.cnt = 1;
|
||||
|
||||
skel = uprobe_syscall_executed__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
|
||||
return;
|
||||
|
||||
skel->bss->pid = getpid();
|
||||
|
||||
link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session,
|
||||
0, "/proc/self/exe", NULL, &opts);
|
||||
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
|
||||
goto cleanup;
|
||||
|
||||
check(skel, link, uprobe_test, uprobe_test, 4);
|
||||
|
||||
cleanup:
|
||||
uprobe_syscall_executed__destroy(skel);
|
||||
}
|
||||
|
||||
static void test_uprobe_usdt(void)
|
||||
{
|
||||
struct uprobe_syscall_executed *skel;
|
||||
struct bpf_link *link;
|
||||
void *addr;
|
||||
|
||||
errno = 0;
|
||||
addr = find_nop5(usdt_test);
|
||||
if (!ASSERT_OK_PTR(addr, "find_nop5"))
|
||||
return;
|
||||
|
||||
skel = uprobe_syscall_executed__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
|
||||
return;
|
||||
|
||||
skel->bss->pid = getpid();
|
||||
|
||||
link = bpf_program__attach_usdt(skel->progs.test_usdt,
|
||||
-1 /* all PIDs */, "/proc/self/exe",
|
||||
"optimized_uprobe", "usdt", NULL);
|
||||
if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt"))
|
||||
goto cleanup;
|
||||
|
||||
check(skel, link, usdt_test, addr, 2);
|
||||
|
||||
cleanup:
|
||||
uprobe_syscall_executed__destroy(skel);
|
||||
}
|
||||
|
||||
/*
|
||||
* Borrowed from tools/testing/selftests/x86/test_shadow_stack.c.
|
||||
*
|
||||
|
@ -343,30 +632,166 @@ static void test_uretprobe_shadow_stack(void)
|
|||
return;
|
||||
}
|
||||
|
||||
/* Run all of the uretprobe tests. */
|
||||
test_uretprobe_regs_equal();
|
||||
test_uretprobe_regs_change();
|
||||
/* Run all the tests with shadow stack in place. */
|
||||
|
||||
test_uprobe_regs_equal(false);
|
||||
test_uprobe_regs_equal(true);
|
||||
test_uretprobe_syscall_call();
|
||||
|
||||
test_uprobe_legacy();
|
||||
test_uprobe_multi();
|
||||
test_uprobe_session();
|
||||
test_uprobe_usdt();
|
||||
|
||||
test_regs_change();
|
||||
|
||||
ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
|
||||
}
|
||||
|
||||
static volatile bool race_stop;
|
||||
|
||||
static USDT_DEFINE_SEMA(race);
|
||||
|
||||
static void *worker_trigger(void *arg)
|
||||
{
|
||||
unsigned long rounds = 0;
|
||||
|
||||
while (!race_stop) {
|
||||
uprobe_test();
|
||||
rounds++;
|
||||
}
|
||||
|
||||
printf("tid %d trigger rounds: %lu\n", gettid(), rounds);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void *worker_attach(void *arg)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_uprobe_opts, opts);
|
||||
struct uprobe_syscall_executed *skel;
|
||||
unsigned long rounds = 0, offset;
|
||||
const char *sema[2] = {
|
||||
__stringify(USDT_SEMA(race)),
|
||||
NULL,
|
||||
};
|
||||
unsigned long *ref;
|
||||
int err;
|
||||
|
||||
offset = get_uprobe_offset(&uprobe_test);
|
||||
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
|
||||
return NULL;
|
||||
|
||||
err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT);
|
||||
if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema"))
|
||||
return NULL;
|
||||
|
||||
opts.ref_ctr_offset = *ref;
|
||||
|
||||
skel = uprobe_syscall_executed__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
|
||||
return NULL;
|
||||
|
||||
skel->bss->pid = getpid();
|
||||
|
||||
while (!race_stop) {
|
||||
skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
|
||||
0, "/proc/self/exe", offset, &opts);
|
||||
if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts"))
|
||||
break;
|
||||
|
||||
bpf_link__destroy(skel->links.test_uprobe);
|
||||
skel->links.test_uprobe = NULL;
|
||||
rounds++;
|
||||
}
|
||||
|
||||
printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed);
|
||||
uprobe_syscall_executed__destroy(skel);
|
||||
free(ref);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static useconds_t race_msec(void)
|
||||
{
|
||||
char *env;
|
||||
|
||||
env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC");
|
||||
if (env)
|
||||
return atoi(env);
|
||||
|
||||
/* default duration is 500ms */
|
||||
return 500;
|
||||
}
|
||||
|
||||
static void test_uprobe_race(void)
|
||||
{
|
||||
int err, i, nr_threads;
|
||||
pthread_t *threads;
|
||||
|
||||
nr_threads = libbpf_num_possible_cpus();
|
||||
if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus"))
|
||||
return;
|
||||
nr_threads = max(2, nr_threads);
|
||||
|
||||
threads = alloca(sizeof(*threads) * nr_threads);
|
||||
if (!ASSERT_OK_PTR(threads, "malloc"))
|
||||
return;
|
||||
|
||||
for (i = 0; i < nr_threads; i++) {
|
||||
err = pthread_create(&threads[i], NULL, i % 2 ? worker_trigger : worker_attach,
|
||||
NULL);
|
||||
if (!ASSERT_OK(err, "pthread_create"))
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
usleep(race_msec() * 1000);
|
||||
|
||||
cleanup:
|
||||
race_stop = true;
|
||||
for (nr_threads = i, i = 0; i < nr_threads; i++)
|
||||
pthread_join(threads[i], NULL);
|
||||
|
||||
ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore");
|
||||
}
|
||||
|
||||
#ifndef __NR_uprobe
|
||||
#define __NR_uprobe 336
|
||||
#endif
|
||||
|
||||
static void test_uprobe_error(void)
|
||||
{
|
||||
long err = syscall(__NR_uprobe);
|
||||
|
||||
ASSERT_EQ(err, -1, "error");
|
||||
ASSERT_EQ(errno, ENXIO, "errno");
|
||||
}
|
||||
|
||||
static void __test_uprobe_syscall(void)
|
||||
{
|
||||
if (test__start_subtest("uretprobe_regs_equal"))
|
||||
test_uprobe_regs_equal(true);
|
||||
if (test__start_subtest("uretprobe_syscall_call"))
|
||||
test_uretprobe_syscall_call();
|
||||
if (test__start_subtest("uretprobe_shadow_stack"))
|
||||
test_uretprobe_shadow_stack();
|
||||
if (test__start_subtest("uprobe_legacy"))
|
||||
test_uprobe_legacy();
|
||||
if (test__start_subtest("uprobe_multi"))
|
||||
test_uprobe_multi();
|
||||
if (test__start_subtest("uprobe_session"))
|
||||
test_uprobe_session();
|
||||
if (test__start_subtest("uprobe_usdt"))
|
||||
test_uprobe_usdt();
|
||||
if (test__start_subtest("uprobe_race"))
|
||||
test_uprobe_race();
|
||||
if (test__start_subtest("uprobe_error"))
|
||||
test_uprobe_error();
|
||||
if (test__start_subtest("uprobe_regs_equal"))
|
||||
test_uprobe_regs_equal(false);
|
||||
if (test__start_subtest("regs_change"))
|
||||
test_regs_change();
|
||||
}
|
||||
#else
|
||||
static void test_uretprobe_regs_equal(void)
|
||||
{
|
||||
test__skip();
|
||||
}
|
||||
|
||||
static void test_uretprobe_regs_change(void)
|
||||
{
|
||||
test__skip();
|
||||
}
|
||||
|
||||
static void test_uretprobe_syscall_call(void)
|
||||
{
|
||||
test__skip();
|
||||
}
|
||||
|
||||
static void test_uretprobe_shadow_stack(void)
|
||||
static void __test_uprobe_syscall(void)
|
||||
{
|
||||
test__skip();
|
||||
}
|
||||
|
@ -374,12 +799,5 @@ static void test_uretprobe_shadow_stack(void)
|
|||
|
||||
void test_uprobe_syscall(void)
|
||||
{
|
||||
if (test__start_subtest("uretprobe_regs_equal"))
|
||||
test_uretprobe_regs_equal();
|
||||
if (test__start_subtest("uretprobe_regs_change"))
|
||||
test_uretprobe_regs_change();
|
||||
if (test__start_subtest("uretprobe_syscall_call"))
|
||||
test_uretprobe_syscall_call();
|
||||
if (test__start_subtest("uretprobe_shadow_stack"))
|
||||
test_uretprobe_shadow_stack();
|
||||
__test_uprobe_syscall();
|
||||
}
|
||||
|
|
|
@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) {
|
|||
}
|
||||
}
|
||||
|
||||
static void subtest_basic_usdt(void)
|
||||
static void subtest_basic_usdt(bool optimized)
|
||||
{
|
||||
LIBBPF_OPTS(bpf_usdt_opts, opts);
|
||||
struct test_usdt *skel;
|
||||
struct test_usdt__bss *bss;
|
||||
int err, i;
|
||||
int err, i, called;
|
||||
|
||||
#define TRIGGER(x) ({ \
|
||||
trigger_func(x); \
|
||||
if (optimized) \
|
||||
trigger_func(x); \
|
||||
optimized ? 2 : 1; \
|
||||
})
|
||||
|
||||
skel = test_usdt__open_and_load();
|
||||
if (!ASSERT_OK_PTR(skel, "skel_open"))
|
||||
|
@ -66,11 +73,11 @@ static void subtest_basic_usdt(void)
|
|||
if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link"))
|
||||
goto cleanup;
|
||||
|
||||
trigger_func(1);
|
||||
called = TRIGGER(1);
|
||||
|
||||
ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called");
|
||||
ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called");
|
||||
ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called");
|
||||
ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
|
||||
ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
|
||||
ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
|
||||
|
||||
ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie");
|
||||
ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt");
|
||||
|
@ -119,11 +126,11 @@ static void subtest_basic_usdt(void)
|
|||
* bpf_program__attach_usdt() handles this properly and attaches to
|
||||
* all possible places of USDT invocation.
|
||||
*/
|
||||
trigger_func(2);
|
||||
called += TRIGGER(2);
|
||||
|
||||
ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called");
|
||||
ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called");
|
||||
ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called");
|
||||
ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
|
||||
ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
|
||||
ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
|
||||
|
||||
/* only check values that depend on trigger_func()'s input value */
|
||||
ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1");
|
||||
|
@ -142,9 +149,9 @@ static void subtest_basic_usdt(void)
|
|||
if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach"))
|
||||
goto cleanup;
|
||||
|
||||
trigger_func(3);
|
||||
called += TRIGGER(3);
|
||||
|
||||
ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called");
|
||||
ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
|
||||
/* this time usdt3 has custom cookie */
|
||||
ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie");
|
||||
ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt");
|
||||
|
@ -158,6 +165,7 @@ static void subtest_basic_usdt(void)
|
|||
|
||||
cleanup:
|
||||
test_usdt__destroy(skel);
|
||||
#undef TRIGGER
|
||||
}
|
||||
|
||||
unsigned short test_usdt_100_semaphore SEC(".probes");
|
||||
|
@ -425,7 +433,11 @@ cleanup:
|
|||
void test_usdt(void)
|
||||
{
|
||||
if (test__start_subtest("basic"))
|
||||
subtest_basic_usdt();
|
||||
subtest_basic_usdt(false);
|
||||
#ifdef __x86_64__
|
||||
if (test__start_subtest("basic_optimized"))
|
||||
subtest_basic_usdt(true);
|
||||
#endif
|
||||
if (test__start_subtest("multispec"))
|
||||
subtest_multispec_usdt();
|
||||
if (test__start_subtest("urand_auto_attach"))
|
||||
|
|
|
@ -7,8 +7,8 @@ struct pt_regs regs;
|
|||
|
||||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger")
|
||||
int uretprobe(struct pt_regs *ctx)
|
||||
SEC("uprobe")
|
||||
int probe(struct pt_regs *ctx)
|
||||
{
|
||||
__builtin_memcpy(®s, ctx, sizeof(regs));
|
||||
return 0;
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
// SPDX-License-Identifier: GPL-2.0
|
||||
#include "vmlinux.h"
|
||||
#include <bpf/bpf_helpers.h>
|
||||
#include <bpf/bpf_tracing.h>
|
||||
#include <bpf/usdt.bpf.h>
|
||||
#include <string.h>
|
||||
|
||||
struct pt_regs regs;
|
||||
|
@ -8,10 +10,64 @@ struct pt_regs regs;
|
|||
char _license[] SEC("license") = "GPL";
|
||||
|
||||
int executed = 0;
|
||||
int pid;
|
||||
|
||||
SEC("uretprobe.multi")
|
||||
int test(struct pt_regs *regs)
|
||||
SEC("uprobe")
|
||||
int BPF_UPROBE(test_uprobe)
|
||||
{
|
||||
executed = 1;
|
||||
if (bpf_get_current_pid_tgid() >> 32 != pid)
|
||||
return 0;
|
||||
|
||||
executed++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("uretprobe")
|
||||
int BPF_URETPROBE(test_uretprobe)
|
||||
{
|
||||
if (bpf_get_current_pid_tgid() >> 32 != pid)
|
||||
return 0;
|
||||
|
||||
executed++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("uprobe.multi")
|
||||
int test_uprobe_multi(struct pt_regs *ctx)
|
||||
{
|
||||
if (bpf_get_current_pid_tgid() >> 32 != pid)
|
||||
return 0;
|
||||
|
||||
executed++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("uretprobe.multi")
|
||||
int test_uretprobe_multi(struct pt_regs *ctx)
|
||||
{
|
||||
if (bpf_get_current_pid_tgid() >> 32 != pid)
|
||||
return 0;
|
||||
|
||||
executed++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("uprobe.session")
|
||||
int test_uprobe_session(struct pt_regs *ctx)
|
||||
{
|
||||
if (bpf_get_current_pid_tgid() >> 32 != pid)
|
||||
return 0;
|
||||
|
||||
executed++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
SEC("usdt")
|
||||
int test_usdt(struct pt_regs *ctx)
|
||||
{
|
||||
if (bpf_get_current_pid_tgid() >> 32 != pid)
|
||||
return 0;
|
||||
|
||||
executed++;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -500,15 +500,21 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
|
|||
*/
|
||||
#ifdef __x86_64__
|
||||
|
||||
static int
|
||||
uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data)
|
||||
{
|
||||
regs->cx = 0x87654321feebdaed;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
|
||||
struct pt_regs *regs, __u64 *data)
|
||||
|
||||
{
|
||||
regs->ax = 0x12345678deadbeef;
|
||||
regs->cx = 0x87654321feebdaed;
|
||||
regs->r11 = (u64) -1;
|
||||
return true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
struct testmod_uprobe {
|
||||
|
@ -520,6 +526,7 @@ struct testmod_uprobe {
|
|||
static DEFINE_MUTEX(testmod_uprobe_mutex);
|
||||
|
||||
static struct testmod_uprobe uprobe = {
|
||||
.consumer.handler = uprobe_handler,
|
||||
.consumer.ret_handler = uprobe_ret_handler,
|
||||
};
|
||||
|
||||
|
|
|
@ -0,0 +1,545 @@
|
|||
// SPDX-License-Identifier: BSD-2-Clause
|
||||
/*
|
||||
* This single-header library defines a collection of variadic macros for
|
||||
* defining and triggering USDTs (User Statically-Defined Tracepoints):
|
||||
*
|
||||
* - For USDTs without associated semaphore:
|
||||
* USDT(group, name, args...)
|
||||
*
|
||||
* - For USDTs with implicit (transparent to the user) semaphore:
|
||||
* USDT_WITH_SEMA(group, name, args...)
|
||||
* USDT_IS_ACTIVE(group, name)
|
||||
*
|
||||
* - For USDTs with explicit (user-defined and provided) semaphore:
|
||||
* USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...)
|
||||
* USDT_SEMA_IS_ACTIVE(sema)
|
||||
*
|
||||
* all of which emit a NOP instruction into the instruction stream, and so
|
||||
* have *zero* overhead for the surrounding code. USDTs are identified by
|
||||
* a combination of `group` and `name` identifiers, which is used by external
|
||||
* tracing tooling (tracers) for identifying exact USDTs of interest.
|
||||
*
|
||||
* USDTs can have an associated (2-byte) activity counter (USDT semaphore),
|
||||
* automatically maintained by Linux kernel whenever any correctly written
|
||||
* BPF-based tracer is attached to the USDT. This USDT semaphore can be used
|
||||
* to check whether there is a need to do any extra data collection and
|
||||
* processing for a given USDT (if necessary), and otherwise avoid extra work
|
||||
* for a common case of USDT not being traced ("active").
|
||||
*
|
||||
* See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or
|
||||
* USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on
|
||||
* working with USDTs with implicitly or explicitly associated
|
||||
* USDT semaphores, respectively.
|
||||
*
|
||||
* There is also some additional data recorded into an auxiliary note
|
||||
* section. The data in the note section describes the operands, in terms of
|
||||
* size and location, used by tracing tooling to know where to find USDT
|
||||
* arguments. Each location is encoded as an assembler operand string.
|
||||
* Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert
|
||||
* breakpoints on top of the nop, and decode the location operand-strings,
|
||||
* like an assembler, to find the values being passed.
|
||||
*
|
||||
* The operand strings are selected by the compiler for each operand.
|
||||
* They are constrained by inline-assembler codes.The default is:
|
||||
*
|
||||
* #define USDT_ARG_CONSTRAINT nor
|
||||
*
|
||||
* This is a good default if the operands tend to be integral and
|
||||
* moderate in number (smaller than number of registers). In other
|
||||
* cases, the compiler may report "'asm' requires impossible reload" or
|
||||
* similar. In this case, consider simplifying the macro call (fewer
|
||||
* and simpler operands), reduce optimization, or override the default
|
||||
* constraints string via:
|
||||
*
|
||||
* #define USDT_ARG_CONSTRAINT g
|
||||
* #include <usdt.h>
|
||||
*
|
||||
* For some historical description of USDT v3 format (the one used by this
|
||||
* library and generally recognized and assumed by BPF-based tracing tools)
|
||||
* see [0]. The more formal specification can be found at [1]. Additional
|
||||
* argument constraints information can be found at [2].
|
||||
*
|
||||
* Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for
|
||||
* this USDT library implementation. Current implementation differs *a lot* in
|
||||
* terms of exposed user API and general usability, which was the main goal
|
||||
* and focus of the reimplementation work. Nevertheless, underlying recorded
|
||||
* USDT definitions are fully binary compatible and any USDT-based tooling
|
||||
* should work equally well with USDTs defined by either SystemTap's or this
|
||||
* library's USDT implementation.
|
||||
*
|
||||
* [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html
|
||||
* [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
|
||||
* [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html
|
||||
* [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h
|
||||
*/
|
||||
#ifndef __USDT_H
|
||||
#define __USDT_H
|
||||
|
||||
/*
|
||||
* Changelog:
|
||||
*
|
||||
* 0.1.0
|
||||
* -----
|
||||
* - Initial release
|
||||
*/
|
||||
#define USDT_MAJOR_VERSION 0
|
||||
#define USDT_MINOR_VERSION 1
|
||||
#define USDT_PATCH_VERSION 0
|
||||
|
||||
/* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
|
||||
#define __usdt_va_opt 1
|
||||
#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__
|
||||
#else
|
||||
#define __usdt_va_args(...) , ##__VA_ARGS__
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Trigger USDT with `group`:`name` identifier and pass through `args` as its
|
||||
* arguments. Zero arguments are acceptable as well. No USDT semaphore is
|
||||
* associated with this USDT.
|
||||
*
|
||||
* Such "semaphoreless" USDTs are commonly used when there is no extra data
|
||||
* collection or processing needed to collect and prepare USDT arguments and
|
||||
* they are just available in the surrounding code. USDT() macro will just
|
||||
* record their locations in CPU registers or in memory for tracing tooling to
|
||||
* be able to access them, if necessary.
|
||||
*/
|
||||
#ifdef __usdt_va_opt
|
||||
#define USDT(group, name, ...) \
|
||||
__usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__)
|
||||
#else
|
||||
#define USDT(group, name, ...) \
|
||||
__usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Trigger USDT with `group`:`name` identifier and pass through `args` as its
|
||||
* arguments. Zero arguments are acceptable as well. USDT also get an
|
||||
* implicitly-defined associated USDT semaphore, which will be "activated" by
|
||||
* tracing tooling and can be used to check whether USDT is being actively
|
||||
* observed.
|
||||
*
|
||||
* USDTs with semaphore are commonly used when there is a need to perform
|
||||
* additional data collection and processing to prepare USDT arguments, which
|
||||
* otherwise might not be necessary for the rest of application logic. In such
|
||||
* case, USDT semaphore can be used to avoid unnecessary extra work. If USDT
|
||||
* is not traced (which is presumed to be a common situation), the associated
|
||||
* USDT semaphore is "inactive", and so there is no need to waste resources to
|
||||
* prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to check whether
|
||||
* USDT is "active".
|
||||
*
|
||||
* N.B. There is an inherent (albeit short) gap between checking whether USDT
|
||||
* is active and triggering corresponding USDT, in which external tracer can
|
||||
* be attached to an USDT and activate USDT semaphore after the activity check.
|
||||
* If such a race occurs, tracers might miss one USDT execution. Tracers are
|
||||
* expected to accommodate such possibility and this is expected to not be
|
||||
* a problem for applications and tracers.
|
||||
*
|
||||
* N.B. Implicit USDT semaphore defined by USDT_WITH_SEMA() is contained
|
||||
* within a single executable or shared library and is not shared outside
|
||||
* them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name
|
||||
* identifier across executable and shared library, it will work and won't
|
||||
* conflict, per se, but will define independent USDT semaphores, one for each
|
||||
* shared library/executable in which USDT_WITH_SEMA(group, name) is used.
|
||||
* That is, if you attach to this USDT in one shared library (or executable),
|
||||
* then only USDT semaphore within that shared library (or executable) will be
|
||||
* updated by the kernel, while other libraries (or executable) will not see
|
||||
* activated USDT semaphore. In short, it's best to use unique USDT group:name
|
||||
* identifiers across different shared libraries (and, equivalently, between
|
||||
* executable and shared library). This is advanced consideration and is
|
||||
* rarely (if ever) seen in practice, but just to avoid surprises this is
|
||||
* called out here. (Static libraries become a part of final executable, once
|
||||
* linked by linker, so the above considerations don't apply to them.)
|
||||
*/
|
||||
#ifdef __usdt_va_opt
|
||||
#define USDT_WITH_SEMA(group, name, ...) \
|
||||
__usdt_probe(group, name, \
|
||||
__usdt_sema_implicit, __usdt_sema_name(group, name) \
|
||||
__VA_OPT__(,) __VA_ARGS__)
|
||||
#else
|
||||
#define USDT_WITH_SEMA(group, name, ...) \
|
||||
__usdt_probe(group, name, \
|
||||
__usdt_sema_implicit, __usdt_sema_name(group, name), \
|
||||
##__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
struct usdt_sema { volatile unsigned short active; };
|
||||
|
||||
/*
|
||||
* Check if USDT with `group`:`name` identifier is "active" (i.e., whether it
|
||||
* is attached to by external tracing tooling and is actively observed).
|
||||
*
|
||||
* This macro can be used to decide whether any additional and potentially
|
||||
* expensive data collection or processing should be done to pass extra
|
||||
* information into the given USDT. It is assumed that USDT is triggered with
|
||||
* USDT_WITH_SEMA() macro which will implicitly define associated USDT
|
||||
* semaphore. (If one needs more control over USDT semaphore, see
|
||||
* USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.)
|
||||
*
|
||||
* N.B. Such checks are necessarily racy and speculative. Between checking
|
||||
* whether USDT is active and triggering the USDT itself, tracer can be
|
||||
* detached with no notification. This race should be extremely rare and worst
|
||||
* case should result in one-time wasted extra data collection and processing.
|
||||
*/
|
||||
#define USDT_IS_ACTIVE(group, name) ({ \
|
||||
extern struct usdt_sema __usdt_sema_name(group, name) \
|
||||
__usdt_asm_name(__usdt_sema_name(group, name)); \
|
||||
__usdt_sema_implicit(__usdt_sema_name(group, name)); \
|
||||
__usdt_sema_name(group, name).active > 0; \
|
||||
})

/*
 * APIs for working with user-defined explicit USDT semaphores.
 *
 * This is a less commonly used advanced API for use cases in which user needs
 * an explicit control over (potentially shared across multiple USDTs) USDT
 * semaphore instance. This can be used when there is a group of logically
 * related USDTs that all need extra data collection and processing whenever
 * any of a family of related USDTs are "activated" (i.e., traced). In such
 * a case, all such related USDTs will be associated with the same shared USDT
 * semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be
 * triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra
 * USDT semaphore identifier as an extra parameter.
 */

/**
 * Underlying C global variable name for user-defined USDT semaphore with
 * `sema` identifier. Could be useful for debugging, but normally shouldn't be
 * used explicitly.
 */
#define USDT_SEMA(sema) __usdt_sema_##sema

/*
 * Define storage for user-defined USDT semaphore `sema`.
 *
 * Should be used only once in non-header source file to let compiler allocate
 * space for the semaphore variable. Just like with any other global variable.
 *
 * This macro can be used anywhere where global variable declaration is
 * allowed. Just like with global variable definitions, there should be only
 * one definition of user-defined USDT semaphore with given `sema` identifier,
 * otherwise compiler or linker will complain about duplicate variable
 * definition.
 *
 * For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace
 * and inside namespaces (including nested namespaces). Just make sure that
 * USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is
 * referenced, or any of its parent namespaces, so the C++ language-level
 * identifier is visible to the code that needs to reference the semaphore.
 * At the lowest layer, USDT semaphores have global naming and visibility
 * (they have a corresponding `__usdt_sema_<name>` symbol, which can be linked
 * against from C or C++ code, if necessary). To keep it simple, putting
 * USDT_DECLARE_SEMA() declarations into global namespaces is the simplest
 * no-brainer solution. All these aspects are irrelevant for plain C, because
 * C doesn't have namespaces and everything is always in the global namespace.
 *
 * N.B. Due to USDT metadata being recorded in non-allocatable ELF note
 * section, it has limitations when it comes to relocations, which, in
 * practice, means that it's not possible to correctly share USDT semaphores
 * between main executable and shared libraries, or even between multiple
 * shared libraries. USDT semaphore has to be contained to individual shared
 * library or executable to avoid unpleasant surprises with half-working USDT
 * semaphores. We enforce this by marking semaphore ELF symbols as having
 * a hidden visibility. This is quite an advanced use case and consideration
 * and for most users this should have no consequences whatsoever.
 */
#define USDT_DEFINE_SEMA(sema) \
	struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \
			 __usdt_asm_name(USDT_SEMA(sema)) \
			 __attribute__((visibility("hidden"))) = { 0 }

/*
 * Declare extern reference to user-defined USDT semaphore `sema`.
 *
 * Refers to a variable defined in another compilation unit by
 * USDT_DEFINE_SEMA() and allows to use the same USDT semaphore across
 * multiple compilation units (i.e., .c and .cpp files).
 *
 * See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities.
 */
#define USDT_DECLARE_SEMA(sema) \
	extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema))

/*
 * Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it
 * is attached to by external tracing tooling and is actively observed).
 *
 * This macro can be used to decide whether any additional and potentially
 * expensive data collection or processing should be done to pass extra
 * information into USDT(s) associated with USDT semaphore `sema`.
 *
 * N.B. Such checks are necessarily racy. Between checking the state of USDT
 * semaphore and triggering associated USDT(s), the active tracer might attach
 * or detach. This race should be extremely rare and worst case should result
 * in one-time missed USDT event or wasted extra data collection and
 * processing. USDT-using tracers should be written with this in mind and is
 * not a concern of the application defining USDTs with associated semaphore.
 */
#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0)

/*
 * Invoke USDT specified by `group` and `name` identifiers and associate
 * explicitly user-defined semaphore `sema` with it. Pass through `args` as
 * USDT arguments. `args` are optional and zero arguments are acceptable.
 *
 * Semaphore is defined with the help of USDT_DEFINE_SEMA() macro and can be
 * checked whether active with USDT_SEMA_IS_ACTIVE().
 */
#ifdef __usdt_va_opt
#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \
	__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__)
#else
#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \
	__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__)
#endif
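
/*
 * Illustrative sketch of the explicit-semaphore flow (identifiers such as
 * "net_sema", "myapp", "pkt_in" and "pkt_out" are hypothetical, not defined
 * by this header):
 *
 *	// exactly one .c/.cpp file allocates the semaphore storage
 *	USDT_DEFINE_SEMA(net_sema);
 *
 *	// any other compilation unit only declares it
 *	USDT_DECLARE_SEMA(net_sema);
 *
 *	void handle_pkt(const struct pkt *p)
 *	{
 *		if (USDT_SEMA_IS_ACTIVE(net_sema)) {
 *			USDT_WITH_EXPLICIT_SEMA(net_sema, myapp, pkt_in, p->len);
 *			USDT_WITH_EXPLICIT_SEMA(net_sema, myapp, pkt_out, p->len);
 *		}
 *	}
 */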

/*
 * Adjustable implementation aspects
 */
#ifndef USDT_ARG_CONSTRAINT
#if defined __powerpc__
#define USDT_ARG_CONSTRAINT nZr
#elif defined __arm__
#define USDT_ARG_CONSTRAINT g
#elif defined __loongarch__
#define USDT_ARG_CONSTRAINT nmr
#else
#define USDT_ARG_CONSTRAINT nor
#endif
#endif /* USDT_ARG_CONSTRAINT */

#ifndef USDT_NOP
#if defined(__ia64__) || defined(__s390__) || defined(__s390x__)
#define USDT_NOP nop 0
#else
#define USDT_NOP nop
#endif
#endif /* USDT_NOP */

/*
 * Implementation details
 */
/* USDT name for implicitly-defined USDT semaphore, derived from group:name */
#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name
/* ELF section into which USDT semaphores are put */
#define __usdt_sema_sec __attribute__((section(".probes")))

#define __usdt_concat(a, b) a ## b
#define __usdt_apply(fn, n) __usdt_concat(fn, n)

#ifndef __usdt_nth
#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N
#endif

#ifndef __usdt_narg
#ifdef __usdt_va_opt
#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#else
#define __usdt_narg(...) __usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#endif
#endif /* __usdt_narg */
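
/*
 * For example, __usdt_narg() expands to 0, __usdt_narg(a) to 1 and
 * __usdt_narg(a, b, c) to 3 (12 arguments is the maximum); __usdt_apply()
 * then pastes that count onto a macro family name to select the matching
 * __usdt_asmN()/__usdt_asm_argsN()/__usdt_asm_opsN() variant below.
 */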

#define __usdt_hash #
#define __usdt_str_(x) #x
#define __usdt_str(x) __usdt_str_(x)

#ifndef __usdt_asm_name
#define __usdt_asm_name(name) __asm__(__usdt_str(name))
#endif

#define __usdt_asm0() "\n"
#define __usdt_asm1(x) __usdt_str(x) "\n"
#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__)
#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__)
#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__)
#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__)
#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__)
#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__)
#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__)
#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__)
#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__)
#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__)
#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__)
#define __usdt_asm(...) __usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__)

#ifdef __LP64__
#define __usdt_asm_addr .8byte
#else
#define __usdt_asm_addr .4byte
#endif

#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x)
#define __usdt_asm_strz(x) __usdt_asm_strz_(x)
#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x)
#define __usdt_asm_str(x) __usdt_asm_str_(x)

/* "semaphoreless" USDT case */
#ifndef __usdt_sema_none
#define __usdt_sema_none(sema)
#endif

/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */
#ifndef __usdt_sema_implicit
#define __usdt_sema_implicit(sema) \
	__asm__ __volatile__ ( \
		__usdt_asm1(.ifndef sema) \
		__usdt_asm3( .pushsection .probes, "aw", "progbits") \
		__usdt_asm1( .weak sema) \
		__usdt_asm1( .hidden sema) \
		__usdt_asm1( .align 2) \
		__usdt_asm1(sema:) \
		__usdt_asm1( .zero 2) \
		__usdt_asm2( .type sema, @object) \
		__usdt_asm2( .size sema, 2) \
		__usdt_asm1( .popsection) \
		__usdt_asm1(.endif) \
	);
#endif

/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */
#ifndef __usdt_sema_explicit
#define __usdt_sema_explicit(sema) \
	__asm__ __volatile__ ("" :: "m" (sema));
#endif

/* main USDT definition (nop and .note.stapsdt metadata) */
#define __usdt_probe(group, name, sema_def, sema, ...) do { \
	sema_def(sema) \
	__asm__ __volatile__ ( \
		__usdt_asm( 990: USDT_NOP) \
		__usdt_asm3( .pushsection .note.stapsdt, "", "note") \
		__usdt_asm1( .balign 4) \
		__usdt_asm3( .4byte 992f-991f,994f-993f,3) \
		__usdt_asm1(991: .asciz "stapsdt") \
		__usdt_asm1(992: .balign 4) \
		__usdt_asm1(993: __usdt_asm_addr 990b) \
		__usdt_asm1( __usdt_asm_addr _.stapsdt.base) \
		__usdt_asm1( __usdt_asm_addr sema) \
		__usdt_asm_strz(group) \
		__usdt_asm_strz(name) \
		__usdt_asm_args(__VA_ARGS__) \
		__usdt_asm1( .ascii "\0") \
		__usdt_asm1(994: .balign 4) \
		__usdt_asm1( .popsection) \
		__usdt_asm1(.ifndef _.stapsdt.base) \
		__usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat) \
		__usdt_asm1( .weak _.stapsdt.base) \
		__usdt_asm1( .hidden _.stapsdt.base) \
		__usdt_asm1(_.stapsdt.base:) \
		__usdt_asm1( .space 1) \
		__usdt_asm2( .size _.stapsdt.base, 1) \
		__usdt_asm1( .popsection) \
		__usdt_asm1(.endif) \
		:: __usdt_asm_ops(__VA_ARGS__) \
	); \
} while (0)
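
/*
 * The emitted .note.stapsdt record (note type 3, vendor string "stapsdt")
 * thus describes the probe as: the probe site address (990b), the shared
 * _.stapsdt.base anchor address (letting note consumers compensate for
 * load-time address shifts), the semaphore address (0 if no semaphore is
 * used), followed by NUL-terminated group, name and argument-spec strings.
 */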

/*
 * NB: gdb PR24541 highlighted an unspecified corner of the sdt.h
 * operand note format.
 *
 * The named register may be a longer or shorter (!) alias for the
 * storage where the value in question is found. For example, on
 * i386, 64-bit value may be put in register pairs, and a register
 * name stored would identify just one of them. Previously, gcc was
 * asked to emit the %w[id] (16-bit alias of some registers holding
 * operands), even when a wider 32-bit value was used.
 *
 * Bottom line: the byte-width given before the @ sign governs. If
 * there is a mismatch between that width and that of the named
 * register, then a sys/sdt.h note consumer may need to employ
 * architecture-specific heuristics to figure out where the compiler
 * has actually put the complete value.
 */
#if defined(__powerpc__) || defined(__powerpc64__)
#define __usdt_argref(id) %I[id]%[id]
#elif defined(__i386__)
#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */
#else
#define __usdt_argref(id) %[id]
#endif

#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \
			  __usdt_asm1(.ascii "@") \
			  __usdt_asm_str(__usdt_argref(__usdt_aval##n))

#define __usdt_asm_args0 /* no arguments */
#define __usdt_asm_args1 __usdt_asm_arg(1)
#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2)
#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3)
#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4)
#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5)
#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6)
#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7)
#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8)
#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9)
#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10)
#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11)
#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12)
#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__))

#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5)
#define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x))

/*
 * We can't use __builtin_choose_expr() in C++, so fall back to table-based
 * signedness determination for known types, utilizing templates magic.
 */
#ifdef __cplusplus

#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed)

#include <cstddef>

template<typename T> struct __usdt_t { static const bool is_signed = false; };
template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {};
template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {};

#define __usdt_def_signed(T) \
template<> struct __usdt_t<T> { static const bool is_signed = true; }; \
template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \
template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \
template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; }
#define __usdt_maybe_signed(T) \
template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \
template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \
template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \
template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; }

__usdt_def_signed(signed char);
__usdt_def_signed(short);
__usdt_def_signed(int);
__usdt_def_signed(long);
__usdt_def_signed(long long);
__usdt_maybe_signed(char);
__usdt_maybe_signed(wchar_t);

#else /* !__cplusplus */

#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4)
#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U))
#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1)

#endif /* __cplusplus */

#define __usdt_asm_op(n, x) \
	[__usdt_asz##n] "n" ((__usdt_is_signed(x) ? (int)-1 : 1) * (int)__usdt_arg_size(x)), \
	[__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x)
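
/*
 * Together, __usdt_asm_arg() and __usdt_asm_op() emit the conventional
 * sys/sdt.h argument spec of the form "<size>@<operand>", with a negative
 * size denoting a signed argument. For example, a signed 32-bit value might
 * be described as "-4@%eax" and an unsigned 64-bit one as "8@%rdi"; the
 * exact operand text depends on what the compiler picks under
 * USDT_ARG_CONSTRAINT.
 */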

#define __usdt_asm_ops0() [__usdt_dummy] "g" (0)
#define __usdt_asm_ops1(x) __usdt_asm_op(1, x)
#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x)
#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x)
#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x)
#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x)
#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x)
#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x)
#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x)
#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x)
#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x)
#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x)
#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x)
#define __usdt_asm_ops(...) __usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__)

#endif /* __USDT_H */

@@ -74,6 +74,14 @@
 #define noinline __attribute__((noinline))
 #endif
 
+#ifndef __nocf_check
+#define __nocf_check __attribute__((nocf_check))
+#endif
+
+#ifndef __naked
+#define __naked __attribute__((__naked__))
+#endif
+
 #ifndef PR_SET_NO_NEW_PRIVS
 #define PR_SET_NO_NEW_PRIVS 38
 #define PR_GET_NO_NEW_PRIVS 39
@@ -5027,7 +5035,36 @@ TEST(tsync_vs_dead_thread_leader)
 	EXPECT_EQ(0, status);
 }
 
-noinline int probed(void)
+#ifdef __x86_64__
+
+/*
+ * We need naked probed_uprobe function. Using __nocf_check
+ * check to skip possible endbr64 instruction and ignoring
+ * -Wattributes, otherwise the compilation might fail.
+ */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wattributes"
+
+__naked __nocf_check noinline int probed_uprobe(void)
+{
+	/*
+	 * Optimized uprobe is possible only on top of nop5 instruction.
+	 */
+	asm volatile (" \n"
+		".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n"
+		"ret \n"
+	);
+}
+#pragma GCC diagnostic pop
+
+#else
+noinline int probed_uprobe(void)
+{
+	return 1;
+}
+#endif
+
+noinline int probed_uretprobe(void)
 {
 	return 1;
 }
@@ -5080,35 +5117,46 @@ static ssize_t get_uprobe_offset(const void *addr)
 	return found ? (uintptr_t)addr - start + base : -1;
 }
 
-FIXTURE(URETPROBE) {
+FIXTURE(UPROBE) {
 	int fd;
 };
 
-FIXTURE_VARIANT(URETPROBE) {
+FIXTURE_VARIANT(UPROBE) {
 	/*
-	 * All of the URETPROBE behaviors can be tested with either
-	 * uretprobe attached or not
+	 * All of the U(RET)PROBE behaviors can be tested with either
+	 * u(ret)probe attached or not
 	 */
 	bool attach;
+	/*
+	 * Test both uprobe and uretprobe.
+	 */
+	bool uretprobe;
 };
 
-FIXTURE_VARIANT_ADD(URETPROBE, attached) {
-	.attach = true,
-};
-
-FIXTURE_VARIANT_ADD(URETPROBE, not_attached) {
+FIXTURE_VARIANT_ADD(UPROBE, not_attached) {
 	.attach = false,
+	.uretprobe = false,
 };
 
-FIXTURE_SETUP(URETPROBE)
+FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) {
+	.attach = true,
+	.uretprobe = false,
+};
+
+FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) {
+	.attach = true,
+	.uretprobe = true,
+};
+
+FIXTURE_SETUP(UPROBE)
 {
 	const size_t attr_sz = sizeof(struct perf_event_attr);
 	struct perf_event_attr attr;
 	ssize_t offset;
 	int type, bit;
 
-#ifndef __NR_uretprobe
-	SKIP(return, "__NR_uretprobe syscall not defined");
+#if !defined(__NR_uprobe) || !defined(__NR_uretprobe)
+	SKIP(return, "__NR_uprobe or __NR_uretprobe syscalls not defined");
 #endif
 
 	if (!variant->attach)
@@ -5118,12 +5166,17 @@ FIXTURE_SETUP(URETPROBE)
 
 	type = determine_uprobe_perf_type();
 	ASSERT_GE(type, 0);
-	bit = determine_uprobe_retprobe_bit();
-	ASSERT_GE(bit, 0);
-	offset = get_uprobe_offset(probed);
+
+	if (variant->uretprobe) {
+		bit = determine_uprobe_retprobe_bit();
+		ASSERT_GE(bit, 0);
+	}
+
+	offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe);
 	ASSERT_GE(offset, 0);
 
-	attr.config |= 1 << bit;
+	if (variant->uretprobe)
+		attr.config |= 1 << bit;
 	attr.size = attr_sz;
 	attr.type = type;
 	attr.config1 = ptr_to_u64("/proc/self/exe");
@@ -5134,7 +5187,7 @@ FIXTURE_SETUP(URETPROBE)
 			   PERF_FLAG_FD_CLOEXEC);
 }
 
-FIXTURE_TEARDOWN(URETPROBE)
+FIXTURE_TEARDOWN(UPROBE)
 {
 	/* we could call close(self->fd), but we'd need extra filter for
 	 * that and since we are calling _exit right away..
@@ -5148,11 +5201,17 @@ static int run_probed_with_filter(struct sock_fprog *prog)
 		return -1;
 	}
 
-	probed();
+	/*
+	 * Uprobe is optimized after first hit, so let's hit twice.
+	 */
+	probed_uprobe();
+	probed_uprobe();
+
+	probed_uretprobe();
 	return 0;
 }
 
-TEST_F(URETPROBE, uretprobe_default_allow)
+TEST_F(UPROBE, uprobe_default_allow)
 {
 	struct sock_filter filter[] = {
 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
@@ -5165,7 +5224,7 @@ TEST_F(URETPROBE, uretprobe_default_allow)
 	ASSERT_EQ(0, run_probed_with_filter(&prog));
 }
 
-TEST_F(URETPROBE, uretprobe_default_block)
+TEST_F(UPROBE, uprobe_default_block)
 {
 	struct sock_filter filter[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
@@ -5182,11 +5241,14 @@ TEST_F(URETPROBE, uretprobe_default_block)
 	ASSERT_EQ(0, run_probed_with_filter(&prog));
 }
 
-TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
+TEST_F(UPROBE, uprobe_block_syscall)
 {
 	struct sock_filter filter[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
+#ifdef __NR_uprobe
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2),
+#endif
 #ifdef __NR_uretprobe
 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1),
 #endif
@@ -5201,11 +5263,14 @@ TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
 	ASSERT_EQ(0, run_probed_with_filter(&prog));
 }
 
-TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
+TEST_F(UPROBE, uprobe_default_block_with_syscall)
 {
 	struct sock_filter filter[] = {
 		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
+#ifdef __NR_uprobe
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0),
+#endif
 #ifdef __NR_uretprobe
 		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0),
 #endif