Performance events updates for v6.18:

Core perf code updates:
 
  - Convert mmap() related reference counts to refcount_t. This
    is in reaction to the recently fixed refcount bugs, which
    refcount_t could have detected earlier and mitigated
    somewhat. (Thomas Gleixner, Peter Zijlstra)
 
  - Clean up and simplify the callchain code, in preparation
    for sframes. (Steven Rostedt, Josh Poimboeuf)
 
 Uprobes updates:
 
  - Add support to optimize usdt probes on x86-64, which
    gives a substantial speedup. (Jiri Olsa)
 
  - Cleanups and fixes on x86 (Peter Zijlstra)
 
 PMU driver updates:
 
  - Various optimizations and fixes to the Intel PMU driver
    (Dapeng Mi)
 
 Misc cleanups and fixes:
 
  - Remove redundant __GFP_NOWARN (Qianfeng Rong)
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmjWpGIRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1iHvxAAvO8qWbbhUdF3EZaFU0Wx6oh5KBhImU49
 VZ107xe9llA0Szy3hIl1YpdOQA2NAHtma6We/ebonrPVTTkcSCGq8absc+GahA3I
 CHIomx2hjD0OQ01aHvTqgHJUdFUQQ0yzE3+FY6Tsn05JsNZvDmqpAMIoMQT0LuuG
 7VvVRLBuDXtuMtNmGaGCvfDGKTZkGGxD6iZS1iWHuixvVAz4IECK0vYqSyh31UGA
 w9Jwa0thwjKm2EZTmcSKaHSM2zw3N8QXJ3SNPPThuMrtO6QDz2+3Da9kO+vhGcRP
 Jls9KnWC2wxNxqIs3dr80Mzn4qMplc67Ekx2tUqX4tYEGGtJQxW6tm3JOKKIgFMI
 g/KF9/WJPXp0rVI9mtoQkgndzyswR/ZJBAwfEQu+nAqlp3gmmQR9+MeYPCyNnyhB
 2g22PTMbXkihJmRPAVeH+WhwFy1YY3nsRhh61ha3/N0ULXTHUh0E+hWwUVMifYSV
 SwXqQx4srlo6RJJNTji1d6R3muNjXCQNEsJ0lCOX6ajVoxWZsPH2x7/W1A8LKmY+
 FLYQUi6X9ogQbOO3WxCjUhzp5nMTNA2vvo87MUzDlZOCLPqYZmqcjntHuXwdjPyO
 lPcfTzc2nK1Ud26bG3+p2Bk3fjqkX9XcTMFniOvjKfffEfwpAq4xRPBQ3uRlzn0V
 pf9067JYF+c=
 =sVXH
 -----END PGP SIGNATURE-----

Merge tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull performance events updates from Ingo Molnar:
 "Core perf code updates:

   - Convert mmap() related reference counts to refcount_t. This is in
     reaction to the recently fixed refcount bugs, which refcount_t could
     have detected earlier and mitigated somewhat (Thomas Gleixner,
     Peter Zijlstra)

   - Clean up and simplify the callchain code, in preparation for
     sframes (Steven Rostedt, Josh Poimboeuf)

  Uprobes updates:

   - Add support to optimize usdt probes on x86-64, which gives a
     substantial speedup (Jiri Olsa)

   - Cleanups and fixes on x86 (Peter Zijlstra)

  PMU driver updates:

   - Various optimizations and fixes to the Intel PMU driver (Dapeng Mi)

  Misc cleanups and fixes:

   - Remove redundant __GFP_NOWARN (Qianfeng Rong)"
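
Illustration only (not code from this series, names are made up): the
refcount_t conversion above matters because refcount_t saturates and WARNs
on overflow/underflow instead of silently wrapping the way atomic_t does.
A minimal sketch of the pattern:

	#include <linux/refcount.h>

	struct example_buf {
		refcount_t	users;		/* was: atomic_t users; */
	};

	static void example_buf_get(struct example_buf *b)
	{
		/* WARNs (and saturates) if the count was already zero */
		refcount_inc(&b->users);
	}

	static bool example_buf_put(struct example_buf *b)
	{
		/* WARNs instead of wrapping below zero */
		return refcount_dec_and_test(&b->users);
	}

An imbalanced get/put then shows up as a warning at the buggy call site
rather than as a delayed use-after-free, which is how the mmap_count bugs
mentioned above could have been caught earlier.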

* tag 'perf-core-2025-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (57 commits)
  selftests/bpf: Fix uprobe_sigill test for uprobe syscall error value
  uprobes/x86: Return error from uprobe syscall when not called from trampoline
  perf: Skip user unwind if the task is a kernel thread
  perf: Simplify get_perf_callchain() user logic
  perf: Use current->flags & PF_KTHREAD|PF_USER_WORKER instead of current->mm == NULL
  perf: Have get_perf_callchain() return NULL if crosstask and user are set
  perf: Remove get_perf_callchain() init_nr argument
  perf/x86: Print PMU counters bitmap in x86_pmu_show_pmu_cap()
  perf/x86/intel: Add ICL_FIXED_0_ADAPTIVE bit into INTEL_FIXED_BITS_MASK
  perf/x86/intel: Change macro GLOBAL_CTRL_EN_PERF_METRICS to BIT_ULL(48)
  perf/x86: Add PERF_CAP_PEBS_TIMING_INFO flag
  perf/x86/intel: Fix IA32_PMC_x_CFG_B MSRs access error
  perf/x86/intel: Use early_initcall() to hook bts_init()
  uprobes: Remove redundant __GFP_NOWARN
  selftests/seccomp: validate uprobe syscall passes through seccomp
  seccomp: passthrough uprobe systemcall without filtering
  selftests/bpf: Fix uprobe syscall shadow stack test
  selftests/bpf: Change test_uretprobe_regs_change for uprobe and uretprobe
  selftests/bpf: Add uprobe_regs_equal test
  selftests/bpf: Add optimized usdt variant for basic usdt test
  ...
Linus Torvalds 2025-09-30 11:11:21 -07:00
commit e4dcbdff11
32 changed files with 2262 additions and 416 deletions
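
For context on the uprobe changes in the diff below, here is a minimal,
hypothetical USDT probe site (not from this series; it assumes a toolchain
providing <sys/sdt.h>, e.g. systemtap-sdt-dev). A USDT probe compiles to a
nop at the marked location plus an ELF note describing it:

	/* usdt-demo.c: build with gcc -O2 usdt-demo.c */
	#include <sys/sdt.h>

	static long work(long i)
	{
		/* Emits a nop and records the probe + argument in an ELF note;
		 * perf/bpf can attach a uprobe here without recompiling. */
		DTRACE_PROBE1(demo, loop_iter, i);
		return i * 2;
	}

	int main(void)
	{
		for (long i = 0; i < 1000000; i++)
			work(i);
		return 0;
	}

The x86-64 optimization only applies when the probe site is a 5-byte
nop/nopl (see can_optimize() in the x86 uprobes diff); such a site gets
rewritten from an int3 breakpoint into a call to a per-process
[uprobes-trampoline] page that enters the kernel via the new uprobe
syscall, which is where the "substantial speedup" quoted above comes from.
Sites that are only a 1-byte nop keep using the int3 path.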


@ -30,7 +30,7 @@ int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
__opcode_to_mem_arm(auprobe->bpinsn));
__opcode_to_mem_arm(auprobe->bpinsn), true);
}
bool arch_uprobe_ignore(struct arch_uprobe *auprobe, struct pt_regs *regs)


@ -345,6 +345,7 @@
333 common io_pgetevents sys_io_pgetevents
334 common rseq sys_rseq
335 common uretprobe sys_uretprobe
336 common uprobe sys_uprobe
# don't use numbers 387 through 423, add new calls after the last
# 'common' entry
424 common pidfd_send_signal sys_pidfd_send_signal


@ -2069,13 +2069,15 @@ static void _x86_pmu_read(struct perf_event *event)
void x86_pmu_show_pmu_cap(struct pmu *pmu)
{
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic registers: %d\n", x86_pmu_num_counters(pmu));
pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
pr_info("... fixed-purpose events: %d\n", x86_pmu_num_counters_fixed(pmu));
pr_info("... event mask: %016Lx\n", hybrid(pmu, intel_ctrl));
pr_info("... version: %d\n", x86_pmu.version);
pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
pr_info("... generic counters: %d\n", x86_pmu_num_counters(pmu));
pr_info("... generic bitmap: %016llx\n", hybrid(pmu, cntr_mask64));
pr_info("... fixed-purpose counters: %d\n", x86_pmu_num_counters_fixed(pmu));
pr_info("... fixed-purpose bitmap: %016llx\n", hybrid(pmu, fixed_cntr_mask64));
pr_info("... value mask: %016llx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016llx\n", x86_pmu.max_period);
pr_info("... global_ctrl mask: %016llx\n", hybrid(pmu, intel_ctrl));
}
static int __init init_hw_perf_events(void)


@ -643,4 +643,4 @@ static __init int bts_init(void)
return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
arch_initcall(bts_init);
early_initcall(bts_init);


@ -2845,8 +2845,8 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
{
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct hw_perf_event *hwc = &event->hw;
u64 mask, bits = 0;
int idx = hwc->idx;
u64 bits = 0;
if (is_topdown_idx(idx)) {
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@ -2885,14 +2885,10 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
idx -= INTEL_PMC_IDX_FIXED;
bits = intel_fixed_bits_by_idx(idx, bits);
mask = intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) {
if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip)
bits |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
mask |= intel_fixed_bits_by_idx(idx, ICL_FIXED_0_ADAPTIVE);
}
cpuc->fixed_ctrl_val &= ~mask;
cpuc->fixed_ctrl_val &= ~intel_fixed_bits_by_idx(idx, INTEL_FIXED_BITS_MASK);
cpuc->fixed_ctrl_val |= bits;
}
@ -2997,7 +2993,8 @@ static void intel_pmu_acr_late_setup(struct cpu_hw_events *cpuc)
if (event->group_leader != leader->group_leader)
break;
for_each_set_bit(idx, (unsigned long *)&event->attr.config2, X86_PMC_IDX_MAX) {
if (WARN_ON_ONCE(i + idx > cpuc->n_events))
if (i + idx >= cpuc->n_events ||
!is_acr_event_group(cpuc->event_list[i + idx]))
return;
__set_bit(cpuc->assign[i + idx], (unsigned long *)&event->hw.config1);
}
@ -5318,9 +5315,9 @@ static void intel_pmu_check_hybrid_pmus(struct x86_hybrid_pmu *pmu)
0, x86_pmu_num_counters(&pmu->pmu), 0, 0);
if (pmu->intel_cap.perf_metrics)
pmu->intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
pmu->intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
else
pmu->intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
intel_pmu_check_event_constraints(pmu->event_constraints,
pmu->cntr_mask64,
@ -5455,7 +5452,7 @@ static void intel_pmu_cpu_starting(int cpu)
rdmsrq(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
if (!perf_cap.perf_metrics) {
x86_pmu.intel_cap.perf_metrics = 0;
x86_pmu.intel_ctrl &= ~(1ULL << GLOBAL_CTRL_EN_PERF_METRICS);
x86_pmu.intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS;
}
}
@ -7789,7 +7786,7 @@ __init int intel_pmu_init(void)
}
if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics)
x86_pmu.intel_ctrl |= 1ULL << GLOBAL_CTRL_EN_PERF_METRICS;
x86_pmu.intel_ctrl |= GLOBAL_CTRL_EN_PERF_METRICS;
if (x86_pmu.intel_cap.pebs_timing_info)
x86_pmu.flags |= PMU_FL_RETIRE_LATENCY;


@ -315,12 +315,14 @@
#define PERF_CAP_PT_IDX 16
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
#define PERF_CAP_ARCH_REG BIT_ULL(7)
#define PERF_CAP_PEBS_FORMAT 0xf00
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
#define PERF_CAP_ARCH_REG BIT_ULL(7)
#define PERF_CAP_PEBS_FORMAT 0xf00
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17)
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
PERF_CAP_PEBS_TIMING_INFO)
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)


@ -35,7 +35,6 @@
#define ARCH_PERFMON_EVENTSEL_EQ (1ULL << 36)
#define ARCH_PERFMON_EVENTSEL_UMASK2 (0xFFULL << 40)
#define INTEL_FIXED_BITS_MASK 0xFULL
#define INTEL_FIXED_BITS_STRIDE 4
#define INTEL_FIXED_0_KERNEL (1ULL << 0)
#define INTEL_FIXED_0_USER (1ULL << 1)
@ -48,6 +47,11 @@
#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34)
#define ICL_FIXED_0_ADAPTIVE (1ULL << 32)
#define INTEL_FIXED_BITS_MASK \
(INTEL_FIXED_0_KERNEL | INTEL_FIXED_0_USER | \
INTEL_FIXED_0_ANYTHREAD | INTEL_FIXED_0_ENABLE_PMI | \
ICL_FIXED_0_ADAPTIVE)
#define intel_fixed_bits_by_idx(_idx, _bits) \
((_bits) << ((_idx) * INTEL_FIXED_BITS_STRIDE))
@ -430,7 +434,7 @@ static inline bool is_topdown_idx(int idx)
#define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(GLOBAL_STATUS_TRACE_TOPAPMI_BIT)
#define GLOBAL_STATUS_PERF_METRICS_OVF_BIT 48
#define GLOBAL_CTRL_EN_PERF_METRICS 48
#define GLOBAL_CTRL_EN_PERF_METRICS BIT_ULL(48)
/*
* We model guest LBR event tracing as another fixed-mode PMC like BTS.
*


@ -23,6 +23,8 @@ int setup_signal_shadow_stack(struct ksignal *ksig);
int restore_signal_shadow_stack(void);
int shstk_update_last_frame(unsigned long val);
bool shstk_is_enabled(void);
int shstk_pop(u64 *val);
int shstk_push(u64 val);
#else
static inline long shstk_prctl(struct task_struct *task, int option,
unsigned long arg2) { return -EINVAL; }
@ -35,6 +37,8 @@ static inline int setup_signal_shadow_stack(struct ksignal *ksig) { return 0; }
static inline int restore_signal_shadow_stack(void) { return 0; }
static inline int shstk_update_last_frame(unsigned long val) { return 0; }
static inline bool shstk_is_enabled(void) { return false; }
static inline int shstk_pop(u64 *val) { return -ENOTSUPP; }
static inline int shstk_push(u64 val) { return -ENOTSUPP; }
#endif /* CONFIG_X86_USER_SHADOW_STACK */
#endif /* __ASSEMBLER__ */


@ -20,6 +20,11 @@ typedef u8 uprobe_opcode_t;
#define UPROBE_SWBP_INSN 0xcc
#define UPROBE_SWBP_INSN_SIZE 1
enum {
ARCH_UPROBE_FLAG_CAN_OPTIMIZE = 0,
ARCH_UPROBE_FLAG_OPTIMIZE_FAIL = 1,
};
struct uprobe_xol_ops;
struct arch_uprobe {
@ -45,6 +50,8 @@ struct arch_uprobe {
u8 ilen;
} push;
};
unsigned long flags;
};
struct arch_uprobe_task {


@ -246,6 +246,46 @@ static unsigned long get_user_shstk_addr(void)
return ssp;
}
int shstk_pop(u64 *val)
{
int ret = 0;
u64 ssp;
if (!features_enabled(ARCH_SHSTK_SHSTK))
return -ENOTSUPP;
fpregs_lock_and_load();
rdmsrq(MSR_IA32_PL3_SSP, ssp);
if (val && get_user(*val, (__user u64 *)ssp))
ret = -EFAULT;
else
wrmsrq(MSR_IA32_PL3_SSP, ssp + SS_FRAME_SIZE);
fpregs_unlock();
return ret;
}
int shstk_push(u64 val)
{
u64 ssp;
int ret;
if (!features_enabled(ARCH_SHSTK_SHSTK))
return -ENOTSUPP;
fpregs_lock_and_load();
rdmsrq(MSR_IA32_PL3_SSP, ssp);
ssp -= SS_FRAME_SIZE;
ret = write_user_shstk_64((__user void *)ssp, val);
if (!ret)
wrmsrq(MSR_IA32_PL3_SSP, ssp);
fpregs_unlock();
return ret;
}
#define SHSTK_DATA_BIT BIT(63)
static int put_shstk_data(u64 __user *addr, u64 data)


@ -18,6 +18,7 @@
#include <asm/processor.h>
#include <asm/insn.h>
#include <asm/mmu_context.h>
#include <asm/nops.h>
/* Post-execution fixups. */
@ -310,25 +311,32 @@ static int uprobe_init_insn(struct arch_uprobe *auprobe, struct insn *insn, bool
#ifdef CONFIG_X86_64
struct uretprobe_syscall_args {
unsigned long r11;
unsigned long cx;
unsigned long ax;
};
asm (
".pushsection .rodata\n"
".global uretprobe_trampoline_entry\n"
"uretprobe_trampoline_entry:\n"
"pushq %rax\n"
"pushq %rcx\n"
"pushq %r11\n"
"movq $" __stringify(__NR_uretprobe) ", %rax\n"
"push %rax\n"
"push %rcx\n"
"push %r11\n"
"mov $" __stringify(__NR_uretprobe) ", %rax\n"
"syscall\n"
".global uretprobe_syscall_check\n"
"uretprobe_syscall_check:\n"
"popq %r11\n"
"popq %rcx\n"
/* The uretprobe syscall replaces stored %rax value with final
"pop %r11\n"
"pop %rcx\n"
/*
* The uretprobe syscall replaces stored %rax value with final
* return address, so we don't restore %rax in here and just
* call ret.
*/
"retq\n"
"ret\n"
"int3\n"
".global uretprobe_trampoline_end\n"
"uretprobe_trampoline_end:\n"
".popsection\n"
@ -338,7 +346,7 @@ extern u8 uretprobe_trampoline_entry[];
extern u8 uretprobe_trampoline_end[];
extern u8 uretprobe_syscall_check[];
void *arch_uprobe_trampoline(unsigned long *psize)
void *arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
struct pt_regs *regs = task_pt_regs(current);
@ -365,7 +373,8 @@ static unsigned long trampoline_check_ip(unsigned long tramp)
SYSCALL_DEFINE0(uretprobe)
{
struct pt_regs *regs = task_pt_regs(current);
unsigned long err, ip, sp, r11_cx_ax[3], tramp;
struct uretprobe_syscall_args args;
unsigned long err, ip, sp, tramp;
/* If there's no trampoline, we are called from wrong place. */
tramp = uprobe_get_trampoline_vaddr();
@ -376,15 +385,15 @@ SYSCALL_DEFINE0(uretprobe)
if (unlikely(regs->ip != trampoline_check_ip(tramp)))
goto sigill;
err = copy_from_user(r11_cx_ax, (void __user *)regs->sp, sizeof(r11_cx_ax));
err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
if (err)
goto sigill;
/* expose the "right" values of r11/cx/ax/sp to uprobe_consumer/s */
regs->r11 = r11_cx_ax[0];
regs->cx = r11_cx_ax[1];
regs->ax = r11_cx_ax[2];
regs->sp += sizeof(r11_cx_ax);
regs->r11 = args.r11;
regs->cx = args.cx;
regs->ax = args.ax;
regs->sp += sizeof(args);
regs->orig_ax = -1;
ip = regs->ip;
@ -400,21 +409,21 @@ SYSCALL_DEFINE0(uretprobe)
*/
if (regs->sp != sp || shstk_is_enabled())
return regs->ax;
regs->sp -= sizeof(r11_cx_ax);
regs->sp -= sizeof(args);
/* for the case uprobe_consumer has changed r11/cx */
r11_cx_ax[0] = regs->r11;
r11_cx_ax[1] = regs->cx;
args.r11 = regs->r11;
args.cx = regs->cx;
/*
* ax register is passed through as return value, so we can use
* its space on stack for ip value and jump to it through the
* trampoline's ret instruction
*/
r11_cx_ax[2] = regs->ip;
args.ax = regs->ip;
regs->ip = ip;
err = copy_to_user((void __user *)regs->sp, r11_cx_ax, sizeof(r11_cx_ax));
err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
if (err)
goto sigill;
@ -608,6 +617,581 @@ static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
*sr = utask->autask.saved_scratch_register;
}
}
static int tramp_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma)
{
return -EPERM;
}
static struct page *tramp_mapping_pages[2] __ro_after_init;
static struct vm_special_mapping tramp_mapping = {
.name = "[uprobes-trampoline]",
.mremap = tramp_mremap,
.pages = tramp_mapping_pages,
};
struct uprobe_trampoline {
struct hlist_node node;
unsigned long vaddr;
};
static bool is_reachable_by_call(unsigned long vtramp, unsigned long vaddr)
{
long delta = (long)(vaddr + 5 - vtramp);
return delta >= INT_MIN && delta <= INT_MAX;
}
static unsigned long find_nearest_trampoline(unsigned long vaddr)
{
struct vm_unmapped_area_info info = {
.length = PAGE_SIZE,
.align_mask = ~PAGE_MASK,
};
unsigned long low_limit, high_limit;
unsigned long low_tramp, high_tramp;
unsigned long call_end = vaddr + 5;
if (check_add_overflow(call_end, INT_MIN, &low_limit))
low_limit = PAGE_SIZE;
high_limit = call_end + INT_MAX;
/* Search up from the caller address. */
info.low_limit = call_end;
info.high_limit = min(high_limit, TASK_SIZE);
high_tramp = vm_unmapped_area(&info);
/* Search down from the caller address. */
info.low_limit = max(low_limit, PAGE_SIZE);
info.high_limit = call_end;
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
low_tramp = vm_unmapped_area(&info);
if (IS_ERR_VALUE(high_tramp) && IS_ERR_VALUE(low_tramp))
return -ENOMEM;
if (IS_ERR_VALUE(high_tramp))
return low_tramp;
if (IS_ERR_VALUE(low_tramp))
return high_tramp;
/* Return address that's closest to the caller address. */
if (call_end - low_tramp < high_tramp - call_end)
return low_tramp;
return high_tramp;
}
static struct uprobe_trampoline *create_uprobe_trampoline(unsigned long vaddr)
{
struct pt_regs *regs = task_pt_regs(current);
struct mm_struct *mm = current->mm;
struct uprobe_trampoline *tramp;
struct vm_area_struct *vma;
if (!user_64bit_mode(regs))
return NULL;
vaddr = find_nearest_trampoline(vaddr);
if (IS_ERR_VALUE(vaddr))
return NULL;
tramp = kzalloc(sizeof(*tramp), GFP_KERNEL);
if (unlikely(!tramp))
return NULL;
tramp->vaddr = vaddr;
vma = _install_special_mapping(mm, tramp->vaddr, PAGE_SIZE,
VM_READ|VM_EXEC|VM_MAYEXEC|VM_MAYREAD|VM_DONTCOPY|VM_IO,
&tramp_mapping);
if (IS_ERR(vma)) {
kfree(tramp);
return NULL;
}
return tramp;
}
static struct uprobe_trampoline *get_uprobe_trampoline(unsigned long vaddr, bool *new)
{
struct uprobes_state *state = &current->mm->uprobes_state;
struct uprobe_trampoline *tramp = NULL;
if (vaddr > TASK_SIZE || vaddr < PAGE_SIZE)
return NULL;
hlist_for_each_entry(tramp, &state->head_tramps, node) {
if (is_reachable_by_call(tramp->vaddr, vaddr)) {
*new = false;
return tramp;
}
}
tramp = create_uprobe_trampoline(vaddr);
if (!tramp)
return NULL;
*new = true;
hlist_add_head(&tramp->node, &state->head_tramps);
return tramp;
}
static void destroy_uprobe_trampoline(struct uprobe_trampoline *tramp)
{
/*
* We do not unmap and release uprobe trampoline page itself,
* because there's no easy way to make sure none of the threads
* is still inside the trampoline.
*/
hlist_del(&tramp->node);
kfree(tramp);
}
void arch_uprobe_init_state(struct mm_struct *mm)
{
INIT_HLIST_HEAD(&mm->uprobes_state.head_tramps);
}
void arch_uprobe_clear_state(struct mm_struct *mm)
{
struct uprobes_state *state = &mm->uprobes_state;
struct uprobe_trampoline *tramp;
struct hlist_node *n;
hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node)
destroy_uprobe_trampoline(tramp);
}
static bool __in_uprobe_trampoline(unsigned long ip)
{
struct vm_area_struct *vma = vma_lookup(current->mm, ip);
return vma && vma_is_special_mapping(vma, &tramp_mapping);
}
static bool in_uprobe_trampoline(unsigned long ip)
{
struct mm_struct *mm = current->mm;
bool found, retry = true;
unsigned int seq;
rcu_read_lock();
if (mmap_lock_speculate_try_begin(mm, &seq)) {
found = __in_uprobe_trampoline(ip);
retry = mmap_lock_speculate_retry(mm, seq);
}
rcu_read_unlock();
if (retry) {
mmap_read_lock(mm);
found = __in_uprobe_trampoline(ip);
mmap_read_unlock(mm);
}
return found;
}
/*
* See uprobe syscall trampoline; the call to the trampoline will push
* the return address on the stack, the trampoline itself then pushes
* cx, r11 and ax.
*/
struct uprobe_syscall_args {
unsigned long ax;
unsigned long r11;
unsigned long cx;
unsigned long retaddr;
};
SYSCALL_DEFINE0(uprobe)
{
struct pt_regs *regs = task_pt_regs(current);
struct uprobe_syscall_args args;
unsigned long ip, sp, sret;
int err;
/* Allow execution only from uprobe trampolines. */
if (!in_uprobe_trampoline(regs->ip))
return -ENXIO;
err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args));
if (err)
goto sigill;
ip = regs->ip;
/*
* expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus:
* - adjust ip to the probe address, call saved next instruction address
* - adjust sp to the probe's stack frame (check trampoline code)
*/
regs->ax = args.ax;
regs->r11 = args.r11;
regs->cx = args.cx;
regs->ip = args.retaddr - 5;
regs->sp += sizeof(args);
regs->orig_ax = -1;
sp = regs->sp;
err = shstk_pop((u64 *)&sret);
if (err == -EFAULT || (!err && sret != args.retaddr))
goto sigill;
handle_syscall_uprobe(regs, regs->ip);
/*
* Some of the uprobe consumers has changed sp, we can do nothing,
* just return via iret.
*/
if (regs->sp != sp) {
/* skip the trampoline call */
if (args.retaddr - 5 == regs->ip)
regs->ip += 5;
return regs->ax;
}
regs->sp -= sizeof(args);
/* for the case uprobe_consumer has changed ax/r11/cx */
args.ax = regs->ax;
args.r11 = regs->r11;
args.cx = regs->cx;
/* keep return address unless we are instructed otherwise */
if (args.retaddr - 5 != regs->ip)
args.retaddr = regs->ip;
if (shstk_push(args.retaddr) == -EFAULT)
goto sigill;
regs->ip = ip;
err = copy_to_user((void __user *)regs->sp, &args, sizeof(args));
if (err)
goto sigill;
/* ensure sysret, see do_syscall_64() */
regs->r11 = regs->flags;
regs->cx = regs->ip;
return 0;
sigill:
force_sig(SIGILL);
return -1;
}
asm (
".pushsection .rodata\n"
".balign " __stringify(PAGE_SIZE) "\n"
"uprobe_trampoline_entry:\n"
"push %rcx\n"
"push %r11\n"
"push %rax\n"
"mov $" __stringify(__NR_uprobe) ", %rax\n"
"syscall\n"
"pop %rax\n"
"pop %r11\n"
"pop %rcx\n"
"ret\n"
"int3\n"
".balign " __stringify(PAGE_SIZE) "\n"
".popsection\n"
);
extern u8 uprobe_trampoline_entry[];
static int __init arch_uprobes_init(void)
{
tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry);
return 0;
}
late_initcall(arch_uprobes_init);
enum {
EXPECT_SWBP,
EXPECT_CALL,
};
struct write_opcode_ctx {
unsigned long base;
int expect;
};
static int is_call_insn(uprobe_opcode_t *insn)
{
return *insn == CALL_INSN_OPCODE;
}
/*
* Verification callback used by int3_update uprobe_write calls to make sure
* the underlying instruction is as expected - either int3 or call.
*/
static int verify_insn(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode,
int nbytes, void *data)
{
struct write_opcode_ctx *ctx = data;
uprobe_opcode_t old_opcode[5];
uprobe_copy_from_page(page, ctx->base, (uprobe_opcode_t *) &old_opcode, 5);
switch (ctx->expect) {
case EXPECT_SWBP:
if (is_swbp_insn(&old_opcode[0]))
return 1;
break;
case EXPECT_CALL:
if (is_call_insn(&old_opcode[0]))
return 1;
break;
}
return -1;
}
/*
* Modify multi-byte instructions by using INT3 breakpoints on SMP.
* We completely avoid using stop_machine() here, and achieve the
* synchronization using INT3 breakpoints and SMP cross-calls.
* (borrowed comment from smp_text_poke_batch_finish)
*
* The way it is done:
* - Add an INT3 trap to the address that will be patched
* - SMP sync all CPUs
* - Update all but the first byte of the patched range
* - SMP sync all CPUs
* - Replace the first byte (INT3) by the first byte of the replacing opcode
* - SMP sync all CPUs
*/
static int int3_update(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr, char *insn, bool optimize)
{
uprobe_opcode_t int3 = UPROBE_SWBP_INSN;
struct write_opcode_ctx ctx = {
.base = vaddr,
};
int err;
/*
* Write int3 trap.
*
* The swbp_optimize path comes with breakpoint already installed,
* so we can skip this step for optimize == true.
*/
if (!optimize) {
ctx.expect = EXPECT_CALL;
err = uprobe_write(auprobe, vma, vaddr, &int3, 1, verify_insn,
true /* is_register */, false /* do_update_ref_ctr */,
&ctx);
if (err)
return err;
}
smp_text_poke_sync_each_cpu();
/* Write all but the first byte of the patched range. */
ctx.expect = EXPECT_SWBP;
err = uprobe_write(auprobe, vma, vaddr + 1, insn + 1, 4, verify_insn,
true /* is_register */, false /* do_update_ref_ctr */,
&ctx);
if (err)
return err;
smp_text_poke_sync_each_cpu();
/*
* Write first byte.
*
* The swbp_unoptimize needs to finish uprobe removal together
* with ref_ctr update, using uprobe_write with proper flags.
*/
err = uprobe_write(auprobe, vma, vaddr, insn, 1, verify_insn,
optimize /* is_register */, !optimize /* do_update_ref_ctr */,
&ctx);
if (err)
return err;
smp_text_poke_sync_each_cpu();
return 0;
}
static int swbp_optimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr, unsigned long tramp)
{
u8 call[5];
__text_gen_insn(call, CALL_INSN_OPCODE, (const void *) vaddr,
(const void *) tramp, CALL_INSN_SIZE);
return int3_update(auprobe, vma, vaddr, call, true /* optimize */);
}
static int swbp_unoptimize(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
return int3_update(auprobe, vma, vaddr, auprobe->insn, false /* optimize */);
}
static int copy_from_vaddr(struct mm_struct *mm, unsigned long vaddr, void *dst, int len)
{
unsigned int gup_flags = FOLL_FORCE|FOLL_SPLIT_PMD;
struct vm_area_struct *vma;
struct page *page;
page = get_user_page_vma_remote(mm, vaddr, gup_flags, &vma);
if (IS_ERR(page))
return PTR_ERR(page);
uprobe_copy_from_page(page, vaddr, dst, len);
put_page(page);
return 0;
}
static bool __is_optimized(uprobe_opcode_t *insn, unsigned long vaddr)
{
struct __packed __arch_relative_insn {
u8 op;
s32 raddr;
} *call = (struct __arch_relative_insn *) insn;
if (!is_call_insn(insn))
return false;
return __in_uprobe_trampoline(vaddr + 5 + call->raddr);
}
static int is_optimized(struct mm_struct *mm, unsigned long vaddr)
{
uprobe_opcode_t insn[5];
int err;
err = copy_from_vaddr(mm, vaddr, &insn, 5);
if (err)
return err;
return __is_optimized((uprobe_opcode_t *)&insn, vaddr);
}
static bool should_optimize(struct arch_uprobe *auprobe)
{
return !test_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags) &&
test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
}
int set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
if (should_optimize(auprobe)) {
/*
* We could race with another thread that already optimized the probe,
* so let's not overwrite it with int3 again in this case.
*/
int ret = is_optimized(vma->vm_mm, vaddr);
if (ret < 0)
return ret;
if (ret)
return 0;
}
return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN,
true /* is_register */);
}
int set_orig_insn(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
if (test_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags)) {
int ret = is_optimized(vma->vm_mm, vaddr);
if (ret < 0)
return ret;
if (ret) {
ret = swbp_unoptimize(auprobe, vma, vaddr);
WARN_ON_ONCE(ret);
return ret;
}
}
return uprobe_write_opcode(auprobe, vma, vaddr, *(uprobe_opcode_t *)&auprobe->insn,
false /* is_register */);
}
static int __arch_uprobe_optimize(struct arch_uprobe *auprobe, struct mm_struct *mm,
unsigned long vaddr)
{
struct uprobe_trampoline *tramp;
struct vm_area_struct *vma;
bool new = false;
int err = 0;
vma = find_vma(mm, vaddr);
if (!vma)
return -EINVAL;
tramp = get_uprobe_trampoline(vaddr, &new);
if (!tramp)
return -EINVAL;
err = swbp_optimize(auprobe, vma, vaddr, tramp->vaddr);
if (WARN_ON_ONCE(err) && new)
destroy_uprobe_trampoline(tramp);
return err;
}
void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
{
struct mm_struct *mm = current->mm;
uprobe_opcode_t insn[5];
if (!should_optimize(auprobe))
return;
mmap_write_lock(mm);
/*
* Check if some other thread already optimized the uprobe for us,
* if it's the case just go away silently.
*/
if (copy_from_vaddr(mm, vaddr, &insn, 5))
goto unlock;
if (!is_swbp_insn((uprobe_opcode_t*) &insn))
goto unlock;
/*
* If we fail to optimize the uprobe we set the fail bit so the
* above should_optimize will fail from now on.
*/
if (__arch_uprobe_optimize(auprobe, mm, vaddr))
set_bit(ARCH_UPROBE_FLAG_OPTIMIZE_FAIL, &auprobe->flags);
unlock:
mmap_write_unlock(mm);
}
static bool insn_is_nop(struct insn *insn)
{
return insn->opcode.nbytes == 1 && insn->opcode.bytes[0] == 0x90;
}
static bool insn_is_nopl(struct insn *insn)
{
if (insn->opcode.nbytes != 2)
return false;
if (insn->opcode.bytes[0] != 0x0f || insn->opcode.bytes[1] != 0x1f)
return false;
if (!insn->modrm.nbytes)
return false;
if (X86_MODRM_REG(insn->modrm.bytes[0]) != 0)
return false;
/* 0f 1f /0 - NOPL */
return true;
}
static bool can_optimize(struct insn *insn, unsigned long vaddr)
{
if (!insn->x86_64 || insn->length != 5)
return false;
if (!insn_is_nop(insn) && !insn_is_nopl(insn))
return false;
/* We can't do cross page atomic writes yet. */
return PAGE_SIZE - (vaddr & ~PAGE_MASK) >= 5;
}
#else /* 32-bit: */
/*
* No RIP-relative addressing on 32-bit
@ -621,6 +1205,10 @@ static void riprel_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
static void riprel_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)
{
}
static bool can_optimize(struct insn *insn, unsigned long vaddr)
{
return false;
}
#endif /* CONFIG_X86_64 */
struct uprobe_xol_ops {
@ -979,14 +1567,17 @@ static int push_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn)
*/
int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long addr)
{
struct insn insn;
u8 fix_ip_or_call = UPROBE_FIX_IP;
struct insn insn;
int ret;
ret = uprobe_init_insn(auprobe, &insn, is_64bit_mm(mm));
if (ret)
return ret;
if (can_optimize(&insn, addr))
set_bit(ARCH_UPROBE_FLAG_CAN_OPTIMIZE, &auprobe->flags);
ret = branch_setup_xol_ops(auprobe, &insn);
if (ret != -ENOSYS)
return ret;


@ -13,7 +13,7 @@
#define MSR_IA32_MISC_ENABLE_PMU_RO_MASK (MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | \
MSR_IA32_MISC_ENABLE_BTS_UNAVAIL)
/* retrieve the 4 bits for EN and PMI out of IA32_FIXED_CTR_CTRL */
/* retrieve a fixed counter bits out of IA32_FIXED_CTR_CTRL */
#define fixed_ctrl_field(ctrl_reg, idx) \
(((ctrl_reg) >> ((idx) * INTEL_FIXED_BITS_STRIDE)) & INTEL_FIXED_BITS_MASK)


@ -859,7 +859,7 @@ struct perf_event {
/* mmap bits */
struct mutex mmap_mutex;
atomic_t mmap_count;
refcount_t mmap_count;
struct perf_buffer *rb;
struct list_head rb_entry;
@ -1719,7 +1719,7 @@ DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
extern void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs);
extern struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark);
extern int get_callchain_buffers(int max_stack);
extern void put_callchain_buffers(void);


@ -1005,6 +1005,8 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
asmlinkage long sys_uretprobe(void);
asmlinkage long sys_uprobe(void);
/* pciconfig: alpha, arm, arm64, ia64, sparc */
asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
unsigned long off, unsigned long len,


@ -17,6 +17,7 @@
#include <linux/wait.h>
#include <linux/timer.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
struct uprobe;
struct vm_area_struct;
@ -185,8 +186,14 @@ struct xol_area;
struct uprobes_state {
struct xol_area *xol_area;
#ifdef CONFIG_X86_64
struct hlist_head head_tramps;
#endif
};
typedef int (*uprobe_write_verify_t)(struct page *page, unsigned long vaddr,
uprobe_opcode_t *insn, int nbytes, void *data);
extern void __init uprobes_init(void);
extern int set_swbp(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
extern int set_orig_insn(struct arch_uprobe *aup, struct vm_area_struct *vma, unsigned long vaddr);
@ -194,7 +201,11 @@ extern bool is_swbp_insn(uprobe_opcode_t *insn);
extern bool is_trap_insn(uprobe_opcode_t *insn);
extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs);
extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs);
extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t);
extern int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma, unsigned long vaddr, uprobe_opcode_t,
bool is_register);
extern int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma, const unsigned long opcode_vaddr,
uprobe_opcode_t *insn, int nbytes, uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
void *data);
extern struct uprobe *uprobe_register(struct inode *inode, loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc);
extern int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool);
extern void uprobe_unregister_nosync(struct uprobe *uprobe, struct uprobe_consumer *uc);
@ -224,8 +235,13 @@ extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
void *src, unsigned long len);
extern void uprobe_handle_trampoline(struct pt_regs *regs);
extern void *arch_uprobe_trampoline(unsigned long *psize);
extern void *arch_uretprobe_trampoline(unsigned long *psize);
extern unsigned long uprobe_get_trampoline_vaddr(void);
extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len);
extern void arch_uprobe_clear_state(struct mm_struct *mm);
extern void arch_uprobe_init_state(struct mm_struct *mm);
extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr);
extern void arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr);
#else /* !CONFIG_UPROBES */
struct uprobes_state {
};


@ -314,7 +314,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
if (max_depth > sysctl_perf_event_max_stack)
max_depth = sysctl_perf_event_max_stack;
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
trace = get_perf_callchain(regs, kernel, user, max_depth,
false, false);
if (unlikely(!trace))
@ -451,7 +451,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
else if (kernel && task)
trace = get_callchain_entry_for_task(task, max_depth);
else
trace = get_perf_callchain(regs, 0, kernel, user, max_depth,
trace = get_perf_callchain(regs, kernel, user, max_depth,
crosstask, false);
if (unlikely(!trace) || trace->nr < skip) {


@ -217,22 +217,26 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr
}
struct perf_callchain_entry *
get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
get_perf_callchain(struct pt_regs *regs, bool kernel, bool user,
u32 max_stack, bool crosstask, bool add_mark)
{
struct perf_callchain_entry *entry;
struct perf_callchain_entry_ctx ctx;
int rctx, start_entry_idx;
/* crosstask is not supported for user stacks */
if (crosstask && user && !kernel)
return NULL;
entry = get_callchain_entry(&rctx);
if (!entry)
return NULL;
ctx.entry = entry;
ctx.max_stack = max_stack;
ctx.nr = entry->nr = init_nr;
ctx.contexts = 0;
ctx.contexts_maxed = false;
ctx.entry = entry;
ctx.max_stack = max_stack;
ctx.nr = entry->nr = 0;
ctx.contexts = 0;
ctx.contexts_maxed = false;
if (kernel && !user_mode(regs)) {
if (add_mark)
@ -240,25 +244,19 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user,
perf_callchain_kernel(&ctx, regs);
}
if (user) {
if (user && !crosstask) {
if (!user_mode(regs)) {
if (current->mm)
regs = task_pt_regs(current);
else
regs = NULL;
}
if (regs) {
if (crosstask)
if (current->flags & (PF_KTHREAD | PF_USER_WORKER))
goto exit_put;
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
start_entry_idx = entry->nr;
perf_callchain_user(&ctx, regs);
fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
regs = task_pt_regs(current);
}
if (add_mark)
perf_callchain_store_context(&ctx, PERF_CONTEXT_USER);
start_entry_idx = entry->nr;
perf_callchain_user(&ctx, regs);
fixup_uretprobe_trampoline_entries(entry, start_entry_idx);
}
exit_put:


@ -3974,7 +3974,7 @@ static noinline int visit_groups_merge(struct perf_event_context *ctx,
*/
static inline bool event_update_userpage(struct perf_event *event)
{
if (likely(!atomic_read(&event->mmap_count)))
if (likely(!refcount_read(&event->mmap_count)))
return false;
perf_event_update_time(event);
@ -6710,11 +6710,11 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
mapped_f mapped = get_mapped(event, event_mapped);
atomic_inc(&event->mmap_count);
atomic_inc(&event->rb->mmap_count);
refcount_inc(&event->mmap_count);
refcount_inc(&event->rb->mmap_count);
if (vma->vm_pgoff)
atomic_inc(&event->rb->aux_mmap_count);
refcount_inc(&event->rb->aux_mmap_count);
if (mapped)
mapped(event, vma->vm_mm);
@ -6749,7 +6749,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
* to avoid complications.
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
refcount_dec_and_mutex_lock(&rb->aux_mmap_count, &rb->aux_mutex)) {
/*
* Stop all AUX events that are writing to this buffer,
* so that we can free its AUX pages and corresponding PMU
@ -6769,10 +6769,10 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&rb->aux_mutex);
}
if (atomic_dec_and_test(&rb->mmap_count))
if (refcount_dec_and_test(&rb->mmap_count))
detach_rest = true;
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
if (!refcount_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
ring_buffer_attach(event, NULL);
@ -6933,19 +6933,200 @@ static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
return err;
}
static bool perf_mmap_calc_limits(struct vm_area_struct *vma, long *user_extra, long *extra)
{
unsigned long user_locked, user_lock_limit, locked, lock_limit;
struct user_struct *user = current_user();
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/* Increase the limit linearly with more CPUs */
user_lock_limit *= num_online_cpus();
user_locked = atomic_long_read(&user->locked_vm);
/*
* sysctl_perf_event_mlock may have changed, so that
* user->locked_vm > user_lock_limit
*/
if (user_locked > user_lock_limit)
user_locked = user_lock_limit;
user_locked += *user_extra;
if (user_locked > user_lock_limit) {
/*
* charge locked_vm until it hits user_lock_limit;
* charge the rest from pinned_vm
*/
*extra = user_locked - user_lock_limit;
*user_extra -= *extra;
}
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + *extra;
return locked <= lock_limit || !perf_is_paranoid() || capable(CAP_IPC_LOCK);
}
static void perf_mmap_account(struct vm_area_struct *vma, long user_extra, long extra)
{
struct user_struct *user = current_user();
atomic_long_add(user_extra, &user->locked_vm);
atomic64_add(extra, &vma->vm_mm->pinned_vm);
}
static int perf_mmap_rb(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
long extra = 0, user_extra = nr_pages;
struct perf_buffer *rb;
int rb_flags = 0;
nr_pages -= 1;
/*
* If we have rb pages ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
*/
if (nr_pages != 0 && !is_power_of_2(nr_pages))
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
if (event->rb) {
if (data_page_nr(event->rb) != nr_pages)
return -EINVAL;
if (refcount_inc_not_zero(&event->rb->mmap_count)) {
/*
* Success -- managed to mmap() the same buffer
* multiple times.
*/
perf_mmap_account(vma, user_extra, extra);
refcount_inc(&event->mmap_count);
return 0;
}
/*
* Raced against perf_mmap_close()'s
* refcount_dec_and_mutex_lock() remove the
* event and continue as if !event->rb
*/
ring_buffer_attach(event, NULL);
}
if (!perf_mmap_calc_limits(vma, &user_extra, &extra))
return -EPERM;
if (vma->vm_flags & VM_WRITE)
rb_flags |= RING_BUFFER_WRITABLE;
rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, rb_flags);
if (!rb)
return -ENOMEM;
refcount_set(&rb->mmap_count, 1);
rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
perf_mmap_account(vma, user_extra, extra);
refcount_set(&event->mmap_count, 1);
return 0;
}
static int perf_mmap_aux(struct vm_area_struct *vma, struct perf_event *event,
unsigned long nr_pages)
{
long extra = 0, user_extra = nr_pages;
u64 aux_offset, aux_size;
struct perf_buffer *rb;
int ret, rb_flags = 0;
rb = event->rb;
if (!rb)
return -EINVAL;
guard(mutex)(&rb->aux_mutex);
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
* mapped, all subsequent mappings should have the same size
* and offset. Must be above the normal perf buffer.
*/
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
return -EINVAL;
if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
return -EINVAL;
/* already mapped with a different offset */
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
return -EINVAL;
if (aux_size != nr_pages * PAGE_SIZE)
return -EINVAL;
/* already mapped with a different size */
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
return -EINVAL;
if (!is_power_of_2(nr_pages))
return -EINVAL;
if (!refcount_inc_not_zero(&rb->mmap_count))
return -EINVAL;
if (rb_has_aux(rb)) {
refcount_inc(&rb->aux_mmap_count);
} else {
if (!perf_mmap_calc_limits(vma, &user_extra, &extra)) {
refcount_dec(&rb->mmap_count);
return -EPERM;
}
WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
rb_flags |= RING_BUFFER_WRITABLE;
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, rb_flags);
if (ret) {
refcount_dec(&rb->mmap_count);
return ret;
}
refcount_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
}
perf_mmap_account(vma, user_extra, extra);
refcount_inc(&event->mmap_count);
return 0;
}
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_event *event = file->private_data;
unsigned long user_locked, user_lock_limit;
struct user_struct *user = current_user();
struct mutex *aux_mutex = NULL;
struct perf_buffer *rb = NULL;
unsigned long locked, lock_limit;
unsigned long vma_size;
unsigned long nr_pages;
long user_extra = 0, extra = 0;
int ret, flags = 0;
unsigned long vma_size, nr_pages;
mapped_f mapped;
int ret;
/*
* Don't allow mmap() of inherited per-task counters. This would
@ -6971,192 +7152,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (vma_size != PAGE_SIZE * nr_pages)
return -EINVAL;
user_extra = nr_pages;
mutex_lock(&event->mmap_mutex);
ret = -EINVAL;
/*
* This relies on __pmu_detach_event() taking mmap_mutex after marking
* the event REVOKED. Either we observe the state, or __pmu_detach_event()
* will detach the rb created here.
*/
if (event->state <= PERF_EVENT_STATE_REVOKED) {
ret = -ENODEV;
goto unlock;
}
if (vma->vm_pgoff == 0) {
nr_pages -= 1;
scoped_guard (mutex, &event->mmap_mutex) {
/*
* If we have rb pages ensure they're a power-of-two number, so we
* can do bitmasks instead of modulo.
* This relies on __pmu_detach_event() taking mmap_mutex after marking
* the event REVOKED. Either we observe the state, or __pmu_detach_event()
* will detach the rb created here.
*/
if (nr_pages != 0 && !is_power_of_2(nr_pages))
goto unlock;
if (event->state <= PERF_EVENT_STATE_REVOKED)
return -ENODEV;
WARN_ON_ONCE(event->ctx->parent_ctx);
if (event->rb) {
if (data_page_nr(event->rb) != nr_pages)
goto unlock;
if (atomic_inc_not_zero(&event->rb->mmap_count)) {
/*
* Success -- managed to mmap() the same buffer
* multiple times.
*/
ret = 0;
/* We need the rb to map pages. */
rb = event->rb;
goto unlock;
}
/*
* Raced against perf_mmap_close()'s
* atomic_dec_and_mutex_lock() remove the
* event and continue as if !event->rb
*/
ring_buffer_attach(event, NULL);
}
} else {
/*
* AUX area mapping: if rb->aux_nr_pages != 0, it's already
* mapped, all subsequent mappings should have the same size
* and offset. Must be above the normal perf buffer.
*/
u64 aux_offset, aux_size;
rb = event->rb;
if (!rb)
goto aux_unlock;
aux_mutex = &rb->aux_mutex;
mutex_lock(aux_mutex);
aux_offset = READ_ONCE(rb->user_page->aux_offset);
aux_size = READ_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
goto aux_unlock;
if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
goto aux_unlock;
/* already mapped with a different offset */
if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
goto aux_unlock;
if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
goto aux_unlock;
/* already mapped with a different size */
if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
goto aux_unlock;
if (!is_power_of_2(nr_pages))
goto aux_unlock;
if (!atomic_inc_not_zero(&rb->mmap_count))
goto aux_unlock;
if (rb_has_aux(rb)) {
atomic_inc(&rb->aux_mmap_count);
ret = 0;
goto unlock;
}
if (vma->vm_pgoff == 0)
ret = perf_mmap_rb(vma, event, nr_pages);
else
ret = perf_mmap_aux(vma, event, nr_pages);
if (ret)
return ret;
}
user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
/*
* Increase the limit linearly with more CPUs:
*/
user_lock_limit *= num_online_cpus();
user_locked = atomic_long_read(&user->locked_vm);
/*
* sysctl_perf_event_mlock may have changed, so that
* user->locked_vm > user_lock_limit
*/
if (user_locked > user_lock_limit)
user_locked = user_lock_limit;
user_locked += user_extra;
if (user_locked > user_lock_limit) {
/*
* charge locked_vm until it hits user_lock_limit;
* charge the rest from pinned_vm
*/
extra = user_locked - user_lock_limit;
user_extra -= extra;
}
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
locked = atomic64_read(&vma->vm_mm->pinned_vm) + extra;
if ((locked > lock_limit) && perf_is_paranoid() &&
!capable(CAP_IPC_LOCK)) {
ret = -EPERM;
goto unlock;
}
WARN_ON(!rb && event->rb);
if (vma->vm_flags & VM_WRITE)
flags |= RING_BUFFER_WRITABLE;
if (!rb) {
rb = rb_alloc(nr_pages,
event->attr.watermark ? event->attr.wakeup_watermark : 0,
event->cpu, flags);
if (!rb) {
ret = -ENOMEM;
goto unlock;
}
atomic_set(&rb->mmap_count, 1);
rb->mmap_user = get_current_user();
rb->mmap_locked = extra;
ring_buffer_attach(event, rb);
perf_event_update_time(event);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
ret = 0;
} else {
ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
event->attr.aux_watermark, flags);
if (!ret) {
atomic_set(&rb->aux_mmap_count, 1);
rb->aux_mmap_locked = extra;
}
}
unlock:
if (!ret) {
atomic_long_add(user_extra, &user->locked_vm);
atomic64_add(extra, &vma->vm_mm->pinned_vm);
atomic_inc(&event->mmap_count);
} else if (rb) {
/* AUX allocation failed */
atomic_dec(&rb->mmap_count);
}
aux_unlock:
if (aux_mutex)
mutex_unlock(aux_mutex);
mutex_unlock(&event->mmap_mutex);
if (ret)
return ret;
/*
* Since pinned accounting is per vm we cannot allow fork() to copy our
* vma.
@ -7174,7 +7186,7 @@ aux_unlock:
* full cleanup in this case and therefore does not invoke
* vmops::close().
*/
ret = map_range(rb, vma);
ret = map_range(event->rb, vma);
if (ret)
perf_mmap_close(vma);
@ -7440,7 +7452,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
} else if (!(current->flags & PF_KTHREAD)) {
} else if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
perf_get_regs_user(regs_user, regs);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@ -8080,7 +8092,7 @@ static u64 perf_virt_to_phys(u64 virt)
* Try IRQ-safe get_user_page_fast_only first.
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER))) {
struct page *p;
pagefault_disable();
@ -8192,7 +8204,8 @@ struct perf_callchain_entry *
perf_callchain(struct perf_event *event, struct pt_regs *regs)
{
bool kernel = !event->attr.exclude_callchain_kernel;
bool user = !event->attr.exclude_callchain_user;
bool user = !event->attr.exclude_callchain_user &&
!(current->flags & (PF_KTHREAD | PF_USER_WORKER));
/* Disallow cross-task user callchains. */
bool crosstask = event->ctx->task && event->ctx->task != current;
const u32 max_stack = event->attr.sample_max_stack;
@ -8204,7 +8217,7 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs)
if (!kernel && !user)
return &__empty_callchain;
callchain = get_perf_callchain(regs, 0, kernel, user,
callchain = get_perf_callchain(regs, kernel, user,
max_stack, crosstask, true);
return callchain ?: &__empty_callchain;
}
@ -13249,7 +13262,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
mutex_lock_double(&event->mmap_mutex, &output_event->mmap_mutex);
set:
/* Can't redirect output if we've got an active mmap() */
if (atomic_read(&event->mmap_count))
if (refcount_read(&event->mmap_count))
goto unlock;
if (output_event) {
@ -13262,7 +13275,7 @@ set:
goto unlock;
/* did we race against perf_mmap_close() */
if (!atomic_read(&rb->mmap_count)) {
if (!refcount_read(&rb->mmap_count)) {
ring_buffer_put(rb);
goto unlock;
}


@ -35,7 +35,7 @@ struct perf_buffer {
spinlock_t event_lock;
struct list_head event_list;
atomic_t mmap_count;
refcount_t mmap_count;
unsigned long mmap_locked;
struct user_struct *mmap_user;
@ -47,7 +47,7 @@ struct perf_buffer {
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
atomic_t aux_mmap_count;
refcount_t aux_mmap_count;
unsigned long aux_mmap_locked;
void (*free_aux)(void *);
refcount_t aux_refcount;


@ -400,7 +400,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* the same order, see perf_mmap_close. Otherwise we end up freeing
* aux pages in this path, which is a bug, because in_atomic().
*/
if (!atomic_read(&rb->aux_mmap_count))
if (!refcount_read(&rb->aux_mmap_count))
goto err;
if (!refcount_inc_not_zero(&rb->aux_refcount))


@ -177,7 +177,7 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn)
return is_swbp_insn(insn);
}
static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
@ -191,7 +191,8 @@ static void copy_to_page(struct page *page, unsigned long vaddr, const void *src
kunmap_atomic(kaddr);
}
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode)
static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *insn,
int nbytes, void *data)
{
uprobe_opcode_t old_opcode;
bool is_swbp;
@ -205,10 +206,10 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* is a trap variant; uprobes always wins over any other (gdb)
* breakpoint.
*/
copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
uprobe_copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
if (is_swbp_insn(new_opcode)) {
if (is_swbp_insn(insn)) {
if (is_swbp) /* register: already installed? */
return 0;
} else {
@ -399,12 +400,12 @@ static bool orig_page_is_identical(struct vm_area_struct *vma,
return identical;
}
static int __uprobe_write_opcode(struct vm_area_struct *vma,
static int __uprobe_write(struct vm_area_struct *vma,
struct folio_walk *fw, struct folio *folio,
unsigned long opcode_vaddr, uprobe_opcode_t opcode)
unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
bool is_register)
{
const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
const bool is_register = !!is_swbp_insn(&opcode);
const unsigned long vaddr = insn_vaddr & PAGE_MASK;
bool pmd_mappable;
/* For now, we'll only handle PTE-mapped folios. */
@ -429,7 +430,7 @@ static int __uprobe_write_opcode(struct vm_area_struct *vma,
*/
flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
copy_to_page(fw->page, insn_vaddr, insn, nbytes);
/*
* When unregistering, we may only zap a PTE if uffd is disabled and
@ -482,23 +483,32 @@ remap:
* @opcode_vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @opcode_vaddr.
*
* Called with mm->mmap_lock held for read or write.
* Called with mm->mmap_lock held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
const unsigned long opcode_vaddr, uprobe_opcode_t opcode,
bool is_register)
{
const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
return uprobe_write(auprobe, vma, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE,
verify_opcode, is_register, true /* do_update_ref_ctr */, NULL);
}
int uprobe_write(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
const unsigned long insn_vaddr, uprobe_opcode_t *insn, int nbytes,
uprobe_write_verify_t verify, bool is_register, bool do_update_ref_ctr,
void *data)
{
const unsigned long vaddr = insn_vaddr & PAGE_MASK;
struct mm_struct *mm = vma->vm_mm;
struct uprobe *uprobe;
int ret, is_register, ref_ctr_updated = 0;
int ret, ref_ctr_updated = 0;
unsigned int gup_flags = FOLL_FORCE;
struct mmu_notifier_range range;
struct folio_walk fw;
struct folio *folio;
struct page *page;
is_register = is_swbp_insn(&opcode);
uprobe = container_of(auprobe, struct uprobe, arch);
if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
@ -509,7 +519,7 @@ int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
* page that we can safely modify. Use FOLL_WRITE to trigger a write
* fault if required. When unregistering, we might be lucky and the
* anon page is already gone. So defer write faults until really
* required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
* required. Use FOLL_SPLIT_PMD, because __uprobe_write()
* cannot deal with PMDs yet.
*/
if (is_register)
@ -521,14 +531,14 @@ retry:
goto out;
folio = page_folio(page);
ret = verify_opcode(page, opcode_vaddr, &opcode);
ret = verify(page, insn_vaddr, insn, nbytes, data);
if (ret <= 0) {
folio_put(folio);
goto out;
}
/* We are going to replace instruction, update ref_ctr. */
if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
if (do_update_ref_ctr && !ref_ctr_updated && uprobe->ref_ctr_offset) {
ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
if (ret) {
folio_put(folio);
@ -560,7 +570,7 @@ retry:
/* Walk the page tables again, to perform the actual update. */
if (folio_walk_start(&fw, vma, vaddr, 0)) {
if (fw.page == page)
ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
ret = __uprobe_write(vma, &fw, folio, insn_vaddr, insn, nbytes, is_register);
folio_walk_end(&fw, vma);
}
@ -580,7 +590,7 @@ retry:
out:
/* Revert back reference counter if instruction update failed. */
if (ret < 0 && ref_ctr_updated)
if (do_update_ref_ctr && ret < 0 && ref_ctr_updated)
update_ref_ctr(uprobe, mm, is_register ? -1 : 1);
/* try collapse pmd for compound page */
@ -602,7 +612,7 @@ out:
int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN, true);
}
/**
@ -618,7 +628,7 @@ int __weak set_orig_insn(struct arch_uprobe *auprobe,
struct vm_area_struct *vma, unsigned long vaddr)
{
return uprobe_write_opcode(auprobe, vma, vaddr,
*(uprobe_opcode_t *)&auprobe->insn);
*(uprobe_opcode_t *)&auprobe->insn, false);
}
/* uprobe should have guaranteed positive refcount */
@ -1051,7 +1061,7 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
if (IS_ERR(page))
return PTR_ERR(page);
copy_from_page(page, offset, insn, nbytes);
uprobe_copy_from_page(page, offset, insn, nbytes);
put_page(page);
return 0;
@ -1210,7 +1220,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
* reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN);
GFP_NOWAIT | __GFP_NOMEMALLOC);
if (prev)
prev->next = NULL;
}
@ -1397,7 +1407,7 @@ struct uprobe *uprobe_register(struct inode *inode,
return ERR_PTR(-EINVAL);
/*
* This ensures that copy_from_page(), copy_to_page() and
* This ensures that uprobe_copy_from_page(), copy_to_page() and
* __update_ref_ctr() can't cross page boundary.
*/
if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
@ -1463,7 +1473,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
struct vm_area_struct *vma;
int err = 0;
mmap_read_lock(mm);
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
unsigned long vaddr;
loff_t offset;
@ -1480,7 +1490,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
vaddr = offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, vma, vaddr);
}
mmap_read_unlock(mm);
mmap_write_unlock(mm);
return err;
}
@ -1726,7 +1736,7 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
return ret;
}
void * __weak arch_uprobe_trampoline(unsigned long *psize)
void * __weak arch_uretprobe_trampoline(unsigned long *psize)
{
static uprobe_opcode_t insn = UPROBE_SWBP_INSN;
@ -1758,7 +1768,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
init_waitqueue_head(&area->wq);
/* Reserve the 1st slot for get_trampoline_vaddr() */
set_bit(0, area->bitmap);
insns = arch_uprobe_trampoline(&insns_size);
insns = arch_uretprobe_trampoline(&insns_size);
arch_uprobe_copy_ixol(area->page, 0, insns, insns_size);
if (!xol_add_vma(mm, area))
@ -1792,6 +1802,14 @@ static struct xol_area *get_xol_area(void)
return area;
}
void __weak arch_uprobe_clear_state(struct mm_struct *mm)
{
}
void __weak arch_uprobe_init_state(struct mm_struct *mm)
{
}
/*
* uprobe_clear_state - Free the area allocated for slots.
*/
@ -1803,6 +1821,8 @@ void uprobe_clear_state(struct mm_struct *mm)
delayed_uprobe_remove(NULL, mm);
mutex_unlock(&delayed_uprobe_lock);
arch_uprobe_clear_state(mm);
if (!area)
return;
@ -2393,7 +2413,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
uprobe_copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
/* This needs to return true for any variant of the trap insn */
@ -2677,6 +2697,10 @@ bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check c
return true;
}
void __weak arch_uprobe_optimize(struct arch_uprobe *auprobe, unsigned long vaddr)
{
}
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@ -2741,6 +2765,9 @@ static void handle_swbp(struct pt_regs *regs)
handler_chain(uprobe, regs);
/* Try to optimize after first hit. */
arch_uprobe_optimize(&uprobe->arch, bp_vaddr);
if (arch_uprobe_skip_sstep(&uprobe->arch, regs))
goto out;
@ -2752,6 +2779,23 @@ out:
rcu_read_unlock_trace();
}
void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr)
{
struct uprobe *uprobe;
int is_swbp;
guard(rcu_tasks_trace)();
uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp);
if (!uprobe)
return;
if (!get_utask())
return;
if (arch_uprobe_ignore(&uprobe->arch, regs))
return;
handler_chain(uprobe, regs);
}
/*
* Perform required fix-ups and disable singlestep.
* Allow pending signals to take effect.


@ -1014,6 +1014,7 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
{
#ifdef CONFIG_UPROBES
mm->uprobes_state.xol_area = NULL;
arch_uprobe_init_state(mm);
#endif
}


@ -741,6 +741,26 @@ out:
}
#ifdef SECCOMP_ARCH_NATIVE
static bool seccomp_uprobe_exception(struct seccomp_data *sd)
{
#if defined __NR_uretprobe || defined __NR_uprobe
#ifdef SECCOMP_ARCH_COMPAT
if (sd->arch == SECCOMP_ARCH_NATIVE)
#endif
{
#ifdef __NR_uretprobe
if (sd->nr == __NR_uretprobe)
return true;
#endif
#ifdef __NR_uprobe
if (sd->nr == __NR_uprobe)
return true;
#endif
}
#endif
return false;
}
/**
* seccomp_is_const_allow - check if filter is constant allow with given data
* @fprog: The BPF programs
@ -758,13 +778,8 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
return false;
/* Our single exception to filtering. */
#ifdef __NR_uretprobe
#ifdef SECCOMP_ARCH_COMPAT
if (sd->arch == SECCOMP_ARCH_NATIVE)
#endif
if (sd->nr == __NR_uretprobe)
return true;
#endif
if (seccomp_uprobe_exception(sd))
return true;
for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
@ -1042,6 +1057,9 @@ static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
#ifdef __NR_uretprobe
__NR_uretprobe,
#endif
#ifdef __NR_uprobe
__NR_uprobe,
#endif
-1, /* negative terminated */
};


@ -392,3 +392,4 @@ COND_SYSCALL(setuid16);
COND_SYSCALL(rseq);
COND_SYSCALL(uretprobe);
COND_SYSCALL(uprobe);


@ -315,12 +315,14 @@
#define PERF_CAP_PT_IDX 16
#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6
#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
#define PERF_CAP_ARCH_REG BIT_ULL(7)
#define PERF_CAP_PEBS_FORMAT 0xf00
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE)
#define PERF_CAP_PEBS_TRAP BIT_ULL(6)
#define PERF_CAP_ARCH_REG BIT_ULL(7)
#define PERF_CAP_PEBS_FORMAT 0xf00
#define PERF_CAP_PEBS_BASELINE BIT_ULL(14)
#define PERF_CAP_PEBS_TIMING_INFO BIT_ULL(17)
#define PERF_CAP_PEBS_MASK (PERF_CAP_PEBS_TRAP | PERF_CAP_ARCH_REG | \
PERF_CAP_PEBS_FORMAT | PERF_CAP_PEBS_BASELINE | \
PERF_CAP_PEBS_TIMING_INFO)
#define MSR_IA32_RTIT_CTL 0x00000570
#define RTIT_CTL_TRACEEN BIT(0)


@ -8,22 +8,31 @@
#include <asm/ptrace.h>
#include <linux/compiler.h>
#include <linux/stringify.h>
#include <linux/kernel.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <sys/prctl.h>
#include <asm/prctl.h>
#include "uprobe_syscall.skel.h"
#include "uprobe_syscall_executed.skel.h"
#include "bpf/libbpf_internal.h"
__naked unsigned long uretprobe_regs_trigger(void)
#define USDT_NOP .byte 0x0f, 0x1f, 0x44, 0x00, 0x00
#include "usdt.h"
#pragma GCC diagnostic ignored "-Wattributes"
__attribute__((aligned(16)))
__nocf_check __weak __naked unsigned long uprobe_regs_trigger(void)
{
asm volatile (
".byte 0x0f, 0x1f, 0x44, 0x00, 0x00\n" /* nop5 */
"movq $0xdeadbeef, %rax\n"
"ret\n"
);
}
__naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
__naked void uprobe_regs(struct pt_regs *before, struct pt_regs *after)
{
asm volatile (
"movq %r15, 0(%rdi)\n"
@ -44,15 +53,17 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
"movq $0, 120(%rdi)\n" /* orig_rax */
"movq $0, 128(%rdi)\n" /* rip */
"movq $0, 136(%rdi)\n" /* cs */
"pushq %rax\n"
"pushf\n"
"pop %rax\n"
"movq %rax, 144(%rdi)\n" /* eflags */
"pop %rax\n"
"movq %rsp, 152(%rdi)\n" /* rsp */
"movq $0, 160(%rdi)\n" /* ss */
/* save 2nd argument */
"pushq %rsi\n"
"call uretprobe_regs_trigger\n"
"call uprobe_regs_trigger\n"
/* save return value and load 2nd argument pointer to rax */
"pushq %rax\n"
@ -92,25 +103,37 @@ __naked void uretprobe_regs(struct pt_regs *before, struct pt_regs *after)
);
}
static void test_uretprobe_regs_equal(void)
static void test_uprobe_regs_equal(bool retprobe)
{
LIBBPF_OPTS(bpf_uprobe_opts, opts,
.retprobe = retprobe,
);
struct uprobe_syscall *skel = NULL;
struct pt_regs before = {}, after = {};
unsigned long *pb = (unsigned long *) &before;
unsigned long *pa = (unsigned long *) &after;
unsigned long *pp;
unsigned long offset;
unsigned int i, cnt;
int err;
offset = get_uprobe_offset(&uprobe_regs_trigger);
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
return;
skel = uprobe_syscall__open_and_load();
if (!ASSERT_OK_PTR(skel, "uprobe_syscall__open_and_load"))
goto cleanup;
err = uprobe_syscall__attach(skel);
if (!ASSERT_OK(err, "uprobe_syscall__attach"))
skel->links.probe = bpf_program__attach_uprobe_opts(skel->progs.probe,
0, "/proc/self/exe", offset, &opts);
if (!ASSERT_OK_PTR(skel->links.probe, "bpf_program__attach_uprobe_opts"))
goto cleanup;
uretprobe_regs(&before, &after);
/* make sure uprobe gets optimized */
if (!retprobe)
uprobe_regs_trigger();
uprobe_regs(&before, &after);
pp = (unsigned long *) &skel->bss->regs;
cnt = sizeof(before)/sizeof(*pb);
@ -119,7 +142,7 @@ static void test_uretprobe_regs_equal(void)
unsigned int offset = i * sizeof(unsigned long);
/*
* Check register before and after uretprobe_regs_trigger call
* Check register before and after uprobe_regs_trigger call
* that triggers the uretprobe.
*/
switch (offset) {
@ -133,7 +156,7 @@ static void test_uretprobe_regs_equal(void)
/*
* Check register seen from bpf program and register after
* uretprobe_regs_trigger call
* uprobe_regs_trigger call (with rax exception, check below).
*/
switch (offset) {
/*
@ -146,6 +169,15 @@ static void test_uretprobe_regs_equal(void)
case offsetof(struct pt_regs, rsp):
case offsetof(struct pt_regs, ss):
break;
/*
* uprobe does not see return value in rax, it needs to see the
* original (before) rax value
*/
case offsetof(struct pt_regs, rax):
if (!retprobe) {
ASSERT_EQ(pp[i], pb[i], "uprobe rax prog-before value check");
break;
}
default:
if (!ASSERT_EQ(pp[i], pa[i], "register prog-after value check"))
fprintf(stdout, "failed register offset %u\n", offset);
@ -175,7 +207,7 @@ static int write_bpf_testmod_uprobe(unsigned long offset)
return ret != n ? (int) ret : 0;
}
static void test_uretprobe_regs_change(void)
static void test_regs_change(void)
{
struct pt_regs before = {}, after = {};
unsigned long *pb = (unsigned long *) &before;
@ -183,13 +215,16 @@ static void test_uretprobe_regs_change(void)
unsigned long cnt = sizeof(before)/sizeof(*pb);
unsigned int i, err, offset;
offset = get_uprobe_offset(uretprobe_regs_trigger);
offset = get_uprobe_offset(uprobe_regs_trigger);
err = write_bpf_testmod_uprobe(offset);
if (!ASSERT_OK(err, "register_uprobe"))
return;
uretprobe_regs(&before, &after);
/* make sure uprobe gets optimized */
uprobe_regs_trigger();
uprobe_regs(&before, &after);
err = write_bpf_testmod_uprobe(0);
if (!ASSERT_OK(err, "unregister_uprobe"))
@ -252,6 +287,7 @@ static void test_uretprobe_syscall_call(void)
);
struct uprobe_syscall_executed *skel;
int pid, status, err, go[2], c = 0;
struct bpf_link *link;
if (!ASSERT_OK(pipe(go), "pipe"))
return;
@ -277,11 +313,14 @@ static void test_uretprobe_syscall_call(void)
_exit(0);
}
skel->links.test = bpf_program__attach_uprobe_multi(skel->progs.test, pid,
"/proc/self/exe",
"uretprobe_syscall_call", &opts);
if (!ASSERT_OK_PTR(skel->links.test, "bpf_program__attach_uprobe_multi"))
skel->bss->pid = pid;
link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi,
pid, "/proc/self/exe",
"uretprobe_syscall_call", &opts);
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
goto cleanup;
skel->links.test_uretprobe_multi = link;
/* kick the child */
write(go[1], &c, 1);
@ -301,6 +340,256 @@ cleanup:
close(go[0]);
}
#define TRAMP "[uprobes-trampoline]"
__attribute__((aligned(16)))
__nocf_check __weak __naked void uprobe_test(void)
{
asm volatile (" \n"
".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n"
"ret \n"
);
}
__attribute__((aligned(16)))
__nocf_check __weak void usdt_test(void)
{
USDT(optimized_uprobe, usdt);
}
static int find_uprobes_trampoline(void *tramp_addr)
{
void *start, *end;
char line[128];
int ret = -1;
FILE *maps;
maps = fopen("/proc/self/maps", "r");
if (!maps) {
fprintf(stderr, "cannot open maps\n");
return -1;
}
while (fgets(line, sizeof(line), maps)) {
int m = -1;
/* We care only about private r-x mappings. */
if (sscanf(line, "%p-%p r-xp %*x %*x:%*x %*u %n", &start, &end, &m) != 2)
continue;
if (m < 0)
continue;
if (!strncmp(&line[m], TRAMP, sizeof(TRAMP)-1) && (start == tramp_addr)) {
ret = 0;
break;
}
}
fclose(maps);
return ret;
}
static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
static void *find_nop5(void *fn)
{
int i;
for (i = 0; i < 10; i++) {
if (!memcmp(nop5, fn + i, 5))
return fn + i;
}
return NULL;
}
typedef void (__attribute__((nocf_check)) *trigger_t)(void);
static void *check_attach(struct uprobe_syscall_executed *skel, trigger_t trigger,
void *addr, int executed)
{
struct __arch_relative_insn {
__u8 op;
__s32 raddr;
} __packed *call;
void *tramp = NULL;
/* Uprobe gets optimized after first trigger, so let's press twice. */
trigger();
trigger();
/* Make sure bpf program got executed.. */
ASSERT_EQ(skel->bss->executed, executed, "executed");
/* .. and check the trampoline is as expected. */
call = (struct __arch_relative_insn *) addr;
tramp = (void *) (call + 1) + call->raddr;
ASSERT_EQ(call->op, 0xe8, "call");
ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
return tramp;
}
static void check_detach(void *addr, void *tramp)
{
/* [uprobes-trampoline] stays after detach */
ASSERT_OK(find_uprobes_trampoline(tramp), "uprobes_trampoline");
ASSERT_OK(memcmp(addr, nop5, 5), "nop5");
}
static void check(struct uprobe_syscall_executed *skel, struct bpf_link *link,
trigger_t trigger, void *addr, int executed)
{
void *tramp;
tramp = check_attach(skel, trigger, addr, executed);
bpf_link__destroy(link);
check_detach(addr, tramp);
}
static void test_uprobe_legacy(void)
{
struct uprobe_syscall_executed *skel = NULL;
LIBBPF_OPTS(bpf_uprobe_opts, opts,
.retprobe = true,
);
struct bpf_link *link;
unsigned long offset;
offset = get_uprobe_offset(&uprobe_test);
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
goto cleanup;
/* uprobe */
skel = uprobe_syscall_executed__open_and_load();
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
return;
skel->bss->pid = getpid();
link = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
0, "/proc/self/exe", offset, NULL);
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
goto cleanup;
check(skel, link, uprobe_test, uprobe_test, 2);
/* uretprobe */
skel->bss->executed = 0;
link = bpf_program__attach_uprobe_opts(skel->progs.test_uretprobe,
0, "/proc/self/exe", offset, &opts);
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_opts"))
goto cleanup;
check(skel, link, uprobe_test, uprobe_test, 2);
cleanup:
uprobe_syscall_executed__destroy(skel);
}
static void test_uprobe_multi(void)
{
struct uprobe_syscall_executed *skel = NULL;
LIBBPF_OPTS(bpf_uprobe_multi_opts, opts);
struct bpf_link *link;
unsigned long offset;
offset = get_uprobe_offset(&uprobe_test);
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
goto cleanup;
opts.offsets = &offset;
opts.cnt = 1;
skel = uprobe_syscall_executed__open_and_load();
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
return;
skel->bss->pid = getpid();
/* uprobe.multi */
link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_multi,
0, "/proc/self/exe", NULL, &opts);
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
goto cleanup;
check(skel, link, uprobe_test, uprobe_test, 2);
/* uretprobe.multi */
skel->bss->executed = 0;
opts.retprobe = true;
link = bpf_program__attach_uprobe_multi(skel->progs.test_uretprobe_multi,
0, "/proc/self/exe", NULL, &opts);
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
goto cleanup;
check(skel, link, uprobe_test, uprobe_test, 2);
cleanup:
uprobe_syscall_executed__destroy(skel);
}
static void test_uprobe_session(void)
{
struct uprobe_syscall_executed *skel = NULL;
LIBBPF_OPTS(bpf_uprobe_multi_opts, opts,
.session = true,
);
struct bpf_link *link;
unsigned long offset;
offset = get_uprobe_offset(&uprobe_test);
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
goto cleanup;
opts.offsets = &offset;
opts.cnt = 1;
skel = uprobe_syscall_executed__open_and_load();
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
return;
skel->bss->pid = getpid();
link = bpf_program__attach_uprobe_multi(skel->progs.test_uprobe_session,
0, "/proc/self/exe", NULL, &opts);
if (!ASSERT_OK_PTR(link, "bpf_program__attach_uprobe_multi"))
goto cleanup;
check(skel, link, uprobe_test, uprobe_test, 4);
cleanup:
uprobe_syscall_executed__destroy(skel);
}
static void test_uprobe_usdt(void)
{
struct uprobe_syscall_executed *skel;
struct bpf_link *link;
void *addr;
errno = 0;
addr = find_nop5(usdt_test);
if (!ASSERT_OK_PTR(addr, "find_nop5"))
return;
skel = uprobe_syscall_executed__open_and_load();
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
return;
skel->bss->pid = getpid();
link = bpf_program__attach_usdt(skel->progs.test_usdt,
-1 /* all PIDs */, "/proc/self/exe",
"optimized_uprobe", "usdt", NULL);
if (!ASSERT_OK_PTR(link, "bpf_program__attach_usdt"))
goto cleanup;
check(skel, link, usdt_test, addr, 2);
cleanup:
uprobe_syscall_executed__destroy(skel);
}
/*
* Borrowed from tools/testing/selftests/x86/test_shadow_stack.c.
*
@ -343,30 +632,166 @@ static void test_uretprobe_shadow_stack(void)
return;
}
/* Run all of the uretprobe tests. */
test_uretprobe_regs_equal();
test_uretprobe_regs_change();
/* Run all the tests with shadow stack in place. */
test_uprobe_regs_equal(false);
test_uprobe_regs_equal(true);
test_uretprobe_syscall_call();
test_uprobe_legacy();
test_uprobe_multi();
test_uprobe_session();
test_uprobe_usdt();
test_regs_change();
ARCH_PRCTL(ARCH_SHSTK_DISABLE, ARCH_SHSTK_SHSTK);
}
static volatile bool race_stop;
static USDT_DEFINE_SEMA(race);
static void *worker_trigger(void *arg)
{
unsigned long rounds = 0;
while (!race_stop) {
uprobe_test();
rounds++;
}
printf("tid %d trigger rounds: %lu\n", gettid(), rounds);
return NULL;
}
static void *worker_attach(void *arg)
{
LIBBPF_OPTS(bpf_uprobe_opts, opts);
struct uprobe_syscall_executed *skel;
unsigned long rounds = 0, offset;
const char *sema[2] = {
__stringify(USDT_SEMA(race)),
NULL,
};
unsigned long *ref;
int err;
offset = get_uprobe_offset(&uprobe_test);
if (!ASSERT_GE(offset, 0, "get_uprobe_offset"))
return NULL;
err = elf_resolve_syms_offsets("/proc/self/exe", 1, (const char **) &sema, &ref, STT_OBJECT);
if (!ASSERT_OK(err, "elf_resolve_syms_offsets_sema"))
return NULL;
opts.ref_ctr_offset = *ref;
skel = uprobe_syscall_executed__open_and_load();
if (!ASSERT_OK_PTR(skel, "uprobe_syscall_executed__open_and_load"))
return NULL;
skel->bss->pid = getpid();
while (!race_stop) {
skel->links.test_uprobe = bpf_program__attach_uprobe_opts(skel->progs.test_uprobe,
0, "/proc/self/exe", offset, &opts);
if (!ASSERT_OK_PTR(skel->links.test_uprobe, "bpf_program__attach_uprobe_opts"))
break;
bpf_link__destroy(skel->links.test_uprobe);
skel->links.test_uprobe = NULL;
rounds++;
}
printf("tid %d attach rounds: %lu hits: %d\n", gettid(), rounds, skel->bss->executed);
uprobe_syscall_executed__destroy(skel);
free(ref);
return NULL;
}
static useconds_t race_msec(void)
{
char *env;
env = getenv("BPF_SELFTESTS_UPROBE_SYSCALL_RACE_MSEC");
if (env)
return atoi(env);
/* default duration is 500ms */
return 500;
}
static void test_uprobe_race(void)
{
int err, i, nr_threads;
pthread_t *threads;
nr_threads = libbpf_num_possible_cpus();
if (!ASSERT_GT(nr_threads, 0, "libbpf_num_possible_cpus"))
return;
nr_threads = max(2, nr_threads);
threads = alloca(sizeof(*threads) * nr_threads);
if (!ASSERT_OK_PTR(threads, "malloc"))
return;
for (i = 0; i < nr_threads; i++) {
err = pthread_create(&threads[i], NULL, i % 2 ? worker_trigger : worker_attach,
NULL);
if (!ASSERT_OK(err, "pthread_create"))
goto cleanup;
}
usleep(race_msec() * 1000);
cleanup:
race_stop = true;
for (nr_threads = i, i = 0; i < nr_threads; i++)
pthread_join(threads[i], NULL);
ASSERT_FALSE(USDT_SEMA_IS_ACTIVE(race), "race_semaphore");
}
#ifndef __NR_uprobe
#define __NR_uprobe 336
#endif
static void test_uprobe_error(void)
{
long err = syscall(__NR_uprobe);
ASSERT_EQ(err, -1, "error");
ASSERT_EQ(errno, ENXIO, "errno");
}
static void __test_uprobe_syscall(void)
{
if (test__start_subtest("uretprobe_regs_equal"))
test_uprobe_regs_equal(true);
if (test__start_subtest("uretprobe_syscall_call"))
test_uretprobe_syscall_call();
if (test__start_subtest("uretprobe_shadow_stack"))
test_uretprobe_shadow_stack();
if (test__start_subtest("uprobe_legacy"))
test_uprobe_legacy();
if (test__start_subtest("uprobe_multi"))
test_uprobe_multi();
if (test__start_subtest("uprobe_session"))
test_uprobe_session();
if (test__start_subtest("uprobe_usdt"))
test_uprobe_usdt();
if (test__start_subtest("uprobe_race"))
test_uprobe_race();
if (test__start_subtest("uprobe_error"))
test_uprobe_error();
if (test__start_subtest("uprobe_regs_equal"))
test_uprobe_regs_equal(false);
if (test__start_subtest("regs_change"))
test_regs_change();
}
#else
static void test_uretprobe_regs_equal(void)
{
test__skip();
}
static void test_uretprobe_regs_change(void)
{
test__skip();
}
static void test_uretprobe_syscall_call(void)
{
test__skip();
}
static void test_uretprobe_shadow_stack(void)
static void __test_uprobe_syscall(void)
{
test__skip();
}
@ -374,12 +799,5 @@ static void test_uretprobe_shadow_stack(void)
void test_uprobe_syscall(void)
{
if (test__start_subtest("uretprobe_regs_equal"))
test_uretprobe_regs_equal();
if (test__start_subtest("uretprobe_regs_change"))
test_uretprobe_regs_change();
if (test__start_subtest("uretprobe_syscall_call"))
test_uretprobe_syscall_call();
if (test__start_subtest("uretprobe_shadow_stack"))
test_uretprobe_shadow_stack();
__test_uprobe_syscall();
}


@ -40,12 +40,19 @@ static void __always_inline trigger_func(int x) {
}
}
static void subtest_basic_usdt(void)
static void subtest_basic_usdt(bool optimized)
{
LIBBPF_OPTS(bpf_usdt_opts, opts);
struct test_usdt *skel;
struct test_usdt__bss *bss;
int err, i;
int err, i, called;
#define TRIGGER(x) ({ \
trigger_func(x); \
if (optimized) \
trigger_func(x); \
optimized ? 2 : 1; \
})
skel = test_usdt__open_and_load();
if (!ASSERT_OK_PTR(skel, "skel_open"))
@ -66,11 +73,11 @@ static void subtest_basic_usdt(void)
if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link"))
goto cleanup;
trigger_func(1);
called = TRIGGER(1);
ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called");
ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called");
ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called");
ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie");
ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt");
@ -119,11 +126,11 @@ static void subtest_basic_usdt(void)
* bpf_program__attach_usdt() handles this properly and attaches to
* all possible places of USDT invocation.
*/
trigger_func(2);
called += TRIGGER(2);
ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called");
ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called");
ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called");
ASSERT_EQ(bss->usdt0_called, called, "usdt0_called");
ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
ASSERT_EQ(bss->usdt12_called, called, "usdt12_called");
/* only check values that depend on trigger_func()'s input value */
ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1");
@ -142,9 +149,9 @@ static void subtest_basic_usdt(void)
if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach"))
goto cleanup;
trigger_func(3);
called += TRIGGER(3);
ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called");
ASSERT_EQ(bss->usdt3_called, called, "usdt3_called");
/* this time usdt3 has custom cookie */
ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie");
ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt");
@ -158,6 +165,7 @@ static void subtest_basic_usdt(void)
cleanup:
test_usdt__destroy(skel);
#undef TRIGGER
}
unsigned short test_usdt_100_semaphore SEC(".probes");
@ -425,7 +433,11 @@ cleanup:
void test_usdt(void)
{
if (test__start_subtest("basic"))
subtest_basic_usdt();
subtest_basic_usdt(false);
#ifdef __x86_64__
if (test__start_subtest("basic_optimized"))
subtest_basic_usdt(true);
#endif
if (test__start_subtest("multispec"))
subtest_multispec_usdt();
if (test__start_subtest("urand_auto_attach"))


@ -7,8 +7,8 @@ struct pt_regs regs;
char _license[] SEC("license") = "GPL";
SEC("uretprobe//proc/self/exe:uretprobe_regs_trigger")
int uretprobe(struct pt_regs *ctx)
SEC("uprobe")
int probe(struct pt_regs *ctx)
{
__builtin_memcpy(&regs, ctx, sizeof(regs));
return 0;


@ -1,6 +1,8 @@
// SPDX-License-Identifier: GPL-2.0
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/usdt.bpf.h>
#include <string.h>
struct pt_regs regs;
@ -8,10 +10,64 @@ struct pt_regs regs;
char _license[] SEC("license") = "GPL";
int executed = 0;
int pid;
SEC("uretprobe.multi")
int test(struct pt_regs *regs)
SEC("uprobe")
int BPF_UPROBE(test_uprobe)
{
executed = 1;
if (bpf_get_current_pid_tgid() >> 32 != pid)
return 0;
executed++;
return 0;
}
SEC("uretprobe")
int BPF_URETPROBE(test_uretprobe)
{
if (bpf_get_current_pid_tgid() >> 32 != pid)
return 0;
executed++;
return 0;
}
SEC("uprobe.multi")
int test_uprobe_multi(struct pt_regs *ctx)
{
if (bpf_get_current_pid_tgid() >> 32 != pid)
return 0;
executed++;
return 0;
}
SEC("uretprobe.multi")
int test_uretprobe_multi(struct pt_regs *ctx)
{
if (bpf_get_current_pid_tgid() >> 32 != pid)
return 0;
executed++;
return 0;
}
SEC("uprobe.session")
int test_uprobe_session(struct pt_regs *ctx)
{
if (bpf_get_current_pid_tgid() >> 32 != pid)
return 0;
executed++;
return 0;
}
SEC("usdt")
int test_usdt(struct pt_regs *ctx)
{
if (bpf_get_current_pid_tgid() >> 32 != pid)
return 0;
executed++;
return 0;
}


@ -500,15 +500,21 @@ static struct bin_attribute bin_attr_bpf_testmod_file __ro_after_init = {
*/
#ifdef __x86_64__
static int
uprobe_handler(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data)
{
regs->cx = 0x87654321feebdaed;
return 0;
}
static int
uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func,
struct pt_regs *regs, __u64 *data)
{
regs->ax = 0x12345678deadbeef;
regs->cx = 0x87654321feebdaed;
regs->r11 = (u64) -1;
return true;
return 0;
}
struct testmod_uprobe {
@ -520,6 +526,7 @@ struct testmod_uprobe {
static DEFINE_MUTEX(testmod_uprobe_mutex);
static struct testmod_uprobe uprobe = {
.consumer.handler = uprobe_handler,
.consumer.ret_handler = uprobe_ret_handler,
};


@ -0,0 +1,545 @@
// SPDX-License-Identifier: BSD-2-Clause
/*
* This single-header library defines a collection of variadic macros for
* defining and triggering USDTs (User Statically-Defined Tracepoints):
*
* - For USDTs without associated semaphore:
* USDT(group, name, args...)
*
* - For USDTs with implicit (transparent to the user) semaphore:
* USDT_WITH_SEMA(group, name, args...)
* USDT_IS_ACTIVE(group, name)
*
* - For USDTs with explicit (user-defined and provided) semaphore:
* USDT_WITH_EXPLICIT_SEMA(sema, group, name, args...)
* USDT_SEMA_IS_ACTIVE(sema)
*
* all of which emit a NOP instruction into the instruction stream, and so
* have *zero* overhead for the surrounding code. USDTs are identified by
* a combination of `group` and `name` identifiers, which is used by external
* tracing tooling (tracers) for identifying exact USDTs of interest.
*
* USDTs can have an associated (2-byte) activity counter (USDT semaphore),
* automatically maintained by Linux kernel whenever any correctly written
* BPF-based tracer is attached to the USDT. This USDT semaphore can be used
* to check whether there is a need to do any extra data collection and
* processing for a given USDT (if necessary), and otherwise avoid extra work
* for a common case of USDT not being traced ("active").
*
* See documentation for USDT_WITH_SEMA()/USDT_IS_ACTIVE() or
* USDT_WITH_EXPLICIT_SEMA()/USDT_SEMA_IS_ACTIVE() APIs below for details on
* working with USDTs with implicitly or explicitly associated
* USDT semaphores, respectively.
*
* There is also some additional data recorded into an auxiliary note
* section. The data in the note section describes the operands, in terms of
* size and location, used by tracing tooling to know where to find USDT
* arguments. Each location is encoded as an assembler operand string.
* Tracing tools (bpftrace and BPF-based tracers, systemtap, etc) insert
* breakpoints on top of the nop, and decode the location operand-strings,
* like an assembler, to find the values being passed.
*
* The operand strings are selected by the compiler for each operand.
* They are constrained by inline-assembler codes. The default is:
*
* #define USDT_ARG_CONSTRAINT nor
*
* This is a good default if the operands tend to be integral and
* moderate in number (smaller than number of registers). In other
* cases, the compiler may report "'asm' requires impossible reload" or
* similar. In this case, consider simplifying the macro call (fewer
* and simpler operands), reduce optimization, or override the default
* constraints string via:
*
* #define USDT_ARG_CONSTRAINT g
* #include <usdt.h>
*
* For some historical description of USDT v3 format (the one used by this
* library and generally recognized and assumed by BPF-based tracing tools)
* see [0]. The more formal specification can be found at [1]. Additional
* argument constraints information can be found at [2].
*
* Original SystemTap's sys/sdt.h implementation ([3]) was used as a base for
* this USDT library implementation. Current implementation differs *a lot* in
* terms of exposed user API and general usability, which was the main goal
* and focus of the reimplementation work. Nevertheless, underlying recorded
* USDT definitions are fully binary compatible and any USDT-based tooling
* should work equally well with USDTs defined by either SystemTap's or this
* library's USDT implementation.
*
* [0] https://ecos.sourceware.org/ml/systemtap/2010-q3/msg00145.html
* [1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
* [2] https://gcc.gnu.org/onlinedocs/gcc/Constraints.html
* [3] https://sourceware.org/git/?p=systemtap.git;a=blob;f=includes/sys/sdt.h
*/
#ifndef __USDT_H
#define __USDT_H
/*
* Changelog:
*
* 0.1.0
* -----
* - Initial release
*/
#define USDT_MAJOR_VERSION 0
#define USDT_MINOR_VERSION 1
#define USDT_PATCH_VERSION 0
/* C++20 and C23 added __VA_OPT__ as a standard replacement for non-standard `##__VA_ARGS__` extension */
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) || (defined(__cplusplus) && __cplusplus > 201703L)
#define __usdt_va_opt 1
#define __usdt_va_args(...) __VA_OPT__(,) __VA_ARGS__
#else
#define __usdt_va_args(...) , ##__VA_ARGS__
#endif
/*
* Trigger USDT with `group`:`name` identifier and pass through `args` as its
* arguments. Zero arguments are acceptable as well. No USDT semaphore is
* associated with this USDT.
*
* Such "semaphoreless" USDTs are commonly used when there is no extra data
* collection or processing needed to collect and prepare USDT arguments and
* they are just available in the surrounding code. USDT() macro will just
* record their locations in CPU registers or in memory for tracing tooling to
* be able to access them, if necessary.
*/
#ifdef __usdt_va_opt
#define USDT(group, name, ...) \
__usdt_probe(group, name, __usdt_sema_none, 0 __VA_OPT__(,) __VA_ARGS__)
#else
#define USDT(group, name, ...) \
__usdt_probe(group, name, __usdt_sema_none, 0, ##__VA_ARGS__)
#endif
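A minimal usage sketch of the semaphoreless form (the application function, group and name identifiers below are hypothetical and only illustrate the macro documented above):

#include "usdt.h"

static void handle_request(int req_id, const char *path)
{
	/* compiles to a single nop; the locations of req_id and path are
	 * recorded in the .note.stapsdt ELF note for tracers to decode
	 */
	USDT(myapp, request_start, req_id, path);

	/* ... actual request handling ... */
}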
/*
* Trigger USDT with `group`:`name` identifier and pass through `args` as its
* arguments. Zero arguments are acceptable as well. The USDT also gets an
* implicitly-defined associated USDT semaphore, which will be "activated" by
* tracing tooling and can be used to check whether USDT is being actively
* observed.
*
* USDTs with semaphore are commonly used when there is a need to perform
* additional data collection and processing to prepare USDT arguments, which
* otherwise might not be necessary for the rest of application logic. In such
* case, USDT semaphore can be used to avoid unnecessary extra work. If USDT
* is not traced (which is presumed to be a common situation), the associated
* USDT semaphore is "inactive", and so there is no need to waste resources to
* prepare USDT arguments. Use USDT_IS_ACTIVE(group, name) to check whether
* USDT is "active".
*
* N.B. There is an inherent (albeit short) gap between checking whether USDT
* is active and triggering corresponding USDT, in which external tracer can
* be attached to an USDT and activate USDT semaphore after the activity check.
* If such a race occurs, tracers might miss one USDT execution. Tracers are
* expected to accommodate such a possibility and this is expected to not be
* a problem for applications and tracers.
*
* N.B. Implicit USDT semaphore defined by USDT_WITH_SEMA() is contained
* within a single executable or shared library and is not shared outside
* them. I.e., if you use USDT_WITH_SEMA() with the same USDT group and name
* identifier across executable and shared library, it will work and won't
* conflict, per se, but will define independent USDT semaphores, one for each
* shared library/executable in which USDT_WITH_SEMA(group, name) is used.
* That is, if you attach to this USDT in one shared library (or executable),
* then only USDT semaphore within that shared library (or executable) will be
* updated by the kernel, while other libraries (or executable) will not see
* activated USDT semaphore. In short, it's best to use unique USDT group:name
* identifiers across different shared libraries (and, equivalently, between
* executable and shared library). This is advanced consideration and is
* rarely (if ever) seen in practice, but just to avoid surprises this is
* called out here. (Static libraries become part of the final executable once
* linked by the linker, so the above considerations don't apply to them.)
*/
#ifdef __usdt_va_opt
#define USDT_WITH_SEMA(group, name, ...) \
__usdt_probe(group, name, \
__usdt_sema_implicit, __usdt_sema_name(group, name) \
__VA_OPT__(,) __VA_ARGS__)
#else
#define USDT_WITH_SEMA(group, name, ...) \
__usdt_probe(group, name, \
__usdt_sema_implicit, __usdt_sema_name(group, name), \
##__VA_ARGS__)
#endif
struct usdt_sema { volatile unsigned short active; };
/*
* Check if USDT with `group`:`name` identifier is "active" (i.e., whether it
* is attached to by external tracing tooling and is actively observed).
*
* This macro can be used to decide whether any additional and potentially
* expensive data collection or processing should be done to pass extra
* information into the given USDT. It is assumed that USDT is triggered with
* USDT_WITH_SEMA() macro which will implicitly define associated USDT
* semaphore. (If one needs more control over USDT semaphore, see
* USDT_DEFINE_SEMA() and USDT_WITH_EXPLICIT_SEMA() macros below.)
*
* N.B. Such checks are necessarily racy and speculative. Between checking
* whether USDT is active and triggering the USDT itself, tracer can be
* detached with no notification. This race should be extremely rare and worst
* case should result in one-time wasted extra data collection and processing.
*/
#define USDT_IS_ACTIVE(group, name) ({ \
extern struct usdt_sema __usdt_sema_name(group, name) \
__usdt_asm_name(__usdt_sema_name(group, name)); \
__usdt_sema_implicit(__usdt_sema_name(group, name)); \
__usdt_sema_name(group, name).active > 0; \
})
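A minimal sketch of the implicit-semaphore pattern, pairing USDT_IS_ACTIVE() with USDT_WITH_SEMA() so the costly argument preparation only runs while a tracer is attached; struct query, its fields and the myapp group are hypothetical, and usdt.h is assumed to be included:

#include <stdio.h>

void report_query_done(const struct query *q)
{
	char summary[64];

	/* skip the formatting work in the common, untraced case */
	if (!USDT_IS_ACTIVE(myapp, query_done))
		return;

	snprintf(summary, sizeof(summary), "rows=%ld", q->nr_rows);
	USDT_WITH_SEMA(myapp, query_done, q->id, summary);
}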
/*
* APIs for working with user-defined explicit USDT semaphores.
*
* This is a less commonly used advanced API for use cases in which user needs
* an explicit control over (potentially shared across multiple USDTs) USDT
* semaphore instance. This can be used when there is a group of logically
* related USDTs that all need extra data collection and processing whenever
* any of a family of related USDTs are "activated" (i.e., traced). In such
* a case, all such related USDTs will be associated with the same shared USDT
* semaphore defined with USDT_DEFINE_SEMA() and the USDTs themselves will be
* triggered with USDT_WITH_EXPLICIT_SEMA() macros, taking an explicit extra
* USDT semaphore identifier as an extra parameter.
*/
/**
* Underlying C global variable name for user-defined USDT semaphore with
* `sema` identifier. Could be useful for debugging, but normally shouldn't be
* used explicitly.
*/
#define USDT_SEMA(sema) __usdt_sema_##sema
/*
* Define storage for user-defined USDT semaphore `sema`.
*
* Should be used only once in non-header source file to let compiler allocate
* space for the semaphore variable. Just like with any other global variable.
*
* This macro can be used anywhere where global variable declaration is
* allowed. Just like with global variable definitions, there should be only
* one definition of user-defined USDT semaphore with given `sema` identifier,
* otherwise compiler or linker will complain about duplicate variable
* definition.
*
* For C++, it is allowed to use USDT_DEFINE_SEMA() both in global namespace
* and inside namespaces (including nested namespaces). Just make sure that
* USDT_DECLARE_SEMA() is placed within the namespace where this semaphore is
* referenced, or any of its parent namespaces, so the C++ language-level
* identifier is visible to the code that needs to reference the semaphore.
* At the lowest layer, USDT semaphores have global naming and visibility
* (they have a corresponding `__usdt_sema_<name>` symbol, which can be linked
* against from C or C++ code, if necessary). To keep it simple, putting
* USDT_DECLARE_SEMA() declarations into global namespaces is the simplest
* no-brainer solution. All these aspects are irrelevant for plain C, because
* C doesn't have namespaces and everything is always in the global namespace.
*
* N.B. Due to USDT metadata being recorded in non-allocatable ELF note
* section, it has limitations when it comes to relocations, which, in
* practice, means that it's not possible to correctly share USDT semaphores
* between main executable and shared libraries, or even between multiple
* shared libraries. USDT semaphore has to be contained to individual shared
* library or executable to avoid unpleasant surprises with half-working USDT
* semaphores. We enforce this by marking semaphore ELF symbols as having
* a hidden visibility. This is quite an advanced use case and consideration
* and for most users this should have no consequences whatsoever.
*/
#define USDT_DEFINE_SEMA(sema) \
struct usdt_sema __usdt_sema_sec USDT_SEMA(sema) \
__usdt_asm_name(USDT_SEMA(sema)) \
__attribute__((visibility("hidden"))) = { 0 }
/*
* Declare extern reference to user-defined USDT semaphore `sema`.
*
* Refers to a variable defined in another compilation unit by
* USDT_DEFINE_SEMA() and allows to use the same USDT semaphore across
* multiple compilation units (i.e., .c and .cpp files).
*
* See USDT_DEFINE_SEMA() notes above for C++ language usage peculiarities.
*/
#define USDT_DECLARE_SEMA(sema) \
extern struct usdt_sema USDT_SEMA(sema) __usdt_asm_name(USDT_SEMA(sema))
/*
* Check if user-defined USDT semaphore `sema` is "active" (i.e., whether it
* is attached to by external tracing tooling and is actively observed).
*
* This macro can be used to decide whether any additional and potentially
* expensive data collection or processing should be done to pass extra
* information into USDT(s) associated with USDT semaphore `sema`.
*
* N.B. Such checks are necessarily racy. Between checking the state of USDT
* semaphore and triggering associated USDT(s), the active tracer might attach
* or detach. This race should be extremely rare and worst case should result
* in one-time missed USDT event or wasted extra data collection and
* processing. USDT-using tracers should be written with this in mind; it is
* not a concern of the application defining USDTs with an associated semaphore.
*/
#define USDT_SEMA_IS_ACTIVE(sema) (USDT_SEMA(sema).active > 0)
/*
* Invoke USDT specified by `group` and `name` identifiers and associate
* explicitly user-defined semaphore `sema` with it. Pass through `args` as
* USDT arguments. `args` are optional and zero arguments are acceptable.
*
* Semaphore is defined with the help of USDT_DEFINE_SEMA() macro and can be
* checked whether active with USDT_SEMA_IS_ACTIVE().
*/
#ifdef __usdt_va_opt
#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \
__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema) __VA_OPT__(,) __VA_ARGS__)
#else
#define USDT_WITH_EXPLICIT_SEMA(sema, group, name, ...) \
__usdt_probe(group, name, __usdt_sema_explicit, USDT_SEMA(sema), ##__VA_ARGS__)
#endif
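A minimal sketch of the explicit-semaphore flow, with one user-defined semaphore gating two related USDTs; all identifiers are hypothetical and usdt.h is assumed to be included:

/* defined once, in a single compilation unit */
USDT_DEFINE_SEMA(io_sema);

/* other .c files would reference it via USDT_DECLARE_SEMA(io_sema) */
static void trace_io(int fd, long nbytes, long lat_usec)
{
	/* one check covers both related probes */
	if (!USDT_SEMA_IS_ACTIVE(io_sema))
		return;

	USDT_WITH_EXPLICIT_SEMA(io_sema, myapp, io_bytes, fd, nbytes);
	USDT_WITH_EXPLICIT_SEMA(io_sema, myapp, io_latency, fd, lat_usec);
}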
/*
* Adjustable implementation aspects
*/
#ifndef USDT_ARG_CONSTRAINT
#if defined __powerpc__
#define USDT_ARG_CONSTRAINT nZr
#elif defined __arm__
#define USDT_ARG_CONSTRAINT g
#elif defined __loongarch__
#define USDT_ARG_CONSTRAINT nmr
#else
#define USDT_ARG_CONSTRAINT nor
#endif
#endif /* USDT_ARG_CONSTRAINT */
#ifndef USDT_NOP
#if defined(__ia64__) || defined(__s390__) || defined(__s390x__)
#define USDT_NOP nop 0
#else
#define USDT_NOP nop
#endif
#endif /* USDT_NOP */
/*
* Implementation details
*/
/* USDT name for implicitly-defined USDT semaphore, derived from group:name */
#define __usdt_sema_name(group, name) __usdt_sema_##group##__##name
/* ELF section into which USDT semaphores are put */
#define __usdt_sema_sec __attribute__((section(".probes")))
#define __usdt_concat(a, b) a ## b
#define __usdt_apply(fn, n) __usdt_concat(fn, n)
#ifndef __usdt_nth
#define __usdt_nth(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, N, ...) N
#endif
#ifndef __usdt_narg
#ifdef __usdt_va_opt
#define __usdt_narg(...) __usdt_nth(_ __VA_OPT__(,) __VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#else
#define __usdt_narg(...) __usdt_nth(_, ##__VA_ARGS__, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
#endif
#endif /* __usdt_narg */
#define __usdt_hash #
#define __usdt_str_(x) #x
#define __usdt_str(x) __usdt_str_(x)
#ifndef __usdt_asm_name
#define __usdt_asm_name(name) __asm__(__usdt_str(name))
#endif
#define __usdt_asm0() "\n"
#define __usdt_asm1(x) __usdt_str(x) "\n"
#define __usdt_asm2(x, ...) __usdt_str(x) "," __usdt_asm1(__VA_ARGS__)
#define __usdt_asm3(x, ...) __usdt_str(x) "," __usdt_asm2(__VA_ARGS__)
#define __usdt_asm4(x, ...) __usdt_str(x) "," __usdt_asm3(__VA_ARGS__)
#define __usdt_asm5(x, ...) __usdt_str(x) "," __usdt_asm4(__VA_ARGS__)
#define __usdt_asm6(x, ...) __usdt_str(x) "," __usdt_asm5(__VA_ARGS__)
#define __usdt_asm7(x, ...) __usdt_str(x) "," __usdt_asm6(__VA_ARGS__)
#define __usdt_asm8(x, ...) __usdt_str(x) "," __usdt_asm7(__VA_ARGS__)
#define __usdt_asm9(x, ...) __usdt_str(x) "," __usdt_asm8(__VA_ARGS__)
#define __usdt_asm10(x, ...) __usdt_str(x) "," __usdt_asm9(__VA_ARGS__)
#define __usdt_asm11(x, ...) __usdt_str(x) "," __usdt_asm10(__VA_ARGS__)
#define __usdt_asm12(x, ...) __usdt_str(x) "," __usdt_asm11(__VA_ARGS__)
#define __usdt_asm(...) __usdt_apply(__usdt_asm, __usdt_narg(__VA_ARGS__))(__VA_ARGS__)
#ifdef __LP64__
#define __usdt_asm_addr .8byte
#else
#define __usdt_asm_addr .4byte
#endif
#define __usdt_asm_strz_(x) __usdt_asm1(.asciz #x)
#define __usdt_asm_strz(x) __usdt_asm_strz_(x)
#define __usdt_asm_str_(x) __usdt_asm1(.ascii #x)
#define __usdt_asm_str(x) __usdt_asm_str_(x)
/* "semaphoreless" USDT case */
#ifndef __usdt_sema_none
#define __usdt_sema_none(sema)
#endif
/* implicitly defined __usdt_sema__group__name semaphore (using weak symbols) */
#ifndef __usdt_sema_implicit
#define __usdt_sema_implicit(sema) \
__asm__ __volatile__ ( \
__usdt_asm1(.ifndef sema) \
__usdt_asm3( .pushsection .probes, "aw", "progbits") \
__usdt_asm1( .weak sema) \
__usdt_asm1( .hidden sema) \
__usdt_asm1( .align 2) \
__usdt_asm1(sema:) \
__usdt_asm1( .zero 2) \
__usdt_asm2( .type sema, @object) \
__usdt_asm2( .size sema, 2) \
__usdt_asm1( .popsection) \
__usdt_asm1(.endif) \
);
#endif
/* externally defined semaphore using USDT_DEFINE_SEMA() and passed explicitly by user */
#ifndef __usdt_sema_explicit
#define __usdt_sema_explicit(sema) \
__asm__ __volatile__ ("" :: "m" (sema));
#endif
/* main USDT definition (nop and .note.stapsdt metadata) */
#define __usdt_probe(group, name, sema_def, sema, ...) do { \
sema_def(sema) \
__asm__ __volatile__ ( \
__usdt_asm( 990: USDT_NOP) \
__usdt_asm3( .pushsection .note.stapsdt, "", "note") \
__usdt_asm1( .balign 4) \
__usdt_asm3( .4byte 992f-991f,994f-993f,3) \
__usdt_asm1(991: .asciz "stapsdt") \
__usdt_asm1(992: .balign 4) \
__usdt_asm1(993: __usdt_asm_addr 990b) \
__usdt_asm1( __usdt_asm_addr _.stapsdt.base) \
__usdt_asm1( __usdt_asm_addr sema) \
__usdt_asm_strz(group) \
__usdt_asm_strz(name) \
__usdt_asm_args(__VA_ARGS__) \
__usdt_asm1( .ascii "\0") \
__usdt_asm1(994: .balign 4) \
__usdt_asm1( .popsection) \
__usdt_asm1(.ifndef _.stapsdt.base) \
__usdt_asm5( .pushsection .stapsdt.base,"aG","progbits",.stapsdt.base,comdat)\
__usdt_asm1( .weak _.stapsdt.base) \
__usdt_asm1( .hidden _.stapsdt.base) \
__usdt_asm1(_.stapsdt.base:) \
__usdt_asm1( .space 1) \
__usdt_asm2( .size _.stapsdt.base, 1) \
__usdt_asm1( .popsection) \
__usdt_asm1(.endif) \
:: __usdt_asm_ops(__VA_ARGS__) \
); \
} while (0)
/*
* NB: gdb PR24541 highlighted an unspecified corner of the sdt.h
* operand note format.
*
* The named register may be a longer or shorter (!) alias for the
* storage where the value in question is found. For example, on
* i386, 64-bit value may be put in register pairs, and a register
* name stored would identify just one of them. Previously, gcc was
* asked to emit the %w[id] (16-bit alias of some registers holding
* operands), even when a wider 32-bit value was used.
*
* Bottom line: the byte-width given before the @ sign governs. If
* there is a mismatch between that width and that of the named
* register, then a sys/sdt.h note consumer may need to employ
* architecture-specific heuristics to figure out where the compiler
* has actually put the complete value.
*/
#if defined(__powerpc__) || defined(__powerpc64__)
#define __usdt_argref(id) %I[id]%[id]
#elif defined(__i386__)
#define __usdt_argref(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */
#else
#define __usdt_argref(id) %[id]
#endif
#define __usdt_asm_arg(n) __usdt_asm_str(%c[__usdt_asz##n]) \
__usdt_asm1(.ascii "@") \
__usdt_asm_str(__usdt_argref(__usdt_aval##n))
#define __usdt_asm_args0 /* no arguments */
#define __usdt_asm_args1 __usdt_asm_arg(1)
#define __usdt_asm_args2 __usdt_asm_args1 __usdt_asm1(.ascii " ") __usdt_asm_arg(2)
#define __usdt_asm_args3 __usdt_asm_args2 __usdt_asm1(.ascii " ") __usdt_asm_arg(3)
#define __usdt_asm_args4 __usdt_asm_args3 __usdt_asm1(.ascii " ") __usdt_asm_arg(4)
#define __usdt_asm_args5 __usdt_asm_args4 __usdt_asm1(.ascii " ") __usdt_asm_arg(5)
#define __usdt_asm_args6 __usdt_asm_args5 __usdt_asm1(.ascii " ") __usdt_asm_arg(6)
#define __usdt_asm_args7 __usdt_asm_args6 __usdt_asm1(.ascii " ") __usdt_asm_arg(7)
#define __usdt_asm_args8 __usdt_asm_args7 __usdt_asm1(.ascii " ") __usdt_asm_arg(8)
#define __usdt_asm_args9 __usdt_asm_args8 __usdt_asm1(.ascii " ") __usdt_asm_arg(9)
#define __usdt_asm_args10 __usdt_asm_args9 __usdt_asm1(.ascii " ") __usdt_asm_arg(10)
#define __usdt_asm_args11 __usdt_asm_args10 __usdt_asm1(.ascii " ") __usdt_asm_arg(11)
#define __usdt_asm_args12 __usdt_asm_args11 __usdt_asm1(.ascii " ") __usdt_asm_arg(12)
#define __usdt_asm_args(...) __usdt_apply(__usdt_asm_args, __usdt_narg(__VA_ARGS__))
#define __usdt_is_arr(x) (__builtin_classify_type(x) == 14 || __builtin_classify_type(x) == 5)
#define __usdt_arg_size(x) (__usdt_is_arr(x) ? sizeof(void *) : sizeof(x))
/*
* We can't use __builtin_choose_expr() in C++, so fall back to table-based
* signedness determination for known types, utilizing templates magic.
*/
#ifdef __cplusplus
#define __usdt_is_signed(x) (!__usdt_is_arr(x) && __usdt_t<__typeof(x)>::is_signed)
#include <cstddef>
template<typename T> struct __usdt_t { static const bool is_signed = false; };
template<typename A> struct __usdt_t<A[]> : public __usdt_t<A *> {};
template<typename A, size_t N> struct __usdt_t<A[N]> : public __usdt_t<A *> {};
#define __usdt_def_signed(T) \
template<> struct __usdt_t<T> { static const bool is_signed = true; }; \
template<> struct __usdt_t<const T> { static const bool is_signed = true; }; \
template<> struct __usdt_t<volatile T> { static const bool is_signed = true; }; \
template<> struct __usdt_t<const volatile T> { static const bool is_signed = true; }
#define __usdt_maybe_signed(T) \
template<> struct __usdt_t<T> { static const bool is_signed = (T)-1 < (T)1; }; \
template<> struct __usdt_t<const T> { static const bool is_signed = (T)-1 < (T)1; }; \
template<> struct __usdt_t<volatile T> { static const bool is_signed = (T)-1 < (T)1; }; \
template<> struct __usdt_t<const volatile T> { static const bool is_signed = (T)-1 < (T)1; }
__usdt_def_signed(signed char);
__usdt_def_signed(short);
__usdt_def_signed(int);
__usdt_def_signed(long);
__usdt_def_signed(long long);
__usdt_maybe_signed(char);
__usdt_maybe_signed(wchar_t);
#else /* !__cplusplus */
#define __usdt_is_inttype(x) (__builtin_classify_type(x) >= 1 && __builtin_classify_type(x) <= 4)
#define __usdt_inttype(x) __typeof(__builtin_choose_expr(__usdt_is_inttype(x), (x), 0U))
#define __usdt_is_signed(x) ((__usdt_inttype(x))-1 < (__usdt_inttype(x))1)
#endif /* __cplusplus */
#define __usdt_asm_op(n, x) \
[__usdt_asz##n] "n" ((__usdt_is_signed(x) ? (int)-1 : 1) * (int)__usdt_arg_size(x)), \
[__usdt_aval##n] __usdt_str(USDT_ARG_CONSTRAINT)(x)
#define __usdt_asm_ops0() [__usdt_dummy] "g" (0)
#define __usdt_asm_ops1(x) __usdt_asm_op(1, x)
#define __usdt_asm_ops2(a,x) __usdt_asm_ops1(a), __usdt_asm_op(2, x)
#define __usdt_asm_ops3(a,b,x) __usdt_asm_ops2(a,b), __usdt_asm_op(3, x)
#define __usdt_asm_ops4(a,b,c,x) __usdt_asm_ops3(a,b,c), __usdt_asm_op(4, x)
#define __usdt_asm_ops5(a,b,c,d,x) __usdt_asm_ops4(a,b,c,d), __usdt_asm_op(5, x)
#define __usdt_asm_ops6(a,b,c,d,e,x) __usdt_asm_ops5(a,b,c,d,e), __usdt_asm_op(6, x)
#define __usdt_asm_ops7(a,b,c,d,e,f,x) __usdt_asm_ops6(a,b,c,d,e,f), __usdt_asm_op(7, x)
#define __usdt_asm_ops8(a,b,c,d,e,f,g,x) __usdt_asm_ops7(a,b,c,d,e,f,g), __usdt_asm_op(8, x)
#define __usdt_asm_ops9(a,b,c,d,e,f,g,h,x) __usdt_asm_ops8(a,b,c,d,e,f,g,h), __usdt_asm_op(9, x)
#define __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,x) __usdt_asm_ops9(a,b,c,d,e,f,g,h,i), __usdt_asm_op(10, x)
#define __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,x) __usdt_asm_ops10(a,b,c,d,e,f,g,h,i,j), __usdt_asm_op(11, x)
#define __usdt_asm_ops12(a,b,c,d,e,f,g,h,i,j,k,x) __usdt_asm_ops11(a,b,c,d,e,f,g,h,i,j,k), __usdt_asm_op(12, x)
#define __usdt_asm_ops(...) __usdt_apply(__usdt_asm_ops, __usdt_narg(__VA_ARGS__))(__VA_ARGS__)
#endif /* __USDT_H */


@ -74,6 +74,14 @@
#define noinline __attribute__((noinline))
#endif
#ifndef __nocf_check
#define __nocf_check __attribute__((nocf_check))
#endif
#ifndef __naked
#define __naked __attribute__((__naked__))
#endif
#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#define PR_GET_NO_NEW_PRIVS 39
@ -5027,7 +5035,36 @@ TEST(tsync_vs_dead_thread_leader)
EXPECT_EQ(0, status);
}
noinline int probed(void)
#ifdef __x86_64__
/*
* We need a naked probed_uprobe function. Use __nocf_check to skip
* a possible endbr64 instruction and ignore -Wattributes, otherwise
* the compilation might fail.
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wattributes"
__naked __nocf_check noinline int probed_uprobe(void)
{
/*
* Optimized uprobe is possible only on top of nop5 instruction.
*/
asm volatile (" \n"
".byte 0x0f, 0x1f, 0x44, 0x00, 0x00 \n"
"ret \n"
);
}
#pragma GCC diagnostic pop
#else
noinline int probed_uprobe(void)
{
return 1;
}
#endif
noinline int probed_uretprobe(void)
{
return 1;
}
@ -5080,35 +5117,46 @@ static ssize_t get_uprobe_offset(const void *addr)
return found ? (uintptr_t)addr - start + base : -1;
}
FIXTURE(URETPROBE) {
FIXTURE(UPROBE) {
int fd;
};
FIXTURE_VARIANT(URETPROBE) {
FIXTURE_VARIANT(UPROBE) {
/*
* All of the URETPROBE behaviors can be tested with either
* uretprobe attached or not
* All of the U(RET)PROBE behaviors can be tested with either
* u(ret)probe attached or not
*/
bool attach;
/*
* Test both uprobe and uretprobe.
*/
bool uretprobe;
};
FIXTURE_VARIANT_ADD(URETPROBE, attached) {
.attach = true,
};
FIXTURE_VARIANT_ADD(URETPROBE, not_attached) {
FIXTURE_VARIANT_ADD(UPROBE, not_attached) {
.attach = false,
.uretprobe = false,
};
FIXTURE_SETUP(URETPROBE)
FIXTURE_VARIANT_ADD(UPROBE, uprobe_attached) {
.attach = true,
.uretprobe = false,
};
FIXTURE_VARIANT_ADD(UPROBE, uretprobe_attached) {
.attach = true,
.uretprobe = true,
};
FIXTURE_SETUP(UPROBE)
{
const size_t attr_sz = sizeof(struct perf_event_attr);
struct perf_event_attr attr;
ssize_t offset;
int type, bit;
#ifndef __NR_uretprobe
SKIP(return, "__NR_uretprobe syscall not defined");
#if !defined(__NR_uprobe) || !defined(__NR_uretprobe)
SKIP(return, "__NR_uprobe or __NR_uretprobe syscalls not defined");
#endif
if (!variant->attach)
@ -5118,12 +5166,17 @@ FIXTURE_SETUP(URETPROBE)
type = determine_uprobe_perf_type();
ASSERT_GE(type, 0);
bit = determine_uprobe_retprobe_bit();
ASSERT_GE(bit, 0);
offset = get_uprobe_offset(probed);
if (variant->uretprobe) {
bit = determine_uprobe_retprobe_bit();
ASSERT_GE(bit, 0);
}
offset = get_uprobe_offset(variant->uretprobe ? probed_uretprobe : probed_uprobe);
ASSERT_GE(offset, 0);
attr.config |= 1 << bit;
if (variant->uretprobe)
attr.config |= 1 << bit;
attr.size = attr_sz;
attr.type = type;
attr.config1 = ptr_to_u64("/proc/self/exe");
@ -5134,7 +5187,7 @@ FIXTURE_SETUP(URETPROBE)
PERF_FLAG_FD_CLOEXEC);
}
FIXTURE_TEARDOWN(URETPROBE)
FIXTURE_TEARDOWN(UPROBE)
{
/* we could call close(self->fd), but we'd need extra filter for
* that and since we are calling _exit right away..
@ -5148,11 +5201,17 @@ static int run_probed_with_filter(struct sock_fprog *prog)
return -1;
}
probed();
/*
* Uprobe is optimized after first hit, so let's hit twice.
*/
probed_uprobe();
probed_uprobe();
probed_uretprobe();
return 0;
}
TEST_F(URETPROBE, uretprobe_default_allow)
TEST_F(UPROBE, uprobe_default_allow)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
@ -5165,7 +5224,7 @@ TEST_F(URETPROBE, uretprobe_default_allow)
ASSERT_EQ(0, run_probed_with_filter(&prog));
}
TEST_F(URETPROBE, uretprobe_default_block)
TEST_F(UPROBE, uprobe_default_block)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
@ -5182,11 +5241,14 @@ TEST_F(URETPROBE, uretprobe_default_block)
ASSERT_EQ(0, run_probed_with_filter(&prog));
}
TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
TEST_F(UPROBE, uprobe_block_syscall)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, nr)),
#ifdef __NR_uprobe
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 1, 2),
#endif
#ifdef __NR_uretprobe
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 0, 1),
#endif
@ -5201,11 +5263,14 @@ TEST_F(URETPROBE, uretprobe_block_uretprobe_syscall)
ASSERT_EQ(0, run_probed_with_filter(&prog));
}
TEST_F(URETPROBE, uretprobe_default_block_with_uretprobe_syscall)
TEST_F(UPROBE, uprobe_default_block_with_syscall)
{
struct sock_filter filter[] = {
BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
offsetof(struct seccomp_data, nr)),
#ifdef __NR_uprobe
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uprobe, 3, 0),
#endif
#ifdef __NR_uretprobe
BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, __NR_uretprobe, 2, 0),
#endif