Commit adf12a39 authored by Linus Torvalds
Browse files
Pull perf fixes from Thomas Gleixner:
 "Perf fixes for perf_mmap() reference counting to prevent potential
  reference count leaks which are caused by:

   - VMA splits, which change the offset or size of a mapping and
     thereby cause perf_mmap_close() to ignore the unmap or to unmap
     the wrong buffer.

   - Several internal issues of perf_mmap(), which can cause reference
     count leaks in the perf mmap, corrupt accounting or cause leaks in
     perf drivers.

  The main fix is to prevent VMA splits by implementing the
  [may_]split() callback for vm operations.

  The other issues are addressed by rearranging code, early returns on
  failure and invocation of cleanups.

  Also provide a selftest to validate the fixes.

  The reference counting should be converted to refcount_t, but that
  requires larger refactoring of the code and will be done once these
  fixes are upstream"

* tag 'perf-fixes-27504' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git:
  selftests/perf_events: Add a mmap() correctness test
  perf/core: Prevent VMA split of buffer mappings
  perf/core: Handle buffer mapping fail correctly in perf_mmap()
  perf/core: Exit early on perf_mmap() fail
  perf/core: Don't leak AUX buffer refcount on allocation failure
  perf/core: Preserve AUX buffer allocation failure result
parents 8466d393 084d2ac4
Loading
Loading
Loading
Loading
+28 −8
Original line number Diff line number Diff line
@@ -6842,10 +6842,20 @@ static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
	return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
}

/*
 * may_split() callback for perf buffer mappings.
 *
 * A split would leave VMAs whose offsets and sizes no longer match the
 * buffer they were created for, which in turn leaks reference counts in
 * the open()/close() handlers. Refuse every split request, regardless of
 * @vma and @addr.
 *
 * Returns -EINVAL unconditionally.
 */
static int perf_mmap_may_split(struct vm_area_struct *vma, unsigned long addr)
{
	return -EINVAL;
}

/*
 * VM operations installed on perf buffer mappings by perf_mmap().
 *
 * open/close are the per-VMA setup and teardown handlers, pfn_mkwrite
 * handles write faults, and may_split rejects every attempt to split the
 * VMA so that the mapping seen by close() always matches the one that was
 * created.
 */
static const struct vm_operations_struct perf_mmap_vmops = {
	.open		= perf_mmap_open,
	.close		= perf_mmap_close, /* non mergeable */
	.pfn_mkwrite	= perf_mmap_pfn_mkwrite,
	.may_split	= perf_mmap_may_split,
};

static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
@@ -7051,8 +7061,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
			ret = 0;
			goto unlock;
		}

		atomic_set(&rb->aux_mmap_count, 1);
	}

	user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
@@ -7115,14 +7123,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
		perf_event_update_time(event);
		perf_event_init_userpage(event);
		perf_event_update_userpage(event);
		ret = 0;
	} else {
		ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
				   event->attr.aux_watermark, flags);
		if (!ret)
		if (!ret) {
			atomic_set(&rb->aux_mmap_count, 1);
			rb->aux_mmap_locked = extra;
		}

	ret = 0;
	}

unlock:
	if (!ret) {
@@ -7131,6 +7140,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)

		atomic_inc(&event->mmap_count);
	} else if (rb) {
		/* AUX allocation failed */
		atomic_dec(&rb->mmap_count);
	}
aux_unlock:
@@ -7138,6 +7148,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
		mutex_unlock(aux_mutex);
	mutex_unlock(&event->mmap_mutex);

	if (ret)
		return ret;

	/*
	 * Since pinned accounting is per vm we cannot allow fork() to copy our
	 * vma.
@@ -7145,13 +7158,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
	vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
	vma->vm_ops = &perf_mmap_vmops;

	if (!ret)
		ret = map_range(rb, vma);

	mapped = get_mapped(event, event_mapped);
	if (mapped)
		mapped(event, vma->vm_mm);

	/*
	 * Try to map it into the page table. On fail, invoke
	 * perf_mmap_close() to undo the above, as the callsite expects
	 * full cleanup in this case and therefore does not invoke
	 * vmops::close().
	 */
	ret = map_range(rb, vma);
	if (ret)
		perf_mmap_close(vma);

	return ret;
}

+1 −0
Original line number Diff line number Diff line
@@ -2,3 +2,4 @@
sigtrap_threads
remove_on_exec
watermark_signal
mmap
+1 −1
Original line number Diff line number Diff line
@@ -2,5 +2,5 @@
CFLAGS += -Wl,-no-as-needed -Wall $(KHDR_INCLUDES)
LDFLAGS += -lpthread

TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal
TEST_GEN_PROGS := sigtrap_threads remove_on_exec watermark_signal mmap
include ../lib.mk
+236 −0
Original line number Diff line number Diff line
// SPDX-License-Identifier: GPL-2.0-only
#define _GNU_SOURCE

#include <dirent.h>
#include <sched.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/types.h>

#include <linux/perf_event.h>

#include "../kselftest_harness.h"

/*
 * Size of the ring buffer mapping, size of the AUX buffer mapping and the
 * file offset at which the AUX buffer is mapped.
 */
#define RB_SIZE		0x3000
#define AUX_SIZE	0x10000
#define AUX_OFFS	0x4000

/*
 * Length used by the tests for partial unmap/remap/overmap attempts
 * (presumably one page; assumes 4K pages - TODO confirm on other page sizes).
 */
#define HOLE_SIZE	0x1000

/* Reserve space for rb, aux with space for shrink-beyond-vma testing. */
#define REGION_SIZE	(2 * RB_SIZE + 2 * AUX_SIZE)
/* Offset inside the reserved region at which the AUX buffer is placed. */
#define REGION_AUX_OFFS (2 * RB_SIZE)

/* Capability found while probing events: plain ring buffer or AUX buffer. */
#define MAP_BASE	1
#define MAP_AUX		2

/* sysfs directory listing the available perf event sources. */
#define EVENT_SRC_DIR	"/sys/bus/event_source/devices"

/*
 * Per-test state shared between setup, the test body and teardown.
 */
FIXTURE(perf_mmap)
{
	/* File descriptor of the perf event opened in setup. */
	int		fd;
	/* Mapping under test: the ring buffer or, for the aux variant, the AUX buffer. */
	void		*ptr;
	/* Start of the PROT_NONE reservation that hosts all perf mappings. */
	void		*region;
};

/*
 * Parameters that distinguish the test variants.
 */
FIXTURE_VARIANT(perf_mmap)
{
	/* True when the AUX buffer mapping is tested instead of the ring buffer. */
	bool		aux;
	/* Size in bytes of the mapping that self->ptr points to. */
	unsigned long	ptr_size;
};

/* Variant exercising the ring buffer mapping. */
FIXTURE_VARIANT_ADD(perf_mmap, rb)
{
	.aux = false,
	.ptr_size = RB_SIZE,
};

/* Variant exercising the AUX buffer mapping. */
FIXTURE_VARIANT_ADD(perf_mmap, aux)
{
	.aux = true,
	.ptr_size = AUX_SIZE,
};

/*
 * Read the event type number of the event source named by @dent.
 *
 * The value is parsed as an unsigned decimal from
 * EVENT_SRC_DIR/<d_name>/type and stored in @type.
 *
 * Returns true when a number was parsed, false when the file could not be
 * opened or did not start with a number.
 */
static bool read_event_type(struct dirent *dent, __u32 *type)
{
	char path[512];
	bool parsed = false;
	FILE *file;

	snprintf(path, sizeof(path), "%s/%s/type", EVENT_SRC_DIR, dent->d_name);

	file = fopen(path, "r");
	if (file) {
		/* fscanf() yields EOF or 0 when nothing could be converted */
		parsed = fscanf(file, "%u", type) > 0;
		fclose(file);
	}

	return parsed;
}

/*
 * Find a perf event that can be mmap()ed and create the mapping under test.
 *
 * A PROT_NONE anonymous region of REGION_SIZE bytes is reserved first; all
 * perf mappings are then placed inside it with MAP_FIXED. Every entry in
 * EVENT_SRC_DIR is probed: the event is opened, its ring buffer is mapped
 * and, for the aux variant, an AUX buffer mapping is attempted as well. The
 * attributes of the first suitable event are remembered in attr_ok and all
 * probe mappings and file descriptors are released again.
 *
 * After probing, the chosen event is reopened and mapped for real:
 *   self->fd     - the perf event file descriptor
 *   self->region - the PROT_NONE reservation
 *   self->ptr    - ring buffer (rb variant) or AUX buffer (aux variant)
 *
 * The test is skipped when the event source directory cannot be opened,
 * when no event could be opened and mapped, or when the aux variant finds
 * no event that supports an AUX buffer.
 */
FIXTURE_SETUP(perf_mmap)
{
	struct perf_event_attr attr = {
		.size		= sizeof(attr),
		.disabled	= 1,
		.exclude_kernel	= 1,
		.exclude_hv	= 1,
	};
	struct perf_event_attr attr_ok = {};
	unsigned int eacces = 0, map = 0;
	struct perf_event_mmap_page *rb;
	struct dirent *dent;
	void *aux, *region;
	DIR *dir;

	self->ptr = NULL;
	// Teardown closes every fd other than -1, so mark it as not opened yet
	self->fd = -1;

	dir = opendir(EVENT_SRC_DIR);
	if (!dir)
		SKIP(return, "perf not available.");

	region = mmap(NULL, REGION_SIZE, PROT_NONE, MAP_ANON | MAP_PRIVATE, -1, 0);
	ASSERT_NE(region, MAP_FAILED);
	self->region = region;

	// Try to find a suitable event on this system
	while ((dent = readdir(dir))) {
		int fd;

		if (!read_event_type(dent, &attr.type))
			continue;

		fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
		if (fd < 0) {
			// Remember permission problems for a more helpful skip message
			if (errno == EACCES)
				eacces++;
			continue;
		}

		// Check whether the event supports mmap()
		rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);
		if (rb == MAP_FAILED) {
			close(fd);
			continue;
		}

		if (!map) {
			// Save the event in case that no AUX capable event is found
			attr_ok = attr;
			map = MAP_BASE;
		}

		if (!variant->aux) {
			// The rb variant only needs one mappable event. Release the
			// probe mapping and fd instead of leaking them and stop
			// scanning; attr_ok cannot change anymore at this point.
			munmap(rb, RB_SIZE);
			close(fd);
			break;
		}

		// Announce the AUX area in the user page before mapping it
		rb->aux_offset = AUX_OFFS;
		rb->aux_size = AUX_SIZE;

		// Check whether it supports a AUX buffer
		aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
			   MAP_SHARED | MAP_FIXED, fd, AUX_OFFS);
		if (aux == MAP_FAILED) {
			munmap(rb, RB_SIZE);
			close(fd);
			continue;
		}

		// AUX capable event found: prefer it over a previously saved one
		attr_ok = attr;
		map = MAP_AUX;
		munmap(aux, AUX_SIZE);
		munmap(rb, RB_SIZE);
		close(fd);
		break;
	}
	closedir(dir);

	if (!map) {
		if (!eacces)
			SKIP(return, "No mappable perf event found.");
		else
			SKIP(return, "No permissions for perf_event_open()");
	}

	// Reopen the selected event and create the mappings the tests use
	self->fd = syscall(SYS_perf_event_open, &attr_ok, 0, -1, -1, 0);
	ASSERT_NE(self->fd, -1);

	rb = mmap(region, RB_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, self->fd, 0);
	ASSERT_NE(rb, MAP_FAILED);

	if (!variant->aux) {
		self->ptr = rb;
		return;
	}

	if (map != MAP_AUX)
		SKIP(return, "No AUX event found.");

	rb->aux_offset = AUX_OFFS;
	rb->aux_size = AUX_SIZE;
	aux = mmap(region + REGION_AUX_OFFS, AUX_SIZE, PROT_READ | PROT_WRITE,
		   MAP_SHARED | MAP_FIXED, self->fd, AUX_OFFS);
	ASSERT_NE(aux, MAP_FAILED);
	self->ptr = aux;
}

/*
 * Release the reserved region, including any perf mappings placed inside
 * it, and close the perf event unless its fd is marked as -1.
 */
FIXTURE_TEARDOWN(perf_mmap)
{
	ASSERT_EQ(munmap(self->region, REGION_SIZE), 0);
	if (self->fd != -1)
		ASSERT_EQ(close(self->fd), 0);
}

/*
 * Verify that mremap() cannot split the mapping under test, while remaps
 * that leave the VMA intact (shrinking only past its end, or moving the
 * whole mapping) still succeed.
 */
TEST_F(perf_mmap, remap)
{
	void *tmp, *ptr = self->ptr;
	unsigned long size = variant->ptr_size;

	// Test the invalid remaps: shrinking from the start, shrinking from
	// an offset inside the mapping, and growing the last HOLE_SIZE bytes
	// are all expected to fail
	ASSERT_EQ(mremap(ptr, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
	ASSERT_EQ(mremap(ptr + HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);
	ASSERT_EQ(mremap(ptr + size - HOLE_SIZE, HOLE_SIZE, size, MREMAP_MAYMOVE), MAP_FAILED);
	// Shrink the end of the mapping such that we only unmap past end of the VMA,
	// which should succeed and poke a hole into the PROT_NONE region
	ASSERT_NE(mremap(ptr + size - HOLE_SIZE, size, HOLE_SIZE, MREMAP_MAYMOVE), MAP_FAILED);

	// Allocate a destination area for the MREMAP_FIXED remaps below
	tmp = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	ASSERT_NE(tmp, MAP_FAILED);

	// Try splitting offset 1 hole size into VMA, this should fail
	ASSERT_EQ(mremap(ptr + HOLE_SIZE, size - HOLE_SIZE, size - HOLE_SIZE,
			 MREMAP_MAYMOVE | MREMAP_FIXED, tmp), MAP_FAILED);
	// Remapping the whole thing should succeed fine
	ptr = mremap(ptr, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, tmp);
	ASSERT_EQ(ptr, tmp);
	// The mapping now lives at tmp; unmapping all of it must succeed
	ASSERT_EQ(munmap(tmp, size), 0);
}

/*
 * Verify that partial munmap() of the mapping under test is rejected at
 * its start, one HOLE_SIZE into it, and at its end.
 */
TEST_F(perf_mmap, unmap)
{
	unsigned long size = variant->ptr_size;

	// Try to poke holes into the mappings; each attempt must fail
	ASSERT_NE(munmap(self->ptr, HOLE_SIZE), 0);
	ASSERT_NE(munmap(self->ptr + HOLE_SIZE, HOLE_SIZE), 0);
	ASSERT_NE(munmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE), 0);
}

/*
 * Verify that mapping anonymous memory with MAP_FIXED over part of the
 * mapping under test is rejected at its start, one HOLE_SIZE into it, and
 * at its end.
 */
TEST_F(perf_mmap, map)
{
	unsigned long size = variant->ptr_size;

	// Try to poke holes into the mappings by mapping anonymous memory over it;
	// each attempt must fail
	ASSERT_EQ(mmap(self->ptr, HOLE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
	ASSERT_EQ(mmap(self->ptr + HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
	ASSERT_EQ(mmap(self->ptr + size - HOLE_SIZE, HOLE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANON | MAP_FIXED, -1, 0), MAP_FAILED);
}

TEST_HARNESS_MAIN