Commit 3ab1db3c authored by Jens Axboe's avatar Jens Axboe
Browse files

io_uring: get rid of remap_pfn_range() for mapping rings/sqes



Rather than use remap_pfn_range() for this and manually free later,
switch to using vm_insert_pages() and have it Just Work.

If possible, allocate a single compound page that covers the range that
is needed. If that works, then we can just use page_address() on that
page. If we fail to get a compound page, allocate single pages and use
vmap() to map them into the kernel virtual address space.

This just covers the rings/sqes, the other remaining user of the mmap
remap_pfn_range() user will be converted separately. Once that is done,
we can kill the old alloc/free code.

Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 62346c6c
Loading
Loading
Loading
Loading
+131 −8
Original line number Diff line number Diff line
@@ -2599,6 +2599,36 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
	return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}

static void io_pages_unmap(void *ptr, struct page ***pages,
			   unsigned short *npages)
{
	bool do_vunmap = false;

	if (!ptr)
		return;

	if (*npages) {
		struct page **to_free = *pages;
		int i;

		/*
		 * Only did vmap for the non-compound multiple page case.
		 * For the compound page, we just need to put the head.
		 */
		if (PageCompound(to_free[0]))
			*npages = 1;
		else if (*npages > 1)
			do_vunmap = true;
		for (i = 0; i < *npages; i++)
			put_page(to_free[i]);
	}
	if (do_vunmap)
		vunmap(ptr);
	kvfree(*pages);
	*pages = NULL;
	*npages = 0;
}

void io_mem_free(void *ptr)
{
	if (!ptr)
@@ -2699,8 +2729,8 @@ static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
static void io_rings_free(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
		io_mem_free(ctx->rings);
		io_mem_free(ctx->sq_sqes);
		io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages);
		io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages);
	} else {
		io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
		ctx->n_ring_pages = 0;
@@ -2712,6 +2742,80 @@ static void io_rings_free(struct io_ring_ctx *ctx)
	ctx->sq_sqes = NULL;
}

static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
				   size_t size, gfp_t gfp)
{
	struct page *page;
	int i, order;

	order = get_order(size);
	if (order > MAX_PAGE_ORDER)
		return ERR_PTR(-ENOMEM);
	else if (order)
		gfp |= __GFP_COMP;

	page = alloc_pages(gfp, order);
	if (!page)
		return ERR_PTR(-ENOMEM);

	for (i = 0; i < nr_pages; i++)
		pages[i] = page + i;

	return page_address(page);
}

static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size,
				 gfp_t gfp)
{
	void *ret;
	int i;

	for (i = 0; i < nr_pages; i++) {
		pages[i] = alloc_page(gfp);
		if (!pages[i])
			goto err;
	}

	ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
	if (ret)
		return ret;
err:
	while (i--)
		put_page(pages[i]);
	return ERR_PTR(-ENOMEM);
}

static void *io_pages_map(struct page ***out_pages, unsigned short *npages,
			  size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN;
	struct page **pages;
	int nr_pages;
	void *ret;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp);
	if (!pages)
		return ERR_PTR(-ENOMEM);

	ret = io_mem_alloc_compound(pages, nr_pages, size, gfp);
	if (!IS_ERR(ret))
		goto done;

	ret = io_mem_alloc_single(pages, nr_pages, size, gfp);
	if (!IS_ERR(ret)) {
done:
		*out_pages = pages;
		*npages = nr_pages;
		return ret;
	}

	kvfree(pages);
	*out_pages = NULL;
	*npages = 0;
	return ret;
}

void *io_mem_alloc(size_t size)
{
	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP;
@@ -3298,14 +3402,12 @@ static void *io_uring_validate_mmap_request(struct file *file,
		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
			return ERR_PTR(-EINVAL);
		ptr = ctx->rings;
		break;
		return ctx->rings;
	case IORING_OFF_SQES:
		/* Don't allow mmap if the ring was setup without it */
		if (ctx->flags & IORING_SETUP_NO_MMAP)
			return ERR_PTR(-EINVAL);
		ptr = ctx->sq_sqes;
		break;
		return ctx->sq_sqes;
	case IORING_OFF_PBUF_RING: {
		struct io_buffer_list *bl;
		unsigned int bgid;
@@ -3329,11 +3431,22 @@ static void *io_uring_validate_mmap_request(struct file *file,
	return ptr;
}

int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
			struct page **pages, int npages)
{
	unsigned long nr_pages = npages;

	vm_flags_set(vma, VM_DONTEXPAND);
	return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages);
}

#ifdef CONFIG_MMU

static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct io_ring_ctx *ctx = file->private_data;
	size_t sz = vma->vm_end - vma->vm_start;
	long offset = vma->vm_pgoff << PAGE_SHIFT;
	unsigned long pfn;
	void *ptr;

@@ -3341,6 +3454,16 @@ static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	switch (offset & IORING_OFF_MMAP_MASK) {
	case IORING_OFF_SQ_RING:
	case IORING_OFF_CQ_RING:
		return io_uring_mmap_pages(ctx, vma, ctx->ring_pages,
						ctx->n_ring_pages);
	case IORING_OFF_SQES:
		return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages,
						ctx->n_sqe_pages);
	}

	pfn = virt_to_phys(ptr) >> PAGE_SHIFT;
	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
}
@@ -3630,7 +3753,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
		return -EOVERFLOW;

	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
		rings = io_mem_alloc(size);
		rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
	else
		rings = io_rings_map(ctx, p->cq_off.user_addr, size);

@@ -3655,7 +3778,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
	}

	if (!(ctx->flags & IORING_SETUP_NO_MMAP))
		ptr = io_mem_alloc(size);
		ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
	else
		ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);

+2 −0
Original line number Diff line number Diff line
@@ -70,6 +70,8 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags);
void __io_commit_cqring_flush(struct io_ring_ctx *ctx);

struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages);
int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma,
			struct page **pages, int npages);

struct file *io_file_get_normal(struct io_kiocb *req, int fd);
struct file *io_file_get_fixed(struct io_kiocb *req, int fd,