Commit 8a34e887, authored by Ming Lei, committed by Jens Axboe
Browse files

ublk: eliminate permanent pages[] array from struct ublk_buf



The pages[] array (kvmalloc'd, 8 bytes per page = 2MB for a 1GB buffer)
was stored permanently in struct ublk_buf but only needed during
pin_user_pages_fast() and maple tree construction. Since the maple tree
already stores PFN ranges via ublk_buf_range, struct page pointers can
be recovered via pfn_to_page() during unregistration.

Make pages[] a temporary allocation in ublk_ctrl_reg_buf(), freed
immediately after the maple tree is built. Rewrite __ublk_ctrl_unreg_buf()
to iterate the maple tree for matching buf_index entries, recovering
struct page pointers via pfn_to_page() and unpinning in batches of 32.
Simplify ublk_buf_erase_ranges() to iterate the maple tree by buf_index
instead of walking the now-removed pages[] array.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Link: https://patch.msgid.link/20260331153207.3635125-5-ming.lei@redhat.com


Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 08677040
Loading
Loading
Loading
Loading
+55 −32
Original line number Diff line number Diff line
@@ -296,7 +296,6 @@ struct ublk_queue {

/* Per-registered shared memory buffer */
struct ublk_buf {
	struct page **pages;
	unsigned int nr_pages;
};

@@ -5261,27 +5260,25 @@ static void ublk_unquiesce_and_resume(struct gendisk *disk)
	blk_mq_unquiesce_queue(disk->queue);
}

/* Erase coalesced PFN ranges from the maple tree for pages [0, nr_pages) */
static void ublk_buf_erase_ranges(struct ublk_device *ub,
				  struct ublk_buf *ubuf,
				  unsigned long nr_pages)
/* Erase coalesced PFN ranges from the maple tree matching buf_index */
static void ublk_buf_erase_ranges(struct ublk_device *ub, int buf_index)
{
	unsigned long i;

	for (i = 0; i < nr_pages; ) {
		unsigned long pfn = page_to_pfn(ubuf->pages[i]);
		unsigned long start = i;
	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
	struct ublk_buf_range *range;

		while (i + 1 < nr_pages &&
		       page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
			i++;
		i++;
		kfree(mtree_erase(&ub->buf_tree, pfn));
	mas_lock(&mas);
	mas_for_each(&mas, range, ULONG_MAX) {
		if (range->buf_index == buf_index) {
			mas_erase(&mas);
			kfree(range);
		}
	}
	mas_unlock(&mas);
}

static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
			       struct ublk_buf *ubuf, int index,
			       struct ublk_buf *ubuf,
			       struct page **pages, int index,
			       unsigned short flags)
{
	unsigned long nr_pages = ubuf->nr_pages;
@@ -5289,13 +5286,13 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
	int ret;

	for (i = 0; i < nr_pages; ) {
		unsigned long pfn = page_to_pfn(ubuf->pages[i]);
		unsigned long pfn = page_to_pfn(pages[i]);
		unsigned long start = i;
		struct ublk_buf_range *range;

		/* Find run of consecutive PFNs */
		while (i + 1 < nr_pages &&
		       page_to_pfn(ubuf->pages[i + 1]) == pfn + (i - start) + 1)
		       page_to_pfn(pages[i + 1]) == pfn + (i - start) + 1)
			i++;
		i++;	/* past the last page in this run */

@@ -5320,7 +5317,7 @@ static int __ublk_ctrl_reg_buf(struct ublk_device *ub,
	return 0;

unwind:
	ublk_buf_erase_ranges(ub, ubuf, i);
	ublk_buf_erase_ranges(ub, index);
	return ret;
}

@@ -5335,6 +5332,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
	void __user *argp = (void __user *)(unsigned long)header->addr;
	struct ublk_shmem_buf_reg buf_reg;
	unsigned long addr, size, nr_pages;
	struct page **pages = NULL;
	unsigned int gup_flags;
	struct gendisk *disk;
	struct ublk_buf *ubuf;
@@ -5371,9 +5369,8 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
		goto put_disk;
	}

	ubuf->pages = kvmalloc_array(nr_pages, sizeof(*ubuf->pages),
				     GFP_KERNEL);
	if (!ubuf->pages) {
	pages = kvmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto err_free;
	}
@@ -5382,7 +5379,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
	if (!(buf_reg.flags & UBLK_SHMEM_BUF_READ_ONLY))
		gup_flags |= FOLL_WRITE;

	pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, ubuf->pages);
	pinned = pin_user_pages_fast(addr, nr_pages, gup_flags, pages);
	if (pinned < 0) {
		ret = pinned;
		goto err_free_pages;
@@ -5406,7 +5403,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
	if (ret)
		goto err_unlock;

	ret = __ublk_ctrl_reg_buf(ub, ubuf, index, buf_reg.flags);
	ret = __ublk_ctrl_reg_buf(ub, ubuf, pages, index, buf_reg.flags);
	if (ret) {
		xa_erase(&ub->bufs_xa, index);
		goto err_unlock;
@@ -5414,6 +5411,7 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,

	mutex_unlock(&ub->mutex);

	kvfree(pages);
	ublk_unquiesce_and_resume(disk);
	ublk_put_disk(disk);
	return index;
@@ -5422,9 +5420,9 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
	mutex_unlock(&ub->mutex);
	ublk_unquiesce_and_resume(disk);
err_unpin:
	unpin_user_pages(ubuf->pages, pinned);
	unpin_user_pages(pages, pinned);
err_free_pages:
	kvfree(ubuf->pages);
	kvfree(pages);
err_free:
	kfree(ubuf);
put_disk:
@@ -5433,11 +5431,36 @@ static int ublk_ctrl_reg_buf(struct ublk_device *ub,
}

static void __ublk_ctrl_unreg_buf(struct ublk_device *ub,
				  struct ublk_buf *ubuf)
				  struct ublk_buf *ubuf, int buf_index)
{
	ublk_buf_erase_ranges(ub, ubuf, ubuf->nr_pages);
	unpin_user_pages(ubuf->pages, ubuf->nr_pages);
	kvfree(ubuf->pages);
	MA_STATE(mas, &ub->buf_tree, 0, ULONG_MAX);
	struct ublk_buf_range *range;
	struct page *pages[32];

	mas_lock(&mas);
	mas_for_each(&mas, range, ULONG_MAX) {
		unsigned long base, nr, off;

		if (range->buf_index != buf_index)
			continue;

		base = range->base_pfn;
		nr = mas.last - mas.index + 1;
		mas_erase(&mas);

		for (off = 0; off < nr; ) {
			unsigned int batch = min_t(unsigned long,
						   nr - off, 32);
			unsigned int j;

			for (j = 0; j < batch; j++)
				pages[j] = pfn_to_page(base + off + j);
			unpin_user_pages(pages, batch);
			off += batch;
		}
		kfree(range);
	}
	mas_unlock(&mas);
	kfree(ubuf);
}

@@ -5468,7 +5491,7 @@ static int ublk_ctrl_unreg_buf(struct ublk_device *ub,
		return -ENOENT;
	}

	__ublk_ctrl_unreg_buf(ub, ubuf);
	__ublk_ctrl_unreg_buf(ub, ubuf, index);

	mutex_unlock(&ub->mutex);

@@ -5483,7 +5506,7 @@ static void ublk_buf_cleanup(struct ublk_device *ub)
	unsigned long index;

	xa_for_each(&ub->bufs_xa, index, ubuf)
		__ublk_ctrl_unreg_buf(ub, ubuf);
		__ublk_ctrl_unreg_buf(ub, ubuf, index);
	xa_destroy(&ub->bufs_xa);
	mtree_destroy(&ub->buf_tree);
}