Commit 8dd5e7c7 authored by Christoph Hellwig, committed by Jens Axboe
Browse files

block: add helpers to bounce buffer an iov_iter into bios



Add helpers that bounce buffer data into a bio, in order to implement
direct I/O for cases where direct user access is not possible because
stable in-flight data is required.  These are intended to be as easy to
use as bio_iov_iter_get_pages is for the zero-copy path.

The write side is trivial and just copies data into the bounce buffer.
The read side is a lot more complex because it needs to perform the copy
from the completion context, and without preserving the iov_iter through
the call chain.  It steals a trick from the integrity data user interface
and uses the first vector in the bio for the bounce buffer data that is
fed to the block I/O stack, and uses the others to record the user
buffer fragments.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Tested-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent 301f5356
Loading
Loading
Loading
Loading
+179 −0
Original line number Diff line number Diff line
@@ -1266,6 +1266,185 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter,
	return bio_iov_iter_align_down(bio, iter, len_align_mask);
}

/*
 * Allocate a folio for up to *@size bytes, shrinking the request on failure.
 *
 * While *@size is larger than a page, the allocation is attempted with
 * __GFP_NORETRY added to @gfp.  If that fails, *@size is replaced by
 * (*@size - 1) rounded down to a power of two and the attempt is repeated.
 * Once *@size is no larger than a page, a final allocation is made with the
 * caller's @gfp unchanged.
 *
 * On return *@size holds the size that the last allocation attempt was made
 * for.  Returns the folio, or NULL if the final allocation failed.
 */
static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
{
	for (;;) {
		struct folio *folio;

		/* Nothing left to shrink: this is the last attempt. */
		if (*size <= PAGE_SIZE)
			return folio_alloc(gfp, get_order(*size));

		folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
		if (folio)
			return folio;

		*size = rounddown_pow_of_two(*size - 1);
	}
}

/*
 * Drop one reference on the folio behind every bvec of @bio, skipping any
 * bvec that points at the zero folio.
 */
static void bio_free_folios(struct bio *bio)
{
	struct bio_vec *bvec;
	int idx;

	bio_for_each_bvec_all(bvec, bio, idx) {
		struct folio *folio = page_folio(bvec->bv_page);

		if (is_zero_folio(folio))
			continue;
		folio_put(folio);
	}
}

/*
 * Fill @bio with freshly allocated folios holding a copy of the data in
 * @iter.
 *
 * @bio must not be cloned, must not carry any data yet and must have at
 * least one free vector; otherwise a warning is emitted once and -EINVAL is
 * returned.
 *
 * Data is copied in chunks of at most SZ_1M.  Chunks larger than two pages
 * are rounded down to a power of two before allocating.  Filling stops when
 * @iter is drained, the vectors are used up, the next chunk would push the
 * bio past BIO_MAX_SIZE, or an allocation fails.
 *
 * Returns 0 if at least some data was added, -ENOMEM if nothing could be
 * added, or -EFAULT if copying from @iter came up short (in which case the
 * folios added so far are released).
 */
static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
{
	size_t remaining = iov_iter_count(iter);

	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)) ||
	    WARN_ON_ONCE(bio->bi_iter.bi_size) ||
	    WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
		return -EINVAL;

	for (;;) {
		size_t chunk = min(remaining, SZ_1M);
		struct folio *folio;
		size_t copied;

		if (chunk > PAGE_SIZE * 2)
			chunk = rounddown_pow_of_two(chunk);

		/* Adding this chunk would exceed the maximum bio size. */
		if (bio->bi_iter.bi_size > BIO_MAX_SIZE - chunk)
			break;

		/* May shrink chunk if a large folio is not available. */
		folio = folio_alloc_greedy(GFP_KERNEL, &chunk);
		if (!folio)
			break;
		bio_add_folio_nofail(bio, folio, chunk, 0);

		copied = copy_from_iter(folio_address(folio), chunk, iter);
		if (copied != chunk) {
			bio_free_folios(bio);
			return -EFAULT;
		}

		remaining -= chunk;
		if (!remaining || bio->bi_vcnt >= bio->bi_max_vecs)
			break;
	}

	return bio->bi_iter.bi_size ? 0 : -ENOMEM;
}

/*
 * Set up @bio for a bounce-buffered read into @iter.
 *
 * A single bounce folio of at most SZ_1M is allocated and later installed as
 * bi_io_vec[0]; that is the only vector the block I/O stack gets to see.
 * The user buffer fragments extracted from @iter are recorded in
 * bi_io_vec[1..], with bi_vcnt counting those user vectors, so that the
 * completion side can copy the data back without access to @iter.
 *
 * Returns 0 on success, -ENOMEM if the bounce folio could not be allocated,
 * or the return value of iov_iter_extract_bvecs() if not a single user
 * vector could be extracted.
 */
static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
{
	size_t len = min(iov_iter_count(iter), SZ_1M);
	struct folio *folio;

	/* May shrink len to the size that could actually be allocated. */
	folio = folio_alloc_greedy(GFP_KERNEL, &len);
	if (!folio)
		return -ENOMEM;

	do {
		ssize_t ret;

		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
				&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
		if (ret <= 0) {
			if (!bio->bi_vcnt) {
				/*
				 * Nothing was extracted, so the bounce folio
				 * is never attached to the bio.  Drop it here
				 * as nobody else can free it.
				 */
				folio_put(folio);
				return ret;
			}
			/* Partial progress: go ahead with what we have. */
			break;
		}
		len -= ret;
		bio->bi_iter.bi_size += ret;
	} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);

	/*
	 * Set the folio directly here.  The above loop has already calculated
	 * the correct bi_size, and we use bi_vcnt for the user buffers.  That
	 * is safe as bi_vcnt is only used by the submitter and not the actual
	 * I/O path.
	 */
	bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
	if (iov_iter_extract_will_pin(iter))
		bio_set_flag(bio, BIO_PAGE_PINNED);
	return 0;
}

/**
 * bio_iov_iter_bounce - bounce buffer data from an iter into a bio
 * @bio:	bio to send
 * @iter:	iter to read from / write into
 *
 * Helper for direct I/O implementations that need to bounce buffer because
 * we need to checksum the data or perform other operations that require
 * consistency.  Allocates folios to back the bounce buffer, and for writes
 * copies the data into it.  Needs to be paired with bio_iov_iter_unbounce()
 * called on completion.
 */
int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
{
	if (op_is_write(bio_op(bio)))
		return bio_iov_iter_bounce_write(bio, iter);
	return bio_iov_iter_bounce_read(bio, iter);
}

/*
 * Unpin the user pages covered by @bv, optionally marking the folio dirty
 * first.  The page count is derived from the index of the first and last
 * page that the [bv_offset, bv_offset + bv_len) range touches.
 */
static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
{
	struct folio *folio = page_folio(bv->bv_page);
	size_t first_page = bv->bv_offset / PAGE_SIZE;
	size_t last_page = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE;
	size_t nr_pages = last_page - first_page + 1;

	if (mark_dirty)
		folio_mark_dirty_lock(folio);
	unpin_user_folio(folio, nr_pages);
}

/*
 * Complete a bounce-buffered read.
 *
 * bi_io_vec[0] is the bounce buffer; the bi_vcnt vectors that follow it
 * describe the user buffer.  Unless @is_error is set, the bounce buffer
 * contents are copied out to the user vectors.  If the bio is flagged
 * BIO_PAGE_PINNED the user vectors are unpinned, and marked dirty if
 * @mark_dirty is set and data was actually copied.  Finally the reference on
 * the bounce folio is dropped.
 */
static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
		bool mark_dirty)
{
	struct bio_vec *bounce = &bio->bi_io_vec[0];
	struct bio_vec *user = bio->bi_io_vec + 1;
	unsigned int nbytes = bounce->bv_len;

	if (likely(!is_error)) {
		struct iov_iter dst;
		size_t copied;

		iov_iter_bvec(&dst, ITER_DEST, user, bio->bi_vcnt, nbytes);
		copied = copy_to_iter(bvec_virt(bounce), nbytes, &dst);
		/* copying to pinned pages should always work */
		WARN_ON_ONCE(copied != nbytes);
	} else {
		/* Nothing was written to the user folios, keep them clean. */
		mark_dirty = false;
	}

	if (bio_flagged(bio, BIO_PAGE_PINNED)) {
		struct bio_vec *bv;

		for (bv = user; bv < user + bio->bi_vcnt; bv++)
			bvec_unpin(bv, mark_dirty);
	}

	folio_put(page_folio(bounce->bv_page));
}

/**
 * bio_iov_iter_unbounce - tear down a bounce buffered bio after completion
 * @bio:	bio that has completed
 * @is_error:	%true if the I/O failed, in which case no data is copied back
 * @mark_dirty:	%true to mark the original folios dirty
 *
 * Counterpart to bio_iov_iter_bounce() for direct I/O implementations that
 * bounce buffer because the data is checksummed or otherwise needs to stay
 * consistent while in flight.  For reads the data is copied back to the
 * original buffer and the original folios are marked dirty if requested.
 * In both directions the bounce buffer is released.
 */
void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
{
	if (!op_is_write(bio_op(bio))) {
		bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
		return;
	}

	/* Writes only own the bounce folios, there is nothing to copy back. */
	bio_free_folios(bio);
}

static void submit_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
+26 −0
Original line number Diff line number Diff line
@@ -397,6 +397,29 @@ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
	return iov_iter_npages(iter, max_segs);
}

/**
 * bio_iov_bounce_nr_vecs - number of bvecs to allocate for a bounce bio
 * @iter:	iter that is going to be bounced
 * @op:		REQ_OP_* the bio will be issued with
 *
 * Returns the number of bvecs the next bio bouncing from/to @iter needs.
 */
static inline unsigned short
bio_iov_bounce_nr_vecs(struct iov_iter *iter, blk_opf_t op)
{
	/*
	 * Unlike bio_iov_vecs_to_alloc() there is no special case for bvec
	 * iters, as those need to be bounced as well.
	 *
	 * A read additionally consumes one vector for the bounce buffer
	 * itself, so reserve it out of the maximum and add it back on top of
	 * the page count.
	 */
	const int reserved = op_is_write(op) ? 0 : 1;

	return iov_iter_npages(iter, BIO_MAX_VECS - reserved) + reserved;
}

struct request_queue;

void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
@@ -450,6 +473,9 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);

int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter);
void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty);

extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
			       struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);