Unverified Commit 4966b466 authored by Christian Brauner
Browse files

Merge patch series "fuse: use iomap for buffered reads + readahead"

Joanne Koong <joannelkoong@gmail.com> says:

This series adds fuse iomap support for buffered reads and readahead.
This is needed so that granular uptodate tracking can be used in fuse when
large folios are enabled, so that only the non-uptodate portions of a folio
need to be read in instead of the entire folio. It is also needed in order to
turn on large folios for servers that use the writeback cache. Without it,
there is a race that may lead to data corruption: if a partial write is
followed by a read before the write has undergone writeback, the folio will
not have been marked uptodate by the partial write, so the read will read in
the entire folio from disk and overwrite the partially written data.

This is on top of two locally-patched iomap patches [1] [2] patched on top of
commit f1c864be6e88 ("Merge branch 'vfs-6.18.async' into vfs.all") in
Christian's vfs.all tree.

This series was run through fstests on fuse passthrough_hp with an
out-of-tree kernel patch enabling fuse large folios.

This patchset does not enable large folios on fuse yet. That will be part
of a different patchset.

* patches from https://lore.kernel.org/20250926002609.1302233-1-joannelkoong@gmail.com:
  fuse: remove fc->blkbits workaround for partial writes
  fuse: use iomap for readahead
  fuse: use iomap for read_folio
  iomap: make iomap_read_folio() a void return
  iomap: move buffered io bio logic into new file
  iomap: add caller-provided callbacks for read and readahead
  iomap: set accurate iter->pos when reading folio ranges
  iomap: track pending read bytes more optimally
  iomap: rename iomap_readpage_ctx struct to iomap_read_folio_ctx
  iomap: rename iomap_readpage_iter() to iomap_read_folio_iter()
  iomap: iterate over folio mapping in iomap_readpage_iter()
  iomap: store read/readahead bio generically
  iomap: move read/readahead bio submission logic into helper function
  iomap: move bio read logic into helper function

Signed-off-by: Christian Brauner <brauner@kernel.org>
parents 7aa6bc3e 93570c65
Loading
Loading
Loading
Loading
+44 −0
Original line number Diff line number Diff line
@@ -135,6 +135,28 @@ These ``struct kiocb`` flags are significant for buffered I/O with iomap:

 * ``IOCB_DONTCACHE``: Turns on ``IOMAP_DONTCACHE``.

``struct iomap_read_ops``
--------------------------

.. code-block:: c

 struct iomap_read_ops {
     int (*read_folio_range)(const struct iomap_iter *iter,
                             struct iomap_read_folio_ctx *ctx, size_t len);
     void (*submit_read)(struct iomap_read_folio_ctx *ctx);
 };

iomap calls these functions:

  - ``read_folio_range``: Called to read ``len`` bytes of ``ctx->cur_folio``,
    starting at file position ``iter->pos``. This callback must be provided
    by the caller. The filesystem is responsible for calling
    ``iomap_finish_folio_read()`` for the range once the read has completed,
    and must do so even if the read fails. Returns 0 on success or a negative
    error code on failure.

  - ``submit_read``: Submit any pending read requests. This function is
    optional.

Internal per-Folio State
------------------------

@@ -182,6 +204,28 @@ The ``flags`` argument to ``->iomap_begin`` will be set to zero.
The pagecache takes whatever locks it needs before calling the
filesystem.

Both ``iomap_readahead`` and ``iomap_read_folio`` pass in a ``struct
iomap_read_folio_ctx``:

.. code-block:: c

 struct iomap_read_folio_ctx {
    const struct iomap_read_ops *ops;
    struct folio *cur_folio;
    struct readahead_control *rac;
    void *read_ctx;
 };

Callers of ``iomap_readahead`` must set:
 * ``ops->read_folio_range()`` and ``rac``

Callers of ``iomap_read_folio`` must set:
 * ``ops->read_folio_range()`` and ``cur_folio``

``ops->submit_read()`` and ``read_ctx`` are optional. ``read_ctx`` is used to
pass in any custom data the caller needs accessible in the ops callbacks for
fulfilling reads.

Buffered Writes
---------------

+3 −2
Original line number Diff line number Diff line
@@ -540,12 +540,13 @@ const struct address_space_operations def_blk_aops = {
#else /* CONFIG_BUFFER_HEAD */
static int blkdev_read_folio(struct file *file, struct folio *folio)
{
	return iomap_read_folio(folio, &blkdev_iomap_ops);
	iomap_bio_read_folio(folio, &blkdev_iomap_ops);
	return 0;
}

static void blkdev_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &blkdev_iomap_ops);
	iomap_bio_readahead(rac, &blkdev_iomap_ops);
}

static ssize_t blkdev_writeback_range(struct iomap_writepage_ctx *wpc,
+3 −2
Original line number Diff line number Diff line
@@ -371,7 +371,8 @@ static int erofs_read_folio(struct file *file, struct folio *folio)
{
	trace_erofs_read_folio(folio, true);

	return iomap_read_folio(folio, &erofs_iomap_ops);
	iomap_bio_read_folio(folio, &erofs_iomap_ops);
	return 0;
}

static void erofs_readahead(struct readahead_control *rac)
@@ -379,7 +380,7 @@ static void erofs_readahead(struct readahead_control *rac)
	trace_erofs_readahead(rac->mapping->host, readahead_index(rac),
					readahead_count(rac), true);

	return iomap_readahead(rac, &erofs_iomap_ops);
	iomap_bio_readahead(rac, &erofs_iomap_ops);
}

static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+1 −1
Original line number Diff line number Diff line
@@ -1192,7 +1192,7 @@ static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode,
	if (attr->blksize != 0)
		blkbits = ilog2(attr->blksize);
	else
		blkbits = fc->blkbits;
		blkbits = inode->i_sb->s_blocksize_bits;

	stat->blksize = 1 << blkbits;
}
+174 −114
Original line number Diff line number Diff line
@@ -834,23 +834,148 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio,
	return 0;
}

/*
 * ->iomap_begin callback for fuse.
 *
 * Describes the requested byte range as a single IOMAP_MAPPED extent that
 * starts at @offset and is @length bytes long.  @inode, @flags and @srcmap
 * are not consulted.  Always returns 0.
 */
static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
			    unsigned int flags, struct iomap *iomap,
			    struct iomap *srcmap)
{
	iomap->offset = offset;
	iomap->length = length;
	iomap->type = IOMAP_MAPPED;

	return 0;
}

/* Mapping callbacks handed to iomap by fuse; only ->iomap_begin is set. */
static const struct iomap_ops fuse_iomap_ops = {
	.iomap_begin	= fuse_iomap_begin,
};

/*
 * Per-call state passed to the fuse iomap read callbacks through
 * iomap_read_folio_ctx->read_ctx.
 */
struct fuse_fill_read_data {
	/* File the read requests are issued against */
	struct file *file;

	/* Fields below are used if sending the read request asynchronously */
	/* Connection consulted for max_pages, congestion state and async_read */
	struct fuse_conn *fc;
	/* Read request currently being filled, or NULL if none is pending */
	struct fuse_io_args *ia;
	/* Number of bytes queued so far in the pending request */
	unsigned int nr_bytes;
};

/* forward declarations */
static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
				  unsigned len, struct fuse_args_pages *ap,
				  unsigned cur_bytes, bool write);
static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
				unsigned int count, bool async);

/*
 * Queue one folio range for readahead.
 *
 * Appends @len bytes of @folio, starting at file position @pos, to the read
 * request being accumulated in @data->ia.  If fuse_folios_need_send() reports
 * that the range cannot be added to the pending request, that request is sent
 * first and a new one is allocated.
 *
 * Returns 0 once the range has been queued, -EAGAIN if the connection is
 * congested and only async readahead pages remain, or -ENOMEM if a new
 * request could not be allocated.  On a non-zero return nothing has been
 * queued for this range.
 */
static int fuse_handle_readahead(struct folio *folio,
				 struct readahead_control *rac,
				 struct fuse_fill_read_data *data, loff_t pos,
				 size_t len)
{
	struct fuse_io_args *ia = data->ia;
	size_t off = offset_in_folio(folio, pos);
	struct fuse_conn *fc = data->fc;
	struct fuse_args_pages *ap;
	unsigned int nr_pages;

	/* Flush the pending request if this range cannot be merged into it */
	if (ia && fuse_folios_need_send(fc, pos, len, &ia->ap, data->nr_bytes,
					false)) {
		fuse_send_readpages(ia, data->file, data->nr_bytes,
				    fc->async_read);
		data->nr_bytes = 0;
		data->ia = NULL;
		ia = NULL;
	}
	/* No request in progress (first range, or just flushed): start one */
	if (!ia) {
		if (fc->num_background >= fc->congestion_threshold &&
		    rac->ra->async_size >= readahead_count(rac))
			/*
			 * Congested and only async pages left, so skip the
			 * rest.
			 */
			return -EAGAIN;

		nr_pages = min(fc->max_pages, readahead_count(rac));
		data->ia = fuse_io_alloc(NULL, nr_pages);
		if (!data->ia)
			return -ENOMEM;
		ia = data->ia;
	}
	/*
	 * Take a folio reference for the queued request; presumably paired
	 * with the folio_put() in fuse_readpages_end() - confirm.
	 */
	folio_get(folio);
	ap = &ia->ap;
	/* Record the folio and the sub-range of it that this request covers */
	ap->folios[ap->num_folios] = folio;
	ap->descs[ap->num_folios].offset = off;
	ap->descs[ap->num_folios].length = len;
	data->nr_bytes += len;
	ap->num_folios++;

	return 0;
}

/*
 * ->read_folio_range callback for fuse reads and readahead.
 *
 * Reads @len bytes of ctx->cur_folio starting at file position iter->pos.
 * Without a readahead control the read is issued synchronously; with one,
 * the range is queued through fuse_handle_readahead().
 *
 * Returns 0 on success or a negative error code.
 */
static int fuse_iomap_read_folio_range_async(const struct iomap_iter *iter,
					     struct iomap_read_folio_ctx *ctx,
					     size_t len)
{
	struct fuse_fill_read_data *data = ctx->read_ctx;
	struct folio *folio = ctx->cur_folio;
	loff_t pos = iter->pos;
	size_t off = offset_in_folio(folio, pos);
	int ret;

	if (!ctx->rac) {
		/*
		 * Plain (non-readahead) reads are done synchronously, since
		 * it's not guaranteed that the server can handle
		 * out-of-order reads.
		 */
		ret = fuse_do_readfolio(data->file, folio, off, len);
		iomap_finish_folio_read(folio, off, len, ret);
		return ret;
	}

	ret = fuse_handle_readahead(folio, ctx->rac, data, pos, len);
	/*
	 * When the range was queued successfully, fuse_readpages_end will
	 * call iomap_finish_folio_read for it; only a failed queue attempt
	 * has to be finished here.
	 */
	if (ret)
		iomap_finish_folio_read(folio, off, len, ret);

	return ret;
}

/*
 * ->submit_read callback: send the read request still being accumulated in
 * the per-call data, if there is one.
 */
static void fuse_iomap_read_submit(struct iomap_read_folio_ctx *ctx)
{
	struct fuse_fill_read_data *data = ctx->read_ctx;
	struct fuse_io_args *pending = data->ia;

	/* Nothing was queued, or everything queued has already been sent */
	if (!pending)
		return;

	fuse_send_readpages(pending, data->file, data->nr_bytes,
			    data->fc->async_read);
}

/*
 * Read callbacks installed in the iomap_read_folio_ctx by both
 * fuse_read_folio() and fuse_readahead().
 */
static const struct iomap_read_ops fuse_iomap_read_ops = {
	.read_folio_range = fuse_iomap_read_folio_range_async,
	.submit_read = fuse_iomap_read_submit,
};

static int fuse_read_folio(struct file *file, struct folio *folio)
{
	struct inode *inode = folio->mapping->host;
	int err;
	struct fuse_fill_read_data data = {
		.file = file,
	};
	struct iomap_read_folio_ctx ctx = {
		.cur_folio = folio,
		.ops = &fuse_iomap_read_ops,
		.read_ctx = &data,

	err = -EIO;
	if (fuse_is_bad(inode))
		goto out;
	};

	err = fuse_do_readfolio(file, folio, 0, folio_size(folio));
	if (!err)
		folio_mark_uptodate(folio);
	if (fuse_is_bad(inode)) {
		folio_unlock(folio);
		return -EIO;
	}

	iomap_read_folio(&fuse_iomap_ops, &ctx);
	fuse_invalidate_atime(inode);
 out:
	folio_unlock(folio);
	return err;
	return 0;
}

static int fuse_iomap_read_folio_range(const struct iomap_iter *iter,
@@ -887,7 +1012,8 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
	fuse_invalidate_atime(inode);

	for (i = 0; i < ap->num_folios; i++) {
		folio_end_read(ap->folios[i], !err);
		iomap_finish_folio_read(ap->folios[i], ap->descs[i].offset,
					ap->descs[i].length, err);
		folio_put(ap->folios[i]);
	}
	if (ia->ff)
@@ -897,7 +1023,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
}

static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,
				unsigned int count)
				unsigned int count, bool async)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_mount *fm = ff->fm;
@@ -919,7 +1045,7 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file,

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	ia->read.attr_ver = fuse_get_attr_version(fm->fc);
	if (fm->fc->async_read) {
	if (async) {
		ia->ff = fuse_file_get(ff);
		ap->args.end = fuse_readpages_end;
		err = fuse_simple_background(fm, &ap->args, GFP_KERNEL);
@@ -936,81 +1062,20 @@ static void fuse_readahead(struct readahead_control *rac)
{
	struct inode *inode = rac->mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	unsigned int max_pages, nr_pages;
	struct folio *folio = NULL;
	struct fuse_fill_read_data data = {
		.file = rac->file,
		.fc = fc,
	};
	struct iomap_read_folio_ctx ctx = {
		.ops = &fuse_iomap_read_ops,
		.rac = rac,
		.read_ctx = &data
	};

	if (fuse_is_bad(inode))
		return;

	max_pages = min_t(unsigned int, fc->max_pages,
			fc->max_read / PAGE_SIZE);

	/*
	 * This is only accurate the first time through, since readahead_folio()
	 * doesn't update readahead_count() from the previous folio until the
	 * next call.  Grab nr_pages here so we know how many pages we're going
	 * to have to process.  This means that we will exit here with
	 * readahead_count() == folio_nr_pages(last_folio), but we will have
	 * consumed all of the folios, and read_pages() will call
	 * readahead_folio() again which will clean up the rac.
	 */
	nr_pages = readahead_count(rac);

	while (nr_pages) {
		struct fuse_io_args *ia;
		struct fuse_args_pages *ap;
		unsigned cur_pages = min(max_pages, nr_pages);
		unsigned int pages = 0;

		if (fc->num_background >= fc->congestion_threshold &&
		    rac->ra->async_size >= readahead_count(rac))
			/*
			 * Congested and only async pages left, so skip the
			 * rest.
			 */
			break;

		ia = fuse_io_alloc(NULL, cur_pages);
		if (!ia)
			break;
		ap = &ia->ap;

		while (pages < cur_pages) {
			unsigned int folio_pages;

			/*
			 * This returns a folio with a ref held on it.
			 * The ref needs to be held until the request is
			 * completed, since the splice case (see
			 * fuse_try_move_page()) drops the ref after it's
			 * replaced in the page cache.
			 */
			if (!folio)
				folio =  __readahead_folio(rac);

			folio_pages = folio_nr_pages(folio);
			if (folio_pages > cur_pages - pages) {
				/*
				 * Large folios belonging to fuse will never
				 * have more pages than max_pages.
				 */
				WARN_ON(!pages);
				break;
			}

			ap->folios[ap->num_folios] = folio;
			ap->descs[ap->num_folios].length = folio_size(folio);
			ap->num_folios++;
			pages += folio_pages;
			folio = NULL;
		}
		fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT);
		nr_pages -= pages;
	}
	if (folio) {
		folio_end_read(folio, false);
		folio_put(folio);
	}
	iomap_readahead(&fuse_iomap_ops, &ctx);
}

static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
@@ -1397,20 +1462,6 @@ static const struct iomap_write_ops fuse_iomap_write_ops = {
	.read_folio_range = fuse_iomap_read_folio_range,
};

/*
 * ->iomap_begin callback for fuse.
 *
 * Describes the requested byte range as a single IOMAP_MAPPED extent that
 * starts at @offset and is @length bytes long.  @inode, @flags and @srcmap
 * are not consulted.  Always returns 0.
 */
static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
			    unsigned int flags, struct iomap *iomap,
			    struct iomap *srcmap)
{
	iomap->offset = offset;
	iomap->length = length;
	iomap->type = IOMAP_MAPPED;

	return 0;
}

/* Mapping callbacks handed to iomap by fuse; only ->iomap_begin is set. */
static const struct iomap_ops fuse_iomap_ops = {
	.iomap_begin	= fuse_iomap_begin,
};

static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
@@ -2047,7 +2098,7 @@ struct fuse_fill_wb_data {
	struct fuse_file *ff;
	unsigned int max_folios;
	/*
	 * nr_bytes won't overflow since fuse_writepage_need_send() caps
	 * nr_bytes won't overflow since fuse_folios_need_send() caps
	 * wb requests to never exceed fc->max_pages (which has an upper bound
	 * of U16_MAX).
	 */
@@ -2092,14 +2143,15 @@ static void fuse_writepages_send(struct inode *inode,
	spin_unlock(&fi->lock);
}

static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
static bool fuse_folios_need_send(struct fuse_conn *fc, loff_t pos,
				  unsigned len, struct fuse_args_pages *ap,
				     struct fuse_fill_wb_data *data)
				  unsigned cur_bytes, bool write)
{
	struct folio *prev_folio;
	struct fuse_folio_desc prev_desc;
	unsigned bytes = data->nr_bytes + len;
	unsigned bytes = cur_bytes + len;
	loff_t prev_pos;
	size_t max_bytes = write ? fc->max_write : fc->max_read;

	WARN_ON(!ap->num_folios);

@@ -2107,8 +2159,7 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
	if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages)
		return true;

	/* Reached max write bytes */
	if (bytes > fc->max_write)
	if (bytes > max_bytes)
		return true;

	/* Discontinuity */
@@ -2118,11 +2169,6 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos,
	if (prev_pos != pos)
		return true;

	/* Need to grow the pages array?  If so, did the expansion fail? */
	if (ap->num_folios == data->max_folios &&
	    !fuse_pages_realloc(data, fc->max_pages))
		return true;

	return false;
}

@@ -2146,11 +2192,25 @@ static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc,
			return -EIO;
	}

	if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) {
	if (wpa) {
		bool send = fuse_folios_need_send(fc, pos, len, ap,
						  data->nr_bytes, true);

		if (!send) {
			/*
			 * Need to grow the pages array?  If so, did the
			 * expansion fail?
			 */
			send = (ap->num_folios == data->max_folios) &&
				!fuse_pages_realloc(data, fc->max_pages);
		}

		if (send) {
			fuse_writepages_send(inode, data);
			data->wpa = NULL;
			data->nr_bytes = 0;
		}
	}

	if (data->wpa == NULL) {
		wpa = fuse_writepage_args_setup(folio, offset, data->ff);
Loading