Unverified Commit ee4cdf7b authored by David Howells's avatar David Howells Committed by Christian Brauner
Browse files

netfs: Speed up buffered reading



Improve the efficiency of buffered reads in a number of ways:

 (1) Overhaul the algorithm in general so that it's a lot more compact and
     split the read submission code between buffered and unbuffered
     versions.  The unbuffered version can be vastly simplified.

 (2) Read-result collection is handed off to a work queue rather than being
     done in the I/O thread.  Multiple subrequests can be processes
     simultaneously.

 (3) When a subrequest is collected, any folios it fully spans are
     collected and "spare" data on either side is donated to either the
     previous or the next subrequest in the sequence.

Notes:

 (*) Readahead expansion is massively slows down fio, presumably because it
     causes a load of extra allocations, both folio and xarray, up front
     before RPC requests can be transmitted.

 (*) RDMA with cifs does appear to work, both with SIW and RXE.

 (*) PG_private_2-based reading and copy-to-cache is split out into its own
     file and altered to use folio_queue.  Note that the copy to the cache
     now creates a new write transaction against the cache and adds the
     folios to be copied into it.  This allows it to use part of the
     writeback I/O code.

Signed-off-by: default avatarDavid Howells <dhowells@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20240814203850.2240469-20-dhowells@redhat.com/

 # v2
Signed-off-by: default avatarChristian Brauner <brauner@kernel.org>
parent 2e45b922
Loading
Loading
Loading
Loading
+8 −3
Original line number Diff line number Diff line
@@ -68,17 +68,22 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct p9_fid *fid = rreq->netfs_priv;
	unsigned long long pos = subreq->start + subreq->transferred;
	int total, err;

	total = p9_client_read(fid, subreq->start + subreq->transferred,
			       &subreq->io_iter, &err);
	total = p9_client_read(fid, pos, &subreq->io_iter, &err);

	/* if we just extended the file size, any portion not in
	 * cache won't be on server and is zeroes */
	if (subreq->rreq->origin != NETFS_DIO_READ)
		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
	if (pos + total >= i_size_read(rreq->inode))
		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);

	netfs_subreq_terminated(subreq, err ?: total, false);
	if (!err)
		subreq->transferred += total;

	netfs_read_subreq_terminated(subreq, err, false);
}

/**
+15 −6
Original line number Diff line number Diff line
@@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/netfs.h>
#include <trace/events/netfs.h>
#include "internal.h"

static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
@@ -242,9 +243,10 @@ static void afs_fetch_data_notify(struct afs_operation *op)

	req->error = error;
	if (subreq) {
		if (subreq->rreq->origin != NETFS_DIO_READ)
			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
		netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
		subreq->rreq->i_size = req->file_size;
		if (req->pos + req->actual_len >= req->file_size)
			__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
		netfs_read_subreq_terminated(subreq, error, false);
		req->subreq = NULL;
	} else if (req->done) {
		req->done(req);
@@ -262,6 +264,12 @@ static void afs_fetch_data_success(struct afs_operation *op)
	afs_fetch_data_notify(op);
}

static void afs_fetch_data_aborted(struct afs_operation *op)
{
	afs_check_for_remote_deletion(op);
	afs_fetch_data_notify(op);
}

static void afs_fetch_data_put(struct afs_operation *op)
{
	op->fetch.req->error = afs_op_error(op);
@@ -272,7 +280,7 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
	.issue_afs_rpc	= afs_fs_fetch_data,
	.issue_yfs_rpc	= yfs_fs_fetch_data,
	.success	= afs_fetch_data_success,
	.aborted	= afs_check_for_remote_deletion,
	.aborted	= afs_fetch_data_aborted,
	.failed		= afs_fetch_data_notify,
	.put		= afs_fetch_data_put,
};
@@ -294,7 +302,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
	op = afs_alloc_operation(req->key, vnode->volume);
	if (IS_ERR(op)) {
		if (req->subreq)
			netfs_subreq_terminated(req->subreq, PTR_ERR(op), false);
			netfs_read_subreq_terminated(req->subreq, PTR_ERR(op), false);
		return PTR_ERR(op);
	}

@@ -313,7 +321,7 @@ static void afs_read_worker(struct work_struct *work)

	fsreq = afs_alloc_read(GFP_NOFS);
	if (!fsreq)
		return netfs_subreq_terminated(subreq, -ENOMEM, false);
		return netfs_read_subreq_terminated(subreq, -ENOMEM, false);

	fsreq->subreq	= subreq;
	fsreq->pos	= subreq->start + subreq->transferred;
@@ -322,6 +330,7 @@ static void afs_read_worker(struct work_struct *work)
	fsreq->vnode	= vnode;
	fsreq->iter	= &subreq->io_iter;

	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
	afs_fetch_data(fsreq->vnode, fsreq);
	afs_put_read(fsreq);
}
+7 −2
Original line number Diff line number Diff line
@@ -304,6 +304,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
	struct afs_vnode_param *vp = &op->file[0];
	struct afs_read *req = op->fetch.req;
	const __be32 *bp;
	size_t count_before;
	int ret;

	_enter("{%u,%zu,%zu/%llu}",
@@ -345,10 +346,14 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)

		/* extract the returned data */
	case 2:
		_debug("extract data %zu/%llu",
		       iov_iter_count(call->iter), req->actual_len);
		count_before = call->iov_len;
		_debug("extract data %zu/%llu", count_before, req->actual_len);

		ret = afs_extract_data(call, true);
		if (req->subreq) {
			req->subreq->transferred += count_before - call->iov_len;
			netfs_read_subreq_progress(req->subreq, false);
		}
		if (ret < 0)
			return ret;

+7 −2
Original line number Diff line number Diff line
@@ -355,6 +355,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
	struct afs_vnode_param *vp = &op->file[0];
	struct afs_read *req = op->fetch.req;
	const __be32 *bp;
	size_t count_before;
	int ret;

	_enter("{%u,%zu, %zu/%llu}",
@@ -391,10 +392,14 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)

		/* extract the returned data */
	case 2:
		_debug("extract data %zu/%llu",
		       iov_iter_count(call->iter), req->actual_len);
		count_before = call->iov_len;
		_debug("extract data %zu/%llu", count_before, req->actual_len);

		ret = afs_extract_data(call, true);
		if (req->subreq) {
			req->subreq->transferred += count_before - call->iov_len;
			netfs_read_subreq_progress(req->subreq, false);
		}
		if (ret < 0)
			return ret;

+46 −30
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/netfs.h>
#include <trace/events/netfs.h>

#include "super.h"
#include "mds_client.h"
@@ -205,21 +206,6 @@ static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
	}
}

static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)
{
	struct inode *inode = subreq->rreq->inode;
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 objno, objoff;
	u32 xlen;

	/* Truncate the extent at the end of the current block */
	ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
				      &objno, &objoff, &xlen);
	subreq->len = min(xlen, fsc->mount_options->rsize);
	return true;
}

static void finish_netfs_read(struct ceph_osd_request *req)
{
	struct inode *inode = req->r_inode;
@@ -264,7 +250,12 @@ static void finish_netfs_read(struct ceph_osd_request *req)
				     calc_pages_for(osd_data->alignment,
					osd_data->length), false);
	}
	netfs_subreq_terminated(subreq, err, false);
	if (err > 0) {
		subreq->transferred = err;
		err = 0;
	}
	trace_netfs_sreq(subreq, netfs_sreq_trace_io_progress);
	netfs_read_subreq_terminated(subreq, err, false);
	iput(req->r_inode);
	ceph_dec_osd_stopping_blocker(fsc->mdsc);
}
@@ -278,7 +269,6 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct iov_iter iter;
	ssize_t err = 0;
	size_t len;
	int mode;
@@ -301,6 +291,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
	req->r_num_caps = 2;

	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0)
		goto out;
@@ -314,17 +305,36 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
	}

	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
	iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);
	err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
	if (err == 0)
	err = copy_to_iter(iinfo->inline_data + subreq->start, len, &subreq->io_iter);
	if (err == 0) {
		err = -EFAULT;
	} else {
		subreq->transferred += err;
		err = 0;
	}

	ceph_mdsc_put_request(req);
out:
	netfs_subreq_terminated(subreq, err, false);
	netfs_read_subreq_terminated(subreq, err, false);
	return true;
}

static int ceph_netfs_prepare_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
	struct inode *inode = rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	u64 objno, objoff;
	u32 xlen;

	/* Truncate the extent at the end of the current block */
	ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
				      &objno, &objoff, &xlen);
	rreq->io_streams[0].sreq_max_len = umin(xlen, fsc->mount_options->rsize);
	return 0;
}

static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
{
	struct netfs_io_request *rreq = subreq->rreq;
@@ -334,9 +344,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
	struct ceph_client *cl = fsc->client;
	struct ceph_osd_request *req = NULL;
	struct ceph_vino vino = ceph_vino(inode);
	struct iov_iter iter;
	int err = 0;
	u64 len = subreq->len;
	int err;
	u64 len;
	bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
	u64 off = subreq->start;
	int extent_cnt;
@@ -349,6 +358,12 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
	if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
		return;

	// TODO: This rounding here is slightly dodgy.  It *should* work, for
	// now, as the cache only deals in blocks that are a multiple of
	// PAGE_SIZE and fscrypt blocks are at most PAGE_SIZE.  What needs to
	// happen is for the fscrypt driving to be moved into netfslib and the
	// data in the cache also to be stored encrypted.
	len = subreq->len;
	ceph_fscrypt_adjust_off_and_len(inode, &off, &len);

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
@@ -371,8 +386,6 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
	doutc(cl, "%llx.%llx pos=%llu orig_len=%zu len=%llu\n",
	      ceph_vinop(inode), subreq->start, subreq->len, len);

	iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);

	/*
	 * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
	 * encrypted inodes. We'd need infrastructure that handles an iov_iter
@@ -384,7 +397,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
		struct page **pages;
		size_t page_off;

		err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
		err = iov_iter_get_pages_alloc2(&subreq->io_iter, &pages, len, &page_off);
		if (err < 0) {
			doutc(cl, "%llx.%llx failed to allocate pages, %d\n",
			      ceph_vinop(inode), err);
@@ -399,7 +412,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
						 false);
	} else {
		osd_req_op_extent_osd_iter(req, 0, &iter);
		osd_req_op_extent_osd_iter(req, 0, &subreq->io_iter);
	}
	if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
		err = -EIO;
@@ -410,17 +423,19 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
	req->r_inode = inode;
	ihold(inode);

	trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
	ceph_osdc_start_request(req->r_osdc, req);
out:
	ceph_osdc_put_request(req);
	if (err)
		netfs_subreq_terminated(subreq, err, false);
		netfs_read_subreq_terminated(subreq, err, false);
	doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err);
}

static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)
{
	struct inode *inode = rreq->inode;
	struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode);
	struct ceph_client *cl = ceph_inode_to_client(inode);
	int got = 0, want = CEPH_CAP_FILE_CACHE;
	struct ceph_netfs_request_data *priv;
@@ -472,6 +487,7 @@ static int ceph_init_request(struct netfs_io_request *rreq, struct file *file)

	priv->caps = got;
	rreq->netfs_priv = priv;
	rreq->io_streams[0].sreq_max_len = fsc->mount_options->rsize;

out:
	if (ret < 0)
@@ -496,9 +512,9 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
const struct netfs_request_ops ceph_netfs_ops = {
	.init_request		= ceph_init_request,
	.free_request		= ceph_netfs_free_request,
	.prepare_read		= ceph_netfs_prepare_read,
	.issue_read		= ceph_netfs_issue_read,
	.expand_readahead	= ceph_netfs_expand_readahead,
	.clamp_length		= ceph_netfs_clamp_length,
	.check_write_begin	= ceph_netfs_check_write_begin,
};

Loading