Commit 18755b8c authored by Chuck Lever's avatar Chuck Lever
Browse files

svcrdma: Use contiguous pages for RDMA Read sink buffers



svc_rdma_build_read_segment() constructs RDMA Read sink
buffers by consuming pages one-at-a-time from rq_pages[]
and building one bvec per page. A 64KB NFS READ payload
produces 16 separate bvecs, 16 DMA mappings, and
potentially multiple RDMA Read WRs (on platforms with
4KB pages).

A single higher-order allocation followed by split_page()
yields physically contiguous memory while preserving
per-page refcounts. A single bvec spanning the contiguous
range causes rdma_rw_ctx_init_bvec() to take the
rdma_rw_init_single_wr_bvec() fast path: one DMA mapping,
one SGE, one WR.

The split sub-pages replace the original rq_pages[] entries,
so all downstream page tracking, completion handling, and
xdr_buf assembly remain unchanged.

Allocation uses __GFP_NORETRY | __GFP_NOWARN and falls back
through decreasing orders. If even order-1 fails, the
existing per-page path handles the segment.

When nr_pages is not a power of two, get_order() rounds up
and the allocation yields more pages than needed. The extra
split pages replace existing rq_pages[] entries (freed via
put_page() first), so there is no net increase in per-
request page consumption. Successive segments reuse the
same padding slots, preventing accumulation. The
rq_maxpages guard rejects any allocation that would
overrun the array, falling back to the per-page path.
Under memory pressure, __GFP_NORETRY causes the higher-
order allocation to fail without stalling.

The contiguous path is attempted when the segment starts
page-aligned (rc_pageoff == 0) and spans at least two
pages. NFS WRITE segments carry application-modified byte
ranges of arbitrary length, so the optimization is not
restricted to power-of-two page counts.

Signed-off-by: default avatarChuck Lever <chuck.lever@oracle.com>
parent 4e2866b2
Loading
Loading
Loading
Loading
+223 −0
Original line number Diff line number Diff line
@@ -754,6 +754,216 @@ int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma,
	return xdr->len;
}

/*
 * Cap contiguous RDMA Read sink allocations at order-4.
 * Higher orders risk allocation failure under
 * __GFP_NORETRY, which would negate the benefit of the
 * contiguous fast path.
 */
#define SVC_RDMA_CONTIG_MAX_ORDER	4

/**
 * svc_rdma_alloc_read_pages - Allocate physically contiguous pages
 * @nr_pages: number of pages needed
 * @order: on success, set to the allocation order
 *
 * Attempts a higher-order allocation, falling back to smaller orders.
 * The returned pages are split immediately so each sub-page has its
 * own refcount and can be freed independently.
 *
 * Returns a pointer to the first page on success, or NULL if even
 * order-1 allocation fails.
 */
static struct page *
svc_rdma_alloc_read_pages(unsigned int nr_pages, unsigned int *order)
{
	unsigned int o;
	struct page *page;

	o = min(get_order(nr_pages << PAGE_SHIFT),
		SVC_RDMA_CONTIG_MAX_ORDER);

	while (o >= 1) {
		page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN,
				   o);
		if (page) {
			split_page(page, o);
			*order = o;
			return page;
		}
		o--;
	}
	return NULL;
}

/*
 * svc_rdma_fill_contig_bvec - Replace rq_pages with a contiguous allocation
 * @rqstp: RPC transaction context
 * @head: context for ongoing I/O
 * @bv: bvec entry to fill
 * @pages_left: number of data pages remaining in the segment
 * @len_left: bytes remaining in the segment
 *
 * On success, fills @bv with a bvec spanning the contiguous range and
 * advances rc_curpage/rc_page_count. Returns the byte length covered,
 * or zero if the allocation failed or would overrun rq_maxpages.
 */
static unsigned int
svc_rdma_fill_contig_bvec(struct svc_rqst *rqstp,
			  struct svc_rdma_recv_ctxt *head,
			  struct bio_vec *bv, unsigned int pages_left,
			  unsigned int len_left)
{
	unsigned int order, npages, chunk_pages, chunk_len, i;
	struct page *page;

	page = svc_rdma_alloc_read_pages(pages_left, &order);
	if (!page)
		return 0;
	npages = 1 << order;

	if (head->rc_curpage + npages > rqstp->rq_maxpages) {
		for (i = 0; i < npages; i++)
			__free_page(page + i);
		return 0;
	}

	/*
	 * Replace rq_pages[] entries with pages from the contiguous
	 * allocation. If npages exceeds chunk_pages, the extra pages
	 * stay in rq_pages[] for later reuse or normal rqst teardown.
	 */
	for (i = 0; i < npages; i++) {
		svc_rqst_page_release(rqstp,
				      rqstp->rq_pages[head->rc_curpage + i]);
		rqstp->rq_pages[head->rc_curpage + i] = page + i;
	}

	chunk_pages = min(npages, pages_left);
	chunk_len = min_t(unsigned int, chunk_pages << PAGE_SHIFT, len_left);
	bvec_set_page(bv, page, chunk_len, 0);
	head->rc_page_count += chunk_pages;
	head->rc_curpage += chunk_pages;
	return chunk_len;
}

/*
 * svc_rdma_fill_page_bvec - Add a single rq_page to the bvec array
 * @head: context for ongoing I/O
 * @ctxt: R/W context whose bvec array is being filled
 * @cur: page to add
 * @bvec_idx: pointer to current bvec index, not advanced on merge
 * @len_left: bytes remaining in the segment
 *
 * If @cur is physically contiguous with the preceding bvec, it is
 * merged by extending that bvec's length. Otherwise a new bvec
 * entry is created. Returns the byte length covered.
 */
static unsigned int
svc_rdma_fill_page_bvec(struct svc_rdma_recv_ctxt *head,
			struct svc_rdma_rw_ctxt *ctxt, struct page *cur,
			unsigned int *bvec_idx, unsigned int len_left)
{
	unsigned int chunk_len = min_t(unsigned int, PAGE_SIZE, len_left);

	head->rc_page_count++;
	head->rc_curpage++;

	if (*bvec_idx > 0) {
		struct bio_vec *prev = &ctxt->rw_bvec[*bvec_idx - 1];

		if (page_to_phys(prev->bv_page) + prev->bv_offset +
		    prev->bv_len == page_to_phys(cur)) {
			prev->bv_len += chunk_len;
			return chunk_len;
		}
	}

	bvec_set_page(&ctxt->rw_bvec[*bvec_idx], cur, chunk_len, 0);
	(*bvec_idx)++;
	return chunk_len;
}

/**
 * svc_rdma_build_read_segment_contig - Build RDMA Read WR with contiguous pages
 * @rqstp: RPC transaction context
 * @head: context for ongoing I/O
 * @segment: co-ordinates of remote memory to be read
 *
 * Greedily allocates higher-order pages to cover the segment,
 * building one bvec per contiguous chunk. Each allocation is
 * split so sub-pages have independent refcounts. When a
 * higher-order allocation fails, remaining pages are covered
 * individually, merging adjacent pages into the preceding bvec
 * when they are physically contiguous. The split sub-pages
 * replace entries in rq_pages[] so downstream cleanup is
 * unchanged.
 *
 * Returns:
 *   %0: the Read WR was constructed successfully
 *   %-ENOMEM: allocation failed
 *   %-EIO: a DMA mapping error occurred
 */
static int svc_rdma_build_read_segment_contig(struct svc_rqst *rqstp,
					      struct svc_rdma_recv_ctxt *head,
					      const struct svc_rdma_segment *segment)
{
	struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp);
	struct svc_rdma_chunk_ctxt *cc = &head->rc_cc;
	unsigned int nr_data_pages, bvec_idx;
	struct svc_rdma_rw_ctxt *ctxt;
	unsigned int len_left;
	int ret;

	nr_data_pages = PAGE_ALIGN(segment->rs_length) >> PAGE_SHIFT;
	if (head->rc_curpage + nr_data_pages > rqstp->rq_maxpages)
		return -ENOMEM;

	ctxt = svc_rdma_get_rw_ctxt(rdma, nr_data_pages);
	if (!ctxt)
		return -ENOMEM;

	bvec_idx = 0;
	len_left = segment->rs_length;
	while (len_left) {
		unsigned int pages_left = PAGE_ALIGN(len_left) >> PAGE_SHIFT;
		unsigned int chunk_len = 0;

		if (pages_left >= 2)
			chunk_len = svc_rdma_fill_contig_bvec(rqstp, head,
							      &ctxt->rw_bvec[bvec_idx],
							      pages_left, len_left);
		if (chunk_len) {
			bvec_idx++;
		} else {
			struct page *cur =
				rqstp->rq_pages[head->rc_curpage];
			chunk_len = svc_rdma_fill_page_bvec(head, ctxt, cur,
							    &bvec_idx,
							    len_left);
		}

		len_left -= chunk_len;
	}

	ctxt->rw_nents = bvec_idx;

	head->rc_pageoff = offset_in_page(segment->rs_length);
	if (head->rc_pageoff)
		head->rc_curpage--;

	ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset,
				   segment->rs_handle, segment->rs_length,
				   DMA_FROM_DEVICE);
	if (ret < 0)
		return -EIO;
	percpu_counter_inc(&svcrdma_stat_read);

	list_add(&ctxt->rw_list, &cc->cc_rwctxts);
	cc->cc_sqecount += ret;
	return 0;
}

/**
 * svc_rdma_build_read_segment - Build RDMA Read WQEs to pull one RDMA segment
 * @rqstp: RPC transaction context
@@ -780,6 +990,14 @@ static int svc_rdma_build_read_segment(struct svc_rqst *rqstp,
	if (check_add_overflow(head->rc_pageoff, len, &total))
		return -EINVAL;
	nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT;

	if (head->rc_pageoff == 0 && nr_bvec >= 2) {
		ret = svc_rdma_build_read_segment_contig(rqstp, head,
							 segment);
		if (ret != -ENOMEM)
			return ret;
	}

	ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec);
	if (!ctxt)
		return -ENOMEM;
@@ -1125,6 +1343,11 @@ static void svc_rdma_clear_rqst_pages(struct svc_rqst *rqstp,
{
	unsigned int i;

	/*
	 * Move only pages containing RPC data into rc_pages[]. Pages
	 * from a contiguous allocation that were not used for the
	 * payload remain in rq_pages[] for subsequent reuse.
	 */
	for (i = 0; i < head->rc_page_count; i++) {
		head->rc_pages[i] = rqstp->rq_pages[i];
		rqstp->rq_pages[i] = NULL;