Commit eedd5b12 authored by Leon Romanovsky
Browse files

RDMA/umem: Store ODP access mask information in PFN



As a preparation to remove dma_list, store access mask in PFN pointer
and not in dma_addr_t.

Tested-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
parent 8cad4713
Loading
Loading
Loading
Loading
+43 −60
Original line number Diff line number Diff line
@@ -296,22 +296,11 @@ EXPORT_SYMBOL(ib_umem_odp_release);
static int ib_umem_odp_map_dma_single_page(
		struct ib_umem_odp *umem_odp,
		unsigned int dma_index,
		struct page *page,
		u64 access_mask)
		struct page *page)
{
	struct ib_device *dev = umem_odp->umem.ibdev;
	dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];

	if (*dma_addr) {
		/*
		 * If the page is already dma mapped it means it went through
		 * a non-invalidating trasition, like read-only to writable.
		 * Resync the flags.
		 */
		*dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
		return 0;
	}

	*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
				    DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(dev, *dma_addr)) {
@@ -319,7 +308,6 @@ static int ib_umem_odp_map_dma_single_page(
		return -EFAULT;
	}
	umem_odp->npages++;
	*dma_addr |= access_mask;
	return 0;
}

@@ -355,9 +343,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
	struct hmm_range range = {};
	unsigned long timeout;

	if (access_mask == 0)
		return -EINVAL;

	if (user_virt < ib_umem_start(umem_odp) ||
	    user_virt + bcnt > ib_umem_end(umem_odp))
		return -EFAULT;
@@ -383,7 +368,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
	if (fault) {
		range.default_flags = HMM_PFN_REQ_FAULT;

		if (access_mask & ODP_WRITE_ALLOWED_BIT)
		if (access_mask & HMM_PFN_WRITE)
			range.default_flags |= HMM_PFN_REQ_WRITE;
	}

@@ -415,22 +400,17 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
	for (pfn_index = 0; pfn_index < num_pfns;
		pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {

		if (fault) {
		/*
		 * Since we asked for hmm_range_fault() to populate
		 * pages it shouldn't return an error entry on success.
		 */
			WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
			WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
		} else {
			if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
				WARN_ON(umem_odp->dma_list[dma_index]);
		WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
		WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
		if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
			continue;

		if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
			continue;
			}
			access_mask = ODP_READ_ALLOWED_BIT;
			if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
				access_mask |= ODP_WRITE_ALLOWED_BIT;
		}

		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
		/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -445,13 +425,14 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
		}

		ret = ib_umem_odp_map_dma_single_page(
				umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
				access_mask);
			umem_odp, dma_index,
			hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
		if (ret < 0) {
			ibdev_dbg(umem_odp->umem.ibdev,
				  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
			break;
		}
		range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
	}
	/* upon success lock should stay on hold for the callee */
	if (!ret)
@@ -471,7 +452,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
				 u64 bound)
{
	dma_addr_t dma_addr;
	dma_addr_t dma;
	int idx;
	u64 addr;
@@ -482,19 +462,22 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
	virt = max_t(u64, virt, ib_umem_start(umem_odp));
	bound = min_t(u64, bound, ib_umem_end(umem_odp));
	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
		unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
					PAGE_SHIFT;
		struct page *page =
			hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);

		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		dma = umem_odp->dma_list[idx];

		/* The access flags guaranteed a valid DMA address in case was NULL */
		if (dma) {
			unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
			struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
			goto clear;
		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
			goto clear;

			dma_addr = dma & ODP_DMA_ADDR_MASK;
			ib_dma_unmap_page(dev, dma_addr,
					  BIT(umem_odp->page_shift),
		ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
				  DMA_BIDIRECTIONAL);
			if (dma & ODP_WRITE_ALLOWED_BIT) {
		if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
			struct page *head_page = compound_head(page);
			/*
			 * set_page_dirty prefers being called with
@@ -507,9 +490,9 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
			 */
			set_page_dirty(head_page);
		}
			umem_odp->dma_list[idx] = 0;
		umem_odp->npages--;
		}
clear:
		umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
	}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
+1 −0
Original line number Diff line number Diff line
@@ -351,6 +351,7 @@ struct mlx5_ib_flow_db {
#define MLX5_IB_UPD_XLT_PD	      BIT(4)
#define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)
#define MLX5_IB_UPD_XLT_INDIRECT      BIT(6)
#define MLX5_IB_UPD_XLT_DOWNGRADE     BIT(7)

/* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
 *
+19 −18
Original line number Diff line number Diff line
@@ -34,6 +34,7 @@
#include <linux/kernel.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/hmm.h>

#include "mlx5_ib.h"
#include "cmd.h"
@@ -158,22 +159,12 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
	}
}

static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;

	if (umem_dma & ODP_READ_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_READ;
	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_WRITE;

	return mtt_entry;
}

static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *mr, int flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
	unsigned long pfn;
	dma_addr_t pa;
	size_t i;

@@ -181,8 +172,17 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
		return;

	for (i = 0; i < nentries; i++) {
		pfn = odp->pfn_list[idx + i];
		if (!(pfn & HMM_PFN_VALID))
			/* ODP initialization */
			continue;

		pa = odp->dma_list[idx + i];
		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
		pa |= MLX5_IB_MTT_READ;
		if ((pfn & HMM_PFN_WRITE) && !downgrade)
			pa |= MLX5_IB_MTT_WRITE;

		pas[i] = cpu_to_be64(pa);
	}
}

@@ -303,8 +303,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
		if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
@@ -687,7 +686,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
{
	int page_shift, ret, np;
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	u64 access_mask;
	u64 access_mask = 0;
	u64 start_idx;
	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
@@ -695,12 +694,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
	if (flags & MLX5_PF_FLAGS_ENABLE)
		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

	if (flags & MLX5_PF_FLAGS_DOWNGRADE)
		xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;

	page_shift = odp->page_shift;
	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
	access_mask = ODP_READ_ALLOWED_BIT;

	if (odp->umem.writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;
		access_mask |= HMM_PFN_WRITE;

	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
	if (np < 0)
+6 −8
Original line number Diff line number Diff line
@@ -27,7 +27,7 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/* update umem_odp->dma_list */
	/* update umem_odp->map.pfn_list */
	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	mutex_unlock(&umem_odp->umem_mutex);
@@ -45,12 +45,11 @@ static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcn
{
	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
	bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
	u64 access_mask;
	u64 access_mask = 0;
	int np;

	access_mask = ODP_READ_ALLOWED_BIT;
	if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
		access_mask |= ODP_WRITE_ALLOWED_BIT;
		access_mask |= HMM_PFN_WRITE;

	/*
	 * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
@@ -138,7 +137,7 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
	while (addr < iova + length) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;

		if (!(umem_odp->dma_list[idx] & perm)) {
		if (!(umem_odp->map.pfn_list[idx] & perm)) {
			need_fault = true;
			break;
		}
@@ -162,15 +161,14 @@ static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u
{
	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
	bool need_fault;
	u64 perm;
	u64 perm = 0;
	int err;

	if (unlikely(length < 1))
		return -EINVAL;

	perm = ODP_READ_ALLOWED_BIT;
	if (!(flags & RXE_PAGEFAULT_RDONLY))
		perm |= ODP_WRITE_ALLOWED_BIT;
		perm |= HMM_PFN_WRITE;

	mutex_lock(&umem_odp->umem_mutex);

+1 −13
Original line number Diff line number Diff line
@@ -8,6 +8,7 @@

#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
#include <linux/hmm.h>

struct ib_umem_odp {
	struct ib_umem umem;
@@ -67,19 +68,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
	       umem_odp->page_shift;
}

/*
 * The lower 2 bits of the DMA address signal the R/W permissions for
 * the entry. To upgrade the permissions, provide the appropriate
 * bitmask to the map_dma_pages function.
 *
 * Be aware that upgrading a mapped address might result in change of
 * the DMA address for the page.
 */
#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)

#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))

#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING

struct ib_umem_odp *