Commit 864690cf authored by Satyanarayana K V P's avatar Satyanarayana K V P Committed by Matthew Brost
Browse files

drm/xe/vf: Attach and detach CCS copy commands with BO



Attach the CCS read/write copy commands to a BO when its memory type
changes from NULL -> TT or from system -> TT.
Detach the CCS read/write copy commands from the BO when the TTM BO is
deleted, from xe_ttm_bo_delete_mem_notify().

Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Link: https://lore.kernel.org/r/20250722120506.6483-3-satyanarayana.k.v.p@intel.com
parent f3009272
Loading
Loading
Loading
Loading
+35 −0
Original line number Diff line number Diff line
@@ -60,6 +60,41 @@ struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 dwords, bool usm)
	return ERR_PTR(err);
}

/**
 * xe_bb_ccs_new() - Create a batch buffer backed by a CCS BB pool.
 * @gt: GT used to look up the tile that owns the CCS BB pools.
 * @dwords: Number of command dwords the caller intends to emit.
 * @ctx_id: Index of the CCS context whose BB pool is used.
 *
 * The buffer is sub-allocated from
 * tile->sriov.vf.ccs[@ctx_id].mem.ccs_bb_pool and starts out empty
 * (len == 0).
 *
 * Return: The new batch buffer, or an ERR_PTR() carrying -ENOMEM or the
 * error reported by xe_sa_bo_new().
 */
struct xe_bb *xe_bb_ccs_new(struct xe_gt *gt, u32 dwords,
			    enum xe_sriov_vf_ccs_rw_ctxs ctx_id)
{
	struct xe_sa_manager *pool;
	struct xe_tile *tile;
	struct xe_bb *bb;
	int ret;

	bb = kmalloc(sizeof(*bb), GFP_KERNEL);
	tile = gt_to_tile(gt);
	if (!bb)
		return ERR_PTR(-ENOMEM);

	pool = tile->sriov.vf.ccs[ctx_id].mem.ccs_bb_pool;

	/*
	 * Reserve the requested dwords plus one more for
	 * MI_BATCH_BUFFER_END. The whole SA is submitted to HW, so the
	 * extra dword acts as a guard that keeps the last instruction
	 * from being overwritten when the final chunk of the SA is handed
	 * out to a BB.
	 */
	bb->bo = xe_sa_bo_new(pool, 4 * (dwords + 1));
	if (!IS_ERR(bb->bo)) {
		bb->cs = xe_sa_bo_cpu_addr(bb->bo);
		bb->len = 0;
		return bb;
	}

	/* Sub-allocation failed: release the wrapper and forward the error. */
	ret = PTR_ERR(bb->bo);
	kfree(bb);
	return ERR_PTR(ret);
}

static struct xe_sched_job *
__xe_bb_create_job(struct xe_exec_queue *q, struct xe_bb *bb, u64 *addr)
{
+3 −0
Original line number Diff line number Diff line
@@ -13,8 +13,11 @@ struct dma_fence;
struct xe_gt;
struct xe_exec_queue;
struct xe_sched_job;
enum xe_sriov_vf_ccs_rw_ctxs;

struct xe_bb *xe_bb_new(struct xe_gt *gt, u32 dwords, bool usm);
/*
 * Allocate a batch buffer for CCS copy commands; @ctx_id selects the
 * per-context CCS BB pool the buffer is sub-allocated from.
 */
struct xe_bb *xe_bb_ccs_new(struct xe_gt *gt, u32 dwords,
			    enum xe_sriov_vf_ccs_rw_ctxs ctx_id);
struct xe_sched_job *xe_bb_create_job(struct xe_exec_queue *q,
				      struct xe_bb *bb);
struct xe_sched_job *xe_bb_create_migration_job(struct xe_exec_queue *q,
+23 −0
Original line number Diff line number Diff line
@@ -33,6 +33,7 @@
#include "xe_pxp.h"
#include "xe_res_cursor.h"
#include "xe_shrinker.h"
#include "xe_sriov_vf_ccs.h"
#include "xe_trace_bo.h"
#include "xe_ttm_stolen_mgr.h"
#include "xe_vm.h"
@@ -965,6 +966,20 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
	dma_fence_put(fence);
	xe_pm_runtime_put(xe);

	/*
	 * CCS meta data is migrated from TT -> SMEM. So, let us detach the
	 * BBs from BO as it is no longer needed.
	 */
	if (IS_VF_CCS_BB_VALID(xe, bo) && old_mem_type == XE_PL_TT &&
	    new_mem->mem_type == XE_PL_SYSTEM)
		xe_sriov_vf_ccs_detach_bo(bo);

	if (IS_SRIOV_VF(xe) &&
	    ((move_lacks_source && new_mem->mem_type == XE_PL_TT) ||
	     (old_mem_type == XE_PL_SYSTEM && new_mem->mem_type == XE_PL_TT)) &&
	    handle_system_ccs)
		ret = xe_sriov_vf_ccs_attach_bo(bo);

out:
	if ((!ttm_bo->resource || ttm_bo->resource->mem_type == XE_PL_SYSTEM) &&
	    ttm_bo->ttm) {
@@ -975,6 +990,9 @@ static int xe_bo_move(struct ttm_buffer_object *ttm_bo, bool evict,
		if (timeout < 0)
			ret = timeout;

		if (IS_VF_CCS_BB_VALID(xe, bo))
			xe_sriov_vf_ccs_detach_bo(bo);

		xe_tt_unmap_sg(xe, ttm_bo->ttm);
	}

@@ -1502,9 +1520,14 @@ static void xe_ttm_bo_release_notify(struct ttm_buffer_object *ttm_bo)

static void xe_ttm_bo_delete_mem_notify(struct ttm_buffer_object *ttm_bo)
{
	struct xe_bo *bo = ttm_to_xe_bo(ttm_bo);

	if (!xe_bo_is_xe_bo(ttm_bo))
		return;

	if (IS_VF_CCS_BB_VALID(ttm_to_xe_device(ttm_bo->bdev), bo))
		xe_sriov_vf_ccs_detach_bo(bo);

	/*
	 * Object is idle and about to be destroyed. Release the
	 * dma-buf attachment.
+3 −0
Original line number Diff line number Diff line
@@ -77,6 +77,9 @@ struct xe_bo {
	/** @ccs_cleared */
	bool ccs_cleared;

	/** @bb_ccs: BB instructions of CCS read/write. Valid only for VF */
	struct xe_bb *bb_ccs[XE_SRIOV_VF_CCS_CTX_COUNT];

	/**
	 * @cpu_caching: CPU caching mode. Currently only used for userspace
	 * objects. Exceptions are system memory on DGFX, which is always
+148 −0
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@
#include "xe_mocs.h"
#include "xe_pt.h"
#include "xe_res_cursor.h"
#include "xe_sa.h"
#include "xe_sched_job.h"
#include "xe_sync.h"
#include "xe_trace_bo.h"
@@ -954,6 +955,153 @@ struct dma_fence *xe_migrate_copy(struct xe_migrate *m,
	return fence;
}

/**
 * xe_migrate_lrc() - Get the LRC from migrate context.
 * @migrate: Migrate context.
 *
 * Return: Pointer to LRC 0 of the migrate context's queue
 * (migrate->q->lrc[0]). This accessor performs no checks and has no
 * failure path of its own.
 */
struct xe_lrc *xe_migrate_lrc(struct xe_migrate *migrate)
{
	return migrate->q->lrc[0];
}

/*
 * Write a five-dword MI_FLUSH_DW sequence (TLB invalidate + store-dword,
 * OR'ed with the caller's @flags) into @dw starting at index @i. The
 * address operand is the start-seqno GGTT address of the migrate LRC,
 * and the sequence is padded with two MI_NOOPs.
 *
 * Returns the index of the first dword after the emitted sequence.
 */
static int emit_flush_invalidate(struct xe_migrate *m, u32 *dw, int i,
				 u32 flags)
{
	struct xe_lrc *lrc = xe_migrate_lrc(m);
	u32 *cmd = dw + i;

	cmd[0] = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW |
		 MI_FLUSH_IMM_DW | flags;
	cmd[1] = lower_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc)) |
		 MI_FLUSH_DW_USE_GTT;
	cmd[2] = upper_32_bits(xe_lrc_start_seqno_ggtt_addr(lrc));
	cmd[3] = MI_NOOP;
	cmd[4] = MI_NOOP;

	return i + 5;
}

/**
 * xe_migrate_ccs_rw_copy() - Build the CCS read/write batch buffer for a BO.
 * @m: The migration context.
 * @src_bo: The buffer object whose CCS metadata is to be copied.
 * @read_write: Selects the CCS context (read or write) the BB commands are
 *              created for.
 *
 * Creates batch buffer instructions to copy CCS metadata from CCS pool to
 * memory and vice versa. The commands are only emitted into a batch buffer
 * obtained from xe_bb_ccs_new(); nothing is submitted from here. On
 * success the batch buffer is stored in @src_bo->bb_ccs[@read_write].
 *
 * This function should only be called for IGPU.
 *
 * Return: 0 if successful, negative error code on failure.
 */
int xe_migrate_ccs_rw_copy(struct xe_migrate *m,
			   struct xe_bo *src_bo,
			   enum xe_sriov_vf_ccs_rw_ctxs read_write)
{
	bool src_is_pltt = read_write == XE_SRIOV_VF_CCS_READ_CTX;
	bool dst_is_pltt = read_write == XE_SRIOV_VF_CCS_WRITE_CTX;
	struct ttm_resource *src = src_bo->ttm.resource;
	struct xe_gt *gt = m->tile->primary_gt;
	struct xe_device *xe = gt_to_xe(gt);
	struct xe_res_cursor src_it, ccs_it;
	u64 size = xe_bo_size(src_bo);
	u64 src_L0, src_L0_ofs;
	struct xe_bb *bb;
	u32 batch_size;
	u32 src_L0_pt;

	xe_res_first_sg(xe_bo_sg(src_bo), 0, size, &src_it);

	xe_res_first_sg(xe_bo_sg(src_bo), xe_bo_ccs_pages_start(src_bo),
			PAGE_ALIGN(xe_device_ccs_bytes(xe, size)),
			&ccs_it);

	/*
	 * Pass 1: calculate the batch buffer size. Nothing is emitted here,
	 * the loop only adds up the dwords each chunk is going to need.
	 */
	batch_size = 0;
	while (size) {
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
		u64 ccs_ofs, ccs_size;
		u32 ccs_pt;

		/* Two emit_flush_invalidate() sequences of 5 dwords each */
		batch_size += 10; /* Flush + ggtt addr + 2 NOP */

		src_L0 = min_t(u64, max_mem_transfer_per_pass(xe), size);

		batch_size += pte_update_size(m, false, src, &src_it, &src_L0,
					      &src_L0_ofs, &src_L0_pt, 0, 0,
					      avail_pts);

		ccs_size = xe_device_ccs_bytes(xe, src_L0);
		batch_size += pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
					      &ccs_pt, 0, avail_pts, avail_pts);
		xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));

		/* Add copy commands size here */
		batch_size += EMIT_COPY_CCS_DW;

		size -= src_L0;
	}

	bb = xe_bb_ccs_new(gt, batch_size, read_write);
	if (IS_ERR(bb)) {
		drm_err(&xe->drm, "BB allocation failed.\n");
		return PTR_ERR(bb);
	}

	size = xe_bo_size(src_bo);

	/*
	 * Pass 2: emit PTE and copy commands here.
	 * The CCS copy command can only support limited size. If the size to be
	 * copied is more than the limit, divide copy into chunks. So, calculate
	 * sizes here again before copy command is emitted.
	 */
	while (size) {
		u32 avail_pts = max_mem_transfer_per_pass(xe) / LEVEL0_PAGE_TABLE_ENCODE_SIZE;
		u64 ccs_ofs, ccs_size;
		u32 flush_flags = 0;
		u32 ccs_pt;

		src_L0 = xe_migrate_res_sizes(m, &src_it);

		/*
		 * pte_update_size() is called for its output parameters
		 * (chunk size, offset and page-table index); the dword
		 * count it returns was already accounted for in pass 1.
		 */
		pte_update_size(m, false, src, &src_it, &src_L0,
				&src_L0_ofs, &src_L0_pt, 0, 0,
				avail_pts);

		ccs_size = xe_device_ccs_bytes(xe, src_L0);
		pte_update_size(m, 0, NULL, &ccs_it, &ccs_size, &ccs_ofs,
				&ccs_pt, 0, avail_pts, avail_pts);
		xe_assert(xe, IS_ALIGNED(ccs_it.start, PAGE_SIZE));

		emit_pte(m, bb, src_L0_pt, false, true, &src_it, src_L0, src);

		emit_pte(m, bb, ccs_pt, false, false, &ccs_it, ccs_size, src);

		/* Flush before the copy, then again with the copy's flags */
		bb->len = emit_flush_invalidate(m, bb->cs, bb->len, flush_flags);
		flush_flags = xe_migrate_ccs_copy(m, bb, src_L0_ofs, src_is_pltt,
						  src_L0_ofs, dst_is_pltt,
						  src_L0, ccs_ofs, true);
		bb->len = emit_flush_invalidate(m, bb->cs, bb->len, flush_flags);

		size -= src_L0;
	}

	/* Everything sized in pass 1 must have been emitted in pass 2 */
	xe_assert(xe, (batch_size == bb->len));
	src_bo->bb_ccs[read_write] = bb;

	return 0;
}

static void emit_clear_link_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
				 u32 size, u32 pitch)
{
Loading