Commit 1e12dbae authored by Matthew Auld
Browse files

drm/xe/migrate: support MEM_COPY instruction



Make this the default on xe2+ when doing a copy. This has a few
advantages over the existing copy instruction:

1) It has a special PAGE_COPY mode that claims to be optimised for
   page-in/page-out, which is the vast majority of current users.

2) It also has a simple BYTE_COPY mode that supports byte granularity
   copying without any restrictions.

With 2) we can now easily skip the bounce buffer flow when copying
buffers with strange sizing/alignment, like for memory_access. But that
is left for the next patch.

v2 (Matt Brost):
  - Use device info to check whether device should use the MEM_COPY
    path. This should fit better with making this a configfs tunable.
  - And with that also keep old path still functional on xe2 for possible
    experimentation.
  - Add a define for PAGE_COPY page-size.
v3 (Matt Brost):
  - Fall back to an actual linear copy for pitch=1.
  - Also update NVL.

BSpec: 57561
Signed-off-by: Matthew Auld <matthew.auld@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Link: https://lore.kernel.org/r/20251022163836.191405-7-matthew.auld@intel.com
parent 0171dcce
Loading
Loading
Loading
Loading
+6 −0
Original line number Diff line number Diff line
@@ -31,6 +31,12 @@
#define   XY_FAST_COPY_BLT_D1_DST_TILE4	REG_BIT(30)
#define   XE2_XY_FAST_COPY_BLT_MOCS_INDEX_MASK	GENMASK(23, 20)

/*
 * MEM_COPY command encoding. MEM_COPY_PAGE_COPY_MODE and MEM_COPY_MATRIX_COPY
 * are flag bits OR'ed into the same dword as MEM_COPY_CMD. The two MOCS index
 * masks describe fields of a separate dword (their bit ranges overlap the
 * opcode bits, so they cannot live in the command dword).
 */
#define MEM_COPY_CMD (2 << 29 | 0x5a << 22 | 0x8)
#define   MEM_COPY_PAGE_COPY_MODE REG_BIT(19)
#define   MEM_COPY_MATRIX_COPY REG_BIT(17)
#define   MEM_COPY_SRC_MOCS_INDEX_MASK	GENMASK(31, 28)
#define   MEM_COPY_DST_MOCS_INDEX_MASK	GENMASK(6, 3)

#define	PVC_MEM_SET_CMD		(2 << 29 | 0x5b << 22)
#define   PVC_MEM_SET_CMD_LEN_DW	7
#define   PVC_MEM_SET_MATRIX		REG_BIT(17)
+2 −0
Original line number Diff line number Diff line
@@ -300,6 +300,8 @@ struct xe_device {
		 * pcode mailbox commands.
		 */
		u8 has_mbx_power_limits:1;
		/** @info.has_mem_copy_instr: Device supports MEM_COPY instruction */
		u8 has_mem_copy_instr:1;
		/** @info.has_pxp: Device has PXP support */
		u8 has_pxp:1;
		/** @info.has_range_tlb_inval: Has range based TLB invalidations */
+58 −3
Original line number Diff line number Diff line
@@ -699,8 +699,8 @@ static void emit_copy_ccs(struct xe_gt *gt, struct xe_bb *bb,
}

#define EMIT_COPY_DW 10
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
static void emit_xy_fast_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
			      u64 dst_ofs, unsigned int size,
			      unsigned int pitch)
{
	struct xe_device *xe = gt_to_xe(gt);
@@ -730,6 +730,61 @@ static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
}

#define PAGE_COPY_MODE_PS SZ_256 /* hw uses 256 bytes as the page-size */
/*
 * Append a ten-dword MEM_COPY command to @bb, copying @size bytes from
 * @src_ofs to @dst_ofs.
 *
 * One of three flavours is picked:
 *  - PAGE_COPY, linear: @size and the low 32 bits of both offsets are all
 *    multiples of PAGE_COPY_MODE_PS; the width is expressed in such pages.
 *  - BYTE_COPY, matrix: @pitch > 1; the width is @pitch bytes and the
 *    height is @size / @pitch.
 *  - BYTE_COPY, linear: anything else; the width is @size bytes.
 *
 * The caller must pass a non-zero @size that is aligned to a non-zero @pitch,
 * with @pitch no larger than U16_MAX; the resulting width (and, for the
 * matrix flavour, the height) must also fit in U16_MAX.
 */
static void emit_mem_copy(struct xe_gt *gt, struct xe_bb *bb, u64 src_ofs,
			  u64 dst_ofs, unsigned int size, unsigned int pitch)
{
	/* Any low bit set in size or either offset rules out PAGE_COPY. */
	u32 align_bits = size | lower_32_bits(src_ofs) | lower_32_bits(dst_ofs);
	u32 flags = 0;		/* BYTE_COPY + linear copy unless overridden */
	u32 width = size;
	u32 mocs;

	xe_gt_assert(gt, IS_ALIGNED(size, pitch));
	xe_gt_assert(gt, pitch <= U16_MAX);
	xe_gt_assert(gt, pitch);
	xe_gt_assert(gt, size);

	if (IS_ALIGNED(align_bits, PAGE_COPY_MODE_PS)) {
		/* Page copy stays linear; width counts hw pages. */
		flags = MEM_COPY_PAGE_COPY_MODE;
		width = size / PAGE_COPY_MODE_PS;
	} else if (pitch > 1) {
		/* Byte-granular matrix copy: rows of @pitch bytes. */
		xe_gt_assert(gt, size / pitch <= U16_MAX);
		flags = MEM_COPY_MATRIX_COPY;
		width = pitch;
	}

	xe_gt_assert(gt, width <= U16_MAX);

	/* Both source and destination use the uncached MOCS index. */
	mocs = FIELD_PREP(MEM_COPY_SRC_MOCS_INDEX_MASK, gt->mocs.uc_index) |
	       FIELD_PREP(MEM_COPY_DST_MOCS_INDEX_MASK, gt->mocs.uc_index);

	bb->cs[bb->len++] = MEM_COPY_CMD | flags;
	bb->cs[bb->len++] = width - 1;
	bb->cs[bb->len++] = size / pitch - 1; /* ignored by hw for page-copy/linear above */
	/* NOTE(review): presumably source pitch then destination pitch - confirm against BSpec */
	bb->cs[bb->len++] = pitch - 1;
	bb->cs[bb->len++] = pitch - 1;
	bb->cs[bb->len++] = lower_32_bits(src_ofs);
	bb->cs[bb->len++] = upper_32_bits(src_ofs);
	bb->cs[bb->len++] = lower_32_bits(dst_ofs);
	bb->cs[bb->len++] = upper_32_bits(dst_ofs);
	bb->cs[bb->len++] = mocs;
}

/*
 * Emit a copy of @size bytes from @src_ofs to @dst_ofs into @bb, using the
 * MEM_COPY instruction when the device info advertises it and the
 * XY_FAST_COPY path otherwise.
 */
static void emit_copy(struct xe_gt *gt, struct xe_bb *bb,
		      u64 src_ofs, u64 dst_ofs, unsigned int size,
		      unsigned int pitch)
{
	if (!gt_to_xe(gt)->info.has_mem_copy_instr) {
		emit_xy_fast_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
		return;
	}

	emit_mem_copy(gt, bb, src_ofs, dst_ofs, size, pitch);
}

static u64 xe_migrate_batch_base(struct xe_migrate *m, bool usm)
{
	return usm ? m->usm_batch_base_ofs : m->batch_base_ofs;
+5 −0
Original line number Diff line number Diff line
@@ -342,6 +342,7 @@ static const struct xe_device_desc lnl_desc = {
	.has_display = true,
	.has_flat_ccs = 1,
	.has_pxp = true,
	.has_mem_copy_instr = true,
	.max_gt_per_tile = 2,
	.needs_scratch = true,
	.va_bits = 48,
@@ -362,6 +363,7 @@ static const struct xe_device_desc bmg_desc = {
	.has_heci_cscfi = 1,
	.has_late_bind = true,
	.has_sriov = true,
	.has_mem_copy_instr = true,
	.max_gt_per_tile = 2,
	.needs_scratch = true,
	.subplatforms = (const struct xe_subplatform_desc[]) {
@@ -378,6 +380,7 @@ static const struct xe_device_desc ptl_desc = {
	.has_display = true,
	.has_flat_ccs = 1,
	.has_sriov = true,
	.has_mem_copy_instr = true,
	.max_gt_per_tile = 2,
	.needs_scratch = true,
	.needs_shared_vf_gt_wq = true,
@@ -390,6 +393,7 @@ static const struct xe_device_desc nvls_desc = {
	.dma_mask_size = 46,
	.has_display = true,
	.has_flat_ccs = 1,
	.has_mem_copy_instr = true,
	.max_gt_per_tile = 2,
	.require_force_probe = true,
	.va_bits = 48,
@@ -655,6 +659,7 @@ static int xe_info_init_early(struct xe_device *xe,
	xe->info.has_pxp = desc->has_pxp;
	xe->info.has_sriov = xe_configfs_primary_gt_allowed(to_pci_dev(xe->drm.dev)) &&
		desc->has_sriov;
	xe->info.has_mem_copy_instr = desc->has_mem_copy_instr;
	xe->info.skip_guc_pc = desc->skip_guc_pc;
	xe->info.skip_mtcfg = desc->skip_mtcfg;
	xe->info.skip_pcode = desc->skip_pcode;
+1 −0
Original line number Diff line number Diff line
@@ -46,6 +46,7 @@ struct xe_device_desc {
	u8 has_late_bind:1;
	u8 has_llc:1;
	u8 has_mbx_power_limits:1;
	u8 has_mem_copy_instr:1;
	u8 has_pxp:1;
	u8 has_sriov:1;
	u8 needs_scratch:1;