Commit 5e300ed8 authored by Lucas De Marchi
Browse files

drm/xe: Split xe_device_td_flush()



xe_device_td_flush() has 2 possible implementations: an entire L2 flush
or a transient flush, depending on WA 16023588340. Make this clear by
splitting the function so it calls each of them.

Reviewed-by: Matthew Auld <matthew.auld@intel.com>
Link: https://lore.kernel.org/r/20250618-wa-22019338487-v5-3-b888388477f2@intel.com


Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
parent d878c97d
Loading
Loading
Loading
Loading
+40 −28
Original line number Diff line number Diff line
@@ -986,38 +986,15 @@ void xe_device_wmb(struct xe_device *xe)
		xe_mmio_write32(xe_root_tile_mmio(xe), VF_CAP_REG, 0);
}

/**
 * xe_device_td_flush() - Flush transient L3 cache entries
 * @xe: The device
 *
 * Display engine has direct access to memory and is never coherent with L3/L4
 * caches (or CPU caches), however KMD is responsible for specifically flushing
 * transient L3 GPU cache entries prior to the flip sequence to ensure scanout
 * can happen from such a surface without seeing corruption.
 *
 * Display surfaces can be tagged as transient by mapping it using one of the
 * various L3:XD PAT index modes on Xe2.
 *
 * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is flushed
 * at the end of each submission via PIPE_CONTROL for compute/render, since SA
 * Media is not coherent with L3 and we want to support render-vs-media
 * use cases. For other engines like copy/blt the HW internally forces uncached
 * behaviour, hence why we can skip the TDF on such platforms.
/*
 * Issue a TRANSIENT_FLUSH_REQUEST and wait for completion on each gt.
 */
void xe_device_td_flush(struct xe_device *xe)
static void tdf_request_sync(struct xe_device *xe)
{
	struct xe_gt *gt;
	unsigned int fw_ref;
	struct xe_gt *gt;
	u8 id;

	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
		return;

	if (XE_WA(xe_root_mmio_gt(xe), 16023588340)) {
		xe_device_l2_flush(xe);
		return;
	}

	for_each_gt(gt, xe, id) {
		if (xe_gt_is_media_type(gt))
			continue;
@@ -1027,6 +1004,7 @@ void xe_device_td_flush(struct xe_device *xe)
			return;

		xe_mmio_write32(&gt->mmio, XE2_TDF_CTRL, TRANSIENT_FLUSH_REQUEST);

		/*
		 * FIXME: We can likely do better here with our choice of
		 * timeout. Currently we just assume the worst case, i.e. 150us,
@@ -1057,15 +1035,49 @@ void xe_device_l2_flush(struct xe_device *xe)
		return;

	spin_lock(&gt->global_invl_lock);
	xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1);

	xe_mmio_write32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1);
	if (xe_mmio_wait32(&gt->mmio, XE2_GLOBAL_INVAL, 0x1, 0x0, 500, NULL, true))
		xe_gt_err_once(gt, "Global invalidation timeout\n");

	spin_unlock(&gt->global_invl_lock);

	xe_force_wake_put(gt_to_fw(gt), fw_ref);
}

/**
 * xe_device_td_flush() - Flush transient L3 cache entries
 * @xe: The device
 *
 * The display engine reads memory directly and is never coherent with the
 * L3/L4 caches (or the CPU caches). It is therefore up to the KMD to flush
 * transient L3 GPU cache entries before the flip sequence, so that scanout
 * from such a surface does not show corruption.
 *
 * A display surface is tagged as transient by mapping it with one of the
 * L3:XD PAT index modes available on Xe2.
 *
 * Nothing is done unless the device is discrete and has a graphics version of
 * 20 or newer. When WA 16023588340 applies to the GT returned by
 * xe_root_mmio_gt(), a full L2 flush is issued instead of a transient flush
 * request.
 *
 * Note: On non-discrete xe2 platforms, like LNL, the entire L3 cache is
 * flushed at the end of each submission via PIPE_CONTROL for compute/render,
 * since SA Media is not coherent with L3 and we want to support
 * render-vs-media use cases. Other engines like copy/blt have uncached
 * behaviour forced internally by the HW, which is why the TDF can be skipped
 * on such platforms.
 */
void xe_device_td_flush(struct xe_device *xe)
{
	struct xe_gt *wa_gt;

	if (!IS_DGFX(xe) || GRAPHICS_VER(xe) < 20)
		return;

	wa_gt = xe_root_mmio_gt(xe);
	if (!XE_WA(wa_gt, 16023588340)) {
		/* Common case: a transient flush request is enough */
		tdf_request_sync(xe);
		return;
	}

	/* With this WA a transient flush is not sufficient: flush the L2 */
	xe_device_l2_flush(xe);
}

u32 xe_device_ccs_bytes(struct xe_device *xe, u64 size)
{
	return xe_device_has_flat_ccs(xe) ?