Commit 4c276870 authored by Satyanarayana K V P, committed by Michal Wajdeczko
Browse files

drm/xe/vf: Add debugfs entries to test VF double migration



VF migration sends a marker to the GuC before resource fixups begin,
and repeats the marker with the RESFIX_DONE notification. This prevents
the GuC from submitting jobs during double migration events.

To reliably test double migration, a second migration must be triggered
while fixups from the first migration are still in progress. Since fixups
complete quickly, reproducing this scenario is difficult. Introduce
debugfs controls to add delays during post-migration recovery, creating a
deterministic window for subsequent migrations.

New debugfs entries:
	/sys/kernel/debug/dri/BDF/
	├── tile0
	│   ├─gt0
	│   │ ├──vf
	│   │ │  ├── resfix_stoppers

resfix_stoppers: Predefined checkpoints that allow the migration process
to pause at specific stages. The stages are given below.

VF_MIGRATION_WAIT_RESFIX_START		- BIT(0)
VF_MIGRATION_WAIT_FIXUPS		- BIT(1)
VF_MIGRATION_WAIT_RESTART_JOBS		- BIT(2)
VF_MIGRATION_WAIT_RESFIX_DONE		- BIT(3)

While a checkpoint's bit is set, recovery pauses at that stage, sleeping in
1-second iterations until the corresponding bit is cleared.

Signed-off-by: Satyanarayana K V P <satyanarayana.k.v.p@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Tomasz Lis <tomasz.lis@intel.com>
Acked-by: Adam Miszczak <adam.miszczak@linux.intel.com>
Reviewed-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Link: https://patch.msgid.link/20251201095011.21453-10-satyanarayana.k.v.p@intel.com
parent 75e7d262
Loading
Loading
Loading
Loading
+40 −0
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@

#include <linux/bitfield.h>
#include <linux/bsearch.h>
#include <linux/delay.h>

#include <drm/drm_managed.h>
#include <drm/drm_print.h>
@@ -41,6 +42,37 @@

#define make_u64_from_u32(hi, lo) ((u64)((u64)(u32)(hi) << 32 | (u32)(lo)))

#ifdef CONFIG_DRM_XE_DEBUG
/*
 * Checkpoints in the post-migration recovery flow where a wait can be
 * injected. Each value is one bit that is tested against the
 * resfix_stoppers mask.
 */
enum VF_MIGRATION_WAIT_POINTS {
	/* Checked in vf_resfix_start(), before the resfix start marker is sent. */
	VF_MIGRATION_WAIT_RESFIX_START	= BIT(0),
	/* Checked at the start of vf_post_migration_fixups(). */
	VF_MIGRATION_WAIT_FIXUPS	= BIT(1),
	/* Checked at the start of vf_post_migration_rearm(). */
	VF_MIGRATION_WAIT_RESTART_JOBS	= BIT(2),
	/* Checked at the start of vf_post_migration_resfix_done(). */
	VF_MIGRATION_WAIT_RESFIX_DONE	= BIT(3),
};

#define VF_MIGRATION_WAIT_DELAY_IN_MS	1000
/*
 * Block the caller for as long as any bit of @wait is set in the
 * resfix_stoppers mask of @gt, sleeping VF_MIGRATION_WAIT_DELAY_IN_MS
 * between checks and logging a debug message before every sleep.
 * Returns immediately when none of the @wait bits are set.
 */
static void vf_post_migration_inject_wait(struct xe_gt *gt,
					  enum VF_MIGRATION_WAIT_POINTS wait)
{
	for (;;) {
		/* The mask is re-read on every pass so a cleared bit ends the wait. */
		if (!(gt->sriov.vf.migration.debug.resfix_stoppers & wait))
			return;

		xe_gt_dbg(gt,
			  "*TESTING* injecting %u ms delay due to resfix_stoppers=%#x, to continue clear %#x\n",
			  VF_MIGRATION_WAIT_DELAY_IN_MS,
			  gt->sriov.vf.migration.debug.resfix_stoppers, wait);

		msleep(VF_MIGRATION_WAIT_DELAY_IN_MS);
	}
}

/*
 * Wait at checkpoint _POS, where _POS is the suffix of one of the
 * VF_MIGRATION_WAIT_* enumerators. The gt argument is evaluated once and
 * type-checked against the helper's prototype.
 */
#define VF_MIGRATION_INJECT_WAIT(gt, _POS)					\
	vf_post_migration_inject_wait((gt), VF_MIGRATION_WAIT_##_POS)

#else
/* Without CONFIG_DRM_XE_DEBUG no wait is injected; the gt argument is only passed to typecheck(). */
#define VF_MIGRATION_INJECT_WAIT(_gt, ...)	typecheck(struct xe_gt *, (_gt))
#endif

static int guc_action_vf_reset(struct xe_guc *guc)
{
	u32 request[GUC_HXG_REQUEST_MSG_MIN_LEN] = {
@@ -320,6 +352,8 @@ static int vf_resfix_start(struct xe_gt *gt, u16 marker)

	xe_gt_assert(gt, IS_SRIOV_VF(gt_to_xe(gt)));

	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_START);

	xe_gt_sriov_dbg_verbose(gt, "Sending resfix start marker %u\n", marker);

	return guc_action_vf_resfix_start(guc, marker);
@@ -1158,6 +1192,8 @@ static int vf_post_migration_fixups(struct xe_gt *gt)
	void *buf = gt->sriov.vf.migration.scratch;
	int err;

	VF_MIGRATION_INJECT_WAIT(gt, FIXUPS);

	/* xe_gt_sriov_vf_query_config will fixup the GGTT addresses */
	err = xe_gt_sriov_vf_query_config(gt);
	if (err)
@@ -1176,6 +1212,8 @@ static int vf_post_migration_fixups(struct xe_gt *gt)

static void vf_post_migration_rearm(struct xe_gt *gt)
{
	VF_MIGRATION_INJECT_WAIT(gt, RESTART_JOBS);

	/*
	 * Make sure interrupts on the new HW are properly set. The GuC IRQ
	 * must be working at this point, since the recovery did started,
@@ -1206,6 +1244,8 @@ static void vf_post_migration_abort(struct xe_gt *gt)

static int vf_post_migration_resfix_done(struct xe_gt *gt, u16 marker)
{
	VF_MIGRATION_INJECT_WAIT(gt, RESFIX_DONE);

	spin_lock_irq(&gt->sriov.vf.migration.lock);
	if (gt->sriov.vf.migration.recovery_queued)
		xe_gt_sriov_dbg(gt, "another recovery imminent\n");
+12 −0
Original line number Diff line number Diff line
@@ -69,4 +69,16 @@ void xe_gt_sriov_vf_debugfs_register(struct xe_gt *gt, struct dentry *root)
	vfdentry->d_inode->i_private = gt;

	drm_debugfs_create_files(vf_info, ARRAY_SIZE(vf_info), vfdentry, minor);

	/*
	 *      /sys/kernel/debug/dri/BDF/
	 *      ├── tile0
	 *          ├── gt0
	 *              ├── vf
	 *                  ├── resfix_stoppers
	 */
	if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
		debugfs_create_x8("resfix_stoppers", 0600, vfdentry,
				  &gt->sriov.vf.migration.debug.resfix_stoppers);
	}
}
+8 −0
Original line number Diff line number Diff line
@@ -52,6 +52,14 @@ struct xe_gt_sriov_vf_migration {
	wait_queue_head_t wq;
	/** @scratch: Scratch memory for VF recovery */
	void *scratch;
	/** @debug: Debug hooks for delaying migration */
	struct {
		/**
		 * @debug.resfix_stoppers: Stop and wait at different stages
		 * during post migration recovery
		 */
		u8 resfix_stoppers;
	} debug;
	/**
	 * @resfix_marker: Marker sent on start and on end of post-migration
	 * steps.