Commit 7e1fe102 authored by Matthew Brost
Browse files

drm/xe/guc: Track pending-enable source in submission state



Add explicit tracking in the GuC submission state to record the source
of a pending enable (TDR vs. queue resume path vs. submission).
Disambiguating the origin lets the GuC submission state machine apply
the correct recovery/replay behavior.

This helps VF restore: when the device comes back, the state machine knows
whether the pending enable stems from timeout recovery, from a queue resume
sequence, or from submission, and can gate sequencing and fixups accordingly.

v4:
 - Clarify commit message (Tomasz)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-4-matthew.brost@intel.com
parent 26cd498e
Loading
Loading
Loading
Loading
+36 −0
Original line number Diff line number Diff line
@@ -69,6 +69,8 @@ exec_queue_to_guc(struct xe_exec_queue *q)
#define EXEC_QUEUE_STATE_BANNED			(1 << 9)
#define EXEC_QUEUE_STATE_CHECK_TIMEOUT		(1 << 10)
#define EXEC_QUEUE_STATE_EXTRA_REF		(1 << 11)
/*
 * Source of a pending scheduling enable. PENDING_RESUME is set by the resume
 * message handler and PENDING_TDR_EXIT by the timed-out-job handler, each
 * right before enable_scheduling(); handle_sched_done() clears both.
 */
#define EXEC_QUEUE_STATE_PENDING_RESUME		(1 << 12)
#define EXEC_QUEUE_STATE_PENDING_TDR_EXIT	(1 << 13)

static bool exec_queue_registered(struct xe_exec_queue *q)
{
@@ -220,6 +222,36 @@ static void set_exec_queue_extra_ref(struct xe_exec_queue *q)
	atomic_or(EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
}

/*
 * Test whether EXEC_QUEUE_STATE_PENDING_RESUME is set in the queue's GuC
 * state word.
 *
 * Returns true if the bit is set, false otherwise. Marked __maybe_unused
 * because no caller is visible alongside this definition.
 */
static bool __maybe_unused exec_queue_pending_resume(struct xe_exec_queue *q)
{
	int state = atomic_read(&q->guc->state);

	return state & EXEC_QUEUE_STATE_PENDING_RESUME;
}

/*
 * Flag the queue as having a scheduling enable in flight that was requested
 * by the resume message handler. The bit is OR-ed into q->guc->state; all
 * other state bits are left untouched.
 */
static void set_exec_queue_pending_resume(struct xe_exec_queue *q)
{
	const int pending_resume = EXEC_QUEUE_STATE_PENDING_RESUME;

	atomic_or(pending_resume, &q->guc->state);
}

/*
 * Drop EXEC_QUEUE_STATE_PENDING_RESUME from q->guc->state, leaving every
 * other state bit as it was. Harmless if the bit is already clear.
 */
static void clear_exec_queue_pending_resume(struct xe_exec_queue *q)
{
	const int keep_mask = ~EXEC_QUEUE_STATE_PENDING_RESUME;

	atomic_and(keep_mask, &q->guc->state);
}

/*
 * Test whether EXEC_QUEUE_STATE_PENDING_TDR_EXIT is set in the queue's GuC
 * state word.
 *
 * Returns true if the bit is set, false otherwise. Marked __maybe_unused
 * because no caller is visible alongside this definition.
 */
static bool __maybe_unused exec_queue_pending_tdr_exit(struct xe_exec_queue *q)
{
	int state = atomic_read(&q->guc->state);

	return state & EXEC_QUEUE_STATE_PENDING_TDR_EXIT;
}

/*
 * Flag the queue as having a scheduling enable in flight that was requested
 * by the timed-out-job handler on its sched_enable path. The bit is OR-ed
 * into q->guc->state; all other state bits are left untouched.
 */
static void set_exec_queue_pending_tdr_exit(struct xe_exec_queue *q)
{
	const int pending_tdr_exit = EXEC_QUEUE_STATE_PENDING_TDR_EXIT;

	atomic_or(pending_tdr_exit, &q->guc->state);
}

/*
 * Drop EXEC_QUEUE_STATE_PENDING_TDR_EXIT from q->guc->state, leaving every
 * other state bit as it was. Harmless if the bit is already clear.
 */
static void clear_exec_queue_pending_tdr_exit(struct xe_exec_queue *q)
{
	const int keep_mask = ~EXEC_QUEUE_STATE_PENDING_TDR_EXIT;

	atomic_and(keep_mask, &q->guc->state);
}

static bool exec_queue_killed_or_banned_or_wedged(struct xe_exec_queue *q)
{
	return (atomic_read(&q->guc->state) &
@@ -1334,6 +1366,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
	return DRM_GPU_SCHED_STAT_RESET;

sched_enable:
	set_exec_queue_pending_tdr_exit(q);
	enable_scheduling(q);
rearm:
	/*
@@ -1493,6 +1526,7 @@ static void __guc_exec_queue_process_msg_resume(struct xe_sched_msg *msg)
		clear_exec_queue_suspended(q);
		if (!exec_queue_enabled(q)) {
			q->guc->resume_time = RESUME_PENDING;
			set_exec_queue_pending_resume(q);
			enable_scheduling(q);
		}
	} else {
@@ -2065,6 +2099,8 @@ static void handle_sched_done(struct xe_guc *guc, struct xe_exec_queue *q,
		xe_gt_assert(guc_to_gt(guc), exec_queue_pending_enable(q));

		q->guc->resume_time = ktime_get();
		clear_exec_queue_pending_resume(q);
		clear_exec_queue_pending_tdr_exit(q);
		clear_exec_queue_pending_enable(q);
		smp_wmb();
		wake_up_all(&guc->ct.wq);