Commit 1faeeea0 authored by Matthew Brost
Browse files

drm/xe/vf: Avoid indefinite blocking in preempt rebind worker for VFs supporting migration



Blocking in work queues on a hardware action that may never occur —
especially when it depends on a software fixup also scheduled on a
work queue — is a recipe for deadlock. This situation arises with
the preempt rebind worker and VF post-migration recovery. To prevent
potential deadlocks, avoid indefinite blocking in the preempt rebind
worker for VFs that support migration.

v4:
 - Use dma_fence_wait_timeout (CI)

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Tomasz Lis <tomasz.lis@intel.com>
Link: https://lore.kernel.org/r/20251008214532.3442967-19-matthew.brost@intel.com
parent a4dae94a
Loading
Loading
Loading
Loading
+25 −1
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#include "xe_pt.h"
#include "xe_pxp.h"
#include "xe_res_cursor.h"
#include "xe_sriov_vf.h"
#include "xe_svm.h"
#include "xe_sync.h"
#include "xe_tile.h"
@@ -111,12 +112,22 @@ static int alloc_preempt_fences(struct xe_vm *vm, struct list_head *list,
static int wait_for_existing_preempt_fences(struct xe_vm *vm)
{
	struct xe_exec_queue *q;
	bool vf_migration = IS_SRIOV_VF(vm->xe) &&
		xe_sriov_vf_migration_supported(vm->xe);
	signed long wait_time = vf_migration ? HZ / 5 : MAX_SCHEDULE_TIMEOUT;

	xe_vm_assert_held(vm);

	list_for_each_entry(q, &vm->preempt.exec_queues, lr.link) {
		if (q->lr.pfence) {
			long timeout = dma_fence_wait(q->lr.pfence, false);
			long timeout;

			timeout = dma_fence_wait_timeout(q->lr.pfence, false,
							 wait_time);
			if (!timeout) {
				xe_assert(vm->xe, vf_migration);
				return -EAGAIN;
			}

			/* Only -ETIME on fence indicates VM needs to be killed */
			if (timeout < 0 || q->lr.pfence->error == -ETIME)
@@ -541,6 +552,19 @@ static void preempt_rebind_work_func(struct work_struct *w)
out_unlock_outer:
	if (err == -EAGAIN) {
		trace_xe_vm_rebind_worker_retry(vm);

		/*
		 * We can't block in workers on a VF which supports migration
		 * given this can block the VF post-migration workers from
		 * getting scheduled.
		 */
		if (IS_SRIOV_VF(vm->xe) &&
		    xe_sriov_vf_migration_supported(vm->xe)) {
			up_write(&vm->lock);
			xe_vm_queue_rebind_worker(vm);
			return;
		}

		goto retry;
	}