Commit d5067034 authored by Philipp Stanner's avatar Philipp Stanner Committed by Danilo Krummrich
Browse files

Revert "drm/nouveau: Remove waitque for sched teardown"

This reverts:

commit bead8800 ("drm/nouveau: Remove waitque for sched teardown")
commit 5f46f5c7 ("drm/nouveau: Add new callback for scheduler teardown")

from the drm/sched teardown leak fix series:

https://lore.kernel.org/dri-devel/20250710125412.128476-2-phasta@kernel.org/



The aforementioned series removed a blocking waitqueue from
nouveau_sched_fini(). It was mistakenly assumed that this waitqueue only
prevents jobs from leaking, which the series fixed.

The waitqueue, however, also guarantees that all VM_BIND related jobs
are finished in order, cleaning up mappings in the GPU's MMU. These jobs
must be executed sequentially. Without the waitqueue, this is no longer
guaranteed, because entity and scheduler teardown can race with each
other.

Revert all patches related to the waitqueue removal.

Fixes: bead8800 ("drm/nouveau: Remove waitque for sched teardown")
Suggested-by: default avatarDanilo Krummrich <dakr@kernel.org>
Signed-off-by: default avatarPhilipp Stanner <phasta@kernel.org>
Link: https://lore.kernel.org/r/20250901083107.10206-2-phasta@kernel.org


Signed-off-by: default avatarDanilo Krummrich <dakr@kernel.org>
parent bdd5a14e
Loading
Loading
Loading
Loading
+0 −15
Original line number Diff line number Diff line
@@ -240,21 +240,6 @@ nouveau_fence_emit(struct nouveau_fence *fence)
	return ret;
}

void
nouveau_fence_cancel(struct nouveau_fence *fence)
{
	struct nouveau_fence_chan *fctx = nouveau_fctx(fence);
	unsigned long flags;

	spin_lock_irqsave(&fctx->lock, flags);
	if (!dma_fence_is_signaled_locked(&fence->base)) {
		dma_fence_set_error(&fence->base, -ECANCELED);
		if (nouveau_fence_signal(fence))
			nvif_event_block(&fctx->event);
	}
	spin_unlock_irqrestore(&fctx->lock, flags);
}

bool
nouveau_fence_done(struct nouveau_fence *fence)
{
+0 −1
Original line number Diff line number Diff line
@@ -29,7 +29,6 @@ void nouveau_fence_unref(struct nouveau_fence **);

int  nouveau_fence_emit(struct nouveau_fence *);
bool nouveau_fence_done(struct nouveau_fence *);
void nouveau_fence_cancel(struct nouveau_fence *fence);
int  nouveau_fence_wait(struct nouveau_fence *, bool lazy, bool intr);
int  nouveau_fence_sync(struct nouveau_bo *, struct nouveau_channel *, bool exclusive, bool intr);

+14 −21
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@
#include "nouveau_exec.h"
#include "nouveau_abi16.h"
#include "nouveau_sched.h"
#include "nouveau_chan.h"

#define NOUVEAU_SCHED_JOB_TIMEOUT_MS		10000

@@ -122,9 +121,11 @@ nouveau_job_done(struct nouveau_job *job)
{
	struct nouveau_sched *sched = job->sched;

	spin_lock(&sched->job_list.lock);
	spin_lock(&sched->job.list.lock);
	list_del(&job->entry);
	spin_unlock(&sched->job_list.lock);
	spin_unlock(&sched->job.list.lock);

	wake_up(&sched->job.wq);
}

void
@@ -305,9 +306,9 @@ nouveau_job_submit(struct nouveau_job *job)
	}

	/* Submit was successful; add the job to the schedulers job list. */
	spin_lock(&sched->job_list.lock);
	list_add(&job->entry, &sched->job_list.head);
	spin_unlock(&sched->job_list.lock);
	spin_lock(&sched->job.list.lock);
	list_add(&job->entry, &sched->job.list.head);
	spin_unlock(&sched->job.list.lock);

	drm_sched_job_arm(&job->base);
	job->done_fence = dma_fence_get(&job->base.s_fence->finished);
@@ -392,23 +393,10 @@ nouveau_sched_free_job(struct drm_sched_job *sched_job)
	nouveau_job_fini(job);
}

static void
nouveau_sched_cancel_job(struct drm_sched_job *sched_job)
{
	struct nouveau_fence *fence;
	struct nouveau_job *job;

	job = to_nouveau_job(sched_job);
	fence = to_nouveau_fence(job->done_fence);

	nouveau_fence_cancel(fence);
}

static const struct drm_sched_backend_ops nouveau_sched_ops = {
	.run_job = nouveau_sched_run_job,
	.timedout_job = nouveau_sched_timedout_job,
	.free_job = nouveau_sched_free_job,
	.cancel_job = nouveau_sched_cancel_job,
};

static int
@@ -458,8 +446,9 @@ nouveau_sched_init(struct nouveau_sched *sched, struct nouveau_drm *drm,
		goto fail_sched;

	mutex_init(&sched->mutex);
	spin_lock_init(&sched->job_list.lock);
	INIT_LIST_HEAD(&sched->job_list.head);
	spin_lock_init(&sched->job.list.lock);
	INIT_LIST_HEAD(&sched->job.list.head);
	init_waitqueue_head(&sched->job.wq);

	return 0;

@@ -493,12 +482,16 @@ nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
	return 0;
}


static void
nouveau_sched_fini(struct nouveau_sched *sched)
{
	struct drm_gpu_scheduler *drm_sched = &sched->base;
	struct drm_sched_entity *entity = &sched->entity;

	rmb(); /* for list_empty to work without lock */
	wait_event(sched->job.wq, list_empty(&sched->job.list.head));

	drm_sched_entity_fini(entity);
	drm_sched_fini(drm_sched);

+6 −3
Original line number Diff line number Diff line
@@ -102,10 +102,13 @@ struct nouveau_sched {
	struct workqueue_struct *wq;
	struct mutex mutex;

	struct {
		struct {
			struct list_head head;
			spinlock_t lock;
	} job_list;
		} list;
		struct wait_queue_head wq;
	} job;
};

int nouveau_sched_create(struct nouveau_sched **psched, struct nouveau_drm *drm,
+4 −4
Original line number Diff line number Diff line
@@ -1019,8 +1019,8 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
	u64 end = addr + range;

again:
	spin_lock(&sched->job_list.lock);
	list_for_each_entry(__job, &sched->job_list.head, entry) {
	spin_lock(&sched->job.list.lock);
	list_for_each_entry(__job, &sched->job.list.head, entry) {
		struct nouveau_uvmm_bind_job *bind_job = to_uvmm_bind_job(__job);

		list_for_each_op(op, &bind_job->ops) {
@@ -1030,7 +1030,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)

				if (!(end <= op_addr || addr >= op_end)) {
					nouveau_uvmm_bind_job_get(bind_job);
					spin_unlock(&sched->job_list.lock);
					spin_unlock(&sched->job.list.lock);
					wait_for_completion(&bind_job->complete);
					nouveau_uvmm_bind_job_put(bind_job);
					goto again;
@@ -1038,7 +1038,7 @@ bind_validate_map_sparse(struct nouveau_job *job, u64 addr, u64 range)
			}
		}
	}
	spin_unlock(&sched->job_list.lock);
	spin_unlock(&sched->job.list.lock);
}

static int