Commit bf8bbaef authored by Philipp Stanner
Browse files

drm/sched: Avoid memory leaks with cancel_job() callback



Since its inception, the GPU scheduler can leak memory if the driver
calls drm_sched_fini() while there are still jobs in flight.

The simplest way to solve this in a backwards compatible manner is by
adding a new callback, drm_sched_backend_ops.cancel_job(), which
instructs the driver to signal the hardware fence associated with the
job. Afterwards, the scheduler can safely use the established free_job()
callback for freeing the job.

Implement the new backend_ops callback cancel_job().

Suggested-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Link: https://lore.kernel.org/dri-devel/20250418113211.69956-1-tvrtko.ursulin@igalia.com/


Reviewed-by: Maíra Canal <mcanal@igalia.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Signed-off-by: Philipp Stanner <phasta@kernel.org>
Link: https://lore.kernel.org/r/20250710125412.128476-4-phasta@kernel.org
parent fe69a391
Loading
Loading
Loading
Loading
+21 −13
Original line number Diff line number Diff line
@@ -1352,6 +1352,18 @@ int drm_sched_init(struct drm_gpu_scheduler *sched, const struct drm_sched_init_
}
EXPORT_SYMBOL(drm_sched_init);

/*
 * Cancel and free every job still queued on @sched's pending list.
 *
 * The list is walked from its tail towards its head. For each job the
 * driver's cancel_job() callback is invoked first, then the job is unlinked
 * from the pending list, and finally the driver's free_job() callback is
 * invoked to release it.
 *
 * This helper does not check sched->ops->cancel_job for NULL itself;
 * drm_sched_fini() performs that check before calling it.
 */
static void drm_sched_cancel_remaining_jobs(struct drm_gpu_scheduler *sched)
{
	struct drm_sched_job *cur, *prev;

	/* No lock taken: every other accessor has been stopped by now. */
	list_for_each_entry_safe_reverse(cur, prev, &sched->pending_list, list) {
		/* Let the driver signal the job's hardware fence. */
		sched->ops->cancel_job(cur);

		/* Unlink the job before the driver frees it. */
		list_del(&cur->list);
		sched->ops->free_job(cur);
	}
}

/**
 * drm_sched_fini - Destroy a gpu scheduler
 *
@@ -1359,19 +1371,11 @@ EXPORT_SYMBOL(drm_sched_init);
 *
 * Tears down and cleans up the scheduler.
 *
 * This stops submission of new jobs to the hardware through
 * drm_sched_backend_ops.run_job(). Consequently, drm_sched_backend_ops.free_job()
 * will not be called for all jobs still in drm_gpu_scheduler.pending_list.
 * There is no solution for this currently. Thus, it is up to the driver to make
 * sure that:
 *
 *  a) drm_sched_fini() is only called after for all submitted jobs
 *     drm_sched_backend_ops.free_job() has been called or that
 *  b) the jobs for which drm_sched_backend_ops.free_job() has not been called
 *     after drm_sched_fini() ran are freed manually.
 *
 * FIXME: Take care of the above problem and prevent this function from leaking
 * the jobs in drm_gpu_scheduler.pending_list under any circumstances.
 * This stops submission of new jobs to the hardware through &struct
 * drm_sched_backend_ops.run_job. If &struct drm_sched_backend_ops.cancel_job
 * is implemented, all jobs will be canceled through it and afterwards cleaned
 * up through &struct drm_sched_backend_ops.free_job. If cancel_job is not
 * implemented, memory could leak.
 */
void drm_sched_fini(struct drm_gpu_scheduler *sched)
{
@@ -1401,6 +1405,10 @@ void drm_sched_fini(struct drm_gpu_scheduler *sched)
	/* Confirm no work left behind accessing device structures */
	cancel_delayed_work_sync(&sched->work_tdr);

	/* Avoid memory leaks if supported by the driver. */
	if (sched->ops->cancel_job)
		drm_sched_cancel_remaining_jobs(sched);

	if (sched->own_submit_wq)
		destroy_workqueue(sched->submit_wq);
	sched->ready = false;
+18 −0
Original line number Diff line number Diff line
@@ -512,6 +512,24 @@ struct drm_sched_backend_ops {
         * and it's time to clean it up.
	 */
	void (*free_job)(struct drm_sched_job *sched_job);

	/**
	 * @cancel_job: Used by the scheduler to guarantee remaining jobs' fences
	 * get signaled in drm_sched_fini().
	 *
	 * Used by the scheduler to cancel all jobs that have not been executed
	 * with &struct drm_sched_backend_ops.run_job by the time
	 * drm_sched_fini() gets invoked.
	 *
	 * Drivers need to signal the passed job's hardware fence with an
	 * appropriate error code (e.g., -ECANCELED) in this callback. They
	 * must not free the job.
	 *
	 * The scheduler will only call this callback once it has stopped
	 * calling all other callbacks forever, with the exception of &struct
	 * drm_sched_backend_ops.free_job.
	 */
	void (*cancel_job)(struct drm_sched_job *sched_job);
};

/**