Commit 0071e01c authored by Christian König's avatar Christian König Committed by Alex Deucher
Browse files

drm/amdgpu: fix userq hang detection and reset



Fix lock inversions pointed out by Prike and Sunil. The hang detection
timeout *CAN'T* grab locks under which we wait for fences, especially
not the userq_mutex lock.

Then instead of this completely broken handling with the
hang_detect_fence just cancel the work when fences are processed and
re-start if necessary.

Signed-off-by: default avatarChristian König <christian.koenig@amd.com>
Reviewed-by: default avatarSunil Khatri <sunil.khatri@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 1b62077f045ac6ffde7c97005c6659569ac5c1ec)
parent d0053441
Loading
Loading
Loading
Loading
+26 −39
Original line number Diff line number Diff line
@@ -106,9 +106,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
	int r = 0;
	int i;

	/* Warning if current process mutex is not held */
	WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));

	if (unlikely(adev->debug_disable_gpu_ring_reset)) {
		dev_err(adev->dev, "userq reset disabled by debug mask\n");
		return 0;
@@ -127,9 +124,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
	 */
	for (i = 0; i < num_queue_types; i++) {
		int ring_type = queue_types[i];
		const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
		const struct amdgpu_userq_funcs *funcs =
			adev->userq_funcs[ring_type];

		if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE))
		if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
							  AMDGPU_RESET_TYPE_PER_QUEUE))
				continue;

		if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
@@ -150,38 +149,22 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)

static void amdgpu_userq_hang_detect_work(struct work_struct *work)
{
	struct amdgpu_usermode_queue *queue = container_of(work,
							  struct amdgpu_usermode_queue,
	struct amdgpu_usermode_queue *queue =
		container_of(work, struct amdgpu_usermode_queue,
			     hang_detect_work.work);
	struct dma_fence *fence;
	struct amdgpu_userq_mgr *uq_mgr;

	if (!queue->userq_mgr)
		return;

	uq_mgr = queue->userq_mgr;
	fence = READ_ONCE(queue->hang_detect_fence);
	/* Fence already signaled – no action needed */
	if (!fence || dma_fence_is_signaled(fence))
		return;

	mutex_lock(&uq_mgr->userq_mutex);
	amdgpu_userq_detect_and_reset_queues(uq_mgr);
	mutex_unlock(&uq_mgr->userq_mutex);
	amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
}

/*
 * Start hang detection for a user queue fence. A delayed work will be scheduled
 * to check if the fence is still pending after the timeout period.
 * to reset the queues when the fence doesn't signal in time.
 */
void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
{
	struct amdgpu_device *adev;
	unsigned long timeout_ms;

	if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
		return;

	adev = queue->userq_mgr->adev;
	/* Determine timeout based on queue type */
	switch (queue->queue_type) {
@@ -199,8 +182,6 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
		break;
	}

	/* Store the fence to monitor and schedule hang detection */
	WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
	schedule_delayed_work(&queue->hang_detect_work,
		     msecs_to_jiffies(timeout_ms));
}
@@ -210,18 +191,24 @@ void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)
	struct xarray *xa = &adev->userq_doorbell_xa;
	struct amdgpu_usermode_queue *queue;
	unsigned long flags;
	int r;

	xa_lock_irqsave(xa, flags);
	queue = xa_load(xa, doorbell);
	if (queue)
		amdgpu_userq_fence_driver_process(queue->fence_drv);
	xa_unlock_irqrestore(xa, flags);
}
	if (queue) {
		r = amdgpu_userq_fence_driver_process(queue->fence_drv);
		/*
		 * We are in interrupt context here, this *can't* wait for
		 * reset work to finish.
		 */
		if (r >= 0)
			cancel_delayed_work(&queue->hang_detect_work);

static void amdgpu_userq_init_hang_detect_work(struct amdgpu_usermode_queue *queue)
{
	INIT_DELAYED_WORK(&queue->hang_detect_work, amdgpu_userq_hang_detect_work);
	queue->hang_detect_fence = NULL;
		/* Restart the timer when there are still fences pending */
		if (r == 1)
			amdgpu_userq_start_hang_detect_work(queue);
	}
	xa_unlock_irqrestore(xa, flags);
}

static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
@@ -640,7 +627,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
	amdgpu_bo_unreserve(vm->root.bo);

	mutex_lock(&uq_mgr->userq_mutex);
	queue->hang_detect_fence = NULL;
	amdgpu_userq_wait_for_last_fence(queue);

#if defined(CONFIG_DEBUG_FS)
@@ -847,7 +833,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
	up_read(&adev->reset_domain->sem);

	amdgpu_debugfs_userq_init(filp, queue, qid);
	amdgpu_userq_init_hang_detect_work(queue);
	INIT_DELAYED_WORK(&queue->hang_detect_work,
			  amdgpu_userq_hang_detect_work);

	args->out.queue_id = qid;
	atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
+0 −1
Original line number Diff line number Diff line
@@ -85,7 +85,6 @@ struct amdgpu_usermode_queue {
	int			priority;
	struct dentry		*debugfs_queue;
	struct delayed_work hang_detect_work;
	struct dma_fence *hang_detect_fence;
	struct kref		refcount;

	struct list_head	userq_va_list;
+13 −4
Original line number Diff line number Diff line
@@ -135,7 +135,14 @@ amdgpu_userq_fence_put_fence_drv_array(struct amdgpu_userq_fence *userq_fence)
	userq_fence->fence_drv_array_count = 0;
}

void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv)
/*
 * Returns:
 * -ENOENT when no fences were processes
 * 1 when more fences are pending
 * 0 when no fences are pending any more
 */
int
amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv)
{
	struct amdgpu_userq_fence *userq_fence, *tmp;
	LIST_HEAD(to_be_signaled);
@@ -143,9 +150,6 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
	unsigned long flags;
	u64 rptr;

	if (!fence_drv)
		return;

	spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
	rptr = amdgpu_userq_fence_read(fence_drv);

@@ -158,6 +162,9 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
				&userq_fence->link);
	spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);

	if (list_empty(&to_be_signaled))
		return -ENOENT;

	list_for_each_entry_safe(userq_fence, tmp, &to_be_signaled, link) {
		fence = &userq_fence->base;
		list_del_init(&userq_fence->link);
@@ -169,6 +176,8 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
		dma_fence_put(fence);
	}

	/* That doesn't need to be accurate so no locking */
	return list_empty(&fence_drv->fences) ? 0 : 1;
}

void amdgpu_userq_fence_driver_destroy(struct kref *ref)
+1 −1
Original line number Diff line number Diff line
@@ -63,7 +63,7 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv);
int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
				    struct amdgpu_userq_fence_driver **fence_drv_req);
void amdgpu_userq_fence_driver_free(struct amdgpu_usermode_queue *userq);
void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv);
int amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv);
void amdgpu_userq_fence_driver_force_completion(struct amdgpu_usermode_queue *userq);
void amdgpu_userq_fence_driver_destroy(struct kref *ref);
int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,