Commit d0053441 authored by Christian König, committed by Alex Deucher
Browse files

drm/amdgpu: remove almost all calls to amdgpu_userq_detect_and_reset_queues



Well the reset handling seems broken on multiple levels.

As a first step of fixing this, remove most calls to the hang detection.
That function should only be called after we run into a timeout, and *NOT*
as a random check spread over the code in multiple places.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
(cherry picked from commit 71bea36b54ccfb14cbc90f94267af6369af4e702)
parent 44e5bc73
Loading
Loading
Loading
Loading
+14 −24
Original line number Diff line number Diff line
@@ -345,23 +345,18 @@ static int amdgpu_userq_preempt_helper(struct amdgpu_usermode_queue *queue)
	struct amdgpu_device *adev = uq_mgr->adev;
	const struct amdgpu_userq_funcs *userq_funcs =
		adev->userq_funcs[queue->queue_type];
	bool found_hung_queue = false;
	int r = 0;
	int r;

	if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
		r = userq_funcs->preempt(queue);
		if (r) {
			queue->state = AMDGPU_USERQ_STATE_HUNG;
			found_hung_queue = true;
			return r;
		} else {
			queue->state = AMDGPU_USERQ_STATE_PREEMPTED;
		}
	}

	if (found_hung_queue)
		amdgpu_userq_detect_and_reset_queues(uq_mgr);

	return r;
	return 0;
}

static int amdgpu_userq_restore_helper(struct amdgpu_usermode_queue *queue)
@@ -390,24 +385,21 @@ static int amdgpu_userq_unmap_helper(struct amdgpu_usermode_queue *queue)
	struct amdgpu_device *adev = uq_mgr->adev;
	const struct amdgpu_userq_funcs *userq_funcs =
		adev->userq_funcs[queue->queue_type];
	bool found_hung_queue = false;
	int r = 0;
	int r;

	if ((queue->state == AMDGPU_USERQ_STATE_MAPPED) ||
	    (queue->state == AMDGPU_USERQ_STATE_PREEMPTED)) {

		r = userq_funcs->unmap(queue);
		if (r) {
			queue->state = AMDGPU_USERQ_STATE_HUNG;
			found_hung_queue = true;
			return r;
		} else {
			queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
		}
	}

	if (found_hung_queue)
		amdgpu_userq_detect_and_reset_queues(uq_mgr);

	return r;
	return 0;
}

static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
@@ -416,19 +408,19 @@ static int amdgpu_userq_map_helper(struct amdgpu_usermode_queue *queue)
	struct amdgpu_device *adev = uq_mgr->adev;
	const struct amdgpu_userq_funcs *userq_funcs =
		adev->userq_funcs[queue->queue_type];
	int r = 0;
	int r;

	if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
		r = userq_funcs->map(queue);
		if (r) {
			queue->state = AMDGPU_USERQ_STATE_HUNG;
			amdgpu_userq_detect_and_reset_queues(uq_mgr);
			return r;
		} else {
			queue->state = AMDGPU_USERQ_STATE_MAPPED;
		}
	}

	return r;
	return 0;
}

static void amdgpu_userq_wait_for_last_fence(struct amdgpu_usermode_queue *queue)
@@ -654,7 +646,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
#if defined(CONFIG_DEBUG_FS)
	debugfs_remove_recursive(queue->debugfs_queue);
#endif
	amdgpu_userq_detect_and_reset_queues(uq_mgr);
	r = amdgpu_userq_unmap_helper(queue);
	atomic_dec(&uq_mgr->userq_count[queue->queue_type]);
	amdgpu_userq_cleanup(queue);
@@ -1264,7 +1255,6 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
	unsigned long queue_id;
	int ret = 0, r;

	amdgpu_userq_detect_and_reset_queues(uq_mgr);
	/* Try to unmap all the queues in this process ctx */
	xa_for_each(&uq_mgr->userq_xa, queue_id, queue) {
		r = amdgpu_userq_preempt_helper(queue);
@@ -1272,9 +1262,11 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
			ret = r;
	}

	if (ret)
	if (ret) {
		drm_file_err(uq_mgr->file,
			     "Couldn't unmap all the queues, eviction failed ret=%d\n", ret);
		amdgpu_userq_detect_and_reset_queues(uq_mgr);
	}
	return ret;
}

@@ -1374,7 +1366,6 @@ int amdgpu_userq_suspend(struct amdgpu_device *adev)
		uqm = queue->userq_mgr;
		cancel_delayed_work_sync(&uqm->resume_work);
		guard(mutex)(&uqm->userq_mutex);
		amdgpu_userq_detect_and_reset_queues(uqm);
		if (adev->in_s0ix)
			r = amdgpu_userq_preempt_helper(queue);
		else
@@ -1433,7 +1424,6 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
		if (((queue->queue_type == AMDGPU_HW_IP_GFX) ||
		     (queue->queue_type == AMDGPU_HW_IP_COMPUTE)) &&
		    (queue->xcp_id == idx)) {
			amdgpu_userq_detect_and_reset_queues(uqm);
			r = amdgpu_userq_preempt_helper(queue);
			if (r)
				ret = r;