Commit 38b20968 authored by Alex Deucher
Browse files

drm/amdgpu: move scheduler wqueue handling into callbacks



Move the scheduler wqueue stopping and starting into
the ring reset callbacks.  On some IPs we have to reset
an engine which may have multiple queues.  Move the wqueue
handling into the backend so we can handle them as needed
based on the type of reset available.

Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 43ca5eb9
Loading
Loading
Loading
Loading
+0 −8
Original line number Diff line number Diff line
@@ -135,17 +135,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
	} else if (amdgpu_gpu_recovery && ring->funcs->reset) {
		dev_err(adev->dev, "Starting %s ring reset\n",
			s_job->sched->name);

		/*
		 * Stop the scheduler to prevent anybody else from touching the
		 * ring buffer.
		 */
		drm_sched_wqueue_stop(&ring->sched);

		r = amdgpu_ring_reset(ring, job->vmid, NULL);
		if (!r) {
			atomic_inc(&ring->adev->gpu_reset_counter);
			drm_sched_wqueue_start(&ring->sched);
			dev_err(adev->dev, "Ring %s reset succeeded\n",
				ring->sched.name);
			drm_dev_wedged_event(adev_to_drm(adev),
+4 −13
Original line number Diff line number Diff line
@@ -554,22 +554,16 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
	struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
	struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
	struct amdgpu_ring *page_ring = &sdma_instance->page;
	bool gfx_sched_stopped = false, page_sched_stopped = false;

	mutex_lock(&sdma_instance->engine_reset_mutex);
	/* Stop the scheduler's work queue for the GFX and page rings if they are running.
	* This ensures that no new tasks are submitted to the queues while
	* the reset is in progress.
	*/
	if (!amdgpu_ring_sched_ready(gfx_ring)) {
	drm_sched_wqueue_stop(&gfx_ring->sched);
		gfx_sched_stopped = true;
	}

	if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
	if (adev->sdma.has_page_queue)
		drm_sched_wqueue_stop(&page_ring->sched);
		page_sched_stopped = true;
	}

	if (sdma_instance->funcs->stop_kernel_queue) {
		sdma_instance->funcs->stop_kernel_queue(gfx_ring);
@@ -596,13 +590,10 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
	 * to be submitted to the queues after the reset is complete.
	 */
	if (!ret) {
		if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) {
		drm_sched_wqueue_start(&gfx_ring->sched);
		}
		if (page_sched_stopped && amdgpu_ring_sched_ready(page_ring)) {
		if (adev->sdma.has_page_queue)
			drm_sched_wqueue_start(&page_ring->sched);
	}
	}
	mutex_unlock(&sdma_instance->engine_reset_mutex);

	return ret;
+6 −0
Original line number Diff line number Diff line
@@ -9540,6 +9540,8 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring,
	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
		return -EINVAL;

	drm_sched_wqueue_stop(&ring->sched);

	spin_lock_irqsave(&kiq->ring_lock, flags);

	if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + kiq->pmf->map_queues_size)) {
@@ -9581,6 +9583,7 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring,
	if (r)
		return r;
	amdgpu_fence_driver_force_completion(ring);
	drm_sched_wqueue_start(&ring->sched);
	return 0;
}

@@ -9600,6 +9603,8 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
		return -EINVAL;

	drm_sched_wqueue_stop(&ring->sched);

	spin_lock_irqsave(&kiq->ring_lock, flags);

	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
@@ -9658,6 +9663,7 @@ static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
	if (r)
		return r;
	amdgpu_fence_driver_force_completion(ring);
	drm_sched_wqueue_start(&ring->sched);
	return 0;
}

+6 −0
Original line number Diff line number Diff line
@@ -6821,6 +6821,8 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring,
	if (amdgpu_sriov_vf(adev))
		return -EINVAL;

	drm_sched_wqueue_stop(&ring->sched);

	r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, false);
	if (r) {

@@ -6846,6 +6848,7 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring,
	if (r)
		return r;
	amdgpu_fence_driver_force_completion(ring);
	drm_sched_wqueue_start(&ring->sched);
	return 0;
}

@@ -6989,6 +6992,8 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
	if (amdgpu_sriov_vf(adev))
		return -EINVAL;

	drm_sched_wqueue_stop(&ring->sched);

	r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true);
	if (r) {
		dev_warn(adev->dev, "fail(%d) to reset kcq and try pipe reset\n", r);
@@ -7012,6 +7017,7 @@ static int gfx_v11_0_reset_kcq(struct amdgpu_ring *ring,
	if (r)
		return r;
	amdgpu_fence_driver_force_completion(ring);
	drm_sched_wqueue_start(&ring->sched);
	return 0;
}

+6 −0
Original line number Diff line number Diff line
@@ -5317,6 +5317,8 @@ static int gfx_v12_0_reset_kgq(struct amdgpu_ring *ring,
	if (amdgpu_sriov_vf(adev))
		return -EINVAL;

	drm_sched_wqueue_stop(&ring->sched);

	r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, false);
	if (r) {
		dev_warn(adev->dev, "reset via MES failed and try pipe reset %d\n", r);
@@ -5341,6 +5343,7 @@ static int gfx_v12_0_reset_kgq(struct amdgpu_ring *ring,
	if (r)
		return r;
	amdgpu_fence_driver_force_completion(ring);
	drm_sched_wqueue_start(&ring->sched);
	return 0;
}

@@ -5437,6 +5440,8 @@ static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring,
	if (amdgpu_sriov_vf(adev))
		return -EINVAL;

	drm_sched_wqueue_stop(&ring->sched);

	r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, true);
	if (r) {
		dev_warn(adev->dev, "fail(%d) to reset kcq  and try pipe reset\n", r);
@@ -5460,6 +5465,7 @@ static int gfx_v12_0_reset_kcq(struct amdgpu_ring *ring,
	if (r)
		return r;
	amdgpu_fence_driver_force_completion(ring);
	drm_sched_wqueue_start(&ring->sched);
	return 0;
}

Loading