Commit fa317985 authored by Lijo Lazar's avatar Lijo Lazar Committed by Alex Deucher
Browse files

drm/amdgpu: Fix map/unmap queue logic



In current logic, it calls ring_alloc followed by a ring_test. ring_test
in turn will call another ring_alloc. This is illegal usage as a
ring_alloc is expected to be closed properly with a ring_commit. Change
to commit the map/unmap queue packet first followed by a ring_test. Add a
comment about the usage of ring_test.

Also, reorder the current pre-condition checks of job hang or kiq ring
scheduler not ready. Without them being met, it is not useful to attempt
ring or memory allocations.

Fixes tag refers to the original patch which introduced this issue which
then got carried over into newer code.

Signed-off-by: default avatarLijo Lazar <lijo.lazar@amd.com>
Reviewed-by: default avatarLe Ma <le.ma@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Fixes: 6c10b5cc ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c")
parent 2bb7dced
Loading
Loading
Loading
Loading
+11 −2
Original line number Diff line number Diff line
@@ -834,6 +834,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
		return -EINVAL;

	if (!kiq_ring->sched.ready || adev->job_hang)
		return 0;

	ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL);
	if (!ring_funcs)
		return -ENOMEM;
@@ -858,7 +861,13 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,

	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0);

	if (kiq_ring->sched.ready && !adev->job_hang)
	/* Submit unmap queue packet */
	amdgpu_ring_commit(kiq_ring);
	/*
	 * Ring test will do a basic scratch register change check. Just run
	 * this to ensure that unmap queues that is submitted before got
	 * processed successfully before returning.
	 */
	r = amdgpu_ring_test_helper(kiq_ring);

	spin_unlock(&kiq->ring_lock);
+45 −18
Original line number Diff line number Diff line
@@ -515,6 +515,17 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
		return -EINVAL;

	if (!kiq_ring->sched.ready || adev->job_hang)
		return 0;
	/**
	 * This is workaround: only skip kiq_ring test
	 * during ras recovery in suspend stage for gfx9.4.3
	 */
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	     amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
	    amdgpu_ras_in_recovery(adev))
		return 0;

	spin_lock(&kiq->ring_lock);
	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
					adev->gfx.num_compute_rings)) {
@@ -528,20 +539,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id)
					   &adev->gfx.compute_ring[j],
					   RESET_QUEUES, 0, 0);
	}

	/**
	 * This is workaround: only skip kiq_ring test
	 * during ras recovery in suspend stage for gfx9.4.3
	/* Submit unmap queue packet */
	amdgpu_ring_commit(kiq_ring);
	/*
	 * Ring test will do a basic scratch register change check. Just run
	 * this to ensure that unmap queues that is submitted before got
	 * processed successfully before returning.
	 */
	if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
	    amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
	    amdgpu_ras_in_recovery(adev)) {
		spin_unlock(&kiq->ring_lock);
		return 0;
	}

	if (kiq_ring->sched.ready && !adev->job_hang)
	r = amdgpu_ring_test_helper(kiq_ring);

	spin_unlock(&kiq->ring_lock);

	return r;
@@ -569,8 +575,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
		return -EINVAL;

	spin_lock(&kiq->ring_lock);
	if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang)
		return 0;

	if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) {
		spin_lock(&kiq->ring_lock);
		if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size *
						adev->gfx.num_gfx_rings)) {
			spin_unlock(&kiq->ring_lock);
@@ -583,11 +592,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id)
						   &adev->gfx.gfx_ring[j],
						   PREEMPT_QUEUES, 0, 0);
		}
	}
		/* Submit unmap queue packet */
		amdgpu_ring_commit(kiq_ring);

	if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang)
		/*
		 * Ring test will do a basic scratch register change check.
		 * Just run this to ensure that unmap queues that is submitted
		 * before got processed successfully before returning.
		 */
		r = amdgpu_ring_test_helper(kiq_ring);
		spin_unlock(&kiq->ring_lock);
	}

	return r;
}
@@ -692,7 +707,13 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
		kiq->pmf->kiq_map_queues(kiq_ring,
					 &adev->gfx.compute_ring[j]);
	}

	/* Submit map queue packet */
	amdgpu_ring_commit(kiq_ring);
	/*
	 * Ring test will do a basic scratch register change check. Just run
	 * this to ensure that map queues that is submitted before got
	 * processed successfully before returning.
	 */
	r = amdgpu_ring_test_helper(kiq_ring);
	spin_unlock(&kiq->ring_lock);
	if (r)
@@ -743,7 +764,13 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id)
						 &adev->gfx.gfx_ring[j]);
		}
	}

	/* Submit map queue packet */
	amdgpu_ring_commit(kiq_ring);
	/*
	 * Ring test will do a basic scratch register change check. Just run
	 * this to ensure that map queues that is submitted before got
	 * processed successfully before returning.
	 */
	r = amdgpu_ring_test_helper(kiq_ring);
	spin_unlock(&kiq->ring_lock);
	if (r)
+7 −0
Original line number Diff line number Diff line
@@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device *adev)
		amdgpu_ring_write(kiq_ring, 0);
		amdgpu_ring_write(kiq_ring, 0);
	}
	/* Submit unmap queue packet */
	amdgpu_ring_commit(kiq_ring);
	/*
	 * Ring test will do a basic scratch register change check. Just run
	 * this to ensure that unmap queues that is submitted before got
	 * processed successfully before returning.
	 */
	r = amdgpu_ring_test_helper(kiq_ring);
	if (r)
		DRM_ERROR("KCQ disable failed\n");