Commit 3bae7916 authored by Alex Deucher
Browse files

drm/amdgpu/sdma: guilty tracking is per instance



The gfx and page queues are per instance, so track them
per instance.

v2: drop extra parameter (Lijo)

Fixes: fdbfaaaa ("drm/amdgpu: Improve SDMA reset logic with guilty queue tracking")
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e02fcf73
Loading
Loading
Loading
Loading
+4 −3
Original line number Diff line number Diff line
@@ -65,6 +65,10 @@ struct amdgpu_sdma_instance {
	uint64_t		sdma_fw_gpu_addr;
	uint32_t		*sdma_fw_ptr;
	struct mutex		engine_reset_mutex;
	/* track guilty state of GFX and PAGE queues */
	bool			gfx_guilty;
	bool			page_guilty;

};

enum amdgpu_sdma_ras_memory_id {
@@ -127,9 +131,6 @@ struct amdgpu_sdma {
	uint32_t		*ip_dump;
	uint32_t 		supported_reset;
	struct list_head	reset_callback_list;
	/* track guilty state of GFX and PAGE queues */
	bool gfx_guilty;
	bool page_guilty;
};

/*
+14 −13
Original line number Diff line number Diff line
@@ -672,12 +672,11 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
 * @adev: amdgpu_device pointer
 * @i: instance to resume
 * @restore: used to restore wptr when restart
 * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
 *
 * Set up the gfx DMA ring buffers and enable them.
 * Returns 0 for success, error for failure.
 */
static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
{
	struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
	u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -714,7 +713,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
	/* For the guilty queue, set RPTR to the current wptr to skip bad commands,
	 * It is not a guilty queue, restore cache_rptr and continue execution.
	 */
	if (guilty)
	if (adev->sdma.instance[i].gfx_guilty)
		rwptr = ring->wptr;
	else
		rwptr = ring->cached_rptr;
@@ -779,12 +778,11 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
 * @adev: amdgpu_device pointer
 * @i: instance to resume
 * @restore: boolean to say restore needed or not
 * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
 *
 * Set up the page DMA ring buffers and enable them.
 * Returns 0 for success, error for failure.
 */
static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
{
	struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
	u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -803,7 +801,7 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
	/* For the guilty queue, set RPTR to the current wptr to skip bad commands,
	 * It is not a guilty queue, restore cache_rptr and continue execution.
	 */
	if (guilty)
	if (adev->sdma.instance[i].page_guilty)
		rwptr = ring->wptr;
	else
		rwptr = ring->cached_rptr;
@@ -989,9 +987,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
		uint32_t temp;

		WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
		sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
		sdma_v4_4_2_gfx_resume(adev, i, restore);
		if (adev->sdma.has_page_queue)
			sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
			sdma_v4_4_2_page_resume(adev, i, restore);

		/* set utc l1 enable flag always to 1 */
		temp = RREG32_SDMA(i, regSDMA_CNTL);
@@ -1446,6 +1444,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)

	for (i = 0; i < adev->sdma.num_instances; i++) {
		mutex_init(&adev->sdma.instance[i].engine_reset_mutex);
		/* Initialize guilty flags for GFX and PAGE queues */
		adev->sdma.instance[i].gfx_guilty = false;
		adev->sdma.instance[i].page_guilty = false;

		ring = &adev->sdma.instance[i].ring;
		ring->ring_obj = NULL;
		ring->use_doorbell = true;
@@ -1507,9 +1509,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
	r = amdgpu_sdma_sysfs_reset_mask_init(adev);
	if (r)
		return r;
	/* Initialize guilty flags for GFX and PAGE queues */
	adev->sdma.gfx_guilty = false;
	adev->sdma.page_guilty = false;

	return r;
}
@@ -1689,9 +1688,11 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_
		return -EINVAL;

	/* Check if this queue is the guilty one */
	adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
	adev->sdma.instance[instance_id].gfx_guilty =
		sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
	if (adev->sdma.has_page_queue)
		adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
		adev->sdma.instance[instance_id].page_guilty =
			sdma_v4_4_2_is_queue_selected(adev, instance_id, true);

	/* Cache the rptr before reset, after the reset,
	* all of the registers will be reset to 0