Commit 0ef930e1 authored by Jonathan Kim, committed by Alex Deucher
Browse files

drm/amdgpu: fix hung reset queue array memory allocation



By design the MES will return a result array whose length is twice the
number of hung doorbells it can report.

i.e. if up to k reported doorbells are supported, then the
second half of the array, also of length k, holds the HQD information
(type/queue/pipe) where queue 1 corresponds to index 0 and k,
queue 2 corresponds to index 1 and k + 1 etc ...

The driver will use the HQD info to target queue/pipe reset for
hardware scheduled user compute queues.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 8745ca5e
Loading
Loading
Loading
Loading
+6 −1
Original line number Diff line number Diff line
@@ -420,12 +420,17 @@ int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
		dev_err(adev->dev, "failed to detect and reset\n");
	} else {
		*hung_db_num = 0;
		for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
		for (i = 0; i < adev->mes.hung_queue_hqd_info_offset; i++) {
			if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
				hung_db_array[i] = db_array[i];
				*hung_db_num += 1;
			}
		}

		/*
		 * TODO: return HQD info for MES scheduled user compute queue reset cases
		 * stored in hung_db_array hqd info offset to full array size
		 */
	}

	return r;
+1 −0
Original line number Diff line number Diff line
@@ -149,6 +149,7 @@ struct amdgpu_mes {
	void                *resource_1_addr[AMDGPU_MAX_MES_PIPES];

	int				hung_queue_db_array_size;
	int				hung_queue_hqd_info_offset;
	struct amdgpu_bo		*hung_queue_db_array_gpu_obj;
	uint64_t			hung_queue_db_array_gpu_addr;
	void				*hung_queue_db_array_cpu_addr;
+3 −3
Original line number Diff line number Diff line
@@ -208,10 +208,10 @@ static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
	struct amdgpu_userq_mgr *uqm, *tmp;
	unsigned int hung_db_num = 0;
	int queue_id, r, i;
	u32 db_array[4];
	u32 db_array[8];

	if (db_array_size > 4) {
		dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
	if (db_array_size > 8) {
		dev_err(adev->dev, "DB array size (%d vs 8) too small\n",
			db_array_size);
		return -EINVAL;
	}
+5 −3
Original line number Diff line number Diff line
@@ -66,7 +66,8 @@ static int mes_v11_0_kiq_hw_fini(struct amdgpu_device *adev);
#define GFX_MES_DRAM_SIZE	0x80000
#define MES11_HW_RESOURCE_1_SIZE (128 * AMDGPU_GPU_PAGE_SIZE)

#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 4
#define MES11_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset, [4:7] = hqd info */
#define MES11_HUNG_HQD_INFO_OFFSET	4

static void mes_v11_0_ring_set_wptr(struct amdgpu_ring *ring)
{
@@ -1720,8 +1721,9 @@ static int mes_v11_0_early_init(struct amdgpu_ip_block *ip_block)
	struct amdgpu_device *adev = ip_block->adev;
	int pipe, r;

	adev->mes.hung_queue_db_array_size =
		MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
	adev->mes.hung_queue_db_array_size = MES11_HUNG_DB_OFFSET_ARRAY_SIZE;
	adev->mes.hung_queue_hqd_info_offset = MES11_HUNG_HQD_INFO_OFFSET;

	for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
		if (!adev->enable_mes_kiq && pipe == AMDGPU_MES_KIQ_PIPE)
			continue;
+5 −3
Original line number Diff line number Diff line
@@ -47,7 +47,8 @@ static int mes_v12_0_kiq_hw_fini(struct amdgpu_device *adev);

#define MES_EOP_SIZE   2048

#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 4
#define MES12_HUNG_DB_OFFSET_ARRAY_SIZE 8 /* [0:3] = db offset [4:7] hqd info */
#define MES12_HUNG_HQD_INFO_OFFSET	4

static void mes_v12_0_ring_set_wptr(struct amdgpu_ring *ring)
{
@@ -1904,8 +1905,9 @@ static int mes_v12_0_early_init(struct amdgpu_ip_block *ip_block)
	struct amdgpu_device *adev = ip_block->adev;
	int pipe, r;

	adev->mes.hung_queue_db_array_size =
		MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
	adev->mes.hung_queue_db_array_size = MES12_HUNG_DB_OFFSET_ARRAY_SIZE;
	adev->mes.hung_queue_hqd_info_offset = MES12_HUNG_HQD_INFO_OFFSET;

	for (pipe = 0; pipe < AMDGPU_MAX_MES_PIPES; pipe++) {
		r = amdgpu_mes_init_microcode(adev, pipe);
		if (r)