Commit 78e1222f authored by Jesse.Zhang, committed by Alex Deucher
Browse files

drm/amdgpu/mes: add front end for detect and reset hung queue



Add a helper function to detect and reset hung queues.  MES
returns an array of doorbell indices identifying the queues that
are hung and, optionally, were reset.

v2:  Clear the doorbell array before detection

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 6abd725f
Loading
Loading
Loading
Loading
+65 −0
Original line number Diff line number Diff line
@@ -191,6 +191,20 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
	if (r)
		goto error_doorbell;

	if (adev->mes.hung_queue_db_array_size) {
		r = amdgpu_bo_create_kernel(adev,
					    adev->mes.hung_queue_db_array_size * sizeof(u32),
					    PAGE_SIZE,
					    AMDGPU_GEM_DOMAIN_GTT,
					    &adev->mes.hung_queue_db_array_gpu_obj,
					    &adev->mes.hung_queue_db_array_gpu_addr,
					    &adev->mes.hung_queue_db_array_cpu_addr);
		if (r) {
			dev_warn(adev->dev, "failed to create MES hung db array buffer (%d)", r);
			goto error_doorbell;
		}
	}

	return 0;

error_doorbell:
@@ -216,6 +230,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
{
	int i;

	amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj,
			      &adev->mes.hung_queue_db_array_gpu_addr,
			      &adev->mes.hung_queue_db_array_cpu_addr);

	amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
			      &adev->mes.event_log_gpu_addr,
			      &adev->mes.event_log_cpu_addr);
@@ -366,6 +384,53 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
	return r;
}

int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev)
{
	return adev->mes.hung_queue_db_array_size;
}

/**
 * amdgpu_mes_detect_and_reset_hung_queues - ask MES which queues are hung
 * @adev: amdgpu device
 * @queue_type: AMDGPU_RING_TYPE_GFX, AMDGPU_RING_TYPE_COMPUTE or
 *              AMDGPU_RING_TYPE_SDMA; anything else is rejected
 * @detect_only: forwarded unchanged to the MES backend in the request
 * @hung_db_num: on success, number of valid entries stored in @hung_db_array
 * @hung_db_array: caller buffer with room for at least
 *                 amdgpu_mes_get_hung_queue_db_array_size() entries
 *
 * The shared doorbell array is pre-filled with AMDGPU_MES_INVALID_DB_OFFSET,
 * the backend callback is invoked, and every slot that no longer holds the
 * sentinel is copied, densely packed, to the front of @hung_db_array.
 * @hung_db_num and @hung_db_array are left untouched on failure.
 *
 * Return: 0 on success, -EINVAL for bad arguments, -EOPNOTSUPP when the MES
 * backend does not provide the callback, or the backend's error code.
 */
int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
					    int queue_type,
					    bool detect_only,
					    unsigned int *hung_db_num,
					    u32 *hung_db_array)
{
	struct mes_detect_and_reset_queue_input input = {
		.queue_type = queue_type,
		.detect_only = detect_only,
	};
	u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr;
	int db_array_size = adev->mes.hung_queue_db_array_size;
	unsigned int found = 0;
	int r, i;

	if (!hung_db_num || !hung_db_array)
		return -EINVAL;

	if ((queue_type != AMDGPU_RING_TYPE_GFX) &&
	    (queue_type != AMDGPU_RING_TYPE_COMPUTE) &&
	    (queue_type != AMDGPU_RING_TYPE_SDMA))
		return -EINVAL;

	/* The callback is optional per backend; do not call through NULL. */
	if (!adev->mes.funcs->detect_and_reset_hung_queues)
		return -EOPNOTSUPP;

	/*
	 * Reset every slot to the "no doorbell" sentinel before detection.
	 * The read-back loop below skips AMDGPU_MES_INVALID_DB_OFFSET
	 * (0xffffffff), so the fill byte must be 0xff; clearing to 0 would
	 * make every slot left untouched by the callback look like a hung
	 * doorbell.  The buffer only exists when the size is non-zero.
	 */
	if (db_array && db_array_size > 0)
		memset(db_array, 0xff, db_array_size * sizeof(u32));

	r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
							  &input);
	if (r) {
		dev_err(adev->dev, "failed to detect and reset\n");
		return r;
	}

	/*
	 * Pack the reported doorbells at the front of the caller's buffer so
	 * that the first *hung_db_num entries are exactly the valid ones.
	 */
	for (i = 0; i < db_array_size; i++) {
		if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET)
			hung_db_array[found++] = db_array[i];
	}
	*hung_db_num = found;

	return 0;
}

uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
{
	struct mes_misc_op_input op_input;
+21 −0
Original line number Diff line number Diff line
@@ -41,6 +41,7 @@
#define AMDGPU_MES_API_VERSION_MASK	0x00fff000
#define AMDGPU_MES_FEAT_VERSION_MASK	0xff000000
#define AMDGPU_MES_MSCRATCH_SIZE	0x40000
#define AMDGPU_MES_INVALID_DB_OFFSET	0xffffffff

enum amdgpu_mes_priority_level {
	AMDGPU_MES_PRIORITY_LEVEL_LOW       = 0,
@@ -147,6 +148,10 @@ struct amdgpu_mes {
	uint64_t            resource_1_gpu_addr[AMDGPU_MAX_MES_PIPES];
	void                *resource_1_addr[AMDGPU_MAX_MES_PIPES];

	int				hung_queue_db_array_size;
	struct amdgpu_bo		*hung_queue_db_array_gpu_obj;
	uint64_t			hung_queue_db_array_gpu_addr;
	void				*hung_queue_db_array_cpu_addr;
};

struct amdgpu_mes_gang {
@@ -280,6 +285,11 @@ struct mes_reset_queue_input {
	bool                               is_kq;
};

/*
 * Request passed to amdgpu_mes_funcs.detect_and_reset_hung_queues.
 * Filled in by amdgpu_mes_detect_and_reset_hung_queues().
 */
struct mes_detect_and_reset_queue_input {
	/* Ring type to inspect (an AMDGPU_RING_TYPE_* value). */
	uint32_t                           queue_type;
	/* NOTE(review): presumably true = report hung queues without resetting them; confirm against the MES backend. */
	bool                               detect_only;
};

struct mes_inv_tlbs_pasid_input {
	uint32_t        xcc_id;
	uint16_t        pasid;
@@ -375,6 +385,10 @@ struct amdgpu_mes_funcs {
	int (*reset_hw_queue)(struct amdgpu_mes *mes,
			      struct mes_reset_queue_input *input);

	int (*detect_and_reset_hung_queues)(struct amdgpu_mes *mes,
			      struct mes_detect_and_reset_queue_input *input);


	int (*invalidate_tlbs_pasid)(struct amdgpu_mes *mes,
			      struct mes_inv_tlbs_pasid_input *input);
};
@@ -400,6 +414,13 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
				  unsigned int vmid,
				  bool use_mmio);

/* Number of u32 doorbell slots in the MES hung-queue doorbell array. */
int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev);
/*
 * Ask the MES backend to detect (and optionally reset) hung queues of
 * @queue_type.  On success *hung_db_num is set to the number of doorbells
 * reported and their values are copied into @hung_db_array, which should
 * have room for amdgpu_mes_get_hung_queue_db_array_size() entries.
 * Returns 0 on success or a negative error code.
 */
int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
					    int queue_type,
					    bool detect_only,
					    unsigned int *hung_db_num,
					    u32 *hung_db_array);

uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
int amdgpu_mes_wreg(struct amdgpu_device *adev,
		    uint32_t reg, uint32_t val);