Commit 8cc0f566 authored by Hawking Zhang, committed by Alex Deucher
Browse files

drm/amdgpu: Support multiple error query modes



Direct error query mode and firmware error query mode
are supported for now.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 07c1db70
Loading
Loading
Loading
Loading
+70 −23
Original line number Diff line number Diff line
@@ -1165,31 +1165,26 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
	}
}

/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
				  struct ras_query_if *info)
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
						struct ras_query_if *info,
						struct ras_err_data *err_data,
						unsigned int error_query_mode)
{
	enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
	struct amdgpu_ras_block_object *block_obj = NULL;
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data;
	int ret;

	if (!obj)
	if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
		return -EINVAL;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
		if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
		amdgpu_ras_get_ecc_info(adev, &err_data);
			amdgpu_ras_get_ecc_info(adev, err_data);
		} else {
			block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
			if (!block_obj || !block_obj->hw_ops) {
				dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
					     get_ras_block_str(&info->head));
			ret = -EINVAL;
			goto out_fini_err_data;
				return -EINVAL;
			}

			if (block_obj->hw_ops->query_ras_error_count)
@@ -1202,6 +1197,38 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
					block_obj->hw_ops->query_ras_error_status(adev);
			}
		}
	} else {
		/* FIXME: add code to check return value later */
		amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
		amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
	}

	return 0;
}

/* query/inject/cure begin */
int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
{
	struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
	struct ras_err_data err_data;
	unsigned int error_query_mode;
	int ret;

	if (!obj)
		return -EINVAL;

	ret = amdgpu_ras_error_data_init(&err_data);
	if (ret)
		return ret;

	if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
		return -EINVAL;

	ret = amdgpu_ras_query_error_status_helper(adev, info,
						   &err_data,
						   error_query_mode);
	if (ret)
		goto out_fini_err_data;

	amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);

@@ -3397,6 +3424,26 @@ bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
		return true;
}

/**
 * amdgpu_ras_get_error_query_mode - report which RAS error query mode applies
 * @adev: amdgpu device whose RAS context and MCA callbacks are inspected
 * @error_query_mode: output; always written with one of the
 *                    AMDGPU_RAS_*_ERROR_QUERY values before returning
 *
 * Without a RAS context the output is set to AMDGPU_RAS_INVALID_ERROR_QUERY.
 * Otherwise firmware query is chosen only when the MCA function table
 * provides mca_set_debug_mode and the context is not in MCA debug mode;
 * every other combination yields direct query.
 *
 * Return: false if the device has no RAS context, true otherwise.
 */
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
				     unsigned int *error_query_mode)
{
	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
	const struct amdgpu_mca_smu_funcs *smu_funcs = adev->mca.mca_funcs;
	unsigned int mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;

	if (!ras) {
		*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
		return false;
	}

	/* Direct query is the default; firmware query needs the debug-mode hook. */
	if (smu_funcs && smu_funcs->mca_set_debug_mode &&
	    !ras->is_mca_debug_mode)
		mode = AMDGPU_RAS_FIRMWARE_ERROR_QUERY;

	*error_query_mode = mode;

	return true;
}

/* Register each ip ras block into amdgpu ras */
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
		struct amdgpu_ras_block_object *ras_block_obj)
+8 −0
Original line number Diff line number Diff line
@@ -320,6 +320,12 @@ enum amdgpu_ras_ret {
	AMDGPU_RAS_PT,
};

/*
 * How RAS error information is collected; the value is produced by
 * amdgpu_ras_get_error_query_mode().
 */
enum amdgpu_ras_error_query_mode {
	/* reported when the device has no RAS context; queries fail with -EINVAL */
	AMDGPU_RAS_INVALID_ERROR_QUERY		= 0,
	/* query through amdgpu_ras_get_ecc_info() (UMC) or the block's hw_ops */
	AMDGPU_RAS_DIRECT_ERROR_QUERY		= 1,
	/* query through amdgpu_mca_smu_log_ras_error() for UE and CE types */
	AMDGPU_RAS_FIRMWARE_ERROR_QUERY		= 2,
};

/* ras error status register fields */
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT	0x0
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK	0x00000001L
@@ -769,6 +775,8 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co

void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
				     unsigned int *mode);

int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
				struct amdgpu_ras_block_object *ras_block_obj);