Commit 314c38cd authored by YiPeng Chai's avatar YiPeng Chai Committed by Alex Deucher
Browse files

drm/amdgpu: retire bad pages for umc v12_0



Retire bad pages for umc v12_0.

Signed-off-by: default avatarYiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: default avatarTao Zhou <tao.zhou1@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent e74313be
Loading
Loading
Loading
Loading
+57 −2
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@
#include "umc/umc_12_0_0_sh_mask.h"
#include "mp/mp_13_0_6_sh_mask.h"

#define MAX_ECC_NUM_PER_RETIREMENT  32

static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
					    uint32_t node_inst,
					    uint32_t umc_inst,
@@ -374,6 +376,7 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
	return 0;
}

#ifdef TO_BE_REMOVED
static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
					void *ras_error_status)
{
@@ -442,6 +445,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
		}
	}
}
#endif

static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
			enum amdgpu_mca_error_type type, void *ras_error_status)
@@ -633,6 +637,58 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
	return 0;
}

static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
				struct ras_ecc_err *ecc_err, void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	uint32_t i = 0;
	int ret = 0;

	if (!err_data || !ecc_err)
		return -EINVAL;

	for (i = 0; i < ecc_err->err_pages.count; i++) {
		ret = amdgpu_umc_fill_error_record(err_data,
				ecc_err->addr,
				ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
				MCA_IPID_2_UMC_CH(ecc_err->ipid),
				MCA_IPID_2_UMC_INST(ecc_err->ipid));
		if (ret)
			break;
	}

	err_data->de_count++;

	return ret;
}

static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
	struct radix_tree_root *ecc_tree;
	int new_detected, ret, i;

	ecc_tree = &con->umc_ecc_log.de_page_tree;

	mutex_lock(&con->umc_ecc_log.lock);
	new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
			0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
	for (i = 0; i < new_detected; i++) {
		if (!entries[i])
			continue;

		ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
		if (ret) {
			dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
			break;
		}
		radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
	}
	mutex_unlock(&con->umc_ecc_log.lock);
}

struct amdgpu_umc_ras umc_v12_0_ras = {
	.ras_block = {
		.hw_ops = &umc_v12_0_ras_hw_ops,
@@ -640,8 +696,7 @@ struct amdgpu_umc_ras umc_v12_0_ras = {
	},
	.err_cnt_init = umc_v12_0_err_cnt_init,
	.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
	.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
	.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
	.ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
	.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
	.update_ecc_status = umc_v12_0_update_ecc_status,
};