Commit 56631dee authored by YiPeng Chai, committed by Alex Deucher
Browse files

drm/amdgpu: optimize logging deferred error info



1. Use pa_pfn as the radix-tree key index to log
   deferred error info.
2. Use local array to store a row of bad pages.

Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 27cdf8c3
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -476,10 +476,10 @@ struct ras_err_pages {
};

/*
 * One logged UMC ECC error record. Records are inserted into a radix tree
 * by amdgpu_umc_logs_ecc_err() and tagged UMC_ECC_NEW_DETECTED_TAG.
 *
 * NOTE(review): this listing is a rendered diff (+1/-1), so the removed
 * field (hash_index) and the added field (pa_pfn) both appear below;
 * presumably only one of them exists in any real revision -- confirm
 * against the actual header.
 */
struct ras_ecc_err {
	/* Radix-tree index on the pre-change side; built from the row's bad-page PFNs. */
	u64 hash_index;
	/* Copied from the `status` value in umc_v12_0_update_ecc_status(). */
	uint64_t status;
	/* Copied from `ipid`; later decoded with MCA_IPID_2_UMC_CH()/MCA_IPID_2_UMC_INST(). */
	uint64_t ipid;
	/* Copied from `addr`; passed on to amdgpu_umc_fill_error_record(). */
	uint64_t addr;
	/*
	 * Radix-tree index on the post-change side: pa_addr with
	 * UMC_V12_ADDR_MASK_BAD_COLS() applied, shifted right by
	 * AMDGPU_GPU_PAGE_SHIFT (with a C4-bit fallback when that yields 0).
	 */
	uint64_t pa_pfn;
	/* PFN array plus count (see the .pfn/.count uses in umc_v12_0_update_ecc_status()). */
	struct ras_err_pages err_pages;
};

+3 −11
Original line number Diff line number Diff line
@@ -519,18 +519,10 @@ int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
	ecc_log = &con->umc_ecc_log;

	mutex_lock(&ecc_log->lock);
	ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
	if (!ret) {
		struct ras_err_pages *err_pages = &ecc_err->err_pages;
		int i;

		/* Reserve memory */
		for (i = 0; i < err_pages->count; i++)
			amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);

	ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err);
	if (!ret)
		radix_tree_tag_set(ecc_tree,
			ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
	}
			ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
	mutex_unlock(&ecc_log->lock);

	return ret;
+31 −34
Original line number Diff line number Diff line
@@ -524,9 +524,9 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	uint16_t hwid, mcatype;
	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
	uint64_t err_addr, hash_val = 0, pa_addr = 0;
	uint64_t err_addr, pa_addr = 0;
	struct ras_ecc_err *ecc_err;
	int count, ret;
	int count, ret, i;

	hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
	mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
@@ -559,39 +559,18 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
	if (ret)
		return ret;

	memset(page_pfn, 0, sizeof(page_pfn));
	count = umc_v12_0_lookup_bad_pages_in_a_row(adev,
				pa_addr,
				page_pfn, ARRAY_SIZE(page_pfn));
	if (count <= 0) {
		dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
		return 0;
	}

	ret = amdgpu_umc_build_pages_hash(adev,
			page_pfn, count, &hash_val);
	if (ret) {
		dev_err(adev->dev, "Fail to build error pages hash\n");
		return ret;
	}

	ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
	if (!ecc_err)
		return -ENOMEM;

	ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL);
	if (!ecc_err->err_pages.pfn) {
		kfree(ecc_err);
		return -ENOMEM;
	}

	memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn));
	ecc_err->err_pages.count = count;

	ecc_err->hash_index = hash_val;
	ecc_err->status = status;
	ecc_err->ipid = ipid;
	ecc_err->addr = addr;
	ecc_err->pa_pfn = UMC_V12_ADDR_MASK_BAD_COLS(pa_addr) >> AMDGPU_GPU_PAGE_SHIFT;

	/* If converted pa_pfn is 0, use pa C4 pfn. */
	if (!ecc_err->pa_pfn)
		ecc_err->pa_pfn = BIT_ULL(UMC_V12_0_PA_C4_BIT) >> AMDGPU_GPU_PAGE_SHIFT;

	ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
	if (ret) {
@@ -600,13 +579,25 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
		else
			dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);

		kfree(ecc_err->err_pages.pfn);
		kfree(ecc_err);
		return ret;
	}

	con->umc_ecc_log.de_queried_count++;

	memset(page_pfn, 0, sizeof(page_pfn));
	count = umc_v12_0_lookup_bad_pages_in_a_row(adev,
				pa_addr,
				page_pfn, ARRAY_SIZE(page_pfn));
	if (count <= 0) {
		dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
		return 0;
	}

	/* Reserve memory */
	for (i = 0; i < count; i++)
		amdgpu_ras_reserve_page(adev, page_pfn[i]);

	/* The problem case is as follows:
	 * 1. GPU A triggers a gpu ras reset, and GPU A drives
	 *    GPU B to also perform a gpu ras reset.
@@ -631,16 +622,21 @@ static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
				struct ras_ecc_err *ecc_err, void *ras_error_status)
{
	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
	uint32_t i = 0;
	int ret = 0;
	uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
	int ret, i, count;

	if (!err_data || !ecc_err)
		return -EINVAL;

	for (i = 0; i < ecc_err->err_pages.count; i++) {
	memset(page_pfn, 0, sizeof(page_pfn));
	count = umc_v12_0_lookup_bad_pages_in_a_row(adev,
				ecc_err->pa_pfn << AMDGPU_GPU_PAGE_SHIFT,
				page_pfn, ARRAY_SIZE(page_pfn));

	for (i = 0; i < count; i++) {
		ret = amdgpu_umc_fill_error_record(err_data,
				ecc_err->addr,
				ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
				page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
				MCA_IPID_2_UMC_CH(ecc_err->ipid),
				MCA_IPID_2_UMC_INST(ecc_err->ipid));
		if (ret)
@@ -674,7 +670,8 @@ static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
			dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
			break;
		}
		radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
		radix_tree_tag_clear(ecc_tree,
				entries[i]->pa_pfn, UMC_ECC_NEW_DETECTED_TAG);
	}
	mutex_unlock(&con->umc_ecc_log.lock);
}
+5 −0
Original line number Diff line number Diff line
@@ -81,6 +81,11 @@
	(((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
	 (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))

/*
 * Return @addr with four bits cleared: the two bits starting at
 * UMC_V12_0_PA_C2_BIT, the bit at UMC_V12_0_PA_C4_BIT and the bit at
 * UMC_V12_0_PA_R13_BIT. Addresses that differ only in those bits
 * therefore collapse to the same value.
 *
 * NOTE(review): presumably these are the bits that vary between the
 * pages of one bad row -- confirm against
 * umc_v12_0_lookup_bad_pages_in_a_row().
 */
#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \
	((addr) & ~((1ULL << UMC_V12_0_PA_R13_BIT) | \
		    (1ULL << UMC_V12_0_PA_C4_BIT) | \
		    (3ULL << UMC_V12_0_PA_C2_BIT)))

/*
 * Predicates taking the device and a 64-bit UMC status value
 * (mc_umc_status) and returning bool.
 * NOTE(review): judging by the names, each presumably reports whether the
 * status describes a deferred, uncorrectable or correctable error
 * respectively; the definitions are not visible here -- confirm.
 */
bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);