Commit 72289903 authored by Gangliang Xie's avatar Gangliang Xie Committed by Alex Deucher
Browse files

drm/amd/ras: adapt page retirement process for pmfw eeprom



Read bad page data from the PMFW EEPROM when page retirement
is triggered, and use the timestamp read from the EEPROM.

Signed-off-by: Gangliang Xie <ganglxie@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 42c46be2
Loading
Loading
Loading
Loading
+21 −10
Original line number Diff line number Diff line
@@ -234,17 +234,28 @@ static int aca_log_bad_bank(struct ras_core_context *ras_core,
	    bank_ecc->de_count) {
		struct ras_bank_ecc  ras_ecc = {0};

		if (ras_fw_eeprom_supported(ras_core)) {
			ret = ras_fw_eeprom_update_record(ras_core, &ras_ecc);
			if (!ret) {
				ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core);
				ras_ecc.status = bank_ecc->bank_info.status;
				ras_ecc.seq_no = bank->seq_no;
			}
		} else {
			ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core);
			ras_ecc.addr = bank_ecc->bank_info.addr;
			ras_ecc.ipid = bank_ecc->bank_info.ipid;
			ras_ecc.status = bank_ecc->bank_info.status;
			ras_ecc.seq_no = bank->seq_no;
		}

		if (!ret) {
			if (ras_core_gpu_in_reset(ras_core))
				ras_umc_log_bad_bank_pending(ras_core, &ras_ecc);
			else
				ras_umc_log_bad_bank(ras_core, &ras_ecc);
		}
	}

	aca_report_ecc_info(ras_core,
		bank->seq_no, aca_blk->blk_info->ras_block_id, info->socket_id, info->die_id,
+40 −0
Original line number Diff line number Diff line
@@ -24,6 +24,8 @@

#include "ras.h"

#define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */

void ras_fw_init_feature_flags(struct ras_core_context *ras_core)
{
	struct ras_mp1 *mp1 = &ras_core->ras_mp1;
@@ -329,3 +331,41 @@ uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core)

	return ras_core->ras_fw_eeprom.ras_num_recs;
}

/*
 * ras_fw_eeprom_update_record - fetch the newest bad-page record from the
 * firmware-managed EEPROM.
 * @ras_core: RAS core context; its ras_fw_eeprom.ras_num_recs is used as the
 *            previously known record count.
 * @ras_ecc:  destination passed to ras_fw_eeprom_read_idx() for the record.
 *
 * Queries the bad-page count up to 20 times, sleeping via msleep(50) after
 * every query that still reports the previously known count. Once the count
 * has grown, one record is read at index ras_num_recs.
 *
 * Return: 0 on success, the error from ras_fw_get_badpage_count() or
 * ras_fw_eeprom_read_idx() on failure, or -EINVAL when the reported count
 * did not grow past ras_num_recs.
 *
 * NOTE(review): ras_num_recs is not updated here after a successful read;
 * presumably that happens elsewhere -- confirm against the callers.
 */
int ras_fw_eeprom_update_record(struct ras_core_context *ras_core,
				struct ras_bank_ecc *ras_ecc)
{
	struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom;
	u32 fw_rec_count = control->ras_num_recs;
	int attempts_left;
	int ret;

	for (attempts_left = 20; attempts_left > 0; attempts_left--) {
		/* The 1000ms timeout is considered long enough that the
		 * count query will not return -EBUSY before it expires.
		 */
		ret = ras_fw_get_badpage_count(ras_core, &fw_rec_count,
					       RAS_SMU_MESSAGE_TIMEOUT_MS);
		if (ret)
			return ret;

		if (fw_rec_count != control->ras_num_recs)
			break;

		/* PMFW needs some time to update the record number, so the
		 * query may return right away with a stale count. Back off
		 * briefly before asking again.
		 */
		msleep(50);
	}

	/* No new record showed up (or the count went backwards). */
	if (fw_rec_count <= control->ras_num_recs)
		return -EINVAL;

	/* Read exactly one record, starting at the old record count. */
	return ras_fw_eeprom_read_idx(ras_core, 0, ras_ecc,
				      control->ras_num_recs, 1);
}
+2 −0
Original line number Diff line number Diff line
@@ -75,5 +75,7 @@ int ras_fw_eeprom_read_idx(struct ras_core_context *ras_core,
			 struct ras_bank_ecc *ras_ecc,
			 u32 rec_idx, const u32 num);
uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core);
/*
 * Poll the firmware bad-page count (with retries) and, if it exceeds
 * ras_core->ras_fw_eeprom.ras_num_recs, read one record at that index into
 * @ras_ecc. Returns 0 on success, the error from the count query or the
 * record read on failure, or -EINVAL if the count did not increase.
 */
int ras_fw_eeprom_update_record(struct ras_core_context *ras_core,
				struct ras_bank_ecc *ras_ecc);

#endif
+3 −0
Original line number Diff line number Diff line
@@ -373,6 +373,9 @@ static int umc_v12_0_bank_to_eeprom_record(struct ras_core_context *ras_core,
		ACA_ADDR_2_ERR_ADDR(bank->addr), ACA_IPID_2_UMC_INST(bank->ipid),
		&nps_addr, bank->nps, record);

	if (ras_fw_eeprom_supported(ras_core) && bank->ts)
		record->ts = bank->ts;

	lookup_bad_pages_in_a_row(ras_core, record,
		bank->nps, NULL, 0, bank->seq_no, true);