Commit 05d50ea3 authored by Tao Zhou's avatar Tao Zhou Committed by Alex Deucher
Browse files

drm/amdgpu: format old RAS eeprom data into V3 version



Clear old data and save it in V3 format.

v2: only format eeprom data for new ASICs.

Signed-off-by: default avatarTao Zhou <tao.zhou1@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 9deacd6c
Loading
Loading
Loading
Loading
+7 −0
Original line number Diff line number Diff line
@@ -3473,6 +3473,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
				adev, control->bad_channel_bitmap);
			con->update_channel_flag = false;
		}

		/* The format action is only applied to new ASICs */
		if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
		    control->tbl_hdr.version < RAS_TABLE_VER_V3)
			if (!amdgpu_ras_eeprom_reset_table(control))
				if (amdgpu_ras_save_bad_pages(adev, NULL))
					dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
	}

	return ret;
+14 −12
Original line number Diff line number Diff line
@@ -413,9 +413,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control

	switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
	case IP_VERSION(8, 10, 0):
	case IP_VERSION(12, 0, 0):
		hdr->version = RAS_TABLE_VER_V2_1;
		return;
	case IP_VERSION(12, 0, 0):
		hdr->version = RAS_TABLE_VER_V3;
		return;
	default:
		hdr->version = RAS_TABLE_VER_V1;
		return;
@@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
	hdr->header = RAS_TABLE_HDR_VAL;
	amdgpu_ras_set_eeprom_table_version(control);

	if (hdr->version == RAS_TABLE_VER_V2_1) {
	if (hdr->version >= RAS_TABLE_VER_V2_1) {
		hdr->first_rec_offset = RAS_RECORD_START_V2_1;
		hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
				RAS_TABLE_V2_1_INFO_SIZE;
@@ -461,7 +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
	}

	csum = __calc_hdr_byte_sum(control);
	if (hdr->version == RAS_TABLE_VER_V2_1)
	if (hdr->version >= RAS_TABLE_VER_V2_1)
		csum += __calc_ras_info_byte_sum(control);
	csum = -csum;
	hdr->checksum = csum;
@@ -757,7 +759,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
			"Saved bad pages %d reaches threshold value %d\n",
			control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
		control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
		if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
		if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
			control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
			control->tbl_rai.health_percent = 0;
		}
@@ -770,7 +772,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
		amdgpu_dpm_send_rma_reason(adev);
	}

	if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
		control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
					    RAS_TABLE_V2_1_INFO_SIZE +
					    control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -810,7 +812,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
	 * now calculate gpu health percent
	 */
	if (amdgpu_bad_page_threshold != 0 &&
	    control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
	    control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
	    control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
		control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
						   control->ras_num_bad_pages) * 100) /
@@ -823,7 +825,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
		csum += *pp;

	csum += __calc_hdr_byte_sum(control);
	if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
		csum += __calc_ras_info_byte_sum(control);
	/* avoid sign extension when assigning to "checksum" */
	csum = -csum;
@@ -1040,7 +1042,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *co
	/* get available eeprom table version first before eeprom table init */
	amdgpu_ras_set_eeprom_table_version(control);

	if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
		return RAS_MAX_RECORD_COUNT_V2_1;
	else
		return RAS_MAX_RECORD_COUNT;
@@ -1285,7 +1287,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
	int buf_size, res;
	u8  csum, *buf, *pp;

	if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
	if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
		buf_size = RAS_TABLE_HEADER_SIZE +
			   RAS_TABLE_V2_1_INFO_SIZE +
			   control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -1388,7 +1390,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)

	__decode_table_header_from_buf(hdr, buf);

	if (hdr->version == RAS_TABLE_VER_V2_1) {
	if (hdr->version >= RAS_TABLE_VER_V2_1) {
		control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
		control->ras_record_offset = RAS_RECORD_START_V2_1;
		control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
@@ -1428,7 +1430,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
		DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
				 control->ras_num_bad_pages);

		if (hdr->version == RAS_TABLE_VER_V2_1) {
		if (hdr->version >= RAS_TABLE_VER_V2_1) {
			res = __read_table_ras_info(control);
			if (res)
				return res;
@@ -1448,7 +1450,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
					ras->bad_page_cnt_threshold);
	} else if (hdr->header == RAS_TABLE_HDR_BAD &&
		   amdgpu_bad_page_threshold != 0) {
		if (hdr->version == RAS_TABLE_VER_V2_1) {
		if (hdr->version >= RAS_TABLE_VER_V2_1) {
			res = __read_table_ras_info(control);
			if (res)
				return res;
+1 −0
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@

#define RAS_TABLE_VER_V1           0x00010000
#define RAS_TABLE_VER_V2_1         0x00021000
#define RAS_TABLE_VER_V3           0x00030000

struct amdgpu_device;