Commit cfce8f4f authored by ganglxie, committed by Alex Deucher
Browse files

drm/amdgpu: refine ras error injection when eeprom initialization failed



When EEPROM initialization fails, we still support RAS error injection
and reserve bad pages, but we do not save the bad pages to the EEPROM.

Signed-off-by: ganglxie <ganglxie@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 0b7f1355
Loading
Loading
Loading
Loading
+16 −6
Original line number Diff line number Diff line
@@ -3006,6 +3006,15 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
		return 0;
	}

	if (!con->eeprom_control.is_eeprom_valid) {
		dev_warn(adev->dev,
			"Failed to save EEPROM table data because of EEPROM data corruption!");
		if (new_cnt)
			*new_cnt = 0;

		return 0;
	}

	mutex_lock(&con->recovery_lock);
	control = &con->eeprom_control;
	data = con->eh_data;
@@ -3491,8 +3500,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)

	control = &con->eeprom_control;
	ret = amdgpu_ras_eeprom_init(control);
	if (ret)
		return ret;
	control->is_eeprom_valid = !ret;

	if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
		control->ras_num_pa_recs = control->ras_num_recs;
@@ -3501,10 +3509,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
	    adev->umc.ras->get_retire_flip_bits)
		adev->umc.ras->get_retire_flip_bits(adev);

	if (control->ras_num_recs) {
	if (control->ras_num_recs && control->is_eeprom_valid) {
		ret = amdgpu_ras_load_bad_pages(adev);
		if (ret)
			return ret;
		if (ret) {
			control->is_eeprom_valid = false;
			return 0;
		}

		amdgpu_dpm_send_hbm_bad_pages_num(
			adev, control->ras_num_bad_pages);
@@ -3523,7 +3533,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
					dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
	}

	return ret;
	return 0;
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
+2 −0
Original line number Diff line number Diff line
@@ -114,6 +114,8 @@ struct amdgpu_ras_eeprom_control {
	/* Record channel info which occurred bad pages
	 */
	u32 bad_channel_bitmap;

	bool is_eeprom_valid;
};

/*