Commit b17f8732 authored by Lijo Lazar, committed by Alex Deucher
Browse files

drm/amdgpu: Add helper to initialize badpage info



Add a separate function to read bad page data during initialization.
Reading bad pages requires hardware access and cannot be done during
reset. Hence, in cases where the device needs a full reset during
init itself, attempting to read them will cause a deadlock.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 0ee23991
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -2953,7 +2953,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
	 * Note: theoretically, this should be called before all VRAM allocations
	 * to keep retired pages from being used
	 */
	r = amdgpu_ras_recovery_init(adev);
	r = amdgpu_ras_recovery_init(adev, true);
	if (r)
		goto init_failed;

+38 −18
Original line number Diff line number Diff line
@@ -3146,7 +3146,42 @@ static int amdgpu_ras_page_retirement_thread(void *param)
	return 0;
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
/**
 * amdgpu_ras_init_badpage_info - initialize RAS EEPROM state and bad page data
 * @adev: amdgpu device
 *
 * Initializes the RAS EEPROM control block and, when it reports stored
 * records, loads the bad pages and forwards the record count (and, if
 * pending, the bad channel bitmap) through the amdgpu_dpm_send_hbm_*()
 * helpers.
 *
 * Nothing is done when there is no RAS context or when running as an
 * SR-IOV VF.
 *
 * Return: 0 on success or when there is nothing to do, -EHWPOISON when the
 * device is flagged as RMA, or the error returned by EEPROM init or by the
 * bad page load.
 */
int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	int err;

	if (!con)
		return 0;

	if (amdgpu_sriov_vf(adev))
		return 0;

	err = amdgpu_ras_eeprom_init(&con->eeprom_control);
	if (err)
		return err;

	/* An RMA'd device is not usable; report it as poisoned hardware. */
	if (amdgpu_ras_is_rma(adev))
		return -EHWPOISON;

	/* No records stored in the EEPROM: nothing to load or report. */
	if (!con->eeprom_control.ras_num_recs)
		return 0;

	err = amdgpu_ras_load_bad_pages(adev);
	if (err)
		return err;

	amdgpu_dpm_send_hbm_bad_pages_num(adev,
					  con->eeprom_control.ras_num_recs);

	/* Send the bad channel bitmap only once per pending update. */
	if (con->update_channel_flag == true) {
		amdgpu_dpm_send_hbm_bad_channel_flag(adev,
				con->eeprom_control.bad_channel_bitmap);
		con->update_channel_flag = false;
	}

	return 0;
}

int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_err_handler_data **data;
@@ -3187,25 +3222,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
	 */
	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
		return 0;
	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
	/*
	 * This calling fails when is_rma is true or
	 * ret != 0.
	 */
	if (amdgpu_ras_is_rma(adev) || ret)
		goto free;

	if (con->eeprom_control.ras_num_recs) {
		ret = amdgpu_ras_load_bad_pages(adev);
	if (init_bp_info) {
		ret = amdgpu_ras_init_badpage_info(adev);
		if (ret)
			goto free;

		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);

		if (con->update_channel_flag == true) {
			amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
			con->update_channel_flag = false;
		}
	}

	mutex_init(&con->page_rsv_lock);
+2 −2
Original line number Diff line number Diff line
@@ -736,8 +736,8 @@ struct amdgpu_ras_block_hw_ops {
 * 8: feature disable
 */


int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
/*
 * Initialize the RAS EEPROM control and, if it holds records, load the bad
 * pages and report them via the amdgpu_dpm_send_hbm_*() helpers. Returns 0
 * on success (or when there is no RAS context / on an SR-IOV VF),
 * -EHWPOISON when the device is flagged RMA, or a propagated error code.
 */
int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev);
/*
 * RAS recovery initialization. When @init_bp_info is true, this also calls
 * amdgpu_ras_init_badpage_info() and treats a non-zero result as failure;
 * pass false to skip the bad page read.
 */
int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info);

void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);