Commit f4322b9f authored by Yunxiang Li, committed by Alex Deucher
Browse files

drm/amdgpu: Fix two reset triggered in a row



Sometimes a hung GPU causes multiple reset sources to schedule resets.
The second source will be able to trigger an unnecessary reset if it
schedules one after we call amdgpu_device_stop_pending_resets.

Move amdgpu_device_stop_pending_resets to after the reset is done. Since
at this point the GPU is supposedly in a good state, any reset scheduled
after this point would be a legitimate reset.

Remove the unnecessary and incorrect amdgpu_in_reset checks that were
loosely serving this purpose.

Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f5007c67
Loading
Loading
Loading
Loading
+10 −9
Original line number Diff line number Diff line
@@ -5070,8 +5070,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
retry:
	amdgpu_amdkfd_pre_reset(adev);

	amdgpu_device_stop_pending_resets(adev);

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
@@ -5823,13 +5821,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
				  r, adev_to_drm(tmp_adev)->unique);
			tmp_adev->asic_reset_res = r;
		}

		if (!amdgpu_sriov_vf(tmp_adev))
			/*
			* Drop all pending non scheduler resets. Scheduler resets
			* were already dropped during drm_sched_stop
			*/
			amdgpu_device_stop_pending_resets(tmp_adev);
	}

	/* Actual ASIC resets if needed.*/
@@ -5851,6 +5842,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			goto retry;
	}

	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		/*
		 * Drop any pending non scheduler resets queued before reset is done.
		 * Any reset scheduled after this point would be valid. Scheduler resets
		 * were already dropped during drm_sched_stop and no new ones can come
		 * in before drm_sched_start.
		 */
		amdgpu_device_stop_pending_resets(tmp_adev);
	}

skip_hw_reset:

	/* Post ASIC reset for all devs .*/
+1 −1
Original line number Diff line number Diff line
@@ -599,7 +599,7 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
	if (ret) {
		adev->virt.vf2pf_update_retry_cnt++;
		if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
		    amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
		    amdgpu_sriov_runtime(adev)) {
			amdgpu_ras_set_fed(adev, true);
			if (amdgpu_reset_domain_schedule(adev->reset_domain,
							  &adev->virt.flr_work))
+1 −1
Original line number Diff line number Diff line
@@ -319,7 +319,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,

	switch (event) {
		case IDH_FLR_NOTIFICATION:
		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
		if (amdgpu_sriov_runtime(adev))
			WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
								&adev->virt.flr_work),
				  "Failed to queue work! at %s",
+1 −1
Original line number Diff line number Diff line
@@ -358,7 +358,7 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,

	switch (event) {
	case IDH_FLR_NOTIFICATION:
		if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
		if (amdgpu_sriov_runtime(adev))
			WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
				   &adev->virt.flr_work),
				  "Failed to queue work! at %s",
+1 −1
Original line number Diff line number Diff line
@@ -560,7 +560,7 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
		r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION);

		/* only handle FLR_NOTIFY now */
		if (!r && !amdgpu_in_reset(adev))
		if (!r)
			WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
								&adev->virt.flr_work),
				  "Failed to queue work! at %s",