Commit ab66c832 authored by Zhigang Luo, committed by Alex Deucher
Browse files

drm/amdgpu: trigger flr_work if reading pf2vf data failed



If reading the pf2vf data fails 30 times in a row, something is wrong,
so trigger flr_work to recover from the issue.

Also use dev_err to print the error messages so that the log shows which
device has the issue, and add a warning message if waiting for
IDH_FLR_NOTIFICATION_CMPL times out.

Signed-off-by: Zhigang Luo <Zhigang.Luo@amd.com>
Acked-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent dc5c3d48
Loading
Loading
Loading
Loading
+10 −5
Original line number Diff line number Diff line
@@ -143,6 +143,8 @@ const char *amdgpu_asic_name[] = {
	"LAST",
};

static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);

/**
 * DOC: pcie_replay_count
 *
@@ -4968,6 +4970,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
retry:
	amdgpu_amdkfd_pre_reset(adev);

	amdgpu_device_stop_pending_resets(adev);

	if (from_hypervisor)
		r = amdgpu_virt_request_full_gpu(adev, true);
	else
@@ -5708,6 +5712,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			tmp_adev->asic_reset_res = r;
		}

		if (!amdgpu_sriov_vf(tmp_adev))
			/*
			* Drop all pending non scheduler resets. Scheduler resets
			* were already dropped during drm_sched_stop
+24 −5
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@

#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_reset.h"
#include "vi.h"
#include "soc15.h"
#include "nv.h"
@@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
		return -EINVAL;

	if (pf2vf_info->size > 1024) {
		DRM_ERROR("invalid pf2vf message size\n");
		dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size);
		return -EINVAL;
	}

@@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
			adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
			adev->virt.fw_reserve.checksum_key, checksum);
		if (checksum != checkval) {
			DRM_ERROR("invalid pf2vf message\n");
			dev_err(adev->dev,
				"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
				checksum, checkval);
			return -EINVAL;
		}

@@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
			adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
			0, checksum);
		if (checksum != checkval) {
			DRM_ERROR("invalid pf2vf message\n");
			dev_err(adev->dev,
				"invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n",
				checksum, checkval);
			return -EINVAL;
		}

@@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
			((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
		break;
	default:
		DRM_ERROR("invalid pf2vf version\n");
		dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);
		return -EINVAL;
	}

@@ -584,8 +589,21 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
	int ret;

	ret = amdgpu_virt_read_pf2vf_data(adev);
	if (ret)
	if (ret) {
		adev->virt.vf2pf_update_retry_cnt++;
		if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) &&
		    amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
			if (amdgpu_reset_domain_schedule(adev->reset_domain,
							  &adev->virt.flr_work))
				return;
			else
				dev_err(adev->dev, "Failed to queue work! at %s", __func__);
		}

		goto out;
	}

	adev->virt.vf2pf_update_retry_cnt = 0;
	amdgpu_virt_write_vf2pf_data(adev);

out:
@@ -606,6 +624,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev)
	adev->virt.fw_reserve.p_pf2vf = NULL;
	adev->virt.fw_reserve.p_vf2pf = NULL;
	adev->virt.vf2pf_update_interval_ms = 0;
	adev->virt.vf2pf_update_retry_cnt = 0;

	if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) {
		DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!");
+3 −0
Original line number Diff line number Diff line
@@ -52,6 +52,8 @@
/* tonga/fiji use this offset */
#define mmBIF_IOV_FUNC_IDENTIFIER 0x1503

#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 30

enum amdgpu_sriov_vf_mode {
	SRIOV_VF_MODE_BARE_METAL = 0,
	SRIOV_VF_MODE_ONE_VF,
@@ -257,6 +259,7 @@ struct amdgpu_virt {
	/* vf2pf message */
	struct delayed_work vf2pf_work;
	uint32_t vf2pf_update_interval_ms;
	int vf2pf_update_retry_cnt;

	/* multimedia bandwidth config */
	bool     is_mm_bw_enabled;
+2 −0
Original line number Diff line number Diff line
@@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
		timeout -= 10;
	} while (timeout > 1);

	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");

flr_done:
	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
	up_write(&adev->reset_domain->sem);
+2 −0
Original line number Diff line number Diff line
@@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
		timeout -= 10;
	} while (timeout > 1);

	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");

flr_done:
	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
	up_write(&adev->reset_domain->sem);