drm/amdgpu: Extend bus status check to more cases

In case of unexpected errors, check whether the device is still alive on the bus.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Lijo Lazar
2025-06-13 16:30:30 +05:30
committed by Alex Deucher
parent 167049012e
commit 04141c05f3
4 changed files with 30 additions and 7 deletions

View File

@@ -1762,4 +1762,19 @@ extern const struct attribute_group amdgpu_flash_attr_group;
void amdgpu_set_init_level(struct amdgpu_device *adev,
enum amdgpu_init_lvl_id lvl);
/**
 * amdgpu_device_bus_status_check - probe whether the device still answers on PCI
 * @adev: amdgpu device whose PCI function (adev->pdev) is probed
 *
 * Reads the PCI_COMMAND register from the device's config space. The device
 * is treated as lost when the config read itself reports failure, or when
 * the value read back is flagged by PCI_POSSIBLE_ERROR(). In that case an
 * error is logged against adev->dev.
 *
 * Return: 0 if the read succeeded with a plausible value, -ENODEV otherwise.
 */
static inline int amdgpu_device_bus_status_check(struct amdgpu_device *adev)
{
	u32 cmd;

	/*
	 * Only inspect the value when the read succeeded; on a failed read
	 * the short-circuit keeps the (possibly unwritten) value untouched.
	 */
	if (!pci_read_config_dword(adev->pdev, PCI_COMMAND, &cmd) &&
	    !PCI_POSSIBLE_ERROR(cmd))
		return 0;

	dev_err(adev->dev, "device lost from bus!");
	return -ENODEV;
}
#endif

View File

@@ -6071,14 +6071,9 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
{
struct amdgpu_device *tmp_adev;
int ret = 0;
u32 status;
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
if (PCI_POSSIBLE_ERROR(status)) {
dev_err(tmp_adev->dev, "device lost from bus!");
ret = -ENODEV;
}
ret |= amdgpu_device_bus_status_check(tmp_adev);
}
return ret;

View File

@@ -353,11 +353,14 @@ static int aqua_vanjaram_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr)
if (adev->nbio.funcs->get_compute_partition_mode) {
mode = adev->nbio.funcs->get_compute_partition_mode(adev);
if (mode != derv_mode)
if (mode != derv_mode) {
dev_warn(
adev->dev,
"Mismatch in compute partition mode - reported : %d derived : %d",
mode, derv_mode);
if (derv_mode == AMDGPU_UNKNOWN_COMPUTE_PARTITION_MODE)
amdgpu_device_bus_status_check(adev);
}
}
return mode;

View File

@@ -86,6 +86,7 @@ static void smu_cmn_read_arg(struct smu_context *smu,
#define SMU_RESP_BUSY_OTHER 0xFC
#define SMU_RESP_DEBUG_END 0xFB
#define SMU_RESP_UNEXP (~0U)
/**
* __smu_cmn_poll_stat -- poll for a status from the SMU
* @smu: a pointer to SMU context
@@ -171,6 +172,15 @@ static void __smu_cmn_reg_print_error(struct smu_context *smu,
dev_err_ratelimited(adev->dev,
"SMU: I'm debugging!");
break;
case SMU_RESP_UNEXP:
if (amdgpu_device_bus_status_check(smu->adev)) {
/* print error immediately if device is off the bus */
dev_err(adev->dev,
"SMU: response:0x%08X for index:%d param:0x%08X message:%s?",
reg_c2pmsg_90, msg_index, param, message);
break;
}
fallthrough;
default:
dev_err_ratelimited(adev->dev,
"SMU: response:0x%08X for index:%d param:0x%08X message:%s?",