drm/amdgpu: process RAS fatal error MB notification

For RAS error scenario, VF guest driver will check mailbox
and set fed flag to avoid unnecessary HW accesses.
additionally, poll for reset completion message first
to avoid accidentally spamming multiple reset requests to host.

v2: add another mailbox check for handling case where kfd detects
timeout first

v3: set host_flr bit and use wait_for_reset

Signed-off-by: Vignesh Chander <Vignesh.Chander@amd.com>
Reviewed-by: Zhigang Luo <Zhigang.Luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Vignesh Chander
2024-06-24 16:44:26 -05:00
committed by Alex Deucher
parent 78146c1dcd
commit cbda2758d8
7 changed files with 55 additions and 8 deletions

View File

@@ -449,6 +449,13 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev,
}
}
static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev)
{
enum idh_event msg = xgpu_nv_mailbox_peek_msg(adev);
return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF);
}
const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
.req_full_gpu = xgpu_nv_request_full_gpu_access,
.rel_full_gpu = xgpu_nv_release_full_gpu_access,
@@ -458,4 +465,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = {
.wait_reset = xgpu_nv_wait_reset,
.trans_msg = xgpu_nv_mailbox_trans_msg,
.ras_poison_handler = xgpu_nv_ras_poison_handler,
.rcvd_ras_intr = xgpu_nv_rcvd_ras_intr,
};