drm/amdgpu: make reset method configurable for RAS poison

Each RAS block has different requirement for gpu reset in poison
consumption handling.
Add support for mmhub RAS poison consumption handling.

v2: remove the mmhub poison support for kfd int v10.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Tao Zhou
2024-03-12 11:30:09 +08:00
committed by Alex Deucher
parent e3d4de8d8b
commit 2fc46e0b2f
8 changed files with 41 additions and 29 deletions

View File

@@ -145,6 +145,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
{
enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
if (!p)
@@ -164,6 +165,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
if (ret)
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
if (ret)
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -171,6 +181,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SDMA3:
case SOC15_IH_CLIENTID_SDMA4:
block = AMDGPU_RAS_BLOCK__SDMA;
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
break;
@@ -181,17 +192,16 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
/* resetting queue passes, do page retirement without gpu reset
* resetting queue fails, fallback to gpu reset solution
*/
if (!ret) {
if (!ret)
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, false);
} else {
else
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, true);
}
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}
static bool context_id_expected(struct kfd_dev *dev)