Commit c67db6a6 authored by Trigger Huang's avatar Trigger Huang Committed by Alex Deucher
Browse files

drm/amdgpu: Do core dump immediately when job tmo



Do the coredump immediately after a job timeout to get a closer
representation of GPU's error status.

V2: This will skip printing vram_lost as the GPU reset has not
happened yet (Alex)

V3: Unconditionally call the core dump as we care about all the reset
functions (soft-recovery, queue reset and full adapter reset, Alex)

V4: Do the dump after adev->job_hang = true (Sunil)

Signed-off-by: default avatarTrigger Huang <Trigger.Huang@amd.com>
Acked-by: default avatarSunil Khatri <sunil.khatri@amd.com>
Reviewed-by: default avatarAlex Deucher <alexander.deucher@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 6122f5c7
Loading
Loading
Loading
Loading
+67 −1
Original line number Diff line number Diff line
@@ -30,6 +30,61 @@
#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"
#include "amdgpu_dev_coredump.h"
#include "amdgpu_xgmi.h"

/**
 * amdgpu_job_do_core_dump - dump IP state and generate a coredump for a device
 * @adev: device whose IP blocks are dumped
 * @job: job handed through to amdgpu_coredump()
 *
 * Walks every IP block of @adev, invokes the block's optional
 * dump_ip_state() callback, and then calls amdgpu_coredump() for the device.
 */
static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
				    struct amdgpu_job *job)
{
	int i;

	dev_info(adev->dev, "Dumping IP State\n");
	for (i = 0; i < adev->num_ip_blocks; i++) {
		/* dump_ip_state is optional; skip blocks that do not provide it */
		if (adev->ip_blocks[i].version->funcs->dump_ip_state)
			adev->ip_blocks[i].version->funcs
				->dump_ip_state((void *)adev);
	}
	/* Log completion once, after all IP blocks have been visited */
	dev_info(adev->dev, "Dumping IP State Completed\n");

	/*
	 * NOTE(review): the two boolean arguments are presumably
	 * "skip the VRAM check" and "vram lost" (no reset has happened yet
	 * at this point) - confirm against the amdgpu_coredump() prototype.
	 */
	amdgpu_coredump(adev, true, false, job);
}

static void amdgpu_job_core_dump(struct amdgpu_device *adev,
				 struct amdgpu_job *job)
{
	struct list_head device_list, *device_list_handle =  NULL;
	struct amdgpu_device *tmp_adev = NULL;
	struct amdgpu_hive_info *hive = NULL;

	if (!amdgpu_sriov_vf(adev))
		hive = amdgpu_get_xgmi_hive(adev);
	if (hive)
		mutex_lock(&hive->hive_lock);
	/*
	 * Reuse the logic in amdgpu_device_gpu_recover() to build list of
	 * devices for code dump
	 */
	INIT_LIST_HEAD(&device_list);
	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
			list_add_tail(&tmp_adev->reset_list, &device_list);
		if (!list_is_first(&adev->reset_list, &device_list))
			list_rotate_to_front(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	} else {
		list_add_tail(&adev->reset_list, &device_list);
		device_list_handle = &device_list;
	}

	/* Do the coredump for each device */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list)
		amdgpu_job_do_core_dump(tmp_adev, job);

	if (hive) {
		mutex_unlock(&hive->hive_lock);
		amdgpu_put_xgmi_hive(hive);
	}
}

static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
@@ -48,9 +103,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
		return DRM_GPU_SCHED_STAT_ENODEV;
	}


	adev->job_hang = true;

	/*
	 * Do the coredump immediately after a job timeout to get a very
	 * close dump/snapshot/representation of GPU's current error status
	 */
	amdgpu_job_core_dump(adev, job);

	if (amdgpu_gpu_recovery &&
	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
		dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
@@ -101,6 +161,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
		reset_context.src = AMDGPU_RESET_SRC_JOB;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

		/*
		 * To avoid an unnecessary extra coredump, as we have already
		 * got the very close representation of GPU's error status
		 */
		set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);