Commit f1549c09 authored by Likun Gao's avatar Likun Gao Committed by Alex Deucher
Browse files

drm/amdgpu: support reset flag set for gpu reset



Move reset_context out of gpu recover function to make it configurable
for different reset purpose.
For the reset way of call gpu_recovery sysfs, force to use full reset
method. Otherwise, try soft reset by default if the related ASIC
supportted, if soft reset failed, will use full reset.

Signed-off-by: default avatarLikun Gao <Likun.Gao@amd.com>
Reviewed-by: default avatarHawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 58e969b6
Loading
Loading
Loading
Loading
+2 −3
Original line number Diff line number Diff line
@@ -1253,9 +1253,8 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
bool amdgpu_device_has_job_running(struct amdgpu_device *adev);
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev);
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job* job);
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job);
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context);
void amdgpu_device_pci_config_reset(struct amdgpu_device *adev);
int amdgpu_device_pci_reset(struct amdgpu_device *adev);
bool amdgpu_device_need_post(struct amdgpu_device *adev);
+8 −1
Original line number Diff line number Diff line
@@ -129,7 +129,14 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)
	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
						  kfd.reset_work);

	amdgpu_device_gpu_recover(adev, NULL);
	struct amdgpu_reset_context reset_context;
	memset(&reset_context, 0, sizeof(reset_context));

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}

void amdgpu_amdkfd_device_init(struct amdgpu_device *adev)
+7 −13
Original line number Diff line number Diff line
@@ -5109,7 +5109,8 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
 */

int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
			      struct amdgpu_job *job)
			      struct amdgpu_job *job,
			      struct amdgpu_reset_context *reset_context)
{
	struct list_head device_list, *device_list_handle =  NULL;
	bool job_signaled = false;
@@ -5119,9 +5120,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	bool need_emergency_restart = false;
	bool audio_suspended = false;
	int tmp_vram_lost_counter;
	struct amdgpu_reset_context reset_context;

	memset(&reset_context, 0, sizeof(reset_context));

	/*
	 * Special case: RAS triggered and full reset isn't supported
@@ -5147,12 +5145,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
	if (hive)
		mutex_lock(&hive->hive_lock);

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	reset_context.job = job;
	reset_context.hive = hive;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	reset_context->job = job;
	reset_context->hive = hive;
	/*
	 * Build list of devices to reset.
	 * In case we are in XGMI hive mode, resort the device list
@@ -5245,7 +5239,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

retry:	/* Rest of adevs pre asic reset from XGMI hive. */
	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
		r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context);
		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
		/*TODO Should we stop ?*/
		if (r) {
			dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
@@ -5272,7 +5266,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
			amdgpu_ras_resume(adev);
	} else {
		r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
		r = amdgpu_do_asic_reset(device_list_handle, reset_context);
		if (r && r == -EAGAIN)
			goto retry;
	}
@@ -5292,7 +5286,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
		if (amdgpu_gpu_recovery == 2 &&
			!(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter)))
			amdgpu_device_recheck_guilty_jobs(
				tmp_adev, device_list_handle, &reset_context);
				tmp_adev, device_list_handle, reset_context);

		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
			struct amdgpu_ring *ring = tmp_adev->rings[i];
+8 −1
Original line number Diff line number Diff line
@@ -844,7 +844,14 @@ static void amdgpu_debugfs_reset_work(struct work_struct *work)
	struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
						  reset_work);

	amdgpu_device_gpu_recover(adev, NULL);
	struct amdgpu_reset_context reset_context;
	memset(&reset_context, 0, sizeof(reset_context));

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
}

#endif
+9 −1
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@

#include "amdgpu.h"
#include "amdgpu_trace.h"
#include "amdgpu_reset.h"

static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
@@ -64,7 +65,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
		  ti.process_name, ti.tgid, ti.task_name, ti.pid);

	if (amdgpu_device_should_recover_gpu(ring->adev)) {
		r = amdgpu_device_gpu_recover(ring->adev, job);
		struct amdgpu_reset_context reset_context;
		memset(&reset_context, 0, sizeof(reset_context));

		reset_context.method = AMD_RESET_METHOD_NONE;
		reset_context.reset_req_dev = adev;
		clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

		r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
		if (r)
			DRM_ERROR("GPU Recovery Failed: %d\n", r);
	} else {
Loading