Commit dbe2c4c8 authored by Eric Huang, committed by Alex Deucher
Browse files

drm/amdkfd: add reset cause in gpu pre-reset smi event



The reset cause was requested by a customer as additional
information in the GPU reset SMI event.

v2: integrate reset sources as suggested by Lijo Lazar

Signed-off-by: Eric Huang <jinhuieric.huang@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 3c7758be
Loading
Loading
Loading
Loading
+6 −2
Original line number Diff line number Diff line
@@ -133,6 +133,9 @@ static void amdgpu_amdkfd_reset_work(struct work_struct *work)

	reset_context.method = AMD_RESET_METHOD_NONE;
	reset_context.reset_req_dev = adev;
	reset_context.src = adev->enable_mes ?
			    AMDGPU_RESET_SRC_MES :
			    AMDGPU_RESET_SRC_HWS;
	clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
@@ -261,12 +264,13 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
	return r;
}

int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)
int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
			    struct amdgpu_reset_context *reset_context)
{
	int r = 0;

	if (adev->kfd.dev)
		r = kgd2kfd_pre_reset(adev->kfd.dev);
		r = kgd2kfd_pre_reset(adev->kfd.dev, reset_context);

	return r;
}
+7 −3
Original line number Diff line number Diff line
@@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
};

struct amdgpu_device;
struct amdgpu_reset_context;

enum kfd_mem_attachment_type {
	KFD_MEM_ATT_SHARED,	/* Share kgd_mem->bo or another attachment's */
@@ -170,7 +171,8 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);

bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);

int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);
int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev,
			    struct amdgpu_reset_context *reset_context);

int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);

@@ -416,7 +418,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
void kgd2kfd_device_exit(struct kfd_dev *kfd);
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
int kgd2kfd_pre_reset(struct kfd_dev *kfd);
int kgd2kfd_pre_reset(struct kfd_dev *kfd,
		      struct amdgpu_reset_context *reset_context);
int kgd2kfd_post_reset(struct kfd_dev *kfd);
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
@@ -459,7 +462,8 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
	return 0;
}

static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd,
				    struct amdgpu_reset_context *reset_context)
{
	return 0;
}
+1 −1
Original line number Diff line number Diff line
@@ -5775,7 +5775,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,

		cancel_delayed_work_sync(&tmp_adev->delayed_init_work);

		amdgpu_amdkfd_pre_reset(tmp_adev);
		amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);

		/*
		 * Mark these ASICs to be reseted as untracked first
+4 −3
Original line number Diff line number Diff line
@@ -924,7 +924,8 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
	kfree(kfd);
}

int kgd2kfd_pre_reset(struct kfd_dev *kfd)
int kgd2kfd_pre_reset(struct kfd_dev *kfd,
		      struct amdgpu_reset_context *reset_context)
{
	struct kfd_node *node;
	int i;
@@ -934,7 +935,7 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd)

	for (i = 0; i < kfd->num_nodes; i++) {
		node = kfd->nodes[i];
		kfd_smi_event_update_gpu_reset(node, false);
		kfd_smi_event_update_gpu_reset(node, false, reset_context);
		node->dqm->ops.pre_reset(node->dqm);
	}

@@ -974,7 +975,7 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
	for (i = 0; i < kfd->num_nodes; i++) {
		node = kfd->nodes[i];
		atomic_set(&node->sram_ecc_flag, 0);
		kfd_smi_event_update_gpu_reset(node, true);
		kfd_smi_event_update_gpu_reset(node, true, NULL);
	}

	return 0;
+14 −2
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include "amdgpu_vm.h"
#include "kfd_priv.h"
#include "kfd_smi_events.h"
#include "amdgpu_reset.h"

struct kfd_smi_client {
	struct list_head list;
@@ -215,9 +216,11 @@ static void kfd_smi_event_add(pid_t pid, struct kfd_node *dev,
	add_event_to_kfifo(pid, dev, event, fifo_in, len);
}

void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset,
				    struct amdgpu_reset_context *reset_context)
{
	unsigned int event;
	char reset_cause[64];

	if (post_reset) {
		event = KFD_SMI_EVENT_GPU_POST_RESET;
@@ -225,7 +228,16 @@ void kfd_smi_event_update_gpu_reset(struct kfd_node *dev, bool post_reset)
		event = KFD_SMI_EVENT_GPU_PRE_RESET;
		++(dev->reset_seq_num);
	}
	kfd_smi_event_add(0, dev, event, "%x\n", dev->reset_seq_num);

	memset(reset_cause, 0, sizeof(reset_cause));

	if (reset_context)
		amdgpu_reset_get_desc(reset_context, reset_cause,
				      sizeof(reset_cause));

	kfd_smi_event_add(0, dev, event, "%x %s\n",
			  dev->reset_seq_num,
			  reset_cause);
}

void kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
Loading