Commit 2c2b0d88, authored by Mukul Joshi, committed by Alex Deucher
Browse files

drm/amdkfd: Add thermal throttling SMI event



Add support for reporting thermal throttling events through SMI.
Also, add a counter to count the number of throttling interrupts
observed and report the count in the SMI event message.

Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent df9c8d1a
Loading
Loading
Loading
Loading
+4 −0
Original line number Diff line number Diff line
@@ -789,4 +789,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
/*
 * Empty stub: accepts @kfd and does nothing.
 * NOTE(review): the #endif further down suggests this is the fallback
 * variant of a conditionally compiled implementation; the matching #if
 * is outside the visible hunk - confirm.
 */
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
{
}

/*
 * Empty stub: accepts @kfd and @throttle_bitmask and does nothing, so
 * callers can invoke it unconditionally.
 * NOTE(review): the #endif right after this function suggests this is the
 * fallback variant of a conditionally compiled implementation; the
 * matching #if is outside the visible hunk - confirm.
 */
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
{
}
#endif
+1 −0
Original line number Diff line number Diff line
@@ -270,5 +270,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
					       struct dma_fence *fence);
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
/*
 * Hand a thermal throttling notification for @kfd, described by
 * @throttle_bitmask, over to the KFD SMI event code.
 */
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);

#endif /* AMDGPU_AMDKFD_H_INCLUDED */
+7 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@
#include "cwsr_trap_handler.h"
#include "kfd_iommu.h"
#include "amdgpu_amdkfd.h"
#include "kfd_smi_events.h"

#define MQD_SIZE_ALIGNED 768

@@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
	WARN_ONCE(count < 0, "Compute profile ref. count error");
}

/*
 * Entry point for reporting a thermal throttling event to KFD.
 *
 * @kfd:              KFD device the event belongs to; a NULL pointer is
 *                    tolerated and turns the call into a no-op.
 * @throttle_bitmask: bitmask passed through unchanged to the SMI event
 *                    layer.
 */
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
{
	/* Without a KFD device there is nobody to report the event to. */
	if (!kfd)
		return;

	kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
}

#if defined(CONFIG_DEBUG_FS)

/* This function will send a package to HIQ to hang the HWS
+50 −17
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include <linux/wait.h>
#include <linux/anon_inodes.h>
#include <uapi/linux/kfd_ioctl.h>
#include "amdgpu.h"
#include "amdgpu_vm.h"
#include "kfd_priv.h"
#include "kfd_smi_events.h"
@@ -148,6 +149,54 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
	return 0;
}

/*
 * Deliver one formatted event message to every interested SMI client.
 *
 * @dev:       device whose smi_clients list is walked.
 * @smi_event: event bit(s); a client is served only if its events mask
 *             shares at least one bit with this value.
 * @event_msg: message bytes to enqueue.
 * @len:       number of bytes of @event_msg to enqueue.
 *
 * The client list is traversed inside an RCU read-side critical section.
 * For each matching client the fifo is updated under the client's
 * spinlock: the message is queued and waiters are woken if the fifo has
 * room for all @len bytes, otherwise the message is dropped for that
 * client and a debug line is logged.
 */
static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
			      char *event_msg, int len)
{
	struct kfd_smi_client *cl;

	rcu_read_lock();

	list_for_each_entry_rcu(cl, &dev->smi_clients, list) {
		/* Only clients that enabled this event get the message. */
		if (READ_ONCE(cl->events) & smi_event) {
			spin_lock(&cl->lock);
			if (kfifo_avail(&cl->fifo) < len) {
				/* Never queue a partial message. */
				pr_debug("smi_event(EventID: %llu): no space left\n",
						smi_event);
			} else {
				kfifo_in(&cl->fifo, event_msg, len);
				wake_up_all(&cl->wait_queue);
			}
			spin_unlock(&cl->lock);
		}
	}

	rcu_read_unlock();
}

/*
 * Publish a thermal throttling SMI event for @dev.
 *
 * @dev:              KFD device whose SMI clients should be notified.
 * @throttle_bitmask: throttle bitmask to report; printed in hex.
 *
 * Formats "<event id> <throttle_bitmask>:<throttle interrupt count>\n"
 * (all fields in hex) and hands it to add_event_to_kfifo(). The interrupt
 * count is read from the owning amdgpu device's
 * smu.throttle_int_counter. Returns without doing anything when the
 * device's smi_clients list is empty.
 */
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
					     uint32_t throttle_bitmask)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
	/*
	 * ThermalThrottle msg = throttle_bitmask(8):
	 *			 thermal_interrupt_count(16):
	 * 16 bytes event + 1 byte space + 8 byte throttle_bitmask +
	 * 1 byte : + 16 byte thermal_interrupt_counter + 1 byte \n +
	 * 1 byte \0 = 44
	 */
	char fifo_in[44];
	unsigned long long int_count;
	int len;

	/* Skip the formatting work when the client list is empty. */
	if (list_empty(&dev->smi_clients))
		return;

	/*
	 * atomic64_read() yields a signed 64-bit value; convert explicitly so
	 * the argument type matches the %llx conversion.
	 */
	int_count = (unsigned long long)
		atomic64_read(&adev->smu.throttle_int_counter);

	/* Size the write from the buffer itself rather than repeating 44. */
	len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n",
		       KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
		       int_count);

	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
}

void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
@@ -156,7 +205,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
	/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
	 */
	char fifo_in[43];
	struct kfd_smi_client *client;
	int len;

	if (list_empty(&dev->smi_clients))
@@ -171,22 +219,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
	len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
		task_info.pid, task_info.task_name);

	rcu_read_lock();

	list_for_each_entry_rcu(client, &dev->smi_clients, list) {
		if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
			continue;
		spin_lock(&client->lock);
		if (kfifo_avail(&client->fifo) >= len) {
			kfifo_in(&client->fifo, fifo_in, len);
			wake_up_all(&client->wait_queue);
		}
		else
			pr_debug("smi_event(vmfault): no space left\n");
		spin_unlock(&client->lock);
	}

	rcu_read_unlock();
	add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
}

int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
+2 −0
Original line number Diff line number Diff line
@@ -25,5 +25,7 @@

int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
/*
 * Queue a thermal throttling event message, carrying @throttle_bitmask,
 * for the SMI clients of @dev.
 */
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
					     uint32_t throttle_bitmask);

#endif
Loading