Commit 6cca686d authored by Xiaogang Chen's avatar Xiaogang Chen Committed by Alex Deucher
Browse files

drm/amdkfd: kfd driver supports hot unplug/replug amdgpu devices



This patch allows the kfd driver to function correctly when AMD GPU devices
are unplugged/replugged at run time.

When an AMD GPU device is unplugged, the kfd driver gracefully terminates the
existing kfd processes, after stopping all queues, by sending SIGBUS to the
user process. After that, user space can still use the remaining AMD GPU
devices. When all AMD GPU devices in the system have been removed, the kfd
driver will not respond to new requests.

Unplugged AMD GPU devices can be re-plugged. The kfd driver will then use the
added devices and function as usual.

The purpose of this patch is to make the kfd driver behave as expected during
and after AMD GPU devices are unplugged/replugged at run time.

Signed-off-by: Xiaogang Chen <Xiaogang.Chen@amd.com>
Acked-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent d81e52fc
Loading
Loading
Loading
Loading
+5 −0
Original line number Diff line number Diff line
@@ -248,6 +248,11 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
		kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
}

/* Forward a process-teardown request for @adev to the KFD side by calling
 * kgd2kfd_teardown_processes() with the same device.
 */
void amdgpu_amdkfd_teardown_processes(struct amdgpu_device *adev)
{
	kgd2kfd_teardown_processes(adev);
}

void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc)
{
	if (adev->kfd.dev) {
+11 −0
Original line number Diff line number Diff line
@@ -158,6 +158,7 @@ struct amdkfd_process_info {

int amdgpu_amdkfd_init(void);
void amdgpu_amdkfd_fini(void);
void amdgpu_amdkfd_teardown_processes(struct amdgpu_device *adev);

void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool suspend_proc);
int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool resume_proc);
@@ -438,6 +439,8 @@ int kgd2kfd_stop_sched_all_nodes(struct kfd_dev *kfd);
bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry,
			       bool retry_fault);
void kgd2kfd_lock_kfd(void);
void kgd2kfd_teardown_processes(struct amdgpu_device *adev);

#else
static inline int kgd2kfd_init(void)
@@ -550,5 +553,13 @@ static inline bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct
	return false;
}

/* No-op stub of kgd2kfd_lock_kfd() for the #else branch of this header
 * (presumably the build without KFD support - confirm against the #if).
 */
static inline void kgd2kfd_lock_kfd(void)
{
}

/* No-op stub of kgd2kfd_teardown_processes() for the #else branch of this
 * header (presumably the build without KFD support - confirm against the
 * #if); @adev is ignored.
 */
static inline void kgd2kfd_teardown_processes(struct amdgpu_device *adev)
{
}

#endif
#endif /* AMDGPU_AMDKFD_H_INCLUDED */
+1 −0
Original line number Diff line number Diff line
@@ -3510,6 +3510,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);

	amdgpu_amdkfd_suspend(adev, true);
	amdgpu_amdkfd_teardown_processes(adev);
	amdgpu_userq_suspend(adev);

	/* Workaround for ASICs need to disable SMC first */
+75 −1
Original line number Diff line number Diff line
@@ -973,6 +973,9 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
	}

	kfree(kfd);

	/* after remove a kfd device unlock kfd driver */
	kgd2kfd_unlock_kfd(NULL);
}

int kgd2kfd_pre_reset(struct kfd_dev *kfd,
@@ -1557,10 +1560,14 @@ int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
	return r;
}

/* Drop one lock reference while holding kfd_processes_mutex.
 *
 * @kfd: device whose kfd_dev_lock count is decremented; when NULL the
 *       driver-wide kfd_locked count is decremented instead.
 */
void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
{
	mutex_lock(&kfd_processes_mutex);

	if (!kfd)
		kfd_locked--;		/* no device given: driver-wide count */
	else
		kfd->kfd_dev_lock--;	/* per-device count */

	mutex_unlock(&kfd_processes_mutex);
}

@@ -1729,6 +1736,73 @@ bool kgd2kfd_vmfault_fast_path(struct amdgpu_device *adev, struct amdgpu_iv_entr
	return false;
}

/* check if there is kfd process still uses adev */
static bool kgd2kfd_check_device_idle(struct amdgpu_device *adev)
{
	struct kfd_process *p;
	struct hlist_node *p_temp;
	unsigned int temp;
	struct kfd_node *dev;

	mutex_lock(&kfd_processes_mutex);

	if (hash_empty(kfd_processes_table)) {
		mutex_unlock(&kfd_processes_mutex);
		return true;
	}

	/* check if there is device still use adev */
	hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
		for (int i = 0; i < p->n_pdds; i++) {
			dev = p->pdds[i]->dev;
			if (dev->adev == adev) {
				mutex_unlock(&kfd_processes_mutex);
				return false;
			}
		}
	}

	mutex_unlock(&kfd_processes_mutex);

	return true;
}

/**
 * kgd2kfd_teardown_processes - gracefully tear down existing
 * kfd processes that use adev
 *
 * @adev: amdgpu_device where kfd processes run on and will be
 *  torn down
 *
 * Walks kfd_processes_table under kfd_processes_mutex and calls
 * kfd_signal_process_terminate_event() once for each process that has a
 * pdd whose device belongs to @adev. After dropping the mutex it polls
 * kgd2kfd_check_device_idle() until no process references @adev any more,
 * so this function does not return while such a process still exists.
 */
void kgd2kfd_teardown_processes(struct amdgpu_device *adev)
{
	struct hlist_node *p_temp;
	struct kfd_process *p;
	unsigned int temp;

	mutex_lock(&kfd_processes_mutex);

	if (hash_empty(kfd_processes_table)) {
		mutex_unlock(&kfd_processes_mutex);
		return;
	}

	hash_for_each_safe(kfd_processes_table, temp, p_temp, p, kfd_processes) {
		for (int i = 0; i < p->n_pdds; i++) {
			if (p->pdds[i]->dev->adev != adev)
				continue;

			kfd_signal_process_terminate_event(p);

			/* The terminate event targets the whole process, so
			 * notify it only once even if several of its pdds
			 * refer to adev.
			 */
			break;
		}
	}

	mutex_unlock(&kfd_processes_mutex);

	/* wait until all kfd processes that use adev have terminated */
	while (!kgd2kfd_check_device_idle(adev))
		cond_resched();
}

#if defined(CONFIG_DEBUG_FS)

/* This function will send a package to HIQ to hang the HWS
+29 −0
Original line number Diff line number Diff line
@@ -1380,3 +1380,32 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)

	kfd_unref_process(p);
}

/* signal KFD_EVENT_TYPE_SIGNAL events from process p
 * send signal SIGBUS to correspondent user space process
 */
void kfd_signal_process_terminate_event(struct kfd_process *p)
{
	struct kfd_event *ev;
	u32 id;

	rcu_read_lock();

	/* iterate from id 1 for KFD_EVENT_TYPE_SIGNAL events */
	id = 1;
	idr_for_each_entry_continue(&p->event_idr, ev, id)
		if (ev->type == KFD_EVENT_TYPE_SIGNAL) {
			spin_lock(&ev->lock);
			set_event(ev);
			spin_unlock(&ev->lock);
		}

	/* Send SIGBUS to p->lead_thread */
	dev_notice(kfd_device,
		   "Sending SIGBUS to process %d",
		   p->lead_thread->pid);

	send_sig(SIGBUS, p->lead_thread, 0);

	rcu_read_unlock();
}
Loading