Commit 1802b042 authored by Yunxiang Li's avatar Yunxiang Li Committed by Alex Deucher
Browse files

drm/amdgpu/kfd: remove is_hws_hang and is_resetting



is_hws_hang and is_resetting serves pretty much the same purpose and
they all duplicates the work of the reset_domain lock, just check that
directly instead. This also eliminate a few bugs listed below and get
rid of dqm->ops.pre_reset.

kfd_hws_hang did not need to avoid scheduling another reset. If the
on-going reset decided to skip GPU reset we have a bad time, otherwise
the extra reset will get cancelled anyway.

remove_queue_mes forgot to check is_resetting flag compared to the
pre-MES path unmap_queue_cpsch, so it did not block hw access during
reset correctly.

Signed-off-by: default avatarYunxiang Li <Yunxiang.Li@amd.com>
Reviewed-by: default avatarFelix Kuehling <felix.kuehling@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 5c0a1cdd
Loading
Loading
Loading
Loading
+0 −1
Original line number Diff line number Diff line
@@ -936,7 +936,6 @@ int kgd2kfd_pre_reset(struct kfd_dev *kfd,
	for (i = 0; i < kfd->num_nodes; i++) {
		node = kfd->nodes[i];
		kfd_smi_event_update_gpu_reset(node, false, reset_context);
		node->dqm->ops.pre_reset(node->dqm);
	}

	kgd2kfd_suspend(kfd, false);
+32 −47
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@
#include "cik_regs.h"
#include "kfd_kernel_queue.h"
#include "amdgpu_amdkfd.h"
#include "amdgpu_reset.h"
#include "mes_v11_api_def.h"
#include "kfd_debug.h"

@@ -155,13 +156,6 @@ static void kfd_hws_hang(struct device_queue_manager *dqm)
	/*
	 * Issue a GPU reset if HWS is unresponsive
	 */
	dqm->is_hws_hang = true;

	/* It's possible we're detecting a HWS hang in the
	 * middle of a GPU reset. No need to schedule another
	 * reset in this case.
	 */
	if (!dqm->is_resetting)
	schedule_work(&dqm->hw_exception_work);
}

@@ -194,7 +188,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
	int r, queue_type;
	uint64_t wptr_addr_off;

	if (dqm->is_hws_hang)
	if (!down_read_trylock(&adev->reset_domain->sem))
		return -EIO;

	memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
@@ -245,6 +239,7 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
	amdgpu_mes_lock(&adev->mes);
	r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
	amdgpu_mes_unlock(&adev->mes);
	up_read(&adev->reset_domain->sem);
	if (r) {
		dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n",
			q->properties.doorbell_off);
@@ -262,7 +257,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
	int r;
	struct mes_remove_queue_input queue_input;

	if (dqm->is_hws_hang)
	if (!down_read_trylock(&adev->reset_domain->sem))
		return -EIO;

	memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
@@ -272,6 +267,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
	amdgpu_mes_lock(&adev->mes);
	r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input);
	amdgpu_mes_unlock(&adev->mes);
	up_read(&adev->reset_domain->sem);

	if (r) {
		dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
@@ -1468,20 +1464,13 @@ static int stop_nocpsch(struct device_queue_manager *dqm)
	}

	if (dqm->dev->adev->asic_type == CHIP_HAWAII)
		pm_uninit(&dqm->packet_mgr, false);
		pm_uninit(&dqm->packet_mgr);
	dqm->sched_running = false;
	dqm_unlock(dqm);

	return 0;
}

static void pre_reset(struct device_queue_manager *dqm)
{
	dqm_lock(dqm);
	dqm->is_resetting = true;
	dqm_unlock(dqm);
}

static int allocate_sdma_queue(struct device_queue_manager *dqm,
				struct queue *q, const uint32_t *restore_sdma_id)
{
@@ -1669,8 +1658,6 @@ static int start_cpsch(struct device_queue_manager *dqm)
	init_interrupts(dqm);

	/* clear hang status when driver try to start the hw scheduler */
	dqm->is_hws_hang = false;
	dqm->is_resetting = false;
	dqm->sched_running = true;

	if (!dqm->dev->kfd->shared_resources.enable_mes)
@@ -1700,7 +1687,7 @@ static int start_cpsch(struct device_queue_manager *dqm)
fail_allocate_vidmem:
fail_set_sched_resources:
	if (!dqm->dev->kfd->shared_resources.enable_mes)
		pm_uninit(&dqm->packet_mgr, false);
		pm_uninit(&dqm->packet_mgr);
fail_packet_manager_init:
	dqm_unlock(dqm);
	return retval;
@@ -1708,22 +1695,17 @@ static int start_cpsch(struct device_queue_manager *dqm)

static int stop_cpsch(struct device_queue_manager *dqm)
{
	bool hanging;

	dqm_lock(dqm);
	if (!dqm->sched_running) {
		dqm_unlock(dqm);
		return 0;
	}

	if (!dqm->is_hws_hang) {
	if (!dqm->dev->kfd->shared_resources.enable_mes)
		unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false);
	else
		remove_all_queues_mes(dqm);
	}

	hanging = dqm->is_hws_hang || dqm->is_resetting;
	dqm->sched_running = false;

	if (!dqm->dev->kfd->shared_resources.enable_mes)
@@ -1731,7 +1713,7 @@ static int stop_cpsch(struct device_queue_manager *dqm)

	kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
	if (!dqm->dev->kfd->shared_resources.enable_mes)
		pm_uninit(&dqm->packet_mgr, hanging);
		pm_uninit(&dqm->packet_mgr);
	dqm_unlock(dqm);

	return 0;
@@ -1957,24 +1939,24 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
{
	struct device *dev = dqm->dev->adev->dev;
	struct mqd_manager *mqd_mgr;
	int retval = 0;
	int retval;

	if (!dqm->sched_running)
		return 0;
	if (dqm->is_hws_hang || dqm->is_resetting)
		return -EIO;
	if (!dqm->active_runlist)
		return retval;
		return 0;
	if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem))
		return -EIO;

	if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
		retval = pm_update_grace_period(&dqm->packet_mgr, grace_period);
		if (retval)
			return retval;
			goto out;
	}

	retval = pm_send_unmap_queue(&dqm->packet_mgr, filter, filter_param, reset);
	if (retval)
		return retval;
		goto out;

	*dqm->fence_addr = KFD_FENCE_INIT;
	pm_send_query_status(&dqm->packet_mgr, dqm->fence_gpu_addr,
@@ -1985,7 +1967,7 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
	if (retval) {
		dev_err(dev, "The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
		kfd_hws_hang(dqm);
		return retval;
		goto out;
	}

	/* In the current MEC firmware implementation, if compute queue
@@ -2001,7 +1983,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
		while (halt_if_hws_hang)
			schedule();
		kfd_hws_hang(dqm);
		return -ETIME;
		retval = -ETIME;
		goto out;
	}

	/* We need to reset the grace period value for this device */
@@ -2014,6 +1997,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
	pm_release_ib(&dqm->packet_mgr);
	dqm->active_runlist = false;

out:
	up_read(&dqm->dev->adev->reset_domain->sem);
	return retval;
}

@@ -2040,13 +2025,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
{
	int retval;

	if (dqm->is_hws_hang)
	if (!down_read_trylock(&dqm->dev->adev->reset_domain->sem))
		return -EIO;
	retval = unmap_queues_cpsch(dqm, filter, filter_param, grace_period, false);
	if (retval)
	if (!retval)
		retval = map_queues_cpsch(dqm);
	up_read(&dqm->dev->adev->reset_domain->sem);
	return retval;

	return map_queues_cpsch(dqm);
}

static int wait_on_destroy_queue(struct device_queue_manager *dqm,
@@ -2427,10 +2412,12 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
	if (!dqm->dev->kfd->shared_resources.enable_mes)
		retval = execute_queues_cpsch(dqm, filter, 0, USE_DEFAULT_GRACE_PERIOD);

	if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
	if ((retval || qpd->reset_wavefronts) &&
	    down_read_trylock(&dqm->dev->adev->reset_domain->sem)) {
		pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
		dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
		qpd->reset_wavefronts = false;
		up_read(&dqm->dev->adev->reset_domain->sem);
	}

	/* Lastly, free mqd resources.
@@ -2537,7 +2524,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
		dqm->ops.initialize = initialize_cpsch;
		dqm->ops.start = start_cpsch;
		dqm->ops.stop = stop_cpsch;
		dqm->ops.pre_reset = pre_reset;
		dqm->ops.destroy_queue = destroy_queue_cpsch;
		dqm->ops.update_queue = update_queue;
		dqm->ops.register_process = register_process;
@@ -2558,7 +2544,6 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
		/* initialize dqm for no cp scheduling */
		dqm->ops.start = start_nocpsch;
		dqm->ops.stop = stop_nocpsch;
		dqm->ops.pre_reset = pre_reset;
		dqm->ops.create_queue = create_queue_nocpsch;
		dqm->ops.destroy_queue = destroy_queue_nocpsch;
		dqm->ops.update_queue = update_queue;
+0 −1
Original line number Diff line number Diff line
@@ -152,7 +152,6 @@ struct device_queue_manager_ops {
	int	(*initialize)(struct device_queue_manager *dqm);
	int	(*start)(struct device_queue_manager *dqm);
	int	(*stop)(struct device_queue_manager *dqm);
	void	(*pre_reset)(struct device_queue_manager *dqm);
	void	(*uninitialize)(struct device_queue_manager *dqm);
	int	(*create_kernel_queue)(struct device_queue_manager *dqm,
					struct kernel_queue *kq,
+7 −4
Original line number Diff line number Diff line
@@ -32,6 +32,7 @@
#include "kfd_device_queue_manager.h"
#include "kfd_pm4_headers.h"
#include "kfd_pm4_opcodes.h"
#include "amdgpu_reset.h"

#define PM4_COUNT_ZERO (((1 << 15) - 1) << 16)

@@ -196,15 +197,17 @@ static bool kq_initialize(struct kernel_queue *kq, struct kfd_node *dev,
}

/* Uninitialize a kernel queue and free all its memory usages. */
static void kq_uninitialize(struct kernel_queue *kq, bool hanging)
static void kq_uninitialize(struct kernel_queue *kq)
{
	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && !hanging)
	if (kq->queue->properties.type == KFD_QUEUE_TYPE_HIQ && down_read_trylock(&kq->dev->adev->reset_domain->sem)) {
		kq->mqd_mgr->destroy_mqd(kq->mqd_mgr,
					kq->queue->mqd,
					KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
					KFD_UNMAP_LATENCY_MS,
					kq->queue->pipe,
					kq->queue->queue);
		up_read(&kq->dev->adev->reset_domain->sem);
	}
	else if (kq->queue->properties.type == KFD_QUEUE_TYPE_DIQ)
		kfd_gtt_sa_free(kq->dev, kq->fence_mem_obj);

@@ -344,9 +347,9 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev,
	return NULL;
}

void kernel_queue_uninit(struct kernel_queue *kq, bool hanging)
void kernel_queue_uninit(struct kernel_queue *kq)
{
	kq_uninitialize(kq, hanging);
	kq_uninitialize(kq);
	kfree(kq);
}

+2 −2
Original line number Diff line number Diff line
@@ -263,10 +263,10 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm)
	return 0;
}

void pm_uninit(struct packet_manager *pm, bool hanging)
void pm_uninit(struct packet_manager *pm)
{
	mutex_destroy(&pm->lock);
	kernel_queue_uninit(pm->priv_queue, hanging);
	kernel_queue_uninit(pm->priv_queue);
	pm->priv_queue = NULL;
}

Loading