mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
synced 2026-04-23 05:56:14 -04:00
drm/amdkfd: implement per queue sdma reset for gfx 9.4+
To reset hung SDMA queues on GFX 9.4+ within the GFX9 family, a soft reset must be issued through the SMU. Since a soft reset resets an entire SDMA engine, use a common KGD call to do the reset, as the KGD will handle avoiding a reset of in-flight GFX and paging queues on that engine. In addition, create a common call for all reset types to simplify the handling of module parameter settings that block GPU resets. Signed-off-by: Jonathan Kim <jonathan.kim@amd.com> Reviewed-by: Harish Kasiviswanathan <harish.kasiviswanathan@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
057fef20b8
commit
bac38ca8c4
@@ -36,6 +36,7 @@
|
||||
#include "kfd_kernel_queue.h"
|
||||
#include "amdgpu_amdkfd.h"
|
||||
#include "amdgpu_reset.h"
|
||||
#include "amdgpu_sdma.h"
|
||||
#include "mes_v11_api_def.h"
|
||||
#include "kfd_debug.h"
|
||||
|
||||
@@ -67,6 +68,8 @@ static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q);
|
||||
static int allocate_sdma_queue(struct device_queue_manager *dqm,
|
||||
struct queue *q, const uint32_t *restore_sdma_id);
|
||||
|
||||
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma);
|
||||
|
||||
static inline
|
||||
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
|
||||
{
|
||||
@@ -2205,8 +2208,7 @@ static struct queue *find_queue_by_address(struct device_queue_manager *dqm, uin
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* only for compute queue */
|
||||
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm)
|
||||
static int reset_hung_queues(struct device_queue_manager *dqm)
|
||||
{
|
||||
int r = 0, reset_count = 0, i;
|
||||
|
||||
@@ -2259,6 +2261,104 @@ reset_fail:
|
||||
return r;
|
||||
}
|
||||
|
||||
static bool sdma_has_hang(struct device_queue_manager *dqm)
|
||||
{
|
||||
int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm);
|
||||
int engine_end = engine_start + get_num_all_sdma_engines(dqm);
|
||||
int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine;
|
||||
int i, j;
|
||||
|
||||
for (i = engine_start; i < engine_end; i++) {
|
||||
for (j = 0; j < num_queues_per_eng; j++) {
|
||||
if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j))
|
||||
continue;
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool set_sdma_queue_as_reset(struct device_queue_manager *dqm,
|
||||
uint32_t doorbell_off)
|
||||
{
|
||||
struct device_process_node *cur;
|
||||
struct qcm_process_device *qpd;
|
||||
struct queue *q;
|
||||
|
||||
list_for_each_entry(cur, &dqm->queues, list) {
|
||||
qpd = cur->qpd;
|
||||
list_for_each_entry(q, &qpd->queues_list, list) {
|
||||
if ((q->properties.type == KFD_QUEUE_TYPE_SDMA ||
|
||||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) &&
|
||||
q->properties.doorbell_off == doorbell_off) {
|
||||
set_queue_as_reset(dqm, q, qpd);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
|
||||
{
|
||||
int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm);
|
||||
int engine_end = engine_start + get_num_all_sdma_engines(dqm);
|
||||
int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine;
|
||||
int r = 0, i, j;
|
||||
|
||||
if (dqm->is_hws_hang)
|
||||
return -EIO;
|
||||
|
||||
/* Scan for hung HW queues and reset engine. */
|
||||
dqm->detect_hang_count = 0;
|
||||
for (i = engine_start; i < engine_end; i++) {
|
||||
for (j = 0; j < num_queues_per_eng; j++) {
|
||||
uint32_t doorbell_off =
|
||||
dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j);
|
||||
|
||||
if (!doorbell_off)
|
||||
continue;
|
||||
|
||||
/* Reset engine and check. */
|
||||
if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
|
||||
dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
|
||||
!set_sdma_queue_as_reset(dqm, doorbell_off)) {
|
||||
r = -ENOTRECOVERABLE;
|
||||
goto reset_fail;
|
||||
}
|
||||
|
||||
/* Should only expect one queue active per engine */
|
||||
dqm->detect_hang_count++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Signal process reset */
|
||||
if (dqm->detect_hang_count)
|
||||
kfd_signal_reset_event(dqm->dev);
|
||||
else
|
||||
r = -ENOTRECOVERABLE;
|
||||
|
||||
reset_fail:
|
||||
dqm->detect_hang_count = 0;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma)
|
||||
{
|
||||
while (halt_if_hws_hang)
|
||||
schedule();
|
||||
|
||||
if (!amdgpu_gpu_recovery)
|
||||
return -ENOTRECOVERABLE;
|
||||
|
||||
return is_sdma ? reset_hung_queues_sdma(dqm) : reset_hung_queues(dqm);
|
||||
}
|
||||
|
||||
/* dqm->lock mutex has to be locked before calling this function */
|
||||
static int unmap_queues_cpsch(struct device_queue_manager *dqm,
|
||||
enum kfd_unmap_queues_filter filter,
|
||||
@@ -2309,16 +2409,13 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
|
||||
* check those fields
|
||||
*/
|
||||
mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ];
|
||||
if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) {
|
||||
while (halt_if_hws_hang)
|
||||
schedule();
|
||||
if (reset_queues_on_hws_hang(dqm)) {
|
||||
dqm->is_hws_hang = true;
|
||||
kfd_hws_hang(dqm);
|
||||
retval = -ETIME;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd) &&
|
||||
reset_queues_on_hws_hang(dqm, false))
|
||||
goto reset_fail;
|
||||
|
||||
/* Check for SDMA hang and attempt SDMA reset */
|
||||
if (sdma_has_hang(dqm) && reset_queues_on_hws_hang(dqm, true))
|
||||
goto reset_fail;
|
||||
|
||||
/* We need to reset the grace period value for this device */
|
||||
if (grace_period != USE_DEFAULT_GRACE_PERIOD) {
|
||||
@@ -2329,10 +2426,15 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
|
||||
|
||||
pm_release_ib(&dqm->packet_mgr);
|
||||
dqm->active_runlist = false;
|
||||
|
||||
out:
|
||||
up_read(&dqm->dev->adev->reset_domain->sem);
|
||||
return retval;
|
||||
|
||||
reset_fail:
|
||||
dqm->is_hws_hang = true;
|
||||
kfd_hws_hang(dqm);
|
||||
up_read(&dqm->dev->adev->reset_domain->sem);
|
||||
return -ETIME;
|
||||
}
|
||||
|
||||
/* only for compute queue */
|
||||
|
||||
Reference in New Issue
Block a user