Files
linux-cryptodev-2.6/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
Jesse.Zhang 290f46cf57 drm/amdgpu: Implement user queue reset functionality
This patch adds robust reset handling for user queues (userq) to improve
recovery from queue failures. The key components include:

1. Queue detection and reset logic:
   - amdgpu_userq_detect_and_reset_queues() identifies failed queues
   - Per-IP detect_and_reset callbacks for targeted recovery
   - Falls back to full GPU reset when needed

2. Reset infrastructure:
   - Adds userq_reset_work workqueue for async reset handling
   - Implements pre/post reset handlers for queue state management
   - Integrates with existing GPU reset framework

3. Error handling improvements:
   - Enhanced state tracking with HUNG state
   - Automatic reset triggering on critical failures
   - VRAM loss handling during recovery

4. Integration points:
   - Added to device init/reset paths
   - Called during queue destroy, suspend, and isolation events
   - Handles both individual queue and full GPU resets

The reset functionality works with both gfx/compute and sdma queues,
providing better resilience against queue failures while minimizing
disruption to unaffected queues.

v2: add detection and reset calls when preemption/unmaped fails.
    add a per device userq counter for each user queue type.(Alex)
v3: make sure we hold the adev->userq_mutex when we call amdgpu_userq_detect_and_reset_queues. (Alex)
   warn if the adev->userq_mutex is not held.
v4: make sure we have all of the uqm->userq_mutex held.
   warn if the uqm->userq_mutex is not held.

v5: Use array for user queue type counters.(Alex)
    all of the uqm->userq_mutex need to be held when calling detect and reset.  (Alex)

v6: fix lock dep warning in amdgpu_userq_fence_dence_driver_process

v7: add the queue types in an array and use a loop in amdgpu_userq_detect_and_reset_queues (Lijo)
v8: remove atomic_set(&userq_mgr->userq_count[i], 0).
   it should already be 0 since we kzalloc the structure (Alex)
v9: For consistency with kernel queues, We may want something like:
   amdgpu_userq_is_reset_type_supported (Alex)

Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
2025-11-04 11:53:05 -05:00

162 lines
5.4 KiB
C

/* SPDX-License-Identifier: MIT */
/*
* Copyright 2023 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
#ifndef AMDGPU_USERQ_H_
#define AMDGPU_USERQ_H_
#include "amdgpu_eviction_fence.h"
#define AMDGPU_MAX_USERQ_COUNT 512
#define to_ev_fence(f) container_of(f, struct amdgpu_eviction_fence, base)
#define uq_mgr_to_fpriv(u) container_of(u, struct amdgpu_fpriv, userq_mgr)
#define work_to_uq_mgr(w, name) container_of(w, struct amdgpu_userq_mgr, name)
enum amdgpu_userq_state {
AMDGPU_USERQ_STATE_UNMAPPED = 0,
AMDGPU_USERQ_STATE_MAPPED,
AMDGPU_USERQ_STATE_PREEMPTED,
AMDGPU_USERQ_STATE_HUNG,
AMDGPU_USERQ_STATE_INVALID_VA,
};
struct amdgpu_mqd_prop;
struct amdgpu_userq_obj {
void *cpu_ptr;
uint64_t gpu_addr;
struct amdgpu_bo *obj;
};
struct amdgpu_userq_va_cursor {
u64 gpu_addr;
struct list_head list;
};
struct amdgpu_usermode_queue {
int queue_type;
enum amdgpu_userq_state state;
uint64_t doorbell_handle;
uint64_t doorbell_index;
uint64_t flags;
struct amdgpu_mqd_prop *userq_prop;
struct amdgpu_userq_mgr *userq_mgr;
struct amdgpu_vm *vm;
struct amdgpu_userq_obj mqd;
struct amdgpu_userq_obj db_obj;
struct amdgpu_userq_obj fw_obj;
struct amdgpu_userq_obj wptr_obj;
struct xarray fence_drv_xa;
struct amdgpu_userq_fence_driver *fence_drv;
struct dma_fence *last_fence;
u32 xcp_id;
int priority;
struct dentry *debugfs_queue;
struct list_head userq_va_list;
};
struct amdgpu_userq_funcs {
int (*mqd_create)(struct amdgpu_userq_mgr *uq_mgr,
struct drm_amdgpu_userq_in *args,
struct amdgpu_usermode_queue *queue);
void (*mqd_destroy)(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *uq);
int (*unmap)(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue);
int (*map)(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue);
int (*preempt)(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue);
int (*restore)(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue);
int (*detect_and_reset)(struct amdgpu_device *adev,
int queue_type);
};
/* Usermode queues for gfx */
struct amdgpu_userq_mgr {
/**
* @userq_mgr_xa: Per-process user queue map (queue ID → queue)
* Key: queue_id (unique ID within the process's userq manager)
* Value: struct amdgpu_usermode_queue
*/
struct xarray userq_mgr_xa;
struct mutex userq_mutex;
struct amdgpu_device *adev;
struct delayed_work resume_work;
struct drm_file *file;
atomic_t userq_count[AMDGPU_RING_TYPE_MAX];
};
struct amdgpu_db_info {
uint64_t doorbell_handle;
uint32_t queue_type;
uint32_t doorbell_offset;
struct amdgpu_userq_obj *db_obj;
};
int amdgpu_userq_ioctl(struct drm_device *dev, void *data, struct drm_file *filp);
int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct drm_file *file_priv,
struct amdgpu_device *adev);
void amdgpu_userq_mgr_fini(struct amdgpu_userq_mgr *userq_mgr);
int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_userq_obj *userq_obj,
int size);
void amdgpu_userq_destroy_object(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_userq_obj *userq_obj);
void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence);
void amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *userq_mgr,
struct amdgpu_eviction_fence_mgr *evf_mgr);
uint64_t amdgpu_userq_get_doorbell_index(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_db_info *db_info,
struct drm_file *filp);
u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev);
int amdgpu_userq_suspend(struct amdgpu_device *adev);
int amdgpu_userq_resume(struct amdgpu_device *adev);
int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
void amdgpu_userq_reset_work(struct work_struct *work);
void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
int amdgpu_userq_input_va_validate(struct amdgpu_usermode_queue *queue,
u64 addr, u64 expected_size);
int amdgpu_userq_gem_va_unmap_validate(struct amdgpu_device *adev,
struct amdgpu_bo_va_mapping *mapping,
uint64_t saddr);
#endif