mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/
synced 2026-04-18 06:33:43 -04:00
Merge tag 'drm-misc-next-2022-02-23' of git://anongit.freedesktop.org/drm/drm-misc into drm-next
drm-misc-next for v5.18: UAPI Changes: Cross-subsystem Changes: - Split out panel-lvds and lvds dt bindings . - Put yes/no on/off disabled/enabled strings in linux/string_helpers.h and use it in drivers and tomoyo. - Clarify dma_fence_chain and dma_fence_array should never include eachother. - Flatten chains in syncobj's. - Don't double add in fbdev/defio when page is already enlisted. - Don't sort deferred-I/O pages by default in fbdev. Core Changes: - Fix missing pm_runtime_put_sync in bridge. - Set modifier support to only linear fb modifier if drivers don't advertise support. - As a result, we remove allow_fb_modifiers. - Add missing clear for EDID Deep Color Modes in drm_reset_display_info. - Assorted documentation updates. - Warn once in drm_clflush if there is no arch support. - Add missing select for dp helper in drm_panel_edp. - Assorted small fixes. - Improve fb-helper's clipping handling. - Don't dump shmem mmaps in a core dump. - Add accounting to ttm resource manager, and use it in amdgpu. - Allow querying the detected eDP panel through debugfs. - Add helpers for xrgb8888 to 8 and 1 bits gray. - Improve drm's buddy allocator. - Add selftests for the buddy allocator. Driver Changes: - Add support for nomodeset to a lot of drm drivers. - Use drm_module_*_driver in a lot of drm drivers. - Assorted small fixes to bridge/lt9611, v3d, vc4, vmwgfx, mxsfb, nouveau, bridge/dw-hdmi, panfrost, lima, ingenic, sprd, bridge/anx7625, ti-sn65dsi86. - Add bridge/it6505. - Create DP and DVI-I connectors in ast. - Assorted nouveau backlight fixes. - Rework amdgpu reset handling. - Add dt bindings for ingenic,jz4780-dw-hdmi. - Support reading edid through aux channel in ingenic. - Add a drm driver for Solomon SSD130x OLED displays. - Add simple support for sharp LQ140M1JW46. - Add more panels to nt35560. Signed-off-by: Dave Airlie <airlied@redhat.com> From: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/686ec871-e77f-c230-22e5-9e3bb80f064a@linux.intel.com
This commit is contained in:
@@ -426,10 +426,10 @@ bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
|
||||
* the lock.
|
||||
*/
|
||||
if (in_task()) {
|
||||
if (down_read_trylock(&adev->reset_sem))
|
||||
up_read(&adev->reset_sem);
|
||||
if (down_read_trylock(&adev->reset_domain->sem))
|
||||
up_read(&adev->reset_domain->sem);
|
||||
else
|
||||
lockdep_assert_held(&adev->reset_sem);
|
||||
lockdep_assert_held(&adev->reset_domain->sem);
|
||||
}
|
||||
#endif
|
||||
return false;
|
||||
@@ -455,9 +455,9 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
|
||||
if ((reg * 4) < adev->rmmio_size) {
|
||||
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
|
||||
amdgpu_sriov_runtime(adev) &&
|
||||
down_read_trylock(&adev->reset_sem)) {
|
||||
down_read_trylock(&adev->reset_domain->sem)) {
|
||||
ret = amdgpu_kiq_rreg(adev, reg);
|
||||
up_read(&adev->reset_sem);
|
||||
up_read(&adev->reset_domain->sem);
|
||||
} else {
|
||||
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
|
||||
}
|
||||
@@ -540,9 +540,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
|
||||
if ((reg * 4) < adev->rmmio_size) {
|
||||
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
|
||||
amdgpu_sriov_runtime(adev) &&
|
||||
down_read_trylock(&adev->reset_sem)) {
|
||||
down_read_trylock(&adev->reset_domain->sem)) {
|
||||
amdgpu_kiq_wreg(adev, reg, v);
|
||||
up_read(&adev->reset_sem);
|
||||
up_read(&adev->reset_domain->sem);
|
||||
} else {
|
||||
writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
|
||||
}
|
||||
@@ -2331,6 +2331,49 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
|
||||
return r;
|
||||
}
|
||||
|
||||
static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
|
||||
{
|
||||
long timeout;
|
||||
int r, i;
|
||||
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
struct amdgpu_ring *ring = adev->rings[i];
|
||||
|
||||
/* No need to setup the GPU scheduler for rings that don't need it */
|
||||
if (!ring || ring->no_scheduler)
|
||||
continue;
|
||||
|
||||
switch (ring->funcs->type) {
|
||||
case AMDGPU_RING_TYPE_GFX:
|
||||
timeout = adev->gfx_timeout;
|
||||
break;
|
||||
case AMDGPU_RING_TYPE_COMPUTE:
|
||||
timeout = adev->compute_timeout;
|
||||
break;
|
||||
case AMDGPU_RING_TYPE_SDMA:
|
||||
timeout = adev->sdma_timeout;
|
||||
break;
|
||||
default:
|
||||
timeout = adev->video_timeout;
|
||||
break;
|
||||
}
|
||||
|
||||
r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
|
||||
ring->num_hw_submission, amdgpu_job_hang_limit,
|
||||
timeout, adev->reset_domain->wq,
|
||||
ring->sched_score, ring->name,
|
||||
adev->dev);
|
||||
if (r) {
|
||||
DRM_ERROR("Failed to create scheduler on ring %s.\n",
|
||||
ring->name);
|
||||
return r;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* amdgpu_device_ip_init - run init for hardware IPs
|
||||
*
|
||||
@@ -2442,8 +2485,28 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
|
||||
if (r)
|
||||
goto init_failed;
|
||||
|
||||
if (adev->gmc.xgmi.num_physical_nodes > 1)
|
||||
amdgpu_xgmi_add_device(adev);
|
||||
/**
|
||||
* In case of XGMI grab extra reference for reset domain for this device
|
||||
*/
|
||||
if (adev->gmc.xgmi.num_physical_nodes > 1) {
|
||||
if (amdgpu_xgmi_add_device(adev) == 0) {
|
||||
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
|
||||
|
||||
if (!hive->reset_domain ||
|
||||
!amdgpu_reset_get_reset_domain(hive->reset_domain)) {
|
||||
r = -ENOENT;
|
||||
goto init_failed;
|
||||
}
|
||||
|
||||
/* Drop the early temporary reset domain we created for device */
|
||||
amdgpu_reset_put_reset_domain(adev->reset_domain);
|
||||
adev->reset_domain = hive->reset_domain;
|
||||
}
|
||||
}
|
||||
|
||||
r = amdgpu_device_init_schedulers(adev);
|
||||
if (r)
|
||||
goto init_failed;
|
||||
|
||||
/* Don't init kfd if whole hive need to be reset during init */
|
||||
if (!adev->gmc.xgmi.pending_reset)
|
||||
@@ -3543,8 +3606,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
mutex_init(&adev->mn_lock);
|
||||
mutex_init(&adev->virt.vf_errors.lock);
|
||||
hash_init(adev->mn_hash);
|
||||
atomic_set(&adev->in_gpu_reset, 0);
|
||||
init_rwsem(&adev->reset_sem);
|
||||
mutex_init(&adev->psp.mutex);
|
||||
mutex_init(&adev->notifier_lock);
|
||||
mutex_init(&adev->pm.stable_pstate_ctx_lock);
|
||||
@@ -3630,6 +3691,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reset domain needs to be present early, before XGMI hive discovered
|
||||
* (if any) and intitialized to use reset sem and in_gpu reset flag
|
||||
* early on during init.
|
||||
*/
|
||||
adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev");
|
||||
if (!adev->reset_domain)
|
||||
return -ENOMEM;
|
||||
|
||||
/* early init functions */
|
||||
r = amdgpu_device_ip_early_init(adev);
|
||||
if (r)
|
||||
@@ -4006,6 +4076,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
|
||||
if (adev->mman.discovery_bin)
|
||||
amdgpu_discovery_fini(adev);
|
||||
|
||||
amdgpu_reset_put_reset_domain(adev->reset_domain);
|
||||
adev->reset_domain = NULL;
|
||||
|
||||
kfree(adev->pci_state);
|
||||
|
||||
}
|
||||
@@ -4817,17 +4890,8 @@ end:
|
||||
return r;
|
||||
}
|
||||
|
||||
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
|
||||
struct amdgpu_hive_info *hive)
|
||||
static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
|
||||
{
|
||||
if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
|
||||
return false;
|
||||
|
||||
if (hive) {
|
||||
down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
|
||||
} else {
|
||||
down_write(&adev->reset_sem);
|
||||
}
|
||||
|
||||
switch (amdgpu_asic_reset_method(adev)) {
|
||||
case AMD_RESET_METHOD_MODE1:
|
||||
@@ -4840,56 +4904,12 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
|
||||
adev->mp1_state = PP_MP1_STATE_NONE;
|
||||
break;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
|
||||
static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
|
||||
{
|
||||
amdgpu_vf_error_trans_all(adev);
|
||||
adev->mp1_state = PP_MP1_STATE_NONE;
|
||||
atomic_set(&adev->in_gpu_reset, 0);
|
||||
up_write(&adev->reset_sem);
|
||||
}
|
||||
|
||||
/*
|
||||
* to lockup a list of amdgpu devices in a hive safely, if not a hive
|
||||
* with multiple nodes, it will be similar as amdgpu_device_lock_adev.
|
||||
*
|
||||
* unlock won't require roll back.
|
||||
*/
|
||||
static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
|
||||
{
|
||||
struct amdgpu_device *tmp_adev = NULL;
|
||||
|
||||
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
|
||||
if (!hive) {
|
||||
dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
|
||||
return -ENODEV;
|
||||
}
|
||||
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
|
||||
if (!amdgpu_device_lock_adev(tmp_adev, hive))
|
||||
goto roll_back;
|
||||
}
|
||||
} else if (!amdgpu_device_lock_adev(adev, hive))
|
||||
return -EAGAIN;
|
||||
|
||||
return 0;
|
||||
roll_back:
|
||||
if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
|
||||
/*
|
||||
* if the lockup iteration break in the middle of a hive,
|
||||
* it may means there may has a race issue,
|
||||
* or a hive device locked up independently.
|
||||
* we may be in trouble and may not, so will try to roll back
|
||||
* the lock and give out a warnning.
|
||||
*/
|
||||
dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
|
||||
list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
|
||||
amdgpu_device_unlock_adev(tmp_adev);
|
||||
}
|
||||
}
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
|
||||
@@ -5023,7 +5043,7 @@ retry:
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
|
||||
* amdgpu_device_gpu_recover_imp - reset the asic and recover scheduler
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
* @job: which job trigger hang
|
||||
@@ -5033,7 +5053,7 @@ retry:
|
||||
* Returns 0 for success or an error on failure.
|
||||
*/
|
||||
|
||||
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
|
||||
struct amdgpu_job *job)
|
||||
{
|
||||
struct list_head device_list, *device_list_handle = NULL;
|
||||
@@ -5067,26 +5087,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
dev_info(adev->dev, "GPU %s begin!\n",
|
||||
need_emergency_restart ? "jobs stop":"reset");
|
||||
|
||||
/*
|
||||
* Here we trylock to avoid chain of resets executing from
|
||||
* either trigger by jobs on different adevs in XGMI hive or jobs on
|
||||
* different schedulers for same device while this TO handler is running.
|
||||
* We always reset all schedulers for device and all devices for XGMI
|
||||
* hive so that should take care of them too.
|
||||
*/
|
||||
if (!amdgpu_sriov_vf(adev))
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
if (hive) {
|
||||
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
|
||||
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
|
||||
job ? job->base.id : -1, hive->hive_id);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
if (job && job->vm)
|
||||
drm_sched_increase_karma(&job->base);
|
||||
return 0;
|
||||
}
|
||||
if (hive)
|
||||
mutex_lock(&hive->hive_lock);
|
||||
}
|
||||
|
||||
reset_context.method = AMD_RESET_METHOD_NONE;
|
||||
reset_context.reset_req_dev = adev;
|
||||
@@ -5094,22 +5098,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
reset_context.hive = hive;
|
||||
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
|
||||
|
||||
/*
|
||||
* lock the device before we try to operate the linked list
|
||||
* if didn't get the device lock, don't touch the linked list since
|
||||
* others may iterating it.
|
||||
*/
|
||||
r = amdgpu_device_lock_hive_adev(adev, hive);
|
||||
if (r) {
|
||||
dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
|
||||
job ? job->base.id : -1);
|
||||
|
||||
/* even we skipped this reset, still need to set the job to guilty */
|
||||
if (job && job->vm)
|
||||
drm_sched_increase_karma(&job->base);
|
||||
goto skip_recovery;
|
||||
}
|
||||
|
||||
/*
|
||||
* Build list of devices to reset.
|
||||
* In case we are in XGMI hive mode, resort the device list
|
||||
@@ -5127,8 +5115,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
device_list_handle = &device_list;
|
||||
}
|
||||
|
||||
/* We need to lock reset domain only once both for XGMI and single device */
|
||||
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
||||
reset_list);
|
||||
amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
|
||||
|
||||
/* block all schedulers and reset given job's ring */
|
||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
|
||||
amdgpu_device_set_mp1_state(tmp_adev);
|
||||
|
||||
/*
|
||||
* Try to put the audio codec into suspend state
|
||||
* before gpu reset started.
|
||||
@@ -5280,21 +5276,55 @@ skip_sched_resume:
|
||||
|
||||
if (audio_suspended)
|
||||
amdgpu_device_resume_display_audio(tmp_adev);
|
||||
amdgpu_device_unlock_adev(tmp_adev);
|
||||
|
||||
amdgpu_device_unset_mp1_state(tmp_adev);
|
||||
}
|
||||
|
||||
skip_recovery:
|
||||
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
|
||||
reset_list);
|
||||
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
|
||||
|
||||
if (hive) {
|
||||
atomic_set(&hive->in_reset, 0);
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
}
|
||||
|
||||
if (r && r != -EAGAIN)
|
||||
if (r)
|
||||
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
|
||||
return r;
|
||||
}
|
||||
|
||||
struct amdgpu_recover_work_struct {
|
||||
struct work_struct base;
|
||||
struct amdgpu_device *adev;
|
||||
struct amdgpu_job *job;
|
||||
int ret;
|
||||
};
|
||||
|
||||
static void amdgpu_device_queue_gpu_recover_work(struct work_struct *work)
|
||||
{
|
||||
struct amdgpu_recover_work_struct *recover_work = container_of(work, struct amdgpu_recover_work_struct, base);
|
||||
|
||||
recover_work->ret = amdgpu_device_gpu_recover_imp(recover_work->adev, recover_work->job);
|
||||
}
|
||||
/*
|
||||
* Serialize gpu recover into reset domain single threaded wq
|
||||
*/
|
||||
int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
struct amdgpu_job *job)
|
||||
{
|
||||
struct amdgpu_recover_work_struct work = {.adev = adev, .job = job};
|
||||
|
||||
INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
|
||||
|
||||
if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
|
||||
return -EAGAIN;
|
||||
|
||||
flush_work(&work.base);
|
||||
|
||||
return work.ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
|
||||
*
|
||||
@@ -5482,20 +5512,6 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
struct amdgpu_ring *ring = adev->rings[i];
|
||||
|
||||
if (!ring || !ring->sched.thread)
|
||||
continue;
|
||||
|
||||
cancel_delayed_work_sync(&ring->sched.work_tdr);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_pci_error_detected - Called when a PCI error is detected.
|
||||
* @pdev: PCI device struct
|
||||
@@ -5526,14 +5542,11 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
|
||||
/* Fatal error, prepare for slot reset */
|
||||
case pci_channel_io_frozen:
|
||||
/*
|
||||
* Cancel and wait for all TDRs in progress if failing to
|
||||
* set adev->in_gpu_reset in amdgpu_device_lock_adev
|
||||
*
|
||||
* Locking adev->reset_sem will prevent any external access
|
||||
* Locking adev->reset_domain->sem will prevent any external access
|
||||
* to GPU during PCI error recovery
|
||||
*/
|
||||
while (!amdgpu_device_lock_adev(adev, NULL))
|
||||
amdgpu_cancel_all_tdr(adev);
|
||||
amdgpu_device_lock_reset_domain(adev->reset_domain);
|
||||
amdgpu_device_set_mp1_state(adev);
|
||||
|
||||
/*
|
||||
* Block any work scheduling as we do for regular GPU reset
|
||||
@@ -5640,7 +5653,8 @@ out:
|
||||
DRM_INFO("PCIe error recovery succeeded\n");
|
||||
} else {
|
||||
DRM_ERROR("PCIe error recovery failed, err:%d", r);
|
||||
amdgpu_device_unlock_adev(adev);
|
||||
amdgpu_device_unset_mp1_state(adev);
|
||||
amdgpu_device_unlock_reset_domain(adev->reset_domain);
|
||||
}
|
||||
|
||||
return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
|
||||
@@ -5677,7 +5691,8 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
|
||||
drm_sched_start(&ring->sched, true);
|
||||
}
|
||||
|
||||
amdgpu_device_unlock_adev(adev);
|
||||
amdgpu_device_unset_mp1_state(adev);
|
||||
amdgpu_device_unlock_reset_domain(adev->reset_domain);
|
||||
}
|
||||
|
||||
bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
|
||||
@@ -5754,6 +5769,11 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
|
||||
amdgpu_asic_invalidate_hdp(adev, ring);
|
||||
}
|
||||
|
||||
int amdgpu_in_reset(struct amdgpu_device *adev)
|
||||
{
|
||||
return atomic_read(&adev->reset_domain->in_gpu_reset);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_halt() - bring hardware to some kind of halt state
|
||||
*
|
||||
|
||||
Reference in New Issue
Block a user