mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/
synced 2026-04-23 17:02:06 -04:00
Merge tag 'amd-drm-next-5.17-2021-12-16' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amdgpu: - Add some display debugfs entries - RAS fixes - SR-IOV fixes - W=1 fixes - Documentation fixes - IH timestamp fix - Misc power fixes - IP discovery fixes - Large driver documentation updates - Multi-GPU memory use reductions - Misc display fixes and cleanups - Add new SMU debug option amdkfd: - SVM fixes radeon: - Fix typo in comment From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20211216202731.5900-1-alexander.deucher@amd.com
This commit is contained in:
@@ -30,6 +30,7 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/console.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/iommu.h>
|
||||
|
||||
#include <drm/drm_atomic_helper.h>
|
||||
#include <drm/drm_probe_helper.h>
|
||||
@@ -331,7 +332,7 @@ void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_vram_access - access vram by vram aperature
|
||||
* amdgpu_device_aper_access - access vram by vram aperature
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
* @pos: offset of the buffer in vram
|
||||
@@ -550,11 +551,11 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
|
||||
trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
* amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
|
||||
*
|
||||
* this function is invoked only the debugfs register access
|
||||
* */
|
||||
*/
|
||||
void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
|
||||
uint32_t reg, uint32_t v)
|
||||
{
|
||||
@@ -1100,7 +1101,7 @@ static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_wb_init- Init Writeback driver info and allocate memory
|
||||
* amdgpu_device_wb_init - Init Writeback driver info and allocate memory
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
*
|
||||
@@ -2657,6 +2658,36 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_smu_fini_early - smu hw_fini wrapper
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
*
|
||||
* For ASICs need to disable SMC first
|
||||
*/
|
||||
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
|
||||
{
|
||||
int i, r;
|
||||
|
||||
if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
|
||||
return;
|
||||
|
||||
for (i = 0; i < adev->num_ip_blocks; i++) {
|
||||
if (!adev->ip_blocks[i].status.hw)
|
||||
continue;
|
||||
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
|
||||
r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
|
||||
/* XXX handle errors */
|
||||
if (r) {
|
||||
DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
|
||||
adev->ip_blocks[i].version->funcs->name, r);
|
||||
}
|
||||
adev->ip_blocks[i].status.hw = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
|
||||
{
|
||||
int i, r;
|
||||
@@ -2677,21 +2708,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
|
||||
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
|
||||
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
|
||||
|
||||
/* need to disable SMC first */
|
||||
for (i = 0; i < adev->num_ip_blocks; i++) {
|
||||
if (!adev->ip_blocks[i].status.hw)
|
||||
continue;
|
||||
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
|
||||
r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
|
||||
/* XXX handle errors */
|
||||
if (r) {
|
||||
DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
|
||||
adev->ip_blocks[i].version->funcs->name, r);
|
||||
}
|
||||
adev->ip_blocks[i].status.hw = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
/* Workaroud for ASICs need to disable SMC first */
|
||||
amdgpu_device_smu_fini_early(adev);
|
||||
|
||||
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
|
||||
if (!adev->ip_blocks[i].status.hw)
|
||||
@@ -2733,8 +2751,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
|
||||
if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
|
||||
amdgpu_virt_release_ras_err_handler_data(adev);
|
||||
|
||||
amdgpu_ras_pre_fini(adev);
|
||||
|
||||
if (adev->gmc.xgmi.num_physical_nodes > 1)
|
||||
amdgpu_xgmi_remove_device(adev);
|
||||
|
||||
@@ -3367,6 +3383,22 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
*
|
||||
* RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
|
||||
*/
|
||||
static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
|
||||
{
|
||||
struct iommu_domain *domain;
|
||||
|
||||
domain = iommu_get_domain_for_dev(adev->dev);
|
||||
if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
|
||||
adev->ram_is_direct_mapped = true;
|
||||
}
|
||||
|
||||
static const struct attribute *amdgpu_dev_attributes[] = {
|
||||
&dev_attr_product_name.attr,
|
||||
&dev_attr_product_number.attr,
|
||||
@@ -3770,6 +3802,8 @@ fence_driver_init:
|
||||
queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
|
||||
msecs_to_jiffies(AMDGPU_RESUME_MS));
|
||||
|
||||
amdgpu_device_check_iommu_direct_map(adev);
|
||||
|
||||
return 0;
|
||||
|
||||
release_ras_con:
|
||||
@@ -3803,7 +3837,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_fini - tear down the driver
|
||||
* amdgpu_device_fini_hw - tear down the driver
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
*
|
||||
@@ -3844,6 +3878,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
|
||||
amdgpu_ucode_sysfs_fini(adev);
|
||||
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
|
||||
|
||||
/* disable ras feature must before hw fini */
|
||||
amdgpu_ras_pre_fini(adev);
|
||||
|
||||
amdgpu_device_ip_fini_early(adev);
|
||||
|
||||
amdgpu_irq_fini_hw(adev);
|
||||
@@ -4284,6 +4321,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
||||
bool from_hypervisor)
|
||||
{
|
||||
int r;
|
||||
struct amdgpu_hive_info *hive = NULL;
|
||||
|
||||
amdgpu_amdkfd_pre_reset(adev);
|
||||
|
||||
amdgpu_amdkfd_pre_reset(adev);
|
||||
|
||||
@@ -4312,8 +4352,19 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
||||
if (r)
|
||||
goto error;
|
||||
|
||||
amdgpu_irq_gpu_reset_resume_helper(adev);
|
||||
r = amdgpu_ib_ring_tests(adev);
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
/* Update PSP FW topology after reset */
|
||||
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
|
||||
r = amdgpu_xgmi_update_topology(hive, adev);
|
||||
|
||||
if (hive)
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
|
||||
if (!r) {
|
||||
amdgpu_irq_gpu_reset_resume_helper(adev);
|
||||
r = amdgpu_ib_ring_tests(adev);
|
||||
amdgpu_amdkfd_post_reset(adev);
|
||||
}
|
||||
|
||||
error:
|
||||
if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
|
||||
@@ -4745,7 +4796,7 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp
|
||||
{
|
||||
struct amdgpu_device *tmp_adev = NULL;
|
||||
|
||||
if (adev->gmc.xgmi.num_physical_nodes > 1) {
|
||||
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
|
||||
if (!hive) {
|
||||
dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
|
||||
return -ENODEV;
|
||||
@@ -4957,7 +5008,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
* We always reset all schedulers for device and all devices for XGMI
|
||||
* hive so that should take care of them too.
|
||||
*/
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
if (!amdgpu_sriov_vf(adev))
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
if (hive) {
|
||||
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
|
||||
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
|
||||
@@ -4998,7 +5050,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
* to put adev in the 1st position.
|
||||
*/
|
||||
INIT_LIST_HEAD(&device_list);
|
||||
if (adev->gmc.xgmi.num_physical_nodes > 1) {
|
||||
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
|
||||
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
|
||||
list_add_tail(&tmp_adev->reset_list, &device_list);
|
||||
if (!list_is_first(&adev->reset_list, &device_list))
|
||||
@@ -5632,3 +5684,42 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
|
||||
|
||||
amdgpu_asic_invalidate_hdp(adev, ring);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_halt() - bring hardware to some kind of halt state
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
*
|
||||
* Bring hardware to some kind of halt state so that no one can touch it
|
||||
* any more. It will help to maintain error context when error occurred.
|
||||
* Compare to a simple hang, the system will keep stable at least for SSH
|
||||
* access. Then it should be trivial to inspect the hardware state and
|
||||
* see what's going on. Implemented as following:
|
||||
*
|
||||
* 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
|
||||
* clears all CPU mappings to device, disallows remappings through page faults
|
||||
* 2. amdgpu_irq_disable_all() disables all interrupts
|
||||
* 3. amdgpu_fence_driver_hw_fini() signals all HW fences
|
||||
* 4. set adev->no_hw_access to avoid potential crashes after setp 5
|
||||
* 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
|
||||
* 6. pci_disable_device() and pci_wait_for_pending_transaction()
|
||||
* flush any in flight DMA operations
|
||||
*/
|
||||
void amdgpu_device_halt(struct amdgpu_device *adev)
|
||||
{
|
||||
struct pci_dev *pdev = adev->pdev;
|
||||
struct drm_device *ddev = adev_to_drm(adev);
|
||||
|
||||
drm_dev_unplug(ddev);
|
||||
|
||||
amdgpu_irq_disable_all(adev);
|
||||
|
||||
amdgpu_fence_driver_hw_fini(adev);
|
||||
|
||||
adev->no_hw_access = true;
|
||||
|
||||
amdgpu_device_unmap_mmio(adev);
|
||||
|
||||
pci_disable_device(pdev);
|
||||
pci_wait_for_pending_transaction(pdev);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user