mirror of
https://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
synced 2026-04-27 03:49:57 -04:00
Merge tag 'amd-drm-next-5.10-2020-09-03' of git://people.freedesktop.org/~agd5f/linux into drm-next
amd-drm-next-5.10-2020-09-03: amdgpu: - RAS fixes - Sienna Cichlid updates - Navy Flounder updates - DCE6 (SI) support in DC - Enable plane rotation - Rework pre-OS vram reservation handling during driver init - Add standard interface to dump GPU metrics table from SMU - Rework tiling and tmz state handling in atomic commits - Pstate fixes - Add voltage and power hwmon interfaces for renoir - SW CTF fixes - S/G display fix for Raven - Print client strings for vmfaults for vega and newer - Manual fan control fixes - Display updates - Reorg power management directory structure - Misc bug fixes - Misc code cleanups amdkfd: - Topology fixes - Add SMI events for thermal throttling and GPU resets radeon: - switch from pci_* to dma_* for dma allocations - PLL fix Scheduler: - Clean up priority levels UAPI: - amdgpu INFO IOCTL query update for TMZ state https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6049 - amdkfd SMI event interface updates https://github.com/RadeonOpenCompute/rocm_smi_lib/tree/therm_thrott From: Alex Deucher <alexdeucher@gmail.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200903222921.4152-1-alexander.deucher@amd.com
This commit is contained in:
@@ -132,7 +132,7 @@ static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct drm_device *ddev = dev_get_drvdata(dev);
|
||||
struct amdgpu_device *adev = ddev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(ddev);
|
||||
uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
|
||||
@@ -157,7 +157,7 @@ static ssize_t amdgpu_device_get_product_name(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct drm_device *ddev = dev_get_drvdata(dev);
|
||||
struct amdgpu_device *adev = ddev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(ddev);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
|
||||
}
|
||||
@@ -179,7 +179,7 @@ static ssize_t amdgpu_device_get_product_number(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct drm_device *ddev = dev_get_drvdata(dev);
|
||||
struct amdgpu_device *adev = ddev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(ddev);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
|
||||
}
|
||||
@@ -201,7 +201,7 @@ static ssize_t amdgpu_device_get_serial_number(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct drm_device *ddev = dev_get_drvdata(dev);
|
||||
struct amdgpu_device *adev = ddev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(ddev);
|
||||
|
||||
return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
|
||||
}
|
||||
@@ -219,7 +219,7 @@ static DEVICE_ATTR(serial_number, S_IRUGO,
|
||||
*/
|
||||
bool amdgpu_device_supports_boco(struct drm_device *dev)
|
||||
{
|
||||
struct amdgpu_device *adev = dev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(dev);
|
||||
|
||||
if (adev->flags & AMD_IS_PX)
|
||||
return true;
|
||||
@@ -236,7 +236,7 @@ bool amdgpu_device_supports_boco(struct drm_device *dev)
|
||||
*/
|
||||
bool amdgpu_device_supports_baco(struct drm_device *dev)
|
||||
{
|
||||
struct amdgpu_device *adev = dev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(dev);
|
||||
|
||||
return amdgpu_asic_supports_baco(adev);
|
||||
}
|
||||
@@ -319,8 +319,12 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
|
||||
{
|
||||
uint32_t ret;
|
||||
|
||||
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
|
||||
return amdgpu_kiq_rreg(adev, reg);
|
||||
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
|
||||
down_read_trylock(&adev->reset_sem)) {
|
||||
ret = amdgpu_kiq_rreg(adev, reg);
|
||||
up_read(&adev->reset_sem);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if ((reg * 4) < adev->rmmio_size)
|
||||
ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
|
||||
@@ -332,6 +336,7 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
|
||||
ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
|
||||
spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
|
||||
}
|
||||
|
||||
trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
|
||||
return ret;
|
||||
}
|
||||
@@ -378,7 +383,9 @@ void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
|
||||
BUG();
|
||||
}
|
||||
|
||||
void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
|
||||
static inline void amdgpu_mm_wreg_mmio(struct amdgpu_device *adev,
|
||||
uint32_t reg, uint32_t v,
|
||||
uint32_t acc_flags)
|
||||
{
|
||||
trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
|
||||
|
||||
@@ -407,8 +414,12 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
|
||||
void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
|
||||
uint32_t acc_flags)
|
||||
{
|
||||
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
|
||||
return amdgpu_kiq_wreg(adev, reg, v);
|
||||
if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev) &&
|
||||
down_read_trylock(&adev->reset_sem)) {
|
||||
amdgpu_kiq_wreg(adev, reg, v);
|
||||
up_read(&adev->reset_sem);
|
||||
return;
|
||||
}
|
||||
|
||||
amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
|
||||
}
|
||||
@@ -653,6 +664,20 @@ static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
|
||||
BUG();
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_asic_init - Wrapper for atom asic_init
|
||||
*
|
||||
* @dev: drm_device pointer
|
||||
*
|
||||
* Does any asic specific work and then calls atom asic init.
|
||||
*/
|
||||
static int amdgpu_device_asic_init(struct amdgpu_device *adev)
|
||||
{
|
||||
amdgpu_asic_pre_asic_init(adev);
|
||||
|
||||
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
|
||||
*
|
||||
@@ -1199,6 +1224,11 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
|
||||
|
||||
amdgpu_gmc_tmz_set(adev);
|
||||
|
||||
if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
|
||||
amdgpu_num_kcq = 8;
|
||||
dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -1211,7 +1241,8 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
|
||||
* Callback for the switcheroo driver. Suspends or resumes the
|
||||
* the asics before or after it is powered up using ACPI methods.
|
||||
*/
|
||||
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
|
||||
static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
|
||||
enum vga_switcheroo_state state)
|
||||
{
|
||||
struct drm_device *dev = pci_get_drvdata(pdev);
|
||||
int r;
|
||||
@@ -1504,7 +1535,7 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
|
||||
adev->enable_virtual_display = false;
|
||||
|
||||
if (amdgpu_virtual_display) {
|
||||
struct drm_device *ddev = adev->ddev;
|
||||
struct drm_device *ddev = adev_to_drm(adev);
|
||||
const char *pci_address_name = pci_name(ddev->pdev);
|
||||
char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
|
||||
|
||||
@@ -1563,7 +1594,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
|
||||
|
||||
adev->firmware.gpu_info_fw = NULL;
|
||||
|
||||
if (adev->discovery_bin) {
|
||||
if (adev->mman.discovery_bin) {
|
||||
amdgpu_discovery_get_gfx_info(adev);
|
||||
|
||||
/*
|
||||
@@ -1935,7 +1966,7 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
|
||||
if (adev->ip_blocks[i].status.hw == true)
|
||||
break;
|
||||
|
||||
if (adev->in_gpu_reset || adev->in_suspend) {
|
||||
if (amdgpu_in_reset(adev) || adev->in_suspend) {
|
||||
r = adev->ip_blocks[i].version->funcs->resume(adev);
|
||||
if (r) {
|
||||
DRM_ERROR("resume of IP block <%s> failed %d\n",
|
||||
@@ -2055,13 +2086,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
|
||||
* it should be called after amdgpu_device_ip_hw_init_phase2 since
|
||||
* for some ASICs the RAS EEPROM code relies on SMU fully functioning
|
||||
* for I2C communication which only true at this point.
|
||||
* recovery_init may fail, but it can free all resources allocated by
|
||||
* itself and its failure should not stop amdgpu init process.
|
||||
*
|
||||
* amdgpu_ras_recovery_init may fail, but the upper only cares the
|
||||
* failure from bad gpu situation and stop amdgpu init process
|
||||
* accordingly. For other failed cases, it will still release all
|
||||
* the resource and print error message, rather than returning one
|
||||
* negative value to upper level.
|
||||
*
|
||||
* Note: theoretically, this should be called before all vram allocations
|
||||
* to protect retired page from abusing
|
||||
*/
|
||||
amdgpu_ras_recovery_init(adev);
|
||||
r = amdgpu_ras_recovery_init(adev);
|
||||
if (r)
|
||||
goto init_failed;
|
||||
|
||||
if (adev->gmc.xgmi.num_physical_nodes > 1)
|
||||
amdgpu_xgmi_add_device(adev);
|
||||
@@ -2106,7 +2143,7 @@ static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
|
||||
AMDGPU_RESET_MAGIC_NUM))
|
||||
return true;
|
||||
|
||||
if (!adev->in_gpu_reset)
|
||||
if (!amdgpu_in_reset(adev))
|
||||
return false;
|
||||
|
||||
/*
|
||||
@@ -2217,9 +2254,7 @@ static int amdgpu_device_enable_mgpu_fan_boost(void)
|
||||
gpu_ins = &(mgpu_info.gpu_ins[i]);
|
||||
adev = gpu_ins->adev;
|
||||
if (!(adev->flags & AMD_IS_APU) &&
|
||||
!gpu_ins->mgpu_fan_enabled &&
|
||||
adev->powerplay.pp_funcs &&
|
||||
adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
|
||||
!gpu_ins->mgpu_fan_enabled) {
|
||||
ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
|
||||
if (ret)
|
||||
break;
|
||||
@@ -2574,17 +2609,16 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
|
||||
AMD_IP_BLOCK_TYPE_IH,
|
||||
};
|
||||
|
||||
for (i = 0; i < adev->num_ip_blocks; i++)
|
||||
adev->ip_blocks[i].status.hw = false;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
|
||||
int j;
|
||||
struct amdgpu_ip_block *block;
|
||||
|
||||
for (j = 0; j < adev->num_ip_blocks; j++) {
|
||||
block = &adev->ip_blocks[j];
|
||||
block = &adev->ip_blocks[i];
|
||||
block->status.hw = false;
|
||||
|
||||
if (block->version->type != ip_order[i] ||
|
||||
for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
|
||||
|
||||
if (block->version->type != ip_order[j] ||
|
||||
!block->status.valid)
|
||||
continue;
|
||||
|
||||
@@ -2777,6 +2811,12 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
|
||||
{
|
||||
switch (asic_type) {
|
||||
#if defined(CONFIG_DRM_AMD_DC)
|
||||
#if defined(CONFIG_DRM_AMD_DC_SI)
|
||||
case CHIP_TAHITI:
|
||||
case CHIP_PITCAIRN:
|
||||
case CHIP_VERDE:
|
||||
case CHIP_OLAND:
|
||||
#endif
|
||||
case CHIP_BONAIRE:
|
||||
case CHIP_KAVERI:
|
||||
case CHIP_KABINI:
|
||||
@@ -2831,7 +2871,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
|
||||
*/
|
||||
bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
|
||||
{
|
||||
if (amdgpu_sriov_vf(adev))
|
||||
if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
|
||||
return false;
|
||||
|
||||
return amdgpu_device_asic_has_dc_support(adev->asic_type);
|
||||
@@ -2842,7 +2882,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
|
||||
{
|
||||
struct amdgpu_device *adev =
|
||||
container_of(__work, struct amdgpu_device, xgmi_reset_work);
|
||||
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
|
||||
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
|
||||
|
||||
/* It's a bug to not have a hive within this function */
|
||||
if (WARN_ON(!hive))
|
||||
@@ -2857,13 +2897,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
|
||||
if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
|
||||
|
||||
task_barrier_enter(&hive->tb);
|
||||
adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
|
||||
adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
|
||||
|
||||
if (adev->asic_reset_res)
|
||||
goto fail;
|
||||
|
||||
task_barrier_exit(&hive->tb);
|
||||
adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
|
||||
adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
|
||||
|
||||
if (adev->asic_reset_res)
|
||||
goto fail;
|
||||
@@ -2879,7 +2919,8 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
|
||||
fail:
|
||||
if (adev->asic_reset_res)
|
||||
DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
|
||||
adev->asic_reset_res, adev->ddev->unique);
|
||||
adev->asic_reset_res, adev_to_drm(adev)->unique);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
}
|
||||
|
||||
static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
|
||||
@@ -2962,8 +3003,6 @@ static const struct attribute *amdgpu_dev_attributes[] = {
|
||||
* amdgpu_device_init - initialize the driver
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
* @ddev: drm dev pointer
|
||||
* @pdev: pci dev pointer
|
||||
* @flags: driver flags
|
||||
*
|
||||
* Initializes the driver info and hw (all asics).
|
||||
@@ -2971,18 +3010,15 @@ static const struct attribute *amdgpu_dev_attributes[] = {
|
||||
* Called at driver startup.
|
||||
*/
|
||||
int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
struct drm_device *ddev,
|
||||
struct pci_dev *pdev,
|
||||
uint32_t flags)
|
||||
{
|
||||
struct drm_device *ddev = adev_to_drm(adev);
|
||||
struct pci_dev *pdev = adev->pdev;
|
||||
int r, i;
|
||||
bool boco = false;
|
||||
u32 max_MBps;
|
||||
|
||||
adev->shutdown = false;
|
||||
adev->dev = &pdev->dev;
|
||||
adev->ddev = ddev;
|
||||
adev->pdev = pdev;
|
||||
adev->flags = flags;
|
||||
|
||||
if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
|
||||
@@ -3038,7 +3074,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
mutex_init(&adev->mn_lock);
|
||||
mutex_init(&adev->virt.vf_errors.lock);
|
||||
hash_init(adev->mn_hash);
|
||||
mutex_init(&adev->lock_reset);
|
||||
atomic_set(&adev->in_gpu_reset, 0);
|
||||
init_rwsem(&adev->reset_sem);
|
||||
mutex_init(&adev->psp.mutex);
|
||||
mutex_init(&adev->notifier_lock);
|
||||
|
||||
@@ -3188,7 +3225,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
goto failed;
|
||||
}
|
||||
DRM_INFO("GPU posting now...\n");
|
||||
r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
|
||||
r = amdgpu_device_asic_init(adev);
|
||||
if (r) {
|
||||
dev_err(adev->dev, "gpu post error!\n");
|
||||
goto failed;
|
||||
@@ -3226,7 +3263,7 @@ fence_driver_init:
|
||||
}
|
||||
|
||||
/* init the mode config */
|
||||
drm_mode_config_init(adev->ddev);
|
||||
drm_mode_config_init(adev_to_drm(adev));
|
||||
|
||||
r = amdgpu_device_ip_init(adev);
|
||||
if (r) {
|
||||
@@ -3352,9 +3389,7 @@ failed:
|
||||
*/
|
||||
void amdgpu_device_fini(struct amdgpu_device *adev)
|
||||
{
|
||||
int r;
|
||||
|
||||
DRM_INFO("amdgpu: finishing device.\n");
|
||||
dev_info(adev->dev, "amdgpu: finishing device.\n");
|
||||
flush_delayed_work(&adev->delayed_init_work);
|
||||
adev->shutdown = true;
|
||||
|
||||
@@ -3368,15 +3403,15 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
|
||||
amdgpu_irq_disable_all(adev);
|
||||
if (adev->mode_info.mode_config_initialized){
|
||||
if (!amdgpu_device_has_dc_support(adev))
|
||||
drm_helper_force_disable_all(adev->ddev);
|
||||
drm_helper_force_disable_all(adev_to_drm(adev));
|
||||
else
|
||||
drm_atomic_helper_shutdown(adev->ddev);
|
||||
drm_atomic_helper_shutdown(adev_to_drm(adev));
|
||||
}
|
||||
amdgpu_fence_driver_fini(adev);
|
||||
if (adev->pm_sysfs_en)
|
||||
amdgpu_pm_sysfs_fini(adev);
|
||||
amdgpu_fbdev_fini(adev);
|
||||
r = amdgpu_device_ip_fini(adev);
|
||||
amdgpu_device_ip_fini(adev);
|
||||
release_firmware(adev->firmware.gpu_info_fw);
|
||||
adev->firmware.gpu_info_fw = NULL;
|
||||
adev->accel_working = false;
|
||||
@@ -3394,7 +3429,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
|
||||
amdgpu_has_atpx_dgpu_power_cntl()) &&
|
||||
!pci_is_thunderbolt_attached(adev->pdev))
|
||||
vga_switcheroo_unregister_client(adev->pdev);
|
||||
if (amdgpu_device_supports_boco(adev->ddev))
|
||||
if (amdgpu_device_supports_boco(adev_to_drm(adev)))
|
||||
vga_switcheroo_fini_domain_pm_ops(adev->dev);
|
||||
vga_client_register(adev->pdev, NULL, NULL, NULL);
|
||||
if (adev->rio_mem)
|
||||
@@ -3410,7 +3445,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
|
||||
sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
|
||||
if (IS_ENABLED(CONFIG_PERF_EVENTS))
|
||||
amdgpu_pmu_fini(adev);
|
||||
if (adev->discovery_bin)
|
||||
if (adev->mman.discovery_bin)
|
||||
amdgpu_discovery_fini(adev);
|
||||
}
|
||||
|
||||
@@ -3436,11 +3471,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
|
||||
struct drm_connector_list_iter iter;
|
||||
int r;
|
||||
|
||||
if (dev == NULL || dev->dev_private == NULL) {
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
adev = dev->dev_private;
|
||||
adev = drm_to_adev(dev);
|
||||
|
||||
if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
|
||||
return 0;
|
||||
@@ -3528,7 +3559,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
|
||||
{
|
||||
struct drm_connector *connector;
|
||||
struct drm_connector_list_iter iter;
|
||||
struct amdgpu_device *adev = dev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(dev);
|
||||
struct drm_crtc *crtc;
|
||||
int r = 0;
|
||||
|
||||
@@ -3537,14 +3568,14 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
|
||||
|
||||
/* post card */
|
||||
if (amdgpu_device_need_post(adev)) {
|
||||
r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
|
||||
r = amdgpu_device_asic_init(adev);
|
||||
if (r)
|
||||
DRM_ERROR("amdgpu asic init failed\n");
|
||||
dev_err(adev->dev, "amdgpu asic init failed\n");
|
||||
}
|
||||
|
||||
r = amdgpu_device_ip_resume(adev);
|
||||
if (r) {
|
||||
DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
|
||||
dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
|
||||
return r;
|
||||
}
|
||||
amdgpu_fence_driver_resume(adev);
|
||||
@@ -3568,7 +3599,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
|
||||
if (r == 0) {
|
||||
r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
|
||||
if (r != 0)
|
||||
DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
|
||||
dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
|
||||
amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
|
||||
amdgpu_bo_unreserve(aobj);
|
||||
}
|
||||
@@ -3658,7 +3689,7 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
|
||||
adev->ip_blocks[i].status.hang =
|
||||
adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
|
||||
if (adev->ip_blocks[i].status.hang) {
|
||||
DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
|
||||
dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
|
||||
asic_hang = true;
|
||||
}
|
||||
}
|
||||
@@ -3719,7 +3750,7 @@ static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
|
||||
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
|
||||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
|
||||
if (adev->ip_blocks[i].status.hang) {
|
||||
DRM_INFO("Some block need full reset!\n");
|
||||
dev_info(adev->dev, "Some block need full reset!\n");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
@@ -3807,7 +3838,7 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
|
||||
else
|
||||
tmo = msecs_to_jiffies(100);
|
||||
|
||||
DRM_INFO("recover vram bo from shadow start\n");
|
||||
dev_info(adev->dev, "recover vram bo from shadow start\n");
|
||||
mutex_lock(&adev->shadow_list_lock);
|
||||
list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
|
||||
|
||||
@@ -3843,11 +3874,11 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
|
||||
dma_fence_put(fence);
|
||||
|
||||
if (r < 0 || tmo <= 0) {
|
||||
DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
|
||||
dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
DRM_INFO("recover vram bo from shadow done\n");
|
||||
dev_info(adev->dev, "recover vram bo from shadow done\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -3907,6 +3938,34 @@ error:
|
||||
return r;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_has_job_running - check if there is any job in mirror list
|
||||
*
|
||||
* @adev: amdgpu device pointer
|
||||
*
|
||||
* check if there is any job in mirror list
|
||||
*/
|
||||
bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
|
||||
{
|
||||
int i;
|
||||
struct drm_sched_job *job;
|
||||
|
||||
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
|
||||
struct amdgpu_ring *ring = adev->rings[i];
|
||||
|
||||
if (!ring || !ring->sched.thread)
|
||||
continue;
|
||||
|
||||
spin_lock(&ring->sched.job_list_lock);
|
||||
job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
|
||||
struct drm_sched_job, node);
|
||||
spin_unlock(&ring->sched.job_list_lock);
|
||||
if (job)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_should_recover_gpu - check if we should try GPU recovery
|
||||
*
|
||||
@@ -3918,7 +3977,7 @@ error:
|
||||
bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
|
||||
{
|
||||
if (!amdgpu_device_ip_check_soft_reset(adev)) {
|
||||
DRM_INFO("Timeout, but no hardware hang detected.\n");
|
||||
dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3958,7 +4017,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
|
||||
return true;
|
||||
|
||||
disabled:
|
||||
DRM_INFO("GPU recovery disabled.\n");
|
||||
dev_info(adev->dev, "GPU recovery disabled.\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -3997,7 +4056,7 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
|
||||
r = amdgpu_device_ip_soft_reset(adev);
|
||||
amdgpu_device_ip_post_soft_reset(adev);
|
||||
if (r || amdgpu_device_ip_check_soft_reset(adev)) {
|
||||
DRM_INFO("soft reset failed, will fallback to full reset!\n");
|
||||
dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
|
||||
need_full_reset = true;
|
||||
}
|
||||
}
|
||||
@@ -4033,8 +4092,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
|
||||
r = amdgpu_asic_reset(tmp_adev);
|
||||
|
||||
if (r) {
|
||||
DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
|
||||
r, tmp_adev->ddev->unique);
|
||||
dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
|
||||
r, adev_to_drm(tmp_adev)->unique);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -4066,8 +4125,8 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
|
||||
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
|
||||
if (need_full_reset) {
|
||||
/* post card */
|
||||
if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
|
||||
DRM_WARN("asic atom init failed!");
|
||||
if (amdgpu_device_asic_init(tmp_adev))
|
||||
dev_warn(tmp_adev->dev, "asic atom init failed!");
|
||||
|
||||
if (!r) {
|
||||
dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
|
||||
@@ -4108,8 +4167,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
|
||||
|
||||
amdgpu_fbdev_set_suspend(tmp_adev, 0);
|
||||
|
||||
/* must succeed. */
|
||||
amdgpu_ras_resume(tmp_adev);
|
||||
/*
|
||||
* The GPU enters bad state once faulty pages
|
||||
* by ECC has reached the threshold, and ras
|
||||
* recovery is scheduled next. So add one check
|
||||
* here to break recovery if it indeed exceeds
|
||||
* bad page threshold, and remind user to
|
||||
* retire this GPU or setting one bigger
|
||||
* bad_page_threshold value to fix this once
|
||||
* probing driver again.
|
||||
*/
|
||||
if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
|
||||
/* must succeed. */
|
||||
amdgpu_ras_resume(tmp_adev);
|
||||
} else {
|
||||
r = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Update PSP FW topology after reset */
|
||||
if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
|
||||
@@ -4117,7 +4191,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
out:
|
||||
if (!r) {
|
||||
amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
|
||||
@@ -4142,16 +4215,19 @@ end:
|
||||
return r;
|
||||
}
|
||||
|
||||
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
|
||||
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
|
||||
struct amdgpu_hive_info *hive)
|
||||
{
|
||||
if (trylock) {
|
||||
if (!mutex_trylock(&adev->lock_reset))
|
||||
return false;
|
||||
} else
|
||||
mutex_lock(&adev->lock_reset);
|
||||
if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
|
||||
return false;
|
||||
|
||||
if (hive) {
|
||||
down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
|
||||
} else {
|
||||
down_write(&adev->reset_sem);
|
||||
}
|
||||
|
||||
atomic_inc(&adev->gpu_reset_counter);
|
||||
adev->in_gpu_reset = true;
|
||||
switch (amdgpu_asic_reset_method(adev)) {
|
||||
case AMD_RESET_METHOD_MODE1:
|
||||
adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
|
||||
@@ -4171,8 +4247,8 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
|
||||
{
|
||||
amdgpu_vf_error_trans_all(adev);
|
||||
adev->mp1_state = PP_MP1_STATE_NONE;
|
||||
adev->in_gpu_reset = false;
|
||||
mutex_unlock(&adev->lock_reset);
|
||||
atomic_set(&adev->in_gpu_reset, 0);
|
||||
up_write(&adev->reset_sem);
|
||||
}
|
||||
|
||||
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
|
||||
@@ -4282,12 +4358,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
* We always reset all schedulers for device and all devices for XGMI
|
||||
* hive so that should take care of them too.
|
||||
*/
|
||||
hive = amdgpu_get_xgmi_hive(adev, true);
|
||||
if (hive && !mutex_trylock(&hive->reset_lock)) {
|
||||
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
|
||||
job ? job->base.id : -1, hive->hive_id);
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
return 0;
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
if (hive) {
|
||||
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
|
||||
DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
|
||||
job ? job->base.id : -1, hive->hive_id);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
return 0;
|
||||
}
|
||||
mutex_lock(&hive->hive_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -4309,11 +4388,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
|
||||
/* block all schedulers and reset given job's ring */
|
||||
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
|
||||
if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
|
||||
DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
|
||||
if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
|
||||
dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
|
||||
job ? job->base.id : -1);
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
return 0;
|
||||
r = 0;
|
||||
goto skip_recovery;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -4385,8 +4464,8 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||
&need_full_reset);
|
||||
/*TODO Should we stop ?*/
|
||||
if (r) {
|
||||
DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
|
||||
r, tmp_adev->ddev->unique);
|
||||
dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
|
||||
r, adev_to_drm(tmp_adev)->unique);
|
||||
tmp_adev->asic_reset_res = r;
|
||||
}
|
||||
}
|
||||
@@ -4422,7 +4501,7 @@ skip_hw_reset:
|
||||
}
|
||||
|
||||
if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
|
||||
drm_helper_resume_force_mode(tmp_adev->ddev);
|
||||
drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
|
||||
}
|
||||
|
||||
tmp_adev->asic_reset_res = 0;
|
||||
@@ -4446,9 +4525,11 @@ skip_sched_resume:
|
||||
amdgpu_device_unlock_adev(tmp_adev);
|
||||
}
|
||||
|
||||
skip_recovery:
|
||||
if (hive) {
|
||||
mutex_unlock(&hive->reset_lock);
|
||||
atomic_set(&hive->in_reset, 0);
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
}
|
||||
|
||||
if (r)
|
||||
@@ -4594,10 +4675,10 @@ static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
|
||||
|
||||
int amdgpu_device_baco_enter(struct drm_device *dev)
|
||||
{
|
||||
struct amdgpu_device *adev = dev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(dev);
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
|
||||
if (!amdgpu_device_supports_baco(adev->ddev))
|
||||
if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
|
||||
return -ENOTSUPP;
|
||||
|
||||
if (ras && ras->supported)
|
||||
@@ -4608,11 +4689,11 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
|
||||
|
||||
int amdgpu_device_baco_exit(struct drm_device *dev)
|
||||
{
|
||||
struct amdgpu_device *adev = dev->dev_private;
|
||||
struct amdgpu_device *adev = drm_to_adev(dev);
|
||||
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
|
||||
int ret = 0;
|
||||
|
||||
if (!amdgpu_device_supports_baco(adev->ddev))
|
||||
if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
|
||||
return -ENOTSUPP;
|
||||
|
||||
ret = amdgpu_dpm_baco_exit(adev);
|
||||
|
||||
Reference in New Issue
Block a user