Mirror of https://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git/ (synced 2026-04-18 06:33:43 -04:00)
Merge drm/drm-next into drm-misc-next
Backmerging to get a late RC of v6.10 before moving into v6.11.

Signed-off-by: Thomas Zimmermann <tzimmermann@suse.de>
This commit is contained in:
@@ -679,7 +679,7 @@ uint32_t amdgpu_device_xcc_rreg(struct amdgpu_device *adev,
|
||||
amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
|
||||
GC_HWIP, false,
|
||||
&rlcg_flag)) {
|
||||
ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, xcc_id);
|
||||
ret = amdgpu_virt_rlcg_reg_rw(adev, reg, 0, rlcg_flag, GET_INST(GC, xcc_id));
|
||||
} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
|
||||
amdgpu_sriov_runtime(adev) &&
|
||||
down_read_trylock(&adev->reset_domain->sem)) {
|
||||
@@ -810,7 +810,7 @@ void amdgpu_device_xcc_wreg(struct amdgpu_device *adev,
|
||||
amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags,
|
||||
GC_HWIP, true,
|
||||
&rlcg_flag)) {
|
||||
amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, xcc_id);
|
||||
amdgpu_virt_rlcg_reg_rw(adev, reg, v, rlcg_flag, GET_INST(GC, xcc_id));
|
||||
} else if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
|
||||
amdgpu_sriov_runtime(adev) &&
|
||||
down_read_trylock(&adev->reset_domain->sem)) {
|
||||
@@ -1308,6 +1308,7 @@ static int amdgpu_device_asic_init(struct amdgpu_device *adev)
|
||||
amdgpu_asic_pre_asic_init(adev);
|
||||
|
||||
if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
|
||||
amdgpu_psp_wait_for_bootloader(adev);
|
||||
ret = amdgpu_atomfirmware_asic_init(adev, true);
|
||||
@@ -2349,7 +2350,6 @@ void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
|
||||
static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
|
||||
{
|
||||
const char *chip_name;
|
||||
char fw_name[40];
|
||||
int err;
|
||||
const struct gpu_info_firmware_header_v1_0 *hdr;
|
||||
|
||||
@@ -2383,12 +2383,12 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
|
||||
break;
|
||||
}
|
||||
|
||||
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
|
||||
err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
|
||||
err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw,
|
||||
"amdgpu/%s_gpu_info.bin", chip_name);
|
||||
if (err) {
|
||||
dev_err(adev->dev,
|
||||
"Failed to get gpu_info firmware \"%s\"\n",
|
||||
fw_name);
|
||||
"Failed to get gpu_info firmware \"%s_gpu_info.bin\"\n",
|
||||
chip_name);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -3142,7 +3142,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
|
||||
return r;
|
||||
}
|
||||
|
||||
amdgpu_ras_set_error_query_ready(adev, true);
|
||||
if (!amdgpu_in_reset(adev))
|
||||
amdgpu_ras_set_error_query_ready(adev, true);
|
||||
|
||||
amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
|
||||
amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
|
||||
@@ -4048,6 +4049,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
|
||||
mutex_init(&adev->grbm_idx_mutex);
|
||||
mutex_init(&adev->mn_lock);
|
||||
mutex_init(&adev->virt.vf_errors.lock);
|
||||
mutex_init(&adev->virt.rlcg_reg_lock);
|
||||
hash_init(adev->mn_hash);
|
||||
mutex_init(&adev->psp.mutex);
|
||||
mutex_init(&adev->notifier_lock);
|
||||
@@ -5011,7 +5013,8 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
|
||||
shadow = vmbo->shadow;
|
||||
|
||||
/* No need to recover an evicted BO */
|
||||
if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
|
||||
if (!shadow->tbo.resource ||
|
||||
shadow->tbo.resource->mem_type != TTM_PL_TT ||
|
||||
shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
|
||||
shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
|
||||
continue;
|
||||
@@ -5055,29 +5058,29 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
|
||||
* amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
|
||||
*
|
||||
* @adev: amdgpu_device pointer
|
||||
* @from_hypervisor: request from hypervisor
|
||||
* @reset_context: amdgpu reset context pointer
|
||||
*
|
||||
* do VF FLR and reinitialize Asic
|
||||
* return 0 means succeeded otherwise failed
|
||||
*/
|
||||
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
|
||||
bool from_hypervisor)
|
||||
struct amdgpu_reset_context *reset_context)
|
||||
{
|
||||
int r;
|
||||
struct amdgpu_hive_info *hive = NULL;
|
||||
int retry_limit = 0;
|
||||
|
||||
retry:
|
||||
amdgpu_amdkfd_pre_reset(adev);
|
||||
|
||||
amdgpu_device_stop_pending_resets(adev);
|
||||
|
||||
if (from_hypervisor)
|
||||
if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
|
||||
if (!amdgpu_ras_get_fed_status(adev))
|
||||
amdgpu_virt_ready_to_reset(adev);
|
||||
amdgpu_virt_wait_reset(adev);
|
||||
clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
|
||||
r = amdgpu_virt_request_full_gpu(adev, true);
|
||||
else
|
||||
} else {
|
||||
r = amdgpu_virt_reset_gpu(adev);
|
||||
}
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
amdgpu_ras_set_fed(adev, false);
|
||||
amdgpu_irq_gpu_reset_resume_helper(adev);
|
||||
|
||||
@@ -5087,7 +5090,7 @@ retry:
|
||||
/* Resume IP prior to SMC */
|
||||
r = amdgpu_device_ip_reinit_early_sriov(adev);
|
||||
if (r)
|
||||
goto error;
|
||||
return r;
|
||||
|
||||
amdgpu_virt_init_data_exchange(adev);
|
||||
|
||||
@@ -5098,38 +5101,41 @@ retry:
|
||||
/* now we are okay to resume SMC/CP/SDMA */
|
||||
r = amdgpu_device_ip_reinit_late_sriov(adev);
|
||||
if (r)
|
||||
goto error;
|
||||
return r;
|
||||
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
/* Update PSP FW topology after reset */
|
||||
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
|
||||
r = amdgpu_xgmi_update_topology(hive, adev);
|
||||
|
||||
if (hive)
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
if (!r) {
|
||||
r = amdgpu_ib_ring_tests(adev);
|
||||
r = amdgpu_ib_ring_tests(adev);
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
amdgpu_amdkfd_post_reset(adev);
|
||||
}
|
||||
|
||||
error:
|
||||
if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
|
||||
if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
|
||||
amdgpu_inc_vram_lost(adev);
|
||||
r = amdgpu_device_recover_vram(adev);
|
||||
}
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
/* need to be called during full access so we can't do it later like
|
||||
* bare-metal does.
|
||||
*/
|
||||
amdgpu_amdkfd_post_reset(adev);
|
||||
amdgpu_virt_release_full_gpu(adev, true);
|
||||
|
||||
if (AMDGPU_RETRY_SRIOV_RESET(r)) {
|
||||
if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
|
||||
retry_limit++;
|
||||
goto retry;
|
||||
} else
|
||||
DRM_ERROR("GPU reset retry is beyond the retry limit\n");
|
||||
}
|
||||
|
||||
return r;
|
||||
/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
|
||||
if (amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 2) ||
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
|
||||
amdgpu_ras_resume(adev);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -5220,11 +5226,14 @@ int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
|
||||
|
||||
dev_info(adev->dev, "GPU mode1 reset\n");
|
||||
|
||||
/* Cache the state before bus master disable. The saved config space
|
||||
* values are used in other cases like restore after mode-2 reset.
|
||||
*/
|
||||
amdgpu_device_cache_pci_state(adev->pdev);
|
||||
|
||||
/* disable BM */
|
||||
pci_clear_master(adev->pdev);
|
||||
|
||||
amdgpu_device_cache_pci_state(adev->pdev);
|
||||
|
||||
if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
|
||||
dev_info(adev->dev, "GPU smu mode1 reset\n");
|
||||
ret = amdgpu_dpm_mode1_reset(adev);
|
||||
@@ -5371,11 +5380,13 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
|
||||
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
|
||||
amdgpu_reset_reg_dumps(tmp_adev);
|
||||
|
||||
dev_info(tmp_adev->dev, "Dumping IP State\n");
|
||||
/* Trigger ip dump before we reset the asic */
|
||||
for (i = 0; i < tmp_adev->num_ip_blocks; i++)
|
||||
if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
|
||||
tmp_adev->ip_blocks[i].version->funcs
|
||||
->dump_ip_state((void *)tmp_adev);
|
||||
dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
|
||||
}
|
||||
|
||||
reset_context->reset_device_list = device_list_handle;
|
||||
@@ -5688,6 +5699,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
int i, r = 0;
|
||||
bool need_emergency_restart = false;
|
||||
bool audio_suspended = false;
|
||||
int retry_limit = AMDGPU_MAX_RETRY_LIMIT;
|
||||
|
||||
/*
|
||||
* Special case: RAS triggered and full reset isn't supported
|
||||
@@ -5722,7 +5734,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
* to put adev in the 1st position.
|
||||
*/
|
||||
INIT_LIST_HEAD(&device_list);
|
||||
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
|
||||
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
|
||||
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
|
||||
list_add_tail(&tmp_adev->reset_list, &device_list);
|
||||
if (adev->shutdown)
|
||||
@@ -5769,8 +5781,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
|
||||
cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
|
||||
|
||||
if (!amdgpu_sriov_vf(tmp_adev))
|
||||
amdgpu_amdkfd_pre_reset(tmp_adev);
|
||||
amdgpu_amdkfd_pre_reset(tmp_adev, reset_context);
|
||||
|
||||
/*
|
||||
* Mark these ASICs to be reseted as untracked first
|
||||
@@ -5823,34 +5834,40 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||
r, adev_to_drm(tmp_adev)->unique);
|
||||
tmp_adev->asic_reset_res = r;
|
||||
}
|
||||
|
||||
if (!amdgpu_sriov_vf(tmp_adev))
|
||||
/*
|
||||
* Drop all pending non scheduler resets. Scheduler resets
|
||||
* were already dropped during drm_sched_stop
|
||||
*/
|
||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||
}
|
||||
|
||||
/* Actual ASIC resets if needed.*/
|
||||
/* Host driver will handle XGMI hive reset for SRIOV */
|
||||
if (amdgpu_sriov_vf(adev)) {
|
||||
r = amdgpu_device_reset_sriov(adev, job ? false : true);
|
||||
if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) {
|
||||
dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n");
|
||||
amdgpu_ras_set_fed(adev, true);
|
||||
set_bit(AMDGPU_HOST_FLR, &reset_context->flags);
|
||||
}
|
||||
|
||||
r = amdgpu_device_reset_sriov(adev, reset_context);
|
||||
if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) {
|
||||
amdgpu_virt_release_full_gpu(adev, true);
|
||||
goto retry;
|
||||
}
|
||||
if (r)
|
||||
adev->asic_reset_res = r;
|
||||
|
||||
/* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
|
||||
if (amdgpu_ip_version(adev, GC_HWIP, 0) ==
|
||||
IP_VERSION(9, 4, 2) ||
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
|
||||
amdgpu_ras_resume(adev);
|
||||
} else {
|
||||
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
|
||||
if (r && r == -EAGAIN)
|
||||
goto retry;
|
||||
}
|
||||
|
||||
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
|
||||
/*
|
||||
* Drop any pending non scheduler resets queued before reset is done.
|
||||
* Any reset scheduled after this point would be valid. Scheduler resets
|
||||
* were already dropped during drm_sched_stop and no new ones can come
|
||||
* in before drm_sched_start.
|
||||
*/
|
||||
amdgpu_device_stop_pending_resets(tmp_adev);
|
||||
}
|
||||
|
||||
skip_hw_reset:
|
||||
|
||||
/* Post ASIC reset for all devs .*/
|
||||
@@ -5944,13 +5961,18 @@ static void amdgpu_device_partner_bandwidth(struct amdgpu_device *adev,
|
||||
*speed = PCI_SPEED_UNKNOWN;
|
||||
*width = PCIE_LNK_WIDTH_UNKNOWN;
|
||||
|
||||
while ((parent = pci_upstream_bridge(parent))) {
|
||||
/* skip upstream/downstream switches internal to dGPU*/
|
||||
if (parent->vendor == PCI_VENDOR_ID_ATI)
|
||||
continue;
|
||||
*speed = pcie_get_speed_cap(parent);
|
||||
*width = pcie_get_width_cap(parent);
|
||||
break;
|
||||
if (amdgpu_device_pcie_dynamic_switching_supported(adev)) {
|
||||
while ((parent = pci_upstream_bridge(parent))) {
|
||||
/* skip upstream/downstream switches internal to dGPU*/
|
||||
if (parent->vendor == PCI_VENDOR_ID_ATI)
|
||||
continue;
|
||||
*speed = pcie_get_speed_cap(parent);
|
||||
*width = pcie_get_width_cap(parent);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
/* use the current speeds rather than max if switching is not supported */
|
||||
pcie_bandwidth_available(adev->pdev, NULL, speed, width);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6165,7 +6187,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
|
||||
adev->nbio.funcs->enable_doorbell_interrupt)
|
||||
adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
|
||||
|
||||
if (amdgpu_passthrough(adev) &&
|
||||
if (amdgpu_passthrough(adev) && adev->nbio.funcs &&
|
||||
adev->nbio.funcs->clear_doorbell_interrupt)
|
||||
adev->nbio.funcs->clear_doorbell_interrupt(adev);
|
||||
|
||||
@@ -6265,19 +6287,11 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
|
||||
struct amdgpu_reset_context reset_context;
|
||||
u32 memsize;
|
||||
struct list_head device_list;
|
||||
struct amdgpu_hive_info *hive;
|
||||
int hive_ras_recovery = 0;
|
||||
struct amdgpu_ras *ras;
|
||||
|
||||
/* PCI error slot reset should be skipped During RAS recovery */
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
if (hive) {
|
||||
hive_ras_recovery = atomic_read(&hive->ras_recovery);
|
||||
amdgpu_put_xgmi_hive(hive);
|
||||
}
|
||||
ras = amdgpu_ras_get_context(adev);
|
||||
if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3)) &&
|
||||
ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery))
|
||||
if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) ||
|
||||
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) &&
|
||||
amdgpu_ras_in_recovery(adev))
|
||||
return PCI_ERS_RESULT_RECOVERED;
|
||||
|
||||
DRM_INFO("PCI error: slot reset callback!!\n");
|
||||
@@ -6519,6 +6533,22 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
|
||||
spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_get_gang - return a reference to the current gang
|
||||
* @adev: amdgpu_device pointer
|
||||
*
|
||||
* Returns: A new reference to the current gang leader.
|
||||
*/
|
||||
struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev)
|
||||
{
|
||||
struct dma_fence *fence;
|
||||
|
||||
rcu_read_lock();
|
||||
fence = dma_fence_get_rcu_safe(&adev->gang_submit);
|
||||
rcu_read_unlock();
|
||||
return fence;
|
||||
}
|
||||
|
||||
/**
|
||||
* amdgpu_device_switch_gang - switch to a new gang
|
||||
* @adev: amdgpu_device pointer
|
||||
@@ -6535,10 +6565,7 @@ struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
|
||||
|
||||
do {
|
||||
dma_fence_put(old);
|
||||
rcu_read_lock();
|
||||
old = dma_fence_get_rcu_safe(&adev->gang_submit);
|
||||
rcu_read_unlock();
|
||||
|
||||
old = amdgpu_device_get_gang(adev);
|
||||
if (old == gang)
|
||||
break;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user