Merge tag 'amd-drm-next-6.13-2024-10-25' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-6.13-2024-10-25:

amdgpu:
- SDMA queue reset support
- SMU 13.0.6 updates
- Add debugfs interface to help limit jpeg queue scheduling for testing
- JPEG 4.0.3 updates
- Initial runtime repartitioning support
- GFX9 fixes
- Misc code cleanups
- Rework IP structures to better handle multiple instances of an IP
- DML updates
- DSC fixes
- HDR fixes
- Brightness control updates
- Runtime pm cleanup
- DMCUB fixes
- DCN 3.5 updates
- Struct drm_edid cleanup
- Fetch EDID from _DDC if available
- Ring noop optimizations
- MES logging fixes
- 3DLUT fixes
- DCN 4.x fixes
- SMU 13.x fixes
- Fixes for set_soft_freq_range()
- ACPI fixes
- SMU 14.x updates
- PSR-SU fixes
- fdinfo cleanup
- DCN documentation updates

amdkfd:
- Misc code cleanups
- Increase event FIFO size
- Copy wave state fixes for SDMA

radeon:
- Fix possible overflow in packet3 check
- Late init connector fix
- Always set GEM function pointer

Documentation:
- Update drm-memory documentation

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20241025132336.2416913-1-alexander.deucher@amd.com
Signed-off-by: Dave Airlie <airlied@redhat.com>

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

@@ -145,6 +145,51 @@ const char *amdgpu_asic_name[] = {
"LAST",
};
#define AMDGPU_IP_BLK_MASK_ALL GENMASK(AMDGPU_MAX_IP_NUM - 1, 0)
/*
* Default init level where all blocks are expected to be initialized. This is
* the level of initialization expected by default and also after a full reset
* of the device.
*/
struct amdgpu_init_level amdgpu_init_default = {
.level = AMDGPU_INIT_LEVEL_DEFAULT,
.hwini_ip_block_mask = AMDGPU_IP_BLK_MASK_ALL,
};
/*
* Minimal blocks needed to be initialized before an XGMI hive can be reset. This
* is used for cases like reset on initialization where the entire hive needs to
* be reset before first use.
*/
struct amdgpu_init_level amdgpu_init_minimal_xgmi = {
.level = AMDGPU_INIT_LEVEL_MINIMAL_XGMI,
.hwini_ip_block_mask =
BIT(AMD_IP_BLOCK_TYPE_GMC) | BIT(AMD_IP_BLOCK_TYPE_SMC) |
BIT(AMD_IP_BLOCK_TYPE_COMMON) | BIT(AMD_IP_BLOCK_TYPE_IH) |
BIT(AMD_IP_BLOCK_TYPE_PSP)
};
static inline bool amdgpu_ip_member_of_hwini(struct amdgpu_device *adev,
enum amd_ip_block_type block)
{
return (adev->init_lvl->hwini_ip_block_mask & (1U << block)) != 0;
}
void amdgpu_set_init_level(struct amdgpu_device *adev,
enum amdgpu_init_lvl_id lvl)
{
switch (lvl) {
case AMDGPU_INIT_LEVEL_MINIMAL_XGMI:
adev->init_lvl = &amdgpu_init_minimal_xgmi;
break;
case AMDGPU_INIT_LEVEL_DEFAULT:
fallthrough;
default:
adev->init_lvl = &amdgpu_init_default;
break;
}
}
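
To make the new mechanism concrete, here is a minimal userspace sketch of the level and mask mechanics, using simplified stand-in types and names rather than the driver's (illustration only; it compiles standalone):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for enum amd_ip_block_type and struct amdgpu_init_level. */
enum ip_type { IP_COMMON, IP_GMC, IP_IH, IP_PSP, IP_SMC, IP_GFX, IP_MAX };

struct init_level {
	uint32_t hwini_ip_block_mask;
};

static const struct init_level lvl_default = {
	/* every block takes part in hw init, like AMDGPU_IP_BLK_MASK_ALL */
	.hwini_ip_block_mask = (1U << IP_MAX) - 1,
};

static const struct init_level lvl_minimal_xgmi = {
	/* only the blocks needed before an XGMI hive can be reset */
	.hwini_ip_block_mask = (1U << IP_GMC) | (1U << IP_SMC) |
			       (1U << IP_COMMON) | (1U << IP_IH) |
			       (1U << IP_PSP),
};

static bool member_of_hwini(const struct init_level *lvl, enum ip_type t)
{
	return (lvl->hwini_ip_block_mask & (1U << t)) != 0;
}

int main(void)
{
	/* GFX is skipped at the minimal XGMI level but included by default */
	printf("gfx at minimal-xgmi: %d\n", member_of_hwini(&lvl_minimal_xgmi, IP_GFX));
	printf("gfx at default:     %d\n", member_of_hwini(&lvl_default, IP_GFX));
	return 0;
}
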
static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev);
/**
@@ -228,6 +273,42 @@ void amdgpu_reg_state_sysfs_fini(struct amdgpu_device *adev)
sysfs_remove_bin_file(&adev->dev->kobj, &bin_attr_reg_state);
}
int amdgpu_ip_block_suspend(struct amdgpu_ip_block *ip_block)
{
int r;
if (ip_block->version->funcs->suspend) {
r = ip_block->version->funcs->suspend(ip_block);
if (r) {
dev_err(ip_block->adev->dev,
"suspend of IP block <%s> failed %d\n",
ip_block->version->funcs->name, r);
return r;
}
}
ip_block->status.hw = false;
return 0;
}
int amdgpu_ip_block_resume(struct amdgpu_ip_block *ip_block)
{
int r;
if (ip_block->version->funcs->resume) {
r = ip_block->version->funcs->resume(ip_block);
if (r) {
dev_err(ip_block->adev->dev,
"resume of IP block <%s> failed %d\n",
ip_block->version->funcs->name, r);
return r;
}
}
ip_block->status.hw = true;
return 0;
}
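
These wrappers fold a pattern that the rest of this diff removes from each call site: the callback is optional, the failure is logged once with the block name, and status.hw is flipped in exactly one place. A hypothetical helper built on them (a sketch, not part of the patch):

/* Hypothetical: suspend all live blocks in reverse order, stopping at
 * the first failure.  The wrapper already logs the error and manages
 * status.hw, so the loop body stays minimal. */
static int example_suspend_all(struct amdgpu_device *adev)
{
	int i, r;

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (!adev->ip_blocks[i].status.hw)
			continue;
		r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
		if (r)
			return r;
	}
	return 0;
}
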
/**
* DOC: board_info
*
@@ -1656,7 +1737,7 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
}
/* Don't post if we need to reset whole hive on init */
if (adev->gmc.xgmi.pending_reset)
if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
return false;
if (adev->has_hw_reset) {
@@ -2160,9 +2241,12 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
if (!adev->ip_blocks[i].status.valid)
continue;
if (adev->ip_blocks[i].version->type == block_type) {
r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
if (r)
return r;
if (adev->ip_blocks[i].version->funcs->wait_for_idle) {
r = adev->ip_blocks[i].version->funcs->wait_for_idle(
&adev->ip_blocks[i]);
if (r)
return r;
}
break;
}
}
@@ -2171,26 +2255,24 @@ int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
}
/**
* amdgpu_device_ip_is_idle - is the hardware IP idle
* amdgpu_device_ip_is_valid - is the hardware IP enabled
*
* @adev: amdgpu_device pointer
* @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
*
* Check if the hardware IP is idle or not.
* Returns true if it the IP is idle, false if not.
* Check if the hardware IP is enabled or not.
* Returns true if the IP is enabled, false if not.
*/
bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
enum amd_ip_block_type block_type)
bool amdgpu_device_ip_is_valid(struct amdgpu_device *adev,
enum amd_ip_block_type block_type)
{
int i;
for (i = 0; i < adev->num_ip_blocks; i++) {
if (!adev->ip_blocks[i].status.valid)
continue;
if (adev->ip_blocks[i].version->type == block_type)
return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
return adev->ip_blocks[i].status.valid;
}
return true;
return false;
}
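
Note the semantic shift above: the old is_idle query called the block's is_idle hook and defaulted to true, while the renamed helper reports whether a block of the given type exists and is enabled, defaulting to false. A hypothetical caller (VCN chosen arbitrarily for illustration, not from the patch):

/* Hypothetical caller of the renamed helper. */
if (amdgpu_device_ip_is_valid(adev, AMD_IP_BLOCK_TYPE_VCN))
	dev_info(adev->dev, "VCN block present and enabled\n");
else
	dev_info(adev->dev, "VCN block missing or disabled\n");
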
@@ -2272,6 +2354,8 @@ int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
ip_block_version->funcs->name);
adev->ip_blocks[adev->num_ip_blocks].adev = adev;
adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
return 0;
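
Storing the adev backpointer at registration is what lets every IP callback take a struct amdgpu_ip_block * instead of a void *adev, as the converted call sites throughout this diff show. A hypothetical callback written against the reworked prototype (illustration only, not from the patch):

/* Hypothetical callback: the block's device is recovered through the
 * new backpointer, with no void-pointer cast. */
static int foo_ip_hw_init(struct amdgpu_ip_block *ip_block)
{
	struct amdgpu_device *adev = ip_block->adev;

	dev_dbg(adev->dev, "hw_init for <%s>\n",
		ip_block->version->funcs->name);
	return 0;
}
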
@@ -2567,25 +2651,25 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
total = true;
for (i = 0; i < adev->num_ip_blocks; i++) {
ip_block = &adev->ip_blocks[i];
if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
DRM_WARN("disabled ip block: %d <%s>\n",
i, adev->ip_blocks[i].version->funcs->name);
adev->ip_blocks[i].status.valid = false;
} else {
if (adev->ip_blocks[i].version->funcs->early_init) {
r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
if (r == -ENOENT) {
adev->ip_blocks[i].status.valid = false;
} else if (r) {
DRM_ERROR("early_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
total = false;
} else {
adev->ip_blocks[i].status.valid = true;
}
} else if (ip_block->version->funcs->early_init) {
r = ip_block->version->funcs->early_init(ip_block);
if (r == -ENOENT) {
adev->ip_blocks[i].status.valid = false;
} else if (r) {
DRM_ERROR("early_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
total = false;
} else {
adev->ip_blocks[i].status.valid = true;
}
} else {
adev->ip_blocks[i].status.valid = true;
}
/* get the vbios after the asic_funcs are set up */
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
@@ -2634,10 +2718,13 @@ static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
continue;
if (adev->ip_blocks[i].status.hw)
continue;
if (!amdgpu_ip_member_of_hwini(
adev, adev->ip_blocks[i].version->type))
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
(amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
r = adev->ip_blocks[i].version->funcs->hw_init(adev);
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
DRM_ERROR("hw_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
@@ -2659,7 +2746,10 @@ static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
continue;
if (adev->ip_blocks[i].status.hw)
continue;
r = adev->ip_blocks[i].version->funcs->hw_init(adev);
if (!amdgpu_ip_member_of_hwini(
adev, adev->ip_blocks[i].version->type))
continue;
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
DRM_ERROR("hw_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
@@ -2682,6 +2772,10 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
continue;
if (!amdgpu_ip_member_of_hwini(adev,
AMD_IP_BLOCK_TYPE_PSP))
break;
if (!adev->ip_blocks[i].status.sw)
continue;
@@ -2690,22 +2784,18 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
break;
if (amdgpu_in_reset(adev) || adev->in_suspend) {
r = adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
if (r)
return r;
}
} else {
r = adev->ip_blocks[i].version->funcs->hw_init(adev);
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
DRM_ERROR("hw_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
return r;
}
adev->ip_blocks[i].status.hw = true;
}
adev->ip_blocks[i].status.hw = true;
break;
}
}
@@ -2787,6 +2877,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
*/
static int amdgpu_device_ip_init(struct amdgpu_device *adev)
{
bool init_badpage;
int i, r;
r = amdgpu_ras_init(adev);
@@ -2796,17 +2887,23 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
for (i = 0; i < adev->num_ip_blocks; i++) {
if (!adev->ip_blocks[i].status.valid)
continue;
r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
if (r) {
DRM_ERROR("sw_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
goto init_failed;
if (adev->ip_blocks[i].version->funcs->sw_init) {
r = adev->ip_blocks[i].version->funcs->sw_init(&adev->ip_blocks[i]);
if (r) {
DRM_ERROR("sw_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
goto init_failed;
}
}
adev->ip_blocks[i].status.sw = true;
if (!amdgpu_ip_member_of_hwini(
adev, adev->ip_blocks[i].version->type))
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
/* need to do common hw init early so everything is set up for gmc */
r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
DRM_ERROR("hw_init %d failed %d\n", i, r);
goto init_failed;
@@ -2823,7 +2920,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
goto init_failed;
}
r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
r = adev->ip_blocks[i].version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
DRM_ERROR("hw_init %d failed %d\n", i, r);
goto init_failed;
@@ -2896,7 +2993,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
* Note: theoretically, this should be called before all vram allocations
* to protect retired pages from being reused
*/
r = amdgpu_ras_recovery_init(adev);
init_badpage = (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
r = amdgpu_ras_recovery_init(adev, init_badpage);
if (r)
goto init_failed;
@@ -2936,7 +3034,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
amdgpu_ttm_set_buffer_funcs_status(adev, true);
/* Don't init kfd if whole hive need to be reset during init */
if (!adev->gmc.xgmi.pending_reset) {
if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
kgd2kfd_init_zone_device(adev);
amdgpu_amdkfd_device_init(adev);
}
@@ -3136,7 +3234,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
if (!adev->ip_blocks[i].status.hw)
continue;
if (adev->ip_blocks[i].version->funcs->late_init) {
r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
r = adev->ip_blocks[i].version->funcs->late_init(&adev->ip_blocks[i]);
if (r) {
DRM_ERROR("late_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
@@ -3207,6 +3305,25 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
return 0;
}
static void amdgpu_ip_block_hw_fini(struct amdgpu_ip_block *ip_block)
{
int r;
if (!ip_block->version->funcs->hw_fini) {
DRM_ERROR("hw_fini of IP block <%s> not defined\n",
ip_block->version->funcs->name);
} else {
r = ip_block->version->funcs->hw_fini(ip_block);
/* XXX handle errors */
if (r) {
DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
ip_block->version->funcs->name, r);
}
}
ip_block->status.hw = false;
}
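
This helper consolidates hw_fini teardown: failures are deliberately logged at debug level only, and status.hw is cleared regardless, since teardown has to keep making progress. A hypothetical loop over it (a sketch; the in-tree callers below do the equivalent):

/* Hypothetical teardown over the new helper; per-block failures are
 * not propagated because fini must always run to completion. */
static void example_hw_fini_all(struct amdgpu_device *adev)
{
	int i;

	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
		if (adev->ip_blocks[i].status.hw)
			amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
	}
}
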
/**
* amdgpu_device_smu_fini_early - smu hw_fini wrapper
*
@@ -3216,7 +3333,7 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
*/
static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
{
int i, r;
int i;
if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(9, 0, 0))
return;
@@ -3225,13 +3342,7 @@ static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
if (!adev->ip_blocks[i].status.hw)
continue;
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
/* XXX handle errors */
if (r) {
DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
}
adev->ip_blocks[i].status.hw = false;
amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
break;
}
}
@@ -3245,7 +3356,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
if (!adev->ip_blocks[i].version->funcs->early_fini)
continue;
r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
r = adev->ip_blocks[i].version->funcs->early_fini(&adev->ip_blocks[i]);
if (r) {
DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
@@ -3264,14 +3375,7 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
if (!adev->ip_blocks[i].status.hw)
continue;
r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
/* XXX handle errors */
if (r) {
DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
}
adev->ip_blocks[i].status.hw = false;
amdgpu_ip_block_hw_fini(&adev->ip_blocks[i]);
}
if (amdgpu_sriov_vf(adev)) {
@@ -3317,12 +3421,13 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
amdgpu_ib_pool_fini(adev);
amdgpu_seq64_fini(adev);
}
r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
/* XXX handle errors */
if (r) {
DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
if (adev->ip_blocks[i].version->funcs->sw_fini) {
r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]);
/* XXX handle errors */
if (r) {
DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
}
}
adev->ip_blocks[i].status.sw = false;
adev->ip_blocks[i].status.valid = false;
@@ -3332,7 +3437,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
if (!adev->ip_blocks[i].status.late_initialized)
continue;
if (adev->ip_blocks[i].version->funcs->late_fini)
adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
adev->ip_blocks[i].version->funcs->late_fini(&adev->ip_blocks[i]);
adev->ip_blocks[i].status.late_initialized = false;
}
@@ -3404,15 +3509,9 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
continue;
/* XXX handle errors */
r = adev->ip_blocks[i].version->funcs->suspend(adev);
/* XXX handle errors */
if (r) {
DRM_ERROR("suspend of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
if (r)
return r;
}
adev->ip_blocks[i].status.hw = false;
}
return 0;
@@ -3450,14 +3549,9 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
}
/* skip unnecessary suspend if we do not initialize them yet */
if (adev->gmc.xgmi.pending_reset &&
!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
adev->ip_blocks[i].status.hw = false;
if (!amdgpu_ip_member_of_hwini(
adev, adev->ip_blocks[i].version->type))
continue;
}
/* skip suspend of gfx/mes and psp for S0ix
* gfx is in gfxoff state, so on resume it will exit gfxoff just
@@ -3491,13 +3585,9 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
continue;
/* XXX handle errors */
r = adev->ip_blocks[i].version->funcs->suspend(adev);
/* XXX handle errors */
if (r) {
DRM_ERROR("suspend of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
}
r = amdgpu_ip_block_suspend(&adev->ip_blocks[i]);
adev->ip_blocks[i].status.hw = false;
/* handle putting the SMC in the appropriate state */
if (!amdgpu_sriov_vf(adev)) {
if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
@@ -3571,7 +3661,7 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
!block->status.valid)
continue;
r = block->version->funcs->hw_init(adev);
r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
if (r)
return r;
@@ -3610,15 +3700,19 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
block->status.hw)
continue;
if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
r = block->version->funcs->resume(adev);
else
r = block->version->funcs->hw_init(adev);
DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
if (r)
return r;
block->status.hw = true;
if (block->version->type == AMD_IP_BLOCK_TYPE_SMC) {
r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
if (r)
return r;
} else {
r = block->version->funcs->hw_init(&adev->ip_blocks[i]);
if (r) {
DRM_ERROR("hw_init of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
return r;
}
block->status.hw = true;
}
}
}
@@ -3649,13 +3743,9 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
r = adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
if (r)
return r;
}
adev->ip_blocks[i].status.hw = true;
}
}
@@ -3687,13 +3777,9 @@ static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
continue;
r = adev->ip_blocks[i].version->funcs->resume(adev);
if (r) {
DRM_ERROR("resume of IP block <%s> failed %d\n",
adev->ip_blocks[i].version->funcs->name, r);
r = amdgpu_ip_block_resume(&adev->ip_blocks[i]);
if (r)
return r;
}
adev->ip_blocks[i].status.hw = true;
}
return 0;
@@ -4194,6 +4280,12 @@ int amdgpu_device_init(struct amdgpu_device *adev,
amdgpu_device_set_mcbp(adev);
/*
* By default, use default mode where all blocks are expected to be
* initialized. At present a 'swinit' of blocks is required to be
* completed before the need for a different level is detected.
*/
amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
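
In other words, probe now assumes the full default level until swinit has run, and only afterwards may lower it. A condensed, hypothetical sketch of that ordering (error handling elided; conditions simplified from the hunk further below):

/* Sketch only: assume the default level, complete swinit, then lower
 * the level if a hive reset is pending on this device. */
amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_DEFAULT);
r = amdgpu_device_ip_early_init(adev);
if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev) &&
    adev->gmc.xgmi.num_physical_nodes)
	amdgpu_set_init_level(adev, AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
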
/* early init functions */
r = amdgpu_device_ip_early_init(adev);
if (r)
@@ -4266,20 +4358,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
if (adev->gmc.xgmi.num_physical_nodes) {
dev_info(adev->dev, "Pending hive reset.\n");
adev->gmc.xgmi.pending_reset = true;
/* Only need to init necessary block for SMU to handle the reset */
for (i = 0; i < adev->num_ip_blocks; i++) {
if (!adev->ip_blocks[i].status.valid)
continue;
if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
DRM_DEBUG("IP %s disabled for hw_init.\n",
adev->ip_blocks[i].version->funcs->name);
adev->ip_blocks[i].status.hw = true;
}
}
amdgpu_set_init_level(adev,
AMDGPU_INIT_LEVEL_MINIMAL_XGMI);
} else if (amdgpu_ip_version(adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 10) &&
!amdgpu_device_has_display_hardware(adev)) {
r = psp_gpu_reset(adev);
@@ -4387,7 +4467,7 @@ fence_driver_init:
/* enable clockgating, etc. after ib tests, etc. since some blocks require
* explicit gating rather than handling it automatically.
*/
if (!adev->gmc.xgmi.pending_reset) {
if (adev->init_lvl->level != AMDGPU_INIT_LEVEL_MINIMAL_XGMI) {
r = amdgpu_device_ip_late_init(adev);
if (r) {
dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
@@ -4437,6 +4517,7 @@ fence_driver_init:
amdgpu_fru_sysfs_init(adev);
amdgpu_reg_state_sysfs_init(adev);
amdgpu_xcp_cfg_sysfs_init(adev);
if (IS_ENABLED(CONFIG_PERF_EVENTS))
r = amdgpu_pmu_init(adev);
@@ -4464,9 +4545,8 @@ fence_driver_init:
if (px)
vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
if (adev->gmc.xgmi.pending_reset)
queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
msecs_to_jiffies(AMDGPU_RESUME_MS));
if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
amdgpu_xgmi_reset_on_init(adev);
amdgpu_device_check_iommu_direct_map(adev);
@@ -4560,6 +4640,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
amdgpu_fru_sysfs_fini(adev);
amdgpu_reg_state_sysfs_fini(adev);
amdgpu_xcp_cfg_sysfs_fini(adev);
/* disable ras feature must before hw fini */
amdgpu_ras_pre_fini(adev);
@@ -4695,7 +4776,7 @@ int amdgpu_device_prepare(struct drm_device *dev)
continue;
if (!adev->ip_blocks[i].version->funcs->prepare_suspend)
continue;
r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev);
r = adev->ip_blocks[i].version->funcs->prepare_suspend(&adev->ip_blocks[i]);
if (r)
goto unprepare;
}
@@ -4899,7 +4980,8 @@ static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
continue;
if (adev->ip_blocks[i].version->funcs->check_soft_reset)
adev->ip_blocks[i].status.hang =
adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
adev->ip_blocks[i].version->funcs->check_soft_reset(
&adev->ip_blocks[i]);
if (adev->ip_blocks[i].status.hang) {
dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
asic_hang = true;
@@ -4928,7 +5010,7 @@ static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
continue;
if (adev->ip_blocks[i].status.hang &&
adev->ip_blocks[i].version->funcs->pre_soft_reset) {
r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
r = adev->ip_blocks[i].version->funcs->pre_soft_reset(&adev->ip_blocks[i]);
if (r)
return r;
}
@@ -4990,7 +5072,7 @@ static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
continue;
if (adev->ip_blocks[i].status.hang &&
adev->ip_blocks[i].version->funcs->soft_reset) {
r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
r = adev->ip_blocks[i].version->funcs->soft_reset(&adev->ip_blocks[i]);
if (r)
return r;
}
@@ -5019,7 +5101,7 @@ static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
continue;
if (adev->ip_blocks[i].status.hang &&
adev->ip_blocks[i].version->funcs->post_soft_reset)
r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
r = adev->ip_blocks[i].version->funcs->post_soft_reset(&adev->ip_blocks[i]);
if (r)
return r;
}
@@ -5310,7 +5392,7 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
for (i = 0; i < tmp_adev->num_ip_blocks; i++)
if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
tmp_adev->ip_blocks[i].version->funcs
->dump_ip_state((void *)tmp_adev);
->dump_ip_state((void *)&tmp_adev->ip_blocks[i]);
dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
}
@@ -5326,74 +5408,25 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
return r;
}
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
{
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset, vram_lost = false;
int r = 0;
struct list_head *device_list_handle;
bool full_reset, vram_lost = false;
struct amdgpu_device *tmp_adev;
int r;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
device_list_handle = reset_context->reset_device_list;
reset_context->reset_device_list = device_list_handle;
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
/* If reset handler not implemented, continue; otherwise return */
if (r == -EOPNOTSUPP)
r = 0;
else
return r;
if (!device_list_handle)
return -EINVAL;
/* Reset handler not implemented, use the default method */
need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
*/
if (!skip_hw_reset && need_full_reset) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
tmp_adev->gmc.xgmi.pending_reset = false;
if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
r = -EALREADY;
} else
r = amdgpu_asic_reset(tmp_adev);
if (r) {
dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
r, adev_to_drm(tmp_adev)->unique);
goto out;
}
}
/* For XGMI wait for all resets to complete before proceed */
if (!r) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
flush_work(&tmp_adev->xgmi_reset_work);
r = tmp_adev->asic_reset_res;
if (r)
break;
}
}
}
}
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
amdgpu_ras_reset_error_count(tmp_adev, AMDGPU_RAS_BLOCK__MMHUB);
}
amdgpu_ras_intr_cleared();
}
full_reset = test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
r = 0;
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
if (need_full_reset) {
/* After reset, it's default init level */
amdgpu_set_init_level(tmp_adev, AMDGPU_INIT_LEVEL_DEFAULT);
if (full_reset) {
/* post card */
amdgpu_ras_set_fed(tmp_adev, false);
r = amdgpu_device_asic_init(tmp_adev);
@@ -5483,7 +5516,6 @@ out:
r = amdgpu_ib_ring_tests(tmp_adev);
if (r) {
dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
need_full_reset = true;
r = -EAGAIN;
goto end;
}
@@ -5494,10 +5526,85 @@ out:
}
end:
if (need_full_reset)
return r;
}
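
The function above carries the tail of the old amdgpu_do_asic_reset(): re-posting, IP reinit, and IB ring tests, taking its device list from the reset context. A hypothetical standalone use (a sketch; device_list_handle and reset_context are caller-owned):

/* Hypothetical: a path that has already performed the ASIC reset
 * itself can reuse just the split-out reinit stage. */
reset_context->reset_device_list = device_list_handle;
r = amdgpu_device_reinit_after_reset(reset_context);
if (r == -EAGAIN)
	dev_warn(adev->dev, "reinit requested another full reset\n");
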
int amdgpu_do_asic_reset(struct list_head *device_list_handle,
struct amdgpu_reset_context *reset_context)
{
struct amdgpu_device *tmp_adev = NULL;
bool need_full_reset, skip_hw_reset;
int r = 0;
/* Try reset handler method first */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
reset_context->reset_device_list = device_list_handle;
r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
/* If reset handler not implemented, continue; otherwise return */
if (r == -EOPNOTSUPP)
r = 0;
else
return r;
/* Reset handler not implemented, use the default method */
need_full_reset =
test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
/*
* ASIC reset has to be done on all XGMI hive nodes ASAP
* to allow proper links negotiation in FW (within 1 sec)
*/
if (!skip_hw_reset && need_full_reset) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
/* For XGMI run all resets in parallel to speed up the process */
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
if (!queue_work(system_unbound_wq,
&tmp_adev->xgmi_reset_work))
r = -EALREADY;
} else
r = amdgpu_asic_reset(tmp_adev);
if (r) {
dev_err(tmp_adev->dev,
"ASIC reset failed with error, %d for drm dev, %s",
r, adev_to_drm(tmp_adev)->unique);
goto out;
}
}
/* For XGMI wait for all resets to complete before proceed */
if (!r) {
list_for_each_entry(tmp_adev, device_list_handle,
reset_list) {
if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
flush_work(&tmp_adev->xgmi_reset_work);
r = tmp_adev->asic_reset_res;
if (r)
break;
}
}
}
}
if (!r && amdgpu_ras_intr_triggered()) {
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
amdgpu_ras_reset_error_count(tmp_adev,
AMDGPU_RAS_BLOCK__MMHUB);
}
amdgpu_ras_intr_cleared();
}
r = amdgpu_device_reinit_after_reset(reset_context);
if (r == -EAGAIN)
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
else
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
out:
return r;
}
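
With the -EAGAIN result now translated into the AMDGPU_NEED_FULL_RESET flag at this level, a caller could hypothetically retry once with escalation (a sketch, not from the patch):

/* Hypothetical retry built on the flag translation above: if reinit
 * fails with -EAGAIN, NEED_FULL_RESET is set in the context, so one
 * more call escalates to a full reset. */
r = amdgpu_do_asic_reset(device_list_handle, reset_context);
if (r == -EAGAIN)
	r = amdgpu_do_asic_reset(device_list_handle, reset_context);
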