drm/amdgpu: get extended xgmi topology data

The TA has a limit to the amount of data that can be retrieved from
GET_TOPOLOGY.  For setups that exceed this limit, the xGMI topology
needs to be re-initialized and data needs to be re-fetched from the
extended link records by setting a flag in the shared command buffer.

The number of hops and the number of links must be accumulated by the
driver. Other data points are all fetched from the first request.
Because the TA has already exceeded its link record limit, it
cannot hold bidirectional information.  The driver therefore has to
reflect the topology information in the opposite direction itself;
otherwise it would have to do more than two fetches.

v2: squashed with internal reviewed fix

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Hawking Zhang <hawking.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
Jonathan Kim
2021-08-03 19:01:55 -04:00
committed by Alex Deucher
parent 3a6e4106a8
commit 44357a1bd5
4 changed files with 145 additions and 14 deletions

View File

@@ -498,6 +498,32 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
return -EINVAL;
}
/*
 * (Re)initialize the xgmi session of every device on the hive's device list,
 * passing set_extended_data through to psp_xgmi_initialize().
 *
 * Devices that support extended data require the entire hive to initialize
 * with the shared memory buffer flag set, which is why the whole list is
 * walked rather than a single device.
 *
 * Stops at the first device that fails, logs the failure against that device
 * and returns its error code; returns 0 when every device succeeded.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							   bool set_extended_data)
{
	struct amdgpu_device *member;
	int err = 0;

	list_for_each_entry(member, &hive->device_list, gmc.xgmi.head) {
		err = psp_xgmi_initialize(&member->psp, set_extended_data, false);
		if (!err)
			continue;

		/* Report against the device that failed and give up on the rest. */
		dev_err(member->dev,
			"XGMI: Failed to initialize xgmi session for data partition %i\n",
			set_extended_data);
		break;
	}

	return err;
}
int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{
struct psp_xgmi_topology_info *top_info;
@@ -512,7 +538,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.pending_reset &&
amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
ret = psp_xgmi_initialize(&adev->psp);
ret = psp_xgmi_initialize(&adev->psp, false, true);
if (ret) {
dev_err(adev->dev,
"XGMI: Failed to initialize xgmi session\n");
@@ -575,7 +601,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
/* get latest topology info for each device from psp */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
&tmp_adev->psp.xgmi_context.top_info);
&tmp_adev->psp.xgmi_context.top_info, false);
if (ret) {
dev_err(tmp_adev->dev,
"XGMI: Get topology failure on device %llx, hive %llx, ret %d",
@@ -585,6 +611,34 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
goto exit_unlock;
}
}
/* get topology again for hives that support extended data */
if (adev->psp.xgmi_context.supports_extended_data) {
/* initialize the hive to get extended data. */
ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
if (ret)
goto exit_unlock;
/* get the extended data. */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
&tmp_adev->psp.xgmi_context.top_info, true);
if (ret) {
dev_err(tmp_adev->dev,
"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
tmp_adev->gmc.xgmi.node_id,
tmp_adev->gmc.xgmi.hive_id, ret);
goto exit_unlock;
}
}
/* initialize the hive to get non-extended data for the next round. */
ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
if (ret)
goto exit_unlock;
}
}
if (!ret && !adev->gmc.xgmi.pending_reset)