Commit 44357a1b authored by Jonathan Kim, committed by Alex Deucher

drm/amdgpu: get extended xgmi topology data

The TA has a limit to the amount of data that can be retrieved from
GET_TOPOLOGY.  For setups that exceed this limit, the xGMI topology
needs to be re-initialized and data needs to be re-fetched from the
extended link records by setting a flag in the shared command buffer.

The number of hops and the number of links must be accumulated by the
driver. Other data points are all fetched from the first request.
Because the TA has already exceeded its link record limit, it cannot
hold bidirectional information; if it did, the driver would have to do
more than two fetches.  The driver therefore has to reflect the
topology information in the opposite direction itself.

v2: squashed with internal reviewed fix
Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Hawking Zhang <hawking.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 3a6e4106
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "amdgpu.h" #include "amdgpu.h"
#include "amdgpu_psp.h" #include "amdgpu_psp.h"
#include "amdgpu_ucode.h" #include "amdgpu_ucode.h"
#include "amdgpu_xgmi.h"
#include "soc15_common.h" #include "soc15_common.h"
#include "psp_v3_1.h" #include "psp_v3_1.h"
#include "psp_v10_0.h" #include "psp_v10_0.h"
...@@ -1026,7 +1027,7 @@ int psp_xgmi_terminate(struct psp_context *psp) ...@@ -1026,7 +1027,7 @@ int psp_xgmi_terminate(struct psp_context *psp)
return 0; return 0;
} }
int psp_xgmi_initialize(struct psp_context *psp) int psp_xgmi_initialize(struct psp_context *psp, bool set_extended_data, bool load_ta)
{ {
struct ta_xgmi_shared_memory *xgmi_cmd; struct ta_xgmi_shared_memory *xgmi_cmd;
int ret; int ret;
...@@ -1036,6 +1037,9 @@ int psp_xgmi_initialize(struct psp_context *psp) ...@@ -1036,6 +1037,9 @@ int psp_xgmi_initialize(struct psp_context *psp)
!psp->xgmi.start_addr) !psp->xgmi.start_addr)
return -ENOENT; return -ENOENT;
if (!load_ta)
goto invoke;
if (!psp->xgmi_context.initialized) { if (!psp->xgmi_context.initialized) {
ret = psp_xgmi_init_shared_buf(psp); ret = psp_xgmi_init_shared_buf(psp);
if (ret) if (ret)
...@@ -1047,9 +1051,11 @@ int psp_xgmi_initialize(struct psp_context *psp) ...@@ -1047,9 +1051,11 @@ int psp_xgmi_initialize(struct psp_context *psp)
if (ret) if (ret)
return ret; return ret;
invoke:
/* Initialize XGMI session */ /* Initialize XGMI session */
xgmi_cmd = (struct ta_xgmi_shared_memory *)(psp->xgmi_context.xgmi_shared_buf); xgmi_cmd = (struct ta_xgmi_shared_memory *)(psp->xgmi_context.xgmi_shared_buf);
memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory)); memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
xgmi_cmd->flag_extend_link_record = set_extended_data;
xgmi_cmd->cmd_id = TA_COMMAND_XGMI__INITIALIZE; xgmi_cmd->cmd_id = TA_COMMAND_XGMI__INITIALIZE;
ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id); ret = psp_xgmi_invoke(psp, xgmi_cmd->cmd_id);
...@@ -1103,9 +1109,56 @@ static bool psp_xgmi_peer_link_info_supported(struct psp_context *psp) ...@@ -1103,9 +1109,56 @@ static bool psp_xgmi_peer_link_info_supported(struct psp_context *psp)
psp->xgmi.feature_version >= 0x2000000b; psp->xgmi.feature_version >= 0x2000000b;
} }
/*
 * Chips that support extended topology information require the driver to
 * reflect topology information in the opposite direction. This is
 * because the TA has already exceeded its link record limit and if the
 * TA holds bi-directional information, the driver would have to do
 * multiple fetches instead of just two.
 *
 * @psp:       PSP context of the source device, i.e. the device whose
 *             topology entry is being mirrored.
 * @node_info: the source device's topology entry describing one peer; its
 *             node_id selects the peer (mirror) device, and its num_hops /
 *             num_links are copied into the peer's entry for the source.
 *
 * Finds the hive member whose node_id equals node_info.node_id and, in that
 * member's cached top_info, updates the entry whose node_id equals the
 * source device's node_id. Does nothing if no hive is available or no
 * matching device/entry is found.
 */
static void psp_xgmi_reflect_topology_info(struct psp_context *psp,
					   struct psp_xgmi_node_info node_info)
{
	struct amdgpu_device *mirror_adev;
	struct amdgpu_hive_info *hive;
	uint64_t src_node_id = psp->adev->gmc.xgmi.node_id;
	uint64_t dst_node_id = node_info.node_id;
	uint8_t dst_num_hops = node_info.num_hops;
	uint8_t dst_num_links = node_info.num_links;

	/*
	 * amdgpu_get_xgmi_hive() can fail and, on success, returns the hive
	 * with a reference held that must be dropped with
	 * amdgpu_put_xgmi_hive() once we are done walking the device list.
	 */
	hive = amdgpu_get_xgmi_hive(psp->adev);
	if (!hive)
		return;

	list_for_each_entry(mirror_adev, &hive->device_list, gmc.xgmi.head) {
		struct psp_xgmi_topology_info *mirror_top_info;
		int j;

		if (mirror_adev->gmc.xgmi.node_id != dst_node_id)
			continue;

		mirror_top_info = &mirror_adev->psp.xgmi_context.top_info;
		for (j = 0; j < mirror_top_info->num_nodes; j++) {
			if (mirror_top_info->nodes[j].node_id != src_node_id)
				continue;

			mirror_top_info->nodes[j].num_hops = dst_num_hops;
			/*
			 * prevent 0 num_links value re-reflection since reflection
			 * criteria is based on num_hops (direct or indirect).
			 */
			if (dst_num_links)
				mirror_top_info->nodes[j].num_links = dst_num_links;

			/* at most one entry per source node; stop searching */
			break;
		}

		/* the matching peer device has been handled */
		break;
	}

	/* release the reference taken by amdgpu_get_xgmi_hive() */
	amdgpu_put_xgmi_hive(hive);
}
int psp_xgmi_get_topology_info(struct psp_context *psp, int psp_xgmi_get_topology_info(struct psp_context *psp,
int number_devices, int number_devices,
struct psp_xgmi_topology_info *topology) struct psp_xgmi_topology_info *topology,
bool get_extended_data)
{ {
struct ta_xgmi_shared_memory *xgmi_cmd; struct ta_xgmi_shared_memory *xgmi_cmd;
struct ta_xgmi_cmd_get_topology_info_input *topology_info_input; struct ta_xgmi_cmd_get_topology_info_input *topology_info_input;
...@@ -1118,6 +1171,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, ...@@ -1118,6 +1171,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
xgmi_cmd = (struct ta_xgmi_shared_memory *)psp->xgmi_context.xgmi_shared_buf; xgmi_cmd = (struct ta_xgmi_shared_memory *)psp->xgmi_context.xgmi_shared_buf;
memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory)); memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
xgmi_cmd->flag_extend_link_record = get_extended_data;
/* Fill in the shared memory with topology information as input */ /* Fill in the shared memory with topology information as input */
topology_info_input = &xgmi_cmd->xgmi_in_message.get_topology_info; topology_info_input = &xgmi_cmd->xgmi_in_message.get_topology_info;
...@@ -1140,10 +1194,19 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, ...@@ -1140,10 +1194,19 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
topology_info_output = &xgmi_cmd->xgmi_out_message.get_topology_info; topology_info_output = &xgmi_cmd->xgmi_out_message.get_topology_info;
topology->num_nodes = xgmi_cmd->xgmi_out_message.get_topology_info.num_nodes; topology->num_nodes = xgmi_cmd->xgmi_out_message.get_topology_info.num_nodes;
for (i = 0; i < topology->num_nodes; i++) { for (i = 0; i < topology->num_nodes; i++) {
topology->nodes[i].node_id = topology_info_output->nodes[i].node_id; /* extended data will either be 0 or equal to non-extended data */
if (topology_info_output->nodes[i].num_hops)
topology->nodes[i].num_hops = topology_info_output->nodes[i].num_hops; topology->nodes[i].num_hops = topology_info_output->nodes[i].num_hops;
topology->nodes[i].is_sharing_enabled = topology_info_output->nodes[i].is_sharing_enabled;
topology->nodes[i].sdma_engine = topology_info_output->nodes[i].sdma_engine; /* non-extended data gets everything here so no need to update */
if (!get_extended_data) {
topology->nodes[i].node_id = topology_info_output->nodes[i].node_id;
topology->nodes[i].is_sharing_enabled =
topology_info_output->nodes[i].is_sharing_enabled;
topology->nodes[i].sdma_engine =
topology_info_output->nodes[i].sdma_engine;
}
} }
/* Invoke xgmi ta again to get the link information */ /* Invoke xgmi ta again to get the link information */
...@@ -1158,9 +1221,18 @@ int psp_xgmi_get_topology_info(struct psp_context *psp, ...@@ -1158,9 +1221,18 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
return ret; return ret;
link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info; link_info_output = &xgmi_cmd->xgmi_out_message.get_link_info;
for (i = 0; i < topology->num_nodes; i++) for (i = 0; i < topology->num_nodes; i++) {
topology->nodes[i].num_links = /* accumulate num_links on extended data */
topology->nodes[i].num_links = get_extended_data ?
topology->nodes[i].num_links +
link_info_output->nodes[i].num_links :
link_info_output->nodes[i].num_links; link_info_output->nodes[i].num_links;
/* reflect the topology information for bi-directionality */
if (psp->xgmi_context.supports_extended_data &&
get_extended_data && topology->nodes[i].num_hops)
psp_xgmi_reflect_topology_info(psp, topology->nodes[i]);
}
} }
return 0; return 0;
...@@ -2817,7 +2889,7 @@ static int psp_resume(void *handle) ...@@ -2817,7 +2889,7 @@ static int psp_resume(void *handle)
} }
if (adev->gmc.xgmi.num_physical_nodes > 1) { if (adev->gmc.xgmi.num_physical_nodes > 1) {
ret = psp_xgmi_initialize(psp); ret = psp_xgmi_initialize(psp, false, true);
/* Warning the XGMI seesion initialize failure /* Warning the XGMI seesion initialize failure
* Instead of stop driver initialization * Instead of stop driver initialization
*/ */
...@@ -3123,6 +3195,7 @@ static int psp_init_sos_base_fw(struct amdgpu_device *adev) ...@@ -3123,6 +3195,7 @@ static int psp_init_sos_base_fw(struct amdgpu_device *adev)
adev->psp.sos.size_bytes = le32_to_cpu(sos_hdr->sos.size_bytes); adev->psp.sos.size_bytes = le32_to_cpu(sos_hdr->sos.size_bytes);
adev->psp.sos.start_addr = ucode_array_start_addr + adev->psp.sos.start_addr = ucode_array_start_addr +
le32_to_cpu(sos_hdr->sos.offset_bytes); le32_to_cpu(sos_hdr->sos.offset_bytes);
adev->psp.xgmi_context.supports_extended_data = false;
} else { } else {
/* Load alternate PSP SOS FW */ /* Load alternate PSP SOS FW */
sos_hdr_v1_3 = (const struct psp_firmware_header_v1_3 *)adev->psp.sos_fw->data; sos_hdr_v1_3 = (const struct psp_firmware_header_v1_3 *)adev->psp.sos_fw->data;
...@@ -3137,6 +3210,7 @@ static int psp_init_sos_base_fw(struct amdgpu_device *adev) ...@@ -3137,6 +3210,7 @@ static int psp_init_sos_base_fw(struct amdgpu_device *adev)
adev->psp.sos.size_bytes = le32_to_cpu(sos_hdr_v1_3->sos_aux.size_bytes); adev->psp.sos.size_bytes = le32_to_cpu(sos_hdr_v1_3->sos_aux.size_bytes);
adev->psp.sos.start_addr = ucode_array_start_addr + adev->psp.sos.start_addr = ucode_array_start_addr +
le32_to_cpu(sos_hdr_v1_3->sos_aux.offset_bytes); le32_to_cpu(sos_hdr_v1_3->sos_aux.offset_bytes);
adev->psp.xgmi_context.supports_extended_data = true;
} }
if ((adev->psp.sys.size_bytes == 0) || (adev->psp.sos.size_bytes == 0)) { if ((adev->psp.sys.size_bytes == 0) || (adev->psp.sos.size_bytes == 0)) {
......
...@@ -143,6 +143,7 @@ struct psp_xgmi_context { ...@@ -143,6 +143,7 @@ struct psp_xgmi_context {
uint64_t xgmi_shared_mc_addr; uint64_t xgmi_shared_mc_addr;
void *xgmi_shared_buf; void *xgmi_shared_buf;
struct psp_xgmi_topology_info top_info; struct psp_xgmi_topology_info top_info;
bool supports_extended_data;
}; };
struct psp_ras_context { struct psp_ras_context {
...@@ -433,14 +434,15 @@ int psp_gpu_reset(struct amdgpu_device *adev); ...@@ -433,14 +434,15 @@ int psp_gpu_reset(struct amdgpu_device *adev);
int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx, int psp_update_vcn_sram(struct amdgpu_device *adev, int inst_idx,
uint64_t cmd_gpu_addr, int cmd_size); uint64_t cmd_gpu_addr, int cmd_size);
int psp_xgmi_initialize(struct psp_context *psp); int psp_xgmi_initialize(struct psp_context *psp, bool set_extended_data, bool load_ta);
int psp_xgmi_terminate(struct psp_context *psp); int psp_xgmi_terminate(struct psp_context *psp);
int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id); int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
int psp_xgmi_get_hive_id(struct psp_context *psp, uint64_t *hive_id); int psp_xgmi_get_hive_id(struct psp_context *psp, uint64_t *hive_id);
int psp_xgmi_get_node_id(struct psp_context *psp, uint64_t *node_id); int psp_xgmi_get_node_id(struct psp_context *psp, uint64_t *node_id);
int psp_xgmi_get_topology_info(struct psp_context *psp, int psp_xgmi_get_topology_info(struct psp_context *psp,
int number_devices, int number_devices,
struct psp_xgmi_topology_info *topology); struct psp_xgmi_topology_info *topology,
bool get_extended_data);
int psp_xgmi_set_topology_info(struct psp_context *psp, int psp_xgmi_set_topology_info(struct psp_context *psp,
int number_devices, int number_devices,
struct psp_xgmi_topology_info *topology); struct psp_xgmi_topology_info *topology);
......
...@@ -498,6 +498,32 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev, ...@@ -498,6 +498,32 @@ int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
return -EINVAL; return -EINVAL;
} }
/*
 * Devices that support extended data require the entire hive to initialize with
 * the shared memory buffer flag set.
 *
 * Walks every device on the hive's device list and calls
 * psp_xgmi_initialize() on it with the requested extended-data flag and
 * load_ta set to false. Stops at the first failure, logs it against the
 * failing device and returns that error code; returns 0 when every device
 * initialized successfully.
 *
 * Hive locks and conditions apply - see amdgpu_xgmi_add_device
 */
static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_info *hive,
							  bool set_extended_data)
{
	struct amdgpu_device *member;

	list_for_each_entry(member, &hive->device_list, gmc.xgmi.head) {
		int err = psp_xgmi_initialize(&member->psp, set_extended_data, false);

		if (!err)
			continue;

		/* first failure aborts the whole pass */
		dev_err(member->dev,
			"XGMI: Failed to initialize xgmi session for data partition %i\n",
			set_extended_data);
		return err;
	}

	return 0;
}
int amdgpu_xgmi_add_device(struct amdgpu_device *adev) int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
{ {
struct psp_xgmi_topology_info *top_info; struct psp_xgmi_topology_info *top_info;
...@@ -512,7 +538,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) ...@@ -512,7 +538,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
if (!adev->gmc.xgmi.pending_reset && if (!adev->gmc.xgmi.pending_reset &&
amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) { amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
ret = psp_xgmi_initialize(&adev->psp); ret = psp_xgmi_initialize(&adev->psp, false, true);
if (ret) { if (ret) {
dev_err(adev->dev, dev_err(adev->dev,
"XGMI: Failed to initialize xgmi session\n"); "XGMI: Failed to initialize xgmi session\n");
...@@ -575,7 +601,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) ...@@ -575,7 +601,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
/* get latest topology info for each device from psp */ /* get latest topology info for each device from psp */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count, ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
&tmp_adev->psp.xgmi_context.top_info); &tmp_adev->psp.xgmi_context.top_info, false);
if (ret) { if (ret) {
dev_err(tmp_adev->dev, dev_err(tmp_adev->dev,
"XGMI: Get topology failure on device %llx, hive %llx, ret %d", "XGMI: Get topology failure on device %llx, hive %llx, ret %d",
...@@ -585,6 +611,34 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) ...@@ -585,6 +611,34 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
goto exit_unlock; goto exit_unlock;
} }
} }
/* get topology again for hives that support extended data */
if (adev->psp.xgmi_context.supports_extended_data) {
/* initialize the hive to get extended data. */
ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, true);
if (ret)
goto exit_unlock;
/* get the extended data. */
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
&tmp_adev->psp.xgmi_context.top_info, true);
if (ret) {
dev_err(tmp_adev->dev,
"XGMI: Get topology for extended data failure on device %llx, hive %llx, ret %d",
tmp_adev->gmc.xgmi.node_id,
tmp_adev->gmc.xgmi.hive_id, ret);
goto exit_unlock;
}
}
/* initialize the hive to get non-extended data for the next round. */
ret = amdgpu_xgmi_initialize_hive_get_data_partition(hive, false);
if (ret)
goto exit_unlock;
}
} }
if (!ret && !adev->gmc.xgmi.pending_reset) if (!ret && !adev->gmc.xgmi.pending_reset)
......
...@@ -134,7 +134,8 @@ struct ta_xgmi_shared_memory { ...@@ -134,7 +134,8 @@ struct ta_xgmi_shared_memory {
uint32_t cmd_id; uint32_t cmd_id;
uint32_t resp_id; uint32_t resp_id;
enum ta_xgmi_status xgmi_status; enum ta_xgmi_status xgmi_status;
uint32_t reserved; uint8_t flag_extend_link_record;
uint8_t reserved0[3];
union ta_xgmi_cmd_input xgmi_in_message; union ta_xgmi_cmd_input xgmi_in_message;
union ta_xgmi_cmd_output xgmi_out_message; union ta_xgmi_cmd_output xgmi_out_message;
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment