drm/amdkfd: Cleanup IO links during KFD device removal
Currently, the IO links to a device being removed from the topology are not cleared. As a result, dangling links are left behind in the KFD topology.

This patch fixes the following:
1. Clean up all IO links to the device being removed.
2. Ensure that node numbering in sysfs and the nodes' proximity domain values are consistent after the device is removed:
   a. Adding a device and removing a GPU device are made mutually exclusive.
   b. The global proximity domain counter is no longer required to be an atomic counter. A normal 32-bit counter can be used instead.
3. Update generation_count to let user mode know that the topology has changed due to device removal.

CC: Shuotao Xu <shuotaoxu@microsoft.com>
Reviewed-by: Shuotao Xu <shuotaoxu@microsoft.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
eb85fc2389
commit
46d18d510d
@ -1056,7 +1056,7 @@ static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink,
|
|||||||
* table, add corresponded reversed direction link now.
|
* table, add corresponded reversed direction link now.
|
||||||
*/
|
*/
|
||||||
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
|
if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) {
|
||||||
to_dev = kfd_topology_device_by_proximity_domain(id_to);
|
to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
|
||||||
if (!to_dev)
|
if (!to_dev)
|
||||||
return -ENODEV;
|
return -ENODEV;
|
||||||
/* same everything but the other direction */
|
/* same everything but the other direction */
|
||||||
@ -2225,7 +2225,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image,
|
|||||||
*/
|
*/
|
||||||
if (kdev->hive_id) {
|
if (kdev->hive_id) {
|
||||||
for (nid = 0; nid < proximity_domain; ++nid) {
|
for (nid = 0; nid < proximity_domain; ++nid) {
|
||||||
peer_dev = kfd_topology_device_by_proximity_domain(nid);
|
peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
|
||||||
if (!peer_dev->gpu)
|
if (!peer_dev->gpu)
|
||||||
continue;
|
continue;
|
||||||
if (peer_dev->gpu->hive_id != kdev->hive_id)
|
if (peer_dev->gpu->hive_id != kdev->hive_id)
|
||||||
|
@ -1016,6 +1016,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu);
|
|||||||
int kfd_topology_remove_device(struct kfd_dev *gpu);
|
int kfd_topology_remove_device(struct kfd_dev *gpu);
|
||||||
struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
|
struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
|
||||||
uint32_t proximity_domain);
|
uint32_t proximity_domain);
|
||||||
|
struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
|
||||||
|
uint32_t proximity_domain);
|
||||||
struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
|
struct kfd_topology_device *kfd_topology_device_by_id(uint32_t gpu_id);
|
||||||
struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
|
struct kfd_dev *kfd_device_by_id(uint32_t gpu_id);
|
||||||
struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
|
struct kfd_dev *kfd_device_by_pci_dev(const struct pci_dev *pdev);
|
||||||
|
@ -46,22 +46,32 @@ static struct list_head topology_device_list;
|
|||||||
static struct kfd_system_properties sys_props;
|
static struct kfd_system_properties sys_props;
|
||||||
|
|
||||||
static DECLARE_RWSEM(topology_lock);
|
static DECLARE_RWSEM(topology_lock);
|
||||||
static atomic_t topology_crat_proximity_domain;
|
static uint32_t topology_crat_proximity_domain;
|
||||||
|
|
||||||
struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
|
struct kfd_topology_device *kfd_topology_device_by_proximity_domain_no_lock(
|
||||||
uint32_t proximity_domain)
|
uint32_t proximity_domain)
|
||||||
{
|
{
|
||||||
struct kfd_topology_device *top_dev;
|
struct kfd_topology_device *top_dev;
|
||||||
struct kfd_topology_device *device = NULL;
|
struct kfd_topology_device *device = NULL;
|
||||||
|
|
||||||
down_read(&topology_lock);
|
|
||||||
|
|
||||||
list_for_each_entry(top_dev, &topology_device_list, list)
|
list_for_each_entry(top_dev, &topology_device_list, list)
|
||||||
if (top_dev->proximity_domain == proximity_domain) {
|
if (top_dev->proximity_domain == proximity_domain) {
|
||||||
device = top_dev;
|
device = top_dev;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return device;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct kfd_topology_device *kfd_topology_device_by_proximity_domain(
|
||||||
|
uint32_t proximity_domain)
|
||||||
|
{
|
||||||
|
struct kfd_topology_device *device = NULL;
|
||||||
|
|
||||||
|
down_read(&topology_lock);
|
||||||
|
|
||||||
|
device = kfd_topology_device_by_proximity_domain_no_lock(
|
||||||
|
proximity_domain);
|
||||||
up_read(&topology_lock);
|
up_read(&topology_lock);
|
||||||
|
|
||||||
return device;
|
return device;
|
||||||
@ -1060,7 +1070,7 @@ int kfd_topology_init(void)
|
|||||||
down_write(&topology_lock);
|
down_write(&topology_lock);
|
||||||
kfd_topology_update_device_list(&temp_topology_device_list,
|
kfd_topology_update_device_list(&temp_topology_device_list,
|
||||||
&topology_device_list);
|
&topology_device_list);
|
||||||
atomic_set(&topology_crat_proximity_domain, sys_props.num_devices-1);
|
topology_crat_proximity_domain = sys_props.num_devices-1;
|
||||||
ret = kfd_topology_update_sysfs();
|
ret = kfd_topology_update_sysfs();
|
||||||
up_write(&topology_lock);
|
up_write(&topology_lock);
|
||||||
|
|
||||||
@ -1295,8 +1305,6 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
|
|||||||
|
|
||||||
pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
|
pr_debug("Adding new GPU (ID: 0x%x) to topology\n", gpu_id);
|
||||||
|
|
||||||
proximity_domain = atomic_inc_return(&topology_crat_proximity_domain);
|
|
||||||
|
|
||||||
/* Include the CPU in xGMI hive if xGMI connected by assigning it the hive ID. */
|
/* Include the CPU in xGMI hive if xGMI connected by assigning it the hive ID. */
|
||||||
if (gpu->hive_id && gpu->adev->gmc.xgmi.connected_to_cpu) {
|
if (gpu->hive_id && gpu->adev->gmc.xgmi.connected_to_cpu) {
|
||||||
struct kfd_topology_device *top_dev;
|
struct kfd_topology_device *top_dev;
|
||||||
@ -1321,12 +1329,16 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
|
|||||||
*/
|
*/
|
||||||
dev = kfd_assign_gpu(gpu);
|
dev = kfd_assign_gpu(gpu);
|
||||||
if (!dev) {
|
if (!dev) {
|
||||||
|
down_write(&topology_lock);
|
||||||
|
proximity_domain = ++topology_crat_proximity_domain;
|
||||||
|
|
||||||
res = kfd_create_crat_image_virtual(&crat_image, &image_size,
|
res = kfd_create_crat_image_virtual(&crat_image, &image_size,
|
||||||
COMPUTE_UNIT_GPU, gpu,
|
COMPUTE_UNIT_GPU, gpu,
|
||||||
proximity_domain);
|
proximity_domain);
|
||||||
if (res) {
|
if (res) {
|
||||||
pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
|
pr_err("Error creating VCRAT for GPU (ID: 0x%x)\n",
|
||||||
gpu_id);
|
gpu_id);
|
||||||
|
topology_crat_proximity_domain--;
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
res = kfd_parse_crat_table(crat_image,
|
res = kfd_parse_crat_table(crat_image,
|
||||||
@ -1335,10 +1347,10 @@ int kfd_topology_add_device(struct kfd_dev *gpu)
|
|||||||
if (res) {
|
if (res) {
|
||||||
pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
|
pr_err("Error parsing VCRAT for GPU (ID: 0x%x)\n",
|
||||||
gpu_id);
|
gpu_id);
|
||||||
|
topology_crat_proximity_domain--;
|
||||||
goto err;
|
goto err;
|
||||||
}
|
}
|
||||||
|
|
||||||
down_write(&topology_lock);
|
|
||||||
kfd_topology_update_device_list(&temp_topology_device_list,
|
kfd_topology_update_device_list(&temp_topology_device_list,
|
||||||
&topology_device_list);
|
&topology_device_list);
|
||||||
|
|
||||||
@ -1485,25 +1497,78 @@ err:
|
|||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* kfd_topology_update_io_links() - Update IO links after device removal.
|
||||||
|
* @proximity_domain: Proximity domain value of the dev being removed.
|
||||||
|
*
|
||||||
|
* The topology list currently is arranged in increasing order of
|
||||||
|
* proximity domain.
|
||||||
|
*
|
||||||
|
* Two things need to be done when a device is removed:
|
||||||
|
* 1. All the IO links to this device need to be removed.
|
||||||
|
* 2. All nodes after the current device node need to move
|
||||||
|
* up once this device node is removed from the topology
|
||||||
|
* list. As a result, the proximity domain values for
|
||||||
|
* all nodes after the node being deleted reduce by 1.
|
||||||
|
* This would also cause the proximity domain values for
|
||||||
|
* io links to be updated based on new proximity domain
|
||||||
|
* values.
|
||||||
|
*
|
||||||
|
* Context: The caller must hold write topology_lock.
|
||||||
|
*/
|
||||||
|
static void kfd_topology_update_io_links(int proximity_domain)
|
||||||
|
{
|
||||||
|
struct kfd_topology_device *dev;
|
||||||
|
struct kfd_iolink_properties *iolink, *tmp;
|
||||||
|
|
||||||
|
list_for_each_entry(dev, &topology_device_list, list) {
|
||||||
|
if (dev->proximity_domain > proximity_domain)
|
||||||
|
dev->proximity_domain--;
|
||||||
|
|
||||||
|
list_for_each_entry_safe(iolink, tmp, &dev->io_link_props, list) {
|
||||||
|
/*
|
||||||
|
* If there is an io link to the dev being deleted
|
||||||
|
* then remove that IO link also.
|
||||||
|
*/
|
||||||
|
if (iolink->node_to == proximity_domain) {
|
||||||
|
list_del(&iolink->list);
|
||||||
|
dev->io_link_count--;
|
||||||
|
dev->node_props.io_links_count--;
|
||||||
|
} else if (iolink->node_from > proximity_domain) {
|
||||||
|
iolink->node_from--;
|
||||||
|
} else if (iolink->node_to > proximity_domain) {
|
||||||
|
iolink->node_to--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int kfd_topology_remove_device(struct kfd_dev *gpu)
|
int kfd_topology_remove_device(struct kfd_dev *gpu)
|
||||||
{
|
{
|
||||||
struct kfd_topology_device *dev, *tmp;
|
struct kfd_topology_device *dev, *tmp;
|
||||||
uint32_t gpu_id;
|
uint32_t gpu_id;
|
||||||
int res = -ENODEV;
|
int res = -ENODEV;
|
||||||
|
int i = 0;
|
||||||
|
|
||||||
down_write(&topology_lock);
|
down_write(&topology_lock);
|
||||||
|
|
||||||
list_for_each_entry_safe(dev, tmp, &topology_device_list, list)
|
list_for_each_entry_safe(dev, tmp, &topology_device_list, list) {
|
||||||
if (dev->gpu == gpu) {
|
if (dev->gpu == gpu) {
|
||||||
gpu_id = dev->gpu_id;
|
gpu_id = dev->gpu_id;
|
||||||
kfd_remove_sysfs_node_entry(dev);
|
kfd_remove_sysfs_node_entry(dev);
|
||||||
kfd_release_topology_device(dev);
|
kfd_release_topology_device(dev);
|
||||||
sys_props.num_devices--;
|
sys_props.num_devices--;
|
||||||
|
kfd_topology_update_io_links(i);
|
||||||
|
topology_crat_proximity_domain = sys_props.num_devices-1;
|
||||||
|
sys_props.generation_count++;
|
||||||
res = 0;
|
res = 0;
|
||||||
if (kfd_topology_update_sysfs() < 0)
|
if (kfd_topology_update_sysfs() < 0)
|
||||||
kfd_topology_release_sysfs();
|
kfd_topology_release_sysfs();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
up_write(&topology_lock);
|
up_write(&topology_lock);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user