drm/amd/amdgpu: add missing mutex lock to amdgpu_get_xgmi_hive() (v3)
v2: Move locks around in other functions so that this function can stand on its own. Also only hold the hive specific lock for add/remove device instead of the driver global lock so you can't add/remove devices in parallel from one hive. v3: add reset_lock Acked-by: Shaoyun.liu < Shaoyun.liu@amd.com> Signed-off-by: Tom St Denis <tom.stdenis@amd.com> Reviewed-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
4b9674e509
commit
22d6575b8d
@ -3525,9 +3525,9 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
|
||||
* by different nodes. No point also since the one node already executing
|
||||
* reset will also reset all the other nodes in the hive.
|
||||
*/
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
hive = amdgpu_get_xgmi_hive(adev, 0);
|
||||
if (hive && adev->gmc.xgmi.num_physical_nodes > 1 &&
|
||||
!mutex_trylock(&hive->hive_lock))
|
||||
!mutex_trylock(&hive->reset_lock))
|
||||
return 0;
|
||||
|
||||
/* Start with adev pre asic reset first for soft reset check.*/
|
||||
@ -3606,7 +3606,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */
|
||||
}
|
||||
|
||||
if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
mutex_unlock(&hive->reset_lock);
|
||||
|
||||
if (r)
|
||||
dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
|
||||
|
@ -40,26 +40,40 @@ void *amdgpu_xgmi_hive_try_lock(struct amdgpu_hive_info *hive)
|
||||
return &hive->device_list;
|
||||
}
|
||||
|
||||
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
|
||||
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
|
||||
{
|
||||
int i;
|
||||
struct amdgpu_hive_info *tmp;
|
||||
|
||||
if (!adev->gmc.xgmi.hive_id)
|
||||
return NULL;
|
||||
|
||||
mutex_lock(&xgmi_mutex);
|
||||
|
||||
for (i = 0 ; i < hive_count; ++i) {
|
||||
tmp = &xgmi_hives[i];
|
||||
if (tmp->hive_id == adev->gmc.xgmi.hive_id)
|
||||
if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
|
||||
if (lock)
|
||||
mutex_lock(&tmp->hive_lock);
|
||||
mutex_unlock(&xgmi_mutex);
|
||||
return tmp;
|
||||
}
|
||||
}
|
||||
if (i >= AMDGPU_MAX_XGMI_HIVE)
|
||||
if (i >= AMDGPU_MAX_XGMI_HIVE) {
|
||||
mutex_unlock(&xgmi_mutex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* initialize new hive if not exist */
|
||||
tmp = &xgmi_hives[hive_count++];
|
||||
tmp->hive_id = adev->gmc.xgmi.hive_id;
|
||||
INIT_LIST_HEAD(&tmp->device_list);
|
||||
mutex_init(&tmp->hive_lock);
|
||||
mutex_init(&tmp->reset_lock);
|
||||
if (lock)
|
||||
mutex_lock(&tmp->hive_lock);
|
||||
|
||||
mutex_unlock(&xgmi_mutex);
|
||||
|
||||
return tmp;
|
||||
}
|
||||
@ -111,8 +125,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
|
||||
return ret;
|
||||
}
|
||||
|
||||
mutex_lock(&xgmi_mutex);
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
hive = amdgpu_get_xgmi_hive(adev, 1);
|
||||
if (!hive) {
|
||||
ret = -EINVAL;
|
||||
dev_err(adev->dev,
|
||||
@ -147,8 +160,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
|
||||
break;
|
||||
}
|
||||
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
exit:
|
||||
mutex_unlock(&xgmi_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -159,15 +172,14 @@ void amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
|
||||
if (!adev->gmc.xgmi.supported)
|
||||
return;
|
||||
|
||||
mutex_lock(&xgmi_mutex);
|
||||
|
||||
hive = amdgpu_get_xgmi_hive(adev);
|
||||
hive = amdgpu_get_xgmi_hive(adev, 1);
|
||||
if (!hive)
|
||||
goto exit;
|
||||
return;
|
||||
|
||||
if (!(hive->number_devices--))
|
||||
if (!(hive->number_devices--)) {
|
||||
mutex_destroy(&hive->hive_lock);
|
||||
|
||||
exit:
|
||||
mutex_unlock(&xgmi_mutex);
|
||||
mutex_destroy(&hive->reset_lock);
|
||||
} else {
|
||||
mutex_unlock(&hive->hive_lock);
|
||||
}
|
||||
}
|
||||
|
@ -29,10 +29,11 @@ struct amdgpu_hive_info {
|
||||
struct list_head device_list;
|
||||
struct psp_xgmi_topology_info topology_info;
|
||||
int number_devices;
|
||||
struct mutex hive_lock;
|
||||
struct mutex hive_lock,
|
||||
reset_lock;
|
||||
};
|
||||
|
||||
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
|
||||
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
|
||||
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
|
||||
int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
|
||||
void amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
|
||||
|
Loading…
Reference in New Issue
Block a user