mirror of
https://github.com/torvalds/linux.git
synced 2024-12-04 10:01:41 +00:00
habanalabs: prevent race between fd close/open
The driver allows only a single process to open a device's FD at any single time. This is done by checking "hdev->compute_ctx" under mutex. Therefore, to prevent a race between the moment a user closes it's FD and when another user tries to open the device, we need to make sure that clearing this variable is the very last thing that is done in the code of the FD's release. I'm moving the idle check before clearing this variable and the "reset on device release". btw, if the reset happens it will prevent any other user from opening the device until the reset is finished. An important thing to note is that we need to remove the user process that is closing the device from the process list BEFORE calling the reset function. That is to prevent a case where the reset code will try to kill that user process and it is unnecessary as the process doesn't hold any device/driver resources anymore. Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
1282dbbd29
commit
c1904127ce
@ -69,13 +69,6 @@ static void hpriv_release(struct kref *ref)
|
||||
|
||||
mutex_destroy(&hpriv->restore_phase_mutex);
|
||||
|
||||
mutex_lock(&hdev->fpriv_list_lock);
|
||||
list_del(&hpriv->dev_node);
|
||||
hdev->compute_ctx = NULL;
|
||||
mutex_unlock(&hdev->fpriv_list_lock);
|
||||
|
||||
kfree(hpriv);
|
||||
|
||||
if ((!hdev->pldm) && (hdev->pdev) &&
|
||||
(!hdev->asic_funcs->is_device_idle(hdev,
|
||||
idle_mask,
|
||||
@ -87,9 +80,32 @@ static void hpriv_release(struct kref *ref)
|
||||
device_is_idle = false;
|
||||
}
|
||||
|
||||
/* We need to remove the user from the list to make sure the reset process won't
|
||||
* try to kill the user process. Because, if we got here, it means there are no
|
||||
* more driver/device resources that the user process is occupying so there is
|
||||
* no need to kill it
|
||||
*
|
||||
* However, we can't set the compute_ctx to NULL at this stage. This is to prevent
|
||||
* a race between the release and opening the device again. We don't want to let
|
||||
* a user open the device while there a reset is about to happen.
|
||||
*/
|
||||
mutex_lock(&hdev->fpriv_list_lock);
|
||||
list_del(&hpriv->dev_node);
|
||||
mutex_unlock(&hdev->fpriv_list_lock);
|
||||
|
||||
if ((hdev->reset_if_device_not_idle && !device_is_idle)
|
||||
|| hdev->reset_upon_device_release)
|
||||
hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE);
|
||||
|
||||
/* Now we can mark the compute_ctx as empty. Even if a reset is running in a different
|
||||
* thread, we don't care because the in_reset is marked so if a user will try to open
|
||||
* the device it will fail on that, even if compute_ctx is NULL.
|
||||
*/
|
||||
mutex_lock(&hdev->fpriv_list_lock);
|
||||
hdev->compute_ctx = NULL;
|
||||
mutex_unlock(&hdev->fpriv_list_lock);
|
||||
|
||||
kfree(hpriv);
|
||||
}
|
||||
|
||||
void hl_hpriv_get(struct hl_fpriv *hpriv)
|
||||
|
Loading…
Reference in New Issue
Block a user