habanalabs: Fix reset upon device release bug

In case user application was interrupted while some cs still in-flight
or in the middle of completion handling in driver, the
last refcount of the kernel private data for the user process
will not be put in the fd close flow, but in the cs completion
workqueue context.

This means that the device reset-upon-device-release will be called
from that context. During the reset flow, the driver flushes all the cs
workqueue to ensure that any scheduled work has run to completion,
and since we are running from the completion context we will
have deadlock.

Therefore, we need to skip flushing the workqueue in those cases.
It is safe to do it because the user won't be able to release the device
unless the workqueues are already empty.

Signed-off-by: farah kassabri <fkassabri@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
farah kassabri 2022-02-17 16:15:26 +02:00 committed by Oded Gabbay
parent e8458e20e0
commit a78b07dcae
3 changed files with 19 additions and 13 deletions

View File

@ -921,11 +921,12 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
complete_job(hdev, job);
}
void hl_cs_rollback_all(struct hl_device *hdev)
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
{
int i;
struct hl_cs *cs, *tmp;
if (!skip_wq_flush) {
flush_workqueue(hdev->ts_free_obj_wq);
/* flush all completions before iterating over the CS mirror list in
@ -934,6 +935,8 @@ void hl_cs_rollback_all(struct hl_device *hdev)
for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
flush_workqueue(hdev->cq_wq[i]);
}
/* Make sure we don't have leftovers in the CS mirror list */
list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
cs_get(cs);

View File

@ -685,7 +685,8 @@ static void take_release_locks(struct hl_device *hdev)
mutex_unlock(&hdev->fpriv_ctrl_list_lock);
}
static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset)
static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset,
bool skip_wq_flush)
{
if (hard_reset)
device_late_fini(hdev);
@ -698,7 +699,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_r
hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);
/* Go over all the queues, release all CS and their jobs */
hl_cs_rollback_all(hdev);
hl_cs_rollback_all(hdev, skip_wq_flush);
/* Release all pending user interrupts, each pending user interrupt
* holds a reference to user context
@ -978,7 +979,8 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
int hl_device_reset(struct hl_device *hdev, u32 flags)
{
bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
reset_upon_device_release = false, schedule_hard_reset = false;
reset_upon_device_release = false, schedule_hard_reset = false,
skip_wq_flush = false;
u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
struct hl_ctx *ctx;
int i, rc;
@ -991,6 +993,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)
hard_reset = !!(flags & HL_DRV_RESET_HARD);
from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
skip_wq_flush = !!(flags & HL_DRV_RESET_DEV_RELEASE);
if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
hard_instead_soft = true;
@ -1076,7 +1079,7 @@ again:
return 0;
}
cleanup_resources(hdev, hard_reset, fw_reset);
cleanup_resources(hdev, hard_reset, fw_reset, skip_wq_flush);
kill_processes:
if (hard_reset) {
@ -1686,7 +1689,7 @@ void hl_device_fini(struct hl_device *hdev)
hl_hwmon_fini(hdev);
cleanup_resources(hdev, true, false);
cleanup_resources(hdev, true, false, false);
/* Kill processes here after CS rollback. This is because the process
* can't really exit until all its CSs are done, which is what we

View File

@ -3054,7 +3054,7 @@ int hl_cb_pool_fini(struct hl_device *hdev);
int hl_cb_va_pool_init(struct hl_ctx *ctx);
void hl_cb_va_pool_fini(struct hl_ctx *ctx);
void hl_cs_rollback_all(struct hl_device *hdev);
void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush);
struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
void hl_sob_reset_error(struct kref *ref);