habanalabs: fix race between wait and irq

There is a race in the user interrupts code, where between checking the target value and adding the new pend to the list, there is a chance the interrupt happened. In that case, no one will complete the node, and we will get a timeout on it. Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
2022-01-19 00:10:43 +02:00 · 2022-01-19 00:10:43 +02:00 · 7a78d4d481
commit 7a78d4d481
parent 54faa5607b
1 changed files with 5 additions and 1 deletions
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@ -2892,16 +2892,21 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	pend->cq_kernel_addr = (u64 *) cb->kernel_address + cq_counters_offset;
 	pend->cq_target_value = target_value;

+	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
+
 	/* We check for completion value as interrupt could have been received
 	 * before we added the node to the wait list
 	 */
 	if (*pend->cq_kernel_addr >= target_value) {
+		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
+
 		*status = HL_WAIT_CS_STATUS_COMPLETED;
 		/* There was no interrupt, we assume the completion is now. */
 		pend->fence.timestamp = ktime_get();
 		goto set_timestamp;

 	} else if (!timeout_us) {
+		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
 		*status = HL_WAIT_CS_STATUS_BUSY;
 		pend->fence.timestamp = ktime_get();
 		goto set_timestamp;
@ -2910,7 +2915,6 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	/* Add pending user interrupt to relevant list for the interrupt
 	 * handler to monitor
 	 */
-	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
 	list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
 	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);