habanalabs: add support for f/w reset

When the f/w runs in secured mode, it can reset the ASIC when certain events occur. In unsecured mode, the driver asks the f/w to reset the ASIC for those events. We need to perform the entire reset procedure but without accessing the ASIC. i.e. without halting the engines and without sending messages to the f/w. Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
2024-11-25 21:51:40 +00:00 · 2021-08-09 22:43:37 +03:00 · 2021-08-09 22:43:37 +03:00 · 8d9aa980be
commit 8d9aa980be
parent 56e753d595
5 changed files with 61 additions and 35 deletions
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@ -311,9 +311,15 @@ static void device_hard_reset_pending(struct work_struct *work)
 		container_of(work, struct hl_device_reset_work,
 				reset_work.work);
 	struct hl_device *hdev = device_reset_work->hdev;
+	u32 flags;
 	int rc;

-	rc = hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD);
+	flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD;
+
+	if (device_reset_work->fw_reset)
+		flags |= HL_RESET_FW;
+
+	rc = hl_device_reset(hdev, flags);
 	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
 		dev_info(hdev->dev,
 			"Could not reset device. will try again in %u seconds",
@ -702,7 +708,7 @@ static void take_release_locks(struct hl_device *hdev)
 	mutex_unlock(&hdev->fpriv_list_lock);
 }

-static void cleanup_resources(struct hl_device *hdev, bool hard_reset)
+static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
 	if (hard_reset)
 		device_late_fini(hdev);
@ -712,7 +718,7 @@ static void cleanup_resources(struct hl_device *hdev, bool hard_reset)
 	 * completions from H/W and we won't have any accesses from the
 	 * H/W to the host machine
 	 */
-	hdev->asic_funcs->halt_engines(hdev, hard_reset);
+	hdev->asic_funcs->halt_engines(hdev, hard_reset, fw_reset);

 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
@ -922,7 +928,7 @@ static void device_disable_open_processes(struct hl_device *hdev)
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-	bool hard_reset, from_hard_reset_thread, hard_instead_soft = false;
+	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
 	int i, rc;

 	if (!hdev->init_done) {
@ -933,6 +939,7 @@ int hl_device_reset(struct hl_device *hdev, u32 flags)

 	hard_reset = !!(flags & HL_RESET_HARD);
 	from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
+	fw_reset = !!(flags & HL_RESET_FW);

 	if (!hard_reset && !hdev->supports_soft_reset) {
 		hard_instead_soft = true;
@ -984,11 +991,13 @@ do_reset:
 		else
 			hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;

-		/*
-		 * if reset is due to heartbeat, device CPU is no responsive in
-		 * which case no point sending PCI disable message to it
+		/* If reset is due to heartbeat, device CPU is no responsive in
+		 * which case no point sending PCI disable message to it.
+		 *
+		 * If F/W is performing the reset, no need to send it a message to disable
+		 * PCI access
 		 */
-		if (hard_reset && !(flags & HL_RESET_HEARTBEAT)) {
+		if (hard_reset && !(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
 			/* Disable PCI access from device F/W so he won't send
 			 * us additional interrupts. We disable MSI/MSI-X at
 			 * the halt_engines function and we can't have the F/W
@ -1018,6 +1027,8 @@ again:

 		hdev->process_kill_trial_cnt = 0;

+		hdev->device_reset_work.fw_reset = fw_reset;
+
 		/*
 		 * Because the reset function can't run from heartbeat work,
 		 * we need to call the reset function from a dedicated work.
@ -1028,7 +1039,7 @@ again:
 		return 0;
 	}

-	cleanup_resources(hdev, hard_reset);
+	cleanup_resources(hdev, hard_reset, fw_reset);

 kill_processes:
 	if (hard_reset) {
@ -1062,7 +1073,7 @@ kill_processes:
 	}

 	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, hard_reset);
+	hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);

 	if (hard_reset) {
 		hdev->fw_loader.linux_loaded = false;
@ -1587,7 +1598,7 @@ void hl_device_fini(struct hl_device *hdev)

 	hl_hwmon_fini(hdev);

-	cleanup_resources(hdev, true);
+	cleanup_resources(hdev, true, false);

 	/* Kill processes here after CS rollback. This is because the process
 	 * can't really exit until all its CSs are done, which is what we
@ -1606,7 +1617,7 @@ void hl_device_fini(struct hl_device *hdev)
 	hl_cb_pool_fini(hdev);

 	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, true);
+	hdev->asic_funcs->hw_fini(hdev, true, false);

 	hdev->fw_loader.linux_loaded = false;

--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@ -128,12 +128,17 @@ enum hl_mmu_page_table_location {
 *
 * - HL_RESET_DEVICE_RELEASE
 *       Set if reset is due to device release
+ *
+ * - HL_RESET_FW
+ *       F/W will perform the reset. No need to ask it to reset the device. This is relevant
+ *       only when running with secured f/w
 */
 #define HL_RESET_HARD			(1 << 0)
 #define HL_RESET_FROM_RESET_THREAD	(1 << 1)
 #define HL_RESET_HEARTBEAT		(1 << 2)
 #define HL_RESET_TDR			(1 << 3)
 #define HL_RESET_DEVICE_RELEASE		(1 << 4)
+#define HL_RESET_FW			(1 << 5)

 #define HL_MAX_SOBS_PER_MONITOR	8

@ -1170,8 +1175,8 @@ struct hl_asic_funcs {
 	int (*sw_init)(struct hl_device *hdev);
 	int (*sw_fini)(struct hl_device *hdev);
 	int (*hw_init)(struct hl_device *hdev);
-	void (*hw_fini)(struct hl_device *hdev, bool hard_reset);
-	void (*halt_engines)(struct hl_device *hdev, bool hard_reset);
+	void (*hw_fini)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
+	void (*halt_engines)(struct hl_device *hdev, bool hard_reset, bool fw_reset);
 	int (*suspend)(struct hl_device *hdev);
 	int (*resume)(struct hl_device *hdev);
 	int (*mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
@ -2138,11 +2143,13 @@ struct hwmon_chip_info;
 * @wq: work queue for device reset procedure.
 * @reset_work: reset work to be done.
 * @hdev: habanalabs device structure.
+ * @fw_reset: whether f/w will do the reset without us sending them a message to do it.
 */
 struct hl_device_reset_work {
 	struct workqueue_struct		*wq;
 	struct delayed_work		reset_work;
 	struct hl_device		*hdev;
+	bool				fw_reset;
 };

 /**
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@ -535,7 +535,7 @@ hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
 		result = PCI_ERS_RESULT_NONE;
 	}

-	hdev->asic_funcs->halt_engines(hdev, true);
+	hdev->asic_funcs->halt_engines(hdev, true, false);

 	return result;
 }
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@ -833,14 +833,14 @@ pci_init:
 					GAUDI_BOOT_FIT_REQ_TIMEOUT_USEC);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
-			hdev->asic_funcs->hw_fini(hdev, true);
+			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}

 	if (gaudi_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_info(hdev->dev,
 			"H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true);
+		hdev->asic_funcs->hw_fini(hdev, true, false);
 	}

 	return 0;
@ -3836,7 +3836,7 @@ static void gaudi_disable_timestamp(struct hl_device *hdev)
 	WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
 }

-static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
+static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
 	u32 wait_timeout_ms;

@ -3848,6 +3848,9 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)
 	else
 		wait_timeout_ms = GAUDI_RESET_WAIT_MSEC;

+	if (fw_reset)
+		goto skip_engines;
+
 	gaudi_stop_nic_qmans(hdev);
 	gaudi_stop_mme_qmans(hdev);
 	gaudi_stop_tpc_qmans(hdev);
@ -3873,6 +3876,7 @@ static void gaudi_halt_engines(struct hl_device *hdev, bool hard_reset)

 	gaudi_disable_timestamp(hdev);

+skip_engines:
 	gaudi_disable_msi(hdev);
 }

@ -4240,7 +4244,7 @@ disable_queues:
 	return rc;
 }

-static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
+static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
 	struct cpu_dyn_regs *dyn_regs =
 			&hdev->fw_loader.dynamic_loader.comm_desc.cpu_dyn_regs;
@ -4261,6 +4265,14 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 		cpu_timeout_ms = GAUDI_CPU_RESET_WAIT_MSEC;
 	}

+	if (fw_reset) {
+		dev_info(hdev->dev,
+			"Firmware performs HARD reset, going to wait %dms\n",
+			reset_timeout_ms);
+
+		goto skip_reset;
+	}
+
 	driver_performs_reset = !!(!hdev->asic_prop.fw_security_enabled &&
 					!hdev->asic_prop.hard_reset_done_by_fw);

@ -4337,6 +4349,7 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset)
 			reset_timeout_ms);
 	}

+skip_reset:
 	/*
 	 * After hard reset, we can't poll the BTM_FSM register because the PSOC
 	 * itself is in reset. Need to wait until the reset is deasserted
@ -7999,10 +8012,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 					tpc_dec_event_to_tpc_id(event_type),
 					"AXI_SLV_DEC_Error");
 		if (reset_required) {
-			dev_err(hdev->dev, "hard reset required due to %s\n",
+			dev_err(hdev->dev, "reset required due to %s\n",
 				gaudi_irq_map_table[event_type].name);

-			goto reset_device;
+			hl_device_reset(hdev, 0);
 		} else {
 			hl_fw_unmask_irq(hdev, event_type);
 		}
@ -8021,10 +8034,10 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 					tpc_krn_event_to_tpc_id(event_type),
 					"KRN_ERR");
 		if (reset_required) {
-			dev_err(hdev->dev, "hard reset required due to %s\n",
+			dev_err(hdev->dev, "reset required due to %s\n",
 				gaudi_irq_map_table[event_type].name);

-			goto reset_device;
+			hl_device_reset(hdev, 0);
 		} else {
 			hl_fw_unmask_irq(hdev, event_type);
 		}
@ -8154,7 +8167,9 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	return;

 reset_device:
-	if (hdev->hard_reset_on_fw_events)
+	if (hdev->asic_prop.fw_security_enabled)
+		hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW);
+	else if (hdev->hard_reset_on_fw_events)
 		hl_device_reset(hdev, HL_RESET_HARD);
 	else
 		hl_fw_unmask_irq(hdev, event_type);
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@ -654,14 +654,14 @@ pci_init:
 					GOYA_BOOT_FIT_REQ_TIMEOUT_USEC);
 	if (rc) {
 		if (hdev->reset_on_preboot_fail)
-			hdev->asic_funcs->hw_fini(hdev, true);
+			hdev->asic_funcs->hw_fini(hdev, true, false);
 		goto pci_fini;
 	}

 	if (goya_get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
 		dev_info(hdev->dev,
 			"H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->hw_fini(hdev, true);
+		hdev->asic_funcs->hw_fini(hdev, true, false);
 	}

 	if (!hdev->pldm) {
@ -2380,7 +2380,7 @@ static void goya_disable_timestamp(struct hl_device *hdev)
 	WREG32(mmPSOC_TIMESTAMP_BASE - CFG_BASE, 0);
 }

-static void goya_halt_engines(struct hl_device *hdev, bool hard_reset)
+static void goya_halt_engines(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
 	u32 wait_timeout_ms;

@ -2703,14 +2703,7 @@ disable_queues:
 	return rc;
 }

-/*
- * goya_hw_fini - Goya hardware tear-down code
- *
- * @hdev: pointer to hl_device structure
- * @hard_reset: should we do hard reset to all engines or just reset the
- *              compute/dma engines
- */
-static void goya_hw_fini(struct hl_device *hdev, bool hard_reset)
+static void goya_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset)
 {
 	struct goya_device *goya = hdev->asic_specific;
 	u32 reset_timeout_ms, cpu_timeout_ms, status;