diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 7da9847eb9d6..75d8302352e5 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -99,7 +99,6 @@ static const struct dma_fence_ops hl_fence_ops = {
 	.get_driver_name = hl_fence_get_driver_name,
 	.get_timeline_name = hl_fence_get_timeline_name,
 	.enable_signaling = hl_fence_enable_signaling,
-	.wait = dma_fence_default_wait,
 	.release = hl_fence_release
 };

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 4b6c8de46dd8..2b38a119704c 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -726,7 +726,7 @@ disable_device:
 	return rc;
 }

-static void device_kill_open_processes(struct hl_device *hdev)
+static int device_kill_open_processes(struct hl_device *hdev)
 {
 	u16 pending_total, pending_cnt;
 	struct hl_fpriv	*hpriv;
@@ -779,9 +779,7 @@ static void device_kill_open_processes(struct hl_device *hdev)
 		ssleep(1);
 	}

-	if (!list_empty(&hdev->fpriv_list))
-		dev_crit(hdev->dev,
-			"Going to hard reset with open user contexts\n");
+	return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
 }

 static void device_hard_reset_pending(struct work_struct *work)
@@ -801,6 +799,7 @@ static void device_hard_reset_pending(struct work_struct *work)
  * @hdev: pointer to habanalabs device structure
  * @hard_reset: should we do hard reset to all engines or just reset the
  *              compute/dma engines
+ * @from_hard_reset_thread: is the caller the hard-reset thread
  *
  * Block future CS and wait for pending CS to be enqueued
  * Call ASIC H/W fini
@@ -823,6 +822,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		return 0;
 	}

+	if ((!hard_reset) && (!hdev->supports_soft_reset)) {
+		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+		hard_reset = true;
+	}
+
 	/*
 	 * Prevent concurrency in this function - only one reset should be
 	 * done at any given time. Only need to perform this if we didn't
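
The two device.c changes above cooperate: device_kill_open_processes() now reports -EBUSY when user contexts survive the kill-and-wait loop instead of merely logging, and a soft-reset request is promoted to a hard reset on ASICs that lack soft-reset support. A minimal userspace C sketch of that control flow (struct dev_state, the helpers and the retry budget are illustrative stand-ins, not the driver's API):

#include <stdbool.h>
#include <stdio.h>

#define EBUSY 16	/* local stand-in for the <errno.h> value */

/* Hypothetical stand-in for the driver's device state. */
struct dev_state {
	bool supports_soft_reset;
	int open_contexts;
};

/* Mirrors device_kill_open_processes(): retry, then report -EBUSY. */
static int kill_open_processes(struct dev_state *d, int retries)
{
	while (retries-- && d->open_contexts)
		d->open_contexts--;	/* stands in for SIGKILL + ssleep(1) */

	return d->open_contexts ? -EBUSY : 0;
}

/* Mirrors the entry checks of hl_device_reset(). */
static int device_reset(struct dev_state *d, bool hard_reset)
{
	/* Promote to hard reset when soft reset isn't supported. */
	if (!hard_reset && !d->supports_soft_reset)
		hard_reset = true;

	if (hard_reset) {
		int rc = kill_open_processes(d, 3);

		if (rc) {
			fprintf(stderr, "open contexts remain, aborting reset\n");
			return rc;
		}
	}

	printf("%s reset done\n", hard_reset ? "hard" : "soft");
	return 0;
}

int main(void)
{
	struct dev_state d = { .supports_soft_reset = false, .open_contexts = 2 };

	return device_reset(&d, false) ? 1 : 0;
}

Returning an error instead of logging lets the reset path below abort cleanly rather than hard-resetting the chip while live contexts still hold mappings.
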
@@ -902,7 +906,12 @@ again:
 		 * process can't really exit until all its CSs are done, which
 		 * is what we do in cs rollback
 		 */
-		device_kill_open_processes(hdev);
+		rc = device_kill_open_processes(hdev);
+		if (rc) {
+			dev_crit(hdev->dev,
+				"Failed to kill all open processes, stopping hard reset\n");
+			goto out_err;
+		}

 		/* Flush the Event queue workers to make sure no other thread is
 		 * reading or writing to registers during the reset
@@ -1385,7 +1394,9 @@ void hl_device_fini(struct hl_device *hdev)
 	 * can't really exit until all its CSs are done, which is what we
 	 * do in cs rollback
 	 */
-	device_kill_open_processes(hdev);
+	rc = device_kill_open_processes(hdev);
+	if (rc)
+		dev_crit(hdev->dev, "Failed to kill all open processes\n");

 	hl_cb_pool_fini(hdev);

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4cb1f71dd4f1..61f88e9884ce 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
 			>> EQ_CTL_EVENT_TYPE_SHIFT);
 	u8 cause;
-	bool soft_reset_required;
+	bool reset_required;

 	gaudi->events_stat[event_type]++;
 	gaudi->events_stat_aggregate[event_type]++;
@@ -5840,12 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_TPC6_DEC:
 	case GAUDI_EVENT_TPC7_DEC:
 		gaudi_print_irq_info(hdev, event_type, true);
-		soft_reset_required = gaudi_tpc_read_interrupts(hdev,
+		reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_dec_event_to_tpc_id(event_type),
 					"AXI_SLV_DEC_Error");
-		if (soft_reset_required)
-			hl_device_reset(hdev, false, false);
-		hl_fw_unmask_irq(hdev, event_type);
+		if (reset_required) {
+			dev_err(hdev->dev, "hard reset required due to %s\n",
+				gaudi_irq_map_table[event_type].name);
+
+			if (hdev->hard_reset_on_fw_events)
+				hl_device_reset(hdev, true, false);
+		} else {
+			hl_fw_unmask_irq(hdev, event_type);
+		}
 		break;

 	case GAUDI_EVENT_TPC0_KRN_ERR:
@@ -5857,12 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_TPC6_KRN_ERR:
 	case GAUDI_EVENT_TPC7_KRN_ERR:
 		gaudi_print_irq_info(hdev, event_type, true);
-		soft_reset_required = gaudi_tpc_read_interrupts(hdev,
+		reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_krn_event_to_tpc_id(event_type),
 					"KRN_ERR");
-		if (soft_reset_required)
-			hl_device_reset(hdev, false, false);
-		hl_fw_unmask_irq(hdev, event_type);
+		if (reset_required) {
+			dev_err(hdev->dev, "hard reset required due to %s\n",
+				gaudi_irq_map_table[event_type].name);
+
+			if (hdev->hard_reset_on_fw_events)
+				hl_device_reset(hdev, true, false);
+		} else {
+			hl_fw_unmask_irq(hdev, event_type);
+		}
 		break;

 	case GAUDI_EVENT_PCIE_CORE_SERR:
@@ -5913,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,

 	case GAUDI_EVENT_RAZWI_OR_ADC_SW:
 		gaudi_print_irq_info(hdev, event_type, true);
-		hl_device_reset(hdev, false, false);
-		hl_fw_unmask_irq(hdev, event_type);
+		if (hdev->hard_reset_on_fw_events)
+			hl_device_reset(hdev, true, false);
 		break;

 	case GAUDI_EVENT_TPC0_BMON_SPMU:
@@ -5963,7 +5975,7 @@ static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate,
 	return gaudi->events_stat;
 }

-static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 					u32 flags)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
@@ -5972,34 +5984,40 @@ static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,

 	if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
 		hdev->hard_reset_pending)
-		return;
-
-	mutex_lock(&hdev->mmu_cache_lock);
+		return 0;

 	if (hdev->pldm)
 		timeout_usec = GAUDI_PLDM_MMU_TIMEOUT_USEC;
 	else
 		timeout_usec = MMU_CONFIG_TIMEOUT_USEC;

+	mutex_lock(&hdev->mmu_cache_lock);
+
 	/* L0 & L1 invalidation */
-	WREG32(mmSTLB_INV_ALL_START, 1);
+	WREG32(mmSTLB_INV_PS, 2);

 	rc = hl_poll_timeout(
 		hdev,
-		mmSTLB_INV_ALL_START,
+		mmSTLB_INV_PS,
 		status,
 		!status,
 		1000,
 		timeout_usec);

-	if (rc)
-		dev_notice_ratelimited(hdev->dev,
-			"Timeout when waiting for MMU cache invalidation\n");
+	WREG32(mmSTLB_INV_SET, 0);

 	mutex_unlock(&hdev->mmu_cache_lock);
+
+	if (rc) {
+		dev_err_ratelimited(hdev->dev,
+			"MMU cache invalidation timeout\n");
+		hl_device_reset(hdev, true, false);
+	}
+
+	return rc;
 }

-static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
+static int gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
 					bool is_hard, u32 asid, u64 va, u64 size)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
@@ -6010,7 +6028,7 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,

 	if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
 		hdev->hard_reset_pending)
-		return;
+		return 0;

 	mutex_lock(&hdev->mmu_cache_lock);

@@ -6041,11 +6059,15 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
 		1000,
 		timeout_usec);

-	if (rc)
-		dev_notice_ratelimited(hdev->dev,
-			"Timeout when waiting for MMU cache invalidation\n");
-
 	mutex_unlock(&hdev->mmu_cache_lock);
+
+	if (rc) {
+		dev_err_ratelimited(hdev->dev,
+			"MMU cache invalidation timeout\n");
+		hl_device_reset(hdev, true, false);
+	}
+
+	return rc;
 }

 static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev,
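
The gaudi_handle_eqe() rework above changes the failure policy in two ways: a fatal TPC/RAZWI event escalates straight to a hard reset (gated by hdev->hard_reset_on_fw_events) and deliberately leaves the IRQ masked, while only the benign path re-arms the event; and both MMU invalidation routines now hand their timeout back to the caller. A compact sketch of the dispatch half, with hypothetical stand-in names rather than the driver's types:

#include <stdbool.h>
#include <stdio.h>

struct dev_state {
	bool hard_reset_on_fw_events;
};

static void unmask_irq(int event)	{ printf("unmask event %d\n", event); }
static void hard_reset(void)		{ printf("hard reset\n"); }

/* Mirrors the reworked case labels: escalate on fatal events, and
 * only unmask the IRQ when the event turned out to be benign. */
static void handle_event(struct dev_state *d, int event, bool reset_required)
{
	if (reset_required) {
		fprintf(stderr, "hard reset required due to event %d\n", event);
		if (d->hard_reset_on_fw_events)
			hard_reset();
	} else {
		unmask_irq(event);
	}
}

int main(void)
{
	struct dev_state d = { .hard_reset_on_fw_events = true };

	handle_event(&d, 42, false);	/* benign: unmask and move on */
	handle_event(&d, 42, true);	/* fatal: escalate, IRQ stays masked */
	return 0;
}

Not unmasking on the fatal path avoids taking the same interrupt again while the hard reset is in flight.
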
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 15b6c3228e37..0d2952bb58df 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev)
 	spin_lock_init(&goya->hw_queues_lock);

 	hdev->supports_coresight = true;
+	hdev->supports_soft_reset = true;

 	return 0;

@@ -4883,7 +4884,7 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
 		goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
 }

-static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 					u32 flags)
 {
 	struct goya_device *goya = hdev->asic_specific;
@@ -4892,11 +4893,11 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,

 	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
 		hdev->hard_reset_pending)
-		return;
+		return 0;

 	/* no need in L1 only invalidation in Goya */
 	if (!is_hard)
-		return;
+		return 0;

 	if (hdev->pldm)
 		timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
@@ -4918,13 +4919,17 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,

 	mutex_unlock(&hdev->mmu_cache_lock);

-	if (rc)
-		dev_notice_ratelimited(hdev->dev,
-			"Timeout when waiting for MMU cache invalidation\n");
+	if (rc) {
+		dev_err_ratelimited(hdev->dev,
+			"MMU cache invalidation timeout\n");
+		hl_device_reset(hdev, true, false);
+	}
+
+	return rc;
 }

-static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
-		bool is_hard, u32 asid, u64 va, u64 size)
+static int goya_mmu_invalidate_cache_range(struct hl_device *hdev,
+		bool is_hard, u32 asid, u64 va, u64 size)
 {
 	struct goya_device *goya = hdev->asic_specific;
 	u32 status, timeout_usec, inv_data, pi;
@@ -4932,11 +4937,11 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,

 	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
 		hdev->hard_reset_pending)
-		return;
+		return 0;

 	/* no need in L1 only invalidation in Goya */
 	if (!is_hard)
-		return;
+		return 0;

 	if (hdev->pldm)
 		timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
@@ -4969,9 +4974,13 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,

 	mutex_unlock(&hdev->mmu_cache_lock);

-	if (rc)
-		dev_notice_ratelimited(hdev->dev,
-			"Timeout when waiting for MMU cache invalidation\n");
+	if (rc) {
+		dev_err_ratelimited(hdev->dev,
+			"MMU cache invalidation timeout\n");
+		hl_device_reset(hdev, true, false);
+	}
+
+	return rc;
 }

 int goya_send_heartbeat(struct hl_device *hdev)
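
Goya follows the same contract as Gaudi above: an invalidation timeout is returned to the caller and escalates to a hard reset instead of ending as a rate-limited notice. The poll-then-escalate shape can be modelled in plain C (poll_until_clear() and the 100 ms budget are made-up stand-ins for hl_poll_timeout() and the driver's timeouts):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

#define ETIMEDOUT 110	/* local stand-in for the <errno.h> value */

/* Made-up miniature of hl_poll_timeout(): spin on a "status register"
 * until it reads zero or the deadline passes. */
static int poll_until_clear(volatile unsigned *reg, long timeout_ms)
{
	struct timespec start, now;

	clock_gettime(CLOCK_MONOTONIC, &start);
	while (*reg) {
		clock_gettime(CLOCK_MONOTONIC, &now);
		if ((now.tv_sec - start.tv_sec) * 1000 +
		    (now.tv_nsec - start.tv_nsec) / 1000000 > timeout_ms)
			return -ETIMEDOUT;
	}
	return 0;
}

/* Mirrors the reworked invalidate flow: return the poll result and
 * escalate on timeout instead of only printing a notice. */
static int invalidate_cache(volatile unsigned *busy_reg, bool *need_reset)
{
	int rc = poll_until_clear(busy_reg, 100);	/* 100 ms budget */

	if (rc) {
		fprintf(stderr, "MMU cache invalidation timeout\n");
		*need_reset = true;	/* stands in for hl_device_reset() */
	}
	return rc;
}

int main(void)
{
	volatile unsigned stuck = 1;	/* "hardware" never clears the bit */
	bool need_reset = false;
	int rc = invalidate_cache(&stuck, &need_reset);

	printf("rc=%d, need_reset=%d\n", rc, need_reset);
	return 0;
}
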
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 5a855b7edf43..1ecdcf8b763a 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -675,9 +675,9 @@ struct hl_asic_funcs {
 				u32 *size);
 	u64 (*read_pte)(struct hl_device *hdev, u64 addr);
 	void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
-	void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
+	int (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
 					u32 flags);
-	void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
+	int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
 					u32 asid, u64 va, u64 size);
 	int (*send_heartbeat)(struct hl_device *hdev);
 	void (*enable_clock_gating)(struct hl_device *hdev);
@@ -755,8 +755,8 @@ struct hl_va_range {
  *		with huge pages.
  * @dram_va_range: holds available virtual addresses for DRAM mappings.
  * @mem_hash_lock: protects the mem_hash.
- * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifing the
- *	MMU hash or walking the PGT requires talking this lock
+ * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
+ *	MMU hash or walking the PGT requires taking this lock.
  * @debugfs_list: node in debugfs list of contexts.
  * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
  *		to user so user could inquire about CS. It is used as
@@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts {
  * @stop_on_err: true if engines should stop on error.
  * @supports_sync_stream: is sync stream supported.
  * @supports_coresight: is CoreSight supported.
+ * @supports_soft_reset: is soft reset supported.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -1522,6 +1523,7 @@ struct hl_device {
 	u8				stop_on_err;
 	u8				supports_sync_stream;
 	u8				supports_coresight;
+	u8				supports_soft_reset;

 	/* Parameters for bring-up */
 	u8				mmu_enable;
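
The habanalabs.h change is what drives the rest of the patch: once the two mmu_invalidate_cache function pointers in hl_asic_funcs return int, every ASIC back-end must be converted and every call site must check the result. A toy version of that ops-table contract (struct asic_ops and map_page() are hypothetical, not driver types):

#include <stdio.h>

/* Miniature of the hl_asic_funcs idea: per-ASIC callbacks collected in
 * an ops table; returning int lets callers see invalidation failures. */
struct asic_ops {
	int (*mmu_invalidate_cache)(void *ctx, int is_hard, unsigned flags);
};

static int fake_invalidate(void *ctx, int is_hard, unsigned flags)
{
	(void)ctx; (void)is_hard; (void)flags;
	return 0;	/* or a negative errno on timeout */
}

static int map_page(const struct asic_ops *ops, void *ctx)
{
	int rc = ops->mmu_invalidate_cache(ctx, 0, 0);

	if (rc)
		fprintf(stderr, "mapping failed: MMU cache may be stale\n");
	return rc;
}

int main(void)
{
	const struct asic_ops ops = { .mmu_invalidate_cache = fake_invalidate };

	return map_page(&ops, NULL);
}
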
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index a72f766ca470..4b8eed1ca513 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -886,6 +886,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,

 		vm_type = (enum vm_type_t *) userptr;
 		hint_addr = args->map_host.hint_addr;
+		handle = phys_pg_pack->handle;
 	} else {
 		handle = lower_32_bits(args->map_device.handle);

@@ -954,10 +955,17 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		goto map_err;
 	}

-	hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
+	rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);

 	mutex_unlock(&ctx->mmu_lock);

+	if (rc) {
+		dev_err(hdev->dev,
+			"mapping handle %u failed due to MMU cache invalidation\n",
+			handle);
+		goto map_err;
+	}
+
 	ret_vaddr += phys_pg_pack->offset;

 	hnode->ptr = vm_type;
@@ -1083,21 +1091,34 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
 	 * at the loop end rather than for each iteration
 	 */
 	if (!ctx_free)
-		hdev->asic_funcs->mmu_invalidate_cache(hdev, true, *vm_type);
+		rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
+								*vm_type);

 	mutex_unlock(&ctx->mmu_lock);

 	/*
-	 * No point in maintaining the free VA block list if the context is
-	 * closing as the list will be freed anyway
+	 * If the context is closing we don't need to check for the MMU cache
+	 * invalidation return code and update the VA free list as in this flow
+	 * we invalidate the MMU cache outside of this unmap function and the VA
+	 * free list will be freed anyway.
 	 */
 	if (!ctx_free) {
-		rc = add_va_block(hdev, va_range, vaddr,
-				vaddr + phys_pg_pack->total_size - 1);
+		int tmp_rc;
+		if (rc)
+			dev_err(hdev->dev,
+				"unmapping vaddr 0x%llx failed due to MMU cache invalidation\n",
+				vaddr);
+
+		tmp_rc = add_va_block(hdev, va_range, vaddr,
+					vaddr + phys_pg_pack->total_size - 1);
+		if (tmp_rc) {
 			dev_warn(hdev->dev,
 					"add va block failed for vaddr: 0x%llx\n",
 					vaddr);
+			if (!rc)
+				rc = tmp_rc;
+		}
 	}

 	atomic_dec(&phys_pg_pack->mapping_cnt);

@@ -1108,7 +1129,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
 		dma_unmap_host_va(hdev, userptr);
 	}

-	return 0;
+	return rc;

 mapping_cnt_err:
 	if (is_userptr)
diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/sysfs.c
index e4454414d0e1..5d78d5e1c782 100644
--- a/drivers/misc/habanalabs/sysfs.c
+++ b/drivers/misc/habanalabs/sysfs.c
@@ -183,6 +183,11 @@ static ssize_t soft_reset_store(struct device *dev,
 		goto out;
 	}

+	if (!hdev->supports_soft_reset) {
+		dev_err(hdev->dev, "Device does not support soft-reset\n");
+		goto out;
+	}
+
 	dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");

 	hl_device_reset(hdev, false, false);
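
unmap_device_va() above also shows the error-aggregation idiom used when a failure must not skip cleanup: the VA block still goes back to the free list after an invalidation timeout, and the first error is the one reported. Stripped to its skeleton (both failing helpers are fabricated for the demo):

#include <stdio.h>

#define EIO	5	/* local stand-ins for the <errno.h> values */
#define ENOMEM	12

static int invalidate(void)	{ return -EIO; }	/* primary step fails */
static int add_va_block(void)	{ return -ENOMEM; }	/* cleanup fails too */

/* Mirrors the unmap_device_va() rework: run the cleanup step even when
 * invalidation failed, and report the first error to the caller. */
static int unmap(void)
{
	int rc = invalidate();
	int tmp_rc;

	if (rc)
		fprintf(stderr, "unmap failed due to MMU cache invalidation\n");

	tmp_rc = add_va_block();
	if (tmp_rc) {
		fprintf(stderr, "add va block failed\n");
		if (!rc)
			rc = tmp_rc;	/* don't mask the earlier error */
	}
	return rc;
}

int main(void)
{
	printf("unmap rc=%d\n", unmap());
	return 0;
}
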