From fc6121e961cba2e4790493f656b7066e250f9521 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 23 Sep 2020 14:07:32 +0300 Subject: [PATCH 1/5] habanalabs: correct an error message We don't try to allocate huge pages here so remove the huge word. Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c index 3324332811bc..84227819e4d1 100644 --- a/drivers/misc/habanalabs/common/memory.c +++ b/drivers/misc/habanalabs/common/memory.c @@ -77,8 +77,8 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args, paddr = (u64) gen_pool_alloc(vm->dram_pg_pool, total_size); if (!paddr) { dev_err(hdev->dev, - "failed to allocate %llu huge contiguous pages\n", - num_pgs); + "failed to allocate %llu contiguous pages with total size of %llu\n", + num_pgs, total_size); return -ENOMEM; } } From 9e2e8fc7d65c9c2b58a4bf5b6c62a6fbb3762d05 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 23 Sep 2020 15:06:52 +0300 Subject: [PATCH 2/5] habanalabs: release kernel context after hw_fini Some engines use resources that belong to the kernel context (e.g. MMU mappings). In case the halt-engines doesn't work properly due to H/W restriction, we need to make sure the kernel context lives on until after the hw_fini. The hw_fini resets the ASIC after that no engine is alive and we can safely close the kernel context. Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/device.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 196e35d71118..20572224099a 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -967,14 +967,13 @@ again: flush_workqueue(hdev->eq_wq); } - /* Release kernel context */ - if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1)) - hdev->kernel_ctx = NULL; - /* Reset the H/W. It will be in idle state after this returns */ hdev->asic_funcs->hw_fini(hdev, hard_reset); if (hard_reset) { + /* Release kernel context */ + if (hl_ctx_put(hdev->kernel_ctx) == 1) + hdev->kernel_ctx = NULL; hl_vm_fini(hdev); hl_mmu_fini(hdev); hl_eq_reset(hdev, &hdev->event_queue); @@ -1465,13 +1464,13 @@ void hl_device_fini(struct hl_device *hdev) hl_cb_pool_fini(hdev); + /* Reset the H/W. It will be in idle state after this returns */ + hdev->asic_funcs->hw_fini(hdev, true); + /* Release kernel context */ if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) dev_err(hdev->dev, "kernel ctx is still alive\n"); - /* Reset the H/W. It will be in idle state after this returns */ - hdev->asic_funcs->hw_fini(hdev, true); - hl_vm_fini(hdev); hl_mmu_fini(hdev); From 3c3aa5dbd65915abaa87d91330f4463c99d0df44 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 23 Sep 2020 23:51:03 +0300 Subject: [PATCH 3/5] habanalabs: add debug messages for opening/closing context During debugging of error we sometimes need to know whether the error happened when a user context was open. Add debug prints when opening and closing user contexts. Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index df8171a2226c..bd03ef074eed 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -28,6 +28,8 @@ static void hl_ctx_fini(struct hl_ctx *ctx) kfree(ctx->cs_pending); if (ctx->asid != HL_KERNEL_ASID_ID) { + dev_dbg(hdev->dev, "closing user context %d\n", ctx->asid); + /* The engines are stopped as there is no executing CS, but the * Coresight might be still working by accessing addresses * related to the stopped engines. Hence stop it explicitly. @@ -41,6 +43,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx) hl_vm_ctx_fini(ctx); hl_asid_free(hdev, ctx->asid); } else { + dev_dbg(hdev->dev, "closing kernel context\n"); hl_mmu_ctx_fini(ctx); } } @@ -168,6 +171,8 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx) dev_err(hdev->dev, "ctx_init failed\n"); goto err_cb_va_pool_fini; } + + dev_dbg(hdev->dev, "create user context %d\n", ctx->asid); } return 0; From eab1f6e7b035896a03e9683d3a80c6bd33b83476 Mon Sep 17 00:00:00 2001 From: Oded Gabbay Date: Wed, 23 Sep 2020 23:56:18 +0300 Subject: [PATCH 4/5] habanalabs: add notice of device not idle The device should be idle after a context is closed. If not, print a notice. Reviewed-by: Tomer Tayar Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/common/context.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c index bd03ef074eed..7a59dd7c6450 100644 --- a/drivers/misc/habanalabs/common/context.c +++ b/drivers/misc/habanalabs/common/context.c @@ -12,6 +12,7 @@ static void hl_ctx_fini(struct hl_ctx *ctx) { struct hl_device *hdev = ctx->hdev; + u64 idle_mask = 0; int i; /* @@ -42,6 +43,13 @@ static void hl_ctx_fini(struct hl_ctx *ctx) hl_cb_va_pool_fini(ctx); hl_vm_ctx_fini(ctx); hl_asid_free(hdev, ctx->asid); + + if ((!hdev->pldm) && (hdev->pdev) && + (!hdev->asic_funcs->is_device_idle(hdev, + &idle_mask, NULL))) + dev_notice(hdev->dev, + "device not idle after user context is closed (0x%llx)\n", + idle_mask); } else { dev_dbg(hdev->dev, "closing kernel context\n"); hl_mmu_ctx_fini(ctx); From 25121d9804b8421851e3ccb88d1d35a5d93692b8 Mon Sep 17 00:00:00 2001 From: Ofir Bitton Date: Thu, 24 Sep 2020 08:22:58 +0300 Subject: [PATCH 5/5] habanalabs/gaudi: configure QMAN LDMA registers properly LDMA registers are configured with a fixed value. We add new define set which gives the configuration a proper meaning. Signed-off-by: Ofir Bitton Reviewed-by: Oded Gabbay Signed-off-by: Oded Gabbay --- drivers/misc/habanalabs/gaudi/gaudi.c | 62 +++++++++++++++++--------- drivers/misc/habanalabs/gaudi/gaudiP.h | 8 ++++ 2 files changed, 49 insertions(+), 21 deletions(-) diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 1b51e670bd4e..a227806be328 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -1865,9 +1865,11 @@ static void gaudi_init_pci_dma_qman(struct hl_device *hdev, int dma_id, WREG32(mmDMA0_QM_PQ_PI_0 + q_off, 0); WREG32(mmDMA0_QM_PQ_CI_0 + q_off, 0); - WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, 0x74); - WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, 0x14); - WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, 0x1C); + WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, QMAN_LDMA_SIZE_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_DST_OFFSET); WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_LO_0 + q_off, mtr_base_en_lo); WREG32(mmDMA0_QM_CP_MSG_BASE0_ADDR_HI_0 + q_off, mtr_base_en_hi); @@ -2025,13 +2027,19 @@ static void gaudi_init_hbm_dma_qman(struct hl_device *hdev, int dma_id, WREG32(mmDMA0_QM_PQ_PI_0 + q_off, 0); WREG32(mmDMA0_QM_PQ_CI_0 + q_off, 0); - WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, 0x81BC); - WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, 0x81B4); - WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, 0x1C); + WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_CPDMA_SIZE_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_SRC_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_DST_OFFSET); } else { - WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, 0x74); - WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, 0x14); - WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, 0x1C); + WREG32(mmDMA0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_LDMA_SIZE_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmDMA0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SIZE_OFFSET); /* Configure RAZWI IRQ */ dma_qm_err_cfg = HBM_DMA_QMAN_GLBL_ERR_CFG_MSG_EN_MASK; @@ -2135,13 +2143,19 @@ static void gaudi_init_mme_qman(struct hl_device *hdev, u32 mme_offset, WREG32(mmMME0_QM_PQ_PI_0 + q_off, 0); WREG32(mmMME0_QM_PQ_CI_0 + q_off, 0); - WREG32(mmMME0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, 0x81BC); - WREG32(mmMME0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, 0x81B4); - WREG32(mmMME0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, 0x1C); + WREG32(mmMME0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_CPDMA_SIZE_OFFSET); + WREG32(mmMME0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_SRC_OFFSET); + WREG32(mmMME0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_DST_OFFSET); } else { - WREG32(mmMME0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, 0x74); - WREG32(mmMME0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, 0x14); - WREG32(mmMME0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, 0x1C); + WREG32(mmMME0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_LDMA_SIZE_OFFSET); + WREG32(mmMME0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmMME0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_DST_OFFSET); /* Configure RAZWI IRQ */ mme_id = mme_offset / @@ -2249,13 +2263,19 @@ static void gaudi_init_tpc_qman(struct hl_device *hdev, u32 tpc_offset, WREG32(mmTPC0_QM_PQ_PI_0 + q_off, 0); WREG32(mmTPC0_QM_PQ_CI_0 + q_off, 0); - WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, 0x81BC); - WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, 0x81B4); - WREG32(mmTPC0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, 0x1C); + WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_CPDMA_SIZE_OFFSET); + WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_SRC_OFFSET); + WREG32(mmTPC0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_CPDMA_DST_OFFSET); } else { - WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, 0x74); - WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, 0x14); - WREG32(mmTPC0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, 0x1C); + WREG32(mmTPC0_QM_CP_LDMA_TSIZE_OFFSET_0 + q_off, + QMAN_LDMA_SIZE_OFFSET); + WREG32(mmTPC0_QM_CP_LDMA_SRC_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_SRC_OFFSET); + WREG32(mmTPC0_QM_CP_LDMA_DST_BASE_LO_OFFSET_0 + q_off, + QMAN_LDMA_DST_OFFSET); /* Configure RAZWI IRQ */ tpc_id = tpc_offset / diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h index b70b810c21c9..83ad2b0a3a61 100644 --- a/drivers/misc/habanalabs/gaudi/gaudiP.h +++ b/drivers/misc/habanalabs/gaudi/gaudiP.h @@ -84,6 +84,14 @@ #define DMA_CORE_OFFSET (mmDMA1_CORE_BASE - mmDMA0_CORE_BASE) +#define QMAN_LDMA_SRC_OFFSET (mmDMA0_CORE_SRC_BASE_LO - mmDMA0_CORE_CFG_0) +#define QMAN_LDMA_DST_OFFSET (mmDMA0_CORE_DST_BASE_LO - mmDMA0_CORE_CFG_0) +#define QMAN_LDMA_SIZE_OFFSET (mmDMA0_CORE_DST_TSIZE_0 - mmDMA0_CORE_CFG_0) + +#define QMAN_CPDMA_SRC_OFFSET (mmDMA0_QM_CQ_PTR_LO_4 - mmDMA0_CORE_CFG_0) +#define QMAN_CPDMA_DST_OFFSET (mmDMA0_CORE_DST_BASE_LO - mmDMA0_CORE_CFG_0) +#define QMAN_CPDMA_SIZE_OFFSET (mmDMA0_QM_CQ_TSIZE_4 - mmDMA0_CORE_CFG_0) + #define SIF_RTR_CTRL_OFFSET (mmSIF_RTR_CTRL_1_BASE - mmSIF_RTR_CTRL_0_BASE) #define NIF_RTR_CTRL_OFFSET (mmNIF_RTR_CTRL_1_BASE - mmNIF_RTR_CTRL_0_BASE)