diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 38e1ad432e51..0f804ecb6caa 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -1531,10 +1531,11 @@ out_err: return rc; } -static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event) +static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask) { mutex_lock(¬ifier_event->lock); - notifier_event->events_mask |= event; + notifier_event->events_mask |= event_mask; + if (notifier_event->eventfd) eventfd_signal(notifier_event->eventfd, 1); @@ -1545,17 +1546,17 @@ static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 * hl_notifier_event_send_all - notify all user processes via eventfd * * @hdev: pointer to habanalabs device structure - * @event: the occurred event + * @event_mask: the occurred event/s * Returns 0 for success or an error on failure. */ -void hl_notifier_event_send_all(struct hl_device *hdev, u64 event) +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask) { struct hl_fpriv *hpriv; mutex_lock(&hdev->fpriv_list_lock); list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) - hl_notifier_event_send(&hpriv->notifier_event, event); + hl_notifier_event_send(&hpriv->notifier_event, event_mask); mutex_unlock(&hdev->fpriv_list_lock); @@ -1563,7 +1564,7 @@ void hl_notifier_event_send_all(struct hl_device *hdev, u64 event) mutex_lock(&hdev->fpriv_ctrl_list_lock); list_for_each_entry(hpriv, &hdev->fpriv_ctrl_list, dev_node) - hl_notifier_event_send(&hpriv->notifier_event, event); + hl_notifier_event_send(&hpriv->notifier_event, event_mask); mutex_unlock(&hdev->fpriv_ctrl_list_lock); } diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h index 1ab64e8a05c6..3a0f6dca8361 100644 --- a/drivers/misc/habanalabs/common/habanalabs.h +++ b/drivers/misc/habanalabs/common/habanalabs.h @@ -2644,14 +2644,48 @@ struct razwi_info { u8 type; }; +#define MAX_QMAN_STREAMS_INFO 4 +#define OPCODE_INFO_MAX_ADDR_SIZE 8 +/** + * struct undefined_opcode_info - info about last undefined opcode error + * @timestamp: timestamp of the undefined opcode error + * @cb_addr_streams: CB addresses (per stream) that are currently exists in the PQ + * entiers. In case all streams array entries are + * filled with values, it means the execution was in Lower-CP. + * @cq_addr: the address of the current handled command buffer + * @cq_size: the size of the current handled command buffer + * @cb_addr_streams_len: num of streams - actual len of cb_addr_streams array. + * should be equal to 1 incase of undefined opcode + * in Upper-CP (specific stream) and equal to 4 incase + * of undefined opcode in Lower-CP. + * @engine_id: engine-id that the error occurred on + * @stream_id: the stream id the error occurred on. In case the stream equals to + * MAX_QMAN_STREAMS_INFO it means the error occurred on a Lower-CP. + * @write_enable: if set, writing to undefined opcode parameters in the structure + * is enable so the first (root cause) undefined opcode will not be + * overwritten. + */ +struct undefined_opcode_info { + ktime_t timestamp; + u64 cb_addr_streams[MAX_QMAN_STREAMS_INFO][OPCODE_INFO_MAX_ADDR_SIZE]; + u64 cq_addr; + u32 cq_size; + u32 cb_addr_streams_len; + u32 engine_id; + u32 stream_id; + bool write_enable; +}; + /** * struct last_error_session_info - info about last session errors occurred. * @cs_timeout: CS timeout error last information. * @razwi: razwi last information. + * @undef_opcode: undefined opcode information */ struct last_error_session_info { - struct cs_timeout_info cs_timeout; - struct razwi_info razwi; + struct cs_timeout_info cs_timeout; + struct razwi_info razwi; + struct undefined_opcode_info undef_opcode; }; /** @@ -3159,7 +3193,7 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization); int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sensors_arr); -void hl_notifier_event_send_all(struct hl_device *hdev, u64 event); +void hl_notifier_event_send_all(struct hl_device *hdev, u64 event_mask); int hl_sysfs_init(struct hl_device *hdev); void hl_sysfs_fini(struct hl_device *hdev); diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c index e617cc394ff7..d02533666746 100644 --- a/drivers/misc/habanalabs/common/habanalabs_drv.c +++ b/drivers/misc/habanalabs/common/habanalabs_drv.c @@ -198,6 +198,7 @@ int hl_device_open(struct inode *inode, struct file *filp) atomic_set(&hdev->last_error.cs_timeout.write_enable, 1); atomic_set(&hdev->last_error.razwi.write_enable, 1); + hdev->last_error.undef_opcode.write_enable = true; hdev->open_counter++; hdev->last_successful_open_jif = jiffies; diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c index 72b0d145e853..ec9f0a93cbe2 100644 --- a/drivers/misc/habanalabs/gaudi/gaudi.c +++ b/drivers/misc/habanalabs/gaudi/gaudi.c @@ -443,6 +443,38 @@ static s64 gaudi_state_dump_specs_props[] = { [SP_NUM_CORES] = 1, }; +static const int gaudi_queue_id_to_engine_id[] = { + [GAUDI_QUEUE_ID_DMA_0_0...GAUDI_QUEUE_ID_DMA_0_3] = GAUDI_ENGINE_ID_DMA_0, + [GAUDI_QUEUE_ID_DMA_1_0...GAUDI_QUEUE_ID_DMA_1_3] = GAUDI_ENGINE_ID_DMA_1, + [GAUDI_QUEUE_ID_CPU_PQ] = GAUDI_ENGINE_ID_SIZE, + [GAUDI_QUEUE_ID_DMA_2_0...GAUDI_QUEUE_ID_DMA_2_3] = GAUDI_ENGINE_ID_DMA_2, + [GAUDI_QUEUE_ID_DMA_3_0...GAUDI_QUEUE_ID_DMA_3_3] = GAUDI_ENGINE_ID_DMA_3, + [GAUDI_QUEUE_ID_DMA_4_0...GAUDI_QUEUE_ID_DMA_4_3] = GAUDI_ENGINE_ID_DMA_4, + [GAUDI_QUEUE_ID_DMA_5_0...GAUDI_QUEUE_ID_DMA_5_3] = GAUDI_ENGINE_ID_DMA_5, + [GAUDI_QUEUE_ID_DMA_6_0...GAUDI_QUEUE_ID_DMA_6_3] = GAUDI_ENGINE_ID_DMA_6, + [GAUDI_QUEUE_ID_DMA_7_0...GAUDI_QUEUE_ID_DMA_7_3] = GAUDI_ENGINE_ID_DMA_7, + [GAUDI_QUEUE_ID_MME_0_0...GAUDI_QUEUE_ID_MME_0_3] = GAUDI_ENGINE_ID_MME_0, + [GAUDI_QUEUE_ID_MME_1_0...GAUDI_QUEUE_ID_MME_1_3] = GAUDI_ENGINE_ID_MME_1, + [GAUDI_QUEUE_ID_TPC_0_0...GAUDI_QUEUE_ID_TPC_0_3] = GAUDI_ENGINE_ID_TPC_0, + [GAUDI_QUEUE_ID_TPC_1_0...GAUDI_QUEUE_ID_TPC_1_3] = GAUDI_ENGINE_ID_TPC_1, + [GAUDI_QUEUE_ID_TPC_2_0...GAUDI_QUEUE_ID_TPC_2_3] = GAUDI_ENGINE_ID_TPC_2, + [GAUDI_QUEUE_ID_TPC_3_0...GAUDI_QUEUE_ID_TPC_3_3] = GAUDI_ENGINE_ID_TPC_3, + [GAUDI_QUEUE_ID_TPC_4_0...GAUDI_QUEUE_ID_TPC_4_3] = GAUDI_ENGINE_ID_TPC_4, + [GAUDI_QUEUE_ID_TPC_5_0...GAUDI_QUEUE_ID_TPC_5_3] = GAUDI_ENGINE_ID_TPC_5, + [GAUDI_QUEUE_ID_TPC_6_0...GAUDI_QUEUE_ID_TPC_6_3] = GAUDI_ENGINE_ID_TPC_6, + [GAUDI_QUEUE_ID_TPC_7_0...GAUDI_QUEUE_ID_TPC_7_3] = GAUDI_ENGINE_ID_TPC_7, + [GAUDI_QUEUE_ID_NIC_0_0...GAUDI_QUEUE_ID_NIC_0_3] = GAUDI_ENGINE_ID_NIC_0, + [GAUDI_QUEUE_ID_NIC_1_0...GAUDI_QUEUE_ID_NIC_1_3] = GAUDI_ENGINE_ID_NIC_1, + [GAUDI_QUEUE_ID_NIC_2_0...GAUDI_QUEUE_ID_NIC_2_3] = GAUDI_ENGINE_ID_NIC_2, + [GAUDI_QUEUE_ID_NIC_3_0...GAUDI_QUEUE_ID_NIC_3_3] = GAUDI_ENGINE_ID_NIC_3, + [GAUDI_QUEUE_ID_NIC_4_0...GAUDI_QUEUE_ID_NIC_4_3] = GAUDI_ENGINE_ID_NIC_4, + [GAUDI_QUEUE_ID_NIC_5_0...GAUDI_QUEUE_ID_NIC_5_3] = GAUDI_ENGINE_ID_NIC_5, + [GAUDI_QUEUE_ID_NIC_6_0...GAUDI_QUEUE_ID_NIC_6_3] = GAUDI_ENGINE_ID_NIC_6, + [GAUDI_QUEUE_ID_NIC_7_0...GAUDI_QUEUE_ID_NIC_7_3] = GAUDI_ENGINE_ID_NIC_7, + [GAUDI_QUEUE_ID_NIC_8_0...GAUDI_QUEUE_ID_NIC_8_3] = GAUDI_ENGINE_ID_NIC_8, + [GAUDI_QUEUE_ID_NIC_9_0...GAUDI_QUEUE_ID_NIC_9_3] = GAUDI_ENGINE_ID_NIC_9, +}; + /* The order here is opposite to the order of the indexing in the h/w. * i.e. SYNC_MGR_W_S is actually 0, SYNC_MGR_E_S is 1, etc. */ @@ -6989,14 +7021,15 @@ static inline u32 gaudi_queue_idx_dec(u32 idx, u32 q_len) } /** - * gaudi_print_sw_config_stream_data - print SW config stream data + * gaudi_handle_sw_config_stream_data - print SW config stream data * * @hdev: pointer to the habanalabs device structure * @stream: the QMAN's stream * @qman_base: base address of QMAN registers block + * @event_mask: mask of the last events occurred */ -static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream, - u64 qman_base) +static void gaudi_handle_sw_config_stream_data(struct hl_device *hdev, u32 stream, + u64 qman_base, u64 event_mask) { u64 cq_ptr_lo, cq_ptr_hi, cq_tsize, cq_ptr; u32 cq_ptr_lo_off, size; @@ -7014,24 +7047,32 @@ static void gaudi_print_sw_config_stream_data(struct hl_device *hdev, u32 stream size = RREG32(cq_tsize); dev_info(hdev->dev, "stop on err: stream: %u, addr: %#llx, size: %u\n", stream, cq_ptr, size); + + if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { + hdev->last_error.undef_opcode.cq_addr = cq_ptr; + hdev->last_error.undef_opcode.cq_size = size; + hdev->last_error.undef_opcode.stream_id = stream; + } } /** - * gaudi_print_last_pqes_on_err - print last PQEs on error + * gaudi_handle_last_pqes_on_err - print last PQEs on error * * @hdev: pointer to the habanalabs device structure * @qid_base: first QID of the QMAN (out of 4 streams) * @stream: the QMAN's stream * @qman_base: base address of QMAN registers block + * @event_mask: mask of the last events occurred * @pr_sw_conf: if true print the SW config stream data (CQ PTR and SIZE) */ -static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, +static void gaudi_handle_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, u32 stream, u64 qman_base, + u64 event_mask, bool pr_sw_conf) { u32 ci, qm_ci_stream_off, queue_len; struct hl_hw_queue *q; - u64 pq_ci; + u64 pq_ci, addr[PQ_FETCHER_CACHE_SIZE]; int i; q = &hdev->kernel_queues[qid_base + stream]; @@ -7046,16 +7087,16 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, hdev->asic_funcs->hw_queues_lock(hdev); if (pr_sw_conf) - gaudi_print_sw_config_stream_data(hdev, stream, qman_base); + gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask); ci = RREG32(pq_ci); /* we should start printing form ci -1 */ ci = gaudi_queue_idx_dec(ci, queue_len); + memset(addr, 0, sizeof(addr)); for (i = 0; i < PQ_FETCHER_CACHE_SIZE; i++) { struct hl_bd *bd; - u64 addr; u32 len; bd = q->kernel_address; @@ -7066,52 +7107,68 @@ static void gaudi_print_last_pqes_on_err(struct hl_device *hdev, u32 qid_base, if (!len) break; - addr = le64_to_cpu(bd->ptr); + addr[i] = le64_to_cpu(bd->ptr); dev_info(hdev->dev, "stop on err PQE(stream %u): ci: %u, addr: %#llx, size: %u\n", - stream, ci, addr, len); + stream, ci, addr[i], len); /* get previous ci, wrap if needed */ ci = gaudi_queue_idx_dec(ci, queue_len); } + if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { + struct undefined_opcode_info *undef_opcode = &hdev->last_error.undef_opcode; + u32 arr_idx = undef_opcode->cb_addr_streams_len; + + if (arr_idx == 0) { + undef_opcode->timestamp = ktime_get(); + undef_opcode->engine_id = gaudi_queue_id_to_engine_id[qid_base]; + } + + memcpy(undef_opcode->cb_addr_streams[arr_idx], addr, sizeof(addr)); + undef_opcode->cb_addr_streams_len++; + } + hdev->asic_funcs->hw_queues_unlock(hdev); } /** - * print_qman_data_on_err - extract QMAN data on error + * handle_qman_data_on_err - extract QMAN data on error * * @hdev: pointer to the habanalabs device structure * @qid_base: first QID of the QMAN (out of 4 streams) * @stream: the QMAN's stream * @qman_base: base address of QMAN registers block + * @event_mask: mask of the last events occurred * * This function attempt to exatract as much data as possible on QMAN error. * On upper CP print the SW config stream data and last 8 PQEs. * On lower CP print SW config data and last PQEs of ALL 4 upper CPs */ -static void print_qman_data_on_err(struct hl_device *hdev, u32 qid_base, - u32 stream, u64 qman_base) +static void handle_qman_data_on_err(struct hl_device *hdev, u32 qid_base, + u32 stream, u64 qman_base, u64 event_mask) { u32 i; if (stream != QMAN_STREAMS) { - gaudi_print_last_pqes_on_err(hdev, qid_base, stream, qman_base, - true); + gaudi_handle_last_pqes_on_err(hdev, qid_base, stream, + qman_base, event_mask, true); return; } - gaudi_print_sw_config_stream_data(hdev, stream, qman_base); + /* handle Lower-CP */ + gaudi_handle_sw_config_stream_data(hdev, stream, qman_base, event_mask); for (i = 0; i < QMAN_STREAMS; i++) - gaudi_print_last_pqes_on_err(hdev, qid_base, i, qman_base, - false); + gaudi_handle_last_pqes_on_err(hdev, qid_base, i, + qman_base, event_mask, false); } static void gaudi_handle_qman_err_generic(struct hl_device *hdev, const char *qm_name, u64 qman_base, - u32 qid_base) + u32 qid_base, + u64 *event_mask) { u32 i, j, glbl_sts_val, arb_err_val, glbl_sts_clr_val; u64 glbl_sts_addr, arb_err_addr; @@ -7142,12 +7199,21 @@ static void gaudi_handle_qman_err_generic(struct hl_device *hdev, glbl_sts_clr_val |= BIT(j); } } + /* check for undefined opcode */ + if (glbl_sts_val & TPC0_QM_GLBL_STS1_CP_UNDEF_CMD_ERR_MASK && + hdev->last_error.undef_opcode.write_enable) { + memset(&hdev->last_error.undef_opcode, 0, + sizeof(hdev->last_error.undef_opcode)); + + hdev->last_error.undef_opcode.write_enable = false; + *event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE; + } /* Write 1 clear errors */ if (!hdev->stop_on_err) WREG32(glbl_sts_addr + 4 * i, glbl_sts_clr_val); else - print_qman_data_on_err(hdev, qid_base, i, qman_base); + handle_qman_data_on_err(hdev, qid_base, i, qman_base, *event_mask); } arb_err_val = RREG32(arb_err_addr); @@ -7385,7 +7451,7 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type, u64 *e return; } - gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base); + gaudi_handle_qman_err_generic(hdev, desc, qman_base, qid_base, event_mask); } static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type, diff --git a/include/uapi/misc/habanalabs.h b/include/uapi/misc/habanalabs.h index 6d2ccc09dcf2..c94b89cf1ec1 100644 --- a/include/uapi/misc/habanalabs.h +++ b/include/uapi/misc/habanalabs.h @@ -1402,9 +1402,13 @@ struct hl_debug_args { /* * Notifier event values - for the notification mechanism and the HL_INFO_GET_EVENTS command * - * HL_NOTIFIER_EVENT_TPC_ASSERT - Indicates TPC assert event + * HL_NOTIFIER_EVENT_TPC_ASSERT - Indicates TPC assert event + * HL_NOTIFIER_EVENT_UNDEFINED_OPCODE - Indicates undefined operation code + * HL_NOTIFIER_EVENT_DEVICE_RESET - Indicates device requires a reset */ -#define HL_NOTIFIER_EVENT_TPC_ASSERT (1 << 0) +#define HL_NOTIFIER_EVENT_TPC_ASSERT (1ULL << 0) +#define HL_NOTIFIER_EVENT_UNDEFINED_OPCODE (1ULL << 1) +#define HL_NOTIFIER_EVENT_DEVICE_RESET (1ULL << 2) /* * Various information operations such as: