forked from Minki/linux
habanalabs: improve communication protocol with cpucp
The current messaging communication protocol with cpucp can get out of sync due to coherency issues. To improve the protocol's reliability, we modify the protocol to expect a different acknowledgment for every packet sent to cpucp. Signed-off-by: Ofir Bitton <obitton@habana.ai> Reviewed-by: Oded Gabbay <ogabbay@kernel.org> Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
parent
6c1e3f92f9
commit
5dbd7b4de6
@ -90,9 +90,10 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
|
||||
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
|
||||
u16 len, u32 timeout, u64 *result)
|
||||
{
|
||||
struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
|
||||
struct cpucp_packet *pkt;
|
||||
dma_addr_t pkt_dma_addr;
|
||||
u32 tmp;
|
||||
u32 tmp, expected_ack_val;
|
||||
int rc = 0;
|
||||
|
||||
pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
|
||||
@ -115,14 +116,22 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* set fence to a non valid value */
|
||||
pkt->fence = UINT_MAX;
|
||||
|
||||
rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
|
||||
if (rc) {
|
||||
dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (hdev->asic_prop.fw_cpucp_ack_with_pi)
|
||||
expected_ack_val = queue->pi;
|
||||
else
|
||||
expected_ack_val = CPUCP_PACKET_FENCE_VAL;
|
||||
|
||||
rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
|
||||
(tmp == CPUCP_PACKET_FENCE_VAL), 1000,
|
||||
(tmp == expected_ack_val), 1000,
|
||||
timeout, true);
|
||||
|
||||
hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
|
||||
@ -777,6 +786,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
|
||||
CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
|
||||
prop->hard_reset_done_by_fw = true;
|
||||
|
||||
if (prop->fw_boot_cpu_security_map &
|
||||
CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
|
||||
prop->fw_cpucp_ack_with_pi = true;
|
||||
|
||||
dev_dbg(hdev->dev,
|
||||
"Firmware boot CPU security status %#x\n",
|
||||
prop->fw_boot_cpu_security_map);
|
||||
|
@ -419,6 +419,8 @@ struct hl_mmu_properties {
|
||||
* from BOOT_DEV_STS0
|
||||
* @dram_supports_virtual_memory: is there an MMU towards the DRAM
|
||||
* @hard_reset_done_by_fw: true if firmware is handling hard reset flow
|
||||
* @fw_cpucp_ack_with_pi: true if cpucp is acking messages with the PQ PI
|
||||
* instead of a magic number
|
||||
* @num_functional_hbms: number of functional HBMs in each DCORE.
|
||||
*/
|
||||
struct asic_fixed_properties {
|
||||
@ -479,6 +481,7 @@ struct asic_fixed_properties {
|
||||
u8 fw_security_status_valid;
|
||||
u8 dram_supports_virtual_memory;
|
||||
u8 hard_reset_done_by_fw;
|
||||
u8 fw_cpucp_ack_with_pi;
|
||||
u8 num_functional_hbms;
|
||||
};
|
||||
|
||||
|
@ -533,6 +533,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
|
||||
prop->fw_security_disabled = true;
|
||||
prop->fw_security_status_valid = false;
|
||||
prop->hard_reset_done_by_fw = false;
|
||||
prop->fw_cpucp_ack_with_pi = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -4438,9 +4439,12 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
|
||||
/* ring the doorbell */
|
||||
WREG32(db_reg_offset, db_value);
|
||||
|
||||
if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ)
|
||||
if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ) {
|
||||
/* make sure device CPU will read latest data from host */
|
||||
mb();
|
||||
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
|
||||
GAUDI_EVENT_PI_UPDATE);
|
||||
}
|
||||
}
|
||||
|
||||
static void gaudi_pqe_write(struct hl_device *hdev, __le64 *pqe,
|
||||
|
@ -461,6 +461,7 @@ int goya_get_fixed_properties(struct hl_device *hdev)
|
||||
prop->fw_security_disabled = true;
|
||||
prop->fw_security_status_valid = false;
|
||||
prop->hard_reset_done_by_fw = false;
|
||||
prop->fw_cpucp_ack_with_pi = false;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -2806,9 +2807,12 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
|
||||
/* ring the doorbell */
|
||||
WREG32(db_reg_offset, db_value);
|
||||
|
||||
if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ)
|
||||
if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ) {
|
||||
/* make sure device CPU will read latest data from host */
|
||||
mb();
|
||||
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
|
||||
GOYA_ASYNC_EVENT_ID_PI_UPDATE);
|
||||
}
|
||||
}
|
||||
|
||||
void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd)
|
||||
|
@ -166,6 +166,10 @@
|
||||
* FW handles HBM ECC indications.
|
||||
* Initialized in: linux
|
||||
*
|
||||
* CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN Packets ack value used in the armcpd
|
||||
* is set to the PI counter.
|
||||
* Initialized in: linux
|
||||
*
|
||||
* CPU_BOOT_DEV_STS0_ENABLED Device status register enabled.
|
||||
* This is a main indication that the
|
||||
* running FW populates the device status
|
||||
@ -190,6 +194,7 @@
|
||||
#define CPU_BOOT_DEV_STS0_SP_SRAM_EN (1 << 12)
|
||||
#define CPU_BOOT_DEV_STS0_CLK_GATE_EN (1 << 13)
|
||||
#define CPU_BOOT_DEV_STS0_HBM_ECC_EN (1 << 14)
|
||||
#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN (1 << 15)
|
||||
#define CPU_BOOT_DEV_STS0_ENABLED (1 << 31)
|
||||
|
||||
enum cpu_boot_status {
|
||||
|
Loading…
Reference in New Issue
Block a user