habanalabs: improve communication protocol with cpucp

Current messaging communictaion protocol with cpucp can get out
of sync due to coherency issues. In order to improve the protocol
reliability, we modify the protocol to expect a different
acknowledgment for every packet sent to cpucp.

Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Ofir Bitton 2021-01-28 16:30:25 +02:00 committed by Oded Gabbay
parent 6c1e3f92f9
commit 5dbd7b4de6
5 changed files with 33 additions and 4 deletions

View File

@ -90,9 +90,10 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
u16 len, u32 timeout, u64 *result)
{
struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
struct cpucp_packet *pkt;
dma_addr_t pkt_dma_addr;
u32 tmp;
u32 tmp, expected_ack_val;
int rc = 0;
pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
@ -115,14 +116,22 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
goto out;
}
/* set fence to a non valid value */
pkt->fence = UINT_MAX;
rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
if (rc) {
dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
goto out;
}
if (hdev->asic_prop.fw_cpucp_ack_with_pi)
expected_ack_val = queue->pi;
else
expected_ack_val = CPUCP_PACKET_FENCE_VAL;
rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
(tmp == CPUCP_PACKET_FENCE_VAL), 1000,
(tmp == expected_ack_val), 1000,
timeout, true);
hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
@ -777,6 +786,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
prop->hard_reset_done_by_fw = true;
if (prop->fw_boot_cpu_security_map &
CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
prop->fw_cpucp_ack_with_pi = true;
dev_dbg(hdev->dev,
"Firmware boot CPU security status %#x\n",
prop->fw_boot_cpu_security_map);

View File

@ -419,6 +419,8 @@ struct hl_mmu_properties {
* from BOOT_DEV_STS0
* @dram_supports_virtual_memory: is there an MMU towards the DRAM
* @hard_reset_done_by_fw: true if firmware is handling hard reset flow
* @fw_cpucp_ack_with_pi: true if cpucp is acking messages with the PQ PI
* instead of a magic number
* @num_functional_hbms: number of functional HBMs in each DCORE.
*/
struct asic_fixed_properties {
@ -479,6 +481,7 @@ struct asic_fixed_properties {
u8 fw_security_status_valid;
u8 dram_supports_virtual_memory;
u8 hard_reset_done_by_fw;
u8 fw_cpucp_ack_with_pi;
u8 num_functional_hbms;
};

View File

@ -533,6 +533,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->fw_security_disabled = true;
prop->fw_security_status_valid = false;
prop->hard_reset_done_by_fw = false;
prop->fw_cpucp_ack_with_pi = false;
return 0;
}
@ -4438,9 +4439,12 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
/* ring the doorbell */
WREG32(db_reg_offset, db_value);
if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ)
if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ) {
/* make sure device CPU will read latest data from host */
mb();
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
GAUDI_EVENT_PI_UPDATE);
}
}
static void gaudi_pqe_write(struct hl_device *hdev, __le64 *pqe,

View File

@ -461,6 +461,7 @@ int goya_get_fixed_properties(struct hl_device *hdev)
prop->fw_security_disabled = true;
prop->fw_security_status_valid = false;
prop->hard_reset_done_by_fw = false;
prop->fw_cpucp_ack_with_pi = false;
return 0;
}
@ -2806,9 +2807,12 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
/* ring the doorbell */
WREG32(db_reg_offset, db_value);
if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ)
if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ) {
/* make sure device CPU will read latest data from host */
mb();
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
GOYA_ASYNC_EVENT_ID_PI_UPDATE);
}
}
void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd)

View File

@ -166,6 +166,10 @@
* FW handles HBM ECC indications.
* Initialized in: linux
*
* CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN Packets ack value used in the armcpd
* is set to the PI counter.
* Initialized in: linux
*
* CPU_BOOT_DEV_STS0_ENABLED Device status register enabled.
* This is a main indication that the
* running FW populates the device status
@ -190,6 +194,7 @@
#define CPU_BOOT_DEV_STS0_SP_SRAM_EN (1 << 12)
#define CPU_BOOT_DEV_STS0_CLK_GATE_EN (1 << 13)
#define CPU_BOOT_DEV_STS0_HBM_ECC_EN (1 << 14)
#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN (1 << 15)
#define CPU_BOOT_DEV_STS0_ENABLED (1 << 31)
enum cpu_boot_status {