habanalabs/gaudi: use HBM_ECC_EN bit for ECC ERR

driver should use ECC info from FW only if HBM ECC CAP is set.
otherwise, try to fetch the data from MC regs only if security is
disabled.

Signed-off-by: Ohad Sharabi <osharabi@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
This commit is contained in:
Ohad Sharabi 2021-01-27 15:42:53 +02:00 committed by Oded Gabbay
parent e52606d2f5
commit b520ca5d82

View File

@ -7105,7 +7105,9 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
u32 base, val, val2, wr_par, rd_par, ca_par, derr, serr, type, ch;
int err = 0;
if (!hdev->asic_prop.fw_security_disabled) {
if (hdev->asic_prop.fw_security_status_valid &&
(hdev->asic_prop.fw_app_security_map &
CPU_BOOT_DEV_STS0_HBM_ECC_EN)) {
if (!hbm_ecc_data) {
dev_err(hdev->dev, "No FW ECC data");
return 0;
@ -7127,14 +7129,24 @@ static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
le32_to_cpu(hbm_ecc_data->hbm_ecc_info));
dev_err(hdev->dev,
"HBM%d pc%d ECC: TYPE=%d, WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
device, ch, type, wr_par, rd_par, ca_par, serr, derr);
"HBM%d pc%d interrupts info: WR_PAR=%d, RD_PAR=%d, CA_PAR=%d, SERR=%d, DERR=%d\n",
device, ch, wr_par, rd_par, ca_par, serr, derr);
dev_err(hdev->dev,
"HBM%d pc%d ECC info: 1ST_ERR_ADDR=0x%x, 1ST_ERR_TYPE=%d, SEC_CONT_CNT=%u, SEC_CNT=%d, DEC_CNT=%d\n",
device, ch, hbm_ecc_data->first_addr, type,
hbm_ecc_data->sec_cont_cnt, hbm_ecc_data->sec_cnt,
hbm_ecc_data->dec_cnt);
err = 1;
return 0;
}
if (!hdev->asic_prop.fw_security_disabled) {
dev_info(hdev->dev, "Cannot access MC regs for ECC data while security is enabled\n");
return 0;
}
base = GAUDI_HBM_CFG_BASE + device * GAUDI_HBM_CFG_OFFSET;
for (ch = 0 ; ch < GAUDI_HBM_CHANNELS ; ch++) {
val = RREG32_MASK(base + ch * 0x1000 + 0x06C, 0x0000FFFF);