From cbbd5fff86e8fd22152982d077ed06b1210f2779 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Mon, 20 Dec 2021 12:56:11 +0200 Subject: [PATCH 1/4] mlxsw: Fix naming convention of MFDE fields Currently, the MFDE register field names are using the convention: reg_mfde_, and do not consider the name of the MFDE event. Fix the field names so they fit the more accurate convention: reg_mfde__. Signed-off-by: Danielle Ratson Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 8 ++++---- drivers/net/ethernet/mellanox/mlxsw/reg.h | 19 ++++++++----------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 7e89d5980d5e..e1d4056f9eea 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1801,20 +1801,20 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor return err; if (event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO) { - val = mlxsw_reg_mfde_log_address_get(mfde_pl); + val = mlxsw_reg_mfde_crspace_to_log_address_get(mfde_pl); err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val); if (err) return err; - val = mlxsw_reg_mfde_log_id_get(mfde_pl); + val = mlxsw_reg_mfde_crspace_to_log_id_get(mfde_pl); err = devlink_fmsg_u8_pair_put(fmsg, "log_irisc_id", val); if (err) return err; - val = mlxsw_reg_mfde_log_ip_get(mfde_pl); + val = mlxsw_reg_mfde_crspace_to_log_ip_get(mfde_pl); err = devlink_fmsg_u64_pair_put(fmsg, "log_ip", val); if (err) return err; } else if (event_id == MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP) { - val = mlxsw_reg_mfde_pipes_mask_get(mfde_pl); + val = mlxsw_reg_mfde_kvd_im_stop_pipes_mask_get(mfde_pl); err = devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index f748b537bdab..ed0767cc71c2 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -11372,32 +11372,29 @@ MLXSW_ITEM32(reg, mfde, command_type, 0x04, 24, 2); */ MLXSW_ITEM32(reg, mfde, reg_attr_id, 0x04, 0, 16); -/* reg_mfde_log_address +/* reg_mfde_crspace_to_log_address * crspace address accessed, which resulted in timeout. - * Valid in case event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO * Access: RO */ -MLXSW_ITEM32(reg, mfde, log_address, 0x10, 0, 32); +MLXSW_ITEM32(reg, mfde, crspace_to_log_address, 0x10, 0, 32); -/* reg_mfde_log_id +/* reg_mfde_crspace_to_log_id * Which irisc triggered the timeout. - * Valid in case event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO * Access: RO */ -MLXSW_ITEM32(reg, mfde, log_id, 0x14, 0, 4); +MLXSW_ITEM32(reg, mfde, crspace_to_log_id, 0x14, 0, 4); -/* reg_mfde_log_ip +/* reg_mfde_crspace_to_log_ip * IP (instruction pointer) that triggered the timeout. - * Valid in case event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO * Access: RO */ -MLXSW_ITEM64(reg, mfde, log_ip, 0x18, 0, 64); +MLXSW_ITEM64(reg, mfde, crspace_to_log_ip, 0x18, 0, 64); -/* reg_mfde_pipes_mask +/* reg_mfde_kvd_im_stop_pipes_mask * Bit per kvh pipe. * Access: RO */ -MLXSW_ITEM32(reg, mfde, pipes_mask, 0x10, 0, 16); +MLXSW_ITEM32(reg, mfde, kvd_im_stop_pipes_mask, 0x10, 0, 16); /* TNGCR - Tunneling NVE General Configuration Register * ---------------------------------------------------- From 4bcbf50291f3b61ad231d31267842fcc73d7e6fd Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Mon, 20 Dec 2021 12:56:12 +0200 Subject: [PATCH 2/4] mlxsw: core: Convert a series of if statements to switch case Convert a series of if statements that handle different events to a switch case statement. Encapsulate the per-event code in different functions to simplify the code. This is a preparation for subsequent patches that will add more events that need to be handled. Signed-off-by: Danielle Ratson Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 58 +++++++++++++++------- 1 file changed, 40 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index e1d4056f9eea..d9f12d9cd0ff 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1708,6 +1708,39 @@ static void mlxsw_core_health_listener_func(const struct mlxsw_reg_info *reg, static const struct mlxsw_listener mlxsw_core_health_listener = MLXSW_EVENTL(mlxsw_core_health_listener_func, MFDE, MFDE); +static int +mlxsw_core_health_fw_fatal_dump_kvd_im_stop(const char *mfde_pl, + struct devlink_fmsg *fmsg) +{ + u32 val; + + val = mlxsw_reg_mfde_kvd_im_stop_pipes_mask_get(mfde_pl); + return devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val); +} + +static int +mlxsw_core_health_fw_fatal_dump_crspace_to(const char *mfde_pl, + struct devlink_fmsg *fmsg) +{ + u32 val; + int err; + + val = mlxsw_reg_mfde_crspace_to_log_address_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val); + if (err) + return err; + val = mlxsw_reg_mfde_crspace_to_log_id_get(mfde_pl); + err = devlink_fmsg_u8_pair_put(fmsg, "log_irisc_id", val); + if (err) + return err; + val = mlxsw_reg_mfde_crspace_to_log_ip_get(mfde_pl); + err = devlink_fmsg_u64_pair_put(fmsg, "log_ip", val); + if (err) + return err; + + return 0; +} + static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg, void *priv_ctx, struct netlink_ext_ack *extack) @@ -1800,24 +1833,13 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor if (err) return err; - if (event_id == MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO) { - val = mlxsw_reg_mfde_crspace_to_log_address_get(mfde_pl); - err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val); - if (err) - return err; - val = mlxsw_reg_mfde_crspace_to_log_id_get(mfde_pl); - err = devlink_fmsg_u8_pair_put(fmsg, "log_irisc_id", val); - if (err) - return err; - val = mlxsw_reg_mfde_crspace_to_log_ip_get(mfde_pl); - err = devlink_fmsg_u64_pair_put(fmsg, "log_ip", val); - if (err) - return err; - } else if (event_id == MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP) { - val = mlxsw_reg_mfde_kvd_im_stop_pipes_mask_get(mfde_pl); - err = devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val); - if (err) - return err; + switch (event_id) { + case MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO: + return mlxsw_core_health_fw_fatal_dump_crspace_to(mfde_pl, + fmsg); + case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP: + return mlxsw_core_health_fw_fatal_dump_kvd_im_stop(mfde_pl, + fmsg); } return 0; From e25c060c5f24b19d2596445c6d0cf40da6f13cc1 Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Mon, 20 Dec 2021 12:56:13 +0200 Subject: [PATCH 3/4] mlxsw: reg: Extend MFDE register with new events and parameters Extend the Monitoring Firmware Debug (MFDE) register with new events and their related parameters. These events will be utilized by devlink-health in the next patch. Signed-off-by: Danielle Ratson Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/reg.h | 105 +++++++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/reg.h b/drivers/net/ethernet/mellanox/mlxsw/reg.h index ed0767cc71c2..c97d2c744725 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/reg.h +++ b/drivers/net/ethernet/mellanox/mlxsw/reg.h @@ -11318,7 +11318,7 @@ mlxsw_reg_mgpir_unpack(char *payload, u8 *num_of_devices, * ----------------------------------- */ #define MLXSW_REG_MFDE_ID 0x9200 -#define MLXSW_REG_MFDE_LEN 0x18 +#define MLXSW_REG_MFDE_LEN 0x30 MLXSW_REG_DEFINE(mfde, MLXSW_REG_MFDE_ID, MLXSW_REG_MFDE_LEN); @@ -11328,10 +11328,32 @@ MLXSW_REG_DEFINE(mfde, MLXSW_REG_MFDE_ID, MLXSW_REG_MFDE_LEN); */ MLXSW_ITEM32(reg, mfde, irisc_id, 0x00, 24, 8); +enum mlxsw_reg_mfde_severity { + /* Unrecoverable switch behavior */ + MLXSW_REG_MFDE_SEVERITY_FATL = 2, + /* Unexpected state with possible systemic failure */ + MLXSW_REG_MFDE_SEVERITY_NRML = 3, + /* Unexpected state without systemic failure */ + MLXSW_REG_MFDE_SEVERITY_INTR = 5, +}; + +/* reg_mfde_severity + * The severity of the event. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, severity, 0x00, 16, 8); + enum mlxsw_reg_mfde_event_id { + /* CRspace timeout */ MLXSW_REG_MFDE_EVENT_ID_CRSPACE_TO = 1, /* KVD insertion machine stopped */ MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP, + /* Triggered by MFGD.trigger_test */ + MLXSW_REG_MFDE_EVENT_ID_TEST, + /* Triggered when firmware hits an assert */ + MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT, + /* Fatal error interrupt from hardware */ + MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE, }; /* reg_mfde_event_id @@ -11378,6 +11400,13 @@ MLXSW_ITEM32(reg, mfde, reg_attr_id, 0x04, 0, 16); */ MLXSW_ITEM32(reg, mfde, crspace_to_log_address, 0x10, 0, 32); +/* reg_mfde_crspace_to_oe + * 0 - New event + * 1 - Old event, occurred before MFGD activation. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, crspace_to_oe, 0x14, 24, 1); + /* reg_mfde_crspace_to_log_id * Which irisc triggered the timeout. * Access: RO @@ -11390,12 +11419,86 @@ MLXSW_ITEM32(reg, mfde, crspace_to_log_id, 0x14, 0, 4); */ MLXSW_ITEM64(reg, mfde, crspace_to_log_ip, 0x18, 0, 64); +/* reg_mfde_kvd_im_stop_oe + * 0 - New event + * 1 - Old event, occurred before MFGD activation. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, kvd_im_stop_oe, 0x10, 24, 1); + /* reg_mfde_kvd_im_stop_pipes_mask * Bit per kvh pipe. * Access: RO */ MLXSW_ITEM32(reg, mfde, kvd_im_stop_pipes_mask, 0x10, 0, 16); +/* reg_mfde_fw_assert_var0-4 + * Variables passed to assert. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fw_assert_var0, 0x10, 0, 32); +MLXSW_ITEM32(reg, mfde, fw_assert_var1, 0x14, 0, 32); +MLXSW_ITEM32(reg, mfde, fw_assert_var2, 0x18, 0, 32); +MLXSW_ITEM32(reg, mfde, fw_assert_var3, 0x1C, 0, 32); +MLXSW_ITEM32(reg, mfde, fw_assert_var4, 0x20, 0, 32); + +/* reg_mfde_fw_assert_existptr + * The instruction pointer when assert was triggered. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fw_assert_existptr, 0x24, 0, 32); + +/* reg_mfde_fw_assert_callra + * The return address after triggering assert. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fw_assert_callra, 0x28, 0, 32); + +/* reg_mfde_fw_assert_oe + * 0 - New event + * 1 - Old event, occurred before MFGD activation. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fw_assert_oe, 0x2C, 24, 1); + +/* reg_mfde_fw_assert_tile_v + * 0: The assert was from main + * 1: The assert was from a tile + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fw_assert_tile_v, 0x2C, 23, 1); + +/* reg_mfde_fw_assert_tile_index + * When tile_v=1, the tile_index that caused the assert. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fw_assert_tile_index, 0x2C, 16, 6); + +/* reg_mfde_fw_assert_ext_synd + * A generated one-to-one identifier which is specific per-assert. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fw_assert_ext_synd, 0x2C, 0, 16); + +/* reg_mfde_fatal_cause_id + * HW interrupt cause id. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fatal_cause_id, 0x10, 0, 18); + +/* reg_mfde_fatal_cause_tile_v + * 0: The assert was from main + * 1: The assert was from a tile + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fatal_cause_tile_v, 0x14, 23, 1); + +/* reg_mfde_fatal_cause_tile_index + * When tile_v=1, the tile_index that caused the assert. + * Access: RO + */ +MLXSW_ITEM32(reg, mfde, fatal_cause_tile_index, 0x14, 16, 6); + /* TNGCR - Tunneling NVE General Configuration Register * ---------------------------------------------------- * The TNGCR register is used for setting up the NVE Tunneling configuration. From 239cdd3f4cb0fe87da95e9a64a41644ee5a744ec Mon Sep 17 00:00:00 2001 From: Danielle Ratson Date: Mon, 20 Dec 2021 12:56:14 +0200 Subject: [PATCH 4/4] mlxsw: core: Extend devlink health reporter with new events and parameters Extend the devlink health reporter registered by mlxsw to report new health events and their related parameters. These are meant to aid in debugging of hardware / firmware issues. Beside the test event ('MLXSW_REG_MFDE_EVENT_ID_TEST') that is triggered following the devlink health 'test' sub-command, the new events are used to report the triggering of asserts in firmware code ('MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT') and hardware issues ('MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE'). Each event is accompanied with a severity parameter and per-event parameters that are meant to help root cause the detected issue. Signed-off-by: Danielle Ratson Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 131 +++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index d9f12d9cd0ff..866b9357939b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1708,12 +1708,93 @@ static void mlxsw_core_health_listener_func(const struct mlxsw_reg_info *reg, static const struct mlxsw_listener mlxsw_core_health_listener = MLXSW_EVENTL(mlxsw_core_health_listener_func, MFDE, MFDE); +static int +mlxsw_core_health_fw_fatal_dump_fatal_cause(const char *mfde_pl, + struct devlink_fmsg *fmsg) +{ + u32 val, tile_v; + int err; + + val = mlxsw_reg_mfde_fatal_cause_id_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "cause_id", val); + if (err) + return err; + tile_v = mlxsw_reg_mfde_fatal_cause_tile_v_get(mfde_pl); + if (tile_v) { + val = mlxsw_reg_mfde_fatal_cause_tile_index_get(mfde_pl); + err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", val); + if (err) + return err; + } + + return 0; +} + +static int +mlxsw_core_health_fw_fatal_dump_fw_assert(const char *mfde_pl, + struct devlink_fmsg *fmsg) +{ + u32 val, tile_v; + int err; + + val = mlxsw_reg_mfde_fw_assert_var0_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "var0", val); + if (err) + return err; + val = mlxsw_reg_mfde_fw_assert_var1_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "var1", val); + if (err) + return err; + val = mlxsw_reg_mfde_fw_assert_var2_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "var2", val); + if (err) + return err; + val = mlxsw_reg_mfde_fw_assert_var3_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "var3", val); + if (err) + return err; + val = mlxsw_reg_mfde_fw_assert_var4_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "var4", val); + if (err) + return err; + val = mlxsw_reg_mfde_fw_assert_existptr_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "existptr", val); + if (err) + return err; + val = mlxsw_reg_mfde_fw_assert_callra_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "callra", val); + if (err) + return err; + val = mlxsw_reg_mfde_fw_assert_oe_get(mfde_pl); + err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val); + if (err) + return err; + tile_v = mlxsw_reg_mfde_fw_assert_tile_v_get(mfde_pl); + if (tile_v) { + val = mlxsw_reg_mfde_fw_assert_tile_index_get(mfde_pl); + err = devlink_fmsg_u8_pair_put(fmsg, "tile_index", val); + if (err) + return err; + } + val = mlxsw_reg_mfde_fw_assert_ext_synd_get(mfde_pl); + err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd", val); + if (err) + return err; + + return 0; +} + static int mlxsw_core_health_fw_fatal_dump_kvd_im_stop(const char *mfde_pl, struct devlink_fmsg *fmsg) { u32 val; + int err; + val = mlxsw_reg_mfde_kvd_im_stop_oe_get(mfde_pl); + err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val); + if (err) + return err; val = mlxsw_reg_mfde_kvd_im_stop_pipes_mask_get(mfde_pl); return devlink_fmsg_u32_pair_put(fmsg, "pipes_mask", val); } @@ -1727,6 +1808,10 @@ mlxsw_core_health_fw_fatal_dump_crspace_to(const char *mfde_pl, val = mlxsw_reg_mfde_crspace_to_log_address_get(mfde_pl); err = devlink_fmsg_u32_pair_put(fmsg, "log_address", val); + if (err) + return err; + val = mlxsw_reg_mfde_crspace_to_oe_get(mfde_pl); + err = devlink_fmsg_bool_pair_put(fmsg, "old_event", val); if (err) return err; val = mlxsw_reg_mfde_crspace_to_log_id_get(mfde_pl); @@ -1774,6 +1859,15 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP: val_str = "KVD insertion machine stopped"; break; + case MLXSW_REG_MFDE_EVENT_ID_TEST: + val_str = "Test"; + break; + case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT: + val_str = "FW assert"; + break; + case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE: + val_str = "Fatal cause"; + break; default: val_str = NULL; } @@ -1782,6 +1876,38 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor if (err) return err; } + + err = devlink_fmsg_arr_pair_nest_end(fmsg); + if (err) + return err; + + err = devlink_fmsg_arr_pair_nest_start(fmsg, "severity"); + if (err) + return err; + + val = mlxsw_reg_mfde_severity_get(mfde_pl); + err = devlink_fmsg_u8_pair_put(fmsg, "id", val); + if (err) + return err; + switch (val) { + case MLXSW_REG_MFDE_SEVERITY_FATL: + val_str = "Fatal"; + break; + case MLXSW_REG_MFDE_SEVERITY_NRML: + val_str = "Normal"; + break; + case MLXSW_REG_MFDE_SEVERITY_INTR: + val_str = "Debug"; + break; + default: + val_str = NULL; + } + if (val_str) { + err = devlink_fmsg_string_pair_put(fmsg, "desc", val_str); + if (err) + return err; + } + err = devlink_fmsg_arr_pair_nest_end(fmsg); if (err) return err; @@ -1840,6 +1966,11 @@ static int mlxsw_core_health_fw_fatal_dump(struct devlink_health_reporter *repor case MLXSW_REG_MFDE_EVENT_ID_KVD_IM_STOP: return mlxsw_core_health_fw_fatal_dump_kvd_im_stop(mfde_pl, fmsg); + case MLXSW_REG_MFDE_EVENT_ID_FW_ASSERT: + return mlxsw_core_health_fw_fatal_dump_fw_assert(mfde_pl, fmsg); + case MLXSW_REG_MFDE_EVENT_ID_FATAL_CAUSE: + return mlxsw_core_health_fw_fatal_dump_fatal_cause(mfde_pl, + fmsg); } return 0;