net/mlx5: Report devlink health on FW issues
Use devlink_health_report() to report any symptom of a FW issue, such as a FW counter miss or a new health syndrome. FW issues are detected in mlx5 during poll_health(), which is called in timer (atomic) context, so the health work queue is used to schedule the reports. Signed-off-by: Moshe Shemesh <moshe@mellanox.com> Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com> Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
This commit is contained in:
parent
fd1483fe1f
commit
d1bf0e2cc4
@ -515,6 +515,29 @@ mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
|
|||||||
return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg);
|
return mlx5_fw_tracer_get_saved_traces_objects(dev->tracer, fmsg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void mlx5_fw_reporter_err_work(struct work_struct *work)
|
||||||
|
{
|
||||||
|
struct mlx5_fw_reporter_ctx fw_reporter_ctx;
|
||||||
|
struct mlx5_core_health *health;
|
||||||
|
|
||||||
|
health = container_of(work, struct mlx5_core_health, report_work);
|
||||||
|
|
||||||
|
if (IS_ERR_OR_NULL(health->fw_reporter))
|
||||||
|
return;
|
||||||
|
|
||||||
|
fw_reporter_ctx.err_synd = health->synd;
|
||||||
|
fw_reporter_ctx.miss_counter = health->miss_counter;
|
||||||
|
if (fw_reporter_ctx.err_synd) {
|
||||||
|
devlink_health_report(health->fw_reporter,
|
||||||
|
"FW syndrom reported", &fw_reporter_ctx);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (fw_reporter_ctx.miss_counter)
|
||||||
|
devlink_health_report(health->fw_reporter,
|
||||||
|
"FW miss counter reported",
|
||||||
|
&fw_reporter_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
|
static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
|
||||||
.name = "fw",
|
.name = "fw",
|
||||||
.diagnose = mlx5_fw_reporter_diagnose,
|
.diagnose = mlx5_fw_reporter_diagnose,
|
||||||
@ -572,7 +595,9 @@ static void poll_health(struct timer_list *t)
|
|||||||
{
|
{
|
||||||
struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
|
struct mlx5_core_dev *dev = from_timer(dev, t, priv.health.timer);
|
||||||
struct mlx5_core_health *health = &dev->priv.health;
|
struct mlx5_core_health *health = &dev->priv.health;
|
||||||
|
struct health_buffer __iomem *h = health->health;
|
||||||
u32 fatal_error;
|
u32 fatal_error;
|
||||||
|
u8 prev_synd;
|
||||||
u32 count;
|
u32 count;
|
||||||
|
|
||||||
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
|
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
|
||||||
@ -588,8 +613,14 @@ static void poll_health(struct timer_list *t)
|
|||||||
if (health->miss_counter == MAX_MISSES) {
|
if (health->miss_counter == MAX_MISSES) {
|
||||||
mlx5_core_err(dev, "device's health compromised - reached miss count\n");
|
mlx5_core_err(dev, "device's health compromised - reached miss count\n");
|
||||||
print_health_info(dev);
|
print_health_info(dev);
|
||||||
|
queue_work(health->wq, &health->report_work);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
prev_synd = health->synd;
|
||||||
|
health->synd = ioread8(&h->synd);
|
||||||
|
if (health->synd && health->synd != prev_synd)
|
||||||
|
queue_work(health->wq, &health->report_work);
|
||||||
|
|
||||||
fatal_error = check_fatal_sensors(dev);
|
fatal_error = check_fatal_sensors(dev);
|
||||||
|
|
||||||
if (fatal_error && !health->fatal_error) {
|
if (fatal_error && !health->fatal_error) {
|
||||||
@ -639,6 +670,7 @@ void mlx5_drain_health_wq(struct mlx5_core_dev *dev)
|
|||||||
spin_lock_irqsave(&health->wq_lock, flags);
|
spin_lock_irqsave(&health->wq_lock, flags);
|
||||||
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
|
set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
|
||||||
spin_unlock_irqrestore(&health->wq_lock, flags);
|
spin_unlock_irqrestore(&health->wq_lock, flags);
|
||||||
|
cancel_work_sync(&health->report_work);
|
||||||
cancel_work_sync(&health->work);
|
cancel_work_sync(&health->work);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -675,6 +707,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
|
|||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
spin_lock_init(&health->wq_lock);
|
spin_lock_init(&health->wq_lock);
|
||||||
INIT_WORK(&health->work, health_care);
|
INIT_WORK(&health->work, health_care);
|
||||||
|
INIT_WORK(&health->report_work, mlx5_fw_reporter_err_work);
|
||||||
|
|
||||||
mlx5_fw_reporter_create(dev);
|
mlx5_fw_reporter_create(dev);
|
||||||
|
|
||||||
|
@ -435,7 +435,7 @@ struct mlx5_core_health {
|
|||||||
struct timer_list timer;
|
struct timer_list timer;
|
||||||
u32 prev;
|
u32 prev;
|
||||||
int miss_counter;
|
int miss_counter;
|
||||||
bool sick;
|
u8 synd;
|
||||||
u32 fatal_error;
|
u32 fatal_error;
|
||||||
u32 crdump_size;
|
u32 crdump_size;
|
||||||
/* wq spinlock to synchronize draining */
|
/* wq spinlock to synchronize draining */
|
||||||
@ -443,6 +443,7 @@ struct mlx5_core_health {
|
|||||||
struct workqueue_struct *wq;
|
struct workqueue_struct *wq;
|
||||||
unsigned long flags;
|
unsigned long flags;
|
||||||
struct work_struct work;
|
struct work_struct work;
|
||||||
|
struct work_struct report_work;
|
||||||
struct delayed_work recover_work;
|
struct delayed_work recover_work;
|
||||||
struct devlink_health_reporter *fw_reporter;
|
struct devlink_health_reporter *fw_reporter;
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user