diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 2cb4094c9c49..2d00022c92d8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -64,6 +64,10 @@ enum { MLX5_NIC_IFC_NO_DRAM_NIC = 2 }; +enum { + MLX5_DROP_NEW_HEALTH_WORK, +}; + static u8 get_nic_interface(struct mlx5_core_dev *dev) { return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 3; @@ -272,7 +276,13 @@ static void poll_health(unsigned long data) if (in_fatal(dev) && !health->sick) { health->sick = true; print_health_info(dev); - schedule_work(&health->work); + spin_lock(&health->wq_lock); + if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags)) + queue_work(health->wq, &health->work); + else + dev_err(&dev->pdev->dev, + "new health works are not permitted at this stage\n"); + spin_unlock(&health->wq_lock); } } @@ -282,6 +292,7 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev) init_timer(&health->timer); health->sick = 0; + clear_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); health->health = &dev->iseg->health; health->health_counter = &dev->iseg->health_counter; @@ -298,11 +309,21 @@ void mlx5_stop_health_poll(struct mlx5_core_dev *dev) del_timer_sync(&health->timer); } +void mlx5_drain_health_wq(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + + spin_lock(&health->wq_lock); + set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags); + spin_unlock(&health->wq_lock); + cancel_work_sync(&health->work); +} + void mlx5_health_cleanup(struct mlx5_core_dev *dev) { struct mlx5_core_health *health = &dev->priv.health; - flush_work(&health->work); + destroy_workqueue(health->wq); } int mlx5_health_init(struct mlx5_core_dev *dev) @@ -317,8 +338,11 @@ int mlx5_health_init(struct mlx5_core_dev *dev) strcpy(name, "mlx5_health"); strcat(name, dev_name(&dev->pdev->dev)); + health->wq = create_singlethread_workqueue(name); kfree(name); - + if (!health->wq) + return -ENOMEM; + spin_lock_init(&health->wq_lock); INIT_WORK(&health->work, health_care); return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index 8a63910bfccf..9f90226bc120 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1315,8 +1315,13 @@ static pci_ers_result_t mlx5_pci_err_detected(struct pci_dev *pdev, dev_info(&pdev->dev, "%s was called\n", __func__); mlx5_enter_error_state(dev); mlx5_unload_one(dev, priv, false); - pci_save_state(pdev); - mlx5_pci_disable_device(dev); + /* In case of kernel call save the pci state and drain health wq */ + if (state) { + pci_save_state(pdev); + mlx5_drain_health_wq(dev); + mlx5_pci_disable_device(dev); + } + return state == pci_channel_io_perm_failure ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_NEED_RESET; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5dbda60a09f4..7d9a5d08eb59 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -418,7 +418,10 @@ struct mlx5_core_health { u32 prev; int miss_counter; bool sick; + /* wq spinlock to synchronize draining */ + spinlock_t wq_lock; struct workqueue_struct *wq; + unsigned long flags; struct work_struct work; }; @@ -778,6 +781,7 @@ void mlx5_health_cleanup(struct mlx5_core_dev *dev); int mlx5_health_init(struct mlx5_core_dev *dev); void mlx5_start_health_poll(struct mlx5_core_dev *dev); void mlx5_stop_health_poll(struct mlx5_core_dev *dev); +void mlx5_drain_health_wq(struct mlx5_core_dev *dev); int mlx5_buf_alloc_node(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf, int node); int mlx5_buf_alloc(struct mlx5_core_dev *dev, int size, struct mlx5_buf *buf);