net/mlx5: Fix fatal error handling during device load
Currently, if a fatal error occurs during mlx5_load_one(), we cannot
enter the error state until mlx5_load_one() has finished. This can take
several minutes while the outstanding commands wait to time out, because
those commands cannot be processed due to the fatal error.
Fix it by setting dev->state to MLX5_DEVICE_STATE_INTERNAL_ERROR before
requesting the lock (dev->intf_state_mutex).
Fixes: c1d4d2e92a ("net/mlx5: Avoid calling sleeping function by the health poll thread")
Signed-off-by: Shay Drory <shayd@mellanox.com>
Reviewed-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
This commit is contained in:
parent
42ea9f1b5c
commit
b6e0b6bebe
@ -192,15 +192,23 @@ static bool reset_fw_if_needed(struct mlx5_core_dev *dev)
|
||||
|
||||
void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
|
||||
{
|
||||
bool err_detected = false;
|
||||
|
||||
/* Mark the device as fatal in order to abort FW commands */
|
||||
if ((check_fatal_sensors(dev) || force) &&
|
||||
dev->state == MLX5_DEVICE_STATE_UP) {
|
||||
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||
err_detected = true;
|
||||
}
|
||||
mutex_lock(&dev->intf_state_mutex);
|
||||
if (dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
|
||||
goto unlock;
|
||||
if (!err_detected && dev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
|
||||
goto unlock;/* a previous error is still being handled */
|
||||
if (dev->state == MLX5_DEVICE_STATE_UNINITIALIZED) {
|
||||
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if (check_fatal_sensors(dev) || force) {
|
||||
if (check_fatal_sensors(dev) || force) { /* protected state setting */
|
||||
dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
|
||||
mlx5_cmd_flush(dev);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user