linux/drivers/net/ethernet/intel/ice/ice_idc.c
Ivan Vecera 486b9eee57 ice: Fix race during aux device (un)plugging
Function ice_plug_aux_dev() assigns pf->adev field too early prior
aux device initialization and on other side ice_unplug_aux_dev()
starts aux device deinit and at the end assigns NULL to pf->adev.
This is wrong because pf->adev should always be non-NULL only when
aux device is fully initialized and ready. This wrong order causes
a crash when ice_send_event_to_aux() call occurs because that function
depends on non-NULL value of pf->adev and does not assume that
aux device is half-initialized or half-destroyed.
After order correction the race window is tiny but it is still there,
as Leon mentioned and manipulation with pf->adev needs to be protected
by mutex.

Fix (un-)plugging functions so pf->adev field is set after aux device
init and prior aux device destroy and protect pf->adev assignment by
new mutex. This mutex is also held during ice_send_event_to_aux()
call to ensure that aux device is valid during that call.
Note that device lock used ice_send_event_to_aux() needs to be kept
to avoid race with aux drv unload.

Reproducer:
cycle=1
while :;do
        echo "#### Cycle: $cycle"

        ip link set ens7f0 mtu 9000
        ip link add bond0 type bond mode 1 miimon 100
        ip link set bond0 up
        ifenslave bond0 ens7f0
        ip link set bond0 mtu 9000
        ethtool -L ens7f0 combined 1
        ip link del bond0
        ip link set ens7f0 mtu 1500
        sleep 1

        let cycle++
done

In short when the device is added/removed to/from bond the aux device
is unplugged/plugged. When MTU of the device is changed an event is
sent to aux device asynchronously. This can race with (un)plugging
operation and because pf->adev is set too early (plug) or too late
(unplug) the function ice_send_event_to_aux() can touch uninitialized
or destroyed fields. In the case of crash below pf->adev->dev.mutex.

Crash:
[   53.372066] bond0: (slave ens7f0): making interface the new active one
[   53.378622] bond0: (slave ens7f0): Enslaving as an active interface with an u
p link
[   53.386294] IPv6: ADDRCONF(NETDEV_CHANGE): bond0: link becomes ready
[   53.549104] bond0: (slave ens7f1): Enslaving as a backup interface with an up
 link
[   54.118906] ice 0000:ca:00.0 ens7f0: Number of in use tx queues changed inval
idating tc mappings. Priority traffic classification disabled!
[   54.233374] ice 0000:ca:00.1 ens7f1: Number of in use tx queues changed inval
idating tc mappings. Priority traffic classification disabled!
[   54.248204] bond0: (slave ens7f0): Releasing backup interface
[   54.253955] bond0: (slave ens7f1): making interface the new active one
[   54.274875] bond0: (slave ens7f1): Releasing backup interface
[   54.289153] bond0 (unregistering): Released all slaves
[   55.383179] MII link monitoring set to 100 ms
[   55.398696] bond0: (slave ens7f0): making interface the new active one
[   55.405241] BUG: kernel NULL pointer dereference, address: 0000000000000080
[   55.405289] bond0: (slave ens7f0): Enslaving as an active interface with an u
p link
[   55.412198] #PF: supervisor write access in kernel mode
[   55.412200] #PF: error_code(0x0002) - not-present page
[   55.412201] PGD 25d2ad067 P4D 0
[   55.412204] Oops: 0002 [#1] PREEMPT SMP NOPTI
[   55.412207] CPU: 0 PID: 403 Comm: kworker/0:2 Kdump: loaded Tainted: G S
           5.17.0-13579-g57f2d6540f03 #1
[   55.429094] bond0: (slave ens7f1): Enslaving as a backup interface with an up
 link
[   55.430224] Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.4.4 10/07/
2021
[   55.430226] Workqueue: ice ice_service_task [ice]
[   55.468169] RIP: 0010:mutex_unlock+0x10/0x20
[   55.472439] Code: 0f b1 13 74 96 eb e0 4c 89 ee eb d8 e8 79 54 ff ff 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 65 48 8b 04 25 40 ef 01 00 31 d2 <f0> 48 0f b1 17 75 01 c3 e9 e3 fe ff ff 0f 1f 00 0f 1f 44 00 00 48
[   55.491186] RSP: 0018:ff4454230d7d7e28 EFLAGS: 00010246
[   55.496413] RAX: ff1a79b208b08000 RBX: ff1a79b2182e8880 RCX: 0000000000000001
[   55.503545] RDX: 0000000000000000 RSI: ff4454230d7d7db0 RDI: 0000000000000080
[   55.510678] RBP: ff1a79d1c7e48b68 R08: ff4454230d7d7db0 R09: 0000000000000041
[   55.517812] R10: 00000000000000a5 R11: 00000000000006e6 R12: ff1a79d1c7e48bc0
[   55.524945] R13: 0000000000000000 R14: ff1a79d0ffc305c0 R15: 0000000000000000
[   55.532076] FS:  0000000000000000(0000) GS:ff1a79d0ffc00000(0000) knlGS:0000000000000000
[   55.540163] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   55.545908] CR2: 0000000000000080 CR3: 00000003487ae003 CR4: 0000000000771ef0
[   55.553041] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   55.560173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   55.567305] PKRU: 55555554
[   55.570018] Call Trace:
[   55.572474]  <TASK>
[   55.574579]  ice_service_task+0xaab/0xef0 [ice]
[   55.579130]  process_one_work+0x1c5/0x390
[   55.583141]  ? process_one_work+0x390/0x390
[   55.587326]  worker_thread+0x30/0x360
[   55.590994]  ? process_one_work+0x390/0x390
[   55.595180]  kthread+0xe6/0x110
[   55.598325]  ? kthread_complete_and_exit+0x20/0x20
[   55.603116]  ret_from_fork+0x1f/0x30
[   55.606698]  </TASK>

Fixes: f9f5301e7e ("ice: Register auxiliary device to provide RDMA")
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Dave Ertman <david.m.ertman@intel.com>
Tested-by: Gurucharan <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
2022-05-06 10:09:24 -07:00

358 lines
7.7 KiB
C

// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2021, Intel Corporation. */
/* Inter-Driver Communication */
#include "ice.h"
#include "ice_lib.h"
#include "ice_dcb_lib.h"
/**
* ice_get_auxiliary_drv - retrieve iidc_auxiliary_drv struct
* @pf: pointer to PF struct
*
* This function has to be called with a device_lock on the
* pf->adev.dev to avoid race conditions.
*/
static struct iidc_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf)
{
struct auxiliary_device *adev;
adev = pf->adev;
if (!adev || !adev->dev.driver)
return NULL;
return container_of(adev->dev.driver, struct iidc_auxiliary_drv,
adrv.driver);
}
/**
* ice_send_event_to_aux - send event to RDMA AUX driver
* @pf: pointer to PF struct
* @event: event struct
*/
void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_event *event)
{
struct iidc_auxiliary_drv *iadrv;
if (WARN_ON_ONCE(!in_task()))
return;
mutex_lock(&pf->adev_mutex);
if (!pf->adev)
goto finish;
device_lock(&pf->adev->dev);
iadrv = ice_get_auxiliary_drv(pf);
if (iadrv && iadrv->event_handler)
iadrv->event_handler(pf, event);
device_unlock(&pf->adev->dev);
finish:
mutex_unlock(&pf->adev_mutex);
}
/**
* ice_find_vsi - Find the VSI from VSI ID
* @pf: The PF pointer to search in
* @vsi_num: The VSI ID to search for
*/
static struct ice_vsi *ice_find_vsi(struct ice_pf *pf, u16 vsi_num)
{
int i;
ice_for_each_vsi(pf, i)
if (pf->vsi[i] && pf->vsi[i]->vsi_num == vsi_num)
return pf->vsi[i];
return NULL;
}
/**
* ice_add_rdma_qset - Add Leaf Node for RDMA Qset
* @pf: PF struct
* @qset: Resource to be allocated
*/
int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset)
{
u16 max_rdmaqs[ICE_MAX_TRAFFIC_CLASS];
struct ice_vsi *vsi;
struct device *dev;
u32 qset_teid;
u16 qs_handle;
int status;
int i;
if (WARN_ON(!pf || !qset))
return -EINVAL;
dev = ice_pf_to_dev(pf);
if (!ice_is_rdma_ena(pf))
return -EINVAL;
vsi = ice_get_main_vsi(pf);
if (!vsi) {
dev_err(dev, "RDMA QSet invalid VSI\n");
return -EINVAL;
}
ice_for_each_traffic_class(i)
max_rdmaqs[i] = 0;
max_rdmaqs[qset->tc]++;
qs_handle = qset->qs_handle;
status = ice_cfg_vsi_rdma(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc,
max_rdmaqs);
if (status) {
dev_err(dev, "Failed VSI RDMA Qset config\n");
return status;
}
status = ice_ena_vsi_rdma_qset(vsi->port_info, vsi->idx, qset->tc,
&qs_handle, 1, &qset_teid);
if (status) {
dev_err(dev, "Failed VSI RDMA Qset enable\n");
return status;
}
vsi->qset_handle[qset->tc] = qset->qs_handle;
qset->teid = qset_teid;
return 0;
}
EXPORT_SYMBOL_GPL(ice_add_rdma_qset);
/**
* ice_del_rdma_qset - Delete leaf node for RDMA Qset
* @pf: PF struct
* @qset: Resource to be freed
*/
int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset)
{
struct ice_vsi *vsi;
u32 teid;
u16 q_id;
if (WARN_ON(!pf || !qset))
return -EINVAL;
vsi = ice_find_vsi(pf, qset->vport_id);
if (!vsi) {
dev_err(ice_pf_to_dev(pf), "RDMA Invalid VSI\n");
return -EINVAL;
}
q_id = qset->qs_handle;
teid = qset->teid;
vsi->qset_handle[qset->tc] = 0;
return ice_dis_vsi_rdma_qset(vsi->port_info, 1, &teid, &q_id);
}
EXPORT_SYMBOL_GPL(ice_del_rdma_qset);
/**
* ice_rdma_request_reset - accept request from RDMA to perform a reset
* @pf: struct for PF
* @reset_type: type of reset
*/
int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type)
{
enum ice_reset_req reset;
if (WARN_ON(!pf))
return -EINVAL;
switch (reset_type) {
case IIDC_PFR:
reset = ICE_RESET_PFR;
break;
case IIDC_CORER:
reset = ICE_RESET_CORER;
break;
case IIDC_GLOBR:
reset = ICE_RESET_GLOBR;
break;
default:
dev_err(ice_pf_to_dev(pf), "incorrect reset request\n");
return -EINVAL;
}
return ice_schedule_reset(pf, reset);
}
EXPORT_SYMBOL_GPL(ice_rdma_request_reset);
/**
* ice_rdma_update_vsi_filter - update main VSI filters for RDMA
* @pf: pointer to struct for PF
* @vsi_id: VSI HW idx to update filter on
* @enable: bool whether to enable or disable filters
*/
int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable)
{
struct ice_vsi *vsi;
int status;
if (WARN_ON(!pf))
return -EINVAL;
vsi = ice_find_vsi(pf, vsi_id);
if (!vsi)
return -EINVAL;
status = ice_cfg_rdma_fltr(&pf->hw, vsi->idx, enable);
if (status) {
dev_err(ice_pf_to_dev(pf), "Failed to %sable RDMA filtering\n",
enable ? "en" : "dis");
} else {
if (enable)
vsi->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN;
else
vsi->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN;
}
return status;
}
EXPORT_SYMBOL_GPL(ice_rdma_update_vsi_filter);
/**
* ice_get_qos_params - parse QoS params for RDMA consumption
* @pf: pointer to PF struct
* @qos: set of QoS values
*/
void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos)
{
struct ice_dcbx_cfg *dcbx_cfg;
unsigned int i;
u32 up2tc;
dcbx_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
up2tc = rd32(&pf->hw, PRTDCB_TUP2TC);
qos->num_tc = ice_dcb_get_num_tc(dcbx_cfg);
for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++)
qos->up2tc[i] = (up2tc >> (i * 3)) & 0x7;
for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
qos->tc_info[i].rel_bw = dcbx_cfg->etscfg.tcbwtable[i];
qos->pfc_mode = dcbx_cfg->pfc_mode;
if (qos->pfc_mode == IIDC_DSCP_PFC_MODE)
for (i = 0; i < IIDC_MAX_DSCP_MAPPING; i++)
qos->dscp_map[i] = dcbx_cfg->dscp_map[i];
}
EXPORT_SYMBOL_GPL(ice_get_qos_params);
/**
* ice_reserve_rdma_qvector - Reserve vector resources for RDMA driver
* @pf: board private structure to initialize
*/
static int ice_reserve_rdma_qvector(struct ice_pf *pf)
{
if (ice_is_rdma_ena(pf)) {
int index;
index = ice_get_res(pf, pf->irq_tracker, pf->num_rdma_msix,
ICE_RES_RDMA_VEC_ID);
if (index < 0)
return index;
pf->num_avail_sw_msix -= pf->num_rdma_msix;
pf->rdma_base_vector = (u16)index;
}
return 0;
}
/**
* ice_adev_release - function to be mapped to AUX dev's release op
* @dev: pointer to device to free
*/
static void ice_adev_release(struct device *dev)
{
struct iidc_auxiliary_dev *iadev;
iadev = container_of(dev, struct iidc_auxiliary_dev, adev.dev);
kfree(iadev);
}
/**
* ice_plug_aux_dev - allocate and register AUX device
* @pf: pointer to pf struct
*/
int ice_plug_aux_dev(struct ice_pf *pf)
{
struct iidc_auxiliary_dev *iadev;
struct auxiliary_device *adev;
int ret;
/* if this PF doesn't support a technology that requires auxiliary
* devices, then gracefully exit
*/
if (!ice_is_rdma_ena(pf))
return 0;
iadev = kzalloc(sizeof(*iadev), GFP_KERNEL);
if (!iadev)
return -ENOMEM;
adev = &iadev->adev;
iadev->pf = pf;
adev->id = pf->aux_idx;
adev->dev.release = ice_adev_release;
adev->dev.parent = &pf->pdev->dev;
adev->name = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? "roce" : "iwarp";
ret = auxiliary_device_init(adev);
if (ret) {
kfree(iadev);
return ret;
}
ret = auxiliary_device_add(adev);
if (ret) {
auxiliary_device_uninit(adev);
return ret;
}
mutex_lock(&pf->adev_mutex);
pf->adev = adev;
mutex_unlock(&pf->adev_mutex);
return 0;
}
/* ice_unplug_aux_dev - unregister and free AUX device
* @pf: pointer to pf struct
*/
void ice_unplug_aux_dev(struct ice_pf *pf)
{
struct auxiliary_device *adev;
mutex_lock(&pf->adev_mutex);
adev = pf->adev;
pf->adev = NULL;
mutex_unlock(&pf->adev_mutex);
if (adev) {
auxiliary_device_delete(adev);
auxiliary_device_uninit(adev);
}
}
/**
* ice_init_rdma - initializes PF for RDMA use
* @pf: ptr to ice_pf
*/
int ice_init_rdma(struct ice_pf *pf)
{
struct device *dev = &pf->pdev->dev;
int ret;
/* Reserve vector resources */
ret = ice_reserve_rdma_qvector(pf);
if (ret < 0) {
dev_err(dev, "failed to reserve vectors for RDMA\n");
return ret;
}
pf->rdma_mode |= IIDC_RDMA_PROTOCOL_ROCEV2;
return ice_plug_aux_dev(pf);
}