Function ice_plug_aux_dev() assigns the pf->adev field too early, before
the aux device is initialized, while ice_unplug_aux_dev() starts the aux
device deinitialization and only assigns NULL to pf->adev at the end.
This is wrong because pf->adev should be non-NULL only when the aux
device is fully initialized and ready. This wrong order causes a crash
when ice_send_event_to_aux() is called, because that function depends
on a non-NULL value of pf->adev and does not expect the aux device to be
half-initialized or half-destroyed.
After correcting the order the race window is tiny, but it is still
there, as Leon mentioned, so manipulation of pf->adev needs to be
protected by a mutex.
Fix the (un-)plugging functions so that the pf->adev field is set after
aux device init and cleared prior to aux device destroy, and protect the
pf->adev assignment with a new mutex. This mutex is also held during the
ice_send_event_to_aux() call to ensure that the aux device is valid
during that call.
Note that the device lock used in ice_send_event_to_aux() needs to be
kept to avoid a race with aux driver unload.
Reproducer:
cycle=1
while :;do
echo "#### Cycle: $cycle"
ip link set ens7f0 mtu 9000
ip link add bond0 type bond mode 1 miimon 100
ip link set bond0 up
ifenslave bond0 ens7f0
ip link set bond0 mtu 9000
ethtool -L ens7f0 combined 1
ip link del bond0
ip link set ens7f0 mtu 1500
sleep 1
let cycle++
done
In short, when the device is added to or removed from a bond, the aux
device is unplugged or plugged. When the MTU of the device is changed,
an event is sent to the aux device asynchronously. This can race with
the (un)plugging operation, and because pf->adev is set too early (plug)
or cleared too late (unplug), the function ice_send_event_to_aux() can
touch uninitialized or destroyed fields. In the case of the crash below,
that field is pf->adev->dev.mutex.
Crash:
[ 53.372066] bond0: (slave ens7f0): making interface the new active one
[ 53.378622] bond0: (slave ens7f0): Enslaving as an active interface with an up link
[ 53.386294] IPv6: ADDRCONF(NETDEV_CHANGE): bond0: link becomes ready
[ 53.549104] bond0: (slave ens7f1): Enslaving as a backup interface with an up
link
[ 54.118906] ice 0000:ca:00.0 ens7f0: Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!
[ 54.233374] ice 0000:ca:00.1 ens7f1: Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!
[ 54.248204] bond0: (slave ens7f0): Releasing backup interface
[ 54.253955] bond0: (slave ens7f1): making interface the new active one
[ 54.274875] bond0: (slave ens7f1): Releasing backup interface
[ 54.289153] bond0 (unregistering): Released all slaves
[ 55.383179] MII link monitoring set to 100 ms
[ 55.398696] bond0: (slave ens7f0): making interface the new active one
[ 55.405241] BUG: kernel NULL pointer dereference, address: 0000000000000080
[ 55.405289] bond0: (slave ens7f0): Enslaving as an active interface with an up link
[ 55.412198] #PF: supervisor write access in kernel mode
[ 55.412200] #PF: error_code(0x0002) - not-present page
[ 55.412201] PGD 25d2ad067 P4D 0
[ 55.412204] Oops: 0002 [#1] PREEMPT SMP NOPTI
[ 55.412207] CPU: 0 PID: 403 Comm: kworker/0:2 Kdump: loaded Tainted: G S
5.17.0-13579-g57f2d6540f03 #1
[ 55.429094] bond0: (slave ens7f1): Enslaving as a backup interface with an up
link
[ 55.430224] Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.4.4 10/07/2021
[ 55.430226] Workqueue: ice ice_service_task [ice]
[ 55.468169] RIP: 0010:mutex_unlock+0x10/0x20
[ 55.472439] Code: 0f b1 13 74 96 eb e0 4c 89 ee eb d8 e8 79 54 ff ff 66 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 65 48 8b 04 25 40 ef 01 00 31 d2 <f0> 48 0f b1 17 75 01 c3 e9 e3 fe ff ff 0f 1f 00 0f 1f 44 00 00 48
[ 55.491186] RSP: 0018:ff4454230d7d7e28 EFLAGS: 00010246
[ 55.496413] RAX: ff1a79b208b08000 RBX: ff1a79b2182e8880 RCX: 0000000000000001
[ 55.503545] RDX: 0000000000000000 RSI: ff4454230d7d7db0 RDI: 0000000000000080
[ 55.510678] RBP: ff1a79d1c7e48b68 R08: ff4454230d7d7db0 R09: 0000000000000041
[ 55.517812] R10: 00000000000000a5 R11: 00000000000006e6 R12: ff1a79d1c7e48bc0
[ 55.524945] R13: 0000000000000000 R14: ff1a79d0ffc305c0 R15: 0000000000000000
[ 55.532076] FS: 0000000000000000(0000) GS:ff1a79d0ffc00000(0000) knlGS:0000000000000000
[ 55.540163] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 55.545908] CR2: 0000000000000080 CR3: 00000003487ae003 CR4: 0000000000771ef0
[ 55.553041] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 55.560173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 55.567305] PKRU: 55555554
[ 55.570018] Call Trace:
[ 55.572474] <TASK>
[ 55.574579] ice_service_task+0xaab/0xef0 [ice]
[ 55.579130] process_one_work+0x1c5/0x390
[ 55.583141] ? process_one_work+0x390/0x390
[ 55.587326] worker_thread+0x30/0x360
[ 55.590994] ? process_one_work+0x390/0x390
[ 55.595180] kthread+0xe6/0x110
[ 55.598325] ? kthread_complete_and_exit+0x20/0x20
[ 55.603116] ret_from_fork+0x1f/0x30
[ 55.606698] </TASK>
Fixes: f9f5301e7e ("ice: Register auxiliary device to provide RDMA")
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Ivan Vecera <ivecera@redhat.com>
Reviewed-by: Dave Ertman <david.m.ertman@intel.com>
Tested-by: Gurucharan <gurucharanx.g@intel.com> (A Contingent worker at Intel)
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
358 lines
7.7 KiB
C
358 lines
7.7 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/* Copyright (C) 2021, Intel Corporation. */
|
|
|
|
/* Inter-Driver Communication */
|
|
#include "ice.h"
|
|
#include "ice_lib.h"
|
|
#include "ice_dcb_lib.h"
|
|
|
|
/**
|
|
* ice_get_auxiliary_drv - retrieve iidc_auxiliary_drv struct
|
|
* @pf: pointer to PF struct
|
|
*
|
|
* This function has to be called with a device_lock on the
|
|
* pf->adev.dev to avoid race conditions.
|
|
*/
|
|
static struct iidc_auxiliary_drv *ice_get_auxiliary_drv(struct ice_pf *pf)
|
|
{
|
|
struct auxiliary_device *adev;
|
|
|
|
adev = pf->adev;
|
|
if (!adev || !adev->dev.driver)
|
|
return NULL;
|
|
|
|
return container_of(adev->dev.driver, struct iidc_auxiliary_drv,
|
|
adrv.driver);
|
|
}
|
|
|
|
/**
|
|
* ice_send_event_to_aux - send event to RDMA AUX driver
|
|
* @pf: pointer to PF struct
|
|
* @event: event struct
|
|
*/
|
|
void ice_send_event_to_aux(struct ice_pf *pf, struct iidc_event *event)
|
|
{
|
|
struct iidc_auxiliary_drv *iadrv;
|
|
|
|
if (WARN_ON_ONCE(!in_task()))
|
|
return;
|
|
|
|
mutex_lock(&pf->adev_mutex);
|
|
if (!pf->adev)
|
|
goto finish;
|
|
|
|
device_lock(&pf->adev->dev);
|
|
iadrv = ice_get_auxiliary_drv(pf);
|
|
if (iadrv && iadrv->event_handler)
|
|
iadrv->event_handler(pf, event);
|
|
device_unlock(&pf->adev->dev);
|
|
finish:
|
|
mutex_unlock(&pf->adev_mutex);
|
|
}
|
|
|
|
/**
|
|
* ice_find_vsi - Find the VSI from VSI ID
|
|
* @pf: The PF pointer to search in
|
|
* @vsi_num: The VSI ID to search for
|
|
*/
|
|
static struct ice_vsi *ice_find_vsi(struct ice_pf *pf, u16 vsi_num)
|
|
{
|
|
int i;
|
|
|
|
ice_for_each_vsi(pf, i)
|
|
if (pf->vsi[i] && pf->vsi[i]->vsi_num == vsi_num)
|
|
return pf->vsi[i];
|
|
return NULL;
|
|
}
|
|
|
|
/**
|
|
* ice_add_rdma_qset - Add Leaf Node for RDMA Qset
|
|
* @pf: PF struct
|
|
* @qset: Resource to be allocated
|
|
*/
|
|
int ice_add_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset)
|
|
{
|
|
u16 max_rdmaqs[ICE_MAX_TRAFFIC_CLASS];
|
|
struct ice_vsi *vsi;
|
|
struct device *dev;
|
|
u32 qset_teid;
|
|
u16 qs_handle;
|
|
int status;
|
|
int i;
|
|
|
|
if (WARN_ON(!pf || !qset))
|
|
return -EINVAL;
|
|
|
|
dev = ice_pf_to_dev(pf);
|
|
|
|
if (!ice_is_rdma_ena(pf))
|
|
return -EINVAL;
|
|
|
|
vsi = ice_get_main_vsi(pf);
|
|
if (!vsi) {
|
|
dev_err(dev, "RDMA QSet invalid VSI\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
ice_for_each_traffic_class(i)
|
|
max_rdmaqs[i] = 0;
|
|
|
|
max_rdmaqs[qset->tc]++;
|
|
qs_handle = qset->qs_handle;
|
|
|
|
status = ice_cfg_vsi_rdma(vsi->port_info, vsi->idx, vsi->tc_cfg.ena_tc,
|
|
max_rdmaqs);
|
|
if (status) {
|
|
dev_err(dev, "Failed VSI RDMA Qset config\n");
|
|
return status;
|
|
}
|
|
|
|
status = ice_ena_vsi_rdma_qset(vsi->port_info, vsi->idx, qset->tc,
|
|
&qs_handle, 1, &qset_teid);
|
|
if (status) {
|
|
dev_err(dev, "Failed VSI RDMA Qset enable\n");
|
|
return status;
|
|
}
|
|
vsi->qset_handle[qset->tc] = qset->qs_handle;
|
|
qset->teid = qset_teid;
|
|
|
|
return 0;
|
|
}
|
|
EXPORT_SYMBOL_GPL(ice_add_rdma_qset);
|
|
|
|
/**
|
|
* ice_del_rdma_qset - Delete leaf node for RDMA Qset
|
|
* @pf: PF struct
|
|
* @qset: Resource to be freed
|
|
*/
|
|
int ice_del_rdma_qset(struct ice_pf *pf, struct iidc_rdma_qset_params *qset)
|
|
{
|
|
struct ice_vsi *vsi;
|
|
u32 teid;
|
|
u16 q_id;
|
|
|
|
if (WARN_ON(!pf || !qset))
|
|
return -EINVAL;
|
|
|
|
vsi = ice_find_vsi(pf, qset->vport_id);
|
|
if (!vsi) {
|
|
dev_err(ice_pf_to_dev(pf), "RDMA Invalid VSI\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
q_id = qset->qs_handle;
|
|
teid = qset->teid;
|
|
|
|
vsi->qset_handle[qset->tc] = 0;
|
|
|
|
return ice_dis_vsi_rdma_qset(vsi->port_info, 1, &teid, &q_id);
|
|
}
|
|
EXPORT_SYMBOL_GPL(ice_del_rdma_qset);
|
|
|
|
/**
|
|
* ice_rdma_request_reset - accept request from RDMA to perform a reset
|
|
* @pf: struct for PF
|
|
* @reset_type: type of reset
|
|
*/
|
|
int ice_rdma_request_reset(struct ice_pf *pf, enum iidc_reset_type reset_type)
|
|
{
|
|
enum ice_reset_req reset;
|
|
|
|
if (WARN_ON(!pf))
|
|
return -EINVAL;
|
|
|
|
switch (reset_type) {
|
|
case IIDC_PFR:
|
|
reset = ICE_RESET_PFR;
|
|
break;
|
|
case IIDC_CORER:
|
|
reset = ICE_RESET_CORER;
|
|
break;
|
|
case IIDC_GLOBR:
|
|
reset = ICE_RESET_GLOBR;
|
|
break;
|
|
default:
|
|
dev_err(ice_pf_to_dev(pf), "incorrect reset request\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
return ice_schedule_reset(pf, reset);
|
|
}
|
|
EXPORT_SYMBOL_GPL(ice_rdma_request_reset);
|
|
|
|
/**
|
|
* ice_rdma_update_vsi_filter - update main VSI filters for RDMA
|
|
* @pf: pointer to struct for PF
|
|
* @vsi_id: VSI HW idx to update filter on
|
|
* @enable: bool whether to enable or disable filters
|
|
*/
|
|
int ice_rdma_update_vsi_filter(struct ice_pf *pf, u16 vsi_id, bool enable)
|
|
{
|
|
struct ice_vsi *vsi;
|
|
int status;
|
|
|
|
if (WARN_ON(!pf))
|
|
return -EINVAL;
|
|
|
|
vsi = ice_find_vsi(pf, vsi_id);
|
|
if (!vsi)
|
|
return -EINVAL;
|
|
|
|
status = ice_cfg_rdma_fltr(&pf->hw, vsi->idx, enable);
|
|
if (status) {
|
|
dev_err(ice_pf_to_dev(pf), "Failed to %sable RDMA filtering\n",
|
|
enable ? "en" : "dis");
|
|
} else {
|
|
if (enable)
|
|
vsi->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN;
|
|
else
|
|
vsi->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN;
|
|
}
|
|
|
|
return status;
|
|
}
|
|
EXPORT_SYMBOL_GPL(ice_rdma_update_vsi_filter);
|
|
|
|
/**
|
|
* ice_get_qos_params - parse QoS params for RDMA consumption
|
|
* @pf: pointer to PF struct
|
|
* @qos: set of QoS values
|
|
*/
|
|
void ice_get_qos_params(struct ice_pf *pf, struct iidc_qos_params *qos)
|
|
{
|
|
struct ice_dcbx_cfg *dcbx_cfg;
|
|
unsigned int i;
|
|
u32 up2tc;
|
|
|
|
dcbx_cfg = &pf->hw.port_info->qos_cfg.local_dcbx_cfg;
|
|
up2tc = rd32(&pf->hw, PRTDCB_TUP2TC);
|
|
|
|
qos->num_tc = ice_dcb_get_num_tc(dcbx_cfg);
|
|
for (i = 0; i < IIDC_MAX_USER_PRIORITY; i++)
|
|
qos->up2tc[i] = (up2tc >> (i * 3)) & 0x7;
|
|
|
|
for (i = 0; i < IEEE_8021QAZ_MAX_TCS; i++)
|
|
qos->tc_info[i].rel_bw = dcbx_cfg->etscfg.tcbwtable[i];
|
|
|
|
qos->pfc_mode = dcbx_cfg->pfc_mode;
|
|
if (qos->pfc_mode == IIDC_DSCP_PFC_MODE)
|
|
for (i = 0; i < IIDC_MAX_DSCP_MAPPING; i++)
|
|
qos->dscp_map[i] = dcbx_cfg->dscp_map[i];
|
|
}
|
|
EXPORT_SYMBOL_GPL(ice_get_qos_params);
|
|
|
|
/**
|
|
* ice_reserve_rdma_qvector - Reserve vector resources for RDMA driver
|
|
* @pf: board private structure to initialize
|
|
*/
|
|
static int ice_reserve_rdma_qvector(struct ice_pf *pf)
|
|
{
|
|
if (ice_is_rdma_ena(pf)) {
|
|
int index;
|
|
|
|
index = ice_get_res(pf, pf->irq_tracker, pf->num_rdma_msix,
|
|
ICE_RES_RDMA_VEC_ID);
|
|
if (index < 0)
|
|
return index;
|
|
pf->num_avail_sw_msix -= pf->num_rdma_msix;
|
|
pf->rdma_base_vector = (u16)index;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* ice_adev_release - function to be mapped to AUX dev's release op
|
|
* @dev: pointer to device to free
|
|
*/
|
|
static void ice_adev_release(struct device *dev)
|
|
{
|
|
struct iidc_auxiliary_dev *iadev;
|
|
|
|
iadev = container_of(dev, struct iidc_auxiliary_dev, adev.dev);
|
|
kfree(iadev);
|
|
}
|
|
|
|
/**
|
|
* ice_plug_aux_dev - allocate and register AUX device
|
|
* @pf: pointer to pf struct
|
|
*/
|
|
int ice_plug_aux_dev(struct ice_pf *pf)
|
|
{
|
|
struct iidc_auxiliary_dev *iadev;
|
|
struct auxiliary_device *adev;
|
|
int ret;
|
|
|
|
/* if this PF doesn't support a technology that requires auxiliary
|
|
* devices, then gracefully exit
|
|
*/
|
|
if (!ice_is_rdma_ena(pf))
|
|
return 0;
|
|
|
|
iadev = kzalloc(sizeof(*iadev), GFP_KERNEL);
|
|
if (!iadev)
|
|
return -ENOMEM;
|
|
|
|
adev = &iadev->adev;
|
|
iadev->pf = pf;
|
|
|
|
adev->id = pf->aux_idx;
|
|
adev->dev.release = ice_adev_release;
|
|
adev->dev.parent = &pf->pdev->dev;
|
|
adev->name = pf->rdma_mode & IIDC_RDMA_PROTOCOL_ROCEV2 ? "roce" : "iwarp";
|
|
|
|
ret = auxiliary_device_init(adev);
|
|
if (ret) {
|
|
kfree(iadev);
|
|
return ret;
|
|
}
|
|
|
|
ret = auxiliary_device_add(adev);
|
|
if (ret) {
|
|
auxiliary_device_uninit(adev);
|
|
return ret;
|
|
}
|
|
|
|
mutex_lock(&pf->adev_mutex);
|
|
pf->adev = adev;
|
|
mutex_unlock(&pf->adev_mutex);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/* ice_unplug_aux_dev - unregister and free AUX device
|
|
* @pf: pointer to pf struct
|
|
*/
|
|
void ice_unplug_aux_dev(struct ice_pf *pf)
|
|
{
|
|
struct auxiliary_device *adev;
|
|
|
|
mutex_lock(&pf->adev_mutex);
|
|
adev = pf->adev;
|
|
pf->adev = NULL;
|
|
mutex_unlock(&pf->adev_mutex);
|
|
|
|
if (adev) {
|
|
auxiliary_device_delete(adev);
|
|
auxiliary_device_uninit(adev);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* ice_init_rdma - initializes PF for RDMA use
|
|
* @pf: ptr to ice_pf
|
|
*/
|
|
int ice_init_rdma(struct ice_pf *pf)
|
|
{
|
|
struct device *dev = &pf->pdev->dev;
|
|
int ret;
|
|
|
|
/* Reserve vector resources */
|
|
ret = ice_reserve_rdma_qvector(pf);
|
|
if (ret < 0) {
|
|
dev_err(dev, "failed to reserve vectors for RDMA\n");
|
|
return ret;
|
|
}
|
|
pf->rdma_mode |= IIDC_RDMA_PROTOCOL_ROCEV2;
|
|
return ice_plug_aux_dev(pf);
|
|
}
|