linux/drivers/infiniband/core/cache.c
Parav Pandit 5c5702e259 RDMA/core: Set right entry state before releasing reference
Currently add_modify_gid() for the IB link layer has the following issue
in the cache update path.

When GID update event occurs, core releases reference to the GID
table without updating its state and/or entry pointer.

CPU-0                              CPU-1
------                             -----
ib_cache_update()                    IPoIB ULP
   add_modify_gid()                   [..]
      put_gid_entry()
      refcnt = 0, but
      state = valid,
      entry is valid.
      (work item is not yet executed).
                                   ipoib_create_ah()
                                     rdma_create_ah()
                                        rdma_get_gid_attr() <--
                                   	Tries to acquire gid_attr
                                        which has refcnt = 0.
                                   	This is incorrect.

The GID entry state and the entry pointer together provide the accurate
GID entry state. Such fields must be updated under the rwlock to protect
against readers, and they must be in a sane state before the refcount can
drop to zero. Otherwise the above race condition can happen, leading to a
use-after-free situation.

Following backtrace has been observed when cache update for an IB port
is triggered while IPoIB ULP is creating an AH.

Therefore, when updating a GID entry, first mark a valid entry as invalid
through its state and set the barrier so that no callers can acquire
the GID entry, and only then release the reference to it.

refcount_t: increment on 0; use-after-free.
WARNING: CPU: 4 PID: 29106 at lib/refcount.c:153 refcount_inc_checked+0x30/0x50
Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core]
RIP: 0010:refcount_inc_checked+0x30/0x50
RSP: 0018:ffff8802ad36f600 EFLAGS: 00010082
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000000002 RSI: 0000000000000008 RDI: ffffffff86710100
RBP: ffff8802d6e60a30 R08: ffffed005d67bf8b R09: ffffed005d67bf8b
R10: 0000000000000001 R11: ffffed005d67bf8a R12: ffff88027620cee8
R13: ffff8802d6e60988 R14: ffff8802d6e60a78 R15: 0000000000000202
FS: 0000000000000000(0000) GS:ffff8802eb200000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f3ab35e5c88 CR3: 00000002ce84a000 CR4: 00000000000006e0
IPv6: ADDRCONF(NETDEV_CHANGE): ib1: link becomes ready
Call Trace:
rdma_get_gid_attr+0x220/0x310 [ib_core]
? lock_acquire+0x145/0x3a0
rdma_fill_sgid_attr+0x32c/0x470 [ib_core]
rdma_create_ah+0x89/0x160 [ib_core]
? rdma_fill_sgid_attr+0x470/0x470 [ib_core]
? ipoib_create_ah+0x52/0x260 [ib_ipoib]
ipoib_create_ah+0xf5/0x260 [ib_ipoib]
ipoib_mcast_join_complete+0xbbe/0x2540 [ib_ipoib]

Fixes: b150c3862d ("IB/core: Introduce GID entry reference counts")
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
2018-09-25 15:01:09 -06:00

1464 lines
37 KiB
C

/*
* Copyright (c) 2004 Topspin Communications. All rights reserved.
* Copyright (c) 2005 Intel Corporation. All rights reserved.
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2005 Voltaire, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/netdevice.h>
#include <net/addrconf.h>
#include <rdma/ib_cache.h>
#include "core_priv.h"
/*
 * P_Key table cached for a port. @table is a trailing variable-length
 * array; the readers in this file bound their index by @table_len.
 */
struct ib_pkey_cache {
	int table_len;
	/* C99 flexible array member (replaces the GNU zero-length array). */
	u16 table[];
};
/*
 * Work item bundling a device pointer, a port number and an
 * enforce_security flag.
 * NOTE(review): the code that queues and consumes this struct is outside
 * this chunk; presumably it drives a deferred per-port cache update -
 * confirm against the handler.
 */
struct ib_update_work {
struct work_struct work;
struct ib_device *device;
u8 port_num;
bool enforce_security;
};
/*
 * All-zero GID. It has no initializer, so as a file-scope object it is
 * zero-initialized; rdma_is_zero_gid() compares candidates against it.
 */
union ib_gid zgid;
EXPORT_SYMBOL(zgid);
/*
 * Bit flags for the mask argument of find_gid(). Each set bit enables one
 * comparison between the search values and a candidate table entry.
 */
enum gid_attr_find_mask {
/* Compare the GID value itself. */
GID_ATTR_FIND_MASK_GID = 1UL << 0,
/* Compare the entry's net device pointer. */
GID_ATTR_FIND_MASK_NETDEV = 1UL << 1,
/* Compare whether the slot is a reserved default-GID slot. */
GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2,
/* Compare the GID type. */
GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3,
};
/* Life-cycle state of a GID table entry. */
enum gid_table_entry_state {
/* State given to a freshly allocated entry by alloc_gid_entry(). */
GID_TABLE_ENTRY_INVALID = 1,
/* State set by store_gid_entry() when the entry is put in the table. */
GID_TABLE_ENTRY_VALID = 2,
/*
 * Indicates that the entry is pending removal; there may still be
 * active users of this GID entry.
 * When the last user of the GID entry releases its reference,
 * the GID entry is detached from the table.
 */
GID_TABLE_ENTRY_PENDING_DEL = 3,
};
/*
 * One slot of a port's GID table. Pointers to @attr are handed out to
 * callers and mapped back to the entry with container_of().
 */
struct ib_gid_table_entry {
/* Reference count; released via schedule_free_gid() or free_gid_entry(). */
struct kref kref;
/* Work item initialised to run free_gid_work(). */
struct work_struct del_work;
/* GID attributes (device, port, index, gid, ndev, type). */
struct ib_gid_attr attr;
/* Opaque pointer passed to the device's add_gid()/del_gid() callbacks. */
void *context;
/* Current life-cycle state, see enum gid_table_entry_state. */
enum gid_table_entry_state state;
};
/* Per-port GID table. */
struct ib_gid_table {
/* Number of slots in data_vec, as set by alloc_gid_table(). */
int sz;
/* In RoCE, adding a GID to the table requires:
 * (a) Find if this GID already exists.
 * (b) Find a free space.
 * (c) Write the new GID
 *
 * Delete requires a different set of operations:
 * (a) Find the GID
 * (b) Delete it.
 *
 **/
/* Any writer to data_vec must hold this lock and the write side of
 * rwlock. Readers must hold only rwlock. All writers must be in a
 * sleepable context.
 */
struct mutex lock;
/* rwlock protects data_vec[ix]->state and entry pointer.
 */
rwlock_t rwlock;
/* Array of sz entry pointers; a NULL pointer denotes a free slot. */
struct ib_gid_table_entry **data_vec;
/* bit field, each bit indicates the index of default GID */
u32 default_gid_indices;
};
/*
 * Raise an IB_EVENT_GID_CHANGE event for @port of @ib_dev through
 * ib_dispatch_event().
 */
static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port)
{
	struct ib_event gid_event;

	gid_event.event = IB_EVENT_GID_CHANGE;
	gid_event.device = ib_dev;
	gid_event.element.port_num = port;

	ib_dispatch_event(&gid_event);
}
/*
 * Human-readable name for each GID type, indexed by enum ib_gid_type.
 * Used by ib_cache_gid_type_str() and ib_cache_gid_parse_type_str().
 */
static const char * const gid_type_str[] = {
[IB_GID_TYPE_IB] = "IB/RoCE v1",
[IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2",
};
const char *ib_cache_gid_type_str(enum ib_gid_type gid_type)
{
if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type])
return gid_type_str[gid_type];
return "Invalid GID type";
}
EXPORT_SYMBOL(ib_cache_gid_type_str);
/**
 * rdma_is_zero_gid - Test whether a GID equals the all-zero GID.
 * @gid:	GID to check
 *
 * Returns true when every byte of @gid matches zgid, false otherwise.
 */
bool rdma_is_zero_gid(const union ib_gid *gid)
{
	return memcmp(gid, &zgid, sizeof(*gid)) == 0;
}
EXPORT_SYMBOL(rdma_is_zero_gid);
/**
 * is_gid_index_default - Tell whether an index is a reserved default slot.
 * @table:	GID table pointer
 * @index:	Index to check in the GID table
 *
 * Returns true if bit @index is set in the table's default_gid_indices
 * bit field. Indices of 32 and above are never default slots.
 */
static bool is_gid_index_default(const struct ib_gid_table *table,
				 unsigned int index)
{
	if (index >= 32)
		return false;

	return (table->default_gid_indices & BIT(index)) != 0;
}
/*
 * Translate a GID type name back into its enum ib_gid_type value.
 * A single trailing newline in @buf is ignored. Returns the matching
 * index into gid_type_str[], or -EINVAL for an empty or unknown name.
 */
int ib_cache_gid_parse_type_str(const char *buf)
{
	size_t len = strlen(buf);
	unsigned int type;

	if (!len)
		return -EINVAL;

	if (buf[len - 1] == '\n')
		len--;

	for (type = 0; type < ARRAY_SIZE(gid_type_str); type++) {
		const char *name = gid_type_str[type];

		if (!name)
			continue;

		/* Require an exact match, not just a common prefix. */
		if (strncmp(buf, name, len) == 0 && strlen(name) == len)
			return type;
	}

	return -EINVAL;
}
EXPORT_SYMBOL(ib_cache_gid_parse_type_str);
/*
 * Return the GID table of @port. @port is an RDMA port number and is
 * converted to a zero-based index into device->cache.ports[].
 */
static struct ib_gid_table *rdma_gid_table(struct ib_device *device, u8 port)
{
	int port_idx = port - rdma_start_port(device);

	return device->cache.ports[port_idx].gid;
}
/* A table slot is free when it holds no entry pointer. */
static bool is_gid_entry_free(const struct ib_gid_table_entry *entry)
{
	return entry == NULL;
}
/* An entry is valid when the slot is occupied and its state is VALID. */
static bool is_gid_entry_valid(const struct ib_gid_table_entry *entry)
{
	if (!entry)
		return false;

	return entry->state == GID_TABLE_ENTRY_VALID;
}
/*
 * kref release callback used by put_gid_entry(): defer the actual free by
 * queueing the entry's del_work on ib_wq.
 */
static void schedule_free_gid(struct kref *kref)
{
	struct ib_gid_table_entry *gid_entry;

	gid_entry = container_of(kref, struct ib_gid_table_entry, kref);
	queue_work(ib_wq, &gid_entry->del_work);
}
/*
 * Tear down a GID entry and free it.
 *
 * Both callers visible in this file run with table->lock held:
 * free_gid_work() takes it explicitly, and free_gid_entry() is reached
 * through put_gid_entry_locked() from del_gid(), which asserts it.
 *
 * Steps: remove the GID from the device (only when the port has a RoCE GID
 * table and the entry was ever made valid), detach the entry from its
 * table slot, drop the netdev reference and free the memory.
 */
static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
{
struct ib_device *device = entry->attr.device;
u8 port_num = entry->attr.port_num;
struct ib_gid_table *table = rdma_gid_table(device, port_num);
pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
device->name, port_num, entry->attr.index,
entry->attr.gid.raw);
/* An entry still in INVALID state was never stored in the table. */
if (rdma_cap_roce_gid_table(device, port_num) &&
entry->state != GID_TABLE_ENTRY_INVALID)
device->del_gid(&entry->attr, &entry->context);
write_lock_irq(&table->rwlock);
/*
 * Only clear the slot if it still points at this entry. If a new
 * entry was stored at the same index by the time we free here,
 * leave the table untouched so the new entry is not overwritten
 * with NULL.
 */
if (entry == table->data_vec[entry->attr.index])
table->data_vec[entry->attr.index] = NULL;
/* Now this index is ready to be allocated */
write_unlock_irq(&table->rwlock);
/* Balances the dev_hold() taken in alloc_gid_entry(). */
if (entry->attr.ndev)
dev_put(entry->attr.ndev);
kfree(entry);
}
/*
 * kref release callback used by put_gid_entry_locked(): free the entry
 * synchronously instead of deferring to the workqueue.
 */
static void free_gid_entry(struct kref *kref)
{
	struct ib_gid_table_entry *gid_entry;

	gid_entry = container_of(kref, struct ib_gid_table_entry, kref);
	free_gid_entry_locked(gid_entry);
}
/**
 * free_gid_work - Deferred release of a GID entry
 * @work: del_work member of the GID entry that is to be deleted.
 *
 * Takes the table mutex of the entry's port and hands the entry to
 * free_gid_entry_locked(), which removes it from the device (if the
 * provider supports it), drops the netdevice reference and frees it.
 */
static void free_gid_work(struct work_struct *work)
{
	struct ib_gid_table_entry *gid_entry =
		container_of(work, struct ib_gid_table_entry, del_work);
	struct ib_gid_table *table =
		rdma_gid_table(gid_entry->attr.device,
			       gid_entry->attr.port_num);

	mutex_lock(&table->lock);
	free_gid_entry_locked(gid_entry);
	mutex_unlock(&table->lock);
}
/*
 * Allocate a GID entry holding a copy of @attr. The new entry starts with
 * one reference, in INVALID state, and holds a reference on attr->ndev
 * when one is set. Returns NULL on allocation failure.
 */
static struct ib_gid_table_entry *
alloc_gid_entry(const struct ib_gid_attr *attr)
{
	struct ib_gid_table_entry *new_entry;

	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
	if (!new_entry)
		return NULL;

	kref_init(&new_entry->kref);
	memcpy(&new_entry->attr, attr, sizeof(*attr));
	if (new_entry->attr.ndev)
		dev_hold(new_entry->attr.ndev);

	new_entry->state = GID_TABLE_ENTRY_INVALID;
	INIT_WORK(&new_entry->del_work, free_gid_work);

	return new_entry;
}
/*
 * Publish @entry in @table at entry->attr.index.
 *
 * The state is switched to VALID before the pointer is stored, so a reader
 * that finds the pointer under rwlock also sees the valid state. The
 * caller must hold table->lock (asserted below).
 */
static void store_gid_entry(struct ib_gid_table *table,
struct ib_gid_table_entry *entry)
{
entry->state = GID_TABLE_ENTRY_VALID;
pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
entry->attr.device->name, entry->attr.port_num,
entry->attr.index, entry->attr.gid.raw);
lockdep_assert_held(&table->lock);
/* Writers take the write side of rwlock around the slot update. */
write_lock_irq(&table->rwlock);
table->data_vec[entry->attr.index] = entry;
write_unlock_irq(&table->rwlock);
}
/* Take an additional reference on @entry. */
static void get_gid_entry(struct ib_gid_table_entry *entry)
{
kref_get(&entry->kref);
}
/*
 * Drop a reference on @entry. When the last reference goes away the free
 * is deferred to the workqueue via schedule_free_gid().
 */
static void put_gid_entry(struct ib_gid_table_entry *entry)
{
kref_put(&entry->kref, schedule_free_gid);
}
/*
 * Drop a reference on @entry. When the last reference goes away the entry
 * is freed immediately via free_gid_entry(). The only caller in this file,
 * del_gid(), runs with table->lock held.
 */
static void put_gid_entry_locked(struct ib_gid_table_entry *entry)
{
kref_put(&entry->kref, free_gid_entry);
}
/*
 * Program a RoCE GID into the device.
 *
 * Fails with -EINVAL when the entry has no net device. When the port has
 * a RoCE GID table the device's add_gid() callback is invoked and its
 * error, if any, is returned. Returns 0 otherwise.
 */
static int add_roce_gid(struct ib_gid_table_entry *entry)
{
	const struct ib_gid_attr *attr = &entry->attr;
	int err;

	if (!attr->ndev) {
		pr_err("%s NULL netdev device=%s port=%d index=%d\n",
		       __func__, attr->device->name, attr->port_num,
		       attr->index);
		return -EINVAL;
	}

	if (!rdma_cap_roce_gid_table(attr->device, attr->port_num))
		return 0;

	err = attr->device->add_gid(attr, &entry->context);
	if (err)
		pr_err("%s GID add failed device=%s port=%d index=%d\n",
		       __func__, attr->device->name, attr->port_num,
		       attr->index);

	return err;
}
/**
 * del_gid - Delete GID table entry
 *
 * @ib_dev: IB device whose GID entry is to be deleted
 * @port: Port number of the IB device
 * @table: GID table of the IB device for a port
 * @ix: GID entry index to delete
 *
 * The caller must hold table->lock, and the slot at @ix must be occupied
 * (it is dereferenced unconditionally). The entry is first marked
 * PENDING_DEL under the write lock so that readers stop handing it out,
 * and only then is the table's reference dropped.
 */
static void del_gid(struct ib_device *ib_dev, u8 port,
struct ib_gid_table *table, int ix)
{
struct ib_gid_table_entry *entry;
lockdep_assert_held(&table->lock);
pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__,
ib_dev->name, port, ix,
table->data_vec[ix]->attr.gid.raw);
write_lock_irq(&table->rwlock);
entry = table->data_vec[ix];
entry->state = GID_TABLE_ENTRY_PENDING_DEL;
/*
 * For non RoCE protocol, GID entry slot is ready to use.
 */
if (!rdma_protocol_roce(ib_dev, port))
table->data_vec[ix] = NULL;
write_unlock_irq(&table->rwlock);
/* Frees the entry right away if this was the last reference. */
put_gid_entry_locked(entry);
}
/**
 * add_modify_gid - Add or modify GID table entry
 *
 * @table:	GID table in which the GID is to be added or modified
 * @attr:	Attributes of the GID, including the target index
 *
 * Any valid entry already stored at attr->index is deleted first. A zero
 * GID is accepted (some HCAs report them on non RoCE ports) but is not
 * stored, which leaves the slot empty.
 *
 * Returns 0 on success or an appropriate error code.
 */
static int add_modify_gid(struct ib_gid_table *table,
			  const struct ib_gid_attr *attr)
{
	struct ib_gid_table_entry *new_entry;
	int err;

	/*
	 * Retire the current occupant, if any, so that the index is safe
	 * to write to.
	 */
	if (is_gid_entry_valid(table->data_vec[attr->index]))
		del_gid(attr->device, attr->port_num, table, attr->index);

	/*
	 * HCAs may report several GID entries of which only one is valid,
	 * with the rest left as the zero GID. Those become empty slots.
	 */
	if (rdma_is_zero_gid(&attr->gid))
		return 0;

	new_entry = alloc_gid_entry(attr);
	if (!new_entry)
		return -ENOMEM;

	if (rdma_protocol_roce(attr->device, attr->port_num)) {
		err = add_roce_gid(new_entry);
		if (err) {
			put_gid_entry(new_entry);
			return err;
		}
	}

	store_gid_entry(table, new_entry);
	return 0;
}
/*
 * Search @table for a valid entry and, optionally, for a free slot.
 *
 * @table:       GID table to scan
 * @gid:         GID value to match (used when GID_ATTR_FIND_MASK_GID is set)
 * @val:         supplies gid_type and ndev for the corresponding mask bits
 * @default_gid: expected default-slot membership, both for the match
 *               (GID_ATTR_FIND_MASK_DEFAULT) and for the free slot
 * @mask:        combination of enum gid_attr_find_mask bits
 * @pempty:      if non-NULL, receives the index of the first free slot
 *               whose default-slot membership equals @default_gid, or -1
 *
 * Returns the index of the first valid entry that passes every comparison
 * enabled in @mask, or -1 if there is none.
 *
 * rwlock should be read locked, or lock should be held
 */
static int find_gid(struct ib_gid_table *table, const union ib_gid *gid,
const struct ib_gid_attr *val, bool default_gid,
unsigned long mask, int *pempty)
{
int i = 0;
int found = -1;
/* With no pempty, treat the free-slot search as already satisfied. */
int empty = pempty ? -1 : 0;
/* Keep scanning until both the match and the free slot are resolved. */
while (i < table->sz && (found < 0 || empty < 0)) {
struct ib_gid_table_entry *data = table->data_vec[i];
struct ib_gid_attr *attr;
int curr_index = i;
i++;
/* find_gid() is used during GID addition where it is expected
 * to return a free entry slot which is not duplicate.
 * Free entry slot is requested and returned if pempty is set,
 * so lookup free slot only if requested.
 */
if (pempty && empty < 0) {
if (is_gid_entry_free(data) &&
default_gid ==
is_gid_index_default(table, curr_index)) {
/*
 * Found an invalid (free) entry; allocate it.
 * If default GID is requested, then our
 * found slot must be one of the DEFAULT
 * reserved slots or we fail.
 * This ensures that only DEFAULT reserved
 * slots are used for default property GIDs.
 */
empty = curr_index;
}
}
/*
 * Additionally find_gid() is used to find valid entry during
 * lookup operation; so ignore the entries which are marked as
 * pending for removal and the entries which are marked as
 * invalid.
 */
if (!is_gid_entry_valid(data))
continue;
/* A match is already recorded; only the free slot is still wanted. */
if (found >= 0)
continue;
attr = &data->attr;
if (mask & GID_ATTR_FIND_MASK_GID_TYPE &&
attr->gid_type != val->gid_type)
continue;
if (mask & GID_ATTR_FIND_MASK_GID &&
memcmp(gid, &data->attr.gid, sizeof(*gid)))
continue;
if (mask & GID_ATTR_FIND_MASK_NETDEV &&
attr->ndev != val->ndev)
continue;
if (mask & GID_ATTR_FIND_MASK_DEFAULT &&
is_gid_index_default(table, curr_index) != default_gid)
continue;
found = curr_index;
}
if (pempty)
*pempty = empty;
return found;
}
/*
 * Build the default GID for @dev: the subnet prefix is set to
 * fe80:0000:0000:0000 and the lower 8 bytes are filled in by
 * addrconf_ifid_eui48() from the net device.
 */
static void make_default_gid(struct net_device *dev, union ib_gid *gid)
{
gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
addrconf_ifid_eui48(&gid->raw[8], dev);
}
/*
 * Add @gid to the GID table of @port unless an entry matching @mask is
 * already present.
 *
 * @attr is completed in place (device, index, port_num, gid) before it is
 * stored. A GID change event is dispatched when a new entry was added.
 *
 * Returns 0 on success or when a matching entry already exists, -EINVAL
 * for the zero GID, -ENOSPC when no suitable free slot is left, or the
 * error reported by add_modify_gid().
 */
static int __ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
			      union ib_gid *gid, struct ib_gid_attr *attr,
			      unsigned long mask, bool default_gid)
{
	struct ib_gid_table *table;
	int free_ix;
	int ret = 0;

	/*
	 * The zero GID may not be added, in support of IB spec version 1.3
	 * section 4.1.1 point (6), section 12.7.10 and section 12.7.20.
	 */
	if (rdma_is_zero_gid(gid))
		return -EINVAL;

	table = rdma_gid_table(ib_dev, port);

	mutex_lock(&table->lock);
	if (find_gid(table, gid, attr, default_gid, mask, &free_ix) < 0) {
		if (free_ix < 0) {
			ret = -ENOSPC;
		} else {
			attr->device = ib_dev;
			attr->index = free_ix;
			attr->port_num = port;
			attr->gid = *gid;

			ret = add_modify_gid(table, attr);
			if (!ret)
				dispatch_gid_change_event(ib_dev, port);
		}
	}
	mutex_unlock(&table->lock);

	if (ret)
		pr_warn("%s: unable to add gid %pI6 error=%d\n",
			__func__, gid->raw, ret);

	return ret;
}
/*
 * Add a non-default GID to the cache of @port.
 *
 * When the device exposes get_netdev() and the port's net device differs
 * from attr->ndev, a GID equal to that net device's default GID is
 * rejected with -EPERM. Otherwise the result of __ib_cache_gid_add() is
 * returned.
 */
int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
		     union ib_gid *gid, struct ib_gid_attr *attr)
{
	const unsigned long mask = GID_ATTR_FIND_MASK_GID |
				   GID_ATTR_FIND_MASK_GID_TYPE |
				   GID_ATTR_FIND_MASK_NETDEV;

	if (ib_dev->get_netdev) {
		struct net_device *idev = ib_dev->get_netdev(ib_dev, port);

		if (idev) {
			bool is_default = false;

			if (attr->ndev != idev) {
				union ib_gid default_gid;

				/* Adding default GIDs is not permitted */
				make_default_gid(idev, &default_gid);
				is_default = !memcmp(gid, &default_gid,
						     sizeof(*gid));
			}

			/* Release the reference obtained by get_netdev(). */
			dev_put(idev);
			if (is_default)
				return -EPERM;
		}
	}

	return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
}
/*
 * Delete the first GID entry of @port that matches @gid / @attr under
 * @mask. A GID change event is dispatched on success.
 *
 * Returns 0 when an entry was deleted, -EINVAL when nothing matched.
 */
static int
_ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
		  union ib_gid *gid, struct ib_gid_attr *attr,
		  unsigned long mask, bool default_gid)
{
	struct ib_gid_table *table = rdma_gid_table(ib_dev, port);
	int ret = 0;
	int ix;

	mutex_lock(&table->lock);
	ix = find_gid(table, gid, attr, default_gid, mask, NULL);
	if (ix >= 0) {
		del_gid(ib_dev, port, table, ix);
		dispatch_gid_change_event(ib_dev, port);
	} else {
		ret = -EINVAL;
	}
	mutex_unlock(&table->lock);

	if (ret)
		pr_debug("%s: can't delete gid %pI6 error=%d\n",
			 __func__, gid->raw, ret);

	return ret;
}
/*
 * Delete a non-default GID from the cache of @port. The entry must match
 * on GID value, GID type and net device, and must not sit in a default
 * slot.
 */
int ib_cache_gid_del(struct ib_device *ib_dev, u8 port,
		     union ib_gid *gid, struct ib_gid_attr *attr)
{
	const unsigned long match = GID_ATTR_FIND_MASK_GID |
				    GID_ATTR_FIND_MASK_GID_TYPE |
				    GID_ATTR_FIND_MASK_DEFAULT |
				    GID_ATTR_FIND_MASK_NETDEV;

	return _ib_cache_gid_del(ib_dev, port, gid, attr, match, false);
}
/*
 * Delete every valid GID entry of @port whose net device is @ndev.
 * One GID change event is dispatched, after the table lock is released,
 * if at least one entry was deleted. Always returns 0.
 */
int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port,
				     struct net_device *ndev)
{
	struct ib_gid_table *table = rdma_gid_table(ib_dev, port);
	bool deleted = false;
	int ix;

	mutex_lock(&table->lock);
	for (ix = 0; ix < table->sz; ix++) {
		struct ib_gid_table_entry *entry = table->data_vec[ix];

		if (!is_gid_entry_valid(entry) || entry->attr.ndev != ndev)
			continue;

		del_gid(ib_dev, port, table, ix);
		deleted = true;
	}
	mutex_unlock(&table->lock);

	if (deleted)
		dispatch_gid_change_event(ib_dev, port);

	return 0;
}
/**
 * rdma_find_gid_by_port - Look up a valid GID entry on one port.
 * @ib_dev: The device to query.
 * @gid: The GID value to search for.
 * @gid_type: The GID type to search for.
 * @port: The port number of the device where the GID value should be
 *   searched.
 * @ndev: In RoCE, the net device of the device. NULL means ignore.
 *
 * Searches the local software cache of @port for a valid entry matching
 * @gid and @gid_type (and @ndev when given).
 *
 * Returns the GID attributes with a reference held, or ERR_PTR(-ENOENT)
 * when the port is invalid or nothing matched.
 * The caller must invoke rdma_put_gid_attr() to release the reference.
 */
const struct ib_gid_attr *
rdma_find_gid_by_port(struct ib_device *ib_dev,
		      const union ib_gid *gid,
		      enum ib_gid_type gid_type,
		      u8 port, struct net_device *ndev)
{
	struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type};
	const struct ib_gid_attr *attr = ERR_PTR(-ENOENT);
	unsigned long mask = GID_ATTR_FIND_MASK_GID |
			     GID_ATTR_FIND_MASK_GID_TYPE;
	struct ib_gid_table *table;
	unsigned long flags;
	int local_index;

	if (!rdma_is_port_valid(ib_dev, port))
		return ERR_PTR(-ENOENT);

	if (ndev)
		mask |= GID_ATTR_FIND_MASK_NETDEV;

	table = rdma_gid_table(ib_dev, port);

	read_lock_irqsave(&table->rwlock, flags);
	local_index = find_gid(table, gid, &val, false, mask, NULL);
	if (local_index >= 0) {
		struct ib_gid_table_entry *entry =
			table->data_vec[local_index];

		/* Take the reference while the read lock pins the entry. */
		get_gid_entry(entry);
		attr = &entry->attr;
	}
	read_unlock_irqrestore(&table->rwlock, flags);

	return attr;
}
EXPORT_SYMBOL(rdma_find_gid_by_port);
/**
 * rdma_find_gid_by_filter - Look up a GID entry accepted by a filter
 * @ib_dev: The device to query.
 * @gid: The GID value to search for.
 * @port: The port number of the device where the GID value could be
 *   searched.
 * @filter: Called for every valid table entry whose GID equals @gid. The
 *   first entry for which it returns true is the result. It runs with the
 *   table's read lock held and interrupts disabled, so it must not sleep,
 *   and the entry cannot change while it runs. Must not be NULL.
 * @context: Opaque pointer passed through to @filter.
 *
 * Returns the GID attributes of the accepted entry with a reference held,
 * ERR_PTR(-EINVAL) for an invalid port, or ERR_PTR(-ENOENT) when no entry
 * was accepted.
 */
const struct ib_gid_attr *rdma_find_gid_by_filter(
	struct ib_device *ib_dev, const union ib_gid *gid, u8 port,
	bool (*filter)(const union ib_gid *gid, const struct ib_gid_attr *,
		       void *),
	void *context)
{
	const struct ib_gid_attr *res = ERR_PTR(-ENOENT);
	struct ib_gid_table_entry *entry;
	struct ib_gid_table *table;
	unsigned long flags;
	unsigned int i;

	if (!rdma_is_port_valid(ib_dev, port))
		return ERR_PTR(-EINVAL);

	table = rdma_gid_table(ib_dev, port);

	read_lock_irqsave(&table->rwlock, flags);
	for (i = 0; i < table->sz; i++) {
		entry = table->data_vec[i];

		if (is_gid_entry_valid(entry) &&
		    !memcmp(gid, &entry->attr.gid, sizeof(*gid)) &&
		    filter(gid, &entry->attr, context)) {
			get_gid_entry(entry);
			res = &entry->attr;
			break;
		}
	}
	read_unlock_irqrestore(&table->rwlock, flags);

	return res;
}
/*
 * Allocate a GID table with @sz empty slots and initialise its mutex and
 * rwlock. Returns NULL if either allocation fails.
 */
static struct ib_gid_table *alloc_gid_table(int sz)
{
	struct ib_gid_table *table;

	table = kzalloc(sizeof(*table), GFP_KERNEL);
	if (!table)
		return NULL;

	table->data_vec = kcalloc(sz, sizeof(*table->data_vec), GFP_KERNEL);
	if (!table->data_vec) {
		kfree(table);
		return NULL;
	}

	mutex_init(&table->lock);
	table->sz = sz;
	rwlock_init(&table->rwlock);

	return table;
}
/*
 * Free @table and its slot array.
 *
 * Every occupied slot whose reference count is above one is reported as a
 * leak. If any leak is found, the table is deliberately left allocated.
 * A NULL @table is ignored.
 */
static void release_gid_table(struct ib_device *device, u8 port,
			      struct ib_gid_table *table)
{
	bool leak = false;
	int i;

	if (!table)
		return;

	for (i = 0; i < table->sz; i++) {
		struct ib_gid_table_entry *entry = table->data_vec[i];

		if (is_gid_entry_free(entry) || kref_read(&entry->kref) <= 1)
			continue;

		pr_err("GID entry ref leak for %s (index %d) ref=%d\n",
		       device->name, i,
		       kref_read(&entry->kref));
		leak = true;
	}

	if (!leak) {
		kfree(table->data_vec);
		kfree(table);
	}
}
/*
 * Delete every valid entry of @table and, if anything was deleted,
 * dispatch one GID change event after the table lock is dropped.
 * A NULL @table is ignored.
 */
static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port,
				   struct ib_gid_table *table)
{
	bool deleted = false;
	int ix;

	if (!table)
		return;

	mutex_lock(&table->lock);
	for (ix = 0; ix < table->sz; ++ix) {
		if (!is_gid_entry_valid(table->data_vec[ix]))
			continue;

		del_gid(ib_dev, port, table, ix);
		deleted = true;
	}
	mutex_unlock(&table->lock);

	if (deleted)
		dispatch_gid_change_event(ib_dev, port);
}
/*
 * Add or delete the default GID of @ndev on @port, once for each GID type
 * whose bit is set in @gid_type_mask.
 *
 * In SET mode the default GID is generated from @ndev and added to a
 * reserved default slot. In DELETE mode the entry is located by GID type,
 * default-slot membership and net device; the GID value is not compared.
 * Errors from the add/delete helpers are not propagated.
 */
void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port,
				  struct net_device *ndev,
				  unsigned long gid_type_mask,
				  enum ib_cache_gid_default_mode mode)
{
	const unsigned long mask = GID_ATTR_FIND_MASK_GID_TYPE |
				   GID_ATTR_FIND_MASK_DEFAULT |
				   GID_ATTR_FIND_MASK_NETDEV;
	union ib_gid gid = { };
	struct ib_gid_attr gid_attr;
	unsigned int gid_type;

	memset(&gid_attr, 0, sizeof(gid_attr));
	gid_attr.ndev = ndev;

	for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) {
		/* Skip GID types that were not requested. */
		if (!(gid_type_mask & (1UL << gid_type)))
			continue;

		gid_attr.gid_type = gid_type;

		switch (mode) {
		case IB_CACHE_GID_DEFAULT_MODE_SET:
			make_default_gid(ndev, &gid);
			__ib_cache_gid_add(ib_dev, port, &gid,
					   &gid_attr, mask, true);
			break;
		case IB_CACHE_GID_DEFAULT_MODE_DELETE:
			_ib_cache_gid_del(ib_dev, port, &gid,
					  &gid_attr, mask, true);
			break;
		default:
			break;
		}
	}
}
/*
 * Reserve the first slots of @table for default GIDs: one slot per RoCE
 * GID type supported on @port, capped at the table size.
 */
static void gid_table_reserve_default(struct ib_device *ib_dev, u8 port,
				      struct ib_gid_table *table)
{
	unsigned long type_mask = roce_gid_type_mask_support(ib_dev, port);
	unsigned int nr_defaults = hweight_long(type_mask);
	unsigned int slot = 0;

	/* Reserve starting indices for default GIDs */
	while (slot < nr_defaults && slot < table->sz) {
		table->default_gid_indices |= BIT(slot);
		slot++;
	}
}
/*
 * Release the GID table of every port of @ib_dev and clear the cached
 * table pointers.
 */
static void gid_table_release_one(struct ib_device *ib_dev)
{
	u8 port;

	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
		release_gid_table(ib_dev, port, ib_dev->cache.ports[port].gid);
		ib_dev->cache.ports[port].gid = NULL;
	}
}
/*
 * Allocate a GID table for every port of @ib_dev, sized from the port's
 * immutable gid_tbl_len, and reserve its default GID slots.
 *
 * On allocation failure all tables set up so far are released and -ENOMEM
 * is returned. Returns 0 on success.
 */
static int _gid_table_setup_one(struct ib_device *ib_dev)
{
	u8 port;

	for (port = 0; port < ib_dev->phys_port_cnt; port++) {
		u8 rdma_port = port + rdma_start_port(ib_dev);
		struct ib_gid_table *table;

		table = alloc_gid_table(
				ib_dev->port_immutable[rdma_port].gid_tbl_len);
		if (!table) {
			gid_table_release_one(ib_dev);
			return -ENOMEM;
		}

		gid_table_reserve_default(ib_dev, rdma_port, table);
		ib_dev->cache.ports[port].gid = table;
	}

	return 0;
}
/* Delete all valid GID entries on every port of @ib_dev. */
static void gid_table_cleanup_one(struct ib_device *ib_dev)
{
	u8 port;

	for (port = 0; port < ib_dev->phys_port_cnt; port++)
		cleanup_gid_table_port(ib_dev, port + rdma_start_port(ib_dev),
				       ib_dev->cache.ports[port].gid);
}
/*
 * Set up the GID tables of @ib_dev and, on success, trigger a RoCE rescan
 * of the device. Returns 0 or the error from _gid_table_setup_one().
 */
static int gid_table_setup_one(struct ib_device *ib_dev)
{
	int err = _gid_table_setup_one(ib_dev);

	if (!err)
		rdma_roce_rescan_device(ib_dev);

	return err;
}
/**
 * rdma_query_gid - Read the GID content from the GID software cache
 * @device:	Device to query the GID
 * @port_num:	Port number of the device
 * @index:	Index of the GID table entry to read
 * @gid:	Pointer to GID where to store the entry's GID
 *
 * Copies the GID stored at @index of the port's table into @gid. No
 * reference to the GID table entry is taken. @gid is left untouched on
 * failure.
 *
 * Returns 0 on success, or -EINVAL if the port is invalid, @index is out
 * of range, or the slot holds no valid entry.
 */
int rdma_query_gid(struct ib_device *device, u8 port_num,
		   int index, union ib_gid *gid)
{
	struct ib_gid_table *table;
	unsigned long flags;
	int res = -EINVAL;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	table = rdma_gid_table(device, port_num);

	read_lock_irqsave(&table->rwlock, flags);
	if (index >= 0 && index < table->sz &&
	    is_gid_entry_valid(table->data_vec[index])) {
		memcpy(gid, &table->data_vec[index]->attr.gid, sizeof(*gid));
		res = 0;
	}
	read_unlock_irqrestore(&table->rwlock, flags);

	return res;
}
EXPORT_SYMBOL(rdma_query_gid);
/**
* rdma_find_gid - Returns SGID attributes if the matching GID is found.
* @device: The device to query.
* @gid: The GID value to search for.
* @gid_type: The GID type to search for.
* @ndev: In RoCE, the net device of the device. NULL means ignore.
*
* rdma_find_gid() searches for the specified GID value in the software cache.
*
* Returns GID attributes if a valid GID is found or returns ERR_PTR for the
* error. The caller must invoke rdma_put_gid_attr() to release the reference.
*
*/
const struct ib_gid_attr *rdma_find_gid(struct ib_device *device,
const union ib_gid *gid,
enum ib_gid_type gid_type,
struct net_device *ndev)
{
unsigned long mask = GID_ATTR_FIND_MASK_GID |
GID_ATTR_FIND_MASK_GID_TYPE;
struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type};
u8 p;
if (ndev)
mask |= GID_ATTR_FIND_MASK_NETDEV;
for (p = 0; p < device->phys_port_cnt; p++) {
struct ib_gid_table *table;
unsigned long flags;
int index;
table = device->cache.ports[p].gid;
read_lock_irqsave(&table->rwlock, flags);
index = find_gid(table, gid, &gid_attr_val, false, mask, NULL);
if (index >= 0) {
const struct ib_gid_attr *attr;
get_gid_entry(table->data_vec[index]);
attr = &table->data_vec[index]->attr;
read_unlock_irqrestore(&table->rwlock, flags);
return attr;
}
read_unlock_irqrestore(&table->rwlock, flags);
}
return ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL(rdma_find_gid);
/*
 * Read the P_Key stored at @index of the cached P_Key table of @port_num.
 *
 * Returns 0 and stores the value in @pkey, or -EINVAL if the port is
 * invalid or @index is outside the table.
 */
int ib_get_cached_pkey(struct ib_device *device,
		       u8 port_num,
		       int index,
		       u16 *pkey)
{
	struct ib_pkey_cache *cache;
	unsigned long flags;
	int ret = 0;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	read_lock_irqsave(&device->cache.lock, flags);

	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;
	if (index >= 0 && index < cache->table_len)
		*pkey = cache->table[index];
	else
		ret = -EINVAL;

	read_unlock_irqrestore(&device->cache.lock, flags);

	return ret;
}
EXPORT_SYMBOL(ib_get_cached_pkey);
/*
 * Read the cached subnet prefix of @port_num into @sn_pfx.
 * Returns 0, or -EINVAL for an invalid port.
 */
int ib_get_cached_subnet_prefix(struct ib_device *device,
				u8 port_num,
				u64 *sn_pfx)
{
	unsigned long flags;
	int port_idx;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	port_idx = port_num - rdma_start_port(device);

	read_lock_irqsave(&device->cache.lock, flags);
	*sn_pfx = device->cache.ports[port_idx].subnet_prefix;
	read_unlock_irqrestore(&device->cache.lock, flags);

	return 0;
}
EXPORT_SYMBOL(ib_get_cached_subnet_prefix);
/*
 * Find the table index of @pkey in the cached P_Key table of @port_num,
 * comparing only the low 15 bits.
 *
 * The first matching entry with bit 15 set wins. If there is none, the
 * last matching entry with bit 15 clear is used instead. @index is set to
 * (u16)-1 when nothing matches.
 *
 * Returns 0 on success, -ENOENT when no entry matches, or -EINVAL for an
 * invalid port.
 */
int ib_find_cached_pkey(struct ib_device *device,
			u8 port_num,
			u16 pkey,
			u16 *index)
{
	struct ib_pkey_cache *cache;
	unsigned long flags;
	int partial_ix = -1;
	int ret = -ENOENT;
	int i;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	read_lock_irqsave(&device->cache.lock, flags);

	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;

	*index = -1;

	for (i = 0; i < cache->table_len; ++i) {
		u16 cur = cache->table[i];

		if ((cur & 0x7fff) != (pkey & 0x7fff))
			continue;

		if (!(cur & 0x8000)) {
			/* Remember it, but keep looking for a bit-15 match. */
			partial_ix = i;
			continue;
		}

		*index = i;
		ret = 0;
		break;
	}

	if (ret && partial_ix >= 0) {
		*index = partial_ix;
		ret = 0;
	}

	read_unlock_irqrestore(&device->cache.lock, flags);

	return ret;
}
EXPORT_SYMBOL(ib_find_cached_pkey);
/*
 * Find the first table index whose cached P_Key equals @pkey in all 16
 * bits. @index is set to (u16)-1 when nothing matches.
 *
 * Returns 0 on success, -ENOENT when no entry matches, or -EINVAL for an
 * invalid port.
 */
int ib_find_exact_cached_pkey(struct ib_device *device,
			      u8 port_num,
			      u16 pkey,
			      u16 *index)
{
	struct ib_pkey_cache *cache;
	unsigned long flags;
	int ret = -ENOENT;
	int i;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	read_lock_irqsave(&device->cache.lock, flags);

	cache = device->cache.ports[port_num - rdma_start_port(device)].pkey;

	*index = -1;

	for (i = 0; i < cache->table_len; ++i) {
		if (cache->table[i] != pkey)
			continue;

		*index = i;
		ret = 0;
		break;
	}

	read_unlock_irqrestore(&device->cache.lock, flags);

	return ret;
}
EXPORT_SYMBOL(ib_find_exact_cached_pkey);
/*
 * Read the cached LMC value of @port_num into @lmc.
 * Returns 0, or -EINVAL for an invalid port.
 */
int ib_get_cached_lmc(struct ib_device *device,
		      u8 port_num,
		      u8 *lmc)
{
	unsigned long flags;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	read_lock_irqsave(&device->cache.lock, flags);
	*lmc = device->cache.ports[port_num - rdma_start_port(device)].lmc;
	read_unlock_irqrestore(&device->cache.lock, flags);

	return 0;
}
EXPORT_SYMBOL(ib_get_cached_lmc);
/*
 * Read the cached port state of @port_num into @port_state.
 * Returns 0, or -EINVAL for an invalid port.
 */
int ib_get_cached_port_state(struct ib_device *device,
			     u8 port_num,
			     enum ib_port_state *port_state)
{
	unsigned long flags;
	int port_idx;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	port_idx = port_num - rdma_start_port(device);

	read_lock_irqsave(&device->cache.lock, flags);
	*port_state = device->cache.ports[port_idx].port_state;
	read_unlock_irqrestore(&device->cache.lock, flags);

	return 0;
}
EXPORT_SYMBOL(ib_get_cached_port_state);
/**
 * rdma_get_gid_attr - Returns GID attributes for a port of a device
 * at a requested gid_index, if a valid GID entry exists.
 * @device:		The device to query.
 * @port_num:		The port number on the device where the GID value
 *			is to be queried.
 * @index:		Index of the GID table entry whose attributes are to
 *			be queried.
 *
 * On success a reference on the cached GID entry is taken while the
 * table's read lock is held. The caller must release it with
 * rdma_put_gid_attr(), regardless of link layer.
 *
 * Returns a pointer to the GID attributes, or ERR_PTR(-EINVAL) if the
 * port is invalid, @index is out of range, or the slot holds no valid
 * entry.
 */
const struct ib_gid_attr *
rdma_get_gid_attr(struct ib_device *device, u8 port_num, int index)
{
	const struct ib_gid_attr *attr = ERR_PTR(-EINVAL);
	struct ib_gid_table_entry *entry;
	struct ib_gid_table *table;
	unsigned long flags;

	if (!rdma_is_port_valid(device, port_num))
		return ERR_PTR(-EINVAL);

	table = rdma_gid_table(device, port_num);
	if (index < 0 || index >= table->sz)
		return ERR_PTR(-EINVAL);

	read_lock_irqsave(&table->rwlock, flags);
	entry = table->data_vec[index];
	if (is_gid_entry_valid(entry)) {
		get_gid_entry(entry);
		attr = &entry->attr;
	}
	read_unlock_irqrestore(&table->rwlock, flags);

	return attr;
}
EXPORT_SYMBOL(rdma_get_gid_attr);
/**
 * rdma_put_gid_attr - Drop a reference to a GID attribute
 * @attr:	Pointer to the GID attribute whose reference is to be
 *		released.
 *
 * Counterpart of rdma_get_gid_attr() and of any other API that hands
 * out a referenced pointer to an ib_gid_attr. It must be used for both
 * IB and RoCE link layers.
 */
void rdma_put_gid_attr(const struct ib_gid_attr *attr)
{
	/* @attr is embedded in its table entry; the refcount lives on the entry. */
	put_gid_entry(container_of(attr, struct ib_gid_table_entry, attr));
}
EXPORT_SYMBOL(rdma_put_gid_attr);
/**
 * rdma_hold_gid_attr - Take an additional reference to a GID attribute
 * @attr:	Pointer to the GID attribute whose reference count is to
 *		be increased.
 *
 * Bumps the reference count of the GID attribute so that it is not
 * freed. The caller is required to already hold a reference to @attr.
 */
void rdma_hold_gid_attr(const struct ib_gid_attr *attr)
{
	/* @attr is embedded in its table entry; the refcount lives on the entry. */
	get_gid_entry(container_of(attr, struct ib_gid_table_entry, attr));
}
EXPORT_SYMBOL(rdma_hold_gid_attr);
/*
 * Refresh the cached GID table of a non-RoCE port from the driver.
 *
 * For every index below @gid_tbl_len the GID is fetched through the
 * device's query_gid callback and pushed into the GID table with
 * add_modify_gid(), all under the table mutex. Indices are skipped when
 * the device provides no query_gid callback.
 *
 * Returns 0 on success, or the error of the first failing query_gid
 * call; entries updated before the failure stay updated.
 */
static int config_non_roce_gid_cache(struct ib_device *device,
				     u8 port, int gid_tbl_len)
{
	struct ib_gid_attr gid_attr = {
		.device = device,
		.port_num = port,
	};
	struct ib_gid_table *table = rdma_gid_table(device, port);
	int rc = 0;
	int idx;

	mutex_lock(&table->lock);
	for (idx = 0; idx < gid_tbl_len; ++idx) {
		if (!device->query_gid)
			continue;

		rc = device->query_gid(device, port, idx, &gid_attr.gid);
		if (rc) {
			pr_warn("query_gid failed (%d) for %s (index %d)\n",
				rc, device->name, idx);
			break;
		}

		gid_attr.index = idx;
		add_modify_gid(table, &gid_attr);
	}
	mutex_unlock(&table->lock);

	return rc;
}
/*
 * Re-read the attributes of one port and publish them in the device cache.
 *
 * The port attributes are queried first; for non-RoCE ports the GID
 * table is refreshed as well. A new P_Key table is then built off to the
 * side and, only once every P_Key has been read successfully, swapped in
 * together with the LMC, port state and subnet prefix under the cache
 * write lock. When @enforce_security is set, ib_security_cache_change()
 * is called after the swap.
 *
 * Any failure along the way leaves the P_Key table, LMC, port state and
 * subnet prefix of the cache as they were; nothing is reported to the
 * caller.
 */
static void ib_cache_update(struct ib_device *device,
			    u8 port,
			    bool enforce_security)
{
	struct ib_pkey_cache *pkeys = NULL;
	struct ib_pkey_cache *replaced;
	struct ib_port_attr *attrs;
	int port_idx;
	int slot;
	int rc;

	if (!rdma_is_port_valid(device, port))
		return;

	attrs = kmalloc(sizeof(*attrs), GFP_KERNEL);
	if (!attrs)
		return;

	rc = ib_query_port(device, port, attrs);
	if (rc) {
		pr_warn("ib_query_port failed (%d) for %s\n",
			rc, device->name);
		goto out;
	}

	if (!rdma_protocol_roce(device, port)) {
		rc = config_non_roce_gid_cache(device, port,
					       attrs->gid_tbl_len);
		if (rc)
			goto out;
	}

	pkeys = kmalloc(struct_size(pkeys, table, attrs->pkey_tbl_len),
			GFP_KERNEL);
	if (!pkeys)
		goto out;

	pkeys->table_len = attrs->pkey_tbl_len;
	for (slot = 0; slot < pkeys->table_len; ++slot) {
		rc = ib_query_pkey(device, port, slot, &pkeys->table[slot]);
		if (rc) {
			pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n",
				rc, device->name, slot);
			goto out;
		}
	}

	port_idx = port - rdma_start_port(device);

	/* Publish all cached fields in one critical section. */
	write_lock_irq(&device->cache.lock);
	replaced = device->cache.ports[port_idx].pkey;
	device->cache.ports[port_idx].pkey = pkeys;
	device->cache.ports[port_idx].lmc = attrs->lmc;
	device->cache.ports[port_idx].port_state = attrs->state;
	device->cache.ports[port_idx].subnet_prefix = attrs->subnet_prefix;
	write_unlock_irq(&device->cache.lock);

	if (enforce_security)
		ib_security_cache_change(device,
					 port,
					 attrs->subnet_prefix);

	/*
	 * The new table now belongs to the cache; the only table left to
	 * free on this path is the one that was just replaced.
	 */
	pkeys = replaced;
out:
	kfree(pkeys);
	kfree(attrs);
}
/*
 * Work handler queued by ib_cache_event(): refreshes the cache of the
 * port recorded in the work item, then frees the work item.
 */
static void ib_cache_task(struct work_struct *_work)
{
	struct ib_update_work *update;

	update = container_of(_work, struct ib_update_work, work);

	ib_cache_update(update->device,
			update->port_num,
			update->enforce_security);

	kfree(update);
}
/*
 * Event handler of the cache: for events that may change cached port
 * data, queue a work item on ib_wq that refreshes the affected port.
 *
 * All other events are ignored. If the work item cannot be allocated
 * the event is dropped and no refresh is scheduled for it.
 */
static void ib_cache_event(struct ib_event_handler *handler,
			   struct ib_event *event)
{
	struct ib_update_work *update;

	switch (event->event) {
	case IB_EVENT_PORT_ERR:
	case IB_EVENT_PORT_ACTIVE:
	case IB_EVENT_LID_CHANGE:
	case IB_EVENT_PKEY_CHANGE:
	case IB_EVENT_SM_CHANGE:
	case IB_EVENT_CLIENT_REREGISTER:
	case IB_EVENT_GID_CHANGE:
		break;
	default:
		return;
	}

	update = kmalloc(sizeof(*update), GFP_ATOMIC);
	if (!update)
		return;

	INIT_WORK(&update->work, ib_cache_task);
	update->device = event->device;
	update->port_num = event->element.port_num;
	/* Only P_Key and GID changes request the security cache update. */
	update->enforce_security =
		event->event == IB_EVENT_PKEY_CHANGE ||
		event->event == IB_EVENT_GID_CHANGE;

	queue_work(ib_wq, &update->work);
}
/*
 * Allocate and populate the cache of a device.
 *
 * Sets up the cache lock, the zeroed per-port cache array and the GID
 * tables, fills the cache of every port once, and finally registers the
 * event handler that keeps the cache up to date.
 *
 * Returns 0 on success, -ENOMEM if the per-port array cannot be
 * allocated, or the error returned by gid_table_setup_one(). Failures
 * inside ib_cache_update() are not reported, as it returns nothing.
 */
int ib_cache_setup_one(struct ib_device *device)
{
	int first_port = rdma_start_port(device);
	int last_port = rdma_end_port(device);
	int port;
	int err;

	rwlock_init(&device->cache.lock);

	device->cache.ports = kcalloc(last_port - first_port + 1,
				      sizeof(*device->cache.ports),
				      GFP_KERNEL);
	if (!device->cache.ports)
		return -ENOMEM;

	err = gid_table_setup_one(device);
	if (err) {
		kfree(device->cache.ports);
		device->cache.ports = NULL;
		return err;
	}

	for (port = first_port; port <= last_port; ++port)
		ib_cache_update(device, port, true);

	INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
			      device, ib_cache_event);
	ib_register_event_handler(&device->cache.event_handler);

	return 0;
}
/*
 * Free every element of the device cache: the P_Key table of each port,
 * the GID tables and the per-port array itself.
 *
 * This should be called as part of freeing all the device's resources,
 * at a point where the cache can no longer be accessed.
 */
void ib_cache_release_one(struct ib_device *device)
{
	int port_count = rdma_end_port(device) - rdma_start_port(device) + 1;
	int idx;

	for (idx = 0; idx < port_count; ++idx)
		kfree(device->cache.ports[idx].pkey);

	gid_table_release_one(device);
	kfree(device->cache.ports);
}
void ib_cache_cleanup_one(struct ib_device *device)
{
/*
 * Stop cache updates for the device and clean up its GID cache.
 *
 * The cleanup function unregisters the event handler, waits for
 * all in-progress work items on ib_wq and cleans up the GID cache.
 * It should be called after the device was removed from the
 * devices list and all clients were removed, so the cache still
 * exists but is non-functional and must not be updated anymore.
 */
/* No new cache update work is queued by ib_cache_event() after this. */
ib_unregister_event_handler(&device->cache.event_handler);
/* Wait for cache update work that was queued before unregistering. */
flush_workqueue(ib_wq);
gid_table_cleanup_one(device);
/*
 * Flush the workqueue a second time for any pending GID delete work.
 */
flush_workqueue(ib_wq);
}