linux/drivers/infiniband/ulp/ipoib/ipoib_main.c
Yossi Etigin a50df398cd IPoIB: Fix loss of connectivity after bonding failover on both sides
Fix bonding failover in the case both peers failover and the
gratuitous ARP is lost.  In that case, the sender side will create an
ipoib_neigh and issue a path request with the old GID first.  When
skb->dst->neighbour->ha changes due to ARP refresh, this ipoib_neigh
will not be added to the path->list of the path of the new GID,
because the ipoib_neigh already exists.  It will not have an AH
either, because of sender-side failover.  Therefore, it will not get
an AH when the path is resolved.

The solution here is to compare GIDs in ipoib_start_xmit() even if
neigh->ah is invalid.  Comparing with an uninitialized value of
neigh->dgid should be fine, since a spurious match is harmless (and
astronomically unlikely too).

Signed-off-by: Moni Shoua <monis@voltaire.com>
Signed-off-by: Yossi Etigin <yosefe@voltaire.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
2009-01-09 14:05:11 -08:00

1454 lines
35 KiB
C

/*
* Copyright (c) 2004 Topspin Communications. All rights reserved.
* Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
* Copyright (c) 2004 Voltaire, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "ipoib.h"
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h> /* For ARPHRD_xxx */
#include <linux/ip.h>
#include <linux/in.h>
#include <net/dst.h>
MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
MODULE_LICENSE("Dual BSD/GPL");
int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
static int lro;
module_param(lro, bool, 0444);
MODULE_PARM_DESC(lro, "Enable LRO (Large Receive Offload)");
static int lro_max_aggr = IPOIB_LRO_MAX_AGGR;
module_param(lro_max_aggr, int, 0644);
MODULE_PARM_DESC(lro_max_aggr, "LRO: Max packets to be aggregated "
"(default = 64)");
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
int ipoib_debug_level;
module_param_named(debug_level, ipoib_debug_level, int, 0644);
MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
#endif
struct ipoib_path_iter {
struct net_device *dev;
struct ipoib_path path;
};
static const u8 ipv4_bcast_addr[] = {
0x00, 0xff, 0xff, 0xff,
0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
};
struct workqueue_struct *ipoib_workqueue;
struct ib_sa_client ipoib_sa_client;
static void ipoib_add_one(struct ib_device *device);
static void ipoib_remove_one(struct ib_device *device);
static struct ib_client ipoib_client = {
.name = "ipoib",
.add = ipoib_add_one,
.remove = ipoib_remove_one
};
int ipoib_open(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
ipoib_dbg(priv, "bringing up interface\n");
set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
if (ipoib_pkey_dev_delay_open(dev))
return 0;
napi_enable(&priv->napi);
if (ipoib_ib_dev_open(dev)) {
napi_disable(&priv->napi);
return -EINVAL;
}
if (ipoib_ib_dev_up(dev)) {
ipoib_ib_dev_stop(dev, 1);
napi_disable(&priv->napi);
return -EINVAL;
}
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
struct ipoib_dev_priv *cpriv;
/* Bring up any child interfaces too */
mutex_lock(&priv->vlan_mutex);
list_for_each_entry(cpriv, &priv->child_intfs, list) {
int flags;
flags = cpriv->dev->flags;
if (flags & IFF_UP)
continue;
dev_change_flags(cpriv->dev, flags | IFF_UP);
}
mutex_unlock(&priv->vlan_mutex);
}
netif_start_queue(dev);
return 0;
}
static int ipoib_stop(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
ipoib_dbg(priv, "stopping interface\n");
clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
napi_disable(&priv->napi);
netif_stop_queue(dev);
ipoib_ib_dev_down(dev, 0);
ipoib_ib_dev_stop(dev, 0);
if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
struct ipoib_dev_priv *cpriv;
/* Bring down any child interfaces too */
mutex_lock(&priv->vlan_mutex);
list_for_each_entry(cpriv, &priv->child_intfs, list) {
int flags;
flags = cpriv->dev->flags;
if (!(flags & IFF_UP))
continue;
dev_change_flags(cpriv->dev, flags & ~IFF_UP);
}
mutex_unlock(&priv->vlan_mutex);
}
return 0;
}
static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
/* dev->mtu > 2K ==> connected mode */
if (ipoib_cm_admin_enabled(dev)) {
if (new_mtu > ipoib_cm_max_mtu(dev))
return -EINVAL;
if (new_mtu > priv->mcast_mtu)
ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
priv->mcast_mtu);
dev->mtu = new_mtu;
return 0;
}
if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
return -EINVAL;
priv->admin_mtu = new_mtu;
dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
return 0;
}
static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct rb_node *n = priv->path_tree.rb_node;
struct ipoib_path *path;
int ret;
while (n) {
path = rb_entry(n, struct ipoib_path, rb_node);
ret = memcmp(gid, path->pathrec.dgid.raw,
sizeof (union ib_gid));
if (ret < 0)
n = n->rb_left;
else if (ret > 0)
n = n->rb_right;
else
return path;
}
return NULL;
}
static int __path_add(struct net_device *dev, struct ipoib_path *path)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct rb_node **n = &priv->path_tree.rb_node;
struct rb_node *pn = NULL;
struct ipoib_path *tpath;
int ret;
while (*n) {
pn = *n;
tpath = rb_entry(pn, struct ipoib_path, rb_node);
ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
sizeof (union ib_gid));
if (ret < 0)
n = &pn->rb_left;
else if (ret > 0)
n = &pn->rb_right;
else
return -EEXIST;
}
rb_link_node(&path->rb_node, pn, n);
rb_insert_color(&path->rb_node, &priv->path_tree);
list_add_tail(&path->list, &priv->path_list);
return 0;
}
static void path_free(struct net_device *dev, struct ipoib_path *path)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_neigh *neigh, *tn;
struct sk_buff *skb;
unsigned long flags;
while ((skb = __skb_dequeue(&path->queue)))
dev_kfree_skb_irq(skb);
spin_lock_irqsave(&priv->lock, flags);
list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
/*
* It's safe to call ipoib_put_ah() inside priv->lock
* here, because we know that path->ah will always
* hold one more reference, so ipoib_put_ah() will
* never do more than decrement the ref count.
*/
if (neigh->ah)
ipoib_put_ah(neigh->ah);
ipoib_neigh_free(dev, neigh);
}
spin_unlock_irqrestore(&priv->lock, flags);
if (path->ah)
ipoib_put_ah(path->ah);
kfree(path);
}
#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
{
struct ipoib_path_iter *iter;
iter = kmalloc(sizeof *iter, GFP_KERNEL);
if (!iter)
return NULL;
iter->dev = dev;
memset(iter->path.pathrec.dgid.raw, 0, 16);
if (ipoib_path_iter_next(iter)) {
kfree(iter);
return NULL;
}
return iter;
}
int ipoib_path_iter_next(struct ipoib_path_iter *iter)
{
struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
struct rb_node *n;
struct ipoib_path *path;
int ret = 1;
spin_lock_irq(&priv->lock);
n = rb_first(&priv->path_tree);
while (n) {
path = rb_entry(n, struct ipoib_path, rb_node);
if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
sizeof (union ib_gid)) < 0) {
iter->path = *path;
ret = 0;
break;
}
n = rb_next(n);
}
spin_unlock_irq(&priv->lock);
return ret;
}
void ipoib_path_iter_read(struct ipoib_path_iter *iter,
struct ipoib_path *path)
{
*path = iter->path;
}
#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
void ipoib_mark_paths_invalid(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_path *path, *tp;
spin_lock_irq(&priv->lock);
list_for_each_entry_safe(path, tp, &priv->path_list, list) {
ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
be16_to_cpu(path->pathrec.dlid),
path->pathrec.dgid.raw);
path->valid = 0;
}
spin_unlock_irq(&priv->lock);
}
void ipoib_flush_paths(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_path *path, *tp;
LIST_HEAD(remove_list);
unsigned long flags;
netif_tx_lock_bh(dev);
spin_lock_irqsave(&priv->lock, flags);
list_splice_init(&priv->path_list, &remove_list);
list_for_each_entry(path, &remove_list, list)
rb_erase(&path->rb_node, &priv->path_tree);
list_for_each_entry_safe(path, tp, &remove_list, list) {
if (path->query)
ib_sa_cancel_query(path->query_id, path->query);
spin_unlock_irqrestore(&priv->lock, flags);
netif_tx_unlock_bh(dev);
wait_for_completion(&path->done);
path_free(dev, path);
netif_tx_lock_bh(dev);
spin_lock_irqsave(&priv->lock, flags);
}
spin_unlock_irqrestore(&priv->lock, flags);
netif_tx_unlock_bh(dev);
}
static void path_rec_completion(int status,
struct ib_sa_path_rec *pathrec,
void *path_ptr)
{
struct ipoib_path *path = path_ptr;
struct net_device *dev = path->dev;
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_ah *ah = NULL;
struct ipoib_ah *old_ah = NULL;
struct ipoib_neigh *neigh, *tn;
struct sk_buff_head skqueue;
struct sk_buff *skb;
unsigned long flags;
if (!status)
ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
else
ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
status, path->pathrec.dgid.raw);
skb_queue_head_init(&skqueue);
if (!status) {
struct ib_ah_attr av;
if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
ah = ipoib_create_ah(dev, priv->pd, &av);
}
spin_lock_irqsave(&priv->lock, flags);
if (ah) {
path->pathrec = *pathrec;
old_ah = path->ah;
path->ah = ah;
ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
while ((skb = __skb_dequeue(&path->queue)))
__skb_queue_tail(&skqueue, skb);
list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
if (neigh->ah) {
WARN_ON(neigh->ah != old_ah);
/*
* Dropping the ah reference inside
* priv->lock is safe here, because we
* will hold one more reference from
* the original value of path->ah (ie
* old_ah).
*/
ipoib_put_ah(neigh->ah);
}
kref_get(&path->ah->ref);
neigh->ah = path->ah;
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
sizeof(union ib_gid));
if (ipoib_cm_enabled(dev, neigh->neighbour)) {
if (!ipoib_cm_get(neigh))
ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
path,
neigh));
if (!ipoib_cm_get(neigh)) {
list_del(&neigh->list);
if (neigh->ah)
ipoib_put_ah(neigh->ah);
ipoib_neigh_free(dev, neigh);
continue;
}
}
while ((skb = __skb_dequeue(&neigh->queue)))
__skb_queue_tail(&skqueue, skb);
}
path->valid = 1;
}
path->query = NULL;
complete(&path->done);
spin_unlock_irqrestore(&priv->lock, flags);
if (old_ah)
ipoib_put_ah(old_ah);
while ((skb = __skb_dequeue(&skqueue))) {
skb->dev = dev;
if (dev_queue_xmit(skb))
ipoib_warn(priv, "dev_queue_xmit failed "
"to requeue packet\n");
}
}
static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_path *path;
if (!priv->broadcast)
return NULL;
path = kzalloc(sizeof *path, GFP_ATOMIC);
if (!path)
return NULL;
path->dev = dev;
skb_queue_head_init(&path->queue);
INIT_LIST_HEAD(&path->neigh_list);
memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
path->pathrec.sgid = priv->local_gid;
path->pathrec.pkey = cpu_to_be16(priv->pkey);
path->pathrec.numb_path = 1;
path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
return path;
}
static int path_rec_start(struct net_device *dev,
struct ipoib_path *path)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
ipoib_dbg(priv, "Start path record lookup for %pI6\n",
path->pathrec.dgid.raw);
init_completion(&path->done);
path->query_id =
ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
&path->pathrec,
IB_SA_PATH_REC_DGID |
IB_SA_PATH_REC_SGID |
IB_SA_PATH_REC_NUMB_PATH |
IB_SA_PATH_REC_TRAFFIC_CLASS |
IB_SA_PATH_REC_PKEY,
1000, GFP_ATOMIC,
path_rec_completion,
path, &path->query);
if (path->query_id < 0) {
ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
path->query = NULL;
complete(&path->done);
return path->query_id;
}
return 0;
}
static void neigh_add_path(struct sk_buff *skb, struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_path *path;
struct ipoib_neigh *neigh;
unsigned long flags;
neigh = ipoib_neigh_alloc(skb->dst->neighbour, skb->dev);
if (!neigh) {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
return;
}
spin_lock_irqsave(&priv->lock, flags);
path = __path_find(dev, skb->dst->neighbour->ha + 4);
if (!path) {
path = path_rec_create(dev, skb->dst->neighbour->ha + 4);
if (!path)
goto err_path;
__path_add(dev, path);
}
list_add_tail(&neigh->list, &path->neigh_list);
if (path->ah) {
kref_get(&path->ah->ref);
neigh->ah = path->ah;
memcpy(&neigh->dgid.raw, &path->pathrec.dgid.raw,
sizeof(union ib_gid));
if (ipoib_cm_enabled(dev, neigh->neighbour)) {
if (!ipoib_cm_get(neigh))
ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
if (!ipoib_cm_get(neigh)) {
list_del(&neigh->list);
if (neigh->ah)
ipoib_put_ah(neigh->ah);
ipoib_neigh_free(dev, neigh);
goto err_drop;
}
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
__skb_queue_tail(&neigh->queue, skb);
else {
ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
skb_queue_len(&neigh->queue));
goto err_drop;
}
} else
ipoib_send(dev, skb, path->ah, IPOIB_QPN(skb->dst->neighbour->ha));
} else {
neigh->ah = NULL;
if (!path->query && path_rec_start(dev, path))
goto err_list;
__skb_queue_tail(&neigh->queue, skb);
}
spin_unlock_irqrestore(&priv->lock, flags);
return;
err_list:
list_del(&neigh->list);
err_path:
ipoib_neigh_free(dev, neigh);
err_drop:
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
spin_unlock_irqrestore(&priv->lock, flags);
}
static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(skb->dev);
/* Look up path record for unicasts */
if (skb->dst->neighbour->ha[4] != 0xff) {
neigh_add_path(skb, dev);
return;
}
/* Add in the P_Key for multicasts */
skb->dst->neighbour->ha[8] = (priv->pkey >> 8) & 0xff;
skb->dst->neighbour->ha[9] = priv->pkey & 0xff;
ipoib_mcast_send(dev, skb->dst->neighbour->ha + 4, skb);
}
static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
struct ipoib_pseudoheader *phdr)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_path *path;
unsigned long flags;
spin_lock_irqsave(&priv->lock, flags);
path = __path_find(dev, phdr->hwaddr + 4);
if (!path || !path->valid) {
if (!path)
path = path_rec_create(dev, phdr->hwaddr + 4);
if (path) {
/* put pseudoheader back on for next time */
skb_push(skb, sizeof *phdr);
__skb_queue_tail(&path->queue, skb);
if (!path->query && path_rec_start(dev, path)) {
spin_unlock_irqrestore(&priv->lock, flags);
path_free(dev, path);
return;
} else
__path_add(dev, path);
} else {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
}
spin_unlock_irqrestore(&priv->lock, flags);
return;
}
if (path->ah) {
ipoib_dbg(priv, "Send unicast ARP to %04x\n",
be16_to_cpu(path->pathrec.dlid));
ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
} else if ((path->query || !path_rec_start(dev, path)) &&
skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
/* put pseudoheader back on for next time */
skb_push(skb, sizeof *phdr);
__skb_queue_tail(&path->queue, skb);
} else {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
}
spin_unlock_irqrestore(&priv->lock, flags);
}
static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_neigh *neigh;
unsigned long flags;
if (likely(skb->dst && skb->dst->neighbour)) {
if (unlikely(!*to_ipoib_neigh(skb->dst->neighbour))) {
ipoib_path_lookup(skb, dev);
return NETDEV_TX_OK;
}
neigh = *to_ipoib_neigh(skb->dst->neighbour);
if (unlikely((memcmp(&neigh->dgid.raw,
skb->dst->neighbour->ha + 4,
sizeof(union ib_gid))) ||
(neigh->dev != dev))) {
spin_lock_irqsave(&priv->lock, flags);
/*
* It's safe to call ipoib_put_ah() inside
* priv->lock here, because we know that
* path->ah will always hold one more reference,
* so ipoib_put_ah() will never do more than
* decrement the ref count.
*/
if (neigh->ah)
ipoib_put_ah(neigh->ah);
list_del(&neigh->list);
ipoib_neigh_free(dev, neigh);
spin_unlock_irqrestore(&priv->lock, flags);
ipoib_path_lookup(skb, dev);
return NETDEV_TX_OK;
}
if (ipoib_cm_get(neigh)) {
if (ipoib_cm_up(neigh)) {
ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
return NETDEV_TX_OK;
}
} else if (neigh->ah) {
ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(skb->dst->neighbour->ha));
return NETDEV_TX_OK;
}
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
spin_lock_irqsave(&priv->lock, flags);
__skb_queue_tail(&neigh->queue, skb);
spin_unlock_irqrestore(&priv->lock, flags);
} else {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
}
} else {
struct ipoib_pseudoheader *phdr =
(struct ipoib_pseudoheader *) skb->data;
skb_pull(skb, sizeof *phdr);
if (phdr->hwaddr[4] == 0xff) {
/* Add in the P_Key for multicast*/
phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
phdr->hwaddr[9] = priv->pkey & 0xff;
ipoib_mcast_send(dev, phdr->hwaddr + 4, skb);
} else {
/* unicast GID -- should be ARP or RARP reply */
if ((be16_to_cpup((__be16 *) skb->data) != ETH_P_ARP) &&
(be16_to_cpup((__be16 *) skb->data) != ETH_P_RARP)) {
ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n",
skb->dst ? "neigh" : "dst",
be16_to_cpup((__be16 *) skb->data),
IPOIB_QPN(phdr->hwaddr),
phdr->hwaddr + 4);
dev_kfree_skb_any(skb);
++dev->stats.tx_dropped;
return NETDEV_TX_OK;
}
unicast_arp_send(skb, dev, phdr);
}
}
return NETDEV_TX_OK;
}
static void ipoib_timeout(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
jiffies_to_msecs(jiffies - dev->trans_start));
ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
netif_queue_stopped(dev),
priv->tx_head, priv->tx_tail);
/* XXX reset QP, etc. */
}
static int ipoib_hard_header(struct sk_buff *skb,
struct net_device *dev,
unsigned short type,
const void *daddr, const void *saddr, unsigned len)
{
struct ipoib_header *header;
header = (struct ipoib_header *) skb_push(skb, sizeof *header);
header->proto = htons(type);
header->reserved = 0;
/*
* If we don't have a neighbour structure, stuff the
* destination address onto the front of the skb so we can
* figure out where to send the packet later.
*/
if ((!skb->dst || !skb->dst->neighbour) && daddr) {
struct ipoib_pseudoheader *phdr =
(struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr);
memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
}
return 0;
}
static void ipoib_set_mcast_list(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
return;
}
queue_work(ipoib_workqueue, &priv->restart_task);
}
static void ipoib_neigh_cleanup(struct neighbour *n)
{
struct ipoib_neigh *neigh;
struct ipoib_dev_priv *priv = netdev_priv(n->dev);
unsigned long flags;
struct ipoib_ah *ah = NULL;
neigh = *to_ipoib_neigh(n);
if (neigh)
priv = netdev_priv(neigh->dev);
else
return;
ipoib_dbg(priv,
"neigh_cleanup for %06x %pI6\n",
IPOIB_QPN(n->ha),
n->ha + 4);
spin_lock_irqsave(&priv->lock, flags);
if (neigh->ah)
ah = neigh->ah;
list_del(&neigh->list);
ipoib_neigh_free(n->dev, neigh);
spin_unlock_irqrestore(&priv->lock, flags);
if (ah)
ipoib_put_ah(ah);
}
struct ipoib_neigh *ipoib_neigh_alloc(struct neighbour *neighbour,
struct net_device *dev)
{
struct ipoib_neigh *neigh;
neigh = kmalloc(sizeof *neigh, GFP_ATOMIC);
if (!neigh)
return NULL;
neigh->neighbour = neighbour;
neigh->dev = dev;
*to_ipoib_neigh(neighbour) = neigh;
skb_queue_head_init(&neigh->queue);
ipoib_cm_set(neigh, NULL);
return neigh;
}
void ipoib_neigh_free(struct net_device *dev, struct ipoib_neigh *neigh)
{
struct sk_buff *skb;
*to_ipoib_neigh(neigh->neighbour) = NULL;
while ((skb = __skb_dequeue(&neigh->queue))) {
++dev->stats.tx_dropped;
dev_kfree_skb_any(skb);
}
if (ipoib_cm_get(neigh))
ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
kfree(neigh);
}
static int ipoib_neigh_setup_dev(struct net_device *dev, struct neigh_parms *parms)
{
parms->neigh_cleanup = ipoib_neigh_cleanup;
return 0;
}
int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
/* Allocate RX/TX "rings" to hold queued skbs */
priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
GFP_KERNEL);
if (!priv->rx_ring) {
printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
ca->name, ipoib_recvq_size);
goto out;
}
priv->tx_ring = vmalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
if (!priv->tx_ring) {
printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
ca->name, ipoib_sendq_size);
goto out_rx_ring_cleanup;
}
memset(priv->tx_ring, 0, ipoib_sendq_size * sizeof *priv->tx_ring);
/* priv->tx_head, tx_tail & tx_outstanding are already 0 */
if (ipoib_ib_dev_init(dev, ca, port))
goto out_tx_ring_cleanup;
return 0;
out_tx_ring_cleanup:
vfree(priv->tx_ring);
out_rx_ring_cleanup:
kfree(priv->rx_ring);
out:
return -ENOMEM;
}
void ipoib_dev_cleanup(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
ipoib_delete_debug_files(dev);
/* Delete any child interfaces first */
list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
unregister_netdev(cpriv->dev);
ipoib_dev_cleanup(cpriv->dev);
free_netdev(cpriv->dev);
}
ipoib_ib_dev_cleanup(dev);
kfree(priv->rx_ring);
vfree(priv->tx_ring);
priv->rx_ring = NULL;
priv->tx_ring = NULL;
}
static const struct header_ops ipoib_header_ops = {
.create = ipoib_hard_header,
};
static int get_skb_hdr(struct sk_buff *skb, void **iphdr,
void **tcph, u64 *hdr_flags, void *priv)
{
unsigned int ip_len;
struct iphdr *iph;
if (unlikely(skb->protocol != htons(ETH_P_IP)))
return -1;
/*
* In the future we may add an else clause that verifies the
* checksum and allows devices which do not calculate checksum
* to use LRO.
*/
if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY))
return -1;
/* Check for non-TCP packet */
skb_reset_network_header(skb);
iph = ip_hdr(skb);
if (iph->protocol != IPPROTO_TCP)
return -1;
ip_len = ip_hdrlen(skb);
skb_set_transport_header(skb, ip_len);
*tcph = tcp_hdr(skb);
/* check if IP header and TCP header are complete */
if (ntohs(iph->tot_len) < ip_len + tcp_hdrlen(skb))
return -1;
*hdr_flags = LRO_IPV4 | LRO_TCP;
*iphdr = iph;
return 0;
}
static void ipoib_lro_setup(struct ipoib_dev_priv *priv)
{
priv->lro.lro_mgr.max_aggr = lro_max_aggr;
priv->lro.lro_mgr.max_desc = IPOIB_MAX_LRO_DESCRIPTORS;
priv->lro.lro_mgr.lro_arr = priv->lro.lro_desc;
priv->lro.lro_mgr.get_skb_header = get_skb_hdr;
priv->lro.lro_mgr.features = LRO_F_NAPI;
priv->lro.lro_mgr.dev = priv->dev;
priv->lro.lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
}
static void ipoib_setup(struct net_device *dev)
{
struct ipoib_dev_priv *priv = netdev_priv(dev);
dev->open = ipoib_open;
dev->stop = ipoib_stop;
dev->change_mtu = ipoib_change_mtu;
dev->hard_start_xmit = ipoib_start_xmit;
dev->tx_timeout = ipoib_timeout;
dev->header_ops = &ipoib_header_ops;
dev->set_multicast_list = ipoib_set_mcast_list;
dev->neigh_setup = ipoib_neigh_setup_dev;
ipoib_set_ethtool_ops(dev);
netif_napi_add(dev, &priv->napi, ipoib_poll, 100);
dev->watchdog_timeo = HZ;
dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
/*
* We add in INFINIBAND_ALEN to allow for the destination
* address "pseudoheader" for skbs without neighbour struct.
*/
dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN;
dev->addr_len = INFINIBAND_ALEN;
dev->type = ARPHRD_INFINIBAND;
dev->tx_queue_len = ipoib_sendq_size * 2;
dev->features = (NETIF_F_VLAN_CHALLENGED |
NETIF_F_HIGHDMA);
memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
netif_carrier_off(dev);
priv->dev = dev;
ipoib_lro_setup(priv);
spin_lock_init(&priv->lock);
mutex_init(&priv->vlan_mutex);
INIT_LIST_HEAD(&priv->path_list);
INIT_LIST_HEAD(&priv->child_intfs);
INIT_LIST_HEAD(&priv->dead_ahs);
INIT_LIST_HEAD(&priv->multicast_list);
INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll);
INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
}
struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
{
struct net_device *dev;
dev = alloc_netdev((int) sizeof (struct ipoib_dev_priv), name,
ipoib_setup);
if (!dev)
return NULL;
return netdev_priv(dev);
}
static ssize_t show_pkey(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
return sprintf(buf, "0x%04x\n", priv->pkey);
}
static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
static ssize_t show_umcast(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
}
static ssize_t set_umcast(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
if (umcast_val > 0) {
set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
ipoib_warn(priv, "ignoring multicast groups joined directly "
"by userspace\n");
} else
clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
return count;
}
static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
int ipoib_add_umcast_attr(struct net_device *dev)
{
return device_create_file(&dev->dev, &dev_attr_umcast);
}
static ssize_t create_child(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
int pkey;
int ret;
if (sscanf(buf, "%i", &pkey) != 1)
return -EINVAL;
if (pkey < 0 || pkey > 0xffff)
return -EINVAL;
/*
* Set the full membership bit, so that we join the right
* broadcast group, etc.
*/
pkey |= 0x8000;
ret = ipoib_vlan_add(to_net_dev(dev), pkey);
return ret ? ret : count;
}
static DEVICE_ATTR(create_child, S_IWUGO, NULL, create_child);
static ssize_t delete_child(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
int pkey;
int ret;
if (sscanf(buf, "%i", &pkey) != 1)
return -EINVAL;
if (pkey < 0 || pkey > 0xffff)
return -EINVAL;
ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
return ret ? ret : count;
}
static DEVICE_ATTR(delete_child, S_IWUGO, NULL, delete_child);
int ipoib_add_pkey_attr(struct net_device *dev)
{
return device_create_file(&dev->dev, &dev_attr_pkey);
}
int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
{
struct ib_device_attr *device_attr;
int result = -ENOMEM;
device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
if (!device_attr) {
printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
hca->name, sizeof *device_attr);
return result;
}
result = ib_query_device(hca, device_attr);
if (result) {
printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
hca->name, result);
kfree(device_attr);
return result;
}
priv->hca_caps = device_attr->device_cap_flags;
kfree(device_attr);
if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
set_bit(IPOIB_FLAG_CSUM, &priv->flags);
priv->dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM;
}
if (lro)
priv->dev->features |= NETIF_F_LRO;
if (priv->dev->features & NETIF_F_SG && priv->hca_caps & IB_DEVICE_UD_TSO)
priv->dev->features |= NETIF_F_TSO;
return 0;
}
static struct net_device *ipoib_add_port(const char *format,
struct ib_device *hca, u8 port)
{
struct ipoib_dev_priv *priv;
struct ib_port_attr attr;
int result = -ENOMEM;
priv = ipoib_intf_alloc(format);
if (!priv)
goto alloc_mem_failed;
SET_NETDEV_DEV(priv->dev, hca->dma_device);
if (!ib_query_port(hca, port, &attr))
priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
else {
printk(KERN_WARNING "%s: ib_query_port %d failed\n",
hca->name, port);
goto device_init_failed;
}
/* MTU will be reset when mcast join happens */
priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
result = ib_query_pkey(hca, port, 0, &priv->pkey);
if (result) {
printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
hca->name, port, result);
goto device_init_failed;
}
if (ipoib_set_dev_features(priv, hca))
goto device_init_failed;
/*
* Set the full membership bit, so that we join the right
* broadcast group, etc.
*/
priv->pkey |= 0x8000;
priv->dev->broadcast[8] = priv->pkey >> 8;
priv->dev->broadcast[9] = priv->pkey & 0xff;
result = ib_query_gid(hca, port, 0, &priv->local_gid);
if (result) {
printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
hca->name, port, result);
goto device_init_failed;
} else
memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
result = ipoib_dev_init(priv->dev, hca, port);
if (result < 0) {
printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
hca->name, port, result);
goto device_init_failed;
}
INIT_IB_EVENT_HANDLER(&priv->event_handler,
priv->ca, ipoib_event);
result = ib_register_event_handler(&priv->event_handler);
if (result < 0) {
printk(KERN_WARNING "%s: ib_register_event_handler failed for "
"port %d (ret = %d)\n",
hca->name, port, result);
goto event_failed;
}
result = register_netdev(priv->dev);
if (result) {
printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
hca->name, port, result);
goto register_failed;
}
ipoib_create_debug_files(priv->dev);
if (ipoib_cm_add_mode_attr(priv->dev))
goto sysfs_failed;
if (ipoib_add_pkey_attr(priv->dev))
goto sysfs_failed;
if (ipoib_add_umcast_attr(priv->dev))
goto sysfs_failed;
if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
goto sysfs_failed;
if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
goto sysfs_failed;
return priv->dev;
sysfs_failed:
ipoib_delete_debug_files(priv->dev);
unregister_netdev(priv->dev);
register_failed:
ib_unregister_event_handler(&priv->event_handler);
flush_workqueue(ipoib_workqueue);
event_failed:
ipoib_dev_cleanup(priv->dev);
device_init_failed:
free_netdev(priv->dev);
alloc_mem_failed:
return ERR_PTR(result);
}
static void ipoib_add_one(struct ib_device *device)
{
struct list_head *dev_list;
struct net_device *dev;
struct ipoib_dev_priv *priv;
int s, e, p;
if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
return;
dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
if (!dev_list)
return;
INIT_LIST_HEAD(dev_list);
if (device->node_type == RDMA_NODE_IB_SWITCH) {
s = 0;
e = 0;
} else {
s = 1;
e = device->phys_port_cnt;
}
for (p = s; p <= e; ++p) {
dev = ipoib_add_port("ib%d", device, p);
if (!IS_ERR(dev)) {
priv = netdev_priv(dev);
list_add_tail(&priv->list, dev_list);
}
}
ib_set_client_data(device, &ipoib_client, dev_list);
}
static void ipoib_remove_one(struct ib_device *device)
{
struct ipoib_dev_priv *priv, *tmp;
struct list_head *dev_list;
if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
return;
dev_list = ib_get_client_data(device, &ipoib_client);
list_for_each_entry_safe(priv, tmp, dev_list, list) {
ib_unregister_event_handler(&priv->event_handler);
rtnl_lock();
dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
rtnl_unlock();
flush_workqueue(ipoib_workqueue);
unregister_netdev(priv->dev);
ipoib_dev_cleanup(priv->dev);
free_netdev(priv->dev);
}
kfree(dev_list);
}
static int __init ipoib_init_module(void)
{
int ret;
ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
ipoib_sendq_size = max(ipoib_sendq_size, max(2 * MAX_SEND_CQE,
IPOIB_MIN_QUEUE_SIZE));
#ifdef CONFIG_INFINIBAND_IPOIB_CM
ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
#endif
/*
* When copying small received packets, we only copy from the
* linear data part of the SKB, so we rely on this condition.
*/
BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
ret = ipoib_register_debugfs();
if (ret)
return ret;
/*
* We create our own workqueue mainly because we want to be
* able to flush it when devices are being removed. We can't
* use schedule_work()/flush_scheduled_work() because both
* unregister_netdev() and linkwatch_event take the rtnl lock,
* so flush_scheduled_work() can deadlock during device
* removal.
*/
ipoib_workqueue = create_singlethread_workqueue("ipoib");
if (!ipoib_workqueue) {
ret = -ENOMEM;
goto err_fs;
}
ib_sa_register_client(&ipoib_sa_client);
ret = ib_register_client(&ipoib_client);
if (ret)
goto err_sa;
return 0;
err_sa:
ib_sa_unregister_client(&ipoib_sa_client);
destroy_workqueue(ipoib_workqueue);
err_fs:
ipoib_unregister_debugfs();
return ret;
}
static void __exit ipoib_cleanup_module(void)
{
ib_unregister_client(&ipoib_client);
ib_sa_unregister_client(&ipoib_sa_client);
ipoib_unregister_debugfs();
destroy_workqueue(ipoib_workqueue);
}
module_init(ipoib_init_module);
module_exit(ipoib_cleanup_module);