linux/drivers/net/hyperv/netvsc_drv.c

1693 lines
42 KiB
C
Raw Normal View History

/*
* Copyright (c) 2009, Microsoft Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, see <http://www.gnu.org/licenses/>.
*
* Authors:
* Haiyang Zhang <haiyangz@microsoft.com>
* Hank Janssen <hjanssen@microsoft.com>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/device.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 08:04:11 +00:00
#include <linux/slab.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include "hyperv_net.h"
#define RING_SIZE_MIN 64
#define LINKCHANGE_INT (2 * HZ)
static int ring_size = 128;
module_param(ring_size, int, S_IRUGO);
MODULE_PARM_DESC(ring_size, "Ring buffer size (# of pages)");
static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE |
NETIF_MSG_LINK | NETIF_MSG_IFUP |
NETIF_MSG_IFDOWN | NETIF_MSG_RX_ERR |
NETIF_MSG_TX_ERR;
static int debug = -1;
module_param(debug, int, S_IRUGO);
MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
static void netvsc_set_multicast_list(struct net_device *net)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
rndis_filter_update(nvdev);
}
static int netvsc_open(struct net_device *net)
{
struct net_device_context *ndev_ctx = netdev_priv(net);
struct netvsc_device *nvdev = ndev_ctx->nvdev;
struct rndis_device *rdev;
int ret = 0;
netif_carrier_off(net);
/* Open up the device */
ret = rndis_filter_open(nvdev);
if (ret != 0) {
netdev_err(net, "unable to open device (ret %d).\n", ret);
return ret;
}
netif_tx_wake_all_queues(net);
rdev = nvdev->extension;
if (!rdev->link_state && !ndev_ctx->datapath)
netif_carrier_on(net);
return ret;
}
static int netvsc_close(struct net_device *net)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
int ret;
u32 aread, i, msec = 10, retry = 0, retry_max = 20;
struct vmbus_channel *chn;
netif_tx_disable(net);
ret = rndis_filter_close(nvdev);
if (ret != 0) {
netdev_err(net, "unable to close device (ret %d).\n", ret);
return ret;
}
/* Ensure pending bytes in ring are read */
while (true) {
aread = 0;
for (i = 0; i < nvdev->num_chn; i++) {
chn = nvdev->chan_table[i].channel;
if (!chn)
continue;
aread = hv_get_bytes_to_read(&chn->inbound);
if (aread)
break;
aread = hv_get_bytes_to_read(&chn->outbound);
if (aread)
break;
}
retry++;
if (retry > retry_max || aread == 0)
break;
msleep(msec);
if (msec < 1000)
msec *= 2;
}
if (aread) {
netdev_err(net, "Ring buffer not empty after closing rndis\n");
ret = -ETIMEDOUT;
}
return ret;
}
static void *init_ppi_data(struct rndis_message *msg, u32 ppi_size,
int pkt_type)
{
struct rndis_packet *rndis_pkt;
struct rndis_per_packet_info *ppi;
rndis_pkt = &msg->msg.pkt;
rndis_pkt->data_offset += ppi_size;
ppi = (struct rndis_per_packet_info *)((void *)rndis_pkt +
rndis_pkt->per_pkt_info_offset + rndis_pkt->per_pkt_info_len);
ppi->size = ppi_size;
ppi->type = pkt_type;
ppi->ppi_offset = sizeof(struct rndis_per_packet_info);
rndis_pkt->per_pkt_info_len += ppi_size;
return ppi;
}
/* Azure hosts don't support non-TCP port numbers in hashing yet. We compute
* hash for non-TCP traffic with only IP numbers.
*/
static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk)
{
struct flow_keys flow;
u32 hash;
static u32 hashrnd __read_mostly;
net_get_random_once(&hashrnd, sizeof(hashrnd));
if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
return 0;
if (flow.basic.ip_proto == IPPROTO_TCP) {
return skb_get_hash(skb);
} else {
if (flow.basic.n_proto == htons(ETH_P_IP))
hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
else if (flow.basic.n_proto == htons(ETH_P_IPV6))
hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
else
hash = 0;
skb_set_hash(skb, hash, PKT_HASH_TYPE_L3);
}
return hash;
}
static inline int netvsc_get_tx_queue(struct net_device *ndev,
struct sk_buff *skb, int old_idx)
{
const struct net_device_context *ndc = netdev_priv(ndev);
struct sock *sk = skb->sk;
int q_idx;
q_idx = ndc->tx_send_table[netvsc_get_hash(skb, sk) &
(VRSS_SEND_TAB_SIZE - 1)];
/* If queue index changed record the new value */
if (q_idx != old_idx &&
sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, q_idx);
return q_idx;
}
/*
* Select queue for transmit.
*
* If a valid queue has already been assigned, then use that.
* Otherwise compute tx queue based on hash and the send table.
*
* This is basically similar to default (__netdev_pick_tx) with the added step
* of using the host send_table when no other queue has been assigned.
*
* TODO support XPS - but get_xps_queue not exported
*/
static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb,
void *accel_priv, select_queue_fallback_t fallback)
{
unsigned int num_tx_queues = ndev->real_num_tx_queues;
int q_idx = sk_tx_queue_get(skb->sk);
if (q_idx < 0 || skb->ooo_okay) {
/* If forwarding a packet, we use the recorded queue when
* available for better cache locality.
*/
if (skb_rx_queue_recorded(skb))
q_idx = skb_get_rx_queue(skb);
else
q_idx = netvsc_get_tx_queue(ndev, skb, q_idx);
}
while (unlikely(q_idx >= num_tx_queues))
q_idx -= num_tx_queues;
return q_idx;
}
static u32 fill_pg_buf(struct page *page, u32 offset, u32 len,
struct hv_page_buffer *pb)
{
int j = 0;
/* Deal with compund pages by ignoring unused part
* of the page.
*/
page += (offset >> PAGE_SHIFT);
offset &= ~PAGE_MASK;
while (len > 0) {
unsigned long bytes;
bytes = PAGE_SIZE - offset;
if (bytes > len)
bytes = len;
pb[j].pfn = page_to_pfn(page);
pb[j].offset = offset;
pb[j].len = bytes;
offset += bytes;
len -= bytes;
if (offset == PAGE_SIZE && len) {
page++;
offset = 0;
j++;
}
}
return j + 1;
}
static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb,
struct hv_netvsc_packet *packet,
struct hv_page_buffer **page_buf)
{
struct hv_page_buffer *pb = *page_buf;
u32 slots_used = 0;
char *data = skb->data;
int frags = skb_shinfo(skb)->nr_frags;
int i;
/* The packet is laid out thus:
* 1. hdr: RNDIS header and PPI
* 2. skb linear data
* 3. skb fragment data
*/
if (hdr != NULL)
slots_used += fill_pg_buf(virt_to_page(hdr),
offset_in_page(hdr),
len, &pb[slots_used]);
packet->rmsg_size = len;
packet->rmsg_pgcnt = slots_used;
slots_used += fill_pg_buf(virt_to_page(data),
offset_in_page(data),
skb_headlen(skb), &pb[slots_used]);
for (i = 0; i < frags; i++) {
skb_frag_t *frag = skb_shinfo(skb)->frags + i;
slots_used += fill_pg_buf(skb_frag_page(frag),
frag->page_offset,
skb_frag_size(frag), &pb[slots_used]);
}
return slots_used;
}
/* Estimate number of page buffers neede to transmit
* Need at most 2 for RNDIS header plus skb body and fragments.
*/
static unsigned int netvsc_get_slots(const struct sk_buff *skb)
{
return PFN_UP(offset_in_page(skb->data) + skb_headlen(skb))
+ skb_shinfo(skb)->nr_frags
+ 2;
}
static u32 net_checksum_info(struct sk_buff *skb)
{
if (skb->protocol == htons(ETH_P_IP)) {
struct iphdr *ip = ip_hdr(skb);
if (ip->protocol == IPPROTO_TCP)
return TRANSPORT_INFO_IPV4_TCP;
else if (ip->protocol == IPPROTO_UDP)
return TRANSPORT_INFO_IPV4_UDP;
} else {
struct ipv6hdr *ip6 = ipv6_hdr(skb);
if (ip6->nexthdr == IPPROTO_TCP)
return TRANSPORT_INFO_IPV6_TCP;
else if (ipv6_hdr(skb)->nexthdr == IPPROTO_UDP)
return TRANSPORT_INFO_IPV6_UDP;
}
return TRANSPORT_INFO_NOT_IP;
}
static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
struct hv_netvsc_packet *packet = NULL;
int ret;
unsigned int num_data_pgs;
struct rndis_message *rndis_msg;
struct rndis_packet *rndis_pkt;
u32 rndis_msg_size;
struct rndis_per_packet_info *ppi;
u32 hash;
struct hv_page_buffer page_buf[MAX_PAGE_BUFFER_COUNT];
struct hv_page_buffer *pb = page_buf;
/* We can only transmit MAX_PAGE_BUFFER_COUNT number
* of pages in a single packet. If skb is scattered around
* more pages we try linearizing it.
*/
num_data_pgs = netvsc_get_slots(skb);
if (unlikely(num_data_pgs > MAX_PAGE_BUFFER_COUNT)) {
++net_device_ctx->eth_stats.tx_scattered;
if (skb_linearize(skb))
goto no_memory;
num_data_pgs = netvsc_get_slots(skb);
if (num_data_pgs > MAX_PAGE_BUFFER_COUNT) {
++net_device_ctx->eth_stats.tx_too_big;
goto drop;
}
}
/*
* Place the rndis header in the skb head room and
* the skb->cb will be used for hv_netvsc_packet
* structure.
*/
ret = skb_cow_head(skb, RNDIS_AND_PPI_SIZE);
if (ret)
goto no_memory;
/* Use the skb control buffer for building up the packet */
BUILD_BUG_ON(sizeof(struct hv_netvsc_packet) >
FIELD_SIZEOF(struct sk_buff, cb));
packet = (struct hv_netvsc_packet *)skb->cb;
packet->q_idx = skb_get_queue_mapping(skb);
packet->total_data_buflen = skb->len;
packet->total_bytes = skb->len;
packet->total_packets = 1;
rndis_msg = (struct rndis_message *)skb->head;
memset(rndis_msg, 0, RNDIS_AND_PPI_SIZE);
/* Add the rndis header */
rndis_msg->ndis_msg_type = RNDIS_MSG_PACKET;
rndis_msg->msg_len = packet->total_data_buflen;
rndis_pkt = &rndis_msg->msg.pkt;
rndis_pkt->data_offset = sizeof(struct rndis_packet);
rndis_pkt->data_len = packet->total_data_buflen;
rndis_pkt->per_pkt_info_offset = sizeof(struct rndis_packet);
rndis_msg_size = RNDIS_MESSAGE_SIZE(struct rndis_packet);
hash = skb_get_hash_raw(skb);
if (hash != 0 && net->real_num_tx_queues > 1) {
rndis_msg_size += NDIS_HASH_PPI_SIZE;
ppi = init_ppi_data(rndis_msg, NDIS_HASH_PPI_SIZE,
NBL_HASH_VALUE);
*(u32 *)((void *)ppi + ppi->ppi_offset) = hash;
}
if (skb_vlan_tag_present(skb)) {
struct ndis_pkt_8021q_info *vlan;
rndis_msg_size += NDIS_VLAN_PPI_SIZE;
ppi = init_ppi_data(rndis_msg, NDIS_VLAN_PPI_SIZE,
IEEE_8021Q_INFO);
vlan = (struct ndis_pkt_8021q_info *)((void *)ppi +
ppi->ppi_offset);
vlan->vlanid = skb->vlan_tci & VLAN_VID_MASK;
vlan->pri = (skb->vlan_tci & VLAN_PRIO_MASK) >>
VLAN_PRIO_SHIFT;
}
if (skb_is_gso(skb)) {
struct ndis_tcp_lso_info *lso_info;
rndis_msg_size += NDIS_LSO_PPI_SIZE;
ppi = init_ppi_data(rndis_msg, NDIS_LSO_PPI_SIZE,
TCP_LARGESEND_PKTINFO);
lso_info = (struct ndis_tcp_lso_info *)((void *)ppi +
ppi->ppi_offset);
lso_info->lso_v2_transmit.type = NDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE;
if (skb->protocol == htons(ETH_P_IP)) {
lso_info->lso_v2_transmit.ip_version =
NDIS_TCP_LARGE_SEND_OFFLOAD_IPV4;
ip_hdr(skb)->tot_len = 0;
ip_hdr(skb)->check = 0;
tcp_hdr(skb)->check =
~csum_tcpudp_magic(ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
} else {
lso_info->lso_v2_transmit.ip_version =
NDIS_TCP_LARGE_SEND_OFFLOAD_IPV6;
ipv6_hdr(skb)->payload_len = 0;
tcp_hdr(skb)->check =
~csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr, 0, IPPROTO_TCP, 0);
}
lso_info->lso_v2_transmit.tcp_header_offset = skb_transport_offset(skb);
lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
if (net_checksum_info(skb) & net_device_ctx->tx_checksum_mask) {
struct ndis_tcp_ip_checksum_info *csum_info;
rndis_msg_size += NDIS_CSUM_PPI_SIZE;
ppi = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
TCPIP_CHKSUM_PKTINFO);
csum_info = (struct ndis_tcp_ip_checksum_info *)((void *)ppi +
ppi->ppi_offset);
csum_info->transmit.tcp_header_offset = skb_transport_offset(skb);
if (skb->protocol == htons(ETH_P_IP)) {
csum_info->transmit.is_ipv4 = 1;
if (ip_hdr(skb)->protocol == IPPROTO_TCP)
csum_info->transmit.tcp_checksum = 1;
else
csum_info->transmit.udp_checksum = 1;
} else {
csum_info->transmit.is_ipv6 = 1;
if (ipv6_hdr(skb)->nexthdr == IPPROTO_TCP)
csum_info->transmit.tcp_checksum = 1;
else
csum_info->transmit.udp_checksum = 1;
}
} else {
/* Can't do offload of this type of checksum */
if (skb_checksum_help(skb))
goto drop;
}
}
/* Start filling in the page buffers with the rndis hdr */
rndis_msg->msg_len += rndis_msg_size;
packet->total_data_buflen = rndis_msg->msg_len;
packet->page_buf_cnt = init_page_array(rndis_msg, rndis_msg_size,
skb, packet, &pb);
/* timestamp packet in software */
skb_tx_timestamp(skb);
ret = netvsc_send(net_device_ctx->device_ctx, packet,
rndis_msg, &pb, skb);
if (likely(ret == 0))
return NETDEV_TX_OK;
if (ret == -EAGAIN) {
++net_device_ctx->eth_stats.tx_busy;
return NETDEV_TX_BUSY;
}
if (ret == -ENOSPC)
++net_device_ctx->eth_stats.tx_no_space;
drop:
dev_kfree_skb_any(skb);
net->stats.tx_dropped++;
return NETDEV_TX_OK;
no_memory:
++net_device_ctx->eth_stats.tx_no_memory;
goto drop;
}
/*
* netvsc_linkstatus_callback - Link up/down notification
*/
void netvsc_linkstatus_callback(struct hv_device *device_obj,
struct rndis_message *resp)
{
struct rndis_indicate_status *indicate = &resp->msg.indicate_status;
struct net_device *net;
struct net_device_context *ndev_ctx;
struct netvsc_reconfig *event;
unsigned long flags;
net = hv_get_drvdata(device_obj);
if (!net)
return;
ndev_ctx = netdev_priv(net);
/* Update the physical link speed when changing to another vSwitch */
if (indicate->status == RNDIS_STATUS_LINK_SPEED_CHANGE) {
u32 speed;
speed = *(u32 *)((void *)indicate + indicate->
status_buf_offset) / 10000;
ndev_ctx->speed = speed;
return;
}
/* Handle these link change statuses below */
if (indicate->status != RNDIS_STATUS_NETWORK_CHANGE &&
indicate->status != RNDIS_STATUS_MEDIA_CONNECT &&
indicate->status != RNDIS_STATUS_MEDIA_DISCONNECT)
return;
if (net->reg_state != NETREG_REGISTERED)
return;
event = kzalloc(sizeof(*event), GFP_ATOMIC);
if (!event)
return;
event->event = indicate->status;
spin_lock_irqsave(&ndev_ctx->lock, flags);
list_add_tail(&event->list, &ndev_ctx->reconfig_events);
spin_unlock_irqrestore(&ndev_ctx->lock, flags);
schedule_delayed_work(&ndev_ctx->dwork, 0);
}
static struct sk_buff *netvsc_alloc_recv_skb(struct net_device *net,
struct napi_struct *napi,
const struct ndis_tcp_ip_checksum_info *csum_info,
const struct ndis_pkt_8021q_info *vlan,
void *data, u32 buflen)
{
struct sk_buff *skb;
skb = napi_alloc_skb(napi, buflen);
if (!skb)
return skb;
/*
* Copy to skb. This copy is needed here since the memory pointed by
* hv_netvsc_packet cannot be deallocated
*/
skb_put_data(skb, data, buflen);
skb->protocol = eth_type_trans(skb, net);
/* skb is already created with CHECKSUM_NONE */
skb_checksum_none_assert(skb);
/*
* In Linux, the IP checksum is always checked.
* Do L4 checksum offload if enabled and present.
*/
if (csum_info && (net->features & NETIF_F_RXCSUM)) {
if (csum_info->receive.tcp_checksum_succeeded ||
csum_info->receive.udp_checksum_succeeded)
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
if (vlan) {
u16 vlan_tci = vlan->vlanid | (vlan->pri << VLAN_PRIO_SHIFT);
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
vlan_tci);
}
return skb;
}
/*
* netvsc_recv_callback - Callback when we receive a packet from the
* "wire" on the specified device.
*/
int netvsc_recv_callback(struct net_device *net,
struct vmbus_channel *channel,
void *data, u32 len,
const struct ndis_tcp_ip_checksum_info *csum_info,
const struct ndis_pkt_8021q_info *vlan)
{
hv_netvsc: untangle the pointer mess We have the following structures keeping netvsc adapter state: - struct net_device - struct net_device_context - struct netvsc_device - struct rndis_device - struct hv_device and there are pointers/dependencies between them: - struct net_device_context is contained in struct net_device - struct hv_device has driver_data pointer which points to 'struct net_device' OR 'struct netvsc_device' depending on driver's state (!). - struct net_device_context has a pointer to 'struct hv_device'. - struct netvsc_device has pointers to 'struct hv_device' and 'struct net_device_context'. - struct rndis_device has a pointer to 'struct netvsc_device'. Different functions get different structures as parameters and use these pointers for traveling. The problem is (in addition to keeping in mind this complex graph) that some of these structures (struct netvsc_device and struct rndis_device) are being removed and re-created on mtu change (as we implement it as re-creation of hyper-v device) so our travel using these pointers is dangerous. Simplify this to a the following: - add struct netvsc_device pointer to struct net_device_context (which is a part of struct net_device and thus never disappears) - remove struct hv_device and struct net_device_context pointers from struct netvsc_device - replace pointer to 'struct netvsc_device' with pointer to 'struct net_device'. - always keep 'struct net_device' in hv_device driver_data. We'll end up with the following 'circular' structure: net_device: [net_device_context] -> netvsc_device -> rndis_device -> net_device -> hv_device -> net_device On MTU change we'll be removing the 'netvsc_device -> rndis_device' branch and re-creating it making the synchronization easier. There is one additional redundant pointer left, it is struct net_device link in struct netvsc_device, it is going to be removed in a separate commit. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-13 11:55:22 +00:00
struct net_device_context *net_device_ctx = netdev_priv(net);
struct netvsc_device *net_device;
u16 q_idx = channel->offermsg.offer.sub_channel_index;
struct netvsc_channel *nvchan;
struct net_device *vf_netdev;
struct sk_buff *skb;
struct netvsc_stats *rx_stats;
if (net->reg_state != NETREG_REGISTERED)
return NVSP_STAT_FAIL;
/*
* If necessary, inject this packet into the VF interface.
* On Hyper-V, multicast and brodcast packets are only delivered
* to the synthetic interface (after subjecting these to
* policy filters on the host). Deliver these via the VF
* interface in the guest.
*/
rcu_read_lock();
net_device = rcu_dereference(net_device_ctx->nvdev);
if (unlikely(!net_device))
goto drop;
nvchan = &net_device->chan_table[q_idx];
vf_netdev = rcu_dereference(net_device_ctx->vf_netdev);
if (vf_netdev && (vf_netdev->flags & IFF_UP))
net = vf_netdev;
/* Allocate a skb - TODO direct I/O to pages? */
skb = netvsc_alloc_recv_skb(net, &nvchan->napi,
csum_info, vlan, data, len);
if (unlikely(!skb)) {
drop:
++net->stats.rx_dropped;
rcu_read_unlock();
return NVSP_STAT_FAIL;
}
if (net != vf_netdev)
skb_record_rx_queue(skb, q_idx);
/*
* Even if injecting the packet, record the statistics
* on the synthetic device because modifying the VF device
* statistics will not work correctly.
*/
rx_stats = &nvchan->rx_stats;
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->packets++;
rx_stats->bytes += len;
if (skb->pkt_type == PACKET_BROADCAST)
++rx_stats->broadcast;
else if (skb->pkt_type == PACKET_MULTICAST)
++rx_stats->multicast;
u64_stats_update_end(&rx_stats->syncp);
napi_gro_receive(&nvchan->napi, skb);
rcu_read_unlock();
return 0;
}
static void netvsc_get_drvinfo(struct net_device *net,
struct ethtool_drvinfo *info)
{
strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
}
static void netvsc_get_channels(struct net_device *net,
struct ethtool_channels *channel)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
if (nvdev) {
channel->max_combined = nvdev->max_chn;
channel->combined_count = nvdev->num_chn;
}
}
static int netvsc_set_queues(struct net_device *net, struct hv_device *dev,
u32 num_chn)
{
struct netvsc_device_info device_info;
int ret;
memset(&device_info, 0, sizeof(device_info));
device_info.num_chn = num_chn;
device_info.ring_size = ring_size;
device_info.max_num_vrss_chns = num_chn;
ret = rndis_filter_device_add(dev, &device_info);
if (ret)
return ret;
ret = netif_set_real_num_tx_queues(net, num_chn);
if (ret)
return ret;
ret = netif_set_real_num_rx_queues(net, num_chn);
return ret;
}
static int netvsc_set_channels(struct net_device *net,
struct ethtool_channels *channels)
{
struct net_device_context *net_device_ctx = netdev_priv(net);
struct hv_device *dev = net_device_ctx->device_ctx;
struct netvsc_device *nvdev = rtnl_dereference(net_device_ctx->nvdev);
unsigned int count = channels->combined_count;
bool was_running;
int ret;
/* We do not support separate count for rx, tx, or other */
if (count == 0 ||
channels->rx_count || channels->tx_count || channels->other_count)
return -EINVAL;
if (count > net->num_tx_queues || count > net->num_rx_queues)
return -EINVAL;
if (!nvdev || nvdev->destroy)
return -ENODEV;
if (nvdev->nvsp_version < NVSP_PROTOCOL_VERSION_5)
return -EINVAL;
if (count > nvdev->max_chn)
return -EINVAL;
was_running = netif_running(net);
if (was_running) {
ret = netvsc_close(net);
if (ret)
return ret;
}
rndis_filter_device_remove(dev, nvdev);
ret = netvsc_set_queues(net, dev, count);
if (ret == 0)
nvdev->num_chn = count;
else
netvsc_set_queues(net, dev, nvdev->num_chn);
if (was_running)
ret = netvsc_open(net);
/* We may have missed link change notifications */
schedule_delayed_work(&net_device_ctx->dwork, 0);
return ret;
}
static bool
netvsc_validate_ethtool_ss_cmd(const struct ethtool_link_ksettings *cmd)
{
struct ethtool_link_ksettings diff1 = *cmd;
struct ethtool_link_ksettings diff2 = {};
diff1.base.speed = 0;
diff1.base.duplex = 0;
/* advertising and cmd are usually set */
ethtool_link_ksettings_zero_link_mode(&diff1, advertising);
diff1.base.cmd = 0;
/* We set port to PORT_OTHER */
diff2.base.port = PORT_OTHER;
return !memcmp(&diff1, &diff2, sizeof(diff1));
}
static void netvsc_init_settings(struct net_device *dev)
{
struct net_device_context *ndc = netdev_priv(dev);
ndc->speed = SPEED_UNKNOWN;
ndc->duplex = DUPLEX_FULL;
}
static int netvsc_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *cmd)
{
struct net_device_context *ndc = netdev_priv(dev);
cmd->base.speed = ndc->speed;
cmd->base.duplex = ndc->duplex;
cmd->base.port = PORT_OTHER;
return 0;
}
static int netvsc_set_link_ksettings(struct net_device *dev,
const struct ethtool_link_ksettings *cmd)
{
struct net_device_context *ndc = netdev_priv(dev);
u32 speed;
speed = cmd->base.speed;
if (!ethtool_validate_speed(speed) ||
!ethtool_validate_duplex(cmd->base.duplex) ||
!netvsc_validate_ethtool_ss_cmd(cmd))
return -EINVAL;
ndc->speed = speed;
ndc->duplex = cmd->base.duplex;
return 0;
}
static int netvsc_change_mtu(struct net_device *ndev, int mtu)
{
struct net_device_context *ndevctx = netdev_priv(ndev);
struct netvsc_device *nvdev = rtnl_dereference(ndevctx->nvdev);
hv_netvsc: untangle the pointer mess We have the following structures keeping netvsc adapter state: - struct net_device - struct net_device_context - struct netvsc_device - struct rndis_device - struct hv_device and there are pointers/dependencies between them: - struct net_device_context is contained in struct net_device - struct hv_device has driver_data pointer which points to 'struct net_device' OR 'struct netvsc_device' depending on driver's state (!). - struct net_device_context has a pointer to 'struct hv_device'. - struct netvsc_device has pointers to 'struct hv_device' and 'struct net_device_context'. - struct rndis_device has a pointer to 'struct netvsc_device'. Different functions get different structures as parameters and use these pointers for traveling. The problem is (in addition to keeping in mind this complex graph) that some of these structures (struct netvsc_device and struct rndis_device) are being removed and re-created on mtu change (as we implement it as re-creation of hyper-v device) so our travel using these pointers is dangerous. Simplify this to a the following: - add struct netvsc_device pointer to struct net_device_context (which is a part of struct net_device and thus never disappears) - remove struct hv_device and struct net_device_context pointers from struct netvsc_device - replace pointer to 'struct netvsc_device' with pointer to 'struct net_device'. - always keep 'struct net_device' in hv_device driver_data. We'll end up with the following 'circular' structure: net_device: [net_device_context] -> netvsc_device -> rndis_device -> net_device -> hv_device -> net_device On MTU change we'll be removing the 'netvsc_device -> rndis_device' branch and re-creating it making the synchronization easier. There is one additional redundant pointer left, it is struct net_device link in struct netvsc_device, it is going to be removed in a separate commit. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-13 11:55:22 +00:00
struct hv_device *hdev = ndevctx->device_ctx;
struct netvsc_device_info device_info;
bool was_running;
int ret = 0;
if (!nvdev || nvdev->destroy)
return -ENODEV;
was_running = netif_running(ndev);
if (was_running) {
ret = netvsc_close(ndev);
if (ret)
return ret;
}
memset(&device_info, 0, sizeof(device_info));
device_info.ring_size = ring_size;
device_info.num_chn = nvdev->num_chn;
device_info.max_num_vrss_chns = nvdev->num_chn;
rndis_filter_device_remove(hdev, nvdev);
/* 'nvdev' has been freed in rndis_filter_device_remove() ->
* netvsc_device_remove () -> free_netvsc_device().
* We mustn't access it before it's re-created in
* rndis_filter_device_add() -> netvsc_device_add().
*/
ndev->mtu = mtu;
rndis_filter_device_add(hdev, &device_info);
if (was_running)
ret = netvsc_open(ndev);
/* We may have missed link change notifications */
schedule_delayed_work(&ndevctx->dwork, 0);
return ret;
}
static void netvsc_get_stats64(struct net_device *net,
struct rtnl_link_stats64 *t)
{
struct net_device_context *ndev_ctx = netdev_priv(net);
struct netvsc_device *nvdev = rcu_dereference_rtnl(ndev_ctx->nvdev);
int i;
if (!nvdev)
return;
for (i = 0; i < nvdev->num_chn; i++) {
const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
const struct netvsc_stats *stats;
u64 packets, bytes, multicast;
unsigned int start;
stats = &nvchan->tx_stats;
do {
start = u64_stats_fetch_begin_irq(&stats->syncp);
packets = stats->packets;
bytes = stats->bytes;
} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
t->tx_bytes += bytes;
t->tx_packets += packets;
stats = &nvchan->rx_stats;
do {
start = u64_stats_fetch_begin_irq(&stats->syncp);
packets = stats->packets;
bytes = stats->bytes;
multicast = stats->multicast + stats->broadcast;
} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
t->rx_bytes += bytes;
t->rx_packets += packets;
t->multicast += multicast;
}
t->tx_dropped = net->stats.tx_dropped;
t->tx_errors = net->stats.tx_errors;
t->rx_dropped = net->stats.rx_dropped;
t->rx_errors = net->stats.rx_errors;
}
static int netvsc_set_mac_addr(struct net_device *ndev, void *p)
{
struct sockaddr *addr = p;
char save_adr[ETH_ALEN];
unsigned char save_aatype;
int err;
memcpy(save_adr, ndev->dev_addr, ETH_ALEN);
save_aatype = ndev->addr_assign_type;
err = eth_mac_addr(ndev, p);
if (err != 0)
return err;
err = rndis_filter_set_device_mac(ndev, addr->sa_data);
if (err != 0) {
/* roll back to saved MAC */
memcpy(ndev->dev_addr, save_adr, ETH_ALEN);
ndev->addr_assign_type = save_aatype;
}
return err;
}
static const struct {
char name[ETH_GSTRING_LEN];
u16 offset;
} netvsc_stats[] = {
{ "tx_scattered", offsetof(struct netvsc_ethtool_stats, tx_scattered) },
{ "tx_no_memory", offsetof(struct netvsc_ethtool_stats, tx_no_memory) },
{ "tx_no_space", offsetof(struct netvsc_ethtool_stats, tx_no_space) },
{ "tx_too_big", offsetof(struct netvsc_ethtool_stats, tx_too_big) },
{ "tx_busy", offsetof(struct netvsc_ethtool_stats, tx_busy) },
};
#define NETVSC_GLOBAL_STATS_LEN ARRAY_SIZE(netvsc_stats)
/* 4 statistics per queue (rx/tx packets/bytes) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 4)
static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
if (!nvdev)
return -ENODEV;
switch (string_set) {
case ETH_SS_STATS:
return NETVSC_GLOBAL_STATS_LEN + NETVSC_QUEUE_STATS_LEN(nvdev);
default:
return -EINVAL;
}
}
static void netvsc_get_ethtool_stats(struct net_device *dev,
struct ethtool_stats *stats, u64 *data)
{
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
const void *nds = &ndc->eth_stats;
const struct netvsc_stats *qstats;
unsigned int start;
u64 packets, bytes;
int i, j;
if (!nvdev)
return;
for (i = 0; i < NETVSC_GLOBAL_STATS_LEN; i++)
data[i] = *(unsigned long *)(nds + netvsc_stats[i].offset);
for (j = 0; j < nvdev->num_chn; j++) {
qstats = &nvdev->chan_table[j].tx_stats;
do {
start = u64_stats_fetch_begin_irq(&qstats->syncp);
packets = qstats->packets;
bytes = qstats->bytes;
} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
data[i++] = packets;
data[i++] = bytes;
qstats = &nvdev->chan_table[j].rx_stats;
do {
start = u64_stats_fetch_begin_irq(&qstats->syncp);
packets = qstats->packets;
bytes = qstats->bytes;
} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
data[i++] = packets;
data[i++] = bytes;
}
}
static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
{
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
u8 *p = data;
int i;
if (!nvdev)
return;
switch (stringset) {
case ETH_SS_STATS:
for (i = 0; i < ARRAY_SIZE(netvsc_stats); i++)
memcpy(p + i * ETH_GSTRING_LEN,
netvsc_stats[i].name, ETH_GSTRING_LEN);
p += i * ETH_GSTRING_LEN;
for (i = 0; i < nvdev->num_chn; i++) {
sprintf(p, "tx_queue_%u_packets", i);
p += ETH_GSTRING_LEN;
sprintf(p, "tx_queue_%u_bytes", i);
p += ETH_GSTRING_LEN;
sprintf(p, "rx_queue_%u_packets", i);
p += ETH_GSTRING_LEN;
sprintf(p, "rx_queue_%u_bytes", i);
p += ETH_GSTRING_LEN;
}
break;
}
}
static int
netvsc_get_rss_hash_opts(struct netvsc_device *nvdev,
struct ethtool_rxnfc *info)
{
info->data = RXH_IP_SRC | RXH_IP_DST;
switch (info->flow_type) {
case TCP_V4_FLOW:
case TCP_V6_FLOW:
info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
/* fallthrough */
case UDP_V4_FLOW:
case UDP_V6_FLOW:
case IPV4_FLOW:
case IPV6_FLOW:
break;
default:
info->data = 0;
break;
}
return 0;
}
static int
netvsc_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info,
u32 *rules)
{
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *nvdev = rcu_dereference(ndc->nvdev);
if (!nvdev)
return -ENODEV;
switch (info->cmd) {
case ETHTOOL_GRXRINGS:
info->data = nvdev->num_chn;
return 0;
case ETHTOOL_GRXFH:
return netvsc_get_rss_hash_opts(nvdev, info);
}
return -EOPNOTSUPP;
}
#ifdef CONFIG_NET_POLL_CONTROLLER
static void netvsc_poll_controller(struct net_device *dev)
{
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *ndev;
int i;
rcu_read_lock();
ndev = rcu_dereference(ndc->nvdev);
if (ndev) {
for (i = 0; i < ndev->num_chn; i++) {
struct netvsc_channel *nvchan = &ndev->chan_table[i];
napi_schedule(&nvchan->napi);
}
}
rcu_read_unlock();
}
#endif
static u32 netvsc_get_rxfh_key_size(struct net_device *dev)
{
return NETVSC_HASH_KEYLEN;
}
static u32 netvsc_rss_indir_size(struct net_device *dev)
{
return ITAB_NUM;
}
static int netvsc_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
u8 *hfunc)
{
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *ndev = rcu_dereference(ndc->nvdev);
struct rndis_device *rndis_dev;
int i;
if (!ndev)
return -ENODEV;
if (hfunc)
*hfunc = ETH_RSS_HASH_TOP; /* Toeplitz */
rndis_dev = ndev->extension;
if (indir) {
for (i = 0; i < ITAB_NUM; i++)
indir[i] = rndis_dev->ind_table[i];
}
if (key)
memcpy(key, rndis_dev->rss_key, NETVSC_HASH_KEYLEN);
return 0;
}
static int netvsc_set_rxfh(struct net_device *dev, const u32 *indir,
const u8 *key, const u8 hfunc)
{
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *ndev = rtnl_dereference(ndc->nvdev);
struct rndis_device *rndis_dev;
int i;
if (!ndev)
return -ENODEV;
if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
return -EOPNOTSUPP;
rndis_dev = ndev->extension;
if (indir) {
for (i = 0; i < ITAB_NUM; i++)
if (indir[i] >= dev->num_rx_queues)
return -EINVAL;
for (i = 0; i < ITAB_NUM; i++)
rndis_dev->ind_table[i] = indir[i];
}
if (!key) {
if (!indir)
return 0;
key = rndis_dev->rss_key;
}
return rndis_filter_set_rss_param(rndis_dev, key, ndev->num_chn);
}
static const struct ethtool_ops ethtool_ops = {
.get_drvinfo = netvsc_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_ethtool_stats = netvsc_get_ethtool_stats,
.get_sset_count = netvsc_get_sset_count,
.get_strings = netvsc_get_strings,
.get_channels = netvsc_get_channels,
.set_channels = netvsc_set_channels,
.get_ts_info = ethtool_op_get_ts_info,
.get_rxnfc = netvsc_get_rxnfc,
.get_rxfh_key_size = netvsc_get_rxfh_key_size,
.get_rxfh_indir_size = netvsc_rss_indir_size,
.get_rxfh = netvsc_get_rxfh,
.set_rxfh = netvsc_set_rxfh,
.get_link_ksettings = netvsc_get_link_ksettings,
.set_link_ksettings = netvsc_set_link_ksettings,
};
static const struct net_device_ops device_ops = {
.ndo_open = netvsc_open,
.ndo_stop = netvsc_close,
.ndo_start_xmit = netvsc_start_xmit,
.ndo_set_rx_mode = netvsc_set_multicast_list,
.ndo_change_mtu = netvsc_change_mtu,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = netvsc_set_mac_addr,
.ndo_select_queue = netvsc_select_queue,
.ndo_get_stats64 = netvsc_get_stats64,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = netvsc_poll_controller,
#endif
};
/*
* Handle link status changes. For RNDIS_STATUS_NETWORK_CHANGE emulate link
* down/up sequence. In case of RNDIS_STATUS_MEDIA_CONNECT when carrier is
* present send GARP packet to network peers with netif_notify_peers().
*/
static void netvsc_link_change(struct work_struct *w)
{
struct net_device_context *ndev_ctx =
container_of(w, struct net_device_context, dwork.work);
struct hv_device *device_obj = ndev_ctx->device_ctx;
struct net_device *net = hv_get_drvdata(device_obj);
struct netvsc_device *net_device;
struct rndis_device *rdev;
struct netvsc_reconfig *event = NULL;
bool notify = false, reschedule = false;
unsigned long flags, next_reconfig, delay;
rtnl_lock();
net_device = rtnl_dereference(ndev_ctx->nvdev);
if (!net_device)
goto out_unlock;
rdev = net_device->extension;
next_reconfig = ndev_ctx->last_reconfig + LINKCHANGE_INT;
if (time_is_after_jiffies(next_reconfig)) {
/* link_watch only sends one notification with current state
* per second, avoid doing reconfig more frequently. Handle
* wrap around.
*/
delay = next_reconfig - jiffies;
delay = delay < LINKCHANGE_INT ? delay : LINKCHANGE_INT;
schedule_delayed_work(&ndev_ctx->dwork, delay);
goto out_unlock;
}
ndev_ctx->last_reconfig = jiffies;
spin_lock_irqsave(&ndev_ctx->lock, flags);
if (!list_empty(&ndev_ctx->reconfig_events)) {
event = list_first_entry(&ndev_ctx->reconfig_events,
struct netvsc_reconfig, list);
list_del(&event->list);
reschedule = !list_empty(&ndev_ctx->reconfig_events);
}
spin_unlock_irqrestore(&ndev_ctx->lock, flags);
if (!event)
goto out_unlock;
switch (event->event) {
/* Only the following events are possible due to the check in
* netvsc_linkstatus_callback()
*/
case RNDIS_STATUS_MEDIA_CONNECT:
if (rdev->link_state) {
rdev->link_state = false;
if (!ndev_ctx->datapath)
netif_carrier_on(net);
netif_tx_wake_all_queues(net);
} else {
notify = true;
}
kfree(event);
break;
case RNDIS_STATUS_MEDIA_DISCONNECT:
if (!rdev->link_state) {
rdev->link_state = true;
netif_carrier_off(net);
netif_tx_stop_all_queues(net);
}
kfree(event);
break;
case RNDIS_STATUS_NETWORK_CHANGE:
/* Only makes sense if carrier is present */
if (!rdev->link_state) {
rdev->link_state = true;
netif_carrier_off(net);
netif_tx_stop_all_queues(net);
event->event = RNDIS_STATUS_MEDIA_CONNECT;
spin_lock_irqsave(&ndev_ctx->lock, flags);
list_add(&event->list, &ndev_ctx->reconfig_events);
spin_unlock_irqrestore(&ndev_ctx->lock, flags);
reschedule = true;
}
break;
}
rtnl_unlock();
if (notify)
netdev_notify_peers(net);
/* link_watch only sends one notification with current state per
* second, handle next reconfig event in 2 seconds.
*/
if (reschedule)
schedule_delayed_work(&ndev_ctx->dwork, LINKCHANGE_INT);
return;
out_unlock:
rtnl_unlock();
}
static struct net_device *get_netvsc_bymac(const u8 *mac)
{
struct net_device *dev;
ASSERT_RTNL();
for_each_netdev(&init_net, dev) {
if (dev->netdev_ops != &device_ops)
continue; /* not a netvsc device */
if (ether_addr_equal(mac, dev->perm_addr))
return dev;
}
return NULL;
}
static struct net_device *get_netvsc_byref(struct net_device *vf_netdev)
{
struct net_device *dev;
ASSERT_RTNL();
for_each_netdev(&init_net, dev) {
struct net_device_context *net_device_ctx;
if (dev->netdev_ops != &device_ops)
continue; /* not a netvsc device */
net_device_ctx = netdev_priv(dev);
if (net_device_ctx->nvdev == NULL)
continue; /* device is removed */
if (rtnl_dereference(net_device_ctx->vf_netdev) == vf_netdev)
return dev; /* a match */
}
return NULL;
}
static int netvsc_register_vf(struct net_device *vf_netdev)
{
struct net_device *ndev;
struct net_device_context *net_device_ctx;
struct netvsc_device *netvsc_dev;
if (vf_netdev->addr_len != ETH_ALEN)
return NOTIFY_DONE;
/*
* We will use the MAC address to locate the synthetic interface to
* associate with the VF interface. If we don't find a matching
* synthetic interface, move on.
*/
ndev = get_netvsc_bymac(vf_netdev->perm_addr);
if (!ndev)
return NOTIFY_DONE;
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
if (!netvsc_dev || rtnl_dereference(net_device_ctx->vf_netdev))
return NOTIFY_DONE;
netdev_info(ndev, "VF registering: %s\n", vf_netdev->name);
/*
* Take a reference on the module.
*/
try_module_get(THIS_MODULE);
dev_hold(vf_netdev);
rcu_assign_pointer(net_device_ctx->vf_netdev, vf_netdev);
return NOTIFY_OK;
}
static int netvsc_vf_up(struct net_device *vf_netdev)
{
struct net_device *ndev;
struct netvsc_device *netvsc_dev;
struct net_device_context *net_device_ctx;
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
return NOTIFY_DONE;
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
netdev_info(ndev, "VF up: %s\n", vf_netdev->name);
/*
* Open the device before switching data path.
*/
rndis_filter_open(netvsc_dev);
/*
* notify the host to switch the data path.
*/
netvsc_switch_datapath(ndev, true);
netdev_info(ndev, "Data path switched to VF: %s\n", vf_netdev->name);
netif_carrier_off(ndev);
/* Now notify peers through VF device. */
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, vf_netdev);
return NOTIFY_OK;
}
static int netvsc_vf_down(struct net_device *vf_netdev)
{
struct net_device *ndev;
struct netvsc_device *netvsc_dev;
struct net_device_context *net_device_ctx;
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
return NOTIFY_DONE;
net_device_ctx = netdev_priv(ndev);
netvsc_dev = rtnl_dereference(net_device_ctx->nvdev);
netdev_info(ndev, "VF down: %s\n", vf_netdev->name);
netvsc_switch_datapath(ndev, false);
netdev_info(ndev, "Data path switched from VF: %s\n", vf_netdev->name);
rndis_filter_close(netvsc_dev);
netif_carrier_on(ndev);
/* Now notify peers through netvsc device. */
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, ndev);
return NOTIFY_OK;
}
static int netvsc_unregister_vf(struct net_device *vf_netdev)
{
struct net_device *ndev;
struct net_device_context *net_device_ctx;
ndev = get_netvsc_byref(vf_netdev);
if (!ndev)
return NOTIFY_DONE;
net_device_ctx = netdev_priv(ndev);
netdev_info(ndev, "VF unregistering: %s\n", vf_netdev->name);
RCU_INIT_POINTER(net_device_ctx->vf_netdev, NULL);
dev_put(vf_netdev);
module_put(THIS_MODULE);
return NOTIFY_OK;
}
static int netvsc_probe(struct hv_device *dev,
const struct hv_vmbus_device_id *dev_id)
{
struct net_device *net = NULL;
struct net_device_context *net_device_ctx;
struct netvsc_device_info device_info;
struct netvsc_device *nvdev;
int ret;
net = alloc_etherdev_mq(sizeof(struct net_device_context),
VRSS_CHANNEL_MAX);
if (!net)
return -ENOMEM;
netif_carrier_off(net);
netvsc_init_settings(net);
net_device_ctx = netdev_priv(net);
net_device_ctx->device_ctx = dev;
net_device_ctx->msg_enable = netif_msg_init(debug, default_msg);
if (netif_msg_probe(net_device_ctx))
netdev_dbg(net, "netvsc msg_enable: %d\n",
net_device_ctx->msg_enable);
hv_set_drvdata(dev, net);
INIT_DELAYED_WORK(&net_device_ctx->dwork, netvsc_link_change);
spin_lock_init(&net_device_ctx->lock);
INIT_LIST_HEAD(&net_device_ctx->reconfig_events);
net->netdev_ops = &device_ops;
net->ethtool_ops = &ethtool_ops;
SET_NETDEV_DEV(net, &dev->device);
/* We always need headroom for rndis header */
net->needed_headroom = RNDIS_AND_PPI_SIZE;
/* Notify the netvsc driver of the new device */
memset(&device_info, 0, sizeof(device_info));
device_info.ring_size = ring_size;
device_info.num_chn = VRSS_CHANNEL_DEFAULT;
ret = rndis_filter_device_add(dev, &device_info);
if (ret != 0) {
netdev_err(net, "unable to add netvsc device (ret %d)\n", ret);
free_netdev(net);
hv_set_drvdata(dev, NULL);
return ret;
}
memcpy(net->dev_addr, device_info.mac_adr, ETH_ALEN);
/* hw_features computed in rndis_filter_device_add */
net->features = net->hw_features |
NETIF_F_HIGHDMA | NETIF_F_SG |
NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
net->vlan_features = net->features;
/* RCU not necessary here, device not registered */
hv_netvsc: untangle the pointer mess We have the following structures keeping netvsc adapter state: - struct net_device - struct net_device_context - struct netvsc_device - struct rndis_device - struct hv_device and there are pointers/dependencies between them: - struct net_device_context is contained in struct net_device - struct hv_device has driver_data pointer which points to 'struct net_device' OR 'struct netvsc_device' depending on driver's state (!). - struct net_device_context has a pointer to 'struct hv_device'. - struct netvsc_device has pointers to 'struct hv_device' and 'struct net_device_context'. - struct rndis_device has a pointer to 'struct netvsc_device'. Different functions get different structures as parameters and use these pointers for traveling. The problem is (in addition to keeping in mind this complex graph) that some of these structures (struct netvsc_device and struct rndis_device) are being removed and re-created on mtu change (as we implement it as re-creation of hyper-v device) so our travel using these pointers is dangerous. Simplify this to a the following: - add struct netvsc_device pointer to struct net_device_context (which is a part of struct net_device and thus never disappears) - remove struct hv_device and struct net_device_context pointers from struct netvsc_device - replace pointer to 'struct netvsc_device' with pointer to 'struct net_device'. - always keep 'struct net_device' in hv_device driver_data. We'll end up with the following 'circular' structure: net_device: [net_device_context] -> netvsc_device -> rndis_device -> net_device -> hv_device -> net_device On MTU change we'll be removing the 'netvsc_device -> rndis_device' branch and re-creating it making the synchronization easier. There is one additional redundant pointer left, it is struct net_device link in struct netvsc_device, it is going to be removed in a separate commit. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-13 11:55:22 +00:00
nvdev = net_device_ctx->nvdev;
netif_set_real_num_tx_queues(net, nvdev->num_chn);
netif_set_real_num_rx_queues(net, nvdev->num_chn);
/* MTU range: 68 - 1500 or 65521 */
net->min_mtu = NETVSC_MTU_MIN;
if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2)
net->max_mtu = NETVSC_MTU - ETH_HLEN;
else
net->max_mtu = ETH_DATA_LEN;
ret = register_netdev(net);
if (ret != 0) {
pr_err("Unable to register netdev.\n");
rndis_filter_device_remove(dev, nvdev);
free_netdev(net);
}
return ret;
}
static int netvsc_remove(struct hv_device *dev)
{
struct net_device *net;
struct net_device_context *ndev_ctx;
hv_netvsc: untangle the pointer mess We have the following structures keeping netvsc adapter state: - struct net_device - struct net_device_context - struct netvsc_device - struct rndis_device - struct hv_device and there are pointers/dependencies between them: - struct net_device_context is contained in struct net_device - struct hv_device has driver_data pointer which points to 'struct net_device' OR 'struct netvsc_device' depending on driver's state (!). - struct net_device_context has a pointer to 'struct hv_device'. - struct netvsc_device has pointers to 'struct hv_device' and 'struct net_device_context'. - struct rndis_device has a pointer to 'struct netvsc_device'. Different functions get different structures as parameters and use these pointers for traveling. The problem is (in addition to keeping in mind this complex graph) that some of these structures (struct netvsc_device and struct rndis_device) are being removed and re-created on mtu change (as we implement it as re-creation of hyper-v device) so our travel using these pointers is dangerous. Simplify this to a the following: - add struct netvsc_device pointer to struct net_device_context (which is a part of struct net_device and thus never disappears) - remove struct hv_device and struct net_device_context pointers from struct netvsc_device - replace pointer to 'struct netvsc_device' with pointer to 'struct net_device'. - always keep 'struct net_device' in hv_device driver_data. We'll end up with the following 'circular' structure: net_device: [net_device_context] -> netvsc_device -> rndis_device -> net_device -> hv_device -> net_device On MTU change we'll be removing the 'netvsc_device -> rndis_device' branch and re-creating it making the synchronization easier. There is one additional redundant pointer left, it is struct net_device link in struct netvsc_device, it is going to be removed in a separate commit. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-13 11:55:22 +00:00
net = hv_get_drvdata(dev);
if (net == NULL) {
dev_err(&dev->device, "No net device to remove\n");
return 0;
}
ndev_ctx = netdev_priv(net);
hv_netvsc: untangle the pointer mess We have the following structures keeping netvsc adapter state: - struct net_device - struct net_device_context - struct netvsc_device - struct rndis_device - struct hv_device and there are pointers/dependencies between them: - struct net_device_context is contained in struct net_device - struct hv_device has driver_data pointer which points to 'struct net_device' OR 'struct netvsc_device' depending on driver's state (!). - struct net_device_context has a pointer to 'struct hv_device'. - struct netvsc_device has pointers to 'struct hv_device' and 'struct net_device_context'. - struct rndis_device has a pointer to 'struct netvsc_device'. Different functions get different structures as parameters and use these pointers for traveling. The problem is (in addition to keeping in mind this complex graph) that some of these structures (struct netvsc_device and struct rndis_device) are being removed and re-created on mtu change (as we implement it as re-creation of hyper-v device) so our travel using these pointers is dangerous. Simplify this to a the following: - add struct netvsc_device pointer to struct net_device_context (which is a part of struct net_device and thus never disappears) - remove struct hv_device and struct net_device_context pointers from struct netvsc_device - replace pointer to 'struct netvsc_device' with pointer to 'struct net_device'. - always keep 'struct net_device' in hv_device driver_data. We'll end up with the following 'circular' structure: net_device: [net_device_context] -> netvsc_device -> rndis_device -> net_device -> hv_device -> net_device On MTU change we'll be removing the 'netvsc_device -> rndis_device' branch and re-creating it making the synchronization easier. There is one additional redundant pointer left, it is struct net_device link in struct netvsc_device, it is going to be removed in a separate commit. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-13 11:55:22 +00:00
netif_device_detach(net);
cancel_delayed_work_sync(&ndev_ctx->dwork);
/*
* Call to the vsc driver to let it know that the device is being
* removed. Also blocks mtu and channel changes.
*/
rtnl_lock();
rndis_filter_device_remove(dev, ndev_ctx->nvdev);
rtnl_unlock();
unregister_netdev(net);
hv_netvsc: untangle the pointer mess We have the following structures keeping netvsc adapter state: - struct net_device - struct net_device_context - struct netvsc_device - struct rndis_device - struct hv_device and there are pointers/dependencies between them: - struct net_device_context is contained in struct net_device - struct hv_device has driver_data pointer which points to 'struct net_device' OR 'struct netvsc_device' depending on driver's state (!). - struct net_device_context has a pointer to 'struct hv_device'. - struct netvsc_device has pointers to 'struct hv_device' and 'struct net_device_context'. - struct rndis_device has a pointer to 'struct netvsc_device'. Different functions get different structures as parameters and use these pointers for traveling. The problem is (in addition to keeping in mind this complex graph) that some of these structures (struct netvsc_device and struct rndis_device) are being removed and re-created on mtu change (as we implement it as re-creation of hyper-v device) so our travel using these pointers is dangerous. Simplify this to a the following: - add struct netvsc_device pointer to struct net_device_context (which is a part of struct net_device and thus never disappears) - remove struct hv_device and struct net_device_context pointers from struct netvsc_device - replace pointer to 'struct netvsc_device' with pointer to 'struct net_device'. - always keep 'struct net_device' in hv_device driver_data. We'll end up with the following 'circular' structure: net_device: [net_device_context] -> netvsc_device -> rndis_device -> net_device -> hv_device -> net_device On MTU change we'll be removing the 'netvsc_device -> rndis_device' branch and re-creating it making the synchronization easier. There is one additional redundant pointer left, it is struct net_device link in struct netvsc_device, it is going to be removed in a separate commit. Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2016-05-13 11:55:22 +00:00
hv_set_drvdata(dev, NULL);
free_netdev(net);
return 0;
}
static const struct hv_vmbus_device_id id_table[] = {
/* Network guid */
{ HV_NIC_GUID, },
{ },
};
MODULE_DEVICE_TABLE(vmbus, id_table);
/* The one and only one */
static struct hv_driver netvsc_drv = {
.name = KBUILD_MODNAME,
.id_table = id_table,
.probe = netvsc_probe,
.remove = netvsc_remove,
};
/*
* On Hyper-V, every VF interface is matched with a corresponding
* synthetic interface. The synthetic interface is presented first
* to the guest. When the corresponding VF instance is registered,
* we will take care of switching the data path.
*/
static int netvsc_netdev_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
/* Skip our own events */
if (event_dev->netdev_ops == &device_ops)
return NOTIFY_DONE;
/* Avoid non-Ethernet type devices */
if (event_dev->type != ARPHRD_ETHER)
return NOTIFY_DONE;
/* Avoid Vlan dev with same MAC registering as VF */
if (is_vlan_dev(event_dev))
return NOTIFY_DONE;
/* Avoid Bonding master dev with same MAC registering as VF */
if ((event_dev->priv_flags & IFF_BONDING) &&
(event_dev->flags & IFF_MASTER))
return NOTIFY_DONE;
switch (event) {
case NETDEV_REGISTER:
return netvsc_register_vf(event_dev);
case NETDEV_UNREGISTER:
return netvsc_unregister_vf(event_dev);
case NETDEV_UP:
return netvsc_vf_up(event_dev);
case NETDEV_DOWN:
return netvsc_vf_down(event_dev);
default:
return NOTIFY_DONE;
}
}
static struct notifier_block netvsc_netdev_notifier = {
.notifier_call = netvsc_netdev_event,
};
static void __exit netvsc_drv_exit(void)
{
unregister_netdevice_notifier(&netvsc_netdev_notifier);
vmbus_driver_unregister(&netvsc_drv);
}
static int __init netvsc_drv_init(void)
{
int ret;
if (ring_size < RING_SIZE_MIN) {
ring_size = RING_SIZE_MIN;
pr_info("Increased ring_size to %d (min allowed)\n",
ring_size);
}
ret = vmbus_driver_register(&netvsc_drv);
if (ret)
return ret;
register_netdevice_notifier(&netvsc_netdev_notifier);
return 0;
}
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Microsoft Hyper-V network driver");
module_init(netvsc_drv_init);
module_exit(netvsc_drv_exit);