linux/drivers/net/team/team_mode_loadbalance.c

716 lines
18 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* drivers/net/team/team_mode_loadbalance.c - Load-balancing mode for team
* Copyright (c) 2012 Jiri Pirko <jpirko@redhat.com>
*/
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <linux/if_team.h>
/*
 * Receive hook of the load-balance mode.
 *
 * A frame with the ETH_P_SLOW protocol whose destination MAC is link-local
 * and ends in 0x02 (treated as an LACPDU) is returned as RX_HANDLER_EXACT;
 * every other frame yields RX_HANDLER_ANOTHER.
 */
static rx_handler_result_t lb_receive(struct team *team, struct team_port *port,
				      struct sk_buff *skb)
{
	const unsigned char *dst_mac;

	if (likely(skb->protocol != htons(ETH_P_SLOW)))
		return RX_HANDLER_ANOTHER;

	/* LACPDU packets should go to exact delivery */
	dst_mac = eth_hdr(skb)->h_dest;
	if (!is_link_local_ether_addr(dst_mac) || dst_mac[5] != 0x02)
		return RX_HANDLER_ANOTHER;

	return RX_HANDLER_EXACT;
}
/* Forward declaration: the selector typedef below takes a struct lb_priv *. */
struct lb_priv;
/*
 * Signature of a tx port selector. It receives the team, the mode private
 * data, the skb and the skb's one-byte hash, and returns the port to
 * transmit on (lb_transmit() drops the skb when the result is NULL).
 */
typedef struct team_port *lb_select_tx_port_func_t(struct team *,
struct lb_priv *,
struct sk_buff *,
unsigned char);
/* Size of every per-hash table in this file: one slot per hash value. */
#define LB_TX_HASHTABLE_SIZE 256 /* hash is a char */
/* Transmit counters; kept both per hash value and per port. */
struct lb_stats {
u64 tx_bytes; /* accumulated skb lengths, see lb_update_tx_stats() */
};
/* Per-CPU transmit statistics: one lb_stats entry for each hash value. */
struct lb_pcpu_stats {
struct lb_stats hash_stats[LB_TX_HASHTABLE_SIZE];
/* brackets the counter updates done in lb_update_tx_stats() */
struct u64_stats_sync syncp;
};
/*
 * Aggregated statistics tied to one team option instance.
 * On each refresh, stats is saved into last_stats and then rebuilt from the
 * per-CPU counters; the option instance is flagged as changed when the two
 * differ (see __lb_stats_info_refresh_prepare/_check).
 */
struct lb_stats_info {
struct lb_stats stats; /* most recently accumulated totals */
struct lb_stats last_stats; /* totals from the previous refresh */
struct team_option_inst_info *opt_inst_info; /* set by the option init hooks */
};
/* One slot of the hash-to-port table. */
struct lb_port_mapping {
struct team_port __rcu *port; /* port assigned to this hash, NULL if none */
struct team_option_inst_info *opt_inst_info; /* option instance of this slot */
};
/*
 * Extension of struct lb_priv, reached through lb_priv->ex. Holds the
 * hash-to-port table, the saved classic BPF program and the state of the
 * statistics refresh work.
 */
struct lb_priv_ex {
struct team *team; /* back-pointer, read by lb_stats_refresh() */
struct lb_port_mapping tx_hash_to_port_mapping[LB_TX_HASHTABLE_SIZE];
/* program stored by lb_bpf_func_set(); NULL when no program is set */
struct sock_fprog_kern *orig_fprog;
struct {
unsigned int refresh_interval; /* in tenths of second */
struct delayed_work refresh_dw; /* work item handled by lb_stats_refresh() */
struct lb_stats_info info[LB_TX_HASHTABLE_SIZE]; /* one entry per hash value */
} stats;
};
struct lb_priv {
net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets and cleans up 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains __bpf_prog_release(struct bpf_prog *) helper function also perform related renames for the functions that work with 'struct bpf_prog *', since they're on the same lines: sk_filter_size -> bpf_prog_size sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov 
<ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 03:34:16 +00:00
struct bpf_prog __rcu *fp;
lb_select_tx_port_func_t __rcu *select_tx_port_func;
struct lb_pcpu_stats __percpu *pcpu_stats;
struct lb_priv_ex *ex; /* priv extension */
};
/* Return the load-balance private data that lives in @team's mode_priv. */
static struct lb_priv *get_lb_priv(struct team *team)
{
	void *mode_priv = &team->mode_priv;

	return mode_priv;
}
/* Per-port private data of the load-balance mode (see get_lb_port_priv()). */
struct lb_port_priv {
struct lb_stats __percpu *pcpu_stats; /* per-CPU tx counters of this port */
struct lb_stats_info stats_info; /* aggregated view built by lb_stats_refresh() */
};
/* Return the load-balance private data that lives in @port's mode_priv. */
static struct lb_port_priv *get_lb_port_priv(struct team_port *port)
{
	void *mode_priv = &port->mode_priv;

	return mode_priv;
}
/*
 * Accessors for one slot of the hash-to-port mapping table. Both expand to
 * an lvalue, so they can be read, assigned, or handed to the RCU helpers.
 *
 * The macro parameter is named lb_priv to match the expansion; it used to be
 * spelled "lp_priv", which made the argument ignored and silently required a
 * variable called lb_priv at every call site.
 */
#define LB_HTPM_PORT_BY_HASH(lb_priv, hash) \
	(lb_priv)->ex->tx_hash_to_port_mapping[(hash)].port
#define LB_HTPM_OPT_INST_INFO_BY_HASH(lb_priv, hash) \
	(lb_priv)->ex->tx_hash_to_port_mapping[(hash)].opt_inst_info
/*
 * Clear every hash-to-port slot that currently points at @port and mark the
 * corresponding option instance as changed. If at least one slot was
 * cleared, run the team option change check.
 */
static void lb_tx_hash_to_port_mapping_null_port(struct team *team,
						 struct team_port *port)
{
	struct lb_priv *lb_priv = get_lb_priv(team);
	struct lb_port_mapping *table = lb_priv->ex->tx_hash_to_port_mapping;
	unsigned int cleared = 0;
	unsigned int hash;

	for (hash = 0; hash < LB_TX_HASHTABLE_SIZE; hash++) {
		struct lb_port_mapping *slot = &table[hash];

		if (rcu_access_pointer(slot->port) != port)
			continue;

		RCU_INIT_POINTER(slot->port, NULL);
		team_option_inst_set_change(slot->opt_inst_info);
		cleared++;
	}

	if (cleared)
		team_options_change_check(team);
}
/*
 * Basic tx selection: the hash alone is turned into a port index and the
 * port at that index is returned. @lb_priv and @skb are unused here.
 */
static struct team_port *lb_hash_select_tx_port(struct team *team,
						struct lb_priv *lb_priv,
						struct sk_buff *skb,
						unsigned char hash)
{
	return team_get_port_by_index_rcu(team,
					  team_num_to_port_index(team, hash));
}
/*
 * Tx selection through the hash-to-port mapping table: return the port
 * stored for @hash, or the plain hash-based choice when the slot is empty.
 */
static struct team_port *lb_htpm_select_tx_port(struct team *team,
						struct lb_priv *lb_priv,
						struct sk_buff *skb,
						unsigned char hash)
{
	struct team_port *mapped;

	mapped = rcu_dereference_bh(LB_HTPM_PORT_BY_HASH(lb_priv, hash));
	/* If no valid port in the table, fall back to simple hash */
	if (unlikely(!mapped))
		return lb_hash_select_tx_port(team, lb_priv, skb, hash);

	return mapped;
}
/* Associates a tx method name with the selector function implementing it. */
struct lb_select_tx_port {
char *name; /* name matched by lb_select_tx_port_get_func() */
lb_select_tx_port_func_t *func; /* selector returned for that name */
};
/*
 * All known tx port selectors. lb_select_tx_port_get_name() and
 * lb_select_tx_port_get_func() search this table linearly.
 */
static const struct lb_select_tx_port lb_select_tx_port_list[] = {
{
.name = "hash",
.func = lb_hash_select_tx_port,
},
{
.name = "hash_to_port_mapping",
.func = lb_htpm_select_tx_port,
},
};
/* Number of entries in lb_select_tx_port_list. */
#define LB_SELECT_TX_PORT_LIST_COUNT ARRAY_SIZE(lb_select_tx_port_list)
/*
 * Reverse lookup in lb_select_tx_port_list: return the name registered for
 * selector @func, or NULL when @func is not in the table.
 */
static char *lb_select_tx_port_get_name(lb_select_tx_port_func_t *func)
{
	const struct lb_select_tx_port *entry = lb_select_tx_port_list;
	const struct lb_select_tx_port *end =
		lb_select_tx_port_list + LB_SELECT_TX_PORT_LIST_COUNT;

	for (; entry < end; entry++) {
		if (entry->func == func)
			return entry->name;
	}
	return NULL;
}
static lb_select_tx_port_func_t *lb_select_tx_port_get_func(const char *name)
{
int i;
for (i = 0; i < LB_SELECT_TX_PORT_LIST_COUNT; i++) {
const struct lb_select_tx_port *item;
item = &lb_select_tx_port_list[i];
if (!strcmp(item->name, name))
return item->func;
}
return NULL;
}
static unsigned int lb_get_skb_hash(struct lb_priv *lb_priv,
struct sk_buff *skb)
{
net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets and cleans up 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains __bpf_prog_release(struct bpf_prog *) helper function also perform related renames for the functions that work with 'struct bpf_prog *', since they're on the same lines: sk_filter_size -> bpf_prog_size sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov 
<ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 03:34:16 +00:00
struct bpf_prog *fp;
uint32_t lhash;
unsigned char *c;
fp = rcu_dereference_bh(lb_priv->fp);
if (unlikely(!fp))
return 0;
lhash = bpf_prog_run(fp, skb);
c = (char *) &lhash;
return c[0] ^ c[1] ^ c[2] ^ c[3];
}
/*
 * Add @tx_bytes to this CPU's counters for both the transmitting port and
 * the hash bucket @hash. Both updates happen inside one
 * u64_stats_update_begin()/end() section on the team-wide per-CPU syncp.
 */
static void lb_update_tx_stats(unsigned int tx_bytes, struct lb_priv *lb_priv,
			       struct lb_port_priv *lb_port_priv,
			       unsigned char hash)
{
	struct lb_pcpu_stats *cpu_stats = this_cpu_ptr(lb_priv->pcpu_stats);
	struct lb_stats *per_port = this_cpu_ptr(lb_port_priv->pcpu_stats);
	struct lb_stats *per_hash = &cpu_stats->hash_stats[hash];

	u64_stats_update_begin(&cpu_stats->syncp);
	per_port->tx_bytes += tx_bytes;
	per_hash->tx_bytes += tx_bytes;
	u64_stats_update_end(&cpu_stats->syncp);
}
static bool lb_transmit(struct team *team, struct sk_buff *skb)
{
struct lb_priv *lb_priv = get_lb_priv(team);
lb_select_tx_port_func_t *select_tx_port_func;
struct team_port *port;
unsigned char hash;
unsigned int tx_bytes = skb->len;
hash = lb_get_skb_hash(lb_priv, skb);
select_tx_port_func = rcu_dereference_bh(lb_priv->select_tx_port_func);
port = select_tx_port_func(team, lb_priv, skb, hash);
if (unlikely(!port))
goto drop;
if (team_dev_queue_xmit(team, port, skb))
return false;
lb_update_tx_stats(tx_bytes, lb_priv, get_lb_port_priv(port), hash);
return true;
drop:
dev_kfree_skb_any(skb);
return false;
}
static int lb_bpf_func_get(struct team *team, struct team_gsetter_ctx *ctx)
{
struct lb_priv *lb_priv = get_lb_priv(team);
if (!lb_priv->ex->orig_fprog) {
ctx->data.bin_val.len = 0;
ctx->data.bin_val.ptr = NULL;
return 0;
}
ctx->data.bin_val.len = lb_priv->ex->orig_fprog->len *
sizeof(struct sock_filter);
ctx->data.bin_val.ptr = lb_priv->ex->orig_fprog->filter;
return 0;
}
/*
 * Build a sock_fprog_kern holding a private copy of @data.
 *
 * @pfprog:   receives the new program on success
 * @data_len: size of @data in bytes; must be a multiple of
 *            sizeof(struct sock_filter)
 * @data:     array of struct sock_filter instructions
 *
 * Returns 0 on success, -EINVAL for a bad length, -ENOMEM when an
 * allocation fails. The result is released with __fprog_destroy().
 */
static int __fprog_create(struct sock_fprog_kern **pfprog, u32 data_len,
			  const void *data)
{
	struct sock_fprog_kern *prog;

	if (data_len % sizeof(struct sock_filter) != 0)
		return -EINVAL;

	prog = kmalloc(sizeof(*prog), GFP_KERNEL);
	if (!prog)
		return -ENOMEM;

	prog->filter = kmemdup(data, data_len, GFP_KERNEL);
	if (!prog->filter)
		goto err_free_prog;

	prog->len = data_len / sizeof(struct sock_filter);
	*pfprog = prog;
	return 0;

err_free_prog:
	kfree(prog);
	return -ENOMEM;
}
/* Free a program built by __fprog_create(): its instruction array, then itself. */
static void __fprog_destroy(struct sock_fprog_kern *fprog)
{
	struct sock_filter *insns = fprog->filter;

	kfree(insns);
	kfree(fprog);
}
static int lb_bpf_func_set(struct team *team, struct team_gsetter_ctx *ctx)
{
struct lb_priv *lb_priv = get_lb_priv(team);
net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets and cleans up 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains __bpf_prog_release(struct bpf_prog *) helper function also perform related renames for the functions that work with 'struct bpf_prog *', since they're on the same lines: sk_filter_size -> bpf_prog_size sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov 
<ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 03:34:16 +00:00
struct bpf_prog *fp = NULL;
struct bpf_prog *orig_fp = NULL;
struct sock_fprog_kern *fprog = NULL;
int err;
if (ctx->data.bin_val.len) {
err = __fprog_create(&fprog, ctx->data.bin_val.len,
ctx->data.bin_val.ptr);
if (err)
return err;
net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets and cleans up 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains __bpf_prog_release(struct bpf_prog *) helper function also perform related renames for the functions that work with 'struct bpf_prog *', since they're on the same lines: sk_filter_size -> bpf_prog_size sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov 
<ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 03:34:16 +00:00
err = bpf_prog_create(&fp, fprog);
if (err) {
__fprog_destroy(fprog);
return err;
}
}
if (lb_priv->ex->orig_fprog) {
/* Clear old filter data */
__fprog_destroy(lb_priv->ex->orig_fprog);
orig_fp = rcu_dereference_protected(lb_priv->fp,
lockdep_is_held(&team->lock));
}
rcu_assign_pointer(lb_priv->fp, fp);
lb_priv->ex->orig_fprog = fprog;
net: filter: don't release unattached filter through call_rcu() sk_unattached_filter_destroy() does not always need to release the filter object via rcu. Since this filter is never attached to the socket, the caller should be responsible for releasing the filter in a safe way, which may not necessarily imply rcu. This is a short summary of clients of this function: 1) xt_bpf.c and cls_bpf.c use the bpf matchers from rules, these rules are removed from the packet path before the filter is released. Thus, the framework makes sure the filter is safely removed. 2) In the ppp driver, the ppp_lock ensures serialization between the xmit and filter attachment/detachment path. This doesn't use rcu so deferred release via rcu makes no sense. 3) In the isdn/ppp driver, it is called from isdn_ppp_release() the isdn_ppp_ioctl(). This driver uses mutex and spinlocks, no rcu. Thus, deferred rcu makes no sense to me either, the deferred releases may be just masking the effects of wrong locking strategy, which should be fixed in the driver itself. 4) In the team driver, this is the only place where the rcu synchronization with unattached filter is used. Therefore, this patch introduces synchronize_rcu() which is called from the genetlink path to make sure the filter doesn't go away while packets are still walking over it. I think we can revisit this once struct bpf_prog (that only wraps specific bpf code bits) is in place, then add some specific struct rcu_head in the scope of the team driver if Jiri thinks this is needed. Deferred rcu release for unattached filters was originally introduced in 302d663 ("filter: Allow to create sk-unattached filters"). Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-29 15:36:28 +00:00
if (orig_fp) {
synchronize_rcu();
net: filter: split 'struct sk_filter' into socket and bpf parts clean up names related to socket filtering and bpf in the following way: - everything that deals with sockets keeps 'sk_*' prefix - everything that is pure BPF is changed to 'bpf_*' prefix split 'struct sk_filter' into struct sk_filter { atomic_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; and struct bpf_prog { u32 jited:1, len:31; struct sock_fprog_kern *orig_prog; unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); union { struct sock_filter insns[0]; struct bpf_insn insnsi[0]; struct work_struct work; }; }; so that 'struct bpf_prog' can be used independent of sockets and cleans up 'unattached' bpf use cases split SK_RUN_FILTER macro into: SK_RUN_FILTER to be used with 'struct sk_filter *' and BPF_PROG_RUN to be used with 'struct bpf_prog *' __sk_filter_release(struct sk_filter *) gains __bpf_prog_release(struct bpf_prog *) helper function also perform related renames for the functions that work with 'struct bpf_prog *', since they're on the same lines: sk_filter_size -> bpf_prog_size sk_filter_select_runtime -> bpf_prog_select_runtime sk_filter_free -> bpf_prog_free sk_unattached_filter_create -> bpf_prog_create sk_unattached_filter_destroy -> bpf_prog_destroy sk_store_orig_filter -> bpf_prog_store_orig_filter sk_release_orig_filter -> bpf_release_orig_filter __sk_migrate_filter -> bpf_migrate_filter __sk_prepare_filter -> bpf_prepare_filter API for attaching classic BPF to a socket stays the same: sk_attach_filter(prog, struct sock *)/sk_detach_filter(struct sock *) and SK_RUN_FILTER(struct sk_filter *, ctx) to execute a program which is used by sockets, tun, af_packet API for 'unattached' BPF programs becomes: bpf_prog_create(struct bpf_prog **)/bpf_prog_destroy(struct bpf_prog *) and BPF_PROG_RUN(struct bpf_prog *, ctx) to execute a program which is used by isdn, ppp, team, seccomp, ptp, xt_bpf, cls_bpf, test_bpf Signed-off-by: Alexei Starovoitov 
<ast@plumgrid.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-31 03:34:16 +00:00
bpf_prog_destroy(orig_fp);
net: filter: don't release unattached filter through call_rcu() sk_unattached_filter_destroy() does not always need to release the filter object via rcu. Since this filter is never attached to the socket, the caller should be responsible for releasing the filter in a safe way, which may not necessarily imply rcu. This is a short summary of clients of this function: 1) xt_bpf.c and cls_bpf.c use the bpf matchers from rules, these rules are removed from the packet path before the filter is released. Thus, the framework makes sure the filter is safely removed. 2) In the ppp driver, the ppp_lock ensures serialization between the xmit and filter attachment/detachment path. This doesn't use rcu so deferred release via rcu makes no sense. 3) In the isdn/ppp driver, it is called from isdn_ppp_release() the isdn_ppp_ioctl(). This driver uses mutex and spinlocks, no rcu. Thus, deferred rcu makes no sense to me either, the deferred releases may be just masking the effects of wrong locking strategy, which should be fixed in the driver itself. 4) In the team driver, this is the only place where the rcu synchronization with unattached filter is used. Therefore, this patch introduces synchronize_rcu() which is called from the genetlink path to make sure the filter doesn't go away while packets are still walking over it. I think we can revisit this once struct bpf_prog (that only wraps specific bpf code bits) is in place, then add some specific struct rcu_head in the scope of the team driver if Jiri thinks this is needed. Deferred rcu release for unattached filters was originally introduced in 302d663 ("filter: Allow to create sk-unattached filters"). Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Signed-off-by: David S. Miller <davem@davemloft.net>
2014-07-29 15:36:28 +00:00
}
return 0;
}
team: Free BPF filter when unregistering netdev When team is used in loadbalance mode a BPF filter can be used to provide a hash which will determine the Tx port. When the netdev is later unregistered the filter is not freed which results in memory leaks [1]. Fix by freeing the program and the corresponding filter when unregistering the netdev. [1] unreferenced object 0xffff8881dbc47cc8 (size 16): comm "teamd", pid 3068, jiffies 4294997779 (age 438.247s) hex dump (first 16 bytes): a3 00 6b 6b 6b 6b 6b 6b 88 a5 82 e1 81 88 ff ff ..kkkkkk........ backtrace: [<000000008a3b47e3>] team_nl_cmd_options_set+0x88f/0x11b0 [<00000000c4f4f27e>] genl_family_rcv_msg+0x78f/0x1080 [<00000000610ef838>] genl_rcv_msg+0xca/0x170 [<00000000a281df93>] netlink_rcv_skb+0x132/0x380 [<000000004d9448a2>] genl_rcv+0x29/0x40 [<000000000321b2f4>] netlink_unicast+0x4c0/0x690 [<000000008c25dffb>] netlink_sendmsg+0x929/0xe10 [<00000000068298c5>] sock_sendmsg+0xc8/0x110 [<0000000082a61ff0>] ___sys_sendmsg+0x77a/0x8f0 [<00000000663ae29d>] __sys_sendmsg+0xf7/0x250 [<0000000027c5f11a>] do_syscall_64+0x14d/0x610 [<000000006cfbc8d3>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<00000000e23197e2>] 0xffffffffffffffff unreferenced object 0xffff8881e182a588 (size 2048): comm "teamd", pid 3068, jiffies 4294997780 (age 438.247s) hex dump (first 32 bytes): 20 00 00 00 02 00 00 00 30 00 00 00 28 f0 ff ff .......0...(... 07 00 00 00 00 00 00 00 28 00 00 00 00 00 00 00 ........(....... 
backtrace: [<000000002daf01fb>] lb_bpf_func_set+0x45c/0x6d0 [<000000008a3b47e3>] team_nl_cmd_options_set+0x88f/0x11b0 [<00000000c4f4f27e>] genl_family_rcv_msg+0x78f/0x1080 [<00000000610ef838>] genl_rcv_msg+0xca/0x170 [<00000000a281df93>] netlink_rcv_skb+0x132/0x380 [<000000004d9448a2>] genl_rcv+0x29/0x40 [<000000000321b2f4>] netlink_unicast+0x4c0/0x690 [<000000008c25dffb>] netlink_sendmsg+0x929/0xe10 [<00000000068298c5>] sock_sendmsg+0xc8/0x110 [<0000000082a61ff0>] ___sys_sendmsg+0x77a/0x8f0 [<00000000663ae29d>] __sys_sendmsg+0xf7/0x250 [<0000000027c5f11a>] do_syscall_64+0x14d/0x610 [<000000006cfbc8d3>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<00000000e23197e2>] 0xffffffffffffffff Fixes: 01d7f30a9f96 ("team: add loadbalance mode") Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reported-by: Amit Cohen <amitc@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-03 07:35:51 +00:00
/*
 * Release the hash program, if one is set: free the stored sock_fprog_kern
 * copy and destroy the bpf_prog read from lb_priv->fp. Does nothing when
 * ex->orig_fprog is NULL.
 */
static void lb_bpf_func_free(struct team *team)
{
	struct lb_priv *lb_priv = get_lb_priv(team);
	struct sock_fprog_kern *stored = lb_priv->ex->orig_fprog;
	struct bpf_prog *prog;

	if (stored) {
		__fprog_destroy(stored);
		prog = rcu_dereference_protected(lb_priv->fp,
						 lockdep_is_held(&team->lock));
		bpf_prog_destroy(prog);
	}
}
/*
 * Getter of the "lb_tx_method" option: report the name of the active
 * selector. The selector must be present in lb_select_tx_port_list,
 * otherwise BUG_ON() fires. Always returns 0.
 */
static int lb_tx_method_get(struct team *team, struct team_gsetter_ctx *ctx)
{
	struct lb_priv *lb_priv = get_lb_priv(team);
	lb_select_tx_port_func_t *selector;
	char *method;

	selector = rcu_dereference_protected(lb_priv->select_tx_port_func,
					     lockdep_is_held(&team->lock));
	method = lb_select_tx_port_get_name(selector);
	BUG_ON(!method);

	ctx->data.str_val = method;
	return 0;
}
static int lb_tx_method_set(struct team *team, struct team_gsetter_ctx *ctx)
{
struct lb_priv *lb_priv = get_lb_priv(team);
lb_select_tx_port_func_t *func;
func = lb_select_tx_port_get_func(ctx->data.str_val);
if (!func)
return -EINVAL;
rcu_assign_pointer(lb_priv->select_tx_port_func, func);
return 0;
}
static int lb_tx_hash_to_port_mapping_init(struct team *team,
struct team_option_inst_info *info)
{
struct lb_priv *lb_priv = get_lb_priv(team);
unsigned char hash = info->array_index;
LB_HTPM_OPT_INST_INFO_BY_HASH(lb_priv, hash) = info;
return 0;
}
static int lb_tx_hash_to_port_mapping_get(struct team *team,
struct team_gsetter_ctx *ctx)
{
struct lb_priv *lb_priv = get_lb_priv(team);
struct team_port *port;
unsigned char hash = ctx->info->array_index;
port = LB_HTPM_PORT_BY_HASH(lb_priv, hash);
ctx->data.u32_val = port ? port->dev->ifindex : 0;
return 0;
}
/*
 * Setter of one "lb_tx_hash_to_port_mapping" entry: map the hash given by
 * the option's array index to the enabled team port whose ifindex equals
 * ctx->data.u32_val. Returns -ENODEV when no such enabled port exists.
 */
static int lb_tx_hash_to_port_mapping_set(struct team *team,
					  struct team_gsetter_ctx *ctx)
{
	struct lb_priv *lb_priv = get_lb_priv(team);
	unsigned char hash = ctx->info->array_index;
	struct team_port *port;

	list_for_each_entry(port, &team->port_list, list) {
		if (ctx->data.u32_val != port->dev->ifindex)
			continue;
		if (!team_port_enabled(port))
			continue;

		rcu_assign_pointer(LB_HTPM_PORT_BY_HASH(lb_priv, hash), port);
		return 0;
	}
	return -ENODEV;
}
static int lb_hash_stats_init(struct team *team,
struct team_option_inst_info *info)
{
struct lb_priv *lb_priv = get_lb_priv(team);
unsigned char hash = info->array_index;
lb_priv->ex->stats.info[hash].opt_inst_info = info;
return 0;
}
static int lb_hash_stats_get(struct team *team, struct team_gsetter_ctx *ctx)
{
struct lb_priv *lb_priv = get_lb_priv(team);
unsigned char hash = ctx->info->array_index;
ctx->data.bin_val.ptr = &lb_priv->ex->stats.info[hash].stats;
ctx->data.bin_val.len = sizeof(struct lb_stats);
return 0;
}
static int lb_port_stats_init(struct team *team,
struct team_option_inst_info *info)
{
struct team_port *port = info->port;
struct lb_port_priv *lb_port_priv = get_lb_port_priv(port);
lb_port_priv->stats_info.opt_inst_info = info;
return 0;
}
static int lb_port_stats_get(struct team *team, struct team_gsetter_ctx *ctx)
{
struct team_port *port = ctx->info->port;
struct lb_port_priv *lb_port_priv = get_lb_port_priv(port);
ctx->data.bin_val.ptr = &lb_port_priv->stats_info.stats;
ctx->data.bin_val.len = sizeof(struct lb_stats);
return 0;
}
/*
 * Start a refresh round for @s_info: keep the current totals in last_stats
 * and zero stats so it can be accumulated again.
 */
static void __lb_stats_info_refresh_prepare(struct lb_stats_info *s_info)
{
	struct lb_stats *current_totals = &s_info->stats;
	struct lb_stats *previous_totals = &s_info->last_stats;

	memcpy(previous_totals, current_totals, sizeof(*previous_totals));
	memset(current_totals, 0, sizeof(*current_totals));
}
/*
 * Finish a refresh round for @s_info: when the new totals differ from the
 * previous ones, mark the option instance as changed and return true;
 * otherwise return false. @team is not used.
 */
static bool __lb_stats_info_refresh_check(struct lb_stats_info *s_info,
					  struct team *team)
{
	bool differs = memcmp(&s_info->last_stats, &s_info->stats,
			      sizeof(s_info->stats)) != 0;

	if (differs)
		team_option_inst_set_change(s_info->opt_inst_info);
	return differs;
}
static void __lb_one_cpu_stats_add(struct lb_stats *acc_stats,
struct lb_stats *cpu_stats,
struct u64_stats_sync *syncp)
{
unsigned int start;
struct lb_stats tmp;
do {
start = u64_stats_fetch_begin(syncp);
tmp.tx_bytes = cpu_stats->tx_bytes;
} while (u64_stats_fetch_retry(syncp, start));
acc_stats->tx_bytes += tmp.tx_bytes;
}
/*
 * Delayed-work handler that rebuilds the aggregated statistics.
 *
 * If team->lock cannot be taken right away, the work is rescheduled with
 * zero delay and the handler returns. Otherwise the per-hash and per-port
 * totals are re-accumulated from all possible CPUs, option instances whose
 * totals changed are flagged, the option change check runs if anything
 * changed, and the work is rescheduled after refresh_interval tenths of a
 * second before the lock is dropped.
 */
static void lb_stats_refresh(struct work_struct *work)
{
	struct lb_priv_ex *lb_priv_ex = container_of(work, struct lb_priv_ex,
						     stats.refresh_dw.work);
	struct team *team = lb_priv_ex->team;
	struct lb_priv *lb_priv = get_lb_priv(team);
	struct team_port *port;
	bool changed = false;
	int hash;
	int cpu;

	if (!mutex_trylock(&team->lock)) {
		schedule_delayed_work(&lb_priv_ex->stats.refresh_dw, 0);
		return;
	}

	/* per-hash totals */
	for (hash = 0; hash < LB_TX_HASHTABLE_SIZE; hash++) {
		struct lb_stats_info *s_info = &lb_priv->ex->stats.info[hash];

		__lb_stats_info_refresh_prepare(s_info);
		for_each_possible_cpu(cpu) {
			struct lb_pcpu_stats *cpu_stats =
				per_cpu_ptr(lb_priv->pcpu_stats, cpu);

			__lb_one_cpu_stats_add(&s_info->stats,
					       &cpu_stats->hash_stats[hash],
					       &cpu_stats->syncp);
		}
		if (__lb_stats_info_refresh_check(s_info, team))
			changed = true;
	}

	/* per-port totals; the sync object is still the team-wide per-CPU one */
	list_for_each_entry(port, &team->port_list, list) {
		struct lb_port_priv *lb_port_priv = get_lb_port_priv(port);
		struct lb_stats_info *s_info = &lb_port_priv->stats_info;

		__lb_stats_info_refresh_prepare(s_info);
		for_each_possible_cpu(cpu) {
			struct lb_pcpu_stats *cpu_stats =
				per_cpu_ptr(lb_priv->pcpu_stats, cpu);
			struct lb_stats *port_stats =
				per_cpu_ptr(lb_port_priv->pcpu_stats, cpu);

			__lb_one_cpu_stats_add(&s_info->stats, port_stats,
					       &cpu_stats->syncp);
		}
		if (__lb_stats_info_refresh_check(s_info, team))
			changed = true;
	}

	if (changed)
		team_options_change_check(team);

	schedule_delayed_work(&lb_priv_ex->stats.refresh_dw,
			      (lb_priv_ex->stats.refresh_interval * HZ) / 10);

	mutex_unlock(&team->lock);
}
static int lb_stats_refresh_interval_get(struct team *team,
struct team_gsetter_ctx *ctx)
{
struct lb_priv *lb_priv = get_lb_priv(team);
ctx->data.u32_val = lb_priv->ex->stats.refresh_interval;
return 0;
}
static int lb_stats_refresh_interval_set(struct team *team,
struct team_gsetter_ctx *ctx)
{
struct lb_priv *lb_priv = get_lb_priv(team);
unsigned int interval;
interval = ctx->data.u32_val;
if (lb_priv->ex->stats.refresh_interval == interval)
return 0;
lb_priv->ex->stats.refresh_interval = interval;
if (interval)
schedule_delayed_work(&lb_priv->ex->stats.refresh_dw, 0);
else
cancel_delayed_work(&lb_priv->ex->stats.refresh_dw);
return 0;
}
/*
 * Team options provided by the load-balance mode. Entries without a
 * .setter have no write handler; array options have one instance per hash
 * value.
 */
static const struct team_option lb_options[] = {
/* classic BPF program used to hash outgoing skbs */
{
.name = "bpf_hash_func",
.type = TEAM_OPTION_TYPE_BINARY,
.getter = lb_bpf_func_get,
.setter = lb_bpf_func_set,
},
/* name of the active tx port selector (see lb_select_tx_port_list) */
{
.name = "lb_tx_method",
.type = TEAM_OPTION_TYPE_STRING,
.getter = lb_tx_method_get,
.setter = lb_tx_method_set,
},
/* per-hash port assignment, expressed as the port's ifindex */
{
.name = "lb_tx_hash_to_port_mapping",
.array_size = LB_TX_HASHTABLE_SIZE,
.type = TEAM_OPTION_TYPE_U32,
.init = lb_tx_hash_to_port_mapping_init,
.getter = lb_tx_hash_to_port_mapping_get,
.setter = lb_tx_hash_to_port_mapping_set,
},
/* per-hash aggregated struct lb_stats */
{
.name = "lb_hash_stats",
.array_size = LB_TX_HASHTABLE_SIZE,
.type = TEAM_OPTION_TYPE_BINARY,
.init = lb_hash_stats_init,
.getter = lb_hash_stats_get,
},
/* per-port aggregated struct lb_stats */
{
.name = "lb_port_stats",
.per_port = true,
.type = TEAM_OPTION_TYPE_BINARY,
.init = lb_port_stats_init,
.getter = lb_port_stats_get,
},
/* statistics refresh period in tenths of a second; 0 stops refreshing */
{
.name = "lb_stats_refresh_interval",
.type = TEAM_OPTION_TYPE_U32,
.getter = lb_stats_refresh_interval_get,
.setter = lb_stats_refresh_interval_set,
},
};
static int lb_init(struct team *team)
{
struct lb_priv *lb_priv = get_lb_priv(team);
lb_select_tx_port_func_t *func;
net: Explicitly initialize u64_stats_sync structures for lockdep In order to enable lockdep on seqcount/seqlock structures, we must explicitly initialize any locks. The u64_stats_sync structure, uses a seqcount, and thus we need to introduce a u64_stats_init() function and use it to initialize the structure. This unfortunately adds a lot of fairly trivial initialization code to a number of drivers. But the benefit of ensuring correctness makes this worth while. Because these changes are required for lockdep to be enabled, and the changes are quite trivial, I've not yet split this patch out into 30-some separate patches, as I figured it would be better to get the various maintainers thoughts on how to best merge this change along with the seqcount lockdep enablement. Feedback would be appreciated! Signed-off-by: John Stultz <john.stultz@linaro.org> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> Cc: James Morris <jmorris@namei.org> Cc: Jesse Gross <jesse@nicira.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mirko Lindner <mlindner@marvell.com> Cc: Patrick McHardy <kaber@trash.net> Cc: Roger Luethi <rl@hellgate.ch> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Simon Horman <horms@verge.net.au> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com> Cc: Wensong Zhang <wensong@linux-vs.org> Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/1381186321-4906-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-07 22:51:58 +00:00
int i, err;
/* set default tx port selector */
func = lb_select_tx_port_get_func("hash");
BUG_ON(!func);
rcu_assign_pointer(lb_priv->select_tx_port_func, func);
lb_priv->ex = kzalloc(sizeof(*lb_priv->ex), GFP_KERNEL);
if (!lb_priv->ex)
return -ENOMEM;
lb_priv->ex->team = team;
lb_priv->pcpu_stats = alloc_percpu(struct lb_pcpu_stats);
if (!lb_priv->pcpu_stats) {
err = -ENOMEM;
goto err_alloc_pcpu_stats;
}
net: Explicitly initialize u64_stats_sync structures for lockdep In order to enable lockdep on seqcount/seqlock structures, we must explicitly initialize any locks. The u64_stats_sync structure, uses a seqcount, and thus we need to introduce a u64_stats_init() function and use it to initialize the structure. This unfortunately adds a lot of fairly trivial initialization code to a number of drivers. But the benefit of ensuring correctness makes this worth while. Because these changes are required for lockdep to be enabled, and the changes are quite trivial, I've not yet split this patch out into 30-some separate patches, as I figured it would be better to get the various maintainers thoughts on how to best merge this change along with the seqcount lockdep enablement. Feedback would be appreciated! Signed-off-by: John Stultz <john.stultz@linaro.org> Acked-by: Julian Anastasov <ja@ssi.bg> Signed-off-by: Peter Zijlstra <peterz@infradead.org> Cc: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> Cc: "David S. Miller" <davem@davemloft.net> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org> Cc: James Morris <jmorris@namei.org> Cc: Jesse Gross <jesse@nicira.com> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: "Michael S. Tsirkin" <mst@redhat.com> Cc: Mirko Lindner <mlindner@marvell.com> Cc: Patrick McHardy <kaber@trash.net> Cc: Roger Luethi <rl@hellgate.ch> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Simon Horman <horms@verge.net.au> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Thomas Petazzoni <thomas.petazzoni@free-electrons.com> Cc: Wensong Zhang <wensong@linux-vs.org> Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/1381186321-4906-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-10-07 22:51:58 +00:00
for_each_possible_cpu(i) {
struct lb_pcpu_stats *team_lb_stats;
team_lb_stats = per_cpu_ptr(lb_priv->pcpu_stats, i);
u64_stats_init(&team_lb_stats->syncp);
}
INIT_DELAYED_WORK(&lb_priv->ex->stats.refresh_dw, lb_stats_refresh);
err = team_options_register(team, lb_options, ARRAY_SIZE(lb_options));
if (err)
goto err_options_register;
return 0;
err_options_register:
free_percpu(lb_priv->pcpu_stats);
err_alloc_pcpu_stats:
kfree(lb_priv->ex);
return err;
}
static void lb_exit(struct team *team)
{
struct lb_priv *lb_priv = get_lb_priv(team);
team_options_unregister(team, lb_options,
ARRAY_SIZE(lb_options));
team: Free BPF filter when unregistering netdev When team is used in loadbalance mode a BPF filter can be used to provide a hash which will determine the Tx port. When the netdev is later unregistered the filter is not freed which results in memory leaks [1]. Fix by freeing the program and the corresponding filter when unregistering the netdev. [1] unreferenced object 0xffff8881dbc47cc8 (size 16): comm "teamd", pid 3068, jiffies 4294997779 (age 438.247s) hex dump (first 16 bytes): a3 00 6b 6b 6b 6b 6b 6b 88 a5 82 e1 81 88 ff ff ..kkkkkk........ backtrace: [<000000008a3b47e3>] team_nl_cmd_options_set+0x88f/0x11b0 [<00000000c4f4f27e>] genl_family_rcv_msg+0x78f/0x1080 [<00000000610ef838>] genl_rcv_msg+0xca/0x170 [<00000000a281df93>] netlink_rcv_skb+0x132/0x380 [<000000004d9448a2>] genl_rcv+0x29/0x40 [<000000000321b2f4>] netlink_unicast+0x4c0/0x690 [<000000008c25dffb>] netlink_sendmsg+0x929/0xe10 [<00000000068298c5>] sock_sendmsg+0xc8/0x110 [<0000000082a61ff0>] ___sys_sendmsg+0x77a/0x8f0 [<00000000663ae29d>] __sys_sendmsg+0xf7/0x250 [<0000000027c5f11a>] do_syscall_64+0x14d/0x610 [<000000006cfbc8d3>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<00000000e23197e2>] 0xffffffffffffffff unreferenced object 0xffff8881e182a588 (size 2048): comm "teamd", pid 3068, jiffies 4294997780 (age 438.247s) hex dump (first 32 bytes): 20 00 00 00 02 00 00 00 30 00 00 00 28 f0 ff ff .......0...(... 07 00 00 00 00 00 00 00 28 00 00 00 00 00 00 00 ........(....... 
backtrace: [<000000002daf01fb>] lb_bpf_func_set+0x45c/0x6d0 [<000000008a3b47e3>] team_nl_cmd_options_set+0x88f/0x11b0 [<00000000c4f4f27e>] genl_family_rcv_msg+0x78f/0x1080 [<00000000610ef838>] genl_rcv_msg+0xca/0x170 [<00000000a281df93>] netlink_rcv_skb+0x132/0x380 [<000000004d9448a2>] genl_rcv+0x29/0x40 [<000000000321b2f4>] netlink_unicast+0x4c0/0x690 [<000000008c25dffb>] netlink_sendmsg+0x929/0xe10 [<00000000068298c5>] sock_sendmsg+0xc8/0x110 [<0000000082a61ff0>] ___sys_sendmsg+0x77a/0x8f0 [<00000000663ae29d>] __sys_sendmsg+0xf7/0x250 [<0000000027c5f11a>] do_syscall_64+0x14d/0x610 [<000000006cfbc8d3>] entry_SYSCALL_64_after_hwframe+0x49/0xbe [<00000000e23197e2>] 0xffffffffffffffff Fixes: 01d7f30a9f96 ("team: add loadbalance mode") Signed-off-by: Ido Schimmel <idosch@mellanox.com> Reported-by: Amit Cohen <amitc@mellanox.com> Acked-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-03-03 07:35:51 +00:00
lb_bpf_func_free(team);
cancel_delayed_work_sync(&lb_priv->ex->stats.refresh_dw);
free_percpu(lb_priv->pcpu_stats);
kfree(lb_priv->ex);
}
/*
 * lb_port_enter - .port_enter callback of lb_mode_ops.
 * @team: team device (unused here)
 * @port: port whose private area (get_lb_port_priv()) gets its stats
 *
 * Allocates the port's per-CPU struct lb_stats and stores the pointer in
 * the port private data.
 *
 * Returns 0 on success or -ENOMEM when the per-CPU allocation fails.
 */
static int lb_port_enter(struct team *team, struct team_port *port)
{
	struct lb_port_priv *priv = get_lb_port_priv(port);

	priv->pcpu_stats = alloc_percpu(struct lb_stats);
	return priv->pcpu_stats ? 0 : -ENOMEM;
}
static void lb_port_leave(struct team *team, struct team_port *port)
{
struct lb_port_priv *lb_port_priv = get_lb_port_priv(port);
free_percpu(lb_port_priv->pcpu_stats);
}
/*
 * lb_port_disabled - .port_disabled callback of lb_mode_ops.
 * @team: team device the port belongs to
 * @port: port that was disabled
 *
 * Forwards to lb_tx_hash_to_port_mapping_null_port(); judging by its name
 * that helper clears tx-hash mappings pointing at @port -- it is defined
 * elsewhere in this file, confirm there.
 */
static void lb_port_disabled(struct team *team, struct team_port *port)
{
	lb_tx_hash_to_port_mapping_null_port(team, port);
}
/* Callback table of the load-balance mode; referenced from lb_mode.ops. */
static const struct team_mode_ops lb_mode_ops = {
	.init		= lb_init,
	.exit		= lb_exit,
	.port_enter	= lb_port_enter,
	.port_leave	= lb_port_leave,
	.port_disabled	= lb_port_disabled,
	.receive	= lb_receive,
	.transmit	= lb_transmit,
};
/*
 * Descriptor of the "loadbalance" team mode. This is the object that
 * lb_init_module() registers and lb_cleanup_module() unregisters.
 * The private-area sizes are those of the structures returned by
 * get_lb_priv() and get_lb_port_priv().
 */
static const struct team_mode lb_mode = {
	.kind		= "loadbalance",
	.owner		= THIS_MODULE,
	.priv_size	= sizeof(struct lb_priv),
	.port_priv_size	= sizeof(struct lb_port_priv),
	.ops		= &lb_mode_ops,
	.lag_tx_type	= NETDEV_LAG_TX_TYPE_HASH,
};
/*
 * Module entry point: registers lb_mode with the team core and passes
 * the result of team_mode_register() straight back to the caller.
 */
static int __init lb_init_module(void)
{
	return team_mode_register(&lb_mode);
}
/* Module exit point: unregisters lb_mode from the team core. */
static void __exit lb_cleanup_module(void)
{
	team_mode_unregister(&lb_mode);
}
/* Hook the registration/unregistration functions into module load/unload. */
module_init(lb_init_module);
module_exit(lb_cleanup_module);
/* Module metadata. */
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Jiri Pirko <jpirko@redhat.com>");
MODULE_DESCRIPTION("Load-balancing mode for team");
/*
 * Alias carrying the same name as lb_mode.kind; presumably lets the team
 * core load this module by mode name -- macro comes from <linux/if_team.h>,
 * confirm there.
 */
MODULE_ALIAS_TEAM_MODE("loadbalance");