mlx5-updates-2017-01-24

The first seven patches from Or Gerlitz in this series further enhance
 the mlx5 SRIOV switchdev mode to support offloading IPv6 tunnels using the
 TC tunnel key set (encap) and unset (decap) actions.
 
 Or Gerlitz says:
 ========================
 As part of doing this change, a few cleanups are done in the IPv4 code,
 later we move to use the full tunnel key info provided to the driver as
 the key for our internal hashing which is used to identify cases where
 the same tunnel is used for encapsulating multiple flows. As done in the
 IPv4 case, the control path for offloading IPv6 tunnels uses route/neigh
 lookups and construction of the IPv6 tunnel headers on the encap path and
 matching on the outer headers in the decap path.
 
 The last patch of the series enlarges the HW FDB size for the switchdev mode,
 so it now has room for as many offloaded flows as min(max number
 of HW flow counters supported, max HW table size supported).
 ========================
 
 Next to Or's series you can find several patches handling several topics.
 
 From Mohamad, add support for SRIOV VF min rate guarantee by using the
 TSAR BW share weights mechanism.
 
 From Or, two patches to enable Eth VFs to query their min-inline value for
 user-space.
 For that, we move an mlx5 low-level min-inline helper function from the mlx5
 ethernet driver into the core driver and then use it in mlx5_ib to expose
 the inline mode to RDMA applications through libmlx5.
 
 From Kamal Heib, Reduce memory consumption on kdump kernel.
 
 From Shaker Daibes, code reuse in CQE compression control logic
 
 Thanks,
 Saeed.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1
 
 iQEcBAABAgAGBQJYh7FNAAoJEEg/ir3gV/o+TjsIAL1e92+5eutBS9ZvhMARi+Tc
 c2V9V8bG8W1RWWTvx1G0aU4nNjWsr5L8Q8gzqpwhrQITBfgpWd+hlnxQCucyhxC3
 AC1qQ+AKREe/C+25D+WJRq34/61ZHEH2rbKZvpZ1O8SuicVPbcvJ9eM+wOEDxwwX
 u5C5kWQ0HRtCcnFiiOYkB+0CQPH7m3+ZzZek+jDowrexHMSE+yl8ZNtaSTX9c9QN
 bE2cPiCVZd7ufKPIwY8LWHBryyl7sh5P+NqzD633OeiqP/pkZsW9A+czyt+d330f
 6XTKOS1PCD+TfHE0sZJT4VMCjICMHrOFbNRZuwcxJQ6NfmwIJZfskX4NLbyGQTI=
 =vF7U
 -----END PGP SIGNATURE-----

Merge tag 'mlx5-updates-2017-01-24' of git://git.kernel.org/pub/scm/linux/kernel/git/saeed/linux

Saeed Mahameed says:

====================
mlx5-updates-2017-01-24

The first seven patches from Or Gerlitz in this series further enhance
the mlx5 SRIOV switchdev mode to support offloading IPv6 tunnels using the
TC tunnel key set (encap) and unset (decap) actions.

Or Gerlitz says:
========================
As part of doing this change, a few cleanups are done in the IPv4 code,
later we move to use the full tunnel key info provided to the driver as
the key for our internal hashing which is used to identify cases where
the same tunnel is used for encapsulating multiple flows. As done in the
IPv4 case, the control path for offloading IPv6 tunnels uses route/neigh
lookups and construction of the IPv6 tunnel headers on the encap path and
matching on the outer headers in the decap path.

The last patch of the series enlarges the HW FDB size for the switchdev mode,
so it now has room for as many offloaded flows as min(max number
of HW flow counters supported, max HW table size supported).
========================

Next to Or's series you can find several patches handling several topics.

From Mohamad, add support for SRIOV VF min rate guarantee by using the
TSAR BW share weights mechanism.

From Or, two patches to enable Eth VFs to query their min-inline value for
user-space.
For that, we move an mlx5 low-level min-inline helper function from the mlx5
ethernet driver into the core driver and then use it in mlx5_ib to expose
the inline mode to RDMA applications through libmlx5.

From Kamal Heib, Reduce memory consumption on kdump kernel.

From Shaker Daibes, code reuse in CQE compression control logic
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2017-01-25 12:49:58 -05:00
commit 716dcaebed
15 changed files with 390 additions and 121 deletions

View File

@ -53,6 +53,7 @@
#include <linux/in.h>
#include <linux/etherdevice.h>
#include <linux/mlx5/fs.h>
#include <linux/mlx5/vport.h>
#include "mlx5_ib.h"
#define DRIVER_NAME "mlx5_ib"
@ -1202,6 +1203,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
resp.response_length += sizeof(resp.cmds_supp_uhw);
}
if (field_avail(typeof(resp), eth_min_inline, udata->outlen)) {
if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) {
mlx5_query_min_inline(dev->mdev, &resp.eth_min_inline);
resp.eth_min_inline++;
}
resp.response_length += sizeof(resp.eth_min_inline);
}
/*
* We don't want to expose information from the PCI bar that is located
* after 4096 bytes, so if the arch only supports larger pages, let's

View File

@ -101,6 +101,7 @@
#define MLX5E_LOG_INDIR_RQT_SIZE 0x7
#define MLX5E_INDIR_RQT_SIZE BIT(MLX5E_LOG_INDIR_RQT_SIZE)
#define MLX5E_MIN_NUM_CHANNELS 0x1
#define MLX5E_MAX_NUM_CHANNELS (MLX5E_INDIR_RQT_SIZE >> 1)
#define MLX5E_MAX_NUM_SQS (MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC)
#define MLX5E_TX_CQ_POLL_BUDGET 128
@ -786,7 +787,7 @@ void mlx5e_pps_event_handler(struct mlx5e_priv *priv,
struct ptp_clock_event *event);
int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr);
int mlx5e_hwstamp_get(struct net_device *dev, struct ifreq *ifr);
void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val);
void mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val);
int mlx5e_vlan_rx_add_vid(struct net_device *dev, __always_unused __be16 proto,
u16 vid);
@ -847,12 +848,6 @@ static inline u32 mlx5e_get_wqe_mtt_offset(struct mlx5e_rq *rq, u16 wqe_ix)
return wqe_ix * ALIGN(MLX5_MPWRQ_PAGES_PER_WQE, 8);
}
static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
{
return min_t(int, mdev->priv.eq_table.num_comp_vectors,
MLX5E_MAX_NUM_CHANNELS);
}
extern const struct ethtool_ops mlx5e_ethtool_ops;
#ifdef CONFIG_MLX5_CORE_EN_DCB
extern const struct dcbnl_rtnl_ops mlx5e_dcbnl_ops;

View File

@ -106,11 +106,12 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr)
return -ERANGE;
}
mutex_lock(&priv->state_lock);
/* RX HW timestamp */
switch (config.rx_filter) {
case HWTSTAMP_FILTER_NONE:
/* Reset CQE compression to Admin default */
mlx5e_modify_rx_cqe_compression(priv, priv->params.rx_cqe_compress_def);
mlx5e_modify_rx_cqe_compression_locked(priv, priv->params.rx_cqe_compress_def);
break;
case HWTSTAMP_FILTER_ALL:
case HWTSTAMP_FILTER_SOME:
@ -128,14 +129,16 @@ int mlx5e_hwstamp_set(struct net_device *dev, struct ifreq *ifr)
case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
/* Disable CQE compression */
netdev_warn(dev, "Disabling cqe compression");
mlx5e_modify_rx_cqe_compression(priv, false);
mlx5e_modify_rx_cqe_compression_locked(priv, false);
config.rx_filter = HWTSTAMP_FILTER_ALL;
break;
default:
mutex_unlock(&priv->state_lock);
return -ERANGE;
}
memcpy(&priv->tstamp.hwtstamp_config, &config, sizeof(config));
mutex_unlock(&priv->state_lock);
return copy_to_user(ifr->ifr_data, &config,
sizeof(config)) ? -EFAULT : 0;

View File

@ -552,7 +552,7 @@ static void mlx5e_get_channels(struct net_device *dev,
{
struct mlx5e_priv *priv = netdev_priv(dev);
ch->max_combined = mlx5e_get_max_num_channels(priv->mdev);
ch->max_combined = priv->profile->max_nch(priv->mdev);
ch->combined_count = priv->params.num_channels;
}
@ -560,7 +560,7 @@ static int mlx5e_set_channels(struct net_device *dev,
struct ethtool_channels *ch)
{
struct mlx5e_priv *priv = netdev_priv(dev);
int ncv = mlx5e_get_max_num_channels(priv->mdev);
int ncv = priv->profile->max_nch(priv->mdev);
unsigned int count = ch->combined_count;
bool arfs_enabled;
bool was_opened;
@ -1476,8 +1476,6 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev,
{
struct mlx5e_priv *priv = netdev_priv(netdev);
struct mlx5_core_dev *mdev = priv->mdev;
int err = 0;
bool reset;
if (!MLX5_CAP_GEN(mdev, cqe_compression))
return -ENOTSUPP;
@ -1487,17 +1485,10 @@ static int set_pflag_rx_cqe_compress(struct net_device *netdev,
return -EINVAL;
}
reset = test_bit(MLX5E_STATE_OPENED, &priv->state);
if (reset)
mlx5e_close_locked(netdev);
MLX5E_SET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS, enable);
mlx5e_modify_rx_cqe_compression_locked(priv, enable);
priv->params.rx_cqe_compress_def = enable;
if (reset)
err = mlx5e_open_locked(netdev);
return err;
return 0;
}
static int mlx5e_handle_pflag(struct net_device *netdev,

View File

@ -31,6 +31,7 @@
*/
#include <net/tc_act/tc_gact.h>
#include <linux/crash_dump.h>
#include <net/pkt_cls.h>
#include <linux/mlx5/fs.h>
#include <net/vxlan.h>
@ -83,7 +84,9 @@ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, u8 rq_type)
priv->params.rq_wq_type = rq_type;
switch (priv->params.rq_wq_type) {
case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
priv->params.log_rq_size = is_kdump_kernel() ?
MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW :
MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
priv->params.mpwqe_log_stride_sz =
MLX5E_GET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS) ?
MLX5_MPWRQ_LOG_STRIDE_SIZE_CQE_COMPRESS :
@ -92,7 +95,9 @@ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, u8 rq_type)
priv->params.mpwqe_log_stride_sz;
break;
default: /* MLX5_WQ_TYPE_LINKED_LIST */
priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
priv->params.log_rq_size = is_kdump_kernel() ?
MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE :
MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
}
priv->params.min_rx_wqes = mlx5_min_rx_wqes(priv->params.rq_wq_type,
BIT(priv->params.log_rq_size));
@ -1508,6 +1513,14 @@ static int mlx5e_set_tx_maxrate(struct net_device *dev, int index, u32 rate)
return err;
}
/*
 * Upper bound on the number of channels for @mdev.
 *
 * A kdump kernel is limited to MLX5E_MIN_NUM_CHANNELS; otherwise the bound is
 * the number of completion vectors in the EQ table, capped at
 * MLX5E_MAX_NUM_CHANNELS.
 */
static inline int mlx5e_get_max_num_channels(struct mlx5_core_dev *mdev)
{
	if (is_kdump_kernel())
		return MLX5E_MIN_NUM_CHANNELS;

	return min_t(int, mdev->priv.eq_table.num_comp_vectors,
		     MLX5E_MAX_NUM_CHANNELS);
}
static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix,
struct mlx5e_channel_param *cparam,
struct mlx5e_channel **cp)
@ -3021,11 +3034,8 @@ static int mlx5e_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5_core_dev *mdev = priv->mdev;
if (min_tx_rate)
return -EOPNOTSUPP;
return mlx5_eswitch_set_vport_rate(mdev->priv.eswitch, vf + 1,
max_tx_rate);
max_tx_rate, min_tx_rate);
}
static int mlx5_vport_link2ifla(u8 esw_link)
@ -3461,22 +3471,6 @@ void mlx5e_set_rx_cq_mode_params(struct mlx5e_params *params, u8 cq_period_mode)
MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE;
}
/*
 * Report the minimum inline mode for @mdev in *@min_inline_mode, selected by
 * the wqe_inline_mode Ethernet capability.  For a capability value that is
 * not handled below, *@min_inline_mode is left untouched.
 */
static void mlx5e_query_min_inline(struct mlx5_core_dev *mdev,
				   u8 *min_inline_mode)
{
	u32 inline_cap = MLX5_CAP_ETH(mdev, wqe_inline_mode);

	if (inline_cap == MLX5_CAP_INLINE_MODE_L2) {
		*min_inline_mode = MLX5_INLINE_MODE_L2;
	} else if (inline_cap == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) {
		/* The mode is read from the NIC vport context of vport 0. */
		mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode);
	} else if (inline_cap == MLX5_CAP_INLINE_MODE_NOT_REQUIRED) {
		*min_inline_mode = MLX5_INLINE_MODE_NONE;
	}
}
u32 mlx5e_choose_lro_timeout(struct mlx5_core_dev *mdev, u32 wanted_timeout)
{
int i;
@ -3510,7 +3504,9 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
priv->params.lro_timeout =
mlx5e_choose_lro_timeout(mdev, MLX5E_DEFAULT_LRO_TIMEOUT);
priv->params.log_sq_size = MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
priv->params.log_sq_size = is_kdump_kernel() ?
MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
MLX5E_PARAMS_DEFAULT_LOG_SQ_SIZE;
/* set CQE compression */
priv->params.rx_cqe_compress_def = false;
@ -3536,7 +3532,7 @@ static void mlx5e_build_nic_netdev_priv(struct mlx5_core_dev *mdev,
priv->params.tx_cq_moderation.pkts =
MLX5E_PARAMS_DEFAULT_TX_CQ_MODERATION_PKTS;
priv->params.tx_max_inline = mlx5e_get_max_inline_cap(mdev);
mlx5e_query_min_inline(mdev, &priv->params.tx_min_inline_mode);
mlx5_query_min_inline(mdev, &priv->params.tx_min_inline_mode);
priv->params.num_tc = 1;
priv->params.rss_hfunc = ETH_RSS_HASH_XOR;

View File

@ -155,17 +155,15 @@ static inline u32 mlx5e_decompress_cqes_start(struct mlx5e_rq *rq,
return mlx5e_decompress_cqes_cont(rq, cq, 1, budget_rem) - 1;
}
void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val)
void mlx5e_modify_rx_cqe_compression_locked(struct mlx5e_priv *priv, bool val)
{
bool was_opened;
if (!MLX5_CAP_GEN(priv->mdev, cqe_compression))
return;
mutex_lock(&priv->state_lock);
if (MLX5E_GET_PFLAG(priv, MLX5E_PFLAG_RX_CQE_COMPRESS) == val)
goto unlock;
return;
was_opened = test_bit(MLX5E_STATE_OPENED, &priv->state);
if (was_opened)
@ -176,8 +174,6 @@ void mlx5e_modify_rx_cqe_compression(struct mlx5e_priv *priv, bool val)
if (was_opened)
mlx5e_open_locked(priv->netdev);
unlock:
mutex_unlock(&priv->state_lock);
}
#define RQ_PAGE_SIZE(rq) ((1 << rq->buff.page_order) << PAGE_SHIFT)

View File

@ -298,6 +298,32 @@ vxlan_match_offload_err:
MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype);
MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IP);
} else if (enc_control->addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
struct flow_dissector_key_ipv6_addrs *key =
skb_flow_dissector_target(f->dissector,
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
f->key);
struct flow_dissector_key_ipv6_addrs *mask =
skb_flow_dissector_target(f->dissector,
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS,
f->mask);
memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
src_ipv4_src_ipv6.ipv6_layout.ipv6),
&mask->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
src_ipv4_src_ipv6.ipv6_layout.ipv6),
&key->src, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_c,
dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
&mask->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v,
dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
&key->dst, MLX5_FLD_SZ_BYTES(ipv6_layout, ipv6));
MLX5_SET_TO_ONES(fte_match_set_lyr_2_4, headers_c, ethertype);
MLX5_SET(fte_match_set_lyr_2_4, headers_v, ethertype, ETH_P_IPV6);
}
/* Enforce DMAC when offloading incoming tunneled flows.
@ -358,12 +384,10 @@ static int __parse_cls_flower(struct mlx5e_priv *priv,
f->key);
switch (key->addr_type) {
case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
if (parse_tunnel_attr(priv, spec, f))
return -EOPNOTSUPP;
break;
case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
netdev_warn(priv->netdev,
"IPv6 tunnel decap offload isn't supported\n");
default:
return -EOPNOTSUPP;
}
@ -644,15 +668,15 @@ static int parse_tc_nic_actions(struct mlx5e_priv *priv, struct tcf_exts *exts,
return 0;
}
static inline int cmp_encap_info(struct mlx5_encap_info *a,
struct mlx5_encap_info *b)
static inline int cmp_encap_info(struct ip_tunnel_key *a,
struct ip_tunnel_key *b)
{
return memcmp(a, b, sizeof(*a));
}
static inline int hash_encap_info(struct mlx5_encap_info *info)
static inline int hash_encap_info(struct ip_tunnel_key *key)
{
return jhash(info, sizeof(*info), 0);
return jhash(key, sizeof(*key), 0);
}
static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
@ -660,12 +684,10 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
struct net_device **out_dev,
struct flowi4 *fl4,
struct neighbour **out_n,
__be32 *saddr,
int *out_ttl)
{
struct rtable *rt;
struct neighbour *n = NULL;
int ttl;
#if IS_ENABLED(CONFIG_INET)
int ret;
@ -684,20 +706,59 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv,
return -EOPNOTSUPP;
}
ttl = ip4_dst_hoplimit(&rt->dst);
*out_ttl = ip4_dst_hoplimit(&rt->dst);
n = dst_neigh_lookup(&rt->dst, &fl4->daddr);
ip_rt_put(rt);
if (!n)
return -ENOMEM;
*out_n = n;
*saddr = fl4->saddr;
*out_ttl = ttl;
*out_dev = rt->dst.dev;
return 0;
}
/*
 * Resolve the IPv6 route and neighbour for the tunnel destination in @fl6.
 *
 * The route is looked up in the network namespace of @mirred_dev.  On success
 * returns 0 and sets *@out_ttl to the route's hop limit, *@out_dev to the
 * egress device (or the e-switch uplink netdev, see below) and *@out_n to the
 * neighbour entry for fl6->daddr.
 *
 * Returns the route's error code if the lookup failed, -ENOMEM if no
 * neighbour entry was obtained, or -EOPNOTSUPP when built without
 * CONFIG_INET and CONFIG_IPV6.
 *
 * NOTE(review): *@out_n presumably carries a reference that the caller must
 * drop with neigh_release() -- confirm against dst_neigh_lookup().
 */
static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv,
struct net_device *mirred_dev,
struct net_device **out_dev,
struct flowi6 *fl6,
struct neighbour **out_n,
int *out_ttl)
{
struct neighbour *n = NULL;
struct dst_entry *dst;
#if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6)
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
int ret;
dst = ip6_route_output(dev_net(mirred_dev), NULL, fl6);
/* a failed lookup still returns a dst entry; release it before bailing out */
if (dst->error) {
ret = dst->error;
dst_release(dst);
return ret;
}
*out_ttl = ip6_dst_hoplimit(dst);
/* if the egress device isn't on the same HW e-switch, we use the uplink */
if (!switchdev_port_same_parent_id(priv->netdev, dst->dev))
*out_dev = mlx5_eswitch_get_uplink_netdev(esw);
else
*out_dev = dst->dev;
#else
/* without IPv6 support nothing below this return is reachable */
return -EOPNOTSUPP;
#endif
n = dst_neigh_lookup(dst, &fl6->daddr);
/* the route is no longer needed once the neighbour lookup has been done */
dst_release(dst);
if (!n)
return -ENOMEM;
*out_n = n;
return 0;
}
static int gen_vxlan_header_ipv4(struct net_device *out_dev,
char buf[],
unsigned char h_dest[ETH_ALEN],
@ -734,19 +795,52 @@ static int gen_vxlan_header_ipv4(struct net_device *out_dev,
return encap_size;
}
/*
 * Build an Ethernet + IPv6 + UDP + VXLAN encapsulation header in @buf.
 *
 * @out_dev:      egress device; its dev_addr becomes the source MAC
 * @buf:          output buffer, written for the returned number of bytes
 * @h_dest:       destination MAC address
 * @ttl:          value stored in the IPv6 hop limit
 * @daddr:        outer IPv6 destination address
 * @saddr:        outer IPv6 source address
 * @udp_dst_port: outer UDP destination port
 * @vx_vni:       VNI, stored after conversion by vxlan_vni_field()
 *
 * The header area is zeroed first, so every field not assigned below stays
 * zero.  Returns the size of the generated header in bytes.
 */
static int gen_vxlan_header_ipv6(struct net_device *out_dev,
				 char buf[],
				 unsigned char h_dest[ETH_ALEN],
				 int ttl,
				 struct in6_addr *daddr,
				 struct in6_addr *saddr,
				 __be16 udp_dst_port,
				 __be32 vx_vni)
{
	/* The four headers are laid out back to back, starting at @buf. */
	struct ethhdr *eh = (struct ethhdr *)buf;
	struct ipv6hdr *ip6 = (struct ipv6hdr *)(eh + 1);
	struct udphdr *uh = (struct udphdr *)(ip6 + 1);
	struct vxlanhdr *vxlan = (struct vxlanhdr *)(uh + 1);
	int hdr_len = ETH_HLEN + sizeof(*ip6) + VXLAN_HLEN;

	memset(buf, 0, hdr_len);

	/* Ethernet */
	ether_addr_copy(eh->h_dest, h_dest);
	ether_addr_copy(eh->h_source, out_dev->dev_addr);
	eh->h_proto = htons(ETH_P_IPV6);

	/* IPv6; the HW fills up ipv6 payload len */
	ip6_flow_hdr(ip6, 0, 0);
	ip6->nexthdr = IPPROTO_UDP;
	ip6->hop_limit = ttl;
	ip6->daddr = *daddr;
	ip6->saddr = *saddr;

	/* UDP */
	uh->dest = udp_dst_port;

	/* VXLAN */
	vxlan->vx_flags = VXLAN_HF_VNI;
	vxlan->vx_vni = vxlan_vni_field(vx_vni);

	return hdr_len;
}
static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
struct net_device *mirred_dev,
struct mlx5_encap_entry *e,
struct net_device **out_dev)
{
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
struct ip_tunnel_key *tun_key = &e->tun_info.key;
int encap_size, ttl, err;
struct neighbour *n = NULL;
struct flowi4 fl4 = {};
char *encap_header;
int encap_size;
__be32 saddr;
int ttl;
int err;
encap_header = kzalloc(max_encap_size, GFP_KERNEL);
if (!encap_header)
@ -755,37 +849,108 @@ static int mlx5e_create_encap_header_ipv4(struct mlx5e_priv *priv,
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
fl4.flowi4_proto = IPPROTO_UDP;
fl4.fl4_dport = e->tun_info.tp_dst;
fl4.fl4_dport = tun_key->tp_dst;
break;
default:
err = -EOPNOTSUPP;
goto out;
}
fl4.daddr = e->tun_info.daddr;
fl4.flowi4_tos = tun_key->tos;
fl4.daddr = tun_key->u.ipv4.dst;
fl4.saddr = tun_key->u.ipv4.src;
err = mlx5e_route_lookup_ipv4(priv, mirred_dev, out_dev,
&fl4, &n, &saddr, &ttl);
&fl4, &n, &ttl);
if (err)
goto out;
e->n = n;
e->out_dev = *out_dev;
if (!(n->nud_state & NUD_VALID)) {
pr_warn("%s: can't offload, neighbour to %pI4 invalid\n", __func__, &fl4.daddr);
err = -EOPNOTSUPP;
goto out;
}
e->n = n;
e->out_dev = *out_dev;
neigh_ha_snapshot(e->h_dest, n, *out_dev);
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
encap_size = gen_vxlan_header_ipv4(*out_dev, encap_header,
e->h_dest, ttl,
e->tun_info.daddr,
saddr, e->tun_info.tp_dst,
e->tun_info.tun_id);
fl4.daddr,
fl4.saddr, tun_key->tp_dst,
tunnel_id_to_key32(tun_key->tun_id));
break;
default:
err = -EOPNOTSUPP;
goto out;
}
err = mlx5_encap_alloc(priv->mdev, e->tunnel_type,
encap_size, encap_header, &e->encap_id);
out:
if (err && n)
neigh_release(n);
kfree(encap_header);
return err;
}
static int mlx5e_create_encap_header_ipv6(struct mlx5e_priv *priv,
struct net_device *mirred_dev,
struct mlx5_encap_entry *e,
struct net_device **out_dev)
{
int max_encap_size = MLX5_CAP_ESW(priv->mdev, max_encap_header_size);
struct ip_tunnel_key *tun_key = &e->tun_info.key;
int encap_size, err, ttl = 0;
struct neighbour *n = NULL;
struct flowi6 fl6 = {};
char *encap_header;
encap_header = kzalloc(max_encap_size, GFP_KERNEL);
if (!encap_header)
return -ENOMEM;
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
fl6.flowi6_proto = IPPROTO_UDP;
fl6.fl6_dport = tun_key->tp_dst;
break;
default:
err = -EOPNOTSUPP;
goto out;
}
fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tun_key->tos), tun_key->label);
fl6.daddr = tun_key->u.ipv6.dst;
fl6.saddr = tun_key->u.ipv6.src;
err = mlx5e_route_lookup_ipv6(priv, mirred_dev, out_dev,
&fl6, &n, &ttl);
if (err)
goto out;
if (!(n->nud_state & NUD_VALID)) {
pr_warn("%s: can't offload, neighbour to %pI6 invalid\n", __func__, &fl6.daddr);
err = -EOPNOTSUPP;
goto out;
}
e->n = n;
e->out_dev = *out_dev;
neigh_ha_snapshot(e->h_dest, n, *out_dev);
switch (e->tunnel_type) {
case MLX5_HEADER_TYPE_VXLAN:
encap_size = gen_vxlan_header_ipv6(*out_dev, encap_header,
e->h_dest, ttl,
&fl6.daddr,
&fl6.saddr, tun_key->tp_dst,
tunnel_id_to_key32(tun_key->tun_id));
break;
default:
err = -EOPNOTSUPP;
@ -809,13 +974,11 @@ static int mlx5e_attach_encap(struct mlx5e_priv *priv,
struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
unsigned short family = ip_tunnel_info_af(tun_info);
struct ip_tunnel_key *key = &tun_info->key;
struct mlx5_encap_info info;
struct mlx5_encap_entry *e;
struct net_device *out_dev;
int tunnel_type, err = -EOPNOTSUPP;
uintptr_t hash_key;
bool found = false;
int tunnel_type;
int err;
/* udp dst port must be set */
if (!memchr_inv(&key->tp_dst, 0, sizeof(key->tp_dst)))
@ -831,8 +994,6 @@ vxlan_encap_offload_err:
if (mlx5e_vxlan_lookup_port(priv, be16_to_cpu(key->tp_dst)) &&
MLX5_CAP_ESW(priv->mdev, vxlan_encap_decap)) {
info.tp_dst = key->tp_dst;
info.tun_id = tunnel_id_to_key32(key->tun_id);
tunnel_type = MLX5_HEADER_TYPE_VXLAN;
} else {
netdev_warn(priv->netdev,
@ -840,22 +1001,11 @@ vxlan_encap_offload_err:
return -EOPNOTSUPP;
}
switch (family) {
case AF_INET:
info.daddr = key->u.ipv4.dst;
break;
case AF_INET6:
netdev_warn(priv->netdev,
"IPv6 tunnel encap offload isn't supported\n");
default:
return -EOPNOTSUPP;
}
hash_key = hash_encap_info(&info);
hash_key = hash_encap_info(key);
hash_for_each_possible_rcu(esw->offloads.encap_tbl, e,
encap_hlist, hash_key) {
if (!cmp_encap_info(&e->tun_info, &info)) {
if (!cmp_encap_info(&e->tun_info.key, key)) {
found = true;
break;
}
@ -870,11 +1020,15 @@ vxlan_encap_offload_err:
if (!e)
return -ENOMEM;
e->tun_info = info;
e->tun_info = *tun_info;
e->tunnel_type = tunnel_type;
INIT_LIST_HEAD(&e->flows);
err = mlx5e_create_encap_header_ipv4(priv, mirred_dev, e, &out_dev);
if (family == AF_INET)
err = mlx5e_create_encap_header_ipv4(priv, mirred_dev, e, &out_dev);
else if (family == AF_INET6)
err = mlx5e_create_encap_header_ipv6(priv, mirred_dev, e, &out_dev);
if (err)
goto out_err;

View File

@ -1415,7 +1415,7 @@ static void esw_destroy_tsar(struct mlx5_eswitch *esw)
}
static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num,
u32 initial_max_rate)
u32 initial_max_rate, u32 initial_bw_share)
{
u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
struct mlx5_vport *vport = &esw->vports[vport_num];
@ -1439,6 +1439,7 @@ static int esw_vport_enable_qos(struct mlx5_eswitch *esw, int vport_num,
esw->qos.root_tsar_id);
MLX5_SET(scheduling_context, &sched_ctx, max_average_bw,
initial_max_rate);
MLX5_SET(scheduling_context, &sched_ctx, bw_share, initial_bw_share);
err = mlx5_create_scheduling_element_cmd(dev,
SCHEDULING_HIERARCHY_E_SWITCH,
@ -1473,7 +1474,7 @@ static void esw_vport_disable_qos(struct mlx5_eswitch *esw, int vport_num)
}
static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num,
u32 max_rate)
u32 max_rate, u32 bw_share)
{
u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {0};
struct mlx5_vport *vport = &esw->vports[vport_num];
@ -1497,7 +1498,9 @@ static int esw_vport_qos_config(struct mlx5_eswitch *esw, int vport_num,
esw->qos.root_tsar_id);
MLX5_SET(scheduling_context, &sched_ctx, max_average_bw,
max_rate);
MLX5_SET(scheduling_context, &sched_ctx, bw_share, bw_share);
bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
err = mlx5_modify_scheduling_element_cmd(dev,
SCHEDULING_HIERARCHY_E_SWITCH,
@ -1563,7 +1566,8 @@ static void esw_enable_vport(struct mlx5_eswitch *esw, int vport_num,
esw_apply_vport_conf(esw, vport);
/* Attach vport to the eswitch rate limiter */
if (esw_vport_enable_qos(esw, vport_num, vport->info.max_rate))
if (esw_vport_enable_qos(esw, vport_num, vport->info.max_rate,
vport->qos.bw_share))
esw_warn(esw->dev, "Failed to attach vport %d to eswitch rate limiter", vport_num);
/* Sync with current vport context */
@ -1952,6 +1956,7 @@ int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
ivi->qos = evport->info.qos;
ivi->spoofchk = evport->info.spoofchk;
ivi->trusted = evport->info.trusted;
ivi->min_tx_rate = evport->info.min_rate;
ivi->max_tx_rate = evport->info.max_rate;
mutex_unlock(&esw->state_lock);
@ -2046,23 +2051,103 @@ int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw,
return 0;
}
int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw,
int vport, u32 max_rate)
/*
 * Compute the divider used to scale vport min rates into TSAR bw_share values.
 *
 * Finds the largest min_rate configured on any enabled vport and divides it
 * by the max_tsar_bw_share QoS capability.  The result is never less than 1.
 *
 * NOTE(review): this divides by the capability value;
 * mlx5_eswitch_set_vport_rate() rejects a non-zero min_rate when the
 * capability is below MLX5_MIN_BW_SHARE.
 */
static u32 calculate_vports_min_rate_divider(struct mlx5_eswitch *esw)
{
	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
	struct mlx5_vport *evport;
	u32 max_guarantee = 0;
	int i;

	/*
	 * Valid vport indices are 0..total_vports-1, so the bound is
	 * exclusive; '<=' would read one element past the vports array.
	 */
	for (i = 0; i < esw->total_vports; i++) {
		evport = &esw->vports[i];
		if (!evport->enabled || evport->info.min_rate < max_guarantee)
			continue;
		max_guarantee = evport->info.min_rate;
	}

	return max_t(u32, max_guarantee / fw_max_bw_share, 1);
}
/*
 * Recompute and apply the bw_share of every enabled vport using @divider.
 *
 * A vport without a min rate gets MLX5_MIN_BW_SHARE; otherwise its min rate
 * is converted with MLX5_RATE_TO_BW_SHARE(), bounded by the max_tsar_bw_share
 * QoS capability.  Vports whose share does not change are skipped.
 *
 * Returns 0 on success or the first error from esw_vport_qos_config().  On
 * error, vports that were already reconfigured keep their new share.
 */
static int normalize_vports_min_rate(struct mlx5_eswitch *esw, u32 divider)
{
	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
	struct mlx5_vport *evport;
	u32 vport_max_rate;
	u32 vport_min_rate;
	u32 bw_share;
	int err;
	int i;

	/*
	 * Valid vport indices are 0..total_vports-1, so the bound is
	 * exclusive; '<=' would access one element past the vports array.
	 */
	for (i = 0; i < esw->total_vports; i++) {
		evport = &esw->vports[i];
		if (!evport->enabled)
			continue;

		vport_min_rate = evport->info.min_rate;
		vport_max_rate = evport->info.max_rate;
		bw_share = MLX5_MIN_BW_SHARE;

		if (vport_min_rate)
			bw_share = MLX5_RATE_TO_BW_SHARE(vport_min_rate,
							 divider,
							 fw_max_bw_share);

		if (bw_share == evport->qos.bw_share)
			continue;

		err = esw_vport_qos_config(esw, i, vport_max_rate, bw_share);
		if (err)
			return err;

		/* Cache the share only once it has been applied. */
		evport->qos.bw_share = bw_share;
	}

	return 0;
}
int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport,
u32 max_rate, u32 min_rate)
{
u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
bool min_rate_supported = MLX5_CAP_QOS(esw->dev, esw_bw_share) &&
fw_max_bw_share >= MLX5_MIN_BW_SHARE;
bool max_rate_supported = MLX5_CAP_QOS(esw->dev, esw_rate_limit);
struct mlx5_vport *evport;
u32 previous_min_rate;
u32 divider;
int err = 0;
if (!ESW_ALLOWED(esw))
return -EPERM;
if (!LEGAL_VPORT(esw, vport))
return -EINVAL;
if ((min_rate && !min_rate_supported) || (max_rate && !max_rate_supported))
return -EOPNOTSUPP;
mutex_lock(&esw->state_lock);
evport = &esw->vports[vport];
err = esw_vport_qos_config(esw, vport, max_rate);
if (min_rate == evport->info.min_rate)
goto set_max_rate;
previous_min_rate = evport->info.min_rate;
evport->info.min_rate = min_rate;
divider = calculate_vports_min_rate_divider(esw);
err = normalize_vports_min_rate(esw, divider);
if (err) {
evport->info.min_rate = previous_min_rate;
goto unlock;
}
set_max_rate:
if (max_rate == evport->info.max_rate)
goto unlock;
err = esw_vport_qos_config(esw, vport, max_rate, evport->qos.bw_share);
if (!err)
evport->info.max_rate = max_rate;
unlock:
mutex_unlock(&esw->state_lock);
return err;
}

View File

@ -36,6 +36,7 @@
#include <linux/if_ether.h>
#include <linux/if_link.h>
#include <net/devlink.h>
#include <net/ip_tunnels.h>
#include <linux/mlx5/device.h>
#define MLX5_MAX_UC_PER_VPORT(dev) \
@ -49,6 +50,11 @@
#define FDB_UPLINK_VPORT 0xffff
/* Smallest bw_share value; also used for a vport that has no min rate. */
#define MLX5_MIN_BW_SHARE 1

/*
 * Convert a min rate into a bw_share value: (rate / divider), clamped to the
 * range [MLX5_MIN_BW_SHARE, limit].  All arithmetic is done as u32.
 */
#define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \
	min_t(u32, max_t(u32, (rate) / (divider), MLX5_MIN_BW_SHARE), (limit))
/* L2 -mac address based- hash helpers */
struct l2addr_node {
struct hlist_node hlist;
@ -115,6 +121,7 @@ struct mlx5_vport_info {
u8 qos;
u64 node_guid;
int link_state;
u32 min_rate;
u32 max_rate;
bool spoofchk;
bool trusted;
@ -137,6 +144,7 @@ struct mlx5_vport {
struct {
bool enabled;
u32 esw_tsar_ix;
u32 bw_share;
} qos;
bool enabled;
@ -248,8 +256,8 @@ int mlx5_eswitch_set_vport_spoofchk(struct mlx5_eswitch *esw,
int vport, bool spoofchk);
int mlx5_eswitch_set_vport_trust(struct mlx5_eswitch *esw,
int vport_num, bool setting);
int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw,
int vport, u32 max_rate);
int mlx5_eswitch_set_vport_rate(struct mlx5_eswitch *esw, int vport,
u32 max_rate, u32 min_rate);
int mlx5_eswitch_get_vport_config(struct mlx5_eswitch *esw,
int vport, struct ifla_vf_info *ivi);
int mlx5_eswitch_get_vport_stats(struct mlx5_eswitch *esw,
@ -274,18 +282,12 @@ enum {
#define MLX5_FLOW_CONTEXT_ACTION_VLAN_POP 0x40
#define MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH 0x80
/*
 * Key identifying an offloaded encap tunnel.  It is hashed and compared as
 * raw bytes (see hash_encap_info() and cmp_encap_info()).
 */
struct mlx5_encap_info {
__be32 daddr; /* outer IPv4 destination address */
__be32 tun_id; /* tunnel id, converted with tunnel_id_to_key32() */
__be16 tp_dst; /* outer UDP destination port */
};
struct mlx5_encap_entry {
struct hlist_node encap_hlist;
struct list_head flows;
u32 encap_id;
struct neighbour *n;
struct mlx5_encap_info tun_info;
struct ip_tunnel_info tun_info;
unsigned char h_dest[ETH_ALEN]; /* destination eth addr */
struct net_device *out_dev;

View File

@ -402,19 +402,18 @@ out:
}
#define MAX_PF_SQ 256
#define ESW_OFFLOADS_NUM_ENTRIES (1 << 13) /* 8K */
#define ESW_OFFLOADS_NUM_GROUPS 4
static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
{
int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in);
int table_size, ix, esw_size, err = 0;
struct mlx5_core_dev *dev = esw->dev;
struct mlx5_flow_namespace *root_ns;
struct mlx5_flow_table *fdb = NULL;
struct mlx5_flow_group *g;
u32 *flow_group_in;
void *match_criteria;
int table_size, ix, err = 0;
u32 flags = 0;
flow_group_in = mlx5_vzalloc(inlen);
@ -427,15 +426,19 @@ static int esw_create_offloads_fdb_table(struct mlx5_eswitch *esw, int nvports)
goto ns_err;
}
esw_debug(dev, "Create offloads FDB table, log_max_size(%d)\n",
MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
esw_debug(dev, "Create offloads FDB table, min (max esw size(2^%d), max counters(%d)*groups(%d))\n",
MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size),
MLX5_CAP_GEN(dev, max_flow_counter), ESW_OFFLOADS_NUM_GROUPS);
esw_size = min_t(int, MLX5_CAP_GEN(dev, max_flow_counter) * ESW_OFFLOADS_NUM_GROUPS,
1 << MLX5_CAP_ESW_FLOWTABLE_FDB(dev, log_max_ft_size));
if (MLX5_CAP_ESW_FLOWTABLE_FDB(dev, encap) &&
MLX5_CAP_ESW_FLOWTABLE_FDB(dev, decap))
flags |= MLX5_FLOW_TABLE_TUNNEL_EN;
fdb = mlx5_create_auto_grouped_flow_table(root_ns, FDB_FAST_PATH,
ESW_OFFLOADS_NUM_ENTRIES,
esw_size,
ESW_OFFLOADS_NUM_GROUPS, 0,
flags);
if (IS_ERR(fdb)) {

View File

@ -473,10 +473,13 @@ int mlx5_encap_alloc(struct mlx5_core_dev *dev,
int err;
u32 *in;
if (size > MLX5_CAP_ESW(dev, max_encap_header_size))
if (size > max_encap_size) {
mlx5_core_warn(dev, "encap size %zd too big, max supported is %d\n",
size, max_encap_size);
return -EINVAL;
}
in = kzalloc(MLX5_ST_SZ_BYTES(alloc_encap_header_in) + max_encap_size,
in = kzalloc(MLX5_ST_SZ_BYTES(alloc_encap_header_in) + size,
GFP_KERNEL);
if (!in)
return -ENOMEM;

View File

@ -127,6 +127,23 @@ int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
}
EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_min_inline);
/*
 * Report the minimum inline mode for @mdev in *@min_inline_mode, selected by
 * the wqe_inline_mode Ethernet capability.  For a capability value that is
 * not handled below, *@min_inline_mode is left untouched.
 */
void mlx5_query_min_inline(struct mlx5_core_dev *mdev,
			   u8 *min_inline_mode)
{
	u32 inline_cap = MLX5_CAP_ETH(mdev, wqe_inline_mode);

	if (inline_cap == MLX5_CAP_INLINE_MODE_L2) {
		*min_inline_mode = MLX5_INLINE_MODE_L2;
	} else if (inline_cap == MLX5_CAP_INLINE_MODE_VPORT_CONTEXT) {
		/* The mode is read from the NIC vport context of vport 0. */
		mlx5_query_nic_vport_min_inline(mdev, 0, min_inline_mode);
	} else if (inline_cap == MLX5_CAP_INLINE_MODE_NOT_REQUIRED) {
		*min_inline_mode = MLX5_INLINE_MODE_NONE;
	}
}
EXPORT_SYMBOL_GPL(mlx5_query_min_inline);
int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
u16 vport, u8 min_inline)
{

View File

@ -547,7 +547,9 @@ struct mlx5_ifc_e_switch_cap_bits {
struct mlx5_ifc_qos_cap_bits {
u8 packet_pacing[0x1];
u8 esw_scheduling[0x1];
u8 reserved_at_2[0x1e];
u8 esw_bw_share[0x1];
u8 esw_rate_limit[0x1];
u8 reserved_at_4[0x1c];
u8 reserved_at_20[0x20];

View File

@ -51,6 +51,7 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
u16 vport, u8 *addr);
int mlx5_query_nic_vport_min_inline(struct mlx5_core_dev *mdev,
u16 vport, u8 *min_inline);
void mlx5_query_min_inline(struct mlx5_core_dev *mdev, u8 *min_inline);
int mlx5_modify_nic_vport_min_inline(struct mlx5_core_dev *mdev,
u16 vport, u8 min_inline);
int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev,

View File

@ -90,6 +90,17 @@ enum mlx5_user_cmds_supp_uhw {
MLX5_USER_CMDS_SUPP_UHW_CREATE_AH = 1 << 1,
};
/*
 * Values carried in the eth_min_inline response field.
 *
 * Each value is the FW returned inline mode plus one, so that zero
 * (MLX5_USER_INLINE_MODE_NA) stays distinct from every real mode.  This lets
 * user-space deal with older kernels.
 */
enum mlx5_user_inline_mode {
	MLX5_USER_INLINE_MODE_NA	= 0,
	MLX5_USER_INLINE_MODE_NONE	= 1,
	MLX5_USER_INLINE_MODE_L2	= 2,
	MLX5_USER_INLINE_MODE_IP	= 3,
	MLX5_USER_INLINE_MODE_TCP_UDP	= 4,
};
struct mlx5_ib_alloc_ucontext_resp {
__u32 qp_tab_size;
__u32 bf_reg_size;
@ -106,7 +117,8 @@ struct mlx5_ib_alloc_ucontext_resp {
__u32 response_length;
__u8 cqe_version;
__u8 cmds_supp_uhw;
__u16 reserved2;
__u8 eth_min_inline;
__u8 reserved2;
__u64 hca_core_clock_offset;
__u32 log_uar_size;
__u32 num_uars_per_page;