udp: no longer use SLAB_DESTROY_BY_RCU

Tom Herbert would like not touching UDP socket refcnt for encapsulated
traffic. For this to happen, we need to use normal RCU rules, with a grace
period before freeing a socket. UDP sockets are not short lived in the
high usage case, so the added cost of call_rcu() should not be a concern.

This actually removes a lot of complexity in UDP stack.

Multicast receives no longer need to hold a bucket spinlock.

Note that ip early demux still needs to take a reference on the socket.

Same remark for functions used by xt_socket and xt_PROXY netfilter modules,
but this might be changed later.

Performance for a single UDP socket receiving flood traffic from
many RX queues/cpus.

Simple udp_rx using simple recvfrom() loop :
438 kpps instead of 374 kpps : 17 % increase of the peak rate.

v2: Addressed Willem de Bruijn feedback in multicast handling
 - keep early demux break in __udp4_lib_demux_lookup()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
Cc: Willem de Bruijn <willemb@google.com>
Tested-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet
2016-04-01 08:52:13 -07:00
committed by David S. Miller
parent a4298e4522
commit ca065d0cf8
6 changed files with 172 additions and 359 deletions

View File

@@ -213,37 +213,28 @@ static struct sock *udp6_lib_lookup2(struct net *net,
struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
int score, badness, matches = 0, reuseport = 0;
bool select_ok = true;
u32 hash = 0;
begin:
result = NULL;
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
score = compute_score2(sk, net, saddr, sport,
daddr, hnum, dif);
if (score > badness) {
result = sk;
badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
if (select_ok) {
struct sock *sk2;
sk2 = reuseport_select_sock(sk, hash, skb,
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (sk2) {
result = sk2;
select_ok = false;
goto found;
}
}
if (result)
return result;
matches = 1;
}
result = sk;
badness = score;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -251,27 +242,10 @@ begin:
hash = next_pseudo_random32(hash);
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != slot2)
goto begin;
if (result) {
found:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
daddr, hnum, dif) < badness)) {
sock_put(result);
goto begin;
}
}
return result;
}
/* rcu_read_lock() must be held */
struct sock *__udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
@@ -279,15 +253,12 @@ struct sock *__udp6_lib_lookup(struct net *net,
struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
unsigned short hnum = ntohs(dport);
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
bool select_ok = true;
u32 hash = 0;
rcu_read_lock();
if (hslot->count > 10) {
hash2 = udp6_portaddr_hash(net, daddr, hnum);
slot2 = hash2 & udptable->mask;
@@ -309,34 +280,26 @@ struct sock *__udp6_lib_lookup(struct net *net,
&in6addr_any, hnum, dif,
hslot2, slot2, skb);
}
rcu_read_unlock();
return result;
}
begin:
result = NULL;
badness = -1;
sk_nulls_for_each_rcu(sk, node, &hslot->head) {
sk_for_each_rcu(sk, &hslot->head) {
score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
if (score > badness) {
result = sk;
badness = score;
reuseport = sk->sk_reuseport;
if (reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
if (select_ok) {
struct sock *sk2;
sk2 = reuseport_select_sock(sk, hash, skb,
result = reuseport_select_sock(sk, hash, skb,
sizeof(struct udphdr));
if (sk2) {
result = sk2;
select_ok = false;
goto found;
}
}
if (result)
return result;
matches = 1;
}
result = sk;
badness = score;
} else if (score == badness && reuseport) {
matches++;
if (reciprocal_scale(hash, matches) == 0)
@@ -344,25 +307,6 @@ begin:
hash = next_pseudo_random32(hash);
}
}
/*
* if the nulls value we got at the end of this lookup is
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
if (get_nulls_value(node) != slot)
goto begin;
if (result) {
found:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, saddr, sport,
daddr, dport, dif) < badness)) {
sock_put(result);
goto begin;
}
}
rcu_read_unlock();
return result;
}
EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
@@ -382,12 +326,24 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
udptable, skb);
}
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport, int dif)
{
return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL);
struct sock *sk;
sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
dif, &udp_table, NULL);
if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
#endif
/*
* This should be easy, if there is something there we
@@ -585,7 +541,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
sk->sk_err = err;
sk->sk_error_report(sk);
out:
sock_put(sk);
return;
}
static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -747,33 +703,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
return true;
}
static void flush_stack(struct sock **stack, unsigned int count,
struct sk_buff *skb, unsigned int final)
{
struct sk_buff *skb1 = NULL;
struct sock *sk;
unsigned int i;
for (i = 0; i < count; i++) {
sk = stack[i];
if (likely(!skb1))
skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
if (!skb1) {
atomic_inc(&sk->sk_drops);
UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
IS_UDPLITE(sk));
}
if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
skb1 = NULL;
sock_put(sk);
}
if (unlikely(skb1))
kfree_skb(skb1);
}
static void udp6_csum_zero_error(struct sk_buff *skb)
{
/* RFC 2460 section 8.1 says that we SHOULD log
@@ -792,15 +721,15 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
const struct in6_addr *saddr, const struct in6_addr *daddr,
struct udp_table *udptable, int proto)
{
struct sock *sk, *stack[256 / sizeof(struct sock *)];
struct sock *sk, *first = NULL;
const struct udphdr *uh = udp_hdr(skb);
struct hlist_nulls_node *node;
unsigned short hnum = ntohs(uh->dest);
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
int dif = inet6_iif(skb);
unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
bool inner_flushed = false;
int dif = inet6_iif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
if (use_hash2) {
hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
@@ -811,27 +740,32 @@ start_lookup:
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
spin_lock(&hslot->lock);
sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
if (__udp_v6_is_mcast_sock(net, sk,
uh->dest, daddr,
uh->source, saddr,
dif, hnum) &&
/* If zero checksum and no_check is not on for
* the socket then skip it.
*/
(uh->check || udp_sk(sk)->no_check6_rx)) {
if (unlikely(count == ARRAY_SIZE(stack))) {
flush_stack(stack, count, skb, ~0);
inner_flushed = true;
count = 0;
}
stack[count++] = sk;
sock_hold(sk);
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
uh->source, saddr, dif, hnum))
continue;
/* If zero checksum and no_check is not on for
* the socket then skip it.
*/
if (!uh->check && !udp_sk(sk)->no_check6_rx)
continue;
if (!first) {
first = sk;
continue;
}
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
atomic_inc(&sk->sk_drops);
UDP6_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS,
IS_UDPLITE(sk));
continue;
}
}
spin_unlock(&hslot->lock);
if (udpv6_queue_rcv_skb(sk, nskb) > 0)
consume_skb(nskb);
}
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
if (use_hash2 && hash2 != hash2_any) {
@@ -839,13 +773,13 @@ start_lookup:
goto start_lookup;
}
if (count) {
flush_stack(stack, count, skb, count - 1);
if (first) {
if (udpv6_queue_rcv_skb(first, skb) > 0)
consume_skb(skb);
} else {
if (!inner_flushed)
UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
proto == IPPROTO_UDPLITE);
consume_skb(skb);
kfree_skb(skb);
UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
proto == IPPROTO_UDPLITE);
}
return 0;
}
@@ -853,10 +787,10 @@ start_lookup:
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
struct net *net = dev_net(skb->dev);
struct sock *sk;
struct udphdr *uh;
const struct in6_addr *saddr, *daddr;
struct net *net = dev_net(skb->dev);
struct udphdr *uh;
struct sock *sk;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -910,7 +844,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int ret;
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
sock_put(sk);
udp6_csum_zero_error(skb);
goto csum_error;
}
@@ -920,7 +853,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
ip6_compute_pseudo);
ret = udpv6_queue_rcv_skb(sk, skb);
sock_put(sk);
/* a return value > 0 means to resubmit the input */
if (ret > 0)