udp: no longer use SLAB_DESTROY_BY_RCU
Tom Herbert would like to avoid touching the UDP socket refcnt for encapsulated traffic. For this to happen, we need to use normal RCU rules, with a grace period before freeing a socket. UDP sockets are not short-lived in the high-usage case, so the added cost of call_rcu() should not be a concern.

This actually removes a lot of complexity in the UDP stack. Multicast receives no longer need to hold a bucket spinlock.

Note that ip early demux still needs to take a reference on the socket. The same remark applies to the functions used by the xt_socket and xt_TPROXY netfilter modules, but this might be changed later.

Performance for a single UDP socket receiving flood traffic from many RX queues/cpus, measured with a simple udp_rx program using a simple recvfrom() loop: 438 kpps instead of 374 kpps, a 17 % increase of the peak rate.

v2: Addressed Willem de Bruijn's feedback in multicast handling - keep early demux break in __udp4_lib_demux_lookup()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
Cc: Willem de Bruijn <willemb@google.com>
Tested-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
committed by
David S. Miller
parent
a4298e4522
commit
ca065d0cf8
198
net/ipv6/udp.c
198
net/ipv6/udp.c
@@ -213,37 +213,28 @@ static struct sock *udp6_lib_lookup2(struct net *net,
|
||||
struct sk_buff *skb)
|
||||
{
|
||||
struct sock *sk, *result;
|
||||
struct hlist_nulls_node *node;
|
||||
int score, badness, matches = 0, reuseport = 0;
|
||||
bool select_ok = true;
|
||||
u32 hash = 0;
|
||||
|
||||
begin:
|
||||
result = NULL;
|
||||
badness = -1;
|
||||
udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
|
||||
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
|
||||
score = compute_score2(sk, net, saddr, sport,
|
||||
daddr, hnum, dif);
|
||||
if (score > badness) {
|
||||
result = sk;
|
||||
badness = score;
|
||||
reuseport = sk->sk_reuseport;
|
||||
if (reuseport) {
|
||||
hash = udp6_ehashfn(net, daddr, hnum,
|
||||
saddr, sport);
|
||||
if (select_ok) {
|
||||
struct sock *sk2;
|
||||
|
||||
sk2 = reuseport_select_sock(sk, hash, skb,
|
||||
result = reuseport_select_sock(sk, hash, skb,
|
||||
sizeof(struct udphdr));
|
||||
if (sk2) {
|
||||
result = sk2;
|
||||
select_ok = false;
|
||||
goto found;
|
||||
}
|
||||
}
|
||||
if (result)
|
||||
return result;
|
||||
matches = 1;
|
||||
}
|
||||
result = sk;
|
||||
badness = score;
|
||||
} else if (score == badness && reuseport) {
|
||||
matches++;
|
||||
if (reciprocal_scale(hash, matches) == 0)
|
||||
@@ -251,27 +242,10 @@ begin:
|
||||
hash = next_pseudo_random32(hash);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* if the nulls value we got at the end of this lookup is
|
||||
* not the expected one, we must restart lookup.
|
||||
* We probably met an item that was moved to another chain.
|
||||
*/
|
||||
if (get_nulls_value(node) != slot2)
|
||||
goto begin;
|
||||
|
||||
if (result) {
|
||||
found:
|
||||
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
|
||||
result = NULL;
|
||||
else if (unlikely(compute_score2(result, net, saddr, sport,
|
||||
daddr, hnum, dif) < badness)) {
|
||||
sock_put(result);
|
||||
goto begin;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* rcu_read_lock() must be held */
|
||||
struct sock *__udp6_lib_lookup(struct net *net,
|
||||
const struct in6_addr *saddr, __be16 sport,
|
||||
const struct in6_addr *daddr, __be16 dport,
|
||||
@@ -279,15 +253,12 @@ struct sock *__udp6_lib_lookup(struct net *net,
|
||||
struct sk_buff *skb)
|
||||
{
|
||||
struct sock *sk, *result;
|
||||
struct hlist_nulls_node *node;
|
||||
unsigned short hnum = ntohs(dport);
|
||||
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
|
||||
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
|
||||
int score, badness, matches = 0, reuseport = 0;
|
||||
bool select_ok = true;
|
||||
u32 hash = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
if (hslot->count > 10) {
|
||||
hash2 = udp6_portaddr_hash(net, daddr, hnum);
|
||||
slot2 = hash2 & udptable->mask;
|
||||
@@ -309,34 +280,26 @@ struct sock *__udp6_lib_lookup(struct net *net,
|
||||
&in6addr_any, hnum, dif,
|
||||
hslot2, slot2, skb);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return result;
|
||||
}
|
||||
begin:
|
||||
result = NULL;
|
||||
badness = -1;
|
||||
sk_nulls_for_each_rcu(sk, node, &hslot->head) {
|
||||
sk_for_each_rcu(sk, &hslot->head) {
|
||||
score = compute_score(sk, net, hnum, saddr, sport, daddr, dport, dif);
|
||||
if (score > badness) {
|
||||
result = sk;
|
||||
badness = score;
|
||||
reuseport = sk->sk_reuseport;
|
||||
if (reuseport) {
|
||||
hash = udp6_ehashfn(net, daddr, hnum,
|
||||
saddr, sport);
|
||||
if (select_ok) {
|
||||
struct sock *sk2;
|
||||
|
||||
sk2 = reuseport_select_sock(sk, hash, skb,
|
||||
result = reuseport_select_sock(sk, hash, skb,
|
||||
sizeof(struct udphdr));
|
||||
if (sk2) {
|
||||
result = sk2;
|
||||
select_ok = false;
|
||||
goto found;
|
||||
}
|
||||
}
|
||||
if (result)
|
||||
return result;
|
||||
matches = 1;
|
||||
}
|
||||
result = sk;
|
||||
badness = score;
|
||||
} else if (score == badness && reuseport) {
|
||||
matches++;
|
||||
if (reciprocal_scale(hash, matches) == 0)
|
||||
@@ -344,25 +307,6 @@ begin:
|
||||
hash = next_pseudo_random32(hash);
|
||||
}
|
||||
}
|
||||
/*
|
||||
* if the nulls value we got at the end of this lookup is
|
||||
* not the expected one, we must restart lookup.
|
||||
* We probably met an item that was moved to another chain.
|
||||
*/
|
||||
if (get_nulls_value(node) != slot)
|
||||
goto begin;
|
||||
|
||||
if (result) {
|
||||
found:
|
||||
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
|
||||
result = NULL;
|
||||
else if (unlikely(compute_score(result, net, hnum, saddr, sport,
|
||||
daddr, dport, dif) < badness)) {
|
||||
sock_put(result);
|
||||
goto begin;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
return result;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
|
||||
@@ -382,12 +326,24 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
|
||||
udptable, skb);
|
||||
}
|
||||
|
||||
/* Must be called under rcu_read_lock().
|
||||
* Does increment socket refcount.
|
||||
*/
|
||||
#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \
|
||||
IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY)
|
||||
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
|
||||
const struct in6_addr *daddr, __be16 dport, int dif)
|
||||
{
|
||||
return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL);
|
||||
struct sock *sk;
|
||||
|
||||
sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
|
||||
dif, &udp_table, NULL);
|
||||
if (sk && !atomic_inc_not_zero(&sk->sk_refcnt))
|
||||
sk = NULL;
|
||||
return sk;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This should be easy, if there is something there we
|
||||
@@ -585,7 +541,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
|
||||
sk->sk_err = err;
|
||||
sk->sk_error_report(sk);
|
||||
out:
|
||||
sock_put(sk);
|
||||
return;
|
||||
}
|
||||
|
||||
static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
|
||||
@@ -747,33 +703,6 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
|
||||
return true;
|
||||
}
|
||||
|
||||
static void flush_stack(struct sock **stack, unsigned int count,
|
||||
struct sk_buff *skb, unsigned int final)
|
||||
{
|
||||
struct sk_buff *skb1 = NULL;
|
||||
struct sock *sk;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < count; i++) {
|
||||
sk = stack[i];
|
||||
if (likely(!skb1))
|
||||
skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
|
||||
if (!skb1) {
|
||||
atomic_inc(&sk->sk_drops);
|
||||
UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
|
||||
IS_UDPLITE(sk));
|
||||
UDP6_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
|
||||
IS_UDPLITE(sk));
|
||||
}
|
||||
|
||||
if (skb1 && udpv6_queue_rcv_skb(sk, skb1) <= 0)
|
||||
skb1 = NULL;
|
||||
sock_put(sk);
|
||||
}
|
||||
if (unlikely(skb1))
|
||||
kfree_skb(skb1);
|
||||
}
|
||||
|
||||
static void udp6_csum_zero_error(struct sk_buff *skb)
|
||||
{
|
||||
/* RFC 2460 section 8.1 says that we SHOULD log
|
||||
@@ -792,15 +721,15 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
|
||||
const struct in6_addr *saddr, const struct in6_addr *daddr,
|
||||
struct udp_table *udptable, int proto)
|
||||
{
|
||||
struct sock *sk, *stack[256 / sizeof(struct sock *)];
|
||||
struct sock *sk, *first = NULL;
|
||||
const struct udphdr *uh = udp_hdr(skb);
|
||||
struct hlist_nulls_node *node;
|
||||
unsigned short hnum = ntohs(uh->dest);
|
||||
struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
|
||||
int dif = inet6_iif(skb);
|
||||
unsigned int count = 0, offset = offsetof(typeof(*sk), sk_nulls_node);
|
||||
unsigned int offset = offsetof(typeof(*sk), sk_node);
|
||||
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
|
||||
bool inner_flushed = false;
|
||||
int dif = inet6_iif(skb);
|
||||
struct hlist_node *node;
|
||||
struct sk_buff *nskb;
|
||||
|
||||
if (use_hash2) {
|
||||
hash2_any = udp6_portaddr_hash(net, &in6addr_any, hnum) &
|
||||
@@ -811,27 +740,32 @@ start_lookup:
|
||||
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
|
||||
}
|
||||
|
||||
spin_lock(&hslot->lock);
|
||||
sk_nulls_for_each_entry_offset(sk, node, &hslot->head, offset) {
|
||||
if (__udp_v6_is_mcast_sock(net, sk,
|
||||
uh->dest, daddr,
|
||||
uh->source, saddr,
|
||||
dif, hnum) &&
|
||||
/* If zero checksum and no_check is not on for
|
||||
* the socket then skip it.
|
||||
*/
|
||||
(uh->check || udp_sk(sk)->no_check6_rx)) {
|
||||
if (unlikely(count == ARRAY_SIZE(stack))) {
|
||||
flush_stack(stack, count, skb, ~0);
|
||||
inner_flushed = true;
|
||||
count = 0;
|
||||
}
|
||||
stack[count++] = sk;
|
||||
sock_hold(sk);
|
||||
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
|
||||
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
|
||||
uh->source, saddr, dif, hnum))
|
||||
continue;
|
||||
/* If zero checksum and no_check is not on for
|
||||
* the socket then skip it.
|
||||
*/
|
||||
if (!uh->check && !udp_sk(sk)->no_check6_rx)
|
||||
continue;
|
||||
if (!first) {
|
||||
first = sk;
|
||||
continue;
|
||||
}
|
||||
nskb = skb_clone(skb, GFP_ATOMIC);
|
||||
if (unlikely(!nskb)) {
|
||||
atomic_inc(&sk->sk_drops);
|
||||
UDP6_INC_STATS_BH(net, UDP_MIB_RCVBUFERRORS,
|
||||
IS_UDPLITE(sk));
|
||||
UDP6_INC_STATS_BH(net, UDP_MIB_INERRORS,
|
||||
IS_UDPLITE(sk));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
spin_unlock(&hslot->lock);
|
||||
if (udpv6_queue_rcv_skb(sk, nskb) > 0)
|
||||
consume_skb(nskb);
|
||||
}
|
||||
|
||||
/* Also lookup *:port if we are using hash2 and haven't done so yet. */
|
||||
if (use_hash2 && hash2 != hash2_any) {
|
||||
@@ -839,13 +773,13 @@ start_lookup:
|
||||
goto start_lookup;
|
||||
}
|
||||
|
||||
if (count) {
|
||||
flush_stack(stack, count, skb, count - 1);
|
||||
if (first) {
|
||||
if (udpv6_queue_rcv_skb(first, skb) > 0)
|
||||
consume_skb(skb);
|
||||
} else {
|
||||
if (!inner_flushed)
|
||||
UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
|
||||
proto == IPPROTO_UDPLITE);
|
||||
consume_skb(skb);
|
||||
kfree_skb(skb);
|
||||
UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
|
||||
proto == IPPROTO_UDPLITE);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -853,10 +787,10 @@ start_lookup:
|
||||
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
|
||||
int proto)
|
||||
{
|
||||
struct net *net = dev_net(skb->dev);
|
||||
struct sock *sk;
|
||||
struct udphdr *uh;
|
||||
const struct in6_addr *saddr, *daddr;
|
||||
struct net *net = dev_net(skb->dev);
|
||||
struct udphdr *uh;
|
||||
struct sock *sk;
|
||||
u32 ulen = 0;
|
||||
|
||||
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
|
||||
@@ -910,7 +844,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
|
||||
int ret;
|
||||
|
||||
if (!uh->check && !udp_sk(sk)->no_check6_rx) {
|
||||
sock_put(sk);
|
||||
udp6_csum_zero_error(skb);
|
||||
goto csum_error;
|
||||
}
|
||||
@@ -920,7 +853,6 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
|
||||
ip6_compute_pseudo);
|
||||
|
||||
ret = udpv6_queue_rcv_skb(sk, skb);
|
||||
sock_put(sk);
|
||||
|
||||
/* a return value > 0 means to resubmit the input */
|
||||
if (ret > 0)
|
||||
|
||||
Reference in New Issue
Block a user