tcp: attach SYNACK messages to request sockets instead of listener
If a listen backlog is very big (to avoid syncookies), then
the listener sk->sk_wmem_alloc is the main source of false
sharing, as we need to touch it twice per SYNACK re-transmit
and TX completion.

(One SYN packet takes listener lock once, but up to 6 SYNACK
are generated)

By attaching the skb to the request socket, we remove this
source of contention.

Tested:

 listen(fd, 10485760); // single listener (no SO_REUSEPORT)
 16 RX/TX queue NIC
 Sustain a SYNFLOOD attack of ~320,000 SYN per second,
 Sending ~1,400,000 SYNACK per second.
 Perf profiles now show listener spinlock being next bottleneck.

    20.29%  [kernel]  [k] queued_spin_lock_slowpath
    10.06%  [kernel]  [k] __inet_lookup_established
     5.12%  [kernel]  [k] reqsk_timer_handler
     3.22%  [kernel]  [k] get_next_timer_interrupt
     3.00%  [kernel]  [k] tcp_make_synack
     2.77%  [kernel]  [k] ipt_do_table
     2.70%  [kernel]  [k] run_timer_softirq
     2.50%  [kernel]  [k] ip_finish_output
     2.04%  [kernel]  [k] cascade

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit ca6fb06518
parent 1b33bc3e9e
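In short: tcp_make_synack() gains an attach_req flag. When it is set, the SYNACK skb is allocated with plain alloc_skb() and charged to the request socket (taking a reference on it), so retransmits and TX completions no longer touch the listener's sk_wmem_alloc. A condensed sketch of the new ownership path, assembled from the tcp_output.c and tcp_input.c hunks below (the comments are paraphrased):

	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);	/* was sock_wmalloc(listener, ...) */
	if (unlikely(!skb)) {
		dst_release(dst);
		return NULL;
	}
	skb_reserve(skb, MAX_TCP_HEADER);

	if (attach_req) {
		/* charge the SYNACK to the request socket, not the listener */
		skb->destructor = sock_edemux;
		sock_hold(req_to_sk(req));
		skb->sk = req_to_sk(req);
	} else {
		/* Fast Open child / syncookie case: charge a full socket as before */
		skb_set_owner_w(skb, (struct sock *)sk);
	}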
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int tcp_connect(struct sock *sk);
 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
-				struct tcp_fastopen_cookie *foc);
+				struct tcp_fastopen_cookie *foc,
+				bool attach_req);
 int tcp_disconnect(struct sock *sk, int flags);
 
 void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
@@ -1715,7 +1716,8 @@ struct tcp_request_sock_ops {
 	__u32 (*init_seq)(const struct sk_buff *skb);
 	int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
 			   struct flowi *fl, struct request_sock *req,
-			   u16 queue_mapping, struct tcp_fastopen_cookie *foc);
+			   u16 queue_mapping, struct tcp_fastopen_cookie *foc,
+			   bool attach_req);
 };
 
 #ifdef CONFIG_SYN_COOKIES
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -628,7 +628,7 @@ static void reqsk_queue_hash_req(struct request_sock *req,
 	 * are committed to memory and refcnt initialized.
 	 */
 	smp_wmb();
-	atomic_set(&req->rsk_refcnt, 2);
+	atomic_set(&req->rsk_refcnt, 2 + 1);
 }
 
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
 
 	/* Activate the retrans timer so that SYNACK can be retransmitted.
-	 * The request socket is not added to the SYN table of the parent
+	 * The request socket is not added to the ehash
 	 * because it's been added to the accept queue directly.
 	 */
 	inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
 				  TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 
-	atomic_set(&req->rsk_refcnt, 1);
+	atomic_set(&req->rsk_refcnt, 2);
 	/* Add the child socket directly into the accept queue */
 	inet_csk_reqsk_queue_add(sk, req, child);
 
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6120,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	struct request_sock *req;
 	bool want_cookie = false;
 	struct flowi fl;
-	int err;
-
 
 	/* TW buckets are converted to open requests without
 	 * limitations, they conserve resources and peer is
@@ -6230,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_rsk(req)->snt_isn = isn;
 	tcp_rsk(req)->txhash = net_tx_rndhash();
 	tcp_openreq_init_rwin(req, sk, dst);
-	if (!want_cookie)
+	if (!want_cookie) {
 		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
-	err = af_ops->send_synack(fastopen_sk ?: sk, dst, &fl, req,
-				  skb_get_queue_mapping(skb), &foc);
+		tcp_reqsk_record_syn(sk, req, skb);
+	}
 	if (fastopen_sk) {
+		af_ops->send_synack(fastopen_sk, dst, &fl, req,
+				    skb_get_queue_mapping(skb), &foc, false);
 		sock_put(fastopen_sk);
 	} else {
-		if (err || want_cookie)
-			goto drop_and_free;
-
 		tcp_rsk(req)->tfo_listener = false;
-		inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		if (!want_cookie)
+			inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
+		af_ops->send_synack(sk, dst, &fl, req,
+				    skb_get_queue_mapping(skb), &foc, !want_cookie);
+		if (want_cookie)
+			goto drop_and_free;
 	}
-	tcp_reqsk_record_syn(sk, req, skb);
-
+	reqsk_put(req);
 	return 0;
 
 drop_and_release:
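Taken together with the inet_connection_sock.c hunk above, the request-socket lifetime now looks roughly like this (a sketch inferred from the hunks, not verbatim kernel code):

	/*
	 * reqsk_queue_hash_req():        refcnt = 2 + 1; the extra reference
	 *                                appears to be held for the caller
	 * tcp_make_synack(.., true):     sock_hold(req_to_sk(req)) per SYNACK
	 *                                skb; released by the skb destructor
	 *                                (sock_edemux) when the skb is freed
	 * tcp_conn_request(), on success: reqsk_put(req) drops the caller's ref
	 */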
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 			      struct flowi *fl,
 			      struct request_sock *req,
 			      u16 queue_mapping,
-			      struct tcp_fastopen_cookie *foc)
+			      struct tcp_fastopen_cookie *foc,
+			      bool attach_req)
 {
 	const struct inet_request_sock *ireq = inet_rsk(req);
 	struct flowi4 fl4;
@@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
 		return -1;
 
-	skb = tcp_make_synack(sk, dst, req, foc);
+	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
 
 	if (skb) {
 		__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk)
  */
 struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 				struct request_sock *req,
-				struct tcp_fastopen_cookie *foc)
+				struct tcp_fastopen_cookie *foc,
+				bool attach_req)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk);
@@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	u16 user_mss;
 	int mss;
 
-	/* sk is a const pointer, because we want to express multiple cpus
-	 * might call us concurrently.
-	 * sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way.
-	 */
-	skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
+	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 	if (unlikely(!skb)) {
 		dst_release(dst);
 		return NULL;
@@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
 	/* Reserve space for headers. */
 	skb_reserve(skb, MAX_TCP_HEADER);
 
+	if (attach_req) {
+		skb->destructor = sock_edemux;
+		sock_hold(req_to_sk(req));
+		skb->sk = req_to_sk(req);
+	} else {
+		/* sk is a const pointer, because we want to express multiple
+		 * cpu might call us concurrently.
+		 * sk->sk_wmem_alloc in an atomic, we can promote to rw.
+		 */
+		skb_set_owner_w(skb, (struct sock *)sk);
+	}
 	skb_dst_set(skb, dst);
 
 	mss = dst_metric_advmss(dst);
@@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
 	int res;
 
 	tcp_rsk(req)->txhash = net_tx_rndhash();
-	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL);
+	res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true);
 	if (!res) {
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 			      struct flowi *fl,
 			      struct request_sock *req,
 			      u16 queue_mapping,
-			      struct tcp_fastopen_cookie *foc)
+			      struct tcp_fastopen_cookie *foc,
+			      bool attach_req)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *np = inet6_sk(sk);
@@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 					       IPPROTO_TCP)) == NULL)
 		goto done;
 
-	skb = tcp_make_synack(sk, dst, req, foc);
+	skb = tcp_make_synack(sk, dst, req, foc, attach_req);
 
 	if (skb) {
 		__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
 		return &q->internal;
 
-	/* SYNACK messages are attached to a listener socket.
-	 * 1) They are not part of a 'flow' yet
-	 * 2) We do not want to rate limit them (eg SYNFLOOD attack),
+	/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
+	 * 1) request sockets are not full blown,
+	 *    they do not contain sk_pacing_rate
+	 * 2) They are not part of a 'flow' yet
+	 * 3) We do not want to rate limit them (eg SYNFLOOD attack),
 	 *    especially if the listener set SO_MAX_PACING_RATE
-	 * 3) We pretend they are orphaned
+	 * 4) We pretend they are orphaned
 	 */
-	if (!sk || sk->sk_state == TCP_LISTEN) {
+	if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) {
 		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
 
 		/* By forcing low order bit to 1, we make sure to not
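For context, the branch shown in the truncated trailing comment above continues roughly as follows in the sch_fq code of that era; this is a sketch for orientation, not part of this commit's diff. SYNACK skbs, now owned by a not-fully-initialized TCP_NEW_SYN_RECV socket, are deliberately orphaned and spread across synthetic hash buckets instead of being mapped to a real flow:

	if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) {
		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;

		/* Force the low order bit to 1 so the fake pointer cannot
		 * collide with a real (word-aligned) socket pointer.
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
	}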