2005-06-19 05:47:59 +00:00
|
|
|
/*
|
|
|
|
* NET Generic infrastructure for Network protocols.
|
|
|
|
*
|
|
|
|
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
|
|
|
|
*
|
|
|
|
* From code originally in include/net/tcp.h
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/random.h>
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/string.h>
|
2012-08-31 12:29:12 +00:00
|
|
|
#include <linux/tcp.h>
|
2006-11-16 10:30:37 +00:00
|
|
|
#include <linux/vmalloc.h>
|
2005-06-19 05:47:59 +00:00
|
|
|
|
|
|
|
#include <net/request_sock.h>
|
|
|
|
|
2005-06-19 05:49:40 +00:00
|
|
|
/*
|
|
|
|
* Maximum number of SYN_RECV sockets in queue per LISTEN socket.
|
|
|
|
* One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
|
|
|
|
* It would be better to replace it with a global counter for all sockets
|
|
|
|
* but then some measure against one socket starving all other sockets
|
|
|
|
* would be needed.
|
|
|
|
*
|
2011-12-05 21:39:41 +00:00
|
|
|
* The minimum value of it is 128. Experiments with real servers show that
|
2005-06-19 05:49:40 +00:00
|
|
|
* it is absolutely not enough even at 100conn/sec. 256 cures most
|
2011-12-05 21:39:41 +00:00
|
|
|
* of problems.
|
|
|
|
* This value is adjusted to 128 for low memory machines,
|
|
|
|
* and it will increase in proportion to the memory of machine.
|
2006-11-16 10:30:37 +00:00
|
|
|
* Note: Don't forget somaxconn, which may limit the backlog too.
|
2005-06-19 05:49:40 +00:00
|
|
|
*/
|
|
|
|
/* Initial value is 256; reqsk_queue_alloc() uses it as the upper bound on the requested table size. */
int sysctl_max_syn_backlog = 256;
|
2010-12-02 20:14:29 +00:00
|
|
|
EXPORT_SYMBOL(sysctl_max_syn_backlog);
|
2005-06-19 05:49:40 +00:00
|
|
|
|
2005-06-19 05:47:59 +00:00
|
|
|
int reqsk_queue_alloc(struct request_sock_queue *queue,
|
2006-11-16 10:30:37 +00:00
|
|
|
unsigned int nr_table_entries)
|
2005-06-19 05:47:59 +00:00
|
|
|
{
|
2006-11-16 10:30:37 +00:00
|
|
|
size_t lopt_size = sizeof(struct listen_sock);
|
2014-06-24 12:32:48 +00:00
|
|
|
struct listen_sock *lopt = NULL;
|
2006-11-16 10:30:37 +00:00
|
|
|
|
|
|
|
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
|
|
|
|
nr_table_entries = max_t(u32, nr_table_entries, 8);
|
|
|
|
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
|
|
|
|
lopt_size += nr_table_entries * sizeof(struct request_sock *);
|
2014-06-24 12:32:48 +00:00
|
|
|
|
|
|
|
if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
|
|
|
|
lopt = kzalloc(lopt_size, GFP_KERNEL |
|
|
|
|
__GFP_NOWARN |
|
|
|
|
__GFP_NORETRY);
|
|
|
|
if (!lopt)
|
2010-11-20 07:46:35 +00:00
|
|
|
lopt = vzalloc(lopt_size);
|
2014-06-24 12:32:48 +00:00
|
|
|
if (!lopt)
|
2005-06-19 05:47:59 +00:00
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
|
|
|
|
rwlock_init(&queue->syn_wait_lock);
|
2006-03-27 01:39:55 +00:00
|
|
|
queue->rskq_accept_head = NULL;
|
2005-08-10 02:33:31 +00:00
|
|
|
lopt->nr_table_entries = nr_table_entries;
|
2014-06-24 12:32:48 +00:00
|
|
|
lopt->max_qlen_log = ilog2(nr_table_entries);
|
2005-06-19 05:47:59 +00:00
|
|
|
|
|
|
|
write_lock_bh(&queue->syn_wait_lock);
|
|
|
|
queue->listen_opt = lopt;
|
|
|
|
write_unlock_bh(&queue->syn_wait_lock);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2007-11-15 10:57:06 +00:00
|
|
|
void __reqsk_queue_destroy(struct request_sock_queue *queue)
|
|
|
|
{
|
2014-06-24 12:32:48 +00:00
|
|
|
/* This is an error recovery path only, no locking needed */
|
|
|
|
kvfree(queue->listen_opt);
|
2007-11-15 10:57:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline struct listen_sock *reqsk_queue_yank_listen_sk(
|
|
|
|
struct request_sock_queue *queue)
|
|
|
|
{
|
|
|
|
struct listen_sock *lopt;
|
|
|
|
|
|
|
|
write_lock_bh(&queue->syn_wait_lock);
|
|
|
|
lopt = queue->listen_opt;
|
|
|
|
queue->listen_opt = NULL;
|
|
|
|
write_unlock_bh(&queue->syn_wait_lock);
|
|
|
|
|
|
|
|
return lopt;
|
|
|
|
}
|
|
|
|
|
2005-08-10 02:33:31 +00:00
|
|
|
void reqsk_queue_destroy(struct request_sock_queue *queue)
|
|
|
|
{
|
|
|
|
/* make all the listen_opt local to us */
|
|
|
|
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
|
|
|
|
|
inet: get rid of central tcp/dccp listener timer
One of the major issue for TCP is the SYNACK rtx handling,
done by inet_csk_reqsk_queue_prune(), fired by the keepalive
timer of a TCP_LISTEN socket.
This function runs for awful long times, with socket lock held,
meaning that other cpus needing this lock have to spin for hundred of ms.
SYNACK are sent in huge bursts, likely to cause severe drops anyway.
This model was OK 15 years ago when memory was very tight.
We now can afford to have a timer per request sock.
Timer invocations no longer need to lock the listener,
and can be run from all cpus in parallel.
With following patch increasing somaxconn width to 32 bits,
I tested a listener with more than 4 million active request sockets,
and a steady SYNFLOOD of ~200,000 SYN per second.
Host was sending ~830,000 SYNACK per second.
This is ~100 times more what we could achieve before this patch.
Later, we will get rid of the listener hash and use ehash instead.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-20 02:04:20 +00:00
|
|
|
if (listen_sock_qlen(lopt) != 0) {
|
2006-11-16 10:30:37 +00:00
|
|
|
unsigned int i;
|
2005-08-10 02:33:31 +00:00
|
|
|
|
|
|
|
for (i = 0; i < lopt->nr_table_entries; i++) {
|
|
|
|
struct request_sock *req;
|
|
|
|
|
inet: get rid of central tcp/dccp listener timer
One of the major issue for TCP is the SYNACK rtx handling,
done by inet_csk_reqsk_queue_prune(), fired by the keepalive
timer of a TCP_LISTEN socket.
This function runs for awful long times, with socket lock held,
meaning that other cpus needing this lock have to spin for hundred of ms.
SYNACK are sent in huge bursts, likely to cause severe drops anyway.
This model was OK 15 years ago when memory was very tight.
We now can afford to have a timer per request sock.
Timer invocations no longer need to lock the listener,
and can be run from all cpus in parallel.
With following patch increasing somaxconn width to 32 bits,
I tested a listener with more than 4 million active request sockets,
and a steady SYNFLOOD of ~200,000 SYN per second.
Host was sending ~830,000 SYNACK per second.
This is ~100 times more what we could achieve before this patch.
Later, we will get rid of the listener hash and use ehash instead.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-20 02:04:20 +00:00
|
|
|
write_lock_bh(&queue->syn_wait_lock);
|
2005-08-10 02:33:31 +00:00
|
|
|
while ((req = lopt->syn_table[i]) != NULL) {
|
|
|
|
lopt->syn_table[i] = req->dl_next;
|
inet: get rid of central tcp/dccp listener timer
One of the major issue for TCP is the SYNACK rtx handling,
done by inet_csk_reqsk_queue_prune(), fired by the keepalive
timer of a TCP_LISTEN socket.
This function runs for awful long times, with socket lock held,
meaning that other cpus needing this lock have to spin for hundred of ms.
SYNACK are sent in huge bursts, likely to cause severe drops anyway.
This model was OK 15 years ago when memory was very tight.
We now can afford to have a timer per request sock.
Timer invocations no longer need to lock the listener,
and can be run from all cpus in parallel.
With following patch increasing somaxconn width to 32 bits,
I tested a listener with more than 4 million active request sockets,
and a steady SYNFLOOD of ~200,000 SYN per second.
Host was sending ~830,000 SYNACK per second.
This is ~100 times more what we could achieve before this patch.
Later, we will get rid of the listener hash and use ehash instead.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-20 02:04:20 +00:00
|
|
|
atomic_inc(&lopt->qlen_dec);
|
|
|
|
if (del_timer(&req->rsk_timer))
|
|
|
|
reqsk_put(req);
|
2015-03-16 04:12:16 +00:00
|
|
|
reqsk_put(req);
|
2005-08-10 02:33:31 +00:00
|
|
|
}
|
inet: get rid of central tcp/dccp listener timer
One of the major issue for TCP is the SYNACK rtx handling,
done by inet_csk_reqsk_queue_prune(), fired by the keepalive
timer of a TCP_LISTEN socket.
This function runs for awful long times, with socket lock held,
meaning that other cpus needing this lock have to spin for hundred of ms.
SYNACK are sent in huge bursts, likely to cause severe drops anyway.
This model was OK 15 years ago when memory was very tight.
We now can afford to have a timer per request sock.
Timer invocations no longer need to lock the listener,
and can be run from all cpus in parallel.
With following patch increasing somaxconn width to 32 bits,
I tested a listener with more than 4 million active request sockets,
and a steady SYNFLOOD of ~200,000 SYN per second.
Host was sending ~830,000 SYNACK per second.
This is ~100 times more what we could achieve before this patch.
Later, we will get rid of the listener hash and use ehash instead.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-20 02:04:20 +00:00
|
|
|
write_unlock_bh(&queue->syn_wait_lock);
|
2005-08-10 02:33:31 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
inet: get rid of central tcp/dccp listener timer
One of the major issue for TCP is the SYNACK rtx handling,
done by inet_csk_reqsk_queue_prune(), fired by the keepalive
timer of a TCP_LISTEN socket.
This function runs for awful long times, with socket lock held,
meaning that other cpus needing this lock have to spin for hundred of ms.
SYNACK are sent in huge bursts, likely to cause severe drops anyway.
This model was OK 15 years ago when memory was very tight.
We now can afford to have a timer per request sock.
Timer invocations no longer need to lock the listener,
and can be run from all cpus in parallel.
With following patch increasing somaxconn width to 32 bits,
I tested a listener with more than 4 million active request sockets,
and a steady SYNFLOOD of ~200,000 SYN per second.
Host was sending ~830,000 SYNACK per second.
This is ~100 times more what we could achieve before this patch.
Later, we will get rid of the listener hash and use ehash instead.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-20 02:04:20 +00:00
|
|
|
if (WARN_ON(listen_sock_qlen(lopt) != 0))
|
|
|
|
pr_err("qlen %u\n", listen_sock_qlen(lopt));
|
2014-06-24 12:32:48 +00:00
|
|
|
kvfree(lopt);
|
2005-08-10 02:33:31 +00:00
|
|
|
}
|
|
|
|
|
2012-08-31 12:29:12 +00:00
|
|
|
/*
|
|
|
|
* This function is called to set a Fast Open socket's "fastopen_rsk" field
|
|
|
|
* to NULL when a TFO socket no longer needs to access the request_sock.
|
|
|
|
* This happens only after 3WHS has been either completed or aborted (e.g.,
|
|
|
|
* RST is received).
|
|
|
|
*
|
|
|
|
* Before TFO, a child socket is created only after 3WHS is completed,
|
|
|
|
* hence it never needs to access the request_sock. Things get a lot more
|
|
|
|
* complex with TFO. A child socket, accepted or not, has to access its
|
|
|
|
* request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
|
|
|
|
* until 3WHS is either completed or aborted. Afterwards the req will stay
|
|
|
|
* until either the child socket is accepted, or in the rare case when the
|
|
|
|
* listener is closed before the child is accepted.
|
|
|
|
*
|
|
|
|
* In short, a request socket is only freed after BOTH 3WHS has completed
|
|
|
|
* (or aborted) and the child socket has been accepted (or listener closed).
|
|
|
|
* When a child socket is accepted, its corresponding req->sk is set to
|
|
|
|
* NULL since it's no longer needed. More importantly, "req->sk == NULL"
|
|
|
|
* will be used by the code below to determine if a child socket has been
|
|
|
|
* accepted or not, and the check is protected by the fastopenq->lock
|
|
|
|
* described below.
|
|
|
|
*
|
|
|
|
* Note that fastopen_rsk is only accessed from the child socket's context
|
|
|
|
* with its socket lock held. But a request_sock (req) can be accessed by
|
|
|
|
* both its child socket through fastopen_rsk, and a listener socket through
|
|
|
|
* icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
|
|
|
|
* lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
|
|
|
|
* Only in the rare case when both the listener and the child locks are held,
|
|
|
|
* e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
|
|
|
|
* The lock also protects other fields such as fastopenq->qlen, which is
|
|
|
|
* decremented by this function when fastopen_rsk is no longer needed.
|
|
|
|
*
|
|
|
|
* Note that another solution was to simply use the existing socket lock
|
|
|
|
* from the listener. But first socket lock is difficult to use. It is not
|
|
|
|
* a simple spin lock - one must consider sock_owned_by_user() and arrange
|
|
|
|
* to use sk_add_backlog() stuff. But what really makes it infeasible is the
|
|
|
|
* locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
|
|
|
|
* acquire a child's lock while holding listener's socket lock. A corner
|
|
|
|
* case might also exist in tcp_v4_hnd_req() that will trigger this locking
|
|
|
|
* order.
|
|
|
|
*
|
2015-03-18 01:32:29 +00:00
|
|
|
* This function also sets "treq->tfo_listener" to false.
|
|
|
|
* treq->tfo_listener is used by the listener so it is protected by the
|
2012-08-31 12:29:12 +00:00
|
|
|
* fastopenq->lock in this function.
|
|
|
|
*/
|
|
|
|
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
|
|
|
|
bool reset)
|
|
|
|
{
|
2015-03-18 01:32:29 +00:00
|
|
|
struct sock *lsk = req->rsk_listener;
|
|
|
|
struct fastopen_queue *fastopenq;
|
|
|
|
|
|
|
|
fastopenq = inet_csk(lsk)->icsk_accept_queue.fastopenq;
|
2012-08-31 12:29:12 +00:00
|
|
|
|
|
|
|
tcp_sk(sk)->fastopen_rsk = NULL;
|
|
|
|
spin_lock_bh(&fastopenq->lock);
|
|
|
|
fastopenq->qlen--;
|
2015-03-18 01:32:29 +00:00
|
|
|
tcp_rsk(req)->tfo_listener = false;
|
2012-08-31 12:29:12 +00:00
|
|
|
if (req->sk) /* the child socket hasn't been accepted yet */
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (!reset || lsk->sk_state != TCP_LISTEN) {
|
|
|
|
/* If the listener has been closed don't bother with the
|
|
|
|
* special RST handling below.
|
|
|
|
*/
|
|
|
|
spin_unlock_bh(&fastopenq->lock);
|
2015-03-16 04:12:16 +00:00
|
|
|
reqsk_put(req);
|
2012-08-31 12:29:12 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
/* Wait for 60secs before removing a req that has triggered RST.
|
|
|
|
* This is a simple defense against TFO spoofing attack - by
|
|
|
|
* counting the req against fastopen.max_qlen, and disabling
|
|
|
|
* TFO when the qlen exceeds max_qlen.
|
|
|
|
*
|
|
|
|
* For more details see CoNext'11 "TCP Fast Open" paper.
|
|
|
|
*/
|
inet: get rid of central tcp/dccp listener timer
One of the major issue for TCP is the SYNACK rtx handling,
done by inet_csk_reqsk_queue_prune(), fired by the keepalive
timer of a TCP_LISTEN socket.
This function runs for awful long times, with socket lock held,
meaning that other cpus needing this lock have to spin for hundred of ms.
SYNACK are sent in huge bursts, likely to cause severe drops anyway.
This model was OK 15 years ago when memory was very tight.
We now can afford to have a timer per request sock.
Timer invocations no longer need to lock the listener,
and can be run from all cpus in parallel.
With following patch increasing somaxconn width to 32 bits,
I tested a listener with more than 4 million active request sockets,
and a steady SYNFLOOD of ~200,000 SYN per second.
Host was sending ~830,000 SYNACK per second.
This is ~100 times more what we could achieve before this patch.
Later, we will get rid of the listener hash and use ehash instead.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2015-03-20 02:04:20 +00:00
|
|
|
req->rsk_timer.expires = jiffies + 60*HZ;
|
2012-08-31 12:29:12 +00:00
|
|
|
if (fastopenq->rskq_rst_head == NULL)
|
|
|
|
fastopenq->rskq_rst_head = req;
|
|
|
|
else
|
|
|
|
fastopenq->rskq_rst_tail->dl_next = req;
|
|
|
|
|
|
|
|
req->dl_next = NULL;
|
|
|
|
fastopenq->rskq_rst_tail = req;
|
|
|
|
fastopenq->qlen++;
|
|
|
|
out:
|
|
|
|
spin_unlock_bh(&fastopenq->lock);
|
|
|
|
}
|