Merge branch 'bind_addr_zero'
Kuniyuki Iwashima says: ==================== Improve bind(addr, 0) behaviour. Currently we fail to bind sockets to ephemeral ports when all of the ports are exhausted even if all sockets have SO_REUSEADDR enabled. In this case, we still have a chance to connect to the different remote hosts. These patches add net.ipv4.ip_autobind_reuse option and fix the behaviour to fully utilize all space of the local (addr, port) tuples. Changes in v5: - Add more description to documents. - Fix sysctl option to use proc_dointvec_minmax. - Remove the Fixes: tag and squash two commits. Changes in v4: - Add net.ipv4.ip_autobind_reuse option to not change the current behaviour. - Modify .gitignore for test. https://lore.kernel.org/netdev/20200308181615.90135-1-kuniyu@amazon.co.jp/ Changes in v3: - Change the title and write more specific description of the 3rd patch. - Add a test in tools/testing/selftests/net/ as the 4th patch. https://lore.kernel.org/netdev/20200229113554.78338-1-kuniyu@amazon.co.jp/ Changes in v2: - Change the description of the 2nd patch ('localhost' -> 'address'). - Correct the description and the if statement of the 3rd patch. https://lore.kernel.org/netdev/20200226074631.67688-1-kuniyu@amazon.co.jp/ v1 with tests: https://lore.kernel.org/netdev/20200220152020.13056-1-kuniyu@amazon.co.jp/ ==================== Reviewed-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
93e616131a
@ -958,6 +958,15 @@ ip_nonlocal_bind - BOOLEAN
|
||||
which can be quite useful - but may break some applications.
|
||||
Default: 0
|
||||
|
||||
ip_autobind_reuse - BOOLEAN
|
||||
By default, bind() does not select the ports automatically even if
|
||||
the new socket and all sockets bound to the port have SO_REUSEADDR.
|
||||
ip_autobind_reuse allows bind() to reuse the port and this is useful
|
||||
when you use bind()+connect(), but may break some applications.
|
||||
The preferred solution is to use IP_BIND_ADDRESS_NO_PORT and this
|
||||
option should only be set by experts.
|
||||
Default: 0
|
||||
|
||||
ip_dynaddr - BOOLEAN
|
||||
If set non-zero, enables support for dynamic addresses.
|
||||
If set to a non-zero value larger than 1, a kernel log
|
||||
|
@ -101,6 +101,7 @@ struct netns_ipv4 {
|
||||
int sysctl_ip_fwd_use_pmtu;
|
||||
int sysctl_ip_fwd_update_priority;
|
||||
int sysctl_ip_nonlocal_bind;
|
||||
int sysctl_ip_autobind_reuse;
|
||||
/* Shall we try to damage output packets if routing dev changes? */
|
||||
int sysctl_ip_dynaddr;
|
||||
int sysctl_ip_early_demux;
|
||||
|
@ -131,7 +131,7 @@ static int inet_csk_bind_conflict(const struct sock *sk,
|
||||
{
|
||||
struct sock *sk2;
|
||||
bool reuse = sk->sk_reuse;
|
||||
bool reuseport = !!sk->sk_reuseport && reuseport_ok;
|
||||
bool reuseport = !!sk->sk_reuseport;
|
||||
kuid_t uid = sock_i_uid((struct sock *)sk);
|
||||
|
||||
/*
|
||||
@ -146,17 +146,21 @@ static int inet_csk_bind_conflict(const struct sock *sk,
|
||||
(!sk->sk_bound_dev_if ||
|
||||
!sk2->sk_bound_dev_if ||
|
||||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
|
||||
if ((!reuse || !sk2->sk_reuse ||
|
||||
sk2->sk_state == TCP_LISTEN) &&
|
||||
(!reuseport || !sk2->sk_reuseport ||
|
||||
rcu_access_pointer(sk->sk_reuseport_cb) ||
|
||||
(sk2->sk_state != TCP_TIME_WAIT &&
|
||||
!uid_eq(uid, sock_i_uid(sk2))))) {
|
||||
if (inet_rcv_saddr_equal(sk, sk2, true))
|
||||
break;
|
||||
}
|
||||
if (!relax && reuse && sk2->sk_reuse &&
|
||||
if (reuse && sk2->sk_reuse &&
|
||||
sk2->sk_state != TCP_LISTEN) {
|
||||
if ((!relax ||
|
||||
(!reuseport_ok &&
|
||||
reuseport && sk2->sk_reuseport &&
|
||||
!rcu_access_pointer(sk->sk_reuseport_cb) &&
|
||||
(sk2->sk_state == TCP_TIME_WAIT ||
|
||||
uid_eq(uid, sock_i_uid(sk2))))) &&
|
||||
inet_rcv_saddr_equal(sk, sk2, true))
|
||||
break;
|
||||
} else if (!reuseport_ok ||
|
||||
!reuseport || !sk2->sk_reuseport ||
|
||||
rcu_access_pointer(sk->sk_reuseport_cb) ||
|
||||
(sk2->sk_state != TCP_TIME_WAIT &&
|
||||
!uid_eq(uid, sock_i_uid(sk2)))) {
|
||||
if (inet_rcv_saddr_equal(sk, sk2, true))
|
||||
break;
|
||||
}
|
||||
@ -176,12 +180,14 @@ inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *
|
||||
int port = 0;
|
||||
struct inet_bind_hashbucket *head;
|
||||
struct net *net = sock_net(sk);
|
||||
bool relax = false;
|
||||
int i, low, high, attempt_half;
|
||||
struct inet_bind_bucket *tb;
|
||||
u32 remaining, offset;
|
||||
int l3mdev;
|
||||
|
||||
l3mdev = inet_sk_bound_l3mdev(sk);
|
||||
ports_exhausted:
|
||||
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
|
||||
other_half_scan:
|
||||
inet_get_local_port_range(net, &low, &high);
|
||||
@ -219,7 +225,7 @@ other_parity_scan:
|
||||
inet_bind_bucket_for_each(tb, &head->chain)
|
||||
if (net_eq(ib_net(tb), net) && tb->l3mdev == l3mdev &&
|
||||
tb->port == port) {
|
||||
if (!inet_csk_bind_conflict(sk, tb, false, false))
|
||||
if (!inet_csk_bind_conflict(sk, tb, relax, false))
|
||||
goto success;
|
||||
goto next_port;
|
||||
}
|
||||
@ -239,6 +245,12 @@ next_port:
|
||||
attempt_half = 2;
|
||||
goto other_half_scan;
|
||||
}
|
||||
|
||||
if (net->ipv4.sysctl_ip_autobind_reuse && !relax) {
|
||||
/* We still have a chance to connect to different destinations */
|
||||
relax = true;
|
||||
goto ports_exhausted;
|
||||
}
|
||||
return NULL;
|
||||
success:
|
||||
*port_ret = port;
|
||||
|
@ -763,6 +763,15 @@ static struct ctl_table ipv4_net_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec
|
||||
},
|
||||
{
|
||||
.procname = "ip_autobind_reuse",
|
||||
.data = &init_net.ipv4.sysctl_ip_autobind_reuse,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
{
|
||||
.procname = "fwmark_reflect",
|
||||
.data = &init_net.ipv4.sysctl_fwmark_reflect,
|
||||
|
1
tools/testing/selftests/net/.gitignore
vendored
1
tools/testing/selftests/net/.gitignore
vendored
@ -23,3 +23,4 @@ so_txtime
|
||||
tcp_fastopen_backup_key
|
||||
nettest
|
||||
fin_ack_lat
|
||||
reuseaddr_ports_exhausted
|
@ -12,6 +12,7 @@ TEST_PROGS += udpgro_bench.sh udpgro.sh test_vxlan_under_vrf.sh reuseport_addr_a
|
||||
TEST_PROGS += test_vxlan_fdb_changelink.sh so_txtime.sh ipv6_flowlabel.sh
|
||||
TEST_PROGS += tcp_fastopen_backup_key.sh fcnal-test.sh l2tp.sh traceroute.sh
|
||||
TEST_PROGS += fin_ack_lat.sh
|
||||
TEST_PROGS += reuseaddr_ports_exhausted.sh
|
||||
TEST_PROGS_EXTENDED := in_netns.sh
|
||||
TEST_GEN_FILES = socket nettest
|
||||
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
|
||||
@ -22,6 +23,7 @@ TEST_GEN_FILES += tcp_fastopen_backup_key
|
||||
TEST_GEN_FILES += fin_ack_lat
|
||||
TEST_GEN_PROGS = reuseport_bpf reuseport_bpf_cpu reuseport_bpf_numa
|
||||
TEST_GEN_PROGS += reuseport_dualstack reuseaddr_conflict tls
|
||||
TEST_GEN_FILES += reuseaddr_ports_exhausted
|
||||
|
||||
KSFT_KHDR_INSTALL := 1
|
||||
include ../lib.mk
|
||||
|
162
tools/testing/selftests/net/reuseaddr_ports_exhausted.c
Normal file
162
tools/testing/selftests/net/reuseaddr_ports_exhausted.c
Normal file
@ -0,0 +1,162 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Check if we can fully utilize 4-tuples for connect().
|
||||
*
|
||||
* Rules to bind sockets to the same port when all ephemeral ports are
|
||||
* exhausted.
|
||||
*
|
||||
* 1. if there are TCP_LISTEN sockets on the port, fail to bind.
|
||||
* 2. if there are sockets without SO_REUSEADDR, fail to bind.
|
||||
* 3. if SO_REUSEADDR is disabled, fail to bind.
|
||||
* 4. if SO_REUSEADDR is enabled and SO_REUSEPORT is disabled,
|
||||
* succeed to bind.
|
||||
* 5. if SO_REUSEADDR and SO_REUSEPORT are enabled and
|
||||
* there is no socket having both options and the same EUID,
|
||||
* succeed to bind.
|
||||
* 6. otherwise, fail to bind.
|
||||
*
|
||||
* Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
|
||||
*/
|
||||
#include <arpa/inet.h>
|
||||
#include <netinet/in.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/types.h>
|
||||
#include <unistd.h>
|
||||
#include "../kselftest_harness.h"
|
||||
|
||||
/*
 * Socket-option combination for one test case.
 *
 * The tests bind two sockets per case: index 0 of each array is passed to
 * the first bind_port() call and index 1 to the second.
 */
struct reuse_opts {
	int reuseaddr[2];	/* value passed to SO_REUSEADDR */
	int reuseport[2];	/* value passed to SO_REUSEPORT */
};

/*
 * All combinations in which at least one of the two sockets has
 * SO_REUSEADDR set to 0.
 */
static struct reuse_opts unreusable_opts[12] = {
	{ .reuseaddr = {0, 0}, .reuseport = {0, 0} },
	{ .reuseaddr = {0, 0}, .reuseport = {0, 1} },
	{ .reuseaddr = {0, 0}, .reuseport = {1, 0} },
	{ .reuseaddr = {0, 0}, .reuseport = {1, 1} },
	{ .reuseaddr = {0, 1}, .reuseport = {0, 0} },
	{ .reuseaddr = {0, 1}, .reuseport = {0, 1} },
	{ .reuseaddr = {0, 1}, .reuseport = {1, 0} },
	{ .reuseaddr = {0, 1}, .reuseport = {1, 1} },
	{ .reuseaddr = {1, 0}, .reuseport = {0, 0} },
	{ .reuseaddr = {1, 0}, .reuseport = {0, 1} },
	{ .reuseaddr = {1, 0}, .reuseport = {1, 0} },
	{ .reuseaddr = {1, 0}, .reuseport = {1, 1} },
};

/*
 * All combinations in which both sockets have SO_REUSEADDR set to 1.
 */
static struct reuse_opts reusable_opts[4] = {
	{ .reuseaddr = {1, 1}, .reuseport = {0, 0} },
	{ .reuseaddr = {1, 1}, .reuseport = {0, 1} },
	{ .reuseaddr = {1, 1}, .reuseport = {1, 0} },
	{ .reuseaddr = {1, 1}, .reuseport = {1, 1} },
};
|
||||
|
||||
int bind_port(struct __test_metadata *_metadata, int reuseaddr, int reuseport)
|
||||
{
|
||||
struct sockaddr_in local_addr;
|
||||
int len = sizeof(local_addr);
|
||||
int fd, ret;
|
||||
|
||||
fd = socket(AF_INET, SOCK_STREAM, 0);
|
||||
ASSERT_NE(-1, fd) TH_LOG("failed to open socket.");
|
||||
|
||||
ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr, sizeof(int));
|
||||
ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEADDR.");
|
||||
|
||||
ret = setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &reuseport, sizeof(int));
|
||||
ASSERT_EQ(0, ret) TH_LOG("failed to setsockopt: SO_REUSEPORT.");
|
||||
|
||||
local_addr.sin_family = AF_INET;
|
||||
local_addr.sin_addr.s_addr = inet_addr("127.0.0.1");
|
||||
local_addr.sin_port = 0;
|
||||
|
||||
if (bind(fd, (struct sockaddr *)&local_addr, len) == -1) {
|
||||
close(fd);
|
||||
return -1;
|
||||
}
|
||||
|
||||
return fd;
|
||||
}
|
||||
|
||||
TEST(reuseaddr_ports_exhausted_unreusable)
|
||||
{
|
||||
struct reuse_opts *opts;
|
||||
int i, j, fd[2];
|
||||
|
||||
for (i = 0; i < 12; i++) {
|
||||
opts = &unreusable_opts[i];
|
||||
|
||||
for (j = 0; j < 2; j++)
|
||||
fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
|
||||
|
||||
ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
|
||||
EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind.");
|
||||
|
||||
for (j = 0; j < 2; j++)
|
||||
if (fd[j] != -1)
|
||||
close(fd[j]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(reuseaddr_ports_exhausted_reusable_same_euid)
|
||||
{
|
||||
struct reuse_opts *opts;
|
||||
int i, j, fd[2];
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
opts = &reusable_opts[i];
|
||||
|
||||
for (j = 0; j < 2; j++)
|
||||
fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
|
||||
|
||||
ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
|
||||
|
||||
if (opts->reuseport[0] && opts->reuseport[1]) {
|
||||
EXPECT_EQ(-1, fd[1]) TH_LOG("should fail to bind because both sockets succeed to be listened.");
|
||||
} else {
|
||||
EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind to connect to different destinations.");
|
||||
}
|
||||
|
||||
for (j = 0; j < 2; j++)
|
||||
if (fd[j] != -1)
|
||||
close(fd[j]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(reuseaddr_ports_exhausted_reusable_different_euid)
|
||||
{
|
||||
struct reuse_opts *opts;
|
||||
int i, j, ret, fd[2];
|
||||
uid_t euid[2] = {10, 20};
|
||||
|
||||
for (i = 0; i < 4; i++) {
|
||||
opts = &reusable_opts[i];
|
||||
|
||||
for (j = 0; j < 2; j++) {
|
||||
ret = seteuid(euid[j]);
|
||||
ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: %d.", euid[j]);
|
||||
|
||||
fd[j] = bind_port(_metadata, opts->reuseaddr[j], opts->reuseport[j]);
|
||||
|
||||
ret = seteuid(0);
|
||||
ASSERT_EQ(0, ret) TH_LOG("failed to seteuid: 0.");
|
||||
}
|
||||
|
||||
ASSERT_NE(-1, fd[0]) TH_LOG("failed to bind.");
|
||||
EXPECT_NE(-1, fd[1]) TH_LOG("should succeed to bind because one socket can be bound in each euid.");
|
||||
|
||||
if (fd[1] != -1) {
|
||||
ret = listen(fd[0], 5);
|
||||
ASSERT_EQ(0, ret) TH_LOG("failed to listen.");
|
||||
|
||||
ret = listen(fd[1], 5);
|
||||
EXPECT_EQ(-1, ret) TH_LOG("should fail to listen because only one uid reserves the port in TCP_LISTEN.");
|
||||
}
|
||||
|
||||
for (j = 0; j < 2; j++)
|
||||
if (fd[j] != -1)
|
||||
close(fd[j]);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_HARNESS_MAIN
|
35
tools/testing/selftests/net/reuseaddr_ports_exhausted.sh
Executable file
35
tools/testing/selftests/net/reuseaddr_ports_exhausted.sh
Executable file
@ -0,0 +1,35 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
#
|
||||
# Run tests when all ephemeral ports are exhausted.
|
||||
#
|
||||
# Author: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
|
||||
|
||||
set +x
|
||||
set -e
|
||||
|
||||
readonly NETNS="ns-$(mktemp -u XXXXXX)"
|
||||
|
||||
# Create the ${NETNS} network namespace, bring its loopback device up,
# restrict the ephemeral port range to the single port 32768 and enable
# net.ipv4.ip_autobind_reuse inside it.
#
# Only sysctl's stdout is discarded. Its error output stays visible so
# that a failing sysctl write, which aborts the script under `set -e`,
# leaves a diagnostic behind.
setup() {
	ip netns add "${NETNS}"
	ip -netns "${NETNS}" link set lo up
	ip netns exec "${NETNS}" \
		sysctl -w net.ipv4.ip_local_port_range="32768 32768" \
		> /dev/null
	ip netns exec "${NETNS}" \
		sysctl -w net.ipv4.ip_autobind_reuse=1 > /dev/null
}
|
||||
|
||||
# Delete the ${NETNS} network namespace.
cleanup() {
	ip netns del "$NETNS"
}
|
||||
|
||||
trap cleanup EXIT
|
||||
setup
|
||||
|
||||
# Run the reuseaddr_ports_exhausted binary inside the ${NETNS} namespace.
do_test() {
	ip netns exec "$NETNS" ./reuseaddr_ports_exhausted
}
|
||||
|
||||
do_test
|
||||
echo "tests done"
|
Loading…
Reference in New Issue
Block a user