2019-05-27 06:55:01 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2005-12-27 04:43:12 +00:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* Definitions for inet_sock
|
|
|
|
*
|
|
|
|
* Authors: Many, reorganised here by
|
|
|
|
* Arnaldo Carvalho de Melo <acme@mandriva.com>
|
|
|
|
*/
|
|
|
|
#ifndef _INET_SOCK_H
|
|
|
|
#define _INET_SOCK_H
|
|
|
|
|
2015-01-05 21:56:15 +00:00
|
|
|
#include <linux/bitops.h>
|
2005-12-27 04:43:12 +00:00
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/types.h>
|
2007-03-23 18:40:27 +00:00
|
|
|
#include <linux/jhash.h>
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU s good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benfit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is an request/response
test similar in structure to netperf RR test ith 100 threads on
each host, but does more work in userspace that netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-16 23:01:27 +00:00
|
|
|
#include <linux/netdevice.h>
|
2005-12-27 04:43:12 +00:00
|
|
|
|
|
|
|
#include <net/flow.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/request_sock.h>
|
2008-06-17 00:14:11 +00:00
|
|
|
#include <net/netns/hash.h>
|
2015-03-12 23:44:05 +00:00
|
|
|
#include <net/tcp_states.h>
|
2015-12-16 21:20:44 +00:00
|
|
|
#include <net/l3mdev.h>
|
2005-12-27 04:43:12 +00:00
|
|
|
|
|
|
|
/** struct ip_options - IP Options
|
|
|
|
*
|
|
|
|
* @faddr - Saved first hop address
|
2011-11-22 23:33:10 +00:00
|
|
|
* @nexthop - Saved nexthop address in LSRR and SSRR
|
2005-12-27 04:43:12 +00:00
|
|
|
* @is_strictroute - Strict source route
|
|
|
|
* @srr_is_hit - Packet destination addr was our one
|
|
|
|
* @is_changed - IP checksum more not valid
|
|
|
|
* @rr_needaddr - Need to record addr of outgoing dev
|
|
|
|
* @ts_needtime - Need to record timestamp
|
|
|
|
* @ts_needaddr - Need to record addr of outgoing dev
|
|
|
|
*/
|
|
|
|
struct ip_options {
|
2006-09-28 01:28:07 +00:00
|
|
|
__be32 faddr;
|
2011-11-22 23:33:10 +00:00
|
|
|
__be32 nexthop;
|
2005-12-27 04:43:12 +00:00
|
|
|
unsigned char optlen;
|
|
|
|
unsigned char srr;
|
|
|
|
unsigned char rr;
|
|
|
|
unsigned char ts;
|
2008-03-22 23:35:29 +00:00
|
|
|
unsigned char is_strictroute:1,
|
2005-12-27 04:43:12 +00:00
|
|
|
srr_is_hit:1,
|
|
|
|
is_changed:1,
|
|
|
|
rr_needaddr:1,
|
|
|
|
ts_needtime:1,
|
|
|
|
ts_needaddr:1;
|
|
|
|
unsigned char router_alert;
|
2006-08-03 23:46:20 +00:00
|
|
|
unsigned char cipso;
|
2005-12-27 04:43:12 +00:00
|
|
|
unsigned char __pad2;
|
2020-03-02 12:07:42 +00:00
|
|
|
unsigned char __data[];
|
2005-12-27 04:43:12 +00:00
|
|
|
};
|
|
|
|
|
2011-04-21 09:45:37 +00:00
|
|
|
struct ip_options_rcu {
|
|
|
|
struct rcu_head rcu;
|
|
|
|
struct ip_options opt;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ip_options_data {
|
|
|
|
struct ip_options_rcu opt;
|
|
|
|
char data[40];
|
|
|
|
};
|
2005-12-27 04:43:12 +00:00
|
|
|
|
|
|
|
struct inet_request_sock {
|
|
|
|
struct request_sock req;
|
2013-10-09 22:21:29 +00:00
|
|
|
#define ir_loc_addr req.__req_common.skc_rcv_saddr
|
|
|
|
#define ir_rmt_addr req.__req_common.skc_daddr
|
2013-10-10 07:04:37 +00:00
|
|
|
#define ir_num req.__req_common.skc_num
|
2013-10-09 22:21:29 +00:00
|
|
|
#define ir_rmt_port req.__req_common.skc_dport
|
|
|
|
#define ir_v6_rmt_addr req.__req_common.skc_v6_daddr
|
|
|
|
#define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr
|
|
|
|
#define ir_iif req.__req_common.skc_bound_dev_if
|
2015-03-12 01:53:14 +00:00
|
|
|
#define ir_cookie req.__req_common.skc_cookie
|
|
|
|
#define ireq_net req.__req_common.skc_net
|
2015-03-12 23:44:05 +00:00
|
|
|
#define ireq_state req.__req_common.skc_state
|
2015-03-12 23:44:10 +00:00
|
|
|
#define ireq_family req.__req_common.skc_family
|
2013-10-09 22:21:29 +00:00
|
|
|
|
2008-09-09 04:43:12 +00:00
|
|
|
u16 snd_wscale : 4,
|
|
|
|
rcv_wscale : 4,
|
2005-12-27 04:43:12 +00:00
|
|
|
tstamp_ok : 1,
|
|
|
|
sack_ok : 1,
|
|
|
|
wscale_ok : 1,
|
|
|
|
ecn_ok : 1,
|
2008-10-01 14:41:00 +00:00
|
|
|
acked : 1,
|
2017-10-25 09:01:45 +00:00
|
|
|
no_srccheck: 1,
|
|
|
|
smc_ok : 1;
|
2015-03-17 04:06:18 +00:00
|
|
|
u32 ir_mark;
|
2014-06-25 14:09:52 +00:00
|
|
|
union {
|
2017-10-20 16:04:13 +00:00
|
|
|
struct ip_options_rcu __rcu *ireq_opt;
|
2016-06-27 19:05:28 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
struct {
|
|
|
|
struct ipv6_txoptions *ipv6_opt;
|
|
|
|
struct sk_buff *pktopts;
|
|
|
|
};
|
|
|
|
#endif
|
2014-06-25 14:09:52 +00:00
|
|
|
};
|
2005-12-27 04:43:12 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/**
 * inet_rsk - view a request_sock as an inet_request_sock
 * @sk: request socket pointer
 *
 * Plain pointer cast; it relies on struct request_sock being the first
 * member of struct inet_request_sock.  The const qualifier of @sk is
 * dropped by the cast.
 */
static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
{
	return (struct inet_request_sock *)sk;
}
|
|
|
|
|
2015-03-17 04:06:18 +00:00
|
|
|
static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-13 17:17:35 +00:00
|
|
|
{
|
2022-07-13 20:51:58 +00:00
|
|
|
if (!sk->sk_mark &&
|
|
|
|
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept))
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-13 17:17:35 +00:00
|
|
|
return skb->mark;
|
2015-03-17 04:06:18 +00:00
|
|
|
|
|
|
|
return sk->sk_mark;
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-13 17:17:35 +00:00
|
|
|
}
|
|
|
|
|
2015-12-16 21:20:44 +00:00
|
|
|
static inline int inet_request_bound_dev_if(const struct sock *sk,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
2022-05-13 18:55:43 +00:00
|
|
|
int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
|
2015-12-16 21:20:44 +00:00
|
|
|
#ifdef CONFIG_NET_L3_MASTER_DEV
|
|
|
|
struct net *net = sock_net(sk);
|
|
|
|
|
2022-07-13 20:51:59 +00:00
|
|
|
if (!bound_dev_if && READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
|
2015-12-16 21:20:44 +00:00
|
|
|
return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
|
|
|
|
#endif
|
|
|
|
|
2022-05-13 18:55:43 +00:00
|
|
|
return bound_dev_if;
|
2015-12-16 21:20:44 +00:00
|
|
|
}
|
|
|
|
|
2018-11-07 15:36:02 +00:00
|
|
|
/**
 * inet_sk_bound_l3mdev - L3 master ifindex for the socket's bound device
 * @sk: socket pointer
 *
 * With CONFIG_NET_L3_MASTER_DEV and sysctl_tcp_l3mdev_accept equal to
 * zero, returns l3mdev_master_ifindex_by_index() for @sk->sk_bound_dev_if.
 * In every other case returns 0.
 */
static inline int inet_sk_bound_l3mdev(const struct sock *sk)
{
#ifdef CONFIG_NET_L3_MASTER_DEV
	struct net *net = sock_net(sk);

	if (!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
		return l3mdev_master_ifindex_by_index(net,
						      sk->sk_bound_dev_if);
#endif

	return 0;
}
|
|
|
|
|
2018-11-07 15:36:03 +00:00
|
|
|
/**
 * inet_bound_dev_eq - does a socket's device binding match a packet?
 * @l3mdev_accept: whether an unbound socket may match when @sdif is set
 * @bound_dev_if: device index the socket is bound to, 0 if unbound
 * @dif: first device index to compare against
 * @sdif: second device index to compare against, 0 if none
 *
 * A bound socket matches when @bound_dev_if equals @dif or @sdif.
 * An unbound socket matches when @sdif is 0 or @l3mdev_accept is true.
 *
 * NOTE(review): @dif / @sdif are presumably the ingress interface and its
 * L3 master device - confirm against the callers.
 */
static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
				     int dif, int sdif)
{
	if (bound_dev_if)
		return bound_dev_if == dif || bound_dev_if == sdif;

	return l3mdev_accept || !sdif;
}
|
|
|
|
|
2011-03-01 02:36:47 +00:00
|
|
|
struct inet_cork {
|
|
|
|
unsigned int flags;
|
2011-05-06 22:02:07 +00:00
|
|
|
__be32 addr;
|
2011-03-01 02:36:47 +00:00
|
|
|
struct ip_options *opt;
|
2011-05-06 22:02:07 +00:00
|
|
|
unsigned int fragsize;
|
2011-03-01 02:36:47 +00:00
|
|
|
int length; /* Total length of all frames */
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-23 23:04:42 +00:00
|
|
|
struct dst_entry *dst;
|
2011-03-01 02:36:47 +00:00
|
|
|
u8 tx_flags;
|
2013-09-24 13:43:09 +00:00
|
|
|
__u8 ttl;
|
|
|
|
__s16 tos;
|
|
|
|
char priority;
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-26 17:42:17 +00:00
|
|
|
__u16 gso_size;
|
2018-07-03 22:42:49 +00:00
|
|
|
u64 transmit_time;
|
2019-09-11 19:50:51 +00:00
|
|
|
u32 mark;
|
2011-03-01 02:36:47 +00:00
|
|
|
};
|
|
|
|
|
2011-05-06 22:02:07 +00:00
|
|
|
struct inet_cork_full {
|
|
|
|
struct inet_cork base;
|
|
|
|
struct flowi fl;
|
|
|
|
};
|
|
|
|
|
2005-12-27 04:43:12 +00:00
|
|
|
struct ip_mc_socklist;
|
|
|
|
struct ipv6_pinfo;
|
|
|
|
struct rtable;
|
|
|
|
|
|
|
|
/** struct inet_sock - representation of INET sockets
|
|
|
|
*
|
|
|
|
* @sk - ancestor class
|
|
|
|
* @pinet6 - pointer to IPv6 control block
|
2009-10-15 06:30:45 +00:00
|
|
|
* @inet_daddr - Foreign IPv4 addr
|
|
|
|
* @inet_rcv_saddr - Bound local IPv4 addr
|
|
|
|
* @inet_dport - Destination port
|
|
|
|
* @inet_num - Local port
|
|
|
|
* @inet_saddr - Sending source
|
2005-12-27 04:43:12 +00:00
|
|
|
* @uc_ttl - Unicast TTL
|
2009-10-15 06:30:45 +00:00
|
|
|
* @inet_sport - Source port
|
|
|
|
* @inet_id - ID counter for DF pkts
|
2005-12-27 04:43:12 +00:00
|
|
|
* @tos - TOS
|
|
|
|
* @mc_ttl - Multicasting TTL
|
|
|
|
* @is_icsk - is this an inet_connection_sock?
|
2012-02-08 09:11:07 +00:00
|
|
|
* @uc_index - Unicast outgoing device index
|
2005-12-27 04:43:12 +00:00
|
|
|
* @mc_index - Multicast device index
|
|
|
|
* @mc_list - Group array
|
|
|
|
* @cork - info to build ip hdr on each ip frag while socket is corked
|
|
|
|
*/
|
|
|
|
struct inet_sock {
|
|
|
|
/* sk and pinet6 has to be the first two members of inet_sock */
|
|
|
|
struct sock sk;
|
2011-12-10 09:48:31 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2005-12-27 04:43:12 +00:00
|
|
|
struct ipv6_pinfo *pinet6;
|
|
|
|
#endif
|
|
|
|
/* Socket demultiplex comparisons on incoming packets. */
|
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 19:04:07 +00:00
|
|
|
#define inet_daddr sk.__sk_common.skc_daddr
|
|
|
|
#define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
|
2012-11-30 09:49:27 +00:00
|
|
|
#define inet_dport sk.__sk_common.skc_dport
|
|
|
|
#define inet_num sk.__sk_common.skc_num
|
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 19:04:07 +00:00
|
|
|
|
2009-10-15 06:30:45 +00:00
|
|
|
__be32 inet_saddr;
|
2005-12-27 04:43:12 +00:00
|
|
|
__s16 uc_ttl;
|
|
|
|
__u16 cmsg_flags;
|
2021-10-25 16:48:16 +00:00
|
|
|
struct ip_options_rcu __rcu *inet_opt;
|
2009-10-15 06:30:45 +00:00
|
|
|
__be16 inet_sport;
|
|
|
|
__u16 inet_id;
|
2010-01-12 00:28:01 +00:00
|
|
|
|
2005-12-27 04:43:12 +00:00
|
|
|
__u8 tos;
|
2010-01-12 00:28:01 +00:00
|
|
|
__u8 min_ttl;
|
2005-12-27 04:43:12 +00:00
|
|
|
__u8 mc_ttl;
|
|
|
|
__u8 pmtudisc;
|
|
|
|
__u8 recverr:1,
|
|
|
|
is_icsk:1,
|
|
|
|
freebind:1,
|
|
|
|
hdrincl:1,
|
2008-10-01 14:30:02 +00:00
|
|
|
mc_loop:1,
|
2009-05-28 07:00:46 +00:00
|
|
|
transparent:1,
|
2010-06-15 01:07:31 +00:00
|
|
|
mc_all:1,
|
|
|
|
nodefrag:1;
|
net/tcp-fastopen: Add new API support
This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
alternative way to perform Fast Open on the active side (client). Prior
to this patch, a client needs to replace the connect() call with
sendto(MSG_FASTOPEN). This can be cumbersome for applications who want
to use Fast Open: these socket operations are often done in lower layer
libraries used by many other applications. Changing these libraries
and/or the socket call sequences are not trivial. A more convenient
approach is to perform Fast Open by simply enabling a socket option when
the socket is created w/o changing other socket calls sequence:
s = socket()
create a new socket
setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …);
newly introduced sockopt
If set, new functionality described below will be used.
Return ENOTSUPP if TFO is not supported or not enabled in the
kernel.
connect()
With cookie present, return 0 immediately.
With no cookie, initiate 3WHS with TFO cookie-request option and
return -1 with errno = EINPROGRESS.
write()/sendmsg()
With cookie present, send out SYN with data and return the number of
bytes buffered.
With no cookie, and 3WHS not yet completed, return -1 with errno =
EINPROGRESS.
No MSG_FASTOPEN flag is needed.
read()
Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but
write() is not called yet.
Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is
established but no msg is received yet.
Return number of bytes read if socket is established and there is
msg received.
The new API simplifies life for applications that always perform a write()
immediately after a successful connect(). Such applications can now take
advantage of Fast Open by merely making one new setsockopt() call at the time
of creating the socket. Nothing else about the application's socket call
sequence needs to change.
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-23 18:59:22 +00:00
|
|
|
__u8 bind_address_no_port:1,
|
icmp: support rfc 4884
Add setsockopt SOL_IP/IP_RECVERR_4884 to return the offset to an
extension struct if present.
ICMP messages may include an extension structure after the original
datagram. RFC 4884 standardized this behavior. It stores the offset
in words to the extension header in u8 icmphdr.un.reserved[1].
The field is valid only for ICMP types destination unreachable, time
exceeded and parameter problem, if length is at least 128 bytes and
entire packet does not exceed 576 bytes.
Return the offset to the start of the extension struct when reading an
ICMP error from the error queue, if it matches the above constraints.
Do not return the raw u8 field. Return the offset from the start of
the user buffer, in bytes. The kernel does not return the network and
transport headers, so subtract those.
Also validate the headers. Return the offset regardless of validation,
as an invalid extension must still not be misinterpreted as part of
the original datagram. Note that !invalid does not imply valid. If
the extension version does not match, no validation can take place,
for instance.
For backward compatibility, make this optional, set by setsockopt
SOL_IP/IP_RECVERR_RFC4884. For API example and feature test, see
github.com/wdebruij/kerneltools/blob/master/tests/recv_icmp_v2.c
For forward compatibility, reserve only setsockopt value 1, leaving
other bits for additional icmp extensions.
Changes
v1->v2:
- convert word offset to byte offset from start of user buffer
- return in ee_data as u8 may be insufficient
- define extension struct and object header structs
- return len only if constraints met
- if returning len, also validate
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2020-07-10 13:29:02 +00:00
|
|
|
recverr_rfc4884:1,
|
net/tcp-fastopen: Add new API support
This patch adds a new socket option, TCP_FASTOPEN_CONNECT, as an
alternative way to perform Fast Open on the active side (client). Prior
to this patch, a client needs to replace the connect() call with
sendto(MSG_FASTOPEN). This can be cumbersome for applications who want
to use Fast Open: these socket operations are often done in lower layer
libraries used by many other applications. Changing these libraries
and/or the socket call sequences are not trivial. A more convenient
approach is to perform Fast Open by simply enabling a socket option when
the socket is created w/o changing other socket calls sequence:
s = socket()
create a new socket
setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN_CONNECT …);
newly introduced sockopt
If set, new functionality described below will be used.
Return ENOTSUPP if TFO is not supported or not enabled in the
kernel.
connect()
With cookie present, return 0 immediately.
With no cookie, initiate 3WHS with TFO cookie-request option and
return -1 with errno = EINPROGRESS.
write()/sendmsg()
With cookie present, send out SYN with data and return the number of
bytes buffered.
With no cookie, and 3WHS not yet completed, return -1 with errno =
EINPROGRESS.
No MSG_FASTOPEN flag is needed.
read()
Return -1 with errno = EWOULDBLOCK/EAGAIN if connect() is called but
write() is not called yet.
Return -1 with errno = EWOULDBLOCK/EAGAIN if connection is
established but no msg is received yet.
Return number of bytes read if socket is established and there is
msg received.
The new API simplifies life for applications that always perform a write()
immediately after a successful connect(). Such applications can now take
advantage of Fast Open by merely making one new setsockopt() call at the time
of creating the socket. Nothing else about the application's socket call
sequence needs to change.
Signed-off-by: Wei Wang <weiwan@google.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2017-01-23 18:59:22 +00:00
|
|
|
defer_connect:1; /* Indicates that fastopen_connect is set
|
|
|
|
* and cookie exists so we defer connect
|
|
|
|
* until first data frame is written
|
|
|
|
*/
|
2012-02-09 09:35:49 +00:00
|
|
|
__u8 rcv_tos;
|
2015-01-05 21:56:14 +00:00
|
|
|
__u8 convert_csum;
|
2012-02-08 09:11:07 +00:00
|
|
|
int uc_index;
|
2005-12-27 04:43:12 +00:00
|
|
|
int mc_index;
|
2006-09-27 04:27:35 +00:00
|
|
|
__be32 mc_addr;
|
2010-11-12 05:46:50 +00:00
|
|
|
struct ip_mc_socklist __rcu *mc_list;
|
2011-05-06 22:02:07 +00:00
|
|
|
struct inet_cork_full cork;
|
2005-12-27 04:43:12 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
#define IPCORK_OPT	1	/* ip-options has been held in ipcork.opt */
#define IPCORK_ALLFRAG	2	/* always fragment (for ipv6 for now) */
|
|
|
|
|
2015-01-05 21:56:15 +00:00
|
|
|
/*
 * cmsg flags for inet
 *
 * One bit each; the highest bit in use is bit 8, so the set does not fit
 * in a single byte.
 */
#define IP_CMSG_PKTINFO		BIT(0)
#define IP_CMSG_TTL		BIT(1)
#define IP_CMSG_TOS		BIT(2)
#define IP_CMSG_RECVOPTS	BIT(3)
#define IP_CMSG_RETOPTS		BIT(4)
#define IP_CMSG_PASSSEC		BIT(5)
#define IP_CMSG_ORIGDSTADDR	BIT(6)
#define IP_CMSG_CHECKSUM	BIT(7)
#define IP_CMSG_RECVFRAGSIZE	BIT(8)
|
2015-01-05 21:56:15 +00:00
|
|
|
|
2022-06-20 19:13:53 +00:00
|
|
|
static inline bool sk_is_inet(struct sock *sk)
|
|
|
|
{
|
|
|
|
return sk->sk_family == AF_INET || sk->sk_family == AF_INET6;
|
|
|
|
}
|
|
|
|
|
2015-12-07 16:53:17 +00:00
|
|
|
/**
 * sk_to_full_sk - Access to a full socket
 * @sk: pointer to a socket
 *
 * SYNACK messages might be attached to request sockets.
 * Some places want to reach the listener in this case.
 *
 * Return: inet_reqsk(@sk)->rsk_listener when CONFIG_INET is enabled and
 * @sk is non-NULL with sk_state == TCP_NEW_SYN_RECV; @sk itself (possibly
 * NULL) otherwise.
 */
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
	if (sk && sk->sk_state == TCP_NEW_SYN_RECV)
		sk = inet_reqsk(sk)->rsk_listener;
#endif
	return sk;
}
|
|
|
|
|
|
|
|
/*
 * sk_to_full_sk() variant with a const argument.
 *
 * With CONFIG_INET, a non-NULL socket in state TCP_NEW_SYN_RECV is
 * viewed as a request_sock and its rsk_listener is returned; otherwise
 * the argument is returned unchanged.
 */
static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
{
#ifdef CONFIG_INET
	if (sk && sk->sk_state == TCP_NEW_SYN_RECV) {
		const struct request_sock *req = (const struct request_sock *)sk;

		return req->rsk_listener;
	}
#endif
	return sk;
}
|
|
|
|
|
2015-12-07 16:53:17 +00:00
|
|
|
static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return sk_to_full_sk(skb->sk);
|
|
|
|
}
|
|
|
|
|
2005-12-27 04:43:12 +00:00
|
|
|
/*
 * inet_sk - view a struct sock pointer as a struct inet_sock pointer.
 *
 * This is a plain pointer cast: no check is performed, and the const
 * qualifier of the argument is dropped in the returned pointer.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
	struct inet_sock *inet = (struct inet_sock *)sk;

	return inet;
}
|
|
|
|
|
|
|
|
static inline void __inet_sk_copy_descendant(struct sock *sk_to,
|
|
|
|
const struct sock *sk_from,
|
|
|
|
const int ancestor_size)
|
|
|
|
{
|
|
|
|
memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
|
|
|
|
sk_from->sk_prot->obj_size - ancestor_size);
|
|
|
|
}
|
|
|
|
|
2013-09-21 17:22:41 +00:00
|
|
|
/* Prototype only: no definition is visible in this part of the header. */
int inet_sk_rebuild_header(struct sock *sk);
|
2017-12-20 03:12:52 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* inet_sk_state_load - read sk->sk_state for lockless contexts
|
|
|
|
* @sk: socket pointer
|
|
|
|
*
|
|
|
|
* Paired with inet_sk_state_store(). Used in places we don't hold socket lock:
|
|
|
|
* tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ...
|
|
|
|
*/
|
|
|
|
static inline int inet_sk_state_load(const struct sock *sk)
|
|
|
|
{
|
|
|
|
/* state change might impact lockless readers. */
|
|
|
|
return smp_load_acquire(&sk->sk_state);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
 * inet_sk_state_store - update sk->sk_state
 * @sk: socket pointer
 * @newstate: new state
 *
 * Counterpart of inet_sk_state_load(). Use it wherever a state change
 * might impact lockless readers.
 */
void inet_sk_state_store(struct sock *sk, int newstate);
|
2005-12-27 04:43:12 +00:00
|
|
|
|
2017-12-20 03:12:52 +00:00
|
|
|
/* Prototype only: no definition is visible in this part of the header. */
void inet_sk_set_state(struct sock *sk, int state);
|
|
|
|
|
2013-10-19 19:48:51 +00:00
|
|
|
static inline unsigned int __inet_ehashfn(const __be32 laddr,
|
|
|
|
const __u16 lport,
|
|
|
|
const __be32 faddr,
|
|
|
|
const __be16 fport,
|
|
|
|
u32 initval)
|
2005-12-27 04:43:12 +00:00
|
|
|
{
|
2008-03-04 22:28:41 +00:00
|
|
|
return jhash_3words((__force __u32) laddr,
|
|
|
|
(__force __u32) faddr,
|
2007-03-23 18:40:27 +00:00
|
|
|
((__u32) lport) << 16 | (__force __u32)fport,
|
2013-10-19 19:48:51 +00:00
|
|
|
initval);
|
2005-12-27 04:43:12 +00:00
|
|
|
}
|
|
|
|
|
2015-03-18 01:32:27 +00:00
|
|
|
/*
 * inet_reqsk_alloc - prototype only; no definition is visible in this part
 * of the header.
 *
 * NOTE(review): the roles of @ops, @sk_listener and @attach_listener are
 * not documented here; confirm them against the definition.
 */
struct request_sock *
inet_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
		 bool attach_listener);
|
2008-06-10 19:39:35 +00:00
|
|
|
|
2008-10-01 14:41:00 +00:00
|
|
|
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
|
|
|
|
{
|
2011-01-28 06:01:53 +00:00
|
|
|
__u8 flags = 0;
|
|
|
|
|
2011-08-07 09:16:09 +00:00
|
|
|
if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl)
|
2011-01-28 06:01:53 +00:00
|
|
|
flags |= FLOWI_FLAG_ANYSRC;
|
|
|
|
return flags;
|
2008-10-01 14:41:00 +00:00
|
|
|
}
|
|
|
|
|
2015-01-05 21:56:14 +00:00
|
|
|
static inline void inet_inc_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
inet_sk(sk)->convert_csum++;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inet_dec_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
if (inet_sk(sk)->convert_csum > 0)
|
|
|
|
inet_sk(sk)->convert_csum--;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool inet_get_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
return !!inet_sk(sk)->convert_csum;
|
|
|
|
}
|
|
|
|
|
2018-07-31 19:18:11 +00:00
|
|
|
|
|
|
|
static inline bool inet_can_nonlocal_bind(struct net *net,
|
|
|
|
struct inet_sock *inet)
|
|
|
|
{
|
2022-07-13 20:51:55 +00:00
|
|
|
return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) ||
|
2018-07-31 19:18:11 +00:00
|
|
|
inet->freebind || inet->transparent;
|
|
|
|
}
|
|
|
|
|
2021-11-17 09:00:11 +00:00
|
|
|
static inline bool inet_addr_valid_or_nonlocal(struct net *net,
|
|
|
|
struct inet_sock *inet,
|
|
|
|
__be32 addr,
|
|
|
|
int addr_type)
|
|
|
|
{
|
|
|
|
return inet_can_nonlocal_bind(net, inet) ||
|
|
|
|
addr == htonl(INADDR_ANY) ||
|
|
|
|
addr_type == RTN_LOCAL ||
|
|
|
|
addr_type == RTN_MULTICAST ||
|
|
|
|
addr_type == RTN_BROADCAST;
|
|
|
|
}
|
|
|
|
|
2005-12-27 04:43:12 +00:00
|
|
|
#endif /* _INET_SOCK_H */
|