2019-05-27 06:55:01 +00:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2005-12-27 04:43:12 +00:00
|
|
|
/*
|
|
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
|
|
* operating system. INET is implemented using the BSD Socket
|
|
|
|
* interface as the means of communication with the user level.
|
|
|
|
*
|
|
|
|
* Definitions for inet_sock
|
|
|
|
*
|
|
|
|
* Authors: Many, reorganised here by
|
|
|
|
* Arnaldo Carvalho de Melo <acme@mandriva.com>
|
|
|
|
*/
|
|
|
|
#ifndef _INET_SOCK_H
|
|
|
|
#define _INET_SOCK_H
|
|
|
|
|
2015-01-05 21:56:15 +00:00
|
|
|
#include <linux/bitops.h>
|
2005-12-27 04:43:12 +00:00
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/types.h>
|
2007-03-23 18:40:27 +00:00
|
|
|
#include <linux/jhash.h>
|
rfs: Receive Flow Steering
This patch implements receive flow steering (RFS). RFS steers
received packets for layer 3 and 4 processing to the CPU where
the application for the corresponding flow is running. RFS is an
extension of Receive Packet Steering (RPS).
The basic idea of RFS is that when an application calls recvmsg
(or sendmsg) the application's running CPU is stored in a hash
table that is indexed by the connection's rxhash which is stored in
the socket structure. The rxhash is passed in skb's received on
the connection from netif_receive_skb. For each received packet,
the associated rxhash is used to look up the CPU in the hash table,
if a valid CPU is set then the packet is steered to that CPU using
the RPS mechanisms.
The convolution of the simple approach is that it would potentially
allow OOO packets. If threads are thrashing around CPUs or multiple
threads are trying to read from the same sockets, a quickly changing
CPU value in the hash table could cause rampant OOO packets--
we consider this a non-starter.
To avoid OOO packets, this solution implements two types of hash
tables: rps_sock_flow_table and rps_dev_flow_table.
rps_sock_table is a global hash table. Each entry is just a CPU
number and it is populated in recvmsg and sendmsg as described above.
This table contains the "desired" CPUs for flows.
rps_dev_flow_table is specific to each device queue. Each entry
contains a CPU and a tail queue counter. The CPU is the "current"
CPU for a matching flow. The tail queue counter holds the value
of a tail queue counter for the associated CPU's backlog queue at
the time of last enqueue for a flow matching the entry.
Each backlog queue has a queue head counter which is incremented
on dequeue, and so a queue tail counter is computed as queue head
count + queue length. When a packet is enqueued on a backlog queue,
the current value of the queue tail counter is saved in the hash
entry of the rps_dev_flow_table.
And now the trick: when selecting the CPU for RPS (get_rps_cpu)
the rps_sock_flow table and the rps_dev_flow table for the RX queue
are consulted. When the desired CPU for the flow (found in the
rps_sock_flow table) does not match the current CPU (found in the
rps_dev_flow table), the current CPU is changed to the desired CPU
if one of the following is true:
- The current CPU is unset (equal to RPS_NO_CPU)
- Current CPU is offline
- The current CPU's queue head counter >= queue tail counter in the
rps_dev_flow table. This checks if the queue tail has advanced
beyond the last packet that was enqueued using this table entry.
This guarantees that all packets queued using this entry have been
dequeued, thus preserving in order delivery.
Making each queue have its own rps_dev_flow table has two advantages:
1) the tail queue counters will be written on each receive, so
keeping the table local to interrupting CPU is good for locality. 2)
this allows lockless access to the table-- the CPU number and queue
tail counter need to be accessed together under mutual exclusion
from netif_receive_skb, we assume that this is only called from
device napi_poll which is non-reentrant.
This patch implements RFS for TCP and connected UDP sockets.
It should be usable for other flow oriented protocols.
There are two configuration parameters for RFS. The
"rps_flow_entries" kernel init parameter sets the number of
entries in the rps_sock_flow_table, the per rxqueue sysfs entry
"rps_flow_cnt" contains the number of entries in the rps_dev_flow
table for the rxqueue. Both are rounded to power of two.
The obvious benefit of RFS (over just RPS) is that it achieves
CPU locality between the receive processing for a flow and the
applications processing; this can result in increased performance
(higher pps, lower latency).
The benefits of RFS are dependent on cache hierarchy, application
load, and other factors. On simple benchmarks, we don't necessarily
see improvement and sometimes see degradation. However, for more
complex benchmarks and for applications where cache pressure is
much higher this technique seems to perform very well.
Below are some benchmark results which show the potential benefit of
this patch. The netperf test has 500 instances of netperf TCP_RR
test with 1 byte req. and resp. The RPC test is a request/response
test similar in structure to netperf RR test with 100 threads on
each host, but does more work in userspace than netperf.
e1000e on 8 core Intel
No RFS or RPS 104K tps at 30% CPU
No RFS (best RPS config): 290K tps at 63% CPU
RFS 303K tps at 61% CPU
RPC test tps CPU% 50/90/99% usec latency Latency StdDev
No RFS/RPS 103K 48% 757/900/3185 4472.35
RPS only: 174K 73% 415/993/2468 491.66
RFS 223K 73% 379/651/1382 315.61
Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-04-16 23:01:27 +00:00
|
|
|
#include <linux/netdevice.h>
|
2005-12-27 04:43:12 +00:00
|
|
|
|
|
|
|
#include <net/flow.h>
|
|
|
|
#include <net/sock.h>
|
|
|
|
#include <net/request_sock.h>
|
2008-06-17 00:14:11 +00:00
|
|
|
#include <net/netns/hash.h>
|
2015-03-12 23:44:05 +00:00
|
|
|
#include <net/tcp_states.h>
|
2015-12-16 21:20:44 +00:00
|
|
|
#include <net/l3mdev.h>
|
2005-12-27 04:43:12 +00:00
|
|
|
|
|
|
|
/** struct ip_options - IP Options
|
|
|
|
*
|
|
|
|
* @faddr - Saved first hop address
|
2011-11-22 23:33:10 +00:00
|
|
|
* @nexthop - Saved nexthop address in LSRR and SSRR
|
2005-12-27 04:43:12 +00:00
|
|
|
* @is_strictroute - Strict source route
|
|
|
|
* @srr_is_hit - Packet destination addr was one of ours
|
|
|
|
* @is_changed - IP checksum no longer valid
|
|
|
|
* @rr_needaddr - Need to record addr of outgoing dev
|
|
|
|
* @ts_needtime - Need to record timestamp
|
|
|
|
* @ts_needaddr - Need to record addr of outgoing dev
|
|
|
|
*/
|
|
|
|
struct ip_options {
|
2006-09-28 01:28:07 +00:00
|
|
|
__be32 faddr;
|
2011-11-22 23:33:10 +00:00
|
|
|
__be32 nexthop;
|
2005-12-27 04:43:12 +00:00
|
|
|
unsigned char optlen;
|
|
|
|
unsigned char srr;
|
|
|
|
unsigned char rr;
|
|
|
|
unsigned char ts;
|
2008-03-22 23:35:29 +00:00
|
|
|
unsigned char is_strictroute:1,
|
2005-12-27 04:43:12 +00:00
|
|
|
srr_is_hit:1,
|
|
|
|
is_changed:1,
|
|
|
|
rr_needaddr:1,
|
|
|
|
ts_needtime:1,
|
|
|
|
ts_needaddr:1;
|
|
|
|
unsigned char router_alert;
|
2006-08-03 23:46:20 +00:00
|
|
|
unsigned char cipso;
|
2005-12-27 04:43:12 +00:00
|
|
|
unsigned char __pad2;
|
2020-03-02 12:07:42 +00:00
|
|
|
unsigned char __data[];
|
2005-12-27 04:43:12 +00:00
|
|
|
};
|
|
|
|
|
2011-04-21 09:45:37 +00:00
|
|
|
struct ip_options_rcu {
|
|
|
|
struct rcu_head rcu;
|
|
|
|
struct ip_options opt;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct ip_options_data {
|
|
|
|
struct ip_options_rcu opt;
|
|
|
|
char data[40];
|
|
|
|
};
|
2005-12-27 04:43:12 +00:00
|
|
|
|
|
|
|
struct inet_request_sock {
|
|
|
|
struct request_sock req;
|
2013-10-09 22:21:29 +00:00
|
|
|
#define ir_loc_addr req.__req_common.skc_rcv_saddr
|
|
|
|
#define ir_rmt_addr req.__req_common.skc_daddr
|
2013-10-10 07:04:37 +00:00
|
|
|
#define ir_num req.__req_common.skc_num
|
2013-10-09 22:21:29 +00:00
|
|
|
#define ir_rmt_port req.__req_common.skc_dport
|
|
|
|
#define ir_v6_rmt_addr req.__req_common.skc_v6_daddr
|
|
|
|
#define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr
|
|
|
|
#define ir_iif req.__req_common.skc_bound_dev_if
|
2015-03-12 01:53:14 +00:00
|
|
|
#define ir_cookie req.__req_common.skc_cookie
|
|
|
|
#define ireq_net req.__req_common.skc_net
|
2015-03-12 23:44:05 +00:00
|
|
|
#define ireq_state req.__req_common.skc_state
|
2015-03-12 23:44:10 +00:00
|
|
|
#define ireq_family req.__req_common.skc_family
|
2013-10-09 22:21:29 +00:00
|
|
|
|
2008-09-09 04:43:12 +00:00
|
|
|
u16 snd_wscale : 4,
|
|
|
|
rcv_wscale : 4,
|
2005-12-27 04:43:12 +00:00
|
|
|
tstamp_ok : 1,
|
|
|
|
sack_ok : 1,
|
|
|
|
wscale_ok : 1,
|
|
|
|
ecn_ok : 1,
|
2008-10-01 14:41:00 +00:00
|
|
|
acked : 1,
|
2017-10-25 09:01:45 +00:00
|
|
|
no_srccheck: 1,
|
|
|
|
smc_ok : 1;
|
2015-03-17 04:06:18 +00:00
|
|
|
u32 ir_mark;
|
2014-06-25 14:09:52 +00:00
|
|
|
union {
|
2017-10-20 16:04:13 +00:00
|
|
|
struct ip_options_rcu __rcu *ireq_opt;
|
2016-06-27 19:05:28 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
|
|
|
struct {
|
|
|
|
struct ipv6_txoptions *ipv6_opt;
|
|
|
|
struct sk_buff *pktopts;
|
|
|
|
};
|
|
|
|
#endif
|
2014-06-25 14:09:52 +00:00
|
|
|
};
|
2005-12-27 04:43:12 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Convert a request_sock pointer into a pointer to the inet_request_sock
 * that starts with it (req is the first member). The const qualifier of
 * the argument is dropped by the cast.
 */
static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk)
{
	struct inet_request_sock *ireq = (struct inet_request_sock *)sk;

	return ireq;
}
|
|
|
|
|
2015-03-17 04:06:18 +00:00
|
|
|
static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-13 17:17:35 +00:00
|
|
|
{
|
2023-07-28 15:03:15 +00:00
|
|
|
u32 mark = READ_ONCE(sk->sk_mark);
|
|
|
|
|
|
|
|
if (!mark && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept))
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-13 17:17:35 +00:00
|
|
|
return skb->mark;
|
2015-03-17 04:06:18 +00:00
|
|
|
|
2023-07-28 15:03:15 +00:00
|
|
|
return mark;
|
net: support marking accepting TCP sockets
When using mark-based routing, sockets returned from accept()
may need to be marked differently depending on the incoming
connection request.
This is the case, for example, if different socket marks identify
different networks: a listening socket may want to accept
connections from all networks, but each connection should be
marked with the network that the request came in on, so that
subsequent packets are sent on the correct network.
This patch adds a sysctl to mark TCP sockets based on the fwmark
of the incoming SYN packet. If enabled, and an unmarked socket
receives a SYN, then the SYN packet's fwmark is written to the
connection's inet_request_sock, and later written back to the
accepted socket when the connection is established. If the
socket already has a nonzero mark, then the behaviour is the same
as it is today, i.e., the listening socket's fwmark is used.
Black-box tested using user-mode linux:
- IPv4/IPv6 SYN+ACK, FIN, etc. packets are routed based on the
mark of the incoming SYN packet.
- The socket returned by accept() is marked with the mark of the
incoming SYN packet.
- Tested with syncookies=1 and syncookies=2.
Signed-off-by: Lorenzo Colitti <lorenzo@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-13 17:17:35 +00:00
|
|
|
}
|
|
|
|
|
2015-12-16 21:20:44 +00:00
|
|
|
static inline int inet_request_bound_dev_if(const struct sock *sk,
|
|
|
|
struct sk_buff *skb)
|
|
|
|
{
|
2022-05-13 18:55:43 +00:00
|
|
|
int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
|
2015-12-16 21:20:44 +00:00
|
|
|
#ifdef CONFIG_NET_L3_MASTER_DEV
|
|
|
|
struct net *net = sock_net(sk);
|
|
|
|
|
2022-07-13 20:51:59 +00:00
|
|
|
if (!bound_dev_if && READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
|
2015-12-16 21:20:44 +00:00
|
|
|
return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
|
|
|
|
#endif
|
|
|
|
|
2022-05-13 18:55:43 +00:00
|
|
|
return bound_dev_if;
|
2015-12-16 21:20:44 +00:00
|
|
|
}
|
|
|
|
|
2018-11-07 15:36:02 +00:00
|
|
|
/*
 * Return the l3mdev master ifindex looked up from @sk's bound device,
 * but only when CONFIG_NET_L3_MASTER_DEV is enabled and the netns has
 * tcp_l3mdev_accept cleared. In every other case the result is 0.
 */
static inline int inet_sk_bound_l3mdev(const struct sock *sk)
{
	int master_ifindex = 0;
#ifdef CONFIG_NET_L3_MASTER_DEV
	struct net *net = sock_net(sk);

	if (!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept))
		master_ifindex = l3mdev_master_ifindex_by_index(net,
							sk->sk_bound_dev_if);
#endif

	return master_ifindex;
}
|
|
|
|
|
2018-11-07 15:36:03 +00:00
|
|
|
/*
 * Decide whether a socket's device binding matches an incoming packet.
 *
 * @l3mdev_accept: whether unbound sockets may match packets with a non-zero
 *                 @sdif
 * @bound_dev_if:  device index the socket is bound to, 0 if unbound
 * @dif:           first device index to compare against
 * @sdif:          second device index to compare against, 0 if none
 *
 * A bound socket matches when its index equals @dif or @sdif. An unbound
 * socket matches when @sdif is 0 or @l3mdev_accept is true.
 */
static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if,
				     int dif, int sdif)
{
	if (bound_dev_if)
		return bound_dev_if == dif || bound_dev_if == sdif;

	return l3mdev_accept || !sdif;
}
|
|
|
|
|
2022-07-25 18:14:42 +00:00
|
|
|
/*
 * Wrapper around inet_bound_dev_eq() that takes the l3mdev_accept policy
 * from @net: the tcp_l3mdev_accept sysctl when CONFIG_NET_L3_MASTER_DEV is
 * enabled, unconditionally true otherwise.
 */
static inline bool inet_sk_bound_dev_eq(struct net *net, int bound_dev_if,
					int dif, int sdif)
{
	bool l3mdev_accept = true;

#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
	l3mdev_accept = !!READ_ONCE(net->ipv4.sysctl_tcp_l3mdev_accept);
#endif

	return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif);
}
|
|
|
|
|
2011-03-01 02:36:47 +00:00
|
|
|
struct inet_cork {
|
|
|
|
unsigned int flags;
|
2011-05-06 22:02:07 +00:00
|
|
|
__be32 addr;
|
2011-03-01 02:36:47 +00:00
|
|
|
struct ip_options *opt;
|
2011-05-06 22:02:07 +00:00
|
|
|
unsigned int fragsize;
|
2011-03-01 02:36:47 +00:00
|
|
|
int length; /* Total length of all frames */
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-23 23:04:42 +00:00
|
|
|
struct dst_entry *dst;
|
2011-03-01 02:36:47 +00:00
|
|
|
u8 tx_flags;
|
2013-09-24 13:43:09 +00:00
|
|
|
__u8 ttl;
|
|
|
|
__s16 tos;
|
|
|
|
char priority;
|
udp: generate gso with UDP_SEGMENT
Support generic segmentation offload for udp datagrams. Callers can
concatenate and send at once the payload of multiple datagrams with
the same destination.
To set segment size, the caller sets socket option UDP_SEGMENT to the
length of each discrete payload. This value must be smaller than or
equal to the relevant MTU.
A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a
per send call basis.
Total byte length may then exceed MTU. If not an exact multiple of
segment size, the last segment will be shorter.
The implementation adds a gso_size field to the udp socket, ip(v6)
cmsg cookie and inet_cork structure to be able to set the value at
setsockopt or cmsg time and to work with both lockless and corked
paths.
Initial benchmark numbers show UDP GSO about as expensive as TCP GSO.
tcp tso
3197 MB/s 54232 msg/s 54232 calls/s
6,457,754,262 cycles
tcp gso
1765 MB/s 29939 msg/s 29939 calls/s
11,203,021,806 cycles
tcp without tso/gso *
739 MB/s 12548 msg/s 12548 calls/s
11,205,483,630 cycles
udp
876 MB/s 14873 msg/s 624666 calls/s
11,205,777,429 cycles
udp gso
2139 MB/s 36282 msg/s 36282 calls/s
11,204,374,561 cycles
[*] after reverting commit 0a6b2a1dc2a2
("tcp: switch to GSO being always on")
Measured total system cycles ('-a') for one core while pinning both
the network receive path and benchmark process to that core:
perf stat -a -C 12 -e cycles \
./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4
Note the reduction in calls/s with GSO. Bytes per syscall drops
increases from 1470 to 61818.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2018-04-26 17:42:17 +00:00
|
|
|
__u16 gso_size;
|
2018-07-03 22:42:49 +00:00
|
|
|
u64 transmit_time;
|
2019-09-11 19:50:51 +00:00
|
|
|
u32 mark;
|
2011-03-01 02:36:47 +00:00
|
|
|
};
|
|
|
|
|
2011-05-06 22:02:07 +00:00
|
|
|
struct inet_cork_full {
|
|
|
|
struct inet_cork base;
|
|
|
|
struct flowi fl;
|
|
|
|
};
|
|
|
|
|
2005-12-27 04:43:12 +00:00
|
|
|
struct ip_mc_socklist;
|
|
|
|
struct ipv6_pinfo;
|
|
|
|
struct rtable;
|
|
|
|
|
|
|
|
/** struct inet_sock - representation of INET sockets
|
|
|
|
*
|
|
|
|
* @sk - ancestor class
|
|
|
|
* @pinet6 - pointer to IPv6 control block
|
2009-10-15 06:30:45 +00:00
|
|
|
* @inet_daddr - Foreign IPv4 addr
|
|
|
|
* @inet_rcv_saddr - Bound local IPv4 addr
|
|
|
|
* @inet_dport - Destination port
|
|
|
|
* @inet_num - Local port
|
2023-08-16 08:15:33 +00:00
|
|
|
* @inet_flags - various atomic flags
|
2009-10-15 06:30:45 +00:00
|
|
|
* @inet_saddr - Sending source
|
2005-12-27 04:43:12 +00:00
|
|
|
* @uc_ttl - Unicast TTL
|
2009-10-15 06:30:45 +00:00
|
|
|
* @inet_sport - Source port
|
|
|
|
* @inet_id - ID counter for DF pkts
|
2005-12-27 04:43:12 +00:00
|
|
|
* @tos - TOS
|
|
|
|
* @mc_ttl - Multicasting TTL
|
2012-02-08 09:11:07 +00:00
|
|
|
* @uc_index - Unicast outgoing device index
|
2005-12-27 04:43:12 +00:00
|
|
|
* @mc_index - Multicast device index
|
|
|
|
* @mc_list - Group array
|
|
|
|
* @cork - info to build ip hdr on each ip frag while socket is corked
|
|
|
|
*/
|
|
|
|
struct inet_sock {
|
|
|
|
/* sk and pinet6 has to be the first two members of inet_sock */
|
|
|
|
struct sock sk;
|
2011-12-10 09:48:31 +00:00
|
|
|
#if IS_ENABLED(CONFIG_IPV6)
|
2005-12-27 04:43:12 +00:00
|
|
|
struct ipv6_pinfo *pinet6;
|
|
|
|
#endif
|
|
|
|
/* Socket demultiplex comparisons on incoming packets. */
|
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 19:04:07 +00:00
|
|
|
#define inet_daddr sk.__sk_common.skc_daddr
|
|
|
|
#define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr
|
2012-11-30 09:49:27 +00:00
|
|
|
#define inet_dport sk.__sk_common.skc_dport
|
|
|
|
#define inet_num sk.__sk_common.skc_num
|
net: optimize INET input path further
Followup of commit b178bb3dfc30 (net: reorder struct sock fields)
Optimize INET input path a bit further, by :
1) moving sk_refcnt close to sk_lock.
This reduces number of dirtied cache lines by one on 64bit arches (and
64 bytes cache line size).
2) moving inet_daddr & inet_rcv_saddr at the beginning of sk
(same cache line than hash / family / bound_dev_if / nulls_node)
This reduces number of accessed cache lines in lookups by one, and dont
increase size of inet and timewait socks.
inet and tw sockets now share same place-holder for these fields.
Before patch :
offsetof(struct sock, sk_refcnt) = 0x10
offsetof(struct sock, sk_lock) = 0x40
offsetof(struct sock, sk_receive_queue) = 0x60
offsetof(struct inet_sock, inet_daddr) = 0x270
offsetof(struct inet_sock, inet_rcv_saddr) = 0x274
After patch :
offsetof(struct sock, sk_refcnt) = 0x44
offsetof(struct sock, sk_lock) = 0x48
offsetof(struct sock, sk_receive_queue) = 0x68
offsetof(struct inet_sock, inet_daddr) = 0x0
offsetof(struct inet_sock, inet_rcv_saddr) = 0x4
compute_score() (udp or tcp) now use a single cache line per ignored
item, instead of two.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2010-11-30 19:04:07 +00:00
|
|
|
|
2023-08-16 08:15:33 +00:00
|
|
|
unsigned long inet_flags;
|
2009-10-15 06:30:45 +00:00
|
|
|
__be32 inet_saddr;
|
2005-12-27 04:43:12 +00:00
|
|
|
__s16 uc_ttl;
|
2009-10-15 06:30:45 +00:00
|
|
|
__be16 inet_sport;
|
2023-08-16 08:15:33 +00:00
|
|
|
struct ip_options_rcu __rcu *inet_opt;
|
2023-08-19 03:17:07 +00:00
|
|
|
atomic_t inet_id;
|
2010-01-12 00:28:01 +00:00
|
|
|
|
2005-12-27 04:43:12 +00:00
|
|
|
__u8 tos;
|
2010-01-12 00:28:01 +00:00
|
|
|
__u8 min_ttl;
|
2005-12-27 04:43:12 +00:00
|
|
|
__u8 mc_ttl;
|
|
|
|
__u8 pmtudisc;
|
2012-02-09 09:35:49 +00:00
|
|
|
__u8 rcv_tos;
|
2015-01-05 21:56:14 +00:00
|
|
|
__u8 convert_csum;
|
2012-02-08 09:11:07 +00:00
|
|
|
int uc_index;
|
2005-12-27 04:43:12 +00:00
|
|
|
int mc_index;
|
2006-09-27 04:27:35 +00:00
|
|
|
__be32 mc_addr;
|
inet: Add IP_LOCAL_PORT_RANGE socket option
Users who want to share a single public IP address for outgoing connections
between several hosts traditionally reach for SNAT. However, SNAT requires
state keeping on the node(s) performing the NAT.
A stateless alternative exists, where a single IP address used for egress
can be shared between several hosts by partitioning the available ephemeral
port range. In such a setup:
1. Each host gets assigned a disjoint range of ephemeral ports.
2. Applications open connections from the host-assigned port range.
3. Return traffic gets routed to the host based on both, the destination IP
and the destination port.
An application which wants to open an outgoing connection (connect) from a
given port range today can choose between two solutions:
1. Manually pick the source port by bind()'ing to it before connect()'ing
the socket.
This approach has a couple of downsides:
a) Search for a free port has to be implemented in the user-space. If
the chosen 4-tuple happens to be busy, the application needs to retry
from a different local port number.
Detecting if 4-tuple is busy can be either easy (TCP) or hard
(UDP). In TCP case, the application simply has to check if connect()
returned an error (EADDRNOTAVAIL). That is assuming that the local
port sharing was enabled (REUSEADDR) by all the sockets.
# Assume desired local port range is 60_000-60_511
s = socket(AF_INET, SOCK_STREAM)
s.setsockopt(SOL_SOCKET, SO_REUSEADDR, 1)
s.bind(("192.0.2.1", 60_000))
s.connect(("1.1.1.1", 53))
# Fails only if 192.0.2.1:60000 -> 1.1.1.1:53 is busy
# Application must retry with another local port
In case of UDP, the network stack allows binding more than one socket
to the same 4-tuple, when local port sharing is enabled
(REUSEADDR). Hence detecting the conflict is much harder and involves
querying sock_diag and toggling the REUSEADDR flag [1].
b) For TCP, bind()-ing to a port within the ephemeral port range means
that no connecting sockets, that is those which leave it to the
network stack to find a free local port at connect() time, can use
the this port.
IOW, the bind hash bucket tb->fastreuse will be 0 or 1, and the port
will be skipped during the free port search at connect() time.
2. Isolate the app in a dedicated netns and use the use the per-netns
ip_local_port_range sysctl to adjust the ephemeral port range bounds.
The per-netns setting affects all sockets, so this approach can be used
only if:
- there is just one egress IP address, or
- the desired egress port range is the same for all egress IP addresses
used by the application.
For TCP, this approach avoids the downsides of (1). Free port search and
4-tuple conflict detection is done by the network stack:
system("sysctl -w net.ipv4.ip_local_port_range='60000 60511'")
s = socket(AF_INET, SOCK_STREAM)
s.setsockopt(SOL_IP, IP_BIND_ADDRESS_NO_PORT, 1)
s.bind(("192.0.2.1", 0))
s.connect(("1.1.1.1", 53))
# Fails if all 4-tuples 192.0.2.1:60000-60511 -> 1.1.1.1:53 are busy
For UDP this approach has limited applicability. Setting the
IP_BIND_ADDRESS_NO_PORT socket option does not result in local source
port being shared with other connected UDP sockets.
Hence relying on the network stack to find a free source port, limits the
number of outgoing UDP flows from a single IP address down to the number
of available ephemeral ports.
To put it another way, partitioning the ephemeral port range between hosts
using the existing Linux networking API is cumbersome.
To address this use case, add a new socket option at the SOL_IP level,
named IP_LOCAL_PORT_RANGE. The new option can be used to clamp down the
ephemeral port range for each socket individually.
The option can be used only to narrow down the per-netns local port
range. If the per-socket range lies outside of the per-netns range, the
latter takes precedence.
UAPI-wise, the low and high range bounds are passed to the kernel as a pair
of u16 values in host byte order packed into a u32. This avoids pointer
passing.
PORT_LO = 40_000
PORT_HI = 40_511
s = socket(AF_INET, SOCK_STREAM)
v = struct.pack("I", PORT_HI << 16 | PORT_LO)
s.setsockopt(SOL_IP, IP_LOCAL_PORT_RANGE, v)
s.bind(("127.0.0.1", 0))
s.getsockname()
# Local address between ("127.0.0.1", 40_000) and ("127.0.0.1", 40_511),
# if there is a free port. EADDRINUSE otherwise.
[1] https://github.com/cloudflare/cloudflare-blog/blob/232b432c1d57/2022-02-connectx/connectx.py#L116
Reviewed-by: Marek Majkowski <marek@cloudflare.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2023-01-24 13:36:43 +00:00
|
|
|
struct {
|
|
|
|
__u16 lo;
|
|
|
|
__u16 hi;
|
|
|
|
} local_port_range;
|
2023-08-16 08:15:45 +00:00
|
|
|
|
|
|
|
struct ip_mc_socklist __rcu *mc_list;
|
|
|
|
struct inet_cork_full cork;
|
2005-12-27 04:43:12 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
|
|
|
|
|
2023-08-16 08:15:33 +00:00
|
|
|
/*
 * Bit numbers within inet_sock::inet_flags, used through the
 * inet_test_bit()/inet_set_bit()/inet_clear_bit()/inet_assign_bit() helpers.
 */
enum {
	/* Bits 0-8 are the ones the IP_CMSG_* masks are built from. */
	INET_FLAGS_PKTINFO		= 0,
	INET_FLAGS_TTL			= 1,
	INET_FLAGS_TOS			= 2,
	INET_FLAGS_RECVOPTS		= 3,
	INET_FLAGS_RETOPTS		= 4,
	INET_FLAGS_PASSSEC		= 5,
	INET_FLAGS_ORIGDSTADDR		= 6,
	INET_FLAGS_CHECKSUM		= 7,
	INET_FLAGS_RECVFRAGSIZE		= 8,

	INET_FLAGS_RECVERR		= 9,
	INET_FLAGS_RECVERR_RFC4884	= 10,
	INET_FLAGS_FREEBIND		= 11,
	INET_FLAGS_HDRINCL		= 12,
	INET_FLAGS_MC_LOOP		= 13,
	INET_FLAGS_MC_ALL		= 14,
	INET_FLAGS_TRANSPARENT		= 15,
	INET_FLAGS_IS_ICSK		= 16,
	INET_FLAGS_NODEFRAG		= 17,
	INET_FLAGS_BIND_ADDRESS_NO_PORT	= 18,
	INET_FLAGS_DEFER_CONNECT	= 19,
	INET_FLAGS_MC6_LOOP		= 20,
	INET_FLAGS_RECVERR6_RFC4884	= 21,
	INET_FLAGS_MC6_ALL		= 22,
	INET_FLAGS_AUTOFLOWLABEL_SET	= 23,
	INET_FLAGS_AUTOFLOWLABEL	= 24,
	INET_FLAGS_DONTFRAG		= 25,
	INET_FLAGS_RECVERR6		= 26,
	INET_FLAGS_REPFLOW		= 27,
	INET_FLAGS_RTALERT_ISOLATE	= 28,
	INET_FLAGS_SNDFLOW		= 29,
};
|
|
|
|
|
2015-01-05 21:56:15 +00:00
|
|
|
/* cmsg flags for inet */
|
2023-08-16 08:15:33 +00:00
|
|
|
/* Each IP_CMSG_* mask is the single bit of the matching INET_FLAGS_* number. */
#define IP_CMSG_PKTINFO		BIT(INET_FLAGS_PKTINFO)
#define IP_CMSG_TTL		BIT(INET_FLAGS_TTL)
#define IP_CMSG_TOS		BIT(INET_FLAGS_TOS)
#define IP_CMSG_RECVOPTS	BIT(INET_FLAGS_RECVOPTS)
#define IP_CMSG_RETOPTS		BIT(INET_FLAGS_RETOPTS)
#define IP_CMSG_PASSSEC		BIT(INET_FLAGS_PASSSEC)
#define IP_CMSG_ORIGDSTADDR	BIT(INET_FLAGS_ORIGDSTADDR)
#define IP_CMSG_CHECKSUM	BIT(INET_FLAGS_CHECKSUM)
#define IP_CMSG_RECVFRAGSIZE	BIT(INET_FLAGS_RECVFRAGSIZE)

/* Union of all nine cmsg masks above. */
#define IP_CMSG_ALL	(IP_CMSG_PKTINFO | IP_CMSG_TTL |		\
			 IP_CMSG_TOS | IP_CMSG_RECVOPTS |		\
			 IP_CMSG_RETOPTS | IP_CMSG_PASSSEC |		\
			 IP_CMSG_ORIGDSTADDR | IP_CMSG_CHECKSUM |	\
			 IP_CMSG_RECVFRAGSIZE)
|
|
|
|
|
|
|
|
static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
|
|
|
|
{
|
|
|
|
return READ_ONCE(inet->inet_flags) & IP_CMSG_ALL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Bit helpers operating on inet_sk(sk)->inet_flags.
 *
 * @nr is the flag name WITHOUT its INET_FLAGS_ prefix; the prefix is
 * token-pasted on, e.g. inet_test_bit(HDRINCL, sk) tests
 * INET_FLAGS_HDRINCL.
 */
#define inet_test_bit(nr, sk)				\
	test_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)

#define inet_set_bit(nr, sk)				\
	set_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)

#define inet_clear_bit(nr, sk)				\
	clear_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags)

#define inet_assign_bit(nr, sk, val)			\
	assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)
|
2015-01-05 21:56:15 +00:00
|
|
|
|
2022-06-20 19:13:53 +00:00
|
|
|
static inline bool sk_is_inet(struct sock *sk)
|
|
|
|
{
|
|
|
|
return sk->sk_family == AF_INET || sk->sk_family == AF_INET6;
|
|
|
|
}
|
|
|
|
|
2015-12-07 16:53:17 +00:00
|
|
|
/**
 * sk_to_full_sk - Access to a full socket
 * @sk: pointer to a socket (may be NULL)
 *
 * SYNACK messages might be attached to request sockets, and some
 * callers want to reach the listener in that case.
 *
 * With CONFIG_INET, a non-NULL @sk in TCP_NEW_SYN_RECV state is treated
 * as a request socket and its rsk_listener is returned.  In every other
 * case @sk is returned unchanged.
 */
static inline struct sock *sk_to_full_sk(struct sock *sk)
{
#ifdef CONFIG_INET
	/* NULL and anything that is not a request socket pass through. */
	if (!sk || sk->sk_state != TCP_NEW_SYN_RECV)
		return sk;

	return inet_reqsk(sk)->rsk_listener;
#else
	return sk;
#endif
}
|
|
|
|
|
|
|
|
/*
 * Const-qualified counterpart of sk_to_full_sk(): same lookup, but both
 * the argument and the returned socket are const.
 */
static inline const struct sock *sk_const_to_full_sk(const struct sock *sk)
{
#ifdef CONFIG_INET
	const struct request_sock *req;

	/* NULL and anything that is not a request socket pass through. */
	if (!sk || sk->sk_state != TCP_NEW_SYN_RECV)
		return sk;

	req = (const struct request_sock *)sk;
	return req->rsk_listener;
#else
	return sk;
#endif
}
|
|
|
|
|
2015-12-07 16:53:17 +00:00
|
|
|
static inline struct sock *skb_to_full_sk(const struct sk_buff *skb)
|
|
|
|
{
|
|
|
|
return sk_to_full_sk(skb->sk);
|
|
|
|
}
|
|
|
|
|
2023-03-16 15:31:55 +00:00
|
|
|
/*
 * Map @ptr, a pointer to the 'sk' member, back to the struct inet_sock
 * that contains it (via container_of_const()).
 */
#define inet_sk(ptr) container_of_const(ptr, struct inet_sock, sk)
|
2005-12-27 04:43:12 +00:00
|
|
|
|
|
|
|
static inline void __inet_sk_copy_descendant(struct sock *sk_to,
|
|
|
|
const struct sock *sk_from,
|
|
|
|
const int ancestor_size)
|
|
|
|
{
|
|
|
|
memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1,
|
|
|
|
sk_from->sk_prot->obj_size - ancestor_size);
|
|
|
|
}
|
|
|
|
|
2013-09-21 17:22:41 +00:00
|
|
|
int inet_sk_rebuild_header(struct sock *sk);
|
2017-12-20 03:12:52 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* inet_sk_state_load - read sk->sk_state for lockless contexts
|
|
|
|
* @sk: socket pointer
|
|
|
|
*
|
|
|
|
* Paired with inet_sk_state_store(). Used in places we don't hold socket lock:
|
|
|
|
* tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ...
|
|
|
|
*/
|
|
|
|
static inline int inet_sk_state_load(const struct sock *sk)
|
|
|
|
{
|
|
|
|
/* state change might impact lockless readers. */
|
|
|
|
return smp_load_acquire(&sk->sk_state);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* inet_sk_state_store - update sk->sk_state
|
|
|
|
* @sk: socket pointer
|
|
|
|
* @newstate: new state
|
|
|
|
*
|
|
|
|
* Paired with inet_sk_state_load(). Should be used in contexts where
|
|
|
|
* state change might impact lockless readers.
|
|
|
|
*/
|
2017-12-20 03:12:51 +00:00
|
|
|
void inet_sk_state_store(struct sock *sk, int newstate);
|
2005-12-27 04:43:12 +00:00
|
|
|
|
2017-12-20 03:12:52 +00:00
|
|
|
void inet_sk_set_state(struct sock *sk, int state);
|
|
|
|
|
2013-10-19 19:48:51 +00:00
|
|
|
static inline unsigned int __inet_ehashfn(const __be32 laddr,
|
|
|
|
const __u16 lport,
|
|
|
|
const __be32 faddr,
|
|
|
|
const __be16 fport,
|
|
|
|
u32 initval)
|
2005-12-27 04:43:12 +00:00
|
|
|
{
|
2008-03-04 22:28:41 +00:00
|
|
|
return jhash_3words((__force __u32) laddr,
|
|
|
|
(__force __u32) faddr,
|
2007-03-23 18:40:27 +00:00
|
|
|
((__u32) lport) << 16 | (__force __u32)fport,
|
2013-10-19 19:48:51 +00:00
|
|
|
initval);
|
2005-12-27 04:43:12 +00:00
|
|
|
}
|
|
|
|
|
2015-03-18 01:32:27 +00:00
|
|
|
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
|
2015-10-05 04:08:11 +00:00
|
|
|
struct sock *sk_listener,
|
|
|
|
bool attach_listener);
|
2008-06-10 19:39:35 +00:00
|
|
|
|
2008-10-01 14:41:00 +00:00
|
|
|
static inline __u8 inet_sk_flowi_flags(const struct sock *sk)
|
|
|
|
{
|
2011-01-28 06:01:53 +00:00
|
|
|
__u8 flags = 0;
|
|
|
|
|
2023-08-16 08:15:41 +00:00
|
|
|
if (inet_test_bit(TRANSPARENT, sk) || inet_test_bit(HDRINCL, sk))
|
2011-01-28 06:01:53 +00:00
|
|
|
flags |= FLOWI_FLAG_ANYSRC;
|
|
|
|
return flags;
|
2008-10-01 14:41:00 +00:00
|
|
|
}
|
|
|
|
|
2015-01-05 21:56:14 +00:00
|
|
|
static inline void inet_inc_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
inet_sk(sk)->convert_csum++;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inet_dec_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
if (inet_sk(sk)->convert_csum > 0)
|
|
|
|
inet_sk(sk)->convert_csum--;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool inet_get_convert_csum(struct sock *sk)
|
|
|
|
{
|
|
|
|
return !!inet_sk(sk)->convert_csum;
|
|
|
|
}
|
|
|
|
|
2018-07-31 19:18:11 +00:00
|
|
|
|
|
|
|
static inline bool inet_can_nonlocal_bind(struct net *net,
|
|
|
|
struct inet_sock *inet)
|
|
|
|
{
|
2022-07-13 20:51:55 +00:00
|
|
|
return READ_ONCE(net->ipv4.sysctl_ip_nonlocal_bind) ||
|
2023-08-16 08:15:37 +00:00
|
|
|
test_bit(INET_FLAGS_FREEBIND, &inet->inet_flags) ||
|
2023-08-16 08:15:41 +00:00
|
|
|
test_bit(INET_FLAGS_TRANSPARENT, &inet->inet_flags);
|
2018-07-31 19:18:11 +00:00
|
|
|
}
|
|
|
|
|
2021-11-17 09:00:11 +00:00
|
|
|
static inline bool inet_addr_valid_or_nonlocal(struct net *net,
|
|
|
|
struct inet_sock *inet,
|
|
|
|
__be32 addr,
|
|
|
|
int addr_type)
|
|
|
|
{
|
|
|
|
return inet_can_nonlocal_bind(net, inet) ||
|
|
|
|
addr == htonl(INADDR_ANY) ||
|
|
|
|
addr_type == RTN_LOCAL ||
|
|
|
|
addr_type == RTN_MULTICAST ||
|
|
|
|
addr_type == RTN_BROADCAST;
|
|
|
|
}
|
|
|
|
|
2005-12-27 04:43:12 +00:00
|
|
|
#endif /* _INET_SOCK_H */
|