forked from Minki/linux
61fad6816f
PACKET_RX_RING can cause multiple writers to access the same slot if a fast writer wraps the ring while a slow writer is still copying. This is particularly likely with few, large, slots (e.g., GSO packets). Synchronize kernel thread ownership of rx ring slots with a bitmap. Writers acquire a slot race-free by testing tp_status TP_STATUS_KERNEL while holding the sk receive queue lock. They release this lock before copying and set tp_status to TP_STATUS_USER to release to userspace when done. During copying, another writer may take the lock, also see TP_STATUS_KERNEL, and start writing to the same slot. Introduce a new rx_owner_map bitmap with a bit per slot. To acquire a slot, test and set with the lock held. To release race-free, update tp_status and owner bit as a transaction, so take the lock again. This is the one of a variety of discussed options (see Link below): * instead of a shadow ring, embed the data in the slot itself, such as in tp_padding. But any test for this field may match a value left by userspace, causing deadlock. * avoid the lock on release. This leaves a small race if releasing the shadow slot before setting TP_STATUS_USER. The below reproducer showed that this race is not academic. If releasing the slot after tp_status, the race is more subtle. See the first link for details. * add a new tp_status TP_KERNEL_OWNED to avoid the transactional store of two fields. But, legacy applications may interpret all non-zero tp_status as owned by the user. As libpcap does. So this is possible only opt-in by newer processes. It can be added as an optional mode. * embed the struct at the tail of pg_vec to avoid extra allocation. The implementation proved no less complex than a separate field. The additional locking cost on release adds contention, no different than scaling on multicore or multiqueue h/w. In practice, below reproducer nor small packet tcpdump showed a noticeable change in perf report in cycles spent in spinlock. Where contention is problematic, packet sockets support mitigation through PACKET_FANOUT. And we can consider adding opt-in state TP_KERNEL_OWNED. Easy to reproduce by running multiple netperf or similar TCP_STREAM flows concurrently with `tcpdump -B 129 -n greater 60000`. Based on an earlier patchset by Jon Rosen. See links below. I believe this issue goes back to the introduction of tpacket_rcv, which predates git history. Link: https://www.mail-archive.com/netdev@vger.kernel.org/msg237222.html Suggested-by: Jon Rosen <jrosen@cisco.com> Signed-off-by: Willem de Bruijn <willemb@google.com> Signed-off-by: Jon Rosen <jrosen@cisco.com> Signed-off-by: David S. Miller <davem@davemloft.net>
147 lines
3.3 KiB
C
147 lines
3.3 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef __PACKET_INTERNAL_H__
|
|
#define __PACKET_INTERNAL_H__
|
|
|
|
#include <linux/refcount.h>
|
|
|
|
struct packet_mclist {
|
|
struct packet_mclist *next;
|
|
int ifindex;
|
|
int count;
|
|
unsigned short type;
|
|
unsigned short alen;
|
|
unsigned char addr[MAX_ADDR_LEN];
|
|
};
|
|
|
|
/* kbdq - kernel block descriptor queue */
|
|
struct tpacket_kbdq_core {
|
|
struct pgv *pkbdq;
|
|
unsigned int feature_req_word;
|
|
unsigned int hdrlen;
|
|
unsigned char reset_pending_on_curr_blk;
|
|
unsigned char delete_blk_timer;
|
|
unsigned short kactive_blk_num;
|
|
unsigned short blk_sizeof_priv;
|
|
|
|
/* last_kactive_blk_num:
|
|
* trick to see if user-space has caught up
|
|
* in order to avoid refreshing timer when every single pkt arrives.
|
|
*/
|
|
unsigned short last_kactive_blk_num;
|
|
|
|
char *pkblk_start;
|
|
char *pkblk_end;
|
|
int kblk_size;
|
|
unsigned int max_frame_len;
|
|
unsigned int knum_blocks;
|
|
uint64_t knxt_seq_num;
|
|
char *prev;
|
|
char *nxt_offset;
|
|
struct sk_buff *skb;
|
|
|
|
atomic_t blk_fill_in_prog;
|
|
|
|
/* Default is set to 8ms */
|
|
#define DEFAULT_PRB_RETIRE_TOV (8)
|
|
|
|
unsigned short retire_blk_tov;
|
|
unsigned short version;
|
|
unsigned long tov_in_jiffies;
|
|
|
|
/* timer to retire an outstanding block */
|
|
struct timer_list retire_blk_timer;
|
|
};
|
|
|
|
struct pgv {
|
|
char *buffer;
|
|
};
|
|
|
|
struct packet_ring_buffer {
|
|
struct pgv *pg_vec;
|
|
|
|
unsigned int head;
|
|
unsigned int frames_per_block;
|
|
unsigned int frame_size;
|
|
unsigned int frame_max;
|
|
|
|
unsigned int pg_vec_order;
|
|
unsigned int pg_vec_pages;
|
|
unsigned int pg_vec_len;
|
|
|
|
unsigned int __percpu *pending_refcnt;
|
|
|
|
union {
|
|
unsigned long *rx_owner_map;
|
|
struct tpacket_kbdq_core prb_bdqc;
|
|
};
|
|
};
|
|
|
|
extern struct mutex fanout_mutex;
|
|
#define PACKET_FANOUT_MAX 256
|
|
|
|
struct packet_fanout {
|
|
possible_net_t net;
|
|
unsigned int num_members;
|
|
u16 id;
|
|
u8 type;
|
|
u8 flags;
|
|
union {
|
|
atomic_t rr_cur;
|
|
struct bpf_prog __rcu *bpf_prog;
|
|
};
|
|
struct list_head list;
|
|
struct sock *arr[PACKET_FANOUT_MAX];
|
|
spinlock_t lock;
|
|
refcount_t sk_ref;
|
|
struct packet_type prot_hook ____cacheline_aligned_in_smp;
|
|
};
|
|
|
|
struct packet_rollover {
|
|
int sock;
|
|
atomic_long_t num;
|
|
atomic_long_t num_huge;
|
|
atomic_long_t num_failed;
|
|
#define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32))
|
|
u32 history[ROLLOVER_HLEN] ____cacheline_aligned;
|
|
} ____cacheline_aligned_in_smp;
|
|
|
|
struct packet_sock {
|
|
/* struct sock has to be the first member of packet_sock */
|
|
struct sock sk;
|
|
struct packet_fanout *fanout;
|
|
union tpacket_stats_u stats;
|
|
struct packet_ring_buffer rx_ring;
|
|
struct packet_ring_buffer tx_ring;
|
|
int copy_thresh;
|
|
spinlock_t bind_lock;
|
|
struct mutex pg_vec_lock;
|
|
unsigned int running; /* bind_lock must be held */
|
|
unsigned int auxdata:1, /* writer must hold sock lock */
|
|
origdev:1,
|
|
has_vnet_hdr:1,
|
|
tp_loss:1,
|
|
tp_tx_has_off:1;
|
|
int pressure;
|
|
int ifindex; /* bound device */
|
|
__be16 num;
|
|
struct packet_rollover *rollover;
|
|
struct packet_mclist *mclist;
|
|
atomic_t mapped;
|
|
enum tpacket_versions tp_version;
|
|
unsigned int tp_hdrlen;
|
|
unsigned int tp_reserve;
|
|
unsigned int tp_tstamp;
|
|
struct completion skb_completion;
|
|
struct net_device __rcu *cached_dev;
|
|
int (*xmit)(struct sk_buff *skb);
|
|
struct packet_type prot_hook ____cacheline_aligned_in_smp;
|
|
atomic_t tp_drops ____cacheline_aligned_in_smp;
|
|
};
|
|
|
|
static struct packet_sock *pkt_sk(struct sock *sk)
|
|
{
|
|
return (struct packet_sock *)sk;
|
|
}
|
|
|
|
#endif
|