From 8734a162c13b1a893e7dff8de0df81fed04c51a6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Oct 2018 11:07:59 -0700 Subject: [PATCH 1/3] bpf: skmsg, improve sk_msg_used_element to work in cork context Currently sk_msg_used_element is only called in zerocopy context where cork is not possible and if this case happens we fallback to copy mode. However the helper is more useful if it works in all contexts. This patch resolved the case where if end == head indicating a full or empty ring the helper always reports an empty ring. To fix this add a test for the full ring case to avoid reporting a full ring has 0 elements. This additional functionality will be used in the next patches from recvmsg context where end = head with a full ring is a valid case. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 31df0d9fa536..22347b08e1f8 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -187,18 +187,21 @@ static inline void sk_msg_xfer_full(struct sk_msg *dst, struct sk_msg *src) sk_msg_init(src); } -static inline u32 sk_msg_elem_used(const struct sk_msg *msg) -{ - return msg->sg.end >= msg->sg.start ? - msg->sg.end - msg->sg.start : - msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); -} - static inline bool sk_msg_full(const struct sk_msg *msg) { return (msg->sg.end == msg->sg.start) && msg->sg.size; } +static inline u32 sk_msg_elem_used(const struct sk_msg *msg) +{ + if (sk_msg_full(msg)) + return MAX_MSG_FRAGS; + + return msg->sg.end >= msg->sg.start ? + msg->sg.end - msg->sg.start : + msg->sg.end + (MAX_MSG_FRAGS - msg->sg.start); +} + static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) { return &msg->sg.data[which]; From 02c558b2d5d679fbbcaa5b9689484c7e0f8abb7b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Oct 2018 11:08:04 -0700 Subject: [PATCH 2/3] bpf: sockmap, support for msg_peek in sk_msg with redirect ingress This adds support for the MSG_PEEK flag when doing redirect to ingress and receiving on the sk_msg psock queue. Previously the flag was being ignored which could confuse applications if they expected the flag to work as normal. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/net/tcp.h | 2 +- net/ipv4/tcp_bpf.c | 42 +++++++++++++++++++++++++++--------------- net/tls/tls_sw.c | 3 ++- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 3600ae0f25c3..14fdd7ce9992 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2089,7 +2089,7 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len); + struct msghdr *msg, int len, int flags); /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index f9d3cf185827..b7918d4caa30 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -39,17 +39,19 @@ static int tcp_bpf_wait_data(struct sock *sk, struct sk_psock *psock, } int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, - struct msghdr *msg, int len) + struct msghdr *msg, int len, int flags) { struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; int i, ret, copied = 0; + struct sk_msg *msg_rx; + + msg_rx = list_first_entry_or_null(&psock->ingress_msg, + struct sk_msg, list); while (copied != len) { struct scatterlist *sge; - struct sk_msg *msg_rx; - msg_rx = list_first_entry_or_null(&psock->ingress_msg, - struct sk_msg, list); if (unlikely(!msg_rx)) break; @@ -70,22 +72,30 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, } copied += copy; - sge->offset += copy; - sge->length -= copy; - sk_mem_uncharge(sk, copy); - msg_rx->sg.size -= copy; - if (!sge->length) { - i++; - if (i == MAX_SKB_FRAGS) - i = 0; - if (!msg_rx->skb) - put_page(page); + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + sk_msg_iter_var_next(i); } if (copied == len) break; } while (i != msg_rx->sg.end); + if (unlikely(peek)) { + msg_rx = list_next_entry(msg_rx, list); + continue; + } + msg_rx->sg.start = i; if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { list_del(&msg_rx->list); @@ -93,6 +103,8 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, consume_skb(msg_rx->skb); kfree(msg_rx); } + msg_rx = list_first_entry_or_null(&psock->ingress_msg, + struct sk_msg, list); } return copied; @@ -115,7 +127,7 @@ int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); lock_sock(sk); msg_bytes_ready: - copied = __tcp_bpf_recvmsg(sk, psock, msg, len); + copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); if (!copied) { int data, err = 0; long timeo; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index a525fc4c2a4b..5cd88ba8acd1 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1478,7 +1478,8 @@ int tls_sw_recvmsg(struct sock *sk, skb = tls_wait_data(sk, psock, flags, timeo, &err); if (!skb) { if (psock) { - int ret = __tcp_bpf_recvmsg(sk, psock, msg, len); + int ret = __tcp_bpf_recvmsg(sk, psock, + msg, len, flags); if (ret > 0) { copied += ret; From 753fb2ee09345e0730e610b2ee3a01964fe22a63 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Oct 2018 11:08:09 -0700 Subject: [PATCH 3/3] bpf: sockmap, add msg_peek tests to test_sockmap Add tests that do a MSG_PEEK recv followed by a regular receive to test flag support. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- tools/testing/selftests/bpf/test_sockmap.c | 165 ++++++++++++++------- 1 file changed, 114 insertions(+), 51 deletions(-) diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 7cb69ce6dfa2..cbd1c0be8680 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -80,6 +80,7 @@ int txmsg_end; int txmsg_ingress; int txmsg_skb; int ktls; +int peek_flag; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -102,6 +103,7 @@ static const struct option long_options[] = { {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, {"txmsg_skb", no_argument, &txmsg_skb, 1 }, {"ktls", no_argument, &ktls, 1 }, + {"peek", no_argument, &peek_flag, 1 }, {0, 0, NULL, 0 } }; @@ -352,33 +354,40 @@ static int msg_loop_sendpage(int fd, int iov_length, int cnt, return 0; } -static int msg_loop(int fd, int iov_count, int iov_length, int cnt, - struct msg_stats *s, bool tx, - struct sockmap_options *opt) +static void msg_free_iov(struct msghdr *msg) { - struct msghdr msg = {0}; - int err, i, flags = MSG_NOSIGNAL; + int i; + + for (i = 0; i < msg->msg_iovlen; i++) + free(msg->msg_iov[i].iov_base); + free(msg->msg_iov); + msg->msg_iov = NULL; + msg->msg_iovlen = 0; +} + +static int msg_alloc_iov(struct msghdr *msg, + int iov_count, int iov_length, + bool data, bool xmit) +{ + unsigned char k = 0; struct iovec *iov; - unsigned char k; - bool data_test = opt->data_test; - bool drop = opt->drop_expected; + int i; iov = calloc(iov_count, sizeof(struct iovec)); if (!iov) return errno; - k = 0; for (i = 0; i < iov_count; i++) { unsigned char *d = calloc(iov_length, sizeof(char)); if (!d) { fprintf(stderr, "iov_count %i/%i OOM\n", i, iov_count); - goto out_errno; + goto unwind_iov; } iov[i].iov_base = d; iov[i].iov_len = iov_length; - if (data_test && tx) { + if (data && xmit) { int j; for (j = 0; j < iov_length; j++) @@ -386,9 +395,60 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, } } - msg.msg_iov = iov; - msg.msg_iovlen = iov_count; - k = 0; + msg->msg_iov = iov; + msg->msg_iovlen = iov_count; + + return 0; +unwind_iov: + for (i--; i >= 0 ; i--) + free(msg->msg_iov[i].iov_base); + return -ENOMEM; +} + +static int msg_verify_data(struct msghdr *msg, int size, int chunk_sz) +{ + int i, j, bytes_cnt = 0; + unsigned char k = 0; + + for (i = 0; i < msg->msg_iovlen; i++) { + unsigned char *d = msg->msg_iov[i].iov_base; + + for (j = 0; + j < msg->msg_iov[i].iov_len && size; j++) { + if (d[j] != k++) { + fprintf(stderr, + "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", + i, j, d[j], k - 1, d[j+1], k); + return -EIO; + } + bytes_cnt++; + if (bytes_cnt == chunk_sz) { + k = 0; + bytes_cnt = 0; + } + size--; + } + } + return 0; +} + +static int msg_loop(int fd, int iov_count, int iov_length, int cnt, + struct msg_stats *s, bool tx, + struct sockmap_options *opt) +{ + struct msghdr msg = {0}, msg_peek = {0}; + int err, i, flags = MSG_NOSIGNAL; + bool drop = opt->drop_expected; + bool data = opt->data_test; + + err = msg_alloc_iov(&msg, iov_count, iov_length, data, tx); + if (err) + goto out_errno; + if (peek_flag) { + err = msg_alloc_iov(&msg_peek, iov_count, iov_length, data, tx); + if (err) + goto out_errno; + } if (tx) { clock_gettime(CLOCK_MONOTONIC, &s->start); @@ -408,19 +468,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, } clock_gettime(CLOCK_MONOTONIC, &s->end); } else { - int slct, recv, max_fd = fd; + int slct, recvp = 0, recv, max_fd = fd; int fd_flags = O_NONBLOCK; struct timeval timeout; float total_bytes; - int bytes_cnt = 0; - int chunk_sz; fd_set w; - if (opt->sendpage) - chunk_sz = iov_length * cnt; - else - chunk_sz = iov_length * iov_count; - fcntl(fd, fd_flags); total_bytes = (float)iov_count * (float)iov_length * (float)cnt; err = clock_gettime(CLOCK_MONOTONIC, &s->start); @@ -452,6 +505,19 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, goto out_errno; } + errno = 0; + if (peek_flag) { + flags |= MSG_PEEK; + recvp = recvmsg(fd, &msg_peek, flags); + if (recvp < 0) { + if (errno != EWOULDBLOCK) { + clock_gettime(CLOCK_MONOTONIC, &s->end); + goto out_errno; + } + } + flags = 0; + } + recv = recvmsg(fd, &msg, flags); if (recv < 0) { if (errno != EWOULDBLOCK) { @@ -463,27 +529,23 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, s->bytes_recvd += recv; - if (data_test) { - int j; + if (data) { + int chunk_sz = opt->sendpage ? + iov_length * cnt : + iov_length * iov_count; - for (i = 0; i < msg.msg_iovlen; i++) { - unsigned char *d = iov[i].iov_base; - - for (j = 0; - j < iov[i].iov_len && recv; j++) { - if (d[j] != k++) { - errno = -EIO; - fprintf(stderr, - "detected data corruption @iov[%i]:%i %02x != %02x, %02x ?= %02x\n", - i, j, d[j], k - 1, d[j+1], k); - goto out_errno; - } - bytes_cnt++; - if (bytes_cnt == chunk_sz) { - k = 0; - bytes_cnt = 0; - } - recv--; + errno = msg_verify_data(&msg, recv, chunk_sz); + if (errno) { + perror("data verify msg failed\n"); + goto out_errno; + } + if (recvp) { + errno = msg_verify_data(&msg_peek, + recvp, + chunk_sz); + if (errno) { + perror("data verify msg_peek failed\n"); + goto out_errno; } } } @@ -491,14 +553,12 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, clock_gettime(CLOCK_MONOTONIC, &s->end); } - for (i = 0; i < iov_count; i++) - free(iov[i].iov_base); - free(iov); - return 0; + msg_free_iov(&msg); + msg_free_iov(&msg_peek); + return err; out_errno: - for (i = 0; i < iov_count; i++) - free(iov[i].iov_base); - free(iov); + msg_free_iov(&msg); + msg_free_iov(&msg_peek); return errno; } @@ -565,9 +625,10 @@ static int sendmsg_test(struct sockmap_options *opt) } if (opt->verbose) fprintf(stdout, - "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s\n", + "rx_sendmsg: TX: %zuB %fB/s %fGB/s RX: %zuB %fB/s %fGB/s %s\n", s.bytes_sent, sent_Bps, sent_Bps/giga, - s.bytes_recvd, recvd_Bps, recvd_Bps/giga); + s.bytes_recvd, recvd_Bps, recvd_Bps/giga, + peek_flag ? "(peek_msg)" : ""); if (err && txmsg_cork) err = 0; exit(err ? 1 : 0); @@ -999,6 +1060,8 @@ static void test_options(char *options) strncat(options, "skb,", OPTSTRING); if (ktls) strncat(options, "ktls,", OPTSTRING); + if (peek_flag) + strncat(options, "peek,", OPTSTRING); } static int __test_exec(int cgrp, int test, struct sockmap_options *opt)